From: Mark Wooding Date: Mon, 19 Oct 2020 03:21:35 +0000 (+0100) Subject: Multiple architectures, more solutions. X-Git-Url: https://git.distorted.org.uk/~mdw/xchg-rax-rax/commitdiff_plain/90c4eee32bd52dbdba4913a5030c7b27cffaa103 Multiple architectures, more solutions. --- diff --git a/.gitignore b/.gitignore index 505bf5b..8a19953 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ *.o -/xchg +/xchg-amd64 +/xchg-i386 +/xchg-armhf +/xchg-arm64 diff --git a/Makefile b/Makefile index ca069ff..5488b34 100644 --- a/Makefile +++ b/Makefile @@ -8,23 +8,36 @@ V_AT = $(call vcond,@) vtag = \ $(call vcond,@printf " %-8s %s\n" "$1" "$(or $2,$@)";) -CC = gcc -m64 -CFLAGS = -O2 -g -Wall -Werror - -AS = gcc -m64 -ASFLAGS = -O2 -g +ARCH = -LD = gcc -m64 +CFLAGS = -O2 -g -Wall -Werror +ASFLAGS = $(CFLAGS) LDFLAGS = -%.o: %.c - $(call vtag,CC)$(CC) -c -o $@ $(CFLAGS) $< - -%.o: %.S - $(call vtag,AS)$(AS) -c -o $@ $(ASFLAGS) $< - -all:: xchg -xchg: xchg.o main.o - $(call vtag,LD)$(LD) -o $@ $^ - -clean::; rm -f xchg *.o +ARCH += i386 +CC/i386 = gcc -m32 + +ARCH += amd64 +CC/amd64 = gcc -m64 + +ARCH += armhf +CC/armhf = arm-linux-gnueabihf-gcc + +ARCH += arm64 +CC/arm64 = aarch64-linux-gnu-gcc + +define defarch.body +AS/$1 = $$(CC/$1) +LD/$1 = $$(CC/$1) +%-$1.o: %.c + $$(call vtag,CC/$1)$$(CC/$1) -c -o $$@ $$(CFLAGS) $$(CFLAGS/$1) $$< +%-$1.o: %.S + $$(call vtag,AS/$1)$$(AS/$1) -c -o $$@ $$(ASFLAGS) $$(ASFLAGS/$1) $$< +all:: xchg-$1 +xchg-$1: xchg-$1.o main-$1.o + $$(call vtag,LD/$1)$$(LD/$1) -o $$@ $$^ +endef +defarch = $(eval $(call defarch.body,$1)) +$(foreach a,$(ARCH), $(call defarch,$a)) + +clean::; rm -f $(foreach a,$(ARCH), xchg-$a) *.o diff --git a/main.c b/main.c index 9ab4c2e..df4b941 100644 --- a/main.c +++ b/main.c @@ -12,10 +12,42 @@ union reg { long u; }; +#if defined(__i386__) +# define GPREGS(_) \ + _(a) _(b) _(c) _(d) _(si) _(di) _(bp) +#elif defined(__x86_64__) +# define GPREGS(_) \ + _(a) _(b) _(c) _(d) _(si) _(di) _(bp) \ + _(r8) _(r9) _(r10) _(r11) _(r12) _(r13) _(r14) _(r15) +#elif defined(__arm__) +# define GPREGS(_) \ + _(r0) _(r1) _(r2) _(r3) _(r4) _(r5) _(r6) _(r7) \ + _(r8) _(r9) _(r10) _(r11) _(r12) +#elif defined(__aarch64__) +# define GPREGS(_) \ + _(x0) _(x1) _(x2) _(x3) _(x4) _(x5) _(x6) _(x7) \ + _(x8) _(x9) _(x10) _(x11) _(x12) _(x13) _(x14) _(x15) +#else +# error "not supported" +#endif + +enum { +#define DEFCONST(r) R_##r, + GPREGS(DEFCONST) +#undef DEFCONST + R_flags, + NREGS +}; + +static const char *const rname[] = { +#define DEFNAME(r) #r, + GPREGS(DEFNAME) +#undef DEFNAME + "f" +}; + struct regs { - union reg - a, b, c, d, si, di, bp, f, - r8, r9, r10, r11, r12, r13, r14, r15; + union reg r[NREGS]; }; struct seg { @@ -25,6 +57,8 @@ struct seg { #define N(v) (sizeof(v)/sizeof((v)[0])) +#define STRCMP(a, op, b) (strcmp((a), (b)) op 0) +#define STRNCMP(a, op, b, n) (strncmp((a), (b), (n)) op 0) #define CTYPE_HACK(func, ch) func((unsigned char)(ch)) #define ISDIGIT(ch) CTYPE_HACK(isdigit, ch) #define ISSPACE(ch) CTYPE_HACK(isspace, ch) @@ -106,12 +140,10 @@ static int hex_digit(char ch) else return (-1); } -static void setreg(union reg *r, - struct seg **seg_inout, - int *i_inout, int argc, char *argv[]) +static void setreg(union reg *r, struct seg **seg_inout, const char *p) { struct seg *seg; - const char *p, *pp; + const char *pp; unsigned char *q; int hi, lo; size_t n; @@ -119,11 +151,9 @@ static void setreg(union reg *r, #define LONG_REG(p) (parse_long("signed register", (p), LONG_MIN, LONG_MAX)) #define ULONG_REG(p) (parse_ulong("unsigned register", (p), 0, 
ULONG_MAX)) - p = *i_inout >= argc ? "-" : argv[(*i_inout)++]; switch (*p) { case '-': if (p[1]) r->i = LONG_REG(p); - else r->u = 0xdeadbeefdeadbeef; break; case 'i': if (p[1] != ':') goto bad; @@ -154,6 +184,12 @@ static void setreg(union reg *r, } r->p = seg->p; break; + case 'z': + if (p[1] != ':') goto bad; + n = parse_ulong("buffer length", p + 2, 0, ~(size_t)0); + seg = (*seg_inout)++; seg->p = q = xmalloc(n); seg->sz = n; + r->p = q; memset(q, 0, n); + break; default: if (ISDIGIT(*p)) r->u = ULONG_REG(p); else if (*p == '+') r->i = LONG_REG(p); @@ -173,7 +209,11 @@ static void dumpreg(const char *name, const union reg *r, { size_t i; +#if ULONG_MAX == 0xffffffff + printf("%3s = 0x%08lx = %20ld = %20lu", name, r->u, r->i, r->u); +#else printf("%3s = 0x%016lx = %20ld = %20lu", name, r->u, r->i, r->u); +#endif if (r->u >= ' ' && r->u <= '~') printf(" = '%c'", (int)r->u); for (i = 0; i < nseg; i++) { if (r->p == seg[i].p) @@ -212,54 +252,54 @@ int main(int argc, char *argv[]) { struct regs r; struct seg seg[16], *segp = seg; - size_t nseg; - int i, j; + size_t nseg, n; + const char *p; + char *q; + unsigned long f; + int i, j, k; + unsigned long l; prog = strrchr(argv[0], '/'); if (prog) prog++; else prog = argv[0]; if (argc < 2) - barf("usage: %s I [A B C D SI DI BP R8 R9 R10 R11 R12 R13 R14 R15 F]", + barf("usage: %s I [REG...]", prog); j = parse_long("program index", argv[1], -1, N(x) - 1); - i = 2; - setreg(&r.a, &segp, &i, argc, argv); - setreg(&r.b, &segp, &i, argc, argv); - setreg(&r.c, &segp, &i, argc, argv); - setreg(&r.d, &segp, &i, argc, argv); - setreg(&r.si, &segp, &i, argc, argv); - setreg(&r.di, &segp, &i, argc, argv); - setreg(&r.bp, &segp, &i, argc, argv); - setreg(&r.r8, &segp, &i, argc, argv); - setreg(&r.r9, &segp, &i, argc, argv); - setreg(&r.r10, &segp, &i, argc, argv); - setreg(&r.r11, &segp, &i, argc, argv); - setreg(&r.r12, &segp, &i, argc, argv); - setreg(&r.r13, &segp, &i, argc, argv); - setreg(&r.r14, &segp, &i, argc, argv); - setreg(&r.r15, &segp, &i, argc, argv); - setreg(&r.f, &segp, &i, argc, argv); - nseg = segp - seg; +#if ULONG_MAX == 0xffffffff +# define DEAD 0xdeadbeef +#else +# define DEAD 0xdeadbeefdeadbeef +#endif + for (i = 0; i < NREGS - 1; i++) r.r[i].u = DEAD; +#undef DEAD + r.r[R_flags].u = 0; + + i = 0; + argv += 2; + while (*argv) { + p = *argv++; + if (ISDIGIT(*p)) { + l = strtoul(p, &q, 10); + if (l < NREGS && *q == '=') { p = q + 1; i = l; } + } else for (k = 0; k < NREGS; k++) { + n = strlen(rname[k]); + if (STRNCMP(p, ==, rname[k], n) && p[n] == '=') + { i = k; p += n + 1; break; } + } + if (i >= NREGS) barf("too many registers"); + setreg(&r.r[i], &segp, p); i++; + } + nseg = segp - seg; call_example(j < 0 ? 
&nop : x[j], &r); - dumpreg("rax", &r.a, seg, nseg); - dumpreg("rbx", &r.b, seg, nseg); - dumpreg("rcx", &r.c, seg, nseg); - dumpreg("rdx", &r.d, seg, nseg); - dumpreg("rsi", &r.si, seg, nseg); - dumpreg("rdi", &r.di, seg, nseg); - dumpreg("rbp", &r.bp, seg, nseg); - dumpreg("rbp", &r.bp, seg, nseg); - dumpreg("r8", &r.r8, seg, nseg); - dumpreg("r9", &r.r9, seg, nseg); - dumpreg("r10", &r.r10, seg, nseg); - dumpreg("r11", &r.r11, seg, nseg); - dumpreg("r12", &r.r12, seg, nseg); - dumpreg("r13", &r.r13, seg, nseg); - dumpreg("r14", &r.r14, seg, nseg); - dumpreg("r15", &r.r15, seg, nseg); + for (i = 0; i < NREGS; i++) dumpreg(rname[i], &r.r[i], seg, nseg); + + f = r.r[R_flags].u; + +#if defined(__i386__) || defined(__x86_64__) #define CF (1 << 0) #define PF (1 << 2) @@ -267,38 +307,37 @@ int main(int argc, char *argv[]) #define SF (1 << 7) #define OF (1 << 11) - dumpreg("f", &r.f, seg, nseg); printf("\tstatus: %ccf %cpf %caf %czf %csf %cdf %cof\n", - (r.f.u >> 0)&1u ? '+' : '-', - (r.f.u >> 2)&1u ? '+' : '-', - (r.f.u >> 4)&1u ? '+' : '-', - (r.f.u >> 6)&1u ? '+' : '-', - (r.f.u >> 7)&1u ? '+' : '-', - (r.f.u >> 10)&1u ? '+' : '-', - (r.f.u >> 11)&1u ? '+' : '-'); + (f >> 0)&1u ? '+' : '-', + (f >> 2)&1u ? '+' : '-', + (f >> 4)&1u ? '+' : '-', + (f >> 6)&1u ? '+' : '-', + (f >> 7)&1u ? '+' : '-', + (f >> 10)&1u ? '+' : '-', + (f >> 11)&1u ? '+' : '-'); printf("\tcond:"); - if (r.f.u&CF) printf(" c/b/nae"); else printf(" nc/ae/nb"); - if (r.f.u&ZF) printf(" e/z"); else printf(" ne/nz"); - if (r.f.u&SF) printf(" s"); else printf(" ns"); - if (r.f.u&OF) printf(" o"); else printf(" no"); - if (r.f.u&PF) printf(" p"); else printf(" np"); - if ((r.f.u&CF) || (r.f.u&ZF)) printf(" be/na"); else printf(" a/nbe"); - if (!(r.f.u&OF) == !(r.f.u&SF)) printf(" ge/nl"); else printf(" l/nge"); - if (!(r.f.u&OF) == !(r.f.u&SF) && !(r.f.u&ZF)) + if (f&CF) printf(" c/b/nae"); else printf(" nc/ae/nb"); + if (f&ZF) printf(" e/z"); else printf(" ne/nz"); + if (f&SF) printf(" s"); else printf(" ns"); + if (f&OF) printf(" o"); else printf(" no"); + if (f&PF) printf(" p"); else printf(" np"); + if ((f&CF) || (f&ZF)) printf(" be/na"); else printf(" a/nbe"); + if (!(f&OF) == !(f&SF)) printf(" ge/nl"); else printf(" l/nge"); + if (!(f&OF) == !(f&SF) && !(f&ZF)) printf(" g/nle"); else printf(" le/ng"); putchar('\n'); printf("\tsystem: %ctf %cif iopl=%d %cnt " "%crf %cvm %cac %cvif %cvip %cid\n", - (r.f.u >> 8)&1u ? '+' : '-', - (r.f.u >> 9)&1u ? '+' : '-', - (int)((r.f.u >> 12)&1u), - (r.f.u >> 14)&1u ? '+' : '-', - (r.f.u >> 16)&1u ? '+' : '-', - (r.f.u >> 17)&1u ? '+' : '-', - (r.f.u >> 18)&1u ? '+' : '-', - (r.f.u >> 19)&1u ? '+' : '-', - (r.f.u >> 20)&1u ? '+' : '-', - (r.f.u >> 21)&1u ? '+' : '-'); + (f >> 8)&1u ? '+' : '-', + (f >> 9)&1u ? '+' : '-', + (int)((f >> 12)&1u), + (f >> 14)&1u ? '+' : '-', + (f >> 16)&1u ? '+' : '-', + (f >> 17)&1u ? '+' : '-', + (f >> 18)&1u ? '+' : '-', + (f >> 19)&1u ? '+' : '-', + (f >> 20)&1u ? '+' : '-', + (f >> 21)&1u ? 
'+' : '-'); #undef CF #undef PF @@ -306,6 +345,90 @@ int main(int argc, char *argv[]) #undef SF #undef OF +#elif defined(__arm__) + +#define NF (1u << 31) +#define ZF (1u << 30) +#define CF (1u << 29) +#define VF (1u << 28) + + { + static const char + *modetab[] = { "?00", "?01", "?02", "?03", "?04", "?05", "?06", "?07", + "?08", "?09", "?10", "?11", "?12", "?13", "?14", "?15", + "usr", "fiq", "irq", "svc", "?20", "?21", "mon", "abt", + "?24", "?25", "hyp", "und", "?28", "?29", "?30", "sys" }, + *condtab[] = { "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc", + "hi", "ls", "ge", "lt", "gt", "le", "al", "nv" }; + + printf("\tuser: %cn %cz %cc %cv %cq ge=%c%c%c%c;", + (f >> 31)&1u ? '+' : '-', + (f >> 30)&1u ? '+' : '-', + (f >> 29)&1u ? '+' : '-', + (f >> 28)&1u ? '+' : '-', + (f >> 27)&1u ? '+' : '-', + (f >> 19)&1u ? '1' : '0', + (f >> 18)&1u ? '1' : '0', + (f >> 17)&1u ? '1' : '0', + (f >> 16)&1u ? '1' : '0'); + if (f&NF) printf(" mi"); else printf(" pl"); + if (f&ZF) printf(" eq"); else printf(" ne"); + if (f&CF) printf(" cs/hs"); else printf(" cc/lo"); + if (f&VF) printf(" vs"); else printf(" vc"); + if ((f&CF) && !(f&ZF)) printf(" hi"); else printf(" ls"); + if (!(f&VF) == !(f&NF)) printf(" ge"); else printf(" lt"); + if (!(f&VF) == !(f&NF) && !(f&ZF)) printf(" gt"); else printf(" le"); + putchar('\n'); + printf("\tsystem: %cj it=%s:%c%c%c%c %ce %ca %ci %cf %ct m=%s\n", + (f >> 24)&1u ? '+' : '-', + condtab[(f >> 12)&15u], + (f >> 11)&1u ? '1' : '0', + (f >> 10)&1u ? '1' : '0', + (f >> 26)&1u ? '1' : '0', + (f >> 25)&1u ? '1' : '0', + (f >> 9)&1u ? '+' : '-', + (f >> 8)&1u ? '+' : '-', + (f >> 7)&1u ? '+' : '-', + (f >> 6)&1u ? '+' : '-', + (f >> 5)&1u ? '+' : '-', + modetab[(f >> 0)&31u]); + } + +#undef NF +#undef ZF +#undef CF +#undef VF + +#elif defined(__aarch64__) + +#define NF (1u << 31) +#define ZF (1u << 30) +#define CF (1u << 29) +#define VF (1u << 28) + + printf("\tuser: %cn %cz %cc %cv;", + (f >> 31)&1u ? '+' : '-', + (f >> 30)&1u ? '+' : '-', + (f >> 29)&1u ? '+' : '-', + (f >> 28)&1u ? '+' : '-'); + if (f&NF) printf(" mi"); else printf(" pl"); + if (f&ZF) printf(" eq"); else printf(" ne"); + if (f&CF) printf(" cs/hs"); else printf(" cc/lo"); + if (f&VF) printf(" vs"); else printf(" vc"); + if ((f&CF) && !(f&ZF)) printf(" hi"); else printf(" ls"); + if (!(f&VF) == !(f&NF)) printf(" ge"); else printf(" lt"); + if (!(f&VF) == !(f&NF) && !(f&ZF)) printf(" gt"); else printf(" le"); + putchar('\n'); + +#undef NF +#undef ZF +#undef CF +#undef VF + +#else +# error "not supported" +#endif + for (i = 0; i < nseg; i++) { printf("seg[%d] (%p):\n", i, seg[i].p); dumpseg(&seg[i]); } diff --git a/xchg.S b/xchg.S index 12aeeba..0dcb6c8 100644 --- a/xchg.S +++ b/xchg.S @@ -1,8 +1,75 @@ -/// -*- mode: asm; asm-comment-char: ?/ -*- +/// -*- mode: asm; asm-comment-char: 0 -*- + +///-------------------------------------------------------------------------- +/// Preliminaries. 
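+///
+/// this section picks an architecture from the compiler's predefined
+/// macros (__i386__, __x86_64__, __arm__, __aarch64__) and defines the
+/// helpers the examples rely on: a `ret' alias for 32-bit arm; `cmov'
+/// and conditional-suffix aliases, and ccmp flag constants, for aarch64;
+/// the `ch' character-printing debug macro; `notimpl'; and the
+/// `call_example' wrapper, which loads the register block, calls the
+/// example, and writes the registers back afterwards.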
+ +#include + +#if defined(__i386__) || defined(__x86_64__) .intel_syntax noprefix - .section .note.GNU-stack, "", @progbits +#elif defined(__arm__) + +.macro ret + bx r14 +.endm + + .arch armv7-a + +#elif defined(__aarch64__) + +.macro cmov rd, rn, cc + csel \rd, \rn, \rd, \cc +.endm +#define _COND(_) \ + _(eq) _(ne) _(cs) _(cc) _(vs) _(vc) _(mi) _(pl) \ + _(ge) _(lt) _(gt) _(le) _(hi) _(ls) _(al) _(nv) \ + _(hs) _(lo) +#define _INST(_) \ + _(ccmp) _(ccmn) \ + _(csel) _(cmov) \ + _(csinc) _(cinc) _(cset) \ + _(csneg) _(cneg) \ + _(csinv) _(cinv) _(csetm) +#define _CONDVAR(cc) _definstvar cc; +#define _INSTVARS(inst) \ + .macro _definstvar cc; \ + .macro inst.\cc args:vararg; inst \args, \cc; .endm; \ + .endm; \ + _COND(_CONDVAR); \ + .purgem _definstvar; + _INST(_INSTVARS) +#undef _COND +#undef _INST +#undef _CONDVAR +#undef _INSTVARS + +#define CCMP_N 8 +#define CCMP_Z 4 +#define CCMP_C 2 +#define CCMP_V 1 + +#define CCMP_MI CCMP_N +#define CCMP_PL 0 +#define CCMP_EQ CCMP_Z +#define CCMP_NE 0 +#define CCMP_CS CCMP_C +#define CCMP_HS CCMP_C +#define CCMP_CC 0 +#define CCMP_LO 0 +#define CCMP_VS CCMP_V +#define CCMP_VC 0 +#define CCMP_HI CCMP_C +#define CCMP_LS 0 +#define CCMP_LT CCMP_N +#define CCMP_GE 0 +#define CCMP_LE CCMP_N +#define CCMP_GT 0 + +#else +# error "not supported" +#endif .macro proc name .globl \name @@ -16,6 +83,36 @@ .endm .macro ch c +#if defined(__i386__) + + pushf + push eax + push ebx + push ecx + push edx + push ebp + mov ebp, esp + and esp, -16 + + push \c + call putchar@plt + + call get_pc_ebx + add ebx, offset _GLOBAL_OFFSET_TABLE + mov eax, [ebx + stdout@GOT] + mov eax, [eax] + call fflush@plt + + mov esp, ebp + pop ebp + pop edx + pop ecx + pop ebx + pop eax + popf + +#elif defined(__x86_64__) + pushf push rax push rcx @@ -44,12 +141,154 @@ pop rcx pop rax popf + +#elif defined(__arm__) + + stmfd r13!, {r0-r4, r12, r14} + + mov r4, r13 + bic r14, r4, #15 + mov r13, r14 + + mov r0, #\c + bl putchar@plt + + ldr r14, .L$_c$gotoff$\@ +.L$_c$gotpc$\@: + add r14, pc, r14 + b .L$_c$cont$\@ +.L$_c$gotoff$\@: + .word _GLOBAL_OFFSET_TABLE - .L$_c$gotpc$\@ - 8 +.L$_c$cont$\@: + bl fflush@plt + + mov r13, r4 + ldmfd r13!, {r0-r4, r12, r14} + +#elif defined(__aarch64__) + + sub sp, sp, #20*8 + stp x0, x1, [sp, #0] + stp x2, x3, [sp, #16] + stp x4, x5, [sp, #32] + stp x6, x7, [sp, #48] + stp x8, x9, [sp, #64] + stp x10, x11, [sp, #80] + stp x12, x13, [sp, #96] + stp x14, x15, [sp, #112] + stp x16, x17, [sp, #128] + mrs x16, nzcv + stp x16, x30, [sp, #144] + + mov w0, #\c + bl putchar + adrp x0, :got:stdout + ldr x0, [x0, #:got_lo12:stdout] + ldr x0, [x0] + bl fflush + + ldp x16, x30, [sp, #144] + msr nzcv, x16 + ldp x16, x17, [sp, #128] + ldp x14, x15, [sp, #112] + ldp x12, x13, [sp, #96] + ldp x10, x11, [sp, #80] + ldp x8, x9, [sp, #64] + ldp x6, x7, [sp, #48] + ldp x4, x5, [sp, #32] + ldp x2, x3, [sp, #16] + ldp x0, x1, [sp, #0] + add sp, sp, #20*8 + +#else +# error "not supported" +#endif .endm +.macro notimpl +#if defined(__i386__) || defined(__x86_64__) + ud2 +#elif defined(__arm__) + udf +#elif defined(__aarch64__) + hlt #0 +#else +# error "not supported" +#endif +.endm + + .section .note.GNU-stack, "", %progbits + .text +#if defined(__i386__) +get_pc_ebx: + mov ebx, [esp] + ret +#endif + + proc call_example +#if defined(__i386__) + + push ebx // ebx + push esi // esi, ebx + push edi // edi, esi, ebx + push ebp // flags, ebp, ..., ebx + pushf + + mov edi, [esp + 4*6] + mov esi, [esp + 4*7] + push esi // regs, flags, ebp, ..., ebx + + call get_pc_ebx + lea eax, [ebx + 
9f - .] + push eax // cont, regs, flags, ebp, ..., ebx + push edi // func, cont, regs, flags, ebp, ..., ebx + + mov eax, [esi + 28] + pushf + pop ecx + and eax, 0x0cd5 + and ecx, ~0x0cd5 + or eax, ecx + push eax + popf + mov eax, [esi + 0] + mov ebx, [esi + 4] + mov ecx, [esi + 8] + mov edx, [esi + 12] + mov edi, [esi + 20] + mov ebp, [esi + 24] + mov esi, [esi + 16] + + ret // -> func; regs, flags, ebp, ..., ebx + +9: pushf // eflags, regs, flags, ebp, ..., ebx + push esi // esi, eflags, regs, flags, ebp, ..., ebx + mov esi, [esp + 8] + mov [esi + 0], eax + mov [esi + 4], ebx + mov [esi + 8], ecx + mov [esi + 12], edx + mov [esi + 20], edi + mov [esi + 24], ebp + pop eax // rflags, regs, flags, ebp, ..., ebx + mov [esi + 16], eax + pop eax // regs, flags, ebp, ..., ebx + mov [esi + 28], eax + + add esp, 4 // flags, ebp, ..., ebx + popf // ebp, ..., ebx + pop ebp // ..., ebx + pop edi + pop esi + pop ebx // + ret + +#elif defined(__x86_64__) + push rbx // rbx push r10 push r11 @@ -66,7 +305,7 @@ proc call_example push rax // cont, regs, flags, rbp, ..., rbx push rdi // func, cont, regs, flags, rbp, ..., rbx - mov rax, [rsi + 56] + mov rax, [rsi + 8*15] pushf pop rcx and rax, 0x0cd5 @@ -74,29 +313,45 @@ proc call_example or rax, rcx push rax popf - mov rax, [rsi + 0] - mov rbx, [rsi + 8] - mov rcx, [rsi + 16] - mov rdx, [rsi + 24] - mov rdi, [rsi + 40] - mov rbp, [rsi + 48] - mov rsi, [rsi + 32] + mov rax, [rsi + 0] + mov rbx, [rsi + 8] + mov rcx, [rsi + 16] + mov rdx, [rsi + 24] + mov rdi, [rsi + 40] + mov rbp, [rsi + 48] + mov r8, [rsi + 56] + mov r9, [rsi + 64] + mov r10, [rsi + 72] + mov r11, [rsi + 80] + mov r12, [rsi + 88] + mov r13, [rsi + 96] + mov r14, [rsi + 104] + mov r15, [rsi + 112] + mov rsi, [rsi + 32] ret // -> func; regs, flags, rbp, ..., rbx 9: pushf // rflags, regs, flags, rbp, ..., rbx push rsi // rsi, rflags, regs, flags, rbp, ..., rbx mov rsi, [rsp + 16] - mov [rsi + 0], rax - mov [rsi + 8], rbx - mov [rsi + 16], rcx - mov [rsi + 24], rdx - mov [rsi + 40], rdi - mov [rsi + 48], rbp + mov [rsi + 0], rax + mov [rsi + 8], rbx + mov [rsi + 16], rcx + mov [rsi + 24], rdx + mov [rsi + 40], rdi + mov [rsi + 48], rbp + mov [rsi + 56], r8 + mov [rsi + 64], r9 + mov [rsi + 72], r10 + mov [rsi + 80], r11 + mov [rsi + 88], r12 + mov [rsi + 96], r13 + mov [rsi + 104], r14 + mov [rsi + 112], r15 pop rax // rflags, regs, flags, rbp, ..., rbx - mov [rsi + 32], rax + mov [rsi + 32], rax pop rax // regs, flags, rbp, ..., rbx - mov [rsi + 56], rax + mov [rsi + 120], rax add rsp, 8 // flags, rbp, ..., rbx popf // rbp, ..., rbx @@ -110,6 +365,68 @@ proc call_example pop rbx // ret +#elif defined(__arm__) + + stmfd r13!, {r0, r1, r4-r11, r14} + ldmia r1, {r0-r12, r14} + msr cpsr, r14 + mov r14, pc + ldr pc, [r13], #4 + ldr r14, [r13], #4 + stmia r14!, {r0-r12} + mrs r0, cpsr + str r0, [r14] + ldmfd r13!, {r4-r11, pc} + +#elif defined(__aarch64__) + + stp x29, x30, [sp, #-13*8]! 
+ mov x29, sp + stp x19, x20, [sp, #16] + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp x25, x26, [sp, #64] + stp x27, x28, [sp, #80] + str x1, [sp, #96] + + mov x16, x0 + + ldr x17, [x1, #128] + ldp x14, x15, [x1, #112] + ldp x12, x13, [x1, #96] + ldp x10, x11, [x1, #80] + ldp x8, x9, [x1, #64] + ldp x6, x7, [x1, #48] + ldp x4, x5, [x1, #32] + ldp x2, x3, [x1, #16] + ldp x0, x1, [x1, #0] + msr nzcv, x17 + + blr x16 + + ldr x16, [sp, #96] + mrs x17, nzcv + str x17, [x16, #128] + stp x14, x15, [x16, #112] + stp x12, x13, [x16, #96] + stp x10, x11, [x16, #80] + stp x8, x9, [x16, #64] + stp x6, x7, [x16, #48] + stp x4, x5, [x16, #32] + stp x2, x3, [x16, #16] + stp x0, x1, [x16, #0] + + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp x25, x26, [sp, #64] + ldp x27, x28, [sp, #80] + ldp x29, x30, [sp], #13*8 + +#else +# error "not supported" +#endif + endproc proc nop @@ -119,19 +436,58 @@ proc nop endproc ///-------------------------------------------------------------------------- +/// 0x00--0x0f proc x00 // clear all 64 bits of extended traditional registers - xor eax,eax // clear rax - lea rbx,[0] // rbx -> _|_ + +#if defined(__x86_64__) + + xor eax, eax // clear rax + lea rbx, [0] // rbx -> _|_ loop . // iterate, decrement rcx until zero - mov rdx,0 // set rdx = 0 - and esi,0 // clear all bits of rsi - sub edi,edi // set rdi = edi - edi = 0 + mov rdx, 0 // set rdx = 0 + and esi, 0 // clear all bits of rsi + sub edi, edi // set rdi = edi - edi = 0 push 0 pop rbp // pop 0 into rbp +#elif defined(__i386__) + + xor eax, eax + lea ebx, [0] + loop . + mov edx, 0 + and esi, 0 + sub edi, edi + push 0 + pop ebp + +#elif defined(__arm__) + + eor r0, r0, r0 + rsb r1, r1, r1 +0: subs r2, r2, #1 + bne 0b + mov r3, #0 + and r4, r4, #0 + sub r5, r5, r5 + +#elif defined(__aarch64__) + + eor w0, w0, w0 + mov w1, wzr +0: sub w2, w2, #1 + cbnz w2, 0b + mov w3, #0 + and w4, w4, wzr + sub w5, w5, w5 + +#else + notimpl +#endif + ret endproc @@ -142,11 +498,43 @@ proc x01 // // on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1} // and f_{i+c}, where f_{i+1} = f_i + f_{i-1} + +#if defined(__x86_64__) + 0: xadd rax, rdx // a, d = a + d, a // = f_{i+1} + f_i, f_{i+1} // = f_{i+2}, f_{i+1} loop 0b // advance i, decrement c, iterate +#elif defined(__i386__) + +0: xadd eax, edx + loop 0b + +#elif defined(__arm__) + +0: subs r2, r2, #2 + add r3, r3, r0 + blo 8f + add r0, r0, r3 + bhi 0b + +8: movne r0, r3 + +#elif defined(__aarch64__) + +0: subs x2, x2, #2 + add x3, x3, x0 + b.lo 8f + add x0, x0, x3 + b.hi 0b + +8: cmov.ne x0, x3 + +#else + notimpl +#endif + ret endproc @@ -155,10 +543,63 @@ proc x02 // boolean canonify a: if a = 0 on entry, leave it zero; otherwise // set a = 1 + +#if defined(__x86_64__) + neg rax // set cf iff a /= 0 sbb rax, rax // a = a - a - cf = -cf neg rax // a = cf +#elif defined(__i386__) + + neg eax + sbb eax, eax + neg eax + +#elif defined(__arm__) + + movs r1, r0 // the easy way + movne r1, #1 // mvnne r1, #1 for mask + + cmp r0, #1 // clear cf iff a == 0 + sbc r2, r0, r0 // c' = a - a - 1 + cf = cf - 1 + add r2, r2, #1 // c' = cf + + sub r3, r0, r0, lsr #1 // d' top bit clear; d' = 0 iff a = 0 + rsb r3, r3, #0 // d' top bit set iff a /= 0 + mov r3, r3, lsr #31 // asr for mask + + rsbs r0, r0, #0 + sbc r0, r0, r0 + rsb r0, r0, #0 + +#elif defined(__aarch64__) + + cmp x0, #0 // trivial + cset.ne x1 // csetm for mask + + cmp xzr, x0 // set cf iff a == 0 + sbc x2, x0, x0 // c' = a - a - 1 + cf = cf - 1 + neg x2, x2 // c' = 1 - cf + + 
sub x3, x0, x0, lsr #1 // if a < 2^63 then a' = ceil(d/2) < + // 2^63 + // if a >= 2^63, write a = 2^63 + t + // with t < 2^63; d' = 2^63 - 2^62 + + // ceil(t/2) = 2^62 + ceil(t/2), and + // ceil(t/2) < 2^62 + // anyway d' < 2^63 and d' = 0 iff + // a = 0 + neg x3, x3 // d' top bit set iff a /= 0 + lsr x3, x3, #63 // asr for mask + + cmp x0, #1 // set cf iff a /= 0 + adc x0, xzr, xzr // a' = 0 + 0 + cf = cf + +#else + notimpl +#endif + ret endproc @@ -166,11 +607,46 @@ endproc proc x03 // set a = min(a, d) (unsigned); clobber c, d + +#if defined(__x86_64__) + sub rdx, rax // d' = d - a; set cf if a > d sbb rcx, rcx // c = -cf = -[a > d] and rcx, rdx // c = a > d ? d - a : 0 add rax, rcx // a' = a > d ? d : a +#elif defined(__i386__) + + sub edx, eax + sbb ecx, ecx + and ecx, edx + add eax, ecx + +#elif defined(__arm__) + + cmp r0, r3 // the easy way + movlo r1, r0 // only needed for out-of-place + movhs r1, r3 + + subs r3, r3, r0 + sbc r12, r12, r12 + and r12, r12, r3 + add r0, r0, r12 + +#elif defined(__aarch64__) + + cmp x0, x3 // the easy way + csel.lo x1, x0, x3 + + subs x3, x3, x0 // d' = d - a; set cf if d >= a + sbc x16, xzr, xzr // t = -1 + cf = -[a > d] + and x16, x16, x3 // t = a > d ? d - a : 0 + add x0, x0, x16 // a' = a > d ? d : a + +#else + notimpl +#endif + ret endproc @@ -178,8 +654,76 @@ endproc proc x04 // switch case? + +#if defined(__x86_64__) + + // unrelated playing + mov ecx, eax + mov rbx, -1 + mov edx, ecx + sub edx, '0' + cmp edx, 10 + cmovb rbx, rdx + or ecx, 0x20 + mov edx, ecx + sub edx, 'a' + sub ecx, 'a' - 10 + cmp edx, 6 + cmovb rbx, rcx + + xor al, 0x20 + +#elif defined(__i386__) + + // unrelated playing + mov ecx, eax + mov ebx, -1 + mov edx, ecx + sub edx, '0' + cmp edx, 10 + cmovb ebx, edx + or ecx, 0x20 + mov edx, ecx + sub edx, 'a' + sub ecx, 'a' - 10 + cmp edx, 6 + cmovb ebx, ecx + xor al, 0x20 +#elif defined(__arm__) + + // unrelated playing + mvn r1, #0 + sub r12, r0, #'0' + cmp r12, #10 + movlo r1, r12 + orr r12, r0, #0x20 + sub r12, r12, #'a' + cmp r12, #6 + addlo r1, r12, #10 + + eor r0, r0, #0x20 + +#elif defined(__aarch64__) + + // unrelated playing + mov x1, #-1 + sub w16, w0, #'0' + cmp w16, #10 + cmov.lo x1, x16 + orr w16, w0, #0x20 + sub w16, w16, #'a' - 10 + cmp w16, #10 + ccmp.hs w16, #16, #CCMP_HS + cmov.lo x1, x16 + + eor w0, w0, #0x20 + +#else + notimpl +#endif + ret endproc @@ -187,6 +731,9 @@ endproc proc x05 // answer whether 5 <= a 4 a > 9 or a < -2^63 + 5 // le/ng a' <= 4 -2^63 + 5 <= a <= 9 +#elif defined(__i386__) + + sub eax, 5 + cmp eax, 4 + +#elif defined(__arm__) + + // i dimly remember having a slick way to do this way back in the + // day, but i can't figure it out any more. + sub r0, #5 + cmp r0, #4 + +#elif defined(__aarch64__) + + // literal translation is too obvious + cmp x0, #5 + ccmp.hs x0, #9, #CCMP_HS + +#else + notimpl +#endif + ret endproc @@ -219,10 +788,35 @@ proc x06 // leave a unchanged, but set zf if a = 0, cf if a /= 0, clear of, // set sf to msb(a) + +#if defined(__x86_64__) + not rax // a' = -a - 1 inc rax // a' = -a neg rax // a' = a +#elif defined(__i386__) + + not eax + inc eax + neg eax + +#elif defined(__arm__) + + mvn r0, r0 + add r0, r0, #1 + rsbs r0, r0, #0 // cf has opposite sense + +#elif defined(__aarch64__) + + mvn x0, x0 + add x0, x0, #1 + negs x0, x0 // cf has opposite sense + +#else + notimpl +#endif + ret endproc @@ -230,11 +824,39 @@ endproc proc x07 // same as before (?) 
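+	//
+	// (it is: the leading inc/neg pair takes a to -a - 1, which is
+	// exactly what x06's `not' did, so the remaining two instructions
+	// -- and the flags they leave -- are the same as before.)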
+ +#if defined(__x86_64__) + inc rax // a' = a + 1 neg rax // a' = -a - 1 inc rax // a' = -a neg rax // a' = a +#elif defined(__i386__) + + inc eax + neg eax + inc eax + neg eax + +#elif defined(__arm__) + + add r0, r0, #1 + rsb r0, r0, #0 + add r0, r0, #1 + rsbs r0, r0, #0 + +#elif defined(__aarch64__) + + add x0, x0, #1 + neg x0, x0 + add x0, x0, #1 + negs x0, x0 // cf has opposite sense + +#else + notimpl +#endif + ret endproc @@ -243,10 +865,45 @@ proc x08 // floor((a + d)/2), correctly handling overflow conditions; final cf // is lsb(a + d), probably uninteresting + +#if defined(__x86_64__) + add rax, rdx // cf || a' = a + d rcr rax, 1 // shift 65-bit result right by one // place; lsb moves into carry +#elif defined(__i386__) + + add eax, edx + rcr eax, 1 + +#elif defined(__arm__) + + // like the two-instruction a64 version + sub r1, r3, r0 + add r1, r0, r1, lsr #1 + + // the slick version, similar to the above + adds r0, r0, r3 + mov r0, r0, rrx + +#elif defined(__aarch64__) + + // a64 lacks a32's rrx. literal translation. + adds x1, x0, x3 // cf || a' = a + d + adc x16, xzr, xzr // realize cf in extra register + extr x1, x16, x1, #1 // shift down one place + + // two instruction version: clobbers additional register. (if you + // wanted the answer in any other register, even overwriting d, then + // this is unnecessary.) also depends on d >= a. + sub x16, x3, x0 // compute difference + add x0, x0, x16, lsr #1 // add half of it (rounded down) + +#else + notimpl +#endif + ret endproc @@ -255,10 +912,33 @@ proc x09 // a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3 // (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8). + +#if defined(__x86_64__) + shr rax, 3 // a' = floor(a/8); cf = 1 if a == // 4, 5, 6, 7 (mod 8) adc rax, 0 // a' = floor(a/8) + cf +#elif defined(__i386__) + + shr eax, 3 + adc eax, 0 + +#elif defined(__arm__) + + movs r0, r0, lsr #3 + adc r0, r0, #0 + +#elif defined(__aarch64__) + + tst x0, #4 + orr x0, xzr, x0, lsr #3 + cinc.ne x0, x0 + +#else + notimpl +#endif + ret endproc @@ -266,11 +946,43 @@ endproc proc x0a // increment c-byte little-endian bignum at rdi + +#if defined(__x86_64__) + add byte ptr [rdi], 1 0: inc rdi adc byte ptr [rdi], 0 loop 0b +#elif defined(__i386__) + + add byte ptr [edi], 1 +0: inc edi + adc byte ptr [edi], 0 + loop 0b + +#elif defined(__arm__) + + mov r12, #256 // set initial carry +0: ldrb r0, [r5] + subs r2, r2, #1 + add r12, r0, r12, lsr #8 + strb r12, [r5], #1 + bne 0b + +#elif defined(__aarch64__) + + mov w17, #256 // set initial carry +0: ldrb w16, [x5] + sub x2, x2, #1 + add w17, w16, w17, lsr #8 + strb w17, [x5], #1 + cbnz x2, 0b + +#else + notimpl +#endif + ret endproc @@ -278,11 +990,36 @@ endproc proc x0b // negate double-precision d:a + +#if defined(__x86_64__) + not rdx // d' = -d - 1 neg rax // a' = -a; // cf = 1 iff a /= 0 sbb rdx, -1 // d' = -d - cf +#elif defined(__i386__) + + not edx + neg eax + sbb edx, -1 + +#elif defined(__arm__) + + // reverse subtract is awesome + rsbs r0, r0, #0 + rsc r3, r3, #0 + +#elif defined(__aarch64__) + + // easy way: everything is better with zero registers. + negs x0, x0 + ngc x3, x3 + +#else + notimpl +#endif + ret endproc @@ -291,6 +1028,8 @@ proc x0c // rotate is distributive over xor. 
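+	//
+	// that is, ROR(a XOR b, n) = ROR(a, n) XOR ROR(b, n) for any n,
+	// because rotation just permutes bit positions and xor acts on
+	// each bit position independently.  the code below checks the
+	// identity for n = 13.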
+#if defined(__x86_64__) + // rax // = a_1 || a_0 // rbx // = b_1 || b_0 mov rcx, rax // = a_1 || a_0 @@ -304,6 +1043,48 @@ proc x0c cmp rax, rcx // always equal +#elif defined(__i386__) + + mov ecx, eax // = a_1 || a_0 + + xor ecx, ebx // = (a_1 XOR b_1) || (a_0 XOR b_0) + ror ecx, 0xd // = (a_0 XOR b_0) || (a_1 XOR b_1) + + ror eax, 0xd // = a_0 || a_1 + ror ebx, 0xd // = b_0 || b_1 + xor eax, ebx // = (a_0 XOR b_0) || (a_1 XOR b_1) + + cmp eax, ecx // always equal + +#elif defined(__arm__) + + + // r0 // = a_1 || a_0 + // r1 // = b_1 || b_0 + eor r2, r0, r1 // = (a_1 XOR b_1) || (a_0 XOR b_0) + mov r2, r2, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1) + + mov r1, r1, ror #13 // = b_0 || b_1 + eor r0, r1, r0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1) + + cmp r0, r2 // always equal + +#elif defined(__aarch64__) + + // x0 // = a_1 || a_0 + // x1 // = b_1 || b_0 + eor x2, x0, x1 // = (a_1 XOR b_1) || (a_0 XOR b_0) + ror x2, x2, #13 // = (a_0 XOR b_0) || (a_1 XOR b_1) + + ror x1, x1, #13 // = b_0 || b_1 + eor x0, x1, x0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1) + + cmp x0, x2 // always equal + +#else + notimpl +#endif + ret endproc @@ -312,6 +1093,8 @@ proc x0d // and is distributive over xor. +#if defined(__x86_64__) + mov rdx, rbx // = b xor rbx, rcx // = b XOR c @@ -324,6 +1107,50 @@ proc x0d cmp rax, rbx // always equal +#elif defined(__i386__) + + mov edx, ebx // = b + + xor ebx, ecx // = b XOR c + and ebx, eax // = a AND (b XOR c) + + and edx, eax // = a AND b + and eax, ecx // = a AND c + xor eax, edx // = (a AND b) XOR (a AND c) + // = a AND (b XOR c) + + cmp eax, ebx // always equal + +#elif defined(__arm__) + + and r3, r0, r1 // = a AND b + + eor r1, r1, r2 // = b XOR c + and r1, r1, r0 // = a AND (b XOR c) + + and r0, r0, r2 // = a AND c + eor r0, r0, r3 // = (a AND b) XOR (a AND c) + // = a AND (b XOR c) + + cmp r0, r1 // always equal + +#elif defined(__aarch64__) + + and x3, x0, x1 // = a AND b + + eor x1, x1, x2 // = b XOR c + and x1, x1, x0 // = a AND (b XOR c) + + and x0, x0, x2 // = a AND c + eor x0, x0, x3 // = (a AND b) XOR (a AND c) + // = a AND (b XOR c) + + cmp x0, x1 // always equal + +#else + notimpl +#endif + ret endproc @@ -332,6 +1159,8 @@ proc x0e // de morgan's law +#if defined(__x86_64__) + mov rcx, rax // = a and rcx, rbx // = a AND b @@ -342,7 +1171,46 @@ proc x0e or rax, rbx // = (NOT a) OR (NOT b) // = NOT (a AND b) - cmp rax, rcx + cmp rax, rcx // always equal + +#elif defined(__i386__) + + mov ecx, eax // = a + + and ecx, ebx // = a AND b + not ecx // = NOT (a AND b) + + not eax // = NOT a + not ebx // = NOT b + or eax, ebx // = (NOT a) OR (NOT b) + // = NOT (a AND b) + + cmp eax, ecx // always equal + +#elif defined(__arm__) + + and r2, r0, r1 // = a AND b + mvn r2, r2 // = NOT (a AND b) + + mvn r0, r0 // = NOT a + mvn r1, r1 // = NOT b + orr r0, r0, r1 // = (NOT a) OR (NOT b) + + cmp r0, r2 // always equal + +#elif defined(__aarch64__) + + and x2, x0, x1 // = a AND b + mvn x2, x2 // = NOT (a AND b) + + mvn x0, x0 // = NOT a + orn x0, x0, x1 // = (NOT a) OR (NOT b) + + cmp x0, x2 // always equal + +#else + notimpl +#endif ret @@ -355,20 +1223,51 @@ proc x0f // // not sure why you'd do this. 
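+	//
+	// (whatever the reason, the effect is to replace each byte of the
+	// buffer by the running xor of a's low byte and the buffer bytes
+	// so far, i.e., x[i] <- a XOR x[0] XOR ... XOR x[i], leaving the
+	// final xor in the low byte of a.)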
- cld +#if defined(__x86_64__) 0: xor [rsi], al lodsb loop 0b +#elif defined(__i386__) + +0: xor [esi], al + lodsb + loop 0b + +#elif defined(__arm__) + +0: ldrb r12, [r4] + subs r2, r2, #1 + eor r0, r0, r12 + strb r0, [r4], #1 + bne 0b + +#elif defined(__aarch64__) + +0: ldrb w16, [x4] + sub x2, x2, #1 + eor w0, w0, w16 + strb w0, [x4], #1 + cbnz x2, 0b + +#else + notimpl +#endif + ret endproc +///-------------------------------------------------------------------------- +/// 0x10--0x1f + proc x10 // four different ways to swap a pair of registers. +#if defined(__x86_64__) + push rax push rcx pop rax @@ -385,6 +1284,76 @@ proc x10 xchg rax, rcx +#elif defined(__i386__) + + push eax + push ecx + pop eax + pop ecx + + xor eax, ecx + xor ecx, eax + xor eax, ecx + + add eax, ecx + sub ecx, eax + add eax, ecx + neg ecx + + xchg eax, ecx + +#elif defined(__arm__) + + stmfd r13!, {r0, r2} + ldr r0, [r13, #4] + ldr r2, [r13], #8 + + eor r0, r0, r2 + eor r2, r2, r0 + eor r0, r0, r2 + + sub r0, r0, r2 + add r2, r2, r0 + rsb r0, r0, r2 // don't need 3-addr with reverse-sub + + mov r12, r0 + mov r0, r2 + mov r2, r0 + +#elif defined(__aarch64__) + + // anything you can do + stp x0, x2, [sp, #-16]! + ldp x2, x0, [sp], #16 + + eor x0, x0, x2 + eor x2, x2, x0 + eor x0, x0, x2 + + // the add/sub/add thing was daft. you can do it in three if you're + // clever -- and have three-address operations. + sub x0, x0, x2 + add x2, x2, x0 + sub x0, x2, x0 + + // but we lack a fourth. we can't do this in fewer than three + // instructions without hitting memory. only `ldp' will modify two + // registers at a time, so we need at least two instructions -- but + // if the first one sets one of our two registers to its final value + // then we lose the other input value with no way to recover it, so + // we must either write a fresh third register, or write something + // other than the final value, and in both cases we need a third + // instruction to fix everything up. we've done the wrong-something- + // other trick twice, so here's the captain-obvious use-a-third- + // register version. + mov x16, x0 + mov x0, x2 + mov x2, x16 + +#else + notimpl +#endif + ret endproc @@ -398,6 +1367,8 @@ proc x11 // in particular, a will be zero (and zf set) if and only if the two // strings are equal. +#if defined(__x86_64__) + 0: mov dl, [rsi] xor dl, [rdi] inc rsi @@ -405,6 +1376,37 @@ proc x11 or al, dl loop 0b +#elif defined(__i386__) + +0: mov dl, [esi] + xor dl, [edi] + inc esi + inc edi + or al, dl + loop 0b + +#elif defined(__arm__) + +0: ldrb r1, [r4], #1 + ldrb r12, [r5], #1 + subs r2, r2, #1 + eor r12, r12, r1 + orr r0, r0, r12 + bne 0b + +#elif defined(__aarch64__) + +0: ldrb w16, [x4], #1 + ldrb w17, [x5], #1 + sub x2, x2, #1 + eor w16, w16, w17 + orr w0, w0, w16 + cbnz x2, 0b + +#else + notimpl +#endif + ret endproc @@ -418,11 +1420,36 @@ proc x12 // move all of the set bits in d to a, unless there's already a bit // there. this clearly doesn't change the sum. 
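+	//
+	// equivalently, a + d = (a OR d) + (a AND d): the or collects each
+	// bit from whichever operand has it, and the and keeps the bits
+	// set in both, which must be counted twice.  for example, with
+	// a = 6 = 0110 and d = 3 = 0011, we get a OR d = 0111 = 7 and
+	// a AND d = 0010 = 2, and indeed 7 + 2 = 9 = 6 + 3.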
+#if defined(__x86_64__) + mov rcx, rdx // c' = d and rdx, rax // d' = a AND d or rax, rcx // a' = a OR d add rax, rdx +#elif defined(__i386__) + + mov ecx, edx // c' = d + and edx, eax // d' = a AND d + or eax, ecx // a' = a OR d + add eax, edx + +#elif defined(__arm__) + + and r2, r0, r3 // c' = a AND d + orr r0, r0, r3 // a' = a OR d + add r0, r0, r2 + +#elif defined(__aarch64__) + + and x2, x0, x3 // c' = a AND d + orr x0, x0, x3 // a' = a OR d + add x0, x0, x2 + +#else + notimpl +#endif + ret endproc @@ -432,13 +1459,46 @@ proc x13 // ok, so this is a really obtuse way of adding a and b; the result // is in a and d. but why does it work? +#if defined(__x86_64__) + mov rcx, 0x40 // carry chains at most 64 long 0: mov rdx, rax // copy a' xor rax, rbx // low bits of each bitwise sum and rbx, rdx // carry bits from each bitwise sum - shl rbx, 001 // carry them into next position + shl rbx, 1 // carry them into next position + loop 0b + +#elif defined(__i386__) + + mov ecx, 0x40 // carry chains at most 64 long +0: mov edx, eax // copy a' + xor eax, ebx // low bits of each bitwise sum + and ebx, edx // carry bits from each bitwise sum + shl ebx, 1 // carry them into next position loop 0b +#elif defined(__arm__) + + mov r2, #0x40 +0: and r3, r0, r1 + subs r2, r2, #1 + eor r0, r0, r1 + lsl r1, r3, #1 + bne 0b + +#elif defined(__aarch64__) + + mov x2, #0x40 +0: and x3, x0, x1 + sub x2, x2, #1 + eor x0, x0, x1 + lsl x1, x3, #1 + cbnz x2, 0b + +#else + notimpl +#endif + ret endproc @@ -447,6 +1507,8 @@ proc x14 // floor((a + d)/2), like x08. +#if defined(__x86_64__) + mov rcx, rax // copy a for later and rcx, rdx // carry bits @@ -455,6 +1517,32 @@ proc x14 add rax, rcx // add the carries; done +#elif defined(__i386__) + + mov ecx, eax // copy a for later + and ecx, edx // carry bits + + xor eax, edx // low bits of each bitwise sum + shr eax, 1 // divide by 2; carries now in place + + add eax, ecx // add the carries; done + +#elif defined(__arm__) + + and r2, r0, r3 + eor r0, r0, r3 + add r0, r2, r0, lsr #1 + +#elif defined(__aarch64__) + + and x2, x0, x3 + eor x0, x0, x3 + add x0, x2, x0, lsr #1 + +#else + notimpl +#endif + ret endproc @@ -463,7 +1551,9 @@ proc x15 // sign extension 32 -> 64 bits. - //movsx rbx, eax // like this? +#if defined(__x86_64__) + + movsx rbx, eax // like this? mov rdx, 0xffffffff80000000 add rax, rdx // if bit 31 of a is set then bits @@ -472,15 +1562,46 @@ proc x15 // exactly backwards xor rax, rdx // so fix it +#elif defined(__i386__) + + movsx ebx, ax // like this? + + mov edx, 0xffff8000 + add eax, edx // if bit 31 of a is set then bits + // 31--63 of a' are clear; otherwise, + // these bits are all set -- which is + // exactly backwards + xor eax, edx // so fix it + +#elif defined(__arm__) + + sxth r1, r0 // like this + + mov r12, #0x80000000 + add r0, r0, r12, asr #16 + eor r0, r0, r12, asr #16 + +#elif defined(__aarch64__) + + sxtw x1, w0 // like this + + mov x16, #0xffffffff80000000 + add x0, x0, x16 + eor x0, x0, x16 + +#else + notimpl +#endif + ret endproc proc x16 - //shl rax, 56 - //shl rbx, 56 - //shl rcx, 56 + // ??? i don't know why you'd want to calculate this. + +#if defined(__x86_64__) xor rax, rbx // a' = a XOR b xor rbx, rcx // b' = b XOR c @@ -490,67 +1611,725 @@ proc x16 xor rax, rbx // a' = cf ? 0 : a XOR c cmp rax, rsi +#elif defined(__i386__) + + xor eax, ebx // a' = a XOR b + xor ebx, ecx // b' = b XOR c + mov esi, eax // t = a XOR b + add esi, ebx // t = (a XOR b) + (b XOR c) + cmovc eax, ebx // a' = cf ? 
b XOR c : a XOR b + xor eax, ebx // a' = cf ? 0 : a XOR c + cmp eax, esi + +#elif defined(__arm__) + + eor r0, r0, r1 + eor r1, r1, r2 + adds r4, r0, r1 + movcs r0, r1 + eor r0, r0, r1 + cmp r0, r4 + +#elif defined(__aarch64__) + + eor x0, x0, x1 + eor x1, x1, x2 + adds x4, x0, x1 + cmov.cs x0, x1 + eor x0, x0, x1 + cmp x0, x4 + +#else + notimpl +#endif + ret endproc proc x17 - ud2 + // absolute value + +#if defined(__x86_64__) + + cqo // d = a < 0 ? -1 : 0 + xor rax, rdx // a' = a < 0 ? -a - 1 : a + sub rax, rdx // a' = a < 0 ? -a : a + +#elif defined(__i386__) + + cdq // d = a < 0 ? -1 : 0 + xor eax, edx // a' = a < 0 ? -a - 1 : a + sub eax, edx // a' = a < 0 ? -a : a + +#elif defined(__arm__) + + // direct approach + movs r1, r0 + rsbmi r1, r0, #0 + + // faithful-ish conversion + eor r3, r0, r0, asr #31 + sub r0, r3, r0, asr #31 + +#elif defined(__aarch64__) + + // direct approach + tst x0, #1 << 63 + cneg.ne x1, x0 + + // faithful-ish conversion + eor x3, x0, x0, asr #63 + sub x0, x3, x0, asr #63 + +#else + notimpl +#endif + + ret endproc proc x18 - ud2 + // should always set sf, clear zf, unless we get rescheduled to a + // different core. + +#if defined(__x86_64__) + + rdtsc // d || a = cycles + shl rdx, 0x20 + or rax, rdx // a = cycles + mov rcx, rax // c = cycles + + rdtsc // d || a = cycles' + shl rdx, 0x20 + or rax, rdx // a = cycles' + + cmp rcx, rax + +#elif defined(__i386__) + + rdtsc // d || a = cycles + mov ebx, eax + mov ecx, edx // c || b = cycles + + rdtsc // d || a = cycles' + + sub ebx, eax + sbb ecx, edx + +#elif defined(__arm__) + + // cycle clock not available in user mode + mrrc p15, 0, r0, r1, c9 + mrrc p15, 0, r2, r3, c9 + subs r0, r0, r2 + sbcs r1, r1, r3 + +#elif defined(__aarch64__) + + // cycle clock not available in user mode + mrs x0, pmccntr_el0 + mrs x1, pmccntr_el0 + cmp x0, x1 + +#else + notimpl +#endif + + ret endproc proc x19 - ud2 + // stupid way to capture a pointer to inline data and jump past it. + // confuses the return-address predictor something chronic. worse + // because amd64 calling convention doesn't usually pass arguments on + // the stack. + +#if defined(__x86_64__) + + call 8f + .string "hello world!\n\0" +8: call print_str + add rsp, 8 + ret + +print_str: + // actually implement this ridiculous thing + mov rsi, [rsp + 8] + xor edx, edx +0: mov al, [rsi + rdx] + inc rdx + cmp al, 0 + jnz 0b + mov eax, SYS_write + mov edi, 1 + dec rdx + syscall // clobbers r11 :-( + ret + +#elif defined(__i386__) + + call 8f + .string "hello world!\n\0" +8: call print_str + add esp, 4 + ret + +print_str: + // actually implement this ridiculous thing + mov ecx, [esp + 4] + xor edx, edx +0: mov al, [ecx + edx] + inc edx + cmp al, 0 + jnz 0b + mov eax, SYS_write + mov ebx, 1 + dec edx + int 0x80 + ret + +#elif defined(__arm__) + + // why am i doing this? + stmfd r13!, {r14} + bl 8f + .string "hello world!\n\0" + .balign 4 +8: mov r1, r14 // might as well make it easy on myself + bl print_str + ldmfd r13!, {pc} + +print_str: + mov r2, #0 +0: ldrb r0, [r1, r2] + cmp r0, #0 + addne r2, r2, #1 + bne 0b + mov r0, #1 + mov r7, #SYS_write + swi 0 + bx r14 + +#elif defined(__aarch64__) + + // why am i doing this? + str x30, [sp, #-16]! 
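+	// (a whole 16-byte slot for one register, because the aarch64 abi
+	// wants sp to stay 16-byte aligned)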
+ bl 8f + .string "hello world!\n\0" + .balign 4 +8: mov x1, x30 // might as well make it easy on myself + bl print_str + ldr x30, [sp], #16 + ret + +print_str: + mov x2, #0 +0: ldrb w0, [x1, x2] + cmp w0, #0 + cinc.ne x2, x2 + b.ne 0b + mov x0, #1 + mov x8, #SYS_write + svc #0 + ret + +#else + notimpl +#endif endproc proc x1a - ud2 + // collect the current instruction-pointer address. this was an old + // 32-bit i386 trick for position-independent code, but (a) it + // confuses the return predictor, and (b) amd64 has true pc-relative + // addressing. + +#if defined(__x86_64__) + + // the actual example + call 0f +0: pop rax + + // the modern i386 trick doesn't confuse the return-address + // predictor. + call calladdr_rbx + sub rbx, . - 0b + + // but rip-relative addressing is even better + lea rcx, [rip + 0b] + + ret + +calladdr_rbx: + mov rbx, [rsp] + ret + +#elif defined(__i386__) + + // the actual example + call 0f +0: pop eax + + // the modern i386 trick doesn't confuse the return-address + // predictor. + call get_pc_ebx + sub ebx, . - 0b + + ret + +#elif defined(__arm__) + + stmfd r13!, {r14} + + bl 0f +0: mov r0, r14 + + bl return + sub r1, r14, #. - 0b + + adr r2, 0b + + ldmfd r13!, {pc} + +return: bx r14 + +#elif defined(__aarch64__) + + str x30, [sp, #-16]! + + // we can do all of the above using a64 + bl 0f +0: mov x0, x30 + + bl return + sub x1, x30, #. - 0b + + adr x2, 0b + + ldr x30, [sp], #16 +return: ret + +#else + notimpl +#endif endproc proc x1b - ud2 +#if defined(__x86_64__) + + // retpolines: an mitigation against adversarially influenced + // speculative execution at indirect branches. if an adversary can + // prepare a branch-target buffer entry matching an indirect branch + // in the victim's address space then they can cause the victim to + // /speculatively/ (but not architecturally) execute any code in + // their address space, possibly leading to leaking secrets through + // the cache. retpolines aren't susceptible to this because the + // predicted destination address is from the return-prediction stack + // which the adversary can't prime. the performance penalty is still + // essentially a branch misprediction -- for this return, and + // possibly all others already stacked. + + // (try not to crash) + lea rax, [rip + 9f] + + push rax +9: ret + +#elif defined(__i386__) + + call get_pc_ebx + lea eax, [ebx + 9f - .] + + push eax +9: ret + +#elif defined(__arm__) + + stmfd r13!, {r14} + + adr r14, 8f + bx r14 + +8: ldmfd r13!, {pc} + +#elif defined(__aarch64__) + + str x30, [sp, #-16]! + + adr x30, 8f + ret + +8: ldr x30, [sp], #16 + ret + +#else + notimpl +#endif endproc proc x1c - ud2 + // ok, having a hard time seeing a use for this. the most important + // thing to note is that sp is set from `pop' /after/ it's + // incremented. + +#if defined(__x86_64__) + + // try not to crash + mov rax, rsp + and rsp, -16 + push rax + + pop rsp + + // check it worked + mov rbx, rsp + ret + +#elif defined(__i386__) + + // try not to crash + mov eax, esp + and esp, -16 + push eax + + pop esp + + // check it worked + mov ebx, esp + ret + +#elif defined(__arm__) + + // not even going to dignify this + notimpl + +#elif defined(__aarch64__) + + // not even going to dignify this + notimpl + +#else + notimpl +#endif endproc proc x1d - ud2 + // monumentally cheesy way to copy 8 n bytes from buff1 to buff2. + // also clobbers words at buff2 + 8 n and buff2 - 8 for good measure. 
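+	//
+	// (on x86, the trick is `enter's second operand, the lexical
+	// nesting level: at level n + 1, the cpu pushes rbp, copies the
+	// n words below the old rbp -- here pointed at buff1 -- onto the
+	// stack -- here pointed at buff2 -- and then pushes the new frame
+	// pointer.  see the stack diagram below.)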
+ + n = 4 + +#if defined(__x86_64__) + + mov rax, rsp // safekeeping + + // we're toast if we get hit by a signal now. fingers crossed... + .if 0 + mov rsp, buff2 + 8*n + 8 + mov rbp, buff1 + 8*n + .else + lea rsp, [rdi + 8*n + 16] + lea rbp, [rsi + 8*n] + .endif + enter 0, n + 1 + + // precise action: + // + // +---------+ +---------+ + // rbp -> | ??? | rsp -> | ??? | + // +---------+ +---------+ + // | w_{n-1} | | rbp | <- rbp' + // +---------+ +---------+ + // | ... | | w_{n-1} | + // +---------+ +---------+ + // | w_1 | | ... | + // +---------+ +---------+ + // | w_0 | | w_1 | + // +---------+ +---------+ + // | w_0 | + // +---------+ + // | rbp' | <- rsp' + // +---------+ + + mov rdx, rsp + mov rsp, rax + +#elif defined(__i386__) + + mov eax, esp // safekeeping + + // we're toast if we get hit by a signal now. fingers crossed... + .if 0 + mov esp, buff2 + 4*n + 4 + mov ebp, buff1 + 4*n + .else + lea esp, [edi + 4*n + 8] + lea ebp, [esi + 4*n] + .endif + enter 0, n + 1 + + mov edx, esp + mov esp, eax + +#elif defined(__arm__) + + add r4, r4, #4*n + add r5, r5, #4*n + 8 + + str r4, [r5, #-4]! + .rept n/2 + ldrd r0, r1, [r4, #-8]! + strd r0, r1, [r5, #-8]! + .endr + add r4, r5, #4*n + str r4, [r5, #-4]! + +#elif defined(__aarch64__) + + // omgwtf. let's not actually screw with the stack pointer. + + add x4, x4, #8*n + add x5, x5, #8*n + 16 + + str x4, [x5, #-8]! + .rept n/2 + ldp x16, x17, [x4, #-16]! + stp x16, x17, [x5, #-16]! + .endr + add x4, x5, #8*n + str x4, [x5, #-8]! + +#else + notimpl +#endif + + ret endproc proc x1e - ud2 + // convert nibble value to (uppercase) hex; other input values yield + // nonsense. + +#if defined(__x86_64__) + + // das doesn't work in 64-bit mode; best i can come up with + mov edx, eax + add al, '0' + add dl, 'A' - 10 + cmp al, '9' + 1 + cmovae eax, edx + +#elif defined(__i386__) + + cmp al, 0x0a // cf = 1 iff a < 10 + sbb al, 0x69 // if 0 <= a < 10, a' = a - 0x6a, so + // 0x96 <= a' < 0x70, setting af, cf + // if 10 <= a < 16, a' = a - 0x69, so + // 0x71 <= a' < 0x77, setting cf but + // clearing af + das // if 0 <= a < 10, then af and cf are + // both set, so set subtract 0x66 + // from a' leaving 0x30 <= a' < 0x3a; + // if 10 <= a < 16 then af clear but + // cf set, so subtract 0x60 from a' + // leaving 0x41 <= a' < 0x47 + +#elif defined(__arm__) + + // significantly less tricksy + cmp r0, #10 + addlo r0, r0, #'0' + addhs r0, r0, #'A' - 10 + +#elif defined(__aarch64__) + + // with less versatile conditional execution this is the best we can + // do + cmp w0, #10 + add w16, w0, #'A' - 10 + add w0, w0, #'0' + cmov.hs w0, w16 + +#else + notimpl +#endif + + ret endproc proc x1f - ud2 + // verify collatz conjecture starting at a; assume a /= 0! + +#if defined(__x86_64__) + +0: bsf rcx, rax // clobber c if a = 0 + shr rax, cl // a = 2^c a' + cmp rdx, 0 + je 1f + stosq + dec rdx +1: + cmp rax, 1 // done? + je 9f + lea rax, [2*rax + rax + 1] // a' = 3 a' + 1 + jmp 0b // again + +9: ret + +#elif defined(__i386__) + +0: bsf ecx, eax // clobber c if a = 0 + shr eax, cl // a = 2^c a' + cmp edx, 0 + je 1f + stosd + dec edx +1: + cmp eax, 1 // done? 
+ je 9f + lea eax, [2*eax + eax + 1] // a' = 3 a' + 1 + jmp 0b // again + +9: ret + +#elif defined(__arm__) + + // rbit introduced in armv7 +0: rbit r2, r0 + clz r2, r2 + mov r0, r0, lsr r2 // a = 2^c a' + cmp r3, #0 + strne r0, [r5], #4 + subne r3, r3, #1 + cmp r0, #1 + adcne r0, r0, r0, lsl #1 // a' = 3 a' + 1 (because c set) + bne 0b + + ret + +#elif defined(__aarch64__) + +0: rbit w2, w0 + clz w2, w2 + lsr w0, w0, w2 // a = 2^c a' + cmp x3, #0 + beq 1f + str x0, [x5], #8 + sub x3, x3, #1 +1: + cmp w0, #1 + add w16, w0, w0, lsl #1 // t = 3 a' + 1 (because c set) + csinc.eq w0, w0, w16 + b.ne 0b + + ret + +#else + notimpl +#endif endproc +///-------------------------------------------------------------------------- +/// 0x20--0x2f + proc x20 - ud2 + // calculate 1337 a slowly + +#if defined(__x86_64__) + + // original version + mov rcx, rax // c = a + shl rcx, 2 // c = 4 a + add rcx, rax // c = 5 a + shl rcx, 3 // c = 40 a + add rcx, rax // c = 41 a + shl rcx, 1 // c = 82 a + add rcx, rax // c = 83 a + shl rcx, 1 // c = 166 a + add rcx, rax // c = 167 a + shl rcx, 3 // c = 1336 a + add rcx, rax // c = 1337 a + + // a quick way + lea rdx, [2*rax + rax] // t = 3 a + shl rdx, 6 // t = 192 a + sub rdx, rax // t = 191 a + lea rbx, [8*rdx] // b = 1528 a + sub rbx, rdx // b = 1337 a + +#elif defined(__i386__) + + // original version + mov ecx, eax // c = a + shl ecx, 2 // c = 4 a + add ecx, eax // c = 5 a + shl ecx, 3 // c = 40 a + add ecx, eax // c = 41 a + shl ecx, 1 // c = 82 a + add ecx, eax // c = 83 a + shl ecx, 1 // c = 166 a + add ecx, eax // c = 167 a + shl ecx, 3 // c = 1336 a + add ecx, eax // c = 1337 a + + // a quick way + lea edx, [2*eax + eax] // t = 3 a + shl edx, 6 // t = 192 a + sub edx, eax // t = 191 a + lea ebx, [8*edx] // b = 1528 a + sub ebx, edx // b = 1337 a + +#elif defined(__arm__) + + // original version, ish + add r2, r0, r0, lsl #2 // c = 5 a + add r2, r0, r2, lsl #3 // c = 41 a + add r2, r0, r2, lsl #1 // c = 83 a + add r2, r0, r2, lsl #1 // c = 167 a + add r2, r0, r2, lsl #3 // c = 1337 a + + // quicker way + add r1, r0, r0, lsl #1 // b = 3 a + rsb r1, r0, r1, lsl #6 // b = 191 a + rsb r1, r1, r1, lsl #3 // b = 1337 a + +#elif defined(__aarch64__) + + // original version, ish + add x2, x0, x0, lsl #2 // c = 5 a + add x2, x0, x2, lsl #3 // c = 41 a + add x2, x0, x2, lsl #1 // c = 83 a + add x2, x0, x2, lsl #1 // c = 167 a + add x2, x0, x2, lsl #3 // c = 1337 a + + // sleazy because no rsb + add x1, x0, x0, lsl #1 // b = 3 a + sub x1, x0, x1, lsl #6 // b = -191 a + sub x1, x1, x1, lsl #3 // b = 1337 a + +#else + notimpl +#endif ret @@ -558,97 +2337,667 @@ endproc proc x21 - ud2 + // multiply complex numbers a + b i and c + d i + // + // (a + b i) (c + d i) = (a c - b d) + (a d + b c) i + // + // somewhat slick approach uses only three multiplications + +#if defined(__x86_64__) + + mov rsi, rax // t = a + add rax, rbx // a' = a + b + mov rdi, rdx // u = d + sub rdx, rcx // d' = d - c + add rdi, rcx // u = c + d + + imul rax, rcx // a' = c (a + b) + imul rsi, rdx // t = a (d - c) + imul rdi, rbx // u = b (c + d) + + add rsi, rax // t = a (d - c) + c (a + b) + mov rbx, rsi // b' = a (d - c) + c (a + b) + // = a d + b c + sub rax, rdi // a' = c (a + b) - b (c + d) + // = a c - b d + +#elif defined(__i386__) + + mov esi, eax // t = a + add eax, ebx // a' = a + b + mov edi, edx // u = d + sub edx, ecx // d' = d - c + add edi, ecx // u = c + d + + imul eax, ecx // a' = c (a + b) + imul esi, edx // t = a (d - c) + imul edi, ebx // u = b (c + d) + + add esi, eax // t = a (d 
- c) + c (a + b) + mov ebx, esi // b' = a (d - c) + c (a + b) + // = a d + b c + sub eax, edi // a' = c (a + b) - b (c + d) + // = a c - b d + +#elif defined(__arm__) + + add r4, r0, r1 // t = a + b + add r5, r2, r3 // u = c + d + sub r3, r3, r2 // d' = d - c + + // mls introduced in armv7 + mul r4, r4, r2 // t = c (a + b) + mov r2, r1 // c' = a (bah!) + mla r1, r0, r3, r4 // b' = a (d - c) + c (a + b) + // = a d + b c + mls r0, r2, r5, r4 // a' = c (a + b) - b (c + d) + // = a c - b d + +#elif defined(__aarch64__) + + add x4, x0, x1 // t = a + b + add x5, x2, x3 // u = c + d + sub x3, x3, x2 // d' = d - c + + // mls intxoduced in axmv7 + mul x4, x4, x2 // t = c (a + b) + mov x2, x1 // c' = a (bah!) + madd x1, x0, x3, x4 // b' = a (d - c) + c (a + b) + // = a d + b c + msub x0, x2, x5, x4 // a' = c (a + b) - b (c + d) + // = a c - b d + +#else + notimpl +#endif + + ret endproc proc x22 - ud2 + // divide by 3 + +#if defined(__x86_64__) + + mov rdx, 0xaaaaaaaaaaaaaaab // = ceil(2/3 2^64) + mul rdx // d' || a' =~ 2/3 a 2^64 + shr rdx, 1 // d' = floor(a/3) + mov rax, rdx // a' = floor(a/3) + + // we start with 0 <= a < 2^64. write f = ceil(2/3 2^64), so that + // 2/3 < f/2^64 < 2/3 + 1/2^64. then floor(2/3 a) <= floor(a f/2^64) + // <= floor(2/3 a + a/2^64), but a < 2^64 so a/2^64 < 1 and + // floor(a f/2^64) = floor(2/3 a). + +#elif defined(__i386__) + + mov edx, 0xaaaaaaab // = ceil(2/3 2^32) + mul edx // d' || a' =~ 2/3 a 2^32 + shr edx, 1 // d' = floor(a/3) + mov eax, edx // a' = floor(a/3) + +#elif defined(__arm__) + + ldr r12, =0xaaaaaaab + umull r12, r0, r0, r12 + mov r0, r0, lsr #1 + +#elif defined(__aarch64__) + + ldr x16, =0xaaaaaaaaaaaaaaab + umulh x0, x0, x16 + lsr x0, x0, #1 + +#else + notimpl +#endif + + ret endproc proc x23 - ud2 +#if defined(__x86_64__) + + // main loop: shorten a preserving residue class mod 3 +0: cmp rax, 5 + jbe 8f + // a > 5 + mov rdx, rax // d' = a + shr rdx, 2 // d' = floor(a/4) + and rax, 3 // a = 4 d' + a' (0 <= a' < 4) + add rax, rdx // a' == a (mod 3) but a' < a/4 + 4 + jmp 0b + + // fix up final value 0 <= a < 6: want 0 <= a < 3 + // + // the tricky part is actually a = 3; but the other final cases take + // additional iterations which we can avoid. +8: cmp rax, 3 // set cf iff a < 3 + cmc // set cf iff a >= 3 + sbb rdx, rdx // d' = a >= 3 ? -1 : 0 + and rdx, 3 // d' = a >= 3 ? 3 : 0 + sub rax, rdx // a' = a - (a >= 3 ? 3 : 0) + // = a (mod 3) + +#elif defined(__i386__) + + // main loop: shorten a preserving residue class mod 3 +0: cmp eax, 5 + jbe 8f + // a > 5 + mov edx, eax // d' = a + shr edx, 2 // d' = floor(a/4) + and eax, 3 // a = 4 d' + a' (0 <= a' < 4) + add eax, edx // a' == a (mod 3) but a' < a/4 + 4 + jmp 0b + + // fix up final value 0 <= a < 6: want 0 <= a < 3 + // + // the tricky part is actually a = 3; but the other final cases take + // additional iterations which we can avoid. +8: cmp eax, 3 // set cf iff a < 3 + cmc // set cf iff a >= 3 + sbb edx, edx // d' = a >= 3 ? -1 : 0 + and edx, 3 // d' = a >= 3 ? 3 : 0 + sub eax, edx // a' = a - (a >= 3 ? 
3 : 0) + // = a (mod 3) + +#elif defined(__arm__) + +0: cmp r0, #6 + andhs r12, r0, #3 + addhs r0, r12, r0, lsr #2 + bhs 0b + + cmp r0, #3 + subhs r0, r0, #3 + +#elif defined(__aarch64__) + +0: cmp x0, #6 + // blunder on through regardless since this doesn't affect the result + and x16, x0, #3 + add x0, x16, x0, lsr #2 + b.hs 0b + + subs x16, x0, #3 + cmov.hs x0, x16 + +#else + notimpl +#endif + + ret endproc proc x24 - ud2 + // invert (odd) a mod 2^64 + // + // suppose a a_i == 1 (mod 2^{2^i}) + // + // clearly good for i = 0, since 2^i = 1 and 2^{2^i} = 2, and a_0 = + // a == 1 (mod 2) by assumption + // + // write a a_i == b_i 2^{2^i} + 1 (mod 2^{2^{i+1}}) + // then b_i == (a a_i - 1)/2^{2^i} (mod 2^{2^i}) + // to lift inverse, we want x such that a x == -b_i (mod 2^{2^i}); + // clearly x = -a_i b_i will do, since a a_i == 1 (mod 2^{2^i}) + // then: + // a_{i+1} = a_i - a_i b_i 2^{2^i} = a_i (1 - (a a_i - 1)) + // = 2 a_i - a a_i^2 + // + // check: + // a a_{i+1} = 2 a a_i - a^2 a_i^2 + // == 2 a a_i - (b_i 2^{2^i} + 1)^2 + // == 2 (b_i 2^{2^i} + 1) - + // (b_i^2 2^{2^{i+1}} + 2 b_i 2^{2^i} + 1) + // == 1 (mod 2^{2^{i+1}}) + +#if defined(__x86_64__) + + // rax // a_0 = a + mov rbx, rax // b' = a + mov rsi, rax // t = a_0 + +0: + cmp rbp, 0 + je 1f + stosq + dec rbp +1: + mul rbx // a' = a a_i + mov rcx, rax // c = a a_i + + sub rax, 2 // a' = a a_i - 2 + neg rax // a' = 2 - a a_i + mul rsi // a_{i+1} = a_i (2 - a a_i) + // = 2 a_i - a a_i^2 + mov rsi, rax // t = a_{i+1} + + cmp rcx, 1 // done? + ja 0b // no -- iterate + +#elif defined(__i386__) + + // eax // a_0 = a + mov ebx, eax // b' = a + mov esi, eax // t = a_0 + +0: + cmp ebp, 0 + je 1f + stosd + dec ebp +1: + mul ebx // a' = a a_i + mov ecx, eax // c = a a_i + + sub eax, 2 // a' = a a_i - 2 + jb 9f // done if < 2 + neg eax // a' = 2 - a a_i + mul esi // a_{i+1} = a_i (2 - a a_i) + // = 2 a_i - a a_i^2 + mov esi, eax // t = a_{i+1} + + jmp 0b // and iterate +9: mov eax, esi // restore + +#elif defined(__arm__) + + // r0 // a_0 = a + mov r1, r0 // b' = a + +0: + cmp r6, #0 + strne r0, [r5], #4 + subne r6, r6, #1 + mul r2, r0, r1 // c = a a_i + rsbs r2, r2, #2 // c = 2 - a a_i + mul r0, r0, r2 // a_{i+1} = a_i (2 - a a_i) + // = 2 a_i - a a_i^2 + blo 0b + +#elif defined(__aarch64__) + + // x0 // a_0 = a + mov x1, x0 // b' = a + mov x16, #2 // because we have no rsb + +0: + cmp x6, #0 + b.eq 1f + str x0, [x5], #8 + sub x6, x6, #1 +1: + mul x2, x0, x1 // c = a a_i + subs x2, x16, x2 // c = 2 - a a_i + mul x0, x0, x2 // a_{i+1} = a_i (2 - a a_i) + // = 2 a_i - a a_i^2 + b.lo 0b + +#else + notimpl +#endif + + ret endproc proc x25 - ud2 + // a poor approximation to pi/4 + // + // think of x and y as being in 16.16 fixed-point format. we sample + // points in the unit square, and determine how many of them are + // within a unit quarter-circle centred at the origin. the area of + // the quarter-circle is pi/4. + +#if defined(__x86_64__) + + xor eax, eax // a = 0 + mov rcx, 1 + shl rcx, 0x20 // c =~ 4 billion + +0: movzx rbx, cx // x = low 16 bits of c + imul rbx, rbx // b = x^2 + + ror rcx, 0x10 // switch halves of c + movzx rdx, cx // y = high 16 bits of c + imul rdx, rdx // d = y^2 + rol rcx, 0x10 // switch back + + add rbx, rdx // r^2 = x^2 + y^2 + shr rbx, 0x20 // r^2 >= 1? + cmp rbx, 1 // set cf iff r^2 >= 1 + adc rax, 0 // and add onto accumulator + loop 0b + +#elif defined(__i386__) + + // this is actually better done in 32 bits. 
the carry has the wrong + // sense here, so instead deduct one for each point outside the + // quarter-circle rather than adding one for each point inside it. + xor eax, eax + xor ecx, ecx + +0: movzx ebx, cx + imul ebx, ebx + + ror ecx, 0x10 + movzx edx, cx + imul edx, edx + rol ecx, 0x10 + + add ebx, edx // see? + sbb eax, 0 + loop 0b + +#elif defined(__arm__) + + mov r0, #0 + mov r2, #0 + +0: uxth r1, r2, ror #0 + uxth r3, r2, ror #16 + mul r1, r1, r1 + mul r3, r3, r3 + cmn r1, r3 // mlas doesn't set cf usefully + addcc r0, r0, #1 + adds r2, r2, #1 + bne 0b + +#elif defined(__aarch64__) + + mov w0, #0 + mov w2, #0 + +0: ubfx w1, w2, #0, #16 + ubfx w3, w2, #16, #16 + sub w2, w2, #1 + mul w1, w1, w1 + mul w3, w3, w3 + cmn w1, w3 + cinc.cc w0, w0 + cbnz w2, 0b + +#else + notimpl +#endif + + ret endproc proc x26 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x27 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x28 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x29 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x2a - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x2b - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x2c - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x2d - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x2e - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x2f - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc +///-------------------------------------------------------------------------- +/// 0x30--0x3f + proc x30 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif ret @@ -656,90 +3005,362 @@ endproc proc x31 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x32 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif 
defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x33 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x34 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x35 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x36 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x37 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x38 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x39 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x3a - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x3b - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x3c - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x3d - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x3e - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x3f - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc + +///----- That's all, folks --------------------------------------------------
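
A few host-side cross-checks of the tricks above, written in C; these are illustrative sketches only, and none of the helper names below come from the patch.

For x21, the three-multiplication complex product can be verified directly: form t = c (a + b), then b' = t + a (d - c) and a' = t - b (c + d).  A minimal sketch (cmul3 is an invented name; arithmetic is deliberately unsigned so it wraps mod 2^64 like the registers):

    #include <assert.h>
    #include <stdint.h>

    /* (a + b i) (c + d i) = (a c - b d) + (a d + b c) i, using three
     * multiplications, following the register comments in x21.
     */
    static void cmul3(uint64_t a, uint64_t b, uint64_t c, uint64_t d,
                      uint64_t *re, uint64_t *im)
    {
      uint64_t t = c*(a + b);           /* t  = c (a + b) */
      *im = t + a*(d - c);              /* b' = a d + b c */
      *re = t - b*(c + d);              /* a' = a c - b d */
    }

    int main(void)
    {
      uint64_t re, im;
      cmul3(5, 1, 3, 2, &re, &im);      /* (5 + i)(3 + 2i) = 13 + 13i */
      assert(re == 13 && im == 13);
      return (0);
    }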
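For x22, the magic constant is easy to check from C, assuming a compiler that provides the unsigned __int128 extension (GCC and Clang both do); div3 is my name, and the LCG is just a cheap way to generate spread-out test values:

    #include <assert.h>
    #include <stdint.h>

    /* floor(a/3), computed as x22 does: take the high 64 bits of
     * a * ceil(2/3 2^64), then shift right once more.
     */
    static uint64_t div3(uint64_t a)
    {
      unsigned __int128 p = (unsigned __int128)a * 0xaaaaaaaaaaaaaaabULL;
      return ((uint64_t)(p >> 64) >> 1);
    }

    int main(void)
    {
      uint64_t a = 0;
      unsigned i;

      for (i = 0; i < 10000000; i++) {
        assert(div3(a) == a/3);
        a = a*6364136223846793005ULL + 1442695040888963407ULL;
      }
      return (0);
    }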
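For x23, the shortening step relies on 4 == 1 (mod 3): splitting a = 4 q + r and replacing it by q + r preserves the residue while shrinking a.  The same loop and fix-up in C (mod3 is an illustrative name):

    #include <assert.h>
    #include <stdint.h>

    /* a mod 3 without dividing: fold base-4 digits while a > 5, then
     * subtract 3 once if the 0..5 remnant needs it, as in x23.
     */
    static uint64_t mod3(uint64_t a)
    {
      while (a > 5) a = (a & 3) + (a >> 2);
      return (a >= 3 ? a - 3 : a);
    }

    int main(void)
    {
      uint64_t a;

      for (a = 0; a < 3000000; a++) assert(mod3(a) == a%3);
      return (0);
    }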
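For x24, the lifting step a_{i+1} = a_i (2 - a a_i) doubles the number of correct low-order bits each pass, so the whole inversion is only a handful of multiplications.  The same loop in C, with the same termination test as the assembler (stop once a x == 1); inv64 is an invented name:

    #include <assert.h>
    #include <stdint.h>

    /* Inverse of odd a modulo 2^64 by the Newton/Hensel step
     * x <- x (2 - a x), starting from x = a, which is right mod 2.
     */
    static uint64_t inv64(uint64_t a)
    {
      uint64_t x = a;

      while (a*x != 1) x *= 2 - a*x;
      return (x);
    }

    int main(void)
    {
      uint64_t a;

      for (a = 1; a < 2000000; a += 2) assert(a*inv64(a) == 1);
      return (0);
    }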
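For x25, the x86-64 loop visits every pair of 16-bit x and y exactly once and counts those with x^2 + y^2 < 2^32, so the final count divided by 2^32 should approach pi/4, up to the crudeness of the lattice sampling.  The equivalent brute-force C (all 2^32 points, so expect it to run for a little while):

    #include <stdint.h>
    #include <stdio.h>

    /* Count 16-bit lattice points inside the quarter-circle of radius
     * 2^16; count/2^32 approximates pi/4.
     */
    int main(void)
    {
      uint64_t count = 0, r2;
      uint32_t x, y;

      for (x = 0; x < 0x10000; x++)
        for (y = 0; y < 0x10000; y++) {
          r2 = (uint64_t)x*x + (uint64_t)y*y;
          if (r2 < ((uint64_t)1 << 32)) count++;
        }
      printf("pi =~ %.6f\n", 4.0*count/4294967296.0);
      return (0);
    }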