X-Git-Url: https://git.distorted.org.uk/~mdw/xchg-rax-rax/blobdiff_plain/06297a937874bfd892c0c85ace29fb26a4a0ddc0..99fe70cb14a9586fcea77bfea14cb931257807a8:/xchg.S diff --git a/xchg.S b/xchg.S index 5374727..b95c665 100644 --- a/xchg.S +++ b/xchg.S @@ -1,8 +1,76 @@ -/// -*- mode: asm; asm-comment-char: ?/ -*- +/// -*- mode: asm; asm-comment-char: 0 -*- + +///-------------------------------------------------------------------------- +/// Preliminaries. + +#include + +#if defined(__i386__) || defined(__x86_64__) .intel_syntax noprefix - .section .note.GNU-stack, "", @progbits +#elif defined(__arm__) + +.macro ret + bx r14 +.endm + + .arch armv7-a + .fpu neon + +#elif defined(__aarch64__) + +.macro cmov rd, rn, cc + csel \rd, \rn, \rd, \cc +.endm +#define _COND(_) \ + _(eq) _(ne) _(cs) _(cc) _(vs) _(vc) _(mi) _(pl) \ + _(ge) _(lt) _(gt) _(le) _(hi) _(ls) _(al) _(nv) \ + _(hs) _(lo) +#define _INST(_) \ + _(ccmp) _(ccmn) \ + _(csel) _(cmov) \ + _(csinc) _(cinc) _(cset) \ + _(csneg) _(cneg) \ + _(csinv) _(cinv) _(csetm) +#define _CONDVAR(cc) _definstvar cc; +#define _INSTVARS(inst) \ + .macro _definstvar cc; \ + .macro inst.\cc args:vararg; inst \args, \cc; .endm; \ + .endm; \ + _COND(_CONDVAR); \ + .purgem _definstvar; + _INST(_INSTVARS) +#undef _COND +#undef _INST +#undef _CONDVAR +#undef _INSTVARS + +#define CCMP_N 8 +#define CCMP_Z 4 +#define CCMP_C 2 +#define CCMP_V 1 + +#define CCMP_MI CCMP_N +#define CCMP_PL 0 +#define CCMP_EQ CCMP_Z +#define CCMP_NE 0 +#define CCMP_CS CCMP_C +#define CCMP_HS CCMP_C +#define CCMP_CC 0 +#define CCMP_LO 0 +#define CCMP_VS CCMP_V +#define CCMP_VC 0 +#define CCMP_HI CCMP_C +#define CCMP_LS 0 +#define CCMP_LT CCMP_N +#define CCMP_GE 0 +#define CCMP_LE CCMP_N +#define CCMP_GT 0 + +#else +# error "not supported" +#endif .macro proc name .globl \name @@ -16,6 +84,36 @@ .endm .macro ch c +#if defined(__i386__) + + pushf + push eax + push ebx + push ecx + push edx + push ebp + mov ebp, esp + and esp, -16 + + push \c + call putchar@plt + + call get_pc_ebx + add ebx, offset _GLOBAL_OFFSET_TABLE + mov eax, [ebx + stdout@GOT] + mov eax, [eax] + call fflush@plt + + mov esp, ebp + pop ebp + pop edx + pop ecx + pop ebx + pop eax + popf + +#elif defined(__x86_64__) + pushf push rax push rcx @@ -44,12 +142,154 @@ pop rcx pop rax popf + +#elif defined(__arm__) + + stmfd r13!, {r0-r4, r12, r14} + + mov r4, r13 + bic r14, r4, #15 + mov r13, r14 + + mov r0, #\c + bl putchar@plt + + ldr r14, .L$_c$gotoff$\@ +.L$_c$gotpc$\@: + add r14, pc, r14 + b .L$_c$cont$\@ +.L$_c$gotoff$\@: + .word _GLOBAL_OFFSET_TABLE - .L$_c$gotpc$\@ - 8 +.L$_c$cont$\@: + bl fflush@plt + + mov r13, r4 + ldmfd r13!, {r0-r4, r12, r14} + +#elif defined(__aarch64__) + + sub sp, sp, #20*8 + stp x0, x1, [sp, #0] + stp x2, x3, [sp, #16] + stp x4, x5, [sp, #32] + stp x6, x7, [sp, #48] + stp x8, x9, [sp, #64] + stp x10, x11, [sp, #80] + stp x12, x13, [sp, #96] + stp x14, x15, [sp, #112] + stp x16, x17, [sp, #128] + mrs x16, nzcv + stp x16, x30, [sp, #144] + + mov w0, #\c + bl putchar + adrp x0, :got:stdout + ldr x0, [x0, #:got_lo12:stdout] + ldr x0, [x0] + bl fflush + + ldp x16, x30, [sp, #144] + msr nzcv, x16 + ldp x16, x17, [sp, #128] + ldp x14, x15, [sp, #112] + ldp x12, x13, [sp, #96] + ldp x10, x11, [sp, #80] + ldp x8, x9, [sp, #64] + ldp x6, x7, [sp, #48] + ldp x4, x5, [sp, #32] + ldp x2, x3, [sp, #16] + ldp x0, x1, [sp, #0] + add sp, sp, #20*8 + +#else +# error "not supported" +#endif .endm +.macro notimpl +#if defined(__i386__) || defined(__x86_64__) + ud2 +#elif defined(__arm__) + udf +#elif 
defined(__aarch64__) + hlt #0 +#else +# error "not supported" +#endif +.endm + + .section .note.GNU-stack, "", %progbits + .text +#if defined(__i386__) +get_pc_ebx: + mov ebx, [esp] + ret +#endif + + proc call_example +#if defined(__i386__) + + push ebx // ebx + push esi // esi, ebx + push edi // edi, esi, ebx + push ebp // flags, ebp, ..., ebx + pushf + + mov edi, [esp + 4*6] + mov esi, [esp + 4*7] + push esi // regs, flags, ebp, ..., ebx + + call get_pc_ebx + lea eax, [ebx + 9f - .] + push eax // cont, regs, flags, ebp, ..., ebx + push edi // func, cont, regs, flags, ebp, ..., ebx + + mov eax, [esi + 28] + pushf + pop ecx + and eax, 0x0cd5 + and ecx, ~0x0cd5 + or eax, ecx + push eax + popf + mov eax, [esi + 0] + mov ebx, [esi + 4] + mov ecx, [esi + 8] + mov edx, [esi + 12] + mov edi, [esi + 20] + mov ebp, [esi + 24] + mov esi, [esi + 16] + + ret // -> func; regs, flags, ebp, ..., ebx + +9: pushf // eflags, regs, flags, ebp, ..., ebx + push esi // esi, eflags, regs, flags, ebp, ..., ebx + mov esi, [esp + 8] + mov [esi + 0], eax + mov [esi + 4], ebx + mov [esi + 8], ecx + mov [esi + 12], edx + mov [esi + 20], edi + mov [esi + 24], ebp + pop eax // rflags, regs, flags, ebp, ..., ebx + mov [esi + 16], eax + pop eax // regs, flags, ebp, ..., ebx + mov [esi + 28], eax + + add esp, 4 // flags, ebp, ..., ebx + popf // ebp, ..., ebx + pop ebp // ..., ebx + pop edi + pop esi + pop ebx // + ret + +#elif defined(__x86_64__) + push rbx // rbx push r10 push r11 @@ -66,7 +306,7 @@ proc call_example push rax // cont, regs, flags, rbp, ..., rbx push rdi // func, cont, regs, flags, rbp, ..., rbx - mov rax, [rsi + 56] + mov rax, [rsi + 8*15] pushf pop rcx and rax, 0x0cd5 @@ -74,29 +314,45 @@ proc call_example or rax, rcx push rax popf - mov rax, [rsi + 0] - mov rbx, [rsi + 8] - mov rcx, [rsi + 16] - mov rdx, [rsi + 24] - mov rdi, [rsi + 40] - mov rbp, [rsi + 48] - mov rsi, [rsi + 32] + mov rax, [rsi + 0] + mov rbx, [rsi + 8] + mov rcx, [rsi + 16] + mov rdx, [rsi + 24] + mov rdi, [rsi + 40] + mov rbp, [rsi + 48] + mov r8, [rsi + 56] + mov r9, [rsi + 64] + mov r10, [rsi + 72] + mov r11, [rsi + 80] + mov r12, [rsi + 88] + mov r13, [rsi + 96] + mov r14, [rsi + 104] + mov r15, [rsi + 112] + mov rsi, [rsi + 32] ret // -> func; regs, flags, rbp, ..., rbx 9: pushf // rflags, regs, flags, rbp, ..., rbx push rsi // rsi, rflags, regs, flags, rbp, ..., rbx mov rsi, [rsp + 16] - mov [rsi + 0], rax - mov [rsi + 8], rbx - mov [rsi + 16], rcx - mov [rsi + 24], rdx - mov [rsi + 40], rdi - mov [rsi + 48], rbp + mov [rsi + 0], rax + mov [rsi + 8], rbx + mov [rsi + 16], rcx + mov [rsi + 24], rdx + mov [rsi + 40], rdi + mov [rsi + 48], rbp + mov [rsi + 56], r8 + mov [rsi + 64], r9 + mov [rsi + 72], r10 + mov [rsi + 80], r11 + mov [rsi + 88], r12 + mov [rsi + 96], r13 + mov [rsi + 104], r14 + mov [rsi + 112], r15 pop rax // rflags, regs, flags, rbp, ..., rbx - mov [rsi + 32], rax + mov [rsi + 32], rax pop rax // regs, flags, rbp, ..., rbx - mov [rsi + 56], rax + mov [rsi + 120], rax add rsp, 8 // flags, rbp, ..., rbx popf // rbp, ..., rbx @@ -110,6 +366,82 @@ proc call_example pop rbx // ret +#elif defined(__arm__) + + stmfd r13!, {r0, r1, r4-r11, r14} + ldmia r1, {r0-r12, r14} + msr cpsr, r14 + mov r14, pc + ldr pc, [r13], #4 + ldr r14, [r13], #4 + stmia r14!, {r0-r12} + mrs r0, cpsr + str r0, [r14] + ldmfd r13!, {r4-r11, pc} + +#elif defined(__aarch64__) + + stp x29, x30, [sp, #-14*8]! 
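+	// frame layout: x29/x30 at sp + 0, the callee-saved x19--x28 at
+	// sp + 16, and the register-block pointer from x1 stashed at
+	// sp + 104; fourteen slots in all, keeping sp 16-byte aligned.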
+ mov x29, sp + stp x19, x20, [sp, #16] + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp x25, x26, [sp, #64] + stp x27, x28, [sp, #80] + str x1, [sp, #104] + + ldp x29, x30, [x1, #224] + msr nzcv, x30 + mov x30, x0 + ldp x27, x28, [x1, #208] + ldp x25, x26, [x1, #192] + ldp x23, x24, [x1, #176] + ldp x21, x22, [x1, #160] + ldp x19, x20, [x1, #144] + ldp x16, x17, [x1, #128] + ldp x14, x15, [x1, #112] + ldp x12, x13, [x1, #96] + ldp x10, x11, [x1, #80] + ldp x8, x9, [x1, #64] + ldp x6, x7, [x1, #48] + ldp x4, x5, [x1, #32] + ldp x2, x3, [x1, #16] + ldp x0, x1, [x1, #0] + + blr x30 + + ldr x30, [sp, #104] + stp x27, x28, [x30, #208] + stp x25, x26, [x30, #192] + stp x23, x24, [x30, #176] + stp x21, x22, [x30, #160] + stp x19, x20, [x30, #144] + stp x16, x17, [x30, #128] + stp x14, x15, [x30, #112] + stp x12, x13, [x30, #96] + stp x10, x11, [x30, #80] + stp x8, x9, [x30, #64] + stp x6, x7, [x30, #48] + stp x4, x5, [x30, #32] + stp x2, x3, [x30, #16] + stp x0, x1, [x30, #0] + mov x0, x30 + mrs x30, nzcv + stp x29, x30, [x0, #224] + + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp x25, x26, [sp, #64] + ldp x27, x28, [sp, #80] + ldp x29, x30, [sp], #14*8 + + ret + +#else +# error "not supported" +#endif + endproc proc nop @@ -119,19 +451,58 @@ proc nop endproc ///-------------------------------------------------------------------------- +/// 0x00--0x0f proc x00 // clear all 64 bits of extended traditional registers - xor eax,eax // clear rax - lea rbx,[0] // rbx -> _|_ + +#if defined(__x86_64__) + + xor eax, eax // clear rax + lea rbx, [0] // rbx -> _|_ loop . // iterate, decrement rcx until zero - mov rdx,0 // set rdx = 0 - and esi,0 // clear all bits of rsi - sub edi,edi // set rdi = edi - edi = 0 + mov rdx, 0 // set rdx = 0 + and esi, 0 // clear all bits of rsi + sub edi, edi // set rdi = edi - edi = 0 push 0 pop rbp // pop 0 into rbp +#elif defined(__i386__) + + xor eax, eax + lea ebx, [0] + loop . 
+ mov edx, 0 + and esi, 0 + sub edi, edi + push 0 + pop ebp + +#elif defined(__arm__) + + eor r0, r0, r0 + rsb r1, r1, r1 +0: subs r2, r2, #1 + bne 0b + mov r3, #0 + and r4, r4, #0 + sub r5, r5, r5 + +#elif defined(__aarch64__) + + eor w0, w0, w0 + mov w1, wzr +0: sub w2, w2, #1 + cbnz w2, 0b + mov w3, #0 + and w4, w4, wzr + sub w5, w5, w5 + +#else + notimpl +#endif + ret endproc @@ -142,11 +513,43 @@ proc x01 // // on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1} // and f_{i+c}, where f_{i+1} = f_i + f_{i-1} + +#if defined(__x86_64__) + 0: xadd rax, rdx // a, d = a + d, a // = f_{i+1} + f_i, f_{i+1} // = f_{i+2}, f_{i+1} loop 0b // advance i, decrement c, iterate +#elif defined(__i386__) + +0: xadd eax, edx + loop 0b + +#elif defined(__arm__) + +0: subs r2, r2, #2 + add r3, r3, r0 + blo 8f + add r0, r0, r3 + bhi 0b + +8: movne r0, r3 + +#elif defined(__aarch64__) + +0: subs x2, x2, #2 + add x3, x3, x0 + b.lo 8f + add x0, x0, x3 + b.hi 0b + +8: cmov.ne x0, x3 + +#else + notimpl +#endif + ret endproc @@ -155,10 +558,63 @@ proc x02 // boolean canonify a: if a = 0 on entry, leave it zero; otherwise // set a = 1 + +#if defined(__x86_64__) + neg rax // set cf iff a /= 0 sbb rax, rax // a = a - a - cf = -cf neg rax // a = cf +#elif defined(__i386__) + + neg eax + sbb eax, eax + neg eax + +#elif defined(__arm__) + + movs r1, r0 // the easy way + movne r1, #1 // mvnne r1, #1 for mask + + cmp r0, #1 // clear cf iff a == 0 + sbc r2, r0, r0 // c' = a - a - 1 + cf = cf - 1 + add r2, r2, #1 // c' = cf + + sub r3, r0, r0, lsr #1 // d' top bit clear; d' = 0 iff a = 0 + rsb r3, r3, #0 // d' top bit set iff a /= 0 + mov r3, r3, lsr #31 // asr for mask + + rsbs r0, r0, #0 + sbc r0, r0, r0 + rsb r0, r0, #0 + +#elif defined(__aarch64__) + + cmp x0, #0 // trivial + cset.ne x1 // csetm for mask + + cmp xzr, x0 // set cf iff a == 0 + sbc x2, x0, x0 // c' = a - a - 1 + cf = cf - 1 + neg x2, x2 // c' = 1 - cf + + sub x3, x0, x0, lsr #1 // if a < 2^63 then a' = ceil(d/2) < + // 2^63 + // if a >= 2^63, write a = 2^63 + t + // with t < 2^63; d' = 2^63 - 2^62 + + // ceil(t/2) = 2^62 + ceil(t/2), and + // ceil(t/2) < 2^62 + // anyway d' < 2^63 and d' = 0 iff + // a = 0 + neg x3, x3 // d' top bit set iff a /= 0 + lsr x3, x3, #63 // asr for mask + + cmp x0, #1 // set cf iff a /= 0 + adc x0, xzr, xzr // a' = 0 + 0 + cf = cf + +#else + notimpl +#endif + ret endproc @@ -166,11 +622,46 @@ endproc proc x03 // set a = min(a, d) (unsigned); clobber c, d + +#if defined(__x86_64__) + sub rdx, rax // d' = d - a; set cf if a > d sbb rcx, rcx // c = -cf = -[a > d] and rcx, rdx // c = a > d ? d - a : 0 add rax, rcx // a' = a > d ? d : a +#elif defined(__i386__) + + sub edx, eax + sbb ecx, ecx + and ecx, edx + add eax, ecx + +#elif defined(__arm__) + + cmp r0, r3 // the easy way + movlo r1, r0 // only needed for out-of-place + movhs r1, r3 + + subs r3, r3, r0 + sbc r12, r12, r12 + and r12, r12, r3 + add r0, r0, r12 + +#elif defined(__aarch64__) + + cmp x0, x3 // the easy way + csel.lo x1, x0, x3 + + subs x3, x3, x0 // d' = d - a; set cf if d >= a + sbc x16, xzr, xzr // t = -1 + cf = -[a > d] + and x16, x16, x3 // t = a > d ? d - a : 0 + add x0, x0, x16 // a' = a > d ? d : a + +#else + notimpl +#endif + ret endproc @@ -178,8 +669,76 @@ endproc proc x04 // switch case? 
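+	// (the `unrelated playing' below decodes an ascii hex digit from a:
+	// b ends up with its value, 0--15, or -1 if a isn't a hex digit;
+	// then the case of the letter in a's low byte gets flipped.)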
+ +#if defined(__x86_64__) + + // unrelated playing + mov ecx, eax + mov rbx, -1 + mov edx, ecx + sub edx, '0' + cmp edx, 10 + cmovb rbx, rdx + or ecx, 0x20 + mov edx, ecx + sub edx, 'a' + sub ecx, 'a' - 10 + cmp edx, 6 + cmovb rbx, rcx + xor al, 0x20 +#elif defined(__i386__) + + // unrelated playing + mov ecx, eax + mov ebx, -1 + mov edx, ecx + sub edx, '0' + cmp edx, 10 + cmovb ebx, edx + or ecx, 0x20 + mov edx, ecx + sub edx, 'a' + sub ecx, 'a' - 10 + cmp edx, 6 + cmovb ebx, ecx + + xor al, 0x20 + +#elif defined(__arm__) + + // unrelated playing + mvn r1, #0 + sub r12, r0, #'0' + cmp r12, #10 + movlo r1, r12 + orr r12, r0, #0x20 + sub r12, r12, #'a' + cmp r12, #6 + addlo r1, r12, #10 + + eor r0, r0, #0x20 + +#elif defined(__aarch64__) + + // unrelated playing + mov x1, #-1 + sub w16, w0, #'0' + cmp w16, #10 + cmov.lo x1, x16 + orr w16, w0, #0x20 + sub w16, w16, #'a' - 10 + cmp w16, #10 + ccmp.hs w16, #16, #CCMP_HS + cmov.lo x1, x16 + + eor w0, w0, #0x20 + +#else + notimpl +#endif + ret endproc @@ -187,6 +746,9 @@ endproc proc x05 // answer whether 5 <= a 4 a > 9 or a < -2^63 + 5 // le/ng a' <= 4 -2^63 + 5 <= a <= 9 +#elif defined(__i386__) + + sub eax, 5 + cmp eax, 4 + +#elif defined(__arm__) + + // i dimly remember having a slick way to do this way back in the + // day, but i can't figure it out any more. + sub r0, #5 + cmp r0, #4 + +#elif defined(__aarch64__) + + // literal translation is too obvious + cmp x0, #5 + ccmp.hs x0, #9, #CCMP_HS + +#else + notimpl +#endif + ret endproc @@ -219,10 +803,35 @@ proc x06 // leave a unchanged, but set zf if a = 0, cf if a /= 0, clear of, // set sf to msb(a) + +#if defined(__x86_64__) + not rax // a' = -a - 1 inc rax // a' = -a neg rax // a' = a +#elif defined(__i386__) + + not eax + inc eax + neg eax + +#elif defined(__arm__) + + mvn r0, r0 + add r0, r0, #1 + rsbs r0, r0, #0 // cf has opposite sense + +#elif defined(__aarch64__) + + mvn x0, x0 + add x0, x0, #1 + negs x0, x0 // cf has opposite sense + +#else + notimpl +#endif + ret endproc @@ -230,11 +839,39 @@ endproc proc x07 // same as before (?) + +#if defined(__x86_64__) + inc rax // a' = a + 1 neg rax // a' = -a - 1 inc rax // a' = -a neg rax // a' = a +#elif defined(__i386__) + + inc eax + neg eax + inc eax + neg eax + +#elif defined(__arm__) + + add r0, r0, #1 + rsb r0, r0, #0 + add r0, r0, #1 + rsbs r0, r0, #0 + +#elif defined(__aarch64__) + + add x0, x0, #1 + neg x0, x0 + add x0, x0, #1 + negs x0, x0 // cf has opposite sense + +#else + notimpl +#endif + ret endproc @@ -243,10 +880,45 @@ proc x08 // floor((a + d)/2), correctly handling overflow conditions; final cf // is lsb(a + d), probably uninteresting + +#if defined(__x86_64__) + add rax, rdx // cf || a' = a + d rcr rax, 1 // shift 65-bit result right by one // place; lsb moves into carry +#elif defined(__i386__) + + add eax, edx + rcr eax, 1 + +#elif defined(__arm__) + + // like the two-instruction a64 version + sub r1, r3, r0 + add r1, r0, r1, lsr #1 + + // the slick version, similar to the above + adds r0, r0, r3 + mov r0, r0, rrx + +#elif defined(__aarch64__) + + // a64 lacks a32's rrx. literal translation. + adds x1, x0, x3 // cf || a' = a + d + adc x16, xzr, xzr // realize cf in extra register + extr x1, x16, x1, #1 // shift down one place + + // two instruction version: clobbers additional register. (if you + // wanted the answer in any other register, even overwriting d, then + // this is unnecessary.) also depends on d >= a. 
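+	// (to see why this works: when d >= a the difference d - a can't
+	// overflow, and a + floor((d - a)/2) = floor((2 a + d - a)/2) =
+	// floor((a + d)/2).)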
+ sub x16, x3, x0 // compute difference + add x0, x0, x16, lsr #1 // add half of it (rounded down) + +#else + notimpl +#endif + ret endproc @@ -255,10 +927,33 @@ proc x09 // a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3 // (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8). + +#if defined(__x86_64__) + shr rax, 3 // a' = floor(a/8); cf = 1 if a == // 4, 5, 6, 7 (mod 8) adc rax, 0 // a' = floor(a/8) + cf +#elif defined(__i386__) + + shr eax, 3 + adc eax, 0 + +#elif defined(__arm__) + + movs r0, r0, lsr #3 + adc r0, r0, #0 + +#elif defined(__aarch64__) + + tst x0, #4 + orr x0, xzr, x0, lsr #3 + cinc.ne x0, x0 + +#else + notimpl +#endif + ret endproc @@ -266,11 +961,43 @@ endproc proc x0a // increment c-byte little-endian bignum at rdi + +#if defined(__x86_64__) + add byte ptr [rdi], 1 0: inc rdi adc byte ptr [rdi], 0 loop 0b +#elif defined(__i386__) + + add byte ptr [edi], 1 +0: inc edi + adc byte ptr [edi], 0 + loop 0b + +#elif defined(__arm__) + + mov r12, #256 // set initial carry +0: ldrb r0, [r5] + subs r2, r2, #1 + add r12, r0, r12, lsr #8 + strb r12, [r5], #1 + bne 0b + +#elif defined(__aarch64__) + + mov w17, #256 // set initial carry +0: ldrb w16, [x5] + sub x2, x2, #1 + add w17, w16, w17, lsr #8 + strb w17, [x5], #1 + cbnz x2, 0b + +#else + notimpl +#endif + ret endproc @@ -278,11 +1005,36 @@ endproc proc x0b // negate double-precision d:a + +#if defined(__x86_64__) + not rdx // d' = -d - 1 neg rax // a' = -a; // cf = 1 iff a /= 0 sbb rdx, -1 // d' = -d - cf +#elif defined(__i386__) + + not edx + neg eax + sbb edx, -1 + +#elif defined(__arm__) + + // reverse subtract is awesome + rsbs r0, r0, #0 + rsc r3, r3, #0 + +#elif defined(__aarch64__) + + // easy way: everything is better with zero registers. + negs x0, x0 + ngc x3, x3 + +#else + notimpl +#endif + ret endproc @@ -291,6 +1043,8 @@ proc x0c // rotate is distributive over xor. +#if defined(__x86_64__) + // rax // = a_1 || a_0 // rbx // = b_1 || b_0 mov rcx, rax // = a_1 || a_0 @@ -304,6 +1058,48 @@ proc x0c cmp rax, rcx // always equal +#elif defined(__i386__) + + mov ecx, eax // = a_1 || a_0 + + xor ecx, ebx // = (a_1 XOR b_1) || (a_0 XOR b_0) + ror ecx, 0xd // = (a_0 XOR b_0) || (a_1 XOR b_1) + + ror eax, 0xd // = a_0 || a_1 + ror ebx, 0xd // = b_0 || b_1 + xor eax, ebx // = (a_0 XOR b_0) || (a_1 XOR b_1) + + cmp eax, ecx // always equal + +#elif defined(__arm__) + + + // r0 // = a_1 || a_0 + // r1 // = b_1 || b_0 + eor r2, r0, r1 // = (a_1 XOR b_1) || (a_0 XOR b_0) + mov r2, r2, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1) + + mov r1, r1, ror #13 // = b_0 || b_1 + eor r0, r1, r0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1) + + cmp r0, r2 // always equal + +#elif defined(__aarch64__) + + // x0 // = a_1 || a_0 + // x1 // = b_1 || b_0 + eor x2, x0, x1 // = (a_1 XOR b_1) || (a_0 XOR b_0) + ror x2, x2, #13 // = (a_0 XOR b_0) || (a_1 XOR b_1) + + ror x1, x1, #13 // = b_0 || b_1 + eor x0, x1, x0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1) + + cmp x0, x2 // always equal + +#else + notimpl +#endif + ret endproc @@ -312,6 +1108,8 @@ proc x0d // and is distributive over xor. 
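+	// (bit by bit: where a has a one, both a AND (b XOR c) and
+	// (a AND b) XOR (a AND c) just see the corresponding bits of b and
+	// c; where a has a zero, both come out zero.)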
+#if defined(__x86_64__) + mov rdx, rbx // = b xor rbx, rcx // = b XOR c @@ -324,6 +1122,50 @@ proc x0d cmp rax, rbx // always equal +#elif defined(__i386__) + + mov edx, ebx // = b + + xor ebx, ecx // = b XOR c + and ebx, eax // = a AND (b XOR c) + + and edx, eax // = a AND b + and eax, ecx // = a AND c + xor eax, edx // = (a AND b) XOR (a AND c) + // = a AND (b XOR c) + + cmp eax, ebx // always equal + +#elif defined(__arm__) + + and r3, r0, r1 // = a AND b + + eor r1, r1, r2 // = b XOR c + and r1, r1, r0 // = a AND (b XOR c) + + and r0, r0, r2 // = a AND c + eor r0, r0, r3 // = (a AND b) XOR (a AND c) + // = a AND (b XOR c) + + cmp r0, r1 // always equal + +#elif defined(__aarch64__) + + and x3, x0, x1 // = a AND b + + eor x1, x1, x2 // = b XOR c + and x1, x1, x0 // = a AND (b XOR c) + + and x0, x0, x2 // = a AND c + eor x0, x0, x3 // = (a AND b) XOR (a AND c) + // = a AND (b XOR c) + + cmp x0, x1 // always equal + +#else + notimpl +#endif + ret endproc @@ -332,6 +1174,8 @@ proc x0e // de morgan's law +#if defined(__x86_64__) + mov rcx, rax // = a and rcx, rbx // = a AND b @@ -342,7 +1186,46 @@ proc x0e or rax, rbx // = (NOT a) OR (NOT b) // = NOT (a AND b) - cmp rax, rcx + cmp rax, rcx // always equal + +#elif defined(__i386__) + + mov ecx, eax // = a + + and ecx, ebx // = a AND b + not ecx // = NOT (a AND b) + + not eax // = NOT a + not ebx // = NOT b + or eax, ebx // = (NOT a) OR (NOT b) + // = NOT (a AND b) + + cmp eax, ecx // always equal + +#elif defined(__arm__) + + and r2, r0, r1 // = a AND b + mvn r2, r2 // = NOT (a AND b) + + mvn r0, r0 // = NOT a + mvn r1, r1 // = NOT b + orr r0, r0, r1 // = (NOT a) OR (NOT b) + + cmp r0, r2 // always equal + +#elif defined(__aarch64__) + + and x2, x0, x1 // = a AND b + mvn x2, x2 // = NOT (a AND b) + + mvn x0, x0 // = NOT a + orn x0, x0, x1 // = (NOT a) OR (NOT b) + + cmp x0, x2 // always equal + +#else + notimpl +#endif ret @@ -355,20 +1238,51 @@ proc x0f // // not sure why you'd do this. - cld +#if defined(__x86_64__) 0: xor [rsi], al lodsb loop 0b +#elif defined(__i386__) + +0: xor [esi], al + lodsb + loop 0b + +#elif defined(__arm__) + +0: ldrb r12, [r4] + subs r2, r2, #1 + eor r0, r0, r12 + strb r0, [r4], #1 + bne 0b + +#elif defined(__aarch64__) + +0: ldrb w16, [x4] + sub x2, x2, #1 + eor w0, w0, w16 + strb w0, [x4], #1 + cbnz x2, 0b + +#else + notimpl +#endif + ret endproc +///-------------------------------------------------------------------------- +/// 0x10--0x1f + proc x10 // four different ways to swap a pair of registers. +#if defined(__x86_64__) + push rax push rcx pop rax @@ -385,6 +1299,76 @@ proc x10 xchg rax, rcx +#elif defined(__i386__) + + push eax + push ecx + pop eax + pop ecx + + xor eax, ecx + xor ecx, eax + xor eax, ecx + + add eax, ecx + sub ecx, eax + add eax, ecx + neg ecx + + xchg eax, ecx + +#elif defined(__arm__) + + stmfd r13!, {r0, r2} + ldr r0, [r13, #4] + ldr r2, [r13], #8 + + eor r0, r0, r2 + eor r2, r2, r0 + eor r0, r0, r2 + + sub r0, r0, r2 + add r2, r2, r0 + rsb r0, r0, r2 // don't need 3-addr with reverse-sub + + mov r12, r0 + mov r0, r2 + mov r2, r0 + +#elif defined(__aarch64__) + + // anything you can do + stp x0, x2, [sp, #-16]! + ldp x2, x0, [sp], #16 + + eor x0, x0, x2 + eor x2, x2, x0 + eor x0, x0, x2 + + // the add/sub/add thing was daft. you can do it in three if you're + // clever -- and have three-address operations. + sub x0, x0, x2 + add x2, x2, x0 + sub x0, x2, x0 + + // but we lack a fourth. we can't do this in fewer than three + // instructions without hitting memory. 
only `ldp' will modify two + // registers at a time, so we need at least two instructions -- but + // if the first one sets one of our two registers to its final value + // then we lose the other input value with no way to recover it, so + // we must either write a fresh third register, or write something + // other than the final value, and in both cases we need a third + // instruction to fix everything up. we've done the wrong-something- + // other trick twice, so here's the captain-obvious use-a-third- + // register version. + mov x16, x0 + mov x0, x2 + mov x2, x16 + +#else + notimpl +#endif + ret endproc @@ -398,6 +1382,8 @@ proc x11 // in particular, a will be zero (and zf set) if and only if the two // strings are equal. +#if defined(__x86_64__) + 0: mov dl, [rsi] xor dl, [rdi] inc rsi @@ -405,6 +1391,37 @@ proc x11 or al, dl loop 0b +#elif defined(__i386__) + +0: mov dl, [esi] + xor dl, [edi] + inc esi + inc edi + or al, dl + loop 0b + +#elif defined(__arm__) + +0: ldrb r1, [r4], #1 + ldrb r12, [r5], #1 + subs r2, r2, #1 + eor r12, r12, r1 + orr r0, r0, r12 + bne 0b + +#elif defined(__aarch64__) + +0: ldrb w16, [x4], #1 + ldrb w17, [x5], #1 + sub x2, x2, #1 + eor w16, w16, w17 + orr w0, w0, w16 + cbnz x2, 0b + +#else + notimpl +#endif + ret endproc @@ -418,27 +1435,85 @@ proc x12 // move all of the set bits in d to a, unless there's already a bit // there. this clearly doesn't change the sum. +#if defined(__x86_64__) + mov rcx, rdx // c' = d and rdx, rax // d' = a AND d or rax, rcx // a' = a OR d add rax, rdx - ret +#elif defined(__i386__) -endproc + mov ecx, edx // c' = d + and edx, eax // d' = a AND d + or eax, ecx // a' = a OR d + add eax, edx -proc x13 +#elif defined(__arm__) - // ok, so this is a really obtuse way of adding a and b; the result + and r2, r0, r3 // c' = a AND d + orr r0, r0, r3 // a' = a OR d + add r0, r0, r2 + +#elif defined(__aarch64__) + + and x2, x0, x3 // c' = a AND d + orr x0, x0, x3 // a' = a OR d + add x0, x0, x2 + +#else + notimpl +#endif + + ret + +endproc + +proc x13 + + // ok, so this is a really obtuse way of adding a and b; the result // is in a and d. but why does it work? +#if defined(__x86_64__) + mov rcx, 0x40 // carry chains at most 64 long 0: mov rdx, rax // copy a' xor rax, rbx // low bits of each bitwise sum and rbx, rdx // carry bits from each bitwise sum - shl rbx, 001 // carry them into next position + shl rbx, 1 // carry them into next position loop 0b +#elif defined(__i386__) + + mov ecx, 0x40 // carry chains at most 64 long +0: mov edx, eax // copy a' + xor eax, ebx // low bits of each bitwise sum + and ebx, edx // carry bits from each bitwise sum + shl ebx, 1 // carry them into next position + loop 0b + +#elif defined(__arm__) + + mov r2, #0x40 +0: and r3, r0, r1 + subs r2, r2, #1 + eor r0, r0, r1 + lsl r1, r3, #1 + bne 0b + +#elif defined(__aarch64__) + + mov x2, #0x40 +0: and x3, x0, x1 + sub x2, x2, #1 + eor x0, x0, x1 + lsl x1, x3, #1 + cbnz x2, 0b + +#else + notimpl +#endif + ret endproc @@ -447,6 +1522,8 @@ proc x14 // floor((a + d)/2), like x08. 
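+	// the key identity is a + d = (a XOR d) + 2 (a AND d): the xor is
+	// the bitwise sum ignoring carries, and the and marks the places
+	// which carry into the next position.  halve both sides to get
+	// floor((a + d)/2) = floor((a XOR d)/2) + (a AND d), with no
+	// overflow anywhere.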
+#if defined(__x86_64__) + mov rcx, rax // copy a for later and rcx, rdx // carry bits @@ -455,6 +1532,32 @@ proc x14 add rax, rcx // add the carries; done +#elif defined(__i386__) + + mov ecx, eax // copy a for later + and ecx, edx // carry bits + + xor eax, edx // low bits of each bitwise sum + shr eax, 1 // divide by 2; carries now in place + + add eax, ecx // add the carries; done + +#elif defined(__arm__) + + and r2, r0, r3 + eor r0, r0, r3 + add r0, r2, r0, lsr #1 + +#elif defined(__aarch64__) + + and x2, x0, x3 + eor x0, x0, x3 + add x0, x2, x0, lsr #1 + +#else + notimpl +#endif + ret endproc @@ -463,7 +1566,9 @@ proc x15 // sign extension 32 -> 64 bits. - //movsx rbx, eax // like this? +#if defined(__x86_64__) + + movsx rbx, eax // like this? mov rdx, 0xffffffff80000000 add rax, rdx // if bit 31 of a is set then bits @@ -472,15 +1577,46 @@ proc x15 // exactly backwards xor rax, rdx // so fix it +#elif defined(__i386__) + + movsx ebx, ax // like this? + + mov edx, 0xffff8000 + add eax, edx // if bit 31 of a is set then bits + // 31--63 of a' are clear; otherwise, + // these bits are all set -- which is + // exactly backwards + xor eax, edx // so fix it + +#elif defined(__arm__) + + sxth r1, r0 // like this + + mov r12, #0x80000000 + add r0, r0, r12, asr #16 + eor r0, r0, r12, asr #16 + +#elif defined(__aarch64__) + + sxtw x1, w0 // like this + + mov x16, #0xffffffff80000000 + add x0, x0, x16 + eor x0, x0, x16 + +#else + notimpl +#endif + ret endproc proc x16 - shl rax, 56 - shl rbx, 56 - shl rcx, 56 + // ??? i don't know why you'd want to calculate this. + +#if defined(__x86_64__) xor rax, rbx // a' = a XOR b xor rbx, rcx // b' = b XOR c @@ -490,256 +1626,2202 @@ proc x16 xor rax, rbx // a' = cf ? 0 : a XOR c cmp rax, rsi +#elif defined(__i386__) + + xor eax, ebx // a' = a XOR b + xor ebx, ecx // b' = b XOR c + mov esi, eax // t = a XOR b + add esi, ebx // t = (a XOR b) + (b XOR c) + cmovc eax, ebx // a' = cf ? b XOR c : a XOR b + xor eax, ebx // a' = cf ? 0 : a XOR c + cmp eax, esi + +#elif defined(__arm__) + + eor r0, r0, r1 + eor r1, r1, r2 + adds r4, r0, r1 + movcs r0, r1 + eor r0, r0, r1 + cmp r0, r4 + +#elif defined(__aarch64__) + + eor x0, x0, x1 + eor x1, x1, x2 + adds x4, x0, x1 + cmov.cs x0, x1 + eor x0, x0, x1 + cmp x0, x4 + +#else + notimpl +#endif + ret endproc proc x17 - ud2 + // absolute value + +#if defined(__x86_64__) + + cqo // d = a < 0 ? -1 : 0 + xor rax, rdx // a' = a < 0 ? -a - 1 : a + sub rax, rdx // a' = a < 0 ? -a : a + +#elif defined(__i386__) + + cdq // d = a < 0 ? -1 : 0 + xor eax, edx // a' = a < 0 ? -a - 1 : a + sub eax, edx // a' = a < 0 ? -a : a + +#elif defined(__arm__) + + // direct approach + movs r1, r0 + rsbmi r1, r0, #0 + + // faithful-ish conversion + eor r3, r0, r0, asr #31 + sub r0, r3, r0, asr #31 + +#elif defined(__aarch64__) + + // direct approach + tst x0, #1 << 63 + cneg.ne x1, x0 + + // faithful-ish conversion + eor x3, x0, x0, asr #63 + sub x0, x3, x0, asr #63 + +#else + notimpl +#endif + + ret endproc proc x18 - ud2 + // should always set sf, clear zf, unless we get rescheduled to a + // different core. 
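+	// (the timestamp counter only counts upwards, so the second reading
+	// ought to be strictly larger than the first, making the final
+	// comparison come out negative and nonzero.)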
+ +#if defined(__x86_64__) + + rdtsc // d || a = cycles + shl rdx, 0x20 + or rax, rdx // a = cycles + mov rcx, rax // c = cycles + + rdtsc // d || a = cycles' + shl rdx, 0x20 + or rax, rdx // a = cycles' + + cmp rcx, rax + +#elif defined(__i386__) + + rdtsc // d || a = cycles + mov ebx, eax + mov ecx, edx // c || b = cycles + + rdtsc // d || a = cycles' + + sub ebx, eax + sbb ecx, edx + +#elif defined(__arm__) + + // cycle clock not available in user mode + mrrc p15, 0, r0, r1, c9 + mrrc p15, 0, r2, r3, c9 + subs r0, r0, r2 + sbcs r1, r1, r3 + +#elif defined(__aarch64__) + + // cycle clock not available in user mode + mrs x0, pmccntr_el0 + mrs x1, pmccntr_el0 + cmp x0, x1 + +#else + notimpl +#endif + + ret endproc proc x19 - ud2 + // stupid way to capture a pointer to inline data and jump past it. + // confuses the return-address predictor something chronic. worse + // because amd64 calling convention doesn't usually pass arguments on + // the stack. -endproc +#if defined(__x86_64__) -proc x1a + call 8f + .string "hello world!\n\0" +8: call print_str + add rsp, 8 + ret - ud2 +print_str: + // actually implement this ridiculous thing + mov rsi, [rsp + 8] + xor edx, edx +0: mov al, [rsi + rdx] + inc rdx + cmp al, 0 + jnz 0b + mov eax, SYS_write + mov edi, 1 + dec rdx + syscall // clobbers r11 :-( + ret -endproc +#elif defined(__i386__) -proc x1b + call 8f + .string "hello world!\n\0" +8: call print_str + add esp, 4 + ret - ud2 +print_str: + // actually implement this ridiculous thing + mov ecx, [esp + 4] + xor edx, edx +0: mov al, [ecx + edx] + inc edx + cmp al, 0 + jnz 0b + mov eax, SYS_write + mov ebx, 1 + dec edx + int 0x80 + ret -endproc +#elif defined(__arm__) + + // why am i doing this? + stmfd r13!, {r14} + bl 8f + .string "hello world!\n\0" + .balign 4 +8: mov r1, r14 // might as well make it easy on myself + bl print_str + ldmfd r13!, {pc} + +print_str: + mov r2, #0 +0: ldrb r0, [r1, r2] + cmp r0, #0 + addne r2, r2, #1 + bne 0b + mov r0, #1 + mov r7, #SYS_write + swi 0 + bx r14 + +#elif defined(__aarch64__) + + // why am i doing this? + str x30, [sp, #-16]! + bl 8f + .string "hello world!\n\0" + .balign 4 +8: mov x1, x30 // might as well make it easy on myself + bl print_str + ldr x30, [sp], #16 + ret -proc x1c +print_str: + mov x2, #0 +0: ldrb w0, [x1, x2] + cmp w0, #0 + cinc.ne x2, x2 + b.ne 0b + mov x0, #1 + mov x8, #SYS_write + svc #0 + ret - ud2 +#else + notimpl +#endif endproc -proc x1d +proc x1a - ud2 + // collect the current instruction-pointer address. this was an old + // 32-bit i386 trick for position-independent code, but (a) it + // confuses the return predictor, and (b) amd64 has true pc-relative + // addressing. -endproc +#if defined(__x86_64__) -proc x1e + // the actual example + call 0f +0: pop rax - ud2 + // the modern i386 trick doesn't confuse the return-address + // predictor. + call calladdr_rbx + sub rbx, . - 0b -endproc + // but rip-relative addressing is even better + lea rcx, [rip + 0b] -proc x1f + ret - ud2 +calladdr_rbx: + mov rbx, [rsp] + ret -endproc +#elif defined(__i386__) -proc x20 + // the actual example + call 0f +0: pop eax - ud2 + // the modern i386 trick doesn't confuse the return-address + // predictor. + call get_pc_ebx + sub ebx, . - 0b ret -endproc +#elif defined(__arm__) -proc x21 + stmfd r13!, {r14} - ud2 + bl 0f +0: mov r0, r14 -endproc + bl return + sub r1, r14, #. - 0b -proc x22 + adr r2, 0b - ud2 + ldmfd r13!, {pc} -endproc +return: bx r14 -proc x23 +#elif defined(__aarch64__) - ud2 + str x30, [sp, #-16]! 
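+	// stash the link register: the bl instructions below clobber x30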
-endproc + // we can do all of the above using a64 + bl 0f +0: mov x0, x30 -proc x24 + bl return + sub x1, x30, #. - 0b - ud2 + adr x2, 0b + + ldr x30, [sp], #16 +return: ret + +#else + notimpl +#endif endproc -proc x25 +proc x1b - ud2 +#if defined(__x86_64__) + + // retpolines: an mitigation against adversarially influenced + // speculative execution at indirect branches. if an adversary can + // prepare a branch-target buffer entry matching an indirect branch + // in the victim's address space then they can cause the victim to + // /speculatively/ (but not architecturally) execute any code in + // their address space, possibly leading to leaking secrets through + // the cache. retpolines aren't susceptible to this because the + // predicted destination address is from the return-prediction stack + // which the adversary can't prime. the performance penalty is still + // essentially a branch misprediction -- for this return, and + // possibly all others already stacked. + + // (try not to crash) + lea rax, [rip + 9f] -endproc + push rax +9: ret -proc x26 +#elif defined(__i386__) - ud2 + call get_pc_ebx + lea eax, [ebx + 9f - .] -endproc + push eax +9: ret -proc x27 +#elif defined(__arm__) - ud2 + stmfd r13!, {r14} -endproc + adr r14, 8f + bx r14 -proc x28 +8: ldmfd r13!, {pc} - ud2 +#elif defined(__aarch64__) -endproc + str x30, [sp, #-16]! -proc x29 + adr x30, 8f + ret - ud2 +8: ldr x30, [sp], #16 + ret + +#else + notimpl +#endif endproc -proc x2a +proc x1c - ud2 + // ok, having a hard time seeing a use for this. the most important + // thing to note is that sp is set from `pop' /after/ it's + // incremented. -endproc +#if defined(__x86_64__) -proc x2b + // try not to crash + mov rax, rsp + and rsp, -16 + push rax - ud2 + pop rsp -endproc + // check it worked + mov rbx, rsp + ret -proc x2c +#elif defined(__i386__) - ud2 + // try not to crash + mov eax, esp + and esp, -16 + push eax -endproc + pop esp -proc x2d + // check it worked + mov ebx, esp + ret - ud2 +#elif defined(__arm__) -endproc + // not even going to dignify this + notimpl -proc x2e +#elif defined(__aarch64__) - ud2 + // not even going to dignify this + notimpl + +#else + notimpl +#endif endproc -proc x2f +proc x1d - ud2 + // monumentally cheesy way to copy 8 n bytes from buff1 to buff2. + // also clobbers words at buff2 + 8 n and buff2 - 8 for good measure. -endproc + n = 4 -proc x30 +#if defined(__x86_64__) - ud2 + mov rax, rsp // safekeeping + + // we're toast if we get hit by a signal now. fingers crossed... + .if 0 + mov rsp, buff2 + 8*n + 8 + mov rbp, buff1 + 8*n + .else + lea rsp, [rdi + 8*n + 16] + lea rbp, [rsi + 8*n] + .endif + enter 0, n + 1 + + // precise action: + // + // +---------+ +---------+ + // rbp -> | ??? | rsp -> | ??? | + // +---------+ +---------+ + // | w_{n-1} | | rbp | <- rbp' + // +---------+ +---------+ + // | ... | | w_{n-1} | + // +---------+ +---------+ + // | w_1 | | ... | + // +---------+ +---------+ + // | w_0 | | w_1 | + // +---------+ +---------+ + // | w_0 | + // +---------+ + // | rbp' | <- rsp' + // +---------+ + + mov rdx, rsp + mov rsp, rax + +#elif defined(__i386__) + + mov eax, esp // safekeeping + + // we're toast if we get hit by a signal now. fingers crossed... + .if 0 + mov esp, buff2 + 4*n + 4 + mov ebp, buff1 + 4*n + .else + lea esp, [edi + 4*n + 8] + lea ebp, [esi + 4*n] + .endif + enter 0, n + 1 + + mov edx, esp + mov esp, eax + +#elif defined(__arm__) + + add r4, r4, #4*n + add r5, r5, #4*n + 8 + + str r4, [r5, #-4]! + .rept n/2 + ldrd r0, r1, [r4, #-8]! 
+ strd r0, r1, [r5, #-8]! + .endr + add r4, r5, #4*n + str r4, [r5, #-4]! + +#elif defined(__aarch64__) + + // omgwtf. let's not actually screw with the stack pointer. + + add x4, x4, #8*n + add x5, x5, #8*n + 16 + + str x4, [x5, #-8]! + .rept n/2 + ldp x16, x17, [x4, #-16]! + stp x16, x17, [x5, #-16]! + .endr + add x4, x5, #8*n + str x4, [x5, #-8]! + +#else + notimpl +#endif ret endproc -proc x31 +proc x1e - ud2 + // convert nibble value to (uppercase) hex; other input values yield + // nonsense. + +#if defined(__x86_64__) + + // das doesn't work in 64-bit mode; best i can come up with + mov edx, eax + add al, '0' + add dl, 'A' - 10 + cmp al, '9' + 1 + cmovae eax, edx + +#elif defined(__i386__) + + cmp al, 0x0a // cf = 1 iff a < 10 + sbb al, 0x69 // if 0 <= a < 10, a' = a - 0x6a, so + // 0x96 <= a' < 0x70, setting af, cf + // if 10 <= a < 16, a' = a - 0x69, so + // 0x71 <= a' < 0x77, setting cf but + // clearing af + das // if 0 <= a < 10, then af and cf are + // both set, so set subtract 0x66 + // from a' leaving 0x30 <= a' < 0x3a; + // if 10 <= a < 16 then af clear but + // cf set, so subtract 0x60 from a' + // leaving 0x41 <= a' < 0x47 + +#elif defined(__arm__) + + // significantly less tricksy + cmp r0, #10 + addlo r0, r0, #'0' + addhs r0, r0, #'A' - 10 + +#elif defined(__aarch64__) + + // with less versatile conditional execution this is the best we can + // do + cmp w0, #10 + add w16, w0, #'A' - 10 + add w0, w0, #'0' + cmov.hs w0, w16 + +#else + notimpl +#endif + + ret endproc -proc x32 +proc x1f - ud2 + // verify collatz conjecture starting at a; assume a /= 0! + +#if defined(__x86_64__) + +0: bsf rcx, rax // clobber c if a = 0 + shr rax, cl // a = 2^c a' + cmp rdx, 0 + je 1f + stosq + dec rdx +1: + cmp rax, 1 // done? + je 9f + lea rax, [2*rax + rax + 1] // a' = 3 a' + 1 + jmp 0b // again + +9: ret + +#elif defined(__i386__) + +0: bsf ecx, eax // clobber c if a = 0 + shr eax, cl // a = 2^c a' + cmp edx, 0 + je 1f + stosd + dec edx +1: + cmp eax, 1 // done? 
+ je 9f + lea eax, [2*eax + eax + 1] // a' = 3 a' + 1 + jmp 0b // again + +9: ret + +#elif defined(__arm__) + + // rbit introduced in armv7 +0: rbit r2, r0 + clz r2, r2 + mov r0, r0, lsr r2 // a = 2^c a' + cmp r3, #0 + strne r0, [r5], #4 + subne r3, r3, #1 + cmp r0, #1 + adcne r0, r0, r0, lsl #1 // a' = 3 a' + 1 (because c set) + bne 0b -endproc + ret -proc x33 +#elif defined(__aarch64__) + +0: rbit w2, w0 + clz w2, w2 + lsr w0, w0, w2 // a = 2^c a' + cmp x3, #0 + beq 1f + str x0, [x5], #8 + sub x3, x3, #1 +1: + cmp w0, #1 + add w16, w0, w0, lsl #1 // t = 3 a' + 1 (because c set) + csinc.eq w0, w0, w16 + b.ne 0b - ud2 + ret + +#else + notimpl +#endif endproc -proc x34 +///-------------------------------------------------------------------------- +/// 0x20--0x2f - ud2 +proc x20 + + // calculate 1337 a slowly + +#if defined(__x86_64__) + + // original version + mov rcx, rax // c = a + shl rcx, 2 // c = 4 a + add rcx, rax // c = 5 a + shl rcx, 3 // c = 40 a + add rcx, rax // c = 41 a + shl rcx, 1 // c = 82 a + add rcx, rax // c = 83 a + shl rcx, 1 // c = 166 a + add rcx, rax // c = 167 a + shl rcx, 3 // c = 1336 a + add rcx, rax // c = 1337 a + + // a quick way + lea rdx, [2*rax + rax] // t = 3 a + shl rdx, 6 // t = 192 a + sub rdx, rax // t = 191 a + lea rbx, [8*rdx] // b = 1528 a + sub rbx, rdx // b = 1337 a + +#elif defined(__i386__) + + // original version + mov ecx, eax // c = a + shl ecx, 2 // c = 4 a + add ecx, eax // c = 5 a + shl ecx, 3 // c = 40 a + add ecx, eax // c = 41 a + shl ecx, 1 // c = 82 a + add ecx, eax // c = 83 a + shl ecx, 1 // c = 166 a + add ecx, eax // c = 167 a + shl ecx, 3 // c = 1336 a + add ecx, eax // c = 1337 a + + // a quick way + lea edx, [2*eax + eax] // t = 3 a + shl edx, 6 // t = 192 a + sub edx, eax // t = 191 a + lea ebx, [8*edx] // b = 1528 a + sub ebx, edx // b = 1337 a + +#elif defined(__arm__) + + // original version, ish + add r2, r0, r0, lsl #2 // c = 5 a + add r2, r0, r2, lsl #3 // c = 41 a + add r2, r0, r2, lsl #1 // c = 83 a + add r2, r0, r2, lsl #1 // c = 167 a + add r2, r0, r2, lsl #3 // c = 1337 a + + // quicker way + add r1, r0, r0, lsl #1 // b = 3 a + rsb r1, r0, r1, lsl #6 // b = 191 a + rsb r1, r1, r1, lsl #3 // b = 1337 a + +#elif defined(__aarch64__) + + // original version, ish + add x2, x0, x0, lsl #2 // c = 5 a + add x2, x0, x2, lsl #3 // c = 41 a + add x2, x0, x2, lsl #1 // c = 83 a + add x2, x0, x2, lsl #1 // c = 167 a + add x2, x0, x2, lsl #3 // c = 1337 a + + // sleazy because no rsb + add x1, x0, x0, lsl #1 // b = 3 a + sub x1, x0, x1, lsl #6 // b = -191 a + sub x1, x1, x1, lsl #3 // b = 1337 a + +#else + notimpl +#endif + + ret endproc -proc x35 +proc x21 - ud2 + // multiply complex numbers a + b i and c + d i + // + // (a + b i) (c + d i) = (a c - b d) + (a d + b c) i + // + // somewhat slick approach uses only three multiplications + +#if defined(__x86_64__) + + mov rsi, rax // t = a + add rax, rbx // a' = a + b + mov rdi, rdx // u = d + sub rdx, rcx // d' = d - c + add rdi, rcx // u = c + d + + imul rax, rcx // a' = c (a + b) + imul rsi, rdx // t = a (d - c) + imul rdi, rbx // u = b (c + d) + + add rsi, rax // t = a (d - c) + c (a + b) + mov rbx, rsi // b' = a (d - c) + c (a + b) + // = a d + b c + sub rax, rdi // a' = c (a + b) - b (c + d) + // = a c - b d + +#elif defined(__i386__) + + mov esi, eax // t = a + add eax, ebx // a' = a + b + mov edi, edx // u = d + sub edx, ecx // d' = d - c + add edi, ecx // u = c + d + + imul eax, ecx // a' = c (a + b) + imul esi, edx // t = a (d - c) + imul edi, ebx // u = b (c + d) + + add 
esi, eax // t = a (d - c) + c (a + b) + mov ebx, esi // b' = a (d - c) + c (a + b) + // = a d + b c + sub eax, edi // a' = c (a + b) - b (c + d) + // = a c - b d + +#elif defined(__arm__) + + add r4, r0, r1 // t = a + b + add r5, r2, r3 // u = c + d + sub r3, r3, r2 // d' = d - c + + // mls introduced in armv7 + mul r4, r4, r2 // t = c (a + b) + mov r2, r1 // c' = a (bah!) + mla r1, r0, r3, r4 // b' = a (d - c) + c (a + b) + // = a d + b c + mls r0, r2, r5, r4 // a' = c (a + b) - b (c + d) + // = a c - b d + +#elif defined(__aarch64__) + + add x4, x0, x1 // t = a + b + add x5, x2, x3 // u = c + d + sub x3, x3, x2 // d' = d - c + + // mls intxoduced in axmv7 + mul x4, x4, x2 // t = c (a + b) + mov x2, x1 // c' = a (bah!) + madd x1, x0, x3, x4 // b' = a (d - c) + c (a + b) + // = a d + b c + msub x0, x2, x5, x4 // a' = c (a + b) - b (c + d) + // = a c - b d + +#else + notimpl +#endif + + ret endproc -proc x36 +proc x22 - ud2 + // divide by 3 -endproc +#if defined(__x86_64__) -proc x37 + mov rdx, 0xaaaaaaaaaaaaaaab // = ceil(2/3 2^64) + mul rdx // d' || a' =~ 2/3 a 2^64 + shr rdx, 1 // d' = floor(a/3) + mov rax, rdx // a' = floor(a/3) - ud2 + // we start with 0 <= a < 2^64. write f = ceil(2/3 2^64), so that + // 2/3 < f/2^64 < 2/3 + 1/2^64. then floor(2/3 a) <= floor(a f/2^64) + // <= floor(2/3 a + a/2^64), but a < 2^64 so a/2^64 < 1 and + // floor(a f/2^64) = floor(2/3 a). -endproc +#elif defined(__i386__) -proc x38 + mov edx, 0xaaaaaaab // = ceil(2/3 2^32) + mul edx // d' || a' =~ 2/3 a 2^32 + shr edx, 1 // d' = floor(a/3) + mov eax, edx // a' = floor(a/3) - ud2 +#elif defined(__arm__) -endproc + ldr r12, =0xaaaaaaab + umull r12, r0, r0, r12 + mov r0, r0, lsr #1 -proc x39 +#elif defined(__aarch64__) - ud2 + ldr x16, =0xaaaaaaaaaaaaaaab + umulh x0, x0, x16 + lsr x0, x0, #1 + +#else + notimpl +#endif + + ret endproc -proc x3a +proc x23 - ud2 +#if defined(__x86_64__) -endproc + // main loop: shorten a preserving residue class mod 3 +0: cmp rax, 5 + jbe 8f + // a > 5 + mov rdx, rax // d' = a + shr rdx, 2 // d' = floor(a/4) + and rax, 3 // a = 4 d' + a' (0 <= a' < 4) + add rax, rdx // a' == a (mod 3) but a' < a/4 + 4 + jmp 0b -proc x3b + // fix up final value 0 <= a < 6: want 0 <= a < 3 + // + // the tricky part is actually a = 3; but the other final cases take + // additional iterations which we can avoid. +8: cmp rax, 3 // set cf iff a < 3 + cmc // set cf iff a >= 3 + sbb rdx, rdx // d' = a >= 3 ? -1 : 0 + and rdx, 3 // d' = a >= 3 ? 3 : 0 + sub rax, rdx // a' = a - (a >= 3 ? 3 : 0) + // = a (mod 3) + +#elif defined(__i386__) + + // main loop: shorten a preserving residue class mod 3 +0: cmp eax, 5 + jbe 8f + // a > 5 + mov edx, eax // d' = a + shr edx, 2 // d' = floor(a/4) + and eax, 3 // a = 4 d' + a' (0 <= a' < 4) + add eax, edx // a' == a (mod 3) but a' < a/4 + 4 + jmp 0b + + // fix up final value 0 <= a < 6: want 0 <= a < 3 + // + // the tricky part is actually a = 3; but the other final cases take + // additional iterations which we can avoid. +8: cmp eax, 3 // set cf iff a < 3 + cmc // set cf iff a >= 3 + sbb edx, edx // d' = a >= 3 ? -1 : 0 + and edx, 3 // d' = a >= 3 ? 3 : 0 + sub eax, edx // a' = a - (a >= 3 ? 
3 : 0) + // = a (mod 3) - ud2 +#elif defined(__arm__) -endproc +0: cmp r0, #6 + andhs r12, r0, #3 + addhs r0, r12, r0, lsr #2 + bhs 0b -proc x3c + cmp r0, #3 + subhs r0, r0, #3 - ud2 +#elif defined(__aarch64__) -endproc +0: cmp x0, #6 + // blunder on through regardless since this doesn't affect the result + and x16, x0, #3 + add x0, x16, x0, lsr #2 + b.hs 0b -proc x3d + subs x16, x0, #3 + cmov.hs x0, x16 - ud2 +#else + notimpl +#endif + + ret endproc -proc x3e +proc x24 - ud2 + // invert (odd) a mod 2^64 + // + // suppose a a_i == 1 (mod 2^{2^i}) + // + // clearly good for i = 0, since 2^i = 1 and 2^{2^i} = 2, and a_0 = + // a == 1 (mod 2) by assumption + // + // write a a_i == b_i 2^{2^i} + 1 (mod 2^{2^{i+1}}) + // then b_i == (a a_i - 1)/2^{2^i} (mod 2^{2^i}) + // to lift inverse, we want x such that a x == -b_i (mod 2^{2^i}); + // clearly x = -a_i b_i will do, since a a_i == 1 (mod 2^{2^i}) + // then: + // a_{i+1} = a_i - a_i b_i 2^{2^i} = a_i (1 - (a a_i - 1)) + // = 2 a_i - a a_i^2 + // + // check: + // a a_{i+1} = 2 a a_i - a^2 a_i^2 + // == 2 a a_i - (b_i 2^{2^i} + 1)^2 + // == 2 (b_i 2^{2^i} + 1) - + // (b_i^2 2^{2^{i+1}} + 2 b_i 2^{2^i} + 1) + // == 1 (mod 2^{2^{i+1}}) + +#if defined(__x86_64__) + + // rax // a_0 = a + mov rbx, rax // b' = a + mov rsi, rax // t = a_0 + +0: + cmp rbp, 0 + je 1f + stosq + dec rbp +1: + mul rbx // a' = a a_i + mov rcx, rax // c = a a_i + + sub rax, 2 // a' = a a_i - 2 + neg rax // a' = 2 - a a_i + mul rsi // a_{i+1} = a_i (2 - a a_i) + // = 2 a_i - a a_i^2 + mov rsi, rax // t = a_{i+1} + + cmp rcx, 1 // done? + ja 0b // no -- iterate + +#elif defined(__i386__) + + // eax // a_0 = a + mov ebx, eax // b' = a + mov esi, eax // t = a_0 + +0: + cmp ebp, 0 + je 1f + stosd + dec ebp +1: + mul ebx // a' = a a_i + mov ecx, eax // c = a a_i + + sub eax, 2 // a' = a a_i - 2 + jb 9f // done if < 2 + neg eax // a' = 2 - a a_i + mul esi // a_{i+1} = a_i (2 - a a_i) + // = 2 a_i - a a_i^2 + mov esi, eax // t = a_{i+1} + + jmp 0b // and iterate +9: mov eax, esi // restore + +#elif defined(__arm__) + + // r0 // a_0 = a + mov r1, r0 // b' = a + +0: + cmp r6, #0 + strne r0, [r5], #4 + subne r6, r6, #1 + mul r2, r0, r1 // c = a a_i + rsbs r2, r2, #2 // c = 2 - a a_i + mul r0, r0, r2 // a_{i+1} = a_i (2 - a a_i) + // = 2 a_i - a a_i^2 + blo 0b + +#elif defined(__aarch64__) + + // x0 // a_0 = a + mov x1, x0 // b' = a + mov x16, #2 // because we have no rsb + +0: + cmp x6, #0 + b.eq 1f + str x0, [x5], #8 + sub x6, x6, #1 +1: + mul x2, x0, x1 // c = a a_i + subs x2, x16, x2 // c = 2 - a a_i + mul x0, x0, x2 // a_{i+1} = a_i (2 - a a_i) + // = 2 a_i - a a_i^2 + b.lo 0b + +#else + notimpl +#endif + + ret endproc -proc x3f +proc x25 - ud2 + // a poor approximation to pi/4 + // + // think of x and y as being in 16.16 fixed-point format. we sample + // points in the unit square, and determine how many of them are + // within a unit quarter-circle centred at the origin. the area of + // the quarter-circle is pi/4. -endproc +#if defined(__x86_64__) + + xor eax, eax // a = 0 + mov rcx, 1 + shl rcx, 0x20 // c =~ 4 billion + +0: movzx rbx, cx // x = low 16 bits of c + imul rbx, rbx // b = x^2 + + ror rcx, 0x10 // switch halves of c + movzx rdx, cx // y = high 16 bits of c + imul rdx, rdx // d = y^2 + rol rcx, 0x10 // switch back + + add rbx, rdx // r^2 = x^2 + y^2 + shr rbx, 0x20 // r^2 >= 1? + cmp rbx, 1 // set cf iff r^2 >= 1 + adc rax, 0 // and add onto accumulator + loop 0b + +#elif defined(__i386__) + + // this is actually better done in 32 bits. 
the carry has the wrong + // sense here, so instead deduct one for each point outside the + // quarter-circle rather than adding one for each point inside it. + xor eax, eax + xor ecx, ecx + +0: movzx ebx, cx + imul ebx, ebx + + mov edx, ecx + shr edx, 0x10 + imul edx, edx + + add ebx, edx // see? + sbb eax, 0 + loop 0b + +#elif defined(__arm__) + + mov r0, #0 + mov r2, #0 + +0: uxth r1, r2, ror #0 + uxth r3, r2, ror #16 + mul r1, r1, r1 + mul r3, r3, r3 + cmn r1, r3 // mlas doesn't set cf usefully + addcc r0, r0, #1 + adds r2, r2, #1 + bne 0b + +#elif defined(__aarch64__) + + mov w0, #0 + mov w2, #0 + +0: ubfx w1, w2, #0, #16 + ubfx w3, w2, #16, #16 + sub w2, w2, #1 + mul w1, w1, w1 + mul w3, w3, w3 + cmn w1, w3 + cinc.cc w0, w0 + cbnz w2, 0b + +#else + notimpl +#endif + + ret + +endproc + +proc x26 + + // a bad way to rotate a right by 7 places + +#if defined(__x86_64__) + + mov rbx, rax + ror rbx, 7 // better + + mov rdx, rax // d' = a + shr rax, 7 // a' = a >> 7 + shl rdx, 0x39 // d' = a << 57 + or rax, rdx // a' = a >>> 7 + +#elif defined(__i386__) + + mov ebx, eax + ror ebx, 7 // better + + mov edx, eax // d' = a + shr eax, 7 // a' = a >> 7 + shl edx, 0x39 // d' = a << 57 + or eax, edx // a' = a >>> 7 + +#elif defined(__arm__) + + mov r1, r0, ror #7 // easy way + + // even the hard way is fairly easy on arm + mov r3, r0, lsl #25 + orr r0, r3, r0, lsr #7 // hard way + +#elif defined(__aarch64__) + + ror x1, x0, #7 // easy way + + // even the hard way is fairly easy on arm + lsl x3, x0, #57 + orr x0, x3, x0, lsr #7 // hard way + +#else + notimpl +#endif + + ret + +endproc + +proc x27 + + // shift a right by c places, in two halves + +#if defined(__x86_64__) + + mov ch, cl // c' = [c, c] + inc ch // c' = [c, c + 1] + shr ch, 1 + shr cl, 1 // c' = [floor(c/2), ceil(c/2)] + shr rax, cl + xchg ch, cl + shr rax, cl + +#elif defined(__i386__) + + mov ch, cl // c' = [c, c] + inc ch // c' = [c, c + 1] + shr ch, 1 + shr cl, 1 // c' = [floor(c/2), ceil(c/2)] + shr eax, cl + xchg ch, cl + shr eax, cl + +#elif defined(__arm__) + + // it would be clearer and more efficient to say: `mov r12, r2, lsr + // #1; sub r2, r2, r12', but that's not the lesson this exercise is + // trying to teach. 
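+	// either way, floor(c/2) + ceil(c/2) = c, so the two partial shifts
+	// between them shift right by the full c places.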
+ add r12, r2, #1 + mov r2, r2, lsr #1 + mov r12, r12, lsr #1 + mov r0, r0, lsr r2 + mov r0, r0, lsr r12 + +#elif defined(__aarch64__) + + add w16, w2, #1 + lsr w2, w2, #1 + lsr w16, w16, #1 + lsr x0, x0, x2 + lsr x0, x0, x16 + +#else + notimpl +#endif + + ret + +endproc + +proc x28 + + // divide c-byte little-endian bignum at rsi by 2 (rounding down) + +#if defined(__x86_64__) + + clc +0: rcr byte ptr [rsi], 1 + inc rsi + loop 0b + +#elif defined(__i386__) + + clc +0: rcr byte ptr [esi], 1 + inc esi + loop 0b + +#elif defined(__arm__) + + // we could hack this a word at a time using rrx + mov r3, #0 +0: ldrb r12, [r4] + subs r2, r2, #1 + orr r3, r3, r12, lsr #1 + strb r3, [r4], #1 + mov r3, r12, lsl #7 + bne 0b + +#elif defined(__aarch64__) + + mov w16, #0 +0: ldrb w17, [x4] + sub x2, x2, #1 + orr w16, w16, w17, lsr #1 + strb w16, [x4], #1 + lsl w16, w17, #7 + cbnz x2, 0b + +#else + notimpl +#endif + + ret + +endproc + +proc x29 + + // fill a buffer with a 3-byte pattern + +#if defined(__x86_64__) + + lea rdi, [rsi + 3] + rep movsb + +#elif defined(__i386__) + + lea edi, [esi + 3] + rep movsb + +#elif defined(__arm__) + + add r5, r4, #3 +0: subs r2, r2, #1 + ldrhsb r12, [r4], #1 + strhsb r12, [r5], #1 + bhs 0b + +#elif defined(__aarch64__) + + cbz x2, 9f + add x5, x4, #3 +0: sub x2, x2, #1 + ldrb w16, [x4], #1 + strb w16, [x5], #1 + cbnz x2, 0b +9: + +#else + notimpl +#endif + + ret + +endproc + +proc x2a + + // rotate the words in a buffer, so that the last word comes first, + // the first comes second, and so on. this isn't a good way to do + // it. + +#if defined(__x86_64__) + + mov rsi, rbx // set string pointers + mov rdi, rbx +0: lodsq // fetch next word + xchg rax, qword ptr [rbx] // stash it for next iteration and + // replace it with the previously + // stashed word + stosq // store in output + // (note that the first iteration doesn't actually do anything) + loop 0b // continue until all done + +#elif defined(__i386__) + + mov esi, ebx // set string pointers + mov edi, ebx +0: lodsd // fetch next word + xchg eax, dword ptr [ebx] // stash it for next iteration and + // replace it with the previously + // stashed word + stosd // store in output + loop 0b // continue until all done + +#elif defined(__arm__) + + // let's do this a sensible way. (we could go faster using ldm/stm.) + add r0, r1, r2, lsl #2 // find the end of the buffer + ldr r0, [r0, #-4] // collect final element +0: subs r2, r2, #1 + ldr r12, [r1] + str r0, [r1], #4 + mov r0, r12 + bne 0b + +#elif defined(__aarch64__) + + add x0, x1, x2, lsl #3 // find the end of the buffer + ldr x0, [x0, #-8] // collect final element +0: sub x2, x2, #1 + ldr x16, [x1] + str x0, [x1], #8 + mov x0, x16 + cbnz x2, 0b + +#else + notimpl +#endif + + ret + +endproc + +proc x2b + + // find a cycle in a function f: B -> B, where B = {0, 1, ..., 255} + +#if defined(__x86_64__) + + // this is floyd's cycle-finding algorithm. + // + // consider the sequence s_0 = 0, s_1 = f(0), s_2 = f(f(0)), ..., + // s_{i+1} = f(s_i). since B is finite, there must be some smallest + // t and c such that s(t) = s(t + c); then we have s_i = s_j iff + // i >= t, j >= t, and i == j (mod c). + // + // the algorithm sets two cursors advancing through the sequence: a + // /tortoise/ which advances one step at a time, and a /hare/ which + // advances by two, so when the tortoise is at element s_i, the hare + // is at s_{2i}. 
the hare will run around the cycle and catch the + // tortoise when i >= t and i == 2 i (mod c); the latter is simply i + // == 0 (mod c), which therefore happens first when i = k = t + + // (-t mod c). + // + // i'm not sure what good xlatb does here that mov al, [rbx + al] + // doesn't. + + xor eax, eax // tortoise starts at 0 + xor edx, edx // hare starts at 0 +0: xlatb // advance tortoise + xchg rax, rdx // switch to hare + xlatb // advance hare ... + xlatb // ... twice + xchg rax, rdx // switch back + cmp al, dl // hare caught the tortoise? + jnz 0b // no -- go around again + + // now we trace the initial tail: reset the tortoise to s_0, and slow + // the hare down so that both take only a single step in each + // iteration. this loop terminates when i >= t and i == i + 2 k + // (mod c). we know k is a multiple of c, so the latter condition + // always holds, so this finds the first step of the cycle. + + xor eax, eax // reset the tortoise +0: xlatb // advance tortoise + xchg rax, rdx // switch to hare + xlatb // advance hare + xchg rax, rdx // and switch back + cmp al, dl // done? + jnz 0b // no -- iterate + +#elif defined(__i386__) + + xor eax, eax // tortoise starts at 0 + xor edx, edx // hare starts at 0 +0: xlatb // advance tortoise + xchg eax, edx // switch to hare + xlatb // advance hare ... + xlatb // ... twice + xchg eax, edx // switch back + cmp al, dl // hare caught the tortoise? + jnz 0b // no -- go around again + + xor eax, eax // reset the tortoise +0: xlatb // advance tortoise + xchg eax, edx // switch to hare + xlatb // advance hare + xchg eax, edx // and switch back + cmp al, dl // done? + jnz 0b // no -- iterate + +#elif defined(__arm__) + + mov r0, #0 + mov r3, #0 +0: ldrb r0, [r1, r0] + ldrb r3, [r1, r3] + ldrb r3, [r1, r3] + cmp r0, r3 + bne 0b + + mov r0, #0 +0: ldrb r0, [r1, r0] + ldrb r3, [r1, r3] + cmp r0, r3 + bne 0b + +#elif defined(__aarch64__) + + mov w0, #0 + mov w3, #0 +0: ldrb w0, [x1, x0] + ldrb w3, [x1, x3] + ldrb w3, [x1, x3] + cmp w0, w3 + b.ne 0b + + mov w0, #0 +0: ldrb w0, [x1, x0] + ldrb w3, [x1, x3] + cmp w0, w3 + b.ne 0b + +#else + notimpl +#endif + + ret + +endproc + +proc x2c + + // a convoluted way to set rax = rsi + +#if defined(__x86_64__) + + mov qword ptr [rbx + 8*rcx], 0 // b[c] = 0 + mov qword ptr [rbx + 8*rdx], 1 // b[d] = 1 + mov rax, [rbx + 8*rcx] // a' = b[c] = 0 + + mov [rbx], rsi // b[0] = t + mov [rbx + 8], rdi // b[1] = u + mov rax, [rbx + 8*rax] // a' = b[a'] = b[0] = t + +#elif defined(__i386__) + + mov dword ptr [ebx + 8*ecx], 0 // b[c] = 0 + mov dword ptr [ebx + 8*edx], 1 // b[d] = 1 + mov eax, [ebx + 8*ecx] // a' = b[c] = 0 + + mov [ebx], esi // b[0] = t + mov [ebx + 8], edi // b[1] = u + mov eax, [ebx + 8*eax] // a' = b[a'] = b[0] = t + +#elif defined(__arm__) + + mov r0, #0 + mov r12, #1 + + str r0, [r1, r2, lsl #2] + str r12, [r1, r3, lsl #2] + ldr r0, [r1, r2, lsl #2] + + str r4, [r1] + str r5, [r1, #4] + ldr r0, [r1, r0, lsl #2] + +#elif defined(__aarch64__) + + mov x16, #1 + + str xzr, [x1, x2, lsl #3] + str x16, [x1, x3, lsl #3] + ldr x0, [x1, x2, lsl #3] + + str x4, [x1] + str x5, [x1, #8] + ldr x0, [x1, x0, lsl #3] + +#else + notimpl +#endif + + ret + +endproc + +proc x2d + + // clear the least significant set bit in a, by calculating a' = + // a AND (a - 1). + // + // if a = 0 then a' = 0. otherwise, a - 1 differs from a exactly in + // the least significant /set/ bit of a, and all bits of lesser + // significance. to put it another way: write a = u 2^{k+1} + 2^k; + // then a - 1 = u 2^{k+1} + 2^{k-1} + ... 
+ 2 + 1. taking the + // bitwise AND of these leaves only the bits common to both, i.e., + // u 2^{k+1}. + +#if defined(__x86_64__) + + mov rdx, rax // d' = a + dec rax // a' = a - 1 + and rax, rdx // a' = a AND (a - 1) + +#elif defined(__i386__) + + mov edx, eax // d' = a + dec eax // a' = a - 1 + and eax, edx // a' = a AND (a - 1) + +#elif defined(__arm__) + + sub r3, r0, #1 + and r0, r0, r3 + +#elif defined(__aarch64__) + + sub x3, x0, #1 + and x0, x0, x3 + +#else + notimpl +#endif + + ret + +endproc + +proc x2e + + // compute a mask of one bits in exactly the positions of the + // low-order run of zero bits in a + +#if defined(__x86_64__) + + mov rdx, rax // d' = a + dec rdx // d' = a - 1 + xor rax, rdx // a = a XOR (a - 1) + // set bits are least significant + // set bit of a, and all bits of + // lesser significance + shr rax, 1 // now only bits of lesser + // significance; a' = 0 iff a odd + cmp rax, rdx // equal if a = 0 or 2^k; otherwise + // strictly less + +#elif defined(__i386__) + + mov edx, eax + dec edx + xor eax, edx + shr eax, 1 + cmp eax, edx + +#elif defined(__arm__) + + sub r3, r0, #1 + eor r0, r0, r3 + mov r0, r0, lsr #1 // probably fold shift into next inst + cmp r0, r3 + +#elif defined(__aarch64__) + + sub x3, x0, #1 + eor x0, x0, x3 + mov x0, x0, lsr #1 // probably fold shift into next inst + cmp x0, x3 + +#else + notimpl +#endif + + ret + +endproc + +proc x2f + + // a slow population count + +#if defined(__x86_64__) + + popcnt rbx, rcx // the easy way + + // a fast version in software + mov rax, rcx + + mov rdx, rcx + shr rdx, 1 + mov rsi, 0x5555555555555555 + and rax, rsi + and rdx, rsi + add rax, rdx + + mov rdx, rax + shr rdx, 2 + mov rsi, 0x3333333333333333 + and rax, rsi + and rdx, rsi + add rax, rdx + + mov rdx, rax + shr rdx, 32 + add rax, rdx + + mov rdx, rax + shr rdx, 4 + and rax, 0x0f0f0f0f + and rdx, 0x0f0f0f0f + add rax, rdx + + mov rdx, rax + shr rdx, 8 + add rax, rdx + + mov rdx, rax + shr rdx, 16 + add rax, rdx + movzx rsi, al + + // the official version + xor eax, eax // clear iteration counter +0: jrcxz 9f // bail if c = 0 + inc rax // bump iteration count + mov rdx, rcx // d' = c + dec rdx // d' = c - 1 + and rcx, rdx // zap least significant set bit of c + jmp 0b // and go again +9: + +#elif defined(__i386__) + + popcnt ebx, ecx // the easy way + + mov eax, ecx + + mov edx, ecx + shr edx, 1 + and eax, 0x55555555 + and edx, 0x55555555 + add eax, edx + + mov edx, eax + shr edx, 2 + and eax, 0x33333333 + and edx, 0x33333333 + add eax, edx + + mov edx, eax + shr edx, 4 + add eax, edx + + mov edx, eax + shr edx, 8 + and eax, 0x000f000f + and edx, 0x000f000f + add eax, edx + + mov edx, eax + shr edx, 16 + add eax, edx + movzx esi, al + + xor eax, eax +0: jecxz 9f + inc eax + mov edx, ecx + dec edx + and ecx, edx + jmp 0b +9: + +#elif defined(__arm__) + + // the easy-ish way + vmov d0[0], r2 + vcnt.8 d0, d0 + vmov r1, d0[0] + add r1, r1, r1, lsl #8 + add r1, r1, r1, lsl #16 + mov r1, r1, lsr #24 + + // the hard way + movw r12, #0x5555 + movt r12, #0x5555 + and r3, r12, r2, lsr #1 + and r0, r12, r2 + add r0, r0, r3 + + movw r12, #0x3333 + movt r12, #0x3333 + and r3, r12, r0, lsr #2 + and r0, r12, r0 + add r0, r0, r3 + + add r0, r0, r0, lsl #16 + + movt r12, 0x0f0f + and r3, r12, r0, lsr #4 + and r0, r12, r0 + add r0, r0, r3 + + add r0, r0, r0, lsl #8 + + mov r4, r0, lsr #24 + + // and following the exercise + mov r0, #0 + cmp r2, #0 + beq 9f +0: add r0, r0, #1 + sub r3, r2, #1 + ands r2, r2, r3 + bne 0b +9: + +#elif defined(__aarch64__) + + // the 
easy-ish way + mov v0.d[0], x2 + cnt v0.8b, v0.8b + mov x1, v0.d[0] + add x1, x1, x1, lsl #8 + add x1, x1, x1, lsl #16 + add x1, x1, x1, lsl #32 + lsr x1, x1, #56 + + // the hard way -- though arm64's immediate constant encodings and + // shifting make this actually rather pleasant. + and x3, x2, #0xaaaaaaaaaaaaaaaa + and x0, x2, #0x5555555555555555 + add x0, x0, x3, lsr #1 + + and x3, x0, #0xcccccccccccccccc + and x0, x0, #0x3333333333333333 + add x0, x0, x3, lsr #2 + + add x0, x0, x0, lsr #4 + + and x3, x0, #0x0f000f000f000f00 + and x0, x0, #0x000f000f000f000f + add x0, x3, x0, lsl #8 + + add x0, x0, x0, lsl #16 + add x0, x0, x0, lsl #32 + lsr x4, x0, #56 + + // and the official way + mov x0, #0 + cbz x2, 9f +0: add x0, x0, #1 + sub x3, x2, #1 + and x2, x2, x3 + cbnz x2, 0b +9: + +#else + notimpl +#endif + + ret + +endproc + +///-------------------------------------------------------------------------- +/// 0x30--0x3f + +proc x30 + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + + ret + +endproc + +proc x31 + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x32 + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x33 + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x34 + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x35 + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x36 + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x37 + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x38 + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x39 + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x3a + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x3b + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x3c + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) 
+ + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x3d + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x3e + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x3f + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +///----- That's all, folks --------------------------------------------------
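+
+///--------------------------------------------------------------------------
+/// Postscript: portable restatements (reference only).
+///
+/// The following C sketches restate what a couple of the routines above
+/// compute, purely as an aid when reading the commentary.  They are not
+/// part of the exercises: the function names are invented here, the code
+/// assumes <stdint.h>, and each sketch mirrors the structure of the
+/// assembler above rather than aiming to be the best C rendering.
+///
+///    /* x2b: floyd's tortoise-and-hare cycle finding over f: B -> B.
+///     * both loops advance before comparing, exactly as the assembler
+///     * does, so the value returned is the meeting point on the cycle:
+///     * the first element of the cycle whenever the tail is nonempty. */
+///    static unsigned char cycle_meet(const unsigned char f[256])
+///    {
+///      unsigned char t = 0, h = 0;
+///
+///      /* tortoise takes one step per iteration, hare takes two */
+///      do { t = f[t]; h = f[f[h]]; } while (t != h);
+///
+///      /* reset the tortoise; both now take single steps */
+///      t = 0;
+///      do { t = f[t]; h = f[h]; } while (t != h);
+///      return t;
+///    }
+///
+///    /* x2f: the software population count, following the same widening
+///     * steps as the x86-64 version above: bit pairs, nibbles, fold the
+///     * top half down, bytes, then accumulate the bytes. */
+///    static unsigned popcount64(uint64_t x)
+///    {
+///      x = (x & 0x5555555555555555u) + ((x >> 1) & 0x5555555555555555u);
+///      x = (x & 0x3333333333333333u) + ((x >> 2) & 0x3333333333333333u);
+///      x += x >> 32;
+///      x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);
+///      x += x >> 8;
+///      x += x >> 16;
+///      return x & 0xff;
+///    }
+///
+///    /* x2f, `the official version': repeatedly clear the lowest set
+///     * bit (the x2d trick) and count how many times that was possible. */
+///    static unsigned popcount_slow(uint64_t x)
+///    {
+///      unsigned n = 0;
+///      while (x) { x &= x - 1; n++; }
+///      return n;
+///    }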