[xchg-rax-rax] / xchg.S

/// -*- mode: asm; asm-comment-char: 0 -*-

///--------------------------------------------------------------------------
/// Preliminaries.

#include <sys/syscall.h>

#if defined(__i386__) || defined(__x86_64__)

	.intel_syntax noprefix

#elif defined(__arm__)

.macro	ret
	bx	r14
.endm

	.arch	armv7-a
	.fpu	neon

#elif defined(__aarch64__)

.macro	cmov	rd, rn, cc
	csel	\rd, \rn, \rd, \cc
.endm
#define _COND(_)							\
	_(eq) _(ne) _(cs) _(cc) _(vs) _(vc) _(mi) _(pl)			\
	_(ge) _(lt) _(gt) _(le) _(hi) _(ls) _(al) _(nv)			\
	_(hs) _(lo)
#define _INST(_)							\
	_(ccmp) _(ccmn)							\
	_(csel) _(cmov)							\
	_(csinc) _(cinc) _(cset)					\
	_(csneg) _(cneg)						\
	_(csinv) _(cinv) _(csetm)
#define _CONDVAR(cc) _definstvar cc;
#define _INSTVARS(inst)							\
	.macro _definstvar cc;						\
	  .macro inst.\cc args:vararg; inst \args, \cc; .endm;		\
	.endm;								\
	_COND(_CONDVAR);						\
	.purgem _definstvar;
	_INST(_INSTVARS)
#undef _COND
#undef _INST
#undef _CONDVAR
#undef _INSTVARS

#define CCMP_N 8
#define CCMP_Z 4
#define CCMP_C 2
#define CCMP_V 1

#define CCMP_MI CCMP_N
#define CCMP_PL 0
#define CCMP_EQ CCMP_Z
#define CCMP_NE 0
#define CCMP_CS CCMP_C
#define CCMP_HS CCMP_C
#define CCMP_CC 0
#define CCMP_LO 0
#define CCMP_VS CCMP_V
#define CCMP_VC 0
#define CCMP_HI CCMP_C
#define CCMP_LS 0
#define CCMP_LT CCMP_N
#define CCMP_GE 0
#define CCMP_LE CCMP_N
#define CCMP_GT 0

#else
#  error "not supported"
#endif

.macro	proc	name
	.globl	\name
	.type	\name, STT_FUNC
	.p2align 4
\name\():
  .macro endproc
	.size	\name, . - \name
	.purgem	endproc
  .endm
.endm

.macro ch c
#if defined(__i386__)

	pushf
	push	eax
	push	ebx
	push	ecx
	push	edx
	push	ebp
	mov	ebp, esp
	and	esp, -16

	push	\c
	call	putchar@plt

	call	get_pc_ebx
	add	ebx, offset _GLOBAL_OFFSET_TABLE
	mov	eax, [ebx + stdout@GOT]
	mov	eax, [eax]
	call	fflush@plt

	mov	esp, ebp
	pop	ebp
	pop	edx
	pop	ecx
	pop	ebx
	pop	eax
	popf

#elif defined(__x86_64__)

	pushf
	push	rax
	push	rcx
	push	rdx
	push	rsi
	push	rdi
	push	r8
	push	r9
	push	rbp
	mov	rbp, rsp
	and	rsp, -16

	mov	rdi, \c
	call	putchar@plt

	mov	rdi, [rip + stdout]
	call	fflush@plt

	mov	rsp, rbp
	pop	rbp
	pop	r9
	pop	r8
	pop	rdi
	pop	rsi
	pop	rdx
	pop	rcx
	pop	rax
	popf

#elif defined(__arm__)

	stmfd	r13!, {r0-r4, r12, r14}

	mov	r4, r13
	bic	r14, r4, #15
	mov	r13, r14

	mov	r0, #\c
	bl	putchar@plt

	ldr	r14, .L$_c$gotoff$\@
.L$_c$gotpc$\@:
	add	r14, pc, r14
	b	.L$_c$cont$\@
.L$_c$gotoff$\@:
	.word	_GLOBAL_OFFSET_TABLE - .L$_c$gotpc$\@ - 8
.L$_c$cont$\@:
	bl	fflush@plt

	mov	r13, r4
	ldmfd	r13!, {r0-r4, r12, r14}

#elif defined(__aarch64__)

	sub	sp, sp, #20*8
	stp	 x0,  x1, [sp,   #0]
	stp	 x2,  x3, [sp,  #16]
	stp	 x4,  x5, [sp,  #32]
	stp	 x6,  x7, [sp,  #48]
	stp	 x8,  x9, [sp,  #64]
	stp	x10, x11, [sp,  #80]
	stp	x12, x13, [sp,  #96]
	stp	x14, x15, [sp, #112]
	stp	x16, x17, [sp, #128]
	mrs	x16, nzcv
	stp	x16, x30, [sp, #144]

	mov	w0, #\c
	bl	putchar
	adrp	x0, :got:stdout
	ldr	x0, [x0, #:got_lo12:stdout]
	ldr	x0, [x0]
	bl	fflush

	ldp	x16, x30, [sp, #144]
	msr	nzcv, x16
	ldp	x16, x17, [sp, #128]
	ldp	x14, x15, [sp, #112]
	ldp	x12, x13, [sp,  #96]
	ldp	x10, x11, [sp,  #80]
	ldp	 x8,  x9, [sp,  #64]
	ldp	 x6,  x7, [sp,  #48]
	ldp	 x4,  x5, [sp,  #32]
	ldp	 x2,  x3, [sp,  #16]
	ldp	 x0,  x1, [sp,   #0]
	add	sp, sp, #20*8

#else
#  error "not supported"
#endif
.endm

.macro	notimpl
#if defined(__i386__) || defined(__x86_64__)
	ud2
#elif defined(__arm__)
	udf
#elif defined(__aarch64__)
	hlt	#0
#else
#  error "not supported"
#endif
.endm

	.section .note.GNU-stack, "", %progbits

	.text

#if defined(__i386__)
get_pc_ebx:
	mov	ebx, [esp]
	ret
#endif


proc	call_example

#if defined(__i386__)

	push	ebx			// ebx
	push	esi			// esi, ebx
	push	edi			// edi, esi, ebx
	push	ebp			// flags, ebp, ..., ebx
	pushf

	mov	edi, [esp + 4*6]
	mov	esi, [esp + 4*7]
	push	esi			// regs, flags, ebp, ..., ebx

	call	get_pc_ebx
	lea	eax, [ebx + 9f - .]
	push	eax			// cont, regs, flags, ebp, ..., ebx
	push	edi		    // func, cont, regs, flags, ebp, ..., ebx

	mov	eax, [esi + 28]
	pushf
	pop	ecx
	and	eax,  0x0cd5
	and	ecx, ~0x0cd5
	or	eax, ecx
	push	eax
	popf
	mov	eax, [esi +  0]
	mov	ebx, [esi +  4]
	mov	ecx, [esi +  8]
	mov	edx, [esi + 12]
	mov	edi, [esi + 20]
	mov	ebp, [esi + 24]
	mov	esi, [esi + 16]

	ret			       // -> func; regs, flags, ebp, ..., ebx

9:	pushf				// eflags, regs, flags, ebp, ..., ebx
	push	esi		   // esi, eflags, regs, flags, ebp, ..., ebx
	mov	esi, [esp + 8]
	mov	[esi +  0], eax
	mov	[esi +  4], ebx
	mov	[esi +  8], ecx
	mov	[esi + 12], edx
	mov	[esi + 20], edi
	mov	[esi + 24], ebp
	pop	eax			// rflags, regs, flags, ebp, ..., ebx
	mov	[esi + 16], eax
	pop	eax			// regs, flags, ebp, ..., ebx
	mov	[esi + 28], eax

	add	esp, 4			// flags, ebp, ..., ebx
	popf				// ebp, ..., ebx
	pop	ebp			// ..., ebx
	pop	edi
	pop	esi
	pop	ebx			//
	ret

#elif defined(__x86_64__)

	push	rbx			// rbx
	push	r10
	push	r11
	push	r12
	push	r13
	push	r14
	push	r15
	push	rbp			// flags, rbp, ..., rbx
	pushf

	push	rsi			// regs, flags, rbp, ..., rbx

	lea	rax, [rip + 9f]
	push	rax			// cont, regs, flags, rbp, ..., rbx
	push	rdi		    // func, cont, regs, flags, rbp, ..., rbx

	mov	rax, [rsi + 8*15]
	pushf
	pop	rcx
	and	rax,  0x0cd5
	and	rcx, ~0x0cd5
	or	rax, rcx
	push	rax
	popf
	mov	rax, [rsi +   0]
	mov	rbx, [rsi +   8]
	mov	rcx, [rsi +  16]
	mov	rdx, [rsi +  24]
	mov	rdi, [rsi +  40]
	mov	rbp, [rsi +  48]
	mov	r8,  [rsi +  56]
	mov	r9,  [rsi +  64]
	mov	r10, [rsi +  72]
	mov	r11, [rsi +  80]
	mov	r12, [rsi +  88]
	mov	r13, [rsi +  96]
	mov	r14, [rsi + 104]
	mov	r15, [rsi + 112]
	mov	rsi, [rsi +  32]

	ret			       // -> func; regs, flags, rbp, ..., rbx

9:	pushf				// rflags, regs, flags, rbp, ..., rbx
	push	rsi		   // rsi, rflags, regs, flags, rbp, ..., rbx
	mov	rsi, [rsp + 16]
	mov	[rsi +   0], rax
	mov	[rsi +   8], rbx
	mov	[rsi +  16], rcx
	mov	[rsi +  24], rdx
	mov	[rsi +  40], rdi
	mov	[rsi +  48], rbp
	mov	[rsi +  56],  r8
	mov	[rsi +  64],  r9
	mov	[rsi +  72], r10
	mov	[rsi +  80], r11
	mov	[rsi +  88], r12
	mov	[rsi +  96], r13
	mov	[rsi + 104], r14
	mov	[rsi + 112], r15
	pop	rax			// rflags, regs, flags, rbp, ..., rbx
	mov	[rsi +  32], rax
	pop	rax			// regs, flags, rbp, ..., rbx
	mov	[rsi + 120], rax

	add	rsp, 8			// flags, rbp, ..., rbx
	popf				// rbp, ..., rbx
	pop	rbp			// ..., rbx
	pop	r15
	pop	r14
	pop	r13
	pop	r12
	pop	r11
	pop	r10
	pop	rbx			//
	ret

#elif defined(__arm__)

	stmfd	r13!, {r0, r1, r4-r11, r14}
	ldmia	r1, {r0-r12, r14}
	msr	cpsr, r14
	mov	r14, pc
	ldr	pc, [r13], #4
	ldr	r14, [r13], #4
	stmia	r14!, {r0-r12}
	mrs	r0, cpsr
	str	r0, [r14]
	ldmfd	r13!, {r4-r11, pc}

#elif defined(__aarch64__)

	stp	x29, x30, [sp, #-14*8]!
	mov	x29, sp
	stp	x19, x20, [sp,  #16]
	stp	x21, x22, [sp,  #32]
	stp	x23, x24, [sp,  #48]
	stp	x25, x26, [sp,  #64]
	stp	x27, x28, [sp,  #80]
	str	      x1, [sp, #104]

	ldp	x29, x30, [x1, #224]
	msr	nzcv, x30
	mov	x30, x0
	ldp	x27, x28, [x1, #208]
	ldp	x25, x26, [x1, #192]
	ldp	x23, x24, [x1, #176]
	ldp	x21, x22, [x1, #160]
	ldp	x19, x20, [x1, #144]
	ldp	x16, x17, [x1, #128]
	ldp	x14, x15, [x1, #112]
	ldp	x12, x13, [x1,  #96]
	ldp	x10, x11, [x1,  #80]
	ldp	 x8,  x9, [x1,  #64]
	ldp	 x6,  x7, [x1,  #48]
	ldp	 x4,  x5, [x1,  #32]
	ldp	 x2,  x3, [x1,  #16]
	ldp	 x0,  x1, [x1,   #0]

	blr	x30

	ldr	x30, [sp, #104]
	stp	x27, x28, [x30, #208]
	stp	x25, x26, [x30, #192]
	stp	x23, x24, [x30, #176]
	stp	x21, x22, [x30, #160]
	stp	x19, x20, [x30, #144]
	stp	x16, x17, [x30, #128]
	stp	x14, x15, [x30, #112]
	stp	x12, x13, [x30,  #96]
	stp	x10, x11, [x30,  #80]
	stp	 x8,  x9, [x30,  #64]
	stp	 x6,  x7, [x30,  #48]
	stp	 x4,  x5, [x30,  #32]
	stp	 x2,  x3, [x30,  #16]
	stp	 x0,  x1, [x30,   #0]
	mov	x0, x30
	mrs	x30, nzcv
	stp	x29, x30,  [x0, #224]

	ldp	x19, x20, [sp,  #16]
	ldp	x21, x22, [sp,  #32]
	ldp	x23, x24, [sp,  #48]
	ldp	x25, x26, [sp,  #64]
	ldp	x27, x28, [sp,  #80]
	ldp	x29, x30, [sp], #14*8

	ret

#else
#  error "not supported"
#endif

endproc

proc	nop

	ret

endproc

///--------------------------------------------------------------------------
/// 0x00--0x0f

proc	x00

	// clear all 64 bits of extended traditional registers

#if defined(__x86_64__)

	xor      eax, eax		// clear rax
	lea      rbx, [0]		// rbx -> _|_
	loop     .			// iterate, decrement rcx until zero
	mov      rdx, 0			// set rdx = 0
	and      esi, 0			// clear all bits of rsi
	sub      edi, edi		// set rdi = edi - edi = 0
	push     0
	pop      rbp			// pop 0 into rbp

#elif defined(__i386__)

	xor	eax, eax
	lea	ebx, [0]
	loop	.
	mov	edx, 0
	and	esi, 0
	sub	edi, edi
	push	0
	pop	ebp

#elif defined(__arm__)

	eor	r0, r0, r0
	rsb	r1, r1, r1
0:	subs	r2, r2, #1
	bne	0b
	mov	r3, #0
	and	r4, r4, #0
	sub	r5, r5, r5

#elif defined(__aarch64__)

	eor	w0, w0, w0
	mov	w1, wzr
0:	sub	w2, w2, #1
	cbnz	w2, 0b
	mov	w3, #0
	and	w4, w4, wzr
	sub	w5, w5, w5

#else
	notimpl
#endif

	ret

endproc

proc	x01

	// advance a fibonacci pair by c steps
	//
	// on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1}
	// and f_{i+c}, where f_{i+1} = f_i + f_{i-1}

#if defined(__x86_64__)

0:	xadd	rax, rdx		// a, d = a + d, a
					//      = f_{i+1} + f_i, f_{i+1}
					//      = f_{i+2}, f_{i+1}
	loop	0b			// advance i, decrement c, iterate

#elif defined(__i386__)

0:	xadd	eax, edx
	loop	0b

#elif defined(__arm__)

0:	subs	r2, r2, #2
	add	r3, r3, r0
	blo	8f
	add	r0, r0, r3
	bhi	0b

8:	movne	r0, r3

#elif defined(__aarch64__)

0:	subs	x2, x2, #2
	add	x3, x3, x0
	b.lo	8f
	add	x0, x0, x3
	b.hi	0b

8:	cmov.ne	x0, x3

#else
	notimpl
#endif

	ret

endproc

proc	x02

	// boolean canonify a: if a = 0 on entry, leave it zero; otherwise
	// set a = 1

#if defined(__x86_64__)

	neg	rax			// set cf iff a /= 0
	sbb	rax, rax		// a = a - a - cf = -cf
	neg	rax			// a = cf

#elif defined(__i386__)

	neg	eax
	sbb	eax, eax
	neg	eax

#elif defined(__arm__)

	movs	r1, r0			// the easy way
	movne	r1, #1			// mvnne r1, #1 for mask

	cmp	r0, #1			// clear cf iff a == 0
	sbc	r2, r0, r0		// c' = a - a - 1 + cf = cf - 1
	add	r2, r2, #1		// c' = cf

	sub	r3, r0, r0, lsr #1	// d' top bit clear; d' = 0 iff a = 0
	rsb	r3, r3, #0		// d' top bit set iff a /= 0
	mov	r3, r3, lsr #31		// asr for mask

	rsbs	r0, r0, #0
	sbc	r0, r0, r0
	rsb	r0, r0, #0

#elif defined(__aarch64__)

	cmp	x0, #0			// trivial
	cset.ne	x1			// csetm for mask

	cmp	xzr, x0			// set cf iff a == 0
	sbc	x2, x0, x0		// c' = a - a - 1 + cf = cf - 1
	neg	x2, x2			// c' = 1 - cf

	sub	x3, x0, x0, lsr #1	// if a < 2^63 then a' = ceil(d/2) <
					// 2^63
					// if a >= 2^63, write a = 2^63 + t
					// with t < 2^63; d' = 2^63 - 2^62 +
					// ceil(t/2) = 2^62 + ceil(t/2), and
					// ceil(t/2) < 2^62
					// anyway d' < 2^63 and d' = 0 iff
					// a = 0
	neg	x3, x3			// d' top bit set iff a /= 0
	lsr	x3, x3, #63		// asr for mask

	cmp	x0, #1			// set cf iff a /= 0
	adc	x0, xzr, xzr		// a' = 0 + 0 + cf = cf

#else
	notimpl
#endif

	ret

endproc

proc	x03

	// set a = min(a, d) (unsigned); clobber c, d

#if defined(__x86_64__)

	sub	rdx, rax		// d' = d - a; set cf if a > d
	sbb	rcx, rcx		// c = -cf = -[a > d]
	and	rcx, rdx		// c = a > d ? d - a : 0
	add	rax, rcx		// a' = a > d ? d : a

#elif defined(__i386__)

	sub	edx, eax
	sbb	ecx, ecx
	and	ecx, edx
	add	eax, ecx

#elif defined(__arm__)

	cmp	r0, r3			// the easy way
	movlo	r1, r0			// only needed for out-of-place
	movhs	r1, r3

	subs	r3, r3, r0
	sbc	r12, r12, r12
	and	r12, r12, r3
	add	r0, r0, r12

#elif defined(__aarch64__)

	cmp	x0, x3			// the easy way
	csel.lo	x1, x0, x3

	subs	x3, x3, x0		// d' = d - a; set cf if d >= a
	sbc	x16, xzr, xzr		// t = -1 + cf = -[a > d]
	and	x16, x16, x3		// t = a > d ? d - a : 0
	add	x0, x0, x16		// a' = a > d ? d : a

#else
	notimpl
#endif

	ret

endproc

proc	x04

	// switch case?

#if defined(__x86_64__)

  // unrelated playing
  mov	ecx, eax
  mov	rbx, -1
  mov	edx, ecx
  sub	edx, '0'
  cmp	edx, 10
  cmovb	rbx, rdx
  or	ecx, 0x20
  mov	edx, ecx
  sub	edx, 'a'
  sub	ecx, 'a' - 10
  cmp	edx, 6
  cmovb	rbx, rcx

	xor	al, 0x20

#elif defined(__i386__)

  // unrelated playing
  mov	ecx, eax
  mov	ebx, -1
  mov	edx, ecx
  sub	edx, '0'
  cmp	edx, 10
  cmovb	ebx, edx
  or	ecx, 0x20
  mov	edx, ecx
  sub	edx, 'a'
  sub	ecx, 'a' - 10
  cmp	edx, 6
  cmovb	ebx, ecx

	xor	al, 0x20

#elif defined(__arm__)

  // unrelated playing
  mvn	r1, #0
  sub	r12, r0, #'0'
  cmp	r12, #10
  movlo	r1, r12
  orr	r12, r0, #0x20
  sub	r12, r12, #'a'
  cmp	r12, #6
  addlo	r1, r12, #10

	eor	r0, r0, #0x20

#elif defined(__aarch64__)

  // unrelated playing
  mov	x1, #-1
  sub	w16, w0, #'0'
  cmp	w16, #10
  cmov.lo	x1, x16
  orr	w16, w0, #0x20
  sub	w16, w16, #'a' - 10
  cmp	w16, #10
  ccmp.hs	w16, #16, #CCMP_HS
  cmov.lo	x1, x16

	eor	w0, w0, #0x20

#else
	notimpl
#endif

	ret

endproc

proc	x05

	// answer whether 5 <= a </<= 9.

#if defined(__x86_64__)

	sub	rax, 5			// a' = a - 5
	cmp	rax, 4			// is a' - 5 </<= 4?

	// cc		a'			a
	//
	// z/e		a' = 4			a = 9
	// nz/ne	a' /= 4			a /= 9
	//
	// a/nbe	a' > 4			a > 9 or a < 5
	// nc/ae/nb	a' >= 4			a >= 9 or a < 5
	// c/b/nae	a' < 4			5 <= a < 9
	// be/na	a' <= 4			5 <= a <= 9
	//
	// o		a' < -2^63 + 4		-2^63 + 5 <= a < -2^63 + 9
	// no		a' >= -2^63 + 4		a >= -2^63 + 9 or
	//						a < -2^63 + 5
	// s		-2^63 + 4 <= a' < 4	-2^63 + 9 <= a < 9
	// ns		a' < -2^63 + 4 or	a < -2^63 + 9 or a >= 9
	//			a' >= 4
	// ge/nl	a' >= 4			a >= 9 or a < -2^63 + 5
	// l/nge	a' < 4			-2^63 + 5 <= a < 9
	// g/nle	a' > 4			a > 9 or a < -2^63 + 5
	// le/ng	a' <= 4			-2^63 + 5 <= a <= 9

#elif defined(__i386__)

	sub	eax, 5
	cmp	eax, 4

#elif defined(__arm__)

	// i dimly remember having a slick way to do this way back in the
	// day, but i can't figure it out any more.
	sub	r0, #5
	cmp	r0, #4

#elif defined(__aarch64__)

	// literal translation is too obvious
	cmp	x0, #5
	ccmp.hs	x0, #9, #CCMP_HS

#else
	notimpl
#endif

	ret

endproc

proc	x06

	// leave a unchanged, but set zf if a = 0, cf if a /= 0, clear of,
	// set sf to msb(a)

#if defined(__x86_64__)

	not	rax			// a' = -a - 1
	inc	rax			// a' = -a
	neg	rax			// a' = a

#elif defined(__i386__)

	not	eax
	inc	eax
	neg	eax

#elif defined(__arm__)

	mvn	r0, r0
	add	r0, r0, #1
	rsbs	r0, r0, #0		// cf has opposite sense

#elif defined(__aarch64__)

	mvn	x0, x0
	add	x0, x0, #1
	negs	x0, x0			// cf has opposite sense

#else
	notimpl
#endif

	ret

endproc

proc	x07

	// same as before (?)

#if defined(__x86_64__)

	inc	rax			// a' = a + 1
	neg	rax			// a' = -a - 1
	inc	rax			// a' = -a
	neg	rax			// a' = a

#elif defined(__i386__)

	inc	eax
	neg	eax
	inc	eax
	neg	eax

#elif defined(__arm__)

	add	r0, r0, #1
	rsb	r0, r0, #0
	add	r0, r0, #1
	rsbs	r0, r0, #0

#elif defined(__aarch64__)

	add	x0, x0, #1
	neg	x0, x0
	add	x0, x0, #1
	negs	x0, x0			// cf has opposite sense

#else
	notimpl
#endif

	ret

endproc

proc	x08

	// floor((a + d)/2), correctly handling overflow conditions; final cf
	// is lsb(a + d), probably uninteresting

#if defined(__x86_64__)

	add	rax, rdx		// cf || a' = a + d
	rcr	rax, 1			// shift 65-bit result right by one
					// place; lsb moves into carry

#elif defined(__i386__)

	add	eax, edx
	rcr	eax, 1

#elif defined(__arm__)

	// like the two-instruction a64 version
	sub	r1, r3, r0
	add	r1, r0, r1, lsr #1

	// the slick version, similar to the above
	adds	r0, r0, r3
	mov	r0, r0, rrx

#elif defined(__aarch64__)

	// a64 lacks a32's rrx.  literal translation.
	adds	x1, x0, x3		// cf || a' = a + d
	adc	x16, xzr, xzr		// realize cf in extra register
	extr	x1, x16, x1, #1		// shift down one place

	// two instruction version: clobbers additional register.  (if you
	// wanted the answer in any other register, even overwriting d, then
	// this is unnecessary.)  also depends on d >= a.
	sub	x16, x3, x0		// compute difference
	add	x0, x0, x16, lsr #1	// add half of it (rounded down)

#else
	notimpl
#endif

	ret

endproc

proc	x09

	// a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3
	// (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8).

#if defined(__x86_64__)

	shr	rax, 3			// a' = floor(a/8); cf = 1 if a ==
					// 4, 5, 6, 7 (mod 8)
	adc	rax, 0			// a' = floor(a/8) + cf

#elif defined(__i386__)

	shr	eax, 3
	adc	eax, 0

#elif defined(__arm__)

	movs	r0, r0, lsr #3
	adc	r0, r0, #0

#elif defined(__aarch64__)

	tst	x0, #4
	orr	x0, xzr, x0, lsr #3
	cinc.ne	x0, x0

#else
	notimpl
#endif

	ret

endproc

proc	x0a

	// increment c-byte little-endian bignum at rdi

#if defined(__x86_64__)

	add	byte ptr [rdi], 1
0:	inc	rdi
	adc	byte ptr [rdi], 0
	loop	0b

#elif defined(__i386__)

	add	byte ptr [edi], 1
0:	inc	edi
	adc	byte ptr [edi], 0
	loop	0b

#elif defined(__arm__)

	mov	r12, #256		// set initial carry
0:	ldrb	r0, [r5]
	subs	r2, r2, #1
	add	r12, r0, r12, lsr #8
	strb	r12, [r5], #1
	bne	0b

#elif defined(__aarch64__)

	mov	w17, #256		// set initial carry
0:	ldrb	w16, [x5]
	sub	x2, x2, #1
	add	w17, w16, w17, lsr #8
	strb	w17, [x5], #1
	cbnz	x2, 0b

#else
	notimpl
#endif

	ret

endproc

proc	x0b

	// negate double-precision d:a

#if defined(__x86_64__)

	not	rdx			// d' = -d - 1
	neg	rax			// a' = -a;
					// cf = 1 iff a /= 0
	sbb	rdx, -1			// d' = -d - cf

#elif defined(__i386__)

	not	edx
	neg	eax
	sbb	edx, -1

#elif defined(__arm__)

	// reverse subtract is awesome
	rsbs	r0, r0, #0
	rsc	r3, r3, #0

#elif defined(__aarch64__)

	// easy way: everything is better with zero registers.
	negs	x0, x0
	ngc	x3, x3

#else
	notimpl
#endif

	ret

endproc

proc	x0c

	// rotate is distributive over xor.

#if defined(__x86_64__)

	// rax				// = a_1 || a_0
	// rbx				// = b_1 || b_0
	mov	rcx, rax		// = a_1 || a_0

	xor	rcx, rbx		// = (a_1 XOR b_1) || (a_0 XOR b_0)
	ror	rcx, 0xd		// = (a_0 XOR b_0) || (a_1 XOR b_1)

	ror	rax, 0xd		// = a_0 || a_1
	ror	rbx, 0xd		// = b_0 || b_1
	xor	rax, rbx		// = (a_0 XOR b_0) || (a_1 XOR b_1)

	cmp	rax, rcx		// always equal

#elif defined(__i386__)

	mov	ecx, eax		// = a_1 || a_0

	xor	ecx, ebx		// = (a_1 XOR b_1) || (a_0 XOR b_0)
	ror	ecx, 0xd		// = (a_0 XOR b_0) || (a_1 XOR b_1)

	ror	eax, 0xd		// = a_0 || a_1
	ror	ebx, 0xd		// = b_0 || b_1
	xor	eax, ebx		// = (a_0 XOR b_0) || (a_1 XOR b_1)

	cmp	eax, ecx		// always equal

#elif defined(__arm__)


	// r0				// = a_1 || a_0
	// r1				// = b_1 || b_0
	eor	r2, r0, r1		// = (a_1 XOR b_1) || (a_0 XOR b_0)
	mov	r2, r2, ror #13		// = (a_0 XOR b_0) || (a_1 XOR b_1)

	mov	r1, r1, ror #13		// = b_0 || b_1
	eor	r0, r1, r0, ror #13	// = (a_0 XOR b_0) || (a_1 XOR b_1)

	cmp	r0, r2			// always equal

#elif defined(__aarch64__)

	// x0				// = a_1 || a_0
	// x1				// = b_1 || b_0
	eor	x2, x0, x1		// = (a_1 XOR b_1) || (a_0 XOR b_0)
	ror	x2, x2, #13		// = (a_0 XOR b_0) || (a_1 XOR b_1)

	ror	x1, x1, #13		// = b_0 || b_1
	eor	x0, x1, x0, ror #13	// = (a_0 XOR b_0) || (a_1 XOR b_1)

	cmp	x0, x2			// always equal

#else
	notimpl
#endif

	ret

endproc

proc	x0d

	// and is distributive over xor.

#if defined(__x86_64__)

	mov	rdx, rbx		// = b

	xor	rbx, rcx		// = b XOR c
	and	rbx, rax		// = a AND (b XOR c)

	and	rdx, rax		// = a AND b
	and	rax, rcx		// = a AND c
	xor	rax, rdx		// = (a AND b) XOR (a AND c)
					// = a AND (b XOR c)

	cmp	rax, rbx		// always equal

#elif defined(__i386__)

	mov	edx, ebx		// = b

	xor	ebx, ecx		// = b XOR c
	and	ebx, eax		// = a AND (b XOR c)

	and	edx, eax		// = a AND b
	and	eax, ecx		// = a AND c
	xor	eax, edx		// = (a AND b) XOR (a AND c)
					// = a AND (b XOR c)

	cmp	eax, ebx		// always equal

#elif defined(__arm__)

	and	r3, r0, r1		// = a AND b

	eor	r1, r1, r2		// = b XOR c
	and	r1, r1, r0		// = a AND (b XOR c)

	and	r0, r0, r2		// = a AND c
	eor	r0, r0, r3		// = (a AND b) XOR (a AND c)
					// = a AND (b XOR c)

	cmp	r0, r1			// always equal

#elif defined(__aarch64__)

	and	x3, x0, x1		// = a AND b

	eor	x1, x1, x2		// = b XOR c
	and	x1, x1, x0		// = a AND (b XOR c)

	and	x0, x0, x2		// = a AND c
	eor	x0, x0, x3		// = (a AND b) XOR (a AND c)
					// = a AND (b XOR c)

	cmp	x0, x1			// always equal

#else
	notimpl
#endif

	ret

endproc

proc	x0e

	// de morgan's law

#if defined(__x86_64__)

	mov	rcx, rax		// = a

	and	rcx, rbx		// = a AND b
	not	rcx			// = NOT (a AND b)

	not	rax			// = NOT a
	not	rbx			// = NOT b
	or	rax, rbx		// = (NOT a) OR (NOT b)
					// = NOT (a AND b)

	cmp	rax, rcx		// always equal

#elif defined(__i386__)

	mov	ecx, eax		// = a

	and	ecx, ebx		// = a AND b
	not	ecx			// = NOT (a AND b)

	not	eax			// = NOT a
	not	ebx			// = NOT b
	or	eax, ebx		// = (NOT a) OR (NOT b)
					// = NOT (a AND b)

	cmp	eax, ecx		// always equal

#elif defined(__arm__)

	and	r2, r0, r1		// = a AND b
	mvn	r2, r2			// = NOT (a AND b)

	mvn	r0, r0			// = NOT a
	mvn	r1, r1			// = NOT b
	orr	r0, r0, r1		// = (NOT a) OR (NOT b)

	cmp	r0, r2			// always equal

#elif defined(__aarch64__)

	and	x2, x0, x1		// = a AND b
	mvn	x2, x2			// = NOT (a AND b)

	mvn	x0, x0			// = NOT a
	orn	x0, x0, x1		// = (NOT a) OR (NOT b)

	cmp	x0, x2			// always equal

#else
	notimpl
#endif

	ret

endproc

proc	x0f

	// replace input buffer bytes with cumulative XORs with initial a;
	// final a is XOR of all buffer bytes and initial a.
	//
	// not sure why you'd do this.

#if defined(__x86_64__)

0:	xor	[rsi], al
	lodsb
	loop	0b

#elif defined(__i386__)

0:	xor	[esi], al
	lodsb
	loop	0b

#elif defined(__arm__)

0:	ldrb	r12, [r4]
	subs	r2, r2, #1
	eor	r0, r0, r12
	strb	r0, [r4], #1
	bne	0b

#elif defined(__aarch64__)

0:	ldrb	w16, [x4]
	sub	x2, x2, #1
	eor	w0, w0, w16
	strb	w0, [x4], #1
	cbnz	x2, 0b

#else
	notimpl
#endif

	ret

endproc

///--------------------------------------------------------------------------
/// 0x10--0x1f

proc	x10

	// four different ways to swap a pair of registers.

#if defined(__x86_64__)

	push	rax
	push	rcx
	pop	rax
	pop	rcx

	xor	rax, rcx
	xor	rcx, rax
	xor	rax, rcx

	add	rax, rcx
	sub	rcx, rax
	add	rax, rcx
	neg	rcx

	xchg	rax, rcx

#elif defined(__i386__)

	push	eax
	push	ecx
	pop	eax
	pop	ecx

	xor	eax, ecx
	xor	ecx, eax
	xor	eax, ecx

	add	eax, ecx
	sub	ecx, eax
	add	eax, ecx
	neg	ecx

	xchg	eax, ecx

#elif defined(__arm__)

	stmfd	r13!, {r0, r2}
	ldr	r0, [r13, #4]
	ldr	r2, [r13], #8

	eor	r0, r0, r2
	eor	r2, r2, r0
	eor	r0, r0, r2

	sub	r0, r0, r2
	add	r2, r2, r0
	rsb	r0, r0, r2		// don't need 3-addr with reverse-sub

	mov	r12, r0
	mov	r0, r2
	mov	r2, r0

#elif defined(__aarch64__)

	// anything you can do
	stp	x0, x2, [sp, #-16]!
	ldp	x2, x0, [sp], #16

	eor	x0, x0, x2
	eor	x2, x2, x0
	eor	x0, x0, x2

	// the add/sub/add thing was daft.  you can do it in three if you're
	// clever -- and have three-address operations.
	sub	x0, x0, x2
	add	x2, x2, x0
	sub	x0, x2, x0

	// but we lack a fourth.  we can't do this in fewer than three
	// instructions without hitting memory.  only `ldp' will modify two
	// registers at a time, so we need at least two instructions -- but
	// if the first one sets one of our two registers to its final value
	// then we lose the other input value with no way to recover it, so
	// we must either write a fresh third register, or write something
	// other than the final value, and in both cases we need a third
	// instruction to fix everything up.  we've done the wrong-something-
	// other trick twice, so here's the captain-obvious use-a-third-
	// register version.
	mov	x16, x0
	mov	x0, x2
	mov	x2, x16

#else
	notimpl
#endif

	ret

endproc

proc	x11

	// assuming a is initialized to zero, set a to the inclusive or of
	// the xor-differences of corresponding bytes in the c-byte strings
	// at si and di.
	//
	// in particular, a will be zero (and zf set) if and only if the two
	// strings are equal.

#if defined(__x86_64__)

0:	mov	dl, [rsi]
	xor	dl, [rdi]
	inc	rsi
	inc	rdi
	or	al, dl
	loop	0b

#elif defined(__i386__)

0:	mov	dl, [esi]
	xor	dl, [edi]
	inc	esi
	inc	edi
	or	al, dl
	loop	0b

#elif defined(__arm__)

0:	ldrb	r1, [r4], #1
	ldrb	r12, [r5], #1
	subs	r2, r2, #1
	eor	r12, r12, r1
	orr	r0, r0, r12
	bne	0b

#elif defined(__aarch64__)

0:	ldrb	w16, [x4], #1
	ldrb	w17, [x5], #1
	sub	x2, x2, #1
	eor	w16, w16, w17
	orr	w0, w0, w16
	cbnz	x2, 0b

#else
	notimpl
#endif

	ret

endproc

proc	x12

	// an obtuse way of adding two registers.  for any bit position, a
	// OR d is set if and only if at least one of a and d has a bit set
	// in that position, and a AND d is set if and only if both have a
	// bit set in that position.  essentially, then, what we've done is
	// move all of the set bits in d to a, unless there's already a bit
	// there.  this clearly doesn't change the sum.

#if defined(__x86_64__)

	mov	rcx, rdx		// c' = d
	and	rdx, rax		// d' = a AND d
	or	rax, rcx		// a' = a OR d
	add	rax, rdx

#elif defined(__i386__)

	mov	ecx, edx		// c' = d
	and	edx, eax		// d' = a AND d
	or	eax, ecx		// a' = a OR d
	add	eax, edx

#elif defined(__arm__)

	and	r2, r0, r3		// c' = a AND d
	orr	r0, r0, r3		// a' = a OR d
	add	r0, r0, r2

#elif defined(__aarch64__)

	and	x2, x0, x3		// c' = a AND d
	orr	x0, x0, x3		// a' = a OR d
	add	x0, x0, x2

#else
	notimpl
#endif

	ret

endproc

proc	x13

	// ok, so this is a really obtuse way of adding a and b; the result
	// is in a and d.  but why does it work?

#if defined(__x86_64__)

	mov	rcx, 0x40		// carry chains at most 64 long
0:	mov	rdx, rax		// copy a'
	xor	rax, rbx		// low bits of each bitwise sum
	and	rbx, rdx		// carry bits from each bitwise sum
	shl	rbx, 1			// carry them into next position
	loop	0b

#elif defined(__i386__)

	mov	ecx, 0x40		// carry chains at most 64 long
0:	mov	edx, eax		// copy a'
	xor	eax, ebx		// low bits of each bitwise sum
	and	ebx, edx		// carry bits from each bitwise sum
	shl	ebx, 1			// carry them into next position
	loop	0b

#elif defined(__arm__)

	mov	r2, #0x40
0:	and	r3, r0, r1
	subs	r2, r2, #1
	eor	r0, r0, r1
	lsl	r1, r3, #1
	bne	0b

#elif defined(__aarch64__)

	mov	x2, #0x40
0:	and	x3, x0, x1
	sub	x2, x2, #1
	eor	x0, x0, x1
	lsl	x1, x3, #1
	cbnz	x2, 0b

#else
	notimpl
#endif

	ret

endproc

proc	x14

	// floor((a + d)/2), like x08.

#if defined(__x86_64__)

	mov	rcx, rax		// copy a for later
	and	rcx, rdx		// carry bits

	xor	rax, rdx		// low bits of each bitwise sum
	shr	rax, 1			// divide by 2; carries now in place

	add	rax, rcx		// add the carries; done

#elif defined(__i386__)

	mov	ecx, eax		// copy a for later
	and	ecx, edx		// carry bits

	xor	eax, edx		// low bits of each bitwise sum
	shr	eax, 1			// divide by 2; carries now in place

	add	eax, ecx		// add the carries; done

#elif defined(__arm__)

	and	r2, r0, r3
	eor	r0, r0, r3
	add	r0, r2, r0, lsr #1

#elif defined(__aarch64__)

	and	x2, x0, x3
	eor	x0, x0, x3
	add	x0, x2, x0, lsr #1

#else
	notimpl
#endif

	ret

endproc

proc	x15

	// sign extension 32 -> 64 bits.

#if defined(__x86_64__)

	movsx	rbx, eax		// like this?

	mov	rdx, 0xffffffff80000000
	add	rax, rdx		// if bit 31 of a is set then bits
					// 31--63 of a' are clear; otherwise,
					// these bits are all set -- which is
					// exactly backwards
	xor	rax, rdx		// so fix it

#elif defined(__i386__)

	movsx	ebx, ax			// like this?

	mov	edx, 0xffff8000
	add	eax, edx		// if bit 31 of a is set then bits
					// 31--63 of a' are clear; otherwise,
					// these bits are all set -- which is
					// exactly backwards
	xor	eax, edx		// so fix it

#elif defined(__arm__)

	sxth	r1, r0			// like this

	mov	r12, #0x80000000
	add	r0, r0, r12, asr #16
	eor	r0, r0, r12, asr #16

#elif defined(__aarch64__)

	sxtw	x1, w0			// like this

	mov	x16, #0xffffffff80000000
	add	x0, x0, x16
	eor	x0, x0, x16

#else
	notimpl
#endif

	ret

endproc

proc	x16

	// ??? i don't know why you'd want to calculate this.

#if defined(__x86_64__)

	xor	rax, rbx		// a' = a XOR b
	xor	rbx, rcx		// b' = b XOR c
	mov	rsi, rax		// t = a XOR b
	add	rsi, rbx		// t = (a XOR b) + (b XOR c)
	cmovc	rax, rbx		// a' = cf ? b XOR c : a XOR b
	xor	rax, rbx		// a' = cf ? 0 : a XOR c
	cmp	rax, rsi

#elif defined(__i386__)

	xor	eax, ebx		// a' = a XOR b
	xor	ebx, ecx		// b' = b XOR c
	mov	esi, eax		// t = a XOR b
	add	esi, ebx		// t = (a XOR b) + (b XOR c)
	cmovc	eax, ebx		// a' = cf ? b XOR c : a XOR b
	xor	eax, ebx		// a' = cf ? 0 : a XOR c
	cmp	eax, esi

#elif defined(__arm__)

	eor	r0, r0, r1
	eor	r1, r1, r2
	adds	r4, r0, r1
	movcs	r0, r1
	eor	r0, r0, r1
	cmp	r0, r4

#elif defined(__aarch64__)

	eor	x0, x0, x1
	eor	x1, x1, x2
	adds	x4, x0, x1
	cmov.cs	x0, x1
	eor	x0, x0, x1
	cmp	x0, x4

#else
	notimpl
#endif

	ret

endproc

proc	x17

	// absolute value

#if defined(__x86_64__)

	cqo				// d = a < 0 ? -1 : 0
	xor	rax, rdx		// a' = a < 0 ? -a - 1 : a
	sub	rax, rdx		// a' = a < 0 ? -a : a

#elif defined(__i386__)

	cdq				// d = a < 0 ? -1 : 0
	xor	eax, edx		// a' = a < 0 ? -a - 1 : a
	sub	eax, edx		// a' = a < 0 ? -a : a

#elif defined(__arm__)

	// direct approach
	movs	r1, r0
	rsbmi	r1, r0, #0

	// faithful-ish conversion
	eor	r3, r0, r0, asr #31
	sub	r0, r3, r0, asr #31

#elif defined(__aarch64__)

	// direct approach
	tst	x0, #1 << 63
	cneg.ne x1, x0

	// faithful-ish conversion
	eor	x3, x0, x0, asr #63
	sub	x0, x3, x0, asr #63

#else
	notimpl
#endif

	ret

endproc

proc	x18

	// should always set sf, clear zf, unless we get rescheduled to a
	// different core.

#if defined(__x86_64__)

	rdtsc				// d || a = cycles
	shl	rdx, 0x20
	or	rax, rdx		// a = cycles
	mov	rcx, rax		// c = cycles

	rdtsc				// d || a = cycles'
	shl	rdx, 0x20
	or	rax, rdx		// a = cycles'

	cmp	rcx, rax

#elif defined(__i386__)

	rdtsc				// d || a = cycles
	mov	ebx, eax
	mov	ecx, edx		// c || b = cycles

	rdtsc				// d || a = cycles'

	sub	ebx, eax
	sbb	ecx, edx

#elif defined(__arm__)

	// cycle clock not available in user mode
	mrrc	p15, 0, r0, r1, c9
	mrrc	p15, 0, r2, r3, c9
	subs	r0, r0, r2
	sbcs	r1, r1, r3

#elif defined(__aarch64__)

	// cycle clock not available in user mode
	mrs	x0, pmccntr_el0
	mrs	x1, pmccntr_el0
	cmp	x0, x1

#else
	notimpl
#endif

	ret

endproc

proc	x19

	// stupid way to capture a pointer to inline data and jump past it.
	// confuses the return-address predictor something chronic.  worse
	// because amd64 calling convention doesn't usually pass arguments on
	// the stack.

#if defined(__x86_64__)

	call	8f
	.string	"hello world!\n\0"
8:	call	print_str
	add	rsp, 8
	ret

print_str:
	// actually implement this ridiculous thing
	mov	rsi, [rsp + 8]
	xor	edx, edx
0:	mov	al, [rsi + rdx]
	inc	rdx
	cmp	al, 0
	jnz	0b
	mov	eax, SYS_write
	mov	edi, 1
	dec	rdx
	syscall				// clobbers r11 :-(
	ret

#elif defined(__i386__)

	call	8f
	.string	"hello world!\n\0"
8:	call	print_str
	add	esp, 4
	ret

print_str:
	// actually implement this ridiculous thing
	mov	ecx, [esp + 4]
	xor	edx, edx
0:	mov	al, [ecx + edx]
	inc	edx
	cmp	al, 0
	jnz	0b
	mov	eax, SYS_write
	mov	ebx, 1
	dec	edx
	int	0x80
	ret

#elif defined(__arm__)

	// why am i doing this?
	stmfd	r13!, {r14}
	bl	8f
	.string	"hello world!\n\0"
	.balign	4
8:	mov	r1, r14		      // might as well make it easy on myself
	bl	print_str
	ldmfd	r13!, {pc}

print_str:
	mov	r2, #0
0:	ldrb	r0, [r1, r2]
	cmp	r0, #0
	addne	r2, r2, #1
	bne	0b
	mov	r0, #1
	mov	r7, #SYS_write
	swi	0
	bx	r14

#elif defined(__aarch64__)

	// why am i doing this?
	str	x30, [sp, #-16]!
	bl	8f
	.string	"hello world!\n\0"
	.balign	4
8:	mov	x1, x30		      // might as well make it easy on myself
	bl	print_str
	ldr	x30, [sp], #16
	ret

print_str:
	mov	x2, #0
0:	ldrb	w0, [x1, x2]
	cmp	w0, #0
	cinc.ne	x2, x2
	b.ne	0b
	mov	x0, #1
	mov	x8, #SYS_write
	svc	#0
	ret

#else
	notimpl
#endif

endproc

proc	x1a

	// collect the current instruction-pointer address.  this was an old
	// 32-bit i386 trick for position-independent code, but (a) it
	// confuses the return predictor, and (b) amd64 has true pc-relative
	// addressing.

#if defined(__x86_64__)

	// the actual example
	call	0f
0:	pop	rax

	// the modern i386 trick doesn't confuse the return-address
	// predictor.
	call	calladdr_rbx
	sub	rbx, . - 0b

	// but rip-relative addressing is even better
	lea	rcx, [rip + 0b]

	ret

calladdr_rbx:
	mov	rbx, [rsp]
	ret

#elif defined(__i386__)

	// the actual example
	call	0f
0:	pop	eax

	// the modern i386 trick doesn't confuse the return-address
	// predictor.
	call	get_pc_ebx
	sub	ebx, . - 0b

	ret

#elif defined(__arm__)

	stmfd	r13!, {r14}

	bl	0f
0:	mov	r0, r14

	bl	return
	sub	r1, r14, #. - 0b

	adr	r2, 0b

	ldmfd	r13!, {pc}

return:	bx	r14

#elif defined(__aarch64__)

	str	x30, [sp, #-16]!

	// we can do all of the above using a64
	bl	0f
0:	mov	x0, x30

	bl	return
	sub	x1, x30, #. - 0b

	adr	x2, 0b

	ldr	x30, [sp], #16
return:	ret

#else
	notimpl
#endif

endproc

proc	x1b

#if defined(__x86_64__)

	// retpolines: an mitigation against adversarially influenced
	// speculative execution at indirect branches.  if an adversary can
	// prepare a branch-target buffer entry matching an indirect branch
	// in the victim's address space then they can cause the victim to
	// /speculatively/ (but not architecturally) execute any code in
	// their address space, possibly leading to leaking secrets through
	// the cache.  retpolines aren't susceptible to this because the
	// predicted destination address is from the return-prediction stack
	// which the adversary can't prime.  the performance penalty is still
	// essentially a branch misprediction -- for this return, and
	// possibly all others already stacked.

	// (try not to crash)
	lea	rax, [rip + 9f]

	push	rax
9:	ret

#elif defined(__i386__)

	call	get_pc_ebx
	lea	eax, [ebx + 9f - .]

	push	eax
9:	ret

#elif defined(__arm__)

	stmfd	r13!, {r14}

	adr	r14, 8f
	bx	r14

8:	ldmfd	r13!, {pc}

#elif defined(__aarch64__)

	str	x30, [sp, #-16]!

	adr	x30, 8f
	ret

8:	ldr	x30, [sp], #16
	ret

#else
	notimpl
#endif

endproc

proc	x1c

	// ok, having a hard time seeing a use for this.  the most important
	// thing to note is that sp is set from `pop' /after/ it's
	// incremented.

#if defined(__x86_64__)

	// try not to crash
	mov	rax, rsp
	and	rsp, -16
	push	rax

	pop	rsp

	// check it worked
	mov	rbx, rsp
	ret

#elif defined(__i386__)

	// try not to crash
	mov	eax, esp
	and	esp, -16
	push	eax

	pop	esp

	// check it worked
	mov	ebx, esp
	ret

#elif defined(__arm__)

	// not even going to dignify this
	notimpl

#elif defined(__aarch64__)

	// not even going to dignify this
	notimpl

#else
	notimpl
#endif

endproc

proc	x1d

	// monumentally cheesy way to copy 8 n bytes from buff1 to buff2.
	// also clobbers words at buff2 + 8 n and buff2 - 8 for good measure.

	n = 4

#if defined(__x86_64__)

	mov	rax, rsp			// safekeeping

	// we're toast if we get hit by a signal now.  fingers crossed...
  .if 0
	mov	rsp, buff2 + 8*n + 8
	mov	rbp, buff1 + 8*n
  .else
	lea	rsp, [rdi + 8*n + 16]
	lea	rbp, [rsi + 8*n]
  .endif
	enter	0, n + 1

	// precise action:
	//
	//	   +---------+			+---------+
	//  rbp -> |   ???   |		 rsp -> |   ???   |
	//	   +---------+			+---------+
	//         | w_{n-1} |			|   rbp   | <- rbp'
	//	   +---------+			+---------+
	//         |   ...   |			| w_{n-1} |
	//	   +---------+			+---------+
	//         |   w_1   |			|   ...   |
	//	   +---------+			+---------+
	//         |   w_0   |			|   w_1   |
	//	   +---------+			+---------+
	//					|   w_0   |
	//					+---------+
	//					|   rbp'  | <- rsp'
	//					+---------+

	mov	rdx, rsp
	mov	rsp, rax

#elif defined(__i386__)

	mov	eax, esp			// safekeeping

	// we're toast if we get hit by a signal now.  fingers crossed...
  .if 0
	mov	esp, buff2 + 4*n + 4
	mov	ebp, buff1 + 4*n
  .else
	lea	esp, [edi + 4*n + 8]
	lea	ebp, [esi + 4*n]
  .endif
	enter	0, n + 1

	mov	edx, esp
	mov	esp, eax

#elif defined(__arm__)

	add	r4, r4, #4*n
	add	r5, r5, #4*n + 8

	str	r4, [r5, #-4]!
  .rept n/2
	ldrd	r0, r1, [r4, #-8]!
	strd	r0, r1, [r5, #-8]!
  .endr
	add	r4, r5, #4*n
	str	r4, [r5, #-4]!

#elif defined(__aarch64__)

	// omgwtf.  let's not actually screw with the stack pointer.

	add	x4, x4, #8*n
	add	x5, x5, #8*n + 16

	str	x4, [x5, #-8]!
  .rept n/2
	ldp	x16, x17, [x4, #-16]!
	stp	x16, x17, [x5, #-16]!
  .endr
	add	x4, x5, #8*n
	str	x4, [x5, #-8]!

#else
	notimpl
#endif

	ret

endproc

proc	x1e

	// convert nibble value to (uppercase) hex; other input values yield
	// nonsense.

#if defined(__x86_64__)

	// das doesn't work in 64-bit mode; best i can come up with
	mov	edx, eax
	add	al, '0'
	add	dl, 'A' - 10
	cmp	al, '9' + 1
	cmovae	eax, edx

#elif defined(__i386__)

	cmp	al, 0x0a		// cf = 1 iff a < 10
	sbb	al, 0x69		// if 0 <= a < 10, a' = a - 0x6a, so
					// 0x96 <= a' < 0x70, setting af, cf
					// if 10 <= a < 16, a' = a - 0x69, so
					// 0x71 <= a' < 0x77, setting cf but
					// clearing af
	das				// if 0 <= a < 10, then af and cf are
					// both set, so set subtract 0x66
					// from a' leaving 0x30 <= a' < 0x3a;
					// if 10 <= a < 16 then af clear but
					// cf set, so subtract 0x60 from a'
					// leaving 0x41 <= a' < 0x47

#elif defined(__arm__)

	// significantly less tricksy
	cmp	r0, #10
	addlo	r0, r0, #'0'
	addhs	r0, r0, #'A' - 10

#elif defined(__aarch64__)

	// with less versatile conditional execution this is the best we can
	// do
	cmp	w0, #10
	add	w16, w0, #'A' - 10
	add	w0, w0, #'0'
	cmov.hs	w0, w16

#else
	notimpl
#endif

	ret

endproc

proc	x1f

	// verify collatz conjecture starting at a; assume a /= 0!

#if defined(__x86_64__)

0:	bsf	rcx, rax		// clobber c if a = 0
	shr	rax, cl			// a = 2^c a'
  cmp rdx, 0
  je 1f
  stosq
  dec rdx
1:
	cmp	rax, 1			// done?
	je	9f
	lea	rax, [2*rax + rax + 1]	// a' = 3 a' + 1
	jmp	0b			// again

9:	ret

#elif defined(__i386__)

0:	bsf	ecx, eax		// clobber c if a = 0
	shr	eax, cl			// a = 2^c a'
  cmp edx, 0
  je 1f
  stosd
  dec edx
1:
	cmp	eax, 1			// done?
	je	9f
	lea	eax, [2*eax + eax + 1]	// a' = 3 a' + 1
	jmp	0b			// again

9:	ret

#elif defined(__arm__)

	// rbit introduced in armv7
0:	rbit	r2, r0
	clz	r2, r2
	mov	r0, r0, lsr r2		// a = 2^c a'
  cmp r3, #0
  strne r0, [r5], #4
  subne r3, r3, #1
	cmp	r0, #1
	adcne	r0, r0, r0, lsl #1	// a' = 3 a' + 1 (because c set)
	bne	0b

	ret

#elif defined(__aarch64__)

0:	rbit	w2, w0
	clz	w2, w2
	lsr	w0, w0, w2		// a = 2^c a'
  cmp x3, #0
  beq 1f
  str x0, [x5], #8
  sub x3, x3, #1
1:
	cmp	w0, #1
	add	w16, w0, w0, lsl #1	// t = 3 a' + 1 (because c set)
	csinc.eq w0, w0, w16
	b.ne	0b

	ret

#else
	notimpl
#endif

endproc

///--------------------------------------------------------------------------
/// 0x20--0x2f

proc	x20

	// calculate 1337 a slowly

#if defined(__x86_64__)

	// original version
	mov	rcx, rax		// c = a
	shl	rcx, 2			// c = 4 a
	add	rcx, rax		// c = 5 a
	shl	rcx, 3			// c = 40 a
	add	rcx, rax		// c = 41 a
	shl	rcx, 1			// c = 82 a
	add	rcx, rax		// c = 83 a
	shl	rcx, 1			// c = 166 a
	add	rcx, rax		// c = 167 a
	shl	rcx, 3			// c = 1336 a
	add	rcx, rax		// c = 1337 a

	// a quick way
	lea	rdx, [2*rax + rax]	// t = 3 a
	shl	rdx, 6			// t = 192 a
	sub	rdx, rax		// t = 191 a
	lea	rbx, [8*rdx]		// b = 1528 a
	sub	rbx, rdx		// b = 1337 a

#elif defined(__i386__)

	// original version
	mov	ecx, eax		// c = a
	shl	ecx, 2			// c = 4 a
	add	ecx, eax		// c = 5 a
	shl	ecx, 3			// c = 40 a
	add	ecx, eax		// c = 41 a
	shl	ecx, 1			// c = 82 a
	add	ecx, eax		// c = 83 a
	shl	ecx, 1			// c = 166 a
	add	ecx, eax		// c = 167 a
	shl	ecx, 3			// c = 1336 a
	add	ecx, eax		// c = 1337 a

	// a quick way
	lea	edx, [2*eax + eax]	// t = 3 a
	shl	edx, 6			// t = 192 a
	sub	edx, eax		// t = 191 a
	lea	ebx, [8*edx]		// b = 1528 a
	sub	ebx, edx		// b = 1337 a

#elif defined(__arm__)

	// original version, ish
	add	r2, r0, r0, lsl #2	// c = 5 a
	add	r2, r0, r2, lsl #3	// c = 41 a
	add	r2, r0, r2, lsl #1	// c = 83 a
	add	r2, r0, r2, lsl #1	// c = 167 a
	add	r2, r0, r2, lsl #3	// c = 1337 a

	// quicker way
	add	r1, r0, r0, lsl #1	// b = 3 a
	rsb	r1, r0, r1, lsl #6	// b = 191 a
	rsb	r1, r1, r1, lsl #3	// b = 1337 a

#elif defined(__aarch64__)

	// original version, ish
	add	x2, x0, x0, lsl #2	// c = 5 a
	add	x2, x0, x2, lsl #3	// c = 41 a
	add	x2, x0, x2, lsl #1	// c = 83 a
	add	x2, x0, x2, lsl #1	// c = 167 a
	add	x2, x0, x2, lsl #3	// c = 1337 a

	// sleazy because no rsb
	add	x1, x0, x0, lsl #1	// b = 3 a
	sub	x1, x0, x1, lsl #6	// b = -191 a
	sub	x1, x1, x1, lsl #3	// b = 1337 a

#else
	notimpl
#endif

	ret

endproc

proc	x21

	// multiply complex numbers a + b i and c + d i
	//
	//	(a + b i) (c + d i) = (a c - b d) + (a d + b c) i
	//
	// somewhat slick approach uses only three multiplications

#if defined(__x86_64__)

	mov	rsi, rax		// t = a
	add	rax, rbx		// a' = a + b
	mov	rdi, rdx		// u = d
	sub	rdx, rcx		// d' = d - c
	add	rdi, rcx		// u = c + d

	imul	rax, rcx		// a' = c (a + b)
	imul	rsi, rdx		// t = a (d - c)
	imul	rdi, rbx		// u = b (c + d)

	add	rsi, rax		// t = a (d - c) + c (a + b)
	mov	rbx, rsi		// b' = a (d - c) + c (a + b)
					//	= a d + b c
	sub	rax, rdi		// a' = c (a + b) - b (c + d)
					//	= a c - b d

#elif defined(__i386__)

	mov	esi, eax		// t = a
	add	eax, ebx		// a' = a + b
	mov	edi, edx		// u = d
	sub	edx, ecx		// d' = d - c
	add	edi, ecx		// u = c + d

	imul	eax, ecx		// a' = c (a + b)
	imul	esi, edx		// t = a (d - c)
	imul	edi, ebx		// u = b (c + d)

	add	esi, eax		// t = a (d - c) + c (a + b)
	mov	ebx, esi		// b' = a (d - c) + c (a + b)
					//	= a d + b c
	sub	eax, edi		// a' = c (a + b) - b (c + d)
					//	= a c - b d

#elif defined(__arm__)

	add	r4, r0, r1		// t = a + b
	add	r5, r2, r3		// u = c + d
	sub	r3, r3, r2		// d' = d - c

	// mls introduced in armv7
	mul	r4, r4, r2		// t = c (a + b)
	mov	r2, r1			// c' = a (bah!)
	mla	r1, r0, r3, r4		// b' = a (d - c) + c (a + b)
					//	= a d + b c
	mls	r0, r2, r5, r4		// a' = c (a + b) - b (c + d)
					//	= a c - b d

#elif defined(__aarch64__)

	add	x4, x0, x1		// t = a + b
	add	x5, x2, x3		// u = c + d
	sub	x3, x3, x2		// d' = d - c

	// mls intxoduced in axmv7
	mul	x4, x4, x2		// t = c (a + b)
	mov	x2, x1			// c' = a (bah!)
	madd	x1, x0, x3, x4		// b' = a (d - c) + c (a + b)
					//	= a d + b c
	msub	x0, x2, x5, x4		// a' = c (a + b) - b (c + d)
					//	= a c - b d

#else
	notimpl
#endif

	ret

endproc

proc	x22

	// divide by 3

#if defined(__x86_64__)

	mov	rdx, 0xaaaaaaaaaaaaaaab // = ceil(2/3 2^64)
	mul	rdx			// d' || a' =~ 2/3 a 2^64
	shr	rdx, 1			// d' = floor(a/3)
	mov	rax, rdx		// a' = floor(a/3)

	// we start with 0 <= a < 2^64.  write f = ceil(2/3 2^64), so that
	// 2/3 < f/2^64 < 2/3 + 1/2^64.  then floor(2/3 a) <= floor(a f/2^64)
	// <= floor(2/3 a + a/2^64), but a < 2^64 so a/2^64 < 1 and
	// floor(a f/2^64) = floor(2/3 a).

#elif defined(__i386__)

	mov	edx, 0xaaaaaaab		// = ceil(2/3 2^32)
	mul	edx			// d' || a' =~ 2/3 a 2^32
	shr	edx, 1			// d' = floor(a/3)
	mov	eax, edx		// a' = floor(a/3)

#elif defined(__arm__)

	ldr	r12, =0xaaaaaaab
	umull	r12, r0, r0, r12
	mov	r0, r0, lsr #1

#elif defined(__aarch64__)

	ldr	x16, =0xaaaaaaaaaaaaaaab
	umulh	x0, x0, x16
	lsr	x0, x0, #1

#else
	notimpl
#endif

	ret

endproc

proc	x23

#if defined(__x86_64__)

	// main loop: shorten a preserving residue class mod 3
0:	cmp	rax, 5
	jbe	8f
	// a > 5
	mov	rdx, rax		// d' = a
	shr	rdx, 2			// d' = floor(a/4)
	and	rax, 3			// a = 4 d' + a' (0 <= a' < 4)
	add	rax, rdx		// a' == a (mod 3) but a' < a/4 + 4
	jmp	0b

	// fix up final value 0 <= a < 6: want 0 <= a < 3
	//
	// the tricky part is actually a = 3; but the other final cases take
	// additional iterations which we can avoid.
8:	cmp	rax, 3			// set cf iff a < 3
	cmc				// set cf iff a >= 3
	sbb	rdx, rdx		// d' = a >= 3 ? -1 : 0
	and	rdx, 3			// d' = a >= 3 ? 3 : 0
	sub	rax, rdx		// a' = a - (a >= 3 ? 3 : 0)
					//	= a (mod 3)

#elif defined(__i386__)

	// main loop: shorten a preserving residue class mod 3
0:	cmp	eax, 5
	jbe	8f
	// a > 5
	mov	edx, eax		// d' = a
	shr	edx, 2			// d' = floor(a/4)
	and	eax, 3			// a = 4 d' + a' (0 <= a' < 4)
	add	eax, edx		// a' == a (mod 3) but a' < a/4 + 4
	jmp	0b

	// fix up final value 0 <= a < 6: want 0 <= a < 3
	//
	// the tricky part is actually a = 3; but the other final cases take
	// additional iterations which we can avoid.
8:	cmp	eax, 3			// set cf iff a < 3
	cmc				// set cf iff a >= 3
	sbb	edx, edx		// d' = a >= 3 ? -1 : 0
	and	edx, 3			// d' = a >= 3 ? 3 : 0
	sub	eax, edx		// a' = a - (a >= 3 ? 3 : 0)
					//	= a (mod 3)

#elif defined(__arm__)

0:	cmp	r0, #6
	andhs	r12, r0, #3
	addhs	r0, r12, r0, lsr #2
	bhs	0b

	cmp	r0, #3
	subhs	r0, r0, #3

#elif defined(__aarch64__)

0:	cmp	x0, #6
	// blunder on through regardless since this doesn't affect the result
	and	x16, x0, #3
	add	x0, x16, x0, lsr #2
	b.hs	0b

	subs	x16, x0, #3
	cmov.hs	x0, x16

#else
	notimpl
#endif

	ret

endproc

proc	x24

	// invert (odd) a mod 2^64
	//
	// suppose a a_i == 1 (mod 2^{2^i})
	//
	// clearly good for i = 0, since 2^i = 1 and 2^{2^i} = 2, and a_0 =
	// a == 1 (mod 2) by assumption
	//
	// write a a_i == b_i 2^{2^i} + 1 (mod 2^{2^{i+1}})
	// then b_i == (a a_i - 1)/2^{2^i} (mod 2^{2^i})
	// to lift inverse, we want x such that a x == -b_i (mod 2^{2^i});
	// clearly x = -a_i b_i will do, since a a_i == 1 (mod 2^{2^i})
	// then:
	// a_{i+1} = a_i - a_i b_i 2^{2^i} = a_i (1 - (a a_i - 1))
	//	= 2 a_i - a a_i^2
	//
	// check:
	// a a_{i+1} = 2 a a_i - a^2 a_i^2
	//	== 2 a a_i - (b_i 2^{2^i} + 1)^2
	//	== 2 (b_i 2^{2^i} + 1) -
	//		(b_i^2 2^{2^{i+1}} + 2 b_i 2^{2^i} + 1)
	//	== 1 (mod 2^{2^{i+1}})

#if defined(__x86_64__)

	// rax				// a_0 = a
	mov	rbx, rax		// b' = a
	mov	rsi, rax		// t = a_0

0:
  cmp rbp, 0
  je 1f
  stosq
  dec rbp
1:
	mul	rbx			// a' = a a_i
	mov	rcx, rax		// c = a a_i

	sub	rax, 2			// a' = a a_i - 2
	neg	rax			// a' = 2 - a a_i
	mul	rsi			// a_{i+1} = a_i (2 - a a_i)
					//	= 2 a_i - a a_i^2
	mov	rsi, rax		// t = a_{i+1}

	cmp	rcx, 1			// done?
	ja	0b			// no -- iterate

#elif defined(__i386__)

	// eax				// a_0 = a
	mov	ebx, eax		// b' = a
	mov	esi, eax		// t = a_0

0:
  cmp ebp, 0
  je 1f
  stosd
  dec ebp
1:
	mul	ebx			// a' = a a_i
	mov	ecx, eax		// c = a a_i

	sub	eax, 2			// a' = a a_i - 2
	jb	9f			// done if < 2
	neg	eax			// a' = 2 - a a_i
	mul	esi			// a_{i+1} = a_i (2 - a a_i)
					//	= 2 a_i - a a_i^2
	mov	esi, eax		// t = a_{i+1}

	jmp	0b			// and iterate
9:	mov	eax, esi		// restore

#elif defined(__arm__)

	// r0				// a_0 = a
	mov	r1, r0			// b' = a

0:
  cmp r6, #0
  strne r0, [r5], #4
  subne r6, r6, #1
	mul	r2, r0, r1		// c = a a_i
	rsbs	r2, r2, #2		// c = 2 - a a_i
	mul	r0, r0, r2		// a_{i+1} = a_i (2 - a a_i)
					//	= 2 a_i - a a_i^2
	blo	0b

#elif defined(__aarch64__)

	// x0				// a_0 = a
	mov	x1, x0			// b' = a
	mov	x16, #2			// because we have no rsb

0:
  cmp x6, #0
  b.eq 1f
  str x0, [x5], #8
  sub x6, x6, #1
1:
	mul	x2, x0, x1		// c = a a_i
	subs	x2, x16, x2		// c = 2 - a a_i
	mul	x0, x0, x2		// a_{i+1} = a_i (2 - a a_i)
					//	= 2 a_i - a a_i^2
	b.lo	0b

#else
	notimpl
#endif

	ret

endproc

proc	x25

	// a poor approximation to pi/4
	//
	// think of x and y as being in 16.16 fixed-point format.  we sample
	// points in the unit square, and determine how many of them are
	// within a unit quarter-circle centred at the origin.  the area of
	// the quarter-circle is pi/4.

#if defined(__x86_64__)

	xor	eax, eax		// a = 0
	mov	rcx, 1
	shl	rcx, 0x20		// c =~ 4 billion

0:	movzx	rbx, cx			// x = low 16 bits of c
	imul	rbx, rbx		// b = x^2

	ror	rcx, 0x10		// switch halves of c
	movzx	rdx, cx			// y = high 16 bits of c
	imul	rdx, rdx		// d = y^2
	rol	rcx, 0x10		// switch back

	add	rbx, rdx		// r^2 = x^2 + y^2
	shr	rbx, 0x20		// r^2 >= 1?
	cmp	rbx, 1			// set cf iff r^2 >= 1
	adc	rax, 0			// and add onto accumulator
	loop	0b

#elif defined(__i386__)

	// this is actually better done in 32 bits.  the carry has the wrong
	// sense here, so instead deduct one for each point outside the
	// quarter-circle rather than adding one for each point inside it.
	xor	eax, eax
	xor	ecx, ecx

0:	movzx	ebx, cx
	imul	ebx, ebx

	mov	edx, ecx
	shr	edx, 0x10
	imul	edx, edx

	add	ebx, edx		// see?
	sbb	eax, 0
	loop	0b

#elif defined(__arm__)

	mov	r0, #0
	mov	r2, #0

0:	uxth	r1, r2, ror #0
	uxth	r3, r2, ror #16
	mul	r1, r1, r1
	mul	r3, r3, r3
	cmn	r1, r3			// mlas doesn't set cf usefully
	addcc	r0, r0, #1
	adds	r2, r2, #1
	bne	0b

#elif defined(__aarch64__)

	mov	w0, #0
	mov	w2, #0

0:	ubfx	w1, w2, #0, #16
	ubfx	w3, w2, #16, #16
	sub	w2, w2, #1
	mul	w1, w1, w1
	mul	w3, w3, w3
	cmn	w1, w3
	cinc.cc	w0, w0
	cbnz	w2, 0b

#else
	notimpl
#endif

	ret

endproc

proc	x26

	// a bad way to rotate a right by 7 places

#if defined(__x86_64__)

	mov	rbx, rax
	ror	rbx, 7			// better

	mov	rdx, rax		// d' = a
	shr	rax, 7			// a' = a >> 7
	shl	rdx, 0x39		// d' = a << 57
	or	rax, rdx		// a' = a >>> 7

#elif defined(__i386__)

	mov	ebx, eax
	ror	ebx, 7			// better

	mov	edx, eax		// d' = a
	shr	eax, 7			// a' = a >> 7
	shl	edx, 0x39		// d' = a << 57
	or	eax, edx		// a' = a >>> 7

#elif defined(__arm__)

	mov	r1, r0, ror #7		// easy way

	// even the hard way is fairly easy on arm
	mov	r3, r0, lsl #25
	orr	r0, r3, r0, lsr #7	// hard way

#elif defined(__aarch64__)

	ror	x1, x0, #7		// easy way

	// even the hard way is fairly easy on arm
	lsl	x3, x0, #57
	orr	x0, x3, x0, lsr #7	// hard way

#else
	notimpl
#endif

	ret

endproc

proc	x27

	// shift a right by c places, in two halves

#if defined(__x86_64__)

	mov	ch, cl			// c' = [c, c]
	inc	ch			// c' = [c, c + 1]
	shr	ch, 1
	shr	cl, 1			// c' = [floor(c/2), ceil(c/2)]
	shr	rax, cl
	xchg	ch, cl
	shr	rax, cl

#elif defined(__i386__)

	mov	ch, cl			// c' = [c, c]
	inc	ch			// c' = [c, c + 1]
	shr	ch, 1
	shr	cl, 1			// c' = [floor(c/2), ceil(c/2)]
	shr	eax, cl
	xchg	ch, cl
	shr	eax, cl

#elif defined(__arm__)

	// it would be clearer and more efficient to say: `mov r12, r2, lsr
	// #1; sub r2, r2, r12', but that's not the lesson this exercise is
	// trying to teach.
	add	r12, r2, #1
	mov	r2, r2, lsr #1
	mov	r12, r12, lsr #1
	mov	r0, r0, lsr r2
	mov	r0, r0, lsr r12

#elif defined(__aarch64__)

	add	w16, w2, #1
	lsr	w2, w2, #1
	lsr	w16, w16, #1
	lsr	x0, x0, x2
	lsr	x0, x0, x16

#else
	notimpl
#endif

	ret

endproc

proc	x28

	// divide c-byte little-endian bignum at rsi by 2 (rounding down)

#if defined(__x86_64__)

	clc
0:	rcr	byte ptr [rsi], 1
	inc	rsi
	loop	0b

#elif defined(__i386__)

	clc
0:	rcr	byte ptr [esi], 1
	inc	esi
	loop	0b

#elif defined(__arm__)

	// we could hack this a word at a time using rrx
	mov	r3, #0
0:	ldrb	r12, [r4]
	subs	r2, r2, #1
	orr	r3, r3, r12, lsr #1
	strb	r3, [r4], #1
	mov	r3, r12, lsl #7
	bne	0b

#elif defined(__aarch64__)

	mov	w16, #0
0:	ldrb	w17, [x4]
	sub	x2, x2, #1
	orr	w16, w16, w17, lsr #1
	strb	w16, [x4], #1
	lsl	w16, w17, #7
	cbnz	x2, 0b

#else
	notimpl
#endif

	ret

endproc

proc	x29

	// fill a buffer with a 3-byte pattern

#if defined(__x86_64__)

	lea	rdi, [rsi + 3]
	rep movsb

#elif defined(__i386__)

	lea	edi, [esi + 3]
	rep movsb

#elif defined(__arm__)

	add	r5, r4, #3
0:	subs	r2, r2, #1
	ldrhsb	r12, [r4], #1
	strhsb	r12, [r5], #1
	bhs	0b

#elif defined(__aarch64__)

	cbz	x2, 9f
	add	x5, x4, #3
0:	sub	x2, x2, #1
	ldrb	w16, [x4], #1
	strb	w16, [x5], #1
	cbnz	x2, 0b
9:

#else
	notimpl
#endif

	ret

endproc

proc	x2a

	// rotate the words in a buffer, so that the last word comes first,
	// the first comes second, and so on.  this isn't a good way to do
	// it.

#if defined(__x86_64__)

	mov	rsi, rbx		// set string pointers
	mov	rdi, rbx
0:	lodsq				// fetch next word
	xchg	rax, qword ptr [rbx]	// stash it for next iteration and
					// replace it with the previously
					// stashed word
	stosq				// store in output
	// (note that the first iteration doesn't actually do anything)
	loop	0b			// continue until all done

#elif defined(__i386__)

	mov	esi, ebx		// set string pointers
	mov	edi, ebx
0:	lodsd				// fetch next word
	xchg	eax, dword ptr [ebx]	// stash it for next iteration and
					// replace it with the previously
					// stashed word
	stosd				// store in output
	loop	0b			// continue until all done

#elif defined(__arm__)

	// let's do this a sensible way.  (we could go faster using ldm/stm.)
	add	r0, r1, r2, lsl #2	// find the end of the buffer
	ldr	r0, [r0, #-4]		// collect final element
0:	subs	r2, r2, #1
	ldr	r12, [r1]
	str	r0, [r1], #4
	mov	r0, r12
	bne	0b

#elif defined(__aarch64__)

	add	x0, x1, x2, lsl #3	// find the end of the buffer
	ldr	x0, [x0, #-8]		// collect final element
0:	sub	x2, x2, #1
	ldr	x16, [x1]
	str	x0, [x1], #8
	mov	x0, x16
	cbnz	x2, 0b

#else
	notimpl
#endif

	ret

endproc

proc	x2b

	// find a cycle in a function f: B -> B, where B = {0, 1, ..., 255}

#if defined(__x86_64__)

	// this is floyd's cycle-finding algorithm.
	//
	// consider the sequence s_0 = 0, s_1 = f(0), s_2 = f(f(0)), ...,
	// s_{i+1} = f(s_i).  since B is finite, there must be some smallest
	// t and c such that s(t) = s(t + c); then we have s_i = s_j iff
	// i >= t, j >= t, and i == j (mod c).
	//
	// the algorithm sets two cursors advancing through the sequence: a
	// /tortoise/ which advances one step at a time, and a /hare/ which
	// advances by two, so when the tortoise is at element s_i, the hare
	// is at s_{2i}.  the hare will run around the cycle and catch the
	// tortoise when i >= t and i == 2 i (mod c); the latter is simply i
	// == 0 (mod c), which therefore happens first when i = k = t +
	// (-t mod c).
	//
	// i'm not sure what good xlatb does here that mov al, [rbx + al]
	// doesn't.

	xor	eax, eax		// tortoise starts at 0
	xor	edx, edx		// hare starts at 0
0:	xlatb				// advance tortoise
	xchg	rax, rdx		// switch to hare
	xlatb				// advance hare ...
	xlatb				// ... twice
	xchg	rax, rdx		// switch back
	cmp	al, dl			// hare caught the tortoise?
	jnz	0b			// no -- go around again

	// now we trace the initial tail: reset the tortoise to s_0, and slow
	// the hare down so that both take only a single step in each
	// iteration.  this loop terminates when i >= t and i == i + 2 k
	// (mod c).  we know k is a multiple of c, so the latter condition
	// always holds, so this finds the first step of the cycle.

	xor	eax, eax		// reset the tortoise
0:	xlatb				// advance tortoise
	xchg	rax, rdx		// switch to hare
	xlatb				// advance hare
	xchg	rax, rdx		// and switch back
	cmp	al, dl			// done?
	jnz	0b			// no -- iterate

#elif defined(__i386__)

	xor	eax, eax		// tortoise starts at 0
	xor	edx, edx		// hare starts at 0
0:	xlatb				// advance tortoise
	xchg	eax, edx		// switch to hare
	xlatb				// advance hare ...
	xlatb				// ... twice
	xchg	eax, edx		// switch back
	cmp	al, dl			// hare caught the tortoise?
	jnz	0b			// no -- go around again

	xor	eax, eax		// reset the tortoise
0:	xlatb				// advance tortoise
	xchg	eax, edx		// switch to hare
	xlatb				// advance hare
	xchg	eax, edx		// and switch back
	cmp	al, dl			// done?
	jnz	0b			// no -- iterate

#elif defined(__arm__)

	mov	r0, #0
	mov	r3, #0
0:	ldrb	r0, [r1, r0]
	ldrb	r3, [r1, r3]
	ldrb	r3, [r1, r3]
	cmp	r0, r3
	bne	0b

	mov	r0, #0
0:	ldrb	r0, [r1, r0]
	ldrb	r3, [r1, r3]
	cmp	r0, r3
	bne	0b

#elif defined(__aarch64__)

	mov	w0, #0
	mov	w3, #0
0:	ldrb	w0, [x1, x0]
	ldrb	w3, [x1, x3]
	ldrb	w3, [x1, x3]
	cmp	w0, w3
	b.ne	0b

	mov	w0, #0
0:	ldrb	w0, [x1, x0]
	ldrb	w3, [x1, x3]
	cmp	w0, w3
	b.ne	0b

#else
	notimpl
#endif

	ret

endproc

proc	x2c

	// a convoluted way to set rax = rsi

#if defined(__x86_64__)

	mov	qword ptr [rbx + 8*rcx], 0 // b[c] = 0
	mov	qword ptr [rbx + 8*rdx], 1 // b[d] = 1
	mov	rax, [rbx + 8*rcx]	// a' = b[c] = 0

	mov	[rbx], rsi		// b[0] = t
	mov	[rbx + 8], rdi		// b[1] = u
	mov	rax, [rbx + 8*rax]	// a' = b[a'] = b[0] = t

#elif defined(__i386__)

	mov	dword ptr [ebx + 8*ecx], 0 // b[c] = 0
	mov	dword ptr [ebx + 8*edx], 1 // b[d] = 1
	mov	eax, [ebx + 8*ecx]	// a' = b[c] = 0

	mov	[ebx], esi		// b[0] = t
	mov	[ebx + 8], edi		// b[1] = u
	mov	eax, [ebx + 8*eax]	// a' = b[a'] = b[0] = t

#elif defined(__arm__)

	mov	r0, #0
	mov	r12, #1

	str	r0, [r1, r2, lsl #2]
	str	r12, [r1, r3, lsl #2]
	ldr	r0, [r1, r2, lsl #2]

	str	r4, [r1]
	str	r5, [r1, #4]
	ldr	r0, [r1, r0, lsl #2]

#elif defined(__aarch64__)

	mov	x16, #1

	str	xzr, [x1, x2, lsl #3]
	str	x16, [x1, x3, lsl #3]
	ldr	x0, [x1, x2, lsl #3]

	str	x4, [x1]
	str	x5, [x1, #8]
	ldr	x0, [x1, x0, lsl #3]

#else
	notimpl
#endif

	ret

endproc

proc	x2d

	// clear the least significant set bit in a, by calculating a' =
	// a AND (a - 1).
	//
	// if a = 0 then a' = 0.  otherwise, a - 1 differs from a exactly in
	// the least significant /set/ bit of a, and all bits of lesser
	// significance.  to put it another way: write a = u 2^{k+1} + 2^k;
	// then a - 1 = u 2^{k+1} + 2^{k-1} + ... + 2 + 1.  taking the
	// bitwise AND of these leaves only the bits common to both, i.e.,
	// u 2^{k+1}.

#if defined(__x86_64__)

	mov	rdx, rax		// d' = a
	dec	rax			// a' = a - 1
	and	rax, rdx		// a' = a AND (a - 1)

#elif defined(__i386__)

	mov	edx, eax		// d' = a
	dec	eax			// a' = a - 1
	and	eax, edx		// a' = a AND (a - 1)

#elif defined(__arm__)

	sub	r3, r0, #1
	and	r0, r0, r3

#elif defined(__aarch64__)

	sub	x3, x0, #1
	and	x0, x0, x3

#else
	notimpl
#endif

	ret

endproc

proc	x2e

	// compute a mask of one bits in exactly the positions of the
	// low-order run of zero bits in a

#if defined(__x86_64__)

	mov	rdx, rax		// d' = a
	dec	rdx			// d' = a - 1
	xor	rax, rdx		// a = a XOR (a - 1)
					//   set bits are least significant
					//   set bit of a, and all bits of
					//   lesser significance
	shr	rax, 1			// now only bits of lesser
					//   significance; a' = 0 iff a odd
	cmp	rax, rdx		// equal if a = 0 or 2^k; otherwise
					//   strictly less

#elif defined(__i386__)

	mov	edx, eax
	dec	edx
	xor	eax, edx
	shr	eax, 1
	cmp	eax, edx

#elif defined(__arm__)

	sub	r3, r0, #1
	eor	r0, r0, r3
	mov	r0, r0, lsr #1		// probably fold shift into next inst
	cmp	r0, r3

#elif defined(__aarch64__)

	sub	x3, x0, #1
	eor	x0, x0, x3
	mov	x0, x0, lsr #1		// probably fold shift into next inst
	cmp	x0, x3

#else
	notimpl
#endif

	ret

endproc

proc	x2f

	// a slow population count

#if defined(__x86_64__)

	popcnt	rbx, rcx		// the easy way

	// a fast version in software
	mov	rax, rcx

	mov	rdx, rcx
	shr	rdx, 1
	mov	rsi, 0x5555555555555555
	and	rax, rsi
	and	rdx, rsi
	add	rax, rdx

	mov	rdx, rax
	shr	rdx, 2
	mov	rsi, 0x3333333333333333
	and	rax, rsi
	and	rdx, rsi
	add	rax, rdx

	mov	rdx, rax
	shr	rdx, 32
	add	rax, rdx

	mov	rdx, rax
	shr	rdx, 4
	and	rax, 0x0f0f0f0f
	and	rdx, 0x0f0f0f0f
	add	rax, rdx

	mov	rdx, rax
	shr	rdx, 8
	add	rax, rdx

	mov	rdx, rax
	shr	rdx, 16
	add	rax, rdx
	movzx	rsi, al

	// the official version
	xor	eax, eax		// clear iteration counter
0:	jrcxz	9f			// bail if c = 0
	inc	rax			// bump iteration count
	mov	rdx, rcx		// d' = c
	dec	rdx			// d' = c - 1
	and	rcx, rdx		// zap least significant set bit of c
	jmp	0b			// and go again
9:

#elif defined(__i386__)

	popcnt	ebx, ecx		// the easy way

	mov	eax, ecx

	mov	edx, ecx
	shr	edx, 1
	and	eax, 0x55555555
	and	edx, 0x55555555
	add	eax, edx

	mov	edx, eax
	shr	edx, 2
	and	eax, 0x33333333
	and	edx, 0x33333333
	add	eax, edx

	mov	edx, eax
	shr	edx, 4
	add	eax, edx

	mov	edx, eax
	shr	edx, 8
	and	eax, 0x000f000f
	and	edx, 0x000f000f
	add	eax, edx

	mov	edx, eax
	shr	edx, 16
	add	eax, edx
	movzx	esi, al

	xor	eax, eax
0:	jecxz	9f
	inc	eax
	mov	edx, ecx
	dec	edx
	and	ecx, edx
	jmp	0b
9:

#elif defined(__arm__)

	// the easy-ish way
	vmov	d0[0], r2
	vcnt.8	d0, d0
	vmov	r1, d0[0]
	add	r1, r1, r1, lsl #8
	add	r1, r1, r1, lsl #16
	mov	r1, r1, lsr #24

	// the hard way
	movw	r12, #0x5555
	movt	r12, #0x5555
	and	r3, r12, r2, lsr #1
	and	r0, r12, r2
	add	r0, r0, r3

	movw	r12, #0x3333
	movt	r12, #0x3333
	and	r3, r12, r0, lsr #2
	and	r0, r12, r0
	add	r0, r0, r3

	add	r0, r0, r0, lsl #16

	movt	r12, 0x0f0f
	and	r3, r12, r0, lsr #4
	and	r0, r12, r0
	add	r0, r0, r3

	add	r0, r0, r0, lsl #8

	mov	r4, r0, lsr #24

	// and following the exercise
	mov	r0, #0
	cmp	r2, #0
	beq	9f
0:	add	r0, r0, #1
	sub	r3, r2, #1
	ands	r2, r2, r3
	bne	0b
9:

#elif defined(__aarch64__)

	// the easy-ish way
	mov	v0.d[0], x2
	cnt	v0.8b, v0.8b
	mov	x1, v0.d[0]
	add	x1, x1, x1, lsl #8
	add	x1, x1, x1, lsl #16
	add	x1, x1, x1, lsl #32
	lsr	x1, x1, #56

	// the hard way -- though arm64's immediate constant encodings and
	// shifting make this actually rather pleasant.
	and	x3, x2, #0xaaaaaaaaaaaaaaaa
	and	x0, x2, #0x5555555555555555
	add	x0, x0, x3, lsr #1

	and	x3, x0, #0xcccccccccccccccc
	and	x0, x0, #0x3333333333333333
	add	x0, x0, x3, lsr #2

	add	x0, x0, x0, lsr #4

	and	x3, x0, #0x0f000f000f000f00
	and	x0, x0, #0x000f000f000f000f
	add	x0, x3, x0, lsl #8

	add	x0, x0, x0, lsl #16
	add	x0, x0, x0, lsl #32
	lsr	x4, x0, #56

	// and the official way
	mov	x0, #0
	cbz	x2, 9f
0:	add	x0, x0, #1
	sub	x3, x2, #1
	and	x2, x2, x3
	cbnz	x2, 0b
9:

#else
	notimpl
#endif

	ret

endproc

///--------------------------------------------------------------------------
/// 0x30--0x3f

proc	x30

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

	ret

endproc

proc	x31

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc	x32

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc	x33

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc	x34

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc	x35

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc	x36

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc	x37

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc	x38

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc	x39

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc	x3a

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc	x3b

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc	x3c

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc	x3d

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc	x3e

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc	x3f

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

///----- That's all, folks --------------------------------------------------