mdw@git.distorted.org.uk Git - xchg-rax-rax/blame_incremental

... / ...

Commit	Line	Data
	1	/// -*- mode: asm; asm-comment-char: 0 -*-
	2
	3	///--------------------------------------------------------------------------
	4	/// Preliminaries.
	5
	6	#include <sys/syscall.h>
	7
	8	#if defined(__i386__) \|\| defined(__x86_64__)
	9
	10	.intel_syntax noprefix
	11
	12	#elif defined(__arm__)
	13
	14	.macro ret
	15	bx r14
	16	.endm
	17
	18	.arch armv7-a
	19	.fpu neon
	20
	21	#elif defined(__aarch64__)
	22
	23	.macro cmov rd, rn, cc
	24	csel \rd, \rn, \rd, \cc
	25	.endm
	26	#define _COND(_) \
	27	_(eq) _(ne) _(cs) _(cc) _(vs) _(vc) _(mi) _(pl) \
	28	_(ge) _(lt) _(gt) _(le) _(hi) _(ls) _(al) _(nv) \
	29	_(hs) _(lo)
	30	#define _INST(_) \
	31	_(ccmp) _(ccmn) \
	32	_(csel) _(cmov) \
	33	_(csinc) _(cinc) _(cset) \
	34	_(csneg) _(cneg) \
	35	_(csinv) _(cinv) _(csetm)
	36	#define _CONDVAR(cc) _definstvar cc;
	37	#define _INSTVARS(inst) \
	38	.macro _definstvar cc; \
	39	.macro inst.\cc args:vararg; inst \args, \cc; .endm; \
	40	.endm; \
	41	_COND(_CONDVAR); \
	42	.purgem _definstvar;
	43	_INST(_INSTVARS)
	44	#undef _COND
	45	#undef _INST
	46	#undef _CONDVAR
	47	#undef _INSTVARS
	48
	49	#define CCMP_N 8
	50	#define CCMP_Z 4
	51	#define CCMP_C 2
	52	#define CCMP_V 1
	53
	54	#define CCMP_MI CCMP_N
	55	#define CCMP_PL 0
	56	#define CCMP_EQ CCMP_Z
	57	#define CCMP_NE 0
	58	#define CCMP_CS CCMP_C
	59	#define CCMP_HS CCMP_C
	60	#define CCMP_CC 0
	61	#define CCMP_LO 0
	62	#define CCMP_VS CCMP_V
	63	#define CCMP_VC 0
	64	#define CCMP_HI CCMP_C
	65	#define CCMP_LS 0
	66	#define CCMP_LT CCMP_N
	67	#define CCMP_GE 0
	68	#define CCMP_LE CCMP_N
	69	#define CCMP_GT 0
	70
	71	#else
	72	# error "not supported"
	73	#endif
	74
	// proc NAME: begin a global function NAME, aligned to 16 bytes.
	//
	// As a side effect this defines a one-shot macro `endproc', which
	// records the size of NAME and then removes itself, so every `proc'
	// must be closed by exactly one `endproc' before the next begins.
	.macro	proc	name
	.type	\name, STT_FUNC
	.globl	\name
	.p2align 4
\name\():
	.macro	endproc
	.size	\name, . - \name
	.purgem	endproc
	.endm
	.endm
	85
	// ch C: debugging aid.  Write the character C to stdout and flush
	// stdout, i.e., putchar(C); fflush(stdout).
	//
	// C must be an immediate constant.  The stack pointer is realigned
	// for the calls and put back afterwards.  Preserved across the
	// macro:
	//
	//	i386	eflags, eax, ebx, ecx, edx, ebp
	//	x86-64	rflags, rax, rcx, rdx, rsi, rdi, r8--r11, rbp
	//	a32	r0--r4, r12, r14 (the flags are NOT saved)
	//	a64	x0--x17, x30, nzcv
	.macro	ch	c
#if defined(__i386__)

	pushf
	push	eax
	push	ebx
	push	ecx
	push	edx
	push	ebp
	mov	ebp, esp		// remember the unaligned sp
	and	esp, -16

	// calls through the plt need ebx -> got in position-independent
	// code, so set that up first.  the `add' must directly follow the
	// call: the relocation assumes ebx holds the address of the `add'.
	call	get_pc_ebx
	add	ebx, offset _GLOBAL_OFFSET_TABLE_

	sub	esp, 12			// keep sp 16-byte aligned at calls
	push	\c
	call	putchar@plt

	mov	eax, [ebx + stdout@GOT]	// eax -> stdout variable
	mov	eax, [eax]		// eax = stdout
	mov	[esp], eax		// arguments are passed on the stack
	call	fflush@plt

	mov	esp, ebp
	pop	ebp
	pop	edx
	pop	ecx
	pop	ebx
	pop	eax
	popf

#elif defined(__x86_64__)

	pushf
	push	rax
	push	rcx
	push	rdx
	push	rsi
	push	rdi
	push	r8
	push	r9
	push	r10			// r10 and r11 are caller-saved too
	push	r11
	push	rbp
	mov	rbp, rsp		// remember the unaligned sp
	and	rsp, -16

	mov	rdi, \c
	call	putchar@plt

	mov	rdi, [rip + stdout]
	call	fflush@plt

	mov	rsp, rbp
	pop	rbp
	pop	r11
	pop	r10
	pop	r9
	pop	r8
	pop	rdi
	pop	rsi
	pop	rdx
	pop	rcx
	pop	rax
	popf

#elif defined(__arm__)

	stmfd	r13!, {r0-r4, r12, r14}

	mov	r4, r13			// r4 survives the calls
	bic	r14, r4, #15
	mov	r13, r14

	mov	r0, #\c
	bl	putchar@plt

	// find the got (pc reads as the current instruction + 8), and from
	// it the address of the stdout variable.
	ldr	r14, .L$_c$gotoff$\@
.L$_c$gotpc$\@:
	add	r14, pc, r14		// r14 -> got
	ldr	r0, .L$_c$stdout$\@	// got-relative offset of the entry
	b	.L$_c$cont$\@
.L$_c$gotoff$\@:
	.word	_GLOBAL_OFFSET_TABLE_ - .L$_c$gotpc$\@ - 8
.L$_c$stdout$\@:
	.word	stdout(GOT)
.L$_c$cont$\@:
	ldr	r0, [r14, r0]		// r0 -> stdout variable
	ldr	r0, [r0]		// r0 = stdout
	bl	fflush@plt

	mov	r13, r4
	ldmfd	r13!, {r0-r4, r12, r14}

#elif defined(__aarch64__)

	sub	sp, sp, #20*8
	stp	x0, x1, [sp, #0]
	stp	x2, x3, [sp, #16]
	stp	x4, x5, [sp, #32]
	stp	x6, x7, [sp, #48]
	stp	x8, x9, [sp, #64]
	stp	x10, x11, [sp, #80]
	stp	x12, x13, [sp, #96]
	stp	x14, x15, [sp, #112]
	stp	x16, x17, [sp, #128]
	mrs	x16, nzcv		// flags travel with the link register
	stp	x16, x30, [sp, #144]

	mov	w0, #\c
	bl	putchar
	adrp	x0, :got:stdout
	ldr	x0, [x0, #:got_lo12:stdout] // x0 -> stdout variable
	ldr	x0, [x0]		// x0 = stdout
	bl	fflush

	ldp	x16, x30, [sp, #144]
	msr	nzcv, x16
	ldp	x16, x17, [sp, #128]
	ldp	x14, x15, [sp, #112]
	ldp	x12, x13, [sp, #96]
	ldp	x10, x11, [sp, #80]
	ldp	x8, x9, [sp, #64]
	ldp	x6, x7, [sp, #48]
	ldp	x4, x5, [sp, #32]
	ldp	x2, x3, [sp, #16]
	ldp	x0, x1, [sp, #0]
	add	sp, sp, #20*8

#else
# error "not supported"
#endif
	.endm
	208
	// notimpl: placeholder for code which hasn't been written for this
	// target; emits an instruction which traps if it is ever executed.
	.macro	notimpl
#if defined(__i386__) || defined(__x86_64__)
	ud2
#elif defined(__arm__)
	udf
#elif defined(__aarch64__)
	hlt	#0
#else
# error "not supported"
#endif
	.endm
	220
	// an empty .note.GNU-stack section, then on to the code.
	.section .note.GNU-stack, "", %progbits

	.text

#if defined(__i386__)
	// i386 cannot read its program counter directly.  this helper
	// copies its own return address -- the address of the instruction
	// following the `call' -- into ebx.
get_pc_ebx:
	mov	ebx, [esp]
	ret
#endif
	230
	231
	232	proc call_example
	233
	234	#if defined(__i386__)
	235
	236	push ebx // ebx
	237	push esi // esi, ebx
	238	push edi // edi, esi, ebx
	239	push ebp // flags, ebp, ..., ebx
	240	pushf
	241
	242	mov edi, [esp + 4*6]
	243	mov esi, [esp + 4*7]
	244	push esi // regs, flags, ebp, ..., ebx
	245
	246	call get_pc_ebx
	247	lea eax, [ebx + 9f - .]
	248	push eax // cont, regs, flags, ebp, ..., ebx
	249	push edi // func, cont, regs, flags, ebp, ..., ebx
	250
	251	mov eax, [esi + 28]
	252	pushf
	253	pop ecx
	254	and eax, 0x0cd5
	255	and ecx, ~0x0cd5
	256	or eax, ecx
	257	push eax
	258	popf
	259	mov eax, [esi + 0]
	260	mov ebx, [esi + 4]
	261	mov ecx, [esi + 8]
	262	mov edx, [esi + 12]
	263	mov edi, [esi + 20]
	264	mov ebp, [esi + 24]
	265	mov esi, [esi + 16]
	266
	267	ret // -> func; regs, flags, ebp, ..., ebx
	268
	269	9: pushf // eflags, regs, flags, ebp, ..., ebx
	270	push esi // esi, eflags, regs, flags, ebp, ..., ebx
	271	mov esi, [esp + 8]
	272	mov [esi + 0], eax
	273	mov [esi + 4], ebx
	274	mov [esi + 8], ecx
	275	mov [esi + 12], edx
	276	mov [esi + 20], edi
	277	mov [esi + 24], ebp
	278	pop eax // rflags, regs, flags, ebp, ..., ebx
	279	mov [esi + 16], eax
	280	pop eax // regs, flags, ebp, ..., ebx
	281	mov [esi + 28], eax
	282
	283	add esp, 4 // flags, ebp, ..., ebx
	284	popf // ebp, ..., ebx
	285	pop ebp // ..., ebx
	286	pop edi
	287	pop esi
	288	pop ebx //
	289	ret
	290
	291	#elif defined(__x86_64__)
	292
	293	push rbx // rbx
	294	push r10
	295	push r11
	296	push r12
	297	push r13
	298	push r14
	299	push r15
	300	push rbp // flags, rbp, ..., rbx
	301	pushf
	302
	303	push rsi // regs, flags, rbp, ..., rbx
	304
	305	lea rax, [rip + 9f]
	306	push rax // cont, regs, flags, rbp, ..., rbx
	307	push rdi // func, cont, regs, flags, rbp, ..., rbx
	308
	309	mov rax, [rsi + 8*15]
	310	pushf
	311	pop rcx
	312	and rax, 0x0cd5
	313	and rcx, ~0x0cd5
	314	or rax, rcx
	315	push rax
	316	popf
	317	mov rax, [rsi + 0]
	318	mov rbx, [rsi + 8]
	319	mov rcx, [rsi + 16]
	320	mov rdx, [rsi + 24]
	321	mov rdi, [rsi + 40]
	322	mov rbp, [rsi + 48]
	323	mov r8, [rsi + 56]
	324	mov r9, [rsi + 64]
	325	mov r10, [rsi + 72]
	326	mov r11, [rsi + 80]
	327	mov r12, [rsi + 88]
	328	mov r13, [rsi + 96]
	329	mov r14, [rsi + 104]
	330	mov r15, [rsi + 112]
	331	mov rsi, [rsi + 32]
	332
	333	ret // -> func; regs, flags, rbp, ..., rbx
	334
	335	9: pushf // rflags, regs, flags, rbp, ..., rbx
	336	push rsi // rsi, rflags, regs, flags, rbp, ..., rbx
	337	mov rsi, [rsp + 16]
	338	mov [rsi + 0], rax
	339	mov [rsi + 8], rbx
	340	mov [rsi + 16], rcx
	341	mov [rsi + 24], rdx
	342	mov [rsi + 40], rdi
	343	mov [rsi + 48], rbp
	344	mov [rsi + 56], r8
	345	mov [rsi + 64], r9
	346	mov [rsi + 72], r10
	347	mov [rsi + 80], r11
	348	mov [rsi + 88], r12
	349	mov [rsi + 96], r13
	350	mov [rsi + 104], r14
	351	mov [rsi + 112], r15
	352	pop rax // rflags, regs, flags, rbp, ..., rbx
	353	mov [rsi + 32], rax
	354	pop rax // regs, flags, rbp, ..., rbx
	355	mov [rsi + 120], rax
	356
	357	add rsp, 8 // flags, rbp, ..., rbx
	358	popf // rbp, ..., rbx
	359	pop rbp // ..., rbx
	360	pop r15
	361	pop r14
	362	pop r13
	363	pop r12
	364	pop r11
	365	pop r10
	366	pop rbx //
	367	ret
	368
	369	#elif defined(__arm__)
	370
	371	stmfd r13!, {r0, r1, r4-r11, r14}
	372	ldmia r1, {r0-r12, r14}
	373	msr cpsr, r14
	374	mov r14, pc
	375	ldr pc, [r13], #4
	376	ldr r14, [r13], #4
	377	stmia r14!, {r0-r12}
	378	mrs r0, cpsr
	379	str r0, [r14]
	380	ldmfd r13!, {r4-r11, pc}
	381
	382	#elif defined(__aarch64__)
	383
	384	stp x29, x30, [sp, #-14*8]!
	385	mov x29, sp
	386	stp x19, x20, [sp, #16]
	387	stp x21, x22, [sp, #32]
	388	stp x23, x24, [sp, #48]
	389	stp x25, x26, [sp, #64]
	390	stp x27, x28, [sp, #80]
	391	str x1, [sp, #104]
	392
	393	ldp x29, x30, [x1, #224]
	394	msr nzcv, x30
	395	mov x30, x0
	396	ldp x27, x28, [x1, #208]
	397	ldp x25, x26, [x1, #192]
	398	ldp x23, x24, [x1, #176]
	399	ldp x21, x22, [x1, #160]
	400	ldp x19, x20, [x1, #144]
	401	ldp x16, x17, [x1, #128]
	402	ldp x14, x15, [x1, #112]
	403	ldp x12, x13, [x1, #96]
	404	ldp x10, x11, [x1, #80]
	405	ldp x8, x9, [x1, #64]
	406	ldp x6, x7, [x1, #48]
	407	ldp x4, x5, [x1, #32]
	408	ldp x2, x3, [x1, #16]
	409	ldp x0, x1, [x1, #0]
	410
	411	blr x30
	412
	413	ldr x30, [sp, #104]
	414	stp x27, x28, [x30, #208]
	415	stp x25, x26, [x30, #192]
	416	stp x23, x24, [x30, #176]
	417	stp x21, x22, [x30, #160]
	418	stp x19, x20, [x30, #144]
	419	stp x16, x17, [x30, #128]
	420	stp x14, x15, [x30, #112]
	421	stp x12, x13, [x30, #96]
	422	stp x10, x11, [x30, #80]
	423	stp x8, x9, [x30, #64]
	424	stp x6, x7, [x30, #48]
	425	stp x4, x5, [x30, #32]
	426	stp x2, x3, [x30, #16]
	427	stp x0, x1, [x30, #0]
	428	mov x0, x30
	429	mrs x30, nzcv
	430	stp x29, x30, [x0, #224]
	431
	432	ldp x19, x20, [sp, #16]
	433	ldp x21, x22, [sp, #32]
	434	ldp x23, x24, [sp, #48]
	435	ldp x25, x26, [sp, #64]
	436	ldp x27, x28, [sp, #80]
	437	ldp x29, x30, [sp], #14*8
	438
	439	ret
	440
	441	#else
	442	# error "not supported"
	443	#endif
	444
	445	endproc
	446
	447	proc nop
	448
	449	ret
	450
	451	endproc
	452
	453	///--------------------------------------------------------------------------
	454	/// 0x00--0x0f
	455
	456	proc x00
	457
	458	// clear all 64 bits of extended traditional registers
	459
	460	#if defined(__x86_64__)
	461
	462	xor eax, eax // clear rax
	463	lea rbx, [0] // rbx -> _\|_
	464	loop . // iterate, decrement rcx until zero
	465	mov rdx, 0 // set rdx = 0
	466	and esi, 0 // clear all bits of rsi
	467	sub edi, edi // set rdi = edi - edi = 0
	468	push 0
	469	pop rbp // pop 0 into rbp
	470
	471	#elif defined(__i386__)
	472
	473	xor eax, eax
	474	lea ebx, [0]
	475	loop .
	476	mov edx, 0
	477	and esi, 0
	478	sub edi, edi
	479	push 0
	480	pop ebp
	481
	482	#elif defined(__arm__)
	483
	484	eor r0, r0, r0
	485	rsb r1, r1, r1
	486	0: subs r2, r2, #1
	487	bne 0b
	488	mov r3, #0
	489	and r4, r4, #0
	490	sub r5, r5, r5
	491
	492	#elif defined(__aarch64__)
	493
	494	eor w0, w0, w0
	495	mov w1, wzr
	496	0: sub w2, w2, #1
	497	cbnz w2, 0b
	498	mov w3, #0
	499	and w4, w4, wzr
	500	sub w5, w5, w5
	501
	502	#else
	503	notimpl
	504	#endif
	505
	506	ret
	507
	508	endproc
	509
	510	proc x01
	511
	512	// advance a fibonacci pair by c steps
	513	//
	514	// on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1}
	515	// and f_{i+c}, where f_{i+1} = f_i + f_{i-1}
	516
	517	#if defined(__x86_64__)
	518
	519	0: xadd rax, rdx // a, d = a + d, a
	520	// = f_{i+1} + f_i, f_{i+1}
	521	// = f_{i+2}, f_{i+1}
	522	loop 0b // advance i, decrement c, iterate
	523
	524	#elif defined(__i386__)
	525
	526	0: xadd eax, edx
	527	loop 0b
	528
	529	#elif defined(__arm__)
	530
	531	0: subs r2, r2, #2
	532	add r3, r3, r0
	533	blo 8f
	534	add r0, r0, r3
	535	bhi 0b
	536
	537	8: movne r0, r3
	538
	539	#elif defined(__aarch64__)
	540
	541	0: subs x2, x2, #2
	542	add x3, x3, x0
	543	b.lo 8f
	544	add x0, x0, x3
	545	b.hi 0b
	546
	547	8: cmov.ne x0, x3
	548
	549	#else
	550	notimpl
	551	#endif
	552
	553	ret
	554
	555	endproc
	556
	557	proc x02
	558
	559	// boolean canonify a: if a = 0 on entry, leave it zero; otherwise
	560	// set a = 1
	561
	562	#if defined(__x86_64__)
	563
	564	neg rax // set cf iff a /= 0
	565	sbb rax, rax // a = a - a - cf = -cf
	566	neg rax // a = cf
	567
	568	#elif defined(__i386__)
	569
	570	neg eax
	571	sbb eax, eax
	572	neg eax
	573
	574	#elif defined(__arm__)
	575
	576	movs r1, r0 // the easy way
	577	movne r1, #1 // mvnne r1, #1 for mask
	578
	579	cmp r0, #1 // clear cf iff a == 0
	580	sbc r2, r0, r0 // c' = a - a - 1 + cf = cf - 1
	581	add r2, r2, #1 // c' = cf
	582
	583	sub r3, r0, r0, lsr #1 // d' top bit clear; d' = 0 iff a = 0
	584	rsb r3, r3, #0 // d' top bit set iff a /= 0
	585	mov r3, r3, lsr #31 // asr for mask
	586
	587	rsbs r0, r0, #0
	588	sbc r0, r0, r0
	589	rsb r0, r0, #0
	590
	591	#elif defined(__aarch64__)
	592
	593	cmp x0, #0 // trivial
	594	cset.ne x1 // csetm for mask
	595
	596	cmp xzr, x0 // set cf iff a == 0
	597	sbc x2, x0, x0 // c' = a - a - 1 + cf = cf - 1
	598	neg x2, x2 // c' = 1 - cf
	599
	600	sub x3, x0, x0, lsr #1 // if a < 2^63 then a' = ceil(d/2) <
	601	// 2^63
	602	// if a >= 2^63, write a = 2^63 + t
	603	// with t < 2^63; d' = 2^63 - 2^62 +
	604	// ceil(t/2) = 2^62 + ceil(t/2), and
	605	// ceil(t/2) < 2^62
	606	// anyway d' < 2^63 and d' = 0 iff
	607	// a = 0
	608	neg x3, x3 // d' top bit set iff a /= 0
	609	lsr x3, x3, #63 // asr for mask
	610
	611	cmp x0, #1 // set cf iff a /= 0
	612	adc x0, xzr, xzr // a' = 0 + 0 + cf = cf
	613
	614	#else
	615	notimpl
	616	#endif
	617
	618	ret
	619
	620	endproc
	621
	622	proc x03
	623
	624	// set a = min(a, d) (unsigned); clobber c, d
	625
	626	#if defined(__x86_64__)
	627
	628	sub rdx, rax // d' = d - a; set cf if a > d
	629	sbb rcx, rcx // c = -cf = -[a > d]
	630	and rcx, rdx // c = a > d ? d - a : 0
	631	add rax, rcx // a' = a > d ? d : a
	632
	633	#elif defined(__i386__)
	634
	635	sub edx, eax
	636	sbb ecx, ecx
	637	and ecx, edx
	638	add eax, ecx
	639
	640	#elif defined(__arm__)
	641
	642	cmp r0, r3 // the easy way
	643	movlo r1, r0 // only needed for out-of-place
	644	movhs r1, r3
	645
	646	subs r3, r3, r0
	647	sbc r12, r12, r12
	648	and r12, r12, r3
	649	add r0, r0, r12
	650
	651	#elif defined(__aarch64__)
	652
	653	cmp x0, x3 // the easy way
	654	csel.lo x1, x0, x3
	655
	656	subs x3, x3, x0 // d' = d - a; set cf if d >= a
	657	sbc x16, xzr, xzr // t = -1 + cf = -[a > d]
	658	and x16, x16, x3 // t = a > d ? d - a : 0
	659	add x0, x0, x16 // a' = a > d ? d : a
	660
	661	#else
	662	notimpl
	663	#endif
	664
	665	ret
	666
	667	endproc
	668
	669	proc x04
	670
	671	// switch case?
	672
	673	#if defined(__x86_64__)
	674
	675	// unrelated playing
	676	mov ecx, eax
	677	mov rbx, -1
	678	mov edx, ecx
	679	sub edx, '0'
	680	cmp edx, 10
	681	cmovb rbx, rdx
	682	or ecx, 0x20
	683	mov edx, ecx
	684	sub edx, 'a'
	685	sub ecx, 'a' - 10
	686	cmp edx, 6
	687	cmovb rbx, rcx
	688
	689	xor al, 0x20
	690
	691	#elif defined(__i386__)
	692
	693	// unrelated playing
	694	mov ecx, eax
	695	mov ebx, -1
	696	mov edx, ecx
	697	sub edx, '0'
	698	cmp edx, 10
	699	cmovb ebx, edx
	700	or ecx, 0x20
	701	mov edx, ecx
	702	sub edx, 'a'
	703	sub ecx, 'a' - 10
	704	cmp edx, 6
	705	cmovb ebx, ecx
	706
	707	xor al, 0x20
	708
	709	#elif defined(__arm__)
	710
	711	// unrelated playing
	712	mvn r1, #0
	713	sub r12, r0, #'0'
	714	cmp r12, #10
	715	movlo r1, r12
	716	orr r12, r0, #0x20
	717	sub r12, r12, #'a'
	718	cmp r12, #6
	719	addlo r1, r12, #10
	720
	721	eor r0, r0, #0x20
	722
	723	#elif defined(__aarch64__)
	724
	725	// unrelated playing
	726	mov x1, #-1
	727	sub w16, w0, #'0'
	728	cmp w16, #10
	729	cmov.lo x1, x16
	730	orr w16, w0, #0x20
	731	sub w16, w16, #'a' - 10
	732	cmp w16, #10
	733	ccmp.hs w16, #16, #CCMP_HS
	734	cmov.lo x1, x16
	735
	736	eor w0, w0, #0x20
	737
	738	#else
	739	notimpl
	740	#endif
	741
	742	ret
	743
	744	endproc
	745
	746	proc x05
	747
	748	// answer whether 5 <= a </<= 9.
	749
	750	#if defined(__x86_64__)
	751
	752	sub rax, 5 // a' = a - 5
	753	cmp rax, 4 // is a' - 5 </<= 4?
	754
	755	// cc a' a
	756	//
	757	// z/e a' = 4 a = 9
	758	// nz/ne a' /= 4 a /= 9
	759	//
	760	// a/nbe a' > 4 a > 9 or a < 5
	761	// nc/ae/nb a' >= 4 a >= 9 or a < 5
	762	// c/b/nae a' < 4 5 <= a < 9
	763	// be/na a' <= 4 5 <= a <= 9
	764	//
	765	// o a' < -2^63 + 4 -2^63 + 5 <= a < -2^63 + 9
	766	// no a' >= -2^63 + 4 a >= -2^63 + 9 or
	767	// a < -2^63 + 5
	768	// s -2^63 + 4 <= a' < 4 -2^63 + 9 <= a < 9
	769	// ns a' < -2^63 + 4 or a < -2^63 + 9 or a >= 9
	770	// a' >= 4
	771	// ge/nl a' >= 4 a >= 9 or a < -2^63 + 5
	772	// l/nge a' < 4 -2^63 + 5 <= a < 9
	773	// g/nle a' > 4 a > 9 or a < -2^63 + 5
	774	// le/ng a' <= 4 -2^63 + 5 <= a <= 9
	775
	776	#elif defined(__i386__)
	777
	778	sub eax, 5
	779	cmp eax, 4
	780
	781	#elif defined(__arm__)
	782
	783	// i dimly remember having a slick way to do this way back in the
	784	// day, but i can't figure it out any more.
	785	sub r0, #5
	786	cmp r0, #4
	787
	788	#elif defined(__aarch64__)
	789
	790	// literal translation is too obvious
	791	cmp x0, #5
	792	ccmp.hs x0, #9, #CCMP_HS
	793
	794	#else
	795	notimpl
	796	#endif
	797
	798	ret
	799
	800	endproc
	801
	802	proc x06
	803
	804	// leave a unchanged, but set zf if a = 0, cf if a /= 0, clear of,
	805	// set sf to msb(a)
	806
	807	#if defined(__x86_64__)
	808
	809	not rax // a' = -a - 1
	810	inc rax // a' = -a
	811	neg rax // a' = a
	812
	813	#elif defined(__i386__)
	814
	815	not eax
	816	inc eax
	817	neg eax
	818
	819	#elif defined(__arm__)
	820
	821	mvn r0, r0
	822	add r0, r0, #1
	823	rsbs r0, r0, #0 // cf has opposite sense
	824
	825	#elif defined(__aarch64__)
	826
	827	mvn x0, x0
	828	add x0, x0, #1
	829	negs x0, x0 // cf has opposite sense
	830
	831	#else
	832	notimpl
	833	#endif
	834
	835	ret
	836
	837	endproc
	838
	839	proc x07
	840
	841	// same as before (?)
	842
	843	#if defined(__x86_64__)
	844
	845	inc rax // a' = a + 1
	846	neg rax // a' = -a - 1
	847	inc rax // a' = -a
	848	neg rax // a' = a
	849
	850	#elif defined(__i386__)
	851
	852	inc eax
	853	neg eax
	854	inc eax
	855	neg eax
	856
	857	#elif defined(__arm__)
	858
	859	add r0, r0, #1
	860	rsb r0, r0, #0
	861	add r0, r0, #1
	862	rsbs r0, r0, #0
	863
	864	#elif defined(__aarch64__)
	865
	866	add x0, x0, #1
	867	neg x0, x0
	868	add x0, x0, #1
	869	negs x0, x0 // cf has opposite sense
	870
	871	#else
	872	notimpl
	873	#endif
	874
	875	ret
	876
	877	endproc
	878
	879	proc x08
	880
	881	// floor((a + d)/2), correctly handling overflow conditions; final cf
	882	// is lsb(a + d), probably uninteresting
	883
	884	#if defined(__x86_64__)
	885
	886	add rax, rdx // cf \|\| a' = a + d
	887	rcr rax, 1 // shift 65-bit result right by one
	888	// place; lsb moves into carry
	889
	890	#elif defined(__i386__)
	891
	892	add eax, edx
	893	rcr eax, 1
	894
	895	#elif defined(__arm__)
	896
	897	// like the two-instruction a64 version
	898	sub r1, r3, r0
	899	add r1, r0, r1, lsr #1
	900
	901	// the slick version, similar to the above
	902	adds r0, r0, r3
	903	mov r0, r0, rrx
	904
	905	#elif defined(__aarch64__)
	906
	907	// a64 lacks a32's rrx. literal translation.
	908	adds x1, x0, x3 // cf \|\| a' = a + d
	909	adc x16, xzr, xzr // realize cf in extra register
	910	extr x1, x16, x1, #1 // shift down one place
	911
	912	// two instruction version: clobbers additional register. (if you
	913	// wanted the answer in any other register, even overwriting d, then
	914	// this is unnecessary.) also depends on d >= a.
	915	sub x16, x3, x0 // compute difference
	916	add x0, x0, x16, lsr #1 // add half of it (rounded down)
	917
	918	#else
	919	notimpl
	920	#endif
	921
	922	ret
	923
	924	endproc
	925
	926	proc x09
	927
	928	// a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3
	929	// (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8).
	930
	931	#if defined(__x86_64__)
	932
	933	shr rax, 3 // a' = floor(a/8); cf = 1 if a ==
	934	// 4, 5, 6, 7 (mod 8)
	935	adc rax, 0 // a' = floor(a/8) + cf
	936
	937	#elif defined(__i386__)
	938
	939	shr eax, 3
	940	adc eax, 0
	941
	942	#elif defined(__arm__)
	943
	944	movs r0, r0, lsr #3
	945	adc r0, r0, #0
	946
	947	#elif defined(__aarch64__)
	948
	949	tst x0, #4
	950	orr x0, xzr, x0, lsr #3
	951	cinc.ne x0, x0
	952
	953	#else
	954	notimpl
	955	#endif
	956
	957	ret
	958
	959	endproc
	960
	961	proc x0a
	962
	963	// increment c-byte little-endian bignum at rdi
	964
	965	#if defined(__x86_64__)
	966
	967	add byte ptr [rdi], 1
	968	0: inc rdi
	969	adc byte ptr [rdi], 0
	970	loop 0b
	971
	972	#elif defined(__i386__)
	973
	974	add byte ptr [edi], 1
	975	0: inc edi
	976	adc byte ptr [edi], 0
	977	loop 0b
	978
	979	#elif defined(__arm__)
	980
	981	mov r12, #256 // set initial carry
	982	0: ldrb r0, [r5]
	983	subs r2, r2, #1
	984	add r12, r0, r12, lsr #8
	985	strb r12, [r5], #1
	986	bne 0b
	987
	988	#elif defined(__aarch64__)
	989
	990	mov w17, #256 // set initial carry
	991	0: ldrb w16, [x5]
	992	sub x2, x2, #1
	993	add w17, w16, w17, lsr #8
	994	strb w17, [x5], #1
	995	cbnz x2, 0b
	996
	997	#else
	998	notimpl
	999	#endif
	1000
	1001	ret
	1002
	1003	endproc
	1004
	1005	proc x0b
	1006
	1007	// negate double-precision d:a
	1008
	1009	#if defined(__x86_64__)
	1010
	1011	not rdx // d' = -d - 1
	1012	neg rax // a' = -a;
	1013	// cf = 1 iff a /= 0
	1014	sbb rdx, -1 // d' = -d - cf
	1015
	1016	#elif defined(__i386__)
	1017
	1018	not edx
	1019	neg eax
	1020	sbb edx, -1
	1021
	1022	#elif defined(__arm__)
	1023
	1024	// reverse subtract is awesome
	1025	rsbs r0, r0, #0
	1026	rsc r3, r3, #0
	1027
	1028	#elif defined(__aarch64__)
	1029
	1030	// easy way: everything is better with zero registers.
	1031	negs x0, x0
	1032	ngc x3, x3
	1033
	1034	#else
	1035	notimpl
	1036	#endif
	1037
	1038	ret
	1039
	1040	endproc
	1041
	1042	proc x0c
	1043
	1044	// rotate is distributive over xor.
	1045
	1046	#if defined(__x86_64__)
	1047
	1048	// rax // = a_1 \|\| a_0
	1049	// rbx // = b_1 \|\| b_0
	1050	mov rcx, rax // = a_1 \|\| a_0
	1051
	1052	xor rcx, rbx // = (a_1 XOR b_1) \|\| (a_0 XOR b_0)
	1053	ror rcx, 0xd // = (a_0 XOR b_0) \|\| (a_1 XOR b_1)
	1054
	1055	ror rax, 0xd // = a_0 \|\| a_1
	1056	ror rbx, 0xd // = b_0 \|\| b_1
	1057	xor rax, rbx // = (a_0 XOR b_0) \|\| (a_1 XOR b_1)
	1058
	1059	cmp rax, rcx // always equal
	1060
	1061	#elif defined(__i386__)
	1062
	1063	mov ecx, eax // = a_1 \|\| a_0
	1064
	1065	xor ecx, ebx // = (a_1 XOR b_1) \|\| (a_0 XOR b_0)
	1066	ror ecx, 0xd // = (a_0 XOR b_0) \|\| (a_1 XOR b_1)
	1067
	1068	ror eax, 0xd // = a_0 \|\| a_1
	1069	ror ebx, 0xd // = b_0 \|\| b_1
	1070	xor eax, ebx // = (a_0 XOR b_0) \|\| (a_1 XOR b_1)
	1071
	1072	cmp eax, ecx // always equal
	1073
	1074	#elif defined(__arm__)
	1075
	1076
	1077	// r0 // = a_1 \|\| a_0
	1078	// r1 // = b_1 \|\| b_0
	1079	eor r2, r0, r1 // = (a_1 XOR b_1) \|\| (a_0 XOR b_0)
	1080	mov r2, r2, ror #13 // = (a_0 XOR b_0) \|\| (a_1 XOR b_1)
	1081
	1082	mov r1, r1, ror #13 // = b_0 \|\| b_1
	1083	eor r0, r1, r0, ror #13 // = (a_0 XOR b_0) \|\| (a_1 XOR b_1)
	1084
	1085	cmp r0, r2 // always equal
	1086
	1087	#elif defined(__aarch64__)
	1088
	1089	// x0 // = a_1 \|\| a_0
	1090	// x1 // = b_1 \|\| b_0
	1091	eor x2, x0, x1 // = (a_1 XOR b_1) \|\| (a_0 XOR b_0)
	1092	ror x2, x2, #13 // = (a_0 XOR b_0) \|\| (a_1 XOR b_1)
	1093
	1094	ror x1, x1, #13 // = b_0 \|\| b_1
	1095	eor x0, x1, x0, ror #13 // = (a_0 XOR b_0) \|\| (a_1 XOR b_1)
	1096
	1097	cmp x0, x2 // always equal
	1098
	1099	#else
	1100	notimpl
	1101	#endif
	1102
	1103	ret
	1104
	1105	endproc
	1106
	1107	proc x0d
	1108
	1109	// and is distributive over xor.
	1110
	1111	#if defined(__x86_64__)
	1112
	1113	mov rdx, rbx // = b
	1114
	1115	xor rbx, rcx // = b XOR c
	1116	and rbx, rax // = a AND (b XOR c)
	1117
	1118	and rdx, rax // = a AND b
	1119	and rax, rcx // = a AND c
	1120	xor rax, rdx // = (a AND b) XOR (a AND c)
	1121	// = a AND (b XOR c)
	1122
	1123	cmp rax, rbx // always equal
	1124
	1125	#elif defined(__i386__)
	1126
	1127	mov edx, ebx // = b
	1128
	1129	xor ebx, ecx // = b XOR c
	1130	and ebx, eax // = a AND (b XOR c)
	1131
	1132	and edx, eax // = a AND b
	1133	and eax, ecx // = a AND c
	1134	xor eax, edx // = (a AND b) XOR (a AND c)
	1135	// = a AND (b XOR c)
	1136
	1137	cmp eax, ebx // always equal
	1138
	1139	#elif defined(__arm__)
	1140
	1141	and r3, r0, r1 // = a AND b
	1142
	1143	eor r1, r1, r2 // = b XOR c
	1144	and r1, r1, r0 // = a AND (b XOR c)
	1145
	1146	and r0, r0, r2 // = a AND c
	1147	eor r0, r0, r3 // = (a AND b) XOR (a AND c)
	1148	// = a AND (b XOR c)
	1149
	1150	cmp r0, r1 // always equal
	1151
	1152	#elif defined(__aarch64__)
	1153
	1154	and x3, x0, x1 // = a AND b
	1155
	1156	eor x1, x1, x2 // = b XOR c
	1157	and x1, x1, x0 // = a AND (b XOR c)
	1158
	1159	and x0, x0, x2 // = a AND c
	1160	eor x0, x0, x3 // = (a AND b) XOR (a AND c)
	1161	// = a AND (b XOR c)
	1162
	1163	cmp x0, x1 // always equal
	1164
	1165	#else
	1166	notimpl
	1167	#endif
	1168
	1169	ret
	1170
	1171	endproc
	1172
	1173	proc x0e
	1174
	1175	// de morgan's law
	1176
	1177	#if defined(__x86_64__)
	1178
	1179	mov rcx, rax // = a
	1180
	1181	and rcx, rbx // = a AND b
	1182	not rcx // = NOT (a AND b)
	1183
	1184	not rax // = NOT a
	1185	not rbx // = NOT b
	1186	or rax, rbx // = (NOT a) OR (NOT b)
	1187	// = NOT (a AND b)
	1188
	1189	cmp rax, rcx // always equal
	1190
	1191	#elif defined(__i386__)
	1192
	1193	mov ecx, eax // = a
	1194
	1195	and ecx, ebx // = a AND b
	1196	not ecx // = NOT (a AND b)
	1197
	1198	not eax // = NOT a
	1199	not ebx // = NOT b
	1200	or eax, ebx // = (NOT a) OR (NOT b)
	1201	// = NOT (a AND b)
	1202
	1203	cmp eax, ecx // always equal
	1204
	1205	#elif defined(__arm__)
	1206
	1207	and r2, r0, r1 // = a AND b
	1208	mvn r2, r2 // = NOT (a AND b)
	1209
	1210	mvn r0, r0 // = NOT a
	1211	mvn r1, r1 // = NOT b
	1212	orr r0, r0, r1 // = (NOT a) OR (NOT b)
	1213
	1214	cmp r0, r2 // always equal
	1215
	1216	#elif defined(__aarch64__)
	1217
	1218	and x2, x0, x1 // = a AND b
	1219	mvn x2, x2 // = NOT (a AND b)
	1220
	1221	mvn x0, x0 // = NOT a
	1222	orn x0, x0, x1 // = (NOT a) OR (NOT b)
	1223
	1224	cmp x0, x2 // always equal
	1225
	1226	#else
	1227	notimpl
	1228	#endif
	1229
	1230	ret
	1231
	1232	endproc
	1233
	1234	proc x0f
	1235
	1236	// replace input buffer bytes with cumulative XORs with initial a;
	1237	// final a is XOR of all buffer bytes and initial a.
	1238	//
	1239	// not sure why you'd do this.
	1240
	1241	#if defined(__x86_64__)
	1242
	1243	0: xor [rsi], al
	1244	lodsb
	1245	loop 0b
	1246
	1247	#elif defined(__i386__)
	1248
	1249	0: xor [esi], al
	1250	lodsb
	1251	loop 0b
	1252
	1253	#elif defined(__arm__)
	1254
	1255	0: ldrb r12, [r4]
	1256	subs r2, r2, #1
	1257	eor r0, r0, r12
	1258	strb r0, [r4], #1
	1259	bne 0b
	1260
	1261	#elif defined(__aarch64__)
	1262
	1263	0: ldrb w16, [x4]
	1264	sub x2, x2, #1
	1265	eor w0, w0, w16
	1266	strb w0, [x4], #1
	1267	cbnz x2, 0b
	1268
	1269	#else
	1270	notimpl
	1271	#endif
	1272
	1273	ret
	1274
	1275	endproc
	1276
	1277	///--------------------------------------------------------------------------
	1278	/// 0x10--0x1f
	1279
	1280	proc x10
	1281
	1282	// four different ways to swap a pair of registers.
	1283
	1284	#if defined(__x86_64__)
	1285
	1286	push rax
	1287	push rcx
	1288	pop rax
	1289	pop rcx
	1290
	1291	xor rax, rcx
	1292	xor rcx, rax
	1293	xor rax, rcx
	1294
	1295	add rax, rcx
	1296	sub rcx, rax
	1297	add rax, rcx
	1298	neg rcx
	1299
	1300	xchg rax, rcx
	1301
	1302	#elif defined(__i386__)
	1303
	1304	push eax
	1305	push ecx
	1306	pop eax
	1307	pop ecx
	1308
	1309	xor eax, ecx
	1310	xor ecx, eax
	1311	xor eax, ecx
	1312
	1313	add eax, ecx
	1314	sub ecx, eax
	1315	add eax, ecx
	1316	neg ecx
	1317
	1318	xchg eax, ecx
	1319
	1320	#elif defined(__arm__)
	1321
	1322	stmfd r13!, {r0, r2}
	1323	ldr r0, [r13, #4]
	1324	ldr r2, [r13], #8
	1325
	1326	eor r0, r0, r2
	1327	eor r2, r2, r0
	1328	eor r0, r0, r2
	1329
	1330	sub r0, r0, r2
	1331	add r2, r2, r0
	1332	rsb r0, r0, r2 // don't need 3-addr with reverse-sub
	1333
	1334	mov r12, r0
	1335	mov r0, r2
	1336	mov r2, r0
	1337
	1338	#elif defined(__aarch64__)
	1339
	1340	// anything you can do
	1341	stp x0, x2, [sp, #-16]!
	1342	ldp x2, x0, [sp], #16
	1343
	1344	eor x0, x0, x2
	1345	eor x2, x2, x0
	1346	eor x0, x0, x2
	1347
	1348	// the add/sub/add thing was daft. you can do it in three if you're
	1349	// clever -- and have three-address operations.
	1350	sub x0, x0, x2
	1351	add x2, x2, x0
	1352	sub x0, x2, x0
	1353
	1354	// but we lack a fourth. we can't do this in fewer than three
	1355	// instructions without hitting memory. only `ldp' will modify two
	1356	// registers at a time, so we need at least two instructions -- but
	1357	// if the first one sets one of our two registers to its final value
	1358	// then we lose the other input value with no way to recover it, so
	1359	// we must either write a fresh third register, or write something
	1360	// other than the final value, and in both cases we need a third
	1361	// instruction to fix everything up. we've done the wrong-something-
	1362	// other trick twice, so here's the captain-obvious use-a-third-
	1363	// register version.
	1364	mov x16, x0
	1365	mov x0, x2
	1366	mov x2, x16
	1367
	1368	#else
	1369	notimpl
	1370	#endif
	1371
	1372	ret
	1373
	1374	endproc
	1375
	1376	proc x11
	1377
	1378	// assuming a is initialized to zero, set a to the inclusive or of
	1379	// the xor-differences of corresponding bytes in the c-byte strings
	1380	// at si and di.
	1381	//
	1382	// in particular, a will be zero (and zf set) if and only if the two
	1383	// strings are equal.
	1384
	1385	#if defined(__x86_64__)
	1386
	1387	0: mov dl, [rsi]
	1388	xor dl, [rdi]
	1389	inc rsi
	1390	inc rdi
	1391	or al, dl
	1392	loop 0b
	1393
	1394	#elif defined(__i386__)
	1395
	1396	0: mov dl, [esi]
	1397	xor dl, [edi]
	1398	inc esi
	1399	inc edi
	1400	or al, dl
	1401	loop 0b
	1402
	1403	#elif defined(__arm__)
	1404
	1405	0: ldrb r1, [r4], #1
	1406	ldrb r12, [r5], #1
	1407	subs r2, r2, #1
	1408	eor r12, r12, r1
	1409	orr r0, r0, r12
	1410	bne 0b
	1411
	1412	#elif defined(__aarch64__)
	1413
	1414	0: ldrb w16, [x4], #1
	1415	ldrb w17, [x5], #1
	1416	sub x2, x2, #1
	1417	eor w16, w16, w17
	1418	orr w0, w0, w16
	1419	cbnz x2, 0b
	1420
	1421	#else
	1422	notimpl
	1423	#endif
	1424
	1425	ret
	1426
	1427	endproc
	1428
	1429	proc x12
	1430
	1431	// an obtuse way of adding two registers. for any bit position, a
	1432	// OR d is set if and only if at least one of a and d has a bit set
	1433	// in that position, and a AND d is set if and only if both have a
	1434	// bit set in that position. essentially, then, what we've done is
	1435	// move all of the set bits in d to a, unless there's already a bit
	1436	// there. this clearly doesn't change the sum.
	1437
	1438	#if defined(__x86_64__)
	1439
	1440	mov rcx, rdx // c' = d
	1441	and rdx, rax // d' = a AND d
	1442	or rax, rcx // a' = a OR d
	1443	add rax, rdx
	1444
	1445	#elif defined(__i386__)
	1446
	1447	mov ecx, edx // c' = d
	1448	and edx, eax // d' = a AND d
	1449	or eax, ecx // a' = a OR d
	1450	add eax, edx
	1451
	1452	#elif defined(__arm__)
	1453
	1454	and r2, r0, r3 // c' = a AND d
	1455	orr r0, r0, r3 // a' = a OR d
	1456	add r0, r0, r2
	1457
	1458	#elif defined(__aarch64__)
	1459
	1460	and x2, x0, x3 // c' = a AND d
	1461	orr x0, x0, x3 // a' = a OR d
	1462	add x0, x0, x2
	1463
	1464	#else
	1465	notimpl
	1466	#endif
	1467
	1468	ret
	1469
	1470	endproc
	1471
	1472	proc x13
	1473
	1474	// ok, so this is a really obtuse way of adding a and b; the result
	1475	// is in a and d. but why does it work?
	1476
	1477	#if defined(__x86_64__)
	1478
	1479	mov rcx, 0x40 // carry chains at most 64 long
	1480	0: mov rdx, rax // copy a'
	1481	xor rax, rbx // low bits of each bitwise sum
	1482	and rbx, rdx // carry bits from each bitwise sum
	1483	shl rbx, 1 // carry them into next position
	1484	loop 0b
	1485
	1486	#elif defined(__i386__)
	1487
	1488	mov ecx, 0x40 // carry chains at most 64 long
	1489	0: mov edx, eax // copy a'
	1490	xor eax, ebx // low bits of each bitwise sum
	1491	and ebx, edx // carry bits from each bitwise sum
	1492	shl ebx, 1 // carry them into next position
	1493	loop 0b
	1494
	1495	#elif defined(__arm__)
	1496
	1497	mov r2, #0x40
	1498	0: and r3, r0, r1
	1499	subs r2, r2, #1
	1500	eor r0, r0, r1
	1501	lsl r1, r3, #1
	1502	bne 0b
	1503
	1504	#elif defined(__aarch64__)
	1505
	1506	mov x2, #0x40
	1507	0: and x3, x0, x1
	1508	sub x2, x2, #1
	1509	eor x0, x0, x1
	1510	lsl x1, x3, #1
	1511	cbnz x2, 0b
	1512
	1513	#else
	1514	notimpl
	1515	#endif
	1516
	1517	ret
	1518
	1519	endproc
	1520
	1521	proc x14
	1522
	1523	// floor((a + d)/2), like x08. uses a + d = 2 (a AND d) + (a XOR d), so the full sum a + d is never formed and cannot overflow.
	1524
	1525	#if defined(__x86_64__)
	1526
	1527	mov rcx, rax // copy a for later
	1528	and rcx, rdx // carry bits
	1529
	1530	xor rax, rdx // low bits of each bitwise sum
	1531	shr rax, 1 // divide by 2; carries now in place
	1532
	1533	add rax, rcx // add the carries; done
	1534
	1535	#elif defined(__i386__)
	1536
	1537	mov ecx, eax // copy a for later
	1538	and ecx, edx // carry bits
	1539
	1540	xor eax, edx // low bits of each bitwise sum
	1541	shr eax, 1 // divide by 2; carries now in place
	1542
	1543	add eax, ecx // add the carries; done
	1544
	1545	#elif defined(__arm__)
	1546
	1547	and r2, r0, r3 // carry bits
	1548	eor r0, r0, r3 // low bits of each bitwise sum
	1549	add r0, r2, r0, lsr #1 // halve the low bits and add the carries; done
	1550
	1551	#elif defined(__aarch64__)
	1552
	1553	and x2, x0, x3 // carry bits
	1554	eor x0, x0, x3 // low bits of each bitwise sum
	1555	add x0, x2, x0, lsr #1 // halve the low bits and add the carries; done
	1556
	1557	#else
	1558	notimpl
	1559	#endif
	1560
	1561	ret
	1562
	1563	endproc
	1564
	1565	proc x15
	1566
	1567	// sign extension 32 -> 64 bits (16 -> 32 bits on the 32-bit targets).
	1568
	1569	#if defined(__x86_64__)
	1570
	1571	movsx rbx, eax // like this?
	1572
	1573	mov rdx, 0xffffffff80000000 // the trick below assumes bits 32--63 of a start out clear
	1574	add rax, rdx // if bit 31 of a is set then bits
	1575	// 31--63 of a' are clear; otherwise,
	1576	// these bits are all set -- which is
	1577	// exactly backwards
	1578	xor rax, rdx // so fix it
	1579
	1580	#elif defined(__i386__)
	1581
	1582	movsx ebx, ax // like this?
	1583
	1584	mov edx, 0xffff8000 // the trick below assumes bits 16--31 of a start out clear
	1585	add eax, edx // if bit 15 of a is set then bits
	1586	// 15--31 of a' are clear; otherwise,
	1587	// these bits are all set -- which is
	1588	// exactly backwards
	1589	xor eax, edx // so fix it
	1590
	1591	#elif defined(__arm__)
	1592
	1593	sxth r1, r0 // like this
	1594
	1595	mov r12, #0x80000000
	1596	add r0, r0, r12, asr #16 // r12 asr 16 = 0xffff8000, the i386 constant
	1597	eor r0, r0, r12, asr #16 // flip bits 15--31 back the right way
	1598
	1599	#elif defined(__aarch64__)
	1600
	1601	sxtw x1, w0 // like this
	1602
	1603	mov x16, #0xffffffff80000000 // same constant as the x86-64 version
	1604	add x0, x0, x16 // bits 31--63 come out inverted...
	1605	eor x0, x0, x16 // ... so fix them
	1606
	1607	#else
	1608	notimpl
	1609	#endif
	1610
	1611	ret
	1612
	1613	endproc
	1614
	1615	proc x16
	1616
	1617	// ??? i don't know why you'd want to calculate this. the only outputs are a' and the flags from the final comparison of a' against t.
	1618
	1619	#if defined(__x86_64__)
	1620
	1621	xor rax, rbx // a' = a XOR b
	1622	xor rbx, rcx // b' = b XOR c
	1623	mov rsi, rax // t = a XOR b
	1624	add rsi, rbx // t = (a XOR b) + (b XOR c)
	1625	cmovc rax, rbx // a' = cf ? b XOR c : a XOR b
	1626	xor rax, rbx // a' = cf ? 0 : a XOR c
	1627	cmp rax, rsi // leave flags for a' - t
	1628
	1629	#elif defined(__i386__)
	1630
	1631	xor eax, ebx // a' = a XOR b
	1632	xor ebx, ecx // b' = b XOR c
	1633	mov esi, eax // t = a XOR b
	1634	add esi, ebx // t = (a XOR b) + (b XOR c)
	1635	cmovc eax, ebx // a' = cf ? b XOR c : a XOR b
	1636	xor eax, ebx // a' = cf ? 0 : a XOR c
	1637	cmp eax, esi // leave flags for a' - t
	1638
	1639	#elif defined(__arm__)
	1640
	1641	eor r0, r0, r1 // a' = a XOR b
	1642	eor r1, r1, r2 // b' = b XOR c
	1643	adds r4, r0, r1 // t = (a XOR b) + (b XOR c), setting cf
	1644	movcs r0, r1 // a' = cf ? b XOR c : a XOR b
	1645	eor r0, r0, r1 // a' = cf ? 0 : a XOR c
	1646	cmp r0, r4 // leave flags for a' - t
	1647
	1648	#elif defined(__aarch64__)
	1649
	1650	eor x0, x0, x1 // a' = a XOR b
	1651	eor x1, x1, x2 // b' = b XOR c
	1652	adds x4, x0, x1 // t = (a XOR b) + (b XOR c), setting cf
	1653	cmov.cs x0, x1 // a' = cf ? b XOR c : a XOR b
	1654	eor x0, x0, x1 // a' = cf ? 0 : a XOR c
	1655	cmp x0, x4 // leave flags for a' - t
	1656
	1657	#else
	1658	notimpl
	1659	#endif
	1660
	1661	ret
	1662
	1663	endproc
	1664
	1665	proc x17
	1666
	1667	// absolute value of a, branch-free: xor with the sign mask, then subtract the mask.
	1668
	1669	#if defined(__x86_64__)
	1670
	1671	cqo // d = a < 0 ? -1 : 0
	1672	xor rax, rdx // a' = a < 0 ? -a - 1 : a
	1673	sub rax, rdx // a' = a < 0 ? -a : a
	1674
	1675	#elif defined(__i386__)
	1676
	1677	cdq // d = a < 0 ? -1 : 0
	1678	xor eax, edx // a' = a < 0 ? -a - 1 : a
	1679	sub eax, edx // a' = a < 0 ? -a : a
	1680
	1681	#elif defined(__arm__)
	1682
	1683	// direct approach
	1684	movs r1, r0 // b = a, setting the n flag if a < 0
	1685	rsbmi r1, r0, #0 // b = 0 - a if a was negative
	1686
	1687	// faithful-ish conversion
	1688	eor r3, r0, r0, asr #31 // t = a < 0 ? -a - 1 : a
	1689	sub r0, r3, r0, asr #31 // a' = a < 0 ? -a : a
	1690
	1691	#elif defined(__aarch64__)
	1692
	1693	// direct approach
	1694	tst x0, #1 << 63 // test the sign bit of a
	1695	cneg.ne x1, x0 // b = a < 0 ? -a : a
	1696
	1697	// faithful-ish conversion
	1698	eor x3, x0, x0, asr #63 // t = a < 0 ? -a - 1 : a
	1699	sub x0, x3, x0, asr #63 // a' = a < 0 ? -a : a
	1700
	1701	#else
	1702	notimpl
	1703	#endif
	1704
	1705	ret
	1706
	1707	endproc
	1708
	1709	proc x18
	1710
	1711	// read the cycle counter twice and subtract the second reading from the first, for the flags only. should always set sf, clear zf, unless we get rescheduled to a
	1712	// different core.
	1713
	1714	#if defined(__x86_64__)
	1715
	1716	rdtsc // d \|\| a = cycles
	1717	shl rdx, 0x20
	1718	or rax, rdx // a = cycles
	1719	mov rcx, rax // c = cycles
	1720
	1721	rdtsc // d \|\| a = cycles'
	1722	shl rdx, 0x20
	1723	or rax, rdx // a = cycles'
	1724
	1725	cmp rcx, rax // flags from cycles - cycles'
	1726
	1727	#elif defined(__i386__)
	1728
	1729	rdtsc // d \|\| a = cycles
	1730	mov ebx, eax
	1731	mov ecx, edx // c \|\| b = cycles
	1732
	1733	rdtsc // d \|\| a = cycles'
	1734
	1735	sub ebx, eax // low half of cycles - cycles'
	1736	sbb ecx, edx // high half, with borrow; flags reflect this half
	1737
	1738	#elif defined(__arm__)
	1739
	1740	// cycle clock not available in user mode
	1741	mrrc p15, 0, r0, r1, c9 // r1 \|\| r0 = first reading
	1742	mrrc p15, 0, r2, r3, c9 // r3 \|\| r2 = second reading
	1743	subs r0, r0, r2 // low half of first - second
	1744	sbcs r1, r1, r3 // high half, with borrow
	1745
	1746	#elif defined(__aarch64__)
	1747
	1748	// cycle clock not available in user mode
	1749	mrs x0, pmccntr_el0 // first reading
	1750	mrs x1, pmccntr_el0 // second reading
	1751	cmp x0, x1 // flags from first - second
	1752
	1753	#else
	1754	notimpl
	1755	#endif
	1756
	1757	ret
	1758
	1759	endproc
	1760
	1761	proc x19
	1762
	1763	// stupid way to capture a pointer to inline data and jump past it: the `call' (or `bl') over the string leaves the string's address as the return address.
	1764	// confuses the return-address predictor something chronic. worse
	1765	// because amd64 calling convention doesn't usually pass arguments on
	1766	// the stack.
	1767
	1768	#if defined(__x86_64__)
	1769
	1770	call 8f // pushes the address of the string below
	1771	.string "hello world!\n\0"
	1772	8: call print_str
	1773	add rsp, 8 // discard the string pointer
	1774	ret
	1775
	1776	print_str:
	1777	// actually implement this ridiculous thing
	1778	mov rsi, [rsp + 8] // string pointer sits just above our return address
	1779	xor edx, edx // d = byte index, starting at 0
	1780	0: mov al, [rsi + rdx] // fetch the next byte
	1781	inc rdx
	1782	cmp al, 0 // terminator?
	1783	jnz 0b // no -- keep scanning
	1784	mov eax, SYS_write
	1785	mov edi, 1 // file descriptor 1
	1786	dec rdx // the index overshot the terminator by one; now d = length
	1787	syscall // clobbers r11 :-(
	1788	ret
	1789
	1790	#elif defined(__i386__)
	1791
	1792	call 8f // pushes the address of the string below
	1793	.string "hello world!\n\0"
	1794	8: call print_str
	1795	add esp, 4 // discard the string pointer
	1796	ret
	1797
	1798	print_str:
	1799	// actually implement this ridiculous thing
	1800	mov ecx, [esp + 4] // string pointer sits just above our return address
	1801	xor edx, edx // d = byte index, starting at 0
	1802	0: mov al, [ecx + edx] // fetch the next byte
	1803	inc edx
	1804	cmp al, 0 // terminator?
	1805	jnz 0b // no -- keep scanning
	1806	mov eax, SYS_write
	1807	mov ebx, 1 // file descriptor 1
	1808	dec edx // the index overshot the terminator by one; now d = length
	1809	int 0x80
	1810	ret
	1811
	1812	#elif defined(__arm__)
	1813
	1814	// why am i doing this?
	1815	stmfd r13!, {r14} // save our own return address; bl overwrites r14
	1816	bl 8f // r14 = address of the string below
	1817	.string "hello world!\n\0"
	1818	.balign 4
	1819	8: mov r1, r14 // might as well make it easy on myself
	1820	bl print_str
	1821	ldmfd r13!, {pc} // return via the saved address
	1822
	1823	print_str:
	1824	mov r2, #0 // r2 = byte index, and finally the length
	1825	0: ldrb r0, [r1, r2] // fetch the next byte
	1826	cmp r0, #0 // terminator?
	1827	addne r2, r2, #1 // no -- count it...
	1828	bne 0b // ... and keep scanning
	1829	mov r0, #1 // file descriptor 1
	1830	mov r7, #SYS_write
	1831	swi 0
	1832	bx r14
	1833
	1834	#elif defined(__aarch64__)
	1835
	1836	// why am i doing this?
	1837	str x30, [sp, #-16]! // save our own return address; bl overwrites x30
	1838	bl 8f // x30 = address of the string below
	1839	.string "hello world!\n\0"
	1840	.balign 4
	1841	8: mov x1, x30 // might as well make it easy on myself
	1842	bl print_str
	1843	ldr x30, [sp], #16 // recover our return address
	1844	ret
	1845
	1846	print_str:
	1847	mov x2, #0 // x2 = byte index, and finally the length
	1848	0: ldrb w0, [x1, x2] // fetch the next byte
	1849	cmp w0, #0 // terminator?
	1850	cinc.ne x2, x2 // no -- count it...
	1851	b.ne 0b // ... and keep scanning
	1852	mov x0, #1 // file descriptor 1
	1853	mov x8, #SYS_write
	1854	svc #0
	1855	ret
	1856
	1857	#else
	1858	notimpl
	1859	#endif
	1860
	1861	endproc
	1862
	1863	proc x1a
	1864
	1865	// collect the current instruction-pointer address. this was an old
	1866	// 32-bit i386 trick for position-independent code, but (a) it
	1867	// confuses the return predictor, and (b) amd64 has true pc-relative
	1868	// addressing. each variant below leaves the address of label 0 in a different register.
	1869
	1870	#if defined(__x86_64__)
	1871
	1872	// the actual example
	1873	call 0f // push the address of 0...
	1874	0: pop rax // ... and collect it
	1875
	1876	// the modern i386 trick doesn't confuse the return-address
	1877	// predictor.
	1878	call calladdr_rbx // b = address of the next instruction
	1879	sub rbx, . - 0b // back up to label 0
	1880
	1881	// but rip-relative addressing is even better
	1882	lea rcx, [rip + 0b]
	1883
	1884	ret
	1885
	1886	calladdr_rbx:
	1887	mov rbx, [rsp] // fetch our own return address
	1888	ret
	1889
	1890	#elif defined(__i386__)
	1891
	1892	// the actual example
	1893	call 0f // push the address of 0...
	1894	0: pop eax // ... and collect it
	1895
	1896	// the modern i386 trick doesn't confuse the return-address
	1897	// predictor.
	1898	call get_pc_ebx // helper defined elsewhere; presumably b = its return address, like calladdr_rbx above
	1899	sub ebx, . - 0b // back up to label 0
	1900
	1901	ret
	1902
	1903	#elif defined(__arm__)
	1904
	1905	stmfd r13!, {r14} // save our return address; the bl instructions overwrite r14
	1906
	1907	bl 0f // r14 = address of label 0
	1908	0: mov r0, r14
	1909
	1910	bl return // call and come straight back; r14 = address after this bl
	1911	sub r1, r14, #. - 0b // back up to label 0
	1912
	1913	adr r2, 0b // pc-relative; no branch needed
	1914
	1915	ldmfd r13!, {pc}
	1916
	1917	return: bx r14
	1918
	1919	#elif defined(__aarch64__)
	1920
	1921	str x30, [sp, #-16]! // save our return address; the bl instructions overwrite x30
	1922
	1923	// we can do all of the above using a64
	1924	bl 0f // x30 = address of label 0
	1925	0: mov x0, x30
	1926
	1927	bl return // call and come straight back; x30 = address after this bl
	1928	sub x1, x30, #. - 0b // back up to label 0
	1929
	1930	adr x2, 0b // pc-relative; no branch needed
	1931
	1932	ldr x30, [sp], #16 // recover our return address and fall into the ret below
	1933	return: ret
	1934
	1935	#else
	1936	notimpl
	1937	#endif
	1938
	1939	endproc
	1940
	1941	proc x1b
	1942
	1943	#if defined(__x86_64__)
	1944
	1945	// retpolines: a mitigation against adversarially influenced
	1946	// speculative execution at indirect branches. if an adversary can
	1947	// prepare a branch-target buffer entry matching an indirect branch
	1948	// in the victim's address space then they can cause the victim to
	1949	// /speculatively/ (but not architecturally) execute any code in
	1950	// their address space, possibly leading to leaking secrets through
	1951	// the cache. retpolines aren't susceptible to this because the
	1952	// predicted destination address is from the return-prediction stack
	1953	// which the adversary can't prime. the performance penalty is still
	1954	// essentially a branch misprediction -- for this return, and
	1955	// possibly all others already stacked.
	1956
	1957	// (try not to crash)
	1958	lea rax, [rip + 9f] // target of the `indirect branch' is the ret below
	1959
	1960	push rax // plant it as a return address
	1961	9: ret // first time: `returns' to itself; second time: returns to our caller
	1962
	1963	#elif defined(__i386__)
	1964
	1965	call get_pc_ebx // helper defined elsewhere; the lea below relies on b = address of the next instruction
	1966	lea eax, [ebx + 9f - .] // a = address of the ret below
	1967
	1968	push eax // plant it as a return address
	1969	9: ret // first time: `returns' to itself; second time: returns to our caller
	1970
	1971	#elif defined(__arm__)
	1972
	1973	stmfd r13!, {r14} // save the real return address
	1974
	1975	adr r14, 8f // fake up a return address...
	1976	bx r14 // ... and `return' to it
	1977
	1978	8: ldmfd r13!, {pc} // now return for real
	1979
	1980	#elif defined(__aarch64__)
	1981
	1982	str x30, [sp, #-16]! // save the real return address
	1983
	1984	adr x30, 8f // fake up a return address...
	1985	ret // ... and `return' to it
	1986
	1987	8: ldr x30, [sp], #16 // recover the real return address
	1988	ret
	1989
	1990	#else
	1991	notimpl
	1992	#endif
	1993
	1994	endproc
	1995
	1996	proc x1c
	1997
	1998	// `pop' into the stack pointer itself. ok, having a hard time seeing a use for this. the most important
	1999	// thing to note is that sp is set from `pop' /after/ it's
	2000	// incremented.
	2001
	2002	#if defined(__x86_64__)
	2003
	2004	// try not to crash
	2005	mov rax, rsp // a = original stack pointer
	2006	and rsp, -16 // align the stack pointer down to 16 bytes
	2007	push rax // leave the original value on top of the stack
	2008
	2009	pop rsp // the example: sp' = popped value, i.e. the original sp
	2010
	2011	// check it worked
	2012	mov rbx, rsp // b should now equal a
	2013	ret
	2014
	2015	#elif defined(__i386__)
	2016
	2017	// try not to crash
	2018	mov eax, esp // a = original stack pointer
	2019	and esp, -16 // align the stack pointer down to 16 bytes
	2020	push eax // leave the original value on top of the stack
	2021
	2022	pop esp // the example: sp' = popped value, i.e. the original sp
	2023
	2024	// check it worked
	2025	mov ebx, esp // b should now equal a
	2026	ret
	2027
	2028	#elif defined(__arm__)
	2029
	2030	// not even going to dignify this
	2031	notimpl
	2032
	2033	#elif defined(__aarch64__)
	2034
	2035	// not even going to dignify this
	2036	notimpl
	2037
	2038	#else
	2039	notimpl
	2040	#endif
	2041
	2042	endproc
	2043
	2044	proc x1d
	2045
	2046	// monumentally cheesy way to copy 8 n bytes from buff1 to buff2, by abusing the display-copying behaviour of `enter' with a nonzero nesting level.
	2047	// also clobbers words at buff2 + 8 n and buff2 - 8 for good measure.
	2048
	2049	n = 4
	2050
	2051	#if defined(__x86_64__)
	2052
	2053	mov rax, rsp // safekeeping
	2054
	2055	// we're toast if we get hit by a signal now. fingers crossed...
	2056	.if 0
	2057	mov rsp, buff2 + 8*n + 8
	2058	mov rbp, buff1 + 8*n
	2059	.else
	2060	lea rsp, [rdi + 8*n + 16] // destination comes from rdi
	2061	lea rbp, [rsi + 8*n] // rbp = just past the n source words at rsi
	2062	.endif
	2063	enter 0, n + 1 // push rbp, copy n words from below rbp, push the new frame pointer
	2064
	2065	// precise action:
	2066	//
	2067	// +---------+ +---------+
	2068	// rbp -> \| ??? \| rsp -> \| ??? \|
	2069	// +---------+ +---------+
	2070	// \| w_{n-1} \| \| rbp \| <- rbp'
	2071	// +---------+ +---------+
	2072	// \| ... \| \| w_{n-1} \|
	2073	// +---------+ +---------+
	2074	// \| w_1 \| \| ... \|
	2075	// +---------+ +---------+
	2076	// \| w_0 \| \| w_1 \|
	2077	// +---------+ +---------+
	2078	// \| w_0 \|
	2079	// +---------+
	2080	// \| rbp' \| <- rsp'
	2081	// +---------+
	2082
	2083	mov rdx, rsp // d = final stack pointer, for inspection
	2084	mov rsp, rax // put the real stack pointer back
	2085
	2086	#elif defined(__i386__)
	2087
	2088	mov eax, esp // safekeeping
	2089
	2090	// we're toast if we get hit by a signal now. fingers crossed...
	2091	.if 0
	2092	mov esp, buff2 + 4*n + 4
	2093	mov ebp, buff1 + 4*n
	2094	.else
	2095	lea esp, [edi + 4*n + 8] // destination comes from edi
	2096	lea ebp, [esi + 4*n] // ebp = just past the n source words at esi
	2097	.endif
	2098	enter 0, n + 1 // push ebp, copy n words from below ebp, push the new frame pointer
	2099
	2100	mov edx, esp // d = final stack pointer, for inspection
	2101	mov esp, eax // put the real stack pointer back
	2102
	2103	#elif defined(__arm__)
	2104
	2105	add r4, r4, #4*n // r4 = just past the n source words
	2106	add r5, r5, #4*n + 8 // r5 plays the part of the stack pointer
	2107
	2108	str r4, [r5, #-4]! // imitate enter's initial push of the frame pointer
	2109	.rept n/2
	2110	ldrd r0, r1, [r4, #-8]! // copy two words at a time, working downwards
	2111	strd r0, r1, [r5, #-8]!
	2112	.endr
	2113	add r4, r5, #4*n // address of the word stored first, like rbp' above
	2114	str r4, [r5, #-4]! // imitate enter's final push of the new frame pointer
	2115
	2116	#elif defined(__aarch64__)
	2117
	2118	// omgwtf. let's not actually screw with the stack pointer.
	2119
	2120	add x4, x4, #8*n // x4 = just past the n source words
	2121	add x5, x5, #8*n + 16 // x5 plays the part of the stack pointer
	2122
	2123	str x4, [x5, #-8]! // imitate enter's initial push of the frame pointer
	2124	.rept n/2
	2125	ldp x16, x17, [x4, #-16]! // copy two words at a time, working downwards
	2126	stp x16, x17, [x5, #-16]!
	2127	.endr
	2128	add x4, x5, #8*n // address of the word stored first, like rbp' above
	2129	str x4, [x5, #-8]! // imitate enter's final push of the new frame pointer
	2130
	2131	#else
	2132	notimpl
	2133	#endif
	2134
	2135	ret
	2136
	2137	endproc
	2138
	2139	proc x1e
	2140
	2141	// convert nibble value to (uppercase) hex; other input values yield
	2142	// nonsense.
	2143
	2144	#if defined(__x86_64__)
	2145
	2146	// das doesn't work in 64-bit mode; best i can come up with
	2147	mov edx, eax // keep a second copy of a
	2148	add al, '0' // candidate digit `0'--`9'
	2149	add dl, 'A' - 10 // candidate letter `A'--`F'
	2150	cmp al, '9' + 1 // past `9'?
	2151	cmovae eax, edx // yes -- take the letter instead
	2152
	2153	#elif defined(__i386__)
	2154
	2155	cmp al, 0x0a // cf = 1 iff a < 10
	2156	sbb al, 0x69 // if 0 <= a < 10, a' = a - 0x6a, so
	2157	// 0x96 <= a' < 0xa0, setting af, cf
	2158	// if 10 <= a < 16, a' = a - 0x69, so
	2159	// 0xa1 <= a' < 0xa7, setting cf but
	2160	// clearing af
	2161	das // if 0 <= a < 10, then af and cf are
	2162	// both set, so subtract 0x66
	2163	// from a' leaving 0x30 <= a' < 0x3a;
	2164	// if 10 <= a < 16 then af clear but
	2165	// cf set, so subtract 0x60 from a'
	2166	// leaving 0x41 <= a' < 0x47
	2167
	2168	#elif defined(__arm__)
	2169
	2170	// significantly less tricksy
	2171	cmp r0, #10
	2172	addlo r0, r0, #'0' // a < 10: digit
	2173	addhs r0, r0, #'A' - 10 // a >= 10: letter
	2174
	2175	#elif defined(__aarch64__)
	2176
	2177	// with less versatile conditional execution this is the best we can
	2178	// do
	2179	cmp w0, #10
	2180	add w16, w0, #'A' - 10 // candidate letter
	2181	add w0, w0, #'0' // candidate digit
	2182	cmov.hs w0, w16 // a >= 10: take the letter instead
	2183
	2184	#else
	2185	notimpl
	2186	#endif
	2187
	2188	ret
	2189
	2190	endproc
	2191
	2192	proc x1f
	2193
	2194	// verify collatz conjecture starting at a; assume a /= 0! while the counter (d on x86, r3, x3) is nonzero, each odd value reached is also stored through the output pointer (rdi/edi, r5, x5) and the counter decremented.
	2195
	2196	#if defined(__x86_64__)
	2197
	2198	0: bsf rcx, rax // clobber c if a = 0
	2199	shr rax, cl // a = 2^c a'
	2200	cmp rdx, 0 // any room left in the output buffer?
	2201	je 1f // no -- skip the store
	2202	stosq // record a' at [rdi] and step the pointer
	2203	dec rdx
	2204	1:
	2205	cmp rax, 1 // done?
	2206	je 9f
	2207	lea rax, [2*rax + rax + 1] // a' = 3 a' + 1
	2208	jmp 0b // again
	2209
	2210	9: ret
	2211
	2212	#elif defined(__i386__)
	2213
	2214	0: bsf ecx, eax // clobber c if a = 0
	2215	shr eax, cl // a = 2^c a'
	2216	cmp edx, 0 // any room left in the output buffer?
	2217	je 1f // no -- skip the store
	2218	stosd // record a' at [edi] and step the pointer
	2219	dec edx
	2220	1:
	2221	cmp eax, 1 // done?
	2222	je 9f
	2223	lea eax, [2*eax + eax + 1] // a' = 3 a' + 1
	2224	jmp 0b // again
	2225
	2226	9: ret
	2227
	2228	#elif defined(__arm__)
	2229
	2230	// rbit introduced in armv7
	2231	0: rbit r2, r0
	2232	clz r2, r2 // c = number of trailing zero bits in a
	2233	mov r0, r0, lsr r2 // a = 2^c a'
	2234	cmp r3, #0 // any room left in the output buffer?
	2235	strne r0, [r5], #4 // yes -- record a' and step the pointer
	2236	subne r3, r3, #1
	2237	cmp r0, #1 // done? (also sets c, since a' >= 1)
	2238	adcne r0, r0, r0, lsl #1 // a' = 3 a' + 1 (because c set)
	2239	bne 0b
	2240
	2241	ret
	2242
	2243	#elif defined(__aarch64__)
	2244
	2245	0: rbit w2, w0
	2246	clz w2, w2 // c = number of trailing zero bits in a
	2247	lsr w0, w0, w2 // a = 2^c a'
	2248	cmp x3, #0 // any room left in the output buffer?
	2249	b.eq 1f // no -- skip the store
	2250	str x0, [x5], #8 // record a' and step the pointer
	2251	sub x3, x3, #1
	2252	1:
	2253	cmp w0, #1 // done?
	2254	add w16, w0, w0, lsl #1 // t = 3 a'
	2255	csinc.eq w0, w0, w16 // a' = a' = 1 ? a' : t + 1 = 3 a' + 1
	2256	b.ne 0b
	2257
	2258	ret
	2259
	2260	#else
	2261	notimpl
	2262	#endif
	2263
	2264	endproc
	2265
	2266	///--------------------------------------------------------------------------
	2267	/// 0x20--0x2f
	2268
	2269	proc x20
	2270
	2271	// calculate 1337 a slowly, by shift-and-add; a shorter chain leaves the same product in b for comparison.
	2272
	2273	#if defined(__x86_64__)
	2274
	2275	// original version
	2276	mov rcx, rax // c = a
	2277	shl rcx, 2 // c = 4 a
	2278	add rcx, rax // c = 5 a
	2279	shl rcx, 3 // c = 40 a
	2280	add rcx, rax // c = 41 a
	2281	shl rcx, 1 // c = 82 a
	2282	add rcx, rax // c = 83 a
	2283	shl rcx, 1 // c = 166 a
	2284	add rcx, rax // c = 167 a
	2285	shl rcx, 3 // c = 1336 a
	2286	add rcx, rax // c = 1337 a
	2287
	2288	// a quick way: 1337 = 7 * 191 = (8 - 1) (3 * 64 - 1)
	2289	lea rdx, [2*rax + rax] // t = 3 a
	2290	shl rdx, 6 // t = 192 a
	2291	sub rdx, rax // t = 191 a
	2292	lea rbx, [8*rdx] // b = 1528 a
	2293	sub rbx, rdx // b = 1337 a
	2294
	2295	#elif defined(__i386__)
	2296
	2297	// original version
	2298	mov ecx, eax // c = a
	2299	shl ecx, 2 // c = 4 a
	2300	add ecx, eax // c = 5 a
	2301	shl ecx, 3 // c = 40 a
	2302	add ecx, eax // c = 41 a
	2303	shl ecx, 1 // c = 82 a
	2304	add ecx, eax // c = 83 a
	2305	shl ecx, 1 // c = 166 a
	2306	add ecx, eax // c = 167 a
	2307	shl ecx, 3 // c = 1336 a
	2308	add ecx, eax // c = 1337 a
	2309
	2310	// a quick way: 1337 = 7 * 191 = (8 - 1) (3 * 64 - 1)
	2311	lea edx, [2*eax + eax] // t = 3 a
	2312	shl edx, 6 // t = 192 a
	2313	sub edx, eax // t = 191 a
	2314	lea ebx, [8*edx] // b = 1528 a
	2315	sub ebx, edx // b = 1337 a
	2316
	2317	#elif defined(__arm__)
	2318
	2319	// original version, ish
	2320	add r2, r0, r0, lsl #2 // c = 5 a
	2321	add r2, r0, r2, lsl #3 // c = 41 a
	2322	add r2, r0, r2, lsl #1 // c = 83 a
	2323	add r2, r0, r2, lsl #1 // c = 167 a
	2324	add r2, r0, r2, lsl #3 // c = 1337 a
	2325
	2326	// quicker way: 1337 = 7 * 191 = (8 - 1) (3 * 64 - 1)
	2327	add r1, r0, r0, lsl #1 // b = 3 a
	2328	rsb r1, r0, r1, lsl #6 // b = 191 a
	2329	rsb r1, r1, r1, lsl #3 // b = 1337 a
	2330
	2331	#elif defined(__aarch64__)
	2332
	2333	// original version, ish
	2334	add x2, x0, x0, lsl #2 // c = 5 a
	2335	add x2, x0, x2, lsl #3 // c = 41 a
	2336	add x2, x0, x2, lsl #1 // c = 83 a
	2337	add x2, x0, x2, lsl #1 // c = 167 a
	2338	add x2, x0, x2, lsl #3 // c = 1337 a
	2339
	2340	// sleazy because no rsb: go via -191 a, and let the two sign flips cancel
	2341	add x1, x0, x0, lsl #1 // b = 3 a
	2342	sub x1, x0, x1, lsl #6 // b = -191 a
	2343	sub x1, x1, x1, lsl #3 // b = 1337 a
	2344
	2345	#else
	2346	notimpl
	2347	#endif
	2348
	2349	ret
	2350
	2351	endproc
	2352
	2353	proc x21
	2354
	2355	// multiply complex numbers a + b i and c + d i, leaving the real part in a and the imaginary part in b
	2356	//
	2357	// (a + b i) (c + d i) = (a c - b d) + (a d + b c) i
	2358	//
	2359	// somewhat slick approach uses only three multiplications
	2360
	2361	#if defined(__x86_64__)
	2362
	2363	mov rsi, rax // t = a
	2364	add rax, rbx // a' = a + b
	2365	mov rdi, rdx // u = d
	2366	sub rdx, rcx // d' = d - c
	2367	add rdi, rcx // u = c + d
	2368
	2369	imul rax, rcx // a' = c (a + b)
	2370	imul rsi, rdx // t = a (d - c)
	2371	imul rdi, rbx // u = b (c + d)
	2372
	2373	add rsi, rax // t = a (d - c) + c (a + b)
	2374	mov rbx, rsi // b' = a (d - c) + c (a + b)
	2375	// = a d + b c
	2376	sub rax, rdi // a' = c (a + b) - b (c + d)
	2377	// = a c - b d
	2378
	2379	#elif defined(__i386__)
	2380
	2381	mov esi, eax // t = a
	2382	add eax, ebx // a' = a + b
	2383	mov edi, edx // u = d
	2384	sub edx, ecx // d' = d - c
	2385	add edi, ecx // u = c + d
	2386
	2387	imul eax, ecx // a' = c (a + b)
	2388	imul esi, edx // t = a (d - c)
	2389	imul edi, ebx // u = b (c + d)
	2390
	2391	add esi, eax // t = a (d - c) + c (a + b)
	2392	mov ebx, esi // b' = a (d - c) + c (a + b)
	2393	// = a d + b c
	2394	sub eax, edi // a' = c (a + b) - b (c + d)
	2395	// = a c - b d
	2396
	2397	#elif defined(__arm__)
	2398
	2399	add r4, r0, r1 // t = a + b
	2400	add r5, r2, r3 // u = c + d
	2401	sub r3, r3, r2 // d' = d - c
	2402
	2403	// mls introduced in armv7
	2404	mul r4, r4, r2 // t = c (a + b)
	2405	mov r2, r1 // c' = b (bah!): save b before r1 is overwritten
	2406	mla r1, r0, r3, r4 // b' = a (d - c) + c (a + b)
	2407	// = a d + b c
	2408	mls r0, r2, r5, r4 // a' = c (a + b) - b (c + d)
	2409	// = a c - b d
	2410
	2411	#elif defined(__aarch64__)
	2412
	2413	add x4, x0, x1 // t = a + b
	2414	add x5, x2, x3 // u = c + d
	2415	sub x3, x3, x2 // d' = d - c
	2416
	2417	// madd and msub stand in for the arm version's mla and mls
	2418	mul x4, x4, x2 // t = c (a + b)
	2419	mov x2, x1 // c' = b (bah!): save b before x1 is overwritten
	2420	madd x1, x0, x3, x4 // b' = a (d - c) + c (a + b)
	2421	// = a d + b c
	2422	msub x0, x2, x5, x4 // a' = c (a + b) - b (c + d)
	2423	// = a c - b d
	2424
	2425	#else
	2426	notimpl
	2427	#endif
	2428
	2429	ret
	2430
	2431	endproc
	2432
	2433	proc x22
	2434
	2435	// divide (unsigned) a by 3, by multiplying by a scaled reciprocal and keeping the high half of the product
	2436
	2437	#if defined(__x86_64__)
	2438
	2439	mov rdx, 0xaaaaaaaaaaaaaaab // = ceil(2/3 2^64)
	2440	mul rdx // d' \|\| a' =~ 2/3 a 2^64
	2441	shr rdx, 1 // d' = floor(a/3)
	2442	mov rax, rdx // a' = floor(a/3)
	2443
	2444	// we start with 0 <= a < 2^64. write f = ceil(2/3 2^64), so that
	2445	// 2/3 < f/2^64 < 2/3 + 1/2^64. then floor(2/3 a) <= floor(a f/2^64)
	2446	// <= floor(2/3 a + a/2^64), but a < 2^64 so a/2^64 < 1 and
	2447	// floor(a f/2^64) = floor(2/3 a).
	2448
	2449	#elif defined(__i386__)
	2450
	2451	mov edx, 0xaaaaaaab // = ceil(2/3 2^32)
	2452	mul edx // d' \|\| a' =~ 2/3 a 2^32
	2453	shr edx, 1 // d' = floor(a/3)
	2454	mov eax, edx // a' = floor(a/3)
	2455
	2456	#elif defined(__arm__)
	2457
	2458	ldr r12, =0xaaaaaaab // = ceil(2/3 2^32)
	2459	umull r12, r0, r0, r12 // a' = high word of the product =~ 2/3 a
	2460	mov r0, r0, lsr #1 // a' = floor(a/3)
	2461
	2462	#elif defined(__aarch64__)
	2463
	2464	ldr x16, =0xaaaaaaaaaaaaaaab // = ceil(2/3 2^64)
	2465	umulh x0, x0, x16 // a' = high word of the product =~ 2/3 a
	2466	lsr x0, x0, #1 // a' = floor(a/3)
	2467
	2468	#else
	2469	notimpl
	2470	#endif
	2471
	2472	ret
	2473
	2474	endproc
	2475
	2476	proc x23
	2477	// reduce (unsigned) a modulo 3 without dividing, using 4 == 1 (mod 3)
	2478	#if defined(__x86_64__)
	2479
	2480	// main loop: shorten a preserving residue class mod 3
	2481	0: cmp rax, 5
	2482	jbe 8f
	2483	// a > 5
	2484	mov rdx, rax // d' = a
	2485	shr rdx, 2 // d' = floor(a/4)
	2486	and rax, 3 // a = 4 d' + a' (0 <= a' < 4)
	2487	add rax, rdx // a' == a (mod 3) but a' < a/4 + 4
	2488	jmp 0b
	2489
	2490	// fix up final value 0 <= a < 6: want 0 <= a < 3
	2491	//
	2492	// the tricky part is actually a = 3; but the other final cases take
	2493	// additional iterations which we can avoid.
	2494	8: cmp rax, 3 // set cf iff a < 3
	2495	cmc // set cf iff a >= 3
	2496	sbb rdx, rdx // d' = a >= 3 ? -1 : 0
	2497	and rdx, 3 // d' = a >= 3 ? 3 : 0
	2498	sub rax, rdx // a' = a - (a >= 3 ? 3 : 0)
	2499	// = a (mod 3)
	2500
	2501	#elif defined(__i386__)
	2502
	2503	// main loop: shorten a preserving residue class mod 3
	2504	0: cmp eax, 5
	2505	jbe 8f
	2506	// a > 5
	2507	mov edx, eax // d' = a
	2508	shr edx, 2 // d' = floor(a/4)
	2509	and eax, 3 // a = 4 d' + a' (0 <= a' < 4)
	2510	add eax, edx // a' == a (mod 3) but a' < a/4 + 4
	2511	jmp 0b
	2512
	2513	// fix up final value 0 <= a < 6: want 0 <= a < 3
	2514	//
	2515	// the tricky part is actually a = 3; but the other final cases take
	2516	// additional iterations which we can avoid.
	2517	8: cmp eax, 3 // set cf iff a < 3
	2518	cmc // set cf iff a >= 3
	2519	sbb edx, edx // d' = a >= 3 ? -1 : 0
	2520	and edx, 3 // d' = a >= 3 ? 3 : 0
	2521	sub eax, edx // a' = a - (a >= 3 ? 3 : 0)
	2522	// = a (mod 3)
	2523
	2524	#elif defined(__arm__)
	2525
	2526	0: cmp r0, #6 // still 6 or more?
	2527	andhs r12, r0, #3 // yes: t = a mod 4
	2528	addhs r0, r12, r0, lsr #2 // a' = t + floor(a/4) == a (mod 3)
	2529	bhs 0b // and go round again
	2530
	2531	cmp r0, #3 // now 0 <= a < 6
	2532	subhs r0, r0, #3 // a' = a - (a >= 3 ? 3 : 0)
	2533
	2534	#elif defined(__aarch64__)
	2535
	2536	0: cmp x0, #6 // flags decide the b.hs below; and/add leave them alone
	2537	// blunder on through regardless since this doesn't affect the result
	2538	and x16, x0, #3 // t = a mod 4
	2539	add x0, x16, x0, lsr #2 // a' = t + floor(a/4) == a (mod 3)
	2540	b.hs 0b // go round again if a was 6 or more
	2541
	2542	subs x16, x0, #3 // t = a - 3; carry set iff a >= 3
	2543	cmov.hs x0, x16 // a' = a - (a >= 3 ? 3 : 0)
	2544
	2545	#else
	2546	notimpl
	2547	#endif
	2548
	2549	ret
	2550
	2551	endproc
	2552
	2553	proc x24
	2554
	2555	// invert (odd) a mod 2^64 (mod 2^32 on the 32-bit targets), doubling the number of correct bits each pass
	2556	//
	2557	// suppose a a_i == 1 (mod 2^{2^i})
	2558	//
	2559	// clearly good for i = 0, since 2^i = 1 and 2^{2^i} = 2, and a_0 =
	2560	// a == 1 (mod 2) by assumption
	2561	//
	2562	// write a a_i == b_i 2^{2^i} + 1 (mod 2^{2^{i+1}})
	2563	// then b_i == (a a_i - 1)/2^{2^i} (mod 2^{2^i})
	2564	// to lift inverse, we want x such that a x == -b_i (mod 2^{2^i});
	2565	// clearly x = -a_i b_i will do, since a a_i == 1 (mod 2^{2^i})
	2566	// then:
	2567	// a_{i+1} = a_i - a_i b_i 2^{2^i} = a_i (1 - (a a_i - 1))
	2568	// = 2 a_i - a a_i^2
	2569	//
	2570	// check:
	2571	// a a_{i+1} = 2 a a_i - a^2 a_i^2
	2572	// == 2 a a_i - (b_i 2^{2^i} + 1)^2
	2573	// == 2 (b_i 2^{2^i} + 1) -
	2574	// (b_i^2 2^{2^{i+1}} + 2 b_i 2^{2^i} + 1)
	2575	// == 1 (mod 2^{2^{i+1}})
	2576
	2577	#if defined(__x86_64__)
	2578
	2579	// rax // a_0 = a
	2580	mov rbx, rax // b' = a
	2581	mov rsi, rax // t = a_0
	2582
	2583	0:
	2584	cmp rbp, 0 // any room left in the output buffer?
	2585	je 1f // no -- skip the store
	2586	stosq // record a_i at [rdi] and step the pointer
	2587	dec rbp
	2588	1:
	2589	mul rbx // a' = a a_i
	2590	mov rcx, rax // c = a a_i
	2591
	2592	sub rax, 2 // a' = a a_i - 2
	2593	neg rax // a' = 2 - a a_i
	2594	mul rsi // a_{i+1} = a_i (2 - a a_i)
	2595	// = 2 a_i - a a_i^2
	2596	mov rsi, rax // t = a_{i+1}
	2597
	2598	cmp rcx, 1 // done?
	2599	ja 0b // no -- iterate
	2600
	2601	#elif defined(__i386__)
	2602
	2603	// eax // a_0 = a
	2604	mov ebx, eax // b' = a
	2605	mov esi, eax // t = a_0
	2606
	2607	0:
	2608	cmp ebp, 0 // any room left in the output buffer?
	2609	je 1f // no -- skip the store
	2610	stosd // record a_i at [edi] and step the pointer
	2611	dec ebp
	2612	1:
	2613	mul ebx // a' = a a_i
	2614	mov ecx, eax // c = a a_i
	2615
	2616	sub eax, 2 // a' = a a_i - 2
	2617	jb 9f // done if < 2
	2618	neg eax // a' = 2 - a a_i
	2619	mul esi // a_{i+1} = a_i (2 - a a_i)
	2620	// = 2 a_i - a a_i^2
	2621	mov esi, eax // t = a_{i+1}
	2622
	2623	jmp 0b // and iterate
	2624	9: mov eax, esi // restore
	2625
	2626	#elif defined(__arm__)
	2627
	2628	// r0 // a_0 = a
	2629	mov r1, r0 // b' = a
	2630
	2631	0:
	2632	cmp r6, #0 // any room left in the output buffer?
	2633	strne r0, [r5], #4 // yes -- record a_i and step the pointer
	2634	subne r6, r6, #1
	2635	mul r2, r0, r1 // c = a a_i
	2636	rsbs r2, r2, #2 // c = 2 - a a_i
	2637	mul r0, r0, r2 // a_{i+1} = a_i (2 - a a_i)
	2638	// = 2 a_i - a a_i^2
	2639	blo 0b // iterate while a a_i > 2 (flags are from the rsbs)
	2640
	2641	#elif defined(__aarch64__)
	2642
	2643	// x0 // a_0 = a
	2644	mov x1, x0 // b' = a
	2645	mov x16, #2 // because we have no rsb
	2646
	2647	0:
	2648	cmp x6, #0 // any room left in the output buffer?
	2649	b.eq 1f // no -- skip the store
	2650	str x0, [x5], #8 // record a_i and step the pointer
	2651	sub x6, x6, #1
	2652	1:
	2653	mul x2, x0, x1 // c = a a_i
	2654	subs x2, x16, x2 // c = 2 - a a_i
	2655	mul x0, x0, x2 // a_{i+1} = a_i (2 - a a_i)
	2656	// = 2 a_i - a a_i^2
	2657	b.lo 0b // iterate while a a_i > 2 (flags are from the subs)
	2658
	2659	#else
	2660	notimpl
	2661	#endif
	2662
	2663	ret
	2664
	2665	endproc
	2666
	2667	proc x25
	2668
	2669	// a poor approximation to pi/4
	2670	//
	2671	// think of x and y as being in 16.16 fixed-point format. we sample
	2672	// points in the unit square, and determine how many of them are
	2673	// within a unit quarter-circle centred at the origin. the area of
	2674	// the quarter-circle is pi/4. all 2^32 (x, y) pairs are tried, by running a 32-bit counter through every value.
	2675
	2676	#if defined(__x86_64__)
	2677
	2678	xor eax, eax // a = 0
	2679	mov rcx, 1
	2680	shl rcx, 0x20 // c =~ 4 billion
	2681
	2682	0: movzx rbx, cx // x = low 16 bits of c
	2683	imul rbx, rbx // b = x^2
	2684
	2685	ror rcx, 0x10 // switch halves of c
	2686	movzx rdx, cx // y = high 16 bits of c
	2687	imul rdx, rdx // d = y^2
	2688	rol rcx, 0x10 // switch back
	2689
	2690	add rbx, rdx // r^2 = x^2 + y^2
	2691	shr rbx, 0x20 // b = floor(r^2)
	2692	cmp rbx, 1 // set cf iff r^2 < 1
	2693	adc rax, 0 // and add onto accumulator
	2694	loop 0b
	2695
	2696	#elif defined(__i386__)
	2697
	2698	// this is actually better done in 32 bits. the carry has the wrong
	2699	// sense here, so instead deduct one for each point outside the
	2700	// quarter-circle rather than adding one for each point inside it.
	2701	xor eax, eax
	2702	xor ecx, ecx // starting from zero, `loop' runs through all 2^32 values
	2703
	2704	0: movzx ebx, cx // x = low 16 bits of c
	2705	imul ebx, ebx // b = x^2
	2706
	2707	mov edx, ecx
	2708	shr edx, 0x10 // y = high 16 bits of c
	2709	imul edx, edx // d = y^2
	2710
	2711	add ebx, edx // see?
	2712	sbb eax, 0
	2713	loop 0b
	2714
	2715	#elif defined(__arm__)
	2716
	2717	mov r0, #0 // a = 0
	2718	mov r2, #0 // c = 0
	2719
	2720	0: uxth r1, r2, ror #0 // x = low 16 bits of c
	2721	uxth r3, r2, ror #16 // y = high 16 bits of c
	2722	mul r1, r1, r1 // x^2
	2723	mul r3, r3, r3 // y^2
	2724	cmn r1, r3 // mlas doesn't set cf usefully
	2725	addcc r0, r0, #1 // no carry: r^2 < 1, so count the point
	2726	adds r2, r2, #1 // next point; wraps to zero after 2^32
	2727	bne 0b
	2728
	2729	#elif defined(__aarch64__)
	2730
	2731	mov w0, #0 // a = 0
	2732	mov w2, #0 // c = 0
	2733
	2734	0: ubfx w1, w2, #0, #16 // x = low 16 bits of c
	2735	ubfx w3, w2, #16, #16 // y = high 16 bits of c
	2736	sub w2, w2, #1 // next point; back to zero after 2^32
	2737	mul w1, w1, w1 // x^2
	2738	mul w3, w3, w3 // y^2
	2739	cmn w1, w3 // carry out iff r^2 >= 1
	2740	cinc.cc w0, w0 // no carry: r^2 < 1, so count the point
	2741	cbnz w2, 0b
	2742
	2743	#else
	2744	notimpl
	2745	#endif
	2746
	2747	ret
	2748
	2749	endproc
	2750
	2751	proc x26
	2752
	2753	// a bad way to rotate a right by 7 places: or together the two opposing shifts. the result lands in a; the proper rotate instruction leaves the same value in b (r1/x1 on arm) for comparison.
	2754
	2755	#if defined(__x86_64__)
	2756
	2757	mov rbx, rax
	2758	ror rbx, 7 // better
	2759
	2760	mov rdx, rax // d' = a
	2761	shr rax, 7 // a' = a >> 7
	2762	shl rdx, 0x39 // d' = a << 57
	2763	or rax, rdx // a' = a >>> 7
	2764
	2765	#elif defined(__i386__)
	2766
	2767	mov ebx, eax
	2768	ror ebx, 7 // better
	2769
	2770	mov edx, eax // d' = a
	2771	shr eax, 7 // a' = a >> 7
	2772	shl edx, 0x19 // d' = a << 25 (32 - 7; don't lean on the cpu masking an oversized count)
	2773	or eax, edx // a' = a >>> 7
	2774
	2775	#elif defined(__arm__)
	2776
	2777	mov r1, r0, ror #7 // easy way
	2778
	2779	// even the hard way is fairly easy on arm
	2780	mov r3, r0, lsl #25 // t = a << 25
	2781	orr r0, r3, r0, lsr #7 // hard way
	2782
	2783	#elif defined(__aarch64__)
	2784
	2785	ror x1, x0, #7 // easy way
	2786
	2787	// even the hard way is fairly easy on arm
	2788	lsl x3, x0, #57 // t = a << 57
	2789	orr x0, x3, x0, lsr #7 // hard way
	2790
	2791	#else
	2792	notimpl
	2793	#endif
	2794
	2795	ret
	2796
	2797	endproc
	2798
	2799	proc x27
	2800
	2801	// shift a right by c places, in two halves of floor(c/2) and ceil(c/2) places. presumably the point is that a shift by the full register width then works, where a single shift would have its count masked -- TODO confirm
	2802
	2803	#if defined(__x86_64__)
	2804
	2805	mov ch, cl // c' = [c, c]
	2806	inc ch // c' = [c, c + 1]
	2807	shr ch, 1
	2808	shr cl, 1 // c' = [floor(c/2), ceil(c/2)]
	2809	shr rax, cl // first half: floor(c/2) places
	2810	xchg ch, cl
	2811	shr rax, cl // second half: ceil(c/2) places
	2812
	2813	#elif defined(__i386__)
	2814
	2815	mov ch, cl // c' = [c, c]
	2816	inc ch // c' = [c, c + 1]
	2817	shr ch, 1
	2818	shr cl, 1 // c' = [floor(c/2), ceil(c/2)]
	2819	shr eax, cl // first half: floor(c/2) places
	2820	xchg ch, cl
	2821	shr eax, cl // second half: ceil(c/2) places
	2822
	2823	#elif defined(__arm__)
	2824
	2825	// it would be clearer and more efficient to say: `mov r12, r2, lsr
	2826	// #1; sub r2, r2, r12', but that's not the lesson this exercise is
	2827	// trying to teach.
	2828	add r12, r2, #1 // t = c + 1
	2829	mov r2, r2, lsr #1 // c' = floor(c/2)
	2830	mov r12, r12, lsr #1 // t = ceil(c/2)
	2831	mov r0, r0, lsr r2 // first half
	2832	mov r0, r0, lsr r12 // second half
	2833
	2834	#elif defined(__aarch64__)
	2835
	2836	add w16, w2, #1 // t = c + 1
	2837	lsr w2, w2, #1 // c' = floor(c/2)
	2838	lsr w16, w16, #1 // t = ceil(c/2)
	2839	lsr x0, x0, x2 // first half
	2840	lsr x0, x0, x16 // second half
	2841
	2842	#else
	2843	notimpl
	2844	#endif
	2845
	2846	ret
	2847
	2848	endproc
	2849
	2850	proc x28
	2851
	2852	// divide c-byte big-endian bignum at rsi by 2 (rounding down): the walk goes upwards through memory, and the bit shifted out of each byte becomes the top bit of the next, so the byte at the lowest address must be the most significant
	2853
	2854	#if defined(__x86_64__)
	2855
	2856	clc // nothing shifts into the top of the first byte
	2857	0: rcr byte ptr [rsi], 1 // cf -> top bit; bottom bit -> cf
	2858	inc rsi // next byte; inc leaves cf alone
	2859	loop 0b
	2860
	2861	#elif defined(__i386__)
	2862
	2863	clc // nothing shifts into the top of the first byte
	2864	0: rcr byte ptr [esi], 1 // cf -> top bit; bottom bit -> cf
	2865	inc esi // next byte; inc leaves cf alone
	2866	loop 0b
	2867
	2868	#elif defined(__arm__)
	2869
	2870	// we could hack this a word at a time using rrx
	2871	mov r3, #0 // pending carry: nothing shifts into the first byte
	2872	0: ldrb r12, [r4] // fetch the next byte
	2873	subs r2, r2, #1 // count it; nothing below touches the flags
	2874	orr r3, r3, r12, lsr #1 // shifted byte, with the pending carry on top
	2875	strb r3, [r4], #1 // store the low 8 bits and step the pointer
	2876	mov r3, r12, lsl #7 // this byte's bottom bit becomes the next carry (bit 7)
	2877	bne 0b
	2878
	2879	#elif defined(__aarch64__)
	2880
	2881	mov w16, #0 // pending carry: nothing shifts into the first byte
	2882	0: ldrb w17, [x4] // fetch the next byte
	2883	sub x2, x2, #1 // count it
	2884	orr w16, w16, w17, lsr #1 // shifted byte, with the pending carry on top
	2885	strb w16, [x4], #1 // store the low 8 bits and step the pointer
	2886	lsl w16, w17, #7 // this byte's bottom bit becomes the next carry (bit 7)
	2887	cbnz x2, 0b
	2888
	2889	#else
	2890	notimpl
	2891	#endif
	2892
	2893	ret
	2894
	2895	endproc
	2896
	2897	proc x29
	2898
	2899	// fill a buffer with a 3-byte pattern: a forward byte-by-byte copy whose destination is 3 bytes past its source, so the first 3 bytes get repeated
	2900
	2901	#if defined(__x86_64__)
	2902
	2903	lea rdi, [rsi + 3] // destination = source + 3
	2904	rep movsb // copy c bytes, one at a time (assumes the direction flag is clear)
	2905
	2906	#elif defined(__i386__)
	2907
	2908	lea edi, [esi + 3] // destination = source + 3
	2909	rep movsb // copy c bytes, one at a time (assumes the direction flag is clear)
	2910
	2911	#elif defined(__arm__)
	2912
	2913	add r5, r4, #3 // destination = source + 3
	2914	0: subs r2, r2, #1 // any bytes left? (borrows, clearing c, when the count was 0)
	2915	ldrhsb r12, [r4], #1 // yes -- copy one byte...
	2916	strhsb r12, [r5], #1
	2917	bhs 0b // ... and go round again
	2918
	2919	#elif defined(__aarch64__)
	2920
	2921	cbz x2, 9f // nothing to do for a zero count
	2922	add x5, x4, #3 // destination = source + 3
	2923	0: sub x2, x2, #1 // count this byte
	2924	ldrb w16, [x4], #1 // copy one byte
	2925	strb w16, [x5], #1
	2926	cbnz x2, 0b
	2927	9:
	2928
	2929	#else
	2930	notimpl
	2931	#endif
	2932
	2933	ret
	2934
	2935	endproc
	2936
	2937	proc x2a
	2938
	2939	// rotate the words in a buffer, so that the last word comes first,
	2940	// the first comes second, and so on. this isn't a good way to do
	2941	// it. (x86: buffer at b, word count in c.)
	2942
	2943	#if defined(__x86_64__)
	2944
	2945	mov rsi, rbx // set string pointers
	2946	mov rdi, rbx
	2947	0: lodsq // fetch next word
	2948	xchg rax, qword ptr [rbx] // stash it for next iteration and
	2949	// replace it with the previously
	2950	// stashed word
	2951	stosq // store in output
	2952	// (note that the first iteration doesn't actually do anything)
	2953	loop 0b // continue until all done
	2954
	2955	#elif defined(__i386__)
	2956
	2957	mov esi, ebx // set string pointers
	2958	mov edi, ebx
	2959	0: lodsd // fetch next word
	2960	xchg eax, dword ptr [ebx] // stash it for next iteration and
	2961	// replace it with the previously
	2962	// stashed word
	2963	stosd // store in output
	2964	loop 0b // continue until all done
	2965
	2966	#elif defined(__arm__)
	2967
	2968	// let's do this a sensible way. (we could go faster using ldm/stm.)
	2969	add r0, r1, r2, lsl #2 // find the end of the buffer
	2970	ldr r0, [r0, #-4] // collect final element
	2971	0: subs r2, r2, #1 // count this word; nothing below touches the flags
	2972	ldr r12, [r1] // pick up the word currently here...
	2973	str r0, [r1], #4 // ... replace it with the one carried along, and step on
	2974	mov r0, r12 // carry the displaced word to the next slot
	2975	bne 0b
	2976
	2977	#elif defined(__aarch64__)
	2978
	2979	add x0, x1, x2, lsl #3 // find the end of the buffer
	2980	ldr x0, [x0, #-8] // collect final element
	2981	0: sub x2, x2, #1 // count this word
	2982	ldr x16, [x1] // pick up the word currently here...
	2983	str x0, [x1], #8 // ... replace it with the one carried along, and step on
	2984	mov x0, x16 // carry the displaced word to the next slot
	2985	cbnz x2, 0b
	2986
	2987	#else
	2988	notimpl
	2989	#endif
	2990
	2991	ret
	2992
	2993	endproc
	2994
	2995	proc x2b
	2996
	2997	// find a cycle in a function f: B -> B, where B = {0, 1, ..., 255}
	2998
	2999	#if defined(__x86_64__)
	3000	// rbx = table of f, indexed by al via xlatb; al = tortoise, dl = hare.
	3001	// this is floyd's cycle-finding algorithm.
	3002	//
	3003	// consider the sequence s_0 = 0, s_1 = f(0), s_2 = f(f(0)), ...,
	3004	// s_{i+1} = f(s_i). since B is finite, there must be some smallest
	3005	// t and c such that s_t = s_{t+c}; then, for i /= j, we have s_i = s_j
	3006	// iff i >= t, j >= t, and i == j (mod c).
	3007	//
	3008	// the algorithm sets two cursors advancing through the sequence: a
	3009	// /tortoise/ which advances one step at a time, and a /hare/ which
	3010	// advances by two, so when the tortoise is at element s_i, the hare
	3011	// is at s_{2i}. the hare will run around the cycle and catch the
	3012	// tortoise when i >= t and i == 2 i (mod c); the latter is simply i
	3013	// == 0 (mod c), which therefore happens first when i = k = t +
	3014	// (-t mod c) (or k = c if t = 0, since the loop tests after stepping).
	3015	//
	3016	// i'm not sure what good xlatb does here that mov al, [rbx + al]
	3017	// doesn't.
	3018
	3019	xor eax, eax // tortoise starts at 0
	3020	xor edx, edx // hare starts at 0
	3021	0: xlatb // advance tortoise
	3022	xchg rax, rdx // switch to hare
	3023	xlatb // advance hare ...
	3024	xlatb // ... twice
	3025	xchg rax, rdx // switch back
	3026	cmp al, dl // hare caught the tortoise?
	3027	jnz 0b // no -- go around again
	3028
	3029	// now we trace the initial tail: reset the tortoise to s_0, and slow
	3030	// the hare down so that both take only a single step in each
	3031	// iteration. this loop terminates when i >= t and i == i + 2 k
	3032	// (mod c). we know k is a multiple of c, so the latter condition
	3033	// always holds, so this finds the first step of the cycle.
	3034
	3035	xor eax, eax // reset the tortoise
	3036	0: xlatb // advance tortoise
	3037	xchg rax, rdx // switch to hare
	3038	xlatb // advance hare
	3039	xchg rax, rdx // and switch back
	3040	cmp al, dl // done?
	3041	jnz 0b // no -- iterate
	3042	// here al = dl = s_t (or s_1 if t = 0, because the test follows the step).
	3043	#elif defined(__i386__)
	3044	// as for x86_64: ebx = table of f; al = tortoise, dl = hare.
	3045	xor eax, eax // tortoise starts at 0
	3046	xor edx, edx // hare starts at 0
	3047	0: xlatb // advance tortoise
	3048	xchg eax, edx // switch to hare
	3049	xlatb // advance hare ...
	3050	xlatb // ... twice
	3051	xchg eax, edx // switch back
	3052	cmp al, dl // hare caught the tortoise?
	3053	jnz 0b // no -- go around again
	3054	// second phase: both cursors now take single steps.
	3055	xor eax, eax // reset the tortoise
	3056	0: xlatb // advance tortoise
	3057	xchg eax, edx // switch to hare
	3058	xlatb // advance hare
	3059	xchg eax, edx // and switch back
	3060	cmp al, dl // done?
	3061	jnz 0b // no -- iterate
	3062
	3063	#elif defined(__arm__)
	3064	// r1 = table of f; r0 = tortoise, r3 = hare. no register swapping needed.
	3065	mov r0, #0 // tortoise starts at 0
	3066	mov r3, #0 // hare starts at 0
	3067	0: ldrb r0, [r1, r0] // advance tortoise
	3068	ldrb r3, [r1, r3] // advance hare ...
	3069	ldrb r3, [r1, r3] // ... twice
	3070	cmp r0, r3 // hare caught the tortoise?
	3071	bne 0b // no -- go around again
	3072	// second phase: both cursors now take single steps.
	3073	mov r0, #0 // reset the tortoise
	3074	0: ldrb r0, [r1, r0] // advance tortoise
	3075	ldrb r3, [r1, r3] // advance hare
	3076	cmp r0, r3 // done?
	3077	bne 0b // no -- iterate
	3078
	3079	#elif defined(__aarch64__)
	3080	// x1 = table of f; w0 = tortoise, w3 = hare (upper halves stay zero).
	3081	mov w0, #0 // tortoise starts at 0
	3082	mov w3, #0 // hare starts at 0
	3083	0: ldrb w0, [x1, x0] // advance tortoise
	3084	ldrb w3, [x1, x3] // advance hare ...
	3085	ldrb w3, [x1, x3] // ... twice
	3086	cmp w0, w3 // hare caught the tortoise?
	3087	b.ne 0b // no -- go around again
	3088	// second phase: both cursors now take single steps.
	3089	mov w0, #0 // reset the tortoise
	3090	0: ldrb w0, [x1, x0] // advance tortoise
	3091	ldrb w3, [x1, x3] // advance hare
	3092	cmp w0, w3 // done?
	3093	b.ne 0b // no -- iterate
	3094
	3095	#else
	3096	notimpl
	3097	#endif
	3098
	3099	ret
	3100
	3101	endproc
	3102
	3103	proc x2c
	3104	// a convoluted way to set rax = rsi -- provided c /= d. if c = d then
	3105	// the second store lands on top of the first, the load reads back 1
	3106	// rather than 0, and the final lookup yields rdi instead.
	3107	#if defined(__x86_64__)
	3108	// rbx = b (array of 64-bit words), rcx = c, rdx = d, rsi = t, rdi = u.
	3109	mov qword ptr [rbx + 8*rcx], 0 // b[c] = 0
	3110	mov qword ptr [rbx + 8*rdx], 1 // b[d] = 1
	3111	mov rax, [rbx + 8*rcx] // a' = b[c] = 0 (or 1 if c = d)
	3112
	3113	mov [rbx], rsi // b[0] = t
	3114	mov [rbx + 8], rdi // b[1] = u
	3115	mov rax, [rbx + 8*rax] // a' = b[a'] = b[0] = t (or b[1] = u)
	3116
	3117	#elif defined(__i386__)
	3118	// as above; note the 8-byte stride although the elements are dwords.
	3119	mov dword ptr [ebx + 8*ecx], 0 // b[c] = 0
	3120	mov dword ptr [ebx + 8*edx], 1 // b[d] = 1
	3121	mov eax, [ebx + 8*ecx] // a' = b[c] = 0 (or 1 if c = d)
	3122
	3123	mov [ebx], esi // b[0] = t
	3124	mov [ebx + 8], edi // b[1] = u
	3125	mov eax, [ebx + 8*eax] // a' = b[a'] = b[0] = t (or b[1] = u)
	3126
	3127	#elif defined(__arm__)
	3128	// r1 = b (32-bit words), r2 = c, r3 = d, r4 = t, r5 = u; result in r0.
	3129	mov r0, #0
	3130	mov r12, #1
	3131
	3132	str r0, [r1, r2, lsl #2] // b[c] = 0
	3133	str r12, [r1, r3, lsl #2] // b[d] = 1
	3134	ldr r0, [r1, r2, lsl #2] // a' = b[c] = 0 (or 1 if c = d)
	3135
	3136	str r4, [r1] // b[0] = t
	3137	str r5, [r1, #4] // b[1] = u
	3138	ldr r0, [r1, r0, lsl #2] // a' = b[a'] = t (or u if c = d)
	3139
	3140	#elif defined(__aarch64__)
	3141	// x1 = b (64-bit words), x2 = c, x3 = d, x4 = t, x5 = u; result in x0.
	3142	mov x16, #1
	3143
	3144	str xzr, [x1, x2, lsl #3] // b[c] = 0
	3145	str x16, [x1, x3, lsl #3] // b[d] = 1
	3146	ldr x0, [x1, x2, lsl #3] // a' = b[c] = 0 (or 1 if c = d)
	3147
	3148	str x4, [x1] // b[0] = t
	3149	str x5, [x1, #8] // b[1] = u
	3150	ldr x0, [x1, x0, lsl #3] // a' = b[a'] = t (or u if c = d)
	3151
	3152	#else
	3153	notimpl
	3154	#endif
	3155
	3156	ret
	3157
	3158	endproc
	3159
	3160	proc x2d
	3161
	3162	// clear the least significant set bit in a, by calculating a' =
	3163	// a AND (a - 1).
	3164	//
	3165	// if a = 0 then a' = 0. otherwise, a - 1 differs from a exactly in
	3166	// the least significant /set/ bit of a, and all bits of lesser
	3167	// significance. to put it another way: write a = u 2^{k+1} + 2^k;
	3168	// then a - 1 = u 2^{k+1} + 2^{k-1} + ... + 2 + 1. taking the
	3169	// bitwise AND of these leaves only the bits common to both, i.e.,
	3170	// u 2^{k+1}.
	3171
	3172	#if defined(__x86_64__)
	3173	// rax = a; rdx is left holding the original a.
	3174	mov rdx, rax // d' = a
	3175	dec rax // a' = a - 1
	3176	and rax, rdx // a' = a AND (a - 1)
	3177
	3178	#elif defined(__i386__)
	3179	// eax = a; edx is left holding the original a.
	3180	mov edx, eax // d' = a
	3181	dec eax // a' = a - 1
	3182	and eax, edx // a' = a AND (a - 1)
	3183
	3184	#elif defined(__arm__)
	3185	// r0 = a; here the scratch register r3 ends up holding a - 1 instead.
	3186	sub r3, r0, #1 // r3 = a - 1
	3187	and r0, r0, r3 // a' = a AND (a - 1)
	3188
	3189	#elif defined(__aarch64__)
	3190	// x0 = a; here the scratch register x3 ends up holding a - 1 instead.
	3191	sub x3, x0, #1 // x3 = a - 1
	3192	and x0, x0, x3 // a' = a AND (a - 1)
	3193
	3194	#else
	3195	notimpl
	3196	#endif
	3197
	3198	ret
	3199
	3200	endproc
	3201
	3202	proc x2e
	3203
	3204	// compute a mask of one bits in exactly the positions of the
	3205	// low-order run of zero bits in a
	3206	// (if a = 0, the xor gives all ones and the shift then drops the top bit.)
	3207	#if defined(__x86_64__)
	3208	// rax = a; rdx is left holding a - 1, and the flags hold the comparison.
	3209	mov rdx, rax // d' = a
	3210	dec rdx // d' = a - 1
	3211	xor rax, rdx // a = a XOR (a - 1)
	3212	// set bits are least significant
	3213	// set bit of a, and all bits of
	3214	// lesser significance
	3215	shr rax, 1 // now only bits of lesser
	3216	// significance; a' = 0 iff a odd
	3217	cmp rax, rdx // equal iff a = 2^k; otherwise
	3218	// strictly less, as unsigned (a = 0 included)
	3219
	3220	#elif defined(__i386__)
	3221	// the same, 32 bits wide: eax = a, edx = a - 1.
	3222	mov edx, eax // d' = a
	3223	dec edx // d' = a - 1
	3224	xor eax, edx // lowest set bit of a and everything below it
	3225	shr eax, 1 // now only the bits below it
	3226	cmp eax, edx // equal iff a = 2^k
	3227
	3228	#elif defined(__arm__)
	3229	// r0 = a; r3 = a - 1.
	3230	sub r3, r0, #1 // r3 = a - 1
	3231	eor r0, r0, r3 // lowest set bit of a and everything below it
	3232	mov r0, r0, lsr #1 // probably fold shift into next inst
	3233	cmp r0, r3 // equal iff a = 2^k
	3234
	3235	#elif defined(__aarch64__)
	3236	// x0 = a; x3 = a - 1.
	3237	sub x3, x0, #1 // x3 = a - 1
	3238	eor x0, x0, x3 // lowest set bit of a and everything below it
	3239	mov x0, x0, lsr #1 // probably fold shift into next inst
	3240	cmp x0, x3 // equal iff a = 2^k
	3241
	3242	#else
	3243	notimpl
	3244	#endif
	3245
	3246	ret
	3247
	3248	endproc
	3249
	3250	proc x2f
	3251	// a slow population count: the loop at the end of each branch clears
	3252	// one set bit of c per iteration and counts the iterations. each branch
	3253	// first counts the bits two quicker ways too, into other registers.
	3254	#if defined(__x86_64__)
	3255	// rcx = c. results: rbx (popcnt), rsi (bit-slicing), rax (loop); rcx ends up 0.
	3256	popcnt rbx, rcx // the easy way
	3257
	3258	// a fast version in software
	3259	mov rax, rcx
	3260
	3261	mov rdx, rcx
	3262	shr rdx, 1
	3263	mov rsi, 0x5555555555555555 // even-numbered bits
	3264	and rax, rsi
	3265	and rdx, rsi
	3266	add rax, rdx // 2-bit fields, each a count 0..2
	3267
	3268	mov rdx, rax
	3269	shr rdx, 2
	3270	mov rsi, 0x3333333333333333 // low half of each nibble
	3271	and rax, rsi
	3272	and rdx, rsi
	3273	add rax, rdx // nibbles, each a count 0..4
	3274
	3275	mov rdx, rax
	3276	shr rdx, 32
	3277	add rax, rdx // fold top half onto bottom: low nibbles 0..8
	3278
	3279	mov rdx, rax
	3280	shr rdx, 4
	3281	and rax, 0x0f0f0f0f // (this also discards the stale top half)
	3282	and rdx, 0x0f0f0f0f
	3283	add rax, rdx // four bytes, each a count 0..16
	3284
	3285	mov rdx, rax
	3286	shr rdx, 8
	3287	add rax, rdx // byte 0 = sum of bytes 0 and 1
	3288
	3289	mov rdx, rax
	3290	shr rdx, 16
	3291	add rax, rdx // byte 0 = sum of all four bytes
	3292	movzx rsi, al // si' = the count; drop the partial sums above
	3293
	3294	// the official version
	3295	xor eax, eax // clear iteration counter
	3296	0: jrcxz 9f // bail if c = 0
	3297	inc rax // bump iteration count
	3298	mov rdx, rcx // d' = c
	3299	dec rdx // d' = c - 1
	3300	and rcx, rdx // zap least significant set bit of c
	3301	jmp 0b // and go again
	3302	9:
	3303
	3304	#elif defined(__i386__)
	3305	// ecx = c. results: ebx (popcnt), esi (bit-slicing), eax (loop); ecx ends up 0.
	3306	popcnt ebx, ecx // the easy way
	3307	// a fast version in software
	3308	mov eax, ecx
	3309
	3310	mov edx, ecx
	3311	shr edx, 1
	3312	and eax, 0x55555555
	3313	and edx, 0x55555555
	3314	add eax, edx // 2-bit fields, each a count 0..2
	3315
	3316	mov edx, eax
	3317	shr edx, 2
	3318	and eax, 0x33333333
	3319	and edx, 0x33333333
	3320	add eax, edx // nibbles, each a count 0..4
	3321
	3322	mov edx, eax
	3323	shr edx, 4
	3324	add eax, edx // even nibbles hold pair sums 0..8; odd ones are junk
	3325
	3326	mov edx, eax
	3327	shr edx, 8
	3328	and eax, 0x000f000f // pair sums from nibbles 0 and 4
	3329	and edx, 0x000f000f // and, after the shift, from nibbles 2 and 6
	3330	add eax, edx // each half = count for its 16 bits, 0..16
	3331
	3332	mov edx, eax
	3333	shr edx, 16
	3334	add eax, edx // low half = total, 0..32
	3335	movzx esi, al // si' = the count
	3336	// the official version
	3337	xor eax, eax // clear iteration counter
	3338	0: jecxz 9f // bail if c = 0
	3339	inc eax // bump iteration count
	3340	mov edx, ecx // d' = c
	3341	dec edx // d' = c - 1
	3342	and ecx, edx // zap least significant set bit of c
	3343	jmp 0b // and go again
	3344	9:
	3345
	3346	#elif defined(__arm__)
	3347	// r2 = c. results: r1 (vcnt), r4 (bit-slicing), r0 (loop); r2 ends up 0.
	3348	// the easy-ish way
	3349	vmov d0[0], r2 // c into the low word of d0
	3350	vcnt.8 d0, d0 // per-byte bit counts
	3351	vmov r1, d0[0] // the four byte counts for c
	3352	add r1, r1, r1, lsl #8
	3353	add r1, r1, r1, lsl #16 // top byte = sum of all four
	3354	mov r1, r1, lsr #24 // r1 = the count
	3355
	3356	// the hard way
	3357	movw r12, #0x5555
	3358	movt r12, #0x5555
	3359	and r3, r12, r2, lsr #1
	3360	and r0, r12, r2
	3361	add r0, r0, r3 // 2-bit fields, each a count 0..2
	3362
	3363	movw r12, #0x3333
	3364	movt r12, #0x3333
	3365	and r3, r12, r0, lsr #2
	3366	and r0, r12, r0
	3367	add r0, r0, r3 // nibbles, each a count 0..4
	3368
	3369	add r0, r0, r0, lsl #16 // top half: nibble sums 0..8; bottom half is now junk
	3370
	3371	movt r12, 0x0f0f // r12 = 0x0f0f3333: only the top half matters now
	3372	and r3, r12, r0, lsr #4
	3373	and r0, r12, r0
	3374	add r0, r0, r3 // top two bytes, each a count 0..16
	3375
	3376	add r0, r0, r0, lsl #8 // top byte = sum of the top two bytes
	3377
	3378	mov r4, r0, lsr #24 // r4 = the count
	3379
	3380	// and following the exercise
	3381	mov r0, #0 // clear iteration counter
	3382	cmp r2, #0
	3383	beq 9f // bail if c = 0
	3384	0: add r0, r0, #1 // bump iteration count
	3385	sub r3, r2, #1 // r3 = c - 1
	3386	ands r2, r2, r3 // zap least significant set bit of c
	3387	bne 0b // go again while bits remain
	3388	9:
	3389
	3390	#elif defined(__aarch64__)
	3391	// x2 = c. results: x1 (cnt), x4 (bit-slicing), x0 (loop); x2 ends up 0.
	3392	// the easy-ish way
	3393	mov v0.d[0], x2 // c into the low half of v0
	3394	cnt v0.8b, v0.8b // per-byte bit counts
	3395	mov x1, v0.d[0] // the eight byte counts
	3396	add x1, x1, x1, lsl #8
	3397	add x1, x1, x1, lsl #16
	3398	add x1, x1, x1, lsl #32 // top byte = sum of all eight
	3399	lsr x1, x1, #56 // x1 = the count
	3400
	3401	// the hard way -- though arm64's immediate constant encodings and
	3402	// shifting make this actually rather pleasant.
	3403	and x3, x2, #0xaaaaaaaaaaaaaaaa
	3404	and x0, x2, #0x5555555555555555
	3405	add x0, x0, x3, lsr #1 // 2-bit fields, each a count 0..2
	3406
	3407	and x3, x0, #0xcccccccccccccccc
	3408	and x0, x0, #0x3333333333333333
	3409	add x0, x0, x3, lsr #2 // nibbles, each a count 0..4
	3410
	3411	add x0, x0, x0, lsr #4 // even nibbles hold pair sums 0..8; odd ones are junk
	3412
	3413	and x3, x0, #0x0f000f000f000f00 // pair sums from nibbles 2, 6, 10, 14
	3414	and x0, x0, #0x000f000f000f000f // pair sums from nibbles 0, 4, 8, 12
	3415	add x0, x3, x0, lsl #8 // top byte of each halfword = its count, 0..16
	3416
	3417	add x0, x0, x0, lsl #16
	3418	add x0, x0, x0, lsl #32 // top byte = sum over all four halfwords
	3419	lsr x4, x0, #56 // x4 = the count
	3420
	3421	// and the official way
	3422	mov x0, #0 // clear iteration counter
	3423	cbz x2, 9f // bail if c = 0
	3424	0: add x0, x0, #1 // bump iteration count
	3425	sub x3, x2, #1 // x3 = c - 1
	3426	and x2, x2, x3 // zap least significant set bit of c
	3427	cbnz x2, 0b // go again while bits remain
	3428	9:
	3429
	3430	#else
	3431	notimpl
	3432	#endif
	3433
	3434	ret
	3435
	3436	endproc
	3437
	3438	///--------------------------------------------------------------------------
	3439	/// 0x30--0x3f
	3440
	3441	proc x30
	3442	// placeholder: every architecture branch just expands notimpl (presumably a macro defined elsewhere).
	3443	#if defined(__x86_64__)
	3444
	3445	notimpl
	3446
	3447	#elif defined(__i386__)
	3448
	3449	notimpl
	3450
	3451	#elif defined(__arm__)
	3452
	3453	notimpl
	3454
	3455	#elif defined(__aarch64__)
	3456
	3457	notimpl
	3458
	3459	#else
	3460	notimpl
	3461	#endif
	3462
	3463	ret
	3464
	3465	endproc
	3466
	3467	proc x31
	3468	// placeholder: every architecture branch just expands notimpl (presumably a macro defined elsewhere).
	3469	#if defined(__x86_64__)
	3470
	3471	notimpl
	3472
	3473	#elif defined(__i386__)
	3474
	3475	notimpl
	3476
	3477	#elif defined(__arm__)
	3478
	3479	notimpl
	3480
	3481	#elif defined(__aarch64__)
	3482
	3483	notimpl
	3484
	3485	#else
	3486	notimpl
	3487	#endif
	3488	// NOTE(review): no ret here, unlike the procs above -- fine only if notimpl never returns; confirm
	3489	endproc
	3490
	3491	proc x32
	3492	// placeholder: every architecture branch just expands notimpl (presumably a macro defined elsewhere).
	3493	#if defined(__x86_64__)
	3494
	3495	notimpl
	3496
	3497	#elif defined(__i386__)
	3498
	3499	notimpl
	3500
	3501	#elif defined(__arm__)
	3502
	3503	notimpl
	3504
	3505	#elif defined(__aarch64__)
	3506
	3507	notimpl
	3508
	3509	#else
	3510	notimpl
	3511	#endif
	3512	// NOTE(review): no ret here, unlike the procs above -- fine only if notimpl never returns; confirm
	3513	endproc
	3514
	3515	proc x33
	3516	// placeholder: every architecture branch just expands notimpl (presumably a macro defined elsewhere).
	3517	#if defined(__x86_64__)
	3518
	3519	notimpl
	3520
	3521	#elif defined(__i386__)
	3522
	3523	notimpl
	3524
	3525	#elif defined(__arm__)
	3526
	3527	notimpl
	3528
	3529	#elif defined(__aarch64__)
	3530
	3531	notimpl
	3532
	3533	#else
	3534	notimpl
	3535	#endif
	3536	// NOTE(review): no ret here, unlike the procs above -- fine only if notimpl never returns; confirm
	3537	endproc
	3538
	3539	proc x34
	3540	// placeholder: every architecture branch just expands notimpl (presumably a macro defined elsewhere).
	3541	#if defined(__x86_64__)
	3542
	3543	notimpl
	3544
	3545	#elif defined(__i386__)
	3546
	3547	notimpl
	3548
	3549	#elif defined(__arm__)
	3550
	3551	notimpl
	3552
	3553	#elif defined(__aarch64__)
	3554
	3555	notimpl
	3556
	3557	#else
	3558	notimpl
	3559	#endif
	3560	// NOTE(review): no ret here, unlike the procs above -- fine only if notimpl never returns; confirm
	3561	endproc
	3562
	3563	proc x35
	3564	// placeholder: every architecture branch just expands notimpl (presumably a macro defined elsewhere).
	3565	#if defined(__x86_64__)
	3566
	3567	notimpl
	3568
	3569	#elif defined(__i386__)
	3570
	3571	notimpl
	3572
	3573	#elif defined(__arm__)
	3574
	3575	notimpl
	3576
	3577	#elif defined(__aarch64__)
	3578
	3579	notimpl
	3580
	3581	#else
	3582	notimpl
	3583	#endif
	3584	// NOTE(review): no ret here, unlike the procs above -- fine only if notimpl never returns; confirm
	3585	endproc
	3586
	3587	proc x36
	3588	// placeholder: every architecture branch just expands notimpl (presumably a macro defined elsewhere).
	3589	#if defined(__x86_64__)
	3590
	3591	notimpl
	3592
	3593	#elif defined(__i386__)
	3594
	3595	notimpl
	3596
	3597	#elif defined(__arm__)
	3598
	3599	notimpl
	3600
	3601	#elif defined(__aarch64__)
	3602
	3603	notimpl
	3604
	3605	#else
	3606	notimpl
	3607	#endif
	3608	// NOTE(review): no ret here, unlike the procs above -- fine only if notimpl never returns; confirm
	3609	endproc
	3610
	3611	proc x37
	3612	// placeholder: every architecture branch just expands notimpl (presumably a macro defined elsewhere).
	3613	#if defined(__x86_64__)
	3614
	3615	notimpl
	3616
	3617	#elif defined(__i386__)
	3618
	3619	notimpl
	3620
	3621	#elif defined(__arm__)
	3622
	3623	notimpl
	3624
	3625	#elif defined(__aarch64__)
	3626
	3627	notimpl
	3628
	3629	#else
	3630	notimpl
	3631	#endif
	3632	// NOTE(review): no ret here, unlike the procs above -- fine only if notimpl never returns; confirm
	3633	endproc
	3634
	3635	proc x38
	3636	// placeholder: every architecture branch just expands notimpl (presumably a macro defined elsewhere).
	3637	#if defined(__x86_64__)
	3638
	3639	notimpl
	3640
	3641	#elif defined(__i386__)
	3642
	3643	notimpl
	3644
	3645	#elif defined(__arm__)
	3646
	3647	notimpl
	3648
	3649	#elif defined(__aarch64__)
	3650
	3651	notimpl
	3652
	3653	#else
	3654	notimpl
	3655	#endif
	3656	// NOTE(review): no ret here, unlike the procs above -- fine only if notimpl never returns; confirm
	3657	endproc
	3658
	3659	proc x39
	3660	// placeholder: every architecture branch just expands notimpl (presumably a macro defined elsewhere).
	3661	#if defined(__x86_64__)
	3662
	3663	notimpl
	3664
	3665	#elif defined(__i386__)
	3666
	3667	notimpl
	3668
	3669	#elif defined(__arm__)
	3670
	3671	notimpl
	3672
	3673	#elif defined(__aarch64__)
	3674
	3675	notimpl
	3676
	3677	#else
	3678	notimpl
	3679	#endif
	3680	// NOTE(review): no ret here, unlike the procs above -- fine only if notimpl never returns; confirm
	3681	endproc
	3682
	3683	proc x3a
	3684	// placeholder: every architecture branch just expands notimpl (presumably a macro defined elsewhere).
	3685	#if defined(__x86_64__)
	3686
	3687	notimpl
	3688
	3689	#elif defined(__i386__)
	3690
	3691	notimpl
	3692
	3693	#elif defined(__arm__)
	3694
	3695	notimpl
	3696
	3697	#elif defined(__aarch64__)
	3698
	3699	notimpl
	3700
	3701	#else
	3702	notimpl
	3703	#endif
	3704	// NOTE(review): no ret here, unlike the procs above -- fine only if notimpl never returns; confirm
	3705	endproc
	3706
	3707	proc x3b
	3708	// placeholder: every architecture branch just expands notimpl (presumably a macro defined elsewhere).
	3709	#if defined(__x86_64__)
	3710
	3711	notimpl
	3712
	3713	#elif defined(__i386__)
	3714
	3715	notimpl
	3716
	3717	#elif defined(__arm__)
	3718
	3719	notimpl
	3720
	3721	#elif defined(__aarch64__)
	3722
	3723	notimpl
	3724
	3725	#else
	3726	notimpl
	3727	#endif
	3728	// NOTE(review): no ret here, unlike the procs above -- fine only if notimpl never returns; confirm
	3729	endproc
	3730
	3731	proc x3c
	3732	// placeholder: every architecture branch just expands notimpl (presumably a macro defined elsewhere).
	3733	#if defined(__x86_64__)
	3734
	3735	notimpl
	3736
	3737	#elif defined(__i386__)
	3738
	3739	notimpl
	3740
	3741	#elif defined(__arm__)
	3742
	3743	notimpl
	3744
	3745	#elif defined(__aarch64__)
	3746
	3747	notimpl
	3748
	3749	#else
	3750	notimpl
	3751	#endif
	3752	// NOTE(review): no ret here, unlike the procs above -- fine only if notimpl never returns; confirm
	3753	endproc
	3754
	3755	proc x3d
	3756	// placeholder: every architecture branch just expands notimpl (presumably a macro defined elsewhere).
	3757	#if defined(__x86_64__)
	3758
	3759	notimpl
	3760
	3761	#elif defined(__i386__)
	3762
	3763	notimpl
	3764
	3765	#elif defined(__arm__)
	3766
	3767	notimpl
	3768
	3769	#elif defined(__aarch64__)
	3770
	3771	notimpl
	3772
	3773	#else
	3774	notimpl
	3775	#endif
	3776	// NOTE(review): no ret here, unlike the procs above -- fine only if notimpl never returns; confirm
	3777	endproc
	3778
	3779	proc x3e
	3780	// placeholder: every architecture branch just expands notimpl (presumably a macro defined elsewhere).
	3781	#if defined(__x86_64__)
	3782
	3783	notimpl
	3784
	3785	#elif defined(__i386__)
	3786
	3787	notimpl
	3788
	3789	#elif defined(__arm__)
	3790
	3791	notimpl
	3792
	3793	#elif defined(__aarch64__)
	3794
	3795	notimpl
	3796
	3797	#else
	3798	notimpl
	3799	#endif
	3800	// NOTE(review): no ret here, unlike the procs above -- fine only if notimpl never returns; confirm
	3801	endproc
	3802
	3803	proc x3f
	3804	// placeholder: every architecture branch just expands notimpl (presumably a macro defined elsewhere).
	3805	#if defined(__x86_64__)
	3806
	3807	notimpl
	3808
	3809	#elif defined(__i386__)
	3810
	3811	notimpl
	3812
	3813	#elif defined(__arm__)
	3814
	3815	notimpl
	3816
	3817	#elif defined(__aarch64__)
	3818
	3819	notimpl
	3820
	3821	#else
	3822	notimpl
	3823	#endif
	3824	// NOTE(review): no ret here, unlike the procs above -- fine only if notimpl never returns; confirm
	3825	endproc
	3826
	3827	///----- That's all, folks --------------------------------------------------