mdw@git.distorted.org.uk Git - xchg-rax-rax/blame_incremental

... / ...

Commit	Line	Data
	1	/// -*- mode: asm; asm-comment-char: 0 -*-
	2
	3	///--------------------------------------------------------------------------
	4	/// Preliminaries.
	5
	6	#include <sys/syscall.h>
	7
	8	#if defined(__i386__) || defined(__x86_64__)
	9
		// x86: intel operand order, no `%' prefixes on register names.
	10	.intel_syntax noprefix
	11
	12	#elif defined(__arm__)
	13
		// a32: provide an x86-style `ret', returning through r14.
	14	.macro ret
	15	bx r14
	16	.endm
	17
	18	.arch armv7-a
	19
	20	#elif defined(__aarch64__)
	21
		// a64: `cmov rd, rn, cc' sets rd = rn if cc holds and leaves rd alone
		// otherwise.
	22	.macro cmov rd, rn, cc
	23	csel \rd, \rn, \rd, \cc
	24	.endm
		// for each instruction `inst' listed in _INST and each condition `cc'
		// listed in _COND, define a macro `inst.cc' which expands to
		// `inst ARGS, cc'; so, e.g., `cset.ne x1' assembles as `cset x1, ne'.
		// the helper cpp macros are undefined again afterwards.
	25	#define _COND(_) \
	26	_(eq) _(ne) _(cs) _(cc) _(vs) _(vc) _(mi) _(pl) \
	27	_(ge) _(lt) _(gt) _(le) _(hi) _(ls) _(al) _(nv) \
	28	_(hs) _(lo)
	29	#define _INST(_) \
	30	_(ccmp) _(ccmn) \
	31	_(csel) _(cmov) \
	32	_(csinc) _(cinc) _(cset) \
	33	_(csneg) _(cneg) \
	34	_(csinv) _(cinv) _(csetm)
	35	#define _CONDVAR(cc) _definstvar cc;
	36	#define _INSTVARS(inst) \
	37	.macro _definstvar cc; \
	38	.macro inst.\cc args:vararg; inst \args, \cc; .endm; \
	39	.endm; \
	40	_COND(_CONDVAR); \
	41	.purgem _definstvar;
	42	_INST(_INSTVARS)
	43	#undef _COND
	44	#undef _INST
	45	#undef _CONDVAR
	46	#undef _INSTVARS
	47
		// individual flag bits, in the layout of the 4-bit nzcv immediate
		// operand of `ccmp'/`ccmn'.
	48	#define CCMP_N 8
	49	#define CCMP_Z 4
	50	#define CCMP_C 2
	51	#define CCMP_V 1
	52
		// CCMP_cc is a flags value under which condition cc holds, for use as
		// the fallback flags of a conditional compare.
	53	#define CCMP_MI CCMP_N
	54	#define CCMP_PL 0
	55	#define CCMP_EQ CCMP_Z
	56	#define CCMP_NE 0
	57	#define CCMP_CS CCMP_C
	58	#define CCMP_HS CCMP_C
	59	#define CCMP_CC 0
	60	#define CCMP_LO 0
	61	#define CCMP_VS CCMP_V
	62	#define CCMP_VC 0
	63	#define CCMP_HI CCMP_C
	64	#define CCMP_LS 0
	65	#define CCMP_LT CCMP_N
	66	#define CCMP_GE 0
	67	#define CCMP_LE CCMP_N
	68	#define CCMP_GT 0
	69
	70	#else
	71	# error "not supported"
	72	#endif
	73
		// proc NAME: begin a global function called NAME, aligned to a 16-byte
		// boundary.  this also defines a one-shot `endproc' macro which records
		// the function's size and then purges itself, so every `proc' must be
		// closed by `endproc' before the next `proc'.
	74	.macro proc name
	75	.globl \name
	76	.type \name, STT_FUNC
	77	.p2align 4
	78	\name\():
	79	.macro endproc
	80	.size \name, . - \name
	81	.purgem endproc
	82	.endm
	83	.endm
	84
	85	.macro ch c
	86	#if defined(__i386__)
	87
	88	pushf
	89	push eax
	90	push ebx
	91	push ecx
	92	push edx
	93	push ebp
	94	mov ebp, esp
	95	and esp, -16
	96
	97	push \c
	98	call putchar@plt
	99
	100	call get_pc_ebx
	101	add ebx, offset _GLOBAL_OFFSET_TABLE
	102	mov eax, [ebx + stdout@GOT]
	103	mov eax, [eax]
	104	call fflush@plt
	105
	106	mov esp, ebp
	107	pop ebp
	108	pop edx
	109	pop ecx
	110	pop ebx
	111	pop eax
	112	popf
	113
	114	#elif defined(__x86_64__)
	115
	116	pushf
	117	push rax
	118	push rcx
	119	push rdx
	120	push rsi
	121	push rdi
	122	push r8
	123	push r9
	124	push rbp
	125	mov rbp, rsp
	126	and rsp, -16
	127
	128	mov rdi, \c
	129	call putchar@plt
	130
	131	mov rdi, [rip + stdout]
	132	call fflush@plt
	133
	134	mov rsp, rbp
	135	pop rbp
	136	pop r9
	137	pop r8
	138	pop rdi
	139	pop rsi
	140	pop rdx
	141	pop rcx
	142	pop rax
	143	popf
	144
	145	#elif defined(__arm__)
	146
	147	stmfd r13!, {r0-r4, r12, r14}
	148
	149	mov r4, r13
	150	bic r14, r4, #15
	151	mov r13, r14
	152
	153	mov r0, #\c
	154	bl putchar@plt
	155
	156	ldr r14, .L$_c$gotoff$\@
	157	.L$_c$gotpc$\@:
	158	add r14, pc, r14
	159	b .L$_c$cont$\@
	160	.L$_c$gotoff$\@:
	161	.word _GLOBAL_OFFSET_TABLE - .L$_c$gotpc$\@ - 8
	162	.L$_c$cont$\@:
	163	bl fflush@plt
	164
	165	mov r13, r4
	166	ldmfd r13!, {r0-r4, r12, r14}
	167
	168	#elif defined(__aarch64__)
	169
	170	sub sp, sp, #20*8
	171	stp x0, x1, [sp, #0]
	172	stp x2, x3, [sp, #16]
	173	stp x4, x5, [sp, #32]
	174	stp x6, x7, [sp, #48]
	175	stp x8, x9, [sp, #64]
	176	stp x10, x11, [sp, #80]
	177	stp x12, x13, [sp, #96]
	178	stp x14, x15, [sp, #112]
	179	stp x16, x17, [sp, #128]
	180	mrs x16, nzcv
	181	stp x16, x30, [sp, #144]
	182
	183	mov w0, #\c
	184	bl putchar
	185	adrp x0, :got:stdout
	186	ldr x0, [x0, #:got_lo12:stdout]
	187	ldr x0, [x0]
	188	bl fflush
	189
	190	ldp x16, x30, [sp, #144]
	191	msr nzcv, x16
	192	ldp x16, x17, [sp, #128]
	193	ldp x14, x15, [sp, #112]
	194	ldp x12, x13, [sp, #96]
	195	ldp x10, x11, [sp, #80]
	196	ldp x8, x9, [sp, #64]
	197	ldp x6, x7, [sp, #48]
	198	ldp x4, x5, [sp, #32]
	199	ldp x2, x3, [sp, #16]
	200	ldp x0, x1, [sp, #0]
	201	add sp, sp, #20*8
	202
	203	#else
	204	# error "not supported"
	205	#endif
	206	.endm
	207
	208	.macro notimpl
	209	#if defined(__i386__) \|\| defined(__x86_64__)
	210	ud2
	211	#elif defined(__arm__)
	212	udf
	213	#elif defined(__aarch64__)
	214	hlt #0
	215	#else
	216	# error "not supported"
	217	#endif
	218	.endm
	219
	220	.section .note.GNU-stack, "", %progbits // empty note: no executable stack wanted
	221
	222	.text // everything from here on is code
	223
	224	#if defined(__i386__)
		// get_pc_ebx: return, in ebx, the address of the instruction following
		// the `call' which got us here.  used as the base for position-
		// independent address arithmetic on i386.  changes no other register.
	225	get_pc_ebx:
	226	mov ebx, [esp] // the return address is on top of the stack
	227	ret
	228	#endif
	229
	230
	231	proc call_example
	232
		// call_example(func, regs): load the general-purpose registers and
		// flags from the block at regs, call func, and then store the
		// resulting registers and flags back into the same block.
		//
		// func and regs are the first and second arguments: stack slots on
		// i386, rdi and rsi on x86-64, r0 and r1 on a32, x0 and x1 on a64.
		//
		// layout of the regs block, as read and written below:
		//
		// i386: eax, ebx, ecx, edx, esi, edi, ebp, eflags (4 bytes each)
		// x86-64: rax, rbx, rcx, rdx, rsi, rdi, rbp, r8--r15, rflags
		// (8 bytes each)
		// a32: r0--r12, cpsr (4 bytes each)
		// a64: x0--x17, x19--x29, nzcv (8 bytes each; no slot for x18)
		//
		// the trailing comments on the x86 pushes and pops list what is on
		// the stack at that point, top first.
	233	#if defined(__i386__)
	234
	235	push ebx // ebx
	236	push esi // esi, ebx
	237	push edi // edi, esi, ebx
	238	push ebp // ebp, ..., ebx
	239	pushf // flags, ebp, ..., ebx
	240
	241	mov edi, [esp + 4*6] // func: five saved words and the return address lie below it
	242	mov esi, [esp + 4*7] // regs
	243	push esi // regs, flags, ebp, ..., ebx
	244
	245	call get_pc_ebx // ebx = address of the `lea' which follows
	246	lea eax, [ebx + 9f - .] // eax = address of the continuation at 9
	247	push eax // cont, regs, flags, ebp, ..., ebx
	248	push edi // func, cont, regs, flags, ebp, ..., ebx
	249
	250	mov eax, [esi + 28] // flags wanted by the caller
	251	pushf
	252	pop ecx // flags as they are now
	253	and eax, 0x0cd5 // take cf, pf, af, zf, sf, df, of from the block
	254	and ecx, ~0x0cd5 // and all other bits from the live flags
	255	or eax, ecx
	256	push eax
	257	popf
	258	mov eax, [esi + 0]
	259	mov ebx, [esi + 4]
	260	mov ecx, [esi + 8]
	261	mov edx, [esi + 12]
	262	mov edi, [esi + 20]
	263	mov ebp, [esi + 24]
	264	mov esi, [esi + 16] // esi last: it was the pointer to the block
	265
	266	ret // -> func; regs, flags, ebp, ..., ebx
	267
	268	9: pushf // eflags, regs, flags, ebp, ..., ebx
	269	push esi // esi, eflags, regs, flags, ebp, ..., ebx
	270	mov esi, [esp + 8] // recover the pointer to the block
	271	mov [esi + 0], eax
	272	mov [esi + 4], ebx
	273	mov [esi + 8], ecx
	274	mov [esi + 12], edx
	275	mov [esi + 20], edi
	276	mov [esi + 24], ebp
	277	pop eax // eflags, regs, flags, ebp, ..., ebx
	278	mov [esi + 16], eax // this is func's final esi
	279	pop eax // regs, flags, ebp, ..., ebx
	280	mov [esi + 28], eax // and this is func's final eflags
	281
	282	add esp, 4 // flags, ebp, ..., ebx
	283	popf // ebp, ..., ebx
	284	pop ebp // ..., ebx
	285	pop edi
	286	pop esi
	287	pop ebx //
	288	ret
	289
	290	#elif defined(__x86_64__)
	291
		// NOTE(review): by my count func is entered with rsp a multiple of 16
		// rather than 8 mod 16 as the sysv abi specifies.  harmless for leaf
		// code, and `ch' realigns the stack itself -- but confirm before
		// letting func call out directly.
	292	push rbx // rbx
	293	push r10
	294	push r11
	295	push r12
	296	push r13
	297	push r14
	298	push r15
	299	push rbp // rbp, ..., rbx
	300	pushf // flags, rbp, ..., rbx
	301
	302	push rsi // regs, flags, rbp, ..., rbx
	303
	304	lea rax, [rip + 9f] // rax = address of the continuation at 9
	305	push rax // cont, regs, flags, rbp, ..., rbx
	306	push rdi // func, cont, regs, flags, rbp, ..., rbx
	307
	308	mov rax, [rsi + 8*15] // flags wanted by the caller
	309	pushf
	310	pop rcx // flags as they are now
	311	and rax, 0x0cd5 // take cf, pf, af, zf, sf, df, of from the block
	312	and rcx, ~0x0cd5 // and all other bits from the live flags
	313	or rax, rcx
	314	push rax
	315	popf
	316	mov rax, [rsi + 0]
	317	mov rbx, [rsi + 8]
	318	mov rcx, [rsi + 16]
	319	mov rdx, [rsi + 24]
	320	mov rdi, [rsi + 40]
	321	mov rbp, [rsi + 48]
	322	mov r8, [rsi + 56]
	323	mov r9, [rsi + 64]
	324	mov r10, [rsi + 72]
	325	mov r11, [rsi + 80]
	326	mov r12, [rsi + 88]
	327	mov r13, [rsi + 96]
	328	mov r14, [rsi + 104]
	329	mov r15, [rsi + 112]
	330	mov rsi, [rsi + 32] // rsi last: it was the pointer to the block
	331
	332	ret // -> func; regs, flags, rbp, ..., rbx
	333
	334	9: pushf // rflags, regs, flags, rbp, ..., rbx
	335	push rsi // rsi, rflags, regs, flags, rbp, ..., rbx
	336	mov rsi, [rsp + 16] // recover the pointer to the block
	337	mov [rsi + 0], rax
	338	mov [rsi + 8], rbx
	339	mov [rsi + 16], rcx
	340	mov [rsi + 24], rdx
	341	mov [rsi + 40], rdi
	342	mov [rsi + 48], rbp
	343	mov [rsi + 56], r8
	344	mov [rsi + 64], r9
	345	mov [rsi + 72], r10
	346	mov [rsi + 80], r11
	347	mov [rsi + 88], r12
	348	mov [rsi + 96], r13
	349	mov [rsi + 104], r14
	350	mov [rsi + 112], r15
	351	pop rax // rflags, regs, flags, rbp, ..., rbx
	352	mov [rsi + 32], rax // this is func's final rsi
	353	pop rax // regs, flags, rbp, ..., rbx
	354	mov [rsi + 120], rax // and this is func's final rflags
	355
	356	add rsp, 8 // flags, rbp, ..., rbx
	357	popf // rbp, ..., rbx
	358	pop rbp // ..., rbx
	359	pop r15
	360	pop r14
	361	pop r13
	362	pop r12
	363	pop r11
	364	pop r10
	365	pop rbx //
	366	ret
	367
	368	#elif defined(__arm__)
	369
	370	stmfd r13!, {r0, r1, r4-r11, r14} // stack: func, regs, r4--r11, r14
	371	ldmia r1, {r0-r12, r14} // load r0--r12; the cpsr word lands in r14
	372	msr cpsr, r14 // install the caller's flags
	373	mov r14, pc // return address: the `ldr r14' two instructions on
	374	ldr pc, [r13], #4 // pop func and jump to it
	375	ldr r14, [r13], #4 // back again: pop the pointer to the block
	376	stmia r14!, {r0-r12} // store r0--r12, leaving r14 at the cpsr slot
	377	mrs r0, cpsr
	378	str r0, [r14]
	379	ldmfd r13!, {r4-r11, pc} // restore callee-saved registers and return
	380
	381	#elif defined(__aarch64__)
	382
	383	stp x29, x30, [sp, #-14*8]! // 14-slot frame: x29, x30, x19--x28, pad, regs
	384	mov x29, sp
	385	stp x19, x20, [sp, #16]
	386	stp x21, x22, [sp, #32]
	387	stp x23, x24, [sp, #48]
	388	stp x25, x26, [sp, #64]
	389	stp x27, x28, [sp, #80]
	390	str x1, [sp, #104] // keep the pointer to the block for later
	391
	392	ldp x29, x30, [x1, #224] // x29 and the nzcv word
	393	msr nzcv, x30 // install the caller's flags
	394	mov x30, x0 // x30 = func, freeing x0 to be loaded
	395	ldp x27, x28, [x1, #208]
	396	ldp x25, x26, [x1, #192]
	397	ldp x23, x24, [x1, #176]
	398	ldp x21, x22, [x1, #160]
	399	ldp x19, x20, [x1, #144]
	400	ldp x16, x17, [x1, #128]
	401	ldp x14, x15, [x1, #112]
	402	ldp x12, x13, [x1, #96]
	403	ldp x10, x11, [x1, #80]
	404	ldp x8, x9, [x1, #64]
	405	ldp x6, x7, [x1, #48]
	406	ldp x4, x5, [x1, #32]
	407	ldp x2, x3, [x1, #16]
	408	ldp x0, x1, [x1, #0] // x1 last: it was the pointer to the block
	409
	410	blr x30
	411
	412	ldr x30, [sp, #104] // recover the pointer to the block
	413	stp x27, x28, [x30, #208]
	414	stp x25, x26, [x30, #192]
	415	stp x23, x24, [x30, #176]
	416	stp x21, x22, [x30, #160]
	417	stp x19, x20, [x30, #144]
	418	stp x16, x17, [x30, #128]
	419	stp x14, x15, [x30, #112]
	420	stp x12, x13, [x30, #96]
	421	stp x10, x11, [x30, #80]
	422	stp x8, x9, [x30, #64]
	423	stp x6, x7, [x30, #48]
	424	stp x4, x5, [x30, #32]
	425	stp x2, x3, [x30, #16]
	426	stp x0, x1, [x30, #0]
	427	mov x0, x30 // x0 is stored now, so it can hold the pointer
	428	mrs x30, nzcv
	429	stp x29, x30, [x0, #224] // func's final x29 and flags
	430
	431	ldp x19, x20, [sp, #16]
	432	ldp x21, x22, [sp, #32]
	433	ldp x23, x24, [sp, #48]
	434	ldp x25, x26, [sp, #64]
	435	ldp x27, x28, [sp, #80]
	436	ldp x29, x30, [sp], #14*8
	437
	438	ret
	439
	440	#else
	441	# error "not supported"
	442	#endif
	443
	444	endproc
	445
	446	proc nop
	447
		// do nothing at all: return immediately.
	448	ret
	449
	450	endproc
	451
	452	///--------------------------------------------------------------------------
	453	/// 0x00--0x0f
	454
	455	proc x00
	456
	457	// clear all 64 bits of extended traditional registers
		//
		// each instruction (or little loop) zeroes a different register by a
		// different method.  on x86-64 and a64, writing a 32-bit register
		// clears the top half of the 64-bit register too.
	458
	459	#if defined(__x86_64__)
	460
	461	xor eax, eax // clear rax
	462	lea rbx, [0] // rbx -> _|_
	463	loop . // iterate, decrement rcx until zero
	464	mov rdx, 0 // set rdx = 0
	465	and esi, 0 // clear all bits of rsi
	466	sub edi, edi // set rdi = edi - edi = 0
	467	push 0
	468	pop rbp // pop 0 into rbp
	469
	470	#elif defined(__i386__)
	471
	472	xor eax, eax
	473	lea ebx, [0]
	474	loop .
	475	mov edx, 0
	476	and esi, 0
	477	sub edi, edi
	478	push 0
	479	pop ebp
	480
	481	#elif defined(__arm__)
	482
	483	eor r0, r0, r0 // r0 = r0 XOR r0 = 0
	484	rsb r1, r1, r1 // r1 = r1 - r1 = 0
	485	0: subs r2, r2, #1 // count r2 down until it reaches zero
	486	bne 0b
	487	mov r3, #0
	488	and r4, r4, #0
	489	sub r5, r5, r5
	490
	491	#elif defined(__aarch64__)
	492
	493	eor w0, w0, w0 // x0 = 0
	494	mov w1, wzr // copy the zero register
	495	0: sub w2, w2, #1 // count w2 down until it reaches zero
	496	cbnz w2, 0b
	497	mov w3, #0
	498	and w4, w4, wzr
	499	sub w5, w5, w5
	500
	501	#else
	502	notimpl
	503	#endif
	504
	505	ret
	506
	507	endproc
	508
	509	proc x01
	510
	511	// advance a fibonacci pair by c steps
	512	//
	513	// on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1}
	514	// and f_{i+c}, where f_{i+1} = f_i + f_{i-1}
		//
		// x86: a = rax/eax, d = rdx/edx, c = rcx/ecx.
		// arm: a = r0/x0, d = r3/x3, c = r2/x2; each trip round the loop
		// advances the pair by two steps, so that a and d can swap roles
		// without any register moves.  NOTE(review): as far as i can trace
		// it, when c is odd the arm versions leave d equal to the new a
		// rather than f_{i+c} -- confirm whether callers care.
	515
	516	#if defined(__x86_64__)
	517
	518	0: xadd rax, rdx // a, d = a + d, a
	519	// = f_{i+1} + f_i, f_{i+1}
	520	// = f_{i+2}, f_{i+1}
	521	loop 0b // advance i, decrement c, iterate
	522
	523	#elif defined(__i386__)
	524
	525	0: xadd eax, edx
	526	loop 0b
	527
	528	#elif defined(__arm__)
	529
	530	0: subs r2, r2, #2 // two steps per iteration
	531	add r3, r3, r0 // d' = d + a: one step on, with d now leading
	532	blo 8f // only one step was wanted: finish up
	533	add r0, r0, r3 // a' = a + d': second step, a leads again
	534	bhi 0b // more steps remain
	535
	536	8: movne r0, r3 // odd count: the answer is in d, so copy it
	537
	538	#elif defined(__aarch64__)
	539
	540	0: subs x2, x2, #2 // two steps per iteration
	541	add x3, x3, x0 // d' = d + a: one step on, with d now leading
	542	b.lo 8f // only one step was wanted: finish up
	543	add x0, x0, x3 // a' = a + d': second step, a leads again
	544	b.hi 0b // more steps remain
	545
	546	8: cmov.ne x0, x3 // odd count: the answer is in d, so copy it
	547
	548	#else
	549	notimpl
	550	#endif
	551
	552	ret
	553
	554	endproc
	555
	556	proc x02
	557
	558	// boolean canonify a: if a = 0 on entry, leave it zero; otherwise
	559	// set a = 1
		//
		// a = rax/eax on x86, r0/x0 on arm.  the arm versions first show some
		// alternative methods, leaving their answers in r1/x1, r2/x2 and
		// r3/x3, before doing the literal translation in place.
	560
	561	#if defined(__x86_64__)
	562
	563	neg rax // set cf iff a /= 0
	564	sbb rax, rax // a = a - a - cf = -cf
	565	neg rax // a = cf
	566
	567	#elif defined(__i386__)
	568
	569	neg eax
	570	sbb eax, eax
	571	neg eax
	572
	573	#elif defined(__arm__)
	574
	575	movs r1, r0 // the easy way
	576	movne r1, #1 // mvnne r1, #0 for mask
	577
	578	cmp r0, #1 // clear cf iff a == 0
	579	sbc r2, r0, r0 // c' = a - a - 1 + cf = cf - 1
	580	add r2, r2, #1 // c' = cf
	581
	582	sub r3, r0, r0, lsr #1 // d' = ceil(a/2) <= 2^31; d' = 0 iff a = 0
	583	rsb r3, r3, #0 // d' top bit set iff a /= 0
	584	mov r3, r3, lsr #31 // asr for mask
	585
	586	rsbs r0, r0, #0 // a' = -a; cf set iff a == 0
	587	sbc r0, r0, r0 // a' = cf - 1 = -[a /= 0]
	588	rsb r0, r0, #0 // a' = [a /= 0]
	589
	590	#elif defined(__aarch64__)
	591
	592	cmp x0, #0 // trivial
	593	cset.ne x1 // csetm for mask
	594
	595	cmp xzr, x0 // set cf iff a == 0
	596	sbc x2, x0, x0 // c' = a - a - 1 + cf = cf - 1
	597	neg x2, x2 // c' = 1 - cf
	598
	599	sub x3, x0, x0, lsr #1 // if a < 2^63 then d' = ceil(a/2) <
	600	// 2^63
	601	// if a >= 2^63, write a = 2^63 + t
	602	// with t < 2^63; d' = 2^63 - 2^62 +
	603	// ceil(t/2) = 2^62 + ceil(t/2), and
	604	// ceil(t/2) <= 2^62
	605	// anyway d' <= 2^63 and d' = 0 iff
	606	// a = 0
	607	neg x3, x3 // d' top bit set iff a /= 0
	608	lsr x3, x3, #63 // asr for mask
	609
	610	cmp x0, #1 // set cf iff a /= 0
	611	adc x0, xzr, xzr // a' = 0 + 0 + cf = cf
	612
	613	#else
	614	notimpl
	615	#endif
	616
	617	ret
	618
	619	endproc
	620
	621	proc x03
	622
	623	// set a = min(a, d) (unsigned); clobber c, d
		//
		// a = rax/eax/r0/x0 and d = rdx/edx/r3/x3.  the scratch register is
		// rcx/ecx on x86, r12 on a32 and x16 on a64.  the arm versions also
		// compute the minimum the easy way into r1/x1 first.  note that the
		// arm carry flag has the opposite sense after a subtraction: it is
		// set when there was no borrow.
	624
	625	#if defined(__x86_64__)
	626
	627	sub rdx, rax // d' = d - a; set cf if a > d
	628	sbb rcx, rcx // c = -cf = -[a > d]
	629	and rcx, rdx // c = a > d ? d - a : 0
	630	add rax, rcx // a' = a > d ? d : a
	631
	632	#elif defined(__i386__)
	633
	634	sub edx, eax
	635	sbb ecx, ecx
	636	and ecx, edx
	637	add eax, ecx
	638
	639	#elif defined(__arm__)
	640
	641	cmp r0, r3 // the easy way
	642	movlo r1, r0 // only needed for out-of-place
	643	movhs r1, r3
	644
	645	subs r3, r3, r0 // d' = d - a; set cf if d >= a
	646	sbc r12, r12, r12 // t = -1 + cf = -[a > d]
	647	and r12, r12, r3 // t = a > d ? d - a : 0
	648	add r0, r0, r12 // a' = a > d ? d : a
	649
	650	#elif defined(__aarch64__)
	651
	652	cmp x0, x3 // the easy way
	653	csel.lo x1, x0, x3
	654
	655	subs x3, x3, x0 // d' = d - a; set cf if d >= a
	656	sbc x16, xzr, xzr // t = -1 + cf = -[a > d]
	657	and x16, x16, x3 // t = a > d ? d - a : 0
	658	add x0, x0, x16 // a' = a > d ? d : a
	659
	660	#else
	661	notimpl
	662	#endif
	663
	664	ret
	665
	666	endproc
	667
	668	proc x04
	669
	670	// switch case?
		//
		// the final instruction flips bit 5 (0x20) of the character in a,
		// which is the bit that distinguishes upper- from lower-case ascii
		// letters.
		//
		// the `unrelated playing' beforehand treats a as a hex digit
		// character and leaves its value in rbx/ebx/r1/x1, or -1 if it isn't
		// one: first try `0'--`9', then fold to lower case and try `a'--`f'.
		// both range checks are single unsigned comparisons after
		// subtracting the base of the range.
	671
	672	#if defined(__x86_64__)
	673
	674	// unrelated playing
	675	mov ecx, eax
	676	mov rbx, -1 // assume it's not a hex digit
	677	mov edx, ecx
	678	sub edx, '0'
	679	cmp edx, 10
	680	cmovb rbx, rdx // decimal digit: value is ch - '0'
	681	or ecx, 0x20 // fold letters to lower case
	682	mov edx, ecx
	683	sub edx, 'a'
	684	sub ecx, 'a' - 10
	685	cmp edx, 6
	686	cmovb rbx, rcx // `a'--`f': value is ch - 'a' + 10
	687
	688	xor al, 0x20
	689
	690	#elif defined(__i386__)
	691
	692	// unrelated playing
	693	mov ecx, eax
	694	mov ebx, -1
	695	mov edx, ecx
	696	sub edx, '0'
	697	cmp edx, 10
	698	cmovb ebx, edx
	699	or ecx, 0x20
	700	mov edx, ecx
	701	sub edx, 'a'
	702	sub ecx, 'a' - 10
	703	cmp edx, 6
	704	cmovb ebx, ecx
	705
	706	xor al, 0x20
	707
	708	#elif defined(__arm__)
	709
	710	// unrelated playing
	711	mvn r1, #0 // assume it's not a hex digit
	712	sub r12, r0, #'0'
	713	cmp r12, #10
	714	movlo r1, r12
	715	orr r12, r0, #0x20
	716	sub r12, r12, #'a'
	717	cmp r12, #6
	718	addlo r1, r12, #10
	719
	720	eor r0, r0, #0x20
	721
	722	#elif defined(__aarch64__)
	723
	724	// unrelated playing
	725	mov x1, #-1 // assume it's not a hex digit
	726	sub w16, w0, #'0'
	727	cmp w16, #10
	728	cmov.lo x1, x16
	729	orr w16, w0, #0x20
	730	sub w16, w16, #'a' - 10
	731	cmp w16, #10
	732	ccmp.hs w16, #16, #CCMP_HS // lo afterwards iff 10 <= w16 < 16
	733	cmov.lo x1, x16
	734
	735	eor w0, w0, #0x20
	736
	737	#else
	738	notimpl
	739	#endif
	740
	741	ret
	742
	743	endproc
	744
	745	proc x05
	746
	747	// answer whether 5 <= a </<= 9.
		//
		// the answer is left in the flags: subtracting the lower bound first
		// turns the two-sided range check into a single unsigned comparison.
		// the x86 and a32 versions leave a - 5 in the a register; the a64
		// version leaves x0 alone.
	748
	749	#if defined(__x86_64__)
	750
	751	sub rax, 5 // a' = a - 5
	752	cmp rax, 4 // is a' = a - 5 </<= 4?
	753
	754	// cc a' a
	755	//
	756	// z/e a' = 4 a = 9
	757	// nz/ne a' /= 4 a /= 9
	758	//
	759	// a/nbe a' > 4 a > 9 or a < 5
	760	// nc/ae/nb a' >= 4 a >= 9 or a < 5
	761	// c/b/nae a' < 4 5 <= a < 9
	762	// be/na a' <= 4 5 <= a <= 9
	763	//
	764	// o a' < -2^63 + 4 -2^63 + 5 <= a < -2^63 + 9
	765	// no a' >= -2^63 + 4 a >= -2^63 + 9 or
	766	// a < -2^63 + 5
	767	// s -2^63 + 4 <= a' < 4 -2^63 + 9 <= a < 9
	768	// ns a' < -2^63 + 4 or a < -2^63 + 9 or a >= 9
	769	// a' >= 4
	770	// ge/nl a' >= 4 a >= 9 or a < -2^63 + 5
	771	// l/nge a' < 4 -2^63 + 5 <= a < 9
	772	// g/nle a' > 4 a > 9 or a < -2^63 + 5
	773	// le/ng a' <= 4 -2^63 + 5 <= a <= 9
	774
	775	#elif defined(__i386__)
	776
	777	sub eax, 5
	778	cmp eax, 4
	779
	780	#elif defined(__arm__)
	781
	782	// i dimly remember having a slick way to do this way back in the
	783	// day, but i can't figure it out any more.
	784	sub r0, #5
	785	cmp r0, #4
	786
	787	#elif defined(__aarch64__)
	788
	789	// literal translation is too obvious
	790	cmp x0, #5
	791	ccmp.hs x0, #9, #CCMP_HS // if a >= 5, compare with 9; else force `hs'
		// so `lo' holds afterwards iff 5 <= a < 9
	792
	793	#else
	794	notimpl
	795	#endif
	796
	797	ret
	798
	799	endproc
	800
	801	proc x06
	802
	803	// leave a unchanged, but set zf if a = 0, cf if a /= 0, clear of,
	804	// set sf to msb(a)
		//
		// NOTE(review): x86 `neg' sets of when its operand is the most
		// negative value, which here happens exactly when a is; so of is
		// presumably clear only for other values of a -- confirm.
	805
	806	#if defined(__x86_64__)
	807
	808	not rax // a' = -a - 1
	809	inc rax // a' = -a
	810	neg rax // a' = a
	811
	812	#elif defined(__i386__)
	813
	814	not eax
	815	inc eax
	816	neg eax
	817
	818	#elif defined(__arm__)
	819
	820	mvn r0, r0 // a' = -a - 1
	821	add r0, r0, #1 // a' = -a
	822	rsbs r0, r0, #0 // cf has opposite sense
	823
	824	#elif defined(__aarch64__)
	825
	826	mvn x0, x0 // a' = -a - 1
	827	add x0, x0, #1 // a' = -a
	828	negs x0, x0 // cf has opposite sense
	829
	830	#else
	831	notimpl
	832	#endif
	833
	834	ret
	835
	836	endproc
	837
	838	proc x07
	839
	840	// same as before (?)
		//
		// a is negated twice, with increments on either side of the first
		// negation cancelling out, so a is unchanged and the flags come from
		// the final negation, as in x06.
	841
	842	#if defined(__x86_64__)
	843
	844	inc rax // a' = a + 1
	845	neg rax // a' = -a - 1
	846	inc rax // a' = -a
	847	neg rax // a' = a
	848
	849	#elif defined(__i386__)
	850
	851	inc eax
	852	neg eax
	853	inc eax
	854	neg eax
	855
	856	#elif defined(__arm__)
	857
	858	add r0, r0, #1 // a' = a + 1
	859	rsb r0, r0, #0 // a' = -a - 1
	860	add r0, r0, #1 // a' = -a
	861	rsbs r0, r0, #0 // a' = a; cf has opposite sense
	862
	863	#elif defined(__aarch64__)
	864
	865	add x0, x0, #1 // a' = a + 1
	866	neg x0, x0 // a' = -a - 1
	867	add x0, x0, #1 // a' = -a
	868	negs x0, x0 // cf has opposite sense
	869
	870	#else
	871	notimpl
	872	#endif
	873
	874	ret
	875
	876	endproc
	877
	878	proc x08
	879
	880	// floor((a + d)/2), correctly handling overflow conditions; final cf
	881	// is lsb(a + d), probably uninteresting
		//
		// a = rax/eax/r0/x0, d = rdx/edx/r3/x3.  the trick is to keep the
		// carry out of the addition as an extra top bit and shift it back in.
		// the arm versions show two methods each; the first method's answer
		// goes to r1/x1 and the second overwrites a.
	882
	883	#if defined(__x86_64__)
	884
	885	add rax, rdx // cf || a' = a + d
	886	rcr rax, 1 // shift 65-bit result right by one
	887	// place; lsb moves into carry
	888
	889	#elif defined(__i386__)
	890
	891	add eax, edx
	892	rcr eax, 1
	893
	894	#elif defined(__arm__)
	895
	896	// like the two-instruction a64 version
	897	sub r1, r3, r0 // difference d - a
	898	add r1, r0, r1, lsr #1 // a + floor((d - a)/2)
	899
	900	// the slick version, similar to the above
	901	adds r0, r0, r3 // cf || a' = a + d
	902	mov r0, r0, rrx // rotate right one place through carry
	903
	904	#elif defined(__aarch64__)
	905
	906	// a64 lacks a32's rrx. literal translation.
	907	adds x1, x0, x3 // cf || a' = a + d
	908	adc x16, xzr, xzr // realize cf in extra register
	909	extr x1, x16, x1, #1 // shift down one place
	910
	911	// two instruction version: clobbers additional register. (if you
	912	// wanted the answer in any other register, even overwriting d, then
	913	// this is unnecessary.) also depends on d >= a.
	914	sub x16, x3, x0 // compute difference
	915	add x0, x0, x16, lsr #1 // add half of it (rounded down)
	916
	917	#else
	918	notimpl
	919	#endif
	920
	921	ret
	922
	923	endproc
	924
	925	proc x09
	926
	927	// a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3
	928	// (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8).
		//
		// the last bit shifted out (bit 2 of a) says whether to round up.
		// the x86 and a32 shifts leave it in the carry flag; a64 has no
		// flag-setting shift, so that version tests the bit explicitly.
	929
	930	#if defined(__x86_64__)
	931
	932	shr rax, 3 // a' = floor(a/8); cf = 1 if a ==
	933	// 4, 5, 6, 7 (mod 8)
	934	adc rax, 0 // a' = floor(a/8) + cf
	935
	936	#elif defined(__i386__)
	937
	938	shr eax, 3
	939	adc eax, 0
	940
	941	#elif defined(__arm__)
	942
	943	movs r0, r0, lsr #3 // a' = floor(a/8); cf = bit 2 of a
	944	adc r0, r0, #0 // a' = floor(a/8) + cf
	945
	946	#elif defined(__aarch64__)
	947
	948	tst x0, #4 // `ne' iff bit 2 of a is set
	949	orr x0, xzr, x0, lsr #3 // a' = floor(a/8), leaving the flags alone
	950	cinc.ne x0, x0 // round up if that bit was set
	951
	952	#else
	953	notimpl
	954	#endif
	955
	956	ret
	957
	958	endproc
	959
	960	proc x0a
	961
	962	// increment c-byte little-endian bignum at rdi
		//
		// x86: pointer in rdi/edi, count in rcx/ecx.  the carry survives from
		// one `adc' to the next because neither `inc' nor `loop' changes cf.
		// note that the loop body runs c times after the initial `add', so
		// this version touches c + 1 bytes in all.
		//
		// arm: pointer in r5/x5, count in r2/x2, and exactly c bytes are
		// processed.  the carry is kept in bit 8 of the running sum rather
		// than in the flags; r0 and r12 (a32) or w16 and w17 (a64) are
		// clobbered.
	963
	964	#if defined(__x86_64__)
	965
	966	add byte ptr [rdi], 1
	967	0: inc rdi
	968	adc byte ptr [rdi], 0
	969	loop 0b
	970
	971	#elif defined(__i386__)
	972
	973	add byte ptr [edi], 1
	974	0: inc edi
	975	adc byte ptr [edi], 0
	976	loop 0b
	977
	978	#elif defined(__arm__)
	979
	980	mov r12, #256 // set initial carry
	981	0: ldrb r0, [r5]
	982	subs r2, r2, #1
	983	add r12, r0, r12, lsr #8 // byte + carry in; carry out lands in bit 8
	984	strb r12, [r5], #1 // store the low byte and step on
	985	bne 0b
	986
	987	#elif defined(__aarch64__)
	988
	989	mov w17, #256 // set initial carry
	990	0: ldrb w16, [x5]
	991	sub x2, x2, #1
	992	add w17, w16, w17, lsr #8 // byte + carry in; carry out lands in bit 8
	993	strb w17, [x5], #1 // store the low byte and step on
	994	cbnz x2, 0b
	995
	996	#else
	997	notimpl
	998	#endif
	999
	1000	ret
	1001
	1002	endproc
	1003
	1004	proc x0b
	1005
	1006	// negate double-precision d:a
		//
		// a (rax/eax/r0/x0) is the low half and d (rdx/edx/r3/x3) the high
		// half.  the low half is negated, and the borrow from that is
		// propagated into the negation of the high half.
	1007
	1008	#if defined(__x86_64__)
	1009
	1010	not rdx // d' = -d - 1
	1011	neg rax // a' = -a;
	1012	// cf = 1 iff a /= 0
	1013	sbb rdx, -1 // d' = -d - cf
	1014
	1015	#elif defined(__i386__)
	1016
	1017	not edx
	1018	neg eax
	1019	sbb edx, -1
	1020
	1021	#elif defined(__arm__)
	1022
	1023	// reverse subtract is awesome
	1024	rsbs r0, r0, #0 // a' = 0 - a, setting the carry
	1025	rsc r3, r3, #0 // d' = 0 - d, less the borrow
	1026
	1027	#elif defined(__aarch64__)
	1028
	1029	// easy way: everything is better with zero registers.
	1030	negs x0, x0 // a' = 0 - a, setting the carry
	1031	ngc x3, x3 // d' = 0 - d, less the borrow
	1032
	1033	#else
	1034	notimpl
	1035	#endif
	1036
	1037	ret
	1038
	1039	endproc
	1040
	1041	proc x0c
	1042
	1043	// rotate is distributive over xor.
		//
		// compute ror(a XOR b, 13) and ror(a, 13) XOR ror(b, 13) and compare
		// them: the final comparison always reports equality.  in the
		// commentary, a_1 || a_0 denotes a split into its upper part a_1 and
		// its low 13 bits a_0, so rotating swaps the two parts.  a and b are
		// both overwritten.
	1044
	1045	#if defined(__x86_64__)
	1046
	1047	// rax // = a_1 || a_0
	1048	// rbx // = b_1 || b_0
	1049	mov rcx, rax // = a_1 || a_0
	1050
	1051	xor rcx, rbx // = (a_1 XOR b_1) || (a_0 XOR b_0)
	1052	ror rcx, 0xd // = (a_0 XOR b_0) || (a_1 XOR b_1)
	1053
	1054	ror rax, 0xd // = a_0 || a_1
	1055	ror rbx, 0xd // = b_0 || b_1
	1056	xor rax, rbx // = (a_0 XOR b_0) || (a_1 XOR b_1)
	1057
	1058	cmp rax, rcx // always equal
	1059
	1060	#elif defined(__i386__)
	1061
	1062	mov ecx, eax // = a_1 || a_0
	1063
	1064	xor ecx, ebx // = (a_1 XOR b_1) || (a_0 XOR b_0)
	1065	ror ecx, 0xd // = (a_0 XOR b_0) || (a_1 XOR b_1)
	1066
	1067	ror eax, 0xd // = a_0 || a_1
	1068	ror ebx, 0xd // = b_0 || b_1
	1069	xor eax, ebx // = (a_0 XOR b_0) || (a_1 XOR b_1)
	1070
	1071	cmp eax, ecx // always equal
	1072
	1073	#elif defined(__arm__)
	1074
	1075
	1076	// r0 // = a_1 || a_0
	1077	// r1 // = b_1 || b_0
	1078	eor r2, r0, r1 // = (a_1 XOR b_1) || (a_0 XOR b_0)
	1079	mov r2, r2, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
	1080
	1081	mov r1, r1, ror #13 // = b_0 || b_1
	1082	eor r0, r1, r0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
	1083
	1084	cmp r0, r2 // always equal
	1085
	1086	#elif defined(__aarch64__)
	1087
	1088	// x0 // = a_1 || a_0
	1089	// x1 // = b_1 || b_0
	1090	eor x2, x0, x1 // = (a_1 XOR b_1) || (a_0 XOR b_0)
	1091	ror x2, x2, #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
	1092
	1093	ror x1, x1, #13 // = b_0 || b_1
	1094	eor x0, x1, x0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
	1095
	1096	cmp x0, x2 // always equal
	1097
	1098	#else
	1099	notimpl
	1100	#endif
	1101
	1102	ret
	1103
	1104	endproc
	1105
	1106	proc x0d
	1107
	1108	// and is distributive over xor.
		//
		// compute a AND (b XOR c) and (a AND b) XOR (a AND c) and compare
		// them: the final comparison always reports equality.  x86 takes
		// a, b, c in rax, rbx, rcx (eax, ebx, ecx) and uses rdx/edx as
		// scratch; arm takes them in r0, r1, r2 (x0, x1, x2) and uses r3/x3.
	1109
	1110	#if defined(__x86_64__)
	1111
	1112	mov rdx, rbx // = b
	1113
	1114	xor rbx, rcx // = b XOR c
	1115	and rbx, rax // = a AND (b XOR c)
	1116
	1117	and rdx, rax // = a AND b
	1118	and rax, rcx // = a AND c
	1119	xor rax, rdx // = (a AND b) XOR (a AND c)
	1120	// = a AND (b XOR c)
	1121
	1122	cmp rax, rbx // always equal
	1123
	1124	#elif defined(__i386__)
	1125
	1126	mov edx, ebx // = b
	1127
	1128	xor ebx, ecx // = b XOR c
	1129	and ebx, eax // = a AND (b XOR c)
	1130
	1131	and edx, eax // = a AND b
	1132	and eax, ecx // = a AND c
	1133	xor eax, edx // = (a AND b) XOR (a AND c)
	1134	// = a AND (b XOR c)
	1135
	1136	cmp eax, ebx // always equal
	1137
	1138	#elif defined(__arm__)
	1139
	1140	and r3, r0, r1 // = a AND b
	1141
	1142	eor r1, r1, r2 // = b XOR c
	1143	and r1, r1, r0 // = a AND (b XOR c)
	1144
	1145	and r0, r0, r2 // = a AND c
	1146	eor r0, r0, r3 // = (a AND b) XOR (a AND c)
	1147	// = a AND (b XOR c)
	1148
	1149	cmp r0, r1 // always equal
	1150
	1151	#elif defined(__aarch64__)
	1152
	1153	and x3, x0, x1 // = a AND b
	1154
	1155	eor x1, x1, x2 // = b XOR c
	1156	and x1, x1, x0 // = a AND (b XOR c)
	1157
	1158	and x0, x0, x2 // = a AND c
	1159	eor x0, x0, x3 // = (a AND b) XOR (a AND c)
	1160	// = a AND (b XOR c)
	1161
	1162	cmp x0, x1 // always equal
	1163
	1164	#else
	1165	notimpl
	1166	#endif
	1167
	1168	ret
	1169
	1170	endproc
	1171
	1172	proc x0e
	1173
	1174	// de morgan's law
		//
		// compute NOT (a AND b) and (NOT a) OR (NOT b) and compare them: the
		// final comparison always reports equality.  x86 takes a and b in
		// rax and rbx (eax and ebx) with rcx/ecx as scratch; arm takes them
		// in r0 and r1 (x0 and x1) with r2/x2 as scratch.
	1175
	1176	#if defined(__x86_64__)
	1177
	1178	mov rcx, rax // = a
	1179
	1180	and rcx, rbx // = a AND b
	1181	not rcx // = NOT (a AND b)
	1182
	1183	not rax // = NOT a
	1184	not rbx // = NOT b
	1185	or rax, rbx // = (NOT a) OR (NOT b)
	1186	// = NOT (a AND b)
	1187
	1188	cmp rax, rcx // always equal
	1189
	1190	#elif defined(__i386__)
	1191
	1192	mov ecx, eax // = a
	1193
	1194	and ecx, ebx // = a AND b
	1195	not ecx // = NOT (a AND b)
	1196
	1197	not eax // = NOT a
	1198	not ebx // = NOT b
	1199	or eax, ebx // = (NOT a) OR (NOT b)
	1200	// = NOT (a AND b)
	1201
	1202	cmp eax, ecx // always equal
	1203
	1204	#elif defined(__arm__)
	1205
	1206	and r2, r0, r1 // = a AND b
	1207	mvn r2, r2 // = NOT (a AND b)
	1208
	1209	mvn r0, r0 // = NOT a
	1210	mvn r1, r1 // = NOT b
	1211	orr r0, r0, r1 // = (NOT a) OR (NOT b)
	1212
	1213	cmp r0, r2 // always equal
	1214
	1215	#elif defined(__aarch64__)
	1216
	1217	and x2, x0, x1 // = a AND b
	1218	mvn x2, x2 // = NOT (a AND b)
	1219
	1220	mvn x0, x0 // = NOT a
	1221	orn x0, x0, x1 // = (NOT a) OR (NOT b)
	1222
	1223	cmp x0, x2 // always equal
	1224
	1225	#else
	1226	notimpl
	1227	#endif
	1228
	1229	ret
	1230
	1231	endproc
	1232
	1233	proc x0f
	1234
	1235	// replace input buffer bytes with cumulative XORs with initial a;
	1236	// final a is XOR of all buffer bytes and initial a.
	1237	//
	1238	// not sure why you'd do this.
		//
		// x86: buffer at rsi/esi, length in rcx/ecx, a in al.  each byte is
		// xored in place with the running value, and `lodsb' then reads the
		// result back into al and advances the pointer.
		//
		// arm: buffer at r4/x4, length in r2/x2, a in r0/w0; r12/w16 is
		// clobbered.
	1239
	1240	#if defined(__x86_64__)
	1241
	1242	0: xor [rsi], al
	1243	lodsb
	1244	loop 0b
	1245
	1246	#elif defined(__i386__)
	1247
	1248	0: xor [esi], al
	1249	lodsb
	1250	loop 0b
	1251
	1252	#elif defined(__arm__)
	1253
	1254	0: ldrb r12, [r4]
	1255	subs r2, r2, #1
	1256	eor r0, r0, r12 // fold the byte into the running value
	1257	strb r0, [r4], #1 // write it back and step on
	1258	bne 0b
	1259
	1260	#elif defined(__aarch64__)
	1261
	1262	0: ldrb w16, [x4]
	1263	sub x2, x2, #1
	1264	eor w0, w0, w16 // fold the byte into the running value
	1265	strb w0, [x4], #1 // write it back and step on
	1266	cbnz x2, 0b
	1267
	1268	#else
	1269	notimpl
	1270	#endif
	1271
	1272	ret
	1273
	1274	endproc
	1275
	1276	///--------------------------------------------------------------------------
	1277	/// 0x10--0x1f
	1278
	1279	proc x10
	1280
	1281	// four different ways to swap a pair of registers.
		//
		// the pair is rax and rcx (eax and ecx) on x86, r0 and r2 on a32, and
		// x0 and x2 on a64.  the four swaps run one after another, so the
		// registers end up where they started.  the methods are: via the
		// stack; three xors; add/subtract arithmetic; and finally a direct
		// exchange -- `xchg' on x86, three moves through a scratch register
		// (r12 or x16) on arm.
	1282
	1283	#if defined(__x86_64__)
	1284
	1285	push rax
	1286	push rcx
	1287	pop rax
	1288	pop rcx
	1289
	1290	xor rax, rcx
	1291	xor rcx, rax
	1292	xor rax, rcx
	1293
	1294	add rax, rcx // a' = a + c
	1295	sub rcx, rax // c' = c - a' = -a
	1296	add rax, rcx // a'' = a' + c' = c
	1297	neg rcx // c'' = a
	1298
	1299	xchg rax, rcx
	1300
	1301	#elif defined(__i386__)
	1302
	1303	push eax
	1304	push ecx
	1305	pop eax
	1306	pop ecx
	1307
	1308	xor eax, ecx
	1309	xor ecx, eax
	1310	xor eax, ecx
	1311
	1312	add eax, ecx
	1313	sub ecx, eax
	1314	add eax, ecx
	1315	neg ecx
	1316
	1317	xchg eax, ecx
	1318
	1319	#elif defined(__arm__)
	1320
	1321	stmfd r13!, {r0, r2} // r0 at [sp], r2 at [sp, #4]
	1322	ldr r0, [r13, #4] // r0 = old r2
	1323	ldr r2, [r13], #8 // r2 = old r0; pop both words
	1324
	1325	eor r0, r0, r2
	1326	eor r2, r2, r0
	1327	eor r0, r0, r2
	1328
	1329	sub r0, r0, r2 // a' = a - c
	1330	add r2, r2, r0 // c' = c + a' = a
	1331	rsb r0, r0, r2 // don't need 3-addr with reverse-sub
	1332
	1333	mov r12, r0 // save a in the scratch register
	1334	mov r0, r2
	1335	mov r2, r12 // take it from the scratch: r0 has been overwritten
	1336
	1337	#elif defined(__aarch64__)
	1338
	1339	// anything you can do
	1340	stp x0, x2, [sp, #-16]!
	1341	ldp x2, x0, [sp], #16
	1342
	1343	eor x0, x0, x2
	1344	eor x2, x2, x0
	1345	eor x0, x0, x2
	1346
	1347	// the add/sub/add thing was daft. you can do it in three if you're
	1348	// clever -- and have three-address operations.
	1349	sub x0, x0, x2
	1350	add x2, x2, x0
	1351	sub x0, x2, x0
	1352
	1353	// but we lack a fourth. we can't do this in fewer than three
	1354	// instructions without hitting memory. only `ldp' will modify two
	1355	// registers at a time, so we need at least two instructions -- but
	1356	// if the first one sets one of our two registers to its final value
	1357	// then we lose the other input value with no way to recover it, so
	1358	// we must either write a fresh third register, or write something
	1359	// other than the final value, and in both cases we need a third
	1360	// instruction to fix everything up. we've done the wrong-something-
	1361	// other trick twice, so here's the captain-obvious use-a-third-
	1362	// register version.
	1363	mov x16, x0
	1364	mov x0, x2
	1365	mov x2, x16
	1366
	1367	#else
	1368	notimpl
	1369	#endif
	1370
	1371	ret
	1372
	1373	endproc
	1374
	1375	proc x11
	1376
	1377	// assuming a is initialized to zero, set a to the inclusive or of
	1378	// the xor-differences of corresponding bytes in the c-byte strings
	1379	// at si and di.
	1380	//
	1381	// in particular, a will be zero (and zf set) if and only if the two
	1382	// strings are equal.
		//
		// the loop has no data-dependent branches or early exit: every byte
		// pair is examined whatever the contents.
		//
		// x86: strings at rsi/esi and rdi/edi, length in rcx/ecx, result in
		// al; dl is clobbered.  arm: strings at r4/x4 and r5/x5, length in
		// r2/x2, result in r0/w0; r1 and r12, or w16 and w17, are clobbered.
	1383
	1384	#if defined(__x86_64__)
	1385
	1386	0: mov dl, [rsi]
	1387	xor dl, [rdi] // nonzero iff the two bytes differ
	1388	inc rsi
	1389	inc rdi
	1390	or al, dl // accumulate the differences
	1391	loop 0b
	1392
	1393	#elif defined(__i386__)
	1394
	1395	0: mov dl, [esi]
	1396	xor dl, [edi]
	1397	inc esi
	1398	inc edi
	1399	or al, dl
	1400	loop 0b
	1401
	1402	#elif defined(__arm__)
	1403
	1404	0: ldrb r1, [r4], #1
	1405	ldrb r12, [r5], #1
	1406	subs r2, r2, #1
	1407	eor r12, r12, r1 // nonzero iff the two bytes differ
	1408	orr r0, r0, r12 // accumulate the differences
	1409	bne 0b
	1410
	1411	#elif defined(__aarch64__)
	1412
	1413	0: ldrb w16, [x4], #1
	1414	ldrb w17, [x5], #1
	1415	sub x2, x2, #1
	1416	eor w16, w16, w17 // nonzero iff the two bytes differ
	1417	orr w0, w0, w16 // accumulate the differences
	1418	cbnz x2, 0b
	1419
	1420	#else
	1421	notimpl
	1422	#endif
	1423
	1424	ret
	1425
	1426	endproc
	1427
	1428	proc x12
	1429
	1430	// an obtuse way of adding two registers. for any bit position, a
	1431	// OR d is set if and only if at least one of a and d has a bit set
	1432	// in that position, and a AND d is set if and only if both have a
	1433	// bit set in that position. essentially, then, what we've done is
	1434	// move all of the set bits in d to a, unless there's already a bit
	1435	// there. this clearly doesn't change the sum.
		//
		// in short: a + d = (a OR d) + (a AND d).  a = rax/eax/r0/x0 and
		// d = rdx/edx/r3/x3; c (rcx/ecx/r2/x2) is clobbered, and the x86
		// versions overwrite d as well.
	1436
	1437	#if defined(__x86_64__)
	1438
	1439	mov rcx, rdx // c' = d
	1440	and rdx, rax // d' = a AND d
	1441	or rax, rcx // a' = a OR d
	1442	add rax, rdx // a'' = (a OR d) + (a AND d) = a + d
	1443
	1444	#elif defined(__i386__)
	1445
	1446	mov ecx, edx // c' = d
	1447	and edx, eax // d' = a AND d
	1448	or eax, ecx // a' = a OR d
	1449	add eax, edx
	1450
	1451	#elif defined(__arm__)
	1452
	1453	and r2, r0, r3 // c' = a AND d
	1454	orr r0, r0, r3 // a' = a OR d
	1455	add r0, r0, r2
	1456
	1457	#elif defined(__aarch64__)
	1458
	1459	and x2, x0, x3 // c' = a AND d
	1460	orr x0, x0, x3 // a' = a OR d
	1461	add x0, x0, x2
	1462
	1463	#else
	1464	notimpl
	1465	#endif
	1466
	1467	ret
	1468
	1469	endproc
	1470
	1471	proc x13
	1472
	1473	// ok, so this is a really obtuse way of adding a and b; the result
	1474	// is in a and d. but why does it work?
		//
		// each iteration replaces the pair (a, b) by (a XOR b, 2 (a AND b)):
		// the sum without its carries, and the carries moved up to where
		// they belong.  this preserves a + b, and each pass pushes the
		// outstanding carries at least one place further left, so after as
		// many passes as there are bits in a register b is zero and a holds
		// the whole sum.
		//
		// the iteration count is 0x40 on every target, which is more than
		// the 32-bit versions need but harmless.  x86 takes a and b in
		// rax/eax and rbx/ebx and clobbers c and d; arm takes them in r0/x0
		// and r1/x1 and clobbers r2/x2 and r3/x3.
	1475
	1476	#if defined(__x86_64__)
	1477
	1478	mov rcx, 0x40 // carry chains at most 64 long
	1479	0: mov rdx, rax // copy a'
	1480	xor rax, rbx // low bits of each bitwise sum
	1481	and rbx, rdx // carry bits from each bitwise sum
	1482	shl rbx, 1 // carry them into next position
	1483	loop 0b
	1484
	1485	#elif defined(__i386__)
	1486
	1487	mov ecx, 0x40 // carry chains at most 64 long
	1488	0: mov edx, eax // copy a'
	1489	xor eax, ebx // low bits of each bitwise sum
	1490	and ebx, edx // carry bits from each bitwise sum
	1491	shl ebx, 1 // carry them into next position
	1492	loop 0b
	1493
	1494	#elif defined(__arm__)
	1495
	1496	mov r2, #0x40
	1497	0: and r3, r0, r1 // carry bits from each bitwise sum
	1498	subs r2, r2, #1
	1499	eor r0, r0, r1 // low bits of each bitwise sum
	1500	lsl r1, r3, #1 // carry them into next position
	1501	bne 0b
	1502
	1503	#elif defined(__aarch64__)
	1504
	1505	mov x2, #0x40
	1506	0: and x3, x0, x1 // carry bits from each bitwise sum
	1507	sub x2, x2, #1
	1508	eor x0, x0, x1 // low bits of each bitwise sum
	1509	lsl x1, x3, #1 // carry them into next position
	1510	cbnz x2, 0b
	1511
	1512	#else
	1513	notimpl
	1514	#endif
	1515
	1516	ret
	1517
	1518	endproc
	1519
	1520	proc x14
	1521
	1522	// floor((a + d)/2), like x08.
		//
		// uses a + d = (a XOR d) + 2 (a AND d): halving the first term and
		// adding the second gives the answer without ever forming the full
		// sum, so nothing can overflow.  a = rax/eax/r0/x0 and
		// d = rdx/edx/r3/x3; c (rcx/ecx/r2/x2) is clobbered.
	1523
	1524	#if defined(__x86_64__)
	1525
	1526	mov rcx, rax // copy a for later
	1527	and rcx, rdx // carry bits
	1528
	1529	xor rax, rdx // low bits of each bitwise sum
	1530	shr rax, 1 // divide by 2; carries now in place
	1531
	1532	add rax, rcx // add the carries; done
	1533
	1534	#elif defined(__i386__)
	1535
	1536	mov ecx, eax // copy a for later
	1537	and ecx, edx // carry bits
	1538
	1539	xor eax, edx // low bits of each bitwise sum
	1540	shr eax, 1 // divide by 2; carries now in place
	1541
	1542	add eax, ecx // add the carries; done
	1543
	1544	#elif defined(__arm__)
	1545
	1546	and r2, r0, r3 // carry bits
	1547	eor r0, r0, r3 // low bits of each bitwise sum
	1548	add r0, r2, r0, lsr #1 // halve those and add the carries; done
	1549
	1550	#elif defined(__aarch64__)
	1551
	1552	and x2, x0, x3 // carry bits
	1553	eor x0, x0, x3 // low bits of each bitwise sum
	1554	add x0, x2, x0, lsr #1 // halve those and add the carries; done
	1555
	1556	#else
	1557	notimpl
	1558	#endif
	1559
	1560	ret
	1561
	1562	endproc
	1563
	1564	proc x15
	1565	// the add/xor trick only works if the bits above the source width are clear on entry.
	1566	// sign extension 32 -> 64 bits (16 -> 32 on the 32-bit targets).
	1567
	1568	#if defined(__x86_64__)
	1569
	1570	movsx rbx, eax // like this?
	1571
	1572	mov rdx, 0xffffffff80000000
	1573	add rax, rdx // if bit 31 of a is set then bits
	1574	// 31--63 of a' are clear; otherwise,
	1575	// these bits are all set -- which is
	1576	// exactly backwards
	1577	xor rax, rdx // so fix it
	1578
	1579	#elif defined(__i386__)
	1580
	1581	movsx ebx, ax // like this?
	1582
	1583	mov edx, 0xffff8000
	1584	add eax, edx // if bit 15 of a is set then bits
	1585	// 15--31 of a' are clear; otherwise,
	1586	// these bits are all set -- which is
	1587	// exactly backwards
	1588	xor eax, edx // so fix it
	1589
	1590	#elif defined(__arm__)
	1591
	1592	sxth r1, r0 // like this
	1593
	1594	mov r12, #0x80000000
	1595	add r0, r0, r12, asr #16 // add 0xffff8000: bits 15--31 come out inverted
	1596	eor r0, r0, r12, asr #16 // so fix it
	1597
	1598	#elif defined(__aarch64__)
	1599
	1600	sxtw x1, w0 // like this
	1601
	1602	mov x16, #0xffffffff80000000
	1603	add x0, x0, x16 // bits 31--63 come out inverted
	1604	eor x0, x0, x16 // so fix it
	1605
	1606	#else
	1607	notimpl
	1608	#endif
	1609
	1610	ret
	1611
	1612	endproc
	1613
	1614	proc x16
	1615	// inputs a, b, c; leaves flags from comparing a' with t = (a XOR b) + (b XOR c).
	1616	// ??? i don't know why you'd want to calculate this.
	1617
	1618	#if defined(__x86_64__)
	1619
	1620	xor rax, rbx // a' = a XOR b
	1621	xor rbx, rcx // b' = b XOR c
	1622	mov rsi, rax // t = a XOR b
	1623	add rsi, rbx // t = (a XOR b) + (b XOR c)
	1624	cmovc rax, rbx // a' = cf ? b XOR c : a XOR b
	1625	xor rax, rbx // a' = cf ? 0 : a XOR c
	1626	cmp rax, rsi // leave flags comparing a' with t
	1627
	1628	#elif defined(__i386__)
	1629
	1630	xor eax, ebx // a' = a XOR b
	1631	xor ebx, ecx // b' = b XOR c
	1632	mov esi, eax // t = a XOR b
	1633	add esi, ebx // t = (a XOR b) + (b XOR c)
	1634	cmovc eax, ebx // a' = cf ? b XOR c : a XOR b
	1635	xor eax, ebx // a' = cf ? 0 : a XOR c
	1636	cmp eax, esi // leave flags comparing a' with t
	1637
	1638	#elif defined(__arm__)
	1639
	1640	eor r0, r0, r1 // a' = a XOR b
	1641	eor r1, r1, r2 // b' = b XOR c
	1642	adds r4, r0, r1 // t = (a XOR b) + (b XOR c), setting cf
	1643	movcs r0, r1 // a' = cf ? b XOR c : a XOR b
	1644	eor r0, r0, r1 // a' = cf ? 0 : a XOR c
	1645	cmp r0, r4 // leave flags comparing a' with t
	1646
	1647	#elif defined(__aarch64__)
	1648
	1649	eor x0, x0, x1 // a' = a XOR b
	1650	eor x1, x1, x2 // b' = b XOR c
	1651	adds x4, x0, x1 // t = (a XOR b) + (b XOR c), setting cf
	1652	cmov.cs x0, x1 // a' = cf ? b XOR c : a XOR b
	1653	eor x0, x0, x1 // a' = cf ? 0 : a XOR c
	1654	cmp x0, x4 // leave flags comparing a' with t
	1655
	1656	#else
	1657	notimpl
	1658	#endif
	1659
	1660	ret
	1661
	1662	endproc
	1663
	1664	proc x17
	1665	// branch-free: with m = (a < 0 ? -1 : 0), (a XOR m) - m negates a exactly when a is negative.
	1666	// absolute value
	1667
	1668	#if defined(__x86_64__)
	1669
	1670	cqo // d = a < 0 ? -1 : 0
	1671	xor rax, rdx // a' = a < 0 ? -a - 1 : a
	1672	sub rax, rdx // a' = a < 0 ? -a : a
	1673
	1674	#elif defined(__i386__)
	1675
	1676	cdq // d = a < 0 ? -1 : 0
	1677	xor eax, edx // a' = a < 0 ? -a - 1 : a
	1678	sub eax, edx // a' = a < 0 ? -a : a
	1679
	1680	#elif defined(__arm__)
	1681
	1682	// direct approach
	1683	movs r1, r0 // b = a, setting n from the sign of a
	1684	rsbmi r1, r0, #0 // b = -a if a was negative
	1685
	1686	// faithful-ish conversion
	1687	eor r3, r0, r0, asr #31 // t = a < 0 ? -a - 1 : a
	1688	sub r0, r3, r0, asr #31 // a' = a < 0 ? -a : a
	1689
	1690	#elif defined(__aarch64__)
	1691
	1692	// direct approach
	1693	tst x0, #1 << 63 // z clear iff the sign bit of a is set
	1694	cneg.ne x1, x0 // b = a < 0 ? -a : a
	1695
	1696	// faithful-ish conversion
	1697	eor x3, x0, x0, asr #63 // t = a < 0 ? -a - 1 : a
	1698	sub x0, x3, x0, asr #63 // a' = a < 0 ? -a : a
	1699
	1700	#else
	1701	notimpl
	1702	#endif
	1703
	1704	ret
	1705
	1706	endproc
	1707
	1708	proc x18
	1709	// read the cycle counter twice and leave flags from (first - second).
	1710	// should always set sf, clear zf, unless we get rescheduled to a
	1711	// different core.
	1712
	1713	#if defined(__x86_64__)
	1714
	1715	rdtsc // d || a = cycles
	1716	shl rdx, 0x20
	1717	or rax, rdx // a = cycles
	1718	mov rcx, rax // c = cycles
	1719
	1720	rdtsc // d || a = cycles'
	1721	shl rdx, 0x20
	1722	or rax, rdx // a = cycles'
	1723
	1724	cmp rcx, rax // flags from cycles - cycles'
	1725
	1726	#elif defined(__i386__)
	1727
	1728	rdtsc // d || a = cycles
	1729	mov ebx, eax
	1730	mov ecx, edx // c || b = cycles
	1731
	1732	rdtsc // d || a = cycles'
	1733
	1734	sub ebx, eax // low word of cycles - cycles'
	1735	sbb ecx, edx // high word, with borrow; flags come from this half
	1736
	1737	#elif defined(__arm__)
	1738
	1739	// cycle clock not available in user mode
	1740	mrrc p15, 0, r0, r1, c9 // r1 || r0 = first reading
	1741	mrrc p15, 0, r2, r3, c9 // r3 || r2 = second reading
	1742	subs r0, r0, r2 // low word of the difference
	1743	sbcs r1, r1, r3 // high word, with borrow
	1744
	1745	#elif defined(__aarch64__)
	1746
	1747	// cycle clock not available in user mode
	1748	mrs x0, pmccntr_el0 // first reading
	1749	mrs x1, pmccntr_el0 // second reading
	1750	cmp x0, x1 // flags from first - second
	1751
	1752	#else
	1753	notimpl
	1754	#endif
	1755
	1756	ret
	1757
	1758	endproc
	1759
	1760	proc x19
	1761	// the call/bl over the string leaves the string's address as the `return address'.
	1762	// stupid way to capture a pointer to inline data and jump past it.
	1763	// confuses the return-address predictor something chronic. worse
	1764	// because amd64 calling convention doesn't usually pass arguments on
	1765	// the stack.
	1766
	1767	#if defined(__x86_64__)
	1768
	1769	call 8f // pushes the address of the string
	1770	.string "hello world!\n\0"
	1771	8: call print_str
	1772	add rsp, 8 // discard the string pointer
	1773	ret
	1774
	1775	print_str:
	1776	// actually implement this ridiculous thing
	1777	mov rsi, [rsp + 8] // string pointer, just above our return address
	1778	xor edx, edx // d = index into string
	1779	0: mov al, [rsi + rdx]
	1780	inc rdx
	1781	cmp al, 0
	1782	jnz 0b // scan for the terminating zero
	1783	mov eax, SYS_write
	1784	mov edi, 1 // fd 1
	1785	dec rdx // don't count the terminator
	1786	syscall // clobbers r11 :-(
	1787	ret
	1788
	1789	#elif defined(__i386__)
	1790
	1791	call 8f // pushes the address of the string
	1792	.string "hello world!\n\0"
	1793	8: call print_str
	1794	add esp, 4 // discard the string pointer
	1795	ret
	1796
	1797	print_str:
	1798	// actually implement this ridiculous thing
	1799	mov ecx, [esp + 4] // string pointer, just above our return address
	1800	xor edx, edx // d = index into string
	1801	0: mov al, [ecx + edx]
	1802	inc edx
	1803	cmp al, 0
	1804	jnz 0b // scan for the terminating zero
	1805	mov eax, SYS_write
	1806	mov ebx, 1 // fd 1
	1807	dec edx // don't count the terminator
	1808	int 0x80
	1809	ret
	1810
	1811	#elif defined(__arm__)
	1812
	1813	// why am i doing this?
	1814	stmfd r13!, {r14} // save our real return address
	1815	bl 8f // r14 = address of the string
	1816	.string "hello world!\n\0"
	1817	.balign 4
	1818	8: mov r1, r14 // might as well make it easy on myself
	1819	bl print_str
	1820	ldmfd r13!, {pc} // return to our caller
	1821
	1822	print_str:
	1823	mov r2, #0 // r2 = length so far
	1824	0: ldrb r0, [r1, r2]
	1825	cmp r0, #0
	1826	addne r2, r2, #1 // count every byte before the terminator
	1827	bne 0b
	1828	mov r0, #1 // fd 1
	1829	mov r7, #SYS_write
	1830	swi 0
	1831	bx r14
	1832
	1833	#elif defined(__aarch64__)
	1834
	1835	// why am i doing this?
	1836	str x30, [sp, #-16]! // save our real return address
	1837	bl 8f // x30 = address of the string
	1838	.string "hello world!\n\0"
	1839	.balign 4
	1840	8: mov x1, x30 // might as well make it easy on myself
	1841	bl print_str
	1842	ldr x30, [sp], #16 // recover our real return address
	1843	ret
	1844
	1845	print_str:
	1846	mov x2, #0 // x2 = length so far
	1847	0: ldrb w0, [x1, x2]
	1848	cmp w0, #0
	1849	cinc.ne x2, x2 // count every byte before the terminator
	1850	b.ne 0b
	1851	mov x0, #1 // fd 1
	1852	mov x8, #SYS_write
	1853	svc #0
	1854	ret
	1855
	1856	#else
	1857	notimpl
	1858	#endif
	1859
	1860	endproc
	1861
	1862	proc x1a
	1863	// each variant below ends up with the address of label 0 in a register.
	1864	// collect the current instruction-pointer address. this was an old
	1865	// 32-bit i386 trick for position-independent code, but (a) it
	1866	// confuses the return predictor, and (b) amd64 has true pc-relative
	1867	// addressing.
	1868
	1869	#if defined(__x86_64__)
	1870
	1871	// the actual example
	1872	call 0f // pushes the address of 0:
	1873	0: pop rax // a = address of 0:
	1874
	1875	// the modern i386 trick doesn't confuse the return-address
	1876	// predictor.
	1877	call calladdr_rbx // b = address of the next instruction
	1878	sub rbx, . - 0b // back up to 0:
	1879
	1880	// but rip-relative addressing is even better
	1881	lea rcx, [rip + 0b]
	1882
	1883	ret
	1884
	1885	calladdr_rbx:
	1886	mov rbx, [rsp] // fetch our own return address
	1887	ret
	1888
	1889	#elif defined(__i386__)
	1890
	1891	// the actual example
	1892	call 0f // pushes the address of 0:
	1893	0: pop eax // a = address of 0:
	1894
	1895	// the modern i386 trick doesn't confuse the return-address
	1896	// predictor.
	1897	call get_pc_ebx // not defined in this procedure; presumably b = return address
	1898	sub ebx, . - 0b // back up to 0:
	1899
	1900	ret
	1901
	1902	#elif defined(__arm__)
	1903
	1904	stmfd r13!, {r14} // the bl instructions below trash r14
	1905
	1906	bl 0f // r14 = address of 0:
	1907	0: mov r0, r14
	1908
	1909	bl return // r14 = address of the next instruction
	1910	sub r1, r14, #. - 0b // back up to 0:
	1911
	1912	adr r2, 0b // pc-relative, the direct way
	1913
	1914	ldmfd r13!, {pc}
	1915
	1916	return: bx r14
	1917
	1918	#elif defined(__aarch64__)
	1919
	1920	str x30, [sp, #-16]! // the bl instructions below trash x30
	1921
	1922	// we can do all of the above using a64
	1923	bl 0f // x30 = address of 0:
	1924	0: mov x0, x30
	1925
	1926	bl return // x30 = address of the next instruction
	1927	sub x1, x30, #. - 0b // back up to 0:
	1928
	1929	adr x2, 0b // pc-relative, the direct way
	1930
	1931	ldr x30, [sp], #16
	1932	return: ret
	1933
	1934	#else
	1935	notimpl
	1936	#endif
	1937
	1938	endproc
	1939
	1940	proc x1b
	1941	// jump to a computed address by planting it where a return will pick it up.
	1942	#if defined(__x86_64__)
	1943
	1944	// retpolines: a mitigation against adversarially influenced
	1945	// speculative execution at indirect branches. if an adversary can
	1946	// prepare a branch-target buffer entry matching an indirect branch
	1947	// in the victim's address space then they can cause the victim to
	1948	// /speculatively/ (but not architecturally) execute any code in
	1949	// their address space, possibly leading to leaking secrets through
	1950	// the cache. retpolines aren't susceptible to this because the
	1951	// predicted destination address is from the return-prediction stack
	1952	// which the adversary can't prime. the performance penalty is still
	1953	// essentially a branch misprediction -- for this return, and
	1954	// possibly all others already stacked.
	1955
	1956	// (try not to crash)
	1957	lea rax, [rip + 9f] // the `indirect branch target' is just the ret below
	1958
	1959	push rax
	1960	9: ret // first pass pops 9b; second pass returns to our caller
	1961
	1962	#elif defined(__i386__)
	1963
	1964	call get_pc_ebx // not defined in this procedure; presumably b = return address
	1965	lea eax, [ebx + 9f - .] // a = address of 9:
	1966
	1967	push eax
	1968	9: ret // first pass pops 9b; second pass returns to our caller
	1969
	1970	#elif defined(__arm__)
	1971
	1972	stmfd r13!, {r14} // save the real return address
	1973
	1974	adr r14, 8f // aim the link register at 8:
	1975	bx r14 // and `return' there
	1976
	1977	8: ldmfd r13!, {pc} // now return for real
	1978
	1979	#elif defined(__aarch64__)
	1980
	1981	str x30, [sp, #-16]! // save the real return address
	1982
	1983	adr x30, 8f // aim the link register at 8:
	1984	ret // and `return' there
	1985
	1986	8: ldr x30, [sp], #16
	1987	ret // now return for real
	1988
	1989	#else
	1990	notimpl
	1991	#endif
	1992
	1993	endproc
	1994
	1995	proc x1c
	1996	// demonstrates `pop sp': the popped value wins over the post-increment.
	1997	// ok, having a hard time seeing a use for this. the most important
	1998	// thing to note is that sp is set from `pop' /after/ it's
	1999	// incremented.
	2000
	2001	#if defined(__x86_64__)
	2002
	2003	// try not to crash
	2004	mov rax, rsp // a = original sp
	2005	and rsp, -16 // align sp down
	2006	push rax // stack the original sp
	2007
	2008	pop rsp // sp = the value just pushed, i.e., the original sp
	2009
	2010	// check it worked
	2011	mov rbx, rsp // b should now equal a
	2012	ret
	2013
	2014	#elif defined(__i386__)
	2015
	2016	// try not to crash
	2017	mov eax, esp // a = original sp
	2018	and esp, -16 // align sp down
	2019	push eax // stack the original sp
	2020
	2021	pop esp // sp = the value just pushed, i.e., the original sp
	2022
	2023	// check it worked
	2024	mov ebx, esp // b should now equal a
	2025	ret
	2026
	2027	#elif defined(__arm__)
	2028
	2029	// not even going to dignify this
	2030	notimpl
	2031
	2032	#elif defined(__aarch64__)
	2033
	2034	// not even going to dignify this
	2035	notimpl
	2036
	2037	#else
	2038	notimpl
	2039	#endif
	2040
	2041	endproc
	2042
	2043	proc x1d
	2044	// abuses `enter' with a nonzero nesting level, which copies frame-pointer words.
	2045	// monumentally cheesy way to copy 8 n bytes from buff1 to buff2.
	2046	// also clobbers words at buff2 + 8 n and buff2 - 8 for good measure.
	2047	// (4 n bytes, and 4-byte words, on the 32-bit targets.)
	2048	n = 4
	2049
	2050	#if defined(__x86_64__)
	2051
	2052	mov rax, rsp // safekeeping
	2053
	2054	// we're toast if we get hit by a signal now. fingers crossed...
	2055	.if 0
	2056	mov rsp, buff2 + 8*n + 8
	2057	mov rbp, buff1 + 8*n
	2058	.else
	2059	lea rsp, [rdi + 8*n + 16] // presumably rdi = buff2 - 8, matching the branch above -- TODO confirm against caller
	2060	lea rbp, [rsi + 8*n] // presumably rsi = buff1 -- TODO confirm against caller
	2061	.endif
	2062	enter 0, n + 1
	2063
	2064	// precise action:
	2065	//
	2066	// +---------+ +---------+
	2067	// rbp -> | ??? | rsp -> | ??? |
	2068	// +---------+ +---------+
	2069	// | w_{n-1} | | rbp | <- rbp'
	2070	// +---------+ +---------+
	2071	// | ... | | w_{n-1} |
	2072	// +---------+ +---------+
	2073	// | w_1 | | ... |
	2074	// +---------+ +---------+
	2075	// | w_0 | | w_1 |
	2076	// +---------+ +---------+
	2077	// | w_0 |
	2078	// +---------+
	2079	// | rbp' | <- rsp'
	2080	// +---------+
	2081
	2082	mov rdx, rsp // d = final sp, for inspection
	2083	mov rsp, rax // put the real stack back
	2084
	2085	#elif defined(__i386__)
	2086
	2087	mov eax, esp // safekeeping
	2088
	2089	// we're toast if we get hit by a signal now. fingers crossed...
	2090	.if 0
	2091	mov esp, buff2 + 4*n + 4
	2092	mov ebp, buff1 + 4*n
	2093	.else
	2094	lea esp, [edi + 4*n + 8] // presumably edi = buff2 - 4 -- TODO confirm against caller
	2095	lea ebp, [esi + 4*n] // presumably esi = buff1 -- TODO confirm against caller
	2096	.endif
	2097	enter 0, n + 1
	2098
	2099	mov edx, esp // d = final sp, for inspection
	2100	mov esp, eax // put the real stack back
	2101
	2102	#elif defined(__arm__)
	2103	// emulate the effect of `enter' by hand, leaving the stack pointer alone.
	2104	add r4, r4, #4*n // r4 = end of source words
	2105	add r5, r5, #4*n + 8 // r5 = just past the destination frame
	2106
	2107	str r4, [r5, #-4]! // the `saved frame pointer' slot
	2108	.rept n/2
	2109	ldrd r0, r1, [r4, #-8]! // copy two words at a time, working downwards
	2110	strd r0, r1, [r5, #-8]!
	2111	.endr
	2112	add r4, r5, #4*n // address of the `saved frame pointer' slot
	2113	str r4, [r5, #-4]! // the `new frame pointer' slot
	2114
	2115	#elif defined(__aarch64__)
	2116
	2117	// omgwtf. let's not actually screw with the stack pointer.
	2118
	2119	add x4, x4, #8*n // x4 = end of source words
	2120	add x5, x5, #8*n + 16 // x5 = just past the destination frame
	2121
	2122	str x4, [x5, #-8]! // the `saved frame pointer' slot
	2123	.rept n/2
	2124	ldp x16, x17, [x4, #-16]! // copy two words at a time, working downwards
	2125	stp x16, x17, [x5, #-16]!
	2126	.endr
	2127	add x4, x5, #8*n // address of the `saved frame pointer' slot
	2128	str x4, [x5, #-8]! // the `new frame pointer' slot
	2129
	2130	#else
	2131	notimpl
	2132	#endif
	2133
	2134	ret
	2135
	2136	endproc
	2137
	2138	proc x1e
	2139	// in: a = 0..15; out: a = '0'..'9' or 'A'..'F'.
	2140	// convert nibble value to (uppercase) hex; other input values yield
	2141	// nonsense.
	2142
	2143	#if defined(__x86_64__)
	2144
	2145	// das doesn't work in 64-bit mode; best i can come up with
	2146	mov edx, eax
	2147	add al, '0' // a' = digit, if a < 10
	2148	add dl, 'A' - 10 // d' = letter, if a >= 10
	2149	cmp al, '9' + 1
	2150	cmovae eax, edx // past '9': take the letter instead
	2151
	2152	#elif defined(__i386__)
	2153
	2154	cmp al, 0x0a // cf = 1 iff a < 10
	2155	sbb al, 0x69 // if 0 <= a < 10, a' = a - 0x6a, so
	2156	// 0x96 <= a' < 0xa0, setting af, cf
	2157	// if 10 <= a < 16, a' = a - 0x69, so
	2158	// 0xa1 <= a' < 0xa7, setting cf but
	2159	// clearing af
	2160	das // if 0 <= a < 10, then af and cf are
	2161	// both set, so subtract 0x66
	2162	// from a' leaving 0x30 <= a' < 0x3a;
	2163	// if 10 <= a < 16 then af clear but
	2164	// cf set, so subtract 0x60 from a'
	2165	// leaving 0x41 <= a' < 0x47
	2166
	2167	#elif defined(__arm__)
	2168
	2169	// significantly less tricksy
	2170	cmp r0, #10
	2171	addlo r0, r0, #'0' // a < 10: digit
	2172	addhs r0, r0, #'A' - 10 // a >= 10: letter
	2173
	2174	#elif defined(__aarch64__)
	2175
	2176	// with less versatile conditional execution this is the best we can
	2177	// do
	2178	cmp w0, #10
	2179	add w16, w0, #'A' - 10 // t = letter candidate
	2180	add w0, w0, #'0' // a' = digit candidate
	2181	cmov.hs w0, w16 // a >= 10: take the letter
	2182
	2183	#else
	2184	notimpl
	2185	#endif
	2186
	2187	ret
	2188
	2189	endproc
	2190
	2191	proc x1f
	2192	// strip trailing zero bits, stop at 1, otherwise a' = 3 a + 1 and repeat.
	2193	// verify collatz conjecture starting at a; assume a /= 0!
	2194	// while the slot counter (d; r3/x3 on arm) is nonzero, each odd value reached is also stored through the output pointer (di; r5/x5 on arm).
	2195	#if defined(__x86_64__)
	2196
	2197	0: bsf rcx, rax // clobber c if a = 0
	2198	shr rax, cl // a = 2^c a'
	2199	cmp rdx, 0 // any output slots left?
	2200	je 1f
	2201	stosq // log a' and advance the output pointer
	2202	dec rdx
	2203	1:
	2204	cmp rax, 1 // done?
	2205	je 9f
	2206	lea rax, [2*rax + rax + 1] // a' = 3 a' + 1
	2207	jmp 0b // again
	2208
	2209	9: ret
	2210
	2211	#elif defined(__i386__)
	2212
	2213	0: bsf ecx, eax // clobber c if a = 0
	2214	shr eax, cl // a = 2^c a'
	2215	cmp edx, 0 // any output slots left?
	2216	je 1f
	2217	stosd // log a' and advance the output pointer
	2218	dec edx
	2219	1:
	2220	cmp eax, 1 // done?
	2221	je 9f
	2222	lea eax, [2*eax + eax + 1] // a' = 3 a' + 1
	2223	jmp 0b // again
	2224
	2225	9: ret
	2226
	2227	#elif defined(__arm__)
	2228
	2229	// rbit introduced in armv7
	2230	0: rbit r2, r0
	2231	clz r2, r2 // c = number of trailing zero bits in a
	2232	mov r0, r0, lsr r2 // a = 2^c a'
	2233	cmp r3, #0 // any output slots left?
	2234	strne r0, [r5], #4 // log a' and advance the output pointer
	2235	subne r3, r3, #1
	2236	cmp r0, #1 // done? (sets c, since a' >= 1)
	2237	adcne r0, r0, r0, lsl #1 // a' = 3 a' + 1 (because c set)
	2238	bne 0b
	2239
	2240	ret
	2241
	2242	#elif defined(__aarch64__)
	2243
	2244	0: rbit w2, w0
	2245	clz w2, w2 // c = number of trailing zero bits in a
	2246	lsr w0, w0, w2 // a = 2^c a'
	2247	cmp x3, #0 // any output slots left?
	2248	b.eq 1f
	2249	str x0, [x5], #8 // log a' and advance the output pointer
	2250	sub x3, x3, #1
	2251	1:
	2252	cmp w0, #1 // done?
	2253	add w16, w0, w0, lsl #1 // t = 3 a'
	2254	csinc.eq w0, w0, w16 // a' = (a' = 1) ? a' : t + 1
	2255	b.ne 0b
	2256
	2257	ret
	2258
	2259	#else
	2260	notimpl
	2261	#endif
	2262
	2263	endproc
	2264
	2265	///--------------------------------------------------------------------------
	2266	/// 0x20--0x2f
	2267
	2268	proc x20
	2269	// in: a. out: c = 1337 a by the original shift/add chain; b = 1337 a by a shorter route.
	2270	// calculate 1337 a slowly
	2271
	2272	#if defined(__x86_64__)
	2273
	2274	// original version
	2275	mov rcx, rax // c = a
	2276	shl rcx, 2 // c = 4 a
	2277	add rcx, rax // c = 5 a
	2278	shl rcx, 3 // c = 40 a
	2279	add rcx, rax // c = 41 a
	2280	shl rcx, 1 // c = 82 a
	2281	add rcx, rax // c = 83 a
	2282	shl rcx, 1 // c = 166 a
	2283	add rcx, rax // c = 167 a
	2284	shl rcx, 3 // c = 1336 a
	2285	add rcx, rax // c = 1337 a
	2286
	2287	// a quick way
	2288	lea rdx, [2*rax + rax] // t = 3 a
	2289	shl rdx, 6 // t = 192 a
	2290	sub rdx, rax // t = 191 a
	2291	lea rbx, [8*rdx] // b = 1528 a
	2292	sub rbx, rdx // b = 1337 a
	2293
	2294	#elif defined(__i386__)
	2295
	2296	// original version
	2297	mov ecx, eax // c = a
	2298	shl ecx, 2 // c = 4 a
	2299	add ecx, eax // c = 5 a
	2300	shl ecx, 3 // c = 40 a
	2301	add ecx, eax // c = 41 a
	2302	shl ecx, 1 // c = 82 a
	2303	add ecx, eax // c = 83 a
	2304	shl ecx, 1 // c = 166 a
	2305	add ecx, eax // c = 167 a
	2306	shl ecx, 3 // c = 1336 a
	2307	add ecx, eax // c = 1337 a
	2308
	2309	// a quick way
	2310	lea edx, [2*eax + eax] // t = 3 a
	2311	shl edx, 6 // t = 192 a
	2312	sub edx, eax // t = 191 a
	2313	lea ebx, [8*edx] // b = 1528 a
	2314	sub ebx, edx // b = 1337 a
	2315
	2316	#elif defined(__arm__)
	2317
	2318	// original version, ish
	2319	add r2, r0, r0, lsl #2 // c = 5 a
	2320	add r2, r0, r2, lsl #3 // c = 41 a
	2321	add r2, r0, r2, lsl #1 // c = 83 a
	2322	add r2, r0, r2, lsl #1 // c = 167 a
	2323	add r2, r0, r2, lsl #3 // c = 1337 a
	2324
	2325	// quicker way
	2326	add r1, r0, r0, lsl #1 // b = 3 a
	2327	rsb r1, r0, r1, lsl #6 // b = 191 a
	2328	rsb r1, r1, r1, lsl #3 // b = 1337 a
	2329
	2330	#elif defined(__aarch64__)
	2331
	2332	// original version, ish
	2333	add x2, x0, x0, lsl #2 // c = 5 a
	2334	add x2, x0, x2, lsl #3 // c = 41 a
	2335	add x2, x0, x2, lsl #1 // c = 83 a
	2336	add x2, x0, x2, lsl #1 // c = 167 a
	2337	add x2, x0, x2, lsl #3 // c = 1337 a
	2338
	2339	// sleazy because no rsb
	2340	add x1, x0, x0, lsl #1 // b = 3 a
	2341	sub x1, x0, x1, lsl #6 // b = -191 a
	2342	sub x1, x1, x1, lsl #3 // b = -191 a + 1528 a = 1337 a
	2343
	2344	#else
	2345	notimpl
	2346	#endif
	2347
	2348	ret
	2349
	2350	endproc
	2351
	2352	proc x21
	2353	// in: a, b, c, d. out: a' = real part, b' = imaginary part.
	2354	// multiply complex numbers a + b i and c + d i
	2355	//
	2356	// (a + b i) (c + d i) = (a c - b d) + (a d + b c) i
	2357	//
	2358	// somewhat slick approach uses only three multiplications
	2359
	2360	#if defined(__x86_64__)
	2361
	2362	mov rsi, rax // t = a
	2363	add rax, rbx // a' = a + b
	2364	mov rdi, rdx // u = d
	2365	sub rdx, rcx // d' = d - c
	2366	add rdi, rcx // u = c + d
	2367
	2368	imul rax, rcx // a' = c (a + b)
	2369	imul rsi, rdx // t = a (d - c)
	2370	imul rdi, rbx // u = b (c + d)
	2371
	2372	add rsi, rax // t = a (d - c) + c (a + b)
	2373	mov rbx, rsi // b' = a (d - c) + c (a + b)
	2374	// = a d + b c
	2375	sub rax, rdi // a' = c (a + b) - b (c + d)
	2376	// = a c - b d
	2377
	2378	#elif defined(__i386__)
	2379
	2380	mov esi, eax // t = a
	2381	add eax, ebx // a' = a + b
	2382	mov edi, edx // u = d
	2383	sub edx, ecx // d' = d - c
	2384	add edi, ecx // u = c + d
	2385
	2386	imul eax, ecx // a' = c (a + b)
	2387	imul esi, edx // t = a (d - c)
	2388	imul edi, ebx // u = b (c + d)
	2389
	2390	add esi, eax // t = a (d - c) + c (a + b)
	2391	mov ebx, esi // b' = a (d - c) + c (a + b)
	2392	// = a d + b c
	2393	sub eax, edi // a' = c (a + b) - b (c + d)
	2394	// = a c - b d
	2395
	2396	#elif defined(__arm__)
	2397
	2398	add r4, r0, r1 // t = a + b
	2399	add r5, r2, r3 // u = c + d
	2400	sub r3, r3, r2 // d' = d - c
	2401
	2402	// mls introduced in armv7
	2403	mul r4, r4, r2 // t = c (a + b)
	2404	mov r2, r1 // c' = b (bah!) -- saved because mla is about to overwrite b
	2405	mla r1, r0, r3, r4 // b' = a (d - c) + c (a + b)
	2406	// = a d + b c
	2407	mls r0, r2, r5, r4 // a' = c (a + b) - b (c + d)
	2408	// = a c - b d
	2409
	2410	#elif defined(__aarch64__)
	2411
	2412	add x4, x0, x1 // t = a + b
	2413	add x5, x2, x3 // u = c + d
	2414	sub x3, x3, x2 // d' = d - c
	2415
	2416	// msub is a64's spelling of a32's mls
	2417	mul x4, x4, x2 // t = c (a + b)
	2418	mov x2, x1 // c' = b (bah!) -- saved because madd is about to overwrite b
	2419	madd x1, x0, x3, x4 // b' = a (d - c) + c (a + b)
	2420	// = a d + b c
	2421	msub x0, x2, x5, x4 // a' = c (a + b) - b (c + d)
	2422	// = a c - b d
	2423
	2424	#else
	2425	notimpl
	2426	#endif
	2427
	2428	ret
	2429
	2430	endproc
	2431
	2432	proc x22
	2433	// unsigned a' = floor(a/3), by multiplying by ceil(2/3 2^w) and keeping the high word.
	2434	// divide by 3
	2435
	2436	#if defined(__x86_64__)
	2437
	2438	mov rdx, 0xaaaaaaaaaaaaaaab // = ceil(2/3 2^64)
	2439	mul rdx // d' || a' =~ 2/3 a 2^64
	2440	shr rdx, 1 // d' = floor(a/3)
	2441	mov rax, rdx // a' = floor(a/3)
	2442
	2443	// we start with 0 <= a < 2^64. write f = ceil(2/3 2^64), so that
	2444	// 2/3 < f/2^64 < 2/3 + 1/2^64. then floor(2/3 a) <= floor(a f/2^64)
	2445	// <= floor(2/3 a + a/2^64), but a < 2^64 so a/2^64 < 1 and
	2446	// floor(a f/2^64) = floor(2/3 a).
	2447
	2448	#elif defined(__i386__)
	2449
	2450	mov edx, 0xaaaaaaab // = ceil(2/3 2^32)
	2451	mul edx // d' || a' =~ 2/3 a 2^32
	2452	shr edx, 1 // d' = floor(a/3)
	2453	mov eax, edx // a' = floor(a/3)
	2454
	2455	#elif defined(__arm__)
	2456
	2457	ldr r12, =0xaaaaaaab // = ceil(2/3 2^32)
	2458	umull r12, r0, r0, r12 // r0 = high word of the product
	2459	mov r0, r0, lsr #1 // a' = floor(a/3)
	2460
	2461	#elif defined(__aarch64__)
	2462
	2463	ldr x16, =0xaaaaaaaaaaaaaaab // = ceil(2/3 2^64)
	2464	umulh x0, x0, x16 // x0 = high word of the product
	2465	lsr x0, x0, #1 // a' = floor(a/3)
	2466
	2467	#else
	2468	notimpl
	2469	#endif
	2470
	2471	ret
	2472
	2473	endproc
	2474
	2475	proc x23
	2476	// unsigned a' = a mod 3, using 4 == 1 (mod 3): floor(a/4) + (a mod 4) == a (mod 3).
	2477	#if defined(__x86_64__)
	2478
	2479	// main loop: shorten a preserving residue class mod 3
	2480	0: cmp rax, 5
	2481	jbe 8f
	2482	// a > 5
	2483	mov rdx, rax // d' = a
	2484	shr rdx, 2 // d' = floor(a/4)
	2485	and rax, 3 // a = 4 d' + a' (0 <= a' < 4)
	2486	add rax, rdx // a' == a (mod 3) but a' < a/4 + 4
	2487	jmp 0b
	2488
	2489	// fix up final value 0 <= a < 6: want 0 <= a < 3
	2490	//
	2491	// the tricky part is actually a = 3; but the other final cases take
	2492	// additional iterations which we can avoid.
	2493	8: cmp rax, 3 // set cf iff a < 3
	2494	cmc // set cf iff a >= 3
	2495	sbb rdx, rdx // d' = a >= 3 ? -1 : 0
	2496	and rdx, 3 // d' = a >= 3 ? 3 : 0
	2497	sub rax, rdx // a' = a - (a >= 3 ? 3 : 0)
	2498	// = a (mod 3)
	2499
	2500	#elif defined(__i386__)
	2501
	2502	// main loop: shorten a preserving residue class mod 3
	2503	0: cmp eax, 5
	2504	jbe 8f
	2505	// a > 5
	2506	mov edx, eax // d' = a
	2507	shr edx, 2 // d' = floor(a/4)
	2508	and eax, 3 // a = 4 d' + a' (0 <= a' < 4)
	2509	add eax, edx // a' == a (mod 3) but a' < a/4 + 4
	2510	jmp 0b
	2511
	2512	// fix up final value 0 <= a < 6: want 0 <= a < 3
	2513	//
	2514	// the tricky part is actually a = 3; but the other final cases take
	2515	// additional iterations which we can avoid.
	2516	8: cmp eax, 3 // set cf iff a < 3
	2517	cmc // set cf iff a >= 3
	2518	sbb edx, edx // d' = a >= 3 ? -1 : 0
	2519	and edx, 3 // d' = a >= 3 ? 3 : 0
	2520	sub eax, edx // a' = a - (a >= 3 ? 3 : 0)
	2521	// = a (mod 3)
	2522
	2523	#elif defined(__arm__)
	2524
	2525	0: cmp r0, #6 // still too big?
	2526	andhs r12, r0, #3 // t = a mod 4
	2527	addhs r0, r12, r0, lsr #2 // a' = floor(a/4) + (a mod 4)
	2528	bhs 0b
	2529
	2530	cmp r0, #3 // final fixup: 0 <= a < 6 here
	2531	subhs r0, r0, #3
	2532
	2533	#elif defined(__aarch64__)
	2534
	2535	0: cmp x0, #6 // flags survive the next two instructions
	2536	// blunder on through regardless since this doesn't affect the result
	2537	and x16, x0, #3 // t = a mod 4
	2538	add x0, x16, x0, lsr #2 // a' = floor(a/4) + (a mod 4)
	2539	b.hs 0b
	2540
	2541	subs x16, x0, #3 // t = a - 3; c set iff a >= 3
	2542	cmov.hs x0, x16 // a' = a >= 3 ? a - 3 : a
	2543
	2544	#else
	2545	notimpl
	2546	#endif
	2547
	2548	ret
	2549
	2550	endproc
	2551
	2552	proc x24
	2553	// each pass doubles the number of correct low bits; see the derivation below.
	2554	// invert (odd) a mod 2^64
	2555	//
	2556	// suppose a a_i == 1 (mod 2^{2^i})
	2557	//
	2558	// clearly good for i = 0, since 2^i = 1 and 2^{2^i} = 2, and a_0 =
	2559	// a == 1 (mod 2) by assumption
	2560	//
	2561	// write a a_i == b_i 2^{2^i} + 1 (mod 2^{2^{i+1}})
	2562	// then b_i == (a a_i - 1)/2^{2^i} (mod 2^{2^i})
	2563	// to lift inverse, we want x such that a x == -b_i (mod 2^{2^i});
	2564	// clearly x = -a_i b_i will do, since a a_i == 1 (mod 2^{2^i})
	2565	// then:
	2566	// a_{i+1} = a_i - a_i b_i 2^{2^i} = a_i (1 - (a a_i - 1))
	2567	// = 2 a_i - a a_i^2
	2568	//
	2569	// check:
	2570	// a a_{i+1} = 2 a a_i - a^2 a_i^2
	2571	// == 2 a a_i - (b_i 2^{2^i} + 1)^2
	2572	// == 2 (b_i 2^{2^i} + 1) -
	2573	// (b_i^2 2^{2^{i+1}} + 2 b_i 2^{2^i} + 1)
	2574	// == 1 (mod 2^{2^{i+1}})
	2575	// (mod 2^32 on the 32-bit targets.) while the slot counter is nonzero, each a_i is also stored through the output pointer.
	2576	#if defined(__x86_64__)
	2577
	2578	// rax // a_0 = a
	2579	mov rbx, rax // b' = a
	2580	mov rsi, rax // t = a_0
	2581
	2582	0:
	2583	cmp rbp, 0 // any output slots left?
	2584	je 1f
	2585	stosq // log a_i
	2586	dec rbp
	2587	1:
	2588	mul rbx // a' = a a_i
	2589	mov rcx, rax // c = a a_i
	2590
	2591	sub rax, 2 // a' = a a_i - 2
	2592	neg rax // a' = 2 - a a_i
	2593	mul rsi // a_{i+1} = a_i (2 - a a_i)
	2594	// = 2 a_i - a a_i^2
	2595	mov rsi, rax // t = a_{i+1}
	2596
	2597	cmp rcx, 1 // done?
	2598	ja 0b // no -- iterate
	2599
	2600	#elif defined(__i386__)
	2601
	2602	// eax // a_0 = a
	2603	mov ebx, eax // b' = a
	2604	mov esi, eax // t = a_0
	2605
	2606	0:
	2607	cmp ebp, 0 // any output slots left?
	2608	je 1f
	2609	stosd // log a_i
	2610	dec ebp
	2611	1:
	2612	mul ebx // a' = a a_i
	2613	mov ecx, eax // c = a a_i
	2614
	2615	sub eax, 2 // a' = a a_i - 2
	2616	jb 9f // done if < 2
	2617	neg eax // a' = 2 - a a_i
	2618	mul esi // a_{i+1} = a_i (2 - a a_i)
	2619	// = 2 a_i - a a_i^2
	2620	mov esi, eax // t = a_{i+1}
	2621
	2622	jmp 0b // and iterate
	2623	9: mov eax, esi // restore
	2624
	2625	#elif defined(__arm__)
	2626
	2627	// r0 // a_0 = a
	2628	mov r1, r0 // b' = a
	2629
	2630	0:
	2631	cmp r6, #0 // any output slots left?
	2632	strne r0, [r5], #4 // log a_i
	2633	subne r6, r6, #1
	2634	mul r2, r0, r1 // c = a a_i
	2635	rsbs r2, r2, #2 // c = 2 - a a_i
	2636	mul r0, r0, r2 // a_{i+1} = a_i (2 - a a_i)
	2637	// = 2 a_i - a a_i^2
	2638	blo 0b // borrow from the rsbs means a a_i > 2: iterate
	2639
	2640	#elif defined(__aarch64__)
	2641
	2642	// x0 // a_0 = a
	2643	mov x1, x0 // b' = a
	2644	mov x16, #2 // because we have no rsb
	2645
	2646	0:
	2647	cmp x6, #0 // any output slots left?
	2648	b.eq 1f
	2649	str x0, [x5], #8 // log a_i
	2650	sub x6, x6, #1
	2651	1:
	2652	mul x2, x0, x1 // c = a a_i
	2653	subs x2, x16, x2 // c = 2 - a a_i
	2654	mul x0, x0, x2 // a_{i+1} = a_i (2 - a a_i)
	2655	// = 2 a_i - a a_i^2
	2656	b.lo 0b // borrow from the subs means a a_i > 2: iterate
	2657
	2658	#else
	2659	notimpl
	2660	#endif
	2661
	2662	ret
	2663
	2664	endproc
	2665
	2666	proc x25
	2667	// walks all 2^32 (x, y) pairs packed in one counter and tallies the points with x^2 + y^2 < 1 in a.
	2668	// a poor approximation to pi/4
	2669	//
	2670	// think of x and y as being in 16.16 fixed-point format. we sample
	2671	// points in the unit square, and determine how many of them are
	2672	// within a unit quarter-circle centred at the origin. the area of
	2673	// the quarter-circle is pi/4.
	2674
	2675	#if defined(__x86_64__)
	2676
	2677	xor eax, eax // a = 0
	2678	mov rcx, 1
	2679	shl rcx, 0x20 // c =~ 4 billion
	2680
	2681	0: movzx rbx, cx // x = low 16 bits of c
	2682	imul rbx, rbx // b = x^2
	2683
	2684	ror rcx, 0x10 // switch halves of c
	2685	movzx rdx, cx // y = high 16 bits of c
	2686	imul rdx, rdx // d = y^2
	2687	rol rcx, 0x10 // switch back
	2688
	2689	add rbx, rdx // r^2 = x^2 + y^2
	2690	shr rbx, 0x20 // r^2 >= 1?
	2691	cmp rbx, 1 // set cf iff r^2 < 1
	2692	adc rax, 0 // and add onto accumulator
	2693	loop 0b
	2694
	2695	#elif defined(__i386__)
	2696
	2697	// this is actually better done in 32 bits. the carry has the wrong
	2698	// sense here, so instead deduct one for each point outside the
	2699	// quarter-circle rather than adding one for each point inside it.
	2700	xor eax, eax
	2701	xor ecx, ecx // c = 0; `loop' wraps it, so 2^32 passes
	2702
	2703	0: movzx ebx, cx // x = low 16 bits of c
	2704	imul ebx, ebx // b = x^2
	2705
	2706	ror ecx, 0x10 // switch halves of c
	2707	movzx edx, cx // y = high 16 bits of c
	2708	imul edx, edx // d = y^2
	2709	rol ecx, 0x10 // switch back
	2710
	2711	add ebx, edx // see?
	2712	sbb eax, 0
	2713	loop 0b
	2714
	2715	#elif defined(__arm__)
	2716
	2717	mov r0, #0 // a = 0
	2718	mov r2, #0 // c = 0
	2719
	2720	0: uxth r1, r2, ror #0 // x = low 16 bits of c
	2721	uxth r3, r2, ror #16 // y = high 16 bits of c
	2722	mul r1, r1, r1 // x^2
	2723	mul r3, r3, r3 // y^2
	2724	cmn r1, r3 // mlas doesn't set cf usefully
	2725	addcc r0, r0, #1 // no carry: r^2 < 1, count it
	2726	adds r2, r2, #1
	2727	bne 0b // until c wraps to zero
	2728
	2729	#elif defined(__aarch64__)
	2730
	2731	mov w0, #0 // a = 0
	2732	mov w2, #0 // c = 0
	2733
	2734	0: ubfx w1, w2, #0, #16 // x = low 16 bits of c
	2735	ubfx w3, w2, #16, #16 // y = high 16 bits of c
	2736	sub w2, w2, #1
	2737	mul w1, w1, w1 // x^2
	2738	mul w3, w3, w3 // y^2
	2739	cmn w1, w3 // carry out iff r^2 >= 1
	2740	cinc.cc w0, w0 // no carry: r^2 < 1, count it
	2741	cbnz w2, 0b // until c wraps to zero
	2742
	2743	#else
	2744	notimpl
	2745	#endif
	2746
	2747	ret
	2748
	2749	endproc
	2750
	2751	proc x26
	2752	// in: a. out: a' = a rotated right 7 (the hard way); b (r1/x1 on arm) = the same via a rotate instruction.
	2753	// a bad way to rotate a right by 7 places
	2754
	2755	#if defined(__x86_64__)
	2756
	2757	mov rbx, rax
	2758	ror rbx, 7 // better
	2759
	2760	mov rdx, rax // d' = a
	2761	shr rax, 7 // a' = a >> 7
	2762	shl rdx, 0x39 // d' = a << 57
	2763	or rax, rdx // a' = a >>> 7
	2764
	2765	#elif defined(__i386__)
	2766
	2767	mov ebx, eax
	2768	ror ebx, 7 // better
	2769
	2770	mov edx, eax // d' = a
	2771	shr eax, 7 // a' = a >> 7
	2772	shl edx, 0x19 // d' = a << 25 (= 32 - 7)
	2773	or eax, edx // a' = a >>> 7
	2774
	2775	#elif defined(__arm__)
	2776
	2777	mov r1, r0, ror #7 // easy way
	2778
	2779	// even the hard way is fairly easy on arm
	2780	mov r3, r0, lsl #25
	2781	orr r0, r3, r0, lsr #7 // hard way
	2782
	2783	#elif defined(__aarch64__)
	2784
	2785	ror x1, x0, #7 // easy way
	2786
	2787	// even the hard way is fairly easy on arm
	2788	lsl x3, x0, #57
	2789	orr x0, x3, x0, lsr #7 // hard way
	2790
	2791	#else
	2792	notimpl
	2793	#endif
	2794
	2795	ret
	2796
	2797	endproc
	2798
	2799	proc x27
	2800	// two shifts of floor(c/2) and ceil(c/2) places, so a count equal to the register width still works.
	2801	// shift a right by c places, in two halves
	2802
	2803	#if defined(__x86_64__)
	2804
	2805	mov ch, cl // c' = [c, c]
	2806	inc ch // c' = [c, c + 1]
	2807	shr ch, 1
	2808	shr cl, 1 // c' = [floor(c/2), ceil(c/2)]
	2809	shr rax, cl // first half of the shift
	2810	xchg ch, cl
	2811	shr rax, cl // second half of the shift
	2812
	2813	#elif defined(__i386__)
	2814
	2815	mov ch, cl // c' = [c, c]
	2816	inc ch // c' = [c, c + 1]
	2817	shr ch, 1
	2818	shr cl, 1 // c' = [floor(c/2), ceil(c/2)]
	2819	shr eax, cl // first half of the shift
	2820	xchg ch, cl
	2821	shr eax, cl // second half of the shift
	2822
	2823	#elif defined(__arm__)
	2824
	2825	// it would be clearer and more efficient to say: `mov r12, r2, lsr
	2826	// #1; sub r2, r2, r12', but that's not the lesson this exercise is
	2827	// trying to teach.
	2828	add r12, r2, #1 // t = c + 1
	2829	mov r2, r2, lsr #1 // c' = floor(c/2)
	2830	mov r12, r12, lsr #1 // t = ceil(c/2)
	2831	mov r0, r0, lsr r2 // first half of the shift
	2832	mov r0, r0, lsr r12 // second half of the shift
	2833
	2834	#elif defined(__aarch64__)
	2835
	2836	add w16, w2, #1 // t = c + 1
	2837	lsr w2, w2, #1 // c' = floor(c/2)
	2838	lsr w16, w16, #1 // t = ceil(c/2)
	2839	lsr x0, x0, x2 // first half of the shift
	2840	lsr x0, x0, x16 // second half of the shift
	2841
	2842	#else
	2843	notimpl
	2844	#endif
	2845
	2846	ret
	2847
	2848	endproc
	2849
	2850	proc x28
	2851	// placeholder: every architecture branch below is just notimpl (not defined in this procedure).
	2852	#if defined(__x86_64__)
	2853
	2854	notimpl
	2855
	2856	#elif defined(__i386__)
	2857
	2858	notimpl
	2859
	2860	#elif defined(__arm__)
	2861
	2862	notimpl
	2863
	2864	#elif defined(__aarch64__)
	2865
	2866	notimpl
	2867
	2868	#else
	2869	notimpl
	2870	#endif
	2871
	2872	endproc
	2873
	2874	proc x29
	2875	// placeholder: every architecture branch below is just notimpl (not defined in this procedure).
	2876	#if defined(__x86_64__)
	2877
	2878	notimpl
	2879
	2880	#elif defined(__i386__)
	2881
	2882	notimpl
	2883
	2884	#elif defined(__arm__)
	2885
	2886	notimpl
	2887
	2888	#elif defined(__aarch64__)
	2889
	2890	notimpl
	2891
	2892	#else
	2893	notimpl
	2894	#endif
	2895
	2896	endproc
	2897
	2898	proc x2a
	2899	// placeholder: every architecture branch below is just notimpl (not defined in this procedure).
	2900	#if defined(__x86_64__)
	2901
	2902	notimpl
	2903
	2904	#elif defined(__i386__)
	2905
	2906	notimpl
	2907
	2908	#elif defined(__arm__)
	2909
	2910	notimpl
	2911
	2912	#elif defined(__aarch64__)
	2913
	2914	notimpl
	2915
	2916	#else
	2917	notimpl
	2918	#endif
	2919
	2920	endproc
	2921
	2922	proc x2b
	2923	// placeholder: every architecture branch below is just notimpl (not defined in this procedure).
	2924	#if defined(__x86_64__)
	2925
	2926	notimpl
	2927
	2928	#elif defined(__i386__)
	2929
	2930	notimpl
	2931
	2932	#elif defined(__arm__)
	2933
	2934	notimpl
	2935
	2936	#elif defined(__aarch64__)
	2937
	2938	notimpl
	2939
	2940	#else
	2941	notimpl
	2942	#endif
	2943
	2944	endproc
	2945
	2946	proc x2c
	2947	// placeholder: every architecture branch below is just notimpl (not defined in this procedure).
	2948	#if defined(__x86_64__)
	2949
	2950	notimpl
	2951
	2952	#elif defined(__i386__)
	2953
	2954	notimpl
	2955
	2956	#elif defined(__arm__)
	2957
	2958	notimpl
	2959
	2960	#elif defined(__aarch64__)
	2961
	2962	notimpl
	2963
	2964	#else
	2965	notimpl
	2966	#endif
	2967
	2968	endproc
	2969
	2970	proc x2d
	2971	// placeholder: every architecture branch below is just notimpl (not defined in this procedure).
	2972	#if defined(__x86_64__)
	2973
	2974	notimpl
	2975
	2976	#elif defined(__i386__)
	2977
	2978	notimpl
	2979
	2980	#elif defined(__arm__)
	2981
	2982	notimpl
	2983
	2984	#elif defined(__aarch64__)
	2985
	2986	notimpl
	2987
	2988	#else
	2989	notimpl
	2990	#endif
	2991
	2992	endproc
	2993
	2994	proc x2e
	2995	// placeholder: every architecture branch below is just notimpl (not defined in this procedure).
	2996	#if defined(__x86_64__)
	2997
	2998	notimpl
	2999
	3000	#elif defined(__i386__)
	3001
	3002	notimpl
	3003
	3004	#elif defined(__arm__)
	3005
	3006	notimpl
	3007
	3008	#elif defined(__aarch64__)
	3009
	3010	notimpl
	3011
	3012	#else
	3013	notimpl
	3014	#endif
	3015
	3016	endproc
	3017
	3018	proc x2f
	3019	/// x2f: placeholder -- every architecture arm below is only `notimpl'.
	3020	#if defined(__x86_64__)
	3021
	3022	notimpl
	3023
	3024	#elif defined(__i386__)
	3025
	3026	notimpl
	3027
	3028	#elif defined(__arm__)
	3029
	3030	notimpl
	3031
	3032	#elif defined(__aarch64__)
	3033
	3034	notimpl
	3035	/// The `#else' arm never builds: the preliminaries `#error' out there.
	3036	#else
	3037	notimpl
	3038	#endif
	3039	/// NOTE(review): `notimpl' is defined outside this section; presumably it marks the routine as not yet written -- confirm.
	3040	endproc
	3041
	3042	///--------------------------------------------------------------------------
	3043	/// 0x30--0x3f
	3044
	3045	proc x30
	3046	/// x30: placeholder -- every architecture arm below is only `notimpl'.
	3047	#if defined(__x86_64__)
	3048
	3049	notimpl
	3050
	3051	#elif defined(__i386__)
	3052
	3053	notimpl
	3054
	3055	#elif defined(__arm__)
	3056
	3057	notimpl
	3058
	3059	#elif defined(__aarch64__)
	3060
	3061	notimpl
	3062	/// The `#else' arm never builds: the preliminaries `#error' out there.
	3063	#else
	3064	notimpl
	3065	#endif
	3066	/// NOTE(review): the neighbouring stubs have no `ret' here; whether this one is reachable depends on what `notimpl' expands to -- confirm.
	3067	ret
	3068	/// NOTE(review): `notimpl' is defined outside this section; presumably it marks the routine as not yet written -- confirm.
	3069	endproc
	3070
	3071	proc x31
	3072	/// x31: placeholder -- every architecture arm below is only `notimpl'.
	3073	#if defined(__x86_64__)
	3074
	3075	notimpl
	3076
	3077	#elif defined(__i386__)
	3078
	3079	notimpl
	3080
	3081	#elif defined(__arm__)
	3082
	3083	notimpl
	3084
	3085	#elif defined(__aarch64__)
	3086
	3087	notimpl
	3088	/// The `#else' arm never builds: the preliminaries `#error' out there.
	3089	#else
	3090	notimpl
	3091	#endif
	3092	/// NOTE(review): `notimpl' is defined outside this section; presumably it marks the routine as not yet written -- confirm.
	3093	endproc
	3094
	3095	proc x32
	3096	/// x32: placeholder -- every architecture arm below is only `notimpl'.
	3097	#if defined(__x86_64__)
	3098
	3099	notimpl
	3100
	3101	#elif defined(__i386__)
	3102
	3103	notimpl
	3104
	3105	#elif defined(__arm__)
	3106
	3107	notimpl
	3108
	3109	#elif defined(__aarch64__)
	3110
	3111	notimpl
	3112	/// The `#else' arm never builds: the preliminaries `#error' out there.
	3113	#else
	3114	notimpl
	3115	#endif
	3116	/// NOTE(review): `notimpl' is defined outside this section; presumably it marks the routine as not yet written -- confirm.
	3117	endproc
	3118
	3119	proc x33
	3120	/// x33: placeholder -- every architecture arm below is only `notimpl'.
	3121	#if defined(__x86_64__)
	3122
	3123	notimpl
	3124
	3125	#elif defined(__i386__)
	3126
	3127	notimpl
	3128
	3129	#elif defined(__arm__)
	3130
	3131	notimpl
	3132
	3133	#elif defined(__aarch64__)
	3134
	3135	notimpl
	3136	/// The `#else' arm never builds: the preliminaries `#error' out there.
	3137	#else
	3138	notimpl
	3139	#endif
	3140	/// NOTE(review): `notimpl' is defined outside this section; presumably it marks the routine as not yet written -- confirm.
	3141	endproc
	3142
	3143	proc x34
	3144	/// x34: placeholder -- every architecture arm below is only `notimpl'.
	3145	#if defined(__x86_64__)
	3146
	3147	notimpl
	3148
	3149	#elif defined(__i386__)
	3150
	3151	notimpl
	3152
	3153	#elif defined(__arm__)
	3154
	3155	notimpl
	3156
	3157	#elif defined(__aarch64__)
	3158
	3159	notimpl
	3160	/// The `#else' arm never builds: the preliminaries `#error' out there.
	3161	#else
	3162	notimpl
	3163	#endif
	3164	/// NOTE(review): `notimpl' is defined outside this section; presumably it marks the routine as not yet written -- confirm.
	3165	endproc
	3166
	3167	proc x35
	3168	/// x35: placeholder -- every architecture arm below is only `notimpl'.
	3169	#if defined(__x86_64__)
	3170
	3171	notimpl
	3172
	3173	#elif defined(__i386__)
	3174
	3175	notimpl
	3176
	3177	#elif defined(__arm__)
	3178
	3179	notimpl
	3180
	3181	#elif defined(__aarch64__)
	3182
	3183	notimpl
	3184	/// The `#else' arm never builds: the preliminaries `#error' out there.
	3185	#else
	3186	notimpl
	3187	#endif
	3188	/// NOTE(review): `notimpl' is defined outside this section; presumably it marks the routine as not yet written -- confirm.
	3189	endproc
	3190
	3191	proc x36
	3192	/// x36: placeholder -- every architecture arm below is only `notimpl'.
	3193	#if defined(__x86_64__)
	3194
	3195	notimpl
	3196
	3197	#elif defined(__i386__)
	3198
	3199	notimpl
	3200
	3201	#elif defined(__arm__)
	3202
	3203	notimpl
	3204
	3205	#elif defined(__aarch64__)
	3206
	3207	notimpl
	3208	/// The `#else' arm never builds: the preliminaries `#error' out there.
	3209	#else
	3210	notimpl
	3211	#endif
	3212	/// NOTE(review): `notimpl' is defined outside this section; presumably it marks the routine as not yet written -- confirm.
	3213	endproc
	3214
	3215	proc x37
	3216	/// x37: placeholder -- every architecture arm below is only `notimpl'.
	3217	#if defined(__x86_64__)
	3218
	3219	notimpl
	3220
	3221	#elif defined(__i386__)
	3222
	3223	notimpl
	3224
	3225	#elif defined(__arm__)
	3226
	3227	notimpl
	3228
	3229	#elif defined(__aarch64__)
	3230
	3231	notimpl
	3232	/// The `#else' arm never builds: the preliminaries `#error' out there.
	3233	#else
	3234	notimpl
	3235	#endif
	3236	/// NOTE(review): `notimpl' is defined outside this section; presumably it marks the routine as not yet written -- confirm.
	3237	endproc
	3238
	3239	proc x38
	3240	/// x38: placeholder -- every architecture arm below is only `notimpl'.
	3241	#if defined(__x86_64__)
	3242
	3243	notimpl
	3244
	3245	#elif defined(__i386__)
	3246
	3247	notimpl
	3248
	3249	#elif defined(__arm__)
	3250
	3251	notimpl
	3252
	3253	#elif defined(__aarch64__)
	3254
	3255	notimpl
	3256	/// The `#else' arm never builds: the preliminaries `#error' out there.
	3257	#else
	3258	notimpl
	3259	#endif
	3260	/// NOTE(review): `notimpl' is defined outside this section; presumably it marks the routine as not yet written -- confirm.
	3261	endproc
	3262
	3263	proc x39
	3264	/// x39: placeholder -- every architecture arm below is only `notimpl'.
	3265	#if defined(__x86_64__)
	3266
	3267	notimpl
	3268
	3269	#elif defined(__i386__)
	3270
	3271	notimpl
	3272
	3273	#elif defined(__arm__)
	3274
	3275	notimpl
	3276
	3277	#elif defined(__aarch64__)
	3278
	3279	notimpl
	3280	/// The `#else' arm never builds: the preliminaries `#error' out there.
	3281	#else
	3282	notimpl
	3283	#endif
	3284	/// NOTE(review): `notimpl' is defined outside this section; presumably it marks the routine as not yet written -- confirm.
	3285	endproc
	3286
	3287	proc x3a
	3288	/// x3a: placeholder -- every architecture arm below is only `notimpl'.
	3289	#if defined(__x86_64__)
	3290
	3291	notimpl
	3292
	3293	#elif defined(__i386__)
	3294
	3295	notimpl
	3296
	3297	#elif defined(__arm__)
	3298
	3299	notimpl
	3300
	3301	#elif defined(__aarch64__)
	3302
	3303	notimpl
	3304	/// The `#else' arm never builds: the preliminaries `#error' out there.
	3305	#else
	3306	notimpl
	3307	#endif
	3308	/// NOTE(review): `notimpl' is defined outside this section; presumably it marks the routine as not yet written -- confirm.
	3309	endproc
	3310
	3311	proc x3b
	3312	/// x3b: placeholder -- every architecture arm below is only `notimpl'.
	3313	#if defined(__x86_64__)
	3314
	3315	notimpl
	3316
	3317	#elif defined(__i386__)
	3318
	3319	notimpl
	3320
	3321	#elif defined(__arm__)
	3322
	3323	notimpl
	3324
	3325	#elif defined(__aarch64__)
	3326
	3327	notimpl
	3328	/// The `#else' arm never builds: the preliminaries `#error' out there.
	3329	#else
	3330	notimpl
	3331	#endif
	3332	/// NOTE(review): `notimpl' is defined outside this section; presumably it marks the routine as not yet written -- confirm.
	3333	endproc
	3334
	3335	proc x3c
	3336	/// x3c: placeholder -- every architecture arm below is only `notimpl'.
	3337	#if defined(__x86_64__)
	3338
	3339	notimpl
	3340
	3341	#elif defined(__i386__)
	3342
	3343	notimpl
	3344
	3345	#elif defined(__arm__)
	3346
	3347	notimpl
	3348
	3349	#elif defined(__aarch64__)
	3350
	3351	notimpl
	3352	/// The `#else' arm never builds: the preliminaries `#error' out there.
	3353	#else
	3354	notimpl
	3355	#endif
	3356	/// NOTE(review): `notimpl' is defined outside this section; presumably it marks the routine as not yet written -- confirm.
	3357	endproc
	3358
	3359	proc x3d
	3360	/// x3d: placeholder -- every architecture arm below is only `notimpl'.
	3361	#if defined(__x86_64__)
	3362
	3363	notimpl
	3364
	3365	#elif defined(__i386__)
	3366
	3367	notimpl
	3368
	3369	#elif defined(__arm__)
	3370
	3371	notimpl
	3372
	3373	#elif defined(__aarch64__)
	3374
	3375	notimpl
	3376	/// The `#else' arm never builds: the preliminaries `#error' out there.
	3377	#else
	3378	notimpl
	3379	#endif
	3380	/// NOTE(review): `notimpl' is defined outside this section; presumably it marks the routine as not yet written -- confirm.
	3381	endproc
	3382
	3383	proc x3e
	3384	/// x3e: placeholder -- every architecture arm below is only `notimpl'.
	3385	#if defined(__x86_64__)
	3386
	3387	notimpl
	3388
	3389	#elif defined(__i386__)
	3390
	3391	notimpl
	3392
	3393	#elif defined(__arm__)
	3394
	3395	notimpl
	3396
	3397	#elif defined(__aarch64__)
	3398
	3399	notimpl
	3400	/// The `#else' arm never builds: the preliminaries `#error' out there.
	3401	#else
	3402	notimpl
	3403	#endif
	3404	/// NOTE(review): `notimpl' is defined outside this section; presumably it marks the routine as not yet written -- confirm.
	3405	endproc
	3406
	3407	proc x3f
	3408	/// x3f: placeholder -- every architecture arm below is only `notimpl'.
	3409	#if defined(__x86_64__)
	3410
	3411	notimpl
	3412
	3413	#elif defined(__i386__)
	3414
	3415	notimpl
	3416
	3417	#elif defined(__arm__)
	3418
	3419	notimpl
	3420
	3421	#elif defined(__aarch64__)
	3422
	3423	notimpl
	3424	/// The `#else' arm never builds: the preliminaries `#error' out there.
	3425	#else
	3426	notimpl
	3427	#endif
	3428	/// NOTE(review): `notimpl' is defined outside this section; presumably it marks the routine as not yet written -- confirm.
	3429	endproc
	3430
	3431	///----- That's all, folks --------------------------------------------------