-/// -*- mode: asm; asm-comment-char: ?/ -*-
+/// -*- mode: asm; asm-comment-char: 0 -*-
+
+///--------------------------------------------------------------------------
+/// Preliminaries.
+
+#include <sys/syscall.h>
+
+#if defined(__i386__) || defined(__x86_64__)
.intel_syntax noprefix
- .section .note.GNU-stack, "", @progbits
+#elif defined(__arm__)
+
+.macro ret
+ bx r14
+.endm
+
+ .arch armv7-a
+ .fpu neon
+
+#elif defined(__aarch64__)
+
+.macro cmov rd, rn, cc
+ csel \rd, \rn, \rd, \cc
+.endm
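+
+// for each conditional instruction named below, define `inst.cc' aliases
+// (e.g., `cmov.lo', `cset.ne') which tack the condition on as a final
+// operand, so that it can be written where a32 would put its suffix.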
+#define _COND(_) \
+ _(eq) _(ne) _(cs) _(cc) _(vs) _(vc) _(mi) _(pl) \
+ _(ge) _(lt) _(gt) _(le) _(hi) _(ls) _(al) _(nv) \
+ _(hs) _(lo)
+#define _INST(_) \
+ _(ccmp) _(ccmn) \
+ _(csel) _(cmov) \
+ _(csinc) _(cinc) _(cset) \
+ _(csneg) _(cneg) \
+ _(csinv) _(cinv) _(csetm)
+#define _CONDVAR(cc) _definstvar cc;
+#define _INSTVARS(inst) \
+ .macro _definstvar cc; \
+ .macro inst.\cc args:vararg; inst \args, \cc; .endm; \
+ .endm; \
+ _COND(_CONDVAR); \
+ .purgem _definstvar;
+ _INST(_INSTVARS)
+#undef _COND
+#undef _INST
+#undef _CONDVAR
+#undef _INSTVARS
+
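+// immediates for the `#nzcv' operand of `ccmp' and `ccmn': the flag values
+// loaded if the instruction's condition fails.  the first group names the
+// individual flag bits; the second gives, for each condition code, a
+// combination of flags which satisfies it.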
+#define CCMP_N 8
+#define CCMP_Z 4
+#define CCMP_C 2
+#define CCMP_V 1
+
+#define CCMP_MI CCMP_N
+#define CCMP_PL 0
+#define CCMP_EQ CCMP_Z
+#define CCMP_NE 0
+#define CCMP_CS CCMP_C
+#define CCMP_HS CCMP_C
+#define CCMP_CC 0
+#define CCMP_LO 0
+#define CCMP_VS CCMP_V
+#define CCMP_VC 0
+#define CCMP_HI CCMP_C
+#define CCMP_LS 0
+#define CCMP_LT CCMP_N
+#define CCMP_GE 0
+#define CCMP_LE CCMP_N
+#define CCMP_GT 0
+
+#else
+# error "not supported"
+#endif
.macro proc name
.globl \name
.endm
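+
+// print the character `\c' to standard output and flush, preserving all of
+// the caller's registers and flags; useful for tracing one's way through a
+// function.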
.macro ch c
+#if defined(__i386__)
+
+ pushf
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push ebp
+ mov ebp, esp
+ and esp, -16
+
+ push \c
+ call putchar@plt
+
+ call get_pc_ebx
+	add	ebx, offset _GLOBAL_OFFSET_TABLE_
+	mov	eax, [ebx + stdout@GOT]
+	mov	eax, [eax]
+	mov	[esp], eax		// pass stdout in putchar's old slot
+	call	fflush@plt
+
+ mov esp, ebp
+ pop ebp
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ popf
+
+#elif defined(__x86_64__)
+
pushf
push rax
push rcx
pop rcx
pop rax
popf
+
+#elif defined(__arm__)
+
+ stmfd r13!, {r0-r4, r12, r14}
+
+ mov r4, r13
+ bic r14, r4, #15
+ mov r13, r14
+
+ mov r0, #\c
+ bl putchar@plt
+
+	ldr	r14, .L$_c$gotoff$\@
+.L$_c$gotpc$\@:
+	add	r14, pc, r14
+	ldr	r0, .L$_c$stdout$\@
+	ldr	r0, [r14, r0]		// address of stdout itself
+	ldr	r0, [r0]		// the stream to flush
+	b	.L$_c$cont$\@
+.L$_c$gotoff$\@:
+	.word	_GLOBAL_OFFSET_TABLE_ - .L$_c$gotpc$\@ - 8
+.L$_c$stdout$\@:
+	.word	stdout(GOT)
+.L$_c$cont$\@:
+	bl	fflush@plt
+
+ mov r13, r4
+ ldmfd r13!, {r0-r4, r12, r14}
+
+#elif defined(__aarch64__)
+
+ sub sp, sp, #20*8
+ stp x0, x1, [sp, #0]
+ stp x2, x3, [sp, #16]
+ stp x4, x5, [sp, #32]
+ stp x6, x7, [sp, #48]
+ stp x8, x9, [sp, #64]
+ stp x10, x11, [sp, #80]
+ stp x12, x13, [sp, #96]
+ stp x14, x15, [sp, #112]
+ stp x16, x17, [sp, #128]
+ mrs x16, nzcv
+ stp x16, x30, [sp, #144]
+
+ mov w0, #\c
+ bl putchar
+ adrp x0, :got:stdout
+ ldr x0, [x0, #:got_lo12:stdout]
+ ldr x0, [x0]
+ bl fflush
+
+ ldp x16, x30, [sp, #144]
+ msr nzcv, x16
+ ldp x16, x17, [sp, #128]
+ ldp x14, x15, [sp, #112]
+ ldp x12, x13, [sp, #96]
+ ldp x10, x11, [sp, #80]
+ ldp x8, x9, [sp, #64]
+ ldp x6, x7, [sp, #48]
+ ldp x4, x5, [sp, #32]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #0]
+ add sp, sp, #20*8
+
+#else
+# error "not supported"
+#endif
.endm
+.macro notimpl
+#if defined(__i386__) || defined(__x86_64__)
+ ud2
+#elif defined(__arm__)
+ udf
+#elif defined(__aarch64__)
+ hlt #0
+#else
+# error "not supported"
+#endif
+.endm
+
+ .section .note.GNU-stack, "", %progbits
+
.text
+#if defined(__i386__)
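+// return in ebx the address of the instruction following the caller's
+// `call': the traditional i386 way of finding the current pc for pic
+// address arithmetic.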
+get_pc_ebx:
+ mov ebx, [esp]
+ ret
+#endif
+
+
proc call_example
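+	// call the function whose address is the first argument, having
+	// loaded the general registers and (most of) the processor flags
+	// from the structure given as the second argument; when it
+	// returns, write the final register and flag values back into the
+	// structure.
+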
+#if defined(__i386__)
+
+ push ebx // ebx
+ push esi // esi, ebx
+ push edi // edi, esi, ebx
+ push ebp // flags, ebp, ..., ebx
+ pushf
+
+ mov edi, [esp + 4*6]
+ mov esi, [esp + 4*7]
+ push esi // regs, flags, ebp, ..., ebx
+
+ call get_pc_ebx
+ lea eax, [ebx + 9f - .]
+ push eax // cont, regs, flags, ebp, ..., ebx
+ push edi // func, cont, regs, flags, ebp, ..., ebx
+
+ mov eax, [esi + 28]
+ pushf
+ pop ecx
+ and eax, 0x0cd5
+ and ecx, ~0x0cd5
+ or eax, ecx
+ push eax
+ popf
+ mov eax, [esi + 0]
+ mov ebx, [esi + 4]
+ mov ecx, [esi + 8]
+ mov edx, [esi + 12]
+ mov edi, [esi + 20]
+ mov ebp, [esi + 24]
+ mov esi, [esi + 16]
+
+ ret // -> func; regs, flags, ebp, ..., ebx
+
+9: pushf // eflags, regs, flags, ebp, ..., ebx
+ push esi // esi, eflags, regs, flags, ebp, ..., ebx
+ mov esi, [esp + 8]
+ mov [esi + 0], eax
+ mov [esi + 4], ebx
+ mov [esi + 8], ecx
+ mov [esi + 12], edx
+ mov [esi + 20], edi
+ mov [esi + 24], ebp
+ pop eax // rflags, regs, flags, ebp, ..., ebx
+ mov [esi + 16], eax
+ pop eax // regs, flags, ebp, ..., ebx
+ mov [esi + 28], eax
+
+ add esp, 4 // flags, ebp, ..., ebx
+ popf // ebp, ..., ebx
+ pop ebp // ..., ebx
+ pop edi
+ pop esi
+ pop ebx //
+ ret
+
+#elif defined(__x86_64__)
+
push rbx // rbx
push r10
push r11
push rax // cont, regs, flags, rbp, ..., rbx
push rdi // func, cont, regs, flags, rbp, ..., rbx
- mov rax, [rsi + 56]
+ mov rax, [rsi + 8*15]
pushf
pop rcx
and rax, 0x0cd5
or rax, rcx
push rax
popf
- mov rax, [rsi + 0]
- mov rbx, [rsi + 8]
- mov rcx, [rsi + 16]
- mov rdx, [rsi + 24]
- mov rdi, [rsi + 40]
- mov rbp, [rsi + 48]
- mov rsi, [rsi + 32]
+ mov rax, [rsi + 0]
+ mov rbx, [rsi + 8]
+ mov rcx, [rsi + 16]
+ mov rdx, [rsi + 24]
+ mov rdi, [rsi + 40]
+ mov rbp, [rsi + 48]
+ mov r8, [rsi + 56]
+ mov r9, [rsi + 64]
+ mov r10, [rsi + 72]
+ mov r11, [rsi + 80]
+ mov r12, [rsi + 88]
+ mov r13, [rsi + 96]
+ mov r14, [rsi + 104]
+ mov r15, [rsi + 112]
+ mov rsi, [rsi + 32]
ret // -> func; regs, flags, rbp, ..., rbx
9: pushf // rflags, regs, flags, rbp, ..., rbx
push rsi // rsi, rflags, regs, flags, rbp, ..., rbx
mov rsi, [rsp + 16]
- mov [rsi + 0], rax
- mov [rsi + 8], rbx
- mov [rsi + 16], rcx
- mov [rsi + 24], rdx
- mov [rsi + 40], rdi
- mov [rsi + 48], rbp
+ mov [rsi + 0], rax
+ mov [rsi + 8], rbx
+ mov [rsi + 16], rcx
+ mov [rsi + 24], rdx
+ mov [rsi + 40], rdi
+ mov [rsi + 48], rbp
+ mov [rsi + 56], r8
+ mov [rsi + 64], r9
+ mov [rsi + 72], r10
+ mov [rsi + 80], r11
+ mov [rsi + 88], r12
+ mov [rsi + 96], r13
+ mov [rsi + 104], r14
+ mov [rsi + 112], r15
pop rax // rflags, regs, flags, rbp, ..., rbx
- mov [rsi + 32], rax
+ mov [rsi + 32], rax
pop rax // regs, flags, rbp, ..., rbx
- mov [rsi + 56], rax
+ mov [rsi + 120], rax
add rsp, 8 // flags, rbp, ..., rbx
popf // rbp, ..., rbx
pop rbx //
ret
+#elif defined(__arm__)
+
+ stmfd r13!, {r0, r1, r4-r11, r14}
+ ldmia r1, {r0-r12, r14}
+ msr cpsr, r14
+ mov r14, pc
+ ldr pc, [r13], #4
+ ldr r14, [r13], #4
+ stmia r14!, {r0-r12}
+ mrs r0, cpsr
+ str r0, [r14]
+ ldmfd r13!, {r4-r11, pc}
+
+#elif defined(__aarch64__)
+
+ stp x29, x30, [sp, #-14*8]!
+ mov x29, sp
+ stp x19, x20, [sp, #16]
+ stp x21, x22, [sp, #32]
+ stp x23, x24, [sp, #48]
+ stp x25, x26, [sp, #64]
+ stp x27, x28, [sp, #80]
+ str x1, [sp, #104]
+
+ ldp x29, x30, [x1, #224]
+ msr nzcv, x30
+ mov x30, x0
+ ldp x27, x28, [x1, #208]
+ ldp x25, x26, [x1, #192]
+ ldp x23, x24, [x1, #176]
+ ldp x21, x22, [x1, #160]
+ ldp x19, x20, [x1, #144]
+ ldp x16, x17, [x1, #128]
+ ldp x14, x15, [x1, #112]
+ ldp x12, x13, [x1, #96]
+ ldp x10, x11, [x1, #80]
+ ldp x8, x9, [x1, #64]
+ ldp x6, x7, [x1, #48]
+ ldp x4, x5, [x1, #32]
+ ldp x2, x3, [x1, #16]
+ ldp x0, x1, [x1, #0]
+
+ blr x30
+
+ ldr x30, [sp, #104]
+ stp x27, x28, [x30, #208]
+ stp x25, x26, [x30, #192]
+ stp x23, x24, [x30, #176]
+ stp x21, x22, [x30, #160]
+ stp x19, x20, [x30, #144]
+ stp x16, x17, [x30, #128]
+ stp x14, x15, [x30, #112]
+ stp x12, x13, [x30, #96]
+ stp x10, x11, [x30, #80]
+ stp x8, x9, [x30, #64]
+ stp x6, x7, [x30, #48]
+ stp x4, x5, [x30, #32]
+ stp x2, x3, [x30, #16]
+ stp x0, x1, [x30, #0]
+ mov x0, x30
+ mrs x30, nzcv
+ stp x29, x30, [x0, #224]
+
+ ldp x19, x20, [sp, #16]
+ ldp x21, x22, [sp, #32]
+ ldp x23, x24, [sp, #48]
+ ldp x25, x26, [sp, #64]
+ ldp x27, x28, [sp, #80]
+ ldp x29, x30, [sp], #14*8
+
+ ret
+
+#else
+# error "not supported"
+#endif
+
endproc
proc nop
endproc
///--------------------------------------------------------------------------
+/// 0x00--0x0f
proc x00
// clear all 64 bits of extended traditional registers
- xor eax,eax // clear rax
- lea rbx,[0] // rbx -> _|_
+
+#if defined(__x86_64__)
+
+ xor eax, eax // clear rax
+ lea rbx, [0] // rbx -> _|_
loop . // iterate, decrement rcx until zero
- mov rdx,0 // set rdx = 0
- and esi,0 // clear all bits of rsi
- sub edi,edi // set rdi = edi - edi = 0
+ mov rdx, 0 // set rdx = 0
+ and esi, 0 // clear all bits of rsi
+ sub edi, edi // set rdi = edi - edi = 0
push 0
pop rbp // pop 0 into rbp
+#elif defined(__i386__)
+
+ xor eax, eax
+ lea ebx, [0]
+ loop .
+ mov edx, 0
+ and esi, 0
+ sub edi, edi
+ push 0
+ pop ebp
+
+#elif defined(__arm__)
+
+ eor r0, r0, r0
+ rsb r1, r1, r1
+0: subs r2, r2, #1
+ bne 0b
+ mov r3, #0
+ and r4, r4, #0
+ sub r5, r5, r5
+
+#elif defined(__aarch64__)
+
+ eor w0, w0, w0
+ mov w1, wzr
+0: sub w2, w2, #1
+ cbnz w2, 0b
+ mov w3, #0
+ and w4, w4, wzr
+ sub w5, w5, w5
+
+#else
+ notimpl
+#endif
+
ret
endproc
//
// on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1}
// and f_{i+c}, where f_{i+1} = f_i + f_{i-1}
+
+#if defined(__x86_64__)
+
0: xadd rax, rdx // a, d = a + d, a
// = f_{i+1} + f_i, f_{i+1}
// = f_{i+2}, f_{i+1}
loop 0b // advance i, decrement c, iterate
+#elif defined(__i386__)
+
+0: xadd eax, edx
+ loop 0b
+
+#elif defined(__arm__)
+
+0: subs r2, r2, #2
+ add r3, r3, r0
+ blo 8f
+ add r0, r0, r3
+ bhi 0b
+
+8: movne r0, r3
+
+#elif defined(__aarch64__)
+
+0: subs x2, x2, #2
+ add x3, x3, x0
+ b.lo 8f
+ add x0, x0, x3
+ b.hi 0b
+
+8: cmov.ne x0, x3
+
+#else
+ notimpl
+#endif
+
ret
endproc
// boolean canonify a: if a = 0 on entry, leave it zero; otherwise
// set a = 1
+
+#if defined(__x86_64__)
+
neg rax // set cf iff a /= 0
sbb rax, rax // a = a - a - cf = -cf
neg rax // a = cf
+#elif defined(__i386__)
+
+ neg eax
+ sbb eax, eax
+ neg eax
+
+#elif defined(__arm__)
+
+ movs r1, r0 // the easy way
+	movne	r1, #1			// mvnne r1, #0 for mask
+
+ cmp r0, #1 // clear cf iff a == 0
+ sbc r2, r0, r0 // c' = a - a - 1 + cf = cf - 1
+ add r2, r2, #1 // c' = cf
+
+ sub r3, r0, r0, lsr #1 // d' top bit clear; d' = 0 iff a = 0
+ rsb r3, r3, #0 // d' top bit set iff a /= 0
+ mov r3, r3, lsr #31 // asr for mask
+
+ rsbs r0, r0, #0
+ sbc r0, r0, r0
+ rsb r0, r0, #0
+
+#elif defined(__aarch64__)
+
+ cmp x0, #0 // trivial
+ cset.ne x1 // csetm for mask
+
+ cmp xzr, x0 // set cf iff a == 0
+ sbc x2, x0, x0 // c' = a - a - 1 + cf = cf - 1
+ neg x2, x2 // c' = 1 - cf
+
+	sub	x3, x0, x0, lsr #1	// if a < 2^63 then d' = ceil(a/2) <
+ // 2^63
+ // if a >= 2^63, write a = 2^63 + t
+ // with t < 2^63; d' = 2^63 - 2^62 +
+ // ceil(t/2) = 2^62 + ceil(t/2), and
+ // ceil(t/2) < 2^62
+ // anyway d' < 2^63 and d' = 0 iff
+ // a = 0
+ neg x3, x3 // d' top bit set iff a /= 0
+ lsr x3, x3, #63 // asr for mask
+
+ cmp x0, #1 // set cf iff a /= 0
+ adc x0, xzr, xzr // a' = 0 + 0 + cf = cf
+
+#else
+ notimpl
+#endif
+
ret
endproc
proc x03
// set a = min(a, d) (unsigned); clobber c, d
+
+#if defined(__x86_64__)
+
sub rdx, rax // d' = d - a; set cf if a > d
sbb rcx, rcx // c = -cf = -[a > d]
and rcx, rdx // c = a > d ? d - a : 0
add rax, rcx // a' = a > d ? d : a
+#elif defined(__i386__)
+
+ sub edx, eax
+ sbb ecx, ecx
+ and ecx, edx
+ add eax, ecx
+
+#elif defined(__arm__)
+
+ cmp r0, r3 // the easy way
+ movlo r1, r0 // only needed for out-of-place
+ movhs r1, r3
+
+ subs r3, r3, r0
+ sbc r12, r12, r12
+ and r12, r12, r3
+ add r0, r0, r12
+
+#elif defined(__aarch64__)
+
+ cmp x0, x3 // the easy way
+ csel.lo x1, x0, x3
+
+ subs x3, x3, x0 // d' = d - a; set cf if d >= a
+ sbc x16, xzr, xzr // t = -1 + cf = -[a > d]
+ and x16, x16, x3 // t = a > d ? d - a : 0
+ add x0, x0, x16 // a' = a > d ? d : a
+
+#else
+ notimpl
+#endif
+
ret
endproc
proc x04
// switch case?
+
+#if defined(__x86_64__)
+
+ // unrelated playing
+ mov ecx, eax
+ mov rbx, -1
+ mov edx, ecx
+ sub edx, '0'
+ cmp edx, 10
+ cmovb rbx, rdx
+ or ecx, 0x20
+ mov edx, ecx
+ sub edx, 'a'
+ sub ecx, 'a' - 10
+ cmp edx, 6
+ cmovb rbx, rcx
+
xor al, 0x20
+#elif defined(__i386__)
+
+ // unrelated playing
+ mov ecx, eax
+ mov ebx, -1
+ mov edx, ecx
+ sub edx, '0'
+ cmp edx, 10
+ cmovb ebx, edx
+ or ecx, 0x20
+ mov edx, ecx
+ sub edx, 'a'
+ sub ecx, 'a' - 10
+ cmp edx, 6
+ cmovb ebx, ecx
+
+ xor al, 0x20
+
+#elif defined(__arm__)
+
+ // unrelated playing
+ mvn r1, #0
+ sub r12, r0, #'0'
+ cmp r12, #10
+ movlo r1, r12
+ orr r12, r0, #0x20
+ sub r12, r12, #'a'
+ cmp r12, #6
+ addlo r1, r12, #10
+
+ eor r0, r0, #0x20
+
+#elif defined(__aarch64__)
+
+ // unrelated playing
+ mov x1, #-1
+ sub w16, w0, #'0'
+ cmp w16, #10
+ cmov.lo x1, x16
+ orr w16, w0, #0x20
+ sub w16, w16, #'a' - 10
+ cmp w16, #10
+ ccmp.hs w16, #16, #CCMP_HS
+ cmov.lo x1, x16
+
+ eor w0, w0, #0x20
+
+#else
+ notimpl
+#endif
+
ret
endproc
proc x05
// answer whether 5 <= a </<= 9.
+
+#if defined(__x86_64__)
+
sub rax, 5 // a' = a - 5
cmp rax, 4 // is a' - 5 </<= 4?
// g/nle a' > 4 a > 9 or a < -2^63 + 5
// le/ng a' <= 4 -2^63 + 5 <= a <= 9
+#elif defined(__i386__)
+
+ sub eax, 5
+ cmp eax, 4
+
+#elif defined(__arm__)
+
+ // i dimly remember having a slick way to do this way back in the
+ // day, but i can't figure it out any more.
+ sub r0, #5
+ cmp r0, #4
+
+#elif defined(__aarch64__)
+
+ // literal translation is too obvious
+ cmp x0, #5
+ ccmp.hs x0, #9, #CCMP_HS
+
+#else
+ notimpl
+#endif
+
ret
endproc
// leave a unchanged, but set zf if a = 0, cf if a /= 0, clear of,
// set sf to msb(a)
+
+#if defined(__x86_64__)
+
not rax // a' = -a - 1
inc rax // a' = -a
neg rax // a' = a
+#elif defined(__i386__)
+
+ not eax
+ inc eax
+ neg eax
+
+#elif defined(__arm__)
+
+ mvn r0, r0
+ add r0, r0, #1
+ rsbs r0, r0, #0 // cf has opposite sense
+
+#elif defined(__aarch64__)
+
+ mvn x0, x0
+ add x0, x0, #1
+ negs x0, x0 // cf has opposite sense
+
+#else
+ notimpl
+#endif
+
ret
endproc
proc x07
// same as before (?)
+
+#if defined(__x86_64__)
+
inc rax // a' = a + 1
neg rax // a' = -a - 1
inc rax // a' = -a
neg rax // a' = a
+#elif defined(__i386__)
+
+ inc eax
+ neg eax
+ inc eax
+ neg eax
+
+#elif defined(__arm__)
+
+ add r0, r0, #1
+ rsb r0, r0, #0
+ add r0, r0, #1
+ rsbs r0, r0, #0
+
+#elif defined(__aarch64__)
+
+ add x0, x0, #1
+ neg x0, x0
+ add x0, x0, #1
+ negs x0, x0 // cf has opposite sense
+
+#else
+ notimpl
+#endif
+
ret
endproc
// floor((a + d)/2), correctly handling overflow conditions; final cf
// is lsb(a + d), probably uninteresting
+
+#if defined(__x86_64__)
+
add rax, rdx // cf || a' = a + d
rcr rax, 1 // shift 65-bit result right by one
// place; lsb moves into carry
+#elif defined(__i386__)
+
+ add eax, edx
+ rcr eax, 1
+
+#elif defined(__arm__)
+
+ // like the two-instruction a64 version
+ sub r1, r3, r0
+ add r1, r0, r1, lsr #1
+
+ // the slick version, similar to the above
+ adds r0, r0, r3
+ mov r0, r0, rrx
+
+#elif defined(__aarch64__)
+
+ // a64 lacks a32's rrx. literal translation.
+ adds x1, x0, x3 // cf || a' = a + d
+ adc x16, xzr, xzr // realize cf in extra register
+ extr x1, x16, x1, #1 // shift down one place
+
+ // two instruction version: clobbers additional register. (if you
+ // wanted the answer in any other register, even overwriting d, then
+ // this is unnecessary.) also depends on d >= a.
+ sub x16, x3, x0 // compute difference
+ add x0, x0, x16, lsr #1 // add half of it (rounded down)
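+	// (sound because, when d >= a, the subtraction can't wrap and
+	// a + floor((d - a)/2) = floor((a + d)/2).)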
+
+#else
+ notimpl
+#endif
+
ret
endproc
// a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3
// (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8).
+
+#if defined(__x86_64__)
+
shr rax, 3 // a' = floor(a/8); cf = 1 if a ==
// 4, 5, 6, 7 (mod 8)
adc rax, 0 // a' = floor(a/8) + cf
+#elif defined(__i386__)
+
+ shr eax, 3
+ adc eax, 0
+
+#elif defined(__arm__)
+
+ movs r0, r0, lsr #3
+ adc r0, r0, #0
+
+#elif defined(__aarch64__)
+
+	tst	x0, #4			// zf clear iff bit 2 of a set
+	orr	x0, xzr, x0, lsr #3	// a' = floor(a/8)
+	cinc.ne	x0, x0			// round up if bit 2 was set
+
+#else
+ notimpl
+#endif
+
ret
endproc
proc x0a
// increment c-byte little-endian bignum at rdi
+
+#if defined(__x86_64__)
+
add byte ptr [rdi], 1
0: inc rdi
adc byte ptr [rdi], 0
loop 0b
+#elif defined(__i386__)
+
+ add byte ptr [edi], 1
+0: inc edi
+ adc byte ptr [edi], 0
+ loop 0b
+
+#elif defined(__arm__)
+
+ mov r12, #256 // set initial carry
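+	// (the carry travels in bit 8: each iteration adds the carry out
+	// of the previous byte, r12 >> 8, to the freshly loaded byte, and
+	// bit 8 of that sum is the carry into the following byte.)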
+0: ldrb r0, [r5]
+ subs r2, r2, #1
+ add r12, r0, r12, lsr #8
+ strb r12, [r5], #1
+ bne 0b
+
+#elif defined(__aarch64__)
+
+ mov w17, #256 // set initial carry
+0: ldrb w16, [x5]
+ sub x2, x2, #1
+ add w17, w16, w17, lsr #8
+ strb w17, [x5], #1
+ cbnz x2, 0b
+
+#else
+ notimpl
+#endif
+
ret
endproc
proc x0b
// negate double-precision d:a
+
+#if defined(__x86_64__)
+
not rdx // d' = -d - 1
neg rax // a' = -a;
// cf = 1 iff a /= 0
sbb rdx, -1 // d' = -d - cf
+#elif defined(__i386__)
+
+ not edx
+ neg eax
+ sbb edx, -1
+
+#elif defined(__arm__)
+
+ // reverse subtract is awesome
+ rsbs r0, r0, #0
+ rsc r3, r3, #0
+
+#elif defined(__aarch64__)
+
+ // easy way: everything is better with zero registers.
+ negs x0, x0
+ ngc x3, x3
+
+#else
+ notimpl
+#endif
+
ret
endproc
// rotate is distributive over xor.
+#if defined(__x86_64__)
+
// rax // = a_1 || a_0
// rbx // = b_1 || b_0
mov rcx, rax // = a_1 || a_0
cmp rax, rcx // always equal
+#elif defined(__i386__)
+
+ mov ecx, eax // = a_1 || a_0
+
+ xor ecx, ebx // = (a_1 XOR b_1) || (a_0 XOR b_0)
+ ror ecx, 0xd // = (a_0 XOR b_0) || (a_1 XOR b_1)
+
+ ror eax, 0xd // = a_0 || a_1
+ ror ebx, 0xd // = b_0 || b_1
+ xor eax, ebx // = (a_0 XOR b_0) || (a_1 XOR b_1)
+
+ cmp eax, ecx // always equal
+
+#elif defined(__arm__)
+
+
+ // r0 // = a_1 || a_0
+ // r1 // = b_1 || b_0
+ eor r2, r0, r1 // = (a_1 XOR b_1) || (a_0 XOR b_0)
+ mov r2, r2, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
+
+ mov r1, r1, ror #13 // = b_0 || b_1
+ eor r0, r1, r0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
+
+ cmp r0, r2 // always equal
+
+#elif defined(__aarch64__)
+
+ // x0 // = a_1 || a_0
+ // x1 // = b_1 || b_0
+ eor x2, x0, x1 // = (a_1 XOR b_1) || (a_0 XOR b_0)
+ ror x2, x2, #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
+
+ ror x1, x1, #13 // = b_0 || b_1
+ eor x0, x1, x0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
+
+ cmp x0, x2 // always equal
+
+#else
+ notimpl
+#endif
+
ret
endproc
// and is distributive over xor.
+#if defined(__x86_64__)
+
mov rdx, rbx // = b
xor rbx, rcx // = b XOR c
cmp rax, rbx // always equal
+#elif defined(__i386__)
+
+ mov edx, ebx // = b
+
+ xor ebx, ecx // = b XOR c
+ and ebx, eax // = a AND (b XOR c)
+
+ and edx, eax // = a AND b
+ and eax, ecx // = a AND c
+ xor eax, edx // = (a AND b) XOR (a AND c)
+ // = a AND (b XOR c)
+
+ cmp eax, ebx // always equal
+
+#elif defined(__arm__)
+
+ and r3, r0, r1 // = a AND b
+
+ eor r1, r1, r2 // = b XOR c
+ and r1, r1, r0 // = a AND (b XOR c)
+
+ and r0, r0, r2 // = a AND c
+ eor r0, r0, r3 // = (a AND b) XOR (a AND c)
+ // = a AND (b XOR c)
+
+ cmp r0, r1 // always equal
+
+#elif defined(__aarch64__)
+
+ and x3, x0, x1 // = a AND b
+
+ eor x1, x1, x2 // = b XOR c
+ and x1, x1, x0 // = a AND (b XOR c)
+
+ and x0, x0, x2 // = a AND c
+ eor x0, x0, x3 // = (a AND b) XOR (a AND c)
+ // = a AND (b XOR c)
+
+ cmp x0, x1 // always equal
+
+#else
+ notimpl
+#endif
+
ret
endproc
// de morgan's law
+#if defined(__x86_64__)
+
mov rcx, rax // = a
and rcx, rbx // = a AND b
or rax, rbx // = (NOT a) OR (NOT b)
// = NOT (a AND b)
- cmp rax, rcx
+ cmp rax, rcx // always equal
+
+#elif defined(__i386__)
+
+ mov ecx, eax // = a
+
+ and ecx, ebx // = a AND b
+ not ecx // = NOT (a AND b)
+
+ not eax // = NOT a
+ not ebx // = NOT b
+ or eax, ebx // = (NOT a) OR (NOT b)
+ // = NOT (a AND b)
+
+ cmp eax, ecx // always equal
+
+#elif defined(__arm__)
+
+ and r2, r0, r1 // = a AND b
+ mvn r2, r2 // = NOT (a AND b)
+
+ mvn r0, r0 // = NOT a
+ mvn r1, r1 // = NOT b
+ orr r0, r0, r1 // = (NOT a) OR (NOT b)
+
+ cmp r0, r2 // always equal
+
+#elif defined(__aarch64__)
+
+ and x2, x0, x1 // = a AND b
+ mvn x2, x2 // = NOT (a AND b)
+
+ mvn x0, x0 // = NOT a
+ orn x0, x0, x1 // = (NOT a) OR (NOT b)
+
+ cmp x0, x2 // always equal
+
+#else
+ notimpl
+#endif
ret
//
// not sure why you'd do this.
- cld
+#if defined(__x86_64__)
0: xor [rsi], al
lodsb
loop 0b
+#elif defined(__i386__)
+
+0: xor [esi], al
+ lodsb
+ loop 0b
+
+#elif defined(__arm__)
+
+0: ldrb r12, [r4]
+ subs r2, r2, #1
+ eor r0, r0, r12
+ strb r0, [r4], #1
+ bne 0b
+
+#elif defined(__aarch64__)
+
+0: ldrb w16, [x4]
+ sub x2, x2, #1
+ eor w0, w0, w16
+ strb w0, [x4], #1
+ cbnz x2, 0b
+
+#else
+ notimpl
+#endif
+
ret
endproc
+///--------------------------------------------------------------------------
+/// 0x10--0x1f
+
proc x10
// four different ways to swap a pair of registers.
+#if defined(__x86_64__)
+
push rax
push rcx
pop rax
xchg rax, rcx
+#elif defined(__i386__)
+
+ push eax
+ push ecx
+ pop eax
+ pop ecx
+
+ xor eax, ecx
+ xor ecx, eax
+ xor eax, ecx
+
+ add eax, ecx
+ sub ecx, eax
+ add eax, ecx
+ neg ecx
+
+ xchg eax, ecx
+
+#elif defined(__arm__)
+
+ stmfd r13!, {r0, r2}
+ ldr r0, [r13, #4]
+ ldr r2, [r13], #8
+
+ eor r0, r0, r2
+ eor r2, r2, r0
+ eor r0, r0, r2
+
+ sub r0, r0, r2
+ add r2, r2, r0
+ rsb r0, r0, r2 // don't need 3-addr with reverse-sub
+
+ mov r12, r0
+ mov r0, r2
+	mov	r2, r12
+
+#elif defined(__aarch64__)
+
+ // anything you can do
+ stp x0, x2, [sp, #-16]!
+ ldp x2, x0, [sp], #16
+
+ eor x0, x0, x2
+ eor x2, x2, x0
+ eor x0, x0, x2
+
+ // the add/sub/add thing was daft. you can do it in three if you're
+ // clever -- and have three-address operations.
+ sub x0, x0, x2
+ add x2, x2, x0
+ sub x0, x2, x0
+
+ // but we lack a fourth. we can't do this in fewer than three
+ // instructions without hitting memory. only `ldp' will modify two
+ // registers at a time, so we need at least two instructions -- but
+ // if the first one sets one of our two registers to its final value
+ // then we lose the other input value with no way to recover it, so
+ // we must either write a fresh third register, or write something
+ // other than the final value, and in both cases we need a third
+ // instruction to fix everything up. we've done the wrong-something-
+ // other trick twice, so here's the captain-obvious use-a-third-
+ // register version.
+ mov x16, x0
+ mov x0, x2
+ mov x2, x16
+
+#else
+ notimpl
+#endif
+
ret
endproc
// in particular, a will be zero (and zf set) if and only if the two
// strings are equal.
+#if defined(__x86_64__)
+
0: mov dl, [rsi]
xor dl, [rdi]
inc rsi
or al, dl
loop 0b
+#elif defined(__i386__)
+
+0: mov dl, [esi]
+ xor dl, [edi]
+ inc esi
+ inc edi
+ or al, dl
+ loop 0b
+
+#elif defined(__arm__)
+
+0: ldrb r1, [r4], #1
+ ldrb r12, [r5], #1
+ subs r2, r2, #1
+ eor r12, r12, r1
+ orr r0, r0, r12
+ bne 0b
+
+#elif defined(__aarch64__)
+
+0: ldrb w16, [x4], #1
+ ldrb w17, [x5], #1
+ sub x2, x2, #1
+ eor w16, w16, w17
+ orr w0, w0, w16
+ cbnz x2, 0b
+
+#else
+ notimpl
+#endif
+
ret
endproc
// move all of the set bits in d to a, unless there's already a bit
// there. this clearly doesn't change the sum.
+#if defined(__x86_64__)
+
mov rcx, rdx // c' = d
and rdx, rax // d' = a AND d
or rax, rcx // a' = a OR d
add rax, rdx
- ret
+#elif defined(__i386__)
-endproc
+ mov ecx, edx // c' = d
+ and edx, eax // d' = a AND d
+ or eax, ecx // a' = a OR d
+ add eax, edx
-proc x13
+#elif defined(__arm__)
- // ok, so this is a really obtuse way of adding a and b; the result
+ and r2, r0, r3 // c' = a AND d
+ orr r0, r0, r3 // a' = a OR d
+ add r0, r0, r2
+
+#elif defined(__aarch64__)
+
+ and x2, x0, x3 // c' = a AND d
+ orr x0, x0, x3 // a' = a OR d
+ add x0, x0, x2
+
+#else
+ notimpl
+#endif
+
+ ret
+
+endproc
+
+proc x13
+
+ // ok, so this is a really obtuse way of adding a and b; the result
// is in a and d. but why does it work?
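+	//
+	// a sketch of why: for any x and y, x + y = (x XOR y) + 2 (x AND y),
+	// since XOR is the bitwise sum with carries discarded and AND marks
+	// exactly the positions which generate a carry.  each trip round the
+	// loop therefore maps (a, b) |-> (a XOR b, (a AND b) << 1), which
+	// preserves a + b while giving b at least one more low-order zero
+	// bit; so after at most 64 iterations b = 0 and a alone holds the
+	// sum, which is why a fixed count of 0x40 suffices.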
+#if defined(__x86_64__)
+
mov rcx, 0x40 // carry chains at most 64 long
0: mov rdx, rax // copy a'
xor rax, rbx // low bits of each bitwise sum
and rbx, rdx // carry bits from each bitwise sum
- shl rbx, 001 // carry them into next position
+ shl rbx, 1 // carry them into next position
loop 0b
+#elif defined(__i386__)
+
+ mov ecx, 0x40 // carry chains at most 64 long
+0: mov edx, eax // copy a'
+ xor eax, ebx // low bits of each bitwise sum
+ and ebx, edx // carry bits from each bitwise sum
+ shl ebx, 1 // carry them into next position
+ loop 0b
+
+#elif defined(__arm__)
+
+ mov r2, #0x40
+0: and r3, r0, r1
+ subs r2, r2, #1
+ eor r0, r0, r1
+ lsl r1, r3, #1
+ bne 0b
+
+#elif defined(__aarch64__)
+
+ mov x2, #0x40
+0: and x3, x0, x1
+ sub x2, x2, #1
+ eor x0, x0, x1
+ lsl x1, x3, #1
+ cbnz x2, 0b
+
+#else
+ notimpl
+#endif
+
ret
endproc
// floor((a + d)/2), like x08.
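+	//
+	// (this time using a + d = 2 (a AND d) + (a XOR d): the AND picks up
+	// the carries and the XOR the carryless sum, so floor((a + d)/2) =
+	// (a AND d) + floor((a XOR d)/2) without any intermediate overflow.)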
+#if defined(__x86_64__)
+
mov rcx, rax // copy a for later
and rcx, rdx // carry bits
add rax, rcx // add the carries; done
+#elif defined(__i386__)
+
+ mov ecx, eax // copy a for later
+ and ecx, edx // carry bits
+
+ xor eax, edx // low bits of each bitwise sum
+ shr eax, 1 // divide by 2; carries now in place
+
+ add eax, ecx // add the carries; done
+
+#elif defined(__arm__)
+
+ and r2, r0, r3
+ eor r0, r0, r3
+ add r0, r2, r0, lsr #1
+
+#elif defined(__aarch64__)
+
+ and x2, x0, x3
+ eor x0, x0, x3
+ add x0, x2, x0, lsr #1
+
+#else
+ notimpl
+#endif
+
ret
endproc
// sign extension 32 -> 64 bits.
- //movsx rbx, eax // like this?
+#if defined(__x86_64__)
+
+ movsx rbx, eax // like this?
mov rdx, 0xffffffff80000000
add rax, rdx // if bit 31 of a is set then bits
// exactly backwards
xor rax, rdx // so fix it
+#elif defined(__i386__)
+
+ movsx ebx, ax // like this?
+
+ mov edx, 0xffff8000
+ add eax, edx // if bit 31 of a is set then bits
+ // 31--63 of a' are clear; otherwise,
+ // these bits are all set -- which is
+ // exactly backwards
+ xor eax, edx // so fix it
+
+#elif defined(__arm__)
+
+ sxth r1, r0 // like this
+
+ mov r12, #0x80000000
+ add r0, r0, r12, asr #16
+ eor r0, r0, r12, asr #16
+
+#elif defined(__aarch64__)
+
+ sxtw x1, w0 // like this
+
+ mov x16, #0xffffffff80000000
+ add x0, x0, x16
+ eor x0, x0, x16
+
+#else
+ notimpl
+#endif
+
ret
endproc
proc x16
- shl rax, 56
- shl rbx, 56
- shl rcx, 56
+ // ??? i don't know why you'd want to calculate this.
+
+#if defined(__x86_64__)
xor rax, rbx // a' = a XOR b
xor rbx, rcx // b' = b XOR c
xor rax, rbx // a' = cf ? 0 : a XOR c
cmp rax, rsi
+#elif defined(__i386__)
+
+ xor eax, ebx // a' = a XOR b
+ xor ebx, ecx // b' = b XOR c
+ mov esi, eax // t = a XOR b
+ add esi, ebx // t = (a XOR b) + (b XOR c)
+ cmovc eax, ebx // a' = cf ? b XOR c : a XOR b
+ xor eax, ebx // a' = cf ? 0 : a XOR c
+ cmp eax, esi
+
+#elif defined(__arm__)
+
+ eor r0, r0, r1
+ eor r1, r1, r2
+ adds r4, r0, r1
+ movcs r0, r1
+ eor r0, r0, r1
+ cmp r0, r4
+
+#elif defined(__aarch64__)
+
+ eor x0, x0, x1
+ eor x1, x1, x2
+ adds x4, x0, x1
+ cmov.cs x0, x1
+ eor x0, x0, x1
+ cmp x0, x4
+
+#else
+ notimpl
+#endif
+
ret
endproc
proc x17
- ud2
+ // absolute value
+
+#if defined(__x86_64__)
+
+ cqo // d = a < 0 ? -1 : 0
+ xor rax, rdx // a' = a < 0 ? -a - 1 : a
+ sub rax, rdx // a' = a < 0 ? -a : a
+
+#elif defined(__i386__)
+
+ cdq // d = a < 0 ? -1 : 0
+ xor eax, edx // a' = a < 0 ? -a - 1 : a
+ sub eax, edx // a' = a < 0 ? -a : a
+
+#elif defined(__arm__)
+
+ // direct approach
+ movs r1, r0
+ rsbmi r1, r0, #0
+
+ // faithful-ish conversion
+ eor r3, r0, r0, asr #31
+ sub r0, r3, r0, asr #31
+
+#elif defined(__aarch64__)
+
+ // direct approach
+ tst x0, #1 << 63
+ cneg.ne x1, x0
+
+ // faithful-ish conversion
+ eor x3, x0, x0, asr #63
+ sub x0, x3, x0, asr #63
+
+#else
+ notimpl
+#endif
+
+ ret
endproc
proc x18
- ud2
+ // should always set sf, clear zf, unless we get rescheduled to a
+ // different core.
+
+#if defined(__x86_64__)
+
+ rdtsc // d || a = cycles
+ shl rdx, 0x20
+ or rax, rdx // a = cycles
+ mov rcx, rax // c = cycles
+
+ rdtsc // d || a = cycles'
+ shl rdx, 0x20
+ or rax, rdx // a = cycles'
+
+ cmp rcx, rax
+
+#elif defined(__i386__)
+
+ rdtsc // d || a = cycles
+ mov ebx, eax
+ mov ecx, edx // c || b = cycles
+
+ rdtsc // d || a = cycles'
+
+ sub ebx, eax
+ sbb ecx, edx
+
+#elif defined(__arm__)
+
+ // cycle clock not available in user mode
+ mrrc p15, 0, r0, r1, c9
+ mrrc p15, 0, r2, r3, c9
+ subs r0, r0, r2
+ sbcs r1, r1, r3
+
+#elif defined(__aarch64__)
+
+ // cycle clock not available in user mode
+ mrs x0, pmccntr_el0
+ mrs x1, pmccntr_el0
+ cmp x0, x1
+
+#else
+ notimpl
+#endif
+
+ ret
endproc
proc x19
- ud2
+ // stupid way to capture a pointer to inline data and jump past it.
+ // confuses the return-address predictor something chronic. worse
+ // because amd64 calling convention doesn't usually pass arguments on
+ // the stack.
-endproc
+#if defined(__x86_64__)
-proc x1a
+ call 8f
+ .string "hello world!\n\0"
+8: call print_str
+ add rsp, 8
+ ret
- ud2
+print_str:
+ // actually implement this ridiculous thing
+ mov rsi, [rsp + 8]
+ xor edx, edx
+0: mov al, [rsi + rdx]
+ inc rdx
+ cmp al, 0
+ jnz 0b
+ mov eax, SYS_write
+ mov edi, 1
+ dec rdx
+ syscall // clobbers r11 :-(
+ ret
-endproc
+#elif defined(__i386__)
-proc x1b
+ call 8f
+ .string "hello world!\n\0"
+8: call print_str
+ add esp, 4
+ ret
- ud2
+print_str:
+ // actually implement this ridiculous thing
+ mov ecx, [esp + 4]
+ xor edx, edx
+0: mov al, [ecx + edx]
+ inc edx
+ cmp al, 0
+ jnz 0b
+ mov eax, SYS_write
+ mov ebx, 1
+ dec edx
+ int 0x80
+ ret
-endproc
+#elif defined(__arm__)
+
+ // why am i doing this?
+ stmfd r13!, {r14}
+ bl 8f
+ .string "hello world!\n\0"
+ .balign 4
+8: mov r1, r14 // might as well make it easy on myself
+ bl print_str
+ ldmfd r13!, {pc}
+
+print_str:
+ mov r2, #0
+0: ldrb r0, [r1, r2]
+ cmp r0, #0
+ addne r2, r2, #1
+ bne 0b
+ mov r0, #1
+ mov r7, #SYS_write
+ swi 0
+ bx r14
+
+#elif defined(__aarch64__)
+
+ // why am i doing this?
+ str x30, [sp, #-16]!
+ bl 8f
+ .string "hello world!\n\0"
+ .balign 4
+8: mov x1, x30 // might as well make it easy on myself
+ bl print_str
+ ldr x30, [sp], #16
+ ret
-proc x1c
+print_str:
+ mov x2, #0
+0: ldrb w0, [x1, x2]
+ cmp w0, #0
+ cinc.ne x2, x2
+ b.ne 0b
+ mov x0, #1
+ mov x8, #SYS_write
+ svc #0
+ ret
- ud2
+#else
+ notimpl
+#endif
endproc
-proc x1d
+proc x1a
- ud2
+ // collect the current instruction-pointer address. this was an old
+ // 32-bit i386 trick for position-independent code, but (a) it
+ // confuses the return predictor, and (b) amd64 has true pc-relative
+ // addressing.
-endproc
+#if defined(__x86_64__)
-proc x1e
+ // the actual example
+ call 0f
+0: pop rax
- ud2
+ // the modern i386 trick doesn't confuse the return-address
+ // predictor.
+ call calladdr_rbx
+ sub rbx, . - 0b
-endproc
+ // but rip-relative addressing is even better
+ lea rcx, [rip + 0b]
-proc x1f
+ ret
- ud2
+calladdr_rbx:
+ mov rbx, [rsp]
+ ret
-endproc
+#elif defined(__i386__)
-proc x20
+ // the actual example
+ call 0f
+0: pop eax
- ud2
+ // the modern i386 trick doesn't confuse the return-address
+ // predictor.
+ call get_pc_ebx
+ sub ebx, . - 0b
ret
-endproc
+#elif defined(__arm__)
-proc x21
+ stmfd r13!, {r14}
- ud2
+ bl 0f
+0: mov r0, r14
-endproc
+ bl return
+ sub r1, r14, #. - 0b
-proc x22
+ adr r2, 0b
- ud2
+ ldmfd r13!, {pc}
-endproc
+return: bx r14
-proc x23
+#elif defined(__aarch64__)
- ud2
+ str x30, [sp, #-16]!
-endproc
+ // we can do all of the above using a64
+ bl 0f
+0: mov x0, x30
-proc x24
+ bl return
+ sub x1, x30, #. - 0b
- ud2
+ adr x2, 0b
+
+ ldr x30, [sp], #16
+return: ret
+
+#else
+ notimpl
+#endif
endproc
-proc x25
+proc x1b
- ud2
+#if defined(__x86_64__)
+
+ // retpolines: an mitigation against adversarially influenced
+ // speculative execution at indirect branches. if an adversary can
+ // prepare a branch-target buffer entry matching an indirect branch
+ // in the victim's address space then they can cause the victim to
+ // /speculatively/ (but not architecturally) execute any code in
+ // their address space, possibly leading to leaking secrets through
+ // the cache. retpolines aren't susceptible to this because the
+ // predicted destination address is from the return-prediction stack
+ // which the adversary can't prime. the performance penalty is still
+ // essentially a branch misprediction -- for this return, and
+ // possibly all others already stacked.
+
+ // (try not to crash)
+ lea rax, [rip + 9f]
-endproc
+ push rax
+9: ret
-proc x26
+#elif defined(__i386__)
- ud2
+ call get_pc_ebx
+ lea eax, [ebx + 9f - .]
-endproc
+ push eax
+9: ret
-proc x27
+#elif defined(__arm__)
- ud2
+ stmfd r13!, {r14}
-endproc
+ adr r14, 8f
+ bx r14
-proc x28
+8: ldmfd r13!, {pc}
- ud2
+#elif defined(__aarch64__)
-endproc
+ str x30, [sp, #-16]!
-proc x29
+ adr x30, 8f
+ ret
- ud2
+8: ldr x30, [sp], #16
+ ret
+
+#else
+ notimpl
+#endif
endproc
-proc x2a
+proc x1c
- ud2
+ // ok, having a hard time seeing a use for this. the most important
+ // thing to note is that sp is set from `pop' /after/ it's
+ // incremented.
-endproc
+#if defined(__x86_64__)
-proc x2b
+ // try not to crash
+ mov rax, rsp
+ and rsp, -16
+ push rax
- ud2
+ pop rsp
-endproc
+ // check it worked
+ mov rbx, rsp
+ ret
-proc x2c
+#elif defined(__i386__)
- ud2
+ // try not to crash
+ mov eax, esp
+ and esp, -16
+ push eax
-endproc
+ pop esp
-proc x2d
+ // check it worked
+ mov ebx, esp
+ ret
- ud2
+#elif defined(__arm__)
-endproc
+ // not even going to dignify this
+ notimpl
-proc x2e
+#elif defined(__aarch64__)
- ud2
+ // not even going to dignify this
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
-proc x2f
+proc x1d
- ud2
+ // monumentally cheesy way to copy 8 n bytes from buff1 to buff2.
+ // also clobbers words at buff2 + 8 n and buff2 - 8 for good measure.
-endproc
+ n = 4
-proc x30
+#if defined(__x86_64__)
- ud2
+ mov rax, rsp // safekeeping
+
+ // we're toast if we get hit by a signal now. fingers crossed...
+ .if 0
+ mov rsp, buff2 + 8*n + 8
+ mov rbp, buff1 + 8*n
+ .else
+ lea rsp, [rdi + 8*n + 16]
+ lea rbp, [rsi + 8*n]
+ .endif
+ enter 0, n + 1
+
+ // precise action:
+ //
+ // +---------+ +---------+
+ // rbp -> | ??? | rsp -> | ??? |
+ // +---------+ +---------+
+ // | w_{n-1} | | rbp | <- rbp'
+ // +---------+ +---------+
+ // | ... | | w_{n-1} |
+ // +---------+ +---------+
+ // | w_1 | | ... |
+ // +---------+ +---------+
+ // | w_0 | | w_1 |
+ // +---------+ +---------+
+ // | w_0 |
+ // +---------+
+ // | rbp' | <- rsp'
+ // +---------+
+
+ mov rdx, rsp
+ mov rsp, rax
+
+#elif defined(__i386__)
+
+ mov eax, esp // safekeeping
+
+ // we're toast if we get hit by a signal now. fingers crossed...
+ .if 0
+ mov esp, buff2 + 4*n + 4
+ mov ebp, buff1 + 4*n
+ .else
+ lea esp, [edi + 4*n + 8]
+ lea ebp, [esi + 4*n]
+ .endif
+ enter 0, n + 1
+
+ mov edx, esp
+ mov esp, eax
+
+#elif defined(__arm__)
+
+ add r4, r4, #4*n
+ add r5, r5, #4*n + 8
+
+ str r4, [r5, #-4]!
+ .rept n/2
+ ldrd r0, r1, [r4, #-8]!
+ strd r0, r1, [r5, #-8]!
+ .endr
+ add r4, r5, #4*n
+ str r4, [r5, #-4]!
+
+#elif defined(__aarch64__)
+
+ // omgwtf. let's not actually screw with the stack pointer.
+
+ add x4, x4, #8*n
+ add x5, x5, #8*n + 16
+
+ str x4, [x5, #-8]!
+ .rept n/2
+ ldp x16, x17, [x4, #-16]!
+ stp x16, x17, [x5, #-16]!
+ .endr
+ add x4, x5, #8*n
+ str x4, [x5, #-8]!
+
+#else
+ notimpl
+#endif
ret
endproc
-proc x31
+proc x1e
- ud2
+ // convert nibble value to (uppercase) hex; other input values yield
+ // nonsense.
+
+#if defined(__x86_64__)
+
+ // das doesn't work in 64-bit mode; best i can come up with
+ mov edx, eax
+ add al, '0'
+ add dl, 'A' - 10
+ cmp al, '9' + 1
+ cmovae eax, edx
+
+#elif defined(__i386__)
+
+	cmp	al, 0x0a		// cf = 1 iff a < 10
+	sbb	al, 0x69		// if 0 <= a < 10, a' = a - 0x6a, so
+					// 0x96 <= a' < 0xa0, setting af, cf
+					// if 10 <= a < 16, a' = a - 0x69, so
+					// 0xa1 <= a' < 0xa7, setting cf but
+					// clearing af
+	das				// if 0 <= a < 10, then af and cf are
+					// both set, so subtract 0x66
+					// from a' leaving 0x30 <= a' < 0x3a;
+					// if 10 <= a < 16 then af clear but
+					// cf set, so subtract 0x60 from a'
+					// leaving 0x41 <= a' < 0x47
+
+#elif defined(__arm__)
+
+ // significantly less tricksy
+ cmp r0, #10
+ addlo r0, r0, #'0'
+ addhs r0, r0, #'A' - 10
+
+#elif defined(__aarch64__)
+
+ // with less versatile conditional execution this is the best we can
+ // do
+ cmp w0, #10
+ add w16, w0, #'A' - 10
+ add w0, w0, #'0'
+ cmov.hs w0, w16
+
+#else
+ notimpl
+#endif
+
+ ret
endproc
-proc x32
+proc x1f
- ud2
+ // verify collatz conjecture starting at a; assume a /= 0!
+
+#if defined(__x86_64__)
+
+0: bsf rcx, rax // clobber c if a = 0
+ shr rax, cl // a = 2^c a'
+ cmp rdx, 0
+ je 1f
+ stosq
+ dec rdx
+1:
+ cmp rax, 1 // done?
+ je 9f
+ lea rax, [2*rax + rax + 1] // a' = 3 a' + 1
+ jmp 0b // again
+
+9: ret
+
+#elif defined(__i386__)
+
+0: bsf ecx, eax // clobber c if a = 0
+ shr eax, cl // a = 2^c a'
+ cmp edx, 0
+ je 1f
+ stosd
+ dec edx
+1:
+ cmp eax, 1 // done?
+ je 9f
+ lea eax, [2*eax + eax + 1] // a' = 3 a' + 1
+ jmp 0b // again
+
+9: ret
+
+#elif defined(__arm__)
+
+ // rbit introduced in armv7
+0: rbit r2, r0
+ clz r2, r2
+ mov r0, r0, lsr r2 // a = 2^c a'
+ cmp r3, #0
+ strne r0, [r5], #4
+ subne r3, r3, #1
+ cmp r0, #1
+ adcne r0, r0, r0, lsl #1 // a' = 3 a' + 1 (because c set)
+ bne 0b
-endproc
+ ret
-proc x33
+#elif defined(__aarch64__)
+
+0: rbit w2, w0
+ clz w2, w2
+ lsr w0, w0, w2 // a = 2^c a'
+ cmp x3, #0
+ beq 1f
+ str x0, [x5], #8
+ sub x3, x3, #1
+1:
+ cmp w0, #1
+	add	w16, w0, w0, lsl #1	// t = 3 a'
+	csinc.eq w0, w0, w16		// a' = 3 a' + 1 unless a' = 1
+ b.ne 0b
- ud2
+ ret
+
+#else
+ notimpl
+#endif
endproc
-proc x34
+///--------------------------------------------------------------------------
+/// 0x20--0x2f
- ud2
+proc x20
+
+ // calculate 1337 a slowly
+
+#if defined(__x86_64__)
+
+ // original version
+ mov rcx, rax // c = a
+ shl rcx, 2 // c = 4 a
+ add rcx, rax // c = 5 a
+ shl rcx, 3 // c = 40 a
+ add rcx, rax // c = 41 a
+ shl rcx, 1 // c = 82 a
+ add rcx, rax // c = 83 a
+ shl rcx, 1 // c = 166 a
+ add rcx, rax // c = 167 a
+ shl rcx, 3 // c = 1336 a
+ add rcx, rax // c = 1337 a
+
+ // a quick way
+ lea rdx, [2*rax + rax] // t = 3 a
+ shl rdx, 6 // t = 192 a
+ sub rdx, rax // t = 191 a
+ lea rbx, [8*rdx] // b = 1528 a
+ sub rbx, rdx // b = 1337 a
+
+#elif defined(__i386__)
+
+ // original version
+ mov ecx, eax // c = a
+ shl ecx, 2 // c = 4 a
+ add ecx, eax // c = 5 a
+ shl ecx, 3 // c = 40 a
+ add ecx, eax // c = 41 a
+ shl ecx, 1 // c = 82 a
+ add ecx, eax // c = 83 a
+ shl ecx, 1 // c = 166 a
+ add ecx, eax // c = 167 a
+ shl ecx, 3 // c = 1336 a
+ add ecx, eax // c = 1337 a
+
+ // a quick way
+ lea edx, [2*eax + eax] // t = 3 a
+ shl edx, 6 // t = 192 a
+ sub edx, eax // t = 191 a
+ lea ebx, [8*edx] // b = 1528 a
+ sub ebx, edx // b = 1337 a
+
+#elif defined(__arm__)
+
+ // original version, ish
+ add r2, r0, r0, lsl #2 // c = 5 a
+ add r2, r0, r2, lsl #3 // c = 41 a
+ add r2, r0, r2, lsl #1 // c = 83 a
+ add r2, r0, r2, lsl #1 // c = 167 a
+ add r2, r0, r2, lsl #3 // c = 1337 a
+
+ // quicker way
+ add r1, r0, r0, lsl #1 // b = 3 a
+ rsb r1, r0, r1, lsl #6 // b = 191 a
+ rsb r1, r1, r1, lsl #3 // b = 1337 a
+
+#elif defined(__aarch64__)
+
+ // original version, ish
+ add x2, x0, x0, lsl #2 // c = 5 a
+ add x2, x0, x2, lsl #3 // c = 41 a
+ add x2, x0, x2, lsl #1 // c = 83 a
+ add x2, x0, x2, lsl #1 // c = 167 a
+ add x2, x0, x2, lsl #3 // c = 1337 a
+
+ // sleazy because no rsb
+ add x1, x0, x0, lsl #1 // b = 3 a
+ sub x1, x0, x1, lsl #6 // b = -191 a
+ sub x1, x1, x1, lsl #3 // b = 1337 a
+
+#else
+ notimpl
+#endif
+
+ ret
endproc
-proc x35
+proc x21
- ud2
+ // multiply complex numbers a + b i and c + d i
+ //
+ // (a + b i) (c + d i) = (a c - b d) + (a d + b c) i
+ //
+ // somewhat slick approach uses only three multiplications
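+	//
+	// to check: c (a + b) - b (c + d) = a c + b c - b c - b d
+	// = a c - b d, and a (d - c) + c (a + b) = a d - a c + a c + b c
+	// = a d + b c.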
+
+#if defined(__x86_64__)
+
+ mov rsi, rax // t = a
+ add rax, rbx // a' = a + b
+ mov rdi, rdx // u = d
+ sub rdx, rcx // d' = d - c
+ add rdi, rcx // u = c + d
+
+ imul rax, rcx // a' = c (a + b)
+ imul rsi, rdx // t = a (d - c)
+ imul rdi, rbx // u = b (c + d)
+
+ add rsi, rax // t = a (d - c) + c (a + b)
+ mov rbx, rsi // b' = a (d - c) + c (a + b)
+ // = a d + b c
+ sub rax, rdi // a' = c (a + b) - b (c + d)
+ // = a c - b d
+
+#elif defined(__i386__)
+
+ mov esi, eax // t = a
+ add eax, ebx // a' = a + b
+ mov edi, edx // u = d
+ sub edx, ecx // d' = d - c
+ add edi, ecx // u = c + d
+
+ imul eax, ecx // a' = c (a + b)
+ imul esi, edx // t = a (d - c)
+ imul edi, ebx // u = b (c + d)
+
+ add esi, eax // t = a (d - c) + c (a + b)
+ mov ebx, esi // b' = a (d - c) + c (a + b)
+ // = a d + b c
+ sub eax, edi // a' = c (a + b) - b (c + d)
+ // = a c - b d
+
+#elif defined(__arm__)
+
+ add r4, r0, r1 // t = a + b
+ add r5, r2, r3 // u = c + d
+ sub r3, r3, r2 // d' = d - c
+
+ // mls introduced in armv7
+ mul r4, r4, r2 // t = c (a + b)
+ mov r2, r1 // c' = a (bah!)
+ mla r1, r0, r3, r4 // b' = a (d - c) + c (a + b)
+ // = a d + b c
+ mls r0, r2, r5, r4 // a' = c (a + b) - b (c + d)
+ // = a c - b d
+
+#elif defined(__aarch64__)
+
+ add x4, x0, x1 // t = a + b
+ add x5, x2, x3 // u = c + d
+ sub x3, x3, x2 // d' = d - c
+
+	// mls introduced in armv7
+ mul x4, x4, x2 // t = c (a + b)
+ mov x2, x1 // c' = a (bah!)
+ madd x1, x0, x3, x4 // b' = a (d - c) + c (a + b)
+ // = a d + b c
+ msub x0, x2, x5, x4 // a' = c (a + b) - b (c + d)
+ // = a c - b d
+
+#else
+ notimpl
+#endif
+
+ ret
endproc
-proc x36
+proc x22
- ud2
+ // divide by 3
-endproc
+#if defined(__x86_64__)
-proc x37
+ mov rdx, 0xaaaaaaaaaaaaaaab // = ceil(2/3 2^64)
+ mul rdx // d' || a' =~ 2/3 a 2^64
+ shr rdx, 1 // d' = floor(a/3)
+ mov rax, rdx // a' = floor(a/3)
- ud2
+ // we start with 0 <= a < 2^64. write f = ceil(2/3 2^64), so that
+ // 2/3 < f/2^64 < 2/3 + 1/2^64. then floor(2/3 a) <= floor(a f/2^64)
+ // <= floor(2/3 a + a/2^64), but a < 2^64 so a/2^64 < 1 and
+ // floor(a f/2^64) = floor(2/3 a).
-endproc
+#elif defined(__i386__)
-proc x38
+ mov edx, 0xaaaaaaab // = ceil(2/3 2^32)
+ mul edx // d' || a' =~ 2/3 a 2^32
+ shr edx, 1 // d' = floor(a/3)
+ mov eax, edx // a' = floor(a/3)
- ud2
+#elif defined(__arm__)
-endproc
+ ldr r12, =0xaaaaaaab
+ umull r12, r0, r0, r12
+ mov r0, r0, lsr #1
-proc x39
+#elif defined(__aarch64__)
- ud2
+ ldr x16, =0xaaaaaaaaaaaaaaab
+ umulh x0, x0, x16
+ lsr x0, x0, #1
+
+#else
+ notimpl
+#endif
+
+ ret
endproc
-proc x3a
+proc x23
- ud2
+#if defined(__x86_64__)
-endproc
+ // main loop: shorten a preserving residue class mod 3
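+	// (4 == 1 (mod 3), so writing a = 4 q + r gives a == q + r (mod 3),
+	// and q + r is roughly a quarter of a.)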
+0: cmp rax, 5
+ jbe 8f
+ // a > 5
+ mov rdx, rax // d' = a
+ shr rdx, 2 // d' = floor(a/4)
+ and rax, 3 // a = 4 d' + a' (0 <= a' < 4)
+ add rax, rdx // a' == a (mod 3) but a' < a/4 + 4
+ jmp 0b
-proc x3b
+ // fix up final value 0 <= a < 6: want 0 <= a < 3
+ //
+ // the tricky part is actually a = 3; but the other final cases take
+ // additional iterations which we can avoid.
+8: cmp rax, 3 // set cf iff a < 3
+ cmc // set cf iff a >= 3
+ sbb rdx, rdx // d' = a >= 3 ? -1 : 0
+ and rdx, 3 // d' = a >= 3 ? 3 : 0
+ sub rax, rdx // a' = a - (a >= 3 ? 3 : 0)
+ // = a (mod 3)
+
+#elif defined(__i386__)
+
+ // main loop: shorten a preserving residue class mod 3
+0: cmp eax, 5
+ jbe 8f
+ // a > 5
+ mov edx, eax // d' = a
+ shr edx, 2 // d' = floor(a/4)
+ and eax, 3 // a = 4 d' + a' (0 <= a' < 4)
+ add eax, edx // a' == a (mod 3) but a' < a/4 + 4
+ jmp 0b
+
+ // fix up final value 0 <= a < 6: want 0 <= a < 3
+ //
+ // the tricky part is actually a = 3; but the other final cases take
+ // additional iterations which we can avoid.
+8: cmp eax, 3 // set cf iff a < 3
+ cmc // set cf iff a >= 3
+ sbb edx, edx // d' = a >= 3 ? -1 : 0
+ and edx, 3 // d' = a >= 3 ? 3 : 0
+ sub eax, edx // a' = a - (a >= 3 ? 3 : 0)
+ // = a (mod 3)
- ud2
+#elif defined(__arm__)
-endproc
+0: cmp r0, #6
+ andhs r12, r0, #3
+ addhs r0, r12, r0, lsr #2
+ bhs 0b
-proc x3c
+ cmp r0, #3
+ subhs r0, r0, #3
- ud2
+#elif defined(__aarch64__)
-endproc
+0: cmp x0, #6
+ // blunder on through regardless since this doesn't affect the result
+ and x16, x0, #3
+ add x0, x16, x0, lsr #2
+ b.hs 0b
-proc x3d
+ subs x16, x0, #3
+ cmov.hs x0, x16
- ud2
+#else
+ notimpl
+#endif
+
+ ret
endproc
-proc x3e
+proc x24
- ud2
+ // invert (odd) a mod 2^64
+ //
+ // suppose a a_i == 1 (mod 2^{2^i})
+ //
+ // clearly good for i = 0, since 2^i = 1 and 2^{2^i} = 2, and a_0 =
+ // a == 1 (mod 2) by assumption
+ //
+ // write a a_i == b_i 2^{2^i} + 1 (mod 2^{2^{i+1}})
+ // then b_i == (a a_i - 1)/2^{2^i} (mod 2^{2^i})
+ // to lift inverse, we want x such that a x == -b_i (mod 2^{2^i});
+ // clearly x = -a_i b_i will do, since a a_i == 1 (mod 2^{2^i})
+ // then:
+ // a_{i+1} = a_i - a_i b_i 2^{2^i} = a_i (1 - (a a_i - 1))
+ // = 2 a_i - a a_i^2
+ //
+ // check:
+ // a a_{i+1} = 2 a a_i - a^2 a_i^2
+ // == 2 a a_i - (b_i 2^{2^i} + 1)^2
+ // == 2 (b_i 2^{2^i} + 1) -
+ // (b_i^2 2^{2^{i+1}} + 2 b_i 2^{2^i} + 1)
+ // == 1 (mod 2^{2^{i+1}})
+
+#if defined(__x86_64__)
+
+ // rax // a_0 = a
+ mov rbx, rax // b' = a
+ mov rsi, rax // t = a_0
+
+0:
+ cmp rbp, 0
+ je 1f
+ stosq
+ dec rbp
+1:
+ mul rbx // a' = a a_i
+ mov rcx, rax // c = a a_i
+
+ sub rax, 2 // a' = a a_i - 2
+ neg rax // a' = 2 - a a_i
+ mul rsi // a_{i+1} = a_i (2 - a a_i)
+ // = 2 a_i - a a_i^2
+ mov rsi, rax // t = a_{i+1}
+
+ cmp rcx, 1 // done?
+ ja 0b // no -- iterate
+
+#elif defined(__i386__)
+
+ // eax // a_0 = a
+ mov ebx, eax // b' = a
+ mov esi, eax // t = a_0
+
+0:
+ cmp ebp, 0
+ je 1f
+ stosd
+ dec ebp
+1:
+ mul ebx // a' = a a_i
+ mov ecx, eax // c = a a_i
+
+ sub eax, 2 // a' = a a_i - 2
+ jb 9f // done if < 2
+ neg eax // a' = 2 - a a_i
+ mul esi // a_{i+1} = a_i (2 - a a_i)
+ // = 2 a_i - a a_i^2
+ mov esi, eax // t = a_{i+1}
+
+ jmp 0b // and iterate
+9: mov eax, esi // restore
+
+#elif defined(__arm__)
+
+ // r0 // a_0 = a
+ mov r1, r0 // b' = a
+
+0:
+ cmp r6, #0
+ strne r0, [r5], #4
+ subne r6, r6, #1
+ mul r2, r0, r1 // c = a a_i
+ rsbs r2, r2, #2 // c = 2 - a a_i
+ mul r0, r0, r2 // a_{i+1} = a_i (2 - a a_i)
+ // = 2 a_i - a a_i^2
+ blo 0b
+
+#elif defined(__aarch64__)
+
+ // x0 // a_0 = a
+ mov x1, x0 // b' = a
+ mov x16, #2 // because we have no rsb
+
+0:
+ cmp x6, #0
+ b.eq 1f
+ str x0, [x5], #8
+ sub x6, x6, #1
+1:
+ mul x2, x0, x1 // c = a a_i
+ subs x2, x16, x2 // c = 2 - a a_i
+ mul x0, x0, x2 // a_{i+1} = a_i (2 - a a_i)
+ // = 2 a_i - a a_i^2
+ b.lo 0b
+
+#else
+ notimpl
+#endif
+
+ ret
endproc
-proc x3f
+proc x25
- ud2
+ // a poor approximation to pi/4
+ //
+ // think of x and y as being in 16.16 fixed-point format. we sample
+ // points in the unit square, and determine how many of them are
+ // within a unit quarter-circle centred at the origin. the area of
+ // the quarter-circle is pi/4.
-endproc
+#if defined(__x86_64__)
+
+ xor eax, eax // a = 0
+ mov rcx, 1
+ shl rcx, 0x20 // c =~ 4 billion
+
+0: movzx rbx, cx // x = low 16 bits of c
+ imul rbx, rbx // b = x^2
+
+ ror rcx, 0x10 // switch halves of c
+ movzx rdx, cx // y = high 16 bits of c
+ imul rdx, rdx // d = y^2
+ rol rcx, 0x10 // switch back
+
+ add rbx, rdx // r^2 = x^2 + y^2
+ shr rbx, 0x20 // r^2 >= 1?
+ cmp rbx, 1 // set cf iff r^2 >= 1
+ adc rax, 0 // and add onto accumulator
+ loop 0b
+
+#elif defined(__i386__)
+
+ // this is actually better done in 32 bits. the carry has the wrong
+ // sense here, so instead deduct one for each point outside the
+ // quarter-circle rather than adding one for each point inside it.
+ xor eax, eax
+ xor ecx, ecx
+
+0: movzx ebx, cx
+ imul ebx, ebx
+
+ mov edx, ecx
+ shr edx, 0x10
+ imul edx, edx
+
+ add ebx, edx // see?
+ sbb eax, 0
+ loop 0b
+
+#elif defined(__arm__)
+
+ mov r0, #0
+ mov r2, #0
+
+0: uxth r1, r2, ror #0
+ uxth r3, r2, ror #16
+ mul r1, r1, r1
+ mul r3, r3, r3
+ cmn r1, r3 // mlas doesn't set cf usefully
+ addcc r0, r0, #1
+ adds r2, r2, #1
+ bne 0b
+
+#elif defined(__aarch64__)
+
+ mov w0, #0
+ mov w2, #0
+
+0: ubfx w1, w2, #0, #16
+ ubfx w3, w2, #16, #16
+ sub w2, w2, #1
+ mul w1, w1, w1
+ mul w3, w3, w3
+ cmn w1, w3
+ cinc.cc w0, w0
+ cbnz w2, 0b
+
+#else
+ notimpl
+#endif
+
+ ret
+
+endproc
+
+proc x26
+
+ // a bad way to rotate a right by 7 places
+
+#if defined(__x86_64__)
+
+ mov rbx, rax
+ ror rbx, 7 // better
+
+ mov rdx, rax // d' = a
+ shr rax, 7 // a' = a >> 7
+ shl rdx, 0x39 // d' = a << 57
+ or rax, rdx // a' = a >>> 7
+
+#elif defined(__i386__)
+
+ mov ebx, eax
+ ror ebx, 7 // better
+
+ mov edx, eax // d' = a
+ shr eax, 7 // a' = a >> 7
+	shl	edx, 0x19		// d' = a << 25
+ or eax, edx // a' = a >>> 7
+
+#elif defined(__arm__)
+
+ mov r1, r0, ror #7 // easy way
+
+ // even the hard way is fairly easy on arm
+ mov r3, r0, lsl #25
+ orr r0, r3, r0, lsr #7 // hard way
+
+#elif defined(__aarch64__)
+
+ ror x1, x0, #7 // easy way
+
+ // even the hard way is fairly easy on arm
+ lsl x3, x0, #57
+ orr x0, x3, x0, lsr #7 // hard way
+
+#else
+ notimpl
+#endif
+
+ ret
+
+endproc
+
+proc x27
+
+ // shift a right by c places, in two halves
+
+#if defined(__x86_64__)
+
+ mov ch, cl // c' = [c, c]
+ inc ch // c' = [c, c + 1]
+ shr ch, 1
+ shr cl, 1 // c' = [floor(c/2), ceil(c/2)]
+ shr rax, cl
+ xchg ch, cl
+ shr rax, cl
+
+#elif defined(__i386__)
+
+ mov ch, cl // c' = [c, c]
+ inc ch // c' = [c, c + 1]
+ shr ch, 1
+ shr cl, 1 // c' = [floor(c/2), ceil(c/2)]
+ shr eax, cl
+ xchg ch, cl
+ shr eax, cl
+
+#elif defined(__arm__)
+
+ // it would be clearer and more efficient to say: `mov r12, r2, lsr
+ // #1; sub r2, r2, r12', but that's not the lesson this exercise is
+ // trying to teach.
+ add r12, r2, #1
+ mov r2, r2, lsr #1
+ mov r12, r12, lsr #1
+ mov r0, r0, lsr r2
+ mov r0, r0, lsr r12
+
+#elif defined(__aarch64__)
+
+ add w16, w2, #1
+ lsr w2, w2, #1
+ lsr w16, w16, #1
+ lsr x0, x0, x2
+ lsr x0, x0, x16
+
+#else
+ notimpl
+#endif
+
+ ret
+
+endproc
+
+proc x28
+
+ // divide c-byte little-endian bignum at rsi by 2 (rounding down)
+
+#if defined(__x86_64__)
+
+ clc
+0: rcr byte ptr [rsi], 1
+ inc rsi
+ loop 0b
+
+#elif defined(__i386__)
+
+ clc
+0: rcr byte ptr [esi], 1
+ inc esi
+ loop 0b
+
+#elif defined(__arm__)
+
+ // we could hack this a word at a time using rrx
+ mov r3, #0
+0: ldrb r12, [r4]
+ subs r2, r2, #1
+ orr r3, r3, r12, lsr #1
+ strb r3, [r4], #1
+ mov r3, r12, lsl #7
+ bne 0b
+
+#elif defined(__aarch64__)
+
+ mov w16, #0
+0: ldrb w17, [x4]
+ sub x2, x2, #1
+ orr w16, w16, w17, lsr #1
+ strb w16, [x4], #1
+ lsl w16, w17, #7
+ cbnz x2, 0b
+
+#else
+ notimpl
+#endif
+
+ ret
+
+endproc
+
+proc x29
+
+ // fill a buffer with a 3-byte pattern
+
+#if defined(__x86_64__)
+
+ lea rdi, [rsi + 3]
+ rep movsb
+
+#elif defined(__i386__)
+
+ lea edi, [esi + 3]
+ rep movsb
+
+#elif defined(__arm__)
+
+ add r5, r4, #3
+0: subs r2, r2, #1
+ ldrhsb r12, [r4], #1
+ strhsb r12, [r5], #1
+ bhs 0b
+
+#elif defined(__aarch64__)
+
+ cbz x2, 9f
+ add x5, x4, #3
+0: sub x2, x2, #1
+ ldrb w16, [x4], #1
+ strb w16, [x5], #1
+ cbnz x2, 0b
+9:
+
+#else
+ notimpl
+#endif
+
+ ret
+
+endproc
+
+proc x2a
+
+ // rotate the words in a buffer, so that the last word comes first,
+ // the first comes second, and so on. this isn't a good way to do
+ // it.
+
+#if defined(__x86_64__)
+
+ mov rsi, rbx // set string pointers
+ mov rdi, rbx
+0: lodsq // fetch next word
+ xchg rax, qword ptr [rbx] // stash it for next iteration and
+ // replace it with the previously
+ // stashed word
+ stosq // store in output
+ // (note that the first iteration doesn't actually do anything)
+ loop 0b // continue until all done
+
+#elif defined(__i386__)
+
+ mov esi, ebx // set string pointers
+ mov edi, ebx
+0: lodsd // fetch next word
+ xchg eax, dword ptr [ebx] // stash it for next iteration and
+ // replace it with the previously
+ // stashed word
+ stosd // store in output
+ loop 0b // continue until all done
+
+#elif defined(__arm__)
+
+ // let's do this a sensible way. (we could go faster using ldm/stm.)
+ add r0, r1, r2, lsl #2 // find the end of the buffer
+ ldr r0, [r0, #-4] // collect final element
+0: subs r2, r2, #1
+ ldr r12, [r1]
+ str r0, [r1], #4
+ mov r0, r12
+ bne 0b
+
+#elif defined(__aarch64__)
+
+ add x0, x1, x2, lsl #3 // find the end of the buffer
+ ldr x0, [x0, #-8] // collect final element
+0: sub x2, x2, #1
+ ldr x16, [x1]
+ str x0, [x1], #8
+ mov x0, x16
+ cbnz x2, 0b
+
+#else
+ notimpl
+#endif
+
+ ret
+
+endproc
+
+proc x2b
+
+ // find a cycle in a function f: B -> B, where B = {0, 1, ..., 255}
+
+#if defined(__x86_64__)
+
+ // this is floyd's cycle-finding algorithm.
+ //
+ // consider the sequence s_0 = 0, s_1 = f(0), s_2 = f(f(0)), ...,
+ // s_{i+1} = f(s_i). since B is finite, there must be some smallest
+ // t and c such that s(t) = s(t + c); then we have s_i = s_j iff
+ // i >= t, j >= t, and i == j (mod c).
+ //
+ // the algorithm sets two cursors advancing through the sequence: a
+ // /tortoise/ which advances one step at a time, and a /hare/ which
+ // advances by two, so when the tortoise is at element s_i, the hare
+ // is at s_{2i}. the hare will run around the cycle and catch the
+ // tortoise when i >= t and i == 2 i (mod c); the latter is simply i
+ // == 0 (mod c), which therefore happens first when i = k = t +
+ // (-t mod c).
+ //
+ // i'm not sure what good xlatb does here that mov al, [rbx + al]
+ // doesn't.
+
+ xor eax, eax // tortoise starts at 0
+ xor edx, edx // hare starts at 0
+0: xlatb // advance tortoise
+ xchg rax, rdx // switch to hare
+ xlatb // advance hare ...
+ xlatb // ... twice
+ xchg rax, rdx // switch back
+ cmp al, dl // hare caught the tortoise?
+ jnz 0b // no -- go around again
+
+ // now we trace the initial tail: reset the tortoise to s_0, and slow
+ // the hare down so that both take only a single step in each
+ // iteration. this loop terminates when i >= t and i == i + 2 k
+ // (mod c). we know k is a multiple of c, so the latter condition
+ // always holds, so this finds the first step of the cycle.
+
+ xor eax, eax // reset the tortoise
+0: xlatb // advance tortoise
+ xchg rax, rdx // switch to hare
+ xlatb // advance hare
+ xchg rax, rdx // and switch back
+ cmp al, dl // done?
+ jnz 0b // no -- iterate
+
+#elif defined(__i386__)
+
+ xor eax, eax // tortoise starts at 0
+ xor edx, edx // hare starts at 0
+0: xlatb // advance tortoise
+ xchg eax, edx // switch to hare
+ xlatb // advance hare ...
+ xlatb // ... twice
+ xchg eax, edx // switch back
+ cmp al, dl // hare caught the tortoise?
+ jnz 0b // no -- go around again
+
+ xor eax, eax // reset the tortoise
+0: xlatb // advance tortoise
+ xchg eax, edx // switch to hare
+ xlatb // advance hare
+ xchg eax, edx // and switch back
+ cmp al, dl // done?
+ jnz 0b // no -- iterate
+
+#elif defined(__arm__)
+
+ mov r0, #0
+ mov r3, #0
+0: ldrb r0, [r1, r0]
+ ldrb r3, [r1, r3]
+ ldrb r3, [r1, r3]
+ cmp r0, r3
+ bne 0b
+
+ mov r0, #0
+0: ldrb r0, [r1, r0]
+ ldrb r3, [r1, r3]
+ cmp r0, r3
+ bne 0b
+
+#elif defined(__aarch64__)
+
+ mov w0, #0
+ mov w3, #0
+0: ldrb w0, [x1, x0]
+ ldrb w3, [x1, x3]
+ ldrb w3, [x1, x3]
+ cmp w0, w3
+ b.ne 0b
+
+ mov w0, #0
+0: ldrb w0, [x1, x0]
+ ldrb w3, [x1, x3]
+ cmp w0, w3
+ b.ne 0b
+
+#else
+ notimpl
+#endif
+
+ ret
+
+endproc
+
+proc x2c
+
+ // a convoluted way to set rax = rsi
+
+#if defined(__x86_64__)
+
+ mov qword ptr [rbx + 8*rcx], 0 // b[c] = 0
+ mov qword ptr [rbx + 8*rdx], 1 // b[d] = 1
+ mov rax, [rbx + 8*rcx] // a' = b[c] = 0
+
+ mov [rbx], rsi // b[0] = t
+ mov [rbx + 8], rdi // b[1] = u
+ mov rax, [rbx + 8*rax] // a' = b[a'] = b[0] = t
+
+#elif defined(__i386__)
+
+ mov dword ptr [ebx + 8*ecx], 0 // b[c] = 0
+ mov dword ptr [ebx + 8*edx], 1 // b[d] = 1
+ mov eax, [ebx + 8*ecx] // a' = b[c] = 0
+
+ mov [ebx], esi // b[0] = t
+ mov [ebx + 8], edi // b[1] = u
+ mov eax, [ebx + 8*eax] // a' = b[a'] = b[0] = t
+
+#elif defined(__arm__)
+
+ mov r0, #0
+ mov r12, #1
+
+ str r0, [r1, r2, lsl #2]
+ str r12, [r1, r3, lsl #2]
+ ldr r0, [r1, r2, lsl #2]
+
+ str r4, [r1]
+ str r5, [r1, #4]
+ ldr r0, [r1, r0, lsl #2]
+
+#elif defined(__aarch64__)
+
+ mov x16, #1
+
+ str xzr, [x1, x2, lsl #3]
+ str x16, [x1, x3, lsl #3]
+ ldr x0, [x1, x2, lsl #3]
+
+ str x4, [x1]
+ str x5, [x1, #8]
+ ldr x0, [x1, x0, lsl #3]
+
+#else
+ notimpl
+#endif
+
+ ret
+
+endproc
+
+proc x2d
+
+ // clear the least significant set bit in a, by calculating a' =
+ // a AND (a - 1).
+ //
+ // if a = 0 then a' = 0. otherwise, a - 1 differs from a exactly in
+ // the least significant /set/ bit of a, and all bits of lesser
+ // significance. to put it another way: write a = u 2^{k+1} + 2^k;
+ // then a - 1 = u 2^{k+1} + 2^{k-1} + ... + 2 + 1. taking the
+ // bitwise AND of these leaves only the bits common to both, i.e.,
+ // u 2^{k+1}.
+
+#if defined(__x86_64__)
+
+ mov rdx, rax // d' = a
+ dec rax // a' = a - 1
+ and rax, rdx // a' = a AND (a - 1)
+
+#elif defined(__i386__)
+
+ mov edx, eax // d' = a
+ dec eax // a' = a - 1
+ and eax, edx // a' = a AND (a - 1)
+
+#elif defined(__arm__)
+
+ sub r3, r0, #1
+ and r0, r0, r3
+
+#elif defined(__aarch64__)
+
+ sub x3, x0, #1
+ and x0, x0, x3
+
+#else
+ notimpl
+#endif
+
+ ret
+
+endproc
+
+proc x2e
+
+ // compute a mask of one bits in exactly the positions of the
+ // low-order run of zero bits in a
+
+#if defined(__x86_64__)
+
+ mov rdx, rax // d' = a
+ dec rdx // d' = a - 1
+ xor rax, rdx // a = a XOR (a - 1)
+ // set bits are least significant
+ // set bit of a, and all bits of
+ // lesser significance
+ shr rax, 1 // now only bits of lesser
+ // significance; a' = 0 iff a odd
+ cmp rax, rdx // equal if a = 0 or 2^k; otherwise
+ // strictly less
+
+#elif defined(__i386__)
+
+ mov edx, eax
+ dec edx
+ xor eax, edx
+ shr eax, 1
+ cmp eax, edx
+
+#elif defined(__arm__)
+
+ sub r3, r0, #1
+ eor r0, r0, r3
+ mov r0, r0, lsr #1 // probably fold shift into next inst
+ cmp r0, r3
+
+#elif defined(__aarch64__)
+
+ sub x3, x0, #1
+ eor x0, x0, x3
+ mov x0, x0, lsr #1 // probably fold shift into next inst
+ cmp x0, x3
+
+#else
+ notimpl
+#endif
+
+ ret
+
+endproc
+
+proc x2f
+
+ // a slow population count
+
+#if defined(__x86_64__)
+
+ popcnt rbx, rcx // the easy way
+
+ // a fast version in software
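+	// (the usual tree of partial sums: counts in 2-bit fields, then
+	// 4-bit fields, then fold the two 32-bit halves together, gather
+	// nibbles into bytes, and finally sum the bytes; each field stays
+	// wide enough for its count, so nothing overflows.)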
+ mov rax, rcx
+
+ mov rdx, rcx
+ shr rdx, 1
+ mov rsi, 0x5555555555555555
+ and rax, rsi
+ and rdx, rsi
+ add rax, rdx
+
+ mov rdx, rax
+ shr rdx, 2
+ mov rsi, 0x3333333333333333
+ and rax, rsi
+ and rdx, rsi
+ add rax, rdx
+
+ mov rdx, rax
+ shr rdx, 32
+ add rax, rdx
+
+ mov rdx, rax
+ shr rdx, 4
+ and rax, 0x0f0f0f0f
+ and rdx, 0x0f0f0f0f
+ add rax, rdx
+
+ mov rdx, rax
+ shr rdx, 8
+ add rax, rdx
+
+ mov rdx, rax
+ shr rdx, 16
+ add rax, rdx
+ movzx rsi, al
+
+ // the official version
+ xor eax, eax // clear iteration counter
+0: jrcxz 9f // bail if c = 0
+ inc rax // bump iteration count
+ mov rdx, rcx // d' = c
+ dec rdx // d' = c - 1
+ and rcx, rdx // zap least significant set bit of c
+ jmp 0b // and go again
+9:
+
+#elif defined(__i386__)
+
+ popcnt ebx, ecx // the easy way
+
+ mov eax, ecx
+
+ mov edx, ecx
+ shr edx, 1
+ and eax, 0x55555555
+ and edx, 0x55555555
+ add eax, edx
+
+ mov edx, eax
+ shr edx, 2
+ and eax, 0x33333333
+ and edx, 0x33333333
+ add eax, edx
+
+ mov edx, eax
+ shr edx, 4
+ add eax, edx
+
+ mov edx, eax
+ shr edx, 8
+ and eax, 0x000f000f
+ and edx, 0x000f000f
+ add eax, edx
+
+ mov edx, eax
+ shr edx, 16
+ add eax, edx
+ movzx esi, al
+
+ xor eax, eax
+0: jecxz 9f
+ inc eax
+ mov edx, ecx
+ dec edx
+ and ecx, edx
+ jmp 0b
+9:
+
+#elif defined(__arm__)
+
+ // the easy-ish way
+ vmov d0[0], r2
+ vcnt.8 d0, d0
+ vmov r1, d0[0]
+ add r1, r1, r1, lsl #8
+ add r1, r1, r1, lsl #16
+ mov r1, r1, lsr #24
+
+ // the hard way
+ movw r12, #0x5555
+ movt r12, #0x5555
+ and r3, r12, r2, lsr #1
+ and r0, r12, r2
+ add r0, r0, r3
+
+ movw r12, #0x3333
+ movt r12, #0x3333
+ and r3, r12, r0, lsr #2
+ and r0, r12, r0
+ add r0, r0, r3
+
+ add r0, r0, r0, lsl #16
+
+ movt r12, 0x0f0f
+ and r3, r12, r0, lsr #4
+ and r0, r12, r0
+ add r0, r0, r3
+
+ add r0, r0, r0, lsl #8
+
+ mov r4, r0, lsr #24
+
+ // and following the exercise
+ mov r0, #0
+ cmp r2, #0
+ beq 9f
+0: add r0, r0, #1
+ sub r3, r2, #1
+ ands r2, r2, r3
+ bne 0b
+9:
+
+#elif defined(__aarch64__)
+
+ // the easy-ish way
+ mov v0.d[0], x2
+ cnt v0.8b, v0.8b
+ mov x1, v0.d[0]
+ add x1, x1, x1, lsl #8
+ add x1, x1, x1, lsl #16
+ add x1, x1, x1, lsl #32
+ lsr x1, x1, #56
+
+ // the hard way -- though arm64's immediate constant encodings and
+ // shifting make this actually rather pleasant.
+ and x3, x2, #0xaaaaaaaaaaaaaaaa
+ and x0, x2, #0x5555555555555555
+ add x0, x0, x3, lsr #1
+
+ and x3, x0, #0xcccccccccccccccc
+ and x0, x0, #0x3333333333333333
+ add x0, x0, x3, lsr #2
+
+ add x0, x0, x0, lsr #4
+
+ and x3, x0, #0x0f000f000f000f00
+ and x0, x0, #0x000f000f000f000f
+ add x0, x3, x0, lsl #8
+
+ add x0, x0, x0, lsl #16
+ add x0, x0, x0, lsl #32
+ lsr x4, x0, #56
+
+ // and the official way
+ mov x0, #0
+ cbz x2, 9f
+0: add x0, x0, #1
+ sub x3, x2, #1
+ and x2, x2, x3
+ cbnz x2, 0b
+9:
+
+#else
+ notimpl
+#endif
+
+ ret
+
+endproc
+
+///--------------------------------------------------------------------------
+/// 0x30--0x3f
+
+proc x30
+
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
+
+ ret
+
+endproc
+
+proc x31
+
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
+
+endproc
+
+proc x32
+
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
+
+endproc
+
+proc x33
+
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
+
+endproc
+
+proc x34
+
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
+
+endproc
+
+proc x35
+
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
+
+endproc
+
+proc x36
+
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
+
+endproc
+
+proc x37
+
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
+
+endproc
+
+proc x38
+
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
+
+endproc
+
+proc x39
+
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
+
+endproc
+
+proc x3a
+
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
+
+endproc
+
+proc x3b
+
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
+
+endproc
+
+proc x3c
+
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
+
+endproc
+
+proc x3d
+
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
+
+endproc
+
+proc x3e
+
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
+
+endproc
+
+proc x3f
+
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
+
+endproc
+
+///----- That's all, folks --------------------------------------------------