X-Git-Url: https://git.distorted.org.uk/~mdw/xchg-rax-rax/blobdiff_plain/06297a937874bfd892c0c85ace29fb26a4a0ddc0..99fe70cb14a9586fcea77bfea14cb931257807a8:/xchg.S diff --git a/xchg.S b/xchg.S index 5374727..b95c665 100644 --- a/xchg.S +++ b/xchg.S @@ -1,8 +1,76 @@ -/// -*- mode: asm; asm-comment-char: ?/ -*- +/// -*- mode: asm; asm-comment-char: 0 -*- + +///-------------------------------------------------------------------------- +/// Preliminaries. + +#include + +#if defined(__i386__) || defined(__x86_64__) .intel_syntax noprefix - .section .note.GNU-stack, "", @progbits +#elif defined(__arm__) + +.macro ret + bx r14 +.endm + + .arch armv7-a + .fpu neon + +#elif defined(__aarch64__) + +.macro cmov rd, rn, cc + csel \rd, \rn, \rd, \cc +.endm +#define _COND(_) \ + _(eq) _(ne) _(cs) _(cc) _(vs) _(vc) _(mi) _(pl) \ + _(ge) _(lt) _(gt) _(le) _(hi) _(ls) _(al) _(nv) \ + _(hs) _(lo) +#define _INST(_) \ + _(ccmp) _(ccmn) \ + _(csel) _(cmov) \ + _(csinc) _(cinc) _(cset) \ + _(csneg) _(cneg) \ + _(csinv) _(cinv) _(csetm) +#define _CONDVAR(cc) _definstvar cc; +#define _INSTVARS(inst) \ + .macro _definstvar cc; \ + .macro inst.\cc args:vararg; inst \args, \cc; .endm; \ + .endm; \ + _COND(_CONDVAR); \ + .purgem _definstvar; + _INST(_INSTVARS) +#undef _COND +#undef _INST +#undef _CONDVAR +#undef _INSTVARS + +#define CCMP_N 8 +#define CCMP_Z 4 +#define CCMP_C 2 +#define CCMP_V 1 + +#define CCMP_MI CCMP_N +#define CCMP_PL 0 +#define CCMP_EQ CCMP_Z +#define CCMP_NE 0 +#define CCMP_CS CCMP_C +#define CCMP_HS CCMP_C +#define CCMP_CC 0 +#define CCMP_LO 0 +#define CCMP_VS CCMP_V +#define CCMP_VC 0 +#define CCMP_HI CCMP_C +#define CCMP_LS 0 +#define CCMP_LT CCMP_N +#define CCMP_GE 0 +#define CCMP_LE CCMP_N +#define CCMP_GT 0 + +#else +# error "not supported" +#endif .macro proc name .globl \name @@ -16,6 +84,36 @@ .endm .macro ch c +#if defined(__i386__) + + pushf + push eax + push ebx + push ecx + push edx + push ebp + mov ebp, esp + and esp, -16 + + push \c + call putchar@plt + + call get_pc_ebx + add ebx, offset _GLOBAL_OFFSET_TABLE + mov eax, [ebx + stdout@GOT] + mov eax, [eax] + call fflush@plt + + mov esp, ebp + pop ebp + pop edx + pop ecx + pop ebx + pop eax + popf + +#elif defined(__x86_64__) + pushf push rax push rcx @@ -44,12 +142,154 @@ pop rcx pop rax popf + +#elif defined(__arm__) + + stmfd r13!, {r0-r4, r12, r14} + + mov r4, r13 + bic r14, r4, #15 + mov r13, r14 + + mov r0, #\c + bl putchar@plt + + ldr r14, .L$_c$gotoff$\@ +.L$_c$gotpc$\@: + add r14, pc, r14 + b .L$_c$cont$\@ +.L$_c$gotoff$\@: + .word _GLOBAL_OFFSET_TABLE - .L$_c$gotpc$\@ - 8 +.L$_c$cont$\@: + bl fflush@plt + + mov r13, r4 + ldmfd r13!, {r0-r4, r12, r14} + +#elif defined(__aarch64__) + + sub sp, sp, #20*8 + stp x0, x1, [sp, #0] + stp x2, x3, [sp, #16] + stp x4, x5, [sp, #32] + stp x6, x7, [sp, #48] + stp x8, x9, [sp, #64] + stp x10, x11, [sp, #80] + stp x12, x13, [sp, #96] + stp x14, x15, [sp, #112] + stp x16, x17, [sp, #128] + mrs x16, nzcv + stp x16, x30, [sp, #144] + + mov w0, #\c + bl putchar + adrp x0, :got:stdout + ldr x0, [x0, #:got_lo12:stdout] + ldr x0, [x0] + bl fflush + + ldp x16, x30, [sp, #144] + msr nzcv, x16 + ldp x16, x17, [sp, #128] + ldp x14, x15, [sp, #112] + ldp x12, x13, [sp, #96] + ldp x10, x11, [sp, #80] + ldp x8, x9, [sp, #64] + ldp x6, x7, [sp, #48] + ldp x4, x5, [sp, #32] + ldp x2, x3, [sp, #16] + ldp x0, x1, [sp, #0] + add sp, sp, #20*8 + +#else +# error "not supported" +#endif .endm +.macro notimpl +#if defined(__i386__) || defined(__x86_64__) + ud2 +#elif defined(__arm__) + udf +#elif 
defined(__aarch64__) + hlt #0 +#else +# error "not supported" +#endif +.endm + + .section .note.GNU-stack, "", %progbits + .text +#if defined(__i386__) +get_pc_ebx: + mov ebx, [esp] + ret +#endif + + proc call_example +#if defined(__i386__) + + push ebx // ebx + push esi // esi, ebx + push edi // edi, esi, ebx + push ebp // flags, ebp, ..., ebx + pushf + + mov edi, [esp + 4*6] + mov esi, [esp + 4*7] + push esi // regs, flags, ebp, ..., ebx + + call get_pc_ebx + lea eax, [ebx + 9f - .] + push eax // cont, regs, flags, ebp, ..., ebx + push edi // func, cont, regs, flags, ebp, ..., ebx + + mov eax, [esi + 28] + pushf + pop ecx + and eax, 0x0cd5 + and ecx, ~0x0cd5 + or eax, ecx + push eax + popf + mov eax, [esi + 0] + mov ebx, [esi + 4] + mov ecx, [esi + 8] + mov edx, [esi + 12] + mov edi, [esi + 20] + mov ebp, [esi + 24] + mov esi, [esi + 16] + + ret // -> func; regs, flags, ebp, ..., ebx + +9: pushf // eflags, regs, flags, ebp, ..., ebx + push esi // esi, eflags, regs, flags, ebp, ..., ebx + mov esi, [esp + 8] + mov [esi + 0], eax + mov [esi + 4], ebx + mov [esi + 8], ecx + mov [esi + 12], edx + mov [esi + 20], edi + mov [esi + 24], ebp + pop eax // rflags, regs, flags, ebp, ..., ebx + mov [esi + 16], eax + pop eax // regs, flags, ebp, ..., ebx + mov [esi + 28], eax + + add esp, 4 // flags, ebp, ..., ebx + popf // ebp, ..., ebx + pop ebp // ..., ebx + pop edi + pop esi + pop ebx // + ret + +#elif defined(__x86_64__) + push rbx // rbx push r10 push r11 @@ -66,7 +306,7 @@ proc call_example push rax // cont, regs, flags, rbp, ..., rbx push rdi // func, cont, regs, flags, rbp, ..., rbx - mov rax, [rsi + 56] + mov rax, [rsi + 8*15] pushf pop rcx and rax, 0x0cd5 @@ -74,29 +314,45 @@ proc call_example or rax, rcx push rax popf - mov rax, [rsi + 0] - mov rbx, [rsi + 8] - mov rcx, [rsi + 16] - mov rdx, [rsi + 24] - mov rdi, [rsi + 40] - mov rbp, [rsi + 48] - mov rsi, [rsi + 32] + mov rax, [rsi + 0] + mov rbx, [rsi + 8] + mov rcx, [rsi + 16] + mov rdx, [rsi + 24] + mov rdi, [rsi + 40] + mov rbp, [rsi + 48] + mov r8, [rsi + 56] + mov r9, [rsi + 64] + mov r10, [rsi + 72] + mov r11, [rsi + 80] + mov r12, [rsi + 88] + mov r13, [rsi + 96] + mov r14, [rsi + 104] + mov r15, [rsi + 112] + mov rsi, [rsi + 32] ret // -> func; regs, flags, rbp, ..., rbx 9: pushf // rflags, regs, flags, rbp, ..., rbx push rsi // rsi, rflags, regs, flags, rbp, ..., rbx mov rsi, [rsp + 16] - mov [rsi + 0], rax - mov [rsi + 8], rbx - mov [rsi + 16], rcx - mov [rsi + 24], rdx - mov [rsi + 40], rdi - mov [rsi + 48], rbp + mov [rsi + 0], rax + mov [rsi + 8], rbx + mov [rsi + 16], rcx + mov [rsi + 24], rdx + mov [rsi + 40], rdi + mov [rsi + 48], rbp + mov [rsi + 56], r8 + mov [rsi + 64], r9 + mov [rsi + 72], r10 + mov [rsi + 80], r11 + mov [rsi + 88], r12 + mov [rsi + 96], r13 + mov [rsi + 104], r14 + mov [rsi + 112], r15 pop rax // rflags, regs, flags, rbp, ..., rbx - mov [rsi + 32], rax + mov [rsi + 32], rax pop rax // regs, flags, rbp, ..., rbx - mov [rsi + 56], rax + mov [rsi + 120], rax add rsp, 8 // flags, rbp, ..., rbx popf // rbp, ..., rbx @@ -110,6 +366,82 @@ proc call_example pop rbx // ret +#elif defined(__arm__) + + stmfd r13!, {r0, r1, r4-r11, r14} + ldmia r1, {r0-r12, r14} + msr cpsr, r14 + mov r14, pc + ldr pc, [r13], #4 + ldr r14, [r13], #4 + stmia r14!, {r0-r12} + mrs r0, cpsr + str r0, [r14] + ldmfd r13!, {r4-r11, pc} + +#elif defined(__aarch64__) + + stp x29, x30, [sp, #-14*8]! 
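+	// frame layout: x29/x30 at sp + 0, the callee-saved x19--x28 at
+	// sp + 16, and the register-block pointer from x1 stashed at
+	// sp + 104; fourteen slots in all, keeping sp 16-byte aligned.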
+ mov x29, sp + stp x19, x20, [sp, #16] + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp x25, x26, [sp, #64] + stp x27, x28, [sp, #80] + str x1, [sp, #104] + + ldp x29, x30, [x1, #224] + msr nzcv, x30 + mov x30, x0 + ldp x27, x28, [x1, #208] + ldp x25, x26, [x1, #192] + ldp x23, x24, [x1, #176] + ldp x21, x22, [x1, #160] + ldp x19, x20, [x1, #144] + ldp x16, x17, [x1, #128] + ldp x14, x15, [x1, #112] + ldp x12, x13, [x1, #96] + ldp x10, x11, [x1, #80] + ldp x8, x9, [x1, #64] + ldp x6, x7, [x1, #48] + ldp x4, x5, [x1, #32] + ldp x2, x3, [x1, #16] + ldp x0, x1, [x1, #0] + + blr x30 + + ldr x30, [sp, #104] + stp x27, x28, [x30, #208] + stp x25, x26, [x30, #192] + stp x23, x24, [x30, #176] + stp x21, x22, [x30, #160] + stp x19, x20, [x30, #144] + stp x16, x17, [x30, #128] + stp x14, x15, [x30, #112] + stp x12, x13, [x30, #96] + stp x10, x11, [x30, #80] + stp x8, x9, [x30, #64] + stp x6, x7, [x30, #48] + stp x4, x5, [x30, #32] + stp x2, x3, [x30, #16] + stp x0, x1, [x30, #0] + mov x0, x30 + mrs x30, nzcv + stp x29, x30, [x0, #224] + + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp x25, x26, [sp, #64] + ldp x27, x28, [sp, #80] + ldp x29, x30, [sp], #14*8 + + ret + +#else +# error "not supported" +#endif + endproc proc nop @@ -119,19 +451,58 @@ proc nop endproc ///-------------------------------------------------------------------------- +/// 0x00--0x0f proc x00 // clear all 64 bits of extended traditional registers - xor eax,eax // clear rax - lea rbx,[0] // rbx -> _|_ + +#if defined(__x86_64__) + + xor eax, eax // clear rax + lea rbx, [0] // rbx -> _|_ loop . // iterate, decrement rcx until zero - mov rdx,0 // set rdx = 0 - and esi,0 // clear all bits of rsi - sub edi,edi // set rdi = edi - edi = 0 + mov rdx, 0 // set rdx = 0 + and esi, 0 // clear all bits of rsi + sub edi, edi // set rdi = edi - edi = 0 push 0 pop rbp // pop 0 into rbp +#elif defined(__i386__) + + xor eax, eax + lea ebx, [0] + loop . 
+ mov edx, 0 + and esi, 0 + sub edi, edi + push 0 + pop ebp + +#elif defined(__arm__) + + eor r0, r0, r0 + rsb r1, r1, r1 +0: subs r2, r2, #1 + bne 0b + mov r3, #0 + and r4, r4, #0 + sub r5, r5, r5 + +#elif defined(__aarch64__) + + eor w0, w0, w0 + mov w1, wzr +0: sub w2, w2, #1 + cbnz w2, 0b + mov w3, #0 + and w4, w4, wzr + sub w5, w5, w5 + +#else + notimpl +#endif + ret endproc @@ -142,11 +513,43 @@ proc x01 // // on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1} // and f_{i+c}, where f_{i+1} = f_i + f_{i-1} + +#if defined(__x86_64__) + 0: xadd rax, rdx // a, d = a + d, a // = f_{i+1} + f_i, f_{i+1} // = f_{i+2}, f_{i+1} loop 0b // advance i, decrement c, iterate +#elif defined(__i386__) + +0: xadd eax, edx + loop 0b + +#elif defined(__arm__) + +0: subs r2, r2, #2 + add r3, r3, r0 + blo 8f + add r0, r0, r3 + bhi 0b + +8: movne r0, r3 + +#elif defined(__aarch64__) + +0: subs x2, x2, #2 + add x3, x3, x0 + b.lo 8f + add x0, x0, x3 + b.hi 0b + +8: cmov.ne x0, x3 + +#else + notimpl +#endif + ret endproc @@ -155,10 +558,63 @@ proc x02 // boolean canonify a: if a = 0 on entry, leave it zero; otherwise // set a = 1 + +#if defined(__x86_64__) + neg rax // set cf iff a /= 0 sbb rax, rax // a = a - a - cf = -cf neg rax // a = cf +#elif defined(__i386__) + + neg eax + sbb eax, eax + neg eax + +#elif defined(__arm__) + + movs r1, r0 // the easy way + movne r1, #1 // mvnne r1, #1 for mask + + cmp r0, #1 // clear cf iff a == 0 + sbc r2, r0, r0 // c' = a - a - 1 + cf = cf - 1 + add r2, r2, #1 // c' = cf + + sub r3, r0, r0, lsr #1 // d' top bit clear; d' = 0 iff a = 0 + rsb r3, r3, #0 // d' top bit set iff a /= 0 + mov r3, r3, lsr #31 // asr for mask + + rsbs r0, r0, #0 + sbc r0, r0, r0 + rsb r0, r0, #0 + +#elif defined(__aarch64__) + + cmp x0, #0 // trivial + cset.ne x1 // csetm for mask + + cmp xzr, x0 // set cf iff a == 0 + sbc x2, x0, x0 // c' = a - a - 1 + cf = cf - 1 + neg x2, x2 // c' = 1 - cf + + sub x3, x0, x0, lsr #1 // if a < 2^63 then a' = ceil(d/2) < + // 2^63 + // if a >= 2^63, write a = 2^63 + t + // with t < 2^63; d' = 2^63 - 2^62 + + // ceil(t/2) = 2^62 + ceil(t/2), and + // ceil(t/2) < 2^62 + // anyway d' < 2^63 and d' = 0 iff + // a = 0 + neg x3, x3 // d' top bit set iff a /= 0 + lsr x3, x3, #63 // asr for mask + + cmp x0, #1 // set cf iff a /= 0 + adc x0, xzr, xzr // a' = 0 + 0 + cf = cf + +#else + notimpl +#endif + ret endproc @@ -166,11 +622,46 @@ endproc proc x03 // set a = min(a, d) (unsigned); clobber c, d + +#if defined(__x86_64__) + sub rdx, rax // d' = d - a; set cf if a > d sbb rcx, rcx // c = -cf = -[a > d] and rcx, rdx // c = a > d ? d - a : 0 add rax, rcx // a' = a > d ? d : a +#elif defined(__i386__) + + sub edx, eax + sbb ecx, ecx + and ecx, edx + add eax, ecx + +#elif defined(__arm__) + + cmp r0, r3 // the easy way + movlo r1, r0 // only needed for out-of-place + movhs r1, r3 + + subs r3, r3, r0 + sbc r12, r12, r12 + and r12, r12, r3 + add r0, r0, r12 + +#elif defined(__aarch64__) + + cmp x0, x3 // the easy way + csel.lo x1, x0, x3 + + subs x3, x3, x0 // d' = d - a; set cf if d >= a + sbc x16, xzr, xzr // t = -1 + cf = -[a > d] + and x16, x16, x3 // t = a > d ? d - a : 0 + add x0, x0, x16 // a' = a > d ? d : a + +#else + notimpl +#endif + ret endproc @@ -178,8 +669,76 @@ endproc proc x04 // switch case? 
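+	// (the `unrelated playing' below decodes an ascii hex digit from a:
+	// b ends up with its value, 0--15, or -1 if a isn't a hex digit;
+	// then the case of the letter in a's low byte gets flipped.)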
+ +#if defined(__x86_64__) + + // unrelated playing + mov ecx, eax + mov rbx, -1 + mov edx, ecx + sub edx, '0' + cmp edx, 10 + cmovb rbx, rdx + or ecx, 0x20 + mov edx, ecx + sub edx, 'a' + sub ecx, 'a' - 10 + cmp edx, 6 + cmovb rbx, rcx + xor al, 0x20 +#elif defined(__i386__) + + // unrelated playing + mov ecx, eax + mov ebx, -1 + mov edx, ecx + sub edx, '0' + cmp edx, 10 + cmovb ebx, edx + or ecx, 0x20 + mov edx, ecx + sub edx, 'a' + sub ecx, 'a' - 10 + cmp edx, 6 + cmovb ebx, ecx + + xor al, 0x20 + +#elif defined(__arm__) + + // unrelated playing + mvn r1, #0 + sub r12, r0, #'0' + cmp r12, #10 + movlo r1, r12 + orr r12, r0, #0x20 + sub r12, r12, #'a' + cmp r12, #6 + addlo r1, r12, #10 + + eor r0, r0, #0x20 + +#elif defined(__aarch64__) + + // unrelated playing + mov x1, #-1 + sub w16, w0, #'0' + cmp w16, #10 + cmov.lo x1, x16 + orr w16, w0, #0x20 + sub w16, w16, #'a' - 10 + cmp w16, #10 + ccmp.hs w16, #16, #CCMP_HS + cmov.lo x1, x16 + + eor w0, w0, #0x20 + +#else + notimpl +#endif + ret endproc @@ -187,6 +746,9 @@ endproc proc x05 // answer whether 5 <= a 4 a > 9 or a < -2^63 + 5 // le/ng a' <= 4 -2^63 + 5 <= a <= 9 +#elif defined(__i386__) + + sub eax, 5 + cmp eax, 4 + +#elif defined(__arm__) + + // i dimly remember having a slick way to do this way back in the + // day, but i can't figure it out any more. + sub r0, #5 + cmp r0, #4 + +#elif defined(__aarch64__) + + // literal translation is too obvious + cmp x0, #5 + ccmp.hs x0, #9, #CCMP_HS + +#else + notimpl +#endif + ret endproc @@ -219,10 +803,35 @@ proc x06 // leave a unchanged, but set zf if a = 0, cf if a /= 0, clear of, // set sf to msb(a) + +#if defined(__x86_64__) + not rax // a' = -a - 1 inc rax // a' = -a neg rax // a' = a +#elif defined(__i386__) + + not eax + inc eax + neg eax + +#elif defined(__arm__) + + mvn r0, r0 + add r0, r0, #1 + rsbs r0, r0, #0 // cf has opposite sense + +#elif defined(__aarch64__) + + mvn x0, x0 + add x0, x0, #1 + negs x0, x0 // cf has opposite sense + +#else + notimpl +#endif + ret endproc @@ -230,11 +839,39 @@ endproc proc x07 // same as before (?) + +#if defined(__x86_64__) + inc rax // a' = a + 1 neg rax // a' = -a - 1 inc rax // a' = -a neg rax // a' = a +#elif defined(__i386__) + + inc eax + neg eax + inc eax + neg eax + +#elif defined(__arm__) + + add r0, r0, #1 + rsb r0, r0, #0 + add r0, r0, #1 + rsbs r0, r0, #0 + +#elif defined(__aarch64__) + + add x0, x0, #1 + neg x0, x0 + add x0, x0, #1 + negs x0, x0 // cf has opposite sense + +#else + notimpl +#endif + ret endproc @@ -243,10 +880,45 @@ proc x08 // floor((a + d)/2), correctly handling overflow conditions; final cf // is lsb(a + d), probably uninteresting + +#if defined(__x86_64__) + add rax, rdx // cf || a' = a + d rcr rax, 1 // shift 65-bit result right by one // place; lsb moves into carry +#elif defined(__i386__) + + add eax, edx + rcr eax, 1 + +#elif defined(__arm__) + + // like the two-instruction a64 version + sub r1, r3, r0 + add r1, r0, r1, lsr #1 + + // the slick version, similar to the above + adds r0, r0, r3 + mov r0, r0, rrx + +#elif defined(__aarch64__) + + // a64 lacks a32's rrx. literal translation. + adds x1, x0, x3 // cf || a' = a + d + adc x16, xzr, xzr // realize cf in extra register + extr x1, x16, x1, #1 // shift down one place + + // two instruction version: clobbers additional register. (if you + // wanted the answer in any other register, even overwriting d, then + // this is unnecessary.) also depends on d >= a. 
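+	// (to see why this works: when d >= a the difference d - a can't
+	// overflow, and a + floor((d - a)/2) = floor((2 a + d - a)/2) =
+	// floor((a + d)/2).)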
+ sub x16, x3, x0 // compute difference + add x0, x0, x16, lsr #1 // add half of it (rounded down) + +#else + notimpl +#endif + ret endproc @@ -255,10 +927,33 @@ proc x09 // a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3 // (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8). + +#if defined(__x86_64__) + shr rax, 3 // a' = floor(a/8); cf = 1 if a == // 4, 5, 6, 7 (mod 8) adc rax, 0 // a' = floor(a/8) + cf +#elif defined(__i386__) + + shr eax, 3 + adc eax, 0 + +#elif defined(__arm__) + + movs r0, r0, lsr #3 + adc r0, r0, #0 + +#elif defined(__aarch64__) + + tst x0, #4 + orr x0, xzr, x0, lsr #3 + cinc.ne x0, x0 + +#else + notimpl +#endif + ret endproc @@ -266,11 +961,43 @@ endproc proc x0a // increment c-byte little-endian bignum at rdi + +#if defined(__x86_64__) + add byte ptr [rdi], 1 0: inc rdi adc byte ptr [rdi], 0 loop 0b +#elif defined(__i386__) + + add byte ptr [edi], 1 +0: inc edi + adc byte ptr [edi], 0 + loop 0b + +#elif defined(__arm__) + + mov r12, #256 // set initial carry +0: ldrb r0, [r5] + subs r2, r2, #1 + add r12, r0, r12, lsr #8 + strb r12, [r5], #1 + bne 0b + +#elif defined(__aarch64__) + + mov w17, #256 // set initial carry +0: ldrb w16, [x5] + sub x2, x2, #1 + add w17, w16, w17, lsr #8 + strb w17, [x5], #1 + cbnz x2, 0b + +#else + notimpl +#endif + ret endproc @@ -278,11 +1005,36 @@ endproc proc x0b // negate double-precision d:a + +#if defined(__x86_64__) + not rdx // d' = -d - 1 neg rax // a' = -a; // cf = 1 iff a /= 0 sbb rdx, -1 // d' = -d - cf +#elif defined(__i386__) + + not edx + neg eax + sbb edx, -1 + +#elif defined(__arm__) + + // reverse subtract is awesome + rsbs r0, r0, #0 + rsc r3, r3, #0 + +#elif defined(__aarch64__) + + // easy way: everything is better with zero registers. + negs x0, x0 + ngc x3, x3 + +#else + notimpl +#endif + ret endproc @@ -291,6 +1043,8 @@ proc x0c // rotate is distributive over xor. +#if defined(__x86_64__) + // rax // = a_1 || a_0 // rbx // = b_1 || b_0 mov rcx, rax // = a_1 || a_0 @@ -304,6 +1058,48 @@ proc x0c cmp rax, rcx // always equal +#elif defined(__i386__) + + mov ecx, eax // = a_1 || a_0 + + xor ecx, ebx // = (a_1 XOR b_1) || (a_0 XOR b_0) + ror ecx, 0xd // = (a_0 XOR b_0) || (a_1 XOR b_1) + + ror eax, 0xd // = a_0 || a_1 + ror ebx, 0xd // = b_0 || b_1 + xor eax, ebx // = (a_0 XOR b_0) || (a_1 XOR b_1) + + cmp eax, ecx // always equal + +#elif defined(__arm__) + + + // r0 // = a_1 || a_0 + // r1 // = b_1 || b_0 + eor r2, r0, r1 // = (a_1 XOR b_1) || (a_0 XOR b_0) + mov r2, r2, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1) + + mov r1, r1, ror #13 // = b_0 || b_1 + eor r0, r1, r0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1) + + cmp r0, r2 // always equal + +#elif defined(__aarch64__) + + // x0 // = a_1 || a_0 + // x1 // = b_1 || b_0 + eor x2, x0, x1 // = (a_1 XOR b_1) || (a_0 XOR b_0) + ror x2, x2, #13 // = (a_0 XOR b_0) || (a_1 XOR b_1) + + ror x1, x1, #13 // = b_0 || b_1 + eor x0, x1, x0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1) + + cmp x0, x2 // always equal + +#else + notimpl +#endif + ret endproc @@ -312,6 +1108,8 @@ proc x0d // and is distributive over xor. 
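+	// (bit by bit: where a has a one, both a AND (b XOR c) and
+	// (a AND b) XOR (a AND c) just see the corresponding bits of b and
+	// c; where a has a zero, both come out zero.)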
+#if defined(__x86_64__) + mov rdx, rbx // = b xor rbx, rcx // = b XOR c @@ -324,6 +1122,50 @@ proc x0d cmp rax, rbx // always equal +#elif defined(__i386__) + + mov edx, ebx // = b + + xor ebx, ecx // = b XOR c + and ebx, eax // = a AND (b XOR c) + + and edx, eax // = a AND b + and eax, ecx // = a AND c + xor eax, edx // = (a AND b) XOR (a AND c) + // = a AND (b XOR c) + + cmp eax, ebx // always equal + +#elif defined(__arm__) + + and r3, r0, r1 // = a AND b + + eor r1, r1, r2 // = b XOR c + and r1, r1, r0 // = a AND (b XOR c) + + and r0, r0, r2 // = a AND c + eor r0, r0, r3 // = (a AND b) XOR (a AND c) + // = a AND (b XOR c) + + cmp r0, r1 // always equal + +#elif defined(__aarch64__) + + and x3, x0, x1 // = a AND b + + eor x1, x1, x2 // = b XOR c + and x1, x1, x0 // = a AND (b XOR c) + + and x0, x0, x2 // = a AND c + eor x0, x0, x3 // = (a AND b) XOR (a AND c) + // = a AND (b XOR c) + + cmp x0, x1 // always equal + +#else + notimpl +#endif + ret endproc @@ -332,6 +1174,8 @@ proc x0e // de morgan's law +#if defined(__x86_64__) + mov rcx, rax // = a and rcx, rbx // = a AND b @@ -342,7 +1186,46 @@ proc x0e or rax, rbx // = (NOT a) OR (NOT b) // = NOT (a AND b) - cmp rax, rcx + cmp rax, rcx // always equal + +#elif defined(__i386__) + + mov ecx, eax // = a + + and ecx, ebx // = a AND b + not ecx // = NOT (a AND b) + + not eax // = NOT a + not ebx // = NOT b + or eax, ebx // = (NOT a) OR (NOT b) + // = NOT (a AND b) + + cmp eax, ecx // always equal + +#elif defined(__arm__) + + and r2, r0, r1 // = a AND b + mvn r2, r2 // = NOT (a AND b) + + mvn r0, r0 // = NOT a + mvn r1, r1 // = NOT b + orr r0, r0, r1 // = (NOT a) OR (NOT b) + + cmp r0, r2 // always equal + +#elif defined(__aarch64__) + + and x2, x0, x1 // = a AND b + mvn x2, x2 // = NOT (a AND b) + + mvn x0, x0 // = NOT a + orn x0, x0, x1 // = (NOT a) OR (NOT b) + + cmp x0, x2 // always equal + +#else + notimpl +#endif ret @@ -355,20 +1238,51 @@ proc x0f // // not sure why you'd do this. - cld +#if defined(__x86_64__) 0: xor [rsi], al lodsb loop 0b +#elif defined(__i386__) + +0: xor [esi], al + lodsb + loop 0b + +#elif defined(__arm__) + +0: ldrb r12, [r4] + subs r2, r2, #1 + eor r0, r0, r12 + strb r0, [r4], #1 + bne 0b + +#elif defined(__aarch64__) + +0: ldrb w16, [x4] + sub x2, x2, #1 + eor w0, w0, w16 + strb w0, [x4], #1 + cbnz x2, 0b + +#else + notimpl +#endif + ret endproc +///-------------------------------------------------------------------------- +/// 0x10--0x1f + proc x10 // four different ways to swap a pair of registers. +#if defined(__x86_64__) + push rax push rcx pop rax @@ -385,6 +1299,76 @@ proc x10 xchg rax, rcx +#elif defined(__i386__) + + push eax + push ecx + pop eax + pop ecx + + xor eax, ecx + xor ecx, eax + xor eax, ecx + + add eax, ecx + sub ecx, eax + add eax, ecx + neg ecx + + xchg eax, ecx + +#elif defined(__arm__) + + stmfd r13!, {r0, r2} + ldr r0, [r13, #4] + ldr r2, [r13], #8 + + eor r0, r0, r2 + eor r2, r2, r0 + eor r0, r0, r2 + + sub r0, r0, r2 + add r2, r2, r0 + rsb r0, r0, r2 // don't need 3-addr with reverse-sub + + mov r12, r0 + mov r0, r2 + mov r2, r0 + +#elif defined(__aarch64__) + + // anything you can do + stp x0, x2, [sp, #-16]! + ldp x2, x0, [sp], #16 + + eor x0, x0, x2 + eor x2, x2, x0 + eor x0, x0, x2 + + // the add/sub/add thing was daft. you can do it in three if you're + // clever -- and have three-address operations. + sub x0, x0, x2 + add x2, x2, x0 + sub x0, x2, x0 + + // but we lack a fourth. we can't do this in fewer than three + // instructions without hitting memory. 
only `ldp' will modify two + // registers at a time, so we need at least two instructions -- but + // if the first one sets one of our two registers to its final value + // then we lose the other input value with no way to recover it, so + // we must either write a fresh third register, or write something + // other than the final value, and in both cases we need a third + // instruction to fix everything up. we've done the wrong-something- + // other trick twice, so here's the captain-obvious use-a-third- + // register version. + mov x16, x0 + mov x0, x2 + mov x2, x16 + +#else + notimpl +#endif + ret endproc @@ -398,6 +1382,8 @@ proc x11 // in particular, a will be zero (and zf set) if and only if the two // strings are equal. +#if defined(__x86_64__) + 0: mov dl, [rsi] xor dl, [rdi] inc rsi @@ -405,6 +1391,37 @@ proc x11 or al, dl loop 0b +#elif defined(__i386__) + +0: mov dl, [esi] + xor dl, [edi] + inc esi + inc edi + or al, dl + loop 0b + +#elif defined(__arm__) + +0: ldrb r1, [r4], #1 + ldrb r12, [r5], #1 + subs r2, r2, #1 + eor r12, r12, r1 + orr r0, r0, r12 + bne 0b + +#elif defined(__aarch64__) + +0: ldrb w16, [x4], #1 + ldrb w17, [x5], #1 + sub x2, x2, #1 + eor w16, w16, w17 + orr w0, w0, w16 + cbnz x2, 0b + +#else + notimpl +#endif + ret endproc @@ -418,27 +1435,85 @@ proc x12 // move all of the set bits in d to a, unless there's already a bit // there. this clearly doesn't change the sum. +#if defined(__x86_64__) + mov rcx, rdx // c' = d and rdx, rax // d' = a AND d or rax, rcx // a' = a OR d add rax, rdx - ret +#elif defined(__i386__) -endproc + mov ecx, edx // c' = d + and edx, eax // d' = a AND d + or eax, ecx // a' = a OR d + add eax, edx -proc x13 +#elif defined(__arm__) - // ok, so this is a really obtuse way of adding a and b; the result + and r2, r0, r3 // c' = a AND d + orr r0, r0, r3 // a' = a OR d + add r0, r0, r2 + +#elif defined(__aarch64__) + + and x2, x0, x3 // c' = a AND d + orr x0, x0, x3 // a' = a OR d + add x0, x0, x2 + +#else + notimpl +#endif + + ret + +endproc + +proc x13 + + // ok, so this is a really obtuse way of adding a and b; the result // is in a and d. but why does it work? +#if defined(__x86_64__) + mov rcx, 0x40 // carry chains at most 64 long 0: mov rdx, rax // copy a' xor rax, rbx // low bits of each bitwise sum and rbx, rdx // carry bits from each bitwise sum - shl rbx, 001 // carry them into next position + shl rbx, 1 // carry them into next position loop 0b +#elif defined(__i386__) + + mov ecx, 0x40 // carry chains at most 64 long +0: mov edx, eax // copy a' + xor eax, ebx // low bits of each bitwise sum + and ebx, edx // carry bits from each bitwise sum + shl ebx, 1 // carry them into next position + loop 0b + +#elif defined(__arm__) + + mov r2, #0x40 +0: and r3, r0, r1 + subs r2, r2, #1 + eor r0, r0, r1 + lsl r1, r3, #1 + bne 0b + +#elif defined(__aarch64__) + + mov x2, #0x40 +0: and x3, x0, x1 + sub x2, x2, #1 + eor x0, x0, x1 + lsl x1, x3, #1 + cbnz x2, 0b + +#else + notimpl +#endif + ret endproc @@ -447,6 +1522,8 @@ proc x14 // floor((a + d)/2), like x08. 
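+	// the key identity is a + d = (a XOR d) + 2 (a AND d): the xor is
+	// the bitwise sum ignoring carries, and the and marks the places
+	// which carry into the next position.  halve both sides to get
+	// floor((a + d)/2) = floor((a XOR d)/2) + (a AND d), with no
+	// overflow anywhere.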
+#if defined(__x86_64__) + mov rcx, rax // copy a for later and rcx, rdx // carry bits @@ -455,6 +1532,32 @@ proc x14 add rax, rcx // add the carries; done +#elif defined(__i386__) + + mov ecx, eax // copy a for later + and ecx, edx // carry bits + + xor eax, edx // low bits of each bitwise sum + shr eax, 1 // divide by 2; carries now in place + + add eax, ecx // add the carries; done + +#elif defined(__arm__) + + and r2, r0, r3 + eor r0, r0, r3 + add r0, r2, r0, lsr #1 + +#elif defined(__aarch64__) + + and x2, x0, x3 + eor x0, x0, x3 + add x0, x2, x0, lsr #1 + +#else + notimpl +#endif + ret endproc @@ -463,7 +1566,9 @@ proc x15 // sign extension 32 -> 64 bits. - //movsx rbx, eax // like this? +#if defined(__x86_64__) + + movsx rbx, eax // like this? mov rdx, 0xffffffff80000000 add rax, rdx // if bit 31 of a is set then bits @@ -472,15 +1577,46 @@ proc x15 // exactly backwards xor rax, rdx // so fix it +#elif defined(__i386__) + + movsx ebx, ax // like this? + + mov edx, 0xffff8000 + add eax, edx // if bit 31 of a is set then bits + // 31--63 of a' are clear; otherwise, + // these bits are all set -- which is + // exactly backwards + xor eax, edx // so fix it + +#elif defined(__arm__) + + sxth r1, r0 // like this + + mov r12, #0x80000000 + add r0, r0, r12, asr #16 + eor r0, r0, r12, asr #16 + +#elif defined(__aarch64__) + + sxtw x1, w0 // like this + + mov x16, #0xffffffff80000000 + add x0, x0, x16 + eor x0, x0, x16 + +#else + notimpl +#endif + ret endproc proc x16 - shl rax, 56 - shl rbx, 56 - shl rcx, 56 + // ??? i don't know why you'd want to calculate this. + +#if defined(__x86_64__) xor rax, rbx // a' = a XOR b xor rbx, rcx // b' = b XOR c @@ -490,256 +1626,2202 @@ proc x16 xor rax, rbx // a' = cf ? 0 : a XOR c cmp rax, rsi +#elif defined(__i386__) + + xor eax, ebx // a' = a XOR b + xor ebx, ecx // b' = b XOR c + mov esi, eax // t = a XOR b + add esi, ebx // t = (a XOR b) + (b XOR c) + cmovc eax, ebx // a' = cf ? b XOR c : a XOR b + xor eax, ebx // a' = cf ? 0 : a XOR c + cmp eax, esi + +#elif defined(__arm__) + + eor r0, r0, r1 + eor r1, r1, r2 + adds r4, r0, r1 + movcs r0, r1 + eor r0, r0, r1 + cmp r0, r4 + +#elif defined(__aarch64__) + + eor x0, x0, x1 + eor x1, x1, x2 + adds x4, x0, x1 + cmov.cs x0, x1 + eor x0, x0, x1 + cmp x0, x4 + +#else + notimpl +#endif + ret endproc proc x17 - ud2 + // absolute value + +#if defined(__x86_64__) + + cqo // d = a < 0 ? -1 : 0 + xor rax, rdx // a' = a < 0 ? -a - 1 : a + sub rax, rdx // a' = a < 0 ? -a : a + +#elif defined(__i386__) + + cdq // d = a < 0 ? -1 : 0 + xor eax, edx // a' = a < 0 ? -a - 1 : a + sub eax, edx // a' = a < 0 ? -a : a + +#elif defined(__arm__) + + // direct approach + movs r1, r0 + rsbmi r1, r0, #0 + + // faithful-ish conversion + eor r3, r0, r0, asr #31 + sub r0, r3, r0, asr #31 + +#elif defined(__aarch64__) + + // direct approach + tst x0, #1 << 63 + cneg.ne x1, x0 + + // faithful-ish conversion + eor x3, x0, x0, asr #63 + sub x0, x3, x0, asr #63 + +#else + notimpl +#endif + + ret endproc proc x18 - ud2 + // should always set sf, clear zf, unless we get rescheduled to a + // different core. 
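+	// (the timestamp counter only counts upwards, so the second reading
+	// ought to be strictly larger than the first, making the final
+	// comparison come out negative and nonzero.)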
+ +#if defined(__x86_64__) + + rdtsc // d || a = cycles + shl rdx, 0x20 + or rax, rdx // a = cycles + mov rcx, rax // c = cycles + + rdtsc // d || a = cycles' + shl rdx, 0x20 + or rax, rdx // a = cycles' + + cmp rcx, rax + +#elif defined(__i386__) + + rdtsc // d || a = cycles + mov ebx, eax + mov ecx, edx // c || b = cycles + + rdtsc // d || a = cycles' + + sub ebx, eax + sbb ecx, edx + +#elif defined(__arm__) + + // cycle clock not available in user mode + mrrc p15, 0, r0, r1, c9 + mrrc p15, 0, r2, r3, c9 + subs r0, r0, r2 + sbcs r1, r1, r3 + +#elif defined(__aarch64__) + + // cycle clock not available in user mode + mrs x0, pmccntr_el0 + mrs x1, pmccntr_el0 + cmp x0, x1 + +#else + notimpl +#endif + + ret endproc proc x19 - ud2 + // stupid way to capture a pointer to inline data and jump past it. + // confuses the return-address predictor something chronic. worse + // because amd64 calling convention doesn't usually pass arguments on + // the stack. -endproc +#if defined(__x86_64__) -proc x1a + call 8f + .string "hello world!\n\0" +8: call print_str + add rsp, 8 + ret - ud2 +print_str: + // actually implement this ridiculous thing + mov rsi, [rsp + 8] + xor edx, edx +0: mov al, [rsi + rdx] + inc rdx + cmp al, 0 + jnz 0b + mov eax, SYS_write + mov edi, 1 + dec rdx + syscall // clobbers r11 :-( + ret -endproc +#elif defined(__i386__) -proc x1b + call 8f + .string "hello world!\n\0" +8: call print_str + add esp, 4 + ret - ud2 +print_str: + // actually implement this ridiculous thing + mov ecx, [esp + 4] + xor edx, edx +0: mov al, [ecx + edx] + inc edx + cmp al, 0 + jnz 0b + mov eax, SYS_write + mov ebx, 1 + dec edx + int 0x80 + ret -endproc +#elif defined(__arm__) + + // why am i doing this? + stmfd r13!, {r14} + bl 8f + .string "hello world!\n\0" + .balign 4 +8: mov r1, r14 // might as well make it easy on myself + bl print_str + ldmfd r13!, {pc} + +print_str: + mov r2, #0 +0: ldrb r0, [r1, r2] + cmp r0, #0 + addne r2, r2, #1 + bne 0b + mov r0, #1 + mov r7, #SYS_write + swi 0 + bx r14 + +#elif defined(__aarch64__) + + // why am i doing this? + str x30, [sp, #-16]! + bl 8f + .string "hello world!\n\0" + .balign 4 +8: mov x1, x30 // might as well make it easy on myself + bl print_str + ldr x30, [sp], #16 + ret -proc x1c +print_str: + mov x2, #0 +0: ldrb w0, [x1, x2] + cmp w0, #0 + cinc.ne x2, x2 + b.ne 0b + mov x0, #1 + mov x8, #SYS_write + svc #0 + ret - ud2 +#else + notimpl +#endif endproc -proc x1d +proc x1a - ud2 + // collect the current instruction-pointer address. this was an old + // 32-bit i386 trick for position-independent code, but (a) it + // confuses the return predictor, and (b) amd64 has true pc-relative + // addressing. -endproc +#if defined(__x86_64__) -proc x1e + // the actual example + call 0f +0: pop rax - ud2 + // the modern i386 trick doesn't confuse the return-address + // predictor. + call calladdr_rbx + sub rbx, . - 0b -endproc + // but rip-relative addressing is even better + lea rcx, [rip + 0b] -proc x1f + ret - ud2 +calladdr_rbx: + mov rbx, [rsp] + ret -endproc +#elif defined(__i386__) -proc x20 + // the actual example + call 0f +0: pop eax - ud2 + // the modern i386 trick doesn't confuse the return-address + // predictor. + call get_pc_ebx + sub ebx, . - 0b ret -endproc +#elif defined(__arm__) -proc x21 + stmfd r13!, {r14} - ud2 + bl 0f +0: mov r0, r14 -endproc + bl return + sub r1, r14, #. - 0b -proc x22 + adr r2, 0b - ud2 + ldmfd r13!, {pc} -endproc +return: bx r14 -proc x23 +#elif defined(__aarch64__) - ud2 + str x30, [sp, #-16]! 
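+	// stash the link register: the bl instructions below clobber x30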
-endproc + // we can do all of the above using a64 + bl 0f +0: mov x0, x30 -proc x24 + bl return + sub x1, x30, #. - 0b - ud2 + adr x2, 0b + + ldr x30, [sp], #16 +return: ret + +#else + notimpl +#endif endproc -proc x25 +proc x1b - ud2 +#if defined(__x86_64__) + + // retpolines: an mitigation against adversarially influenced + // speculative execution at indirect branches. if an adversary can + // prepare a branch-target buffer entry matching an indirect branch + // in the victim's address space then they can cause the victim to + // /speculatively/ (but not architecturally) execute any code in + // their address space, possibly leading to leaking secrets through + // the cache. retpolines aren't susceptible to this because the + // predicted destination address is from the return-prediction stack + // which the adversary can't prime. the performance penalty is still + // essentially a branch misprediction -- for this return, and + // possibly all others already stacked. + + // (try not to crash) + lea rax, [rip + 9f] -endproc + push rax +9: ret -proc x26 +#elif defined(__i386__) - ud2 + call get_pc_ebx + lea eax, [ebx + 9f - .] -endproc + push eax +9: ret -proc x27 +#elif defined(__arm__) - ud2 + stmfd r13!, {r14} -endproc + adr r14, 8f + bx r14 -proc x28 +8: ldmfd r13!, {pc} - ud2 +#elif defined(__aarch64__) -endproc + str x30, [sp, #-16]! -proc x29 + adr x30, 8f + ret - ud2 +8: ldr x30, [sp], #16 + ret + +#else + notimpl +#endif endproc -proc x2a +proc x1c - ud2 + // ok, having a hard time seeing a use for this. the most important + // thing to note is that sp is set from `pop' /after/ it's + // incremented. -endproc +#if defined(__x86_64__) -proc x2b + // try not to crash + mov rax, rsp + and rsp, -16 + push rax - ud2 + pop rsp -endproc + // check it worked + mov rbx, rsp + ret -proc x2c +#elif defined(__i386__) - ud2 + // try not to crash + mov eax, esp + and esp, -16 + push eax -endproc + pop esp -proc x2d + // check it worked + mov ebx, esp + ret - ud2 +#elif defined(__arm__) -endproc + // not even going to dignify this + notimpl -proc x2e +#elif defined(__aarch64__) - ud2 + // not even going to dignify this + notimpl + +#else + notimpl +#endif endproc -proc x2f +proc x1d - ud2 + // monumentally cheesy way to copy 8 n bytes from buff1 to buff2. + // also clobbers words at buff2 + 8 n and buff2 - 8 for good measure. -endproc + n = 4 -proc x30 +#if defined(__x86_64__) - ud2 + mov rax, rsp // safekeeping + + // we're toast if we get hit by a signal now. fingers crossed... + .if 0 + mov rsp, buff2 + 8*n + 8 + mov rbp, buff1 + 8*n + .else + lea rsp, [rdi + 8*n + 16] + lea rbp, [rsi + 8*n] + .endif + enter 0, n + 1 + + // precise action: + // + // +---------+ +---------+ + // rbp -> | ??? | rsp -> | ??? | + // +---------+ +---------+ + // | w_{n-1} | | rbp | <- rbp' + // +---------+ +---------+ + // | ... | | w_{n-1} | + // +---------+ +---------+ + // | w_1 | | ... | + // +---------+ +---------+ + // | w_0 | | w_1 | + // +---------+ +---------+ + // | w_0 | + // +---------+ + // | rbp' | <- rsp' + // +---------+ + + mov rdx, rsp + mov rsp, rax + +#elif defined(__i386__) + + mov eax, esp // safekeeping + + // we're toast if we get hit by a signal now. fingers crossed... + .if 0 + mov esp, buff2 + 4*n + 4 + mov ebp, buff1 + 4*n + .else + lea esp, [edi + 4*n + 8] + lea ebp, [esi + 4*n] + .endif + enter 0, n + 1 + + mov edx, esp + mov esp, eax + +#elif defined(__arm__) + + add r4, r4, #4*n + add r5, r5, #4*n + 8 + + str r4, [r5, #-4]! + .rept n/2 + ldrd r0, r1, [r4, #-8]! 
+ strd r0, r1, [r5, #-8]! + .endr + add r4, r5, #4*n + str r4, [r5, #-4]! + +#elif defined(__aarch64__) + + // omgwtf. let's not actually screw with the stack pointer. + + add x4, x4, #8*n + add x5, x5, #8*n + 16 + + str x4, [x5, #-8]! + .rept n/2 + ldp x16, x17, [x4, #-16]! + stp x16, x17, [x5, #-16]! + .endr + add x4, x5, #8*n + str x4, [x5, #-8]! + +#else + notimpl +#endif ret endproc -proc x31 +proc x1e - ud2 + // convert nibble value to (uppercase) hex; other input values yield + // nonsense. + +#if defined(__x86_64__) + + // das doesn't work in 64-bit mode; best i can come up with + mov edx, eax + add al, '0' + add dl, 'A' - 10 + cmp al, '9' + 1 + cmovae eax, edx + +#elif defined(__i386__) + + cmp al, 0x0a // cf = 1 iff a < 10 + sbb al, 0x69 // if 0 <= a < 10, a' = a - 0x6a, so + // 0x96 <= a' < 0x70, setting af, cf + // if 10 <= a < 16, a' = a - 0x69, so + // 0x71 <= a' < 0x77, setting cf but + // clearing af + das // if 0 <= a < 10, then af and cf are + // both set, so set subtract 0x66 + // from a' leaving 0x30 <= a' < 0x3a; + // if 10 <= a < 16 then af clear but + // cf set, so subtract 0x60 from a' + // leaving 0x41 <= a' < 0x47 + +#elif defined(__arm__) + + // significantly less tricksy + cmp r0, #10 + addlo r0, r0, #'0' + addhs r0, r0, #'A' - 10 + +#elif defined(__aarch64__) + + // with less versatile conditional execution this is the best we can + // do + cmp w0, #10 + add w16, w0, #'A' - 10 + add w0, w0, #'0' + cmov.hs w0, w16 + +#else + notimpl +#endif + + ret endproc -proc x32 +proc x1f - ud2 + // verify collatz conjecture starting at a; assume a /= 0! + +#if defined(__x86_64__) + +0: bsf rcx, rax // clobber c if a = 0 + shr rax, cl // a = 2^c a' + cmp rdx, 0 + je 1f + stosq + dec rdx +1: + cmp rax, 1 // done? + je 9f + lea rax, [2*rax + rax + 1] // a' = 3 a' + 1 + jmp 0b // again + +9: ret + +#elif defined(__i386__) + +0: bsf ecx, eax // clobber c if a = 0 + shr eax, cl // a = 2^c a' + cmp edx, 0 + je 1f + stosd + dec edx +1: + cmp eax, 1 // done? 
+ je 9f + lea eax, [2*eax + eax + 1] // a' = 3 a' + 1 + jmp 0b // again + +9: ret + +#elif defined(__arm__) + + // rbit introduced in armv7 +0: rbit r2, r0 + clz r2, r2 + mov r0, r0, lsr r2 // a = 2^c a' + cmp r3, #0 + strne r0, [r5], #4 + subne r3, r3, #1 + cmp r0, #1 + adcne r0, r0, r0, lsl #1 // a' = 3 a' + 1 (because c set) + bne 0b -endproc + ret -proc x33 +#elif defined(__aarch64__) + +0: rbit w2, w0 + clz w2, w2 + lsr w0, w0, w2 // a = 2^c a' + cmp x3, #0 + beq 1f + str x0, [x5], #8 + sub x3, x3, #1 +1: + cmp w0, #1 + add w16, w0, w0, lsl #1 // t = 3 a' + 1 (because c set) + csinc.eq w0, w0, w16 + b.ne 0b - ud2 + ret + +#else + notimpl +#endif endproc -proc x34 +///-------------------------------------------------------------------------- +/// 0x20--0x2f - ud2 +proc x20 + + // calculate 1337 a slowly + +#if defined(__x86_64__) + + // original version + mov rcx, rax // c = a + shl rcx, 2 // c = 4 a + add rcx, rax // c = 5 a + shl rcx, 3 // c = 40 a + add rcx, rax // c = 41 a + shl rcx, 1 // c = 82 a + add rcx, rax // c = 83 a + shl rcx, 1 // c = 166 a + add rcx, rax // c = 167 a + shl rcx, 3 // c = 1336 a + add rcx, rax // c = 1337 a + + // a quick way + lea rdx, [2*rax + rax] // t = 3 a + shl rdx, 6 // t = 192 a + sub rdx, rax // t = 191 a + lea rbx, [8*rdx] // b = 1528 a + sub rbx, rdx // b = 1337 a + +#elif defined(__i386__) + + // original version + mov ecx, eax // c = a + shl ecx, 2 // c = 4 a + add ecx, eax // c = 5 a + shl ecx, 3 // c = 40 a + add ecx, eax // c = 41 a + shl ecx, 1 // c = 82 a + add ecx, eax // c = 83 a + shl ecx, 1 // c = 166 a + add ecx, eax // c = 167 a + shl ecx, 3 // c = 1336 a + add ecx, eax // c = 1337 a + + // a quick way + lea edx, [2*eax + eax] // t = 3 a + shl edx, 6 // t = 192 a + sub edx, eax // t = 191 a + lea ebx, [8*edx] // b = 1528 a + sub ebx, edx // b = 1337 a + +#elif defined(__arm__) + + // original version, ish + add r2, r0, r0, lsl #2 // c = 5 a + add r2, r0, r2, lsl #3 // c = 41 a + add r2, r0, r2, lsl #1 // c = 83 a + add r2, r0, r2, lsl #1 // c = 167 a + add r2, r0, r2, lsl #3 // c = 1337 a + + // quicker way + add r1, r0, r0, lsl #1 // b = 3 a + rsb r1, r0, r1, lsl #6 // b = 191 a + rsb r1, r1, r1, lsl #3 // b = 1337 a + +#elif defined(__aarch64__) + + // original version, ish + add x2, x0, x0, lsl #2 // c = 5 a + add x2, x0, x2, lsl #3 // c = 41 a + add x2, x0, x2, lsl #1 // c = 83 a + add x2, x0, x2, lsl #1 // c = 167 a + add x2, x0, x2, lsl #3 // c = 1337 a + + // sleazy because no rsb + add x1, x0, x0, lsl #1 // b = 3 a + sub x1, x0, x1, lsl #6 // b = -191 a + sub x1, x1, x1, lsl #3 // b = 1337 a + +#else + notimpl +#endif + + ret endproc -proc x35 +proc x21 - ud2 + // multiply complex numbers a + b i and c + d i + // + // (a + b i) (c + d i) = (a c - b d) + (a d + b c) i + // + // somewhat slick approach uses only three multiplications + +#if defined(__x86_64__) + + mov rsi, rax // t = a + add rax, rbx // a' = a + b + mov rdi, rdx // u = d + sub rdx, rcx // d' = d - c + add rdi, rcx // u = c + d + + imul rax, rcx // a' = c (a + b) + imul rsi, rdx // t = a (d - c) + imul rdi, rbx // u = b (c + d) + + add rsi, rax // t = a (d - c) + c (a + b) + mov rbx, rsi // b' = a (d - c) + c (a + b) + // = a d + b c + sub rax, rdi // a' = c (a + b) - b (c + d) + // = a c - b d + +#elif defined(__i386__) + + mov esi, eax // t = a + add eax, ebx // a' = a + b + mov edi, edx // u = d + sub edx, ecx // d' = d - c + add edi, ecx // u = c + d + + imul eax, ecx // a' = c (a + b) + imul esi, edx // t = a (d - c) + imul edi, ebx // u = b (c + d) + + add 
esi, eax // t = a (d - c) + c (a + b) + mov ebx, esi // b' = a (d - c) + c (a + b) + // = a d + b c + sub eax, edi // a' = c (a + b) - b (c + d) + // = a c - b d + +#elif defined(__arm__) + + add r4, r0, r1 // t = a + b + add r5, r2, r3 // u = c + d + sub r3, r3, r2 // d' = d - c + + // mls introduced in armv7 + mul r4, r4, r2 // t = c (a + b) + mov r2, r1 // c' = a (bah!) + mla r1, r0, r3, r4 // b' = a (d - c) + c (a + b) + // = a d + b c + mls r0, r2, r5, r4 // a' = c (a + b) - b (c + d) + // = a c - b d + +#elif defined(__aarch64__) + + add x4, x0, x1 // t = a + b + add x5, x2, x3 // u = c + d + sub x3, x3, x2 // d' = d - c + + // mls intxoduced in axmv7 + mul x4, x4, x2 // t = c (a + b) + mov x2, x1 // c' = a (bah!) + madd x1, x0, x3, x4 // b' = a (d - c) + c (a + b) + // = a d + b c + msub x0, x2, x5, x4 // a' = c (a + b) - b (c + d) + // = a c - b d + +#else + notimpl +#endif + + ret endproc -proc x36 +proc x22 - ud2 + // divide by 3 -endproc +#if defined(__x86_64__) -proc x37 + mov rdx, 0xaaaaaaaaaaaaaaab // = ceil(2/3 2^64) + mul rdx // d' || a' =~ 2/3 a 2^64 + shr rdx, 1 // d' = floor(a/3) + mov rax, rdx // a' = floor(a/3) - ud2 + // we start with 0 <= a < 2^64. write f = ceil(2/3 2^64), so that + // 2/3 < f/2^64 < 2/3 + 1/2^64. then floor(2/3 a) <= floor(a f/2^64) + // <= floor(2/3 a + a/2^64), but a < 2^64 so a/2^64 < 1 and + // floor(a f/2^64) = floor(2/3 a). -endproc +#elif defined(__i386__) -proc x38 + mov edx, 0xaaaaaaab // = ceil(2/3 2^32) + mul edx // d' || a' =~ 2/3 a 2^32 + shr edx, 1 // d' = floor(a/3) + mov eax, edx // a' = floor(a/3) - ud2 +#elif defined(__arm__) -endproc + ldr r12, =0xaaaaaaab + umull r12, r0, r0, r12 + mov r0, r0, lsr #1 -proc x39 +#elif defined(__aarch64__) - ud2 + ldr x16, =0xaaaaaaaaaaaaaaab + umulh x0, x0, x16 + lsr x0, x0, #1 + +#else + notimpl +#endif + + ret endproc -proc x3a +proc x23 - ud2 +#if defined(__x86_64__) -endproc + // main loop: shorten a preserving residue class mod 3 +0: cmp rax, 5 + jbe 8f + // a > 5 + mov rdx, rax // d' = a + shr rdx, 2 // d' = floor(a/4) + and rax, 3 // a = 4 d' + a' (0 <= a' < 4) + add rax, rdx // a' == a (mod 3) but a' < a/4 + 4 + jmp 0b -proc x3b + // fix up final value 0 <= a < 6: want 0 <= a < 3 + // + // the tricky part is actually a = 3; but the other final cases take + // additional iterations which we can avoid. +8: cmp rax, 3 // set cf iff a < 3 + cmc // set cf iff a >= 3 + sbb rdx, rdx // d' = a >= 3 ? -1 : 0 + and rdx, 3 // d' = a >= 3 ? 3 : 0 + sub rax, rdx // a' = a - (a >= 3 ? 3 : 0) + // = a (mod 3) + +#elif defined(__i386__) + + // main loop: shorten a preserving residue class mod 3 +0: cmp eax, 5 + jbe 8f + // a > 5 + mov edx, eax // d' = a + shr edx, 2 // d' = floor(a/4) + and eax, 3 // a = 4 d' + a' (0 <= a' < 4) + add eax, edx // a' == a (mod 3) but a' < a/4 + 4 + jmp 0b + + // fix up final value 0 <= a < 6: want 0 <= a < 3 + // + // the tricky part is actually a = 3; but the other final cases take + // additional iterations which we can avoid. +8: cmp eax, 3 // set cf iff a < 3 + cmc // set cf iff a >= 3 + sbb edx, edx // d' = a >= 3 ? -1 : 0 + and edx, 3 // d' = a >= 3 ? 3 : 0 + sub eax, edx // a' = a - (a >= 3 ? 
3 : 0) + // = a (mod 3) - ud2 +#elif defined(__arm__) -endproc +0: cmp r0, #6 + andhs r12, r0, #3 + addhs r0, r12, r0, lsr #2 + bhs 0b -proc x3c + cmp r0, #3 + subhs r0, r0, #3 - ud2 +#elif defined(__aarch64__) -endproc +0: cmp x0, #6 + // blunder on through regardless since this doesn't affect the result + and x16, x0, #3 + add x0, x16, x0, lsr #2 + b.hs 0b -proc x3d + subs x16, x0, #3 + cmov.hs x0, x16 - ud2 +#else + notimpl +#endif + + ret endproc -proc x3e +proc x24 - ud2 + // invert (odd) a mod 2^64 + // + // suppose a a_i == 1 (mod 2^{2^i}) + // + // clearly good for i = 0, since 2^i = 1 and 2^{2^i} = 2, and a_0 = + // a == 1 (mod 2) by assumption + // + // write a a_i == b_i 2^{2^i} + 1 (mod 2^{2^{i+1}}) + // then b_i == (a a_i - 1)/2^{2^i} (mod 2^{2^i}) + // to lift inverse, we want x such that a x == -b_i (mod 2^{2^i}); + // clearly x = -a_i b_i will do, since a a_i == 1 (mod 2^{2^i}) + // then: + // a_{i+1} = a_i - a_i b_i 2^{2^i} = a_i (1 - (a a_i - 1)) + // = 2 a_i - a a_i^2 + // + // check: + // a a_{i+1} = 2 a a_i - a^2 a_i^2 + // == 2 a a_i - (b_i 2^{2^i} + 1)^2 + // == 2 (b_i 2^{2^i} + 1) - + // (b_i^2 2^{2^{i+1}} + 2 b_i 2^{2^i} + 1) + // == 1 (mod 2^{2^{i+1}}) + +#if defined(__x86_64__) + + // rax // a_0 = a + mov rbx, rax // b' = a + mov rsi, rax // t = a_0 + +0: + cmp rbp, 0 + je 1f + stosq + dec rbp +1: + mul rbx // a' = a a_i + mov rcx, rax // c = a a_i + + sub rax, 2 // a' = a a_i - 2 + neg rax // a' = 2 - a a_i + mul rsi // a_{i+1} = a_i (2 - a a_i) + // = 2 a_i - a a_i^2 + mov rsi, rax // t = a_{i+1} + + cmp rcx, 1 // done? + ja 0b // no -- iterate + +#elif defined(__i386__) + + // eax // a_0 = a + mov ebx, eax // b' = a + mov esi, eax // t = a_0 + +0: + cmp ebp, 0 + je 1f + stosd + dec ebp +1: + mul ebx // a' = a a_i + mov ecx, eax // c = a a_i + + sub eax, 2 // a' = a a_i - 2 + jb 9f // done if < 2 + neg eax // a' = 2 - a a_i + mul esi // a_{i+1} = a_i (2 - a a_i) + // = 2 a_i - a a_i^2 + mov esi, eax // t = a_{i+1} + + jmp 0b // and iterate +9: mov eax, esi // restore + +#elif defined(__arm__) + + // r0 // a_0 = a + mov r1, r0 // b' = a + +0: + cmp r6, #0 + strne r0, [r5], #4 + subne r6, r6, #1 + mul r2, r0, r1 // c = a a_i + rsbs r2, r2, #2 // c = 2 - a a_i + mul r0, r0, r2 // a_{i+1} = a_i (2 - a a_i) + // = 2 a_i - a a_i^2 + blo 0b + +#elif defined(__aarch64__) + + // x0 // a_0 = a + mov x1, x0 // b' = a + mov x16, #2 // because we have no rsb + +0: + cmp x6, #0 + b.eq 1f + str x0, [x5], #8 + sub x6, x6, #1 +1: + mul x2, x0, x1 // c = a a_i + subs x2, x16, x2 // c = 2 - a a_i + mul x0, x0, x2 // a_{i+1} = a_i (2 - a a_i) + // = 2 a_i - a a_i^2 + b.lo 0b + +#else + notimpl +#endif + + ret endproc -proc x3f +proc x25 - ud2 + // a poor approximation to pi/4 + // + // think of x and y as being in 16.16 fixed-point format. we sample + // points in the unit square, and determine how many of them are + // within a unit quarter-circle centred at the origin. the area of + // the quarter-circle is pi/4. -endproc +#if defined(__x86_64__) + + xor eax, eax // a = 0 + mov rcx, 1 + shl rcx, 0x20 // c =~ 4 billion + +0: movzx rbx, cx // x = low 16 bits of c + imul rbx, rbx // b = x^2 + + ror rcx, 0x10 // switch halves of c + movzx rdx, cx // y = high 16 bits of c + imul rdx, rdx // d = y^2 + rol rcx, 0x10 // switch back + + add rbx, rdx // r^2 = x^2 + y^2 + shr rbx, 0x20 // r^2 >= 1? + cmp rbx, 1 // set cf iff r^2 >= 1 + adc rax, 0 // and add onto accumulator + loop 0b + +#elif defined(__i386__) + + // this is actually better done in 32 bits. 
the carry has the wrong + // sense here, so instead deduct one for each point outside the + // quarter-circle rather than adding one for each point inside it. + xor eax, eax + xor ecx, ecx + +0: movzx ebx, cx + imul ebx, ebx + + mov edx, ecx + shr edx, 0x10 + imul edx, edx + + add ebx, edx // see? + sbb eax, 0 + loop 0b + +#elif defined(__arm__) + + mov r0, #0 + mov r2, #0 + +0: uxth r1, r2, ror #0 + uxth r3, r2, ror #16 + mul r1, r1, r1 + mul r3, r3, r3 + cmn r1, r3 // mlas doesn't set cf usefully + addcc r0, r0, #1 + adds r2, r2, #1 + bne 0b + +#elif defined(__aarch64__) + + mov w0, #0 + mov w2, #0 + +0: ubfx w1, w2, #0, #16 + ubfx w3, w2, #16, #16 + sub w2, w2, #1 + mul w1, w1, w1 + mul w3, w3, w3 + cmn w1, w3 + cinc.cc w0, w0 + cbnz w2, 0b + +#else + notimpl +#endif + + ret + +endproc + +proc x26 + + // a bad way to rotate a right by 7 places + +#if defined(__x86_64__) + + mov rbx, rax + ror rbx, 7 // better + + mov rdx, rax // d' = a + shr rax, 7 // a' = a >> 7 + shl rdx, 0x39 // d' = a << 57 + or rax, rdx // a' = a >>> 7 + +#elif defined(__i386__) + + mov ebx, eax + ror ebx, 7 // better + + mov edx, eax // d' = a + shr eax, 7 // a' = a >> 7 + shl edx, 0x39 // d' = a << 57 + or eax, edx // a' = a >>> 7 + +#elif defined(__arm__) + + mov r1, r0, ror #7 // easy way + + // even the hard way is fairly easy on arm + mov r3, r0, lsl #25 + orr r0, r3, r0, lsr #7 // hard way + +#elif defined(__aarch64__) + + ror x1, x0, #7 // easy way + + // even the hard way is fairly easy on arm + lsl x3, x0, #57 + orr x0, x3, x0, lsr #7 // hard way + +#else + notimpl +#endif + + ret + +endproc + +proc x27 + + // shift a right by c places, in two halves + +#if defined(__x86_64__) + + mov ch, cl // c' = [c, c] + inc ch // c' = [c, c + 1] + shr ch, 1 + shr cl, 1 // c' = [floor(c/2), ceil(c/2)] + shr rax, cl + xchg ch, cl + shr rax, cl + +#elif defined(__i386__) + + mov ch, cl // c' = [c, c] + inc ch // c' = [c, c + 1] + shr ch, 1 + shr cl, 1 // c' = [floor(c/2), ceil(c/2)] + shr eax, cl + xchg ch, cl + shr eax, cl + +#elif defined(__arm__) + + // it would be clearer and more efficient to say: `mov r12, r2, lsr + // #1; sub r2, r2, r12', but that's not the lesson this exercise is + // trying to teach. 
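+	// either way, floor(c/2) + ceil(c/2) = c, so the two partial shifts
+	// between them shift right by the full c places.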
+ add r12, r2, #1 + mov r2, r2, lsr #1 + mov r12, r12, lsr #1 + mov r0, r0, lsr r2 + mov r0, r0, lsr r12 + +#elif defined(__aarch64__) + + add w16, w2, #1 + lsr w2, w2, #1 + lsr w16, w16, #1 + lsr x0, x0, x2 + lsr x0, x0, x16 + +#else + notimpl +#endif + + ret + +endproc + +proc x28 + + // divide c-byte little-endian bignum at rsi by 2 (rounding down) + +#if defined(__x86_64__) + + clc +0: rcr byte ptr [rsi], 1 + inc rsi + loop 0b + +#elif defined(__i386__) + + clc +0: rcr byte ptr [esi], 1 + inc esi + loop 0b + +#elif defined(__arm__) + + // we could hack this a word at a time using rrx + mov r3, #0 +0: ldrb r12, [r4] + subs r2, r2, #1 + orr r3, r3, r12, lsr #1 + strb r3, [r4], #1 + mov r3, r12, lsl #7 + bne 0b + +#elif defined(__aarch64__) + + mov w16, #0 +0: ldrb w17, [x4] + sub x2, x2, #1 + orr w16, w16, w17, lsr #1 + strb w16, [x4], #1 + lsl w16, w17, #7 + cbnz x2, 0b + +#else + notimpl +#endif + + ret + +endproc + +proc x29 + + // fill a buffer with a 3-byte pattern + +#if defined(__x86_64__) + + lea rdi, [rsi + 3] + rep movsb + +#elif defined(__i386__) + + lea edi, [esi + 3] + rep movsb + +#elif defined(__arm__) + + add r5, r4, #3 +0: subs r2, r2, #1 + ldrhsb r12, [r4], #1 + strhsb r12, [r5], #1 + bhs 0b + +#elif defined(__aarch64__) + + cbz x2, 9f + add x5, x4, #3 +0: sub x2, x2, #1 + ldrb w16, [x4], #1 + strb w16, [x5], #1 + cbnz x2, 0b +9: + +#else + notimpl +#endif + + ret + +endproc + +proc x2a + + // rotate the words in a buffer, so that the last word comes first, + // the first comes second, and so on. this isn't a good way to do + // it. + +#if defined(__x86_64__) + + mov rsi, rbx // set string pointers + mov rdi, rbx +0: lodsq // fetch next word + xchg rax, qword ptr [rbx] // stash it for next iteration and + // replace it with the previously + // stashed word + stosq // store in output + // (note that the first iteration doesn't actually do anything) + loop 0b // continue until all done + +#elif defined(__i386__) + + mov esi, ebx // set string pointers + mov edi, ebx +0: lodsd // fetch next word + xchg eax, dword ptr [ebx] // stash it for next iteration and + // replace it with the previously + // stashed word + stosd // store in output + loop 0b // continue until all done + +#elif defined(__arm__) + + // let's do this a sensible way. (we could go faster using ldm/stm.) + add r0, r1, r2, lsl #2 // find the end of the buffer + ldr r0, [r0, #-4] // collect final element +0: subs r2, r2, #1 + ldr r12, [r1] + str r0, [r1], #4 + mov r0, r12 + bne 0b + +#elif defined(__aarch64__) + + add x0, x1, x2, lsl #3 // find the end of the buffer + ldr x0, [x0, #-8] // collect final element +0: sub x2, x2, #1 + ldr x16, [x1] + str x0, [x1], #8 + mov x0, x16 + cbnz x2, 0b + +#else + notimpl +#endif + + ret + +endproc + +proc x2b + + // find a cycle in a function f: B -> B, where B = {0, 1, ..., 255} + +#if defined(__x86_64__) + + // this is floyd's cycle-finding algorithm. + // + // consider the sequence s_0 = 0, s_1 = f(0), s_2 = f(f(0)), ..., + // s_{i+1} = f(s_i). since B is finite, there must be some smallest + // t and c such that s(t) = s(t + c); then we have s_i = s_j iff + // i >= t, j >= t, and i == j (mod c). + // + // the algorithm sets two cursors advancing through the sequence: a + // /tortoise/ which advances one step at a time, and a /hare/ which + // advances by two, so when the tortoise is at element s_i, the hare + // is at s_{2i}. 
the hare will run around the cycle and catch the + // tortoise when i >= t and i == 2 i (mod c); the latter is simply i + // == 0 (mod c), which therefore happens first when i = k = t + + // (-t mod c). + // + // i'm not sure what good xlatb does here that mov al, [rbx + al] + // doesn't. + + xor eax, eax // tortoise starts at 0 + xor edx, edx // hare starts at 0 +0: xlatb // advance tortoise + xchg rax, rdx // switch to hare + xlatb // advance hare ... + xlatb // ... twice + xchg rax, rdx // switch back + cmp al, dl // hare caught the tortoise? + jnz 0b // no -- go around again + + // now we trace the initial tail: reset the tortoise to s_0, and slow + // the hare down so that both take only a single step in each + // iteration. this loop terminates when i >= t and i == i + 2 k + // (mod c). we know k is a multiple of c, so the latter condition + // always holds, so this finds the first step of the cycle. + + xor eax, eax // reset the tortoise +0: xlatb // advance tortoise + xchg rax, rdx // switch to hare + xlatb // advance hare + xchg rax, rdx // and switch back + cmp al, dl // done? + jnz 0b // no -- iterate + +#elif defined(__i386__) + + xor eax, eax // tortoise starts at 0 + xor edx, edx // hare starts at 0 +0: xlatb // advance tortoise + xchg eax, edx // switch to hare + xlatb // advance hare ... + xlatb // ... twice + xchg eax, edx // switch back + cmp al, dl // hare caught the tortoise? + jnz 0b // no -- go around again + + xor eax, eax // reset the tortoise +0: xlatb // advance tortoise + xchg eax, edx // switch to hare + xlatb // advance hare + xchg eax, edx // and switch back + cmp al, dl // done? + jnz 0b // no -- iterate + +#elif defined(__arm__) + + mov r0, #0 + mov r3, #0 +0: ldrb r0, [r1, r0] + ldrb r3, [r1, r3] + ldrb r3, [r1, r3] + cmp r0, r3 + bne 0b + + mov r0, #0 +0: ldrb r0, [r1, r0] + ldrb r3, [r1, r3] + cmp r0, r3 + bne 0b + +#elif defined(__aarch64__) + + mov w0, #0 + mov w3, #0 +0: ldrb w0, [x1, x0] + ldrb w3, [x1, x3] + ldrb w3, [x1, x3] + cmp w0, w3 + b.ne 0b + + mov w0, #0 +0: ldrb w0, [x1, x0] + ldrb w3, [x1, x3] + cmp w0, w3 + b.ne 0b + +#else + notimpl +#endif + + ret + +endproc + +proc x2c + + // a convoluted way to set rax = rsi + +#if defined(__x86_64__) + + mov qword ptr [rbx + 8*rcx], 0 // b[c] = 0 + mov qword ptr [rbx + 8*rdx], 1 // b[d] = 1 + mov rax, [rbx + 8*rcx] // a' = b[c] = 0 + + mov [rbx], rsi // b[0] = t + mov [rbx + 8], rdi // b[1] = u + mov rax, [rbx + 8*rax] // a' = b[a'] = b[0] = t + +#elif defined(__i386__) + + mov dword ptr [ebx + 8*ecx], 0 // b[c] = 0 + mov dword ptr [ebx + 8*edx], 1 // b[d] = 1 + mov eax, [ebx + 8*ecx] // a' = b[c] = 0 + + mov [ebx], esi // b[0] = t + mov [ebx + 8], edi // b[1] = u + mov eax, [ebx + 8*eax] // a' = b[a'] = b[0] = t + +#elif defined(__arm__) + + mov r0, #0 + mov r12, #1 + + str r0, [r1, r2, lsl #2] + str r12, [r1, r3, lsl #2] + ldr r0, [r1, r2, lsl #2] + + str r4, [r1] + str r5, [r1, #4] + ldr r0, [r1, r0, lsl #2] + +#elif defined(__aarch64__) + + mov x16, #1 + + str xzr, [x1, x2, lsl #3] + str x16, [x1, x3, lsl #3] + ldr x0, [x1, x2, lsl #3] + + str x4, [x1] + str x5, [x1, #8] + ldr x0, [x1, x0, lsl #3] + +#else + notimpl +#endif + + ret + +endproc + +proc x2d + + // clear the least significant set bit in a, by calculating a' = + // a AND (a - 1). + // + // if a = 0 then a' = 0. otherwise, a - 1 differs from a exactly in + // the least significant /set/ bit of a, and all bits of lesser + // significance. to put it another way: write a = u 2^{k+1} + 2^k; + // then a - 1 = u 2^{k+1} + 2^{k-1} + ... 
+ 2 + 1. taking the + // bitwise AND of these leaves only the bits common to both, i.e., + // u 2^{k+1}. + +#if defined(__x86_64__) + + mov rdx, rax // d' = a + dec rax // a' = a - 1 + and rax, rdx // a' = a AND (a - 1) + +#elif defined(__i386__) + + mov edx, eax // d' = a + dec eax // a' = a - 1 + and eax, edx // a' = a AND (a - 1) + +#elif defined(__arm__) + + sub r3, r0, #1 + and r0, r0, r3 + +#elif defined(__aarch64__) + + sub x3, x0, #1 + and x0, x0, x3 + +#else + notimpl +#endif + + ret + +endproc + +proc x2e + + // compute a mask of one bits in exactly the positions of the + // low-order run of zero bits in a + +#if defined(__x86_64__) + + mov rdx, rax // d' = a + dec rdx // d' = a - 1 + xor rax, rdx // a = a XOR (a - 1) + // set bits are least significant + // set bit of a, and all bits of + // lesser significance + shr rax, 1 // now only bits of lesser + // significance; a' = 0 iff a odd + cmp rax, rdx // equal if a = 0 or 2^k; otherwise + // strictly less + +#elif defined(__i386__) + + mov edx, eax + dec edx + xor eax, edx + shr eax, 1 + cmp eax, edx + +#elif defined(__arm__) + + sub r3, r0, #1 + eor r0, r0, r3 + mov r0, r0, lsr #1 // probably fold shift into next inst + cmp r0, r3 + +#elif defined(__aarch64__) + + sub x3, x0, #1 + eor x0, x0, x3 + mov x0, x0, lsr #1 // probably fold shift into next inst + cmp x0, x3 + +#else + notimpl +#endif + + ret + +endproc + +proc x2f + + // a slow population count + +#if defined(__x86_64__) + + popcnt rbx, rcx // the easy way + + // a fast version in software + mov rax, rcx + + mov rdx, rcx + shr rdx, 1 + mov rsi, 0x5555555555555555 + and rax, rsi + and rdx, rsi + add rax, rdx + + mov rdx, rax + shr rdx, 2 + mov rsi, 0x3333333333333333 + and rax, rsi + and rdx, rsi + add rax, rdx + + mov rdx, rax + shr rdx, 32 + add rax, rdx + + mov rdx, rax + shr rdx, 4 + and rax, 0x0f0f0f0f + and rdx, 0x0f0f0f0f + add rax, rdx + + mov rdx, rax + shr rdx, 8 + add rax, rdx + + mov rdx, rax + shr rdx, 16 + add rax, rdx + movzx rsi, al + + // the official version + xor eax, eax // clear iteration counter +0: jrcxz 9f // bail if c = 0 + inc rax // bump iteration count + mov rdx, rcx // d' = c + dec rdx // d' = c - 1 + and rcx, rdx // zap least significant set bit of c + jmp 0b // and go again +9: + +#elif defined(__i386__) + + popcnt ebx, ecx // the easy way + + mov eax, ecx + + mov edx, ecx + shr edx, 1 + and eax, 0x55555555 + and edx, 0x55555555 + add eax, edx + + mov edx, eax + shr edx, 2 + and eax, 0x33333333 + and edx, 0x33333333 + add eax, edx + + mov edx, eax + shr edx, 4 + add eax, edx + + mov edx, eax + shr edx, 8 + and eax, 0x000f000f + and edx, 0x000f000f + add eax, edx + + mov edx, eax + shr edx, 16 + add eax, edx + movzx esi, al + + xor eax, eax +0: jecxz 9f + inc eax + mov edx, ecx + dec edx + and ecx, edx + jmp 0b +9: + +#elif defined(__arm__) + + // the easy-ish way + vmov d0[0], r2 + vcnt.8 d0, d0 + vmov r1, d0[0] + add r1, r1, r1, lsl #8 + add r1, r1, r1, lsl #16 + mov r1, r1, lsr #24 + + // the hard way + movw r12, #0x5555 + movt r12, #0x5555 + and r3, r12, r2, lsr #1 + and r0, r12, r2 + add r0, r0, r3 + + movw r12, #0x3333 + movt r12, #0x3333 + and r3, r12, r0, lsr #2 + and r0, r12, r0 + add r0, r0, r3 + + add r0, r0, r0, lsl #16 + + movt r12, 0x0f0f + and r3, r12, r0, lsr #4 + and r0, r12, r0 + add r0, r0, r3 + + add r0, r0, r0, lsl #8 + + mov r4, r0, lsr #24 + + // and following the exercise + mov r0, #0 + cmp r2, #0 + beq 9f +0: add r0, r0, #1 + sub r3, r2, #1 + ands r2, r2, r3 + bne 0b +9: + +#elif defined(__aarch64__) + + // the 
easy-ish way + mov v0.d[0], x2 + cnt v0.8b, v0.8b + mov x1, v0.d[0] + add x1, x1, x1, lsl #8 + add x1, x1, x1, lsl #16 + add x1, x1, x1, lsl #32 + lsr x1, x1, #56 + + // the hard way -- though arm64's immediate constant encodings and + // shifting make this actually rather pleasant. + and x3, x2, #0xaaaaaaaaaaaaaaaa + and x0, x2, #0x5555555555555555 + add x0, x0, x3, lsr #1 + + and x3, x0, #0xcccccccccccccccc + and x0, x0, #0x3333333333333333 + add x0, x0, x3, lsr #2 + + add x0, x0, x0, lsr #4 + + and x3, x0, #0x0f000f000f000f00 + and x0, x0, #0x000f000f000f000f + add x0, x3, x0, lsl #8 + + add x0, x0, x0, lsl #16 + add x0, x0, x0, lsl #32 + lsr x4, x0, #56 + + // and the official way + mov x0, #0 + cbz x2, 9f +0: add x0, x0, #1 + sub x3, x2, #1 + and x2, x2, x3 + cbnz x2, 0b +9: + +#else + notimpl +#endif + + ret + +endproc + +///-------------------------------------------------------------------------- +/// 0x30--0x3f + +proc x30 + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + + ret + +endproc + +proc x31 + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x32 + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x33 + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x34 + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x35 + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x36 + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x37 + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x38 + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x39 + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x3a + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x3b + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x3c + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) 
+ + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x3d + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x3e + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +proc x3f + +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif + +endproc + +///----- That's all, folks --------------------------------------------------
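+
+///--------------------------------------------------------------------------
+/// Postscript: portable restatements (reference only).
+///
+/// The following C sketches restate what a couple of the routines above
+/// compute, purely as an aid when reading the commentary.  They are not
+/// part of the exercises: the function names are invented here, the code
+/// assumes <stdint.h>, and each sketch mirrors the structure of the
+/// assembler above rather than aiming to be the best C rendering.
+///
+///    /* x2b: floyd's tortoise-and-hare cycle finding over f: B -> B.
+///     * both loops advance before comparing, exactly as the assembler
+///     * does, so the value returned is the meeting point on the cycle:
+///     * the first element of the cycle whenever the tail is nonempty. */
+///    static unsigned char cycle_meet(const unsigned char f[256])
+///    {
+///      unsigned char t = 0, h = 0;
+///
+///      /* tortoise takes one step per iteration, hare takes two */
+///      do { t = f[t]; h = f[f[h]]; } while (t != h);
+///
+///      /* reset the tortoise; both now take single steps */
+///      t = 0;
+///      do { t = f[t]; h = f[h]; } while (t != h);
+///      return t;
+///    }
+///
+///    /* x2f: the software population count, following the same widening
+///     * steps as the x86-64 version above: bit pairs, nibbles, fold the
+///     * top half down, bytes, then accumulate the bytes. */
+///    static unsigned popcount64(uint64_t x)
+///    {
+///      x = (x & 0x5555555555555555u) + ((x >> 1) & 0x5555555555555555u);
+///      x = (x & 0x3333333333333333u) + ((x >> 2) & 0x3333333333333333u);
+///      x += x >> 32;
+///      x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);
+///      x += x >> 8;
+///      x += x >> 16;
+///      return x & 0xff;
+///    }
+///
+///    /* x2f, `the official version': repeatedly clear the lowest set
+///     * bit (the x2d trick) and count how many times that was possible. */
+///    static unsigned popcount_slow(uint64_t x)
+///    {
+///      unsigned n = 0;
+///      while (x) { x &= x - 1; n++; }
+///      return n;
+///    }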