/// -*- mode: asm; asm-comment-char: 0 -*- ///-------------------------------------------------------------------------- /// Preliminaries. #include #if defined(__i386__) || defined(__x86_64__) .intel_syntax noprefix #elif defined(__arm__) .macro ret bx r14 .endm .arch armv7-a #elif defined(__aarch64__) .macro cmov rd, rn, cc csel \rd, \rn, \rd, \cc .endm #define _COND(_) \ _(eq) _(ne) _(cs) _(cc) _(vs) _(vc) _(mi) _(pl) \ _(ge) _(lt) _(gt) _(le) _(hi) _(ls) _(al) _(nv) \ _(hs) _(lo) #define _INST(_) \ _(ccmp) _(ccmn) \ _(csel) _(cmov) \ _(csinc) _(cinc) _(cset) \ _(csneg) _(cneg) \ _(csinv) _(cinv) _(csetm) #define _CONDVAR(cc) _definstvar cc; #define _INSTVARS(inst) \ .macro _definstvar cc; \ .macro inst.\cc args:vararg; inst \args, \cc; .endm; \ .endm; \ _COND(_CONDVAR); \ .purgem _definstvar; _INST(_INSTVARS) #undef _COND #undef _INST #undef _CONDVAR #undef _INSTVARS #define CCMP_N 8 #define CCMP_Z 4 #define CCMP_C 2 #define CCMP_V 1 #define CCMP_MI CCMP_N #define CCMP_PL 0 #define CCMP_EQ CCMP_Z #define CCMP_NE 0 #define CCMP_CS CCMP_C #define CCMP_HS CCMP_C #define CCMP_CC 0 #define CCMP_LO 0 #define CCMP_VS CCMP_V #define CCMP_VC 0 #define CCMP_HI CCMP_C #define CCMP_LS 0 #define CCMP_LT CCMP_N #define CCMP_GE 0 #define CCMP_LE CCMP_N #define CCMP_GT 0 #else # error "not supported" #endif .macro proc name .globl \name .type \name, STT_FUNC .p2align 4 \name\(): .macro endproc .size \name, . - \name .purgem endproc .endm .endm .macro ch c #if defined(__i386__) pushf push eax push ebx push ecx push edx push ebp mov ebp, esp and esp, -16 push \c call putchar@plt call get_pc_ebx add ebx, offset _GLOBAL_OFFSET_TABLE mov eax, [ebx + stdout@GOT] mov eax, [eax] call fflush@plt mov esp, ebp pop ebp pop edx pop ecx pop ebx pop eax popf #elif defined(__x86_64__) pushf push rax push rcx push rdx push rsi push rdi push r8 push r9 push rbp mov rbp, rsp and rsp, -16 mov rdi, \c call putchar@plt mov rdi, [rip + stdout] call fflush@plt mov rsp, rbp pop rbp pop r9 pop r8 pop rdi pop rsi pop rdx pop rcx pop rax popf #elif defined(__arm__) stmfd r13!, {r0-r4, r12, r14} mov r4, r13 bic r14, r4, #15 mov r13, r14 mov r0, #\c bl putchar@plt ldr r14, .L$_c$gotoff$\@ .L$_c$gotpc$\@: add r14, pc, r14 b .L$_c$cont$\@ .L$_c$gotoff$\@: .word _GLOBAL_OFFSET_TABLE - .L$_c$gotpc$\@ - 8 .L$_c$cont$\@: bl fflush@plt mov r13, r4 ldmfd r13!, {r0-r4, r12, r14} #elif defined(__aarch64__) sub sp, sp, #20*8 stp x0, x1, [sp, #0] stp x2, x3, [sp, #16] stp x4, x5, [sp, #32] stp x6, x7, [sp, #48] stp x8, x9, [sp, #64] stp x10, x11, [sp, #80] stp x12, x13, [sp, #96] stp x14, x15, [sp, #112] stp x16, x17, [sp, #128] mrs x16, nzcv stp x16, x30, [sp, #144] mov w0, #\c bl putchar adrp x0, :got:stdout ldr x0, [x0, #:got_lo12:stdout] ldr x0, [x0] bl fflush ldp x16, x30, [sp, #144] msr nzcv, x16 ldp x16, x17, [sp, #128] ldp x14, x15, [sp, #112] ldp x12, x13, [sp, #96] ldp x10, x11, [sp, #80] ldp x8, x9, [sp, #64] ldp x6, x7, [sp, #48] ldp x4, x5, [sp, #32] ldp x2, x3, [sp, #16] ldp x0, x1, [sp, #0] add sp, sp, #20*8 #else # error "not supported" #endif .endm .macro notimpl #if defined(__i386__) || defined(__x86_64__) ud2 #elif defined(__arm__) udf #elif defined(__aarch64__) hlt #0 #else # error "not supported" #endif .endm .section .note.GNU-stack, "", %progbits .text #if defined(__i386__) get_pc_ebx: mov ebx, [esp] ret #endif proc call_example #if defined(__i386__) push ebx // ebx push esi // esi, ebx push edi // edi, esi, ebx push ebp // flags, ebp, ..., ebx pushf mov edi, [esp + 4*6] mov esi, [esp + 4*7] push esi // regs, flags, 
ebp, ..., ebx call get_pc_ebx lea eax, [ebx + 9f - .] push eax // cont, regs, flags, ebp, ..., ebx push edi // func, cont, regs, flags, ebp, ..., ebx mov eax, [esi + 28] pushf pop ecx and eax, 0x0cd5 and ecx, ~0x0cd5 or eax, ecx push eax popf mov eax, [esi + 0] mov ebx, [esi + 4] mov ecx, [esi + 8] mov edx, [esi + 12] mov edi, [esi + 20] mov ebp, [esi + 24] mov esi, [esi + 16] ret // -> func; regs, flags, ebp, ..., ebx 9: pushf // eflags, regs, flags, ebp, ..., ebx push esi // esi, eflags, regs, flags, ebp, ..., ebx mov esi, [esp + 8] mov [esi + 0], eax mov [esi + 4], ebx mov [esi + 8], ecx mov [esi + 12], edx mov [esi + 20], edi mov [esi + 24], ebp pop eax // rflags, regs, flags, ebp, ..., ebx mov [esi + 16], eax pop eax // regs, flags, ebp, ..., ebx mov [esi + 28], eax add esp, 4 // flags, ebp, ..., ebx popf // ebp, ..., ebx pop ebp // ..., ebx pop edi pop esi pop ebx // ret #elif defined(__x86_64__) push rbx // rbx push r10 push r11 push r12 push r13 push r14 push r15 push rbp // flags, rbp, ..., rbx pushf push rsi // regs, flags, rbp, ..., rbx lea rax, [rip + 9f] push rax // cont, regs, flags, rbp, ..., rbx push rdi // func, cont, regs, flags, rbp, ..., rbx mov rax, [rsi + 8*15] pushf pop rcx and rax, 0x0cd5 and rcx, ~0x0cd5 or rax, rcx push rax popf mov rax, [rsi + 0] mov rbx, [rsi + 8] mov rcx, [rsi + 16] mov rdx, [rsi + 24] mov rdi, [rsi + 40] mov rbp, [rsi + 48] mov r8, [rsi + 56] mov r9, [rsi + 64] mov r10, [rsi + 72] mov r11, [rsi + 80] mov r12, [rsi + 88] mov r13, [rsi + 96] mov r14, [rsi + 104] mov r15, [rsi + 112] mov rsi, [rsi + 32] ret // -> func; regs, flags, rbp, ..., rbx 9: pushf // rflags, regs, flags, rbp, ..., rbx push rsi // rsi, rflags, regs, flags, rbp, ..., rbx mov rsi, [rsp + 16] mov [rsi + 0], rax mov [rsi + 8], rbx mov [rsi + 16], rcx mov [rsi + 24], rdx mov [rsi + 40], rdi mov [rsi + 48], rbp mov [rsi + 56], r8 mov [rsi + 64], r9 mov [rsi + 72], r10 mov [rsi + 80], r11 mov [rsi + 88], r12 mov [rsi + 96], r13 mov [rsi + 104], r14 mov [rsi + 112], r15 pop rax // rflags, regs, flags, rbp, ..., rbx mov [rsi + 32], rax pop rax // regs, flags, rbp, ..., rbx mov [rsi + 120], rax add rsp, 8 // flags, rbp, ..., rbx popf // rbp, ..., rbx pop rbp // ..., rbx pop r15 pop r14 pop r13 pop r12 pop r11 pop r10 pop rbx // ret #elif defined(__arm__) stmfd r13!, {r0, r1, r4-r11, r14} ldmia r1, {r0-r12, r14} msr cpsr, r14 mov r14, pc ldr pc, [r13], #4 ldr r14, [r13], #4 stmia r14!, {r0-r12} mrs r0, cpsr str r0, [r14] ldmfd r13!, {r4-r11, pc} #elif defined(__aarch64__) stp x29, x30, [sp, #-13*8]! 
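	// frame layout for the a64 version: x29/x30 at the base,
	// callee-saved x19--x28 at offsets 16--95, and the incoming
	// register-block pointer (x1) stashed at offset 96, since x1
	// itself is about to be overwritten with the caller's registers
	// and must be recovered after the call.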
mov x29, sp stp x19, x20, [sp, #16] stp x21, x22, [sp, #32] stp x23, x24, [sp, #48] stp x25, x26, [sp, #64] stp x27, x28, [sp, #80] str x1, [sp, #96] mov x16, x0 ldr x17, [x1, #128] ldp x14, x15, [x1, #112] ldp x12, x13, [x1, #96] ldp x10, x11, [x1, #80] ldp x8, x9, [x1, #64] ldp x6, x7, [x1, #48] ldp x4, x5, [x1, #32] ldp x2, x3, [x1, #16] ldp x0, x1, [x1, #0] msr nzcv, x17 blr x16 ldr x16, [sp, #96] mrs x17, nzcv str x17, [x16, #128] stp x14, x15, [x16, #112] stp x12, x13, [x16, #96] stp x10, x11, [x16, #80] stp x8, x9, [x16, #64] stp x6, x7, [x16, #48] stp x4, x5, [x16, #32] stp x2, x3, [x16, #16] stp x0, x1, [x16, #0] ldp x19, x20, [sp, #16] ldp x21, x22, [sp, #32] ldp x23, x24, [sp, #48] ldp x25, x26, [sp, #64] ldp x27, x28, [sp, #80] ldp x29, x30, [sp], #13*8 ret #else # error "not supported" #endif endproc proc nop ret endproc ///-------------------------------------------------------------------------- /// 0x00--0x0f proc x00 // clear all 64 bits of extended traditional registers #if defined(__x86_64__) xor eax, eax // clear rax lea rbx, [0] // rbx -> _|_ loop . // iterate, decrement rcx until zero mov rdx, 0 // set rdx = 0 and esi, 0 // clear all bits of rsi sub edi, edi // set rdi = edi - edi = 0 push 0 pop rbp // pop 0 into rbp #elif defined(__i386__) xor eax, eax lea ebx, [0] loop . mov edx, 0 and esi, 0 sub edi, edi push 0 pop ebp #elif defined(__arm__) eor r0, r0, r0 rsb r1, r1, r1 0: subs r2, r2, #1 bne 0b mov r3, #0 and r4, r4, #0 sub r5, r5, r5 #elif defined(__aarch64__) eor w0, w0, w0 mov w1, wzr 0: sub w2, w2, #1 cbnz w2, 0b mov w3, #0 and w4, w4, wzr sub w5, w5, w5 #else notimpl #endif ret endproc proc x01 // advance a fibonacci pair by c steps // // on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1} // and f_{i+c}, where f_{i+1} = f_i + f_{i-1} #if defined(__x86_64__) 0: xadd rax, rdx // a, d = a + d, a // = f_{i+1} + f_i, f_{i+1} // = f_{i+2}, f_{i+1} loop 0b // advance i, decrement c, iterate #elif defined(__i386__) 0: xadd eax, edx loop 0b #elif defined(__arm__) 0: subs r2, r2, #2 add r3, r3, r0 blo 8f add r0, r0, r3 bhi 0b 8: movne r0, r3 #elif defined(__aarch64__) 0: subs x2, x2, #2 add x3, x3, x0 b.lo 8f add x0, x0, x3 b.hi 0b 8: cmov.ne x0, x3 #else notimpl #endif ret endproc proc x02 // boolean canonify a: if a = 0 on entry, leave it zero; otherwise // set a = 1 #if defined(__x86_64__) neg rax // set cf iff a /= 0 sbb rax, rax // a = a - a - cf = -cf neg rax // a = cf #elif defined(__i386__) neg eax sbb eax, eax neg eax #elif defined(__arm__) movs r1, r0 // the easy way movne r1, #1 // mvnne r1, #1 for mask cmp r0, #1 // clear cf iff a == 0 sbc r2, r0, r0 // c' = a - a - 1 + cf = cf - 1 add r2, r2, #1 // c' = cf sub r3, r0, r0, lsr #1 // d' top bit clear; d' = 0 iff a = 0 rsb r3, r3, #0 // d' top bit set iff a /= 0 mov r3, r3, lsr #31 // asr for mask rsbs r0, r0, #0 sbc r0, r0, r0 rsb r0, r0, #0 #elif defined(__aarch64__) cmp x0, #0 // trivial cset.ne x1 // csetm for mask cmp xzr, x0 // set cf iff a == 0 sbc x2, x0, x0 // c' = a - a - 1 + cf = cf - 1 neg x2, x2 // c' = 1 - cf sub x3, x0, x0, lsr #1 // if a < 2^63 then a' = ceil(d/2) < // 2^63 // if a >= 2^63, write a = 2^63 + t // with t < 2^63; d' = 2^63 - 2^62 + // ceil(t/2) = 2^62 + ceil(t/2), and // ceil(t/2) < 2^62 // anyway d' < 2^63 and d' = 0 iff // a = 0 neg x3, x3 // d' top bit set iff a /= 0 lsr x3, x3, #63 // asr for mask cmp x0, #1 // set cf iff a /= 0 adc x0, xzr, xzr // a' = 0 + 0 + cf = cf #else notimpl #endif ret endproc proc x03 // set a = min(a, d) (unsigned); clobber c, d #if 
defined(__x86_64__) sub rdx, rax // d' = d - a; set cf if a > d sbb rcx, rcx // c = -cf = -[a > d] and rcx, rdx // c = a > d ? d - a : 0 add rax, rcx // a' = a > d ? d : a #elif defined(__i386__) sub edx, eax sbb ecx, ecx and ecx, edx add eax, ecx #elif defined(__arm__) cmp r0, r3 // the easy way movlo r1, r0 // only needed for out-of-place movhs r1, r3 subs r3, r3, r0 sbc r12, r12, r12 and r12, r12, r3 add r0, r0, r12 #elif defined(__aarch64__) cmp x0, x3 // the easy way csel.lo x1, x0, x3 subs x3, x3, x0 // d' = d - a; set cf if d >= a sbc x16, xzr, xzr // t = -1 + cf = -[a > d] and x16, x16, x3 // t = a > d ? d - a : 0 add x0, x0, x16 // a' = a > d ? d : a #else notimpl #endif ret endproc proc x04 // switch case? #if defined(__x86_64__) // unrelated playing mov ecx, eax mov rbx, -1 mov edx, ecx sub edx, '0' cmp edx, 10 cmovb rbx, rdx or ecx, 0x20 mov edx, ecx sub edx, 'a' sub ecx, 'a' - 10 cmp edx, 6 cmovb rbx, rcx xor al, 0x20 #elif defined(__i386__) // unrelated playing mov ecx, eax mov ebx, -1 mov edx, ecx sub edx, '0' cmp edx, 10 cmovb ebx, edx or ecx, 0x20 mov edx, ecx sub edx, 'a' sub ecx, 'a' - 10 cmp edx, 6 cmovb ebx, ecx xor al, 0x20 #elif defined(__arm__) // unrelated playing mvn r1, #0 sub r12, r0, #'0' cmp r12, #10 movlo r1, r12 orr r12, r0, #0x20 sub r12, r12, #'a' cmp r12, #6 addlo r1, r12, #10 eor r0, r0, #0x20 #elif defined(__aarch64__) // unrelated playing mov x1, #-1 sub w16, w0, #'0' cmp w16, #10 cmov.lo x1, x16 orr w16, w0, #0x20 sub w16, w16, #'a' - 10 cmp w16, #10 ccmp.hs w16, #16, #CCMP_HS cmov.lo x1, x16 eor w0, w0, #0x20 #else notimpl #endif ret endproc proc x05 // answer whether 5 <= a 4 a > 9 or a < 5 // nc/ae/nb a' >= 4 a >= 9 or a < 5 // c/b/nae a' < 4 5 <= a < 9 // be/na a' <= 4 5 <= a <= 9 // // o a' < -2^63 + 4 -2^63 + 5 <= a < -2^63 + 9 // no a' >= -2^63 + 4 a >= -2^63 + 9 or // a < -2^63 + 5 // s -2^63 + 4 <= a' < 4 -2^63 + 9 <= a < 9 // ns a' < -2^63 + 4 or a < -2^63 + 9 or a >= 9 // a' >= 4 // ge/nl a' >= 4 a >= 9 or a < -2^63 + 5 // l/nge a' < 4 -2^63 + 5 <= a < 9 // g/nle a' > 4 a > 9 or a < -2^63 + 5 // le/ng a' <= 4 -2^63 + 5 <= a <= 9 #elif defined(__i386__) sub eax, 5 cmp eax, 4 #elif defined(__arm__) // i dimly remember having a slick way to do this way back in the // day, but i can't figure it out any more. sub r0, #5 cmp r0, #4 #elif defined(__aarch64__) // literal translation is too obvious cmp x0, #5 ccmp.hs x0, #9, #CCMP_HS #else notimpl #endif ret endproc proc x06 // leave a unchanged, but set zf if a = 0, cf if a /= 0, clear of, // set sf to msb(a) #if defined(__x86_64__) not rax // a' = -a - 1 inc rax // a' = -a neg rax // a' = a #elif defined(__i386__) not eax inc eax neg eax #elif defined(__arm__) mvn r0, r0 add r0, r0, #1 rsbs r0, r0, #0 // cf has opposite sense #elif defined(__aarch64__) mvn x0, x0 add x0, x0, #1 negs x0, x0 // cf has opposite sense #else notimpl #endif ret endproc proc x07 // same as before (?) 
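	// (that is: as in x06, a comes back unchanged, and the final
	// negation sets the flags from it -- zf iff a = 0, and cf (in the
	// x86 sense) iff a /= 0.)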
#if defined(__x86_64__) inc rax // a' = a + 1 neg rax // a' = -a - 1 inc rax // a' = -a neg rax // a' = a #elif defined(__i386__) inc eax neg eax inc eax neg eax #elif defined(__arm__) add r0, r0, #1 rsb r0, r0, #0 add r0, r0, #1 rsbs r0, r0, #0 #elif defined(__aarch64__) add x0, x0, #1 neg x0, x0 add x0, x0, #1 negs x0, x0 // cf has opposite sense #else notimpl #endif ret endproc proc x08 // floor((a + d)/2), correctly handling overflow conditions; final cf // is lsb(a + d), probably uninteresting #if defined(__x86_64__) add rax, rdx // cf || a' = a + d rcr rax, 1 // shift 65-bit result right by one // place; lsb moves into carry #elif defined(__i386__) add eax, edx rcr eax, 1 #elif defined(__arm__) // like the two-instruction a64 version sub r1, r3, r0 add r1, r0, r1, lsr #1 // the slick version, similar to the above adds r0, r0, r3 mov r0, r0, rrx #elif defined(__aarch64__) // a64 lacks a32's rrx. literal translation. adds x1, x0, x3 // cf || a' = a + d adc x16, xzr, xzr // realize cf in extra register extr x1, x16, x1, #1 // shift down one place // two instruction version: clobbers additional register. (if you // wanted the answer in any other register, even overwriting d, then // this is unnecessary.) also depends on d >= a. sub x16, x3, x0 // compute difference add x0, x0, x16, lsr #1 // add half of it (rounded down) #else notimpl #endif ret endproc proc x09 // a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3 // (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8). #if defined(__x86_64__) shr rax, 3 // a' = floor(a/8); cf = 1 if a == // 4, 5, 6, 7 (mod 8) adc rax, 0 // a' = floor(a/8) + cf #elif defined(__i386__) shr eax, 3 adc eax, 0 #elif defined(__arm__) movs r0, r0, lsr #3 adc r0, r0, #0 #elif defined(__aarch64__) tst x0, #4 orr x0, xzr, x0, lsr #3 cinc.ne x0, x0 #else notimpl #endif ret endproc proc x0a // increment c-byte little-endian bignum at rdi #if defined(__x86_64__) add byte ptr [rdi], 1 0: inc rdi adc byte ptr [rdi], 0 loop 0b #elif defined(__i386__) add byte ptr [edi], 1 0: inc edi adc byte ptr [edi], 0 loop 0b #elif defined(__arm__) mov r12, #256 // set initial carry 0: ldrb r0, [r5] subs r2, r2, #1 add r12, r0, r12, lsr #8 strb r12, [r5], #1 bne 0b #elif defined(__aarch64__) mov w17, #256 // set initial carry 0: ldrb w16, [x5] sub x2, x2, #1 add w17, w16, w17, lsr #8 strb w17, [x5], #1 cbnz x2, 0b #else notimpl #endif ret endproc proc x0b // negate double-precision d:a #if defined(__x86_64__) not rdx // d' = -d - 1 neg rax // a' = -a; // cf = 1 iff a /= 0 sbb rdx, -1 // d' = -d - cf #elif defined(__i386__) not edx neg eax sbb edx, -1 #elif defined(__arm__) // reverse subtract is awesome rsbs r0, r0, #0 rsc r3, r3, #0 #elif defined(__aarch64__) // easy way: everything is better with zero registers. negs x0, x0 ngc x3, x3 #else notimpl #endif ret endproc proc x0c // rotate is distributive over xor. 
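	// i.e., for any rotation amount n, (a XOR b) ROR n =
	// (a ROR n) XOR (b ROR n): rotating just permutes bit positions,
	// and xor acts on each bit independently, so the two operations
	// commute.  the code below checks this for n = 13.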
#if defined(__x86_64__) // rax // = a_1 || a_0 // rbx // = b_1 || b_0 mov rcx, rax // = a_1 || a_0 xor rcx, rbx // = (a_1 XOR b_1) || (a_0 XOR b_0) ror rcx, 0xd // = (a_0 XOR b_0) || (a_1 XOR b_1) ror rax, 0xd // = a_0 || a_1 ror rbx, 0xd // = b_0 || b_1 xor rax, rbx // = (a_0 XOR b_0) || (a_1 XOR b_1) cmp rax, rcx // always equal #elif defined(__i386__) mov ecx, eax // = a_1 || a_0 xor ecx, ebx // = (a_1 XOR b_1) || (a_0 XOR b_0) ror ecx, 0xd // = (a_0 XOR b_0) || (a_1 XOR b_1) ror eax, 0xd // = a_0 || a_1 ror ebx, 0xd // = b_0 || b_1 xor eax, ebx // = (a_0 XOR b_0) || (a_1 XOR b_1) cmp eax, ecx // always equal #elif defined(__arm__) // r0 // = a_1 || a_0 // r1 // = b_1 || b_0 eor r2, r0, r1 // = (a_1 XOR b_1) || (a_0 XOR b_0) mov r2, r2, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1) mov r1, r1, ror #13 // = b_0 || b_1 eor r0, r1, r0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1) cmp r0, r2 // always equal #elif defined(__aarch64__) // x0 // = a_1 || a_0 // x1 // = b_1 || b_0 eor x2, x0, x1 // = (a_1 XOR b_1) || (a_0 XOR b_0) ror x2, x2, #13 // = (a_0 XOR b_0) || (a_1 XOR b_1) ror x1, x1, #13 // = b_0 || b_1 eor x0, x1, x0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1) cmp x0, x2 // always equal #else notimpl #endif ret endproc proc x0d // and is distributive over xor. #if defined(__x86_64__) mov rdx, rbx // = b xor rbx, rcx // = b XOR c and rbx, rax // = a AND (b XOR c) and rdx, rax // = a AND b and rax, rcx // = a AND c xor rax, rdx // = (a AND b) XOR (a AND c) // = a AND (b XOR c) cmp rax, rbx // always equal #elif defined(__i386__) mov edx, ebx // = b xor ebx, ecx // = b XOR c and ebx, eax // = a AND (b XOR c) and edx, eax // = a AND b and eax, ecx // = a AND c xor eax, edx // = (a AND b) XOR (a AND c) // = a AND (b XOR c) cmp eax, ebx // always equal #elif defined(__arm__) and r3, r0, r1 // = a AND b eor r1, r1, r2 // = b XOR c and r1, r1, r0 // = a AND (b XOR c) and r0, r0, r2 // = a AND c eor r0, r0, r3 // = (a AND b) XOR (a AND c) // = a AND (b XOR c) cmp r0, r1 // always equal #elif defined(__aarch64__) and x3, x0, x1 // = a AND b eor x1, x1, x2 // = b XOR c and x1, x1, x0 // = a AND (b XOR c) and x0, x0, x2 // = a AND c eor x0, x0, x3 // = (a AND b) XOR (a AND c) // = a AND (b XOR c) cmp x0, x1 // always equal #else notimpl #endif ret endproc proc x0e // de morgan's law #if defined(__x86_64__) mov rcx, rax // = a and rcx, rbx // = a AND b not rcx // = NOT (a AND b) not rax // = NOT a not rbx // = NOT b or rax, rbx // = (NOT a) OR (NOT b) // = NOT (a AND b) cmp rax, rcx // always equal #elif defined(__i386__) mov ecx, eax // = a and ecx, ebx // = a AND b not ecx // = NOT (a AND b) not eax // = NOT a not ebx // = NOT b or eax, ebx // = (NOT a) OR (NOT b) // = NOT (a AND b) cmp eax, ecx // always equal #elif defined(__arm__) and r2, r0, r1 // = a AND b mvn r2, r2 // = NOT (a AND b) mvn r0, r0 // = NOT a mvn r1, r1 // = NOT b orr r0, r0, r1 // = (NOT a) OR (NOT b) cmp r0, r2 // always equal #elif defined(__aarch64__) and x2, x0, x1 // = a AND b mvn x2, x2 // = NOT (a AND b) mvn x0, x0 // = NOT a orn x0, x0, x1 // = (NOT a) OR (NOT b) cmp x0, x2 // always equal #else notimpl #endif ret endproc proc x0f // replace input buffer bytes with cumulative XORs with initial a; // final a is XOR of all buffer bytes and initial a. // // not sure why you'd do this. 
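	// roughly, in c (buf and n standing in for the arch-specific
	// buffer pointer and count registers):
	//
	//	for (i = 0; i < n; i++) { buf[i] ^= a; a = buf[i]; }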
#if defined(__x86_64__) 0: xor [rsi], al lodsb loop 0b #elif defined(__i386__) 0: xor [esi], al lodsb loop 0b #elif defined(__arm__) 0: ldrb r12, [r4] subs r2, r2, #1 eor r0, r0, r12 strb r0, [r4], #1 bne 0b #elif defined(__aarch64__) 0: ldrb w16, [x4] sub x2, x2, #1 eor w0, w0, w16 strb w0, [x4], #1 cbnz x2, 0b #else notimpl #endif ret endproc ///-------------------------------------------------------------------------- /// 0x10--0x1f proc x10 // four different ways to swap a pair of registers. #if defined(__x86_64__) push rax push rcx pop rax pop rcx xor rax, rcx xor rcx, rax xor rax, rcx add rax, rcx sub rcx, rax add rax, rcx neg rcx xchg rax, rcx #elif defined(__i386__) push eax push ecx pop eax pop ecx xor eax, ecx xor ecx, eax xor eax, ecx add eax, ecx sub ecx, eax add eax, ecx neg ecx xchg eax, ecx #elif defined(__arm__) stmfd r13!, {r0, r2} ldr r0, [r13, #4] ldr r2, [r13], #8 eor r0, r0, r2 eor r2, r2, r0 eor r0, r0, r2 sub r0, r0, r2 add r2, r2, r0 rsb r0, r0, r2 // don't need 3-addr with reverse-sub mov r12, r0 mov r0, r2 mov r2, r0 #elif defined(__aarch64__) // anything you can do stp x0, x2, [sp, #-16]! ldp x2, x0, [sp], #16 eor x0, x0, x2 eor x2, x2, x0 eor x0, x0, x2 // the add/sub/add thing was daft. you can do it in three if you're // clever -- and have three-address operations. sub x0, x0, x2 add x2, x2, x0 sub x0, x2, x0 // but we lack a fourth. we can't do this in fewer than three // instructions without hitting memory. only `ldp' will modify two // registers at a time, so we need at least two instructions -- but // if the first one sets one of our two registers to its final value // then we lose the other input value with no way to recover it, so // we must either write a fresh third register, or write something // other than the final value, and in both cases we need a third // instruction to fix everything up. we've done the wrong-something- // other trick twice, so here's the captain-obvious use-a-third- // register version. mov x16, x0 mov x0, x2 mov x2, x16 #else notimpl #endif ret endproc proc x11 // assuming a is initialized to zero, set a to the inclusive or of // the xor-differences of corresponding bytes in the c-byte strings // at si and di. // // in particular, a will be zero (and zf set) if and only if the two // strings are equal. #if defined(__x86_64__) 0: mov dl, [rsi] xor dl, [rdi] inc rsi inc rdi or al, dl loop 0b #elif defined(__i386__) 0: mov dl, [esi] xor dl, [edi] inc esi inc edi or al, dl loop 0b #elif defined(__arm__) 0: ldrb r1, [r4], #1 ldrb r12, [r5], #1 subs r2, r2, #1 eor r12, r12, r1 orr r0, r0, r12 bne 0b #elif defined(__aarch64__) 0: ldrb w16, [x4], #1 ldrb w17, [x5], #1 sub x2, x2, #1 eor w16, w16, w17 orr w0, w0, w16 cbnz x2, 0b #else notimpl #endif ret endproc proc x12 // an obtuse way of adding two registers. for any bit position, a // OR d is set if and only if at least one of a and d has a bit set // in that position, and a AND d is set if and only if both have a // bit set in that position. essentially, then, what we've done is // move all of the set bits in d to a, unless there's already a bit // there. this clearly doesn't change the sum. 
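	// in other words, a + d = (a OR d) + (a AND d), which is exactly
	// what the code below computes.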
#if defined(__x86_64__) mov rcx, rdx // c' = d and rdx, rax // d' = a AND d or rax, rcx // a' = a OR d add rax, rdx #elif defined(__i386__) mov ecx, edx // c' = d and edx, eax // d' = a AND d or eax, ecx // a' = a OR d add eax, edx #elif defined(__arm__) and r2, r0, r3 // c' = a AND d orr r0, r0, r3 // a' = a OR d add r0, r0, r2 #elif defined(__aarch64__) and x2, x0, x3 // c' = a AND d orr x0, x0, x3 // a' = a OR d add x0, x0, x2 #else notimpl #endif ret endproc proc x13 // ok, so this is a really obtuse way of adding a and b; the result // is in a and d. but why does it work? #if defined(__x86_64__) mov rcx, 0x40 // carry chains at most 64 long 0: mov rdx, rax // copy a' xor rax, rbx // low bits of each bitwise sum and rbx, rdx // carry bits from each bitwise sum shl rbx, 1 // carry them into next position loop 0b #elif defined(__i386__) mov ecx, 0x40 // carry chains at most 64 long 0: mov edx, eax // copy a' xor eax, ebx // low bits of each bitwise sum and ebx, edx // carry bits from each bitwise sum shl ebx, 1 // carry them into next position loop 0b #elif defined(__arm__) mov r2, #0x40 0: and r3, r0, r1 subs r2, r2, #1 eor r0, r0, r1 lsl r1, r3, #1 bne 0b #elif defined(__aarch64__) mov x2, #0x40 0: and x3, x0, x1 sub x2, x2, #1 eor x0, x0, x1 lsl x1, x3, #1 cbnz x2, 0b #else notimpl #endif ret endproc proc x14 // floor((a + d)/2), like x08. #if defined(__x86_64__) mov rcx, rax // copy a for later and rcx, rdx // carry bits xor rax, rdx // low bits of each bitwise sum shr rax, 1 // divide by 2; carries now in place add rax, rcx // add the carries; done #elif defined(__i386__) mov ecx, eax // copy a for later and ecx, edx // carry bits xor eax, edx // low bits of each bitwise sum shr eax, 1 // divide by 2; carries now in place add eax, ecx // add the carries; done #elif defined(__arm__) and r2, r0, r3 eor r0, r0, r3 add r0, r2, r0, lsr #1 #elif defined(__aarch64__) and x2, x0, x3 eor x0, x0, x3 add x0, x2, x0, lsr #1 #else notimpl #endif ret endproc proc x15 // sign extension 32 -> 64 bits. #if defined(__x86_64__) movsx rbx, eax // like this? mov rdx, 0xffffffff80000000 add rax, rdx // if bit 31 of a is set then bits // 31--63 of a' are clear; otherwise, // these bits are all set -- which is // exactly backwards xor rax, rdx // so fix it #elif defined(__i386__) movsx ebx, ax // like this? mov edx, 0xffff8000 add eax, edx // if bit 31 of a is set then bits // 31--63 of a' are clear; otherwise, // these bits are all set -- which is // exactly backwards xor eax, edx // so fix it #elif defined(__arm__) sxth r1, r0 // like this mov r12, #0x80000000 add r0, r0, r12, asr #16 eor r0, r0, r12, asr #16 #elif defined(__aarch64__) sxtw x1, w0 // like this mov x16, #0xffffffff80000000 add x0, x0, x16 eor x0, x0, x16 #else notimpl #endif ret endproc proc x16 // ??? i don't know why you'd want to calculate this. #if defined(__x86_64__) xor rax, rbx // a' = a XOR b xor rbx, rcx // b' = b XOR c mov rsi, rax // t = a XOR b add rsi, rbx // t = (a XOR b) + (b XOR c) cmovc rax, rbx // a' = cf ? b XOR c : a XOR b xor rax, rbx // a' = cf ? 0 : a XOR c cmp rax, rsi #elif defined(__i386__) xor eax, ebx // a' = a XOR b xor ebx, ecx // b' = b XOR c mov esi, eax // t = a XOR b add esi, ebx // t = (a XOR b) + (b XOR c) cmovc eax, ebx // a' = cf ? b XOR c : a XOR b xor eax, ebx // a' = cf ? 
0 : a XOR c cmp eax, esi #elif defined(__arm__) eor r0, r0, r1 eor r1, r1, r2 adds r4, r0, r1 movcs r0, r1 eor r0, r0, r1 cmp r0, r4 #elif defined(__aarch64__) eor x0, x0, x1 eor x1, x1, x2 adds x4, x0, x1 cmov.cs x0, x1 eor x0, x0, x1 cmp x0, x4 #else notimpl #endif ret endproc proc x17 // absolute value #if defined(__x86_64__) cqo // d = a < 0 ? -1 : 0 xor rax, rdx // a' = a < 0 ? -a - 1 : a sub rax, rdx // a' = a < 0 ? -a : a #elif defined(__i386__) cdq // d = a < 0 ? -1 : 0 xor eax, edx // a' = a < 0 ? -a - 1 : a sub eax, edx // a' = a < 0 ? -a : a #elif defined(__arm__) // direct approach movs r1, r0 rsbmi r1, r0, #0 // faithful-ish conversion eor r3, r0, r0, asr #31 sub r0, r3, r0, asr #31 #elif defined(__aarch64__) // direct approach tst x0, #1 << 63 cneg.ne x1, x0 // faithful-ish conversion eor x3, x0, x0, asr #63 sub x0, x3, x0, asr #63 #else notimpl #endif ret endproc proc x18 // should always set sf, clear zf, unless we get rescheduled to a // different core. #if defined(__x86_64__) rdtsc // d || a = cycles shl rdx, 0x20 or rax, rdx // a = cycles mov rcx, rax // c = cycles rdtsc // d || a = cycles' shl rdx, 0x20 or rax, rdx // a = cycles' cmp rcx, rax #elif defined(__i386__) rdtsc // d || a = cycles mov ebx, eax mov ecx, edx // c || b = cycles rdtsc // d || a = cycles' sub ebx, eax sbb ecx, edx #elif defined(__arm__) // cycle clock not available in user mode mrrc p15, 0, r0, r1, c9 mrrc p15, 0, r2, r3, c9 subs r0, r0, r2 sbcs r1, r1, r3 #elif defined(__aarch64__) // cycle clock not available in user mode mrs x0, pmccntr_el0 mrs x1, pmccntr_el0 cmp x0, x1 #else notimpl #endif ret endproc proc x19 // stupid way to capture a pointer to inline data and jump past it. // confuses the return-address predictor something chronic. worse // because amd64 calling convention doesn't usually pass arguments on // the stack. #if defined(__x86_64__) call 8f .string "hello world!\n\0" 8: call print_str add rsp, 8 ret print_str: // actually implement this ridiculous thing mov rsi, [rsp + 8] xor edx, edx 0: mov al, [rsi + rdx] inc rdx cmp al, 0 jnz 0b mov eax, SYS_write mov edi, 1 dec rdx syscall // clobbers r11 :-( ret #elif defined(__i386__) call 8f .string "hello world!\n\0" 8: call print_str add esp, 4 ret print_str: // actually implement this ridiculous thing mov ecx, [esp + 4] xor edx, edx 0: mov al, [ecx + edx] inc edx cmp al, 0 jnz 0b mov eax, SYS_write mov ebx, 1 dec edx int 0x80 ret #elif defined(__arm__) // why am i doing this? stmfd r13!, {r14} bl 8f .string "hello world!\n\0" .balign 4 8: mov r1, r14 // might as well make it easy on myself bl print_str ldmfd r13!, {pc} print_str: mov r2, #0 0: ldrb r0, [r1, r2] cmp r0, #0 addne r2, r2, #1 bne 0b mov r0, #1 mov r7, #SYS_write swi 0 bx r14 #elif defined(__aarch64__) // why am i doing this? str x30, [sp, #-16]! bl 8f .string "hello world!\n\0" .balign 4 8: mov x1, x30 // might as well make it easy on myself bl print_str ldr x30, [sp], #16 ret print_str: mov x2, #0 0: ldrb w0, [x1, x2] cmp w0, #0 cinc.ne x2, x2 b.ne 0b mov x0, #1 mov x8, #SYS_write svc #0 ret #else notimpl #endif endproc proc x1a // collect the current instruction-pointer address. this was an old // 32-bit i386 trick for position-independent code, but (a) it // confuses the return predictor, and (b) amd64 has true pc-relative // addressing. #if defined(__x86_64__) // the actual example call 0f 0: pop rax // the modern i386 trick doesn't confuse the return-address // predictor. call calladdr_rbx sub rbx, . 
- 0b // but rip-relative addressing is even better lea rcx, [rip + 0b] ret calladdr_rbx: mov rbx, [rsp] ret #elif defined(__i386__) // the actual example call 0f 0: pop eax // the modern i386 trick doesn't confuse the return-address // predictor. call get_pc_ebx sub ebx, . - 0b ret #elif defined(__arm__) stmfd r13!, {r14} bl 0f 0: mov r0, r14 bl return sub r1, r14, #. - 0b adr r2, 0b ldmfd r13!, {pc} return: bx r14 #elif defined(__aarch64__) str x30, [sp, #-16]! // we can do all of the above using a64 bl 0f 0: mov x0, x30 bl return sub x1, x30, #. - 0b adr x2, 0b ldr x30, [sp], #16 return: ret #else notimpl #endif endproc proc x1b #if defined(__x86_64__) // retpolines: an mitigation against adversarially influenced // speculative execution at indirect branches. if an adversary can // prepare a branch-target buffer entry matching an indirect branch // in the victim's address space then they can cause the victim to // /speculatively/ (but not architecturally) execute any code in // their address space, possibly leading to leaking secrets through // the cache. retpolines aren't susceptible to this because the // predicted destination address is from the return-prediction stack // which the adversary can't prime. the performance penalty is still // essentially a branch misprediction -- for this return, and // possibly all others already stacked. // (try not to crash) lea rax, [rip + 9f] push rax 9: ret #elif defined(__i386__) call get_pc_ebx lea eax, [ebx + 9f - .] push eax 9: ret #elif defined(__arm__) stmfd r13!, {r14} adr r14, 8f bx r14 8: ldmfd r13!, {pc} #elif defined(__aarch64__) str x30, [sp, #-16]! adr x30, 8f ret 8: ldr x30, [sp], #16 ret #else notimpl #endif endproc proc x1c // ok, having a hard time seeing a use for this. the most important // thing to note is that sp is set from `pop' /after/ it's // incremented. #if defined(__x86_64__) // try not to crash mov rax, rsp and rsp, -16 push rax pop rsp // check it worked mov rbx, rsp ret #elif defined(__i386__) // try not to crash mov eax, esp and esp, -16 push eax pop esp // check it worked mov ebx, esp ret #elif defined(__arm__) // not even going to dignify this notimpl #elif defined(__aarch64__) // not even going to dignify this notimpl #else notimpl #endif endproc proc x1d // monumentally cheesy way to copy 8 n bytes from buff1 to buff2. // also clobbers words at buff2 + 8 n and buff2 - 8 for good measure. n = 4 #if defined(__x86_64__) mov rax, rsp // safekeeping // we're toast if we get hit by a signal now. fingers crossed... .if 0 mov rsp, buff2 + 8*n + 8 mov rbp, buff1 + 8*n .else lea rsp, [rdi + 8*n + 16] lea rbp, [rsi + 8*n] .endif enter 0, n + 1 // precise action: // // +---------+ +---------+ // rbp -> | ??? | rsp -> | ??? | // +---------+ +---------+ // | w_{n-1} | | rbp | <- rbp' // +---------+ +---------+ // | ... | | w_{n-1} | // +---------+ +---------+ // | w_1 | | ... | // +---------+ +---------+ // | w_0 | | w_1 | // +---------+ +---------+ // | w_0 | // +---------+ // | rbp' | <- rsp' // +---------+ mov rdx, rsp mov rsp, rax #elif defined(__i386__) mov eax, esp // safekeeping // we're toast if we get hit by a signal now. fingers crossed... .if 0 mov esp, buff2 + 4*n + 4 mov ebp, buff1 + 4*n .else lea esp, [edi + 4*n + 8] lea ebp, [esi + 4*n] .endif enter 0, n + 1 mov edx, esp mov esp, eax #elif defined(__arm__) add r4, r4, #4*n add r5, r5, #4*n + 8 str r4, [r5, #-4]! .rept n/2 ldrd r0, r1, [r4, #-8]! strd r0, r1, [r5, #-8]! .endr add r4, r5, #4*n str r4, [r5, #-4]! #elif defined(__aarch64__) // omgwtf. 
let's not actually screw with the stack pointer. add x4, x4, #8*n add x5, x5, #8*n + 16 str x4, [x5, #-8]! .rept n/2 ldp x16, x17, [x4, #-16]! stp x16, x17, [x5, #-16]! .endr add x4, x5, #8*n str x4, [x5, #-8]! #else notimpl #endif ret endproc proc x1e // convert nibble value to (uppercase) hex; other input values yield // nonsense. #if defined(__x86_64__) // das doesn't work in 64-bit mode; best i can come up with mov edx, eax add al, '0' add dl, 'A' - 10 cmp al, '9' + 1 cmovae eax, edx #elif defined(__i386__) cmp al, 0x0a // cf = 1 iff a < 10 sbb al, 0x69 // if 0 <= a < 10, a' = a - 0x6a, so // 0x96 <= a' < 0x70, setting af, cf // if 10 <= a < 16, a' = a - 0x69, so // 0x71 <= a' < 0x77, setting cf but // clearing af das // if 0 <= a < 10, then af and cf are // both set, so set subtract 0x66 // from a' leaving 0x30 <= a' < 0x3a; // if 10 <= a < 16 then af clear but // cf set, so subtract 0x60 from a' // leaving 0x41 <= a' < 0x47 #elif defined(__arm__) // significantly less tricksy cmp r0, #10 addlo r0, r0, #'0' addhs r0, r0, #'A' - 10 #elif defined(__aarch64__) // with less versatile conditional execution this is the best we can // do cmp w0, #10 add w16, w0, #'A' - 10 add w0, w0, #'0' cmov.hs w0, w16 #else notimpl #endif ret endproc proc x1f // verify collatz conjecture starting at a; assume a /= 0! #if defined(__x86_64__) 0: bsf rcx, rax // clobber c if a = 0 shr rax, cl // a = 2^c a' cmp rdx, 0 je 1f stosq dec rdx 1: cmp rax, 1 // done? je 9f lea rax, [2*rax + rax + 1] // a' = 3 a' + 1 jmp 0b // again 9: ret #elif defined(__i386__) 0: bsf ecx, eax // clobber c if a = 0 shr eax, cl // a = 2^c a' cmp edx, 0 je 1f stosd dec edx 1: cmp eax, 1 // done? je 9f lea eax, [2*eax + eax + 1] // a' = 3 a' + 1 jmp 0b // again 9: ret #elif defined(__arm__) // rbit introduced in armv7 0: rbit r2, r0 clz r2, r2 mov r0, r0, lsr r2 // a = 2^c a' cmp r3, #0 strne r0, [r5], #4 subne r3, r3, #1 cmp r0, #1 adcne r0, r0, r0, lsl #1 // a' = 3 a' + 1 (because c set) bne 0b ret #elif defined(__aarch64__) 0: rbit w2, w0 clz w2, w2 lsr w0, w0, w2 // a = 2^c a' cmp x3, #0 beq 1f str x0, [x5], #8 sub x3, x3, #1 1: cmp w0, #1 add w16, w0, w0, lsl #1 // t = 3 a' + 1 (because c set) csinc.eq w0, w0, w16 b.ne 0b ret #else notimpl #endif endproc ///-------------------------------------------------------------------------- /// 0x20--0x2f proc x20 // calculate 1337 a slowly #if defined(__x86_64__) // original version mov rcx, rax // c = a shl rcx, 2 // c = 4 a add rcx, rax // c = 5 a shl rcx, 3 // c = 40 a add rcx, rax // c = 41 a shl rcx, 1 // c = 82 a add rcx, rax // c = 83 a shl rcx, 1 // c = 166 a add rcx, rax // c = 167 a shl rcx, 3 // c = 1336 a add rcx, rax // c = 1337 a // a quick way lea rdx, [2*rax + rax] // t = 3 a shl rdx, 6 // t = 192 a sub rdx, rax // t = 191 a lea rbx, [8*rdx] // b = 1528 a sub rbx, rdx // b = 1337 a #elif defined(__i386__) // original version mov ecx, eax // c = a shl ecx, 2 // c = 4 a add ecx, eax // c = 5 a shl ecx, 3 // c = 40 a add ecx, eax // c = 41 a shl ecx, 1 // c = 82 a add ecx, eax // c = 83 a shl ecx, 1 // c = 166 a add ecx, eax // c = 167 a shl ecx, 3 // c = 1336 a add ecx, eax // c = 1337 a // a quick way lea edx, [2*eax + eax] // t = 3 a shl edx, 6 // t = 192 a sub edx, eax // t = 191 a lea ebx, [8*edx] // b = 1528 a sub ebx, edx // b = 1337 a #elif defined(__arm__) // original version, ish add r2, r0, r0, lsl #2 // c = 5 a add r2, r0, r2, lsl #3 // c = 41 a add r2, r0, r2, lsl #1 // c = 83 a add r2, r0, r2, lsl #1 // c = 167 a add r2, r0, r2, lsl #3 // c = 1337 a // quicker 
way add r1, r0, r0, lsl #1 // b = 3 a rsb r1, r0, r1, lsl #6 // b = 191 a rsb r1, r1, r1, lsl #3 // b = 1337 a #elif defined(__aarch64__) // original version, ish add x2, x0, x0, lsl #2 // c = 5 a add x2, x0, x2, lsl #3 // c = 41 a add x2, x0, x2, lsl #1 // c = 83 a add x2, x0, x2, lsl #1 // c = 167 a add x2, x0, x2, lsl #3 // c = 1337 a // sleazy because no rsb add x1, x0, x0, lsl #1 // b = 3 a sub x1, x0, x1, lsl #6 // b = -191 a sub x1, x1, x1, lsl #3 // b = 1337 a #else notimpl #endif ret endproc proc x21 // multiply complex numbers a + b i and c + d i // // (a + b i) (c + d i) = (a c - b d) + (a d + b c) i // // somewhat slick approach uses only three multiplications #if defined(__x86_64__) mov rsi, rax // t = a add rax, rbx // a' = a + b mov rdi, rdx // u = d sub rdx, rcx // d' = d - c add rdi, rcx // u = c + d imul rax, rcx // a' = c (a + b) imul rsi, rdx // t = a (d - c) imul rdi, rbx // u = b (c + d) add rsi, rax // t = a (d - c) + c (a + b) mov rbx, rsi // b' = a (d - c) + c (a + b) // = a d + b c sub rax, rdi // a' = c (a + b) - b (c + d) // = a c - b d #elif defined(__i386__) mov esi, eax // t = a add eax, ebx // a' = a + b mov edi, edx // u = d sub edx, ecx // d' = d - c add edi, ecx // u = c + d imul eax, ecx // a' = c (a + b) imul esi, edx // t = a (d - c) imul edi, ebx // u = b (c + d) add esi, eax // t = a (d - c) + c (a + b) mov ebx, esi // b' = a (d - c) + c (a + b) // = a d + b c sub eax, edi // a' = c (a + b) - b (c + d) // = a c - b d #elif defined(__arm__) add r4, r0, r1 // t = a + b add r5, r2, r3 // u = c + d sub r3, r3, r2 // d' = d - c // mls introduced in armv7 mul r4, r4, r2 // t = c (a + b) mov r2, r1 // c' = a (bah!) mla r1, r0, r3, r4 // b' = a (d - c) + c (a + b) // = a d + b c mls r0, r2, r5, r4 // a' = c (a + b) - b (c + d) // = a c - b d #elif defined(__aarch64__) add x4, x0, x1 // t = a + b add x5, x2, x3 // u = c + d sub x3, x3, x2 // d' = d - c // mls intxoduced in axmv7 mul x4, x4, x2 // t = c (a + b) mov x2, x1 // c' = a (bah!) madd x1, x0, x3, x4 // b' = a (d - c) + c (a + b) // = a d + b c msub x0, x2, x5, x4 // a' = c (a + b) - b (c + d) // = a c - b d #else notimpl #endif ret endproc proc x22 // divide by 3 #if defined(__x86_64__) mov rdx, 0xaaaaaaaaaaaaaaab // = ceil(2/3 2^64) mul rdx // d' || a' =~ 2/3 a 2^64 shr rdx, 1 // d' = floor(a/3) mov rax, rdx // a' = floor(a/3) // we start with 0 <= a < 2^64. write f = ceil(2/3 2^64), so that // 2/3 < f/2^64 < 2/3 + 1/2^64. then floor(2/3 a) <= floor(a f/2^64) // <= floor(2/3 a + a/2^64), but a < 2^64 so a/2^64 < 1 and // floor(a f/2^64) = floor(2/3 a). #elif defined(__i386__) mov edx, 0xaaaaaaab // = ceil(2/3 2^32) mul edx // d' || a' =~ 2/3 a 2^32 shr edx, 1 // d' = floor(a/3) mov eax, edx // a' = floor(a/3) #elif defined(__arm__) ldr r12, =0xaaaaaaab umull r12, r0, r0, r12 mov r0, r0, lsr #1 #elif defined(__aarch64__) ldr x16, =0xaaaaaaaaaaaaaaab umulh x0, x0, x16 lsr x0, x0, #1 #else notimpl #endif ret endproc proc x23 #if defined(__x86_64__) // main loop: shorten a preserving residue class mod 3 0: cmp rax, 5 jbe 8f // a > 5 mov rdx, rax // d' = a shr rdx, 2 // d' = floor(a/4) and rax, 3 // a = 4 d' + a' (0 <= a' < 4) add rax, rdx // a' == a (mod 3) but a' < a/4 + 4 jmp 0b // fix up final value 0 <= a < 6: want 0 <= a < 3 // // the tricky part is actually a = 3; but the other final cases take // additional iterations which we can avoid. 8: cmp rax, 3 // set cf iff a < 3 cmc // set cf iff a >= 3 sbb rdx, rdx // d' = a >= 3 ? -1 : 0 and rdx, 3 // d' = a >= 3 ? 
3 : 0 sub rax, rdx // a' = a - (a >= 3 ? 3 : 0) // = a (mod 3) #elif defined(__i386__) // main loop: shorten a preserving residue class mod 3 0: cmp eax, 5 jbe 8f // a > 5 mov edx, eax // d' = a shr edx, 2 // d' = floor(a/4) and eax, 3 // a = 4 d' + a' (0 <= a' < 4) add eax, edx // a' == a (mod 3) but a' < a/4 + 4 jmp 0b // fix up final value 0 <= a < 6: want 0 <= a < 3 // // the tricky part is actually a = 3; but the other final cases take // additional iterations which we can avoid. 8: cmp eax, 3 // set cf iff a < 3 cmc // set cf iff a >= 3 sbb edx, edx // d' = a >= 3 ? -1 : 0 and edx, 3 // d' = a >= 3 ? 3 : 0 sub eax, edx // a' = a - (a >= 3 ? 3 : 0) // = a (mod 3) #elif defined(__arm__) 0: cmp r0, #6 andhs r12, r0, #3 addhs r0, r12, r0, lsr #2 bhs 0b cmp r0, #3 subhs r0, r0, #3 #elif defined(__aarch64__) 0: cmp x0, #6 // blunder on through regardless since this doesn't affect the result and x16, x0, #3 add x0, x16, x0, lsr #2 b.hs 0b subs x16, x0, #3 cmov.hs x0, x16 #else notimpl #endif ret endproc proc x24 // invert (odd) a mod 2^64 // // suppose a a_i == 1 (mod 2^{2^i}) // // clearly good for i = 0, since 2^i = 1 and 2^{2^i} = 2, and a_0 = // a == 1 (mod 2) by assumption // // write a a_i == b_i 2^{2^i} + 1 (mod 2^{2^{i+1}}) // then b_i == (a a_i - 1)/2^{2^i} (mod 2^{2^i}) // to lift inverse, we want x such that a x == -b_i (mod 2^{2^i}); // clearly x = -a_i b_i will do, since a a_i == 1 (mod 2^{2^i}) // then: // a_{i+1} = a_i - a_i b_i 2^{2^i} = a_i (1 - (a a_i - 1)) // = 2 a_i - a a_i^2 // // check: // a a_{i+1} = 2 a a_i - a^2 a_i^2 // == 2 a a_i - (b_i 2^{2^i} + 1)^2 // == 2 (b_i 2^{2^i} + 1) - // (b_i^2 2^{2^{i+1}} + 2 b_i 2^{2^i} + 1) // == 1 (mod 2^{2^{i+1}}) #if defined(__x86_64__) // rax // a_0 = a mov rbx, rax // b' = a mov rsi, rax // t = a_0 0: cmp rbp, 0 je 1f stosq dec rbp 1: mul rbx // a' = a a_i mov rcx, rax // c = a a_i sub rax, 2 // a' = a a_i - 2 neg rax // a' = 2 - a a_i mul rsi // a_{i+1} = a_i (2 - a a_i) // = 2 a_i - a a_i^2 mov rsi, rax // t = a_{i+1} cmp rcx, 1 // done? ja 0b // no -- iterate #elif defined(__i386__) // eax // a_0 = a mov ebx, eax // b' = a mov esi, eax // t = a_0 0: cmp ebp, 0 je 1f stosd dec ebp 1: mul ebx // a' = a a_i mov ecx, eax // c = a a_i sub eax, 2 // a' = a a_i - 2 jb 9f // done if < 2 neg eax // a' = 2 - a a_i mul esi // a_{i+1} = a_i (2 - a a_i) // = 2 a_i - a a_i^2 mov esi, eax // t = a_{i+1} jmp 0b // and iterate 9: mov eax, esi // restore #elif defined(__arm__) // r0 // a_0 = a mov r1, r0 // b' = a 0: cmp r6, #0 strne r0, [r5], #4 subne r6, r6, #1 mul r2, r0, r1 // c = a a_i rsbs r2, r2, #2 // c = 2 - a a_i mul r0, r0, r2 // a_{i+1} = a_i (2 - a a_i) // = 2 a_i - a a_i^2 blo 0b #elif defined(__aarch64__) // x0 // a_0 = a mov x1, x0 // b' = a mov x16, #2 // because we have no rsb 0: cmp x6, #0 b.eq 1f str x0, [x5], #8 sub x6, x6, #1 1: mul x2, x0, x1 // c = a a_i subs x2, x16, x2 // c = 2 - a a_i mul x0, x0, x2 // a_{i+1} = a_i (2 - a a_i) // = 2 a_i - a a_i^2 b.lo 0b #else notimpl #endif ret endproc proc x25 // a poor approximation to pi/4 // // think of x and y as being in 16.16 fixed-point format. we sample // points in the unit square, and determine how many of them are // within a unit quarter-circle centred at the origin. the area of // the quarter-circle is pi/4. 
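	// so if h of the 2^32 sample points satisfy x^2 + y^2 < 1 then the
	// final count gives h/2^32 =~ pi/4, i.e., pi =~ 4 h/2^32.  with x
	// and y as 16.16 fractions, the raw 16-bit squares are scaled by
	// 2^32, so r^2 < 1 exactly when x^2 + y^2, computed on the raw
	// values, has no bits set at position 32 or above.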
#if defined(__x86_64__) xor eax, eax // a = 0 mov rcx, 1 shl rcx, 0x20 // c =~ 4 billion 0: movzx rbx, cx // x = low 16 bits of c imul rbx, rbx // b = x^2 ror rcx, 0x10 // switch halves of c movzx rdx, cx // y = high 16 bits of c imul rdx, rdx // d = y^2 rol rcx, 0x10 // switch back add rbx, rdx // r^2 = x^2 + y^2 shr rbx, 0x20 // r^2 >= 1? cmp rbx, 1 // set cf iff r^2 >= 1 adc rax, 0 // and add onto accumulator loop 0b #elif defined(__i386__) // this is actually better done in 32 bits. the carry has the wrong // sense here, so instead deduct one for each point outside the // quarter-circle rather than adding one for each point inside it. xor eax, eax xor ecx, ecx 0: movzx ebx, cx imul ebx, ebx ror ecx, 0x10 movzx edx, cx imul edx, edx rol ecx, 0x10 add ebx, edx // see? sbb eax, 0 loop 0b #elif defined(__arm__) mov r0, #0 mov r2, #0 0: uxth r1, r2, ror #0 uxth r3, r2, ror #16 mul r1, r1, r1 mul r3, r3, r3 cmn r1, r3 // mlas doesn't set cf usefully addcc r0, r0, #1 adds r2, r2, #1 bne 0b #elif defined(__aarch64__) mov w0, #0 mov w2, #0 0: ubfx w1, w2, #0, #16 ubfx w3, w2, #16, #16 sub w2, w2, #1 mul w1, w1, w1 mul w3, w3, w3 cmn w1, w3 cinc.cc w0, w0 cbnz w2, 0b #else notimpl #endif ret endproc proc x26 #if defined(__x86_64__) notimpl #elif defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif endproc proc x27 #if defined(__x86_64__) notimpl #elif defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif endproc proc x28 #if defined(__x86_64__) notimpl #elif defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif endproc proc x29 #if defined(__x86_64__) notimpl #elif defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif endproc proc x2a #if defined(__x86_64__) notimpl #elif defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif endproc proc x2b #if defined(__x86_64__) notimpl #elif defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif endproc proc x2c #if defined(__x86_64__) notimpl #elif defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif endproc proc x2d #if defined(__x86_64__) notimpl #elif defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif endproc proc x2e #if defined(__x86_64__) notimpl #elif defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif endproc proc x2f #if defined(__x86_64__) notimpl #elif defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif endproc ///-------------------------------------------------------------------------- /// 0x30--0x3f proc x30 #if defined(__x86_64__) notimpl #elif defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif ret endproc proc x31 #if defined(__x86_64__) notimpl #elif defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif endproc proc x32 #if defined(__x86_64__) notimpl #elif defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif endproc proc x33 #if defined(__x86_64__) notimpl #elif 
defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif endproc proc x34 #if defined(__x86_64__) notimpl #elif defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif endproc proc x35 #if defined(__x86_64__) notimpl #elif defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif endproc proc x36 #if defined(__x86_64__) notimpl #elif defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif endproc proc x37 #if defined(__x86_64__) notimpl #elif defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif endproc proc x38 #if defined(__x86_64__) notimpl #elif defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif endproc proc x39 #if defined(__x86_64__) notimpl #elif defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif endproc proc x3a #if defined(__x86_64__) notimpl #elif defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif endproc proc x3b #if defined(__x86_64__) notimpl #elif defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif endproc proc x3c #if defined(__x86_64__) notimpl #elif defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif endproc proc x3d #if defined(__x86_64__) notimpl #elif defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif endproc proc x3e #if defined(__x86_64__) notimpl #elif defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif endproc proc x3f #if defined(__x86_64__) notimpl #elif defined(__i386__) notimpl #elif defined(__arm__) notimpl #elif defined(__aarch64__) notimpl #else notimpl #endif endproc ///----- That's all, folks --------------------------------------------------