Multiple architectures, more solutions.

[xchg-rax-rax] / xchg.S
diff --git a/xchg.S b/xchg.S

index 12aeeba..0dcb6c8 100644 (file)
--- a/xchg.S
+++ b/xchg.S
@@ -1,8 +1,75 @@
-/// -*- mode: asm; asm-comment-char: ?/ -*-
+/// -*- mode: asm; asm-comment-char: 0 -*-
+
+///--------------------------------------------------------------------------
+/// Preliminaries.
+
+#include <sys/syscall.h>
+
+#if defined(__i386__) || defined(__x86_64__)
  
         .intel_syntax noprefix
  
-       .section .note.GNU-stack, "", @progbits
+#elif defined(__arm__)
+
+.macro ret
+       bx      r14
+.endm
+
+       .arch   armv7-a
+
+#elif defined(__aarch64__)
+
+.macro cmov    rd, rn, cc
+       csel    \rd, \rn, \rd, \cc
+.endm
+#define _COND(_)                                                       \
+       _(eq) _(ne) _(cs) _(cc) _(vs) _(vc) _(mi) _(pl)                 \
+       _(ge) _(lt) _(gt) _(le) _(hi) _(ls) _(al) _(nv)                 \
+       _(hs) _(lo)
+#define _INST(_)                                                       \
+       _(ccmp) _(ccmn)                                                 \
+       _(csel) _(cmov)                                                 \
+       _(csinc) _(cinc) _(cset)                                        \
+       _(csneg) _(cneg)                                                \
+       _(csinv) _(cinv) _(csetm)
+#define _CONDVAR(cc) _definstvar cc;
+#define _INSTVARS(inst)                                                        \
+       .macro _definstvar cc;                                          \
+         .macro inst.\cc args:vararg; inst \args, \cc; .endm;          \
+       .endm;                                                          \
+       _COND(_CONDVAR);                                                \
+       .purgem _definstvar;
+       _INST(_INSTVARS)
+#undef _COND
+#undef _INST
+#undef _CONDVAR
+#undef _INSTVARS
+
+#define CCMP_N 8
+#define CCMP_Z 4
+#define CCMP_C 2
+#define CCMP_V 1
+
+#define CCMP_MI CCMP_N
+#define CCMP_PL 0
+#define CCMP_EQ CCMP_Z
+#define CCMP_NE 0
+#define CCMP_CS CCMP_C
+#define CCMP_HS CCMP_C
+#define CCMP_CC 0
+#define CCMP_LO 0
+#define CCMP_VS CCMP_V
+#define CCMP_VC 0
+#define CCMP_HI CCMP_C
+#define CCMP_LS 0
+#define CCMP_LT CCMP_N
+#define CCMP_GE 0
+#define CCMP_LE CCMP_N
+#define CCMP_GT 0
+
+#else
+#  error "not supported"
+#endif
  
  .macro proc    name
         .globl  \name
@@ -16,6 +83,36 @@
  .endm
  
  .macro ch c
+#if defined(__i386__)
+
+       pushf                           // save flags and every register the
+       push    eax                     // callees may clobber, plus ebx which
+       push    ebx                     // is used below as the got pointer
+       push    ecx
+       push    edx
+       push    ebp
+       mov     ebp, esp                // ebp remembers the unaligned esp
+       and     esp, -16
+       sub     esp, 12                 // so esp is 16-aligned after the push
+       call    get_pc_ebx              // ebx = address of the following add
+       add     ebx, offset _GLOBAL_OFFSET_TABLE_ // ebx = got, before any plt call
+       push    \c                      // argument for putchar
+       call    putchar@plt
+       mov     eax, [ebx + stdout@GOT] // eax = &stdout
+       mov     eax, [eax]              // eax = stdout
+       mov     [esp], eax              // args go on the stack: reuse the slot
+       call    fflush@plt
+
+       mov     esp, ebp                // drop alignment padding and argument
+       pop     ebp
+       pop     edx
+       pop     ecx
+       pop     ebx
+       pop     eax
+       popf
+
+#elif defined(__x86_64__)
+
         pushf
         push    rax
         push    rcx
@@ -44,12 +141,154 @@
         pop     rcx
         pop     rax
         popf
+
+#elif defined(__arm__)
+
+       stmfd   r13!, {r0-r4, r12, r14} // save caller-saved regs, r4 and lr
+       mov     r4, r13                 // r4 (callee-saved) remembers old sp
+       bic     r14, r4, #15
+       mov     r13, r14                // sp rounded down to 16 bytes
+       mov     r0, #\c
+       bl      putchar@plt
+       ldr     r14, .L$_c$gotoff$\@    // r14 = got - (gotpc + 8)
+       ldr     r0, .L$_c$gotoff$\@ + 4 // r0 = got offset of stdout's slot
+.L$_c$gotpc$\@:
+       add     r14, pc, r14            // pc reads as gotpc + 8: r14 = got
+       b       .L$_c$cont$\@           // hop over the literal words
+.L$_c$gotoff$\@:
+       .word   _GLOBAL_OFFSET_TABLE_ - .L$_c$gotpc$\@ - 8
+       .word   stdout(GOT)
+.L$_c$cont$\@:
+       ldr     r0, [r14, r0]           // r0 = &stdout
+       ldr     r0, [r0]                // r0 = stdout: argument for fflush
+       bl      fflush@plt
+       mov     r13, r4                 // undo the alignment
+       ldmfd   r13!, {r0-r4, r12, r14} // note: cpsr flags are not preserved
+
+#elif defined(__aarch64__)
+
+       sub     sp, sp, #20*8           // 20 slots: x0--x17, nzcv, x30
+       stp      x0,  x1, [sp,   #0]
+       stp      x2,  x3, [sp,  #16]
+       stp      x4,  x5, [sp,  #32]
+       stp      x6,  x7, [sp,  #48]
+       stp      x8,  x9, [sp,  #64]
+       stp     x10, x11, [sp,  #80]
+       stp     x12, x13, [sp,  #96]
+       stp     x14, x15, [sp, #112]
+       stp     x16, x17, [sp, #128]
+       mrs     x16, nzcv               // x16 is saved already: borrow it
+       stp     x16, x30, [sp, #144]    // flags and link register
+
+       mov     w0, #\c
+       bl      putchar
+       adrp    x0, :got:stdout         // page holding stdout's got slot
+       ldr     x0, [x0, #:got_lo12:stdout] // x0 = &stdout
+       ldr     x0, [x0]                // x0 = stdout
+       bl      fflush
+
+       ldp     x16, x30, [sp, #144]
+       msr     nzcv, x16               // restore flags before reloading x16
+       ldp     x16, x17, [sp, #128]
+       ldp     x14, x15, [sp, #112]
+       ldp     x12, x13, [sp,  #96]
+       ldp     x10, x11, [sp,  #80]
+       ldp      x8,  x9, [sp,  #64]
+       ldp      x6,  x7, [sp,  #48]
+       ldp      x4,  x5, [sp,  #32]
+       ldp      x2,  x3, [sp,  #16]
+       ldp      x0,  x1, [sp,   #0]
+       add     sp, sp, #20*8
+
+#else
+#  error "not supported"
+#endif
  .endm
  
+.macro notimpl
+#if defined(__i386__) || defined(__x86_64__)
+       ud2                             // x86: undefined-instruction opcode
+#elif defined(__arm__)
+       udf                             // a32: permanently undefined
+#elif defined(__aarch64__)
+       hlt     #0                      // a64: halt instruction
+#else
+#  error "not supported"
+#endif
+.endm
+
+       .section .note.GNU-stack, "", %progbits
+
         .text
  
+#if defined(__i386__)
+get_pc_ebx:
+       mov     ebx, [esp]              // ebx = our return address, i.e.,
+       ret                             // the insn following the call
+#endif
+
+
  proc   call_example
  
+#if defined(__i386__)
+
+       push    ebx                     // ebx
+       push    esi                     // esi, ebx
+       push    edi                     // edi, esi, ebx
+       push    ebp                     // ebp, ..., ebx
+       pushf                           // flags, ebp, ..., ebx
+
+       mov     edi, [esp + 4*6]        // first stack argument: func
+       mov     esi, [esp + 4*7]        // second stack argument: regs
+       push    esi                     // regs, flags, ebp, ..., ebx
+
+       call    get_pc_ebx              // ebx = address of the lea below
+       lea     eax, [ebx + 9f - .]     // eax = address of 9f
+       push    eax                     // cont, regs, flags, ebp, ..., ebx
+       push    edi                 // func, cont, regs, flags, ebp, ..., ebx
+
+       mov     eax, [esi + 28]         // flags word from regs
+       pushf
+       pop     ecx                     // current flags
+       and     eax,  0x0cd5            // take cf, pf, af, zf, sf, df, of
+       and     ecx, ~0x0cd5            // keep every other bit as it is
+       or      eax, ecx
+       push    eax
+       popf
+       mov     eax, [esi +  0]
+       mov     ebx, [esi +  4]
+       mov     ecx, [esi +  8]
+       mov     edx, [esi + 12]
+       mov     edi, [esi + 20]
+       mov     ebp, [esi + 24]
+       mov     esi, [esi + 16]         // last: esi was the base pointer
+
+       ret                            // -> func; regs, flags, ebp, ..., ebx
+
+9:     pushf                           // eflags, regs, flags, ebp, ..., ebx
+       push    esi                // esi, eflags, regs, flags, ebp, ..., ebx
+       mov     esi, [esp + 8]          // recover the regs pointer
+       mov     [esi +  0], eax
+       mov     [esi +  4], ebx
+       mov     [esi +  8], ecx
+       mov     [esi + 12], edx
+       mov     [esi + 20], edi
+       mov     [esi + 24], ebp
+       pop     eax                     // eflags, regs, flags, ebp, ..., ebx
+       mov     [esi + 16], eax         // func's final esi
+       pop     eax                     // regs, flags, ebp, ..., ebx
+       mov     [esi + 28], eax         // func's final flags
+
+       add     esp, 4                  // flags, ebp, ..., ebx
+       popf                            // ebp, ..., ebx
+       pop     ebp                     // ..., ebx
+       pop     edi
+       pop     esi
+       pop     ebx                     //
+       ret
+
+#elif defined(__x86_64__)
+
         push    rbx                     // rbx
         push    r10
         push    r11
@@ -66,7 +305,7 @@ proc call_example
         push    rax                     // cont, regs, flags, rbp, ..., rbx
         push    rdi                 // func, cont, regs, flags, rbp, ..., rbx
  
-       mov     rax, [rsi + 56]
+       mov     rax, [rsi + 8*15]
         pushf
         pop     rcx
         and     rax,  0x0cd5
@@ -74,29 +313,45 @@ proc       call_example
         or      rax, rcx
         push    rax
         popf
-       mov     rax, [rsi +  0]
-       mov     rbx, [rsi +  8]
-       mov     rcx, [rsi + 16]
-       mov     rdx, [rsi + 24]
-       mov     rdi, [rsi + 40]
-       mov     rbp, [rsi + 48]
-       mov     rsi, [rsi + 32]
+       mov     rax, [rsi +   0]
+       mov     rbx, [rsi +   8]
+       mov     rcx, [rsi +  16]
+       mov     rdx, [rsi +  24]
+       mov     rdi, [rsi +  40]
+       mov     rbp, [rsi +  48]
+       mov     r8,  [rsi +  56]
+       mov     r9,  [rsi +  64]
+       mov     r10, [rsi +  72]
+       mov     r11, [rsi +  80]
+       mov     r12, [rsi +  88]
+       mov     r13, [rsi +  96]
+       mov     r14, [rsi + 104]
+       mov     r15, [rsi + 112]
+       mov     rsi, [rsi +  32]
  
         ret                            // -> func; regs, flags, rbp, ..., rbx
  
  9:     pushf                           // rflags, regs, flags, rbp, ..., rbx
         push    rsi                // rsi, rflags, regs, flags, rbp, ..., rbx
         mov     rsi, [rsp + 16]
-       mov     [rsi +  0], rax
-       mov     [rsi +  8], rbx
-       mov     [rsi + 16], rcx
-       mov     [rsi + 24], rdx
-       mov     [rsi + 40], rdi
-       mov     [rsi + 48], rbp
+       mov     [rsi +   0], rax
+       mov     [rsi +   8], rbx
+       mov     [rsi +  16], rcx
+       mov     [rsi +  24], rdx
+       mov     [rsi +  40], rdi
+       mov     [rsi +  48], rbp
+       mov     [rsi +  56],  r8
+       mov     [rsi +  64],  r9
+       mov     [rsi +  72], r10
+       mov     [rsi +  80], r11
+       mov     [rsi +  88], r12
+       mov     [rsi +  96], r13
+       mov     [rsi + 104], r14
+       mov     [rsi + 112], r15
         pop     rax                     // rflags, regs, flags, rbp, ..., rbx
-       mov     [rsi + 32], rax
+       mov     [rsi +  32], rax
         pop     rax                     // regs, flags, rbp, ..., rbx
-       mov     [rsi + 56], rax
+       mov     [rsi + 120], rax
  
         add     rsp, 8                  // flags, rbp, ..., rbx
         popf                            // rbp, ..., rbx
@@ -110,6 +365,68 @@ proc       call_example
         pop     rbx                     //
         ret
  
+#elif defined(__arm__)
+
+       stmfd   r13!, {r0, r1, r4-r11, r14} // func, regs, r4--r11, lr
+       ldmia   r1, {r0-r12, r14}       // r0--r12 from regs; flags word in r14
+       msr     cpsr, r14               // install the flags
+       mov     r14, pc                 // a32: pc reads two insns ahead
+       ldr     pc, [r13], #4           // pop func and jump to it
+       ldr     r14, [r13], #4          // back from func: pop regs pointer
+       stmia   r14!, {r0-r12}          // store r0--r12; r14 steps past them
+       mrs     r0, cpsr
+       str     r0, [r14]               // flags word follows r12
+       ldmfd   r13!, {r4-r11, pc}      // restore r4--r11 and return
+
+#elif defined(__aarch64__)
+
+       stp     x29, x30, [sp, #-14*8]! // 13 slots, padded: sp stays 16-aligned
+       mov     x29, sp
+       stp     x19, x20, [sp,  #16]
+       stp     x21, x22, [sp,  #32]
+       stp     x23, x24, [sp,  #48]
+       stp     x25, x26, [sp,  #64]
+       stp     x27, x28, [sp,  #80]
+       str     x1, [sp, #96]           // regs pointer, wanted after the call
+
+       mov     x16, x0                 // func; x0 is about to be reloaded
+
+       ldr     x17,      [x1, #128]    // flags word
+       ldp     x14, x15, [x1, #112]
+       ldp     x12, x13, [x1,  #96]
+       ldp     x10, x11, [x1,  #80]
+       ldp      x8,  x9, [x1,  #64]
+       ldp      x6,  x7, [x1,  #48]
+       ldp      x4,  x5, [x1,  #32]
+       ldp      x2,  x3, [x1,  #16]
+       ldp      x0,  x1, [x1,   #0]    // last: x1 was the base pointer
+       msr     nzcv, x17
+
+       blr     x16
+
+       ldr     x16, [sp, #96]          // recover the regs pointer
+       mrs     x17, nzcv
+       str     x17,      [x16, #128]
+       stp     x14, x15, [x16, #112]
+       stp     x12, x13, [x16,  #96]
+       stp     x10, x11, [x16,  #80]
+       stp      x8,  x9, [x16,  #64]
+       stp      x6,  x7, [x16,  #48]
+       stp      x4,  x5, [x16,  #32]
+       stp      x2,  x3, [x16,  #16]
+       stp      x0,  x1, [x16,   #0]
+
+       ldp     x19, x20, [sp,  #16]
+       ldp     x21, x22, [sp,  #32]
+       ldp     x23, x24, [sp,  #48]
+       ldp     x25, x26, [sp,  #64]
+       ldp     x27, x28, [sp,  #80]
+       ldp     x29, x30, [sp], #14*8
+       ret                             // don't fall off the end of the proc
+#else
+#  error "not supported"
+#endif
+
  endproc
  
  proc   nop
@@ -119,19 +436,58 @@ proc      nop
  endproc
  
  ///--------------------------------------------------------------------------
+/// 0x00--0x0f
  
  proc   x00
  
         // clear all 64 bits of extended traditional registers
-       xor      eax,eax                // clear rax
-       lea      rbx,[0]                // rbx -> _|_
+
+#if defined(__x86_64__)
+
+       xor      eax, eax               // clear rax
+       lea      rbx, [0]               // rbx -> _|_
         loop     .                      // iterate, decrement rcx until zero
-       mov      rdx,0                  // set rdx = 0
-       and      esi,0                  // clear all bits of rsi
-       sub      edi,edi                // set rdi = edi - edi = 0
+       mov      rdx, 0                 // set rdx = 0
+       and      esi, 0                 // clear all bits of rsi
+       sub      edi, edi               // set rdi = edi - edi = 0
         push     0
         pop      rbp                    // pop 0 into rbp
  
+#elif defined(__i386__)
+
+       xor     eax, eax                // clear eax
+       lea     ebx, [0]                // ebx = address 0
+       loop    .                       // decrement ecx until zero
+       mov     edx, 0                  // set edx = 0
+       and     esi, 0                  // clear all bits of esi
+       sub     edi, edi                // set edi = edi - edi = 0
+       push    0
+       pop     ebp                     // pop 0 into ebp
+
+#elif defined(__arm__)
+
+       eor     r0, r0, r0              // r0 = r0 XOR r0 = 0
+       rsb     r1, r1, r1              // r1 = r1 - r1 = 0
+0:     subs    r2, r2, #1              // count r2 down ...
+       bne     0b                      // ... until it reaches zero
+       mov     r3, #0                  // set r3 = 0
+       and     r4, r4, #0              // clear all bits of r4
+       sub     r5, r5, r5              // r5 = r5 - r5 = 0
+
+#elif defined(__aarch64__)
+
+       eor     w0, w0, w0              // w-register writes clear all of x0
+       mov     w1, wzr                 // copy of the zero register
+0:     sub     w2, w2, #1              // count w2 down ...
+       cbnz    w2, 0b                  // ... until it reaches zero
+       mov     w3, #0                  // set x3 = 0
+       and     w4, w4, wzr             // anything AND 0 = 0
+       sub     w5, w5, w5              // w5 - w5 = 0
+
+#else
+       notimpl
+#endif
+
         ret
  
  endproc
@@ -142,11 +498,43 @@ proc      x01
         //
         // on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1}
         // and f_{i+c}, where f_{i+1} = f_i + f_{i-1}
+
+#if defined(__x86_64__)
+
  0:     xadd    rax, rdx                // a, d = a + d, a
                                         //      = f_{i+1} + f_i, f_{i+1}
                                         //      = f_{i+2}, f_{i+1}
         loop    0b                      // advance i, decrement c, iterate
  
+#elif defined(__i386__)
+
+0:     xadd    eax, edx                // a, d = a + d, a
+       loop    0b                      // decrement c, iterate
+
+#elif defined(__arm__)
+
+0:     subs    r2, r2, #2              // c -= 2; lo if c was below 2
+       add     r3, r3, r0              // d' = d + a; flags untouched
+       blo     8f                      // fewer than two steps were left
+       add     r0, r0, r3              // a' = a + d'
+       bhi     0b                      // c still nonzero: go again
+
+8:     movne   r0, r3                  // ne iff we came via blo: a' = d'
+
+#elif defined(__aarch64__)
+
+0:     subs    x2, x2, #2              // c -= 2; lo if c was below 2
+       add     x3, x3, x0              // d' = d + a; flags untouched
+       b.lo    8f                      // fewer than two steps were left
+       add     x0, x0, x3              // a' = a + d'
+       b.hi    0b                      // c still nonzero: go again
+
+8:     cmov.ne x0, x3                  // ne iff we came via b.lo: a' = d'
+
+#else
+       notimpl
+#endif
+
         ret
  
  endproc
@@ -155,10 +543,63 @@ proc      x02
  
         // boolean canonify a: if a = 0 on entry, leave it zero; otherwise
         // set a = 1
+
+#if defined(__x86_64__)
+
         neg     rax                     // set cf iff a /= 0
         sbb     rax, rax                // a = a - a - cf = -cf
         neg     rax                     // a = cf
  
+#elif defined(__i386__)
+
+       neg     eax                     // set cf iff a /= 0
+       sbb     eax, eax                // a = a - a - cf = -cf
+       neg     eax                     // a = cf
+
+#elif defined(__arm__)
+
+       movs    r1, r0                  // the easy way
+       movne   r1, #1                  // mvnne r1, #0 for mask
+
+       cmp     r0, #1                  // clear cf iff a == 0
+       sbc     r2, r0, r0              // c' = a - a - 1 + cf = cf - 1
+       add     r2, r2, #1              // c' = cf
+
+       sub     r3, r0, r0, lsr #1      // d' = ceil(a/2) <= 2^31; 0 iff a = 0
+       rsb     r3, r3, #0              // d' top bit set iff a /= 0
+       mov     r3, r3, lsr #31         // asr for mask
+
+       rsbs    r0, r0, #0              // clear cf iff a /= 0
+       sbc     r0, r0, r0              // a' = a - a - 1 + cf = cf - 1
+       rsb     r0, r0, #0              // a' = 1 - cf
+
+#elif defined(__aarch64__)
+
+       cmp     x0, #0                  // trivial
+       cset.ne x1                      // csetm for mask
+
+       cmp     xzr, x0                 // set cf iff a == 0
+       sbc     x2, x0, x0              // c' = a - a - 1 + cf = cf - 1
+       neg     x2, x2                  // c' = 1 - cf
+
+       sub     x3, x0, x0, lsr #1      // if a < 2^63 then d' = ceil(a/2) <
+                                       // 2^63
+                                       // if a >= 2^63, write a = 2^63 + t
+                                       // with t < 2^63; d' = 2^63 - 2^62 +
+                                       // ceil(t/2) = 2^62 + ceil(t/2), and
+                                       // ceil(t/2) <= 2^62
+                                       // anyway d' <= 2^63 and d' = 0 iff
+                                       // a = 0
+       neg     x3, x3                  // d' top bit set iff a /= 0
+       lsr     x3, x3, #63             // asr for mask
+
+       cmp     x0, #1                  // set cf iff a /= 0
+       adc     x0, xzr, xzr            // a' = 0 + 0 + cf = cf
+
+#else
+       notimpl
+#endif
+
         ret
  
  endproc
@@ -166,11 +607,46 @@ endproc
  proc   x03
  
         // set a = min(a, d) (unsigned); clobber c, d
+
+#if defined(__x86_64__)
+
         sub     rdx, rax                // d' = d - a; set cf if a > d
         sbb     rcx, rcx                // c = -cf = -[a > d]
         and     rcx, rdx                // c = a > d ? d - a : 0
         add     rax, rcx                // a' = a > d ? d : a
  
+#elif defined(__i386__)
+
+       sub     edx, eax                // d' = d - a; set cf if a > d
+       sbb     ecx, ecx                // c = -cf = -[a > d]
+       and     ecx, edx                // c = a > d ? d - a : 0
+       add     eax, ecx                // a' = a > d ? d : a
+
+#elif defined(__arm__)
+
+       cmp     r0, r3                  // the easy way
+       movlo   r1, r0                  // only needed for out-of-place
+       movhs   r1, r3
+
+       subs    r3, r3, r0              // d' = d - a; set cf if d >= a
+       sbc     r12, r12, r12           // t = -1 + cf = -[a > d]
+       and     r12, r12, r3            // t = a > d ? d - a : 0
+       add     r0, r0, r12             // a' = a > d ? d : a
+
+#elif defined(__aarch64__)
+
+       cmp     x0, x3                  // the easy way
+       csel.lo x1, x0, x3              // x1 = a < d ? a : d
+
+       subs    x3, x3, x0              // d' = d - a; set cf if d >= a
+       sbc     x16, xzr, xzr           // t = -1 + cf = -[a > d]
+       and     x16, x16, x3            // t = a > d ? d - a : 0
+       add     x0, x0, x16             // a' = a > d ? d : a
+
+#else
+       notimpl
+#endif
+
         ret
  
  endproc
@@ -178,8 +654,76 @@ endproc
  proc   x04
  
         // switch case?
+
+#if defined(__x86_64__)
+
+  // unrelated playing
+  mov  ecx, eax
+  mov  rbx, -1
+  mov  edx, ecx
+  sub  edx, '0'
+  cmp  edx, 10
+  cmovb        rbx, rdx
+  or   ecx, 0x20
+  mov  edx, ecx
+  sub  edx, 'a'
+  sub  ecx, 'a' - 10
+  cmp  edx, 6
+  cmovb        rbx, rcx
+
+       xor     al, 0x20
+
+#elif defined(__i386__)
+
+  // unrelated playing
+  mov  ecx, eax
+  mov  ebx, -1
+  mov  edx, ecx
+  sub  edx, '0'
+  cmp  edx, 10
+  cmovb        ebx, edx
+  or   ecx, 0x20
+  mov  edx, ecx
+  sub  edx, 'a'
+  sub  ecx, 'a' - 10
+  cmp  edx, 6
+  cmovb        ebx, ecx
+
         xor     al, 0x20
  
+#elif defined(__arm__)
+
+  // unrelated playing
+  mvn  r1, #0
+  sub  r12, r0, #'0'
+  cmp  r12, #10
+  movlo        r1, r12
+  orr  r12, r0, #0x20
+  sub  r12, r12, #'a'
+  cmp  r12, #6
+  addlo        r1, r12, #10
+
+       eor     r0, r0, #0x20
+
+#elif defined(__aarch64__)
+
+  // unrelated playing
+  mov  x1, #-1
+  sub  w16, w0, #'0'
+  cmp  w16, #10
+  cmov.lo      x1, x16
+  orr  w16, w0, #0x20
+  sub  w16, w16, #'a' - 10
+  cmp  w16, #10
+  ccmp.hs      w16, #16, #CCMP_HS
+  cmov.lo      x1, x16
+
+       eor     w0, w0, #0x20
+
+#else
+       notimpl
+#endif
+
         ret
  
  endproc
@@ -187,6 +731,9 @@ endproc
  proc   x05
  
         // answer whether 5 <= a </<= 9.
+
+#if defined(__x86_64__)
+
         sub     rax, 5                  // a' = a - 5
         cmp     rax, 4                  // is a' - 5 </<= 4?
  
@@ -211,6 +758,28 @@ proc       x05
         // g/nle        a' > 4                  a > 9 or a < -2^63 + 5
         // le/ng        a' <= 4                 -2^63 + 5 <= a <= 9
  
+#elif defined(__i386__)
+
+       sub     eax, 5                  // a' = a - 5
+       cmp     eax, 4                  // compare a' against 4
+
+#elif defined(__arm__)
+
+       // i dimly remember having a slick way to do this way back in the
+       // day, but i can't figure it out any more.
+       sub     r0, #5                  // a' = a - 5
+       cmp     r0, #4                  // compare a' against 4
+
+#elif defined(__aarch64__)
+
+       // literal translation is too obvious
+       cmp     x0, #5                  // hs iff a >= 5 (unsigned)
+       ccmp.hs x0, #9, #CCMP_HS        // if so compare with 9; else force hs
+
+#else
+       notimpl
+#endif
+
         ret
  
  endproc
@@ -219,10 +788,35 @@ proc      x06
  
         // leave a unchanged, but set zf if a = 0, cf if a /= 0, clear of,
         // set sf to msb(a)
+
+#if defined(__x86_64__)
+
         not     rax                     // a' = -a - 1
         inc     rax                     // a' = -a
         neg     rax                     // a' = a
  
+#elif defined(__i386__)
+
+       not     eax
+       inc     eax
+       neg     eax
+
+#elif defined(__arm__)
+
+       mvn     r0, r0
+       add     r0, r0, #1
+       rsbs    r0, r0, #0              // cf has opposite sense
+
+#elif defined(__aarch64__)
+
+       mvn     x0, x0
+       add     x0, x0, #1
+       negs    x0, x0                  // cf has opposite sense
+
+#else
+       notimpl
+#endif
+
         ret
  
  endproc
@@ -230,11 +824,39 @@ endproc
  proc   x07
  
         // same as before (?)
+
+#if defined(__x86_64__)
+
         inc     rax                     // a' = a + 1
         neg     rax                     // a' = -a - 1
         inc     rax                     // a' = -a
         neg     rax                     // a' = a
  
+#elif defined(__i386__)
+
+       inc     eax                     // a' = a + 1
+       neg     eax                     // a' = -a - 1
+       inc     eax                     // a' = -a
+       neg     eax                     // a' = a
+
+#elif defined(__arm__)
+
+       add     r0, r0, #1              // a' = a + 1
+       rsb     r0, r0, #0              // a' = -a - 1
+       add     r0, r0, #1              // a' = -a
+       rsbs    r0, r0, #0              // a' = a; cf has opposite sense
+
+#elif defined(__aarch64__)
+
+       add     x0, x0, #1              // a' = a + 1
+       neg     x0, x0                  // a' = -a - 1
+       add     x0, x0, #1              // a' = -a
+       negs    x0, x0                  // cf has opposite sense
+
+#else
+       notimpl
+#endif
+
         ret
  
  endproc
@@ -243,10 +865,45 @@ proc      x08
  
         // floor((a + d)/2), correctly handling overflow conditions; final cf
         // is lsb(a + d), probably uninteresting
+
+#if defined(__x86_64__)
+
         add     rax, rdx                // cf || a' = a + d
         rcr     rax, 1                  // shift 65-bit result right by one
                                         // place; lsb moves into carry
  
+#elif defined(__i386__)
+
+       add     eax, edx
+       rcr     eax, 1
+
+#elif defined(__arm__)
+
+       // like the two-instruction a64 version
+       sub     r1, r3, r0
+       add     r1, r0, r1, lsr #1
+
+       // the slick version, similar to the above
+       adds    r0, r0, r3
+       mov     r0, r0, rrx
+
+#elif defined(__aarch64__)
+
+       // a64 lacks a32's rrx.  literal translation.
+       adds    x1, x0, x3              // cf || a' = a + d
+       adc     x16, xzr, xzr           // realize cf in extra register
+       extr    x1, x16, x1, #1         // shift down one place
+
+       // two instruction version: clobbers additional register.  (if you
+       // wanted the answer in any other register, even overwriting d, then
+       // this is unnecessary.)  also depends on d >= a.
+       sub     x16, x3, x0             // compute difference
+       add     x0, x0, x16, lsr #1     // add half of it (rounded down)
+
+#else
+       notimpl
+#endif
+
         ret
  
  endproc
@@ -255,10 +912,33 @@ proc      x09
  
         // a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3
         // (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8).
+
+#if defined(__x86_64__)
+
         shr     rax, 3                  // a' = floor(a/8); cf = 1 if a ==
                                         // 4, 5, 6, 7 (mod 8)
         adc     rax, 0                  // a' = floor(a/8) + cf
  
+#elif defined(__i386__)
+
+       shr     eax, 3                  // a' = floor(a/8); cf = bit 2 of a
+       adc     eax, 0                  // a' = floor(a/8) + cf
+
+#elif defined(__arm__)
+
+       movs    r0, r0, lsr #3          // a' = floor(a/8); cf = bit 2 of a
+       adc     r0, r0, #0              // a' = floor(a/8) + cf
+
+#elif defined(__aarch64__)
+
+       tst     x0, #4                  // ne iff bit 2 of a is set
+       orr     x0, xzr, x0, lsr #3     // a' = floor(a/8); flags untouched
+       cinc.ne x0, x0                  // round up if bit 2 was set
+
+#else
+       notimpl
+#endif
+
         ret
  
  endproc
@@ -266,11 +946,43 @@ endproc
  proc   x0a
  
         // increment c-byte little-endian bignum at rdi
+
+#if defined(__x86_64__)
+
         add     byte ptr [rdi], 1
  0:     inc     rdi
         adc     byte ptr [rdi], 0
         loop    0b
  
+#elif defined(__i386__)
+
+       add     byte ptr [edi], 1       // bump first byte; cf = carry out
+0:     inc     edi                     // next byte; inc leaves cf alone
+       adc     byte ptr [edi], 0       // propagate the carry
+       loop    0b
+
+#elif defined(__arm__)
+
+       mov     r12, #256               // set initial carry
+0:     ldrb    r0, [r5]                // fetch next byte
+       subs    r2, r2, #1              // count down; flags kept for bne
+       add     r12, r0, r12, lsr #8    // byte + carry in; bit 8 = carry out
+       strb    r12, [r5], #1           // store low byte and advance
+       bne     0b
+
+#elif defined(__aarch64__)
+
+       mov     w17, #256               // set initial carry
+0:     ldrb    w16, [x5]               // fetch next byte
+       sub     x2, x2, #1              // count down
+       add     w17, w16, w17, lsr #8   // byte + carry in; bit 8 = carry out
+       strb    w17, [x5], #1           // store low byte and advance
+       cbnz    x2, 0b
+
+#else
+       notimpl
+#endif
+
         ret
  
  endproc
@@ -278,11 +990,36 @@ endproc
  proc   x0b
  
         // negate double-precision d:a
+
+#if defined(__x86_64__)
+
         not     rdx                     // d' = -d - 1
         neg     rax                     // a' = -a;
                                         // cf = 1 iff a /= 0
         sbb     rdx, -1                 // d' = -d - cf
  
+#elif defined(__i386__)
+
+       not     edx                     // d' = -d - 1
+       neg     eax                     // a' = -a; cf = 1 iff a /= 0
+       sbb     edx, -1                 // d' = -d - cf
+
+#elif defined(__arm__)
+
+       // reverse subtract is awesome
+       rsbs    r0, r0, #0              // a' = 0 - a; cf clear iff a /= 0
+       rsc     r3, r3, #0              // d' = 0 - d - 1 + cf
+
+#elif defined(__aarch64__)
+
+       // easy way: everything is better with zero registers.
+       negs    x0, x0                  // a' = 0 - a; cf clear iff a /= 0
+       ngc     x3, x3                  // d' = 0 - d - 1 + cf
+
+#else
+       notimpl
+#endif
+
         ret
  
  endproc
@@ -291,6 +1028,8 @@ proc       x0c
  
         // rotate is distributive over xor.
  
+#if defined(__x86_64__)
+
         // rax                          // = a_1 || a_0
         // rbx                          // = b_1 || b_0
         mov     rcx, rax                // = a_1 || a_0
@@ -304,6 +1043,48 @@ proc      x0c
  
         cmp     rax, rcx                // always equal
  
+#elif defined(__i386__)
+
+       mov     ecx, eax                // = a_1 || a_0
+
+       xor     ecx, ebx                // = (a_1 XOR b_1) || (a_0 XOR b_0)
+       ror     ecx, 0xd                // = (a_0 XOR b_0) || (a_1 XOR b_1)
+
+       ror     eax, 0xd                // = a_0 || a_1
+       ror     ebx, 0xd                // = b_0 || b_1
+       xor     eax, ebx                // = (a_0 XOR b_0) || (a_1 XOR b_1)
+
+       cmp     eax, ecx                // always equal
+
+#elif defined(__arm__)
+
+
+       // r0                           // = a_1 || a_0
+       // r1                           // = b_1 || b_0
+       eor     r2, r0, r1              // = (a_1 XOR b_1) || (a_0 XOR b_0)
+       mov     r2, r2, ror #13         // = (a_0 XOR b_0) || (a_1 XOR b_1)
+
+       mov     r1, r1, ror #13         // = b_0 || b_1
+       eor     r0, r1, r0, ror #13     // = (a_0 XOR b_0) || (a_1 XOR b_1)
+
+       cmp     r0, r2                  // always equal
+
+#elif defined(__aarch64__)
+
+       // x0                           // = a_1 || a_0
+       // x1                           // = b_1 || b_0
+       eor     x2, x0, x1              // = (a_1 XOR b_1) || (a_0 XOR b_0)
+       ror     x2, x2, #13             // = (a_0 XOR b_0) || (a_1 XOR b_1)
+
+       ror     x1, x1, #13             // = b_0 || b_1
+       eor     x0, x1, x0, ror #13     // = (a_0 XOR b_0) || (a_1 XOR b_1)
+
+       cmp     x0, x2                  // always equal
+
+#else
+       notimpl
+#endif
+
         ret
  
  endproc
@@ -312,6 +1093,8 @@ proc       x0d
  
         // and is distributive over xor.
  
+#if defined(__x86_64__)
+
         mov     rdx, rbx                // = b
  
         xor     rbx, rcx                // = b XOR c
@@ -324,6 +1107,50 @@ proc      x0d
  
         cmp     rax, rbx                // always equal
  
+#elif defined(__i386__)
+
+       mov     edx, ebx                // = b
+
+       xor     ebx, ecx                // = b XOR c
+       and     ebx, eax                // = a AND (b XOR c)
+
+       and     edx, eax                // = a AND b
+       and     eax, ecx                // = a AND c
+       xor     eax, edx                // = (a AND b) XOR (a AND c)
+                                       // = a AND (b XOR c)
+
+       cmp     eax, ebx                // always equal
+
+#elif defined(__arm__)
+
+       and     r3, r0, r1              // = a AND b
+
+       eor     r1, r1, r2              // = b XOR c
+       and     r1, r1, r0              // = a AND (b XOR c)
+
+       and     r0, r0, r2              // = a AND c
+       eor     r0, r0, r3              // = (a AND b) XOR (a AND c)
+                                       // = a AND (b XOR c)
+
+       cmp     r0, r1                  // always equal
+
+#elif defined(__aarch64__)
+
+       and     x3, x0, x1              // = a AND b
+
+       eor     x1, x1, x2              // = b XOR c
+       and     x1, x1, x0              // = a AND (b XOR c)
+
+       and     x0, x0, x2              // = a AND c
+       eor     x0, x0, x3              // = (a AND b) XOR (a AND c)
+                                       // = a AND (b XOR c)
+
+       cmp     x0, x1                  // always equal
+
+#else
+       notimpl
+#endif
+
         ret
  
  endproc
@@ -332,6 +1159,8 @@ proc       x0e
  
         // de morgan's law
  
+#if defined(__x86_64__)
+
         mov     rcx, rax                // = a
  
         and     rcx, rbx                // = a AND b
@@ -342,7 +1171,46 @@ proc      x0e
         or      rax, rbx                // = (NOT a) OR (NOT b)
                                         // = NOT (a AND b)
  
-       cmp     rax, rcx
+       cmp     rax, rcx                // always equal
+
+#elif defined(__i386__)
+
+       mov     ecx, eax                // = a
+
+       and     ecx, ebx                // = a AND b
+       not     ecx                     // = NOT (a AND b)
+
+       not     eax                     // = NOT a
+       not     ebx                     // = NOT b
+       or      eax, ebx                // = (NOT a) OR (NOT b)
+                                       // = NOT (a AND b)
+
+       cmp     eax, ecx                // always equal
+
+#elif defined(__arm__)
+
+       and     r2, r0, r1              // = a AND b
+       mvn     r2, r2                  // = NOT (a AND b)
+
+       mvn     r0, r0                  // = NOT a
+       mvn     r1, r1                  // = NOT b
+       orr     r0, r0, r1              // = (NOT a) OR (NOT b)
+
+       cmp     r0, r2                  // always equal
+
+#elif defined(__aarch64__)
+
+       and     x2, x0, x1              // = a AND b
+       mvn     x2, x2                  // = NOT (a AND b)
+
+       mvn     x0, x0                  // = NOT a
+       orn     x0, x0, x1              // = (NOT a) OR (NOT b)
+
+       cmp     x0, x2                  // always equal
+
+#else
+       notimpl
+#endif
  
         ret
  
@@ -355,20 +1223,51 @@ proc     x0f
         //
         // not sure why you'd do this.
  
-       cld
+#if defined(__x86_64__)
  
  0:     xor     [rsi], al
         lodsb
         loop    0b
  
+#elif defined(__i386__)
+
+0:     xor     [esi], al               // fold al into the byte at esi
+       lodsb                           // al = updated byte; step esi
+       loop    0b
+
+#elif defined(__arm__)
+
+0:     ldrb    r12, [r4]               // fetch next byte
+       subs    r2, r2, #1              // count down; flags kept for bne
+       eor     r0, r0, r12             // fold it into the running value
+       strb    r0, [r4], #1            // store running value and advance
+       bne     0b
+
+#elif defined(__aarch64__)
+
+0:     ldrb    w16, [x4]               // fetch next byte
+       sub     x2, x2, #1              // count down
+       eor     w0, w0, w16             // fold it into the running value
+       strb    w0, [x4], #1            // store running value and advance
+       cbnz    x2, 0b
+
+#else
+       notimpl
+#endif
+
         ret
  
  endproc
  
+///--------------------------------------------------------------------------
+/// 0x10--0x1f
+
  proc   x10
  
         // four different ways to swap a pair of registers.
  
+#if defined(__x86_64__)
+
         push    rax
         push    rcx
         pop     rax
@@ -385,6 +1284,76 @@ proc      x10
  
         xchg    rax, rcx
  
+#elif defined(__i386__)
+
+       push    eax
+       push    ecx
+       pop     eax
+       pop     ecx
+
+       xor     eax, ecx
+       xor     ecx, eax
+       xor     eax, ecx
+
+       add     eax, ecx
+       sub     ecx, eax
+       add     eax, ecx
+       neg     ecx
+
+       xchg    eax, ecx
+
+#elif defined(__arm__)
+
+       stmfd   r13!, {r0, r2}
+       ldr     r0, [r13, #4]
+       ldr     r2, [r13], #8
+
+       eor     r0, r0, r2
+       eor     r2, r2, r0
+       eor     r0, r0, r2
+
+       sub     r0, r0, r2
+       add     r2, r2, r0
+       rsb     r0, r0, r2              // don't need 3-addr with reverse-sub
+
+       mov     r12, r0                 // t = a (stash in scratch r12)
+       mov     r0, r2                  // a' = c
+       mov     r2, r12                 // c' = t = a; swap complete
+
+#elif defined(__aarch64__)
+
+       // anything you can do
+       stp     x0, x2, [sp, #-16]!
+       ldp     x2, x0, [sp], #16
+
+       eor     x0, x0, x2
+       eor     x2, x2, x0
+       eor     x0, x0, x2
+
+       // the add/sub/add thing was daft.  you can do it in three if you're
+       // clever -- and have three-address operations.
+       sub     x0, x0, x2
+       add     x2, x2, x0
+       sub     x0, x2, x0
+
+       // but we lack a fourth.  we can't do this in fewer than three
+       // instructions without hitting memory.  only `ldp' will modify two
+       // registers at a time, so we need at least two instructions -- but
+       // if the first one sets one of our two registers to its final value
+       // then we lose the other input value with no way to recover it, so
+       // we must either write a fresh third register, or write something
+       // other than the final value, and in both cases we need a third
+       // instruction to fix everything up.  we've done the wrong-something-
+       // other trick twice, so here's the captain-obvious use-a-third-
+       // register version.
+       mov     x16, x0
+       mov     x0, x2
+       mov     x2, x16
+
+#else
+       notimpl
+#endif
+
         ret
  
  endproc
@@ -398,6 +1367,8 @@ proc       x11
         // in particular, a will be zero (and zf set) if and only if the two
         // strings are equal.
  
+#if defined(__x86_64__)
+
  0:     mov     dl, [rsi]
         xor     dl, [rdi]
         inc     rsi
@@ -405,6 +1376,37 @@ proc      x11
         or      al, dl
         loop    0b
  
+#elif defined(__i386__)
+
+0:     mov     dl, [esi]
+       xor     dl, [edi]
+       inc     esi
+       inc     edi
+       or      al, dl
+       loop    0b
+
+#elif defined(__arm__)
+
+0:     ldrb    r1, [r4], #1
+       ldrb    r12, [r5], #1
+       subs    r2, r2, #1
+       eor     r12, r12, r1
+       orr     r0, r0, r12
+       bne     0b
+
+#elif defined(__aarch64__)
+
+0:     ldrb    w16, [x4], #1
+       ldrb    w17, [x5], #1
+       sub     x2, x2, #1
+       eor     w16, w16, w17
+       orr     w0, w0, w16
+       cbnz    x2, 0b
+
+#else
+       notimpl
+#endif
+
         ret
  
  endproc
@@ -418,11 +1420,36 @@ proc     x12
         // move all of the set bits in d to a, unless there's already a bit
         // there.  this clearly doesn't change the sum.
  
+#if defined(__x86_64__)
+
         mov     rcx, rdx                // c' = d
         and     rdx, rax                // d' = a AND d
         or      rax, rcx                // a' = a OR d
         add     rax, rdx
  
+#elif defined(__i386__)
+
+       mov     ecx, edx                // c' = d
+       and     edx, eax                // d' = a AND d
+       or      eax, ecx                // a' = a OR d
+       add     eax, edx
+
+#elif defined(__arm__)
+
+       and     r2, r0, r3              // c' = a AND d
+       orr     r0, r0, r3              // a' = a OR d
+       add     r0, r0, r2
+
+#elif defined(__aarch64__)
+
+       and     x2, x0, x3              // c' = a AND d
+       orr     x0, x0, x3              // a' = a OR d
+       add     x0, x0, x2
+
+#else
+       notimpl
+#endif
+
         ret
  
  endproc
@@ -432,13 +1459,46 @@ proc     x13
         // ok, so this is a really obtuse way of adding a and b; the result
         // is in a and d.  but why does it work?
  
+#if defined(__x86_64__)
+
         mov     rcx, 0x40               // carry chains at most 64 long
  0:     mov     rdx, rax                // copy a'
         xor     rax, rbx                // low bits of each bitwise sum
         and     rbx, rdx                // carry bits from each bitwise sum
-       shl     rbx, 001                // carry them into next position
+       shl     rbx, 1                  // carry them into next position
+       loop    0b
+
+#elif defined(__i386__)
+
+       mov     ecx, 0x40               // carry chains at most 64 long
+0:     mov     edx, eax                // copy a'
+       xor     eax, ebx                // low bits of each bitwise sum
+       and     ebx, edx                // carry bits from each bitwise sum
+       shl     ebx, 1                  // carry them into next position
         loop    0b
  
+#elif defined(__arm__)
+
+       mov     r2, #0x40
+0:     and     r3, r0, r1
+       subs    r2, r2, #1
+       eor     r0, r0, r1
+       lsl     r1, r3, #1
+       bne     0b
+
+#elif defined(__aarch64__)
+
+       mov     x2, #0x40
+0:     and     x3, x0, x1
+       sub     x2, x2, #1
+       eor     x0, x0, x1
+       lsl     x1, x3, #1
+       cbnz    x2, 0b
+
+#else
+       notimpl
+#endif
+
         ret
  
  endproc
@@ -447,6 +1507,8 @@ proc       x14
  
         // floor((a + d)/2), like x08.
  
+#if defined(__x86_64__)
+
         mov     rcx, rax                // copy a for later
         and     rcx, rdx                // carry bits
  
@@ -455,6 +1517,32 @@ proc      x14
  
         add     rax, rcx                // add the carries; done
  
+#elif defined(__i386__)
+
+       mov     ecx, eax                // copy a for later
+       and     ecx, edx                // carry bits
+
+       xor     eax, edx                // low bits of each bitwise sum
+       shr     eax, 1                  // divide by 2; carries now in place
+
+       add     eax, ecx                // add the carries; done
+
+#elif defined(__arm__)
+
+       and     r2, r0, r3
+       eor     r0, r0, r3
+       add     r0, r2, r0, lsr #1
+
+#elif defined(__aarch64__)
+
+       and     x2, x0, x3
+       eor     x0, x0, x3
+       add     x0, x2, x0, lsr #1
+
+#else
+       notimpl
+#endif
+
         ret
  
  endproc
@@ -463,7 +1551,9 @@ proc       x15
  
         // sign extension 32 -> 64 bits.
  
-       //movsx rbx, eax                // like this?
+#if defined(__x86_64__)
+
+       movsx   rbx, eax                // like this?
  
         mov     rdx, 0xffffffff80000000
         add     rax, rdx                // if bit 31 of a is set then bits
@@ -472,15 +1562,46 @@ proc     x15
                                         // exactly backwards
         xor     rax, rdx                // so fix it
  
+#elif defined(__i386__)
+
+       movsx   ebx, ax                 // like this?
+
+       mov     edx, 0xffff8000
+       add     eax, edx                // if bit 15 of a is set then bits
+                                       // 15--31 of a' are clear; otherwise,
+                                       // these bits are all set -- which is
+                                       // exactly backwards
+       xor     eax, edx                // so fix it
+
+#elif defined(__arm__)
+
+       sxth    r1, r0                  // like this
+
+       mov     r12, #0x80000000
+       add     r0, r0, r12, asr #16
+       eor     r0, r0, r12, asr #16
+
+#elif defined(__aarch64__)
+
+       sxtw    x1, w0                  // like this
+
+       mov     x16, #0xffffffff80000000
+       add     x0, x0, x16
+       eor     x0, x0, x16
+
+#else
+       notimpl
+#endif
+
         ret
  
  endproc
  
  proc   x16
  
-  //shl rax, 56
-  //shl rbx, 56
-  //shl rcx, 56
+       // ??? i don't know why you'd want to calculate this.
+
+#if defined(__x86_64__)
  
         xor     rax, rbx                // a' = a XOR b
         xor     rbx, rcx                // b' = b XOR c
@@ -490,67 +1611,725 @@ proc    x16
         xor     rax, rbx                // a' = cf ? 0 : a XOR c
         cmp     rax, rsi
  
+#elif defined(__i386__)
+
+       xor     eax, ebx                // a' = a XOR b
+       xor     ebx, ecx                // b' = b XOR c
+       mov     esi, eax                // t = a XOR b
+       add     esi, ebx                // t = (a XOR b) + (b XOR c)
+       cmovc   eax, ebx                // a' = cf ? b XOR c : a XOR b
+       xor     eax, ebx                // a' = cf ? 0 : a XOR c
+       cmp     eax, esi
+
+#elif defined(__arm__)
+
+       eor     r0, r0, r1
+       eor     r1, r1, r2
+       adds    r4, r0, r1
+       movcs   r0, r1
+       eor     r0, r0, r1
+       cmp     r0, r4
+
+#elif defined(__aarch64__)
+
+       eor     x0, x0, x1
+       eor     x1, x1, x2
+       adds    x4, x0, x1
+       cmov.cs x0, x1
+       eor     x0, x0, x1
+       cmp     x0, x4
+
+#else
+       notimpl
+#endif
+
         ret
  
  endproc
  
  proc   x17
  
-       ud2
+       // absolute value
+
+#if defined(__x86_64__)
+
+       cqo                             // d = a < 0 ? -1 : 0
+       xor     rax, rdx                // a' = a < 0 ? -a - 1 : a
+       sub     rax, rdx                // a' = a < 0 ? -a : a
+
+#elif defined(__i386__)
+
+       cdq                             // d = a < 0 ? -1 : 0
+       xor     eax, edx                // a' = a < 0 ? -a - 1 : a
+       sub     eax, edx                // a' = a < 0 ? -a : a
+
+#elif defined(__arm__)
+
+       // direct approach
+       movs    r1, r0
+       rsbmi   r1, r0, #0
+
+       // faithful-ish conversion
+       eor     r3, r0, r0, asr #31
+       sub     r0, r3, r0, asr #31
+
+#elif defined(__aarch64__)
+
+       // direct approach
+       tst     x0, #1 << 63
+       cneg.ne x1, x0
+
+       // faithful-ish conversion
+       eor     x3, x0, x0, asr #63
+       sub     x0, x3, x0, asr #63
+
+#else
+       notimpl
+#endif
+
+       ret
  
  endproc
  
  proc   x18
  
-       ud2
+       // should always set sf, clear zf, unless we get rescheduled to a
+       // different core.
+
+#if defined(__x86_64__)
+
+       rdtsc                           // d || a = cycles
+       shl     rdx, 0x20
+       or      rax, rdx                // a = cycles
+       mov     rcx, rax                // c = cycles
+
+       rdtsc                           // d || a = cycles'
+       shl     rdx, 0x20
+       or      rax, rdx                // a = cycles'
+
+       cmp     rcx, rax
+
+#elif defined(__i386__)
+
+       rdtsc                           // d || a = cycles
+       mov     ebx, eax
+       mov     ecx, edx                // c || b = cycles
+
+       rdtsc                           // d || a = cycles'
+
+       sub     ebx, eax
+       sbb     ecx, edx
+
+#elif defined(__arm__)
+
+       // cycle clock not available in user mode
+       mrrc    p15, 0, r0, r1, c9
+       mrrc    p15, 0, r2, r3, c9
+       subs    r0, r0, r2
+       sbcs    r1, r1, r3
+
+#elif defined(__aarch64__)
+
+       // cycle clock not available in user mode
+       mrs     x0, pmccntr_el0
+       mrs     x1, pmccntr_el0
+       cmp     x0, x1
+
+#else
+       notimpl
+#endif
+
+       ret
  
  endproc
  
  proc   x19
  
-       ud2
+       // stupid way to capture a pointer to inline data and jump past it.
+       // confuses the return-address predictor something chronic.  worse
+       // because amd64 calling convention doesn't usually pass arguments on
+       // the stack.
+
+#if defined(__x86_64__)
+
+       call    8f
+       .string "hello world!\n\0"
+8:     call    print_str
+       add     rsp, 8
+       ret
+
+print_str:
+       // actually implement this ridiculous thing
+       mov     rsi, [rsp + 8]
+       xor     edx, edx
+0:     mov     al, [rsi + rdx]
+       inc     rdx
+       cmp     al, 0
+       jnz     0b
+       mov     eax, SYS_write
+       mov     edi, 1
+       dec     rdx
+       syscall                         // clobbers r11 :-(
+       ret
+
+#elif defined(__i386__)
+
+       call    8f
+       .string "hello world!\n\0"
+8:     call    print_str
+       add     esp, 4
+       ret
+
+print_str:
+       // actually implement this ridiculous thing
+       mov     ecx, [esp + 4]
+       xor     edx, edx
+0:     mov     al, [ecx + edx]
+       inc     edx
+       cmp     al, 0
+       jnz     0b
+       mov     eax, SYS_write
+       mov     ebx, 1
+       dec     edx
+       int     0x80
+       ret
+
+#elif defined(__arm__)
+
+       // why am i doing this?
+       stmfd   r13!, {r14}
+       bl      8f
+       .string "hello world!\n\0"
+       .balign 4
+8:     mov     r1, r14               // might as well make it easy on myself
+       bl      print_str
+       ldmfd   r13!, {pc}
+
+print_str:
+       mov     r2, #0
+0:     ldrb    r0, [r1, r2]
+       cmp     r0, #0
+       addne   r2, r2, #1
+       bne     0b
+       mov     r0, #1
+       mov     r7, #SYS_write
+       swi     0
+       bx      r14
+
+#elif defined(__aarch64__)
+
+       // why am i doing this?
+       str     x30, [sp, #-16]!
+       bl      8f
+       .string "hello world!\n\0"
+       .balign 4
+8:     mov     x1, x30               // might as well make it easy on myself
+       bl      print_str
+       ldr     x30, [sp], #16
+       ret
+
+print_str:
+       mov     x2, #0
+0:     ldrb    w0, [x1, x2]
+       cmp     w0, #0
+       cinc.ne x2, x2
+       b.ne    0b
+       mov     x0, #1
+       mov     x8, #SYS_write
+       svc     #0
+       ret
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x1a
  
-       ud2
+       // collect the current instruction-pointer address.  this was an old
+       // 32-bit i386 trick for position-independent code, but (a) it
+       // confuses the return predictor, and (b) amd64 has true pc-relative
+       // addressing.
+
+#if defined(__x86_64__)
+
+       // the actual example
+       call    0f
+0:     pop     rax
+
+       // the modern i386 trick doesn't confuse the return-address
+       // predictor.
+       call    calladdr_rbx
+       sub     rbx, . - 0b
+
+       // but rip-relative addressing is even better
+       lea     rcx, [rip + 0b]
+
+       ret
+
+calladdr_rbx:
+       mov     rbx, [rsp]
+       ret
+
+#elif defined(__i386__)
+
+       // the actual example
+       call    0f
+0:     pop     eax
+
+       // the modern i386 trick doesn't confuse the return-address
+       // predictor.
+       call    get_pc_ebx
+       sub     ebx, . - 0b
+
+       ret
+
+#elif defined(__arm__)
+
+       stmfd   r13!, {r14}
+
+       bl      0f
+0:     mov     r0, r14
+
+       bl      return
+       sub     r1, r14, #. - 0b
+
+       adr     r2, 0b
+
+       ldmfd   r13!, {pc}
+
+return:        bx      r14
+
+#elif defined(__aarch64__)
+
+       str     x30, [sp, #-16]!
+
+       // we can do all of the above using a64
+       bl      0f
+0:     mov     x0, x30
+
+       bl      return
+       sub     x1, x30, #. - 0b
+
+       adr     x2, 0b
+
+       ldr     x30, [sp], #16
+return:        ret
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x1b
  
-       ud2
+#if defined(__x86_64__)
+
+       // retpolines: a mitigation against adversarially influenced
+       // speculative execution at indirect branches.  if an adversary can
+       // prepare a branch-target buffer entry matching an indirect branch
+       // in the victim's address space then they can cause the victim to
+       // /speculatively/ (but not architecturally) execute any code in
+       // their address space, possibly leading to leaking secrets through
+       // the cache.  retpolines aren't susceptible to this because the
+       // predicted destination address is from the return-prediction stack
+       // which the adversary can't prime.  the performance penalty is still
+       // essentially a branch misprediction -- for this return, and
+       // possibly all others already stacked.
+
+       // (try not to crash)
+       lea     rax, [rip + 9f]
+
+       push    rax
+9:     ret
+
+#elif defined(__i386__)
+
+       call    get_pc_ebx
+       lea     eax, [ebx + 9f - .]
+
+       push    eax
+9:     ret
+
+#elif defined(__arm__)
+
+       stmfd   r13!, {r14}
+
+       adr     r14, 8f
+       bx      r14
+
+8:     ldmfd   r13!, {pc}
+
+#elif defined(__aarch64__)
+
+       str     x30, [sp, #-16]!
+
+       adr     x30, 8f
+       ret
+
+8:     ldr     x30, [sp], #16
+       ret
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x1c
  
-       ud2
+       // ok, having a hard time seeing a use for this.  the most important
+       // thing to note is that sp is set from `pop' /after/ it's
+       // incremented.
+
+#if defined(__x86_64__)
+
+       // try not to crash
+       mov     rax, rsp
+       and     rsp, -16
+       push    rax
+
+       pop     rsp
+
+       // check it worked
+       mov     rbx, rsp
+       ret
+
+#elif defined(__i386__)
+
+       // try not to crash
+       mov     eax, esp
+       and     esp, -16
+       push    eax
+
+       pop     esp
+
+       // check it worked
+       mov     ebx, esp
+       ret
+
+#elif defined(__arm__)
+
+       // not even going to dignify this
+       notimpl
+
+#elif defined(__aarch64__)
+
+       // not even going to dignify this
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x1d
  
-       ud2
+       // monumentally cheesy way to copy 8 n bytes from buff1 to buff2.
+       // also clobbers words at buff2 + 8 n and buff2 - 8 for good measure.
+
+       n = 4
+
+#if defined(__x86_64__)
+
+       mov     rax, rsp                        // safekeeping
+
+       // we're toast if we get hit by a signal now.  fingers crossed...
+  .if 0
+       mov     rsp, buff2 + 8*n + 8
+       mov     rbp, buff1 + 8*n
+  .else
+       lea     rsp, [rdi + 8*n + 16]
+       lea     rbp, [rsi + 8*n]
+  .endif
+       enter   0, n + 1
+
+       // precise action:
+       //
+       //         +---------+                  +---------+
+       //  rbp -> |   ???   |           rsp -> |   ???   |
+       //         +---------+                  +---------+
+       //         | w_{n-1} |                  |   rbp   | <- rbp'
+       //         +---------+                  +---------+
+       //         |   ...   |                  | w_{n-1} |
+       //         +---------+                  +---------+
+       //         |   w_1   |                  |   ...   |
+       //         +---------+                  +---------+
+       //         |   w_0   |                  |   w_1   |
+       //         +---------+                  +---------+
+       //                                      |   w_0   |
+       //                                      +---------+
+       //                                      |   rbp'  | <- rsp'
+       //                                      +---------+
+
+       mov     rdx, rsp
+       mov     rsp, rax
+
+#elif defined(__i386__)
+
+       mov     eax, esp                        // safekeeping
+
+       // we're toast if we get hit by a signal now.  fingers crossed...
+  .if 0
+       mov     esp, buff2 + 4*n + 4
+       mov     ebp, buff1 + 4*n
+  .else
+       lea     esp, [edi + 4*n + 8]
+       lea     ebp, [esi + 4*n]
+  .endif
+       enter   0, n + 1
+
+       mov     edx, esp
+       mov     esp, eax
+
+#elif defined(__arm__)
+
+       add     r4, r4, #4*n
+       add     r5, r5, #4*n + 8
+
+       str     r4, [r5, #-4]!
+  .rept n/2
+       ldrd    r0, r1, [r4, #-8]!
+       strd    r0, r1, [r5, #-8]!
+  .endr
+       add     r4, r5, #4*n
+       str     r4, [r5, #-4]!
+
+#elif defined(__aarch64__)
+
+       // omgwtf.  let's not actually screw with the stack pointer.
+
+       add     x4, x4, #8*n
+       add     x5, x5, #8*n + 16
+
+       str     x4, [x5, #-8]!
+  .rept n/2
+       ldp     x16, x17, [x4, #-16]!
+       stp     x16, x17, [x5, #-16]!
+  .endr
+       add     x4, x5, #8*n
+       str     x4, [x5, #-8]!
+
+#else
+       notimpl
+#endif
+
+       ret
  
  endproc
  
  proc   x1e
  
-       ud2
+       // convert nibble value to (uppercase) hex; other input values yield
+       // nonsense.
+
+#if defined(__x86_64__)
+
+       // das doesn't work in 64-bit mode; best i can come up with
+       mov     edx, eax
+       add     al, '0'
+       add     dl, 'A' - 10
+       cmp     al, '9' + 1
+       cmovae  eax, edx
+
+#elif defined(__i386__)
+
+       cmp     al, 0x0a                // cf = 1 iff a < 10
+       sbb     al, 0x69                // if 0 <= a < 10, a' = a - 0x6a, so
+                                       // 0x96 <= a' < 0xa0, setting af, cf
+                                       // if 10 <= a < 16, a' = a - 0x69, so
+                                       // 0xa1 <= a' < 0xa7, setting cf but
+                                       // clearing af
+       das                             // if 0 <= a < 10, then af and cf are
+                                       // both set, so subtract 0x66
+                                       // from a' leaving 0x30 <= a' < 0x3a;
+                                       // if 10 <= a < 16 then af clear but
+                                       // cf set, so subtract 0x60 from a'
+                                       // leaving 0x41 <= a' < 0x47
+
+#elif defined(__arm__)
+
+       // significantly less tricksy
+       cmp     r0, #10
+       addlo   r0, r0, #'0'
+       addhs   r0, r0, #'A' - 10
+
+#elif defined(__aarch64__)
+
+       // with less versatile conditional execution this is the best we can
+       // do
+       cmp     w0, #10
+       add     w16, w0, #'A' - 10
+       add     w0, w0, #'0'
+       cmov.hs w0, w16
+
+#else
+       notimpl
+#endif
+
+       ret
  
  endproc
  
  proc   x1f
  
-       ud2
+       // verify collatz conjecture starting at a; assume a /= 0!
+
+#if defined(__x86_64__)
+
+0:     bsf     rcx, rax                // clobber c if a = 0
+       shr     rax, cl                 // a = 2^c a'
+  cmp rdx, 0
+  je 1f
+  stosq
+  dec rdx
+1:
+       cmp     rax, 1                  // done?
+       je      9f
+       lea     rax, [2*rax + rax + 1]  // a' = 3 a' + 1
+       jmp     0b                      // again
+
+9:     ret
+
+#elif defined(__i386__)
+
+0:     bsf     ecx, eax                // clobber c if a = 0
+       shr     eax, cl                 // a = 2^c a'
+  cmp edx, 0
+  je 1f
+  stosd
+  dec edx
+1:
+       cmp     eax, 1                  // done?
+       je      9f
+       lea     eax, [2*eax + eax + 1]  // a' = 3 a' + 1
+       jmp     0b                      // again
+
+9:     ret
+
+#elif defined(__arm__)
+
+       // rbit introduced in armv7
+0:     rbit    r2, r0
+       clz     r2, r2
+       mov     r0, r0, lsr r2          // a = 2^c a'
+  cmp r3, #0
+  strne r0, [r5], #4
+  subne r3, r3, #1
+       cmp     r0, #1
+       adcne   r0, r0, r0, lsl #1      // a' = 3 a' + 1 (because c set)
+       bne     0b
+
+       ret
+
+#elif defined(__aarch64__)
+
+0:     rbit    w2, w0
+       clz     w2, w2
+       lsr     w0, w0, w2              // a = 2^c a'
+  cmp x3, #0
+  beq 1f
+  str x0, [x5], #8
+  sub x3, x3, #1
+1:
+       cmp     w0, #1                  // done?
+       add     w16, w0, w0, lsl #1     // t = 3 a'
+       csinc.eq w0, w0, w16            // a' = (a' = 1) ? a' : t + 1
+       b.ne    0b
+
+       ret
+
+#else
+       notimpl
+#endif
  
  endproc
  
+///--------------------------------------------------------------------------
+/// 0x20--0x2f
+
  proc   x20
  
-       ud2
+       // calculate 1337 a slowly
+
+#if defined(__x86_64__)
+
+       // original version
+       mov     rcx, rax                // c = a
+       shl     rcx, 2                  // c = 4 a
+       add     rcx, rax                // c = 5 a
+       shl     rcx, 3                  // c = 40 a
+       add     rcx, rax                // c = 41 a
+       shl     rcx, 1                  // c = 82 a
+       add     rcx, rax                // c = 83 a
+       shl     rcx, 1                  // c = 166 a
+       add     rcx, rax                // c = 167 a
+       shl     rcx, 3                  // c = 1336 a
+       add     rcx, rax                // c = 1337 a
+
+       // a quick way
+       lea     rdx, [2*rax + rax]      // t = 3 a
+       shl     rdx, 6                  // t = 192 a
+       sub     rdx, rax                // t = 191 a
+       lea     rbx, [8*rdx]            // b = 1528 a
+       sub     rbx, rdx                // b = 1337 a
+
+#elif defined(__i386__)
+
+       // original version
+       mov     ecx, eax                // c = a
+       shl     ecx, 2                  // c = 4 a
+       add     ecx, eax                // c = 5 a
+       shl     ecx, 3                  // c = 40 a
+       add     ecx, eax                // c = 41 a
+       shl     ecx, 1                  // c = 82 a
+       add     ecx, eax                // c = 83 a
+       shl     ecx, 1                  // c = 166 a
+       add     ecx, eax                // c = 167 a
+       shl     ecx, 3                  // c = 1336 a
+       add     ecx, eax                // c = 1337 a
+
+       // a quick way
+       lea     edx, [2*eax + eax]      // t = 3 a
+       shl     edx, 6                  // t = 192 a
+       sub     edx, eax                // t = 191 a
+       lea     ebx, [8*edx]            // b = 1528 a
+       sub     ebx, edx                // b = 1337 a
+
+#elif defined(__arm__)
+
+       // original version, ish
+       add     r2, r0, r0, lsl #2      // c = 5 a
+       add     r2, r0, r2, lsl #3      // c = 41 a
+       add     r2, r0, r2, lsl #1      // c = 83 a
+       add     r2, r0, r2, lsl #1      // c = 167 a
+       add     r2, r0, r2, lsl #3      // c = 1337 a
+
+       // quicker way
+       add     r1, r0, r0, lsl #1      // b = 3 a
+       rsb     r1, r0, r1, lsl #6      // b = 191 a
+       rsb     r1, r1, r1, lsl #3      // b = 1337 a
+
+#elif defined(__aarch64__)
+
+       // original version, ish
+       add     x2, x0, x0, lsl #2      // c = 5 a
+       add     x2, x0, x2, lsl #3      // c = 41 a
+       add     x2, x0, x2, lsl #1      // c = 83 a
+       add     x2, x0, x2, lsl #1      // c = 167 a
+       add     x2, x0, x2, lsl #3      // c = 1337 a
+
+       // sleazy because no rsb
+       add     x1, x0, x0, lsl #1      // b = 3 a
+       sub     x1, x0, x1, lsl #6      // b = -191 a
+       sub     x1, x1, x1, lsl #3      // b = 1337 a
+
+#else
+       notimpl
+#endif
  
         ret
  
@@ -558,97 +2337,667 @@ endproc
  
  proc   x21
  
-       ud2
+       // multiply complex numbers a + b i and c + d i
+       //
+       //      (a + b i) (c + d i) = (a c - b d) + (a d + b c) i
+       //
+       // somewhat slick approach uses only three multiplications
+
+#if defined(__x86_64__)
+
+       mov     rsi, rax                // t = a
+       add     rax, rbx                // a' = a + b
+       mov     rdi, rdx                // u = d
+       sub     rdx, rcx                // d' = d - c
+       add     rdi, rcx                // u = c + d
+
+       imul    rax, rcx                // a' = c (a + b)
+       imul    rsi, rdx                // t = a (d - c)
+       imul    rdi, rbx                // u = b (c + d)
+
+       add     rsi, rax                // t = a (d - c) + c (a + b)
+       mov     rbx, rsi                // b' = a (d - c) + c (a + b)
+                                       //      = a d + b c
+       sub     rax, rdi                // a' = c (a + b) - b (c + d)
+                                       //      = a c - b d
+
+#elif defined(__i386__)
+
+       mov     esi, eax                // t = a
+       add     eax, ebx                // a' = a + b
+       mov     edi, edx                // u = d
+       sub     edx, ecx                // d' = d - c
+       add     edi, ecx                // u = c + d
+
+       imul    eax, ecx                // a' = c (a + b)
+       imul    esi, edx                // t = a (d - c)
+       imul    edi, ebx                // u = b (c + d)
+
+       add     esi, eax                // t = a (d - c) + c (a + b)
+       mov     ebx, esi                // b' = a (d - c) + c (a + b)
+                                       //      = a d + b c
+       sub     eax, edi                // a' = c (a + b) - b (c + d)
+                                       //      = a c - b d
+
+#elif defined(__arm__)
+
+       add     r4, r0, r1              // t = a + b
+       add     r5, r2, r3              // u = c + d
+       sub     r3, r3, r2              // d' = d - c
+
+       // mls introduced in armv7
+       mul     r4, r4, r2              // t = c (a + b)
+       mov     r2, r1                  // c' = b (bah!): mla overwrites r1
+       mla     r1, r0, r3, r4          // b' = a (d - c) + c (a + b)
+                                       //      = a d + b c
+       mls     r0, r2, r5, r4          // a' = c (a + b) - b (c + d)
+                                       //      = a c - b d
+
+#elif defined(__aarch64__)
+
+       add     x4, x0, x1              // t = a + b
+       add     x5, x2, x3              // u = c + d
+       sub     x3, x3, x2              // d' = d - c
+
+       // madd and msub do the work of a32's mla and mls here
+       mul     x4, x4, x2              // t = c (a + b)
+       mov     x2, x1                  // c' = b (bah!): madd overwrites x1
+       madd    x1, x0, x3, x4          // b' = a (d - c) + c (a + b)
+                                       //      = a d + b c
+       msub    x0, x2, x5, x4          // a' = c (a + b) - b (c + d)
+                                       //      = a c - b d
+
+#else
+       notimpl
+#endif
+
+       ret
  
  endproc
  
  proc   x22
  
-       ud2
+       // divide by 3
+
+#if defined(__x86_64__)
+
+       mov     rdx, 0xaaaaaaaaaaaaaaab // = ceil(2/3 2^64)
+       mul     rdx                     // d' || a' =~ 2/3 a 2^64
+       shr     rdx, 1                  // d' = floor(a/3)
+       mov     rax, rdx                // a' = floor(a/3)
+
+       // we start with 0 <= a < 2^64.  f = ceil(2/3 2^64) = (2^65 + 1)/3,
+       // so a f/2^64 = 2/3 a + e with 0 <= e = a/(3 2^64) < 1/3.  the
+       // fractional part of 2/3 a is at most 2/3, so adding e can't carry
+       // into the integer part: floor(a f/2^64) = floor(2/3 a).
+
+#elif defined(__i386__)
+
+       mov     edx, 0xaaaaaaab         // = ceil(2/3 2^32)
+       mul     edx                     // d' || a' =~ 2/3 a 2^32
+       shr     edx, 1                  // d' = floor(a/3)
+       mov     eax, edx                // a' = floor(a/3)
+
+#elif defined(__arm__)
+
+       ldr     r12, =0xaaaaaaab        // = ceil(2/3 2^32)
+       umull   r12, r0, r0, r12        // a' || t =~ 2/3 a 2^32
+       mov     r0, r0, lsr #1          // a' = floor(a/3)
+
+#elif defined(__aarch64__)
+
+       ldr     x16, =0xaaaaaaaaaaaaaaab // = ceil(2/3 2^64)
+       umulh   x0, x0, x16             // a' = high half of a f =~ 2/3 a
+       lsr     x0, x0, #1              // a' = floor(a/3)
+
+#else
+       notimpl
+#endif
+
+       ret
  
  endproc
  
  proc   x23
  
-       ud2
+#if defined(__x86_64__)
+
+       // main loop: shorten a preserving residue class mod 3
+0:     cmp     rax, 5
+       jbe     8f
+       // a > 5
+       mov     rdx, rax                // d' = a
+       shr     rdx, 2                  // d' = floor(a/4)
+       and     rax, 3                  // a = 4 d' + a' (0 <= a' < 4)
+       add     rax, rdx                // a' == a (mod 3) but a' < a/4 + 4
+       jmp     0b
+
+       // fix up final value 0 <= a < 6: want 0 <= a < 3
+       //
+       // the tricky part is actually a = 3; but the other final cases take
+       // additional iterations which we can avoid.
+8:     cmp     rax, 3                  // set cf iff a < 3
+       cmc                             // set cf iff a >= 3
+       sbb     rdx, rdx                // d' = a >= 3 ? -1 : 0
+       and     rdx, 3                  // d' = a >= 3 ? 3 : 0
+       sub     rax, rdx                // a' = a - (a >= 3 ? 3 : 0)
+                                       //      = a (mod 3)
+
+#elif defined(__i386__)
+
+       // main loop: shorten a preserving residue class mod 3
+0:     cmp     eax, 5
+       jbe     8f
+       // a > 5
+       mov     edx, eax                // d' = a
+       shr     edx, 2                  // d' = floor(a/4)
+       and     eax, 3                  // a = 4 d' + a' (0 <= a' < 4)
+       add     eax, edx                // a' == a (mod 3) but a' < a/4 + 4
+       jmp     0b
+
+       // fix up final value 0 <= a < 6: want 0 <= a < 3
+       //
+       // the tricky part is actually a = 3; but the other final cases take
+       // additional iterations which we can avoid.
+8:     cmp     eax, 3                  // set cf iff a < 3
+       cmc                             // set cf iff a >= 3
+       sbb     edx, edx                // d' = a >= 3 ? -1 : 0
+       and     edx, 3                  // d' = a >= 3 ? 3 : 0
+       sub     eax, edx                // a' = a - (a >= 3 ? 3 : 0)
+                                       //      = a (mod 3)
+
+#elif defined(__arm__)
+
+0:     cmp     r0, #6                  // a >= 6?
+       andhs   r12, r0, #3             // yes: t = a mod 4
+       addhs   r0, r12, r0, lsr #2     // a' = floor(a/4) + t == a (mod 3)
+       bhs     0b                      // and try again
+
+       cmp     r0, #3                  // now 0 <= a < 6
+       subhs   r0, r0, #3              // a' = a - (a >= 3 ? 3 : 0)
+
+#elif defined(__aarch64__)
+
+0:     cmp     x0, #6                  // a >= 6?  (decides the b.hs below)
+       // blunder on through regardless since this doesn't affect the result
+       and     x16, x0, #3             // t = a mod 4
+       add     x0, x16, x0, lsr #2     // a' = floor(a/4) + t == a (mod 3)
+       b.hs    0b                      // repeat if a was >= 6
+
+       subs    x16, x0, #3             // t = a - 3; cs iff a >= 3
+       cmov.hs x0, x16                 // a' = a >= 3 ? a - 3 : a
+
+#else
+       notimpl
+#endif
+
+       ret
  
  endproc
  
  proc   x24
  
-       ud2
+       // invert (odd) a mod 2^64
+       //
+       // suppose a a_i == 1 (mod 2^{2^i})
+       //
+       // clearly good for i = 0, since 2^i = 1 and 2^{2^i} = 2, and a_0 =
+       // a == 1 (mod 2) by assumption
+       //
+       // write a a_i == b_i 2^{2^i} + 1 (mod 2^{2^{i+1}})
+       // then b_i == (a a_i - 1)/2^{2^i} (mod 2^{2^i})
+       // to lift inverse, we want x such that a x == -b_i (mod 2^{2^i});
+       // clearly x = -a_i b_i will do, since a a_i == 1 (mod 2^{2^i})
+       // then:
+       // a_{i+1} = a_i - a_i b_i 2^{2^i} = a_i (1 - (a a_i - 1))
+       //      = 2 a_i - a a_i^2
+       //
+       // check:
+       // a a_{i+1} = 2 a a_i - a^2 a_i^2
+       //      == 2 a a_i - (b_i 2^{2^i} + 1)^2
+       //      == 2 (b_i 2^{2^i} + 1) -
+       //              (b_i^2 2^{2^{i+1}} + 2 b_i 2^{2^i} + 1)
+       //      == 1 (mod 2^{2^{i+1}})
+
+#if defined(__x86_64__)
+
+       // rax                          // a_0 = a
+       mov     rbx, rax                // b' = a
+       mov     rsi, rax                // t = a_0
+
+0:
+  cmp rbp, 0                           // counter rbp used up?
+  je 1f                                // yes: skip the store
+  stosq                                // store a_i at [rdi]; rdi steps on
+  dec rbp                              // rbp -= 1
+1:
+       mul     rbx                     // a' = a a_i
+       mov     rcx, rax                // c = a a_i
+
+       sub     rax, 2                  // a' = a a_i - 2
+       neg     rax                     // a' = 2 - a a_i
+       mul     rsi                     // a_{i+1} = a_i (2 - a a_i)
+                                       //      = 2 a_i - a a_i^2
+       mov     rsi, rax                // t = a_{i+1}
+
+       cmp     rcx, 1                  // done?
+       ja      0b                      // no -- iterate
+
+#elif defined(__i386__)
+
+       // eax                          // a_0 = a
+       mov     ebx, eax                // b' = a
+       mov     esi, eax                // t = a_0
+
+0:
+  cmp ebp, 0                           // counter ebp used up?
+  je 1f                                // yes: skip the store
+  stosd                                // store a_i at [edi]; edi steps on
+  dec ebp                              // ebp -= 1
+1:
+       mul     ebx                     // a' = a a_i
+       mov     ecx, eax                // c = a a_i
+
+       sub     eax, 2                  // a' = a a_i - 2
+       jb      9f                      // done if < 2
+       neg     eax                     // a' = 2 - a a_i
+       mul     esi                     // a_{i+1} = a_i (2 - a a_i)
+                                       //      = 2 a_i - a a_i^2
+       mov     esi, eax                // t = a_{i+1}
+
+       jmp     0b                      // and iterate
+9:     mov     eax, esi                // restore
+
+#elif defined(__arm__)
+
+       // r0                           // a_0 = a
+       mov     r1, r0                  // b' = a
+
+0:
+  cmp r6, #0                           // counter r6 nonzero?
+  strne r0, [r5], #4                   // yes: store a_i at [r5]; r5 += 4
+  subne r6, r6, #1                     // and r6 -= 1
+       mul     r2, r0, r1              // c = a a_i
+       rsbs    r2, r2, #2              // c = 2 - a a_i
+       mul     r0, r0, r2              // a_{i+1} = a_i (2 - a a_i)
+                                       //      = 2 a_i - a a_i^2
+       blo     0b                      // borrowed, so a a_i > 2: iterate
+
+#elif defined(__aarch64__)
+
+       // x0                           // a_0 = a
+       mov     x1, x0                  // b' = a
+       mov     x16, #2                 // because we have no rsb
+
+0:
+  cmp x6, #0                           // counter x6 used up?
+  b.eq 1f                              // yes: skip the store
+  str x0, [x5], #8                     // store a_i at [x5]; x5 += 8
+  sub x6, x6, #1                       // x6 -= 1
+1:
+       mul     x2, x0, x1              // c = a a_i
+       subs    x2, x16, x2             // c = 2 - a a_i
+       mul     x0, x0, x2              // a_{i+1} = a_i (2 - a a_i)
+                                       //      = 2 a_i - a a_i^2
+       b.lo    0b                      // borrowed, so a a_i > 2: iterate
+
+#else
+       notimpl
+#endif
+
+       ret
  
  endproc
  
  proc   x25
  
-       ud2
+       // a poor approximation to pi/4
+       //
+       // think of x and y as being in 16.16 fixed-point format.  we sample
+       // points in the unit square, and determine how many of them are
+       // within a unit quarter-circle centred at the origin.  the area of
+       // the quarter-circle is pi/4.
+
+#if defined(__x86_64__)
+
+       xor     eax, eax                // a = 0
+       mov     rcx, 1
+       shl     rcx, 0x20               // c =~ 4 billion
+
+0:     movzx   rbx, cx                 // x = low 16 bits of c
+       imul    rbx, rbx                // b = x^2
+
+       ror     rcx, 0x10               // switch halves of c
+       movzx   rdx, cx                 // y = high 16 bits of c
+       imul    rdx, rdx                // d = y^2
+       rol     rcx, 0x10               // switch back
+
+       add     rbx, rdx                // r^2 = x^2 + y^2
+       shr     rbx, 0x20               // r^2 >= 1?
+       cmp     rbx, 1                  // set cf iff r^2 < 1
+       adc     rax, 0                  // and add onto accumulator
+       loop    0b                      // c -= 1; again unless c = 0
+
+#elif defined(__i386__)
+
+       // this is actually better done in 32 bits.  the carry has the wrong
+       // sense here, so instead deduct one for each point outside the
+       // quarter-circle rather than adding one for each point inside it.
+       xor     eax, eax                // a = 0
+       xor     ecx, ecx                // c = 0: loop wraps through 2^32 values
+
+0:     movzx   ebx, cx                 // x = low 16 bits of c
+       imul    ebx, ebx                // b = x^2
+
+       ror     ecx, 0x10               // switch halves of c
+       movzx   edx, cx                 // y = high 16 bits of c
+       imul    edx, edx                // d = y^2
+       rol     ecx, 0x10               // switch back
+
+       add     ebx, edx                // see?
+       sbb     eax, 0                  // a -= cf: deduct if r^2 >= 1
+       loop    0b                      // c -= 1; again unless c = 0
+
+#elif defined(__arm__)
+
+       mov     r0, #0                  // a = 0
+       mov     r2, #0                  // c = 0
+
+0:     uxth    r1, r2, ror #0          // x = low 16 bits of c
+       uxth    r3, r2, ror #16         // y = high 16 bits of c
+       mul     r1, r1, r1              // x^2
+       mul     r3, r3, r3              // y^2
+       cmn     r1, r3                  // mlas doesn't set cf usefully
+       addcc   r0, r0, #1              // no carry, so r^2 < 1: count it
+       adds    r2, r2, #1              // c += 1
+       bne     0b                      // until c wraps round to zero
+
+#elif defined(__aarch64__)
+
+       mov     w0, #0                  // a = 0
+       mov     w2, #0                  // c = 0
+
+0:     ubfx    w1, w2, #0, #16         // x = low 16 bits of c
+       ubfx    w3, w2, #16, #16        // y = high 16 bits of c
+       sub     w2, w2, #1              // c -= 1 (wraps on first pass)
+       mul     w1, w1, w1              // x^2
+       mul     w3, w3, w3              // y^2
+       cmn     w1, w3                  // cs iff x^2 + y^2 >= 2^32
+       cinc.cc w0, w0                  // no carry, so r^2 < 1: count it
+       cbnz    w2, 0b                  // until c is back at zero
+
+#else
+       notimpl
+#endif
+
+       ret
  
  endproc
  
  proc   x26
  
-       ud2
+#if defined(__x86_64__)                // stub: every branch is still notimpl
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x27
  
-       ud2
+#if defined(__x86_64__)                // stub: every branch is still notimpl
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x28
  
-       ud2
+#if defined(__x86_64__)                // stub: every branch is still notimpl
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x29
  
-       ud2
+#if defined(__x86_64__)                // stub: every branch is still notimpl
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x2a
  
-       ud2
+#if defined(__x86_64__)                // stub: every branch is still notimpl
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x2b
  
-       ud2
+#if defined(__x86_64__)                // stub: every branch is still notimpl
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x2c
  
-       ud2
+#if defined(__x86_64__)                // stub: every branch is still notimpl
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x2d
  
-       ud2
+#if defined(__x86_64__)                // stub: every branch is still notimpl
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x2e
  
-       ud2
+#if defined(__x86_64__)                // stub: every branch is still notimpl
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x2f
  
-       ud2
+#if defined(__x86_64__)                // stub: every branch is still notimpl
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
  
+///--------------------------------------------------------------------------
+/// 0x30--0x3f
+
  proc   x30
  
-       ud2
+#if defined(__x86_64__)
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
         ret
  
@@ -656,90 +3005,362 @@ endproc
  
  proc   x31
  
-       ud2
+#if defined(__x86_64__)                // stub: every branch is still notimpl
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x32
  
-       ud2
+#if defined(__x86_64__)                // stub: every branch is still notimpl
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x33
  
-       ud2
+#if defined(__x86_64__)                // stub: every branch is still notimpl
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x34
  
-       ud2
+#if defined(__x86_64__)                // stub: every branch is still notimpl
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x35
  
-       ud2
+#if defined(__x86_64__)                // stub: every branch is still notimpl
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x36
  
-       ud2
+#if defined(__x86_64__)                // stub: every branch is still notimpl
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x37
  
-       ud2
+#if defined(__x86_64__)                // stub: every branch is still notimpl
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x38
  
-       ud2
+#if defined(__x86_64__)                // stub: every branch is still notimpl
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x39
  
-       ud2
+#if defined(__x86_64__)                // stub: every branch is still notimpl
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x3a
  
-       ud2
+#if defined(__x86_64__)                // stub: every branch is still notimpl
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x3b
  
-       ud2
+#if defined(__x86_64__)                // stub: every branch is still notimpl
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x3c
  
-       ud2
+#if defined(__x86_64__)                // stub: every branch is still notimpl
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x3d
  
-       ud2
+#if defined(__x86_64__)                // stub: every branch is still notimpl
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x3e
  
-       ud2
+#if defined(__x86_64__)                // stub: every branch is still notimpl
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
  
  proc   x3f
  
-       ud2
+#if defined(__x86_64__)                // stub: every branch is still notimpl
+
+       notimpl
+
+#elif defined(__i386__)
+
+       notimpl
+
+#elif defined(__arm__)
+
+       notimpl
+
+#elif defined(__aarch64__)
+
+       notimpl
+
+#else
+       notimpl
+#endif
  
  endproc
+
+///----- That's all, folks --------------------------------------------------