mdw@git.distorted.org.uk Git - xchg-rax-rax/blob - xchg.S

   1 /// -*- mode: asm; asm-comment-char: 0 -*-
   2
   3 ///--------------------------------------------------------------------------
   4 /// Preliminaries.
   5
   6 #include <sys/syscall.h>
   7
   8 #if defined(__i386__) || defined(__x86_64__)
   9
  10         .intel_syntax noprefix
  11
  12 #elif defined(__arm__)
  13
  14 .macro  ret
  15         bx      r14
  16 .endm
  17
  18         .arch   armv7-a
  19
  20 #elif defined(__aarch64__)
  21
  22 .macro  cmov    rd, rn, cc
  23         csel    \rd, \rn, \rd, \cc
  24 .endm
  25 #define _COND(_)                                                        \
  26         _(eq) _(ne) _(cs) _(cc) _(vs) _(vc) _(mi) _(pl)                 \
  27         _(ge) _(lt) _(gt) _(le) _(hi) _(ls) _(al) _(nv)                 \
  28         _(hs) _(lo)
  29 #define _INST(_)                                                        \
  30         _(ccmp) _(ccmn)                                                 \
  31         _(csel) _(cmov)                                                 \
  32         _(csinc) _(cinc) _(cset)                                        \
  33         _(csneg) _(cneg)                                                \
  34         _(csinv) _(cinv) _(csetm)
  35 #define _CONDVAR(cc) _definstvar cc;
  36 #define _INSTVARS(inst)                                                 \
  37         .macro _definstvar cc;                                          \
  38           .macro inst.\cc args:vararg; inst \args, \cc; .endm;          \
  39         .endm;                                                          \
  40         _COND(_CONDVAR);                                                \
  41         .purgem _definstvar;
  42         _INST(_INSTVARS)
  43 #undef _COND
  44 #undef _INST
  45 #undef _CONDVAR
  46 #undef _INSTVARS
  47
  48 #define CCMP_N 8
  49 #define CCMP_Z 4
  50 #define CCMP_C 2
  51 #define CCMP_V 1
  52
  53 #define CCMP_MI CCMP_N
  54 #define CCMP_PL 0
  55 #define CCMP_EQ CCMP_Z
  56 #define CCMP_NE 0
  57 #define CCMP_CS CCMP_C
  58 #define CCMP_HS CCMP_C
  59 #define CCMP_CC 0
  60 #define CCMP_LO 0
  61 #define CCMP_VS CCMP_V
  62 #define CCMP_VC 0
  63 #define CCMP_HI CCMP_C
  64 #define CCMP_LS 0
  65 #define CCMP_LT CCMP_N
  66 #define CCMP_GE 0
  67 #define CCMP_LE CCMP_N
  68 #define CCMP_GT 0
  69
  70 #else
  71 #  error "not supported"
  72 #endif
  73
  74 .macro  proc    name
  75         .globl  \name
  76         .type   \name, STT_FUNC
  77         .p2align 4
  78 \name\():
  79   .macro endproc
  80         .size   \name, . - \name
  81         .purgem endproc
  82   .endm
  83 .endm
  84
  85 .macro ch c
  86 #if defined(__i386__)
  87
  88         pushf
  89         push    eax
  90         push    ebx
  91         push    ecx
  92         push    edx
  93         push    ebp
  94         mov     ebp, esp
  95         and     esp, -16
  96
  97         push    \c
  98         call    putchar@plt
  99
 100         call    get_pc_ebx
 101         add     ebx, offset _GLOBAL_OFFSET_TABLE
 102         mov     eax, [ebx + stdout@GOT]
 103         mov     eax, [eax]
 104         call    fflush@plt
 105
 106         mov     esp, ebp
 107         pop     ebp
 108         pop     edx
 109         pop     ecx
 110         pop     ebx
 111         pop     eax
 112         popf
 113
 114 #elif defined(__x86_64__)
 115
 116         pushf
 117         push    rax
 118         push    rcx
 119         push    rdx
 120         push    rsi
 121         push    rdi
 122         push    r8
 123         push    r9
 124         push    rbp
 125         mov     rbp, rsp
 126         and     rsp, -16
 127
 128         mov     rdi, \c
 129         call    putchar@plt
 130
 131         mov     rdi, [rip + stdout]
 132         call    fflush@plt
 133
 134         mov     rsp, rbp
 135         pop     rbp
 136         pop     r9
 137         pop     r8
 138         pop     rdi
 139         pop     rsi
 140         pop     rdx
 141         pop     rcx
 142         pop     rax
 143         popf
 144
 145 #elif defined(__arm__)
 146
 147         stmfd   r13!, {r0-r4, r12, r14}
 148
 149         mov     r4, r13
 150         bic     r14, r4, #15
 151         mov     r13, r14
 152
 153         mov     r0, #\c
 154         bl      putchar@plt
 155
 156         ldr     r14, .L$_c$gotoff$\@
 157 .L$_c$gotpc$\@:
 158         add     r14, pc, r14
 159         b       .L$_c$cont$\@
 160 .L$_c$gotoff$\@:
 161         .word   _GLOBAL_OFFSET_TABLE - .L$_c$gotpc$\@ - 8
 162 .L$_c$cont$\@:
 163         bl      fflush@plt
 164
 165         mov     r13, r4
 166         ldmfd   r13!, {r0-r4, r12, r14}
 167
 168 #elif defined(__aarch64__)
 169
 170         sub     sp, sp, #20*8
 171         stp      x0,  x1, [sp,   #0]
 172         stp      x2,  x3, [sp,  #16]
 173         stp      x4,  x5, [sp,  #32]
 174         stp      x6,  x7, [sp,  #48]
 175         stp      x8,  x9, [sp,  #64]
 176         stp     x10, x11, [sp,  #80]
 177         stp     x12, x13, [sp,  #96]
 178         stp     x14, x15, [sp, #112]
 179         stp     x16, x17, [sp, #128]
 180         mrs     x16, nzcv
 181         stp     x16, x30, [sp, #144]
 182
 183         mov     w0, #\c
 184         bl      putchar
 185         adrp    x0, :got:stdout
 186         ldr     x0, [x0, #:got_lo12:stdout]
 187         ldr     x0, [x0]
 188         bl      fflush
 189
 190         ldp     x16, x30, [sp, #144]
 191         msr     nzcv, x16
 192         ldp     x16, x17, [sp, #128]
 193         ldp     x14, x15, [sp, #112]
 194         ldp     x12, x13, [sp,  #96]
 195         ldp     x10, x11, [sp,  #80]
 196         ldp      x8,  x9, [sp,  #64]
 197         ldp      x6,  x7, [sp,  #48]
 198         ldp      x4,  x5, [sp,  #32]
 199         ldp      x2,  x3, [sp,  #16]
 200         ldp      x0,  x1, [sp,   #0]
 201         add     sp, sp, #20*8
 202
 203 #else
 204 #  error "not supported"
 205 #endif
 206 .endm
 207
 208 .macro  notimpl
 209 #if defined(__i386__) || defined(__x86_64__)
 210         ud2
 211 #elif defined(__arm__)
 212         udf
 213 #elif defined(__aarch64__)
 214         hlt     #0
 215 #else
 216 #  error "not supported"
 217 #endif
 218 .endm
 219
 220         .section .note.GNU-stack, "", %progbits
 221
 222         .text
 223
 224 #if defined(__i386__)
 225 get_pc_ebx:
 226         mov     ebx, [esp]
 227         ret
 228 #endif
 229
 230
 231 proc    call_example
 232
 233 #if defined(__i386__)
 234
 235         push    ebx                     // ebx
 236         push    esi                     // esi, ebx
 237         push    edi                     // edi, esi, ebx
 238         push    ebp                     // flags, ebp, ..., ebx
 239         pushf
 240
 241         mov     edi, [esp + 4*6]
 242         mov     esi, [esp + 4*7]
 243         push    esi                     // regs, flags, ebp, ..., ebx
 244
 245         call    get_pc_ebx
 246         lea     eax, [ebx + 9f - .]
 247         push    eax                     // cont, regs, flags, ebp, ..., ebx
 248         push    edi                 // func, cont, regs, flags, ebp, ..., ebx
 249
 250         mov     eax, [esi + 28]
 251         pushf
 252         pop     ecx
 253         and     eax,  0x0cd5
 254         and     ecx, ~0x0cd5
 255         or      eax, ecx
 256         push    eax
 257         popf
 258         mov     eax, [esi +  0]
 259         mov     ebx, [esi +  4]
 260         mov     ecx, [esi +  8]
 261         mov     edx, [esi + 12]
 262         mov     edi, [esi + 20]
 263         mov     ebp, [esi + 24]
 264         mov     esi, [esi + 16]
 265
 266         ret                            // -> func; regs, flags, ebp, ..., ebx
 267
 268 9:      pushf                           // eflags, regs, flags, ebp, ..., ebx
 269         push    esi                // esi, eflags, regs, flags, ebp, ..., ebx
 270         mov     esi, [esp + 8]
 271         mov     [esi +  0], eax
 272         mov     [esi +  4], ebx
 273         mov     [esi +  8], ecx
 274         mov     [esi + 12], edx
 275         mov     [esi + 20], edi
 276         mov     [esi + 24], ebp
 277         pop     eax                     // rflags, regs, flags, ebp, ..., ebx
 278         mov     [esi + 16], eax
 279         pop     eax                     // regs, flags, ebp, ..., ebx
 280         mov     [esi + 28], eax
 281
 282         add     esp, 4                  // flags, ebp, ..., ebx
 283         popf                            // ebp, ..., ebx
 284         pop     ebp                     // ..., ebx
 285         pop     edi
 286         pop     esi
 287         pop     ebx                     //
 288         ret
 289
 290 #elif defined(__x86_64__)
 291
 292         push    rbx                     // rbx
 293         push    r10
 294         push    r11
 295         push    r12
 296         push    r13
 297         push    r14
 298         push    r15
 299         push    rbp                     // flags, rbp, ..., rbx
 300         pushf
 301
 302         push    rsi                     // regs, flags, rbp, ..., rbx
 303
 304         lea     rax, [rip + 9f]
 305         push    rax                     // cont, regs, flags, rbp, ..., rbx
 306         push    rdi                 // func, cont, regs, flags, rbp, ..., rbx
 307
 308         mov     rax, [rsi + 8*15]
 309         pushf
 310         pop     rcx
 311         and     rax,  0x0cd5
 312         and     rcx, ~0x0cd5
 313         or      rax, rcx
 314         push    rax
 315         popf
 316         mov     rax, [rsi +   0]
 317         mov     rbx, [rsi +   8]
 318         mov     rcx, [rsi +  16]
 319         mov     rdx, [rsi +  24]
 320         mov     rdi, [rsi +  40]
 321         mov     rbp, [rsi +  48]
 322         mov     r8,  [rsi +  56]
 323         mov     r9,  [rsi +  64]
 324         mov     r10, [rsi +  72]
 325         mov     r11, [rsi +  80]
 326         mov     r12, [rsi +  88]
 327         mov     r13, [rsi +  96]
 328         mov     r14, [rsi + 104]
 329         mov     r15, [rsi + 112]
 330         mov     rsi, [rsi +  32]
 331
 332         ret                            // -> func; regs, flags, rbp, ..., rbx
 333
 334 9:      pushf                           // rflags, regs, flags, rbp, ..., rbx
 335         push    rsi                // rsi, rflags, regs, flags, rbp, ..., rbx
 336         mov     rsi, [rsp + 16]
 337         mov     [rsi +   0], rax
 338         mov     [rsi +   8], rbx
 339         mov     [rsi +  16], rcx
 340         mov     [rsi +  24], rdx
 341         mov     [rsi +  40], rdi
 342         mov     [rsi +  48], rbp
 343         mov     [rsi +  56],  r8
 344         mov     [rsi +  64],  r9
 345         mov     [rsi +  72], r10
 346         mov     [rsi +  80], r11
 347         mov     [rsi +  88], r12
 348         mov     [rsi +  96], r13
 349         mov     [rsi + 104], r14
 350         mov     [rsi + 112], r15
 351         pop     rax                     // rflags, regs, flags, rbp, ..., rbx
 352         mov     [rsi +  32], rax
 353         pop     rax                     // regs, flags, rbp, ..., rbx
 354         mov     [rsi + 120], rax
 355
 356         add     rsp, 8                  // flags, rbp, ..., rbx
 357         popf                            // rbp, ..., rbx
 358         pop     rbp                     // ..., rbx
 359         pop     r15
 360         pop     r14
 361         pop     r13
 362         pop     r12
 363         pop     r11
 364         pop     r10
 365         pop     rbx                     //
 366         ret
 367
 368 #elif defined(__arm__)
 369
 370         stmfd   r13!, {r0, r1, r4-r11, r14}
 371         ldmia   r1, {r0-r12, r14}
 372         msr     cpsr, r14
 373         mov     r14, pc
 374         ldr     pc, [r13], #4
 375         ldr     r14, [r13], #4
 376         stmia   r14!, {r0-r12}
 377         mrs     r0, cpsr
 378         str     r0, [r14]
 379         ldmfd   r13!, {r4-r11, pc}
 380
 381 #elif defined(__aarch64__)
 382
 383         stp     x29, x30, [sp, #-13*8]!
 384         mov     x29, sp
 385         stp     x19, x20, [sp,  #16]
 386         stp     x21, x22, [sp,  #32]
 387         stp     x23, x24, [sp,  #48]
 388         stp     x25, x26, [sp,  #64]
 389         stp     x27, x28, [sp,  #80]
 390         str     x1, [sp, #96]
 391
 392         mov     x16, x0
 393
 394         ldr     x17,      [x1, #128]
 395         ldp     x14, x15, [x1, #112]
 396         ldp     x12, x13, [x1,  #96]
 397         ldp     x10, x11, [x1,  #80]
 398         ldp      x8,  x9, [x1,  #64]
 399         ldp      x6,  x7, [x1,  #48]
 400         ldp      x4,  x5, [x1,  #32]
 401         ldp      x2,  x3, [x1,  #16]
 402         ldp      x0,  x1, [x1,   #0]
 403         msr     nzcv, x17
 404
 405         blr     x16
 406
 407         ldr     x16, [sp, #96]
 408         mrs     x17, nzcv
 409         str     x17,      [x16, #128]
 410         stp     x14, x15, [x16, #112]
 411         stp     x12, x13, [x16,  #96]
 412         stp     x10, x11, [x16,  #80]
 413         stp      x8,  x9, [x16,  #64]
 414         stp      x6,  x7, [x16,  #48]
 415         stp      x4,  x5, [x16,  #32]
 416         stp      x2,  x3, [x16,  #16]
 417         stp      x0,  x1, [x16,   #0]
 418
 419         ldp     x19, x20, [sp,  #16]
 420         ldp     x21, x22, [sp,  #32]
 421         ldp     x23, x24, [sp,  #48]
 422         ldp     x25, x26, [sp,  #64]
 423         ldp     x27, x28, [sp,  #80]
 424         ldp     x29, x30, [sp], #13*8
 425
 426 #else
 427 #  error "not supported"
 428 #endif
 429
 430 endproc
 431
 432 proc    nop
 433
 434         ret
 435
 436 endproc
 437
 438 ///--------------------------------------------------------------------------
 439 /// 0x00--0x0f
 440
 441 proc    x00
 442
 443         // clear all 64 bits of extended traditional registers
 444
 445 #if defined(__x86_64__)
 446
 447         xor      eax, eax               // clear rax
 448         lea      rbx, [0]               // rbx -> _|_
 449         loop     .                      // iterate, decrement rcx until zero
 450         mov      rdx, 0                 // set rdx = 0
 451         and      esi, 0                 // clear all bits of rsi
 452         sub      edi, edi               // set rdi = edi - edi = 0
 453         push     0
 454         pop      rbp                    // pop 0 into rbp
 455
 456 #elif defined(__i386__)
 457
 458         xor     eax, eax
 459         lea     ebx, [0]
 460         loop    .
 461         mov     edx, 0
 462         and     esi, 0
 463         sub     edi, edi
 464         push    0
 465         pop     ebp
 466
 467 #elif defined(__arm__)
 468
 469         eor     r0, r0, r0
 470         rsb     r1, r1, r1
 471 0:      subs    r2, r2, #1
 472         bne     0b
 473         mov     r3, #0
 474         and     r4, r4, #0
 475         sub     r5, r5, r5
 476
 477 #elif defined(__aarch64__)
 478
 479         eor     w0, w0, w0
 480         mov     w1, wzr
 481 0:      sub     w2, w2, #1
 482         cbnz    w2, 0b
 483         mov     w3, #0
 484         and     w4, w4, wzr
 485         sub     w5, w5, w5
 486
 487 #else
 488         notimpl
 489 #endif
 490
 491         ret
 492
 493 endproc
 494
 495 proc    x01
 496
 497         // advance a fibonacci pair by c steps
 498         //
 499         // on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1}
 500         // and f_{i+c}, where f_{i+1} = f_i + f_{i-1}
 501
 502 #if defined(__x86_64__)
 503
 504 0:      xadd    rax, rdx                // a, d = a + d, a
 505                                         //      = f_{i+1} + f_i, f_{i+1}
 506                                         //      = f_{i+2}, f_{i+1}
 507         loop    0b                      // advance i, decrement c, iterate
 508
 509 #elif defined(__i386__)
 510
 511 0:      xadd    eax, edx
 512         loop    0b
 513
 514 #elif defined(__arm__)
 515
 516 0:      subs    r2, r2, #2
 517         add     r3, r3, r0
 518         blo     8f
 519         add     r0, r0, r3
 520         bhi     0b
 521
 522 8:      movne   r0, r3
 523
 524 #elif defined(__aarch64__)
 525
 526 0:      subs    x2, x2, #2
 527         add     x3, x3, x0
 528         b.lo    8f
 529         add     x0, x0, x3
 530         b.hi    0b
 531
 532 8:      cmov.ne x0, x3
 533
 534 #else
 535         notimpl
 536 #endif
 537
 538         ret
 539
 540 endproc
 541
 542 proc    x02
 543
 544         // boolean canonify a: if a = 0 on entry, leave it zero; otherwise
 545         // set a = 1
 546
 547 #if defined(__x86_64__)
 548
 549         neg     rax                     // set cf iff a /= 0
 550         sbb     rax, rax                // a = a - a - cf = -cf
 551         neg     rax                     // a = cf
 552
 553 #elif defined(__i386__)
 554
 555         neg     eax
 556         sbb     eax, eax
 557         neg     eax
 558
 559 #elif defined(__arm__)
 560
 561         movs    r1, r0                  // the easy way
 562         movne   r1, #1                  // mvnne r1, #1 for mask
 563
 564         cmp     r0, #1                  // clear cf iff a == 0
 565         sbc     r2, r0, r0              // c' = a - a - 1 + cf = cf - 1
 566         add     r2, r2, #1              // c' = cf
 567
 568         sub     r3, r0, r0, lsr #1      // d' top bit clear; d' = 0 iff a = 0
 569         rsb     r3, r3, #0              // d' top bit set iff a /= 0
 570         mov     r3, r3, lsr #31         // asr for mask
 571
 572         rsbs    r0, r0, #0
 573         sbc     r0, r0, r0
 574         rsb     r0, r0, #0
 575
 576 #elif defined(__aarch64__)
 577
 578         cmp     x0, #0                  // trivial
 579         cset.ne x1                      // csetm for mask
 580
 581         cmp     xzr, x0                 // set cf iff a == 0
 582         sbc     x2, x0, x0              // c' = a - a - 1 + cf = cf - 1
 583         neg     x2, x2                  // c' = 1 - cf
 584
 585         sub     x3, x0, x0, lsr #1      // if a < 2^63 then a' = ceil(d/2) <
 586                                         // 2^63
 587                                         // if a >= 2^63, write a = 2^63 + t
 588                                         // with t < 2^63; d' = 2^63 - 2^62 +
 589                                         // ceil(t/2) = 2^62 + ceil(t/2), and
 590                                         // ceil(t/2) < 2^62
 591                                         // anyway d' < 2^63 and d' = 0 iff
 592                                         // a = 0
 593         neg     x3, x3                  // d' top bit set iff a /= 0
 594         lsr     x3, x3, #63             // asr for mask
 595
 596         cmp     x0, #1                  // set cf iff a /= 0
 597         adc     x0, xzr, xzr            // a' = 0 + 0 + cf = cf
 598
 599 #else
 600         notimpl
 601 #endif
 602
 603         ret
 604
 605 endproc
 606
 607 proc    x03
 608
 609         // set a = min(a, d) (unsigned); clobber c, d
 610
 611 #if defined(__x86_64__)
 612
 613         sub     rdx, rax                // d' = d - a; set cf if a > d
 614         sbb     rcx, rcx                // c = -cf = -[a > d]
 615         and     rcx, rdx                // c = a > d ? d - a : 0
 616         add     rax, rcx                // a' = a > d ? d : a
 617
 618 #elif defined(__i386__)
 619
 620         sub     edx, eax
 621         sbb     ecx, ecx
 622         and     ecx, edx
 623         add     eax, ecx
 624
 625 #elif defined(__arm__)
 626
 627         cmp     r0, r3                  // the easy way
 628         movlo   r1, r0                  // only needed for out-of-place
 629         movhs   r1, r3
 630
 631         subs    r3, r3, r0
 632         sbc     r12, r12, r12
 633         and     r12, r12, r3
 634         add     r0, r0, r12
 635
 636 #elif defined(__aarch64__)
 637
 638         cmp     x0, x3                  // the easy way
 639         csel.lo x1, x0, x3
 640
 641         subs    x3, x3, x0              // d' = d - a; set cf if d >= a
 642         sbc     x16, xzr, xzr           // t = -1 + cf = -[a > d]
 643         and     x16, x16, x3            // t = a > d ? d - a : 0
 644         add     x0, x0, x16             // a' = a > d ? d : a
 645
 646 #else
 647         notimpl
 648 #endif
 649
 650         ret
 651
 652 endproc
 653
 654 proc    x04
 655
 656         // switch case?
 657
 658 #if defined(__x86_64__)
 659
 660   // unrelated playing
 661   mov   ecx, eax
 662   mov   rbx, -1
 663   mov   edx, ecx
 664   sub   edx, '0'
 665   cmp   edx, 10
 666   cmovb rbx, rdx
 667   or    ecx, 0x20
 668   mov   edx, ecx
 669   sub   edx, 'a'
 670   sub   ecx, 'a' - 10
 671   cmp   edx, 6
 672   cmovb rbx, rcx
 673
 674         xor     al, 0x20
 675
 676 #elif defined(__i386__)
 677
 678   // unrelated playing
 679   mov   ecx, eax
 680   mov   ebx, -1
 681   mov   edx, ecx
 682   sub   edx, '0'
 683   cmp   edx, 10
 684   cmovb ebx, edx
 685   or    ecx, 0x20
 686   mov   edx, ecx
 687   sub   edx, 'a'
 688   sub   ecx, 'a' - 10
 689   cmp   edx, 6
 690   cmovb ebx, ecx
 691
 692         xor     al, 0x20
 693
 694 #elif defined(__arm__)
 695
 696   // unrelated playing
 697   mvn   r1, #0
 698   sub   r12, r0, #'0'
 699   cmp   r12, #10
 700   movlo r1, r12
 701   orr   r12, r0, #0x20
 702   sub   r12, r12, #'a'
 703   cmp   r12, #6
 704   addlo r1, r12, #10
 705
 706         eor     r0, r0, #0x20
 707
 708 #elif defined(__aarch64__)
 709
 710   // unrelated playing
 711   mov   x1, #-1
 712   sub   w16, w0, #'0'
 713   cmp   w16, #10
 714   cmov.lo       x1, x16
 715   orr   w16, w0, #0x20
 716   sub   w16, w16, #'a' - 10
 717   cmp   w16, #10
 718   ccmp.hs       w16, #16, #CCMP_HS
 719   cmov.lo       x1, x16
 720
 721         eor     w0, w0, #0x20
 722
 723 #else
 724         notimpl
 725 #endif
 726
 727         ret
 728
 729 endproc
 730
 731 proc    x05
 732
 733         // answer whether 5 <= a </<= 9.
 734
 735 #if defined(__x86_64__)
 736
 737         sub     rax, 5                  // a' = a - 5
 738         cmp     rax, 4                  // is a' - 5 </<= 4?
 739
 740         // cc           a'                      a
 741         //
 742         // z/e          a' = 4                  a = 9
 743         // nz/ne        a' /= 4                 a /= 9
 744         //
 745         // a/nbe        a' > 4                  a > 9 or a < 5
 746         // nc/ae/nb     a' >= 4                 a >= 9 or a < 5
 747         // c/b/nae      a' < 4                  5 <= a < 9
 748         // be/na        a' <= 4                 5 <= a <= 9
 749         //
 750         // o            a' < -2^63 + 4          -2^63 + 5 <= a < -2^63 + 9
 751         // no           a' >= -2^63 + 4         a >= -2^63 + 9 or
 752         //                                              a < -2^63 + 5
 753         // s            -2^63 + 4 <= a' < 4     -2^63 + 9 <= a < 9
 754         // ns           a' < -2^63 + 4 or       a < -2^63 + 9 or a >= 9
 755         //                      a' >= 4
 756         // ge/nl        a' >= 4                 a >= 9 or a < -2^63 + 5
 757         // l/nge        a' < 4                  -2^63 + 5 <= a < 9
 758         // g/nle        a' > 4                  a > 9 or a < -2^63 + 5
 759         // le/ng        a' <= 4                 -2^63 + 5 <= a <= 9
 760
 761 #elif defined(__i386__)
 762
 763         sub     eax, 5
 764         cmp     eax, 4
 765
 766 #elif defined(__arm__)
 767
 768         // i dimly remember having a slick way to do this way back in the
 769         // day, but i can't figure it out any more.
 770         sub     r0, #5
 771         cmp     r0, #4
 772
 773 #elif defined(__aarch64__)
 774
 775         // literal translation is too obvious
 776         cmp     x0, #5
 777         ccmp.hs x0, #9, #CCMP_HS
 778
 779 #else
 780         notimpl
 781 #endif
 782
 783         ret
 784
 785 endproc
 786
 787 proc    x06
 788
 789         // leave a unchanged, but set zf if a = 0, cf if a /= 0, clear of,
 790         // set sf to msb(a)
 791
 792 #if defined(__x86_64__)
 793
 794         not     rax                     // a' = -a - 1
 795         inc     rax                     // a' = -a
 796         neg     rax                     // a' = a
 797
 798 #elif defined(__i386__)
 799
 800         not     eax
 801         inc     eax
 802         neg     eax
 803
 804 #elif defined(__arm__)
 805
 806         mvn     r0, r0
 807         add     r0, r0, #1
 808         rsbs    r0, r0, #0              // cf has opposite sense
 809
 810 #elif defined(__aarch64__)
 811
 812         mvn     x0, x0
 813         add     x0, x0, #1
 814         negs    x0, x0                  // cf has opposite sense
 815
 816 #else
 817         notimpl
 818 #endif
 819
 820         ret
 821
 822 endproc
 823
 824 proc    x07
 825
 826         // same as before (?)
 827
 828 #if defined(__x86_64__)
 829
 830         inc     rax                     // a' = a + 1
 831         neg     rax                     // a' = -a - 1
 832         inc     rax                     // a' = -a
 833         neg     rax                     // a' = a
 834
 835 #elif defined(__i386__)
 836
 837         inc     eax
 838         neg     eax
 839         inc     eax
 840         neg     eax
 841
 842 #elif defined(__arm__)
 843
 844         add     r0, r0, #1
 845         rsb     r0, r0, #0
 846         add     r0, r0, #1
 847         rsbs    r0, r0, #0
 848
 849 #elif defined(__aarch64__)
 850
 851         add     x0, x0, #1
 852         neg     x0, x0
 853         add     x0, x0, #1
 854         negs    x0, x0                  // cf has opposite sense
 855
 856 #else
 857         notimpl
 858 #endif
 859
 860         ret
 861
 862 endproc
 863
 864 proc    x08
 865
 866         // floor((a + d)/2), correctly handling overflow conditions; final cf
 867         // is lsb(a + d), probably uninteresting
 868
 869 #if defined(__x86_64__)
 870
 871         add     rax, rdx                // cf || a' = a + d
 872         rcr     rax, 1                  // shift 65-bit result right by one
 873                                         // place; lsb moves into carry
 874
 875 #elif defined(__i386__)
 876
 877         add     eax, edx
 878         rcr     eax, 1
 879
 880 #elif defined(__arm__)
 881
 882         // like the two-instruction a64 version
 883         sub     r1, r3, r0
 884         add     r1, r0, r1, lsr #1
 885
 886         // the slick version, similar to the above
 887         adds    r0, r0, r3
 888         mov     r0, r0, rrx
 889
 890 #elif defined(__aarch64__)
 891
 892         // a64 lacks a32's rrx.  literal translation.
 893         adds    x1, x0, x3              // cf || a' = a + d
 894         adc     x16, xzr, xzr           // realize cf in extra register
 895         extr    x1, x16, x1, #1         // shift down one place
 896
 897         // two instruction version: clobbers additional register.  (if you
 898         // wanted the answer in any other register, even overwriting d, then
 899         // this is unnecessary.)  also depends on d >= a.
 900         sub     x16, x3, x0             // compute difference
 901         add     x0, x0, x16, lsr #1     // add half of it (rounded down)
 902
 903 #else
 904         notimpl
 905 #endif
 906
 907         ret
 908
 909 endproc
 910
 911 proc    x09
 912
 913         // a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3
 914         // (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8).
 915
 916 #if defined(__x86_64__)
 917
 918         shr     rax, 3                  // a' = floor(a/8); cf = 1 if a ==
 919                                         // 4, 5, 6, 7 (mod 8)
 920         adc     rax, 0                  // a' = floor(a/8) + cf
 921
 922 #elif defined(__i386__)
 923
 924         shr     eax, 3
 925         adc     eax, 0
 926
 927 #elif defined(__arm__)
 928
 929         movs    r0, r0, lsr #3
 930         adc     r0, r0, #0
 931
 932 #elif defined(__aarch64__)
 933
 934         tst     x0, #4
 935         orr     x0, xzr, x0, lsr #3
 936         cinc.ne x0, x0
 937
 938 #else
 939         notimpl
 940 #endif
 941
 942         ret
 943
 944 endproc
 945
 946 proc    x0a
 947
 948         // increment c-byte little-endian bignum at rdi
 949
 950 #if defined(__x86_64__)
 951
 952         add     byte ptr [rdi], 1
 953 0:      inc     rdi
 954         adc     byte ptr [rdi], 0
 955         loop    0b
 956
 957 #elif defined(__i386__)
 958
 959         add     byte ptr [edi], 1
 960 0:      inc     edi
 961         adc     byte ptr [edi], 0
 962         loop    0b
 963
 964 #elif defined(__arm__)
 965
 966         mov     r12, #256               // set initial carry
 967 0:      ldrb    r0, [r5]
 968         subs    r2, r2, #1
 969         add     r12, r0, r12, lsr #8
 970         strb    r12, [r5], #1
 971         bne     0b
 972
 973 #elif defined(__aarch64__)
 974
 975         mov     w17, #256               // set initial carry
 976 0:      ldrb    w16, [x5]
 977         sub     x2, x2, #1
 978         add     w17, w16, w17, lsr #8
 979         strb    w17, [x5], #1
 980         cbnz    x2, 0b
 981
 982 #else
 983         notimpl
 984 #endif
 985
 986         ret
 987
 988 endproc
 989
 990 proc    x0b
 991
 992         // negate double-precision d:a
 993
 994 #if defined(__x86_64__)
 995
 996         not     rdx                     // d' = -d - 1
 997         neg     rax                     // a' = -a;
 998                                         // cf = 1 iff a /= 0
 999         sbb     rdx, -1                 // d' = -d - cf
1000
1001 #elif defined(__i386__)
1002
1003         not     edx
1004         neg     eax
1005         sbb     edx, -1
1006
1007 #elif defined(__arm__)
1008
1009         // reverse subtract is awesome
1010         rsbs    r0, r0, #0
1011         rsc     r3, r3, #0
1012
1013 #elif defined(__aarch64__)
1014
1015         // easy way: everything is better with zero registers.
1016         negs    x0, x0
1017         ngc     x3, x3
1018
1019 #else
1020         notimpl
1021 #endif
1022
1023         ret
1024
1025 endproc
1026
1027 proc    x0c
1028
1029         // rotate is distributive over xor.
1030
1031 #if defined(__x86_64__)
1032
1033         // rax                          // = a_1 || a_0
1034         // rbx                          // = b_1 || b_0
1035         mov     rcx, rax                // = a_1 || a_0
1036
1037         xor     rcx, rbx                // = (a_1 XOR b_1) || (a_0 XOR b_0)
1038         ror     rcx, 0xd                // = (a_0 XOR b_0) || (a_1 XOR b_1)
1039
1040         ror     rax, 0xd                // = a_0 || a_1
1041         ror     rbx, 0xd                // = b_0 || b_1
1042         xor     rax, rbx                // = (a_0 XOR b_0) || (a_1 XOR b_1)
1043
1044         cmp     rax, rcx                // always equal
1045
1046 #elif defined(__i386__)
1047
1048         mov     ecx, eax                // = a_1 || a_0
1049
1050         xor     ecx, ebx                // = (a_1 XOR b_1) || (a_0 XOR b_0)
1051         ror     ecx, 0xd                // = (a_0 XOR b_0) || (a_1 XOR b_1)
1052
1053         ror     eax, 0xd                // = a_0 || a_1
1054         ror     ebx, 0xd                // = b_0 || b_1
1055         xor     eax, ebx                // = (a_0 XOR b_0) || (a_1 XOR b_1)
1056
1057         cmp     eax, ecx                // always equal
1058
1059 #elif defined(__arm__)
1060
1061
1062         // r0                           // = a_1 || a_0
1063         // r1                           // = b_1 || b_0
1064         eor     r2, r0, r1              // = (a_1 XOR b_1) || (a_0 XOR b_0)
1065         mov     r2, r2, ror #13         // = (a_0 XOR b_0) || (a_1 XOR b_1)
1066
1067         mov     r1, r1, ror #13         // = b_0 || b_1
1068         eor     r0, r1, r0, ror #13     // = (a_0 XOR b_0) || (a_1 XOR b_1)
1069
1070         cmp     r0, r2                  // always equal
1071
1072 #elif defined(__aarch64__)
1073
1074         // x0                           // = a_1 || a_0
1075         // x1                           // = b_1 || b_0
1076         eor     x2, x0, x1              // = (a_1 XOR b_1) || (a_0 XOR b_0)
1077         ror     x2, x2, #13             // = (a_0 XOR b_0) || (a_1 XOR b_1)
1078
1079         ror     x1, x1, #13             // = b_0 || b_1
1080         eor     x0, x1, x0, ror #13     // = (a_0 XOR b_0) || (a_1 XOR b_1)
1081
1082         cmp     x0, x2                  // always equal
1083
1084 #else
1085         notimpl
1086 #endif
1087
1088         ret
1089
1090 endproc
1091
1092 proc    x0d
1093
1094         // and is distributive over xor.
1095
1096 #if defined(__x86_64__)
1097
1098         mov     rdx, rbx                // = b
1099
1100         xor     rbx, rcx                // = b XOR c
1101         and     rbx, rax                // = a AND (b XOR c)
1102
1103         and     rdx, rax                // = a AND b
1104         and     rax, rcx                // = a AND c
1105         xor     rax, rdx                // = (a AND b) XOR (a AND c)
1106                                         // = a AND (b XOR c)
1107
1108         cmp     rax, rbx                // always equal
1109
1110 #elif defined(__i386__)
1111
1112         mov     edx, ebx                // = b
1113
1114         xor     ebx, ecx                // = b XOR c
1115         and     ebx, eax                // = a AND (b XOR c)
1116
1117         and     edx, eax                // = a AND b
1118         and     eax, ecx                // = a AND c
1119         xor     eax, edx                // = (a AND b) XOR (a AND c)
1120                                         // = a AND (b XOR c)
1121
1122         cmp     eax, ebx                // always equal
1123
1124 #elif defined(__arm__)
1125
1126         and     r3, r0, r1              // = a AND b
1127
1128         eor     r1, r1, r2              // = b XOR c
1129         and     r1, r1, r0              // = a AND (b XOR c)
1130
1131         and     r0, r0, r2              // = a AND c
1132         eor     r0, r0, r3              // = (a AND b) XOR (a AND c)
1133                                         // = a AND (b XOR c)
1134
1135         cmp     r0, r1                  // always equal
1136
1137 #elif defined(__aarch64__)
1138
1139         and     x3, x0, x1              // = a AND b
1140
1141         eor     x1, x1, x2              // = b XOR c
1142         and     x1, x1, x0              // = a AND (b XOR c)
1143
1144         and     x0, x0, x2              // = a AND c
1145         eor     x0, x0, x3              // = (a AND b) XOR (a AND c)
1146                                         // = a AND (b XOR c)
1147
1148         cmp     x0, x1                  // always equal
1149
1150 #else
1151         notimpl
1152 #endif
1153
1154         ret
1155
1156 endproc
1157
1158 proc    x0e
1159
1160         // de morgan's law
1161
1162 #if defined(__x86_64__)
1163
1164         mov     rcx, rax                // = a
1165
1166         and     rcx, rbx                // = a AND b
1167         not     rcx                     // = NOT (a AND b)
1168
1169         not     rax                     // = NOT a
1170         not     rbx                     // = NOT b
1171         or      rax, rbx                // = (NOT a) OR (NOT b)
1172                                         // = NOT (a AND b)
1173
1174         cmp     rax, rcx                // always equal
1175
1176 #elif defined(__i386__)
1177
1178         mov     ecx, eax                // = a
1179
1180         and     ecx, ebx                // = a AND b
1181         not     ecx                     // = NOT (a AND b)
1182
1183         not     eax                     // = NOT a
1184         not     ebx                     // = NOT b
1185         or      eax, ebx                // = (NOT a) OR (NOT b)
1186                                         // = NOT (a AND b)
1187
1188         cmp     eax, ecx                // always equal
1189
1190 #elif defined(__arm__)
1191
1192         and     r2, r0, r1              // = a AND b
1193         mvn     r2, r2                  // = NOT (a AND b)
1194
1195         mvn     r0, r0                  // = NOT a
1196         mvn     r1, r1                  // = NOT b
1197         orr     r0, r0, r1              // = (NOT a) OR (NOT b)
1198
1199         cmp     r0, r2                  // always equal
1200
1201 #elif defined(__aarch64__)
1202
1203         and     x2, x0, x1              // = a AND b
1204         mvn     x2, x2                  // = NOT (a AND b)
1205
1206         mvn     x0, x0                  // = NOT a
1207         orn     x0, x0, x1              // = (NOT a) OR (NOT b)
1208
1209         cmp     x0, x2                  // always equal
1210
1211 #else
1212         notimpl
1213 #endif
1214
1215         ret
1216
1217 endproc
1218
1219 proc    x0f
1220
1221         // replace input buffer bytes with cumulative XORs with initial a;
1222         // final a is XOR of all buffer bytes and initial a.
1223         //
1224         // not sure why you'd do this.
1225
1226 #if defined(__x86_64__)
1227
1228 0:      xor     [rsi], al
1229         lodsb
1230         loop    0b
1231
1232 #elif defined(__i386__)
1233
1234 0:      xor     [esi], al
1235         lodsb
1236         loop    0b
1237
1238 #elif defined(__arm__)
1239
1240 0:      ldrb    r12, [r4]
1241         subs    r2, r2, #1
1242         eor     r0, r0, r12
1243         strb    r0, [r4], #1
1244         bne     0b
1245
1246 #elif defined(__aarch64__)
1247
1248 0:      ldrb    w16, [x4]
1249         sub     x2, x2, #1
1250         eor     w0, w0, w16
1251         strb    w0, [x4], #1
1252         cbnz    x2, 0b
1253
1254 #else
1255         notimpl
1256 #endif
1257
1258         ret
1259
1260 endproc
1261
1262 ///--------------------------------------------------------------------------
1263 /// 0x10--0x1f
1264
1265 proc    x10
1266
1267         // four different ways to swap a pair of registers.
1268
1269 #if defined(__x86_64__)
1270
1271         push    rax
1272         push    rcx
1273         pop     rax
1274         pop     rcx
1275
1276         xor     rax, rcx
1277         xor     rcx, rax
1278         xor     rax, rcx
1279
1280         add     rax, rcx
1281         sub     rcx, rax
1282         add     rax, rcx
1283         neg     rcx
1284
1285         xchg    rax, rcx
1286
1287 #elif defined(__i386__)
1288
1289         push    eax
1290         push    ecx
1291         pop     eax
1292         pop     ecx
1293
1294         xor     eax, ecx
1295         xor     ecx, eax
1296         xor     eax, ecx
1297
1298         add     eax, ecx
1299         sub     ecx, eax
1300         add     eax, ecx
1301         neg     ecx
1302
1303         xchg    eax, ecx
1304
1305 #elif defined(__arm__)
1306
1307         stmfd   r13!, {r0, r2}
1308         ldr     r0, [r13, #4]
1309         ldr     r2, [r13], #8
1310
1311         eor     r0, r0, r2
1312         eor     r2, r2, r0
1313         eor     r0, r0, r2
1314
1315         sub     r0, r0, r2
1316         add     r2, r2, r0
1317         rsb     r0, r0, r2              // don't need 3-addr with reverse-sub
1318
1319         mov     r12, r0
1320         mov     r0, r2
1321         mov     r2, r0
1322
1323 #elif defined(__aarch64__)
1324
1325         // anything you can do
1326         stp     x0, x2, [sp, #-16]!
1327         ldp     x2, x0, [sp], #16
1328
1329         eor     x0, x0, x2
1330         eor     x2, x2, x0
1331         eor     x0, x0, x2
1332
1333         // the add/sub/add thing was daft.  you can do it in three if you're
1334         // clever -- and have three-address operations.
1335         sub     x0, x0, x2
1336         add     x2, x2, x0
1337         sub     x0, x2, x0
1338
1339         // but we lack a fourth.  we can't do this in fewer than three
1340         // instructions without hitting memory.  only `ldp' will modify two
1341         // registers at a time, so we need at least two instructions -- but
1342         // if the first one sets one of our two registers to its final value
1343         // then we lose the other input value with no way to recover it, so
1344         // we must either write a fresh third register, or write something
1345         // other than the final value, and in both cases we need a third
1346         // instruction to fix everything up.  we've done the wrong-something-
1347         // other trick twice, so here's the captain-obvious use-a-third-
1348         // register version.
1349         mov     x16, x0
1350         mov     x0, x2
1351         mov     x2, x16
1352
1353 #else
1354         notimpl
1355 #endif
1356
1357         ret
1358
1359 endproc
1360
1361 proc    x11
1362
1363         // assuming a is initialized to zero, set a to the inclusive or of
1364         // the xor-differences of corresponding bytes in the c-byte strings
1365         // at si and di.
1366         //
1367         // in particular, a will be zero (and zf set) if and only if the two
1368         // strings are equal.
1369
1370 #if defined(__x86_64__)
1371
1372 0:      mov     dl, [rsi]
1373         xor     dl, [rdi]
1374         inc     rsi
1375         inc     rdi
1376         or      al, dl
1377         loop    0b
1378
1379 #elif defined(__i386__)
1380
1381 0:      mov     dl, [esi]
1382         xor     dl, [edi]
1383         inc     esi
1384         inc     edi
1385         or      al, dl
1386         loop    0b
1387
1388 #elif defined(__arm__)
1389
1390 0:      ldrb    r1, [r4], #1
1391         ldrb    r12, [r5], #1
1392         subs    r2, r2, #1
1393         eor     r12, r12, r1
1394         orr     r0, r0, r12
1395         bne     0b
1396
1397 #elif defined(__aarch64__)
1398
1399 0:      ldrb    w16, [x4], #1
1400         ldrb    w17, [x5], #1
1401         sub     x2, x2, #1
1402         eor     w16, w16, w17
1403         orr     w0, w0, w16
1404         cbnz    x2, 0b
1405
1406 #else
1407         notimpl
1408 #endif
1409
1410         ret
1411
1412 endproc
1413
1414 proc    x12
1415
1416         // an obtuse way of adding two registers.  for any bit position, a
1417         // OR d is set if and only if at least one of a and d has a bit set
1418         // in that position, and a AND d is set if and only if both have a
1419         // bit set in that position.  essentially, then, what we've done is
1420         // move all of the set bits in d to a, unless there's already a bit
1421         // there.  this clearly doesn't change the sum.
1422
1423 #if defined(__x86_64__)
1424
1425         mov     rcx, rdx                // c' = d
1426         and     rdx, rax                // d' = a AND d
1427         or      rax, rcx                // a' = a OR d
1428         add     rax, rdx
1429
1430 #elif defined(__i386__)
1431
1432         mov     ecx, edx                // c' = d
1433         and     edx, eax                // d' = a AND d
1434         or      eax, ecx                // a' = a OR d
1435         add     eax, edx
1436
1437 #elif defined(__arm__)
1438
1439         and     r2, r0, r3              // c' = a AND d
1440         orr     r0, r0, r3              // a' = a OR d
1441         add     r0, r0, r2
1442
1443 #elif defined(__aarch64__)
1444
1445         and     x2, x0, x3              // c' = a AND d
1446         orr     x0, x0, x3              // a' = a OR d
1447         add     x0, x0, x2
1448
1449 #else
1450         notimpl
1451 #endif
1452
1453         ret
1454
1455 endproc
1456
1457 proc    x13
1458
1459         // ok, so this is a really obtuse way of adding a and b; the result
1460         // is in a and d.  but why does it work?
1461
1462 #if defined(__x86_64__)
1463
1464         mov     rcx, 0x40               // carry chains at most 64 long
1465 0:      mov     rdx, rax                // copy a'
1466         xor     rax, rbx                // low bits of each bitwise sum
1467         and     rbx, rdx                // carry bits from each bitwise sum
1468         shl     rbx, 1                  // carry them into next position
1469         loop    0b
1470
1471 #elif defined(__i386__)
1472
1473         mov     ecx, 0x40               // carry chains at most 64 long
1474 0:      mov     edx, eax                // copy a'
1475         xor     eax, ebx                // low bits of each bitwise sum
1476         and     ebx, edx                // carry bits from each bitwise sum
1477         shl     ebx, 1                  // carry them into next position
1478         loop    0b
1479
1480 #elif defined(__arm__)
1481
1482         mov     r2, #0x40
1483 0:      and     r3, r0, r1
1484         subs    r2, r2, #1
1485         eor     r0, r0, r1
1486         lsl     r1, r3, #1
1487         bne     0b
1488
1489 #elif defined(__aarch64__)
1490
1491         mov     x2, #0x40
1492 0:      and     x3, x0, x1
1493         sub     x2, x2, #1
1494         eor     x0, x0, x1
1495         lsl     x1, x3, #1
1496         cbnz    x2, 0b
1497
1498 #else
1499         notimpl
1500 #endif
1501
1502         ret
1503
1504 endproc
1505
1506 proc    x14
1507
1508         // floor((a + d)/2), like x08.
1509
1510 #if defined(__x86_64__)
1511
1512         mov     rcx, rax                // copy a for later
1513         and     rcx, rdx                // carry bits
1514
1515         xor     rax, rdx                // low bits of each bitwise sum
1516         shr     rax, 1                  // divide by 2; carries now in place
1517
1518         add     rax, rcx                // add the carries; done
1519
1520 #elif defined(__i386__)
1521
1522         mov     ecx, eax                // copy a for later
1523         and     ecx, edx                // carry bits
1524
1525         xor     eax, edx                // low bits of each bitwise sum
1526         shr     eax, 1                  // divide by 2; carries now in place
1527
1528         add     eax, ecx                // add the carries; done
1529
1530 #elif defined(__arm__)
1531
1532         and     r2, r0, r3
1533         eor     r0, r0, r3
1534         add     r0, r2, r0, lsr #1
1535
1536 #elif defined(__aarch64__)
1537
1538         and     x2, x0, x3
1539         eor     x0, x0, x3
1540         add     x0, x2, x0, lsr #1
1541
1542 #else
1543         notimpl
1544 #endif
1545
1546         ret
1547
1548 endproc
1549
1550 proc    x15
1551
1552         // sign extension 32 -> 64 bits.
1553
1554 #if defined(__x86_64__)
1555
1556         movsx   rbx, eax                // like this?
1557
1558         mov     rdx, 0xffffffff80000000
1559         add     rax, rdx                // if bit 31 of a is set then bits
1560                                         // 31--63 of a' are clear; otherwise,
1561                                         // these bits are all set -- which is
1562                                         // exactly backwards
1563         xor     rax, rdx                // so fix it
1564
1565 #elif defined(__i386__)
1566
1567         movsx   ebx, ax                 // like this?
1568
1569         mov     edx, 0xffff8000
1570         add     eax, edx                // if bit 31 of a is set then bits
1571                                         // 31--63 of a' are clear; otherwise,
1572                                         // these bits are all set -- which is
1573                                         // exactly backwards
1574         xor     eax, edx                // so fix it
1575
1576 #elif defined(__arm__)
1577
1578         sxth    r1, r0                  // like this
1579
1580         mov     r12, #0x80000000
1581         add     r0, r0, r12, asr #16
1582         eor     r0, r0, r12, asr #16
1583
1584 #elif defined(__aarch64__)
1585
1586         sxtw    x1, w0                  // like this
1587
1588         mov     x16, #0xffffffff80000000
1589         add     x0, x0, x16
1590         eor     x0, x0, x16
1591
1592 #else
1593         notimpl
1594 #endif
1595
1596         ret
1597
1598 endproc
1599
1600 proc    x16
1601
1602         // ??? i don't know why you'd want to calculate this.
1603
1604 #if defined(__x86_64__)
1605
1606         xor     rax, rbx                // a' = a XOR b
1607         xor     rbx, rcx                // b' = b XOR c
1608         mov     rsi, rax                // t = a XOR b
1609         add     rsi, rbx                // t = (a XOR b) + (b XOR c)
1610         cmovc   rax, rbx                // a' = cf ? b XOR c : a XOR b
1611         xor     rax, rbx                // a' = cf ? 0 : a XOR c
1612         cmp     rax, rsi
1613
1614 #elif defined(__i386__)
1615
1616         xor     eax, ebx                // a' = a XOR b
1617         xor     ebx, ecx                // b' = b XOR c
1618         mov     esi, eax                // t = a XOR b
1619         add     esi, ebx                // t = (a XOR b) + (b XOR c)
1620         cmovc   eax, ebx                // a' = cf ? b XOR c : a XOR b
1621         xor     eax, ebx                // a' = cf ? 0 : a XOR c
1622         cmp     eax, esi
1623
1624 #elif defined(__arm__)
1625
1626         eor     r0, r0, r1
1627         eor     r1, r1, r2
1628         adds    r4, r0, r1
1629         movcs   r0, r1
1630         eor     r0, r0, r1
1631         cmp     r0, r4
1632
1633 #elif defined(__aarch64__)
1634
1635         eor     x0, x0, x1
1636         eor     x1, x1, x2
1637         adds    x4, x0, x1
1638         cmov.cs x0, x1
1639         eor     x0, x0, x1
1640         cmp     x0, x4
1641
1642 #else
1643         notimpl
1644 #endif
1645
1646         ret
1647
1648 endproc
1649
1650 proc    x17
1651
1652         // absolute value
1653
1654 #if defined(__x86_64__)
1655
1656         cqo                             // d = a < 0 ? -1 : 0
1657         xor     rax, rdx                // a' = a < 0 ? -a - 1 : a
1658         sub     rax, rdx                // a' = a < 0 ? -a : a
1659
1660 #elif defined(__i386__)
1661
1662         cdq                             // d = a < 0 ? -1 : 0
1663         xor     eax, edx                // a' = a < 0 ? -a - 1 : a
1664         sub     eax, edx                // a' = a < 0 ? -a : a
1665
1666 #elif defined(__arm__)
1667
1668         // direct approach
1669         movs    r1, r0
1670         rsbmi   r1, r0, #0
1671
1672         // faithful-ish conversion
1673         eor     r3, r0, r0, asr #31
1674         sub     r0, r3, r0, asr #31
1675
1676 #elif defined(__aarch64__)
1677
1678         // direct approach
1679         tst     x0, #1 << 63
1680         cneg.ne x1, x0
1681
1682         // faithful-ish conversion
1683         eor     x3, x0, x0, asr #63
1684         sub     x0, x3, x0, asr #63
1685
1686 #else
1687         notimpl
1688 #endif
1689
1690         ret
1691
1692 endproc
1693
1694 proc    x18
1695
1696         // should always set sf, clear zf, unless we get rescheduled to a
1697         // different core.
1698
1699 #if defined(__x86_64__)
1700
1701         rdtsc                           // d || a = cycles
1702         shl     rdx, 0x20
1703         or      rax, rdx                // a = cycles
1704         mov     rcx, rax                // c = cycles
1705
1706         rdtsc                           // d || a = cycles'
1707         shl     rdx, 0x20
1708         or      rax, rdx                // a = cycles'
1709
1710         cmp     rcx, rax
1711
1712 #elif defined(__i386__)
1713
1714         rdtsc                           // d || a = cycles
1715         mov     ebx, eax
1716         mov     ecx, edx                // c || b = cycles
1717
1718         rdtsc                           // d || a = cycles'
1719
1720         sub     ebx, eax
1721         sbb     ecx, edx
1722
1723 #elif defined(__arm__)
1724
1725         // cycle clock not available in user mode
1726         mrrc    p15, 0, r0, r1, c9
1727         mrrc    p15, 0, r2, r3, c9
1728         subs    r0, r0, r2
1729         sbcs    r1, r1, r3
1730
1731 #elif defined(__aarch64__)
1732
1733         // cycle clock not available in user mode
1734         mrs     x0, pmccntr_el0
1735         mrs     x1, pmccntr_el0
1736         cmp     x0, x1
1737
1738 #else
1739         notimpl
1740 #endif
1741
1742         ret
1743
1744 endproc
1745
1746 proc    x19
1747
1748         // stupid way to capture a pointer to inline data and jump past it.
1749         // confuses the return-address predictor something chronic.  worse
1750         // because amd64 calling convention doesn't usually pass arguments on
1751         // the stack.
1752
1753 #if defined(__x86_64__)
1754
1755         call    8f
1756         .string "hello world!\n\0"
1757 8:      call    print_str
1758         add     rsp, 8
1759         ret
1760
1761 print_str:
1762         // actually implement this ridiculous thing
1763         mov     rsi, [rsp + 8]
1764         xor     edx, edx
1765 0:      mov     al, [rsi + rdx]
1766         inc     rdx
1767         cmp     al, 0
1768         jnz     0b
1769         mov     eax, SYS_write
1770         mov     edi, 1
1771         dec     rdx
1772         syscall                         // clobbers r11 :-(
1773         ret
1774
1775 #elif defined(__i386__)
1776
1777         call    8f
1778         .string "hello world!\n\0"
1779 8:      call    print_str
1780         add     esp, 4
1781         ret
1782
1783 print_str:
1784         // actually implement this ridiculous thing
1785         mov     ecx, [esp + 4]
1786         xor     edx, edx
1787 0:      mov     al, [ecx + edx]
1788         inc     edx
1789         cmp     al, 0
1790         jnz     0b
1791         mov     eax, SYS_write
1792         mov     ebx, 1
1793         dec     edx
1794         int     0x80
1795         ret
1796
1797 #elif defined(__arm__)
1798
1799         // why am i doing this?
1800         stmfd   r13!, {r14}
1801         bl      8f
1802         .string "hello world!\n\0"
1803         .balign 4
1804 8:      mov     r1, r14               // might as well make it easy on myself
1805         bl      print_str
1806         ldmfd   r13!, {pc}
1807
1808 print_str:
1809         mov     r2, #0
1810 0:      ldrb    r0, [r1, r2]
1811         cmp     r0, #0
1812         addne   r2, r2, #1
1813         bne     0b
1814         mov     r0, #1
1815         mov     r7, #SYS_write
1816         swi     0
1817         bx      r14
1818
1819 #elif defined(__aarch64__)
1820
1821         // why am i doing this?
1822         str     x30, [sp, #-16]!
1823         bl      8f
1824         .string "hello world!\n\0"
1825         .balign 4
1826 8:      mov     x1, x30               // might as well make it easy on myself
1827         bl      print_str
1828         ldr     x30, [sp], #16
1829         ret
1830
1831 print_str:
1832         mov     x2, #0
1833 0:      ldrb    w0, [x1, x2]
1834         cmp     w0, #0
1835         cinc.ne x2, x2
1836         b.ne    0b
1837         mov     x0, #1
1838         mov     x8, #SYS_write
1839         svc     #0
1840         ret
1841
1842 #else
1843         notimpl
1844 #endif
1845
1846 endproc
1847
1848 proc    x1a
1849
1850         // collect the current instruction-pointer address.  this was an old
1851         // 32-bit i386 trick for position-independent code, but (a) it
1852         // confuses the return predictor, and (b) amd64 has true pc-relative
1853         // addressing.
1854
1855 #if defined(__x86_64__)
1856
1857         // the actual example
1858         call    0f
1859 0:      pop     rax
1860
1861         // the modern i386 trick doesn't confuse the return-address
1862         // predictor.
1863         call    calladdr_rbx
1864         sub     rbx, . - 0b
1865
1866         // but rip-relative addressing is even better
1867         lea     rcx, [rip + 0b]
1868
1869         ret
1870
1871 calladdr_rbx:
1872         mov     rbx, [rsp]
1873         ret
1874
1875 #elif defined(__i386__)
1876
1877         // the actual example
1878         call    0f
1879 0:      pop     eax
1880
1881         // the modern i386 trick doesn't confuse the return-address
1882         // predictor.
1883         call    get_pc_ebx
1884         sub     ebx, . - 0b
1885
1886         ret
1887
1888 #elif defined(__arm__)
1889
1890         stmfd   r13!, {r14}
1891
1892         bl      0f
1893 0:      mov     r0, r14
1894
1895         bl      return
1896         sub     r1, r14, #. - 0b
1897
1898         adr     r2, 0b
1899
1900         ldmfd   r13!, {pc}
1901
1902 return: bx      r14
1903
1904 #elif defined(__aarch64__)
1905
1906         str     x30, [sp, #-16]!
1907
1908         // we can do all of the above using a64
1909         bl      0f
1910 0:      mov     x0, x30
1911
1912         bl      return
1913         sub     x1, x30, #. - 0b
1914
1915         adr     x2, 0b
1916
1917         ldr     x30, [sp], #16
1918 return: ret
1919
1920 #else
1921         notimpl
1922 #endif
1923
1924 endproc
1925
1926 proc    x1b
1927
1928 #if defined(__x86_64__)
1929
1930         // retpolines: an mitigation against adversarially influenced
1931         // speculative execution at indirect branches.  if an adversary can
1932         // prepare a branch-target buffer entry matching an indirect branch
1933         // in the victim's address space then they can cause the victim to
1934         // /speculatively/ (but not architecturally) execute any code in
1935         // their address space, possibly leading to leaking secrets through
1936         // the cache.  retpolines aren't susceptible to this because the
1937         // predicted destination address is from the return-prediction stack
1938         // which the adversary can't prime.  the performance penalty is still
1939         // essentially a branch misprediction -- for this return, and
1940         // possibly all others already stacked.
1941
1942         // (try not to crash)
1943         lea     rax, [rip + 9f]
1944
1945         push    rax
1946 9:      ret
1947
1948 #elif defined(__i386__)
1949
1950         call    get_pc_ebx
1951         lea     eax, [ebx + 9f - .]
1952
1953         push    eax
1954 9:      ret
1955
1956 #elif defined(__arm__)
1957
1958         stmfd   r13!, {r14}
1959
1960         adr     r14, 8f
1961         bx      r14
1962
1963 8:      ldmfd   r13!, {pc}
1964
1965 #elif defined(__aarch64__)
1966
1967         str     x30, [sp, #-16]!
1968
1969         adr     x30, 8f
1970         ret
1971
1972 8:      ldr     x30, [sp], #16
1973         ret
1974
1975 #else
1976         notimpl
1977 #endif
1978
1979 endproc
1980
1981 proc    x1c
1982
1983         // ok, having a hard time seeing a use for this.  the most important
1984         // thing to note is that sp is set from `pop' /after/ it's
1985         // incremented.
1986
1987 #if defined(__x86_64__)
1988
1989         // try not to crash
1990         mov     rax, rsp
1991         and     rsp, -16
1992         push    rax
1993
1994         pop     rsp
1995
1996         // check it worked
1997         mov     rbx, rsp
1998         ret
1999
2000 #elif defined(__i386__)
2001
2002         // try not to crash
2003         mov     eax, esp
2004         and     esp, -16
2005         push    eax
2006
2007         pop     esp
2008
2009         // check it worked
2010         mov     ebx, esp
2011         ret
2012
2013 #elif defined(__arm__)
2014
2015         // not even going to dignify this
2016         notimpl
2017
2018 #elif defined(__aarch64__)
2019
2020         // not even going to dignify this
2021         notimpl
2022
2023 #else
2024         notimpl
2025 #endif
2026
2027 endproc
2028
2029 proc    x1d
2030
2031         // monumentally cheesy way to copy 8 n bytes from buff1 to buff2.
2032         // also clobbers words at buff2 + 8 n and buff2 - 8 for good measure.
2033
2034         n = 4
2035
2036 #if defined(__x86_64__)
2037
2038         mov     rax, rsp                        // safekeeping
2039
2040         // we're toast if we get hit by a signal now.  fingers crossed...
2041   .if 0
2042         mov     rsp, buff2 + 8*n + 8
2043         mov     rbp, buff1 + 8*n
2044   .else
2045         lea     rsp, [rdi + 8*n + 16]
2046         lea     rbp, [rsi + 8*n]
2047   .endif
2048         enter   0, n + 1
2049
2050         // precise action:
2051         //
2052         //         +---------+                  +---------+
2053         //  rbp -> |   ???   |           rsp -> |   ???   |
2054         //         +---------+                  +---------+
2055         //         | w_{n-1} |                  |   rbp   | <- rbp'
2056         //         +---------+                  +---------+
2057         //         |   ...   |                  | w_{n-1} |
2058         //         +---------+                  +---------+
2059         //         |   w_1   |                  |   ...   |
2060         //         +---------+                  +---------+
2061         //         |   w_0   |                  |   w_1   |
2062         //         +---------+                  +---------+
2063         //                                      |   w_0   |
2064         //                                      +---------+
2065         //                                      |   rbp'  | <- rsp'
2066         //                                      +---------+
2067
2068         mov     rdx, rsp
2069         mov     rsp, rax
2070
2071 #elif defined(__i386__)
2072
2073         mov     eax, esp                        // safekeeping
2074
2075         // we're toast if we get hit by a signal now.  fingers crossed...
2076   .if 0
2077         mov     esp, buff2 + 4*n + 4
2078         mov     ebp, buff1 + 4*n
2079   .else
2080         lea     esp, [edi + 4*n + 8]
2081         lea     ebp, [esi + 4*n]
2082   .endif
2083         enter   0, n + 1
2084
2085         mov     edx, esp
2086         mov     esp, eax
2087
2088 #elif defined(__arm__)
2089
2090         add     r4, r4, #4*n
2091         add     r5, r5, #4*n + 8
2092
2093         str     r4, [r5, #-4]!
2094   .rept n/2
2095         ldrd    r0, r1, [r4, #-8]!
2096         strd    r0, r1, [r5, #-8]!
2097   .endr
2098         add     r4, r5, #4*n
2099         str     r4, [r5, #-4]!
2100
2101 #elif defined(__aarch64__)
2102
2103         // omgwtf.  let's not actually screw with the stack pointer.
2104
2105         add     x4, x4, #8*n
2106         add     x5, x5, #8*n + 16
2107
2108         str     x4, [x5, #-8]!
2109   .rept n/2
2110         ldp     x16, x17, [x4, #-16]!
2111         stp     x16, x17, [x5, #-16]!
2112   .endr
2113         add     x4, x5, #8*n
2114         str     x4, [x5, #-8]!
2115
2116 #else
2117         notimpl
2118 #endif
2119
2120         ret
2121
2122 endproc
2123
2124 proc    x1e
2125
2126         // convert nibble value to (uppercase) hex; other input values yield
2127         // nonsense.
2128
2129 #if defined(__x86_64__)
2130
2131         // das doesn't work in 64-bit mode; best i can come up with
2132         mov     edx, eax
2133         add     al, '0'
2134         add     dl, 'A' - 10
2135         cmp     al, '9' + 1
2136         cmovae  eax, edx
2137
2138 #elif defined(__i386__)
2139
2140         cmp     al, 0x0a                // cf = 1 iff a < 10
2141         sbb     al, 0x69                // if 0 <= a < 10, a' = a - 0x6a, so
2142                                         // 0x96 <= a' < 0x70, setting af, cf
2143                                         // if 10 <= a < 16, a' = a - 0x69, so
2144                                         // 0x71 <= a' < 0x77, setting cf but
2145                                         // clearing af
2146         das                             // if 0 <= a < 10, then af and cf are
2147                                         // both set, so set subtract 0x66
2148                                         // from a' leaving 0x30 <= a' < 0x3a;
2149                                         // if 10 <= a < 16 then af clear but
2150                                         // cf set, so subtract 0x60 from a'
2151                                         // leaving 0x41 <= a' < 0x47
2152
2153 #elif defined(__arm__)
2154
2155         // significantly less tricksy
2156         cmp     r0, #10
2157         addlo   r0, r0, #'0'
2158         addhs   r0, r0, #'A' - 10
2159
2160 #elif defined(__aarch64__)
2161
2162         // with less versatile conditional execution this is the best we can
2163         // do
2164         cmp     w0, #10
2165         add     w16, w0, #'A' - 10
2166         add     w0, w0, #'0'
2167         cmov.hs w0, w16
2168
2169 #else
2170         notimpl
2171 #endif
2172
2173         ret
2174
2175 endproc
2176
2177 proc    x1f
2178
2179         // verify collatz conjecture starting at a; assume a /= 0!
2180
2181 #if defined(__x86_64__)
2182
2183 0:      bsf     rcx, rax                // clobber c if a = 0
2184         shr     rax, cl                 // a = 2^c a'
2185   cmp rdx, 0
2186   je 1f
2187   stosq
2188   dec rdx
2189 1:
2190         cmp     rax, 1                  // done?
2191         je      9f
2192         lea     rax, [2*rax + rax + 1]  // a' = 3 a' + 1
2193         jmp     0b                      // again
2194
2195 9:      ret
2196
2197 #elif defined(__i386__)
2198
2199 0:      bsf     ecx, eax                // clobber c if a = 0
2200         shr     eax, cl                 // a = 2^c a'
2201   cmp edx, 0
2202   je 1f
2203   stosd
2204   dec edx
2205 1:
2206         cmp     eax, 1                  // done?
2207         je      9f
2208         lea     eax, [2*eax + eax + 1]  // a' = 3 a' + 1
2209         jmp     0b                      // again
2210
2211 9:      ret
2212
2213 #elif defined(__arm__)
2214
2215         // rbit introduced in armv7
2216 0:      rbit    r2, r0
2217         clz     r2, r2
2218         mov     r0, r0, lsr r2          // a = 2^c a'
2219   cmp r3, #0
2220   strne r0, [r5], #4
2221   subne r3, r3, #1
2222         cmp     r0, #1
2223         adcne   r0, r0, r0, lsl #1      // a' = 3 a' + 1 (because c set)
2224         bne     0b
2225
2226         ret
2227
2228 #elif defined(__aarch64__)
2229
2230 0:      rbit    w2, w0
2231         clz     w2, w2
2232         lsr     w0, w0, w2              // a = 2^c a'
2233   cmp x3, #0
2234   beq 1f
2235   str x0, [x5], #8
2236   sub x3, x3, #1
2237 1:
2238         cmp     w0, #1
2239         add     w16, w0, w0, lsl #1     // t = 3 a' + 1 (because c set)
2240         csinc.eq w0, w0, w16
2241         b.ne    0b
2242
2243         ret
2244
2245 #else
2246         notimpl
2247 #endif
2248
2249 endproc
2250
2251 ///--------------------------------------------------------------------------
2252 /// 0x20--0x2f
2253
2254 proc    x20
2255
2256         // calculate 1337 a slowly
2257
2258 #if defined(__x86_64__)
2259
2260         // original version
2261         mov     rcx, rax                // c = a
2262         shl     rcx, 2                  // c = 4 a
2263         add     rcx, rax                // c = 5 a
2264         shl     rcx, 3                  // c = 40 a
2265         add     rcx, rax                // c = 41 a
2266         shl     rcx, 1                  // c = 82 a
2267         add     rcx, rax                // c = 83 a
2268         shl     rcx, 1                  // c = 166 a
2269         add     rcx, rax                // c = 167 a
2270         shl     rcx, 3                  // c = 1336 a
2271         add     rcx, rax                // c = 1337 a
2272
2273         // a quick way
2274         lea     rdx, [2*rax + rax]      // t = 3 a
2275         shl     rdx, 6                  // t = 192 a
2276         sub     rdx, rax                // t = 191 a
2277         lea     rbx, [8*rdx]            // b = 1528 a
2278         sub     rbx, rdx                // b = 1337 a
2279
2280 #elif defined(__i386__)
2281
2282         // original version
2283         mov     ecx, eax                // c = a
2284         shl     ecx, 2                  // c = 4 a
2285         add     ecx, eax                // c = 5 a
2286         shl     ecx, 3                  // c = 40 a
2287         add     ecx, eax                // c = 41 a
2288         shl     ecx, 1                  // c = 82 a
2289         add     ecx, eax                // c = 83 a
2290         shl     ecx, 1                  // c = 166 a
2291         add     ecx, eax                // c = 167 a
2292         shl     ecx, 3                  // c = 1336 a
2293         add     ecx, eax                // c = 1337 a
2294
2295         // a quick way
2296         lea     edx, [2*eax + eax]      // t = 3 a
2297         shl     edx, 6                  // t = 192 a
2298         sub     edx, eax                // t = 191 a
2299         lea     ebx, [8*edx]            // b = 1528 a
2300         sub     ebx, edx                // b = 1337 a
2301
2302 #elif defined(__arm__)
2303
2304         // original version, ish
2305         add     r2, r0, r0, lsl #2      // c = 5 a
2306         add     r2, r0, r2, lsl #3      // c = 41 a
2307         add     r2, r0, r2, lsl #1      // c = 83 a
2308         add     r2, r0, r2, lsl #1      // c = 167 a
2309         add     r2, r0, r2, lsl #3      // c = 1337 a
2310
2311         // quicker way
2312         add     r1, r0, r0, lsl #1      // b = 3 a
2313         rsb     r1, r0, r1, lsl #6      // b = 191 a
2314         rsb     r1, r1, r1, lsl #3      // b = 1337 a
2315
2316 #elif defined(__aarch64__)
2317
2318         // original version, ish
2319         add     x2, x0, x0, lsl #2      // c = 5 a
2320         add     x2, x0, x2, lsl #3      // c = 41 a
2321         add     x2, x0, x2, lsl #1      // c = 83 a
2322         add     x2, x0, x2, lsl #1      // c = 167 a
2323         add     x2, x0, x2, lsl #3      // c = 1337 a
2324
2325         // sleazy because no rsb
2326         add     x1, x0, x0, lsl #1      // b = 3 a
2327         sub     x1, x0, x1, lsl #6      // b = -191 a
2328         sub     x1, x1, x1, lsl #3      // b = 1337 a
2329
2330 #else
2331         notimpl
2332 #endif
2333
2334         ret
2335
2336 endproc
2337
2338 proc    x21
2339
2340         // multiply complex numbers a + b i and c + d i
2341         //
2342         //      (a + b i) (c + d i) = (a c - b d) + (a d + b c) i
2343         //
2344         // somewhat slick approach uses only three multiplications
2345
2346 #if defined(__x86_64__)
2347
2348         mov     rsi, rax                // t = a
2349         add     rax, rbx                // a' = a + b
2350         mov     rdi, rdx                // u = d
2351         sub     rdx, rcx                // d' = d - c
2352         add     rdi, rcx                // u = c + d
2353
2354         imul    rax, rcx                // a' = c (a + b)
2355         imul    rsi, rdx                // t = a (d - c)
2356         imul    rdi, rbx                // u = b (c + d)
2357
2358         add     rsi, rax                // t = a (d - c) + c (a + b)
2359         mov     rbx, rsi                // b' = a (d - c) + c (a + b)
2360                                         //      = a d + b c
2361         sub     rax, rdi                // a' = c (a + b) - b (c + d)
2362                                         //      = a c - b d
2363
2364 #elif defined(__i386__)
2365
2366         mov     esi, eax                // t = a
2367         add     eax, ebx                // a' = a + b
2368         mov     edi, edx                // u = d
2369         sub     edx, ecx                // d' = d - c
2370         add     edi, ecx                // u = c + d
2371
2372         imul    eax, ecx                // a' = c (a + b)
2373         imul    esi, edx                // t = a (d - c)
2374         imul    edi, ebx                // u = b (c + d)
2375
2376         add     esi, eax                // t = a (d - c) + c (a + b)
2377         mov     ebx, esi                // b' = a (d - c) + c (a + b)
2378                                         //      = a d + b c
2379         sub     eax, edi                // a' = c (a + b) - b (c + d)
2380                                         //      = a c - b d
2381
2382 #elif defined(__arm__)
2383
2384         add     r4, r0, r1              // t = a + b
2385         add     r5, r2, r3              // u = c + d
2386         sub     r3, r3, r2              // d' = d - c
2387
2388         // mls introduced in armv7
2389         mul     r4, r4, r2              // t = c (a + b)
2390         mov     r2, r1                  // c' = a (bah!)
2391         mla     r1, r0, r3, r4          // b' = a (d - c) + c (a + b)
2392                                         //      = a d + b c
2393         mls     r0, r2, r5, r4          // a' = c (a + b) - b (c + d)
2394                                         //      = a c - b d
2395
2396 #elif defined(__aarch64__)
2397
2398         add     x4, x0, x1              // t = a + b
2399         add     x5, x2, x3              // u = c + d
2400         sub     x3, x3, x2              // d' = d - c
2401
2402         // mls intxoduced in axmv7
2403         mul     x4, x4, x2              // t = c (a + b)
2404         mov     x2, x1                  // c' = a (bah!)
2405         madd    x1, x0, x3, x4          // b' = a (d - c) + c (a + b)
2406                                         //      = a d + b c
2407         msub    x0, x2, x5, x4          // a' = c (a + b) - b (c + d)
2408                                         //      = a c - b d
2409
2410 #else
2411         notimpl
2412 #endif
2413
2414         ret
2415
2416 endproc
2417
2418 proc    x22
2419
2420         // divide by 3
2421
2422 #if defined(__x86_64__)
2423
2424         mov     rdx, 0xaaaaaaaaaaaaaaab // = ceil(2/3 2^64)
2425         mul     rdx                     // d' || a' =~ 2/3 a 2^64
2426         shr     rdx, 1                  // d' = floor(a/3)
2427         mov     rax, rdx                // a' = floor(a/3)
2428
2429         // we start with 0 <= a < 2^64.  write f = ceil(2/3 2^64), so that
2430         // 2/3 < f/2^64 < 2/3 + 1/2^64.  then floor(2/3 a) <= floor(a f/2^64)
2431         // <= floor(2/3 a + a/2^64), but a < 2^64 so a/2^64 < 1 and
2432         // floor(a f/2^64) = floor(2/3 a).
2433
2434 #elif defined(__i386__)
2435
2436         mov     edx, 0xaaaaaaab         // = ceil(2/3 2^32)
2437         mul     edx                     // d' || a' =~ 2/3 a 2^32
2438         shr     edx, 1                  // d' = floor(a/3)
2439         mov     eax, edx                // a' = floor(a/3)
2440
2441 #elif defined(__arm__)
2442
2443         ldr     r12, =0xaaaaaaab
2444         umull   r12, r0, r0, r12
2445         mov     r0, r0, lsr #1
2446
2447 #elif defined(__aarch64__)
2448
2449         ldr     x16, =0xaaaaaaaaaaaaaaab
2450         umulh   x0, x0, x16
2451         lsr     x0, x0, #1
2452
2453 #else
2454         notimpl
2455 #endif
2456
2457         ret
2458
2459 endproc
2460
2461 proc    x23
2462
2463 #if defined(__x86_64__)
2464
2465         // main loop: shorten a preserving residue class mod 3
2466 0:      cmp     rax, 5
2467         jbe     8f
2468         // a > 5
2469         mov     rdx, rax                // d' = a
2470         shr     rdx, 2                  // d' = floor(a/4)
2471         and     rax, 3                  // a = 4 d' + a' (0 <= a' < 4)
2472         add     rax, rdx                // a' == a (mod 3) but a' < a/4 + 4
2473         jmp     0b
2474
2475         // fix up final value 0 <= a < 6: want 0 <= a < 3
2476         //
2477         // the tricky part is actually a = 3; but the other final cases take
2478         // additional iterations which we can avoid.
2479 8:      cmp     rax, 3                  // set cf iff a < 3
2480         cmc                             // set cf iff a >= 3
2481         sbb     rdx, rdx                // d' = a >= 3 ? -1 : 0
2482         and     rdx, 3                  // d' = a >= 3 ? 3 : 0
2483         sub     rax, rdx                // a' = a - (a >= 3 ? 3 : 0)
2484                                         //      = a (mod 3)
2485
2486 #elif defined(__i386__)
2487
2488         // main loop: shorten a preserving residue class mod 3
2489 0:      cmp     eax, 5
2490         jbe     8f
2491         // a > 5
2492         mov     edx, eax                // d' = a
2493         shr     edx, 2                  // d' = floor(a/4)
2494         and     eax, 3                  // a = 4 d' + a' (0 <= a' < 4)
2495         add     eax, edx                // a' == a (mod 3) but a' < a/4 + 4
2496         jmp     0b
2497
2498         // fix up final value 0 <= a < 6: want 0 <= a < 3
2499         //
2500         // the tricky part is actually a = 3; but the other final cases take
2501         // additional iterations which we can avoid.
2502 8:      cmp     eax, 3                  // set cf iff a < 3
2503         cmc                             // set cf iff a >= 3
2504         sbb     edx, edx                // d' = a >= 3 ? -1 : 0
2505         and     edx, 3                  // d' = a >= 3 ? 3 : 0
2506         sub     eax, edx                // a' = a - (a >= 3 ? 3 : 0)
2507                                         //      = a (mod 3)
2508
2509 #elif defined(__arm__)
2510
2511 0:      cmp     r0, #6
2512         andhs   r12, r0, #3
2513         addhs   r0, r12, r0, lsr #2
2514         bhs     0b
2515
2516         cmp     r0, #3
2517         subhs   r0, r0, #3
2518
2519 #elif defined(__aarch64__)
2520
2521 0:      cmp     x0, #6
2522         // blunder on through regardless since this doesn't affect the result
2523         and     x16, x0, #3
2524         add     x0, x16, x0, lsr #2
2525         b.hs    0b
2526
2527         subs    x16, x0, #3
2528         cmov.hs x0, x16
2529
2530 #else
2531         notimpl
2532 #endif
2533
2534         ret
2535
2536 endproc
2537
2538 proc    x24
2539
2540         // invert (odd) a mod 2^64
2541         //
2542         // suppose a a_i == 1 (mod 2^{2^i})
2543         //
2544         // clearly good for i = 0, since 2^i = 1 and 2^{2^i} = 2, and a_0 =
2545         // a == 1 (mod 2) by assumption
2546         //
2547         // write a a_i == b_i 2^{2^i} + 1 (mod 2^{2^{i+1}})
2548         // then b_i == (a a_i - 1)/2^{2^i} (mod 2^{2^i})
2549         // to lift inverse, we want x such that a x == -b_i (mod 2^{2^i});
2550         // clearly x = -a_i b_i will do, since a a_i == 1 (mod 2^{2^i})
2551         // then:
2552         // a_{i+1} = a_i - a_i b_i 2^{2^i} = a_i (1 - (a a_i - 1))
2553         //      = 2 a_i - a a_i^2
2554         //
2555         // check:
2556         // a a_{i+1} = 2 a a_i - a^2 a_i^2
2557         //      == 2 a a_i - (b_i 2^{2^i} + 1)^2
2558         //      == 2 (b_i 2^{2^i} + 1) -
2559         //              (b_i^2 2^{2^{i+1}} + 2 b_i 2^{2^i} + 1)
2560         //      == 1 (mod 2^{2^{i+1}})
2561
2562 #if defined(__x86_64__)
2563
2564         // rax                          // a_0 = a
2565         mov     rbx, rax                // b' = a
2566         mov     rsi, rax                // t = a_0
2567
2568 0:
2569   cmp rbp, 0
2570   je 1f
2571   stosq
2572   dec rbp
2573 1:
2574         mul     rbx                     // a' = a a_i
2575         mov     rcx, rax                // c = a a_i
2576
2577         sub     rax, 2                  // a' = a a_i - 2
2578         neg     rax                     // a' = 2 - a a_i
2579         mul     rsi                     // a_{i+1} = a_i (2 - a a_i)
2580                                         //      = 2 a_i - a a_i^2
2581         mov     rsi, rax                // t = a_{i+1}
2582
2583         cmp     rcx, 1                  // done?
2584         ja      0b                      // no -- iterate
2585
2586 #elif defined(__i386__)
2587
2588         // eax                          // a_0 = a
2589         mov     ebx, eax                // b' = a
2590         mov     esi, eax                // t = a_0
2591
2592 0:
2593   cmp ebp, 0
2594   je 1f
2595   stosd
2596   dec ebp
2597 1:
2598         mul     ebx                     // a' = a a_i
2599         mov     ecx, eax                // c = a a_i
2600
2601         sub     eax, 2                  // a' = a a_i - 2
2602         jb      9f                      // done if < 2
2603         neg     eax                     // a' = 2 - a a_i
2604         mul     esi                     // a_{i+1} = a_i (2 - a a_i)
2605                                         //      = 2 a_i - a a_i^2
2606         mov     esi, eax                // t = a_{i+1}
2607
2608         jmp     0b                      // and iterate
2609 9:      mov     eax, esi                // restore
2610
2611 #elif defined(__arm__)
2612
2613         // r0                           // a_0 = a
2614         mov     r1, r0                  // b' = a
2615
2616 0:
2617   cmp r6, #0
2618   strne r0, [r5], #4
2619   subne r6, r6, #1
2620         mul     r2, r0, r1              // c = a a_i
2621         rsbs    r2, r2, #2              // c = 2 - a a_i
2622         mul     r0, r0, r2              // a_{i+1} = a_i (2 - a a_i)
2623                                         //      = 2 a_i - a a_i^2
2624         blo     0b
2625
2626 #elif defined(__aarch64__)
2627
2628         // x0                           // a_0 = a
2629         mov     x1, x0                  // b' = a
2630         mov     x16, #2                 // because we have no rsb
2631
2632 0:
2633   cmp x6, #0
2634   b.eq 1f
2635   str x0, [x5], #8
2636   sub x6, x6, #1
2637 1:
2638         mul     x2, x0, x1              // c = a a_i
2639         subs    x2, x16, x2             // c = 2 - a a_i
2640         mul     x0, x0, x2              // a_{i+1} = a_i (2 - a a_i)
2641                                         //      = 2 a_i - a a_i^2
2642         b.lo    0b
2643
2644 #else
2645         notimpl
2646 #endif
2647
2648         ret
2649
2650 endproc
2651
2652 proc    x25
2653
2654         // a poor approximation to pi/4
2655         //
2656         // think of x and y as being in 16.16 fixed-point format.  we sample
2657         // points in the unit square, and determine how many of them are
2658         // within a unit quarter-circle centred at the origin.  the area of
2659         // the quarter-circle is pi/4.
2660
2661 #if defined(__x86_64__)
2662
2663         xor     eax, eax                // a = 0
2664         mov     rcx, 1
2665         shl     rcx, 0x20               // c =~ 4 billion
2666
2667 0:      movzx   rbx, cx                 // x = low 16 bits of c
2668         imul    rbx, rbx                // b = x^2
2669
2670         ror     rcx, 0x10               // switch halves of c
2671         movzx   rdx, cx                 // y = high 16 bits of c
2672         imul    rdx, rdx                // d = y^2
2673         rol     rcx, 0x10               // switch back
2674
2675         add     rbx, rdx                // r^2 = x^2 + y^2
2676         shr     rbx, 0x20               // r^2 >= 1?
2677         cmp     rbx, 1                  // set cf iff r^2 >= 1
2678         adc     rax, 0                  // and add onto accumulator
2679         loop    0b
2680
2681 #elif defined(__i386__)
2682
2683         // this is actually better done in 32 bits.  the carry has the wrong
2684         // sense here, so instead deduct one for each point outside the
2685         // quarter-circle rather than adding one for each point inside it.
2686         xor     eax, eax
2687         xor     ecx, ecx
2688
2689 0:      movzx   ebx, cx
2690         imul    ebx, ebx
2691
2692         ror     ecx, 0x10
2693         movzx   edx, cx
2694         imul    edx, edx
2695         rol     ecx, 0x10
2696
2697         add     ebx, edx                // see?
2698         sbb     eax, 0
2699         loop    0b
2700
2701 #elif defined(__arm__)
2702
2703         mov     r0, #0
2704         mov     r2, #0
2705
2706 0:      uxth    r1, r2, ror #0
2707         uxth    r3, r2, ror #16
2708         mul     r1, r1, r1
2709         mul     r3, r3, r3
2710         cmn     r1, r3                  // mlas doesn't set cf usefully
2711         addcc   r0, r0, #1
2712         adds    r2, r2, #1
2713         bne     0b
2714
2715 #elif defined(__aarch64__)
2716
2717         mov     w0, #0
2718         mov     w2, #0
2719
2720 0:      ubfx    w1, w2, #0, #16
2721         ubfx    w3, w2, #16, #16
2722         sub     w2, w2, #1
2723         mul     w1, w1, w1
2724         mul     w3, w3, w3
2725         cmn     w1, w3
2726         cinc.cc w0, w0
2727         cbnz    w2, 0b
2728
2729 #else
2730         notimpl
2731 #endif
2732
2733         ret
2734
2735 endproc
2736
2737 proc    x26
2738
2739 #if defined(__x86_64__)
2740
2741         notimpl
2742
2743 #elif defined(__i386__)
2744
2745         notimpl
2746
2747 #elif defined(__arm__)
2748
2749         notimpl
2750
2751 #elif defined(__aarch64__)
2752
2753         notimpl
2754
2755 #else
2756         notimpl
2757 #endif
2758
2759 endproc
2760
2761 proc    x27
2762
2763 #if defined(__x86_64__)
2764
2765         notimpl
2766
2767 #elif defined(__i386__)
2768
2769         notimpl
2770
2771 #elif defined(__arm__)
2772
2773         notimpl
2774
2775 #elif defined(__aarch64__)
2776
2777         notimpl
2778
2779 #else
2780         notimpl
2781 #endif
2782
2783 endproc
2784
2785 proc    x28
2786
2787 #if defined(__x86_64__)
2788
2789         notimpl
2790
2791 #elif defined(__i386__)
2792
2793         notimpl
2794
2795 #elif defined(__arm__)
2796
2797         notimpl
2798
2799 #elif defined(__aarch64__)
2800
2801         notimpl
2802
2803 #else
2804         notimpl
2805 #endif
2806
2807 endproc
2808
2809 proc    x29
2810
2811 #if defined(__x86_64__)
2812
2813         notimpl
2814
2815 #elif defined(__i386__)
2816
2817         notimpl
2818
2819 #elif defined(__arm__)
2820
2821         notimpl
2822
2823 #elif defined(__aarch64__)
2824
2825         notimpl
2826
2827 #else
2828         notimpl
2829 #endif
2830
2831 endproc
2832
2833 proc    x2a
2834
2835 #if defined(__x86_64__)
2836
2837         notimpl
2838
2839 #elif defined(__i386__)
2840
2841         notimpl
2842
2843 #elif defined(__arm__)
2844
2845         notimpl
2846
2847 #elif defined(__aarch64__)
2848
2849         notimpl
2850
2851 #else
2852         notimpl
2853 #endif
2854
2855 endproc
2856
2857 proc    x2b
2858
2859 #if defined(__x86_64__)
2860
2861         notimpl
2862
2863 #elif defined(__i386__)
2864
2865         notimpl
2866
2867 #elif defined(__arm__)
2868
2869         notimpl
2870
2871 #elif defined(__aarch64__)
2872
2873         notimpl
2874
2875 #else
2876         notimpl
2877 #endif
2878
2879 endproc
2880
2881 proc    x2c
2882
2883 #if defined(__x86_64__)
2884
2885         notimpl
2886
2887 #elif defined(__i386__)
2888
2889         notimpl
2890
2891 #elif defined(__arm__)
2892
2893         notimpl
2894
2895 #elif defined(__aarch64__)
2896
2897         notimpl
2898
2899 #else
2900         notimpl
2901 #endif
2902
2903 endproc
2904
2905 proc    x2d
2906
2907 #if defined(__x86_64__)
2908
2909         notimpl
2910
2911 #elif defined(__i386__)
2912
2913         notimpl
2914
2915 #elif defined(__arm__)
2916
2917         notimpl
2918
2919 #elif defined(__aarch64__)
2920
2921         notimpl
2922
2923 #else
2924         notimpl
2925 #endif
2926
2927 endproc
2928
2929 proc    x2e
2930
2931 #if defined(__x86_64__)
2932
2933         notimpl
2934
2935 #elif defined(__i386__)
2936
2937         notimpl
2938
2939 #elif defined(__arm__)
2940
2941         notimpl
2942
2943 #elif defined(__aarch64__)
2944
2945         notimpl
2946
2947 #else
2948         notimpl
2949 #endif
2950
2951 endproc
2952
2953 proc    x2f
2954
2955 #if defined(__x86_64__)
2956
2957         notimpl
2958
2959 #elif defined(__i386__)
2960
2961         notimpl
2962
2963 #elif defined(__arm__)
2964
2965         notimpl
2966
2967 #elif defined(__aarch64__)
2968
2969         notimpl
2970
2971 #else
2972         notimpl
2973 #endif
2974
2975 endproc
2976
2977 ///--------------------------------------------------------------------------
2978 /// 0x30--0x3f
2979
2980 proc    x30
2981
2982 #if defined(__x86_64__)
2983
2984         notimpl
2985
2986 #elif defined(__i386__)
2987
2988         notimpl
2989
2990 #elif defined(__arm__)
2991
2992         notimpl
2993
2994 #elif defined(__aarch64__)
2995
2996         notimpl
2997
2998 #else
2999         notimpl
3000 #endif
3001
3002         ret
3003
3004 endproc
3005
3006 proc    x31
3007
3008 #if defined(__x86_64__)
3009
3010         notimpl
3011
3012 #elif defined(__i386__)
3013
3014         notimpl
3015
3016 #elif defined(__arm__)
3017
3018         notimpl
3019
3020 #elif defined(__aarch64__)
3021
3022         notimpl
3023
3024 #else
3025         notimpl
3026 #endif
3027
3028 endproc
3029
3030 proc    x32
3031
3032 #if defined(__x86_64__)
3033
3034         notimpl
3035
3036 #elif defined(__i386__)
3037
3038         notimpl
3039
3040 #elif defined(__arm__)
3041
3042         notimpl
3043
3044 #elif defined(__aarch64__)
3045
3046         notimpl
3047
3048 #else
3049         notimpl
3050 #endif
3051
3052 endproc
3053
3054 proc    x33
3055
3056 #if defined(__x86_64__)
3057
3058         notimpl
3059
3060 #elif defined(__i386__)
3061
3062         notimpl
3063
3064 #elif defined(__arm__)
3065
3066         notimpl
3067
3068 #elif defined(__aarch64__)
3069
3070         notimpl
3071
3072 #else
3073         notimpl
3074 #endif
3075
3076 endproc
3077
3078 proc    x34
3079
3080 #if defined(__x86_64__)
3081
3082         notimpl
3083
3084 #elif defined(__i386__)
3085
3086         notimpl
3087
3088 #elif defined(__arm__)
3089
3090         notimpl
3091
3092 #elif defined(__aarch64__)
3093
3094         notimpl
3095
3096 #else
3097         notimpl
3098 #endif
3099
3100 endproc
3101
3102 proc    x35
3103
3104 #if defined(__x86_64__)
3105
3106         notimpl
3107
3108 #elif defined(__i386__)
3109
3110         notimpl
3111
3112 #elif defined(__arm__)
3113
3114         notimpl
3115
3116 #elif defined(__aarch64__)
3117
3118         notimpl
3119
3120 #else
3121         notimpl
3122 #endif
3123
3124 endproc
3125
3126 proc    x36
3127
3128 #if defined(__x86_64__)
3129
3130         notimpl
3131
3132 #elif defined(__i386__)
3133
3134         notimpl
3135
3136 #elif defined(__arm__)
3137
3138         notimpl
3139
3140 #elif defined(__aarch64__)
3141
3142         notimpl
3143
3144 #else
3145         notimpl
3146 #endif
3147
3148 endproc
3149
3150 proc    x37
3151
3152 #if defined(__x86_64__)
3153
3154         notimpl
3155
3156 #elif defined(__i386__)
3157
3158         notimpl
3159
3160 #elif defined(__arm__)
3161
3162         notimpl
3163
3164 #elif defined(__aarch64__)
3165
3166         notimpl
3167
3168 #else
3169         notimpl
3170 #endif
3171
3172 endproc
3173
3174 proc    x38
3175
3176 #if defined(__x86_64__)
3177
3178         notimpl
3179
3180 #elif defined(__i386__)
3181
3182         notimpl
3183
3184 #elif defined(__arm__)
3185
3186         notimpl
3187
3188 #elif defined(__aarch64__)
3189
3190         notimpl
3191
3192 #else
3193         notimpl
3194 #endif
3195
3196 endproc
3197
3198 proc    x39
3199
3200 #if defined(__x86_64__)
3201
3202         notimpl
3203
3204 #elif defined(__i386__)
3205
3206         notimpl
3207
3208 #elif defined(__arm__)
3209
3210         notimpl
3211
3212 #elif defined(__aarch64__)
3213
3214         notimpl
3215
3216 #else
3217         notimpl
3218 #endif
3219
3220 endproc
3221
3222 proc    x3a
3223
3224 #if defined(__x86_64__)
3225
3226         notimpl
3227
3228 #elif defined(__i386__)
3229
3230         notimpl
3231
3232 #elif defined(__arm__)
3233
3234         notimpl
3235
3236 #elif defined(__aarch64__)
3237
3238         notimpl
3239
3240 #else
3241         notimpl
3242 #endif
3243
3244 endproc
3245
3246 proc    x3b
3247
3248 #if defined(__x86_64__)
3249
3250         notimpl
3251
3252 #elif defined(__i386__)
3253
3254         notimpl
3255
3256 #elif defined(__arm__)
3257
3258         notimpl
3259
3260 #elif defined(__aarch64__)
3261
3262         notimpl
3263
3264 #else
3265         notimpl
3266 #endif
3267
3268 endproc
3269
3270 proc    x3c
3271
3272 #if defined(__x86_64__)
3273
3274         notimpl
3275
3276 #elif defined(__i386__)
3277
3278         notimpl
3279
3280 #elif defined(__arm__)
3281
3282         notimpl
3283
3284 #elif defined(__aarch64__)
3285
3286         notimpl
3287
3288 #else
3289         notimpl
3290 #endif
3291
3292 endproc
3293
3294 proc    x3d
3295
3296 #if defined(__x86_64__)
3297
3298         notimpl
3299
3300 #elif defined(__i386__)
3301
3302         notimpl
3303
3304 #elif defined(__arm__)
3305
3306         notimpl
3307
3308 #elif defined(__aarch64__)
3309
3310         notimpl
3311
3312 #else
3313         notimpl
3314 #endif
3315
3316 endproc
3317
3318 proc    x3e
3319
3320 #if defined(__x86_64__)
3321
3322         notimpl
3323
3324 #elif defined(__i386__)
3325
3326         notimpl
3327
3328 #elif defined(__arm__)
3329
3330         notimpl
3331
3332 #elif defined(__aarch64__)
3333
3334         notimpl
3335
3336 #else
3337         notimpl
3338 #endif
3339
3340 endproc
3341
3342 proc    x3f
3343
3344 #if defined(__x86_64__)
3345
3346         notimpl
3347
3348 #elif defined(__i386__)
3349
3350         notimpl
3351
3352 #elif defined(__arm__)
3353
3354         notimpl
3355
3356 #elif defined(__aarch64__)
3357
3358         notimpl
3359
3360 #else
3361         notimpl
3362 #endif
3363
3364 endproc
3365
3366 ///----- That's all, folks --------------------------------------------------