mdw@git.distorted.org.uk Git - xchg-rax-rax/blob - xchg.S

   1 /// -*- mode: asm; asm-comment-char: 0 -*-
   2
   3 ///--------------------------------------------------------------------------
   4 /// Preliminaries.
   5
   6 #include <sys/syscall.h>
   7
   8 #if defined(__i386__) || defined(__x86_64__)
   9
  10         .intel_syntax noprefix
  11
  12 #elif defined(__arm__)
  13
  14 .macro  ret
  15         bx      r14
  16 .endm
  17
  18         .arch   armv7-a
  19         .fpu    neon
  20
  21 #elif defined(__aarch64__)
  22
  23 .macro  cmov    rd, rn, cc
  24         csel    \rd, \rn, \rd, \cc
  25 .endm
  26 #define _COND(_)                                                        \
  27         _(eq) _(ne) _(cs) _(cc) _(vs) _(vc) _(mi) _(pl)                 \
  28         _(ge) _(lt) _(gt) _(le) _(hi) _(ls) _(al) _(nv)                 \
  29         _(hs) _(lo)
  30 #define _INST(_)                                                        \
  31         _(ccmp) _(ccmn)                                                 \
  32         _(csel) _(cmov)                                                 \
  33         _(csinc) _(cinc) _(cset)                                        \
  34         _(csneg) _(cneg)                                                \
  35         _(csinv) _(cinv) _(csetm)
  36 #define _CONDVAR(cc) _definstvar cc;
  37 #define _INSTVARS(inst)                                                 \
  38         .macro _definstvar cc;                                          \
  39           .macro inst.\cc args:vararg; inst \args, \cc; .endm;          \
  40         .endm;                                                          \
  41         _COND(_CONDVAR);                                                \
  42         .purgem _definstvar;
  43         _INST(_INSTVARS)
  44 #undef _COND
  45 #undef _INST
  46 #undef _CONDVAR
  47 #undef _INSTVARS
  48
  49 #define CCMP_N 8
  50 #define CCMP_Z 4
  51 #define CCMP_C 2
  52 #define CCMP_V 1
  53
  54 #define CCMP_MI CCMP_N
  55 #define CCMP_PL 0
  56 #define CCMP_EQ CCMP_Z
  57 #define CCMP_NE 0
  58 #define CCMP_CS CCMP_C
  59 #define CCMP_HS CCMP_C
  60 #define CCMP_CC 0
  61 #define CCMP_LO 0
  62 #define CCMP_VS CCMP_V
  63 #define CCMP_VC 0
  64 #define CCMP_HI CCMP_C
  65 #define CCMP_LS 0
  66 #define CCMP_LT CCMP_N
  67 #define CCMP_GE 0
  68 #define CCMP_LE CCMP_N
  69 #define CCMP_GT 0
  70
  71 #else
  72 #  error "not supported"
  73 #endif
  74
  75 .macro  proc    name
  76         .globl  \name
  77         .type   \name, STT_FUNC
  78         .p2align 4
  79 \name\():
  80   .macro endproc
  81         .size   \name, . - \name
  82         .purgem endproc
  83   .endm
  84 .endm
  85
  86 .macro ch c
  87 #if defined(__i386__)
  88
  89         pushf
  90         push    eax
  91         push    ebx
  92         push    ecx
  93         push    edx
  94         push    ebp
  95         mov     ebp, esp
  96         and     esp, -16
  97
  98         push    \c
  99         call    putchar@plt
 100
 101         call    get_pc_ebx
 102         add     ebx, offset _GLOBAL_OFFSET_TABLE
 103         mov     eax, [ebx + stdout@GOT]
 104         mov     eax, [eax]
 105         call    fflush@plt
 106
 107         mov     esp, ebp
 108         pop     ebp
 109         pop     edx
 110         pop     ecx
 111         pop     ebx
 112         pop     eax
 113         popf
 114
 115 #elif defined(__x86_64__)
 116
 117         pushf
 118         push    rax
 119         push    rcx
 120         push    rdx
 121         push    rsi
 122         push    rdi
 123         push    r8
 124         push    r9
 125         push    rbp
 126         mov     rbp, rsp
 127         and     rsp, -16
 128
 129         mov     rdi, \c
 130         call    putchar@plt
 131
 132         mov     rdi, [rip + stdout]
 133         call    fflush@plt
 134
 135         mov     rsp, rbp
 136         pop     rbp
 137         pop     r9
 138         pop     r8
 139         pop     rdi
 140         pop     rsi
 141         pop     rdx
 142         pop     rcx
 143         pop     rax
 144         popf
 145
 146 #elif defined(__arm__)
 147
 148         stmfd   r13!, {r0-r4, r12, r14}
 149
 150         mov     r4, r13
 151         bic     r14, r4, #15
 152         mov     r13, r14
 153
 154         mov     r0, #\c
 155         bl      putchar@plt
 156
 157         ldr     r14, .L$_c$gotoff$\@
 158 .L$_c$gotpc$\@:
 159         add     r14, pc, r14
 160         b       .L$_c$cont$\@
 161 .L$_c$gotoff$\@:
 162         .word   _GLOBAL_OFFSET_TABLE - .L$_c$gotpc$\@ - 8
 163 .L$_c$cont$\@:
 164         bl      fflush@plt
 165
 166         mov     r13, r4
 167         ldmfd   r13!, {r0-r4, r12, r14}
 168
 169 #elif defined(__aarch64__)
 170
 171         sub     sp, sp, #20*8
 172         stp      x0,  x1, [sp,   #0]
 173         stp      x2,  x3, [sp,  #16]
 174         stp      x4,  x5, [sp,  #32]
 175         stp      x6,  x7, [sp,  #48]
 176         stp      x8,  x9, [sp,  #64]
 177         stp     x10, x11, [sp,  #80]
 178         stp     x12, x13, [sp,  #96]
 179         stp     x14, x15, [sp, #112]
 180         stp     x16, x17, [sp, #128]
 181         mrs     x16, nzcv
 182         stp     x16, x30, [sp, #144]
 183
 184         mov     w0, #\c
 185         bl      putchar
 186         adrp    x0, :got:stdout
 187         ldr     x0, [x0, #:got_lo12:stdout]
 188         ldr     x0, [x0]
 189         bl      fflush
 190
 191         ldp     x16, x30, [sp, #144]
 192         msr     nzcv, x16
 193         ldp     x16, x17, [sp, #128]
 194         ldp     x14, x15, [sp, #112]
 195         ldp     x12, x13, [sp,  #96]
 196         ldp     x10, x11, [sp,  #80]
 197         ldp      x8,  x9, [sp,  #64]
 198         ldp      x6,  x7, [sp,  #48]
 199         ldp      x4,  x5, [sp,  #32]
 200         ldp      x2,  x3, [sp,  #16]
 201         ldp      x0,  x1, [sp,   #0]
 202         add     sp, sp, #20*8
 203
 204 #else
 205 #  error "not supported"
 206 #endif
 207 .endm
 208
 209 .macro  notimpl
 210 #if defined(__i386__) || defined(__x86_64__)
 211         ud2
 212 #elif defined(__arm__)
 213         udf
 214 #elif defined(__aarch64__)
 215         hlt     #0
 216 #else
 217 #  error "not supported"
 218 #endif
 219 .endm
 220
 221         .section .note.GNU-stack, "", %progbits
 222
 223         .text
 224
 225 #if defined(__i386__)
 226 get_pc_ebx:
 227         mov     ebx, [esp]
 228         ret
 229 #endif
 230
 231
 232 proc    call_example
 233
 234 #if defined(__i386__)
 235
 236         push    ebx                     // ebx
 237         push    esi                     // esi, ebx
 238         push    edi                     // edi, esi, ebx
 239         push    ebp                     // flags, ebp, ..., ebx
 240         pushf
 241
 242         mov     edi, [esp + 4*6]
 243         mov     esi, [esp + 4*7]
 244         push    esi                     // regs, flags, ebp, ..., ebx
 245
 246         call    get_pc_ebx
 247         lea     eax, [ebx + 9f - .]
 248         push    eax                     // cont, regs, flags, ebp, ..., ebx
 249         push    edi                 // func, cont, regs, flags, ebp, ..., ebx
 250
 251         mov     eax, [esi + 28]
 252         pushf
 253         pop     ecx
 254         and     eax,  0x0cd5
 255         and     ecx, ~0x0cd5
 256         or      eax, ecx
 257         push    eax
 258         popf
 259         mov     eax, [esi +  0]
 260         mov     ebx, [esi +  4]
 261         mov     ecx, [esi +  8]
 262         mov     edx, [esi + 12]
 263         mov     edi, [esi + 20]
 264         mov     ebp, [esi + 24]
 265         mov     esi, [esi + 16]
 266
 267         ret                            // -> func; regs, flags, ebp, ..., ebx
 268
 269 9:      pushf                           // eflags, regs, flags, ebp, ..., ebx
 270         push    esi                // esi, eflags, regs, flags, ebp, ..., ebx
 271         mov     esi, [esp + 8]
 272         mov     [esi +  0], eax
 273         mov     [esi +  4], ebx
 274         mov     [esi +  8], ecx
 275         mov     [esi + 12], edx
 276         mov     [esi + 20], edi
 277         mov     [esi + 24], ebp
 278         pop     eax                     // rflags, regs, flags, ebp, ..., ebx
 279         mov     [esi + 16], eax
 280         pop     eax                     // regs, flags, ebp, ..., ebx
 281         mov     [esi + 28], eax
 282
 283         add     esp, 4                  // flags, ebp, ..., ebx
 284         popf                            // ebp, ..., ebx
 285         pop     ebp                     // ..., ebx
 286         pop     edi
 287         pop     esi
 288         pop     ebx                     //
 289         ret
 290
 291 #elif defined(__x86_64__)
 292
 293         push    rbx                     // rbx
 294         push    r10
 295         push    r11
 296         push    r12
 297         push    r13
 298         push    r14
 299         push    r15
 300         push    rbp                     // flags, rbp, ..., rbx
 301         pushf
 302
 303         push    rsi                     // regs, flags, rbp, ..., rbx
 304
 305         lea     rax, [rip + 9f]
 306         push    rax                     // cont, regs, flags, rbp, ..., rbx
 307         push    rdi                 // func, cont, regs, flags, rbp, ..., rbx
 308
 309         mov     rax, [rsi + 8*15]
 310         pushf
 311         pop     rcx
 312         and     rax,  0x0cd5
 313         and     rcx, ~0x0cd5
 314         or      rax, rcx
 315         push    rax
 316         popf
 317         mov     rax, [rsi +   0]
 318         mov     rbx, [rsi +   8]
 319         mov     rcx, [rsi +  16]
 320         mov     rdx, [rsi +  24]
 321         mov     rdi, [rsi +  40]
 322         mov     rbp, [rsi +  48]
 323         mov     r8,  [rsi +  56]
 324         mov     r9,  [rsi +  64]
 325         mov     r10, [rsi +  72]
 326         mov     r11, [rsi +  80]
 327         mov     r12, [rsi +  88]
 328         mov     r13, [rsi +  96]
 329         mov     r14, [rsi + 104]
 330         mov     r15, [rsi + 112]
 331         mov     rsi, [rsi +  32]
 332
 333         ret                            // -> func; regs, flags, rbp, ..., rbx
 334
 335 9:      pushf                           // rflags, regs, flags, rbp, ..., rbx
 336         push    rsi                // rsi, rflags, regs, flags, rbp, ..., rbx
 337         mov     rsi, [rsp + 16]
 338         mov     [rsi +   0], rax
 339         mov     [rsi +   8], rbx
 340         mov     [rsi +  16], rcx
 341         mov     [rsi +  24], rdx
 342         mov     [rsi +  40], rdi
 343         mov     [rsi +  48], rbp
 344         mov     [rsi +  56],  r8
 345         mov     [rsi +  64],  r9
 346         mov     [rsi +  72], r10
 347         mov     [rsi +  80], r11
 348         mov     [rsi +  88], r12
 349         mov     [rsi +  96], r13
 350         mov     [rsi + 104], r14
 351         mov     [rsi + 112], r15
 352         pop     rax                     // rflags, regs, flags, rbp, ..., rbx
 353         mov     [rsi +  32], rax
 354         pop     rax                     // regs, flags, rbp, ..., rbx
 355         mov     [rsi + 120], rax
 356
 357         add     rsp, 8                  // flags, rbp, ..., rbx
 358         popf                            // rbp, ..., rbx
 359         pop     rbp                     // ..., rbx
 360         pop     r15
 361         pop     r14
 362         pop     r13
 363         pop     r12
 364         pop     r11
 365         pop     r10
 366         pop     rbx                     //
 367         ret
 368
 369 #elif defined(__arm__)
 370
 371         stmfd   r13!, {r0, r1, r4-r11, r14}
 372         ldmia   r1, {r0-r12, r14}
 373         msr     cpsr, r14
 374         mov     r14, pc
 375         ldr     pc, [r13], #4
 376         ldr     r14, [r13], #4
 377         stmia   r14!, {r0-r12}
 378         mrs     r0, cpsr
 379         str     r0, [r14]
 380         ldmfd   r13!, {r4-r11, pc}
 381
 382 #elif defined(__aarch64__)
 383
 384         stp     x29, x30, [sp, #-14*8]!
 385         mov     x29, sp
 386         stp     x19, x20, [sp,  #16]
 387         stp     x21, x22, [sp,  #32]
 388         stp     x23, x24, [sp,  #48]
 389         stp     x25, x26, [sp,  #64]
 390         stp     x27, x28, [sp,  #80]
 391         str           x1, [sp, #104]
 392
 393         ldp     x29, x30, [x1, #224]
 394         msr     nzcv, x30
 395         mov     x30, x0
 396         ldp     x27, x28, [x1, #208]
 397         ldp     x25, x26, [x1, #192]
 398         ldp     x23, x24, [x1, #176]
 399         ldp     x21, x22, [x1, #160]
 400         ldp     x19, x20, [x1, #144]
 401         ldp     x16, x17, [x1, #128]
 402         ldp     x14, x15, [x1, #112]
 403         ldp     x12, x13, [x1,  #96]
 404         ldp     x10, x11, [x1,  #80]
 405         ldp      x8,  x9, [x1,  #64]
 406         ldp      x6,  x7, [x1,  #48]
 407         ldp      x4,  x5, [x1,  #32]
 408         ldp      x2,  x3, [x1,  #16]
 409         ldp      x0,  x1, [x1,   #0]
 410
 411         blr     x30
 412
 413         ldr     x30, [sp, #104]
 414         stp     x27, x28, [x30, #208]
 415         stp     x25, x26, [x30, #192]
 416         stp     x23, x24, [x30, #176]
 417         stp     x21, x22, [x30, #160]
 418         stp     x19, x20, [x30, #144]
 419         stp     x16, x17, [x30, #128]
 420         stp     x14, x15, [x30, #112]
 421         stp     x12, x13, [x30,  #96]
 422         stp     x10, x11, [x30,  #80]
 423         stp      x8,  x9, [x30,  #64]
 424         stp      x6,  x7, [x30,  #48]
 425         stp      x4,  x5, [x30,  #32]
 426         stp      x2,  x3, [x30,  #16]
 427         stp      x0,  x1, [x30,   #0]
 428         mov     x0, x30
 429         mrs     x30, nzcv
 430         stp     x29, x30,  [x0, #224]
 431
 432         ldp     x19, x20, [sp,  #16]
 433         ldp     x21, x22, [sp,  #32]
 434         ldp     x23, x24, [sp,  #48]
 435         ldp     x25, x26, [sp,  #64]
 436         ldp     x27, x28, [sp,  #80]
 437         ldp     x29, x30, [sp], #14*8
 438
 439         ret
 440
 441 #else
 442 #  error "not supported"
 443 #endif
 444
 445 endproc
 446
 447 proc    nop
 448
 449         ret
 450
 451 endproc
 452
 453 ///--------------------------------------------------------------------------
 454 /// 0x00--0x0f
 455
 456 proc    x00
 457
 458         // clear all 64 bits of extended traditional registers
 459
 460 #if defined(__x86_64__)
 461
 462         xor      eax, eax               // clear rax
 463         lea      rbx, [0]               // rbx -> _|_
 464         loop     .                      // iterate, decrement rcx until zero
 465         mov      rdx, 0                 // set rdx = 0
 466         and      esi, 0                 // clear all bits of rsi
 467         sub      edi, edi               // set rdi = edi - edi = 0
 468         push     0
 469         pop      rbp                    // pop 0 into rbp
 470
 471 #elif defined(__i386__)
 472
 473         xor     eax, eax
 474         lea     ebx, [0]
 475         loop    .
 476         mov     edx, 0
 477         and     esi, 0
 478         sub     edi, edi
 479         push    0
 480         pop     ebp
 481
 482 #elif defined(__arm__)
 483
 484         eor     r0, r0, r0
 485         rsb     r1, r1, r1
 486 0:      subs    r2, r2, #1
 487         bne     0b
 488         mov     r3, #0
 489         and     r4, r4, #0
 490         sub     r5, r5, r5
 491
 492 #elif defined(__aarch64__)
 493
 494         eor     w0, w0, w0
 495         mov     w1, wzr
 496 0:      sub     w2, w2, #1
 497         cbnz    w2, 0b
 498         mov     w3, #0
 499         and     w4, w4, wzr
 500         sub     w5, w5, w5
 501
 502 #else
 503         notimpl
 504 #endif
 505
 506         ret
 507
 508 endproc
 509
 510 proc    x01
 511
 512         // advance a fibonacci pair by c steps
 513         //
 514         // on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1}
 515         // and f_{i+c}, where f_{i+1} = f_i + f_{i-1}
 516
 517 #if defined(__x86_64__)
 518
 519 0:      xadd    rax, rdx                // a, d = a + d, a
 520                                         //      = f_{i+1} + f_i, f_{i+1}
 521                                         //      = f_{i+2}, f_{i+1}
 522         loop    0b                      // advance i, decrement c, iterate
 523
 524 #elif defined(__i386__)
 525
 526 0:      xadd    eax, edx
 527         loop    0b
 528
 529 #elif defined(__arm__)
 530
 531 0:      subs    r2, r2, #2
 532         add     r3, r3, r0
 533         blo     8f
 534         add     r0, r0, r3
 535         bhi     0b
 536
 537 8:      movne   r0, r3
 538
 539 #elif defined(__aarch64__)
 540
 541 0:      subs    x2, x2, #2
 542         add     x3, x3, x0
 543         b.lo    8f
 544         add     x0, x0, x3
 545         b.hi    0b
 546
 547 8:      cmov.ne x0, x3
 548
 549 #else
 550         notimpl
 551 #endif
 552
 553         ret
 554
 555 endproc
 556
 557 proc    x02
 558
 559         // boolean canonify a: if a = 0 on entry, leave it zero; otherwise
 560         // set a = 1
 561
 562 #if defined(__x86_64__)
 563
 564         neg     rax                     // set cf iff a /= 0
 565         sbb     rax, rax                // a = a - a - cf = -cf
 566         neg     rax                     // a = cf
 567
 568 #elif defined(__i386__)
 569
 570         neg     eax
 571         sbb     eax, eax
 572         neg     eax
 573
 574 #elif defined(__arm__)
 575
 576         movs    r1, r0                  // the easy way
 577         movne   r1, #1                  // mvnne r1, #1 for mask
 578
 579         cmp     r0, #1                  // clear cf iff a == 0
 580         sbc     r2, r0, r0              // c' = a - a - 1 + cf = cf - 1
 581         add     r2, r2, #1              // c' = cf
 582
 583         sub     r3, r0, r0, lsr #1      // d' top bit clear; d' = 0 iff a = 0
 584         rsb     r3, r3, #0              // d' top bit set iff a /= 0
 585         mov     r3, r3, lsr #31         // asr for mask
 586
 587         rsbs    r0, r0, #0
 588         sbc     r0, r0, r0
 589         rsb     r0, r0, #0
 590
 591 #elif defined(__aarch64__)
 592
 593         cmp     x0, #0                  // trivial
 594         cset.ne x1                      // csetm for mask
 595
 596         cmp     xzr, x0                 // set cf iff a == 0
 597         sbc     x2, x0, x0              // c' = a - a - 1 + cf = cf - 1
 598         neg     x2, x2                  // c' = 1 - cf
 599
 600         sub     x3, x0, x0, lsr #1      // if a < 2^63 then a' = ceil(d/2) <
 601                                         // 2^63
 602                                         // if a >= 2^63, write a = 2^63 + t
 603                                         // with t < 2^63; d' = 2^63 - 2^62 +
 604                                         // ceil(t/2) = 2^62 + ceil(t/2), and
 605                                         // ceil(t/2) < 2^62
 606                                         // anyway d' < 2^63 and d' = 0 iff
 607                                         // a = 0
 608         neg     x3, x3                  // d' top bit set iff a /= 0
 609         lsr     x3, x3, #63             // asr for mask
 610
 611         cmp     x0, #1                  // set cf iff a /= 0
 612         adc     x0, xzr, xzr            // a' = 0 + 0 + cf = cf
 613
 614 #else
 615         notimpl
 616 #endif
 617
 618         ret
 619
 620 endproc
 621
 622 proc    x03
 623
 624         // set a = min(a, d) (unsigned); clobber c, d
 625
 626 #if defined(__x86_64__)
 627
 628         sub     rdx, rax                // d' = d - a; set cf if a > d
 629         sbb     rcx, rcx                // c = -cf = -[a > d]
 630         and     rcx, rdx                // c = a > d ? d - a : 0
 631         add     rax, rcx                // a' = a > d ? d : a
 632
 633 #elif defined(__i386__)
 634
 635         sub     edx, eax
 636         sbb     ecx, ecx
 637         and     ecx, edx
 638         add     eax, ecx
 639
 640 #elif defined(__arm__)
 641
 642         cmp     r0, r3                  // the easy way
 643         movlo   r1, r0                  // only needed for out-of-place
 644         movhs   r1, r3
 645
 646         subs    r3, r3, r0
 647         sbc     r12, r12, r12
 648         and     r12, r12, r3
 649         add     r0, r0, r12
 650
 651 #elif defined(__aarch64__)
 652
 653         cmp     x0, x3                  // the easy way
 654         csel.lo x1, x0, x3
 655
 656         subs    x3, x3, x0              // d' = d - a; set cf if d >= a
 657         sbc     x16, xzr, xzr           // t = -1 + cf = -[a > d]
 658         and     x16, x16, x3            // t = a > d ? d - a : 0
 659         add     x0, x0, x16             // a' = a > d ? d : a
 660
 661 #else
 662         notimpl
 663 #endif
 664
 665         ret
 666
 667 endproc
 668
 669 proc    x04
 670
 671         // switch case?
 672
 673 #if defined(__x86_64__)
 674
 675   // unrelated playing
 676   mov   ecx, eax
 677   mov   rbx, -1
 678   mov   edx, ecx
 679   sub   edx, '0'
 680   cmp   edx, 10
 681   cmovb rbx, rdx
 682   or    ecx, 0x20
 683   mov   edx, ecx
 684   sub   edx, 'a'
 685   sub   ecx, 'a' - 10
 686   cmp   edx, 6
 687   cmovb rbx, rcx
 688
 689         xor     al, 0x20
 690
 691 #elif defined(__i386__)
 692
 693   // unrelated playing
 694   mov   ecx, eax
 695   mov   ebx, -1
 696   mov   edx, ecx
 697   sub   edx, '0'
 698   cmp   edx, 10
 699   cmovb ebx, edx
 700   or    ecx, 0x20
 701   mov   edx, ecx
 702   sub   edx, 'a'
 703   sub   ecx, 'a' - 10
 704   cmp   edx, 6
 705   cmovb ebx, ecx
 706
 707         xor     al, 0x20
 708
 709 #elif defined(__arm__)
 710
 711   // unrelated playing
 712   mvn   r1, #0
 713   sub   r12, r0, #'0'
 714   cmp   r12, #10
 715   movlo r1, r12
 716   orr   r12, r0, #0x20
 717   sub   r12, r12, #'a'
 718   cmp   r12, #6
 719   addlo r1, r12, #10
 720
 721         eor     r0, r0, #0x20
 722
 723 #elif defined(__aarch64__)
 724
 725   // unrelated playing
 726   mov   x1, #-1
 727   sub   w16, w0, #'0'
 728   cmp   w16, #10
 729   cmov.lo       x1, x16
 730   orr   w16, w0, #0x20
 731   sub   w16, w16, #'a' - 10
 732   cmp   w16, #10
 733   ccmp.hs       w16, #16, #CCMP_HS
 734   cmov.lo       x1, x16
 735
 736         eor     w0, w0, #0x20
 737
 738 #else
 739         notimpl
 740 #endif
 741
 742         ret
 743
 744 endproc
 745
 746 proc    x05
 747
 748         // answer whether 5 <= a </<= 9.
 749
 750 #if defined(__x86_64__)
 751
 752         sub     rax, 5                  // a' = a - 5
 753         cmp     rax, 4                  // is a' - 5 </<= 4?
 754
 755         // cc           a'                      a
 756         //
 757         // z/e          a' = 4                  a = 9
 758         // nz/ne        a' /= 4                 a /= 9
 759         //
 760         // a/nbe        a' > 4                  a > 9 or a < 5
 761         // nc/ae/nb     a' >= 4                 a >= 9 or a < 5
 762         // c/b/nae      a' < 4                  5 <= a < 9
 763         // be/na        a' <= 4                 5 <= a <= 9
 764         //
 765         // o            a' < -2^63 + 4          -2^63 + 5 <= a < -2^63 + 9
 766         // no           a' >= -2^63 + 4         a >= -2^63 + 9 or
 767         //                                              a < -2^63 + 5
 768         // s            -2^63 + 4 <= a' < 4     -2^63 + 9 <= a < 9
 769         // ns           a' < -2^63 + 4 or       a < -2^63 + 9 or a >= 9
 770         //                      a' >= 4
 771         // ge/nl        a' >= 4                 a >= 9 or a < -2^63 + 5
 772         // l/nge        a' < 4                  -2^63 + 5 <= a < 9
 773         // g/nle        a' > 4                  a > 9 or a < -2^63 + 5
 774         // le/ng        a' <= 4                 -2^63 + 5 <= a <= 9
 775
 776 #elif defined(__i386__)
 777
 778         sub     eax, 5
 779         cmp     eax, 4
 780
 781 #elif defined(__arm__)
 782
 783         // i dimly remember having a slick way to do this way back in the
 784         // day, but i can't figure it out any more.
 785         sub     r0, #5
 786         cmp     r0, #4
 787
 788 #elif defined(__aarch64__)
 789
 790         // literal translation is too obvious
 791         cmp     x0, #5
 792         ccmp.hs x0, #9, #CCMP_HS
 793
 794 #else
 795         notimpl
 796 #endif
 797
 798         ret
 799
 800 endproc
 801
 802 proc    x06
 803
 804         // leave a unchanged, but set zf if a = 0, cf if a /= 0, clear of,
 805         // set sf to msb(a)
 806
 807 #if defined(__x86_64__)
 808
 809         not     rax                     // a' = -a - 1
 810         inc     rax                     // a' = -a
 811         neg     rax                     // a' = a
 812
 813 #elif defined(__i386__)
 814
 815         not     eax
 816         inc     eax
 817         neg     eax
 818
 819 #elif defined(__arm__)
 820
 821         mvn     r0, r0
 822         add     r0, r0, #1
 823         rsbs    r0, r0, #0              // cf has opposite sense
 824
 825 #elif defined(__aarch64__)
 826
 827         mvn     x0, x0
 828         add     x0, x0, #1
 829         negs    x0, x0                  // cf has opposite sense
 830
 831 #else
 832         notimpl
 833 #endif
 834
 835         ret
 836
 837 endproc
 838
 839 proc    x07
 840
 841         // same as before (?)
 842
 843 #if defined(__x86_64__)
 844
 845         inc     rax                     // a' = a + 1
 846         neg     rax                     // a' = -a - 1
 847         inc     rax                     // a' = -a
 848         neg     rax                     // a' = a
 849
 850 #elif defined(__i386__)
 851
 852         inc     eax
 853         neg     eax
 854         inc     eax
 855         neg     eax
 856
 857 #elif defined(__arm__)
 858
 859         add     r0, r0, #1
 860         rsb     r0, r0, #0
 861         add     r0, r0, #1
 862         rsbs    r0, r0, #0
 863
 864 #elif defined(__aarch64__)
 865
 866         add     x0, x0, #1
 867         neg     x0, x0
 868         add     x0, x0, #1
 869         negs    x0, x0                  // cf has opposite sense
 870
 871 #else
 872         notimpl
 873 #endif
 874
 875         ret
 876
 877 endproc
 878
 879 proc    x08
 880
 881         // floor((a + d)/2), correctly handling overflow conditions; final cf
 882         // is lsb(a + d), probably uninteresting
 883
 884 #if defined(__x86_64__)
 885
 886         add     rax, rdx                // cf || a' = a + d
 887         rcr     rax, 1                  // shift 65-bit result right by one
 888                                         // place; lsb moves into carry
 889
 890 #elif defined(__i386__)
 891
 892         add     eax, edx
 893         rcr     eax, 1
 894
 895 #elif defined(__arm__)
 896
 897         // like the two-instruction a64 version
 898         sub     r1, r3, r0
 899         add     r1, r0, r1, lsr #1
 900
 901         // the slick version, similar to the above
 902         adds    r0, r0, r3
 903         mov     r0, r0, rrx
 904
 905 #elif defined(__aarch64__)
 906
 907         // a64 lacks a32's rrx.  literal translation.
 908         adds    x1, x0, x3              // cf || a' = a + d
 909         adc     x16, xzr, xzr           // realize cf in extra register
 910         extr    x1, x16, x1, #1         // shift down one place
 911
 912         // two instruction version: clobbers additional register.  (if you
 913         // wanted the answer in any other register, even overwriting d, then
 914         // this is unnecessary.)  also depends on d >= a.
 915         sub     x16, x3, x0             // compute difference
 916         add     x0, x0, x16, lsr #1     // add half of it (rounded down)
 917
 918 #else
 919         notimpl
 920 #endif
 921
 922         ret
 923
 924 endproc
 925
 926 proc    x09
 927
 928         // a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3
 929         // (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8).
 930
 931 #if defined(__x86_64__)
 932
 933         shr     rax, 3                  // a' = floor(a/8); cf = 1 if a ==
 934                                         // 4, 5, 6, 7 (mod 8)
 935         adc     rax, 0                  // a' = floor(a/8) + cf
 936
 937 #elif defined(__i386__)
 938
 939         shr     eax, 3
 940         adc     eax, 0
 941
 942 #elif defined(__arm__)
 943
 944         movs    r0, r0, lsr #3
 945         adc     r0, r0, #0
 946
 947 #elif defined(__aarch64__)
 948
 949         tst     x0, #4
 950         orr     x0, xzr, x0, lsr #3
 951         cinc.ne x0, x0
 952
 953 #else
 954         notimpl
 955 #endif
 956
 957         ret
 958
 959 endproc
 960
 961 proc    x0a
 962
 963         // increment c-byte little-endian bignum at rdi
 964
 965 #if defined(__x86_64__)
 966
 967         add     byte ptr [rdi], 1
 968 0:      inc     rdi
 969         adc     byte ptr [rdi], 0
 970         loop    0b
 971
 972 #elif defined(__i386__)
 973
 974         add     byte ptr [edi], 1
 975 0:      inc     edi
 976         adc     byte ptr [edi], 0
 977         loop    0b
 978
 979 #elif defined(__arm__)
 980
 981         mov     r12, #256               // set initial carry
 982 0:      ldrb    r0, [r5]
 983         subs    r2, r2, #1
 984         add     r12, r0, r12, lsr #8
 985         strb    r12, [r5], #1
 986         bne     0b
 987
 988 #elif defined(__aarch64__)
 989
 990         mov     w17, #256               // set initial carry
 991 0:      ldrb    w16, [x5]
 992         sub     x2, x2, #1
 993         add     w17, w16, w17, lsr #8
 994         strb    w17, [x5], #1
 995         cbnz    x2, 0b
 996
 997 #else
 998         notimpl
 999 #endif
1000
1001         ret
1002
1003 endproc
1004
1005 proc    x0b
1006
1007         // negate double-precision d:a
1008
1009 #if defined(__x86_64__)
1010
1011         not     rdx                     // d' = -d - 1
1012         neg     rax                     // a' = -a;
1013                                         // cf = 1 iff a /= 0
1014         sbb     rdx, -1                 // d' = -d - cf
1015
1016 #elif defined(__i386__)
1017
1018         not     edx
1019         neg     eax
1020         sbb     edx, -1
1021
1022 #elif defined(__arm__)
1023
1024         // reverse subtract is awesome
1025         rsbs    r0, r0, #0
1026         rsc     r3, r3, #0
1027
1028 #elif defined(__aarch64__)
1029
1030         // easy way: everything is better with zero registers.
1031         negs    x0, x0
1032         ngc     x3, x3
1033
1034 #else
1035         notimpl
1036 #endif
1037
1038         ret
1039
1040 endproc
1041
1042 proc    x0c
1043
1044         // rotate is distributive over xor.
1045
1046 #if defined(__x86_64__)
1047
1048         // rax                          // = a_1 || a_0
1049         // rbx                          // = b_1 || b_0
1050         mov     rcx, rax                // = a_1 || a_0
1051
1052         xor     rcx, rbx                // = (a_1 XOR b_1) || (a_0 XOR b_0)
1053         ror     rcx, 0xd                // = (a_0 XOR b_0) || (a_1 XOR b_1)
1054
1055         ror     rax, 0xd                // = a_0 || a_1
1056         ror     rbx, 0xd                // = b_0 || b_1
1057         xor     rax, rbx                // = (a_0 XOR b_0) || (a_1 XOR b_1)
1058
1059         cmp     rax, rcx                // always equal
1060
1061 #elif defined(__i386__)
1062
1063         mov     ecx, eax                // = a_1 || a_0
1064
1065         xor     ecx, ebx                // = (a_1 XOR b_1) || (a_0 XOR b_0)
1066         ror     ecx, 0xd                // = (a_0 XOR b_0) || (a_1 XOR b_1)
1067
1068         ror     eax, 0xd                // = a_0 || a_1
1069         ror     ebx, 0xd                // = b_0 || b_1
1070         xor     eax, ebx                // = (a_0 XOR b_0) || (a_1 XOR b_1)
1071
1072         cmp     eax, ecx                // always equal
1073
1074 #elif defined(__arm__)
1075
1076
1077         // r0                           // = a_1 || a_0
1078         // r1                           // = b_1 || b_0
1079         eor     r2, r0, r1              // = (a_1 XOR b_1) || (a_0 XOR b_0)
1080         mov     r2, r2, ror #13         // = (a_0 XOR b_0) || (a_1 XOR b_1)
1081
1082         mov     r1, r1, ror #13         // = b_0 || b_1
1083         eor     r0, r1, r0, ror #13     // = (a_0 XOR b_0) || (a_1 XOR b_1)
1084
1085         cmp     r0, r2                  // always equal
1086
1087 #elif defined(__aarch64__)
1088
1089         // x0                           // = a_1 || a_0
1090         // x1                           // = b_1 || b_0
1091         eor     x2, x0, x1              // = (a_1 XOR b_1) || (a_0 XOR b_0)
1092         ror     x2, x2, #13             // = (a_0 XOR b_0) || (a_1 XOR b_1)
1093
1094         ror     x1, x1, #13             // = b_0 || b_1
1095         eor     x0, x1, x0, ror #13     // = (a_0 XOR b_0) || (a_1 XOR b_1)
1096
1097         cmp     x0, x2                  // always equal
1098
1099 #else
1100         notimpl
1101 #endif
1102
1103         ret
1104
1105 endproc
1106
1107 proc    x0d
1108
1109         // and is distributive over xor.
1110
1111 #if defined(__x86_64__)
1112
1113         mov     rdx, rbx                // = b
1114
1115         xor     rbx, rcx                // = b XOR c
1116         and     rbx, rax                // = a AND (b XOR c)
1117
1118         and     rdx, rax                // = a AND b
1119         and     rax, rcx                // = a AND c
1120         xor     rax, rdx                // = (a AND b) XOR (a AND c)
1121                                         // = a AND (b XOR c)
1122
1123         cmp     rax, rbx                // always equal
1124
1125 #elif defined(__i386__)
1126
1127         mov     edx, ebx                // = b
1128
1129         xor     ebx, ecx                // = b XOR c
1130         and     ebx, eax                // = a AND (b XOR c)
1131
1132         and     edx, eax                // = a AND b
1133         and     eax, ecx                // = a AND c
1134         xor     eax, edx                // = (a AND b) XOR (a AND c)
1135                                         // = a AND (b XOR c)
1136
1137         cmp     eax, ebx                // always equal
1138
1139 #elif defined(__arm__)
1140
1141         and     r3, r0, r1              // = a AND b
1142
1143         eor     r1, r1, r2              // = b XOR c
1144         and     r1, r1, r0              // = a AND (b XOR c)
1145
1146         and     r0, r0, r2              // = a AND c
1147         eor     r0, r0, r3              // = (a AND b) XOR (a AND c)
1148                                         // = a AND (b XOR c)
1149
1150         cmp     r0, r1                  // always equal
1151
1152 #elif defined(__aarch64__)
1153
1154         and     x3, x0, x1              // = a AND b
1155
1156         eor     x1, x1, x2              // = b XOR c
1157         and     x1, x1, x0              // = a AND (b XOR c)
1158
1159         and     x0, x0, x2              // = a AND c
1160         eor     x0, x0, x3              // = (a AND b) XOR (a AND c)
1161                                         // = a AND (b XOR c)
1162
1163         cmp     x0, x1                  // always equal
1164
1165 #else
1166         notimpl
1167 #endif
1168
1169         ret
1170
1171 endproc
1172
1173 proc    x0e
1174
1175         // de morgan's law
1176
1177 #if defined(__x86_64__)
1178
1179         mov     rcx, rax                // = a
1180
1181         and     rcx, rbx                // = a AND b
1182         not     rcx                     // = NOT (a AND b)
1183
1184         not     rax                     // = NOT a
1185         not     rbx                     // = NOT b
1186         or      rax, rbx                // = (NOT a) OR (NOT b)
1187                                         // = NOT (a AND b)
1188
1189         cmp     rax, rcx                // always equal
1190
1191 #elif defined(__i386__)
1192
1193         mov     ecx, eax                // = a
1194
1195         and     ecx, ebx                // = a AND b
1196         not     ecx                     // = NOT (a AND b)
1197
1198         not     eax                     // = NOT a
1199         not     ebx                     // = NOT b
1200         or      eax, ebx                // = (NOT a) OR (NOT b)
1201                                         // = NOT (a AND b)
1202
1203         cmp     eax, ecx                // always equal
1204
1205 #elif defined(__arm__)
1206
1207         and     r2, r0, r1              // = a AND b
1208         mvn     r2, r2                  // = NOT (a AND b)
1209
1210         mvn     r0, r0                  // = NOT a
1211         mvn     r1, r1                  // = NOT b
1212         orr     r0, r0, r1              // = (NOT a) OR (NOT b)
1213
1214         cmp     r0, r2                  // always equal
1215
1216 #elif defined(__aarch64__)
1217
1218         and     x2, x0, x1              // = a AND b
1219         mvn     x2, x2                  // = NOT (a AND b)
1220
1221         mvn     x0, x0                  // = NOT a
1222         orn     x0, x0, x1              // = (NOT a) OR (NOT b)
1223
1224         cmp     x0, x2                  // always equal
1225
1226 #else
1227         notimpl
1228 #endif
1229
1230         ret
1231
1232 endproc
1233
1234 proc    x0f
1235
1236         // replace input buffer bytes with cumulative XORs with initial a;
1237         // final a is XOR of all buffer bytes and initial a.
1238         //
1239         // not sure why you'd do this.
1240
1241 #if defined(__x86_64__)
1242
1243 0:      xor     [rsi], al
1244         lodsb
1245         loop    0b
1246
1247 #elif defined(__i386__)
1248
1249 0:      xor     [esi], al
1250         lodsb
1251         loop    0b
1252
1253 #elif defined(__arm__)
1254
1255 0:      ldrb    r12, [r4]
1256         subs    r2, r2, #1
1257         eor     r0, r0, r12
1258         strb    r0, [r4], #1
1259         bne     0b
1260
1261 #elif defined(__aarch64__)
1262
1263 0:      ldrb    w16, [x4]
1264         sub     x2, x2, #1
1265         eor     w0, w0, w16
1266         strb    w0, [x4], #1
1267         cbnz    x2, 0b
1268
1269 #else
1270         notimpl
1271 #endif
1272
1273         ret
1274
1275 endproc
1276
1277 ///--------------------------------------------------------------------------
1278 /// 0x10--0x1f
1279
1280 proc    x10
1281
1282         // four different ways to swap a pair of registers.
1283
1284 #if defined(__x86_64__)
1285
1286         push    rax
1287         push    rcx
1288         pop     rax
1289         pop     rcx
1290
1291         xor     rax, rcx
1292         xor     rcx, rax
1293         xor     rax, rcx
1294
1295         add     rax, rcx
1296         sub     rcx, rax
1297         add     rax, rcx
1298         neg     rcx
1299
1300         xchg    rax, rcx
1301
1302 #elif defined(__i386__)
1303
1304         push    eax
1305         push    ecx
1306         pop     eax
1307         pop     ecx
1308
1309         xor     eax, ecx
1310         xor     ecx, eax
1311         xor     eax, ecx
1312
1313         add     eax, ecx
1314         sub     ecx, eax
1315         add     eax, ecx
1316         neg     ecx
1317
1318         xchg    eax, ecx
1319
1320 #elif defined(__arm__)
1321
1322         stmfd   r13!, {r0, r2}
1323         ldr     r0, [r13, #4]
1324         ldr     r2, [r13], #8
1325
1326         eor     r0, r0, r2
1327         eor     r2, r2, r0
1328         eor     r0, r0, r2
1329
1330         sub     r0, r0, r2
1331         add     r2, r2, r0
1332         rsb     r0, r0, r2              // don't need 3-addr with reverse-sub
1333
1334         mov     r12, r0
1335         mov     r0, r2
1336         mov     r2, r0
1337
1338 #elif defined(__aarch64__)
1339
1340         // anything you can do
1341         stp     x0, x2, [sp, #-16]!
1342         ldp     x2, x0, [sp], #16
1343
1344         eor     x0, x0, x2
1345         eor     x2, x2, x0
1346         eor     x0, x0, x2
1347
1348         // the add/sub/add thing was daft.  you can do it in three if you're
1349         // clever -- and have three-address operations.
1350         sub     x0, x0, x2
1351         add     x2, x2, x0
1352         sub     x0, x2, x0
1353
1354         // but we lack a fourth.  we can't do this in fewer than three
1355         // instructions without hitting memory.  only `ldp' will modify two
1356         // registers at a time, so we need at least two instructions -- but
1357         // if the first one sets one of our two registers to its final value
1358         // then we lose the other input value with no way to recover it, so
1359         // we must either write a fresh third register, or write something
1360         // other than the final value, and in both cases we need a third
1361         // instruction to fix everything up.  we've done the wrong-something-
1362         // other trick twice, so here's the captain-obvious use-a-third-
1363         // register version.
1364         mov     x16, x0
1365         mov     x0, x2
1366         mov     x2, x16
1367
1368 #else
1369         notimpl
1370 #endif
1371
1372         ret
1373
1374 endproc
1375
1376 proc    x11
1377
1378         // assuming a is initialized to zero, set a to the inclusive or of
1379         // the xor-differences of corresponding bytes in the c-byte strings
1380         // at si and di.
1381         //
1382         // in particular, a will be zero (and zf set) if and only if the two
1383         // strings are equal.
1384
1385 #if defined(__x86_64__)
1386
1387 0:      mov     dl, [rsi]
1388         xor     dl, [rdi]
1389         inc     rsi
1390         inc     rdi
1391         or      al, dl
1392         loop    0b
1393
1394 #elif defined(__i386__)
1395
1396 0:      mov     dl, [esi]
1397         xor     dl, [edi]
1398         inc     esi
1399         inc     edi
1400         or      al, dl
1401         loop    0b
1402
1403 #elif defined(__arm__)
1404
1405 0:      ldrb    r1, [r4], #1
1406         ldrb    r12, [r5], #1
1407         subs    r2, r2, #1
1408         eor     r12, r12, r1
1409         orr     r0, r0, r12
1410         bne     0b
1411
1412 #elif defined(__aarch64__)
1413
1414 0:      ldrb    w16, [x4], #1
1415         ldrb    w17, [x5], #1
1416         sub     x2, x2, #1
1417         eor     w16, w16, w17
1418         orr     w0, w0, w16
1419         cbnz    x2, 0b
1420
1421 #else
1422         notimpl
1423 #endif
1424
1425         ret
1426
1427 endproc
1428
1429 proc    x12
1430
1431         // an obtuse way of adding two registers.  for any bit position, a
1432         // OR d is set if and only if at least one of a and d has a bit set
1433         // in that position, and a AND d is set if and only if both have a
1434         // bit set in that position.  essentially, then, what we've done is
1435         // move all of the set bits in d to a, unless there's already a bit
1436         // there.  this clearly doesn't change the sum.
1437
1438 #if defined(__x86_64__)
1439
1440         mov     rcx, rdx                // c' = d
1441         and     rdx, rax                // d' = a AND d
1442         or      rax, rcx                // a' = a OR d
1443         add     rax, rdx
1444
1445 #elif defined(__i386__)
1446
1447         mov     ecx, edx                // c' = d
1448         and     edx, eax                // d' = a AND d
1449         or      eax, ecx                // a' = a OR d
1450         add     eax, edx
1451
1452 #elif defined(__arm__)
1453
1454         and     r2, r0, r3              // c' = a AND d
1455         orr     r0, r0, r3              // a' = a OR d
1456         add     r0, r0, r2
1457
1458 #elif defined(__aarch64__)
1459
1460         and     x2, x0, x3              // c' = a AND d
1461         orr     x0, x0, x3              // a' = a OR d
1462         add     x0, x0, x2
1463
1464 #else
1465         notimpl
1466 #endif
1467
1468         ret
1469
1470 endproc
1471
1472 proc    x13
1473
1474         // ok, so this is a really obtuse way of adding a and b; the result
1475         // is in a and d.  but why does it work?
1476
1477 #if defined(__x86_64__)
1478
1479         mov     rcx, 0x40               // carry chains at most 64 long
1480 0:      mov     rdx, rax                // copy a'
1481         xor     rax, rbx                // low bits of each bitwise sum
1482         and     rbx, rdx                // carry bits from each bitwise sum
1483         shl     rbx, 1                  // carry them into next position
1484         loop    0b
1485
1486 #elif defined(__i386__)
1487
1488         mov     ecx, 0x40               // carry chains at most 64 long
1489 0:      mov     edx, eax                // copy a'
1490         xor     eax, ebx                // low bits of each bitwise sum
1491         and     ebx, edx                // carry bits from each bitwise sum
1492         shl     ebx, 1                  // carry them into next position
1493         loop    0b
1494
1495 #elif defined(__arm__)
1496
1497         mov     r2, #0x40
1498 0:      and     r3, r0, r1
1499         subs    r2, r2, #1
1500         eor     r0, r0, r1
1501         lsl     r1, r3, #1
1502         bne     0b
1503
1504 #elif defined(__aarch64__)
1505
1506         mov     x2, #0x40
1507 0:      and     x3, x0, x1
1508         sub     x2, x2, #1
1509         eor     x0, x0, x1
1510         lsl     x1, x3, #1
1511         cbnz    x2, 0b
1512
1513 #else
1514         notimpl
1515 #endif
1516
1517         ret
1518
1519 endproc
1520
1521 proc    x14
1522
1523         // floor((a + d)/2), like x08.
1524
1525 #if defined(__x86_64__)
1526
1527         mov     rcx, rax                // copy a for later
1528         and     rcx, rdx                // carry bits
1529
1530         xor     rax, rdx                // low bits of each bitwise sum
1531         shr     rax, 1                  // divide by 2; carries now in place
1532
1533         add     rax, rcx                // add the carries; done
1534
1535 #elif defined(__i386__)
1536
1537         mov     ecx, eax                // copy a for later
1538         and     ecx, edx                // carry bits
1539
1540         xor     eax, edx                // low bits of each bitwise sum
1541         shr     eax, 1                  // divide by 2; carries now in place
1542
1543         add     eax, ecx                // add the carries; done
1544
1545 #elif defined(__arm__)
1546
1547         and     r2, r0, r3
1548         eor     r0, r0, r3
1549         add     r0, r2, r0, lsr #1
1550
1551 #elif defined(__aarch64__)
1552
1553         and     x2, x0, x3
1554         eor     x0, x0, x3
1555         add     x0, x2, x0, lsr #1
1556
1557 #else
1558         notimpl
1559 #endif
1560
1561         ret
1562
1563 endproc
1564
1565 proc    x15
1566
1567         // sign extension 32 -> 64 bits.
1568
1569 #if defined(__x86_64__)
1570
1571         movsx   rbx, eax                // like this?
1572
1573         mov     rdx, 0xffffffff80000000
1574         add     rax, rdx                // if bit 31 of a is set then bits
1575                                         // 31--63 of a' are clear; otherwise,
1576                                         // these bits are all set -- which is
1577                                         // exactly backwards
1578         xor     rax, rdx                // so fix it
1579
1580 #elif defined(__i386__)
1581
1582         movsx   ebx, ax                 // like this?
1583
1584         mov     edx, 0xffff8000
1585         add     eax, edx                // if bit 31 of a is set then bits
1586                                         // 31--63 of a' are clear; otherwise,
1587                                         // these bits are all set -- which is
1588                                         // exactly backwards
1589         xor     eax, edx                // so fix it
1590
1591 #elif defined(__arm__)
1592
1593         sxth    r1, r0                  // like this
1594
1595         mov     r12, #0x80000000
1596         add     r0, r0, r12, asr #16
1597         eor     r0, r0, r12, asr #16
1598
1599 #elif defined(__aarch64__)
1600
1601         sxtw    x1, w0                  // like this
1602
1603         mov     x16, #0xffffffff80000000
1604         add     x0, x0, x16
1605         eor     x0, x0, x16
1606
1607 #else
1608         notimpl
1609 #endif
1610
1611         ret
1612
1613 endproc
1614
1615 proc    x16
1616
1617         // ??? i don't know why you'd want to calculate this.
1618
1619 #if defined(__x86_64__)
1620
1621         xor     rax, rbx                // a' = a XOR b
1622         xor     rbx, rcx                // b' = b XOR c
1623         mov     rsi, rax                // t = a XOR b
1624         add     rsi, rbx                // t = (a XOR b) + (b XOR c)
1625         cmovc   rax, rbx                // a' = cf ? b XOR c : a XOR b
1626         xor     rax, rbx                // a' = cf ? 0 : a XOR c
1627         cmp     rax, rsi
1628
1629 #elif defined(__i386__)
1630
1631         xor     eax, ebx                // a' = a XOR b
1632         xor     ebx, ecx                // b' = b XOR c
1633         mov     esi, eax                // t = a XOR b
1634         add     esi, ebx                // t = (a XOR b) + (b XOR c)
1635         cmovc   eax, ebx                // a' = cf ? b XOR c : a XOR b
1636         xor     eax, ebx                // a' = cf ? 0 : a XOR c
1637         cmp     eax, esi
1638
1639 #elif defined(__arm__)
1640
1641         eor     r0, r0, r1
1642         eor     r1, r1, r2
1643         adds    r4, r0, r1
1644         movcs   r0, r1
1645         eor     r0, r0, r1
1646         cmp     r0, r4
1647
1648 #elif defined(__aarch64__)
1649
1650         eor     x0, x0, x1
1651         eor     x1, x1, x2
1652         adds    x4, x0, x1
1653         cmov.cs x0, x1
1654         eor     x0, x0, x1
1655         cmp     x0, x4
1656
1657 #else
1658         notimpl
1659 #endif
1660
1661         ret
1662
1663 endproc
1664
1665 proc    x17
1666
1667         // absolute value
1668
1669 #if defined(__x86_64__)
1670
1671         cqo                             // d = a < 0 ? -1 : 0
1672         xor     rax, rdx                // a' = a < 0 ? -a - 1 : a
1673         sub     rax, rdx                // a' = a < 0 ? -a : a
1674
1675 #elif defined(__i386__)
1676
1677         cdq                             // d = a < 0 ? -1 : 0
1678         xor     eax, edx                // a' = a < 0 ? -a - 1 : a
1679         sub     eax, edx                // a' = a < 0 ? -a : a
1680
1681 #elif defined(__arm__)
1682
1683         // direct approach
1684         movs    r1, r0
1685         rsbmi   r1, r0, #0
1686
1687         // faithful-ish conversion
1688         eor     r3, r0, r0, asr #31
1689         sub     r0, r3, r0, asr #31
1690
1691 #elif defined(__aarch64__)
1692
1693         // direct approach
1694         tst     x0, #1 << 63
1695         cneg.ne x1, x0
1696
1697         // faithful-ish conversion
1698         eor     x3, x0, x0, asr #63
1699         sub     x0, x3, x0, asr #63
1700
1701 #else
1702         notimpl
1703 #endif
1704
1705         ret
1706
1707 endproc
1708
1709 proc    x18
1710
1711         // should always set sf, clear zf, unless we get rescheduled to a
1712         // different core.
1713
1714 #if defined(__x86_64__)
1715
1716         rdtsc                           // d || a = cycles
1717         shl     rdx, 0x20
1718         or      rax, rdx                // a = cycles
1719         mov     rcx, rax                // c = cycles
1720
1721         rdtsc                           // d || a = cycles'
1722         shl     rdx, 0x20
1723         or      rax, rdx                // a = cycles'
1724
1725         cmp     rcx, rax
1726
1727 #elif defined(__i386__)
1728
1729         rdtsc                           // d || a = cycles
1730         mov     ebx, eax
1731         mov     ecx, edx                // c || b = cycles
1732
1733         rdtsc                           // d || a = cycles'
1734
1735         sub     ebx, eax
1736         sbb     ecx, edx
1737
1738 #elif defined(__arm__)
1739
1740         // cycle clock not available in user mode
1741         mrrc    p15, 0, r0, r1, c9
1742         mrrc    p15, 0, r2, r3, c9
1743         subs    r0, r0, r2
1744         sbcs    r1, r1, r3
1745
1746 #elif defined(__aarch64__)
1747
1748         // cycle clock not available in user mode
1749         mrs     x0, pmccntr_el0
1750         mrs     x1, pmccntr_el0
1751         cmp     x0, x1
1752
1753 #else
1754         notimpl
1755 #endif
1756
1757         ret
1758
1759 endproc
1760
1761 proc    x19
1762
1763         // stupid way to capture a pointer to inline data and jump past it.
1764         // confuses the return-address predictor something chronic.  worse
1765         // because amd64 calling convention doesn't usually pass arguments on
1766         // the stack.
1767
1768 #if defined(__x86_64__)
1769
1770         call    8f
1771         .string "hello world!\n\0"
1772 8:      call    print_str
1773         add     rsp, 8
1774         ret
1775
1776 print_str:
1777         // actually implement this ridiculous thing
1778         mov     rsi, [rsp + 8]
1779         xor     edx, edx
1780 0:      mov     al, [rsi + rdx]
1781         inc     rdx
1782         cmp     al, 0
1783         jnz     0b
1784         mov     eax, SYS_write
1785         mov     edi, 1
1786         dec     rdx
1787         syscall                         // clobbers r11 :-(
1788         ret
1789
1790 #elif defined(__i386__)
1791
1792         call    8f
1793         .string "hello world!\n\0"
1794 8:      call    print_str
1795         add     esp, 4
1796         ret
1797
1798 print_str:
1799         // actually implement this ridiculous thing
1800         mov     ecx, [esp + 4]
1801         xor     edx, edx
1802 0:      mov     al, [ecx + edx]
1803         inc     edx
1804         cmp     al, 0
1805         jnz     0b
1806         mov     eax, SYS_write
1807         mov     ebx, 1
1808         dec     edx
1809         int     0x80
1810         ret
1811
1812 #elif defined(__arm__)
1813
1814         // why am i doing this?
1815         stmfd   r13!, {r14}
1816         bl      8f
1817         .string "hello world!\n\0"
1818         .balign 4
1819 8:      mov     r1, r14               // might as well make it easy on myself
1820         bl      print_str
1821         ldmfd   r13!, {pc}
1822
1823 print_str:
1824         mov     r2, #0
1825 0:      ldrb    r0, [r1, r2]
1826         cmp     r0, #0
1827         addne   r2, r2, #1
1828         bne     0b
1829         mov     r0, #1
1830         mov     r7, #SYS_write
1831         swi     0
1832         bx      r14
1833
1834 #elif defined(__aarch64__)
1835
1836         // why am i doing this?
1837         str     x30, [sp, #-16]!
1838         bl      8f
1839         .string "hello world!\n\0"
1840         .balign 4
1841 8:      mov     x1, x30               // might as well make it easy on myself
1842         bl      print_str
1843         ldr     x30, [sp], #16
1844         ret
1845
1846 print_str:
1847         mov     x2, #0
1848 0:      ldrb    w0, [x1, x2]
1849         cmp     w0, #0
1850         cinc.ne x2, x2
1851         b.ne    0b
1852         mov     x0, #1
1853         mov     x8, #SYS_write
1854         svc     #0
1855         ret
1856
1857 #else
1858         notimpl
1859 #endif
1860
1861 endproc
1862
1863 proc    x1a
1864
1865         // collect the current instruction-pointer address.  this was an old
1866         // 32-bit i386 trick for position-independent code, but (a) it
1867         // confuses the return predictor, and (b) amd64 has true pc-relative
1868         // addressing.
1869
1870 #if defined(__x86_64__)
1871
1872         // the actual example
1873         call    0f
1874 0:      pop     rax
1875
1876         // the modern i386 trick doesn't confuse the return-address
1877         // predictor.
1878         call    calladdr_rbx
1879         sub     rbx, . - 0b
1880
1881         // but rip-relative addressing is even better
1882         lea     rcx, [rip + 0b]
1883
1884         ret
1885
1886 calladdr_rbx:
1887         mov     rbx, [rsp]
1888         ret
1889
1890 #elif defined(__i386__)
1891
1892         // the actual example
1893         call    0f
1894 0:      pop     eax
1895
1896         // the modern i386 trick doesn't confuse the return-address
1897         // predictor.
1898         call    get_pc_ebx
1899         sub     ebx, . - 0b
1900
1901         ret
1902
1903 #elif defined(__arm__)
1904
1905         stmfd   r13!, {r14}
1906
1907         bl      0f
1908 0:      mov     r0, r14
1909
1910         bl      return
1911         sub     r1, r14, #. - 0b
1912
1913         adr     r2, 0b
1914
1915         ldmfd   r13!, {pc}
1916
1917 return: bx      r14
1918
1919 #elif defined(__aarch64__)
1920
1921         str     x30, [sp, #-16]!
1922
1923         // we can do all of the above using a64
1924         bl      0f
1925 0:      mov     x0, x30
1926
1927         bl      return
1928         sub     x1, x30, #. - 0b
1929
1930         adr     x2, 0b
1931
1932         ldr     x30, [sp], #16
1933 return: ret
1934
1935 #else
1936         notimpl
1937 #endif
1938
1939 endproc
1940
1941 proc    x1b
1942
1943 #if defined(__x86_64__)
1944
1945         // retpolines: an mitigation against adversarially influenced
1946         // speculative execution at indirect branches.  if an adversary can
1947         // prepare a branch-target buffer entry matching an indirect branch
1948         // in the victim's address space then they can cause the victim to
1949         // /speculatively/ (but not architecturally) execute any code in
1950         // their address space, possibly leading to leaking secrets through
1951         // the cache.  retpolines aren't susceptible to this because the
1952         // predicted destination address is from the return-prediction stack
1953         // which the adversary can't prime.  the performance penalty is still
1954         // essentially a branch misprediction -- for this return, and
1955         // possibly all others already stacked.
1956
1957         // (try not to crash)
1958         lea     rax, [rip + 9f]
1959
1960         push    rax
1961 9:      ret
1962
1963 #elif defined(__i386__)
1964
1965         call    get_pc_ebx
1966         lea     eax, [ebx + 9f - .]
1967
1968         push    eax
1969 9:      ret
1970
1971 #elif defined(__arm__)
1972
1973         stmfd   r13!, {r14}
1974
1975         adr     r14, 8f
1976         bx      r14
1977
1978 8:      ldmfd   r13!, {pc}
1979
1980 #elif defined(__aarch64__)
1981
1982         str     x30, [sp, #-16]!
1983
1984         adr     x30, 8f
1985         ret
1986
1987 8:      ldr     x30, [sp], #16
1988         ret
1989
1990 #else
1991         notimpl
1992 #endif
1993
1994 endproc
1995
1996 proc    x1c
1997
1998         // ok, having a hard time seeing a use for this.  the most important
1999         // thing to note is that sp is set from `pop' /after/ it's
2000         // incremented.
2001
2002 #if defined(__x86_64__)
2003
2004         // try not to crash
2005         mov     rax, rsp
2006         and     rsp, -16
2007         push    rax
2008
2009         pop     rsp
2010
2011         // check it worked
2012         mov     rbx, rsp
2013         ret
2014
2015 #elif defined(__i386__)
2016
2017         // try not to crash
2018         mov     eax, esp
2019         and     esp, -16
2020         push    eax
2021
2022         pop     esp
2023
2024         // check it worked
2025         mov     ebx, esp
2026         ret
2027
2028 #elif defined(__arm__)
2029
2030         // not even going to dignify this
2031         notimpl
2032
2033 #elif defined(__aarch64__)
2034
2035         // not even going to dignify this
2036         notimpl
2037
2038 #else
2039         notimpl
2040 #endif
2041
2042 endproc
2043
2044 proc    x1d
2045
2046         // monumentally cheesy way to copy 8 n bytes from buff1 to buff2.
2047         // also clobbers words at buff2 + 8 n and buff2 - 8 for good measure.
2048
2049         n = 4
2050
2051 #if defined(__x86_64__)
2052
2053         mov     rax, rsp                        // safekeeping
2054
2055         // we're toast if we get hit by a signal now.  fingers crossed...
2056   .if 0
2057         mov     rsp, buff2 + 8*n + 8
2058         mov     rbp, buff1 + 8*n
2059   .else
2060         lea     rsp, [rdi + 8*n + 16]
2061         lea     rbp, [rsi + 8*n]
2062   .endif
2063         enter   0, n + 1
2064
2065         // precise action:
2066         //
2067         //         +---------+                  +---------+
2068         //  rbp -> |   ???   |           rsp -> |   ???   |
2069         //         +---------+                  +---------+
2070         //         | w_{n-1} |                  |   rbp   | <- rbp'
2071         //         +---------+                  +---------+
2072         //         |   ...   |                  | w_{n-1} |
2073         //         +---------+                  +---------+
2074         //         |   w_1   |                  |   ...   |
2075         //         +---------+                  +---------+
2076         //         |   w_0   |                  |   w_1   |
2077         //         +---------+                  +---------+
2078         //                                      |   w_0   |
2079         //                                      +---------+
2080         //                                      |   rbp'  | <- rsp'
2081         //                                      +---------+
2082
2083         mov     rdx, rsp
2084         mov     rsp, rax
2085
2086 #elif defined(__i386__)
2087
2088         mov     eax, esp                        // safekeeping
2089
2090         // we're toast if we get hit by a signal now.  fingers crossed...
2091   .if 0
2092         mov     esp, buff2 + 4*n + 4
2093         mov     ebp, buff1 + 4*n
2094   .else
2095         lea     esp, [edi + 4*n + 8]
2096         lea     ebp, [esi + 4*n]
2097   .endif
2098         enter   0, n + 1
2099
2100         mov     edx, esp
2101         mov     esp, eax
2102
2103 #elif defined(__arm__)
2104
2105         add     r4, r4, #4*n
2106         add     r5, r5, #4*n + 8
2107
2108         str     r4, [r5, #-4]!
2109   .rept n/2
2110         ldrd    r0, r1, [r4, #-8]!
2111         strd    r0, r1, [r5, #-8]!
2112   .endr
2113         add     r4, r5, #4*n
2114         str     r4, [r5, #-4]!
2115
2116 #elif defined(__aarch64__)
2117
2118         // omgwtf.  let's not actually screw with the stack pointer.
2119
2120         add     x4, x4, #8*n
2121         add     x5, x5, #8*n + 16
2122
2123         str     x4, [x5, #-8]!
2124   .rept n/2
2125         ldp     x16, x17, [x4, #-16]!
2126         stp     x16, x17, [x5, #-16]!
2127   .endr
2128         add     x4, x5, #8*n
2129         str     x4, [x5, #-8]!
2130
2131 #else
2132         notimpl
2133 #endif
2134
2135         ret
2136
2137 endproc
2138
2139 proc    x1e
2140
2141         // convert nibble value to (uppercase) hex; other input values yield
2142         // nonsense.
2143
2144 #if defined(__x86_64__)
2145
2146         // das doesn't work in 64-bit mode; best i can come up with
2147         mov     edx, eax
2148         add     al, '0'
2149         add     dl, 'A' - 10
2150         cmp     al, '9' + 1
2151         cmovae  eax, edx
2152
2153 #elif defined(__i386__)
2154
2155         cmp     al, 0x0a                // cf = 1 iff a < 10
2156         sbb     al, 0x69                // if 0 <= a < 10, a' = a - 0x6a, so
2157                                         // 0x96 <= a' < 0x70, setting af, cf
2158                                         // if 10 <= a < 16, a' = a - 0x69, so
2159                                         // 0x71 <= a' < 0x77, setting cf but
2160                                         // clearing af
2161         das                             // if 0 <= a < 10, then af and cf are
2162                                         // both set, so set subtract 0x66
2163                                         // from a' leaving 0x30 <= a' < 0x3a;
2164                                         // if 10 <= a < 16 then af clear but
2165                                         // cf set, so subtract 0x60 from a'
2166                                         // leaving 0x41 <= a' < 0x47
2167
2168 #elif defined(__arm__)
2169
2170         // significantly less tricksy
2171         cmp     r0, #10
2172         addlo   r0, r0, #'0'
2173         addhs   r0, r0, #'A' - 10
2174
2175 #elif defined(__aarch64__)
2176
2177         // with less versatile conditional execution this is the best we can
2178         // do
2179         cmp     w0, #10
2180         add     w16, w0, #'A' - 10
2181         add     w0, w0, #'0'
2182         cmov.hs w0, w16
2183
2184 #else
2185         notimpl
2186 #endif
2187
2188         ret
2189
2190 endproc
2191
2192 proc    x1f
2193
2194         // verify collatz conjecture starting at a; assume a /= 0!
2195
2196 #if defined(__x86_64__)
2197
2198 0:      bsf     rcx, rax                // clobber c if a = 0
2199         shr     rax, cl                 // a = 2^c a'
2200   cmp rdx, 0
2201   je 1f
2202   stosq
2203   dec rdx
2204 1:
2205         cmp     rax, 1                  // done?
2206         je      9f
2207         lea     rax, [2*rax + rax + 1]  // a' = 3 a' + 1
2208         jmp     0b                      // again
2209
2210 9:      ret
2211
2212 #elif defined(__i386__)
2213
2214 0:      bsf     ecx, eax                // clobber c if a = 0
2215         shr     eax, cl                 // a = 2^c a'
2216   cmp edx, 0
2217   je 1f
2218   stosd
2219   dec edx
2220 1:
2221         cmp     eax, 1                  // done?
2222         je      9f
2223         lea     eax, [2*eax + eax + 1]  // a' = 3 a' + 1
2224         jmp     0b                      // again
2225
2226 9:      ret
2227
2228 #elif defined(__arm__)
2229
2230         // rbit introduced in armv7
2231 0:      rbit    r2, r0
2232         clz     r2, r2
2233         mov     r0, r0, lsr r2          // a = 2^c a'
2234   cmp r3, #0
2235   strne r0, [r5], #4
2236   subne r3, r3, #1
2237         cmp     r0, #1
2238         adcne   r0, r0, r0, lsl #1      // a' = 3 a' + 1 (because c set)
2239         bne     0b
2240
2241         ret
2242
2243 #elif defined(__aarch64__)
2244
2245 0:      rbit    w2, w0
2246         clz     w2, w2
2247         lsr     w0, w0, w2              // a = 2^c a'
2248   cmp x3, #0
2249   beq 1f
2250   str x0, [x5], #8
2251   sub x3, x3, #1
2252 1:
2253         cmp     w0, #1
2254         add     w16, w0, w0, lsl #1     // t = 3 a' + 1 (because c set)
2255         csinc.eq w0, w0, w16
2256         b.ne    0b
2257
2258         ret
2259
2260 #else
2261         notimpl
2262 #endif
2263
2264 endproc
2265
2266 ///--------------------------------------------------------------------------
2267 /// 0x20--0x2f
2268
2269 proc    x20
2270
2271         // calculate 1337 a slowly
2272
2273 #if defined(__x86_64__)
2274
2275         // original version
2276         mov     rcx, rax                // c = a
2277         shl     rcx, 2                  // c = 4 a
2278         add     rcx, rax                // c = 5 a
2279         shl     rcx, 3                  // c = 40 a
2280         add     rcx, rax                // c = 41 a
2281         shl     rcx, 1                  // c = 82 a
2282         add     rcx, rax                // c = 83 a
2283         shl     rcx, 1                  // c = 166 a
2284         add     rcx, rax                // c = 167 a
2285         shl     rcx, 3                  // c = 1336 a
2286         add     rcx, rax                // c = 1337 a
2287
2288         // a quick way
2289         lea     rdx, [2*rax + rax]      // t = 3 a
2290         shl     rdx, 6                  // t = 192 a
2291         sub     rdx, rax                // t = 191 a
2292         lea     rbx, [8*rdx]            // b = 1528 a
2293         sub     rbx, rdx                // b = 1337 a
2294
2295 #elif defined(__i386__)
2296
2297         // original version
2298         mov     ecx, eax                // c = a
2299         shl     ecx, 2                  // c = 4 a
2300         add     ecx, eax                // c = 5 a
2301         shl     ecx, 3                  // c = 40 a
2302         add     ecx, eax                // c = 41 a
2303         shl     ecx, 1                  // c = 82 a
2304         add     ecx, eax                // c = 83 a
2305         shl     ecx, 1                  // c = 166 a
2306         add     ecx, eax                // c = 167 a
2307         shl     ecx, 3                  // c = 1336 a
2308         add     ecx, eax                // c = 1337 a
2309
2310         // a quick way
2311         lea     edx, [2*eax + eax]      // t = 3 a
2312         shl     edx, 6                  // t = 192 a
2313         sub     edx, eax                // t = 191 a
2314         lea     ebx, [8*edx]            // b = 1528 a
2315         sub     ebx, edx                // b = 1337 a
2316
2317 #elif defined(__arm__)
2318
2319         // original version, ish
2320         add     r2, r0, r0, lsl #2      // c = 5 a
2321         add     r2, r0, r2, lsl #3      // c = 41 a
2322         add     r2, r0, r2, lsl #1      // c = 83 a
2323         add     r2, r0, r2, lsl #1      // c = 167 a
2324         add     r2, r0, r2, lsl #3      // c = 1337 a
2325
2326         // quicker way
2327         add     r1, r0, r0, lsl #1      // b = 3 a
2328         rsb     r1, r0, r1, lsl #6      // b = 191 a
2329         rsb     r1, r1, r1, lsl #3      // b = 1337 a
2330
2331 #elif defined(__aarch64__)
2332
2333         // original version, ish
2334         add     x2, x0, x0, lsl #2      // c = 5 a
2335         add     x2, x0, x2, lsl #3      // c = 41 a
2336         add     x2, x0, x2, lsl #1      // c = 83 a
2337         add     x2, x0, x2, lsl #1      // c = 167 a
2338         add     x2, x0, x2, lsl #3      // c = 1337 a
2339
2340         // sleazy because no rsb
2341         add     x1, x0, x0, lsl #1      // b = 3 a
2342         sub     x1, x0, x1, lsl #6      // b = -191 a
2343         sub     x1, x1, x1, lsl #3      // b = 1337 a
2344
2345 #else
2346         notimpl
2347 #endif
2348
2349         ret
2350
2351 endproc
2352
2353 proc    x21
2354
2355         // multiply complex numbers a + b i and c + d i
2356         //
2357         //      (a + b i) (c + d i) = (a c - b d) + (a d + b c) i
2358         //
2359         // somewhat slick approach uses only three multiplications
2360
2361 #if defined(__x86_64__)
2362
2363         mov     rsi, rax                // t = a
2364         add     rax, rbx                // a' = a + b
2365         mov     rdi, rdx                // u = d
2366         sub     rdx, rcx                // d' = d - c
2367         add     rdi, rcx                // u = c + d
2368
2369         imul    rax, rcx                // a' = c (a + b)
2370         imul    rsi, rdx                // t = a (d - c)
2371         imul    rdi, rbx                // u = b (c + d)
2372
2373         add     rsi, rax                // t = a (d - c) + c (a + b)
2374         mov     rbx, rsi                // b' = a (d - c) + c (a + b)
2375                                         //      = a d + b c
2376         sub     rax, rdi                // a' = c (a + b) - b (c + d)
2377                                         //      = a c - b d
2378
2379 #elif defined(__i386__)
2380
2381         mov     esi, eax                // t = a
2382         add     eax, ebx                // a' = a + b
2383         mov     edi, edx                // u = d
2384         sub     edx, ecx                // d' = d - c
2385         add     edi, ecx                // u = c + d
2386
2387         imul    eax, ecx                // a' = c (a + b)
2388         imul    esi, edx                // t = a (d - c)
2389         imul    edi, ebx                // u = b (c + d)
2390
2391         add     esi, eax                // t = a (d - c) + c (a + b)
2392         mov     ebx, esi                // b' = a (d - c) + c (a + b)
2393                                         //      = a d + b c
2394         sub     eax, edi                // a' = c (a + b) - b (c + d)
2395                                         //      = a c - b d
2396
2397 #elif defined(__arm__)
2398
2399         add     r4, r0, r1              // t = a + b
2400         add     r5, r2, r3              // u = c + d
2401         sub     r3, r3, r2              // d' = d - c
2402
2403         // mls introduced in armv7
2404         mul     r4, r4, r2              // t = c (a + b)
2405         mov     r2, r1                  // c' = a (bah!)
2406         mla     r1, r0, r3, r4          // b' = a (d - c) + c (a + b)
2407                                         //      = a d + b c
2408         mls     r0, r2, r5, r4          // a' = c (a + b) - b (c + d)
2409                                         //      = a c - b d
2410
2411 #elif defined(__aarch64__)
2412
2413         add     x4, x0, x1              // t = a + b
2414         add     x5, x2, x3              // u = c + d
2415         sub     x3, x3, x2              // d' = d - c
2416
2417         // mls intxoduced in axmv7
2418         mul     x4, x4, x2              // t = c (a + b)
2419         mov     x2, x1                  // c' = a (bah!)
2420         madd    x1, x0, x3, x4          // b' = a (d - c) + c (a + b)
2421                                         //      = a d + b c
2422         msub    x0, x2, x5, x4          // a' = c (a + b) - b (c + d)
2423                                         //      = a c - b d
2424
2425 #else
2426         notimpl
2427 #endif
2428
2429         ret
2430
2431 endproc
2432
2433 proc    x22
2434
2435         // divide by 3
2436
2437 #if defined(__x86_64__)
2438
2439         mov     rdx, 0xaaaaaaaaaaaaaaab // = ceil(2/3 2^64)
2440         mul     rdx                     // d' || a' =~ 2/3 a 2^64
2441         shr     rdx, 1                  // d' = floor(a/3)
2442         mov     rax, rdx                // a' = floor(a/3)
2443
2444         // we start with 0 <= a < 2^64.  write f = ceil(2/3 2^64), so that
2445         // 2/3 < f/2^64 < 2/3 + 1/2^64.  then floor(2/3 a) <= floor(a f/2^64)
2446         // <= floor(2/3 a + a/2^64), but a < 2^64 so a/2^64 < 1 and
2447         // floor(a f/2^64) = floor(2/3 a).
2448
2449 #elif defined(__i386__)
2450
2451         mov     edx, 0xaaaaaaab         // = ceil(2/3 2^32)
2452         mul     edx                     // d' || a' =~ 2/3 a 2^32
2453         shr     edx, 1                  // d' = floor(a/3)
2454         mov     eax, edx                // a' = floor(a/3)
2455
2456 #elif defined(__arm__)
2457
2458         ldr     r12, =0xaaaaaaab
2459         umull   r12, r0, r0, r12
2460         mov     r0, r0, lsr #1
2461
2462 #elif defined(__aarch64__)
2463
2464         ldr     x16, =0xaaaaaaaaaaaaaaab
2465         umulh   x0, x0, x16
2466         lsr     x0, x0, #1
2467
2468 #else
2469         notimpl
2470 #endif
2471
2472         ret
2473
2474 endproc
2475
2476 proc    x23
2477
2478 #if defined(__x86_64__)
2479
2480         // main loop: shorten a preserving residue class mod 3
2481 0:      cmp     rax, 5
2482         jbe     8f
2483         // a > 5
2484         mov     rdx, rax                // d' = a
2485         shr     rdx, 2                  // d' = floor(a/4)
2486         and     rax, 3                  // a = 4 d' + a' (0 <= a' < 4)
2487         add     rax, rdx                // a' == a (mod 3) but a' < a/4 + 4
2488         jmp     0b
2489
2490         // fix up final value 0 <= a < 6: want 0 <= a < 3
2491         //
2492         // the tricky part is actually a = 3; but the other final cases take
2493         // additional iterations which we can avoid.
2494 8:      cmp     rax, 3                  // set cf iff a < 3
2495         cmc                             // set cf iff a >= 3
2496         sbb     rdx, rdx                // d' = a >= 3 ? -1 : 0
2497         and     rdx, 3                  // d' = a >= 3 ? 3 : 0
2498         sub     rax, rdx                // a' = a - (a >= 3 ? 3 : 0)
2499                                         //      = a (mod 3)
2500
2501 #elif defined(__i386__)
2502
2503         // main loop: shorten a preserving residue class mod 3
2504 0:      cmp     eax, 5
2505         jbe     8f
2506         // a > 5
2507         mov     edx, eax                // d' = a
2508         shr     edx, 2                  // d' = floor(a/4)
2509         and     eax, 3                  // a = 4 d' + a' (0 <= a' < 4)
2510         add     eax, edx                // a' == a (mod 3) but a' < a/4 + 4
2511         jmp     0b
2512
2513         // fix up final value 0 <= a < 6: want 0 <= a < 3
2514         //
2515         // the tricky part is actually a = 3; but the other final cases take
2516         // additional iterations which we can avoid.
2517 8:      cmp     eax, 3                  // set cf iff a < 3
2518         cmc                             // set cf iff a >= 3
2519         sbb     edx, edx                // d' = a >= 3 ? -1 : 0
2520         and     edx, 3                  // d' = a >= 3 ? 3 : 0
2521         sub     eax, edx                // a' = a - (a >= 3 ? 3 : 0)
2522                                         //      = a (mod 3)
2523
2524 #elif defined(__arm__)
2525
2526 0:      cmp     r0, #6
2527         andhs   r12, r0, #3
2528         addhs   r0, r12, r0, lsr #2
2529         bhs     0b
2530
2531         cmp     r0, #3
2532         subhs   r0, r0, #3
2533
2534 #elif defined(__aarch64__)
2535
2536 0:      cmp     x0, #6
2537         // blunder on through regardless since this doesn't affect the result
2538         and     x16, x0, #3
2539         add     x0, x16, x0, lsr #2
2540         b.hs    0b
2541
2542         subs    x16, x0, #3
2543         cmov.hs x0, x16
2544
2545 #else
2546         notimpl
2547 #endif
2548
2549         ret
2550
2551 endproc
2552
2553 proc    x24
2554
2555         // invert (odd) a mod 2^64
2556         //
2557         // suppose a a_i == 1 (mod 2^{2^i})
2558         //
2559         // clearly good for i = 0, since 2^i = 1 and 2^{2^i} = 2, and a_0 =
2560         // a == 1 (mod 2) by assumption
2561         //
2562         // write a a_i == b_i 2^{2^i} + 1 (mod 2^{2^{i+1}})
2563         // then b_i == (a a_i - 1)/2^{2^i} (mod 2^{2^i})
2564         // to lift inverse, we want x such that a x == -b_i (mod 2^{2^i});
2565         // clearly x = -a_i b_i will do, since a a_i == 1 (mod 2^{2^i})
2566         // then:
2567         // a_{i+1} = a_i - a_i b_i 2^{2^i} = a_i (1 - (a a_i - 1))
2568         //      = 2 a_i - a a_i^2
2569         //
2570         // check:
2571         // a a_{i+1} = 2 a a_i - a^2 a_i^2
2572         //      == 2 a a_i - (b_i 2^{2^i} + 1)^2
2573         //      == 2 (b_i 2^{2^i} + 1) -
2574         //              (b_i^2 2^{2^{i+1}} + 2 b_i 2^{2^i} + 1)
2575         //      == 1 (mod 2^{2^{i+1}})
2576
2577 #if defined(__x86_64__)
2578
2579         // rax                          // a_0 = a
2580         mov     rbx, rax                // b' = a
2581         mov     rsi, rax                // t = a_0
2582
2583 0:
2584   cmp rbp, 0
2585   je 1f
2586   stosq
2587   dec rbp
2588 1:
2589         mul     rbx                     // a' = a a_i
2590         mov     rcx, rax                // c = a a_i
2591
2592         sub     rax, 2                  // a' = a a_i - 2
2593         neg     rax                     // a' = 2 - a a_i
2594         mul     rsi                     // a_{i+1} = a_i (2 - a a_i)
2595                                         //      = 2 a_i - a a_i^2
2596         mov     rsi, rax                // t = a_{i+1}
2597
2598         cmp     rcx, 1                  // done?
2599         ja      0b                      // no -- iterate
2600
2601 #elif defined(__i386__)
2602
2603         // eax                          // a_0 = a
2604         mov     ebx, eax                // b' = a
2605         mov     esi, eax                // t = a_0
2606
2607 0:
2608   cmp ebp, 0
2609   je 1f
2610   stosd
2611   dec ebp
2612 1:
2613         mul     ebx                     // a' = a a_i
2614         mov     ecx, eax                // c = a a_i
2615
2616         sub     eax, 2                  // a' = a a_i - 2
2617         jb      9f                      // done if < 2
2618         neg     eax                     // a' = 2 - a a_i
2619         mul     esi                     // a_{i+1} = a_i (2 - a a_i)
2620                                         //      = 2 a_i - a a_i^2
2621         mov     esi, eax                // t = a_{i+1}
2622
2623         jmp     0b                      // and iterate
2624 9:      mov     eax, esi                // restore
2625
2626 #elif defined(__arm__)
2627
2628         // r0                           // a_0 = a
2629         mov     r1, r0                  // b' = a
2630
2631 0:
2632   cmp r6, #0
2633   strne r0, [r5], #4
2634   subne r6, r6, #1
2635         mul     r2, r0, r1              // c = a a_i
2636         rsbs    r2, r2, #2              // c = 2 - a a_i
2637         mul     r0, r0, r2              // a_{i+1} = a_i (2 - a a_i)
2638                                         //      = 2 a_i - a a_i^2
2639         blo     0b
2640
2641 #elif defined(__aarch64__)
2642
2643         // x0                           // a_0 = a
2644         mov     x1, x0                  // b' = a
2645         mov     x16, #2                 // because we have no rsb
2646
2647 0:
2648   cmp x6, #0
2649   b.eq 1f
2650   str x0, [x5], #8
2651   sub x6, x6, #1
2652 1:
2653         mul     x2, x0, x1              // c = a a_i
2654         subs    x2, x16, x2             // c = 2 - a a_i
2655         mul     x0, x0, x2              // a_{i+1} = a_i (2 - a a_i)
2656                                         //      = 2 a_i - a a_i^2
2657         b.lo    0b
2658
2659 #else
2660         notimpl
2661 #endif
2662
2663         ret
2664
2665 endproc
2666
2667 proc    x25
2668
2669         // a poor approximation to pi/4
2670         //
2671         // think of x and y as being in 16.16 fixed-point format.  we sample
2672         // points in the unit square, and determine how many of them are
2673         // within a unit quarter-circle centred at the origin.  the area of
2674         // the quarter-circle is pi/4.
2675
2676 #if defined(__x86_64__)
2677
2678         xor     eax, eax                // a = 0
2679         mov     rcx, 1
2680         shl     rcx, 0x20               // c =~ 4 billion
2681
2682 0:      movzx   rbx, cx                 // x = low 16 bits of c
2683         imul    rbx, rbx                // b = x^2
2684
2685         ror     rcx, 0x10               // switch halves of c
2686         movzx   rdx, cx                 // y = high 16 bits of c
2687         imul    rdx, rdx                // d = y^2
2688         rol     rcx, 0x10               // switch back
2689
2690         add     rbx, rdx                // r^2 = x^2 + y^2
2691         shr     rbx, 0x20               // r^2 >= 1?
2692         cmp     rbx, 1                  // set cf iff r^2 >= 1
2693         adc     rax, 0                  // and add onto accumulator
2694         loop    0b
2695
2696 #elif defined(__i386__)
2697
2698         // this is actually better done in 32 bits.  the carry has the wrong
2699         // sense here, so instead deduct one for each point outside the
2700         // quarter-circle rather than adding one for each point inside it.
2701         xor     eax, eax
2702         xor     ecx, ecx
2703
2704 0:      movzx   ebx, cx
2705         imul    ebx, ebx
2706
2707         mov     edx, ecx
2708         shr     edx, 0x10
2709         imul    edx, edx
2710
2711         add     ebx, edx                // see?
2712         sbb     eax, 0
2713         loop    0b
2714
2715 #elif defined(__arm__)
2716
2717         mov     r0, #0
2718         mov     r2, #0
2719
2720 0:      uxth    r1, r2, ror #0
2721         uxth    r3, r2, ror #16
2722         mul     r1, r1, r1
2723         mul     r3, r3, r3
2724         cmn     r1, r3                  // mlas doesn't set cf usefully
2725         addcc   r0, r0, #1
2726         adds    r2, r2, #1
2727         bne     0b
2728
2729 #elif defined(__aarch64__)
2730
2731         mov     w0, #0
2732         mov     w2, #0
2733
2734 0:      ubfx    w1, w2, #0, #16
2735         ubfx    w3, w2, #16, #16
2736         sub     w2, w2, #1
2737         mul     w1, w1, w1
2738         mul     w3, w3, w3
2739         cmn     w1, w3
2740         cinc.cc w0, w0
2741         cbnz    w2, 0b
2742
2743 #else
2744         notimpl
2745 #endif
2746
2747         ret
2748
2749 endproc
2750
2751 proc    x26
2752
2753         // a bad way to rotate a right by 7 places
2754
2755 #if defined(__x86_64__)
2756
2757         mov     rbx, rax
2758         ror     rbx, 7                  // better
2759
2760         mov     rdx, rax                // d' = a
2761         shr     rax, 7                  // a' = a >> 7
2762         shl     rdx, 0x39               // d' = a << 57
2763         or      rax, rdx                // a' = a >>> 7
2764
2765 #elif defined(__i386__)
2766
2767         mov     ebx, eax
2768         ror     ebx, 7                  // better
2769
2770         mov     edx, eax                // d' = a
2771         shr     eax, 7                  // a' = a >> 7
2772         shl     edx, 0x39               // d' = a << 57
2773         or      eax, edx                // a' = a >>> 7
2774
2775 #elif defined(__arm__)
2776
2777         mov     r1, r0, ror #7          // easy way
2778
2779         // even the hard way is fairly easy on arm
2780         mov     r3, r0, lsl #25
2781         orr     r0, r3, r0, lsr #7      // hard way
2782
2783 #elif defined(__aarch64__)
2784
2785         ror     x1, x0, #7              // easy way
2786
2787         // even the hard way is fairly easy on arm
2788         lsl     x3, x0, #57
2789         orr     x0, x3, x0, lsr #7      // hard way
2790
2791 #else
2792         notimpl
2793 #endif
2794
2795         ret
2796
2797 endproc
2798
2799 proc    x27
2800
2801         // shift a right by c places, in two halves
2802
2803 #if defined(__x86_64__)
2804
2805         mov     ch, cl                  // c' = [c, c]
2806         inc     ch                      // c' = [c, c + 1]
2807         shr     ch, 1
2808         shr     cl, 1                   // c' = [floor(c/2), ceil(c/2)]
2809         shr     rax, cl
2810         xchg    ch, cl
2811         shr     rax, cl
2812
2813 #elif defined(__i386__)
2814
2815         mov     ch, cl                  // c' = [c, c]
2816         inc     ch                      // c' = [c, c + 1]
2817         shr     ch, 1
2818         shr     cl, 1                   // c' = [floor(c/2), ceil(c/2)]
2819         shr     eax, cl
2820         xchg    ch, cl
2821         shr     eax, cl
2822
2823 #elif defined(__arm__)
2824
2825         // it would be clearer and more efficient to say: `mov r12, r2, lsr
2826         // #1; sub r2, r2, r12', but that's not the lesson this exercise is
2827         // trying to teach.
2828         add     r12, r2, #1
2829         mov     r2, r2, lsr #1
2830         mov     r12, r12, lsr #1
2831         mov     r0, r0, lsr r2
2832         mov     r0, r0, lsr r12
2833
2834 #elif defined(__aarch64__)
2835
2836         add     w16, w2, #1
2837         lsr     w2, w2, #1
2838         lsr     w16, w16, #1
2839         lsr     x0, x0, x2
2840         lsr     x0, x0, x16
2841
2842 #else
2843         notimpl
2844 #endif
2845
2846         ret
2847
2848 endproc
2849
2850 proc    x28
2851
2852         // divide c-byte little-endian bignum at rsi by 2 (rounding down)
2853
2854 #if defined(__x86_64__)
2855
2856         clc
2857 0:      rcr     byte ptr [rsi], 1
2858         inc     rsi
2859         loop    0b
2860
2861 #elif defined(__i386__)
2862
2863         clc
2864 0:      rcr     byte ptr [esi], 1
2865         inc     esi
2866         loop    0b
2867
2868 #elif defined(__arm__)
2869
2870         // we could hack this a word at a time using rrx
2871         mov     r3, #0
2872 0:      ldrb    r12, [r4]
2873         subs    r2, r2, #1
2874         orr     r3, r3, r12, lsr #1
2875         strb    r3, [r4], #1
2876         mov     r3, r12, lsl #7
2877         bne     0b
2878
2879 #elif defined(__aarch64__)
2880
2881         mov     w16, #0
2882 0:      ldrb    w17, [x4]
2883         sub     x2, x2, #1
2884         orr     w16, w16, w17, lsr #1
2885         strb    w16, [x4], #1
2886         lsl     w16, w17, #7
2887         cbnz    x2, 0b
2888
2889 #else
2890         notimpl
2891 #endif
2892
2893         ret
2894
2895 endproc
2896
2897 proc    x29
2898
2899         // fill a buffer with a 3-byte pattern
2900
2901 #if defined(__x86_64__)
2902
2903         lea     rdi, [rsi + 3]
2904         rep movsb
2905
2906 #elif defined(__i386__)
2907
2908         lea     edi, [esi + 3]
2909         rep movsb
2910
2911 #elif defined(__arm__)
2912
2913         add     r5, r4, #3
2914 0:      subs    r2, r2, #1
2915         ldrhsb  r12, [r4], #1
2916         strhsb  r12, [r5], #1
2917         bhs     0b
2918
2919 #elif defined(__aarch64__)
2920
2921         cbz     x2, 9f
2922         add     x5, x4, #3
2923 0:      sub     x2, x2, #1
2924         ldrb    w16, [x4], #1
2925         strb    w16, [x5], #1
2926         cbnz    x2, 0b
2927 9:
2928
2929 #else
2930         notimpl
2931 #endif
2932
2933         ret
2934
2935 endproc
2936
2937 proc    x2a
2938
2939         // rotate the words in a buffer, so that the last word comes first,
2940         // the first comes second, and so on.  this isn't a good way to do
2941         // it.
2942
2943 #if defined(__x86_64__)
2944
2945         mov     rsi, rbx                // set string pointers
2946         mov     rdi, rbx
2947 0:      lodsq                           // fetch next word
2948         xchg    rax, qword ptr [rbx]    // stash it for next iteration and
2949                                         // replace it with the previously
2950                                         // stashed word
2951         stosq                           // store in output
2952         // (note that the first iteration doesn't actually do anything)
2953         loop    0b                      // continue until all done
2954
2955 #elif defined(__i386__)
2956
2957         mov     esi, ebx                // set string pointers
2958         mov     edi, ebx
2959 0:      lodsd                           // fetch next word
2960         xchg    eax, dword ptr [ebx]    // stash it for next iteration and
2961                                         // replace it with the previously
2962                                         // stashed word
2963         stosd                           // store in output
2964         loop    0b                      // continue until all done
2965
2966 #elif defined(__arm__)
2967
2968         // let's do this a sensible way.  (we could go faster using ldm/stm.)
2969         add     r0, r1, r2, lsl #2      // find the end of the buffer
2970         ldr     r0, [r0, #-4]           // collect final element
2971 0:      subs    r2, r2, #1
2972         ldr     r12, [r1]
2973         str     r0, [r1], #4
2974         mov     r0, r12
2975         bne     0b
2976
2977 #elif defined(__aarch64__)
2978
2979         add     x0, x1, x2, lsl #3      // find the end of the buffer
2980         ldr     x0, [x0, #-8]           // collect final element
2981 0:      sub     x2, x2, #1
2982         ldr     x16, [x1]
2983         str     x0, [x1], #8
2984         mov     x0, x16
2985         cbnz    x2, 0b
2986
2987 #else
2988         notimpl
2989 #endif
2990
2991         ret
2992
2993 endproc
2994
2995 proc    x2b
2996
2997         // find a cycle in a function f: B -> B, where B = {0, 1, ..., 255}
2998
2999 #if defined(__x86_64__)
3000
3001         // this is floyd's cycle-finding algorithm.
3002         //
3003         // consider the sequence s_0 = 0, s_1 = f(0), s_2 = f(f(0)), ...,
3004         // s_{i+1} = f(s_i).  since B is finite, there must be some smallest
3005         // t and c such that s(t) = s(t + c); then we have s_i = s_j iff
3006         // i >= t, j >= t, and i == j (mod c).
3007         //
3008         // the algorithm sets two cursors advancing through the sequence: a
3009         // /tortoise/ which advances one step at a time, and a /hare/ which
3010         // advances by two, so when the tortoise is at element s_i, the hare
3011         // is at s_{2i}.  the hare will run around the cycle and catch the
3012         // tortoise when i >= t and i == 2 i (mod c); the latter is simply i
3013         // == 0 (mod c), which therefore happens first when i = k = t +
3014         // (-t mod c).
3015         //
3016         // i'm not sure what good xlatb does here that mov al, [rbx + al]
3017         // doesn't.
3018
3019         xor     eax, eax                // tortoise starts at 0
3020         xor     edx, edx                // hare starts at 0
3021 0:      xlatb                           // advance tortoise
3022         xchg    rax, rdx                // switch to hare
3023         xlatb                           // advance hare ...
3024         xlatb                           // ... twice
3025         xchg    rax, rdx                // switch back
3026         cmp     al, dl                  // hare caught the tortoise?
3027         jnz     0b                      // no -- go around again
3028
3029         // now we trace the initial tail: reset the tortoise to s_0, and slow
3030         // the hare down so that both take only a single step in each
3031         // iteration.  this loop terminates when i >= t and i == i + 2 k
3032         // (mod c).  we know k is a multiple of c, so the latter condition
3033         // always holds, so this finds the first step of the cycle.
3034
3035         xor     eax, eax                // reset the tortoise
3036 0:      xlatb                           // advance tortoise
3037         xchg    rax, rdx                // switch to hare
3038         xlatb                           // advance hare
3039         xchg    rax, rdx                // and switch back
3040         cmp     al, dl                  // done?
3041         jnz     0b                      // no -- iterate
3042
3043 #elif defined(__i386__)
3044
3045         xor     eax, eax                // tortoise starts at 0
3046         xor     edx, edx                // hare starts at 0
3047 0:      xlatb                           // advance tortoise
3048         xchg    eax, edx                // switch to hare
3049         xlatb                           // advance hare ...
3050         xlatb                           // ... twice
3051         xchg    eax, edx                // switch back
3052         cmp     al, dl                  // hare caught the tortoise?
3053         jnz     0b                      // no -- go around again
3054
3055         xor     eax, eax                // reset the tortoise
3056 0:      xlatb                           // advance tortoise
3057         xchg    eax, edx                // switch to hare
3058         xlatb                           // advance hare
3059         xchg    eax, edx                // and switch back
3060         cmp     al, dl                  // done?
3061         jnz     0b                      // no -- iterate
3062
3063 #elif defined(__arm__)
3064
3065         mov     r0, #0
3066         mov     r3, #0
3067 0:      ldrb    r0, [r1, r0]
3068         ldrb    r3, [r1, r3]
3069         ldrb    r3, [r1, r3]
3070         cmp     r0, r3
3071         bne     0b
3072
3073         mov     r0, #0
3074 0:      ldrb    r0, [r1, r0]
3075         ldrb    r3, [r1, r3]
3076         cmp     r0, r3
3077         bne     0b
3078
3079 #elif defined(__aarch64__)
3080
3081         mov     w0, #0
3082         mov     w3, #0
3083 0:      ldrb    w0, [x1, x0]
3084         ldrb    w3, [x1, x3]
3085         ldrb    w3, [x1, x3]
3086         cmp     w0, w3
3087         b.ne    0b
3088
3089         mov     w0, #0
3090 0:      ldrb    w0, [x1, x0]
3091         ldrb    w3, [x1, x3]
3092         cmp     w0, w3
3093         b.ne    0b
3094
3095 #else
3096         notimpl
3097 #endif
3098
3099         ret
3100
3101 endproc
3102
3103 proc    x2c
3104
3105         // a convoluted way to set rax = rsi
3106
3107 #if defined(__x86_64__)
3108
3109         mov     qword ptr [rbx + 8*rcx], 0 // b[c] = 0
3110         mov     qword ptr [rbx + 8*rdx], 1 // b[d] = 1
3111         mov     rax, [rbx + 8*rcx]      // a' = b[c] = 0
3112
3113         mov     [rbx], rsi              // b[0] = t
3114         mov     [rbx + 8], rdi          // b[1] = u
3115         mov     rax, [rbx + 8*rax]      // a' = b[a'] = b[0] = t
3116
3117 #elif defined(__i386__)
3118
3119         mov     dword ptr [ebx + 8*ecx], 0 // b[c] = 0
3120         mov     dword ptr [ebx + 8*edx], 1 // b[d] = 1
3121         mov     eax, [ebx + 8*ecx]      // a' = b[c] = 0
3122
3123         mov     [ebx], esi              // b[0] = t
3124         mov     [ebx + 8], edi          // b[1] = u
3125         mov     eax, [ebx + 8*eax]      // a' = b[a'] = b[0] = t
3126
3127 #elif defined(__arm__)
3128
3129         mov     r0, #0
3130         mov     r12, #1
3131
3132         str     r0, [r1, r2, lsl #2]
3133         str     r12, [r1, r3, lsl #2]
3134         ldr     r0, [r1, r2, lsl #2]
3135
3136         str     r4, [r1]
3137         str     r5, [r1, #4]
3138         ldr     r0, [r1, r0, lsl #2]
3139
3140 #elif defined(__aarch64__)
3141
3142         mov     x16, #1
3143
3144         str     xzr, [x1, x2, lsl #3]
3145         str     x16, [x1, x3, lsl #3]
3146         ldr     x0, [x1, x2, lsl #3]
3147
3148         str     x4, [x1]
3149         str     x5, [x1, #8]
3150         ldr     x0, [x1, x0, lsl #3]
3151
3152 #else
3153         notimpl
3154 #endif
3155
3156         ret
3157
3158 endproc
3159
3160 proc    x2d
3161
3162         // clear the least significant set bit in a, by calculating a' =
3163         // a AND (a - 1).
3164         //
3165         // if a = 0 then a' = 0.  otherwise, a - 1 differs from a exactly in
3166         // the least significant /set/ bit of a, and all bits of lesser
3167         // significance.  to put it another way: write a = u 2^{k+1} + 2^k;
3168         // then a - 1 = u 2^{k+1} + 2^{k-1} + ... + 2 + 1.  taking the
3169         // bitwise AND of these leaves only the bits common to both, i.e.,
3170         // u 2^{k+1}.
3171
3172 #if defined(__x86_64__)
3173
3174         mov     rdx, rax                // d' = a
3175         dec     rax                     // a' = a - 1
3176         and     rax, rdx                // a' = a AND (a - 1)
3177
3178 #elif defined(__i386__)
3179
3180         mov     edx, eax                // d' = a
3181         dec     eax                     // a' = a - 1
3182         and     eax, edx                // a' = a AND (a - 1)
3183
3184 #elif defined(__arm__)
3185
3186         sub     r3, r0, #1
3187         and     r0, r0, r3
3188
3189 #elif defined(__aarch64__)
3190
3191         sub     x3, x0, #1
3192         and     x0, x0, x3
3193
3194 #else
3195         notimpl
3196 #endif
3197
3198         ret
3199
3200 endproc
3201
3202 proc    x2e
3203
3204         // compute a mask of one bits in exactly the positions of the
3205         // low-order run of zero bits in a
3206
3207 #if defined(__x86_64__)
3208
3209         mov     rdx, rax                // d' = a
3210         dec     rdx                     // d' = a - 1
3211         xor     rax, rdx                // a = a XOR (a - 1)
3212                                         //   set bits are least significant
3213                                         //   set bit of a, and all bits of
3214                                         //   lesser significance
3215         shr     rax, 1                  // now only bits of lesser
3216                                         //   significance; a' = 0 iff a odd
3217         cmp     rax, rdx                // equal if a = 0 or 2^k; otherwise
3218                                         //   strictly less
3219
3220 #elif defined(__i386__)
3221
3222         mov     edx, eax
3223         dec     edx
3224         xor     eax, edx
3225         shr     eax, 1
3226         cmp     eax, edx
3227
3228 #elif defined(__arm__)
3229
3230         sub     r3, r0, #1
3231         eor     r0, r0, r3
3232         mov     r0, r0, lsr #1          // probably fold shift into next inst
3233         cmp     r0, r3
3234
3235 #elif defined(__aarch64__)
3236
3237         sub     x3, x0, #1
3238         eor     x0, x0, x3
3239         mov     x0, x0, lsr #1          // probably fold shift into next inst
3240         cmp     x0, x3
3241
3242 #else
3243         notimpl
3244 #endif
3245
3246         ret
3247
3248 endproc
3249
3250 proc    x2f
3251
3252         // a slow population count
3253
3254 #if defined(__x86_64__)
3255
3256         popcnt  rbx, rcx                // the easy way
3257
3258         // a fast version in software
3259         mov     rax, rcx
3260
3261         mov     rdx, rcx
3262         shr     rdx, 1
3263         mov     rsi, 0x5555555555555555
3264         and     rax, rsi
3265         and     rdx, rsi
3266         add     rax, rdx
3267
3268         mov     rdx, rax
3269         shr     rdx, 2
3270         mov     rsi, 0x3333333333333333
3271         and     rax, rsi
3272         and     rdx, rsi
3273         add     rax, rdx
3274
3275         mov     rdx, rax
3276         shr     rdx, 32
3277         add     rax, rdx
3278
3279         mov     rdx, rax
3280         shr     rdx, 4
3281         and     rax, 0x0f0f0f0f
3282         and     rdx, 0x0f0f0f0f
3283         add     rax, rdx
3284
3285         mov     rdx, rax
3286         shr     rdx, 8
3287         add     rax, rdx
3288
3289         mov     rdx, rax
3290         shr     rdx, 16
3291         add     rax, rdx
3292         movzx   rsi, al
3293
3294         // the official version
3295         xor     eax, eax                // clear iteration counter
3296 0:      jrcxz   9f                      // bail if c = 0
3297         inc     rax                     // bump iteration count
3298         mov     rdx, rcx                // d' = c
3299         dec     rdx                     // d' = c - 1
3300         and     rcx, rdx                // zap least significant set bit of c
3301         jmp     0b                      // and go again
3302 9:
3303
3304 #elif defined(__i386__)
3305
3306         popcnt  ebx, ecx                // the easy way
3307
3308         mov     eax, ecx
3309
3310         mov     edx, ecx
3311         shr     edx, 1
3312         and     eax, 0x55555555
3313         and     edx, 0x55555555
3314         add     eax, edx
3315
3316         mov     edx, eax
3317         shr     edx, 2
3318         and     eax, 0x33333333
3319         and     edx, 0x33333333
3320         add     eax, edx
3321
3322         mov     edx, eax
3323         shr     edx, 4
3324         add     eax, edx
3325
3326         mov     edx, eax
3327         shr     edx, 8
3328         and     eax, 0x000f000f
3329         and     edx, 0x000f000f
3330         add     eax, edx
3331
3332         mov     edx, eax
3333         shr     edx, 16
3334         add     eax, edx
3335         movzx   esi, al
3336
3337         xor     eax, eax
3338 0:      jecxz   9f
3339         inc     eax
3340         mov     edx, ecx
3341         dec     edx
3342         and     ecx, edx
3343         jmp     0b
3344 9:
3345
3346 #elif defined(__arm__)
3347
3348         // the easy-ish way
3349         vmov    d0[0], r2
3350         vcnt.8  d0, d0
3351         vmov    r1, d0[0]
3352         add     r1, r1, r1, lsl #8
3353         add     r1, r1, r1, lsl #16
3354         mov     r1, r1, lsr #24
3355
3356         // the hard way
3357         movw    r12, #0x5555
3358         movt    r12, #0x5555
3359         and     r3, r12, r2, lsr #1
3360         and     r0, r12, r2
3361         add     r0, r0, r3
3362
3363         movw    r12, #0x3333
3364         movt    r12, #0x3333
3365         and     r3, r12, r0, lsr #2
3366         and     r0, r12, r0
3367         add     r0, r0, r3
3368
3369         add     r0, r0, r0, lsl #16
3370
3371         movt    r12, 0x0f0f
3372         and     r3, r12, r0, lsr #4
3373         and     r0, r12, r0
3374         add     r0, r0, r3
3375
3376         add     r0, r0, r0, lsl #8
3377
3378         mov     r4, r0, lsr #24
3379
3380         // and following the exercise
3381         mov     r0, #0
3382         cmp     r2, #0
3383         beq     9f
3384 0:      add     r0, r0, #1
3385         sub     r3, r2, #1
3386         ands    r2, r2, r3
3387         bne     0b
3388 9:
3389
3390 #elif defined(__aarch64__)
3391
3392         // the easy-ish way
3393         mov     v0.d[0], x2
3394         cnt     v0.8b, v0.8b
3395         mov     x1, v0.d[0]
3396         add     x1, x1, x1, lsl #8
3397         add     x1, x1, x1, lsl #16
3398         add     x1, x1, x1, lsl #32
3399         lsr     x1, x1, #56
3400
3401         // the hard way -- though arm64's immediate constant encodings and
3402         // shifting make this actually rather pleasant.
3403         and     x3, x2, #0xaaaaaaaaaaaaaaaa
3404         and     x0, x2, #0x5555555555555555
3405         add     x0, x0, x3, lsr #1
3406
3407         and     x3, x0, #0xcccccccccccccccc
3408         and     x0, x0, #0x3333333333333333
3409         add     x0, x0, x3, lsr #2
3410
3411         add     x0, x0, x0, lsr #4
3412
3413         and     x3, x0, #0x0f000f000f000f00
3414         and     x0, x0, #0x000f000f000f000f
3415         add     x0, x3, x0, lsl #8
3416
3417         add     x0, x0, x0, lsl #16
3418         add     x0, x0, x0, lsl #32
3419         lsr     x4, x0, #56
3420
3421         // and the official way
3422         mov     x0, #0
3423         cbz     x2, 9f
3424 0:      add     x0, x0, #1
3425         sub     x3, x2, #1
3426         and     x2, x2, x3
3427         cbnz    x2, 0b
3428 9:
3429
3430 #else
3431         notimpl
3432 #endif
3433
3434         ret
3435
3436 endproc
3437
3438 ///--------------------------------------------------------------------------
3439 /// 0x30--0x3f
3440
3441 proc    x30
3442
3443 #if defined(__x86_64__)
3444
3445         notimpl
3446
3447 #elif defined(__i386__)
3448
3449         notimpl
3450
3451 #elif defined(__arm__)
3452
3453         notimpl
3454
3455 #elif defined(__aarch64__)
3456
3457         notimpl
3458
3459 #else
3460         notimpl
3461 #endif
3462
3463         ret
3464
3465 endproc
3466
3467 proc    x31
3468
3469 #if defined(__x86_64__)
3470
3471         notimpl
3472
3473 #elif defined(__i386__)
3474
3475         notimpl
3476
3477 #elif defined(__arm__)
3478
3479         notimpl
3480
3481 #elif defined(__aarch64__)
3482
3483         notimpl
3484
3485 #else
3486         notimpl
3487 #endif
3488
3489 endproc
3490
3491 proc    x32
3492
3493 #if defined(__x86_64__)
3494
3495         notimpl
3496
3497 #elif defined(__i386__)
3498
3499         notimpl
3500
3501 #elif defined(__arm__)
3502
3503         notimpl
3504
3505 #elif defined(__aarch64__)
3506
3507         notimpl
3508
3509 #else
3510         notimpl
3511 #endif
3512
3513 endproc
3514
3515 proc    x33
3516
3517 #if defined(__x86_64__)
3518
3519         notimpl
3520
3521 #elif defined(__i386__)
3522
3523         notimpl
3524
3525 #elif defined(__arm__)
3526
3527         notimpl
3528
3529 #elif defined(__aarch64__)
3530
3531         notimpl
3532
3533 #else
3534         notimpl
3535 #endif
3536
3537 endproc
3538
3539 proc    x34
3540
3541 #if defined(__x86_64__)
3542
3543         notimpl
3544
3545 #elif defined(__i386__)
3546
3547         notimpl
3548
3549 #elif defined(__arm__)
3550
3551         notimpl
3552
3553 #elif defined(__aarch64__)
3554
3555         notimpl
3556
3557 #else
3558         notimpl
3559 #endif
3560
3561 endproc
3562
3563 proc    x35
3564
3565 #if defined(__x86_64__)
3566
3567         notimpl
3568
3569 #elif defined(__i386__)
3570
3571         notimpl
3572
3573 #elif defined(__arm__)
3574
3575         notimpl
3576
3577 #elif defined(__aarch64__)
3578
3579         notimpl
3580
3581 #else
3582         notimpl
3583 #endif
3584
3585 endproc
3586
3587 proc    x36
3588
3589 #if defined(__x86_64__)
3590
3591         notimpl
3592
3593 #elif defined(__i386__)
3594
3595         notimpl
3596
3597 #elif defined(__arm__)
3598
3599         notimpl
3600
3601 #elif defined(__aarch64__)
3602
3603         notimpl
3604
3605 #else
3606         notimpl
3607 #endif
3608
3609 endproc
3610
3611 proc    x37
3612
3613 #if defined(__x86_64__)
3614
3615         notimpl
3616
3617 #elif defined(__i386__)
3618
3619         notimpl
3620
3621 #elif defined(__arm__)
3622
3623         notimpl
3624
3625 #elif defined(__aarch64__)
3626
3627         notimpl
3628
3629 #else
3630         notimpl
3631 #endif
3632
3633 endproc
3634
3635 proc    x38
3636
3637 #if defined(__x86_64__)
3638
3639         notimpl
3640
3641 #elif defined(__i386__)
3642
3643         notimpl
3644
3645 #elif defined(__arm__)
3646
3647         notimpl
3648
3649 #elif defined(__aarch64__)
3650
3651         notimpl
3652
3653 #else
3654         notimpl
3655 #endif
3656
3657 endproc
3658
3659 proc    x39
3660
3661 #if defined(__x86_64__)
3662
3663         notimpl
3664
3665 #elif defined(__i386__)
3666
3667         notimpl
3668
3669 #elif defined(__arm__)
3670
3671         notimpl
3672
3673 #elif defined(__aarch64__)
3674
3675         notimpl
3676
3677 #else
3678         notimpl
3679 #endif
3680
3681 endproc
3682
3683 proc    x3a
3684
3685 #if defined(__x86_64__)
3686
3687         notimpl
3688
3689 #elif defined(__i386__)
3690
3691         notimpl
3692
3693 #elif defined(__arm__)
3694
3695         notimpl
3696
3697 #elif defined(__aarch64__)
3698
3699         notimpl
3700
3701 #else
3702         notimpl
3703 #endif
3704
3705 endproc
3706
3707 proc    x3b
3708
3709 #if defined(__x86_64__)
3710
3711         notimpl
3712
3713 #elif defined(__i386__)
3714
3715         notimpl
3716
3717 #elif defined(__arm__)
3718
3719         notimpl
3720
3721 #elif defined(__aarch64__)
3722
3723         notimpl
3724
3725 #else
3726         notimpl
3727 #endif
3728
3729 endproc
3730
3731 proc    x3c
3732
3733 #if defined(__x86_64__)
3734
3735         notimpl
3736
3737 #elif defined(__i386__)
3738
3739         notimpl
3740
3741 #elif defined(__arm__)
3742
3743         notimpl
3744
3745 #elif defined(__aarch64__)
3746
3747         notimpl
3748
3749 #else
3750         notimpl
3751 #endif
3752
3753 endproc
3754
3755 proc    x3d
3756
3757 #if defined(__x86_64__)
3758
3759         notimpl
3760
3761 #elif defined(__i386__)
3762
3763         notimpl
3764
3765 #elif defined(__arm__)
3766
3767         notimpl
3768
3769 #elif defined(__aarch64__)
3770
3771         notimpl
3772
3773 #else
3774         notimpl
3775 #endif
3776
3777 endproc
3778
3779 proc    x3e
3780
3781 #if defined(__x86_64__)
3782
3783         notimpl
3784
3785 #elif defined(__i386__)
3786
3787         notimpl
3788
3789 #elif defined(__arm__)
3790
3791         notimpl
3792
3793 #elif defined(__aarch64__)
3794
3795         notimpl
3796
3797 #else
3798         notimpl
3799 #endif
3800
3801 endproc
3802
3803 proc    x3f
3804
3805 #if defined(__x86_64__)
3806
3807         notimpl
3808
3809 #elif defined(__i386__)
3810
3811         notimpl
3812
3813 #elif defined(__arm__)
3814
3815         notimpl
3816
3817 #elif defined(__aarch64__)
3818
3819         notimpl
3820
3821 #else
3822         notimpl
3823 #endif
3824
3825 endproc
3826
3827 ///----- That's all, folks --------------------------------------------------