/// -*- mode: asm; asm-comment-char: 0 -*-
///--------------------------------------------------------------------------
#include <sys/syscall.h>
#if defined(__i386__) || defined(__x86_64__)
	.intel_syntax noprefix
#elif defined(__arm__)
#elif defined(__aarch64__)
	.macro	cmov	rd, rn, cc
	csel	\rd, \rn, \rd, \cc
	_(eq) _(ne) _(cs) _(cc) _(vs) _(vc) _(mi) _(pl) \
	_(ge) _(lt) _(gt) _(le) _(hi) _(ls) _(al) _(nv) \
	_(csinc) _(cinc) _(cset) \
	_(csinv) _(cinv) _(csetm)
#define _CONDVAR(cc) _definstvar cc;
#define _INSTVARS(inst) \
	.macro _definstvar cc; \
	.macro inst.\cc args:vararg; inst \args, \cc; .endm; \
#define CCMP_MI CCMP_N
#define CCMP_EQ CCMP_Z
#define CCMP_CS CCMP_C
#define CCMP_HS CCMP_C
#define CCMP_VS CCMP_V
#define CCMP_HI CCMP_C
#define CCMP_LT CCMP_N
#define CCMP_LE CCMP_N
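	// for example, with the machinery above, `cset.ne x1' expands to
	// `cset x1, ne', and `ccmp.hs w16, #16, #CCMP_HS' expands to
	// `ccmp w16, #16, #CCMP_C, hs' (assuming CCMP_C is the nzcv
	// immediate with just the c bit set).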
# error "not supported"
	.size	\name, . - \name
	add	ebx, offset _GLOBAL_OFFSET_TABLE_
	mov	eax, [ebx + stdout@GOT]
#elif defined(__x86_64__)
	mov	rdi, [rip + stdout]
#elif defined(__arm__)
	stmfd	r13!, {r0-r4, r12, r14}
	ldr	r14, .L$_c$gotoff$\@
	.word	_GLOBAL_OFFSET_TABLE_ - .L$_c$gotpc$\@ - 8
	ldmfd	r13!, {r0-r4, r12, r14}
#elif defined(__aarch64__)
	stp	x2, x3, [sp, #16]
	stp	x4, x5, [sp, #32]
	stp	x6, x7, [sp, #48]
	stp	x8, x9, [sp, #64]
	stp	x10, x11, [sp, #80]
	stp	x12, x13, [sp, #96]
	stp	x14, x15, [sp, #112]
	stp	x16, x17, [sp, #128]
	stp	x16, x30, [sp, #144]
	ldr	x0, [x0, #:got_lo12:stdout]
	ldp	x16, x30, [sp, #144]
	ldp	x16, x17, [sp, #128]
	ldp	x14, x15, [sp, #112]
	ldp	x12, x13, [sp, #96]
	ldp	x10, x11, [sp, #80]
	ldp	x8, x9, [sp, #64]
	ldp	x6, x7, [sp, #48]
	ldp	x4, x5, [sp, #32]
	ldp	x2, x3, [sp, #16]
# error "not supported"
#if defined(__i386__) || defined(__x86_64__)
#elif defined(__arm__)
#elif defined(__aarch64__)
# error "not supported"
	.section .note.GNU-stack, "", %progbits
#if defined(__i386__)
#if defined(__i386__)
	push	edi			// edi, esi, ebx
	push	ebp			// flags, ebp, ..., ebx
	push	esi			// regs, flags, ebp, ..., ebx
	lea	eax, [ebx + 9f - .]
	push	eax			// cont, regs, flags, ebp, ..., ebx
	push	edi			// func, cont, regs, flags, ebp, ..., ebx
	ret				// -> func; regs, flags, ebp, ..., ebx
9:	pushf				// eflags, regs, flags, ebp, ..., ebx
	push	esi			// esi, eflags, regs, flags, ebp, ..., ebx
	pop	eax			// eflags, regs, flags, ebp, ..., ebx
	pop	eax			// regs, flags, ebp, ..., ebx
	add	esp, 4			// flags, ebp, ..., ebx
	popf				// ebp, ..., ebx
#elif defined(__x86_64__)
	push	rbp			// flags, rbp, ..., rbx
	push	rsi			// regs, flags, rbp, ..., rbx
	push	rax			// cont, regs, flags, rbp, ..., rbx
	push	rdi			// func, cont, regs, flags, rbp, ..., rbx
	mov	rax, [rsi + 8*15]
	ret				// -> func; regs, flags, rbp, ..., rbx
9:	pushf				// rflags, regs, flags, rbp, ..., rbx
	push	rsi			// rsi, rflags, regs, flags, rbp, ..., rbx
	pop	rax			// rflags, regs, flags, rbp, ..., rbx
	pop	rax			// regs, flags, rbp, ..., rbx
	add	rsp, 8			// flags, rbp, ..., rbx
	popf				// rbp, ..., rbx
#elif defined(__arm__)
	stmfd	r13!, {r0, r1, r4-r11, r14}
	ldmia	r1, {r0-r12, r14}
	ldmfd	r13!, {r4-r11, pc}
#elif defined(__aarch64__)
	stp	x29, x30, [sp, #-14*8]!
	stp	x19, x20, [sp, #16]
	stp	x21, x22, [sp, #32]
	stp	x23, x24, [sp, #48]
	stp	x25, x26, [sp, #64]
	stp	x27, x28, [sp, #80]
	ldp	x29, x30, [x1, #224]
	ldp	x27, x28, [x1, #208]
	ldp	x25, x26, [x1, #192]
	ldp	x23, x24, [x1, #176]
	ldp	x21, x22, [x1, #160]
	ldp	x19, x20, [x1, #144]
	ldp	x16, x17, [x1, #128]
	ldp	x14, x15, [x1, #112]
	ldp	x12, x13, [x1, #96]
	ldp	x10, x11, [x1, #80]
	ldp	x8, x9, [x1, #64]
	ldp	x6, x7, [x1, #48]
	ldp	x4, x5, [x1, #32]
	ldp	x2, x3, [x1, #16]
	stp	x27, x28, [x30, #208]
	stp	x25, x26, [x30, #192]
	stp	x23, x24, [x30, #176]
	stp	x21, x22, [x30, #160]
	stp	x19, x20, [x30, #144]
	stp	x16, x17, [x30, #128]
	stp	x14, x15, [x30, #112]
	stp	x12, x13, [x30, #96]
	stp	x10, x11, [x30, #80]
	stp	x8, x9, [x30, #64]
	stp	x6, x7, [x30, #48]
	stp	x4, x5, [x30, #32]
	stp	x2, x3, [x30, #16]
	stp	x0, x1, [x30, #0]
	stp	x29, x30, [x0, #224]
	ldp	x19, x20, [sp, #16]
	ldp	x21, x22, [sp, #32]
	ldp	x23, x24, [sp, #48]
	ldp	x25, x26, [sp, #64]
	ldp	x27, x28, [sp, #80]
	ldp	x29, x30, [sp], #14*8
# error "not supported"
///--------------------------------------------------------------------------
	// clear all 64 bits of extended traditional registers
#if defined(__x86_64__)
	xor	eax, eax		// clear rax
	lea	rbx, [0]		// rbx -> _|_
	loop	.			// iterate, decrement rcx until zero
	mov	rdx, 0			// set rdx = 0
	and	esi, 0			// clear all bits of rsi
	sub	edi, edi		// set rdi = edi - edi = 0
	pop	rbp			// pop 0 into rbp
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
	// advance a fibonacci pair by c steps
	//
	// on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1}
	// and f_{i+c}, where f_{i+1} = f_i + f_{i-1}
#if defined(__x86_64__)
0:	xadd	rax, rdx		// a, d = a + d, a
					//	= f_{i+1} + f_i, f_{i+1}
					//	= f_{i+2}, f_{i+1}
	loop	0b			// advance i, decrement c, iterate
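	// e.g., starting from a = 1, d = 0 (i.e., f_1 and f_0), three
	// iterations leave a = 3, d = 2 -- that is, f_4 and f_3.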
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
	// boolean canonify a: if a = 0 on entry, leave it zero; otherwise
	// set a = 1
#if defined(__x86_64__)
	neg	rax			// set cf iff a /= 0
	sbb	rax, rax		// a = a - a - cf = -cf
#elif defined(__i386__)
#elif defined(__arm__)
	movs	r1, r0			// the easy way
	movne	r1, #1			// mvnne r1, #1 for mask
	cmp	r0, #1			// clear cf iff a == 0
	sbc	r2, r0, r0		// c' = a - a - 1 + cf = cf - 1
	add	r2, r2, #1		// c' = cf
	sub	r3, r0, r0, lsr #1	// d' top bit clear; d' = 0 iff a = 0
	rsb	r3, r3, #0		// d' top bit set iff a /= 0
	mov	r3, r3, lsr #31		// asr for mask
#elif defined(__aarch64__)
	cmp	x0, #0			// trivial
	cset.ne	x1			// csetm for mask
	cmp	xzr, x0			// set cf iff a == 0
	sbc	x2, x0, x0		// c' = a - a - 1 + cf = cf - 1
	neg	x2, x2			// c' = 1 - cf
	sub	x3, x0, x0, lsr #1	// if a < 2^63 then d' = ceil(a/2) <
					// 2^63, so d' top bit clear;
					// if a >= 2^63, write a = 2^63 + t
					// with t < 2^63; d' = 2^63 - 2^62 +
					// ceil(t/2) = 2^62 + ceil(t/2), and
					// anyway d' < 2^63 and d' = 0 iff
					// a = 0
	neg	x3, x3			// d' top bit set iff a /= 0
	lsr	x3, x3, #63		// asr for mask
	cmp	x0, #1			// set cf iff a /= 0
	adc	x0, xzr, xzr		// a' = 0 + 0 + cf = cf
	// set a = min(a, d) (unsigned); clobber c, d
#if defined(__x86_64__)
	sub	rdx, rax		// d' = d - a; set cf if a > d
	sbb	rcx, rcx		// c = -cf = -[a > d]
	and	rcx, rdx		// c = a > d ? d - a : 0
	add	rax, rcx		// a' = a > d ? d : a
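	// for comparison, the easy way, as on the other architectures
	// (a sketch, assuming fresh inputs in a and d):
	cmp	rax, rdx		// compare a against d
	cmova	rax, rdx		// a' = a > d ? d : a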
#elif defined(__i386__)
#elif defined(__arm__)
	cmp	r0, r3			// the easy way
	movlo	r1, r0			// only needed for out-of-place
#elif defined(__aarch64__)
	cmp	x0, x3			// the easy way
	subs	x3, x3, x0		// d' = d - a; set cf if d >= a
	sbc	x16, xzr, xzr		// t = -1 + cf = -[a > d]
	and	x16, x16, x3		// t = a > d ? d - a : 0
	add	x0, x0, x16		// a' = a > d ? d : a
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
	sub	w16, w16, #'a' - 10
	ccmp.hs	w16, #16, #CCMP_HS
	// answer whether 5 <= a </<= 9.
#if defined(__x86_64__)
	sub	rax, 5			// a' = a - 5
	cmp	rax, 4			// is a' </<= 4?
	// nz/ne	a' /= 4			a /= 9
	// a/nbe	a' > 4			a > 9 or a < 5
	// nc/ae/nb	a' >= 4			a >= 9 or a < 5
	// c/b/nae	a' < 4			5 <= a < 9
	// be/na	a' <= 4			5 <= a <= 9
	//
	// o		a' < -2^63 + 4		-2^63 + 5 <= a < -2^63 + 9
	// no		a' >= -2^63 + 4		a >= -2^63 + 9 or
	// s		-2^63 + 4 <= a' < 4	-2^63 + 9 <= a < 9
	// ns		a' < -2^63 + 4 or	a < -2^63 + 9 or a >= 9
	//
	// ge/nl	a' >= 4			a >= 9 or a < -2^63 + 5
	// l/nge	a' < 4			-2^63 + 5 <= a < 9
	// g/nle	a' > 4			a > 9 or a < -2^63 + 5
	// le/ng	a' <= 4			-2^63 + 5 <= a <= 9
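	// for instance, to realize `5 <= a <= 9' as a boolean after the
	// sub/cmp above (a sketch):
	setbe	al			// al = 1 iff 5 <= a <= 9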
#elif defined(__i386__)
#elif defined(__arm__)
	// i dimly remember having a slick way to do this way back in the
	// day, but i can't figure it out any more.
#elif defined(__aarch64__)
	// literal translation is too obvious
	ccmp.hs	x0, #9, #CCMP_HS
	// leave a unchanged, but set zf if a = 0, cf if a /= 0, clear of,
#if defined(__x86_64__)
	not	rax			// a' = -a - 1
#elif defined(__i386__)
#elif defined(__arm__)
	rsbs	r0, r0, #0		// cf has opposite sense
#elif defined(__aarch64__)
	negs	x0, x0			// cf has opposite sense
	// same as before (?)
#if defined(__x86_64__)
	inc	rax			// a' = a + 1
	neg	rax			// a' = -a - 1
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
	negs	x0, x0			// cf has opposite sense
	// floor((a + d)/2), correctly handling overflow conditions; final cf
	// is lsb(a + d), probably uninteresting
#if defined(__x86_64__)
	add	rax, rdx		// cf || a' = a + d
	rcr	rax, 1			// shift 65-bit result right by one
					// place; lsb moves into carry
#elif defined(__i386__)
#elif defined(__arm__)
	// like the two-instruction a64 version
	add	r1, r0, r1, lsr #1
	// the slick version, similar to the above
#elif defined(__aarch64__)
	// a64 lacks a32's rrx.  literal translation.
	adds	x1, x0, x3		// cf || a' = a + d
	adc	x16, xzr, xzr		// realize cf in extra register
	extr	x1, x16, x1, #1		// shift down one place
	// two instruction version: clobbers additional register.  (if you
	// wanted the answer in any other register, even overwriting d, then
	// this is unnecessary.)  also depends on d >= a.
	sub	x16, x3, x0		// compute difference
	add	x0, x0, x16, lsr #1	// add half of it (rounded down)
	// a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3
	// (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8).
#if defined(__x86_64__)
	shr	rax, 3			// a' = floor(a/8); cf = 1 if a ==
					// 4, 5, 6, 7 (mod 8)
	adc	rax, 0			// a' = floor(a/8) + cf
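	// e.g., a = 12: floor(12/8) = 1 and cf = 1, so a' = 2 = round(12/8)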
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
	orr	x0, xzr, x0, lsr #3
	// increment c-byte little-endian bignum at rdi
#if defined(__x86_64__)
	add	byte ptr [rdi], 1
	adc	byte ptr [rdi], 0
#elif defined(__i386__)
	add	byte ptr [edi], 1
	adc	byte ptr [edi], 0
#elif defined(__arm__)
	mov	r12, #256		// set initial carry
	add	r12, r0, r12, lsr #8
#elif defined(__aarch64__)
	mov	w17, #256		// set initial carry
	add	w17, w16, w17, lsr #8
	// negate double-precision d:a
#if defined(__x86_64__)
	not	rdx			// d' = -d - 1
					// cf = 1 iff a /= 0
	sbb	rdx, -1			// d' = -d - cf
#elif defined(__i386__)
#elif defined(__arm__)
	// reverse subtract is awesome
#elif defined(__aarch64__)
	// easy way: everything is better with zero registers.
	// rotate is distributive over xor.
#if defined(__x86_64__)
	// rax				// = a_1 || a_0
	// rbx				// = b_1 || b_0
	mov	rcx, rax		// = a_1 || a_0
	xor	rcx, rbx		// = (a_1 XOR b_1) || (a_0 XOR b_0)
	ror	rcx, 0xd		// = (a_0 XOR b_0) || (a_1 XOR b_1)
	ror	rax, 0xd		// = a_0 || a_1
	ror	rbx, 0xd		// = b_0 || b_1
	xor	rax, rbx		// = (a_0 XOR b_0) || (a_1 XOR b_1)
	cmp	rax, rcx		// always equal
#elif defined(__i386__)
	mov	ecx, eax		// = a_1 || a_0
	xor	ecx, ebx		// = (a_1 XOR b_1) || (a_0 XOR b_0)
	ror	ecx, 0xd		// = (a_0 XOR b_0) || (a_1 XOR b_1)
	ror	eax, 0xd		// = a_0 || a_1
	ror	ebx, 0xd		// = b_0 || b_1
	xor	eax, ebx		// = (a_0 XOR b_0) || (a_1 XOR b_1)
	cmp	eax, ecx		// always equal
#elif defined(__arm__)
	// r0				// = a_1 || a_0
	// r1				// = b_1 || b_0
	eor	r2, r0, r1		// = (a_1 XOR b_1) || (a_0 XOR b_0)
	mov	r2, r2, ror #13		// = (a_0 XOR b_0) || (a_1 XOR b_1)
	mov	r1, r1, ror #13		// = b_0 || b_1
	eor	r0, r1, r0, ror #13	// = (a_0 XOR b_0) || (a_1 XOR b_1)
	cmp	r0, r2			// always equal
#elif defined(__aarch64__)
	// x0				// = a_1 || a_0
	// x1				// = b_1 || b_0
	eor	x2, x0, x1		// = (a_1 XOR b_1) || (a_0 XOR b_0)
	ror	x2, x2, #13		// = (a_0 XOR b_0) || (a_1 XOR b_1)
	ror	x1, x1, #13		// = b_0 || b_1
	eor	x0, x1, x0, ror #13	// = (a_0 XOR b_0) || (a_1 XOR b_1)
	cmp	x0, x2			// always equal
	// and is distributive over xor.
#if defined(__x86_64__)
	xor	rbx, rcx		// = b XOR c
	and	rbx, rax		// = a AND (b XOR c)
	and	rdx, rax		// = a AND b
	and	rax, rcx		// = a AND c
	xor	rax, rdx		// = (a AND b) XOR (a AND c)
					// = a AND (b XOR c)
	cmp	rax, rbx		// always equal
#elif defined(__i386__)
	xor	ebx, ecx		// = b XOR c
	and	ebx, eax		// = a AND (b XOR c)
	and	edx, eax		// = a AND b
	and	eax, ecx		// = a AND c
	xor	eax, edx		// = (a AND b) XOR (a AND c)
					// = a AND (b XOR c)
	cmp	eax, ebx		// always equal
#elif defined(__arm__)
	and	r3, r0, r1		// = a AND b
	eor	r1, r1, r2		// = b XOR c
	and	r1, r1, r0		// = a AND (b XOR c)
	and	r0, r0, r2		// = a AND c
	eor	r0, r0, r3		// = (a AND b) XOR (a AND c)
					// = a AND (b XOR c)
	cmp	r0, r1			// always equal
#elif defined(__aarch64__)
	and	x3, x0, x1		// = a AND b
	eor	x1, x1, x2		// = b XOR c
	and	x1, x1, x0		// = a AND (b XOR c)
	and	x0, x0, x2		// = a AND c
	eor	x0, x0, x3		// = (a AND b) XOR (a AND c)
					// = a AND (b XOR c)
	cmp	x0, x1			// always equal
#if defined(__x86_64__)
	and	rcx, rbx		// = a AND b
	not	rcx			// = NOT (a AND b)
	or	rax, rbx		// = (NOT a) OR (NOT b)
	cmp	rax, rcx		// always equal
#elif defined(__i386__)
	and	ecx, ebx		// = a AND b
	not	ecx			// = NOT (a AND b)
	or	eax, ebx		// = (NOT a) OR (NOT b)
	cmp	eax, ecx		// always equal
#elif defined(__arm__)
	and	r2, r0, r1		// = a AND b
	mvn	r2, r2			// = NOT (a AND b)
	mvn	r0, r0			// = NOT a
	mvn	r1, r1			// = NOT b
	orr	r0, r0, r1		// = (NOT a) OR (NOT b)
	cmp	r0, r2			// always equal
#elif defined(__aarch64__)
	and	x2, x0, x1		// = a AND b
	mvn	x2, x2			// = NOT (a AND b)
	mvn	x0, x0			// = NOT a
	orn	x0, x0, x1		// = (NOT a) OR (NOT b)
	cmp	x0, x2			// always equal
	// replace input buffer bytes with cumulative XORs with initial a;
	// final a is XOR of all buffer bytes and initial a.
	//
	// not sure why you'd do this.
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
///--------------------------------------------------------------------------
	// four different ways to swap a pair of registers.
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
	stmfd	r13!, {r0, r2}
	rsb	r0, r0, r2		// don't need 3-addr with reverse-sub
#elif defined(__aarch64__)
	// anything you can do
	stp	x0, x2, [sp, #-16]!
	ldp	x2, x0, [sp], #16
	// the add/sub/add thing was daft.  you can do it in three if you're
	// clever -- and have three-address operations.
	// but we lack a fourth.  we can't do this in fewer than three
	// instructions without hitting memory.  only `ldp' will modify two
	// registers at a time, so we need at least two instructions -- but
	// if the first one sets one of our two registers to its final value
	// then we lose the other input value with no way to recover it, so
	// we must either write a fresh third register, or write something
	// other than the final value, and in both cases we need a third
	// instruction to fix everything up.  we've done the wrong-something-
	// other trick twice, so here's the captain-obvious use-a-third-
	// register version.
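	// (a sketch of that obvious version:)
	mov	x16, x0			// t = a
	mov	x0, x2			// a' = c
	mov	x2, x16			// c' = old a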
	// assuming a is initialized to zero, set a to the inclusive or of
	// the xor-differences of corresponding bytes in the c-byte strings
	//
	// in particular, a will be zero (and zf set) if and only if the two
	// strings are equal.
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
0:	ldrb	r1, [r4], #1
#elif defined(__aarch64__)
0:	ldrb	w16, [x4], #1
	// an obtuse way of adding two registers.  for any bit position, a
	// OR d is set if and only if at least one of a and d has a bit set
	// in that position, and a AND d is set if and only if both have a
	// bit set in that position.  essentially, then, what we've done is
	// move all of the set bits in d to a, unless there's already a bit
	// there.  this clearly doesn't change the sum.
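	// (in other words, a + d = (a OR d) + (a AND d).)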
#if defined(__x86_64__)
	mov	rcx, rdx		// c' = d
	and	rdx, rax		// d' = a AND d
	or	rax, rcx		// a' = a OR d
#elif defined(__i386__)
	mov	ecx, edx		// c' = d
	and	edx, eax		// d' = a AND d
	or	eax, ecx		// a' = a OR d
#elif defined(__arm__)
	and	r2, r0, r3		// c' = a AND d
	orr	r0, r0, r3		// a' = a OR d
#elif defined(__aarch64__)
	and	x2, x0, x3		// c' = a AND d
	orr	x0, x0, x3		// a' = a OR d
	// ok, so this is a really obtuse way of adding a and b; the result
	// is in a and d.  but why does it work?
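	// (because a XOR b is the bitwise sum ignoring carries, and a AND b
	// collects the carries: (a XOR b) + 2 (a AND b) = a + b.  each pass
	// below therefore preserves the sum while pushing the carries left;
	// after at most 64 passes none remain.)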
#if defined(__x86_64__)
	mov	rcx, 0x40		// carry chains at most 64 long
0:	mov	rdx, rax		// copy a'
	xor	rax, rbx		// low bits of each bitwise sum
	and	rbx, rdx		// carry bits from each bitwise sum
	shl	rbx, 1			// carry them into next position
#elif defined(__i386__)
	mov	ecx, 0x40		// carry chains at most 64 long
0:	mov	edx, eax		// copy a'
	xor	eax, ebx		// low bits of each bitwise sum
	and	ebx, edx		// carry bits from each bitwise sum
	shl	ebx, 1			// carry them into next position
#elif defined(__arm__)
#elif defined(__aarch64__)
	// floor((a + d)/2), like x08.
#if defined(__x86_64__)
	mov	rcx, rax		// copy a for later
	and	rcx, rdx		// carry bits
	xor	rax, rdx		// low bits of each bitwise sum
	shr	rax, 1			// divide by 2; carries now in place
	add	rax, rcx		// add the carries; done
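	// (works because a + d = (a XOR d) + 2 (a AND d); halving the xor
	// part before adding keeps everything in range.)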
#elif defined(__i386__)
	mov	ecx, eax		// copy a for later
	and	ecx, edx		// carry bits
	xor	eax, edx		// low bits of each bitwise sum
	shr	eax, 1			// divide by 2; carries now in place
	add	eax, ecx		// add the carries; done
#elif defined(__arm__)
	add	r0, r2, r0, lsr #1
#elif defined(__aarch64__)
	add	x0, x2, x0, lsr #1
	// sign extension 32 -> 64 bits.
#if defined(__x86_64__)
	movsx	rbx, eax		// like this?
	mov	rdx, 0xffffffff80000000
	add	rax, rdx		// if bit 31 of a is set then bits
					// 31--63 of a' are clear; otherwise,
					// these bits are all set -- which is
					// exactly backwards
	xor	rax, rdx		// so fix it
#elif defined(__i386__)
	movsx	ebx, ax			// like this?
	add	eax, edx		// if bit 15 of a is set then bits
					// 15--31 of a' are clear; otherwise,
					// these bits are all set -- which is
					// exactly backwards
	xor	eax, edx		// so fix it
#elif defined(__arm__)
	sxth	r1, r0			// like this
	mov	r12, #0x80000000
	add	r0, r0, r12, asr #16
	eor	r0, r0, r12, asr #16
#elif defined(__aarch64__)
	sxtw	x1, w0			// like this
	mov	x16, #0xffffffff80000000
	// ??? i don't know why you'd want to calculate this.
#if defined(__x86_64__)
	xor	rax, rbx		// a' = a XOR b
	xor	rbx, rcx		// b' = b XOR c
	mov	rsi, rax		// t = a XOR b
	add	rsi, rbx		// t = (a XOR b) + (b XOR c)
	cmovc	rax, rbx		// a' = cf ? b XOR c : a XOR b
	xor	rax, rbx		// a' = cf ? 0 : a XOR c
#elif defined(__i386__)
	xor	eax, ebx		// a' = a XOR b
	xor	ebx, ecx		// b' = b XOR c
	mov	esi, eax		// t = a XOR b
	add	esi, ebx		// t = (a XOR b) + (b XOR c)
	cmovc	eax, ebx		// a' = cf ? b XOR c : a XOR b
	xor	eax, ebx		// a' = cf ? 0 : a XOR c
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
	cqo				// d = a < 0 ? -1 : 0
	xor	rax, rdx		// a' = a < 0 ? -a - 1 : a
	sub	rax, rdx		// a' = a < 0 ? -a : a
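	// (for a < 0: d = -1, so a XOR d = -a - 1, and subtracting d = -1
	// adds the missing 1.  for a >= 0 both steps are no-ops.)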
#elif defined(__i386__)
	cdq				// d = a < 0 ? -1 : 0
	xor	eax, edx		// a' = a < 0 ? -a - 1 : a
	sub	eax, edx		// a' = a < 0 ? -a : a
#elif defined(__arm__)
	// faithful-ish conversion
	eor	r3, r0, r0, asr #31
	sub	r0, r3, r0, asr #31
#elif defined(__aarch64__)
	// faithful-ish conversion
	eor	x3, x0, x0, asr #63
	sub	x0, x3, x0, asr #63
	// should always set sf, clear zf, unless we get rescheduled to a
	// different cpu between the two reads
#if defined(__x86_64__)
	rdtsc				// d || a = cycles
	or	rax, rdx		// a = cycles
	mov	rcx, rax		// c = cycles
	rdtsc				// d || a = cycles'
	or	rax, rdx		// a = cycles'
#elif defined(__i386__)
	rdtsc				// d || a = cycles
	mov	ecx, edx		// c || b = cycles
	rdtsc				// d || a = cycles'
#elif defined(__arm__)
	// cycle clock not available in user mode
	mrrc	p15, 0, r0, r1, c9
	mrrc	p15, 0, r2, r3, c9
#elif defined(__aarch64__)
	// cycle clock not available in user mode
	// stupid way to capture a pointer to inline data and jump past it.
	// confuses the return-address predictor something chronic.  worse
	// because amd64 calling convention doesn't usually pass arguments on
	// the stack.
#if defined(__x86_64__)
	.string	"hello world!\n\0"
	// actually implement this ridiculous thing
0:	mov	al, [rsi + rdx]
	syscall				// clobbers r11 :-(
#elif defined(__i386__)
	.string	"hello world!\n\0"
	// actually implement this ridiculous thing
0:	mov	al, [ecx + edx]
#elif defined(__arm__)
	// why am i doing this?
	.string	"hello world!\n\0"
8:	mov	r1, r14			// might as well make it easy on myself
0:	ldrb	r0, [r1, r2]
#elif defined(__aarch64__)
	// why am i doing this?
	str	x30, [sp, #-16]!
	.string	"hello world!\n\0"
8:	mov	x1, x30			// might as well make it easy on myself
0:	ldrb	w0, [x1, x2]
	// collect the current instruction-pointer address.  this was an old
	// 32-bit i386 trick for position-independent code, but (a) it
	// confuses the return predictor, and (b) amd64 has true pc-relative
	// addressing.
#if defined(__x86_64__)
	// the actual example
	// the modern i386 trick doesn't confuse the return-address
	// predictor.
	// but rip-relative addressing is even better
#elif defined(__i386__)
	// the actual example
	// the modern i386 trick doesn't confuse the return-address
	// predictor.
#elif defined(__arm__)
	sub	r1, r14, #. - 0b
#elif defined(__aarch64__)
	str	x30, [sp, #-16]!
	// we can do all of the above using a64
	sub	x1, x30, #. - 0b
#if defined(__x86_64__)
	// retpolines: a mitigation against adversarially influenced
	// speculative execution at indirect branches.  if an adversary can
	// prepare a branch-target buffer entry matching an indirect branch
	// in the victim's address space then they can cause the victim to
	// /speculatively/ (but not architecturally) execute any code in
	// their address space, possibly leading to leaking secrets through
	// the cache.  retpolines aren't susceptible to this because the
	// predicted destination address is from the return-prediction stack
	// which the adversary can't prime.  the performance penalty is still
	// essentially a branch misprediction -- for this return, and
	// possibly all others already stacked.
	// (try not to crash)
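	// a minimal sketch of the pattern (assuming the branch target is
	// already in rax):
	call	8f			// push address of the capture loop
0:	pause				// speculation is parked here...
	lfence				// ... harmlessly
	jmp	0b
8:	mov	[rsp], rax		// overwrite return address with the
					// real target
	ret				// 'return' to it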
#elif defined(__i386__)
	lea	eax, [ebx + 9f - .]
#elif defined(__arm__)
#elif defined(__aarch64__)
	str	x30, [sp, #-16]!
8:	ldr	x30, [sp], #16
	// ok, having a hard time seeing a use for this.  the most important
	// thing to note is that sp is set from `pop' /after/ it's been
	// incremented.
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
	// not even going to dignify this
#elif defined(__aarch64__)
	// not even going to dignify this
	// monumentally cheesy way to copy 8 n bytes from buff1 to buff2.
	// also clobbers words at buff2 + 8 n and buff2 - 8 for good measure.
#if defined(__x86_64__)
	mov	rax, rsp		// safekeeping
	// we're toast if we get hit by a signal now.  fingers crossed...
	mov	rsp, offset buff2 + 8*n + 8
	mov	rbp, offset buff1 + 8*n
	lea	rsp, [rdi + 8*n + 16]
	lea	rbp, [rsi + 8*n]
	//		+---------+		+---------+
	//	rbp ->	|   ???	  |	rsp ->	|   ???	  |
	//		+---------+		+---------+
	//		| w_{n-1} |		|   rbp	  | <- rbp'
	//		+---------+		+---------+
	//		|   ...	  |		| w_{n-1} |
	//		+---------+		+---------+
	//		+---------+		+---------+
	//		+---------+		+---------+
#elif defined(__i386__)
	mov	eax, esp		// safekeeping
	// we're toast if we get hit by a signal now.  fingers crossed...
	mov	esp, offset buff2 + 4*n + 4
	mov	ebp, offset buff1 + 4*n
	lea	esp, [edi + 4*n + 8]
	lea	ebp, [esi + 4*n]
#elif defined(__arm__)
	add	r5, r5, #4*n + 8
	ldrd	r0, r1, [r4, #-8]!
	strd	r0, r1, [r5, #-8]!
#elif defined(__aarch64__)
	// omgwtf.  let's not actually screw with the stack pointer.
	add	x5, x5, #8*n + 16
	ldp	x16, x17, [x4, #-16]!
	stp	x16, x17, [x5, #-16]!
	// convert nibble value to (uppercase) hex; other input values yield
	// garbage.
#if defined(__x86_64__)
	// das doesn't work in 64-bit mode; best i can come up with
#elif defined(__i386__)
	cmp	al, 0x0a		// cf = 1 iff a < 10
	sbb	al, 0x69		// if 0 <= a < 10, a' = a - 0x6a, so
					// 0x96 <= a' < 0xa0, setting af, cf
					// if 10 <= a < 16, a' = a - 0x69, so
					// 0xa1 <= a' < 0xa7, setting cf but
					// not af
	das				// if 0 <= a < 10, then af and cf are
					// both set, so subtract 0x66
					// from a' leaving 0x30 <= a' < 0x3a;
					// if 10 <= a < 16 then af clear but
					// cf set, so subtract 0x60 from a'
					// leaving 0x41 <= a' < 0x47
#elif defined(__arm__)
	// significantly less tricksy
	addhs	r0, r0, #'A' - 10
#elif defined(__aarch64__)
	// with less versatile conditional execution this is the best we can
	// do.
	add	w16, w0, #'A' - 10
	// verify collatz conjecture starting at a; assume a /= 0!
#if defined(__x86_64__)
0:	bsf	rcx, rax		// clobber c if a = 0
	shr	rax, cl			// a = 2^c a'
	lea	rax, [2*rax + rax + 1]	// a' = 3 a' + 1
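	// e.g., a = 12: stripping the twos gives 3; then 3 -> 10 -> 5 ->
	// 16 -> 1.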
#elif defined(__i386__)
0:	bsf	ecx, eax		// clobber c if a = 0
	shr	eax, cl			// a = 2^c a'
	lea	eax, [2*eax + eax + 1]	// a' = 3 a' + 1
#elif defined(__arm__)
	// rbit introduced in armv7
	mov	r0, r0, lsr r2		// a = 2^c a'
	adcne	r0, r0, r0, lsl #1	// a' = 3 a' + 1 (because c set)
#elif defined(__aarch64__)
	lsr	w0, w0, w2		// a = 2^c a'
	add	w16, w0, w0, lsl #1	// t = 3 a' + 1 (because c set)
	csinc.eq w0, w0, w16
///--------------------------------------------------------------------------
	// calculate 1337 a slowly
#if defined(__x86_64__)
	mov	rcx, rax		// c = a
	shl	rcx, 2			// c = 4 a
	add	rcx, rax		// c = 5 a
	shl	rcx, 3			// c = 40 a
	add	rcx, rax		// c = 41 a
	shl	rcx, 1			// c = 82 a
	add	rcx, rax		// c = 83 a
	shl	rcx, 1			// c = 166 a
	add	rcx, rax		// c = 167 a
	shl	rcx, 3			// c = 1336 a
	add	rcx, rax		// c = 1337 a
	lea	rdx, [2*rax + rax]	// t = 3 a
	shl	rdx, 6			// t = 192 a
	sub	rdx, rax		// t = 191 a
	lea	rbx, [8*rdx]		// b = 1528 a
	sub	rbx, rdx		// b = 1337 a
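	// (the first chain works because
	// 1337 = ((((1*4 + 1)*8 + 1)*2 + 1)*2 + 1)*8 + 1; the second
	// because 1337 = 8*191 - 191 = 7*191 and 191 = 3*64 - 1.)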
#elif defined(__i386__)
	mov	ecx, eax		// c = a
	shl	ecx, 2			// c = 4 a
	add	ecx, eax		// c = 5 a
	shl	ecx, 3			// c = 40 a
	add	ecx, eax		// c = 41 a
	shl	ecx, 1			// c = 82 a
	add	ecx, eax		// c = 83 a
	shl	ecx, 1			// c = 166 a
	add	ecx, eax		// c = 167 a
	shl	ecx, 3			// c = 1336 a
	add	ecx, eax		// c = 1337 a
	lea	edx, [2*eax + eax]	// t = 3 a
	shl	edx, 6			// t = 192 a
	sub	edx, eax		// t = 191 a
	lea	ebx, [8*edx]		// b = 1528 a
	sub	ebx, edx		// b = 1337 a
#elif defined(__arm__)
	// original version, ish
	add	r2, r0, r0, lsl #2	// c = 5 a
	add	r2, r0, r2, lsl #3	// c = 41 a
	add	r2, r0, r2, lsl #1	// c = 83 a
	add	r2, r0, r2, lsl #1	// c = 167 a
	add	r2, r0, r2, lsl #3	// c = 1337 a
	add	r1, r0, r0, lsl #1	// b = 3 a
	rsb	r1, r0, r1, lsl #6	// b = 191 a
	rsb	r1, r1, r1, lsl #3	// b = 1337 a
#elif defined(__aarch64__)
	// original version, ish
	add	x2, x0, x0, lsl #2	// c = 5 a
	add	x2, x0, x2, lsl #3	// c = 41 a
	add	x2, x0, x2, lsl #1	// c = 83 a
	add	x2, x0, x2, lsl #1	// c = 167 a
	add	x2, x0, x2, lsl #3	// c = 1337 a
	// sleazy because no rsb
	add	x1, x0, x0, lsl #1	// b = 3 a
	sub	x1, x0, x1, lsl #6	// b = -191 a
	sub	x1, x1, x1, lsl #3	// b = 1337 a
	// multiply complex numbers a + b i and c + d i
	//
	// (a + b i) (c + d i) = (a c - b d) + (a d + b c) i
	//
	// somewhat slick approach uses only three multiplications
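	// (specifically: let t = c (a + b); then t - b (c + d) = a c - b d
	// gives the real part, and t + a (d - c) = a d + b c the imaginary
	// part.)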
#if defined(__x86_64__)
	mov	rsi, rax		// t = a
	add	rax, rbx		// a' = a + b
	mov	rdi, rdx		// u = d
	sub	rdx, rcx		// d' = d - c
	add	rdi, rcx		// u = c + d
	imul	rax, rcx		// a' = c (a + b)
	imul	rsi, rdx		// t = a (d - c)
	imul	rdi, rbx		// u = b (c + d)
	add	rsi, rax		// t = a (d - c) + c (a + b)
	mov	rbx, rsi		// b' = a (d - c) + c (a + b)
	sub	rax, rdi		// a' = c (a + b) - b (c + d)
#elif defined(__i386__)
	mov	esi, eax		// t = a
	add	eax, ebx		// a' = a + b
	mov	edi, edx		// u = d
	sub	edx, ecx		// d' = d - c
	add	edi, ecx		// u = c + d
	imul	eax, ecx		// a' = c (a + b)
	imul	esi, edx		// t = a (d - c)
	imul	edi, ebx		// u = b (c + d)
	add	esi, eax		// t = a (d - c) + c (a + b)
	mov	ebx, esi		// b' = a (d - c) + c (a + b)
	sub	eax, edi		// a' = c (a + b) - b (c + d)
#elif defined(__arm__)
	add	r4, r0, r1		// t = a + b
	add	r5, r2, r3		// u = c + d
	sub	r3, r3, r2		// d' = d - c
	// mls introduced in armv7
	mul	r4, r4, r2		// t = c (a + b)
	mov	r2, r1			// c' = b (bah!)
	mla	r1, r0, r3, r4		// b' = a (d - c) + c (a + b)
	mls	r0, r2, r5, r4		// a' = c (a + b) - b (c + d)
#elif defined(__aarch64__)
	add	x4, x0, x1		// t = a + b
	add	x5, x2, x3		// u = c + d
	sub	x3, x3, x2		// d' = d - c
	// a64 spells mla and mls as madd and msub
	mul	x4, x4, x2		// t = c (a + b)
	mov	x2, x1			// c' = b (bah!)
	madd	x1, x0, x3, x4		// b' = a (d - c) + c (a + b)
	msub	x0, x2, x5, x4		// a' = c (a + b) - b (c + d)
#if defined(__x86_64__)
	mov	rdx, 0xaaaaaaaaaaaaaaab	// = ceil(2/3 2^64)
	mul	rdx			// d' || a' =~ 2/3 a 2^64
	shr	rdx, 1			// d' = floor(a/3)
	mov	rax, rdx		// a' = floor(a/3)
	// we start with 0 <= a < 2^64.  write f = ceil(2/3 2^64), so that
	// 3 f = 2^65 + 1, i.e., f/2^64 = 2/3 + 1/(3 2^64).  then
	// a f/2^64 = 2 a/3 + a/(3 2^64) < 2 a/3 + 1/3; since the fractional
	// part of 2 a/3 is at most 2/3, adding less than 1/3 never carries
	// into the integer part, so floor(a f/2^64) = floor(2/3 a).
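	// e.g., a = 9: a f = 3 (2^65 + 1) = 6 2^64 + 3, so d' = 6 before
	// the shift and a' = 3 afterwards.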
#elif defined(__i386__)
	mov	edx, 0xaaaaaaab		// = ceil(2/3 2^32)
	mul	edx			// d' || a' =~ 2/3 a 2^32
	shr	edx, 1			// d' = floor(a/3)
	mov	eax, edx		// a' = floor(a/3)
#elif defined(__arm__)
	ldr	r12, =0xaaaaaaab
	umull	r12, r0, r0, r12
#elif defined(__aarch64__)
	ldr	x16, =0xaaaaaaaaaaaaaaab
#if defined(__x86_64__)
	// main loop: shorten a preserving residue class mod 3
	mov	rdx, rax		// d' = a
	shr	rdx, 2			// d' = floor(a/4)
	and	rax, 3			// a = 4 d' + a' (0 <= a' < 4)
	add	rax, rdx		// a' == a (mod 3) but a' < a/4 + 4
	// fix up final value 0 <= a < 6: want 0 <= a < 3
	//
	// the tricky part is actually a = 3; but the other final cases take
	// additional iterations which we can avoid.
8:	cmp	rax, 3			// set cf iff a < 3
	cmc				// set cf iff a >= 3
	sbb	rdx, rdx		// d' = a >= 3 ? -1 : 0
	and	rdx, 3			// d' = a >= 3 ? 3 : 0
	sub	rax, rdx		// a' = a - (a >= 3 ? 3 : 0)
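	// e.g., a = 11 = 4*2 + 3: one pass gives a' = 2 + 3 = 5; the fixup
	// then subtracts 3, leaving 2 = 11 mod 3.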
#elif defined(__i386__)
	// main loop: shorten a preserving residue class mod 3
	mov	edx, eax		// d' = a
	shr	edx, 2			// d' = floor(a/4)
	and	eax, 3			// a = 4 d' + a' (0 <= a' < 4)
	add	eax, edx		// a' == a (mod 3) but a' < a/4 + 4
	// fix up final value 0 <= a < 6: want 0 <= a < 3
	//
	// the tricky part is actually a = 3; but the other final cases take
	// additional iterations which we can avoid.
8:	cmp	eax, 3			// set cf iff a < 3
	cmc				// set cf iff a >= 3
	sbb	edx, edx		// d' = a >= 3 ? -1 : 0
	and	edx, 3			// d' = a >= 3 ? 3 : 0
	sub	eax, edx		// a' = a - (a >= 3 ? 3 : 0)
#elif defined(__arm__)
	addhs	r0, r12, r0, lsr #2
#elif defined(__aarch64__)
	// blunder on through regardless since this doesn't affect the result
	add	x0, x16, x0, lsr #2
	// invert (odd) a mod 2^64
	//
	// suppose a a_i == 1 (mod 2^{2^i})
	//
	// clearly good for i = 0, since 2^i = 1 and 2^{2^i} = 2, and a_0 =
	// a == 1 (mod 2) by assumption
	//
	// write a a_i == b_i 2^{2^i} + 1 (mod 2^{2^{i+1}})
	// then b_i == (a a_i - 1)/2^{2^i} (mod 2^{2^i})
	// to lift inverse, we want x such that a x == -b_i (mod 2^{2^i});
	// clearly x = -a_i b_i will do, since a a_i == 1 (mod 2^{2^i})
	//
	// a_{i+1} = a_i - a_i b_i 2^{2^i} = a_i (1 - (a a_i - 1))
	//	= 2 a_i - a a_i^2
	//
	// check:
	// a a_{i+1} = 2 a a_i - a^2 a_i^2
	//	== 2 a a_i - (b_i 2^{2^i} + 1)^2
	//	== 2 (b_i 2^{2^i} + 1) -
	//		(b_i^2 2^{2^{i+1}} + 2 b_i 2^{2^i} + 1)
	//	== 1 (mod 2^{2^{i+1}})
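	// e.g., a = 3: take a_0 = 3, so a a_0 = 9 == 1 (mod 8); one step
	// gives a_1 = 2 a_0 - a a_0^2 = 6 - 27 = -21, and indeed
	// a a_1 = -63 == 1 (mod 64).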
#if defined(__x86_64__)
	mov	rbx, rax		// b' = a
	mov	rsi, rax		// t = a_0
	mul	rbx			// a' = a a_i
	mov	rcx, rax		// c = a a_i
	sub	rax, 2			// a' = a a_i - 2
	neg	rax			// a' = 2 - a a_i
	mul	rsi			// a_{i+1} = a_i (2 - a a_i)
					//	= 2 a_i - a a_i^2
	mov	rsi, rax		// t = a_{i+1}
	ja	0b			// no -- iterate
#elif defined(__i386__)
	mov	ebx, eax		// b' = a
	mov	esi, eax		// t = a_0
	mul	ebx			// a' = a a_i
	mov	ecx, eax		// c = a a_i
	sub	eax, 2			// a' = a a_i - 2
	jb	9f			// done if < 2
	neg	eax			// a' = 2 - a a_i
	mul	esi			// a_{i+1} = a_i (2 - a a_i)
					//	= 2 a_i - a a_i^2
	mov	esi, eax		// t = a_{i+1}
	jmp	0b			// and iterate
9:	mov	eax, esi		// restore
#elif defined(__arm__)
	mov	r1, r0			// b' = a
	mul	r2, r0, r1		// c = a a_i
	rsbs	r2, r2, #2		// c = 2 - a a_i
	mul	r0, r0, r2		// a_{i+1} = a_i (2 - a a_i)
					//	= 2 a_i - a a_i^2
#elif defined(__aarch64__)
	mov	x1, x0			// b' = a
	mov	x16, #2			// because we have no rsb
	mul	x2, x0, x1		// c = a a_i
	subs	x2, x16, x2		// c = 2 - a a_i
	mul	x0, x0, x2		// a_{i+1} = a_i (2 - a a_i)
					//	= 2 a_i - a a_i^2
	// a poor approximation to pi/4
	//
	// think of x and y as being in 16.16 fixed-point format.  we sample
	// points in the unit square, and determine how many of them are
	// within a unit quarter-circle centred at the origin.  the area of
	// the quarter-circle is pi/4.
#if defined(__x86_64__)
	xor	eax, eax		// a = 0
	shl	rcx, 0x20		// c =~ 4 billion
0:	movzx	rbx, cx			// x = low 16 bits of c
	imul	rbx, rbx		// b = x^2
	ror	rcx, 0x10		// switch halves of c
	movzx	rdx, cx			// y = high 16 bits of c
	imul	rdx, rdx		// d = y^2
	rol	rcx, 0x10		// switch back
	add	rbx, rdx		// r^2 = x^2 + y^2
	shr	rbx, 0x20		// r^2 >= 1?
	cmp	rbx, 1			// set cf iff r^2 < 1
	adc	rax, 0			// and add onto accumulator
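	// (once all 2^32 sample points have been counted, a/2^32 =~ pi/4
	// = 0.785...)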
#elif defined(__i386__)
	// this is actually better done in 32 bits.  the carry has the wrong
	// sense here, so instead deduct one for each point outside the
	// quarter-circle rather than adding one for each point inside it.
	add	ebx, edx		// see?
#elif defined(__arm__)
0:	uxth	r1, r2, ror #0
	uxth	r3, r2, ror #16
	cmn	r1, r3			// mlas doesn't set cf usefully
#elif defined(__aarch64__)
0:	ubfx	w1, w2, #0, #16
	ubfx	w3, w2, #16, #16
	// a bad way to rotate a right by 7 places
#if defined(__x86_64__)
	ror	rbx, 7			// better
	mov	rdx, rax		// d' = a
	shr	rax, 7			// a' = a >> 7
	shl	rdx, 0x39		// d' = a << 57
	or	rax, rdx		// a' = a >>> 7
#elif defined(__i386__)
	ror	ebx, 7			// better
	mov	edx, eax		// d' = a
	shr	eax, 7			// a' = a >> 7
	shl	edx, 0x19		// d' = a << 25
	or	eax, edx		// a' = a >>> 7
#elif defined(__arm__)
	mov	r1, r0, ror #7		// easy way
	// even the hard way is fairly easy on arm
	orr	r0, r3, r0, lsr #7	// hard way
#elif defined(__aarch64__)
	ror	x1, x0, #7		// easy way
	// even the hard way is fairly easy on arm
	orr	x0, x3, x0, lsr #7	// hard way
	// shift a right by c places, in two halves
#if defined(__x86_64__)
	mov	ch, cl			// c' = [c, c]
	inc	ch			// c' = [c, c + 1]
	shr	cl, 1			// c' = [floor(c/2), ceil(c/2)]
#elif defined(__i386__)
	mov	ch, cl			// c' = [c, c]
	inc	ch			// c' = [c, c + 1]
	shr	cl, 1			// c' = [floor(c/2), ceil(c/2)]
#elif defined(__arm__)
	// it would be clearer and more efficient to say: `mov r12, r2, lsr
	// #1; sub r2, r2, r12', but that's not the lesson this exercise is
	// teaching.
	mov	r12, r12, lsr #1
#elif defined(__aarch64__)
	// divide c-byte little-endian bignum at rsi by 2 (rounding down)
#if defined(__x86_64__)
0:	rcr	byte ptr [rsi], 1
#elif defined(__i386__)
0:	rcr	byte ptr [esi], 1
#elif defined(__arm__)
	// we could hack this a word at a time using rrx
	orr	r3, r3, r12, lsr #1
#elif defined(__aarch64__)
	orr	w16, w16, w17, lsr #1
	// fill a buffer with a 3-byte pattern
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
	ldrhsb	r12, [r4], #1
	strhsb	r12, [r5], #1
#elif defined(__aarch64__)
	// rotate the words in a buffer, so that the last word comes first,
	// the first comes second, and so on.  this isn't a good way to do
	// it.
#if defined(__x86_64__)
	mov	rsi, rbx		// set string pointers
0:	lodsq				// fetch next word
	xchg	rax, qword ptr [rbx]	// stash it for next iteration and
					// replace it with the previously
					// stashed word
	stosq				// store in output
	// (note that the first iteration doesn't actually do anything)
	loop	0b			// continue until all done
#elif defined(__i386__)
	mov	esi, ebx		// set string pointers
0:	lodsd				// fetch next word
	xchg	eax, dword ptr [ebx]	// stash it for next iteration and
					// replace it with the previously
					// stashed word
	stosd				// store in output
	loop	0b			// continue until all done
#elif defined(__arm__)
	// let's do this a sensible way.  (we could go faster using ldm/stm.)
	add	r0, r1, r2, lsl #2	// find the end of the buffer
	ldr	r0, [r0, #-4]		// collect final element
#elif defined(__aarch64__)
	add	x0, x1, x2, lsl #3	// find the end of the buffer
	ldr	x0, [x0, #-8]		// collect final element
	// find a cycle in a function f: B -> B, where B = {0, 1, ..., 255}
#if defined(__x86_64__)
	// this is floyd's cycle-finding algorithm.
	//
	// consider the sequence s_0 = 0, s_1 = f(0), s_2 = f(f(0)), ...,
	// s_{i+1} = f(s_i).  since B is finite, there must be some smallest
	// t and c such that s_t = s_{t+c}; then we have s_i = s_j iff
	// i >= t, j >= t, and i == j (mod c).
	//
	// the algorithm sets two cursors advancing through the sequence: a
	// /tortoise/ which advances one step at a time, and a /hare/ which
	// advances by two, so when the tortoise is at element s_i, the hare
	// is at s_{2i}.  the hare will run around the cycle and catch the
	// tortoise when i >= t and i == 2 i (mod c); the latter is simply i
	// == 0 (mod c), which therefore happens first when i = k = t +
	// (-t mod c), i.e., t rounded up to the next multiple of c.
	//
	// i'm not sure what good xlatb does here that mov al, [rbx + al]
	// wouldn't.
	xor	eax, eax		// tortoise starts at 0
	xor	edx, edx		// hare starts at 0
0:	xlatb				// advance tortoise
	xchg	rax, rdx		// switch to hare
	xlatb				// advance hare ...
	xchg	rax, rdx		// switch back
	cmp	al, dl			// hare caught the tortoise?
	jnz	0b			// no -- go around again
	// now we trace the initial tail: reset the tortoise to s_0, and slow
	// the hare down so that both take only a single step in each
	// iteration.  this loop terminates when i >= t and i == i + 2 k
	// (mod c).  we know k is a multiple of c, so the latter condition
	// always holds, so this finds the first step of the cycle.
	xor	eax, eax		// reset the tortoise
0:	xlatb				// advance tortoise
	xchg	rax, rdx		// switch to hare
	xlatb				// advance hare
	xchg	rax, rdx		// and switch back
	jnz	0b			// no -- iterate
#elif defined(__i386__)
	xor	eax, eax		// tortoise starts at 0
	xor	edx, edx		// hare starts at 0
0:	xlatb				// advance tortoise
	xchg	eax, edx		// switch to hare
	xlatb				// advance hare ...
	xchg	eax, edx		// switch back
	cmp	al, dl			// hare caught the tortoise?
	jnz	0b			// no -- go around again
	xor	eax, eax		// reset the tortoise
0:	xlatb				// advance tortoise
	xchg	eax, edx		// switch to hare
	xlatb				// advance hare
	xchg	eax, edx		// and switch back
	jnz	0b			// no -- iterate
#elif defined(__arm__)
0:	ldrb	r0, [r1, r0]
0:	ldrb	r0, [r1, r0]
#elif defined(__aarch64__)
0:	ldrb	w0, [x1, x0]
0:	ldrb	w0, [x1, x0]
	// a convoluted way to set rax = rsi
#if defined(__x86_64__)
	mov	qword ptr [rbx + 8*rcx], 0 // b[c] = 0
	mov	qword ptr [rbx + 8*rdx], 1 // b[d] = 1
	mov	rax, [rbx + 8*rcx]	// a' = b[c] = 0
	mov	[rbx], rsi		// b[0] = t
	mov	[rbx + 8], rdi		// b[1] = u
	mov	rax, [rbx + 8*rax]	// a' = b[a'] = b[0] = t
#elif defined(__i386__)
	mov	dword ptr [ebx + 8*ecx], 0 // b[c] = 0
	mov	dword ptr [ebx + 8*edx], 1 // b[d] = 1
	mov	eax, [ebx + 8*ecx]	// a' = b[c] = 0
	mov	[ebx], esi		// b[0] = t
	mov	[ebx + 8], edi		// b[1] = u
	mov	eax, [ebx + 8*eax]	// a' = b[a'] = b[0] = t
#elif defined(__arm__)
	str	r0, [r1, r2, lsl #2]
	str	r12, [r1, r3, lsl #2]
	ldr	r0, [r1, r2, lsl #2]
	ldr	r0, [r1, r0, lsl #2]
#elif defined(__aarch64__)
	str	xzr, [x1, x2, lsl #3]
	str	x16, [x1, x3, lsl #3]
	ldr	x0, [x1, x2, lsl #3]
	ldr	x0, [x1, x0, lsl #3]
	// clear the least significant set bit in a, by calculating a' =
	// a AND (a - 1).
	//
	// if a = 0 then a' = 0.  otherwise, a - 1 differs from a exactly in
	// the least significant /set/ bit of a, and all bits of lesser
	// significance.  to put it another way: write a = u 2^{k+1} + 2^k;
	// then a - 1 = u 2^{k+1} + 2^{k-1} + ... + 2 + 1.  taking the
	// bitwise AND of these leaves only the bits common to both, i.e.,
	// u 2^{k+1}.
#if defined(__x86_64__)
	mov	rdx, rax		// d' = a
	dec	rax			// a' = a - 1
	and	rax, rdx		// a' = a AND (a - 1)
#elif defined(__i386__)
	mov	edx, eax		// d' = a
	dec	eax			// a' = a - 1
	and	eax, edx		// a' = a AND (a - 1)
#elif defined(__arm__)
#elif defined(__aarch64__)
	// compute a mask of one bits in exactly the positions of the
	// low-order run of zero bits in a
#if defined(__x86_64__)
	mov	rdx, rax		// d' = a
	dec	rdx			// d' = a - 1
	xor	rax, rdx		// a = a XOR (a - 1)
					// set bits are least significant
					// set bit of a, and all bits of
					// lesser significance
	shr	rax, 1			// now only bits of lesser
					// significance; a' = 0 iff a odd
	cmp	rax, rdx		// equal if a = 0 or 2^k; otherwise
#elif defined(__i386__)
#elif defined(__arm__)
	mov	r0, r0, lsr #1		// probably fold shift into next inst
#elif defined(__aarch64__)
	mov	x0, x0, lsr #1		// probably fold shift into next inst
	// a slow population count
#if defined(__x86_64__)
	popcnt	rbx, rcx		// the easy way
	// a fast version in software
	mov	rsi, 0x5555555555555555
	mov	rsi, 0x3333333333333333
	// the official version
	xor	eax, eax		// clear iteration counter
0:	jrcxz	9f			// bail if c = 0
	inc	rax			// bump iteration count
	mov	rdx, rcx		// d' = c
	dec	rdx			// d' = c - 1
	and	rcx, rdx		// zap least significant set bit of c
	jmp	0b			// and go again
#elif defined(__i386__)
	popcnt	ebx, ecx		// the easy way
#elif defined(__arm__)
	add	r1, r1, r1, lsl #8
	add	r1, r1, r1, lsl #16
	and	r3, r12, r2, lsr #1
	and	r3, r12, r0, lsr #2
	add	r0, r0, r0, lsl #16
	and	r3, r12, r0, lsr #4
	add	r0, r0, r0, lsl #8
	// and following the exercise
#elif defined(__aarch64__)
	add	x1, x1, x1, lsl #8
	add	x1, x1, x1, lsl #16
	add	x1, x1, x1, lsl #32
	// the hard way -- though arm64's immediate constant encodings and
	// shifting make this actually rather pleasant.
	and	x3, x2, #0xaaaaaaaaaaaaaaaa
	and	x0, x2, #0x5555555555555555
	add	x0, x0, x3, lsr #1
	and	x3, x0, #0xcccccccccccccccc
	and	x0, x0, #0x3333333333333333
	add	x0, x0, x3, lsr #2
	add	x0, x0, x0, lsr #4
	and	x3, x0, #0x0f000f000f000f00
	and	x0, x0, #0x000f000f000f000f
	add	x0, x3, x0, lsl #8
	add	x0, x0, x0, lsl #16
	add	x0, x0, x0, lsl #32
	// and the official way
///--------------------------------------------------------------------------
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
///----- That's all, folks --------------------------------------------------