1 /// -*- mode: asm; asm-comment-char: 0 -*-
3 ///--------------------------------------------------------------------------
6 #include <sys/syscall.h>
8 #if defined(__i386__) || defined(__x86_64__)
10 .intel_syntax noprefix
12 #elif defined(__arm__)
20 #elif defined(__aarch64__)
22 .macro cmov rd, rn, cc
23 csel \rd, \rn, \rd, \cc
26 _(eq) _(ne) _(cs) _(cc) _(vs) _(vc) _(mi) _(pl) \
27 _(ge) _(lt) _(gt) _(le) _(hi) _(ls) _(al) _(nv) \
32 _(csinc) _(cinc) _(cset) \
34 _(csinv) _(cinv) _(csetm)
35 #define _CONDVAR(cc) _definstvar cc;
36 #define _INSTVARS(inst) \
37 .macro _definstvar cc; \
38 .macro inst.\cc args:vararg; inst \args, \cc; .endm; \
53 #define CCMP_MI CCMP_N
55 #define CCMP_EQ CCMP_Z
57 #define CCMP_CS CCMP_C
58 #define CCMP_HS CCMP_C
61 #define CCMP_VS CCMP_V
63 #define CCMP_HI CCMP_C
65 #define CCMP_LT CCMP_N
67 #define CCMP_LE CCMP_N
71 # error "not supported"
80 .size \name, . - \name
101 add ebx, offset _GLOBAL_OFFSET_TABLE_
102 mov eax, [ebx + stdout@GOT]
114 #elif defined(__x86_64__)
131 mov rdi, [rip + stdout]
145 #elif defined(__arm__)
147 stmfd r13!, {r0-r4, r12, r14}
156 ldr r14, .L$_c$gotoff$\@
161 .word _GLOBAL_OFFSET_TABLE_ - .L$_c$gotpc$\@ - 8
166 ldmfd r13!, {r0-r4, r12, r14}
168 #elif defined(__aarch64__)
172 stp x2, x3, [sp, #16]
173 stp x4, x5, [sp, #32]
174 stp x6, x7, [sp, #48]
175 stp x8, x9, [sp, #64]
176 stp x10, x11, [sp, #80]
177 stp x12, x13, [sp, #96]
178 stp x14, x15, [sp, #112]
179 stp x16, x17, [sp, #128]
181 stp x16, x30, [sp, #144]
186 ldr x0, [x0, #:got_lo12:stdout]
190 ldp x16, x30, [sp, #144]
192 ldp x16, x17, [sp, #128]
193 ldp x14, x15, [sp, #112]
194 ldp x12, x13, [sp, #96]
195 ldp x10, x11, [sp, #80]
196 ldp x8, x9, [sp, #64]
197 ldp x6, x7, [sp, #48]
198 ldp x4, x5, [sp, #32]
199 ldp x2, x3, [sp, #16]
204 # error "not supported"
209 #if defined(__i386__) || defined(__x86_64__)
211 #elif defined(__arm__)
213 #elif defined(__aarch64__)
216 # error "not supported"
220 .section .note.GNU-stack, "", %progbits
224 #if defined(__i386__)
233 #if defined(__i386__)
237 push edi // edi, esi, ebx
238 push ebp // flags, ebp, ..., ebx
243 push esi // regs, flags, ebp, ..., ebx
246 lea eax, [ebx + 9f - .]
247 push eax // cont, regs, flags, ebp, ..., ebx
248 push edi // func, cont, regs, flags, ebp, ..., ebx
266 ret // -> func; regs, flags, ebp, ..., ebx
268 9: pushf // eflags, regs, flags, ebp, ..., ebx
269 push esi // esi, eflags, regs, flags, ebp, ..., ebx
277 pop eax // eflags, regs, flags, ebp, ..., ebx
279 pop eax // regs, flags, ebp, ..., ebx
282 add esp, 4 // flags, ebp, ..., ebx
283 popf // ebp, ..., ebx
290 #elif defined(__x86_64__)
299 push rbp // flags, rbp, ..., rbx
302 push rsi // regs, flags, rbp, ..., rbx
305 push rax // cont, regs, flags, rbp, ..., rbx
306 push rdi // func, cont, regs, flags, rbp, ..., rbx
308 mov rax, [rsi + 8*15]
332 ret // -> func; regs, flags, rbp, ..., rbx
334 9: pushf // rflags, regs, flags, rbp, ..., rbx
335 push rsi // rsi, rflags, regs, flags, rbp, ..., rbx
351 pop rax // rflags, regs, flags, rbp, ..., rbx
353 pop rax // regs, flags, rbp, ..., rbx
356 add rsp, 8 // flags, rbp, ..., rbx
357 popf // rbp, ..., rbx
368 #elif defined(__arm__)
370 stmfd r13!, {r0, r1, r4-r11, r14}
371 ldmia r1, {r0-r12, r14}
379 ldmfd r13!, {r4-r11, pc}
381 #elif defined(__aarch64__)
383 stp x29, x30, [sp, #-14*8]!
385 stp x19, x20, [sp, #16]
386 stp x21, x22, [sp, #32]
387 stp x23, x24, [sp, #48]
388 stp x25, x26, [sp, #64]
389 stp x27, x28, [sp, #80]
392 ldp x29, x30, [x1, #224]
395 ldp x27, x28, [x1, #208]
396 ldp x25, x26, [x1, #192]
397 ldp x23, x24, [x1, #176]
398 ldp x21, x22, [x1, #160]
399 ldp x19, x20, [x1, #144]
400 ldp x16, x17, [x1, #128]
401 ldp x14, x15, [x1, #112]
402 ldp x12, x13, [x1, #96]
403 ldp x10, x11, [x1, #80]
404 ldp x8, x9, [x1, #64]
405 ldp x6, x7, [x1, #48]
406 ldp x4, x5, [x1, #32]
407 ldp x2, x3, [x1, #16]
413 stp x27, x28, [x30, #208]
414 stp x25, x26, [x30, #192]
415 stp x23, x24, [x30, #176]
416 stp x21, x22, [x30, #160]
417 stp x19, x20, [x30, #144]
418 stp x16, x17, [x30, #128]
419 stp x14, x15, [x30, #112]
420 stp x12, x13, [x30, #96]
421 stp x10, x11, [x30, #80]
422 stp x8, x9, [x30, #64]
423 stp x6, x7, [x30, #48]
424 stp x4, x5, [x30, #32]
425 stp x2, x3, [x30, #16]
426 stp x0, x1, [x30, #0]
429 stp x29, x30, [x0, #224]
431 ldp x19, x20, [sp, #16]
432 ldp x21, x22, [sp, #32]
433 ldp x23, x24, [sp, #48]
434 ldp x25, x26, [sp, #64]
435 ldp x27, x28, [sp, #80]
436 ldp x29, x30, [sp], #14*8
441 # error "not supported"
452 ///--------------------------------------------------------------------------
457 // clear all 64 bits of extended traditional registers
459 #if defined(__x86_64__)
461 xor eax, eax // clear rax
462 lea rbx, [0] // rbx -> _|_
463 loop . // iterate, decrement rcx until zero
464 mov rdx, 0 // set rdx = 0
465 and esi, 0 // clear all bits of rsi
466 sub edi, edi // set rdi = edi - edi = 0
468 pop rbp // pop 0 into rbp
470 #elif defined(__i386__)
481 #elif defined(__arm__)
491 #elif defined(__aarch64__)
511 // advance a fibonacci pair by c steps
513 // on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1}
514 // and f_{i+c}, where f_{i+1} = f_i + f_{i-1}
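// in c, the loop below is something like this (a sketch, with a, d, c
// standing in for the registers):
//
//	while (c--) { uint64_t t = a; a += d; d = t; }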
516 #if defined(__x86_64__)
518 0: xadd rax, rdx // a, d = a + d, a
519 // = f_{i+1} + f_i, f_{i+1}
520 // = f_{i+2}, f_{i+1}
521 loop 0b // advance i, decrement c, iterate
523 #elif defined(__i386__)
528 #elif defined(__arm__)
538 #elif defined(__aarch64__)
558 // boolean canonify a: if a = 0 on entry, leave it zero; otherwise
561 #if defined(__x86_64__)
563 neg rax // set cf iff a /= 0
564 sbb rax, rax // a = a - a - cf = -cf
567 #elif defined(__i386__)
573 #elif defined(__arm__)
575 movs r1, r0 // the easy way
576 movne r1, #1 // mvnne r1, #1 for mask
578 cmp r0, #1 // clear cf iff a == 0
579 sbc r2, r0, r0 // c' = a - a - 1 + cf = cf - 1
580 add r2, r2, #1 // c' = cf
582 sub r3, r0, r0, lsr #1 // d' top bit clear; d' = 0 iff a = 0
583 rsb r3, r3, #0 // d' top bit set iff a /= 0
584 mov r3, r3, lsr #31 // asr for mask
590 #elif defined(__aarch64__)
592 cmp x0, #0 // trivial
593 cset.ne x1 // csetm for mask
595 cmp xzr, x0 // set cf iff a == 0
596 sbc x2, x0, x0 // c' = a - a - 1 + cf = cf - 1
597 neg x2, x2 // c' = 1 - cf
599 sub x3, x0, x0, lsr #1 // if a < 2^63 then d' = ceil(a/2) <
601 // if a >= 2^63, write a = 2^63 + t
602 // with t < 2^63; d' = 2^63 - 2^62 +
603 // ceil(t/2) = 2^62 + ceil(t/2), and
605 // anyway d' < 2^63 and d' = 0 iff
607 neg x3, x3 // d' top bit set iff a /= 0
608 lsr x3, x3, #63 // asr for mask
610 cmp x0, #1 // set cf iff a /= 0
611 adc x0, xzr, xzr // a' = 0 + 0 + cf = cf
623 // set a = min(a, d) (unsigned); clobber c, d
625 #if defined(__x86_64__)
627 sub rdx, rax // d' = d - a; set cf if a > d
628 sbb rcx, rcx // c = -cf = -[a > d]
629 and rcx, rdx // c = a > d ? d - a : 0
630 add rax, rcx // a' = a > d ? d : a
632 #elif defined(__i386__)
639 #elif defined(__arm__)
641 cmp r0, r3 // the easy way
642 movlo r1, r0 // only needed for out-of-place
650 #elif defined(__aarch64__)
652 cmp x0, x3 // the easy way
655 subs x3, x3, x0 // d' = d - a; set cf if d >= a
656 sbc x16, xzr, xzr // t = -1 + cf = -[a > d]
657 and x16, x16, x3 // t = a > d ? d - a : 0
658 add x0, x0, x16 // a' = a > d ? d : a
672 #if defined(__x86_64__)
690 #elif defined(__i386__)
708 #elif defined(__arm__)
722 #elif defined(__aarch64__)
730 sub w16, w16, #'a' - 10
732 ccmp.hs w16, #16, #CCMP_HS
747 // answer whether 5 <= a </<= 9.
749 #if defined(__x86_64__)
751 sub rax, 5 // a' = a - 5
752 cmp rax, 4 // is a' </<= 4?
757 // nz/ne a' /= 4 a /= 9
759 // a/nbe a' > 4 a > 9 or a < 5
760 // nc/ae/nb a' >= 4 a >= 9 or a < 5
761 // c/b/nae a' < 4 5 <= a < 9
762 // be/na a' <= 4 5 <= a <= 9
764 // o a' < -2^63 + 4 -2^63 + 5 <= a < -2^63 + 9
765 // no a' >= -2^63 + 4 a >= -2^63 + 9 or
767 // s -2^63 + 4 <= a' < 4 -2^63 + 9 <= a < 9
768 // ns a' < -2^63 + 4 or a < -2^63 + 9 or a >= 9
770 // ge/nl a' >= 4 a >= 9 or a < -2^63 + 5
771 // l/nge a' < 4 -2^63 + 5 <= a < 9
772 // g/nle a' > 4 a > 9 or a < -2^63 + 5
773 // le/ng a' <= 4 -2^63 + 5 <= a <= 9
775 #elif defined(__i386__)
780 #elif defined(__arm__)
782 // i dimly remember having a slick way to do this way back in the
783 // day, but i can't figure it out any more.
787 #elif defined(__aarch64__)
789 // literal translation is too obvious
791 ccmp.hs x0, #9, #CCMP_HS
803 // leave a unchanged, but set zf if a = 0, cf if a /= 0, clear of,
806 #if defined(__x86_64__)
808 not rax // a' = -a - 1
812 #elif defined(__i386__)
818 #elif defined(__arm__)
822 rsbs r0, r0, #0 // cf has opposite sense
824 #elif defined(__aarch64__)
828 negs x0, x0 // cf has opposite sense
840 // same as before (?)
842 #if defined(__x86_64__)
844 inc rax // a' = a + 1
845 neg rax // a' = -a - 1
849 #elif defined(__i386__)
856 #elif defined(__arm__)
863 #elif defined(__aarch64__)
868 negs x0, x0 // cf has opposite sense
880 // floor((a + d)/2), correctly handling overflow conditions; final cf
881 // is lsb(a + d), probably uninteresting
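// in c, with no 65-bit intermediate to lean on, the same answer comes
// from (a >> 1) + (d >> 1) + (a & d & 1) -- halve both addends and put
// back the carry out of the low bits.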
883 #if defined(__x86_64__)
885 add rax, rdx // cf || a' = a + d
886 rcr rax, 1 // shift 65-bit result right by one
887 // place; lsb moves into carry
889 #elif defined(__i386__)
894 #elif defined(__arm__)
896 // like the two-instruction a64 version
898 add r1, r0, r1, lsr #1
900 // the slick version, similar to the above
904 #elif defined(__aarch64__)
906 // a64 lacks a32's rrx. literal translation.
907 adds x1, x0, x3 // cf || a' = a + d
908 adc x16, xzr, xzr // realize cf in extra register
909 extr x1, x16, x1, #1 // shift down one place
911 // two instruction version: clobbers additional register. (if you
912 // wanted the answer in any other register, even overwriting d, then
913 // this is unnecessary.) also depends on d >= a.
914 sub x16, x3, x0 // compute difference
915 add x0, x0, x16, lsr #1 // add half of it (rounded down)
927 // a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3
928 // (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8).
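// in c, the same rounding is (a >> 3) + ((a >> 2) & 1): bit 2, the
// last bit shifted out, is exactly the round-up bit.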
930 #if defined(__x86_64__)
932 shr rax, 3 // a' = floor(a/8); cf = 1 if a ==
933 // 4, 5, 6, 7 (mod 8)
934 adc rax, 0 // a' = floor(a/8) + cf
936 #elif defined(__i386__)
941 #elif defined(__arm__)
946 #elif defined(__aarch64__)
949 orr x0, xzr, x0, lsr #3
962 // increment c-byte little-endian bignum at rdi
964 #if defined(__x86_64__)
966 add byte ptr [rdi], 1
968 adc byte ptr [rdi], 0
971 #elif defined(__i386__)
973 add byte ptr [edi], 1
975 adc byte ptr [edi], 0
978 #elif defined(__arm__)
980 mov r12, #256 // set initial carry
983 add r12, r0, r12, lsr #8
987 #elif defined(__aarch64__)
989 mov w17, #256 // set initial carry
992 add w17, w16, w17, lsr #8
1006 // negate double-precision d:a
1008 #if defined(__x86_64__)
1010 not rdx // d' = -d - 1
1012 // cf = 1 iff a /= 0
1013 sbb rdx, -1 // d' = -d - cf
1015 #elif defined(__i386__)
1021 #elif defined(__arm__)
1023 // reverse subtract is awesome
1027 #elif defined(__aarch64__)
1029 // easy way: everything is better with zero registers.
1043 // rotate is distributive over xor.
1045 #if defined(__x86_64__)
1047 // rax // = a_1 || a_0
1048 // rbx // = b_1 || b_0
1049 mov rcx, rax // = a_1 || a_0
1051 xor rcx, rbx // = (a_1 XOR b_1) || (a_0 XOR b_0)
1052 ror rcx, 0xd // = (a_0 XOR b_0) || (a_1 XOR b_1)
1054 ror rax, 0xd // = a_0 || a_1
1055 ror rbx, 0xd // = b_0 || b_1
1056 xor rax, rbx // = (a_0 XOR b_0) || (a_1 XOR b_1)
1058 cmp rax, rcx // always equal
1060 #elif defined(__i386__)
1062 mov ecx, eax // = a_1 || a_0
1064 xor ecx, ebx // = (a_1 XOR b_1) || (a_0 XOR b_0)
1065 ror ecx, 0xd // = (a_0 XOR b_0) || (a_1 XOR b_1)
1067 ror eax, 0xd // = a_0 || a_1
1068 ror ebx, 0xd // = b_0 || b_1
1069 xor eax, ebx // = (a_0 XOR b_0) || (a_1 XOR b_1)
1071 cmp eax, ecx // always equal
1073 #elif defined(__arm__)
1076 // r0 // = a_1 || a_0
1077 // r1 // = b_1 || b_0
1078 eor r2, r0, r1 // = (a_1 XOR b_1) || (a_0 XOR b_0)
1079 mov r2, r2, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
1081 mov r1, r1, ror #13 // = b_0 || b_1
1082 eor r0, r1, r0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
1084 cmp r0, r2 // always equal
1086 #elif defined(__aarch64__)
1088 // x0 // = a_1 || a_0
1089 // x1 // = b_1 || b_0
1090 eor x2, x0, x1 // = (a_1 XOR b_1) || (a_0 XOR b_0)
1091 ror x2, x2, #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
1093 ror x1, x1, #13 // = b_0 || b_1
1094 eor x0, x1, x0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
1096 cmp x0, x2 // always equal
1108 // and is distributive over xor.
1110 #if defined(__x86_64__)
1114 xor rbx, rcx // = b XOR c
1115 and rbx, rax // = a AND (b XOR c)
1117 and rdx, rax // = a AND b
1118 and rax, rcx // = a AND c
1119 xor rax, rdx // = (a AND b) XOR (a AND c)
1120 // = a AND (b XOR c)
1122 cmp rax, rbx // always equal
1124 #elif defined(__i386__)
1128 xor ebx, ecx // = b XOR c
1129 and ebx, eax // = a AND (b XOR c)
1131 and edx, eax // = a AND b
1132 and eax, ecx // = a AND c
1133 xor eax, edx // = (a AND b) XOR (a AND c)
1134 // = a AND (b XOR c)
1136 cmp eax, ebx // always equal
1138 #elif defined(__arm__)
1140 and r3, r0, r1 // = a AND b
1142 eor r1, r1, r2 // = b XOR c
1143 and r1, r1, r0 // = a AND (b XOR c)
1145 and r0, r0, r2 // = a AND c
1146 eor r0, r0, r3 // = (a AND b) XOR (a AND c)
1147 // = a AND (b XOR c)
1149 cmp r0, r1 // always equal
1151 #elif defined(__aarch64__)
1153 and x3, x0, x1 // = a AND b
1155 eor x1, x1, x2 // = b XOR c
1156 and x1, x1, x0 // = a AND (b XOR c)
1158 and x0, x0, x2 // = a AND c
1159 eor x0, x0, x3 // = (a AND b) XOR (a AND c)
1160 // = a AND (b XOR c)
1162 cmp x0, x1 // always equal
1176 #if defined(__x86_64__)
1180 and rcx, rbx // = a AND b
1181 not rcx // = NOT (a AND b)
1185 or rax, rbx // = (NOT a) OR (NOT b)
1188 cmp rax, rcx // always equal
1190 #elif defined(__i386__)
1194 and ecx, ebx // = a AND b
1195 not ecx // = NOT (a AND b)
1199 or eax, ebx // = (NOT a) OR (NOT b)
1202 cmp eax, ecx // always equal
1204 #elif defined(__arm__)
1206 and r2, r0, r1 // = a AND b
1207 mvn r2, r2 // = NOT (a AND b)
1209 mvn r0, r0 // = NOT a
1210 mvn r1, r1 // = NOT b
1211 orr r0, r0, r1 // = (NOT a) OR (NOT b)
1213 cmp r0, r2 // always equal
1215 #elif defined(__aarch64__)
1217 and x2, x0, x1 // = a AND b
1218 mvn x2, x2 // = NOT (a AND b)
1220 mvn x0, x0 // = NOT a
1221 orn x0, x0, x1 // = (NOT a) OR (NOT b)
1223 cmp x0, x2 // always equal
1235 // replace input buffer bytes with cumulative XORs with initial a;
1236 // final a is XOR of all buffer bytes and initial a.
1238 // not sure why you'd do this.
1240 #if defined(__x86_64__)
1246 #elif defined(__i386__)
1252 #elif defined(__arm__)
1260 #elif defined(__aarch64__)
1276 ///--------------------------------------------------------------------------
1281 // four different ways to swap a pair of registers.
1283 #if defined(__x86_64__)
1301 #elif defined(__i386__)
1319 #elif defined(__arm__)
1321 stmfd r13!, {r0, r2}
1331 rsb r0, r0, r2 // don't need 3-addr with reverse-sub
1337 #elif defined(__aarch64__)
1339 // anything you can do
1340 stp x0, x2, [sp, #-16]!
1341 ldp x2, x0, [sp], #16
1347 // the add/sub/add thing was daft. you can do it in three if you're
1348 // clever -- and have three-address operations.
1353 // but we lack a fourth. we can't do this in fewer than three
1354 // instructions without hitting memory. only `ldp' will modify two
1355 // registers at a time, so we need at least two instructions -- but
1356 // if the first one sets one of our two registers to its final value
1357 // then we lose the other input value with no way to recover it, so
1358 // we must either write a fresh third register, or write something
1359 // other than the final value, and in both cases we need a third
1360 // instruction to fix everything up. we've done the wrong-something-
1361 // other trick twice, so here's the captain-obvious use-a-third-
1362 // register version.
1377 // assuming a is initialized to zero, set a to the inclusive or of
1378 // the xor-differences of corresponding bytes in the c-byte strings
1381 // in particular, a will be zero (and zf set) if and only if the two
1382 // strings are equal.
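// roughly, in c (a sketch; p, q and c are made-up names for the
// operands):
//
//	for (i = 0; i < c; i++) a |= p[i] ^ q[i];
//
// i.e., a constant-time memcmp-for-equality.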
1384 #if defined(__x86_64__)
1393 #elif defined(__i386__)
1402 #elif defined(__arm__)
1404 0: ldrb r1, [r4], #1
1411 #elif defined(__aarch64__)
1413 0: ldrb w16, [x4], #1
1430 // an obtuse way of adding two registers. for any bit position, a
1431 // OR d is set if and only if at least one of a and d has a bit set
1432 // in that position, and a AND d is set if and only if both have a
1433 // bit set in that position. essentially, then, what we've done is
1434 // move all of the set bits in d to a, unless there's already a bit
1435 // there. this clearly doesn't change the sum.
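// e.g., a = 5, d = 3: a OR d = 7, a AND d = 1, and 7 + 1 = 8 = 5 + 3,
// as promised.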
1437 #if defined(__x86_64__)
1439 mov rcx, rdx // c' = d
1440 and rdx, rax // d' = a AND d
1441 or rax, rcx // a' = a OR d
1444 #elif defined(__i386__)
1446 mov ecx, edx // c' = d
1447 and edx, eax // d' = a AND d
1448 or eax, ecx // a' = a OR d
1451 #elif defined(__arm__)
1453 and r2, r0, r3 // c' = a AND d
1454 orr r0, r0, r3 // a' = a OR d
1457 #elif defined(__aarch64__)
1459 and x2, x0, x3 // c' = a AND d
1460 orr x0, x0, x3 // a' = a OR d
1473 // ok, so this is a really obtuse way of adding a and b; the result
1474 // is in a and d. but why does it work?
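// briefly: a XOR b is the sum without carries, and (a AND b) << 1 is
// exactly the carries, so each step preserves a + b while pushing the
// pending carries one place left; after at most 64 steps they fall off
// the top and b is zero.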
1476 #if defined(__x86_64__)
1478 mov rcx, 0x40 // carry chains at most 64 long
1479 0: mov rdx, rax // copy a'
1480 xor rax, rbx // low bits of each bitwise sum
1481 and rbx, rdx // carry bits from each bitwise sum
1482 shl rbx, 1 // carry them into next position
1485 #elif defined(__i386__)
1487 mov ecx, 0x40 // carry chains at most 64 long
1488 0: mov edx, eax // copy a'
1489 xor eax, ebx // low bits of each bitwise sum
1490 and ebx, edx // carry bits from each bitwise sum
1491 shl ebx, 1 // carry them into next position
1494 #elif defined(__arm__)
1503 #elif defined(__aarch64__)
1522 // floor((a + d)/2), like x08.
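// the c rendering of this version (a sketch): (a & d) + ((a ^ d) >> 1)
// -- the shared bits plus half the differing bits.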
1524 #if defined(__x86_64__)
1526 mov rcx, rax // copy a for later
1527 and rcx, rdx // carry bits
1529 xor rax, rdx // low bits of each bitwise sum
1530 shr rax, 1 // divide by 2; carries now in place
1532 add rax, rcx // add the carries; done
1534 #elif defined(__i386__)
1536 mov ecx, eax // copy a for later
1537 and ecx, edx // carry bits
1539 xor eax, edx // low bits of each bitwise sum
1540 shr eax, 1 // divide by 2; carries now in place
1542 add eax, ecx // add the carries; done
1544 #elif defined(__arm__)
1548 add r0, r2, r0, lsr #1
1550 #elif defined(__aarch64__)
1554 add x0, x2, x0, lsr #1
1566 // sign extension 32 -> 64 bits.
1568 #if defined(__x86_64__)
1570 movsx rbx, eax // like this?
1572 mov rdx, 0xffffffff80000000
1573 add rax, rdx // if bit 31 of a is set then bits
1574 // 31--63 of a' are clear; otherwise,
1575 // these bits are all set -- which is
1576 // exactly backwards
1577 xor rax, rdx // so fix it
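// the same trick in c (a sketch; x is a 32-bit value zero-extended
// into a 64-bit a):
//
//	a = (a + 0xffffffff80000000) ^ 0xffffffff80000000;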
1579 #elif defined(__i386__)
1581 movsx ebx, ax // like this?
1584 add eax, edx // if bit 15 of a is set then bits
1585 // 15--31 of a' are clear; otherwise,
1586 // these bits are all set -- which is
1587 // exactly backwards
1588 xor eax, edx // so fix it
1590 #elif defined(__arm__)
1592 sxth r1, r0 // like this
1594 mov r12, #0x80000000
1595 add r0, r0, r12, asr #16
1596 eor r0, r0, r12, asr #16
1598 #elif defined(__aarch64__)
1600 sxtw x1, w0 // like this
1602 mov x16, #0xffffffff80000000
1616 // ??? i don't know why you'd want to calculate this.
1618 #if defined(__x86_64__)
1620 xor rax, rbx // a' = a XOR b
1621 xor rbx, rcx // b' = b XOR c
1622 mov rsi, rax // t = a XOR b
1623 add rsi, rbx // t = (a XOR b) + (b XOR c)
1624 cmovc rax, rbx // a' = cf ? b XOR c : a XOR b
1625 xor rax, rbx // a' = cf ? 0 : a XOR c
1628 #elif defined(__i386__)
1630 xor eax, ebx // a' = a XOR b
1631 xor ebx, ecx // b' = b XOR c
1632 mov esi, eax // t = a XOR b
1633 add esi, ebx // t = (a XOR b) + (b XOR c)
1634 cmovc eax, ebx // a' = cf ? b XOR c : a XOR b
1635 xor eax, ebx // a' = cf ? 0 : a XOR c
1638 #elif defined(__arm__)
1647 #elif defined(__aarch64__)
1668 #if defined(__x86_64__)
1670 cqo // d = a < 0 ? -1 : 0
1671 xor rax, rdx // a' = a < 0 ? -a - 1 : a
1672 sub rax, rdx // a' = a < 0 ? -a : a
1674 #elif defined(__i386__)
1676 cdq // d = a < 0 ? -1 : 0
1677 xor eax, edx // a' = a < 0 ? -a - 1 : a
1678 sub eax, edx // a' = a < 0 ? -a : a
1680 #elif defined(__arm__)
1686 // faithful-ish conversion
1687 eor r3, r0, r0, asr #31
1688 sub r0, r3, r0, asr #31
1690 #elif defined(__aarch64__)
1696 // faithful-ish conversion
1697 eor x3, x0, x0, asr #63
1698 sub x0, x3, x0, asr #63
1710 // should always set sf, clear zf, unless we get rescheduled to a
1713 #if defined(__x86_64__)
1715 rdtsc // d || a = cycles
1717 or rax, rdx // a = cycles
1718 mov rcx, rax // c = cycles
1720 rdtsc // d || a = cycles'
1722 or rax, rdx // a = cycles'
1726 #elif defined(__i386__)
1728 rdtsc // d || a = cycles
1730 mov ecx, edx // c || b = cycles
1732 rdtsc // d || a = cycles'
1737 #elif defined(__arm__)
1739 // cycle clock not available in user mode
1740 mrrc p15, 0, r0, r1, c9
1741 mrrc p15, 0, r2, r3, c9
1745 #elif defined(__aarch64__)
1747 // cycle clock not available in user mode
1762 // stupid way to capture a pointer to inline data and jump past it.
1763 // confuses the return-address predictor something chronic. worse
1764 // because amd64 calling convention doesn't usually pass arguments on
1767 #if defined(__x86_64__)
1770 .string "hello world!\n\0"
1776 // actually implement this ridiculous thing
1779 0: mov al, [rsi + rdx]
1786 syscall // clobbers r11 :-(
1789 #elif defined(__i386__)
1792 .string "hello world!\n\0"
1798 // actually implement this ridiculous thing
1801 0: mov al, [ecx + edx]
1811 #elif defined(__arm__)
1813 // why am i doing this?
1816 .string "hello world!\n\0"
1818 8: mov r1, r14 // might as well make it easy on myself
1824 0: ldrb r0, [r1, r2]
1833 #elif defined(__aarch64__)
1835 // why am i doing this?
1836 str x30, [sp, #-16]!
1838 .string "hello world!\n\0"
1840 8: mov x1, x30 // might as well make it easy on myself
1847 0: ldrb w0, [x1, x2]
1864 // collect the current instruction-pointer address. this was an old
1865 // 32-bit i386 trick for position-independent code, but (a) it
1866 // confuses the return predictor, and (b) amd64 has true pc-relative
1869 #if defined(__x86_64__)
1871 // the actual example
1875 // the modern i386 trick doesn't confuse the return-address
1880 // but rip-relative addressing is even better
1889 #elif defined(__i386__)
1891 // the actual example
1895 // the modern i386 trick doesn't confuse the return-address
1902 #elif defined(__arm__)
1910 sub r1, r14, #. - 0b
1918 #elif defined(__aarch64__)
1920 str x30, [sp, #-16]!
1922 // we can do all of the above using a64
1927 sub x1, x30, #. - 0b
1942 #if defined(__x86_64__)
1944 // retpolines: an mitigation against adversarially influenced
1945 // speculative execution at indirect branches. if an adversary can
1946 // prepare a branch-target buffer entry matching an indirect branch
1947 // in the victim's address space then they can cause the victim to
1948 // /speculatively/ (but not architecturally) execute any code in
1949 // their address space, possibly leading to leaking secrets through
1950 // the cache. retpolines aren't susceptible to this because the
1951 // predicted destination address is from the return-prediction stack
1952 // which the adversary can't prime. the performance penalty is still
1953 // essentially a branch misprediction -- for this return, and
1954 // possibly all others already stacked.
1956 // (try not to crash)
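// the shape of the classic thunk, for `jmp rax' (a sketch):
//
//	call 1f
// 0:	pause // speculation ends up trapped here
//	lfence
//	jmp 0b
// 1:	mov [rsp], rax // overwrite the return address
//	ret // predicted into the trap; actually -> rax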
1962 #elif defined(__i386__)
1965 lea eax, [ebx + 9f - .]
1970 #elif defined(__arm__)
1979 #elif defined(__aarch64__)
1981 str x30, [sp, #-16]!
1986 8: ldr x30, [sp], #16
1997 // ok, having a hard time seeing a use for this. the most important
1998 // thing to note is that sp is set from `pop' /after/ it's
2001 #if defined(__x86_64__)
2014 #elif defined(__i386__)
2027 #elif defined(__arm__)
2029 // not even going to dignify this
2032 #elif defined(__aarch64__)
2034 // not even going to dignify this
2045 // monumentally cheesy way to copy 8 n bytes from buff1 to buff2.
2046 // also clobbers words at buff2 + 8 n and buff2 - 8 for good measure.
2050 #if defined(__x86_64__)
2052 mov rax, rsp // safekeeping
2054 // we're toast if we get hit by a signal now. fingers crossed...
2056 mov rsp, offset buff2 + 8*n + 8
2057 mov rbp, offset buff1 + 8*n
2059 lea rsp, [rdi + 8*n + 16]
2060 lea rbp, [rsi + 8*n]
2066 //        +---------+               +---------+
2067 // rbp -> | ???     |        rsp -> | ???     |
2068 //        +---------+               +---------+
2069 //        | w_{n-1} |               | rbp     | <- rbp'
2070 //        +---------+               +---------+
2071 //        | ...     |               | w_{n-1} |
2072 //        +---------+               +---------+
2073 //        | w_0     |               | ...     |
2074 //        +---------+               +---------+
2075 //                                  | w_0     |
2076 //                                  +---------+
2085 #elif defined(__i386__)
2087 mov eax, esp // safekeeping
2089 // we're toast if we get hit by a signal now. fingers crossed...
2091 mov esp, offset buff2 + 4*n + 4
2092 mov ebp, offset buff1 + 4*n
2094 lea esp, [edi + 4*n + 8]
2095 lea ebp, [esi + 4*n]
2102 #elif defined(__arm__)
2105 add r5, r5, #4*n + 8
2109 ldrd r0, r1, [r4, #-8]!
2110 strd r0, r1, [r5, #-8]!
2115 #elif defined(__aarch64__)
2117 // omgwtf. let's not actually screw with the stack pointer.
2120 add x5, x5, #8*n + 16
2124 ldp x16, x17, [x4, #-16]!
2125 stp x16, x17, [x5, #-16]!
2140 // convert nibble value to (uppercase) hex; other input values yield
2143 #if defined(__x86_64__)
2145 // das doesn't work in 64-bit mode; best i can come up with
2152 #elif defined(__i386__)
2154 cmp al, 0x0a // cf = 1 iff a < 10
2155 sbb al, 0x69 // if 0 <= a < 10, a' = a - 0x6a, so
2156 // 0x96 <= a' < 0xa0, setting af, cf
2157 // if 10 <= a < 16, a' = a - 0x69, so
2158 // 0xa1 <= a' < 0xa7, setting cf but
2160 das // if 0 <= a < 10, then af and cf are
2161 // both set, so subtract 0x66
2162 // from a' leaving 0x30 <= a' < 0x3a;
2163 // if 10 <= a < 16 then af clear but
2164 // cf set, so subtract 0x60 from a'
2165 // leaving 0x41 <= a' < 0x47
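// all of which is a very roundabout way of computing, in c:
//
//	a = a < 10 ? a + '0' : a - 10 + 'A';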
2167 #elif defined(__arm__)
2169 // significantly less tricksy
2172 addhs r0, r0, #'A' - 10
2174 #elif defined(__aarch64__)
2176 // with less versatile conditional execution this is the best we can
2179 add w16, w0, #'A' - 10
2193 // verify collatz conjecture starting at a; assume a /= 0!
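// in c, something like this (a sketch; ctz counts trailing zero bits):
//
//	while (a != 1) { a >>= ctz(a); if (a != 1) a = 3*a + 1; }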
2195 #if defined(__x86_64__)
2197 0: bsf rcx, rax // clobber c if a = 0
2198 shr rax, cl // a = 2^c a'
2206 lea rax, [2*rax + rax + 1] // a' = 3 a' + 1
2211 #elif defined(__i386__)
2213 0: bsf ecx, eax // clobber c if a = 0
2214 shr eax, cl // a = 2^c a'
2222 lea eax, [2*eax + eax + 1] // a' = 3 a' + 1
2227 #elif defined(__arm__)
2229 // rbit introduced in armv7
2232 mov r0, r0, lsr r2 // a = 2^c a'
2237 adcne r0, r0, r0, lsl #1 // a' = 3 a' + 1 (because c set)
2242 #elif defined(__aarch64__)
2246 lsr w0, w0, w2 // a = 2^c a'
2253 add w16, w0, w0, lsl #1 // t = 3 a'
2254 csinc.eq w0, w0, w16 // a' = a = 1 ? a : 3 a' + 1
2265 ///--------------------------------------------------------------------------
2270 // calculate 1337 a slowly
2272 #if defined(__x86_64__)
2275 mov rcx, rax // c = a
2276 shl rcx, 2 // c = 4 a
2277 add rcx, rax // c = 5 a
2278 shl rcx, 3 // c = 40 a
2279 add rcx, rax // c = 41 a
2280 shl rcx, 1 // c = 82 a
2281 add rcx, rax // c = 83 a
2282 shl rcx, 1 // c = 166 a
2283 add rcx, rax // c = 167 a
2284 shl rcx, 3 // c = 1336 a
2285 add rcx, rax // c = 1337 a
2288 lea rdx, [2*rax + rax] // t = 3 a
2289 shl rdx, 6 // t = 192 a
2290 sub rdx, rax // t = 191 a
2291 lea rbx, [8*rdx] // b = 1528 a
2292 sub rbx, rdx // b = 1337 a
2294 #elif defined(__i386__)
2297 mov ecx, eax // c = a
2298 shl ecx, 2 // c = 4 a
2299 add ecx, eax // c = 5 a
2300 shl ecx, 3 // c = 40 a
2301 add ecx, eax // c = 41 a
2302 shl ecx, 1 // c = 82 a
2303 add ecx, eax // c = 83 a
2304 shl ecx, 1 // c = 166 a
2305 add ecx, eax // c = 167 a
2306 shl ecx, 3 // c = 1336 a
2307 add ecx, eax // c = 1337 a
2310 lea edx, [2*eax + eax] // t = 3 a
2311 shl edx, 6 // t = 192 a
2312 sub edx, eax // t = 191 a
2313 lea ebx, [8*edx] // b = 1528 a
2314 sub ebx, edx // b = 1337 a
2316 #elif defined(__arm__)
2318 // original version, ish
2319 add r2, r0, r0, lsl #2 // c = 5 a
2320 add r2, r0, r2, lsl #3 // c = 41 a
2321 add r2, r0, r2, lsl #1 // c = 83 a
2322 add r2, r0, r2, lsl #1 // c = 167 a
2323 add r2, r0, r2, lsl #3 // c = 1337 a
2326 add r1, r0, r0, lsl #1 // b = 3 a
2327 rsb r1, r0, r1, lsl #6 // b = 191 a
2328 rsb r1, r1, r1, lsl #3 // b = 1337 a
2330 #elif defined(__aarch64__)
2332 // original version, ish
2333 add x2, x0, x0, lsl #2 // c = 5 a
2334 add x2, x0, x2, lsl #3 // c = 41 a
2335 add x2, x0, x2, lsl #1 // c = 83 a
2336 add x2, x0, x2, lsl #1 // c = 167 a
2337 add x2, x0, x2, lsl #3 // c = 1337 a
2339 // sleazy because no rsb
2340 add x1, x0, x0, lsl #1 // b = 3 a
2341 sub x1, x0, x1, lsl #6 // b = -191 a
2342 sub x1, x1, x1, lsl #3 // b = 1337 a
2354 // multiply complex numbers a + b i and c + d i
2356 // (a + b i) (c + d i) = (a c - b d) + (a d + b c) i
2358 // somewhat slick approach uses only three multiplications
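// in c, the three-multiplication version is (a sketch):
//
//	t = c*(a + b); im = t + a*(d - c); re = t - b*(c + d);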
2360 #if defined(__x86_64__)
2362 mov rsi, rax // t = a
2363 add rax, rbx // a' = a + b
2364 mov rdi, rdx // u = d
2365 sub rdx, rcx // d' = d - c
2366 add rdi, rcx // u = c + d
2368 imul rax, rcx // a' = c (a + b)
2369 imul rsi, rdx // t = a (d - c)
2370 imul rdi, rbx // u = b (c + d)
2372 add rsi, rax // t = a (d - c) + c (a + b)
2373 mov rbx, rsi // b' = a (d - c) + c (a + b)
2375 sub rax, rdi // a' = c (a + b) - b (c + d)
2378 #elif defined(__i386__)
2380 mov esi, eax // t = a
2381 add eax, ebx // a' = a + b
2382 mov edi, edx // u = d
2383 sub edx, ecx // d' = d - c
2384 add edi, ecx // u = c + d
2386 imul eax, ecx // a' = c (a + b)
2387 imul esi, edx // t = a (d - c)
2388 imul edi, ebx // u = b (c + d)
2390 add esi, eax // t = a (d - c) + c (a + b)
2391 mov ebx, esi // b' = a (d - c) + c (a + b)
2393 sub eax, edi // a' = c (a + b) - b (c + d)
2396 #elif defined(__arm__)
2398 add r4, r0, r1 // t = a + b
2399 add r5, r2, r3 // u = c + d
2400 sub r3, r3, r2 // d' = d - c
2402 // mls introduced in armv7
2403 mul r4, r4, r2 // t = c (a + b)
2404 mov r2, r1 // c' = b (bah!)
2405 mla r1, r0, r3, r4 // b' = a (d - c) + c (a + b)
2407 mls r0, r2, r5, r4 // a' = c (a + b) - b (c + d)
2410 #elif defined(__aarch64__)
2412 add x4, x0, x1 // t = a + b
2413 add x5, x2, x3 // u = c + d
2414 sub x3, x3, x2 // d' = d - c
2416 // a64 spells mla and mls as madd and msub
2417 mul x4, x4, x2 // t = c (a + b)
2418 mov x2, x1 // c' = b (bah!)
2419 madd x1, x0, x3, x4 // b' = a (d - c) + c (a + b)
2421 msub x0, x2, x5, x4 // a' = c (a + b) - b (c + d)
2436 #if defined(__x86_64__)
2438 mov rdx, 0xaaaaaaaaaaaaaaab // = ceil(2/3 2^64)
2439 mul rdx // d' || a' =~ 2/3 a 2^64
2440 shr rdx, 1 // d' = floor(a/3)
2441 mov rax, rdx // a' = floor(a/3)
2443 // we start with 0 <= a < 2^64. write f = ceil(2/3 2^64), so that
2444 // 2/3 < f/2^64 < 2/3 + 1/2^64. then floor(2/3 a) <= floor(a f/2^64)
2445 // <= floor(2/3 a + a/2^64), but a < 2^64 so a/2^64 < 1 and
2446 // floor(a f/2^64) = floor(2/3 a).
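// e.g., a = 5: a f = (10 2^64 + 5)/3, so d' = floor(a f/2^64) = 3, and
// 3 >> 1 = 1 = floor(5/3).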
2448 #elif defined(__i386__)
2450 mov edx, 0xaaaaaaab // = ceil(2/3 2^32)
2451 mul edx // d' || a' =~ 2/3 a 2^32
2452 shr edx, 1 // d' = floor(a/3)
2453 mov eax, edx // a' = floor(a/3)
2455 #elif defined(__arm__)
2457 ldr r12, =0xaaaaaaab
2458 umull r12, r0, r0, r12
2461 #elif defined(__aarch64__)
2463 ldr x16, =0xaaaaaaaaaaaaaaab
2477 #if defined(__x86_64__)
2479 // main loop: shorten a preserving residue class mod 3
2483 mov rdx, rax // d' = a
2484 shr rdx, 2 // d' = floor(a/4)
2485 and rax, 3 // a = 4 d' + a' (0 <= a' < 4)
2486 add rax, rdx // a' == a (mod 3) but a' < a/4 + 4
2489 // fix up final value 0 <= a < 6: want 0 <= a < 3
2491 // the tricky part is actually a = 3; but the other final cases take
2492 // additional iterations which we can avoid.
2493 8: cmp rax, 3 // set cf iff a < 3
2494 cmc // set cf iff a >= 3
2495 sbb rdx, rdx // d' = a >= 3 ? -1 : 0
2496 and rdx, 3 // d' = a >= 3 ? 3 : 0
2497 sub rax, rdx // a' = a - (a >= 3 ? 3 : 0)
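// gathering loop and fix-up up in c (a sketch; the exit threshold
// follows from the fix-up handling 0 <= a < 6):
//
//	while (a > 5) a = (a >> 2) + (a & 3);
//	if (a >= 3) a -= 3;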
2500 #elif defined(__i386__)
2502 // main loop: shorten a preserving residue class mod 3
2506 mov edx, eax // d' = a
2507 shr edx, 2 // d' = floor(a/4)
2508 and eax, 3 // a = 4 d' + a' (0 <= a' < 4)
2509 add eax, edx // a' == a (mod 3) but a' < a/4 + 4
2512 // fix up final value 0 <= a < 6: want 0 <= a < 3
2514 // the tricky part is actually a = 3; but the other final cases take
2515 // additional iterations which we can avoid.
2516 8: cmp eax, 3 // set cf iff a < 3
2517 cmc // set cf iff a >= 3
2518 sbb edx, edx // d' = a >= 3 ? -1 : 0
2519 and edx, 3 // d' = a >= 3 ? 3 : 0
2520 sub eax, edx // a' = a - (a >= 3 ? 3 : 0)
2523 #elif defined(__arm__)
2527 addhs r0, r12, r0, lsr #2
2533 #elif defined(__aarch64__)
2536 // blunder on through regardless since this doesn't affect the result
2538 add x0, x16, x0, lsr #2
2554 // invert (odd) a mod 2^64
2556 // suppose a a_i == 1 (mod 2^{2^i})
2558 // clearly good for i = 0, since 2^i = 1 and 2^{2^i} = 2, and a_0 =
2559 // a == 1 (mod 2) by assumption
2561 // write a a_i == b_i 2^{2^i} + 1 (mod 2^{2^{i+1}})
2562 // then b_i == (a a_i - 1)/2^{2^i} (mod 2^{2^i})
2563 // to lift inverse, we want x such that a x == -b_i (mod 2^{2^i});
2564 // clearly x = -a_i b_i will do, since a a_i == 1 (mod 2^{2^i})
2566 // a_{i+1} = a_i - a_i b_i 2^{2^i} = a_i (1 - (a a_i - 1))
2567 // = 2 a_i - a a_i^2
2570 // a a_{i+1} = 2 a a_i - a^2 a_i^2
2571 // == 2 a a_i - (b_i 2^{2^i} + 1)^2
2572 // == 2 (b_i 2^{2^i} + 1) -
2573 // (b_i^2 2^{2^{i+1}} + 2 b_i 2^{2^i} + 1)
2574 // == 1 (mod 2^{2^{i+1}})
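// all of which boils down to a very short c loop (a sketch): a_0 = a
// is already correct mod 8, since odd squares are 1 mod 8, and five
// doublings of precision reach 64 bits:
//
//	x = a; for (i = 0; i < 5; i++) x *= 2 - a*x;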
2576 #if defined(__x86_64__)
2579 mov rbx, rax // b' = a
2580 mov rsi, rax // t = a_0
2588 mul rbx // a' = a a_i
2589 mov rcx, rax // c = a a_i
2591 sub rax, 2 // a' = a a_i - 2
2592 neg rax // a' = 2 - a a_i
2593 mul rsi // a_{i+1} = a_i (2 - a a_i)
2594 // = 2 a_i - a a_i^2
2595 mov rsi, rax // t = a_{i+1}
2598 ja 0b // no -- iterate
2600 #elif defined(__i386__)
2603 mov ebx, eax // b' = a
2604 mov esi, eax // t = a_0
2612 mul ebx // a' = a a_i
2613 mov ecx, eax // c = a a_i
2615 sub eax, 2 // a' = a a_i - 2
2616 jb 9f // done if < 2
2617 neg eax // a' = 2 - a a_i
2618 mul esi // a_{i+1} = a_i (2 - a a_i)
2619 // = 2 a_i - a a_i^2
2620 mov esi, eax // t = a_{i+1}
2622 jmp 0b // and iterate
2623 9: mov eax, esi // restore
2625 #elif defined(__arm__)
2628 mov r1, r0 // b' = a
2634 mul r2, r0, r1 // c = a a_i
2635 rsbs r2, r2, #2 // c = 2 - a a_i
2636 mul r0, r0, r2 // a_{i+1} = a_i (2 - a a_i)
2637 // = 2 a_i - a a_i^2
2640 #elif defined(__aarch64__)
2643 mov x1, x0 // b' = a
2644 mov x16, #2 // because we have no rsb
2652 mul x2, x0, x1 // c = a a_i
2653 subs x2, x16, x2 // c = 2 - a a_i
2654 mul x0, x0, x2 // a_{i+1} = a_i (2 - a a_i)
2655 // = 2 a_i - a a_i^2
2668 // a poor approximation to pi/4
2670 // think of x and y as being in 16.16 fixed-point format. we sample
2671 // points in the unit square, and determine how many of them are
2672 // within a unit quarter-circle centred at the origin. the area of
2673 // the quarter-circle is pi/4.
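// in c, the idea is (a sketch): for each of the 2^32 pairs (x, y) of
// 16-bit halves, count how often x^2 + y^2 < 2^32; then pi/4 =~
// count/2^32.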
2675 #if defined(__x86_64__)
2677 xor eax, eax // a = 0
2679 shl rcx, 0x20 // c =~ 4 billion
2681 0: movzx rbx, cx // x = low 16 bits of c
2682 imul rbx, rbx // b = x^2
2684 ror rcx, 0x10 // switch halves of c
2685 movzx rdx, cx // y = high 16 bits of c
2686 imul rdx, rdx // d = y^2
2687 rol rcx, 0x10 // switch back
2689 add rbx, rdx // r^2 = x^2 + y^2
2690 shr rbx, 0x20 // r^2 >= 1?
2691 cmp rbx, 1 // set cf iff r^2 < 1
2692 adc rax, 0 // and add onto accumulator
2695 #elif defined(__i386__)
2697 // this is actually better done in 32 bits. the carry has the wrong
2698 // sense here, so instead deduct one for each point outside the
2699 // quarter-circle rather than adding one for each point inside it.
2711 add ebx, edx // see?
2715 #elif defined(__arm__)
2720 0: uxth r1, r2, ror #0
2721 uxth r3, r2, ror #16
2724 cmn r1, r3 // mlas doesn't set cf usefully
2729 #elif defined(__aarch64__)
2734 0: ubfx w1, w2, #0, #16
2735 ubfx w3, w2, #16, #16
2753 // a bad way to rotate a right by 7 places
2755 #if defined(__x86_64__)
2758 ror rbx, 7 // better
2760 mov rdx, rax // d' = a
2761 shr rax, 7 // a' = a >> 7
2762 shl rdx, 0x39 // d' = a << 57
2763 or rax, rdx // a' = a >>> 7
2765 #elif defined(__i386__)
2768 ror ebx, 7 // better
2770 mov edx, eax // d' = a
2771 shr eax, 7 // a' = a >> 7
2772 shl edx, 0x19 // d' = a << 25
2773 or eax, edx // a' = a >>> 7
2775 #elif defined(__arm__)
2777 mov r1, r0, ror #7 // easy way
2779 // even the hard way is fairly easy on arm
2781 orr r0, r3, r0, lsr #7 // hard way
2783 #elif defined(__aarch64__)
2785 ror x1, x0, #7 // easy way
2787 // even the hard way is fairly easy on arm
2789 orr x0, x3, x0, lsr #7 // hard way
2801 // shift a right by c places, in two halves
2803 #if defined(__x86_64__)
2805 mov ch, cl // c' = [c, c]
2806 inc ch // c' = [c, c + 1]
2808 shr cl, 1 // c' = [floor(c/2), ceil(c/2)]
2813 #elif defined(__i386__)
2815 mov ch, cl // c' = [c, c]
2816 inc ch // c' = [c, c + 1]
2818 shr cl, 1 // c' = [floor(c/2), ceil(c/2)]
2823 #elif defined(__arm__)
2825 // it would be clearer and more efficient to say: `mov r12, r2, lsr
2826 // #1; sub r2, r2, r12', but that's not the lesson this exercise is
2830 mov r12, r12, lsr #1
2834 #elif defined(__aarch64__)
2852 #if defined(__x86_64__)
2856 #elif defined(__i386__)
2860 #elif defined(__arm__)
2864 #elif defined(__aarch64__)
2876 #if defined(__x86_64__)
2880 #elif defined(__i386__)
2884 #elif defined(__arm__)
2888 #elif defined(__aarch64__)
2900 #if defined(__x86_64__)
2904 #elif defined(__i386__)
2908 #elif defined(__arm__)
2912 #elif defined(__aarch64__)
2924 #if defined(__x86_64__)
2928 #elif defined(__i386__)
2932 #elif defined(__arm__)
2936 #elif defined(__aarch64__)
2948 #if defined(__x86_64__)
2952 #elif defined(__i386__)
2956 #elif defined(__arm__)
2960 #elif defined(__aarch64__)
2972 #if defined(__x86_64__)
2976 #elif defined(__i386__)
2980 #elif defined(__arm__)
2984 #elif defined(__aarch64__)
2996 #if defined(__x86_64__)
3000 #elif defined(__i386__)
3004 #elif defined(__arm__)
3008 #elif defined(__aarch64__)
3020 #if defined(__x86_64__)
3024 #elif defined(__i386__)
3028 #elif defined(__arm__)
3032 #elif defined(__aarch64__)
3042 ///--------------------------------------------------------------------------
3047 #if defined(__x86_64__)
3051 #elif defined(__i386__)
3055 #elif defined(__arm__)
3059 #elif defined(__aarch64__)
3073 #if defined(__x86_64__)
3077 #elif defined(__i386__)
3081 #elif defined(__arm__)
3085 #elif defined(__aarch64__)
3097 #if defined(__x86_64__)
3101 #elif defined(__i386__)
3105 #elif defined(__arm__)
3109 #elif defined(__aarch64__)
3121 #if defined(__x86_64__)
3125 #elif defined(__i386__)
3129 #elif defined(__arm__)
3133 #elif defined(__aarch64__)
3145 #if defined(__x86_64__)
3149 #elif defined(__i386__)
3153 #elif defined(__arm__)
3157 #elif defined(__aarch64__)
3169 #if defined(__x86_64__)
3173 #elif defined(__i386__)
3177 #elif defined(__arm__)
3181 #elif defined(__aarch64__)
3193 #if defined(__x86_64__)
3197 #elif defined(__i386__)
3201 #elif defined(__arm__)
3205 #elif defined(__aarch64__)
3217 #if defined(__x86_64__)
3221 #elif defined(__i386__)
3225 #elif defined(__arm__)
3229 #elif defined(__aarch64__)
3241 #if defined(__x86_64__)
3245 #elif defined(__i386__)
3249 #elif defined(__arm__)
3253 #elif defined(__aarch64__)
3265 #if defined(__x86_64__)
3269 #elif defined(__i386__)
3273 #elif defined(__arm__)
3277 #elif defined(__aarch64__)
3289 #if defined(__x86_64__)
3293 #elif defined(__i386__)
3297 #elif defined(__arm__)
3301 #elif defined(__aarch64__)
3313 #if defined(__x86_64__)
3317 #elif defined(__i386__)
3321 #elif defined(__arm__)
3325 #elif defined(__aarch64__)
3337 #if defined(__x86_64__)
3341 #elif defined(__i386__)
3345 #elif defined(__arm__)
3349 #elif defined(__aarch64__)
3361 #if defined(__x86_64__)
3365 #elif defined(__i386__)
3369 #elif defined(__arm__)
3373 #elif defined(__aarch64__)
3385 #if defined(__x86_64__)
3389 #elif defined(__i386__)
3393 #elif defined(__arm__)
3397 #elif defined(__aarch64__)
3409 #if defined(__x86_64__)
3413 #elif defined(__i386__)
3417 #elif defined(__arm__)
3421 #elif defined(__aarch64__)
3431 ///----- That's all, folks --------------------------------------------------