/// -*- mode: asm; asm-comment-char: 0 -*-
///--------------------------------------------------------------------------
#include <sys/syscall.h>
#if defined(__i386__) || defined(__x86_64__)
	.intel_syntax noprefix
#elif defined(__arm__)
#elif defined(__aarch64__)
	.macro	cmov	rd, rn, cc
	csel	\rd, \rn, \rd, \cc
	_(eq) _(ne) _(cs) _(cc) _(vs) _(vc) _(mi) _(pl) \
	_(ge) _(lt) _(gt) _(le) _(hi) _(ls) _(al) _(nv) \
	_(csinc) _(cinc) _(cset) \
	_(csinv) _(cinv) _(csetm)
#define _CONDVAR(cc) _definstvar cc;
#define _INSTVARS(inst) \
	.macro _definstvar cc; \
	.macro inst.\cc args:vararg; inst \args, \cc; .endm; \
#define CCMP_MI CCMP_N
#define CCMP_EQ CCMP_Z
#define CCMP_CS CCMP_C
#define CCMP_HS CCMP_C
#define CCMP_VS CCMP_V
#define CCMP_HI CCMP_C
#define CCMP_LT CCMP_N
#define CCMP_LE CCMP_N
# error "not supported"
	.size	\name, . - \name
	add	ebx, offset _GLOBAL_OFFSET_TABLE_
	mov	eax, [ebx + stdout@GOT]
#elif defined(__x86_64__)
	mov	rdi, [rip + stdout]
#elif defined(__arm__)
	stmfd	r13!, {r0-r4, r12, r14}
	ldr	r14, .L$_c$gotoff$\@
	.word	_GLOBAL_OFFSET_TABLE_ - .L$_c$gotpc$\@ - 8
	ldmfd	r13!, {r0-r4, r12, r14}
#elif defined(__aarch64__)
	stp	x2, x3, [sp, #16]
	stp	x4, x5, [sp, #32]
	stp	x6, x7, [sp, #48]
	stp	x8, x9, [sp, #64]
	stp	x10, x11, [sp, #80]
	stp	x12, x13, [sp, #96]
	stp	x14, x15, [sp, #112]
	stp	x16, x17, [sp, #128]
	stp	x16, x30, [sp, #144]	// x16 again, just to pair with x30
	ldr	x0, [x0, #:got_lo12:stdout]
	ldp	x16, x30, [sp, #144]
	ldp	x16, x17, [sp, #128]
	ldp	x14, x15, [sp, #112]
	ldp	x12, x13, [sp, #96]
	ldp	x10, x11, [sp, #80]
	ldp	x8, x9, [sp, #64]
	ldp	x6, x7, [sp, #48]
	ldp	x4, x5, [sp, #32]
	ldp	x2, x3, [sp, #16]
# error "not supported"
#if defined(__i386__) || defined(__x86_64__)
#elif defined(__arm__)
#elif defined(__aarch64__)
# error "not supported"
	.section .note.GNU-stack, "", %progbits
#if defined(__i386__)
#if defined(__i386__)
	push	edi			// edi, esi, ebx
	push	ebp			// flags, ebp, ..., ebx
	push	esi			// regs, flags, ebp, ..., ebx
	lea	eax, [ebx + 9f - .]
	push	eax			// cont, regs, flags, ebp, ..., ebx
	push	edi			// func, cont, regs, flags, ebp, ..., ebx
	ret				// -> func; regs, flags, ebp, ..., ebx
9:	pushf				// eflags, regs, flags, ebp, ..., ebx
	push	esi			// esi, eflags, regs, flags, ebp, ..., ebx
	pop	eax			// eflags, regs, flags, ebp, ..., ebx
	pop	eax			// regs, flags, ebp, ..., ebx
	add	esp, 4			// flags, ebp, ..., ebx
	popf				// ebp, ..., ebx
#elif defined(__x86_64__)
	push	rbp			// flags, rbp, ..., rbx
	push	rsi			// regs, flags, rbp, ..., rbx
	push	rax			// cont, regs, flags, rbp, ..., rbx
	push	rdi			// func, cont, regs, flags, rbp, ..., rbx
	mov	rax, [rsi + 8*15]
	ret				// -> func; regs, flags, rbp, ..., rbx
9:	pushf				// rflags, regs, flags, rbp, ..., rbx
	push	rsi			// rsi, rflags, regs, flags, rbp, ..., rbx
	pop	rax			// rflags, regs, flags, rbp, ..., rbx
	pop	rax			// regs, flags, rbp, ..., rbx
	add	rsp, 8			// flags, rbp, ..., rbx
	popf				// rbp, ..., rbx
#elif defined(__arm__)
	stmfd	r13!, {r0, r1, r4-r11, r14}
	ldmia	r1, {r0-r12, r14}
	ldmfd	r13!, {r4-r11, pc}
#elif defined(__aarch64__)
	stp	x29, x30, [sp, #-12*8]!	// keep sp 16-byte aligned
	stp	x19, x20, [sp, #16]
	stp	x21, x22, [sp, #32]
	stp	x23, x24, [sp, #48]
	stp	x25, x26, [sp, #64]
	stp	x27, x28, [sp, #80]
	ldp	x14, x15, [x1, #112]
	ldp	x12, x13, [x1, #96]
	ldp	x10, x11, [x1, #80]
	ldp	x8, x9, [x1, #64]
	ldp	x6, x7, [x1, #48]
	ldp	x4, x5, [x1, #32]
	ldp	x2, x3, [x1, #16]
	stp	x14, x15, [x16, #112]
	stp	x12, x13, [x16, #96]
	stp	x10, x11, [x16, #80]
	stp	x8, x9, [x16, #64]
	stp	x6, x7, [x16, #48]
	stp	x4, x5, [x16, #32]
	stp	x2, x3, [x16, #16]
	stp	x0, x1, [x16, #0]
	ldp	x19, x20, [sp, #16]
	ldp	x21, x22, [sp, #32]
	ldp	x23, x24, [sp, #48]
	ldp	x25, x26, [sp, #64]
	ldp	x27, x28, [sp, #80]
	ldp	x29, x30, [sp], #12*8
# error "not supported"
///--------------------------------------------------------------------------
	// clear all 64 bits of extended traditional registers
#if defined(__x86_64__)
	xor	eax, eax		// clear rax
	lea	rbx, [0]		// rbx -> _|_
	loop	.			// iterate, decrement rcx until zero
	mov	rdx, 0			// set rdx = 0
	and	esi, 0			// clear all bits of rsi
	sub	edi, edi		// set rdi = edi - edi = 0
	pop	rbp			// pop 0 into rbp
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
	// advance a fibonacci pair by c steps
	// on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1}
	// and f_{i+c}, where f_{i+1} = f_i + f_{i-1}
#if defined(__x86_64__)
0:	xadd	rax, rdx		// a, d = a + d, a
					//      = f_{i+1} + f_i, f_{i+1}
					//      = f_{i+2}, f_{i+1}
	loop	0b			// advance i, decrement c, iterate
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
	// boolean canonify a: if a = 0 on entry, leave it zero; otherwise
#if defined(__x86_64__)
	neg	rax			// set cf iff a /= 0
	sbb	rax, rax		// a = a - a - cf = -cf
#elif defined(__i386__)
#elif defined(__arm__)
	movs	r1, r0			// the easy way
	movne	r1, #1			// mvnne r1, #1 for mask
	cmp	r0, #1			// clear cf iff a == 0
	sbc	r2, r0, r0		// c' = a - a - 1 + cf = cf - 1
	add	r2, r2, #1		// c' = cf
	sub	r3, r0, r0, lsr #1	// d' top bit clear; d' = 0 iff a = 0
	rsb	r3, r3, #0		// d' top bit set iff a /= 0
	mov	r3, r3, lsr #31		// asr for mask
#elif defined(__aarch64__)
	cmp	x0, #0			// trivial
	cset.ne	x1			// csetm for mask
	cmp	xzr, x0			// set cf iff a == 0
	sbc	x2, x0, x0		// c' = a - a - 1 + cf = cf - 1
	neg	x2, x2			// c' = 1 - cf
	sub	x3, x0, x0, lsr #1	// if a < 2^63 then d' = ceil(a/2) <
					// if a >= 2^63, write a = 2^63 + t
					// with t < 2^63; d' = 2^63 - 2^62 +
					// ceil(t/2) = 2^62 + ceil(t/2), and
					// anyway d' < 2^63 and d' = 0 iff
	neg	x3, x3			// d' top bit set iff a /= 0
	lsr	x3, x3, #63		// asr for mask
	cmp	x0, #1			// set cf iff a /= 0
	adc	x0, xzr, xzr		// a' = 0 + 0 + cf = cf
	// set a = min(a, d) (unsigned); clobber c, d
#if defined(__x86_64__)
	sub	rdx, rax		// d' = d - a; set cf if a > d
	sbb	rcx, rcx		// c = -cf = -[a > d]
	and	rcx, rdx		// c = a > d ? d - a : 0
	add	rax, rcx		// a' = a > d ? d : a
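
	// or the easy way, given cmov:
	cmp	rax, rdx		// compare a against d
	cmova	rax, rdx		// a' = a > d ? d : a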
#elif defined(__i386__)
#elif defined(__arm__)
	cmp	r0, r3			// the easy way
	movlo	r1, r0			// only needed for out-of-place
#elif defined(__aarch64__)
	cmp	x0, x3			// the easy way
	subs	x3, x3, x0		// d' = d - a; set cf if d >= a
	sbc	x16, xzr, xzr		// t = -1 + cf = -[a > d]
	and	x16, x16, x3		// t = a > d ? d - a : 0
	add	x0, x0, x16		// a' = a > d ? d : a
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
	sub	w16, w16, #'a' - 10
	ccmp.hs	w16, #16, #CCMP_HS
	// answer whether 5 <= a </<= 9.
#if defined(__x86_64__)
	sub	rax, 5			// a' = a - 5
	cmp	rax, 4			// is a - 5 </<= 4?
	// nz/ne	a' /= 4			a /= 9
	// a/nbe	a' > 4			a > 9 or a < 5
	// nc/ae/nb	a' >= 4			a >= 9 or a < 5
	// c/b/nae	a' < 4			5 <= a < 9
	// be/na	a' <= 4			5 <= a <= 9
	// o		a' < -2^63 + 4		-2^63 + 5 <= a < -2^63 + 9
	// no		a' >= -2^63 + 4		a >= -2^63 + 9 or
	// s		-2^63 + 4 <= a' < 4	-2^63 + 9 <= a < 9
	// ns		a' < -2^63 + 4 or	a < -2^63 + 9 or a >= 9
	// ge/nl	a' >= 4			a >= 9 or a < -2^63 + 5
	// l/nge	a' < 4			-2^63 + 5 <= a < 9
	// g/nle	a' > 4			a > 9 or a < -2^63 + 5
	// le/ng	a' <= 4			-2^63 + 5 <= a <= 9
#elif defined(__i386__)
#elif defined(__arm__)
	// i dimly remember having a slick way to do this way back in the
	// day, but i can't figure it out any more.
#elif defined(__aarch64__)
	// literal translation is too obvious
	ccmp.hs	x0, #9, #CCMP_HS
	// leave a unchanged, but set zf if a = 0, cf if a /= 0, clear of,
#if defined(__x86_64__)
	not	rax			// a' = -a - 1
#elif defined(__i386__)
#elif defined(__arm__)
	rsbs	r0, r0, #0		// cf has opposite sense
#elif defined(__aarch64__)
	negs	x0, x0			// cf has opposite sense
	// same as before (?)
#if defined(__x86_64__)
	inc	rax			// a' = a + 1
	neg	rax			// a' = -a - 1
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
	negs	x0, x0			// cf has opposite sense
	// floor((a + d)/2), correctly handling overflow conditions; final
	// cf is lsb(a + d), probably uninteresting
#if defined(__x86_64__)
	add	rax, rdx		// cf || a' = a + d
	rcr	rax, 1			// shift 65-bit result right by one
					// place; lsb moves into carry
#elif defined(__i386__)
#elif defined(__arm__)
	// like the two-instruction a64 version
	add	r1, r0, r1, lsr #1
	// the slick version, similar to the above
#elif defined(__aarch64__)
	// a64 lacks a32's rrx.  literal translation.
	adds	x1, x0, x3		// cf || a' = a + d
	adc	x16, xzr, xzr		// realize cf in extra register
	extr	x1, x16, x1, #1		// shift down one place
	// two-instruction version: clobbers an additional register.  (if
	// you wanted the answer in any other register, even overwriting d,
	// then this is unnecessary.)  also depends on d >= a.
	sub	x16, x3, x0		// compute difference
	add	x0, x0, x16, lsr #1	// add half of it (rounded down)
	// a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3
	// (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8).
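	// equivalently, a' = floor((a + 4)/8): the bit shifted out into cf
	// below is bit 2 of a, set exactly when a == 4, 5, 6, 7 (mod 8)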
#if defined(__x86_64__)
	shr	rax, 3			// a' = floor(a/8); cf = 1 if a ==
					// 4, 5, 6, 7 (mod 8)
	adc	rax, 0			// a' = floor(a/8) + cf
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
	orr	x0, xzr, x0, lsr #3
	// increment c-byte little-endian bignum at rdi
#if defined(__x86_64__)
	add	byte ptr [rdi], 1
	adc	byte ptr [rdi], 0
#elif defined(__i386__)
	add	byte ptr [edi], 1
	adc	byte ptr [edi], 0
#elif defined(__arm__)
	mov	r12, #256		// set initial carry
	add	r12, r0, r12, lsr #8
#elif defined(__aarch64__)
	mov	w17, #256		// set initial carry
	add	w17, w16, w17, lsr #8
	// negate double-precision d:a
#if defined(__x86_64__)
	not	rdx			// d' = -d - 1
	sbb	rdx, -1			// d' = -d - cf
#elif defined(__i386__)
#elif defined(__arm__)
	// reverse subtract is awesome
#elif defined(__aarch64__)
	// easy way: everything is better with zero registers.
	// rotate is distributive over xor.
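	// (rotation permutes bit positions, and xor works bitwise, so
	// combine-then-rotate and rotate-then-combine give the same bits.)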
#if defined(__x86_64__)
	// rax				// = a_1 || a_0
	// rbx				// = b_1 || b_0
	mov	rcx, rax		// = a_1 || a_0
	xor	rcx, rbx		// = (a_1 XOR b_1) || (a_0 XOR b_0)
	ror	rcx, 0xd		// = (a_0 XOR b_0) || (a_1 XOR b_1)
	ror	rax, 0xd		// = a_0 || a_1
	ror	rbx, 0xd		// = b_0 || b_1
	xor	rax, rbx		// = (a_0 XOR b_0) || (a_1 XOR b_1)
	cmp	rax, rcx		// always equal
#elif defined(__i386__)
	mov	ecx, eax		// = a_1 || a_0
	xor	ecx, ebx		// = (a_1 XOR b_1) || (a_0 XOR b_0)
	ror	ecx, 0xd		// = (a_0 XOR b_0) || (a_1 XOR b_1)
	ror	eax, 0xd		// = a_0 || a_1
	ror	ebx, 0xd		// = b_0 || b_1
	xor	eax, ebx		// = (a_0 XOR b_0) || (a_1 XOR b_1)
	cmp	eax, ecx		// always equal
#elif defined(__arm__)
	// r0				// = a_1 || a_0
	// r1				// = b_1 || b_0
	eor	r2, r0, r1		// = (a_1 XOR b_1) || (a_0 XOR b_0)
	mov	r2, r2, ror #13		// = (a_0 XOR b_0) || (a_1 XOR b_1)
	mov	r1, r1, ror #13		// = b_0 || b_1
	eor	r0, r1, r0, ror #13	// = (a_0 XOR b_0) || (a_1 XOR b_1)
	cmp	r0, r2			// always equal
#elif defined(__aarch64__)
	// x0				// = a_1 || a_0
	// x1				// = b_1 || b_0
	eor	x2, x0, x1		// = (a_1 XOR b_1) || (a_0 XOR b_0)
	ror	x2, x2, #13		// = (a_0 XOR b_0) || (a_1 XOR b_1)
	ror	x1, x1, #13		// = b_0 || b_1
	eor	x0, x1, x0, ror #13	// = (a_0 XOR b_0) || (a_1 XOR b_1)
	cmp	x0, x2			// always equal
	// and is distributive over xor.
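	// (bit by bit: where a is 0 both sides are 0; where a is 1 both
	// sides are just b XOR c.)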
#if defined(__x86_64__)
	xor	rbx, rcx		// = b XOR c
	and	rbx, rax		// = a AND (b XOR c)
	and	rdx, rax		// = a AND b
	and	rax, rcx		// = a AND c
	xor	rax, rdx		// = (a AND b) XOR (a AND c)
					// = a AND (b XOR c)
	cmp	rax, rbx		// always equal
#elif defined(__i386__)
	xor	ebx, ecx		// = b XOR c
	and	ebx, eax		// = a AND (b XOR c)
	and	edx, eax		// = a AND b
	and	eax, ecx		// = a AND c
	xor	eax, edx		// = (a AND b) XOR (a AND c)
					// = a AND (b XOR c)
	cmp	eax, ebx		// always equal
#elif defined(__arm__)
	and	r3, r0, r1		// = a AND b
	eor	r1, r1, r2		// = b XOR c
	and	r1, r1, r0		// = a AND (b XOR c)
	and	r0, r0, r2		// = a AND c
	eor	r0, r0, r3		// = (a AND b) XOR (a AND c)
					// = a AND (b XOR c)
	cmp	r0, r1			// always equal
#elif defined(__aarch64__)
	and	x3, x0, x1		// = a AND b
	eor	x1, x1, x2		// = b XOR c
	and	x1, x1, x0		// = a AND (b XOR c)
	and	x0, x0, x2		// = a AND c
	eor	x0, x0, x3		// = (a AND b) XOR (a AND c)
					// = a AND (b XOR c)
	cmp	x0, x1			// always equal
#if defined(__x86_64__)
	and	rcx, rbx		// = a AND b
	not	rcx			// = NOT (a AND b)
	or	rax, rbx		// = (NOT a) OR (NOT b)
	cmp	rax, rcx		// always equal
#elif defined(__i386__)
	and	ecx, ebx		// = a AND b
	not	ecx			// = NOT (a AND b)
	or	eax, ebx		// = (NOT a) OR (NOT b)
	cmp	eax, ecx		// always equal
#elif defined(__arm__)
	and	r2, r0, r1		// = a AND b
	mvn	r2, r2			// = NOT (a AND b)
	mvn	r0, r0			// = NOT a
	mvn	r1, r1			// = NOT b
	orr	r0, r0, r1		// = (NOT a) OR (NOT b)
	cmp	r0, r2			// always equal
#elif defined(__aarch64__)
	and	x2, x0, x1		// = a AND b
	mvn	x2, x2			// = NOT (a AND b)
	mvn	x0, x0			// = NOT a
	orn	x0, x0, x1		// = (NOT a) OR (NOT b)
	cmp	x0, x2			// always equal
	// replace input buffer bytes with cumulative XORs with initial a;
	// final a is XOR of all buffer bytes and initial a.
	// not sure why you'd do this.
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
///--------------------------------------------------------------------------
	// four different ways to swap a pair of registers.
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
	stmfd	r13!, {r0, r2}
	rsb	r0, r0, r2		// don't need 3-addr with reverse-sub
#elif defined(__aarch64__)
	// anything you can do
	stp	x0, x2, [sp, #-16]!
	ldp	x2, x0, [sp], #16
	// the add/sub/add thing was daft.  you can do it in three if you're
	// clever -- and have three-address operations.
	// but we lack a fourth.  we can't do this in fewer than three
	// instructions without hitting memory.  only `ldp' will modify two
	// registers at a time, so we need at least two instructions -- but
	// if the first one sets one of our two registers to its final value
	// then we lose the other input value with no way to recover it, so
	// we must either write a fresh third register, or write something
	// other than the final value, and in both cases we need a third
	// instruction to fix everything up.  we've done the wrong-something-
	// other trick twice, so here's the captain-obvious use-a-third-
	// register version.
	// assuming a is initialized to zero, set a to the inclusive or of
	// the xor-differences of corresponding bytes in the c-byte strings
	// in particular, a will be zero (and zf set) if and only if the two
	// strings are equal.
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
0:	ldrb	r1, [r4], #1
#elif defined(__aarch64__)
0:	ldrb	w16, [x4], #1
	// an obtuse way of adding two registers.  for any bit position, a
	// OR d is set if and only if at least one of a and d has a bit set
	// in that position, and a AND d is set if and only if both have a
	// bit set in that position.  essentially, then, what we've done is
	// move all of the set bits in d to a, unless there's already a bit
	// there.  this clearly doesn't change the sum.
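	// in other words, a + d = (a OR d) + (a AND d): the or collects a
	// bit from either operand, and the and keeps the second copy of
	// each bit that appeared in both.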
#if defined(__x86_64__)
	mov	rcx, rdx		// c' = d
	and	rdx, rax		// d' = a AND d
	or	rax, rcx		// a' = a OR d
#elif defined(__i386__)
	mov	ecx, edx		// c' = d
	and	edx, eax		// d' = a AND d
	or	eax, ecx		// a' = a OR d
#elif defined(__arm__)
	and	r2, r0, r3		// c' = a AND d
	orr	r0, r0, r3		// a' = a OR d
#elif defined(__aarch64__)
	and	x2, x0, x3		// c' = a AND d
	orr	x0, x0, x3		// a' = a OR d
	// ok, so this is a really obtuse way of adding a and b; the result
	// is in a and d.  but why does it work?
#if defined(__x86_64__)
	mov	rcx, 0x40		// carry chains at most 64 long
0:	mov	rdx, rax		// copy a'
	xor	rax, rbx		// low bits of each bitwise sum
	and	rbx, rdx		// carry bits from each bitwise sum
	shl	rbx, 1			// carry them into next position
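	// this works because a + b = (a XOR b) + 2 (a AND b): each pass
	// leaves the partial sum in a and pushes the carries one place up
	// in b, so b must hit zero within a machine word's worth of steps.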
#elif defined(__i386__)
	mov	ecx, 0x20		// carry chains at most 32 long
0:	mov	edx, eax		// copy a'
	xor	eax, ebx		// low bits of each bitwise sum
	and	ebx, edx		// carry bits from each bitwise sum
	shl	ebx, 1			// carry them into next position
#elif defined(__arm__)
#elif defined(__aarch64__)
	// floor((a + d)/2), like x08.
#if defined(__x86_64__)
	mov	rcx, rax		// copy a for later
	and	rcx, rdx		// carry bits
	xor	rax, rdx		// low bits of each bitwise sum
	shr	rax, 1			// divide by 2; carries now in place
	add	rax, rcx		// add the carries; done
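	// again a + d = (a XOR d) + 2 (a AND d): halve the xor part and
	// add back the carries, and nothing ever overflows.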
#elif defined(__i386__)
	mov	ecx, eax		// copy a for later
	and	ecx, edx		// carry bits
	xor	eax, edx		// low bits of each bitwise sum
	shr	eax, 1			// divide by 2; carries now in place
	add	eax, ecx		// add the carries; done
#elif defined(__arm__)
	add	r0, r2, r0, lsr #1
#elif defined(__aarch64__)
	add	x0, x2, x0, lsr #1
	// sign extension 32 -> 64 bits.
#if defined(__x86_64__)
	movsx	rbx, eax		// like this?
	mov	rdx, 0xffffffff80000000
	add	rax, rdx		// if bit 31 of a is set then bits
					// 31--63 of a' are clear; otherwise,
					// these bits are all set -- which is
					// exactly backwards
	xor	rax, rdx		// so fix it
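	// the same identity backwards: for zero-extended 32-bit a,
	// sxt(a) = (a XOR 0x80000000) - 0x80000000.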
#elif defined(__i386__)
	movsx	ebx, ax			// like this?
	add	eax, edx		// if bit 15 of a is set then bits
					// 15--31 of a' are clear; otherwise,
					// these bits are all set -- which is
					// exactly backwards
	xor	eax, edx		// so fix it
#elif defined(__arm__)
	sxth	r1, r0			// like this
	mov	r12, #0x80000000
	add	r0, r0, r12, asr #16
	eor	r0, r0, r12, asr #16
#elif defined(__aarch64__)
	sxtw	x1, w0			// like this
	mov	x16, #0xffffffff80000000
	// ??? i don't know why you'd want to calculate this.
#if defined(__x86_64__)
	xor	rax, rbx		// a' = a XOR b
	xor	rbx, rcx		// b' = b XOR c
	mov	rsi, rax		// t = a XOR b
	add	rsi, rbx		// t = (a XOR b) + (b XOR c)
	cmovc	rax, rbx		// a' = cf ? b XOR c : a XOR b
	xor	rax, rbx		// a' = cf ? 0 : a XOR c
#elif defined(__i386__)
	xor	eax, ebx		// a' = a XOR b
	xor	ebx, ecx		// b' = b XOR c
	mov	esi, eax		// t = a XOR b
	add	esi, ebx		// t = (a XOR b) + (b XOR c)
	cmovc	eax, ebx		// a' = cf ? b XOR c : a XOR b
	xor	eax, ebx		// a' = cf ? 0 : a XOR c
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
	cqo				// d = a < 0 ? -1 : 0
	xor	rax, rdx		// a' = a < 0 ? -a - 1 : a
	sub	rax, rdx		// a' = a < 0 ? -a : a
#elif defined(__i386__)
	cdq				// d = a < 0 ? -1 : 0
	xor	eax, edx		// a' = a < 0 ? -a - 1 : a
	sub	eax, edx		// a' = a < 0 ? -a : a
#elif defined(__arm__)
	// faithful-ish conversion
	eor	r3, r0, r0, asr #31
	sub	r0, r3, r0, asr #31
#elif defined(__aarch64__)
	// faithful-ish conversion
	eor	x3, x0, x0, asr #63
	sub	x0, x3, x0, asr #63
	// should always set sf, clear zf, unless we get rescheduled to a
#if defined(__x86_64__)
	rdtsc				// d || a = cycles
	or	rax, rdx		// a = cycles
	mov	rcx, rax		// c = cycles
	rdtsc				// d || a = cycles'
	or	rax, rdx		// a = cycles'
#elif defined(__i386__)
	rdtsc				// d || a = cycles
	mov	ecx, edx		// c || b = cycles
	rdtsc				// d || a = cycles'
#elif defined(__arm__)
	// cycle clock not available in user mode
	mrrc	p15, 0, r0, r1, c9
	mrrc	p15, 0, r2, r3, c9
#elif defined(__aarch64__)
	// cycle clock not available in user mode
	// stupid way to capture a pointer to inline data and jump past it.
	// confuses the return-address predictor something chronic.  worse
	// because the amd64 calling convention doesn't usually pass
	// arguments on the stack.
#if defined(__x86_64__)
	.string	"hello world!\n\0"
	// actually implement this ridiculous thing
0:	mov	al, [rsi + rdx]
	syscall				// clobbers r11 :-(
#elif defined(__i386__)
	.string	"hello world!\n\0"
	// actually implement this ridiculous thing
0:	mov	al, [ecx + edx]
#elif defined(__arm__)
	// why am i doing this?
	.string	"hello world!\n\0"
8:	mov	r1, r14			// might as well make it easy on myself
0:	ldrb	r0, [r1, r2]
#elif defined(__aarch64__)
	// why am i doing this?
	str	x30, [sp, #-16]!
	.string	"hello world!\n\0"
8:	mov	x1, x30			// might as well make it easy on myself
0:	ldrb	w0, [x1, x2]
	// collect the current instruction-pointer address.  this was an old
	// 32-bit i386 trick for position-independent code, but (a) it
	// confuses the return predictor, and (b) amd64 has true pc-relative
#if defined(__x86_64__)
	// the actual example
	// the modern i386 trick doesn't confuse the return-address
	// but rip-relative addressing is even better
#elif defined(__i386__)
	// the actual example
	// the modern i386 trick doesn't confuse the return-address
#elif defined(__arm__)
	sub	r1, r14, #. - 0b
#elif defined(__aarch64__)
	str	x30, [sp, #-16]!
	// we can do all of the above using a64
	sub	x1, x30, #. - 0b
#if defined(__x86_64__)
	// retpolines: a mitigation against adversarially influenced
	// speculative execution at indirect branches.  if an adversary can
	// prepare a branch-target buffer entry matching an indirect branch
	// in the victim's address space then they can cause the victim to
	// /speculatively/ (but not architecturally) execute any code in
	// their address space, possibly leading to leaking secrets through
	// the cache.  retpolines aren't susceptible to this because the
	// predicted destination address is from the return-prediction
	// stack, which the adversary can't prime.  the performance penalty
	// is still essentially a branch misprediction -- for this return,
	// and possibly all others already stacked.
	// (try not to crash)
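	// the usual construction looks something like this (say the
	// target address is in rax):
	//
	//	call	1f		// push address of the trap loop
	// 0:	pause			// speculation ends up here,
	//	lfence			// penned in harmlessly
	//	jmp	0b
	// 1:	mov	[rsp], rax	// overwrite the return address
	//	ret			// and `return' to the real target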
#elif defined(__i386__)
	lea	eax, [ebx + 9f - .]
#elif defined(__arm__)
#elif defined(__aarch64__)
	str	x30, [sp, #-16]!
8:	ldr	x30, [sp], #16
	// ok, having a hard time seeing a use for this.  the most important
	// thing to note is that sp is set from `pop' /after/ it's
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
	// not even going to dignify this
#elif defined(__aarch64__)
	// not even going to dignify this
	// monumentally cheesy way to copy 8 n bytes from buff1 to buff2.
	// also clobbers words at buff2 + 8 n and buff2 - 8 for good measure.
#if defined(__x86_64__)
	mov	rax, rsp		// safekeeping
	// we're toast if we get hit by a signal now.  fingers crossed...
	mov	rsp, buff2 + 8*n + 8
	mov	rbp, buff1 + 8*n
	lea	rsp, [rdi + 8*n + 16]
	lea	rbp, [rsi + 8*n]
	//	    +---------+		      +---------+
	// rbp ->   |   ???   |	      rsp ->  |   ???   |
	//	    +---------+		      +---------+
	//	    | w_{n-1} |		      |   rbp   | <- rbp'
	//	    +---------+		      +---------+
	//	    |   ...   |		      | w_{n-1} |
	//	    +---------+		      +---------+
	//	    +---------+		      +---------+
	//	    +---------+		      +---------+
#elif defined(__i386__)
	mov	eax, esp		// safekeeping
	// we're toast if we get hit by a signal now.  fingers crossed...
	mov	esp, buff2 + 4*n + 4
	mov	ebp, buff1 + 4*n
	lea	esp, [edi + 4*n + 8]
	lea	ebp, [esi + 4*n]
#elif defined(__arm__)
	add	r5, r5, #4*n + 8
	ldrd	r0, r1, [r4, #-8]!
	strd	r0, r1, [r5, #-8]!
#elif defined(__aarch64__)
	// omgwtf.  let's not actually screw with the stack pointer.
	add	x5, x5, #8*n + 16
	ldp	x16, x17, [x4, #-16]!
	stp	x16, x17, [x5, #-16]!
	// convert nibble value to (uppercase) hex; other input values yield
#if defined(__x86_64__)
	// das doesn't work in 64-bit mode; best i can come up with
#elif defined(__i386__)
	cmp	al, 0x0a		// cf = 1 iff a < 10
	sbb	al, 0x69		// if 0 <= a < 10, a' = a - 0x6a, so
					// 0x96 <= a' < 0xa0, setting af, cf
					// if 10 <= a < 16, a' = a - 0x69, so
					// 0xa1 <= a' < 0xa7, setting cf but
	das				// if 0 <= a < 10, then af and cf are
					// both set, so subtract 0x66
					// from a' leaving 0x30 <= a' < 0x3a;
					// if 10 <= a < 16 then af clear but
					// cf set, so subtract 0x60 from a'
					// leaving 0x41 <= a' < 0x47
#elif defined(__arm__)
	// significantly less tricksy
	addhs	r0, r0, #'A' - 10
#elif defined(__aarch64__)
	// with less versatile conditional execution this is the best we can
	add	w16, w0, #'A' - 10
	// verify collatz conjecture starting at a; assume a /= 0!
#if defined(__x86_64__)
0:	bsf	rcx, rax		// clobber c if a = 0
	shr	rax, cl			// a = 2^c a'
	lea	rax, [2*rax + rax + 1]	// a' = 3 a' + 1
#elif defined(__i386__)
0:	bsf	ecx, eax		// clobber c if a = 0
	shr	eax, cl			// a = 2^c a'
	lea	eax, [2*eax + eax + 1]	// a' = 3 a' + 1
#elif defined(__arm__)
	// rbit introduced in armv7
	mov	r0, r0, lsr r2		// a = 2^c a'
	adcne	r0, r0, r0, lsl #1	// a' = 3 a' + 1 (because c set)
#elif defined(__aarch64__)
	lsr	w0, w0, w2		// a = 2^c a'
	add	w16, w0, w0, lsl #1	// t = 3 a'
	csinc.eq w0, w0, w16		// a' = eq ? a' : t + 1 = 3 a' + 1
///--------------------------------------------------------------------------
	// calculate 1337 a slowly
#if defined(__x86_64__)
	mov	rcx, rax		// c = a
	shl	rcx, 2			// c = 4 a
	add	rcx, rax		// c = 5 a
	shl	rcx, 3			// c = 40 a
	add	rcx, rax		// c = 41 a
	shl	rcx, 1			// c = 82 a
	add	rcx, rax		// c = 83 a
	shl	rcx, 1			// c = 166 a
	add	rcx, rax		// c = 167 a
	shl	rcx, 3			// c = 1336 a
	add	rcx, rax		// c = 1337 a
	lea	rdx, [2*rax + rax]	// t = 3 a
	shl	rdx, 6			// t = 192 a
	sub	rdx, rax		// t = 191 a
	lea	rbx, [8*rdx]		// b = 1528 a
	sub	rbx, rdx		// b = 1337 a
#elif defined(__i386__)
	mov	ecx, eax		// c = a
	shl	ecx, 2			// c = 4 a
	add	ecx, eax		// c = 5 a
	shl	ecx, 3			// c = 40 a
	add	ecx, eax		// c = 41 a
	shl	ecx, 1			// c = 82 a
	add	ecx, eax		// c = 83 a
	shl	ecx, 1			// c = 166 a
	add	ecx, eax		// c = 167 a
	shl	ecx, 3			// c = 1336 a
	add	ecx, eax		// c = 1337 a
	lea	edx, [2*eax + eax]	// t = 3 a
	shl	edx, 6			// t = 192 a
	sub	edx, eax		// t = 191 a
	lea	ebx, [8*edx]		// b = 1528 a
	sub	ebx, edx		// b = 1337 a
#elif defined(__arm__)
	// original version, ish
	add	r2, r0, r0, lsl #2	// c = 5 a
	add	r2, r0, r2, lsl #3	// c = 41 a
	add	r2, r0, r2, lsl #1	// c = 83 a
	add	r2, r0, r2, lsl #1	// c = 167 a
	add	r2, r0, r2, lsl #3	// c = 1337 a
	add	r1, r0, r0, lsl #1	// b = 3 a
	rsb	r1, r0, r1, lsl #6	// b = 191 a
	rsb	r1, r1, r1, lsl #3	// b = 1337 a
#elif defined(__aarch64__)
	// original version, ish
	add	x2, x0, x0, lsl #2	// c = 5 a
	add	x2, x0, x2, lsl #3	// c = 41 a
	add	x2, x0, x2, lsl #1	// c = 83 a
	add	x2, x0, x2, lsl #1	// c = 167 a
	add	x2, x0, x2, lsl #3	// c = 1337 a
	// sleazy because no rsb
	add	x1, x0, x0, lsl #1	// b = 3 a
	sub	x1, x0, x1, lsl #6	// b = -191 a
	sub	x1, x1, x1, lsl #3	// b = 1337 a
	// multiply complex numbers a + b i and c + d i
	// (a + b i) (c + d i) = (a c - b d) + (a d + b c) i
	// somewhat slick approach uses only three multiplications
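	// check: with t = c (a + b),
	//	t - b (c + d) = a c + b c - b c - b d = a c - b d
	//	t + a (d - c) = a c + b c + a d - a c = a d + b c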
#if defined(__x86_64__)
	mov	rsi, rax		// t = a
	add	rax, rbx		// a' = a + b
	mov	rdi, rdx		// u = d
	sub	rdx, rcx		// d' = d - c
	add	rdi, rcx		// u = c + d
	imul	rax, rcx		// a' = c (a + b)
	imul	rsi, rdx		// t = a (d - c)
	imul	rdi, rbx		// u = b (c + d)
	add	rsi, rax		// t = a (d - c) + c (a + b)
	mov	rbx, rsi		// b' = a (d - c) + c (a + b)
	sub	rax, rdi		// a' = c (a + b) - b (c + d)
#elif defined(__i386__)
	mov	esi, eax		// t = a
	add	eax, ebx		// a' = a + b
	mov	edi, edx		// u = d
	sub	edx, ecx		// d' = d - c
	add	edi, ecx		// u = c + d
	imul	eax, ecx		// a' = c (a + b)
	imul	esi, edx		// t = a (d - c)
	imul	edi, ebx		// u = b (c + d)
	add	esi, eax		// t = a (d - c) + c (a + b)
	mov	ebx, esi		// b' = a (d - c) + c (a + b)
	sub	eax, edi		// a' = c (a + b) - b (c + d)
#elif defined(__arm__)
	add	r4, r0, r1		// t = a + b
	add	r5, r2, r3		// u = c + d
	sub	r3, r3, r2		// d' = d - c
	// mls introduced in armv7
	mul	r4, r4, r2		// t = c (a + b)
	mov	r2, r1			// c' = b (bah!)
	mla	r1, r0, r3, r4		// b' = a (d - c) + c (a + b)
	mls	r0, r2, r5, r4		// a' = c (a + b) - b (c + d)
#elif defined(__aarch64__)
	add	x4, x0, x1		// t = a + b
	add	x5, x2, x3		// u = c + d
	sub	x3, x3, x2		// d' = d - c
	// a64 spells mla/mls as madd/msub
	mul	x4, x4, x2		// t = c (a + b)
	mov	x2, x1			// c' = b (bah!)
	madd	x1, x0, x3, x4		// b' = a (d - c) + c (a + b)
	msub	x0, x2, x5, x4		// a' = c (a + b) - b (c + d)
#if defined(__x86_64__)
	mov	rdx, 0xaaaaaaaaaaaaaaab	// = ceil(2/3 2^64)
	mul	rdx			// d' || a' =~ 2/3 a 2^64
	shr	rdx, 1			// d' = floor(a/3)
	mov	rax, rdx		// a' = floor(a/3)
	// we start with 0 <= a < 2^64.  write f = ceil(2/3 2^64), so that
	// 2/3 < f/2^64 < 2/3 + 1/2^64.  then floor(2/3 a) <= floor(a f/2^64)
	// <= floor(2/3 a + a/2^64), but a < 2^64 so a/2^64 < 1 and
	// floor(a f/2^64) = floor(2/3 a).
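	// an 8-bit worked example: f = ceil(2/3 2^8) = 0xab = 171; taking
	// a = 100, a f = 17100, floor(17100/2^8) = 66, and 66 >> 1 = 33 =
	// floor(100/3).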
#elif defined(__i386__)
	mov	edx, 0xaaaaaaab		// = ceil(2/3 2^32)
	mul	edx			// d' || a' =~ 2/3 a 2^32
	shr	edx, 1			// d' = floor(a/3)
	mov	eax, edx		// a' = floor(a/3)
#elif defined(__arm__)
	ldr	r12, =0xaaaaaaab
	umull	r12, r0, r0, r12
#elif defined(__aarch64__)
	ldr	x16, =0xaaaaaaaaaaaaaaab
#if defined(__x86_64__)
	// main loop: shorten a preserving residue class mod 3
	mov	rdx, rax		// d' = a
	shr	rdx, 2			// d' = floor(a/4)
	and	rax, 3			// a = 4 d' + a' (0 <= a' < 4)
	add	rax, rdx		// a' == a (mod 3) but a' < a/4 + 4
	// fix up final value 0 <= a < 6: want 0 <= a < 3
	// the tricky part is actually a = 3; but the other final cases take
	// additional iterations which we can avoid.
8:	cmp	rax, 3			// set cf iff a < 3
	cmc				// set cf iff a >= 3
	sbb	rdx, rdx		// d' = a >= 3 ? -1 : 0
	and	rdx, 3			// d' = a >= 3 ? 3 : 0
	sub	rax, rdx		// a' = a - (a >= 3 ? 3 : 0)
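	// (the shortening step is sound because 4 == 1 (mod 3), so
	// a = 4 d' + a' == d' + a'.  e.g., a = 11 gives 2 + 3 = 5, and
	// then 1 + 1 = 2 = 11 mod 3.)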
#elif defined(__i386__)
	// main loop: shorten a preserving residue class mod 3
	mov	edx, eax		// d' = a
	shr	edx, 2			// d' = floor(a/4)
	and	eax, 3			// a = 4 d' + a' (0 <= a' < 4)
	add	eax, edx		// a' == a (mod 3) but a' < a/4 + 4
	// fix up final value 0 <= a < 6: want 0 <= a < 3
	// the tricky part is actually a = 3; but the other final cases take
	// additional iterations which we can avoid.
8:	cmp	eax, 3			// set cf iff a < 3
	cmc				// set cf iff a >= 3
	sbb	edx, edx		// d' = a >= 3 ? -1 : 0
	and	edx, 3			// d' = a >= 3 ? 3 : 0
	sub	eax, edx		// a' = a - (a >= 3 ? 3 : 0)
#elif defined(__arm__)
	addhs	r0, r12, r0, lsr #2
#elif defined(__aarch64__)
	// blunder on through regardless since this doesn't affect the result
	add	x0, x16, x0, lsr #2
	// invert (odd) a mod 2^64
	// suppose a a_i == 1 (mod 2^{2^i})
	// clearly good for i = 0, since 2^i = 1 and 2^{2^i} = 2, and a_0 =
	// a == 1 (mod 2) by assumption
	// write a a_i == b_i 2^{2^i} + 1 (mod 2^{2^{i+1}})
	// then b_i == (a a_i - 1)/2^{2^i} (mod 2^{2^i})
	// to lift inverse, we want x such that a x == -b_i (mod 2^{2^i});
	// clearly x = -a_i b_i will do, since a a_i == 1 (mod 2^{2^i})
	// a_{i+1} = a_i - a_i b_i 2^{2^i} = a_i (1 - (a a_i - 1))
	//	   = 2 a_i - a a_i^2
	// a a_{i+1} = 2 a a_i - a^2 a_i^2
	//	    == 2 a a_i - (b_i 2^{2^i} + 1)^2
	//	    == 2 (b_i 2^{2^i} + 1) -
	//		(b_i^2 2^{2^{i+1}} + 2 b_i 2^{2^i} + 1)
	//	    == 1 (mod 2^{2^{i+1}})
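	// worked example: a = 3 gives a_0 = 3, with 3 a_0 = 9 == 1 (mod 8);
	// then a_1 = 3 (2 - 9) = -21, with 3 a_1 = -63 == 1 (mod 64); the
	// number of good low-order bits at least doubles every step.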
#if defined(__x86_64__)
	mov	rbx, rax		// b' = a
	mov	rsi, rax		// t = a_0
	mul	rbx			// a' = a a_i
	mov	rcx, rax		// c = a a_i
	sub	rax, 2			// a' = a a_i - 2
	neg	rax			// a' = 2 - a a_i
	mul	rsi			// a_{i+1} = a_i (2 - a a_i)
					//	   = 2 a_i - a a_i^2
	mov	rsi, rax		// t = a_{i+1}
	ja	0b			// no -- iterate
#elif defined(__i386__)
	mov	ebx, eax		// b' = a
	mov	esi, eax		// t = a_0
	mul	ebx			// a' = a a_i
	mov	ecx, eax		// c = a a_i
	sub	eax, 2			// a' = a a_i - 2
	jb	9f			// done if < 2
	neg	eax			// a' = 2 - a a_i
	mul	esi			// a_{i+1} = a_i (2 - a a_i)
					//	   = 2 a_i - a a_i^2
	mov	esi, eax		// t = a_{i+1}
	jmp	0b			// and iterate
9:	mov	eax, esi		// restore
#elif defined(__arm__)
	mov	r1, r0			// b' = a
	mul	r2, r0, r1		// c = a a_i
	rsbs	r2, r2, #2		// c = 2 - a a_i
	mul	r0, r0, r2		// a_{i+1} = a_i (2 - a a_i)
					//	   = 2 a_i - a a_i^2
#elif defined(__aarch64__)
	mov	x1, x0			// b' = a
	mov	x16, #2			// because we have no rsb
	mul	x2, x0, x1		// c = a a_i
	subs	x2, x16, x2		// c = 2 - a a_i
	mul	x0, x0, x2		// a_{i+1} = a_i (2 - a a_i)
					//	   = 2 a_i - a a_i^2
	// a poor approximation to pi/4
	// think of x and y as being in 16.16 fixed-point format.  we sample
	// points in the unit square, and determine how many of them are
	// within a unit quarter-circle centred at the origin.  the area of
	// the quarter-circle is pi/4.
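	// so the final count a, out of the =~ 2^32 points sampled,
	// estimates pi =~ 4 a/2^32.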
#if defined(__x86_64__)
	xor	eax, eax		// a = 0
	shl	rcx, 0x20		// c =~ 4 billion
0:	movzx	rbx, cx			// x = low 16 bits of c
	imul	rbx, rbx		// b = x^2
	ror	rcx, 0x10		// switch halves of c
	movzx	rdx, cx			// y = high 16 bits of c
	imul	rdx, rdx		// d = y^2
	rol	rcx, 0x10		// switch back
	add	rbx, rdx		// r^2 = x^2 + y^2
	shr	rbx, 0x20		// r^2 >= 1?
	cmp	rbx, 1			// set cf iff r^2 < 1
	adc	rax, 0			// and add onto accumulator
#elif defined(__i386__)
	// this is actually better done in 32 bits.  the carry has the wrong
	// sense here, so instead deduct one for each point outside the
	// quarter-circle rather than adding one for each point inside it.
	add	ebx, edx		// see?
#elif defined(__arm__)
0:	uxth	r1, r2, ror #0
	uxth	r3, r2, ror #16
	cmn	r1, r3			// mlas doesn't set cf usefully
#elif defined(__aarch64__)
0:	ubfx	w1, w2, #0, #16
	ubfx	w3, w2, #16, #16
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
///--------------------------------------------------------------------------
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
///----- That's all, folks --------------------------------------------------