/// -*- mode: asm; asm-comment-char: 0 -*-
///--------------------------------------------------------------------------
#include <sys/syscall.h>
#if defined(__i386__) || defined(__x86_64__)
	.intel_syntax noprefix
#elif defined(__arm__)
#elif defined(__aarch64__)
	.macro cmov rd, rn, cc
	csel \rd, \rn, \rd, \cc
	_(eq) _(ne) _(cs) _(cc) _(vs) _(vc) _(mi) _(pl) \
	_(ge) _(lt) _(gt) _(le) _(hi) _(ls) _(al) _(nv) \
	_(csinc) _(cinc) _(cset) \
	_(csinv) _(cinv) _(csetm)
#define _CONDVAR(cc) _definstvar cc;
#define _INSTVARS(inst) \
	.macro _definstvar cc; \
	.macro inst.\cc args:vararg; inst \args, \cc; .endm; \
#define CCMP_MI CCMP_N
#define CCMP_EQ CCMP_Z
#define CCMP_CS CCMP_C
#define CCMP_HS CCMP_C
#define CCMP_VS CCMP_V
#define CCMP_HI CCMP_C
#define CCMP_LT CCMP_N
#define CCMP_LE CCMP_N
# error "not supported"
	.size \name, . - \name
	add ebx, offset _GLOBAL_OFFSET_TABLE_
	mov eax, [ebx + stdout@GOT]
#elif defined(__x86_64__)
	mov rdi, [rip + stdout]
#elif defined(__arm__)
	stmfd r13!, {r0-r4, r12, r14}
	ldr r14, .L$_c$gotoff$\@
	.word _GLOBAL_OFFSET_TABLE_ - .L$_c$gotpc$\@ - 8
	ldmfd r13!, {r0-r4, r12, r14}
#elif defined(__aarch64__)
	stp x2, x3, [sp, #16]
	stp x4, x5, [sp, #32]
	stp x6, x7, [sp, #48]
	stp x8, x9, [sp, #64]
	stp x10, x11, [sp, #80]
	stp x12, x13, [sp, #96]
	stp x14, x15, [sp, #112]
	stp x16, x17, [sp, #128]
	stp x16, x30, [sp, #144]
	ldr x0, [x0, #:got_lo12:stdout]
	ldp x16, x30, [sp, #144]
	ldp x16, x17, [sp, #128]
	ldp x14, x15, [sp, #112]
	ldp x12, x13, [sp, #96]
	ldp x10, x11, [sp, #80]
	ldp x8, x9, [sp, #64]
	ldp x6, x7, [sp, #48]
	ldp x4, x5, [sp, #32]
	ldp x2, x3, [sp, #16]
# error "not supported"
#if defined(__i386__) || defined(__x86_64__)
#elif defined(__arm__)
#elif defined(__aarch64__)
# error "not supported"
	.section .note.GNU-stack, "", %progbits
#if defined(__i386__)
#if defined(__i386__)
	push edi		// edi, esi, ebx
	push ebp		// flags, ebp, ..., ebx
	push esi		// regs, flags, ebp, ..., ebx
	lea eax, [ebx + 9f - .]
	push eax		// cont, regs, flags, ebp, ..., ebx
	push edi		// func, cont, regs, flags, ebp, ..., ebx
	ret			// -> func; regs, flags, ebp, ..., ebx
9:	pushf			// eflags, regs, flags, ebp, ..., ebx
	push esi		// esi, eflags, regs, flags, ebp, ..., ebx
	pop eax			// eflags, regs, flags, ebp, ..., ebx
	pop eax			// regs, flags, ebp, ..., ebx
	add esp, 4		// flags, ebp, ..., ebx
	popf			// ebp, ..., ebx
#elif defined(__x86_64__)
	push rbp		// flags, rbp, ..., rbx
	push rsi		// regs, flags, rbp, ..., rbx
	push rax		// cont, regs, flags, rbp, ..., rbx
	push rdi		// func, cont, regs, flags, rbp, ..., rbx
	mov rax, [rsi + 8*15]
	ret			// -> func; regs, flags, rbp, ..., rbx
9:	pushf			// rflags, regs, flags, rbp, ..., rbx
	push rsi		// rsi, rflags, regs, flags, rbp, ..., rbx
	pop rax			// rflags, regs, flags, rbp, ..., rbx
	pop rax			// regs, flags, rbp, ..., rbx
	add rsp, 8		// flags, rbp, ..., rbx
	popf			// rbp, ..., rbx
#elif defined(__arm__)
	stmfd r13!, {r0, r1, r4-r11, r14}
	ldmia r1, {r0-r12, r14}
	ldmfd r13!, {r4-r11, pc}
#elif defined(__aarch64__)
	stp x29, x30, [sp, #-14*8]!	// frame must keep sp 16-aligned
	stp x19, x20, [sp, #16]
	stp x21, x22, [sp, #32]
	stp x23, x24, [sp, #48]
	stp x25, x26, [sp, #64]
	stp x27, x28, [sp, #80]
	ldp x14, x15, [x1, #112]
	ldp x12, x13, [x1, #96]
	ldp x10, x11, [x1, #80]
	ldp x8, x9, [x1, #64]
	ldp x6, x7, [x1, #48]
	ldp x4, x5, [x1, #32]
	ldp x2, x3, [x1, #16]
	stp x14, x15, [x16, #112]
	stp x12, x13, [x16, #96]
	stp x10, x11, [x16, #80]
	stp x8, x9, [x16, #64]
	stp x6, x7, [x16, #48]
	stp x4, x5, [x16, #32]
	stp x2, x3, [x16, #16]
	stp x0, x1, [x16, #0]
	ldp x19, x20, [sp, #16]
	ldp x21, x22, [sp, #32]
	ldp x23, x24, [sp, #48]
	ldp x25, x26, [sp, #64]
	ldp x27, x28, [sp, #80]
	ldp x29, x30, [sp], #14*8
# error "not supported"
///--------------------------------------------------------------------------
	// clear all 64 bits of extended traditional registers
#if defined(__x86_64__)
	xor eax, eax		// clear rax
	lea rbx, [0]		// rbx -> _|_
	loop .			// iterate, decrement rcx until zero
	mov rdx, 0		// set rdx = 0
	and esi, 0		// clear all bits of rsi
	sub edi, edi		// set rdi = edi - edi = 0
	pop rbp			// pop 0 into rbp
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
	// advance a fibonacci pair by c steps
	// on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1}
	// and f_{i+c}, where f_{i+1} = f_i + f_{i-1}
#if defined(__x86_64__)
0:	xadd rax, rdx		// a, d = a + d, a
				//	= f_{i+1} + f_i, f_{i+1}
				//	= f_{i+2}, f_{i+1}
	loop 0b			// advance i, decrement c, iterate
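	// a c sketch of the same step, for checking (invented helper name;
	// the asm `loop' wants c >= 1):
	//
	//	#include <stdint.h>
	//	void fibstep(uint64_t *a, uint64_t *d, uint64_t c)
	//	  { while (c--) { uint64_t t = *a; *a += *d; *d = t; } }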
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
	// boolean canonify a: if a = 0 on entry, leave it zero; otherwise
#if defined(__x86_64__)
	neg rax			// set cf iff a /= 0
	sbb rax, rax		// a = a - a - cf = -cf
#elif defined(__i386__)
#elif defined(__arm__)
	movs r1, r0		// the easy way
	movne r1, #1		// mvnne r1, #0 for mask
	cmp r0, #1		// clear cf iff a == 0
	sbc r2, r0, r0		// c' = a - a - 1 + cf = cf - 1
	add r2, r2, #1		// c' = cf
	sub r3, r0, r0, lsr #1	// d' top bit clear; d' = 0 iff a = 0
	rsb r3, r3, #0		// d' top bit set iff a /= 0
	mov r3, r3, lsr #31	// asr for mask
#elif defined(__aarch64__)
	cmp x0, #0		// trivial
	cset.ne x1		// csetm for mask
	cmp xzr, x0		// set cf iff a == 0
	sbc x2, x0, x0		// c' = a - a - 1 + cf = cf - 1
	neg x2, x2		// c' = 1 - cf
	sub x3, x0, x0, lsr #1	// if a < 2^63 then d' = ceil(a/2) <
				// if a >= 2^63, write a = 2^63 + t
				// with t < 2^63; d' = 2^63 - 2^62 +
				// ceil(t/2) = 2^62 + ceil(t/2), and
				// anyway d' < 2^63 and d' = 0 iff
	neg x3, x3		// d' top bit set iff a /= 0
	lsr x3, x3, #63		// asr for mask
	cmp x0, #1		// set cf iff a /= 0
	adc x0, xzr, xzr	// a' = 0 + 0 + cf = cf
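	// c sketch of boolean canonification (invented names):
	//
	//	#include <stdint.h>
	//	uint64_t canon(uint64_t a) { return (a | -a) >> 63; }	/* 0 or 1 */
	//	uint64_t mask(uint64_t a) { return -canon(a); }	/* 0 or -1 */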
	// set a = min(a, d) (unsigned); clobber c, d
#if defined(__x86_64__)
	sub rdx, rax		// d' = d - a; set cf if a > d
	sbb rcx, rcx		// c = -cf = -[a > d]
	and rcx, rdx		// c = a > d ? d - a : 0
	add rax, rcx		// a' = a > d ? d : a
#elif defined(__i386__)
#elif defined(__arm__)
	cmp r0, r3		// the easy way
	movlo r1, r0		// only needed for out-of-place
#elif defined(__aarch64__)
	cmp x0, x3		// the easy way
	subs x3, x3, x0		// d' = d - a; set cf if d >= a
	sbc x16, xzr, xzr	// t = -1 + cf = -[a > d]
	and x16, x16, x3	// t = a > d ? d - a : 0
	add x0, x0, x16		// a' = a > d ? d : a
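	// the branchless minimum in c (sketch; invented name):
	//
	//	#include <stdint.h>
	//	uint64_t umin(uint64_t a, uint64_t d)
	//	  { uint64_t m = -(uint64_t)(d < a);	/* -1 iff a > d */
	//	    return a + (m & (d - a)); }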
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
	sub w16, w16, #'a' - 10
	ccmp.hs w16, #16, #CCMP_HS
	// answer whether 5 <= a </<= 9.
#if defined(__x86_64__)
	sub rax, 5		// a' = a - 5
	cmp rax, 4		// is a - 5 </<= 4?
				// nz/ne	a' /= 4		a /= 9
				// a/nbe	a' > 4		a > 9 or a < 5
				// nc/ae/nb	a' >= 4		a >= 9 or a < 5
				// c/b/nae	a' < 4		5 <= a < 9
				// be/na	a' <= 4		5 <= a <= 9
				// o	a' < -2^63 + 4	-2^63 + 5 <= a < -2^63 + 9
				// no	a' >= -2^63 + 4	a >= -2^63 + 9 or
				// s	-2^63 + 4 <= a' < 4	-2^63 + 9 <= a < 9
				// ns	a' < -2^63 + 4 or	a < -2^63 + 9 or a >= 9
				// ge/nl	a' >= 4		a >= 9 or a < -2^63 + 5
				// l/nge	a' < 4		-2^63 + 5 <= a < 9
				// g/nle	a' > 4		a > 9 or a < -2^63 + 5
				// le/ng	a' <= 4		-2^63 + 5 <= a <= 9
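	// the unsigned-wraparound trick reads naturally in c (sketch;
	// invented names):
	//
	//	#include <stdint.h>
	//	int in_5_9(uint64_t a) { return a - 5 < 4; }	/* 5 <= a < 9 */
	//	int in_5_9i(uint64_t a) { return a - 5 <= 4; }	/* 5 <= a <= 9 */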
#elif defined(__i386__)
#elif defined(__arm__)
	// i dimly remember having a slick way to do this way back in the
	// day, but i can't figure it out any more.
#elif defined(__aarch64__)
	// literal translation is too obvious
	ccmp.hs x0, #9, #CCMP_HS
	// leave a unchanged, but set zf if a = 0, cf if a /= 0, clear of,
#if defined(__x86_64__)
	not rax			// a' = -a - 1
#elif defined(__i386__)
#elif defined(__arm__)
	rsbs r0, r0, #0		// cf has opposite sense
#elif defined(__aarch64__)
	negs x0, x0		// cf has opposite sense
	// same as before (?)
#if defined(__x86_64__)
	inc rax			// a' = a + 1
	neg rax			// a' = -a - 1
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
	negs x0, x0		// cf has opposite sense
	// floor((a + d)/2), correctly handling overflow conditions; final cf
	// is lsb(a + d), probably uninteresting
#if defined(__x86_64__)
	add rax, rdx		// cf || a' = a + d
	rcr rax, 1		// shift 65-bit result right by one
				// place; lsb moves into carry
#elif defined(__i386__)
#elif defined(__arm__)
	// like the two-instruction a64 version
	add r1, r0, r1, lsr #1
	// the slick version, similar to the above
#elif defined(__aarch64__)
	// a64 lacks a32's rrx. literal translation.
	adds x1, x0, x3		// cf || a' = a + d
	adc x16, xzr, xzr	// realize cf in extra register
	extr x1, x16, x1, #1	// shift down one place
	// two instruction version: clobbers additional register. (if you
	// wanted the answer in any other register, even overwriting d, then
	// this is unnecessary.) also depends on d >= a.
	sub x16, x3, x0		// compute difference
	add x0, x0, x16, lsr #1	// add half of it (rounded down)
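	// c sketches of both versions (invented names):
	//
	//	#include <stdint.h>
	//	uint64_t avg(uint64_t a, uint64_t d) {	/* the extr way */
	//	  uint64_t s = a + d, c = s < a;	/* 65-bit sum c || s */
	//	  return c << 63 | s >> 1;
	//	}
	//	/* the two-instruction way; assumes d >= a */
	//	uint64_t avg2(uint64_t a, uint64_t d) { return a + (d - a)/2; }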
	// a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3
	// (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8).
#if defined(__x86_64__)
	shr rax, 3		// a' = floor(a/8); cf = 1 if a ==
				// 4, 5, 6, 7 (mod 8)
	adc rax, 0		// a' = floor(a/8) + cf
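	// in c (sketch; invented name): add back the "half" bit
	//
	//	#include <stdint.h>
	//	uint64_t div8round(uint64_t a)
	//	  { return (a >> 3) + (a >> 2 & 1); }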
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
	orr x0, xzr, x0, lsr #3
	// increment c-byte little-endian bignum at rdi
#if defined(__x86_64__)
	add byte ptr [rdi], 1
	adc byte ptr [rdi], 0
#elif defined(__i386__)
	add byte ptr [edi], 1
	adc byte ptr [edi], 0
#elif defined(__arm__)
	mov r12, #256		// set initial carry
	add r12, r0, r12, lsr #8
#elif defined(__aarch64__)
	mov w17, #256		// set initial carry
	add w17, w16, w17, lsr #8
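	// c sketch of the carry trick (invented names): keep the running
	// carry in bit 8 of a wider accumulator
	//
	//	#include <stdint.h>
	//	#include <stddef.h>
	//	void inc(uint8_t *p, size_t n) {
	//	  unsigned c = 256;		/* initial carry */
	//	  for (size_t i = 0; i < n; i++)
	//	    { c = p[i] + (c >> 8); p[i] = c; }
	//	}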
	// negate double-precision d:a
#if defined(__x86_64__)
	not rdx			// d' = -d - 1
				// cf = 1 iff a /= 0
	sbb rdx, -1		// d' = -d - cf
#elif defined(__i386__)
#elif defined(__arm__)
	// reverse subtract is awesome
#elif defined(__aarch64__)
	// easy way: everything is better with zero registers.
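	// c sketch (invented name): low half negates; high half takes the
	// borrow
	//
	//	#include <stdint.h>
	//	void neg128(uint64_t *d, uint64_t *a)
	//	  { *d = ~*d + (*a == 0); *a = -*a; }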
	// rotate is distributive over xor.
#if defined(__x86_64__)
	// rax			// = a_1 || a_0
	// rbx			// = b_1 || b_0
	mov rcx, rax		// = a_1 || a_0
	xor rcx, rbx		// = (a_1 XOR b_1) || (a_0 XOR b_0)
	ror rcx, 0xd		// = (a_0 XOR b_0) || (a_1 XOR b_1)
	ror rax, 0xd		// = a_0 || a_1
	ror rbx, 0xd		// = b_0 || b_1
	xor rax, rbx		// = (a_0 XOR b_0) || (a_1 XOR b_1)
	cmp rax, rcx		// always equal
#elif defined(__i386__)
	mov ecx, eax		// = a_1 || a_0
	xor ecx, ebx		// = (a_1 XOR b_1) || (a_0 XOR b_0)
	ror ecx, 0xd		// = (a_0 XOR b_0) || (a_1 XOR b_1)
	ror eax, 0xd		// = a_0 || a_1
	ror ebx, 0xd		// = b_0 || b_1
	xor eax, ebx		// = (a_0 XOR b_0) || (a_1 XOR b_1)
	cmp eax, ecx		// always equal
#elif defined(__arm__)
	// r0			// = a_1 || a_0
	// r1			// = b_1 || b_0
	eor r2, r0, r1		// = (a_1 XOR b_1) || (a_0 XOR b_0)
	mov r2, r2, ror #13	// = (a_0 XOR b_0) || (a_1 XOR b_1)
	mov r1, r1, ror #13	// = b_0 || b_1
	eor r0, r1, r0, ror #13	// = (a_0 XOR b_0) || (a_1 XOR b_1)
	cmp r0, r2		// always equal
#elif defined(__aarch64__)
	// x0			// = a_1 || a_0
	// x1			// = b_1 || b_0
	eor x2, x0, x1		// = (a_1 XOR b_1) || (a_0 XOR b_0)
	ror x2, x2, #13		// = (a_0 XOR b_0) || (a_1 XOR b_1)
	ror x1, x1, #13		// = b_0 || b_1
	eor x0, x1, x0, ror #13	// = (a_0 XOR b_0) || (a_1 XOR b_1)
	cmp x0, x2		// always equal
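	// quick c check of the identity (sketch; invented names):
	//
	//	#include <stdint.h>
	//	uint64_t ror64(uint64_t x, unsigned n)
	//	  { return x >> n | x << (64 - n); }	/* 0 < n < 64 */
	//	int ok(uint64_t a, uint64_t b)
	//	  { return ror64(a ^ b, 13) == (ror64(a, 13) ^ ror64(b, 13)); }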
	// and is distributive over xor.
#if defined(__x86_64__)
	xor rbx, rcx		// = b XOR c
	and rbx, rax		// = a AND (b XOR c)
	and rdx, rax		// = a AND b
	and rax, rcx		// = a AND c
	xor rax, rdx		// = (a AND b) XOR (a AND c)
				//	= a AND (b XOR c)
	cmp rax, rbx		// always equal
#elif defined(__i386__)
	xor ebx, ecx		// = b XOR c
	and ebx, eax		// = a AND (b XOR c)
	and edx, eax		// = a AND b
	and eax, ecx		// = a AND c
	xor eax, edx		// = (a AND b) XOR (a AND c)
				//	= a AND (b XOR c)
	cmp eax, ebx		// always equal
#elif defined(__arm__)
	and r3, r0, r1		// = a AND b
	eor r1, r1, r2		// = b XOR c
	and r1, r1, r0		// = a AND (b XOR c)
	and r0, r0, r2		// = a AND c
	eor r0, r0, r3		// = (a AND b) XOR (a AND c)
				//	= a AND (b XOR c)
	cmp r0, r1		// always equal
#elif defined(__aarch64__)
	and x3, x0, x1		// = a AND b
	eor x1, x1, x2		// = b XOR c
	and x1, x1, x0		// = a AND (b XOR c)
	and x0, x0, x2		// = a AND c
	eor x0, x0, x3		// = (a AND b) XOR (a AND c)
				//	= a AND (b XOR c)
	cmp x0, x1		// always equal
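	// the same check in c (sketch; invented name):
	//
	//	#include <stdint.h>
	//	int ok(uint64_t a, uint64_t b, uint64_t c)
	//	  { return (a & (b ^ c)) == ((a & b) ^ (a & c)); }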
#if defined(__x86_64__)
	and rcx, rbx		// = a AND b
	not rcx			// = NOT (a AND b)
	or rax, rbx		// = (NOT a) OR (NOT b)
	cmp rax, rcx		// always equal
#elif defined(__i386__)
	and ecx, ebx		// = a AND b
	not ecx			// = NOT (a AND b)
	or eax, ebx		// = (NOT a) OR (NOT b)
	cmp eax, ecx		// always equal
#elif defined(__arm__)
	and r2, r0, r1		// = a AND b
	mvn r2, r2		// = NOT (a AND b)
	mvn r0, r0		// = NOT a
	mvn r1, r1		// = NOT b
	orr r0, r0, r1		// = (NOT a) OR (NOT b)
	cmp r0, r2		// always equal
#elif defined(__aarch64__)
	and x2, x0, x1		// = a AND b
	mvn x2, x2		// = NOT (a AND b)
	mvn x0, x0		// = NOT a
	orn x0, x0, x1		// = (NOT a) OR (NOT b)
	cmp x0, x2		// always equal
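	// de morgan in c (sketch; invented name):
	//
	//	#include <stdint.h>
	//	int ok(uint64_t a, uint64_t b) { return ~(a & b) == (~a | ~b); }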
	// replace input buffer bytes with cumulative XORs with initial a;
	// final a is XOR of all buffer bytes and initial a.
	// not sure why you'd do this.
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
///--------------------------------------------------------------------------
	// four different ways to swap a pair of registers.
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
	stmfd r13!, {r0, r2}
	rsb r0, r0, r2		// don't need 3-addr with reverse-sub
#elif defined(__aarch64__)
	// anything you can do
	stp x0, x2, [sp, #-16]!
	ldp x2, x0, [sp], #16
	// the add/sub/add thing was daft. you can do it in three if you're
	// clever -- and have three-address operations.
	// but we lack a fourth. we can't do this in fewer than three
	// instructions without hitting memory. only `ldp' will modify two
	// registers at a time, so we need at least two instructions -- but
	// if the first one sets one of our two registers to its final value
	// then we lose the other input value with no way to recover it, so
	// we must either write a fresh third register, or write something
	// other than the final value, and in both cases we need a third
	// instruction to fix everything up. we've done the wrong-something-
	// other trick twice, so here's the captain-obvious use-a-third-
	// register version.
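	// the wrong-something-other idea in c, for reference (sketch;
	// invented name; beware p == q aliasing, which zeroes both):
	//
	//	#include <stdint.h>
	//	void swap(uint64_t *p, uint64_t *q)
	//	  { *p ^= *q; *q ^= *p; *p ^= *q; }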
	// assuming a is initialized to zero, set a to the inclusive or of
	// the xor-differences of corresponding bytes in the c-byte strings
	// in particular, a will be zero (and zf set) if and only if the two
	// strings are equal.
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
0:	ldrb r1, [r4], #1
#elif defined(__aarch64__)
0:	ldrb w16, [x4], #1
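	// the whole loop in c (sketch; invented name):
	//
	//	#include <stdint.h>
	//	#include <stddef.h>
	//	uint8_t diff(const uint8_t *p, const uint8_t *q, size_t n) {
	//	  uint8_t a = 0;
	//	  while (n--) a |= *p++ ^ *q++;
	//	  return a;		/* zero iff the strings match */
	//	}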
	// an obtuse way of adding two registers. for any bit position, a
	// OR d is set if and only if at least one of a and d has a bit set
	// in that position, and a AND d is set if and only if both have a
	// bit set in that position. essentially, then, what we've done is
	// move all of the set bits in d to a, unless there's already a bit
	// there. this clearly doesn't change the sum.
#if defined(__x86_64__)
	mov rcx, rdx		// c' = d
	and rdx, rax		// d' = a AND d
	or rax, rcx		// a' = a OR d
#elif defined(__i386__)
	mov ecx, edx		// c' = d
	and edx, eax		// d' = a AND d
	or eax, ecx		// a' = a OR d
#elif defined(__arm__)
	and r2, r0, r3		// c' = a AND d
	orr r0, r0, r3		// a' = a OR d
#elif defined(__aarch64__)
	and x2, x0, x3		// c' = a AND d
	orr x0, x0, x3		// a' = a OR d
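	// in c (sketch; invented name): a + d = (a OR d) + (a AND d)
	//
	//	#include <stdint.h>
	//	void step(uint64_t *a, uint64_t *d)
	//	  { uint64_t c = *a & *d; *a |= *d; *d = c; }	/* sum unchanged */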
	// ok, so this is a really obtuse way of adding a and b; the result
	// is in a and d. but why does it work?
#if defined(__x86_64__)
	mov rcx, 0x40		// carry chains at most 64 long
0:	mov rdx, rax		// copy a'
	xor rax, rbx		// low bits of each bitwise sum
	and rbx, rdx		// carry bits from each bitwise sum
	shl rbx, 1		// carry them into next position
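	// the loop in c (sketch; invented name):
	//
	//	#include <stdint.h>
	//	uint64_t add(uint64_t a, uint64_t b) {
	//	  while (b) { uint64_t t = a & b; a ^= b; b = t << 1; }
	//	  return a;	/* terminates: carries move left each pass */
	//	}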
#elif defined(__i386__)
	mov ecx, 0x20		// carry chains at most 32 long
0:	mov edx, eax		// copy a'
	xor eax, ebx		// low bits of each bitwise sum
	and ebx, edx		// carry bits from each bitwise sum
	shl ebx, 1		// carry them into next position
#elif defined(__arm__)
#elif defined(__aarch64__)
	// floor((a + d)/2), like x08.
#if defined(__x86_64__)
	mov rcx, rax		// copy a for later
	and rcx, rdx		// carry bits
	xor rax, rdx		// low bits of each bitwise sum
	shr rax, 1		// divide by 2; carries now in place
	add rax, rcx		// add the carries; done
#elif defined(__i386__)
	mov ecx, eax		// copy a for later
	and ecx, edx		// carry bits
	xor eax, edx		// low bits of each bitwise sum
	shr eax, 1		// divide by 2; carries now in place
	add eax, ecx		// add the carries; done
#elif defined(__arm__)
	add r0, r2, r0, lsr #1
#elif defined(__aarch64__)
	add x0, x2, x0, lsr #1
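	// in c (sketch; invented name):
	//
	//	#include <stdint.h>
	//	uint64_t avg(uint64_t a, uint64_t d)
	//	  { return (a & d) + ((a ^ d) >> 1); }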
	// sign extension 32 -> 64 bits.
#if defined(__x86_64__)
	movsx rbx, eax		// like this?
	mov rdx, 0xffffffff80000000
	add rax, rdx		// if bit 31 of a is set then bits
				// 31--63 of a' are clear; otherwise,
				// these bits are all set -- which is
				// exactly backwards
	xor rax, rdx		// so fix it
#elif defined(__i386__)
	movsx ebx, ax		// like this?
	add eax, edx		// if bit 15 of a is set then bits
				// 15--31 of a' are clear; otherwise,
				// these bits are all set -- which is
				// exactly backwards
	xor eax, edx		// so fix it
#elif defined(__arm__)
	sxth r1, r0		// like this
	mov r12, #0x80000000
	add r0, r0, r12, asr #16
	eor r0, r0, r12, asr #16
#elif defined(__aarch64__)
	sxtw x1, w0		// like this
	mov x16, #0xffffffff80000000
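	// the add/xor trick in c (sketch; invented name):
	//
	//	#include <stdint.h>
	//	int64_t sext32(uint64_t a) {	/* bits 32--63 of a clear */
	//	  uint64_t m = 0xffffffff80000000;
	//	  return (int64_t)((a + m) ^ m);
	//	}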
	// ??? i don't know why you'd want to calculate this.
#if defined(__x86_64__)
	xor rax, rbx		// a' = a XOR b
	xor rbx, rcx		// b' = b XOR c
	mov rsi, rax		// t = a XOR b
	add rsi, rbx		// t = (a XOR b) + (b XOR c)
	cmovc rax, rbx		// a' = cf ? b XOR c : a XOR b
	xor rax, rbx		// a' = cf ? 0 : a XOR c
#elif defined(__i386__)
	xor eax, ebx		// a' = a XOR b
	xor ebx, ecx		// b' = b XOR c
	mov esi, eax		// t = a XOR b
	add esi, ebx		// t = (a XOR b) + (b XOR c)
	cmovc eax, ebx		// a' = cf ? b XOR c : a XOR b
	xor eax, ebx		// a' = cf ? 0 : a XOR c
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
	cqo			// d = a < 0 ? -1 : 0
	xor rax, rdx		// a' = a < 0 ? -a - 1 : a
	sub rax, rdx		// a' = a < 0 ? -a : a
#elif defined(__i386__)
	cdq			// d = a < 0 ? -1 : 0
	xor eax, edx		// a' = a < 0 ? -a - 1 : a
	sub eax, edx		// a' = a < 0 ? -a : a
#elif defined(__arm__)
	// faithful-ish conversion
	eor r3, r0, r0, asr #31
	sub r0, r3, r0, asr #31
#elif defined(__aarch64__)
	// faithful-ish conversion
	eor x3, x0, x0, asr #63
	sub x0, x3, x0, asr #63
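	// absolute value in c (sketch; invented name; assumes arithmetic
	// right shift on signed values, as gcc and clang provide):
	//
	//	#include <stdint.h>
	//	int64_t iabs(int64_t a)
	//	  { int64_t m = a >> 63; return (a ^ m) - m; }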
	// should always set sf, clear zf, unless we get rescheduled to a
#if defined(__x86_64__)
	rdtsc			// d || a = cycles
	or rax, rdx		// a = cycles
	mov rcx, rax		// c = cycles
	rdtsc			// d || a = cycles'
	or rax, rdx		// a = cycles'
#elif defined(__i386__)
	rdtsc			// d || a = cycles
	mov ecx, edx		// c || b = cycles
	rdtsc			// d || a = cycles'
#elif defined(__arm__)
	// cycle clock not available in user mode
	mrrc p15, 0, r0, r1, c9
	mrrc p15, 0, r2, r3, c9
#elif defined(__aarch64__)
	// cycle clock not available in user mode
	// stupid way to capture a pointer to inline data and jump past it.
	// confuses the return-address predictor something chronic. worse
	// because amd64 calling convention doesn't usually pass arguments on
#if defined(__x86_64__)
	.string "hello world!\n\0"
	// actually implement this ridiculous thing
0:	mov al, [rsi + rdx]
	syscall			// clobbers r11 :-(
#elif defined(__i386__)
	.string "hello world!\n\0"
	// actually implement this ridiculous thing
0:	mov al, [ecx + edx]
#elif defined(__arm__)
	// why am i doing this?
	.string "hello world!\n\0"
8:	mov r1, r14		// might as well make it easy on myself
0:	ldrb r0, [r1, r2]
#elif defined(__aarch64__)
	// why am i doing this?
	str x30, [sp, #-16]!
	.string "hello world!\n\0"
8:	mov x1, x30		// might as well make it easy on myself
0:	ldrb w0, [x1, x2]
	// collect the current instruction-pointer address. this was an old
	// 32-bit i386 trick for position-independent code, but (a) it
	// confuses the return predictor, and (b) amd64 has true pc-relative
#if defined(__x86_64__)
	// the actual example
	// the modern i386 trick doesn't confuse the return-address
	// but rip-relative addressing is even better
#elif defined(__i386__)
	// the actual example
	// the modern i386 trick doesn't confuse the return-address
#elif defined(__arm__)
	sub r1, r14, #. - 0b
#elif defined(__aarch64__)
	str x30, [sp, #-16]!
	// we can do all of the above using a64
	sub x1, x30, #. - 0b
#if defined(__x86_64__)
	// retpolines: a mitigation against adversarially influenced
	// speculative execution at indirect branches. if an adversary can
	// prepare a branch-target buffer entry matching an indirect branch
	// in the victim's address space then they can cause the victim to
	// /speculatively/ (but not architecturally) execute any code in
	// their address space, possibly leading to leaking secrets through
	// the cache. retpolines aren't susceptible to this because the
	// predicted destination address is from the return-prediction stack
	// which the adversary can't prime. the performance penalty is still
	// essentially a branch misprediction -- for this return, and
	// possibly all others already stacked.
	// (try not to crash)
#elif defined(__i386__)
	lea eax, [ebx + 9f - .]
#elif defined(__arm__)
#elif defined(__aarch64__)
	str x30, [sp, #-16]!
8:	ldr x30, [sp], #16
	// ok, having a hard time seeing a use for this. the most important
	// thing to note is that sp is set from `pop' /after/ it's
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
	// not even going to dignify this
#elif defined(__aarch64__)
	// not even going to dignify this
	// monumentally cheesy way to copy 8 n bytes from buff1 to buff2.
	// also clobbers words at buff2 + 8 n and buff2 - 8 for good measure.
#if defined(__x86_64__)
	mov rax, rsp		// safekeeping
	// we're toast if we get hit by a signal now. fingers crossed...
	mov rsp, offset buff2 + 8*n + 8
	mov rbp, offset buff1 + 8*n
	lea rsp, [rdi + 8*n + 16]
	lea rbp, [rsi + 8*n]
	//	   +---------+		  +---------+
	// rbp -> |   ???   |	  rsp -> |   ???   |
	//	   +---------+		  +---------+
	//	   | w_{n-1} |		  |   rbp   | <- rbp'
	//	   +---------+		  +---------+
	//	   |   ...   |		  | w_{n-1} |
	//	   +---------+		  +---------+
	//	   +---------+		  +---------+
	//	   +---------+		  +---------+
#elif defined(__i386__)
	mov eax, esp		// safekeeping
	// we're toast if we get hit by a signal now. fingers crossed...
	mov esp, offset buff2 + 4*n + 4
	mov ebp, offset buff1 + 4*n
	lea esp, [edi + 4*n + 8]
	lea ebp, [esi + 4*n]
#elif defined(__arm__)
	add r5, r5, #4*n + 8
	ldrd r0, r1, [r4, #-8]!
	strd r0, r1, [r5, #-8]!
#elif defined(__aarch64__)
	// omgwtf. let's not actually screw with the stack pointer.
	add x5, x5, #8*n + 16
	ldp x16, x17, [x4, #-16]!
	stp x16, x17, [x5, #-16]!
	// convert nibble value to (uppercase) hex; other input values yield
#if defined(__x86_64__)
	// das doesn't work in 64-bit mode; best i can come up with
#elif defined(__i386__)
	cmp al, 0x0a		// cf = 1 iff a < 10
	sbb al, 0x69		// if 0 <= a < 10, a' = a - 0x6a, so
				// 0x96 <= a' < 0xa0, setting af, cf
				// if 10 <= a < 16, a' = a - 0x69, so
				// 0xa1 <= a' < 0xa7, setting cf but
	das			// if 0 <= a < 10, then af and cf are
				// both set, so subtract 0x66
				// from a' leaving 0x30 <= a' < 0x3a;
				// if 10 <= a < 16 then af clear but
				// cf set, so subtract 0x60 from a'
				// leaving 0x41 <= a' < 0x47
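	// what all four versions compute, in c (sketch; invented name):
	//
	//	char hexdigit(unsigned a)	/* valid for 0 <= a < 16 */
	//	  { return a < 10 ? '0' + a : 'A' + a - 10; }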
#elif defined(__arm__)
	// significantly less tricksy
	addhs r0, r0, #'A' - 10
#elif defined(__aarch64__)
	// with less versatile conditional execution this is the best we can
	add w16, w0, #'A' - 10
	// verify collatz conjecture starting at a; assume a /= 0!
#if defined(__x86_64__)
0:	bsf rcx, rax		// clobber c if a = 0
	shr rax, cl		// a = 2^c a'
	lea rax, [2*rax + rax + 1]	// a' = 3 a' + 1
#elif defined(__i386__)
0:	bsf ecx, eax		// clobber c if a = 0
	shr eax, cl		// a = 2^c a'
	lea eax, [2*eax + eax + 1]	// a' = 3 a' + 1
#elif defined(__arm__)
	// rbit introduced in armv7
	mov r0, r0, lsr r2	// a = 2^c a'
	adcne r0, r0, r0, lsl #1	// a' = 3 a' + 1 (because c set)
#elif defined(__aarch64__)
	lsr w0, w0, w2		// a = 2^c a'
	add w16, w0, w0, lsl #1	// t = 3 a'
	csinc.eq w0, w0, w16	// a' = eq ? a' : 3 a' + 1
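	// one round in c (sketch; invented name; a /= 0):
	//
	//	#include <stdint.h>
	//	uint64_t collatz(uint64_t a) {
	//	  while (!(a & 1)) a >>= 1;	/* a = 2^c a' */
	//	  return a == 1 ? a : 3*a + 1;	/* stop at 1 */
	//	}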
///--------------------------------------------------------------------------
	// calculate 1337 a slowly
#if defined(__x86_64__)
	mov rcx, rax		// c = a
	shl rcx, 2		// c = 4 a
	add rcx, rax		// c = 5 a
	shl rcx, 3		// c = 40 a
	add rcx, rax		// c = 41 a
	shl rcx, 1		// c = 82 a
	add rcx, rax		// c = 83 a
	shl rcx, 1		// c = 166 a
	add rcx, rax		// c = 167 a
	shl rcx, 3		// c = 1336 a
	add rcx, rax		// c = 1337 a
	lea rdx, [2*rax + rax]	// t = 3 a
	shl rdx, 6		// t = 192 a
	sub rdx, rax		// t = 191 a
	lea rbx, [8*rdx]	// b = 1528 a
	sub rbx, rdx		// b = 1337 a
#elif defined(__i386__)
	mov ecx, eax		// c = a
	shl ecx, 2		// c = 4 a
	add ecx, eax		// c = 5 a
	shl ecx, 3		// c = 40 a
	add ecx, eax		// c = 41 a
	shl ecx, 1		// c = 82 a
	add ecx, eax		// c = 83 a
	shl ecx, 1		// c = 166 a
	add ecx, eax		// c = 167 a
	shl ecx, 3		// c = 1336 a
	add ecx, eax		// c = 1337 a
	lea edx, [2*eax + eax]	// t = 3 a
	shl edx, 6		// t = 192 a
	sub edx, eax		// t = 191 a
	lea ebx, [8*edx]	// b = 1528 a
	sub ebx, edx		// b = 1337 a
#elif defined(__arm__)
	// original version, ish
	add r2, r0, r0, lsl #2	// c = 5 a
	add r2, r0, r2, lsl #3	// c = 41 a
	add r2, r0, r2, lsl #1	// c = 83 a
	add r2, r0, r2, lsl #1	// c = 167 a
	add r2, r0, r2, lsl #3	// c = 1337 a
	add r1, r0, r0, lsl #1	// b = 3 a
	rsb r1, r0, r1, lsl #6	// b = 191 a
	rsb r1, r1, r1, lsl #3	// b = 1337 a
#elif defined(__aarch64__)
	// original version, ish
	add x2, x0, x0, lsl #2	// c = 5 a
	add x2, x0, x2, lsl #3	// c = 41 a
	add x2, x0, x2, lsl #1	// c = 83 a
	add x2, x0, x2, lsl #1	// c = 167 a
	add x2, x0, x2, lsl #3	// c = 1337 a
	// sleazy because no rsb
	add x1, x0, x0, lsl #1	// b = 3 a
	sub x1, x0, x1, lsl #6	// b = -191 a
	sub x1, x1, x1, lsl #3	// b = 1337 a
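	// both addition chains in c (sketch; invented names):
	//
	//	#include <stdint.h>
	//	uint64_t c1337(uint64_t a) {
	//	  uint64_t c = (a << 2) + a;	/* 5 a */
	//	  c = (c << 3) + a;		/* 41 a */
	//	  c = (c << 1) + a;		/* 83 a */
	//	  c = (c << 1) + a;		/* 167 a */
	//	  return (c << 3) + a;		/* 1337 a */
	//	}
	//	uint64_t b1337(uint64_t a) {
	//	  uint64_t b = (a << 1) + a;	/* 3 a */
	//	  b = (b << 6) - a;		/* 191 a */
	//	  return (b << 3) - b;		/* 1337 a */
	//	}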
	// multiply complex numbers a + b i and c + d i
	// (a + b i) (c + d i) = (a c - b d) + (a d + b c) i
	// somewhat slick approach uses only three multiplications
#if defined(__x86_64__)
	mov rsi, rax		// t = a
	add rax, rbx		// a' = a + b
	mov rdi, rdx		// u = d
	sub rdx, rcx		// d' = d - c
	add rdi, rcx		// u = c + d
	imul rax, rcx		// a' = c (a + b)
	imul rsi, rdx		// t = a (d - c)
	imul rdi, rbx		// u = b (c + d)
	add rsi, rax		// t = a (d - c) + c (a + b)
	mov rbx, rsi		// b' = a (d - c) + c (a + b)
	sub rax, rdi		// a' = c (a + b) - b (c + d)
#elif defined(__i386__)
	mov esi, eax		// t = a
	add eax, ebx		// a' = a + b
	mov edi, edx		// u = d
	sub edx, ecx		// d' = d - c
	add edi, ecx		// u = c + d
	imul eax, ecx		// a' = c (a + b)
	imul esi, edx		// t = a (d - c)
	imul edi, ebx		// u = b (c + d)
	add esi, eax		// t = a (d - c) + c (a + b)
	mov ebx, esi		// b' = a (d - c) + c (a + b)
	sub eax, edi		// a' = c (a + b) - b (c + d)
#elif defined(__arm__)
	add r4, r0, r1		// t = a + b
	add r5, r2, r3		// u = c + d
	sub r3, r3, r2		// d' = d - c
	// mls introduced in armv7
	mul r4, r4, r2		// t = c (a + b)
	mov r2, r1		// c' = b (bah!)
	mla r1, r0, r3, r4	// b' = a (d - c) + c (a + b)
	mls r0, r2, r5, r4	// a' = c (a + b) - b (c + d)
#elif defined(__aarch64__)
	add x4, x0, x1		// t = a + b
	add x5, x2, x3		// u = c + d
	sub x3, x3, x2		// d' = d - c
	// a64 `msub' stands in for a32's `mls'
	mul x4, x4, x2		// t = c (a + b)
	mov x2, x1		// c' = b (bah!)
	madd x1, x0, x3, x4	// b' = a (d - c) + c (a + b)
	msub x0, x2, x5, x4	// a' = c (a + b) - b (c + d)
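	// the three-multiplication product in c (sketch; invented name):
	//
	//	#include <stdint.h>
	//	void cmul(int64_t *re, int64_t *im,
	//		  int64_t a, int64_t b, int64_t c, int64_t d) {
	//	  int64_t t = c*(a + b);
	//	  *re = t - b*(c + d);		/* = a c - b d */
	//	  *im = t + a*(d - c);		/* = a d + b c */
	//	}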
#if defined(__x86_64__)
	mov rdx, 0xaaaaaaaaaaaaaaab	// = ceil(2/3 2^64)
	mul rdx			// d' || a' =~ 2/3 a 2^64
	shr rdx, 1		// d' = floor(a/3)
	mov rax, rdx		// a' = floor(a/3)
	// we start with 0 <= a < 2^64. write f = ceil(2/3 2^64), so that
	// 2/3 < f/2^64 < 2/3 + 1/2^64. then floor(2/3 a) <= floor(a f/2^64)
	// <= floor(2/3 a + a/2^64), but a < 2^64 so a/2^64 < 1 and
	// floor(a f/2^64) = floor(2/3 a).
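	// in c, with a double-width intermediate (sketch; invented name;
	// assumes gcc/clang `unsigned __int128'):
	//
	//	#include <stdint.h>
	//	uint64_t div3(uint64_t a) {
	//	  uint64_t hi = (unsigned __int128)a*0xaaaaaaaaaaaaaaab >> 64;
	//	  return hi >> 1;
	//	}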
#elif defined(__i386__)
	mov edx, 0xaaaaaaab	// = ceil(2/3 2^32)
	mul edx			// d' || a' =~ 2/3 a 2^32
	shr edx, 1		// d' = floor(a/3)
	mov eax, edx		// a' = floor(a/3)
#elif defined(__arm__)
	ldr r12, =0xaaaaaaab
	umull r12, r0, r0, r12
#elif defined(__aarch64__)
	ldr x16, =0xaaaaaaaaaaaaaaab
#if defined(__x86_64__)
	// main loop: shorten a preserving residue class mod 3
	mov rdx, rax		// d' = a
	shr rdx, 2		// d' = floor(a/4)
	and rax, 3		// a = 4 d' + a' (0 <= a' < 4)
	add rax, rdx		// a' == a (mod 3) but a' < a/4 + 4
	// fix up final value 0 <= a < 6: want 0 <= a < 3
	// the tricky part is actually a = 3; but the other final cases take
	// additional iterations which we can avoid.
8:	cmp rax, 3		// set cf iff a < 3
	cmc			// set cf iff a >= 3
	sbb rdx, rdx		// d' = a >= 3 ? -1 : 0
	and rdx, 3		// d' = a >= 3 ? 3 : 0
	sub rax, rdx		// a' = a - (a >= 3 ? 3 : 0)
#elif defined(__i386__)
	// main loop: shorten a preserving residue class mod 3
	mov edx, eax		// d' = a
	shr edx, 2		// d' = floor(a/4)
	and eax, 3		// a = 4 d' + a' (0 <= a' < 4)
	add eax, edx		// a' == a (mod 3) but a' < a/4 + 4
	// fix up final value 0 <= a < 6: want 0 <= a < 3
	// the tricky part is actually a = 3; but the other final cases take
	// additional iterations which we can avoid.
8:	cmp eax, 3		// set cf iff a < 3
	cmc			// set cf iff a >= 3
	sbb edx, edx		// d' = a >= 3 ? -1 : 0
	and edx, 3		// d' = a >= 3 ? 3 : 0
	sub eax, edx		// a' = a - (a >= 3 ? 3 : 0)
#elif defined(__arm__)
	addhs r0, r12, r0, lsr #2
#elif defined(__aarch64__)
	// blunder on through regardless since this doesn't affect the result
	add x0, x16, x0, lsr #2
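	// c sketch (invented name): fold base-4 digits (4 == 1 (mod 3)),
	// then fix up the final 0 <= a < 6:
	//
	//	#include <stdint.h>
	//	uint64_t mod3(uint64_t a) {
	//	  while (a >= 6) a = (a >> 2) + (a & 3);
	//	  return a >= 3 ? a - 3 : a;
	//	}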
	// invert (odd) a mod 2^64
	// suppose a a_i == 1 (mod 2^{2^i})
	// clearly good for i = 0, since 2^i = 1 and 2^{2^i} = 2, and a_0 =
	// a == 1 (mod 2) by assumption
	// write a a_i == b_i 2^{2^i} + 1 (mod 2^{2^{i+1}})
	// then b_i == (a a_i - 1)/2^{2^i} (mod 2^{2^i})
	// to lift inverse, we want x such that a x == -b_i (mod 2^{2^i});
	// clearly x = -a_i b_i will do, since a a_i == 1 (mod 2^{2^i})
	// a_{i+1} = a_i - a_i b_i 2^{2^i} = a_i (1 - (a a_i - 1))
	//	= 2 a_i - a a_i^2
	// a a_{i+1} = 2 a a_i - a^2 a_i^2
	//	== 2 a a_i - (b_i 2^{2^i} + 1)^2
	//	== 2 (b_i 2^{2^i} + 1) -
	//		(b_i^2 2^{2^{i+1}} + 2 b_i 2^{2^i} + 1)
	//	== 1 (mod 2^{2^{i+1}})
#if defined(__x86_64__)
	mov rbx, rax		// b' = a
	mov rsi, rax		// t = a_0
	mul rbx			// a' = a a_i
	mov rcx, rax		// c = a a_i
	sub rax, 2		// a' = a a_i - 2
	neg rax			// a' = 2 - a a_i
	mul rsi			// a_{i+1} = a_i (2 - a a_i)
				//	= 2 a_i - a a_i^2
	mov rsi, rax		// t = a_{i+1}
	ja 0b			// no -- iterate
#elif defined(__i386__)
	mov ebx, eax		// b' = a
	mov esi, eax		// t = a_0
	mul ebx			// a' = a a_i
	mov ecx, eax		// c = a a_i
	sub eax, 2		// a' = a a_i - 2
	jb 9f			// done if < 2
	neg eax			// a' = 2 - a a_i
	mul esi			// a_{i+1} = a_i (2 - a a_i)
				//	= 2 a_i - a a_i^2
	mov esi, eax		// t = a_{i+1}
	jmp 0b			// and iterate
9:	mov eax, esi		// restore
#elif defined(__arm__)
	mov r1, r0		// b' = a
	mul r2, r0, r1		// c = a a_i
	rsbs r2, r2, #2		// c = 2 - a a_i
	mul r0, r0, r2		// a_{i+1} = a_i (2 - a a_i)
				//	= 2 a_i - a a_i^2
#elif defined(__aarch64__)
	mov x1, x0		// b' = a
	mov x16, #2		// because we have no rsb
	mul x2, x0, x1		// c = a a_i
	subs x2, x16, x2	// c = 2 - a a_i
	mul x0, x0, x2		// a_{i+1} = a_i (2 - a a_i)
				//	= 2 a_i - a a_i^2
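	// the lifting loop in c (sketch; invented name): five doublings
	// take the precision from 2^3 up past 2^64
	//
	//	#include <stdint.h>
	//	uint64_t inv64(uint64_t a) {	/* a odd */
	//	  uint64_t x = a;		/* a x == 1 (mod 8) */
	//	  for (int i = 0; i < 5; i++) x *= 2 - a*x;
	//	  return x;			/* a x == 1 (mod 2^64) */
	//	}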
	// a poor approximation to pi/4
	// think of x and y as being in 16.16 fixed-point format. we sample
	// points in the unit square, and determine how many of them are
	// within a unit quarter-circle centred at the origin. the area of
	// the quarter-circle is pi/4.
#if defined(__x86_64__)
	xor eax, eax		// a = 0
	shl rcx, 0x20		// c =~ 4 billion
0:	movzx rbx, cx		// x = low 16 bits of c
	imul rbx, rbx		// b = x^2
	ror rcx, 0x10		// switch halves of c
	movzx rdx, cx		// y = high 16 bits of c
	imul rdx, rdx		// d = y^2
	rol rcx, 0x10		// switch back
	add rbx, rdx		// r^2 = x^2 + y^2
	shr rbx, 0x20		// r^2 >= 1?
	cmp rbx, 1		// set cf iff r^2 < 1
	adc rax, 0		// and add onto accumulator
#elif defined(__i386__)
	// this is actually better done in 32 bits. the carry has the wrong
	// sense here, so instead deduct one for each point outside the
	// quarter-circle rather than adding one for each point inside it.
	add ebx, edx		// see?
#elif defined(__arm__)
0:	uxth r1, r2, ror #0
	uxth r3, r2, ror #16
	cmn r1, r3		// mlas doesn't set cf usefully
#elif defined(__aarch64__)
0:	ubfx w1, w2, #0, #16
	ubfx w3, w2, #16, #16
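	// the estimator in c (sketch; invented name; walks the whole
	// 2^32-point lattice, so it's slow):
	//
	//	#include <stdint.h>
	//	uint64_t quadrant(void) {
	//	  uint64_t n = 0;
	//	  for (uint64_t c = 1ull << 32; c--; ) {
	//	    uint64_t x = c & 0xffff, y = c >> 16;
	//	    n += (x*x + y*y) >> 32 == 0;   /* r^2 < 1.0 in 32.32 */
	//	  }
	//	  return n;			   /* n/2^32 =~ pi/4 */
	//	}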
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
///--------------------------------------------------------------------------
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
#if defined(__x86_64__)
#elif defined(__i386__)
#elif defined(__arm__)
#elif defined(__aarch64__)
///----- That's all, folks --------------------------------------------------