[xchg-rax-rax] / xchg.S
1 /// -*- mode: asm; asm-comment-char: 0 -*-
2
3 ///--------------------------------------------------------------------------
4 /// Preliminaries.
5
6 #include <sys/syscall.h>
7
8 #if defined(__i386__) || defined(__x86_64__)
9
10 .intel_syntax noprefix
11
12 #elif defined(__arm__)
13
14 .macro ret
15 bx r14
16 .endm
17
18 .arch armv7-a
19
20 #elif defined(__aarch64__)
21
22 .macro cmov rd, rn, cc
23 csel \rd, \rn, \rd, \cc
24 .endm
25 #define _COND(_) \
26 _(eq) _(ne) _(cs) _(cc) _(vs) _(vc) _(mi) _(pl) \
27 _(ge) _(lt) _(gt) _(le) _(hi) _(ls) _(al) _(nv) \
28 _(hs) _(lo)
29 #define _INST(_) \
30 _(ccmp) _(ccmn) \
31 _(csel) _(cmov) \
32 _(csinc) _(cinc) _(cset) \
33 _(csneg) _(cneg) \
34 _(csinv) _(cinv) _(csetm)
35 #define _CONDVAR(cc) _definstvar cc;
36 #define _INSTVARS(inst) \
37 .macro _definstvar cc; \
38 .macro inst.\cc args:vararg; inst \args, \cc; .endm; \
39 .endm; \
40 _COND(_CONDVAR); \
41 .purgem _definstvar;
42 _INST(_INSTVARS)
43 #undef _COND
44 #undef _INST
45 #undef _CONDVAR
46 #undef _INSTVARS
47
48 #define CCMP_N 8
49 #define CCMP_Z 4
50 #define CCMP_C 2
51 #define CCMP_V 1
52
53 #define CCMP_MI CCMP_N
54 #define CCMP_PL 0
55 #define CCMP_EQ CCMP_Z
56 #define CCMP_NE 0
57 #define CCMP_CS CCMP_C
58 #define CCMP_HS CCMP_C
59 #define CCMP_CC 0
60 #define CCMP_LO 0
61 #define CCMP_VS CCMP_V
62 #define CCMP_VC 0
63 #define CCMP_HI CCMP_C
64 #define CCMP_LS 0
65 #define CCMP_LT CCMP_N
66 #define CCMP_GE 0
67 #define CCMP_LE CCMP_N
68 #define CCMP_GT 0
69
70 #else
71 # error "not supported"
72 #endif
73
74 .macro proc name
75 .globl \name
76 .type \name, STT_FUNC
77 .p2align 4
78 \name\():
79 .macro endproc
80 .size \name, . - \name
81 .purgem endproc
82 .endm
83 .endm
84
85 .macro ch c
86 #if defined(__i386__)
87
88 pushf
89 push eax
90 push ebx
91 push ecx
92 push edx
93 push ebp
94 mov ebp, esp
95 and esp, -16
96
97 push \c
98 call putchar@plt
99
100 call get_pc_ebx
101 add ebx, offset _GLOBAL_OFFSET_TABLE_
102 mov eax, [ebx + stdout@GOT]
103 mov eax, [eax]
104 call fflush@plt
105
106 mov esp, ebp
107 pop ebp
108 pop edx
109 pop ecx
110 pop ebx
111 pop eax
112 popf
113
114 #elif defined(__x86_64__)
115
116 pushf
117 push rax
118 push rcx
119 push rdx
120 push rsi
121 push rdi
122 push r8
123 push r9
124 push rbp
125 mov rbp, rsp
126 and rsp, -16
127
128 mov rdi, \c
129 call putchar@plt
130
131 mov rdi, [rip + stdout]
132 call fflush@plt
133
134 mov rsp, rbp
135 pop rbp
136 pop r9
137 pop r8
138 pop rdi
139 pop rsi
140 pop rdx
141 pop rcx
142 pop rax
143 popf
144
145 #elif defined(__arm__)
146
147 stmfd r13!, {r0-r4, r12, r14}
148
149 mov r4, r13
150 bic r14, r4, #15
151 mov r13, r14
152
153 mov r0, #\c
154 bl putchar@plt
155
156 ldr r14, .L$_c$gotoff$\@
157 .L$_c$gotpc$\@:
158 add r14, pc, r14
159 b .L$_c$cont$\@
160 .L$_c$gotoff$\@:
161 .word _GLOBAL_OFFSET_TABLE_ - .L$_c$gotpc$\@ - 8
162 .L$_c$cont$\@:
163 bl fflush@plt
164
165 mov r13, r4
166 ldmfd r13!, {r0-r4, r12, r14}
167
168 #elif defined(__aarch64__)
169
170 sub sp, sp, #20*8
171 stp x0, x1, [sp, #0]
172 stp x2, x3, [sp, #16]
173 stp x4, x5, [sp, #32]
174 stp x6, x7, [sp, #48]
175 stp x8, x9, [sp, #64]
176 stp x10, x11, [sp, #80]
177 stp x12, x13, [sp, #96]
178 stp x14, x15, [sp, #112]
179 stp x16, x17, [sp, #128]
180 mrs x16, nzcv
181 stp x16, x30, [sp, #144]
182
183 mov w0, #\c
184 bl putchar
185 adrp x0, :got:stdout
186 ldr x0, [x0, #:got_lo12:stdout]
187 ldr x0, [x0]
188 bl fflush
189
190 ldp x16, x30, [sp, #144]
191 msr nzcv, x16
192 ldp x16, x17, [sp, #128]
193 ldp x14, x15, [sp, #112]
194 ldp x12, x13, [sp, #96]
195 ldp x10, x11, [sp, #80]
196 ldp x8, x9, [sp, #64]
197 ldp x6, x7, [sp, #48]
198 ldp x4, x5, [sp, #32]
199 ldp x2, x3, [sp, #16]
200 ldp x0, x1, [sp, #0]
201 add sp, sp, #20*8
202
203 #else
204 # error "not supported"
205 #endif
206 .endm
207
208 .macro notimpl
209 #if defined(__i386__) || defined(__x86_64__)
210 ud2
211 #elif defined(__arm__)
212 udf
213 #elif defined(__aarch64__)
214 hlt #0
215 #else
216 # error "not supported"
217 #endif
218 .endm
219
220 .section .note.GNU-stack, "", %progbits
221
222 .text
223
224 #if defined(__i386__)
225 get_pc_ebx:
226 mov ebx, [esp]
227 ret
228 #endif
229
230
231 proc call_example
232
233 #if defined(__i386__)
234
235 push ebx // ebx
236 push esi // esi, ebx
237 push edi // edi, esi, ebx
238 push ebp // flags, ebp, ..., ebx
239 pushf
240
241 mov edi, [esp + 4*6]
242 mov esi, [esp + 4*7]
243 push esi // regs, flags, ebp, ..., ebx
244
245 call get_pc_ebx
246 lea eax, [ebx + 9f - .]
247 push eax // cont, regs, flags, ebp, ..., ebx
248 push edi // func, cont, regs, flags, ebp, ..., ebx
249
250 mov eax, [esi + 28]
251 pushf
252 pop ecx
253 and eax, 0x0cd5
254 and ecx, ~0x0cd5
255 or eax, ecx
256 push eax
257 popf
258 mov eax, [esi + 0]
259 mov ebx, [esi + 4]
260 mov ecx, [esi + 8]
261 mov edx, [esi + 12]
262 mov edi, [esi + 20]
263 mov ebp, [esi + 24]
264 mov esi, [esi + 16]
265
266 ret // -> func; regs, flags, ebp, ..., ebx
267
268 9: pushf // eflags, regs, flags, ebp, ..., ebx
269 push esi // esi, eflags, regs, flags, ebp, ..., ebx
270 mov esi, [esp + 8]
271 mov [esi + 0], eax
272 mov [esi + 4], ebx
273 mov [esi + 8], ecx
274 mov [esi + 12], edx
275 mov [esi + 20], edi
276 mov [esi + 24], ebp
277 pop eax // rflags, regs, flags, ebp, ..., ebx
278 mov [esi + 16], eax
279 pop eax // regs, flags, ebp, ..., ebx
280 mov [esi + 28], eax
281
282 add esp, 4 // flags, ebp, ..., ebx
283 popf // ebp, ..., ebx
284 pop ebp // ..., ebx
285 pop edi
286 pop esi
287 pop ebx //
288 ret
289
290 #elif defined(__x86_64__)
291
292 push rbx // rbx
293 push r10
294 push r11
295 push r12
296 push r13
297 push r14
298 push r15
299 push rbp // flags, rbp, ..., rbx
300 pushf
301
302 push rsi // regs, flags, rbp, ..., rbx
303
304 lea rax, [rip + 9f]
305 push rax // cont, regs, flags, rbp, ..., rbx
306 push rdi // func, cont, regs, flags, rbp, ..., rbx
307
308 mov rax, [rsi + 8*15]
309 pushf
310 pop rcx
311 and rax, 0x0cd5
312 and rcx, ~0x0cd5
313 or rax, rcx
314 push rax
315 popf
316 mov rax, [rsi + 0]
317 mov rbx, [rsi + 8]
318 mov rcx, [rsi + 16]
319 mov rdx, [rsi + 24]
320 mov rdi, [rsi + 40]
321 mov rbp, [rsi + 48]
322 mov r8, [rsi + 56]
323 mov r9, [rsi + 64]
324 mov r10, [rsi + 72]
325 mov r11, [rsi + 80]
326 mov r12, [rsi + 88]
327 mov r13, [rsi + 96]
328 mov r14, [rsi + 104]
329 mov r15, [rsi + 112]
330 mov rsi, [rsi + 32]
331
332 ret // -> func; regs, flags, rbp, ..., rbx
333
334 9: pushf // rflags, regs, flags, rbp, ..., rbx
335 push rsi // rsi, rflags, regs, flags, rbp, ..., rbx
336 mov rsi, [rsp + 16]
337 mov [rsi + 0], rax
338 mov [rsi + 8], rbx
339 mov [rsi + 16], rcx
340 mov [rsi + 24], rdx
341 mov [rsi + 40], rdi
342 mov [rsi + 48], rbp
343 mov [rsi + 56], r8
344 mov [rsi + 64], r9
345 mov [rsi + 72], r10
346 mov [rsi + 80], r11
347 mov [rsi + 88], r12
348 mov [rsi + 96], r13
349 mov [rsi + 104], r14
350 mov [rsi + 112], r15
351 pop rax // rflags, regs, flags, rbp, ..., rbx
352 mov [rsi + 32], rax
353 pop rax // regs, flags, rbp, ..., rbx
354 mov [rsi + 120], rax
355
356 add rsp, 8 // flags, rbp, ..., rbx
357 popf // rbp, ..., rbx
358 pop rbp // ..., rbx
359 pop r15
360 pop r14
361 pop r13
362 pop r12
363 pop r11
364 pop r10
365 pop rbx //
366 ret
367
368 #elif defined(__arm__)
369
370 stmfd r13!, {r0, r1, r4-r11, r14}
371 ldmia r1, {r0-r12, r14}
372 msr cpsr, r14
373 mov r14, pc
374 ldr pc, [r13], #4
375 ldr r14, [r13], #4
376 stmia r14!, {r0-r12}
377 mrs r0, cpsr
378 str r0, [r14]
379 ldmfd r13!, {r4-r11, pc}
380
381 #elif defined(__aarch64__)
382
383 stp x29, x30, [sp, #-14*8]!
384 mov x29, sp
385 stp x19, x20, [sp, #16]
386 stp x21, x22, [sp, #32]
387 stp x23, x24, [sp, #48]
388 stp x25, x26, [sp, #64]
389 stp x27, x28, [sp, #80]
390 str x1, [sp, #104]
391
392 ldp x29, x30, [x1, #224]
393 msr nzcv, x30
394 mov x30, x0
395 ldp x27, x28, [x1, #208]
396 ldp x25, x26, [x1, #192]
397 ldp x23, x24, [x1, #176]
398 ldp x21, x22, [x1, #160]
399 ldp x19, x20, [x1, #144]
400 ldp x16, x17, [x1, #128]
401 ldp x14, x15, [x1, #112]
402 ldp x12, x13, [x1, #96]
403 ldp x10, x11, [x1, #80]
404 ldp x8, x9, [x1, #64]
405 ldp x6, x7, [x1, #48]
406 ldp x4, x5, [x1, #32]
407 ldp x2, x3, [x1, #16]
408 ldp x0, x1, [x1, #0]
409
410 blr x30
411
412 ldr x30, [sp, #104]
413 stp x27, x28, [x30, #208]
414 stp x25, x26, [x30, #192]
415 stp x23, x24, [x30, #176]
416 stp x21, x22, [x30, #160]
417 stp x19, x20, [x30, #144]
418 stp x16, x17, [x30, #128]
419 stp x14, x15, [x30, #112]
420 stp x12, x13, [x30, #96]
421 stp x10, x11, [x30, #80]
422 stp x8, x9, [x30, #64]
423 stp x6, x7, [x30, #48]
424 stp x4, x5, [x30, #32]
425 stp x2, x3, [x30, #16]
426 stp x0, x1, [x30, #0]
427 mov x0, x30
428 mrs x30, nzcv
429 stp x29, x30, [x0, #224]
430
431 ldp x19, x20, [sp, #16]
432 ldp x21, x22, [sp, #32]
433 ldp x23, x24, [sp, #48]
434 ldp x25, x26, [sp, #64]
435 ldp x27, x28, [sp, #80]
436 ldp x29, x30, [sp], #14*8
437
438 ret
439
440 #else
441 # error "not supported"
442 #endif
443
444 endproc
445
446 proc nop
447
448 ret
449
450 endproc
451
452 ///--------------------------------------------------------------------------
453 /// 0x00--0x0f
454
455 proc x00
456
457 // clear all 64 bits of extended traditional registers
458
459 #if defined(__x86_64__)
460
461 xor eax, eax // clear rax
462 lea rbx, [0] // rbx -> _|_
463 loop . // iterate, decrement rcx until zero
464 mov rdx, 0 // set rdx = 0
465 and esi, 0 // clear all bits of rsi
466 sub edi, edi // set rdi = edi - edi = 0
467 push 0
468 pop rbp // pop 0 into rbp
469
470 #elif defined(__i386__)
471
472 xor eax, eax
473 lea ebx, [0]
474 loop .
475 mov edx, 0
476 and esi, 0
477 sub edi, edi
478 push 0
479 pop ebp
480
481 #elif defined(__arm__)
482
483 eor r0, r0, r0
484 rsb r1, r1, r1
485 0: subs r2, r2, #1
486 bne 0b
487 mov r3, #0
488 and r4, r4, #0
489 sub r5, r5, r5
490
491 #elif defined(__aarch64__)
492
493 eor w0, w0, w0
494 mov w1, wzr
495 0: sub w2, w2, #1
496 cbnz w2, 0b
497 mov w3, #0
498 and w4, w4, wzr
499 sub w5, w5, w5
500
501 #else
502 notimpl
503 #endif
504
505 ret
506
507 endproc
508
509 proc x01
510
511 // advance a fibonacci pair by c steps
512 //
513 // on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1}
514 // and f_{i+c}, where f_{i+1} = f_i + f_{i-1}
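// for instance, with c = 3 and (a, d) = (f_3, f_2) = (2, 1), the
// xadd loop gives (3, 2), (5, 3), (8, 5) = (f_6, f_5).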
515
516 #if defined(__x86_64__)
517
518 0: xadd rax, rdx // a, d = a + d, a
519 // = f_{i+1} + f_i, f_{i+1}
520 // = f_{i+2}, f_{i+1}
521 loop 0b // advance i, decrement c, iterate
522
523 #elif defined(__i386__)
524
525 0: xadd eax, edx
526 loop 0b
527
528 #elif defined(__arm__)
529
530 0: subs r2, r2, #2
531 add r3, r3, r0
532 blo 8f
533 add r0, r0, r3
534 bhi 0b
535
536 8: movne r0, r3
537
538 #elif defined(__aarch64__)
539
540 0: subs x2, x2, #2
541 add x3, x3, x0
542 b.lo 8f
543 add x0, x0, x3
544 b.hi 0b
545
546 8: cmov.ne x0, x3
547
548 #else
549 notimpl
550 #endif
551
552 ret
553
554 endproc
555
556 proc x02
557
558 // boolean canonify a: if a = 0 on entry, leave it zero; otherwise
559 // set a = 1
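// (a quick check of the neg/sbb/neg trick: a = 5 gives -5 with cf
// set, then a - a - cf = -1, then 1; a = 0 leaves cf clear and
// every step yields 0.)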
560
561 #if defined(__x86_64__)
562
563 neg rax // set cf iff a /= 0
564 sbb rax, rax // a = a - a - cf = -cf
565 neg rax // a = cf
566
567 #elif defined(__i386__)
568
569 neg eax
570 sbb eax, eax
571 neg eax
572
573 #elif defined(__arm__)
574
575 movs r1, r0 // the easy way
576 movne r1, #1 // mvnne r1, #1 for mask
577
578 cmp r0, #1 // clear cf iff a == 0
579 sbc r2, r0, r0 // c' = a - a - 1 + cf = cf - 1
580 add r2, r2, #1 // c' = cf
581
582 sub r3, r0, r0, lsr #1 // d' top bit clear; d' = 0 iff a = 0
583 rsb r3, r3, #0 // d' top bit set iff a /= 0
584 mov r3, r3, lsr #31 // asr for mask
585
586 rsbs r0, r0, #0
587 sbc r0, r0, r0
588 rsb r0, r0, #0
589
590 #elif defined(__aarch64__)
591
592 cmp x0, #0 // trivial
593 cset.ne x1 // csetm for mask
594
595 cmp xzr, x0 // set cf iff a == 0
596 sbc x2, x0, x0 // c' = a - a - 1 + cf = cf - 1
597 neg x2, x2 // c' = 1 - cf
598
599 sub x3, x0, x0, lsr #1 // if a < 2^63 then d' = ceil(a/2) <
600 // 2^63
601 // if a >= 2^63, write a = 2^63 + t
602 // with t < 2^63; d' = 2^63 - 2^62 +
603 // ceil(t/2) = 2^62 + ceil(t/2), and
604 // ceil(t/2) < 2^62
605 // anyway d' < 2^63 and d' = 0 iff
606 // a = 0
607 neg x3, x3 // d' top bit set iff a /= 0
608 lsr x3, x3, #63 // asr for mask
609
610 cmp x0, #1 // set cf iff a /= 0
611 adc x0, xzr, xzr // a' = 0 + 0 + cf = cf
612
613 #else
614 notimpl
615 #endif
616
617 ret
618
619 endproc
620
621 proc x03
622
623 // set a = min(a, d) (unsigned); clobber c, d
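// e.g., a = 7, d = 3: d' = -4 with a borrow, so c = -1 and
// c AND d' = -4, giving a' = 7 - 4 = 3; with a = 3, d = 7 there is
// no borrow, c = 0, and a is left alone.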
624
625 #if defined(__x86_64__)
626
627 sub rdx, rax // d' = d - a; set cf if a > d
628 sbb rcx, rcx // c = -cf = -[a > d]
629 and rcx, rdx // c = a > d ? d - a : 0
630 add rax, rcx // a' = a > d ? d : a
631
632 #elif defined(__i386__)
633
634 sub edx, eax
635 sbb ecx, ecx
636 and ecx, edx
637 add eax, ecx
638
639 #elif defined(__arm__)
640
641 cmp r0, r3 // the easy way
642 movlo r1, r0 // only needed for out-of-place
643 movhs r1, r3
644
645 subs r3, r3, r0
646 sbc r12, r12, r12
647 and r12, r12, r3
648 add r0, r0, r12
649
650 #elif defined(__aarch64__)
651
652 cmp x0, x3 // the easy way
653 csel.lo x1, x0, x3
654
655 subs x3, x3, x0 // d' = d - a; set cf if d >= a
656 sbc x16, xzr, xzr // t = -1 + cf = -[a > d]
657 and x16, x16, x3 // t = a > d ? d - a : 0
658 add x0, x0, x16 // a' = a > d ? d : a
659
660 #else
661 notimpl
662 #endif
663
664 ret
665
666 endproc
667
668 proc x04
669
670 // switch case?
671
672 #if defined(__x86_64__)
673
674 // unrelated playing
675 mov ecx, eax
676 mov rbx, -1
677 mov edx, ecx
678 sub edx, '0'
679 cmp edx, 10
680 cmovb rbx, rdx
681 or ecx, 0x20
682 mov edx, ecx
683 sub edx, 'a'
684 sub ecx, 'a' - 10
685 cmp edx, 6
686 cmovb rbx, rcx
687
688 xor al, 0x20
689
690 #elif defined(__i386__)
691
692 // unrelated playing
693 mov ecx, eax
694 mov ebx, -1
695 mov edx, ecx
696 sub edx, '0'
697 cmp edx, 10
698 cmovb ebx, edx
699 or ecx, 0x20
700 mov edx, ecx
701 sub edx, 'a'
702 sub ecx, 'a' - 10
703 cmp edx, 6
704 cmovb ebx, ecx
705
706 xor al, 0x20
707
708 #elif defined(__arm__)
709
710 // unrelated playing
711 mvn r1, #0
712 sub r12, r0, #'0'
713 cmp r12, #10
714 movlo r1, r12
715 orr r12, r0, #0x20
716 sub r12, r12, #'a'
717 cmp r12, #6
718 addlo r1, r12, #10
719
720 eor r0, r0, #0x20
721
722 #elif defined(__aarch64__)
723
724 // unrelated playing
725 mov x1, #-1
726 sub w16, w0, #'0'
727 cmp w16, #10
728 cmov.lo x1, x16
729 orr w16, w0, #0x20
730 sub w16, w16, #'a' - 10
731 cmp w16, #10
732 ccmp.hs w16, #16, #CCMP_HS
733 cmov.lo x1, x16
734
735 eor w0, w0, #0x20
736
737 #else
738 notimpl
739 #endif
740
741 ret
742
743 endproc
744
745 proc x05
746
747 // answer whether 5 <= a </<= 9.
748
749 #if defined(__x86_64__)
750
751 sub rax, 5 // a' = a - 5
752 cmp rax, 4 // is a' - 5 </<= 4?
753
754 // cc a' a
755 //
756 // z/e a' = 4 a = 9
757 // nz/ne a' /= 4 a /= 9
758 //
759 // a/nbe a' > 4 a > 9 or a < 5
760 // nc/ae/nb a' >= 4 a >= 9 or a < 5
761 // c/b/nae a' < 4 5 <= a < 9
762 // be/na a' <= 4 5 <= a <= 9
763 //
764 // o a' < -2^63 + 4 -2^63 + 5 <= a < -2^63 + 9
765 // no a' >= -2^63 + 4 a >= -2^63 + 9 or
766 // a < -2^63 + 5
767 // s -2^63 + 4 <= a' < 4 -2^63 + 9 <= a < 9
768 // ns a' < -2^63 + 4 or a < -2^63 + 9 or a >= 9
769 // a' >= 4
770 // ge/nl a' >= 4 a >= 9 or a < -2^63 + 5
771 // l/nge a' < 4 -2^63 + 5 <= a < 9
772 // g/nle a' > 4 a > 9 or a < -2^63 + 5
773 // le/ng a' <= 4 -2^63 + 5 <= a <= 9
774
775 #elif defined(__i386__)
776
777 sub eax, 5
778 cmp eax, 4
779
780 #elif defined(__arm__)
781
782 // i dimly remember having a slick way to do this way back in the
783 // day, but i can't figure it out any more.
784 sub r0, #5
785 cmp r0, #4
786
787 #elif defined(__aarch64__)
788
789 // literal translation is too obvious
790 cmp x0, #5
791 ccmp.hs x0, #9, #CCMP_HS
792
793 #else
794 notimpl
795 #endif
796
797 ret
798
799 endproc
800
801 proc x06
802
803 // leave a unchanged, but set zf if a = 0, cf if a /= 0, clear of,
804 // set sf to msb(a)
805
806 #if defined(__x86_64__)
807
808 not rax // a' = -a - 1
809 inc rax // a' = -a
810 neg rax // a' = a
811
812 #elif defined(__i386__)
813
814 not eax
815 inc eax
816 neg eax
817
818 #elif defined(__arm__)
819
820 mvn r0, r0
821 add r0, r0, #1
822 rsbs r0, r0, #0 // cf has opposite sense
823
824 #elif defined(__aarch64__)
825
826 mvn x0, x0
827 add x0, x0, #1
828 negs x0, x0 // cf has opposite sense
829
830 #else
831 notimpl
832 #endif
833
834 ret
835
836 endproc
837
838 proc x07
839
840 // same as before (?)
841
842 #if defined(__x86_64__)
843
844 inc rax // a' = a + 1
845 neg rax // a' = -a - 1
846 inc rax // a' = -a
847 neg rax // a' = a
848
849 #elif defined(__i386__)
850
851 inc eax
852 neg eax
853 inc eax
854 neg eax
855
856 #elif defined(__arm__)
857
858 add r0, r0, #1
859 rsb r0, r0, #0
860 add r0, r0, #1
861 rsbs r0, r0, #0
862
863 #elif defined(__aarch64__)
864
865 add x0, x0, #1
866 neg x0, x0
867 add x0, x0, #1
868 negs x0, x0 // cf has opposite sense
869
870 #else
871 notimpl
872 #endif
873
874 ret
875
876 endproc
877
878 proc x08
879
880 // floor((a + d)/2), correctly handling overflow conditions; final cf
881 // is lsb(a + d), probably uninteresting
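// (worked overflow case: a = d = 2^63; the true mean is 2^63, but
// a plain add/shr sees only the wrapped sum 0; keeping the carry
// as a 65th bit and shifting it back in recovers 2^63.)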
882
883 #if defined(__x86_64__)
884
885 add rax, rdx // cf || a' = a + d
886 rcr rax, 1 // shift 65-bit result right by one
887 // place; lsb moves into carry
888
889 #elif defined(__i386__)
890
891 add eax, edx
892 rcr eax, 1
893
894 #elif defined(__arm__)
895
896 // like the two-instruction a64 version
897 sub r1, r3, r0
898 add r1, r0, r1, lsr #1
899
900 // the slick version, similar to the above
901 adds r0, r0, r3
902 mov r0, r0, rrx
903
904 #elif defined(__aarch64__)
905
906 // a64 lacks a32's rrx. literal translation.
907 adds x1, x0, x3 // cf || a' = a + d
908 adc x16, xzr, xzr // realize cf in extra register
909 extr x1, x16, x1, #1 // shift down one place
910
911 // two instruction version: clobbers additional register. (if you
912 // wanted the answer in any other register, even overwriting d, then
913 // this is unnecessary.) also depends on d >= a.
914 sub x16, x3, x0 // compute difference
915 add x0, x0, x16, lsr #1 // add half of it (rounded down)
916
917 #else
918 notimpl
919 #endif
920
921 ret
922
923 endproc
924
925 proc x09
926
927 // a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3
928 // (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8).
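// e.g., a = 12: the shift leaves 1 with cf = 1 (bit 2 was set),
// and the adc rounds up to 2; a = 11 leaves 1 with cf = 0, so
// floor(11/8) = 1 stands.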
929
930 #if defined(__x86_64__)
931
932 shr rax, 3 // a' = floor(a/8); cf = 1 if a ==
933 // 4, 5, 6, 7 (mod 8)
934 adc rax, 0 // a' = floor(a/8) + cf
935
936 #elif defined(__i386__)
937
938 shr eax, 3
939 adc eax, 0
940
941 #elif defined(__arm__)
942
943 movs r0, r0, lsr #3
944 adc r0, r0, #0
945
946 #elif defined(__aarch64__)
947
948 tst x0, #4
949 orr x0, xzr, x0, lsr #3
950 cinc.ne x0, x0
951
952 #else
953 notimpl
954 #endif
955
956 ret
957
958 endproc
959
960 proc x0a
961
962 // increment c-byte little-endian bignum at rdi
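// e.g., bytes ff ff 02 (the value 0x02ffff) become 00 00 03 as the
// carry ripples up.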
963
964 #if defined(__x86_64__)
965
966 add byte ptr [rdi], 1
967 0: inc rdi
968 adc byte ptr [rdi], 0
969 loop 0b
970
971 #elif defined(__i386__)
972
973 add byte ptr [edi], 1
974 0: inc edi
975 adc byte ptr [edi], 0
976 loop 0b
977
978 #elif defined(__arm__)
979
980 mov r12, #256 // set initial carry
981 0: ldrb r0, [r5]
982 subs r2, r2, #1
983 add r12, r0, r12, lsr #8
984 strb r12, [r5], #1
985 bne 0b
986
987 #elif defined(__aarch64__)
988
989 mov w17, #256 // set initial carry
990 0: ldrb w16, [x5]
991 sub x2, x2, #1
992 add w17, w16, w17, lsr #8
993 strb w17, [x5], #1
994 cbnz x2, 0b
995
996 #else
997 notimpl
998 #endif
999
1000 ret
1001
1002 endproc
1003
1004 proc x0b
1005
1006 // negate double-precision d:a
1007
1008 #if defined(__x86_64__)
1009
1010 not rdx // d' = -d - 1
1011 neg rax // a' = -a;
1012 // cf = 1 iff a /= 0
1013 sbb rdx, -1 // d' = -d - cf
1014
1015 #elif defined(__i386__)
1016
1017 not edx
1018 neg eax
1019 sbb edx, -1
1020
1021 #elif defined(__arm__)
1022
1023 // reverse subtract is awesome
1024 rsbs r0, r0, #0
1025 rsc r3, r3, #0
1026
1027 #elif defined(__aarch64__)
1028
1029 // easy way: everything is better with zero registers.
1030 negs x0, x0
1031 ngc x3, x3
1032
1033 #else
1034 notimpl
1035 #endif
1036
1037 ret
1038
1039 endproc
1040
1041 proc x0c
1042
1043 // rotate is distributive over xor.
1044
1045 #if defined(__x86_64__)
1046
1047 // rax // = a_1 || a_0
1048 // rbx // = b_1 || b_0
1049 mov rcx, rax // = a_1 || a_0
1050
1051 xor rcx, rbx // = (a_1 XOR b_1) || (a_0 XOR b_0)
1052 ror rcx, 0xd // = (a_0 XOR b_0) || (a_1 XOR b_1)
1053
1054 ror rax, 0xd // = a_0 || a_1
1055 ror rbx, 0xd // = b_0 || b_1
1056 xor rax, rbx // = (a_0 XOR b_0) || (a_1 XOR b_1)
1057
1058 cmp rax, rcx // always equal
1059
1060 #elif defined(__i386__)
1061
1062 mov ecx, eax // = a_1 || a_0
1063
1064 xor ecx, ebx // = (a_1 XOR b_1) || (a_0 XOR b_0)
1065 ror ecx, 0xd // = (a_0 XOR b_0) || (a_1 XOR b_1)
1066
1067 ror eax, 0xd // = a_0 || a_1
1068 ror ebx, 0xd // = b_0 || b_1
1069 xor eax, ebx // = (a_0 XOR b_0) || (a_1 XOR b_1)
1070
1071 cmp eax, ecx // always equal
1072
1073 #elif defined(__arm__)
1074
1075
1076 // r0 // = a_1 || a_0
1077 // r1 // = b_1 || b_0
1078 eor r2, r0, r1 // = (a_1 XOR b_1) || (a_0 XOR b_0)
1079 mov r2, r2, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
1080
1081 mov r1, r1, ror #13 // = b_0 || b_1
1082 eor r0, r1, r0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
1083
1084 cmp r0, r2 // always equal
1085
1086 #elif defined(__aarch64__)
1087
1088 // x0 // = a_1 || a_0
1089 // x1 // = b_1 || b_0
1090 eor x2, x0, x1 // = (a_1 XOR b_1) || (a_0 XOR b_0)
1091 ror x2, x2, #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
1092
1093 ror x1, x1, #13 // = b_0 || b_1
1094 eor x0, x1, x0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
1095
1096 cmp x0, x2 // always equal
1097
1098 #else
1099 notimpl
1100 #endif
1101
1102 ret
1103
1104 endproc
1105
1106 proc x0d
1107
1108 // and is distributive over xor.
1109
1110 #if defined(__x86_64__)
1111
1112 mov rdx, rbx // = b
1113
1114 xor rbx, rcx // = b XOR c
1115 and rbx, rax // = a AND (b XOR c)
1116
1117 and rdx, rax // = a AND b
1118 and rax, rcx // = a AND c
1119 xor rax, rdx // = (a AND b) XOR (a AND c)
1120 // = a AND (b XOR c)
1121
1122 cmp rax, rbx // always equal
1123
1124 #elif defined(__i386__)
1125
1126 mov edx, ebx // = b
1127
1128 xor ebx, ecx // = b XOR c
1129 and ebx, eax // = a AND (b XOR c)
1130
1131 and edx, eax // = a AND b
1132 and eax, ecx // = a AND c
1133 xor eax, edx // = (a AND b) XOR (a AND c)
1134 // = a AND (b XOR c)
1135
1136 cmp eax, ebx // always equal
1137
1138 #elif defined(__arm__)
1139
1140 and r3, r0, r1 // = a AND b
1141
1142 eor r1, r1, r2 // = b XOR c
1143 and r1, r1, r0 // = a AND (b XOR c)
1144
1145 and r0, r0, r2 // = a AND c
1146 eor r0, r0, r3 // = (a AND b) XOR (a AND c)
1147 // = a AND (b XOR c)
1148
1149 cmp r0, r1 // always equal
1150
1151 #elif defined(__aarch64__)
1152
1153 and x3, x0, x1 // = a AND b
1154
1155 eor x1, x1, x2 // = b XOR c
1156 and x1, x1, x0 // = a AND (b XOR c)
1157
1158 and x0, x0, x2 // = a AND c
1159 eor x0, x0, x3 // = (a AND b) XOR (a AND c)
1160 // = a AND (b XOR c)
1161
1162 cmp x0, x1 // always equal
1163
1164 #else
1165 notimpl
1166 #endif
1167
1168 ret
1169
1170 endproc
1171
1172 proc x0e
1173
1174 // de morgan's law
1175
1176 #if defined(__x86_64__)
1177
1178 mov rcx, rax // = a
1179
1180 and rcx, rbx // = a AND b
1181 not rcx // = NOT (a AND b)
1182
1183 not rax // = NOT a
1184 not rbx // = NOT b
1185 or rax, rbx // = (NOT a) OR (NOT b)
1186 // = NOT (a AND b)
1187
1188 cmp rax, rcx // always equal
1189
1190 #elif defined(__i386__)
1191
1192 mov ecx, eax // = a
1193
1194 and ecx, ebx // = a AND b
1195 not ecx // = NOT (a AND b)
1196
1197 not eax // = NOT a
1198 not ebx // = NOT b
1199 or eax, ebx // = (NOT a) OR (NOT b)
1200 // = NOT (a AND b)
1201
1202 cmp eax, ecx // always equal
1203
1204 #elif defined(__arm__)
1205
1206 and r2, r0, r1 // = a AND b
1207 mvn r2, r2 // = NOT (a AND b)
1208
1209 mvn r0, r0 // = NOT a
1210 mvn r1, r1 // = NOT b
1211 orr r0, r0, r1 // = (NOT a) OR (NOT b)
1212
1213 cmp r0, r2 // always equal
1214
1215 #elif defined(__aarch64__)
1216
1217 and x2, x0, x1 // = a AND b
1218 mvn x2, x2 // = NOT (a AND b)
1219
1220 mvn x0, x0 // = NOT a
1221 orn x0, x0, x1 // = (NOT a) OR (NOT b)
1222
1223 cmp x0, x2 // always equal
1224
1225 #else
1226 notimpl
1227 #endif
1228
1229 ret
1230
1231 endproc
1232
1233 proc x0f
1234
1235 // replace input buffer bytes with cumulative XORs with initial a;
1236 // final a is XOR of all buffer bytes and initial a.
1237 //
1238 // not sure why you'd do this.
1239
1240 #if defined(__x86_64__)
1241
1242 0: xor [rsi], al
1243 lodsb
1244 loop 0b
1245
1246 #elif defined(__i386__)
1247
1248 0: xor [esi], al
1249 lodsb
1250 loop 0b
1251
1252 #elif defined(__arm__)
1253
1254 0: ldrb r12, [r4]
1255 subs r2, r2, #1
1256 eor r0, r0, r12
1257 strb r0, [r4], #1
1258 bne 0b
1259
1260 #elif defined(__aarch64__)
1261
1262 0: ldrb w16, [x4]
1263 sub x2, x2, #1
1264 eor w0, w0, w16
1265 strb w0, [x4], #1
1266 cbnz x2, 0b
1267
1268 #else
1269 notimpl
1270 #endif
1271
1272 ret
1273
1274 endproc
1275
1276 ///--------------------------------------------------------------------------
1277 /// 0x10--0x1f
1278
1279 proc x10
1280
1281 // four different ways to swap a pair of registers.
1282
1283 #if defined(__x86_64__)
1284
1285 push rax
1286 push rcx
1287 pop rax
1288 pop rcx
1289
1290 xor rax, rcx
1291 xor rcx, rax
1292 xor rax, rcx
1293
1294 add rax, rcx
1295 sub rcx, rax
1296 add rax, rcx
1297 neg rcx
1298
1299 xchg rax, rcx
1300
1301 #elif defined(__i386__)
1302
1303 push eax
1304 push ecx
1305 pop eax
1306 pop ecx
1307
1308 xor eax, ecx
1309 xor ecx, eax
1310 xor eax, ecx
1311
1312 add eax, ecx
1313 sub ecx, eax
1314 add eax, ecx
1315 neg ecx
1316
1317 xchg eax, ecx
1318
1319 #elif defined(__arm__)
1320
1321 stmfd r13!, {r0, r2}
1322 ldr r0, [r13, #4]
1323 ldr r2, [r13], #8
1324
1325 eor r0, r0, r2
1326 eor r2, r2, r0
1327 eor r0, r0, r2
1328
1329 sub r0, r0, r2
1330 add r2, r2, r0
1331 rsb r0, r0, r2 // don't need 3-addr with reverse-sub
1332
1333 mov r12, r0
1334 mov r0, r2
1335 mov r2, r12
1336
1337 #elif defined(__aarch64__)
1338
1339 // anything you can do
1340 stp x0, x2, [sp, #-16]!
1341 ldp x2, x0, [sp], #16
1342
1343 eor x0, x0, x2
1344 eor x2, x2, x0
1345 eor x0, x0, x2
1346
1347 // the add/sub/add thing was daft. you can do it in three if you're
1348 // clever -- and have three-address operations.
1349 sub x0, x0, x2
1350 add x2, x2, x0
1351 sub x0, x2, x0
1352
1353 // but we lack a fourth. we can't do this in fewer than three
1354 // instructions without hitting memory. only `ldp' will modify two
1355 // registers at a time, so we need at least two instructions -- but
1356 // if the first one sets one of our two registers to its final value
1357 // then we lose the other input value with no way to recover it, so
1358 // we must either write a fresh third register, or write something
1359 // other than the final value, and in both cases we need a third
1360 // instruction to fix everything up. we've done the wrong-something-
1361 // other trick twice, so here's the captain-obvious use-a-third-
1362 // register version.
1363 mov x16, x0
1364 mov x0, x2
1365 mov x2, x16
1366
1367 #else
1368 notimpl
1369 #endif
1370
1371 ret
1372
1373 endproc
1374
1375 proc x11
1376
1377 // assuming a is initialized to zero, set a to the inclusive or of
1378 // the xor-differences of corresponding bytes in the c-byte strings
1379 // at si and di.
1380 //
1381 // in particular, a will be zero (and zf set) if and only if the two
1382 // strings are equal.
1383
1384 #if defined(__x86_64__)
1385
1386 0: mov dl, [rsi]
1387 xor dl, [rdi]
1388 inc rsi
1389 inc rdi
1390 or al, dl
1391 loop 0b
1392
1393 #elif defined(__i386__)
1394
1395 0: mov dl, [esi]
1396 xor dl, [edi]
1397 inc esi
1398 inc edi
1399 or al, dl
1400 loop 0b
1401
1402 #elif defined(__arm__)
1403
1404 0: ldrb r1, [r4], #1
1405 ldrb r12, [r5], #1
1406 subs r2, r2, #1
1407 eor r12, r12, r1
1408 orr r0, r0, r12
1409 bne 0b
1410
1411 #elif defined(__aarch64__)
1412
1413 0: ldrb w16, [x4], #1
1414 ldrb w17, [x5], #1
1415 sub x2, x2, #1
1416 eor w16, w16, w17
1417 orr w0, w0, w16
1418 cbnz x2, 0b
1419
1420 #else
1421 notimpl
1422 #endif
1423
1424 ret
1425
1426 endproc
1427
1428 proc x12
1429
1430 // an obtuse way of adding two registers. for any bit position, a
1431 // OR d is set if and only if at least one of a and d has a bit set
1432 // in that position, and a AND d is set if and only if both have a
1433 // bit set in that position. essentially, then, what we've done is
1434 // move all of the set bits in d to a, unless there's already a bit
1435 // there. this clearly doesn't change the sum.
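// e.g., a = 0b0110, d = 0b0011: a AND d = 0b0010, a OR d = 0b0111,
// and 0b0111 + 0b0010 = 0b1001 = 9 = 6 + 3.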
1436
1437 #if defined(__x86_64__)
1438
1439 mov rcx, rdx // c' = d
1440 and rdx, rax // d' = a AND d
1441 or rax, rcx // a' = a OR d
1442 add rax, rdx
1443
1444 #elif defined(__i386__)
1445
1446 mov ecx, edx // c' = d
1447 and edx, eax // d' = a AND d
1448 or eax, ecx // a' = a OR d
1449 add eax, edx
1450
1451 #elif defined(__arm__)
1452
1453 and r2, r0, r3 // c' = a AND d
1454 orr r0, r0, r3 // a' = a OR d
1455 add r0, r0, r2
1456
1457 #elif defined(__aarch64__)
1458
1459 and x2, x0, x3 // c' = a AND d
1460 orr x0, x0, x3 // a' = a OR d
1461 add x0, x0, x2
1462
1463 #else
1464 notimpl
1465 #endif
1466
1467 ret
1468
1469 endproc
1470
1471 proc x13
1472
1473 // ok, so this is a really obtuse way of adding a and b; the result
1474 // is in a and d. but why does it work?
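// (because a + b = (a XOR b) + 2 (a AND b): each pass replaces a by
// the carry-less bitwise sum and b by the carries shifted up one
// place, preserving a + b; the carries can only move towards the
// msb, so after at most 64 passes b = 0 and a holds the sum.)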
1475
1476 #if defined(__x86_64__)
1477
1478 mov rcx, 0x40 // carry chains at most 64 long
1479 0: mov rdx, rax // copy a'
1480 xor rax, rbx // low bits of each bitwise sum
1481 and rbx, rdx // carry bits from each bitwise sum
1482 shl rbx, 1 // carry them into next position
1483 loop 0b
1484
1485 #elif defined(__i386__)
1486
1487 mov ecx, 0x40 // carry chains at most 64 long
1488 0: mov edx, eax // copy a'
1489 xor eax, ebx // low bits of each bitwise sum
1490 and ebx, edx // carry bits from each bitwise sum
1491 shl ebx, 1 // carry them into next position
1492 loop 0b
1493
1494 #elif defined(__arm__)
1495
1496 mov r2, #0x40
1497 0: and r3, r0, r1
1498 subs r2, r2, #1
1499 eor r0, r0, r1
1500 lsl r1, r3, #1
1501 bne 0b
1502
1503 #elif defined(__aarch64__)
1504
1505 mov x2, #0x40
1506 0: and x3, x0, x1
1507 sub x2, x2, #1
1508 eor x0, x0, x1
1509 lsl x1, x3, #1
1510 cbnz x2, 0b
1511
1512 #else
1513 notimpl
1514 #endif
1515
1516 ret
1517
1518 endproc
1519
1520 proc x14
1521
1522 // floor((a + d)/2), like x08.
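// same identity as x13: a + d = (a XOR d) + 2 (a AND d), so
// floor((a + d)/2) = ((a XOR d) >> 1) + (a AND d), with no
// overflow to worry about.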
1523
1524 #if defined(__x86_64__)
1525
1526 mov rcx, rax // copy a for later
1527 and rcx, rdx // carry bits
1528
1529 xor rax, rdx // low bits of each bitwise sum
1530 shr rax, 1 // divide by 2; carries now in place
1531
1532 add rax, rcx // add the carries; done
1533
1534 #elif defined(__i386__)
1535
1536 mov ecx, eax // copy a for later
1537 and ecx, edx // carry bits
1538
1539 xor eax, edx // low bits of each bitwise sum
1540 shr eax, 1 // divide by 2; carries now in place
1541
1542 add eax, ecx // add the carries; done
1543
1544 #elif defined(__arm__)
1545
1546 and r2, r0, r3
1547 eor r0, r0, r3
1548 add r0, r2, r0, lsr #1
1549
1550 #elif defined(__aarch64__)
1551
1552 and x2, x0, x3
1553 eor x0, x0, x3
1554 add x0, x2, x0, lsr #1
1555
1556 #else
1557 notimpl
1558 #endif
1559
1560 ret
1561
1562 endproc
1563
1564 proc x15
1565
1566 // sign extension 32 -> 64 bits.
1567
1568 #if defined(__x86_64__)
1569
1570 movsx rbx, eax // like this?
1571
1572 mov rdx, 0xffffffff80000000
1573 add rax, rdx // if bit 31 of a is set then bits
1574 // 31--63 of a' are clear; otherwise,
1575 // these bits are all set -- which is
1576 // exactly backwards
1577 xor rax, rdx // so fix it
1578
1579 #elif defined(__i386__)
1580
1581 movsx ebx, ax // like this?
1582
1583 mov edx, 0xffff8000
1584 add eax, edx // if bit 15 of a is set then bits
1585 // 15--31 of a' are clear; otherwise,
1586 // these bits are all set -- which is
1587 // exactly backwards
1588 xor eax, edx // so fix it
1589
1590 #elif defined(__arm__)
1591
1592 sxth r1, r0 // like this
1593
1594 mov r12, #0x80000000
1595 add r0, r0, r12, asr #16
1596 eor r0, r0, r12, asr #16
1597
1598 #elif defined(__aarch64__)
1599
1600 sxtw x1, w0 // like this
1601
1602 mov x16, #0xffffffff80000000
1603 add x0, x0, x16
1604 eor x0, x0, x16
1605
1606 #else
1607 notimpl
1608 #endif
1609
1610 ret
1611
1612 endproc
1613
1614 proc x16
1615
1616 // ??? i don't know why you'd want to calculate this.
1617
1618 #if defined(__x86_64__)
1619
1620 xor rax, rbx // a' = a XOR b
1621 xor rbx, rcx // b' = b XOR c
1622 mov rsi, rax // t = a XOR b
1623 add rsi, rbx // t = (a XOR b) + (b XOR c)
1624 cmovc rax, rbx // a' = cf ? b XOR c : a XOR b
1625 xor rax, rbx // a' = cf ? 0 : a XOR c
1626 cmp rax, rsi
1627
1628 #elif defined(__i386__)
1629
1630 xor eax, ebx // a' = a XOR b
1631 xor ebx, ecx // b' = b XOR c
1632 mov esi, eax // t = a XOR b
1633 add esi, ebx // t = (a XOR b) + (b XOR c)
1634 cmovc eax, ebx // a' = cf ? b XOR c : a XOR b
1635 xor eax, ebx // a' = cf ? 0 : a XOR c
1636 cmp eax, esi
1637
1638 #elif defined(__arm__)
1639
1640 eor r0, r0, r1
1641 eor r1, r1, r2
1642 adds r4, r0, r1
1643 movcs r0, r1
1644 eor r0, r0, r1
1645 cmp r0, r4
1646
1647 #elif defined(__aarch64__)
1648
1649 eor x0, x0, x1
1650 eor x1, x1, x2
1651 adds x4, x0, x1
1652 cmov.cs x0, x1
1653 eor x0, x0, x1
1654 cmp x0, x4
1655
1656 #else
1657 notimpl
1658 #endif
1659
1660 ret
1661
1662 endproc
1663
1664 proc x17
1665
1666 // absolute value
1667
1668 #if defined(__x86_64__)
1669
1670 cqo // d = a < 0 ? -1 : 0
1671 xor rax, rdx // a' = a < 0 ? -a - 1 : a
1672 sub rax, rdx // a' = a < 0 ? -a : a
1673
1674 #elif defined(__i386__)
1675
1676 cdq // d = a < 0 ? -1 : 0
1677 xor eax, edx // a' = a < 0 ? -a - 1 : a
1678 sub eax, edx // a' = a < 0 ? -a : a
1679
1680 #elif defined(__arm__)
1681
1682 // direct approach
1683 movs r1, r0
1684 rsbmi r1, r0, #0
1685
1686 // faithful-ish conversion
1687 eor r3, r0, r0, asr #31
1688 sub r0, r3, r0, asr #31
1689
1690 #elif defined(__aarch64__)
1691
1692 // direct approach
1693 tst x0, #1 << 63
1694 cneg.ne x1, x0
1695
1696 // faithful-ish conversion
1697 eor x3, x0, x0, asr #63
1698 sub x0, x3, x0, asr #63
1699
1700 #else
1701 notimpl
1702 #endif
1703
1704 ret
1705
1706 endproc
1707
1708 proc x18
1709
1710 // should always set sf, clear zf, unless we get rescheduled to a
1711 // different core.
1712
1713 #if defined(__x86_64__)
1714
1715 rdtsc // d || a = cycles
1716 shl rdx, 0x20
1717 or rax, rdx // a = cycles
1718 mov rcx, rax // c = cycles
1719
1720 rdtsc // d || a = cycles'
1721 shl rdx, 0x20
1722 or rax, rdx // a = cycles'
1723
1724 cmp rcx, rax
1725
1726 #elif defined(__i386__)
1727
1728 rdtsc // d || a = cycles
1729 mov ebx, eax
1730 mov ecx, edx // c || b = cycles
1731
1732 rdtsc // d || a = cycles'
1733
1734 sub ebx, eax
1735 sbb ecx, edx
1736
1737 #elif defined(__arm__)
1738
1739 // cycle clock not available in user mode
1740 mrrc p15, 0, r0, r1, c9
1741 mrrc p15, 0, r2, r3, c9
1742 subs r0, r0, r2
1743 sbcs r1, r1, r3
1744
1745 #elif defined(__aarch64__)
1746
1747 // cycle clock not available in user mode
1748 mrs x0, pmccntr_el0
1749 mrs x1, pmccntr_el0
1750 cmp x0, x1
1751
1752 #else
1753 notimpl
1754 #endif
1755
1756 ret
1757
1758 endproc
1759
1760 proc x19
1761
1762 // stupid way to capture a pointer to inline data and jump past it.
1763 // confuses the return-address predictor something chronic. worse
1764 // because amd64 calling convention doesn't usually pass arguments on
1765 // the stack.
1766
1767 #if defined(__x86_64__)
1768
1769 call 8f
1770 .string "hello world!\n\0"
1771 8: call print_str
1772 add rsp, 8
1773 ret
1774
1775 print_str:
1776 // actually implement this ridiculous thing
1777 mov rsi, [rsp + 8]
1778 xor edx, edx
1779 0: mov al, [rsi + rdx]
1780 inc rdx
1781 cmp al, 0
1782 jnz 0b
1783 mov eax, SYS_write
1784 mov edi, 1
1785 dec rdx
1786 syscall // clobbers r11 :-(
1787 ret
1788
1789 #elif defined(__i386__)
1790
1791 call 8f
1792 .string "hello world!\n\0"
1793 8: call print_str
1794 add esp, 4
1795 ret
1796
1797 print_str:
1798 // actually implement this ridiculous thing
1799 mov ecx, [esp + 4]
1800 xor edx, edx
1801 0: mov al, [ecx + edx]
1802 inc edx
1803 cmp al, 0
1804 jnz 0b
1805 mov eax, SYS_write
1806 mov ebx, 1
1807 dec edx
1808 int 0x80
1809 ret
1810
1811 #elif defined(__arm__)
1812
1813 // why am i doing this?
1814 stmfd r13!, {r14}
1815 bl 8f
1816 .string "hello world!\n\0"
1817 .balign 4
1818 8: mov r1, r14 // might as well make it easy on myself
1819 bl print_str
1820 ldmfd r13!, {pc}
1821
1822 print_str:
1823 mov r2, #0
1824 0: ldrb r0, [r1, r2]
1825 cmp r0, #0
1826 addne r2, r2, #1
1827 bne 0b
1828 mov r0, #1
1829 mov r7, #SYS_write
1830 swi 0
1831 bx r14
1832
1833 #elif defined(__aarch64__)
1834
1835 // why am i doing this?
1836 str x30, [sp, #-16]!
1837 bl 8f
1838 .string "hello world!\n\0"
1839 .balign 4
1840 8: mov x1, x30 // might as well make it easy on myself
1841 bl print_str
1842 ldr x30, [sp], #16
1843 ret
1844
1845 print_str:
1846 mov x2, #0
1847 0: ldrb w0, [x1, x2]
1848 cmp w0, #0
1849 cinc.ne x2, x2
1850 b.ne 0b
1851 mov x0, #1
1852 mov x8, #SYS_write
1853 svc #0
1854 ret
1855
1856 #else
1857 notimpl
1858 #endif
1859
1860 endproc
1861
1862 proc x1a
1863
1864 // collect the current instruction-pointer address. this was an old
1865 // 32-bit i386 trick for position-independent code, but (a) it
1866 // confuses the return predictor, and (b) amd64 has true pc-relative
1867 // addressing.
1868
1869 #if defined(__x86_64__)
1870
1871 // the actual example
1872 call 0f
1873 0: pop rax
1874
1875 // the modern i386 trick doesn't confuse the return-address
1876 // predictor.
1877 call calladdr_rbx
1878 sub rbx, . - 0b
1879
1880 // but rip-relative addressing is even better
1881 lea rcx, [rip + 0b]
1882
1883 ret
1884
1885 calladdr_rbx:
1886 mov rbx, [rsp]
1887 ret
1888
1889 #elif defined(__i386__)
1890
1891 // the actual example
1892 call 0f
1893 0: pop eax
1894
1895 // the modern i386 trick doesn't confuse the return-address
1896 // predictor.
1897 call get_pc_ebx
1898 sub ebx, . - 0b
1899
1900 ret
1901
1902 #elif defined(__arm__)
1903
1904 stmfd r13!, {r14}
1905
1906 bl 0f
1907 0: mov r0, r14
1908
1909 bl return
1910 sub r1, r14, #. - 0b
1911
1912 adr r2, 0b
1913
1914 ldmfd r13!, {pc}
1915
1916 return: bx r14
1917
1918 #elif defined(__aarch64__)
1919
1920 str x30, [sp, #-16]!
1921
1922 // we can do all of the above using a64
1923 bl 0f
1924 0: mov x0, x30
1925
1926 bl return
1927 sub x1, x30, #. - 0b
1928
1929 adr x2, 0b
1930
1931 ldr x30, [sp], #16
1932 return: ret
1933
1934 #else
1935 notimpl
1936 #endif
1937
1938 endproc
1939
1940 proc x1b
1941
1942 #if defined(__x86_64__)
1943
1944 // retpolines: an mitigation against adversarially influenced
1945 // speculative execution at indirect branches. if an adversary can
1946 // prepare a branch-target buffer entry matching an indirect branch
1947 // in the victim's address space then they can cause the victim to
1948 // /speculatively/ (but not architecturally) execute any code in
1949 // their address space, possibly leading to leaking secrets through
1950 // the cache. retpolines aren't susceptible to this because the
1951 // predicted destination address is from the return-prediction stack
1952 // which the adversary can't prime. the performance penalty is still
1953 // essentially a branch misprediction -- for this return, and
1954 // possibly all others already stacked.
1955
1956 // (try not to crash)
1957 lea rax, [rip + 9f]
1958
1959 push rax
1960 9: ret
1961
1962 #elif defined(__i386__)
1963
1964 call get_pc_ebx
1965 lea eax, [ebx + 9f - .]
1966
1967 push eax
1968 9: ret
1969
1970 #elif defined(__arm__)
1971
1972 stmfd r13!, {r14}
1973
1974 adr r14, 8f
1975 bx r14
1976
1977 8: ldmfd r13!, {pc}
1978
1979 #elif defined(__aarch64__)
1980
1981 str x30, [sp, #-16]!
1982
1983 adr x30, 8f
1984 ret
1985
1986 8: ldr x30, [sp], #16
1987 ret
1988
1989 #else
1990 notimpl
1991 #endif
1992
1993 endproc
1994
1995 proc x1c
1996
1997 // ok, having a hard time seeing a use for this. the most important
1998 // thing to note is that sp is set from `pop' /after/ it's
1999 // incremented.
2000
2001 #if defined(__x86_64__)
2002
2003 // try not to crash
2004 mov rax, rsp
2005 and rsp, -16
2006 push rax
2007
2008 pop rsp
2009
2010 // check it worked
2011 mov rbx, rsp
2012 ret
2013
2014 #elif defined(__i386__)
2015
2016 // try not to crash
2017 mov eax, esp
2018 and esp, -16
2019 push eax
2020
2021 pop esp
2022
2023 // check it worked
2024 mov ebx, esp
2025 ret
2026
2027 #elif defined(__arm__)
2028
2029 // not even going to dignify this
2030 notimpl
2031
2032 #elif defined(__aarch64__)
2033
2034 // not even going to dignify this
2035 notimpl
2036
2037 #else
2038 notimpl
2039 #endif
2040
2041 endproc
2042
2043 proc x1d
2044
2045 // monumentally cheesy way to copy 8 n bytes from buff1 to buff2.
2046 // also clobbers words at buff2 + 8 n and buff2 - 8 for good measure.
2047
2048 n = 4
2049
2050 #if defined(__x86_64__)
2051
2052 mov rax, rsp // safekeeping
2053
2054 // we're toast if we get hit by a signal now. fingers crossed...
2055 .if 0
2056 mov rsp, buff2 + 8*n + 8
2057 mov rbp, buff1 + 8*n
2058 .else
2059 lea rsp, [rdi + 8*n + 16]
2060 lea rbp, [rsi + 8*n]
2061 .endif
2062 enter 0, n + 1
2063
2064 // precise action:
2065 //
2066 // +---------+ +---------+
2067 // rbp -> | ??? | rsp -> | ??? |
2068 // +---------+ +---------+
2069 // | w_{n-1} | | rbp | <- rbp'
2070 // +---------+ +---------+
2071 // | ... | | w_{n-1} |
2072 // +---------+ +---------+
2073 // | w_1 | | ... |
2074 // +---------+ +---------+
2075 // | w_0 | | w_1 |
2076 // +---------+ +---------+
2077 // | w_0 |
2078 // +---------+
2079 // | rbp' | <- rsp'
2080 // +---------+
2081
2082 mov rdx, rsp
2083 mov rsp, rax
2084
2085 #elif defined(__i386__)
2086
2087 mov eax, esp // safekeeping
2088
2089 // we're toast if we get hit by a signal now. fingers crossed...
2090 .if 0
2091 mov esp, buff2 + 4*n + 4
2092 mov ebp, buff1 + 4*n
2093 .else
2094 lea esp, [edi + 4*n + 8]
2095 lea ebp, [esi + 4*n]
2096 .endif
2097 enter 0, n + 1
2098
2099 mov edx, esp
2100 mov esp, eax
2101
2102 #elif defined(__arm__)
2103
2104 add r4, r4, #4*n
2105 add r5, r5, #4*n + 8
2106
2107 str r4, [r5, #-4]!
2108 .rept n/2
2109 ldrd r0, r1, [r4, #-8]!
2110 strd r0, r1, [r5, #-8]!
2111 .endr
2112 add r4, r5, #4*n
2113 str r4, [r5, #-4]!
2114
2115 #elif defined(__aarch64__)
2116
2117 // omgwtf. let's not actually screw with the stack pointer.
2118
2119 add x4, x4, #8*n
2120 add x5, x5, #8*n + 16
2121
2122 str x4, [x5, #-8]!
2123 .rept n/2
2124 ldp x16, x17, [x4, #-16]!
2125 stp x16, x17, [x5, #-16]!
2126 .endr
2127 add x4, x5, #8*n
2128 str x4, [x5, #-8]!
2129
2130 #else
2131 notimpl
2132 #endif
2133
2134 ret
2135
2136 endproc
2137
2138 proc x1e
2139
2140 // convert nibble value to (uppercase) hex; other input values yield
2141 // nonsense.
2142
2143 #if defined(__x86_64__)
2144
2145 // das doesn't work in 64-bit mode; best i can come up with
2146 mov edx, eax
2147 add al, '0'
2148 add dl, 'A' - 10
2149 cmp al, '9' + 1
2150 cmovae eax, edx
2151
2152 #elif defined(__i386__)
2153
2154 cmp al, 0x0a // cf = 1 iff a < 10
2155 sbb al, 0x69 // if 0 <= a < 10, a' = a - 0x6a, so
2156 // 0x96 <= a' < 0xa0, setting af, cf
2157 // if 10 <= a < 16, a' = a - 0x69, so
2158 // 0xa1 <= a' < 0xa7, setting cf but
2159 // clearing af
2160 das // if 0 <= a < 10, then af and cf are
2161 // both set, so subtract 0x66
2162 // from a' leaving 0x30 <= a' < 0x3a;
2163 // if 10 <= a < 16 then af clear but
2164 // cf set, so subtract 0x60 from a'
2165 // leaving 0x41 <= a' < 0x47
2166
2167 #elif defined(__arm__)
2168
2169 // significantly less tricksy
2170 cmp r0, #10
2171 addlo r0, r0, #'0'
2172 addhs r0, r0, #'A' - 10
2173
2174 #elif defined(__aarch64__)
2175
2176 // with less versatile conditional execution this is the best we can
2177 // do
2178 cmp w0, #10
2179 add w16, w0, #'A' - 10
2180 add w0, w0, #'0'
2181 cmov.hs w0, w16
2182
2183 #else
2184 notimpl
2185 #endif
2186
2187 ret
2188
2189 endproc
2190
2191 proc x1f
2192
2193 // verify collatz conjecture starting at a; assume a /= 0!
2194
2195 #if defined(__x86_64__)
2196
2197 0: bsf rcx, rax // clobber c if a = 0
2198 shr rax, cl // a = 2^c a'
2199 cmp rdx, 0
2200 je 1f
2201 stosq
2202 dec rdx
2203 1:
2204 cmp rax, 1 // done?
2205 je 9f
2206 lea rax, [2*rax + rax + 1] // a' = 3 a' + 1
2207 jmp 0b // again
2208
2209 9: ret
2210
2211 #elif defined(__i386__)
2212
2213 0: bsf ecx, eax // clobber c if a = 0
2214 shr eax, cl // a = 2^c a'
2215 cmp edx, 0
2216 je 1f
2217 stosd
2218 dec edx
2219 1:
2220 cmp eax, 1 // done?
2221 je 9f
2222 lea eax, [2*eax + eax + 1] // a' = 3 a' + 1
2223 jmp 0b // again
2224
2225 9: ret
2226
2227 #elif defined(__arm__)
2228
2229 // rbit introduced in armv7
2230 0: rbit r2, r0
2231 clz r2, r2
2232 mov r0, r0, lsr r2 // a = 2^c a'
2233 cmp r3, #0
2234 strne r0, [r5], #4
2235 subne r3, r3, #1
2236 cmp r0, #1
2237 adcne r0, r0, r0, lsl #1 // a' = 3 a' + 1 (because c set)
2238 bne 0b
2239
2240 ret
2241
2242 #elif defined(__aarch64__)
2243
2244 0: rbit w2, w0
2245 clz w2, w2
2246 lsr w0, w0, w2 // a = 2^c a'
2247 cmp x3, #0
2248 beq 1f
2249 str x0, [x5], #8
2250 sub x3, x3, #1
2251 1:
2252 cmp w0, #1
2253 add w16, w0, w0, lsl #1 // t = 3 a' + 1 (because c set)
2254 csinc.eq w0, w0, w16
2255 b.ne 0b
2256
2257 ret
2258
2259 #else
2260 notimpl
2261 #endif
2262
2263 endproc
2264
2265 ///--------------------------------------------------------------------------
2266 /// 0x20--0x2f
2267
2268 proc x20
2269
2270 // calculate 1337 a slowly
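// (sanity check on the quick way: 3 a << 6 = 192 a; 192 a - a =
// 191 a; 191 a << 3 = 1528 a; 1528 a - 191 a = 1337 a.)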
2271
2272 #if defined(__x86_64__)
2273
2274 // original version
2275 mov rcx, rax // c = a
2276 shl rcx, 2 // c = 4 a
2277 add rcx, rax // c = 5 a
2278 shl rcx, 3 // c = 40 a
2279 add rcx, rax // c = 41 a
2280 shl rcx, 1 // c = 82 a
2281 add rcx, rax // c = 83 a
2282 shl rcx, 1 // c = 166 a
2283 add rcx, rax // c = 167 a
2284 shl rcx, 3 // c = 1336 a
2285 add rcx, rax // c = 1337 a
2286
2287 // a quick way
2288 lea rdx, [2*rax + rax] // t = 3 a
2289 shl rdx, 6 // t = 192 a
2290 sub rdx, rax // t = 191 a
2291 lea rbx, [8*rdx] // b = 1528 a
2292 sub rbx, rdx // b = 1337 a
2293
2294 #elif defined(__i386__)
2295
2296 // original version
2297 mov ecx, eax // c = a
2298 shl ecx, 2 // c = 4 a
2299 add ecx, eax // c = 5 a
2300 shl ecx, 3 // c = 40 a
2301 add ecx, eax // c = 41 a
2302 shl ecx, 1 // c = 82 a
2303 add ecx, eax // c = 83 a
2304 shl ecx, 1 // c = 166 a
2305 add ecx, eax // c = 167 a
2306 shl ecx, 3 // c = 1336 a
2307 add ecx, eax // c = 1337 a
2308
2309 // a quick way
2310 lea edx, [2*eax + eax] // t = 3 a
2311 shl edx, 6 // t = 192 a
2312 sub edx, eax // t = 191 a
2313 lea ebx, [8*edx] // b = 1528 a
2314 sub ebx, edx // b = 1337 a
2315
2316 #elif defined(__arm__)
2317
2318 // original version, ish
2319 add r2, r0, r0, lsl #2 // c = 5 a
2320 add r2, r0, r2, lsl #3 // c = 41 a
2321 add r2, r0, r2, lsl #1 // c = 83 a
2322 add r2, r0, r2, lsl #1 // c = 167 a
2323 add r2, r0, r2, lsl #3 // c = 1337 a
2324
2325 // quicker way
2326 add r1, r0, r0, lsl #1 // b = 3 a
2327 rsb r1, r0, r1, lsl #6 // b = 191 a
2328 rsb r1, r1, r1, lsl #3 // b = 1337 a
2329
2330 #elif defined(__aarch64__)
2331
2332 // original version, ish
2333 add x2, x0, x0, lsl #2 // c = 5 a
2334 add x2, x0, x2, lsl #3 // c = 41 a
2335 add x2, x0, x2, lsl #1 // c = 83 a
2336 add x2, x0, x2, lsl #1 // c = 167 a
2337 add x2, x0, x2, lsl #3 // c = 1337 a
2338
2339 // sleazy because no rsb
2340 add x1, x0, x0, lsl #1 // b = 3 a
2341 sub x1, x0, x1, lsl #6 // b = -191 a
2342 sub x1, x1, x1, lsl #3 // b = 1337 a
2343
2344 #else
2345 notimpl
2346 #endif
2347
2348 ret
2349
2350 endproc
2351
2352 proc x21
2353
2354 // multiply complex numbers a + b i and c + d i
2355 //
2356 // (a + b i) (c + d i) = (a c - b d) + (a d + b c) i
2357 //
2358 // somewhat slick approach uses only three multiplications
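// expanding the three products checks out:
// c (a + b) - b (c + d) = a c + b c - b c - b d = a c - b d
// a (d - c) + c (a + b) = a d - a c + a c + b c = a d + b c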
2359
2360 #if defined(__x86_64__)
2361
2362 mov rsi, rax // t = a
2363 add rax, rbx // a' = a + b
2364 mov rdi, rdx // u = d
2365 sub rdx, rcx // d' = d - c
2366 add rdi, rcx // u = c + d
2367
2368 imul rax, rcx // a' = c (a + b)
2369 imul rsi, rdx // t = a (d - c)
2370 imul rdi, rbx // u = b (c + d)
2371
2372 add rsi, rax // t = a (d - c) + c (a + b)
2373 mov rbx, rsi // b' = a (d - c) + c (a + b)
2374 // = a d + b c
2375 sub rax, rdi // a' = c (a + b) - b (c + d)
2376 // = a c - b d
2377
2378 #elif defined(__i386__)
2379
2380 mov esi, eax // t = a
2381 add eax, ebx // a' = a + b
2382 mov edi, edx // u = d
2383 sub edx, ecx // d' = d - c
2384 add edi, ecx // u = c + d
2385
2386 imul eax, ecx // a' = c (a + b)
2387 imul esi, edx // t = a (d - c)
2388 imul edi, ebx // u = b (c + d)
2389
2390 add esi, eax // t = a (d - c) + c (a + b)
2391 mov ebx, esi // b' = a (d - c) + c (a + b)
2392 // = a d + b c
2393 sub eax, edi // a' = c (a + b) - b (c + d)
2394 // = a c - b d
2395
2396 #elif defined(__arm__)
2397
2398 add r4, r0, r1 // t = a + b
2399 add r5, r2, r3 // u = c + d
2400 sub r3, r3, r2 // d' = d - c
2401
2402 // mls introduced in armv7
2403 mul r4, r4, r2 // t = c (a + b)
2404 mov r2, r1 // c' = a (bah!)
2405 mla r1, r0, r3, r4 // b' = a (d - c) + c (a + b)
2406 // = a d + b c
2407 mls r0, r2, r5, r4 // a' = c (a + b) - b (c + d)
2408 // = a c - b d
2409
2410 #elif defined(__aarch64__)
2411
2412 add x4, x0, x1 // t = a + b
2413 add x5, x2, x3 // u = c + d
2414 sub x3, x3, x2 // d' = d - c
2415
2416 // a64 uses madd/msub in place of a32's mla/mls
2417 mul x4, x4, x2 // t = c (a + b)
2418 mov x2, x1 // c' = a (bah!)
2419 madd x1, x0, x3, x4 // b' = a (d - c) + c (a + b)
2420 // = a d + b c
2421 msub x0, x2, x5, x4 // a' = c (a + b) - b (c + d)
2422 // = a c - b d
2423
2424 #else
2425 notimpl
2426 #endif
2427
2428 ret
2429
2430 endproc
2431
2432 proc x22
2433
2434 // divide by 3
2435
2436 #if defined(__x86_64__)
2437
2438 mov rdx, 0xaaaaaaaaaaaaaaab // = ceil(2/3 2^64)
2439 mul rdx // d' || a' =~ 2/3 a 2^64
2440 shr rdx, 1 // d' = floor(a/3)
2441 mov rax, rdx // a' = floor(a/3)
2442
2443 // we start with 0 <= a < 2^64. write f = ceil(2/3 2^64), so that
2444 // 2/3 < f/2^64 < 2/3 + 1/2^64. then floor(2/3 a) <= floor(a f/2^64)
2445 // <= floor(2/3 a + a/2^64), but a < 2^64 so a/2^64 < 1 and
2446 // floor(a f/2^64) = floor(2/3 a).
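//
// quick numeric check with the 32-bit constant: f = 0xaaaaaaab and
// a = 100 give a f = 66 2^32 + 0xaaaaaacc, so d' = 66 and
// d' >> 1 = 33 = floor(100/3).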
2447
2448 #elif defined(__i386__)
2449
2450 mov edx, 0xaaaaaaab // = ceil(2/3 2^32)
2451 mul edx // d' || a' =~ 2/3 a 2^32
2452 shr edx, 1 // d' = floor(a/3)
2453 mov eax, edx // a' = floor(a/3)
2454
2455 #elif defined(__arm__)
2456
2457 ldr r12, =0xaaaaaaab
2458 umull r12, r0, r0, r12
2459 mov r0, r0, lsr #1
2460
2461 #elif defined(__aarch64__)
2462
2463 ldr x16, =0xaaaaaaaaaaaaaaab
2464 umulh x0, x0, x16
2465 lsr x0, x0, #1
2466
2467 #else
2468 notimpl
2469 #endif
2470
2471 ret
2472
2473 endproc
2474
2475 proc x23
2476
2477 #if defined(__x86_64__)
2478
2479 // main loop: shorten a preserving residue class mod 3
2480 0: cmp rax, 5
2481 jbe 8f
2482 // a > 5
2483 mov rdx, rax // d' = a
2484 shr rdx, 2 // d' = floor(a/4)
2485 and rax, 3 // a = 4 d' + a' (0 <= a' < 4)
2486 add rax, rdx // a' == a (mod 3) but a' < a/4 + 4
2487 jmp 0b
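// (the folding preserves the residue because 4 == 1 (mod 3):
// writing a = 4 q + r gives a == q + r (mod 3). e.g., a = 100
// shortens 100 -> 25 -> 7 -> 4, and the fixup below turns 4 into
// 1 = 100 mod 3.)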
2488
2489 // fix up final value 0 <= a < 6: want 0 <= a < 3
2490 //
2491 // the tricky part is actually a = 3; but the other final cases take
2492 // additional iterations which we can avoid.
2493 8: cmp rax, 3 // set cf iff a < 3
2494 cmc // set cf iff a >= 3
2495 sbb rdx, rdx // d' = a >= 3 ? -1 : 0
2496 and rdx, 3 // d' = a >= 3 ? 3 : 0
2497 sub rax, rdx // a' = a - (a >= 3 ? 3 : 0)
2498 // = a (mod 3)
2499
2500 #elif defined(__i386__)
2501
2502 // main loop: shorten a preserving residue class mod 3
2503 0: cmp eax, 5
2504 jbe 8f
2505 // a > 5
2506 mov edx, eax // d' = a
2507 shr edx, 2 // d' = floor(a/4)
2508 and eax, 3 // a = 4 d' + a' (0 <= a' < 4)
2509 add eax, edx // a' == a (mod 3) but a' < a/4 + 4
2510 jmp 0b
2511
2512 // fix up final value 0 <= a < 6: want 0 <= a < 3
2513 //
2514 // the tricky part is actually a = 3; but the other final cases take
2515 // additional iterations which we can avoid.
2516 8: cmp eax, 3 // set cf iff a < 3
2517 cmc // set cf iff a >= 3
2518 sbb edx, edx // d' = a >= 3 ? -1 : 0
2519 and edx, 3 // d' = a >= 3 ? 3 : 0
2520 sub eax, edx // a' = a - (a >= 3 ? 3 : 0)
2521 // = a (mod 3)
2522
2523 #elif defined(__arm__)
2524
2525 0: cmp r0, #6
2526 andhs r12, r0, #3
2527 addhs r0, r12, r0, lsr #2
2528 bhs 0b
2529
2530 cmp r0, #3
2531 subhs r0, r0, #3
2532
2533 #elif defined(__aarch64__)
2534
2535 0: cmp x0, #6
2536 // blunder on through regardless since this doesn't affect the result
2537 and x16, x0, #3
2538 add x0, x16, x0, lsr #2
2539 b.hs 0b
2540
2541 subs x16, x0, #3
2542 cmov.hs x0, x16
2543
2544 #else
2545 notimpl
2546 #endif
2547
2548 ret
2549
2550 endproc
2551
2552 proc x24
2553
2554 // invert (odd) a mod 2^64
2555 //
2556 // suppose a a_i == 1 (mod 2^{2^i})
2557 //
2558 // clearly good for i = 0, since 2^i = 1 and 2^{2^i} = 2, and a_0 =
2559 // a == 1 (mod 2) by assumption
2560 //
2561 // write a a_i == b_i 2^{2^i} + 1 (mod 2^{2^{i+1}})
2562 // then b_i == (a a_i - 1)/2^{2^i} (mod 2^{2^i})
2563 // to lift inverse, we want x such that a x == -b_i (mod 2^{2^i});
2564 // clearly x = -a_i b_i will do, since a a_i == 1 (mod 2^{2^i})
2565 // then:
2566 // a_{i+1} = a_i - a_i b_i 2^{2^i} = a_i (1 - (a a_i - 1))
2567 // = 2 a_i - a a_i^2
2568 //
2569 // check:
2570 // a a_{i+1} = 2 a a_i - a^2 a_i^2
2571 // == 2 a a_i - (b_i 2^{2^i} + 1)^2
2572 // == 2 (b_i 2^{2^i} + 1) -
2573 // (b_i^2 2^{2^{i+1}} + 2 b_i 2^{2^i} + 1)
2574 // == 1 (mod 2^{2^{i+1}})
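//
// tiny worked example, a = 3: a_0 = 3 and 3 a_0 = 9 == 1 (mod 2);
// one step gives a_1 = 2 a_0 - a a_0^2 = 6 - 27 = -21, and
// 3 (-21) = -63 == 1 (mod 64) -- comfortably better than the
// promised (mod 4); the precision at least doubles each time.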
2575
2576 #if defined(__x86_64__)
2577
2578 // rax // a_0 = a
2579 mov rbx, rax // b' = a
2580 mov rsi, rax // t = a_0
2581
2582 0:
2583 cmp rbp, 0
2584 je 1f
2585 stosq
2586 dec rbp
2587 1:
2588 mul rbx // a' = a a_i
2589 mov rcx, rax // c = a a_i
2590
2591 sub rax, 2 // a' = a a_i - 2
2592 neg rax // a' = 2 - a a_i
2593 mul rsi // a_{i+1} = a_i (2 - a a_i)
2594 // = 2 a_i - a a_i^2
2595 mov rsi, rax // t = a_{i+1}
2596
2597 cmp rcx, 1 // done?
2598 ja 0b // no -- iterate
2599
2600 #elif defined(__i386__)
2601
2602 // eax // a_0 = a
2603 mov ebx, eax // b' = a
2604 mov esi, eax // t = a_0
2605
2606 0:
2607 cmp ebp, 0
2608 je 1f
2609 stosd
2610 dec ebp
2611 1:
2612 mul ebx // a' = a a_i
2613 mov ecx, eax // c = a a_i
2614
2615 sub eax, 2 // a' = a a_i - 2
2616 jb 9f // done if < 2
2617 neg eax // a' = 2 - a a_i
2618 mul esi // a_{i+1} = a_i (2 - a a_i)
2619 // = 2 a_i - a a_i^2
2620 mov esi, eax // t = a_{i+1}
2621
2622 jmp 0b // and iterate
2623 9: mov eax, esi // restore
2624
2625 #elif defined(__arm__)
2626
2627 // r0 // a_0 = a
2628 mov r1, r0 // b' = a
2629
2630 0:
2631 cmp r6, #0
2632 strne r0, [r5], #4
2633 subne r6, r6, #1
2634 mul r2, r0, r1 // c = a a_i
2635 rsbs r2, r2, #2 // c = 2 - a a_i
2636 mul r0, r0, r2 // a_{i+1} = a_i (2 - a a_i)
2637 // = 2 a_i - a a_i^2
2638 blo 0b
2639
2640 #elif defined(__aarch64__)
2641
2642 // x0 // a_0 = a
2643 mov x1, x0 // b' = a
2644 mov x16, #2 // because we have no rsb
2645
2646 0:
2647 cmp x6, #0
2648 b.eq 1f
2649 str x0, [x5], #8
2650 sub x6, x6, #1
2651 1:
2652 mul x2, x0, x1 // c = a a_i
2653 subs x2, x16, x2 // c = 2 - a a_i
2654 mul x0, x0, x2 // a_{i+1} = a_i (2 - a a_i)
2655 // = 2 a_i - a a_i^2
2656 b.lo 0b
2657
2658 #else
2659 notimpl
2660 #endif
2661
2662 ret
2663
2664 endproc
2665
2666 proc x25
2667
2668 // a poor approximation to pi/4
2669 //
2670 // think of x and y as being in 16.16 fixed-point format. we sample
2671 // points in the unit square, and determine how many of them are
2672 // within a unit quarter-circle centred at the origin. the area of
2673 // the quarter-circle is pi/4.
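// (this is an exhaustive march over the whole 2^16 x 2^16 grid
// rather than a random sample, so the final count divided by 2^32
// should land close to pi/4 ~ 0.7854.)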
2674
2675 #if defined(__x86_64__)
2676
2677 xor eax, eax // a = 0
2678 mov rcx, 1
2679 shl rcx, 0x20 // c =~ 4 billion
2680
2681 0: movzx rbx, cx // x = low 16 bits of c
2682 imul rbx, rbx // b = x^2
2683
2684 ror rcx, 0x10 // switch halves of c
2685 movzx rdx, cx // y = high 16 bits of c
2686 imul rdx, rdx // d = y^2
2687 rol rcx, 0x10 // switch back
2688
2689 add rbx, rdx // r^2 = x^2 + y^2
2690 shr rbx, 0x20 // r^2 >= 1?
2691 cmp rbx, 1 // set cf iff r^2 >= 1
2692 adc rax, 0 // and add onto accumulator
2693 loop 0b
2694
2695 #elif defined(__i386__)
2696
2697 // this is actually better done in 32 bits. the carry has the wrong
2698 // sense here, so instead deduct one for each point outside the
2699 // quarter-circle rather than adding one for each point inside it.
2700 xor eax, eax
2701 xor ecx, ecx
2702
2703 0: movzx ebx, cx
2704 imul ebx, ebx
2705
2706 ror ecx, 0x10
2707 movzx edx, cx
2708 imul edx, edx
2709 rol ecx, 0x10
2710
2711 add ebx, edx // see?
2712 sbb eax, 0
2713 loop 0b
2714
2715 #elif defined(__arm__)
2716
2717 mov r0, #0
2718 mov r2, #0
2719
2720 0: uxth r1, r2, ror #0
2721 uxth r3, r2, ror #16
2722 mul r1, r1, r1
2723 mul r3, r3, r3
2724 cmn r1, r3 // mlas doesn't set cf usefully
2725 addcc r0, r0, #1
2726 adds r2, r2, #1
2727 bne 0b
2728
2729 #elif defined(__aarch64__)
2730
2731 mov w0, #0
2732 mov w2, #0
2733
2734	0:	ubfx w1, w2, #0, #16	// x = low 16 bits of c
2735	ubfx w3, w2, #16, #16	// y = high 16 bits of c
2736	sub w2, w2, #1	// step the counter down (2^32 points in all)
2737	mul w1, w1, w1	// x^2
2738	mul w3, w3, w3	// y^2
2739	cmn w1, w3	// cf set iff r^2 >= 1
2740	cinc.cc w0, w0	// count the point if it's inside
2741	cbnz w2, 0b	// until the counter returns to zero
2742
2743 #else
2744 notimpl
2745 #endif
2746
2747 ret
2748
2749 endproc
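	// the same calculation as a C sketch, for reference (a hypothetical
	// helper, not part of the original exercise; assumes <stdint.h>).
	// each 32-bit counter value c supplies one grid point: its low half
	// is x, its high half y, both read as 16.16 fixed-point fractions.
	//
	//	uint64_t quarter_circle(void)
	//	{
	//		uint64_t n = 0;
	//		uint32_t c = 0;
	//		do {
	//			uint64_t x = c & 0xffff, y = c >> 16;
	//			if (x*x + y*y < (1ull << 32)) n++; /* r^2 < 1 */
	//		} while (++c != 0);	/* all 2^32 grid points */
	//		return n;		/* ~ (pi/4) 2^32 */
	//	}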
2750
2751 proc x26
2752
2753 #if defined(__x86_64__)
2754
2755 notimpl
2756
2757 #elif defined(__i386__)
2758
2759 notimpl
2760
2761 #elif defined(__arm__)
2762
2763 notimpl
2764
2765 #elif defined(__aarch64__)
2766
2767 notimpl
2768
2769 #else
2770 notimpl
2771 #endif
2772
2773 endproc
2774
2775 proc x27
2776
2777 #if defined(__x86_64__)
2778
2779 notimpl
2780
2781 #elif defined(__i386__)
2782
2783 notimpl
2784
2785 #elif defined(__arm__)
2786
2787 notimpl
2788
2789 #elif defined(__aarch64__)
2790
2791 notimpl
2792
2793 #else
2794 notimpl
2795 #endif
2796
2797 endproc
2798
2799 proc x28
2800
2801 #if defined(__x86_64__)
2802
2803 notimpl
2804
2805 #elif defined(__i386__)
2806
2807 notimpl
2808
2809 #elif defined(__arm__)
2810
2811 notimpl
2812
2813 #elif defined(__aarch64__)
2814
2815 notimpl
2816
2817 #else
2818 notimpl
2819 #endif
2820
2821 endproc
2822
2823 proc x29
2824
2825 #if defined(__x86_64__)
2826
2827 notimpl
2828
2829 #elif defined(__i386__)
2830
2831 notimpl
2832
2833 #elif defined(__arm__)
2834
2835 notimpl
2836
2837 #elif defined(__aarch64__)
2838
2839 notimpl
2840
2841 #else
2842 notimpl
2843 #endif
2844
2845 endproc
2846
2847 proc x2a
2848
2849 #if defined(__x86_64__)
2850
2851 notimpl
2852
2853 #elif defined(__i386__)
2854
2855 notimpl
2856
2857 #elif defined(__arm__)
2858
2859 notimpl
2860
2861 #elif defined(__aarch64__)
2862
2863 notimpl
2864
2865 #else
2866 notimpl
2867 #endif
2868
2869 endproc
2870
2871 proc x2b
2872
2873 #if defined(__x86_64__)
2874
2875 notimpl
2876
2877 #elif defined(__i386__)
2878
2879 notimpl
2880
2881 #elif defined(__arm__)
2882
2883 notimpl
2884
2885 #elif defined(__aarch64__)
2886
2887 notimpl
2888
2889 #else
2890 notimpl
2891 #endif
2892
2893 endproc
2894
2895 proc x2c
2896
2897 #if defined(__x86_64__)
2898
2899 notimpl
2900
2901 #elif defined(__i386__)
2902
2903 notimpl
2904
2905 #elif defined(__arm__)
2906
2907 notimpl
2908
2909 #elif defined(__aarch64__)
2910
2911 notimpl
2912
2913 #else
2914 notimpl
2915 #endif
2916
2917 endproc
2918
2919 proc x2d
2920
2921 #if defined(__x86_64__)
2922
2923 notimpl
2924
2925 #elif defined(__i386__)
2926
2927 notimpl
2928
2929 #elif defined(__arm__)
2930
2931 notimpl
2932
2933 #elif defined(__aarch64__)
2934
2935 notimpl
2936
2937 #else
2938 notimpl
2939 #endif
2940
2941 endproc
2942
2943 proc x2e
2944
2945 #if defined(__x86_64__)
2946
2947 notimpl
2948
2949 #elif defined(__i386__)
2950
2951 notimpl
2952
2953 #elif defined(__arm__)
2954
2955 notimpl
2956
2957 #elif defined(__aarch64__)
2958
2959 notimpl
2960
2961 #else
2962 notimpl
2963 #endif
2964
2965 endproc
2966
2967 proc x2f
2968
2969 #if defined(__x86_64__)
2970
2971 notimpl
2972
2973 #elif defined(__i386__)
2974
2975 notimpl
2976
2977 #elif defined(__arm__)
2978
2979 notimpl
2980
2981 #elif defined(__aarch64__)
2982
2983 notimpl
2984
2985 #else
2986 notimpl
2987 #endif
2988
2989 endproc
2990
2991 ///--------------------------------------------------------------------------
2992 /// 0x30--0x3f
2993
2994 proc x30
2995
2996 #if defined(__x86_64__)
2997
2998 notimpl
2999
3000 #elif defined(__i386__)
3001
3002 notimpl
3003
3004 #elif defined(__arm__)
3005
3006 notimpl
3007
3008 #elif defined(__aarch64__)
3009
3010 notimpl
3011
3012 #else
3013 notimpl
3014 #endif
3015
3016 ret
3017
3018 endproc
3019
3020 proc x31
3021
3022 #if defined(__x86_64__)
3023
3024 notimpl
3025
3026 #elif defined(__i386__)
3027
3028 notimpl
3029
3030 #elif defined(__arm__)
3031
3032 notimpl
3033
3034 #elif defined(__aarch64__)
3035
3036 notimpl
3037
3038 #else
3039 notimpl
3040 #endif
3041
3042 endproc
3043
3044 proc x32
3045
3046 #if defined(__x86_64__)
3047
3048 notimpl
3049
3050 #elif defined(__i386__)
3051
3052 notimpl
3053
3054 #elif defined(__arm__)
3055
3056 notimpl
3057
3058 #elif defined(__aarch64__)
3059
3060 notimpl
3061
3062 #else
3063 notimpl
3064 #endif
3065
3066 endproc
3067
3068 proc x33
3069
3070 #if defined(__x86_64__)
3071
3072 notimpl
3073
3074 #elif defined(__i386__)
3075
3076 notimpl
3077
3078 #elif defined(__arm__)
3079
3080 notimpl
3081
3082 #elif defined(__aarch64__)
3083
3084 notimpl
3085
3086 #else
3087 notimpl
3088 #endif
3089
3090 endproc
3091
3092 proc x34
3093
3094 #if defined(__x86_64__)
3095
3096 notimpl
3097
3098 #elif defined(__i386__)
3099
3100 notimpl
3101
3102 #elif defined(__arm__)
3103
3104 notimpl
3105
3106 #elif defined(__aarch64__)
3107
3108 notimpl
3109
3110 #else
3111 notimpl
3112 #endif
3113
3114 endproc
3115
3116 proc x35
3117
3118 #if defined(__x86_64__)
3119
3120 notimpl
3121
3122 #elif defined(__i386__)
3123
3124 notimpl
3125
3126 #elif defined(__arm__)
3127
3128 notimpl
3129
3130 #elif defined(__aarch64__)
3131
3132 notimpl
3133
3134 #else
3135 notimpl
3136 #endif
3137
3138 endproc
3139
3140 proc x36
3141
3142 #if defined(__x86_64__)
3143
3144 notimpl
3145
3146 #elif defined(__i386__)
3147
3148 notimpl
3149
3150 #elif defined(__arm__)
3151
3152 notimpl
3153
3154 #elif defined(__aarch64__)
3155
3156 notimpl
3157
3158 #else
3159 notimpl
3160 #endif
3161
3162 endproc
3163
3164 proc x37
3165
3166 #if defined(__x86_64__)
3167
3168 notimpl
3169
3170 #elif defined(__i386__)
3171
3172 notimpl
3173
3174 #elif defined(__arm__)
3175
3176 notimpl
3177
3178 #elif defined(__aarch64__)
3179
3180 notimpl
3181
3182 #else
3183 notimpl
3184 #endif
3185
3186 endproc
3187
3188 proc x38
3189
3190 #if defined(__x86_64__)
3191
3192 notimpl
3193
3194 #elif defined(__i386__)
3195
3196 notimpl
3197
3198 #elif defined(__arm__)
3199
3200 notimpl
3201
3202 #elif defined(__aarch64__)
3203
3204 notimpl
3205
3206 #else
3207 notimpl
3208 #endif
3209
3210 endproc
3211
3212 proc x39
3213
3214 #if defined(__x86_64__)
3215
3216 notimpl
3217
3218 #elif defined(__i386__)
3219
3220 notimpl
3221
3222 #elif defined(__arm__)
3223
3224 notimpl
3225
3226 #elif defined(__aarch64__)
3227
3228 notimpl
3229
3230 #else
3231 notimpl
3232 #endif
3233
3234 endproc
3235
3236 proc x3a
3237
3238 #if defined(__x86_64__)
3239
3240 notimpl
3241
3242 #elif defined(__i386__)
3243
3244 notimpl
3245
3246 #elif defined(__arm__)
3247
3248 notimpl
3249
3250 #elif defined(__aarch64__)
3251
3252 notimpl
3253
3254 #else
3255 notimpl
3256 #endif
3257
3258 endproc
3259
3260 proc x3b
3261
3262 #if defined(__x86_64__)
3263
3264 notimpl
3265
3266 #elif defined(__i386__)
3267
3268 notimpl
3269
3270 #elif defined(__arm__)
3271
3272 notimpl
3273
3274 #elif defined(__aarch64__)
3275
3276 notimpl
3277
3278 #else
3279 notimpl
3280 #endif
3281
3282 endproc
3283
3284 proc x3c
3285
3286 #if defined(__x86_64__)
3287
3288 notimpl
3289
3290 #elif defined(__i386__)
3291
3292 notimpl
3293
3294 #elif defined(__arm__)
3295
3296 notimpl
3297
3298 #elif defined(__aarch64__)
3299
3300 notimpl
3301
3302 #else
3303 notimpl
3304 #endif
3305
3306 endproc
3307
3308 proc x3d
3309
3310 #if defined(__x86_64__)
3311
3312 notimpl
3313
3314 #elif defined(__i386__)
3315
3316 notimpl
3317
3318 #elif defined(__arm__)
3319
3320 notimpl
3321
3322 #elif defined(__aarch64__)
3323
3324 notimpl
3325
3326 #else
3327 notimpl
3328 #endif
3329
3330 endproc
3331
3332 proc x3e
3333
3334 #if defined(__x86_64__)
3335
3336 notimpl
3337
3338 #elif defined(__i386__)
3339
3340 notimpl
3341
3342 #elif defined(__arm__)
3343
3344 notimpl
3345
3346 #elif defined(__aarch64__)
3347
3348 notimpl
3349
3350 #else
3351 notimpl
3352 #endif
3353
3354 endproc
3355
3356 proc x3f
3357
3358 #if defined(__x86_64__)
3359
3360 notimpl
3361
3362 #elif defined(__i386__)
3363
3364 notimpl
3365
3366 #elif defined(__arm__)
3367
3368 notimpl
3369
3370 #elif defined(__aarch64__)
3371
3372 notimpl
3373
3374 #else
3375 notimpl
3376 #endif
3377
3378 endproc
3379
3380 ///----- That's all, folks --------------------------------------------------