xchg.S: Add missing `ret' in ARM64 version of `call_example'.
1 /// -*- mode: asm; asm-comment-char: 0 -*-
2
3 ///--------------------------------------------------------------------------
4 /// Preliminaries.
5
6 #include <sys/syscall.h>
7
8 #if defined(__i386__) || defined(__x86_64__)
9
10 .intel_syntax noprefix
11
12 #elif defined(__arm__)
13
14 .macro ret
15 bx r14
16 .endm
17
18 .arch armv7-a
19
20 #elif defined(__aarch64__)
21
22 .macro cmov rd, rn, cc
23 csel \rd, \rn, \rd, \cc
24 .endm
25 #define _COND(_) \
26 _(eq) _(ne) _(cs) _(cc) _(vs) _(vc) _(mi) _(pl) \
27 _(ge) _(lt) _(gt) _(le) _(hi) _(ls) _(al) _(nv) \
28 _(hs) _(lo)
29 #define _INST(_) \
30 _(ccmp) _(ccmn) \
31 _(csel) _(cmov) \
32 _(csinc) _(cinc) _(cset) \
33 _(csneg) _(cneg) \
34 _(csinv) _(cinv) _(csetm)
35 #define _CONDVAR(cc) _definstvar cc;
36 #define _INSTVARS(inst) \
37 .macro _definstvar cc; \
38 .macro inst.\cc args:vararg; inst \args, \cc; .endm; \
39 .endm; \
40 _COND(_CONDVAR); \
41 .purgem _definstvar;
42 _INST(_INSTVARS)
43 #undef _COND
44 #undef _INST
45 #undef _CONDVAR
46 #undef _INSTVARS
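        // the net effect of the machinery above is to define, for each
        // conditional instruction and each condition code, an alias with
        // the condition attached by a dot: e.g., `cmov.lo x1, x16' expands
        // to `cmov x1, x16, lo', i.e., `csel x1, x16, x1, lo', and
        // `cset.ne x1' expands to `cset x1, ne'.  the CCMP_* constants
        // below are the nzcv immediates used with the ccmp/ccmn aliases.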
47
48 #define CCMP_N 8
49 #define CCMP_Z 4
50 #define CCMP_C 2
51 #define CCMP_V 1
52
53 #define CCMP_MI CCMP_N
54 #define CCMP_PL 0
55 #define CCMP_EQ CCMP_Z
56 #define CCMP_NE 0
57 #define CCMP_CS CCMP_C
58 #define CCMP_HS CCMP_C
59 #define CCMP_CC 0
60 #define CCMP_LO 0
61 #define CCMP_VS CCMP_V
62 #define CCMP_VC 0
63 #define CCMP_HI CCMP_C
64 #define CCMP_LS 0
65 #define CCMP_LT CCMP_N
66 #define CCMP_GE 0
67 #define CCMP_LE CCMP_N
68 #define CCMP_GT 0
69
70 #else
71 # error "not supported"
72 #endif
73
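        // proc/endproc: define a global function symbol with ELF type and
        // size information.  `endproc' is defined inside `proc' so that it
        // captures the function name; it purges itself again when expanded.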
74 .macro proc name
75 .globl \name
76 .type \name, STT_FUNC
77 .p2align 4
78 \name\():
79 .macro endproc
80 .size \name, . - \name
81 .purgem endproc
82 .endm
83 .endm
84
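        // ch c: debugging helper.  print the character c and flush stdout,
        // preserving the registers it touches (and, where the target
        // version saves them, the flags) around the libc calls.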
85 .macro ch c
86 #if defined(__i386__)
87
88 pushf
89 push eax
90 push ebx
91 push ecx
92 push edx
93 push ebp
94 mov ebp, esp
95 and esp, -16
96
97 push \c
98 call putchar@plt
99
100 call get_pc_ebx
101 add ebx, offset _GLOBAL_OFFSET_TABLE_
102 mov eax, [ebx + stdout@GOT]
103 mov eax, [eax]
104 call fflush@plt
105
106 mov esp, ebp
107 pop ebp
108 pop edx
109 pop ecx
110 pop ebx
111 pop eax
112 popf
113
114 #elif defined(__x86_64__)
115
116 pushf
117 push rax
118 push rcx
119 push rdx
120 push rsi
121 push rdi
122 push r8
123 push r9
124 push rbp
125 mov rbp, rsp
126 and rsp, -16
127
128 mov rdi, \c
129 call putchar@plt
130
131 mov rdi, [rip + stdout]
132 call fflush@plt
133
134 mov rsp, rbp
135 pop rbp
136 pop r9
137 pop r8
138 pop rdi
139 pop rsi
140 pop rdx
141 pop rcx
142 pop rax
143 popf
144
145 #elif defined(__arm__)
146
147 stmfd r13!, {r0-r4, r12, r14}
148
149 mov r4, r13
150 bic r14, r4, #15
151 mov r13, r14
152
153 mov r0, #\c
154 bl putchar@plt
155
156 ldr r14, .L$_c$gotoff$\@
157 .L$_c$gotpc$\@:
158 add r14, pc, r14
159 b .L$_c$cont$\@
160 .L$_c$gotoff$\@:
161 .word _GLOBAL_OFFSET_TABLE_ - .L$_c$gotpc$\@ - 8
162 .L$_c$cont$\@:
163 bl fflush@plt
164
165 mov r13, r4
166 ldmfd r13!, {r0-r4, r12, r14}
167
168 #elif defined(__aarch64__)
169
170 sub sp, sp, #20*8
171 stp x0, x1, [sp, #0]
172 stp x2, x3, [sp, #16]
173 stp x4, x5, [sp, #32]
174 stp x6, x7, [sp, #48]
175 stp x8, x9, [sp, #64]
176 stp x10, x11, [sp, #80]
177 stp x12, x13, [sp, #96]
178 stp x14, x15, [sp, #112]
179 stp x16, x17, [sp, #128]
180 mrs x16, nzcv
181 stp x16, x30, [sp, #144]
182
183 mov w0, #\c
184 bl putchar
185 adrp x0, :got:stdout
186 ldr x0, [x0, #:got_lo12:stdout]
187 ldr x0, [x0]
188 bl fflush
189
190 ldp x16, x30, [sp, #144]
191 msr nzcv, x16
192 ldp x16, x17, [sp, #128]
193 ldp x14, x15, [sp, #112]
194 ldp x12, x13, [sp, #96]
195 ldp x10, x11, [sp, #80]
196 ldp x8, x9, [sp, #64]
197 ldp x6, x7, [sp, #48]
198 ldp x4, x5, [sp, #32]
199 ldp x2, x3, [sp, #16]
200 ldp x0, x1, [sp, #0]
201 add sp, sp, #20*8
202
203 #else
204 # error "not supported"
205 #endif
206 .endm
207
208 .macro notimpl
209 #if defined(__i386__) || defined(__x86_64__)
210 ud2
211 #elif defined(__arm__)
212 udf
213 #elif defined(__aarch64__)
214 hlt #0
215 #else
216 # error "not supported"
217 #endif
218 .endm
219
220 .section .note.GNU-stack, "", %progbits
221
222 .text
223
224 #if defined(__i386__)
225 get_pc_ebx:
226 mov ebx, [esp]
227 ret
228 #endif
229
230
231 proc call_example
232
233 #if defined(__i386__)
234
235 push ebx // ebx
236 push esi // esi, ebx
237 push edi // edi, esi, ebx
238 push ebp // flags, ebp, ..., ebx
239 pushf
240
241 mov edi, [esp + 4*6]
242 mov esi, [esp + 4*7]
243 push esi // regs, flags, ebp, ..., ebx
244
245 call get_pc_ebx
246 lea eax, [ebx + 9f - .]
247 push eax // cont, regs, flags, ebp, ..., ebx
248 push edi // func, cont, regs, flags, ebp, ..., ebx
249
250 mov eax, [esi + 28]
251 pushf
252 pop ecx
253 and eax, 0x0cd5
254 and ecx, ~0x0cd5
255 or eax, ecx
256 push eax
257 popf
258 mov eax, [esi + 0]
259 mov ebx, [esi + 4]
260 mov ecx, [esi + 8]
261 mov edx, [esi + 12]
262 mov edi, [esi + 20]
263 mov ebp, [esi + 24]
264 mov esi, [esi + 16]
265
266 ret // -> func; regs, flags, ebp, ..., ebx
267
268 9: pushf // eflags, regs, flags, ebp, ..., ebx
269 push esi // esi, eflags, regs, flags, ebp, ..., ebx
270 mov esi, [esp + 8]
271 mov [esi + 0], eax
272 mov [esi + 4], ebx
273 mov [esi + 8], ecx
274 mov [esi + 12], edx
275 mov [esi + 20], edi
276 mov [esi + 24], ebp
277 pop eax // rflags, regs, flags, ebp, ..., ebx
278 mov [esi + 16], eax
279 pop eax // regs, flags, ebp, ..., ebx
280 mov [esi + 28], eax
281
282 add esp, 4 // flags, ebp, ..., ebx
283 popf // ebp, ..., ebx
284 pop ebp // ..., ebx
285 pop edi
286 pop esi
287 pop ebx //
288 ret
289
290 #elif defined(__x86_64__)
291
292 push rbx // rbx
293 push r10
294 push r11
295 push r12
296 push r13
297 push r14
298 push r15
299 push rbp // flags, rbp, ..., rbx
300 pushf
301
302 push rsi // regs, flags, rbp, ..., rbx
303
304 lea rax, [rip + 9f]
305 push rax // cont, regs, flags, rbp, ..., rbx
306 push rdi // func, cont, regs, flags, rbp, ..., rbx
307
308 mov rax, [rsi + 8*15]
309 pushf
310 pop rcx
311 and rax, 0x0cd5
312 and rcx, ~0x0cd5
313 or rax, rcx
314 push rax
315 popf
316 mov rax, [rsi + 0]
317 mov rbx, [rsi + 8]
318 mov rcx, [rsi + 16]
319 mov rdx, [rsi + 24]
320 mov rdi, [rsi + 40]
321 mov rbp, [rsi + 48]
322 mov r8, [rsi + 56]
323 mov r9, [rsi + 64]
324 mov r10, [rsi + 72]
325 mov r11, [rsi + 80]
326 mov r12, [rsi + 88]
327 mov r13, [rsi + 96]
328 mov r14, [rsi + 104]
329 mov r15, [rsi + 112]
330 mov rsi, [rsi + 32]
331
332 ret // -> func; regs, flags, rbp, ..., rbx
333
334 9: pushf // rflags, regs, flags, rbp, ..., rbx
335 push rsi // rsi, rflags, regs, flags, rbp, ..., rbx
336 mov rsi, [rsp + 16]
337 mov [rsi + 0], rax
338 mov [rsi + 8], rbx
339 mov [rsi + 16], rcx
340 mov [rsi + 24], rdx
341 mov [rsi + 40], rdi
342 mov [rsi + 48], rbp
343 mov [rsi + 56], r8
344 mov [rsi + 64], r9
345 mov [rsi + 72], r10
346 mov [rsi + 80], r11
347 mov [rsi + 88], r12
348 mov [rsi + 96], r13
349 mov [rsi + 104], r14
350 mov [rsi + 112], r15
351 pop rax // rflags, regs, flags, rbp, ..., rbx
352 mov [rsi + 32], rax
353 pop rax // regs, flags, rbp, ..., rbx
354 mov [rsi + 120], rax
355
356 add rsp, 8 // flags, rbp, ..., rbx
357 popf // rbp, ..., rbx
358 pop rbp // ..., rbx
359 pop r15
360 pop r14
361 pop r13
362 pop r12
363 pop r11
364 pop r10
365 pop rbx //
366 ret
367
368 #elif defined(__arm__)
369
370 stmfd r13!, {r0, r1, r4-r11, r14}
371 ldmia r1, {r0-r12, r14}
372 msr cpsr, r14
373 mov r14, pc
374 ldr pc, [r13], #4
375 ldr r14, [r13], #4
376 stmia r14!, {r0-r12}
377 mrs r0, cpsr
378 str r0, [r14]
379 ldmfd r13!, {r4-r11, pc}
380
381 #elif defined(__aarch64__)
382
383 stp x29, x30, [sp, #-13*8]!
384 mov x29, sp
385 stp x19, x20, [sp, #16]
386 stp x21, x22, [sp, #32]
387 stp x23, x24, [sp, #48]
388 stp x25, x26, [sp, #64]
389 stp x27, x28, [sp, #80]
390 str x1, [sp, #96]
391
392 mov x16, x0
393
394 ldr x17, [x1, #128]
395 ldp x14, x15, [x1, #112]
396 ldp x12, x13, [x1, #96]
397 ldp x10, x11, [x1, #80]
398 ldp x8, x9, [x1, #64]
399 ldp x6, x7, [x1, #48]
400 ldp x4, x5, [x1, #32]
401 ldp x2, x3, [x1, #16]
402 ldp x0, x1, [x1, #0]
403 msr nzcv, x17
404
405 blr x16
406
407 ldr x16, [sp, #96]
408 mrs x17, nzcv
409 str x17, [x16, #128]
410 stp x14, x15, [x16, #112]
411 stp x12, x13, [x16, #96]
412 stp x10, x11, [x16, #80]
413 stp x8, x9, [x16, #64]
414 stp x6, x7, [x16, #48]
415 stp x4, x5, [x16, #32]
416 stp x2, x3, [x16, #16]
417 stp x0, x1, [x16, #0]
418
419 ldp x19, x20, [sp, #16]
420 ldp x21, x22, [sp, #32]
421 ldp x23, x24, [sp, #48]
422 ldp x25, x26, [sp, #64]
423 ldp x27, x28, [sp, #80]
424 ldp x29, x30, [sp], #13*8
425
426 ret
427
428 #else
429 # error "not supported"
430 #endif
431
432 endproc
433
434 proc nop
435
436 ret
437
438 endproc
439
440 ///--------------------------------------------------------------------------
441 /// 0x00--0x0f
442
443 proc x00
444
445 // clear all 64 bits of extended traditional registers
446
447 #if defined(__x86_64__)
448
449 xor eax, eax // clear rax
450 lea rbx, [0] // rbx -> _|_
451 loop . // iterate, decrement rcx until zero
452 mov rdx, 0 // set rdx = 0
453 and esi, 0 // clear all bits of rsi
454 sub edi, edi // set rdi = edi - edi = 0
455 push 0
456 pop rbp // pop 0 into rbp
457
458 #elif defined(__i386__)
459
460 xor eax, eax
461 lea ebx, [0]
462 loop .
463 mov edx, 0
464 and esi, 0
465 sub edi, edi
466 push 0
467 pop ebp
468
469 #elif defined(__arm__)
470
471 eor r0, r0, r0
472 rsb r1, r1, r1
473 0: subs r2, r2, #1
474 bne 0b
475 mov r3, #0
476 and r4, r4, #0
477 sub r5, r5, r5
478
479 #elif defined(__aarch64__)
480
481 eor w0, w0, w0
482 mov w1, wzr
483 0: sub w2, w2, #1
484 cbnz w2, 0b
485 mov w3, #0
486 and w4, w4, wzr
487 sub w5, w5, w5
488
489 #else
490 notimpl
491 #endif
492
493 ret
494
495 endproc
496
497 proc x01
498
499 // advance a fibonacci pair by c steps
500 //
501 // on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1}
502 // and f_{i+c}, where f_{i+1} = f_i + f_{i-1}
503
504 #if defined(__x86_64__)
505
506 0: xadd rax, rdx // a, d = a + d, a
507 // = f_{i+1} + f_i, f_{i+1}
508 // = f_{i+2}, f_{i+1}
509 loop 0b // advance i, decrement c, iterate
510
511 #elif defined(__i386__)
512
513 0: xadd eax, edx
514 loop 0b
515
516 #elif defined(__arm__)
517
518 0: subs r2, r2, #2
519 add r3, r3, r0
520 blo 8f
521 add r0, r0, r3
522 bhi 0b
523
524 8: movne r0, r3
525
526 #elif defined(__aarch64__)
527
528 0: subs x2, x2, #2
529 add x3, x3, x0
530 b.lo 8f
531 add x0, x0, x3
532 b.hi 0b
533
534 8: cmov.ne x0, x3
535
536 #else
537 notimpl
538 #endif
539
540 ret
541
542 endproc
543
544 proc x02
545
546 // boolean canonify a: if a = 0 on entry, leave it zero; otherwise
547 // set a = 1
548
549 #if defined(__x86_64__)
550
551 neg rax // set cf iff a /= 0
552 sbb rax, rax // a = a - a - cf = -cf
553 neg rax // a = cf
554
555 #elif defined(__i386__)
556
557 neg eax
558 sbb eax, eax
559 neg eax
560
561 #elif defined(__arm__)
562
563 movs r1, r0 // the easy way
564 movne r1, #1 // mvnne r1, #1 for mask
565
566 cmp r0, #1 // clear cf iff a == 0
567 sbc r2, r0, r0 // c' = a - a - 1 + cf = cf - 1
568 add r2, r2, #1 // c' = cf
569
570 sub r3, r0, r0, lsr #1 // d' top bit clear; d' = 0 iff a = 0
571 rsb r3, r3, #0 // d' top bit set iff a /= 0
572 mov r3, r3, lsr #31 // asr for mask
573
574 rsbs r0, r0, #0
575 sbc r0, r0, r0
576 rsb r0, r0, #0
577
578 #elif defined(__aarch64__)
579
580 cmp x0, #0 // trivial
581 cset.ne x1 // csetm for mask
582
583 cmp xzr, x0 // set cf iff a == 0
584 sbc x2, x0, x0 // c' = a - a - 1 + cf = cf - 1
585 neg x2, x2 // c' = 1 - cf
586
587 sub x3, x0, x0, lsr #1 // if a < 2^63 then a' = ceil(d/2) <
588 // 2^63
589 // if a >= 2^63, write a = 2^63 + t
590 // with t < 2^63; d' = 2^63 - 2^62 +
591 // ceil(t/2) = 2^62 + ceil(t/2), and
592 // ceil(t/2) < 2^62
593 // anyway d' < 2^63 and d' = 0 iff
594 // a = 0
595 neg x3, x3 // d' top bit set iff a /= 0
596 lsr x3, x3, #63 // asr for mask
597
598 cmp x0, #1 // set cf iff a /= 0
599 adc x0, xzr, xzr // a' = 0 + 0 + cf = cf
600
601 #else
602 notimpl
603 #endif
604
605 ret
606
607 endproc
608
609 proc x03
610
611 // set a = min(a, d) (unsigned); clobber c, d
612
613 #if defined(__x86_64__)
614
615 sub rdx, rax // d' = d - a; set cf if a > d
616 sbb rcx, rcx // c = -cf = -[a > d]
617 and rcx, rdx // c = a > d ? d - a : 0
618 add rax, rcx // a' = a > d ? d : a
619
620 #elif defined(__i386__)
621
622 sub edx, eax
623 sbb ecx, ecx
624 and ecx, edx
625 add eax, ecx
626
627 #elif defined(__arm__)
628
629 cmp r0, r3 // the easy way
630 movlo r1, r0 // only needed for out-of-place
631 movhs r1, r3
632
633 subs r3, r3, r0
634 sbc r12, r12, r12
635 and r12, r12, r3
636 add r0, r0, r12
637
638 #elif defined(__aarch64__)
639
640 cmp x0, x3 // the easy way
641 csel.lo x1, x0, x3
642
643 subs x3, x3, x0 // d' = d - a; set cf if d >= a
644 sbc x16, xzr, xzr // t = -1 + cf = -[a > d]
645 and x16, x16, x3 // t = a > d ? d - a : 0
646 add x0, x0, x16 // a' = a > d ? d : a
647
648 #else
649 notimpl
650 #endif
651
652 ret
653
654 endproc
655
656 proc x04
657
658 // switch case?
659
660 #if defined(__x86_64__)
661
662 // unrelated playing
663 mov ecx, eax
664 mov rbx, -1
665 mov edx, ecx
666 sub edx, '0'
667 cmp edx, 10
668 cmovb rbx, rdx
669 or ecx, 0x20
670 mov edx, ecx
671 sub edx, 'a'
672 sub ecx, 'a' - 10
673 cmp edx, 6
674 cmovb rbx, rcx
675
676 xor al, 0x20
677
678 #elif defined(__i386__)
679
680 // unrelated playing
681 mov ecx, eax
682 mov ebx, -1
683 mov edx, ecx
684 sub edx, '0'
685 cmp edx, 10
686 cmovb ebx, edx
687 or ecx, 0x20
688 mov edx, ecx
689 sub edx, 'a'
690 sub ecx, 'a' - 10
691 cmp edx, 6
692 cmovb ebx, ecx
693
694 xor al, 0x20
695
696 #elif defined(__arm__)
697
698 // unrelated playing
699 mvn r1, #0
700 sub r12, r0, #'0'
701 cmp r12, #10
702 movlo r1, r12
703 orr r12, r0, #0x20
704 sub r12, r12, #'a'
705 cmp r12, #6
706 addlo r1, r12, #10
707
708 eor r0, r0, #0x20
709
710 #elif defined(__aarch64__)
711
712 // unrelated playing
713 mov x1, #-1
714 sub w16, w0, #'0'
715 cmp w16, #10
716 cmov.lo x1, x16
717 orr w16, w0, #0x20
718 sub w16, w16, #'a' - 10
719 cmp w16, #10
720 ccmp.hs w16, #16, #CCMP_HS
721 cmov.lo x1, x16
722
723 eor w0, w0, #0x20
724
725 #else
726 notimpl
727 #endif
728
729 ret
730
731 endproc
732
733 proc x05
734
735 // answer whether 5 <= a </<= 9.
736
737 #if defined(__x86_64__)
738
739 sub rax, 5 // a' = a - 5
740 cmp rax, 4 // is a - 5 </<= 4?
741
742 // cc a' a
743 //
744 // z/e a' = 4 a = 9
745 // nz/ne a' /= 4 a /= 9
746 //
747 // a/nbe a' > 4 a > 9 or a < 5
748 // nc/ae/nb a' >= 4 a >= 9 or a < 5
749 // c/b/nae a' < 4 5 <= a < 9
750 // be/na a' <= 4 5 <= a <= 9
751 //
752 // o a' < -2^63 + 4 -2^63 + 5 <= a < -2^63 + 9
753 // no a' >= -2^63 + 4 a >= -2^63 + 9 or
754 // a < -2^63 + 5
755 // s -2^63 + 4 <= a' < 4 -2^63 + 9 <= a < 9
756 // ns a' < -2^63 + 4 or a < -2^63 + 9 or a >= 9
757 // a' >= 4
758 // ge/nl a' >= 4 a >= 9 or a < -2^63 + 5
759 // l/nge a' < 4 -2^63 + 5 <= a < 9
760 // g/nle a' > 4 a > 9 or a < -2^63 + 5
761 // le/ng a' <= 4 -2^63 + 5 <= a <= 9
762
763 #elif defined(__i386__)
764
765 sub eax, 5
766 cmp eax, 4
767
768 #elif defined(__arm__)
769
770 // i dimly remember having a slick way to do this way back in the
771 // day, but i can't figure it out any more.
772 sub r0, #5
773 cmp r0, #4
774
775 #elif defined(__aarch64__)
776
777 // literal translation is too obvious
778 cmp x0, #5
779 ccmp.hs x0, #9, #CCMP_HS
780
781 #else
782 notimpl
783 #endif
784
785 ret
786
787 endproc
788
789 proc x06
790
791 // leave a unchanged, but set zf if a = 0, cf if a /= 0, clear of,
792 // set sf to msb(a)
793
794 #if defined(__x86_64__)
795
796 not rax // a' = -a - 1
797 inc rax // a' = -a
798 neg rax // a' = a
799
800 #elif defined(__i386__)
801
802 not eax
803 inc eax
804 neg eax
805
806 #elif defined(__arm__)
807
808 mvn r0, r0
809 add r0, r0, #1
810 rsbs r0, r0, #0 // cf has opposite sense
811
812 #elif defined(__aarch64__)
813
814 mvn x0, x0
815 add x0, x0, #1
816 negs x0, x0 // cf has opposite sense
817
818 #else
819 notimpl
820 #endif
821
822 ret
823
824 endproc
825
826 proc x07
827
828 // same as before (?)
829
830 #if defined(__x86_64__)
831
832 inc rax // a' = a + 1
833 neg rax // a' = -a - 1
834 inc rax // a' = -a
835 neg rax // a' = a
836
837 #elif defined(__i386__)
838
839 inc eax
840 neg eax
841 inc eax
842 neg eax
843
844 #elif defined(__arm__)
845
846 add r0, r0, #1
847 rsb r0, r0, #0
848 add r0, r0, #1
849 rsbs r0, r0, #0
850
851 #elif defined(__aarch64__)
852
853 add x0, x0, #1
854 neg x0, x0
855 add x0, x0, #1
856 negs x0, x0 // cf has opposite sense
857
858 #else
859 notimpl
860 #endif
861
862 ret
863
864 endproc
865
866 proc x08
867
868 // floor((a + d)/2), correctly handling overflow conditions; final cf
869 // is lsb(a + d), probably uninteresting
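        // the point of the trick: a plain add-then-shift would lose the
        // carry out of the top bit when a + d overflows, so the variants
        // below either shift that carry back into the result or avoid the
        // overflowing addition entirely.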
870
871 #if defined(__x86_64__)
872
873 add rax, rdx // cf || a' = a + d
874 rcr rax, 1 // shift 65-bit result right by one
875 // place; lsb moves into carry
876
877 #elif defined(__i386__)
878
879 add eax, edx
880 rcr eax, 1
881
882 #elif defined(__arm__)
883
884 // like the two-instruction a64 version
885 sub r1, r3, r0
886 add r1, r0, r1, lsr #1
887
888 // the slick version, similar to the above
889 adds r0, r0, r3
890 mov r0, r0, rrx
891
892 #elif defined(__aarch64__)
893
894 // a64 lacks a32's rrx. literal translation.
895 adds x1, x0, x3 // cf || a' = a + d
896 adc x16, xzr, xzr // realize cf in extra register
897 extr x1, x16, x1, #1 // shift down one place
898
899 // two instruction version: clobbers additional register. (if you
900 // wanted the answer in any other register, even overwriting d, then
901 // this is unnecessary.) also depends on d >= a.
902 sub x16, x3, x0 // compute difference
903 add x0, x0, x16, lsr #1 // add half of it (rounded down)
904
905 #else
906 notimpl
907 #endif
908
909 ret
910
911 endproc
912
913 proc x09
914
915 // a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3
916 // (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8).
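        // for example, a = 20: floor(20/8) = 2, but 20 == 4 (mod 8), so we
        // want 3; the shifted-out bit 2 (here set) is exactly the rounding
        // adjustment.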
917
918 #if defined(__x86_64__)
919
920 shr rax, 3 // a' = floor(a/8); cf = 1 if a ==
921 // 4, 5, 6, 7 (mod 8)
922 adc rax, 0 // a' = floor(a/8) + cf
923
924 #elif defined(__i386__)
925
926 shr eax, 3
927 adc eax, 0
928
929 #elif defined(__arm__)
930
931 movs r0, r0, lsr #3
932 adc r0, r0, #0
933
934 #elif defined(__aarch64__)
935
936 tst x0, #4
937 orr x0, xzr, x0, lsr #3
938 cinc.ne x0, x0
939
940 #else
941 notimpl
942 #endif
943
944 ret
945
946 endproc
947
948 proc x0a
949
950 // increment c-byte little-endian bignum at rdi
951
952 #if defined(__x86_64__)
953
954 add byte ptr [rdi], 1
955 0: inc rdi
956 adc byte ptr [rdi], 0
957 loop 0b
958
959 #elif defined(__i386__)
960
961 add byte ptr [edi], 1
962 0: inc edi
963 adc byte ptr [edi], 0
964 loop 0b
965
966 #elif defined(__arm__)
967
968 mov r12, #256 // set initial carry
969 0: ldrb r0, [r5]
970 subs r2, r2, #1
971 add r12, r0, r12, lsr #8
972 strb r12, [r5], #1
973 bne 0b
974
975 #elif defined(__aarch64__)
976
977 mov w17, #256 // set initial carry
978 0: ldrb w16, [x5]
979 sub x2, x2, #1
980 add w17, w16, w17, lsr #8
981 strb w17, [x5], #1
982 cbnz x2, 0b
983
984 #else
985 notimpl
986 #endif
987
988 ret
989
990 endproc
991
992 proc x0b
993
994 // negate double-precision d:a
995
996 #if defined(__x86_64__)
997
998 not rdx // d' = -d - 1
999 neg rax // a' = -a;
1000 // cf = 1 iff a /= 0
1001 sbb rdx, -1 // d' = -d - cf
1002
1003 #elif defined(__i386__)
1004
1005 not edx
1006 neg eax
1007 sbb edx, -1
1008
1009 #elif defined(__arm__)
1010
1011 // reverse subtract is awesome
1012 rsbs r0, r0, #0
1013 rsc r3, r3, #0
1014
1015 #elif defined(__aarch64__)
1016
1017 // easy way: everything is better with zero registers.
1018 negs x0, x0
1019 ngc x3, x3
1020
1021 #else
1022 notimpl
1023 #endif
1024
1025 ret
1026
1027 endproc
1028
1029 proc x0c
1030
1031 // rotate is distributive over xor.
1032
1033 #if defined(__x86_64__)
1034
1035 // rax // = a_1 || a_0
1036 // rbx // = b_1 || b_0
1037 mov rcx, rax // = a_1 || a_0
1038
1039 xor rcx, rbx // = (a_1 XOR b_1) || (a_0 XOR b_0)
1040 ror rcx, 0xd // = (a_0 XOR b_0) || (a_1 XOR b_1)
1041
1042 ror rax, 0xd // = a_0 || a_1
1043 ror rbx, 0xd // = b_0 || b_1
1044 xor rax, rbx // = (a_0 XOR b_0) || (a_1 XOR b_1)
1045
1046 cmp rax, rcx // always equal
1047
1048 #elif defined(__i386__)
1049
1050 mov ecx, eax // = a_1 || a_0
1051
1052 xor ecx, ebx // = (a_1 XOR b_1) || (a_0 XOR b_0)
1053 ror ecx, 0xd // = (a_0 XOR b_0) || (a_1 XOR b_1)
1054
1055 ror eax, 0xd // = a_0 || a_1
1056 ror ebx, 0xd // = b_0 || b_1
1057 xor eax, ebx // = (a_0 XOR b_0) || (a_1 XOR b_1)
1058
1059 cmp eax, ecx // always equal
1060
1061 #elif defined(__arm__)
1062
1063
1064 // r0 // = a_1 || a_0
1065 // r1 // = b_1 || b_0
1066 eor r2, r0, r1 // = (a_1 XOR b_1) || (a_0 XOR b_0)
1067 mov r2, r2, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
1068
1069 mov r1, r1, ror #13 // = b_0 || b_1
1070 eor r0, r1, r0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
1071
1072 cmp r0, r2 // always equal
1073
1074 #elif defined(__aarch64__)
1075
1076 // x0 // = a_1 || a_0
1077 // x1 // = b_1 || b_0
1078 eor x2, x0, x1 // = (a_1 XOR b_1) || (a_0 XOR b_0)
1079 ror x2, x2, #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
1080
1081 ror x1, x1, #13 // = b_0 || b_1
1082 eor x0, x1, x0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
1083
1084 cmp x0, x2 // always equal
1085
1086 #else
1087 notimpl
1088 #endif
1089
1090 ret
1091
1092 endproc
1093
1094 proc x0d
1095
1096 // and is distributive over xor.
1097
1098 #if defined(__x86_64__)
1099
1100 mov rdx, rbx // = b
1101
1102 xor rbx, rcx // = b XOR c
1103 and rbx, rax // = a AND (b XOR c)
1104
1105 and rdx, rax // = a AND b
1106 and rax, rcx // = a AND c
1107 xor rax, rdx // = (a AND b) XOR (a AND c)
1108 // = a AND (b XOR c)
1109
1110 cmp rax, rbx // always equal
1111
1112 #elif defined(__i386__)
1113
1114 mov edx, ebx // = b
1115
1116 xor ebx, ecx // = b XOR c
1117 and ebx, eax // = a AND (b XOR c)
1118
1119 and edx, eax // = a AND b
1120 and eax, ecx // = a AND c
1121 xor eax, edx // = (a AND b) XOR (a AND c)
1122 // = a AND (b XOR c)
1123
1124 cmp eax, ebx // always equal
1125
1126 #elif defined(__arm__)
1127
1128 and r3, r0, r1 // = a AND b
1129
1130 eor r1, r1, r2 // = b XOR c
1131 and r1, r1, r0 // = a AND (b XOR c)
1132
1133 and r0, r0, r2 // = a AND c
1134 eor r0, r0, r3 // = (a AND b) XOR (a AND c)
1135 // = a AND (b XOR c)
1136
1137 cmp r0, r1 // always equal
1138
1139 #elif defined(__aarch64__)
1140
1141 and x3, x0, x1 // = a AND b
1142
1143 eor x1, x1, x2 // = b XOR c
1144 and x1, x1, x0 // = a AND (b XOR c)
1145
1146 and x0, x0, x2 // = a AND c
1147 eor x0, x0, x3 // = (a AND b) XOR (a AND c)
1148 // = a AND (b XOR c)
1149
1150 cmp x0, x1 // always equal
1151
1152 #else
1153 notimpl
1154 #endif
1155
1156 ret
1157
1158 endproc
1159
1160 proc x0e
1161
1162 // de morgan's law
1163
1164 #if defined(__x86_64__)
1165
1166 mov rcx, rax // = a
1167
1168 and rcx, rbx // = a AND b
1169 not rcx // = NOT (a AND b)
1170
1171 not rax // = NOT a
1172 not rbx // = NOT b
1173 or rax, rbx // = (NOT a) OR (NOT b)
1174 // = NOT (a AND b)
1175
1176 cmp rax, rcx // always equal
1177
1178 #elif defined(__i386__)
1179
1180 mov ecx, eax // = a
1181
1182 and ecx, ebx // = a AND b
1183 not ecx // = NOT (a AND b)
1184
1185 not eax // = NOT a
1186 not ebx // = NOT b
1187 or eax, ebx // = (NOT a) OR (NOT b)
1188 // = NOT (a AND b)
1189
1190 cmp eax, ecx // always equal
1191
1192 #elif defined(__arm__)
1193
1194 and r2, r0, r1 // = a AND b
1195 mvn r2, r2 // = NOT (a AND b)
1196
1197 mvn r0, r0 // = NOT a
1198 mvn r1, r1 // = NOT b
1199 orr r0, r0, r1 // = (NOT a) OR (NOT b)
1200
1201 cmp r0, r2 // always equal
1202
1203 #elif defined(__aarch64__)
1204
1205 and x2, x0, x1 // = a AND b
1206 mvn x2, x2 // = NOT (a AND b)
1207
1208 mvn x0, x0 // = NOT a
1209 orn x0, x0, x1 // = (NOT a) OR (NOT b)
1210
1211 cmp x0, x2 // always equal
1212
1213 #else
1214 notimpl
1215 #endif
1216
1217 ret
1218
1219 endproc
1220
1221 proc x0f
1222
1223 // replace input buffer bytes with cumulative XORs with initial a;
1224 // final a is XOR of all buffer bytes and initial a.
1225 //
1226 // not sure why you'd do this.
1227
1228 #if defined(__x86_64__)
1229
1230 0: xor [rsi], al
1231 lodsb
1232 loop 0b
1233
1234 #elif defined(__i386__)
1235
1236 0: xor [esi], al
1237 lodsb
1238 loop 0b
1239
1240 #elif defined(__arm__)
1241
1242 0: ldrb r12, [r4]
1243 subs r2, r2, #1
1244 eor r0, r0, r12
1245 strb r0, [r4], #1
1246 bne 0b
1247
1248 #elif defined(__aarch64__)
1249
1250 0: ldrb w16, [x4]
1251 sub x2, x2, #1
1252 eor w0, w0, w16
1253 strb w0, [x4], #1
1254 cbnz x2, 0b
1255
1256 #else
1257 notimpl
1258 #endif
1259
1260 ret
1261
1262 endproc
1263
1264 ///--------------------------------------------------------------------------
1265 /// 0x10--0x1f
1266
1267 proc x10
1268
1269 // four different ways to swap a pair of registers.
1270
1271 #if defined(__x86_64__)
1272
1273 push rax
1274 push rcx
1275 pop rax
1276 pop rcx
1277
1278 xor rax, rcx
1279 xor rcx, rax
1280 xor rax, rcx
1281
1282 add rax, rcx
1283 sub rcx, rax
1284 add rax, rcx
1285 neg rcx
1286
1287 xchg rax, rcx
1288
1289 #elif defined(__i386__)
1290
1291 push eax
1292 push ecx
1293 pop eax
1294 pop ecx
1295
1296 xor eax, ecx
1297 xor ecx, eax
1298 xor eax, ecx
1299
1300 add eax, ecx
1301 sub ecx, eax
1302 add eax, ecx
1303 neg ecx
1304
1305 xchg eax, ecx
1306
1307 #elif defined(__arm__)
1308
1309 stmfd r13!, {r0, r2}
1310 ldr r0, [r13, #4]
1311 ldr r2, [r13], #8
1312
1313 eor r0, r0, r2
1314 eor r2, r2, r0
1315 eor r0, r0, r2
1316
1317 sub r0, r0, r2
1318 add r2, r2, r0
1319 rsb r0, r0, r2 // don't need 3-addr with reverse-sub
1320
1321 mov r12, r0
1322 mov r0, r2
1323 mov r2, r12
1324
1325 #elif defined(__aarch64__)
1326
1327 // anything you can do
1328 stp x0, x2, [sp, #-16]!
1329 ldp x2, x0, [sp], #16
1330
1331 eor x0, x0, x2
1332 eor x2, x2, x0
1333 eor x0, x0, x2
1334
1335 // the add/sub/add thing was daft. you can do it in three if you're
1336 // clever -- and have three-address operations.
1337 sub x0, x0, x2
1338 add x2, x2, x0
1339 sub x0, x2, x0
1340
1341 // but we lack a fourth. we can't do this in fewer than three
1342 // instructions without hitting memory. only `ldp' will modify two
1343 // registers at a time, so we need at least two instructions -- but
1344 // if the first one sets one of our two registers to its final value
1345 // then we lose the other input value with no way to recover it, so
1346 // we must either write a fresh third register, or write something
1347 // other than the final value, and in both cases we need a third
1348 // instruction to fix everything up. we've done the wrong-something-
1349 // other trick twice, so here's the captain-obvious use-a-third-
1350 // register version.
1351 mov x16, x0
1352 mov x0, x2
1353 mov x2, x16
1354
1355 #else
1356 notimpl
1357 #endif
1358
1359 ret
1360
1361 endproc
1362
1363 proc x11
1364
1365 // assuming a is initialized to zero, set a to the inclusive or of
1366 // the xor-differences of corresponding bytes in the c-byte strings
1367 // at si and di.
1368 //
1369 // in particular, a will be zero (and zf set) if and only if the two
1370 // strings are equal.
1371
1372 #if defined(__x86_64__)
1373
1374 0: mov dl, [rsi]
1375 xor dl, [rdi]
1376 inc rsi
1377 inc rdi
1378 or al, dl
1379 loop 0b
1380
1381 #elif defined(__i386__)
1382
1383 0: mov dl, [esi]
1384 xor dl, [edi]
1385 inc esi
1386 inc edi
1387 or al, dl
1388 loop 0b
1389
1390 #elif defined(__arm__)
1391
1392 0: ldrb r1, [r4], #1
1393 ldrb r12, [r5], #1
1394 subs r2, r2, #1
1395 eor r12, r12, r1
1396 orr r0, r0, r12
1397 bne 0b
1398
1399 #elif defined(__aarch64__)
1400
1401 0: ldrb w16, [x4], #1
1402 ldrb w17, [x5], #1
1403 sub x2, x2, #1
1404 eor w16, w16, w17
1405 orr w0, w0, w16
1406 cbnz x2, 0b
1407
1408 #else
1409 notimpl
1410 #endif
1411
1412 ret
1413
1414 endproc
1415
1416 proc x12
1417
1418 // an obtuse way of adding two registers. for any bit position, a
1419 // OR d is set if and only if at least one of a and d has a bit set
1420 // in that position, and a AND d is set if and only if both have a
1421 // bit set in that position. essentially, then, what we've done is
1422 // move all of the set bits in d to a, unless there's already a bit
1423 // there. this clearly doesn't change the sum.
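        // in other words, (a OR d) + (a AND d) = a + d.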
1424
1425 #if defined(__x86_64__)
1426
1427 mov rcx, rdx // c' = d
1428 and rdx, rax // d' = a AND d
1429 or rax, rcx // a' = a OR d
1430 add rax, rdx
1431
1432 #elif defined(__i386__)
1433
1434 mov ecx, edx // c' = d
1435 and edx, eax // d' = a AND d
1436 or eax, ecx // a' = a OR d
1437 add eax, edx
1438
1439 #elif defined(__arm__)
1440
1441 and r2, r0, r3 // c' = a AND d
1442 orr r0, r0, r3 // a' = a OR d
1443 add r0, r0, r2
1444
1445 #elif defined(__aarch64__)
1446
1447 and x2, x0, x3 // c' = a AND d
1448 orr x0, x0, x3 // a' = a OR d
1449 add x0, x0, x2
1450
1451 #else
1452 notimpl
1453 #endif
1454
1455 ret
1456
1457 endproc
1458
1459 proc x13
1460
1461 // ok, so this is a really obtuse way of adding a and b; the result
1462 // ends up in a (b and d get clobbered). but why does it work?
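        // (briefly: a XOR b is the bitwise sum ignoring carries, and a AND
        // b picks out the positions which generate a carry; shifting the
        // carries left one place and adding them back in is exactly what
        // addition does, so a + b is invariant across the loop.  the carry
        // word gains at least one trailing zero each time around, so after
        // at most 64 iterations it's zero and a holds the sum.)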
1463
1464 #if defined(__x86_64__)
1465
1466 mov rcx, 0x40 // carry chains at most 64 long
1467 0: mov rdx, rax // copy a'
1468 xor rax, rbx // low bits of each bitwise sum
1469 and rbx, rdx // carry bits from each bitwise sum
1470 shl rbx, 1 // carry them into next position
1471 loop 0b
1472
1473 #elif defined(__i386__)
1474
1475 mov ecx, 0x40 // carry chains at most 64 long
1476 0: mov edx, eax // copy a'
1477 xor eax, ebx // low bits of each bitwise sum
1478 and ebx, edx // carry bits from each bitwise sum
1479 shl ebx, 1 // carry them into next position
1480 loop 0b
1481
1482 #elif defined(__arm__)
1483
1484 mov r2, #0x40
1485 0: and r3, r0, r1
1486 subs r2, r2, #1
1487 eor r0, r0, r1
1488 lsl r1, r3, #1
1489 bne 0b
1490
1491 #elif defined(__aarch64__)
1492
1493 mov x2, #0x40
1494 0: and x3, x0, x1
1495 sub x2, x2, #1
1496 eor x0, x0, x1
1497 lsl x1, x3, #1
1498 cbnz x2, 0b
1499
1500 #else
1501 notimpl
1502 #endif
1503
1504 ret
1505
1506 endproc
1507
1508 proc x14
1509
1510 // floor((a + d)/2), like x08.
1511
1512 #if defined(__x86_64__)
1513
1514 mov rcx, rax // copy a for later
1515 and rcx, rdx // carry bits
1516
1517 xor rax, rdx // low bits of each bitwise sum
1518 shr rax, 1 // divide by 2; carries now in place
1519
1520 add rax, rcx // add the carries; done
1521
1522 #elif defined(__i386__)
1523
1524 mov ecx, eax // copy a for later
1525 and ecx, edx // carry bits
1526
1527 xor eax, edx // low bits of each bitwise sum
1528 shr eax, 1 // divide by 2; carries now in place
1529
1530 add eax, ecx // add the carries; done
1531
1532 #elif defined(__arm__)
1533
1534 and r2, r0, r3
1535 eor r0, r0, r3
1536 add r0, r2, r0, lsr #1
1537
1538 #elif defined(__aarch64__)
1539
1540 and x2, x0, x3
1541 eor x0, x0, x3
1542 add x0, x2, x0, lsr #1
1543
1544 #else
1545 notimpl
1546 #endif
1547
1548 ret
1549
1550 endproc
1551
1552 proc x15
1553
1554 // sign extension 32 -> 64 bits.
1555
1556 #if defined(__x86_64__)
1557
1558 movsx rbx, eax // like this?
1559
1560 mov rdx, 0xffffffff80000000
1561 add rax, rdx // if bit 31 of a is set then bits
1562 // 31--63 of a' are clear; otherwise,
1563 // these bits are all set -- which is
1564 // exactly backwards
1565 xor rax, rdx // so fix it
1566
1567 #elif defined(__i386__)
1568
1569 movsx ebx, ax // like this?
1570
1571 mov edx, 0xffff8000
1572 add eax, edx // if bit 15 of a is set then bits
1573 // 15--31 of a' are clear; otherwise,
1574 // these bits are all set -- which is
1575 // exactly backwards
1576 xor eax, edx // so fix it
1577
1578 #elif defined(__arm__)
1579
1580 sxth r1, r0 // like this
1581
1582 mov r12, #0x80000000
1583 add r0, r0, r12, asr #16
1584 eor r0, r0, r12, asr #16
1585
1586 #elif defined(__aarch64__)
1587
1588 sxtw x1, w0 // like this
1589
1590 mov x16, #0xffffffff80000000
1591 add x0, x0, x16
1592 eor x0, x0, x16
1593
1594 #else
1595 notimpl
1596 #endif
1597
1598 ret
1599
1600 endproc
1601
1602 proc x16
1603
1604 // ??? i don't know why you'd want to calculate this.
1605
1606 #if defined(__x86_64__)
1607
1608 xor rax, rbx // a' = a XOR b
1609 xor rbx, rcx // b' = b XOR c
1610 mov rsi, rax // t = a XOR b
1611 add rsi, rbx // t = (a XOR b) + (b XOR c)
1612 cmovc rax, rbx // a' = cf ? b XOR c : a XOR b
1613 xor rax, rbx // a' = cf ? 0 : a XOR c
1614 cmp rax, rsi
1615
1616 #elif defined(__i386__)
1617
1618 xor eax, ebx // a' = a XOR b
1619 xor ebx, ecx // b' = b XOR c
1620 mov esi, eax // t = a XOR b
1621 add esi, ebx // t = (a XOR b) + (b XOR c)
1622 cmovc eax, ebx // a' = cf ? b XOR c : a XOR b
1623 xor eax, ebx // a' = cf ? 0 : a XOR c
1624 cmp eax, esi
1625
1626 #elif defined(__arm__)
1627
1628 eor r0, r0, r1
1629 eor r1, r1, r2
1630 adds r4, r0, r1
1631 movcs r0, r1
1632 eor r0, r0, r1
1633 cmp r0, r4
1634
1635 #elif defined(__aarch64__)
1636
1637 eor x0, x0, x1
1638 eor x1, x1, x2
1639 adds x4, x0, x1
1640 cmov.cs x0, x1
1641 eor x0, x0, x1
1642 cmp x0, x4
1643
1644 #else
1645 notimpl
1646 #endif
1647
1648 ret
1649
1650 endproc
1651
1652 proc x17
1653
1654 // absolute value
1655
1656 #if defined(__x86_64__)
1657
1658 cqo // d = a < 0 ? -1 : 0
1659 xor rax, rdx // a' = a < 0 ? -a - 1 : a
1660 sub rax, rdx // a' = a < 0 ? -a : a
1661
1662 #elif defined(__i386__)
1663
1664 cdq // d = a < 0 ? -1 : 0
1665 xor eax, edx // a' = a < 0 ? -a - 1 : a
1666 sub eax, edx // a' = a < 0 ? -a : a
1667
1668 #elif defined(__arm__)
1669
1670 // direct approach
1671 movs r1, r0
1672 rsbmi r1, r0, #0
1673
1674 // faithful-ish conversion
1675 eor r3, r0, r0, asr #31
1676 sub r0, r3, r0, asr #31
1677
1678 #elif defined(__aarch64__)
1679
1680 // direct approach
1681 tst x0, #1 << 63
1682 cneg.ne x1, x0
1683
1684 // faithful-ish conversion
1685 eor x3, x0, x0, asr #63
1686 sub x0, x3, x0, asr #63
1687
1688 #else
1689 notimpl
1690 #endif
1691
1692 ret
1693
1694 endproc
1695
1696 proc x18
1697
1698 // should always set sf, clear zf, unless we get rescheduled to a
1699 // different core.
1700
1701 #if defined(__x86_64__)
1702
1703 rdtsc // d || a = cycles
1704 shl rdx, 0x20
1705 or rax, rdx // a = cycles
1706 mov rcx, rax // c = cycles
1707
1708 rdtsc // d || a = cycles'
1709 shl rdx, 0x20
1710 or rax, rdx // a = cycles'
1711
1712 cmp rcx, rax
1713
1714 #elif defined(__i386__)
1715
1716 rdtsc // d || a = cycles
1717 mov ebx, eax
1718 mov ecx, edx // c || b = cycles
1719
1720 rdtsc // d || a = cycles'
1721
1722 sub ebx, eax
1723 sbb ecx, edx
1724
1725 #elif defined(__arm__)
1726
1727 // cycle clock not available in user mode
1728 mrrc p15, 0, r0, r1, c9
1729 mrrc p15, 0, r2, r3, c9
1730 subs r0, r0, r2
1731 sbcs r1, r1, r3
1732
1733 #elif defined(__aarch64__)
1734
1735 // cycle clock not available in user mode
1736 mrs x0, pmccntr_el0
1737 mrs x1, pmccntr_el0
1738 cmp x0, x1
1739
1740 #else
1741 notimpl
1742 #endif
1743
1744 ret
1745
1746 endproc
1747
1748 proc x19
1749
1750 // stupid way to capture a pointer to inline data and jump past it.
1751 // confuses the return-address predictor something chronic. worse
1752 // because amd64 calling convention doesn't usually pass arguments on
1753 // the stack.
1754
1755 #if defined(__x86_64__)
1756
1757 call 8f
1758 .string "hello world!\n\0"
1759 8: call print_str
1760 add rsp, 8
1761 ret
1762
1763 print_str:
1764 // actually implement this ridiculous thing
1765 mov rsi, [rsp + 8]
1766 xor edx, edx
1767 0: mov al, [rsi + rdx]
1768 inc rdx
1769 cmp al, 0
1770 jnz 0b
1771 mov eax, SYS_write
1772 mov edi, 1
1773 dec rdx
1774 syscall // clobbers r11 :-(
1775 ret
1776
1777 #elif defined(__i386__)
1778
1779 call 8f
1780 .string "hello world!\n\0"
1781 8: call print_str
1782 add esp, 4
1783 ret
1784
1785 print_str:
1786 // actually implement this ridiculous thing
1787 mov ecx, [esp + 4]
1788 xor edx, edx
1789 0: mov al, [ecx + edx]
1790 inc edx
1791 cmp al, 0
1792 jnz 0b
1793 mov eax, SYS_write
1794 mov ebx, 1
1795 dec edx
1796 int 0x80
1797 ret
1798
1799 #elif defined(__arm__)
1800
1801 // why am i doing this?
1802 stmfd r13!, {r14}
1803 bl 8f
1804 .string "hello world!\n\0"
1805 .balign 4
1806 8: mov r1, r14 // might as well make it easy on myself
1807 bl print_str
1808 ldmfd r13!, {pc}
1809
1810 print_str:
1811 mov r2, #0
1812 0: ldrb r0, [r1, r2]
1813 cmp r0, #0
1814 addne r2, r2, #1
1815 bne 0b
1816 mov r0, #1
1817 mov r7, #SYS_write
1818 swi 0
1819 bx r14
1820
1821 #elif defined(__aarch64__)
1822
1823 // why am i doing this?
1824 str x30, [sp, #-16]!
1825 bl 8f
1826 .string "hello world!\n\0"
1827 .balign 4
1828 8: mov x1, x30 // might as well make it easy on myself
1829 bl print_str
1830 ldr x30, [sp], #16
1831 ret
1832
1833 print_str:
1834 mov x2, #0
1835 0: ldrb w0, [x1, x2]
1836 cmp w0, #0
1837 cinc.ne x2, x2
1838 b.ne 0b
1839 mov x0, #1
1840 mov x8, #SYS_write
1841 svc #0
1842 ret
1843
1844 #else
1845 notimpl
1846 #endif
1847
1848 endproc
1849
1850 proc x1a
1851
1852 // collect the current instruction-pointer address. this was an old
1853 // 32-bit i386 trick for position-independent code, but (a) it
1854 // confuses the return predictor, and (b) amd64 has true pc-relative
1855 // addressing.
1856
1857 #if defined(__x86_64__)
1858
1859 // the actual example
1860 call 0f
1861 0: pop rax
1862
1863 // the modern i386 trick doesn't confuse the return-address
1864 // predictor.
1865 call calladdr_rbx
1866 sub rbx, . - 0b
1867
1868 // but rip-relative addressing is even better
1869 lea rcx, [rip + 0b]
1870
1871 ret
1872
1873 calladdr_rbx:
1874 mov rbx, [rsp]
1875 ret
1876
1877 #elif defined(__i386__)
1878
1879 // the actual example
1880 call 0f
1881 0: pop eax
1882
1883 // the modern i386 trick doesn't confuse the return-address
1884 // predictor.
1885 call get_pc_ebx
1886 sub ebx, . - 0b
1887
1888 ret
1889
1890 #elif defined(__arm__)
1891
1892 stmfd r13!, {r14}
1893
1894 bl 0f
1895 0: mov r0, r14
1896
1897 bl return
1898 sub r1, r14, #. - 0b
1899
1900 adr r2, 0b
1901
1902 ldmfd r13!, {pc}
1903
1904 return: bx r14
1905
1906 #elif defined(__aarch64__)
1907
1908 str x30, [sp, #-16]!
1909
1910 // we can do all of the above using a64
1911 bl 0f
1912 0: mov x0, x30
1913
1914 bl return
1915 sub x1, x30, #. - 0b
1916
1917 adr x2, 0b
1918
1919 ldr x30, [sp], #16
1920 return: ret
1921
1922 #else
1923 notimpl
1924 #endif
1925
1926 endproc
1927
1928 proc x1b
1929
1930 #if defined(__x86_64__)
1931
1932 // retpolines: an mitigation against adversarially influenced
1933 // speculative execution at indirect branches. if an adversary can
1934 // prepare a branch-target buffer entry matching an indirect branch
1935 // in the victim's address space then they can cause the victim to
1936 // /speculatively/ (but not architecturally) execute any code in
1937 // their address space, possibly leading to leaking secrets through
1938 // the cache. retpolines aren't susceptible to this because the
1939 // predicted destination address is from the return-prediction stack
1940 // which the adversary can't prime. the performance penalty is still
1941 // essentially a branch misprediction -- for this return, and
1942 // possibly all others already stacked.
1943
1944 // (try not to crash)
1945 lea rax, [rip + 9f]
1946
1947 push rax
1948 9: ret
1949
1950 #elif defined(__i386__)
1951
1952 call get_pc_ebx
1953 lea eax, [ebx + 9f - .]
1954
1955 push eax
1956 9: ret
1957
1958 #elif defined(__arm__)
1959
1960 stmfd r13!, {r14}
1961
1962 adr r14, 8f
1963 bx r14
1964
1965 8: ldmfd r13!, {pc}
1966
1967 #elif defined(__aarch64__)
1968
1969 str x30, [sp, #-16]!
1970
1971 adr x30, 8f
1972 ret
1973
1974 8: ldr x30, [sp], #16
1975 ret
1976
1977 #else
1978 notimpl
1979 #endif
1980
1981 endproc
1982
1983 proc x1c
1984
1985 // ok, having a hard time seeing a use for this. the most important
1986 // thing to note is that sp is set from `pop' /after/ it's
1987 // incremented.
1988
1989 #if defined(__x86_64__)
1990
1991 // try not to crash
1992 mov rax, rsp
1993 and rsp, -16
1994 push rax
1995
1996 pop rsp
1997
1998 // check it worked
1999 mov rbx, rsp
2000 ret
2001
2002 #elif defined(__i386__)
2003
2004 // try not to crash
2005 mov eax, esp
2006 and esp, -16
2007 push eax
2008
2009 pop esp
2010
2011 // check it worked
2012 mov ebx, esp
2013 ret
2014
2015 #elif defined(__arm__)
2016
2017 // not even going to dignify this
2018 notimpl
2019
2020 #elif defined(__aarch64__)
2021
2022 // not even going to dignify this
2023 notimpl
2024
2025 #else
2026 notimpl
2027 #endif
2028
2029 endproc
2030
2031 proc x1d
2032
2033 // monumentally cheesy way to copy 8 n bytes from buff1 to buff2.
2034 // also clobbers words at buff2 + 8 n and buff2 - 8 for good measure.
2035
2036 n = 4
2037
2038 #if defined(__x86_64__)
2039
2040 mov rax, rsp // safekeeping
2041
2042 // we're toast if we get hit by a signal now. fingers crossed...
2043 .if 0
2044 mov rsp, buff2 + 8*n + 8
2045 mov rbp, buff1 + 8*n
2046 .else
2047 lea rsp, [rdi + 8*n + 16]
2048 lea rbp, [rsi + 8*n]
2049 .endif
2050 enter 0, n + 1
2051
2052 // precise action:
2053 //
2054 // +---------+ +---------+
2055 // rbp -> | ??? | rsp -> | ??? |
2056 // +---------+ +---------+
2057 // | w_{n-1} | | rbp | <- rbp'
2058 // +---------+ +---------+
2059 // | ... | | w_{n-1} |
2060 // +---------+ +---------+
2061 // | w_1 | | ... |
2062 // +---------+ +---------+
2063 // | w_0 | | w_1 |
2064 // +---------+ +---------+
2065 // | w_0 |
2066 // +---------+
2067 // | rbp' | <- rsp'
2068 // +---------+
2069
2070 mov rdx, rsp
2071 mov rsp, rax
2072
2073 #elif defined(__i386__)
2074
2075 mov eax, esp // safekeeping
2076
2077 // we're toast if we get hit by a signal now. fingers crossed...
2078 .if 0
2079 mov esp, buff2 + 4*n + 4
2080 mov ebp, buff1 + 4*n
2081 .else
2082 lea esp, [edi + 4*n + 8]
2083 lea ebp, [esi + 4*n]
2084 .endif
2085 enter 0, n + 1
2086
2087 mov edx, esp
2088 mov esp, eax
2089
2090 #elif defined(__arm__)
2091
2092 add r4, r4, #4*n
2093 add r5, r5, #4*n + 8
2094
2095 str r4, [r5, #-4]!
2096 .rept n/2
2097 ldrd r0, r1, [r4, #-8]!
2098 strd r0, r1, [r5, #-8]!
2099 .endr
2100 add r4, r5, #4*n
2101 str r4, [r5, #-4]!
2102
2103 #elif defined(__aarch64__)
2104
2105 // omgwtf. let's not actually screw with the stack pointer.
2106
2107 add x4, x4, #8*n
2108 add x5, x5, #8*n + 16
2109
2110 str x4, [x5, #-8]!
2111 .rept n/2
2112 ldp x16, x17, [x4, #-16]!
2113 stp x16, x17, [x5, #-16]!
2114 .endr
2115 add x4, x5, #8*n
2116 str x4, [x5, #-8]!
2117
2118 #else
2119 notimpl
2120 #endif
2121
2122 ret
2123
2124 endproc
2125
2126 proc x1e
2127
2128 // convert nibble value to (uppercase) hex; other input values yield
2129 // nonsense.
2130
2131 #if defined(__x86_64__)
2132
2133 // das doesn't work in 64-bit mode; best i can come up with
2134 mov edx, eax
2135 add al, '0'
2136 add dl, 'A' - 10
2137 cmp al, '9' + 1
2138 cmovae eax, edx
2139
2140 #elif defined(__i386__)
2141
2142 cmp al, 0x0a // cf = 1 iff a < 10
2143 sbb al, 0x69 // if 0 <= a < 10, a' = a - 0x6a, so
2144 // 0x96 <= a' < 0xa0, setting af, cf
2145 // if 10 <= a < 16, a' = a - 0x69, so
2146 // 0xa1 <= a' < 0xa7, setting cf but
2147 // clearing af
2148 das // if 0 <= a < 10, then af and cf are
2149 // both set, so subtract 0x66
2150 // from a' leaving 0x30 <= a' < 0x3a;
2151 // if 10 <= a < 16 then af clear but
2152 // cf set, so subtract 0x60 from a'
2153 // leaving 0x41 <= a' < 0x47
2154
2155 #elif defined(__arm__)
2156
2157 // significantly less tricksy
2158 cmp r0, #10
2159 addlo r0, r0, #'0'
2160 addhs r0, r0, #'A' - 10
2161
2162 #elif defined(__aarch64__)
2163
2164 // with less versatile conditional execution this is the best we can
2165 // do
2166 cmp w0, #10
2167 add w16, w0, #'A' - 10
2168 add w0, w0, #'0'
2169 cmov.hs w0, w16
2170
2171 #else
2172 notimpl
2173 #endif
2174
2175 ret
2176
2177 endproc
2178
2179 proc x1f
2180
2181 // verify collatz conjecture starting at a; assume a /= 0!
2182
2183 #if defined(__x86_64__)
2184
2185 0: bsf rcx, rax // clobber c if a = 0
2186 shr rax, cl // a = 2^c a'
2187 cmp rdx, 0
2188 je 1f
2189 stosq
2190 dec rdx
2191 1:
2192 cmp rax, 1 // done?
2193 je 9f
2194 lea rax, [2*rax + rax + 1] // a' = 3 a' + 1
2195 jmp 0b // again
2196
2197 9: ret
2198
2199 #elif defined(__i386__)
2200
2201 0: bsf ecx, eax // clobber c if a = 0
2202 shr eax, cl // a = 2^c a'
2203 cmp edx, 0
2204 je 1f
2205 stosd
2206 dec edx
2207 1:
2208 cmp eax, 1 // done?
2209 je 9f
2210 lea eax, [2*eax + eax + 1] // a' = 3 a' + 1
2211 jmp 0b // again
2212
2213 9: ret
2214
2215 #elif defined(__arm__)
2216
2217 // rbit introduced in armv7
2218 0: rbit r2, r0
2219 clz r2, r2
2220 mov r0, r0, lsr r2 // a = 2^c a'
2221 cmp r3, #0
2222 strne r0, [r5], #4
2223 subne r3, r3, #1
2224 cmp r0, #1
2225 adcne r0, r0, r0, lsl #1 // a' = 3 a' + 1 (because c set)
2226 bne 0b
2227
2228 ret
2229
2230 #elif defined(__aarch64__)
2231
2232 0: rbit w2, w0
2233 clz w2, w2
2234 lsr w0, w0, w2 // a = 2^c a'
2235 cmp x3, #0
2236 beq 1f
2237 str x0, [x5], #8
2238 sub x3, x3, #1
2239 1:
2240 cmp w0, #1
2241 add w16, w0, w0, lsl #1 // t = 3 a'
2242 csinc.eq w0, w0, w16 // a' = done ? a' : t + 1 = 3 a' + 1
2243 b.ne 0b
2244
2245 ret
2246
2247 #else
2248 notimpl
2249 #endif
2250
2251 endproc
2252
2253 ///--------------------------------------------------------------------------
2254 /// 0x20--0x2f
2255
2256 proc x20
2257
2258 // calculate 1337 a slowly
2259
2260 #if defined(__x86_64__)
2261
2262 // original version
2263 mov rcx, rax // c = a
2264 shl rcx, 2 // c = 4 a
2265 add rcx, rax // c = 5 a
2266 shl rcx, 3 // c = 40 a
2267 add rcx, rax // c = 41 a
2268 shl rcx, 1 // c = 82 a
2269 add rcx, rax // c = 83 a
2270 shl rcx, 1 // c = 166 a
2271 add rcx, rax // c = 167 a
2272 shl rcx, 3 // c = 1336 a
2273 add rcx, rax // c = 1337 a
2274
2275 // a quick way
2276 lea rdx, [2*rax + rax] // t = 3 a
2277 shl rdx, 6 // t = 192 a
2278 sub rdx, rax // t = 191 a
2279 lea rbx, [8*rdx] // b = 1528 a
2280 sub rbx, rdx // b = 1337 a
2281
2282 #elif defined(__i386__)
2283
2284 // original version
2285 mov ecx, eax // c = a
2286 shl ecx, 2 // c = 4 a
2287 add ecx, eax // c = 5 a
2288 shl ecx, 3 // c = 40 a
2289 add ecx, eax // c = 41 a
2290 shl ecx, 1 // c = 82 a
2291 add ecx, eax // c = 83 a
2292 shl ecx, 1 // c = 166 a
2293 add ecx, eax // c = 167 a
2294 shl ecx, 3 // c = 1336 a
2295 add ecx, eax // c = 1337 a
2296
2297 // a quick way
2298 lea edx, [2*eax + eax] // t = 3 a
2299 shl edx, 6 // t = 192 a
2300 sub edx, eax // t = 191 a
2301 lea ebx, [8*edx] // b = 1528 a
2302 sub ebx, edx // b = 1337 a
2303
2304 #elif defined(__arm__)
2305
2306 // original version, ish
2307 add r2, r0, r0, lsl #2 // c = 5 a
2308 add r2, r0, r2, lsl #3 // c = 41 a
2309 add r2, r0, r2, lsl #1 // c = 83 a
2310 add r2, r0, r2, lsl #1 // c = 167 a
2311 add r2, r0, r2, lsl #3 // c = 1337 a
2312
2313 // quicker way
2314 add r1, r0, r0, lsl #1 // b = 3 a
2315 rsb r1, r0, r1, lsl #6 // b = 191 a
2316 rsb r1, r1, r1, lsl #3 // b = 1337 a
2317
2318 #elif defined(__aarch64__)
2319
2320 // original version, ish
2321 add x2, x0, x0, lsl #2 // c = 5 a
2322 add x2, x0, x2, lsl #3 // c = 41 a
2323 add x2, x0, x2, lsl #1 // c = 83 a
2324 add x2, x0, x2, lsl #1 // c = 167 a
2325 add x2, x0, x2, lsl #3 // c = 1337 a
2326
2327 // sleazy because no rsb
2328 add x1, x0, x0, lsl #1 // b = 3 a
2329 sub x1, x0, x1, lsl #6 // b = -191 a
2330 sub x1, x1, x1, lsl #3 // b = 1337 a
2331
2332 #else
2333 notimpl
2334 #endif
2335
2336 ret
2337
2338 endproc
2339
2340 proc x21
2341
2342 // multiply complex numbers a + b i and c + d i
2343 //
2344 // (a + b i) (c + d i) = (a c - b d) + (a d + b c) i
2345 //
2346 // somewhat slick approach uses only three multiplications
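        // the identities in play: a c - b d = c (a + b) - b (c + d), and
        // a d + b c = a (d - c) + c (a + b); both right-hand sides share
        // the product c (a + b), so three multiplications suffice.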
2347
2348 #if defined(__x86_64__)
2349
2350 mov rsi, rax // t = a
2351 add rax, rbx // a' = a + b
2352 mov rdi, rdx // u = d
2353 sub rdx, rcx // d' = d - c
2354 add rdi, rcx // u = c + d
2355
2356 imul rax, rcx // a' = c (a + b)
2357 imul rsi, rdx // t = a (d - c)
2358 imul rdi, rbx // u = b (c + d)
2359
2360 add rsi, rax // t = a (d - c) + c (a + b)
2361 mov rbx, rsi // b' = a (d - c) + c (a + b)
2362 // = a d + b c
2363 sub rax, rdi // a' = c (a + b) - b (c + d)
2364 // = a c - b d
2365
2366 #elif defined(__i386__)
2367
2368 mov esi, eax // t = a
2369 add eax, ebx // a' = a + b
2370 mov edi, edx // u = d
2371 sub edx, ecx // d' = d - c
2372 add edi, ecx // u = c + d
2373
2374 imul eax, ecx // a' = c (a + b)
2375 imul esi, edx // t = a (d - c)
2376 imul edi, ebx // u = b (c + d)
2377
2378 add esi, eax // t = a (d - c) + c (a + b)
2379 mov ebx, esi // b' = a (d - c) + c (a + b)
2380 // = a d + b c
2381 sub eax, edi // a' = c (a + b) - b (c + d)
2382 // = a c - b d
2383
2384 #elif defined(__arm__)
2385
2386 add r4, r0, r1 // t = a + b
2387 add r5, r2, r3 // u = c + d
2388 sub r3, r3, r2 // d' = d - c
2389
2390 // mls introduced in armv7
2391 mul r4, r4, r2 // t = c (a + b)
2392 mov r2, r1 // c' = a (bah!)
2393 mla r1, r0, r3, r4 // b' = a (d - c) + c (a + b)
2394 // = a d + b c
2395 mls r0, r2, r5, r4 // a' = c (a + b) - b (c + d)
2396 // = a c - b d
2397
2398 #elif defined(__aarch64__)
2399
2400 add x4, x0, x1 // t = a + b
2401 add x5, x2, x3 // u = c + d
2402 sub x3, x3, x2 // d' = d - c
2403
2404 // mls introduced in armv7
2405 mul x4, x4, x2 // t = c (a + b)
2406 mov x2, x1 // c' = a (bah!)
2407 madd x1, x0, x3, x4 // b' = a (d - c) + c (a + b)
2408 // = a d + b c
2409 msub x0, x2, x5, x4 // a' = c (a + b) - b (c + d)
2410 // = a c - b d
2411
2412 #else
2413 notimpl
2414 #endif
2415
2416 ret
2417
2418 endproc
2419
2420 proc x22
2421
2422 // divide by 3
2423
2424 #if defined(__x86_64__)
2425
2426 mov rdx, 0xaaaaaaaaaaaaaaab // = ceil(2/3 2^64)
2427 mul rdx // d' || a' =~ 2/3 a 2^64
2428 shr rdx, 1 // d' = floor(a/3)
2429 mov rax, rdx // a' = floor(a/3)
2430
2431 // we start with 0 <= a < 2^64. write f = ceil(2/3 2^64), so that
2432 // 2/3 < f/2^64 < 2/3 + 1/2^64. then floor(2/3 a) <= floor(a f/2^64)
2433 // <= floor(2/3 a + a/2^64), but a < 2^64 so a/2^64 < 1 and
2434 // floor(a f/2^64) = floor(2/3 a).
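        // concrete (32-bit) example: f = 0xaaaaaaab and a = 9 give
        // a f = 0x600000003, so the high half is 6 and the final shift
        // leaves 3 = floor(9/3).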
2435
2436 #elif defined(__i386__)
2437
2438 mov edx, 0xaaaaaaab // = ceil(2/3 2^32)
2439 mul edx // d' || a' =~ 2/3 a 2^32
2440 shr edx, 1 // d' = floor(a/3)
2441 mov eax, edx // a' = floor(a/3)
2442
2443 #elif defined(__arm__)
2444
2445 ldr r12, =0xaaaaaaab
2446 umull r12, r0, r0, r12
2447 mov r0, r0, lsr #1
2448
2449 #elif defined(__aarch64__)
2450
2451 ldr x16, =0xaaaaaaaaaaaaaaab
2452 umulh x0, x0, x16
2453 lsr x0, x0, #1
2454
2455 #else
2456 notimpl
2457 #endif
2458
2459 ret
2460
2461 endproc
2462
2463 proc x23
2464
2465 #if defined(__x86_64__)
2466
2467 // main loop: shorten a preserving residue class mod 3
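        // (4 == 1 (mod 3), so splitting a = 4 q + r and replacing it by
        // q + r preserves a mod 3.)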
2468 0: cmp rax, 5
2469 jbe 8f
2470 // a > 5
2471 mov rdx, rax // d' = a
2472 shr rdx, 2 // d' = floor(a/4)
2473 and rax, 3 // a = 4 d' + a' (0 <= a' < 4)
2474 add rax, rdx // a' == a (mod 3) but a' < a/4 + 4
2475 jmp 0b
2476
2477 // fix up final value 0 <= a < 6: want 0 <= a < 3
2478 //
2479 // the tricky part is actually a = 3; but the other final cases take
2480 // additional iterations which we can avoid.
2481 8: cmp rax, 3 // set cf iff a < 3
2482 cmc // set cf iff a >= 3
2483 sbb rdx, rdx // d' = a >= 3 ? -1 : 0
2484 and rdx, 3 // d' = a >= 3 ? 3 : 0
2485 sub rax, rdx // a' = a - (a >= 3 ? 3 : 0)
2486 // = a (mod 3)
2487
2488 #elif defined(__i386__)
2489
2490 // main loop: shorten a preserving residue class mod 3
2491 0: cmp eax, 5
2492 jbe 8f
2493 // a > 5
2494 mov edx, eax // d' = a
2495 shr edx, 2 // d' = floor(a/4)
2496 and eax, 3 // a = 4 d' + a' (0 <= a' < 4)
2497 add eax, edx // a' == a (mod 3) but a' < a/4 + 4
2498 jmp 0b
2499
2500 // fix up final value 0 <= a < 6: want 0 <= a < 3
2501 //
2502 // the tricky part is actually a = 3; but the other final cases take
2503 // additional iterations which we can avoid.
2504 8: cmp eax, 3 // set cf iff a < 3
2505 cmc // set cf iff a >= 3
2506 sbb edx, edx // d' = a >= 3 ? -1 : 0
2507 and edx, 3 // d' = a >= 3 ? 3 : 0
2508 sub eax, edx // a' = a - (a >= 3 ? 3 : 0)
2509 // = a (mod 3)
2510
2511 #elif defined(__arm__)
2512
2513 0: cmp r0, #6
2514 andhs r12, r0, #3
2515 addhs r0, r12, r0, lsr #2
2516 bhs 0b
2517
2518 cmp r0, #3
2519 subhs r0, r0, #3
2520
2521 #elif defined(__aarch64__)
2522
2523 0: cmp x0, #6
2524 // blunder on through regardless since this doesn't affect the result
2525 and x16, x0, #3
2526 add x0, x16, x0, lsr #2
2527 b.hs 0b
2528
2529 subs x16, x0, #3
2530 cmov.hs x0, x16
2531
2532 #else
2533 notimpl
2534 #endif
2535
2536 ret
2537
2538 endproc
2539
2540 proc x24
2541
2542 // invert (odd) a mod 2^64
2543 //
2544 // suppose a a_i == 1 (mod 2^{2^i})
2545 //
2546 // clearly good for i = 0, since 2^i = 1 and 2^{2^i} = 2, and a_0 =
2547 // a == 1 (mod 2) by assumption
2548 //
2549 // write a a_i == b_i 2^{2^i} + 1 (mod 2^{2^{i+1}})
2550 // then b_i == (a a_i - 1)/2^{2^i} (mod 2^{2^i})
2551 // to lift inverse, we want x such that a x == -b_i (mod 2^{2^i});
2552 // clearly x = -a_i b_i will do, since a a_i == 1 (mod 2^{2^i})
2553 // then:
2554 // a_{i+1} = a_i - a_i b_i 2^{2^i} = a_i (1 - (a a_i - 1))
2555 // = 2 a_i - a a_i^2
2556 //
2557 // check:
2558 // a a_{i+1} = 2 a a_i - a^2 a_i^2
2559 // == 2 a a_i - (b_i 2^{2^i} + 1)^2
2560 // == 2 (b_i 2^{2^i} + 1) -
2561 // (b_i^2 2^{2^{i+1}} + 2 b_i 2^{2^i} + 1)
2562 // == 1 (mod 2^{2^{i+1}})
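        // worked example (mod 2^8): a = 7, so a_0 = 7 and a a_0 = 49 == 1
        // (mod 2^4); one step gives a_1 = 2 a_0 - a a_0^2 = 14 - 343 =
        // -329 == 183 (mod 2^8), and indeed 7 * 183 = 1281 == 1 (mod 2^8).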
2563
2564 #if defined(__x86_64__)
2565
2566 // rax // a_0 = a
2567 mov rbx, rax // b' = a
2568 mov rsi, rax // t = a_0
2569
2570 0:
2571 cmp rbp, 0
2572 je 1f
2573 stosq
2574 dec rbp
2575 1:
2576 mul rbx // a' = a a_i
2577 mov rcx, rax // c = a a_i
2578
2579 sub rax, 2 // a' = a a_i - 2
2580 neg rax // a' = 2 - a a_i
2581 mul rsi // a_{i+1} = a_i (2 - a a_i)
2582 // = 2 a_i - a a_i^2
2583 mov rsi, rax // t = a_{i+1}
2584
2585 cmp rcx, 1 // done?
2586 ja 0b // no -- iterate
2587
2588 #elif defined(__i386__)
2589
2590 // eax // a_0 = a
2591 mov ebx, eax // b' = a
2592 mov esi, eax // t = a_0
2593
2594 0:
2595 cmp ebp, 0
2596 je 1f
2597 stosd
2598 dec ebp
2599 1:
2600 mul ebx // a' = a a_i
2601 mov ecx, eax // c = a a_i
2602
2603 sub eax, 2 // a' = a a_i - 2
2604 jb 9f // done if < 2
2605 neg eax // a' = 2 - a a_i
2606 mul esi // a_{i+1} = a_i (2 - a a_i)
2607 // = 2 a_i - a a_i^2
2608 mov esi, eax // t = a_{i+1}
2609
2610 jmp 0b // and iterate
2611 9: mov eax, esi // restore
2612
2613 #elif defined(__arm__)
2614
2615 // r0 // a_0 = a
2616 mov r1, r0 // b' = a
2617
2618 0:
2619 cmp r6, #0
2620 strne r0, [r5], #4
2621 subne r6, r6, #1
2622 mul r2, r0, r1 // c = a a_i
2623 rsbs r2, r2, #2 // c = 2 - a a_i
2624 mul r0, r0, r2 // a_{i+1} = a_i (2 - a a_i)
2625 // = 2 a_i - a a_i^2
2626 blo 0b
2627
2628 #elif defined(__aarch64__)
2629
2630 // x0 // a_0 = a
2631 mov x1, x0 // b' = a
2632 mov x16, #2 // because we have no rsb
2633
2634 0:
2635 cmp x6, #0
2636 b.eq 1f
2637 str x0, [x5], #8
2638 sub x6, x6, #1
2639 1:
2640 mul x2, x0, x1 // c = a a_i
2641 subs x2, x16, x2 // c = 2 - a a_i
2642 mul x0, x0, x2 // a_{i+1} = a_i (2 - a a_i)
2643 // = 2 a_i - a a_i^2
2644 b.lo 0b
2645
2646 #else
2647 notimpl
2648 #endif
2649
2650 ret
2651
2652 endproc
2653
2654 proc x25
2655
2656 // a poor approximation to pi/4
2657 //
2658 // think of x and y as being in 16.16 fixed-point format. we sample
2659 // points in the unit square, and determine how many of them are
2660 // within a unit quarter-circle centred at the origin. the area of
2661 // the quarter-circle is pi/4.
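        // the loops below visit every point of a 2^16 x 2^16 grid over the
        // unit square, so the proportion of sampled points falling inside
        // the quarter-circle approximates pi/4.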
2662
2663 #if defined(__x86_64__)
2664
2665 xor eax, eax // a = 0
2666 mov rcx, 1
2667 shl rcx, 0x20 // c =~ 4 billion
2668
2669 0: movzx rbx, cx // x = low 16 bits of c
2670 imul rbx, rbx // b = x^2
2671
2672 ror rcx, 0x10 // switch halves of c
2673 movzx rdx, cx // y = high 16 bits of c
2674 imul rdx, rdx // d = y^2
2675 rol rcx, 0x10 // switch back
2676
2677 add rbx, rdx // r^2 = x^2 + y^2
2678 shr rbx, 0x20 // r^2 >= 1?
2679 cmp rbx, 1 // set cf iff r^2 >= 1
2680 adc rax, 0 // and add onto accumulator
2681 loop 0b
2682
2683 #elif defined(__i386__)
2684
2685 // this is actually better done in 32 bits. the carry has the wrong
2686 // sense here, so instead deduct one for each point outside the
2687 // quarter-circle rather than adding one for each point inside it.
2688 xor eax, eax
2689 xor ecx, ecx
2690
2691 0: movzx ebx, cx
2692 imul ebx, ebx
2693
2694 ror ecx, 0x10
2695 movzx edx, cx
2696 imul edx, edx
2697 rol ecx, 0x10
2698
2699 add ebx, edx // see?
2700 sbb eax, 0
2701 loop 0b
2702
2703 #elif defined(__arm__)
2704
2705 mov r0, #0
2706 mov r2, #0
2707
2708 0: uxth r1, r2, ror #0
2709 uxth r3, r2, ror #16
2710 mul r1, r1, r1
2711 mul r3, r3, r3
2712 cmn r1, r3 // mlas doesn't set cf usefully
2713 addcc r0, r0, #1
2714 adds r2, r2, #1
2715 bne 0b
2716
2717 #elif defined(__aarch64__)
2718
2719 mov w0, #0
2720 mov w2, #0
2721
2722 0: ubfx w1, w2, #0, #16
2723 ubfx w3, w2, #16, #16
2724 sub w2, w2, #1
2725 mul w1, w1, w1
2726 mul w3, w3, w3
2727 cmn w1, w3
2728 cinc.cc w0, w0
2729 cbnz w2, 0b
2730
2731 #else
2732 notimpl
2733 #endif
2734
2735 ret
2736
2737 endproc
2738
2739 proc x26
2740
2741 #if defined(__x86_64__)
2742
2743 notimpl
2744
2745 #elif defined(__i386__)
2746
2747 notimpl
2748
2749 #elif defined(__arm__)
2750
2751 notimpl
2752
2753 #elif defined(__aarch64__)
2754
2755 notimpl
2756
2757 #else
2758 notimpl
2759 #endif
2760
2761 endproc
2762
2763 proc x27
2764
2765 #if defined(__x86_64__)
2766
2767 notimpl
2768
2769 #elif defined(__i386__)
2770
2771 notimpl
2772
2773 #elif defined(__arm__)
2774
2775 notimpl
2776
2777 #elif defined(__aarch64__)
2778
2779 notimpl
2780
2781 #else
2782 notimpl
2783 #endif
2784
2785 endproc
2786
2787 proc x28
2788
2789 #if defined(__x86_64__)
2790
2791 notimpl
2792
2793 #elif defined(__i386__)
2794
2795 notimpl
2796
2797 #elif defined(__arm__)
2798
2799 notimpl
2800
2801 #elif defined(__aarch64__)
2802
2803 notimpl
2804
2805 #else
2806 notimpl
2807 #endif
2808
2809 endproc
2810
2811 proc x29
2812
2813 #if defined(__x86_64__)
2814
2815 notimpl
2816
2817 #elif defined(__i386__)
2818
2819 notimpl
2820
2821 #elif defined(__arm__)
2822
2823 notimpl
2824
2825 #elif defined(__aarch64__)
2826
2827 notimpl
2828
2829 #else
2830 notimpl
2831 #endif
2832
2833 endproc
2834
2835 proc x2a
2836
2837 #if defined(__x86_64__)
2838
2839 notimpl
2840
2841 #elif defined(__i386__)
2842
2843 notimpl
2844
2845 #elif defined(__arm__)
2846
2847 notimpl
2848
2849 #elif defined(__aarch64__)
2850
2851 notimpl
2852
2853 #else
2854 notimpl
2855 #endif
2856
2857 endproc
2858
2859 proc x2b
2860
2861 #if defined(__x86_64__)
2862
2863 notimpl
2864
2865 #elif defined(__i386__)
2866
2867 notimpl
2868
2869 #elif defined(__arm__)
2870
2871 notimpl
2872
2873 #elif defined(__aarch64__)
2874
2875 notimpl
2876
2877 #else
2878 notimpl
2879 #endif
2880
2881 endproc
2882
2883 proc x2c
2884
2885 #if defined(__x86_64__)
2886
2887 notimpl
2888
2889 #elif defined(__i386__)
2890
2891 notimpl
2892
2893 #elif defined(__arm__)
2894
2895 notimpl
2896
2897 #elif defined(__aarch64__)
2898
2899 notimpl
2900
2901 #else
2902 notimpl
2903 #endif
2904
2905 endproc
2906
2907 proc x2d
2908
2909 #if defined(__x86_64__)
2910
2911 notimpl
2912
2913 #elif defined(__i386__)
2914
2915 notimpl
2916
2917 #elif defined(__arm__)
2918
2919 notimpl
2920
2921 #elif defined(__aarch64__)
2922
2923 notimpl
2924
2925 #else
2926 notimpl
2927 #endif
2928
2929 endproc
2930
2931 proc x2e
2932
2933 #if defined(__x86_64__)
2934
2935 notimpl
2936
2937 #elif defined(__i386__)
2938
2939 notimpl
2940
2941 #elif defined(__arm__)
2942
2943 notimpl
2944
2945 #elif defined(__aarch64__)
2946
2947 notimpl
2948
2949 #else
2950 notimpl
2951 #endif
2952
2953 endproc
2954
2955 proc x2f
2956
2957 #if defined(__x86_64__)
2958
2959 notimpl
2960
2961 #elif defined(__i386__)
2962
2963 notimpl
2964
2965 #elif defined(__arm__)
2966
2967 notimpl
2968
2969 #elif defined(__aarch64__)
2970
2971 notimpl
2972
2973 #else
2974 notimpl
2975 #endif
2976
2977 endproc
2978
2979 ///--------------------------------------------------------------------------
2980 /// 0x30--0x3f
2981
2982 proc x30
2983
2984 #if defined(__x86_64__)
2985
2986 notimpl
2987
2988 #elif defined(__i386__)
2989
2990 notimpl
2991
2992 #elif defined(__arm__)
2993
2994 notimpl
2995
2996 #elif defined(__aarch64__)
2997
2998 notimpl
2999
3000 #else
3001 notimpl
3002 #endif
3003
3004 ret
3005
3006 endproc
3007
3008 proc x31
3009
3010 #if defined(__x86_64__)
3011
3012 notimpl
3013
3014 #elif defined(__i386__)
3015
3016 notimpl
3017
3018 #elif defined(__arm__)
3019
3020 notimpl
3021
3022 #elif defined(__aarch64__)
3023
3024 notimpl
3025
3026 #else
3027 notimpl
3028 #endif
3029
3030 endproc
3031
3032 proc x32
3033
3034 #if defined(__x86_64__)
3035
3036 notimpl
3037
3038 #elif defined(__i386__)
3039
3040 notimpl
3041
3042 #elif defined(__arm__)
3043
3044 notimpl
3045
3046 #elif defined(__aarch64__)
3047
3048 notimpl
3049
3050 #else
3051 notimpl
3052 #endif
3053
3054 endproc
3055
3056 proc x33
3057
3058 #if defined(__x86_64__)
3059
3060 notimpl
3061
3062 #elif defined(__i386__)
3063
3064 notimpl
3065
3066 #elif defined(__arm__)
3067
3068 notimpl
3069
3070 #elif defined(__aarch64__)
3071
3072 notimpl
3073
3074 #else
3075 notimpl
3076 #endif
3077
3078 endproc
3079
3080 proc x34
3081
3082 #if defined(__x86_64__)
3083
3084 notimpl
3085
3086 #elif defined(__i386__)
3087
3088 notimpl
3089
3090 #elif defined(__arm__)
3091
3092 notimpl
3093
3094 #elif defined(__aarch64__)
3095
3096 notimpl
3097
3098 #else
3099 notimpl
3100 #endif
3101
3102 endproc
3103
3104 proc x35
3105
3106 #if defined(__x86_64__)
3107
3108 notimpl
3109
3110 #elif defined(__i386__)
3111
3112 notimpl
3113
3114 #elif defined(__arm__)
3115
3116 notimpl
3117
3118 #elif defined(__aarch64__)
3119
3120 notimpl
3121
3122 #else
3123 notimpl
3124 #endif
3125
3126 endproc
3127
3128 proc x36
3129
3130 #if defined(__x86_64__)
3131
3132 notimpl
3133
3134 #elif defined(__i386__)
3135
3136 notimpl
3137
3138 #elif defined(__arm__)
3139
3140 notimpl
3141
3142 #elif defined(__aarch64__)
3143
3144 notimpl
3145
3146 #else
3147 notimpl
3148 #endif
3149
3150 endproc
3151
3152 proc x37
3153
3154 #if defined(__x86_64__)
3155
3156 notimpl
3157
3158 #elif defined(__i386__)
3159
3160 notimpl
3161
3162 #elif defined(__arm__)
3163
3164 notimpl
3165
3166 #elif defined(__aarch64__)
3167
3168 notimpl
3169
3170 #else
3171 notimpl
3172 #endif
3173
3174 endproc
3175
3176 proc x38
3177
3178 #if defined(__x86_64__)
3179
3180 notimpl
3181
3182 #elif defined(__i386__)
3183
3184 notimpl
3185
3186 #elif defined(__arm__)
3187
3188 notimpl
3189
3190 #elif defined(__aarch64__)
3191
3192 notimpl
3193
3194 #else
3195 notimpl
3196 #endif
3197
3198 endproc
3199
3200 proc x39
3201
3202 #if defined(__x86_64__)
3203
3204 notimpl
3205
3206 #elif defined(__i386__)
3207
3208 notimpl
3209
3210 #elif defined(__arm__)
3211
3212 notimpl
3213
3214 #elif defined(__aarch64__)
3215
3216 notimpl
3217
3218 #else
3219 notimpl
3220 #endif
3221
3222 endproc
3223
3224 proc x3a
3225
3226 #if defined(__x86_64__)
3227
3228 notimpl
3229
3230 #elif defined(__i386__)
3231
3232 notimpl
3233
3234 #elif defined(__arm__)
3235
3236 notimpl
3237
3238 #elif defined(__aarch64__)
3239
3240 notimpl
3241
3242 #else
3243 notimpl
3244 #endif
3245
3246 endproc
3247
3248 proc x3b
3249
3250 #if defined(__x86_64__)
3251
3252 notimpl
3253
3254 #elif defined(__i386__)
3255
3256 notimpl
3257
3258 #elif defined(__arm__)
3259
3260 notimpl
3261
3262 #elif defined(__aarch64__)
3263
3264 notimpl
3265
3266 #else
3267 notimpl
3268 #endif
3269
3270 endproc
3271
3272 proc x3c
3273
3274 #if defined(__x86_64__)
3275
3276 notimpl
3277
3278 #elif defined(__i386__)
3279
3280 notimpl
3281
3282 #elif defined(__arm__)
3283
3284 notimpl
3285
3286 #elif defined(__aarch64__)
3287
3288 notimpl
3289
3290 #else
3291 notimpl
3292 #endif
3293
3294 endproc
3295
3296 proc x3d
3297
3298 #if defined(__x86_64__)
3299
3300 notimpl
3301
3302 #elif defined(__i386__)
3303
3304 notimpl
3305
3306 #elif defined(__arm__)
3307
3308 notimpl
3309
3310 #elif defined(__aarch64__)
3311
3312 notimpl
3313
3314 #else
3315 notimpl
3316 #endif
3317
3318 endproc
3319
3320 proc x3e
3321
3322 #if defined(__x86_64__)
3323
3324 notimpl
3325
3326 #elif defined(__i386__)
3327
3328 notimpl
3329
3330 #elif defined(__arm__)
3331
3332 notimpl
3333
3334 #elif defined(__aarch64__)
3335
3336 notimpl
3337
3338 #else
3339 notimpl
3340 #endif
3341
3342 endproc
3343
3344 proc x3f
3345
3346 #if defined(__x86_64__)
3347
3348 notimpl
3349
3350 #elif defined(__i386__)
3351
3352 notimpl
3353
3354 #elif defined(__arm__)
3355
3356 notimpl
3357
3358 #elif defined(__aarch64__)
3359
3360 notimpl
3361
3362 #else
3363 notimpl
3364 #endif
3365
3366 endproc
3367
3368 ///----- That's all, folks --------------------------------------------------