xchg.S: Another couple of (easy) exercises.
1 /// -*- mode: asm; asm-comment-char: 0 -*-
2
3 ///--------------------------------------------------------------------------
4 /// Preliminaries.
5
6 #include <sys/syscall.h>
7
8 #if defined(__i386__) || defined(__x86_64__)
9
10 .intel_syntax noprefix
11
12 #elif defined(__arm__)
13
14 .macro ret
15 bx r14
16 .endm
17
18 .arch armv7-a
19
20 #elif defined(__aarch64__)
21
22 .macro cmov rd, rn, cc
23 csel \rd, \rn, \rd, \cc
24 .endm
25 #define _COND(_) \
26 _(eq) _(ne) _(cs) _(cc) _(vs) _(vc) _(mi) _(pl) \
27 _(ge) _(lt) _(gt) _(le) _(hi) _(ls) _(al) _(nv) \
28 _(hs) _(lo)
29 #define _INST(_) \
30 _(ccmp) _(ccmn) \
31 _(csel) _(cmov) \
32 _(csinc) _(cinc) _(cset) \
33 _(csneg) _(cneg) \
34 _(csinv) _(cinv) _(csetm)
35 #define _CONDVAR(cc) _definstvar cc;
36 #define _INSTVARS(inst) \
37 .macro _definstvar cc; \
38 .macro inst.\cc args:vararg; inst \args, \cc; .endm; \
39 .endm; \
40 _COND(_CONDVAR); \
41 .purgem _definstvar;
42 _INST(_INSTVARS)
43 #undef _COND
44 #undef _INST
45 #undef _CONDVAR
46 #undef _INSTVARS
47
48 #define CCMP_N 8
49 #define CCMP_Z 4
50 #define CCMP_C 2
51 #define CCMP_V 1
52
53 #define CCMP_MI CCMP_N
54 #define CCMP_PL 0
55 #define CCMP_EQ CCMP_Z
56 #define CCMP_NE 0
57 #define CCMP_CS CCMP_C
58 #define CCMP_HS CCMP_C
59 #define CCMP_CC 0
60 #define CCMP_LO 0
61 #define CCMP_VS CCMP_V
62 #define CCMP_VC 0
63 #define CCMP_HI CCMP_C
64 #define CCMP_LS 0
65 #define CCMP_LT CCMP_N
66 #define CCMP_GE 0
67 #define CCMP_LE CCMP_N
68 #define CCMP_GT 0
69
70 #else
71 # error "not supported"
72 #endif
73
74 .macro proc name
75 .globl \name
76 .type \name, STT_FUNC
77 .p2align 4
78 \name\():
79 .macro endproc
80 .size \name, . - \name
81 .purgem endproc
82 .endm
83 .endm
84
85 .macro ch c
86 #if defined(__i386__)
87
88 pushf
89 push eax
90 push ebx
91 push ecx
92 push edx
93 push ebp
94 mov ebp, esp
95 and esp, -16
96
97 push \c
98 call putchar@plt
99
100 call get_pc_ebx
101 add ebx, offset _GLOBAL_OFFSET_TABLE_
102 mov eax, [ebx + stdout@GOT]
103 mov eax, [eax]
104 call fflush@plt
105
106 mov esp, ebp
107 pop ebp
108 pop edx
109 pop ecx
110 pop ebx
111 pop eax
112 popf
113
114 #elif defined(__x86_64__)
115
116 pushf
117 push rax
118 push rcx
119 push rdx
120 push rsi
121 push rdi
122 push r8
123 push r9
124 push rbp
125 mov rbp, rsp
126 and rsp, -16
127
128 mov rdi, \c
129 call putchar@plt
130
131 mov rdi, [rip + stdout]
132 call fflush@plt
133
134 mov rsp, rbp
135 pop rbp
136 pop r9
137 pop r8
138 pop rdi
139 pop rsi
140 pop rdx
141 pop rcx
142 pop rax
143 popf
144
145 #elif defined(__arm__)
146
147 stmfd r13!, {r0-r4, r12, r14}
148
149 mov r4, r13
150 bic r14, r4, #15
151 mov r13, r14
152
153 mov r0, #\c
154 bl putchar@plt
155
156 ldr r14, .L$_c$gotoff$\@
157 .L$_c$gotpc$\@:
158 add r14, pc, r14
159 b .L$_c$cont$\@
160 .L$_c$gotoff$\@:
161 .word _GLOBAL_OFFSET_TABLE_ - .L$_c$gotpc$\@ - 8
162 .L$_c$cont$\@:
163 bl fflush@plt
164
165 mov r13, r4
166 ldmfd r13!, {r0-r4, r12, r14}
167
168 #elif defined(__aarch64__)
169
170 sub sp, sp, #20*8
171 stp x0, x1, [sp, #0]
172 stp x2, x3, [sp, #16]
173 stp x4, x5, [sp, #32]
174 stp x6, x7, [sp, #48]
175 stp x8, x9, [sp, #64]
176 stp x10, x11, [sp, #80]
177 stp x12, x13, [sp, #96]
178 stp x14, x15, [sp, #112]
179 stp x16, x17, [sp, #128]
180 mrs x16, nzcv
181 stp x16, x30, [sp, #144]
182
183 mov w0, #\c
184 bl putchar
185 adrp x0, :got:stdout
186 ldr x0, [x0, #:got_lo12:stdout]
187 ldr x0, [x0]
188 bl fflush
189
190 ldp x16, x30, [sp, #144]
191 msr nzcv, x16
192 ldp x16, x17, [sp, #128]
193 ldp x14, x15, [sp, #112]
194 ldp x12, x13, [sp, #96]
195 ldp x10, x11, [sp, #80]
196 ldp x8, x9, [sp, #64]
197 ldp x6, x7, [sp, #48]
198 ldp x4, x5, [sp, #32]
199 ldp x2, x3, [sp, #16]
200 ldp x0, x1, [sp, #0]
201 add sp, sp, #20*8
202
203 #else
204 # error "not supported"
205 #endif
206 .endm
207
208 .macro notimpl
209 #if defined(__i386__) || defined(__x86_64__)
210 ud2
211 #elif defined(__arm__)
212 udf
213 #elif defined(__aarch64__)
214 hlt #0
215 #else
216 # error "not supported"
217 #endif
218 .endm
219
220 .section .note.GNU-stack, "", %progbits
221
222 .text
223
224 #if defined(__i386__)
225 get_pc_ebx:
226 mov ebx, [esp]
227 ret
228 #endif
229
230
231 proc call_example
232
233 #if defined(__i386__)
234
235 push ebx // ebx
236 push esi // esi, ebx
237 push edi // edi, esi, ebx
238 push ebp // flags, ebp, ..., ebx
239 pushf
240
241 mov edi, [esp + 4*6]
242 mov esi, [esp + 4*7]
243 push esi // regs, flags, ebp, ..., ebx
244
245 call get_pc_ebx
246 lea eax, [ebx + 9f - .]
247 push eax // cont, regs, flags, ebp, ..., ebx
248 push edi // func, cont, regs, flags, ebp, ..., ebx
249
250 mov eax, [esi + 28]
251 pushf
252 pop ecx
253 and eax, 0x0cd5
254 and ecx, ~0x0cd5
255 or eax, ecx
256 push eax
257 popf
258 mov eax, [esi + 0]
259 mov ebx, [esi + 4]
260 mov ecx, [esi + 8]
261 mov edx, [esi + 12]
262 mov edi, [esi + 20]
263 mov ebp, [esi + 24]
264 mov esi, [esi + 16]
265
266 ret // -> func; regs, flags, ebp, ..., ebx
267
268 9: pushf // eflags, regs, flags, ebp, ..., ebx
269 push esi // esi, eflags, regs, flags, ebp, ..., ebx
270 mov esi, [esp + 8]
271 mov [esi + 0], eax
272 mov [esi + 4], ebx
273 mov [esi + 8], ecx
274 mov [esi + 12], edx
275 mov [esi + 20], edi
276 mov [esi + 24], ebp
277 pop eax // rflags, regs, flags, ebp, ..., ebx
278 mov [esi + 16], eax
279 pop eax // regs, flags, ebp, ..., ebx
280 mov [esi + 28], eax
281
282 add esp, 4 // flags, ebp, ..., ebx
283 popf // ebp, ..., ebx
284 pop ebp // ..., ebx
285 pop edi
286 pop esi
287 pop ebx //
288 ret
289
290 #elif defined(__x86_64__)
291
292 push rbx // rbx
293 push r10
294 push r11
295 push r12
296 push r13
297 push r14
298 push r15
299 push rbp // flags, rbp, ..., rbx
300 pushf
301
302 push rsi // regs, flags, rbp, ..., rbx
303
304 lea rax, [rip + 9f]
305 push rax // cont, regs, flags, rbp, ..., rbx
306 push rdi // func, cont, regs, flags, rbp, ..., rbx
307
308 mov rax, [rsi + 8*15]
309 pushf
310 pop rcx
311 and rax, 0x0cd5
312 and rcx, ~0x0cd5
313 or rax, rcx
314 push rax
315 popf
316 mov rax, [rsi + 0]
317 mov rbx, [rsi + 8]
318 mov rcx, [rsi + 16]
319 mov rdx, [rsi + 24]
320 mov rdi, [rsi + 40]
321 mov rbp, [rsi + 48]
322 mov r8, [rsi + 56]
323 mov r9, [rsi + 64]
324 mov r10, [rsi + 72]
325 mov r11, [rsi + 80]
326 mov r12, [rsi + 88]
327 mov r13, [rsi + 96]
328 mov r14, [rsi + 104]
329 mov r15, [rsi + 112]
330 mov rsi, [rsi + 32]
331
332 ret // -> func; regs, flags, rbp, ..., rbx
333
334 9: pushf // rflags, regs, flags, rbp, ..., rbx
335 push rsi // rsi, rflags, regs, flags, rbp, ..., rbx
336 mov rsi, [rsp + 16]
337 mov [rsi + 0], rax
338 mov [rsi + 8], rbx
339 mov [rsi + 16], rcx
340 mov [rsi + 24], rdx
341 mov [rsi + 40], rdi
342 mov [rsi + 48], rbp
343 mov [rsi + 56], r8
344 mov [rsi + 64], r9
345 mov [rsi + 72], r10
346 mov [rsi + 80], r11
347 mov [rsi + 88], r12
348 mov [rsi + 96], r13
349 mov [rsi + 104], r14
350 mov [rsi + 112], r15
351 pop rax // rflags, regs, flags, rbp, ..., rbx
352 mov [rsi + 32], rax
353 pop rax // regs, flags, rbp, ..., rbx
354 mov [rsi + 120], rax
355
356 add rsp, 8 // flags, rbp, ..., rbx
357 popf // rbp, ..., rbx
358 pop rbp // ..., rbx
359 pop r15
360 pop r14
361 pop r13
362 pop r12
363 pop r11
364 pop r10
365 pop rbx //
366 ret
367
368 #elif defined(__arm__)
369
370 stmfd r13!, {r0, r1, r4-r11, r14}
371 ldmia r1, {r0-r12, r14}
372 msr cpsr, r14
373 mov r14, pc
374 ldr pc, [r13], #4
375 ldr r14, [r13], #4
376 stmia r14!, {r0-r12}
377 mrs r0, cpsr
378 str r0, [r14]
379 ldmfd r13!, {r4-r11, pc}
380
381 #elif defined(__aarch64__)
382
383 stp x29, x30, [sp, #-14*8]!
384 mov x29, sp
385 stp x19, x20, [sp, #16]
386 stp x21, x22, [sp, #32]
387 stp x23, x24, [sp, #48]
388 stp x25, x26, [sp, #64]
389 stp x27, x28, [sp, #80]
390 str x1, [sp, #104]
391
392 ldp x29, x30, [x1, #224]
393 msr nzcv, x30
394 mov x30, x0
395 ldp x27, x28, [x1, #208]
396 ldp x25, x26, [x1, #192]
397 ldp x23, x24, [x1, #176]
398 ldp x21, x22, [x1, #160]
399 ldp x19, x20, [x1, #144]
400 ldp x16, x17, [x1, #128]
401 ldp x14, x15, [x1, #112]
402 ldp x12, x13, [x1, #96]
403 ldp x10, x11, [x1, #80]
404 ldp x8, x9, [x1, #64]
405 ldp x6, x7, [x1, #48]
406 ldp x4, x5, [x1, #32]
407 ldp x2, x3, [x1, #16]
408 ldp x0, x1, [x1, #0]
409
410 blr x30
411
412 ldr x30, [sp, #104]
413 stp x27, x28, [x30, #208]
414 stp x25, x26, [x30, #192]
415 stp x23, x24, [x30, #176]
416 stp x21, x22, [x30, #160]
417 stp x19, x20, [x30, #144]
418 stp x16, x17, [x30, #128]
419 stp x14, x15, [x30, #112]
420 stp x12, x13, [x30, #96]
421 stp x10, x11, [x30, #80]
422 stp x8, x9, [x30, #64]
423 stp x6, x7, [x30, #48]
424 stp x4, x5, [x30, #32]
425 stp x2, x3, [x30, #16]
426 stp x0, x1, [x30, #0]
427 mov x0, x30
428 mrs x30, nzcv
429 stp x29, x30, [x0, #224]
430
431 ldp x19, x20, [sp, #16]
432 ldp x21, x22, [sp, #32]
433 ldp x23, x24, [sp, #48]
434 ldp x25, x26, [sp, #64]
435 ldp x27, x28, [sp, #80]
436 ldp x29, x30, [sp], #14*8
437
438 ret
439
440 #else
441 # error "not supported"
442 #endif
443
444 endproc
445
446 proc nop
447
448 ret
449
450 endproc
451
452 ///--------------------------------------------------------------------------
453 /// 0x00--0x0f
454
455 proc x00
456
457 // clear all 64 bits of extended traditional registers
458
459 #if defined(__x86_64__)
460
461 xor eax, eax // clear rax
462 lea rbx, [0] // rbx -> _|_
463 loop . // iterate, decrement rcx until zero
464 mov rdx, 0 // set rdx = 0
465 and esi, 0 // clear all bits of rsi
466 sub edi, edi // set rdi = edi - edi = 0
467 push 0
468 pop rbp // pop 0 into rbp
469
470 #elif defined(__i386__)
471
472 xor eax, eax
473 lea ebx, [0]
474 loop .
475 mov edx, 0
476 and esi, 0
477 sub edi, edi
478 push 0
479 pop ebp
480
481 #elif defined(__arm__)
482
483 eor r0, r0, r0
484 rsb r1, r1, r1
485 0: subs r2, r2, #1
486 bne 0b
487 mov r3, #0
488 and r4, r4, #0
489 sub r5, r5, r5
490
491 #elif defined(__aarch64__)
492
493 eor w0, w0, w0
494 mov w1, wzr
495 0: sub w2, w2, #1
496 cbnz w2, 0b
497 mov w3, #0
498 and w4, w4, wzr
499 sub w5, w5, w5
500
501 #else
502 notimpl
503 #endif
504
505 ret
506
507 endproc
508
509 proc x01
510
511 // advance a fibonacci pair by c steps
512 //
513 // on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1}
514 // and f_{i+c}, where f_{i+1} = f_i + f_{i-1}
515
516 #if defined(__x86_64__)
517
518 0: xadd rax, rdx // a, d = a + d, a
519 // = f_{i+1} + f_i, f_{i+1}
520 // = f_{i+2}, f_{i+1}
521 loop 0b // advance i, decrement c, iterate
522
523 #elif defined(__i386__)
524
525 0: xadd eax, edx
526 loop 0b
527
528 #elif defined(__arm__)
529
530 0: subs r2, r2, #2
531 add r3, r3, r0
532 blo 8f
533 add r0, r0, r3
534 bhi 0b
535
536 8: movne r0, r3
537
538 #elif defined(__aarch64__)
539
540 0: subs x2, x2, #2
541 add x3, x3, x0
542 b.lo 8f
543 add x0, x0, x3
544 b.hi 0b
545
546 8: cmov.ne x0, x3
547
548 #else
549 notimpl
550 #endif
551
552 ret
553
554 endproc
555
556 proc x02
557
558 // boolean canonify a: if a = 0 on entry, leave it zero; otherwise
559 // set a = 1
560
561 #if defined(__x86_64__)
562
563 neg rax // set cf iff a /= 0
564 sbb rax, rax // a = a - a - cf = -cf
565 neg rax // a = cf
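// a quick check of the trick: a = 5 gives cf = 1 from the first
// neg, then sbb leaves a = -1 and the final neg gives 1; with
// a = 0, cf stays clear and a remains 0 throughout.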
566
567 #elif defined(__i386__)
568
569 neg eax
570 sbb eax, eax
571 neg eax
572
573 #elif defined(__arm__)
574
575 movs r1, r0 // the easy way
576 movne r1, #1 // mvnne r1, #1 for mask
577
578 cmp r0, #1 // clear cf iff a == 0
579 sbc r2, r0, r0 // c' = a - a - 1 + cf = cf - 1
580 add r2, r2, #1 // c' = cf
581
582 sub r3, r0, r0, lsr #1 // d' top bit clear; d' = 0 iff a = 0
583 rsb r3, r3, #0 // d' top bit set iff a /= 0
584 mov r3, r3, lsr #31 // asr for mask
585
586 rsbs r0, r0, #0
587 sbc r0, r0, r0
588 rsb r0, r0, #0
589
590 #elif defined(__aarch64__)
591
592 cmp x0, #0 // trivial
593 cset.ne x1 // csetm for mask
594
595 cmp xzr, x0 // set cf iff a == 0
596 sbc x2, x0, x0 // c' = a - a - 1 + cf = cf - 1
597 neg x2, x2 // c' = 1 - cf
598
599 sub x3, x0, x0, lsr #1 // if a < 2^63 then a' = ceil(d/2) <
600 // 2^63
601 // if a >= 2^63, write a = 2^63 + t
602 // with t < 2^63; d' = 2^63 - 2^62 +
603 // ceil(t/2) = 2^62 + ceil(t/2), and
604 // ceil(t/2) < 2^62
605 // anyway d' < 2^63 and d' = 0 iff
606 // a = 0
607 neg x3, x3 // d' top bit set iff a /= 0
608 lsr x3, x3, #63 // asr for mask
609
610 cmp x0, #1 // set cf iff a /= 0
611 adc x0, xzr, xzr // a' = 0 + 0 + cf = cf
612
613 #else
614 notimpl
615 #endif
616
617 ret
618
619 endproc
620
621 proc x03
622
623 // set a = min(a, d) (unsigned); clobber c, d
624
625 #if defined(__x86_64__)
626
627 sub rdx, rax // d' = d - a; set cf if a > d
628 sbb rcx, rcx // c = -cf = -[a > d]
629 and rcx, rdx // c = a > d ? d - a : 0
630 add rax, rcx // a' = a > d ? d : a
631
632 #elif defined(__i386__)
633
634 sub edx, eax
635 sbb ecx, ecx
636 and ecx, edx
637 add eax, ecx
638
639 #elif defined(__arm__)
640
641 cmp r0, r3 // the easy way
642 movlo r1, r0 // only needed for out-of-place
643 movhs r1, r3
644
645 subs r3, r3, r0
646 sbc r12, r12, r12
647 and r12, r12, r3
648 add r0, r0, r12
649
650 #elif defined(__aarch64__)
651
652 cmp x0, x3 // the easy way
653 csel.lo x1, x0, x3
654
655 subs x3, x3, x0 // d' = d - a; set cf if d >= a
656 sbc x16, xzr, xzr // t = -1 + cf = -[a > d]
657 and x16, x16, x3 // t = a > d ? d - a : 0
658 add x0, x0, x16 // a' = a > d ? d : a
659
660 #else
661 notimpl
662 #endif
663
664 ret
665
666 endproc
667
668 proc x04
669
670 // switch case?
671
672 #if defined(__x86_64__)
673
674 // unrelated playing
675 mov ecx, eax
676 mov rbx, -1
677 mov edx, ecx
678 sub edx, '0'
679 cmp edx, 10
680 cmovb rbx, rdx
681 or ecx, 0x20
682 mov edx, ecx
683 sub edx, 'a'
684 sub ecx, 'a' - 10
685 cmp edx, 6
686 cmovb rbx, rcx
687
688 xor al, 0x20
689
690 #elif defined(__i386__)
691
692 // unrelated playing
693 mov ecx, eax
694 mov ebx, -1
695 mov edx, ecx
696 sub edx, '0'
697 cmp edx, 10
698 cmovb ebx, edx
699 or ecx, 0x20
700 mov edx, ecx
701 sub edx, 'a'
702 sub ecx, 'a' - 10
703 cmp edx, 6
704 cmovb ebx, ecx
705
706 xor al, 0x20
707
708 #elif defined(__arm__)
709
710 // unrelated playing
711 mvn r1, #0
712 sub r12, r0, #'0'
713 cmp r12, #10
714 movlo r1, r12
715 orr r12, r0, #0x20
716 sub r12, r12, #'a'
717 cmp r12, #6
718 addlo r1, r12, #10
719
720 eor r0, r0, #0x20
721
722 #elif defined(__aarch64__)
723
724 // unrelated playing
725 mov x1, #-1
726 sub w16, w0, #'0'
727 cmp w16, #10
728 cmov.lo x1, x16
729 orr w16, w0, #0x20
730 sub w16, w16, #'a' - 10
731 cmp w16, #10
732 ccmp.hs w16, #16, #CCMP_HS
733 cmov.lo x1, x16
734
735 eor w0, w0, #0x20
736
737 #else
738 notimpl
739 #endif
740
741 ret
742
743 endproc
744
745 proc x05
746
747 // answer whether 5 <= a </<= 9.
748
749 #if defined(__x86_64__)
750
751 sub rax, 5 // a' = a - 5
752 cmp rax, 4 // is a' = a - 5 </<= 4?
753
754 // cc a' a
755 //
756 // z/e a' = 4 a = 9
757 // nz/ne a' /= 4 a /= 9
758 //
759 // a/nbe a' > 4 a > 9 or a < 5
760 // nc/ae/nb a' >= 4 a >= 9 or a < 5
761 // c/b/nae a' < 4 5 <= a < 9
762 // be/na a' <= 4 5 <= a <= 9
763 //
764 // o a' < -2^63 + 4 -2^63 + 5 <= a < -2^63 + 9
765 // no a' >= -2^63 + 4 a >= -2^63 + 9 or
766 // a < -2^63 + 5
767 // s -2^63 + 4 <= a' < 4 -2^63 + 9 <= a < 9
768 // ns a' < -2^63 + 4 or a < -2^63 + 9 or a >= 9
769 // a' >= 4
770 // ge/nl a' >= 4 a >= 9 or a < -2^63 + 5
771 // l/nge a' < 4 -2^63 + 5 <= a < 9
772 // g/nle a' > 4 a > 9 or a < -2^63 + 5
773 // le/ng a' <= 4 -2^63 + 5 <= a <= 9
774
775 #elif defined(__i386__)
776
777 sub eax, 5
778 cmp eax, 4
779
780 #elif defined(__arm__)
781
782 // i dimly remember having a slick way to do this way back in the
783 // day, but i can't figure it out any more.
784 sub r0, #5
785 cmp r0, #4
786
787 #elif defined(__aarch64__)
788
789 // literal translation is too obvious
790 cmp x0, #5
791 ccmp.hs x0, #9, #CCMP_HS
792
793 #else
794 notimpl
795 #endif
796
797 ret
798
799 endproc
800
801 proc x06
802
803 // leave a unchanged, but set zf if a = 0, cf if a /= 0, clear of,
804 // set sf to msb(a)
805
806 #if defined(__x86_64__)
807
808 not rax // a' = -a - 1
809 inc rax // a' = -a
810 neg rax // a' = a
811
812 #elif defined(__i386__)
813
814 not eax
815 inc eax
816 neg eax
817
818 #elif defined(__arm__)
819
820 mvn r0, r0
821 add r0, r0, #1
822 rsbs r0, r0, #0 // cf has opposite sense
823
824 #elif defined(__aarch64__)
825
826 mvn x0, x0
827 add x0, x0, #1
828 negs x0, x0 // cf has opposite sense
829
830 #else
831 notimpl
832 #endif
833
834 ret
835
836 endproc
837
838 proc x07
839
840 // same as before (?)
841
842 #if defined(__x86_64__)
843
844 inc rax // a' = a + 1
845 neg rax // a' = -a - 1
846 inc rax // a' = -a
847 neg rax // a' = a
848
849 #elif defined(__i386__)
850
851 inc eax
852 neg eax
853 inc eax
854 neg eax
855
856 #elif defined(__arm__)
857
858 add r0, r0, #1
859 rsb r0, r0, #0
860 add r0, r0, #1
861 rsbs r0, r0, #0
862
863 #elif defined(__aarch64__)
864
865 add x0, x0, #1
866 neg x0, x0
867 add x0, x0, #1
868 negs x0, x0 // cf has opposite sense
869
870 #else
871 notimpl
872 #endif
873
874 ret
875
876 endproc
877
878 proc x08
879
880 // floor((a + d)/2), correctly handling overflow conditions; final cf
881 // is lsb(a + d), probably uninteresting
882
883 #if defined(__x86_64__)
884
885 add rax, rdx // cf || a' = a + d
886 rcr rax, 1 // shift 65-bit result right by one
887 // place; lsb moves into carry
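// worked overflow case: a = d = 2^63.  the true sum 2^64 doesn't
// fit in the register, but add leaves a = 0 with cf = 1, and rcr
// pulls the carry back in at the top, giving the correct mean 2^63.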
888
889 #elif defined(__i386__)
890
891 add eax, edx
892 rcr eax, 1
893
894 #elif defined(__arm__)
895
896 // like the two-instruction a64 version
897 sub r1, r3, r0
898 add r1, r0, r1, lsr #1
899
900 // the slick version, similar to the above
901 adds r0, r0, r3
902 mov r0, r0, rrx
903
904 #elif defined(__aarch64__)
905
906 // a64 lacks a32's rrx. literal translation.
907 adds x1, x0, x3 // cf || a' = a + d
908 adc x16, xzr, xzr // realize cf in extra register
909 extr x1, x16, x1, #1 // shift down one place
910
911 // two instruction version: clobbers additional register. (if you
912 // wanted the answer in any other register, even overwriting d, then
913 // this is unnecessary.) also depends on d >= a.
914 sub x16, x3, x0 // compute difference
915 add x0, x0, x16, lsr #1 // add half of it (rounded down)
916
917 #else
918 notimpl
919 #endif
920
921 ret
922
923 endproc
924
925 proc x09
926
927 // a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3
928 // (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8).
929
930 #if defined(__x86_64__)
931
932 shr rax, 3 // a' = floor(a/8); cf = 1 if a ==
933 // 4, 5, 6, 7 (mod 8)
934 adc rax, 0 // a' = floor(a/8) + cf
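// e.g., a = 11 = 0b1011: shr gives 1 with cf = 0, so a' = 1;
// a = 12 = 0b1100: shr gives 1 with cf = 1, so a' = 2 -- halves
// round upwards.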
935
936 #elif defined(__i386__)
937
938 shr eax, 3
939 adc eax, 0
940
941 #elif defined(__arm__)
942
943 movs r0, r0, lsr #3
944 adc r0, r0, #0
945
946 #elif defined(__aarch64__)
947
948 tst x0, #4
949 orr x0, xzr, x0, lsr #3
950 cinc.ne x0, x0
951
952 #else
953 notimpl
954 #endif
955
956 ret
957
958 endproc
959
960 proc x0a
961
962 // increment c-byte little-endian bignum at rdi
963
964 #if defined(__x86_64__)
965
966 add byte ptr [rdi], 1
967 0: inc rdi
968 adc byte ptr [rdi], 0
969 loop 0b
970
971 #elif defined(__i386__)
972
973 add byte ptr [edi], 1
974 0: inc edi
975 adc byte ptr [edi], 0
976 loop 0b
977
978 #elif defined(__arm__)
979
980 mov r12, #256 // set initial carry
981 0: ldrb r0, [r5]
982 subs r2, r2, #1
983 add r12, r0, r12, lsr #8
984 strb r12, [r5], #1
985 bne 0b
986
987 #elif defined(__aarch64__)
988
989 mov w17, #256 // set initial carry
990 0: ldrb w16, [x5]
991 sub x2, x2, #1
992 add w17, w16, w17, lsr #8
993 strb w17, [x5], #1
994 cbnz x2, 0b
995
996 #else
997 notimpl
998 #endif
999
1000 ret
1001
1002 endproc
1003
1004 proc x0b
1005
1006 // negate double-precision d:a
1007
1008 #if defined(__x86_64__)
1009
1010 not rdx // d' = -d - 1
1011 neg rax // a' = -a;
1012 // cf = 1 iff a /= 0
1013 sbb rdx, -1 // d' = -d - cf
1014
1015 #elif defined(__i386__)
1016
1017 not edx
1018 neg eax
1019 sbb edx, -1
1020
1021 #elif defined(__arm__)
1022
1023 // reverse subtract is awesome
1024 rsbs r0, r0, #0
1025 rsc r3, r3, #0
1026
1027 #elif defined(__aarch64__)
1028
1029 // easy way: everything is better with zero registers.
1030 negs x0, x0
1031 ngc x3, x3
1032
1033 #else
1034 notimpl
1035 #endif
1036
1037 ret
1038
1039 endproc
1040
1041 proc x0c
1042
1043 // rotate is distributive over xor.
1044
1045 #if defined(__x86_64__)
1046
1047 // rax // = a_1 || a_0
1048 // rbx // = b_1 || b_0
1049 mov rcx, rax // = a_1 || a_0
1050
1051 xor rcx, rbx // = (a_1 XOR b_1) || (a_0 XOR b_0)
1052 ror rcx, 0xd // = (a_0 XOR b_0) || (a_1 XOR b_1)
1053
1054 ror rax, 0xd // = a_0 || a_1
1055 ror rbx, 0xd // = b_0 || b_1
1056 xor rax, rbx // = (a_0 XOR b_0) || (a_1 XOR b_1)
1057
1058 cmp rax, rcx // always equal
1059
1060 #elif defined(__i386__)
1061
1062 mov ecx, eax // = a_1 || a_0
1063
1064 xor ecx, ebx // = (a_1 XOR b_1) || (a_0 XOR b_0)
1065 ror ecx, 0xd // = (a_0 XOR b_0) || (a_1 XOR b_1)
1066
1067 ror eax, 0xd // = a_0 || a_1
1068 ror ebx, 0xd // = b_0 || b_1
1069 xor eax, ebx // = (a_0 XOR b_0) || (a_1 XOR b_1)
1070
1071 cmp eax, ecx // always equal
1072
1073 #elif defined(__arm__)
1074
1075
1076 // r0 // = a_1 || a_0
1077 // r1 // = b_1 || b_0
1078 eor r2, r0, r1 // = (a_1 XOR b_1) || (a_0 XOR b_0)
1079 mov r2, r2, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
1080
1081 mov r1, r1, ror #13 // = b_0 || b_1
1082 eor r0, r1, r0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
1083
1084 cmp r0, r2 // always equal
1085
1086 #elif defined(__aarch64__)
1087
1088 // x0 // = a_1 || a_0
1089 // x1 // = b_1 || b_0
1090 eor x2, x0, x1 // = (a_1 XOR b_1) || (a_0 XOR b_0)
1091 ror x2, x2, #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
1092
1093 ror x1, x1, #13 // = b_0 || b_1
1094 eor x0, x1, x0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
1095
1096 cmp x0, x2 // always equal
1097
1098 #else
1099 notimpl
1100 #endif
1101
1102 ret
1103
1104 endproc
1105
1106 proc x0d
1107
1108 // and is distributive over xor.
1109
1110 #if defined(__x86_64__)
1111
1112 mov rdx, rbx // = b
1113
1114 xor rbx, rcx // = b XOR c
1115 and rbx, rax // = a AND (b XOR c)
1116
1117 and rdx, rax // = a AND b
1118 and rax, rcx // = a AND c
1119 xor rax, rdx // = (a AND b) XOR (a AND c)
1120 // = a AND (b XOR c)
1121
1122 cmp rax, rbx // always equal
1123
1124 #elif defined(__i386__)
1125
1126 mov edx, ebx // = b
1127
1128 xor ebx, ecx // = b XOR c
1129 and ebx, eax // = a AND (b XOR c)
1130
1131 and edx, eax // = a AND b
1132 and eax, ecx // = a AND c
1133 xor eax, edx // = (a AND b) XOR (a AND c)
1134 // = a AND (b XOR c)
1135
1136 cmp eax, ebx // always equal
1137
1138 #elif defined(__arm__)
1139
1140 and r3, r0, r1 // = a AND b
1141
1142 eor r1, r1, r2 // = b XOR c
1143 and r1, r1, r0 // = a AND (b XOR c)
1144
1145 and r0, r0, r2 // = a AND c
1146 eor r0, r0, r3 // = (a AND b) XOR (a AND c)
1147 // = a AND (b XOR c)
1148
1149 cmp r0, r1 // always equal
1150
1151 #elif defined(__aarch64__)
1152
1153 and x3, x0, x1 // = a AND b
1154
1155 eor x1, x1, x2 // = b XOR c
1156 and x1, x1, x0 // = a AND (b XOR c)
1157
1158 and x0, x0, x2 // = a AND c
1159 eor x0, x0, x3 // = (a AND b) XOR (a AND c)
1160 // = a AND (b XOR c)
1161
1162 cmp x0, x1 // always equal
1163
1164 #else
1165 notimpl
1166 #endif
1167
1168 ret
1169
1170 endproc
1171
1172 proc x0e
1173
1174 // de morgan's law
1175
1176 #if defined(__x86_64__)
1177
1178 mov rcx, rax // = a
1179
1180 and rcx, rbx // = a AND b
1181 not rcx // = NOT (a AND b)
1182
1183 not rax // = NOT a
1184 not rbx // = NOT b
1185 or rax, rbx // = (NOT a) OR (NOT b)
1186 // = NOT (a AND b)
1187
1188 cmp rax, rcx // always equal
1189
1190 #elif defined(__i386__)
1191
1192 mov ecx, eax // = a
1193
1194 and ecx, ebx // = a AND b
1195 not ecx // = NOT (a AND b)
1196
1197 not eax // = NOT a
1198 not ebx // = NOT b
1199 or eax, ebx // = (NOT a) OR (NOT b)
1200 // = NOT (a AND b)
1201
1202 cmp eax, ecx // always equal
1203
1204 #elif defined(__arm__)
1205
1206 and r2, r0, r1 // = a AND b
1207 mvn r2, r2 // = NOT (a AND b)
1208
1209 mvn r0, r0 // = NOT a
1210 mvn r1, r1 // = NOT b
1211 orr r0, r0, r1 // = (NOT a) OR (NOT b)
1212
1213 cmp r0, r2 // always equal
1214
1215 #elif defined(__aarch64__)
1216
1217 and x2, x0, x1 // = a AND b
1218 mvn x2, x2 // = NOT (a AND b)
1219
1220 mvn x0, x0 // = NOT a
1221 orn x0, x0, x1 // = (NOT a) OR (NOT b)
1222
1223 cmp x0, x2 // always equal
1224
1225 #else
1226 notimpl
1227 #endif
1228
1229 ret
1230
1231 endproc
1232
1233 proc x0f
1234
1235 // replace input buffer bytes with cumulative XORs with initial a;
1236 // final a is XOR of all buffer bytes and initial a.
1237 //
1238 // not sure why you'd do this.
1239
1240 #if defined(__x86_64__)
1241
1242 0: xor [rsi], al
1243 lodsb
1244 loop 0b
1245
1246 #elif defined(__i386__)
1247
1248 0: xor [esi], al
1249 lodsb
1250 loop 0b
1251
1252 #elif defined(__arm__)
1253
1254 0: ldrb r12, [r4]
1255 subs r2, r2, #1
1256 eor r0, r0, r12
1257 strb r0, [r4], #1
1258 bne 0b
1259
1260 #elif defined(__aarch64__)
1261
1262 0: ldrb w16, [x4]
1263 sub x2, x2, #1
1264 eor w0, w0, w16
1265 strb w0, [x4], #1
1266 cbnz x2, 0b
1267
1268 #else
1269 notimpl
1270 #endif
1271
1272 ret
1273
1274 endproc
1275
1276 ///--------------------------------------------------------------------------
1277 /// 0x10--0x1f
1278
1279 proc x10
1280
1281 // four different ways to swap a pair of registers.
1282
1283 #if defined(__x86_64__)
1284
1285 push rax
1286 push rcx
1287 pop rax
1288 pop rcx
1289
1290 xor rax, rcx
1291 xor rcx, rax
1292 xor rax, rcx
1293
1294 add rax, rcx
1295 sub rcx, rax
1296 add rax, rcx
1297 neg rcx
1298
1299 xchg rax, rcx
1300
1301 #elif defined(__i386__)
1302
1303 push eax
1304 push ecx
1305 pop eax
1306 pop ecx
1307
1308 xor eax, ecx
1309 xor ecx, eax
1310 xor eax, ecx
1311
1312 add eax, ecx
1313 sub ecx, eax
1314 add eax, ecx
1315 neg ecx
1316
1317 xchg eax, ecx
1318
1319 #elif defined(__arm__)
1320
1321 stmfd r13!, {r0, r2}
1322 ldr r0, [r13, #4]
1323 ldr r2, [r13], #8
1324
1325 eor r0, r0, r2
1326 eor r2, r2, r0
1327 eor r0, r0, r2
1328
1329 sub r0, r0, r2
1330 add r2, r2, r0
1331 rsb r0, r0, r2 // don't need 3-addr with reverse-sub
1332
1333 mov r12, r0
1334 mov r0, r2
1335 mov r2, r12
1336
1337 #elif defined(__aarch64__)
1338
1339 // anything you can do
1340 stp x0, x2, [sp, #-16]!
1341 ldp x2, x0, [sp], #16
1342
1343 eor x0, x0, x2
1344 eor x2, x2, x0
1345 eor x0, x0, x2
1346
1347 // the add/sub/add thing was daft. you can do it in three if you're
1348 // clever -- and have three-address operations.
1349 sub x0, x0, x2
1350 add x2, x2, x0
1351 sub x0, x2, x0
1352
1353 // but we lack a fourth. we can't do this in fewer than three
1354 // instructions without hitting memory. only `ldp' will modify two
1355 // registers at a time, so we need at least two instructions -- but
1356 // if the first one sets one of our two registers to its final value
1357 // then we lose the other input value with no way to recover it, so
1358 // we must either write a fresh third register, or write something
1359 // other than the final value, and in both cases we need a third
1360 // instruction to fix everything up. we've done the wrong-something-
1361 // other trick twice, so here's the captain-obvious use-a-third-
1362 // register version.
1363 mov x16, x0
1364 mov x0, x2
1365 mov x2, x16
1366
1367 #else
1368 notimpl
1369 #endif
1370
1371 ret
1372
1373 endproc
1374
1375 proc x11
1376
1377 // assuming a is initialized to zero, set a to the inclusive or of
1378 // the xor-differences of corresponding bytes in the c-byte strings
1379 // at si and di.
1380 //
1381 // in particular, a will be zero (and zf set) if and only if the two
1382 // strings are equal.
1383
1384 #if defined(__x86_64__)
1385
1386 0: mov dl, [rsi]
1387 xor dl, [rdi]
1388 inc rsi
1389 inc rdi
1390 or al, dl
1391 loop 0b
1392
1393 #elif defined(__i386__)
1394
1395 0: mov dl, [esi]
1396 xor dl, [edi]
1397 inc esi
1398 inc edi
1399 or al, dl
1400 loop 0b
1401
1402 #elif defined(__arm__)
1403
1404 0: ldrb r1, [r4], #1
1405 ldrb r12, [r5], #1
1406 subs r2, r2, #1
1407 eor r12, r12, r1
1408 orr r0, r0, r12
1409 bne 0b
1410
1411 #elif defined(__aarch64__)
1412
1413 0: ldrb w16, [x4], #1
1414 ldrb w17, [x5], #1
1415 sub x2, x2, #1
1416 eor w16, w16, w17
1417 orr w0, w0, w16
1418 cbnz x2, 0b
1419
1420 #else
1421 notimpl
1422 #endif
1423
1424 ret
1425
1426 endproc
1427
1428 proc x12
1429
1430 // an obtuse way of adding two registers. for any bit position, a
1431 // OR d is set if and only if at least one of a and d has a bit set
1432 // in that position, and a AND d is set if and only if both have a
1433 // bit set in that position. essentially, then, what we've done is
1434 // move all of the set bits in d to a, unless there's already a bit
1435 // there. this clearly doesn't change the sum.
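// in other words, (a OR d) + (a AND d) = a + d: every bit of the
// two inputs is counted exactly once on each side.  e.g., a = 6,
// d = 3: (6 OR 3) + (6 AND 3) = 7 + 2 = 9 = 6 + 3.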
1436
1437 #if defined(__x86_64__)
1438
1439 mov rcx, rdx // c' = d
1440 and rdx, rax // d' = a AND d
1441 or rax, rcx // a' = a OR d
1442 add rax, rdx
1443
1444 #elif defined(__i386__)
1445
1446 mov ecx, edx // c' = d
1447 and edx, eax // d' = a AND d
1448 or eax, ecx // a' = a OR d
1449 add eax, edx
1450
1451 #elif defined(__arm__)
1452
1453 and r2, r0, r3 // c' = a AND d
1454 orr r0, r0, r3 // a' = a OR d
1455 add r0, r0, r2
1456
1457 #elif defined(__aarch64__)
1458
1459 and x2, x0, x3 // c' = a AND d
1460 orr x0, x0, x3 // a' = a OR d
1461 add x0, x0, x2
1462
1463 #else
1464 notimpl
1465 #endif
1466
1467 ret
1468
1469 endproc
1470
1471 proc x13
1472
1473 // ok, so this is a really obtuse way of adding a and b; the result
1474 // is in a and d. but why does it work?
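// one way to see it: for any x and y,
//
// x + y = (x XOR y) + 2 (x AND y),
//
// so each pass leaves a + b unchanged while pushing the carry bits
// in b one place to the left; after at most 64 passes b is zero and
// a holds the sum, hence the fixed loop count below.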
1475
1476 #if defined(__x86_64__)
1477
1478 mov rcx, 0x40 // carry chains at most 64 long
1479 0: mov rdx, rax // copy a'
1480 xor rax, rbx // low bits of each bitwise sum
1481 and rbx, rdx // carry bits from each bitwise sum
1482 shl rbx, 1 // carry them into next position
1483 loop 0b
1484
1485 #elif defined(__i386__)
1486
1487 mov ecx, 0x40 // carry chains at most 64 long
1488 0: mov edx, eax // copy a'
1489 xor eax, ebx // low bits of each bitwise sum
1490 and ebx, edx // carry bits from each bitwise sum
1491 shl ebx, 1 // carry them into next position
1492 loop 0b
1493
1494 #elif defined(__arm__)
1495
1496 mov r2, #0x40
1497 0: and r3, r0, r1
1498 subs r2, r2, #1
1499 eor r0, r0, r1
1500 lsl r1, r3, #1
1501 bne 0b
1502
1503 #elif defined(__aarch64__)
1504
1505 mov x2, #0x40
1506 0: and x3, x0, x1
1507 sub x2, x2, #1
1508 eor x0, x0, x1
1509 lsl x1, x3, #1
1510 cbnz x2, 0b
1511
1512 #else
1513 notimpl
1514 #endif
1515
1516 ret
1517
1518 endproc
1519
1520 proc x14
1521
1522 // floor((a + d)/2), like x08.
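// this time computed as (a AND d) + (a XOR d)/2: the and collects
// the bits common to both inputs (each worth a full unit of the
// average), the xor the bits in exactly one (each worth half).
// e.g., a = 7, d = 5: 5 + (2 >> 1) = 6 = floor(12/2).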
1523
1524 #if defined(__x86_64__)
1525
1526 mov rcx, rax // copy a for later
1527 and rcx, rdx // carry bits
1528
1529 xor rax, rdx // low bits of each bitwise sum
1530 shr rax, 1 // divide by 2; carries now in place
1531
1532 add rax, rcx // add the carries; done
1533
1534 #elif defined(__i386__)
1535
1536 mov ecx, eax // copy a for later
1537 and ecx, edx // carry bits
1538
1539 xor eax, edx // low bits of each bitwise sum
1540 shr eax, 1 // divide by 2; carries now in place
1541
1542 add eax, ecx // add the carries; done
1543
1544 #elif defined(__arm__)
1545
1546 and r2, r0, r3
1547 eor r0, r0, r3
1548 add r0, r2, r0, lsr #1
1549
1550 #elif defined(__aarch64__)
1551
1552 and x2, x0, x3
1553 eor x0, x0, x3
1554 add x0, x2, x0, lsr #1
1555
1556 #else
1557 notimpl
1558 #endif
1559
1560 ret
1561
1562 endproc
1563
1564 proc x15
1565
1566 // sign extension 32 -> 64 bits.
1567
1568 #if defined(__x86_64__)
1569
1570 movsx rbx, eax // like this?
1571
1572 mov rdx, 0xffffffff80000000
1573 add rax, rdx // if bit 31 of a is set then bits
1574 // 31--63 of a' are clear; otherwise,
1575 // these bits are all set -- which is
1576 // exactly backwards
1577 xor rax, rdx // so fix it
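// e.g., 32-bit a = 0x80000000: the add wraps to zero, and the xor
// then gives 0xffffffff80000000, the correct sign extension.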
1578
1579 #elif defined(__i386__)
1580
1581 movsx ebx, ax // like this?
1582
1583 mov edx, 0xffff8000
1584 add eax, edx // if bit 15 of a is set then bits
1585 // 15--31 of a' are clear; otherwise,
1586 // these bits are all set -- which is
1587 // exactly backwards
1588 xor eax, edx // so fix it
1589
1590 #elif defined(__arm__)
1591
1592 sxth r1, r0 // like this
1593
1594 mov r12, #0x80000000
1595 add r0, r0, r12, asr #16
1596 eor r0, r0, r12, asr #16
1597
1598 #elif defined(__aarch64__)
1599
1600 sxtw x1, w0 // like this
1601
1602 mov x16, #0xffffffff80000000
1603 add x0, x0, x16
1604 eor x0, x0, x16
1605
1606 #else
1607 notimpl
1608 #endif
1609
1610 ret
1611
1612 endproc
1613
1614 proc x16
1615
1616 // ??? i don't know why you'd want to calculate this.
1617
1618 #if defined(__x86_64__)
1619
1620 xor rax, rbx // a' = a XOR b
1621 xor rbx, rcx // b' = b XOR c
1622 mov rsi, rax // t = a XOR b
1623 add rsi, rbx // t = (a XOR b) + (b XOR c)
1624 cmovc rax, rbx // a' = cf ? b XOR c : a XOR b
1625 xor rax, rbx // a' = cf ? 0 : a XOR c
1626 cmp rax, rsi
1627
1628 #elif defined(__i386__)
1629
1630 xor eax, ebx // a' = a XOR b
1631 xor ebx, ecx // b' = b XOR c
1632 mov esi, eax // t = a XOR b
1633 add esi, ebx // t = (a XOR b) + (b XOR c)
1634 cmovc eax, ebx // a' = cf ? b XOR c : a XOR b
1635 xor eax, ebx // a' = cf ? 0 : a XOR c
1636 cmp eax, esi
1637
1638 #elif defined(__arm__)
1639
1640 eor r0, r0, r1
1641 eor r1, r1, r2
1642 adds r4, r0, r1
1643 movcs r0, r1
1644 eor r0, r0, r1
1645 cmp r0, r4
1646
1647 #elif defined(__aarch64__)
1648
1649 eor x0, x0, x1
1650 eor x1, x1, x2
1651 adds x4, x0, x1
1652 cmov.cs x0, x1
1653 eor x0, x0, x1
1654 cmp x0, x4
1655
1656 #else
1657 notimpl
1658 #endif
1659
1660 ret
1661
1662 endproc
1663
1664 proc x17
1665
1666 // absolute value
1667
1668 #if defined(__x86_64__)
1669
1670 cqo // d = a < 0 ? -1 : 0
1671 xor rax, rdx // a' = a < 0 ? -a - 1 : a
1672 sub rax, rdx // a' = a < 0 ? -a : a
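// e.g., a = -5: d = -1, a XOR d = 4, and 4 - (-1) = 5; for a >= 0
// we get d = 0 and a passes through unchanged.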
1673
1674 #elif defined(__i386__)
1675
1676 cdq // d = a < 0 ? -1 : 0
1677 xor eax, edx // a' = a < 0 ? -a - 1 : a
1678 sub eax, edx // a' = a < 0 ? -a : a
1679
1680 #elif defined(__arm__)
1681
1682 // direct approach
1683 movs r1, r0
1684 rsbmi r1, r0, #0
1685
1686 // faithful-ish conversion
1687 eor r3, r0, r0, asr #31
1688 sub r0, r3, r0, asr #31
1689
1690 #elif defined(__aarch64__)
1691
1692 // direct approach
1693 tst x0, #1 << 63
1694 cneg.ne x1, x0
1695
1696 // faithful-ish conversion
1697 eor x3, x0, x0, asr #63
1698 sub x0, x3, x0, asr #63
1699
1700 #else
1701 notimpl
1702 #endif
1703
1704 ret
1705
1706 endproc
1707
1708 proc x18
1709
1710 // should always set sf, clear zf, unless we get rescheduled to a
1711 // different core.
1712
1713 #if defined(__x86_64__)
1714
1715 rdtsc // d || a = cycles
1716 shl rdx, 0x20
1717 or rax, rdx // a = cycles
1718 mov rcx, rax // c = cycles
1719
1720 rdtsc // d || a = cycles'
1721 shl rdx, 0x20
1722 or rax, rdx // a = cycles'
1723
1724 cmp rcx, rax
1725
1726 #elif defined(__i386__)
1727
1728 rdtsc // d || a = cycles
1729 mov ebx, eax
1730 mov ecx, edx // c || b = cycles
1731
1732 rdtsc // d || a = cycles'
1733
1734 sub ebx, eax
1735 sbb ecx, edx
1736
1737 #elif defined(__arm__)
1738
1739 // cycle clock not available in user mode
1740 mrrc p15, 0, r0, r1, c9
1741 mrrc p15, 0, r2, r3, c9
1742 subs r0, r0, r2
1743 sbcs r1, r1, r3
1744
1745 #elif defined(__aarch64__)
1746
1747 // cycle clock not available in user mode
1748 mrs x0, pmccntr_el0
1749 mrs x1, pmccntr_el0
1750 cmp x0, x1
1751
1752 #else
1753 notimpl
1754 #endif
1755
1756 ret
1757
1758 endproc
1759
1760 proc x19
1761
1762 // stupid way to capture a pointer to inline data and jump past it.
1763 // confuses the return-address predictor something chronic. worse
1764 // because amd64 calling convention doesn't usually pass arguments on
1765 // the stack.
1766
1767 #if defined(__x86_64__)
1768
1769 call 8f
1770 .string "hello world!\n\0"
1771 8: call print_str
1772 add rsp, 8
1773 ret
1774
1775 print_str:
1776 // actually implement this ridiculous thing
1777 mov rsi, [rsp + 8]
1778 xor edx, edx
1779 0: mov al, [rsi + rdx]
1780 inc rdx
1781 cmp al, 0
1782 jnz 0b
1783 mov eax, SYS_write
1784 mov edi, 1
1785 dec rdx
1786 syscall // clobbers r11 :-(
1787 ret
1788
1789 #elif defined(__i386__)
1790
1791 call 8f
1792 .string "hello world!\n\0"
1793 8: call print_str
1794 add esp, 4
1795 ret
1796
1797 print_str:
1798 // actually implement this ridiculous thing
1799 mov ecx, [esp + 4]
1800 xor edx, edx
1801 0: mov al, [ecx + edx]
1802 inc edx
1803 cmp al, 0
1804 jnz 0b
1805 mov eax, SYS_write
1806 mov ebx, 1
1807 dec edx
1808 int 0x80
1809 ret
1810
1811 #elif defined(__arm__)
1812
1813 // why am i doing this?
1814 stmfd r13!, {r14}
1815 bl 8f
1816 .string "hello world!\n\0"
1817 .balign 4
1818 8: mov r1, r14 // might as well make it easy on myself
1819 bl print_str
1820 ldmfd r13!, {pc}
1821
1822 print_str:
1823 mov r2, #0
1824 0: ldrb r0, [r1, r2]
1825 cmp r0, #0
1826 addne r2, r2, #1
1827 bne 0b
1828 mov r0, #1
1829 mov r7, #SYS_write
1830 swi 0
1831 bx r14
1832
1833 #elif defined(__aarch64__)
1834
1835 // why am i doing this?
1836 str x30, [sp, #-16]!
1837 bl 8f
1838 .string "hello world!\n\0"
1839 .balign 4
1840 8: mov x1, x30 // might as well make it easy on myself
1841 bl print_str
1842 ldr x30, [sp], #16
1843 ret
1844
1845 print_str:
1846 mov x2, #0
1847 0: ldrb w0, [x1, x2]
1848 cmp w0, #0
1849 cinc.ne x2, x2
1850 b.ne 0b
1851 mov x0, #1
1852 mov x8, #SYS_write
1853 svc #0
1854 ret
1855
1856 #else
1857 notimpl
1858 #endif
1859
1860 endproc
1861
1862 proc x1a
1863
1864 // collect the current instruction-pointer address. this was an old
1865 // 32-bit i386 trick for position-independent code, but (a) it
1866 // confuses the return predictor, and (b) amd64 has true pc-relative
1867 // addressing.
1868
1869 #if defined(__x86_64__)
1870
1871 // the actual example
1872 call 0f
1873 0: pop rax
1874
1875 // the modern i386 trick doesn't confuse the return-address
1876 // predictor.
1877 call calladdr_rbx
1878 sub rbx, . - 0b
1879
1880 // but rip-relative addressing is even better
1881 lea rcx, [rip + 0b]
1882
1883 ret
1884
1885 calladdr_rbx:
1886 mov rbx, [rsp]
1887 ret
1888
1889 #elif defined(__i386__)
1890
1891 // the actual example
1892 call 0f
1893 0: pop eax
1894
1895 // the modern i386 trick doesn't confuse the return-address
1896 // predictor.
1897 call get_pc_ebx
1898 sub ebx, . - 0b
1899
1900 ret
1901
1902 #elif defined(__arm__)
1903
1904 stmfd r13!, {r14}
1905
1906 bl 0f
1907 0: mov r0, r14
1908
1909 bl return
1910 sub r1, r14, #. - 0b
1911
1912 adr r2, 0b
1913
1914 ldmfd r13!, {pc}
1915
1916 return: bx r14
1917
1918 #elif defined(__aarch64__)
1919
1920 str x30, [sp, #-16]!
1921
1922 // we can do all of the above using a64
1923 bl 0f
1924 0: mov x0, x30
1925
1926 bl return
1927 sub x1, x30, #. - 0b
1928
1929 adr x2, 0b
1930
1931 ldr x30, [sp], #16
1932 return: ret
1933
1934 #else
1935 notimpl
1936 #endif
1937
1938 endproc
1939
1940 proc x1b
1941
1942 #if defined(__x86_64__)
1943
1944 // retpolines: an mitigation against adversarially influenced
1945 // speculative execution at indirect branches. if an adversary can
1946 // prepare a branch-target buffer entry matching an indirect branch
1947 // in the victim's address space then they can cause the victim to
1948 // /speculatively/ (but not architecturally) execute any code in
1949 // their address space, possibly leading to leaking secrets through
1950 // the cache. retpolines aren't susceptible to this because the
1951 // predicted destination address is from the return-prediction stack
1952 // which the adversary can't prime. the performance penalty is still
1953 // essentially a branch misprediction -- for this return, and
1954 // possibly all others already stacked.
1955
1956 // (try not to crash)
1957 lea rax, [rip + 9f]
1958
1959 push rax
1960 9: ret
1961
1962 #elif defined(__i386__)
1963
1964 call get_pc_ebx
1965 lea eax, [ebx + 9f - .]
1966
1967 push eax
1968 9: ret
1969
1970 #elif defined(__arm__)
1971
1972 stmfd r13!, {r14}
1973
1974 adr r14, 8f
1975 bx r14
1976
1977 8: ldmfd r13!, {pc}
1978
1979 #elif defined(__aarch64__)
1980
1981 str x30, [sp, #-16]!
1982
1983 adr x30, 8f
1984 ret
1985
1986 8: ldr x30, [sp], #16
1987 ret
1988
1989 #else
1990 notimpl
1991 #endif
1992
1993 endproc
1994
1995 proc x1c
1996
1997 // ok, having a hard time seeing a use for this. the most important
1998 // thing to note is that sp is set from `pop' /after/ it's
1999 // incremented.
2000
2001 #if defined(__x86_64__)
2002
2003 // try not to crash
2004 mov rax, rsp
2005 and rsp, -16
2006 push rax
2007
2008 pop rsp
2009
2010 // check it worked
2011 mov rbx, rsp
2012 ret
2013
2014 #elif defined(__i386__)
2015
2016 // try not to crash
2017 mov eax, esp
2018 and esp, -16
2019 push eax
2020
2021 pop esp
2022
2023 // check it worked
2024 mov ebx, esp
2025 ret
2026
2027 #elif defined(__arm__)
2028
2029 // not even going to dignify this
2030 notimpl
2031
2032 #elif defined(__aarch64__)
2033
2034 // not even going to dignify this
2035 notimpl
2036
2037 #else
2038 notimpl
2039 #endif
2040
2041 endproc
2042
2043 proc x1d
2044
2045 // monumentally cheesy way to copy 8 n bytes from buff1 to buff2.
2046 // also clobbers words at buff2 + 8 n and buff2 - 8 for good measure.
2047
2048 n = 4
2049
2050 #if defined(__x86_64__)
2051
2052 mov rax, rsp // safekeeping
2053
2054 // we're toast if we get hit by a signal now. fingers crossed...
2055 .if 0
2056 mov rsp, buff2 + 8*n + 8
2057 mov rbp, buff1 + 8*n
2058 .else
2059 lea rsp, [rdi + 8*n + 16]
2060 lea rbp, [rsi + 8*n]
2061 .endif
2062 enter 0, n + 1
2063
2064 // precise action:
2065 //
2066 // +---------+ +---------+
2067 // rbp -> | ??? | rsp -> | ??? |
2068 // +---------+ +---------+
2069 // | w_{n-1} | | rbp | <- rbp'
2070 // +---------+ +---------+
2071 // | ... | | w_{n-1} |
2072 // +---------+ +---------+
2073 // | w_1 | | ... |
2074 // +---------+ +---------+
2075 // | w_0 | | w_1 |
2076 // +---------+ +---------+
2077 // | w_0 |
2078 // +---------+
2079 // | rbp' | <- rsp'
2080 // +---------+
2081
2082 mov rdx, rsp
2083 mov rsp, rax
2084
2085 #elif defined(__i386__)
2086
2087 mov eax, esp // safekeeping
2088
2089 // we're toast if we get hit by a signal now. fingers crossed...
2090 .if 0
2091 mov esp, buff2 + 4*n + 4
2092 mov ebp, buff1 + 4*n
2093 .else
2094 lea esp, [edi + 4*n + 8]
2095 lea ebp, [esi + 4*n]
2096 .endif
2097 enter 0, n + 1
2098
2099 mov edx, esp
2100 mov esp, eax
2101
2102 #elif defined(__arm__)
2103
2104 add r4, r4, #4*n
2105 add r5, r5, #4*n + 8
2106
2107 str r4, [r5, #-4]!
2108 .rept n/2
2109 ldrd r0, r1, [r4, #-8]!
2110 strd r0, r1, [r5, #-8]!
2111 .endr
2112 add r4, r5, #4*n
2113 str r4, [r5, #-4]!
2114
2115 #elif defined(__aarch64__)
2116
2117 // omgwtf. let's not actually screw with the stack pointer.
2118
2119 add x4, x4, #8*n
2120 add x5, x5, #8*n + 16
2121
2122 str x4, [x5, #-8]!
2123 .rept n/2
2124 ldp x16, x17, [x4, #-16]!
2125 stp x16, x17, [x5, #-16]!
2126 .endr
2127 add x4, x5, #8*n
2128 str x4, [x5, #-8]!
2129
2130 #else
2131 notimpl
2132 #endif
2133
2134 ret
2135
2136 endproc
2137
2138 proc x1e
2139
2140 // convert nibble value to (uppercase) hex; other input values yield
2141 // nonsense.
2142
2143 #if defined(__x86_64__)
2144
2145 // das doesn't work in 64-bit mode; best i can come up with
2146 mov edx, eax
2147 add al, '0'
2148 add dl, 'A' - 10
2149 cmp al, '9' + 1
2150 cmovae eax, edx
2151
2152 #elif defined(__i386__)
2153
2154 cmp al, 0x0a // cf = 1 iff a < 10
2155 sbb al, 0x69 // if 0 <= a < 10, a' = a - 0x6a, so
2156 // 0x96 <= a' < 0xa0, setting af, cf
2157 // if 10 <= a < 16, a' = a - 0x69, so
2158 // 0xa1 <= a' < 0xa7, setting cf but
2159 // clearing af
2160 das // if 0 <= a < 10, then af and cf are
2161 // both set, so subtract 0x66
2162 // from a' leaving 0x30 <= a' < 0x3a;
2163 // if 10 <= a < 16 then af clear but
2164 // cf set, so subtract 0x60 from a'
2165 // leaving 0x41 <= a' < 0x47
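// worked example: a = 0x0c.  cmp clears cf, sbb leaves a' = 0xa3
// with cf set and af clear, and das subtracts 0x60 to give
// 0x43 = 'C'.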
2166
2167 #elif defined(__arm__)
2168
2169 // significantly less tricksy
2170 cmp r0, #10
2171 addlo r0, r0, #'0'
2172 addhs r0, r0, #'A' - 10
2173
2174 #elif defined(__aarch64__)
2175
2176 // with less versatile conditional execution this is the best we can
2177 // do
2178 cmp w0, #10
2179 add w16, w0, #'A' - 10
2180 add w0, w0, #'0'
2181 cmov.hs w0, w16
2182
2183 #else
2184 notimpl
2185 #endif
2186
2187 ret
2188
2189 endproc
2190
2191 proc x1f
2192
2193 // verify collatz conjecture starting at a; assume a /= 0!
2194
2195 #if defined(__x86_64__)
2196
2197 0: bsf rcx, rax // clobber c if a = 0
2198 shr rax, cl // a = 2^c a'
2199 cmp rdx, 0
2200 je 1f
2201 stosq
2202 dec rdx
2203 1:
2204 cmp rax, 1 // done?
2205 je 9f
2206 lea rax, [2*rax + rax + 1] // a' = 3 a' + 1
2207 jmp 0b // again
2208
2209 9: ret
2210
2211 #elif defined(__i386__)
2212
2213 0: bsf ecx, eax // clobber c if a = 0
2214 shr eax, cl // a = 2^c a'
2215 cmp edx, 0
2216 je 1f
2217 stosd
2218 dec edx
2219 1:
2220 cmp eax, 1 // done?
2221 je 9f
2222 lea eax, [2*eax + eax + 1] // a' = 3 a' + 1
2223 jmp 0b // again
2224
2225 9: ret
2226
2227 #elif defined(__arm__)
2228
2229 // rbit introduced in armv7
2230 0: rbit r2, r0
2231 clz r2, r2
2232 mov r0, r0, lsr r2 // a = 2^c a'
2233 cmp r3, #0
2234 strne r0, [r5], #4
2235 subne r3, r3, #1
2236 cmp r0, #1
2237 adcne r0, r0, r0, lsl #1 // a' = 3 a' + 1 (because c set)
2238 bne 0b
2239
2240 ret
2241
2242 #elif defined(__aarch64__)
2243
2244 0: rbit w2, w0
2245 clz w2, w2
2246 lsr w0, w0, w2 // a = 2^c a'
2247 cmp x3, #0
2248 beq 1f
2249 str x0, [x5], #8
2250 sub x3, x3, #1
2251 1:
2252 cmp w0, #1
2253 add w16, w0, w0, lsl #1 // t = 3 a'
2254 csinc.eq w0, w0, w16 // leave a' = 1 alone; else a' = t + 1 = 3 a' + 1
2255 b.ne 0b
2256
2257 ret
2258
2259 #else
2260 notimpl
2261 #endif
2262
2263 endproc
2264
2265 ///--------------------------------------------------------------------------
2266 /// 0x20--0x2f
2267
2268 proc x20
2269
2270 // calculate 1337 a slowly
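// (for reference, 1337 = 7 . 191 = 7 (3 . 64 - 1); the `quick way'
// below builds 191 a first and then multiplies by 7 as 8 - 1.)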
2271
2272 #if defined(__x86_64__)
2273
2274 // original version
2275 mov rcx, rax // c = a
2276 shl rcx, 2 // c = 4 a
2277 add rcx, rax // c = 5 a
2278 shl rcx, 3 // c = 40 a
2279 add rcx, rax // c = 41 a
2280 shl rcx, 1 // c = 82 a
2281 add rcx, rax // c = 83 a
2282 shl rcx, 1 // c = 166 a
2283 add rcx, rax // c = 167 a
2284 shl rcx, 3 // c = 1336 a
2285 add rcx, rax // c = 1337 a
2286
2287 // a quick way
2288 lea rdx, [2*rax + rax] // t = 3 a
2289 shl rdx, 6 // t = 192 a
2290 sub rdx, rax // t = 191 a
2291 lea rbx, [8*rdx] // b = 1528 a
2292 sub rbx, rdx // b = 1337 a
2293
2294 #elif defined(__i386__)
2295
2296 // original version
2297 mov ecx, eax // c = a
2298 shl ecx, 2 // c = 4 a
2299 add ecx, eax // c = 5 a
2300 shl ecx, 3 // c = 40 a
2301 add ecx, eax // c = 41 a
2302 shl ecx, 1 // c = 82 a
2303 add ecx, eax // c = 83 a
2304 shl ecx, 1 // c = 166 a
2305 add ecx, eax // c = 167 a
2306 shl ecx, 3 // c = 1336 a
2307 add ecx, eax // c = 1337 a
2308
2309 // a quick way
2310 lea edx, [2*eax + eax] // t = 3 a
2311 shl edx, 6 // t = 192 a
2312 sub edx, eax // t = 191 a
2313 lea ebx, [8*edx] // b = 1528 a
2314 sub ebx, edx // b = 1337 a
2315
2316 #elif defined(__arm__)
2317
2318 // original version, ish
2319 add r2, r0, r0, lsl #2 // c = 5 a
2320 add r2, r0, r2, lsl #3 // c = 41 a
2321 add r2, r0, r2, lsl #1 // c = 83 a
2322 add r2, r0, r2, lsl #1 // c = 167 a
2323 add r2, r0, r2, lsl #3 // c = 1337 a
2324
2325 // quicker way
2326 add r1, r0, r0, lsl #1 // b = 3 a
2327 rsb r1, r0, r1, lsl #6 // b = 191 a
2328 rsb r1, r1, r1, lsl #3 // b = 1337 a
2329
2330 #elif defined(__aarch64__)
2331
2332 // original version, ish
2333 add x2, x0, x0, lsl #2 // c = 5 a
2334 add x2, x0, x2, lsl #3 // c = 41 a
2335 add x2, x0, x2, lsl #1 // c = 83 a
2336 add x2, x0, x2, lsl #1 // c = 167 a
2337 add x2, x0, x2, lsl #3 // c = 1337 a
2338
2339 // sleazy because no rsb
2340 add x1, x0, x0, lsl #1 // b = 3 a
2341 sub x1, x0, x1, lsl #6 // b = -191 a
2342 sub x1, x1, x1, lsl #3 // b = 1337 a
2343
2344 #else
2345 notimpl
2346 #endif
2347
2348 ret
2349
2350 endproc
2351
2352 proc x21
2353
2354 // multiply complex numbers a + b i and c + d i
2355 //
2356 // (a + b i) (c + d i) = (a c - b d) + (a d + b c) i
2357 //
2358 // somewhat slick approach uses only three multiplications
2359
2360 #if defined(__x86_64__)
2361
2362 mov rsi, rax // t = a
2363 add rax, rbx // a' = a + b
2364 mov rdi, rdx // u = d
2365 sub rdx, rcx // d' = d - c
2366 add rdi, rcx // u = c + d
2367
2368 imul rax, rcx // a' = c (a + b)
2369 imul rsi, rdx // t = a (d - c)
2370 imul rdi, rbx // u = b (c + d)
2371
2372 add rsi, rax // t = a (d - c) + c (a + b)
2373 mov rbx, rsi // b' = a (d - c) + c (a + b)
2374 // = a d + b c
2375 sub rax, rdi // a' = c (a + b) - b (c + d)
2376 // = a c - b d
2377
2378 #elif defined(__i386__)
2379
2380 mov esi, eax // t = a
2381 add eax, ebx // a' = a + b
2382 mov edi, edx // u = d
2383 sub edx, ecx // d' = d - c
2384 add edi, ecx // u = c + d
2385
2386 imul eax, ecx // a' = c (a + b)
2387 imul esi, edx // t = a (d - c)
2388 imul edi, ebx // u = b (c + d)
2389
2390 add esi, eax // t = a (d - c) + c (a + b)
2391 mov ebx, esi // b' = a (d - c) + c (a + b)
2392 // = a d + b c
2393 sub eax, edi // a' = c (a + b) - b (c + d)
2394 // = a c - b d
2395
2396 #elif defined(__arm__)
2397
2398 add r4, r0, r1 // t = a + b
2399 add r5, r2, r3 // u = c + d
2400 sub r3, r3, r2 // d' = d - c
2401
2402 // mls introduced in armv7
2403 mul r4, r4, r2 // t = c (a + b)
2404 mov r2, r1 // c' = a (bah!)
2405 mla r1, r0, r3, r4 // b' = a (d - c) + c (a + b)
2406 // = a d + b c
2407 mls r0, r2, r5, r4 // a' = c (a + b) - b (c + d)
2408 // = a c - b d
2409
2410 #elif defined(__aarch64__)
2411
2412 add x4, x0, x1 // t = a + b
2413 add x5, x2, x3 // u = c + d
2414 sub x3, x3, x2 // d' = d - c
2415
2416 // a64's madd/msub stand in for armv7's mla/mls
2417 mul x4, x4, x2 // t = c (a + b)
2418 mov x2, x1 // c' = a (bah!)
2419 madd x1, x0, x3, x4 // b' = a (d - c) + c (a + b)
2420 // = a d + b c
2421 msub x0, x2, x5, x4 // a' = c (a + b) - b (c + d)
2422 // = a c - b d
2423
2424 #else
2425 notimpl
2426 #endif
2427
2428 ret
2429
2430 endproc
2431
2432 proc x22
2433
2434 // divide by 3
2435
2436 #if defined(__x86_64__)
2437
2438 mov rdx, 0xaaaaaaaaaaaaaaab // = ceil(2/3 2^64)
2439 mul rdx // d' || a' =~ 2/3 a 2^64
2440 shr rdx, 1 // d' = floor(a/3)
2441 mov rax, rdx // a' = floor(a/3)
2442
2443 // we start with 0 <= a < 2^64. write f = ceil(2/3 2^64), so that
2444 // 2/3 < f/2^64 < 2/3 + 1/2^64. then floor(2/3 a) <= floor(a f/2^64)
2445 // <= floor(2/3 a + a/2^64), but a < 2^64 so a/2^64 < 1 and
2446 // floor(a f/2^64) = floor(2/3 a).
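// more precisely, f = (2^65 + 1)/3 exactly, so a f/2^64 =
// 2 a/3 + a/(3 2^64); the extra term is below 1/3 and the
// fractional part of 2 a/3 is at most 2/3, so the floor is
// unaffected.  e.g., a = 10: the high word of 10 f is 6, and
// 6 >> 1 = 3 = floor(10/3).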
2447
2448 #elif defined(__i386__)
2449
2450 mov edx, 0xaaaaaaab // = ceil(2/3 2^32)
2451 mul edx // d' || a' =~ 2/3 a 2^32
2452 shr edx, 1 // d' = floor(a/3)
2453 mov eax, edx // a' = floor(a/3)
2454
2455 #elif defined(__arm__)
2456
2457 ldr r12, =0xaaaaaaab
2458 umull r12, r0, r0, r12
2459 mov r0, r0, lsr #1
2460
2461 #elif defined(__aarch64__)
2462
2463 ldr x16, =0xaaaaaaaaaaaaaaab
2464 umulh x0, x0, x16
2465 lsr x0, x0, #1
2466
2467 #else
2468 notimpl
2469 #endif
2470
2471 ret
2472
2473 endproc
2474
2475 proc x23
2476
2477 #if defined(__x86_64__)
2478
2479 // main loop: shorten a preserving residue class mod 3
2480 0: cmp rax, 5
2481 jbe 8f
2482 // a > 5
2483 mov rdx, rax // d' = a
2484 shr rdx, 2 // d' = floor(a/4)
2485 and rax, 3 // a = 4 d' + a' (0 <= a' < 4)
2486 add rax, rdx // a' == a (mod 3) but a' < a/4 + 4
2487 jmp 0b
2488
2489 // fix up final value 0 <= a < 6: want 0 <= a < 3
2490 //
2491 // the tricky part is actually a = 3; but the other final cases take
2492 // additional iterations which we can avoid.
2493 8: cmp rax, 3 // set cf iff a < 3
2494 cmc // set cf iff a >= 3
2495 sbb rdx, rdx // d' = a >= 3 ? -1 : 0
2496 and rdx, 3 // d' = a >= 3 ? 3 : 0
2497 sub rax, rdx // a' = a - (a >= 3 ? 3 : 0)
2498 // = a (mod 3)
2499
2500 #elif defined(__i386__)
2501
2502 // main loop: shorten a preserving residue class mod 3
2503 0: cmp eax, 5
2504 jbe 8f
2505 // a > 5
2506 mov edx, eax // d' = a
2507 shr edx, 2 // d' = floor(a/4)
2508 and eax, 3 // a = 4 d' + a' (0 <= a' < 4)
2509 add eax, edx // a' == a (mod 3) but a' < a/4 + 4
2510 jmp 0b
2511
2512 // fix up final value 0 <= a < 6: want 0 <= a < 3
2513 //
2514 // the tricky part is actually a = 3; but the other final cases take
2515 // additional iterations which we can avoid.
2516 8: cmp eax, 3 // set cf iff a < 3
2517 cmc // set cf iff a >= 3
2518 sbb edx, edx // d' = a >= 3 ? -1 : 0
2519 and edx, 3 // d' = a >= 3 ? 3 : 0
2520 sub eax, edx // a' = a - (a >= 3 ? 3 : 0)
2521 // = a (mod 3)
2522
2523 #elif defined(__arm__)
2524
2525 0: cmp r0, #6
2526 andhs r12, r0, #3
2527 addhs r0, r12, r0, lsr #2
2528 bhs 0b
2529
2530 cmp r0, #3
2531 subhs r0, r0, #3
2532
2533 #elif defined(__aarch64__)
2534
2535 0: cmp x0, #6
2536 // blunder on through regardless since this doesn't affect the result
2537 and x16, x0, #3
2538 add x0, x16, x0, lsr #2
2539 b.hs 0b
2540
2541 subs x16, x0, #3
2542 cmov.hs x0, x16
2543
2544 #else
2545 notimpl
2546 #endif
2547
2548 ret
2549
2550 endproc
2551
2552 proc x24
2553
2554 // invert (odd) a mod 2^64
2555 //
2556 // suppose a a_i == 1 (mod 2^{2^i})
2557 //
2558 // clearly good for i = 0, since 2^i = 1 and 2^{2^i} = 2, and a_0 =
2559 // a == 1 (mod 2) by assumption
2560 //
2561 // write a a_i == b_i 2^{2^i} + 1 (mod 2^{2^{i+1}})
2562 // then b_i == (a a_i - 1)/2^{2^i} (mod 2^{2^i})
2563 // to lift inverse, we want x such that a x == -b_i (mod 2^{2^i});
2564 // clearly x = -a_i b_i will do, since a a_i == 1 (mod 2^{2^i})
2565 // then:
2566 // a_{i+1} = a_i - a_i b_i 2^{2^i} = a_i (1 - (a a_i - 1))
2567 // = 2 a_i - a a_i^2
2568 //
2569 // check:
2570 // a a_{i+1} = 2 a a_i - a^2 a_i^2
2571 // == 2 a a_i - (b_i 2^{2^i} + 1)^2
2572 // == 2 (b_i 2^{2^i} + 1) -
2573 // (b_i^2 2^{2^{i+1}} + 2 b_i 2^{2^i} + 1)
2574 // == 1 (mod 2^{2^{i+1}})
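// a small worked example, a = 3: a_0 = 3, and 3 a_0 = 9 == 1 (mod 8);
// a_1 = 2 a_0 - a a_0^2 = -21 == 11 (mod 16), and 3 . 11 == 1 (mod 16);
// a_2 = 2 a_1 - a a_1^2 == 171 (mod 256), and 3 . 171 == 1 (mod 256);
// the good bits keep doubling until we reach 3^{-1} ==
// 0xaaaaaaaaaaaaaaab (mod 2^64) -- the same constant as in x22.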
2575
2576 #if defined(__x86_64__)
2577
2578 // rax // a_0 = a
2579 mov rbx, rax // b' = a
2580 mov rsi, rax // t = a_0
2581
2582 0:
2583 cmp rbp, 0
2584 je 1f
2585 stosq
2586 dec rbp
2587 1:
2588 mul rbx // a' = a a_i
2589 mov rcx, rax // c = a a_i
2590
2591 sub rax, 2 // a' = a a_i - 2
2592 neg rax // a' = 2 - a a_i
2593 mul rsi // a_{i+1} = a_i (2 - a a_i)
2594 // = 2 a_i - a a_i^2
2595 mov rsi, rax // t = a_{i+1}
2596
2597 cmp rcx, 1 // done?
2598 ja 0b // no -- iterate
2599
2600 #elif defined(__i386__)
2601
2602 // eax // a_0 = a
2603 mov ebx, eax // b' = a
2604 mov esi, eax // t = a_0
2605
2606 0:
2607 cmp ebp, 0
2608 je 1f
2609 stosd
2610 dec ebp
2611 1:
2612 mul ebx // a' = a a_i
2613 mov ecx, eax // c = a a_i
2614
2615 sub eax, 2 // a' = a a_i - 2
2616 jb 9f // done if < 2
2617 neg eax // a' = 2 - a a_i
2618 mul esi // a_{i+1} = a_i (2 - a a_i)
2619 // = 2 a_i - a a_i^2
2620 mov esi, eax // t = a_{i+1}
2621
2622 jmp 0b // and iterate
2623 9: mov eax, esi // restore
2624
2625 #elif defined(__arm__)
2626
2627 // r0 // a_0 = a
2628 mov r1, r0 // b' = a
2629
2630 0:
2631 cmp r6, #0
2632 strne r0, [r5], #4
2633 subne r6, r6, #1
2634 mul r2, r0, r1 // c = a a_i
2635 rsbs r2, r2, #2 // c = 2 - a a_i
2636 mul r0, r0, r2 // a_{i+1} = a_i (2 - a a_i)
2637 // = 2 a_i - a a_i^2
2638 blo 0b
2639
2640 #elif defined(__aarch64__)
2641
2642 // x0 // a_0 = a
2643 mov x1, x0 // b' = a
2644 mov x16, #2 // because we have no rsb
2645
2646 0:
2647 cmp x6, #0
2648 b.eq 1f
2649 str x0, [x5], #8
2650 sub x6, x6, #1
2651 1:
2652 mul x2, x0, x1 // c = a a_i
2653 subs x2, x16, x2 // c = 2 - a a_i
2654 mul x0, x0, x2 // a_{i+1} = a_i (2 - a a_i)
2655 // = 2 a_i - a a_i^2
2656 b.lo 0b
2657
2658 #else
2659 notimpl
2660 #endif
2661
2662 ret
2663
2664 endproc
2665
2666 proc x25
2667
2668 // a poor approximation to pi/4
2669 //
2670 // think of x and y as being in 16.16 fixed-point format. we sample
2671 // points in the unit square, and determine how many of them are
2672 // within a unit quarter-circle centred at the origin. the area of
2673 // the quarter-circle is pi/4.
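// with 2^32 sample points, the inside count should land close to
// pi/4 . 2^32 =~ 0xc90fdaa2, i.e., a little under 3.4 billion.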
2674
2675 #if defined(__x86_64__)
2676
2677 xor eax, eax // a = 0
2678 mov rcx, 1
2679 shl rcx, 0x20 // c =~ 4 billion
2680
2681 0: movzx rbx, cx // x = low 16 bits of c
2682 imul rbx, rbx // b = x^2
2683
2684 ror rcx, 0x10 // switch halves of c
2685 movzx rdx, cx // y = high 16 bits of c
2686 imul rdx, rdx // d = y^2
2687 rol rcx, 0x10 // switch back
2688
2689 add rbx, rdx // r^2 = x^2 + y^2
2690 shr rbx, 0x20 // r^2 >= 1?
2691 cmp rbx, 1 // set cf iff r^2 >= 1
2692 adc rax, 0 // and add onto accumulator
2693 loop 0b
2694
2695 #elif defined(__i386__)
2696
2697 // this is actually better done in 32 bits. the carry has the wrong
2698 // sense here, so instead deduct one for each point outside the
2699 // quarter-circle rather than adding one for each point inside it.
2700 xor eax, eax
2701 xor ecx, ecx
2702
2703 0: movzx ebx, cx
2704 imul ebx, ebx
2705
2706 ror ecx, 0x10
2707 movzx edx, cx
2708 imul edx, edx
2709 rol ecx, 0x10
2710
2711 add ebx, edx // see?
2712 sbb eax, 0
2713 loop 0b
2714
2715 #elif defined(__arm__)
2716
2717 mov r0, #0
2718 mov r2, #0
2719
2720 0: uxth r1, r2, ror #0
2721 uxth r3, r2, ror #16
2722 mul r1, r1, r1
2723 mul r3, r3, r3
2724 cmn r1, r3 // mlas doesn't set cf usefully
2725 addcc r0, r0, #1
2726 adds r2, r2, #1
2727 bne 0b
2728
2729 #elif defined(__aarch64__)
2730
2731 mov w0, #0
2732 mov w2, #0
2733
2734 0: ubfx w1, w2, #0, #16
2735 ubfx w3, w2, #16, #16
2736 sub w2, w2, #1
2737 mul w1, w1, w1
2738 mul w3, w3, w3
2739 cmn w1, w3
2740 cinc.cc w0, w0
2741 cbnz w2, 0b
2742
2743 #else
2744 notimpl
2745 #endif
2746
2747 ret
2748
2749 endproc
2750
2751 proc x26
2752
2753 // a bad way to rotate a right by 7 places
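//
// (as a C sketch, the `hard way' below is just the usual portable rotate
// idiom for a 64-bit operand -- ror7 is a made-up name for illustration:
//
//	uint64_t ror7(uint64_t a) { return (a >> 7) | (a << 57); }
//
// the 32-bit variants use a left shift of 25 instead.)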
2754
2755 #if defined(__x86_64__)
2756
2757 mov rbx, rax
2758 ror rbx, 7 // better
2759
2760 mov rdx, rax // d' = a
2761 shr rax, 7 // a' = a >> 7
2762 shl rdx, 0x39 // d' = a << 57
2763 or rax, rdx // a' = a >>> 7
2764
2765 #elif defined(__i386__)
2766
2767 mov ebx, eax
2768 ror ebx, 7 // better
2769
2770 mov edx, eax // d' = a
2771 shr eax, 7 // a' = a >> 7
2772 shl edx, 0x19 // d' = a << 25
2773 or eax, edx // a' = a >>> 7
2774
2775 #elif defined(__arm__)
2776
2777 mov r1, r0, ror #7 // easy way
2778
2779 // even the hard way is fairly easy on arm
2780 mov r3, r0, lsl #25
2781 orr r0, r3, r0, lsr #7 // hard way
2782
2783 #elif defined(__aarch64__)
2784
2785 ror x1, x0, #7 // easy way
2786
2787 // even the hard way is fairly easy on arm
2788 lsl x3, x0, #57
2789 orr x0, x3, x0, lsr #7 // hard way
2790
2791 #else
2792 notimpl
2793 #endif
2794
2795 ret
2796
2797 endproc
2798
2799 proc x27
2800
2801 // shift a right by c places, in two halves
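//
// presumably the point is that splitting the count as floor(c/2) +
// ceil(c/2) lets the total shift reach the full operand width, which a
// single x86 shift can't, because the hardware masks the count (to six
// bits for a 64-bit operand); e.g. c = 64 really does clear a here.  a
// hypothetical C sketch of the same trick:
//
//	uint64_t shr2(uint64_t a, unsigned c)	/* 0 <= c <= 126 */
//	  { return (a >> c/2) >> (c + 1)/2; }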
2802
2803 #if defined(__x86_64__)
2804
2805 mov ch, cl // c' = [c, c]
2806 inc ch // c' = [c, c + 1]
2807 shr ch, 1
2808 shr cl, 1 // c' = [floor(c/2), ceil(c/2)]
2809 shr rax, cl
2810 xchg ch, cl
2811 shr rax, cl
2812
2813 #elif defined(__i386__)
2814
2815 mov ch, cl // c' = [c, c]
2816 inc ch // c' = [c, c + 1]
2817 shr ch, 1
2818 shr cl, 1 // c' = [floor(c/2), ceil(c/2)]
2819 shr eax, cl
2820 xchg ch, cl
2821 shr eax, cl
2822
2823 #elif defined(__arm__)
2824
2825 // it would be clearer and more efficient to say: `mov r12, r2, lsr
2826 // #1; sub r2, r2, r12', but that's not the lesson this exercise is
2827 // trying to teach.
2828 add r12, r2, #1
2829 mov r2, r2, lsr #1
2830 mov r12, r12, lsr #1
2831 mov r0, r0, lsr r2
2832 mov r0, r0, lsr r12
2833
2834 #elif defined(__aarch64__)
2835
2836 add w16, w2, #1
2837 lsr w2, w2, #1
2838 lsr w16, w16, #1
2839 lsr x0, x0, x2
2840 lsr x0, x0, x16
2841
2842 #else
2843 notimpl
2844 #endif
2845
2846 ret
2847
2848 endproc
2849
2850 proc x28
2851
2852 #if defined(__x86_64__)
2853
2854 notimpl
2855
2856 #elif defined(__i386__)
2857
2858 notimpl
2859
2860 #elif defined(__arm__)
2861
2862 notimpl
2863
2864 #elif defined(__aarch64__)
2865
2866 notimpl
2867
2868 #else
2869 notimpl
2870 #endif
2871
2872 endproc
2873
2874 proc x29
2875
2876 #if defined(__x86_64__)
2877
2878 notimpl
2879
2880 #elif defined(__i386__)
2881
2882 notimpl
2883
2884 #elif defined(__arm__)
2885
2886 notimpl
2887
2888 #elif defined(__aarch64__)
2889
2890 notimpl
2891
2892 #else
2893 notimpl
2894 #endif
2895
2896 endproc
2897
2898 proc x2a
2899
2900 #if defined(__x86_64__)
2901
2902 notimpl
2903
2904 #elif defined(__i386__)
2905
2906 notimpl
2907
2908 #elif defined(__arm__)
2909
2910 notimpl
2911
2912 #elif defined(__aarch64__)
2913
2914 notimpl
2915
2916 #else
2917 notimpl
2918 #endif
2919
2920 endproc
2921
2922 proc x2b
2923
2924 #if defined(__x86_64__)
2925
2926 notimpl
2927
2928 #elif defined(__i386__)
2929
2930 notimpl
2931
2932 #elif defined(__arm__)
2933
2934 notimpl
2935
2936 #elif defined(__aarch64__)
2937
2938 notimpl
2939
2940 #else
2941 notimpl
2942 #endif
2943
2944 endproc
2945
2946 proc x2c
2947
2948 #if defined(__x86_64__)
2949
2950 notimpl
2951
2952 #elif defined(__i386__)
2953
2954 notimpl
2955
2956 #elif defined(__arm__)
2957
2958 notimpl
2959
2960 #elif defined(__aarch64__)
2961
2962 notimpl
2963
2964 #else
2965 notimpl
2966 #endif
2967
2968 endproc
2969
2970 proc x2d
2971
2972 #if defined(__x86_64__)
2973
2974 notimpl
2975
2976 #elif defined(__i386__)
2977
2978 notimpl
2979
2980 #elif defined(__arm__)
2981
2982 notimpl
2983
2984 #elif defined(__aarch64__)
2985
2986 notimpl
2987
2988 #else
2989 notimpl
2990 #endif
2991
2992 endproc
2993
2994 proc x2e
2995
2996 #if defined(__x86_64__)
2997
2998 notimpl
2999
3000 #elif defined(__i386__)
3001
3002 notimpl
3003
3004 #elif defined(__arm__)
3005
3006 notimpl
3007
3008 #elif defined(__aarch64__)
3009
3010 notimpl
3011
3012 #else
3013 notimpl
3014 #endif
3015
3016 endproc
3017
3018 proc x2f
3019
3020 #if defined(__x86_64__)
3021
3022 notimpl
3023
3024 #elif defined(__i386__)
3025
3026 notimpl
3027
3028 #elif defined(__arm__)
3029
3030 notimpl
3031
3032 #elif defined(__aarch64__)
3033
3034 notimpl
3035
3036 #else
3037 notimpl
3038 #endif
3039
3040 endproc
3041
3042 ///--------------------------------------------------------------------------
3043 /// 0x30--0x3f
3044
3045 proc x30
3046
3047 #if defined(__x86_64__)
3048
3049 notimpl
3050
3051 #elif defined(__i386__)
3052
3053 notimpl
3054
3055 #elif defined(__arm__)
3056
3057 notimpl
3058
3059 #elif defined(__aarch64__)
3060
3061 notimpl
3062
3063 #else
3064 notimpl
3065 #endif
3066
3067 ret
3068
3069 endproc
3070
3071 proc x31
3072
3073 #if defined(__x86_64__)
3074
3075 notimpl
3076
3077 #elif defined(__i386__)
3078
3079 notimpl
3080
3081 #elif defined(__arm__)
3082
3083 notimpl
3084
3085 #elif defined(__aarch64__)
3086
3087 notimpl
3088
3089 #else
3090 notimpl
3091 #endif
3092
3093 endproc
3094
3095 proc x32
3096
3097 #if defined(__x86_64__)
3098
3099 notimpl
3100
3101 #elif defined(__i386__)
3102
3103 notimpl
3104
3105 #elif defined(__arm__)
3106
3107 notimpl
3108
3109 #elif defined(__aarch64__)
3110
3111 notimpl
3112
3113 #else
3114 notimpl
3115 #endif
3116
3117 endproc
3118
3119 proc x33
3120
3121 #if defined(__x86_64__)
3122
3123 notimpl
3124
3125 #elif defined(__i386__)
3126
3127 notimpl
3128
3129 #elif defined(__arm__)
3130
3131 notimpl
3132
3133 #elif defined(__aarch64__)
3134
3135 notimpl
3136
3137 #else
3138 notimpl
3139 #endif
3140
3141 endproc
3142
3143 proc x34
3144
3145 #if defined(__x86_64__)
3146
3147 notimpl
3148
3149 #elif defined(__i386__)
3150
3151 notimpl
3152
3153 #elif defined(__arm__)
3154
3155 notimpl
3156
3157 #elif defined(__aarch64__)
3158
3159 notimpl
3160
3161 #else
3162 notimpl
3163 #endif
3164
3165 endproc
3166
3167 proc x35
3168
3169 #if defined(__x86_64__)
3170
3171 notimpl
3172
3173 #elif defined(__i386__)
3174
3175 notimpl
3176
3177 #elif defined(__arm__)
3178
3179 notimpl
3180
3181 #elif defined(__aarch64__)
3182
3183 notimpl
3184
3185 #else
3186 notimpl
3187 #endif
3188
3189 endproc
3190
3191 proc x36
3192
3193 #if defined(__x86_64__)
3194
3195 notimpl
3196
3197 #elif defined(__i386__)
3198
3199 notimpl
3200
3201 #elif defined(__arm__)
3202
3203 notimpl
3204
3205 #elif defined(__aarch64__)
3206
3207 notimpl
3208
3209 #else
3210 notimpl
3211 #endif
3212
3213 endproc
3214
3215 proc x37
3216
3217 #if defined(__x86_64__)
3218
3219 notimpl
3220
3221 #elif defined(__i386__)
3222
3223 notimpl
3224
3225 #elif defined(__arm__)
3226
3227 notimpl
3228
3229 #elif defined(__aarch64__)
3230
3231 notimpl
3232
3233 #else
3234 notimpl
3235 #endif
3236
3237 endproc
3238
3239 proc x38
3240
3241 #if defined(__x86_64__)
3242
3243 notimpl
3244
3245 #elif defined(__i386__)
3246
3247 notimpl
3248
3249 #elif defined(__arm__)
3250
3251 notimpl
3252
3253 #elif defined(__aarch64__)
3254
3255 notimpl
3256
3257 #else
3258 notimpl
3259 #endif
3260
3261 endproc
3262
3263 proc x39
3264
3265 #if defined(__x86_64__)
3266
3267 notimpl
3268
3269 #elif defined(__i386__)
3270
3271 notimpl
3272
3273 #elif defined(__arm__)
3274
3275 notimpl
3276
3277 #elif defined(__aarch64__)
3278
3279 notimpl
3280
3281 #else
3282 notimpl
3283 #endif
3284
3285 endproc
3286
3287 proc x3a
3288
3289 #if defined(__x86_64__)
3290
3291 notimpl
3292
3293 #elif defined(__i386__)
3294
3295 notimpl
3296
3297 #elif defined(__arm__)
3298
3299 notimpl
3300
3301 #elif defined(__aarch64__)
3302
3303 notimpl
3304
3305 #else
3306 notimpl
3307 #endif
3308
3309 endproc
3310
3311 proc x3b
3312
3313 #if defined(__x86_64__)
3314
3315 notimpl
3316
3317 #elif defined(__i386__)
3318
3319 notimpl
3320
3321 #elif defined(__arm__)
3322
3323 notimpl
3324
3325 #elif defined(__aarch64__)
3326
3327 notimpl
3328
3329 #else
3330 notimpl
3331 #endif
3332
3333 endproc
3334
3335 proc x3c
3336
3337 #if defined(__x86_64__)
3338
3339 notimpl
3340
3341 #elif defined(__i386__)
3342
3343 notimpl
3344
3345 #elif defined(__arm__)
3346
3347 notimpl
3348
3349 #elif defined(__aarch64__)
3350
3351 notimpl
3352
3353 #else
3354 notimpl
3355 #endif
3356
3357 endproc
3358
3359 proc x3d
3360
3361 #if defined(__x86_64__)
3362
3363 notimpl
3364
3365 #elif defined(__i386__)
3366
3367 notimpl
3368
3369 #elif defined(__arm__)
3370
3371 notimpl
3372
3373 #elif defined(__aarch64__)
3374
3375 notimpl
3376
3377 #else
3378 notimpl
3379 #endif
3380
3381 endproc
3382
3383 proc x3e
3384
3385 #if defined(__x86_64__)
3386
3387 notimpl
3388
3389 #elif defined(__i386__)
3390
3391 notimpl
3392
3393 #elif defined(__arm__)
3394
3395 notimpl
3396
3397 #elif defined(__aarch64__)
3398
3399 notimpl
3400
3401 #else
3402 notimpl
3403 #endif
3404
3405 endproc
3406
3407 proc x3f
3408
3409 #if defined(__x86_64__)
3410
3411 notimpl
3412
3413 #elif defined(__i386__)
3414
3415 notimpl
3416
3417 #elif defined(__arm__)
3418
3419 notimpl
3420
3421 #elif defined(__aarch64__)
3422
3423 notimpl
3424
3425 #else
3426 notimpl
3427 #endif
3428
3429 endproc
3430
3431 ///----- That's all, folks --------------------------------------------------