xchg.S: More exercises.
1 /// -*- mode: asm; asm-comment-char: 0 -*-
2
3 ///--------------------------------------------------------------------------
4 /// Preliminaries.
5
6 #include <sys/syscall.h>
7
8 #if defined(__i386__) || defined(__x86_64__)
9
10 .intel_syntax noprefix
11
12 #elif defined(__arm__)
13
14 .macro ret
15 bx r14
16 .endm
17
18 .arch armv7-a
19 .fpu neon
20
21 #elif defined(__aarch64__)
22
23 .macro cmov rd, rn, cc
24 csel \rd, \rn, \rd, \cc
25 .endm
26 #define _COND(_) \
27 _(eq) _(ne) _(cs) _(cc) _(vs) _(vc) _(mi) _(pl) \
28 _(ge) _(lt) _(gt) _(le) _(hi) _(ls) _(al) _(nv) \
29 _(hs) _(lo)
30 #define _INST(_) \
31 _(ccmp) _(ccmn) \
32 _(csel) _(cmov) \
33 _(csinc) _(cinc) _(cset) \
34 _(csneg) _(cneg) \
35 _(csinv) _(cinv) _(csetm)
36 #define _CONDVAR(cc) _definstvar cc;
37 #define _INSTVARS(inst) \
38 .macro _definstvar cc; \
39 .macro inst.\cc args:vararg; inst \args, \cc; .endm; \
40 .endm; \
41 _COND(_CONDVAR); \
42 .purgem _definstvar;
43 _INST(_INSTVARS)
44 #undef _COND
45 #undef _INST
46 #undef _CONDVAR
47 #undef _INSTVARS
48
49 #define CCMP_N 8
50 #define CCMP_Z 4
51 #define CCMP_C 2
52 #define CCMP_V 1
53
54 #define CCMP_MI CCMP_N
55 #define CCMP_PL 0
56 #define CCMP_EQ CCMP_Z
57 #define CCMP_NE 0
58 #define CCMP_CS CCMP_C
59 #define CCMP_HS CCMP_C
60 #define CCMP_CC 0
61 #define CCMP_LO 0
62 #define CCMP_VS CCMP_V
63 #define CCMP_VC 0
64 #define CCMP_HI CCMP_C
65 #define CCMP_LS 0
66 #define CCMP_LT CCMP_N
67 #define CCMP_GE 0
68 #define CCMP_LE CCMP_N
69 #define CCMP_GT 0
70
71 #else
72 # error "not supported"
73 #endif
74
75 .macro proc name
76 .globl \name
77 .type \name, STT_FUNC
78 .p2align 4
79 \name\():
80 .macro endproc
81 .size \name, . - \name
82 .purgem endproc
83 .endm
84 .endm
85
86 .macro ch c
87 #if defined(__i386__)
88
89 pushf
90 push eax
91 push ebx
92 push ecx
93 push edx
94 push ebp
95 mov ebp, esp
96 and esp, -16
97
98 push \c
99 call putchar@plt
100
101 call get_pc_ebx
102 add ebx, offset _GLOBAL_OFFSET_TABLE_
103 mov eax, [ebx + stdout@GOT]
104 mov eax, [eax]
105 call fflush@plt
106
107 mov esp, ebp
108 pop ebp
109 pop edx
110 pop ecx
111 pop ebx
112 pop eax
113 popf
114
115 #elif defined(__x86_64__)
116
117 pushf
118 push rax
119 push rcx
120 push rdx
121 push rsi
122 push rdi
123 push r8
124 push r9
125 push rbp
126 mov rbp, rsp
127 and rsp, -16
128
129 mov rdi, \c
130 call putchar@plt
131
132 mov rdi, [rip + stdout]
133 call fflush@plt
134
135 mov rsp, rbp
136 pop rbp
137 pop r9
138 pop r8
139 pop rdi
140 pop rsi
141 pop rdx
142 pop rcx
143 pop rax
144 popf
145
146 #elif defined(__arm__)
147
148 stmfd r13!, {r0-r4, r12, r14}
149
150 mov r4, r13
151 bic r14, r4, #15
152 mov r13, r14
153
154 mov r0, #\c
155 bl putchar@plt
156
157 ldr r14, .L$_c$gotoff$\@
158 .L$_c$gotpc$\@:
159 add r14, pc, r14
160 b .L$_c$cont$\@
161 .L$_c$gotoff$\@:
162 .word _GLOBAL_OFFSET_TABLE_ - .L$_c$gotpc$\@ - 8
163 .L$_c$cont$\@:
164 bl fflush@plt
165
166 mov r13, r4
167 ldmfd r13!, {r0-r4, r12, r14}
168
169 #elif defined(__aarch64__)
170
171 sub sp, sp, #20*8
172 stp x0, x1, [sp, #0]
173 stp x2, x3, [sp, #16]
174 stp x4, x5, [sp, #32]
175 stp x6, x7, [sp, #48]
176 stp x8, x9, [sp, #64]
177 stp x10, x11, [sp, #80]
178 stp x12, x13, [sp, #96]
179 stp x14, x15, [sp, #112]
180 stp x16, x17, [sp, #128]
181 mrs x16, nzcv
182 stp x16, x30, [sp, #144]
183
184 mov w0, #\c
185 bl putchar
186 adrp x0, :got:stdout
187 ldr x0, [x0, #:got_lo12:stdout]
188 ldr x0, [x0]
189 bl fflush
190
191 ldp x16, x30, [sp, #144]
192 msr nzcv, x16
193 ldp x16, x17, [sp, #128]
194 ldp x14, x15, [sp, #112]
195 ldp x12, x13, [sp, #96]
196 ldp x10, x11, [sp, #80]
197 ldp x8, x9, [sp, #64]
198 ldp x6, x7, [sp, #48]
199 ldp x4, x5, [sp, #32]
200 ldp x2, x3, [sp, #16]
201 ldp x0, x1, [sp, #0]
202 add sp, sp, #20*8
203
204 #else
205 # error "not supported"
206 #endif
207 .endm
208
209 .macro notimpl
210 #if defined(__i386__) || defined(__x86_64__)
211 ud2
212 #elif defined(__arm__)
213 udf
214 #elif defined(__aarch64__)
215 hlt #0
216 #else
217 # error "not supported"
218 #endif
219 .endm
220
221 .section .note.GNU-stack, "", %progbits
222
223 .text
224
225 #if defined(__i386__)
226 get_pc_ebx:
227 mov ebx, [esp]
228 ret
229 #endif
230
231
232 proc call_example
233
234 #if defined(__i386__)
235
236 push ebx // ebx
237 push esi // esi, ebx
238 push edi // edi, esi, ebx
239 push ebp // flags, ebp, ..., ebx
240 pushf
241
242 mov edi, [esp + 4*6]
243 mov esi, [esp + 4*7]
244 push esi // regs, flags, ebp, ..., ebx
245
246 call get_pc_ebx
247 lea eax, [ebx + 9f - .]
248 push eax // cont, regs, flags, ebp, ..., ebx
249 push edi // func, cont, regs, flags, ebp, ..., ebx
250
251 mov eax, [esi + 28]
252 pushf
253 pop ecx
254 and eax, 0x0cd5
255 and ecx, ~0x0cd5
256 or eax, ecx
257 push eax
258 popf
259 mov eax, [esi + 0]
260 mov ebx, [esi + 4]
261 mov ecx, [esi + 8]
262 mov edx, [esi + 12]
263 mov edi, [esi + 20]
264 mov ebp, [esi + 24]
265 mov esi, [esi + 16]
266
267 ret // -> func; regs, flags, ebp, ..., ebx
268
269 9: pushf // eflags, regs, flags, ebp, ..., ebx
270 push esi // esi, eflags, regs, flags, ebp, ..., ebx
271 mov esi, [esp + 8]
272 mov [esi + 0], eax
273 mov [esi + 4], ebx
274 mov [esi + 8], ecx
275 mov [esi + 12], edx
276 mov [esi + 20], edi
277 mov [esi + 24], ebp
278 pop eax // rflags, regs, flags, ebp, ..., ebx
279 mov [esi + 16], eax
280 pop eax // regs, flags, ebp, ..., ebx
281 mov [esi + 28], eax
282
283 add esp, 4 // flags, ebp, ..., ebx
284 popf // ebp, ..., ebx
285 pop ebp // ..., ebx
286 pop edi
287 pop esi
288 pop ebx //
289 ret
290
291 #elif defined(__x86_64__)
292
293 push rbx // rbx
294 push r10
295 push r11
296 push r12
297 push r13
298 push r14
299 push r15
300 push rbp // flags, rbp, ..., rbx
301 pushf
302
303 push rsi // regs, flags, rbp, ..., rbx
304
305 lea rax, [rip + 9f]
306 push rax // cont, regs, flags, rbp, ..., rbx
307 push rdi // func, cont, regs, flags, rbp, ..., rbx
308
309 mov rax, [rsi + 8*15]
310 pushf
311 pop rcx
312 and rax, 0x0cd5
313 and rcx, ~0x0cd5
314 or rax, rcx
315 push rax
316 popf
317 mov rax, [rsi + 0]
318 mov rbx, [rsi + 8]
319 mov rcx, [rsi + 16]
320 mov rdx, [rsi + 24]
321 mov rdi, [rsi + 40]
322 mov rbp, [rsi + 48]
323 mov r8, [rsi + 56]
324 mov r9, [rsi + 64]
325 mov r10, [rsi + 72]
326 mov r11, [rsi + 80]
327 mov r12, [rsi + 88]
328 mov r13, [rsi + 96]
329 mov r14, [rsi + 104]
330 mov r15, [rsi + 112]
331 mov rsi, [rsi + 32]
332
333 ret // -> func; regs, flags, rbp, ..., rbx
334
335 9: pushf // rflags, regs, flags, rbp, ..., rbx
336 push rsi // rsi, rflags, regs, flags, rbp, ..., rbx
337 mov rsi, [rsp + 16]
338 mov [rsi + 0], rax
339 mov [rsi + 8], rbx
340 mov [rsi + 16], rcx
341 mov [rsi + 24], rdx
342 mov [rsi + 40], rdi
343 mov [rsi + 48], rbp
344 mov [rsi + 56], r8
345 mov [rsi + 64], r9
346 mov [rsi + 72], r10
347 mov [rsi + 80], r11
348 mov [rsi + 88], r12
349 mov [rsi + 96], r13
350 mov [rsi + 104], r14
351 mov [rsi + 112], r15
352 pop rax // rflags, regs, flags, rbp, ..., rbx
353 mov [rsi + 32], rax
354 pop rax // regs, flags, rbp, ..., rbx
355 mov [rsi + 120], rax
356
357 add rsp, 8 // flags, rbp, ..., rbx
358 popf // rbp, ..., rbx
359 pop rbp // ..., rbx
360 pop r15
361 pop r14
362 pop r13
363 pop r12
364 pop r11
365 pop r10
366 pop rbx //
367 ret
368
369 #elif defined(__arm__)
370
371 stmfd r13!, {r0, r1, r4-r11, r14}
372 ldmia r1, {r0-r12, r14}
373 msr cpsr, r14
374 mov r14, pc
375 ldr pc, [r13], #4
376 ldr r14, [r13], #4
377 stmia r14!, {r0-r12}
378 mrs r0, cpsr
379 str r0, [r14]
380 ldmfd r13!, {r4-r11, pc}
381
382 #elif defined(__aarch64__)
383
384 stp x29, x30, [sp, #-14*8]!
385 mov x29, sp
386 stp x19, x20, [sp, #16]
387 stp x21, x22, [sp, #32]
388 stp x23, x24, [sp, #48]
389 stp x25, x26, [sp, #64]
390 stp x27, x28, [sp, #80]
391 str x1, [sp, #104]
392
393 ldp x29, x30, [x1, #224]
394 msr nzcv, x30
395 mov x30, x0
396 ldp x27, x28, [x1, #208]
397 ldp x25, x26, [x1, #192]
398 ldp x23, x24, [x1, #176]
399 ldp x21, x22, [x1, #160]
400 ldp x19, x20, [x1, #144]
401 ldp x16, x17, [x1, #128]
402 ldp x14, x15, [x1, #112]
403 ldp x12, x13, [x1, #96]
404 ldp x10, x11, [x1, #80]
405 ldp x8, x9, [x1, #64]
406 ldp x6, x7, [x1, #48]
407 ldp x4, x5, [x1, #32]
408 ldp x2, x3, [x1, #16]
409 ldp x0, x1, [x1, #0]
410
411 blr x30
412
413 ldr x30, [sp, #104]
414 stp x27, x28, [x30, #208]
415 stp x25, x26, [x30, #192]
416 stp x23, x24, [x30, #176]
417 stp x21, x22, [x30, #160]
418 stp x19, x20, [x30, #144]
419 stp x16, x17, [x30, #128]
420 stp x14, x15, [x30, #112]
421 stp x12, x13, [x30, #96]
422 stp x10, x11, [x30, #80]
423 stp x8, x9, [x30, #64]
424 stp x6, x7, [x30, #48]
425 stp x4, x5, [x30, #32]
426 stp x2, x3, [x30, #16]
427 stp x0, x1, [x30, #0]
428 mov x0, x30
429 mrs x30, nzcv
430 stp x29, x30, [x0, #224]
431
432 ldp x19, x20, [sp, #16]
433 ldp x21, x22, [sp, #32]
434 ldp x23, x24, [sp, #48]
435 ldp x25, x26, [sp, #64]
436 ldp x27, x28, [sp, #80]
437 ldp x29, x30, [sp], #14*8
438
439 ret
440
441 #else
442 # error "not supported"
443 #endif
444
445 endproc
446
447 proc nop
448
449 ret
450
451 endproc
452
453 ///--------------------------------------------------------------------------
454 /// 0x00--0x0f
455
456 proc x00
457
458 // clear all 64 bits of extended traditional registers
459
460 #if defined(__x86_64__)
461
462 xor eax, eax // clear rax
463 lea rbx, [0] // rbx -> _|_
464 loop . // iterate, decrement rcx until zero
465 mov rdx, 0 // set rdx = 0
466 and esi, 0 // clear all bits of rsi
467 sub edi, edi // set rdi = edi - edi = 0
468 push 0
469 pop rbp // pop 0 into rbp
470
471 #elif defined(__i386__)
472
473 xor eax, eax
474 lea ebx, [0]
475 loop .
476 mov edx, 0
477 and esi, 0
478 sub edi, edi
479 push 0
480 pop ebp
481
482 #elif defined(__arm__)
483
484 eor r0, r0, r0
485 rsb r1, r1, r1
486 0: subs r2, r2, #1
487 bne 0b
488 mov r3, #0
489 and r4, r4, #0
490 sub r5, r5, r5
491
492 #elif defined(__aarch64__)
493
494 eor w0, w0, w0
495 mov w1, wzr
496 0: sub w2, w2, #1
497 cbnz w2, 0b
498 mov w3, #0
499 and w4, w4, wzr
500 sub w5, w5, w5
501
502 #else
503 notimpl
504 #endif
505
506 ret
507
508 endproc
509
510 proc x01
511
512 // advance a fibonacci pair by c steps
513 //
514 // on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1}
515 // and f_{i+c}, where f_{i+1} = f_i + f_{i-1}
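	// the same stepping rule as a c sketch, for reference (not part of
	// the exercise; the variable names just mirror the registers):
	//
	//	for (; c; c--) {		/* xadd a, d */
	//		uint64_t t = a;
	//		a += d;
	//		d = t;
	//	}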
516
517 #if defined(__x86_64__)
518
519 0: xadd rax, rdx // a, d = a + d, a
520 // = f_{i+1} + f_i, f_{i+1}
521 // = f_{i+2}, f_{i+1}
522 loop 0b // advance i, decrement c, iterate
523
524 #elif defined(__i386__)
525
526 0: xadd eax, edx
527 loop 0b
528
529 #elif defined(__arm__)
530
531 0: subs r2, r2, #2
532 add r3, r3, r0
533 blo 8f
534 add r0, r0, r3
535 bhi 0b
536
537 8: movne r0, r3
538
539 #elif defined(__aarch64__)
540
541 0: subs x2, x2, #2
542 add x3, x3, x0
543 b.lo 8f
544 add x0, x0, x3
545 b.hi 0b
546
547 8: cmov.ne x0, x3
548
549 #else
550 notimpl
551 #endif
552
553 ret
554
555 endproc
556
557 proc x02
558
559 // boolean canonify a: if a = 0 on entry, leave it zero; otherwise
560 // set a = 1
561
562 #if defined(__x86_64__)
563
564 neg rax // set cf iff a /= 0
565 sbb rax, rax // a = a - a - cf = -cf
566 neg rax // a = cf
567
568 #elif defined(__i386__)
569
570 neg eax
571 sbb eax, eax
572 neg eax
573
574 #elif defined(__arm__)
575
576 movs r1, r0 // the easy way
577 movne r1, #1 // mvnne r1, #1 for mask
578
579 cmp r0, #1 // clear cf iff a == 0
580 sbc r2, r0, r0 // c' = a - a - 1 + cf = cf - 1
581 add r2, r2, #1 // c' = cf
582
583 sub r3, r0, r0, lsr #1 // d' top bit clear; d' = 0 iff a = 0
584 rsb r3, r3, #0 // d' top bit set iff a /= 0
585 mov r3, r3, lsr #31 // asr for mask
586
587 rsbs r0, r0, #0
588 sbc r0, r0, r0
589 rsb r0, r0, #0
590
591 #elif defined(__aarch64__)
592
593 cmp x0, #0 // trivial
594 cset.ne x1 // csetm for mask
595
596 cmp xzr, x0 // set cf iff a == 0
597 sbc x2, x0, x0 // c' = a - a - 1 + cf = cf - 1
598 neg x2, x2 // c' = 1 - cf
599
600 sub x3, x0, x0, lsr #1 // if a < 2^63 then d' = ceil(a/2) <
601 // 2^63
602 // if a >= 2^63, write a = 2^63 + t
603 // with t < 2^63; d' = 2^63 - 2^62 +
604 // ceil(t/2) = 2^62 + ceil(t/2), and
605 // ceil(t/2) < 2^62
606 // anyway d' < 2^63 and d' = 0 iff
607 // a = 0
608 neg x3, x3 // d' top bit set iff a /= 0
609 lsr x3, x3, #63 // asr for mask
610
611 cmp x0, #1 // set cf iff a /= 0
612 adc x0, xzr, xzr // a' = 0 + 0 + cf = cf
613
614 #else
615 notimpl
616 #endif
617
618 ret
619
620 endproc
621
622 proc x03
623
624 // set a = min(a, d) (unsigned); clobber c, d
625
626 #if defined(__x86_64__)
627
628 sub rdx, rax // d' = d - a; set cf if a > d
629 sbb rcx, rcx // c = -cf = -[a > d]
630 and rcx, rdx // c = a > d ? d - a : 0
631 add rax, rcx // a' = a > d ? d : a
632
633 #elif defined(__i386__)
634
635 sub edx, eax
636 sbb ecx, ecx
637 and ecx, edx
638 add eax, ecx
639
640 #elif defined(__arm__)
641
642 cmp r0, r3 // the easy way
643 movlo r1, r0 // only needed for out-of-place
644 movhs r1, r3
645
646 subs r3, r3, r0
647 sbc r12, r12, r12
648 and r12, r12, r3
649 add r0, r0, r12
650
651 #elif defined(__aarch64__)
652
653 cmp x0, x3 // the easy way
654 csel.lo x1, x0, x3
655
656 subs x3, x3, x0 // d' = d - a; set cf if d >= a
657 sbc x16, xzr, xzr // t = -1 + cf = -[a > d]
658 and x16, x16, x3 // t = a > d ? d - a : 0
659 add x0, x0, x16 // a' = a > d ? d : a
660
661 #else
662 notimpl
663 #endif
664
665 ret
666
667 endproc
668
669 proc x04
670
671 // switch case?
672
673 #if defined(__x86_64__)
674
675 // unrelated playing
676 mov ecx, eax
677 mov rbx, -1
678 mov edx, ecx
679 sub edx, '0'
680 cmp edx, 10
681 cmovb rbx, rdx
682 or ecx, 0x20
683 mov edx, ecx
684 sub edx, 'a'
685 sub ecx, 'a' - 10
686 cmp edx, 6
687 cmovb rbx, rcx
688
689 xor al, 0x20
690
691 #elif defined(__i386__)
692
693 // unrelated playing
694 mov ecx, eax
695 mov ebx, -1
696 mov edx, ecx
697 sub edx, '0'
698 cmp edx, 10
699 cmovb ebx, edx
700 or ecx, 0x20
701 mov edx, ecx
702 sub edx, 'a'
703 sub ecx, 'a' - 10
704 cmp edx, 6
705 cmovb ebx, ecx
706
707 xor al, 0x20
708
709 #elif defined(__arm__)
710
711 // unrelated playing
712 mvn r1, #0
713 sub r12, r0, #'0'
714 cmp r12, #10
715 movlo r1, r12
716 orr r12, r0, #0x20
717 sub r12, r12, #'a'
718 cmp r12, #6
719 addlo r1, r12, #10
720
721 eor r0, r0, #0x20
722
723 #elif defined(__aarch64__)
724
725 // unrelated playing
726 mov x1, #-1
727 sub w16, w0, #'0'
728 cmp w16, #10
729 cmov.lo x1, x16
730 orr w16, w0, #0x20
731 sub w16, w16, #'a' - 10
732 cmp w16, #10
733 ccmp.hs w16, #16, #CCMP_HS
734 cmov.lo x1, x16
735
736 eor w0, w0, #0x20
737
738 #else
739 notimpl
740 #endif
741
742 ret
743
744 endproc
745
746 proc x05
747
748 // answer whether 5 <= a </<= 9.
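	// in c this is the usual unsigned-wraparound range check (a sketch,
	// not part of the exercise):
	//
	//	int lt = (uint64_t)(a - 5) < 4;		/* 5 <= a < 9 */
	//	int le = (uint64_t)(a - 5) <= 4;	/* 5 <= a <= 9 */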
749
750 #if defined(__x86_64__)
751
752 sub rax, 5 // a' = a - 5
753 cmp rax, 4 // is a' - 5 </<= 4?
754
755 // cc a' a
756 //
757 // z/e a' = 4 a = 9
758 // nz/ne a' /= 4 a /= 9
759 //
760 // a/nbe a' > 4 a > 9 or a < 5
761 // nc/ae/nb a' >= 4 a >= 9 or a < 5
762 // c/b/nae a' < 4 5 <= a < 9
763 // be/na a' <= 4 5 <= a <= 9
764 //
765 // o a' < -2^63 + 4 -2^63 + 5 <= a < -2^63 + 9
766 // no a' >= -2^63 + 4 a >= -2^63 + 9 or
767 // a < -2^63 + 5
768 // s -2^63 + 4 <= a' < 4 -2^63 + 9 <= a < 9
769 // ns a' < -2^63 + 4 or a < -2^63 + 9 or a >= 9
770 // a' >= 4
771 // ge/nl a' >= 4 a >= 9 or a < -2^63 + 5
772 // l/nge a' < 4 -2^63 + 5 <= a < 9
773 // g/nle a' > 4 a > 9 or a < -2^63 + 5
774 // le/ng a' <= 4 -2^63 + 5 <= a <= 9
775
776 #elif defined(__i386__)
777
778 sub eax, 5
779 cmp eax, 4
780
781 #elif defined(__arm__)
782
783 // i dimly remember having a slick way to do this way back in the
784 // day, but i can't figure it out any more.
785 sub r0, #5
786 cmp r0, #4
787
788 #elif defined(__aarch64__)
789
790 // literal translation is too obvious
791 cmp x0, #5
792 ccmp.hs x0, #9, #CCMP_HS
793
794 #else
795 notimpl
796 #endif
797
798 ret
799
800 endproc
801
802 proc x06
803
804 // leave a unchanged, but set zf if a = 0, cf if a /= 0, clear of,
805 // set sf to msb(a)
806
807 #if defined(__x86_64__)
808
809 not rax // a' = -a - 1
810 inc rax // a' = -a
811 neg rax // a' = a
812
813 #elif defined(__i386__)
814
815 not eax
816 inc eax
817 neg eax
818
819 #elif defined(__arm__)
820
821 mvn r0, r0
822 add r0, r0, #1
823 rsbs r0, r0, #0 // cf has opposite sense
824
825 #elif defined(__aarch64__)
826
827 mvn x0, x0
828 add x0, x0, #1
829 negs x0, x0 // cf has opposite sense
830
831 #else
832 notimpl
833 #endif
834
835 ret
836
837 endproc
838
839 proc x07
840
841 // same as before (?)
842
843 #if defined(__x86_64__)
844
845 inc rax // a' = a + 1
846 neg rax // a' = -a - 1
847 inc rax // a' = -a
848 neg rax // a' = a
849
850 #elif defined(__i386__)
851
852 inc eax
853 neg eax
854 inc eax
855 neg eax
856
857 #elif defined(__arm__)
858
859 add r0, r0, #1
860 rsb r0, r0, #0
861 add r0, r0, #1
862 rsbs r0, r0, #0
863
864 #elif defined(__aarch64__)
865
866 add x0, x0, #1
867 neg x0, x0
868 add x0, x0, #1
869 negs x0, x0 // cf has opposite sense
870
871 #else
872 notimpl
873 #endif
874
875 ret
876
877 endproc
878
879 proc x08
880
881 // floor((a + d)/2), correctly handling overflow conditions; final cf
882 // is lsb(a + d), probably uninteresting
883
884 #if defined(__x86_64__)
885
886 add rax, rdx // cf || a' = a + d
887 rcr rax, 1 // shift 65-bit result right by one
888 // place; lsb moves into carry
889
890 #elif defined(__i386__)
891
892 add eax, edx
893 rcr eax, 1
894
895 #elif defined(__arm__)
896
897 // like the two-instruction a64 version
898 sub r1, r3, r0
899 add r1, r0, r1, lsr #1
900
901 // the slick version, similar to the above
902 adds r0, r0, r3
903 mov r0, r0, rrx
904
905 #elif defined(__aarch64__)
906
907 // a64 lacks a32's rrx. literal translation.
908 adds x1, x0, x3 // cf || a' = a + d
909 adc x16, xzr, xzr // realize cf in extra register
910 extr x1, x16, x1, #1 // shift down one place
911
912 // two instruction version: clobbers additional register. (if you
913 // wanted the answer in any other register, even overwriting d, then
914 // this is unnecessary.) also depends on d >= a.
915 sub x16, x3, x0 // compute difference
916 add x0, x0, x16, lsr #1 // add half of it (rounded down)
917
918 #else
919 notimpl
920 #endif
921
922 ret
923
924 endproc
925
926 proc x09
927
928 // a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3
929 // (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8).
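	// in c, avoiding the overflow risked by the more obvious (a + 4)/8
	// (a sketch):
	//
	//	uint64_t r = (a >> 3) + ((a >> 2) & 1);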
930
931 #if defined(__x86_64__)
932
933 shr rax, 3 // a' = floor(a/8); cf = 1 if a ==
934 // 4, 5, 6, 7 (mod 8)
935 adc rax, 0 // a' = floor(a/8) + cf
936
937 #elif defined(__i386__)
938
939 shr eax, 3
940 adc eax, 0
941
942 #elif defined(__arm__)
943
944 movs r0, r0, lsr #3
945 adc r0, r0, #0
946
947 #elif defined(__aarch64__)
948
949 tst x0, #4
950 orr x0, xzr, x0, lsr #3
951 cinc.ne x0, x0
952
953 #else
954 notimpl
955 #endif
956
957 ret
958
959 endproc
960
961 proc x0a
962
963 // increment c-byte little-endian bignum at rdi
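	// byte by byte in c (a sketch; p and n are made-up names for the
	// buffer and its length):
	//
	//	unsigned carry = 1;		/* the increment itself */
	//	for (size_t i = 0; i < n; i++) {
	//		carry += p[i];
	//		p[i] = (uint8_t)carry;
	//		carry >>= 8;
	//	}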
964
965 #if defined(__x86_64__)
966
967 add byte ptr [rdi], 1
968 0: inc rdi
969 adc byte ptr [rdi], 0
970 loop 0b
971
972 #elif defined(__i386__)
973
974 add byte ptr [edi], 1
975 0: inc edi
976 adc byte ptr [edi], 0
977 loop 0b
978
979 #elif defined(__arm__)
980
981 mov r12, #256 // set initial carry
982 0: ldrb r0, [r5]
983 subs r2, r2, #1
984 add r12, r0, r12, lsr #8
985 strb r12, [r5], #1
986 bne 0b
987
988 #elif defined(__aarch64__)
989
990 mov w17, #256 // set initial carry
991 0: ldrb w16, [x5]
992 sub x2, x2, #1
993 add w17, w16, w17, lsr #8
994 strb w17, [x5], #1
995 cbnz x2, 0b
996
997 #else
998 notimpl
999 #endif
1000
1001 ret
1002
1003 endproc
1004
1005 proc x0b
1006
1007 // negate double-precision d:a
1008
1009 #if defined(__x86_64__)
1010
1011 not rdx // d' = -d - 1
1012 neg rax // a' = -a;
1013 // cf = 1 iff a /= 0
1014 sbb rdx, -1 // d' = -d - cf
1015
1016 #elif defined(__i386__)
1017
1018 not edx
1019 neg eax
1020 sbb edx, -1
1021
1022 #elif defined(__arm__)
1023
1024 // reverse subtract is awesome
1025 rsbs r0, r0, #0
1026 rsc r3, r3, #0
1027
1028 #elif defined(__aarch64__)
1029
1030 // easy way: everything is better with zero registers.
1031 negs x0, x0
1032 ngc x3, x3
1033
1034 #else
1035 notimpl
1036 #endif
1037
1038 ret
1039
1040 endproc
1041
1042 proc x0c
1043
1044 // rotate is distributive over xor.
1045
1046 #if defined(__x86_64__)
1047
1048 // rax // = a_1 || a_0
1049 // rbx // = b_1 || b_0
1050 mov rcx, rax // = a_1 || a_0
1051
1052 xor rcx, rbx // = (a_1 XOR b_1) || (a_0 XOR b_0)
1053 ror rcx, 0xd // = (a_0 XOR b_0) || (a_1 XOR b_1)
1054
1055 ror rax, 0xd // = a_0 || a_1
1056 ror rbx, 0xd // = b_0 || b_1
1057 xor rax, rbx // = (a_0 XOR b_0) || (a_1 XOR b_1)
1058
1059 cmp rax, rcx // always equal
1060
1061 #elif defined(__i386__)
1062
1063 mov ecx, eax // = a_1 || a_0
1064
1065 xor ecx, ebx // = (a_1 XOR b_1) || (a_0 XOR b_0)
1066 ror ecx, 0xd // = (a_0 XOR b_0) || (a_1 XOR b_1)
1067
1068 ror eax, 0xd // = a_0 || a_1
1069 ror ebx, 0xd // = b_0 || b_1
1070 xor eax, ebx // = (a_0 XOR b_0) || (a_1 XOR b_1)
1071
1072 cmp eax, ecx // always equal
1073
1074 #elif defined(__arm__)
1075
1076
1077 // r0 // = a_1 || a_0
1078 // r1 // = b_1 || b_0
1079 eor r2, r0, r1 // = (a_1 XOR b_1) || (a_0 XOR b_0)
1080 mov r2, r2, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
1081
1082 mov r1, r1, ror #13 // = b_0 || b_1
1083 eor r0, r1, r0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
1084
1085 cmp r0, r2 // always equal
1086
1087 #elif defined(__aarch64__)
1088
1089 // x0 // = a_1 || a_0
1090 // x1 // = b_1 || b_0
1091 eor x2, x0, x1 // = (a_1 XOR b_1) || (a_0 XOR b_0)
1092 ror x2, x2, #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
1093
1094 ror x1, x1, #13 // = b_0 || b_1
1095 eor x0, x1, x0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
1096
1097 cmp x0, x2 // always equal
1098
1099 #else
1100 notimpl
1101 #endif
1102
1103 ret
1104
1105 endproc
1106
1107 proc x0d
1108
1109 // and is distributive over xor.
1110
1111 #if defined(__x86_64__)
1112
1113 mov rdx, rbx // = b
1114
1115 xor rbx, rcx // = b XOR c
1116 and rbx, rax // = a AND (b XOR c)
1117
1118 and rdx, rax // = a AND b
1119 and rax, rcx // = a AND c
1120 xor rax, rdx // = (a AND b) XOR (a AND c)
1121 // = a AND (b XOR c)
1122
1123 cmp rax, rbx // always equal
1124
1125 #elif defined(__i386__)
1126
1127 mov edx, ebx // = b
1128
1129 xor ebx, ecx // = b XOR c
1130 and ebx, eax // = a AND (b XOR c)
1131
1132 and edx, eax // = a AND b
1133 and eax, ecx // = a AND c
1134 xor eax, edx // = (a AND b) XOR (a AND c)
1135 // = a AND (b XOR c)
1136
1137 cmp eax, ebx // always equal
1138
1139 #elif defined(__arm__)
1140
1141 and r3, r0, r1 // = a AND b
1142
1143 eor r1, r1, r2 // = b XOR c
1144 and r1, r1, r0 // = a AND (b XOR c)
1145
1146 and r0, r0, r2 // = a AND c
1147 eor r0, r0, r3 // = (a AND b) XOR (a AND c)
1148 // = a AND (b XOR c)
1149
1150 cmp r0, r1 // always equal
1151
1152 #elif defined(__aarch64__)
1153
1154 and x3, x0, x1 // = a AND b
1155
1156 eor x1, x1, x2 // = b XOR c
1157 and x1, x1, x0 // = a AND (b XOR c)
1158
1159 and x0, x0, x2 // = a AND c
1160 eor x0, x0, x3 // = (a AND b) XOR (a AND c)
1161 // = a AND (b XOR c)
1162
1163 cmp x0, x1 // always equal
1164
1165 #else
1166 notimpl
1167 #endif
1168
1169 ret
1170
1171 endproc
1172
1173 proc x0e
1174
1175 // de morgan's law
1176
1177 #if defined(__x86_64__)
1178
1179 mov rcx, rax // = a
1180
1181 and rcx, rbx // = a AND b
1182 not rcx // = NOT (a AND b)
1183
1184 not rax // = NOT a
1185 not rbx // = NOT b
1186 or rax, rbx // = (NOT a) OR (NOT b)
1187 // = NOT (a AND b)
1188
1189 cmp rax, rcx // always equal
1190
1191 #elif defined(__i386__)
1192
1193 mov ecx, eax // = a
1194
1195 and ecx, ebx // = a AND b
1196 not ecx // = NOT (a AND b)
1197
1198 not eax // = NOT a
1199 not ebx // = NOT b
1200 or eax, ebx // = (NOT a) OR (NOT b)
1201 // = NOT (a AND b)
1202
1203 cmp eax, ecx // always equal
1204
1205 #elif defined(__arm__)
1206
1207 and r2, r0, r1 // = a AND b
1208 mvn r2, r2 // = NOT (a AND b)
1209
1210 mvn r0, r0 // = NOT a
1211 mvn r1, r1 // = NOT b
1212 orr r0, r0, r1 // = (NOT a) OR (NOT b)
1213
1214 cmp r0, r2 // always equal
1215
1216 #elif defined(__aarch64__)
1217
1218 and x2, x0, x1 // = a AND b
1219 mvn x2, x2 // = NOT (a AND b)
1220
1221 mvn x0, x0 // = NOT a
1222 orn x0, x0, x1 // = (NOT a) OR (NOT b)
1223
1224 cmp x0, x2 // always equal
1225
1226 #else
1227 notimpl
1228 #endif
1229
1230 ret
1231
1232 endproc
1233
1234 proc x0f
1235
1236 // replace input buffer bytes with cumulative XORs with initial a;
1237 // final a is XOR of all buffer bytes and initial a.
1238 //
1239 // not sure why you'd do this.
1240
1241 #if defined(__x86_64__)
1242
1243 0: xor [rsi], al
1244 lodsb
1245 loop 0b
1246
1247 #elif defined(__i386__)
1248
1249 0: xor [esi], al
1250 lodsb
1251 loop 0b
1252
1253 #elif defined(__arm__)
1254
1255 0: ldrb r12, [r4]
1256 subs r2, r2, #1
1257 eor r0, r0, r12
1258 strb r0, [r4], #1
1259 bne 0b
1260
1261 #elif defined(__aarch64__)
1262
1263 0: ldrb w16, [x4]
1264 sub x2, x2, #1
1265 eor w0, w0, w16
1266 strb w0, [x4], #1
1267 cbnz x2, 0b
1268
1269 #else
1270 notimpl
1271 #endif
1272
1273 ret
1274
1275 endproc
1276
1277 ///--------------------------------------------------------------------------
1278 /// 0x10--0x1f
1279
1280 proc x10
1281
1282 // four different ways to swap a pair of registers.
1283
1284 #if defined(__x86_64__)
1285
1286 push rax
1287 push rcx
1288 pop rax
1289 pop rcx
1290
1291 xor rax, rcx
1292 xor rcx, rax
1293 xor rax, rcx
1294
1295 add rax, rcx
1296 sub rcx, rax
1297 add rax, rcx
1298 neg rcx
1299
1300 xchg rax, rcx
1301
1302 #elif defined(__i386__)
1303
1304 push eax
1305 push ecx
1306 pop eax
1307 pop ecx
1308
1309 xor eax, ecx
1310 xor ecx, eax
1311 xor eax, ecx
1312
1313 add eax, ecx
1314 sub ecx, eax
1315 add eax, ecx
1316 neg ecx
1317
1318 xchg eax, ecx
1319
1320 #elif defined(__arm__)
1321
1322 stmfd r13!, {r0, r2}
1323 ldr r0, [r13, #4]
1324 ldr r2, [r13], #8
1325
1326 eor r0, r0, r2
1327 eor r2, r2, r0
1328 eor r0, r0, r2
1329
1330 sub r0, r0, r2
1331 add r2, r2, r0
1332 rsb r0, r0, r2 // don't need 3-addr with reverse-sub
1333
1334 mov r12, r0
1335 mov r0, r2
1336 mov r2, r12
1337
1338 #elif defined(__aarch64__)
1339
1340 // anything you can do
1341 stp x0, x2, [sp, #-16]!
1342 ldp x2, x0, [sp], #16
1343
1344 eor x0, x0, x2
1345 eor x2, x2, x0
1346 eor x0, x0, x2
1347
1348 // the add/sub/add thing was daft. you can do it in three if you're
1349 // clever -- and have three-address operations.
1350 sub x0, x0, x2
1351 add x2, x2, x0
1352 sub x0, x2, x0
1353
1354 // but we lack a fourth. we can't do this in fewer than three
1355 // instructions without hitting memory. only `ldp' will modify two
1356 // registers at a time, so we need at least two instructions -- but
1357 // if the first one sets one of our two registers to its final value
1358 // then we lose the other input value with no way to recover it, so
1359 // we must either write a fresh third register, or write something
1360 // other than the final value, and in both cases we need a third
1361 // instruction to fix everything up. we've done the wrong-something-
1362 // other trick twice, so here's the captain-obvious use-a-third-
1363 // register version.
1364 mov x16, x0
1365 mov x0, x2
1366 mov x2, x16
1367
1368 #else
1369 notimpl
1370 #endif
1371
1372 ret
1373
1374 endproc
1375
1376 proc x11
1377
1378 // assuming a is initialized to zero, set a to the inclusive or of
1379 // the xor-differences of corresponding bytes in the c-byte strings
1380 // at si and di.
1381 //
1382 // in particular, a will be zero (and zf set) if and only if the two
1383 // strings are equal.
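	// the same accumulate-the-differences loop in c (a sketch; p, q and
	// n are made-up names for the two buffers and their length):
	//
	//	uint8_t acc = 0;
	//	for (size_t i = 0; i < n; i++) acc |= p[i] ^ q[i];
	//	/* acc == 0 if and only if the buffers are equal */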
1384
1385 #if defined(__x86_64__)
1386
1387 0: mov dl, [rsi]
1388 xor dl, [rdi]
1389 inc rsi
1390 inc rdi
1391 or al, dl
1392 loop 0b
1393
1394 #elif defined(__i386__)
1395
1396 0: mov dl, [esi]
1397 xor dl, [edi]
1398 inc esi
1399 inc edi
1400 or al, dl
1401 loop 0b
1402
1403 #elif defined(__arm__)
1404
1405 0: ldrb r1, [r4], #1
1406 ldrb r12, [r5], #1
1407 subs r2, r2, #1
1408 eor r12, r12, r1
1409 orr r0, r0, r12
1410 bne 0b
1411
1412 #elif defined(__aarch64__)
1413
1414 0: ldrb w16, [x4], #1
1415 ldrb w17, [x5], #1
1416 sub x2, x2, #1
1417 eor w16, w16, w17
1418 orr w0, w0, w16
1419 cbnz x2, 0b
1420
1421 #else
1422 notimpl
1423 #endif
1424
1425 ret
1426
1427 endproc
1428
1429 proc x12
1430
1431 // an obtuse way of adding two registers. for any bit position, a
1432 // OR d is set if and only if at least one of a and d has a bit set
1433 // in that position, and a AND d is set if and only if both have a
1434 // bit set in that position. essentially, then, what we've done is
1435 // move all of the set bits in d to a, unless there's already a bit
1436 // there. this clearly doesn't change the sum.
1437
1438 #if defined(__x86_64__)
1439
1440 mov rcx, rdx // c' = d
1441 and rdx, rax // d' = a AND d
1442 or rax, rcx // a' = a OR d
1443 add rax, rdx
1444
1445 #elif defined(__i386__)
1446
1447 mov ecx, edx // c' = d
1448 and edx, eax // d' = a AND d
1449 or eax, ecx // a' = a OR d
1450 add eax, edx
1451
1452 #elif defined(__arm__)
1453
1454 and r2, r0, r3 // c' = a AND d
1455 orr r0, r0, r3 // a' = a OR d
1456 add r0, r0, r2
1457
1458 #elif defined(__aarch64__)
1459
1460 and x2, x0, x3 // c' = a AND d
1461 orr x0, x0, x3 // a' = a OR d
1462 add x0, x0, x2
1463
1464 #else
1465 notimpl
1466 #endif
1467
1468 ret
1469
1470 endproc
1471
1472 proc x13
1473
1474 // ok, so this is a really obtuse way of adding a and b; the result
1475 // is in a and d. but why does it work?
1476
1477 #if defined(__x86_64__)
1478
1479 mov rcx, 0x40 // carry chains at most 64 long
1480 0: mov rdx, rax // copy a'
1481 xor rax, rbx // low bits of each bitwise sum
1482 and rbx, rdx // carry bits from each bitwise sum
1483 shl rbx, 1 // carry them into next position
1484 loop 0b
1485
1486 #elif defined(__i386__)
1487
1488 mov ecx, 0x40 // carry chains at most 64 long
1489 0: mov edx, eax // copy a'
1490 xor eax, ebx // low bits of each bitwise sum
1491 and ebx, edx // carry bits from each bitwise sum
1492 shl ebx, 1 // carry them into next position
1493 loop 0b
1494
1495 #elif defined(__arm__)
1496
1497 mov r2, #0x40
1498 0: and r3, r0, r1
1499 subs r2, r2, #1
1500 eor r0, r0, r1
1501 lsl r1, r3, #1
1502 bne 0b
1503
1504 #elif defined(__aarch64__)
1505
1506 mov x2, #0x40
1507 0: and x3, x0, x1
1508 sub x2, x2, #1
1509 eor x0, x0, x1
1510 lsl x1, x3, #1
1511 cbnz x2, 0b
1512
1513 #else
1514 notimpl
1515 #endif
1516
1517 ret
1518
1519 endproc
1520
1521 proc x14
1522
1523 // floor((a + d)/2), like x08.
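	// the identity in use, as a c sketch:
	//
	//	uint64_t avg = (a & d) + ((a ^ d) >> 1);
	//
	// since a + d = 2 (a AND d) + (a XOR d), halving gives the formula
	// above, and neither term can overflow.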
1524
1525 #if defined(__x86_64__)
1526
1527 mov rcx, rax // copy a for later
1528 and rcx, rdx // carry bits
1529
1530 xor rax, rdx // low bits of each bitwise sum
1531 shr rax, 1 // divide by 2; carries now in place
1532
1533 add rax, rcx // add the carries; done
1534
1535 #elif defined(__i386__)
1536
1537 mov ecx, eax // copy a for later
1538 and ecx, edx // carry bits
1539
1540 xor eax, edx // low bits of each bitwise sum
1541 shr eax, 1 // divide by 2; carries now in place
1542
1543 add eax, ecx // add the carries; done
1544
1545 #elif defined(__arm__)
1546
1547 and r2, r0, r3
1548 eor r0, r0, r3
1549 add r0, r2, r0, lsr #1
1550
1551 #elif defined(__aarch64__)
1552
1553 and x2, x0, x3
1554 eor x0, x0, x3
1555 add x0, x2, x0, lsr #1
1556
1557 #else
1558 notimpl
1559 #endif
1560
1561 ret
1562
1563 endproc
1564
1565 proc x15
1566
1567 // sign extension 32 -> 64 bits (16 -> 32 in the narrower variants).
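	// the add/xor trick below, as a c sketch of the 64-bit version
	// (assuming the top half of a is initially clear):
	//
	//	uint64_t m = 0xffffffff80000000;
	//	uint64_t sx = (a + m) ^ m;	/* (int32_t)a, sign-extended */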
1568
1569 #if defined(__x86_64__)
1570
1571 movsx rbx, eax // like this?
1572
1573 mov rdx, 0xffffffff80000000
1574 add rax, rdx // if bit 31 of a is set then bits
1575 // 31--63 of a' are clear; otherwise,
1576 // these bits are all set -- which is
1577 // exactly backwards
1578 xor rax, rdx // so fix it
1579
1580 #elif defined(__i386__)
1581
1582 movsx ebx, ax // like this?
1583
1584 mov edx, 0xffff8000
1585 add eax, edx // if bit 15 of a is set then bits
1586 // 15--31 of a' are clear; otherwise,
1587 // these bits are all set -- which is
1588 // exactly backwards
1589 xor eax, edx // so fix it
1590
1591 #elif defined(__arm__)
1592
1593 sxth r1, r0 // like this
1594
1595 mov r12, #0x80000000
1596 add r0, r0, r12, asr #16
1597 eor r0, r0, r12, asr #16
1598
1599 #elif defined(__aarch64__)
1600
1601 sxtw x1, w0 // like this
1602
1603 mov x16, #0xffffffff80000000
1604 add x0, x0, x16
1605 eor x0, x0, x16
1606
1607 #else
1608 notimpl
1609 #endif
1610
1611 ret
1612
1613 endproc
1614
1615 proc x16
1616
1617 // ??? i don't know why you'd want to calculate this.
1618
1619 #if defined(__x86_64__)
1620
1621 xor rax, rbx // a' = a XOR b
1622 xor rbx, rcx // b' = b XOR c
1623 mov rsi, rax // t = a XOR b
1624 add rsi, rbx // t = (a XOR b) + (b XOR c)
1625 cmovc rax, rbx // a' = cf ? b XOR c : a XOR b
1626 xor rax, rbx // a' = cf ? 0 : a XOR c
1627 cmp rax, rsi
1628
1629 #elif defined(__i386__)
1630
1631 xor eax, ebx // a' = a XOR b
1632 xor ebx, ecx // b' = b XOR c
1633 mov esi, eax // t = a XOR b
1634 add esi, ebx // t = (a XOR b) + (b XOR c)
1635 cmovc eax, ebx // a' = cf ? b XOR c : a XOR b
1636 xor eax, ebx // a' = cf ? 0 : a XOR c
1637 cmp eax, esi
1638
1639 #elif defined(__arm__)
1640
1641 eor r0, r0, r1
1642 eor r1, r1, r2
1643 adds r4, r0, r1
1644 movcs r0, r1
1645 eor r0, r0, r1
1646 cmp r0, r4
1647
1648 #elif defined(__aarch64__)
1649
1650 eor x0, x0, x1
1651 eor x1, x1, x2
1652 adds x4, x0, x1
1653 cmov.cs x0, x1
1654 eor x0, x0, x1
1655 cmp x0, x4
1656
1657 #else
1658 notimpl
1659 #endif
1660
1661 ret
1662
1663 endproc
1664
1665 proc x17
1666
1667 // absolute value
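	// the sign-mask trick used below, as a c sketch:
	//
	//	int64_t m = a >> 63;		/* arithmetic shift: 0 or -1 */
	//	int64_t r = (a ^ m) - m;	/* flip and add one if negative */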
1668
1669 #if defined(__x86_64__)
1670
1671 cqo // d = a < 0 ? -1 : 0
1672 xor rax, rdx // a' = a < 0 ? -a - 1 : a
1673 sub rax, rdx // a' = a < 0 ? -a : a
1674
1675 #elif defined(__i386__)
1676
1677 cdq // d = a < 0 ? -1 : 0
1678 xor eax, edx // a' = a < 0 ? -a - 1 : a
1679 sub eax, edx // a' = a < 0 ? -a : a
1680
1681 #elif defined(__arm__)
1682
1683 // direct approach
1684 movs r1, r0
1685 rsbmi r1, r0, #0
1686
1687 // faithful-ish conversion
1688 eor r3, r0, r0, asr #31
1689 sub r0, r3, r0, asr #31
1690
1691 #elif defined(__aarch64__)
1692
1693 // direct approach
1694 tst x0, #1 << 63
1695 cneg.ne x1, x0
1696
1697 // faithful-ish conversion
1698 eor x3, x0, x0, asr #63
1699 sub x0, x3, x0, asr #63
1700
1701 #else
1702 notimpl
1703 #endif
1704
1705 ret
1706
1707 endproc
1708
1709 proc x18
1710
1711 // should always set sf, clear zf, unless we get rescheduled to a
1712 // different core.
1713
1714 #if defined(__x86_64__)
1715
1716 rdtsc // d || a = cycles
1717 shl rdx, 0x20
1718 or rax, rdx // a = cycles
1719 mov rcx, rax // c = cycles
1720
1721 rdtsc // d || a = cycles'
1722 shl rdx, 0x20
1723 or rax, rdx // a = cycles'
1724
1725 cmp rcx, rax
1726
1727 #elif defined(__i386__)
1728
1729 rdtsc // d || a = cycles
1730 mov ebx, eax
1731 mov ecx, edx // c || b = cycles
1732
1733 rdtsc // d || a = cycles'
1734
1735 sub ebx, eax
1736 sbb ecx, edx
1737
1738 #elif defined(__arm__)
1739
1740 // cycle clock not available in user mode
1741 mrrc p15, 0, r0, r1, c9
1742 mrrc p15, 0, r2, r3, c9
1743 subs r0, r0, r2
1744 sbcs r1, r1, r3
1745
1746 #elif defined(__aarch64__)
1747
1748 // cycle clock not available in user mode
1749 mrs x0, pmccntr_el0
1750 mrs x1, pmccntr_el0
1751 cmp x0, x1
1752
1753 #else
1754 notimpl
1755 #endif
1756
1757 ret
1758
1759 endproc
1760
1761 proc x19
1762
1763 // stupid way to capture a pointer to inline data and jump past it.
1764 // confuses the return-address predictor something chronic. worse
1765 // because amd64 calling convention doesn't usually pass arguments on
1766 // the stack.
1767
1768 #if defined(__x86_64__)
1769
1770 call 8f
1771 .string "hello world!\n\0"
1772 8: call print_str
1773 add rsp, 8
1774 ret
1775
1776 print_str:
1777 // actually implement this ridiculous thing
1778 mov rsi, [rsp + 8]
1779 xor edx, edx
1780 0: mov al, [rsi + rdx]
1781 inc rdx
1782 cmp al, 0
1783 jnz 0b
1784 mov eax, SYS_write
1785 mov edi, 1
1786 dec rdx
1787 syscall // clobbers r11 :-(
1788 ret
1789
1790 #elif defined(__i386__)
1791
1792 call 8f
1793 .string "hello world!\n\0"
1794 8: call print_str
1795 add esp, 4
1796 ret
1797
1798 print_str:
1799 // actually implement this ridiculous thing
1800 mov ecx, [esp + 4]
1801 xor edx, edx
1802 0: mov al, [ecx + edx]
1803 inc edx
1804 cmp al, 0
1805 jnz 0b
1806 mov eax, SYS_write
1807 mov ebx, 1
1808 dec edx
1809 int 0x80
1810 ret
1811
1812 #elif defined(__arm__)
1813
1814 // why am i doing this?
1815 stmfd r13!, {r14}
1816 bl 8f
1817 .string "hello world!\n\0"
1818 .balign 4
1819 8: mov r1, r14 // might as well make it easy on myself
1820 bl print_str
1821 ldmfd r13!, {pc}
1822
1823 print_str:
1824 mov r2, #0
1825 0: ldrb r0, [r1, r2]
1826 cmp r0, #0
1827 addne r2, r2, #1
1828 bne 0b
1829 mov r0, #1
1830 mov r7, #SYS_write
1831 swi 0
1832 bx r14
1833
1834 #elif defined(__aarch64__)
1835
1836 // why am i doing this?
1837 str x30, [sp, #-16]!
1838 bl 8f
1839 .string "hello world!\n\0"
1840 .balign 4
1841 8: mov x1, x30 // might as well make it easy on myself
1842 bl print_str
1843 ldr x30, [sp], #16
1844 ret
1845
1846 print_str:
1847 mov x2, #0
1848 0: ldrb w0, [x1, x2]
1849 cmp w0, #0
1850 cinc.ne x2, x2
1851 b.ne 0b
1852 mov x0, #1
1853 mov x8, #SYS_write
1854 svc #0
1855 ret
1856
1857 #else
1858 notimpl
1859 #endif
1860
1861 endproc
1862
1863 proc x1a
1864
1865 // collect the current instruction-pointer address. this was an old
1866 // 32-bit i386 trick for position-independent code, but (a) it
1867 // confuses the return predictor, and (b) amd64 has true pc-relative
1868 // addressing.
1869
1870 #if defined(__x86_64__)
1871
1872 // the actual example
1873 call 0f
1874 0: pop rax
1875
1876 // the modern i386 trick doesn't confuse the return-address
1877 // predictor.
1878 call calladdr_rbx
1879 sub rbx, . - 0b
1880
1881 // but rip-relative addressing is even better
1882 lea rcx, [rip + 0b]
1883
1884 ret
1885
1886 calladdr_rbx:
1887 mov rbx, [rsp]
1888 ret
1889
1890 #elif defined(__i386__)
1891
1892 // the actual example
1893 call 0f
1894 0: pop eax
1895
1896 // the modern i386 trick doesn't confuse the return-address
1897 // predictor.
1898 call get_pc_ebx
1899 sub ebx, . - 0b
1900
1901 ret
1902
1903 #elif defined(__arm__)
1904
1905 stmfd r13!, {r14}
1906
1907 bl 0f
1908 0: mov r0, r14
1909
1910 bl return
1911 sub r1, r14, #. - 0b
1912
1913 adr r2, 0b
1914
1915 ldmfd r13!, {pc}
1916
1917 return: bx r14
1918
1919 #elif defined(__aarch64__)
1920
1921 str x30, [sp, #-16]!
1922
1923 // we can do all of the above using a64
1924 bl 0f
1925 0: mov x0, x30
1926
1927 bl return
1928 sub x1, x30, #. - 0b
1929
1930 adr x2, 0b
1931
1932 ldr x30, [sp], #16
1933 return: ret
1934
1935 #else
1936 notimpl
1937 #endif
1938
1939 endproc
1940
1941 proc x1b
1942
1943 #if defined(__x86_64__)
1944
1945 // retpolines: an mitigation against adversarially influenced
1946 // speculative execution at indirect branches. if an adversary can
1947 // prepare a branch-target buffer entry matching an indirect branch
1948 // in the victim's address space then they can cause the victim to
1949 // /speculatively/ (but not architecturally) execute any code in
1950 // their address space, possibly leading to leaking secrets through
1951 // the cache. retpolines aren't susceptible to this because the
1952 // predicted destination address is from the return-prediction stack
1953 // which the adversary can't prime. the performance penalty is still
1954 // essentially a branch misprediction -- for this return, and
1955 // possibly all others already stacked.
1956
1957 // (try not to crash)
1958 lea rax, [rip + 9f]
1959
1960 push rax
1961 9: ret
1962
1963 #elif defined(__i386__)
1964
1965 call get_pc_ebx
1966 lea eax, [ebx + 9f - .]
1967
1968 push eax
1969 9: ret
1970
1971 #elif defined(__arm__)
1972
1973 stmfd r13!, {r14}
1974
1975 adr r14, 8f
1976 bx r14
1977
1978 8: ldmfd r13!, {pc}
1979
1980 #elif defined(__aarch64__)
1981
1982 str x30, [sp, #-16]!
1983
1984 adr x30, 8f
1985 ret
1986
1987 8: ldr x30, [sp], #16
1988 ret
1989
1990 #else
1991 notimpl
1992 #endif
1993
1994 endproc
1995
1996 proc x1c
1997
1998 // ok, having a hard time seeing a use for this. the most important
1999 // thing to note is that sp is set from `pop' /after/ it's
2000 // incremented.
2001
2002 #if defined(__x86_64__)
2003
2004 // try not to crash
2005 mov rax, rsp
2006 and rsp, -16
2007 push rax
2008
2009 pop rsp
2010
2011 // check it worked
2012 mov rbx, rsp
2013 ret
2014
2015 #elif defined(__i386__)
2016
2017 // try not to crash
2018 mov eax, esp
2019 and esp, -16
2020 push eax
2021
2022 pop esp
2023
2024 // check it worked
2025 mov ebx, esp
2026 ret
2027
2028 #elif defined(__arm__)
2029
2030 // not even going to dignify this
2031 notimpl
2032
2033 #elif defined(__aarch64__)
2034
2035 // not even going to dignify this
2036 notimpl
2037
2038 #else
2039 notimpl
2040 #endif
2041
2042 endproc
2043
2044 proc x1d
2045
2046 // monumentally cheesy way to copy 8 n bytes from buff1 to buff2.
2047 // also clobbers words at buff2 + 8 n and buff2 - 8 for good measure.
2048
2049 n = 4
2050
2051 #if defined(__x86_64__)
2052
2053 mov rax, rsp // safekeeping
2054
2055 // we're toast if we get hit by a signal now. fingers crossed...
2056 .if 0
2057 mov rsp, buff2 + 8*n + 8
2058 mov rbp, buff1 + 8*n
2059 .else
2060 lea rsp, [rdi + 8*n + 16]
2061 lea rbp, [rsi + 8*n]
2062 .endif
2063 enter 0, n + 1
2064
2065 // precise action:
2066 //
2067 // +---------+ +---------+
2068 // rbp -> | ??? | rsp -> | ??? |
2069 // +---------+ +---------+
2070 // | w_{n-1} | | rbp | <- rbp'
2071 // +---------+ +---------+
2072 // | ... | | w_{n-1} |
2073 // +---------+ +---------+
2074 // | w_1 | | ... |
2075 // +---------+ +---------+
2076 // | w_0 | | w_1 |
2077 // +---------+ +---------+
2078 // | w_0 |
2079 // +---------+
2080 // | rbp' | <- rsp'
2081 // +---------+
2082
2083 mov rdx, rsp
2084 mov rsp, rax
2085
2086 #elif defined(__i386__)
2087
2088 mov eax, esp // safekeeping
2089
2090 // we're toast if we get hit by a signal now. fingers crossed...
2091 .if 0
2092 mov esp, buff2 + 4*n + 4
2093 mov ebp, buff1 + 4*n
2094 .else
2095 lea esp, [edi + 4*n + 8]
2096 lea ebp, [esi + 4*n]
2097 .endif
2098 enter 0, n + 1
2099
2100 mov edx, esp
2101 mov esp, eax
2102
2103 #elif defined(__arm__)
2104
2105 add r4, r4, #4*n
2106 add r5, r5, #4*n + 8
2107
2108 str r4, [r5, #-4]!
2109 .rept n/2
2110 ldrd r0, r1, [r4, #-8]!
2111 strd r0, r1, [r5, #-8]!
2112 .endr
2113 add r4, r5, #4*n
2114 str r4, [r5, #-4]!
2115
2116 #elif defined(__aarch64__)
2117
2118 // omgwtf. let's not actually screw with the stack pointer.
2119
2120 add x4, x4, #8*n
2121 add x5, x5, #8*n + 16
2122
2123 str x4, [x5, #-8]!
2124 .rept n/2
2125 ldp x16, x17, [x4, #-16]!
2126 stp x16, x17, [x5, #-16]!
2127 .endr
2128 add x4, x5, #8*n
2129 str x4, [x5, #-8]!
2130
2131 #else
2132 notimpl
2133 #endif
2134
2135 ret
2136
2137 endproc
2138
2139 proc x1e
2140
2141 // convert nibble value to (uppercase) hex; other input values yield
2142 // nonsense.
2143
2144 #if defined(__x86_64__)
2145
2146 // das doesn't work in 64-bit mode; best i can come up with
2147 mov edx, eax
2148 add al, '0'
2149 add dl, 'A' - 10
2150 cmp al, '9' + 1
2151 cmovae eax, edx
2152
2153 #elif defined(__i386__)
2154
2155 cmp al, 0x0a // cf = 1 iff a < 10
2156 sbb al, 0x69 // if 0 <= a < 10, a' = a - 0x6a, so
2157 // 0x96 <= a' < 0xa0, setting af, cf
2158 // if 10 <= a < 16, a' = a - 0x69, so
2159 // 0xa1 <= a' < 0xa7, setting cf but
2160 // clearing af
2161 das // if 0 <= a < 10, then af and cf are
2162 // both set, so subtract 0x66
2163 // from a' leaving 0x30 <= a' < 0x3a;
2164 // if 10 <= a < 16 then af clear but
2165 // cf set, so subtract 0x60 from a'
2166 // leaving 0x41 <= a' < 0x47
2167
2168 #elif defined(__arm__)
2169
2170 // significantly less tricksy
2171 cmp r0, #10
2172 addlo r0, r0, #'0'
2173 addhs r0, r0, #'A' - 10
2174
2175 #elif defined(__aarch64__)
2176
2177 // with less versatile conditional execution this is the best we can
2178 // do
2179 cmp w0, #10
2180 add w16, w0, #'A' - 10
2181 add w0, w0, #'0'
2182 cmov.hs w0, w16
2183
2184 #else
2185 notimpl
2186 #endif
2187
2188 ret
2189
2190 endproc
2191
2192 proc x1f
2193
2194 // verify collatz conjecture starting at a; assume a /= 0!
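	// the shape of the loop, as a c sketch (ignoring the optional
	// logging into the buffer):
	//
	//	for (;;) {
	//		while (!(a & 1)) a >>= 1;	/* strip powers of two */
	//		if (a == 1) break;
	//		a = 3*a + 1;
	//	}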
2195
2196 #if defined(__x86_64__)
2197
2198 0: bsf rcx, rax // clobber c if a = 0
2199 shr rax, cl // a = 2^c a'
2200 cmp rdx, 0
2201 je 1f
2202 stosq
2203 dec rdx
2204 1:
2205 cmp rax, 1 // done?
2206 je 9f
2207 lea rax, [2*rax + rax + 1] // a' = 3 a' + 1
2208 jmp 0b // again
2209
2210 9: ret
2211
2212 #elif defined(__i386__)
2213
2214 0: bsf ecx, eax // clobber c if a = 0
2215 shr eax, cl // a = 2^c a'
2216 cmp edx, 0
2217 je 1f
2218 stosd
2219 dec edx
2220 1:
2221 cmp eax, 1 // done?
2222 je 9f
2223 lea eax, [2*eax + eax + 1] // a' = 3 a' + 1
2224 jmp 0b // again
2225
2226 9: ret
2227
2228 #elif defined(__arm__)
2229
2230 // rbit introduced in armv7
2231 0: rbit r2, r0
2232 clz r2, r2
2233 mov r0, r0, lsr r2 // a = 2^c a'
2234 cmp r3, #0
2235 strne r0, [r5], #4
2236 subne r3, r3, #1
2237 cmp r0, #1
2238 adcne r0, r0, r0, lsl #1 // a' = 3 a' + 1 (because c set)
2239 bne 0b
2240
2241 ret
2242
2243 #elif defined(__aarch64__)
2244
2245 0: rbit w2, w0
2246 clz w2, w2
2247 lsr w0, w0, w2 // a = 2^c a'
2248 cmp x3, #0
2249 beq 1f
2250 str x0, [x5], #8
2251 sub x3, x3, #1
2252 1:
2253 cmp w0, #1
2254 add w16, w0, w0, lsl #1 // t = 3 a' + 1 (because c set)
2255 csinc.eq w0, w0, w16
2256 b.ne 0b
2257
2258 ret
2259
2260 #else
2261 notimpl
2262 #endif
2263
2264 endproc
2265
2266 ///--------------------------------------------------------------------------
2267 /// 0x20--0x2f
2268
2269 proc x20
2270
2271 // calculate 1337 a slowly
2272
2273 #if defined(__x86_64__)
2274
2275 // original version
2276 mov rcx, rax // c = a
2277 shl rcx, 2 // c = 4 a
2278 add rcx, rax // c = 5 a
2279 shl rcx, 3 // c = 40 a
2280 add rcx, rax // c = 41 a
2281 shl rcx, 1 // c = 82 a
2282 add rcx, rax // c = 83 a
2283 shl rcx, 1 // c = 166 a
2284 add rcx, rax // c = 167 a
2285 shl rcx, 3 // c = 1336 a
2286 add rcx, rax // c = 1337 a
2287
2288 // a quick way
2289 lea rdx, [2*rax + rax] // t = 3 a
2290 shl rdx, 6 // t = 192 a
2291 sub rdx, rax // t = 191 a
2292 lea rbx, [8*rdx] // b = 1528 a
2293 sub rbx, rdx // b = 1337 a
2294
2295 #elif defined(__i386__)
2296
2297 // original version
2298 mov ecx, eax // c = a
2299 shl ecx, 2 // c = 4 a
2300 add ecx, eax // c = 5 a
2301 shl ecx, 3 // c = 40 a
2302 add ecx, eax // c = 41 a
2303 shl ecx, 1 // c = 82 a
2304 add ecx, eax // c = 83 a
2305 shl ecx, 1 // c = 166 a
2306 add ecx, eax // c = 167 a
2307 shl ecx, 3 // c = 1336 a
2308 add ecx, eax // c = 1337 a
2309
2310 // a quick way
2311 lea edx, [2*eax + eax] // t = 3 a
2312 shl edx, 6 // t = 192 a
2313 sub edx, eax // t = 191 a
2314 lea ebx, [8*edx] // b = 1528 a
2315 sub ebx, edx // b = 1337 a
2316
2317 #elif defined(__arm__)
2318
2319 // original version, ish
2320 add r2, r0, r0, lsl #2 // c = 5 a
2321 add r2, r0, r2, lsl #3 // c = 41 a
2322 add r2, r0, r2, lsl #1 // c = 83 a
2323 add r2, r0, r2, lsl #1 // c = 167 a
2324 add r2, r0, r2, lsl #3 // c = 1337 a
2325
2326 // quicker way
2327 add r1, r0, r0, lsl #1 // b = 3 a
2328 rsb r1, r0, r1, lsl #6 // b = 191 a
2329 rsb r1, r1, r1, lsl #3 // b = 1337 a
2330
2331 #elif defined(__aarch64__)
2332
2333 // original version, ish
2334 add x2, x0, x0, lsl #2 // c = 5 a
2335 add x2, x0, x2, lsl #3 // c = 41 a
2336 add x2, x0, x2, lsl #1 // c = 83 a
2337 add x2, x0, x2, lsl #1 // c = 167 a
2338 add x2, x0, x2, lsl #3 // c = 1337 a
2339
2340 // sleazy because no rsb
2341 add x1, x0, x0, lsl #1 // b = 3 a
2342 sub x1, x0, x1, lsl #6 // b = -191 a
2343 sub x1, x1, x1, lsl #3 // b = 1337 a
2344
2345 #else
2346 notimpl
2347 #endif
2348
2349 ret
2350
2351 endproc
2352
2353 proc x21
2354
2355 // multiply complex numbers a + b i and c + d i
2356 //
2357 // (a + b i) (c + d i) = (a c - b d) + (a d + b c) i
2358 //
2359 // somewhat slick approach uses only three multiplications
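	// checking the three-product identity: with t = c (a + b),
	// u = a (d - c), v = b (c + d),
	//
	//	t + u = a c + b c + a d - a c = a d + b c	(imaginary part)
	//	t - v = a c + b c - b c - b d = a c - b d	(real part)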
2360
2361 #if defined(__x86_64__)
2362
2363 mov rsi, rax // t = a
2364 add rax, rbx // a' = a + b
2365 mov rdi, rdx // u = d
2366 sub rdx, rcx // d' = d - c
2367 add rdi, rcx // u = c + d
2368
2369 imul rax, rcx // a' = c (a + b)
2370 imul rsi, rdx // t = a (d - c)
2371 imul rdi, rbx // u = b (c + d)
2372
2373 add rsi, rax // t = a (d - c) + c (a + b)
2374 mov rbx, rsi // b' = a (d - c) + c (a + b)
2375 // = a d + b c
2376 sub rax, rdi // a' = c (a + b) - b (c + d)
2377 // = a c - b d
2378
2379 #elif defined(__i386__)
2380
2381 mov esi, eax // t = a
2382 add eax, ebx // a' = a + b
2383 mov edi, edx // u = d
2384 sub edx, ecx // d' = d - c
2385 add edi, ecx // u = c + d
2386
2387 imul eax, ecx // a' = c (a + b)
2388 imul esi, edx // t = a (d - c)
2389 imul edi, ebx // u = b (c + d)
2390
2391 add esi, eax // t = a (d - c) + c (a + b)
2392 mov ebx, esi // b' = a (d - c) + c (a + b)
2393 // = a d + b c
2394 sub eax, edi // a' = c (a + b) - b (c + d)
2395 // = a c - b d
2396
2397 #elif defined(__arm__)
2398
2399 add r4, r0, r1 // t = a + b
2400 add r5, r2, r3 // u = c + d
2401 sub r3, r3, r2 // d' = d - c
2402
2403 // mls introduced in armv7
2404 mul r4, r4, r2 // t = c (a + b)
2405 mov r2, r1 // c' = a (bah!)
2406 mla r1, r0, r3, r4 // b' = a (d - c) + c (a + b)
2407 // = a d + b c
2408 mls r0, r2, r5, r4 // a' = c (a + b) - b (c + d)
2409 // = a c - b d
2410
2411 #elif defined(__aarch64__)
2412
2413 add x4, x0, x1 // t = a + b
2414 add x5, x2, x3 // u = c + d
2415 sub x3, x3, x2 // d' = d - c
2416
2417 // mls introduced in armv7
2418 mul x4, x4, x2 // t = c (a + b)
2419 mov x2, x1 // c' = a (bah!)
2420 madd x1, x0, x3, x4 // b' = a (d - c) + c (a + b)
2421 // = a d + b c
2422 msub x0, x2, x5, x4 // a' = c (a + b) - b (c + d)
2423 // = a c - b d
2424
2425 #else
2426 notimpl
2427 #endif
2428
2429 ret
2430
2431 endproc
2432
2433 proc x22
2434
2435 // divide by 3
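	// the same reciprocal multiplication as a c sketch (assuming a
	// compiler which provides unsigned __int128):
	//
	//	uint64_t q =
	//		(uint64_t)(((unsigned __int128)a *
	//			    0xaaaaaaaaaaaaaaab) >> 65);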
2436
2437 #if defined(__x86_64__)
2438
2439 mov rdx, 0xaaaaaaaaaaaaaaab // = ceil(2/3 2^64)
2440 mul rdx // d' || a' =~ 2/3 a 2^64
2441 shr rdx, 1 // d' = floor(a/3)
2442 mov rax, rdx // a' = floor(a/3)
2443
2444 // we start with 0 <= a < 2^64. write f = ceil(2/3 2^64), so that
2445 // 2/3 < f/2^64 < 2/3 + 1/2^64. then floor(2/3 a) <= floor(a f/2^64)
2446 // <= floor(2/3 a + a/2^64), but a < 2^64 so a/2^64 < 1 and
2447 // floor(a f/2^64) = floor(2/3 a).
2448
2449 #elif defined(__i386__)
2450
2451 mov edx, 0xaaaaaaab // = ceil(2/3 2^32)
2452 mul edx // d' || a' =~ 2/3 a 2^32
2453 shr edx, 1 // d' = floor(a/3)
2454 mov eax, edx // a' = floor(a/3)
2455
2456 #elif defined(__arm__)
2457
2458 ldr r12, =0xaaaaaaab
2459 umull r12, r0, r0, r12
2460 mov r0, r0, lsr #1
2461
2462 #elif defined(__aarch64__)
2463
2464 ldr x16, =0xaaaaaaaaaaaaaaab
2465 umulh x0, x0, x16
2466 lsr x0, x0, #1
2467
2468 #else
2469 notimpl
2470 #endif
2471
2472 ret
2473
2474 endproc
2475
2476 proc x23
2477
2478 #if defined(__x86_64__)
2479
2480 // main loop: shorten a preserving residue class mod 3
2481 0: cmp rax, 5
2482 jbe 8f
2483 // a > 5
2484 mov rdx, rax // d' = a
2485 shr rdx, 2 // d' = floor(a/4)
2486 and rax, 3 // a = 4 d' + a' (0 <= a' < 4)
2487 add rax, rdx // a' == a (mod 3) but a' < a/4 + 4
2488 jmp 0b
2489
2490 // fix up final value 0 <= a < 6: want 0 <= a < 3
2491 //
2492 // the tricky part is actually a = 3; but the other final cases take
2493 // additional iterations which we can avoid.
2494 8: cmp rax, 3 // set cf iff a < 3
2495 cmc // set cf iff a >= 3
2496 sbb rdx, rdx // d' = a >= 3 ? -1 : 0
2497 and rdx, 3 // d' = a >= 3 ? 3 : 0
2498 sub rax, rdx // a' = a - (a >= 3 ? 3 : 0)
2499 // = a (mod 3)
2500
2501 #elif defined(__i386__)
2502
2503 // main loop: shorten a preserving residue class mod 3
2504 0: cmp eax, 5
2505 jbe 8f
2506 // a > 5
2507 mov edx, eax // d' = a
2508 shr edx, 2 // d' = floor(a/4)
2509 and eax, 3 // a = 4 d' + a' (0 <= a' < 4)
2510 add eax, edx // a' == a (mod 3) but a' < a/4 + 4
2511 jmp 0b
2512
2513 // fix up final value 0 <= a < 6: want 0 <= a < 3
2514 //
2515 // the tricky part is actually a = 3; but the other final cases take
2516 // additional iterations which we can avoid.
2517 8: cmp eax, 3 // set cf iff a < 3
2518 cmc // set cf iff a >= 3
2519 sbb edx, edx // d' = a >= 3 ? -1 : 0
2520 and edx, 3 // d' = a >= 3 ? 3 : 0
2521 sub eax, edx // a' = a - (a >= 3 ? 3 : 0)
2522 // = a (mod 3)
2523
2524 #elif defined(__arm__)
2525
2526 0: cmp r0, #6
2527 andhs r12, r0, #3
2528 addhs r0, r12, r0, lsr #2
2529 bhs 0b
2530
2531 cmp r0, #3
2532 subhs r0, r0, #3
2533
2534 #elif defined(__aarch64__)
2535
2536 0: cmp x0, #6
2537 // blunder on through regardless since this doesn't affect the result
2538 and x16, x0, #3
2539 add x0, x16, x0, lsr #2
2540 b.hs 0b
2541
2542 subs x16, x0, #3
2543 cmov.hs x0, x16
2544
2545 #else
2546 notimpl
2547 #endif
2548
2549 ret
2550
2551 endproc
2552
2553 proc x24
2554
2555 // invert (odd) a mod 2^64
2556 //
2557 // suppose a a_i == 1 (mod 2^{2^i})
2558 //
2559 // clearly good for i = 0, since 2^i = 1 and 2^{2^i} = 2, and a_0 =
2560 // a == 1 (mod 2) by assumption
2561 //
2562 // write a a_i == b_i 2^{2^i} + 1 (mod 2^{2^{i+1}})
2563 // then b_i == (a a_i - 1)/2^{2^i} (mod 2^{2^i})
2564 // to lift inverse, we want x such that a x == -b_i (mod 2^{2^i});
2565 // clearly x = -a_i b_i will do, since a a_i == 1 (mod 2^{2^i})
2566 // then:
2567 // a_{i+1} = a_i - a_i b_i 2^{2^i} = a_i (1 - (a a_i - 1))
2568 // = 2 a_i - a a_i^2
2569 //
2570 // check:
2571 // a a_{i+1} = 2 a a_i - a^2 a_i^2
2572 // == 2 a a_i - (b_i 2^{2^i} + 1)^2
2573 // == 2 (b_i 2^{2^i} + 1) -
2574 // (b_i^2 2^{2^{i+1}} + 2 b_i 2^{2^i} + 1)
2575 // == 1 (mod 2^{2^{i+1}})
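	// as a c sketch (a must be odd; each pass doubles the number of
	// correct low-order bits, so it converges quickly):
	//
	//	uint64_t x = a;			/* a_0 */
	//	while (a*x != 1) x *= 2 - a*x;	/* a_{i+1} = 2 a_i - a a_i^2 */
	//	/* now a*x == 1 (mod 2^64) */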
2576
2577 #if defined(__x86_64__)
2578
2579 // rax // a_0 = a
2580 mov rbx, rax // b' = a
2581 mov rsi, rax // t = a_0
2582
2583 0:
2584 cmp rbp, 0
2585 je 1f
2586 stosq
2587 dec rbp
2588 1:
2589 mul rbx // a' = a a_i
2590 mov rcx, rax // c = a a_i
2591
2592 sub rax, 2 // a' = a a_i - 2
2593 neg rax // a' = 2 - a a_i
2594 mul rsi // a_{i+1} = a_i (2 - a a_i)
2595 // = 2 a_i - a a_i^2
2596 mov rsi, rax // t = a_{i+1}
2597
2598 cmp rcx, 1 // done?
2599 ja 0b // no -- iterate
2600
2601 #elif defined(__i386__)
2602
2603 // eax // a_0 = a
2604 mov ebx, eax // b' = a
2605 mov esi, eax // t = a_0
2606
2607 0:
2608 cmp ebp, 0
2609 je 1f
2610 stosd
2611 dec ebp
2612 1:
2613 mul ebx // a' = a a_i
2614 mov ecx, eax // c = a a_i
2615
2616 sub eax, 2 // a' = a a_i - 2
2617 jb 9f // done if < 2
2618 neg eax // a' = 2 - a a_i
2619 mul esi // a_{i+1} = a_i (2 - a a_i)
2620 // = 2 a_i - a a_i^2
2621 mov esi, eax // t = a_{i+1}
2622
2623 jmp 0b // and iterate
2624 9: mov eax, esi // restore
2625
2626 #elif defined(__arm__)
2627
2628 // r0 // a_0 = a
2629 mov r1, r0 // b' = a
2630
2631 0:
2632 cmp r6, #0
2633 strne r0, [r5], #4
2634 subne r6, r6, #1
2635 mul r2, r0, r1 // c = a a_i
2636 rsbs r2, r2, #2 // c = 2 - a a_i
2637 mul r0, r0, r2 // a_{i+1} = a_i (2 - a a_i)
2638 // = 2 a_i - a a_i^2
2639 blo 0b
2640
2641 #elif defined(__aarch64__)
2642
2643 // x0 // a_0 = a
2644 mov x1, x0 // b' = a
2645 mov x16, #2 // because we have no rsb
2646
2647 0:
2648 cmp x6, #0
2649 b.eq 1f
2650 str x0, [x5], #8
2651 sub x6, x6, #1
2652 1:
2653 mul x2, x0, x1 // c = a a_i
2654 subs x2, x16, x2 // c = 2 - a a_i
2655 mul x0, x0, x2 // a_{i+1} = a_i (2 - a a_i)
2656 // = 2 a_i - a a_i^2
2657 b.lo 0b
2658
2659 #else
2660 notimpl
2661 #endif
2662
2663 ret
2664
2665 endproc
2666
2667 proc x25
2668
2669 // a poor approximation to pi/4
2670 //
2671 // think of x and y as being in 16.16 fixed-point format. we sample
2672 // points in the unit square, and determine how many of them are
2673 // within a unit quarter-circle centred at the origin. the area of
2674 // the quarter-circle is pi/4.
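	// the count being accumulated, as a c sketch:
	//
	//	uint64_t n = 0;
	//	for (uint64_t c = 0; c < (1ull << 32); c++) {
	//		uint64_t x = c & 0xffff, y = c >> 16;
	//		if (x*x + y*y < (1ull << 32)) n++;
	//	}
	//	/* n/2^32 is then an estimate of pi/4 */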
2675
2676 #if defined(__x86_64__)
2677
2678 xor eax, eax // a = 0
2679 mov rcx, 1
2680 shl rcx, 0x20 // c =~ 4 billion
2681
2682 0: movzx rbx, cx // x = low 16 bits of c
2683 imul rbx, rbx // b = x^2
2684
2685 ror rcx, 0x10 // switch halves of c
2686 movzx rdx, cx // y = high 16 bits of c
2687 imul rdx, rdx // d = y^2
2688 rol rcx, 0x10 // switch back
2689
2690 add rbx, rdx // r^2 = x^2 + y^2
2691 shr rbx, 0x20 // r^2 >= 1?
2692 cmp rbx, 1 // set cf iff r^2 >= 1
2693 adc rax, 0 // and add onto accumulator
2694 loop 0b
2695
2696 #elif defined(__i386__)
2697
2698 // this is actually better done in 32 bits. the carry has the wrong
2699 // sense here, so instead deduct one for each point outside the
2700 // quarter-circle rather than adding one for each point inside it.
2701 xor eax, eax
2702 xor ecx, ecx
2703
2704 0: movzx ebx, cx
2705 imul ebx, ebx
2706
2707 mov edx, ecx
2708 shr edx, 0x10
2709 imul edx, edx
2710
2711 add ebx, edx // see?
2712 sbb eax, 0
2713 loop 0b
2714
2715 #elif defined(__arm__)
2716
2717 mov r0, #0
2718 mov r2, #0
2719
2720 0: uxth r1, r2, ror #0
2721 uxth r3, r2, ror #16
2722 mul r1, r1, r1
2723 mul r3, r3, r3
2724 cmn r1, r3 // mlas doesn't set cf usefully
2725 addcc r0, r0, #1
2726 adds r2, r2, #1
2727 bne 0b
2728
2729 #elif defined(__aarch64__)
2730
2731 mov w0, #0
2732 mov w2, #0
2733
2734 0: ubfx w1, w2, #0, #16
2735 ubfx w3, w2, #16, #16
2736 sub w2, w2, #1
2737 mul w1, w1, w1
2738 mul w3, w3, w3
2739 cmn w1, w3
2740 cinc.cc w0, w0
2741 cbnz w2, 0b
2742
2743 #else
2744 notimpl
2745 #endif
2746
2747 ret
2748
2749 endproc
2750
2751 proc x26
2752
2753 // a bad way to rotate a right by 7 places
2754
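	// (in c terms, the target is just (a >> 7) | (a << 57) for 64-bit
	// a; the point is to manage without a rotate instruction.)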
2755 #if defined(__x86_64__)
2756
2757 mov rbx, rax
2758 ror rbx, 7 // better
2759
2760 mov rdx, rax // d' = a
2761 shr rax, 7 // a' = a >> 7
2762 shl rdx, 0x39 // d' = a << 57
2763 or rax, rdx // a' = a >>> 7
2764
2765 #elif defined(__i386__)
2766
2767 mov ebx, eax
2768 ror ebx, 7 // better
2769
2770 mov edx, eax // d' = a
2771 shr eax, 7 // a' = a >> 7
2772 shl edx, 0x19 // d' = a << 25
2773 or eax, edx // a' = a >>> 7
2774
2775 #elif defined(__arm__)
2776
2777 mov r1, r0, ror #7 // easy way
2778
2779 // even the hard way is fairly easy on arm
2780 mov r3, r0, lsl #25
2781 orr r0, r3, r0, lsr #7 // hard way
2782
2783 #elif defined(__aarch64__)
2784
2785 ror x1, x0, #7 // easy way
2786
2787 // even the hard way is fairly easy on arm
2788 lsl x3, x0, #57
2789 orr x0, x3, x0, lsr #7 // hard way
2790
2791 #else
2792 notimpl
2793 #endif
2794
2795 ret
2796
2797 endproc
2798
2799 proc x27
2800
2801 // shift a right by c places, in two halves
2802
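	// in c terms, the effect is a = (a >> c/2) >> (c + 1)/2.  a single
	// x86 shift reduces its count mod the operand width, so splitting
	// the count in two lets totals up to 2n - 2 (n the word size)
	// behave as expected.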
2803 #if defined(__x86_64__)
2804
2805 mov ch, cl // c' = [c, c]
2806 inc ch // c' = [c, c + 1]
2807 shr ch, 1
2808 shr cl, 1 // c' = [floor(c/2), ceil(c/2)]
2809 shr rax, cl
2810 xchg ch, cl
2811 shr rax, cl
2812
2813 #elif defined(__i386__)
2814
2815 mov ch, cl // c' = [c, c]
2816 inc ch // c' = [c, c + 1]
2817 shr ch, 1
2818 shr cl, 1 // c' = [floor(c/2), ceil(c/2)]
2819 shr eax, cl
2820 xchg ch, cl
2821 shr eax, cl
2822
2823 #elif defined(__arm__)
2824
2825 // it would be clearer and more efficient to say: `mov r12, r2, lsr
2826 // #1; sub r2, r2, r12', but that's not the lesson this exercise is
2827 // trying to teach.
2828 add r12, r2, #1
2829 mov r2, r2, lsr #1
2830 mov r12, r12, lsr #1
2831 mov r0, r0, lsr r2
2832 mov r0, r0, lsr r12
2833
2834 #elif defined(__aarch64__)
2835
2836 add w16, w2, #1
2837 lsr w2, w2, #1
2838 lsr w16, w16, #1
2839 lsr x0, x0, x2
2840 lsr x0, x0, x16
2841
2842 #else
2843 notimpl
2844 #endif
2845
2846 ret
2847
2848 endproc
2849
2850 proc x28
2851
2852 // divide c-byte big-endian bignum at rsi by 2 (rounding down)
2853
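	// a rough c sketch of the byte-at-a-time carry propagation, with
	// the most significant byte at the lowest address:
	//
	//	unsigned carry = 0;
	//	for (size_t i = 0; i < n; i++) {
	//		unsigned b = p[i];
	//		p[i] = (carry << 7) | (b >> 1);
	//		carry = b & 1;
	//	}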
2854 #if defined(__x86_64__)
2855
2856 clc
2857 0: rcr byte ptr [rsi], 1
2858 inc rsi
2859 loop 0b
2860
2861 #elif defined(__i386__)
2862
2863 clc
2864 0: rcr byte ptr [esi], 1
2865 inc esi
2866 loop 0b
2867
2868 #elif defined(__arm__)
2869
2870 // we could hack this a word at a time using rrx
2871 mov r3, #0
2872 0: ldrb r12, [r4]
2873 subs r2, r2, #1
2874 orr r3, r3, r12, lsr #1
2875 strb r3, [r4], #1
2876 mov r3, r12, lsl #7
2877 bne 0b
2878
2879 #elif defined(__aarch64__)
2880
2881 mov w16, #0
2882 0: ldrb w17, [x4]
2883 sub x2, x2, #1
2884 orr w16, w16, w17, lsr #1
2885 strb w16, [x4], #1
2886 lsl w16, w17, #7
2887 cbnz x2, 0b
2888
2889 #else
2890 notimpl
2891 #endif
2892
2893 ret
2894
2895 endproc
2896
2897 proc x29
2898
2899 // fill a buffer with a 3-byte pattern
2900
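	// in c terms this is the overlapping forward copy
	//
	//	for (size_t i = 0; i < n; i++) p[i + 3] = p[i];
	//
	// which replicates the three bytes at p through the rest of the
	// buffer; memmove would not.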
2901 #if defined(__x86_64__)
2902
2903 lea rdi, [rsi + 3]
2904 rep movsb
2905
2906 #elif defined(__i386__)
2907
2908 lea edi, [esi + 3]
2909 rep movsb
2910
2911 #elif defined(__arm__)
2912
2913 add r5, r4, #3
2914 0: subs r2, r2, #1
2915 ldrhsb r12, [r4], #1
2916 strhsb r12, [r5], #1
2917 bhs 0b
2918
2919 #elif defined(__aarch64__)
2920
2921 cbz x2, 9f
2922 add x5, x4, #3
2923 0: sub x2, x2, #1
2924 ldrb w16, [x4], #1
2925 strb w16, [x5], #1
2926 cbnz x2, 0b
2927 9:
2928
2929 #else
2930 notimpl
2931 #endif
2932
2933 ret
2934
2935 endproc
2936
2937 proc x2a
2938
2939 // rotate the words in a buffer, so that the last word comes first,
2940 // the first comes second, and so on. this isn't a good way to do
2941 // it.
2942
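	// a rough c sketch (mirroring the arm version below; n >= 1):
	//
	//	uint64_t prev = buf[n - 1];
	//	for (size_t i = 0; i < n; i++) {
	//		uint64_t t = buf[i];
	//		buf[i] = prev;
	//		prev = t;
	//	}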
2943 #if defined(__x86_64__)
2944
2945 mov rsi, rbx // set string pointers
2946 mov rdi, rbx
2947 0: lodsq // fetch next word
2948 xchg rax, qword ptr [rbx] // stash it for next iteration and
2949 // replace it with the previously
2950 // stashed word
2951 stosq // store in output
2952 // (note that the first iteration doesn't actually do anything)
2953 loop 0b // continue until all done
2954
2955 #elif defined(__i386__)
2956
2957 mov esi, ebx // set string pointers
2958 mov edi, ebx
2959 0: lodsd // fetch next word
2960 xchg eax, dword ptr [ebx] // stash it for next iteration and
2961 // replace it with the previously
2962 // stashed word
2963 stosd // store in output
2964 loop 0b // continue until all done
2965
2966 #elif defined(__arm__)
2967
2968 // let's do this a sensible way. (we could go faster using ldm/stm.)
2969 add r0, r1, r2, lsl #2 // find the end of the buffer
2970 ldr r0, [r0, #-4] // collect final element
2971 0: subs r2, r2, #1
2972 ldr r12, [r1]
2973 str r0, [r1], #4
2974 mov r0, r12
2975 bne 0b
2976
2977 #elif defined(__aarch64__)
2978
2979 add x0, x1, x2, lsl #3 // find the end of the buffer
2980 ldr x0, [x0, #-8] // collect final element
2981 0: sub x2, x2, #1
2982 ldr x16, [x1]
2983 str x0, [x1], #8
2984 mov x0, x16
2985 cbnz x2, 0b
2986
2987 #else
2988 notimpl
2989 #endif
2990
2991 ret
2992
2993 endproc
2994
2995 proc x2b
2996
2997 // find a cycle in a function f: B -> B, where B = {0, 1, ..., 255}
2998
2999 #if defined(__x86_64__)
3000
3001 // this is floyd's cycle-finding algorithm.
3002 //
3003 // consider the sequence s_0 = 0, s_1 = f(0), s_2 = f(f(0)), ...,
3004 // s_{i+1} = f(s_i). since B is finite, there must be some smallest
3005 // t and c such that s_t = s_{t+c}; then we have s_i = s_j iff
3006 // i >= t, j >= t, and i == j (mod c).
3007 //
3008 // the algorithm sets two cursors advancing through the sequence: a
3009 // /tortoise/ which advances one step at a time, and a /hare/ which
3010 // advances by two, so when the tortoise is at element s_i, the hare
3011 // is at s_{2i}. the hare will run around the cycle and catch the
3012 // tortoise when i >= t and i == 2 i (mod c); the latter is simply i
3013 // == 0 (mod c), which therefore happens first when i = k = t +
3014 // (-t mod c).
3015 //
3016 // i'm not sure what good xlatb does here that mov al, [rbx + rax]
3017 // (the high bits of rax stay zero here) doesn't.
3018
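	// the two loops, rendered roughly in c (f is the 256-byte table):
	//
	//	uint8_t t = 0, h = 0;
	//	do { t = f[t]; h = f[f[h]]; } while (t != h);
	//	t = 0;
	//	do { t = f[t]; h = f[h]; } while (t != h);
	//	/* t == h now lies on the cycle, at its entry in general */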
3019 xor eax, eax // tortoise starts at 0
3020 xor edx, edx // hare starts at 0
3021 0: xlatb // advance tortoise
3022 xchg rax, rdx // switch to hare
3023 xlatb // advance hare ...
3024 xlatb // ... twice
3025 xchg rax, rdx // switch back
3026 cmp al, dl // hare caught the tortoise?
3027 jnz 0b // no -- go around again
3028
3029 // now we trace the initial tail: reset the tortoise to s_0, and slow
3030 // the hare down so that both take only a single step in each
3031 // iteration. this loop terminates when i >= t and i == i + 2 k
3032 // (mod c). we know k is a multiple of c, so the latter condition
3033 // always holds, so this finds the first step of the cycle.
3034
3035 xor eax, eax // reset the tortoise
3036 0: xlatb // advance tortoise
3037 xchg rax, rdx // switch to hare
3038 xlatb // advance hare
3039 xchg rax, rdx // and switch back
3040 cmp al, dl // done?
3041 jnz 0b // no -- iterate
3042
3043 #elif defined(__i386__)
3044
3045 xor eax, eax // tortoise starts at 0
3046 xor edx, edx // hare starts at 0
3047 0: xlatb // advance tortoise
3048 xchg eax, edx // switch to hare
3049 xlatb // advance hare ...
3050 xlatb // ... twice
3051 xchg eax, edx // switch back
3052 cmp al, dl // hare caught the tortoise?
3053 jnz 0b // no -- go around again
3054
3055 xor eax, eax // reset the tortoise
3056 0: xlatb // advance tortoise
3057 xchg eax, edx // switch to hare
3058 xlatb // advance hare
3059 xchg eax, edx // and switch back
3060 cmp al, dl // done?
3061 jnz 0b // no -- iterate
3062
3063 #elif defined(__arm__)
3064
3065 mov r0, #0
3066 mov r3, #0
3067 0: ldrb r0, [r1, r0]
3068 ldrb r3, [r1, r3]
3069 ldrb r3, [r1, r3]
3070 cmp r0, r3
3071 bne 0b
3072
3073 mov r0, #0
3074 0: ldrb r0, [r1, r0]
3075 ldrb r3, [r1, r3]
3076 cmp r0, r3
3077 bne 0b
3078
3079 #elif defined(__aarch64__)
3080
3081 mov w0, #0
3082 mov w3, #0
3083 0: ldrb w0, [x1, x0]
3084 ldrb w3, [x1, x3]
3085 ldrb w3, [x1, x3]
3086 cmp w0, w3
3087 b.ne 0b
3088
3089 mov w0, #0
3090 0: ldrb w0, [x1, x0]
3091 ldrb w3, [x1, x3]
3092 cmp w0, w3
3093 b.ne 0b
3094
3095 #else
3096 notimpl
3097 #endif
3098
3099 ret
3100
3101 endproc
3102
3103 proc x2c
3104
3105 // a convoluted (and branch-free) way to set rax = rcx == rdx ? rdi : rsi
3106
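	// in c terms, roughly:
	//
	//	b[c] = 0; b[d] = 1;
	//	i = b[c];		/* 0, unless d == c overwrote it */
	//	b[0] = t; b[1] = u;
	//	a = b[i];		/* t if c != d, else u */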
3107 #if defined(__x86_64__)
3108
3109 mov qword ptr [rbx + 8*rcx], 0 // b[c] = 0
3110 mov qword ptr [rbx + 8*rdx], 1 // b[d] = 1
3111 mov rax, [rbx + 8*rcx] // a' = b[c] = 0
3112
3113 mov [rbx], rsi // b[0] = t
3114 mov [rbx + 8], rdi // b[1] = u
3115 mov rax, [rbx + 8*rax] // a' = b[a'] = b[0] = t
3116
3117 #elif defined(__i386__)
3118
3119 mov dword ptr [ebx + 8*ecx], 0 // b[c] = 0
3120 mov dword ptr [ebx + 8*edx], 1 // b[d] = 1
3121 mov eax, [ebx + 8*ecx] // a' = b[c] = 0
3122
3123 mov [ebx], esi // b[0] = t
3124 mov [ebx + 8], edi // b[1] = u
3125 mov eax, [ebx + 8*eax] // a' = b[a'] = b[0] = t
3126
3127 #elif defined(__arm__)
3128
3129 mov r0, #0
3130 mov r12, #1
3131
3132 str r0, [r1, r2, lsl #2]
3133 str r12, [r1, r3, lsl #2]
3134 ldr r0, [r1, r2, lsl #2]
3135
3136 str r4, [r1]
3137 str r5, [r1, #4]
3138 ldr r0, [r1, r0, lsl #2]
3139
3140 #elif defined(__aarch64__)
3141
3142 mov x16, #1
3143
3144 str xzr, [x1, x2, lsl #3]
3145 str x16, [x1, x3, lsl #3]
3146 ldr x0, [x1, x2, lsl #3]
3147
3148 str x4, [x1]
3149 str x5, [x1, #8]
3150 ldr x0, [x1, x0, lsl #3]
3151
3152 #else
3153 notimpl
3154 #endif
3155
3156 ret
3157
3158 endproc
3159
3160 proc x2d
3161
3162 // clear the least significant set bit in a, by calculating a' =
3163 // a AND (a - 1).
3164 //
3165 // if a = 0 then a' = 0. otherwise, a - 1 differs from a exactly in
3166 // the least significant /set/ bit of a, and all bits of lesser
3167 // significance. to put it another way: write a = u 2^{k+1} + 2^k;
3168 // then a - 1 = u 2^{k+1} + 2^{k-1} + ... + 2 + 1. taking the
3169 // bitwise AND of these leaves only the bits common to both, i.e.,
3170 // u 2^{k+1}.
3171
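	// for example, a = 1011'0100b gives a - 1 = 1011'0011b, and
	// a AND (a - 1) = 1011'0000b: the low set bit has gone.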
3172 #if defined(__x86_64__)
3173
3174 mov rdx, rax // d' = a
3175 dec rax // a' = a - 1
3176 and rax, rdx // a' = a AND (a - 1)
3177
3178 #elif defined(__i386__)
3179
3180 mov edx, eax // d' = a
3181 dec eax // a' = a - 1
3182 and eax, edx // a' = a AND (a - 1)
3183
3184 #elif defined(__arm__)
3185
3186 sub r3, r0, #1
3187 and r0, r0, r3
3188
3189 #elif defined(__aarch64__)
3190
3191 sub x3, x0, #1
3192 and x0, x0, x3
3193
3194 #else
3195 notimpl
3196 #endif
3197
3198 ret
3199
3200 endproc
3201
3202 proc x2e
3203
3204 // compute a mask of one bits in exactly the positions of the
3205 // low-order run of zero bits in a
3206
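	// for example, a = 1011'0000b gives a - 1 = 1010'1111b, so
	// a XOR (a - 1) = 0001'1111b, and shifting right once leaves
	// 0000'1111b: ones exactly where a's low-order zeros were.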
3207 #if defined(__x86_64__)
3208
3209 mov rdx, rax // d' = a
3210 dec rdx // d' = a - 1
3211 xor rax, rdx // a = a XOR (a - 1)
3212 // set bits are least significant
3213 // set bit of a, and all bits of
3214 // lesser significance
3215 shr rax, 1 // now only bits of lesser
3216 // significance; a' = 0 iff a odd
3217 cmp rax, rdx // equal if a = 0 or 2^k; otherwise
3218 // strictly less
3219
3220 #elif defined(__i386__)
3221
3222 mov edx, eax
3223 dec edx
3224 xor eax, edx
3225 shr eax, 1
3226 cmp eax, edx
3227
3228 #elif defined(__arm__)
3229
3230 sub r3, r0, #1
3231 eor r0, r0, r3
3232 mov r0, r0, lsr #1 // probably fold shift into next inst
3233 cmp r0, r3
3234
3235 #elif defined(__aarch64__)
3236
3237 sub x3, x0, #1
3238 eor x0, x0, x3
3239 lsr x0, x0, #1 // probably fold shift into next inst
3240 cmp x0, x3
3241
3242 #else
3243 notimpl
3244 #endif
3245
3246 ret
3247
3248 endproc
3249
3250 proc x2f
3251
3252 // a slow population count
3253
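	// the exercise's loop is, in c, simply
	//
	//	for (n = 0; x; x &= x - 1) n++;
	//
	// and the branch-free version below is a rearrangement of the
	// usual swar count (not the exact sequence used here), e.g.
	//
	//	x = (x & 0x5555555555555555) + ((x >> 1) & 0x5555555555555555);
	//	x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333);
	//	x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f;
	//	n = (x*0x0101010101010101) >> 56;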
3254 #if defined(__x86_64__)
3255
3256 popcnt rbx, rcx // the easy way
3257
3258 // a fast version in software
3259 mov rax, rcx
3260
3261 mov rdx, rcx
3262 shr rdx, 1
3263 mov rsi, 0x5555555555555555
3264 and rax, rsi
3265 and rdx, rsi
3266 add rax, rdx
3267
3268 mov rdx, rax
3269 shr rdx, 2
3270 mov rsi, 0x3333333333333333
3271 and rax, rsi
3272 and rdx, rsi
3273 add rax, rdx
3274
3275 mov rdx, rax
3276 shr rdx, 32
3277 add rax, rdx
3278
3279 mov rdx, rax
3280 shr rdx, 4
3281 and rax, 0x0f0f0f0f
3282 and rdx, 0x0f0f0f0f
3283 add rax, rdx
3284
3285 mov rdx, rax
3286 shr rdx, 8
3287 add rax, rdx
3288
3289 mov rdx, rax
3290 shr rdx, 16
3291 add rax, rdx
3292 movzx rsi, al
3293
3294 // the official version
3295 xor eax, eax // clear iteration counter
3296 0: jrcxz 9f // bail if c = 0
3297 inc rax // bump iteration count
3298 mov rdx, rcx // d' = c
3299 dec rdx // d' = c - 1
3300 and rcx, rdx // zap least significant set bit of c
3301 jmp 0b // and go again
3302 9:
3303
3304 #elif defined(__i386__)
3305
3306 popcnt ebx, ecx // the easy way
3307
3308 mov eax, ecx
3309
3310 mov edx, ecx
3311 shr edx, 1
3312 and eax, 0x55555555
3313 and edx, 0x55555555
3314 add eax, edx
3315
3316 mov edx, eax
3317 shr edx, 2
3318 and eax, 0x33333333
3319 and edx, 0x33333333
3320 add eax, edx
3321
3322 mov edx, eax
3323 shr edx, 4
3324 add eax, edx
3325
3326 mov edx, eax
3327 shr edx, 8
3328 and eax, 0x000f000f
3329 and edx, 0x000f000f
3330 add eax, edx
3331
3332 mov edx, eax
3333 shr edx, 16
3334 add eax, edx
3335 movzx esi, al
3336
3337 xor eax, eax
3338 0: jecxz 9f
3339 inc eax
3340 mov edx, ecx
3341 dec edx
3342 and ecx, edx
3343 jmp 0b
3344 9:
3345
3346 #elif defined(__arm__)
3347
3348 // the easy-ish way
3349 vmov d0[0], r2
3350 vcnt.8 d0, d0
3351 vmov r1, d0[0]
3352 add r1, r1, r1, lsl #8
3353 add r1, r1, r1, lsl #16
3354 mov r1, r1, lsr #24
3355
3356 // the hard way
3357 movw r12, #0x5555
3358 movt r12, #0x5555
3359 and r3, r12, r2, lsr #1
3360 and r0, r12, r2
3361 add r0, r0, r3
3362
3363 movw r12, #0x3333
3364 movt r12, #0x3333
3365 and r3, r12, r0, lsr #2
3366 and r0, r12, r0
3367 add r0, r0, r3
3368
3369 add r0, r0, r0, lsl #16
3370
3371 movt r12, #0x0f0f
3372 and r3, r12, r0, lsr #4
3373 and r0, r12, r0
3374 add r0, r0, r3
3375
3376 add r0, r0, r0, lsl #8
3377
3378 mov r4, r0, lsr #24
3379
3380 // and following the exercise
3381 mov r0, #0
3382 cmp r2, #0
3383 beq 9f
3384 0: add r0, r0, #1
3385 sub r3, r2, #1
3386 ands r2, r2, r3
3387 bne 0b
3388 9:
3389
3390 #elif defined(__aarch64__)
3391
3392 // the easy-ish way
3393 mov v0.d[0], x2
3394 cnt v0.8b, v0.8b
3395 mov x1, v0.d[0]
3396 add x1, x1, x1, lsl #8
3397 add x1, x1, x1, lsl #16
3398 add x1, x1, x1, lsl #32
3399 lsr x1, x1, #56
3400
3401 // the hard way -- though arm64's immediate constant encodings and
3402 // shifting make this actually rather pleasant.
3403 and x3, x2, #0xaaaaaaaaaaaaaaaa
3404 and x0, x2, #0x5555555555555555
3405 add x0, x0, x3, lsr #1
3406
3407 and x3, x0, #0xcccccccccccccccc
3408 and x0, x0, #0x3333333333333333
3409 add x0, x0, x3, lsr #2
3410
3411 add x0, x0, x0, lsr #4
3412
3413 and x3, x0, #0x0f000f000f000f00
3414 and x0, x0, #0x000f000f000f000f
3415 add x0, x3, x0, lsl #8
3416
3417 add x0, x0, x0, lsl #16
3418 add x0, x0, x0, lsl #32
3419 lsr x4, x0, #56
3420
3421 // and the official way
3422 mov x0, #0
3423 cbz x2, 9f
3424 0: add x0, x0, #1
3425 sub x3, x2, #1
3426 and x2, x2, x3
3427 cbnz x2, 0b
3428 9:
3429
3430 #else
3431 notimpl
3432 #endif
3433
3434 ret
3435
3436 endproc
3437
3438 ///--------------------------------------------------------------------------
3439 /// 0x30--0x3f
3440
3441 proc x30
3442
3443 #if defined(__x86_64__)
3444
3445 notimpl
3446
3447 #elif defined(__i386__)
3448
3449 notimpl
3450
3451 #elif defined(__arm__)
3452
3453 notimpl
3454
3455 #elif defined(__aarch64__)
3456
3457 notimpl
3458
3459 #else
3460 notimpl
3461 #endif
3462
3463 ret
3464
3465 endproc
3466
3467 proc x31
3468
3469 #if defined(__x86_64__)
3470
3471 notimpl
3472
3473 #elif defined(__i386__)
3474
3475 notimpl
3476
3477 #elif defined(__arm__)
3478
3479 notimpl
3480
3481 #elif defined(__aarch64__)
3482
3483 notimpl
3484
3485 #else
3486 notimpl
3487 #endif
3488
3489 endproc
3490
3491 proc x32
3492
3493 #if defined(__x86_64__)
3494
3495 notimpl
3496
3497 #elif defined(__i386__)
3498
3499 notimpl
3500
3501 #elif defined(__arm__)
3502
3503 notimpl
3504
3505 #elif defined(__aarch64__)
3506
3507 notimpl
3508
3509 #else
3510 notimpl
3511 #endif
3512
3513 endproc
3514
3515 proc x33
3516
3517 #if defined(__x86_64__)
3518
3519 notimpl
3520
3521 #elif defined(__i386__)
3522
3523 notimpl
3524
3525 #elif defined(__arm__)
3526
3527 notimpl
3528
3529 #elif defined(__aarch64__)
3530
3531 notimpl
3532
3533 #else
3534 notimpl
3535 #endif
3536
3537 endproc
3538
3539 proc x34
3540
3541 #if defined(__x86_64__)
3542
3543 notimpl
3544
3545 #elif defined(__i386__)
3546
3547 notimpl
3548
3549 #elif defined(__arm__)
3550
3551 notimpl
3552
3553 #elif defined(__aarch64__)
3554
3555 notimpl
3556
3557 #else
3558 notimpl
3559 #endif
3560
3561 endproc
3562
3563 proc x35
3564
3565 #if defined(__x86_64__)
3566
3567 notimpl
3568
3569 #elif defined(__i386__)
3570
3571 notimpl
3572
3573 #elif defined(__arm__)
3574
3575 notimpl
3576
3577 #elif defined(__aarch64__)
3578
3579 notimpl
3580
3581 #else
3582 notimpl
3583 #endif
3584
3585 endproc
3586
3587 proc x36
3588
3589 #if defined(__x86_64__)
3590
3591 notimpl
3592
3593 #elif defined(__i386__)
3594
3595 notimpl
3596
3597 #elif defined(__arm__)
3598
3599 notimpl
3600
3601 #elif defined(__aarch64__)
3602
3603 notimpl
3604
3605 #else
3606 notimpl
3607 #endif
3608
3609 endproc
3610
3611 proc x37
3612
3613 #if defined(__x86_64__)
3614
3615 notimpl
3616
3617 #elif defined(__i386__)
3618
3619 notimpl
3620
3621 #elif defined(__arm__)
3622
3623 notimpl
3624
3625 #elif defined(__aarch64__)
3626
3627 notimpl
3628
3629 #else
3630 notimpl
3631 #endif
3632
3633 endproc
3634
3635 proc x38
3636
3637 #if defined(__x86_64__)
3638
3639 notimpl
3640
3641 #elif defined(__i386__)
3642
3643 notimpl
3644
3645 #elif defined(__arm__)
3646
3647 notimpl
3648
3649 #elif defined(__aarch64__)
3650
3651 notimpl
3652
3653 #else
3654 notimpl
3655 #endif
3656
3657 endproc
3658
3659 proc x39
3660
3661 #if defined(__x86_64__)
3662
3663 notimpl
3664
3665 #elif defined(__i386__)
3666
3667 notimpl
3668
3669 #elif defined(__arm__)
3670
3671 notimpl
3672
3673 #elif defined(__aarch64__)
3674
3675 notimpl
3676
3677 #else
3678 notimpl
3679 #endif
3680
3681 endproc
3682
3683 proc x3a
3684
3685 #if defined(__x86_64__)
3686
3687 notimpl
3688
3689 #elif defined(__i386__)
3690
3691 notimpl
3692
3693 #elif defined(__arm__)
3694
3695 notimpl
3696
3697 #elif defined(__aarch64__)
3698
3699 notimpl
3700
3701 #else
3702 notimpl
3703 #endif
3704
3705 endproc
3706
3707 proc x3b
3708
3709 #if defined(__x86_64__)
3710
3711 notimpl
3712
3713 #elif defined(__i386__)
3714
3715 notimpl
3716
3717 #elif defined(__arm__)
3718
3719 notimpl
3720
3721 #elif defined(__aarch64__)
3722
3723 notimpl
3724
3725 #else
3726 notimpl
3727 #endif
3728
3729 endproc
3730
3731 proc x3c
3732
3733 #if defined(__x86_64__)
3734
3735 notimpl
3736
3737 #elif defined(__i386__)
3738
3739 notimpl
3740
3741 #elif defined(__arm__)
3742
3743 notimpl
3744
3745 #elif defined(__aarch64__)
3746
3747 notimpl
3748
3749 #else
3750 notimpl
3751 #endif
3752
3753 endproc
3754
3755 proc x3d
3756
3757 #if defined(__x86_64__)
3758
3759 notimpl
3760
3761 #elif defined(__i386__)
3762
3763 notimpl
3764
3765 #elif defined(__arm__)
3766
3767 notimpl
3768
3769 #elif defined(__aarch64__)
3770
3771 notimpl
3772
3773 #else
3774 notimpl
3775 #endif
3776
3777 endproc
3778
3779 proc x3e
3780
3781 #if defined(__x86_64__)
3782
3783 notimpl
3784
3785 #elif defined(__i386__)
3786
3787 notimpl
3788
3789 #elif defined(__arm__)
3790
3791 notimpl
3792
3793 #elif defined(__aarch64__)
3794
3795 notimpl
3796
3797 #else
3798 notimpl
3799 #endif
3800
3801 endproc
3802
3803 proc x3f
3804
3805 #if defined(__x86_64__)
3806
3807 notimpl
3808
3809 #elif defined(__i386__)
3810
3811 notimpl
3812
3813 #elif defined(__arm__)
3814
3815 notimpl
3816
3817 #elif defined(__aarch64__)
3818
3819 notimpl
3820
3821 #else
3822 notimpl
3823 #endif
3824
3825 endproc
3826
3827 ///----- That's all, folks --------------------------------------------------