1/// -*- mode: asm; asm-comment-char: 0 -*-
2
3///--------------------------------------------------------------------------
4/// Preliminaries.
5
6#include <sys/syscall.h>
7
8#if defined(__i386__) || defined(__x86_64__)
9
10 .intel_syntax noprefix
11
12#elif defined(__arm__)
13
14.macro ret
15 bx r14
16.endm
17
18 .arch armv7-a
19
20#elif defined(__aarch64__)
21
22.macro cmov rd, rn, cc
23 csel \rd, \rn, \rd, \cc
24.endm
25#define _COND(_) \
26 _(eq) _(ne) _(cs) _(cc) _(vs) _(vc) _(mi) _(pl) \
27 _(ge) _(lt) _(gt) _(le) _(hi) _(ls) _(al) _(nv) \
28 _(hs) _(lo)
29#define _INST(_) \
30 _(ccmp) _(ccmn) \
31 _(csel) _(cmov) \
32 _(csinc) _(cinc) _(cset) \
33 _(csneg) _(cneg) \
34 _(csinv) _(cinv) _(csetm)
35#define _CONDVAR(cc) _definstvar cc;
36#define _INSTVARS(inst) \
37 .macro _definstvar cc; \
38 .macro inst.\cc args:vararg; inst \args, \cc; .endm; \
39 .endm; \
40 _COND(_CONDVAR); \
41 .purgem _definstvar;
42 _INST(_INSTVARS)
43#undef _COND
44#undef _INST
45#undef _CONDVAR
46#undef _INSTVARS
47
48#define CCMP_N 8
49#define CCMP_Z 4
50#define CCMP_C 2
51#define CCMP_V 1
52
53#define CCMP_MI CCMP_N
54#define CCMP_PL 0
55#define CCMP_EQ CCMP_Z
56#define CCMP_NE 0
57#define CCMP_CS CCMP_C
58#define CCMP_HS CCMP_C
59#define CCMP_CC 0
60#define CCMP_LO 0
61#define CCMP_VS CCMP_V
62#define CCMP_VC 0
63#define CCMP_HI CCMP_C
64#define CCMP_LS 0
65#define CCMP_LT CCMP_N
66#define CCMP_GE 0
67#define CCMP_LE CCMP_N
68#define CCMP_GT 0
69
70#else
71# error "not supported"
72#endif
73
74.macro proc name
75 .globl \name
76 .type \name, STT_FUNC
77 .p2align 4
78\name\():
79 .macro endproc
80 .size \name, . - \name
81 .purgem endproc
82 .endm
83.endm
84
85.macro ch c
86#if defined(__i386__)
87
88 pushf
89 push eax
90 push ebx
91 push ecx
92 push edx
93 push ebp
94 mov ebp, esp
95 and esp, -16
96
97 push \c
98 call putchar@plt
99
100 call get_pc_ebx
101	add	ebx, offset _GLOBAL_OFFSET_TABLE_
102 mov eax, [ebx + stdout@GOT]
103 mov eax, [eax]
104 call fflush@plt
105
106 mov esp, ebp
107 pop ebp
108 pop edx
109 pop ecx
110 pop ebx
111 pop eax
112 popf
113
114#elif defined(__x86_64__)
115
116 pushf
117 push rax
118 push rcx
119 push rdx
120 push rsi
121 push rdi
122 push r8
123 push r9
124 push rbp
125 mov rbp, rsp
126 and rsp, -16
127
128 mov rdi, \c
129 call putchar@plt
130
131 mov rdi, [rip + stdout]
132 call fflush@plt
133
134 mov rsp, rbp
135 pop rbp
136 pop r9
137 pop r8
138 pop rdi
139 pop rsi
140 pop rdx
141 pop rcx
142 pop rax
143 popf
144
145#elif defined(__arm__)
146
147 stmfd r13!, {r0-r4, r12, r14}
148
149 mov r4, r13
150 bic r14, r4, #15
151 mov r13, r14
152
153 mov r0, #\c
154 bl putchar@plt
155
156 ldr r14, .L$_c$gotoff$\@
157.L$_c$gotpc$\@:
158 add r14, pc, r14
159 b .L$_c$cont$\@
160.L$_c$gotoff$\@:
161	.word	_GLOBAL_OFFSET_TABLE_ - .L$_c$gotpc$\@ - 8
162.L$_c$cont$\@:
163 bl fflush@plt
164
165 mov r13, r4
166 ldmfd r13!, {r0-r4, r12, r14}
167
168#elif defined(__aarch64__)
169
170 sub sp, sp, #20*8
171 stp x0, x1, [sp, #0]
172 stp x2, x3, [sp, #16]
173 stp x4, x5, [sp, #32]
174 stp x6, x7, [sp, #48]
175 stp x8, x9, [sp, #64]
176 stp x10, x11, [sp, #80]
177 stp x12, x13, [sp, #96]
178 stp x14, x15, [sp, #112]
179 stp x16, x17, [sp, #128]
180 mrs x16, nzcv
181 stp x16, x30, [sp, #144]
182
183 mov w0, #\c
184 bl putchar
185 adrp x0, :got:stdout
186 ldr x0, [x0, #:got_lo12:stdout]
187 ldr x0, [x0]
188 bl fflush
189
190 ldp x16, x30, [sp, #144]
191 msr nzcv, x16
192 ldp x16, x17, [sp, #128]
193 ldp x14, x15, [sp, #112]
194 ldp x12, x13, [sp, #96]
195 ldp x10, x11, [sp, #80]
196 ldp x8, x9, [sp, #64]
197 ldp x6, x7, [sp, #48]
198 ldp x4, x5, [sp, #32]
199 ldp x2, x3, [sp, #16]
200 ldp x0, x1, [sp, #0]
201 add sp, sp, #20*8
202
203#else
204# error "not supported"
205#endif
206.endm
207
208.macro notimpl
209#if defined(__i386__) || defined(__x86_64__)
210 ud2
211#elif defined(__arm__)
212 udf
213#elif defined(__aarch64__)
214 hlt #0
215#else
216# error "not supported"
217#endif
218.endm
219
220 .section .note.GNU-stack, "", %progbits
221
222 .text
223
224#if defined(__i386__)
225get_pc_ebx:
226 mov ebx, [esp]
227 ret
228#endif
229
230
231proc call_example
232
233#if defined(__i386__)
234
235 push ebx // ebx
236 push esi // esi, ebx
237 push edi // edi, esi, ebx
238 push ebp // flags, ebp, ..., ebx
239 pushf
240
241 mov edi, [esp + 4*6]
242 mov esi, [esp + 4*7]
243 push esi // regs, flags, ebp, ..., ebx
244
245 call get_pc_ebx
246 lea eax, [ebx + 9f - .]
247 push eax // cont, regs, flags, ebp, ..., ebx
248 push edi // func, cont, regs, flags, ebp, ..., ebx
249
250 mov eax, [esi + 28]
251 pushf
252 pop ecx
253 and eax, 0x0cd5
254 and ecx, ~0x0cd5
255 or eax, ecx
256 push eax
257 popf
258 mov eax, [esi + 0]
259 mov ebx, [esi + 4]
260 mov ecx, [esi + 8]
261 mov edx, [esi + 12]
262 mov edi, [esi + 20]
263 mov ebp, [esi + 24]
264 mov esi, [esi + 16]
265
266 ret // -> func; regs, flags, ebp, ..., ebx
267
2689: pushf // eflags, regs, flags, ebp, ..., ebx
269 push esi // esi, eflags, regs, flags, ebp, ..., ebx
270 mov esi, [esp + 8]
271 mov [esi + 0], eax
272 mov [esi + 4], ebx
273 mov [esi + 8], ecx
274 mov [esi + 12], edx
275 mov [esi + 20], edi
276 mov [esi + 24], ebp
277 pop eax // rflags, regs, flags, ebp, ..., ebx
278 mov [esi + 16], eax
279 pop eax // regs, flags, ebp, ..., ebx
280 mov [esi + 28], eax
281
282 add esp, 4 // flags, ebp, ..., ebx
283 popf // ebp, ..., ebx
284 pop ebp // ..., ebx
285 pop edi
286 pop esi
287 pop ebx //
288 ret
289
290#elif defined(__x86_64__)
291
292 push rbx // rbx
293 push r10
294 push r11
295 push r12
296 push r13
297 push r14
298 push r15
299 push rbp // flags, rbp, ..., rbx
300 pushf
301
302 push rsi // regs, flags, rbp, ..., rbx
303
304 lea rax, [rip + 9f]
305 push rax // cont, regs, flags, rbp, ..., rbx
306 push rdi // func, cont, regs, flags, rbp, ..., rbx
307
308	mov	rax, [rsi + 8*15]
309 pushf
310 pop rcx
311 and rax, 0x0cd5
312 and rcx, ~0x0cd5
313 or rax, rcx
314 push rax
315 popf
316 mov rax, [rsi + 0]
317 mov rbx, [rsi + 8]
318 mov rcx, [rsi + 16]
319 mov rdx, [rsi + 24]
320 mov rdi, [rsi + 40]
321 mov rbp, [rsi + 48]
322 mov r8, [rsi + 56]
323 mov r9, [rsi + 64]
324 mov r10, [rsi + 72]
325 mov r11, [rsi + 80]
326 mov r12, [rsi + 88]
327 mov r13, [rsi + 96]
328 mov r14, [rsi + 104]
329 mov r15, [rsi + 112]
330 mov rsi, [rsi + 32]
331
332 ret // -> func; regs, flags, rbp, ..., rbx
333
3349: pushf // rflags, regs, flags, rbp, ..., rbx
335 push rsi // rsi, rflags, regs, flags, rbp, ..., rbx
336 mov rsi, [rsp + 16]
337 mov [rsi + 0], rax
338 mov [rsi + 8], rbx
339 mov [rsi + 16], rcx
340 mov [rsi + 24], rdx
341 mov [rsi + 40], rdi
342 mov [rsi + 48], rbp
343 mov [rsi + 56], r8
344 mov [rsi + 64], r9
345 mov [rsi + 72], r10
346 mov [rsi + 80], r11
347 mov [rsi + 88], r12
348 mov [rsi + 96], r13
349 mov [rsi + 104], r14
350 mov [rsi + 112], r15
351	pop	rax			// rflags, regs, flags, rbp, ..., rbx
352	mov	[rsi + 32], rax
353	pop	rax			// regs, flags, rbp, ..., rbx
354	mov	[rsi + 120], rax
355
356 add rsp, 8 // flags, rbp, ..., rbx
357 popf // rbp, ..., rbx
358 pop rbp // ..., rbx
359 pop r15
360 pop r14
361 pop r13
362 pop r12
363 pop r11
364 pop r10
365 pop rbx //
366 ret
367
368#elif defined(__arm__)
369
370 stmfd r13!, {r0, r1, r4-r11, r14}
371 ldmia r1, {r0-r12, r14}
372 msr cpsr, r14
373 mov r14, pc
374 ldr pc, [r13], #4
375 ldr r14, [r13], #4
376 stmia r14!, {r0-r12}
377 mrs r0, cpsr
378 str r0, [r14]
379 ldmfd r13!, {r4-r11, pc}
380
381#elif defined(__aarch64__)
382
383 stp x29, x30, [sp, #-13*8]!
384 mov x29, sp
385 stp x19, x20, [sp, #16]
386 stp x21, x22, [sp, #32]
387 stp x23, x24, [sp, #48]
388 stp x25, x26, [sp, #64]
389 stp x27, x28, [sp, #80]
390 str x1, [sp, #96]
391
392 mov x16, x0
393
394 ldr x17, [x1, #128]
395 ldp x14, x15, [x1, #112]
396 ldp x12, x13, [x1, #96]
397 ldp x10, x11, [x1, #80]
398 ldp x8, x9, [x1, #64]
399 ldp x6, x7, [x1, #48]
400 ldp x4, x5, [x1, #32]
401 ldp x2, x3, [x1, #16]
402 ldp x0, x1, [x1, #0]
403 msr nzcv, x17
404
405 blr x16
406
407 ldr x16, [sp, #96]
408 mrs x17, nzcv
409 str x17, [x16, #128]
410 stp x14, x15, [x16, #112]
411 stp x12, x13, [x16, #96]
412 stp x10, x11, [x16, #80]
413 stp x8, x9, [x16, #64]
414 stp x6, x7, [x16, #48]
415 stp x4, x5, [x16, #32]
416 stp x2, x3, [x16, #16]
417 stp x0, x1, [x16, #0]
418
419 ldp x19, x20, [sp, #16]
420 ldp x21, x22, [sp, #32]
421 ldp x23, x24, [sp, #48]
422 ldp x25, x26, [sp, #64]
423 ldp x27, x28, [sp, #80]
424 ldp x29, x30, [sp], #13*8
425
426#else
427# error "not supported"
428#endif
429
430endproc
431
432proc nop
433
434 ret
435
436endproc
437
438///--------------------------------------------------------------------------
439/// 0x00--0x0f
440
441proc x00
442
443 // clear all 64 bits of extended traditional registers
444
445#if defined(__x86_64__)
446
447 xor eax, eax // clear rax
448 lea rbx, [0] // rbx -> _|_
449	loop	.			// iterate, decrement rcx until zero
450 mov rdx, 0 // set rdx = 0
451 and esi, 0 // clear all bits of rsi
452 sub edi, edi // set rdi = edi - edi = 0
453 push 0
454 pop rbp // pop 0 into rbp
455
456#elif defined(__i386__)
457
458 xor eax, eax
459 lea ebx, [0]
460 loop .
461 mov edx, 0
462 and esi, 0
463 sub edi, edi
464 push 0
465 pop ebp
466
467#elif defined(__arm__)
468
469 eor r0, r0, r0
470 rsb r1, r1, r1
4710: subs r2, r2, #1
472 bne 0b
473 mov r3, #0
474 and r4, r4, #0
475 sub r5, r5, r5
476
477#elif defined(__aarch64__)
478
479 eor w0, w0, w0
480 mov w1, wzr
4810: sub w2, w2, #1
482 cbnz w2, 0b
483 mov w3, #0
484 and w4, w4, wzr
485 sub w5, w5, w5
486
487#else
488 notimpl
489#endif
490
491 ret
492
493endproc
494
495proc x01
496
497 // advance a fibonacci pair by c steps
498 //
499 // on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1}
500 // and f_{i+c}, where f_{i+1} = f_i + f_{i-1}
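	//
	// (a worked example, assuming the x86 entry convention with c as
	// the step count: starting from a = 1, d = 0 -- that is, f_1 and
	// f_0 -- with c = 3, successive xadds give (1, 1), (2, 1), (3, 2),
	// leaving a = f_4 = 3 and d = f_3 = 2.)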
501
502#if defined(__x86_64__)
503
5040: xadd rax, rdx // a, d = a + d, a
505 // = f_{i+1} + f_i, f_{i+1}
506 // = f_{i+2}, f_{i+1}
507 loop 0b // advance i, decrement c, iterate
508
509#elif defined(__i386__)
510
5110: xadd eax, edx
512 loop 0b
513
514#elif defined(__arm__)
515
5160: subs r2, r2, #2
517 add r3, r3, r0
518 blo 8f
519 add r0, r0, r3
520 bhi 0b
521
5228: movne r0, r3
523
524#elif defined(__aarch64__)
525
5260: subs x2, x2, #2
527 add x3, x3, x0
528 b.lo 8f
529 add x0, x0, x3
530 b.hi 0b
531
5328: cmov.ne x0, x3
533
534#else
535 notimpl
536#endif
537
538 ret
539
540endproc
541
542proc x02
543
544 // boolean canonify a: if a = 0 on entry, leave it zero; otherwise
545 // set a = 1
546
547#if defined(__x86_64__)
548
549 neg rax // set cf iff a /= 0
550 sbb rax, rax // a = a - a - cf = -cf
551 neg rax // a = cf
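	// (for instance, with a = 5 on entry: the first neg leaves a = -5
	// and sets cf; the sbb computes a - a - cf = -1; the final neg
	// gives 1.  with a = 0, cf stays clear and the result is 0.)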
552
553#elif defined(__i386__)
554
555 neg eax
556 sbb eax, eax
557 neg eax
558
559#elif defined(__arm__)
560
561 movs r1, r0 // the easy way
562 movne r1, #1 // mvnne r1, #1 for mask
563
564 cmp r0, #1 // clear cf iff a == 0
565 sbc r2, r0, r0 // c' = a - a - 1 + cf = cf - 1
566 add r2, r2, #1 // c' = cf
567
568 sub r3, r0, r0, lsr #1 // d' top bit clear; d' = 0 iff a = 0
569 rsb r3, r3, #0 // d' top bit set iff a /= 0
570 mov r3, r3, lsr #31 // asr for mask
571
572 rsbs r0, r0, #0
573 sbc r0, r0, r0
574 rsb r0, r0, #0
575
576#elif defined(__aarch64__)
577
578 cmp x0, #0 // trivial
579 cset.ne x1 // csetm for mask
580
581 cmp xzr, x0 // set cf iff a == 0
582 sbc x2, x0, x0 // c' = a - a - 1 + cf = cf - 1
583 neg x2, x2 // c' = 1 - cf
584
585 sub x3, x0, x0, lsr #1 // if a < 2^63 then a' = ceil(d/2) <
586 // 2^63
587 // if a >= 2^63, write a = 2^63 + t
588 // with t < 2^63; d' = 2^63 - 2^62 +
589 // ceil(t/2) = 2^62 + ceil(t/2), and
590 // ceil(t/2) < 2^62
591 // anyway d' < 2^63 and d' = 0 iff
592 // a = 0
593 neg x3, x3 // d' top bit set iff a /= 0
594 lsr x3, x3, #63 // asr for mask
595
596 cmp x0, #1 // set cf iff a /= 0
597 adc x0, xzr, xzr // a' = 0 + 0 + cf = cf
598
599#else
600 notimpl
601#endif
602
603 ret
604
605endproc
606
607proc x03
608
609 // set a = min(a, d) (unsigned); clobber c, d
610
611#if defined(__x86_64__)
612
613 sub rdx, rax // d' = d - a; set cf if a > d
614 sbb rcx, rcx // c = -cf = -[a > d]
615 and rcx, rdx // c = a > d ? d - a : 0
616 add rax, rcx // a' = a > d ? d : a
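	// (e.g., a = 7, d = 3: d' = -4 with cf set, so c = -1 and
	// c AND d' = -4, giving a' = 7 - 4 = 3; with a = 3, d = 7: d' = 4,
	// cf clear, c = 0, and a is left alone.)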
617
618#elif defined(__i386__)
619
620 sub edx, eax
621 sbb ecx, ecx
622 and ecx, edx
623 add eax, ecx
624
625#elif defined(__arm__)
626
627 cmp r0, r3 // the easy way
628 movlo r1, r0 // only needed for out-of-place
629 movhs r1, r3
630
631 subs r3, r3, r0
632 sbc r12, r12, r12
633 and r12, r12, r3
634 add r0, r0, r12
635
636#elif defined(__aarch64__)
637
638 cmp x0, x3 // the easy way
639 csel.lo x1, x0, x3
640
641 subs x3, x3, x0 // d' = d - a; set cf if d >= a
642 sbc x16, xzr, xzr // t = -1 + cf = -[a > d]
643 and x16, x16, x3 // t = a > d ? d - a : 0
644 add x0, x0, x16 // a' = a > d ? d : a
645
646#else
647 notimpl
648#endif
649
650 ret
651
652endproc
653
654proc x04
655
656 // switch case?
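	//
	// (the `unrelated playing' below computes b = the value of a as a
	// hex digit -- `0'--`9', `a'--`f', or `A'--`F' -- or -1 if a isn't
	// one; the final xor toggles the case of an ascii letter in a.)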
657
658#if defined(__x86_64__)
659
660 // unrelated playing
661 mov ecx, eax
662 mov rbx, -1
663 mov edx, ecx
664 sub edx, '0'
665 cmp edx, 10
666 cmovb rbx, rdx
667 or ecx, 0x20
668 mov edx, ecx
669 sub edx, 'a'
670 sub ecx, 'a' - 10
671 cmp edx, 6
672 cmovb rbx, rcx
673
674 xor al, 0x20
675
676#elif defined(__i386__)
677
678 // unrelated playing
679 mov ecx, eax
680 mov ebx, -1
681 mov edx, ecx
682 sub edx, '0'
683 cmp edx, 10
684 cmovb ebx, edx
685 or ecx, 0x20
686 mov edx, ecx
687 sub edx, 'a'
688 sub ecx, 'a' - 10
689 cmp edx, 6
690 cmovb ebx, ecx
691
692 xor al, 0x20
693
694#elif defined(__arm__)
695
696 // unrelated playing
697 mvn r1, #0
698 sub r12, r0, #'0'
699 cmp r12, #10
700 movlo r1, r12
701 orr r12, r0, #0x20
702 sub r12, r12, #'a'
703 cmp r12, #6
704 addlo r1, r12, #10
705
706 eor r0, r0, #0x20
707
708#elif defined(__aarch64__)
709
710 // unrelated playing
711 mov x1, #-1
712 sub w16, w0, #'0'
713 cmp w16, #10
714 cmov.lo x1, x16
715 orr w16, w0, #0x20
716 sub w16, w16, #'a' - 10
717 cmp w16, #10
718 ccmp.hs w16, #16, #CCMP_HS
719 cmov.lo x1, x16
720
721 eor w0, w0, #0x20
722
723#else
724 notimpl
725#endif
726
727 ret
728
729endproc
730
731proc x05
732
733 // answer whether 5 <= a </<= 9.
734
735#if defined(__x86_64__)
736
737 sub rax, 5 // a' = a - 5
738	cmp	rax, 4			// is a' </<= 4?
739
740 // cc a' a
741 //
742 // z/e a' = 4 a = 9
743 // nz/ne a' /= 4 a /= 9
744 //
745 // a/nbe a' > 4 a > 9 or a < 5
746 // nc/ae/nb a' >= 4 a >= 9 or a < 5
747 // c/b/nae a' < 4 5 <= a < 9
748 // be/na a' <= 4 5 <= a <= 9
749 //
750 // o a' < -2^63 + 4 -2^63 + 5 <= a < -2^63 + 9
751 // no a' >= -2^63 + 4 a >= -2^63 + 9 or
752 // a < -2^63 + 5
753 // s -2^63 + 4 <= a' < 4 -2^63 + 9 <= a < 9
754 // ns a' < -2^63 + 4 or a < -2^63 + 9 or a >= 9
755 // a' >= 4
756 // ge/nl a' >= 4 a >= 9 or a < -2^63 + 5
757 // l/nge a' < 4 -2^63 + 5 <= a < 9
758 // g/nle a' > 4 a > 9 or a < -2^63 + 5
759 // le/ng a' <= 4 -2^63 + 5 <= a <= 9
760
761#elif defined(__i386__)
762
763 sub eax, 5
764 cmp eax, 4
765
766#elif defined(__arm__)
767
768 // i dimly remember having a slick way to do this way back in the
769 // day, but i can't figure it out any more.
770 sub r0, #5
771 cmp r0, #4
772
773#elif defined(__aarch64__)
774
775 // literal translation is too obvious
776 cmp x0, #5
777 ccmp.hs x0, #9, #CCMP_HS
778
779#else
780 notimpl
781#endif
782
783 ret
784
785endproc
786
787proc x06
788
789 // leave a unchanged, but set zf if a = 0, cf if a /= 0, clear of,
790 // set sf to msb(a)
791
792#if defined(__x86_64__)
793
794 not rax // a' = -a - 1
795 inc rax // a' = -a
796 neg rax // a' = a
797
798#elif defined(__i386__)
799
800 not eax
801 inc eax
802 neg eax
803
804#elif defined(__arm__)
805
806 mvn r0, r0
807 add r0, r0, #1
808 rsbs r0, r0, #0 // cf has opposite sense
809
810#elif defined(__aarch64__)
811
812 mvn x0, x0
813 add x0, x0, #1
814 negs x0, x0 // cf has opposite sense
815
816#else
817 notimpl
818#endif
819
820 ret
821
822endproc
823
824proc x07
825
826 // same as before (?)
827
828#if defined(__x86_64__)
829
830 inc rax // a' = a + 1
831 neg rax // a' = -a - 1
832 inc rax // a' = -a
833 neg rax // a' = a
834
835#elif defined(__i386__)
836
837 inc eax
838 neg eax
839 inc eax
840 neg eax
841
842#elif defined(__arm__)
843
844 add r0, r0, #1
845 rsb r0, r0, #0
846 add r0, r0, #1
847 rsbs r0, r0, #0
848
849#elif defined(__aarch64__)
850
851 add x0, x0, #1
852 neg x0, x0
853 add x0, x0, #1
854 negs x0, x0 // cf has opposite sense
855
856#else
857 notimpl
858#endif
859
860 ret
861
862endproc
863
864proc x08
865
866 // floor((a + d)/2), correctly handling overflow conditions; final cf
867 // is lsb(a + d), probably uninteresting
868
869#if defined(__x86_64__)
870
871 add rax, rdx // cf || a' = a + d
872 rcr rax, 1 // shift 65-bit result right by one
873 // place; lsb moves into carry
874
875#elif defined(__i386__)
876
877 add eax, edx
878 rcr eax, 1
879
880#elif defined(__arm__)
881
882 // like the two-instruction a64 version
883 sub r1, r3, r0
884 add r1, r0, r1, lsr #1
885
886 // the slick version, similar to the above
887 adds r0, r0, r3
888 mov r0, r0, rrx
889
890#elif defined(__aarch64__)
891
892 // a64 lacks a32's rrx. literal translation.
893 adds x1, x0, x3 // cf || a' = a + d
894 adc x16, xzr, xzr // realize cf in extra register
895 extr x1, x16, x1, #1 // shift down one place
896
897 // two instruction version: clobbers additional register. (if you
898 // wanted the answer in any other register, even overwriting d, then
899 // this is unnecessary.) also depends on d >= a.
900 sub x16, x3, x0 // compute difference
901 add x0, x0, x16, lsr #1 // add half of it (rounded down)
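	// (this is sound because, when d >= a, the difference d - a can't
	// wrap, and a + floor((d - a)/2) = floor((a + d)/2) exactly.)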
902
903#else
904 notimpl
905#endif
906
907 ret
908
909endproc
910
911proc x09
912
913 // a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3
914 // (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8).
915
916#if defined(__x86_64__)
917
918 shr rax, 3 // a' = floor(a/8); cf = 1 if a ==
919 // 4, 5, 6, 7 (mod 8)
920 adc rax, 0 // a' = floor(a/8) + cf
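	// (the bit shifted out last is bit 2 of a, i.e., the `round up'
	// bit: a = 12 gives floor(12/8) = 1 with cf set, so a' = 2, while
	// a = 11 gives 1 with cf clear, so a' = 1.)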
921
922#elif defined(__i386__)
923
924 shr eax, 3
925 adc eax, 0
926
927#elif defined(__arm__)
928
929 movs r0, r0, lsr #3
930 adc r0, r0, #0
931
932#elif defined(__aarch64__)
933
934 tst x0, #4
935 orr x0, xzr, x0, lsr #3
936 cinc.ne x0, x0
937
938#else
939 notimpl
940#endif
941
942 ret
943
944endproc
945
946proc x0a
947
948 // increment c-byte little-endian bignum at rdi
949
950#if defined(__x86_64__)
951
952 add byte ptr [rdi], 1
9530: inc rdi
954 adc byte ptr [rdi], 0
955 loop 0b
956
957#elif defined(__i386__)
958
959 add byte ptr [edi], 1
9600: inc edi
961 adc byte ptr [edi], 0
962 loop 0b
963
964#elif defined(__arm__)
965
966 mov r12, #256 // set initial carry
9670: ldrb r0, [r5]
968 subs r2, r2, #1
969 add r12, r0, r12, lsr #8
970 strb r12, [r5], #1
971 bne 0b
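	// (the trick here: r12 = next byte + carry in, so its bit 8 is the
	// carry out; strb stores only the low byte, and `r12, lsr #8' feeds
	// the carry into the following byte.  the initial #256 supplies the
	// increment itself.  the a64 version below works the same way.)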
972
973#elif defined(__aarch64__)
974
975 mov w17, #256 // set initial carry
9760: ldrb w16, [x5]
977 sub x2, x2, #1
978 add w17, w16, w17, lsr #8
979 strb w17, [x5], #1
980 cbnz x2, 0b
981
982#else
983 notimpl
984#endif
985
986 ret
987
988endproc
989
990proc x0b
991
992 // negate double-precision d:a
993
994#if defined(__x86_64__)
995
996 not rdx // d' = -d - 1
997 neg rax // a' = -a;
998 // cf = 1 iff a /= 0
999 sbb rdx, -1 // d' = -d - cf
1000
1001#elif defined(__i386__)
1002
1003 not edx
1004 neg eax
1005 sbb edx, -1
1006
1007#elif defined(__arm__)
1008
1009 // reverse subtract is awesome
1010 rsbs r0, r0, #0
1011 rsc r3, r3, #0
1012
1013#elif defined(__aarch64__)
1014
1015 // easy way: everything is better with zero registers.
1016 negs x0, x0
1017 ngc x3, x3
1018
1019#else
1020 notimpl
1021#endif
1022
1023 ret
1024
1025endproc
1026
1027proc x0c
1028
1029 // rotate is distributive over xor.
1030
1031#if defined(__x86_64__)
1032
1033 // rax // = a_1 || a_0
1034 // rbx // = b_1 || b_0
1035 mov rcx, rax // = a_1 || a_0
1036
1037 xor rcx, rbx // = (a_1 XOR b_1) || (a_0 XOR b_0)
1038 ror rcx, 0xd // = (a_0 XOR b_0) || (a_1 XOR b_1)
1039
1040 ror rax, 0xd // = a_0 || a_1
1041 ror rbx, 0xd // = b_0 || b_1
1042 xor rax, rbx // = (a_0 XOR b_0) || (a_1 XOR b_1)
1043
1044 cmp rax, rcx // always equal
1045
1046#elif defined(__i386__)
1047
1048 mov ecx, eax // = a_1 || a_0
1049
1050 xor ecx, ebx // = (a_1 XOR b_1) || (a_0 XOR b_0)
1051 ror ecx, 0xd // = (a_0 XOR b_0) || (a_1 XOR b_1)
1052
1053 ror eax, 0xd // = a_0 || a_1
1054 ror ebx, 0xd // = b_0 || b_1
1055 xor eax, ebx // = (a_0 XOR b_0) || (a_1 XOR b_1)
1056
1057 cmp eax, ecx // always equal
1058
1059#elif defined(__arm__)
1060
1061
1062 // r0 // = a_1 || a_0
1063 // r1 // = b_1 || b_0
1064 eor r2, r0, r1 // = (a_1 XOR b_1) || (a_0 XOR b_0)
1065 mov r2, r2, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
1066
1067 mov r1, r1, ror #13 // = b_0 || b_1
1068 eor r0, r1, r0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
1069
1070 cmp r0, r2 // always equal
1071
1072#elif defined(__aarch64__)
1073
1074 // x0 // = a_1 || a_0
1075 // x1 // = b_1 || b_0
1076 eor x2, x0, x1 // = (a_1 XOR b_1) || (a_0 XOR b_0)
1077 ror x2, x2, #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
1078
1079 ror x1, x1, #13 // = b_0 || b_1
1080 eor x0, x1, x0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
1081
1082 cmp x0, x2 // always equal
1083
1084#else
1085 notimpl
1086#endif
1087
1088 ret
1089
1090endproc
1091
1092proc x0d
1093
1094 // and is distributive over xor.
1095
1096#if defined(__x86_64__)
1097
1098 mov rdx, rbx // = b
1099
1100 xor rbx, rcx // = b XOR c
1101 and rbx, rax // = a AND (b XOR c)
1102
1103 and rdx, rax // = a AND b
1104 and rax, rcx // = a AND c
1105 xor rax, rdx // = (a AND b) XOR (a AND c)
1106 // = a AND (b XOR c)
1107
1108 cmp rax, rbx // always equal
1109
1110#elif defined(__i386__)
1111
1112 mov edx, ebx // = b
1113
1114 xor ebx, ecx // = b XOR c
1115 and ebx, eax // = a AND (b XOR c)
1116
1117 and edx, eax // = a AND b
1118 and eax, ecx // = a AND c
1119 xor eax, edx // = (a AND b) XOR (a AND c)
1120 // = a AND (b XOR c)
1121
1122 cmp eax, ebx // always equal
1123
1124#elif defined(__arm__)
1125
1126 and r3, r0, r1 // = a AND b
1127
1128 eor r1, r1, r2 // = b XOR c
1129 and r1, r1, r0 // = a AND (b XOR c)
1130
1131 and r0, r0, r2 // = a AND c
1132 eor r0, r0, r3 // = (a AND b) XOR (a AND c)
1133 // = a AND (b XOR c)
1134
1135 cmp r0, r1 // always equal
1136
1137#elif defined(__aarch64__)
1138
1139 and x3, x0, x1 // = a AND b
1140
1141 eor x1, x1, x2 // = b XOR c
1142 and x1, x1, x0 // = a AND (b XOR c)
1143
1144 and x0, x0, x2 // = a AND c
1145 eor x0, x0, x3 // = (a AND b) XOR (a AND c)
1146 // = a AND (b XOR c)
1147
1148 cmp x0, x1 // always equal
1149
1150#else
1151 notimpl
1152#endif
1153
1154 ret
1155
1156endproc
1157
1158proc x0e
1159
1160 // de morgan's law
1161
1162#if defined(__x86_64__)
1163
1164 mov rcx, rax // = a
1165
1166 and rcx, rbx // = a AND b
1167 not rcx // = NOT (a AND b)
1168
1169 not rax // = NOT a
1170 not rbx // = NOT b
1171 or rax, rbx // = (NOT a) OR (NOT b)
1172 // = NOT (a AND b)
1173
1174 cmp rax, rcx // always equal
1175
1176#elif defined(__i386__)
1177
1178 mov ecx, eax // = a
1179
1180 and ecx, ebx // = a AND b
1181 not ecx // = NOT (a AND b)
1182
1183 not eax // = NOT a
1184 not ebx // = NOT b
1185 or eax, ebx // = (NOT a) OR (NOT b)
1186 // = NOT (a AND b)
1187
1188 cmp eax, ecx // always equal
1189
1190#elif defined(__arm__)
1191
1192 and r2, r0, r1 // = a AND b
1193 mvn r2, r2 // = NOT (a AND b)
1194
1195 mvn r0, r0 // = NOT a
1196 mvn r1, r1 // = NOT b
1197 orr r0, r0, r1 // = (NOT a) OR (NOT b)
1198
1199 cmp r0, r2 // always equal
1200
1201#elif defined(__aarch64__)
1202
1203 and x2, x0, x1 // = a AND b
1204 mvn x2, x2 // = NOT (a AND b)
1205
1206 mvn x0, x0 // = NOT a
1207 orn x0, x0, x1 // = (NOT a) OR (NOT b)
1208
1209 cmp x0, x2 // always equal
1210
1211#else
1212 notimpl
1213#endif
1214
1215 ret
1216
1217endproc
1218
1219proc x0f
1220
1221 // replace input buffer bytes with cumulative XORs with initial a;
1222 // final a is XOR of all buffer bytes and initial a.
1223 //
1224 // not sure why you'd do this.
1225
1226#if defined(__x86_64__)
1227
12280: xor [rsi], al
1229 lodsb
1230 loop 0b
1231
1232#elif defined(__i386__)
1233
12340: xor [esi], al
1235 lodsb
1236 loop 0b
1237
1238#elif defined(__arm__)
1239
12400: ldrb r12, [r4]
1241 subs r2, r2, #1
1242 eor r0, r0, r12
1243 strb r0, [r4], #1
1244 bne 0b
1245
1246#elif defined(__aarch64__)
1247
12480: ldrb w16, [x4]
1249 sub x2, x2, #1
1250 eor w0, w0, w16
1251 strb w0, [x4], #1
1252 cbnz x2, 0b
1253
1254#else
1255 notimpl
1256#endif
1257
1258 ret
1259
1260endproc
1261
1262///--------------------------------------------------------------------------
1263/// 0x10--0x1f
1264
1265proc x10
1266
1267 // four different ways to swap a pair of registers.
1268
1269#if defined(__x86_64__)
1270
1271 push rax
1272 push rcx
1273 pop rax
1274 pop rcx
1275
1276 xor rax, rcx
1277 xor rcx, rax
1278 xor rax, rcx
1279
1280 add rax, rcx
1281 sub rcx, rax
1282 add rax, rcx
1283 neg rcx
1284
1285 xchg rax, rcx
1286
1287#elif defined(__i386__)
1288
1289 push eax
1290 push ecx
1291 pop eax
1292 pop ecx
1293
1294 xor eax, ecx
1295 xor ecx, eax
1296 xor eax, ecx
1297
1298 add eax, ecx
1299 sub ecx, eax
1300 add eax, ecx
1301 neg ecx
1302
1303 xchg eax, ecx
1304
1305#elif defined(__arm__)
1306
1307 stmfd r13!, {r0, r2}
1308 ldr r0, [r13, #4]
1309 ldr r2, [r13], #8
1310
1311 eor r0, r0, r2
1312 eor r2, r2, r0
1313 eor r0, r0, r2
1314
1315 sub r0, r0, r2
1316 add r2, r2, r0
1317 rsb r0, r0, r2 // don't need 3-addr with reverse-sub
1318
1319 mov r12, r0
1320 mov r0, r2
1321	mov	r2, r12
1322
1323#elif defined(__aarch64__)
1324
1325 // anything you can do
1326 stp x0, x2, [sp, #-16]!
1327 ldp x2, x0, [sp], #16
1328
1329 eor x0, x0, x2
1330 eor x2, x2, x0
1331 eor x0, x0, x2
1332
1333 // the add/sub/add thing was daft. you can do it in three if you're
1334 // clever -- and have three-address operations.
1335 sub x0, x0, x2
1336 add x2, x2, x0
1337 sub x0, x2, x0
1338
1339 // but we lack a fourth. we can't do this in fewer than three
1340 // instructions without hitting memory. only `ldp' will modify two
1341 // registers at a time, so we need at least two instructions -- but
1342 // if the first one sets one of our two registers to its final value
1343 // then we lose the other input value with no way to recover it, so
1344 // we must either write a fresh third register, or write something
1345 // other than the final value, and in both cases we need a third
1346 // instruction to fix everything up. we've done the wrong-something-
1347 // other trick twice, so here's the captain-obvious use-a-third-
1348 // register version.
1349 mov x16, x0
1350 mov x0, x2
1351 mov x2, x16
1352
1353#else
1354 notimpl
1355#endif
1356
1357 ret
1358
1359endproc
1360
1361proc x11
1362
1363 // assuming a is initialized to zero, set a to the inclusive or of
1364 // the xor-differences of corresponding bytes in the c-byte strings
1365 // at si and di.
1366 //
1367 // in particular, a will be zero (and zf set) if and only if the two
1368 // strings are equal.
1369
1370#if defined(__x86_64__)
1371
13720: mov dl, [rsi]
1373 xor dl, [rdi]
1374 inc rsi
1375 inc rdi
1376 or al, dl
1377 loop 0b
1378
1379#elif defined(__i386__)
1380
13810: mov dl, [esi]
1382 xor dl, [edi]
1383 inc esi
1384 inc edi
1385 or al, dl
1386 loop 0b
1387
1388#elif defined(__arm__)
1389
13900: ldrb r1, [r4], #1
1391 ldrb r12, [r5], #1
1392 subs r2, r2, #1
1393 eor r12, r12, r1
1394 orr r0, r0, r12
1395 bne 0b
1396
1397#elif defined(__aarch64__)
1398
13990: ldrb w16, [x4], #1
1400 ldrb w17, [x5], #1
1401 sub x2, x2, #1
1402 eor w16, w16, w17
1403 orr w0, w0, w16
1404 cbnz x2, 0b
1405
1406#else
1407 notimpl
1408#endif
1409
1410 ret
1411
1412endproc
1413
1414proc x12
1415
1416 // an obtuse way of adding two registers. for any bit position, a
1417 // OR d is set if and only if at least one of a and d has a bit set
1418 // in that position, and a AND d is set if and only if both have a
1419 // bit set in that position. essentially, then, what we've done is
1420 // move all of the set bits in d to a, unless there's already a bit
1421 // there. this clearly doesn't change the sum.
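	// (equivalently: a + d = (a OR d) + (a AND d).)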
1422
1423#if defined(__x86_64__)
1424
1425 mov rcx, rdx // c' = d
1426 and rdx, rax // d' = a AND d
1427 or rax, rcx // a' = a OR d
1428 add rax, rdx
1429
1430#elif defined(__i386__)
1431
1432 mov ecx, edx // c' = d
1433 and edx, eax // d' = a AND d
1434 or eax, ecx // a' = a OR d
1435 add eax, edx
1436
1437#elif defined(__arm__)
1438
1439 and r2, r0, r3 // c' = a AND d
1440 orr r0, r0, r3 // a' = a OR d
1441 add r0, r0, r2
1442
1443#elif defined(__aarch64__)
1444
1445 and x2, x0, x3 // c' = a AND d
1446 orr x0, x0, x3 // a' = a OR d
1447 add x0, x0, x2
1448
1449#else
1450 notimpl
1451#endif
1452
1453 ret
1454
1455endproc
1456
1457proc x13
1458
1459 // ok, so this is a really obtuse way of adding a and b; the result
1460 // is in a and d. but why does it work?
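	//
	// (a sketch: for any x and y, x + y = (x XOR y) + 2 (x AND y) --
	// the xor is the per-bit sum without carries, and the and picks out
	// the carries, which belong one place further left.  so each pass
	// replaces the pair by (a XOR b, (a AND b) << 1) without changing
	// the total; and since the carry word is shifted up a place every
	// pass, after at most 64 passes it's zero and a holds the sum.)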
1461
1462#if defined(__x86_64__)
1463
1464 mov rcx, 0x40 // carry chains at most 64 long
14650: mov rdx, rax // copy a'
1466 xor rax, rbx // low bits of each bitwise sum
1467 and rbx, rdx // carry bits from each bitwise sum
1468 shl rbx, 1 // carry them into next position
1469 loop 0b
1470
1471#elif defined(__i386__)
1472
1473 mov ecx, 0x40 // carry chains at most 64 long
14740: mov edx, eax // copy a'
1475 xor eax, ebx // low bits of each bitwise sum
1476 and ebx, edx // carry bits from each bitwise sum
1477 shl ebx, 1 // carry them into next position
1478 loop 0b
1479
1480#elif defined(__arm__)
1481
1482 mov r2, #0x40
14830: and r3, r0, r1
1484 subs r2, r2, #1
1485 eor r0, r0, r1
1486 lsl r1, r3, #1
1487 bne 0b
1488
1489#elif defined(__aarch64__)
1490
1491 mov x2, #0x40
14920: and x3, x0, x1
1493 sub x2, x2, #1
1494 eor x0, x0, x1
1495 lsl x1, x3, #1
1496 cbnz x2, 0b
1497
1498#else
1499 notimpl
1500#endif
1501
1502 ret
1503
1504endproc
1505
1506proc x14
1507
1508 // floor((a + d)/2), like x08.
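	//
	// (same identity as x13: a + d = (a XOR d) + 2 (a AND d), so
	// floor((a + d)/2) = (a AND d) + floor((a XOR d)/2), which never
	// forms the full sum and so can't overflow.)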
1509
1510#if defined(__x86_64__)
1511
1512 mov rcx, rax // copy a for later
1513 and rcx, rdx // carry bits
1514
1515 xor rax, rdx // low bits of each bitwise sum
1516 shr rax, 1 // divide by 2; carries now in place
1517
1518 add rax, rcx // add the carries; done
1519
1520#elif defined(__i386__)
1521
1522 mov ecx, eax // copy a for later
1523 and ecx, edx // carry bits
1524
1525 xor eax, edx // low bits of each bitwise sum
1526 shr eax, 1 // divide by 2; carries now in place
1527
1528 add eax, ecx // add the carries; done
1529
1530#elif defined(__arm__)
1531
1532 and r2, r0, r3
1533 eor r0, r0, r3
1534 add r0, r2, r0, lsr #1
1535
1536#elif defined(__aarch64__)
1537
1538 and x2, x0, x3
1539 eor x0, x0, x3
1540 add x0, x2, x0, lsr #1
1541
1542#else
1543 notimpl
1544#endif
1545
1546 ret
1547
1548endproc
1549
1550proc x15
1551
1552 // sign extension 32 -> 64 bits.
1553
1554#if defined(__x86_64__)
1555
1556 movsx rbx, eax // like this?
1557
1558 mov rdx, 0xffffffff80000000
1559 add rax, rdx // if bit 31 of a is set then bits
1560 // 31--63 of a' are clear; otherwise,
1561 // these bits are all set -- which is
1562 // exactly backwards
1563 xor rax, rdx // so fix it
1564
1565#elif defined(__i386__)
1566
1567 movsx ebx, ax // like this?
1568
1569 mov edx, 0xffff8000
1570	add	eax, edx		// if bit 15 of a is set then bits
1571					// 15--31 of a' are clear; otherwise,
1572 // these bits are all set -- which is
1573 // exactly backwards
1574 xor eax, edx // so fix it
1575
1576#elif defined(__arm__)
1577
1578 sxth r1, r0 // like this
1579
1580 mov r12, #0x80000000
1581 add r0, r0, r12, asr #16
1582 eor r0, r0, r12, asr #16
1583
1584#elif defined(__aarch64__)
1585
1586 sxtw x1, w0 // like this
1587
1588 mov x16, #0xffffffff80000000
1589 add x0, x0, x16
1590 eor x0, x0, x16
1591
1592#else
1593 notimpl
1594#endif
1595
1596 ret
1597
1598endproc
1599
1600proc x16
1601
1602 // ??? i don't know why you'd want to calculate this.
1603
1604#if defined(__x86_64__)
1605
1606 xor rax, rbx // a' = a XOR b
1607 xor rbx, rcx // b' = b XOR c
1608 mov rsi, rax // t = a XOR b
1609 add rsi, rbx // t = (a XOR b) + (b XOR c)
1610 cmovc rax, rbx // a' = cf ? b XOR c : a XOR b
1611 xor rax, rbx // a' = cf ? 0 : a XOR c
1612 cmp rax, rsi
1613
1614#elif defined(__i386__)
1615
1616 xor eax, ebx // a' = a XOR b
1617 xor ebx, ecx // b' = b XOR c
1618 mov esi, eax // t = a XOR b
1619 add esi, ebx // t = (a XOR b) + (b XOR c)
1620 cmovc eax, ebx // a' = cf ? b XOR c : a XOR b
1621 xor eax, ebx // a' = cf ? 0 : a XOR c
1622 cmp eax, esi
1623
1624#elif defined(__arm__)
1625
1626 eor r0, r0, r1
1627 eor r1, r1, r2
1628 adds r4, r0, r1
1629 movcs r0, r1
1630 eor r0, r0, r1
1631 cmp r0, r4
1632
1633#elif defined(__aarch64__)
1634
1635 eor x0, x0, x1
1636 eor x1, x1, x2
1637 adds x4, x0, x1
1638 cmov.cs x0, x1
1639 eor x0, x0, x1
1640 cmp x0, x4
1641
1642#else
1643 notimpl
1644#endif
1645
1646 ret
1647
1648endproc
1649
1650proc x17
1651
1652 // absolute value
1653
1654#if defined(__x86_64__)
1655
1656 cqo // d = a < 0 ? -1 : 0
1657 xor rax, rdx // a' = a < 0 ? -a - 1 : a
1658 sub rax, rdx // a' = a < 0 ? -a : a
1659
1660#elif defined(__i386__)
1661
1662 cdq // d = a < 0 ? -1 : 0
1663 xor eax, edx // a' = a < 0 ? -a - 1 : a
1664 sub eax, edx // a' = a < 0 ? -a : a
1665
1666#elif defined(__arm__)
1667
1668 // direct approach
1669 movs r1, r0
1670 rsbmi r1, r0, #0
1671
1672 // faithful-ish conversion
1673 eor r3, r0, r0, asr #31
1674 sub r0, r3, r0, asr #31
1675
1676#elif defined(__aarch64__)
1677
1678 // direct approach
1679 tst x0, #1 << 63
1680 cneg.ne x1, x0
1681
1682 // faithful-ish conversion
1683 eor x3, x0, x0, asr #63
1684 sub x0, x3, x0, asr #63
1685
1686#else
1687 notimpl
1688#endif
1689
1690 ret
1691
1692endproc
1693
1694proc x18
1695
1696 // should always set sf, clear zf, unless we get rescheduled to a
1697 // different core.
1698
1699#if defined(__x86_64__)
1700
1701 rdtsc // d || a = cycles
1702 shl rdx, 0x20
1703 or rax, rdx // a = cycles
1704 mov rcx, rax // c = cycles
1705
1706 rdtsc // d || a = cycles'
1707 shl rdx, 0x20
1708 or rax, rdx // a = cycles'
1709
1710 cmp rcx, rax
1711
1712#elif defined(__i386__)
1713
1714 rdtsc // d || a = cycles
1715 mov ebx, eax
1716 mov ecx, edx // c || b = cycles
1717
1718 rdtsc // d || a = cycles'
1719
1720 sub ebx, eax
1721 sbb ecx, edx
1722
1723#elif defined(__arm__)
1724
1725 // cycle clock not available in user mode
1726 mrrc p15, 0, r0, r1, c9
1727 mrrc p15, 0, r2, r3, c9
1728 subs r0, r0, r2
1729 sbcs r1, r1, r3
1730
1731#elif defined(__aarch64__)
1732
1733 // cycle clock not available in user mode
1734 mrs x0, pmccntr_el0
1735 mrs x1, pmccntr_el0
1736 cmp x0, x1
1737
1738#else
1739 notimpl
1740#endif
1741
1742 ret
1743
1744endproc
1745
1746proc x19
1747
1748 // stupid way to capture a pointer to inline data and jump past it.
1749 // confuses the return-address predictor something chronic. worse
1750 // because amd64 calling convention doesn't usually pass arguments on
1751 // the stack.
1752
1753#if defined(__x86_64__)
1754
1755 call 8f
1756 .string "hello world!\n\0"
17578: call print_str
1758 add rsp, 8
1759 ret
1760
1761print_str:
1762 // actually implement this ridiculous thing
1763 mov rsi, [rsp + 8]
1764 xor edx, edx
17650: mov al, [rsi + rdx]
1766 inc rdx
1767 cmp al, 0
1768 jnz 0b
1769 mov eax, SYS_write
1770 mov edi, 1
1771 dec rdx
1772 syscall // clobbers r11 :-(
1773 ret
1774
1775#elif defined(__i386__)
1776
1777 call 8f
1778 .string "hello world!\n\0"
17798: call print_str
1780 add esp, 4
1781 ret
1782
1783print_str:
1784 // actually implement this ridiculous thing
1785 mov ecx, [esp + 4]
1786 xor edx, edx
17870: mov al, [ecx + edx]
1788 inc edx
1789 cmp al, 0
1790 jnz 0b
1791 mov eax, SYS_write
1792 mov ebx, 1
1793 dec edx
1794 int 0x80
1795 ret
1796
1797#elif defined(__arm__)
1798
1799 // why am i doing this?
1800 stmfd r13!, {r14}
1801 bl 8f
1802 .string "hello world!\n\0"
1803 .balign 4
18048: mov r1, r14 // might as well make it easy on myself
1805 bl print_str
1806 ldmfd r13!, {pc}
1807
1808print_str:
1809 mov r2, #0
18100: ldrb r0, [r1, r2]
1811 cmp r0, #0
1812 addne r2, r2, #1
1813 bne 0b
1814 mov r0, #1
1815 mov r7, #SYS_write
1816 swi 0
1817 bx r14
1818
1819#elif defined(__aarch64__)
1820
1821 // why am i doing this?
1822 str x30, [sp, #-16]!
1823 bl 8f
1824 .string "hello world!\n\0"
1825 .balign 4
18268: mov x1, x30 // might as well make it easy on myself
1827 bl print_str
1828 ldr x30, [sp], #16
1829 ret
1830
1831print_str:
1832 mov x2, #0
18330: ldrb w0, [x1, x2]
1834 cmp w0, #0
1835 cinc.ne x2, x2
1836 b.ne 0b
1837 mov x0, #1
1838 mov x8, #SYS_write
1839 svc #0
1840 ret
1841
1842#else
1843 notimpl
1844#endif
1845
1846endproc
1847
1848proc x1a
1849
1850 // collect the current instruction-pointer address. this was an old
1851 // 32-bit i386 trick for position-independent code, but (a) it
1852 // confuses the return predictor, and (b) amd64 has true pc-relative
1853 // addressing.
1854
1855#if defined(__x86_64__)
1856
1857 // the actual example
1858 call 0f
18590: pop rax
1860
1861 // the modern i386 trick doesn't confuse the return-address
1862 // predictor.
1863 call calladdr_rbx
1864 sub rbx, . - 0b
1865
1866 // but rip-relative addressing is even better
1867 lea rcx, [rip + 0b]
1868
1869 ret
1870
1871calladdr_rbx:
1872 mov rbx, [rsp]
1873 ret
1874
1875#elif defined(__i386__)
1876
1877 // the actual example
1878 call 0f
18790: pop eax
1880
1881 // the modern i386 trick doesn't confuse the return-address
1882 // predictor.
1883 call get_pc_ebx
1884 sub ebx, . - 0b
1885
1886 ret
1887
1888#elif defined(__arm__)
1889
1890 stmfd r13!, {r14}
1891
1892 bl 0f
18930: mov r0, r14
1894
1895 bl return
1896 sub r1, r14, #. - 0b
1897
1898 adr r2, 0b
1899
1900 ldmfd r13!, {pc}
1901
1902return: bx r14
1903
1904#elif defined(__aarch64__)
1905
1906 str x30, [sp, #-16]!
1907
1908 // we can do all of the above using a64
1909 bl 0f
19100: mov x0, x30
1911
1912 bl return
1913 sub x1, x30, #. - 0b
1914
1915 adr x2, 0b
1916
1917 ldr x30, [sp], #16
1918return: ret
1919
1920#else
1921 notimpl
1922#endif
1923
1924endproc
1925
1926proc x1b
1927
1928#if defined(__x86_64__)
1929
1930 // retpolines: an mitigation against adversarially influenced
1931 // speculative execution at indirect branches. if an adversary can
1932 // prepare a branch-target buffer entry matching an indirect branch
1933 // in the victim's address space then they can cause the victim to
1934 // /speculatively/ (but not architecturally) execute any code in
1935 // their address space, possibly leading to leaking secrets through
1936 // the cache. retpolines aren't susceptible to this because the
1937 // predicted destination address is from the return-prediction stack
1938 // which the adversary can't prime. the performance penalty is still
1939 // essentially a branch misprediction -- for this return, and
1940 // possibly all others already stacked.
1941
1942 // (try not to crash)
1943 lea rax, [rip + 9f]
1944
1945 push rax
19469: ret
1947
1948#elif defined(__i386__)
1949
1950 call get_pc_ebx
1951 lea eax, [ebx + 9f - .]
1952
1953 push eax
19549: ret
1955
1956#elif defined(__arm__)
1957
1958 stmfd r13!, {r14}
1959
1960 adr r14, 8f
1961 bx r14
1962
19638: ldmfd r13!, {pc}
1964
1965#elif defined(__aarch64__)
1966
1967 str x30, [sp, #-16]!
1968
1969 adr x30, 8f
1970 ret
1971
19728: ldr x30, [sp], #16
1973 ret
1974
1975#else
1976 notimpl
1977#endif
1978
1979endproc
1980
1981proc x1c
1982
1983 // ok, having a hard time seeing a use for this. the most important
1984 // thing to note is that sp is set from `pop' /after/ it's
1985 // incremented.
1986
1987#if defined(__x86_64__)
1988
1989 // try not to crash
1990 mov rax, rsp
1991 and rsp, -16
1992 push rax
1993
1994 pop rsp
1995
1996 // check it worked
1997 mov rbx, rsp
1998 ret
1999
2000#elif defined(__i386__)
2001
2002 // try not to crash
2003 mov eax, esp
2004 and esp, -16
2005 push eax
2006
2007 pop esp
2008
2009 // check it worked
2010 mov ebx, esp
2011 ret
2012
2013#elif defined(__arm__)
2014
2015 // not even going to dignify this
2016 notimpl
2017
2018#elif defined(__aarch64__)
2019
2020 // not even going to dignify this
2021 notimpl
2022
2023#else
2024 notimpl
2025#endif
2026
2027endproc
2028
2029proc x1d
2030
2031 // monumentally cheesy way to copy 8 n bytes from buff1 to buff2.
2032 // also clobbers words at buff2 + 8 n and buff2 - 8 for good measure.
2033
2034 n = 4
2035
2036#if defined(__x86_64__)
2037
2038 mov rax, rsp // safekeeping
2039
2040 // we're toast if we get hit by a signal now. fingers crossed...
2041 .if 0
2042 mov rsp, buff2 + 8*n + 8
2043 mov rbp, buff1 + 8*n
2044 .else
2045 lea rsp, [rdi + 8*n + 16]
2046 lea rbp, [rsi + 8*n]
2047 .endif
2048 enter 0, n + 1
2049
2050 // precise action:
2051 //
2052 // +---------+ +---------+
2053 // rbp -> | ??? | rsp -> | ??? |
2054 // +---------+ +---------+
2055 // | w_{n-1} | | rbp | <- rbp'
2056 // +---------+ +---------+
2057 // | ... | | w_{n-1} |
2058 // +---------+ +---------+
2059 // | w_1 | | ... |
2060 // +---------+ +---------+
2061 // | w_0 | | w_1 |
2062 // +---------+ +---------+
2063 // | w_0 |
2064 // +---------+
2065 // | rbp' | <- rsp'
2066 // +---------+
2067
2068 mov rdx, rsp
2069 mov rsp, rax
2070
2071#elif defined(__i386__)
2072
2073 mov eax, esp // safekeeping
2074
2075 // we're toast if we get hit by a signal now. fingers crossed...
2076 .if 0
2077 mov esp, buff2 + 4*n + 4
2078 mov ebp, buff1 + 4*n
2079 .else
2080 lea esp, [edi + 4*n + 8]
2081 lea ebp, [esi + 4*n]
2082 .endif
2083 enter 0, n + 1
2084
2085 mov edx, esp
2086 mov esp, eax
2087
2088#elif defined(__arm__)
2089
2090 add r4, r4, #4*n
2091 add r5, r5, #4*n + 8
2092
2093 str r4, [r5, #-4]!
2094 .rept n/2
2095 ldrd r0, r1, [r4, #-8]!
2096 strd r0, r1, [r5, #-8]!
2097 .endr
2098 add r4, r5, #4*n
2099 str r4, [r5, #-4]!
2100
2101#elif defined(__aarch64__)
2102
2103 // omgwtf. let's not actually screw with the stack pointer.
2104
2105 add x4, x4, #8*n
2106 add x5, x5, #8*n + 16
2107
2108 str x4, [x5, #-8]!
2109 .rept n/2
2110 ldp x16, x17, [x4, #-16]!
2111 stp x16, x17, [x5, #-16]!
2112 .endr
2113 add x4, x5, #8*n
2114 str x4, [x5, #-8]!
2115
2116#else
2117 notimpl
2118#endif
2119
2120 ret
2121
2122endproc
2123
2124proc x1e
2125
2126 // convert nibble value to (uppercase) hex; other input values yield
2127 // nonsense.
2128
2129#if defined(__x86_64__)
2130
2131 // das doesn't work in 64-bit mode; best i can come up with
2132 mov edx, eax
2133 add al, '0'
2134 add dl, 'A' - 10
2135 cmp al, '9' + 1
2136 cmovae eax, edx
2137
2138#elif defined(__i386__)
2139
2140 cmp al, 0x0a // cf = 1 iff a < 10
2141 sbb al, 0x69 // if 0 <= a < 10, a' = a - 0x6a, so
2142 // 0x96 <= a' < 0x70, setting af, cf
2143 // if 10 <= a < 16, a' = a - 0x69, so
2144 // 0x71 <= a' < 0x77, setting cf but
2145 // clearing af
2146 das // if 0 <= a < 10, then af and cf are
2147					// both set, so subtract 0x66
2148 // from a' leaving 0x30 <= a' < 0x3a;
2149 // if 10 <= a < 16 then af clear but
2150 // cf set, so subtract 0x60 from a'
2151 // leaving 0x41 <= a' < 0x47
2152
2153#elif defined(__arm__)
2154
2155 // significantly less tricksy
2156 cmp r0, #10
2157 addlo r0, r0, #'0'
2158 addhs r0, r0, #'A' - 10
2159
2160#elif defined(__aarch64__)
2161
2162 // with less versatile conditional execution this is the best we can
2163 // do
2164 cmp w0, #10
2165 add w16, w0, #'A' - 10
2166 add w0, w0, #'0'
2167 cmov.hs w0, w16
2168
2169#else
2170 notimpl
2171#endif
2172
2173 ret
2174
2175endproc
2176
2177proc x1f
2178
2179 // verify collatz conjecture starting at a; assume a /= 0!
2180
2181#if defined(__x86_64__)
2182
21830: bsf rcx, rax // clobber c if a = 0
2184 shr rax, cl // a = 2^c a'
2185 cmp rdx, 0
2186 je 1f
2187 stosq
2188 dec rdx
21891:
2190 cmp rax, 1 // done?
2191 je 9f
2192 lea rax, [2*rax + rax + 1] // a' = 3 a' + 1
2193 jmp 0b // again
2194
21959: ret
2196
2197#elif defined(__i386__)
2198
21990: bsf ecx, eax // clobber c if a = 0
2200 shr eax, cl // a = 2^c a'
2201 cmp edx, 0
2202 je 1f
2203 stosd
2204 dec edx
22051:
2206 cmp eax, 1 // done?
2207 je 9f
2208 lea eax, [2*eax + eax + 1] // a' = 3 a' + 1
2209 jmp 0b // again
2210
22119: ret
2212
2213#elif defined(__arm__)
2214
2215 // rbit introduced in armv7
22160: rbit r2, r0
2217 clz r2, r2
2218 mov r0, r0, lsr r2 // a = 2^c a'
2219 cmp r3, #0
2220 strne r0, [r5], #4
2221 subne r3, r3, #1
2222 cmp r0, #1
2223 adcne r0, r0, r0, lsl #1 // a' = 3 a' + 1 (because c set)
2224 bne 0b
2225
2226 ret
2227
2228#elif defined(__aarch64__)
2229
22300: rbit w2, w0
2231 clz w2, w2
2232 lsr w0, w0, w2 // a = 2^c a'
2233 cmp x3, #0
2234 beq 1f
2235 str x0, [x5], #8
2236 sub x3, x3, #1
22371:
2238 cmp w0, #1
2239 add w16, w0, w0, lsl #1 // t = 3 a' + 1 (because c set)
2240 csinc.eq w0, w0, w16
2241 b.ne 0b
2242
2243 ret
2244
2245#else
2246 notimpl
2247#endif
2248
2249endproc
2250
2251///--------------------------------------------------------------------------
2252/// 0x20--0x2f
2253
2254proc x20
2255
2256 // calculate 1337 a slowly
2257
2258#if defined(__x86_64__)
2259
2260 // original version
2261 mov rcx, rax // c = a
2262 shl rcx, 2 // c = 4 a
2263 add rcx, rax // c = 5 a
2264 shl rcx, 3 // c = 40 a
2265 add rcx, rax // c = 41 a
2266 shl rcx, 1 // c = 82 a
2267 add rcx, rax // c = 83 a
2268 shl rcx, 1 // c = 166 a
2269 add rcx, rax // c = 167 a
2270 shl rcx, 3 // c = 1336 a
2271 add rcx, rax // c = 1337 a
2272
2273 // a quick way
2274 lea rdx, [2*rax + rax] // t = 3 a
2275 shl rdx, 6 // t = 192 a
2276 sub rdx, rax // t = 191 a
2277 lea rbx, [8*rdx] // b = 1528 a
2278 sub rbx, rdx // b = 1337 a
2279
2280#elif defined(__i386__)
2281
2282 // original version
2283 mov ecx, eax // c = a
2284 shl ecx, 2 // c = 4 a
2285 add ecx, eax // c = 5 a
2286 shl ecx, 3 // c = 40 a
2287 add ecx, eax // c = 41 a
2288 shl ecx, 1 // c = 82 a
2289 add ecx, eax // c = 83 a
2290 shl ecx, 1 // c = 166 a
2291 add ecx, eax // c = 167 a
2292 shl ecx, 3 // c = 1336 a
2293 add ecx, eax // c = 1337 a
2294
2295 // a quick way
2296 lea edx, [2*eax + eax] // t = 3 a
2297 shl edx, 6 // t = 192 a
2298 sub edx, eax // t = 191 a
2299 lea ebx, [8*edx] // b = 1528 a
2300 sub ebx, edx // b = 1337 a
2301
2302#elif defined(__arm__)
2303
2304 // original version, ish
2305 add r2, r0, r0, lsl #2 // c = 5 a
2306 add r2, r0, r2, lsl #3 // c = 41 a
2307 add r2, r0, r2, lsl #1 // c = 83 a
2308 add r2, r0, r2, lsl #1 // c = 167 a
2309 add r2, r0, r2, lsl #3 // c = 1337 a
2310
2311 // quicker way
2312 add r1, r0, r0, lsl #1 // b = 3 a
2313 rsb r1, r0, r1, lsl #6 // b = 191 a
2314 rsb r1, r1, r1, lsl #3 // b = 1337 a
2315
2316#elif defined(__aarch64__)
2317
2318 // original version, ish
2319 add x2, x0, x0, lsl #2 // c = 5 a
2320 add x2, x0, x2, lsl #3 // c = 41 a
2321 add x2, x0, x2, lsl #1 // c = 83 a
2322 add x2, x0, x2, lsl #1 // c = 167 a
2323 add x2, x0, x2, lsl #3 // c = 1337 a
2324
2325 // sleazy because no rsb
2326 add x1, x0, x0, lsl #1 // b = 3 a
2327 sub x1, x0, x1, lsl #6 // b = -191 a
2328 sub x1, x1, x1, lsl #3 // b = 1337 a
2329
2330#else
2331 notimpl
2332#endif
2333
2334 ret
2335
2336endproc
2337
2338proc x21
2339
2340 // multiply complex numbers a + b i and c + d i
2341 //
2342 // (a + b i) (c + d i) = (a c - b d) + (a d + b c) i
2343 //
2344 // somewhat slick approach uses only three multiplications
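	//
	// (to check the identity: c (a + b) - b (c + d) = a c - b d, and
	// a (d - c) + c (a + b) = a d + b c.)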
2345
2346#if defined(__x86_64__)
2347
2348 mov rsi, rax // t = a
2349 add rax, rbx // a' = a + b
2350 mov rdi, rdx // u = d
2351 sub rdx, rcx // d' = d - c
2352 add rdi, rcx // u = c + d
2353
2354 imul rax, rcx // a' = c (a + b)
2355 imul rsi, rdx // t = a (d - c)
2356 imul rdi, rbx // u = b (c + d)
2357
2358 add rsi, rax // t = a (d - c) + c (a + b)
2359 mov rbx, rsi // b' = a (d - c) + c (a + b)
2360 // = a d + b c
2361 sub rax, rdi // a' = c (a + b) - b (c + d)
2362 // = a c - b d
2363
2364#elif defined(__i386__)
2365
2366 mov esi, eax // t = a
2367 add eax, ebx // a' = a + b
2368 mov edi, edx // u = d
2369 sub edx, ecx // d' = d - c
2370 add edi, ecx // u = c + d
2371
2372 imul eax, ecx // a' = c (a + b)
2373 imul esi, edx // t = a (d - c)
2374 imul edi, ebx // u = b (c + d)
2375
2376 add esi, eax // t = a (d - c) + c (a + b)
2377 mov ebx, esi // b' = a (d - c) + c (a + b)
2378 // = a d + b c
2379 sub eax, edi // a' = c (a + b) - b (c + d)
2380 // = a c - b d
2381
2382#elif defined(__arm__)
2383
2384 add r4, r0, r1 // t = a + b
2385 add r5, r2, r3 // u = c + d
2386 sub r3, r3, r2 // d' = d - c
2387
2388 // mls introduced in armv7
2389 mul r4, r4, r2 // t = c (a + b)
2390 mov r2, r1 // c' = a (bah!)
2391 mla r1, r0, r3, r4 // b' = a (d - c) + c (a + b)
2392 // = a d + b c
2393 mls r0, r2, r5, r4 // a' = c (a + b) - b (c + d)
2394 // = a c - b d
2395
2396#elif defined(__aarch64__)
2397
2398 add x4, x0, x1 // t = a + b
2399 add x5, x2, x3 // u = c + d
2400 sub x3, x3, x2 // d' = d - c
2401
2402	// mls introduced in armv7
2403 mul x4, x4, x2 // t = c (a + b)
2404 mov x2, x1 // c' = a (bah!)
2405 madd x1, x0, x3, x4 // b' = a (d - c) + c (a + b)
2406 // = a d + b c
2407 msub x0, x2, x5, x4 // a' = c (a + b) - b (c + d)
2408 // = a c - b d
2409
2410#else
2411 notimpl
2412#endif
2413
2414 ret
2415
2416endproc
2417
2418proc x22
2419
2420 // divide by 3
2421
2422#if defined(__x86_64__)
2423
2424 mov rdx, 0xaaaaaaaaaaaaaaab // = ceil(2/3 2^64)
2425 mul rdx // d' || a' =~ 2/3 a 2^64
2426 shr rdx, 1 // d' = floor(a/3)
2427 mov rax, rdx // a' = floor(a/3)
2428
2429 // we start with 0 <= a < 2^64. write f = ceil(2/3 2^64), so that
2430 // 2/3 < f/2^64 < 2/3 + 1/2^64. then floor(2/3 a) <= floor(a f/2^64)
2431 // <= floor(2/3 a + a/2^64), but a < 2^64 so a/2^64 < 1 and
2432 // floor(a f/2^64) = floor(2/3 a).
2433
2434#elif defined(__i386__)
2435
2436 mov edx, 0xaaaaaaab // = ceil(2/3 2^32)
2437 mul edx // d' || a' =~ 2/3 a 2^32
2438 shr edx, 1 // d' = floor(a/3)
2439 mov eax, edx // a' = floor(a/3)
2440
2441#elif defined(__arm__)
2442
2443 ldr r12, =0xaaaaaaab
2444 umull r12, r0, r0, r12
2445 mov r0, r0, lsr #1
2446
2447#elif defined(__aarch64__)
2448
2449 ldr x16, =0xaaaaaaaaaaaaaaab
2450 umulh x0, x0, x16
2451 lsr x0, x0, #1
2452
2453#else
2454 notimpl
2455#endif
2456
2457 ret
2458
2459endproc
2460
2461proc x23
2462
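	// reduce a modulo 3 by folding base-4 digits: since 4 == 1 (mod 3),
	// writing a = 4 q + r gives a == q + r (mod 3), so each pass of the
	// loop below shrinks a while preserving its residue class.
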
2463#if defined(__x86_64__)
2464
2465 // main loop: shorten a preserving residue class mod 3
24660: cmp rax, 5
2467 jbe 8f
2468 // a > 5
2469 mov rdx, rax // d' = a
2470 shr rdx, 2 // d' = floor(a/4)
2471 and rax, 3 // a = 4 d' + a' (0 <= a' < 4)
2472 add rax, rdx // a' == a (mod 3) but a' < a/4 + 4
2473 jmp 0b
2474
2475 // fix up final value 0 <= a < 6: want 0 <= a < 3
2476 //
2477 // the tricky part is actually a = 3; but the other final cases take
2478 // additional iterations which we can avoid.
24798: cmp rax, 3 // set cf iff a < 3
2480 cmc // set cf iff a >= 3
2481 sbb rdx, rdx // d' = a >= 3 ? -1 : 0
2482 and rdx, 3 // d' = a >= 3 ? 3 : 0
2483 sub rax, rdx // a' = a - (a >= 3 ? 3 : 0)
2484 // = a (mod 3)
2485
2486#elif defined(__i386__)
2487
2488 // main loop: shorten a preserving residue class mod 3
24890: cmp eax, 5
2490 jbe 8f
2491 // a > 5
2492 mov edx, eax // d' = a
2493 shr edx, 2 // d' = floor(a/4)
2494 and eax, 3 // a = 4 d' + a' (0 <= a' < 4)
2495 add eax, edx // a' == a (mod 3) but a' < a/4 + 4
2496 jmp 0b
2497
2498 // fix up final value 0 <= a < 6: want 0 <= a < 3
2499 //
2500 // the tricky part is actually a = 3; but the other final cases take
2501 // additional iterations which we can avoid.
25028: cmp eax, 3 // set cf iff a < 3
2503 cmc // set cf iff a >= 3
2504 sbb edx, edx // d' = a >= 3 ? -1 : 0
2505 and edx, 3 // d' = a >= 3 ? 3 : 0
2506 sub eax, edx // a' = a - (a >= 3 ? 3 : 0)
2507 // = a (mod 3)
2508
2509#elif defined(__arm__)
2510
25110: cmp r0, #6
2512 andhs r12, r0, #3
2513 addhs r0, r12, r0, lsr #2
2514 bhs 0b
2515
2516 cmp r0, #3
2517 subhs r0, r0, #3
2518
2519#elif defined(__aarch64__)
2520
25210: cmp x0, #6
2522 // blunder on through regardless since this doesn't affect the result
2523 and x16, x0, #3
2524 add x0, x16, x0, lsr #2
2525 b.hs 0b
2526
2527 subs x16, x0, #3
2528 cmov.hs x0, x16
2529
2530#else
2531 notimpl
2532#endif
2533
2534 ret
2535
2536endproc
2537
2538proc x24
2539
2540 // invert (odd) a mod 2^64
2541 //
2542 // suppose a a_i == 1 (mod 2^{2^i})
2543 //
2544 // clearly good for i = 0, since 2^i = 1 and 2^{2^i} = 2, and a_0 =
2545 // a == 1 (mod 2) by assumption
2546 //
2547 // write a a_i == b_i 2^{2^i} + 1 (mod 2^{2^{i+1}})
2548 // then b_i == (a a_i - 1)/2^{2^i} (mod 2^{2^i})
2549 // to lift inverse, we want x such that a x == -b_i (mod 2^{2^i});
2550 // clearly x = -a_i b_i will do, since a a_i == 1 (mod 2^{2^i})
2551 // then:
2552 // a_{i+1} = a_i - a_i b_i 2^{2^i} = a_i (1 - (a a_i - 1))
2553 // = 2 a_i - a a_i^2
2554 //
2555 // check:
2556 // a a_{i+1} = 2 a a_i - a^2 a_i^2
2557 // == 2 a a_i - (b_i 2^{2^i} + 1)^2
2558 // == 2 (b_i 2^{2^i} + 1) -
2559 // (b_i^2 2^{2^{i+1}} + 2 b_i 2^{2^i} + 1)
2560 // == 1 (mod 2^{2^{i+1}})
2561
2562#if defined(__x86_64__)
2563
2564 // rax // a_0 = a
2565 mov rbx, rax // b' = a
2566 mov rsi, rax // t = a_0
2567
25680:
2569 cmp rbp, 0
2570 je 1f
2571 stosq
2572 dec rbp
25731:
2574 mul rbx // a' = a a_i
2575 mov rcx, rax // c = a a_i
2576
2577 sub rax, 2 // a' = a a_i - 2
2578 neg rax // a' = 2 - a a_i
2579 mul rsi // a_{i+1} = a_i (2 - a a_i)
2580 // = 2 a_i - a a_i^2
2581 mov rsi, rax // t = a_{i+1}
2582
2583 cmp rcx, 1 // done?
2584 ja 0b // no -- iterate
2585
2586#elif defined(__i386__)
2587
2588 // eax // a_0 = a
2589 mov ebx, eax // b' = a
2590 mov esi, eax // t = a_0
2591
25920:
2593 cmp ebp, 0
2594 je 1f
2595 stosd
2596 dec ebp
25971:
2598 mul ebx // a' = a a_i
2599 mov ecx, eax // c = a a_i
2600
2601 sub eax, 2 // a' = a a_i - 2
2602 jb 9f // done if < 2
2603 neg eax // a' = 2 - a a_i
2604 mul esi // a_{i+1} = a_i (2 - a a_i)
2605 // = 2 a_i - a a_i^2
2606 mov esi, eax // t = a_{i+1}
2607
2608 jmp 0b // and iterate
26099: mov eax, esi // restore
2610
2611#elif defined(__arm__)
2612
2613 // r0 // a_0 = a
2614 mov r1, r0 // b' = a
2615
26160:
2617 cmp r6, #0
2618 strne r0, [r5], #4
2619 subne r6, r6, #1
2620 mul r2, r0, r1 // c = a a_i
2621 rsbs r2, r2, #2 // c = 2 - a a_i
2622 mul r0, r0, r2 // a_{i+1} = a_i (2 - a a_i)
2623 // = 2 a_i - a a_i^2
2624 blo 0b
2625
2626#elif defined(__aarch64__)
2627
2628 // x0 // a_0 = a
2629 mov x1, x0 // b' = a
2630 mov x16, #2 // because we have no rsb
2631
26320:
2633 cmp x6, #0
2634 b.eq 1f
2635 str x0, [x5], #8
2636 sub x6, x6, #1
26371:
2638 mul x2, x0, x1 // c = a a_i
2639 subs x2, x16, x2 // c = 2 - a a_i
2640 mul x0, x0, x2 // a_{i+1} = a_i (2 - a a_i)
2641 // = 2 a_i - a a_i^2
2642 b.lo 0b
2643
2644#else
2645 notimpl
2646#endif
2647
2648 ret
2649
2650endproc
2651
2652proc x25
2653
2654 // a poor approximation to pi/4
2655 //
2656 // think of x and y as being in 16.16 fixed-point format. we sample
2657 // points in the unit square, and determine how many of them are
2658 // within a unit quarter-circle centred at the origin. the area of
2659 // the quarter-circle is pi/4.
2660
#if defined(__x86_64__)

	xor	eax, eax		// a = 0
	mov	rcx, 1
	shl	rcx, 0x20		// c =~ 4 billion

0:	movzx	rbx, cx			// x = low 16 bits of c
	imul	rbx, rbx		// b = x^2

	ror	rcx, 0x10		// switch halves of c
	movzx	rdx, cx			// y = high 16 bits of c
	imul	rdx, rdx		// d = y^2
	rol	rcx, 0x10		// switch back

	add	rbx, rdx		// r^2 = x^2 + y^2
	shr	rbx, 0x20		// r^2 >= 1?
	cmp	rbx, 1			// set cf iff r^2 < 1, i.e., inside
	adc	rax, 0			// and add the carry onto the count
	loop	0b

#elif defined(__i386__)

	// this is actually better done in 32 bits: the add overflows
	// exactly when r^2 >= 1, so no shift or compare is needed.  the
	// carry has the wrong sense, though -- it's set for points
	// *outside* the quarter-circle -- so deduct one for each point
	// outside rather than adding one for each point inside.  there
	// are 2^32 == 0 (mod 2^32) points in total, so the count comes
	// out the same.
	xor	eax, eax
	xor	ecx, ecx

0:	movzx	ebx, cx			// x = low 16 bits of c
	imul	ebx, ebx		// b = x^2

	ror	ecx, 0x10		// switch halves of c
	movzx	edx, cx			// y = high 16 bits of c
	imul	edx, edx		// d = y^2
	rol	ecx, 0x10		// switch back

	add	ebx, edx		// cf set iff r^2 >= 1 -- see?
	sbb	eax, 0			// so deduct one for points outside
	loop	0b

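	// (concretely: roughly (1 - pi/4) 2^32 =~ 0x36f0255e points land
	// outside, so eax finishes around -0x36f0255e == 0xc90fdaa2
	// (mod 2^32) -- the same count the 64-bit version accumulates
	// directly.)
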
#elif defined(__arm__)

	mov	r0, #0			// a = 0
	mov	r2, #0			// c = 0

0:	uxth	r1, r2, ror #0		// x = low 16 bits of c
	uxth	r3, r2, ror #16		// y = high 16 bits of c
	mul	r1, r1, r1		// x^2
	mul	r3, r3, r3		// y^2
	cmn	r1, r3			// mlas doesn't set cf usefully
	addcc	r0, r0, #1		// no carry means inside -- count it
	adds	r2, r2, #1		// step c, looping until it wraps
	bne	0b

#elif defined(__aarch64__)

	mov	w0, #0			// a = 0
	mov	w2, #0			// c = 0

0:	ubfx	w1, w2, #0, #16		// x = low 16 bits of c
	ubfx	w3, w2, #16, #16	// y = high 16 bits of c
	sub	w2, w2, #1		// step c for next time
	mul	w1, w1, w1		// x^2
	mul	w3, w3, w3		// y^2
	cmn	w1, w3			// cf set iff r^2 >= 1
	cinc.cc	w0, w0			// no carry means inside -- count it
	cbnz	w2, 0b			// loop until c comes back to zero

#else
	notimpl
#endif

	ret

endproc

proc x26

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc x27

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc x28

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc x29

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc x2a

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc x2b

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc x2c

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc x2d

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc x2e

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc x2f

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

///--------------------------------------------------------------------------
/// 0x30--0x3f

proc x30

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

	ret

endproc

proc x31

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc x32

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc x33

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc x34

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc x35

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc x36

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc x37

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc x38

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc x39

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc x3a

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc x3b

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc x3c

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc x3d

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc x3e

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

proc x3f

#if defined(__x86_64__)

	notimpl

#elif defined(__i386__)

	notimpl

#elif defined(__arm__)

	notimpl

#elif defined(__aarch64__)

	notimpl

#else
	notimpl
#endif

endproc

///----- That's all, folks --------------------------------------------------