Initial version. AMD64 only, and only as far as 0x16.
[xchg-rax-rax] / xchg.S
1 /// -*- mode: asm; asm-comment-char: ?/ -*-
2
3 .intel_syntax noprefix
4
5 .section .note.GNU-stack, "", @progbits
6
// proc NAME ... endproc -- bracket the definition of a global function.
//
// proc aligns the location counter to 16 bytes, exports NAME, marks it
// as a function symbol and places its label.  it also defines a one-shot
// endproc macro; endproc records the size of NAME (the distance from the
// label to the point where endproc is used) and then purges itself, so
// that the next proc can define a fresh endproc for its own name.
.macro proc name
        .balign 16
        .global \name
        .type \name, @function
\name\():
        .macro endproc
        .size \name, . - \name
        .purgem endproc
        .endm
.endm
17
// ch C -- debugging aid: write the character C to stdout and flush it,
// leaving rflags and every general-purpose register as they were.
//
// C may be any operand that can be moved into rdi.  the macro can be
// dropped in at any point, whatever the stack alignment happens to be: it
// realigns rsp itself before calling into libc.
//
// preserved: rflags and all of the caller-saved integer registers of the
// system v amd64 abi (rax, rcx, rdx, rsi, rdi, r8-r11).  rbp is saved
// too, because it is borrowed to remember the unaligned rsp.  vector and
// x87 state is not preserved.  the saves are pushed below rsp, so do not
// use this where the red zone holds live data.
.macro ch c
        pushf
        push rax
        push rcx
        push rdx
        push rsi
        push rdi
        push r8
        push r9
        push r10                        // r10 and r11 are caller-saved too:
        push r11                        // libc is free to trash them
        push rbp
        mov rbp, rsp                    // remember the unaligned rsp
        and rsp, -16                    // abi: 16-byte alignment at a call
        cld                             // abi: df clear on function entry

        mov rdi, \c
        call putchar@plt

        mov rdi, [rip + stdout@GOTPCREL] // address of stdout, via the got
        mov rdi, [rdi]                  // the FILE pointer itself
        call fflush@plt

        mov rsp, rbp
        pop rbp
        pop r11
        pop r10
        pop r9
        pop r8
        pop rdi
        pop rsi
        pop rdx
        pop rcx
        pop rax
        popf                            // also restores the original df
.endm
48
49 .text
50
proc call_example

        // call_example(func, regs): run func with a chosen register state
        // and report the state it leaves behind.
        //
        // in:  rdi = address of the code to run; it is entered as if by
        //            call and must come back with ret at the same stack
        //            depth
        //      rsi = address of a block of eight 64-bit slots:
        //               +0 rax    +8 rbx   +16 rcx   +24 rdx
        //              +32 rsi   +40 rdi   +48 rbp   +56 rflags
        // out: the same block, overwritten with the values func left in
        //      those registers.
        //
        // only the bits in 0x0cd5 (cf, pf, af, zf, sf, df, of) are taken
        // from the rflags slot; all other flag bits keep their current
        // values.  on the way out the slot receives the whole of rflags.
        //
        // the caller's rbx, rbp, r10-r15 and rflags are restored before
        // returning, so func may clobber any of them.
        //
        // stack pictures in the comments list the top of the stack first.

        push rbx                        // rbx
        push r10
        push r11
        push r12
        push r13
        push r14
        push r15
        push rbp                        // rbp, ..., rbx
        pushf                           // flags, rbp, ..., rbx

        // one slot of padding, so that func sees rsp = 8 (mod 16) on
        // entry, exactly as after a real call; without it, a func which
        // calls out to abi-conforming code would do so misaligned.
        sub rsp, 8                      // pad, flags, rbp, ..., rbx
        push rsi                        // regs, pad, flags, rbp, ..., rbx

        lea rax, [rip + 9f]
        push rax                        // cont, regs, pad, flags, ..., rbx
        push rdi                        // func, cont, regs, pad, flags, ...

        // merge the requested status bits into the live flags
        mov rax, [rsi + 56]
        pushf
        pop rcx
        and rax, 0x0cd5
        and rcx, ~0x0cd5
        or rax, rcx
        push rax
        popf

        // load the requested registers; none of these moves touch the
        // flags, and rsi goes last because it is the base pointer
        mov rax, [rsi + 0]
        mov rbx, [rsi + 8]
        mov rcx, [rsi + 16]
        mov rdx, [rsi + 24]
        mov rdi, [rsi + 40]
        mov rbp, [rsi + 48]
        mov rsi, [rsi + 32]

        ret                             // -> func; cont, regs, pad, flags, ...

        // func returns here
9:      pushf                           // rflags, regs, pad, flags, ..., rbx
        push rsi                        // rsi, rflags, regs, pad, flags, ...
        mov rsi, [rsp + 16]             // recover the regs pointer
        mov [rsi + 0], rax
        mov [rsi + 8], rbx
        mov [rsi + 16], rcx
        mov [rsi + 24], rdx
        mov [rsi + 40], rdi
        mov [rsi + 48], rbp
        pop rax                         // rflags, regs, pad, flags, ..., rbx
        mov [rsi + 32], rax             // that was the rsi func left
        pop rax                         // regs, pad, flags, rbp, ..., rbx
        mov [rsi + 56], rax             // and that was its rflags

        add rsp, 16                     // flags, rbp, ..., rbx
        popf                            // rbp, ..., rbx
        pop rbp                         // ..., rbx
        pop r15
        pop r14
        pop r13
        pop r12
        pop r11
        pop r10
        pop rbx                         //
        ret

endproc
114
proc nop

        // do nothing: return at once, leaving every register and flag
        // alone.
        // NOTE(review): presumably a baseline target for call_example --
        // confirm against the caller.
        ret

endproc
120
121 ///--------------------------------------------------------------------------
122
proc x00

        // clear all 64 bits of extended traditional registers, with a
        // different zeroing trick for each one.  (writing a 32-bit
        // register also clears the upper half of its 64-bit parent, which
        // is why the eax, esi and edi forms are enough.)
        //
        // out: rax = rbx = rcx = rdx = rsi = rdi = rbp = 0; flags
        // clobbered.  rbx and rbp are changed without being saved here.
        xor eax,eax                     // clear rax
        lea rbx,[0]                     // rbx -> _|_
        loop .                          // iterate, decrement rcx until zero;
                                        // the branch target is the loop
                                        // instruction itself
        mov rdx,0                       // set rdx = 0
        and esi,0                       // clear all bits of rsi
        sub edi,edi                     // set rdi = edi - edi = 0
        push 0
        pop rbp                         // pop 0 into rbp

        ret

endproc
138
proc x01

        // advance a fibonacci pair by c steps
        //
        // on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1}
        // and f_{i+c}, where f_{i+1} = f_i + f_{i-1}
        //
        // all arithmetic is mod 2^64.  c is zero on exit.  loop decrements
        // before it tests, so c = 0 on entry means 2^64 steps, not none.
0:      xadd rax, rdx                   // a, d = a + d, a
                                        //      = f_{i+1} + f_i, f_{i+1}
                                        //      = f_{i+2}, f_{i+1}
        loop 0b                         // advance i, decrement c, iterate

        ret

endproc
153
proc x02

        // boolean canonify a: if a = 0 on entry, leave it zero; otherwise
        // set a = 1.  no branches; only a and the flags are touched.
        neg rax                         // set cf iff a /= 0
        sbb rax, rax                    // a = a - a - cf = -cf
        neg rax                         // a = cf

        ret

endproc
165
proc x03

        // set a = min(a, d) (unsigned); clobber c, d
        //
        // no branches: the borrow from d - a is smeared into an all-ones
        // or all-zeros mask, which selects whether the difference gets
        // added back onto a.
        sub rdx, rax                    // d' = d - a; set cf if a > d
        sbb rcx, rcx                    // c = -cf = -[a > d]
        and rcx, rdx                    // c = a > d ? d - a : 0
        add rax, rcx                    // a' = a > d ? d : a

        ret

endproc
177
proc x04

        // switch case?  flipping bit 5 (0x20) of al turns an ascii letter
        // into the same letter of the other case.  nothing checks that al
        // actually holds a letter, and the rest of a is left alone.
        xor al, 0x20

        ret

endproc
186
proc x05

        // answer whether 5 <= a </<= 9.
        //
        // the answer is left in the flags; a itself comes back as a - 5.
        // shifting the range down to start at zero lets a single unsigned
        // comparison test both ends at once, because anything below 5
        // wraps round to a huge value.
        sub rax, 5                      // a' = a - 5
        cmp rax, 4                      // is a' </<= 4?

        // what each condition code means after the cmp.  the first two
        // groups read a' and a as unsigned; the last group reads them as
        // signed.
        //
        //      cc              a'                      a
        //
        //      z/e             a' = 4                  a = 9
        //      nz/ne           a' /= 4                 a /= 9
        //
        //      a/nbe           a' > 4                  a > 9 or a < 5
        //      nc/ae/nb        a' >= 4                 a >= 9 or a < 5
        //      c/b/nae         a' < 4                  5 <= a < 9
        //      be/na           a' <= 4                 5 <= a <= 9
        //
        //      o               a' < -2^63 + 4          -2^63 + 5 <= a < -2^63 + 9
        //      no              a' >= -2^63 + 4         a >= -2^63 + 9 or
        //                                              a < -2^63 + 5
        //      s               -2^63 + 4 <= a' < 4     -2^63 + 9 <= a < 9
        //      ns              a' < -2^63 + 4 or       a < -2^63 + 9 or a >= 9
        //                      a' >= 4
        //      ge/nl           a' >= 4                 a >= 9 or a < -2^63 + 5
        //      l/nge           a' < 4                  -2^63 + 5 <= a < 9
        //      g/nle           a' > 4                  a > 9 or a < -2^63 + 5
        //      le/ng           a' <= 4                 -2^63 + 5 <= a <= 9

        ret

endproc
217
proc x06

        // leave a unchanged, but set zf if a = 0, cf if a /= 0, and set
        // sf to msb(a).  the flags on exit are those of the final neg,
        // whose operand is -a; so of is clear except when a = -2^63, the
        // one value whose negation overflows.
        not rax                         // a' = -a - 1
        inc rax                         // a' = -a
        neg rax                         // a' = a

        ret

endproc
229
proc x07

        // same as before (?): a comes back unchanged, and the flags on
        // exit are those of a neg applied to -a, i.e., zf set if a = 0,
        // cf set if a /= 0, sf = msb(a), of set only if a = -2^63.
        inc rax                         // a' = a + 1
        neg rax                         // a' = -a - 1
        inc rax                         // a' = -a
        neg rax                         // a' = a

        ret

endproc
241
proc x08

        // floor((a + d)/2), correctly handling overflow conditions; final cf
        // is lsb(a + d), probably uninteresting
        //
        // a and d are unsigned.  the carry out of the add is the 65th bit
        // of the sum, and rcr rotates it back in at the top, so nothing is
        // lost even when a + d does not fit in 64 bits.
        add rax, rdx                    // cf || a' = a + d
        rcr rax, 1                      // shift 65-bit result right by one
                                        // place; lsb moves into carry

        ret

endproc
253
proc x09

        // a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3
        // (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8).
        //
        // a is unsigned, and an exact half rounds up.  the last bit
        // shifted out by shr (bit 2 of a) lands in cf, and that bit is
        // precisely the round-up decision.
        shr rax, 3                      // a' = floor(a/8); cf = 1 if a ==
                                        // 4, 5, 6, 7 (mod 8)
        adc rax, 0                      // a' = floor(a/8) + cf

        ret

endproc
265
proc x0a

        // increment (c + 1)-byte little-endian bignum at rdi: the first
        // add handles the lowest byte, and the loop then ripples the
        // carry through c more bytes.
        //
        // on exit, rdi has advanced by c, c is zero, and cf is the carry
        // out of the top byte.  this relies on neither inc nor loop
        // disturbing cf between one adc and the next.  c = 0 on entry
        // means 2^64 loop iterations, not none.
        add byte ptr [rdi], 1
0:      inc rdi
        adc byte ptr [rdi], 0
        loop 0b

        ret

endproc
277
proc x0b

        // negate double-precision d:a (d is the high half, a the low)
        //
        // -(d:a) has low half -a and high half -d, less one if the low
        // half was nonzero (a borrow).  so start the high half at
        // -d - 1, then add the one back only when there was no borrow.
        not rdx                         // d' = -d - 1
        neg rax                         // a' = -a;
                                        // cf = 1 iff a /= 0
        sbb rdx, -1                     // d' = -d - cf

        ret

endproc
289
proc x0c

        // rotate is distributive over xor.
        //
        // in the pictures below, x_0 is the low 13 bits of x and x_1 is
        // the remaining high bits, so rotating right by 13 swaps the two
        // pieces over.  on exit zf is set; a, b and c are all clobbered.

        // rax                          // = a_1 || a_0
        // rbx                          // = b_1 || b_0
        mov rcx, rax                    // = a_1 || a_0

        xor rcx, rbx                    // = (a_1 XOR b_1) || (a_0 XOR b_0)
        ror rcx, 0xd                    // = (a_0 XOR b_0) || (a_1 XOR b_1)

        ror rax, 0xd                    // = a_0 || a_1
        ror rbx, 0xd                    // = b_0 || b_1
        xor rax, rbx                    // = (a_0 XOR b_0) || (a_1 XOR b_1)

        cmp rax, rcx                    // always equal

        ret

endproc
310
proc x0d

        // and is distributive over xor.
        //
        // in: a, b, c in rax, rbx, rcx.  the left-hand side of the
        // identity is built in rbx and the right-hand side in rax; on
        // exit zf is set, and a, b and d are clobbered.

        mov rdx, rbx                    // = b

        xor rbx, rcx                    // = b XOR c
        and rbx, rax                    // = a AND (b XOR c)

        and rdx, rax                    // = a AND b
        and rax, rcx                    // = a AND c
        xor rax, rdx                    // = (a AND b) XOR (a AND c)
                                        // = a AND (b XOR c)

        cmp rax, rbx                    // always equal

        ret

endproc
330
proc x0e

        // de morgan's law
        //
        // in: a, b in rax, rbx.  NOT (a AND b) is built in rcx and
        // (NOT a) OR (NOT b) in rax; on exit zf is set, and a, b and c
        // are clobbered.

        mov rcx, rax                    // = a

        and rcx, rbx                    // = a AND b
        not rcx                         // = NOT (a AND b)

        not rax                         // = NOT a
        not rbx                         // = NOT b
        or rax, rbx                     // = (NOT a) OR (NOT b)
                                        // = NOT (a AND b)

        cmp rax, rcx                    // always equal

        ret

endproc
350
proc x0f

        // replace input buffer bytes with cumulative XORs with initial a;
        // final a is XOR of all buffer bytes and initial a.
        //
        // not sure why you'd do this.
        //
        // in: rsi = buffer, c = length in bytes, al = initial value.
        // only the low byte of a takes part.  on exit rsi points just
        // past the buffer and c is zero.  c = 0 on entry means 2^64
        // iterations, not none.

        cld                             // make lodsb step rsi upwards

0:      xor [rsi], al                   // byte ^= running value
        lodsb                           // running value = new byte; rsi++
        loop 0b

        ret

endproc
367
proc x10

        // four different ways to swap a pair of registers.
        //
        // each group below exchanges a and c, and there are four groups,
        // so on exit both registers hold their original values again.
        // flags are clobbered.

        // 1: through the stack
        push rax
        push rcx
        pop rax
        pop rcx

        // 2: the three-xor trick
        xor rax, rcx                    // a' = a XOR c
        xor rcx, rax                    // c' = c XOR a XOR c = a
        xor rax, rcx                    // a' = a XOR c XOR a = c

        // 3: the same idea with add and subtract
        add rax, rcx                    // a' = a + c
        sub rcx, rax                    // c' = c - (a + c) = -a
        add rax, rcx                    // a' = a + c - a = c
        neg rcx                         // c' = a

        // 4: the instruction made for the job
        xchg rax, rcx

        ret

endproc
391
proc x11

        // assuming a is initialized to zero, set a to the inclusive or of
        // the xor-differences of corresponding bytes in the c-byte strings
        // at si and di.
        //
        // in particular, a will be zero (and zf set) if and only if the two
        // strings are equal.
        //
        // every byte is examined whatever the outcome; there is no early
        // exit.  zf survives to the ret because loop leaves the flags
        // alone, so they are still those of the last or.  dl is
        // clobbered; rsi and rdi end up just past their strings; c = 0 on
        // entry means 2^64 iterations, not none.

0:      mov dl, [rsi]
        xor dl, [rdi]                   // zero iff the two bytes match
        inc rsi
        inc rdi
        or al, dl                       // remember any difference
        loop 0b

        ret

endproc
411
proc x12

        // an obtuse way of adding two registers.  for any bit position, a
        // OR d is set if and only if at least one of a and d has a bit set
        // in that position, and a AND d is set if and only if both have a
        // bit set in that position.  essentially, then, what we've done is
        // move all of the set bits in d to a, unless there's already a bit
        // there.  this clearly doesn't change the sum.
        //
        // in short: a + d = (a OR d) + (a AND d).  c and d are clobbered.

        mov rcx, rdx                    // c' = d
        and rdx, rax                    // d' = a AND d
        or rax, rcx                     // a' = a OR d
        add rax, rdx                    // a' = (a OR d) + (a AND d) = a + d

        ret

endproc
429
proc x13

        // ok, so this is a really obtuse way of adding a and b; the result
        // is in a (and usually in d as well).  but why does it work?
        //
        // each pass leaves a + b (mod 2^64) as it was: a XOR b is the sum
        // with the carries thrown away, (a AND b) << 1 is exactly those
        // carries, and (a XOR b) + 2 (a AND b) = a + b.  each pass also
        // gives b at least one more zero bit at the bottom, so after 64
        // passes b = 0 and a holds the entire sum.
        //
        // d is only a copy of a from the start of the final pass.  it
        // equals the sum unless a carry was still on its way into bit 63
        // at that point: e.g., a = 2^64 - 1, b = 1 ends with a = 0 but
        // d = 2^63.  b and c are zero on exit.

        mov rcx, 0x40                   // carry chains at most 64 long
0:      mov rdx, rax                    // copy a'
        xor rax, rbx                    // low bits of each bitwise sum
        and rbx, rdx                    // carry bits from each bitwise sum
        shl rbx, 001                    // carry them into next position
        loop 0b

        ret

endproc
445
proc x14

        // floor((a + d)/2), like x08.
        //
        // since a + d = (a XOR d) + 2 (a AND d), half the sum is
        // ((a XOR d) >> 1) + (a AND d), and neither term can overflow.
        // c is clobbered; d is left alone.

        mov rcx, rax                    // copy a for later
        and rcx, rdx                    // carry bits

        xor rax, rdx                    // low bits of each bitwise sum
        shr rax, 1                      // divide by 2; carries now in place

        add rax, rcx                    // add the carries; done

        ret

endproc
461
proc x15

        // sign extension 32 -> 64 bits.
        //
        // this only works if bits 32--63 of a are clear on entry; if they
        // are not, they take part in the add and the result is something
        // else.  d is clobbered.

        //movsx rbx, eax                // like this?  (the assembler wants
                                        // movsxd for a 32-bit source)

        mov rdx, 0xffffffff80000000
        add rax, rdx                    // if bit 31 of a is set then bits
                                        // 31--63 of a' are clear; otherwise,
                                        // these bits are all set -- which is
                                        // exactly backwards
        xor rax, rdx                    // so fix it

        ret

endproc
478
proc x16

        // xor obeys the triangle inequality:
        //
        //      a XOR c <= (a XOR b) + (b XOR c)        (unsigned)
        //
        // because a XOR c = (a XOR b) XOR (b XOR c), and x XOR y can never
        // exceed x + y.  the final cmp therefore always comes out `be'.
        // when the add carries, rsi has wrapped and is no longer the true
        // sum, so the cmovc arranges for the left-hand side to collapse
        // to zero, which still compares below-or-equal.
        //
        // a, b and rsi are clobbered; c is changed only by the shift.

        // move the low byte of each input to the top of its register.
        // NOTE(review): presumably so that small test values can make the
        // add below actually carry -- confirm.
        shl rax, 56
        shl rbx, 56
        shl rcx, 56

        xor rax, rbx                    // a' = a XOR b
        xor rbx, rcx                    // b' = b XOR c
        mov rsi, rax                    // t = a XOR b
        add rsi, rbx                    // t = (a XOR b) + (b XOR c)
        cmovc rax, rbx                  // a' = cf ? b XOR c : a XOR b
        xor rax, rbx                    // a' = cf ? 0 : a XOR c
        cmp rax, rsi                    // a' <= t, always

        ret

endproc
496
proc x17

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x18

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x19

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x1a

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x1b

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x1c

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x1d

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x1e

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x1f

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc
550
proc x20

        // not implemented yet: raise an invalid-opcode trap if called.
        // (nothing may follow the ud2: control never gets past it.)
        ud2

endproc
558
proc x21

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x22

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x23

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x24

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x25

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x26

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x27

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x28

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x29

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x2a

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x2b

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x2c

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x2d

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x2e

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x2f

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc
648
proc x30

        // not implemented yet: raise an invalid-opcode trap if called.
        // (nothing may follow the ud2: control never gets past it.)
        ud2

endproc
656
proc x31

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x32

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x33

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x34

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x35

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x36

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x37

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x38

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x39

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x3a

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x3b

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x3c

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x3d

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x3e

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc

proc x3f

        // not implemented yet: raise an invalid-opcode trap if called
        ud2

endproc