long u;
};
+#if defined(__i386__)
+# define GPREGS(_) \
+ _(a) _(b) _(c) _(d) _(si) _(di) _(bp)
+#elif defined(__x86_64__)
+# define GPREGS(_) \
+ _(a) _(b) _(c) _(d) _(si) _(di) _(bp) \
+ _(r8) _(r9) _(r10) _(r11) _(r12) _(r13) _(r14) _(r15)
+#elif defined(__arm__)
+# define GPREGS(_) \
+ _(r0) _(r1) _(r2) _(r3) _(r4) _(r5) _(r6) _(r7) \
+ _(r8) _(r9) _(r10) _(r11) _(r12)
+#elif defined(__aarch64__)
+# define GPREGS(_) \
+ _(x0) _(x1) _(x2) _(x3) _(x4) _(x5) _(x6) _(x7) \
+ _(x8) _(x9) _(x10) _(x11) _(x12) _(x13) _(x14) _(x15)
+#else
+# error "not supported"
+#endif
+
+enum {
+#define DEFCONST(r) R_##r,
+ GPREGS(DEFCONST)
+#undef DEFCONST
+ R_flags,
+ NREGS
+};
+
+static const char *const rname[] = {
+#define DEFNAME(r) #r,
+ GPREGS(DEFNAME)
+#undef DEFNAME
+ "f"
+};
+
struct regs {
- union reg
- a, b, c, d, si, di, bp, f,
- r8, r9, r10, r11, r12, r13, r14, r15;
+ union reg r[NREGS];
};
struct seg {
#define N(v) (sizeof(v)/sizeof((v)[0]))
+#define STRCMP(a, op, b) (strcmp((a), (b)) op 0)
+#define STRNCMP(a, op, b, n) (strncmp((a), (b), (n)) op 0)
#define CTYPE_HACK(func, ch) func((unsigned char)(ch))
#define ISDIGIT(ch) CTYPE_HACK(isdigit, ch)
#define ISSPACE(ch) CTYPE_HACK(isspace, ch)
else return (-1);
}
-static void setreg(union reg *r,
- struct seg **seg_inout,
- int *i_inout, int argc, char *argv[])
+static void setreg(union reg *r, struct seg **seg_inout, const char *p)
{
struct seg *seg;
- const char *p, *pp;
+ const char *pp;
unsigned char *q;
int hi, lo;
size_t n;
#define LONG_REG(p) (parse_long("signed register", (p), LONG_MIN, LONG_MAX))
#define ULONG_REG(p) (parse_ulong("unsigned register", (p), 0, ULONG_MAX))
- p = *i_inout >= argc ? "-" : argv[(*i_inout)++];
switch (*p) {
case '-':
if (p[1]) r->i = LONG_REG(p);
- else r->u = 0xdeadbeefdeadbeef;
break;
case 'i':
if (p[1] != ':') goto bad;
}
r->p = seg->p;
break;
+ case 'z':
+ if (p[1] != ':') goto bad;
+ n = parse_ulong("buffer length", p + 2, 0, ~(size_t)0);
+ seg = (*seg_inout)++; seg->p = q = xmalloc(n); seg->sz = n;
+ r->p = q; memset(q, 0, n);
+ break;
default:
if (ISDIGIT(*p)) r->u = ULONG_REG(p);
else if (*p == '+') r->i = LONG_REG(p);
{
size_t i;
+#if ULONG_MAX == 0xffffffff
+ printf("%3s = 0x%08lx = %20ld = %20lu", name, r->u, r->i, r->u);
+#else
printf("%3s = 0x%016lx = %20ld = %20lu", name, r->u, r->i, r->u);
+#endif
if (r->u >= ' ' && r->u <= '~') printf(" = '%c'", (int)r->u);
for (i = 0; i < nseg; i++) {
if (r->p == seg[i].p)
{
struct regs r;
struct seg seg[16], *segp = seg;
- size_t nseg;
- int i, j;
+ size_t nseg, n;
+ const char *p;
+ char *q;
+ unsigned long f;
+ int i, j, k;
+ unsigned long l;
prog = strrchr(argv[0], '/'); if (prog) prog++; else prog = argv[0];
if (argc < 2)
- barf("usage: %s I [A B C D SI DI BP R8 R9 R10 R11 R12 R13 R14 R15 F]",
+ barf("usage: %s I [REG...]",
prog);
j = parse_long("program index", argv[1], -1, N(x) - 1);
- i = 2;
- setreg(&r.a, &segp, &i, argc, argv);
- setreg(&r.b, &segp, &i, argc, argv);
- setreg(&r.c, &segp, &i, argc, argv);
- setreg(&r.d, &segp, &i, argc, argv);
- setreg(&r.si, &segp, &i, argc, argv);
- setreg(&r.di, &segp, &i, argc, argv);
- setreg(&r.bp, &segp, &i, argc, argv);
- setreg(&r.r8, &segp, &i, argc, argv);
- setreg(&r.r9, &segp, &i, argc, argv);
- setreg(&r.r10, &segp, &i, argc, argv);
- setreg(&r.r11, &segp, &i, argc, argv);
- setreg(&r.r12, &segp, &i, argc, argv);
- setreg(&r.r13, &segp, &i, argc, argv);
- setreg(&r.r14, &segp, &i, argc, argv);
- setreg(&r.r15, &segp, &i, argc, argv);
- setreg(&r.f, &segp, &i, argc, argv);
- nseg = segp - seg;
+#if ULONG_MAX == 0xffffffff
+# define DEAD 0xdeadbeef
+#else
+# define DEAD 0xdeadbeefdeadbeef
+#endif
+ for (i = 0; i < NREGS - 1; i++) r.r[i].u = DEAD;
+#undef DEAD
+ r.r[R_flags].u = 0;
+
+ i = 0;
+ argv += 2;
+ while (*argv) {
+ p = *argv++;
+ if (ISDIGIT(*p)) {
+ l = strtoul(p, &q, 10);
+ if (l < NREGS && *q == '=') { p = q + 1; i = l; }
+ } else for (k = 0; k < NREGS; k++) {
+ n = strlen(rname[k]);
+ if (STRNCMP(p, ==, rname[k], n) && p[n] == '=')
+ { i = k; p += n + 1; break; }
+ }
+ if (i >= NREGS) barf("too many registers");
+ setreg(&r.r[i], &segp, p); i++;
+ }
+ nseg = segp - seg;
call_example(j < 0 ? &nop : x[j], &r);
- dumpreg("rax", &r.a, seg, nseg);
- dumpreg("rbx", &r.b, seg, nseg);
- dumpreg("rcx", &r.c, seg, nseg);
- dumpreg("rdx", &r.d, seg, nseg);
- dumpreg("rsi", &r.si, seg, nseg);
- dumpreg("rdi", &r.di, seg, nseg);
- dumpreg("rbp", &r.bp, seg, nseg);
- dumpreg("rbp", &r.bp, seg, nseg);
- dumpreg("r8", &r.r8, seg, nseg);
- dumpreg("r9", &r.r9, seg, nseg);
- dumpreg("r10", &r.r10, seg, nseg);
- dumpreg("r11", &r.r11, seg, nseg);
- dumpreg("r12", &r.r12, seg, nseg);
- dumpreg("r13", &r.r13, seg, nseg);
- dumpreg("r14", &r.r14, seg, nseg);
- dumpreg("r15", &r.r15, seg, nseg);
+ for (i = 0; i < NREGS; i++) dumpreg(rname[i], &r.r[i], seg, nseg);
+
+ f = r.r[R_flags].u;
+
+#if defined(__i386__) || defined(__x86_64__)
#define CF (1 << 0)
#define PF (1 << 2)
#define SF (1 << 7)
#define OF (1 << 11)
- dumpreg("f", &r.f, seg, nseg);
printf("\tstatus: %ccf %cpf %caf %czf %csf %cdf %cof\n",
- (r.f.u >> 0)&1u ? '+' : '-',
- (r.f.u >> 2)&1u ? '+' : '-',
- (r.f.u >> 4)&1u ? '+' : '-',
- (r.f.u >> 6)&1u ? '+' : '-',
- (r.f.u >> 7)&1u ? '+' : '-',
- (r.f.u >> 10)&1u ? '+' : '-',
- (r.f.u >> 11)&1u ? '+' : '-');
+ (f >> 0)&1u ? '+' : '-',
+ (f >> 2)&1u ? '+' : '-',
+ (f >> 4)&1u ? '+' : '-',
+ (f >> 6)&1u ? '+' : '-',
+ (f >> 7)&1u ? '+' : '-',
+ (f >> 10)&1u ? '+' : '-',
+ (f >> 11)&1u ? '+' : '-');
printf("\tcond:");
- if (r.f.u&CF) printf(" c/b/nae"); else printf(" nc/ae/nb");
- if (r.f.u&ZF) printf(" e/z"); else printf(" ne/nz");
- if (r.f.u&SF) printf(" s"); else printf(" ns");
- if (r.f.u&OF) printf(" o"); else printf(" no");
- if (r.f.u&PF) printf(" p"); else printf(" np");
- if ((r.f.u&CF) || (r.f.u&ZF)) printf(" be/na"); else printf(" a/nbe");
- if (!(r.f.u&OF) == !(r.f.u&SF)) printf(" ge/nl"); else printf(" l/nge");
- if (!(r.f.u&OF) == !(r.f.u&SF) && !(r.f.u&ZF))
+ if (f&CF) printf(" c/b/nae"); else printf(" nc/ae/nb");
+ if (f&ZF) printf(" e/z"); else printf(" ne/nz");
+ if (f&SF) printf(" s"); else printf(" ns");
+ if (f&OF) printf(" o"); else printf(" no");
+ if (f&PF) printf(" p"); else printf(" np");
+ if ((f&CF) || (f&ZF)) printf(" be/na"); else printf(" a/nbe");
+ if (!(f&OF) == !(f&SF)) printf(" ge/nl"); else printf(" l/nge");
+ if (!(f&OF) == !(f&SF) && !(f&ZF))
printf(" g/nle"); else printf(" le/ng");
putchar('\n');
printf("\tsystem: %ctf %cif iopl=%d %cnt "
"%crf %cvm %cac %cvif %cvip %cid\n",
- (r.f.u >> 8)&1u ? '+' : '-',
- (r.f.u >> 9)&1u ? '+' : '-',
- (int)((r.f.u >> 12)&1u),
- (r.f.u >> 14)&1u ? '+' : '-',
- (r.f.u >> 16)&1u ? '+' : '-',
- (r.f.u >> 17)&1u ? '+' : '-',
- (r.f.u >> 18)&1u ? '+' : '-',
- (r.f.u >> 19)&1u ? '+' : '-',
- (r.f.u >> 20)&1u ? '+' : '-',
- (r.f.u >> 21)&1u ? '+' : '-');
+ (f >> 8)&1u ? '+' : '-',
+ (f >> 9)&1u ? '+' : '-',
+	       (int)((f >> 12)&3u),
+ (f >> 14)&1u ? '+' : '-',
+ (f >> 16)&1u ? '+' : '-',
+ (f >> 17)&1u ? '+' : '-',
+ (f >> 18)&1u ? '+' : '-',
+ (f >> 19)&1u ? '+' : '-',
+ (f >> 20)&1u ? '+' : '-',
+ (f >> 21)&1u ? '+' : '-');
#undef CF
#undef PF
#undef SF
#undef OF
+#elif defined(__arm__)
+
+#define NF (1u << 31)
+#define ZF (1u << 30)
+#define CF (1u << 29)
+#define VF (1u << 28)
+
+ {
+ static const char
+ *modetab[] = { "?00", "?01", "?02", "?03", "?04", "?05", "?06", "?07",
+ "?08", "?09", "?10", "?11", "?12", "?13", "?14", "?15",
+ "usr", "fiq", "irq", "svc", "?20", "?21", "mon", "abt",
+ "?24", "?25", "hyp", "und", "?28", "?29", "?30", "sys" },
+ *condtab[] = { "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
+ "hi", "ls", "ge", "lt", "gt", "le", "al", "nv" };
+
+ printf("\tuser: %cn %cz %cc %cv %cq ge=%c%c%c%c;",
+ (f >> 31)&1u ? '+' : '-',
+ (f >> 30)&1u ? '+' : '-',
+ (f >> 29)&1u ? '+' : '-',
+ (f >> 28)&1u ? '+' : '-',
+ (f >> 27)&1u ? '+' : '-',
+ (f >> 19)&1u ? '1' : '0',
+ (f >> 18)&1u ? '1' : '0',
+ (f >> 17)&1u ? '1' : '0',
+ (f >> 16)&1u ? '1' : '0');
+ if (f&NF) printf(" mi"); else printf(" pl");
+ if (f&ZF) printf(" eq"); else printf(" ne");
+ if (f&CF) printf(" cs/hs"); else printf(" cc/lo");
+ if (f&VF) printf(" vs"); else printf(" vc");
+ if ((f&CF) && !(f&ZF)) printf(" hi"); else printf(" ls");
+ if (!(f&VF) == !(f&NF)) printf(" ge"); else printf(" lt");
+ if (!(f&VF) == !(f&NF) && !(f&ZF)) printf(" gt"); else printf(" le");
+ putchar('\n');
+ printf("\tsystem: %cj it=%s:%c%c%c%c %ce %ca %ci %cf %ct m=%s\n",
+ (f >> 24)&1u ? '+' : '-',
+ condtab[(f >> 12)&15u],
+ (f >> 11)&1u ? '1' : '0',
+ (f >> 10)&1u ? '1' : '0',
+ (f >> 26)&1u ? '1' : '0',
+ (f >> 25)&1u ? '1' : '0',
+ (f >> 9)&1u ? '+' : '-',
+ (f >> 8)&1u ? '+' : '-',
+ (f >> 7)&1u ? '+' : '-',
+ (f >> 6)&1u ? '+' : '-',
+ (f >> 5)&1u ? '+' : '-',
+ modetab[(f >> 0)&31u]);
+ }
+
+#undef NF
+#undef ZF
+#undef CF
+#undef VF
+
+#elif defined(__aarch64__)
+
+#define NF (1u << 31)
+#define ZF (1u << 30)
+#define CF (1u << 29)
+#define VF (1u << 28)
+
+ printf("\tuser: %cn %cz %cc %cv;",
+ (f >> 31)&1u ? '+' : '-',
+ (f >> 30)&1u ? '+' : '-',
+ (f >> 29)&1u ? '+' : '-',
+ (f >> 28)&1u ? '+' : '-');
+ if (f&NF) printf(" mi"); else printf(" pl");
+ if (f&ZF) printf(" eq"); else printf(" ne");
+ if (f&CF) printf(" cs/hs"); else printf(" cc/lo");
+ if (f&VF) printf(" vs"); else printf(" vc");
+ if ((f&CF) && !(f&ZF)) printf(" hi"); else printf(" ls");
+ if (!(f&VF) == !(f&NF)) printf(" ge"); else printf(" lt");
+ if (!(f&VF) == !(f&NF) && !(f&ZF)) printf(" gt"); else printf(" le");
+ putchar('\n');
+
+#undef NF
+#undef ZF
+#undef CF
+#undef VF
+
+#else
+# error "not supported"
+#endif
+
for (i = 0; i < nseg; i++)
{ printf("seg[%d] (%p):\n", i, seg[i].p); dumpseg(&seg[i]); }
-/// -*- mode: asm; asm-comment-char: ?/ -*-
+/// -*- mode: asm; asm-comment-char: 0 -*-
+
+///--------------------------------------------------------------------------
+/// Preliminaries.
+
+#include <sys/syscall.h>
+
+#if defined(__i386__) || defined(__x86_64__)
.intel_syntax noprefix
- .section .note.GNU-stack, "", @progbits
+#elif defined(__arm__)
+
+.macro ret
+ bx r14
+.endm
+
+ .arch armv7-a
+
+#elif defined(__aarch64__)
+
+.macro cmov rd, rn, cc
+ csel \rd, \rn, \rd, \cc
+.endm
+#define _COND(_) \
+ _(eq) _(ne) _(cs) _(cc) _(vs) _(vc) _(mi) _(pl) \
+ _(ge) _(lt) _(gt) _(le) _(hi) _(ls) _(al) _(nv) \
+ _(hs) _(lo)
+#define _INST(_) \
+ _(ccmp) _(ccmn) \
+ _(csel) _(cmov) \
+ _(csinc) _(cinc) _(cset) \
+ _(csneg) _(cneg) \
+ _(csinv) _(cinv) _(csetm)
+#define _CONDVAR(cc) _definstvar cc;
+#define _INSTVARS(inst) \
+ .macro _definstvar cc; \
+ .macro inst.\cc args:vararg; inst \args, \cc; .endm; \
+ .endm; \
+ _COND(_CONDVAR); \
+ .purgem _definstvar;
+ _INST(_INSTVARS)
+#undef _COND
+#undef _INST
+#undef _CONDVAR
+#undef _INSTVARS
+
+#define CCMP_N 8
+#define CCMP_Z 4
+#define CCMP_C 2
+#define CCMP_V 1
+
+#define CCMP_MI CCMP_N
+#define CCMP_PL 0
+#define CCMP_EQ CCMP_Z
+#define CCMP_NE 0
+#define CCMP_CS CCMP_C
+#define CCMP_HS CCMP_C
+#define CCMP_CC 0
+#define CCMP_LO 0
+#define CCMP_VS CCMP_V
+#define CCMP_VC 0
+#define CCMP_HI CCMP_C
+#define CCMP_LS 0
+#define CCMP_LT CCMP_N
+#define CCMP_GE 0
+#define CCMP_LE CCMP_N
+#define CCMP_GT 0
+
+#else
+# error "not supported"
+#endif
.macro proc name
.globl \name
.endm
.macro ch c
+#if defined(__i386__)
+
+ pushf
+ push eax
+ push ebx
+ push ecx
+ push edx
+ push ebp
+ mov ebp, esp
+ and esp, -16
+
+ push \c
+ call putchar@plt
+
+ call get_pc_ebx
+	add	ebx, offset _GLOBAL_OFFSET_TABLE_
+ mov eax, [ebx + stdout@GOT]
+ mov eax, [eax]
+ call fflush@plt
+
+ mov esp, ebp
+ pop ebp
+ pop edx
+ pop ecx
+ pop ebx
+ pop eax
+ popf
+
+#elif defined(__x86_64__)
+
pushf
push rax
push rcx
pop rcx
pop rax
popf
+
+#elif defined(__arm__)
+
+ stmfd r13!, {r0-r4, r12, r14}
+
+ mov r4, r13
+ bic r14, r4, #15
+ mov r13, r14
+
+ mov r0, #\c
+ bl putchar@plt
+
+ ldr r14, .L$_c$gotoff$\@
+.L$_c$gotpc$\@:
+ add r14, pc, r14
+ b .L$_c$cont$\@
+.L$_c$gotoff$\@:
+	.word	_GLOBAL_OFFSET_TABLE_ - .L$_c$gotpc$\@ - 8
+.L$_c$cont$\@:
+ bl fflush@plt
+
+ mov r13, r4
+ ldmfd r13!, {r0-r4, r12, r14}
+
+#elif defined(__aarch64__)
+
+ sub sp, sp, #20*8
+ stp x0, x1, [sp, #0]
+ stp x2, x3, [sp, #16]
+ stp x4, x5, [sp, #32]
+ stp x6, x7, [sp, #48]
+ stp x8, x9, [sp, #64]
+ stp x10, x11, [sp, #80]
+ stp x12, x13, [sp, #96]
+ stp x14, x15, [sp, #112]
+ stp x16, x17, [sp, #128]
+ mrs x16, nzcv
+ stp x16, x30, [sp, #144]
+
+ mov w0, #\c
+ bl putchar
+ adrp x0, :got:stdout
+ ldr x0, [x0, #:got_lo12:stdout]
+ ldr x0, [x0]
+ bl fflush
+
+ ldp x16, x30, [sp, #144]
+ msr nzcv, x16
+ ldp x16, x17, [sp, #128]
+ ldp x14, x15, [sp, #112]
+ ldp x12, x13, [sp, #96]
+ ldp x10, x11, [sp, #80]
+ ldp x8, x9, [sp, #64]
+ ldp x6, x7, [sp, #48]
+ ldp x4, x5, [sp, #32]
+ ldp x2, x3, [sp, #16]
+ ldp x0, x1, [sp, #0]
+ add sp, sp, #20*8
+
+#else
+# error "not supported"
+#endif
.endm
+.macro notimpl
+#if defined(__i386__) || defined(__x86_64__)
+ ud2
+#elif defined(__arm__)
+ udf
+#elif defined(__aarch64__)
+ hlt #0
+#else
+# error "not supported"
+#endif
+.endm
+
+ .section .note.GNU-stack, "", %progbits
+
.text
+#if defined(__i386__)
+get_pc_ebx:
+ mov ebx, [esp]
+ ret
+#endif
+
+
proc call_example
+#if defined(__i386__)
+
+ push ebx // ebx
+ push esi // esi, ebx
+ push edi // edi, esi, ebx
+ push ebp // flags, ebp, ..., ebx
+ pushf
+
+ mov edi, [esp + 4*6]
+ mov esi, [esp + 4*7]
+ push esi // regs, flags, ebp, ..., ebx
+
+ call get_pc_ebx
+ lea eax, [ebx + 9f - .]
+ push eax // cont, regs, flags, ebp, ..., ebx
+ push edi // func, cont, regs, flags, ebp, ..., ebx
+
+ mov eax, [esi + 28]
+ pushf
+ pop ecx
+ and eax, 0x0cd5
+ and ecx, ~0x0cd5
+ or eax, ecx
+ push eax
+ popf
+ mov eax, [esi + 0]
+ mov ebx, [esi + 4]
+ mov ecx, [esi + 8]
+ mov edx, [esi + 12]
+ mov edi, [esi + 20]
+ mov ebp, [esi + 24]
+ mov esi, [esi + 16]
+
+ ret // -> func; regs, flags, ebp, ..., ebx
+
+9: pushf // eflags, regs, flags, ebp, ..., ebx
+ push esi // esi, eflags, regs, flags, ebp, ..., ebx
+ mov esi, [esp + 8]
+ mov [esi + 0], eax
+ mov [esi + 4], ebx
+ mov [esi + 8], ecx
+ mov [esi + 12], edx
+ mov [esi + 20], edi
+ mov [esi + 24], ebp
+ pop eax // rflags, regs, flags, ebp, ..., ebx
+ mov [esi + 16], eax
+ pop eax // regs, flags, ebp, ..., ebx
+ mov [esi + 28], eax
+
+ add esp, 4 // flags, ebp, ..., ebx
+ popf // ebp, ..., ebx
+ pop ebp // ..., ebx
+ pop edi
+ pop esi
+ pop ebx //
+ ret
+
+#elif defined(__x86_64__)
+
push rbx // rbx
push r10
push r11
push rax // cont, regs, flags, rbp, ..., rbx
push rdi // func, cont, regs, flags, rbp, ..., rbx
- mov rax, [rsi + 56]
+ mov rax, [rsi + 8*15]
pushf
pop rcx
and rax, 0x0cd5
or rax, rcx
push rax
popf
- mov rax, [rsi + 0]
- mov rbx, [rsi + 8]
- mov rcx, [rsi + 16]
- mov rdx, [rsi + 24]
- mov rdi, [rsi + 40]
- mov rbp, [rsi + 48]
- mov rsi, [rsi + 32]
+ mov rax, [rsi + 0]
+ mov rbx, [rsi + 8]
+ mov rcx, [rsi + 16]
+ mov rdx, [rsi + 24]
+ mov rdi, [rsi + 40]
+ mov rbp, [rsi + 48]
+ mov r8, [rsi + 56]
+ mov r9, [rsi + 64]
+ mov r10, [rsi + 72]
+ mov r11, [rsi + 80]
+ mov r12, [rsi + 88]
+ mov r13, [rsi + 96]
+ mov r14, [rsi + 104]
+ mov r15, [rsi + 112]
+ mov rsi, [rsi + 32]
ret // -> func; regs, flags, rbp, ..., rbx
9: pushf // rflags, regs, flags, rbp, ..., rbx
push rsi // rsi, rflags, regs, flags, rbp, ..., rbx
mov rsi, [rsp + 16]
- mov [rsi + 0], rax
- mov [rsi + 8], rbx
- mov [rsi + 16], rcx
- mov [rsi + 24], rdx
- mov [rsi + 40], rdi
- mov [rsi + 48], rbp
+ mov [rsi + 0], rax
+ mov [rsi + 8], rbx
+ mov [rsi + 16], rcx
+ mov [rsi + 24], rdx
+ mov [rsi + 40], rdi
+ mov [rsi + 48], rbp
+ mov [rsi + 56], r8
+ mov [rsi + 64], r9
+ mov [rsi + 72], r10
+ mov [rsi + 80], r11
+ mov [rsi + 88], r12
+ mov [rsi + 96], r13
+ mov [rsi + 104], r14
+ mov [rsi + 112], r15
pop rax // rflags, regs, flags, rbp, ..., rbx
- mov [rsi + 32], rax
+ mov [rsi + 32], rax
pop rax // regs, flags, rbp, ..., rbx
- mov [rsi + 56], rax
+ mov [rsi + 120], rax
add rsp, 8 // flags, rbp, ..., rbx
popf // rbp, ..., rbx
pop rbx //
ret
+#elif defined(__arm__)
+
+ stmfd r13!, {r0, r1, r4-r11, r14}
+ ldmia r1, {r0-r12, r14}
+ msr cpsr, r14
+ mov r14, pc
+ ldr pc, [r13], #4
+ ldr r14, [r13], #4
+ stmia r14!, {r0-r12}
+ mrs r0, cpsr
+ str r0, [r14]
+ ldmfd r13!, {r4-r11, pc}
+
+#elif defined(__aarch64__)
+
+	stp	x29, x30, [sp, #-14*8]!
+	mov	x29, sp
+	stp	x19, x20, [sp, #16]
+	stp	x21, x22, [sp, #32]
+	stp	x23, x24, [sp, #48]
+	stp	x25, x26, [sp, #64]
+	stp	x27, x28, [sp, #80]
+	str	x1, [sp, #96]
+
+	mov	x16, x0
+
+	ldr	x17, [x1, #128]
+	ldp	x14, x15, [x1, #112]
+	ldp	x12, x13, [x1, #96]
+	ldp	x10, x11, [x1, #80]
+	ldp	x8, x9, [x1, #64]
+	ldp	x6, x7, [x1, #48]
+	ldp	x4, x5, [x1, #32]
+	ldp	x2, x3, [x1, #16]
+	ldp	x0, x1, [x1, #0]
+	msr	nzcv, x17
+
+	blr	x16
+
+	ldr	x16, [sp, #96]
+	mrs	x17, nzcv
+	str	x17, [x16, #128]
+	stp	x14, x15, [x16, #112]
+	stp	x12, x13, [x16, #96]
+	stp	x10, x11, [x16, #80]
+	stp	x8, x9, [x16, #64]
+	stp	x6, x7, [x16, #48]
+	stp	x4, x5, [x16, #32]
+	stp	x2, x3, [x16, #16]
+	stp	x0, x1, [x16, #0]
+
+	ldp	x19, x20, [sp, #16]
+	ldp	x21, x22, [sp, #32]
+	ldp	x23, x24, [sp, #48]
+	ldp	x25, x26, [sp, #64]
+	ldp	x27, x28, [sp, #80]
+	ldp	x29, x30, [sp], #14*8
+
+#else
+# error "not supported"
+#endif
+
endproc
proc nop
endproc
///--------------------------------------------------------------------------
+/// 0x00--0x0f
proc x00
// clear all 64 bits of extended traditional registers
- xor eax,eax // clear rax
- lea rbx,[0] // rbx -> _|_
+
+#if defined(__x86_64__)
+
+ xor eax, eax // clear rax
+ lea rbx, [0] // rbx -> _|_
loop . // iterate, decrement rcx until zero
- mov rdx,0 // set rdx = 0
- and esi,0 // clear all bits of rsi
- sub edi,edi // set rdi = edi - edi = 0
+ mov rdx, 0 // set rdx = 0
+ and esi, 0 // clear all bits of rsi
+ sub edi, edi // set rdi = edi - edi = 0
push 0
pop rbp // pop 0 into rbp
+#elif defined(__i386__)
+
+ xor eax, eax
+ lea ebx, [0]
+ loop .
+ mov edx, 0
+ and esi, 0
+ sub edi, edi
+ push 0
+ pop ebp
+
+#elif defined(__arm__)
+
+ eor r0, r0, r0
+ rsb r1, r1, r1
+0: subs r2, r2, #1
+ bne 0b
+ mov r3, #0
+ and r4, r4, #0
+ sub r5, r5, r5
+
+#elif defined(__aarch64__)
+
+ eor w0, w0, w0
+ mov w1, wzr
+0: sub w2, w2, #1
+ cbnz w2, 0b
+ mov w3, #0
+ and w4, w4, wzr
+ sub w5, w5, w5
+
+#else
+ notimpl
+#endif
+
ret
endproc
//
// on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1}
// and f_{i+c}, where f_{i+1} = f_i + f_{i-1}
+
+#if defined(__x86_64__)
+
0: xadd rax, rdx // a, d = a + d, a
// = f_{i+1} + f_i, f_{i+1}
// = f_{i+2}, f_{i+1}
loop 0b // advance i, decrement c, iterate
+#elif defined(__i386__)
+
+0: xadd eax, edx
+ loop 0b
+
+#elif defined(__arm__)
+
+0: subs r2, r2, #2
+ add r3, r3, r0
+ blo 8f
+ add r0, r0, r3
+ bhi 0b
+
+8: movne r0, r3
+
+#elif defined(__aarch64__)
+
+0: subs x2, x2, #2
+ add x3, x3, x0
+ b.lo 8f
+ add x0, x0, x3
+ b.hi 0b
+
+8: cmov.ne x0, x3
+
+#else
+ notimpl
+#endif
+
ret
endproc
// boolean canonify a: if a = 0 on entry, leave it zero; otherwise
// set a = 1
+
+#if defined(__x86_64__)
+
neg rax // set cf iff a /= 0
sbb rax, rax // a = a - a - cf = -cf
neg rax // a = cf
+#elif defined(__i386__)
+
+ neg eax
+ sbb eax, eax
+ neg eax
+
+#elif defined(__arm__)
+
+ movs r1, r0 // the easy way
+ movne r1, #1 // mvnne r1, #1 for mask
+
+ cmp r0, #1 // clear cf iff a == 0
+ sbc r2, r0, r0 // c' = a - a - 1 + cf = cf - 1
+ add r2, r2, #1 // c' = cf
+
+ sub r3, r0, r0, lsr #1 // d' top bit clear; d' = 0 iff a = 0
+ rsb r3, r3, #0 // d' top bit set iff a /= 0
+ mov r3, r3, lsr #31 // asr for mask
+
+ rsbs r0, r0, #0
+ sbc r0, r0, r0
+ rsb r0, r0, #0
+
+#elif defined(__aarch64__)
+
+ cmp x0, #0 // trivial
+ cset.ne x1 // csetm for mask
+
+ cmp xzr, x0 // set cf iff a == 0
+ sbc x2, x0, x0 // c' = a - a - 1 + cf = cf - 1
+ neg x2, x2 // c' = 1 - cf
+
+ sub x3, x0, x0, lsr #1 // if a < 2^63 then a' = ceil(d/2) <
+ // 2^63
+ // if a >= 2^63, write a = 2^63 + t
+ // with t < 2^63; d' = 2^63 - 2^62 +
+ // ceil(t/2) = 2^62 + ceil(t/2), and
+ // ceil(t/2) < 2^62
+ // anyway d' < 2^63 and d' = 0 iff
+ // a = 0
+ neg x3, x3 // d' top bit set iff a /= 0
+ lsr x3, x3, #63 // asr for mask
+
+ cmp x0, #1 // set cf iff a /= 0
+ adc x0, xzr, xzr // a' = 0 + 0 + cf = cf
+
+#else
+ notimpl
+#endif
+
ret
endproc
proc x03
// set a = min(a, d) (unsigned); clobber c, d
+
+#if defined(__x86_64__)
+
sub rdx, rax // d' = d - a; set cf if a > d
sbb rcx, rcx // c = -cf = -[a > d]
and rcx, rdx // c = a > d ? d - a : 0
add rax, rcx // a' = a > d ? d : a
+#elif defined(__i386__)
+
+ sub edx, eax
+ sbb ecx, ecx
+ and ecx, edx
+ add eax, ecx
+
+#elif defined(__arm__)
+
+ cmp r0, r3 // the easy way
+ movlo r1, r0 // only needed for out-of-place
+ movhs r1, r3
+
+ subs r3, r3, r0
+ sbc r12, r12, r12
+ and r12, r12, r3
+ add r0, r0, r12
+
+#elif defined(__aarch64__)
+
+ cmp x0, x3 // the easy way
+ csel.lo x1, x0, x3
+
+ subs x3, x3, x0 // d' = d - a; set cf if d >= a
+ sbc x16, xzr, xzr // t = -1 + cf = -[a > d]
+ and x16, x16, x3 // t = a > d ? d - a : 0
+ add x0, x0, x16 // a' = a > d ? d : a
+
+#else
+ notimpl
+#endif
+
ret
endproc
proc x04
// switch case?
+
+#if defined(__x86_64__)
+
+ // unrelated playing
+ mov ecx, eax
+ mov rbx, -1
+ mov edx, ecx
+ sub edx, '0'
+ cmp edx, 10
+ cmovb rbx, rdx
+ or ecx, 0x20
+ mov edx, ecx
+ sub edx, 'a'
+ sub ecx, 'a' - 10
+ cmp edx, 6
+ cmovb rbx, rcx
+
+ xor al, 0x20
+
+#elif defined(__i386__)
+
+ // unrelated playing
+ mov ecx, eax
+ mov ebx, -1
+ mov edx, ecx
+ sub edx, '0'
+ cmp edx, 10
+ cmovb ebx, edx
+ or ecx, 0x20
+ mov edx, ecx
+ sub edx, 'a'
+ sub ecx, 'a' - 10
+ cmp edx, 6
+ cmovb ebx, ecx
+
xor al, 0x20
+#elif defined(__arm__)
+
+ // unrelated playing
+ mvn r1, #0
+ sub r12, r0, #'0'
+ cmp r12, #10
+ movlo r1, r12
+ orr r12, r0, #0x20
+ sub r12, r12, #'a'
+ cmp r12, #6
+ addlo r1, r12, #10
+
+ eor r0, r0, #0x20
+
+#elif defined(__aarch64__)
+
+ // unrelated playing
+ mov x1, #-1
+ sub w16, w0, #'0'
+ cmp w16, #10
+ cmov.lo x1, x16
+ orr w16, w0, #0x20
+ sub w16, w16, #'a' - 10
+ cmp w16, #10
+ ccmp.hs w16, #16, #CCMP_HS
+ cmov.lo x1, x16
+
+ eor w0, w0, #0x20
+
+#else
+ notimpl
+#endif
+
ret
endproc
proc x05
// answer whether 5 <= a </<= 9.
+
+#if defined(__x86_64__)
+
sub rax, 5 // a' = a - 5
cmp rax, 4 // is a' - 5 </<= 4?
// g/nle a' > 4 a > 9 or a < -2^63 + 5
// le/ng a' <= 4 -2^63 + 5 <= a <= 9
+#elif defined(__i386__)
+
+ sub eax, 5
+ cmp eax, 4
+
+#elif defined(__arm__)
+
+ // i dimly remember having a slick way to do this way back in the
+ // day, but i can't figure it out any more.
+ sub r0, #5
+ cmp r0, #4
+
+#elif defined(__aarch64__)
+
+ // literal translation is too obvious
+ cmp x0, #5
+ ccmp.hs x0, #9, #CCMP_HS
+
+#else
+ notimpl
+#endif
+
ret
endproc
// leave a unchanged, but set zf if a = 0, cf if a /= 0, clear of,
// set sf to msb(a)
+
+#if defined(__x86_64__)
+
not rax // a' = -a - 1
inc rax // a' = -a
neg rax // a' = a
+#elif defined(__i386__)
+
+ not eax
+ inc eax
+ neg eax
+
+#elif defined(__arm__)
+
+ mvn r0, r0
+ add r0, r0, #1
+ rsbs r0, r0, #0 // cf has opposite sense
+
+#elif defined(__aarch64__)
+
+ mvn x0, x0
+ add x0, x0, #1
+ negs x0, x0 // cf has opposite sense
+
+#else
+ notimpl
+#endif
+
ret
endproc
proc x07
// same as before (?)
+
+#if defined(__x86_64__)
+
inc rax // a' = a + 1
neg rax // a' = -a - 1
inc rax // a' = -a
neg rax // a' = a
+#elif defined(__i386__)
+
+ inc eax
+ neg eax
+ inc eax
+ neg eax
+
+#elif defined(__arm__)
+
+ add r0, r0, #1
+ rsb r0, r0, #0
+ add r0, r0, #1
+ rsbs r0, r0, #0
+
+#elif defined(__aarch64__)
+
+ add x0, x0, #1
+ neg x0, x0
+ add x0, x0, #1
+ negs x0, x0 // cf has opposite sense
+
+#else
+ notimpl
+#endif
+
ret
endproc
// floor((a + d)/2), correctly handling overflow conditions; final cf
// is lsb(a + d), probably uninteresting
+
+#if defined(__x86_64__)
+
add rax, rdx // cf || a' = a + d
rcr rax, 1 // shift 65-bit result right by one
// place; lsb moves into carry
+#elif defined(__i386__)
+
+ add eax, edx
+ rcr eax, 1
+
+#elif defined(__arm__)
+
+ // like the two-instruction a64 version
+ sub r1, r3, r0
+ add r1, r0, r1, lsr #1
+
+ // the slick version, similar to the above
+ adds r0, r0, r3
+ mov r0, r0, rrx
+
+#elif defined(__aarch64__)
+
+ // a64 lacks a32's rrx. literal translation.
+ adds x1, x0, x3 // cf || a' = a + d
+ adc x16, xzr, xzr // realize cf in extra register
+ extr x1, x16, x1, #1 // shift down one place
+
+ // two instruction version: clobbers additional register. (if you
+ // wanted the answer in any other register, even overwriting d, then
+ // this is unnecessary.) also depends on d >= a.
+ sub x16, x3, x0 // compute difference
+ add x0, x0, x16, lsr #1 // add half of it (rounded down)
+
+#else
+ notimpl
+#endif
+
ret
endproc
// a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3
// (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8).
+
+#if defined(__x86_64__)
+
shr rax, 3 // a' = floor(a/8); cf = 1 if a ==
// 4, 5, 6, 7 (mod 8)
adc rax, 0 // a' = floor(a/8) + cf
+#elif defined(__i386__)
+
+ shr eax, 3
+ adc eax, 0
+
+#elif defined(__arm__)
+
+ movs r0, r0, lsr #3
+ adc r0, r0, #0
+
+#elif defined(__aarch64__)
+
+ tst x0, #4
+ orr x0, xzr, x0, lsr #3
+ cinc.ne x0, x0
+
+#else
+ notimpl
+#endif
+
ret
endproc
proc x0a
// increment c-byte little-endian bignum at rdi
+
+#if defined(__x86_64__)
+
add byte ptr [rdi], 1
0: inc rdi
adc byte ptr [rdi], 0
loop 0b
+#elif defined(__i386__)
+
+ add byte ptr [edi], 1
+0: inc edi
+ adc byte ptr [edi], 0
+ loop 0b
+
+#elif defined(__arm__)
+
+ mov r12, #256 // set initial carry
+0: ldrb r0, [r5]
+ subs r2, r2, #1
+ add r12, r0, r12, lsr #8
+ strb r12, [r5], #1
+ bne 0b
+
+#elif defined(__aarch64__)
+
+ mov w17, #256 // set initial carry
+0: ldrb w16, [x5]
+ sub x2, x2, #1
+ add w17, w16, w17, lsr #8
+ strb w17, [x5], #1
+ cbnz x2, 0b
+
+#else
+ notimpl
+#endif
+
ret
endproc
proc x0b
// negate double-precision d:a
+
+#if defined(__x86_64__)
+
not rdx // d' = -d - 1
neg rax // a' = -a;
// cf = 1 iff a /= 0
sbb rdx, -1 // d' = -d - cf
+#elif defined(__i386__)
+
+ not edx
+ neg eax
+ sbb edx, -1
+
+#elif defined(__arm__)
+
+ // reverse subtract is awesome
+ rsbs r0, r0, #0
+ rsc r3, r3, #0
+
+#elif defined(__aarch64__)
+
+ // easy way: everything is better with zero registers.
+ negs x0, x0
+ ngc x3, x3
+
+#else
+ notimpl
+#endif
+
ret
endproc
// rotate is distributive over xor.
+#if defined(__x86_64__)
+
// rax // = a_1 || a_0
// rbx // = b_1 || b_0
mov rcx, rax // = a_1 || a_0
cmp rax, rcx // always equal
+#elif defined(__i386__)
+
+ mov ecx, eax // = a_1 || a_0
+
+ xor ecx, ebx // = (a_1 XOR b_1) || (a_0 XOR b_0)
+ ror ecx, 0xd // = (a_0 XOR b_0) || (a_1 XOR b_1)
+
+ ror eax, 0xd // = a_0 || a_1
+ ror ebx, 0xd // = b_0 || b_1
+ xor eax, ebx // = (a_0 XOR b_0) || (a_1 XOR b_1)
+
+ cmp eax, ecx // always equal
+
+#elif defined(__arm__)
+
+
+ // r0 // = a_1 || a_0
+ // r1 // = b_1 || b_0
+ eor r2, r0, r1 // = (a_1 XOR b_1) || (a_0 XOR b_0)
+ mov r2, r2, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
+
+ mov r1, r1, ror #13 // = b_0 || b_1
+ eor r0, r1, r0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
+
+ cmp r0, r2 // always equal
+
+#elif defined(__aarch64__)
+
+ // x0 // = a_1 || a_0
+ // x1 // = b_1 || b_0
+ eor x2, x0, x1 // = (a_1 XOR b_1) || (a_0 XOR b_0)
+ ror x2, x2, #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
+
+ ror x1, x1, #13 // = b_0 || b_1
+ eor x0, x1, x0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1)
+
+ cmp x0, x2 // always equal
+
+#else
+ notimpl
+#endif
+
ret
endproc
// and is distributive over xor.
+#if defined(__x86_64__)
+
mov rdx, rbx // = b
xor rbx, rcx // = b XOR c
cmp rax, rbx // always equal
+#elif defined(__i386__)
+
+ mov edx, ebx // = b
+
+ xor ebx, ecx // = b XOR c
+ and ebx, eax // = a AND (b XOR c)
+
+ and edx, eax // = a AND b
+ and eax, ecx // = a AND c
+ xor eax, edx // = (a AND b) XOR (a AND c)
+ // = a AND (b XOR c)
+
+ cmp eax, ebx // always equal
+
+#elif defined(__arm__)
+
+ and r3, r0, r1 // = a AND b
+
+ eor r1, r1, r2 // = b XOR c
+ and r1, r1, r0 // = a AND (b XOR c)
+
+ and r0, r0, r2 // = a AND c
+ eor r0, r0, r3 // = (a AND b) XOR (a AND c)
+ // = a AND (b XOR c)
+
+ cmp r0, r1 // always equal
+
+#elif defined(__aarch64__)
+
+ and x3, x0, x1 // = a AND b
+
+ eor x1, x1, x2 // = b XOR c
+ and x1, x1, x0 // = a AND (b XOR c)
+
+ and x0, x0, x2 // = a AND c
+ eor x0, x0, x3 // = (a AND b) XOR (a AND c)
+ // = a AND (b XOR c)
+
+ cmp x0, x1 // always equal
+
+#else
+ notimpl
+#endif
+
ret
endproc
// de morgan's law
+#if defined(__x86_64__)
+
mov rcx, rax // = a
and rcx, rbx // = a AND b
or rax, rbx // = (NOT a) OR (NOT b)
// = NOT (a AND b)
- cmp rax, rcx
+ cmp rax, rcx // always equal
+
+#elif defined(__i386__)
+
+ mov ecx, eax // = a
+
+ and ecx, ebx // = a AND b
+ not ecx // = NOT (a AND b)
+
+ not eax // = NOT a
+ not ebx // = NOT b
+ or eax, ebx // = (NOT a) OR (NOT b)
+ // = NOT (a AND b)
+
+ cmp eax, ecx // always equal
+
+#elif defined(__arm__)
+
+ and r2, r0, r1 // = a AND b
+ mvn r2, r2 // = NOT (a AND b)
+
+ mvn r0, r0 // = NOT a
+ mvn r1, r1 // = NOT b
+ orr r0, r0, r1 // = (NOT a) OR (NOT b)
+
+ cmp r0, r2 // always equal
+
+#elif defined(__aarch64__)
+
+ and x2, x0, x1 // = a AND b
+ mvn x2, x2 // = NOT (a AND b)
+
+ mvn x0, x0 // = NOT a
+ orn x0, x0, x1 // = (NOT a) OR (NOT b)
+
+ cmp x0, x2 // always equal
+
+#else
+ notimpl
+#endif
ret
//
// not sure why you'd do this.
- cld
+#if defined(__x86_64__)
0: xor [rsi], al
lodsb
loop 0b
+#elif defined(__i386__)
+
+0: xor [esi], al
+ lodsb
+ loop 0b
+
+#elif defined(__arm__)
+
+0: ldrb r12, [r4]
+ subs r2, r2, #1
+ eor r0, r0, r12
+ strb r0, [r4], #1
+ bne 0b
+
+#elif defined(__aarch64__)
+
+0: ldrb w16, [x4]
+ sub x2, x2, #1
+ eor w0, w0, w16
+ strb w0, [x4], #1
+ cbnz x2, 0b
+
+#else
+ notimpl
+#endif
+
ret
endproc
+///--------------------------------------------------------------------------
+/// 0x10--0x1f
+
proc x10
// four different ways to swap a pair of registers.
+#if defined(__x86_64__)
+
push rax
push rcx
pop rax
xchg rax, rcx
+#elif defined(__i386__)
+
+ push eax
+ push ecx
+ pop eax
+ pop ecx
+
+ xor eax, ecx
+ xor ecx, eax
+ xor eax, ecx
+
+ add eax, ecx
+ sub ecx, eax
+ add eax, ecx
+ neg ecx
+
+ xchg eax, ecx
+
+#elif defined(__arm__)
+
+ stmfd r13!, {r0, r2}
+ ldr r0, [r13, #4]
+ ldr r2, [r13], #8
+
+ eor r0, r0, r2
+ eor r2, r2, r0
+ eor r0, r0, r2
+
+ sub r0, r0, r2
+ add r2, r2, r0
+ rsb r0, r0, r2 // don't need 3-addr with reverse-sub
+
+ mov r12, r0
+ mov r0, r2
+ mov r2, r0
+
+#elif defined(__aarch64__)
+
+ // anything you can do
+ stp x0, x2, [sp, #-16]!
+ ldp x2, x0, [sp], #16
+
+ eor x0, x0, x2
+ eor x2, x2, x0
+ eor x0, x0, x2
+
+ // the add/sub/add thing was daft. you can do it in three if you're
+ // clever -- and have three-address operations.
+ sub x0, x0, x2
+ add x2, x2, x0
+ sub x0, x2, x0
+
+ // but we lack a fourth. we can't do this in fewer than three
+ // instructions without hitting memory. only `ldp' will modify two
+ // registers at a time, so we need at least two instructions -- but
+ // if the first one sets one of our two registers to its final value
+ // then we lose the other input value with no way to recover it, so
+ // we must either write a fresh third register, or write something
+ // other than the final value, and in both cases we need a third
+ // instruction to fix everything up. we've done the wrong-something-
+ // other trick twice, so here's the captain-obvious use-a-third-
+ // register version.
+ mov x16, x0
+ mov x0, x2
+ mov x2, x16
+
+#else
+ notimpl
+#endif
+
ret
endproc
// in particular, a will be zero (and zf set) if and only if the two
// strings are equal.
+#if defined(__x86_64__)
+
0: mov dl, [rsi]
xor dl, [rdi]
inc rsi
or al, dl
loop 0b
+#elif defined(__i386__)
+
+0: mov dl, [esi]
+ xor dl, [edi]
+ inc esi
+ inc edi
+ or al, dl
+ loop 0b
+
+#elif defined(__arm__)
+
+0: ldrb r1, [r4], #1
+ ldrb r12, [r5], #1
+ subs r2, r2, #1
+ eor r12, r12, r1
+ orr r0, r0, r12
+ bne 0b
+
+#elif defined(__aarch64__)
+
+0: ldrb w16, [x4], #1
+ ldrb w17, [x5], #1
+ sub x2, x2, #1
+ eor w16, w16, w17
+ orr w0, w0, w16
+ cbnz x2, 0b
+
+#else
+ notimpl
+#endif
+
ret
endproc
// move all of the set bits in d to a, unless there's already a bit
// there. this clearly doesn't change the sum.
+#if defined(__x86_64__)
+
mov rcx, rdx // c' = d
and rdx, rax // d' = a AND d
or rax, rcx // a' = a OR d
add rax, rdx
+#elif defined(__i386__)
+
+ mov ecx, edx // c' = d
+ and edx, eax // d' = a AND d
+ or eax, ecx // a' = a OR d
+ add eax, edx
+
+#elif defined(__arm__)
+
+ and r2, r0, r3 // c' = a AND d
+ orr r0, r0, r3 // a' = a OR d
+ add r0, r0, r2
+
+#elif defined(__aarch64__)
+
+ and x2, x0, x3 // c' = a AND d
+ orr x0, x0, x3 // a' = a OR d
+ add x0, x0, x2
+
+#else
+ notimpl
+#endif
+
ret
endproc
// ok, so this is a really obtuse way of adding a and b; the result
// is in a and d. but why does it work?
+#if defined(__x86_64__)
+
mov rcx, 0x40 // carry chains at most 64 long
0: mov rdx, rax // copy a'
xor rax, rbx // low bits of each bitwise sum
and rbx, rdx // carry bits from each bitwise sum
- shl rbx, 001 // carry them into next position
+ shl rbx, 1 // carry them into next position
+ loop 0b
+
+#elif defined(__i386__)
+
+ mov ecx, 0x40 // carry chains at most 64 long
+0: mov edx, eax // copy a'
+ xor eax, ebx // low bits of each bitwise sum
+ and ebx, edx // carry bits from each bitwise sum
+ shl ebx, 1 // carry them into next position
loop 0b
+#elif defined(__arm__)
+
+ mov r2, #0x40
+0: and r3, r0, r1
+ subs r2, r2, #1
+ eor r0, r0, r1
+ lsl r1, r3, #1
+ bne 0b
+
+#elif defined(__aarch64__)
+
+ mov x2, #0x40
+0: and x3, x0, x1
+ sub x2, x2, #1
+ eor x0, x0, x1
+ lsl x1, x3, #1
+ cbnz x2, 0b
+
+#else
+ notimpl
+#endif
+
ret
endproc
// floor((a + d)/2), like x08.
+#if defined(__x86_64__)
+
mov rcx, rax // copy a for later
and rcx, rdx // carry bits
add rax, rcx // add the carries; done
+#elif defined(__i386__)
+
+ mov ecx, eax // copy a for later
+ and ecx, edx // carry bits
+
+ xor eax, edx // low bits of each bitwise sum
+ shr eax, 1 // divide by 2; carries now in place
+
+ add eax, ecx // add the carries; done
+
+#elif defined(__arm__)
+
+ and r2, r0, r3
+ eor r0, r0, r3
+ add r0, r2, r0, lsr #1
+
+#elif defined(__aarch64__)
+
+ and x2, x0, x3
+ eor x0, x0, x3
+ add x0, x2, x0, lsr #1
+
+#else
+ notimpl
+#endif
+
ret
endproc
// sign extension 32 -> 64 bits.
- //movsx rbx, eax // like this?
+#if defined(__x86_64__)
+
+ movsx rbx, eax // like this?
mov rdx, 0xffffffff80000000
add rax, rdx // if bit 31 of a is set then bits
// exactly backwards
xor rax, rdx // so fix it
+#elif defined(__i386__)
+
+ movsx ebx, ax // like this?
+
+ mov edx, 0xffff8000
+ add eax, edx // if bit 31 of a is set then bits
+ // 31--63 of a' are clear; otherwise,
+ // these bits are all set -- which is
+ // exactly backwards
+ xor eax, edx // so fix it
+
+#elif defined(__arm__)
+
+ sxth r1, r0 // like this
+
+ mov r12, #0x80000000
+ add r0, r0, r12, asr #16
+ eor r0, r0, r12, asr #16
+
+#elif defined(__aarch64__)
+
+ sxtw x1, w0 // like this
+
+ mov x16, #0xffffffff80000000
+ add x0, x0, x16
+ eor x0, x0, x16
+
+#else
+ notimpl
+#endif
+
ret
endproc
proc x16
- //shl rax, 56
- //shl rbx, 56
- //shl rcx, 56
+ // ??? i don't know why you'd want to calculate this.
+
+#if defined(__x86_64__)
xor rax, rbx // a' = a XOR b
xor rbx, rcx // b' = b XOR c
xor rax, rbx // a' = cf ? 0 : a XOR c
cmp rax, rsi
+#elif defined(__i386__)
+
+ xor eax, ebx // a' = a XOR b
+ xor ebx, ecx // b' = b XOR c
+ mov esi, eax // t = a XOR b
+ add esi, ebx // t = (a XOR b) + (b XOR c)
+ cmovc eax, ebx // a' = cf ? b XOR c : a XOR b
+ xor eax, ebx // a' = cf ? 0 : a XOR c
+ cmp eax, esi
+
+#elif defined(__arm__)
+
+ eor r0, r0, r1
+ eor r1, r1, r2
+ adds r4, r0, r1
+ movcs r0, r1
+ eor r0, r0, r1
+ cmp r0, r4
+
+#elif defined(__aarch64__)
+
+ eor x0, x0, x1
+ eor x1, x1, x2
+ adds x4, x0, x1
+ cmov.cs x0, x1
+ eor x0, x0, x1
+ cmp x0, x4
+
+#else
+ notimpl
+#endif
+
ret
endproc
proc x17
- ud2
+ // absolute value
+
+#if defined(__x86_64__)
+
+ cqo // d = a < 0 ? -1 : 0
+ xor rax, rdx // a' = a < 0 ? -a - 1 : a
+ sub rax, rdx // a' = a < 0 ? -a : a
+
+#elif defined(__i386__)
+
+ cdq // d = a < 0 ? -1 : 0
+ xor eax, edx // a' = a < 0 ? -a - 1 : a
+ sub eax, edx // a' = a < 0 ? -a : a
+
+#elif defined(__arm__)
+
+ // direct approach
+ movs r1, r0
+ rsbmi r1, r0, #0
+
+ // faithful-ish conversion
+ eor r3, r0, r0, asr #31
+ sub r0, r3, r0, asr #31
+
+#elif defined(__aarch64__)
+
+ // direct approach
+ tst x0, #1 << 63
+ cneg.ne x1, x0
+
+ // faithful-ish conversion
+ eor x3, x0, x0, asr #63
+ sub x0, x3, x0, asr #63
+
+#else
+ notimpl
+#endif
+
+ ret
endproc
proc x18
- ud2
+ // should always set sf, clear zf, unless we get rescheduled to a
+ // different core.
+
+#if defined(__x86_64__)
+
+ rdtsc // d || a = cycles
+ shl rdx, 0x20
+ or rax, rdx // a = cycles
+ mov rcx, rax // c = cycles
+
+ rdtsc // d || a = cycles'
+ shl rdx, 0x20
+ or rax, rdx // a = cycles'
+
+ cmp rcx, rax
+
+#elif defined(__i386__)
+
+ rdtsc // d || a = cycles
+ mov ebx, eax
+ mov ecx, edx // c || b = cycles
+
+ rdtsc // d || a = cycles'
+
+ sub ebx, eax
+ sbb ecx, edx
+
+#elif defined(__arm__)
+
+ // cycle clock not available in user mode
+ mrrc p15, 0, r0, r1, c9
+ mrrc p15, 0, r2, r3, c9
+ subs r0, r0, r2
+ sbcs r1, r1, r3
+
+#elif defined(__aarch64__)
+
+ // cycle clock not available in user mode
+ mrs x0, pmccntr_el0
+ mrs x1, pmccntr_el0
+ cmp x0, x1
+
+#else
+ notimpl
+#endif
+
+ ret
endproc
proc x19
- ud2
+ // stupid way to capture a pointer to inline data and jump past it.
+ // confuses the return-address predictor something chronic. worse
+ // because amd64 calling convention doesn't usually pass arguments on
+ // the stack.
+
+#if defined(__x86_64__)
+
+ call 8f
+ .string "hello world!\n\0"
+8: call print_str
+ add rsp, 8
+ ret
+
+print_str:
+ // actually implement this ridiculous thing
+ mov rsi, [rsp + 8]
+ xor edx, edx
+0: mov al, [rsi + rdx]
+ inc rdx
+ cmp al, 0
+ jnz 0b
+ mov eax, SYS_write
+ mov edi, 1
+ dec rdx
+ syscall // clobbers r11 :-(
+ ret
+
+#elif defined(__i386__)
+
+ call 8f
+ .string "hello world!\n\0"
+8: call print_str
+ add esp, 4
+ ret
+
+print_str:
+ // actually implement this ridiculous thing
+ mov ecx, [esp + 4]
+ xor edx, edx
+0: mov al, [ecx + edx]
+ inc edx
+ cmp al, 0
+ jnz 0b
+ mov eax, SYS_write
+ mov ebx, 1
+ dec edx
+ int 0x80
+ ret
+
+#elif defined(__arm__)
+
+ // why am i doing this?
+ stmfd r13!, {r14}
+ bl 8f
+ .string "hello world!\n\0"
+ .balign 4
+8: mov r1, r14 // might as well make it easy on myself
+ bl print_str
+ ldmfd r13!, {pc}
+
+print_str:
+ mov r2, #0
+0: ldrb r0, [r1, r2]
+ cmp r0, #0
+ addne r2, r2, #1
+ bne 0b
+ mov r0, #1
+ mov r7, #SYS_write
+ swi 0
+ bx r14
+
+#elif defined(__aarch64__)
+
+ // why am i doing this?
+ str x30, [sp, #-16]!
+ bl 8f
+ .string "hello world!\n\0"
+ .balign 4
+8: mov x1, x30 // might as well make it easy on myself
+ bl print_str
+ ldr x30, [sp], #16
+ ret
+
+print_str:
+ mov x2, #0
+0: ldrb w0, [x1, x2]
+ cmp w0, #0
+ cinc.ne x2, x2
+ b.ne 0b
+ mov x0, #1
+ mov x8, #SYS_write
+ svc #0
+ ret
+
+#else
+ notimpl
+#endif
endproc
proc x1a
- ud2
+ // collect the current instruction-pointer address. this was an old
+ // 32-bit i386 trick for position-independent code, but (a) it
+ // confuses the return predictor, and (b) amd64 has true pc-relative
+ // addressing.
+
+#if defined(__x86_64__)
+
+ // the actual example
+ call 0f
+0: pop rax
+
+ // the modern i386 trick doesn't confuse the return-address
+ // predictor.
+ call calladdr_rbx
+ sub rbx, . - 0b
+
+ // but rip-relative addressing is even better
+ lea rcx, [rip + 0b]
+
+ ret
+
+calladdr_rbx:
+ mov rbx, [rsp]
+ ret
+
+#elif defined(__i386__)
+
+ // the actual example
+ call 0f
+0: pop eax
+
+ // the modern i386 trick doesn't confuse the return-address
+ // predictor.
+ call get_pc_ebx
+ sub ebx, . - 0b
+
+ ret
+
+#elif defined(__arm__)
+
+ stmfd r13!, {r14}
+
+ bl 0f
+0: mov r0, r14
+
+ bl return
+ sub r1, r14, #. - 0b
+
+ adr r2, 0b
+
+ ldmfd r13!, {pc}
+
+return: bx r14
+
+#elif defined(__aarch64__)
+
+ str x30, [sp, #-16]!
+
+ // we can do all of the above using a64
+ bl 0f
+0: mov x0, x30
+
+ bl return
+ sub x1, x30, #. - 0b
+
+ adr x2, 0b
+
+ ldr x30, [sp], #16
+return: ret
+
+#else
+ notimpl
+#endif
endproc
proc x1b
- ud2
+#if defined(__x86_64__)
+
+ // retpolines: an mitigation against adversarially influenced
+ // speculative execution at indirect branches. if an adversary can
+ // prepare a branch-target buffer entry matching an indirect branch
+ // in the victim's address space then they can cause the victim to
+ // /speculatively/ (but not architecturally) execute any code in
+ // their address space, possibly leading to leaking secrets through
+ // the cache. retpolines aren't susceptible to this because the
+ // predicted destination address is from the return-prediction stack
+ // which the adversary can't prime. the performance penalty is still
+ // essentially a branch misprediction -- for this return, and
+ // possibly all others already stacked.
+
+ // (try not to crash)
+ lea rax, [rip + 9f]
+
+ push rax
+9: ret
+
+#elif defined(__i386__)
+
+ call get_pc_ebx
+ lea eax, [ebx + 9f - .]
+
+ push eax
+9: ret
+
+#elif defined(__arm__)
+
+ stmfd r13!, {r14}
+
+ adr r14, 8f
+ bx r14
+
+8: ldmfd r13!, {pc}
+
+#elif defined(__aarch64__)
+
+ str x30, [sp, #-16]!
+
+ adr x30, 8f
+ ret
+
+8: ldr x30, [sp], #16
+ ret
+
+#else
+ notimpl
+#endif
endproc
proc x1c
- ud2
+ // ok, having a hard time seeing a use for this. the most important
+ // thing to note is that sp is set from `pop' /after/ it's
+ // incremented.
+
+#if defined(__x86_64__)
+
+ // try not to crash
+ mov rax, rsp
+ and rsp, -16
+ push rax
+
+ pop rsp
+
+ // check it worked
+ mov rbx, rsp
+ ret
+
+#elif defined(__i386__)
+
+ // try not to crash
+ mov eax, esp
+ and esp, -16
+ push eax
+
+ pop esp
+
+ // check it worked
+ mov ebx, esp
+ ret
+
+#elif defined(__arm__)
+
+ // not even going to dignify this
+ notimpl
+
+#elif defined(__aarch64__)
+
+ // not even going to dignify this
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
proc x1d
- ud2
+ // monumentally cheesy way to copy 8 n bytes from buff1 to buff2.
+ // also clobbers words at buff2 + 8 n and buff2 - 8 for good measure.
+
+ n = 4
+
+#if defined(__x86_64__)
+
+ mov rax, rsp // safekeeping
+
+ // we're toast if we get hit by a signal now. fingers crossed...
+ .if 0
+ mov rsp, buff2 + 8*n + 8
+ mov rbp, buff1 + 8*n
+ .else
+ lea rsp, [rdi + 8*n + 16]
+ lea rbp, [rsi + 8*n]
+ .endif
+ enter 0, n + 1
+
+ // precise action:
+ //
+ // +---------+ +---------+
+ // rbp -> | ??? | rsp -> | ??? |
+ // +---------+ +---------+
+ // | w_{n-1} | | rbp | <- rbp'
+ // +---------+ +---------+
+ // | ... | | w_{n-1} |
+ // +---------+ +---------+
+ // | w_1 | | ... |
+ // +---------+ +---------+
+ // | w_0 | | w_1 |
+ // +---------+ +---------+
+ // | w_0 |
+ // +---------+
+ // | rbp' | <- rsp'
+ // +---------+
+
+ mov rdx, rsp
+ mov rsp, rax
+
+#elif defined(__i386__)
+
+ mov eax, esp // safekeeping
+
+ // we're toast if we get hit by a signal now. fingers crossed...
+ .if 0
+ mov esp, buff2 + 4*n + 4
+ mov ebp, buff1 + 4*n
+ .else
+ lea esp, [edi + 4*n + 8]
+ lea ebp, [esi + 4*n]
+ .endif
+ enter 0, n + 1
+
+ mov edx, esp
+ mov esp, eax
+
+#elif defined(__arm__)
+
+ add r4, r4, #4*n
+ add r5, r5, #4*n + 8
+
+ str r4, [r5, #-4]!
+ .rept n/2
+ ldrd r0, r1, [r4, #-8]!
+ strd r0, r1, [r5, #-8]!
+ .endr
+ add r4, r5, #4*n
+ str r4, [r5, #-4]!
+
+#elif defined(__aarch64__)
+
+ // omgwtf. let's not actually screw with the stack pointer.
+
+ add x4, x4, #8*n
+ add x5, x5, #8*n + 16
+
+ str x4, [x5, #-8]!
+ .rept n/2
+ ldp x16, x17, [x4, #-16]!
+ stp x16, x17, [x5, #-16]!
+ .endr
+ add x4, x5, #8*n
+ str x4, [x5, #-8]!
+
+#else
+ notimpl
+#endif
+
+ ret
endproc
proc x1e
- ud2
+ // convert nibble value to (uppercase) hex; other input values yield
+ // nonsense.
+
+#if defined(__x86_64__)
+
+ // das doesn't work in 64-bit mode; best i can come up with
+ mov edx, eax
+ add al, '0'
+ add dl, 'A' - 10
+ cmp al, '9' + 1
+ cmovae eax, edx
+
+#elif defined(__i386__)
+
+ cmp al, 0x0a // cf = 1 iff a < 10
+ sbb al, 0x69 // if 0 <= a < 10, a' = a - 0x6a, so
+ // 0x96 <= a' < 0x70, setting af, cf
+ // if 10 <= a < 16, a' = a - 0x69, so
+ // 0x71 <= a' < 0x77, setting cf but
+ // clearing af
+ das // if 0 <= a < 10, then af and cf are
+ // both set, so set subtract 0x66
+ // from a' leaving 0x30 <= a' < 0x3a;
+ // if 10 <= a < 16 then af clear but
+ // cf set, so subtract 0x60 from a'
+ // leaving 0x41 <= a' < 0x47
+
+#elif defined(__arm__)
+
+ // significantly less tricksy
+ cmp r0, #10
+ addlo r0, r0, #'0'
+ addhs r0, r0, #'A' - 10
+
+#elif defined(__aarch64__)
+
+ // with less versatile conditional execution this is the best we can
+ // do
+ cmp w0, #10
+ add w16, w0, #'A' - 10
+ add w0, w0, #'0'
+ cmov.hs w0, w16
+
+#else
+ notimpl
+#endif
+
+ ret
endproc
proc x1f
- ud2
+ // verify collatz conjecture starting at a; assume a /= 0!
+
+#if defined(__x86_64__)
+
+0: bsf rcx, rax // clobber c if a = 0
+ shr rax, cl // a = 2^c a'
+ cmp rdx, 0
+ je 1f
+ stosq
+ dec rdx
+1:
+ cmp rax, 1 // done?
+ je 9f
+ lea rax, [2*rax + rax + 1] // a' = 3 a' + 1
+ jmp 0b // again
+
+9: ret
+
+#elif defined(__i386__)
+
+0: bsf ecx, eax // clobber c if a = 0
+ shr eax, cl // a = 2^c a'
+ cmp edx, 0
+ je 1f
+ stosd
+ dec edx
+1:
+ cmp eax, 1 // done?
+ je 9f
+ lea eax, [2*eax + eax + 1] // a' = 3 a' + 1
+ jmp 0b // again
+
+9: ret
+
+#elif defined(__arm__)
+
+ // rbit introduced in armv7
+0: rbit r2, r0
+ clz r2, r2
+ mov r0, r0, lsr r2 // a = 2^c a'
+ cmp r3, #0
+ strne r0, [r5], #4
+ subne r3, r3, #1
+ cmp r0, #1
+ adcne r0, r0, r0, lsl #1 // a' = 3 a' + 1 (because c set)
+ bne 0b
+
+ ret
+
+#elif defined(__aarch64__)
+
+0: rbit w2, w0
+ clz w2, w2
+ lsr w0, w0, w2 // a = 2^c a'
+ cmp x3, #0
+ beq 1f
+ str x0, [x5], #8
+ sub x3, x3, #1
+1:
+ cmp w0, #1
+	add	w16, w0, w0, lsl #1	// t = 3 a' (the csinc below adds the 1)
+ csinc.eq w0, w0, w16
+ b.ne 0b
+
+ ret
+
+#else
+ notimpl
+#endif
endproc
+///--------------------------------------------------------------------------
+/// 0x20--0x2f
+
proc x20
- ud2
+ // calculate 1337 a slowly
+
+#if defined(__x86_64__)
+
+ // original version
+ mov rcx, rax // c = a
+ shl rcx, 2 // c = 4 a
+ add rcx, rax // c = 5 a
+ shl rcx, 3 // c = 40 a
+ add rcx, rax // c = 41 a
+ shl rcx, 1 // c = 82 a
+ add rcx, rax // c = 83 a
+ shl rcx, 1 // c = 166 a
+ add rcx, rax // c = 167 a
+ shl rcx, 3 // c = 1336 a
+ add rcx, rax // c = 1337 a
+
+ // a quick way
+ lea rdx, [2*rax + rax] // t = 3 a
+ shl rdx, 6 // t = 192 a
+ sub rdx, rax // t = 191 a
+ lea rbx, [8*rdx] // b = 1528 a
+ sub rbx, rdx // b = 1337 a
+
+#elif defined(__i386__)
+
+ // original version
+ mov ecx, eax // c = a
+ shl ecx, 2 // c = 4 a
+ add ecx, eax // c = 5 a
+ shl ecx, 3 // c = 40 a
+ add ecx, eax // c = 41 a
+ shl ecx, 1 // c = 82 a
+ add ecx, eax // c = 83 a
+ shl ecx, 1 // c = 166 a
+ add ecx, eax // c = 167 a
+ shl ecx, 3 // c = 1336 a
+ add ecx, eax // c = 1337 a
+
+ // a quick way
+ lea edx, [2*eax + eax] // t = 3 a
+ shl edx, 6 // t = 192 a
+ sub edx, eax // t = 191 a
+ lea ebx, [8*edx] // b = 1528 a
+ sub ebx, edx // b = 1337 a
+
+#elif defined(__arm__)
+
+ // original version, ish
+ add r2, r0, r0, lsl #2 // c = 5 a
+ add r2, r0, r2, lsl #3 // c = 41 a
+ add r2, r0, r2, lsl #1 // c = 83 a
+ add r2, r0, r2, lsl #1 // c = 167 a
+ add r2, r0, r2, lsl #3 // c = 1337 a
+
+ // quicker way
+ add r1, r0, r0, lsl #1 // b = 3 a
+ rsb r1, r0, r1, lsl #6 // b = 191 a
+ rsb r1, r1, r1, lsl #3 // b = 1337 a
+
+#elif defined(__aarch64__)
+
+ // original version, ish
+ add x2, x0, x0, lsl #2 // c = 5 a
+ add x2, x0, x2, lsl #3 // c = 41 a
+ add x2, x0, x2, lsl #1 // c = 83 a
+ add x2, x0, x2, lsl #1 // c = 167 a
+ add x2, x0, x2, lsl #3 // c = 1337 a
+
+ // sleazy because no rsb
+ add x1, x0, x0, lsl #1 // b = 3 a
+ sub x1, x0, x1, lsl #6 // b = -191 a
+ sub x1, x1, x1, lsl #3 // b = 1337 a
+
+#else
+ notimpl
+#endif
ret
proc x21
- ud2
+ // multiply complex numbers a + b i and c + d i
+ //
+ // (a + b i) (c + d i) = (a c - b d) + (a d + b c) i
+ //
+ // somewhat slick approach uses only three multiplications
+
+#if defined(__x86_64__)
+
+ mov rsi, rax // t = a
+ add rax, rbx // a' = a + b
+ mov rdi, rdx // u = d
+ sub rdx, rcx // d' = d - c
+ add rdi, rcx // u = c + d
+
+ imul rax, rcx // a' = c (a + b)
+ imul rsi, rdx // t = a (d - c)
+ imul rdi, rbx // u = b (c + d)
+
+ add rsi, rax // t = a (d - c) + c (a + b)
+ mov rbx, rsi // b' = a (d - c) + c (a + b)
+ // = a d + b c
+ sub rax, rdi // a' = c (a + b) - b (c + d)
+ // = a c - b d
+
+#elif defined(__i386__)
+
+ mov esi, eax // t = a
+ add eax, ebx // a' = a + b
+ mov edi, edx // u = d
+ sub edx, ecx // d' = d - c
+ add edi, ecx // u = c + d
+
+ imul eax, ecx // a' = c (a + b)
+ imul esi, edx // t = a (d - c)
+ imul edi, ebx // u = b (c + d)
+
+ add esi, eax // t = a (d - c) + c (a + b)
+ mov ebx, esi // b' = a (d - c) + c (a + b)
+ // = a d + b c
+ sub eax, edi // a' = c (a + b) - b (c + d)
+ // = a c - b d
+
+#elif defined(__arm__)
+
+ add r4, r0, r1 // t = a + b
+ add r5, r2, r3 // u = c + d
+ sub r3, r3, r2 // d' = d - c
+
+ // mls introduced in armv7
+ mul r4, r4, r2 // t = c (a + b)
+	mov	r2, r1			// c' = b (bah!)
+ mla r1, r0, r3, r4 // b' = a (d - c) + c (a + b)
+ // = a d + b c
+ mls r0, r2, r5, r4 // a' = c (a + b) - b (c + d)
+ // = a c - b d
+
+#elif defined(__aarch64__)
+
+ add x4, x0, x1 // t = a + b
+ add x5, x2, x3 // u = c + d
+ sub x3, x3, x2 // d' = d - c
+
+	// madd/msub are the a64 analogues of armv7's mla/mls
+	mul	x4, x4, x2		// t = c (a + b)
+	mov	x2, x1			// c' = b (bah!)
+ madd x1, x0, x3, x4 // b' = a (d - c) + c (a + b)
+ // = a d + b c
+ msub x0, x2, x5, x4 // a' = c (a + b) - b (c + d)
+ // = a c - b d
+
+#else
+ notimpl
+#endif
+
+ ret
endproc
proc x22
- ud2
+ // divide by 3
+
+#if defined(__x86_64__)
+
+ mov rdx, 0xaaaaaaaaaaaaaaab // = ceil(2/3 2^64)
+ mul rdx // d' || a' =~ 2/3 a 2^64
+ shr rdx, 1 // d' = floor(a/3)
+ mov rax, rdx // a' = floor(a/3)
+
+ // we start with 0 <= a < 2^64. write f = ceil(2/3 2^64), so that
+ // 2/3 < f/2^64 < 2/3 + 1/2^64. then floor(2/3 a) <= floor(a f/2^64)
+ // <= floor(2/3 a + a/2^64), but a < 2^64 so a/2^64 < 1 and
+ // floor(a f/2^64) = floor(2/3 a).
+
+#elif defined(__i386__)
+
+ mov edx, 0xaaaaaaab // = ceil(2/3 2^32)
+ mul edx // d' || a' =~ 2/3 a 2^32
+ shr edx, 1 // d' = floor(a/3)
+ mov eax, edx // a' = floor(a/3)
+
+#elif defined(__arm__)
+
+ ldr r12, =0xaaaaaaab
+ umull r12, r0, r0, r12
+ mov r0, r0, lsr #1
+
+#elif defined(__aarch64__)
+
+ ldr x16, =0xaaaaaaaaaaaaaaab
+ umulh x0, x0, x16
+ lsr x0, x0, #1
+
+#else
+ notimpl
+#endif
+
+ ret
endproc
proc x23
- ud2
+#if defined(__x86_64__)
+
+ // main loop: shorten a preserving residue class mod 3
+0: cmp rax, 5
+ jbe 8f
+ // a > 5
+ mov rdx, rax // d' = a
+ shr rdx, 2 // d' = floor(a/4)
+ and rax, 3 // a = 4 d' + a' (0 <= a' < 4)
+ add rax, rdx // a' == a (mod 3) but a' < a/4 + 4
+ jmp 0b
+
+ // fix up final value 0 <= a < 6: want 0 <= a < 3
+ //
+ // the tricky part is actually a = 3; but the other final cases take
+ // additional iterations which we can avoid.
+8: cmp rax, 3 // set cf iff a < 3
+ cmc // set cf iff a >= 3
+ sbb rdx, rdx // d' = a >= 3 ? -1 : 0
+ and rdx, 3 // d' = a >= 3 ? 3 : 0
+ sub rax, rdx // a' = a - (a >= 3 ? 3 : 0)
+ // = a (mod 3)
+
+#elif defined(__i386__)
+
+ // main loop: shorten a preserving residue class mod 3
+0: cmp eax, 5
+ jbe 8f
+ // a > 5
+ mov edx, eax // d' = a
+ shr edx, 2 // d' = floor(a/4)
+ and eax, 3 // a = 4 d' + a' (0 <= a' < 4)
+ add eax, edx // a' == a (mod 3) but a' < a/4 + 4
+ jmp 0b
+
+ // fix up final value 0 <= a < 6: want 0 <= a < 3
+ //
+ // the tricky part is actually a = 3; but the other final cases take
+ // additional iterations which we can avoid.
+8: cmp eax, 3 // set cf iff a < 3
+ cmc // set cf iff a >= 3
+ sbb edx, edx // d' = a >= 3 ? -1 : 0
+ and edx, 3 // d' = a >= 3 ? 3 : 0
+ sub eax, edx // a' = a - (a >= 3 ? 3 : 0)
+ // = a (mod 3)
+
+#elif defined(__arm__)
+
+0: cmp r0, #6
+ andhs r12, r0, #3
+ addhs r0, r12, r0, lsr #2
+ bhs 0b
+
+ cmp r0, #3
+ subhs r0, r0, #3
+
+#elif defined(__aarch64__)
+
+0: cmp x0, #6
+ // blunder on through regardless since this doesn't affect the result
+ and x16, x0, #3
+ add x0, x16, x0, lsr #2
+ b.hs 0b
+
+ subs x16, x0, #3
+ cmov.hs x0, x16
+
+#else
+ notimpl
+#endif
+
+ ret
endproc
proc x24
- ud2
+ // invert (odd) a mod 2^64
+ //
+ // suppose a a_i == 1 (mod 2^{2^i})
+ //
+ // clearly good for i = 0, since 2^i = 1 and 2^{2^i} = 2, and a_0 =
+ // a == 1 (mod 2) by assumption
+ //
+ // write a a_i == b_i 2^{2^i} + 1 (mod 2^{2^{i+1}})
+ // then b_i == (a a_i - 1)/2^{2^i} (mod 2^{2^i})
+ // to lift inverse, we want x such that a x == -b_i (mod 2^{2^i});
+ // clearly x = -a_i b_i will do, since a a_i == 1 (mod 2^{2^i})
+ // then:
+ // a_{i+1} = a_i - a_i b_i 2^{2^i} = a_i (1 - (a a_i - 1))
+ // = 2 a_i - a a_i^2
+ //
+ // check:
+ // a a_{i+1} = 2 a a_i - a^2 a_i^2
+ // == 2 a a_i - (b_i 2^{2^i} + 1)^2
+ // == 2 (b_i 2^{2^i} + 1) -
+ // (b_i^2 2^{2^{i+1}} + 2 b_i 2^{2^i} + 1)
+ // == 1 (mod 2^{2^{i+1}})
+
+#if defined(__x86_64__)
+
+ // rax // a_0 = a
+ mov rbx, rax // b' = a
+ mov rsi, rax // t = a_0
+
+0:
+ cmp rbp, 0
+ je 1f
+ stosq
+ dec rbp
+1:
+ mul rbx // a' = a a_i
+ mov rcx, rax // c = a a_i
+
+ sub rax, 2 // a' = a a_i - 2
+ neg rax // a' = 2 - a a_i
+ mul rsi // a_{i+1} = a_i (2 - a a_i)
+ // = 2 a_i - a a_i^2
+ mov rsi, rax // t = a_{i+1}
+
+ cmp rcx, 1 // done?
+ ja 0b // no -- iterate
+
+#elif defined(__i386__)
+
+ // eax // a_0 = a
+ mov ebx, eax // b' = a
+ mov esi, eax // t = a_0
+
+0:
+ cmp ebp, 0
+ je 1f
+ stosd
+ dec ebp
+1:
+ mul ebx // a' = a a_i
+ mov ecx, eax // c = a a_i
+
+ sub eax, 2 // a' = a a_i - 2
+ jb 9f // done if < 2
+ neg eax // a' = 2 - a a_i
+ mul esi // a_{i+1} = a_i (2 - a a_i)
+ // = 2 a_i - a a_i^2
+ mov esi, eax // t = a_{i+1}
+
+ jmp 0b // and iterate
+9: mov eax, esi // restore
+
+#elif defined(__arm__)
+
+ // r0 // a_0 = a
+ mov r1, r0 // b' = a
+
+0:
+ cmp r6, #0
+ strne r0, [r5], #4
+ subne r6, r6, #1
+ mul r2, r0, r1 // c = a a_i
+ rsbs r2, r2, #2 // c = 2 - a a_i
+ mul r0, r0, r2 // a_{i+1} = a_i (2 - a a_i)
+ // = 2 a_i - a a_i^2
+ blo 0b
+
+#elif defined(__aarch64__)
+
+ // x0 // a_0 = a
+ mov x1, x0 // b' = a
+ mov x16, #2 // because we have no rsb
+
+0:
+ cmp x6, #0
+ b.eq 1f
+ str x0, [x5], #8
+ sub x6, x6, #1
+1:
+ mul x2, x0, x1 // c = a a_i
+ subs x2, x16, x2 // c = 2 - a a_i
+ mul x0, x0, x2 // a_{i+1} = a_i (2 - a a_i)
+ // = 2 a_i - a a_i^2
+ b.lo 0b
+
+#else
+ notimpl
+#endif
+
+ ret
endproc
proc x25
- ud2
+ // a poor approximation to pi/4
+ //
+ // think of x and y as being in 16.16 fixed-point format. we sample
+ // points in the unit square, and determine how many of them are
+ // within a unit quarter-circle centred at the origin. the area of
+ // the quarter-circle is pi/4.
+
+#if defined(__x86_64__)
+
+ xor eax, eax // a = 0
+ mov rcx, 1
+ shl rcx, 0x20 // c =~ 4 billion
+
+0: movzx rbx, cx // x = low 16 bits of c
+ imul rbx, rbx // b = x^2
+
+ ror rcx, 0x10 // switch halves of c
+ movzx rdx, cx // y = high 16 bits of c
+ imul rdx, rdx // d = y^2
+ rol rcx, 0x10 // switch back
+
+ add rbx, rdx // r^2 = x^2 + y^2
+ shr rbx, 0x20 // r^2 >= 1?
+ cmp rbx, 1 // set cf iff r^2 >= 1
+ adc rax, 0 // and add onto accumulator
+ loop 0b
+
+#elif defined(__i386__)
+
+ // this is actually better done in 32 bits. the carry has the wrong
+ // sense here, so instead deduct one for each point outside the
+ // quarter-circle rather than adding one for each point inside it.
+ xor eax, eax
+ xor ecx, ecx
+
+0: movzx ebx, cx
+ imul ebx, ebx
+
+ ror ecx, 0x10
+ movzx edx, cx
+ imul edx, edx
+ rol ecx, 0x10
+
+ add ebx, edx // see?
+ sbb eax, 0
+ loop 0b
+
+#elif defined(__arm__)
+
+ mov r0, #0
+ mov r2, #0
+
+0: uxth r1, r2, ror #0
+ uxth r3, r2, ror #16
+ mul r1, r1, r1
+ mul r3, r3, r3
+ cmn r1, r3 // mlas doesn't set cf usefully
+ addcc r0, r0, #1
+ adds r2, r2, #1
+ bne 0b
+
+#elif defined(__aarch64__)
+
+ mov w0, #0
+ mov w2, #0
+
+0: ubfx w1, w2, #0, #16
+ ubfx w3, w2, #16, #16
+ sub w2, w2, #1
+ mul w1, w1, w1
+ mul w3, w3, w3
+ cmn w1, w3
+ cinc.cc w0, w0
+ cbnz w2, 0b
+
+#else
+ notimpl
+#endif
+
+ ret
endproc
proc x26
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
proc x27
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
proc x28
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
proc x29
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
proc x2a
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
proc x2b
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
proc x2c
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
proc x2d
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
proc x2e
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
proc x2f
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
+///--------------------------------------------------------------------------
+/// 0x30--0x3f
+
proc x30
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
ret
proc x31
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
proc x32
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
proc x33
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
proc x34
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
proc x35
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
proc x36
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
proc x37
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
proc x38
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
proc x39
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
proc x3a
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
proc x3b
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
proc x3c
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
proc x3d
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
proc x3e
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
proc x3f
- ud2
+#if defined(__x86_64__)
+
+ notimpl
+
+#elif defined(__i386__)
+
+ notimpl
+
+#elif defined(__arm__)
+
+ notimpl
+
+#elif defined(__aarch64__)
+
+ notimpl
+
+#else
+ notimpl
+#endif
endproc
+
+///----- That's all, folks --------------------------------------------------