From: Mark Wooding Date: Mon, 19 Oct 2020 03:21:35 +0000 (+0100) Subject: Multiple architectures, more solutions. X-Git-Url: https://git.distorted.org.uk/~mdw/xchg-rax-rax/commitdiff_plain/90c4eee32bd52dbdba4913a5030c7b27cffaa103 Multiple architectures, more solutions. --- diff --git a/.gitignore b/.gitignore index 505bf5b..8a19953 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ *.o -/xchg +/xchg-amd64 +/xchg-i386 +/xchg-armhf +/xchg-arm64 diff --git a/Makefile b/Makefile index ca069ff..5488b34 100644 --- a/Makefile +++ b/Makefile @@ -8,23 +8,36 @@ V_AT = $(call vcond,@) vtag = \ $(call vcond,@printf " %-8s %s\n" "$1" "$(or $2,$@)";) -CC = gcc -m64 -CFLAGS = -O2 -g -Wall -Werror - -AS = gcc -m64 -ASFLAGS = -O2 -g +ARCH = -LD = gcc -m64 +CFLAGS = -O2 -g -Wall -Werror +ASFLAGS = $(CFLAGS) LDFLAGS = -%.o: %.c - $(call vtag,CC)$(CC) -c -o $@ $(CFLAGS) $< - -%.o: %.S - $(call vtag,AS)$(AS) -c -o $@ $(ASFLAGS) $< - -all:: xchg -xchg: xchg.o main.o - $(call vtag,LD)$(LD) -o $@ $^ - -clean::; rm -f xchg *.o +ARCH += i386 +CC/i386 = gcc -m32 + +ARCH += amd64 +CC/amd64 = gcc -m64 + +ARCH += armhf +CC/armhf = arm-linux-gnueabihf-gcc + +ARCH += arm64 +CC/arm64 = aarch64-linux-gnu-gcc + +define defarch.body +AS/$1 = $$(CC/$1) +LD/$1 = $$(CC/$1) +%-$1.o: %.c + $$(call vtag,CC/$1)$$(CC/$1) -c -o $$@ $$(CFLAGS) $$(CFLAGS/$1) $$< +%-$1.o: %.S + $$(call vtag,AS/$1)$$(AS/$1) -c -o $$@ $$(ASFLAGS) $$(ASFLAGS/$1) $$< +all:: xchg-$1 +xchg-$1: xchg-$1.o main-$1.o + $$(call vtag,LD/$1)$$(LD/$1) -o $$@ $$^ +endef +defarch = $(eval $(call defarch.body,$1)) +$(foreach a,$(ARCH), $(call defarch,$a)) + +clean::; rm -f $(foreach a,$(ARCH), xchg-$a) *.o diff --git a/main.c b/main.c index 9ab4c2e..df4b941 100644 --- a/main.c +++ b/main.c @@ -12,10 +12,42 @@ union reg { long u; }; +#if defined(__i386__) +# define GPREGS(_) \ + _(a) _(b) _(c) _(d) _(si) _(di) _(bp) +#elif defined(__x86_64__) +# define GPREGS(_) \ + _(a) _(b) _(c) _(d) _(si) _(di) _(bp) \ + _(r8) _(r9) _(r10) _(r11) _(r12) _(r13) _(r14) _(r15) +#elif defined(__arm__) +# define GPREGS(_) \ + _(r0) _(r1) _(r2) _(r3) _(r4) _(r5) _(r6) _(r7) \ + _(r8) _(r9) _(r10) _(r11) _(r12) +#elif defined(__aarch64__) +# define GPREGS(_) \ + _(x0) _(x1) _(x2) _(x3) _(x4) _(x5) _(x6) _(x7) \ + _(x8) _(x9) _(x10) _(x11) _(x12) _(x13) _(x14) _(x15) +#else +# error "not supported" +#endif + +enum { +#define DEFCONST(r) R_##r, + GPREGS(DEFCONST) +#undef DEFCONST + R_flags, + NREGS +}; + +static const char *const rname[] = { +#define DEFNAME(r) #r, + GPREGS(DEFNAME) +#undef DEFNAME + "f" +}; + struct regs { - union reg - a, b, c, d, si, di, bp, f, - r8, r9, r10, r11, r12, r13, r14, r15; + union reg r[NREGS]; }; struct seg { @@ -25,6 +57,8 @@ struct seg { #define N(v) (sizeof(v)/sizeof((v)[0])) +#define STRCMP(a, op, b) (strcmp((a), (b)) op 0) +#define STRNCMP(a, op, b, n) (strncmp((a), (b), (n)) op 0) #define CTYPE_HACK(func, ch) func((unsigned char)(ch)) #define ISDIGIT(ch) CTYPE_HACK(isdigit, ch) #define ISSPACE(ch) CTYPE_HACK(isspace, ch) @@ -106,12 +140,10 @@ static int hex_digit(char ch) else return (-1); } -static void setreg(union reg *r, - struct seg **seg_inout, - int *i_inout, int argc, char *argv[]) +static void setreg(union reg *r, struct seg **seg_inout, const char *p) { struct seg *seg; - const char *p, *pp; + const char *pp; unsigned char *q; int hi, lo; size_t n; @@ -119,11 +151,9 @@ static void setreg(union reg *r, #define LONG_REG(p) (parse_long("signed register", (p), LONG_MIN, LONG_MAX)) #define ULONG_REG(p) (parse_ulong("unsigned register", (p), 0, 
ULONG_MAX)) - p = *i_inout >= argc ? "-" : argv[(*i_inout)++]; switch (*p) { case '-': if (p[1]) r->i = LONG_REG(p); - else r->u = 0xdeadbeefdeadbeef; break; case 'i': if (p[1] != ':') goto bad; @@ -154,6 +184,12 @@ static void setreg(union reg *r, } r->p = seg->p; break; + case 'z': + if (p[1] != ':') goto bad; + n = parse_ulong("buffer length", p + 2, 0, ~(size_t)0); + seg = (*seg_inout)++; seg->p = q = xmalloc(n); seg->sz = n; + r->p = q; memset(q, 0, n); + break; default: if (ISDIGIT(*p)) r->u = ULONG_REG(p); else if (*p == '+') r->i = LONG_REG(p); @@ -173,7 +209,11 @@ static void dumpreg(const char *name, const union reg *r, { size_t i; +#if ULONG_MAX == 0xffffffff + printf("%3s = 0x%08lx = %20ld = %20lu", name, r->u, r->i, r->u); +#else printf("%3s = 0x%016lx = %20ld = %20lu", name, r->u, r->i, r->u); +#endif if (r->u >= ' ' && r->u <= '~') printf(" = '%c'", (int)r->u); for (i = 0; i < nseg; i++) { if (r->p == seg[i].p) @@ -212,54 +252,54 @@ int main(int argc, char *argv[]) { struct regs r; struct seg seg[16], *segp = seg; - size_t nseg; - int i, j; + size_t nseg, n; + const char *p; + char *q; + unsigned long f; + int i, j, k; + unsigned long l; prog = strrchr(argv[0], '/'); if (prog) prog++; else prog = argv[0]; if (argc < 2) - barf("usage: %s I [A B C D SI DI BP R8 R9 R10 R11 R12 R13 R14 R15 F]", + barf("usage: %s I [REG...]", prog); j = parse_long("program index", argv[1], -1, N(x) - 1); - i = 2; - setreg(&r.a, &segp, &i, argc, argv); - setreg(&r.b, &segp, &i, argc, argv); - setreg(&r.c, &segp, &i, argc, argv); - setreg(&r.d, &segp, &i, argc, argv); - setreg(&r.si, &segp, &i, argc, argv); - setreg(&r.di, &segp, &i, argc, argv); - setreg(&r.bp, &segp, &i, argc, argv); - setreg(&r.r8, &segp, &i, argc, argv); - setreg(&r.r9, &segp, &i, argc, argv); - setreg(&r.r10, &segp, &i, argc, argv); - setreg(&r.r11, &segp, &i, argc, argv); - setreg(&r.r12, &segp, &i, argc, argv); - setreg(&r.r13, &segp, &i, argc, argv); - setreg(&r.r14, &segp, &i, argc, argv); - setreg(&r.r15, &segp, &i, argc, argv); - setreg(&r.f, &segp, &i, argc, argv); - nseg = segp - seg; +#if ULONG_MAX == 0xffffffff +# define DEAD 0xdeadbeef +#else +# define DEAD 0xdeadbeefdeadbeef +#endif + for (i = 0; i < NREGS - 1; i++) r.r[i].u = DEAD; +#undef DEAD + r.r[R_flags].u = 0; + + i = 0; + argv += 2; + while (*argv) { + p = *argv++; + if (ISDIGIT(*p)) { + l = strtoul(p, &q, 10); + if (l < NREGS && *q == '=') { p = q + 1; i = l; } + } else for (k = 0; k < NREGS; k++) { + n = strlen(rname[k]); + if (STRNCMP(p, ==, rname[k], n) && p[n] == '=') + { i = k; p += n + 1; break; } + } + if (i >= NREGS) barf("too many registers"); + setreg(&r.r[i], &segp, p); i++; + } + nseg = segp - seg; call_example(j < 0 ? 
&nop : x[j], &r); - dumpreg("rax", &r.a, seg, nseg); - dumpreg("rbx", &r.b, seg, nseg); - dumpreg("rcx", &r.c, seg, nseg); - dumpreg("rdx", &r.d, seg, nseg); - dumpreg("rsi", &r.si, seg, nseg); - dumpreg("rdi", &r.di, seg, nseg); - dumpreg("rbp", &r.bp, seg, nseg); - dumpreg("rbp", &r.bp, seg, nseg); - dumpreg("r8", &r.r8, seg, nseg); - dumpreg("r9", &r.r9, seg, nseg); - dumpreg("r10", &r.r10, seg, nseg); - dumpreg("r11", &r.r11, seg, nseg); - dumpreg("r12", &r.r12, seg, nseg); - dumpreg("r13", &r.r13, seg, nseg); - dumpreg("r14", &r.r14, seg, nseg); - dumpreg("r15", &r.r15, seg, nseg); + for (i = 0; i < NREGS; i++) dumpreg(rname[i], &r.r[i], seg, nseg); + + f = r.r[R_flags].u; + +#if defined(__i386__) || defined(__x86_64__) #define CF (1 << 0) #define PF (1 << 2) @@ -267,38 +307,37 @@ int main(int argc, char *argv[]) #define SF (1 << 7) #define OF (1 << 11) - dumpreg("f", &r.f, seg, nseg); printf("\tstatus: %ccf %cpf %caf %czf %csf %cdf %cof\n", - (r.f.u >> 0)&1u ? '+' : '-', - (r.f.u >> 2)&1u ? '+' : '-', - (r.f.u >> 4)&1u ? '+' : '-', - (r.f.u >> 6)&1u ? '+' : '-', - (r.f.u >> 7)&1u ? '+' : '-', - (r.f.u >> 10)&1u ? '+' : '-', - (r.f.u >> 11)&1u ? '+' : '-'); + (f >> 0)&1u ? '+' : '-', + (f >> 2)&1u ? '+' : '-', + (f >> 4)&1u ? '+' : '-', + (f >> 6)&1u ? '+' : '-', + (f >> 7)&1u ? '+' : '-', + (f >> 10)&1u ? '+' : '-', + (f >> 11)&1u ? '+' : '-'); printf("\tcond:"); - if (r.f.u&CF) printf(" c/b/nae"); else printf(" nc/ae/nb"); - if (r.f.u&ZF) printf(" e/z"); else printf(" ne/nz"); - if (r.f.u&SF) printf(" s"); else printf(" ns"); - if (r.f.u&OF) printf(" o"); else printf(" no"); - if (r.f.u&PF) printf(" p"); else printf(" np"); - if ((r.f.u&CF) || (r.f.u&ZF)) printf(" be/na"); else printf(" a/nbe"); - if (!(r.f.u&OF) == !(r.f.u&SF)) printf(" ge/nl"); else printf(" l/nge"); - if (!(r.f.u&OF) == !(r.f.u&SF) && !(r.f.u&ZF)) + if (f&CF) printf(" c/b/nae"); else printf(" nc/ae/nb"); + if (f&ZF) printf(" e/z"); else printf(" ne/nz"); + if (f&SF) printf(" s"); else printf(" ns"); + if (f&OF) printf(" o"); else printf(" no"); + if (f&PF) printf(" p"); else printf(" np"); + if ((f&CF) || (f&ZF)) printf(" be/na"); else printf(" a/nbe"); + if (!(f&OF) == !(f&SF)) printf(" ge/nl"); else printf(" l/nge"); + if (!(f&OF) == !(f&SF) && !(f&ZF)) printf(" g/nle"); else printf(" le/ng"); putchar('\n'); printf("\tsystem: %ctf %cif iopl=%d %cnt " "%crf %cvm %cac %cvif %cvip %cid\n", - (r.f.u >> 8)&1u ? '+' : '-', - (r.f.u >> 9)&1u ? '+' : '-', - (int)((r.f.u >> 12)&1u), - (r.f.u >> 14)&1u ? '+' : '-', - (r.f.u >> 16)&1u ? '+' : '-', - (r.f.u >> 17)&1u ? '+' : '-', - (r.f.u >> 18)&1u ? '+' : '-', - (r.f.u >> 19)&1u ? '+' : '-', - (r.f.u >> 20)&1u ? '+' : '-', - (r.f.u >> 21)&1u ? '+' : '-'); + (f >> 8)&1u ? '+' : '-', + (f >> 9)&1u ? '+' : '-', + (int)((f >> 12)&1u), + (f >> 14)&1u ? '+' : '-', + (f >> 16)&1u ? '+' : '-', + (f >> 17)&1u ? '+' : '-', + (f >> 18)&1u ? '+' : '-', + (f >> 19)&1u ? '+' : '-', + (f >> 20)&1u ? '+' : '-', + (f >> 21)&1u ? 
'+' : '-'); #undef CF #undef PF @@ -306,6 +345,90 @@ int main(int argc, char *argv[]) #undef SF #undef OF +#elif defined(__arm__) + +#define NF (1u << 31) +#define ZF (1u << 30) +#define CF (1u << 29) +#define VF (1u << 28) + + { + static const char + *modetab[] = { "?00", "?01", "?02", "?03", "?04", "?05", "?06", "?07", + "?08", "?09", "?10", "?11", "?12", "?13", "?14", "?15", + "usr", "fiq", "irq", "svc", "?20", "?21", "mon", "abt", + "?24", "?25", "hyp", "und", "?28", "?29", "?30", "sys" }, + *condtab[] = { "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc", + "hi", "ls", "ge", "lt", "gt", "le", "al", "nv" }; + + printf("\tuser: %cn %cz %cc %cv %cq ge=%c%c%c%c;", + (f >> 31)&1u ? '+' : '-', + (f >> 30)&1u ? '+' : '-', + (f >> 29)&1u ? '+' : '-', + (f >> 28)&1u ? '+' : '-', + (f >> 27)&1u ? '+' : '-', + (f >> 19)&1u ? '1' : '0', + (f >> 18)&1u ? '1' : '0', + (f >> 17)&1u ? '1' : '0', + (f >> 16)&1u ? '1' : '0'); + if (f&NF) printf(" mi"); else printf(" pl"); + if (f&ZF) printf(" eq"); else printf(" ne"); + if (f&CF) printf(" cs/hs"); else printf(" cc/lo"); + if (f&VF) printf(" vs"); else printf(" vc"); + if ((f&CF) && !(f&ZF)) printf(" hi"); else printf(" ls"); + if (!(f&VF) == !(f&NF)) printf(" ge"); else printf(" lt"); + if (!(f&VF) == !(f&NF) && !(f&ZF)) printf(" gt"); else printf(" le"); + putchar('\n'); + printf("\tsystem: %cj it=%s:%c%c%c%c %ce %ca %ci %cf %ct m=%s\n", + (f >> 24)&1u ? '+' : '-', + condtab[(f >> 12)&15u], + (f >> 11)&1u ? '1' : '0', + (f >> 10)&1u ? '1' : '0', + (f >> 26)&1u ? '1' : '0', + (f >> 25)&1u ? '1' : '0', + (f >> 9)&1u ? '+' : '-', + (f >> 8)&1u ? '+' : '-', + (f >> 7)&1u ? '+' : '-', + (f >> 6)&1u ? '+' : '-', + (f >> 5)&1u ? '+' : '-', + modetab[(f >> 0)&31u]); + } + +#undef NF +#undef ZF +#undef CF +#undef VF + +#elif defined(__aarch64__) + +#define NF (1u << 31) +#define ZF (1u << 30) +#define CF (1u << 29) +#define VF (1u << 28) + + printf("\tuser: %cn %cz %cc %cv;", + (f >> 31)&1u ? '+' : '-', + (f >> 30)&1u ? '+' : '-', + (f >> 29)&1u ? '+' : '-', + (f >> 28)&1u ? '+' : '-'); + if (f&NF) printf(" mi"); else printf(" pl"); + if (f&ZF) printf(" eq"); else printf(" ne"); + if (f&CF) printf(" cs/hs"); else printf(" cc/lo"); + if (f&VF) printf(" vs"); else printf(" vc"); + if ((f&CF) && !(f&ZF)) printf(" hi"); else printf(" ls"); + if (!(f&VF) == !(f&NF)) printf(" ge"); else printf(" lt"); + if (!(f&VF) == !(f&NF) && !(f&ZF)) printf(" gt"); else printf(" le"); + putchar('\n'); + +#undef NF +#undef ZF +#undef CF +#undef VF + +#else +# error "not supported" +#endif + for (i = 0; i < nseg; i++) { printf("seg[%d] (%p):\n", i, seg[i].p); dumpseg(&seg[i]); } diff --git a/xchg.S b/xchg.S index 12aeeba..0dcb6c8 100644 --- a/xchg.S +++ b/xchg.S @@ -1,8 +1,75 @@ -/// -*- mode: asm; asm-comment-char: ?/ -*- +/// -*- mode: asm; asm-comment-char: 0 -*- + +///-------------------------------------------------------------------------- +/// Preliminaries. 
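+///
+/// this section picks an architecture from the compiler's predefined
+/// macros (__i386__, __x86_64__, __arm__, __aarch64__) and defines the
+/// helpers the examples rely on: a `ret' alias for 32-bit arm; `cmov'
+/// and conditional-suffix aliases, and ccmp flag constants, for aarch64;
+/// the `ch' character-printing debug macro; `notimpl'; and the
+/// `call_example' wrapper, which loads the register block, calls the
+/// example, and writes the registers back afterwards.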
+ +#include + +#if defined(__i386__) || defined(__x86_64__) .intel_syntax noprefix - .section .note.GNU-stack, "", @progbits +#elif defined(__arm__) + +.macro ret + bx r14 +.endm + + .arch armv7-a + +#elif defined(__aarch64__) + +.macro cmov rd, rn, cc + csel \rd, \rn, \rd, \cc +.endm +#define _COND(_) \ + _(eq) _(ne) _(cs) _(cc) _(vs) _(vc) _(mi) _(pl) \ + _(ge) _(lt) _(gt) _(le) _(hi) _(ls) _(al) _(nv) \ + _(hs) _(lo) +#define _INST(_) \ + _(ccmp) _(ccmn) \ + _(csel) _(cmov) \ + _(csinc) _(cinc) _(cset) \ + _(csneg) _(cneg) \ + _(csinv) _(cinv) _(csetm) +#define _CONDVAR(cc) _definstvar cc; +#define _INSTVARS(inst) \ + .macro _definstvar cc; \ + .macro inst.\cc args:vararg; inst \args, \cc; .endm; \ + .endm; \ + _COND(_CONDVAR); \ + .purgem _definstvar; + _INST(_INSTVARS) +#undef _COND +#undef _INST +#undef _CONDVAR +#undef _INSTVARS + +#define CCMP_N 8 +#define CCMP_Z 4 +#define CCMP_C 2 +#define CCMP_V 1 + +#define CCMP_MI CCMP_N +#define CCMP_PL 0 +#define CCMP_EQ CCMP_Z +#define CCMP_NE 0 +#define CCMP_CS CCMP_C +#define CCMP_HS CCMP_C +#define CCMP_CC 0 +#define CCMP_LO 0 +#define CCMP_VS CCMP_V +#define CCMP_VC 0 +#define CCMP_HI CCMP_C +#define CCMP_LS 0 +#define CCMP_LT CCMP_N +#define CCMP_GE 0 +#define CCMP_LE CCMP_N +#define CCMP_GT 0 + +#else +# error "not supported" +#endif .macro proc name .globl \name @@ -16,6 +83,36 @@ .endm .macro ch c +#if defined(__i386__) + + pushf + push eax + push ebx + push ecx + push edx + push ebp + mov ebp, esp + and esp, -16 + + push \c + call putchar@plt + + call get_pc_ebx + add ebx, offset _GLOBAL_OFFSET_TABLE + mov eax, [ebx + stdout@GOT] + mov eax, [eax] + call fflush@plt + + mov esp, ebp + pop ebp + pop edx + pop ecx + pop ebx + pop eax + popf + +#elif defined(__x86_64__) + pushf push rax push rcx @@ -44,12 +141,154 @@ pop rcx pop rax popf + +#elif defined(__arm__) + + stmfd r13!, {r0-r4, r12, r14} + + mov r4, r13 + bic r14, r4, #15 + mov r13, r14 + + mov r0, #\c + bl putchar@plt + + ldr r14, .L$_c$gotoff$\@ +.L$_c$gotpc$\@: + add r14, pc, r14 + b .L$_c$cont$\@ +.L$_c$gotoff$\@: + .word _GLOBAL_OFFSET_TABLE - .L$_c$gotpc$\@ - 8 +.L$_c$cont$\@: + bl fflush@plt + + mov r13, r4 + ldmfd r13!, {r0-r4, r12, r14} + +#elif defined(__aarch64__) + + sub sp, sp, #20*8 + stp x0, x1, [sp, #0] + stp x2, x3, [sp, #16] + stp x4, x5, [sp, #32] + stp x6, x7, [sp, #48] + stp x8, x9, [sp, #64] + stp x10, x11, [sp, #80] + stp x12, x13, [sp, #96] + stp x14, x15, [sp, #112] + stp x16, x17, [sp, #128] + mrs x16, nzcv + stp x16, x30, [sp, #144] + + mov w0, #\c + bl putchar + adrp x0, :got:stdout + ldr x0, [x0, #:got_lo12:stdout] + ldr x0, [x0] + bl fflush + + ldp x16, x30, [sp, #144] + msr nzcv, x16 + ldp x16, x17, [sp, #128] + ldp x14, x15, [sp, #112] + ldp x12, x13, [sp, #96] + ldp x10, x11, [sp, #80] + ldp x8, x9, [sp, #64] + ldp x6, x7, [sp, #48] + ldp x4, x5, [sp, #32] + ldp x2, x3, [sp, #16] + ldp x0, x1, [sp, #0] + add sp, sp, #20*8 + +#else +# error "not supported" +#endif .endm +.macro notimpl +#if defined(__i386__) || defined(__x86_64__) + ud2 +#elif defined(__arm__) + udf +#elif defined(__aarch64__) + hlt #0 +#else +# error "not supported" +#endif +.endm + + .section .note.GNU-stack, "", %progbits + .text +#if defined(__i386__) +get_pc_ebx: + mov ebx, [esp] + ret +#endif + + proc call_example +#if defined(__i386__) + + push ebx // ebx + push esi // esi, ebx + push edi // edi, esi, ebx + push ebp // flags, ebp, ..., ebx + pushf + + mov edi, [esp + 4*6] + mov esi, [esp + 4*7] + push esi // regs, flags, ebp, ..., ebx + + call get_pc_ebx + lea eax, [ebx + 
9f - .] + push eax // cont, regs, flags, ebp, ..., ebx + push edi // func, cont, regs, flags, ebp, ..., ebx + + mov eax, [esi + 28] + pushf + pop ecx + and eax, 0x0cd5 + and ecx, ~0x0cd5 + or eax, ecx + push eax + popf + mov eax, [esi + 0] + mov ebx, [esi + 4] + mov ecx, [esi + 8] + mov edx, [esi + 12] + mov edi, [esi + 20] + mov ebp, [esi + 24] + mov esi, [esi + 16] + + ret // -> func; regs, flags, ebp, ..., ebx + +9: pushf // eflags, regs, flags, ebp, ..., ebx + push esi // esi, eflags, regs, flags, ebp, ..., ebx + mov esi, [esp + 8] + mov [esi + 0], eax + mov [esi + 4], ebx + mov [esi + 8], ecx + mov [esi + 12], edx + mov [esi + 20], edi + mov [esi + 24], ebp + pop eax // rflags, regs, flags, ebp, ..., ebx + mov [esi + 16], eax + pop eax // regs, flags, ebp, ..., ebx + mov [esi + 28], eax + + add esp, 4 // flags, ebp, ..., ebx + popf // ebp, ..., ebx + pop ebp // ..., ebx + pop edi + pop esi + pop ebx // + ret + +#elif defined(__x86_64__) + push rbx // rbx push r10 push r11 @@ -66,7 +305,7 @@ proc call_example push rax // cont, regs, flags, rbp, ..., rbx push rdi // func, cont, regs, flags, rbp, ..., rbx - mov rax, [rsi + 56] + mov rax, [rsi + 8*15] pushf pop rcx and rax, 0x0cd5 @@ -74,29 +313,45 @@ proc call_example or rax, rcx push rax popf - mov rax, [rsi + 0] - mov rbx, [rsi + 8] - mov rcx, [rsi + 16] - mov rdx, [rsi + 24] - mov rdi, [rsi + 40] - mov rbp, [rsi + 48] - mov rsi, [rsi + 32] + mov rax, [rsi + 0] + mov rbx, [rsi + 8] + mov rcx, [rsi + 16] + mov rdx, [rsi + 24] + mov rdi, [rsi + 40] + mov rbp, [rsi + 48] + mov r8, [rsi + 56] + mov r9, [rsi + 64] + mov r10, [rsi + 72] + mov r11, [rsi + 80] + mov r12, [rsi + 88] + mov r13, [rsi + 96] + mov r14, [rsi + 104] + mov r15, [rsi + 112] + mov rsi, [rsi + 32] ret // -> func; regs, flags, rbp, ..., rbx 9: pushf // rflags, regs, flags, rbp, ..., rbx push rsi // rsi, rflags, regs, flags, rbp, ..., rbx mov rsi, [rsp + 16] - mov [rsi + 0], rax - mov [rsi + 8], rbx - mov [rsi + 16], rcx - mov [rsi + 24], rdx - mov [rsi + 40], rdi - mov [rsi + 48], rbp + mov [rsi + 0], rax + mov [rsi + 8], rbx + mov [rsi + 16], rcx + mov [rsi + 24], rdx + mov [rsi + 40], rdi + mov [rsi + 48], rbp + mov [rsi + 56], r8 + mov [rsi + 64], r9 + mov [rsi + 72], r10 + mov [rsi + 80], r11 + mov [rsi + 88], r12 + mov [rsi + 96], r13 + mov [rsi + 104], r14 + mov [rsi + 112], r15 pop rax // rflags, regs, flags, rbp, ..., rbx - mov [rsi + 32], rax + mov [rsi + 32], rax pop rax // regs, flags, rbp, ..., rbx - mov [rsi + 56], rax + mov [rsi + 120], rax add rsp, 8 // flags, rbp, ..., rbx popf // rbp, ..., rbx @@ -110,6 +365,68 @@ proc call_example pop rbx // ret +#elif defined(__arm__) + + stmfd r13!, {r0, r1, r4-r11, r14} + ldmia r1, {r0-r12, r14} + msr cpsr, r14 + mov r14, pc + ldr pc, [r13], #4 + ldr r14, [r13], #4 + stmia r14!, {r0-r12} + mrs r0, cpsr + str r0, [r14] + ldmfd r13!, {r4-r11, pc} + +#elif defined(__aarch64__) + + stp x29, x30, [sp, #-13*8]! 
+ mov x29, sp + stp x19, x20, [sp, #16] + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp x25, x26, [sp, #64] + stp x27, x28, [sp, #80] + str x1, [sp, #96] + + mov x16, x0 + + ldr x17, [x1, #128] + ldp x14, x15, [x1, #112] + ldp x12, x13, [x1, #96] + ldp x10, x11, [x1, #80] + ldp x8, x9, [x1, #64] + ldp x6, x7, [x1, #48] + ldp x4, x5, [x1, #32] + ldp x2, x3, [x1, #16] + ldp x0, x1, [x1, #0] + msr nzcv, x17 + + blr x16 + + ldr x16, [sp, #96] + mrs x17, nzcv + str x17, [x16, #128] + stp x14, x15, [x16, #112] + stp x12, x13, [x16, #96] + stp x10, x11, [x16, #80] + stp x8, x9, [x16, #64] + stp x6, x7, [x16, #48] + stp x4, x5, [x16, #32] + stp x2, x3, [x16, #16] + stp x0, x1, [x16, #0] + + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp x25, x26, [sp, #64] + ldp x27, x28, [sp, #80] + ldp x29, x30, [sp], #13*8 + +#else +# error "not supported" +#endif + endproc proc nop @@ -119,19 +436,58 @@ proc nop endproc ///-------------------------------------------------------------------------- +/// 0x00--0x0f proc x00 // clear all 64 bits of extended traditional registers - xor eax,eax // clear rax - lea rbx,[0] // rbx -> _|_ + +#if defined(__x86_64__) + + xor eax, eax // clear rax + lea rbx, [0] // rbx -> _|_ loop . // iterate, decrement rcx until zero - mov rdx,0 // set rdx = 0 - and esi,0 // clear all bits of rsi - sub edi,edi // set rdi = edi - edi = 0 + mov rdx, 0 // set rdx = 0 + and esi, 0 // clear all bits of rsi + sub edi, edi // set rdi = edi - edi = 0 push 0 pop rbp // pop 0 into rbp +#elif defined(__i386__) + + xor eax, eax + lea ebx, [0] + loop . + mov edx, 0 + and esi, 0 + sub edi, edi + push 0 + pop ebp + +#elif defined(__arm__) + + eor r0, r0, r0 + rsb r1, r1, r1 +0: subs r2, r2, #1 + bne 0b + mov r3, #0 + and r4, r4, #0 + sub r5, r5, r5 + +#elif defined(__aarch64__) + + eor w0, w0, w0 + mov w1, wzr +0: sub w2, w2, #1 + cbnz w2, 0b + mov w3, #0 + and w4, w4, wzr + sub w5, w5, w5 + +#else + notimpl +#endif + ret endproc @@ -142,11 +498,43 @@ proc x01 // // on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1} // and f_{i+c}, where f_{i+1} = f_i + f_{i-1} + +#if defined(__x86_64__) + 0: xadd rax, rdx // a, d = a + d, a // = f_{i+1} + f_i, f_{i+1} // = f_{i+2}, f_{i+1} loop 0b // advance i, decrement c, iterate +#elif defined(__i386__) + +0: xadd eax, edx + loop 0b + +#elif defined(__arm__) + +0: subs r2, r2, #2 + add r3, r3, r0 + blo 8f + add r0, r0, r3 + bhi 0b + +8: movne r0, r3 + +#elif defined(__aarch64__) + +0: subs x2, x2, #2 + add x3, x3, x0 + b.lo 8f + add x0, x0, x3 + b.hi 0b + +8: cmov.ne x0, x3 + +#else + notimpl +#endif + ret endproc @@ -155,10 +543,63 @@ proc x02 // boolean canonify a: if a = 0 on entry, leave it zero; otherwise // set a = 1 + +#if defined(__x86_64__) + neg rax // set cf iff a /= 0 sbb rax, rax // a = a - a - cf = -cf neg rax // a = cf +#elif defined(__i386__) + + neg eax + sbb eax, eax + neg eax + +#elif defined(__arm__) + + movs r1, r0 // the easy way + movne r1, #1 // mvnne r1, #1 for mask + + cmp r0, #1 // clear cf iff a == 0 + sbc r2, r0, r0 // c' = a - a - 1 + cf = cf - 1 + add r2, r2, #1 // c' = cf + + sub r3, r0, r0, lsr #1 // d' top bit clear; d' = 0 iff a = 0 + rsb r3, r3, #0 // d' top bit set iff a /= 0 + mov r3, r3, lsr #31 // asr for mask + + rsbs r0, r0, #0 + sbc r0, r0, r0 + rsb r0, r0, #0 + +#elif defined(__aarch64__) + + cmp x0, #0 // trivial + cset.ne x1 // csetm for mask + + cmp xzr, x0 // set cf iff a == 0 + sbc x2, x0, x0 // c' = a - a - 1 + cf = cf - 1 + neg x2, x2 // c' = 1 - cf + + 
sub x3, x0, x0, lsr #1 // if a < 2^63 then a' = ceil(d/2) < + // 2^63 + // if a >= 2^63, write a = 2^63 + t + // with t < 2^63; d' = 2^63 - 2^62 + + // ceil(t/2) = 2^62 + ceil(t/2), and + // ceil(t/2) < 2^62 + // anyway d' < 2^63 and d' = 0 iff + // a = 0 + neg x3, x3 // d' top bit set iff a /= 0 + lsr x3, x3, #63 // asr for mask + + cmp x0, #1 // set cf iff a /= 0 + adc x0, xzr, xzr // a' = 0 + 0 + cf = cf + +#else + notimpl +#endif + ret endproc @@ -166,11 +607,46 @@ endproc proc x03 // set a = min(a, d) (unsigned); clobber c, d + +#if defined(__x86_64__) + sub rdx, rax // d' = d - a; set cf if a > d sbb rcx, rcx // c = -cf = -[a > d] and rcx, rdx // c = a > d ? d - a : 0 add rax, rcx // a' = a > d ? d : a +#elif defined(__i386__) + + sub edx, eax + sbb ecx, ecx + and ecx, edx + add eax, ecx + +#elif defined(__arm__) + + cmp r0, r3 // the easy way + movlo r1, r0 // only needed for out-of-place + movhs r1, r3 + + subs r3, r3, r0 + sbc r12, r12, r12 + and r12, r12, r3 + add r0, r0, r12 + +#elif defined(__aarch64__) + + cmp x0, x3 // the easy way + csel.lo x1, x0, x3 + + subs x3, x3, x0 // d' = d - a; set cf if d >= a + sbc x16, xzr, xzr // t = -1 + cf = -[a > d] + and x16, x16, x3 // t = a > d ? d - a : 0 + add x0, x0, x16 // a' = a > d ? d : a + +#else + notimpl +#endif + ret endproc @@ -178,8 +654,76 @@ endproc proc x04 // switch case? + +#if defined(__x86_64__) + + // unrelated playing + mov ecx, eax + mov rbx, -1 + mov edx, ecx + sub edx, '0' + cmp edx, 10 + cmovb rbx, rdx + or ecx, 0x20 + mov edx, ecx + sub edx, 'a' + sub ecx, 'a' - 10 + cmp edx, 6 + cmovb rbx, rcx + + xor al, 0x20 + +#elif defined(__i386__) + + // unrelated playing + mov ecx, eax + mov ebx, -1 + mov edx, ecx + sub edx, '0' + cmp edx, 10 + cmovb ebx, edx + or ecx, 0x20 + mov edx, ecx + sub edx, 'a' + sub ecx, 'a' - 10 + cmp edx, 6 + cmovb ebx, ecx + xor al, 0x20 +#elif defined(__arm__) + + // unrelated playing + mvn r1, #0 + sub r12, r0, #'0' + cmp r12, #10 + movlo r1, r12 + orr r12, r0, #0x20 + sub r12, r12, #'a' + cmp r12, #6 + addlo r1, r12, #10 + + eor r0, r0, #0x20 + +#elif defined(__aarch64__) + + // unrelated playing + mov x1, #-1 + sub w16, w0, #'0' + cmp w16, #10 + cmov.lo x1, x16 + orr w16, w0, #0x20 + sub w16, w16, #'a' - 10 + cmp w16, #10 + ccmp.hs w16, #16, #CCMP_HS + cmov.lo x1, x16 + + eor w0, w0, #0x20 + +#else + notimpl +#endif + ret endproc @@ -187,6 +731,9 @@ endproc proc x05 // answer whether 5 <= a 4 a > 9 or a < -2^63 + 5 // le/ng a' <= 4 -2^63 + 5 <= a <= 9 +#elif defined(__i386__) + + sub eax, 5 + cmp eax, 4 + +#elif defined(__arm__) + + // i dimly remember having a slick way to do this way back in the + // day, but i can't figure it out any more. + sub r0, #5 + cmp r0, #4 + +#elif defined(__aarch64__) + + // literal translation is too obvious + cmp x0, #5 + ccmp.hs x0, #9, #CCMP_HS + +#else + notimpl +#endif + ret endproc @@ -219,10 +788,35 @@ proc x06 // leave a unchanged, but set zf if a = 0, cf if a /= 0, clear of, // set sf to msb(a) + +#if defined(__x86_64__) + not rax // a' = -a - 1 inc rax // a' = -a neg rax // a' = a +#elif defined(__i386__) + + not eax + inc eax + neg eax + +#elif defined(__arm__) + + mvn r0, r0 + add r0, r0, #1 + rsbs r0, r0, #0 // cf has opposite sense + +#elif defined(__aarch64__) + + mvn x0, x0 + add x0, x0, #1 + negs x0, x0 // cf has opposite sense + +#else + notimpl +#endif + ret endproc @@ -230,11 +824,39 @@ endproc proc x07 // same as before (?) 
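+	//
+	// (it is: the leading inc/neg pair takes a to -a - 1, which is
+	// exactly what x06's `not' did, so the remaining two instructions
+	// -- and the flags they leave -- are the same as before.)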
+ +#if defined(__x86_64__) + inc rax // a' = a + 1 neg rax // a' = -a - 1 inc rax // a' = -a neg rax // a' = a +#elif defined(__i386__) + + inc eax + neg eax + inc eax + neg eax + +#elif defined(__arm__) + + add r0, r0, #1 + rsb r0, r0, #0 + add r0, r0, #1 + rsbs r0, r0, #0 + +#elif defined(__aarch64__) + + add x0, x0, #1 + neg x0, x0 + add x0, x0, #1 + negs x0, x0 // cf has opposite sense + +#else + notimpl +#endif + ret endproc @@ -243,10 +865,45 @@ proc x08 // floor((a + d)/2), correctly handling overflow conditions; final cf // is lsb(a + d), probably uninteresting + +#if defined(__x86_64__) + add rax, rdx // cf || a' = a + d rcr rax, 1 // shift 65-bit result right by one // place; lsb moves into carry +#elif defined(__i386__) + + add eax, edx + rcr eax, 1 + +#elif defined(__arm__) + + // like the two-instruction a64 version + sub r1, r3, r0 + add r1, r0, r1, lsr #1 + + // the slick version, similar to the above + adds r0, r0, r3 + mov r0, r0, rrx + +#elif defined(__aarch64__) + + // a64 lacks a32's rrx. literal translation. + adds x1, x0, x3 // cf || a' = a + d + adc x16, xzr, xzr // realize cf in extra register + extr x1, x16, x1, #1 // shift down one place + + // two instruction version: clobbers additional register. (if you + // wanted the answer in any other register, even overwriting d, then + // this is unnecessary.) also depends on d >= a. + sub x16, x3, x0 // compute difference + add x0, x0, x16, lsr #1 // add half of it (rounded down) + +#else + notimpl +#endif + ret endproc @@ -255,10 +912,33 @@ proc x09 // a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3 // (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8). + +#if defined(__x86_64__) + shr rax, 3 // a' = floor(a/8); cf = 1 if a == // 4, 5, 6, 7 (mod 8) adc rax, 0 // a' = floor(a/8) + cf +#elif defined(__i386__) + + shr eax, 3 + adc eax, 0 + +#elif defined(__arm__) + + movs r0, r0, lsr #3 + adc r0, r0, #0 + +#elif defined(__aarch64__) + + tst x0, #4 + orr x0, xzr, x0, lsr #3 + cinc.ne x0, x0 + +#else + notimpl +#endif + ret endproc @@ -266,11 +946,43 @@ endproc proc x0a // increment c-byte little-endian bignum at rdi + +#if defined(__x86_64__) + add byte ptr [rdi], 1 0: inc rdi adc byte ptr [rdi], 0 loop 0b +#elif defined(__i386__) + + add byte ptr [edi], 1 +0: inc edi + adc byte ptr [edi], 0 + loop 0b + +#elif defined(__arm__) + + mov r12, #256 // set initial carry +0: ldrb r0, [r5] + subs r2, r2, #1 + add r12, r0, r12, lsr #8 + strb r12, [r5], #1 + bne 0b + +#elif defined(__aarch64__) + + mov w17, #256 // set initial carry +0: ldrb w16, [x5] + sub x2, x2, #1 + add w17, w16, w17, lsr #8 + strb w17, [x5], #1 + cbnz x2, 0b + +#else + notimpl +#endif + ret endproc @@ -278,11 +990,36 @@ endproc proc x0b // negate double-precision d:a + +#if defined(__x86_64__) + not rdx // d' = -d - 1 neg rax // a' = -a; // cf = 1 iff a /= 0 sbb rdx, -1 // d' = -d - cf +#elif defined(__i386__) + + not edx + neg eax + sbb edx, -1 + +#elif defined(__arm__) + + // reverse subtract is awesome + rsbs r0, r0, #0 + rsc r3, r3, #0 + +#elif defined(__aarch64__) + + // easy way: everything is better with zero registers. + negs x0, x0 + ngc x3, x3 + +#else + notimpl +#endif + ret endproc @@ -291,6 +1028,8 @@ proc x0c // rotate is distributive over xor. 
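+	//
+	// that is, ROR(a XOR b, n) = ROR(a, n) XOR ROR(b, n) for any n,
+	// because rotation just permutes bit positions and xor acts on
+	// each bit position independently.  the code below checks the
+	// identity for n = 13.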
+#if defined(__x86_64__) + // rax // = a_1 || a_0 // rbx // = b_1 || b_0 mov rcx, rax // = a_1 || a_0 @@ -304,6 +1043,48 @@ proc x0c cmp rax, rcx // always equal +#elif defined(__i386__) + + mov ecx, eax // = a_1 || a_0 + + xor ecx, ebx // = (a_1 XOR b_1) || (a_0 XOR b_0) + ror ecx, 0xd // = (a_0 XOR b_0) || (a_1 XOR b_1) + + ror eax, 0xd // = a_0 || a_1 + ror ebx, 0xd // = b_0 || b_1 + xor eax, ebx // = (a_0 XOR b_0) || (a_1 XOR b_1) + + cmp eax, ecx // always equal + +#elif defined(__arm__) + + + // r0 // = a_1 || a_0 + // r1 // = b_1 || b_0 + eor r2, r0, r1 // = (a_1 XOR b_1) || (a_0 XOR b_0) + mov r2, r2, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1) + + mov r1, r1, ror #13 // = b_0 || b_1 + eor r0, r1, r0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1) + + cmp r0, r2 // always equal + +#elif defined(__aarch64__) + + // x0 // = a_1 || a_0 + // x1 // = b_1 || b_0 + eor x2, x0, x1 // = (a_1 XOR b_1) || (a_0 XOR b_0) + ror x2, x2, #13 // = (a_0 XOR b_0) || (a_1 XOR b_1) + + ror x1, x1, #13 // = b_0 || b_1 + eor x0, x1, x0, ror #13 // = (a_0 XOR b_0) || (a_1 XOR b_1) + + cmp x0, x2 // always equal + +#else + notimpl +#endif + ret endproc @@ -312,6 +1093,8 @@ proc x0d // and is distributive over xor. +#if defined(__x86_64__) + mov rdx, rbx // = b xor rbx, rcx // = b XOR c @@ -324,6 +1107,50 @@ proc x0d cmp rax, rbx // always equal +#elif defined(__i386__) + + mov edx, ebx // = b + + xor ebx, ecx // = b XOR c + and ebx, eax // = a AND (b XOR c) + + and edx, eax // = a AND b + and eax, ecx // = a AND c + xor eax, edx // = (a AND b) XOR (a AND c) + // = a AND (b XOR c) + + cmp eax, ebx // always equal + +#elif defined(__arm__) + + and r3, r0, r1 // = a AND b + + eor r1, r1, r2 // = b XOR c + and r1, r1, r0 // = a AND (b XOR c) + + and r0, r0, r2 // = a AND c + eor r0, r0, r3 // = (a AND b) XOR (a AND c) + // = a AND (b XOR c) + + cmp r0, r1 // always equal + +#elif defined(__aarch64__) + + and x3, x0, x1 // = a AND b + + eor x1, x1, x2 // = b XOR c + and x1, x1, x0 // = a AND (b XOR c) + + and x0, x0, x2 // = a AND c + eor x0, x0, x3 // = (a AND b) XOR (a AND c) + // = a AND (b XOR c) + + cmp x0, x1 // always equal + +#else + notimpl +#endif + ret endproc @@ -332,6 +1159,8 @@ proc x0e // de morgan's law +#if defined(__x86_64__) + mov rcx, rax // = a and rcx, rbx // = a AND b @@ -342,7 +1171,46 @@ proc x0e or rax, rbx // = (NOT a) OR (NOT b) // = NOT (a AND b) - cmp rax, rcx + cmp rax, rcx // always equal + +#elif defined(__i386__) + + mov ecx, eax // = a + + and ecx, ebx // = a AND b + not ecx // = NOT (a AND b) + + not eax // = NOT a + not ebx // = NOT b + or eax, ebx // = (NOT a) OR (NOT b) + // = NOT (a AND b) + + cmp eax, ecx // always equal + +#elif defined(__arm__) + + and r2, r0, r1 // = a AND b + mvn r2, r2 // = NOT (a AND b) + + mvn r0, r0 // = NOT a + mvn r1, r1 // = NOT b + orr r0, r0, r1 // = (NOT a) OR (NOT b) + + cmp r0, r2 // always equal + +#elif defined(__aarch64__) + + and x2, x0, x1 // = a AND b + mvn x2, x2 // = NOT (a AND b) + + mvn x0, x0 // = NOT a + orn x0, x0, x1 // = (NOT a) OR (NOT b) + + cmp x0, x2 // always equal + +#else + notimpl +#endif ret @@ -355,20 +1223,51 @@ proc x0f // // not sure why you'd do this. 
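+	//
+	// (whatever the reason, the effect is to replace each byte of the
+	// buffer by the running xor of a's low byte and the buffer bytes
+	// so far, i.e., x[i] <- a XOR x[0] XOR ... XOR x[i], leaving the
+	// final xor in the low byte of a.)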
- cld +#if defined(__x86_64__) 0: xor [rsi], al lodsb loop 0b +#elif defined(__i386__) + +0: xor [esi], al + lodsb + loop 0b + +#elif defined(__arm__) + +0: ldrb r12, [r4] + subs r2, r2, #1 + eor r0, r0, r12 + strb r0, [r4], #1 + bne 0b + +#elif defined(__aarch64__) + +0: ldrb w16, [x4] + sub x2, x2, #1 + eor w0, w0, w16 + strb w0, [x4], #1 + cbnz x2, 0b + +#else + notimpl +#endif + ret endproc +///-------------------------------------------------------------------------- +/// 0x10--0x1f + proc x10 // four different ways to swap a pair of registers. +#if defined(__x86_64__) + push rax push rcx pop rax @@ -385,6 +1284,76 @@ proc x10 xchg rax, rcx +#elif defined(__i386__) + + push eax + push ecx + pop eax + pop ecx + + xor eax, ecx + xor ecx, eax + xor eax, ecx + + add eax, ecx + sub ecx, eax + add eax, ecx + neg ecx + + xchg eax, ecx + +#elif defined(__arm__) + + stmfd r13!, {r0, r2} + ldr r0, [r13, #4] + ldr r2, [r13], #8 + + eor r0, r0, r2 + eor r2, r2, r0 + eor r0, r0, r2 + + sub r0, r0, r2 + add r2, r2, r0 + rsb r0, r0, r2 // don't need 3-addr with reverse-sub + + mov r12, r0 + mov r0, r2 + mov r2, r0 + +#elif defined(__aarch64__) + + // anything you can do + stp x0, x2, [sp, #-16]! + ldp x2, x0, [sp], #16 + + eor x0, x0, x2 + eor x2, x2, x0 + eor x0, x0, x2 + + // the add/sub/add thing was daft. you can do it in three if you're + // clever -- and have three-address operations. + sub x0, x0, x2 + add x2, x2, x0 + sub x0, x2, x0 + + // but we lack a fourth. we can't do this in fewer than three + // instructions without hitting memory. only `ldp' will modify two + // registers at a time, so we need at least two instructions -- but + // if the first one sets one of our two registers to its final value + // then we lose the other input value with no way to recover it, so + // we must either write a fresh third register, or write something + // other than the final value, and in both cases we need a third + // instruction to fix everything up. we've done the wrong-something- + // other trick twice, so here's the captain-obvious use-a-third- + // register version. + mov x16, x0 + mov x0, x2 + mov x2, x16 + +#else + notimpl +#endif + ret endproc @@ -398,6 +1367,8 @@ proc x11 // in particular, a will be zero (and zf set) if and only if the two // strings are equal. +#if defined(__x86_64__) + 0: mov dl, [rsi] xor dl, [rdi] inc rsi @@ -405,6 +1376,37 @@ proc x11 or al, dl loop 0b +#elif defined(__i386__) + +0: mov dl, [esi] + xor dl, [edi] + inc esi + inc edi + or al, dl + loop 0b + +#elif defined(__arm__) + +0: ldrb r1, [r4], #1 + ldrb r12, [r5], #1 + subs r2, r2, #1 + eor r12, r12, r1 + orr r0, r0, r12 + bne 0b + +#elif defined(__aarch64__) + +0: ldrb w16, [x4], #1 + ldrb w17, [x5], #1 + sub x2, x2, #1 + eor w16, w16, w17 + orr w0, w0, w16 + cbnz x2, 0b + +#else + notimpl +#endif + ret endproc @@ -418,11 +1420,36 @@ proc x12 // move all of the set bits in d to a, unless there's already a bit // there. this clearly doesn't change the sum. 
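+	//
+	// equivalently, a + d = (a OR d) + (a AND d): the or collects each
+	// bit from whichever operand has it, and the and keeps the bits
+	// set in both, which must be counted twice.  for example, with
+	// a = 6 = 0110 and d = 3 = 0011, we get a OR d = 0111 = 7 and
+	// a AND d = 0010 = 2, and indeed 7 + 2 = 9 = 6 + 3.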
+#if defined(__x86_64__) + mov rcx, rdx // c' = d and rdx, rax // d' = a AND d or rax, rcx // a' = a OR d add rax, rdx +#elif defined(__i386__) + + mov ecx, edx // c' = d + and edx, eax // d' = a AND d + or eax, ecx // a' = a OR d + add eax, edx + +#elif defined(__arm__) + + and r2, r0, r3 // c' = a AND d + orr r0, r0, r3 // a' = a OR d + add r0, r0, r2 + +#elif defined(__aarch64__) + + and x2, x0, x3 // c' = a AND d + orr x0, x0, x3 // a' = a OR d + add x0, x0, x2 + +#else + notimpl +#endif + ret endproc @@ -432,13 +1459,46 @@ proc x13 // ok, so this is a really obtuse way of adding a and b; the result // is in a and d. but why does it work? +#if defined(__x86_64__) + mov rcx, 0x40 // carry chains at most 64 long 0: mov rdx, rax // copy a' xor rax, rbx // low bits of each bitwise sum and rbx, rdx // carry bits from each bitwise sum - shl rbx, 001 // carry them into next position + shl rbx, 1 // carry them into next position + loop 0b + +#elif defined(__i386__) + + mov ecx, 0x40 // carry chains at most 64 long +0: mov edx, eax // copy a' + xor eax, ebx // low bits of each bitwise sum + and ebx, edx // carry bits from each bitwise sum + shl ebx, 1 // carry them into next position loop 0b +#elif defined(__arm__) + + mov r2, #0x40 +0: and r3, r0, r1 + subs r2, r2, #1 + eor r0, r0, r1 + lsl r1, r3, #1 + bne 0b + +#elif defined(__aarch64__) + + mov x2, #0x40 +0: and x3, x0, x1 + sub x2, x2, #1 + eor x0, x0, x1 + lsl x1, x3, #1 + cbnz x2, 0b + +#else + notimpl +#endif + ret endproc @@ -447,6 +1507,8 @@ proc x14 // floor((a + d)/2), like x08. +#if defined(__x86_64__) + mov rcx, rax // copy a for later and rcx, rdx // carry bits @@ -455,6 +1517,32 @@ proc x14 add rax, rcx // add the carries; done +#elif defined(__i386__) + + mov ecx, eax // copy a for later + and ecx, edx // carry bits + + xor eax, edx // low bits of each bitwise sum + shr eax, 1 // divide by 2; carries now in place + + add eax, ecx // add the carries; done + +#elif defined(__arm__) + + and r2, r0, r3 + eor r0, r0, r3 + add r0, r2, r0, lsr #1 + +#elif defined(__aarch64__) + + and x2, x0, x3 + eor x0, x0, x3 + add x0, x2, x0, lsr #1 + +#else + notimpl +#endif + ret endproc @@ -463,7 +1551,9 @@ proc x15 // sign extension 32 -> 64 bits. - //movsx rbx, eax // like this? +#if defined(__x86_64__) + + movsx rbx, eax // like this? mov rdx, 0xffffffff80000000 add rax, rdx // if bit 31 of a is set then bits @@ -472,15 +1562,46 @@ proc x15 // exactly backwards xor rax, rdx // so fix it +#elif defined(__i386__) + + movsx ebx, ax // like this? + + mov edx, 0xffff8000 + add eax, edx // if bit 31 of a is set then bits + // 31--63 of a' are clear; otherwise, + // these bits are all set -- which is + // exactly backwards + xor eax, edx // so fix it + +#elif defined(__arm__) + + sxth r1, r0 // like this + + mov r12, #0x80000000 + add r0, r0, r12, asr #16 + eor r0, r0, r12, asr #16 + +#elif defined(__aarch64__) + + sxtw x1, w0 // like this + + mov x16, #0xffffffff80000000 + add x0, x0, x16 + eor x0, x0, x16 + +#else + notimpl +#endif + ret endproc proc x16 - //shl rax, 56 - //shl rbx, 56 - //shl rcx, 56 + // ??? i don't know why you'd want to calculate this. + +#if defined(__x86_64__) xor rax, rbx // a' = a XOR b xor rbx, rcx // b' = b XOR c @@ -490,67 +1611,725 @@ proc x16 xor rax, rbx // a' = cf ? 0 : a XOR c cmp rax, rsi +#elif defined(__i386__) + + xor eax, ebx // a' = a XOR b + xor ebx, ecx // b' = b XOR c + mov esi, eax // t = a XOR b + add esi, ebx // t = (a XOR b) + (b XOR c) + cmovc eax, ebx // a' = cf ? 
b XOR c : a XOR b + xor eax, ebx // a' = cf ? 0 : a XOR c + cmp eax, esi + +#elif defined(__arm__) + + eor r0, r0, r1 + eor r1, r1, r2 + adds r4, r0, r1 + movcs r0, r1 + eor r0, r0, r1 + cmp r0, r4 + +#elif defined(__aarch64__) + + eor x0, x0, x1 + eor x1, x1, x2 + adds x4, x0, x1 + cmov.cs x0, x1 + eor x0, x0, x1 + cmp x0, x4 + +#else + notimpl +#endif + ret endproc proc x17 - ud2 + // absolute value + +#if defined(__x86_64__) + + cqo // d = a < 0 ? -1 : 0 + xor rax, rdx // a' = a < 0 ? -a - 1 : a + sub rax, rdx // a' = a < 0 ? -a : a + +#elif defined(__i386__) + + cdq // d = a < 0 ? -1 : 0 + xor eax, edx // a' = a < 0 ? -a - 1 : a + sub eax, edx // a' = a < 0 ? -a : a + +#elif defined(__arm__) + + // direct approach + movs r1, r0 + rsbmi r1, r0, #0 + + // faithful-ish conversion + eor r3, r0, r0, asr #31 + sub r0, r3, r0, asr #31 + +#elif defined(__aarch64__) + + // direct approach + tst x0, #1 << 63 + cneg.ne x1, x0 + + // faithful-ish conversion + eor x3, x0, x0, asr #63 + sub x0, x3, x0, asr #63 + +#else + notimpl +#endif + + ret endproc proc x18 - ud2 + // should always set sf, clear zf, unless we get rescheduled to a + // different core. + +#if defined(__x86_64__) + + rdtsc // d || a = cycles + shl rdx, 0x20 + or rax, rdx // a = cycles + mov rcx, rax // c = cycles + + rdtsc // d || a = cycles' + shl rdx, 0x20 + or rax, rdx // a = cycles' + + cmp rcx, rax + +#elif defined(__i386__) + + rdtsc // d || a = cycles + mov ebx, eax + mov ecx, edx // c || b = cycles + + rdtsc // d || a = cycles' + + sub ebx, eax + sbb ecx, edx + +#elif defined(__arm__) + + // cycle clock not available in user mode + mrrc p15, 0, r0, r1, c9 + mrrc p15, 0, r2, r3, c9 + subs r0, r0, r2 + sbcs r1, r1, r3 + +#elif defined(__aarch64__) + + // cycle clock not available in user mode + mrs x0, pmccntr_el0 + mrs x1, pmccntr_el0 + cmp x0, x1 + +#else + notimpl +#endif + + ret endproc proc x19 - ud2 + // stupid way to capture a pointer to inline data and jump past it. + // confuses the return-address predictor something chronic. worse + // because amd64 calling convention doesn't usually pass arguments on + // the stack. + +#if defined(__x86_64__) + + call 8f + .string "hello world!\n\0" +8: call print_str + add rsp, 8 + ret + +print_str: + // actually implement this ridiculous thing + mov rsi, [rsp + 8] + xor edx, edx +0: mov al, [rsi + rdx] + inc rdx + cmp al, 0 + jnz 0b + mov eax, SYS_write + mov edi, 1 + dec rdx + syscall // clobbers r11 :-( + ret + +#elif defined(__i386__) + + call 8f + .string "hello world!\n\0" +8: call print_str + add esp, 4 + ret + +print_str: + // actually implement this ridiculous thing + mov ecx, [esp + 4] + xor edx, edx +0: mov al, [ecx + edx] + inc edx + cmp al, 0 + jnz 0b + mov eax, SYS_write + mov ebx, 1 + dec edx + int 0x80 + ret + +#elif defined(__arm__) + + // why am i doing this? + stmfd r13!, {r14} + bl 8f + .string "hello world!\n\0" + .balign 4 +8: mov r1, r14 // might as well make it easy on myself + bl print_str + ldmfd r13!, {pc} + +print_str: + mov r2, #0 +0: ldrb r0, [r1, r2] + cmp r0, #0 + addne r2, r2, #1 + bne 0b + mov r0, #1 + mov r7, #SYS_write + swi 0 + bx r14 + +#elif defined(__aarch64__) + + // why am i doing this? + str x30, [sp, #-16]! 
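+	// (a whole 16-byte slot for one register, because the aarch64 abi
+	// wants sp to stay 16-byte aligned)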
+ bl 8f + .string "hello world!\n\0" + .balign 4 +8: mov x1, x30 // might as well make it easy on myself + bl print_str + ldr x30, [sp], #16 + ret + +print_str: + mov x2, #0 +0: ldrb w0, [x1, x2] + cmp w0, #0 + cinc.ne x2, x2 + b.ne 0b + mov x0, #1 + mov x8, #SYS_write + svc #0 + ret + +#else + notimpl +#endif endproc proc x1a - ud2 + // collect the current instruction-pointer address. this was an old + // 32-bit i386 trick for position-independent code, but (a) it + // confuses the return predictor, and (b) amd64 has true pc-relative + // addressing. + +#if defined(__x86_64__) + + // the actual example + call 0f +0: pop rax + + // the modern i386 trick doesn't confuse the return-address + // predictor. + call calladdr_rbx + sub rbx, . - 0b + + // but rip-relative addressing is even better + lea rcx, [rip + 0b] + + ret + +calladdr_rbx: + mov rbx, [rsp] + ret + +#elif defined(__i386__) + + // the actual example + call 0f +0: pop eax + + // the modern i386 trick doesn't confuse the return-address + // predictor. + call get_pc_ebx + sub ebx, . - 0b + + ret + +#elif defined(__arm__) + + stmfd r13!, {r14} + + bl 0f +0: mov r0, r14 + + bl return + sub r1, r14, #. - 0b + + adr r2, 0b + + ldmfd r13!, {pc} + +return: bx r14 + +#elif defined(__aarch64__) + + str x30, [sp, #-16]! + + // we can do all of the above using a64 + bl 0f +0: mov x0, x30 + + bl return + sub x1, x30, #. - 0b + + adr x2, 0b + + ldr x30, [sp], #16 +return: ret + +#else + notimpl +#endif endproc proc x1b - ud2 +#if defined(__x86_64__) + + // retpolines: an mitigation against adversarially influenced + // speculative execution at indirect branches. if an adversary can + // prepare a branch-target buffer entry matching an indirect branch + // in the victim's address space then they can cause the victim to + // /speculatively/ (but not architecturally) execute any code in + // their address space, possibly leading to leaking secrets through + // the cache. retpolines aren't susceptible to this because the + // predicted destination address is from the return-prediction stack + // which the adversary can't prime. the performance penalty is still + // essentially a branch misprediction -- for this return, and + // possibly all others already stacked. + + // (try not to crash) + lea rax, [rip + 9f] + + push rax +9: ret + +#elif defined(__i386__) + + call get_pc_ebx + lea eax, [ebx + 9f - .] + + push eax +9: ret + +#elif defined(__arm__) + + stmfd r13!, {r14} + + adr r14, 8f + bx r14 + +8: ldmfd r13!, {pc} + +#elif defined(__aarch64__) + + str x30, [sp, #-16]! + + adr x30, 8f + ret + +8: ldr x30, [sp], #16 + ret + +#else + notimpl +#endif endproc proc x1c - ud2 + // ok, having a hard time seeing a use for this. the most important + // thing to note is that sp is set from `pop' /after/ it's + // incremented. + +#if defined(__x86_64__) + + // try not to crash + mov rax, rsp + and rsp, -16 + push rax + + pop rsp + + // check it worked + mov rbx, rsp + ret + +#elif defined(__i386__) + + // try not to crash + mov eax, esp + and esp, -16 + push eax + + pop esp + + // check it worked + mov ebx, esp + ret + +#elif defined(__arm__) + + // not even going to dignify this + notimpl + +#elif defined(__aarch64__) + + // not even going to dignify this + notimpl + +#else + notimpl +#endif endproc proc x1d - ud2 + // monumentally cheesy way to copy 8 n bytes from buff1 to buff2. + // also clobbers words at buff2 + 8 n and buff2 - 8 for good measure. 
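+	//
+	// (on x86, the trick is `enter's second operand, the lexical
+	// nesting level: at level n + 1, the cpu pushes rbp, copies the
+	// n words below the old rbp -- here pointed at buff1 -- onto the
+	// stack -- here pointed at buff2 -- and then pushes the new frame
+	// pointer.  see the stack diagram below.)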
+ + n = 4 + +#if defined(__x86_64__) + + mov rax, rsp // safekeeping + + // we're toast if we get hit by a signal now. fingers crossed... + .if 0 + mov rsp, buff2 + 8*n + 8 + mov rbp, buff1 + 8*n + .else + lea rsp, [rdi + 8*n + 16] + lea rbp, [rsi + 8*n] + .endif + enter 0, n + 1 + + // precise action: + // + // +---------+ +---------+ + // rbp -> | ??? | rsp -> | ??? | + // +---------+ +---------+ + // | w_{n-1} | | rbp | <- rbp' + // +---------+ +---------+ + // | ... | | w_{n-1} | + // +---------+ +---------+ + // | w_1 | | ... | + // +---------+ +---------+ + // | w_0 | | w_1 | + // +---------+ +---------+ + // | w_0 | + // +---------+ + // | rbp' | <- rsp' + // +---------+ + + mov rdx, rsp + mov rsp, rax + +#elif defined(__i386__) + + mov eax, esp // safekeeping + + // we're toast if we get hit by a signal now. fingers crossed... + .if 0 + mov esp, buff2 + 4*n + 4 + mov ebp, buff1 + 4*n + .else + lea esp, [edi + 4*n + 8] + lea ebp, [esi + 4*n] + .endif + enter 0, n + 1 + + mov edx, esp + mov esp, eax + +#elif defined(__arm__) + + add r4, r4, #4*n + add r5, r5, #4*n + 8 + + str r4, [r5, #-4]! + .rept n/2 + ldrd r0, r1, [r4, #-8]! + strd r0, r1, [r5, #-8]! + .endr + add r4, r5, #4*n + str r4, [r5, #-4]! + +#elif defined(__aarch64__) + + // omgwtf. let's not actually screw with the stack pointer. + + add x4, x4, #8*n + add x5, x5, #8*n + 16 + + str x4, [x5, #-8]! + .rept n/2 + ldp x16, x17, [x4, #-16]! + stp x16, x17, [x5, #-16]! + .endr + add x4, x5, #8*n + str x4, [x5, #-8]! + +#else + notimpl +#endif + + ret endproc proc x1e - ud2 + // convert nibble value to (uppercase) hex; other input values yield + // nonsense. + +#if defined(__x86_64__) + + // das doesn't work in 64-bit mode; best i can come up with + mov edx, eax + add al, '0' + add dl, 'A' - 10 + cmp al, '9' + 1 + cmovae eax, edx + +#elif defined(__i386__) + + cmp al, 0x0a // cf = 1 iff a < 10 + sbb al, 0x69 // if 0 <= a < 10, a' = a - 0x6a, so + // 0x96 <= a' < 0x70, setting af, cf + // if 10 <= a < 16, a' = a - 0x69, so + // 0x71 <= a' < 0x77, setting cf but + // clearing af + das // if 0 <= a < 10, then af and cf are + // both set, so set subtract 0x66 + // from a' leaving 0x30 <= a' < 0x3a; + // if 10 <= a < 16 then af clear but + // cf set, so subtract 0x60 from a' + // leaving 0x41 <= a' < 0x47 + +#elif defined(__arm__) + + // significantly less tricksy + cmp r0, #10 + addlo r0, r0, #'0' + addhs r0, r0, #'A' - 10 + +#elif defined(__aarch64__) + + // with less versatile conditional execution this is the best we can + // do + cmp w0, #10 + add w16, w0, #'A' - 10 + add w0, w0, #'0' + cmov.hs w0, w16 + +#else + notimpl +#endif + + ret endproc proc x1f - ud2 + // verify collatz conjecture starting at a; assume a /= 0! + +#if defined(__x86_64__) + +0: bsf rcx, rax // clobber c if a = 0 + shr rax, cl // a = 2^c a' + cmp rdx, 0 + je 1f + stosq + dec rdx +1: + cmp rax, 1 // done? + je 9f + lea rax, [2*rax + rax + 1] // a' = 3 a' + 1 + jmp 0b // again + +9: ret + +#elif defined(__i386__) + +0: bsf ecx, eax // clobber c if a = 0 + shr eax, cl // a = 2^c a' + cmp edx, 0 + je 1f + stosd + dec edx +1: + cmp eax, 1 // done? 
+ je 9f + lea eax, [2*eax + eax + 1] // a' = 3 a' + 1 + jmp 0b // again + +9: ret + +#elif defined(__arm__) + + // rbit introduced in armv7 +0: rbit r2, r0 + clz r2, r2 + mov r0, r0, lsr r2 // a = 2^c a' + cmp r3, #0 + strne r0, [r5], #4 + subne r3, r3, #1 + cmp r0, #1 + adcne r0, r0, r0, lsl #1 // a' = 3 a' + 1 (because c set) + bne 0b + + ret + +#elif defined(__aarch64__) + +0: rbit w2, w0 + clz w2, w2 + lsr w0, w0, w2 // a = 2^c a' + cmp x3, #0 + beq 1f + str x0, [x5], #8 + sub x3, x3, #1 +1: + cmp w0, #1 + add w16, w0, w0, lsl #1 // t = 3 a' + 1 (because c set) + csinc.eq w0, w0, w16 + b.ne 0b + + ret + +#else + notimpl +#endif endproc +///-------------------------------------------------------------------------- +/// 0x20--0x2f + proc x20 - ud2 + // calculate 1337 a slowly + +#if defined(__x86_64__) + + // original version + mov rcx, rax // c = a + shl rcx, 2 // c = 4 a + add rcx, rax // c = 5 a + shl rcx, 3 // c = 40 a + add rcx, rax // c = 41 a + shl rcx, 1 // c = 82 a + add rcx, rax // c = 83 a + shl rcx, 1 // c = 166 a + add rcx, rax // c = 167 a + shl rcx, 3 // c = 1336 a + add rcx, rax // c = 1337 a + + // a quick way + lea rdx, [2*rax + rax] // t = 3 a + shl rdx, 6 // t = 192 a + sub rdx, rax // t = 191 a + lea rbx, [8*rdx] // b = 1528 a + sub rbx, rdx // b = 1337 a + +#elif defined(__i386__) + + // original version + mov ecx, eax // c = a + shl ecx, 2 // c = 4 a + add ecx, eax // c = 5 a + shl ecx, 3 // c = 40 a + add ecx, eax // c = 41 a + shl ecx, 1 // c = 82 a + add ecx, eax // c = 83 a + shl ecx, 1 // c = 166 a + add ecx, eax // c = 167 a + shl ecx, 3 // c = 1336 a + add ecx, eax // c = 1337 a + + // a quick way + lea edx, [2*eax + eax] // t = 3 a + shl edx, 6 // t = 192 a + sub edx, eax // t = 191 a + lea ebx, [8*edx] // b = 1528 a + sub ebx, edx // b = 1337 a + +#elif defined(__arm__) + + // original version, ish + add r2, r0, r0, lsl #2 // c = 5 a + add r2, r0, r2, lsl #3 // c = 41 a + add r2, r0, r2, lsl #1 // c = 83 a + add r2, r0, r2, lsl #1 // c = 167 a + add r2, r0, r2, lsl #3 // c = 1337 a + + // quicker way + add r1, r0, r0, lsl #1 // b = 3 a + rsb r1, r0, r1, lsl #6 // b = 191 a + rsb r1, r1, r1, lsl #3 // b = 1337 a + +#elif defined(__aarch64__) + + // original version, ish + add x2, x0, x0, lsl #2 // c = 5 a + add x2, x0, x2, lsl #3 // c = 41 a + add x2, x0, x2, lsl #1 // c = 83 a + add x2, x0, x2, lsl #1 // c = 167 a + add x2, x0, x2, lsl #3 // c = 1337 a + + // sleazy because no rsb + add x1, x0, x0, lsl #1 // b = 3 a + sub x1, x0, x1, lsl #6 // b = -191 a + sub x1, x1, x1, lsl #3 // b = 1337 a + +#else + notimpl +#endif ret @@ -558,97 +2337,667 @@ endproc proc x21 - ud2 + // multiply complex numbers a + b i and c + d i + // + // (a + b i) (c + d i) = (a c - b d) + (a d + b c) i + // + // somewhat slick approach uses only three multiplications + +#if defined(__x86_64__) + + mov rsi, rax // t = a + add rax, rbx // a' = a + b + mov rdi, rdx // u = d + sub rdx, rcx // d' = d - c + add rdi, rcx // u = c + d + + imul rax, rcx // a' = c (a + b) + imul rsi, rdx // t = a (d - c) + imul rdi, rbx // u = b (c + d) + + add rsi, rax // t = a (d - c) + c (a + b) + mov rbx, rsi // b' = a (d - c) + c (a + b) + // = a d + b c + sub rax, rdi // a' = c (a + b) - b (c + d) + // = a c - b d + +#elif defined(__i386__) + + mov esi, eax // t = a + add eax, ebx // a' = a + b + mov edi, edx // u = d + sub edx, ecx // d' = d - c + add edi, ecx // u = c + d + + imul eax, ecx // a' = c (a + b) + imul esi, edx // t = a (d - c) + imul edi, ebx // u = b (c + d) + + add esi, eax // t = a (d 
- c) + c (a + b) + mov ebx, esi // b' = a (d - c) + c (a + b) + // = a d + b c + sub eax, edi // a' = c (a + b) - b (c + d) + // = a c - b d + +#elif defined(__arm__) + + add r4, r0, r1 // t = a + b + add r5, r2, r3 // u = c + d + sub r3, r3, r2 // d' = d - c + + // mls introduced in armv7 + mul r4, r4, r2 // t = c (a + b) + mov r2, r1 // c' = a (bah!) + mla r1, r0, r3, r4 // b' = a (d - c) + c (a + b) + // = a d + b c + mls r0, r2, r5, r4 // a' = c (a + b) - b (c + d) + // = a c - b d + +#elif defined(__aarch64__) + + add x4, x0, x1 // t = a + b + add x5, x2, x3 // u = c + d + sub x3, x3, x2 // d' = d - c + + // mls intxoduced in axmv7 + mul x4, x4, x2 // t = c (a + b) + mov x2, x1 // c' = a (bah!) + madd x1, x0, x3, x4 // b' = a (d - c) + c (a + b) + // = a d + b c + msub x0, x2, x5, x4 // a' = c (a + b) - b (c + d) + // = a c - b d + +#else + notimpl +#endif + + ret endproc proc x22 - ud2 + // divide by 3 + +#if defined(__x86_64__) + + mov rdx, 0xaaaaaaaaaaaaaaab // = ceil(2/3 2^64) + mul rdx // d' || a' =~ 2/3 a 2^64 + shr rdx, 1 // d' = floor(a/3) + mov rax, rdx // a' = floor(a/3) + + // we start with 0 <= a < 2^64. write f = ceil(2/3 2^64), so that + // 2/3 < f/2^64 < 2/3 + 1/2^64. then floor(2/3 a) <= floor(a f/2^64) + // <= floor(2/3 a + a/2^64), but a < 2^64 so a/2^64 < 1 and + // floor(a f/2^64) = floor(2/3 a). + +#elif defined(__i386__) + + mov edx, 0xaaaaaaab // = ceil(2/3 2^32) + mul edx // d' || a' =~ 2/3 a 2^32 + shr edx, 1 // d' = floor(a/3) + mov eax, edx // a' = floor(a/3) + +#elif defined(__arm__) + + ldr r12, =0xaaaaaaab + umull r12, r0, r0, r12 + mov r0, r0, lsr #1 + +#elif defined(__aarch64__) + + ldr x16, =0xaaaaaaaaaaaaaaab + umulh x0, x0, x16 + lsr x0, x0, #1 + +#else + notimpl +#endif + + ret endproc proc x23 - ud2 +#if defined(__x86_64__) + + // main loop: shorten a preserving residue class mod 3 +0: cmp rax, 5 + jbe 8f + // a > 5 + mov rdx, rax // d' = a + shr rdx, 2 // d' = floor(a/4) + and rax, 3 // a = 4 d' + a' (0 <= a' < 4) + add rax, rdx // a' == a (mod 3) but a' < a/4 + 4 + jmp 0b + + // fix up final value 0 <= a < 6: want 0 <= a < 3 + // + // the tricky part is actually a = 3; but the other final cases take + // additional iterations which we can avoid. +8: cmp rax, 3 // set cf iff a < 3 + cmc // set cf iff a >= 3 + sbb rdx, rdx // d' = a >= 3 ? -1 : 0 + and rdx, 3 // d' = a >= 3 ? 3 : 0 + sub rax, rdx // a' = a - (a >= 3 ? 3 : 0) + // = a (mod 3) + +#elif defined(__i386__) + + // main loop: shorten a preserving residue class mod 3 +0: cmp eax, 5 + jbe 8f + // a > 5 + mov edx, eax // d' = a + shr edx, 2 // d' = floor(a/4) + and eax, 3 // a = 4 d' + a' (0 <= a' < 4) + add eax, edx // a' == a (mod 3) but a' < a/4 + 4 + jmp 0b + + // fix up final value 0 <= a < 6: want 0 <= a < 3 + // + // the tricky part is actually a = 3; but the other final cases take + // additional iterations which we can avoid. +8: cmp eax, 3 // set cf iff a < 3 + cmc // set cf iff a >= 3 + sbb edx, edx // d' = a >= 3 ? -1 : 0 + and edx, 3 // d' = a >= 3 ? 3 : 0 + sub eax, edx // a' = a - (a >= 3 ? 
3 : 0) + // = a (mod 3) + +#elif defined(__arm__) + +0: cmp r0, #6 + andhs r12, r0, #3 + addhs r0, r12, r0, lsr #2 + bhs 0b + + cmp r0, #3 + subhs r0, r0, #3 + +#elif defined(__aarch64__) + +0: cmp x0, #6 + // blunder on through regardless since this doesn't affect the result + and x16, x0, #3 + add x0, x16, x0, lsr #2 + b.hs 0b + + subs x16, x0, #3 + cmov.hs x0, x16 + +#else + notimpl +#endif + + ret endproc proc x24 - ud2 + // invert (odd) a mod 2^64 + // + // suppose a a_i == 1 (mod 2^{2^i}) + // + // clearly good for i = 0, since 2^i = 1 and 2^{2^i} = 2, and a_0 = + // a == 1 (mod 2) by assumption + // + // write a a_i == b_i 2^{2^i} + 1 (mod 2^{2^{i+1}}) + // then b_i == (a a_i - 1)/2^{2^i} (mod 2^{2^i}) + // to lift inverse, we want x such that a x == -b_i (mod 2^{2^i}); + // clearly x = -a_i b_i will do, since a a_i == 1 (mod 2^{2^i}) + // then: + // a_{i+1} = a_i - a_i b_i 2^{2^i} = a_i (1 - (a a_i - 1)) + // = 2 a_i - a a_i^2 + // + // check: + // a a_{i+1} = 2 a a_i - a^2 a_i^2 + // == 2 a a_i - (b_i 2^{2^i} + 1)^2 + // == 2 (b_i 2^{2^i} + 1) - + // (b_i^2 2^{2^{i+1}} + 2 b_i 2^{2^i} + 1) + // == 1 (mod 2^{2^{i+1}}) + +#if defined(__x86_64__) + + // rax // a_0 = a + mov rbx, rax // b' = a + mov rsi, rax // t = a_0 + +0: + cmp rbp, 0 + je 1f + stosq + dec rbp +1: + mul rbx // a' = a a_i + mov rcx, rax // c = a a_i + + sub rax, 2 // a' = a a_i - 2 + neg rax // a' = 2 - a a_i + mul rsi // a_{i+1} = a_i (2 - a a_i) + // = 2 a_i - a a_i^2 + mov rsi, rax // t = a_{i+1} + + cmp rcx, 1 // done? + ja 0b // no -- iterate + +#elif defined(__i386__) + + // eax // a_0 = a + mov ebx, eax // b' = a + mov esi, eax // t = a_0 + +0: + cmp ebp, 0 + je 1f + stosd + dec ebp +1: + mul ebx // a' = a a_i + mov ecx, eax // c = a a_i + + sub eax, 2 // a' = a a_i - 2 + jb 9f // done if < 2 + neg eax // a' = 2 - a a_i + mul esi // a_{i+1} = a_i (2 - a a_i) + // = 2 a_i - a a_i^2 + mov esi, eax // t = a_{i+1} + + jmp 0b // and iterate +9: mov eax, esi // restore + +#elif defined(__arm__) + + // r0 // a_0 = a + mov r1, r0 // b' = a + +0: + cmp r6, #0 + strne r0, [r5], #4 + subne r6, r6, #1 + mul r2, r0, r1 // c = a a_i + rsbs r2, r2, #2 // c = 2 - a a_i + mul r0, r0, r2 // a_{i+1} = a_i (2 - a a_i) + // = 2 a_i - a a_i^2 + blo 0b + +#elif defined(__aarch64__) + + // x0 // a_0 = a + mov x1, x0 // b' = a + mov x16, #2 // because we have no rsb + +0: + cmp x6, #0 + b.eq 1f + str x0, [x5], #8 + sub x6, x6, #1 +1: + mul x2, x0, x1 // c = a a_i + subs x2, x16, x2 // c = 2 - a a_i + mul x0, x0, x2 // a_{i+1} = a_i (2 - a a_i) + // = 2 a_i - a a_i^2 + b.lo 0b + +#else + notimpl +#endif + + ret endproc proc x25 - ud2 + // a poor approximation to pi/4 + // + // think of x and y as being in 16.16 fixed-point format. we sample + // points in the unit square, and determine how many of them are + // within a unit quarter-circle centred at the origin. the area of + // the quarter-circle is pi/4. + +#if defined(__x86_64__) + + xor eax, eax // a = 0 + mov rcx, 1 + shl rcx, 0x20 // c =~ 4 billion + +0: movzx rbx, cx // x = low 16 bits of c + imul rbx, rbx // b = x^2 + + ror rcx, 0x10 // switch halves of c + movzx rdx, cx // y = high 16 bits of c + imul rdx, rdx // d = y^2 + rol rcx, 0x10 // switch back + + add rbx, rdx // r^2 = x^2 + y^2 + shr rbx, 0x20 // r^2 >= 1? + cmp rbx, 1 // set cf iff r^2 >= 1 + adc rax, 0 // and add onto accumulator + loop 0b + +#elif defined(__i386__) + + // this is actually better done in 32 bits. 
the carry has the wrong + // sense here, so instead deduct one for each point outside the + // quarter-circle rather than adding one for each point inside it. + xor eax, eax + xor ecx, ecx + +0: movzx ebx, cx + imul ebx, ebx + + ror ecx, 0x10 + movzx edx, cx + imul edx, edx + rol ecx, 0x10 + + add ebx, edx // see? + sbb eax, 0 + loop 0b + +#elif defined(__arm__) + + mov r0, #0 + mov r2, #0 + +0: uxth r1, r2, ror #0 + uxth r3, r2, ror #16 + mul r1, r1, r1 + mul r3, r3, r3 + cmn r1, r3 // mlas doesn't set cf usefully + addcc r0, r0, #1 + adds r2, r2, #1 + bne 0b + +#elif defined(__aarch64__) + + mov w0, #0 + mov w2, #0 + +0: ubfx w1, w2, #0, #16 + ubfx w3, w2, #16, #16 + sub w2, w2, #1 + mul w1, w1, w1 + mul w3, w3, w3 + cmn w1, w3 + cinc.cc w0, w0 + cbnz w2, 0b + +#else + notimpl +#endif + + ret endproc proc x26 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x27 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x28 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x29 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x2a - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x2b - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x2c - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x2d - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x2e - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x2f - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc +///-------------------------------------------------------------------------- +/// 0x30--0x3f + proc x30 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif ret @@ -656,90 +3005,362 @@ endproc proc x31 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x32 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif 
defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x33 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x34 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x35 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x36 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x37 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x38 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x39 - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x3a - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x3b - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x3c - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x3d - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x3e - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc proc x3f - ud2 +#if defined(__x86_64__) + + notimpl + +#elif defined(__i386__) + + notimpl + +#elif defined(__arm__) + + notimpl + +#elif defined(__aarch64__) + + notimpl + +#else + notimpl +#endif endproc + +///----- That's all, folks --------------------------------------------------
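
A few host-side cross-checks of the tricks above, written in C; these are illustrative sketches only, and none of the helper names below come from the patch.

For x21, the three-multiplication complex product can be verified directly: form t = c (a + b), then b' = t + a (d - c) and a' = t - b (c + d).  A minimal sketch (cmul3 is an invented name; arithmetic is deliberately unsigned so it wraps mod 2^64 like the registers):

    #include <assert.h>
    #include <stdint.h>

    /* (a + b i) (c + d i) = (a c - b d) + (a d + b c) i, using three
     * multiplications, following the register comments in x21.
     */
    static void cmul3(uint64_t a, uint64_t b, uint64_t c, uint64_t d,
                      uint64_t *re, uint64_t *im)
    {
      uint64_t t = c*(a + b);           /* t  = c (a + b) */
      *im = t + a*(d - c);              /* b' = a d + b c */
      *re = t - b*(c + d);              /* a' = a c - b d */
    }

    int main(void)
    {
      uint64_t re, im;
      cmul3(5, 1, 3, 2, &re, &im);      /* (5 + i)(3 + 2i) = 13 + 13i */
      assert(re == 13 && im == 13);
      return (0);
    }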
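For x22, the magic constant is easy to check from C, assuming a compiler that provides the unsigned __int128 extension (GCC and Clang both do); div3 is my name, and the LCG is just a cheap way to generate spread-out test values:

    #include <assert.h>
    #include <stdint.h>

    /* floor(a/3), computed as x22 does: take the high 64 bits of
     * a * ceil(2/3 2^64), then shift right once more.
     */
    static uint64_t div3(uint64_t a)
    {
      unsigned __int128 p = (unsigned __int128)a * 0xaaaaaaaaaaaaaaabULL;
      return ((uint64_t)(p >> 64) >> 1);
    }

    int main(void)
    {
      uint64_t a = 0;
      unsigned i;

      for (i = 0; i < 10000000; i++) {
        assert(div3(a) == a/3);
        a = a*6364136223846793005ULL + 1442695040888963407ULL;
      }
      return (0);
    }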
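For x23, the shortening step relies on 4 == 1 (mod 3): splitting a = 4 q + r and replacing it by q + r preserves the residue while shrinking a.  The same loop and fix-up in C (mod3 is an illustrative name):

    #include <assert.h>
    #include <stdint.h>

    /* a mod 3 without dividing: fold base-4 digits while a > 5, then
     * subtract 3 once if the 0..5 remnant needs it, as in x23.
     */
    static uint64_t mod3(uint64_t a)
    {
      while (a > 5) a = (a & 3) + (a >> 2);
      return (a >= 3 ? a - 3 : a);
    }

    int main(void)
    {
      uint64_t a;

      for (a = 0; a < 3000000; a++) assert(mod3(a) == a%3);
      return (0);
    }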
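For x24, the lifting step a_{i+1} = a_i (2 - a a_i) doubles the number of correct low-order bits each pass, so the whole inversion is only a handful of multiplications.  The same loop in C, with the same termination test as the assembler (stop once a x == 1); inv64 is an invented name:

    #include <assert.h>
    #include <stdint.h>

    /* Inverse of odd a modulo 2^64 by the Newton/Hensel step
     * x <- x (2 - a x), starting from x = a, which is right mod 2.
     */
    static uint64_t inv64(uint64_t a)
    {
      uint64_t x = a;

      while (a*x != 1) x *= 2 - a*x;
      return (x);
    }

    int main(void)
    {
      uint64_t a;

      for (a = 1; a < 2000000; a += 2) assert(a*inv64(a) == 1);
      return (0);
    }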
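For x25, the x86-64 loop visits every pair of 16-bit x and y exactly once and counts those with x^2 + y^2 < 2^32, so the final count divided by 2^32 should approach pi/4, up to the crudeness of the lattice sampling.  The equivalent brute-force C (all 2^32 points, so expect it to run for a little while):

    #include <stdint.h>
    #include <stdio.h>

    /* Count 16-bit lattice points inside the quarter-circle of radius
     * 2^16; count/2^32 approximates pi/4.
     */
    int main(void)
    {
      uint64_t count = 0, r2;
      uint32_t x, y;

      for (x = 0; x < 0x10000; x++)
        for (y = 0; y < 0x10000; y++) {
          r2 = (uint64_t)x*x + (uint64_t)y*y;
          if (r2 < ((uint64_t)1 << 32)) count++;
        }
      printf("pi =~ %.6f\n", 4.0*count/4294967296.0);
      return (0);
    }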