From 0f23f75ff53acadf80e9d3dfd2dfd14cb526074f Mon Sep 17 00:00:00 2001 From: Mark Wooding Date: Sat, 21 May 2016 14:33:28 +0100 Subject: [PATCH] Add support for AMD64 processors and Microsoft Windows. * Slightly modify CPU-feature-probing code in `base/dispatch.c', mostly to use wider registers for the stack operations since there are no 32-bit `push' instructions. The feature codes are the same for both, so there's no corresponding header-file change. * Add appropriate macros to `base/asm-common.h' for dealing with PIC on AMD64. It's refreshingly straightforward. * Modify the existing assembler code to support the new environments. This is mostly tuning register allocation and prologue/epilogue sequences. * Use the fancy code on the new platforms. --- base/asm-common.h | 33 +- base/dispatch.c | 43 ++- configure.ac | 5 +- symm/Makefile.am | 15 +- symm/{chacha-x86-sse2.S => chacha-x86ish-sse2.S} | 111 ++++-- symm/chacha.c | 8 +- symm/rijndael-base.c | 8 +- ...ijndael-x86-aesni.S => rijndael-x86ish-aesni.S} | 428 ++++++++++++++------- symm/rijndael.c | 14 +- symm/{salsa20-x86-sse2.S => salsa20-x86ish-sse2.S} | 161 +++++--- symm/salsa20.c | 8 +- 11 files changed, 594 insertions(+), 240 deletions(-) rename symm/{chacha-x86-sse2.S => chacha-x86ish-sse2.S} (65%) rename symm/{rijndael-x86-aesni.S => rijndael-x86ish-aesni.S} (53%) rename symm/{salsa20-x86-sse2.S => salsa20-x86ish-sse2.S} (63%) diff --git a/base/asm-common.h b/base/asm-common.h index 7e62eb54..8745ea43 100644 --- a/base/asm-common.h +++ b/base/asm-common.h @@ -58,9 +58,22 @@ F(name): \ #endif ///-------------------------------------------------------------------------- -/// x86-specific hacking. +/// Windows-specific hacking. + +#if ABI_WIN #if CPUFAM_X86 +# define F(name) _##name +#endif + +#endif + +///-------------------------------------------------------------------------- +/// x86- and amd64-specific hacking. +/// +/// It's (slightly) easier to deal with both of these in one go. + +#if CPUFAM_X86 || CPUFAM_AMD64 // Set the function hooks. #define FUNC_PREHOOK(_) .balign 16 @@ -86,7 +99,7 @@ F(name): \ // Maybe load GOT address into GOT. .macro ldgot got=GOTREG -#if WANT_PIC +#if WANT_PIC && CPUFAM_X86 call _where_am_i.\got add \got, offset _GLOBAL_OFFSET_TABLE_ #endif @@ -94,7 +107,7 @@ F(name): \ // Maybe build a helper subroutine for `ldgot GOT'. .macro gotaux got=GOTREG -#if WANT_PIC +#if WANT_PIC && CPUFAM_X86 .align 16 _where_am_i.\got : mov \got, [esp] @@ -105,9 +118,19 @@ _where_am_i.\got : // Load address of external symbol ADDR into REG, maybe using GOT. .macro leaext reg, addr, got=GOTREG #if WANT_PIC +# if CPUFAM_X86 mov \reg, [\got + \addr@GOT] +# endif +# if CPUFAM_AMD64 + mov \reg, \addr@GOTPCREL[rip] +# endif #else +# if CPUFAM_X86 mov \reg, offset \addr +# endif +# if CPUFAM_AMD64 + lea \reg, \addr[rip] +# endif #endif .endm @@ -115,7 +138,9 @@ _where_am_i.\got : // referring to ADDR, which is within our module, maybe using GOT. #define INTADDR(...) INTADDR__0(__VA_ARGS__, GOTREG, dummy) #define INTADDR__0(addr, got, ...) 
 	INTADDR__1(addr, got)
-#if WANT_PIC
+#if CPUFAM_AMD64
+# define INTADDR__1(addr, got) addr + rip
+#elif WANT_PIC
 # define INTADDR__1(addr, got) got + addr@GOTOFF
 #else
 # define INTADDR__1(addr, got) addr
diff --git a/base/dispatch.c b/base/dispatch.c
index 61c45fa7..8936ea4a 100644
--- a/base/dispatch.c
+++ b/base/dispatch.c
@@ -41,7 +41,7 @@
 
 /*----- Intel x86/AMD64 feature probing -----------------------------------*/
 
-#ifdef CPUFAM_X86
+#if CPUFAM_X86 || CPUFAM_AMD64
 
 # define EFLAGS_ID (1u << 21)
 # define CPUID1D_SSE2 (1u << 26)
@@ -64,6 +64,7 @@ struct cpuid { unsigned a, b, c, d; };
  */
 
 #ifdef __GNUC__
+# if CPUFAM_X86
 static __inline__ unsigned getflags(void)
   { unsigned f; __asm__ ("pushf; popl %0" : "=g" (f)); return (f); }
 static __inline__ unsigned setflags(unsigned f)
@@ -74,6 +75,18 @@ static __inline__ unsigned setflags(unsigned f)
 	   : "g" (f));
   return (ff);
 }
+# else
+static __inline__ unsigned long getflags(void)
+  { unsigned long f; __asm__ ("pushf; popq %0" : "=g" (f)); return (f); }
+static __inline__ unsigned long setflags(unsigned long f)
+{
+  unsigned long ff;
+  __asm__ ("pushf; pushq %1; popf; pushf; popq %0; popf"
+	   : "=g" (ff)
+	   : "g" (f));
+  return (ff);
+}
+# endif
 #endif
 
 static void cpuid(struct cpuid *cc, unsigned a, unsigned c)
@@ -97,9 +110,19 @@ static void cpuid(struct cpuid *cc, unsigned a, unsigned c)
   /* Alas, EBX is magical in PIC code, so abuse ESI instead.  This isn't
    * pretty, but it works.
    */
+# if CPUFAM_X86
   __asm__ ("pushl %%ebx; cpuid; movl %%ebx, %%esi; popl %%ebx"
 	   : "=a" (cc->a), "=S" (cc->b), "=c" (cc->c), "=d" (cc->d)
 	   : "a" (a) , "c" (c));
+# elif CPUFAM_AMD64
+  __asm__ ("pushq %%rbx; cpuid; movl %%ebx, %%esi; popq %%rbx"
+	   : "=a" (cc->a), "=S" (cc->b), "=c" (cc->c), "=d" (cc->d)
+	   : "a" (a) , "c" (c));
+# else
+# error "I'm confused."
+# endif
+  dispatch_debug("CPUID(%08x, %08x) -> %08x, %08x, %08x, %08x",
+		 a, c, cc->a, cc->b, cc->c, cc->d);
 #else
   dispatch_debug("GNU inline assembler not available; can't CPUID");
 #endif
@@ -141,6 +164,7 @@ static int xmm_registers_available_p(void)
    * XMM registers are actually alive.
    */
   if (!cpuid_features_p(CPUID1D_FXSR, 0)) return (0);
+# if CPUFAM_X86
   __asm__ ("movl %%esp, %%edx; subl $512, %%esp; andl $~15, %%esp\n"
 	   "fxsave (%%esp)\n"
 	   "movl 160(%%esp), %%eax; xorl $0xaaaa5555, 160(%%esp)\n"
 	   "fxrstor (%%esp); fxsave (%%esp)\n"
 	   "movl 160(%%esp), %%ecx; movl %%eax, 160(%%esp)\n"
 	   "fxrstor (%%esp); movl %%edx, %%esp\n"
 	   "xorl %%ecx, %%eax"
 	   : "=a" (f)
 	   : /* no inputs */
 	   : "%ecx", "%edx");
+# elif CPUFAM_AMD64
+  __asm__ ("movq %%rsp, %%rdx; subq $512, %%rsp; andq $~15, %%rsp\n"
+	   "fxsave (%%rsp)\n"
+	   "movl 160(%%rsp), %%eax; xorl $0xaaaa5555, 160(%%rsp)\n"
+	   "fxrstor (%%rsp); fxsave (%%rsp)\n"
+	   "movl 160(%%rsp), %%ecx; movl %%eax, 160(%%rsp)\n"
+	   "fxrstor (%%rsp); movq %%rdx, %%rsp\n"
+	   "xorl %%ecx, %%eax"
+	   : "=a" (f)
+	   : /* no inputs */
+	   : "%ecx", "%rdx");
+# else
+# error "I'm confused."
+# endif
+  dispatch_debug("XMM registers %savailable", f ? "" : "not ");
   return (f);
 #else
   dispatch_debug("GNU inline assembler not available; can't check for XMM");
 #endif
@@ -257,7 +296,7 @@ int cpu_feature_p(int feat)
     return (feat_debug(ftok, "runtime probe", cond));
 
   switch (feat) {
-#ifdef CPUFAM_X86
+#if CPUFAM_X86 || CPUFAM_AMD64
     CASE_CPUFEAT(X86_SSE2, "x86:sse2",
 		 xmm_registers_available_p() &&
 		 cpuid_features_p(CPUID1D_SSE2, 0));
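[Aside -- not part of the patch.  For orientation: `getflags' and
`setflags' exist so that a caller can test whether `cpuid' is available at
all, by toggling the ID bit in EFLAGS and seeing whether the change
sticks.  A minimal sketch of that probe, assuming the helpers above; the
function name here is hypothetical, and the real caller lives elsewhere in
`dispatch.c'.]

    /* Sketch only: report whether the `cpuid' instruction exists.  (On
     * a 32-bit build the flags words are plain `unsigned'.)
     */
    static int cpuid_available_p(void)
    {
      unsigned long f = getflags();
      unsigned long g = setflags(f ^ EFLAGS_ID);

      /* If the ID bit stayed toggled then `cpuid' is supported. */
      return ((f ^ g) & EFLAGS_ID ? 1 : 0);
    }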
diff --git a/configure.ac b/configure.ac
index b76c5619..8a58d782 100644
--- a/configure.ac
+++ b/configure.ac
@@ -55,7 +55,10 @@ dnl The table of CPU families and ABIs which we might support.  Support is
 dnl not uniform: each dispatched function might or might not have an
 dnl implementation for any particular CPU/ABI combination.
 AC_DEFUN([catacomb_CPU_FAMILIES],
-  [$1([i[[3-6]]86,*], [x86], [sysv])])
+  [$1([i[[3-6]]86,cygwin], [x86], [win])
+   $1([i[[3-6]]86,*], [x86], [sysv])
+   $1([x86_64,cygwin], [amd64], [win])
+   $1([x86_64,*], [amd64], [sysv])])
 
 dnl A utility to clear the `seen' flags, used so as to process each CPU or
 dnl ABI once.
diff --git a/symm/Makefile.am b/symm/Makefile.am
index ba037cd5..e78277b7 100644
--- a/symm/Makefile.am
+++ b/symm/Makefile.am
@@ -181,7 +181,10 @@ BLKCS += rc5
 BLKCS += rijndael rijndael192 rijndael256
 libsymm_la_SOURCES += rijndael-base.h rijndael-base.c
 if CPUFAM_X86
-libsymm_la_SOURCES += rijndael-x86-aesni.S
+libsymm_la_SOURCES += rijndael-x86ish-aesni.S
+endif
+if CPUFAM_AMD64
+libsymm_la_SOURCES += rijndael-x86ish-aesni.S
 endif
 libsymm_la_SOURCES += $(precomp)/rijndael-tab.c
 PRECOMPS += $(precomp)/rijndael-tab.c
@@ -382,7 +385,10 @@ EXTRA_DIST += salsa20-tvconv
 pkginclude_HEADERS += salsa20.h salsa20-core.h
 libsymm_la_SOURCES += salsa20.c
 if CPUFAM_X86
-libsymm_la_SOURCES += salsa20-x86-sse2.S
+libsymm_la_SOURCES += salsa20-x86ish-sse2.S
+endif
+if CPUFAM_AMD64
+libsymm_la_SOURCES += salsa20-x86ish-sse2.S
 endif
 TESTS += salsa20.$t
 ALL_CIPHERS += salsa20 salsa2012 salsa208
@@ -411,7 +417,10 @@ t/salsa20: salsa20-tvconv t/salsa20.local $(SALSA20_ESTREAM_TV)
 pkginclude_HEADERS += chacha.h chacha-core.h
 libsymm_la_SOURCES += chacha.c
 if CPUFAM_X86
-libsymm_la_SOURCES += chacha-x86-sse2.S
+libsymm_la_SOURCES += chacha-x86ish-sse2.S
+endif
+if CPUFAM_AMD64
+libsymm_la_SOURCES += chacha-x86ish-sse2.S
 endif
 TESTS += chacha.$t
 EXTRA_DIST += t/chacha
diff --git a/symm/chacha-x86-sse2.S b/symm/chacha-x86ish-sse2.S
similarity index 65%
rename from symm/chacha-x86-sse2.S
rename to symm/chacha-x86ish-sse2.S
index ccdfa538..f36bf90f 100644
--- a/symm/chacha-x86-sse2.S
+++ b/symm/chacha-x86ish-sse2.S
@@ -44,17 +44,73 @@
 	.arch	pentium4
 	.section .text
 
-FUNC(chacha_core_x86_sse2)
+FUNC(chacha_core_x86ish_sse2)
+
+	// Initial setup.
+
+#if CPUFAM_X86
+	// Arguments come in on the stack, and will need to be collected.
+	// We can get away with just the scratch registers for integer
+	// work, but we'll run out of XMM registers and will need some
+	// properly aligned space which we'll steal from the stack.  I
+	// don't trust the stack pointer's alignment, so I'll have to mask
+	// the stack pointer, which in turn means I'll need to keep track
+	// of the old value.  Hence I'm making a full i386-style stack
+	// frame here.
+	//
+	// The Windows and SysV ABIs are sufficiently similar that we don't
+	// need to worry about the differences here.
+
+# define NR ecx
+# define IN eax
+# define OUT edx
+# define SAVE0 xmm5
+# define SAVE1 xmm6
+# define SAVE2 xmm7
+# define SAVE3 [esp]
 
-	// Initial state.  We have three arguments:
-	// [ebp +  8] is the number of rounds to do
-	// [ebp + 12] points to the input matrix
-	// [ebp + 16] points to the output matrix
 	push	ebp
 	mov	ebp, esp
 	sub	esp, 16
-	mov	edx, [ebp + 12]
+	mov	IN, [ebp + 12]
+	mov	OUT, [ebp + 16]
 	and	esp, ~15
+	mov	NR, [ebp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+	// This is nice.  We have plenty of XMM registers, and the arguments
+	// are in useful places.  There's no need to spill anything and we
+	// can just get on with the code.
+ +# define NR edi +# define IN rsi +# define OUT rdx +# define SAVE0 xmm5 +# define SAVE1 xmm6 +# define SAVE2 xmm7 +# define SAVE3 xmm8 +#endif + +#if CPUFAM_AMD64 && ABI_WIN + // Arguments come in registers, but they're different between Windows + // and everyone else (and everyone else is saner). + // + // The Windows ABI insists that we preserve some of the XMM + // registers, but we want more than we can use as scratch space. We + // only need to save a copy of the input for the feedforward at the + // end, so we might as well use memory rather than spill extra + // registers. (We need an extra 8 bytes to align the stack.) + +# define NR ecx +# define IN rdx +# define OUT r8 +# define SAVE0 xmm5 +# define SAVE1 [rsp + 0] +# define SAVE2 [rsp + 16] +# define SAVE3 [rsp + 32] + + sub rsp, 48 + 8 +#endif // First job is to slurp the matrix into XMM registers. Be careful: // the input matrix isn't likely to be properly aligned. @@ -63,20 +119,17 @@ FUNC(chacha_core_x86_sse2) // [ 4 5 6 7] (b, xmm1) // [ 8 9 10 11] (c, xmm2) // [12 13 14 15] (d, xmm3) - movdqu xmm0, [edx + 0] - movdqu xmm1, [edx + 16] - movdqu xmm2, [edx + 32] - movdqu xmm3, [edx + 48] - - // Prepare for the main loop. - mov ecx, [ebp + 8] + movdqu xmm0, [IN + 0] + movdqu xmm1, [IN + 16] + movdqu xmm2, [IN + 32] + movdqu xmm3, [IN + 48] // Take a copy for later. This one is aligned properly, by // construction. - movdqa [esp], xmm0 - movdqa xmm5, xmm1 - movdqa xmm6, xmm2 - movdqa xmm7, xmm3 + movdqa SAVE0, xmm0 + movdqa SAVE1, xmm1 + movdqa SAVE2, xmm2 + movdqa SAVE3, xmm3 loop: // Apply a column quarterround to each of the columns simultaneously. @@ -174,26 +227,30 @@ loop: pshufd xmm1, xmm1, ROTL // Decrement the loop counter and see if we should go round again. - sub ecx, 2 + sub NR, 2 ja loop // Almost there. Firstly, the feedforward addition. - mov edx, [ebp + 16] - paddd xmm0, [esp] - paddd xmm1, xmm5 - paddd xmm2, xmm6 - paddd xmm3, xmm7 + paddd xmm0, SAVE0 + paddd xmm1, SAVE1 + paddd xmm2, SAVE2 + paddd xmm3, SAVE3 // And now we write out the result. This one won't be aligned // either. - movdqu [edx + 0], xmm0 - movdqu [edx + 16], xmm1 - movdqu [edx + 32], xmm2 - movdqu [edx + 48], xmm3 + movdqu [OUT + 0], xmm0 + movdqu [OUT + 16], xmm1 + movdqu [OUT + 32], xmm2 + movdqu [OUT + 48], xmm3 // Tidy things up. +#if CPUFAM_X86 mov esp, ebp pop ebp +#endif +#if CPUFAM_AMD64 && ABI_WIN + add rsp, 48 + 8 +#endif // And with that, we're done. 
 	ret
diff --git a/symm/chacha.c b/symm/chacha.c
index 5683c8e9..80a84c17 100644
--- a/symm/chacha.c
+++ b/symm/chacha.c
@@ -72,14 +72,14 @@ static void simple_core(unsigned r, const chacha_matrix src,
 			chacha_matrix dest)
   { CHACHA_nR(dest, src, r); CHACHA_FFWD(dest, src); }
 
-#ifdef CPUFAM_X86
-extern core__functype chacha_core_x86_sse2;
+#if CPUFAM_X86 || CPUFAM_AMD64
+extern core__functype chacha_core_x86ish_sse2;
 #endif
 
 static core__functype *pick_core(void)
 {
-#ifdef CPUFAM_X86
-  DISPATCH_PICK_COND(chacha_core, chacha_core_x86_sse2,
+#if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(chacha_core, chacha_core_x86ish_sse2,
 		     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
   DISPATCH_PICK_FALLBACK(chacha_core, simple_core);
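[Aside -- not part of the patch.  The `CPU_DISPATCH' machinery in
`dispatch.h' is what connects `pick_core' to the function the rest of the
library calls: in outline, it generates an indirection that consults
`pick_core' once, the first time through, and caches the winner.  The
sketch below is just the shape of the idea, not the real macro expansion.]

    /* Rough shape of what CPU_DISPATCH arranges for the ChaCha core. */
    static core__functype *core__ptr = 0;

    static void core(unsigned r, const chacha_matrix src,
                     chacha_matrix dest)
    {
      if (!core__ptr) core__ptr = pick_core();
      core__ptr(r, src, dest);
    }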
diff --git a/symm/rijndael-base.c b/symm/rijndael-base.c
index 3d2bb8ef..b0505a66 100644
--- a/symm/rijndael-base.c
+++ b/symm/rijndael-base.c
@@ -116,14 +116,14 @@ CPU_DISPATCH(static, EMPTY, void, setup,
 	     (rijndael_ctx *k, unsigned nb, const void *buf, unsigned nk),
 	     (k, nb, buf, nk),
 	     pick_setup, simple_setup)
 
-#ifdef CPUFAM_X86
-extern setup__functype rijndael_setup_x86_aesni;
+#if CPUFAM_X86 || CPUFAM_AMD64
+extern setup__functype rijndael_setup_x86ish_aesni;
 #endif
 
 static setup__functype *pick_setup(void)
 {
-#ifdef CPUFAM_X86
-  DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_x86_aesni,
+#if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_x86ish_aesni,
 		     cpu_feature_p(CPUFEAT_X86_AESNI));
 #endif
   DISPATCH_PICK_FALLBACK(rijndael_setup, simple_setup);
diff --git a/symm/rijndael-x86-aesni.S b/symm/rijndael-x86ish-aesni.S
similarity index 53%
rename from symm/rijndael-x86-aesni.S
rename to symm/rijndael-x86ish-aesni.S
index c0cd437a..91fcc352 100644
--- a/symm/rijndael-x86-aesni.S
+++ b/symm/rijndael-x86ish-aesni.S
@@ -72,45 +72,137 @@
 
 ///--------------------------------------------------------------------------
 /// Key setup.
 
-FUNC(rijndael_setup_x86_aesni)
+FUNC(rijndael_setup_x86ish_aesni)
 
-	// Initial state.  We have four arguments:
-	// [esp + 20] is the context pointer
-	// [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
-	// [esp + 28] points to the key material, unaligned
-	// [esp + 32] is the size of the key, in words
-	// The key size has already been checked for validity, and the number
-	// of rounds has been computed.  Our job is only to fill in the `w'
-	// and `wi' vectors.
+#if CPUFAM_X86
+	// Arguments are on the stack.  We'll need to stack the caller's
+	// register variables, but we'll manage.
+
+# define CTX ebp			// context pointer
+# define BLKSZ [esp + 24]		// block size
+
+# define SI esi				// source pointer
+# define DI edi				// destination pointer
+
+# define KSZ ebx			// key size
+# define KSZo ebx			// ... as address offset
+# define NKW edx			// total number of key words
+# define NKW_NEEDS_REFRESH 1		// ... needs recalculating
+# define RCON ecx			// round constants table
+# define LIM edx			// limit pointer
+# define LIMn edx			// ... as integer offset from base
+
+# define NR ecx				// number of rounds
+# define LRK eax			// distance to last key
+# define LRKo eax			// ... as address offset
+# define BLKOFF edx			// block size in bytes
+# define BLKOFFo edx			// ... as address offset
+
+	// Stack the caller's registers.
 	push	ebp
 	push	ebx
 	push	esi
 	push	edi
 
+	// Set up our own variables.
+	mov	CTX, [esp + 20]		// context base pointer
+	mov	SI, [esp + 28]		// key material
+	mov	KSZ, [esp + 32]		// key size, in words
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+	// Arguments are in registers.  We have plenty, but, to be honest,
+	// the initial register allocation is a bit annoying.
+
+# define CTX r8				// context pointer
+# define BLKSZ r9d			// block size
+
+# define SI rsi				// source pointer
+# define DI rdi				// destination pointer
+
+# define KSZ edx			// key size
+# define KSZo rdx			// ... as address offset
+# define NKW r10d			// total number of key words
+# define RCON rdi			// round constants table
+# define LIM rcx			// limit pointer
+# define LIMn ecx			// ... as integer offset from base
+
+# define NR ecx				// number of rounds
+# define LRK eax			// distance to last key
+# define LRKo rax			// ... as address offset
+# define BLKOFF r9d			// block size in bytes
+# define BLKOFFo r9			// ... as address offset
+
+	// Move arguments to more useful places.
+	mov	CTX, rdi		// context base pointer
+	mov	BLKSZ, esi		// block size in words
+	mov	SI, rdx			// key material
+	mov	KSZ, ecx		// key size, in words
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+	// Arguments are in different registers, and they're a little tight.
+
+# define CTX r8				// context pointer
+# define BLKSZ edx			// block size
+
+# define SI rsi				// source pointer
+# define DI rdi				// destination pointer
+
+# define KSZ r9d			// key size
+# define KSZo r9			// ... as address offset
+# define NKW r10d			// total number of key words
+# define RCON rdi			// round constants table
+# define LIM rcx			// limit pointer
+# define LIMn ecx			// ... as integer offset from base
+
+# define NR ecx				// number of rounds
+# define LRK eax			// distance to last key
+# define LRKo rax			// ... as address offset
+# define BLKOFF edx			// block size in bytes
+# define BLKOFFo rdx			// ... as address offset
+
+	// We'll need the index registers, which belong to the caller in
+	// this ABI.
+	push	rsi
+	push	rdi
+
+	// Move arguments to more useful places.
+	mov	SI, r8			// key material
+	mov	CTX, rcx		// context base pointer
+#endif
+
 	// The initial round key material is taken directly from the input
 	// key, so copy it over.
-	mov	ebp, [esp + 20]		// context base pointer
-	mov	ebx, [esp + 32]		// key size, in words
-	mov	ecx, ebx
-	mov	esi, [esp + 28]
-	lea	edi, [ebp + w]
+#if CPUFAM_AMD64 && ABI_SYSV
+	// We've been lucky.  We already have a copy of the context pointer
+	// in rdi, and the key size in ecx.
+	add	DI, w
+#else
+	lea	DI, [CTX + w]
+	mov	ecx, KSZ
+#endif
 	rep	movsd
 
 	// Find out other useful things.
-	mov	edx, [ebp + nr]		// number of rounds
-	add	edx, 1
-	imul	edx, [esp + 24]		// total key size in words
-	sub	edx, ebx		// offset by the key size
+	mov	NKW, [CTX + nr]		// number of rounds
+	add	NKW, 1
+	imul	NKW, BLKSZ		// total key size in words
+#if !NKW_NEEDS_REFRESH
+	// If we can't keep NKW for later, then we use the same register for
+	// it and LIM, so this move is unnecessary.
+	mov	LIMn, NKW
+#endif
+	sub	LIMn, KSZ		// offset by the key size
 
 	// Find the round constants.
 	ldgot	ecx
-	leaext	ecx, rijndael_rcon, ecx
+	leaext	RCON, rijndael_rcon, ecx
 
 	// Prepare for the main loop.
-	lea	esi, [ebp + w]
-	mov	eax, [esi + 4*ebx - 4]	// most recent key word
-	lea	edx, [esi + 4*edx]	// limit, offset by one key expansion
+	lea	SI, [CTX + w]
+	mov	eax, [SI + 4*KSZo - 4]	// most recent key word
+	lea	LIM, [SI + 4*LIM]	// limit, offset by one key expansion
 
 	// Main key expansion loop.  The first word of each key-length chunk
 	// needs special treatment.
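[Aside -- not part of the patch.  The hunk below is the key-expansion loop
itself.  For reference, this is the textbook AES recurrence it implements,
as a plain-C sketch: `subword' (S-box each byte) and `rotword' (rotate
left one byte) are hypothetical helpers standing in for what
`aeskeygenassist' computes, `rcon' is the round-constant table found by
`leaext' above, and byte-order details are glossed over.]

    /* Reference sketch of the expansion loop below; not library code. */
    static void expand(uint32_t *w, unsigned ksz, unsigned nkw,
                       const uint8_t *rcon)
    {
      unsigned i;
      uint32_t t;

      for (i = ksz; i < nkw; i++) {
        t = w[i - 1];
        if (i%ksz == 0) t = subword(rotword(t)) ^ rcon[i/ksz - 1];
        else if (ksz > 6 && i%ksz == 4) t = subword(t);
        w[i] = w[i - ksz] ^ t;
      }
    }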
@@ -131,76 +223,76 @@ FUNC(rijndael_setup_x86_aesni) aeskeygenassist xmm1, xmm0, 0 pshufd xmm1, xmm1, ROTL movd eax, xmm1 - xor eax, [esi] - xor al, [ecx] - inc ecx - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx + xor eax, [SI] + xor al, [RCON] + inc RCON + mov [SI + 4*KSZo], eax + add SI, 4 + cmp SI, LIM jae 8f // The next three words are simple... - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx + xor eax, [SI] + mov [SI + 4*KSZo], eax + add SI, 4 + cmp SI, LIM jae 8f // (Word 2...) - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx + xor eax, [SI] + mov [SI + 4*KSZo], eax + add SI, 4 + cmp SI, LIM jae 8f // (Word 3...) - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx + xor eax, [SI] + mov [SI + 4*KSZo], eax + add SI, 4 + cmp SI, LIM jae 8f // Word 4. If the key is /more/ than 6 words long, then we must // apply a substitution here. - cmp ebx, 5 + cmp KSZ, 5 jb 9b - cmp ebx, 7 + cmp KSZ, 7 jb 0f movd xmm0, eax pshufd xmm0, xmm0, ROTL aeskeygenassist xmm1, xmm0, 0 movd eax, xmm1 -0: xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx +0: xor eax, [SI] + mov [SI + 4*KSZo], eax + add SI, 4 + cmp SI, LIM jae 8f // (Word 5...) - cmp ebx, 6 + cmp KSZ, 6 jb 9b - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx + xor eax, [SI] + mov [SI + 4*KSZo], eax + add SI, 4 + cmp SI, LIM jae 8f // (Word 6...) - cmp ebx, 7 + cmp KSZ, 7 jb 9b - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx + xor eax, [SI] + mov [SI + 4*KSZo], eax + add SI, 4 + cmp SI, LIM jae 8f // (Word 7...) - cmp ebx, 8 + cmp KSZ, 8 jb 9b - xor eax, [esi] - mov [esi + 4*ebx], eax - add esi, 4 - cmp esi, edx + xor eax, [SI] + mov [SI + 4*KSZo], eax + add SI, 4 + cmp SI, LIM jae 8f // Must be done by now. @@ -219,130 +311,183 @@ FUNC(rijndael_setup_x86_aesni) // there's easily enough buffer space for the over-enthusiastic reads // and writes because the context has space for 32-byte blocks, which // is our maximum and an exact fit for two SSE registers. -8: mov ecx, [ebp + nr] // number of rounds - mov ebx, [esp + 24] // block size (in words) - mov edx, ecx - imul edx, ebx - lea edi, [ebp + wi] - lea esi, [ebp + 4*edx + w] // last round's keys - shl ebx, 2 // block size (in bytes now) +8: mov NR, [CTX + nr] // number of rounds +#if NKW_NEEDS_REFRESH + mov BLKOFF, BLKSZ + mov LRK, NR + imul LRK, BLKOFF +#else + // If we retain NKW, then BLKSZ and BLKOFF are the same register + // because we won't need the former again. + mov LRK, NKW + sub LRK, BLKSZ +#endif + lea DI, [CTX + wi] + lea SI, [CTX + w + 4*LRKo] // last round's keys + shl BLKOFF, 2 // block size (in bytes now) // Copy the last encryption round's keys. - movdqu xmm0, [esi] - movdqu [edi], xmm0 - cmp ebx, 16 + movdqu xmm0, [SI] + movdqu [DI], xmm0 + cmp BLKOFF, 16 jbe 9f - movdqu xmm0, [esi + 16] - movdqu [edi + 16], xmm0 + movdqu xmm0, [SI + 16] + movdqu [DI + 16], xmm0 // Update the loop variables and stop if we've finished. -9: add edi, ebx - sub esi, ebx - sub ecx, 1 +9: add DI, BLKOFFo + sub SI, BLKOFFo + sub NR, 1 jbe 0f // Do another middle round's keys... - movdqu xmm0, [esi] + movdqu xmm0, [SI] aesimc xmm0, xmm0 - movdqu [edi], xmm0 - cmp ebx, 16 + movdqu [DI], xmm0 + cmp BLKOFF, 16 jbe 9b - movdqu xmm0, [esi + 16] + movdqu xmm0, [SI + 16] aesimc xmm0, xmm0 - movdqu [edi + 16], xmm0 + movdqu [DI + 16], xmm0 jmp 9b // Finally do the first encryption round. 
-0: movdqu xmm0, [esi] - movdqu [edi], xmm0 - cmp ebx, 16 +0: movdqu xmm0, [SI] + movdqu [DI], xmm0 + cmp BLKOFF, 16 jbe 0f - movdqu xmm0, [esi + 16] - movdqu [edi + 16], xmm0 + movdqu xmm0, [SI + 16] + movdqu [DI + 16], xmm0 // If the block size is not exactly four words then we must end-swap // everything. We can use fancy SSE toys for this. -0: cmp ebx, 16 +0: cmp BLKOFF, 16 je 0f // Find the byte-reordering table. ldgot ecx movdqa xmm5, [INTADDR(endswap_tab, ecx)] +#if NKW_NEEDS_REFRESH // Calculate the number of subkey words again. (It's a good job // we've got a fast multiplier.) - mov ecx, [ebp + nr] - add ecx, 1 - imul ecx, [esp + 24] // total keys in words + mov NKW, [CTX + nr] + add NKW, 1 + imul NKW, BLKSZ +#endif // End-swap the encryption keys. - mov eax, ecx - lea esi, [ebp + w] + mov ecx, NKW + lea SI, [CTX + w] call endswap_block // And the decryption keys. - mov ecx, eax - lea esi, [ebp + wi] + mov ecx, NKW + lea SI, [CTX + wi] call endswap_block - // All done. -0: pop edi +0: // All done. +#if CPUFAM_X86 + pop edi pop esi pop ebx pop ebp +#endif +#if CPUFAM_AMD64 && ABI_WIN + pop rdi + pop rsi +#endif ret .align 16 endswap_block: - // End-swap ECX words starting at ESI. The end-swapping table is + // End-swap ECX words starting at SI. The end-swapping table is // already loaded into XMM5; and it's OK to work in 16-byte chunks. - movdqu xmm1, [esi] + movdqu xmm1, [SI] pshufb xmm1, xmm5 - movdqu [esi], xmm1 - add esi, 16 + movdqu [SI], xmm1 + add SI, 16 sub ecx, 4 ja endswap_block ret +#undef CTX +#undef BLKSZ +#undef SI +#undef DI +#undef KSZ +#undef KSZo +#undef RCON +#undef LIMn +#undef LIM +#undef NR +#undef LRK +#undef LRKo +#undef BLKOFF +#undef BLKOFFo + ENDFUNC ///-------------------------------------------------------------------------- /// Encrypting and decrypting blocks. .macro encdec op, aes, koff -FUNC(rijndael_\op\()_x86_aesni) - - // On entry, we have: - // [esp + 4] points to the context block - // [esp + 8] points to the input data block - // [esp + 12] points to the output buffer +FUNC(rijndael_\op\()_x86ish_aesni) // Find the magic endianness-swapping table. ldgot ecx movdqa xmm5, [INTADDR(endswap_tab, ecx)] - // Load the input block and end-swap it. Also, start loading the - // keys. - mov eax, [esp + 8] - movdqu xmm0, [eax] +#if CPUFAM_X86 + // Arguments come in on the stack, and need to be collected. We + // don't have a shortage of registers. + +# define K ecx +# define SRC edx +# define DST edx +# define NR eax + + mov K, [esp + 4] + mov SRC, [esp + 8] +#endif + +#if CPUFAM_AMD64 && ABI_SYSV + // Arguments come in registers. All is good. + +# define K rdi +# define SRC rsi +# define DST rdx +# define NR eax +#endif + +#if CPUFAM_AMD64 && ABI_WIN + // Arguments come in different registers. + +# define K rcx +# define SRC rdx +# define DST r8 +# define NR eax +#endif + + // Initial setup. + movdqu xmm0, [SRC] pshufb xmm0, xmm5 - mov eax, [esp + 4] - lea edx, [eax + \koff] - mov eax, [eax + nr] + mov NR, [K + nr] + add K, \koff // Initial whitening. - movdqu xmm1, [edx] - add edx, 16 + movdqu xmm1, [K] + add K, 16 pxor xmm0, xmm1 // Dispatch to the correct code. - cmp eax, 10 + cmp NR, 10 je 10f jb bogus - cmp eax, 14 + cmp NR, 14 je 14f ja bogus - cmp eax, 12 + cmp NR, 12 je 12f jb 11f jmp 13f @@ -350,73 +495,80 @@ FUNC(rijndael_\op\()_x86_aesni) .align 2 // 14 rounds... -14: movdqu xmm1, [edx] - add edx, 16 +14: movdqu xmm1, [K] + add K, 16 \aes xmm0, xmm1 // 13 rounds... 
-13:	movdqu	xmm1, [edx]
-	add	edx, 16
+13:	movdqu	xmm1, [K]
+	add	K, 16
 	\aes	xmm0, xmm1
 
 	// 12 rounds...
-12:	movdqu	xmm1, [edx]
-	add	edx, 16
+12:	movdqu	xmm1, [K]
+	add	K, 16
 	\aes	xmm0, xmm1
 
 	// 11 rounds...
-11:	movdqu	xmm1, [edx]
-	add	edx, 16
+11:	movdqu	xmm1, [K]
+	add	K, 16
 	\aes	xmm0, xmm1
 
 	// 10 rounds...
-10:	movdqu	xmm1, [edx]
+10:	movdqu	xmm1, [K]
 	\aes	xmm0, xmm1
 
 	// 9 rounds...
-	movdqu	xmm1, [edx + 16]
+	movdqu	xmm1, [K + 16]
 	\aes	xmm0, xmm1
 
 	// 8 rounds...
-	movdqu	xmm1, [edx + 32]
+	movdqu	xmm1, [K + 32]
 	\aes	xmm0, xmm1
 
 	// 7 rounds...
-	movdqu	xmm1, [edx + 48]
+	movdqu	xmm1, [K + 48]
 	\aes	xmm0, xmm1
 
 	// 6 rounds...
-	movdqu	xmm1, [edx + 64]
+	movdqu	xmm1, [K + 64]
 	\aes	xmm0, xmm1
 
 	// 5 rounds...
-	movdqu	xmm1, [edx + 80]
+	movdqu	xmm1, [K + 80]
 	\aes	xmm0, xmm1
 
 	// 4 rounds...
-	movdqu	xmm1, [edx + 96]
+	movdqu	xmm1, [K + 96]
 	\aes	xmm0, xmm1
 
 	// 3 rounds...
-	movdqu	xmm1, [edx + 112]
+	movdqu	xmm1, [K + 112]
 	\aes	xmm0, xmm1
 
 	// 2 rounds...
-	movdqu	xmm1, [edx + 128]
+	movdqu	xmm1, [K + 128]
 	\aes	xmm0, xmm1
 
 	// Final round...
-	movdqu	xmm1, [edx + 144]
+	movdqu	xmm1, [K + 144]
 	\aes\()last xmm0, xmm1
 
 	// Unpermute the ciphertext block and store it.
 	pshufb	xmm0, xmm5
-	mov	eax, [esp + 12]
-	movdqu	[eax], xmm0
+#if CPUFAM_X86
+	mov	DST, [esp + 12]
+#endif
+	movdqu	[DST], xmm0
 
 	// And we're done.
 	ret
 
+#undef K
+#undef SRC
+#undef DST
+#undef NR
+
 ENDFUNC
 
 .endm
diff --git a/symm/rijndael.c b/symm/rijndael.c
index dcb35e61..293f28da 100644
--- a/symm/rijndael.c
+++ b/symm/rijndael.c
@@ -82,15 +82,15 @@ CPU_DISPATCH(EMPTY, EMPTY, void, rijndael_dblk,
 	     (const rijndael_ctx *k, uint32 d[4]),
 	     (k, s, d),
 	     pick_dblk, simple_dblk)
 
-#ifdef CPUFAM_X86
-extern rijndael_eblk__functype rijndael_eblk_x86_aesni;
-extern rijndael_dblk__functype rijndael_dblk_x86_aesni;
+#if CPUFAM_X86 || CPUFAM_AMD64
+extern rijndael_eblk__functype rijndael_eblk_x86ish_aesni;
+extern rijndael_dblk__functype rijndael_dblk_x86ish_aesni;
 #endif
 
 static rijndael_eblk__functype *pick_eblk(void)
 {
-#ifdef CPUFAM_X86
-  DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_x86_aesni,
+#if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_x86ish_aesni,
 		     cpu_feature_p(CPUFEAT_X86_AESNI));
 #endif
   DISPATCH_PICK_FALLBACK(rijndael_eblk, simple_eblk);
@@ -98,8 +98,8 @@ static rijndael_dblk__functype *pick_dblk(void)
 {
-#ifdef CPUFAM_X86
-  DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_x86_aesni,
+#if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_x86ish_aesni,
 		     cpu_feature_p(CPUFEAT_X86_AESNI));
 #endif
   DISPATCH_PICK_FALLBACK(rijndael_dblk, simple_dblk);
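[Aside -- not part of the patch.  The round-count dispatch in the `encdec'
macro above -- compare, then jump into the middle of an unrolled sequence
-- is the assembler rendition of a C switch with deliberate fallthrough.
For intuition only; `block', `aes_round' and `aes_last' are hypothetical
stand-ins for the XMM state and the AESENC/AESENCLAST (or
AESDEC/AESDECLAST) instructions.]

    #include <stdlib.h>

    typedef struct { unsigned char b[16]; } block;  /* hypothetical */
    extern block aes_round(block b, block k);
    extern block aes_last(block b, block k);

    static block do_rounds(block b, const block *k, unsigned nr)
    {
      unsigned i;

      switch (nr) {
        case 14: b = aes_round(b, *k++);        /* fall through */
        case 13: b = aes_round(b, *k++);        /* fall through */
        case 12: b = aes_round(b, *k++);        /* fall through */
        case 11: b = aes_round(b, *k++);        /* fall through */
        case 10:
          for (i = 0; i < 9; i++) b = aes_round(b, k[i]);
          return (aes_last(b, k[9]));
        default: abort();                       /* bogus round count */
      }
    }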
diff --git a/symm/salsa20-x86-sse2.S b/symm/salsa20-x86ish-sse2.S
similarity index 63%
rename from symm/salsa20-x86-sse2.S
rename to symm/salsa20-x86ish-sse2.S
index 7a5bd2a3..a168d79a 100644
--- a/symm/salsa20-x86-sse2.S
+++ b/symm/salsa20-x86ish-sse2.S
@@ -44,20 +44,76 @@
 	.arch	pentium4
 	.section .text
 
-FUNC(salsa20_core_x86_sse2)
+FUNC(salsa20_core_x86ish_sse2)
+
+	// Initial setup.
+
+#if CPUFAM_X86
+	// Arguments come in on the stack, and will need to be collected.
+	// We can get away with just the scratch registers for integer
+	// work, but we'll run out of XMM registers and will need some
+	// properly aligned space which we'll steal from the stack.  I
+	// don't trust the stack pointer's alignment, so I'll have to mask
+	// the stack pointer, which in turn means I'll need to keep track
+	// of the old value.  Hence I'm making a full i386-style stack
+	// frame here.
+	//
+	// The Windows and SysV ABIs are sufficiently similar that we don't
+	// need to worry about the differences here.
+
+# define NR ecx
+# define IN eax
+# define OUT edx
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 [esp + 0]
+# define SAVE3 [esp + 16]
 
-	// Initial state.  We have three arguments:
-	// [ebp +  8] is the number of rounds to do
-	// [ebp + 12] points to the input matrix
-	// [ebp + 16] points to the output matrix
 	push	ebp
 	mov	ebp, esp
 	sub	esp, 32
-	mov	edx, [ebp + 12]
+	mov	IN, [ebp + 12]
+	mov	OUT, [ebp + 16]
 	and	esp, ~15
-
-	// Prepare for the main loop.
-	mov	ecx, [ebp + 8]
+	mov	NR, [ebp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+	// This is nice.  We have plenty of XMM registers, and the arguments
+	// are in useful places.  There's no need to spill anything and we
+	// can just get on with the code.
+
+# define NR edi
+# define IN rsi
+# define OUT rdx
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 xmm8
+# define SAVE3 xmm9
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+	// Arguments come in registers, but they're different between
+	// Windows and everyone else (and everyone else is saner).
+	//
+	// The Windows ABI insists that we preserve some of the XMM
+	// registers, but we want more than we can use as scratch space.
+	// For two of them, we only need to save a copy of the input for
+	// the feedforward at the end; but the other two we want for the
+	// final permutation, so save the old values on the stack.  (We
+	// need an extra 8 bytes to align the stack.)
+
+# define NR ecx
+# define IN rdx
+# define OUT r8
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 [rsp + 32]
+# define SAVE3 [rsp + 48]
+
+	sub	rsp, 64 + 8
+	movdqa	[rsp +  0], xmm6
+	movdqa	[rsp + 16], xmm7
+#endif
 
 	// First job is to slurp the matrix into XMM registers.  The words
 	// have already been permuted conveniently to make them line up
@@ -85,19 +141,18 @@ FUNC(salsa20_core_x86_sse2)
 	// [ 0  1  2  3]     [ 0  5 10 15] (a, xmm0)
 	// [ 4  5  6  7] --> [ 4  9 14  3] (b, xmm1)
 	// [ 8  9 10 11]     [ 8 13  2  7] (c, xmm2)
 	// [12 13 14 15]     [12  1  6 11] (d, xmm3)
-	movdqu	xmm0, [edx +  0]
-	movdqu	xmm1, [edx + 16]
-	movdqu	xmm2, [edx + 32]
-	movdqu	xmm3, [edx + 48]
+	movdqu	xmm0, [IN +  0]
+	movdqu	xmm1, [IN + 16]
+	movdqu	xmm2, [IN + 32]
+	movdqu	xmm3, [IN + 48]
 
-	// Take a copy for later.
-	movdqa	[esp +  0], xmm0
-	movdqa	[esp + 16], xmm1
-	movdqa	xmm6, xmm2
-	movdqa	xmm7, xmm3
+	// Take a copy for later.
+	movdqa	SAVE0, xmm0
+	movdqa	SAVE1, xmm1
+	movdqa	SAVE2, xmm2
+	movdqa	SAVE3, xmm3
 
 loop:
-
 	// Apply a column quarterround to each of the columns simultaneously.
 	// Alas, there doesn't seem to be a packed doubleword rotate, so we
 	// have to synthesize it.
@@ -147,9 +202,9 @@ loop:
 	// involve any movement of elements between rows.
 	//
 	// [ 0  5 10 15]     [ 0  5 10 15] (a, xmm0)
-	// [ 4  9 14  3] --> [ 1  6 11 12] (b, xmm3)
-	// [ 8 13  2  7]     [ 2  7  8 13] (c, xmm2)
-	// [12  1  6 11]     [ 3  4  9 14] (d, xmm1)
+	// [ 4  9 14  3] --> [ 1  6 11 12] (b, xmm3)
+	// [ 8 13  2  7]     [ 2  7  8 13] (c, xmm2)
+	// [12  1  6 11]     [ 3  4  9 14] (d, xmm1)
 	//
 	// The shuffles have quite high latency, so they've been pushed
 	// backwards into the main instruction list.
@@ -200,7 +255,7 @@ loop:
 	// back the shuffles because they take a long time coming through.
 	// Decrement the loop counter and see if we should go round again.
 	// Later processors fuse this pair into a single uop.
-	sub	ecx, 2
+	sub	NR, 2
 	ja	loop
 
 	// Almost there.  Firstly, the feedforward addition, and then we have
@@ -208,55 +263,69 @@ loop:
 	// which was already applied to the input.  Shuffling has quite high
 	// latency, so arrange to start a new shuffle into a temporary as
 	// soon as we've written out the old value.
-	mov	edx, [ebp + 16]
-
-	paddd	xmm0, [esp +  0]
-	pshufd	xmm4, xmm0, ROTR
-	movd	[edx +  0], xmm0
+	paddd	xmm0, SAVE0
+	pshufd	xmm4, xmm0, ROTR
+	movd	[OUT +  0], xmm0
 
-	paddd	xmm1, [esp + 16]
+	paddd	xmm1, SAVE1
 	pshufd	xmm5, xmm1, ROTL
-	movd	[edx + 16], xmm1
+	movd	[OUT + 16], xmm1
 
-	paddd	xmm2, xmm6
+	paddd	xmm2, SAVE2
 	pshufd	xmm6, xmm2, ROT2
-	movd	[edx + 32], xmm2
+	movd	[OUT + 32], xmm2
 
-	paddd	xmm3, xmm7
+	paddd	xmm3, SAVE3
 	pshufd	xmm7, xmm3, ROTR
-	movd	[edx + 48], xmm3
+	movd	[OUT + 48], xmm3
 
-	movd	[edx +  4], xmm7
+	movd	[OUT +  4], xmm7
 	pshufd	xmm7, xmm3, ROT2
-	movd	[edx + 24], xmm7
+	movd	[OUT + 24], xmm7
 	pshufd	xmm3, xmm3, ROTL
-	movd	[edx + 44], xmm3
+	movd	[OUT + 44], xmm3
 
-	movd	[edx +  8], xmm6
+	movd	[OUT +  8], xmm6
 	pshufd	xmm6, xmm2, ROTL
-	movd	[edx + 28], xmm6
+	movd	[OUT + 28], xmm6
 	pshufd	xmm2, xmm2, ROTR
-	movd	[edx + 52], xmm2
+	movd	[OUT + 52], xmm2
 
-	movd	[edx + 12], xmm5
+	movd	[OUT + 12], xmm5
 	pshufd	xmm5, xmm1, ROTR
-	movd	[edx + 36], xmm5
+	movd	[OUT + 36], xmm5
 	pshufd	xmm1, xmm1, ROT2
-	movd	[edx + 56], xmm1
+	movd	[OUT + 56], xmm1
 
-	movd	[edx + 20], xmm4
+	movd	[OUT + 20], xmm4
 	pshufd	xmm4, xmm0, ROT2
-	movd	[edx + 40], xmm4
+	movd	[OUT + 40], xmm4
 	pshufd	xmm0, xmm0, ROTL
-	movd	[edx + 60], xmm0
+	movd	[OUT + 60], xmm0
 
 	// Tidy things up.
+
+#if CPUFAM_X86
 	mov	esp, ebp
 	pop	ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+	movdqa	xmm6, [rsp +  0]
+	movdqa	xmm7, [rsp + 16]
+	add	rsp, 64 + 8
+#endif
 
 	// And with that, we're done.
 	ret
 
+#undef NR
+#undef IN
+#undef OUT
+#undef SAVE0
+#undef SAVE1
+#undef SAVE2
+#undef SAVE3
+
 ENDFUNC
 
 ///----- That's all, folks --------------------------------------------------
diff --git a/symm/salsa20.c b/symm/salsa20.c
index 15e4d50e..eb4e67ad 100644
--- a/symm/salsa20.c
+++ b/symm/salsa20.c
@@ -52,14 +52,14 @@ static void simple_core(unsigned r, const salsa20_matrix src,
 			salsa20_matrix dest)
   { SALSA20_nR(dest, src, r); SALSA20_FFWD(dest, src); }
 
-#ifdef CPUFAM_X86
-extern core__functype salsa20_core_x86_sse2;
+#if CPUFAM_X86 || CPUFAM_AMD64
+extern core__functype salsa20_core_x86ish_sse2;
 #endif
 
 static core__functype *pick_core(void)
 {
-#ifdef CPUFAM_X86
-  DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86_sse2,
+#if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86ish_sse2,
 		     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
   DISPATCH_PICK_FALLBACK(salsa20_core, simple_core);
-- 
2.11.0
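[Aside -- not part of the patch.  For completeness, this is roughly what
the `DISPATCH_PICK_COND'/`DISPATCH_PICK_FALLBACK' pair used throughout
these files boils down to, with the debugging chatter the real macros also
emit left out.  A sketch for orientation, not the actual expansion.]

    static core__functype *pick_core(void)
    {
    #if CPUFAM_X86 || CPUFAM_AMD64
      if (cpu_feature_p(CPUFEAT_X86_SSE2))
        return (salsa20_core_x86ish_sse2);
    #endif
      return (simple_core);
    }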