X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/160214515f6913d84e0e41253cf61281718bcd99..HEAD:/symm/rijndael-x86ish-aesni.S diff --git a/symm/rijndael-x86ish-aesni.S b/symm/rijndael-x86ish-aesni.S index 5194c17d..ad9236a8 100644 --- a/symm/rijndael-x86ish-aesni.S +++ b/symm/rijndael-x86ish-aesni.S @@ -25,28 +25,21 @@ /// MA 02111-1307, USA. ///-------------------------------------------------------------------------- -/// External definitions. +/// Preliminaries. #include "config.h" #include "asm-common.h" - .globl F(abort) - .globl F(rijndael_rcon) + .arch .aes -///-------------------------------------------------------------------------- -/// Local utilities. + .extern F(abort) + .extern F(rijndael_rcon) -// Magic constants for shuffling. -#define ROTL 0x93 -#define ROT2 0x4e -#define ROTR 0x39 + .text ///-------------------------------------------------------------------------- /// Main code. - .arch .aes - .text - /// The AESNI instructions implement a little-endian version of AES, but /// Catacomb's internal interface presents as big-endian so as to work better /// with things like GCM. We therefore maintain the round keys in @@ -69,43 +62,42 @@ ///-------------------------------------------------------------------------- /// Key setup. +FUNC(rijndael_setup_x86ish_aesni_avx) + vzeroupper // avoid penalty on `legacy' XMM access + endprologue + // and drop through... +ENDFUNC + FUNC(rijndael_setup_x86ish_aesni) #if CPUFAM_X86 // Arguments are on the stack. We'll need to stack the caller's // register veriables, but we'll manage. -# define CTX ebp // context pointer -# define BLKSZ [esp + 24] // block size - -# define SI esi // source pointer -# define DI edi // destination pointer +# define CTX BP // context pointer +# define BLKSZ [SP + 24] // block size # define KSZ ebx // key size -# define KSZo ebx // ... as address offset # define NKW edx // total number of key words # define NKW_NEEDS_REFRESH 1 // ... needs recalculating # define RCON ecx // round constants table # define LIM edx // limit pointer -# define LIMn edx // ... as integer offset from base # define CYIX edi // index in shift-register cycle # define NR ecx // number of rounds # define LRK eax // distance to last key -# define LRKo eax // ... as address offset # define BLKOFF edx // block size in bytes -# define BLKOFFo edx // ... as address offset // Stack the caller's registers. - push ebp - push ebx - push esi - push edi + pushreg BP + pushreg ebx + pushreg esi + pushreg edi // Set up our own variables. - mov CTX, [esp + 20] // context base pointer - mov SI, [esp + 28] // key material - mov KSZ, [esp + 32] // key size, in words + mov CTX, [SP + 20] // context base pointer + mov SI, [SP + 28] // key material + mov KSZ, [SP + 32] // key size, in words #endif #if CPUFAM_AMD64 && ABI_SYSV @@ -115,22 +107,15 @@ FUNC(rijndael_setup_x86ish_aesni) # define CTX r8 // context pointer # define BLKSZ r9d // block size -# define SI rsi // source pointer -# define DI rdi // destination pointer - # define KSZ edx // key size -# define KSZo rdx // ... as address offset # define NKW r10d // total number of key words # define RCON rdi // round constants table -# define LIMn ecx // limit pointer -# define LIM rcx // ... as integer offset from base +# define LIM rcx // limit pointer # define CYIX r11d // index in shift-register cycle # define NR ecx // number of rounds # define LRK eax // distance to last key -# define LRKo rax // ... as address offset # define BLKOFF r9d // block size in bytes -# define BLKOFFo r9 // ... as address offset // Move arguments to more useful places. mov CTX, rdi // context base pointer @@ -145,39 +130,34 @@ FUNC(rijndael_setup_x86ish_aesni) # define CTX r8 // context pointer # define BLKSZ edx // block size -# define SI rsi // source pointer -# define DI rdi // destination pointer - # define KSZ r9d // key size -# define KSZo r9 // ... as address offset # define NKW r10d // total number of key words # define RCON rdi // round constants table -# define LIMn ecx // limit pointer -# define LIM rcx // ... as integer offset from base +# define LIM rcx // limit pointer # define CYIX r11d // index in shift-register cycle # define NR ecx // number of rounds # define LRK eax // distance to last key -# define LRKo rax // ... as address offset # define BLKOFF edx // block size in bytes -# define BLKOFFo rdx // ... as address offset // We'll need the index registers, which belong to the caller in this // ABI. - push rsi - push rdi + pushreg rsi + pushreg rdi // Move arguments to more useful places. - mov SI, r8 // key material + mov rsi, r8 // key material mov CTX, rcx // context base pointer #endif + endprologue + // The initial round key material is taken directly from the input // key, so copy it over. #if CPUFAM_AMD64 && ABI_SYSV // We've been lucky. We already have a copy of the context pointer // in rdi, and the key size in ecx. - add DI, w + add rdi, w #else lea DI, [CTX + w] mov ecx, KSZ @@ -191,17 +171,17 @@ FUNC(rijndael_setup_x86ish_aesni) #if !NKW_NEEDS_REFRESH // If we can't keep NKW for later, then we use the same register for // it and LIM, so this move is unnecessary. - mov LIMn, NKW + mov DWORD(LIM), NKW #endif - sub LIMn, KSZ // offset by the key size + sub DWORD(LIM), KSZ // offset by the key size // Find the round constants. - ldgot ecx - leaext RCON, F(rijndael_rcon), ecx + ldgot WHOLE(c) + leaext RCON, F(rijndael_rcon), WHOLE(c) // Prepare for the main loop. lea SI, [CTX + w] - mov eax, [SI + 4*KSZo - 4] // most recent key word + mov eax, [SI + 4*WHOLE(KSZ) - 4] // most recent key word lea LIM, [SI + 4*LIM] // limit, offset by one key expansion xor CYIX, CYIX // start of new cycle @@ -229,16 +209,16 @@ FUNC(rijndael_setup_x86ish_aesni) // Fourth word of the cycle, and seven or eight words of key. Do a // byte substitution. movd xmm0, eax - pshufd xmm0, xmm0, ROTL + pshufd xmm0, xmm0, SHUF(2, 1, 0, 3) aeskeygenassist xmm1, xmm0, 0 movd eax, xmm1 jmp 2f // First word of the cycle. This is the complicated piece. 1: movd xmm0, eax - pshufd xmm0, xmm0, ROTR + pshufd xmm0, xmm0, SHUF(0, 3, 2, 1) aeskeygenassist xmm1, xmm0, 0 - pshufd xmm1, xmm1, ROTL + pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) movd eax, xmm1 xor al, [RCON] inc RCON @@ -246,7 +226,7 @@ FUNC(rijndael_setup_x86ish_aesni) // Common tail. Mix in the corresponding word from the previous // cycle and prepare for the next loop. 2: xor eax, [SI] - mov [SI + 4*KSZo], eax + mov [SI + 4*WHOLE(KSZ)], eax add SI, 4 inc CYIX cmp SI, LIM @@ -281,7 +261,7 @@ FUNC(rijndael_setup_x86ish_aesni) sub LRK, BLKSZ #endif lea DI, [CTX + wi] - lea SI, [CTX + w + 4*LRKo] // last round's keys + lea SI, [CTX + w + 4*WHOLE(LRK)] // last round's keys shl BLKOFF, 2 // block size (in bytes now) // Copy the last encryption round's keys. @@ -293,8 +273,8 @@ FUNC(rijndael_setup_x86ish_aesni) movdqu [DI + 16], xmm0 // Update the loop variables and stop if we've finished. -0: add DI, BLKOFFo - sub SI, BLKOFFo +0: add DI, WHOLE(BLKOFF) + sub SI, WHOLE(BLKOFF) sub NR, 1 jbe 9f @@ -344,21 +324,24 @@ FUNC(rijndael_setup_x86ish_aesni) 9: // All done. #if CPUFAM_X86 - pop edi - pop esi - pop ebx - pop ebp + popreg edi + popreg esi + popreg ebx + popreg BP #endif #if CPUFAM_AMD64 && ABI_WIN - pop rdi - pop rsi + popreg rdi + popreg rsi #endif ret - .align 16 -endswap_block: +ENDFUNC + +INTFUNC(endswap_block) // End-swap NKW words starting at SI. The end-swapping table is // already loaded into XMM5; and it's OK to work in 16-byte chunks. + endprologue + mov ecx, NKW 0: movdqu xmm1, [SI] pshufb xmm1, xmm5 @@ -366,46 +349,45 @@ endswap_block: add SI, 16 sub ecx, 4 ja 0b + ret +ENDFUNC + #undef CTX #undef BLKSZ #undef SI #undef DI #undef KSZ -#undef KSZo #undef RCON -#undef LIMn #undef LIM #undef NR #undef LRK -#undef LRKo #undef BLKOFF -#undef BLKOFFo - -ENDFUNC ///-------------------------------------------------------------------------- /// Encrypting and decrypting blocks. .macro encdec op, aes, koff - FUNC(rijndael_\op\()_x86ish_aesni) + FUNC(rijndael_\op\()_x86ish_aesni_avx) + vzeroupper // avoid XMM penalties + endprologue + // and drop through... + ENDFUNC - // Find the magic endianness-swapping table. - ldgot ecx - movdqa xmm5, [INTADDR(endswap_tab, ecx)] + FUNC(rijndael_\op\()_x86ish_aesni) #if CPUFAM_X86 // Arguments come in on the stack, and need to be collected. We // don't have a shortage of registers. -# define K ecx +# define K eax # define SRC edx # define DST edx -# define NR eax +# define NR ecx - mov K, [esp + 4] - mov SRC, [esp + 8] + mov K, [SP + 4] + mov SRC, [SP + 8] #endif #if CPUFAM_AMD64 && ABI_SYSV @@ -426,6 +408,12 @@ ENDFUNC # define NR eax #endif + endprologue + + // Find the magic endianness-swapping table. + ldgot ecx + movdqa xmm5, [INTADDR(endswap_tab, ecx)] + // Initial setup. movdqu xmm0, [SRC] pshufb xmm0, xmm5 @@ -436,6 +424,9 @@ ENDFUNC movdqu xmm1, [K] add K, 16 pxor xmm0, xmm1 +#if CPUFAM_X86 + mov DST, [SP + 12] +#endif // Dispatch to the correct code. cmp NR, 10 @@ -513,9 +504,6 @@ ENDFUNC // Unpermute the ciphertext block and store it. pshufb xmm0, xmm5 -#if CPUFAM_X86 - mov DST, [esp + 12] -#endif movdqu [DST], xmm0 // And we're done. @@ -535,22 +523,26 @@ ENDFUNC ///-------------------------------------------------------------------------- /// Random utilities. - .align 16 +INTFUNC(bogus) // Abort the process because of a programming error. Indirecting // through this point serves several purposes: (a) by CALLing, rather // than branching to, `abort', we can save the return address, which // might at least provide a hint as to what went wrong; (b) we don't // have conditional CALLs (and they'd be big anyway); and (c) we can // write a HLT here as a backstop against `abort' being mad. -bogus: callext F(abort) + endprologue + + callext F(abort) 0: hlt jmp 0b - gotaux ecx +ENDFUNC ///-------------------------------------------------------------------------- /// Data tables. + RODATA + .align 16 endswap_tab: .byte 3, 2, 1, 0