X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/1a517bb3785891ff6940c73af7c5a136d0250ebf..8f2287ef5c05d496fcb9b012629af007fe56f897:/symm/rijndael-x86ish-aesni.S diff --git a/symm/rijndael-x86ish-aesni.S b/symm/rijndael-x86ish-aesni.S index b0b880a4..6d9b3b22 100644 --- a/symm/rijndael-x86ish-aesni.S +++ b/symm/rijndael-x86ish-aesni.S @@ -25,20 +25,21 @@ /// MA 02111-1307, USA. ///-------------------------------------------------------------------------- -/// External definitions. +/// Preliminaries. #include "config.h" #include "asm-common.h" - .globl F(abort) - .globl F(rijndael_rcon) + .arch .aes -///-------------------------------------------------------------------------- -/// Main code. + .extern F(abort) + .extern F(rijndael_rcon) - .arch .aes .text +///-------------------------------------------------------------------------- +/// Main code. + /// The AESNI instructions implement a little-endian version of AES, but /// Catacomb's internal interface presents as big-endian so as to work better /// with things like GCM. We therefore maintain the round keys in @@ -61,6 +62,12 @@ ///-------------------------------------------------------------------------- /// Key setup. +FUNC(rijndael_setup_x86ish_aesni_avx) + vzeroupper // avoid penalty on `legacy' XMM access + endprologue + // and drop through... +ENDFUNC + FUNC(rijndael_setup_x86ish_aesni) #define SI WHOLE(si) @@ -85,10 +92,10 @@ FUNC(rijndael_setup_x86ish_aesni) # define BLKOFF edx // block size in bytes // Stack the caller's registers. - push ebp - push ebx - push esi - push edi + pushreg ebp + pushreg ebx + pushreg esi + pushreg edi // Set up our own variables. mov CTX, [esp + 20] // context base pointer @@ -138,17 +145,16 @@ FUNC(rijndael_setup_x86ish_aesni) // We'll need the index registers, which belong to the caller in this // ABI. - push rsi - .seh_pushreg rsi - push rdi - .seh_pushreg rdi - .seh_endprologue + pushreg rsi + pushreg rdi // Move arguments to more useful places. mov rsi, r8 // key material mov CTX, rcx // context base pointer #endif + endprologue + // The initial round key material is taken directly from the input // key, so copy it over. #if CPUFAM_AMD64 && ABI_SYSV @@ -206,16 +212,16 @@ FUNC(rijndael_setup_x86ish_aesni) // Fourth word of the cycle, and seven or eight words of key. Do a // byte substitution. movd xmm0, eax - pshufd xmm0, xmm0, SHUF(2, 1, 0, 3) + pshufd xmm0, xmm0, SHUF(3, 0, 1, 2) aeskeygenassist xmm1, xmm0, 0 movd eax, xmm1 jmp 2f // First word of the cycle. This is the complicated piece. 1: movd xmm0, eax - pshufd xmm0, xmm0, SHUF(0, 3, 2, 1) + pshufd xmm0, xmm0, SHUF(1, 2, 3, 0) aeskeygenassist xmm1, xmm0, 0 - pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) + pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) movd eax, xmm1 xor al, [RCON] inc RCON @@ -321,14 +327,14 @@ FUNC(rijndael_setup_x86ish_aesni) 9: // All done. #if CPUFAM_X86 - pop edi - pop esi - pop ebx - pop ebp + popreg edi + popreg esi + popreg ebx + popreg ebp #endif #if CPUFAM_AMD64 && ABI_WIN - pop rdi - pop rsi + popreg rdi + popreg rsi #endif ret @@ -337,9 +343,7 @@ ENDFUNC INTFUNC(endswap_block) // End-swap NKW words starting at SI. The end-swapping table is // already loaded into XMM5; and it's OK to work in 16-byte chunks. -#if CPUFAM_AMD64 && ABI_WIN - .seh_endprologue -#endif + endprologue mov ecx, NKW 0: movdqu xmm1, [SI] @@ -368,6 +372,12 @@ ENDFUNC /// Encrypting and decrypting blocks. .macro encdec op, aes, koff + FUNC(rijndael_\op\()_x86ish_aesni_avx) + vzeroupper // avoid XMM penalties + endprologue + // and drop through... + ENDFUNC + FUNC(rijndael_\op\()_x86ish_aesni) #if CPUFAM_X86 @@ -399,9 +409,10 @@ ENDFUNC # define SRC rdx # define DST r8 # define NR eax - .seh_endprologue #endif + endprologue + // Find the magic endianness-swapping table. ldgot ecx movdqa xmm5, [INTADDR(endswap_tab, ecx)] @@ -522,9 +533,7 @@ INTFUNC(bogus) // might at least provide a hint as to what went wrong; (b) we don't // have conditional CALLs (and they'd be big anyway); and (c) we can // write a HLT here as a backstop against `abort' being mad. -#if CPUFAM_AMD64 && ABI_WIN - .seh_endprologue -#endif + endprologue callext F(abort) 0: hlt