X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/0f23f75ff53acadf80e9d3dfd2dfd14cb526074f..645fcce0830342b644cc16e71e28790c838d9415:/symm/rijndael-x86ish-aesni.S diff --git a/symm/rijndael-x86ish-aesni.S b/symm/rijndael-x86ish-aesni.S index 91fcc352..3cca50f7 100644 --- a/symm/rijndael-x86ish-aesni.S +++ b/symm/rijndael-x86ish-aesni.S @@ -30,25 +30,14 @@ #include "config.h" #include "asm-common.h" -///-------------------------------------------------------------------------- -/// External definitions. - .globl F(abort) .globl F(rijndael_rcon) ///-------------------------------------------------------------------------- -/// Local utilities. - -// Magic constants for shuffling. -#define ROTL 0x93 -#define ROT2 0x4e -#define ROTR 0x39 - -///-------------------------------------------------------------------------- /// Main code. .arch .aes - .section .text + .text /// The AESNI instructions implement a little-endian version of AES, but /// Catacomb's internal interface presents as big-endian so as to work better @@ -91,6 +80,7 @@ FUNC(rijndael_setup_x86ish_aesni) # define RCON ecx // round constants table # define LIM edx // limit pointer # define LIMn edx // ... as integer offset from base +# define CYIX edi // index in shift-register cycle # define NR ecx // number of rounds # define LRK eax // distance to last key @@ -126,6 +116,7 @@ FUNC(rijndael_setup_x86ish_aesni) # define RCON rdi // round constants table # define LIMn ecx // limit pointer # define LIM rcx // ... as integer offset from base +# define CYIX r11d // index in shift-register cycle # define NR ecx // number of rounds # define LRK eax // distance to last key @@ -155,6 +146,7 @@ FUNC(rijndael_setup_x86ish_aesni) # define RCON rdi // round constants table # define LIMn ecx // limit pointer # define LIM rcx // ... as integer offset from base +# define CYIX r11d // index in shift-register cycle # define NR ecx // number of rounds # define LRK eax // distance to last key @@ -165,7 +157,10 @@ FUNC(rijndael_setup_x86ish_aesni) // We'll need the index registers, which belong to the caller in this // ABI. push rsi + .seh_pushreg rsi push rdi + .seh_pushreg rdi + .seh_endprologue // Move arguments to more useful places. mov SI, r8 // key material @@ -197,12 +192,13 @@ FUNC(rijndael_setup_x86ish_aesni) // Find the round constants. ldgot ecx - leaext RCON, rijndael_rcon, ecx + leaext RCON, F(rijndael_rcon), ecx // Prepare for the main loop. lea SI, [CTX + w] mov eax, [SI + 4*KSZo - 4] // most recent key word lea LIM, [SI + 4*LIM] // limit, offset by one key expansion + xor CYIX, CYIX // start of new cycle // Main key expansion loop. The first word of each key-length chunk // needs special treatment. @@ -218,85 +214,42 @@ FUNC(rijndael_setup_x86ish_aesni) // as an immediate, so it's kind of annoying if you're not // open-coding the whole thing. It's much easier to leave that as // zero and XOR in the round constant by hand. -9: movd xmm0, eax - pshufd xmm0, xmm0, ROTR +0: cmp CYIX, 0 // first word of the cycle? + je 1f + cmp CYIX, 4 // fourth word of the cycle? + jne 2f + cmp KSZ, 7 // and a large key? + jb 2f + + // Fourth word of the cycle, and seven or eight words of key. Do a + // byte substitution. + movd xmm0, eax + pshufd xmm0, xmm0, SHUF(2, 1, 0, 3) aeskeygenassist xmm1, xmm0, 0 - pshufd xmm1, xmm1, ROTL movd eax, xmm1 - xor eax, [SI] - xor al, [RCON] - inc RCON - mov [SI + 4*KSZo], eax - add SI, 4 - cmp SI, LIM - jae 8f - - // The next three words are simple... - xor eax, [SI] - mov [SI + 4*KSZo], eax - add SI, 4 - cmp SI, LIM - jae 8f + jmp 2f - // (Word 2...) - xor eax, [SI] - mov [SI + 4*KSZo], eax - add SI, 4 - cmp SI, LIM - jae 8f - - // (Word 3...) - xor eax, [SI] - mov [SI + 4*KSZo], eax - add SI, 4 - cmp SI, LIM - jae 8f - - // Word 4. If the key is /more/ than 6 words long, then we must - // apply a substitution here. - cmp KSZ, 5 - jb 9b - cmp KSZ, 7 - jb 0f - movd xmm0, eax - pshufd xmm0, xmm0, ROTL + // First word of the cycle. This is the complicated piece. +1: movd xmm0, eax + pshufd xmm0, xmm0, SHUF(0, 3, 2, 1) aeskeygenassist xmm1, xmm0, 0 + pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) movd eax, xmm1 -0: xor eax, [SI] - mov [SI + 4*KSZo], eax - add SI, 4 - cmp SI, LIM - jae 8f - - // (Word 5...) - cmp KSZ, 6 - jb 9b - xor eax, [SI] - mov [SI + 4*KSZo], eax - add SI, 4 - cmp SI, LIM - jae 8f - - // (Word 6...) - cmp KSZ, 7 - jb 9b - xor eax, [SI] - mov [SI + 4*KSZo], eax - add SI, 4 - cmp SI, LIM - jae 8f + xor al, [RCON] + inc RCON - // (Word 7...) - cmp KSZ, 8 - jb 9b - xor eax, [SI] + // Common tail. Mix in the corresponding word from the previous + // cycle and prepare for the next loop. +2: xor eax, [SI] mov [SI + 4*KSZo], eax add SI, 4 + inc CYIX cmp SI, LIM - jae 8f - - // Must be done by now. - jmp 9b + jae 9f + cmp CYIX, KSZ + jb 0b + xor CYIX, CYIX + jmp 0b // Next job is to construct the decryption keys. The keys for the // first and last rounds don't need to be mangled, but the remaining @@ -311,7 +264,7 @@ FUNC(rijndael_setup_x86ish_aesni) // there's easily enough buffer space for the over-enthusiastic reads // and writes because the context has space for 32-byte blocks, which // is our maximum and an exact fit for two SSE registers. -8: mov NR, [CTX + nr] // number of rounds +9: mov NR, [CTX + nr] // number of rounds #if NKW_NEEDS_REFRESH mov BLKOFF, BLKSZ mov LRK, NR @@ -330,39 +283,39 @@ FUNC(rijndael_setup_x86ish_aesni) movdqu xmm0, [SI] movdqu [DI], xmm0 cmp BLKOFF, 16 - jbe 9f + jbe 0f movdqu xmm0, [SI + 16] movdqu [DI + 16], xmm0 // Update the loop variables and stop if we've finished. -9: add DI, BLKOFFo +0: add DI, BLKOFFo sub SI, BLKOFFo sub NR, 1 - jbe 0f + jbe 9f // Do another middle round's keys... movdqu xmm0, [SI] aesimc xmm0, xmm0 movdqu [DI], xmm0 cmp BLKOFF, 16 - jbe 9b + jbe 0b movdqu xmm0, [SI + 16] aesimc xmm0, xmm0 movdqu [DI + 16], xmm0 - jmp 9b + jmp 0b // Finally do the first encryption round. -0: movdqu xmm0, [SI] +9: movdqu xmm0, [SI] movdqu [DI], xmm0 cmp BLKOFF, 16 - jbe 0f + jbe 1f movdqu xmm0, [SI + 16] movdqu [DI + 16], xmm0 // If the block size is not exactly four words then we must end-swap // everything. We can use fancy SSE toys for this. -0: cmp BLKOFF, 16 - je 0f +1: cmp BLKOFF, 16 + je 9f // Find the byte-reordering table. ldgot ecx @@ -377,16 +330,14 @@ FUNC(rijndael_setup_x86ish_aesni) #endif // End-swap the encryption keys. - mov ecx, NKW lea SI, [CTX + w] call endswap_block // And the decryption keys. - mov ecx, NKW lea SI, [CTX + wi] call endswap_block -0: // All done. +9: // All done. #if CPUFAM_X86 pop edi pop esi @@ -401,14 +352,15 @@ FUNC(rijndael_setup_x86ish_aesni) .align 16 endswap_block: - // End-swap ECX words starting at SI. The end-swapping table is + // End-swap NKW words starting at SI. The end-swapping table is // already loaded into XMM5; and it's OK to work in 16-byte chunks. - movdqu xmm1, [SI] + mov ecx, NKW +0: movdqu xmm1, [SI] pshufb xmm1, xmm5 movdqu [SI], xmm1 add SI, 16 sub ecx, 4 - ja endswap_block + ja 0b ret #undef CTX @@ -431,21 +383,17 @@ ENDFUNC ///-------------------------------------------------------------------------- /// Encrypting and decrypting blocks. - .macro encdec op, aes, koff -FUNC(rijndael_\op\()_x86ish_aesni) - - // Find the magic endianness-swapping table. - ldgot ecx - movdqa xmm5, [INTADDR(endswap_tab, ecx)] +.macro encdec op, aes, koff + FUNC(rijndael_\op\()_x86ish_aesni) #if CPUFAM_X86 // Arguments come in on the stack, and need to be collected. We // don't have a shortage of registers. -# define K ecx +# define K eax # define SRC edx # define DST edx -# define NR eax +# define NR ecx mov K, [esp + 4] mov SRC, [esp + 8] @@ -467,8 +415,13 @@ FUNC(rijndael_\op\()_x86ish_aesni) # define SRC rdx # define DST r8 # define NR eax + .seh_endprologue #endif + // Find the magic endianness-swapping table. + ldgot ecx + movdqa xmm5, [INTADDR(endswap_tab, ecx)] + // Initial setup. movdqu xmm0, [SRC] pshufb xmm0, xmm5 @@ -479,6 +432,9 @@ FUNC(rijndael_\op\()_x86ish_aesni) movdqu xmm1, [K] add K, 16 pxor xmm0, xmm1 +#if CPUFAM_X86 + mov DST, [esp + 12] +#endif // Dispatch to the correct code. cmp NR, 10 @@ -556,9 +512,6 @@ FUNC(rijndael_\op\()_x86ish_aesni) // Unpermute the ciphertext block and store it. pshufb xmm0, xmm5 -#if CPUFAM_X86 - mov DST, [esp + 12] -#endif movdqu [DST], xmm0 // And we're done. @@ -569,8 +522,8 @@ FUNC(rijndael_\op\()_x86ish_aesni) #undef DST #undef NR -ENDFUNC - .endm + ENDFUNC +.endm encdec eblk, aesenc, w encdec dblk, aesdec, wi @@ -589,11 +542,11 @@ bogus: callext F(abort) 0: hlt jmp 0b - gotaux ecx - ///-------------------------------------------------------------------------- /// Data tables. + RODATA + .align 16 endswap_tab: .byte 3, 2, 1, 0