/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// AESNI-based implementation of Rijndael
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

	.arch	.aes

	.extern	F(abort)
	.extern	F(rijndael_rcon)

	.text

///--------------------------------------------------------------------------
/// Main code.

/// The AESNI instructions implement a little-endian version of AES, but
/// Catacomb's internal interface presents as big-endian so as to work better
/// with things like GCM.  We therefore maintain the round keys in
/// little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the AESNI instructions don't implement the
/// larger-block versions of Rijndael, so we have to end-swap the keys if
/// we're preparing for one of those.

// Useful constants.
	.equ	maxrounds, 16		// maximum number of rounds
	.equ	maxblksz, 32		// maximum block size, in bytes
	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer

// Context structure.  (Offsets into the C-side context object; keep these in
// step with the structure definition in the C source.)
	.equ	nr, 0			// number of rounds
	.equ	w, nr + 4		// encryption key words
	.equ	wi, w + kbufsz		// decryption key words

///--------------------------------------------------------------------------
/// Key setup.

FUNC(rijndael_setup_x86ish_aesni_avx)
	vzeroupper			// avoid penalty on `legacy' XMM access
  endprologue
	// and drop through...
ENDFUNC

FUNC(rijndael_setup_x86ish_aesni)

#if CPUFAM_X86
	// Arguments are on the stack.  We'll need to stack the caller's
	// register variables, but we'll manage.

# define CTX BP			// context pointer
# define BLKSZ [SP + 24]	// block size

# define KSZ ebx		// key size
# define NKW edx		// total number of key words
# define NKW_NEEDS_REFRESH 1	// ... needs recalculating
# define RCON ecx		// round constants table
# define LIM edx		// limit pointer
# define CYIX edi		// index in shift-register cycle

# define NR ecx			// number of rounds
# define LRK eax		// distance to last key
# define BLKOFF edx		// block size in bytes

	// Stack the caller's registers.
	pushreg	BP
	pushreg	ebx
	pushreg	esi
	pushreg	edi

	// Set up our own variables.
	mov	CTX, [SP + 20]		// context base pointer
	mov	SI, [SP + 28]		// key material
	mov	KSZ, [SP + 32]		// key size, in words
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// Arguments are in registers.  We have plenty, but, to be honest,
	// the initial register allocation is a bit annoying.

# define CTX r8			// context pointer
# define BLKSZ r9d		// block size

# define KSZ edx		// key size
# define NKW r10d		// total number of key words
# define RCON rdi		// round constants table
# define LIM rcx		// limit pointer
# define CYIX r11d		// index in shift-register cycle

# define NR ecx			// number of rounds
# define LRK eax		// distance to last key
# define BLKOFF r9d		// block size in bytes

	// Move arguments to more useful places.
	mov	CTX, rdi		// context base pointer
	mov	BLKSZ, esi		// block size in words
	mov	SI, rdx			// key material
	mov	KSZ, ecx		// key size, in words
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments are in different registers, and they're a little tight.

# define CTX r8			// context pointer
# define BLKSZ edx		// block size

# define KSZ r9d		// key size
# define NKW r10d		// total number of key words
# define RCON rdi		// round constants table
# define LIM rcx		// limit pointer
# define CYIX r11d		// index in shift-register cycle

# define NR ecx			// number of rounds
# define LRK eax		// distance to last key
# define BLKOFF edx		// block size in bytes

	// We'll need the index registers, which belong to the caller in this
	// ABI.
	pushreg	rsi
	pushreg	rdi

	// Move arguments to more useful places.  (Read r8 before overwriting
	// CTX, which aliases it.)
	mov	rsi, r8			// key material
	mov	CTX, rcx		// context base pointer
#endif

	endprologue

	// The initial round key material is taken directly from the input
	// key, so copy it over.
#if CPUFAM_AMD64 && ABI_SYSV
	// We've been lucky.  We already have a copy of the context pointer
	// in rdi, and the key size in ecx.
	add	rdi, w
#else
	lea	DI, [CTX + w]
	mov	ecx, KSZ
#endif
	rep	movsd

	// Find out other useful things.
	mov	NKW, [CTX + nr]		// number of rounds
	add	NKW, 1
	imul	NKW, BLKSZ		// total key size in words
#if !NKW_NEEDS_REFRESH
	// If we can't keep NKW for later, then we use the same register for
	// it and LIM, so this move is unnecessary.
	mov	DWORD(LIM), NKW
#endif
	sub	DWORD(LIM), KSZ		// offset by the key size

	// Find the round constants.
	ldgot	WHOLE(c)
	leaext	RCON, F(rijndael_rcon), WHOLE(c)

	// Prepare for the main loop.
	lea	SI, [CTX + w]
	mov	eax, [SI + 4*WHOLE(KSZ) - 4]	// most recent key word
	lea	LIM, [SI + 4*LIM]	// limit, offset by one key expansion
	xor	CYIX, CYIX		// start of new cycle

	// Main key expansion loop.  The first word of each key-length chunk
	// needs special treatment.
	//
	// This is rather tedious because the Intel `AESKEYGENASSIST'
	// instruction is very strangely shaped.  Firstly, it wants to
	// operate on vast SSE registers, even though we're data-blocked from
	// doing more than one operation at a time unless we're doing two key
	// schedules simultaneously -- and even then we can't do more than
	// two, because the instruction ignores two of its input words
	// entirely, and produces two different outputs for each of the other
	// two.  And secondly it insists on taking the magic round constant
	// as an immediate, so it's kind of annoying if you're not
	// open-coding the whole thing.  It's much easier to leave that as
	// zero and XOR in the round constant by hand.
0:	cmp	CYIX, 0			// first word of the cycle?
	je	1f
	cmp	CYIX, 4			// fourth word of the cycle?
	jne	2f
	cmp	KSZ, 7			// and a large key?
	jb	2f

	// Fourth word of the cycle, and seven or eight words of key.  Do a
	// byte substitution.
	movd	xmm0, eax
	pshufd	xmm0, xmm0, SHUF(2, 1, 0, 3)
	aeskeygenassist xmm1, xmm0, 0
	movd	eax, xmm1
	jmp	2f

	// First word of the cycle.  This is the complicated piece.
1:	movd	xmm0, eax
	pshufd	xmm0, xmm0, SHUF(0, 3, 2, 1)
	aeskeygenassist xmm1, xmm0, 0
	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)
	movd	eax, xmm1
	xor	al, [RCON]		// mix in the next round constant...
	inc	RCON			// ... and advance along the table

	// Common tail.  Mix in the corresponding word from the previous
	// cycle and prepare for the next loop.
2:	xor	eax, [SI]
	mov	[SI + 4*WHOLE(KSZ)], eax
	add	SI, 4
	inc	CYIX
	cmp	SI, LIM
	jae	9f
	cmp	CYIX, KSZ
	jb	0b
	xor	CYIX, CYIX
	jmp	0b

	// Next job is to construct the decryption keys.  The keys for the
	// first and last rounds don't need to be mangled, but the remaining
	// ones do -- and they all need to be reordered too.
	//
	// The plan of action, then, is to copy the final encryption round's
	// keys into place first, then to do each of the intermediate rounds
	// in reverse order, and finally do the first round.
	//
	// Do all of the heavy lifting with SSE registers.  The order we're
	// doing this in means that it's OK if we read or write too much, and
	// there's easily enough buffer space for the over-enthusiastic reads
	// and writes because the context has space for 32-byte blocks, which
	// is our maximum and an exact fit for two SSE registers.
9:	mov	NR, [CTX + nr]		// number of rounds
#if NKW_NEEDS_REFRESH
	mov	BLKOFF, BLKSZ
	mov	LRK, NR
	imul	LRK, BLKOFF
#else
	// If we retain NKW, then BLKSZ and BLKOFF are the same register
	// because we won't need the former again.
	mov	LRK, NKW
	sub	LRK, BLKSZ
#endif
	lea	DI, [CTX + wi]
	lea	SI, [CTX + w + 4*WHOLE(LRK)]	// last round's keys
	shl	BLKOFF, 2		// block size (in bytes now)

	// Copy the last encryption round's keys.
	movdqu	xmm0, [SI]
	movdqu	[DI], xmm0
	cmp	BLKOFF, 16
	jbe	0f
	movdqu	xmm0, [SI + 16]
	movdqu	[DI + 16], xmm0

	// Update the loop variables and stop if we've finished.
0:	add	DI, WHOLE(BLKOFF)
	sub	SI, WHOLE(BLKOFF)
	sub	NR, 1
	jbe	9f

	// Do another middle round's keys...
	movdqu	xmm0, [SI]
	aesimc	xmm0, xmm0
	movdqu	[DI], xmm0
	cmp	BLKOFF, 16
	jbe	0b
	movdqu	xmm0, [SI + 16]
	aesimc	xmm0, xmm0
	movdqu	[DI + 16], xmm0
	jmp	0b

	// Finally do the first encryption round.
9:	movdqu	xmm0, [SI]
	movdqu	[DI], xmm0
	cmp	BLKOFF, 16
	jbe	1f
	movdqu	xmm0, [SI + 16]
	movdqu	[DI + 16], xmm0

	// If the block size is not exactly four words then we must end-swap
	// everything.  We can use fancy SSE toys for this.
1:	cmp	BLKOFF, 16
	je	9f

	// Find the byte-reordering table.
	ldgot	ecx
	movdqa	xmm5, [INTADDR(endswap_tab, ecx)]

#if NKW_NEEDS_REFRESH
	// Calculate the number of subkey words again.  (It's a good job
	// we've got a fast multiplier.)
	mov	NKW, [CTX + nr]
	add	NKW, 1
	imul	NKW, BLKSZ
#endif

	// End-swap the encryption keys.
	lea	SI, [CTX + w]
	call	endswap_block

	// And the decryption keys.
	lea	SI, [CTX + wi]
	call	endswap_block

9:	// All done.
#if CPUFAM_X86
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	BP
#endif
#if CPUFAM_AMD64 && ABI_WIN
	popreg	rdi
	popreg	rsi
#endif
	ret
ENDFUNC

INTFUNC(endswap_block)
	// End-swap NKW words starting at SI.  The end-swapping table is
	// already loaded into XMM5; and it's OK to work in 16-byte chunks.
	// Clobbers ecx and xmm1; advances SI past the swapped words.
  endprologue

	mov	ecx, NKW
0:	movdqu	xmm1, [SI]
	pshufb	xmm1, xmm5
	movdqu	[SI], xmm1
	add	SI, 16
	sub	ecx, 4
	ja	0b

	ret
ENDFUNC

#undef CTX
#undef BLKSZ
#undef SI
#undef DI
#undef KSZ
#undef RCON
#undef LIM
#undef NR
#undef LRK
#undef BLKOFF

///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.

.macro	encdec	op, aes, koff
FUNC(rijndael_\op\()_x86ish_aesni_avx)
	vzeroupper			// avoid XMM penalties
  endprologue
	// and drop through...
ENDFUNC

FUNC(rijndael_\op\()_x86ish_aesni)

#if CPUFAM_X86
	// Arguments come in on the stack, and need to be collected.  We
	// don't have a shortage of registers.

# define K eax
# define SRC edx
# define DST edx
# define NR ecx

	mov	K, [SP + 4]
	mov	SRC, [SP + 8]
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// Arguments come in registers.  All is good.

# define K rdi
# define SRC rsi
# define DST rdx
# define NR eax
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in different registers.

# define K rcx
# define SRC rdx
# define DST r8
# define NR eax
#endif

	endprologue

	// Find the magic endianness-swapping table.
	ldgot	ecx
	movdqa	xmm5, [INTADDR(endswap_tab, ecx)]

	// Initial setup.
	movdqu	xmm0, [SRC]
	pshufb	xmm0, xmm5
	mov	NR, [K + nr]
	add	K, \koff

	// Initial whitening.
	movdqu	xmm1, [K]
	add	K, 16
	pxor	xmm0, xmm1
#if CPUFAM_X86
	mov	DST, [SP + 12]
#endif

	// Dispatch to the correct code.  We expect 10 <= NR <= 14; anything
	// outside that range is a programming error and is reported via
	// `bogus'.
	cmp	NR, 10
	je	10f
	jb	bogus
	cmp	NR, 14
	je	14f
	ja	bogus
	cmp	NR, 12
	je	12f
	jb	11f
	jmp	13f

	.align	2

	// 14 rounds...
14:	movdqu	xmm1, [K]
	add	K, 16
	\aes	xmm0, xmm1

	// 13 rounds...
13:	movdqu	xmm1, [K]
	add	K, 16
	\aes	xmm0, xmm1

	// 12 rounds...
12:	movdqu	xmm1, [K]
	add	K, 16
	\aes	xmm0, xmm1

	// 11 rounds...
11:	movdqu	xmm1, [K]
	add	K, 16
	\aes	xmm0, xmm1

	// 10 rounds...
10:	movdqu	xmm1, [K]
	\aes	xmm0, xmm1

	// 9 rounds...
	movdqu	xmm1, [K + 16]
	\aes	xmm0, xmm1

	// 8 rounds...
	movdqu	xmm1, [K + 32]
	\aes	xmm0, xmm1

	// 7 rounds...
	movdqu	xmm1, [K + 48]
	\aes	xmm0, xmm1

	// 6 rounds...
	movdqu	xmm1, [K + 64]
	\aes	xmm0, xmm1

	// 5 rounds...
	movdqu	xmm1, [K + 80]
	\aes	xmm0, xmm1

	// 4 rounds...
	movdqu	xmm1, [K + 96]
	\aes	xmm0, xmm1

	// 3 rounds...
	movdqu	xmm1, [K + 112]
	\aes	xmm0, xmm1

	// 2 rounds...
	movdqu	xmm1, [K + 128]
	\aes	xmm0, xmm1

	// Final round...
	movdqu	xmm1, [K + 144]
	\aes\()last xmm0, xmm1

	// Unpermute the ciphertext block and store it.
	pshufb	xmm0, xmm5
	movdqu	[DST], xmm0

	// And we're done.
	ret

#undef K
#undef SRC
#undef DST
#undef NR

ENDFUNC
.endm

	encdec	eblk, aesenc, w
	encdec	dblk, aesdec, wi

///--------------------------------------------------------------------------
/// Random utilities.

INTFUNC(bogus)
	// Abort the process because of a programming error.  Indirecting
	// through this point serves several purposes: (a) by CALLing, rather
	// than branching to, `abort', we can save the return address, which
	// might at least provide a hint as to what went wrong; (b) we don't
	// have conditional CALLs (and they'd be big anyway); and (c) we can
	// write a HLT here as a backstop against `abort' being mad.
  endprologue

	callext	F(abort)
0:	hlt
	jmp	0b
ENDFUNC

///--------------------------------------------------------------------------
/// Data tables.

	RODATA

	// Byte-shuffle mask for PSHUFB: reverses the bytes within each
	// 32-bit word, converting between big- and little-endian word
	// representations.
	.align	16
endswap_tab:
	.byte	 3,  2,  1,  0
	.byte	 7,  6,  5,  4
	.byte	11, 10,  9,  8
	.byte	15, 14, 13, 12

///----- That's all, folks --------------------------------------------------