/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// AESNI-based implementation of Rijndael
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// External definitions.

	.globl	F(abort)
	.globl	F(rijndael_rcon)

///--------------------------------------------------------------------------
/// Main code.

	.arch	.aes
	.section .text

/// The AESNI instructions implement a little-endian version of AES, but
/// Catacomb's internal interface presents as big-endian so as to work better
/// with things like GCM.  We therefore maintain the round keys in
/// little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the AESNI instructions don't implement the
/// larger-block versions of Rijndael, so we have to end-swap the keys if
/// we're preparing for one of those.

	// Useful constants.
/// Buffer sizing and the layout of the context structure.

	.equ	maxrounds, 16		// upper bound on the round count
	.equ	maxblksz, 32		// upper bound on the block size, in bytes
	.equ	kbufsz, maxblksz*(maxrounds + 1) // bytes in one key-schedule buffer

	// Offsets of the context structure's members.
	.equ	nr, 0			// round count
	.equ	w, nr + 4		// encryption round-key words
	.equ	wi, w + kbufsz		// decryption round-key words

///--------------------------------------------------------------------------
/// Private helper macros.
///
/// These only give names to instruction groups which recur below; each
/// expands to exactly the instructions listed, in the order listed.

	// rj_kxstep: finish one word of key expansion.
	//
	// EAX holds the running value.  XOR in the word at [ESI], store the
	// result EBX words further on, step ESI to the next word, and leave
	// for the next `8:' label once ESI has reached the limit in EDX.
	.macro	rj_kxstep
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f
	.endm

	// rj_kmove DONE [, MANGLE]: move one round's key from [ESI] to [EDI].
	//
	// The first 16 bytes are always moved.  If the block size in EBX is
	// at most 16 bytes then branch to DONE; otherwise move the following
	// 16 bytes too and fall out of the bottom.  If MANGLE is given, it
	// names an instruction applied to XMM0 between each load and store.
	// Clobbers XMM0 and the flags.
	.macro	rj_kmove done, mangle
	movdqu	xmm0, [esi]
	.ifnb	\mangle
	\mangle	xmm0, xmm0
	.endif
	movdqu	[edi], xmm0
	cmp	ebx, 16
	jbe	\done
	movdqu	xmm0, [esi + 16]
	.ifnb	\mangle
	\mangle	xmm0, xmm0
	.endif
	movdqu	[edi + 16], xmm0
	.endm

	// rj_blkin KEYS: common entry sequence for the block functions.
	//
	// Expects the stack exactly as it is on function entry:
	//
	//	[esp +  4]	context
	//	[esp +  8]	input block
	//
	// KEYS is the context offset of the key schedule to use.  On exit:
	// XMM7 holds the end-swapping table; XMM0 holds the end-swapped input
	// block XORed with the first round key; EDX addresses the second
	// round key; EAX holds the round count.  Clobbers ECX and XMM1.
	.macro	rj_blkin keys
	ldgot	ecx
	movdqa	xmm7, [INTADDR(endswap_tab, ecx)]
	mov	eax, [esp + 8]
	movdqu	xmm0, [eax]
	pshufb	xmm0, xmm7
	mov	eax, [esp + 4]
	lea	edx, [eax + \keys]
	mov	eax, [eax + nr]
	movdqu	xmm1, [edx]
	add	edx, 16
	pxor	xmm0, xmm1
	.endm

	// rj_blkout: common exit sequence for the block functions.
	//
	// End-swap the block in XMM0 (table in XMM7) and store it through the
	// output pointer at [esp + 12].  Clobbers EAX.
	.macro	rj_blkout
	pshufb	xmm0, xmm7
	mov	eax, [esp + 12]
	movdqu	[eax], xmm0
	.endm

	// rj_rkstep OP: apply round instruction OP to XMM0 using the round
	// key at [EDX], and advance EDX past that key.  Clobbers XMM1.
	.macro	rj_rkstep op
	movdqu	xmm1, [edx]
	add	edx, 16
	\op	xmm0, xmm1
	.endm

///--------------------------------------------------------------------------
/// Key setup.

FUNC(rijndael_setup_x86_aesni)

	// Arguments, at the offsets they have once the four registers below
	// have been pushed:
	//
	//	[esp + 20]	context pointer
	//	[esp + 24]	block size, in 32-bit words (4, 6, or 8)
	//	[esp + 28]	key material; no particular alignment
	//	[esp + 32]	key size, in words
	//
	// The key size has been validated and the round count stored in the
	// context before we get here; all that remains is to fill in the `w'
	// and `wi' vectors.

	push	ebp
	push	ebx
	push	esi
	push	edi

	// The schedule begins with the key itself, so copy that in.
	mov	ebp, [esp + 20]		// EBP = context, for the duration
	mov	ebx, [esp + 32]		// EBX = key size in words
	mov	ecx, ebx
	mov	esi, [esp + 28]
	lea	edi, [ebp + w]
	rep movsd

	// Work out how much more there is to generate.
	mov	edx, [ebp + nr]
	add	edx, 1
	imul	edx, [esp + 24]		// (rounds + 1) * block words
	sub	edx, ebx		// less one key's worth

	// Locate the round-constant table.
	ldgot	ecx
	leaext	ecx, rijndael_rcon, ecx

	// Loop state: ESI walks the schedule, EAX carries the newest word,
	// and EDX is the address at which ESI must stop.  Each step writes
	// EBX words ahead of ESI, which is why the limit was reduced by one
	// key length above.
	lea	esi, [ebp + w]
	mov	eax, [esi + 4*ebx - 4]
	lea	edx, [esi + 4*edx]

	// The expansion loop proper.  One trip around handles one key-length
	// chunk of the schedule, and the first word of each chunk is special:
	// the previous word is rotated, substituted, and combined with a
	// round constant before the usual XOR.
	//
	// `AESKEYGENASSIST' is an awkward fit for this.  It works on a whole
	// SSE register but looks only at lanes 1 and 3 of its source, and it
	// takes the round constant as an immediate.  So we place the word in
	// lane 3 (`MOVD' then `PSHUFD' 0x39), run the instruction with a zero
	// constant, bring lane 3 of the result -- the rotated and substituted
	// word -- back down to lane 0 (`PSHUFD' 0x93), and XOR in a
	// round-constant byte ourselves from the table at ECX.
9:	movd	xmm0, eax
	pshufd	xmm0, xmm0, 0x39
	aeskeygenassist xmm1, xmm0, 0
	pshufd	xmm1, xmm1, 0x93
	movd	eax, xmm1
	xor	eax, [esi]
	xor	al, [ecx]		// round constant, low byte only
	inc	ecx
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// Words 1, 2 and 3 of the chunk need nothing special.
	rj_kxstep
	rj_kxstep
	rj_kxstep

	// Word 4 exists only if the key has at least five words.  If the key
	// has more than six words, the running value is substituted (without
	// rotation) first: put it in lane 1 and read lane 0 of the result.
	cmp	ebx, 5
	jb	9b
	cmp	ebx, 7
	jb	0f
	movd	xmm0, eax
	pshufd	xmm0, xmm0, 0x93
	aeskeygenassist xmm1, xmm0, 0
	movd	eax, xmm1
0:
	rj_kxstep

	// Word 5, for keys of at least six words.
	cmp	ebx, 6
	jb	9b
	rj_kxstep

	// Word 6, for keys of at least seven words.
	cmp	ebx, 7
	jb	9b
	rj_kxstep

	// Word 7, for keys of eight words.
	cmp	ebx, 8
	jb	9b
	rj_kxstep

	// No key is longer than that, so start the next chunk.
	jmp	9b

	// Now build the decryption schedule in `wi'.  It holds the same round
	// keys in the opposite order; the keys for the two outermost rounds
	// are copied verbatim, and every one in between goes through
	// `AESIMC' on the way.
	//
	// Keys are moved 16 or 32 bytes at a time through XMM0, so a 24-byte
	// block size reads and writes 8 bytes too many.  That is harmless:
	// the buffers are sized for 32-byte blocks, and since we fill `wi'
	// from the bottom up, any excess written is overwritten by the next
	// key or lands in unused space.
8:	mov	ecx, [ebp + nr]		// ECX = rounds still to do
	mov	ebx, [esp + 24]		// EBX = block size in words
	mov	edx, ecx
	imul	edx, ebx
	lea	edi, [ebp + wi]		// EDI = next slot in `wi'
	lea	esi, [ebp + 4*edx + w]	// ESI = final round's key in `w'
	shl	ebx, 2			// EBX = block size in bytes

	// The final encryption round's key goes first, untouched.
	rj_kmove 9f

	// Step both pointers by one round key; leave once only the first
	// encryption round's key remains.
9:	add	edi, ebx
	sub	esi, ebx
	sub	ecx, 1
	jbe	0f

	// A middle round: transform the key as it is moved.
	rj_kmove 9b, aesimc
	jmp	9b

	// The first encryption round's key goes last, untouched.
0:
	rj_kmove 0f

	// AESNI covers only 16-byte blocks.  For any other block size,
	// end-swap both schedules in place.
0:	cmp	ebx, 16
	je	0f

	// Fetch the byte-reordering table.
	ldgot	ecx
	movdqa	xmm7, [INTADDR(endswap_tab, ecx)]

	// Recompute the schedule length in words, and keep a copy in EAX
	// because `endswap_block' consumes ECX.
	mov	ecx, [ebp + nr]
	add	ecx, 1
	imul	ecx, [esp + 24]

	// Encryption schedule first...
	mov	eax, ecx
	lea	esi, [ebp + w]
	call	endswap_block

	// ... then the decryption schedule.
	mov	ecx, eax
	lea	esi, [ebp + wi]
	call	endswap_block

	// Restore the saved registers and return.
0:	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

	.align	16
endswap_block:
	// Local subroutine: end-swap ECX words starting at ESI, using the
	// table already in XMM7.  It always processes whole 16-byte chunks,
	// so a count which is not a multiple of four is rounded up.
	// Clobbers ECX, ESI, XMM1 and the flags.
	movdqu	xmm1, [esi]
	pshufb	xmm1, xmm7
	movdqu	[esi], xmm1
	add	esi, 16
	sub	ecx, 4
	ja	endswap_block
	ret

ENDFUNC

///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.

FUNC(rijndael_eblk_x86_aesni)

	// Arguments:
	//
	//	[esp +  4]	context
	//	[esp +  8]	input block
	//	[esp + 12]	output buffer
	//
	// Encrypts one 16-byte block using the schedule in `w'.

	// Load and end-swap the input, and apply the first round key.
	rj_blkin w

	// Enter the round sequence at the point matching the round count in
	// EAX.  Only counts from 10 to 14 are acceptable; anything else goes
	// to `bogus'.
	cmp	eax, 10
	je	er10
	jb	bogus
	cmp	eax, 14
	je	er14
	ja	bogus
	cmp	eax, 12
	je	er12
	jb	er11
	jmp	er13

	.align	2

	// The rounds beyond ten.  Each consumes one key and advances EDX,
	// so that the fixed offsets used from `er10' onwards line up.
er14:	rj_rkstep aesenc
er13:	rj_rkstep aesenc
er12:	rj_rkstep aesenc
er11:	rj_rkstep aesenc

	// The last ten rounds: nine ordinary ones...
er10:	movdqu	xmm1, [edx]
	aesenc	xmm0, xmm1
	.irp	off, 16, 32, 48, 64, 80, 96, 112, 128
	movdqu	xmm1, [edx + \off]
	aesenc	xmm0, xmm1
	.endr

	// ... and the final round.
	movdqu	xmm1, [edx + 144]
	aesenclast xmm0, xmm1

	// End-swap the ciphertext block and store it.
	rj_blkout

	ret

ENDFUNC

FUNC(rijndael_dblk_x86_aesni)

	// Arguments:
	//
	//	[esp +  4]	context
	//	[esp +  8]	input block
	//	[esp + 12]	output buffer
	//
	// Decrypts one 16-byte block using the schedule in `wi'.

	// Load and end-swap the input, and apply the first round key.
	rj_blkin wi

	// Enter the round sequence at the point matching the round count in
	// EAX.  Only counts from 10 to 14 are acceptable; anything else goes
	// to `bogus'.
	cmp	eax, 10
	je	dr10
	jb	bogus
	cmp	eax, 14
	je	dr14
	ja	bogus
	cmp	eax, 12
	je	dr12
	jb	dr11
	jmp	dr13

	.align	2

	// The rounds beyond ten.  Each consumes one key and advances EDX,
	// so that the fixed offsets used from `dr10' onwards line up.
dr14:	rj_rkstep aesdec
dr13:	rj_rkstep aesdec
dr12:	rj_rkstep aesdec
dr11:	rj_rkstep aesdec

	// The last ten rounds: nine ordinary ones...
dr10:	movdqu	xmm1, [edx]
	aesdec	xmm0, xmm1
	.irp	off, 16, 32, 48, 64, 80, 96, 112, 128
	movdqu	xmm1, [edx + \off]
	aesdec	xmm0, xmm1
	.endr

	// ... and the final round.
	movdqu	xmm1, [edx + 144]
	aesdeclast xmm0, xmm1

	// End-swap the plaintext block and store it.
	rj_blkout

	ret

ENDFUNC

///--------------------------------------------------------------------------
/// Random utilities.
/// Failure stub, followed by the byte-reordering table.

	.align	16

	// Common landing pad for programming errors; the block functions
	// branch here when the round count is out of range.  Coming through
	// here rather than going straight to `abort' buys us three things:
	//
	//   * `abort' is CALLed, not jumped to, so a return address is
	//     saved which may hint at what went wrong;
	//
	//   * there are no conditional CALL instructions (and they would be
	//     large if there were), whereas a conditional branch to here is
	//     cheap; and
	//
	//   * should `abort' ever return, the HLT loop below stops us from
	//     running on into whatever follows.
bogus:	callext	F(abort)
1:	hlt
	jmp	1b

	// NOTE(review): `gotaux' comes from asm-common.h; presumably it
	// emits whatever out-of-line support `ldgot ecx' needs -- confirm
	// against that header.
	gotaux	ecx

///--------------------------------------------------------------------------
/// Data tables.

	// `PSHUFB' control vector which reverses the bytes within each
	// 32-bit word of a 16-byte register.  Loaded with `MOVDQA', so the
	// 16-byte alignment is required.
	.align	16
endswap_tab:
	.byte	 3,  2,  1,  0,  7,  6,  5,  4
	.byte	11, 10,  9,  8, 15, 14, 13, 12

///----- That's all, folks --------------------------------------------------