1 /// -*- mode: asm; asm-comment-char: ?/ -*-
3 /// AESNI-based implementation of Rijndael
5 /// (c) 2015 Straylight/Edgeware
8 ///----- Licensing notice ---------------------------------------------------
10 /// This file is part of Catacomb.
12 /// Catacomb is free software; you can redistribute it and/or modify
13 /// it under the terms of the GNU Library General Public License as
14 /// published by the Free Software Foundation; either version 2 of the
15 /// License, or (at your option) any later version.
17 /// Catacomb is distributed in the hope that it will be useful,
18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 /// GNU Library General Public License for more details.
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb; if not, write to the Free
24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 /// MA 02111-1307, USA.
27 ///--------------------------------------------------------------------------
28 /// Preliminaries.
31 #include "asm-common.h"
33 ///--------------------------------------------------------------------------
34 /// External definitions.
// Round constants, defined elsewhere; the key-setup code below takes their
// address when expanding the key.
37 .globl F(rijndael_rcon)
39 ///--------------------------------------------------------------------------
45 /// The AESNI instructions implement a little-endian version of AES, but
46 /// Catacomb's internal interface presents as big-endian so as to work better
47 /// with things like GCM. We therefore maintain the round keys in
48 /// little-endian form, and have to end-swap blocks in and out.
50 /// For added amusement, the AESNI instructions don't implement the
51 /// larger-block versions of Rijndael, so we have to end-swap the keys if
52 /// we're preparing for one of those.
// Sizing limits. `kbufsz' is derived from the two maxima: it allows for
// (maxrounds + 1) round keys, each of the maximum block size.
55 .equ maxrounds, 16 // maximum number of rounds
56 .equ maxblksz, 32 // maximum block size, in bytes
57 .equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
// Byte offsets of the fields within the context block. The code below
// addresses them relative to the context base pointer, e.g. `[ebp + nr]'.
60 .equ nr, 0 // number of rounds
61 .equ w, nr + 4 // encryption key words
62 .equ wi, w + kbufsz // decryption key words
64 ///--------------------------------------------------------------------------
67 FUNC(rijndael_setup_x86_aesni)
// Key setup. Expands the caller's key into the encryption key words,
// derives the reordered decryption key words from them, and finally
// end-swaps both sets if the block size is not four words.
//
// NOTE(review): the argument offsets below start at [esp + 20] rather
// than [esp + 4], which presumes 16 bytes of saved registers are
// already on the stack at this point -- confirm against the prologue.
69 // Initial state. We have four arguments:
70 // [esp + 20] is the context pointer
71 // [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
72 // [esp + 28] points to the key material, unaligned
73 // [esp + 32] is the size of the key, in words
74 // The key size has already been checked for validity, and the number
75 // of rounds has been computed. Our job is only to fill in the `w'
// and `wi' vectors.
83 // The initial round key material is taken directly from the input
84 // key, so copy it over.
85 mov ebp, [esp + 20] // context base pointer
86 mov ebx, [esp + 32] // key size, in words
92 // Find out other useful things.
93 mov edx, [ebp + nr] // number of rounds
95 imul edx, [esp + 24] // total key size in words
96 sub edx, ebx // offset by the key size
98 // Find the round constants.
100 leaext ecx, rijndael_rcon, ecx
102 // Prepare for the main loop.
// NOTE(review): ESI is used as the base of the key words being
// expanded; presumably it addresses the `w' vector -- confirm.
104 mov eax, [esi + 4*ebx - 4] // most recent key word
105 lea edx, [esi + 4*edx] // limit, offset by one key expansion
107 // Main key expansion loop. The first word of each key-length chunk
108 // needs special treatment.
110 // This is rather tedious because the Intel `AESKEYGENASSIST'
111 // instruction is very strangely shaped. Firstly, it wants to
112 // operate on vast SSE registers, even though we're data-blocked from
113 // doing more than one operation at a time unless we're doing two key
114 // schedules simultaneously -- and even then we can't do more than
115 // two, because the instruction ignores two of its input words
116 // entirely, and produces two different outputs for each of the other
117 // two. And secondly it insists on taking the magic round constant
118 // as an immediate, so it's kind of annoying if you're not
119 // open-coding the whole thing. It's much easier to leave that as
120 // zero and XOR in the round constant by hand.
122 pshufd xmm0, xmm0, 0x39 // lane i <- lane (i + 1) mod 4
123 aeskeygenassist xmm1, xmm0, 0 // round constant left as zero
124 pshufd xmm1, xmm1, 0x93 // lane i <- lane (i + 3) mod 4
129 mov [esi + 4*ebx], eax // store new word at ESI + 4*EBX
134 // The next three words are simple...
136 mov [esi + 4*ebx], eax
143 mov [esi + 4*ebx], eax
150 mov [esi + 4*ebx], eax
155 // Word 4. If the key is /more/ than 6 words long, then we must
156 // apply a substitution here.
162 pshufd xmm0, xmm0, 0x93 // lane i <- lane (i + 3) mod 4
163 aeskeygenassist xmm1, xmm0, 0 // round constant left as zero
166 mov [esi + 4*ebx], eax
175 mov [esi + 4*ebx], eax
184 mov [esi + 4*ebx], eax
193 mov [esi + 4*ebx], eax
198 // Must be done by now.
201 // Next job is to construct the decryption keys. The keys for the
202 // first and last rounds don't need to be mangled, but the remaining
203 // ones do -- and they all need to be reordered too.
205 // The plan of action, then, is to copy the final encryption round's
206 // keys into place first, then to do each of the intermediate rounds
207 // in reverse order, and finally do the first round.
209 // Do all of the heavy lifting with SSE registers. The order we're
210 // doing this in means that it's OK if we read or write too much, and
211 // there's easily enough buffer space for the over-enthusiastic reads
212 // and writes because the context has space for 32-byte blocks, which
213 // is our maximum and an exact fit for two SSE registers.
214 8: mov ecx, [ebp + nr] // number of rounds
215 mov ebx, [esp + 24] // block size (in words)
219 lea esi, [ebp + 4*edx + w] // last round's keys
220 shl ebx, 2 // block size (in bytes now)
222 // Copy the last encryption round's keys.
// Each copy moves up to 32 bytes as two unaligned 16-byte halves;
// this is the second half.
227 movdqu xmm0, [esi + 16]
228 movdqu [edi + 16], xmm0
230 // Update the loop variables and stop if we've finished.
236 // Do another middle round's keys...
242 movdqu xmm0, [esi + 16]
244 movdqu [edi + 16], xmm0
247 // Finally do the first encryption round.
248 0: movdqu xmm0, [esi]
252 movdqu xmm0, [esi + 16]
253 movdqu [edi + 16], xmm0
255 // If the block size is not exactly four words then we must end-swap
256 // everything. We can use fancy SSE toys for this.
260 // Find the byte-reordering table.
// MOVDQA faults on a misaligned address, so `endswap_tab' must be
// 16-byte aligned.
262 movdqa xmm7, [INTADDR(endswap_tab, ecx)]
264 // Calculate the number of subkey words again. (It's a good job
265 // we've got a fast multiplier.)
268 imul ecx, [esp + 24] // total keys in words
270 // End-swap the encryption keys.
275 // And the decryption keys.
289 // End-swap ECX words starting at ESI. The end-swapping table is
290 // already loaded into XMM7; and it's OK to work in 16-byte chunks.
301 ///--------------------------------------------------------------------------
302 /// Encrypting and decrypting blocks.
304 FUNC(rijndael_eblk_x86_aesni)
// Encrypt a single block. XMM7 holds the end-swapping table, XMM1
// receives each round key in turn, and XMM0 is the block being
// worked on (it is the destination of the final AESENCLAST).
306 // On entry, we have:
307 // [esp + 4] points to the context block
308 // [esp + 8] points to the input data block
309 // [esp + 12] points to the output buffer
311 // Find the magic endianness-swapping table.
// MOVDQA faults on a misaligned address, so `endswap_tab' must be
// 16-byte aligned.
313 movdqa xmm7, [INTADDR(endswap_tab, ecx)]
315 // Load the input block and end-swap it. Also, start loading the
// round keys.
324 // Initial whitening.
329 // Dispatch to the correct code.
// Entry points, one per supported round count, in descending order.
// NOTE(review): each of er14..er11 presumably performs one round and
// steps EDX on to the next round key before reaching the following
// label -- confirm.
344 er14: movdqu xmm1, [edx]
349 er13: movdqu xmm1, [edx]
354 er12: movdqu xmm1, [edx]
359 er11: movdqu xmm1, [edx]
// From here on the round keys are read at fixed 16-byte strides
// from EDX: [edx], [edx + 16], ..., [edx + 144].
364 er10: movdqu xmm1, [edx]
368 movdqu xmm1, [edx + 16]
372 movdqu xmm1, [edx + 32]
376 movdqu xmm1, [edx + 48]
380 movdqu xmm1, [edx + 64]
384 movdqu xmm1, [edx + 80]
388 movdqu xmm1, [edx + 96]
392 movdqu xmm1, [edx + 112]
396 movdqu xmm1, [edx + 128]
400 movdqu xmm1, [edx + 144]
401 aesenclast xmm0, xmm1 // final round uses the last key loaded
403 // Unpermute the ciphertext block and store it.
413 FUNC(rijndael_dblk_x86_aesni)
// Decrypt a single block. XMM7 holds the end-swapping table, XMM1
// receives each round key in turn, and XMM0 is the block being
// worked on (it is the destination of the final AESDECLAST).
415 // On entry, we have:
416 // [esp + 4] points to the context block
417 // [esp + 8] points to the input data block
418 // [esp + 12] points to the output buffer
420 // Find the magic endianness-swapping table.
// MOVDQA faults on a misaligned address, so `endswap_tab' must be
// 16-byte aligned.
422 movdqa xmm7, [INTADDR(endswap_tab, ecx)]
424 // Load the input block and end-swap it. Also, start loading the
// round keys.
433 // Initial whitening.
438 // Dispatch to the correct code.
// Entry points, one per supported round count, in descending order.
// NOTE(review): each of dr14..dr11 presumably performs one round and
// steps EDX on to the next round key before reaching the following
// label -- confirm.
453 dr14: movdqu xmm1, [edx]
458 dr13: movdqu xmm1, [edx]
463 dr12: movdqu xmm1, [edx]
468 dr11: movdqu xmm1, [edx]
// From here on the round keys are read at fixed 16-byte strides
// from EDX: [edx], [edx + 16], ..., [edx + 144].
473 dr10: movdqu xmm1, [edx]
477 movdqu xmm1, [edx + 16]
481 movdqu xmm1, [edx + 32]
485 movdqu xmm1, [edx + 48]
489 movdqu xmm1, [edx + 64]
493 movdqu xmm1, [edx + 80]
497 movdqu xmm1, [edx + 96]
501 movdqu xmm1, [edx + 112]
505 movdqu xmm1, [edx + 128]
509 movdqu xmm1, [edx + 144]
510 aesdeclast xmm0, xmm1 // final round uses the last key loaded
512 // Unpermute the plaintext block and store it.
522 ///--------------------------------------------------------------------------
523 /// Random utilities.
526 // Abort the process because of a programming error. Indirecting
527 // through this point serves several purposes: (a) by CALLing, rather
528 // than branching to, `abort', we can save the return address, which
529 // might at least provide a hint as to what went wrong; (b) we don't
530 // have conditional CALLs (and they'd be big anyway); and (c) we can
531 // write a HLT here as a backstop against `abort' being mad.
532 bogus: callext F(abort) // CALL, not JMP: keeps a return address
538 ///--------------------------------------------------------------------------
548 ///----- That's all, folks --------------------------------------------------