1 /// -*- mode: asm; asm-comment-char: ?/ -*-
3 /// AESNI-based implementation of Rijndael
5 /// (c) 2015 Straylight/Edgeware
8 ///----- Licensing notice ---------------------------------------------------
10 /// This file is part of Catacomb.
12 /// Catacomb is free software; you can redistribute it and/or modify
13 /// it under the terms of the GNU Library General Public License as
14 /// published by the Free Software Foundation; either version 2 of the
15 /// License, or (at your option) any later version.
17 /// Catacomb is distributed in the hope that it will be useful,
18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 /// GNU Library General Public License for more details.
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb; if not, write to the Free
24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 /// MA 02111-1307, USA.
27 ///--------------------------------------------------------------------------
28 /// External definitions.
31 #include "asm-common.h"
34 .extern F(rijndael_rcon)
36 ///--------------------------------------------------------------------------
42 /// The AESNI instructions implement a little-endian version of AES, but
43 /// Catacomb's internal interface presents as big-endian so as to work better
44 /// with things like GCM. We therefore maintain the round keys in
45 /// little-endian form, and have to end-swap blocks in and out.
47 /// For added amusement, the AESNI instructions don't implement the
48 /// larger-block versions of Rijndael, so we have to end-swap the keys if
49 /// we're preparing for one of those.
52 .equ maxrounds, 16 // maximum number of rounds
53 .equ maxblksz, 32 // maximum block size, in bytes
54 .equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer, in bytes: maxrounds + 1 maximum-size blocks
57 .equ nr, 0 // context offset of the number of rounds
58 .equ w, nr + 4 // context offset of the encryption key words (4 bytes past nr)
59 .equ wi, w + kbufsz // context offset of the decryption key words (kbufsz bytes past w)
61 ///--------------------------------------------------------------------------
/// AVX-flavoured entry point for key setup. Executes VZEROUPPER and then
/// drops through into rijndael_setup_x86ish_aesni, which follows.
64 FUNC(rijndael_setup_x86ish_aesni_avx)
65 vzeroupper // avoid penalty on `legacy' XMM access
67 // and drop through into rijndael_setup_x86ish_aesni...
/// Key setup using the AESNI instructions. The arguments, in order, are
/// the context pointer, the block size in words, the key material, and
/// the key size in words. The code expands the key into the encryption
/// round keys, then constructs the decryption keys from them, and finally
/// end-swaps everything if the block size is not exactly four words.
70 FUNC(rijndael_setup_x86ish_aesni)
76 // Arguments are on the stack. We'll need to stack the caller's
77 // register variables, but we'll manage.
79 # define CTX ebp // context pointer
80 # define BLKSZ [esp + 24] // block size
82 # define KSZ ebx // key size
83 # define NKW edx // total number of key words
84 # define NKW_NEEDS_REFRESH 1 // ... needs recalculating
85 # define RCON ecx // round constants table
86 # define LIM edx // limit pointer
87 # define CYIX edi // index in shift-register cycle
89 # define NR ecx // number of rounds
90 # define LRK eax // distance to last key
91 # define BLKOFF edx // block size in bytes
93 // Stack the caller's registers.
99 // Set up our own variables.
100 mov CTX, [esp + 20] // context base pointer
101 mov SI, [esp + 28] // key material
102 mov KSZ, [esp + 32] // key size, in words
105 #if CPUFAM_AMD64 && ABI_SYSV
106 // Arguments are in registers. We have plenty, but, to be honest,
107 // the initial register allocation is a bit annoying.
109 # define CTX r8 // context pointer
110 # define BLKSZ r9d // block size
112 # define KSZ edx // key size
113 # define NKW r10d // total number of key words
114 # define RCON rdi // round constants table
115 # define LIM rcx // limit pointer
116 # define CYIX r11d // index in shift-register cycle
118 # define NR ecx // number of rounds
119 # define LRK eax // distance to last key
120 # define BLKOFF r9d // block size in bytes
122 // Move arguments to more useful places.
123 mov CTX, rdi // context base pointer
124 mov BLKSZ, esi // block size in words
125 mov SI, rdx // key material
126 mov KSZ, ecx // key size, in words
129 #if CPUFAM_AMD64 && ABI_WIN
130 // Arguments are in different registers, and they're a little tight.
132 # define CTX r8 // context pointer
133 # define BLKSZ edx // block size
135 # define KSZ r9d // key size
136 # define NKW r10d // total number of key words
137 # define RCON rdi // round constants table
138 # define LIM rcx // limit pointer
139 # define CYIX r11d // index in shift-register cycle
141 # define NR ecx // number of rounds
142 # define LRK eax // distance to last key
143 # define BLKOFF edx // block size in bytes
145 // We'll need the index registers, which belong to the caller in this ABI.
150 // Move arguments to more useful places.
151 mov rsi, r8 // key material
152 mov CTX, rcx // context base pointer
157 // The initial round key material is taken directly from the input
158 // key, so copy it over.
159 #if CPUFAM_AMD64 && ABI_SYSV
160 // We've been lucky. We already have a copy of the context pointer
161 // in rdi, and the key size in ecx.
169 // Find out other useful things.
170 mov NKW, [CTX + nr] // number of rounds
172 imul NKW, BLKSZ // total key size in words
173 #if !NKW_NEEDS_REFRESH
174 // If we can't keep NKW for later, then we use the same register for
175 // it and LIM, so this move is unnecessary.
178 sub DWORD(LIM), KSZ // offset by the key size
180 // Find the round constants.
182 leaext RCON, F(rijndael_rcon), WHOLE(c)
184 // Prepare for the main loop.
186 mov eax, [SI + 4*WHOLE(KSZ) - 4] // most recent key word
187 lea LIM, [SI + 4*LIM] // limit, offset by one key expansion
188 xor CYIX, CYIX // start of new cycle
190 // Main key expansion loop. The first word of each key-length chunk
191 // needs special treatment.
193 // This is rather tedious because the Intel `AESKEYGENASSIST'
194 // instruction is very strangely shaped. Firstly, it wants to
195 // operate on vast SSE registers, even though we're data-blocked from
196 // doing more than one operation at a time unless we're doing two key
197 // schedules simultaneously -- and even then we can't do more than
198 // two, because the instruction ignores two of its input words
199 // entirely, and produces two different outputs for each of the other
200 // two. And secondly it insists on taking the magic round constant
201 // as an immediate, so it's kind of annoying if you're not
202 // open-coding the whole thing. It's much easier to leave that as
203 // zero and XOR in the round constant by hand.
204 0: cmp CYIX, 0 // first word of the cycle?
206 cmp CYIX, 4 // fourth word of the cycle?
208 cmp KSZ, 7 // and a large key?
211 // Fourth word of the cycle, and seven or eight words of key. Do a
212 // byte substitution.
214 pshufd xmm0, xmm0, SHUF(2, 1, 0, 3)
215 aeskeygenassist xmm1, xmm0, 0 // round constant left as zero
219 // First word of the cycle. This is the complicated piece.
221 pshufd xmm0, xmm0, SHUF(0, 3, 2, 1)
222 aeskeygenassist xmm1, xmm0, 0 // round constant left as zero
223 pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
228 // Common tail. Mix in the corresponding word from the previous
229 // cycle and prepare for the next loop.
231 mov [SI + 4*WHOLE(KSZ)], eax // store the new word, one key length past SI
241 // Next job is to construct the decryption keys. The keys for the
242 // first and last rounds don't need to be mangled, but the remaining
243 // ones do -- and they all need to be reordered too.
245 // The plan of action, then, is to copy the final encryption round's
246 // keys into place first, then to do each of the intermediate rounds
247 // in reverse order, and finally do the first round.
249 // Do all of the heavy lifting with SSE registers. The order we're
250 // doing this in means that it's OK if we read or write too much, and
251 // there's easily enough buffer space for the over-enthusiastic reads
252 // and writes because the context has space for 32-byte blocks, which
253 // is our maximum and an exact fit for two SSE registers.
254 9: mov NR, [CTX + nr] // number of rounds
255 #if NKW_NEEDS_REFRESH
260 // If we retain NKW, then BLKSZ and BLKOFF are the same register
261 // because we won't need the former again.
266 lea SI, [CTX + w + 4*WHOLE(LRK)] // last round's keys
267 shl BLKOFF, 2 // block size (in bytes now)
269 // Copy the last encryption round's keys.
274 movdqu xmm0, [SI + 16] // second 16-byte chunk
275 movdqu [DI + 16], xmm0
277 // Update the loop variables and stop if we've finished.
278 0: add DI, WHOLE(BLKOFF)
279 sub SI, WHOLE(BLKOFF)
283 // Do another middle round's keys...
289 movdqu xmm0, [SI + 16]
291 movdqu [DI + 16], xmm0
294 // Finally do the first encryption round.
299 movdqu xmm0, [SI + 16]
300 movdqu [DI + 16], xmm0
302 // If the block size is not exactly four words then we must end-swap
303 // everything. We can use fancy SSE toys for this.
307 // Find the byte-reordering table.
309 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
311 #if NKW_NEEDS_REFRESH
312 // Calculate the number of subkey words again. (It's a good job
313 // we've got a fast multiplier.)
319 // End-swap the encryption keys.
323 // And the decryption keys.
334 #if CPUFAM_AMD64 && ABI_WIN
342 INTFUNC(endswap_block)
343 // End-swap NKW words starting at SI. The end-swapping table is
344 // already loaded into XMM5; and it's OK to work in 16-byte chunks.
// (The setup code loads XMM5 from endswap_tab before end-swapping the
// encryption and decryption keys. NOTE(review): presumably it does so
// by calling this routine -- confirm against the call sites.)
370 ///--------------------------------------------------------------------------
371 /// Encrypting and decrypting blocks.
/// Define a pair of block-processing entry points,
/// rijndael_OP_x86ish_aesni_avx and rijndael_OP_x86ish_aesni. OP is the
/// name fragment, and AES is the instruction stem: the final round is
/// done with AESlast. KOFF is instantiated as `w' or `wi' below.
/// NOTE(review): presumably KOFF is the context offset of the round keys
/// to use -- confirm.
373 .macro encdec op, aes, koff
374 FUNC(rijndael_\op\()_x86ish_aesni_avx)
375 vzeroupper // avoid XMM penalties
377 // and drop through into the non-AVX entry point...
380 FUNC(rijndael_\op\()_x86ish_aesni)
383 // Arguments come in on the stack, and need to be collected. We
384 // don't have a shortage of registers.
395 #if CPUFAM_AMD64 && ABI_SYSV
396 // Arguments come in registers. All is good.
404 #if CPUFAM_AMD64 && ABI_WIN
405 // Arguments come in different registers.
415 // Find the magic endianness-swapping table.
417 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
425 // Initial whitening.
433 // Dispatch to the correct code.
472 movdqu xmm1, [K + 16]
476 movdqu xmm1, [K + 32]
480 movdqu xmm1, [K + 48]
484 movdqu xmm1, [K + 64]
488 movdqu xmm1, [K + 80]
492 movdqu xmm1, [K + 96]
496 movdqu xmm1, [K + 112]
500 movdqu xmm1, [K + 128]
504 movdqu xmm1, [K + 144]
505 \aes\()last xmm0, xmm1
507 // Unpermute the output block (ciphertext for `aesenc', plaintext for
// `aesdec') and store it.
// Define rijndael_eblk_x86ish_aesni and its `_avx' variant, with
// AES = aesenc (so the final round is AESENCLAST) and KOFF = w.
522 encdec eblk, aesenc, w
// Define rijndael_dblk_x86ish_aesni and its `_avx' variant, with
// AES = aesdec (so the final round is AESDECLAST) and KOFF = wi.
523 encdec dblk, aesdec, wi
525 ///--------------------------------------------------------------------------
526 /// Random utilities.
529 // Abort the process because of a programming error. Indirecting
530 // through this point serves several purposes: (a) by CALLing, rather
531 // than branching to, `abort', we can save the return address, which
532 // might at least provide a hint as to what went wrong; (b) we don't
533 // have conditional CALLs (and they'd be big anyway); and (c) we can
534 // write a HLT here as a backstop against `abort' being mad.
543 ///--------------------------------------------------------------------------
555 ///----- That's all, folks --------------------------------------------------