symm/rijndael-x86-aesni.S: Unify encryption and decryption with a macro.
[catacomb] / symm / rijndael-x86-aesni.S
CommitLineData
1a0c09c4
MW
1/// -*- mode: asm; asm-comment-char: ?/ -*-
2///
3/// AESNI-based implementation of Rijndael
4///
5/// (c) 2015 Straylight/Edgeware
6///
7
8///----- Licensing notice ---------------------------------------------------
9///
10/// This file is part of Catacomb.
11///
12/// Catacomb is free software; you can redistribute it and/or modify
13/// it under the terms of the GNU Library General Public License as
14/// published by the Free Software Foundation; either version 2 of the
15/// License, or (at your option) any later version.
16///
17/// Catacomb is distributed in the hope that it will be useful,
18/// but WITHOUT ANY WARRANTY; without even the implied warranty of
19/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20/// GNU Library General Public License for more details.
21///
22/// You should have received a copy of the GNU Library General Public
23/// License along with Catacomb; if not, write to the Free
24/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25/// MA 02111-1307, USA.
26
27///--------------------------------------------------------------------------
28/// Preliminaries.
29
30#include "config.h"
31#include "asm-common.h"
32
33///--------------------------------------------------------------------------
34/// External definitions.
35
36 .globl F(abort)
37 .globl F(rijndael_rcon)
38
39///--------------------------------------------------------------------------
47103664
MW
40/// Local utilities.
41
42// Magic constants for shuffling: PSHUFD immediates which rotate the four
// 32-bit lanes of an SSE register.  ROTL moves every lane up one position
// (lane 3 wraps round to lane 0); ROTR moves every lane down one position
// (lane 0 wraps round to lane 3); ROT2 exchanges the two 64-bit halves.
43#define ROTL 0x93
44#define ROT2 0x4e
45#define ROTR 0x39
46
47///--------------------------------------------------------------------------
1a0c09c4
MW
48/// Main code.
49
50 .arch .aes
51 .section .text
52
53/// The AESNI instructions implement a little-endian version of AES, but
54/// Catacomb's internal interface presents as big-endian so as to work better
55/// with things like GCM. We therefore maintain the round keys in
56/// little-endian form, and have to end-swap blocks in and out.
57///
58/// For added amusement, the AESNI instructions don't implement the
59/// larger-block versions of Rijndael, so we have to end-swap the keys if
60/// we're preparing for one of those.
61
62 // Useful constants. A key-schedule buffer has room for one
// maximum-size round key per round, plus one more for the initial
// whitening key.
63 .equ maxrounds, 16 // maximum number of rounds
64 .equ maxblksz, 32 // maximum block size, in bytes
65 .equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
66
67 // Context structure: byte offsets of the fields used in this file.
// NOTE(review): presumably these must agree with the layout of the
// context structure on the C side -- confirm against the header.
68 .equ nr, 0 // number of rounds (a 32-bit word)
69 .equ w, nr + 4 // encryption key words
70 .equ wi, w + kbufsz // decryption key words
71
72///--------------------------------------------------------------------------
73/// Key setup.
74
// Fill in the `w' (encryption) and `wi' (decryption) round-key vectors of
// a Rijndael context from the caller's key.  The function saves and
// restores EBP, EBX, ESI and EDI; the code visible here also writes EAX,
// ECX, EDX, XMM0, XMM1 and (when end-swapping) XMM5.  The `ldgot' and
// `leaext' helper macros are defined elsewhere and may touch more.
75FUNC(rijndael_setup_x86_aesni)
76
77 // Arguments, at their offsets once the four pushes below are done:
78 // [esp + 20] is the context pointer
79 // [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
80 // [esp + 28] points to the key material, unaligned
81 // [esp + 32] is the size of the key, in words
82 // The key size has already been checked for validity, and the number
83 // of rounds has been computed. Our job is only to fill in the `w'
84 // and `wi' vectors.
85
86 push ebp
87 push ebx
88 push esi
89 push edi
90
91 // The initial round key material is taken directly from the input
92 // key, so copy it over.
93 mov ebp, [esp + 20] // context base pointer
94 mov ebx, [esp + 32] // key size, in words
95 mov ecx, ebx // word count for the copy
96 mov esi, [esp + 28] // source: the caller's key material
97 lea edi, [ebp + w] // destination: start of `w'
98 rep movsd
99
100 // Work out how many words the expansion loop has to visit.
101 mov edx, [ebp + nr] // number of rounds
102 add edx, 1
103 imul edx, [esp + 24] // total key size in words
104 sub edx, ebx // offset by the key size
105
106 // Find the round constants.
107 ldgot ecx
108 leaext ecx, rijndael_rcon, ecx
109
110 // Prepare for the main loop. Throughout the loop, ESI points at
// the word one key-length behind the word about to be written, and
// EAX holds the most recently written word.
111 lea esi, [ebp + w]
112 mov eax, [esi + 4*ebx - 4] // most recent key word
113 lea edx, [esi + 4*edx] // limit, offset by one key expansion
114
115 // Main key expansion loop. The first word of each key-length chunk
116 // needs special treatment.
117 //
118 // This is rather tedious because the Intel `AESKEYGENASSIST'
119 // instruction is very strangely shaped. Firstly, it wants to
120 // operate on vast SSE registers, even though we're data-blocked from
121 // doing more than one operation at a time unless we're doing two key
122 // schedules simultaneously -- and even then we can't do more than
123 // two, because the instruction ignores two of its input words
124 // entirely, and produces two different outputs for each of the other
125 // two. And secondly it insists on taking the magic round constant
126 // as an immediate, so it's kind of annoying if you're not
127 // open-coding the whole thing. It's much easier to leave that as
128 // zero and XOR in the round constant by hand.
1299: movd xmm0, eax // previous word into lane 0
47103664 130 pshufd xmm0, xmm0, ROTR // ... and round into lane 3
1a0c09c4 131 aeskeygenassist xmm1, xmm0, 0 // lane 3 = RotWord(SubWord(word))
47103664 132 pshufd xmm1, xmm1, ROTL // bring lane 3 back to lane 0
1a0c09c4
MW
133 movd eax, xmm1
134 xor eax, [esi] // mix with the word one key-length back
135 xor al, [ecx] // fold in this chunk's round constant
136 inc ecx // step on to the next round constant
137 mov [esi + 4*ebx], eax
138 add esi, 4
139 cmp esi, edx
140 jae 8f
141
142 // The next three words are simple...
143 xor eax, [esi]
144 mov [esi + 4*ebx], eax
145 add esi, 4
146 cmp esi, edx
147 jae 8f
148
149 // (Word 2...)
150 xor eax, [esi]
151 mov [esi + 4*ebx], eax
152 add esi, 4
153 cmp esi, edx
154 jae 8f
155
156 // (Word 3...)
157 xor eax, [esi]
158 mov [esi + 4*ebx], eax
159 add esi, 4
160 cmp esi, edx
161 jae 8f
162
163 // Word 4. If the key is /more/ than 6 words long, then we must
164 // apply a substitution here.
165 cmp ebx, 5 // four-word key: chunk is finished
166 jb 9b
167 cmp ebx, 7 // five or six words: no substitution
168 jb 0f
169 movd xmm0, eax // previous word into lane 0
47103664 170 pshufd xmm0, xmm0, ROTL // ... and round into lane 1
1a0c09c4
MW
171 aeskeygenassist xmm1, xmm0, 0 // lane 0 = SubWord(word)
172 movd eax, xmm1
1730: xor eax, [esi]
174 mov [esi + 4*ebx], eax
175 add esi, 4
176 cmp esi, edx
177 jae 8f
178
179 // (Word 5...)
180 cmp ebx, 6
181 jb 9b
182 xor eax, [esi]
183 mov [esi + 4*ebx], eax
184 add esi, 4
185 cmp esi, edx
186 jae 8f
187
188 // (Word 6...)
189 cmp ebx, 7
190 jb 9b
191 xor eax, [esi]
192 mov [esi + 4*ebx], eax
193 add esi, 4
194 cmp esi, edx
195 jae 8f
196
197 // (Word 7...)
198 cmp ebx, 8
199 jb 9b
200 xor eax, [esi]
201 mov [esi + 4*ebx], eax
202 add esi, 4
203 cmp esi, edx
204 jae 8f
205
206 // That was the eighth word, so the chunk must be done by now: go
// round again for the next one.
207 jmp 9b
208
209 // Next job is to construct the decryption keys. The keys for the
210 // first and last rounds don't need to be mangled, but the remaining
211 // ones do -- and they all need to be reordered too.
212 //
213 // The plan of action, then, is to copy the final encryption round's
214 // keys into place first, then to do each of the intermediate rounds
215 // in reverse order, and finally do the first round.
216 //
217 // Do all of the heavy lifting with SSE registers. The order we're
218 // doing this in means that it's OK if we read or write too much, and
219 // there's easily enough buffer space for the over-enthusiastic reads
220 // and writes because the context has space for 32-byte blocks, which
221 // is our maximum and an exact fit for two SSE registers.
2228: mov ecx, [ebp + nr] // number of rounds
223 mov ebx, [esp + 24] // block size (in words)
224 mov edx, ecx
225 imul edx, ebx
226 lea edi, [ebp + wi]
227 lea esi, [ebp + 4*edx + w] // last round's keys
228 shl ebx, 2 // block size (in bytes now)
229
230 // Copy the last encryption round's keys.
231 movdqu xmm0, [esi]
232 movdqu [edi], xmm0
233 cmp ebx, 16
234 jbe 9f
235 movdqu xmm0, [esi + 16]
236 movdqu [edi + 16], xmm0
237
238 // Update the loop variables and stop if we've finished. EDI walks
// forwards through `wi' while ESI walks backwards through `w'.
2399: add edi, ebx
240 sub esi, ebx
241 sub ecx, 1
242 jbe 0f // no middle rounds left
243
244 // Do another middle round's keys...
245 movdqu xmm0, [esi]
246 aesimc xmm0, xmm0
247 movdqu [edi], xmm0
248 cmp ebx, 16
249 jbe 9b
250 movdqu xmm0, [esi + 16]
251 aesimc xmm0, xmm0
252 movdqu [edi + 16], xmm0
253 jmp 9b
254
255 // Finally do the first encryption round.
2560: movdqu xmm0, [esi]
257 movdqu [edi], xmm0
258 cmp ebx, 16
259 jbe 0f
260 movdqu xmm0, [esi + 16]
261 movdqu [edi + 16], xmm0
262
263 // If the block size is not exactly four words then we must end-swap
264 // everything. We can use fancy SSE toys for this.
2650: cmp ebx, 16
266 je 0f
267
268 // Find the byte-reordering table.
269 ldgot ecx
8d6ca554 270 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
1a0c09c4
MW
271
272 // Calculate the number of subkey words again. (It's a good job
273 // we've got a fast multiplier.)
274 mov ecx, [ebp + nr]
275 add ecx, 1
276 imul ecx, [esp + 24] // total keys in words
277
278 // End-swap the encryption keys.
279 mov eax, ecx // keep the count: endswap_block uses up ECX
280 lea esi, [ebp + w]
281 call endswap_block
282
283 // And the decryption keys.
284 mov ecx, eax
285 lea esi, [ebp + wi]
286 call endswap_block
287
288 // All done.
2890: pop edi
290 pop esi
291 pop ebx
292 pop ebp
293 ret
294
295 .align 16
296endswap_block:
297 // End-swap ECX words starting at ESI. The end-swapping table is
8d6ca554 298 // already loaded into XMM5; and it's OK to work in 16-byte chunks.
// A trailing partial chunk is still processed as a whole 16 bytes.
// Clobbers ESI, ECX and XMM1.
1a0c09c4 299 movdqu xmm1, [esi]
8d6ca554 300 pshufb xmm1, xmm5
1a0c09c4
MW
301 movdqu [esi], xmm1
302 add esi, 16
303 sub ecx, 4
304 ja endswap_block // loop while words remain
305 ret
306
307ENDFUNC
308
309///--------------------------------------------------------------------------
310/// Encrypting and decrypting blocks.
311
e297526c
MW
// Define a single-block cipher function named `rijndael_OP_x86_aesni'.
// Macro arguments:
//   op   -- the middle part of the function's name;
//   aes  -- mnemonic of the AESNI round instruction to use; the final
//           round uses the same mnemonic with `last' appended;
//   koff -- offset within the context of the round keys to use.
// The generated function processes exactly one 16-byte block.
312 .macro encdec op, aes, koff
313FUNC(rijndael_\op\()_x86_aesni)
1a0c09c4
MW
314
315 // On entry, we have:
316 // [esp + 4] points to the context block
317 // [esp + 8] points to the input data block
318 // [esp + 12] points to the output buffer
319
320 // Find the magic endianness-swapping table.
321 ldgot ecx
8d6ca554 322 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
1a0c09c4
MW
323
324 // Load the input block and end-swap it. Also, start loading the
325 // keys.
326 mov eax, [esp + 8]
327 movdqu xmm0, [eax]
8d6ca554 328 pshufb xmm0, xmm5
1a0c09c4 329 mov eax, [esp + 4]
e297526c 330 lea edx, [eax + \koff] // edx = first round key
1a0c09c4
MW
331 mov eax, [eax + nr] // eax = number of rounds
332
333 // Initial whitening.
334 movdqu xmm1, [edx]
335 add edx, 16
336 pxor xmm0, xmm1
337
338 // Dispatch to the correct code. Only round counts from 10 to 14
// inclusive are handled; anything else goes to `bogus'. Each
// `N rounds' label below is the entry point used when N rounds are
// still to be done. The entries for 14 down to 11 advance EDX as
// they go, so that from the 10-round entry onwards the keys can be
// fetched at fixed offsets from EDX.
339 cmp eax, 10
e297526c 340 je 10f
1a0c09c4
MW
341 jb bogus
342 cmp eax, 14
e297526c 343 je 14f
1a0c09c4
MW
344 ja bogus
345 cmp eax, 12
e297526c
MW
346 je 12f
347 jb 11f
348 jmp 13f
1a0c09c4
MW
349
350 .align 2
351
352 // 14 rounds...
e297526c 35314: movdqu xmm1, [edx]
1a0c09c4 354 add edx, 16
e297526c 355 \aes xmm0, xmm1
1a0c09c4
MW
356
357 // 13 rounds...
e297526c 35813: movdqu xmm1, [edx]
1a0c09c4 359 add edx, 16
e297526c 360 \aes xmm0, xmm1
1a0c09c4
MW
361
362 // 12 rounds...
e297526c 36312: movdqu xmm1, [edx]
1a0c09c4 364 add edx, 16
e297526c 365 \aes xmm0, xmm1
1a0c09c4
MW
366
367 // 11 rounds...
e297526c 36811: movdqu xmm1, [edx]
1a0c09c4 369 add edx, 16
e297526c 370 \aes xmm0, xmm1
1a0c09c4
MW
371
372 // 10 rounds...
e297526c
MW
37310: movdqu xmm1, [edx]
374 \aes xmm0, xmm1
1a0c09c4
MW
375
376 // 9 rounds...
377 movdqu xmm1, [edx + 16]
e297526c 378 \aes xmm0, xmm1
1a0c09c4
MW
379
380 // 8 rounds...
381 movdqu xmm1, [edx + 32]
e297526c 382 \aes xmm0, xmm1
1a0c09c4
MW
383
384 // 7 rounds...
385 movdqu xmm1, [edx + 48]
e297526c 386 \aes xmm0, xmm1
1a0c09c4
MW
387
388 // 6 rounds...
389 movdqu xmm1, [edx + 64]
e297526c 390 \aes xmm0, xmm1
1a0c09c4
MW
391
392 // 5 rounds...
393 movdqu xmm1, [edx + 80]
e297526c 394 \aes xmm0, xmm1
1a0c09c4
MW
395
396 // 4 rounds...
397 movdqu xmm1, [edx + 96]
e297526c 398 \aes xmm0, xmm1
1a0c09c4
MW
399
400 // 3 rounds...
401 movdqu xmm1, [edx + 112]
e297526c 402 \aes xmm0, xmm1
1a0c09c4
MW
403
404 // 2 rounds...
405 movdqu xmm1, [edx + 128]
e297526c 406 \aes xmm0, xmm1
1a0c09c4
MW
407
408 // Final round...
409 movdqu xmm1, [edx + 144]
e297526c 410 \aes\()last xmm0, xmm1
1a0c09c4
MW
411
412 // Unpermute the output block and store it.
8d6ca554 413 pshufb xmm0, xmm5
1a0c09c4
MW
414 mov eax, [esp + 12]
415 movdqu [eax], xmm0
416
417 // And we're done.
418 ret
419
420ENDFUNC
e297526c 421 .endm
1a0c09c4 422
e297526c
MW
// Instantiate the two block functions: `rijndael_eblk_x86_aesni' runs
// AESENC/AESENCLAST over the `w' keys, and `rijndael_dblk_x86_aesni' runs
// AESDEC/AESDECLAST over the `wi' keys.
423 encdec eblk, aesenc, w
424 encdec dblk, aesdec, wi
1a0c09c4
MW
425
426///--------------------------------------------------------------------------
427/// Random utilities.
428
429 .align 16
430 // Abort the process because of a programming error. Indirecting
431 // through this point serves several purposes: (a) by CALLing, rather
432 // than branching to, `abort', we can save the return address, which
433 // might at least provide a hint as to what went wrong; (b) we don't
434 // have conditional CALLs (and they'd be big anyway); and (c) we can
435 // write a HLT here as a backstop against `abort' being mad.
//
// The block functions branch here when the context's round count is
// less than 10 or greater than 14.
436bogus: callext F(abort)
4370: hlt
438 jmp 0b
439
// NOTE(review): `gotaux' is defined outside this file; presumably it
// emits the support code which the `ldgot ecx' uses above rely on --
// confirm against asm-common.h.
440 gotaux ecx
441
442///--------------------------------------------------------------------------
443/// Data tables.
444
// PSHUFB control mask which reverses the order of the bytes within each
// of the four 32-bit words of an SSE register. It is fetched with MOVDQA,
// hence the 16-byte alignment.
445 .align 16
446endswap_tab:
447 .byte 3, 2, 1, 0
448 .byte 7, 6, 5, 4
449 .byte 11, 10, 9, 8
450 .byte 15, 14, 13, 12
451
452///----- That's all, folks --------------------------------------------------