Preprocess the assembler files.
[catacomb] / symm / rijndael-x86-aesni.S
CommitLineData
1a0c09c4
MW
1/// -*- mode: asm; asm-comment-char: ?/ -*-
2///
3/// AESNI-based implementation of Rijndael
4///
5/// (c) 2015 Straylight/Edgeware
6///
7
8///----- Licensing notice ---------------------------------------------------
9///
10/// This file is part of Catacomb.
11///
12/// Catacomb is free software; you can redistribute it and/or modify
13/// it under the terms of the GNU Library General Public License as
14/// published by the Free Software Foundation; either version 2 of the
15/// License, or (at your option) any later version.
16///
17/// Catacomb is distributed in the hope that it will be useful,
18/// but WITHOUT ANY WARRANTY; without even the implied warranty of
19/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20/// GNU Library General Public License for more details.
21///
22/// You should have received a copy of the GNU Library General Public
23/// License along with Catacomb; if not, write to the Free
24/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25/// MA 02111-1307, USA.
26
27///--------------------------------------------------------------------------
28/// Preliminaries.
29
30#include "config.h"
31#include "asm-common.h"
32
///--------------------------------------------------------------------------
/// External definitions.

	// Symbols which are defined elsewhere and referred to from this
	// file: the abort routine and the round-constant table.
	.globl	F(abort)
	.globl	F(rijndael_rcon)

///--------------------------------------------------------------------------
/// Main code.

	// Let the assembler accept the AES instructions, and put what
	// follows in the text section.
	.arch	.aes
	.text
44
45/// The AESNI instructions implement a little-endian version of AES, but
46/// Catacomb's internal interface presents as big-endian so as to work better
47/// with things like GCM. We therefore maintain the round keys in
48/// little-endian form, and have to end-swap blocks in and out.
49///
50/// For added amusement, the AESNI instructions don't implement the
51/// larger-block versions of Rijndael, so we have to end-swap the keys if
52/// we're preparing for one of those.
53
// Sizing limits for the key-schedule buffers.
	.equ	maxrounds, 16		// largest supported round count
	.equ	maxblksz, 32		// largest supported block, in bytes
	.equ	kbufsz, (maxrounds + 1)*maxblksz // bytes in one key buffer

	// Byte offsets of the fields of the context structure.
	.equ	nr, 0			// round count
	.equ	w, nr + 4		// round keys for encryption
	.equ	wi, w + kbufsz		// round keys for decryption
63
64///--------------------------------------------------------------------------
65/// Key setup.
66
FUNC(rijndael_setup_x86_aesni)

	// Arguments, at the offsets they have once the four registers below
	// have been pushed:
	//
	//	[esp + 20]	context pointer
	//	[esp + 24]	block size, in 32-bit words (4, 6, or 8)
	//	[esp + 28]	key material (may be unaligned)
	//	[esp + 32]	key size, in words
	//
	// The caller has already checked the key size and stored the number
	// of rounds in the context.  All that remains is to fill in the
	// encryption (w) and decryption (wi) round-key vectors.

	push	ebp
	push	ebx
	push	esi
	push	edi

	// The first key-size words of the schedule are the key itself, so
	// copy them straight into the start of the w vector.
	mov	ebp, [esp + 20]		// ebp = context, kept throughout
	mov	esi, [esp + 28]		// source: caller key material
	mov	ebx, [esp + 32]		// ebx = key size, in words
	lea	edi, [ebp + w]
	mov	ecx, ebx
	rep movsd

	// Number of words still to be generated: (rounds + 1) blocks of
	// round key, less the key words already in place.
	mov	edx, [ebp + nr]
	inc	edx
	imul	edx, [esp + 24]
	sub	edx, ebx

	// Expansion state:
	//
	//	eax	most recently produced key word
	//	ebx	key size, in words
	//	ecx	address of the next round-constant byte
	//	edx	value of esi at which the schedule is complete
	//	esi	cursor, one key length behind the word being written
	lea	esi, [ebp + w]
	mov	eax, [esi + 4*ebx - 4]
	lea	edx, [esi + 4*edx]
	ldgot	ecx
	leaext	ecx, rijndael_rcon, ecx

	// Each pass through the loop below produces one key-length chunk of
	// the schedule, and the first word of every chunk is the awkward
	// one.
	//
	// AESKEYGENASSIST is a poor fit for this job.  It works on a whole
	// SSE register even though the data dependencies stop us doing more
	// than one operation at a time; it ignores two of its four input
	// words and produces two different results from each of the other
	// two; and it wants the round constant as an immediate.  So pass a
	// zero immediate, shuffle the single interesting word into and out
	// of the right lane, and XOR in the round constant by hand.
kx_first:
	movd	xmm0, eax
	pshufd	xmm0, xmm0, 0x39	// lane 0 -> lane 3
	aeskeygenassist xmm1, xmm0, 0	// lane 3: substituted and rotated
	pshufd	xmm1, xmm1, 0x93	// lane 3 -> lane 0
	movd	eax, xmm1
	xor	eax, [esi]
	xor	al, [ecx]		// round constant goes in the low byte
	mov	[esi + 4*ebx], eax
	inc	ecx
	add	esi, 4
	cmp	esi, edx
	jae	dk_setup

	// Words 1, 2 and 3 of the chunk are plain XORs with the word one
	// key length back.
	.rept	3
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	dk_setup
	.endr

	// Word 4 exists only for keys of five words or more.  If the key is
	// longer than six words then the previous word is substituted (but
	// not rotated, and with no round constant) before being mixed in.
	cmp	ebx, 5
	jb	kx_first
	cmp	ebx, 7
	jb	kx_word4
	movd	xmm0, eax
	pshufd	xmm0, xmm0, 0x93	// lane 0 -> lane 1
	aeskeygenassist xmm1, xmm0, 0	// lane 0: substituted lane 1
	movd	eax, xmm1
kx_word4:
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	dk_setup

	// Words 5, 6 and 7 are plain XORs again; word N exists only if the
	// key has at least N + 1 words.
	.irp	minkw, 6, 7, 8
	cmp	ebx, \minkw
	jb	kx_first
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	dk_setup
	.endr

	// That was the longest possible chunk: start the next one.
	jmp	kx_first

	// Now build the decryption keys.  They are the encryption round
	// keys in reverse order, with every key other than the first and
	// the last passed through AESIMC.
	//
	// So: copy the final encryption round key, then transform the
	// middle ones walking backwards, then copy the first.  The work is
	// done sixteen bytes at a time in SSE registers.  Given the order
	// of traversal it does not matter if a transfer covers more than
	// one block, and the context buffers are sized for 32-byte blocks
	// (exactly two SSE registers), so there is room for the excess.
	//
	//	ebx	block size, in bytes
	//	ecx	rounds still to be handled
	//	esi	current encryption round key (moving down)
	//	edi	current decryption round key (moving up)
dk_setup:
	mov	ebx, [esp + 24]		// block size, in words for now
	mov	ecx, [ebp + nr]
	mov	edx, ebx
	imul	edx, ecx		// word index of the last round key
	lea	esi, [ebp + 4*edx + w]
	lea	edi, [ebp + wi]
	shl	ebx, 2			// words -> bytes

	// The last encryption round key is copied unchanged.
	movdqu	xmm0, [esi]
	movdqu	[edi], xmm0
	cmp	ebx, 16
	jbe	dk_step
	movdqu	xmm0, [esi + 16]
	movdqu	[edi + 16], xmm0

	// Step to the next pair of keys; leave once only the first
	// encryption round key remains.
dk_step:
	add	edi, ebx
	sub	esi, ebx
	sub	ecx, 1
	jbe	dk_first

	// A middle round key: transform it on the way across.
	movdqu	xmm0, [esi]
	aesimc	xmm0, xmm0
	movdqu	[edi], xmm0
	cmp	ebx, 16
	jbe	dk_step
	movdqu	xmm0, [esi + 16]
	aesimc	xmm0, xmm0
	movdqu	[edi + 16], xmm0
	jmp	dk_step

	// The first encryption round key is also copied unchanged.
dk_first:
	movdqu	xmm0, [esi]
	movdqu	[edi], xmm0
	cmp	ebx, 16
	jbe	sw_check
	movdqu	xmm0, [esi + 16]
	movdqu	[edi + 16], xmm0

	// Unless the block is exactly four words (sixteen bytes) long,
	// every word of both schedules must be end-swapped.
sw_check:
	cmp	ebx, 16
	je	setup_done

	// Load the byte-reordering mask where the helper expects it.
	ldgot	ecx
	movdqa	xmm7, [INTADDR(endswap_tab, ecx)]

	// Count the round-key words afresh; keep the total in eax, which
	// the helper leaves alone, so that it can be used twice.
	mov	eax, [ebp + nr]
	inc	eax
	imul	eax, [esp + 24]

	// Swap the encryption keys, then the decryption keys.
	mov	ecx, eax
	lea	esi, [ebp + w]
	call	endswap_block
	mov	ecx, eax
	lea	esi, [ebp + wi]
	call	endswap_block

	// Restore the saved registers and return.
setup_done:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

	.align	16
endswap_block:
	// Reverse the bytes within each of ecx 32-bit words starting at
	// esi.  The shuffle mask must already be in xmm7.  Data is handled
	// four words at a time, so a count which is not a multiple of four
	// is rounded up.  Modifies ecx, esi, xmm1 and the flags.
	movdqu	xmm1, [esi]
	pshufb	xmm1, xmm7
	movdqu	[esi], xmm1
	sub	ecx, 4
	lea	esi, [esi + 16]		// lea: keep the flags from the sub
	ja	endswap_block
	ret

ENDFUNC
300
301///--------------------------------------------------------------------------
302/// Encrypting and decrypting blocks.
303
FUNC(rijndael_eblk_x86_aesni)

	// Stack arguments on entry:
	//
	//	[esp +  4]	context block
	//	[esp +  8]	input data block
	//	[esp + 12]	output buffer
	//
	// Register use: xmm0 is the cipher state, xmm1 the current round
	// key, xmm7 the byte-swapping mask, and edx walks the encryption
	// round keys.

	// Fetch the byte-swapping mask.
	ldgot	ecx
	movdqa	xmm7, [INTADDR(endswap_tab, ecx)]

	// Load and end-swap the input block, and collect the round count.
	mov	edx, [esp + 4]		// edx = context
	mov	eax, [esp + 8]
	movdqu	xmm0, [eax]
	mov	eax, [edx + nr]		// eax = number of rounds
	pshufb	xmm0, xmm7

	// Initial whitening: mix in round key 0, and leave edx pointing at
	// round key 1.
	movdqu	xmm1, [edx + w]
	lea	edx, [edx + w + 16]
	pxor	xmm0, xmm1

	// Enter the unrolled rounds at the point matching the round count.
	// Only counts from 10 to 14 inclusive are acceptable; anything else
	// is a programming error.
	cmp	eax, 12
	je	er12
	ja	1f
	cmp	eax, 10			// fewer than 12 rounds
	je	er10
	jb	bogus
	jmp	er11
1:	cmp	eax, 14			// more than 12 rounds
	je	er14
	ja	bogus
	jmp	er13

	.align	2

	// Extra leading rounds for the longer schedules.  Each consumes one
	// round key and advances edx, so that the shared tail below can use
	// fixed offsets.

	// Entry for 14 rounds.
er14:	movdqu	xmm1, [edx]
	add	edx, 16
	aesenc	xmm0, xmm1

	// Entry for 13 rounds.
er13:	movdqu	xmm1, [edx]
	add	edx, 16
	aesenc	xmm0, xmm1

	// Entry for 12 rounds.
er12:	movdqu	xmm1, [edx]
	add	edx, 16
	aesenc	xmm0, xmm1

	// Entry for 11 rounds.
er11:	movdqu	xmm1, [edx]
	add	edx, 16
	aesenc	xmm0, xmm1

	// Entry for 10 rounds: the tail shared by every schedule.  First
	// come nine ordinary rounds...
er10:	movdqu	xmm1, [edx]
	aesenc	xmm0, xmm1
	.irp	rkoff, 16, 32, 48, 64, 80, 96, 112, 128
	movdqu	xmm1, [edx + \rkoff]
	aesenc	xmm0, xmm1
	.endr

	// ... and then the final round.
	movdqu	xmm1, [edx + 144]
	aesenclast xmm0, xmm1

	// End-swap the ciphertext block back again and store it.
	mov	eax, [esp + 12]
	pshufb	xmm0, xmm7
	movdqu	[eax], xmm0

	// Finished.
	ret

ENDFUNC
412
FUNC(rijndael_dblk_x86_aesni)

	// Stack arguments on entry:
	//
	//	[esp +  4]	context block
	//	[esp +  8]	input data block
	//	[esp + 12]	output buffer
	//
	// Register use: xmm0 is the cipher state, xmm1 the current round
	// key, xmm7 the byte-swapping mask, and edx walks the decryption
	// round keys.

	// Fetch the byte-swapping mask.
	ldgot	ecx
	movdqa	xmm7, [INTADDR(endswap_tab, ecx)]

	// Load and end-swap the input block, and collect the round count.
	mov	edx, [esp + 4]		// edx = context
	mov	eax, [esp + 8]
	movdqu	xmm0, [eax]
	mov	eax, [edx + nr]		// eax = number of rounds
	pshufb	xmm0, xmm7

	// Initial whitening: mix in decryption round key 0, and leave edx
	// pointing at the one after it.
	movdqu	xmm1, [edx + wi]
	lea	edx, [edx + wi + 16]
	pxor	xmm0, xmm1

	// Enter the unrolled rounds at the point matching the round count.
	// Only counts from 10 to 14 inclusive are acceptable; anything else
	// is a programming error.
	cmp	eax, 12
	je	dr12
	ja	1f
	cmp	eax, 10			// fewer than 12 rounds
	je	dr10
	jb	bogus
	jmp	dr11
1:	cmp	eax, 14			// more than 12 rounds
	je	dr14
	ja	bogus
	jmp	dr13

	.align	2

	// Extra leading rounds for the longer schedules.  Each consumes one
	// round key and advances edx, so that the shared tail below can use
	// fixed offsets.

	// Entry for 14 rounds.
dr14:	movdqu	xmm1, [edx]
	add	edx, 16
	aesdec	xmm0, xmm1

	// Entry for 13 rounds.
dr13:	movdqu	xmm1, [edx]
	add	edx, 16
	aesdec	xmm0, xmm1

	// Entry for 12 rounds.
dr12:	movdqu	xmm1, [edx]
	add	edx, 16
	aesdec	xmm0, xmm1

	// Entry for 11 rounds.
dr11:	movdqu	xmm1, [edx]
	add	edx, 16
	aesdec	xmm0, xmm1

	// Entry for 10 rounds: the tail shared by every schedule.  First
	// come nine ordinary rounds...
dr10:	movdqu	xmm1, [edx]
	aesdec	xmm0, xmm1
	.irp	rkoff, 16, 32, 48, 64, 80, 96, 112, 128
	movdqu	xmm1, [edx + \rkoff]
	aesdec	xmm0, xmm1
	.endr

	// ... and then the final round.
	movdqu	xmm1, [edx + 144]
	aesdeclast xmm0, xmm1

	// End-swap the plaintext block back again and store it.
	mov	eax, [esp + 12]
	pshufb	xmm0, xmm7
	movdqu	[eax], xmm0

	// Finished.
	ret

ENDFUNC
521
///--------------------------------------------------------------------------
/// Random utilities.

	.align	16
	// Landing point for programming errors: the block functions branch
	// here when the context holds a round count they cannot handle.
	// Going through a shared trampoline has three benefits: (a) abort
	// is reached by a CALL rather than a branch, so a return address is
	// saved which may hint at what went wrong; (b) there is no
	// conditional CALL instruction (and it would be large anyway), so
	// the callers use an ordinary conditional branch to get here; and
	// (c) the HLT loop below is a backstop in case abort comes back.
bogus:	callext	F(abort)
9:	hlt
	jmp	9b

	// Support code for the ldgot uses above, which all name ecx.
	gotaux	ecx
537
///--------------------------------------------------------------------------
/// Data tables.

	// Byte-shuffle mask which reverses the order of the bytes within
	// each of the four 32-bit words of an SSE register.  It is loaded
	// with an aligned move, so it must stay on a 16-byte boundary.
	.align	16
endswap_tab:
	.byte	 3,  2,  1,  0,  7,  6,  5,  4
	.byte	11, 10,  9,  8, 15, 14, 13, 12
547
548///----- That's all, folks --------------------------------------------------