symm/rijndael-x86-aesni.S: Use xmm5 instead of xmm7.
[catacomb] / symm / rijndael-x86-aesni.S
CommitLineData
1a0c09c4
MW
1/// -*- mode: asm; asm-comment-char: ?/ -*-
2///
3/// AESNI-based implementation of Rijndael
4///
5/// (c) 2015 Straylight/Edgeware
6///
7
8///----- Licensing notice ---------------------------------------------------
9///
10/// This file is part of Catacomb.
11///
12/// Catacomb is free software; you can redistribute it and/or modify
13/// it under the terms of the GNU Library General Public License as
14/// published by the Free Software Foundation; either version 2 of the
15/// License, or (at your option) any later version.
16///
17/// Catacomb is distributed in the hope that it will be useful,
18/// but WITHOUT ANY WARRANTY; without even the implied warranty of
19/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20/// GNU Library General Public License for more details.
21///
22/// You should have received a copy of the GNU Library General Public
23/// License along with Catacomb; if not, write to the Free
24/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25/// MA 02111-1307, USA.
26
27///--------------------------------------------------------------------------
28/// Preliminaries.
29
30#include "config.h"
31#include "asm-common.h"
32
33///--------------------------------------------------------------------------
34/// External definitions.
35
36 .globl F(abort)
37 .globl F(rijndael_rcon)
38
39///--------------------------------------------------------------------------
47103664
MW
40/// Local utilities.
41
42// Magic constants for shuffling. These are used as `pshufd' immediates:
// each two-bit field, lowest first, names the source word which is copied
// into the corresponding destination word.
//
// ROTL: destination words 0, 1, 2, 3 come from source words 3, 0, 1, 2.
43#define ROTL 0x93
// ROT2: destination words 0, 1, 2, 3 come from source words 2, 3, 0, 1.
44#define ROT2 0x4e
// ROTR: destination words 0, 1, 2, 3 come from source words 1, 2, 3, 0.
45#define ROTR 0x39
46
47///--------------------------------------------------------------------------
1a0c09c4
MW
48/// Main code.
49
50 .arch .aes
51 .section .text
52
53/// The AESNI instructions implement a little-endian version of AES, but
54/// Catacomb's internal interface presents as big-endian so as to work better
55/// with things like GCM. We therefore maintain the round keys in
56/// little-endian form, and have to end-swap blocks in and out.
57///
58/// For added amusement, the AESNI instructions don't implement the
59/// larger-block versions of Rijndael, so we have to end-swap the keys if
60/// we're preparing for one of those.
61
62 // Useful constants. A key-schedule buffer has room for one
 // maximum-size subkey per round, plus one more: 32*(16 + 1) = 544
 // bytes.
63 .equ maxrounds, 16 // maximum number of rounds
64 .equ maxblksz, 32 // maximum block size, in bytes
65 .equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
66
67 // Context structure. These are byte offsets from the context
 // pointer: the round count is a 32-bit word at offset 0, followed
 // by the two key-schedule buffers (`w' at 4, `wi' at 4 + kbufsz).
68 .equ nr, 0 // number of rounds
69 .equ w, nr + 4 // encryption key words
70 .equ wi, w + kbufsz // decryption key words
71
72///--------------------------------------------------------------------------
73/// Key setup.
74
75FUNC(rijndael_setup_x86_aesni)
76
77 // Initial state. We have four arguments:
78 // [esp + 20] is the context pointer
79 // [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
80 // [esp + 28] points to the key material, unaligned
81 // [esp + 32] is the size of the key, in words
 // (These offsets are as seen once the four registers below have been
 // pushed, which is how the code addresses the arguments.)
82 // The key size has already been checked for validity, and the number
83 // of rounds has been computed. Our job is only to fill in the `w'
84 // and `wi' vectors.
85
 // Save the registers we use as working storage; they're restored
 // just before we return.
86 push ebp
87 push ebx
88 push esi
89 push edi
90
91 // The initial round key material is taken directly from the input
92 // key, so copy it over.
93 mov ebp, [esp + 20] // context base pointer
94 mov ebx, [esp + 32] // key size, in words
95 mov ecx, ebx // copy count, in words
96 mov esi, [esp + 28] // source: the key material
97 lea edi, [ebp + w] // destination: the start of `w'
98 rep movsd // copy ECX words from [ESI] to [EDI]
99
100 // Find out other useful things.
101 mov edx, [ebp + nr] // number of rounds
102 add edx, 1
103 imul edx, [esp + 24] // total key size in words
104 sub edx, ebx // offset by the key size
105
106 // Find the round constants. (`ldgot' and `leaext' are macros from
 // `asm-common.h'; afterwards, ECX is used as a pointer which steps
 // through the round-constant bytes.)
107 ldgot ecx
108 leaext ecx, rijndael_rcon, ecx
109
110 // Prepare for the main loop. ESI trails the word being written by
 // exactly one key length: each new word is stored at [ESI + 4*EBX].
111 lea esi, [ebp + w]
112 mov eax, [esi + 4*ebx - 4] // most recent key word
113 lea edx, [esi + 4*edx] // limit, offset by one key expansion
114
115 // Main key expansion loop. The first word of each key-length chunk
116 // needs special treatment.
117 //
118 // This is rather tedious because the Intel `AESKEYGENASSIST'
119 // instruction is very strangely shaped. Firstly, it wants to
120 // operate on vast SSE registers, even though we're data-blocked from
121 // doing more than one operation at a time unless we're doing two key
122 // schedules simultaneously -- and even then we can't do more than
123 // two, because the instruction ignores two of its input words
124 // entirely, and produces two different outputs for each of the other
125 // two. And secondly it insists on taking the magic round constant
126 // as an immediate, so it's kind of annoying if you're not
127 // open-coding the whole thing. It's much easier to leave that as
128 // zero and XOR in the round constant by hand.
 //
 // Here we want the rotated and substituted form of the previous
 // word, which the instruction delivers in word 3 of its output,
 // computed from word 3 of its input.
1299: movd xmm0, eax // word 0 = EAX; other words zero
47103664 130 pshufd xmm0, xmm0, ROTR // move it up into word 3
1a0c09c4 131 aeskeygenassist xmm1, xmm0, 0 // round constant left as zero
47103664 132 pshufd xmm1, xmm1, ROTL // bring word 3 down to word 0
1a0c09c4
MW
133 movd eax, xmm1
134 xor eax, [esi] // mix with the word one key length back
135 xor al, [ecx] // XOR in the round constant by hand
136 inc ecx // step on to the next round constant
137 mov [esi + 4*ebx], eax // store the new key word
138 add esi, 4
139 cmp esi, edx
140 jae 8f // stop once the schedule is full
141
142 // The next three words are simple...
143 xor eax, [esi]
144 mov [esi + 4*ebx], eax
145 add esi, 4
146 cmp esi, edx
147 jae 8f
148
149 // (Word 2...)
150 xor eax, [esi]
151 mov [esi + 4*ebx], eax
152 add esi, 4
153 cmp esi, edx
154 jae 8f
155
156 // (Word 3...)
157 xor eax, [esi]
158 mov [esi + 4*ebx], eax
159 add esi, 4
160 cmp esi, edx
161 jae 8f
162
163 // Word 4. If the key is /more/ than 6 words long, then we must
164 // apply a substitution here.
165 cmp ebx, 5
166 jb 9b // four-word key: start the next chunk
167 cmp ebx, 7
168 jb 0f // five or six words: no substitution
 // This time we want substitution without rotation, which the
 // instruction delivers in word 0 of its output, computed from word
 // 1 of its input.
169 movd xmm0, eax
47103664 170 pshufd xmm0, xmm0, ROTL // move the word up into word 1
1a0c09c4
MW
171 aeskeygenassist xmm1, xmm0, 0
172 movd eax, xmm1
1730: xor eax, [esi]
174 mov [esi + 4*ebx], eax
175 add esi, 4
176 cmp esi, edx
177 jae 8f
178
179 // (Word 5...)
180 cmp ebx, 6
181 jb 9b // five-word key: start the next chunk
182 xor eax, [esi]
183 mov [esi + 4*ebx], eax
184 add esi, 4
185 cmp esi, edx
186 jae 8f
187
188 // (Word 6...)
189 cmp ebx, 7
190 jb 9b // six-word key: start the next chunk
191 xor eax, [esi]
192 mov [esi + 4*ebx], eax
193 add esi, 4
194 cmp esi, edx
195 jae 8f
196
197 // (Word 7...)
198 cmp ebx, 8
199 jb 9b // seven-word key: start the next chunk
200 xor eax, [esi]
201 mov [esi + 4*ebx], eax
202 add esi, 4
203 cmp esi, edx
204 jae 8f
205
206 // Must be done by now.
207 jmp 9b
208
209 // Next job is to construct the decryption keys. The keys for the
210 // first and last rounds don't need to be mangled, but the remaining
211 // ones do -- and they all need to be reordered too.
212 //
213 // The plan of action, then, is to copy the final encryption round's
214 // keys into place first, then to do each of the intermediate rounds
215 // in reverse order, and finally do the first round.
216 //
217 // Do all of the heavy lifting with SSE registers. The order we're
218 // doing this in means that it's OK if we read or write too much, and
219 // there's easily enough buffer space for the over-enthusiastic reads
220 // and writes because the context has space for 32-byte blocks, which
221 // is our maximum and an exact fit for two SSE registers.
2228: mov ecx, [ebp + nr] // number of rounds
223 mov ebx, [esp + 24] // block size (in words)
224 mov edx, ecx
225 imul edx, ebx // word offset of the last round's keys
226 lea edi, [ebp + wi] // EDI walks forwards through `wi'
227 lea esi, [ebp + 4*edx + w] // last round's keys
228 shl ebx, 2 // block size (in bytes now)
229
230 // Copy the last encryption round's keys.
231 movdqu xmm0, [esi]
232 movdqu [edi], xmm0
233 cmp ebx, 16
234 jbe 9f // 16-byte block: one register is enough
235 movdqu xmm0, [esi + 16]
236 movdqu [edi + 16], xmm0
237
238 // Update the loop variables and stop if we've finished.
2399: add edi, ebx
240 sub esi, ebx // ESI walks backwards through `w'
241 sub ecx, 1
242 jbe 0f // only the first round's keys remain
243
244 // Do another middle round's keys...
245 movdqu xmm0, [esi]
246 aesimc xmm0, xmm0 // mangle the keys for use in decryption
247 movdqu [edi], xmm0
248 cmp ebx, 16
249 jbe 9b
250 movdqu xmm0, [esi + 16]
251 aesimc xmm0, xmm0
252 movdqu [edi + 16], xmm0
253 jmp 9b
254
255 // Finally do the first encryption round.
2560: movdqu xmm0, [esi]
257 movdqu [edi], xmm0
258 cmp ebx, 16
259 jbe 0f
260 movdqu xmm0, [esi + 16]
261 movdqu [edi + 16], xmm0
262
263 // If the block size is not exactly four words then we must end-swap
264 // everything. We can use fancy SSE toys for this.
2650: cmp ebx, 16
266 je 0f // four-word blocks: nothing more to do
267
268 // Find the byte-reordering table. (It's 16-byte aligned, so an
 // aligned load is fine.)
269 ldgot ecx
8d6ca554 270 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
1a0c09c4
MW
271
272 // Calculate the number of subkey words again. (It's a good job
273 // we've got a fast multiplier.)
274 mov ecx, [ebp + nr]
275 add ecx, 1
276 imul ecx, [esp + 24] // total keys in words
277
278 // End-swap the encryption keys. Keep a copy of the word count in
 // EAX, because `endswap_block' counts ECX down as it goes.
279 mov eax, ecx
280 lea esi, [ebp + w]
281 call endswap_block
282
283 // And the decryption keys.
284 mov ecx, eax
285 lea esi, [ebp + wi]
286 call endswap_block
287
288 // All done. Restore the saved registers, in reverse order.
2890: pop edi
290 pop esi
291 pop ebx
292 pop ebp
293 ret
294
295 .align 16
296endswap_block:
297 // End-swap ECX words starting at ESI. The end-swapping table is
8d6ca554 298 // already loaded into XMM5; and it's OK to work in 16-byte chunks.
 // Advances ESI, counts ECX down, and uses XMM1 as scratch. The loop
 // stops once the count reaches zero or would go below it, so a
 // final partial chunk is still processed as a whole 16 bytes.
1a0c09c4 299 movdqu xmm1, [esi]
8d6ca554 300 pshufb xmm1, xmm5
1a0c09c4
MW
301 movdqu [esi], xmm1
302 add esi, 16
303 sub ecx, 4 // four words per chunk
304 ja endswap_block
305 ret
306
307ENDFUNC
308
309///--------------------------------------------------------------------------
310/// Encrypting and decrypting blocks.
311
312FUNC(rijndael_eblk_x86_aesni)
313
 // Encrypt a single 16-byte block using the encryption key words `w'
 // from the context.
 //
314 // On entry, we have:
315 // [esp + 4] points to the context block
316 // [esp + 8] points to the input data block
317 // [esp + 12] points to the output buffer
 //
 // NOTE(review): the round keys are read at a fixed 16-byte stride, so
 // this presumably serves only four-word blocks -- confirm against
 // the callers.
318
319 // Find the magic endianness-swapping table.
320 ldgot ecx
8d6ca554 321 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
1a0c09c4
MW
322
323 // Load the input block and end-swap it. Also, start loading the
324 // keys.
325 mov eax, [esp + 8]
326 movdqu xmm0, [eax]
8d6ca554 327 pshufb xmm0, xmm5
1a0c09c4
MW
328 mov eax, [esp + 4]
329 lea edx, [eax + w] // EDX -> current round key
330 mov eax, [eax + nr] // EAX = number of rounds
331
332 // Initial whitening.
333 movdqu xmm1, [edx]
334 add edx, 16
335 pxor xmm0, xmm1
336
337 // Dispatch to the correct code. The labels below are named for the
 // number of rounds still to do: the entry points for 11 to 14
 // rounds each do one round, step EDX on, and fall through. Any
 // round count outside 10 to 14 is a programming error.
338 cmp eax, 10
339 je er10
340 jb bogus // fewer than 10 rounds
341 cmp eax, 14
342 je er14
343 ja bogus // more than 14 rounds
344 cmp eax, 12
345 je er12
346 jb er11 // between 10 and 12: must be 11
347 jmp er13 // between 12 and 14: must be 13
348
349 .align 2
350
351 // 14 rounds...
352er14: movdqu xmm1, [edx]
353 add edx, 16
354 aesenc xmm0, xmm1
355
356 // 13 rounds...
357er13: movdqu xmm1, [edx]
358 add edx, 16
359 aesenc xmm0, xmm1
360
361 // 12 rounds...
362er12: movdqu xmm1, [edx]
363 add edx, 16
364 aesenc xmm0, xmm1
365
366 // 11 rounds...
367er11: movdqu xmm1, [edx]
368 add edx, 16
369 aesenc xmm0, xmm1
370
371 // 10 rounds... From here on EDX stays put, and the remaining round
 // keys are addressed at fixed offsets from it.
372er10: movdqu xmm1, [edx]
373 aesenc xmm0, xmm1
374
375 // 9 rounds...
376 movdqu xmm1, [edx + 16]
377 aesenc xmm0, xmm1
378
379 // 8 rounds...
380 movdqu xmm1, [edx + 32]
381 aesenc xmm0, xmm1
382
383 // 7 rounds...
384 movdqu xmm1, [edx + 48]
385 aesenc xmm0, xmm1
386
387 // 6 rounds...
388 movdqu xmm1, [edx + 64]
389 aesenc xmm0, xmm1
390
391 // 5 rounds...
392 movdqu xmm1, [edx + 80]
393 aesenc xmm0, xmm1
394
395 // 4 rounds...
396 movdqu xmm1, [edx + 96]
397 aesenc xmm0, xmm1
398
399 // 3 rounds...
400 movdqu xmm1, [edx + 112]
401 aesenc xmm0, xmm1
402
403 // 2 rounds...
404 movdqu xmm1, [edx + 128]
405 aesenc xmm0, xmm1
406
407 // Final round...
408 movdqu xmm1, [edx + 144]
409 aesenclast xmm0, xmm1
410
411 // Unpermute the ciphertext block and store it.
8d6ca554 412 pshufb xmm0, xmm5
1a0c09c4
MW
413 mov eax, [esp + 12]
414 movdqu [eax], xmm0
415
416 // And we're done.
417 ret
418
419ENDFUNC
420
421FUNC(rijndael_dblk_x86_aesni)
422
 // Decrypt a single 16-byte block using the decryption key words
 // `wi' from the context.
 //
423 // On entry, we have:
424 // [esp + 4] points to the context block
425 // [esp + 8] points to the input data block
426 // [esp + 12] points to the output buffer
 //
 // NOTE(review): the round keys are read at a fixed 16-byte stride, so
 // this presumably serves only four-word blocks -- confirm against
 // the callers.
427
428 // Find the magic endianness-swapping table.
429 ldgot ecx
8d6ca554 430 movdqa xmm5, [INTADDR(endswap_tab, ecx)]
1a0c09c4
MW
431
432 // Load the input block and end-swap it. Also, start loading the
433 // keys.
434 mov eax, [esp + 8]
435 movdqu xmm0, [eax]
8d6ca554 436 pshufb xmm0, xmm5
1a0c09c4
MW
437 mov eax, [esp + 4]
438 lea edx, [eax + wi] // EDX -> current round key
439 mov eax, [eax + nr] // EAX = number of rounds
440
441 // Initial whitening.
442 movdqu xmm1, [edx]
443 add edx, 16
444 pxor xmm0, xmm1
445
446 // Dispatch to the correct code. The labels below are named for the
 // number of rounds still to do: the entry points for 11 to 14
 // rounds each do one round, step EDX on, and fall through. Any
 // round count outside 10 to 14 is a programming error.
447 cmp eax, 10
448 je dr10
449 jb bogus // fewer than 10 rounds
450 cmp eax, 14
451 je dr14
452 ja bogus // more than 14 rounds
453 cmp eax, 12
454 je dr12
455 jb dr11 // between 10 and 12: must be 11
456 jmp dr13 // between 12 and 14: must be 13
457
458 .align 2
459
460 // 14 rounds...
461dr14: movdqu xmm1, [edx]
462 add edx, 16
463 aesdec xmm0, xmm1
464
465 // 13 rounds...
466dr13: movdqu xmm1, [edx]
467 add edx, 16
468 aesdec xmm0, xmm1
469
470 // 12 rounds...
471dr12: movdqu xmm1, [edx]
472 add edx, 16
473 aesdec xmm0, xmm1
474
475 // 11 rounds...
476dr11: movdqu xmm1, [edx]
477 add edx, 16
478 aesdec xmm0, xmm1
479
480 // 10 rounds... From here on EDX stays put, and the remaining round
 // keys are addressed at fixed offsets from it.
481dr10: movdqu xmm1, [edx]
482 aesdec xmm0, xmm1
483
484 // 9 rounds...
485 movdqu xmm1, [edx + 16]
486 aesdec xmm0, xmm1
487
488 // 8 rounds...
489 movdqu xmm1, [edx + 32]
490 aesdec xmm0, xmm1
491
492 // 7 rounds...
493 movdqu xmm1, [edx + 48]
494 aesdec xmm0, xmm1
495
496 // 6 rounds...
497 movdqu xmm1, [edx + 64]
498 aesdec xmm0, xmm1
499
500 // 5 rounds...
501 movdqu xmm1, [edx + 80]
502 aesdec xmm0, xmm1
503
504 // 4 rounds...
505 movdqu xmm1, [edx + 96]
506 aesdec xmm0, xmm1
507
508 // 3 rounds...
509 movdqu xmm1, [edx + 112]
510 aesdec xmm0, xmm1
511
512 // 2 rounds...
513 movdqu xmm1, [edx + 128]
514 aesdec xmm0, xmm1
515
516 // Final round...
517 movdqu xmm1, [edx + 144]
518 aesdeclast xmm0, xmm1
519
520 // Unpermute the plaintext block and store it.
8d6ca554 521 pshufb xmm0, xmm5
1a0c09c4
MW
522 mov eax, [esp + 12]
523 movdqu [eax], xmm0
524
525 // And we're done.
526 ret
527
528ENDFUNC
529
530///--------------------------------------------------------------------------
531/// Random utilities.
532
533 .align 16
534 // Abort the process because of a programming error. Indirecting
535 // through this point serves several purposes: (a) by CALLing, rather
536 // than branching to, `abort', we can save the return address, which
537 // might at least provide a hint as to what went wrong; (b) we don't
538 // have conditional CALLs (and they'd be big anyway); and (c) we can
539 // write a HLT here as a backstop against `abort' being mad.
 //
 // The block encryption and decryption functions branch here if the
 // context's round count is less than 10 or greater than 14.
540bogus: callext F(abort)
5410: hlt // only reached if `abort' returns...
542 jmp 0b // ...in which case, keep on halting
543
 // NOTE(review): `gotaux' is a macro from `asm-common.h'; presumably
 // it emits the helper code which the `ldgot ecx' invocations above
 // rely on -- confirm.
544 gotaux ecx
545
546///--------------------------------------------------------------------------
547/// Data tables.
548
 // Control mask for `pshufb': reverses the order of the bytes within
 // each of the four 32-bit words, leaving the words themselves in
 // place. It's aligned to 16 bytes because it's fetched using
 // `movdqa'.
549 .align 16
550endswap_tab:
551 .byte 3, 2, 1, 0
552 .byte 7, 6, 5, 4
553 .byte 11, 10, 9, 8
554 .byte 15, 14, 13, 12
555
556///----- That's all, folks --------------------------------------------------