Commit | Line | Data |
---|---|---|
1a0c09c4 MW |
1 | /// -*- mode: asm; asm-comment-char: ?/ -*- |
2 | /// | |
3 | /// AESNI-based implementation of Rijndael | |
4 | /// | |
5 | /// (c) 2015 Straylight/Edgeware | |
6 | /// | |
7 | ||
8 | ///----- Licensing notice --------------------------------------------------- | |
9 | /// | |
10 | /// This file is part of Catacomb. | |
11 | /// | |
12 | /// Catacomb is free software; you can redistribute it and/or modify | |
13 | /// it under the terms of the GNU Library General Public License as | |
14 | /// published by the Free Software Foundation; either version 2 of the | |
15 | /// License, or (at your option) any later version. | |
16 | /// | |
17 | /// Catacomb is distributed in the hope that it will be useful, | |
18 | /// but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 | /// GNU Library General Public License for more details. | |
21 | /// | |
22 | /// You should have received a copy of the GNU Library General Public | |
23 | /// License along with Catacomb; if not, write to the Free | |
24 | /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, | |
25 | /// MA 02111-1307, USA. | |
26 | ||
27 | ///-------------------------------------------------------------------------- | |
28 | /// Preliminaries. | |
29 | ||
30 | #include "config.h" | |
31 | #include "asm-common.h" | |
32 | ||
33 | ///-------------------------------------------------------------------------- | |
34 | /// External definitions. | |
35 | ||
36 | .globl F(abort) | |
37 | .globl F(rijndael_rcon) | |
38 | ||
39 | ///-------------------------------------------------------------------------- | |
47103664 MW |
40 | /// Local utilities. |
41 | ||
42 | // Immediate operands for PSHUFD which rotate the four 32-bit words of a register: ROTL moves word i to word i + 1, ROTR moves word i to word i - 1, and ROT2 moves word i to word i + 2 (indices taken mod 4). | |
43 | #define ROTL 0x93 | |
44 | #define ROT2 0x4e | |
45 | #define ROTR 0x39 | |
46 | ||
47 | ///-------------------------------------------------------------------------- | |
1a0c09c4 MW |
48 | /// Main code. |
49 | ||
50 | .arch .aes | |
51 | .section .text | |
52 | ||
53 | /// The AESNI instructions implement a little-endian version of AES, but | |
54 | /// Catacomb's internal interface presents as big-endian so as to work better | |
55 | /// with things like GCM. We therefore maintain the round keys in | |
56 | /// little-endian form, and have to end-swap blocks in and out. | |
57 | /// | |
58 | /// For added amusement, the AESNI instructions don't implement the | |
59 | /// larger-block versions of Rijndael, so we have to end-swap the keys if | |
60 | /// we're preparing for one of those. | |
61 | ||
62 | // Useful constants. | |
63 | .equ maxrounds, 16 // maximum number of rounds | |
64 | .equ maxblksz, 32 // maximum block size, in bytes | |
65 | .equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer, in bytes | |
66 | ||
67 | // Context structure: byte offsets of the fields from the context pointer. | |
68 | .equ nr, 0 // number of rounds (four bytes) | |
69 | .equ w, nr + 4 // encryption key words (kbufsz bytes) | |
70 | .equ wi, w + kbufsz // decryption key words | |
71 | ||
72 | ///-------------------------------------------------------------------------- | |
73 | /// Key setup. | |
74 | ||
75 | FUNC(rijndael_setup_x86_aesni) | |
76 | ||
77 | // Arguments, as addressed once the four registers below are pushed: | |
78 | // [esp + 20] is the context pointer | |
79 | // [esp + 24] is the block size, in 32-bit words (4, 6, or 8) | |
80 | // [esp + 28] points to the key material, unaligned | |
81 | // [esp + 32] is the size of the key, in words | |
82 | // The key size has already been checked for validity, and the number | |
83 | // of rounds has been computed and stored in `nr'. Our job is only to fill in the `w' | |
84 | // and `wi' vectors. | |
85 | ||
86 | push ebp | |
87 | push ebx | |
88 | push esi | |
89 | push edi | |
90 | ||
91 | // The initial round key material is taken directly from the input | |
92 | // key, so copy it over. | |
93 | mov ebp, [esp + 20] // context base pointer | |
94 | mov ebx, [esp + 32] // key size, in words | |
95 | mov ecx, ebx | |
96 | mov esi, [esp + 28] | |
97 | lea edi, [ebp + w] | |
98 | rep movsd | |
99 | ||
100 | // Work out how many words remain to be generated: (nr + 1)*blksz - keysz. | |
101 | mov edx, [ebp + nr] // number of rounds | |
102 | add edx, 1 | |
103 | imul edx, [esp + 24] // total round-key material, in words | |
104 | sub edx, ebx // offset by the key size | |
105 | ||
106 | // Find the round constants; ECX steps through them a byte at a time. | |
107 | ldgot ecx | |
108 | leaext ecx, rijndael_rcon, ecx | |
109 | ||
110 | // Prepare for the main loop: ESI trails EBX words behind the word being written, EAX holds the previous word, and EDX is where ESI must stop. | |
111 | lea esi, [ebp + w] | |
112 | mov eax, [esi + 4*ebx - 4] // most recent key word | |
113 | lea edx, [esi + 4*edx] // limit for ESI (end of the schedule, less the key size) | |
114 | ||
115 | // Main key expansion loop. The first word of each key-length chunk | |
116 | // needs special treatment. | |
117 | // | |
118 | // This is rather tedious because the Intel `AESKEYGENASSIST' | |
119 | // instruction is very strangely shaped. Firstly, it wants to | |
120 | // operate on vast SSE registers, even though we're data-blocked from | |
121 | // doing more than one operation at a time unless we're doing two key | |
122 | // schedules simultaneously -- and even then we can't do more than | |
123 | // two, because the instruction ignores two of its input words | |
124 | // entirely, and produces two different outputs for each of the other | |
125 | // two. And secondly it insists on taking the magic round constant | |
126 | // as an immediate, so it's kind of annoying if you're not | |
127 | // open-coding the whole thing. It's much easier to leave that as | |
128 | // zero and XOR in the round constant by hand. | |
129 | 9: movd xmm0, eax | |
47103664 | 130 | pshufd xmm0, xmm0, ROTR |
1a0c09c4 | 131 | aeskeygenassist xmm1, xmm0, 0 |
47103664 | 132 | pshufd xmm1, xmm1, ROTL |
1a0c09c4 MW |
133 | movd eax, xmm1 |
134 | xor eax, [esi] | |
135 | xor al, [ecx] | |
136 | inc ecx | |
137 | mov [esi + 4*ebx], eax | |
138 | add esi, 4 | |
139 | cmp esi, edx | |
140 | jae 8f | |
141 | ||
142 | // The next three words are simple: just XOR with the word one key-length back. | |
143 | xor eax, [esi] | |
144 | mov [esi + 4*ebx], eax | |
145 | add esi, 4 | |
146 | cmp esi, edx | |
147 | jae 8f | |
148 | ||
149 | // (Word 2...) | |
150 | xor eax, [esi] | |
151 | mov [esi + 4*ebx], eax | |
152 | add esi, 4 | |
153 | cmp esi, edx | |
154 | jae 8f | |
155 | ||
156 | // (Word 3...) | |
157 | xor eax, [esi] | |
158 | mov [esi + 4*ebx], eax | |
159 | add esi, 4 | |
160 | cmp esi, edx | |
161 | jae 8f | |
162 | ||
163 | // Word 4. If the key is /more/ than 6 words long, then we must | |
164 | // apply a substitution here (with no rotation and no round constant). A four-word key has no word 4, so go round again. | |
165 | cmp ebx, 5 | |
166 | jb 9b | |
167 | cmp ebx, 7 | |
168 | jb 0f | |
169 | movd xmm0, eax | |
47103664 | 170 | pshufd xmm0, xmm0, ROTL |
1a0c09c4 MW |
171 | aeskeygenassist xmm1, xmm0, 0 |
172 | movd eax, xmm1 | |
173 | 0: xor eax, [esi] | |
174 | mov [esi + 4*ebx], eax | |
175 | add esi, 4 | |
176 | cmp esi, edx | |
177 | jae 8f | |
178 | ||
179 | // (Word 5...) | |
180 | cmp ebx, 6 | |
181 | jb 9b | |
182 | xor eax, [esi] | |
183 | mov [esi + 4*ebx], eax | |
184 | add esi, 4 | |
185 | cmp esi, edx | |
186 | jae 8f | |
187 | ||
188 | // (Word 6...) | |
189 | cmp ebx, 7 | |
190 | jb 9b | |
191 | xor eax, [esi] | |
192 | mov [esi + 4*ebx], eax | |
193 | add esi, 4 | |
194 | cmp esi, edx | |
195 | jae 8f | |
196 | ||
197 | // (Word 7...) | |
198 | cmp ebx, 8 | |
199 | jb 9b | |
200 | xor eax, [esi] | |
201 | mov [esi + 4*ebx], eax | |
202 | add esi, 4 | |
203 | cmp esi, edx | |
204 | jae 8f | |
205 | ||
206 | // That's a whole key-length chunk done: go round for the next one. | |
207 | jmp 9b | |
208 | ||
209 | // Next job is to construct the decryption keys. The keys for the | |
210 | // first and last rounds don't need to be mangled, but the remaining | |
211 | // ones do -- and they all need to be reordered too. | |
212 | // | |
213 | // The plan of action, then, is to copy the final encryption round's | |
214 | // keys into place first, then to do each of the intermediate rounds | |
215 | // in reverse order, and finally do the first round. | |
216 | // | |
217 | // Do all of the heavy lifting with SSE registers. The order we're | |
218 | // doing this in means that it's OK if we read or write too much, and | |
219 | // there's easily enough buffer space for the over-enthusiastic reads | |
220 | // and writes because the context has space for 32-byte blocks, which | |
221 | // is our maximum and an exact fit for two SSE registers. | |
222 | 8: mov ecx, [ebp + nr] // number of rounds | |
223 | mov ebx, [esp + 24] // block size (in words) | |
224 | mov edx, ecx | |
225 | imul edx, ebx | |
226 | lea edi, [ebp + wi] | |
227 | lea esi, [ebp + 4*edx + w] // last round's keys | |
228 | shl ebx, 2 // block size (in bytes now) | |
229 | ||
230 | // Copy the last encryption round's keys. | |
231 | movdqu xmm0, [esi] | |
232 | movdqu [edi], xmm0 | |
233 | cmp ebx, 16 | |
234 | jbe 9f | |
235 | movdqu xmm0, [esi + 16] | |
236 | movdqu [edi + 16], xmm0 | |
237 | ||
238 | // Update the loop variables and stop if we've finished: EDI walks forwards through `wi' while ESI walks backwards through `w'. | |
239 | 9: add edi, ebx | |
240 | sub esi, ebx | |
241 | sub ecx, 1 | |
242 | jbe 0f | |
243 | ||
244 | // Do another middle round's keys, applying AESIMC to each 16-byte piece. | |
245 | movdqu xmm0, [esi] | |
246 | aesimc xmm0, xmm0 | |
247 | movdqu [edi], xmm0 | |
248 | cmp ebx, 16 | |
249 | jbe 9b | |
250 | movdqu xmm0, [esi + 16] | |
251 | aesimc xmm0, xmm0 | |
252 | movdqu [edi + 16], xmm0 | |
253 | jmp 9b | |
254 | ||
255 | // Finally do the first encryption round. | |
256 | 0: movdqu xmm0, [esi] | |
257 | movdqu [edi], xmm0 | |
258 | cmp ebx, 16 | |
259 | jbe 0f | |
260 | movdqu xmm0, [esi + 16] | |
261 | movdqu [edi + 16], xmm0 | |
262 | ||
263 | // If the block size is not exactly four words then we must end-swap | |
264 | // everything. We can use fancy SSE toys for this. | |
265 | 0: cmp ebx, 16 | |
266 | je 0f | |
267 | ||
268 | // Find the byte-reordering table. | |
269 | ldgot ecx | |
8d6ca554 | 270 | movdqa xmm5, [INTADDR(endswap_tab, ecx)] |
1a0c09c4 MW |
271 | |
272 | // Calculate the number of subkey words again. (It's a good job | |
273 | // we've got a fast multiplier.) | |
274 | mov ecx, [ebp + nr] | |
275 | add ecx, 1 | |
276 | imul ecx, [esp + 24] // total keys in words | |
277 | ||
278 | // End-swap the encryption keys. Keep the word count in EAX, because the subroutine destroys ECX. | |
279 | mov eax, ecx | |
280 | lea esi, [ebp + w] | |
281 | call endswap_block | |
282 | ||
283 | // And the decryption keys. | |
284 | mov ecx, eax | |
285 | lea esi, [ebp + wi] | |
286 | call endswap_block | |
287 | ||
288 | // All done. | |
289 | 0: pop edi | |
290 | pop esi | |
291 | pop ebx | |
292 | pop ebp | |
293 | ret | |
294 | ||
295 | .align 16 | |
296 | endswap_block: | |
297 | // End-swap ECX words starting at ESI, clobbering ECX, ESI and XMM1. The end-swapping table is | |
8d6ca554 | 298 | // already loaded into XMM5; and it's OK to work in 16-byte chunks. |
1a0c09c4 | 299 | movdqu xmm1, [esi] |
8d6ca554 | 300 | pshufb xmm1, xmm5 |
1a0c09c4 MW |
301 | movdqu [esi], xmm1 |
302 | add esi, 16 | |
303 | sub ecx, 4 | |
304 | ja endswap_block | |
305 | ret | |
306 | ||
307 | ENDFUNC | |
308 | ||
309 | ///-------------------------------------------------------------------------- | |
310 | /// Encrypting and decrypting blocks. | |
311 | ||
312 | FUNC(rijndael_eblk_x86_aesni) | |
313 | ||
314 | // Encrypt a single 16-byte block using the `w' keys. On entry, we have: | |
315 | // [esp + 4] points to the context block | |
316 | // [esp + 8] points to the input data block | |
317 | // [esp + 12] points to the output buffer | |
318 | ||
319 | // Find the magic endianness-swapping table. | |
320 | ldgot ecx | |
8d6ca554 | 321 | movdqa xmm5, [INTADDR(endswap_tab, ecx)] |
1a0c09c4 MW |
322 | |
323 | // Load the input block and end-swap it. Also, start loading the | |
324 | // keys. | |
325 | mov eax, [esp + 8] | |
326 | movdqu xmm0, [eax] | |
8d6ca554 | 327 | pshufb xmm0, xmm5 |
1a0c09c4 MW |
328 | mov eax, [esp + 4] |
329 | lea edx, [eax + w] | |
330 | mov eax, [eax + nr] | |
331 | ||
332 | // Initial whitening. | |
333 | movdqu xmm1, [edx] | |
334 | add edx, 16 | |
335 | pxor xmm0, xmm1 | |
336 | ||
337 | // Dispatch on the number of rounds in EAX: 10 to 14 are handled here, and anything else goes to `bogus'. | |
338 | cmp eax, 10 | |
339 | je er10 | |
340 | jb bogus | |
341 | cmp eax, 14 | |
342 | je er14 | |
343 | ja bogus | |
344 | cmp eax, 12 | |
345 | je er12 | |
346 | jb er11 | |
347 | jmp er13 | |
348 | ||
349 | .align 2 | |
350 | ||
351 | // 14 rounds... | |
352 | er14: movdqu xmm1, [edx] | |
353 | add edx, 16 | |
354 | aesenc xmm0, xmm1 | |
355 | ||
356 | // 13 rounds... | |
357 | er13: movdqu xmm1, [edx] | |
358 | add edx, 16 | |
359 | aesenc xmm0, xmm1 | |
360 | ||
361 | // 12 rounds... | |
362 | er12: movdqu xmm1, [edx] | |
363 | add edx, 16 | |
364 | aesenc xmm0, xmm1 | |
365 | ||
366 | // 11 rounds... | |
367 | er11: movdqu xmm1, [edx] | |
368 | add edx, 16 | |
369 | aesenc xmm0, xmm1 | |
370 | ||
371 | // 10 rounds... (From here on EDX stays put and the keys are addressed by fixed offsets.) | |
372 | er10: movdqu xmm1, [edx] | |
373 | aesenc xmm0, xmm1 | |
374 | ||
375 | // 9 rounds... | |
376 | movdqu xmm1, [edx + 16] | |
377 | aesenc xmm0, xmm1 | |
378 | ||
379 | // 8 rounds... | |
380 | movdqu xmm1, [edx + 32] | |
381 | aesenc xmm0, xmm1 | |
382 | ||
383 | // 7 rounds... | |
384 | movdqu xmm1, [edx + 48] | |
385 | aesenc xmm0, xmm1 | |
386 | ||
387 | // 6 rounds... | |
388 | movdqu xmm1, [edx + 64] | |
389 | aesenc xmm0, xmm1 | |
390 | ||
391 | // 5 rounds... | |
392 | movdqu xmm1, [edx + 80] | |
393 | aesenc xmm0, xmm1 | |
394 | ||
395 | // 4 rounds... | |
396 | movdqu xmm1, [edx + 96] | |
397 | aesenc xmm0, xmm1 | |
398 | ||
399 | // 3 rounds... | |
400 | movdqu xmm1, [edx + 112] | |
401 | aesenc xmm0, xmm1 | |
402 | ||
403 | // 2 rounds... | |
404 | movdqu xmm1, [edx + 128] | |
405 | aesenc xmm0, xmm1 | |
406 | ||
407 | // Final round... | |
408 | movdqu xmm1, [edx + 144] | |
409 | aesenclast xmm0, xmm1 | |
410 | ||
411 | // Unpermute the ciphertext block and store it. | |
8d6ca554 | 412 | pshufb xmm0, xmm5 |
1a0c09c4 MW |
413 | mov eax, [esp + 12] |
414 | movdqu [eax], xmm0 | |
415 | ||
416 | // And we're done. | |
417 | ret | |
418 | ||
419 | ENDFUNC | |
420 | ||
421 | FUNC(rijndael_dblk_x86_aesni) | |
422 | ||
423 | // Decrypt a single 16-byte block using the `wi' keys. On entry, we have: | |
424 | // [esp + 4] points to the context block | |
425 | // [esp + 8] points to the input data block | |
426 | // [esp + 12] points to the output buffer | |
427 | ||
428 | // Find the magic endianness-swapping table. | |
429 | ldgot ecx | |
8d6ca554 | 430 | movdqa xmm5, [INTADDR(endswap_tab, ecx)] |
1a0c09c4 MW |
431 | |
432 | // Load the input block and end-swap it. Also, start loading the | |
433 | // keys. | |
434 | mov eax, [esp + 8] | |
435 | movdqu xmm0, [eax] | |
8d6ca554 | 436 | pshufb xmm0, xmm5 |
1a0c09c4 MW |
437 | mov eax, [esp + 4] |
438 | lea edx, [eax + wi] | |
439 | mov eax, [eax + nr] | |
440 | ||
441 | // Initial whitening. | |
442 | movdqu xmm1, [edx] | |
443 | add edx, 16 | |
444 | pxor xmm0, xmm1 | |
445 | ||
446 | // Dispatch on the number of rounds in EAX: 10 to 14 are handled here, and anything else goes to `bogus'. | |
447 | cmp eax, 10 | |
448 | je dr10 | |
449 | jb bogus | |
450 | cmp eax, 14 | |
451 | je dr14 | |
452 | ja bogus | |
453 | cmp eax, 12 | |
454 | je dr12 | |
455 | jb dr11 | |
456 | jmp dr13 | |
457 | ||
458 | .align 2 | |
459 | ||
460 | // 14 rounds... | |
461 | dr14: movdqu xmm1, [edx] | |
462 | add edx, 16 | |
463 | aesdec xmm0, xmm1 | |
464 | ||
465 | // 13 rounds... | |
466 | dr13: movdqu xmm1, [edx] | |
467 | add edx, 16 | |
468 | aesdec xmm0, xmm1 | |
469 | ||
470 | // 12 rounds... | |
471 | dr12: movdqu xmm1, [edx] | |
472 | add edx, 16 | |
473 | aesdec xmm0, xmm1 | |
474 | ||
475 | // 11 rounds... | |
476 | dr11: movdqu xmm1, [edx] | |
477 | add edx, 16 | |
478 | aesdec xmm0, xmm1 | |
479 | ||
480 | // 10 rounds... (From here on EDX stays put and the keys are addressed by fixed offsets.) | |
481 | dr10: movdqu xmm1, [edx] | |
482 | aesdec xmm0, xmm1 | |
483 | ||
484 | // 9 rounds... | |
485 | movdqu xmm1, [edx + 16] | |
486 | aesdec xmm0, xmm1 | |
487 | ||
488 | // 8 rounds... | |
489 | movdqu xmm1, [edx + 32] | |
490 | aesdec xmm0, xmm1 | |
491 | ||
492 | // 7 rounds... | |
493 | movdqu xmm1, [edx + 48] | |
494 | aesdec xmm0, xmm1 | |
495 | ||
496 | // 6 rounds... | |
497 | movdqu xmm1, [edx + 64] | |
498 | aesdec xmm0, xmm1 | |
499 | ||
500 | // 5 rounds... | |
501 | movdqu xmm1, [edx + 80] | |
502 | aesdec xmm0, xmm1 | |
503 | ||
504 | // 4 rounds... | |
505 | movdqu xmm1, [edx + 96] | |
506 | aesdec xmm0, xmm1 | |
507 | ||
508 | // 3 rounds... | |
509 | movdqu xmm1, [edx + 112] | |
510 | aesdec xmm0, xmm1 | |
511 | ||
512 | // 2 rounds... | |
513 | movdqu xmm1, [edx + 128] | |
514 | aesdec xmm0, xmm1 | |
515 | ||
516 | // Final round... | |
517 | movdqu xmm1, [edx + 144] | |
518 | aesdeclast xmm0, xmm1 | |
519 | ||
520 | // Unpermute the plaintext block and store it. | |
8d6ca554 | 521 | pshufb xmm0, xmm5 |
1a0c09c4 MW |
522 | mov eax, [esp + 12] |
523 | movdqu [eax], xmm0 | |
524 | ||
525 | // And we're done. | |
526 | ret | |
527 | ||
528 | ENDFUNC | |
529 | ||
530 | ///-------------------------------------------------------------------------- | |
531 | /// Random utilities. | |
532 | ||
533 | .align 16 | |
534 | // Abort the process because of a programming error (the block functions branch here if the round count isn't between 10 and 14). Indirecting | |
535 | // through this point serves several purposes: (a) by CALLing, rather | |
536 | // than branching to, `abort', we can save the return address, which | |
537 | // might at least provide a hint as to what went wrong; (b) we don't | |
538 | // have conditional CALLs (and they'd be big anyway); and (c) we can | |
539 | // write a HLT here as a backstop against `abort' being mad. | |
540 | bogus: callext F(abort) | |
541 | 0: hlt | |
542 | jmp 0b | |
543 | ||
// NOTE(review): `gotaux' is a macro from asm-common.h; presumably it emits the support code which `ldgot ecx' relies on -- confirm.
544 | gotaux ecx | |
545 | ||
546 | ///-------------------------------------------------------------------------- | |
547 | /// Data tables. | |
548 | ||
549 | .align 16 | |
// PSHUFB control mask which reverses the bytes within each of the four 32-bit words; it is fetched with MOVDQA, which needs the 16-byte alignment.
550 | endswap_tab: | |
551 | .byte 3, 2, 1, 0 | |
552 | .byte 7, 6, 5, 4 | |
553 | .byte 11, 10, 9, 8 | |
554 | .byte 15, 14, 13, 12 | |
555 | ||
556 | ///----- That's all, folks -------------------------------------------------- |