Commit | Line | Data |
---|---|---|
226639f3 MW |
1 | ### -*- mode: asm; asm-comment-char: ?# -*- |
2 | ### | |
3 | ### AESNI-based implementation of Rijndael | |
4 | ### | |
5 | ### (c) 2015 Straylight/Edgeware | |
6 | ### | |
7 | ||
8 | ###----- Licensing notice --------------------------------------------------- | |
9 | ### | |
10 | ### This file is part of Catacomb. | |
11 | ### | |
12 | ### Catacomb is free software; you can redistribute it and/or modify | |
13 | ### it under the terms of the GNU Library General Public License as | |
14 | ### published by the Free Software Foundation; either version 2 of the | |
15 | ### License, or (at your option) any later version. | |
16 | ### | |
17 | ### Catacomb is distributed in the hope that it will be useful, | |
18 | ### but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 | ### GNU Library General Public License for more details. | |
21 | ### | |
22 | ### You should have received a copy of the GNU Library General Public | |
23 | ### License along with Catacomb; if not, write to the Free | |
24 | ### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, | |
25 | ### MA 02111-1307, USA. | |
26 | ||
27 | .intel_syntax noprefix | |
28 | .arch .aes | |
29 | ||
30 | .globl abort | |
31 | .globl rijndael_rcon | |
32 | ||
33 | .section .text | |
34 | ||
35 | ### The AESNI instructions implement a little-endian version of AES, but | |
36 | ### Catacomb's internal interface presents as big-endian so as to work better | |
37 | ### with things like GCM. We therefore maintain the round keys in | |
38 | ### little-endian form, and have to end-swap blocks in and out. | |
39 | ### | |
40 | ### For added amusement, the AESNI instructions don't implement the | |
41 | ### larger-block versions of Rijndael, so we have to end-swap the keys if | |
42 | ### we're preparing for one of those. | |
43 | ||
44 | ## Sizing limits for the key schedule. | |
45 | .set maxrounds, 16 # most rounds any variant needs | |
46 | .set maxblksz, 32 # largest block, in bytes | |
47 | .set kbufsz, (maxrounds + 1)*maxblksz # bytes in one key-schedule buffer | |
48 | ||
49 | ## Byte offsets of the fields in the context structure. | |
50 | .set nr, 0 # round count (one 32-bit word) | |
51 | .set w, 4 + nr # encryption round-key words | |
52 | .set wi, kbufsz + w # decryption round-key words | |
53 | ||
54 | ###-------------------------------------------------------------------------- | |
55 | ### Key setup. | |
56 | ||
57 | .globl rijndael_setup_x86_aesni | |
58 | .type rijndael_setup_x86_aesni, STT_FUNC | |
59 | .align 16 | |
60 | rijndael_setup_x86_aesni: | |
61 | ||
62 | ## Initial state. We have four stack arguments; offsets are as seen after the pushes below: | |
63 | ## [esp + 20] is the context pointer | |
64 | ## [esp + 24] is the block size, in 32-bit words (4, 6, or 8) | |
65 | ## [esp + 28] points to the key material, unaligned | |
66 | ## [esp + 32] is the size of the key, in words | |
67 | ## The key size has already been checked for validity, and the number | |
68 | ## of rounds has been computed. Our job is only to fill in the `w' | |
69 | ## and `wi' vectors. | |
70 | ||
71 | push ebp # preserve EBP, EBX, ESI, EDI; all four are restored before RET | |
72 | push ebx | |
73 | push esi | |
74 | push edi | |
75 | ||
76 | ## The initial round key material is taken directly from the input | |
77 | ## key, so copy it over. | |
78 | mov ebp, [esp + 20] # context base pointer | |
79 | mov ebx, [esp + 32] # key size, in words | |
80 | mov ecx, ebx # word count for the copy | |
81 | mov esi, [esp + 28] # source: the key material | |
82 | lea edi, [ebp + w] # destination: start of the encryption key words | |
83 | rep movsd # copy ECX 32-bit words | |
84 | ||
85 | ## Find out other useful things. | |
86 | mov edx, [ebp + nr] # number of rounds | |
87 | add edx, 1 # one more round key than there are rounds | |
88 | imul edx, [esp + 24] # total key size in words | |
89 | sub edx, ebx # offset by the key size | |
90 | ||
91 | ## Find the round constants. | |
92 | call where_am_i_ecx # ECX = address of the next instruction | |
93 | add ecx, offset _GLOBAL_OFFSET_TABLE_ # turn that into the GOT address | |
94 | mov ecx, [ecx + rijndael_rcon@GOT] # ECX = address of rijndael_rcon, from its GOT entry | |
95 | ||
96 | ## Prepare for the main loop. | |
97 | lea esi, [ebp + w] # ESI = cursor; each new word is stored at [ESI + 4*EBX] | |
98 | mov eax, [esi + 4*ebx - 4] # most recent key word | |
99 | lea edx, [esi + 4*edx] # limit, offset by one key expansion | |
100 | ||
101 | ## Main key expansion loop. The first word of each key-length chunk | |
102 | ## needs special treatment. | |
103 | ## | |
104 | ## This is rather tedious because the Intel `AESKEYGENASSIST' | |
105 | ## instruction is very strangely shaped. Firstly, it wants to | |
106 | ## operate on vast SSE registers, even though we're data-blocked from | |
107 | ## doing more than one operation at a time unless we're doing two key | |
108 | ## schedules simultaneously -- and even then we can't do more than | |
109 | ## two, because the instruction ignores two of its input words | |
110 | ## entirely, and produces two different outputs for each of the other | |
111 | ## two. And secondly it insists on taking the magic round constant | |
112 | ## as an immediate, so it's kind of annoying if you're not | |
113 | ## open-coding the whole thing. It's much easier to leave that as | |
114 | ## zero and XOR in the round constant by hand. | |
115 | 9: movd xmm0, eax # word 0 of XMM0 = most recent key word, other words zero | |
116 | pshufd xmm0, xmm0, 0x39 # rotate it round into word 3 | |
117 | aeskeygenassist xmm1, xmm0, 0 # word 3 of XMM1 = RotWord(SubWord(word 3)), round constant 0 | |
118 | pshufd xmm1, xmm1, 0x93 # bring that result back down to word 0 | |
119 | movd eax, xmm1 | |
120 | xor eax, [esi] # combine with the word one key length back | |
121 | xor al, [ecx] # mix in the round constant byte by hand | |
122 | inc ecx # step to the next round constant | |
123 | mov [esi + 4*ebx], eax # store the new key word | |
124 | add esi, 4 | |
125 | cmp esi, edx | |
126 | jae 8f # cursor has reached the limit: expansion finished | |
127 | ||
128 | ## The next three words are simple... | |
129 | xor eax, [esi] # new word = previous new word XOR word one key length back | |
130 | mov [esi + 4*ebx], eax | |
131 | add esi, 4 | |
132 | cmp esi, edx | |
133 | jae 8f | |
134 | ||
135 | ## (Word 2...) | |
136 | xor eax, [esi] | |
137 | mov [esi + 4*ebx], eax | |
138 | add esi, 4 | |
139 | cmp esi, edx | |
140 | jae 8f | |
141 | ||
142 | ## (Word 3...) | |
143 | xor eax, [esi] | |
144 | mov [esi + 4*ebx], eax | |
145 | add esi, 4 | |
146 | cmp esi, edx | |
147 | jae 8f | |
148 | ||
149 | ## Word 4. If the key is /more/ than 6 words long, then we must | |
150 | ## apply a substitution here. | |
151 | cmp ebx, 5 | |
152 | jb 9b # 4-word key: this chunk is complete | |
153 | cmp ebx, 7 | |
154 | jb 0f # 5- or 6-word key: no substitution | |
155 | movd xmm0, eax | |
156 | pshufd xmm0, xmm0, 0x93 # move the word up into word 1 | |
157 | aeskeygenassist xmm1, xmm0, 0 # word 0 of XMM1 = SubWord(word 1), no rotation | |
158 | movd eax, xmm1 | |
159 | 0: xor eax, [esi] | |
160 | mov [esi + 4*ebx], eax | |
161 | add esi, 4 | |
162 | cmp esi, edx | |
163 | jae 8f | |
164 | ||
165 | ## (Word 5...) | |
166 | cmp ebx, 6 | |
167 | jb 9b # 5-word key: this chunk is complete | |
168 | xor eax, [esi] | |
169 | mov [esi + 4*ebx], eax | |
170 | add esi, 4 | |
171 | cmp esi, edx | |
172 | jae 8f | |
173 | ||
174 | ## (Word 6...) | |
175 | cmp ebx, 7 | |
176 | jb 9b # 6-word key: this chunk is complete | |
177 | xor eax, [esi] | |
178 | mov [esi + 4*ebx], eax | |
179 | add esi, 4 | |
180 | cmp esi, edx | |
181 | jae 8f | |
182 | ||
183 | ## (Word 7...) | |
184 | cmp ebx, 8 | |
185 | jb 9b # 7-word key: this chunk is complete | |
186 | xor eax, [esi] | |
187 | mov [esi + 4*ebx], eax | |
188 | add esi, 4 | |
189 | cmp esi, edx | |
190 | jae 8f | |
191 | ||
192 | ## Must be done by now. | |
193 | jmp 9b | |
194 | ||
195 | ## Next job is to construct the decryption keys. The keys for the | |
196 | ## first and last rounds don't need to be mangled, but the remaining | |
197 | ## ones do -- and they all need to be reordered too. | |
198 | ## | |
199 | ## The plan of action, then, is to copy the final encryption round's | |
200 | ## keys into place first, then to do each of the intermediate rounds | |
201 | ## in reverse order, and finally do the first round. | |
202 | ## | |
203 | ## Do all of the heavy lifting with SSE registers. The order we're | |
204 | ## doing this in means that it's OK if we read or write too much, and | |
205 | ## there's easily enough buffer space for the over-enthusiastic reads | |
206 | ## and writes because the context has space for 32-byte blocks, which | |
207 | ## is our maximum and an exact fit for two SSE registers. | |
208 | 8: mov ecx, [ebp + nr] # number of rounds | |
209 | mov ebx, [esp + 24] # block size (in words) | |
210 | mov edx, ecx | |
211 | imul edx, ebx # EDX = rounds * block words: word offset of the last round's keys | |
212 | lea edi, [ebp + wi] # EDI = cursor over the decryption keys | |
213 | lea esi, [ebp + 4*edx + w] # last round's keys | |
214 | shl ebx, 2 # block size (in bytes now) | |
215 | ||
216 | ## Copy the last encryption round's keys. | |
217 | movdqu xmm0, [esi] | |
218 | movdqu [edi], xmm0 | |
219 | cmp ebx, 16 | |
220 | jbe 9f # 16-byte blocks need only the one transfer | |
221 | movdqu xmm0, [esi + 16] | |
222 | movdqu [edi + 16], xmm0 | |
223 | ||
224 | ## Update the loop variables and stop if we've finished. | |
225 | 9: add edi, ebx # next slot in the decryption keys | |
226 | sub esi, ebx # previous round in the encryption keys | |
227 | sub ecx, 1 | |
228 | jbe 0f # only the first encryption round is left | |
229 | ||
230 | ## Do another middle round's keys... | |
231 | movdqu xmm0, [esi] | |
232 | aesimc xmm0, xmm0 # InvMixColumns, as AESDEC expects of its round keys | |
233 | movdqu [edi], xmm0 | |
234 | cmp ebx, 16 | |
235 | jbe 9b | |
236 | movdqu xmm0, [esi + 16] | |
237 | aesimc xmm0, xmm0 | |
238 | movdqu [edi + 16], xmm0 | |
239 | jmp 9b | |
240 | ||
241 | ## Finally do the first encryption round. | |
242 | 0: movdqu xmm0, [esi] | |
243 | movdqu [edi], xmm0 | |
244 | cmp ebx, 16 | |
245 | jbe 0f | |
246 | movdqu xmm0, [esi + 16] | |
247 | movdqu [edi + 16], xmm0 | |
248 | ||
249 | ## If the block size is not exactly four words then we must end-swap | |
250 | ## everything. We can use fancy SSE toys for this. | |
251 | 0: cmp ebx, 16 | |
252 | je 0f # 16-byte blocks: keys stay as they are | |
253 | ||
254 | ## Find the byte-reordering table. | |
255 | call where_am_i_ecx # ECX = address of the next instruction | |
256 | movdqa xmm7, [ecx + endswap_tab - .] # `.' is this instruction, so this addresses endswap_tab | |
257 | ||
258 | ## Calculate the number of subkey words again. (It's a good job | |
259 | ## we've got a fast multiplier.) | |
260 | mov ecx, [ebp + nr] | |
261 | add ecx, 1 | |
262 | imul ecx, [esp + 24] # total keys in words | |
263 | ||
264 | ## End-swap the encryption keys. | |
265 | mov eax, ecx # keep a copy: endswap_block counts ECX down | |
266 | lea esi, [ebp + w] | |
267 | call endswap_block | |
268 | ||
269 | ## And the decryption keys. | |
270 | mov ecx, eax # same word count again | |
271 | lea esi, [ebp + wi] | |
272 | call endswap_block | |
273 | ||
274 | ## All done. | |
275 | 0: pop edi # restore the registers saved on entry, in reverse order | |
276 | pop esi | |
277 | pop ebx | |
278 | pop ebp | |
279 | ret | |
280 | ||
281 | .align 16 | |
282 | endswap_block: | |
283 | ## End-swap ECX words starting at ESI. The end-swapping table is | |
284 | ## already loaded into XMM7; and it's OK to work in 16-byte chunks. | |
285 | movdqu xmm1, [esi] | |
286 | pshufb xmm1, xmm7 # reverse the bytes within each 32-bit word | |
287 | movdqu [esi], xmm1 | |
288 | add esi, 16 | |
289 | sub ecx, 4 # four words handled per pass | |
290 | ja endswap_block # loop while words remain; a final partial chunk is swapped whole | |
291 | ret | |
292 | ||
293 | .size rijndael_setup_x86_aesni, . - rijndael_setup_x86_aesni | |
294 | ||
295 | ###-------------------------------------------------------------------------- | |
296 | ### Encrypting and decrypting blocks. | |
297 | ||
298 | .globl rijndael_eblk_x86_aesni | |
299 | .type rijndael_eblk_x86_aesni, STT_FUNC | |
300 | .align 16 | |
301 | rijndael_eblk_x86_aesni: | |
302 | ||
303 | ## Encrypt one 16-byte block. On entry, we have: | |
304 | ## [esp + 4] points to the context block | |
305 | ## [esp + 8] points to the input data block | |
306 | ## [esp + 12] points to the output buffer | |
307 | ||
308 | ## Find the magic endianness-swapping table. | |
309 | call where_am_i_ecx # ECX = address of the next instruction | |
310 | movdqa xmm7, [ecx + endswap_tab - .] # `.' is this instruction, so this addresses endswap_tab | |
311 | ||
312 | ## Load the input block and end-swap it. Also, start loading the | |
313 | ## keys. | |
314 | mov eax, [esp + 8] | |
315 | movdqu xmm0, [eax] # input may be unaligned | |
316 | pshufb xmm0, xmm7 # reverse the bytes within each 32-bit word | |
317 | mov eax, [esp + 4] | |
318 | lea edx, [eax + w] # EDX = cursor over the encryption round keys | |
319 | mov eax, [eax + nr] # EAX = number of rounds | |
320 | ||
321 | ## Initial whitening. | |
322 | movdqu xmm1, [edx] | |
323 | add edx, 16 | |
324 | pxor xmm0, xmm1 | |
325 | ||
326 | ## Dispatch to the correct code. | |
327 | cmp eax, 10 | |
328 | je er10 | |
329 | jb bogus # fewer than 10 rounds: abort | |
330 | cmp eax, 14 | |
331 | je er14 | |
332 | ja bogus # more than 14 rounds: abort | |
333 | cmp eax, 12 | |
334 | je er12 | |
335 | jb er11 # only 11 is left below 12 | |
336 | jmp er13 # and only 13 above it | |
337 | ||
338 | .align 2 | |
339 | ||
340 | ## 14 rounds... | |
341 | er14: movdqu xmm1, [edx] | |
342 | add edx, 16 # step past this key so that er10's fixed offsets line up | |
343 | aesenc xmm0, xmm1 | |
344 | ||
345 | ## 13 rounds... | |
346 | er13: movdqu xmm1, [edx] | |
347 | add edx, 16 | |
348 | aesenc xmm0, xmm1 | |
349 | ||
350 | ## 12 rounds... | |
351 | er12: movdqu xmm1, [edx] | |
352 | add edx, 16 | |
353 | aesenc xmm0, xmm1 | |
354 | ||
355 | ## 11 rounds... | |
356 | er11: movdqu xmm1, [edx] | |
357 | add edx, 16 | |
358 | aesenc xmm0, xmm1 | |
359 | ||
360 | ## 10 rounds... | |
361 | er10: movdqu xmm1, [edx] # from here on EDX stays put and keys are read at fixed offsets | |
362 | aesenc xmm0, xmm1 | |
363 | ||
364 | ## 9 rounds... | |
365 | movdqu xmm1, [edx + 16] | |
366 | aesenc xmm0, xmm1 | |
367 | ||
368 | ## 8 rounds... | |
369 | movdqu xmm1, [edx + 32] | |
370 | aesenc xmm0, xmm1 | |
371 | ||
372 | ## 7 rounds... | |
373 | movdqu xmm1, [edx + 48] | |
374 | aesenc xmm0, xmm1 | |
375 | ||
376 | ## 6 rounds... | |
377 | movdqu xmm1, [edx + 64] | |
378 | aesenc xmm0, xmm1 | |
379 | ||
380 | ## 5 rounds... | |
381 | movdqu xmm1, [edx + 80] | |
382 | aesenc xmm0, xmm1 | |
383 | ||
384 | ## 4 rounds... | |
385 | movdqu xmm1, [edx + 96] | |
386 | aesenc xmm0, xmm1 | |
387 | ||
388 | ## 3 rounds... | |
389 | movdqu xmm1, [edx + 112] | |
390 | aesenc xmm0, xmm1 | |
391 | ||
392 | ## 2 rounds... | |
393 | movdqu xmm1, [edx + 128] | |
394 | aesenc xmm0, xmm1 | |
395 | ||
396 | ## Final round... | |
397 | movdqu xmm1, [edx + 144] | |
398 | aesenclast xmm0, xmm1 | |
399 | ||
400 | ## Unpermute the ciphertext block and store it. | |
401 | pshufb xmm0, xmm7 | |
402 | mov eax, [esp + 12] | |
403 | movdqu [eax], xmm0 # output may be unaligned | |
404 | ||
405 | ## And we're done. | |
406 | ret | |
407 | ||
408 | .size rijndael_eblk_x86_aesni, . - rijndael_eblk_x86_aesni | |
409 | ||
410 | .globl rijndael_dblk_x86_aesni | |
411 | .type rijndael_dblk_x86_aesni, STT_FUNC | |
412 | .align 16 | |
413 | rijndael_dblk_x86_aesni: | |
414 | ||
415 | ## Decrypt one 16-byte block. On entry, we have: | |
416 | ## [esp + 4] points to the context block | |
417 | ## [esp + 8] points to the input data block | |
418 | ## [esp + 12] points to the output buffer | |
419 | ||
420 | ## Find the magic endianness-swapping table. | |
421 | call where_am_i_ecx # ECX = address of the next instruction | |
422 | movdqa xmm7, [ecx + endswap_tab - .] # `.' is this instruction, so this addresses endswap_tab | |
423 | ||
424 | ## Load the input block and end-swap it. Also, start loading the | |
425 | ## keys. | |
426 | mov eax, [esp + 8] | |
427 | movdqu xmm0, [eax] # input may be unaligned | |
428 | pshufb xmm0, xmm7 # reverse the bytes within each 32-bit word | |
429 | mov eax, [esp + 4] | |
430 | lea edx, [eax + wi] # EDX = cursor over the decryption round keys | |
431 | mov eax, [eax + nr] # EAX = number of rounds | |
432 | ||
433 | ## Initial whitening. | |
434 | movdqu xmm1, [edx] | |
435 | add edx, 16 | |
436 | pxor xmm0, xmm1 | |
437 | ||
438 | ## Dispatch to the correct code. | |
439 | cmp eax, 10 | |
440 | je dr10 | |
441 | jb bogus # fewer than 10 rounds: abort | |
442 | cmp eax, 14 | |
443 | je dr14 | |
444 | ja bogus # more than 14 rounds: abort | |
445 | cmp eax, 12 | |
446 | je dr12 | |
447 | jb dr11 # only 11 is left below 12 | |
448 | jmp dr13 # and only 13 above it | |
449 | ||
450 | .align 2 | |
451 | ||
452 | ## 14 rounds... | |
453 | dr14: movdqu xmm1, [edx] | |
454 | add edx, 16 # step past this key so that dr10's fixed offsets line up | |
455 | aesdec xmm0, xmm1 | |
456 | ||
457 | ## 13 rounds... | |
458 | dr13: movdqu xmm1, [edx] | |
459 | add edx, 16 | |
460 | aesdec xmm0, xmm1 | |
461 | ||
462 | ## 12 rounds... | |
463 | dr12: movdqu xmm1, [edx] | |
464 | add edx, 16 | |
465 | aesdec xmm0, xmm1 | |
466 | ||
467 | ## 11 rounds... | |
468 | dr11: movdqu xmm1, [edx] | |
469 | add edx, 16 | |
470 | aesdec xmm0, xmm1 | |
471 | ||
472 | ## 10 rounds... | |
473 | dr10: movdqu xmm1, [edx] # from here on EDX stays put and keys are read at fixed offsets | |
474 | aesdec xmm0, xmm1 | |
475 | ||
476 | ## 9 rounds... | |
477 | movdqu xmm1, [edx + 16] | |
478 | aesdec xmm0, xmm1 | |
479 | ||
480 | ## 8 rounds... | |
481 | movdqu xmm1, [edx + 32] | |
482 | aesdec xmm0, xmm1 | |
483 | ||
484 | ## 7 rounds... | |
485 | movdqu xmm1, [edx + 48] | |
486 | aesdec xmm0, xmm1 | |
487 | ||
488 | ## 6 rounds... | |
489 | movdqu xmm1, [edx + 64] | |
490 | aesdec xmm0, xmm1 | |
491 | ||
492 | ## 5 rounds... | |
493 | movdqu xmm1, [edx + 80] | |
494 | aesdec xmm0, xmm1 | |
495 | ||
496 | ## 4 rounds... | |
497 | movdqu xmm1, [edx + 96] | |
498 | aesdec xmm0, xmm1 | |
499 | ||
500 | ## 3 rounds... | |
501 | movdqu xmm1, [edx + 112] | |
502 | aesdec xmm0, xmm1 | |
503 | ||
504 | ## 2 rounds... | |
505 | movdqu xmm1, [edx + 128] | |
506 | aesdec xmm0, xmm1 | |
507 | ||
508 | ## Final round... | |
509 | movdqu xmm1, [edx + 144] | |
510 | aesdeclast xmm0, xmm1 | |
511 | ||
512 | ## Unpermute the plaintext block and store it. | |
513 | pshufb xmm0, xmm7 | |
514 | mov eax, [esp + 12] | |
515 | movdqu [eax], xmm0 # output may be unaligned | |
516 | ||
517 | ## And we're done. | |
518 | ret | |
519 | ||
520 | .size rijndael_dblk_x86_aesni, . - rijndael_dblk_x86_aesni | |
521 | ||
522 | ###-------------------------------------------------------------------------- | |
523 | ### Random utilities. | |
524 | ||
525 | .align 16 | |
526 | ## Abort the process because of a programming error. Indirecting | |
527 | ## through this point serves several purposes: (a) by CALLing, rather | |
528 | ## than branching to, `abort', we can save the return address, which | |
529 | ## might at least provide a hint as to what went wrong; (b) we don't | |
530 | ## have conditional CALLs (and they'd be big anyway); and (c) we can | |
531 | ## write a HLT here as a backstop against `abort' being mad. | |
532 | bogus: call abort@PLT # NOTE(review): i386 PIC PLT stubs conventionally expect EBX = GOT address, and the block functions that branch here never set EBX -- confirm | |
533 | 0: hlt # only reached if `abort' returns | |
534 | jmp 0b # and keep halting if execution ever resumes | |
535 | ||
536 | .align 16 | |
537 | ## Return the address of the instruction following the CALL here in | |
538 | ## ECX. This is useful for doing position-independent addressing. | |
539 | where_am_i_ecx: | |
540 | mov ecx, [esp] # the return address pushed by CALL is at the top of the stack | |
541 | ret | |
542 | ||
543 | ###-------------------------------------------------------------------------- | |
544 | ### Data tables. | |
545 | ||
546 | .align 16 | |
547 | endswap_tab: | |
548 | .long 0x00010203 # stored little-endian: bytes 3, 2, 1, 0 | |
549 | .long 0x04050607 # bytes 7, 6, 5, 4 | |
550 | .long 0x08090a0b # bytes 11, 10, 9, 8 | |
551 | .long 0x0c0d0e0f # bytes 15, 14, 13, 12 | |
552 | ||
553 | ###----- That's all, folks -------------------------------------------------- |