### Repository-viewer residue (page heading, not part of the source proper):
###   configure.ac: Improve the host CPU family detection.
###   [catacomb] / symm / rijndael-x86-aesni.s
###   CommitLineData
###   226639f3
###   MW
### -*- mode: asm; asm-comment-char: ?# -*-
###
### AESNI-based implementation of Rijndael
###
### (c) 2015 Straylight/Edgeware
###

###----- Licensing notice ---------------------------------------------------
###
### This file is part of Catacomb.
###
### Catacomb is free software; you can redistribute it and/or modify
### it under the terms of the GNU Library General Public License as
### published by the Free Software Foundation; either version 2 of the
### License, or (at your option) any later version.
###
### Catacomb is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
### GNU Library General Public License for more details.
###
### You should have received a copy of the GNU Library General Public
### License along with Catacomb; if not, write to the Free
### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
### MA 02111-1307, USA.

### Assembler setup, external symbols, and the layout of the key-schedule
### context shared by every routine in this file.

        .intel_syntax noprefix
        .arch   .aes

        ## Symbols defined elsewhere: `abort' is called from `bogus', and
        ## `rijndael_rcon' is read (through the GOT) by the key setup code.
        .globl  abort
        .globl  rijndael_rcon

        .section .text

### The AESNI instructions implement a little-endian version of AES, but
### Catacomb's internal interface presents as big-endian so as to work better
### with things like GCM. We therefore maintain the round keys in
### little-endian form, and have to end-swap blocks in and out.
###
### For added amusement, the AESNI instructions don't implement the
### larger-block versions of Rijndael, so we have to end-swap the keys if
### we're preparing for one of those.

        ## Useful constants.
        .equ    maxrounds, 16           # maximum number of rounds
        .equ    maxblksz, 32            # maximum block size, in bytes
        .equ    kbufsz, maxblksz*(maxrounds + 1) # size of a key-schedule buffer

        ## Context structure. These are byte offsets from the context base:
        ## a 4-byte round count followed by two key-schedule buffers of
        ## `kbufsz' bytes each.
        .equ    nr, 0                   # number of rounds
        .equ    w, nr + 4               # encryption key words
        .equ    wi, w + kbufsz          # decryption key words

###--------------------------------------------------------------------------
### Key setup.

### rijndael_setup_x86_aesni
###
### Expand a Rijndael key into the context's encryption schedule (`w') and
### derive the matching decryption schedule (`wi').
###
### Arguments are passed on the stack. EBP, EBX, ESI and EDI are saved and
### restored; EAX, ECX, EDX, XMM0 and XMM1 are clobbered, as is XMM7 when the
### block size is not four words.

        .globl  rijndael_setup_x86_aesni
        .type   rijndael_setup_x86_aesni, STT_FUNC
        .align  16
rijndael_setup_x86_aesni:

        ## Initial state. We have four arguments (offsets are as seen
        ## after the four pushes below):
        ## [esp + 20] is the context pointer
        ## [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
        ## [esp + 28] points to the key material, unaligned
        ## [esp + 32] is the size of the key, in words
        ## The key size has already been checked for validity, and the number
        ## of rounds has been computed. Our job is only to fill in the `w'
        ## and `wi' vectors.

        push    ebp
        push    ebx
        push    esi
        push    edi

        ## The initial round key material is taken directly from the input
        ## key, so copy it over.
        mov     ebp, [esp + 20]         # context base pointer
        mov     ebx, [esp + 32]         # key size, in words
        mov     ecx, ebx
        mov     esi, [esp + 28]
        lea     edi, [ebp + w]
        rep movsd

        ## Find out other useful things.
        mov     edx, [ebp + nr]         # number of rounds
        add     edx, 1
        imul    edx, [esp + 24]         # total key size in words
        sub     edx, ebx                # offset by the key size

        ## Find the round constants. The ADD must directly follow the CALL:
        ## ECX holds the ADD's own address, which is what the GOT-relative
        ## fixup on `_GLOBAL_OFFSET_TABLE_' is computed against.
        call    where_am_i_ecx
        add     ecx, offset _GLOBAL_OFFSET_TABLE_
        mov     ecx, [ecx + rijndael_rcon@GOT]

        ## Prepare for the main loop. From here on:
        ##   EAX = most recently computed key word
        ##   EBX = key size, in words
        ##   ECX = address of the next round-constant byte
        ##   EDX = limit for ESI
        ##   ESI = address of the word one key-length behind the word
        ##         about to be written
        lea     esi, [ebp + w]
        mov     eax, [esi + 4*ebx - 4]  # most recent key word
        lea     edx, [esi + 4*edx]      # limit, offset by one key expansion

        ## Main key expansion loop. The first word of each key-length chunk
        ## needs special treatment.
        ##
        ## This is rather tedious because the Intel `AESKEYGENASSIST'
        ## instruction is very strangely shaped. Firstly, it wants to
        ## operate on vast SSE registers, even though we're data-blocked from
        ## doing more than one operation at a time unless we're doing two key
        ## schedules simultaneously -- and even then we can't do more than
        ## two, because the instruction ignores two of its input words
        ## entirely, and produces two different outputs for each of the other
        ## two. And secondly it insists on taking the magic round constant
        ## as an immediate, so it's kind of annoying if you're not
        ## open-coding the whole thing. It's much easier to leave that as
        ## zero and XOR in the round constant by hand.
        ##
        ## Here the key word is shuffled into word 3 of the source, the
        ## rotated-and-substituted result is taken from word 3 of the output
        ## and shuffled back down to word 0 so that MOVD can fetch it.
9:      movd    xmm0, eax
        pshufd  xmm0, xmm0, 0x39
        aeskeygenassist xmm1, xmm0, 0
        pshufd  xmm1, xmm1, 0x93
        movd    eax, xmm1
        xor     eax, [esi]
        xor     al, [ecx]               # round constant, low byte only
        inc     ecx
        mov     [esi + 4*ebx], eax
        add     esi, 4
        cmp     esi, edx
        jae     8f

        ## The next three words are simple...
        xor     eax, [esi]
        mov     [esi + 4*ebx], eax
        add     esi, 4
        cmp     esi, edx
        jae     8f

        ## (Word 2...)
        xor     eax, [esi]
        mov     [esi + 4*ebx], eax
        add     esi, 4
        cmp     esi, edx
        jae     8f

        ## (Word 3...)
        xor     eax, [esi]
        mov     [esi + 4*ebx], eax
        add     esi, 4
        cmp     esi, edx
        jae     8f

        ## Word 4. If the key is /more/ than 6 words long, then we must
        ## apply a substitution here. (A four-word key has no word 4, so
        ## start the next chunk instead.) This time the key word goes into
        ## word 1 of the source and the substituted-but-unrotated result
        ## comes straight out of word 0.
        cmp     ebx, 5
        jb      9b
        cmp     ebx, 7
        jb      0f
        movd    xmm0, eax
        pshufd  xmm0, xmm0, 0x93
        aeskeygenassist xmm1, xmm0, 0
        movd    eax, xmm1
0:      xor     eax, [esi]
        mov     [esi + 4*ebx], eax
        add     esi, 4
        cmp     esi, edx
        jae     8f

        ## (Word 5...)
        cmp     ebx, 6
        jb      9b
        xor     eax, [esi]
        mov     [esi + 4*ebx], eax
        add     esi, 4
        cmp     esi, edx
        jae     8f

        ## (Word 6...)
        cmp     ebx, 7
        jb      9b
        xor     eax, [esi]
        mov     [esi + 4*ebx], eax
        add     esi, 4
        cmp     esi, edx
        jae     8f

        ## (Word 7...)
        cmp     ebx, 8
        jb      9b
        xor     eax, [esi]
        mov     [esi + 4*ebx], eax
        add     esi, 4
        cmp     esi, edx
        jae     8f

        ## Must be done by now.
        jmp     9b

        ## Next job is to construct the decryption keys. The keys for the
        ## first and last rounds don't need to be mangled, but the remaining
        ## ones do -- and they all need to be reordered too.
        ##
        ## The plan of action, then, is to copy the final encryption round's
        ## keys into place first, then to do each of the intermediate rounds
        ## in reverse order, and finally do the first round.
        ##
        ## Do all of the heavy lifting with SSE registers. The order we're
        ## doing this in means that it's OK if we read or write too much, and
        ## there's easily enough buffer space for the over-enthusiastic reads
        ## and writes because the context has space for 32-byte blocks, which
        ## is our maximum and an exact fit for two SSE registers.
        ##
        ## In this phase:
        ##   EBX = block size (words at first, then bytes)
        ##   ECX = number of rounds still to process
        ##   ESI = source, walking down through `w'
        ##   EDI = destination, walking up through `wi'
8:      mov     ecx, [ebp + nr]         # number of rounds
        mov     ebx, [esp + 24]         # block size (in words)
        mov     edx, ecx
        imul    edx, ebx
        lea     edi, [ebp + wi]
        lea     esi, [ebp + 4*edx + w]  # last round's keys
        shl     ebx, 2                  # block size (in bytes now)

        ## Copy the last encryption round's keys.
        movdqu  xmm0, [esi]
        movdqu  [edi], xmm0
        cmp     ebx, 16
        jbe     9f
        movdqu  xmm0, [esi + 16]
        movdqu  [edi + 16], xmm0

        ## Update the loop variables and stop if we've finished.
9:      add     edi, ebx
        sub     esi, ebx
        sub     ecx, 1
        jbe     0f

        ## Do another middle round's keys...
        movdqu  xmm0, [esi]
        aesimc  xmm0, xmm0
        movdqu  [edi], xmm0
        cmp     ebx, 16
        jbe     9b
        movdqu  xmm0, [esi + 16]
        aesimc  xmm0, xmm0
        movdqu  [edi + 16], xmm0
        jmp     9b

        ## Finally do the first encryption round.
0:      movdqu  xmm0, [esi]
        movdqu  [edi], xmm0
        cmp     ebx, 16
        jbe     0f
        movdqu  xmm0, [esi + 16]
        movdqu  [edi + 16], xmm0

        ## If the block size is not exactly four words then we must end-swap
        ## everything. We can use fancy SSE toys for this.
0:      cmp     ebx, 16
        je      0f

        ## Find the byte-reordering table. The MOVDQA must directly follow
        ## the CALL: ECX is the MOVDQA's own address, which is what `.'
        ## denotes in its operand.
        call    where_am_i_ecx
        movdqa  xmm7, [ecx + endswap_tab - .]

        ## Calculate the number of subkey words again. (It's a good job
        ## we've got a fast multiplier.)
        mov     ecx, [ebp + nr]
        add     ecx, 1
        imul    ecx, [esp + 24]         # total keys in words

        ## End-swap the encryption keys. Keep the count in EAX, which
        ## `endswap_block' leaves alone, for the second call.
        mov     eax, ecx
        lea     esi, [ebp + w]
        call    endswap_block

        ## And the decryption keys.
        mov     ecx, eax
        lea     esi, [ebp + wi]
        call    endswap_block

        ## All done.
0:      pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        ret

        .align  16
endswap_block:
        ## End-swap ECX words starting at ESI. The end-swapping table is
        ## already loaded into XMM7; and it's OK to work in 16-byte chunks.
        ## Clobbers ECX, ESI and XMM1. Always processes at least one
        ## chunk, and rounds the count up to a whole number of chunks.
        movdqu  xmm1, [esi]
        pshufb  xmm1, xmm7
        movdqu  [esi], xmm1
        add     esi, 16
        sub     ecx, 4
        ja      endswap_block
        ret

        .size   rijndael_setup_x86_aesni, . - rijndael_setup_x86_aesni

###--------------------------------------------------------------------------
### Encrypting and decrypting blocks.

### rijndael_eblk_x86_aesni
###
### Encrypt one 16-byte block using the encryption key schedule (`w') held in
### the context. Arguments are passed on the stack. Only round counts from
### 10 to 14 inclusive are accepted; anything else goes to `bogus'.
### Clobbers EAX, ECX, EDX, XMM0, XMM1 and XMM7.

        .globl  rijndael_eblk_x86_aesni
        .type   rijndael_eblk_x86_aesni, STT_FUNC
        .align  16
rijndael_eblk_x86_aesni:

        ## On entry, we have:
        ## [esp + 4] points to the context block
        ## [esp + 8] points to the input data block
        ## [esp + 12] points to the output buffer

        ## Find the magic endianness-swapping table. The MOVDQA must
        ## directly follow the CALL: ECX is the MOVDQA's own address, which
        ## is what `.' denotes in its operand.
        call    where_am_i_ecx
        movdqa  xmm7, [ecx + endswap_tab - .]

        ## Load the input block and end-swap it. Also, start loading the
        ## keys.
        mov     eax, [esp + 8]
        movdqu  xmm0, [eax]
        pshufb  xmm0, xmm7
        mov     eax, [esp + 4]
        lea     edx, [eax + w]          # EDX = address of next round key
        mov     eax, [eax + nr]         # EAX = number of rounds

        ## Initial whitening.
        movdqu  xmm1, [edx]
        add     edx, 16
        pxor    xmm0, xmm1

        ## Dispatch to the correct code. Each entry point below runs one
        ## round and falls into the next, so entering at `erN' performs N
        ## rounds in total.
        cmp     eax, 10
        je      er10
        jb      bogus
        cmp     eax, 14
        je      er14
        ja      bogus
        cmp     eax, 12
        je      er12
        jb      er11
        jmp     er13

        .align  2

        ## 14 rounds...
er14:   movdqu  xmm1, [edx]
        add     edx, 16
        aesenc  xmm0, xmm1

        ## 13 rounds...
er13:   movdqu  xmm1, [edx]
        add     edx, 16
        aesenc  xmm0, xmm1

        ## 12 rounds...
er12:   movdqu  xmm1, [edx]
        add     edx, 16
        aesenc  xmm0, xmm1

        ## 11 rounds...
er11:   movdqu  xmm1, [edx]
        add     edx, 16
        aesenc  xmm0, xmm1

        ## 10 rounds... From here on EDX stays put and the remaining keys
        ## are addressed by fixed displacement.
er10:   movdqu  xmm1, [edx]
        aesenc  xmm0, xmm1

        ## 9 rounds...
        movdqu  xmm1, [edx + 16]
        aesenc  xmm0, xmm1

        ## 8 rounds...
        movdqu  xmm1, [edx + 32]
        aesenc  xmm0, xmm1

        ## 7 rounds...
        movdqu  xmm1, [edx + 48]
        aesenc  xmm0, xmm1

        ## 6 rounds...
        movdqu  xmm1, [edx + 64]
        aesenc  xmm0, xmm1

        ## 5 rounds...
        movdqu  xmm1, [edx + 80]
        aesenc  xmm0, xmm1

        ## 4 rounds...
        movdqu  xmm1, [edx + 96]
        aesenc  xmm0, xmm1

        ## 3 rounds...
        movdqu  xmm1, [edx + 112]
        aesenc  xmm0, xmm1

        ## 2 rounds...
        movdqu  xmm1, [edx + 128]
        aesenc  xmm0, xmm1

        ## Final round...
        movdqu  xmm1, [edx + 144]
        aesenclast xmm0, xmm1

        ## Unpermute the ciphertext block and store it.
        pshufb  xmm0, xmm7
        mov     eax, [esp + 12]
        movdqu  [eax], xmm0

        ## And we're done.
        ret

        ## The size is measured from this function's own entry label.
        .size   rijndael_eblk_x86_aesni, . - rijndael_eblk_x86_aesni

### rijndael_dblk_x86_aesni
###
### Decrypt one 16-byte block using the decryption key schedule (`wi') held
### in the context. Arguments are passed on the stack. Only round counts
### from 10 to 14 inclusive are accepted; anything else goes to `bogus'.
### Clobbers EAX, ECX, EDX, XMM0, XMM1 and XMM7.

        .globl  rijndael_dblk_x86_aesni
        .type   rijndael_dblk_x86_aesni, STT_FUNC
        .align  16
rijndael_dblk_x86_aesni:

        ## On entry, we have:
        ## [esp + 4] points to the context block
        ## [esp + 8] points to the input data block
        ## [esp + 12] points to the output buffer

        ## Find the magic endianness-swapping table. The MOVDQA must
        ## directly follow the CALL: ECX is the MOVDQA's own address, which
        ## is what `.' denotes in its operand.
        call    where_am_i_ecx
        movdqa  xmm7, [ecx + endswap_tab - .]

        ## Load the input block and end-swap it. Also, start loading the
        ## keys.
        mov     eax, [esp + 8]
        movdqu  xmm0, [eax]
        pshufb  xmm0, xmm7
        mov     eax, [esp + 4]
        lea     edx, [eax + wi]         # EDX = address of next round key
        mov     eax, [eax + nr]         # EAX = number of rounds

        ## Initial whitening.
        movdqu  xmm1, [edx]
        add     edx, 16
        pxor    xmm0, xmm1

        ## Dispatch to the correct code. Each entry point below runs one
        ## round and falls into the next, so entering at `drN' performs N
        ## rounds in total.
        cmp     eax, 10
        je      dr10
        jb      bogus
        cmp     eax, 14
        je      dr14
        ja      bogus
        cmp     eax, 12
        je      dr12
        jb      dr11
        jmp     dr13

        .align  2

        ## 14 rounds...
dr14:   movdqu  xmm1, [edx]
        add     edx, 16
        aesdec  xmm0, xmm1

        ## 13 rounds...
dr13:   movdqu  xmm1, [edx]
        add     edx, 16
        aesdec  xmm0, xmm1

        ## 12 rounds...
dr12:   movdqu  xmm1, [edx]
        add     edx, 16
        aesdec  xmm0, xmm1

        ## 11 rounds...
dr11:   movdqu  xmm1, [edx]
        add     edx, 16
        aesdec  xmm0, xmm1

        ## 10 rounds... From here on EDX stays put and the remaining keys
        ## are addressed by fixed displacement.
dr10:   movdqu  xmm1, [edx]
        aesdec  xmm0, xmm1

        ## 9 rounds...
        movdqu  xmm1, [edx + 16]
        aesdec  xmm0, xmm1

        ## 8 rounds...
        movdqu  xmm1, [edx + 32]
        aesdec  xmm0, xmm1

        ## 7 rounds...
        movdqu  xmm1, [edx + 48]
        aesdec  xmm0, xmm1

        ## 6 rounds...
        movdqu  xmm1, [edx + 64]
        aesdec  xmm0, xmm1

        ## 5 rounds...
        movdqu  xmm1, [edx + 80]
        aesdec  xmm0, xmm1

        ## 4 rounds...
        movdqu  xmm1, [edx + 96]
        aesdec  xmm0, xmm1

        ## 3 rounds...
        movdqu  xmm1, [edx + 112]
        aesdec  xmm0, xmm1

        ## 2 rounds...
        movdqu  xmm1, [edx + 128]
        aesdec  xmm0, xmm1

        ## Final round...
        movdqu  xmm1, [edx + 144]
        aesdeclast xmm0, xmm1

        ## Unpermute the plaintext block and store it.
        pshufb  xmm0, xmm7
        mov     eax, [esp + 12]
        movdqu  [eax], xmm0

        ## And we're done.
        ret

        .size   rijndael_dblk_x86_aesni, . - rijndael_dblk_x86_aesni

###--------------------------------------------------------------------------
### Random utilities.

### bogus: common landing point for "this can't happen" branches. Never
### returns.

        .align  16
        ## Abort the process because of a programming error. Indirecting
        ## through this point serves several purposes: (a) by CALLing, rather
        ## than branching to, `abort', we can save the return address, which
        ## might at least provide a hint as to what went wrong; (b) we don't
        ## have conditional CALLs (and they'd be big anyway); and (c) we can
        ## write a HLT here as a backstop against `abort' being mad.
        ##
        ## When this code is built position-independent, the PLT entry for
        ## `abort' locates its GOT slot relative to EBX, and none of the
        ## callers has set EBX up, so load the GOT address first. The ADD
        ## must directly follow the CALL, because ECX is the ADD's own
        ## address. Trashing ECX and EBX is harmless: we never return.
bogus:  call    where_am_i_ecx
        add     ecx, offset _GLOBAL_OFFSET_TABLE_
        mov     ebx, ecx
        call    abort@PLT
0:      hlt
        jmp     0b

### where_am_i_ecx: position-independent addressing helper. Takes no
### arguments; result in ECX; no other register is modified.

        .align  16
        ## Return the address of the instruction following the CALL here in
        ## ECX. This is useful for doing position-independent addressing.
        ## On entry the top of the stack is the return address pushed by
        ## the CALL, so simply read it without popping it.
where_am_i_ecx:
        mov     ecx, [esp]
        ret

###--------------------------------------------------------------------------
### Data tables.

### endswap_tab: PSHUFB control mask which reverses the order of the bytes
### within each of the four 32-bit words of an SSE register. Output byte i
### is taken from the input byte whose index is stored at position i. It is
### fetched with MOVDQA, so it has to stay 16-byte aligned.

        .align  16
endswap_tab:
        .byte   3, 2, 1, 0
        .byte   7, 6, 5, 4
        .byte   11, 10, 9, 8
        .byte   15, 14, 13, 12

###----- That's all, folks --------------------------------------------------