1 ### -*- mode: asm; asm-comment-char: ?# -*-
3 ### AESNI-based implementation of Rijndael
5 ### (c) 2015 Straylight/Edgeware
8 ###----- Licensing notice ---------------------------------------------------
10 ### This file is part of Catacomb.
12 ### Catacomb is free software; you can redistribute it and/or modify
13 ### it under the terms of the GNU Library General Public License as
14 ### published by the Free Software Foundation; either version 2 of the
15 ### License, or (at your option) any later version.
17 ### Catacomb is distributed in the hope that it will be useful,
18 ### but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ### GNU Library General Public License for more details.
22 ### You should have received a copy of the GNU Library General Public
23 ### License along with Catacomb; if not, write to the Free
24 ### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 ### MA 02111-1307, USA.
27 .intel_syntax noprefix
35 ### The AESNI instructions implement a little-endian version of AES, but
36 ### Catacomb's internal interface presents as big-endian so as to work better
37 ### with things like GCM. We therefore maintain the round keys in
38 ### little-endian form, and have to end-swap blocks in and out.
40 ### For added amusement, the AESNI instructions don't implement the
41 ### larger-block versions of Rijndael, so we have to end-swap the keys if
42 ### we're preparing for one of those.
45 .equ maxrounds, 16 # maximum number of rounds
46 .equ maxblksz, 32 # maximum block size, in bytes
47 .equ kbufsz, maxblksz*(maxrounds + 1) # size of a key-schedule buffer, in bytes: one maximum-size round key per round, plus one (32*17 = 544)
50 .equ nr, 0 # context offset of the number of rounds (read as [ebp + nr])
51 .equ w, nr + 4 # context offset of the encryption key words
52 .equ wi, w + kbufsz # context offset of the decryption key words (follows the `w' buffer)
54 ###--------------------------------------------------------------------------
57 .globl rijndael_setup_x86_aesni
58 .type rijndael_setup_x86_aesni, STT_FUNC
60 rijndael_setup_x86_aesni:
62 ## Initial state. We have four arguments:
63 ## [esp + 20] is the context pointer
64 ## [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
65 ## [esp + 28] points to the key material, unaligned
66 ## [esp + 32] is the size of the key, in words
67 ## The key size has already been checked for validity, and the number
68 ## of rounds has been computed. Our job is only to fill in the `w' and `wi' key-word areas of the context.
76 ## The initial round key material is taken directly from the input
77 ## key, so copy it over.
78 mov ebp, [esp + 20] # context base pointer
79 mov ebx, [esp + 32] # key size, in words
85 ## Find out other useful things.
86 mov edx, [ebp + nr] # number of rounds
88 imul edx, [esp + 24] # total key size in words
89 sub edx, ebx # offset by the key size
91 ## Find the round constants.
93 add ecx, offset _GLOBAL_OFFSET_TABLE_ # NOTE(review): presumably ECX was preloaded with the current PC by code not visible here -- confirm
94 mov ecx, [ecx + rijndael_rcon@GOT] # ECX = address of rijndael_rcon, fetched from its GOT entry
96 ## Prepare for the main loop.
98 mov eax, [esi + 4*ebx - 4] # most recent key word
99 lea edx, [esi + 4*edx] # limit, offset by one key expansion
101 ## Main key expansion loop. The first word of each key-length chunk
102 ## needs special treatment.
104 ## This is rather tedious because the Intel `AESKEYGENASSIST'
105 ## instruction is very strangely shaped. Firstly, it wants to
106 ## operate on vast SSE registers, even though we're data-blocked from
107 ## doing more than one operation at a time unless we're doing two key
108 ## schedules simultaneously -- and even then we can't do more than
109 ## two, because the instruction ignores two of its input words
110 ## entirely, and produces two different outputs for each of the other
111 ## two. And secondly it insists on taking the magic round constant
112 ## as an immediate, so it's kind of annoying if you're not
113 ## open-coding the whole thing. It's much easier to leave that as
114 ## zero and XOR in the round constant by hand.
116 pshufd xmm0, xmm0, 0x39 # rotate dwords: d[i] = s[(i + 1) mod 4]
117 aeskeygenassist xmm1, xmm0, 0 # round constant left as zero, as explained above
118 pshufd xmm1, xmm1, 0x93 # rotate dwords the other way: d[i] = s[(i + 3) mod 4]
123 mov [esi + 4*ebx], eax # store EAX as word number EBX of the buffer at ESI
128 ## The next three words are simple...
130 mov [esi + 4*ebx], eax
137 mov [esi + 4*ebx], eax
144 mov [esi + 4*ebx], eax
149 ## Word 4. If the key is /more/ than 6 words long, then we must
150 ## apply a substitution here.
156 pshufd xmm0, xmm0, 0x93 # rotate dwords: d[i] = s[(i + 3) mod 4]
157 aeskeygenassist xmm1, xmm0, 0 # zero round constant again
160 mov [esi + 4*ebx], eax # store EAX as word number EBX of the buffer at ESI
169 mov [esi + 4*ebx], eax
178 mov [esi + 4*ebx], eax
187 mov [esi + 4*ebx], eax
192 ## Must be done by now.
195 ## Next job is to construct the decryption keys. The keys for the
196 ## first and last rounds don't need to be mangled, but the remaining
197 ## ones do -- and they all need to be reordered too.
199 ## The plan of action, then, is to copy the final encryption round's
200 ## keys into place first, then to do each of the intermediate rounds
201 ## in reverse order, and finally do the first round.
203 ## Do all of the heavy lifting with SSE registers. The order we're
204 ## doing this in means that it's OK if we read or write too much, and
205 ## there's easily enough buffer space for the over-enthusiastic reads
206 ## and writes because the context has space for 32-byte blocks, which
207 ## is our maximum and an exact fit for two SSE registers.
208 8: mov ecx, [ebp + nr] # number of rounds
209 mov ebx, [esp + 24] # block size (in words)
213 lea esi, [ebp + 4*edx + w] # last round's keys
214 shl ebx, 2 # block size (in bytes now)
216 ## Copy the last encryption round's keys.
221 movdqu xmm0, [esi + 16] # second 16 bytes; over-copying is harmless, as noted above
222 movdqu [edi + 16], xmm0
224 ## Update the loop variables and stop if we've finished.
230 ## Do another middle round's keys...
236 movdqu xmm0, [esi + 16] # second 16 bytes of this round's keys
238 movdqu [edi + 16], xmm0
241 ## Finally do the first encryption round.
242 0: movdqu xmm0, [esi]
246 movdqu xmm0, [esi + 16]
247 movdqu [edi + 16], xmm0
249 ## If the block size is not exactly four words then we must end-swap
250 ## everything. We can use fancy SSE toys for this.
254 ## Find the byte-reordering table.
256 movdqa xmm7, [ecx + endswap_tab - .] # aligned load; NOTE(review): only addresses endswap_tab if ECX holds the address of `.' here -- the code setting ECX is not visible, confirm
258 ## Calculate the number of subkey words again. (It's a good job
259 ## we've got a fast multiplier.)
262 imul ecx, [esp + 24] # total keys in words
264 ## End-swap the encryption keys.
269 ## And the decryption keys.
283 ## End-swap ECX words starting at ESI. The end-swapping table is
284 ## already loaded into XMM7; and it's OK to work in 16-byte chunks.
293 .size rijndael_setup_x86_aesni, . - rijndael_setup_x86_aesni
295 ###--------------------------------------------------------------------------
296 ### Encrypting and decrypting blocks.
298 .globl rijndael_eblk_x86_aesni
299 .type rijndael_eblk_x86_aesni, STT_FUNC
301 rijndael_eblk_x86_aesni:
303 ## Encrypt a block. On entry, we have:
304 ## [esp + 4] points to the context block
305 ## [esp + 8] points to the input data block
306 ## [esp + 12] points to the output buffer
308 ## Find the magic endianness-swapping table.
310 movdqa xmm7, [ecx + endswap_tab - .] # aligned load; NOTE(review): only addresses endswap_tab if ECX holds the address of `.' here -- the code setting ECX is not visible, confirm
312 ## Load the input block and end-swap it. Also, start loading the
321 ## Initial whitening.
326 ## Dispatch to the correct code.
341 er14: movdqu xmm1, [edx] # er14..er10: entry points, presumably for 14..10 rounds -- confirm against the dispatch code
346 er13: movdqu xmm1, [edx]
351 er12: movdqu xmm1, [edx]
356 er11: movdqu xmm1, [edx]
361 er10: movdqu xmm1, [edx]
365 movdqu xmm1, [edx + 16]
369 movdqu xmm1, [edx + 32]
373 movdqu xmm1, [edx + 48]
377 movdqu xmm1, [edx + 64]
381 movdqu xmm1, [edx + 80]
385 movdqu xmm1, [edx + 96]
389 movdqu xmm1, [edx + 112]
393 movdqu xmm1, [edx + 128]
397 movdqu xmm1, [edx + 144] # key for the final round
398 aesenclast xmm0, xmm1 # final round: block in XMM0, round key in XMM1
400 ## Unpermute the ciphertext block and store it.
408 .size rijndael_eblk_x86_aesni, . - rijndael_eblk_x86_aesni
410 .globl rijndael_dblk_x86_aesni
411 .type rijndael_dblk_x86_aesni, STT_FUNC
413 rijndael_dblk_x86_aesni:
415 ## Decrypt a block. On entry, we have:
416 ## [esp + 4] points to the context block
417 ## [esp + 8] points to the input data block
418 ## [esp + 12] points to the output buffer
420 ## Find the magic endianness-swapping table.
422 movdqa xmm7, [ecx + endswap_tab - .] # aligned load; NOTE(review): only addresses endswap_tab if ECX holds the address of `.' here -- the code setting ECX is not visible, confirm
424 ## Load the input block and end-swap it. Also, start loading the
433 ## Initial whitening.
438 ## Dispatch to the correct code.
453 dr14: movdqu xmm1, [edx] # dr14..dr10: entry points, presumably for 14..10 rounds -- confirm against the dispatch code
458 dr13: movdqu xmm1, [edx]
463 dr12: movdqu xmm1, [edx]
468 dr11: movdqu xmm1, [edx]
473 dr10: movdqu xmm1, [edx]
477 movdqu xmm1, [edx + 16]
481 movdqu xmm1, [edx + 32]
485 movdqu xmm1, [edx + 48]
489 movdqu xmm1, [edx + 64]
493 movdqu xmm1, [edx + 80]
497 movdqu xmm1, [edx + 96]
501 movdqu xmm1, [edx + 112]
505 movdqu xmm1, [edx + 128]
509 movdqu xmm1, [edx + 144] # key for the final round
510 aesdeclast xmm0, xmm1 # final decryption round: block in XMM0, round key in XMM1
512 ## Unpermute the plaintext block and store it.
520 .size rijndael_dblk_x86_aesni, . - rijndael_dblk_x86_aesni
522 ###--------------------------------------------------------------------------
523 ### Random utilities.
526 ## Abort the process because of a programming error. Indirecting
527 ## through this point serves several purposes: (a) by CALLing, rather
528 ## than branching to, `abort', we can save the return address, which
529 ## might at least provide a hint as to what went wrong; (b) we don't
530 ## have conditional CALLs (and they'd be big anyway); and (c) we can
531 ## write a HLT here as a backstop against `abort' being mad.
532 bogus: call abort@PLT # CALL rather than JMP, for reason (a) above; reached through the PLT
537 ## Return the address of the instruction following the CALL here in
538 ## ECX. This is useful for doing position-independent addressing.
543 ###--------------------------------------------------------------------------
553 ###----- That's all, folks --------------------------------------------------