-### -*- mode: asm; asm-comment-char: ?# -*-
-###
-### AESNI-based implementation of Rijndael
-###
-### (c) 2015 Straylight/Edgeware
-###
-
-###----- Licensing notice ---------------------------------------------------
-###
-### This file is part of Catacomb.
-###
-### Catacomb is free software; you can redistribute it and/or modify
-### it under the terms of the GNU Library General Public License as
-### published by the Free Software Foundation; either version 2 of the
-### License, or (at your option) any later version.
-###
-### Catacomb is distributed in the hope that it will be useful,
-### but WITHOUT ANY WARRANTY; without even the implied warranty of
-### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-### GNU Library General Public License for more details.
-###
-### You should have received a copy of the GNU Library General Public
-### License along with Catacomb; if not, write to the Free
-### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
-### MA 02111-1307, USA.
-
- .intel_syntax noprefix
- .arch .aes
-
- .globl abort
- .globl rijndael_rcon
-
- .section .text
-
-### The AESNI instructions implement a little-endian version of AES, but
-### Catacomb's internal interface presents as big-endian so as to work better
-### with things like GCM. We therefore maintain the round keys in
-### little-endian form, and have to end-swap blocks in and out.
-###
-### For added amusement, the AESNI instructions don't implement the
-### larger-block versions of Rijndael, so we have to end-swap the keys if
-### we're preparing for one of those.
-
- ## Useful constants. (kbufsz works out to 32*(16 + 1) = 544 bytes.)
- .equ maxrounds, 16 # maximum number of rounds
- .equ maxblksz, 32 # maximum block size, in bytes
- .equ kbufsz, maxblksz*(maxrounds + 1) # size of a key-schedule buffer
-
- ## Context structure: byte offsets from the context pointer.
- .equ nr, 0 # number of rounds (loaded as a 32-bit word)
- .equ w, nr + 4 # encryption key words (kbufsz bytes, at offset 4)
- .equ wi, w + kbufsz # decryption key words (at offset 4 + kbufsz = 548)
-
-###--------------------------------------------------------------------------
-### Key setup.
-
- .globl rijndael_setup_x86_aesni
- .type rijndael_setup_x86_aesni, STT_FUNC
- .align 16
-rijndael_setup_x86_aesni:
-
- ## Four stack arguments; offsets are as seen after the pushes below:
- ## [esp + 20] is the context pointer
- ## [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
- ## [esp + 28] points to the key material, unaligned
- ## [esp + 32] is the size of the key, in words
- ## The key size has already been checked for validity, and the number
- ## of rounds has been computed and stored at `nr'. Our job is only to
- ## fill in the `w' and `wi' vectors.
-
- push ebp # all four are used as working registers below
- push ebx
- push esi
- push edi
-
- ## The initial round key material is taken directly from the input
- ## key, so copy it over.
- mov ebp, [esp + 20] # context base pointer
- mov ebx, [esp + 32] # key size, in words
- mov ecx, ebx # word count for the copy
- mov esi, [esp + 28] # source: the caller's key material
- lea edi, [ebp + w] # destination: start of the `w' vector
- rep movsd # copy ECX 32-bit words
-
- ## Find out other useful things.
- mov edx, [ebp + nr] # number of rounds
- add edx, 1
- imul edx, [esp + 24] # total key size in words
- sub edx, ebx # offset by the key size
-
- ## Find the round constants.
- call where_am_i_ecx # ecx = address of the following `add'
- add ecx, offset _GLOBAL_OFFSET_TABLE_ # ecx = base used for @GOT lookups
- mov ecx, [ecx + rijndael_rcon@GOT] # ecx = address of rijndael_rcon
-
- ## Prepare for the main loop.
- lea esi, [ebp + w]
- mov eax, [esi + 4*ebx - 4] # most recent key word
- lea edx, [esi + 4*edx] # limit, offset by one key expansion
-
- ## Main key expansion loop. The first word of each key-length chunk
- ## needs special treatment.
- ##
- ## This is rather tedious because the Intel `AESKEYGENASSIST'
- ## instruction is very strangely shaped. Firstly, it wants to
- ## operate on vast SSE registers, even though we're data-blocked from
- ## doing more than one operation at a time unless we're doing two key
- ## schedules simultaneously -- and even then we can't do more than
- ## two, because the instruction ignores two of its input words
- ## entirely, and produces two different outputs for each of the other
- ## two. And secondly it insists on taking the magic round constant
- ## as an immediate, so it's kind of annoying if you're not
- ## open-coding the whole thing. It's much easier to leave that as
- ## zero and XOR in the round constant by hand.
-9: movd xmm0, eax
- pshufd xmm0, xmm0, 0x39 # move the word from lane 0 up to lane 3
- aeskeygenassist xmm1, xmm0, 0 # lane 3 = RotWord(SubWord(word))
- pshufd xmm1, xmm1, 0x93 # bring lane 3 back down to lane 0
- movd eax, xmm1
- xor eax, [esi] # combine with the word one key-length back
- xor al, [ecx] # round constant is a single byte
- inc ecx # step to the next round constant
- mov [esi + 4*ebx], eax # store the new key word
- add esi, 4
- cmp esi, edx # reached the limit?
- jae 8f
-
- ## The next three words are simple: new = previous XOR old. (Word 1...)
- xor eax, [esi]
- mov [esi + 4*ebx], eax
- add esi, 4
- cmp esi, edx
- jae 8f
-
- ## (Word 2...)
- xor eax, [esi]
- mov [esi + 4*ebx], eax
- add esi, 4
- cmp esi, edx
- jae 8f
-
- ## (Word 3...)
- xor eax, [esi]
- mov [esi + 4*ebx], eax
- add esi, 4
- cmp esi, edx
- jae 8f
-
- ## Word 4. If the key is /more/ than 6 words long, then we must
- ## apply a substitution here.
- cmp ebx, 5
- jb 9b # four-word key: this chunk is complete
- cmp ebx, 7
- jb 0f # five or six words: no substitution
- movd xmm0, eax
- pshufd xmm0, xmm0, 0x93 # move the word from lane 0 to lane 1
- aeskeygenassist xmm1, xmm0, 0 # lane 0 = SubWord(word), no rotation
- movd eax, xmm1
-0: xor eax, [esi]
- mov [esi + 4*ebx], eax
- add esi, 4
- cmp esi, edx
- jae 8f
-
- ## (Word 5...)
- cmp ebx, 6
- jb 9b # five-word key: this chunk is complete
- xor eax, [esi]
- mov [esi + 4*ebx], eax
- add esi, 4
- cmp esi, edx
- jae 8f
-
- ## (Word 6...)
- cmp ebx, 7
- jb 9b # six-word key: this chunk is complete
- xor eax, [esi]
- mov [esi + 4*ebx], eax
- add esi, 4
- cmp esi, edx
- jae 8f
-
- ## (Word 7...)
- cmp ebx, 8
- jb 9b # seven-word key: this chunk is complete
- xor eax, [esi]
- mov [esi + 4*ebx], eax
- add esi, 4
- cmp esi, edx
- jae 8f
-
- ## Eight words is the longest chunk handled; start the next one.
- jmp 9b
-
- ## Next job is to construct the decryption keys. The keys for the
- ## first and last rounds don't need to be mangled, but the remaining
- ## ones do -- and they all need to be reordered too.
- ##
- ## The plan of action, then, is to copy the final encryption round's
- ## keys into place first, then to do each of the intermediate rounds
- ## in reverse order, and finally do the first round.
- ##
- ## Do all of the heavy lifting with SSE registers. The order we're
- ## doing this in means that it's OK if we read or write too much, and
- ## there's easily enough buffer space for the over-enthusiastic reads
- ## and writes because the context has space for 32-byte blocks, which
- ## is our maximum and an exact fit for two SSE registers.
-8: mov ecx, [ebp + nr] # number of rounds
- mov ebx, [esp + 24] # block size (in words)
- mov edx, ecx
- imul edx, ebx # words preceding the last round's keys
- lea edi, [ebp + wi] # output cursor, moves forwards
- lea esi, [ebp + 4*edx + w] # last round's keys
- shl ebx, 2 # block size (in bytes now)
-
- ## Copy the last encryption round's keys.
- movdqu xmm0, [esi]
- movdqu [edi], xmm0
- cmp ebx, 16
- jbe 9f # one register covers a 16-byte block
- movdqu xmm0, [esi + 16]
- movdqu [edi + 16], xmm0
-
- ## Update the loop variables and stop if we've finished.
-9: add edi, ebx # next output slot
- sub esi, ebx # previous encryption round
- sub ecx, 1
- jbe 0f # only the first round's keys are left
-
- ## Do another middle round's keys...
- movdqu xmm0, [esi]
- aesimc xmm0, xmm0 # the mangling: InvMixColumns
- movdqu [edi], xmm0
- cmp ebx, 16
- jbe 9b
- movdqu xmm0, [esi + 16]
- aesimc xmm0, xmm0
- movdqu [edi + 16], xmm0
- jmp 9b
-
- ## Finally do the first encryption round (a plain copy again).
-0: movdqu xmm0, [esi]
- movdqu [edi], xmm0
- cmp ebx, 16
- jbe 0f
- movdqu xmm0, [esi + 16]
- movdqu [edi + 16], xmm0
-
- ## If the block size is not exactly four words then we must end-swap
- ## everything. We can use fancy SSE toys for this.
-0: cmp ebx, 16
- je 0f # four-word blocks: nothing more to do
-
- ## Find the byte-reordering table.
- call where_am_i_ecx # ecx = address of the following load
- movdqa xmm7, [ecx + endswap_tab - .] # so `- .' leaves the table address
-
- ## Calculate the number of subkey words again. (It's a good job
- ## we've got a fast multiplier.)
- mov ecx, [ebp + nr]
- add ecx, 1
- imul ecx, [esp + 24] # total keys in words
-
- ## End-swap the encryption keys.
- mov eax, ecx # endswap_block counts ECX down; keep a copy
- lea esi, [ebp + w]
- call endswap_block
-
- ## And the decryption keys.
- mov ecx, eax
- lea esi, [ebp + wi]
- call endswap_block
-
- ## All done. Restore the saved registers in reverse order.
-0: pop edi
- pop esi
- pop ebx
- pop ebp
- ret
-
- .align 16
-endswap_block:
- ## Reverse the byte order of every 32-bit word in a buffer, in place.
- ##
- ## In: ESI = start of the buffer
- ## ECX = length of the buffer, in words
- ## XMM7 = the end-swapping shuffle table, loaded by the caller
- ## Out: ESI, ECX, XMM1 and the flags are overwritten
- ##
- ## The buffer is handled in 16-byte chunks, and at least one chunk is
- ## always processed, so a length which isn't a multiple of four words
- ## is rounded up; the caller must be happy with that.
-1: movdqu xmm1, [esi]
- pshufb xmm1, xmm7
- movdqu [esi], xmm1
- lea esi, [esi + 16] # step to the next chunk
- sub ecx, 4 # four fewer words to do
- ja 1b # carry on while some remain
- ret
-
- .size rijndael_setup_x86_aesni, . - rijndael_setup_x86_aesni
-
-###--------------------------------------------------------------------------
-### Encrypting and decrypting blocks.
-
- .globl rijndael_eblk_x86_aesni
- .type rijndael_eblk_x86_aesni, STT_FUNC
- .align 16
-rijndael_eblk_x86_aesni:
-
- ## Encrypt a single 16-byte block using the encryption keys at `w'.
- ##
- ## On entry, we have:
- ## [esp + 4] points to the context block
- ## [esp + 8] points to the input data block
- ## [esp + 12] points to the output buffer
- ##
- ## The input and output blocks are accessed with unaligned moves.
- ## The round count stored in the context must be between 10 and 14
- ## inclusive; anything else is sent to `bogus'. EAX, ECX, EDX, XMM0,
- ## XMM1, XMM7 and the flags are overwritten.
-
- ## Find the magic endianness-swapping table. The load must directly
- ## follow the call: ECX comes back holding the address of the load
- ## instruction itself, which is what `.' denotes in its operand.
- call where_am_i_ecx
- movdqa xmm7, [ecx + endswap_tab - .]
-
- ## Load the input block and end-swap it. Also, start loading the
- ## keys.
- mov eax, [esp + 8]
- movdqu xmm0, [eax]
- pshufb xmm0, xmm7
- mov eax, [esp + 4]
- lea edx, [eax + w] # edx = cursor into the encryption keys
- mov eax, [eax + nr] # eax = number of rounds
-
- ## Initial whitening.
- movdqu xmm1, [edx]
- add edx, 16
- pxor xmm0, xmm1
-
- ## Dispatch to the correct code. Each entry point from `er14' down
- ## to `er11' does one round and advances EDX, so that by `er10' EDX
- ## always points at the key for the first of the last ten rounds.
- cmp eax, 10
- je er10
- jb bogus # fewer than 10 rounds: caller bug
- cmp eax, 14
- je er14
- ja bogus # more than 14 rounds: caller bug
- cmp eax, 12 # only 11, 12 and 13 remain
- je er12
- jb er11
- jmp er13
-
- .align 2
-
- ## 14 rounds...
-er14: movdqu xmm1, [edx]
- add edx, 16
- aesenc xmm0, xmm1
-
- ## 13 rounds...
-er13: movdqu xmm1, [edx]
- add edx, 16
- aesenc xmm0, xmm1
-
- ## 12 rounds...
-er12: movdqu xmm1, [edx]
- add edx, 16
- aesenc xmm0, xmm1
-
- ## 11 rounds...
-er11: movdqu xmm1, [edx]
- add edx, 16
- aesenc xmm0, xmm1
-
- ## 10 rounds... From here on EDX stays put and the keys are
- ## addressed by fixed displacement.
-er10: movdqu xmm1, [edx]
- aesenc xmm0, xmm1
-
- ## 9 rounds...
- movdqu xmm1, [edx + 16]
- aesenc xmm0, xmm1
-
- ## 8 rounds...
- movdqu xmm1, [edx + 32]
- aesenc xmm0, xmm1
-
- ## 7 rounds...
- movdqu xmm1, [edx + 48]
- aesenc xmm0, xmm1
-
- ## 6 rounds...
- movdqu xmm1, [edx + 64]
- aesenc xmm0, xmm1
-
- ## 5 rounds...
- movdqu xmm1, [edx + 80]
- aesenc xmm0, xmm1
-
- ## 4 rounds...
- movdqu xmm1, [edx + 96]
- aesenc xmm0, xmm1
-
- ## 3 rounds...
- movdqu xmm1, [edx + 112]
- aesenc xmm0, xmm1
-
- ## 2 rounds...
- movdqu xmm1, [edx + 128]
- aesenc xmm0, xmm1
-
- ## Final round...
- movdqu xmm1, [edx + 144]
- aesenclast xmm0, xmm1
-
- ## Unpermute the ciphertext block and store it.
- pshufb xmm0, xmm7
- mov eax, [esp + 12]
- movdqu [eax], xmm0
-
- ## And we're done.
- ret
-
- ## The size must be measured from this function's own label.
- .size rijndael_eblk_x86_aesni, . - rijndael_eblk_x86_aesni
-
- .globl rijndael_dblk_x86_aesni
- .type rijndael_dblk_x86_aesni, STT_FUNC
- .align 16
-rijndael_dblk_x86_aesni:
-
- ## Decrypt one 16-byte block using the keys at `wi'. On entry, we have:
- ## [esp + 4] points to the context block
- ## [esp + 8] points to the input data block
- ## [esp + 12] points to the output buffer
-
- ## Find the magic endianness-swapping table.
- call where_am_i_ecx # ecx = address of the following load
- movdqa xmm7, [ecx + endswap_tab - .] # so `- .' leaves the table address
-
- ## Load the input block and end-swap it. Also, start loading the
- ## keys.
- mov eax, [esp + 8]
- movdqu xmm0, [eax] # unaligned load of the input block
- pshufb xmm0, xmm7
- mov eax, [esp + 4]
- lea edx, [eax + wi] # edx = cursor into the decryption keys
- mov eax, [eax + nr] # eax = number of rounds
-
- ## Initial whitening.
- movdqu xmm1, [edx]
- add edx, 16
- pxor xmm0, xmm1
-
- ## Dispatch to the correct code; only 10 to 14 rounds are handled.
- cmp eax, 10
- je dr10
- jb bogus # fewer than 10 rounds: caller bug
- cmp eax, 14
- je dr14
- ja bogus # more than 14 rounds: caller bug
- cmp eax, 12 # only 11, 12 and 13 remain
- je dr12
- jb dr11
- jmp dr13
-
- .align 2
-
- ## 14 rounds...
-dr14: movdqu xmm1, [edx]
- add edx, 16
- aesdec xmm0, xmm1
-
- ## 13 rounds...
-dr13: movdqu xmm1, [edx]
- add edx, 16
- aesdec xmm0, xmm1
-
- ## 12 rounds...
-dr12: movdqu xmm1, [edx]
- add edx, 16
- aesdec xmm0, xmm1
-
- ## 11 rounds...
-dr11: movdqu xmm1, [edx]
- add edx, 16
- aesdec xmm0, xmm1
-
- ## 10 rounds... EDX stays put from here; keys use fixed offsets.
-dr10: movdqu xmm1, [edx]
- aesdec xmm0, xmm1
-
- ## 9 rounds...
- movdqu xmm1, [edx + 16]
- aesdec xmm0, xmm1
-
- ## 8 rounds...
- movdqu xmm1, [edx + 32]
- aesdec xmm0, xmm1
-
- ## 7 rounds...
- movdqu xmm1, [edx + 48]
- aesdec xmm0, xmm1
-
- ## 6 rounds...
- movdqu xmm1, [edx + 64]
- aesdec xmm0, xmm1
-
- ## 5 rounds...
- movdqu xmm1, [edx + 80]
- aesdec xmm0, xmm1
-
- ## 4 rounds...
- movdqu xmm1, [edx + 96]
- aesdec xmm0, xmm1
-
- ## 3 rounds...
- movdqu xmm1, [edx + 112]
- aesdec xmm0, xmm1
-
- ## 2 rounds...
- movdqu xmm1, [edx + 128]
- aesdec xmm0, xmm1
-
- ## Final round...
- movdqu xmm1, [edx + 144]
- aesdeclast xmm0, xmm1
-
- ## Unpermute the plaintext block and store it.
- pshufb xmm0, xmm7
- mov eax, [esp + 12]
- movdqu [eax], xmm0 # unaligned store to the output buffer
-
- ## And we're done.
- ret
-
- .size rijndael_dblk_x86_aesni, . - rijndael_dblk_x86_aesni
-
-###--------------------------------------------------------------------------
-### Random utilities.
-
- .align 16
- ## Abort the process because of a programming error. Indirecting
- ## through this point serves several purposes: (a) by CALLing, rather
- ## than branching to, `abort', we can save the return address, which
- ## might at least provide a hint as to what went wrong; (b) we don't
- ## have conditional CALLs (and they'd be big anyway); and (c) we can
- ## write a HLT here as a backstop against `abort' being mad.
- ##
- ## In position-independent code the PLT entry finds its target
- ## through EBX, which must therefore hold the address of the global
- ## offset table at the time of the call. The code which branches
- ## here never sets EBX up, so do it now. Trashing ECX and the
- ## callee-saved EBX is harmless because we never return.
-bogus: call where_am_i_ecx # ecx = address of the following `add'
- add ecx, offset _GLOBAL_OFFSET_TABLE_
- mov ebx, ecx # ebx = GOT address, as the PLT expects
- call abort@PLT
-0: hlt
- jmp 0b
-
- .align 16
- ## Position-independence helper: hand the caller its own return
- ## address -- that is, the address of whatever instruction follows
- ## its CALL -- in ECX. Nothing else is changed, flags included.
-where_am_i_ecx:
- mov ecx, dword ptr [esp] # the return address is on top of the stack
- ret
-
-###--------------------------------------------------------------------------
-### Data tables.
-
- .align 16
-endswap_tab:
- ## PSHUFB control mask which reverses the bytes within each of the
- ## four 32-bit words of a register. Each `.long' is stored
- ## little-endian, so the bytes land in memory in the order shown.
- .long 0x00010203 # bytes 3, 2, 1, 0
- .long 0x04050607 # bytes 7, 6, 5, 4
- .long 0x08090a0b # bytes 11, 10, 9, 8
- .long 0x0c0d0e0f # bytes 15, 14, 13, 12
-
-###----- That's all, folks --------------------------------------------------