[catacomb] / symm / rijndael-x86-aesni.s

### -*- mode: asm; asm-comment-char: ?# -*-
###
### AESNI-based implementation of Rijndael
###
### (c) 2015 Straylight/Edgeware
###

###----- Licensing notice ---------------------------------------------------
###
### This file is part of Catacomb.
###
### Catacomb is free software; you can redistribute it and/or modify
### it under the terms of the GNU Library General Public License as
### published by the Free Software Foundation; either version 2 of the
### License, or (at your option) any later version.
###
### Catacomb is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
### GNU Library General Public License for more details.
###
### You should have received a copy of the GNU Library General Public
### License along with Catacomb; if not, write to the Free
### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
### MA 02111-1307, USA.

	.intel_syntax noprefix
	.arch .aes

	.globl	abort
	.globl	rijndael_rcon

	.section .text

### The AESNI instructions implement a little-endian version of AES, but
### Catacomb's internal interface presents as big-endian so as to work better
### with things like GCM.  We therefore maintain the round keys in
### little-endian form, and have to end-swap blocks in and out.
###
### For added amusement, the AESNI instructions don't implement the
### larger-block versions of Rijndael, so we have to end-swap the keys if
### we're preparing for one of those.

	## Useful constants.  These size the two key-schedule buffers held in
	## the context: each buffer has room for one maximum-size block of key
	## material per round, plus one more for the initial whitening key.
	.equ maxrounds, 16		# maximum number of rounds
	.equ maxblksz, 32		# maximum block size, in bytes
	.equ kbufsz, maxblksz*(maxrounds + 1) # size of a key-schedule buffer

	## Context structure.  These are byte offsets from the context
	## pointer.  The round count occupies the first four bytes, and the
	## two key-schedule buffers follow it back to back.
	## NOTE(review): this layout presumably mirrors a C structure defined
	## elsewhere -- confirm that the two stay in step.
	.equ nr, 0			# number of rounds
	.equ w, nr + 4			# encryption key words
	.equ wi, w + kbufsz		# decryption key words

###--------------------------------------------------------------------------
### Key setup.

	.globl	rijndael_setup_x86_aesni
	.type	rijndael_setup_x86_aesni, STT_FUNC
	.align	16
rijndael_setup_x86_aesni:

	## Expand a key into the context's encryption (`w') and decryption
	## (`wi') round-key vectors.
	##
	## Initial state.  We have four arguments, all on the stack.  The
	## offsets given here are as seen /after/ the four register pushes
	## below, so on entry each one is 16 bytes lower:
	## [esp + 20] is the context pointer
	## [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
	## [esp + 28] points to the key material, unaligned
	## [esp + 32] is the size of the key, in words
	## The key size has already been checked for validity, and the number
	## of rounds has been computed.  Our job is only to fill in the `w'
	## and `wi' vectors.
	##
	## EBP, EBX, ESI and EDI are saved and restored.  EAX, ECX, EDX, XMM0
	## and XMM1 are overwritten, as is XMM7 when the block size is not
	## four words.

	push	ebp
	push	ebx
	push	esi
	push	edi

	## The initial round key material is taken directly from the input
	## key, so copy it over.  (The copy runs upwards only if the
	## direction flag is clear -- assumed to be guaranteed by the caller.)
	mov	ebp, [esp + 20]		# context base pointer
	mov	ebx, [esp + 32]		# key size, in words
	mov	ecx, ebx
	mov	esi, [esp + 28]
	lea	edi, [ebp + w]
	rep	movsd

	## Find out other useful things.  EDX ends up as the number of key
	## words which still have to be generated: (rounds + 1) blocks' worth
	## in total, less the words just copied from the key.
	mov	edx, [ebp + nr]		# number of rounds
	add	edx, 1
	imul	edx, [esp + 24]		# total key size in words
	sub	edx, ebx		# offset by the key size

	## Find the round constants.  The table is an external symbol, so
	## fetch its address from the GOT.  The ADD must directly follow the
	## CALL: ECX then holds the address of the ADD itself, which is what
	## the GOT-relative fixup expects.
	call	where_am_i_ecx
	add	ecx, offset _GLOBAL_OFFSET_TABLE_
	mov	ecx, [ecx + rijndael_rcon@GOT]

	## Prepare for the main loop.  Throughout the loop, ESI points one
	## key-length behind the word being written, EAX holds the word most
	## recently written, ECX points to the next round constant, and EDX
	## is the value of ESI at which we stop.
	lea	esi, [ebp + w]
	mov	eax, [esi + 4*ebx - 4]	# most recent key word
	lea	edx, [esi + 4*edx]	# limit, offset by one key expansion

	## Main key expansion loop.  The first word of each key-length chunk
	## needs special treatment.
	##
	## This is rather tedious because the Intel `AESKEYGENASSIST'
	## instruction is very strangely shaped.  Firstly, it wants to
	## operate on vast SSE registers, even though we're data-blocked from
	## doing more than one operation at a time unless we're doing two key
	## schedules simultaneously -- and even then we can't do more than
	## two, because the instruction ignores two of its input words
	## entirely, and produces two different outputs for each of the other
	## two.  And secondly it insists on taking the magic round constant
	## as an immediate, so it's kind of annoying if you're not
	## open-coding the whole thing.  It's much easier to leave that as
	## zero and XOR in the round constant by hand.
	##
	## Here: the first shuffle moves the word into lane 3; the
	## instruction leaves the substituted-and-rotated version of that
	## word in lane 3; and the second shuffle brings it back down to
	## lane 0 where MOVD can collect it.
9:	movd	xmm0, eax
	pshufd	xmm0, xmm0, 0x39
	aeskeygenassist xmm1, xmm0, 0
	pshufd	xmm1, xmm1, 0x93
	movd	eax, xmm1
	xor	eax, [esi]		# combine with word one key-length back
	xor	al, [ecx]		# round constant goes into the low byte
	inc	ecx			# step on to the next round constant
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	## The next three words are simple: each is just the previous word
	## XORed with the word one key-length back.
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	## (Word 2...)
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	## (Word 3...)
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	## Word 4.  If the key is only four words long then this is really
	## the first word of the next chunk, so go round again.  If the key
	## is /more/ than 6 words long, then we must apply a substitution
	## here (without the rotation): the shuffle puts the word in lane 1,
	## and the instruction leaves its plain substitution in lane 0.
	cmp	ebx, 5
	jb	9b
	cmp	ebx, 7
	jb	0f
	movd	xmm0, eax
	pshufd	xmm0, xmm0, 0x93
	aeskeygenassist xmm1, xmm0, 0
	movd	eax, xmm1
0:	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	## (Word 5...)  Start a new chunk instead if the key is shorter than
	## six words.
	cmp	ebx, 6
	jb	9b
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	## (Word 6...)  Start a new chunk instead if the key is shorter than
	## seven words.
	cmp	ebx, 7
	jb	9b
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	## (Word 7...)  Start a new chunk instead if the key is shorter than
	## eight words.
	cmp	ebx, 8
	jb	9b
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	## Must be done by now: no key is handled beyond eight words, so the
	## next word begins a fresh chunk.
	jmp	9b

	## Next job is to construct the decryption keys.  The keys for the
	## first and last rounds don't need to be mangled, but the remaining
	## ones do -- and they all need to be reordered too.
	##
	## The plan of action, then, is to copy the final encryption round's
	## keys into place first, then to do each of the intermediate rounds
	## in reverse order, and finally do the first round.
	##
	## Do all of the heavy lifting with SSE registers.  The order we're
	## doing this in means that it's OK if we read or write too much, and
	## there's easily enough buffer space for the over-enthusiastic reads
	## and writes because the context has space for 32-byte blocks, which
	## is our maximum and an exact fit for two SSE registers.
	##
	## From here on, ESI walks backwards through `w', EDI walks forwards
	## through `wi', ECX counts the rounds remaining, and EBX is the
	## block size.
8:	mov	ecx, [ebp + nr]		# number of rounds
	mov	ebx, [esp + 24]		# block size (in words)
	mov	edx, ecx
	imul	edx, ebx
	lea	edi, [ebp + wi]
	lea	esi, [ebp + 4*edx + w]	# last round's keys
	shl	ebx, 2			# block size (in bytes now)

	## Copy the last encryption round's keys.  A second 16-byte chunk is
	## only needed if the block is bigger than one SSE register.
	movdqu	xmm0, [esi]
	movdqu	[edi], xmm0
	cmp	ebx, 16
	jbe	9f
	movdqu	xmm0, [esi + 16]
	movdqu	[edi + 16], xmm0

	## Update the loop variables and stop if we've finished.  We leave
	## the loop when the counter reaches zero, at which point ESI has
	## arrived at the first round's keys.
9:	add	edi, ebx
	sub	esi, ebx
	sub	ecx, 1
	jbe	0f

	## Do another middle round's keys, applying the `AESIMC'
	## transformation on the way through.
	movdqu	xmm0, [esi]
	aesimc	xmm0, xmm0
	movdqu	[edi], xmm0
	cmp	ebx, 16
	jbe	9b
	movdqu	xmm0, [esi + 16]
	aesimc	xmm0, xmm0
	movdqu	[edi + 16], xmm0
	jmp	9b

	## Finally do the first encryption round, which is copied unchanged.
0:	movdqu	xmm0, [esi]
	movdqu	[edi], xmm0
	cmp	ebx, 16
	jbe	0f
	movdqu	xmm0, [esi + 16]
	movdqu	[edi + 16], xmm0

	## If the block size is not exactly four words then we must end-swap
	## everything.  We can use fancy SSE toys for this.
0:	cmp	ebx, 16
	je	0f

	## Find the byte-reordering table.  The load must directly follow the
	## CALL: ECX then holds the address of the load instruction itself,
	## which is what `.' stands for in the operand.
	call	where_am_i_ecx
	movdqa	xmm7, [ecx + endswap_tab - .]

	## Calculate the number of subkey words again.  (It's a good job
	## we've got a fast multiplier.)
	mov	ecx, [ebp + nr]
	add	ecx, 1
	imul	ecx, [esp + 24]		# total keys in words

	## End-swap the encryption keys.  The helper consumes ECX, so keep a
	## copy of the count in EAX for the second call.
	mov	eax, ecx
	lea	esi, [ebp + w]
	call	endswap_block

	## And the decryption keys.
	mov	ecx, eax
	lea	esi, [ebp + wi]
	call	endswap_block

	## All done.  Restore the saved registers in reverse order.
0:	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

	.align	16
	## Reverse the byte order of each 32-bit word in a buffer, in place.
	##
	## On entry, ESI points to the buffer, ECX holds its length in words,
	## and XMM7 already holds the shuffle mask from `endswap_tab'.  The
	## buffer is processed sixteen bytes (four words) at a time, and at
	## least one chunk is always processed, so up to three words beyond
	## the stated length may be swapped as well; callers must be happy
	## with that.  ESI, ECX, XMM1 and the flags are overwritten.
endswap_block:
1:	movdqu	xmm1, [esi]		# fetch the next four words
	pshufb	xmm1, xmm7		# swap the bytes within each word
	movdqu	[esi], xmm1		# and put them back
	lea	esi, [esi + 16]		# move on to the next chunk
	sub	ecx, 4			# four fewer words to do
	ja	1b			# loop while some still remain
	ret

	.size	rijndael_setup_x86_aesni, . - rijndael_setup_x86_aesni

###--------------------------------------------------------------------------
### Encrypting and decrypting blocks.

	.globl	rijndael_eblk_x86_aesni
	.type	rijndael_eblk_x86_aesni, STT_FUNC
	.align	16
rijndael_eblk_x86_aesni:

	## Encrypt a single 16-byte block using the encryption key schedule
	## (`w') in the context.
	##
	## On entry, we have:
	## [esp +  4] points to the context block
	## [esp +  8] points to the input data block
	## [esp + 12] points to the output buffer
	##
	## The round count stored in the context must be between 10 and 14
	## inclusive; anything else is treated as a programming error and
	## ends up at `bogus'.  EAX, ECX, EDX, XMM0, XMM1 and XMM7 are
	## overwritten.

	## Find the magic endianness-swapping table.  The load must directly
	## follow the CALL: ECX then holds the address of the load
	## instruction itself, which is what `.' stands for in the operand.
	call	where_am_i_ecx
	movdqa	xmm7, [ecx + endswap_tab - .]

	## Load the input block and end-swap it.  Also, start loading the
	## keys: EDX points to the key schedule and EAX takes the number of
	## rounds.
	mov	eax, [esp + 8]
	movdqu	xmm0, [eax]
	pshufb	xmm0, xmm7
	mov	eax, [esp + 4]
	lea	edx, [eax + w]
	mov	eax, [eax + nr]

	## Initial whitening.
	movdqu	xmm1, [edx]
	add	edx, 16
	pxor	xmm0, xmm1

	## Dispatch to the correct code.  The rounds below are laid out so
	## that entering at `erN' performs exactly N rounds, the last ten of
	## which are common to every valid round count.
	cmp	eax, 10
	je	er10
	jb	bogus
	cmp	eax, 14
	je	er14
	ja	bogus
	cmp	eax, 12
	je	er12
	jb	er11
	jmp	er13

	.align	2

	## 14 rounds...
er14:	movdqu	xmm1, [edx]
	add	edx, 16
	aesenc	xmm0, xmm1

	## 13 rounds...
er13:	movdqu	xmm1, [edx]
	add	edx, 16
	aesenc	xmm0, xmm1

	## 12 rounds...
er12:	movdqu	xmm1, [edx]
	add	edx, 16
	aesenc	xmm0, xmm1

	## 11 rounds...
er11:	movdqu	xmm1, [edx]
	add	edx, 16
	aesenc	xmm0, xmm1

	## 10 rounds...  From here on the number of rounds left is fixed, so
	## use constant offsets from EDX rather than advancing it.
er10:	movdqu	xmm1, [edx]
	aesenc	xmm0, xmm1

	## 9 rounds...
	movdqu	xmm1, [edx + 16]
	aesenc	xmm0, xmm1

	## 8 rounds...
	movdqu	xmm1, [edx + 32]
	aesenc	xmm0, xmm1

	## 7 rounds...
	movdqu	xmm1, [edx + 48]
	aesenc	xmm0, xmm1

	## 6 rounds...
	movdqu	xmm1, [edx + 64]
	aesenc	xmm0, xmm1

	## 5 rounds...
	movdqu	xmm1, [edx + 80]
	aesenc	xmm0, xmm1

	## 4 rounds...
	movdqu	xmm1, [edx + 96]
	aesenc	xmm0, xmm1

	## 3 rounds...
	movdqu	xmm1, [edx + 112]
	aesenc	xmm0, xmm1

	## 2 rounds...
	movdqu	xmm1, [edx + 128]
	aesenc	xmm0, xmm1

	## Final round...
	movdqu	xmm1, [edx + 144]
	aesenclast xmm0, xmm1

	## Unpermute the ciphertext block and store it.
	pshufb	xmm0, xmm7
	mov	eax, [esp + 12]
	movdqu	[eax], xmm0

	## And we're done.
	ret

	## The size must be measured from this function's own label.
	.size	rijndael_eblk_x86_aesni, . - rijndael_eblk_x86_aesni

	.globl	rijndael_dblk_x86_aesni
	.type	rijndael_dblk_x86_aesni, STT_FUNC
	.align	16
rijndael_dblk_x86_aesni:

	## Decrypt a single 16-byte block using the decryption key schedule
	## (`wi') in the context.  The stack arguments on entry are:
	## [esp +  4] the context block
	## [esp +  8] the input data block
	## [esp + 12] the output buffer
	##
	## A round count outside the range 10 to 14 is sent to `bogus'.
	## EAX, ECX, EDX, XMM0, XMM1 and XMM7 are overwritten.

	## Fetch the byte-shuffling mask.  The load has to sit immediately
	## after the CALL, because ECX comes back holding this very
	## instruction's address, and that is what `.' denotes below.
	call	where_am_i_ecx
	movdqa	xmm7, [ecx + endswap_tab - .]

	## Pick up the input block and put it into the byte order which the
	## AESNI instructions want.
	mov	eax, [esp + 8]
	movdqu	xmm0, [eax]
	pshufb	xmm0, xmm7

	## Locate the key schedule.  EDX is left pointing at the /second/
	## round key, since the first is consumed by the whitening step just
	## below; EAX takes the number of rounds.
	mov	eax, [esp + 4]
	lea	edx, [eax + wi + 16]
	mov	eax, [eax + nr]

	## Whitening: mix the first round key into the block.
	movdqu	xmm1, [edx - 16]
	pxor	xmm0, xmm1

	## Choose the entry point matching the round count.  Split around
	## twelve: the low side sorts out 10 and 11, the high side 13 and 14,
	## and anything outside that range is an error.
	cmp	eax, 12
	je	dr12
	ja	1f
	cmp	eax, 10
	je	dr10
	jb	bogus
	jmp	dr11
1:	cmp	eax, 14
	je	dr14
	ja	bogus
	jmp	dr13

	.align	2

	## The optional leading rounds.  Entering at `drN' performs N rounds
	## in all; each of these steps EDX on past the key it has used.
	.irp	entry, dr14, dr13, dr12, dr11
\entry:	movdqu	xmm1, [edx]
	add	edx, 16
	aesdec	xmm0, xmm1
	.endr

	## The last ten rounds are common to every round count.  EDX stays
	## put from here on, and the keys are picked up at fixed offsets:
	## nine ordinary rounds first...
dr10:
	.irp	koff, 0, 16, 32, 48, 64, 80, 96, 112, 128
	movdqu	xmm1, [edx + \koff]
	aesdec	xmm0, xmm1
	.endr

	## ... and then the final round, which has its own instruction.
	movdqu	xmm1, [edx + 144]
	aesdeclast xmm0, xmm1

	## Put the plaintext block back into the external byte order and
	## write it to the output buffer.
	pshufb	xmm0, xmm7
	mov	eax, [esp + 12]
	movdqu	[eax], xmm0

	ret

	.size	rijndael_dblk_x86_aesni, . - rijndael_dblk_x86_aesni

###--------------------------------------------------------------------------
### Random utilities.

	.align	16
	## Abort the process because of a programming error.  Indirecting
	## through this point serves several purposes: (a) by CALLing, rather
	## than branching to, `abort', we can save the return address, which
	## might at least provide a hint as to what went wrong; (b) we don't
	## have conditional CALLs (and they'd be big anyway); and (c) we can
	## write a HLT here as a backstop against `abort' being mad.
	##
	## The call to `abort' goes through the PLT, and in i386
	## position-independent code a PLT stub finds its GOT slot relative
	## to EBX.  The code which branches here never sets EBX up, so load
	## the GOT address into it first.  The ADD must directly follow the
	## CALL to `where_am_i_ecx', because ECX then holds the address of
	## the ADD itself, which is what the GOT-relative fixup expects.
	## Trashing ECX and EBX is harmless: `abort' does not return.
bogus:	call	where_am_i_ecx
	add	ecx, offset _GLOBAL_OFFSET_TABLE_
	mov	ebx, ecx		# PLT stubs want the GOT address here
	call	abort@PLT
0:	hlt
	jmp	0b

	.align	16
	## Return the address of the instruction following the CALL here in
	## ECX.  This is useful for doing position-independent addressing:
	## the caller can add an assembly-time offset to ECX to reach nearby
	## code or data wherever the object has been loaded.
	##
	## The CALL pushed its return address, so that is what is on top of
	## the stack.  It is copied rather than popped, so that the RET
	## still finds it.  Only ECX is changed.
where_am_i_ecx:
	mov	ecx, [esp]
	ret

###--------------------------------------------------------------------------
### Data tables.

	.align	16
	## Shuffle mask for `PSHUFB' which reverses the order of the bytes
	## within each of the four 32-bit words of an SSE register: output
	## byte I is taken from the input byte whose index is entry I here.
	## The table is loaded with `MOVDQA', so it has to stay aligned on a
	## 16-byte boundary.
endswap_tab:
	.byte	 3,  2,  1,  0,  7,  6,  5,  4
	.byte	11, 10,  9,  8, 15, 14, 13, 12

###----- That's all, folks --------------------------------------------------
Commit	Line	Data
226639f3 MW	1	### -- mode: asm; asm-comment-char: ?# --
	2	###
	3	### AESNI-based implementation of Rijndael
	4	###
	5	### (c) 2015 Straylight/Edgeware
	6	###
	7
	8	###----- Licensing notice ---------------------------------------------------
	9	###
	10	### This file is part of Catacomb.
	11	###
	12	### Catacomb is free software; you can redistribute it and/or modify
	13	### it under the terms of the GNU Library General Public License as
	14	### published by the Free Software Foundation; either version 2 of the
	15	### License, or (at your option) any later version.
	16	###
	17	### Catacomb is distributed in the hope that it will be useful,
	18	### but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	### GNU Library General Public License for more details.
	21	###
	22	### You should have received a copy of the GNU Library General Public
	23	### License along with Catacomb; if not, write to the Free
	24	### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	### MA 02111-1307, USA.
	26
	27	.intel_syntax noprefix
	28	.arch .aes
	29
	30	.globl abort
	31	.globl rijndael_rcon
	32
	33	.section .text
	34
	35	### The AESNI instructions implement a little-endian version of AES, but
	36	### Catacomb's internal interface presents as big-endian so as to work better
	37	### with things like GCM. We therefore maintain the round keys in
	38	### little-endian form, and have to end-swap blocks in and out.
	39	###
	40	### For added amusement, the AESNI instructions don't implement the
	41	### larger-block versions of Rijndael, so we have to end-swap the keys if
	42	### we're preparing for one of those.
	43
	44	## Useful constants.
	45	.equ maxrounds, 16 # maximum number of rounds
	46	.equ maxblksz, 32 # maximum block size, in bytes
	47	.equ kbufsz, maxblksz*(maxrounds + 1) # size of a key-schedule buffer
	48
	49	## Context structure.
	50	.equ nr, 0 # number of rounds
	51	.equ w, nr + 4 # encryption key words
	52	.equ wi, w + kbufsz # decryption key words
	53
	54	###--------------------------------------------------------------------------
	55	### Key setup.
	56
	57	.globl rijndael_setup_x86_aesni
	58	.type rijndael_setup_x86_aesni, STT_FUNC
	59	.align 16
	60	rijndael_setup_x86_aesni:
	61
	62	## Initial state. We have four arguments:
	63	## [esp + 20] is the context pointer
	64	## [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
65	## [esp + 28] points to the key material, unaligned
66	## [esp + 32] is the size of the key, in words
67	## The key size has already been checked for validity, and the number
68	## of rounds has been computed. Our job is only to fill in the `w'
69	## and `wi' vectors.
70
71	push ebp
72	push ebx
73	push esi
74	push edi
75
76	## The initial round key material is taken directly from the input
77	## key, so copy it over.
78	mov ebp, [esp + 20] # context base pointer
79	mov ebx, [esp + 32] # key size, in words
80	mov ecx, ebx
81	mov esi, [esp + 28]
82	lea edi, [ebp + w]
83	rep movsd
84
85	## Find out other useful things.
86	mov edx, [ebp + nr] # number of rounds
87	add edx, 1
88	imul edx, [esp + 24] # total key size in words
89	sub edx, ebx # offset by the key size
90
91	## Find the round constants.
92	call where_am_i_ecx
93	add ecx, offset _GLOBAL_OFFSET_TABLE_
94	mov ecx, [ecx + rijndael_rcon@GOT]
95
96	## Prepare for the main loop.
97	lea esi, [ebp + w]
98	mov eax, [esi + 4*ebx - 4] # most recent key word
99	lea edx, [esi + 4*edx] # limit, offset by one key expansion
100
101	## Main key expansion loop. The first word of each key-length chunk
102	## needs special treatment.
103	##
104	## This is rather tedious because the Intel `AESKEYGENASSIST'
105	## instruction is very strangely shaped. Firstly, it wants to
106	## operate on vast SSE registers, even though we're data-blocked from
107	## doing more than operation at a time unless we're doing two key
108	## schedules simultaneously -- and even then we can't do more than
109	## two, because the instruction ignores two of its input words
110	## entirely, and produces two different outputs for each of the other
111	## two. And secondly it insists on taking the magic round constant
112	## as an immediate, so it's kind of annoying if you're not
113	## open-coding the whole thing. It's much easier to leave that as
114	## zero and XOR in the round constant by hand.
115	9: movd xmm0, eax
116	pshufd xmm0, xmm0, 0x39
117	aeskeygenassist xmm1, xmm0, 0
118	pshufd xmm1, xmm1, 0x93
119	movd eax, xmm1
120	xor eax, [esi]
121	xor al, [ecx]
122	inc ecx
123	mov [esi + 4*ebx], eax
124	add esi, 4
125	cmp esi, edx
126	jae 8f
127
128	## The next three words are simple...
129	xor eax, [esi]
130	mov [esi + 4*ebx], eax
131	add esi, 4
132	cmp esi, edx
133	jae 8f
134
135	## (Word 2...)
136	xor eax, [esi]
137	mov [esi + 4*ebx], eax
138	add esi, 4
139	cmp esi, edx
140	jae 8f
141
142	## (Word 3...)
143	xor eax, [esi]
144	mov [esi + 4*ebx], eax
145	add esi, 4
146	cmp esi, edx
147	jae 8f
148
149	## Word 4. If the key is /more/ than 6 words long, then we must
150	## apply a substitution here.
151	cmp ebx, 5
152	jb 9b
153	cmp ebx, 7
154	jb 0f
155	movd xmm0, eax
156	pshufd xmm0, xmm0, 0x93
157	aeskeygenassist xmm1, xmm0, 0
158	movd eax, xmm1
159	0: xor eax, [esi]
160	mov [esi + 4*ebx], eax
161	add esi, 4
162	cmp esi, edx
163	jae 8f
164
165	## (Word 5...)
166	cmp ebx, 6
167	jb 9b
168	xor eax, [esi]
169	mov [esi + 4*ebx], eax
170	add esi, 4
171	cmp esi, edx
172	jae 8f
173
174	## (Word 6...)
175	cmp ebx, 7
176	jb 9b
177	xor eax, [esi]
178	mov [esi + 4*ebx], eax
179	add esi, 4
180	cmp esi, edx
181	jae 8f
182
183	## (Word 7...)
184	cmp ebx, 8
185	jb 9b
186	xor eax, [esi]
187	mov [esi + 4*ebx], eax
188	add esi, 4
189	cmp esi, edx
190	jae 8f
191
192	## Must be done by now.
193	jmp 9b
194
195	## Next job is to construct the decryption keys. The keys for the
196	## first and last rounds don't need to be mangled, but the remaining
197	## ones do -- and they all need to be reordered too.
198	##
199	## The plan of action, then, is to copy the final encryption round's
200	## keys into place first, then to do each of the intermediate rounds
201	## in reverse order, and finally do the first round.
202	##
203	## Do all of the heavy lifting with SSE registers. The order we're
204	## doing this in means that it's OK if we read or write too much, and
205	## there's easily enough buffer space for the over-enthusiastic reads
206	## and writes because the context has space for 32-byte blocks, which
207	## is our maximum and an exact fit for two SSE registers.
208	8: mov ecx, [ebp + nr] # number of rounds
209	mov ebx, [esp + 24] # block size (in words)
210	mov edx, ecx
211	imul edx, ebx
212	lea edi, [ebp + wi]
213	lea esi, [ebp + 4*edx + w] # last round's keys
214	shl ebx, 2 # block size (in bytes now)
215
216	## Copy the last encryption round's keys.
217	movdqu xmm0, [esi]
218	movdqu [edi], xmm0
219	cmp ebx, 16
220	jbe 9f
221	movdqu xmm0, [esi + 16]
222	movdqu [edi + 16], xmm0
223
224	## Update the loop variables and stop if we've finished.
225	9: add edi, ebx
226	sub esi, ebx
227	sub ecx, 1
228	jbe 0f
229
230	## Do another middle round's keys...
231	movdqu xmm0, [esi]
232	aesimc xmm0, xmm0
233	movdqu [edi], xmm0
234	cmp ebx, 16
235	jbe 9b
236	movdqu xmm0, [esi + 16]
237	aesimc xmm0, xmm0
238	movdqu [edi + 16], xmm0
239	jmp 9b
240
241	## Finally do the first encryption round.
242	0: movdqu xmm0, [esi]
243	movdqu [edi], xmm0
244	cmp ebx, 16
245	jbe 0f
246	movdqu xmm0, [esi + 16]
247	movdqu [edi + 16], xmm0
248
249	## If the block size is not exactly four words then we must end-swap
250	## everything. We can use fancy SSE toys for this.
251	0: cmp ebx, 16
252	je 0f
253
254	## Find the byte-reordering table.
255	call where_am_i_ecx
256	movdqa xmm7, [ecx + endswap_tab - .]
257
258	## Calculate the number of subkey words again. (It's a good job
259	## we've got a fast multiplier.)
260	mov ecx, [ebp + nr]
261	add ecx, 1
262	imul ecx, [esp + 24] # total keys in words
263
264	## End-swap the encryption keys.
265	mov eax, ecx
266	lea esi, [ebp + w]
267	call endswap_block
268
269	## And the decryption keys.
270	mov ecx, eax
271	lea esi, [ebp + wi]
272	call endswap_block
273
274	## All done.
275	0: pop edi
276	pop esi
277	pop ebx
278	pop ebp
279	ret
280
281	.align 16
282	endswap_block:
283	## End-swap ECX words starting at ESI. The end-swapping table is
284	## already loaded into XMM7; and it's OK to work in 16-byte chunks.
285	movdqu xmm1, [esi]
286	pshufb xmm1, xmm7
287	movdqu [esi], xmm1
288	add esi, 16
289	sub ecx, 4
290	ja endswap_block
291	ret
292
293	.size rijndael_setup_x86_aesni, . - rijndael_setup_x86_aesni
294
295	###--------------------------------------------------------------------------
296	### Encrypting and decrypting blocks.
297
298	.globl rijndael_eblk_x86_aesni
299	.type rijndael_eblk_x86_aesni, STT_FUNC
300	.align 16
301	rijndael_eblk_x86_aesni:
302
303	## On entry, we have:
304	## [esp + 4] points to the context block
305	## [esp + 8] points to the input data block
306	## [esp + 12] points to the output buffer
307
308	## Find the magic endianness-swapping table.
309	call where_am_i_ecx
310	movdqa xmm7, [ecx + endswap_tab - .]
311
312	## Load the input block and end-swap it. Also, start loading the
313	## keys.
314	mov eax, [esp + 8]
315	movdqu xmm0, [eax]
316	pshufb xmm0, xmm7
317	mov eax, [esp + 4]
318	lea edx, [eax + w]
319	mov eax, [eax + nr]
320
321	## Initial whitening.
322	movdqu xmm1, [edx]
323	add edx, 16
324	pxor xmm0, xmm1
325
326	## Dispatch to the correct code.
327	cmp eax, 10
328	je er10
329	jb bogus
330	cmp eax, 14
331	je er14
332	ja bogus
333	cmp eax, 12
334	je er12
335	jb er11
336	jmp er13
337
338	.align 2
339
340	## 14 rounds...
341	er14: movdqu xmm1, [edx]
342	add edx, 16
343	aesenc xmm0, xmm1
344
345	## 13 rounds...
346	er13: movdqu xmm1, [edx]
347	add edx, 16
348	aesenc xmm0, xmm1
349
350	## 12 rounds...
351	er12: movdqu xmm1, [edx]
352	add edx, 16
353	aesenc xmm0, xmm1
354
355	## 11 rounds...
356	er11: movdqu xmm1, [edx]
357	add edx, 16
358	aesenc xmm0, xmm1
359
360	## 10 rounds...
361	er10: movdqu xmm1, [edx]
362	aesenc xmm0, xmm1
363
364	## 9 rounds...
365	movdqu xmm1, [edx + 16]
366	aesenc xmm0, xmm1
367
368	## 8 rounds...
369	movdqu xmm1, [edx + 32]
370	aesenc xmm0, xmm1
371
372	## 7 rounds...
373	movdqu xmm1, [edx + 48]
374	aesenc xmm0, xmm1
375
376	## 6 rounds...
377	movdqu xmm1, [edx + 64]
378	aesenc xmm0, xmm1
379
380	## 5 rounds...
381	movdqu xmm1, [edx + 80]
382	aesenc xmm0, xmm1
383
384	## 4 rounds...
385	movdqu xmm1, [edx + 96]
386	aesenc xmm0, xmm1
387
388	## 3 rounds...
389	movdqu xmm1, [edx + 112]
390	aesenc xmm0, xmm1
391
392	## 2 rounds...
393	movdqu xmm1, [edx + 128]
394	aesenc xmm0, xmm1
395
396	## Final round...
397	movdqu xmm1, [edx + 144]
398	aesenclast xmm0, xmm1
399
400	## Unpermute the ciphertext block and store it.
401	pshufb xmm0, xmm7
402	mov eax, [esp + 12]
403	movdqu [eax], xmm0
404
405	## And we're done.
406	ret
407
408	.size rijndael_eblk_x86_aesni, . - rijndael_dblk_x86_aesni
409
410	.globl rijndael_dblk_x86_aesni
411	.type rijndael_dblk_x86_aesni, STT_FUNC
412	.align 16
413	rijndael_dblk_x86_aesni:
414
415	## On entry, we have:
416	## [esp + 4] points to the context block
417	## [esp + 8] points to the input data block
418	## [esp + 12] points to the output buffer
419
420	## Find the magic endianness-swapping table.
421	call where_am_i_ecx
422	movdqa xmm7, [ecx + endswap_tab - .]
423
424	## Load the input block and end-swap it. Also, start loading the
425	## keys.
426	mov eax, [esp + 8]
427	movdqu xmm0, [eax]
428	pshufb xmm0, xmm7
429	mov eax, [esp + 4]
430	lea edx, [eax + wi]
431	mov eax, [eax + nr]
432
433	## Initial whitening.
434	movdqu xmm1, [edx]
435	add edx, 16
436	pxor xmm0, xmm1
437
438	## Dispatch to the correct code.
439	cmp eax, 10
440	je dr10
441	jb bogus
442	cmp eax, 14
443	je dr14
444	ja bogus
445	cmp eax, 12
446	je dr12
447	jb dr11
448	jmp dr13
449
450	.align 2
451
452	## 14 rounds...
453	dr14: movdqu xmm1, [edx]
454	add edx, 16
455	aesdec xmm0, xmm1
456
457	## 13 rounds...
458	dr13: movdqu xmm1, [edx]
459	add edx, 16
460	aesdec xmm0, xmm1
461
462	## 12 rounds...
463	dr12: movdqu xmm1, [edx]
464	add edx, 16
465	aesdec xmm0, xmm1
466
467	## 11 rounds...
468	dr11: movdqu xmm1, [edx]
469	add edx, 16
470	aesdec xmm0, xmm1
471
472	## 10 rounds...
473	dr10: movdqu xmm1, [edx]
474	aesdec xmm0, xmm1
475
476	## 9 rounds...
477	movdqu xmm1, [edx + 16]
478	aesdec xmm0, xmm1
479
480	## 8 rounds...
481	movdqu xmm1, [edx + 32]
482	aesdec xmm0, xmm1
483
484	## 7 rounds...
485	movdqu xmm1, [edx + 48]
486	aesdec xmm0, xmm1
487
488	## 6 rounds...
489	movdqu xmm1, [edx + 64]
490	aesdec xmm0, xmm1
491
492	## 5 rounds...
493	movdqu xmm1, [edx + 80]
494	aesdec xmm0, xmm1
495
496	## 4 rounds...
497	movdqu xmm1, [edx + 96]
498	aesdec xmm0, xmm1
499
500	## 3 rounds...
501	movdqu xmm1, [edx + 112]
502	aesdec xmm0, xmm1
503
504	## 2 rounds...
505	movdqu xmm1, [edx + 128]
506	aesdec xmm0, xmm1
507
508	## Final round...
509	movdqu xmm1, [edx + 144]
510	aesdeclast xmm0, xmm1
511
512	## Unpermute the ciphertext block and store it.
513	pshufb xmm0, xmm7
514	mov eax, [esp + 12]
515	movdqu [eax], xmm0
516
517	## And we're done.
518	ret
519
520	.size rijndael_dblk_x86_aesni, . - rijndael_dblk_x86_aesni
521
522	###--------------------------------------------------------------------------
523	### Random utilities.
524
525	.align 16
526	## Abort the process because of a programming error. Indirecting
527	## through this point serves several purposes: (a) by CALLing, rather
528	## than branching to, `abort', we can save the return address, which
529	## might at least provide a hint as to what went wrong; (b) we don't
530	## have conditional CALLs (and they'd be big anyway); and (c) we can
531	## write a HLT here as a backstop against `abort' being mad.
532	bogus: call abort@PLT
533	0: hlt
534	jmp 0b
535
536	.align 16
537	## Return the address of the instruction following the CALL here in
538	## ECX. This is useful for doing position-independent addressing.
539	where_am_i_ecx:
540	mov ecx, [esp]
541	ret
542
543	###--------------------------------------------------------------------------
544	### Data tables.
545
546	.align 16
547	endswap_tab:
548	.byte 3, 2, 1, 0
549	.byte 7, 6, 5, 4
550	.byte 11, 10, 9, 8
551	.byte 15, 14, 13, 12
552
553	###----- That's all, folks --------------------------------------------------