[catacomb] / symm / rijndael-x86-aesni.S

/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// AESNI-based implementation of Rijndael
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// External definitions.

	.globl	F(abort)
	.globl	F(rijndael_rcon)

///--------------------------------------------------------------------------
/// Main code.

	.arch	.aes
	.section .text

/// The AESNI instructions implement a little-endian version of AES, but
/// Catacomb's internal interface presents as big-endian so as to work better
/// with things like GCM.  We therefore maintain the round keys in
/// little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the AESNI instructions don't implement the
/// larger-block versions of Rijndael, so we have to end-swap the keys if
/// we're preparing for one of those.

	// Useful constants.  These size the key-schedule buffers in the
	// context structure below.
	.equ	maxrounds, 16		// maximum number of rounds
	.equ	maxblksz, 32		// maximum block size, in bytes
	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
					// in bytes (32*17 = 544): one
					// maximum-size block per round, plus
					// one more for the initial whitening
					// key

	// Context structure.  These are byte offsets from the context
	// pointer.  Neither key buffer starts at a multiple of 16 (`w' is at
	// 4 and `wi' at 548), and the code in this file only ever touches
	// them with unaligned (`movdqu') or 32-bit accesses.
	.equ	nr, 0			// number of rounds (32-bit word)
	.equ	w, nr + 4		// encryption key words
	.equ	wi, w + kbufsz		// decryption key words

///--------------------------------------------------------------------------
/// Key setup.

FUNC(rijndael_setup_x86_aesni)

	// Fill in the encryption (`w') and decryption (`wi') key schedules
	// of a Rijndael context.
	//
	// Arguments are passed on the stack.  Once the four callee-saved
	// registers below have been pushed, we have:
	// [esp + 20] is the context pointer
	// [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
	// [esp + 28] points to the key material, unaligned
	// [esp + 32] is the size of the key, in words
	// The key size has already been checked for validity, and the number
	// of rounds has been computed.  Our job is only to fill in the `w'
	// and `wi' vectors.
	//
	// EBP, EBX, ESI and EDI are saved and restored.  EAX, ECX, EDX, XMM0,
	// XMM1 and the flags are clobbered, and so is XMM7 if the block size
	// is not four words.  The initial copy uses `rep movsd' without
	// issuing CLD first, so the direction flag must be clear on entry.

	push	ebp
	push	ebx
	push	esi
	push	edi

	// The initial round key material is taken directly from the input
	// key, so copy it over.
	mov	ebp, [esp + 20]		// context base pointer
	mov	ebx, [esp + 32]		// key size, in words
	mov	ecx, ebx
	mov	esi, [esp + 28]
	lea	edi, [ebp + w]
	rep	movsd

	// Find out other useful things.
	mov	edx, [ebp + nr]		// number of rounds
	add	edx, 1
	imul	edx, [esp + 24]		// total key size in words
	sub	edx, ebx		// offset by the key size

	// Find the round constants.  The symbol is named through `F', just
	// as in its `.globl' declaration and in the reference to `abort', so
	// that the reference and the declaration always agree on the
	// (possibly decorated) symbol name.
	ldgot	ecx
	leaext	ecx, F(rijndael_rcon), ecx

	// Prepare for the main loop.
	lea	esi, [ebp + w]
	mov	eax, [esi + 4*ebx - 4]	// most recent key word
	lea	edx, [esi + 4*edx]	// limit, offset by one key expansion

	// Main key expansion loop.  The first word of each key-length chunk
	// needs special treatment.
	//
	// Throughout the loop:
	//   EAX is the most recently generated key word;
	//   EBX is the key size in words;
	//   ECX points to the next round-constant byte;
	//   ESI points to the word one key-length behind the word about to
	//       be generated (which is stored at [esi + 4*ebx]); and
	//   EDX is the value of ESI at which the schedule is complete.
	//
	// This is rather tedious because the Intel `AESKEYGENASSIST'
	// instruction is very strangely shaped.  Firstly, it wants to
	// operate on vast SSE registers, even though we're data-blocked from
	// doing more than one operation at a time unless we're doing two key
	// schedules simultaneously -- and even then we can't do more than
	// two, because the instruction ignores two of its input words
	// entirely, and produces two different outputs for each of the other
	// two.  And secondly it insists on taking the magic round constant
	// as an immediate, so it's kind of annoying if you're not
	// open-coding the whole thing.  It's much easier to leave that as
	// zero and XOR in the round constant by hand.
	//
	// Here we want the rotated-and-substituted form of the word, which
	// the instruction delivers in its top output word, computed from its
	// top input word: so shuffle EAX up to the top on the way in, and
	// shuffle the result back down to the bottom on the way out.
9:	movd	xmm0, eax
	pshufd	xmm0, xmm0, 0x39	// low word -> top word
	aeskeygenassist xmm1, xmm0, 0
	pshufd	xmm1, xmm1, 0x93	// top word -> low word
	movd	eax, xmm1
	xor	eax, [esi]
	xor	al, [ecx]		// round constant: one byte per chunk
	inc	ecx
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// The next three words are simple...
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// (Word 2...)
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// (Word 3...)
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// Word 4.  If the key is /more/ than 6 words long, then we must
	// apply a substitution here.  This time we want the substituted but
	// unrotated form, which the instruction delivers in its bottom
	// output word, computed from input word 1: so only an inbound
	// shuffle is needed.
	cmp	ebx, 5
	jb	9b			// four-word key: start the next chunk
	cmp	ebx, 7
	jb	0f			// five or six words: no substitution
	movd	xmm0, eax
	pshufd	xmm0, xmm0, 0x93	// low word -> word 1
	aeskeygenassist xmm1, xmm0, 0
	movd	eax, xmm1
0:	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// (Word 5...)
	cmp	ebx, 6
	jb	9b
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// (Word 6...)
	cmp	ebx, 7
	jb	9b
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// (Word 7...)
	cmp	ebx, 8
	jb	9b
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// Must be done by now.
	jmp	9b

	// Next job is to construct the decryption keys.  The keys for the
	// first and last rounds don't need to be mangled, but the remaining
	// ones do -- and they all need to be reordered too.
	//
	// The plan of action, then, is to copy the final encryption round's
	// keys into place first, then to do each of the intermediate rounds
	// in reverse order, and finally do the first round.
	//
	// Do all of the heavy lifting with SSE registers.  The order we're
	// doing this in means that it's OK if we read or write too much, and
	// there's easily enough buffer space for the over-enthusiastic reads
	// and writes because the context has space for 32-byte blocks, which
	// is our maximum and an exact fit for two SSE registers.
	//
	// From here on: ECX counts the rounds still to do; EBX is the block
	// size in bytes; ESI walks backwards through `w'; and EDI walks
	// forwards through `wi'.
8:	mov	ecx, [ebp + nr]		// number of rounds
	mov	ebx, [esp + 24]		// block size (in words)
	mov	edx, ecx
	imul	edx, ebx
	lea	edi, [ebp + wi]
	lea	esi, [ebp + 4*edx + w]	// last round's keys
	shl	ebx, 2			// block size (in bytes now)

	// Copy the last encryption round's keys.
	movdqu	xmm0, [esi]
	movdqu	[edi], xmm0
	cmp	ebx, 16
	jbe	9f
	movdqu	xmm0, [esi + 16]
	movdqu	[edi + 16], xmm0

	// Update the loop variables and stop if we've finished.
9:	add	edi, ebx
	sub	esi, ebx
	sub	ecx, 1
	jbe	0f			// leave once the count reaches zero

	// Do another middle round's keys...
	movdqu	xmm0, [esi]
	aesimc	xmm0, xmm0
	movdqu	[edi], xmm0
	cmp	ebx, 16
	jbe	9b
	movdqu	xmm0, [esi + 16]
	aesimc	xmm0, xmm0
	movdqu	[edi + 16], xmm0
	jmp	9b

	// Finally do the first encryption round.
0:	movdqu	xmm0, [esi]
	movdqu	[edi], xmm0
	cmp	ebx, 16
	jbe	0f
	movdqu	xmm0, [esi + 16]
	movdqu	[edi + 16], xmm0

	// If the block size is not exactly four words then we must end-swap
	// everything.  We can use fancy SSE toys for this.
0:	cmp	ebx, 16
	je	0f

	// Find the byte-reordering table.
	ldgot	ecx
	movdqa	xmm7, [INTADDR(endswap_tab, ecx)]

	// Calculate the number of subkey words again.  (It's a good job
	// we've got a fast multiplier.)
	mov	ecx, [ebp + nr]
	add	ecx, 1
	imul	ecx, [esp + 24]		// total keys in words

	// End-swap the encryption keys.  Keep a copy of the word count in
	// EAX, because `endswap_block' destroys ECX.
	mov	eax, ecx
	lea	esi, [ebp + w]
	call	endswap_block

	// And the decryption keys.
	mov	ecx, eax
	lea	esi, [ebp + wi]
	call	endswap_block

	// All done.
0:	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

	.align	16
endswap_block:
	// End-swap ECX words starting at ESI.  The end-swapping table is
	// already loaded into XMM7; and it's OK to work in 16-byte chunks.
	//
	// At least one chunk is always processed, and a word count which
	// isn't a multiple of four is rounded up to a whole number of
	// chunks.  Clobbers ECX, ESI, XMM1 and the flags.
	movdqu	xmm1, [esi]
	pshufb	xmm1, xmm7
	movdqu	[esi], xmm1
	add	esi, 16
	sub	ecx, 4
	ja	endswap_block		// loop while words remain (no borrow)
	ret

ENDFUNC

///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.

FUNC(rijndael_eblk_x86_aesni)

	// Encrypt a single 16-byte block using the encryption round keys
	// stored at `w' in the context.
	//
	// On entry, we have:
	// [esp +  4] points to the context block
	// [esp +  8] points to the input data block
	// [esp + 12] points to the output buffer
	//
	// The input and output blocks are accessed with unaligned moves.
	// The context's round count must be between 10 and 14 inclusive;
	// anything else branches to `bogus', which calls `abort'.
	//
	// No stack frame is built and nothing is saved: EAX, ECX, EDX, XMM0,
	// XMM1, XMM7 and the flags are clobbered.

	// Find the magic endianness-swapping table.
	ldgot	ecx
	movdqa	xmm7, [INTADDR(endswap_tab, ecx)]

	// Load the input block and end-swap it.  Also, start loading the
	// keys.
	mov	eax, [esp + 8]
	movdqu	xmm0, [eax]
	pshufb	xmm0, xmm7
	mov	eax, [esp + 4]
	lea	edx, [eax + w]		// edx = address of the first round key
	mov	eax, [eax + nr]		// eax = number of rounds

	// Initial whitening.  Afterwards, EDX points to the key for the
	// first proper round.
	movdqu	xmm1, [edx]
	add	edx, 16
	pxor	xmm0, xmm1

	// Dispatch to the correct code.  The rounds are unrolled below as a
	// ladder which we enter at the rung matching the round count, and
	// then simply fall through to the end.
	cmp	eax, 10
	je	er10
	jb	bogus			// fewer than 10 rounds: invalid
	cmp	eax, 14
	je	er14
	ja	bogus			// more than 14 rounds: invalid
	cmp	eax, 12
	je	er12
	jb	er11
	jmp	er13

	.align	2

	// Each of the `extra' rounds beyond ten consumes one key and
	// advances EDX, so that on reaching `er10' EDX always points to the
	// key for the tenth round from the end.

	// 14 rounds...
er14:	movdqu	xmm1, [edx]
	add	edx, 16
	aesenc	xmm0, xmm1

	// 13 rounds...
er13:	movdqu	xmm1, [edx]
	add	edx, 16
	aesenc	xmm0, xmm1

	// 12 rounds...
er12:	movdqu	xmm1, [edx]
	add	edx, 16
	aesenc	xmm0, xmm1

	// 11 rounds...
er11:	movdqu	xmm1, [edx]
	add	edx, 16
	aesenc	xmm0, xmm1

	// 10 rounds...  From here on EDX stays put and the remaining keys
	// are found at fixed offsets from it.
er10:	movdqu	xmm1, [edx]
	aesenc	xmm0, xmm1

	// 9 rounds...
	movdqu	xmm1, [edx + 16]
	aesenc	xmm0, xmm1

	// 8 rounds...
	movdqu	xmm1, [edx + 32]
	aesenc	xmm0, xmm1

	// 7 rounds...
	movdqu	xmm1, [edx + 48]
	aesenc	xmm0, xmm1

	// 6 rounds...
	movdqu	xmm1, [edx + 64]
	aesenc	xmm0, xmm1

	// 5 rounds...
	movdqu	xmm1, [edx + 80]
	aesenc	xmm0, xmm1

	// 4 rounds...
	movdqu	xmm1, [edx + 96]
	aesenc	xmm0, xmm1

	// 3 rounds...
	movdqu	xmm1, [edx + 112]
	aesenc	xmm0, xmm1

	// 2 rounds...
	movdqu	xmm1, [edx + 128]
	aesenc	xmm0, xmm1

	// Final round...
	movdqu	xmm1, [edx + 144]
	aesenclast xmm0, xmm1

	// Unpermute the ciphertext block and store it.
	pshufb	xmm0, xmm7
	mov	eax, [esp + 12]
	movdqu	[eax], xmm0

	// And we're done.
	ret

ENDFUNC

FUNC(rijndael_dblk_x86_aesni)

	// Decrypt a single 16-byte block using the decryption round keys
	// stored at `wi' in the context.
	//
	// On entry, we have:
	// [esp +  4] points to the context block
	// [esp +  8] points to the input data block
	// [esp + 12] points to the output buffer
	//
	// The input and output blocks are accessed with unaligned moves.
	// The context's round count must be between 10 and 14 inclusive;
	// anything else branches to `bogus', which calls `abort'.
	//
	// No stack frame is built and nothing is saved: EAX, ECX, EDX, XMM0,
	// XMM1, XMM7 and the flags are clobbered.

	// Find the magic endianness-swapping table.
	ldgot	ecx
	movdqa	xmm7, [INTADDR(endswap_tab, ecx)]

	// Load the input block and end-swap it.  Also, start loading the
	// keys.
	mov	eax, [esp + 8]
	movdqu	xmm0, [eax]
	pshufb	xmm0, xmm7
	mov	eax, [esp + 4]
	lea	edx, [eax + wi]		// edx = address of the first round key
	mov	eax, [eax + nr]		// eax = number of rounds

	// Initial whitening.  Afterwards, EDX points to the key for the
	// first proper round.
	movdqu	xmm1, [edx]
	add	edx, 16
	pxor	xmm0, xmm1

	// Dispatch to the correct code.  The rounds are unrolled below as a
	// ladder which we enter at the rung matching the round count, and
	// then simply fall through to the end.
	cmp	eax, 10
	je	dr10
	jb	bogus			// fewer than 10 rounds: invalid
	cmp	eax, 14
	je	dr14
	ja	bogus			// more than 14 rounds: invalid
	cmp	eax, 12
	je	dr12
	jb	dr11
	jmp	dr13

	.align	2

	// Each of the `extra' rounds beyond ten consumes one key and
	// advances EDX, so that on reaching `dr10' EDX always points to the
	// key for the tenth round from the end.

	// 14 rounds...
dr14:	movdqu	xmm1, [edx]
	add	edx, 16
	aesdec	xmm0, xmm1

	// 13 rounds...
dr13:	movdqu	xmm1, [edx]
	add	edx, 16
	aesdec	xmm0, xmm1

	// 12 rounds...
dr12:	movdqu	xmm1, [edx]
	add	edx, 16
	aesdec	xmm0, xmm1

	// 11 rounds...
dr11:	movdqu	xmm1, [edx]
	add	edx, 16
	aesdec	xmm0, xmm1

	// 10 rounds...  From here on EDX stays put and the remaining keys
	// are found at fixed offsets from it.
dr10:	movdqu	xmm1, [edx]
	aesdec	xmm0, xmm1

	// 9 rounds...
	movdqu	xmm1, [edx + 16]
	aesdec	xmm0, xmm1

	// 8 rounds...
	movdqu	xmm1, [edx + 32]
	aesdec	xmm0, xmm1

	// 7 rounds...
	movdqu	xmm1, [edx + 48]
	aesdec	xmm0, xmm1

	// 6 rounds...
	movdqu	xmm1, [edx + 64]
	aesdec	xmm0, xmm1

	// 5 rounds...
	movdqu	xmm1, [edx + 80]
	aesdec	xmm0, xmm1

	// 4 rounds...
	movdqu	xmm1, [edx + 96]
	aesdec	xmm0, xmm1

	// 3 rounds...
	movdqu	xmm1, [edx + 112]
	aesdec	xmm0, xmm1

	// 2 rounds...
	movdqu	xmm1, [edx + 128]
	aesdec	xmm0, xmm1

	// Final round...
	movdqu	xmm1, [edx + 144]
	aesdeclast xmm0, xmm1

	// Unpermute the plaintext block and store it.
	pshufb	xmm0, xmm7
	mov	eax, [esp + 12]
	movdqu	[eax], xmm0

	// And we're done.
	ret

ENDFUNC

///--------------------------------------------------------------------------
/// Random utilities.

	.align	16
	// Abort the process because of a programming error.  Indirecting
	// through this point serves several purposes: (a) by CALLing, rather
	// than branching to, `abort', we can save the return address, which
	// might at least provide a hint as to what went wrong; (b) we don't
	// have conditional CALLs (and they'd be big anyway); and (c) we can
	// write a HLT here as a backstop against `abort' being mad.
	//
	// This is the target of the round-count dispatch in the block
	// encryption and decryption functions, taken when the context's
	// round count is outside the range 10 to 14.
bogus:	callext	F(abort)
0:	hlt				// only reached if `abort' returns
	jmp	0b			// keep halting if we're resumed

	// NOTE(review): presumably this emits the support code that the
	// `ldgot ecx' invocations above rely on -- confirm against the macro
	// definitions in `asm-common.h'.
	gotaux	ecx

///--------------------------------------------------------------------------
/// Data tables.

	.align	16
	// Byte-shuffle control for PSHUFB: output byte i is taken from input
	// byte TAB[i], so this reverses the order of the bytes within each
	// of the four 32-bit words of a register, leaving the words
	// themselves in place.  The table is aligned to 16 bytes because it
	// is fetched with `movdqa'.
endswap_tab:
	.byte	 3,  2,  1,  0
	.byte	 7,  6,  5,  4
	.byte	11, 10,  9,  8
	.byte	15, 14, 13, 12

///----- That's all, folks --------------------------------------------------
Commit	Line	Data
1a0c09c4 MW	1	/// -- mode: asm; asm-comment-char: ?/ --
	2	///
	3	/// AESNI-based implementation of Rijndael
	4	///
	5	/// (c) 2015 Straylight/Edgeware
	6	///
	7
	8	///----- Licensing notice ---------------------------------------------------
	9	///
	10	/// This file is part of Catacomb.
	11	///
	12	/// Catacomb is free software; you can redistribute it and/or modify
	13	/// it under the terms of the GNU Library General Public License as
	14	/// published by the Free Software Foundation; either version 2 of the
	15	/// License, or (at your option) any later version.
	16	///
	17	/// Catacomb is distributed in the hope that it will be useful,
	18	/// but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	/// GNU Library General Public License for more details.
	21	///
	22	/// You should have received a copy of the GNU Library General Public
	23	/// License along with Catacomb; if not, write to the Free
	24	/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	/// MA 02111-1307, USA.
	26
	27	///--------------------------------------------------------------------------
	28	/// External definitions.
	29
	30	#include "config.h"
	31	#include "asm-common.h"
	32
	33	///--------------------------------------------------------------------------
	34	/// External definitions.
	35
	36	.globl F(abort)
	37	.globl F(rijndael_rcon)
	38
	39	///--------------------------------------------------------------------------
	40	/// Main code.
	41
	42	.arch .aes
	43	.section .text
	44
	45	/// The AESNI instructions implement a little-endian version of AES, but
	46	/// Catacomb's internal interface presents as big-endian so as to work better
	47	/// with things like GCM. We therefore maintain the round keys in
	48	/// little-endian form, and have to end-swap blocks in and out.
	49	///
	50	/// For added amusement, the AESNI instructions don't implement the
	51	/// larger-block versions of Rijndael, so we have to end-swap the keys if
	52	/// we're preparing for one of those.
	53
	54	// Useful constants.
	55	.equ maxrounds, 16 // maximum number of rounds
	56	.equ maxblksz, 32 // maximum block size, in bytes
	57	.equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
	58
	59	// Context structure.
	60	.equ nr, 0 // number of rounds
	61	.equ w, nr + 4 // encryption key words
	62	.equ wi, w + kbufsz // decryption key words
	63
	64	///--------------------------------------------------------------------------
65	/// Key setup.
66
67	FUNC(rijndael_setup_x86_aesni)
68
69	// Initial state. We have four arguments:
70	// [esp + 20] is the context pointer
71	// [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
72	// [esp + 28] points to the key material, unaligned
73	// [esp + 32] is the size of the key, in words
74	// The key size has already been checked for validity, and the number
75	// of rounds has been computed. Our job is only to fill in the `w'
76	// and `wi' vectors.
77
78	push ebp
79	push ebx
80	push esi
81	push edi
82
83	// The initial round key material is taken directly from the input
84	// key, so copy it over.
85	mov ebp, [esp + 20] // context base pointer
86	mov ebx, [esp + 32] // key size, in words
87	mov ecx, ebx
88	mov esi, [esp + 28]
89	lea edi, [ebp + w]
90	rep movsd
91
92	// Find out other useful things.
93	mov edx, [ebp + nr] // number of rounds
94	add edx, 1
95	imul edx, [esp + 24] // total key size in words
96	sub edx, ebx // offset by the key size
97
98	// Find the round constants.
99	ldgot ecx
100	leaext ecx, rijndael_rcon, ecx
101
102	// Prepare for the main loop.
103	lea esi, [ebp + w]
104	mov eax, [esi + 4*ebx - 4] // most recent key word
105	lea edx, [esi + 4*edx] // limit, offset by one key expansion
106
107	// Main key expansion loop. The first word of each key-length chunk
108	// needs special treatment.
109	//
110	// This is rather tedious because the Intel `AESKEYGENASSIST'
111	// instruction is very strangely shaped. Firstly, it wants to
112	// operate on vast SSE registers, even though we're data-blocked from
113	// doing more than operation at a time unless we're doing two key
114	// schedules simultaneously -- and even then we can't do more than
115	// two, because the instruction ignores two of its input words
116	// entirely, and produces two different outputs for each of the other
117	// two. And secondly it insists on taking the magic round constant
118	// as an immediate, so it's kind of annoying if you're not
119	// open-coding the whole thing. It's much easier to leave that as
120	// zero and XOR in the round constant by hand.
121	9: movd xmm0, eax
122	pshufd xmm0, xmm0, 0x39
123	aeskeygenassist xmm1, xmm0, 0
124	pshufd xmm1, xmm1, 0x93
125	movd eax, xmm1
126	xor eax, [esi]
127	xor al, [ecx]
128	inc ecx
129	mov [esi + 4*ebx], eax
130	add esi, 4
131	cmp esi, edx
132	jae 8f
133
134	// The next three words are simple...
135	xor eax, [esi]
136	mov [esi + 4*ebx], eax
137	add esi, 4
138	cmp esi, edx
139	jae 8f
140
141	// (Word 2...)
142	xor eax, [esi]
143	mov [esi + 4*ebx], eax
144	add esi, 4
145	cmp esi, edx
146	jae 8f
147
148	// (Word 3...)
149	xor eax, [esi]
150	mov [esi + 4*ebx], eax
151	add esi, 4
152	cmp esi, edx
153	jae 8f
154
155	// Word 4. If the key is /more/ than 6 words long, then we must
156	// apply a substitution here.
157	cmp ebx, 5
158	jb 9b
159	cmp ebx, 7
160	jb 0f
161	movd xmm0, eax
162	pshufd xmm0, xmm0, 0x93
163	aeskeygenassist xmm1, xmm0, 0
164	movd eax, xmm1
165	0: xor eax, [esi]
166	mov [esi + 4*ebx], eax
167	add esi, 4
168	cmp esi, edx
169	jae 8f
170
171	// (Word 5...)
172	cmp ebx, 6
173	jb 9b
174	xor eax, [esi]
175	mov [esi + 4*ebx], eax
176	add esi, 4
177	cmp esi, edx
178	jae 8f
179
180	// (Word 6...)
181	cmp ebx, 7
182	jb 9b
183	xor eax, [esi]
184	mov [esi + 4*ebx], eax
185	add esi, 4
186	cmp esi, edx
187	jae 8f
188
189	// (Word 7...)
190	cmp ebx, 8
191	jb 9b
192	xor eax, [esi]
193	mov [esi + 4*ebx], eax
194	add esi, 4
195	cmp esi, edx
196	jae 8f
197
198	// Must be done by now.
199	jmp 9b
200
201	// Next job is to construct the decryption keys. The keys for the
202	// first and last rounds don't need to be mangled, but the remaining
203	// ones do -- and they all need to be reordered too.
204	//
205	// The plan of action, then, is to copy the final encryption round's
206	// keys into place first, then to do each of the intermediate rounds
207	// in reverse order, and finally do the first round.
208	//
209	// Do all of the heavy lifting with SSE registers. The order we're
210	// doing this in means that it's OK if we read or write too much, and
211	// there's easily enough buffer space for the over-enthusiastic reads
212	// and writes because the context has space for 32-byte blocks, which
213	// is our maximum and an exact fit for two SSE registers.
214	8: mov ecx, [ebp + nr] // number of rounds
215	mov ebx, [esp + 24] // block size (in words)
216	mov edx, ecx
217	imul edx, ebx
218	lea edi, [ebp + wi]
219	lea esi, [ebp + 4*edx + w] // last round's keys
220	shl ebx, 2 // block size (in bytes now)
221
222	// Copy the last encryption round's keys.
223	movdqu xmm0, [esi]
224	movdqu [edi], xmm0
225	cmp ebx, 16
226	jbe 9f
227	movdqu xmm0, [esi + 16]
228	movdqu [edi + 16], xmm0
229
230	// Update the loop variables and stop if we've finished.
231	9: add edi, ebx
232	sub esi, ebx
233	sub ecx, 1
234	jbe 0f
235
236	// Do another middle round's keys...
237	movdqu xmm0, [esi]
238	aesimc xmm0, xmm0
239	movdqu [edi], xmm0
240	cmp ebx, 16
241	jbe 9b
242	movdqu xmm0, [esi + 16]
243	aesimc xmm0, xmm0
244	movdqu [edi + 16], xmm0
245	jmp 9b
246
247	// Finally do the first encryption round.
248	0: movdqu xmm0, [esi]
249	movdqu [edi], xmm0
250	cmp ebx, 16
251	jbe 0f
252	movdqu xmm0, [esi + 16]
253	movdqu [edi + 16], xmm0
254
255	// If the block size is not exactly four words then we must end-swap
256	// everything. We can use fancy SSE toys for this.
257	0: cmp ebx, 16
258	je 0f
259
260	// Find the byte-reordering table.
261	ldgot ecx
262	movdqa xmm7, [INTADDR(endswap_tab, ecx)]
263
264	// Calculate the number of subkey words again. (It's a good job
265	// we've got a fast multiplier.)
266	mov ecx, [ebp + nr]
267	add ecx, 1
268	imul ecx, [esp + 24] // total keys in words
269
270	// End-swap the encryption keys.
271	mov eax, ecx
272	lea esi, [ebp + w]
273	call endswap_block
274
275	// And the decryption keys.
276	mov ecx, eax
277	lea esi, [ebp + wi]
278	call endswap_block
279
280	// All done.
281	0: pop edi
282	pop esi
283	pop ebx
284	pop ebp
285	ret
286
287	.align 16
288	endswap_block:
289	// End-swap ECX words starting at ESI. The end-swapping table is
290	// already loaded into XMM7; and it's OK to work in 16-byte chunks.
291	movdqu xmm1, [esi]
292	pshufb xmm1, xmm7
293	movdqu [esi], xmm1
294	add esi, 16
295	sub ecx, 4
296	ja endswap_block
297	ret
298
299	ENDFUNC
300
301	///--------------------------------------------------------------------------
302	/// Encrypting and decrypting blocks.
303
304	FUNC(rijndael_eblk_x86_aesni)
305
306	// On entry, we have:
307	// [esp + 4] points to the context block
308	// [esp + 8] points to the input data block
309	// [esp + 12] points to the output buffer
310
311	// Find the magic endianness-swapping table.
312	ldgot ecx
313	movdqa xmm7, [INTADDR(endswap_tab, ecx)]
314
315	// Load the input block and end-swap it. Also, start loading the
316	// keys.
317	mov eax, [esp + 8]
318	movdqu xmm0, [eax]
319	pshufb xmm0, xmm7
320	mov eax, [esp + 4]
321	lea edx, [eax + w]
322	mov eax, [eax + nr]
323
324	// Initial whitening.
325	movdqu xmm1, [edx]
326	add edx, 16
327	pxor xmm0, xmm1
328
329	// Dispatch to the correct code.
330	cmp eax, 10
331	je er10
332	jb bogus
333	cmp eax, 14
334	je er14
335	ja bogus
336	cmp eax, 12
337	je er12
338	jb er11
339	jmp er13
340
341	.align 2
342
343	// 14 rounds...
344	er14: movdqu xmm1, [edx]
345	add edx, 16
346	aesenc xmm0, xmm1
347
348	// 13 rounds...
349	er13: movdqu xmm1, [edx]
350	add edx, 16
351	aesenc xmm0, xmm1
352
353	// 12 rounds...
354	er12: movdqu xmm1, [edx]
355	add edx, 16
356	aesenc xmm0, xmm1
357
358	// 11 rounds...
359	er11: movdqu xmm1, [edx]
360	add edx, 16
361	aesenc xmm0, xmm1
362
363	// 10 rounds...
364	er10: movdqu xmm1, [edx]
365	aesenc xmm0, xmm1
366
367	// 9 rounds...
368	movdqu xmm1, [edx + 16]
369	aesenc xmm0, xmm1
370
371	// 8 rounds...
372	movdqu xmm1, [edx + 32]
373	aesenc xmm0, xmm1
374
375	// 7 rounds...
376	movdqu xmm1, [edx + 48]
377	aesenc xmm0, xmm1
378
379	// 6 rounds...
380	movdqu xmm1, [edx + 64]
381	aesenc xmm0, xmm1
382
383	// 5 rounds...
384	movdqu xmm1, [edx + 80]
385	aesenc xmm0, xmm1
386
387	// 4 rounds...
388	movdqu xmm1, [edx + 96]
389	aesenc xmm0, xmm1
390
391	// 3 rounds...
392	movdqu xmm1, [edx + 112]
393	aesenc xmm0, xmm1
394
395	// 2 rounds...
396	movdqu xmm1, [edx + 128]
397	aesenc xmm0, xmm1
398
399	// Final round...
400	movdqu xmm1, [edx + 144]
401	aesenclast xmm0, xmm1
402
403	// Unpermute the ciphertext block and store it.
404	pshufb xmm0, xmm7
405	mov eax, [esp + 12]
406	movdqu [eax], xmm0
407
408	// And we're done.
409	ret
410
411	ENDFUNC
412
413	FUNC(rijndael_dblk_x86_aesni)
414
415	// On entry, we have:
416	// [esp + 4] points to the context block
417	// [esp + 8] points to the input data block
418	// [esp + 12] points to the output buffer
419
420	// Find the magic endianness-swapping table.
421	ldgot ecx
422	movdqa xmm7, [INTADDR(endswap_tab, ecx)]
423
424	// Load the input block and end-swap it. Also, start loading the
425	// keys.
426	mov eax, [esp + 8]
427	movdqu xmm0, [eax]
428	pshufb xmm0, xmm7
429	mov eax, [esp + 4]
430	lea edx, [eax + wi]
431	mov eax, [eax + nr]
432
433	// Initial whitening.
434	movdqu xmm1, [edx]
435	add edx, 16
436	pxor xmm0, xmm1
437
438	// Dispatch to the correct code.
439	cmp eax, 10
440	je dr10
441	jb bogus
442	cmp eax, 14
443	je dr14
444	ja bogus
445	cmp eax, 12
446	je dr12
447	jb dr11
448	jmp dr13
449
450	.align 2
451
452	// 14 rounds...
453	dr14: movdqu xmm1, [edx]
454	add edx, 16
455	aesdec xmm0, xmm1
456
457	// 13 rounds...
458	dr13: movdqu xmm1, [edx]
459	add edx, 16
460	aesdec xmm0, xmm1
461
462	// 12 rounds...
463	dr12: movdqu xmm1, [edx]
464	add edx, 16
465	aesdec xmm0, xmm1
466
467	// 11 rounds...
468	dr11: movdqu xmm1, [edx]
469	add edx, 16
470	aesdec xmm0, xmm1
471
472	// 10 rounds...
473	dr10: movdqu xmm1, [edx]
474	aesdec xmm0, xmm1
475
476	// 9 rounds...
477	movdqu xmm1, [edx + 16]
478	aesdec xmm0, xmm1
479
480	// 8 rounds...
481	movdqu xmm1, [edx + 32]
482	aesdec xmm0, xmm1
483
484	// 7 rounds...
485	movdqu xmm1, [edx + 48]
486	aesdec xmm0, xmm1
487
488	// 6 rounds...
489	movdqu xmm1, [edx + 64]
490	aesdec xmm0, xmm1
491
492	// 5 rounds...
493	movdqu xmm1, [edx + 80]
494	aesdec xmm0, xmm1
495
496	// 4 rounds...
497	movdqu xmm1, [edx + 96]
498	aesdec xmm0, xmm1
499
500	// 3 rounds...
501	movdqu xmm1, [edx + 112]
502	aesdec xmm0, xmm1
503
504	// 2 rounds...
505	movdqu xmm1, [edx + 128]
506	aesdec xmm0, xmm1
507
508	// Final round...
509	movdqu xmm1, [edx + 144]
510	aesdeclast xmm0, xmm1
511
512	// Unpermute the ciphertext block and store it.
513	pshufb xmm0, xmm7
514	mov eax, [esp + 12]
515	movdqu [eax], xmm0
516
517	// And we're done.
518	ret
519
520	ENDFUNC
521
522	///--------------------------------------------------------------------------
523	/// Random utilities.
524
525	.align 16
526	// Abort the process because of a programming error. Indirecting
527	// through this point serves several purposes: (a) by CALLing, rather
528	// than branching to, `abort', we can save the return address, which
529	// might at least provide a hint as to what went wrong; (b) we don't
530	// have conditional CALLs (and they'd be big anyway); and (c) we can
531	// write a HLT here as a backstop against `abort' being mad.
532	bogus: callext F(abort)
533	0: hlt
534	jmp 0b
535
536	gotaux ecx
537
538	///--------------------------------------------------------------------------
539	/// Data tables.
540
541	.align 16
542	endswap_tab:
543	.byte 3, 2, 1, 0
544	.byte 7, 6, 5, 4
545	.byte 11, 10, 9, 8
546	.byte 15, 14, 13, 12
547
548	///----- That's all, folks --------------------------------------------------