[catacomb] / symm / rijndael-x86-aesni.S

/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// AESNI-based implementation of Rijndael
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// External definitions.

	.globl	F(abort)
	.globl	F(rijndael_rcon)

///--------------------------------------------------------------------------
/// Local utilities.

// Magic constants for shuffling.  Each is an immediate operand for `PSHUFD'
// which rotates the four 32-bit words of an SSE register: writing word 0
// for the least significant word,
//
//   * ROTL gives dst[i] = src[(i + 3) mod 4], so each word moves up one
//     place and the top word wraps round to the bottom;
//   * ROT2 gives dst[i] = src[(i + 2) mod 4], swapping the two halves (it
//     isn't referenced by the code visible in this file); and
//   * ROTR gives dst[i] = src[(i + 1) mod 4], so each word moves down one
//     place and the bottom word wraps round to the top.
#define ROTL 0x93
#define ROT2 0x4e
#define ROTR 0x39

///--------------------------------------------------------------------------
/// Main code.

	.arch	.aes
	.section .text

/// The AESNI instructions implement a little-endian version of AES, but
/// Catacomb's internal interface presents as big-endian so as to work better
/// with things like GCM.  We therefore maintain the round keys in
/// little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the AESNI instructions don't implement the
/// larger-block versions of Rijndael, so we have to end-swap the keys if
/// we're preparing for one of those.

	// Useful constants.  A key-schedule buffer holds one maximum-size
	// block of key material for each round, plus one more for the
	// initial whitening: 32*(16 + 1) = 544 bytes.
	.equ	maxrounds, 16		// maximum number of rounds
	.equ	maxblksz, 32		// maximum block size, in bytes
	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer

	// Context structure.  These are byte offsets from the context base
	// pointer: a 32-bit round count followed by the two key-schedule
	// buffers.  NOTE(review): presumably this mirrors a structure
	// declared on the C side -- keep the two in step.
	.equ	nr, 0			// number of rounds
	.equ	w, nr + 4		// encryption key words
	.equ	wi, w + kbufsz		// decryption key words

///--------------------------------------------------------------------------
/// Key setup.

/// Expand a key into the encryption (`w') and decryption (`wi') round-key
/// vectors of a context.
///
/// The four arguments are 32-bit words on the stack: the context pointer,
/// the block size in words, a pointer to the key, and the key size in
/// words.  The round count is read from the context, not computed here.
/// EBP, EBX, ESI and EDI are saved and restored; EAX, ECX, EDX, XMM0 and
/// XMM1 are used as scratch, as is XMM5 when the block size isn't four
/// words.
FUNC(rijndael_setup_x86_aesni)

	// Initial state.  We have four arguments.  The offsets given here
	// are as seen after the four register pushes just below.
	// [esp + 20] is the context pointer
	// [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
	// [esp + 28] points to the key material, unaligned
	// [esp + 32] is the size of the key, in words
	// The key size has already been checked for validity, and the number
	// of rounds has been computed.  Our job is only to fill in the `w'
	// and `wi' vectors.

	push	ebp
	push	ebx
	push	esi
	push	edi

	// The initial round key material is taken directly from the input
	// key, so copy it over, one 32-bit word at a time.
	mov	ebp, [esp + 20]		// context base pointer
	mov	ebx, [esp + 32]		// key size, in words
	mov	ecx, ebx
	mov	esi, [esp + 28]
	lea	edi, [ebp + w]
	rep	movsd

	// Find out other useful things.  We need (rounds + 1) blocks' worth
	// of key words in total; the first key-size's worth is already in
	// place, so what's left in EDX is the number of words to generate.
	mov	edx, [ebp + nr]		// number of rounds
	add	edx, 1
	imul	edx, [esp + 24]		// total key size in words
	sub	edx, ebx		// offset by the key size

	// Find the round constants.  ECX is left pointing at the first
	// byte of the `rijndael_rcon' table.
	ldgot	ecx
	leaext	ecx, rijndael_rcon, ecx

	// Prepare for the main loop.  ESI trails the output position by one
	// key length: each new word is the XOR of something with the word at
	// [esi], and is stored at [esi + 4*ebx].  EDX is converted from a
	// word count into the value of ESI at which we should stop.
	lea	esi, [ebp + w]
	mov	eax, [esi + 4*ebx - 4]	// most recent key word
	lea	edx, [esi + 4*edx]	// limit, offset by one key expansion

	// Main key expansion loop.  The first word of each key-length chunk
	// needs special treatment.
	//
	// This is rather tedious because the Intel `AESKEYGENASSIST'
	// instruction is very strangely shaped.  Firstly, it wants to
	// operate on vast SSE registers, even though we're data-blocked from
	// doing more than one operation at a time unless we're doing two key
	// schedules simultaneously -- and even then we can't do more than
	// two, because the instruction ignores two of its input words
	// entirely, and produces two different outputs for each of the other
	// two.  And secondly it insists on taking the magic round constant
	// as an immediate, so it's kind of annoying if you're not
	// open-coding the whole thing.  It's much easier to leave that as
	// zero and XOR in the round constant by hand.
	//
	// Throughout the loop: EAX holds the most recently generated word,
	// EBX the key size in words, ECX the address of the next round
	// constant byte, ESI the trailing input pointer, and EDX its limit.
	// Label 9 is the top of a key-length chunk; label 8 is the exit.
	//
	// Word 0.  Move the previous word from the bottom of XMM0 to the
	// top, where the instruction will read it; the rotated-and-
	// substituted result comes out in the top word of XMM1, so bring
	// that back down to the bottom to extract it.
9:	movd	xmm0, eax
	pshufd	xmm0, xmm0, ROTR
	aeskeygenassist xmm1, xmm0, 0
	pshufd	xmm1, xmm1, ROTL
	movd	eax, xmm1
	xor	eax, [esi]
	xor	al, [ecx]		// one-byte round constant
	inc	ecx			// step on to the next constant
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// The next three words are simple...
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// (Word 2...)
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// (Word 3...)
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// Word 4.  If the key is /more/ than 6 words long, then we must
	// apply a substitution here.  A four-word key has no word 4: start
	// the next chunk instead.  This time the word goes into the second
	// word of XMM0, and the substituted (but unrotated) result arrives
	// in the bottom word of XMM1, so no second shuffle is needed.
	cmp	ebx, 5
	jb	9b
	cmp	ebx, 7
	jb	0f
	movd	xmm0, eax
	pshufd	xmm0, xmm0, ROTL
	aeskeygenassist xmm1, xmm0, 0
	movd	eax, xmm1
0:	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// (Word 5...)  Each of the remaining words exists only if the key
	// is long enough; otherwise it's time to start the next chunk.
	cmp	ebx, 6
	jb	9b
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// (Word 6...)
	cmp	ebx, 7
	jb	9b
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// (Word 7...)
	cmp	ebx, 8
	jb	9b
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// Must be done by now.
	jmp	9b

	// Next job is to construct the decryption keys.  The keys for the
	// first and last rounds don't need to be mangled, but the remaining
	// ones do -- and they all need to be reordered too.
	//
	// The plan of action, then, is to copy the final encryption round's
	// keys into place first, then to do each of the intermediate rounds
	// in reverse order, and finally do the first round.
	//
	// Do all of the heavy lifting with SSE registers.  The order we're
	// doing this in means that it's OK if we read or write too much, and
	// there's easily enough buffer space for the over-enthusiastic reads
	// and writes because the context has space for 32-byte blocks, which
	// is our maximum and an exact fit for two SSE registers.
	//
	// From here on: ECX counts the rounds still to do, EBX is the block
	// size in bytes, ESI walks backwards through `w', and EDI walks
	// forwards through `wi'.
8:	mov	ecx, [ebp + nr]		// number of rounds
	mov	ebx, [esp + 24]		// block size (in words)
	mov	edx, ecx
	imul	edx, ebx
	lea	edi, [ebp + wi]
	lea	esi, [ebp + 4*edx + w]	// last round's keys
	shl	ebx, 2			// block size (in bytes now)

	// Copy the last encryption round's keys.  A second 16-byte chunk
	// is only needed if the block is bigger than one SSE register.
	movdqu	xmm0, [esi]
	movdqu	[edi], xmm0
	cmp	ebx, 16
	jbe	9f
	movdqu	xmm0, [esi + 16]
	movdqu	[edi + 16], xmm0

	// Update the loop variables and stop if we've finished.  The
	// counter reaches zero when only the first round's keys remain.
9:	add	edi, ebx
	sub	esi, ebx
	sub	ecx, 1
	jbe	0f

	// Do another middle round's keys...  These are passed through
	// `AESIMC' on their way.
	movdqu	xmm0, [esi]
	aesimc	xmm0, xmm0
	movdqu	[edi], xmm0
	cmp	ebx, 16
	jbe	9b
	movdqu	xmm0, [esi + 16]
	aesimc	xmm0, xmm0
	movdqu	[edi + 16], xmm0
	jmp	9b

	// Finally do the first encryption round.  Again, this is a
	// straight copy.
0:	movdqu	xmm0, [esi]
	movdqu	[edi], xmm0
	cmp	ebx, 16
	jbe	0f
	movdqu	xmm0, [esi + 16]
	movdqu	[edi + 16], xmm0

	// If the block size is not exactly four words then we must end-swap
	// everything.  We can use fancy SSE toys for this.
0:	cmp	ebx, 16
	je	0f

	// Find the byte-reordering table.
	ldgot	ecx
	movdqa	xmm5, [INTADDR(endswap_tab, ecx)]

	// Calculate the number of subkey words again.  (It's a good job
	// we've got a fast multiplier.)
	mov	ecx, [ebp + nr]
	add	ecx, 1
	imul	ecx, [esp + 24]		// total keys in words

	// End-swap the encryption keys.  The subroutine destroys its count,
	// so keep a copy in EAX for the second call.
	mov	eax, ecx
	lea	esi, [ebp + w]
	call	endswap_block

	// And the decryption keys.
	mov	ecx, eax
	lea	esi, [ebp + wi]
	call	endswap_block

	// All done.  Restore the saved registers in reverse order.
0:	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

	.align	16
endswap_block:
	// End-swap ECX words starting at ESI.  The end-swapping table is
	// already loaded into XMM5; and it's OK to work in 16-byte chunks.
	//
	// The count needn't be a multiple of four: the final chunk is
	// processed whole regardless, so up to three words beyond the count
	// get swapped too.  On exit, ECX, ESI and XMM1 have been clobbered.
	movdqu	xmm1, [esi]
	pshufb	xmm1, xmm5
	movdqu	[esi], xmm1
	add	esi, 16
	sub	ecx, 4
	ja	endswap_block		// loop while words remain
	ret

ENDFUNC

///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.

	// Define `rijndael_OP_x86_aesni', a single-block cipher operation.
	// AES names the instruction used for the ordinary rounds (with
	// `last' appended for the final round), and KOFF is the offset of
	// the round keys to use within the context.
	//
	// The generated function reads 16 bytes from the input pointer and
	// writes 16 bytes to the output pointer, fetching round keys at a
	// 16-byte stride.  It uses EAX, ECX, EDX, XMM0, XMM1 and XMM5 as
	// scratch.  Only round counts from 10 to 14 inclusive are accepted:
	// anything else branches to `bogus'.
	.macro	encdec op, aes, koff
FUNC(rijndael_\op\()_x86_aesni)

	// On entry, we have:
	// [esp +  4] points to the context block
	// [esp +  8] points to the input data block
	// [esp + 12] points to the output buffer

	// Find the magic endianness-swapping table.
	ldgot	ecx
	movdqa	xmm5, [INTADDR(endswap_tab, ecx)]

	// Load the input block and end-swap it.  Also, start loading the
	// keys.  We end up with the block in XMM0, the address of the first
	// round key in EDX, and the number of rounds in EAX.
	mov	eax, [esp + 8]
	movdqu	xmm0, [eax]
	pshufb	xmm0, xmm5
	mov	eax, [esp + 4]
	lea	edx, [eax + \koff]
	mov	eax, [eax + nr]

	// Initial whitening.  Afterwards, EDX addresses the key for the
	// first proper round.
	movdqu	xmm1, [edx]
	add	edx, 16
	pxor	xmm0, xmm1

	// Dispatch to the correct code.  The numeric labels below are named
	// for the round count they serve; fewer than 10 or more than 14
	// rounds is a programming error.
	cmp	eax, 10
	je	10f
	jb	bogus
	cmp	eax, 14
	je	14f
	ja	bogus
	cmp	eax, 12
	je	12f
	jb	11f
	jmp	13f

	.align	2

	// The entry points for the larger round counts each do one round,
	// advance EDX past the key they used, and fall through to the next.
	// By the time we reach label 10, EDX addresses the key for the
	// first of the ten remaining rounds, and fixed displacements from
	// it suffice for the rest.

	// 14 rounds...
14:	movdqu	xmm1, [edx]
	add	edx, 16
	\aes	xmm0, xmm1

	// 13 rounds...
13:	movdqu	xmm1, [edx]
	add	edx, 16
	\aes	xmm0, xmm1

	// 12 rounds...
12:	movdqu	xmm1, [edx]
	add	edx, 16
	\aes	xmm0, xmm1

	// 11 rounds...
11:	movdqu	xmm1, [edx]
	add	edx, 16
	\aes	xmm0, xmm1

	// 10 rounds...
10:	movdqu	xmm1, [edx]
	\aes	xmm0, xmm1

	// 9 rounds...
	movdqu	xmm1, [edx + 16]
	\aes	xmm0, xmm1

	// 8 rounds...
	movdqu	xmm1, [edx + 32]
	\aes	xmm0, xmm1

	// 7 rounds...
	movdqu	xmm1, [edx + 48]
	\aes	xmm0, xmm1

	// 6 rounds...
	movdqu	xmm1, [edx + 64]
	\aes	xmm0, xmm1

	// 5 rounds...
	movdqu	xmm1, [edx + 80]
	\aes	xmm0, xmm1

	// 4 rounds...
	movdqu	xmm1, [edx + 96]
	\aes	xmm0, xmm1

	// 3 rounds...
	movdqu	xmm1, [edx + 112]
	\aes	xmm0, xmm1

	// 2 rounds...
	movdqu	xmm1, [edx + 128]
	\aes	xmm0, xmm1

	// Final round...  This uses the `last' form of the instruction.
	movdqu	xmm1, [edx + 144]
	\aes\()last xmm0, xmm1

	// Unpermute the output block and store it.
	pshufb	xmm0, xmm5
	mov	eax, [esp + 12]
	movdqu	[eax], xmm0

	// And we're done.
	ret

ENDFUNC
	.endm

	// Instantiate the two block operations: encryption uses `AESENC'
	// with the `w' keys; decryption uses `AESDEC' with the `wi' keys.
	encdec	eblk, aesenc, w
	encdec	dblk, aesdec, wi

///--------------------------------------------------------------------------
/// Random utilities.

	.align	16
	// Abort the process because of a programming error.  Indirecting
	// through this point serves several purposes: (a) by CALLing, rather
	// than branching to, `abort', we can save the return address, which
	// might at least provide a hint as to what went wrong; (b) we don't
	// have conditional CALLs (and they'd be big anyway); and (c) we can
	// write a HLT here as a backstop against `abort' being mad.
	//
	// The block functions branch here if the context's round count is
	// out of range.  If `abort' should ever return, we spin on the HLT
	// rather than fall through into whatever follows.
bogus:	callext	F(abort)
0:	hlt
	jmp	0b

	// NOTE(review): presumably this emits the support code needed by
	// the `ldgot ecx' invocations above -- confirm against the macro
	// definitions in `asm-common.h'.
	gotaux	ecx

///--------------------------------------------------------------------------
/// Data tables.

	// Byte-shuffling mask for `PSHUFB': byte i of the result is taken
	// from the source byte whose index is stored at position i here, so
	// the effect is to reverse the bytes within each 32-bit word.  The
	// table is loaded with `MOVDQA', so it must be 16-byte aligned.
	.align	16
endswap_tab:
	.byte	 3,  2,  1,  0
	.byte	 7,  6,  5,  4
	.byte	11, 10,  9,  8
	.byte	15, 14, 13, 12

///----- That's all, folks --------------------------------------------------
Commit	Line	Data
1a0c09c4 MW	1	/// -- mode: asm; asm-comment-char: ?/ --
	2	///
	3	/// AESNI-based implementation of Rijndael
	4	///
	5	/// (c) 2015 Straylight/Edgeware
	6	///
	7
	8	///----- Licensing notice ---------------------------------------------------
	9	///
	10	/// This file is part of Catacomb.
	11	///
	12	/// Catacomb is free software; you can redistribute it and/or modify
	13	/// it under the terms of the GNU Library General Public License as
	14	/// published by the Free Software Foundation; either version 2 of the
	15	/// License, or (at your option) any later version.
	16	///
	17	/// Catacomb is distributed in the hope that it will be useful,
	18	/// but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	/// GNU Library General Public License for more details.
	21	///
	22	/// You should have received a copy of the GNU Library General Public
	23	/// License along with Catacomb; if not, write to the Free
	24	/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	/// MA 02111-1307, USA.
	26
	27	///--------------------------------------------------------------------------
	28	/// External definitions.
	29
	30	#include "config.h"
	31	#include "asm-common.h"
	32
	33	///--------------------------------------------------------------------------
	34	/// External definitions.
	35
	36	.globl F(abort)
	37	.globl F(rijndael_rcon)
	38
	39	///--------------------------------------------------------------------------
47103664 MW	40	/// Local utilities.
	41
	42	// Magic constants for shuffling.
	43	#define ROTL 0x93
	44	#define ROT2 0x4e
	45	#define ROTR 0x39
	46
	47	///--------------------------------------------------------------------------
1a0c09c4 MW	48	/// Main code.
	49
	50	.arch .aes
	51	.section .text
	52
	53	/// The AESNI instructions implement a little-endian version of AES, but
	54	/// Catacomb's internal interface presents as big-endian so as to work better
	55	/// with things like GCM. We therefore maintain the round keys in
	56	/// little-endian form, and have to end-swap blocks in and out.
	57	///
	58	/// For added amusement, the AESNI instructions don't implement the
	59	/// larger-block versions of Rijndael, so we have to end-swap the keys if
	60	/// we're preparing for one of those.
	61
	62	// Useful constants.
	63	.equ maxrounds, 16 // maximum number of rounds
	64	.equ maxblksz, 32 // maximum block size, in bytes
	65	.equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
	66
	67	// Context structure.
	68	.equ nr, 0 // number of rounds
	69	.equ w, nr + 4 // encryption key words
	70	.equ wi, w + kbufsz // decryption key words
	71
	72	///--------------------------------------------------------------------------
	73	/// Key setup.
	74
	75	FUNC(rijndael_setup_x86_aesni)
	76
	77	// Initial state. We have four arguments:
	78	// [esp + 20] is the context pointer
	79	// [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
	80	// [esp + 28] points to the key material, unaligned
	81	// [esp + 32] is the size of the key, in words
	82	// The key size has already been checked for validity, and the number
	83	// of rounds has been computed. Our job is only to fill in the `w'
	84	// and `wi' vectors.
	85
	86	push ebp
	87	push ebx
	88	push esi
	89	push edi
	90
	91	// The initial round key material is taken directly from the input
	92	// key, so copy it over.
	93	mov ebp, [esp + 20] // context base pointer
	94	mov ebx, [esp + 32] // key size, in words
	95	mov ecx, ebx
	96	mov esi, [esp + 28]
	97	lea edi, [ebp + w]
	98	rep movsd
	99
	100	// Find out other useful things.
	101	mov edx, [ebp + nr] // number of rounds
	102	add edx, 1
	103	imul edx, [esp + 24] // total key size in words
	104	sub edx, ebx // offset by the key size
	105
	106	// Find the round constants.
	107	ldgot ecx
	108	leaext ecx, rijndael_rcon, ecx
	109
	110	// Prepare for the main loop.
	111	lea esi, [ebp + w]
112	mov eax, [esi + 4*ebx - 4] // most recent key word
113	lea edx, [esi + 4*edx] // limit, offset by one key expansion
114
115	// Main key expansion loop. The first word of each key-length chunk
116	// needs special treatment.
117	//
118	// This is rather tedious because the Intel `AESKEYGENASSIST'
119	// instruction is very strangely shaped. Firstly, it wants to
120	// operate on vast SSE registers, even though we're data-blocked from
121	// doing more than operation at a time unless we're doing two key
122	// schedules simultaneously -- and even then we can't do more than
123	// two, because the instruction ignores two of its input words
124	// entirely, and produces two different outputs for each of the other
125	// two. And secondly it insists on taking the magic round constant
126	// as an immediate, so it's kind of annoying if you're not
127	// open-coding the whole thing. It's much easier to leave that as
128	// zero and XOR in the round constant by hand.
129	9: movd xmm0, eax
47103664	130	pshufd xmm0, xmm0, ROTR
1a0c09c4	131	aeskeygenassist xmm1, xmm0, 0
47103664	132	pshufd xmm1, xmm1, ROTL
1a0c09c4 MW	133	movd eax, xmm1
	134	xor eax, [esi]
	135	xor al, [ecx]
	136	inc ecx
	137	mov [esi + 4*ebx], eax
	138	add esi, 4
	139	cmp esi, edx
	140	jae 8f
	141
	142	// The next three words are simple...
	143	xor eax, [esi]
	144	mov [esi + 4*ebx], eax
	145	add esi, 4
	146	cmp esi, edx
	147	jae 8f
	148
	149	// (Word 2...)
	150	xor eax, [esi]
	151	mov [esi + 4*ebx], eax
	152	add esi, 4
	153	cmp esi, edx
	154	jae 8f
	155
	156	// (Word 3...)
	157	xor eax, [esi]
	158	mov [esi + 4*ebx], eax
	159	add esi, 4
	160	cmp esi, edx
	161	jae 8f
	162
	163	// Word 4. If the key is /more/ than 6 words long, then we must
	164	// apply a substitution here.
	165	cmp ebx, 5
	166	jb 9b
	167	cmp ebx, 7
	168	jb 0f
	169	movd xmm0, eax
47103664	170	pshufd xmm0, xmm0, ROTL
1a0c09c4 MW	171	aeskeygenassist xmm1, xmm0, 0
	172	movd eax, xmm1
	173	0: xor eax, [esi]
	174	mov [esi + 4*ebx], eax
	175	add esi, 4
	176	cmp esi, edx
	177	jae 8f
	178
	179	// (Word 5...)
	180	cmp ebx, 6
	181	jb 9b
	182	xor eax, [esi]
	183	mov [esi + 4*ebx], eax
	184	add esi, 4
	185	cmp esi, edx
	186	jae 8f
	187
	188	// (Word 6...)
	189	cmp ebx, 7
	190	jb 9b
	191	xor eax, [esi]
	192	mov [esi + 4*ebx], eax
	193	add esi, 4
	194	cmp esi, edx
	195	jae 8f
	196
	197	// (Word 7...)
	198	cmp ebx, 8
	199	jb 9b
	200	xor eax, [esi]
	201	mov [esi + 4*ebx], eax
	202	add esi, 4
	203	cmp esi, edx
	204	jae 8f
	205
	206	// Must be done by now.
	207	jmp 9b
	208
	209	// Next job is to construct the decryption keys. The keys for the
	210	// first and last rounds don't need to be mangled, but the remaining
	211	// ones do -- and they all need to be reordered too.
	212	//
	213	// The plan of action, then, is to copy the final encryption round's
	214	// keys into place first, then to do each of the intermediate rounds
	215	// in reverse order, and finally do the first round.
	216	//
	217	// Do all of the heavy lifting with SSE registers. The order we're
	218	// doing this in means that it's OK if we read or write too much, and
	219	// there's easily enough buffer space for the over-enthusiastic reads
	220	// and writes because the context has space for 32-byte blocks, which
	221	// is our maximum and an exact fit for two SSE registers.
	222	8: mov ecx, [ebp + nr] // number of rounds
	223	mov ebx, [esp + 24] // block size (in words)
	224	mov edx, ecx
	225	imul edx, ebx
	226	lea edi, [ebp + wi]
	227	lea esi, [ebp + 4*edx + w] // last round's keys
	228	shl ebx, 2 // block size (in bytes now)
	229
	230	// Copy the last encryption round's keys.
	231	movdqu xmm0, [esi]
	232	movdqu [edi], xmm0
	233	cmp ebx, 16
	234	jbe 9f
235	movdqu xmm0, [esi + 16]
236	movdqu [edi + 16], xmm0
237
238	// Update the loop variables and stop if we've finished.
239	9: add edi, ebx
240	sub esi, ebx
241	sub ecx, 1
242	jbe 0f
243
244	// Do another middle round's keys...
245	movdqu xmm0, [esi]
246	aesimc xmm0, xmm0
247	movdqu [edi], xmm0
248	cmp ebx, 16
249	jbe 9b
250	movdqu xmm0, [esi + 16]
251	aesimc xmm0, xmm0
252	movdqu [edi + 16], xmm0
253	jmp 9b
254
255	// Finally do the first encryption round.
256	0: movdqu xmm0, [esi]
257	movdqu [edi], xmm0
258	cmp ebx, 16
259	jbe 0f
260	movdqu xmm0, [esi + 16]
261	movdqu [edi + 16], xmm0
262
263	// If the block size is not exactly four words then we must end-swap
264	// everything. We can use fancy SSE toys for this.
265	0: cmp ebx, 16
266	je 0f
267
268	// Find the byte-reordering table.
269	ldgot ecx
8d6ca554	270	movdqa xmm5, [INTADDR(endswap_tab, ecx)]
1a0c09c4 MW	271
	272	// Calculate the number of subkey words again. (It's a good job
	273	// we've got a fast multiplier.)
	274	mov ecx, [ebp + nr]
	275	add ecx, 1
	276	imul ecx, [esp + 24] // total keys in words
	277
	278	// End-swap the encryption keys.
	279	mov eax, ecx
	280	lea esi, [ebp + w]
	281	call endswap_block
	282
	283	// And the decryption keys.
	284	mov ecx, eax
	285	lea esi, [ebp + wi]
	286	call endswap_block
	287
	288	// All done.
	289	0: pop edi
	290	pop esi
	291	pop ebx
	292	pop ebp
	293	ret
	294
	295	.align 16
	296	endswap_block:
	297	// End-swap ECX words starting at ESI. The end-swapping table is
8d6ca554	298	// already loaded into XMM5; and it's OK to work in 16-byte chunks.
1a0c09c4	299	movdqu xmm1, [esi]
8d6ca554	300	pshufb xmm1, xmm5
1a0c09c4 MW	301	movdqu [esi], xmm1
	302	add esi, 16
	303	sub ecx, 4
	304	ja endswap_block
	305	ret
	306
	307	ENDFUNC
	308
	309	///--------------------------------------------------------------------------
	310	/// Encrypting and decrypting blocks.
	311
e297526c MW	312	.macro encdec op, aes, koff
e297526c MW	313	FUNC(rijndael_\op\()_x86_aesni)
1a0c09c4 MW	314
	315	// On entry, we have:
	316	// [esp + 4] points to the context block
	317	// [esp + 8] points to the input data block
	318	// [esp + 12] points to the output buffer
	319
	320	// Find the magic endianness-swapping table.
	321	ldgot ecx
8d6ca554	322	movdqa xmm5, [INTADDR(endswap_tab, ecx)]
1a0c09c4 MW	323
	324	// Load the input block and end-swap it. Also, start loading the
	325	// keys.
	326	mov eax, [esp + 8]
	327	movdqu xmm0, [eax]
8d6ca554	328	pshufb xmm0, xmm5
1a0c09c4	329	mov eax, [esp + 4]
e297526c	330	lea edx, [eax + \koff]
1a0c09c4 MW	331	mov eax, [eax + nr]
	332
	333	// Initial whitening.
	334	movdqu xmm1, [edx]
	335	add edx, 16
	336	pxor xmm0, xmm1
	337
	338	// Dispatch to the correct code.
	339	cmp eax, 10
e297526c	340	je 10f
1a0c09c4 MW	341	jb bogus
1a0c09c4 MW	342	cmp eax, 14
e297526c	343	je 14f
1a0c09c4 MW	344	ja bogus
1a0c09c4 MW	345	cmp eax, 12
e297526c MW	346	je 12f
	347	jb 11f
	348	jmp 13f
1a0c09c4 MW	349
	350	.align 2
	351
	352	// 14 rounds...
e297526c	353	14: movdqu xmm1, [edx]
1a0c09c4	354	add edx, 16
e297526c	355	\aes xmm0, xmm1
1a0c09c4 MW	356
1a0c09c4 MW	357	// 13 rounds...
e297526c	358	13: movdqu xmm1, [edx]
1a0c09c4	359	add edx, 16
e297526c	360	\aes xmm0, xmm1
1a0c09c4 MW	361
1a0c09c4 MW	362	// 12 rounds...
e297526c	363	12: movdqu xmm1, [edx]
1a0c09c4	364	add edx, 16
e297526c	365	\aes xmm0, xmm1
1a0c09c4 MW	366
1a0c09c4 MW	367	// 11 rounds...
e297526c	368	11: movdqu xmm1, [edx]
1a0c09c4	369	add edx, 16
e297526c	370	\aes xmm0, xmm1
1a0c09c4 MW	371
1a0c09c4 MW	372	// 10 rounds...
e297526c MW	373	10: movdqu xmm1, [edx]
e297526c MW	374	\aes xmm0, xmm1
1a0c09c4 MW	375
	376	// 9 rounds...
	377	movdqu xmm1, [edx + 16]
e297526c	378	\aes xmm0, xmm1
1a0c09c4 MW	379
	380	// 8 rounds...
	381	movdqu xmm1, [edx + 32]
e297526c	382	\aes xmm0, xmm1
1a0c09c4 MW	383
	384	// 7 rounds...
	385	movdqu xmm1, [edx + 48]
e297526c	386	\aes xmm0, xmm1
1a0c09c4 MW	387
	388	// 6 rounds...
	389	movdqu xmm1, [edx + 64]
e297526c	390	\aes xmm0, xmm1
1a0c09c4 MW	391
	392	// 5 rounds...
	393	movdqu xmm1, [edx + 80]
e297526c	394	\aes xmm0, xmm1
1a0c09c4 MW	395
	396	// 4 rounds...
	397	movdqu xmm1, [edx + 96]
e297526c	398	\aes xmm0, xmm1
1a0c09c4 MW	399
	400	// 3 rounds...
	401	movdqu xmm1, [edx + 112]
e297526c	402	\aes xmm0, xmm1
1a0c09c4 MW	403
	404	// 2 rounds...
	405	movdqu xmm1, [edx + 128]
e297526c	406	\aes xmm0, xmm1
1a0c09c4 MW	407
	408	// Final round...
	409	movdqu xmm1, [edx + 144]
e297526c	410	\aes\()last xmm0, xmm1
1a0c09c4 MW	411
1a0c09c4 MW	412	// Unpermute the ciphertext block and store it.
8d6ca554	413	pshufb xmm0, xmm5
1a0c09c4 MW	414	mov eax, [esp + 12]
	415	movdqu [eax], xmm0
	416
	417	// And we're done.
	418	ret
	419
	420	ENDFUNC
e297526c	421	.endm
1a0c09c4	422
e297526c MW	423	encdec eblk, aesenc, w
e297526c MW	424	encdec dblk, aesdec, wi
1a0c09c4 MW	425
	426	///--------------------------------------------------------------------------
	427	/// Random utilities.
	428
	429	.align 16
	430	// Abort the process because of a programming error. Indirecting
	431	// through this point serves several purposes: (a) by CALLing, rather
	432	// than branching to, `abort', we can save the return address, which
	433	// might at least provide a hint as to what went wrong; (b) we don't
	434	// have conditional CALLs (and they'd be big anyway); and (c) we can
	435	// write a HLT here as a backstop against `abort' being mad.
	436	bogus: callext F(abort)
	437	0: hlt
	438	jmp 0b
	439
	440	gotaux ecx
	441
	442	///--------------------------------------------------------------------------
	443	/// Data tables.
	444
	445	.align 16
	446	endswap_tab:
	447	.byte 3, 2, 1, 0
	448	.byte 7, 6, 5, 4
	449	.byte 11, 10, 9, 8
	450	.byte 15, 14, 13, 12
	451
	452	///----- That's all, folks --------------------------------------------------