[catacomb] / symm / rijndael-x86-aesni.S

/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// AESNI-based implementation of Rijndael
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// External definitions.

	.globl	F(abort)
	.globl	F(rijndael_rcon)

///--------------------------------------------------------------------------
/// Local utilities.

// Magic constants for shuffling.  These are used as `pshufd' immediates:
// each two-bit field, counting up from the least significant, names the
// source 32-bit lane which is copied into the corresponding destination
// lane.
//
//	ROTL: destination lanes 0, 1, 2, 3 <- source lanes 3, 0, 1, 2
//	ROT2: destination lanes 0, 1, 2, 3 <- source lanes 2, 3, 0, 1
//	ROTR: destination lanes 0, 1, 2, 3 <- source lanes 1, 2, 3, 0
//
// (ROT2 is not referenced by the code below.)
#define ROTL 0x93
#define ROT2 0x4e
#define ROTR 0x39

///--------------------------------------------------------------------------
/// Main code.

	.arch	.aes
	.section .text

/// The AESNI instructions implement a little-endian version of AES, but
/// Catacomb's internal interface presents as big-endian so as to work better
/// with things like GCM.  We therefore maintain the round keys in
/// little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the AESNI instructions don't implement the
/// larger-block versions of Rijndael, so we have to end-swap the keys if
/// we're preparing for one of those.

	// Useful constants.  With the values below, kbufsz works out to
	// 32*(16 + 1) = 544 bytes.
	.equ	maxrounds, 16		// maximum number of rounds
	.equ	maxblksz, 32		// maximum block size, in bytes
	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer

	// Context structure.  These are byte offsets from the context base
	// pointer: the round count is read as a 32-bit word at offset 0; the
	// encryption schedule starts immediately after it (offset 4) and
	// occupies kbufsz bytes; the decryption schedule starts after that
	// (offset 4 + kbufsz = 548).  Note that neither schedule is
	// 16-byte aligned relative to the context base.
	.equ	nr, 0			// number of rounds
	.equ	w, nr + 4		// encryption key words
	.equ	wi, w + kbufsz		// decryption key words

///--------------------------------------------------------------------------
/// Key setup.

FUNC(rijndael_setup_x86_aesni)

	// Fill in the encryption (`w') and decryption (`wi') key schedules
	// of a context, given the raw key.
	//
	// Initial state.  We have four arguments (the offsets given here
	// are as seen /after/ the four register pushes below):
	// [esp + 20] is the context pointer
	// [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
	// [esp + 28] points to the key material, unaligned
	// [esp + 32] is the size of the key, in words
	// The key size has already been checked for validity, and the number
	// of rounds has been computed.  Our job is only to fill in the `w'
	// and `wi' vectors.
	//
	// EBP, EBX, ESI and EDI are saved here and restored on exit.  EAX,
	// ECX, EDX, XMM0 and XMM1 are overwritten, as is XMM5 if the block
	// size is not four words.

	push	ebp
	push	ebx
	push	esi
	push	edi

	// The initial round key material is taken directly from the input
	// key, so copy it over.
	mov	ebp, [esp + 20]		// context base pointer
	mov	ebx, [esp + 32]		// key size, in words
	mov	ecx, ebx
	mov	esi, [esp + 28]
	lea	edi, [ebp + w]
	rep	movsd

	// Find out other useful things.  EDX ends up as the number of key
	// words still to be generated: (rounds + 1) * block-size words in
	// total, less the key-size words which were just copied.
	mov	edx, [ebp + nr]		// number of rounds
	add	edx, 1
	imul	edx, [esp + 24]		// total key size in words
	sub	edx, ebx		// offset by the key size

	// Find the round constants.  They are consumed one byte at a time
	// in the loop below.
	ldgot	ecx
	leaext	ecx, rijndael_rcon, ecx

	// Prepare for the main loop.  From here until the loop finishes:
	//
	//	EAX	the most recently generated key word
	//	EBX	the key size, in words
	//	ECX	address of the next round-constant byte
	//	EDX	limit: we stop as soon as ESI reaches this address
	//	ESI	address of the word one key-length behind the word
	//		about to be written; the output goes to
	//		[ESI + 4*EBX]
	lea	esi, [ebp + w]
	mov	eax, [esi + 4*ebx - 4]	// most recent key word
	lea	edx, [esi + 4*edx]	// limit, offset by one key expansion

	// Main key expansion loop.  The first word of each key-length chunk
	// needs special treatment.
	//
	// This is rather tedious because the Intel `AESKEYGENASSIST'
	// instruction is very strangely shaped.  Firstly, it wants to
	// operate on vast SSE registers, even though we're data-blocked from
	// doing more than one operation at a time unless we're doing two key
	// schedules simultaneously -- and even then we can't do more than
	// two, because the instruction ignores two of its input words
	// entirely, and produces two different outputs for each of the other
	// two.  And secondly it insists on taking the magic round constant
	// as an immediate, so it's kind of annoying if you're not
	// open-coding the whole thing.  It's much easier to leave that as
	// zero and XOR in the round constant by hand.
	//
	// Word 0 of the chunk: ROTR moves EAX from lane 0 into lane 3, from
	// which `aeskeygenassist' produces the substituted and rotated word
	// in lane 3 of XMM1; ROTL then brings that back down to lane 0 so
	// that `movd' can extract it.  The round constant is XORed into the
	// low byte, and the constant pointer advanced by one byte.
9:	movd	xmm0, eax
	pshufd	xmm0, xmm0, ROTR
	aeskeygenassist xmm1, xmm0, 0
	pshufd	xmm1, xmm1, ROTL
	movd	eax, xmm1
	xor	eax, [esi]
	xor	al, [ecx]
	inc	ecx
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// The next three words are simple: each is just the previous word
	// XORed with the word one key-length back.  (Word 1...)
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// (Word 2...)
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// (Word 3...)
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// Word 4.  If the key is /more/ than 6 words long, then we must
	// apply a substitution here.  A four-word key has no word 4, so in
	// that case start the next chunk instead.
	//
	// For the substitution, ROTL moves EAX from lane 0 into lane 1, and
	// lane 0 of the `aeskeygenassist' result is then the substituted
	// (but not rotated) word.
	cmp	ebx, 5
	jb	9b
	cmp	ebx, 7
	jb	0f
	movd	xmm0, eax
	pshufd	xmm0, xmm0, ROTL
	aeskeygenassist xmm1, xmm0, 0
	movd	eax, xmm1
0:	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// (Word 5...)  Only present if the key has at least six words.
	cmp	ebx, 6
	jb	9b
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// (Word 6...)  Only present if the key has at least seven words.
	cmp	ebx, 7
	jb	9b
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// (Word 7...)  Only present if the key has at least eight words.
	cmp	ebx, 8
	jb	9b
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// Must be done by now: this code handles at most eight words per
	// chunk, so go round again for the next chunk.
	jmp	9b

	// Next job is to construct the decryption keys.  The keys for the
	// first and last rounds don't need to be mangled, but the remaining
	// ones do -- and they all need to be reordered too.
	//
	// The plan of action, then, is to copy the final encryption round's
	// keys into place first, then to do each of the intermediate rounds
	// in reverse order, and finally do the first round.
	//
	// Do all of the heavy lifting with SSE registers.  The order we're
	// doing this in means that it's OK if we read or write too much, and
	// there's easily enough buffer space for the over-enthusiastic reads
	// and writes because the context has space for 32-byte blocks, which
	// is our maximum and an exact fit for two SSE registers.
	//
	// Register use from here:
	//
	//	EBX	block size (converted from words to bytes below)
	//	ECX	countdown, starting from the number of rounds
	//	ESI	source: walks backwards through `w', one block at a
	//		time
	//	EDI	destination: walks forwards through `wi'
8:	mov	ecx, [ebp + nr]		// number of rounds
	mov	ebx, [esp + 24]		// block size (in words)
	mov	edx, ecx
	imul	edx, ebx
	lea	edi, [ebp + wi]
	lea	esi, [ebp + 4*edx + w]	// last round's keys
	shl	ebx, 2			// block size (in bytes now)

	// Copy the last encryption round's keys.  The second 16-byte half
	// is only needed if the block is larger than 16 bytes.
	movdqu	xmm0, [esi]
	movdqu	[edi], xmm0
	cmp	ebx, 16
	jbe	9f
	movdqu	xmm0, [esi + 16]
	movdqu	[edi + 16], xmm0

	// Update the loop variables and stop if we've finished.  When the
	// countdown reaches zero, ESI is pointing at the first round's keys,
	// which are handled separately below.
9:	add	edi, ebx
	sub	esi, ebx
	sub	ecx, 1
	jbe	0f

	// Do another middle round's keys...  These are transformed using
	// `aesimc' on the way through.
	movdqu	xmm0, [esi]
	aesimc	xmm0, xmm0
	movdqu	[edi], xmm0
	cmp	ebx, 16
	jbe	9b
	movdqu	xmm0, [esi + 16]
	aesimc	xmm0, xmm0
	movdqu	[edi + 16], xmm0
	jmp	9b

	// Finally do the first encryption round.  This is a plain copy.
0:	movdqu	xmm0, [esi]
	movdqu	[edi], xmm0
	cmp	ebx, 16
	jbe	0f
	movdqu	xmm0, [esi + 16]
	movdqu	[edi + 16], xmm0

	// If the block size is not exactly four words then we must end-swap
	// everything.  We can use fancy SSE toys for this.
0:	cmp	ebx, 16
	je	0f

	// Find the byte-reordering table.
	ldgot	ecx
	movdqa	xmm5, [INTADDR(endswap_tab, ecx)]

	// Calculate the number of subkey words again.  (It's a good job
	// we've got a fast multiplier.)
	mov	ecx, [ebp + nr]
	add	ecx, 1
	imul	ecx, [esp + 24]		// total keys in words

	// End-swap the encryption keys.  The helper destroys ECX, so keep a
	// copy of the word count in EAX for the second call.
	mov	eax, ecx
	lea	esi, [ebp + w]
	call	endswap_block

	// And the decryption keys.
	mov	ecx, eax
	lea	esi, [ebp + wi]
	call	endswap_block

	// All done.
0:	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

	.align	16
endswap_block:
	// End-swap ECX words starting at ESI.  The end-swapping table is
	// already loaded into XMM5; and it's OK to work in 16-byte chunks.
	//
	// The loop continues for as long as the remaining count is
	// positive, so a count which isn't a multiple of four is
	// effectively rounded up.  On exit, ESI points just past the last
	// chunk processed; ECX and XMM1 are clobbered.
	movdqu	xmm1, [esi]
	pshufb	xmm1, xmm5
	movdqu	[esi], xmm1
	add	esi, 16
	sub	ecx, 4
	ja	endswap_block
	ret

ENDFUNC

///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.

FUNC(rijndael_eblk_x86_aesni)

	// Encrypt a single 16-byte block.  Stack arguments on entry:
	//
	//	[esp +  4]	the context block
	//	[esp +  8]	the input data block
	//	[esp + 12]	the output buffer
	//
	// Register roles: ECX is the context pointer (once the table has
	// been fetched), EAX the round count, EDX the address of the final
	// round key, XMM0 the cipher state, XMM1 the current round key, and
	// XMM5 the end-swapping mask.  Nothing is preserved for the caller
	// beyond what these leave untouched.

	// Fetch the end-swapping mask.
	ldgot	ecx
	movdqa	xmm5, [INTADDR(endswap_tab, ecx)]

	// Collect the arguments we need straight away.
	mov	ecx, [esp + 4]		// context
	mov	edx, [esp + 8]		// input block
	mov	eax, [ecx + nr]		// number of rounds

	// Load the input, end-swap it, and mix in the whitening key, which
	// is the first entry of the encryption schedule.
	movdqu	xmm0, [edx]
	pshufb	xmm0, xmm5
	movdqu	xmm1, [ecx + w]
	pxor	xmm0, xmm1

	// Point EDX at the key for the final round, which lives 16*rounds
	// bytes into the schedule.  All of the other round keys are then
	// found at fixed negative offsets from there, whatever the round
	// count turns out to be.
	mov	edx, eax
	shl	edx, 4
	lea	edx, [ecx + edx + w]

	// Pick the entry point for this number of rounds.  Anything outside
	// the range 10--14 is a programming error.
	cmp	eax, 10
	jb	bogus
	je	er10
	cmp	eax, 12
	jb	er11
	je	er12
	cmp	eax, 14
	ja	bogus
	je	er14
	jmp	er13

	.align	2

	// Entry point for 14 rounds: 14 rounds still to go.
er14:	movdqu	xmm1, [edx - 208]
	aesenc	xmm0, xmm1

	// Entry point for 13 rounds: 13 rounds still to go.
er13:	movdqu	xmm1, [edx - 192]
	aesenc	xmm0, xmm1

	// Entry point for 12 rounds: 12 rounds still to go.
er12:	movdqu	xmm1, [edx - 176]
	aesenc	xmm0, xmm1

	// Entry point for 11 rounds: 11 rounds still to go.
er11:	movdqu	xmm1, [edx - 160]
	aesenc	xmm0, xmm1

	// Entry point for 10 rounds: 10 rounds still to go.
er10:	movdqu	xmm1, [edx - 144]
	aesenc	xmm0, xmm1

	// Nine to go.
	movdqu	xmm1, [edx - 128]
	aesenc	xmm0, xmm1

	// Eight to go.
	movdqu	xmm1, [edx - 112]
	aesenc	xmm0, xmm1

	// Seven to go.
	movdqu	xmm1, [edx - 96]
	aesenc	xmm0, xmm1

	// Six to go.
	movdqu	xmm1, [edx - 80]
	aesenc	xmm0, xmm1

	// Five to go.
	movdqu	xmm1, [edx - 64]
	aesenc	xmm0, xmm1

	// Four to go.
	movdqu	xmm1, [edx - 48]
	aesenc	xmm0, xmm1

	// Three to go.
	movdqu	xmm1, [edx - 32]
	aesenc	xmm0, xmm1

	// Two to go.
	movdqu	xmm1, [edx - 16]
	aesenc	xmm0, xmm1

	// The final round uses the key which EDX addresses directly.
	movdqu	xmm1, [edx]
	aesenclast xmm0, xmm1

	// End-swap the ciphertext back again and write it out.
	pshufb	xmm0, xmm5
	mov	eax, [esp + 12]
	movdqu	[eax], xmm0

	// Finished.
	ret

ENDFUNC

FUNC(rijndael_dblk_x86_aesni)

	// Decrypt a single 16-byte block.  Stack arguments on entry:
	//
	//	[esp +  4]	the context block
	//	[esp +  8]	the input data block
	//	[esp + 12]	the output buffer
	//
	// Register roles: ECX is the context pointer (once the table has
	// been fetched), EAX the round count, EDX the address of the final
	// round key, XMM0 the cipher state, XMM1 the current round key, and
	// XMM5 the end-swapping mask.  Nothing is preserved for the caller
	// beyond what these leave untouched.

	// Fetch the end-swapping mask.
	ldgot	ecx
	movdqa	xmm5, [INTADDR(endswap_tab, ecx)]

	// Collect the arguments we need straight away.
	mov	ecx, [esp + 4]		// context
	mov	edx, [esp + 8]		// input block
	mov	eax, [ecx + nr]		// number of rounds

	// Load the input, end-swap it, and mix in the whitening key, which
	// is the first entry of the decryption schedule.
	movdqu	xmm0, [edx]
	pshufb	xmm0, xmm5
	movdqu	xmm1, [ecx + wi]
	pxor	xmm0, xmm1

	// Point EDX at the key for the final round, which lives 16*rounds
	// bytes into the schedule.  All of the other round keys are then
	// found at fixed negative offsets from there, whatever the round
	// count turns out to be.
	mov	edx, eax
	shl	edx, 4
	lea	edx, [ecx + edx + wi]

	// Pick the entry point for this number of rounds.  Anything outside
	// the range 10--14 is a programming error.
	cmp	eax, 10
	jb	bogus
	je	dr10
	cmp	eax, 12
	jb	dr11
	je	dr12
	cmp	eax, 14
	ja	bogus
	je	dr14
	jmp	dr13

	.align	2

	// Entry point for 14 rounds: 14 rounds still to go.
dr14:	movdqu	xmm1, [edx - 208]
	aesdec	xmm0, xmm1

	// Entry point for 13 rounds: 13 rounds still to go.
dr13:	movdqu	xmm1, [edx - 192]
	aesdec	xmm0, xmm1

	// Entry point for 12 rounds: 12 rounds still to go.
dr12:	movdqu	xmm1, [edx - 176]
	aesdec	xmm0, xmm1

	// Entry point for 11 rounds: 11 rounds still to go.
dr11:	movdqu	xmm1, [edx - 160]
	aesdec	xmm0, xmm1

	// Entry point for 10 rounds: 10 rounds still to go.
dr10:	movdqu	xmm1, [edx - 144]
	aesdec	xmm0, xmm1

	// Nine to go.
	movdqu	xmm1, [edx - 128]
	aesdec	xmm0, xmm1

	// Eight to go.
	movdqu	xmm1, [edx - 112]
	aesdec	xmm0, xmm1

	// Seven to go.
	movdqu	xmm1, [edx - 96]
	aesdec	xmm0, xmm1

	// Six to go.
	movdqu	xmm1, [edx - 80]
	aesdec	xmm0, xmm1

	// Five to go.
	movdqu	xmm1, [edx - 64]
	aesdec	xmm0, xmm1

	// Four to go.
	movdqu	xmm1, [edx - 48]
	aesdec	xmm0, xmm1

	// Three to go.
	movdqu	xmm1, [edx - 32]
	aesdec	xmm0, xmm1

	// Two to go.
	movdqu	xmm1, [edx - 16]
	aesdec	xmm0, xmm1

	// The final round uses the key which EDX addresses directly.
	movdqu	xmm1, [edx]
	aesdeclast xmm0, xmm1

	// End-swap the plaintext back again and write it out.
	pshufb	xmm0, xmm5
	mov	eax, [esp + 12]
	movdqu	[eax], xmm0

	// Finished.
	ret

ENDFUNC

///--------------------------------------------------------------------------
/// Random utilities.

	.align	16
	// Abort the process because of a programming error.  Indirecting
	// through this point serves several purposes: (a) by CALLing, rather
	// than branching to, `abort', we can save the return address, which
	// might at least provide a hint as to what went wrong; (b) we don't
	// have conditional CALLs (and they'd be big anyway); and (c) we can
	// write a HLT here as a backstop against `abort' being mad.
	//
	// The block encryption and decryption functions branch here if the
	// round count stored in the context is outside the range 10--14.
	// Should `abort' somehow return, we spin on HLT rather than fall
	// through into whatever follows.
bogus:	callext	F(abort)
0:	hlt
	jmp	0b

	// NOTE(review): presumably this emits the support code which the
	// `ldgot ecx' invocations in this file rely on -- confirm against
	// the macro definitions in asm-common.h.
	gotaux	ecx

///--------------------------------------------------------------------------
/// Data tables.

	.align	16
endswap_tab:
	// Byte-selection mask for `pshufb': output byte i is taken from the
	// input byte whose index is given by entry i, so this reverses the
	// byte order within each of the four 32-bit words.  The table is
	// fetched using `movdqa', so it must stay 16-byte aligned.
	.byte	 3,  2,  1,  0,  7,  6,  5,  4	// words 0 and 1
	.byte	11, 10,  9,  8, 15, 14, 13, 12	// words 2 and 3

///----- That's all, folks --------------------------------------------------
Commit	Line	Data
1a0c09c4 MW	1	/// -- mode: asm; asm-comment-char: ?/ --
	2	///
	3	/// AESNI-based implementation of Rijndael
	4	///
	5	/// (c) 2015 Straylight/Edgeware
	6	///
	7
	8	///----- Licensing notice ---------------------------------------------------
	9	///
	10	/// This file is part of Catacomb.
	11	///
	12	/// Catacomb is free software; you can redistribute it and/or modify
	13	/// it under the terms of the GNU Library General Public License as
	14	/// published by the Free Software Foundation; either version 2 of the
	15	/// License, or (at your option) any later version.
	16	///
	17	/// Catacomb is distributed in the hope that it will be useful,
	18	/// but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	/// GNU Library General Public License for more details.
	21	///
	22	/// You should have received a copy of the GNU Library General Public
	23	/// License along with Catacomb; if not, write to the Free
	24	/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	/// MA 02111-1307, USA.
	26
	27	///--------------------------------------------------------------------------
	28	/// External definitions.
	29
	30	#include "config.h"
	31	#include "asm-common.h"
	32
	33	///--------------------------------------------------------------------------
	34	/// External definitions.
	35
	36	.globl F(abort)
	37	.globl F(rijndael_rcon)
	38
	39	///--------------------------------------------------------------------------
47103664 MW	40	/// Local utilities.
	41
	42	// Magic constants for shuffling.
	43	#define ROTL 0x93
	44	#define ROT2 0x4e
	45	#define ROTR 0x39
	46
	47	///--------------------------------------------------------------------------
1a0c09c4 MW	48	/// Main code.
	49
	50	.arch .aes
	51	.section .text
	52
	53	/// The AESNI instructions implement a little-endian version of AES, but
	54	/// Catacomb's internal interface presents as big-endian so as to work better
	55	/// with things like GCM. We therefore maintain the round keys in
	56	/// little-endian form, and have to end-swap blocks in and out.
	57	///
	58	/// For added amusement, the AESNI instructions don't implement the
	59	/// larger-block versions of Rijndael, so we have to end-swap the keys if
	60	/// we're preparing for one of those.
	61
	62	// Useful constants.
	63	.equ maxrounds, 16 // maximum number of rounds
	64	.equ maxblksz, 32 // maximum block size, in bytes
	65	.equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
	66
	67	// Context structure.
	68	.equ nr, 0 // number of rounds
	69	.equ w, nr + 4 // encryption key words
	70	.equ wi, w + kbufsz // decryption key words
	71
	72	///--------------------------------------------------------------------------
	73	/// Key setup.
	74
	75	FUNC(rijndael_setup_x86_aesni)
	76
	77	// Initial state. We have four arguments:
	78	// [esp + 20] is the context pointer
	79	// [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
	80	// [esp + 28] points to the key material, unaligned
	81	// [esp + 32] is the size of the key, in words
	82	// The key size has already been checked for validity, and the number
	83	// of rounds has been computed. Our job is only to fill in the `w'
	84	// and `wi' vectors.
	85
	86	push ebp
	87	push ebx
	88	push esi
	89	push edi
	90
	91	// The initial round key material is taken directly from the input
	92	// key, so copy it over.
	93	mov ebp, [esp + 20] // context base pointer
	94	mov ebx, [esp + 32] // key size, in words
	95	mov ecx, ebx
	96	mov esi, [esp + 28]
	97	lea edi, [ebp + w]
	98	rep movsd
	99
	100	// Find out other useful things.
	101	mov edx, [ebp + nr] // number of rounds
	102	add edx, 1
	103	imul edx, [esp + 24] // total key size in words
	104	sub edx, ebx // offset by the key size
	105
	106	// Find the round constants.
	107	ldgot ecx
	108	leaext ecx, rijndael_rcon, ecx
	109
	110	// Prepare for the main loop.
	111	lea esi, [ebp + w]
112	mov eax, [esi + 4*ebx - 4] // most recent key word
113	lea edx, [esi + 4*edx] // limit, offset by one key expansion
114
115	// Main key expansion loop. The first word of each key-length chunk
116	// needs special treatment.
117	//
118	// This is rather tedious because the Intel `AESKEYGENASSIST'
119	// instruction is very strangely shaped. Firstly, it wants to
120	// operate on vast SSE registers, even though we're data-blocked from
121	// doing more than operation at a time unless we're doing two key
122	// schedules simultaneously -- and even then we can't do more than
123	// two, because the instruction ignores two of its input words
124	// entirely, and produces two different outputs for each of the other
125	// two. And secondly it insists on taking the magic round constant
126	// as an immediate, so it's kind of annoying if you're not
127	// open-coding the whole thing. It's much easier to leave that as
128	// zero and XOR in the round constant by hand.
129	9: movd xmm0, eax
47103664	130	pshufd xmm0, xmm0, ROTR
1a0c09c4	131	aeskeygenassist xmm1, xmm0, 0
47103664	132	pshufd xmm1, xmm1, ROTL
1a0c09c4 MW	133	movd eax, xmm1
	134	xor eax, [esi]
	135	xor al, [ecx]
	136	inc ecx
	137	mov [esi + 4*ebx], eax
	138	add esi, 4
	139	cmp esi, edx
	140	jae 8f
	141
	142	// The next three words are simple...
	143	xor eax, [esi]
	144	mov [esi + 4*ebx], eax
	145	add esi, 4
	146	cmp esi, edx
	147	jae 8f
	148
	149	// (Word 2...)
	150	xor eax, [esi]
	151	mov [esi + 4*ebx], eax
	152	add esi, 4
	153	cmp esi, edx
	154	jae 8f
	155
	156	// (Word 3...)
	157	xor eax, [esi]
	158	mov [esi + 4*ebx], eax
	159	add esi, 4
	160	cmp esi, edx
	161	jae 8f
	162
	163	// Word 4. If the key is /more/ than 6 words long, then we must
	164	// apply a substitution here.
	165	cmp ebx, 5
	166	jb 9b
	167	cmp ebx, 7
	168	jb 0f
	169	movd xmm0, eax
47103664	170	pshufd xmm0, xmm0, ROTL
1a0c09c4 MW	171	aeskeygenassist xmm1, xmm0, 0
	172	movd eax, xmm1
	173	0: xor eax, [esi]
	174	mov [esi + 4*ebx], eax
	175	add esi, 4
	176	cmp esi, edx
	177	jae 8f
	178
	179	// (Word 5...)
	180	cmp ebx, 6
	181	jb 9b
	182	xor eax, [esi]
	183	mov [esi + 4*ebx], eax
	184	add esi, 4
	185	cmp esi, edx
	186	jae 8f
	187
	188	// (Word 6...)
	189	cmp ebx, 7
	190	jb 9b
	191	xor eax, [esi]
	192	mov [esi + 4*ebx], eax
	193	add esi, 4
	194	cmp esi, edx
	195	jae 8f
	196
	197	// (Word 7...)
	198	cmp ebx, 8
	199	jb 9b
	200	xor eax, [esi]
	201	mov [esi + 4*ebx], eax
	202	add esi, 4
	203	cmp esi, edx
	204	jae 8f
	205
	206	// Must be done by now.
	207	jmp 9b
	208
	209	// Next job is to construct the decryption keys. The keys for the
	210	// first and last rounds don't need to be mangled, but the remaining
	211	// ones do -- and they all need to be reordered too.
	212	//
	213	// The plan of action, then, is to copy the final encryption round's
	214	// keys into place first, then to do each of the intermediate rounds
	215	// in reverse order, and finally do the first round.
	216	//
	217	// Do all of the heavy lifting with SSE registers. The order we're
	218	// doing this in means that it's OK if we read or write too much, and
	219	// there's easily enough buffer space for the over-enthusiastic reads
	220	// and writes because the context has space for 32-byte blocks, which
	221	// is our maximum and an exact fit for two SSE registers.
	222	8: mov ecx, [ebp + nr] // number of rounds
	223	mov ebx, [esp + 24] // block size (in words)
	224	mov edx, ecx
	225	imul edx, ebx
	226	lea edi, [ebp + wi]
	227	lea esi, [ebp + 4*edx + w] // last round's keys
	228	shl ebx, 2 // block size (in bytes now)
	229
	230	// Copy the last encryption round's keys.
	231	movdqu xmm0, [esi]
	232	movdqu [edi], xmm0
	233	cmp ebx, 16
	234	jbe 9f
235	movdqu xmm0, [esi + 16]
236	movdqu [edi + 16], xmm0
237
238	// Update the loop variables and stop if we've finished.
239	9: add edi, ebx
240	sub esi, ebx
241	sub ecx, 1
242	jbe 0f
243
244	// Do another middle round's keys...
245	movdqu xmm0, [esi]
246	aesimc xmm0, xmm0
247	movdqu [edi], xmm0
248	cmp ebx, 16
249	jbe 9b
250	movdqu xmm0, [esi + 16]
251	aesimc xmm0, xmm0
252	movdqu [edi + 16], xmm0
253	jmp 9b
254
255	// Finally do the first encryption round.
256	0: movdqu xmm0, [esi]
257	movdqu [edi], xmm0
258	cmp ebx, 16
259	jbe 0f
260	movdqu xmm0, [esi + 16]
261	movdqu [edi + 16], xmm0
262
263	// If the block size is not exactly four words then we must end-swap
264	// everything. We can use fancy SSE toys for this.
265	0: cmp ebx, 16
266	je 0f
267
268	// Find the byte-reordering table.
269	ldgot ecx
8d6ca554	270	movdqa xmm5, [INTADDR(endswap_tab, ecx)]
1a0c09c4 MW	271
	272	// Calculate the number of subkey words again. (It's a good job
	273	// we've got a fast multiplier.)
	274	mov ecx, [ebp + nr]
	275	add ecx, 1
	276	imul ecx, [esp + 24] // total keys in words
	277
	278	// End-swap the encryption keys.
	279	mov eax, ecx
	280	lea esi, [ebp + w]
	281	call endswap_block
	282
	283	// And the decryption keys.
	284	mov ecx, eax
	285	lea esi, [ebp + wi]
	286	call endswap_block
	287
	288	// All done.
	289	0: pop edi
	290	pop esi
	291	pop ebx
	292	pop ebp
	293	ret
	294
	295	.align 16
	296	endswap_block:
	297	// End-swap ECX words starting at ESI. The end-swapping table is
8d6ca554	298	// already loaded into XMM5; and it's OK to work in 16-byte chunks.
1a0c09c4	299	movdqu xmm1, [esi]
8d6ca554	300	pshufb xmm1, xmm5
1a0c09c4 MW	301	movdqu [esi], xmm1
	302	add esi, 16
	303	sub ecx, 4
	304	ja endswap_block
	305	ret
	306
	307	ENDFUNC
	308
	309	///--------------------------------------------------------------------------
	310	/// Encrypting and decrypting blocks.
	311
	312	FUNC(rijndael_eblk_x86_aesni)
	313
	314	// On entry, we have:
	315	// [esp + 4] points to the context block
	316	// [esp + 8] points to the input data block
	317	// [esp + 12] points to the output buffer
	318
	319	// Find the magic endianness-swapping table.
	320	ldgot ecx
8d6ca554	321	movdqa xmm5, [INTADDR(endswap_tab, ecx)]
1a0c09c4 MW	322
	323	// Load the input block and end-swap it. Also, start loading the
	324	// keys.
	325	mov eax, [esp + 8]
	326	movdqu xmm0, [eax]
8d6ca554	327	pshufb xmm0, xmm5
1a0c09c4 MW	328	mov eax, [esp + 4]
	329	lea edx, [eax + w]
	330	mov eax, [eax + nr]
	331
	332	// Initial whitening.
	333	movdqu xmm1, [edx]
	334	add edx, 16
	335	pxor xmm0, xmm1
	336
	337	// Dispatch to the correct code.
	338	cmp eax, 10
	339	je er10
	340	jb bogus
	341	cmp eax, 14
	342	je er14
	343	ja bogus
	344	cmp eax, 12
	345	je er12
	346	jb er11
	347	jmp er13
	348
	349	.align 2
	350
	351	// 14 rounds...
	352	er14: movdqu xmm1, [edx]
	353	add edx, 16
	354	aesenc xmm0, xmm1
	355
	356	// 13 rounds...
	357	er13: movdqu xmm1, [edx]
	358	add edx, 16
	359	aesenc xmm0, xmm1
	360
	361	// 12 rounds...
	362	er12: movdqu xmm1, [edx]
	363	add edx, 16
	364	aesenc xmm0, xmm1
	365
	366	// 11 rounds...
	367	er11: movdqu xmm1, [edx]
	368	add edx, 16
	369	aesenc xmm0, xmm1
	370
	371	// 10 rounds...
	372	er10: movdqu xmm1, [edx]
	373	aesenc xmm0, xmm1
	374
	375	// 9 rounds...
	376	movdqu xmm1, [edx + 16]
	377	aesenc xmm0, xmm1
	378
	379	// 8 rounds...
	380	movdqu xmm1, [edx + 32]
	381	aesenc xmm0, xmm1
	382
	383	// 7 rounds...
	384	movdqu xmm1, [edx + 48]
	385	aesenc xmm0, xmm1
	386
	387	// 6 rounds...
	388	movdqu xmm1, [edx + 64]
	389	aesenc xmm0, xmm1
	390
	391	// 5 rounds...
392	movdqu xmm1, [edx + 80]
393	aesenc xmm0, xmm1
394
395	// 4 rounds...
396	movdqu xmm1, [edx + 96]
397	aesenc xmm0, xmm1
398
399	// 3 rounds...
400	movdqu xmm1, [edx + 112]
401	aesenc xmm0, xmm1
402
403	// 2 rounds...
404	movdqu xmm1, [edx + 128]
405	aesenc xmm0, xmm1
406
407	// Final round...
408	movdqu xmm1, [edx + 144]
409	aesenclast xmm0, xmm1
410
411	// Unpermute the ciphertext block and store it.
8d6ca554	412	pshufb xmm0, xmm5
1a0c09c4 MW	413	mov eax, [esp + 12]
	414	movdqu [eax], xmm0
	415
	416	// And we're done.
	417	ret
	418
	419	ENDFUNC
	420
	421	FUNC(rijndael_dblk_x86_aesni)
	422
	423	// On entry, we have:
	424	// [esp + 4] points to the context block
	425	// [esp + 8] points to the input data block
	426	// [esp + 12] points to the output buffer
	427
	428	// Find the magic endianness-swapping table.
	429	ldgot ecx
8d6ca554	430	movdqa xmm5, [INTADDR(endswap_tab, ecx)]
1a0c09c4 MW	431
	432	// Load the input block and end-swap it. Also, start loading the
	433	// keys.
	434	mov eax, [esp + 8]
	435	movdqu xmm0, [eax]
8d6ca554	436	pshufb xmm0, xmm5
1a0c09c4 MW	437	mov eax, [esp + 4]
	438	lea edx, [eax + wi]
	439	mov eax, [eax + nr]
	440
	441	// Initial whitening.
	442	movdqu xmm1, [edx]
	443	add edx, 16
	444	pxor xmm0, xmm1
	445
	446	// Dispatch to the correct code.
	447	cmp eax, 10
	448	je dr10
	449	jb bogus
	450	cmp eax, 14
	451	je dr14
	452	ja bogus
	453	cmp eax, 12
	454	je dr12
	455	jb dr11
	456	jmp dr13
	457
	458	.align 2
	459
	460	// 14 rounds...
	461	dr14: movdqu xmm1, [edx]
	462	add edx, 16
	463	aesdec xmm0, xmm1
	464
	465	// 13 rounds...
	466	dr13: movdqu xmm1, [edx]
	467	add edx, 16
	468	aesdec xmm0, xmm1
	469
	470	// 12 rounds...
	471	dr12: movdqu xmm1, [edx]
	472	add edx, 16
	473	aesdec xmm0, xmm1
	474
	475	// 11 rounds...
	476	dr11: movdqu xmm1, [edx]
	477	add edx, 16
	478	aesdec xmm0, xmm1
	479
	480	// 10 rounds...
	481	dr10: movdqu xmm1, [edx]
	482	aesdec xmm0, xmm1
	483
	484	// 9 rounds...
	485	movdqu xmm1, [edx + 16]
	486	aesdec xmm0, xmm1
	487
	488	// 8 rounds...
	489	movdqu xmm1, [edx + 32]
	490	aesdec xmm0, xmm1
	491
	492	// 7 rounds...
	493	movdqu xmm1, [edx + 48]
	494	aesdec xmm0, xmm1
	495
	496	// 6 rounds...
	497	movdqu xmm1, [edx + 64]
	498	aesdec xmm0, xmm1
	499
	500	// 5 rounds...
501	movdqu xmm1, [edx + 80]
502	aesdec xmm0, xmm1
503
504	// 4 rounds...
505	movdqu xmm1, [edx + 96]
506	aesdec xmm0, xmm1
507
508	// 3 rounds...
509	movdqu xmm1, [edx + 112]
510	aesdec xmm0, xmm1
511
512	// 2 rounds...
513	movdqu xmm1, [edx + 128]
514	aesdec xmm0, xmm1
515
516	// Final round...
517	movdqu xmm1, [edx + 144]
518	aesdeclast xmm0, xmm1
519
520	// Unpermute the ciphertext block and store it.
8d6ca554	521	pshufb xmm0, xmm5
1a0c09c4 MW	522	mov eax, [esp + 12]
	523	movdqu [eax], xmm0
	524
	525	// And we're done.
	526	ret
	527
	528	ENDFUNC
	529
	530	///--------------------------------------------------------------------------
	531	/// Random utilities.
	532
	533	.align 16
	534	// Abort the process because of a programming error. Indirecting
	535	// through this point serves several purposes: (a) by CALLing, rather
	536	// than branching to, `abort', we can save the return address, which
	537	// might at least provide a hint as to what went wrong; (b) we don't
	538	// have conditional CALLs (and they'd be big anyway); and (c) we can
	539	// write a HLT here as a backstop against `abort' being mad.
	540	bogus: callext F(abort)
	541	0: hlt
	542	jmp 0b
	543
	544	gotaux ecx
	545
	546	///--------------------------------------------------------------------------
	547	/// Data tables.
	548
	549	.align 16
	550	endswap_tab:
	551	.byte 3, 2, 1, 0
	552	.byte 7, 6, 5, 4
	553	.byte 11, 10, 9, 8
	554	.byte 15, 14, 13, 12
	555
	556	///----- That's all, folks --------------------------------------------------