/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// AESNI-based implementation of Rijndael
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// External definitions.

	.globl	F(abort)
	.globl	F(rijndael_rcon)

///--------------------------------------------------------------------------
/// Main code.

	.arch	.aes
	.section .text

/// The AESNI instructions implement a little-endian version of AES, but
/// Catacomb's internal interface presents as big-endian so as to work better
/// with things like GCM.  We therefore maintain the round keys in
/// little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the AESNI instructions don't implement the
/// larger-block versions of Rijndael, so we have to end-swap the keys if
/// we're preparing for one of those.

	// Useful constants.  A key-schedule buffer holds one maximum-size
	// round key for each round, plus one more for the initial whitening:
	// 32*(16 + 1) = 544 bytes.
	.equ	maxrounds, 16		// maximum number of rounds
	.equ	maxblksz, 32		// maximum block size, in bytes
	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer

	// Context structure.  These are byte offsets from the context
	// pointer.  Note that `w' starts at offset 4, so round keys are not
	// in general 16-byte aligned; the code in this file always fetches
	// and stores them with unaligned moves.
	// NOTE(review): these offsets presumably mirror the C-side context
	// structure -- confirm against its declaration.
	.equ	nr, 0			// number of rounds
	.equ	w, nr + 4		// encryption key words
	.equ	wi, w + kbufsz		// decryption key words

///--------------------------------------------------------------------------
/// Key setup.

FUNC(rijndael_setup_x86_aesni)

	// Fill in the encryption (`w') and decryption (`wi') round-key
	// vectors of a context from the caller's key material.
	//
	// Initial state.  We have four arguments:
	// [esp + 20] is the context pointer
	// [esp + 24] is the block size, in 32-bit words (4, 6, or 8)
	// [esp + 28] points to the key material, unaligned
	// [esp + 32] is the size of the key, in words
	// (These offsets are as seen after the four register pushes just
	// below; each argument is 16 bytes closer to ESP on entry.)
	// The key size has already been checked for validity, and the number
	// of rounds has been computed.  Our job is only to fill in the `w'
	// and `wi' vectors.

	push	ebp
	push	ebx
	push	esi
	push	edi

	// The initial round key material is taken directly from the input
	// key, so copy it over.
	mov	ebp, [esp + 20]		// context base pointer
	mov	ebx, [esp + 32]		// key size, in words
	mov	ecx, ebx		// word count for the copy
	mov	esi, [esp + 28]		// source: the key material
	lea	edi, [ebp + w]		// destination: start of `w'
	rep	movsd

	// Find out other useful things.
	mov	edx, [ebp + nr]		// number of rounds
	add	edx, 1
	imul	edx, [esp + 24]		// total key size in words
	sub	edx, ebx		// offset by the key size

	// Find the round constants.
	ldgot	ecx
	leaext	ecx, rijndael_rcon, ecx

	// Prepare for the main loop.  Register roles from here on:
	//	eax	the most recently generated key word
	//	ebx	key size, in words
	//	ecx	address of the next round-constant byte
	//	edx	value of ESI at which the expansion is complete
	//	esi	address of the old word to be mixed in; the new word
	//		is stored one key length further on, at [esi + 4*ebx]
	//	ebp	context base pointer
	lea	esi, [ebp + w]
	mov	eax, [esi + 4*ebx - 4]	// most recent key word
	lea	edx, [esi + 4*edx]	// limit, offset by one key expansion

	// Main key expansion loop.  The first word of each key-length chunk
	// needs special treatment.
	//
	// This is rather tedious because the Intel `AESKEYGENASSIST'
	// instruction is very strangely shaped.  Firstly, it wants to
	// operate on vast SSE registers, even though we're data-blocked from
	// doing more than one operation at a time unless we're doing two key
	// schedules simultaneously -- and even then we can't do more than
	// two, because the instruction ignores two of its input words
	// entirely, and produces two different outputs for each of the other
	// two.  And secondly it insists on taking the magic round constant
	// as an immediate, so it's kind of annoying if you're not
	// open-coding the whole thing.  It's much easier to leave that as
	// zero and XOR in the round constant by hand.
	//
	// The instruction only looks at lanes 1 and 3 of its source.  The
	// first shuffle (0x39) moves the word from lane 0 up to lane 3; the
	// substituted-and-rotated word comes out in lane 3 of the result,
	// and the second shuffle (0x93) brings it back down to lane 0.
9:	movd	xmm0, eax
	pshufd	xmm0, xmm0, 0x39
	aeskeygenassist xmm1, xmm0, 0
	pshufd	xmm1, xmm1, 0x93
	movd	eax, xmm1
	xor	eax, [esi]		// mix in the word one key length back
	xor	al, [ecx]		// round constant is a single byte
	inc	ecx			// step on to the next round constant
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f			// reached the limit: expansion done

	// The next three words are simple...
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// (Word 2...)
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// (Word 3...)
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// Word 4.  If the key is /more/ than 6 words long, then we must
	// apply a substitution here.
	//
	// This time the shuffle (0x93) moves the word from lane 0 into lane
	// 1, and the substituted but unrotated word comes back in lane 0 of
	// the result, so no second shuffle is needed.
	cmp	ebx, 5
	jb	9b			// four-word key: start the next chunk
	cmp	ebx, 7
	jb	0f			// five or six words: no substitution
	movd	xmm0, eax
	pshufd	xmm0, xmm0, 0x93
	aeskeygenassist xmm1, xmm0, 0
	movd	eax, xmm1
0:	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// (Word 5...)
	cmp	ebx, 6
	jb	9b			// five-word key: start the next chunk
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// (Word 6...)
	cmp	ebx, 7
	jb	9b			// six-word key: start the next chunk
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// (Word 7...)
	cmp	ebx, 8
	jb	9b			// seven-word key: start the next chunk
	xor	eax, [esi]
	mov	[esi + 4*ebx], eax
	add	esi, 4
	cmp	esi, edx
	jae	8f

	// Must be done by now.
	jmp	9b

	// Next job is to construct the decryption keys.  The keys for the
	// first and last rounds don't need to be mangled, but the remaining
	// ones do -- and they all need to be reordered too.
	//
	// The plan of action, then, is to copy the final encryption round's
	// keys into place first, then to do each of the intermediate rounds
	// in reverse order, and finally do the first round.
	//
	// Do all of the heavy lifting with SSE registers.  The order we're
	// doing this in means that it's OK if we read or write too much, and
	// there's easily enough buffer space for the over-enthusiastic reads
	// and writes because the context has space for 32-byte blocks, which
	// is our maximum and an exact fit for two SSE registers.
	//
	// Register roles for this part:
	//	ebx	block size (words at first, then bytes)
	//	ecx	number of round keys still to be handled
	//	esi	source, walking backwards through `w'
	//	edi	destination, walking forwards through `wi'
8:	mov	ecx, [ebp + nr]		// number of rounds
	mov	ebx, [esp + 24]		// block size (in words)
	mov	edx, ecx
	imul	edx, ebx		// word offset of the last round's keys
	lea	edi, [ebp + wi]
	lea	esi, [ebp + 4*edx + w]	// last round's keys
	shl	ebx, 2			// block size (in bytes now)

	// Copy the last encryption round's keys.
	movdqu	xmm0, [esi]
	movdqu	[edi], xmm0
	cmp	ebx, 16
	jbe	9f			// 16-byte block: one register suffices
	movdqu	xmm0, [esi + 16]
	movdqu	[edi + 16], xmm0

	// Update the loop variables and stop if we've finished.
9:	add	edi, ebx
	sub	esi, ebx
	sub	ecx, 1
	jbe	0f			// only the first round's keys remain

	// Do another middle round's keys...
	movdqu	xmm0, [esi]
	aesimc	xmm0, xmm0
	movdqu	[edi], xmm0
	cmp	ebx, 16
	jbe	9b
	movdqu	xmm0, [esi + 16]
	aesimc	xmm0, xmm0
	movdqu	[edi + 16], xmm0
	jmp	9b

	// Finally do the first encryption round.
0:	movdqu	xmm0, [esi]
	movdqu	[edi], xmm0
	cmp	ebx, 16
	jbe	0f
	movdqu	xmm0, [esi + 16]
	movdqu	[edi + 16], xmm0

	// If the block size is not exactly four words then we must end-swap
	// everything.  We can use fancy SSE toys for this.
0:	cmp	ebx, 16
	je	0f

	// Find the byte-reordering table.
	ldgot	ecx
	movdqa	xmm7, [INTADDR(endswap_tab, ecx)]

	// Calculate the number of subkey words again.  (It's a good job
	// we've got a fast multiplier.)
	mov	ecx, [ebp + nr]
	add	ecx, 1
	imul	ecx, [esp + 24]		// total keys in words

	// End-swap the encryption keys.  The helper counts ECX down, so
	// keep a copy of the word count in EAX for the second call.
	mov	eax, ecx
	lea	esi, [ebp + w]
	call	endswap_block

	// And the decryption keys.
	mov	ecx, eax
	lea	esi, [ebp + wi]
	call	endswap_block

	// All done.
0:	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

	.align	16
endswap_block:
	// End-swap ECX words starting at ESI.  The end-swapping table is
	// already loaded into XMM7; and it's OK to work in 16-byte chunks.
	//
	// Each pass handles four words, and the loop continues for as long
	// as the count is still positive after subtracting four, so a count
	// which isn't a multiple of four is effectively rounded up.
	// Modifies ESI, ECX and XMM1.
	movdqu	xmm1, [esi]
	pshufb	xmm1, xmm7
	movdqu	[esi], xmm1
	add	esi, 16
	sub	ecx, 4
	ja	endswap_block
	ret

ENDFUNC

///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.

FUNC(rijndael_eblk_x86_aesni)

	// Encrypt a single 16-byte block using the AESNI round instructions.
	//
	// Stack arguments on entry:
	//	[esp +  4]	the context block
	//	[esp +  8]	the input data block
	//	[esp + 12]	the output buffer
	//
	// Register use:
	//	xmm0	the block being worked on
	//	xmm1	the current round key
	//	xmm7	the byte-shuffle mask from `endswap_tab'
	//	eax	scratch pointer; the round count during dispatch
	//	edx	address of the round key most recently fetched
	//
	// Round counts from 10 to 14 inclusive are handled; anything else
	// is sent off to `bogus'.

	// Get hold of the end-swapping mask.
	ldgot	ecx
	movdqa	xmm7, [INTADDR(endswap_tab, ecx)]

	// Fetch the input block and end-swap each of its words.
	mov	eax, [esp + 8]
	movdqu	xmm0, [eax]
	pshufb	xmm0, xmm7

	// Collect the round count, and aim EDX at the whitening key at the
	// start of the encryption key schedule.
	mov	edx, [esp + 4]
	mov	eax, [edx + nr]
	lea	edx, [edx + w]

	// Initial whitening.  EDX is left where it is: every round below
	// steps it on to its own key before fetching.
	movdqu	xmm1, [edx]
	pxor	xmm0, xmm1

	// Reject round counts outside the supported range, and then pick the
	// entry point which leaves exactly the right number of rounds to do.
	cmp	eax, 14
	ja	bogus
	je	er14
	cmp	eax, 10
	jb	bogus
	je	er10
	cmp	eax, 12
	je	er12
	ja	er13
	jmp	er11

	// Extra leading rounds for the longer schedules.  Each entry point
	// falls through into the next.
er14:	add	edx, 16
	movdqu	xmm1, [edx]
	aesenc	xmm0, xmm1

er13:	add	edx, 16
	movdqu	xmm1, [edx]
	aesenc	xmm0, xmm1

er12:	add	edx, 16
	movdqu	xmm1, [edx]
	aesenc	xmm0, xmm1

er11:	add	edx, 16
	movdqu	xmm1, [edx]
	aesenc	xmm0, xmm1

	// The common tail: nine ordinary rounds...
er10:
	.rept	9
	add	edx, 16
	movdqu	xmm1, [edx]
	aesenc	xmm0, xmm1
	.endr

	// ... and then the final round, which has its own instruction.
	add	edx, 16
	movdqu	xmm1, [edx]
	aesenclast xmm0, xmm1

	// End-swap the ciphertext back again and write it out.
	pshufb	xmm0, xmm7
	mov	eax, [esp + 12]
	movdqu	[eax], xmm0

	ret

ENDFUNC

FUNC(rijndael_dblk_x86_aesni)

	// Decrypt a single 16-byte block using the AESNI round instructions.
	//
	// Stack arguments on entry:
	//	[esp +  4]	the context block
	//	[esp +  8]	the input data block
	//	[esp + 12]	the output buffer
	//
	// Register use:
	//	xmm0	the block being worked on
	//	xmm1	the current round key
	//	xmm7	the byte-shuffle mask from `endswap_tab'
	//	eax	scratch pointer; the round count during dispatch
	//	edx	address of the round key most recently fetched
	//
	// Round counts from 10 to 14 inclusive are handled; anything else
	// is sent off to `bogus'.

	// Get hold of the end-swapping mask.
	ldgot	ecx
	movdqa	xmm7, [INTADDR(endswap_tab, ecx)]

	// Fetch the input block and end-swap each of its words.
	mov	eax, [esp + 8]
	movdqu	xmm0, [eax]
	pshufb	xmm0, xmm7

	// Collect the round count, and aim EDX at the whitening key at the
	// start of the decryption key schedule.
	mov	edx, [esp + 4]
	mov	eax, [edx + nr]
	lea	edx, [edx + wi]

	// Initial whitening.  EDX is left where it is: every round below
	// steps it on to its own key before fetching.
	movdqu	xmm1, [edx]
	pxor	xmm0, xmm1

	// Reject round counts outside the supported range, and then pick the
	// entry point which leaves exactly the right number of rounds to do.
	cmp	eax, 14
	ja	bogus
	je	dr14
	cmp	eax, 10
	jb	bogus
	je	dr10
	cmp	eax, 12
	je	dr12
	ja	dr13
	jmp	dr11

	// Extra leading rounds for the longer schedules.  Each entry point
	// falls through into the next.
dr14:	add	edx, 16
	movdqu	xmm1, [edx]
	aesdec	xmm0, xmm1

dr13:	add	edx, 16
	movdqu	xmm1, [edx]
	aesdec	xmm0, xmm1

dr12:	add	edx, 16
	movdqu	xmm1, [edx]
	aesdec	xmm0, xmm1

dr11:	add	edx, 16
	movdqu	xmm1, [edx]
	aesdec	xmm0, xmm1

	// The common tail: nine ordinary rounds...
dr10:
	.rept	9
	add	edx, 16
	movdqu	xmm1, [edx]
	aesdec	xmm0, xmm1
	.endr

	// ... and then the final round, which has its own instruction.
	add	edx, 16
	movdqu	xmm1, [edx]
	aesdeclast xmm0, xmm1

	// End-swap the plaintext back again and write it out.
	pshufb	xmm0, xmm7
	mov	eax, [esp + 12]
	movdqu	[eax], xmm0

	ret

ENDFUNC

///--------------------------------------------------------------------------
/// Random utilities.

	.align	16
	// Abort the process because of a programming error.  Indirecting
	// through this point serves several purposes: (a) by CALLing, rather
	// than branching to, `abort', we can save the return address, which
	// might at least provide a hint as to what went wrong; (b) we don't
	// have conditional CALLs (and they'd be big anyway); and (c) we can
	// write a HLT here as a backstop against `abort' being mad.
	//
	// The block encryption and decryption functions branch here when the
	// context's round count is outside the range 10 to 14.
bogus:	callext	F(abort)
0:	hlt				// only reached if `abort' returns
	jmp	0b			// ... and keep halting if resumed

	// NOTE(review): presumably this emits the support code needed by
	// the `ldgot ecx' invocations in this file; the macro is defined
	// elsewhere -- confirm.
	gotaux	ecx

///--------------------------------------------------------------------------
/// Data tables.

	// Shuffle mask for `pshufb': reverses the order of the four bytes
	// within each 32-bit word of a 16-byte vector, leaving the words
	// themselves in place.  The table is always fetched using `movdqa',
	// so it must remain 16-byte aligned.
	.align	16
endswap_tab:
	.byte	 3,  2,  1,  0
	.byte	 7,  6,  5,  4
	.byte	11, 10,  9,  8
	.byte	15, 14, 13, 12

///----- That's all, folks --------------------------------------------------