/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// AESNI-based implementation of Rijndael
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

	.arch	.aes

	.extern	F(abort)
	.extern	F(rijndael_rcon)

	.text

///--------------------------------------------------------------------------
/// Main code.

/// The AESNI instructions implement a little-endian version of AES, but
/// Catacomb's internal interface presents as big-endian so as to work better
/// with things like GCM.  We therefore maintain the round keys in
/// little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the AESNI instructions don't implement the
/// larger-block versions of Rijndael, so we have to end-swap the keys if
/// we're preparing for one of those.

// Useful constants.
	.equ	maxrounds, 16		// maximum number of rounds
	.equ	maxblksz, 32		// maximum block size, in bytes
	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer

// Context structure.  (Offsets into the C-side context object; keep these in
// step with the structure definition in the C source.)
	.equ	nr, 0			// number of rounds
	.equ	w, nr + 4		// encryption key words
	.equ	wi, w + kbufsz		// decryption key words

///--------------------------------------------------------------------------
/// Key setup.

FUNC(rijndael_setup_x86ish_aesni_avx)
	vzeroupper			// avoid penalty on `legacy' XMM access
  endprologue
	// and drop through...
ENDFUNC

FUNC(rijndael_setup_x86ish_aesni)

#if CPUFAM_X86
	// Arguments are on the stack.  We'll need to stack the caller's
	// register variables, but we'll manage.

# define CTX BP			// context pointer
# define BLKSZ [SP + 24]	// block size

# define KSZ ebx		// key size
# define NKW edx		// total number of key words
# define NKW_NEEDS_REFRESH 1	// ... needs recalculating
# define RCON ecx		// round constants table
# define LIM edx		// limit pointer
# define CYIX edi		// index in shift-register cycle

# define NR ecx			// number of rounds
# define LRK eax		// distance to last key
# define BLKOFF edx		// block size in bytes

	// Stack the caller's registers.
	pushreg	BP
	pushreg	ebx
	pushreg	esi
	pushreg	edi

	// Set up our own variables.
	mov	CTX, [SP + 20]		// context base pointer
	mov	SI, [SP + 28]		// key material
	mov	KSZ, [SP + 32]		// key size, in words
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// Arguments are in registers.  We have plenty, but, to be honest,
	// the initial register allocation is a bit annoying.

# define CTX r8			// context pointer
# define BLKSZ r9d		// block size

# define KSZ edx		// key size
# define NKW r10d		// total number of key words
# define RCON rdi		// round constants table
# define LIM rcx		// limit pointer
# define CYIX r11d		// index in shift-register cycle

# define NR ecx			// number of rounds
# define LRK eax		// distance to last key
# define BLKOFF r9d		// block size in bytes

	// Move arguments to more useful places.
	mov	CTX, rdi		// context base pointer
	mov	BLKSZ, esi		// block size in words
	mov	SI, rdx			// key material
	mov	KSZ, ecx		// key size, in words
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments are in different registers, and they're a little tight.

# define CTX r8			// context pointer
# define BLKSZ edx		// block size

# define KSZ r9d		// key size
# define NKW r10d		// total number of key words
# define RCON rdi		// round constants table
# define LIM rcx		// limit pointer
# define CYIX r11d		// index in shift-register cycle

# define NR ecx			// number of rounds
# define LRK eax		// distance to last key
# define BLKOFF edx		// block size in bytes

	// We'll need the index registers, which belong to the caller in this
	// ABI.
	pushreg	rsi
	pushreg	rdi

	// Move arguments to more useful places.  (Read r8 before overwriting
	// CTX, which aliases it.)
	mov	rsi, r8			// key material
	mov	CTX, rcx		// context base pointer
#endif

	endprologue

	// The initial round key material is taken directly from the input
	// key, so copy it over.
#if CPUFAM_AMD64 && ABI_SYSV
	// We've been lucky.  We already have a copy of the context pointer
	// in rdi, and the key size in ecx.
	add	rdi, w
#else
	lea	DI, [CTX + w]
	mov	ecx, KSZ
#endif
	rep	movsd

	// Find out other useful things.
	mov	NKW, [CTX + nr]		// number of rounds
	add	NKW, 1
	imul	NKW, BLKSZ		// total key size in words
#if !NKW_NEEDS_REFRESH
	// If we can't keep NKW for later, then we use the same register for
	// it and LIM, so this move is unnecessary.
	mov	DWORD(LIM), NKW
#endif
	sub	DWORD(LIM), KSZ		// offset by the key size

	// Find the round constants.
	ldgot	WHOLE(c)
	leaext	RCON, F(rijndael_rcon), WHOLE(c)

	// Prepare for the main loop.
	lea	SI, [CTX + w]
	mov	eax, [SI + 4*WHOLE(KSZ) - 4]	// most recent key word
	lea	LIM, [SI + 4*LIM]	// limit, offset by one key expansion
	xor	CYIX, CYIX		// start of new cycle

	// Main key expansion loop.  The first word of each key-length chunk
	// needs special treatment.
	//
	// This is rather tedious because the Intel `AESKEYGENASSIST'
	// instruction is very strangely shaped.  Firstly, it wants to
	// operate on vast SSE registers, even though we're data-blocked from
	// doing more than one operation at a time unless we're doing two key
	// schedules simultaneously -- and even then we can't do more than
	// two, because the instruction ignores two of its input words
	// entirely, and produces two different outputs for each of the other
	// two.  And secondly it insists on taking the magic round constant
	// as an immediate, so it's kind of annoying if you're not
	// open-coding the whole thing.  It's much easier to leave that as
	// zero and XOR in the round constant by hand.
0:	cmp	CYIX, 0			// first word of the cycle?
	je	1f
	cmp	CYIX, 4			// fourth word of the cycle?
	jne	2f
	cmp	KSZ, 7			// and a large key?
	jb	2f

	// Fourth word of the cycle, and seven or eight words of key.  Do a
	// byte substitution.
	movd	xmm0, eax
	pshufd	xmm0, xmm0, SHUF(2, 1, 0, 3)
	aeskeygenassist xmm1, xmm0, 0
	movd	eax, xmm1
	jmp	2f

	// First word of the cycle.  This is the complicated piece.
1:	movd	xmm0, eax
	pshufd	xmm0, xmm0, SHUF(0, 3, 2, 1)
	aeskeygenassist xmm1, xmm0, 0
	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)
	movd	eax, xmm1
	xor	al, [RCON]		// mix in the next round constant...
	inc	RCON			// ... and advance along the table

	// Common tail.  Mix in the corresponding word from the previous
	// cycle and prepare for the next loop.
2:	xor	eax, [SI]
	mov	[SI + 4*WHOLE(KSZ)], eax
	add	SI, 4
	inc	CYIX
	cmp	SI, LIM
	jae	9f
	cmp	CYIX, KSZ
	jb	0b
	xor	CYIX, CYIX
	jmp	0b

	// Next job is to construct the decryption keys.  The keys for the
	// first and last rounds don't need to be mangled, but the remaining
	// ones do -- and they all need to be reordered too.
	//
	// The plan of action, then, is to copy the final encryption round's
	// keys into place first, then to do each of the intermediate rounds
	// in reverse order, and finally do the first round.
	//
	// Do all of the heavy lifting with SSE registers.  The order we're
	// doing this in means that it's OK if we read or write too much, and
	// there's easily enough buffer space for the over-enthusiastic reads
	// and writes because the context has space for 32-byte blocks, which
	// is our maximum and an exact fit for two SSE registers.
9:	mov	NR, [CTX + nr]		// number of rounds
#if NKW_NEEDS_REFRESH
	mov	BLKOFF, BLKSZ
	mov	LRK, NR
	imul	LRK, BLKOFF
#else
	// If we retain NKW, then BLKSZ and BLKOFF are the same register
	// because we won't need the former again.
	mov	LRK, NKW
	sub	LRK, BLKSZ
#endif
	lea	DI, [CTX + wi]
	lea	SI, [CTX + w + 4*WHOLE(LRK)]	// last round's keys
	shl	BLKOFF, 2		// block size (in bytes now)

	// Copy the last encryption round's keys.
	movdqu	xmm0, [SI]
	movdqu	[DI], xmm0
	cmp	BLKOFF, 16
	jbe	0f
	movdqu	xmm0, [SI + 16]
	movdqu	[DI + 16], xmm0

	// Update the loop variables and stop if we've finished.
0:	add	DI, WHOLE(BLKOFF)
	sub	SI, WHOLE(BLKOFF)
	sub	NR, 1
	jbe	9f

	// Do another middle round's keys...
	movdqu	xmm0, [SI]
	aesimc	xmm0, xmm0
	movdqu	[DI], xmm0
	cmp	BLKOFF, 16
	jbe	0b
	movdqu	xmm0, [SI + 16]
	aesimc	xmm0, xmm0
	movdqu	[DI + 16], xmm0
	jmp	0b

	// Finally do the first encryption round.
9:	movdqu	xmm0, [SI]
	movdqu	[DI], xmm0
	cmp	BLKOFF, 16
	jbe	1f
	movdqu	xmm0, [SI + 16]
	movdqu	[DI + 16], xmm0

	// If the block size is not exactly four words then we must end-swap
	// everything.  We can use fancy SSE toys for this.
1:	cmp	BLKOFF, 16
	je	9f

	// Find the byte-reordering table.
	ldgot	ecx
	movdqa	xmm5, [INTADDR(endswap_tab, ecx)]

#if NKW_NEEDS_REFRESH
	// Calculate the number of subkey words again.  (It's a good job
	// we've got a fast multiplier.)
	mov	NKW, [CTX + nr]
	add	NKW, 1
	imul	NKW, BLKSZ
#endif

	// End-swap the encryption keys.
	lea	SI, [CTX + w]
	call	endswap_block

	// And the decryption keys.
	lea	SI, [CTX + wi]
	call	endswap_block

9:	// All done.
#if CPUFAM_X86
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	BP
#endif
#if CPUFAM_AMD64 && ABI_WIN
	popreg	rdi
	popreg	rsi
#endif
	ret
ENDFUNC

INTFUNC(endswap_block)
	// End-swap NKW words starting at SI.  The end-swapping table is
	// already loaded into XMM5; and it's OK to work in 16-byte chunks.
	// Clobbers ecx and xmm1; advances SI past the swapped words.
  endprologue

	mov	ecx, NKW
0:	movdqu	xmm1, [SI]
	pshufb	xmm1, xmm5
	movdqu	[SI], xmm1
	add	SI, 16
	sub	ecx, 4
	ja	0b

	ret
ENDFUNC

#undef CTX
#undef BLKSZ
#undef SI
#undef DI
#undef KSZ
#undef RCON
#undef LIM
#undef NR
#undef LRK
#undef BLKOFF

///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.

.macro	encdec	op, aes, koff
FUNC(rijndael_\op\()_x86ish_aesni_avx)
	vzeroupper			// avoid XMM penalties
  endprologue
	// and drop through...
ENDFUNC

FUNC(rijndael_\op\()_x86ish_aesni)

#if CPUFAM_X86
	// Arguments come in on the stack, and need to be collected.  We
	// don't have a shortage of registers.

# define K eax
# define SRC edx
# define DST edx
# define NR ecx

	mov	K, [SP + 4]
	mov	SRC, [SP + 8]
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// Arguments come in registers.  All is good.

# define K rdi
# define SRC rsi
# define DST rdx
# define NR eax
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in different registers.

# define K rcx
# define SRC rdx
# define DST r8
# define NR eax
#endif

	endprologue

	// Find the magic endianness-swapping table.
	ldgot	ecx
	movdqa	xmm5, [INTADDR(endswap_tab, ecx)]

	// Initial setup.
	movdqu	xmm0, [SRC]
	pshufb	xmm0, xmm5
	mov	NR, [K + nr]
	add	K, \koff

	// Initial whitening.
	movdqu	xmm1, [K]
	add	K, 16
	pxor	xmm0, xmm1
#if CPUFAM_X86
	mov	DST, [SP + 12]
#endif

	// Dispatch to the correct code.  We expect 10 <= NR <= 14; anything
	// outside that range is a programming error and is reported via
	// `bogus'.
	cmp	NR, 10
	je	10f
	jb	bogus
	cmp	NR, 14
	je	14f
	ja	bogus
	cmp	NR, 12
	je	12f
	jb	11f
	jmp	13f

	.align	2

	// 14 rounds...
14:	movdqu	xmm1, [K]
	add	K, 16
	\aes	xmm0, xmm1

	// 13 rounds...
13:	movdqu	xmm1, [K]
	add	K, 16
	\aes	xmm0, xmm1

	// 12 rounds...
12:	movdqu	xmm1, [K]
	add	K, 16
	\aes	xmm0, xmm1

	// 11 rounds...
11:	movdqu	xmm1, [K]
	add	K, 16
	\aes	xmm0, xmm1

	// 10 rounds...
10:	movdqu	xmm1, [K]
	\aes	xmm0, xmm1

	// 9 rounds...
	movdqu	xmm1, [K + 16]
	\aes	xmm0, xmm1

	// 8 rounds...
	movdqu	xmm1, [K + 32]
	\aes	xmm0, xmm1

	// 7 rounds...
	movdqu	xmm1, [K + 48]
	\aes	xmm0, xmm1

	// 6 rounds...
	movdqu	xmm1, [K + 64]
	\aes	xmm0, xmm1

	// 5 rounds...
	movdqu	xmm1, [K + 80]
	\aes	xmm0, xmm1

	// 4 rounds...
	movdqu	xmm1, [K + 96]
	\aes	xmm0, xmm1

	// 3 rounds...
	movdqu	xmm1, [K + 112]
	\aes	xmm0, xmm1

	// 2 rounds...
	movdqu	xmm1, [K + 128]
	\aes	xmm0, xmm1

	// Final round...
	movdqu	xmm1, [K + 144]
	\aes\()last xmm0, xmm1

	// Unpermute the ciphertext block and store it.
	pshufb	xmm0, xmm5
	movdqu	[DST], xmm0

	// And we're done.
	ret

#undef K
#undef SRC
#undef DST
#undef NR

ENDFUNC
.endm

	encdec	eblk, aesenc, w
	encdec	dblk, aesdec, wi

///--------------------------------------------------------------------------
/// Random utilities.

INTFUNC(bogus)
	// Abort the process because of a programming error.  Indirecting
	// through this point serves several purposes: (a) by CALLing, rather
	// than branching to, `abort', we can save the return address, which
	// might at least provide a hint as to what went wrong; (b) we don't
	// have conditional CALLs (and they'd be big anyway); and (c) we can
	// write a HLT here as a backstop against `abort' being mad.
  endprologue

	callext	F(abort)
0:	hlt
	jmp	0b
ENDFUNC

///--------------------------------------------------------------------------
/// Data tables.

	RODATA

	// Byte-shuffle mask for PSHUFB: reverses the bytes within each
	// 32-bit word, converting between big- and little-endian word
	// representations.
	.align	16
endswap_tab:
	.byte	 3,  2,  1,  0
	.byte	 7,  6,  5,  4
	.byte	11, 10,  9,  8
	.byte	15, 14, 13, 12

///----- That's all, folks --------------------------------------------------