/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// AArch64 crypto-extension-based implementation of Rijndael
///
/// (c) 2018 Straylight/Edgeware
///
///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

	.arch	armv8-a+crypto

	.extern	F(abort)
	.extern	F(rijndael_rcon)

	.text

///--------------------------------------------------------------------------
/// Main code.

/// The ARM crypto extension implements a little-endian version of AES
/// (though the manual doesn't actually spell this out and you have to
/// experiment), but Catacomb's internal interface presents as big-endian so
/// as to work better with things like GCM.  We therefore maintain the round
/// keys in little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the crypto extension doesn't implement the larger-
/// block versions of Rijndael, so we have to end-swap the keys if we're
/// preparing for one of those.

	// Useful constants.
	.equ	maxrounds, 16		// maximum number of rounds
	.equ	maxblksz, 32		// maximum block size, in bytes
	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer

	// Context structure.
	.equ	nr, 0			// number of rounds
	.equ	w, nr + 4		// encryption key words
	.equ	wi, w + kbufsz		// decryption key words

///--------------------------------------------------------------------------
/// Key setup.

FUNC(rijndael_setup_arm64_crypto)

	// Arguments:
	//	x0 = pointer to context
	//	w1 = block size in 32-bit words
	//	x2 = pointer to key material
	//	x3 = key size in words

	pushreg	x29, x30
	mov	x29, sp

	// The initial round key material is taken directly from the input
	// key, so copy it over.  Unfortunately, the key material is not
	// guaranteed to be aligned in any especially useful way.  Assume
	// that alignment traps are not enabled.  (Why would they be?  On
	// A32, alignment traps were part of a transition plan which changed
	// the way unaligned loads and stores behaved, but there's never been
	// any other behaviour on A64.)
	mov	x15, x3
	add	x4, x0, #w
0:	sub	x15, x15, #1
	ldr	w14, [x2], #4
	str	w14, [x4], #4
	cbnz	x15, 0b

	// Find out other useful things and prepare for the main loop.
9:	ldr	w9, [x0, #nr]		// number of rounds
	madd	w2, w1, w9, w1		// total key size in words
	leaext	x5, rijndael_rcon	// round constants
	sub	x6, x2, x3		// minus what we've copied already
	add	x7, x0, #w		// position in previous cycle
	movi	v1.4s, #0		// all-zero register for the key
	mov	x8, #0			// position in current cycle

	// Main key expansion loop.  Dispatch according to the position in
	// the cycle.
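	// For reference, the dispatch below implements the standard
	// FIPS-197 word recurrence.  Roughly, in C (an illustrative sketch
	// only -- `subw', `rotw', `rcon', `nk' and `nw' are hypothetical
	// names, not Catacomb's; `nk' is the key length in words, `nw' the
	// total number of key words, `subw' applies the S-box to each byte,
	// and `rotw' rotates a word by one byte position):
	//
	//	for (i = nk; i < nw; i++) {
	//	  uint32_t t = w[i - 1];
	//	  if (i%nk == 0) t = subw(rotw(t)) ^ rcon[i/nk - 1];
	//	  else if (nk > 6 && i%nk == 4) t = subw(t);
	//	  w[i] = w[i - nk] ^ t;
	//	}
	//
	// Because we keep the key words in little-endian form, `rotw' shows
	// up below as the `ror #8', and the round constant is a single byte
	// mixed into the low-order position.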
0:	ldr	w15, [x7], #4		// word from previous cycle
	cbz	x8, 1f			// first word of the cycle?
	cmp	x8, #4			// fourth word of the cycle?
	b.ne	2f
	cmp	x3, #7			// seven or eight words of key?
	b.cc	2f

	// Fourth word of the cycle, seven or eight words of key.  We must do
	// the byte substitution.
	dup	v0.4s, w14
	aese	v0.16b, v1.16b		// effectively, just SubBytes
	mov	w14, v0.s[0]
	b	2f

	// First word of the cycle.  Byte substitution, rotation, and round
	// constant.
1:	ldrb	w13, [x5], #1		// next round constant
	dup	v0.4s, w14
	aese	v0.16b, v1.16b		// effectively, just SubBytes
	mov	w14, v0.s[0]
	eor	w14, w13, w14, ror #8

	// Common ending: mix in the word from the previous cycle and store.
2:	eor	w14, w14, w15
	str	w14, [x4], #4

	// Prepare for the next iteration.  If we're done, then stop; if
	// we've finished a cycle then reset the counter.
	add	x8, x8, #1
	sub	x6, x6, #1
	cmp	x8, x3
	cbz	x6, 9f
	csel	x8, x8, xzr, cc
	b	0b

	// Next job is to construct the decryption keys.  The keys for the
	// first and last rounds don't need to be mangled, but the remaining
	// ones do -- and they all need to be reordered too.
	//
	// The plan of action, then, is to copy the final encryption round's
	// keys into place first, then to do each of the intermediate rounds
	// in reverse order, and finally do the first round.
	//
	// Do all the heavy lifting with the vector registers.  The order
	// we're doing this in means that it's OK if we read or write too
	// much, and there's easily enough buffer space for the
	// over-enthusiastic reads and writes because the context has space
	// for 32-byte blocks, which is our maximum and an exact fit for two
	// full-width registers.
9:	add	x5, x0, #wi
	add	x4, x0, #w
	add	x4, x4, w2, uxtw #2
	sub	x4, x4, w1, uxtw #2	// last round's keys

	// Copy the last encryption round's keys.
	ld1	{v0.4s, v1.4s}, [x4]
	st1	{v0.4s, v1.4s}, [x5]

	// Update the loop variables and stop if we've finished.
0:	sub	w9, w9, #1
	add	x5, x5, w1, uxtw #2
	sub	x4, x4, w1, uxtw #2
	cbz	w9, 9f

	// Do another middle round's keys...
	ld1	{v0.4s, v1.4s}, [x4]
	aesimc	v0.16b, v0.16b
	aesimc	v1.16b, v1.16b
	st1	{v0.4s, v1.4s}, [x5]
	b	0b

	// Finally do the first encryption round.
9:	ld1	{v0.4s, v1.4s}, [x4]
	st1	{v0.4s, v1.4s}, [x5]

	// If the block size is not exactly four words then we must end-swap
	// everything.  We can use fancy vector toys for this.
	cmp	w1, #4
	b.eq	9f

	// End-swap the encryption keys.
	add	x1, x0, #w
	bl	endswap_block

	// And the decryption keys.
	add	x1, x0, #wi
	bl	endswap_block

	// All done.
9:	popreg	x29, x30
	ret
ENDFUNC

INTFUNC(endswap_block)
	// End-swap w2 words starting at x1.  x1 is clobbered; w2 is not.
	// It's OK to work in 16-byte chunks.
	mov	w3, w2
0:	subs	w3, w3, #4
	ld1	{v0.4s}, [x1]
	rev32	v0.16b, v0.16b
	st1	{v0.4s}, [x1], #16
	b.hi	0b
	ret
ENDFUNC

///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.
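	// A rough note on what the hardware instructions used below compute,
	// since it explains both the plain `eor' at the end of each function
	// and the `aesimc' mangling of the decryption keys in the key setup
	// above (a sketch only; see the architecture manual for the precise
	// definitions):
	//
	//	aese	v, k		// v = ShiftRows(SubBytes(v ^ k))
	//	aesmc	v, v		// v = MixColumns(v)
	//	aesd	v, k		// v = InvSubBytes(InvShiftRows(v ^ k))
	//	aesimc	v, v		// v = InvMixColumns(v)
	//
	// So a full encryption round is AESE followed by AESMC, the initial
	// whitening is folded into the first AESE, the final round omits the
	// AESMC, and the last whitening key is mixed in with an ordinary
	// EOR.  Decryption uses the `equivalent inverse cipher' arrangement:
	// the rounds run AESD/AESIMC in exactly the same shape, which comes
	// out right provided the middle rounds' keys have had InvMixColumns
	// applied to them first -- which is what the `aesimc' instructions
	// in the key setup arranged.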
.macro	encdec	op, aes, mc, koff
FUNC(rijndael_\op\()_arm64_crypto)

	// Arguments:
	//	x0 = pointer to context
	//	x1 = pointer to input block
	//	x2 = pointer to output block

	// Set things up ready.
	ldr	w3, [x0, #nr]
	add	x0, x0, #\koff
	ld1	{v0.4s}, [x1]
	rev32	v0.16b, v0.16b

	// Check the number of rounds and dispatch.
	cmp	w3, #14
	b.eq	14f
	cmp	w3, #10
	b.eq	10f
	cmp	w3, #12
	b.eq	12f
	cmp	w3, #13
	b.eq	13f
	cmp	w3, #11
	b.eq	11f
	callext	F(abort)

	// Eleven rounds.
11:	ld1	{v16.4s}, [x0], #16
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	b	10f

	// Twelve rounds.
12:	ld1	{v16.4s, v17.4s}, [x0], #32
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	\mc	v0.16b, v0.16b
	b	10f

	// Thirteen rounds.
13:	ld1	{v16.4s-v18.4s}, [x0], #48
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v18.16b
	\mc	v0.16b, v0.16b
	b	10f

	// Fourteen rounds.  (Drops through to the ten round case because
	// this is the next most common.)
14:	ld1	{v16.4s-v19.4s}, [x0], #64
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v18.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v19.16b
	\mc	v0.16b, v0.16b
	// Drop through...

	// Ten rounds.
10:	ld1	{v16.4s-v19.4s}, [x0], #64
	ld1	{v20.4s-v23.4s}, [x0], #64
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v18.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v19.16b
	\mc	v0.16b, v0.16b

	ld1	{v16.4s-v18.4s}, [x0], #48
	\aes	v0.16b, v20.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v21.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v22.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v23.16b
	\mc	v0.16b, v0.16b

	// Final round has no MixColumns, but is followed by final whitening.
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	eor	v0.16b, v0.16b, v18.16b

	// All done.
	rev32	v0.16b, v0.16b
	st1	{v0.4s}, [x2]
	ret
ENDFUNC
.endm

	encdec	eblk, aese, aesmc, w
	encdec	dblk, aesd, aesimc, wi

///----- That's all, folks --------------------------------------------------