/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// ARM crypto-extension-based implementation of Rijndael
///
/// (c) 2016 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

	.arch	armv8-a
	.fpu	crypto-neon-fp-armv8

	.extern	F(abort)
	.extern	F(rijndael_rcon)

	.text

///--------------------------------------------------------------------------
/// Main code.

/// The ARM crypto extension implements a little-endian version of AES
/// (though the manual doesn't actually spell this out and you have to
/// experiment), but Catacomb's internal interface presents as big-endian so
/// as to work better with things like GCM.  We therefore maintain the round
/// keys in little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the crypto extension doesn't implement the larger-
/// block versions of Rijndael, so we have to end-swap the keys if we're
/// preparing for one of those.

	// Useful constants.
	.equ	maxrounds, 16		// maximum number of rounds
	.equ	maxblksz, 32		// maximum block size, in bytes
	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer

	// Context structure.
	.equ	nr, 0			// number of rounds
	.equ	w, nr + 4		// encryption key words
	.equ	wi, w + kbufsz		// decryption key words

///--------------------------------------------------------------------------
/// Key setup.

FUNC(rijndael_setup_arm_crypto)

	// Arguments:
	//	r0 = pointer to context
	//	r1 = block size in words
	//	r2 = pointer to key material
	//	r3 = key size in words

	pushreg	r4-r9, r14

	// The initial round key material is taken directly from the input
	// key, so copy it over.  Unfortunately, the key material is not
	// guaranteed to be aligned in any especially useful way, so we must
	// sort this out.
	add	r9, r0, #w
	mov	r14, r3
	ands	r6, r2, #3
	beq	1f
	mov	r6, r6, lsl #3
	rsb	r7, r6, #32
	bic	r2, r2, #3
	ldr	r4, [r2], #4

0:	ldr	r5, [r2], #4
	mov	r4, r4, lsr r6
	orr	r4, r5, lsl r7
	str	r4, [r9], #4
	subs	r14, r14, #1
	movhi	r4, r5
	bhi	0b
	b	9f

1:	ldr	r4, [r2], #4
	str	r4, [r9], #4
	subs	r14, r14, #1
	bhi	1b

	// Find out other useful things and prepare for the main loop.
9:	ldr	r7, [r0, #nr]		// number of rounds
	mla	r2, r1, r7, r1		// total key size in words
	leaextq	r5, rijndael_rcon	// round constants
	sub	r8, r2, r3		// minus what we've copied already
	vmov.i32 q1, #0			// all-zero register for the key
	add	r8, r9, r8, lsl #2	// limit of the key buffer
	mov	r12, #0			// position in current cycle

	// Main key expansion loop.  Dispatch according to the position in
	// the cycle.
0:	ldr	r6, [r9, -r3, lsl #2]	// word from previous cycle
	cmp	r12, #0			// first word of the cycle?
	beq	1f
	cmp	r12, #4			// fourth word of the cycle?
	bne	2f
	cmp	r3, #7			// seven or eight words of key?
	bcc	2f

	// Fourth word of the cycle, seven or eight words of key.  We must do
	// the byte substitution.
	vdup.32	q0, r4
	aese.8	q0, q1			// effectively, just SubBytes
	vmov.32	r4, d0[0]
	b	2f

	// First word of the cycle.  Byte substitution, rotation, and round
	// constant.
1:	ldrb	r14, [r5], #1		// next round constant
	vdup.32	q0, r4
	aese.8	q0, q1			// effectively, just SubBytes
	vmov.32	r4, d0[0]
	eor	r4, r14, r4, ror #8

	// Common ending: mix in the word from the previous cycle and store.
2:	eor	r4, r4, r6
	str	r4, [r9], #4

	// Prepare for the next iteration.  If we're done, then stop; if
	// we've finished a cycle then reset the counter.
	add	r12, r12, #1
	cmp	r9, r8
	bcs	9f
	cmp	r12, r3
	movcs	r12, #0
	b	0b

	// Next job is to construct the decryption keys.  The keys for the
	// first and last rounds don't need to be mangled, but the remaining
	// ones do -- and they all need to be reordered too.
	//
	// The plan of action, then, is to copy the final encryption round's
	// keys into place first, then to do each of the intermediate rounds
	// in reverse order, and finally do the first round.
	//
	// Do all the heavy lifting with NEON registers.  The order we're
	// doing this in means that it's OK if we read or write too much, and
	// there's easily enough buffer space for the over-enthusiastic reads
	// and writes because the context has space for 32-byte blocks, which
	// is our maximum and an exact fit for two Q-class registers.
9:	add	r5, r0, #wi
	add	r4, r0, #w
	add	r4, r4, r2, lsl #2
	sub	r4, r4, r1, lsl #2		// last round's keys

	// Copy the last encryption round's keys.
	teq	r1, #4
	vldmiaeq r4, {d0, d1}
	vldmiane r4, {d0-d3}
	vstmiaeq r5, {d0, d1}
	vstmiane r5, {d0-d3}

	// Update the loop variables and stop if we've finished.
0:	sub	r4, r4, r1, lsl #2
	add	r5, r5, r1, lsl #2
	subs	r7, r7, #1
	beq	9f

	// Do another middle round's keys...
	teq	r1, #4
	vldmiaeq r4, {d0, d1}
	vldmiane r4, {d0-d3}
	aesimc.8 q0, q0
	vstmiaeq r5, {d0, d1}
	beq	0b
	aesimc.8 q1, q1
	vstmia	r5, {d0-d3}
	b	0b

	// Finally do the first encryption round.
9:	teq	r1, #4
	vldmiaeq r4, {d0, d1}
	vldmiane r4, {d0-d3}
	vstmiaeq r5, {d0, d1}
	vstmiane r5, {d0-d3}

	// If the block size is not exactly four words then we must end-swap
	// everything.  We can use fancy NEON toys for this.
	beq	9f

	// End-swap the encryption keys.
	add	r1, r0, #w
	bl	endswap_block

	// And the decryption keys
	add	r1, r0, #wi
	bl	endswap_block

	// All done.
9:	popreg	r4-r9, pc

ENDFUNC

INTFUNC(endswap_block)
	// End-swap R2 words starting at R1.  R1 is clobbered; R2 is not.
	// It's OK to work in 16-byte chunks.

	mov	r4, r2
0:	vldmia	r1, {d0, d1}
	vrev32.8 q0, q0
	vstmia	r1!, {d0, d1}
	subs	r4, r4, #4
	bhi	0b
	bx	r14

ENDFUNC

///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.

.macro	encdec	op, aes, mc, koff
  FUNC(rijndael_\op\()_arm_crypto)

	// Arguments:
	//	r0 = pointer to context
	//	r1 = pointer to input block
	//	r2 = pointer to output block

	// Set things up ready.
	ldr	r3, [r0, #nr]
	add	r0, r0, #\koff
	vldmia	r1, {d0, d1}
	vrev32.8 q0, q0

	// Check the number of rounds and dispatch.
	sub	r3, r3, #10
	cmp	r3, #5
	addlo	pc, pc, r3, lsl #2
	callext	F(abort)

	b	10f
	b	11f
	b	12f
	b	13f
	b	14f

	// Eleven rounds.
11:	vldmia	r0!, {d16, d17}
	\aes\().8 q0, q8
	\mc\().8 q0, q0
	b	10f

	// Twelve rounds.
12:	vldmia	r0!, {d16-d19}
	\aes\().8 q0, q8
	\mc\().8 q0, q0
	\aes\().8 q0, q9
	\mc\().8 q0, q0
	b	10f

	// Thirteen rounds.
13:	vldmia	r0!, {d16-d21}
	\aes\().8 q0, q8
	\mc\().8 q0, q0
	\aes\().8 q0, q9
	\mc\().8 q0, q0
	\aes\().8 q0, q10
	\mc\().8 q0, q0
	b	10f

	// Fourteen rounds.  (Drops through to the ten round case because
	// this is the next most common.)
14:	vldmia	r0!, {d16-d23}
	\aes\().8 q0, q8
	\mc\().8 q0, q0
	\aes\().8 q0, q9
	\mc\().8 q0, q0
	\aes\().8 q0, q10
	\mc\().8 q0, q0
	\aes\().8 q0, q11
	\mc\().8 q0, q0
	// Drop through...

	// Ten rounds.
10:	vldmia	r0!, {d16-d25}
	\aes\().8 q0, q8
	\mc\().8 q0, q0
	\aes\().8 q0, q9
	\mc\().8 q0, q0
	\aes\().8 q0, q10
	\mc\().8 q0, q0
	\aes\().8 q0, q11
	\mc\().8 q0, q0
	\aes\().8 q0, q12
	\mc\().8 q0, q0

	vldmia	r0!, {d16-d27}
	\aes\().8 q0, q8
	\mc\().8 q0, q0
	\aes\().8 q0, q9
	\mc\().8 q0, q0
	\aes\().8 q0, q10
	\mc\().8 q0, q0
	\aes\().8 q0, q11
	\mc\().8 q0, q0

	// Final round has no MixColumns, but is followed by final whitening.
	\aes\().8 q0, q12
	veor	q0, q0, q13

	// All done.
	vrev32.8 q0, q0
	vstmia	r2, {d0, d1}
	bx	r14

  ENDFUNC
.endm

	encdec	eblk, aese, aesmc, w
	encdec	dblk, aesd, aesimc, wi

///----- That's all, folks --------------------------------------------------