X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/26e182fc3ae2a40dc7d52bab2318d8d1837dfeee..318c3c31be343fdba362cb60f33aab3e88798d8a:/symm/rijndael-arm-crypto.S

diff --git a/symm/rijndael-arm-crypto.S b/symm/rijndael-arm-crypto.S
index d33cac6b..1df81d97 100644
--- a/symm/rijndael-arm-crypto.S
+++ b/symm/rijndael-arm-crypto.S
@@ -25,20 +25,22 @@
 /// MA 02111-1307, USA.
 
 ///--------------------------------------------------------------------------
-/// External definitions.
+/// Preliminaries.
 
 #include "config.h"
 #include "asm-common.h"
 
-	.globl	F(abort)
-	.globl	F(rijndael_rcon)
+	.arch	armv8-a
+	.fpu	crypto-neon-fp-armv8
+
+	.extern	F(abort)
+	.extern	F(rijndael_rcon)
+
+	.text
 
 ///--------------------------------------------------------------------------
 /// Main code.
 
-	.arch	armv8-a
-	.fpu	crypto-neon-fp-armv8
-
 /// The ARM crypto extension implements a little-endian version of AES
 /// (though the manual doesn't actually spell this out and you have to
 /// experiment), but Catacomb's internal interface presents as big-endian so
@@ -52,7 +54,7 @@
 	// Useful constants.
 	.equ	maxrounds, 16		// maximum number of rounds
 	.equ	maxblksz, 32		// maximum block size, in bytes
-	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
+	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer
 
 	// Context structure.
 	.equ	nr, 0			// number of rounds
@@ -70,7 +72,7 @@ FUNC(rijndael_setup_arm_crypto)
 	//	r2 = pointer to key material
 	//	r3 = key size in words
 
-	stmfd	sp!, {r4-r9, r14}
+	pushreg	r4-r9, r14
 
 	// The initial round key material is taken directly from the input
 	// key, so copy it over.  Unfortunately, the key material is not
@@ -78,114 +80,73 @@ FUNC(rijndael_setup_arm_crypto)
 	// sort this out.
 	add	r9, r0, #w
 	mov	r14, r3
-	ands	r4, r2, #3
+	ands	r6, r2, #3
 	beq	1f
-	mov	r4, r4, lsl #3
-	rsb	r5, r4, #32
+	mov	r6, r6, lsl #3
+	rsb	r7, r6, #32
 	bic	r2, r2, #3
-	ldr	r6, [r2], #4
+	ldr	r4, [r2], #4
 
-0:	ldr	r7, [r2], #4
-	mov	r6, r6, lsr r4
-	orr	r6, r7, lsl r5
-	str	r6, [r9], #4
-	mov	r6, r7
+0:	ldr	r5, [r2], #4
+	mov	r4, r4, lsr r6
+	orr	r4, r5, lsl r7
+	str	r4, [r9], #4
 	subs	r14, r14, #1
+	movhi	r4, r5
 	bhi	0b
 	b	9f
 
-1:	ldr	r6, [r2], #4
-	str	r6, [r9], #4
+1:	ldr	r4, [r2], #4
+	str	r4, [r9], #4
 	subs	r14, r14, #1
 	bhi	1b
 
 	// Find out other useful things and prepare for the main loop.
-	ldr	r7, [r0, #nr]		// number of rounds
+9:	ldr	r7, [r0, #nr]		// number of rounds
 	mla	r2, r1, r7, r1		// total key size in words
-	ldr	r4, [r9, #-4]		// most recent key word
 	leaextq	r5, rijndael_rcon	// round constants
 	sub	r8, r2, r3		// minus what we've copied already
-	veor	q1, q1			// all-zero register for the key
+	vmov.i32 q1, #0			// all-zero register for the key
 	add	r8, r9, r8, lsl #2	// limit of the key buffer
+	mov	r12, #0			// position in current cycle
+
+	// Main key expansion loop.  Dispatch according to the position in
+	// the cycle.
+0:	ldr	r6, [r9, -r3, lsl #2]	// word from previous cycle
+	cmp	r12, #0			// first word of the cycle?
+	beq	1f
+	cmp	r12, #4			// fourth word of the cycle?
+	bne	2f
+	cmp	r3, #7			// seven or eight words of key?
+	bcc	2f
 
-	// Main key expansion loop.  The first word of each key-length chunk
-	// needs special treatment.
-9:	ldrb	r14, [r5], #1		// next round constant
-	ldr	r6, [r9, -r3, lsl #2]
+	// Position 4 in the cycle (i.e., the fifth word), with seven or eight
+	// words of key.  We must do the byte substitution.
 	vdup.32	q0, r4
 	aese.8	q0, q1			// effectively, just SubBytes
 	vmov.32	r4, d0[0]
-	eor	r4, r14, r4, ror #8
-	eor	r4, r4, r6
-	str	r4, [r9], #4
-	cmp	r9, r8
-	bcs	8f
-
-	// The next three words are simple.
-	ldr	r6, [r9, -r3, lsl #2]
-	eor	r4, r4, r6
-	str	r4, [r9], #4
-	cmp	r9, r8
-	bcs	8f
-
-	// (Word 2...)
-	ldr	r6, [r9, -r3, lsl #2]
-	eor	r4, r4, r6
-	str	r4, [r9], #4
-	cmp	r9, r8
-	bcs	8f
+	b	2f
 
-	// (Word 3...)
-	ldr	r6, [r9, -r3, lsl #2]
-	eor	r4, r4, r6
-	str	r4, [r9], #4
-	cmp	r9, r8
-	bcs	8f
-
-	// Word 4.  If the key is /more/ than 6 words long, then we must
-	// apply a substitution here.
-	cmp	r3, #5
-	bcc	9b
-	ldr	r6, [r9, -r3, lsl #2]
-	cmp	r3, #7
-	bcc	0f
+	// First word of the cycle.  Byte substitution, rotation, and round
+	// constant.
+1:	ldrb	r14, [r5], #1		// next round constant
 	vdup.32	q0, r4
 	aese.8	q0, q1			// effectively, just SubBytes
 	vmov.32	r4, d0[0]
-0:	eor	r4, r4, r6
-	str	r4, [r9], #4
-	cmp	r9, r8
-	bcs	8f
-
-	// (Word 5...)
-	cmp	r3, #6
-	bcc	9b
-	ldr	r6, [r9, -r3, lsl #2]
-	eor	r4, r4, r6
-	str	r4, [r9], #4
-	cmp	r9, r8
-	bcs	8f
+	eor	r4, r14, r4, ror #8
 
-	// (Word 6...)
-	cmp	r3, #7
-	bcc	9b
-	ldr	r6, [r9, -r3, lsl #2]
-	eor	r4, r4, r6
+	// Common ending: mix in the word from the previous cycle and store.
+2:	eor	r4, r4, r6
 	str	r4, [r9], #4
-	cmp	r9, r8
-	bcs	8f
 
-	// (Word 7...)
-	cmp	r3, #8
-	bcc	9b
-	ldr	r6, [r9, -r3, lsl #2]
-	eor	r4, r4, r6
-	str	r4, [r9], #4
+	// Prepare for the next iteration.  If we're done, then stop; if
+	// we've finished a cycle then reset the counter.
+	add	r12, r12, #1
 	cmp	r9, r8
-	bcs	8f
-
-	// Must be done by now.
-	b	9b
+	bcs	9f
+	cmp	r12, r3
+	movcs	r12, #0
+	b	0b
 
 	// Next job is to construct the decryption keys.  The keys for the
 	// first and last rounds don't need to be mangled, but the remaining
@@ -200,7 +161,7 @@ FUNC(rijndael_setup_arm_crypto)
 	// there's easily enough buffer space for the over-enthusiastic reads
 	// and writes because the context has space for 32-byte blocks, which
 	// is our maximum and an exact fit for two Q-class registers.
-8:	add	r5, r0, #wi
+9:	add	r5, r0, #wi
 	add	r4, r0, #w
 	add	r4, r4, r2, lsl #2
 	sub	r4, r4, r1, lsl #2		// last round's keys
@@ -213,10 +174,10 @@ FUNC(rijndael_setup_arm_crypto)
 	vstmiane r5, {d0-d3}
 
 	// Update the loop variables and stop if we've finished.
-9:	sub	r4, r4, r1, lsl #2
+0:	sub	r4, r4, r1, lsl #2
 	add	r5, r5, r1, lsl #2
 	subs	r7, r7, #1
-	beq	0f
+	beq	9f
 
 	// Do another middle round's keys...
 	teq	r1, #4
@@ -224,13 +185,13 @@ FUNC(rijndael_setup_arm_crypto)
 	vldmiane r4, {d0-d3}
 	aesimc.8 q0, q0
 	vstmiaeq r5, {d0, d1}
-	beq	9b
+	beq	0b
 	aesimc.8 q1, q1
 	vstmia	r5, {d0-d3}
-	b	9b
+	b	0b
 
 	// Finally do the first encryption round.
-0:	teq	r1, #4
+9:	teq	r1, #4
 	vldmiaeq r4, {d0, d1}
 	vldmiane r4, {d0-d3}
 	vstmiaeq r5, {d0, d1}
@@ -238,7 +199,7 @@ FUNC(rijndael_setup_arm_crypto)
 
 	// If the block size is not exactly four words then we must end-swap
 	// everything.  We can use fancy NEON toys for this.
-	beq	0f
+	beq	9f
 
 	// End-swap the encryption keys.
 	add	r1, r0, #w
@@ -249,11 +210,14 @@ FUNC(rijndael_setup_arm_crypto)
 	bl	endswap_block
 
 	// All done.
-0:	ldmfd	sp!, {r4-r9, pc}
+9:	popreg	r4-r9, pc
+
+ENDFUNC
 
-endswap_block:
+INTFUNC(endswap_block)
 	// End-swap R2 words starting at R1.  R1 is clobbered; R2 is not.
 	// It's OK to work in 16-byte chunks.
+
 	mov	r4, r2
 0:	vldmia	r1, {d0, d1}
 	vrev32.8 q0, q0
@@ -267,7 +231,8 @@ ENDFUNC
 ///--------------------------------------------------------------------------
 /// Encrypting and decrypting blocks.
 
-FUNC(rijndael_eblk_arm_crypto)
+.macro	encdec	op, aes, mc, koff
+  FUNC(rijndael_\op\()_arm_crypto)
 
 	// Arguments:
 	//	r0 = pointer to context
@@ -276,77 +241,95 @@ FUNC(rijndael_eblk_arm_crypto)
 
 	// Set things up ready.
 	ldr	r3, [r0, #nr]
-	add	r0, r0, #w
+	add	r0, r0, #\koff
 	vldmia	r1, {d0, d1}
 	vrev32.8 q0, q0
 
-	// Dispatch according to the number of rounds.
-	add	r3, r3, r3, lsl #1
-	rsbs	r3, r3, #3*14
-	addcs	pc, pc, r3, lsl #2
+	// Check the number of rounds and dispatch.
+	sub	r3, r3, #10
+	cmp	r3, #5
+	addlo	pc, pc, r3, lsl #2
 	callext	F(abort)
 
-	// The last round doesn't have MixColumns, so do it separately.
-	.rept	13
-	vldmia	r0!, {d2, d3}
-	aese.8	q0, q1
-	aesmc.8	q0, q0
-	.endr
-
-	// Final round.
-	vldmia	r0!, {d2, d3}
-	aese.8	q0, q1
-
-	// Final whitening.
-	vldmia	r0!, {d2, d3}
-	veor	q0, q1
+	b	10f
+	b	11f
+	b	12f
+	b	13f
+	b	14f
+
+	// Eleven rounds.
+11:	vldmia	r0!, {d16, d17}
+	\aes\().8 q0, q8
+	\mc\().8 q0, q0
+	b	10f
+
+	// Twelve rounds.
+12:	vldmia	r0!, {d16-d19}
+	\aes\().8 q0, q8
+	\mc\().8 q0, q0
+	\aes\().8 q0, q9
+	\mc\().8 q0, q0
+	b	10f
+
+	// Thirteen rounds.
+13:	vldmia	r0!, {d16-d21}
+	\aes\().8 q0, q8
+	\mc\().8 q0, q0
+	\aes\().8 q0, q9
+	\mc\().8 q0, q0
+	\aes\().8 q0, q10
+	\mc\().8 q0, q0
+	b	10f
+
+	// Fourteen rounds.  (Drops through to the ten-round case because,
+	// after ten rounds, fourteen is the next most common count.)
+14:	vldmia	r0!, {d16-d23}
+	\aes\().8 q0, q8
+	\mc\().8 q0, q0
+	\aes\().8 q0, q9
+	\mc\().8 q0, q0
+	\aes\().8 q0, q10
+	\mc\().8 q0, q0
+	\aes\().8 q0, q11
+	\mc\().8 q0, q0
+	// Drop through...
+
+	// Ten rounds.
+10:	vldmia	r0!, {d16-d25}
+	\aes\().8 q0, q8
+	\mc\().8 q0, q0
+	\aes\().8 q0, q9
+	\mc\().8 q0, q0
+	\aes\().8 q0, q10
+	\mc\().8 q0, q0
+	\aes\().8 q0, q11
+	\mc\().8 q0, q0
+	\aes\().8 q0, q12
+	\mc\().8 q0, q0
+
+	vldmia	r0!, {d16-d27}
+	\aes\().8 q0, q8
+	\mc\().8 q0, q0
+	\aes\().8 q0, q9
+	\mc\().8 q0, q0
+	\aes\().8 q0, q10
+	\mc\().8 q0, q0
+	\aes\().8 q0, q11
+	\mc\().8 q0, q0
+
+	// Final round has no MixColumns, but is followed by final whitening.
+	\aes\().8 q0, q12
+	veor	q0, q0, q13
 
 	// All done.
 	vrev32.8 q0, q0
 	vstmia	r2, {d0, d1}
 	bx	r14
 
-ENDFUNC
-
-FUNC(rijndael_dblk_arm_crypto)
-
-	// Arguments:
-	//	r0 = pointer to context
-	//	r1 = pointer to input block
-	//	r2 = pointer to output block
+  ENDFUNC
+.endm
 
-	// Set things up ready.
-	ldr	r3, [r0, #nr]
-	add	r0, r0, #wi
-	vldmia	r1, {d0, d1}
-	vrev32.8 q0, q0
-
-	// Dispatch according to the number of rounds.
-	add	r3, r3, r3, lsl #1
-	rsbs	r3, r3, #3*14
-	addcs	pc, pc, r3, lsl #2
-	callext	F(abort)
-
-	// The last round doesn't have MixColumns, so do it separately.
-	.rept	13
-	vldmia	r0!, {d2, d3}
-	aesd.8	q0, q1
-	aesimc.8 q0, q0
-	.endr
-
-	// Final round.
-	vldmia	r0!, {d2, d3}
-	aesd.8	q0, q1
-
-	// Final whitening.
-	vldmia	r0!, {d2, d3}
-	veor	q0, q1
-
-	// All done.
-	vrev32.8 q0, q0
-	vstmia	r2, {d0, d1}
-	bx	r14
-
-ENDFUNC
+	encdec	eblk, aese, aesmc, w
+	encdec	dblk, aesd, aesimc, wi
 
 ///----- That's all, folks --------------------------------------------------