mdw@git.distorted.org.uk Git - catacomb/blame_incremental

... / ...

Commit	Line	Data
	1	/// -*- mode: asm; asm-comment-char: ?/ -*-
	2	///
	3	/// ARM crypto-extension-based implementation of Rijndael
	4	///
	5	/// (c) 2016 Straylight/Edgeware
	6	///
	7
	8	///----- Licensing notice ---------------------------------------------------
	9	///
	10	/// This file is part of Catacomb.
	11	///
	12	/// Catacomb is free software; you can redistribute it and/or modify
	13	/// it under the terms of the GNU Library General Public License as
	14	/// published by the Free Software Foundation; either version 2 of the
	15	/// License, or (at your option) any later version.
	16	///
	17	/// Catacomb is distributed in the hope that it will be useful,
	18	/// but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	/// GNU Library General Public License for more details.
	21	///
	22	/// You should have received a copy of the GNU Library General Public
	23	/// License along with Catacomb; if not, write to the Free
	24	/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	/// MA 02111-1307, USA.
	26
	27	///--------------------------------------------------------------------------
	28	/// External definitions.
	29
	30	#include "config.h"
	31	#include "asm-common.h"
	32
	33	.extern F(abort)
	34	.extern F(rijndael_rcon)
	35
	36	///--------------------------------------------------------------------------
	37	/// Main code.
	38
	39	.arch armv8-a // target architecture for everything below
	40	.fpu crypto-neon-fp-armv8 // NEON plus crypto extension: needed for the aese/aesd/aesmc/aesimc instructions used below
	41
	42	/// The ARM crypto extension implements a little-endian version of AES
	43	/// (though the manual doesn't actually spell this out and you have to
	44	/// experiment), but Catacomb's internal interface presents as big-endian so
	45	/// as to work better with things like GCM. We therefore maintain the round
	46	/// keys in little-endian form, and have to end-swap blocks in and out.
	47	///
	48	/// For added amusement, the crypto extension doesn't implement the larger-
	49	/// block versions of Rijndael, so we have to end-swap the keys if we're
	50	/// preparing for one of those.
	51
	52	// Useful constants.
	53	.equ maxrounds, 16 // maximum number of rounds
	54	.equ maxblksz, 32 // maximum block size, in bytes
	55	.equ kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer
		// (kbufsz works out as 32*17 = 544 bytes: one maximum-size block of
		// key material per round, plus one more.)
	56
	57	// Context structure.
		// These are byte offsets from the start of the context.  The code
		// below reads `nr' with a 32-bit load, and `w' starts four bytes
		// after it; `wi' follows the whole of the `w' buffer, at offset
		// 4 + 544 = 548.
	58	.equ nr, 0 // number of rounds
	59	.equ w, nr + 4 // encryption key words
	60	.equ wi, w + kbufsz // decryption key words
	61
	62	///--------------------------------------------------------------------------
	63	/// Key setup.
	64
	65	FUNC(rijndael_setup_arm_crypto)
	66
		// Expand the supplied key into the encryption round keys stored at
		// offset w in the context, then derive the decryption round keys
		// stored at offset wi.  The number of rounds is only read here
		// (from offset nr in the context); this routine never writes it.
		//
	67	// Arguments:
	68	// r0 = pointer to context
	69	// r1 = block size in words
	70	// r2 = pointer to key material
	71	// r3 = key size in words
		//
		// Register usage: r4-r9 and the return address are saved on entry
		// and restored on exit.  r0 and r3 are never written.  r2, r12, q0,
		// q1 and the flags are overwritten; r1 is overwritten only on the
		// end-swapping path (block size other than four words).
	72
	73	pushreg {r4-r9, r14} // save working registers and the return address
	74
	75	// The initial round key material is taken directly from the input
	76	// key, so copy it over. Unfortunately, the key material is not
	77	// guaranteed to be aligned in any especially useful way, so we must
	78	// sort this out.
		//
		// Both copy loops below finish with r4 holding the last key word
		// stored, which the expansion loop then uses as its `previous word'.
	79	add r9, r0, #w // r9 = write pointer into the encryption keys
	80	mov r14, r3 // r14 = number of key words still to copy
	81	ands r6, r2, #3 // r6 = byte offset of the key within a word
	82	beq 1f // word-aligned already: use the simple copy
	83	mov r6, r6, lsl #3 // r6 = that offset in bits (right-shift count)
	84	rsb r7, r6, #32 // r7 = 32 - r6 (complementary left-shift count)
	85	bic r2, r2, #3 // round the key pointer down to a word boundary
	86	ldr r4, [r2], #4 // r4 = aligned word holding the first key bytes
	87
		// Unaligned copy: build each output word from two adjacent aligned
		// words, r4 (current) and r5 (next).
	88	0: ldr r5, [r2], #4 // r5 = next aligned word
	89	mov r4, r4, lsr r6 // shift the wanted part of the current word down
	90	orr r4, r5, lsl r7 // fill the top from the next word
	91	str r4, [r9], #4
	92	subs r14, r14, #1
	93	movhi r4, r5 // more to copy: next word becomes the current one
	94	bhi 0b // (mov leaves the flags from subs intact)
	95	b 9f
	96
		// Aligned copy: plain word-by-word transfer.
	97	1: ldr r4, [r2], #4
	98	str r4, [r9], #4
	99	subs r14, r14, #1
	100	bhi 1b
	101
	102	// Find out other useful things and prepare for the main loop.
	103	9: ldr r7, [r0, #nr] // number of rounds
	104	mla r2, r1, r7, r1 // total key size in words
		// (that is, r2 = r1*(r7 + 1): one block of key per round, plus one)
	105	leaextq r5, rijndael_rcon // round constants
	106	sub r8, r2, r3 // minus what we've copied already
	107	veor q1, q1 // all-zero register for the key
	108	add r8, r9, r8, lsl #2 // limit of the key buffer
	109	mov r12, #0 // position in current cycle
	110
	111	// Main key expansion loop. Dispatch according to the position in
	112	// the cycle.
		//
		// Register roles in this loop:
		// r3 = key size in words (the cycle length)
		// r4 = the key word most recently stored
		// r5 = pointer to the next round constant
		// r6 = the key word stored r3 words before the write pointer
		// r8 = limit of the key buffer
		// r9 = write pointer
		// r12 = position in the current cycle
	113	0: ldr r6, [r9, -r3, lsl #2] // word from previous cycle
	114	cmp r12, #0 // first word of the cycle?
	115	beq 1f
	116	cmp r12, #4 // fourth word of the cycle?
	117	bne 2f
	118	cmp r3, #7 // seven or eight words of key?
	119	bcc 2f
	120
	121	// Fourth word of the cycle, seven or eight words of key. We must do
	122	// the byte substitution.
	123	vdup.32 q0, r4 // copy the word into all four lanes of q0
	124	aese.8 q0, q1 // effectively, just SubBytes
	125	vmov.32 r4, d0[0] // take the substituted word back from lane 0
	126	b 2f
	127
	128	// First word of the cycle. Byte substitution, rotation, and round
	129	// constant.
	130	1: ldrb r14, [r5], #1 // next round constant
		// NOTE(review): r6 was already loaded with this same word at the
		// top of the loop, and nothing on the path from there to here writes
		// r3, r6, r9 or memory, so this reload looks redundant.
	131	ldr r6, [r9, -r3, lsl #2]
	132	vdup.32 q0, r4 // copy the word into all four lanes of q0
	133	aese.8 q0, q1 // effectively, just SubBytes
	134	vmov.32 r4, d0[0] // take the substituted word back from lane 0
	135	eor r4, r14, r4, ror #8 // rotate by one byte and mix in the constant
	136
	137	// Common ending: mix in the word from the previous cycle and store.
	138	2: eor r4, r4, r6
	139	str r4, [r9], #4
	140
	141	// Prepare for the next iteration. If we're done, then stop; if
	142	// we've finished a cycle then reset the counter.
	143	add r12, r12, #1
	144	cmp r9, r8 // reached the limit of the key buffer?
	145	bcs 9f
	146	cmp r12, r3 // end of this cycle?
	147	movcs r12, #0
	148	b 0b
	149
	150	// Next job is to construct the decryption keys. The keys for the
	151	// first and last rounds don't need to be mangled, but the remaining
	152	// ones do -- and they all need to be reordered too.
	153	//
	154	// The plan of action, then, is to copy the final encryption round's
	155	// keys into place first, then to do each of the intermediate rounds
	156	// in reverse order, and finally do the first round.
	157	//
	158	// Do all the heavy lifting with NEON registers. The order we're
	159	// doing this in means that it's OK if we read or write too much, and
	160	// there's easily enough buffer space for the over-enthusiastic reads
	161	// and writes because the context has space for 32-byte blocks, which
	162	// is our maximum and an exact fit for two Q-class registers.
		//
		// From here on r4 is the read pointer, walking backwards through the
		// encryption keys, and r5 is the write pointer, walking forwards
		// through the decryption keys.  r2 still holds the total key size in
		// words and r7 the number of rounds.
	163	9: add r5, r0, #wi // r5 = start of the decryption keys
	164	add r4, r0, #w
	165	add r4, r4, r2, lsl #2 // r4 = end of the encryption keys
	166	sub r4, r4, r1, lsl #2 // last round's keys
	167
	168	// Copy the last encryption round's keys.
		// A four-word block moves one Q register's worth (16 bytes); any
		// other block size moves two (32 bytes).
	169	teq r1, #4
	170	vldmiaeq r4, {d0, d1}
	171	vldmiane r4, {d0-d3}
	172	vstmiaeq r5, {d0, d1}
	173	vstmiane r5, {d0-d3}
	174
	175	// Update the loop variables and stop if we've finished.
	176	0: sub r4, r4, r1, lsl #2 // step back one round in the source
	177	add r5, r5, r1, lsl #2 // step forward one round in the destination
	178	subs r7, r7, #1 // r7 counts down from the number of rounds
	179	beq 9f // zero: only the first round's keys are left
	180
	181	// Do another middle round's keys...
	182	teq r1, #4
	183	vldmiaeq r4, {d0, d1}
	184	vldmiane r4, {d0-d3}
	185	aesimc.8 q0, q0
	186	vstmiaeq r5, {d0, d1}
	187	beq 0b // flags are still those from teq: four-word block is done
	188	aesimc.8 q1, q1 // larger block: transform the second half too
	189	vstmia r5, {d0-d3}
	190	b 0b
	191
	192	// Finally do the first encryption round.
	193	9: teq r1, #4
	194	vldmiaeq r4, {d0, d1}
	195	vldmiane r4, {d0-d3}
	196	vstmiaeq r5, {d0, d1}
	197	vstmiane r5, {d0-d3}
	198
	199	// If the block size is not exactly four words then we must end-swap
	200	// everything. We can use fancy NEON toys for this.
	201	beq 9f // flags are still those from the teq above
	202
	203	// End-swap the encryption keys.
		// endswap_block takes its word count in r2, which still holds the
		// total key size in words; the call leaves r2 intact for reuse.
	204	add r1, r0, #w
	205	bl endswap_block
	206
	207	// And the decryption keys
	208	add r1, r0, #wi
	209	bl endswap_block
	210
	211	// All done.
	212	9: popreg {r4-r9, pc} // restore registers; loading pc returns
	213
	214	ENDFUNC
	215
	216	INTFUNC(endswap_block)
	217	// End-swap R2 words starting at R1. R1 is clobbered; R2 is not.
	218	// It's OK to work in 16-byte chunks.
		//
		// Each 32-bit word in the range has its bytes reversed in place.
		// Besides R1, this also overwrites R4, Q0 and the flags.  The loop
		// handles four words per pass and always runs at least once, so when
		// R2 is not a multiple of four, up to three words beyond the
		// requested range are swapped too.  On return R1 points just past
		// the last chunk processed.
	219
	220	mov r4, r2 // r4 = words remaining; keeps r2 intact for the caller
	221	0: vldmia r1, {d0, d1} // fetch the next 16 bytes
	222	vrev32.8 q0, q0 // reverse the bytes within each 32-bit word
	223	vstmia r1!, {d0, d1} // store them back and advance r1
	224	subs r4, r4, #4 // four words done
	225	bhi 0b // loop while any remain
	226	bx r14
	227
	228	ENDFUNC
	229
	230	///--------------------------------------------------------------------------
	231	/// Encrypting and decrypting blocks.
	232
	233	.macro encdec op, aes, mc, koff
		// Define the function rijndael_<op>_arm_crypto, which transforms a
		// single 16-byte block using the round keys held in the context.
		// Macro parameters:
		// op = name fragment for the function being defined
		// aes = instruction applied together with each round key
		// mc = column-mixing instruction applied after every round
		// except the last
		// koff = offset in the context of the round keys to use
	234	FUNC(rijndael_\op\()_arm_crypto)
	235
	236	// Arguments:
	237	// r0 = pointer to context
	238	// r1 = pointer to input block
	239	// r2 = pointer to output block
		//
		// Overwrites r0, r3, q0, q8-q13 and the flags.  Only round counts
		// from 10 to 14 are handled; anything else ends up calling abort.
	240
	241	// Set things up ready.
	242	ldr r3, [r0, #nr] // r3 = number of rounds
	243	add r0, r0, #\koff // r0 = pointer to the first round key
	244	vldmia r1, {d0, d1} // q0 = input block
	245	vrev32.8 q0, q0 // reverse the bytes within each 32-bit word
	246
	247	// Check the number of rounds and dispatch.
		//
		// In ARM state, reading pc gives the address of the current
		// instruction plus 8, i.e. two instructions further on.  Index 0
		// therefore lands on the first branch of the table below.
		// NOTE(review): this relies on `callext F(abort)' expanding to
		// exactly one 4-byte instruction -- confirm against asm-common.h.
	248	sub r3, r3, #10 // r3 = rounds - 10
	249	cmp r3, #5 // unsigned test: in range only for 10..14 rounds
	250	addlo pc, pc, r3, lsl #2 // in range: jump into the branch table
	251	callext F(abort) // out of range: give up
	252
		// Branch table, indexed by rounds - 10.
	253	b 10f
	254	b 11f
	255	b 12f
	256	b 13f
	257	b 14f
	258
		// Each case below performs the rounds in excess of ten and then
		// joins the ten-round code.  The loads write the advanced address
		// back to r0, so the ten-round code picks up at the next unused key.
		//
	259	// Eleven rounds.
	260	11: vldmia r0!, {d16, d17} // one round key
	261	\aes\().8 q0, q8
	262	\mc\().8 q0, q0
	263	b 10f
	264
	265	// Twelve rounds.
	266	12: vldmia r0!, {d16-d19} // two round keys
	267	\aes\().8 q0, q8
	268	\mc\().8 q0, q0
	269	\aes\().8 q0, q9
	270	\mc\().8 q0, q0
	271	b 10f
	272
	273	// Thirteen rounds.
	274	13: vldmia r0!, {d16-d21} // three round keys
	275	\aes\().8 q0, q8
	276	\mc\().8 q0, q0
	277	\aes\().8 q0, q9
	278	\mc\().8 q0, q0
	279	\aes\().8 q0, q10
	280	\mc\().8 q0, q0
	281	b 10f
	282
	283	// Fourteen rounds. (Drops through to the ten round case because
	284	// this is the next most common.)
	285	14: vldmia r0!, {d16-d23} // four round keys
	286	\aes\().8 q0, q8
	287	\mc\().8 q0, q0
	288	\aes\().8 q0, q9
	289	\mc\().8 q0, q0
	290	\aes\().8 q0, q10
	291	\mc\().8 q0, q0
	292	\aes\().8 q0, q11
	293	\mc\().8 q0, q0
	294	// Drop through...
	295
	296	// Ten rounds.
		// The first five of the ten rounds, with five round keys.
	297	10: vldmia r0!, {d16-d25}
	298	\aes\().8 q0, q8
	299	\mc\().8 q0, q0
	300	\aes\().8 q0, q9
	301	\mc\().8 q0, q0
	302	\aes\().8 q0, q10
	303	\mc\().8 q0, q0
	304	\aes\().8 q0, q11
	305	\mc\().8 q0, q0
	306	\aes\().8 q0, q12
	307	\mc\().8 q0, q0
	308
		// Six more keys: four further full rounds (q8-q11), the final round
		// (q12), and the whitening key (q13).
	309	vldmia r0!, {d16-d27}
	310	\aes\().8 q0, q8
	311	\mc\().8 q0, q0
	312	\aes\().8 q0, q9
	313	\mc\().8 q0, q0
	314	\aes\().8 q0, q10
	315	\mc\().8 q0, q0
	316	\aes\().8 q0, q11
	317	\mc\().8 q0, q0
	318
	319	// Final round has no MixColumns, but is followed by final whitening.
	320	\aes\().8 q0, q12
	321	veor q0, q0, q13
	322
	323	// All done.
	324	vrev32.8 q0, q0 // reverse the bytes within each word again
	325	vstmia r2, {d0, d1} // write the result to the output block
	326	bx r14
	327
	328	ENDFUNC
	329	.endm
	330
	331	encdec eblk, aese, aesmc, w // defines rijndael_eblk_arm_crypto, using the keys at offset w
	332	encdec dblk, aesd, aesimc, wi // defines rijndael_dblk_arm_crypto, using the keys at offset wi
	333
	334	///----- That's all, folks --------------------------------------------------