/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// AArch64 crypto-extension-based implementation of Rijndael
///
/// (c) 2018 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

	.arch	armv8-a+crypto

	.extern	F(abort)
	.extern	F(rijndael_rcon)

	.text

///--------------------------------------------------------------------------
/// Main code.

/// The ARM crypto extension implements a little-endian version of AES
/// (though the manual doesn't actually spell this out and you have to
/// experiment), but Catacomb's internal interface presents as big-endian so
/// as to work better with things like GCM.  We therefore maintain the round
/// keys in little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the crypto extension doesn't implement the larger-
/// block versions of Rijndael, so we have to end-swap the keys if we're
/// preparing for one of those.
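///
/// As a reminder of how the crypto instructions compose, since the code
/// below leans on it: AESE XORs in the round key and then applies
/// ShiftRows and SubBytes (which commute), while AESMC applies MixColumns,
/// so one full AES round is the pair
///
///	aese	v0.16b, vK.16b		// v0 = SubBytes(ShiftRows(v0 EOR K))
///	aesmc	v0.16b, v0.16b		// v0 = MixColumns(v0)
///
/// with the last round omitting the AESMC, and the final round key mixed
/// in with a plain EOR.  AESD and AESIMC are the corresponding pieces of
/// the equivalent inverse cipher.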

// Useful constants.
	.equ	maxrounds, 16		// maximum number of rounds
	.equ	maxblksz, 32		// maximum block size, in bytes
	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer
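// (Rijndael proper never needs more than fourteen rounds, so `maxrounds'
// leaves some slack; either way, each key schedule occupies kbufsz = 544
// bytes.)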

// Context structure.
	.equ	nr, 0			// number of rounds
	.equ	w, nr + 4		// encryption key words
	.equ	wi, w + kbufsz		// decryption key words
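
// For orientation: these offsets mirror the C-side context structure,
// which looks roughly like the sketch below.  (The authoritative
// definition lives in the C headers; the types and array bounds here are
// illustrative only.)
//
//	struct rijndael_ctx {
//		unsigned nr;		// number of rounds
//		uint32 w[...];		// encryption key words
//		uint32 wi[...];		// decryption key words
//	};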

///--------------------------------------------------------------------------
/// Key setup.

FUNC(rijndael_setup_arm64_crypto)

	// Arguments:
	//	x0 = pointer to context
	//	w1 = block size in 32-bit words
	//	x2 = pointer to key material
	//	x3 = key size in words

	pushreg	x29, x30
	mov	x29, sp

	// The initial round key material is taken directly from the input
	// key, so copy it over.  Unfortunately, the key material is not
	// guaranteed to be aligned in any especially useful way.  Assume
	// that alignment traps are not enabled.  (Why would they be?  On
	// A32, alignment traps were part of a transition plan which changed
	// the way unaligned loads and stores behaved, but there's never been
	// any other behaviour on A64.)
	mov	x15, x3
	add	x4, x0, #w
0:	sub	x15, x15, #1
	ldr	w14, [x2], #4
	str	w14, [x4], #4
	cbnz	x15, 0b

	// Find out other useful things and prepare for the main loop.
9:	ldr	w9, [x0, #nr]		// number of rounds
	madd	w2, w1, w9, w1		// total key size in words
	leaext	x5, F(rijndael_rcon)	// round constants
	sub	x6, x2, x3		// minus what we've copied already
	add	x7, x0, #w		// position in previous cycle
	movi	v1.4s, #0		// all-zero register for the key
	mov	x8, #0			// position in current cycle

	// Main key expansion loop.  Dispatch according to the position in
	// the cycle.
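	//
	// For reference, this is the standard Rijndael key schedule: with
	// Nk words of key,
	//
	//	w[i] = w[i - Nk] EOR f(w[i - 1])
	//
	// where f is SubWord composed with RotWord, plus the round
	// constant, when i is a multiple of Nk; just SubWord when Nk > 6
	// and i == 4 (mod Nk); and the identity otherwise.  For example,
	// AES-128 has Nk = 4 and ten rounds, so we copied four words above
	// and compute the remaining forty here.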
0:	ldr	w15, [x7], #4		// word from previous cycle
	cbz	x8, 1f			// first word of the cycle?
	cmp	x8, #4			// fourth word of the cycle?
	b.ne	2f
	cmp	x3, #7			// seven or eight words of key?
	b.cc	2f

	// Fourth word of the cycle, seven or eight words of key.  We must
	// do the byte substitution.  (AESE against an all-zero key is
	// AddRoundKey-with-zero, SubBytes, and ShiftRows; ShiftRows is
	// harmless here because all four columns are equal.)
	dup	v0.4s, w14
	aese	v0.16b, v1.16b		// effectively, just SubBytes
	mov	w14, v0.s[0]
	b	2f

	// First word of the cycle.  Byte substitution, rotation, and round
	// constant.
1:	ldrb	w13, [x5], #1		// next round constant
	dup	v0.4s, w14
	aese	v0.16b, v1.16b		// effectively, just SubBytes
	mov	w14, v0.s[0]
	eor	w14, w13, w14, ror #8	// RotWord, and mix in the constant

	// Common ending: mix in the word from the previous cycle and store.
2:	eor	w14, w14, w15
	str	w14, [x4], #4

	// Prepare for the next iteration.  If we're done, then stop; if
	// we've finished a cycle then reset the counter.
	add	x8, x8, #1
	sub	x6, x6, #1
	cmp	x8, x3
	cbz	x6, 9f
	csel	x8, x8, xzr, cc
	b	0b

	// Next job is to construct the decryption keys.  The keys for the
	// first and last rounds don't need to be mangled, but the remaining
	// ones do -- and they all need to be reordered too.
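	// (The mangling is InvMixColumns, which is what AESIMC computes: if
	// the encryption schedule is K[0], ..., K[nr], the decryption
	// schedule we want is K[nr], IMC(K[nr-1]), ..., IMC(K[1]), K[0],
	// each entry being a block's worth of key words.)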
	//
	// The plan of action, then, is to copy the final encryption round's
	// keys into place first, then to do each of the intermediate rounds
	// in reverse order, and finally do the first round.
	//
	// Do all the heavy lifting with the vector registers.  The order
	// we're doing this in means that it's OK if we read or write too
	// much, and there's easily enough buffer space for the
	// over-enthusiastic reads and writes because the context has space
	// for 32-byte blocks, which is our maximum and an exact fit for two
	// full-width registers.
9:	add	x5, x0, #wi
	add	x4, x0, #w
	add	x4, x4, w2, uxtw #2
	sub	x4, x4, w1, uxtw #2	// last round's keys
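	// (That is, x4 = &w[0] + 4*(total - blksz), the first of the last
	// round's blksz key words; x5 walks forwards through wi while x4
	// walks backwards through w.)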

	// Copy the last encryption round's keys.
	ld1	{v0.4s, v1.4s}, [x4]
	st1	{v0.4s, v1.4s}, [x5]

	// Update the loop variables and stop if we've finished.
0:	sub	w9, w9, #1
	add	x5, x5, w1, uxtw #2
	sub	x4, x4, w1, uxtw #2
	cbz	w9, 9f

	// Do another middle round's keys...
	ld1	{v0.4s, v1.4s}, [x4]
	aesimc	v0.16b, v0.16b
	aesimc	v1.16b, v1.16b
	st1	{v0.4s, v1.4s}, [x5]
	b	0b

	// Finally do the first encryption round.
9:	ld1	{v0.4s, v1.4s}, [x4]
	st1	{v0.4s, v1.4s}, [x5]

	// If the block size is not exactly four words then we must end-swap
	// everything.  We can use fancy vector toys for this.
	cmp	w1, #4
	b.eq	9f

	// End-swap the encryption keys.
	add	x1, x0, #w
	bl	endswap_block

	// And the decryption keys.
	add	x1, x0, #wi
	bl	endswap_block

	// All done.
9:	popreg	x29, x30
	ret

ENDFUNC

INTFUNC(endswap_block)
	// End-swap w2 words starting at x1.  x1 is clobbered; w2 is not.
	// It's OK to work in 16-byte chunks.
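	// (w2 needn't be a multiple of four -- a six-word block, say -- but
	// the key buffer has ample slack, so handling a whole quadword at a
	// time is safe.)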

	mov	w3, w2
0:	subs	w3, w3, #4
	ld1	{v0.4s}, [x1]
	rev32	v0.16b, v0.16b		// swap the bytes in each word
	st1	{v0.4s}, [x1], #16
	b.hi	0b
	ret

ENDFUNC

///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.
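///
/// The two operations are almost identical, so both functions are stamped
/// out from the macro below: `op' names the function, `aes' and `mc' are
/// the round instructions (AESE/AESMC for encryption, AESD/AESIMC for
/// decryption), and `koff' is the offset of the relevant key schedule
/// within the context.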

.macro	encdec	op, aes, mc, koff
FUNC(rijndael_\op\()_arm64_crypto)

	// Arguments:
	//	x0 = pointer to context
	//	x1 = pointer to input block
	//	x2 = pointer to output block

	// Set things up ready.
	ldr	w3, [x0, #nr]		// number of rounds
	add	x0, x0, #\koff		// point at the key schedule
	ld1	{v0.4s}, [x1]		// load the input block...
	rev32	v0.16b, v0.16b		// ... and end-swap it

	// Check the number of rounds and dispatch.
	cmp	w3, #14
	b.eq	14f
	cmp	w3, #10
	b.eq	10f
	cmp	w3, #12
	b.eq	12f
	cmp	w3, #13
	b.eq	13f
	cmp	w3, #11
	b.eq	11f
	callext	F(abort)
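	// (Rijndael's round count is always between ten and fourteen, so
	// anything else means the context is corrupt: bail out noisily.)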

	// Eleven rounds.
11:	ld1	{v16.4s}, [x0], #16
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	b	10f

	// Twelve rounds.
12:	ld1	{v16.4s, v17.4s}, [x0], #32
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	\mc	v0.16b, v0.16b
	b	10f

	// Thirteen rounds.
13:	ld1	{v16.4s-v18.4s}, [x0], #48
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v18.16b
	\mc	v0.16b, v0.16b
	b	10f

	// Fourteen rounds.  (Falls through to the ten-round case, which
	// handles the remaining rounds; after ten, fourteen is the round
	// count we expect to see most often.)
14:	ld1	{v16.4s-v19.4s}, [x0], #64
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v18.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v19.16b
	\mc	v0.16b, v0.16b
	// Drop through...

	// Ten rounds.
10:	ld1	{v16.4s-v19.4s}, [x0], #64
	ld1	{v20.4s-v23.4s}, [x0], #64
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v18.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v19.16b
	\mc	v0.16b, v0.16b

	ld1	{v16.4s-v18.4s}, [x0], #48
	\aes	v0.16b, v20.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v21.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v22.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v23.16b
	\mc	v0.16b, v0.16b

	// The tail: one more full round, then the final round, which has
	// no MixColumns, and lastly the final round key.
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	eor	v0.16b, v0.16b, v18.16b
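	// (Recall that the AES round instruction folds its round-key EOR in
	// at the start, so the EOR above applies the very last round key;
	// it's not a separate whitening step.)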

	// All done.
	rev32	v0.16b, v0.16b		// end-swap back to big-endian...
	st1	{v0.4s}, [x2]		// ... and store the output block
	ret

ENDFUNC
.endm

	encdec	eblk, aese, aesmc, w	// encryption
	encdec	dblk, aesd, aesimc, wi	// decryption

///----- That's all, folks --------------------------------------------------