[catacomb] / symm / rijndael-arm64-crypto.S

/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// AArch64 crypto-extension-based implementation of Rijndael
///
/// (c) 2018 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

	.extern	F(abort)
	.extern	F(rijndael_rcon)

///--------------------------------------------------------------------------
/// Main code.

	.arch	armv8-a+crypto

/// The ARM crypto extension implements a little-endian version of AES
/// (though the manual doesn't actually spell this out and you have to
/// experiment), but Catacomb's internal interface presents as big-endian so
/// as to work better with things like GCM.  We therefore maintain the round
/// keys in little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the crypto extension doesn't implement the larger-
/// block versions of Rijndael, so we have to end-swap the keys if we're
/// preparing for one of those.

	// Useful constants.  The key-schedule buffer is sized for the worst
	// case: one maximum-size block of key material for each round, plus
	// one more for the initial whitening, i.e., 32*(16 + 1) = 544 bytes.
	.equ	maxrounds, 16		// maximum number of rounds
	.equ	maxblksz, 32		// maximum block size, in bytes
	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer

	// Context structure.  These are byte offsets from the context
	// pointer: the round count is read as a 32-bit word at offset 0, the
	// encryption schedule follows at offset 4, and the decryption
	// schedule comes immediately after that, at offset 4 + 544 = 548.
	// NOTE(review): presumably this mirrors the layout of the context
	// structure declared on the C side -- keep the two in step.
	.equ	nr, 0			// number of rounds
	.equ	w, nr + 4		// encryption key words
	.equ	wi, w + kbufsz		// decryption key words

///--------------------------------------------------------------------------
/// Key setup.

FUNC(rijndael_setup_arm64_crypto)

	// Expand the caller's key material into the encryption key schedule
	// (at offset `w' in the context) and the matching decryption key
	// schedule (at offset `wi').
	//
	// Arguments:
	//	x0 = pointer to context
	//	w1 = block size in 32-bit words
	//	x2 = pointer to key material
	//	x3 = key size in words
	//
	// The number of rounds is read from the `nr' slot of the context, so
	// that must have been filled in before this function is called.
	// Nothing is returned.  The context pointer in x0 is left alone;
	// x1--x9, x13--x15, v0, v1, and the flags are used as scratch (as is
	// anything that the `leaext' macro needs internally).

	pushreg	x29, x30
	mov	x29, sp

	// The key size is used at its full 64-bit width below, as a loop
	// count and in comparisons.  NOTE(review): the C prototype isn't
	// visible from this file; if the key size is passed as a 32-bit
	// quantity then the procedure-call standard leaves the top half of
	// x3 unspecified.  Zero-extend it defensively: this changes nothing
	// if the caller already passed a properly extended value, since
	// real key sizes are only a handful of words.
	mov	w3, w3

	// The initial round key material is taken directly from the input
	// key, so copy it over.  Unfortunately, the key material is not
	// guaranteed to be aligned in any especially useful way.  Assume
	// that alignment traps are not enabled.  (Why would they be?  On
	// A32, alignment traps were part of a transition plan which changed
	// the way unaligned loads and stores behaved, but there's never been
	// any other behaviour on A64.)
	//
	// On exit from this loop, x4 points just past the copied words and
	// w14 still holds the last key word: the expansion loop below
	// depends on both.  The loop body runs before the count is tested,
	// so the key size must be at least one word.
	mov	x15, x3
	add	x4, x0, #w
0:	sub	x15, x15, #1
	ldr	w14, [x2], #4
	str	w14, [x4], #4
	cbnz	x15, 0b

	// Find out other useful things and prepare for the main loop.  The
	// total is (rounds + 1) blocks' worth of words; writing w2 clears
	// the top half of x2, so it's safe to use at full width afterwards.
	// (Nothing in this function branches to the `9:' label here.)
9:	ldr	w9, [x0, #nr]		// number of rounds
	madd	w2, w1, w9, w1		// total key size in words
	leaext	x5, rijndael_rcon	// round constants
	sub	x6, x2, x3		// minus what we've copied already
	add	x7, x0, #w		// position in previous cycle
	movi	v1.4s, #0		// all-zero register for the key
	mov	x8, #0			// position in current cycle

	// Main key expansion loop.  Dispatch according to the position in
	// the cycle.  Throughout, w14 is the word most recently written to
	// the schedule, and w15 is the word one whole cycle (i.e., one
	// key-length) behind the word we're about to produce.
0:	ldr	w15, [x7], #4		// word from previous cycle
	cbz	x8, 1f			// first word of the cycle?
	cmp	x8, #4			// fourth word of the cycle?
	b.ne	2f
	cmp	x3, #7			// seven or eight words of key?
	b.cc	2f

	// Fourth word of the cycle, seven or eight words of key.  We must do
	// the byte substitution.  The word is replicated into all four
	// columns of the state, so the row shifting which AESE also performs
	// just moves identical bytes around, and the all-zero `key' in v1
	// makes the key-mixing step a no-op: all that's left is SubBytes.
	dup	v0.4s, w14
	aese	v0.16b, v1.16b		// effectively, just SubBytes
	mov	w14, v0.4s[0]
	b	2f

	// First word of the cycle.  Byte substitution, rotation, and round
	// constant.  The round constants are single bytes, consumed in
	// sequence, and are mixed into the low byte of the rotated word.
1:	ldrb	w13, [x5], #1		// next round constant
	dup	v0.4s, w14
	aese	v0.16b, v1.16b		// effectively, just SubBytes
	mov	w14, v0.4s[0]
	eor	w14, w13, w14, ror #8

	// Common ending: mix in the word from the previous cycle and store.
2:	eor	w14, w14, w15
	str	w14, [x4], #4

	// Prepare for the next iteration.  If we're done, then stop; if
	// we've finished a cycle then reset the counter.  The CBZ sits
	// between the comparison and the conditional select, which is fine
	// because it doesn't disturb the flags.
	add	x8, x8, #1
	sub	x6, x6, #1
	cmp	x8, x3
	cbz	x6, 9f
	csel	x8, x8, xzr, cc		// keep counting, or wrap to zero
	b	0b

	// Next job is to construct the decryption keys.  The keys for the
	// first and last rounds don't need to be mangled, but the remaining
	// ones do -- and they all need to be reordered too.
	//
	// The plan of action, then, is to copy the final encryption round's
	// keys into place first, then to do each of the intermediate rounds
	// in reverse order, and finally do the first round.
	//
	// Do all the heavy lifting with the vector registers.  The order
	// we're doing this in means that it's OK if we read or write too
	// much, and there's easily enough buffer space for the
	// over-enthusiastic reads and writes because the context has space
	// for 32-byte blocks, which is our maximum and an exact fit for two
	// full-width registers.
	//
	// Here, x4 walks backwards through the encryption schedule, x5 walks
	// forwards through the decryption schedule, both one block at a
	// time, and w9 counts down the rounds.
9:	add	x5, x0, #wi
	add	x4, x0, #w
	add	x4, x4, w2, uxtw #2
	sub	x4, x4, w1, uxtw #2	// last round's keys

	// Copy the last encryption round's keys.
	ld1	{v0.4s, v1.4s}, [x4]
	st1	{v0.4s, v1.4s}, [x5]

	// Update the loop variables and stop if we've finished.
0:	sub	w9, w9, #1
	add	x5, x5, w1, uxtw #2
	sub	x4, x4, w1, uxtw #2
	cbz	w9, 9f

	// Do another middle round's keys...  Each 32-bit column is
	// transformed independently, so it doesn't matter that a block
	// needn't fill a whole number of vector registers: any surplus
	// output is overwritten by the next store.
	ld1	{v0.4s, v1.4s}, [x4]
	aesimc	v0.16b, v0.16b
	aesimc	v1.16b, v1.16b
	st1	{v0.4s, v1.4s}, [x5]
	b	0b

	// Finally do the first encryption round.
9:	ld1	{v0.4s, v1.4s}, [x4]
	st1	{v0.4s, v1.4s}, [x5]

	// If the block size is not exactly four words then we must end-swap
	// everything.  We can use fancy vector toys for this.
	cmp	w1, #4
	b.eq	9f

	// End-swap the encryption keys.  The word count is still in w2,
	// which `endswap_block' preserves for the second call.
	add	x1, x0, #w
	bl	endswap_block

	// And the decryption keys
	add	x1, x0, #wi
	bl	endswap_block

	// All done.
9:	popreg	x29, x30
	ret

ENDFUNC

INTFUNC(endswap_block)
	// Reverse the byte order within each 32-bit word of a buffer, in
	// place.
	//
	// On entry:
	//	x1 = address of the first word
	//	w2 = number of words to process
	//
	// On exit, x1 has been advanced past everything processed, and w3,
	// v0, and the flags have been clobbered; w2 is preserved.
	//
	// The buffer is processed in 16-byte chunks, so the word count is
	// effectively rounded up to a multiple of four, and at least one
	// chunk is always processed.  The caller must be content with that.

	mov	w3, w2			// words still to be done
0:	ld1	{v0.4s}, [x1]		// fetch the next four words
	rev32	v0.16b, v0.16b		// reverse the bytes in each one
	st1	{v0.4s}, [x1], #16	// write them back and move along
	subs	w3, w3, #4		// that's four fewer to do
	b.hi	0b			// go round again if there are more
	ret

ENDFUNC

///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.

.macro	encdec	op, aes, mc, koff
  FUNC(rijndael_\op\()_arm64_crypto)

	// Process a single 16-byte block using the key schedule found at
	// offset koff in the context.  The aes and mc macro arguments name
	// the instructions to use for the main round step and the column-
	// mixing step respectively, and op forms part of the function name.
	//
	// Arguments:
	//	x0 = pointer to context
	//	x1 = pointer to input block
	//	x2 = pointer to output block
	//
	// Clobbers x0, w3, v0, v16--v23, and the flags.  The round count is
	// taken from the context, and must be between 10 and 14 inclusive:
	// anything else results in a call to abort.

	// Set things up ready.  The block is end-swapped, word by word, on
	// the way in, and swapped back again on the way out.
	ldr	w3, [x0, #nr]
	add	x0, x0, #\koff
	ld1	{v0.4s}, [x1]
	rev32	v0.16b, v0.16b

	// Check the number of rounds and dispatch.  Each of the larger
	// round counts does its extra rounds first and then joins the
	// ten-round code; x0 advances through the key schedule as the round
	// keys are loaded.
	cmp	w3, #14
	b.eq	14f
	cmp	w3, #10
	b.eq	10f
	cmp	w3, #12
	b.eq	12f
	cmp	w3, #13
	b.eq	13f
	cmp	w3, #11
	b.eq	11f
	callext	F(abort)

	// Eleven rounds.  One extra round.
11:	ld1	{v16.4s}, [x0], #16
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	b	10f

	// Twelve rounds.  Two extra rounds.
12:	ld1	{v16.4s, v17.4s}, [x0], #32
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	\mc	v0.16b, v0.16b
	b	10f

	// Thirteen rounds.  Three extra rounds.
13:	ld1	{v16.4s-v18.4s}, [x0], #48
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v18.16b
	\mc	v0.16b, v0.16b
	b	10f

	// Fourteen rounds.  (Drops through to the ten round case because
	// this is the next most common.)
14:	ld1	{v16.4s-v19.4s}, [x0], #64
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v18.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v19.16b
	\mc	v0.16b, v0.16b
	// Drop through...

	// Ten rounds.  Fetch the first eight round keys up front, and do
	// four full rounds with the first batch.
10:	ld1	{v16.4s-v19.4s}, [x0], #64
	ld1	{v20.4s-v23.4s}, [x0], #64
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v18.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v19.16b
	\mc	v0.16b, v0.16b

	// The first batch of registers is free again now, so reuse it for
	// the last three round keys, and do four more full rounds with the
	// second batch.
	ld1	{v16.4s-v18.4s}, [x0], #48
	\aes	v0.16b, v20.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v21.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v22.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v23.16b
	\mc	v0.16b, v0.16b

	// One more full round, and then the final round, which has no
	// MixColumns, but is followed by final whitening.
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	eor	v0.16b, v0.16b, v18.16b

	// All done.  Swap the block back and store it.
	rev32	v0.16b, v0.16b
	st1	{v0.4s}, [x2]
	ret

  ENDFUNC
.endm

	// Instantiate the block functions: encryption walks the encryption
	// schedule with the forward instructions, and decryption walks the
	// decryption schedule with the inverse ones.
	encdec	eblk, aese, aesmc, w
	encdec	dblk, aesd, aesimc, wi

///----- That's all, folks --------------------------------------------------
Commit	Line	Data
e492db88 MW	1	/// -- mode: asm; asm-comment-char: ?/ --
	2	///
	3	/// AArch64 crypto-extension-based implementation of Rijndael
	4	///
	5	/// (c) 2018 Straylight/Edgeware
	6	///
	7
	8	///----- Licensing notice ---------------------------------------------------
	9	///
	10	/// This file is part of Catacomb.
	11	///
	12	/// Catacomb is free software; you can redistribute it and/or modify
	13	/// it under the terms of the GNU Library General Public License as
	14	/// published by the Free Software Foundation; either version 2 of the
	15	/// License, or (at your option) any later version.
	16	///
	17	/// Catacomb is distributed in the hope that it will be useful,
	18	/// but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	/// GNU Library General Public License for more details.
	21	///
	22	/// You should have received a copy of the GNU Library General Public
	23	/// License along with Catacomb; if not, write to the Free
	24	/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	/// MA 02111-1307, USA.
	26
	27	///--------------------------------------------------------------------------
	28	/// External definitions.
	29
	30	#include "config.h"
	31	#include "asm-common.h"
	32
	33	.extern F(abort)
	34	.extern F(rijndael_rcon)
	35
	36	///--------------------------------------------------------------------------
	37	/// Main code.
	38
	39	.arch armv8-a+crypto
	40
	41	/// The ARM crypto extension implements a little-endian version of AES
	42	/// (though the manual doesn't actually spell this out and you have to
	43	/// experiment), but Catacomb's internal interface presents as big-endian so
	44	/// as to work better with things like GCM. We therefore maintain the round
	45	/// keys in little-endian form, and have to end-swap blocks in and out.
	46	///
	47	/// For added amusement, the crypto extension doesn't implement the larger-
	48	/// block versions of Rijndael, so we have to end-swap the keys if we're
	49	/// preparing for one of those.
	50
	51	// Useful constants.
	52	.equ maxrounds, 16 // maximum number of rounds
	53	.equ maxblksz, 32 // maximum block size, in bytes
	54	.equ kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer
	55
	56	// Context structure.
	57	.equ nr, 0 // number of rounds
	58	.equ w, nr + 4 // encryption key words
	59	.equ wi, w + kbufsz // decryption key words
	60
	61	///--------------------------------------------------------------------------
	62	/// Key setup.
	63
	64	FUNC(rijndael_setup_arm64_crypto)
65
66	// Arguments:
67	// x0 = pointer to context
68	// w1 = block size in 32-bit words
69	// x2 = pointer to key material
70	// x3 = key size in words
71
72	pushreg x29, x30
73	mov x29, sp
74
75	// The initial round key material is taken directly from the input
76	// key, so copy it over. Unfortunately, the key material is not
77	// guaranteed to be aligned in any especially useful way. Assume
78	// that alignment traps are not enabled. (Why would they be? On
79	// A32, alignment traps were part of a transition plan which changed
80	// the way unaligned loads and stores behaved, but there's never been
81	// any other behaviour on A64.)
82	mov x15, x3
83	add x4, x0, #w
84	0: sub x15, x15, #1
85	ldr w14, [x2], #4
86	str w14, [x4], #4
87	cbnz x15, 0b
88
89	// Find out other useful things and prepare for the main loop.
90	9: ldr w9, [x0, #nr] // number of rounds
91	madd w2, w1, w9, w1 // total key size in words
92	leaext x5, rijndael_rcon // round constants
93	sub x6, x2, x3 // minus what we've copied already
94	add x7, x0, #w // position in previous cycle
95	movi v1.4s, #0 // all-zero register for the key
96	mov x8, #0 // position in current cycle
97
98	// Main key expansion loop. Dispatch according to the position in
99	// the cycle.
100	0: ldr w15, [x7], #4 // word from previous cycle
101	cbz x8, 1f // first word of the cycle?
102	cmp x8, #4 // fourth word of the cycle?
103	b.ne 2f
104	cmp x3, #7 // seven or eight words of key?
105	b.cc 2f
106
107	// Fourth word of the cycle, seven or eight words of key. We must do
108	// the byte substitution.
109	dup v0.4s, w14
110	aese v0.16b, v1.16b // effectively, just SubBytes
111	mov w14, v0.4s[0]
112	b 2f
113
114	// First word of the cycle. Byte substitution, rotation, and round
115	// constant.
116	1: ldrb w13, [x5], #1 // next round constant
117	dup v0.4s, w14
118	aese v0.16b, v1.16b // effectively, just SubBytes
119	mov w14, v0.4s[0]
120	eor w14, w13, w14, ror #8
121
122	// Common ending: mix in the word from the previous cycle and store.
123	2: eor w14, w14, w15
124	str w14, [x4], #4
125
126	// Prepare for the next iteration. If we're done, then stop; if
127	// we've finished a cycle then reset the counter.
128	add x8, x8, #1
129	sub x6, x6, #1
130	cmp x8, x3
131	cbz x6, 9f
132	csel x8, x8, xzr, cc
133	b 0b
134
135	// Next job is to construct the decryption keys. The keys for the
136	// first and last rounds don't need to be mangled, but the remaining
137	// ones do -- and they all need to be reordered too.
138	//
139	// The plan of action, then, is to copy the final encryption round's
140	// keys into place first, then to do each of the intermediate rounds
141	// in reverse order, and finally do the first round.
142	//
143	// Do all the heavy lifting with the vector registers. The order
144	// we're doing this in means that it's OK if we read or write too
145	// much, and there's easily enough buffer space for the
146	// over-enthusiastic reads and writes because the context has space
147	// for 32-byte blocks, which is our maximum and an exact fit for two
148	// full-width registers.
149	9: add x5, x0, #wi
150	add x4, x0, #w
151	add x4, x4, w2, uxtw #2
152	sub x4, x4, w1, uxtw #2 // last round's keys
153
154	// Copy the last encryption round's keys.
155	ld1 {v0.4s, v1.4s}, [x4]
156	st1 {v0.4s, v1.4s}, [x5]
157
158	// Update the loop variables and stop if we've finished.
159	0: sub w9, w9, #1
160	add x5, x5, w1, uxtw #2
161	sub x4, x4, w1, uxtw #2
162	cbz w9, 9f
163
164	// Do another middle round's keys...
165	ld1 {v0.4s, v1.4s}, [x4]
166	aesimc v0.16b, v0.16b
167	aesimc v1.16b, v1.16b
168	st1 {v0.4s, v1.4s}, [x5]
169	b 0b
170
171	// Finally do the first encryption round.
172	9: ld1 {v0.4s, v1.4s}, [x4]
173	st1 {v0.4s, v1.4s}, [x5]
174
175	// If the block size is not exactly four words then we must end-swap
176	// everything. We can use fancy vector toys for this.
177	cmp w1, #4
178	b.eq 9f
179
180	// End-swap the encryption keys.
181	add x1, x0, #w
182	bl endswap_block
183
184	// And the decryption keys
185	add x1, x0, #wi
186	bl endswap_block
187
188	// All done.
189	9: popreg x29, x30
190	ret
191
192	ENDFUNC
193
194	INTFUNC(endswap_block)
195	// End-swap w2 words starting at x1. x1 is clobbered; w2 is not.
196	// It's OK to work in 16-byte chunks.
197
198	mov w3, w2
199	0: subs w3, w3, #4
200	ld1 {v0.4s}, [x1]
201	rev32 v0.16b, v0.16b
202	st1 {v0.4s}, [x1], #16
203	b.hi 0b
204	ret
205
206	ENDFUNC
207
208	///--------------------------------------------------------------------------
209	/// Encrypting and decrypting blocks.
210
211	.macro encdec op, aes, mc, koff
212	FUNC(rijndael_\op\()_arm64_crypto)
213
214	// Arguments:
215	// x0 = pointer to context
216	// x1 = pointer to input block
217	// x2 = pointer to output block
218
219	// Set things up ready.
220	ldr w3, [x0, #nr]
221	add x0, x0, #\koff
222	ld1 {v0.4s}, [x1]
223	rev32 v0.16b, v0.16b
224
225	// Check the number of rounds and dispatch.
226	cmp w3, #14
227	b.eq 14f
228	cmp w3, #10
229	b.eq 10f
230	cmp w3, #12
231	b.eq 12f
232	cmp w3, #13
233	b.eq 13f
234	cmp w3, #11
235	b.eq 11f
236	callext F(abort)
237
238	// Eleven rounds.
239	11: ld1 {v16.4s}, [x0], #16
240	\aes v0.16b, v16.16b
241	\mc v0.16b, v0.16b
242	b 10f
243
244	// Twelve rounds.
245	12: ld1 {v16.4s, v17.4s}, [x0], #32
246	\aes v0.16b, v16.16b
247	\mc v0.16b, v0.16b
248	\aes v0.16b, v17.16b
249	\mc v0.16b, v0.16b
250	b 10f
251
252	// Thirteen rounds.
253	13: ld1 {v16.4s-v18.4s}, [x0], #48
254	\aes v0.16b, v16.16b
255	\mc v0.16b, v0.16b
256	\aes v0.16b, v17.16b
257	\mc v0.16b, v0.16b
258	\aes v0.16b, v18.16b
259	\mc v0.16b, v0.16b
260	b 10f
261
262	// Fourteen rounds. (Drops through to the ten round case because
263	// this is the next most common.)
264	14: ld1 {v16.4s-v19.4s}, [x0], #64
265	\aes v0.16b, v16.16b
266	\mc v0.16b, v0.16b
267	\aes v0.16b, v17.16b
268	\mc v0.16b, v0.16b
269	\aes v0.16b, v18.16b
270	\mc v0.16b, v0.16b
271	\aes v0.16b, v19.16b
272	\mc v0.16b, v0.16b
273	// Drop through...
274
275	// Ten rounds.
276	10: ld1 {v16.4s-v19.4s}, [x0], #64
277	ld1 {v20.4s-v23.4s}, [x0], #64
278	\aes v0.16b, v16.16b
279	\mc v0.16b, v0.16b
280	\aes v0.16b, v17.16b
281	\mc v0.16b, v0.16b
282	\aes v0.16b, v18.16b
283	\mc v0.16b, v0.16b
284	\aes v0.16b, v19.16b
285	\mc v0.16b, v0.16b
286
287	ld1 {v16.4s-v18.4s}, [x0], #48
288	\aes v0.16b, v20.16b
289	\mc v0.16b, v0.16b
290	\aes v0.16b, v21.16b
291	\mc v0.16b, v0.16b
292	\aes v0.16b, v22.16b
293	\mc v0.16b, v0.16b
294	\aes v0.16b, v23.16b
295	\mc v0.16b, v0.16b
296
297	// Final round has no MixColumns, but is followed by final whitening.
298	\aes v0.16b, v16.16b
299	\mc v0.16b, v0.16b
300	\aes v0.16b, v17.16b
301	eor v0.16b, v0.16b, v18.16b
302
303	// All done.
304	rev32 v0.16b, v0.16b
305	st1 {v0.4s}, [x2]
306	ret
307
308	ENDFUNC
309	.endm
310
311	encdec eblk, aese, aesmc, w
312	encdec dblk, aesd, aesimc, wi
313
314	///----- That's all, folks --------------------------------------------------