symm/rijndael-arm64-crypto.S
/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// AArch64 crypto-extension-based implementation of Rijndael
///
/// (c) 2018 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

	.extern	F(abort)
	.extern	F(rijndael_rcon)

///--------------------------------------------------------------------------
/// Main code.

	.arch	armv8-a+crypto

/// The ARM crypto extension implements a little-endian version of AES
/// (though the manual doesn't actually spell this out and you have to
/// experiment), but Catacomb's internal interface presents as big-endian so
/// as to work better with things like GCM.  We therefore maintain the round
/// keys in little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the crypto extension doesn't implement the larger-
/// block versions of Rijndael, so we have to end-swap the keys if we're
/// preparing for one of those.
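///
/// (The end-swapping itself is just REV32 on byte elements, which reverses
/// the order of the bytes within each 32-bit word of a vector register.)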

	// Useful constants.
	.equ	maxrounds, 16		// maximum number of rounds
	.equ	maxblksz, 32		// maximum block size, in bytes
	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer

	// Context structure.
	.equ	nr, 0			// number of rounds
	.equ	w, nr + 4		// encryption key words
	.equ	wi, w + kbufsz		// decryption key words
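	//
	// (These offsets are assumed to match the layout of the C-level
	// `rijndael_ctx' structure declared in rijndael.h: the round count
	// first, followed by the encryption key words and then the
	// decryption key words.)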

///--------------------------------------------------------------------------
/// Key setup.

FUNC(rijndael_setup_arm64_crypto)

	// Arguments:
	//	x0 = pointer to context
	//	w1 = block size in 32-bit words
	//	x2 = pointer to key material
	//	x3 = key size in words

	pushreg	x29, x30
	mov	x29, sp

	// The initial round key material is taken directly from the input
	// key, so copy it over.  Unfortunately, the key material is not
	// guaranteed to be aligned in any especially useful way.  Assume
	// that alignment traps are not enabled.  (Why would they be?  On
	// A32, alignment traps were part of a transition plan which changed
	// the way unaligned loads and stores behaved, but there's never been
	// any other behaviour on A64.)
	mov	x15, x3
	add	x4, x0, #w
0:	sub	x15, x15, #1
	ldr	w14, [x2], #4
	str	w14, [x4], #4
	cbnz	x15, 0b

	// Find out other useful things and prepare for the main loop.
9:	ldr	w9, [x0, #nr]		// number of rounds
	madd	w2, w1, w9, w1		// total key size in words
	leaext	x5, rijndael_rcon	// round constants
	sub	x6, x2, x3		// minus what we've copied already
	add	x7, x0, #w		// position in previous cycle
	movi	v1.4s, #0		// all-zero register for the key
	mov	x8, #0			// position in current cycle

	// Main key expansion loop.  Dispatch according to the position in
	// the cycle.
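	//
	// (This is the FIPS-197 key-schedule recurrence: w[i] = w[i-Nk] ^ t,
	// where t = SubWord(RotWord(w[i-1])) ^ Rcon[i/Nk] for the first word
	// of a cycle, t = SubWord(w[i-1]) for the fourth word of a cycle
	// when Nk > 6, and t = w[i-1] otherwise.)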
0:	ldr	w15, [x7], #4		// word from previous cycle
	cbz	x8, 1f			// first word of the cycle?
	cmp	x8, #4			// fourth word of the cycle?
	b.ne	2f
	cmp	x3, #7			// seven or eight words of key?
	b.cc	2f

	// Fourth word of the cycle, seven or eight words of key.  We must do
	// the byte substitution.
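	// (AESE with an all-zero round key is AddRoundKey with zero -- a
	// no-op -- followed by SubBytes and ShiftRows; since all four lanes
	// hold the same word, ShiftRows shuffles identical columns and
	// changes nothing, so lane 0 ends up holding SubWord of the input.)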
	dup	v0.4s, w14
	aese	v0.16b, v1.16b		// effectively, just SubBytes
	mov	w14, v0.s[0]
	b	2f

	// First word of the cycle.  Byte substitution, rotation, and round
	// constant.
1:	ldrb	w13, [x5], #1		// next round constant
	dup	v0.4s, w14
	aese	v0.16b, v1.16b		// effectively, just SubBytes
	mov	w14, v0.s[0]
	eor	w14, w13, w14, ror #8

	// Common ending: mix in the word from the previous cycle and store.
2:	eor	w14, w14, w15
	str	w14, [x4], #4

	// Prepare for the next iteration.  If we're done, then stop; if
	// we've finished a cycle then reset the counter.
	add	x8, x8, #1
	sub	x6, x6, #1
	cmp	x8, x3
	cbz	x6, 9f
	csel	x8, x8, xzr, cc
	b	0b

	// Next job is to construct the decryption keys.  The keys for the
	// first and last rounds don't need to be mangled, but the remaining
	// ones do -- and they all need to be reordered too.
	//
	// The plan of action, then, is to copy the final encryption round's
	// keys into place first, then to do each of the intermediate rounds
	// in reverse order, and finally do the first round.
	//
	// Do all the heavy lifting with the vector registers.  The order
	// we're doing this in means that it's OK if we read or write too
	// much, and there's easily enough buffer space for the
	// over-enthusiastic reads and writes because the context has space
	// for 32-byte blocks, which is our maximum and an exact fit for two
	// full-width registers.
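	//
	// (The `mangling' here is InvMixColumns, done with AESIMC: the usual
	// equivalent-inverse-cipher construction, which lets decryption use
	// the same round structure as encryption.)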
9:	add	x5, x0, #wi
	add	x4, x0, #w
	add	x4, x4, w2, uxtw #2
	sub	x4, x4, w1, uxtw #2	// last round's keys

	// Copy the last encryption round's keys.
	ld1	{v0.4s, v1.4s}, [x4]
	st1	{v0.4s, v1.4s}, [x5]

	// Update the loop variables and stop if we've finished.
0:	sub	w9, w9, #1
	add	x5, x5, w1, uxtw #2
	sub	x4, x4, w1, uxtw #2
	cbz	w9, 9f

	// Do another middle round's keys...
	ld1	{v0.4s, v1.4s}, [x4]
	aesimc	v0.16b, v0.16b
	aesimc	v1.16b, v1.16b
	st1	{v0.4s, v1.4s}, [x5]
	b	0b

	// Finally do the first encryption round.
9:	ld1	{v0.4s, v1.4s}, [x4]
	st1	{v0.4s, v1.4s}, [x5]

	// If the block size is not exactly four words then we must end-swap
	// everything.  We can use fancy vector toys for this.
	cmp	w1, #4
	b.eq	9f

	// End-swap the encryption keys.
	add	x1, x0, #w
	bl	endswap_block

	// And the decryption keys
	add	x1, x0, #wi
	bl	endswap_block

	// All done.
9:	popreg	x29, x30
	ret

ENDFUNC

INTFUNC(endswap_block)
	// End-swap w2 words starting at x1.  x1 is clobbered; w2 is not.
	// It's OK to work in 16-byte chunks.

	mov	w3, w2
0:	subs	w3, w3, #4
	ld1	{v0.4s}, [x1]
	rev32	v0.16b, v0.16b
	st1	{v0.4s}, [x1], #16
	b.hi	0b
	ret

ENDFUNC

///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.

	.macro	encdec	op, aes, mc, koff
FUNC(rijndael_\op\()_arm64_crypto)

	// Arguments:
	//	x0 = pointer to context
	//	x1 = pointer to input block
	//	x2 = pointer to output block

	// Set things up ready.
	ldr	w3, [x0, #nr]
	add	x0, x0, #\koff
	ld1	{v0.4s}, [x1]
	rev32	v0.16b, v0.16b

	// Check the number of rounds and dispatch.
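	//
	// (Rijndael uses max(Nb, Nk) + 6 rounds, so the only possibilities
	// are 10 to 14; the 14- and 10-round cases -- e.g., AES-256 and
	// AES-128 -- are tested first because they're the most common.)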
	cmp	w3, #14
	b.eq	14f
	cmp	w3, #10
	b.eq	10f
	cmp	w3, #12
	b.eq	12f
	cmp	w3, #13
	b.eq	13f
	cmp	w3, #11
	b.eq	11f
	callext	F(abort)

	// Eleven rounds.
11:	ld1	{v16.4s}, [x0], #16
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	b	10f

	// Twelve rounds.
12:	ld1	{v16.4s, v17.4s}, [x0], #32
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	\mc	v0.16b, v0.16b
	b	10f

	// Thirteen rounds.
13:	ld1	{v16.4s-v18.4s}, [x0], #48
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v18.16b
	\mc	v0.16b, v0.16b
	b	10f

	// Fourteen rounds.  (Drops through to the ten round case because
	// this is the next most common.)
14:	ld1	{v16.4s-v19.4s}, [x0], #64
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v18.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v19.16b
	\mc	v0.16b, v0.16b
	// Drop through...

	// Ten rounds.
10:	ld1	{v16.4s-v19.4s}, [x0], #64
	ld1	{v20.4s-v23.4s}, [x0], #64
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v18.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v19.16b
	\mc	v0.16b, v0.16b

	ld1	{v16.4s-v18.4s}, [x0], #48
	\aes	v0.16b, v20.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v21.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v22.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v23.16b
	\mc	v0.16b, v0.16b

	// Penultimate round, then the final round -- which has no
	// MixColumns -- followed by the final whitening.
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	eor	v0.16b, v0.16b, v18.16b

	// All done.
	rev32	v0.16b, v0.16b
	st1	{v0.4s}, [x2]
	ret

ENDFUNC
	.endm

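// Instantiate the macro twice: encryption uses AESE/AESMC with the
// encryption keys at offset `w'; decryption uses AESD/AESIMC with the
// mangled decryption keys at offset `wi'.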
	encdec	eblk, aese, aesmc, w
	encdec	dblk, aesd, aesimc, wi

///----- That's all, folks --------------------------------------------------