/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// AArch64 crypto-extension-based implementation of Rijndael
///
/// (c) 2018 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

        .arch   armv8-a+crypto

        .extern F(abort)
        .extern F(rijndael_rcon)

        .text

///--------------------------------------------------------------------------
/// Main code.

/// The ARM crypto extension implements a little-endian version of AES
/// (though the manual doesn't actually spell this out and you have to
/// experiment), but Catacomb's internal interface presents as big-endian so
/// as to work better with things like GCM.  We therefore maintain the round
/// keys in little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the crypto extension doesn't implement the larger-
/// block versions of Rijndael, so we have to end-swap the keys if we're
/// preparing for one of those.
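///
/// (Concretely, the end-swapping is done with REV32, which reverses the
/// four bytes within each 32-bit lane, so a lane holding 0xaabbccdd
/// becomes 0xddccbbaa: see the pairs of `rev32' around each block
/// operation below, and `endswap_block' for whole key schedules.)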

        // Useful constants.
        .equ    maxrounds, 16           // maximum number of rounds
        .equ    maxblksz, 32            // maximum block size, in bytes
        .equ    kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer

        // Context structure.
        .equ    nr, 0                   // number of rounds
        .equ    w, nr + 4               // encryption key words
        .equ    wi, w + kbufsz          // decryption key words
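        // (These offsets must match the layout of the C-side Rijndael
        // context structure: the round count, followed by the encryption
        // and decryption key schedules.)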

///--------------------------------------------------------------------------
/// Key setup.

FUNC(rijndael_setup_arm64_crypto)

        // Arguments:
        //      x0 = pointer to context
        //      w1 = block size in 32-bit words
        //      x2 = pointer to key material
        //      x3 = key size in words

        pushreg x29, x30
        mov     x29, sp

        // The initial round key material is taken directly from the input
        // key, so copy it over.  Unfortunately, the key material is not
        // guaranteed to be aligned in any especially useful way.  Assume
        // that alignment traps are not enabled.  (Why would they be?  On
        // A32, alignment traps were part of a transition plan which changed
        // the way unaligned loads and stores behaved, but there's never been
        // any other behaviour on A64.)
        mov     x15, x3
        add     x4, x0, #w
0:      sub     x15, x15, #1
        ldr     w14, [x2], #4
        str     w14, [x4], #4
        cbnz    x15, 0b

        // Find out other useful things and prepare for the main loop.
9:      ldr     w9, [x0, #nr]           // number of rounds
        madd    w2, w1, w9, w1          // total key size in words
        leaext  x5, rijndael_rcon       // round constants
        sub     x6, x2, x3              // minus what we've copied already
        add     x7, x0, #w              // position in previous cycle
        movi    v1.4s, #0               // all-zero register for the key
        mov     x8, #0                  // position in current cycle

        // Main key expansion loop.  Dispatch according to the position in
        // the cycle.
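        //
        // (In the standard description of the schedule, with Nk = x3 words
        // of key, this computes w[i] = w[i - Nk] ^ f(w[i - 1]), where f
        // applies SubWord, RotWord and a round constant at the start of
        // each cycle, just SubWord at the fourth word of a cycle when
        // Nk > 6, and is the identity otherwise.)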
0:      ldr     w15, [x7], #4           // word from previous cycle
        cbz     x8, 1f                  // first word of the cycle?
        cmp     x8, #4                  // fourth word of the cycle?
        b.ne    2f
        cmp     x3, #7                  // seven or eight words of key?
        b.cc    2f

        // Fourth word of the cycle, seven or eight words of key.  We must do
        // the byte substitution.
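        // (v1 is all-zero, so the AddRoundKey step of AESE does nothing;
        // and, because v0 holds four copies of the same word, the ShiftRows
        // step only moves bytes between identical columns, so lane 0 of the
        // result is exactly SubWord of the input word.)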
        dup     v0.4s, w14
        aese    v0.16b, v1.16b          // effectively, just SubBytes
        mov     w14, v0.s[0]
        b       2f

        // First word of the cycle.  Byte substitution, rotation, and round
        // constant.
1:      ldrb    w13, [x5], #1           // next round constant
        dup     v0.4s, w14
        aese    v0.16b, v1.16b          // effectively, just SubBytes
        mov     w14, v0.s[0]
        eor     w14, w13, w14, ror #8
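        // (Since the word is held in little-endian form, rotating right by
        // eight bits is the schedule's RotWord operation, and the round
        // constant in w13 lands in the first byte, where it belongs.)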

        // Common ending: mix in the word from the previous cycle and store.
2:      eor     w14, w14, w15
        str     w14, [x4], #4

        // Prepare for the next iteration.  If we're done, then stop; if
        // we've finished a cycle then reset the counter.
        add     x8, x8, #1
        sub     x6, x6, #1
        cmp     x8, x3
        cbz     x6, 9f
        cmov.cs x8, xzr
        b       0b

        // Next job is to construct the decryption keys.  The keys for the
        // first and last rounds don't need to be mangled, but the remaining
        // ones do -- and they all need to be reordered too.
        //
        // The plan of action, then, is to copy the final encryption round's
        // keys into place first, then to do each of the intermediate rounds
        // in reverse order, and finally do the first round.
        //
        // Do all the heavy lifting with the vector registers.  The order
        // we're doing this in means that it's OK if we read or write too
        // much, and there's easily enough buffer space for the
        // over-enthusiastic reads and writes because the context has space
        // for 32-byte blocks, which is our maximum and an exact fit for two
        // full-width registers.
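        //
        // (The `mangling' is InvMixColumns -- the AESIMC instruction below
        // -- which is what the equivalent inverse cipher needs so that
        // decryption can reuse the same round structure as encryption.)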
9:      add     x5, x0, #wi
        add     x4, x0, #w
        add     x4, x4, w2, uxtw #2
        sub     x4, x4, w1, uxtw #2     // last round's keys

        // Copy the last encryption round's keys.
        ld1     {v0.4s, v1.4s}, [x4]
        st1     {v0.4s, v1.4s}, [x5]

        // Update the loop variables and stop if we've finished.
0:      sub     w9, w9, #1
        add     x5, x5, w1, uxtw #2
        sub     x4, x4, w1, uxtw #2
        cbz     w9, 9f

        // Do another middle round's keys...
        ld1     {v0.4s, v1.4s}, [x4]
        aesimc  v0.16b, v0.16b
        aesimc  v1.16b, v1.16b
        st1     {v0.4s, v1.4s}, [x5]
        b       0b

        // Finally do the first encryption round.
9:      ld1     {v0.4s, v1.4s}, [x4]
        st1     {v0.4s, v1.4s}, [x5]

        // If the block size is not exactly four words then we must end-swap
        // everything.  We can use fancy vector toys for this.
        cmp     w1, #4
        b.eq    9f

        // End-swap the encryption keys.
        add     x1, x0, #w
        bl      endswap_block

        // And the decryption keys.
        add     x1, x0, #wi
        bl      endswap_block

        // All done.
9:      popreg  x29, x30
        ret

ENDFUNC

INTFUNC(endswap_block)
        // End-swap w2 words starting at x1.  x1 is clobbered; w2 is not.
        // It's OK to work in 16-byte chunks.
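        // (If w2 isn't a multiple of four, the final chunk reads and writes
        // up to three words beyond the count; the key buffers in the
        // context have plenty of slack to absorb this.)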

        mov     w3, w2
0:      subs    w3, w3, #4
        ld1     {v0.4s}, [x1]
        rev32   v0.16b, v0.16b
        st1     {v0.4s}, [x1], #16
        b.hi    0b
        ret

ENDFUNC

///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.

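/// The crypto extension splits a round in two: AESE performs AddRoundKey,
/// ShiftRows and SubBytes, and AESMC performs MixColumns (AESD and AESIMC
/// are the corresponding inverses).  A normal round is therefore an
/// AESE/AESMC (or AESD/AESIMC) pair, the final round omits the column mix,
/// and the very last round key is mixed in with a plain EOR.  The macro
/// below is instantiated once for encryption and once for decryption,
/// differing only in the instructions used and the key-schedule offset.
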
        .macro  encdec op, aes, mc, koff
FUNC(rijndael_\op\()_arm64_crypto)

        // Arguments:
        //      x0 = pointer to context
        //      x1 = pointer to input block
        //      x2 = pointer to output block

        // Set things up ready.
        ldr     w3, [x0, #nr]
        add     x0, x0, #\koff
        ld1     {v0.4s}, [x1]
        rev32   v0.16b, v0.16b

        // Check the number of rounds and dispatch.
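        //
        // (With a 128-bit block, Rijndael uses max(Nk, 4) + 6 rounds, so
        // 10, 12 and 14 rounds correspond to the standard AES key sizes,
        // and 11 and 13 to the nonstandard 160- and 224-bit keys.)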
        cmp     w3, #14
        b.eq    14f
        cmp     w3, #10
        b.eq    10f
        cmp     w3, #12
        b.eq    12f
        cmp     w3, #13
        b.eq    13f
        cmp     w3, #11
        b.eq    11f
        callext F(abort)

        // Eleven rounds.
11:     ld1     {v16.4s}, [x0], #16
        \aes    v0.16b, v16.16b
        \mc     v0.16b, v0.16b
        b       10f

        // Twelve rounds.
12:     ld1     {v16.4s, v17.4s}, [x0], #32
        \aes    v0.16b, v16.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v17.16b
        \mc     v0.16b, v0.16b
        b       10f

        // Thirteen rounds.
13:     ld1     {v16.4s-v18.4s}, [x0], #48
        \aes    v0.16b, v16.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v17.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v18.16b
        \mc     v0.16b, v0.16b
        b       10f

        // Fourteen rounds.  (Drops through to the ten round case because
        // this is the next most common.)
14:     ld1     {v16.4s-v19.4s}, [x0], #64
        \aes    v0.16b, v16.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v17.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v18.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v19.16b
        \mc     v0.16b, v0.16b
        // Drop through...

        // Ten rounds.
10:     ld1     {v16.4s-v19.4s}, [x0], #64
        ld1     {v20.4s-v23.4s}, [x0], #64
        \aes    v0.16b, v16.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v17.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v18.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v19.16b
        \mc     v0.16b, v0.16b

        ld1     {v16.4s-v18.4s}, [x0], #48
        \aes    v0.16b, v20.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v21.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v22.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v23.16b
        \mc     v0.16b, v0.16b

        // Final round has no MixColumns, but is followed by final whitening.
        \aes    v0.16b, v16.16b
        \mc     v0.16b, v0.16b
        \aes    v0.16b, v17.16b
        eor     v0.16b, v0.16b, v18.16b

        // All done.
        rev32   v0.16b, v0.16b
        st1     {v0.4s}, [x2]
        ret

ENDFUNC
        .endm

        encdec  eblk, aese, aesmc, w
        encdec  dblk, aesd, aesimc, wi

///----- That's all, folks --------------------------------------------------