math/Makefile.am, symm/Makefile.am: Use `--no-install' on oddball tests.
[catacomb] / symm / rijndael-arm-crypto.S
CommitLineData
26e182fc
MW
/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// ARM crypto-extension-based implementation of Rijndael
///
/// (c) 2016 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

	// Symbols defined elsewhere and referenced below: `abort' is called
	// when a context has an unsupported round count, and `rijndael_rcon'
	// supplies the key-schedule round constants, read one byte at a
	// time.
	.extern	F(abort)
	.extern	F(rijndael_rcon)

///--------------------------------------------------------------------------
/// Main code.

	// The AES instructions used below need the ARMv8 crypto extension.
	.arch	armv8-a
	.fpu	crypto-neon-fp-armv8

/// The ARM crypto extension implements a little-endian version of AES
/// (though the manual doesn't actually spell this out and you have to
/// experiment), but Catacomb's internal interface presents as big-endian so
/// as to work better with things like GCM. We therefore maintain the round
/// keys in little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the crypto extension doesn't implement the larger-
/// block versions of Rijndael, so we have to end-swap the keys if we're
/// preparing for one of those.

// Useful constants.  The key-schedule buffer holds one maximum-size block
// of key material per round, plus one more for the extra whitening key.
	.equ	maxrounds, 16			// maximum number of rounds
	.equ	maxblksz, 32			// maximum block size, in bytes
	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer

	// Context structure.  These are byte offsets from the context
	// pointer: a four-byte round count, followed by two key-schedule
	// buffers of `kbufsz' bytes each.
	.equ	nr, 0				// number of rounds
	.equ	w, nr + 4			// encryption key words
	.equ	wi, w + kbufsz			// decryption key words

///--------------------------------------------------------------------------
/// Key setup.

FUNC(rijndael_setup_arm_crypto)

	// Arguments:
	//	r0 = pointer to context
	//	r1 = block size in words
	//	r2 = pointer to key material
	//	r3 = key size in words
	//
	// The number of rounds is read from the context's `nr' slot rather
	// than being computed here, so that slot is expected to have been
	// filled in already.  The key size must be nonzero: both copy loops
	// below store a word before testing the count.
	//
	// Register use during key expansion:
	//	r2 = total size of the key schedule, in words
	//	r4 = most recently produced key word
	//	r5 = pointer to the next round constant
	//	r6 = key word from one cycle (r3 words) earlier
	//	r7 = number of rounds
	//	r8 = limit of the encryption key schedule
	//	r9 = where to store the next key word
	//	r12 = position within the current cycle
	//	q1 = all-zero `round key' for AESE

	pushreg	r4-r9, r14

	// The initial round key material is taken directly from the input
	// key, so copy it over. Unfortunately, the key material is not
	// guaranteed to be aligned in any especially useful way, so we must
	// sort this out.
	add	r9, r0, #w
	mov	r14, r3				// words left to copy
	ands	r6, r2, #3			// misalignment, in bytes
	beq	1f

	// Unaligned key.  Read aligned words and stitch adjacent pairs
	// together: r6 is the bit offset of the key within the first word,
	// and r7 is its complement, so (lo >> r6) | (hi << r7) is the next
	// key word.
	mov	r6, r6, lsl #3
	rsb	r7, r6, #32
	bic	r2, r2, #3
	ldr	r4, [r2], #4

0:	ldr	r5, [r2], #4
	mov	r4, r4, lsr r6
	orr	r4, r5, lsl r7
	str	r4, [r9], #4
	subs	r14, r14, #1
	movhi	r4, r5				// only if continuing, so that r4
						// ends as the last key word
	bhi	0b
	b	9f

	// Aligned key.  A simple word-by-word copy suffices.
1:	ldr	r4, [r2], #4
	str	r4, [r9], #4
	subs	r14, r14, #1
	bhi	1b

	// Find out other useful things and prepare for the main loop.  At
	// this point, r4 holds the last word of the input key, and r9
	// points just past the copy of it in the key schedule.
9:	ldr	r7, [r0, #nr]			// number of rounds
	mla	r2, r1, r7, r1			// total key size in words
	leaextq	r5, rijndael_rcon		// round constants
	sub	r8, r2, r3			// minus what we've copied already
	vmov.i32 q1, #0				// all-zero register for the key
	add	r8, r9, r8, lsl #2		// limit of the key buffer
	mov	r12, #0				// position in current cycle

	// Main key expansion loop. Dispatch according to the position in
	// the cycle.
0:	ldr	r6, [r9, -r3, lsl #2]		// word from previous cycle
	cmp	r12, #0				// first word of the cycle?
	beq	1f
	cmp	r12, #4				// fourth word of the cycle?
	bne	2f
	cmp	r3, #7				// seven or eight words of key?
	bcc	2f

	// Fourth word of the cycle, seven or eight words of key. We must do
	// the byte substitution.  The word is copied into all four columns
	// and combined with the zero key in q1; one column of the result is
	// then read back.
	vdup.32	q0, r4
	aese.8	q0, q1				// effectively, just SubBytes
	vmov.32	r4, d0[0]
	b	2f

	// First word of the cycle. Byte substitution, rotation, and round
	// constant.
1:	ldrb	r14, [r5], #1			// next round constant
	vdup.32	q0, r4
	aese.8	q0, q1				// effectively, just SubBytes
	vmov.32	r4, d0[0]
	eor	r4, r14, r4, ror #8

	// Common ending: mix in the word from the previous cycle and store.
2:	eor	r4, r4, r6
	str	r4, [r9], #4

	// Prepare for the next iteration. If we're done, then stop; if
	// we've finished a cycle then reset the counter.
	add	r12, r12, #1
	cmp	r9, r8
	bcs	9f
	cmp	r12, r3
	movcs	r12, #0
	b	0b

	// Next job is to construct the decryption keys. The keys for the
	// first and last rounds don't need to be mangled, but the remaining
	// ones do -- and they all need to be reordered too.
	//
	// The plan of action, then, is to copy the final encryption round's
	// keys into place first, then to do each of the intermediate rounds
	// in reverse order, and finally do the first round.
	//
	// Do all the heavy lifting with NEON registers. The order we're
	// doing this in means that it's OK if we read or write too much, and
	// there's easily enough buffer space for the over-enthusiastic reads
	// and writes because the context has space for 32-byte blocks, which
	// is our maximum and an exact fit for two Q-class registers.
	//
	// Here, r4 walks backwards through the encryption keys, r5 walks
	// forwards through the decryption keys, and r7 counts down the
	// rounds.
9:	add	r5, r0, #wi
	add	r4, r0, #w
	add	r4, r4, r2, lsl #2
	sub	r4, r4, r1, lsl #2		// last round's keys

	// Copy the last encryption round's keys.  Move 16 bytes if the block
	// is exactly four words, and 32 bytes otherwise.
	teq	r1, #4
	vldmiaeq r4, {d0, d1}
	vldmiane r4, {d0-d3}
	vstmiaeq r5, {d0, d1}
	vstmiane r5, {d0-d3}

	// Update the loop variables and stop if we've finished.
0:	sub	r4, r4, r1, lsl #2
	add	r5, r5, r1, lsl #2
	subs	r7, r7, #1
	beq	9f

	// Do another middle round's keys...  The flags from the `teq' are
	// still live at the `beq', because none of the NEON instructions in
	// between alters them.
	teq	r1, #4
	vldmiaeq r4, {d0, d1}
	vldmiane r4, {d0-d3}
	aesimc.8 q0, q0
	vstmiaeq r5, {d0, d1}
	beq	0b
	aesimc.8 q1, q1
	vstmia	r5, {d0-d3}
	b	0b

	// Finally do the first encryption round.
9:	teq	r1, #4
	vldmiaeq r4, {d0, d1}
	vldmiane r4, {d0-d3}
	vstmiaeq r5, {d0, d1}
	vstmiane r5, {d0-d3}

	// If the block size is not exactly four words then we must end-swap
	// everything. We can use fancy NEON toys for this.
	beq	9f

	// End-swap the encryption keys.  The helper wants the word count in
	// r2, which still holds the total key size from earlier.
	add	r1, r0, #w
	bl	endswap_block

	// And the decryption keys
	add	r1, r0, #wi
	bl	endswap_block

	// All done.
9:	popreg	r4-r9, pc

ENDFUNC

INTFUNC(endswap_block)
	// End-swap R2 words starting at R1.  R1, R4, Q0 and the flags are
	// clobbered; R2 is not.  It's OK to work in 16-byte chunks: the
	// count is effectively rounded up to a multiple of four words, and
	// at least one chunk is always processed.
	//
	// R4 is overwritten without being saved, so callers must not have
	// anything live in it.

	mov	r4, r2				// words remaining
0:	vldmia	r1, {d0, d1}
	vrev32.8 q0, q0				// reverse the bytes in each word
	vstmia	r1!, {d0, d1}
	subs	r4, r4, #4
	bhi	0b
	bx	r14

ENDFUNC

///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.

// Define a function `rijndael_OP_arm_crypto' which processes a single
// 16-byte block.
//
// Macro arguments:
//	op = operation name, used to form the function name
//	aes = instruction for the main round step (AESE or AESD)
//	mc = instruction for the column-mixing step (AESMC or AESIMC)
//	koff = offset of the key schedule to use within the context
//
// Only 16 bytes are loaded from the input and stored to the output, so the
// functions defined here handle four-word blocks only.
.macro	encdec	op, aes, mc, koff
  FUNC(rijndael_\op\()_arm_crypto)

	// Arguments:
	//	r0 = pointer to context
	//	r1 = pointer to input block
	//	r2 = pointer to output block

	// Set things up ready.  The block is end-swapped on the way in,
	// because the round keys are held in little-endian form.
	ldr	r3, [r0, #nr]
	add	r0, r0, #\koff
	vldmia	r1, {d0, d1}
	vrev32.8 q0, q0

	// Check the number of rounds and dispatch.  Round counts from 10 up
	// to 14 are accepted, and anything else calls `abort'.
	//
	// NOTE(review): the computed jump reads pc as the address of the
	// `addlo' plus 8, which is the first entry of the branch table only
	// if this is assembled as ARM (not Thumb) code and `callext'
	// expands to exactly one four-byte instruction.  Neither is visible
	// from this file -- confirm against `asm-common.h'.
	sub	r3, r3, #10
	cmp	r3, #5
	addlo	pc, pc, r3, lsl #2
	callext	F(abort)

	b	10f
	b	11f
	b	12f
	b	13f
	b	14f

	// Each of the cases below consumes the extra round keys that it
	// needs and then joins the ten-round code, which does the rest.

	// Eleven rounds.
11:	vldmia	r0!, {d16, d17}
	\aes\().8 q0, q8
	\mc\().8 q0, q0
	b	10f

	// Twelve rounds.
12:	vldmia	r0!, {d16-d19}
	\aes\().8 q0, q8
	\mc\().8 q0, q0
	\aes\().8 q0, q9
	\mc\().8 q0, q0
	b	10f

	// Thirteen rounds.
13:	vldmia	r0!, {d16-d21}
	\aes\().8 q0, q8
	\mc\().8 q0, q0
	\aes\().8 q0, q9
	\mc\().8 q0, q0
	\aes\().8 q0, q10
	\mc\().8 q0, q0
	b	10f

	// Fourteen rounds. (Drops through to the ten round case because
	// this is the next most common.)
14:	vldmia	r0!, {d16-d23}
	\aes\().8 q0, q8
	\mc\().8 q0, q0
	\aes\().8 q0, q9
	\mc\().8 q0, q0
	\aes\().8 q0, q10
	\mc\().8 q0, q0
	\aes\().8 q0, q11
	\mc\().8 q0, q0
	// Drop through...

	// Ten rounds.  Eleven round keys remain: load the first five...
10:	vldmia	r0!, {d16-d25}
	\aes\().8 q0, q8
	\mc\().8 q0, q0
	\aes\().8 q0, q9
	\mc\().8 q0, q0
	\aes\().8 q0, q10
	\mc\().8 q0, q0
	\aes\().8 q0, q11
	\mc\().8 q0, q0
	\aes\().8 q0, q12
	\mc\().8 q0, q0

	// ... and then the other six.
	vldmia	r0!, {d16-d27}
	\aes\().8 q0, q8
	\mc\().8 q0, q0
	\aes\().8 q0, q9
	\mc\().8 q0, q0
	\aes\().8 q0, q10
	\mc\().8 q0, q0
	\aes\().8 q0, q11
	\mc\().8 q0, q0

	// Final round has no MixColumns, but is followed by final whitening.
	\aes\().8 q0, q12
	veor	q0, q0, q13

	// All done.  End-swap the block again on the way out.
	vrev32.8 q0, q0
	vstmia	r2, {d0, d1}
	bx	r14

  ENDFUNC
.endm

// Instantiate the block functions: `rijndael_eblk_arm_crypto' works from the
// key schedule at offset `w', and `rijndael_dblk_arm_crypto' from the one at
// offset `wi'.
	encdec	eblk, aese, aesmc, w
	encdec	dblk, aesd, aesimc, wi

///----- That's all, folks --------------------------------------------------