base/asm-common.h, symm/*.S: New macros for register name decoration.
[catacomb] / symm / rijndael-arm-crypto.S
1 /// -*- mode: asm; asm-comment-char: ?/ -*-
2 ///
3 /// ARM crypto-extension-based implementation of Rijndael
4 ///
5 /// (c) 2016 Straylight/Edgeware
6 ///
7
8 ///----- Licensing notice ---------------------------------------------------
9 ///
10 /// This file is part of Catacomb.
11 ///
12 /// Catacomb is free software; you can redistribute it and/or modify
13 /// it under the terms of the GNU Library General Public License as
14 /// published by the Free Software Foundation; either version 2 of the
15 /// License, or (at your option) any later version.
16 ///
17 /// Catacomb is distributed in the hope that it will be useful,
18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 /// GNU Library General Public License for more details.
21 ///
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb; if not, write to the Free
24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 /// MA 02111-1307, USA.
26
27 ///--------------------------------------------------------------------------
28 /// External definitions.
29
30 #include "config.h"
31 #include "asm-common.h"
32
33 .globl F(abort)
34 .globl F(rijndael_rcon)
35
36 ///--------------------------------------------------------------------------
37 /// Main code.
38
39 .arch armv8-a
40 .fpu crypto-neon-fp-armv8
41
42 /// The ARM crypto extension implements a little-endian version of AES
43 /// (though the manual doesn't actually spell this out and you have to
44 /// experiment), but Catacomb's internal interface presents as big-endian so
45 /// as to work better with things like GCM. We therefore maintain the round
46 /// keys in little-endian form, and have to end-swap blocks in and out.
47 ///
48 /// For added amusement, the crypto extension doesn't implement the larger-
49 /// block versions of Rijndael, so we have to end-swap the keys if we're
50 /// preparing for one of those.
51
// Key-schedule geometry.  The schedule buffer is sized for the worst case:
// (maxrounds + 1) round keys of maxblksz bytes each.
	.set	maxrounds, 16			// most rounds we can be asked for
	.set	maxblksz, 32			// widest block, in bytes
	.set	kbufsz, (maxrounds + 1)*maxblksz // bytes in one key schedule

// Byte offsets of the context fields used by the code in this file.
	.set	nr, 0				// round count (one word)
	.set	w, nr + 4			// encryption round keys
	.set	wi, w + kbufsz			// decryption round keys
61
62 ///--------------------------------------------------------------------------
63 /// Key setup.
64
FUNC(rijndael_setup_arm_crypto)

	// Build the encryption and decryption key schedules in a context.
	//
	// Arguments:
	//	r0 = pointer to context
	//	r1 = block size in words
	//	r2 = pointer to key material
	//	r3 = key size in words
	//
	// The number of rounds is read from the context (offset `nr'), so it
	// must already have been stored there.  Encryption round keys are
	// written starting at offset `w', decryption round keys at `wi'.
	//
	// r4--r9 are saved and restored.  r1, r2, r12, r14, q0, q1 and the
	// condition flags are used as scratch.

	stmfd	sp!, {r4-r9, r14}

	// The initial round key material is taken directly from the input
	// key, so copy it over.  Unfortunately, the key material is not
	// guaranteed to be aligned in any especially useful way, so we must
	// sort this out.
	//
	// Whichever path is taken, the copy finishes with r4 holding the
	// last key word stored, which is what the expansion loop below uses
	// as its `most recent word'.
	add	r9, r0, #w		// output cursor
	mov	r14, r3			// words left to copy
	ands	r6, r2, #3		// byte offset within a word
	beq	1f
	mov	r6, r6, lsl #3		// right shift, in bits
	rsb	r7, r6, #32		// complementary left shift
	bic	r2, r2, #3		// round the pointer down
	ldr	r4, [r2], #4

	// Misaligned key: glue each output word together from two adjacent
	// aligned words.  (The shift directions assume little-endian loads.)
0:	ldr	r5, [r2], #4
	mov	r4, r4, lsr r6
	orr	r4, r5, lsl r7
	str	r4, [r9], #4
	subs	r14, r14, #1
	movhi	r4, r5			// carry the high word forward...
	bhi	0b			// ... unless that was the last one
	b	9f

	// Aligned key: a simple word-by-word copy.
1:	ldr	r4, [r2], #4
	str	r4, [r9], #4
	subs	r14, r14, #1
	bhi	1b

	// Find out other useful things and prepare for the main loop.
9:	ldr	r7, [r0, #nr]		// number of rounds
	mla	r2, r1, r7, r1		// total key size in words
	leaextq	r5, rijndael_rcon	// round constants
	sub	r8, r2, r3		// minus what we've copied already
	veor	q1, q1			// all-zero register for the key
	add	r8, r9, r8, lsl #2	// limit of the key buffer
	mov	r12, #0			// position in current cycle

	// Main key expansion loop.  Dispatch according to the position in
	// the cycle.
	//
	// Register roles throughout the loop:
	//	r3 = key size in words (the cycle length)
	//	r4 = word most recently written
	//	r5 = cursor into the round-constant table
	//	r6 = word written one key-length earlier
	//	r8 = limit of the key buffer
	//	r9 = output cursor
	//	r12 = position in the current cycle
0:	ldr	r6, [r9, -r3, lsl #2]	// word from previous cycle
	cmp	r12, #0			// first word of the cycle?
	beq	1f
	cmp	r12, #4			// fourth word of the cycle?
	bne	2f
	cmp	r3, #7			// seven or eight words of key?
	bcc	2f

	// Fourth word of the cycle, seven or eight words of key.  We must do
	// the byte substitution.  The word is copied into every column and
	// the round key is zero, so the only visible effect of AESE is the
	// S-box on each byte.
	vdup.32	q0, r4
	aese.8	q0, q1			// effectively, just SubBytes
	vmov.32	r4, d0[0]
	b	2f

	// First word of the cycle.  Byte substitution, rotation, and round
	// constant.  (r6 was already fetched at the top of the loop, so
	// there's no need to load it again here.)
1:	ldrb	r14, [r5], #1		// next round constant
	vdup.32	q0, r4
	aese.8	q0, q1			// effectively, just SubBytes
	vmov.32	r4, d0[0]
	eor	r4, r14, r4, ror #8

	// Common ending: mix in the word from the previous cycle and store.
2:	eor	r4, r4, r6
	str	r4, [r9], #4

	// Prepare for the next iteration.  If we're done, then stop; if
	// we've finished a cycle then reset the counter.
	add	r12, r12, #1
	cmp	r9, r8
	bcs	9f
	cmp	r12, r3
	movcs	r12, #0
	b	0b

	// Next job is to construct the decryption keys.  The keys for the
	// first and last rounds don't need to be mangled, but the remaining
	// ones do -- and they all need to be reordered too.
	//
	// The plan of action, then, is to copy the final encryption round's
	// keys into place first, then to do each of the intermediate rounds
	// in reverse order, and finally do the first round.
	//
	// Do all the heavy lifting with NEON registers.  The order we're
	// doing this in means that it's OK if we read or write too much, and
	// there's easily enough buffer space for the over-enthusiastic reads
	// and writes because the context has space for 32-byte blocks, which
	// is our maximum and an exact fit for two Q-class registers.
	//
	// Here r4 walks backwards through the encryption keys and r5 walks
	// forwards through the decryption keys, one block's worth per step;
	// r7 counts the rounds still to do.
9:	add	r5, r0, #wi
	add	r4, r0, #w
	add	r4, r4, r2, lsl #2
	sub	r4, r4, r1, lsl #2	// last round's keys

	// Copy the last encryption round's keys.
	teq	r1, #4
	vldmiaeq r4, {d0, d1}
	vldmiane r4, {d0-d3}
	vstmiaeq r5, {d0, d1}
	vstmiane r5, {d0-d3}

	// Update the loop variables and stop if we've finished.
0:	sub	r4, r4, r1, lsl #2
	add	r5, r5, r1, lsl #2
	subs	r7, r7, #1
	beq	9f

	// Do another middle round's keys...  The NEON instructions leave
	// the flags alone, so the result of the `teq' is still good at the
	// `beq'.
	teq	r1, #4
	vldmiaeq r4, {d0, d1}
	vldmiane r4, {d0-d3}
	aesimc.8 q0, q0
	vstmiaeq r5, {d0, d1}
	beq	0b
	aesimc.8 q1, q1
	vstmia	r5, {d0-d3}
	b	0b

	// Finally do the first encryption round.
9:	teq	r1, #4
	vldmiaeq r4, {d0, d1}
	vldmiane r4, {d0-d3}
	vstmiaeq r5, {d0, d1}
	vstmiane r5, {d0-d3}

	// If the block size is not exactly four words then we must end-swap
	// everything.  We can use fancy NEON toys for this.  (The flags are
	// still those set by the `teq' just above.)
	beq	9f

	// End-swap the encryption keys.
	add	r1, r0, #w
	bl	endswap_block

	// And the decryption keys
	add	r1, r0, #wi
	bl	endswap_block

	// All done.
9:	ldmfd	sp!, {r4-r9, pc}

endswap_block:
	// End-swap R2 words starting at R1.  R1 is clobbered; R2 is not.
	// It's OK to work in 16-byte chunks.  Also clobbers r4, q0 and the
	// flags; returns through r14.
	mov	r4, r2
0:	vldmia	r1, {d0, d1}
	vrev32.8 q0, q0
	vstmia	r1!, {d0, d1}
	subs	r4, r4, #4
	bhi	0b
	bx	r14

ENDFUNC
226
227 ///--------------------------------------------------------------------------
228 /// Encrypting and decrypting blocks.
229
.macro	encdec op, aes, mc, koff
  FUNC(rijndael_\op\()_arm_crypto)

	// Process a single 16-byte block using the key schedule found at
	// the given offset in the context.
	//
	// Arguments:
	//	r0 = pointer to context
	//	r1 = pointer to input block
	//	r2 = pointer to output block
	//
	// The block lives in q0 throughout; round keys are fetched into
	// q8--q13 as needed.  r0 is advanced through the key schedule and r3
	// is used as scratch.

	// Fetch the block and the round count.  The block is end-swapped on
	// the way in, and again on the way out.
	vldmia	r1, {d0, d1}
	ldr	r3, [r0, #nr]
	vrev32.8 q0, q0
	add	r0, r0, #\koff

	// Dispatch on the number of rounds, which must be between 10 and 14
	// inclusive.  In ARM state, reading pc yields the address of the
	// current instruction plus 8, so the `addlo' lands in the branch
	// table that begins two instructions further on.  This depends on
	// `callext' assembling to exactly one instruction, so don't put
	// anything else between here and the table.  Out-of-range counts
	// (including those below 10, which wrap to large unsigned values)
	// fall into the call to `abort'.
	sub	r3, r3, #10
	cmp	r3, #5
	addlo	pc, pc, r3, lsl #2
	callext	F(abort)

	b	10f
	b	11f
	b	12f
	b	13f
	b	14f

	// Eleven rounds: one extra round, and then the ten-round tail.
11:	vldmia	r0!, {d16, d17}
	.irp	rk, q8
	\aes\().8 q0, \rk
	\mc\().8 q0, q0
	.endr
	b	10f

	// Twelve rounds: two extra.
12:	vldmia	r0!, {d16-d19}
	.irp	rk, q8, q9
	\aes\().8 q0, \rk
	\mc\().8 q0, q0
	.endr
	b	10f

	// Thirteen rounds: three extra.
13:	vldmia	r0!, {d16-d21}
	.irp	rk, q8, q9, q10
	\aes\().8 q0, \rk
	\mc\().8 q0, q0
	.endr
	b	10f

	// Fourteen rounds: four extra.  (Drops through to the ten round
	// case because this is the next most common.)
14:	vldmia	r0!, {d16-d23}
	.irp	rk, q8, q9, q10, q11
	\aes\().8 q0, \rk
	\mc\().8 q0, q0
	.endr
	// Drop through...

	// Ten rounds.  The first five full rounds...
10:	vldmia	r0!, {d16-d25}
	.irp	rk, q8, q9, q10, q11, q12
	\aes\().8 q0, \rk
	\mc\().8 q0, q0
	.endr

	// ... then four more full rounds, loading the keys for the final
	// round (q12) and the final whitening (q13) at the same time.
	vldmia	r0!, {d16-d27}
	.irp	rk, q8, q9, q10, q11
	\aes\().8 q0, \rk
	\mc\().8 q0, q0
	.endr

	// Final round has no MixColumns, but is followed by final whitening.
	\aes\().8 q0, q12
	veor	q0, q0, q13

	// All done: end-swap the result and store it.
	vrev32.8 q0, q0
	vstmia	r2, {d0, d1}
	bx	r14

  ENDFUNC
.endm

	// Instantiate the encryption and decryption entry points.
	encdec	eblk, aese, aesmc, w
	encdec	dblk, aesd, aesimc, wi
330
331 ///----- That's all, folks --------------------------------------------------