1 /// -*- mode: asm; asm-comment-char: ?/ -*-
3 /// AArch64 crypto-extension-based implementation of Rijndael
5 /// (c) 2018 Straylight/Edgeware
8 ///----- Licensing notice ---------------------------------------------------
10 /// This file is part of Catacomb.
12 /// Catacomb is free software; you can redistribute it and/or modify
13 /// it under the terms of the GNU Library General Public License as
14 /// published by the Free Software Foundation; either version 2 of the
15 /// License, or (at your option) any later version.
17 /// Catacomb is distributed in the hope that it will be useful,
18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 /// GNU Library General Public License for more details.
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb; if not, write to the Free
24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 /// MA 02111-1307, USA.
27 ///--------------------------------------------------------------------------
28 /// External definitions.
31 #include "asm-common.h"
34 .extern F(rijndael_rcon)
36 ///--------------------------------------------------------------------------
41 /// The ARM crypto extension implements a little-endian version of AES
42 /// (though the manual doesn't actually spell this out and you have to
43 /// experiment), but Catacomb's internal interface presents as big-endian so
44 /// as to work better with things like GCM. We therefore maintain the round
45 /// keys in little-endian form, and have to end-swap blocks in and out.
47 /// For added amusement, the crypto extension doesn't implement the larger-
48 /// block versions of Rijndael, so we have to end-swap the keys if we're
49 /// preparing for one of those.
// Capacity limits sizing the key-schedule buffers below.
52 .equ maxrounds, 16 // maximum number of rounds
53 .equ maxblksz, 32 // maximum block size, in bytes
54 .equ kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer
// Byte offsets of the fields in the context structure that x0 points
// at in the functions below (`ldr w9, [x0, #nr]', `add x7, x0, #w').
57 .equ nr, 0 // number of rounds
58 .equ w, nr + 4 // encryption key words
59 .equ wi, w + kbufsz // decryption key words
61 ///--------------------------------------------------------------------------
64 FUNC(rijndael_setup_arm64_crypto)
// Build the Rijndael key schedule: expand the caller's key into the
// encryption round keys at context offset `w', then derive the
// decryption round keys at offset `wi' (InvMixColumns-transformed,
// reordered).  NOTE(review): this listing elides some instructions
// (branches, the initial copy, stores); comments below describe only
// the code that is visible here.
67 // x0 = pointer to context
68 // w1 = block size in 32-bit words
69 // x2 = pointer to key material
70 // x3 = key size in words
75 // The initial round key material is taken directly from the input
76 // key, so copy it over. Unfortunately, the key material is not
77 // guaranteed to be aligned in any especially useful way. Assume
78 // that alignment traps are not enabled. (Why would they be? On
79 // A32, alignment traps were part of a transition plan which changed
80 // the way unaligned loads and stores behaved, but there's never been
81 // any other behaviour on A64.)
89 // Find out other useful things and prepare for the main loop.
90 9: ldr w9, [x0, #nr] // number of rounds
91 madd w2, w1, w9, w1 // total key size in words
// i.e., w2 = w1*(w9 + 1): one block-sized key per round, plus one.
92 leaext x5, rijndael_rcon // round constants
93 sub x6, x2, x3 // minus what we've copied already
94 add x7, x0, #w // position in previous cycle
95 movi v1.4s, #0 // all-zero register for the key
96 mov x8, #0 // position in current cycle
98 // Main key expansion loop. Dispatch according to the position in
100 0: ldr w15, [x7], #4 // word from previous cycle
101 cbz x8, 1f // first word of the cycle?
102 cmp x8, #4 // fourth word of the cycle?
103 // (only keys longer than six words need the extra substitution)
104 cmp x3, #7 // seven or eight words of key?
107 // Fourth word of the cycle, seven or eight words of key. We must do
108 // the byte substitution.
110 aese v0.16b, v1.16b // effectively, just SubBytes
114 // First word of the cycle. Byte substitution, rotation, and round
116 1: ldrb w13, [x5], #1 // next round constant
118 aese v0.16b, v1.16b // effectively, just SubBytes
// RotWord (the `ror #8' on the substituted word) xored with the
// round constant in w13.
120 eor w14, w13, w14, ror #8
122 // Common ending: mix in the word from the previous cycle and store.
126 // Prepare for the next iteration. If we're done, then stop; if
127 // we've finished a cycle then reset the counter.
135 // Next job is to construct the decryption keys. The keys for the
136 // first and last rounds don't need to be mangled, but the remaining
137 // ones do -- and they all need to be reordered too.
139 // The plan of action, then, is to copy the final encryption round's
140 // keys into place first, then to do each of the intermediate rounds
141 // in reverse order, and finally do the first round.
143 // Do all the heavy lifting with the vector registers. The order
144 // we're doing this in means that it's OK if we read or write too
145 // much, and there's easily enough buffer space for the
146 // over-enthusiastic reads and writes because the context has space
147 // for 32-byte blocks, which is our maximum and an exact fit for two
148 // full-width registers.
// x4 starts at the encryption schedule (set up in code elided from
// this listing -- confirm against the full source); `uxtw #2' scales
// word counts to byte offsets, so this steps x4 to base + 4*(w2 - w1).
151 add x4, x4, w2, uxtw #2
152 sub x4, x4, w1, uxtw #2 // last round's keys
154 // Copy the last encryption round's keys.
155 ld1 {v0.4s, v1.4s}, [x4]
156 st1 {v0.4s, v1.4s}, [x5]
158 // Update the loop variables and stop if we've finished.
// x5 (write pointer) advances; x4 (read pointer) retreats, one
// block-sized key (4*w1 bytes) per round.
160 add x5, x5, w1, uxtw #2
161 sub x4, x4, w1, uxtw #2
164 // Do another middle round's keys...
165 ld1 {v0.4s, v1.4s}, [x4]
// aesimc applies InvMixColumns, as the equivalent-inverse-cipher
// key transformation needed by the aesd-based decryption rounds.
166 aesimc v0.16b, v0.16b
167 aesimc v1.16b, v1.16b
168 st1 {v0.4s, v1.4s}, [x5]
171 // Finally do the first encryption round.
172 9: ld1 {v0.4s, v1.4s}, [x4]
173 st1 {v0.4s, v1.4s}, [x5]
175 // If the block size is not exactly four words then we must end-swap
176 // everything. We can use fancy vector toys for this.
180 // End-swap the encryption keys.
184 // And the decryption keys.
194 INTFUNC(endswap_block)
195 // End-swap w2 words starting at x1. x1 is clobbered; w2 is not.
196 // It's OK to work in 16-byte chunks.
// NOTE(review): the load/byte-reverse and the loop control are elided
// from this listing (presumably an ld1/rev32/counter loop -- confirm
// against the full source).  Only the store is visible: write one
// reversed 16-byte chunk and post-increment the pointer.
202 st1 {v0.4s}, [x1], #16
208 ///--------------------------------------------------------------------------
209 /// Encrypting and decrypting blocks.
// Generate a single-block encrypt or decrypt function (see the two
// instantiations at the bottom):
//   op   -- suffix for the function name (`eblk' or `dblk')
//   aes  -- AES round instruction (`aese' or `aesd')
//   mc   -- matching mix-columns instruction (`aesmc' or `aesimc')
//   koff -- offset of the key schedule in the context (`w' or `wi')
211 .macro encdec op, aes, mc, koff
212 FUNC(rijndael_\op\()_arm64_crypto)
215 // x0 = pointer to context
216 // x1 = pointer to input block
217 // x2 = pointer to output block
219 // Set things up ready.
225 // Check the number of rounds and dispatch.
// Labels 10-14 below handle nr = 10..14 rounds: each loads that
// case's extra round keys into v16 upwards, 16 bytes per key.  (The
// round computations between the loads are elided in this listing.)
239 11: ld1 {v16.4s}, [x0], #16
245 12: ld1 {v16.4s, v17.4s}, [x0], #32
253 13: ld1 {v16.4s-v18.4s}, [x0], #48
262 // Fourteen rounds. (Drops through to the ten round case because
263 // this is the next most common.)
264 14: ld1 {v16.4s-v19.4s}, [x0], #64
276 10: ld1 {v16.4s-v19.4s}, [x0], #64
277 ld1 {v20.4s-v23.4s}, [x0], #64
287 ld1 {v16.4s-v18.4s}, [x0], #48
297 // Final round has no MixColumns, but is followed by final whitening.
// v18 appears to hold the last (whitening) key from the load above --
// TODO(review): confirm against the elided round sequence.
301 eor v0.16b, v0.16b, v18.16b
// Instantiate both directions: encryption walks the `w' schedule with
// aese/aesmc; decryption walks the `wi' schedule (whose middle keys
// were aesimc-transformed during setup) with aesd/aesimc.
311 encdec eblk, aese, aesmc, w
312 encdec dblk, aesd, aesimc, wi
314 ///----- That's all, folks --------------------------------------------------