1 /// -*- mode: asm; asm-comment-char: ?/ -*-
3 /// AArch64 crypto-extension-based implementation of Rijndael
5 /// (c) 2018 Straylight/Edgeware
8 ///----- Licensing notice ---------------------------------------------------
10 /// This file is part of Catacomb.
12 /// Catacomb is free software; you can redistribute it and/or modify
13 /// it under the terms of the GNU Library General Public License as
14 /// published by the Free Software Foundation; either version 2 of the
15 /// License, or (at your option) any later version.
17 /// Catacomb is distributed in the hope that it will be useful,
18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 /// GNU Library General Public License for more details.
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb; if not, write to the Free
24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 /// MA 02111-1307, USA.
27 ///--------------------------------------------------------------------------
31 #include "asm-common.h"
36 .extern F(rijndael_rcon)
40 ///--------------------------------------------------------------------------
43 /// The ARM crypto extension implements a little-endian version of AES
44 /// (though the manual doesn't actually spell this out and you have to
45 /// experiment), but Catacomb's internal interface presents as big-endian so
46 /// as to work better with things like GCM. We therefore maintain the round
47 /// keys in little-endian form, and have to end-swap blocks in and out.
49 /// For added amusement, the crypto extension doesn't implement the larger-
50 /// block versions of Rijndael, so we have to end-swap the keys if we're
51 /// preparing for one of those.
// Sizing limits for the key-schedule buffers.
54 .equ maxrounds, 16 // maximum number of rounds
55 .equ maxblksz, 32 // maximum block size, in bytes
// One block-sized round key per round, plus one for the initial whitening.
56 .equ kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer
// Byte offsets of the fields within the Rijndael context structure:
// a 32-bit round count at offset zero, then the encryption key words,
// then (a full kbufsz later) the decryption key words.
59 .equ nr, 0 // number of rounds
60 .equ w, nr + 4 // encryption key words
61 .equ wi, w + kbufsz // decryption key words
63 ///--------------------------------------------------------------------------
// Expand the user's key into the encryption and decryption schedules
// stored in the context at x0, using the crypto-extension AESE
// instruction for the SubBytes step.
66 FUNC(rijndael_setup_arm64_crypto)
69 // x0 = pointer to context
70 // w1 = block size in 32-bit words
71 // x2 = pointer to key material
72 // x3 = key size in words
77 // The initial round key material is taken directly from the input
78 // key, so copy it over. Unfortunately, the key material is not
79 // guaranteed to be aligned in any especially useful way. Assume
80 // that alignment traps are not enabled. (Why would they be? On
81 // A32, alignment traps were part of a transition plan which changed
82 // the way unaligned loads and stores behaved, but there's never been
83 // any other behaviour on A64.)
91 // Find out other useful things and prepare for the main loop.
92 9: ldr w9, [x0, #nr] // number of rounds
// w2 = w1*w9 + w1 = blkwds*(nrounds + 1): one block-sized key per
// round plus the initial whitening key.
93 madd w2, w1, w9, w1 // total key size in words
94 leaext x5, rijndael_rcon // round constants
95 sub x6, x2, x3 // minus what we've copied already
96 add x7, x0, #w // position in previous cycle
// AESE xors in its key operand before SubBytes/ShiftRows, so an
// all-zero `key' lets us use it for the substitution alone.
97 movi v1.4s, #0 // all-zero register for the key
98 mov x8, #0 // position in current cycle
100 // Main key expansion loop. Dispatch according to the position in
102 0: ldr w15, [x7], #4 // word from previous cycle
103 cbz x8, 1f // first word of the cycle?
104 cmp x8, #4 // fourth word of the cycle?
106 cmp x3, #7 // seven or eight words of key?
109 // Fourth word of the cycle, seven or eight words of key. We must do
110 // the byte substitution.
112 aese v0.16b, v1.16b // effectively, just SubBytes
116 // First word of the cycle. Byte substitution, rotation, and round
118 1: ldrb w13, [x5], #1 // next round constant
120 aese v0.16b, v1.16b // effectively, just SubBytes
// Rotate the substituted word and mix in the round constant.
122 eor w14, w13, w14, ror #8
124 // Common ending: mix in the word from the previous cycle and store.
128 // Prepare for the next iteration. If we're done, then stop; if
129 // we've finished a cycle then reset the counter.
137 // Next job is to construct the decryption keys. The keys for the
138 // first and last rounds don't need to be mangled, but the remaining
139 // ones do -- and they all need to be reordered too.
141 // The plan of action, then, is to copy the final encryption round's
142 // keys into place first, then to do each of the intermediate rounds
143 // in reverse order, and finally do the first round.
145 // Do all the heavy lifting with the vector registers. The order
146 // we're doing this in means that it's OK if we read or write too
147 // much, and there's easily enough buffer space for the
148 // over-enthusiastic reads and writes because the context has space
149 // for 32-byte blocks, which is our maximum and an exact fit for two
150 // full-width registers.
// Advance x4 by (total - blkwds) words, i.e. to the final round's keys.
153 add x4, x4, w2, uxtw #2
154 sub x4, x4, w1, uxtw #2 // last round's keys
156 // Copy the last encryption round's keys.
157 ld1 {v0.4s, v1.4s}, [x4]
158 st1 {v0.4s, v1.4s}, [x5]
160 // Update the loop variables and stop if we've finished.
162 add x5, x5, w1, uxtw #2
163 sub x4, x4, w1, uxtw #2
166 // Do another middle round's keys...
167 ld1 {v0.4s, v1.4s}, [x4]
// AESIMC applies InvMixColumns, converting an encryption round key
// into the form the equivalent-inverse-cipher AESD rounds expect.
168 aesimc v0.16b, v0.16b
169 aesimc v1.16b, v1.16b
170 st1 {v0.4s, v1.4s}, [x5]
173 // Finally do the first encryption round.
174 9: ld1 {v0.4s, v1.4s}, [x4]
175 st1 {v0.4s, v1.4s}, [x5]
177 // If the block size is not exactly four words then we must end-swap
178 // everything. We can use fancy vector toys for this.
182 // End-swap the encryption keys.
186 // And the decryption keys.
// Internal helper: byte-reverse each 32-bit word of a buffer, used to
// convert key material between big- and little-endian conventions.
196 INTFUNC(endswap_block)
197 // End-swap w2 words starting at x1. x1 is clobbered; w2 is not.
198 // It's OK to work in 16-byte chunks.
// Store the swapped chunk and advance x1 by 16 bytes (post-indexed).
204 st1 {v0.4s}, [x1], #16
210 ///--------------------------------------------------------------------------
211 /// Encrypting and decrypting blocks.
// Generate a one-block encrypt or decrypt function.  Parameters:
//   \op   -- function-name infix (`eblk' or `dblk')
//   \aes  -- AES round instruction to use (`aese' or `aesd')
//   \mc   -- matching column-mix instruction (`aesmc' or `aesimc')
//   \koff -- context offset of the key schedule to use (`w' or `wi')
213 .macro encdec op, aes, mc, koff
214 FUNC(rijndael_\op\()_arm64_crypto)
217 // x0 = pointer to context
218 // x1 = pointer to input block
219 // x2 = pointer to output block
221 // Set things up ready.
227 // Check the number of rounds and dispatch.
// Each of the following labels loads that round count's key material
// into v16 upwards, post-incrementing x0 past the keys consumed.
241 11: ld1 {v16.4s}, [x0], #16
247 12: ld1 {v16.4s, v17.4s}, [x0], #32
255 13: ld1 {v16.4s-v18.4s}, [x0], #48
264 // Fourteen rounds. (Drops through to the ten round case because
265 // this is the next most common.)
266 14: ld1 {v16.4s-v19.4s}, [x0], #64
278 10: ld1 {v16.4s-v19.4s}, [x0], #64
279 ld1 {v20.4s-v23.4s}, [x0], #64
289 ld1 {v16.4s-v18.4s}, [x0], #48
299 // Final round has no MixColumns, but is followed by final whitening.
303 eor v0.16b, v0.16b, v18.16b
// Instantiate the encryption function (AESE/AESMC on the `w' schedule)
// and the decryption function (AESD/AESIMC on the `wi' schedule).
313 encdec eblk, aese, aesmc, w
314 encdec dblk, aesd, aesimc, wi
316 ///----- That's all, folks --------------------------------------------------