/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// AArch64 crypto-extension-based implementation of Rijndael
///
/// (c) 2018 Straylight/Edgeware
///
///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

	.arch	armv8-a+crypto

	.extern	F(abort)
	.extern	F(rijndael_rcon)

	.text

///--------------------------------------------------------------------------
/// Main code.

/// The ARM crypto extension implements a little-endian version of AES
/// (though the manual doesn't actually spell this out and you have to
/// experiment), but Catacomb's internal interface presents as big-endian so
/// as to work better with things like GCM.  We therefore maintain the round
/// keys in little-endian form, and have to end-swap blocks in and out.
///
/// For added amusement, the crypto extension doesn't implement the larger-
/// block versions of Rijndael, so we have to end-swap the keys if we're
/// preparing for one of those.

	// Useful constants.
	.equ	maxrounds, 16		// maximum number of rounds
	.equ	maxblksz, 32		// maximum block size, in bytes
	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer

	// Context structure.
	.equ	nr, 0			// number of rounds
	.equ	w, nr + 4		// encryption key words
	.equ	wi, w + kbufsz		// decryption key words

///--------------------------------------------------------------------------
/// Key setup.

FUNC(rijndael_setup_arm64_crypto)

	// Arguments:
	//	x0 = pointer to context
	//	w1 = block size in 32-bit words
	//	x2 = pointer to key material
	//	x3 = key size in words

	pushreg	x29, x30
	mov	x29, sp

	// The initial round key material is taken directly from the input
	// key, so copy it over.  Unfortunately, the key material is not
	// guaranteed to be aligned in any especially useful way.  Assume
	// that alignment traps are not enabled.  (Why would they be?  On
	// A32, alignment traps were part of a transition plan which changed
	// the way unaligned loads and stores behaved, but there's never been
	// any other behaviour on A64.)
	mov	x15, x3
	add	x4, x0, #w
0:	sub	x15, x15, #1
	ldr	w14, [x2], #4
	str	w14, [x4], #4
	cbnz	x15, 0b

	// Find out other useful things and prepare for the main loop.
9:	ldr	w9, [x0, #nr]		// number of rounds
	madd	w2, w1, w9, w1		// total key size in words
	leaext	x5, rijndael_rcon	// round constants
	sub	x6, x2, x3		// minus what we've copied already
	add	x7, x0, #w		// position in previous cycle
	movi	v1.4s, #0		// all-zero register for the key
	mov	x8, #0			// position in current cycle

	// Main key expansion loop.  Dispatch according to the position in
	// the cycle.
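	// For reference, the dispatch below implements the standard
	// FIPS-197 word recurrence.  Roughly, in C (an illustrative sketch
	// only -- `subw', `rotw', `rcon', `nk' and `nw' are hypothetical
	// names, not Catacomb's; `nk' is the key length in words, `nw' the
	// total number of key words, `subw' applies the S-box to each byte,
	// and `rotw' rotates a word by one byte position):
	//
	//	for (i = nk; i < nw; i++) {
	//	  uint32_t t = w[i - 1];
	//	  if (i%nk == 0) t = subw(rotw(t)) ^ rcon[i/nk - 1];
	//	  else if (nk > 6 && i%nk == 4) t = subw(t);
	//	  w[i] = w[i - nk] ^ t;
	//	}
	//
	// Because we keep the key words in little-endian form, `rotw' shows
	// up below as the `ror #8', and the round constant is a single byte
	// mixed into the low-order position.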
0:	ldr	w15, [x7], #4		// word from previous cycle
	cbz	x8, 1f			// first word of the cycle?
	cmp	x8, #4			// fourth word of the cycle?
	b.ne	2f
	cmp	x3, #7			// seven or eight words of key?
	b.cc	2f

	// Fourth word of the cycle, seven or eight words of key.  We must do
	// the byte substitution.
	dup	v0.4s, w14
	aese	v0.16b, v1.16b		// effectively, just SubBytes
	mov	w14, v0.s[0]
	b	2f

	// First word of the cycle.  Byte substitution, rotation, and round
	// constant.
1:	ldrb	w13, [x5], #1		// next round constant
	dup	v0.4s, w14
	aese	v0.16b, v1.16b		// effectively, just SubBytes
	mov	w14, v0.s[0]
	eor	w14, w13, w14, ror #8

	// Common ending: mix in the word from the previous cycle and store.
2:	eor	w14, w14, w15
	str	w14, [x4], #4

	// Prepare for the next iteration.  If we're done, then stop; if
	// we've finished a cycle then reset the counter.
	add	x8, x8, #1
	sub	x6, x6, #1
	cmp	x8, x3
	cbz	x6, 9f
	csel	x8, x8, xzr, cc
	b	0b

	// Next job is to construct the decryption keys.  The keys for the
	// first and last rounds don't need to be mangled, but the remaining
	// ones do -- and they all need to be reordered too.
	//
	// The plan of action, then, is to copy the final encryption round's
	// keys into place first, then to do each of the intermediate rounds
	// in reverse order, and finally do the first round.
	//
	// Do all the heavy lifting with the vector registers.  The order
	// we're doing this in means that it's OK if we read or write too
	// much, and there's easily enough buffer space for the
	// over-enthusiastic reads and writes because the context has space
	// for 32-byte blocks, which is our maximum and an exact fit for two
	// full-width registers.
9:	add	x5, x0, #wi
	add	x4, x0, #w
	add	x4, x4, w2, uxtw #2
	sub	x4, x4, w1, uxtw #2	// last round's keys

	// Copy the last encryption round's keys.
	ld1	{v0.4s, v1.4s}, [x4]
	st1	{v0.4s, v1.4s}, [x5]

	// Update the loop variables and stop if we've finished.
0:	sub	w9, w9, #1
	add	x5, x5, w1, uxtw #2
	sub	x4, x4, w1, uxtw #2
	cbz	w9, 9f

	// Do another middle round's keys...
	ld1	{v0.4s, v1.4s}, [x4]
	aesimc	v0.16b, v0.16b
	aesimc	v1.16b, v1.16b
	st1	{v0.4s, v1.4s}, [x5]
	b	0b

	// Finally do the first encryption round.
9:	ld1	{v0.4s, v1.4s}, [x4]
	st1	{v0.4s, v1.4s}, [x5]

	// If the block size is not exactly four words then we must end-swap
	// everything.  We can use fancy vector toys for this.
	cmp	w1, #4
	b.eq	9f

	// End-swap the encryption keys.
	add	x1, x0, #w
	bl	endswap_block

	// And the decryption keys.
	add	x1, x0, #wi
	bl	endswap_block

	// All done.
9:	popreg	x29, x30
	ret
ENDFUNC

INTFUNC(endswap_block)
	// End-swap w2 words starting at x1.  x1 is clobbered; w2 is not.
	// It's OK to work in 16-byte chunks.
	mov	w3, w2
0:	subs	w3, w3, #4
	ld1	{v0.4s}, [x1]
	rev32	v0.16b, v0.16b
	st1	{v0.4s}, [x1], #16
	b.hi	0b
	ret
ENDFUNC

///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.
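	// A rough note on what the hardware instructions used below compute,
	// since it explains both the plain `eor' at the end of each function
	// and the `aesimc' mangling of the decryption keys in the key setup
	// above (a sketch only; see the architecture manual for the precise
	// definitions):
	//
	//	aese	v, k		// v = ShiftRows(SubBytes(v ^ k))
	//	aesmc	v, v		// v = MixColumns(v)
	//	aesd	v, k		// v = InvSubBytes(InvShiftRows(v ^ k))
	//	aesimc	v, v		// v = InvMixColumns(v)
	//
	// So a full encryption round is AESE followed by AESMC, the initial
	// whitening is folded into the first AESE, the final round omits the
	// AESMC, and the last whitening key is mixed in with an ordinary
	// EOR.  Decryption uses the `equivalent inverse cipher' arrangement:
	// the rounds run AESD/AESIMC in exactly the same shape, which comes
	// out right provided the middle rounds' keys have had InvMixColumns
	// applied to them first -- which is what the `aesimc' instructions
	// in the key setup arranged.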
.macro	encdec	op, aes, mc, koff
FUNC(rijndael_\op\()_arm64_crypto)

	// Arguments:
	//	x0 = pointer to context
	//	x1 = pointer to input block
	//	x2 = pointer to output block

	// Set things up ready.
	ldr	w3, [x0, #nr]
	add	x0, x0, #\koff
	ld1	{v0.4s}, [x1]
	rev32	v0.16b, v0.16b

	// Check the number of rounds and dispatch.
	cmp	w3, #14
	b.eq	14f
	cmp	w3, #10
	b.eq	10f
	cmp	w3, #12
	b.eq	12f
	cmp	w3, #13
	b.eq	13f
	cmp	w3, #11
	b.eq	11f
	callext	F(abort)

	// Eleven rounds.
11:	ld1	{v16.4s}, [x0], #16
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	b	10f

	// Twelve rounds.
12:	ld1	{v16.4s, v17.4s}, [x0], #32
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	\mc	v0.16b, v0.16b
	b	10f

	// Thirteen rounds.
13:	ld1	{v16.4s-v18.4s}, [x0], #48
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v18.16b
	\mc	v0.16b, v0.16b
	b	10f

	// Fourteen rounds.  (Drops through to the ten round case because
	// this is the next most common.)
14:	ld1	{v16.4s-v19.4s}, [x0], #64
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v18.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v19.16b
	\mc	v0.16b, v0.16b
	// Drop through...

	// Ten rounds.
10:	ld1	{v16.4s-v19.4s}, [x0], #64
	ld1	{v20.4s-v23.4s}, [x0], #64
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v18.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v19.16b
	\mc	v0.16b, v0.16b

	ld1	{v16.4s-v18.4s}, [x0], #48
	\aes	v0.16b, v20.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v21.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v22.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v23.16b
	\mc	v0.16b, v0.16b

	// Final round has no MixColumns, but is followed by final whitening.
	\aes	v0.16b, v16.16b
	\mc	v0.16b, v0.16b
	\aes	v0.16b, v17.16b
	eor	v0.16b, v0.16b, v18.16b

	// All done.
	rev32	v0.16b, v0.16b
	st1	{v0.4s}, [x2]
	ret
ENDFUNC
.endm

	encdec	eblk, aese, aesmc, w
	encdec	dblk, aesd, aesimc, wi

///----- That's all, folks --------------------------------------------------