--- /dev/null
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// AArch64 crypto-extension-based implementation of Rijndael
+///
+/// (c) 2018 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+ .extern F(abort)
+ .extern F(rijndael_rcon)
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+ .arch armv8-a+crypto
+
+/// The ARM crypto extension implements a little-endian version of AES
+/// (though the manual doesn't actually spell this out and you have to
+/// experiment), but Catacomb's internal interface presents as big-endian so
+/// as to work better with things like GCM. We therefore maintain the round
+/// keys in little-endian form, and have to end-swap blocks in and out.
+///
+/// For added amusement, the crypto extension doesn't implement the larger-
+/// block versions of Rijndael, so we have to end-swap the keys if we're
+/// preparing for one of those.
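+///
+/// The end-swap itself is just a byte reversal within each 32-bit word,
+/// which is what `rev32' on a .16b arrangement does for each 32-bit lane.
+/// As a rough C sketch (illustration only: the function name and the use
+/// of GCC's __builtin_bswap32 are mine, not Catacomb's):
+///
+///	#include <stddef.h>
+///	#include <stdint.h>
+///
+///	static void endswap_words(uint32_t *p, size_t n)
+///	{
+///		for (size_t i = 0; i < n; i++)
+///			p[i] = __builtin_bswap32(p[i]);
+///	}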
+
+ // Useful constants.
+ .equ maxrounds, 16 // maximum number of rounds
+ .equ maxblksz, 32 // maximum block size, in bytes
+ .equ kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer
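+	// (So each key-schedule buffer is 32*(16 + 1) = 544 bytes, i.e. 136
+	// 32-bit words -- a comfortable overestimate, since Rijndael proper
+	// never needs more than fourteen rounds.)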
+
+ // Context structure.
+ .equ nr, 0 // number of rounds
+ .equ w, nr + 4 // encryption key words
+ .equ wi, w + kbufsz // decryption key words
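+
+	// For orientation, these offsets index into a context that looks
+	// roughly like the following (a sketch, not Catacomb's verbatim
+	// declaration; only the layout matters here):
+	//
+	//	struct rijndael_ctx {
+	//		uint32_t nr;		// number of rounds
+	//		uint32_t w[kbufsz/4];	// encryption key schedule
+	//		uint32_t wi[kbufsz/4];	// decryption key schedule
+	//	};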
+
+///--------------------------------------------------------------------------
+/// Key setup.
+
+FUNC(rijndael_setup_arm64_crypto)
+
+ // Arguments:
+ // x0 = pointer to context
+ // w1 = block size in 32-bit words
+ // x2 = pointer to key material
+ // x3 = key size in words
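+	//
+	// (In C terms this is roughly
+	//	void rijndael_setup_arm64_crypto(rijndael_ctx *ctx,
+	//		unsigned blkwords, const void *key, unsigned keywords);
+	// where the parameter names and exact types are my guesses, not
+	// quoted from the header.)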
+
+ pushreg x29, x30
+ mov x29, sp
+
+ // The initial round key material is taken directly from the input
+ // key, so copy it over. Unfortunately, the key material is not
+ // guaranteed to be aligned in any especially useful way. Assume
+ // that alignment traps are not enabled. (Why would they be? On
+ // A32, alignment traps were part of a transition plan which changed
+ // the way unaligned loads and stores behaved, but there's never been
+ // any other behaviour on A64.)
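+	// (In effect, the loop below is a word-at-a-time memcpy of 4*x3
+	// bytes from the key material at x2 into the context's w buffer.)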
+ mov x15, x3
+ add x4, x0, #w
+0: sub x15, x15, #1
+ ldr w14, [x2], #4
+ str w14, [x4], #4
+ cbnz x15, 0b
+
+ // Find out other useful things and prepare for the main loop.
+9: ldr w9, [x0, #nr] // number of rounds
+ madd w2, w1, w9, w1 // total key size in words
+ leaext x5, rijndael_rcon // round constants
+ sub x6, x2, x3 // minus what we've copied already
+ add x7, x0, #w // position in previous cycle
+ movi v1.4s, #0 // all-zero register for the key
+ mov x8, #0 // position in current cycle
+
+ // Main key expansion loop. Dispatch according to the position in
+ // the cycle.
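+	// (For reference, this is the standard Rijndael key expansion.  As
+	// a C sketch -- nk is the key length in words, total the full
+	// schedule length in words, and SUBW/ROTR hypothetical helpers for
+	// bytewise SubBytes and a 32-bit rotate right; none of these are
+	// real Catacomb names:
+	//
+	//	for (i = nk; i < total; i++) {
+	//		uint32_t t = w[i - 1];
+	//		if (i%nk == 0) t = SUBW(ROTR(t, 8)) ^ *rcon++;
+	//		else if (nk > 6 && i%nk == 4) t = SUBW(t);
+	//		w[i] = t ^ w[i - nk];
+	//	}
+	//
+	// In the little-endian word representation used here, the spec's
+	// RotWord comes out as a rotate right by eight bits, which is why
+	// the first-word case below uses `ror #8'.)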
+0: ldr w15, [x7], #4 // word from previous cycle
+ cbz x8, 1f // first word of the cycle?
+ cmp x8, #4 // fourth word of the cycle?
+ b.ne 2f
+ cmp x3, #7 // seven or eight words of key?
+ b.cc 2f
+
+ // Fourth word of the cycle, seven or eight words of key. We must do
+ // the byte substitution.
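+	// (AESE against an all-zero round key computes
+	// SubBytes(ShiftRows(x)); and since the DUP puts the same word in
+	// all four columns, ShiftRows only shuffles bytes between identical
+	// columns and changes nothing, leaving plain SubBytes of our word.)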
+ dup v0.4s, w14
+ aese v0.16b, v1.16b // effectively, just SubBytes
+	mov	w14, v0.s[0]
+ b 2f
+
+ // First word of the cycle. Byte substitution, rotation, and round
+ // constant.
+1: ldrb w13, [x5], #1 // next round constant
+ dup v0.4s, w14
+ aese v0.16b, v1.16b // effectively, just SubBytes
+	mov	w14, v0.s[0]
+ eor w14, w13, w14, ror #8
+
+ // Common ending: mix in the word from the previous cycle and store.
+2: eor w14, w14, w15
+ str w14, [x4], #4
+
+ // Prepare for the next iteration. If we're done, then stop; if
+ // we've finished a cycle then reset the counter.
+ add x8, x8, #1
+ sub x6, x6, #1
+ cmp x8, x3
+ cbz x6, 9f
+ csel x8, x8, xzr, cc
+ b 0b
+
+ // Next job is to construct the decryption keys. The keys for the
+ // first and last rounds don't need to be mangled, but the remaining
+ // ones do -- and they all need to be reordered too.
+ //
+ // The plan of action, then, is to copy the final encryption round's
+ // keys into place first, then to do each of the intermediate rounds
+ // in reverse order, and finally do the first round.
+ //
+ // Do all the heavy lifting with the vector registers. The order
+ // we're doing this in means that it's OK if we read or write too
+ // much, and there's easily enough buffer space for the
+ // over-enthusiastic reads and writes because the context has space
+ // for 32-byte blocks, which is our maximum and an exact fit for two
+ // full-width registers.
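+	// Schematically, with b the block size in words and rounds numbered
+	// 0, ..., nr, this builds the usual `equivalent inverse cipher'
+	// schedule (a sketch, not real code):
+	//
+	//	wi[round 0]  = w[round nr]
+	//	wi[round r]  = InvMixColumns(w[round nr - r]),	for 0 < r < nr
+	//	wi[round nr] = w[round 0]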
+9: add x5, x0, #wi
+ add x4, x0, #w
+ add x4, x4, w2, uxtw #2
+ sub x4, x4, w1, uxtw #2 // last round's keys
+
+ // Copy the last encryption round's keys.
+ ld1 {v0.4s, v1.4s}, [x4]
+ st1 {v0.4s, v1.4s}, [x5]
+
+ // Update the loop variables and stop if we've finished.
+0: sub w9, w9, #1
+ add x5, x5, w1, uxtw #2
+ sub x4, x4, w1, uxtw #2
+ cbz w9, 9f
+
+ // Do another middle round's keys...
+ ld1 {v0.4s, v1.4s}, [x4]
+ aesimc v0.16b, v0.16b
+ aesimc v1.16b, v1.16b
+ st1 {v0.4s, v1.4s}, [x5]
+ b 0b
+
+ // Finally do the first encryption round.
+9: ld1 {v0.4s, v1.4s}, [x4]
+ st1 {v0.4s, v1.4s}, [x5]
+
+ // If the block size is not exactly four words then we must end-swap
+ // everything. We can use fancy vector toys for this.
+ cmp w1, #4
+ b.eq 9f
+
+ // End-swap the encryption keys.
+ add x1, x0, #w
+ bl endswap_block
+
+ // And the decryption keys
+ add x1, x0, #wi
+ bl endswap_block
+
+ // All done.
+9: popreg x29, x30
+ ret
+
+ENDFUNC
+
+INTFUNC(endswap_block)
+	// End-swap w2 words starting at x1.  x1, w3, and v0 are clobbered;
+	// w2 is preserved.  It's OK to work in 16-byte chunks: if w2 isn't
+	// a multiple of four then we process up to three extra words, but
+	// the key buffers are a whole number of 16-byte chunks long (kbufsz
+	// is a multiple of 16), so we stay within them.
+
+ mov w3, w2
+0: subs w3, w3, #4
+ ld1 {v0.4s}, [x1]
+ rev32 v0.16b, v0.16b
+ st1 {v0.4s}, [x1], #16
+ b.hi 0b
+ ret
+
+ENDFUNC
+
+///--------------------------------------------------------------------------
+/// Encrypting and decrypting blocks.
+
+.macro encdec op, aes, mc, koff
+ FUNC(rijndael_\op\()_arm64_crypto)
+
+ // Arguments:
+ // x0 = pointer to context
+ // x1 = pointer to input block
+ // x2 = pointer to output block
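+
+	// For reference, the AES instructions used below compose as follows
+	// (this is just the architected behaviour written out):
+	//
+	//	aese   v, k	=>  v := SubBytes(ShiftRows(v EOR k))
+	//	aesmc  v, v	=>  v := MixColumns(v)
+	//	aesd   v, k	=>  v := InvSubBytes(InvShiftRows(v EOR k))
+	//	aesimc v, v	=>  v := InvMixColumns(v)
+	//
+	// so each round below is one AESE/AESMC (or AESD/AESIMC) pair, the
+	// final round omits the MixColumns step, and a trailing EOR applies
+	// the last (whitening) key.  Decryption lines up with this shape
+	// because the wi schedule was passed through InvMixColumns during
+	// key setup.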
+
+ // Set things up ready.
+ ldr w3, [x0, #nr]
+ add x0, x0, #\koff
+ ld1 {v0.4s}, [x1]
+ rev32 v0.16b, v0.16b
+
+ // Check the number of rounds and dispatch.
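+	// Each case peels off the rounds in excess of ten and then drops
+	// into (or branches to) the common ten-round code at label 10.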
+ cmp w3, #14
+ b.eq 14f
+ cmp w3, #10
+ b.eq 10f
+ cmp w3, #12
+ b.eq 12f
+ cmp w3, #13
+ b.eq 13f
+ cmp w3, #11
+ b.eq 11f
+ callext F(abort)
+
+ // Eleven rounds.
+11: ld1 {v16.4s}, [x0], #16
+ \aes v0.16b, v16.16b
+ \mc v0.16b, v0.16b
+ b 10f
+
+ // Twelve rounds.
+12: ld1 {v16.4s, v17.4s}, [x0], #32
+ \aes v0.16b, v16.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v17.16b
+ \mc v0.16b, v0.16b
+ b 10f
+
+ // Thirteen rounds.
+13: ld1 {v16.4s-v18.4s}, [x0], #48
+ \aes v0.16b, v16.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v17.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v18.16b
+ \mc v0.16b, v0.16b
+ b 10f
+
+ // Fourteen rounds. (Drops through to the ten round case because
+ // this is the next most common.)
+14: ld1 {v16.4s-v19.4s}, [x0], #64
+ \aes v0.16b, v16.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v17.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v18.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v19.16b
+ \mc v0.16b, v0.16b
+ // Drop through...
+
+ // Ten rounds.
+10: ld1 {v16.4s-v19.4s}, [x0], #64
+ ld1 {v20.4s-v23.4s}, [x0], #64
+ \aes v0.16b, v16.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v17.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v18.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v19.16b
+ \mc v0.16b, v0.16b
+
+ ld1 {v16.4s-v18.4s}, [x0], #48
+ \aes v0.16b, v20.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v21.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v22.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v23.16b
+ \mc v0.16b, v0.16b
+
+	// The penultimate round, then the final round (which has no
+	// MixColumns step), and lastly the final whitening key.
+ \aes v0.16b, v16.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v17.16b
+ eor v0.16b, v0.16b, v18.16b
+
+ // All done.
+ rev32 v0.16b, v0.16b
+ st1 {v0.4s}, [x2]
+ ret
+
+ ENDFUNC
+.endm
+
+ encdec eblk, aese, aesmc, w
+ encdec dblk, aesd, aesimc, wi
+
+///----- That's all, folks --------------------------------------------------