From e492db887af6561dd33aa18e3887efaeb219fd16 Mon Sep 17 00:00:00 2001 From: Mark Wooding Date: Fri, 22 Jun 2018 10:20:44 +0100 Subject: [PATCH] Add support for fancy AArch64 assembler code. It's a fun instruction set, and maybe this will improve my crypto on Raspberry Pi 3. --- base/asm-common.h | 119 ++++++++++++++++ base/dispatch.c | 14 ++ configure.ac | 3 +- symm/Makefile.am | 9 ++ symm/chacha-arm64.S | 181 +++++++++++++++++++++++++ symm/chacha.c | 7 + symm/rijndael-arm64-crypto.S | 314 +++++++++++++++++++++++++++++++++++++++++++ symm/rijndael-base.c | 7 + symm/rijndael.c | 12 ++ symm/salsa20-arm64.S | 236 ++++++++++++++++++++++++++++++++ symm/salsa20.c | 7 + 11 files changed, 908 insertions(+), 1 deletion(-) create mode 100644 symm/chacha-arm64.S create mode 100644 symm/rijndael-arm64-crypto.S create mode 100644 symm/salsa20-arm64.S diff --git a/base/asm-common.h b/base/asm-common.h index f4c4f6e3..d81e4098 100644 --- a/base/asm-common.h +++ b/base/asm-common.h @@ -1031,6 +1031,125 @@ name: #endif ///-------------------------------------------------------------------------- +/// AArch64-specific hacking. + +#if CPUFAM_ARM64 + +// Set the function hooks. +#define FUNC_PREHOOK(_) .balign 4 +#define FUNC_POSTHOOK(_) .cfi_startproc; .L$_prologue_p = -1 +#define ENDFUNC_HOOK(_) .cfi_endproc + +// Call external subroutine at ADDR, possibly via PLT. +.macro callext addr + bl \addr +.endm + +// Load address of external symbol ADDR into REG. +.macro leaext reg, addr +#if WANT_PIC + adrp \reg, :got:\addr + ldr \reg, [\reg, #:got_lo12:\addr] +#else + adrp \reg, \addr + add \reg, \reg, #:lo12:\addr +#endif +.endm + +// Stack management and unwinding. +.macro setfp fp, offset = 0 + // If you're just going through the motions with a fixed-size stack frame, + // then you want to say `add x29, sp, #OFFSET' directly, which will avoid + // pointlessly restoring sp later. + .if \offset == 0 + mov \fp, sp + .cfi_def_cfa_register \fp + .else + add \fp, sp, #\offset + .cfi_def_cfa_register \fp + .cfi_adjust_cfa_offset -\offset + .endif + .macro dropfp; _dropfp \fp, \offset; .endm + .L$_frameptr_p = -1 +.endm + +.macro _dropfp fp, offset = 0 + .if \offset == 0 + mov sp, \fp + .cfi_def_cfa_register sp + .else + sub sp, \fp, #\offset + .cfi_def_cfa_register sp + .cfi_adjust_cfa_offset +\offset + .endif + .purgem dropfp + .L$_frameptr_p = 0 +.endm + +.macro stalloc n + sub sp, sp, #\n + .cfi_adjust_cfa_offset +\n +.endm + +.macro stfree n + add sp, sp, #\n + .cfi_adjust_cfa_offset -\n +.endm + +.macro pushreg x, y= + .ifeqs "\y", "" + str \x, [sp, #-16]! + .cfi_adjust_cfa_offset +16 + .cfi_rel_offset \x, 0 + .else + stp \x, \y, [sp, #-16]! + .cfi_adjust_cfa_offset +16 + .cfi_rel_offset \x, 0 + .cfi_rel_offset \y, 8 + .endif +.endm + +.macro popreg x, y= + .ifeqs "\y", "" + ldr \x, [sp], #16 + .cfi_restore \x + .cfi_adjust_cfa_offset -16 + .else + ldp \x, \y, [sp], #16 + .cfi_restore \x + .cfi_restore \y + .cfi_adjust_cfa_offset -16 + .endif +.endm + +.macro savereg x, y, z= + .ifeqs "\z", "" + str \x, [sp, #\y] + .cfi_rel_offset \x, \y + .else + stp \x, \y, [sp, #\z] + .cfi_rel_offset \x, \z + .cfi_rel_offset \y, \z + 8 + .endif +.endm + +.macro rstrreg x, y, z= + .ifeqs "\z", "" + ldr \x, [sp, #\y] + .cfi_restore \x + .else + ldp \x, \y, [sp, #\z] + .cfi_restore \x + .cfi_restore \y + .endif +.endm + +.macro endprologue +.endm + +#endif + +///-------------------------------------------------------------------------- /// Final stuff. // Default values for the various hooks. 
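(For orientation, a minimal sketch, not part of the patch, of how the new AArch64 prologue/epilogue macros above are meant to compose inside the existing FUNC/ENDFUNC wrappers from asm-common.h; the function name is hypothetical and no particular frame layout is implied.)

	FUNC(example_arm64)		// hypothetical function name
		pushreg	x29, x30	// stp x29, x30, [sp, #-16]!, plus CFI
		setfp	x29		// mov x29, sp; CFA now tracked via x29
		endprologue		// currently a no-op in this port

		// ... function body; sp may move freely here, since the
		// unwinder follows x29 after setfp ...

		dropfp			// defined by setfp: mov sp, x29
		popreg	x29, x30	// ldp x29, x30, [sp], #16, plus CFI
		ret
	ENDFUNC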
diff --git a/base/dispatch.c b/base/dispatch.c index 50c94380..908a4e31 100644 --- a/base/dispatch.c +++ b/base/dispatch.c @@ -229,6 +229,11 @@ struct auxentry { unsigned long type; union auxval value; }; # define WANT_AT_HWCAP(_) _(AT_HWCAP, u, hwcap) #endif +#if defined(AT_HWCAP) && CPUFAM_ARM64 +# define WANT_ANY 1 +# define WANT_AT_HWCAP(_) _(AT_HWCAP, u, hwcap) +#endif + #if defined(AT_HWCAP2) && CPUFAM_ARMEL # define WANT_ANY 1 # define WANT_AT_HWCAP2(_) _(AT_HWCAP2, u, hwcap2) @@ -278,6 +283,12 @@ static unsigned hwcaps = 0; _(ARM_D32, "arm:d32") \ _(ARM_AES, "arm:aes") #endif +#if CPUFAM_ARM64 +# define WANTAUX(_) \ + WANT_AT_HWCAP(_) +# define CAPMAP(_) \ + _(ARM_AES, "arm:aes") +#endif /* Build the bitmask for `hwcaps' from the `CAPMAP' list. */ enum { @@ -391,6 +402,9 @@ static void probe_hwcaps(void) if (probed.hwcap2 & HWCAP2_AES) hw |= HF_ARM_AES; # endif #endif +#if CPUFAM_ARM64 + if (probed.hwcap & HWCAP_AES) hw |= HF_ARM_AES; +#endif /* Store the bitmask of features we probed for everyone to see. */ DISPATCH_STORE(hwcaps, hw); diff --git a/configure.ac b/configure.ac index 1643ad00..f8ad8b77 100644 --- a/configure.ac +++ b/configure.ac @@ -78,7 +78,8 @@ AC_DEFUN([catacomb_CPU_FAMILIES], $1([i[[3-6]]86,*], [x86], [sysv]) $1([x86_64,cygwin], [amd64], [win]) $1([x86_64,*], [amd64], [sysv]) - $1([arm,* | armv*,*], [armel], [std])]) + $1([arm,* | armv*,*], [armel], [std]) + $1([aarch64,*], [arm64], [std])]) dnl A utility to clear the `seen' flags, used so as to process each CPU or dnl ABI once. diff --git a/symm/Makefile.am b/symm/Makefile.am index 4441ecc4..0e56319d 100644 --- a/symm/Makefile.am +++ b/symm/Makefile.am @@ -193,6 +193,9 @@ if CPUFAM_ARMEL libsymm_la_SOURCES += rijndael-arm-crypto.S endif endif +if CPUFAM_ARM64 +libsymm_la_SOURCES += rijndael-arm64-crypto.S +endif nodist_libsymm_la_SOURCES += ../precomp/symm/rijndael-tab.c PRECOMPS += $(precomp)/symm/rijndael-tab.c PRECOMP_PROGS += rijndael-mktab @@ -477,6 +480,9 @@ endif if CPUFAM_ARMEL libsymm_la_SOURCES += salsa20-arm-neon.S endif +if CPUFAM_ARM64 +libsymm_la_SOURCES += salsa20-arm64.S +endif TESTS += salsa20.t$(EXEEXT) ALL_CIPHERS += salsa20 salsa2012 salsa208 ALL_CIPHERS += salsa20-ietf salsa2012-ietf salsa208-ietf @@ -516,6 +522,9 @@ endif if CPUFAM_ARMEL libsymm_la_SOURCES += chacha-arm-neon.S endif +if CPUFAM_ARM64 +libsymm_la_SOURCES += chacha-arm64.S +endif TESTS += chacha.t$(EXEEXT) EXTRA_DIST += t/chacha ALL_CIPHERS += chacha20 chacha12 chacha8 diff --git a/symm/chacha-arm64.S b/symm/chacha-arm64.S new file mode 100644 index 00000000..a423e9e5 --- /dev/null +++ b/symm/chacha-arm64.S @@ -0,0 +1,181 @@ +/// -*- mode: asm; asm-comment-char: ?/ -*- +/// +/// Fancy SIMD implementation of ChaCha for AArch64 +/// +/// (c) 2018 Straylight/Edgeware +/// + +///----- Licensing notice --------------------------------------------------- +/// +/// This file is part of Catacomb. +/// +/// Catacomb is free software; you can redistribute it and/or modify +/// it under the terms of the GNU Library General Public License as +/// published by the Free Software Foundation; either version 2 of the +/// License, or (at your option) any later version. +/// +/// Catacomb is distributed in the hope that it will be useful, +/// but WITHOUT ANY WARRANTY; without even the implied warranty of +/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +/// GNU Library General Public License for more details. 
+/// +/// You should have received a copy of the GNU Library General Public +/// License along with Catacomb; if not, write to the Free +/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +/// MA 02111-1307, USA. + +///-------------------------------------------------------------------------- +/// External definitions. + +#include "config.h" +#include "asm-common.h" + +///-------------------------------------------------------------------------- +/// Main.code. + + .arch armv8-a + .text + +FUNC(chacha_core_arm64) + + // Arguments are in registers. + // w0 is the number of rounds to perform + // x1 points to the input matrix + // x2 points to the output matrix + + // First job is to slurp the matrix into the SIMD registers. + // + // [ 0 1 2 3] (a, v4) + // [ 4 5 6 7] (b, v5) + // [ 8 9 10 11] (c, v6) + // [12 13 14 15] (d, v7) + // + // We need a copy for later. Rather than waste time copying them by + // hand, we'll use the three-address nature of the instruction set. + // But this means that the main loop is offset by a bit. + ld1 {v0.4s-v3.4s}, [x1] + + // a += b; d ^= a; d <<<= 16 + add v4.4s, v0.4s, v1.4s + eor v7.16b, v3.16b, v4.16b + shl v16.4s, v7.4s, #16 + ushr v7.4s, v7.4s, #16 + orr v7.16b, v7.16b, v16.16b + + // c += d; b ^= c; b <<<= 12 + add v6.4s, v2.4s, v7.4s + eor v5.16b, v1.16b, v6.16b + shl v16.4s, v5.4s, #12 + ushr v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v16.16b + +0: + // Apply (the rest of) a column quarterround to each of the columns + // simultaneously. Alas, there doesn't seem to be a packed word + // rotate, so we have to synthesize it. + + // a += b; d ^= a; d <<<= 8 + add v4.4s, v4.4s, v5.4s + eor v7.16b, v7.16b, v4.16b + shl v16.4s, v7.4s, #8 + ushr v7.4s, v7.4s, #24 + orr v7.16b, v7.16b, v16.16b + + // c += d; b ^= c; b <<<= 7 + add v6.4s, v6.4s, v7.4s + ext v7.16b, v7.16b, v7.16b, #12 + eor v5.16b, v5.16b, v6.16b + ext v6.16b, v6.16b, v6.16b, #8 + shl v16.4s, v5.4s, #7 + ushr v5.4s, v5.4s, #25 + orr v5.16b, v5.16b, v16.16b + + // The not-quite-transpose conveniently only involves reordering + // elements of individual rows, which can be done quite easily. It + // doesn't involve any movement of elements between rows, or even + // renaming of the rows. + // + // [ 0 1 2 3] [ 0 1 2 3] (a, v4) + // [ 4 5 6 7] --> [ 5 6 7 4] (b, v5) + // [ 8 9 10 11] [10 11 8 9] (c, v6) + // [12 13 14 15] [15 12 13 14] (d, v7) + // + // The reorderings have for the most part been pushed upwards to + // reduce delays. + ext v5.16b, v5.16b, v5.16b, #4 + sub w0, w0, #2 + + // Apply the diagonal quarterround to each of the columns + // simultaneously. + + // a += b; d ^= a; d <<<= 16 + add v4.4s, v4.4s, v5.4s + eor v7.16b, v7.16b, v4.16b + shl v16.4s, v7.4s, #16 + ushr v7.4s, v7.4s, #16 + orr v7.16b, v7.16b, v16.16b + + // c += d; b ^= c; b <<<= 12 + add v6.4s, v6.4s, v7.4s + eor v5.16b, v5.16b, v6.16b + shl v16.4s, v5.4s, #12 + ushr v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v16.16b + + // a += b; d ^= a; d <<<= 8 + add v4.4s, v4.4s, v5.4s + eor v7.16b, v7.16b, v4.16b + shl v16.4s, v7.4s, #8 + ushr v7.4s, v7.4s, #24 + orr v7.16b, v7.16b, v16.16b + + // c += d; b ^= c; b <<<= 7 + add v6.4s, v6.4s, v7.4s + ext v7.16b, v7.16b, v7.16b, #4 + eor v5.16b, v5.16b, v6.16b + ext v6.16b, v6.16b, v6.16b, #8 + shl v16.4s, v5.4s, #7 + ushr v5.4s, v5.4s, #25 + orr v5.16b, v5.16b, v16.16b + + // Finally finish off undoing the transpose, and we're done for this + // doubleround. Again, most of this was done above so we don't have + // to wait for the reorderings. 
+ ext v5.16b, v5.16b, v5.16b, #12 + + // Decrement the loop counter and see if we should go round again. + cbz w0, 9f + + // Do the first part of the next round because this loop is offset. + + // a += b; d ^= a; d <<<= 16 + add v4.4s, v4.4s, v5.4s + eor v7.16b, v7.16b, v4.16b + shl v16.4s, v7.4s, #16 + ushr v7.4s, v7.4s, #16 + orr v7.16b, v7.16b, v16.16b + + // c += d; b ^= c; b <<<= 12 + add v6.4s, v6.4s, v7.4s + eor v5.16b, v5.16b, v6.16b + shl v16.4s, v5.4s, #12 + ushr v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v16.16b + + b 0b + + // Almost there. Firstly the feedfoward addition. +9: add v0.4s, v0.4s, v4.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + + // And now we write out the result. + st1 {v0.4s-v3.4s}, [x2] + + // And with that, we're done. + ret + +ENDFUNC + +///----- That's all, folks -------------------------------------------------- diff --git a/symm/chacha.c b/symm/chacha.c index 2dcb1253..34198618 100644 --- a/symm/chacha.c +++ b/symm/chacha.c @@ -78,6 +78,10 @@ extern core__functype chacha_core_x86ish_sse2; extern core__functype chacha_core_arm_neon; #endif +#if CPUFAM_ARM64 +extern core__functype chacha_core_arm64; +#endif + static core__functype *pick_core(void) { #if CPUFAM_X86 || CPUFAM_AMD64 @@ -88,6 +92,9 @@ static core__functype *pick_core(void) DISPATCH_PICK_COND(chacha_core, chacha_core_arm_neon, cpu_feature_p(CPUFEAT_ARM_NEON)); #endif +#if CPUFAM_ARM64 + DISPATCH_PICK_COND(chacha_core, chacha_core_arm64, 1); +#endif DISPATCH_PICK_FALLBACK(chacha_core, simple_core); } diff --git a/symm/rijndael-arm64-crypto.S b/symm/rijndael-arm64-crypto.S new file mode 100644 index 00000000..8739c193 --- /dev/null +++ b/symm/rijndael-arm64-crypto.S @@ -0,0 +1,314 @@ +/// -*- mode: asm; asm-comment-char: ?/ -*- +/// +/// AArch64 crypto-extension-based implementation of Rijndael +/// +/// (c) 2018 Straylight/Edgeware +/// + +///----- Licensing notice --------------------------------------------------- +/// +/// This file is part of Catacomb. +/// +/// Catacomb is free software; you can redistribute it and/or modify +/// it under the terms of the GNU Library General Public License as +/// published by the Free Software Foundation; either version 2 of the +/// License, or (at your option) any later version. +/// +/// Catacomb is distributed in the hope that it will be useful, +/// but WITHOUT ANY WARRANTY; without even the implied warranty of +/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +/// GNU Library General Public License for more details. +/// +/// You should have received a copy of the GNU Library General Public +/// License along with Catacomb; if not, write to the Free +/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +/// MA 02111-1307, USA. + +///-------------------------------------------------------------------------- +/// External definitions. + +#include "config.h" +#include "asm-common.h" + + .extern F(abort) + .extern F(rijndael_rcon) + +///-------------------------------------------------------------------------- +/// Main code. + + .arch armv8-a+crypto + +/// The ARM crypto extension implements a little-endian version of AES +/// (though the manual doesn't actually spell this out and you have to +/// experiment), but Catacomb's internal interface presents as big-endian so +/// as to work better with things like GCM. We therefore maintain the round +/// keys in little-endian form, and have to end-swap blocks in and out. 
+/// +/// For added amusement, the crypto extension doesn't implement the larger- +/// block versions of Rijndael, so we have to end-swap the keys if we're +/// preparing for one of those. + + // Useful constants. + .equ maxrounds, 16 // maximum number of rounds + .equ maxblksz, 32 // maximum block size, in bytes + .equ kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer + + // Context structure. + .equ nr, 0 // number of rounds + .equ w, nr + 4 // encryption key words + .equ wi, w + kbufsz // decryption key words + +///-------------------------------------------------------------------------- +/// Key setup. + +FUNC(rijndael_setup_arm64_crypto) + + // Arguments: + // x0 = pointer to context + // w1 = block size in 32-bit words + // x2 = pointer to key material + // x3 = key size in words + + pushreg x29, x30 + mov x29, sp + + // The initial round key material is taken directly from the input + // key, so copy it over. Unfortunately, the key material is not + // guaranteed to be aligned in any especially useful way. Assume + // that alignment traps are not enabled. (Why would they be? On + // A32, alignment traps were part of a transition plan which changed + // the way unaligned loads and stores behaved, but there's never been + // any other behaviour on A64.) + mov x15, x3 + add x4, x0, #w +0: sub x15, x15, #1 + ldr w14, [x2], #4 + str w14, [x4], #4 + cbnz x15, 0b + + // Find out other useful things and prepare for the main loop. +9: ldr w9, [x0, #nr] // number of rounds + madd w2, w1, w9, w1 // total key size in words + leaext x5, rijndael_rcon // round constants + sub x6, x2, x3 // minus what we've copied already + add x7, x0, #w // position in previous cycle + movi v1.4s, #0 // all-zero register for the key + mov x8, #0 // position in current cycle + + // Main key expansion loop. Dispatch according to the position in + // the cycle. +0: ldr w15, [x7], #4 // word from previous cycle + cbz x8, 1f // first word of the cycle? + cmp x8, #4 // fourth word of the cycle? + b.ne 2f + cmp x3, #7 // seven or eight words of key? + b.cc 2f + + // Fourth word of the cycle, seven or eight words of key. We must do + // the byte substitution. + dup v0.4s, w14 + aese v0.16b, v1.16b // effectively, just SubBytes + mov w14, v0.4s[0] + b 2f + + // First word of the cycle. Byte substitution, rotation, and round + // constant. +1: ldrb w13, [x5], #1 // next round constant + dup v0.4s, w14 + aese v0.16b, v1.16b // effectively, just SubBytes + mov w14, v0.4s[0] + eor w14, w13, w14, ror #8 + + // Common ending: mix in the word from the previous cycle and store. +2: eor w14, w14, w15 + str w14, [x4], #4 + + // Prepare for the next iteration. If we're done, then stop; if + // we've finished a cycle then reset the counter. + add x8, x8, #1 + sub x6, x6, #1 + cmp x8, x3 + cbz x6, 9f + csel x8, x8, xzr, cc + b 0b + + // Next job is to construct the decryption keys. The keys for the + // first and last rounds don't need to be mangled, but the remaining + // ones do -- and they all need to be reordered too. + // + // The plan of action, then, is to copy the final encryption round's + // keys into place first, then to do each of the intermediate rounds + // in reverse order, and finally do the first round. + // + // Do all the heavy lifting with the vector registers. 
The order + // we're doing this in means that it's OK if we read or write too + // much, and there's easily enough buffer space for the + // over-enthusiastic reads and writes because the context has space + // for 32-byte blocks, which is our maximum and an exact fit for two + // full-width registers. +9: add x5, x0, #wi + add x4, x0, #w + add x4, x4, w2, uxtw #2 + sub x4, x4, w1, uxtw #2 // last round's keys + + // Copy the last encryption round's keys. + ld1 {v0.4s, v1.4s}, [x4] + st1 {v0.4s, v1.4s}, [x5] + + // Update the loop variables and stop if we've finished. +0: sub w9, w9, #1 + add x5, x5, w1, uxtw #2 + sub x4, x4, w1, uxtw #2 + cbz w9, 9f + + // Do another middle round's keys... + ld1 {v0.4s, v1.4s}, [x4] + aesimc v0.16b, v0.16b + aesimc v1.16b, v1.16b + st1 {v0.4s, v1.4s}, [x5] + b 0b + + // Finally do the first encryption round. +9: ld1 {v0.4s, v1.4s}, [x4] + st1 {v0.4s, v1.4s}, [x5] + + // If the block size is not exactly four words then we must end-swap + // everything. We can use fancy vector toys for this. + cmp w1, #4 + b.eq 9f + + // End-swap the encryption keys. + add x1, x0, #w + bl endswap_block + + // And the decryption keys + add x1, x0, #wi + bl endswap_block + + // All done. +9: popreg x29, x30 + ret + +ENDFUNC + +INTFUNC(endswap_block) + // End-swap w2 words starting at x1. x1 is clobbered; w2 is not. + // It's OK to work in 16-byte chunks. + + mov w3, w2 +0: subs w3, w3, #4 + ld1 {v0.4s}, [x1] + rev32 v0.16b, v0.16b + st1 {v0.4s}, [x1], #16 + b.hi 0b + ret + +ENDFUNC + +///-------------------------------------------------------------------------- +/// Encrypting and decrypting blocks. + +.macro encdec op, aes, mc, koff + FUNC(rijndael_\op\()_arm64_crypto) + + // Arguments: + // x0 = pointer to context + // x1 = pointer to input block + // x2 = pointer to output block + + // Set things up ready. + ldr w3, [x0, #nr] + add x0, x0, #\koff + ld1 {v0.4s}, [x1] + rev32 v0.16b, v0.16b + + // Check the number of rounds and dispatch. + cmp w3, #14 + b.eq 14f + cmp w3, #10 + b.eq 10f + cmp w3, #12 + b.eq 12f + cmp w3, #13 + b.eq 13f + cmp w3, #11 + b.eq 11f + callext F(abort) + + // Eleven rounds. +11: ld1 {v16.4s}, [x0], #16 + \aes v0.16b, v16.16b + \mc v0.16b, v0.16b + b 10f + + // Twelve rounds. +12: ld1 {v16.4s, v17.4s}, [x0], #32 + \aes v0.16b, v16.16b + \mc v0.16b, v0.16b + \aes v0.16b, v17.16b + \mc v0.16b, v0.16b + b 10f + + // Thirteen rounds. +13: ld1 {v16.4s-v18.4s}, [x0], #48 + \aes v0.16b, v16.16b + \mc v0.16b, v0.16b + \aes v0.16b, v17.16b + \mc v0.16b, v0.16b + \aes v0.16b, v18.16b + \mc v0.16b, v0.16b + b 10f + + // Fourteen rounds. (Drops through to the ten round case because + // this is the next most common.) +14: ld1 {v16.4s-v19.4s}, [x0], #64 + \aes v0.16b, v16.16b + \mc v0.16b, v0.16b + \aes v0.16b, v17.16b + \mc v0.16b, v0.16b + \aes v0.16b, v18.16b + \mc v0.16b, v0.16b + \aes v0.16b, v19.16b + \mc v0.16b, v0.16b + // Drop through... + + // Ten rounds. +10: ld1 {v16.4s-v19.4s}, [x0], #64 + ld1 {v20.4s-v23.4s}, [x0], #64 + \aes v0.16b, v16.16b + \mc v0.16b, v0.16b + \aes v0.16b, v17.16b + \mc v0.16b, v0.16b + \aes v0.16b, v18.16b + \mc v0.16b, v0.16b + \aes v0.16b, v19.16b + \mc v0.16b, v0.16b + + ld1 {v16.4s-v18.4s}, [x0], #48 + \aes v0.16b, v20.16b + \mc v0.16b, v0.16b + \aes v0.16b, v21.16b + \mc v0.16b, v0.16b + \aes v0.16b, v22.16b + \mc v0.16b, v0.16b + \aes v0.16b, v23.16b + \mc v0.16b, v0.16b + + // Final round has no MixColumns, but is followed by final whitening. 
+ \aes v0.16b, v16.16b + \mc v0.16b, v0.16b + \aes v0.16b, v17.16b + eor v0.16b, v0.16b, v18.16b + + // All done. + rev32 v0.16b, v0.16b + st1 {v0.4s}, [x2] + ret + + ENDFUNC +.endm + + encdec eblk, aese, aesmc, w + encdec dblk, aesd, aesimc, wi + +///----- That's all, folks -------------------------------------------------- diff --git a/symm/rijndael-base.c b/symm/rijndael-base.c index 01f781df..83a49e92 100644 --- a/symm/rijndael-base.c +++ b/symm/rijndael-base.c @@ -122,6 +122,9 @@ extern setup__functype rijndael_setup_x86ish_aesni; #if CPUFAM_ARMEL && HAVE_AS_ARMV8_CRYPTO extern setup__functype rijndael_setup_arm_crypto; #endif +#if CPUFAM_ARM64 +extern setup__functype rijndael_setup_arm64_crypto; +#endif static setup__functype *pick_setup(void) { @@ -133,6 +136,10 @@ static setup__functype *pick_setup(void) DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_arm_crypto, cpu_feature_p(CPUFEAT_ARM_AES)); #endif +#if CPUFAM_ARM64 + DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_arm64_crypto, + cpu_feature_p(CPUFEAT_ARM_AES)); +#endif DISPATCH_PICK_FALLBACK(rijndael_setup, simple_setup); } diff --git a/symm/rijndael.c b/symm/rijndael.c index 4c8837d2..02cfb76b 100644 --- a/symm/rijndael.c +++ b/symm/rijndael.c @@ -88,6 +88,10 @@ extern rijndael_dblk__functype rijndael_dblk_x86ish_aesni; extern rijndael_eblk__functype rijndael_eblk_arm_crypto; extern rijndael_dblk__functype rijndael_dblk_arm_crypto; #endif +#if CPUFAM_ARM64 +extern rijndael_eblk__functype rijndael_eblk_arm64_crypto; +extern rijndael_dblk__functype rijndael_dblk_arm64_crypto; +#endif static rijndael_eblk__functype *pick_eblk(void) { @@ -99,6 +103,10 @@ static rijndael_eblk__functype *pick_eblk(void) DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_arm_crypto, cpu_feature_p(CPUFEAT_ARM_AES)); #endif +#if CPUFAM_ARM64 + DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_arm64_crypto, + cpu_feature_p(CPUFEAT_ARM_AES)); +#endif DISPATCH_PICK_FALLBACK(rijndael_eblk, simple_eblk); } @@ -112,6 +120,10 @@ static rijndael_dblk__functype *pick_dblk(void) DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_arm_crypto, cpu_feature_p(CPUFEAT_ARM_AES)); #endif +#if CPUFAM_ARM64 + DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_arm64_crypto, + cpu_feature_p(CPUFEAT_ARM_AES)); +#endif DISPATCH_PICK_FALLBACK(rijndael_dblk, simple_dblk); } diff --git a/symm/salsa20-arm64.S b/symm/salsa20-arm64.S new file mode 100644 index 00000000..821548e1 --- /dev/null +++ b/symm/salsa20-arm64.S @@ -0,0 +1,236 @@ +/// -*- mode: asm; asm-comment-char: ?/ -*- +/// +/// Fancy SIMD implementation of Salsa20 for AArch64 +/// +/// (c) 2018 Straylight/Edgeware +/// + +///----- Licensing notice --------------------------------------------------- +/// +/// This file is part of Catacomb. +/// +/// Catacomb is free software; you can redistribute it and/or modify +/// it under the terms of the GNU Library General Public License as +/// published by the Free Software Foundation; either version 2 of the +/// License, or (at your option) any later version. +/// +/// Catacomb is distributed in the hope that it will be useful, +/// but WITHOUT ANY WARRANTY; without even the implied warranty of +/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +/// GNU Library General Public License for more details. +/// +/// You should have received a copy of the GNU Library General Public +/// License along with Catacomb; if not, write to the Free +/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +/// MA 02111-1307, USA. 
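(An aside, not part of the patch: scalar AArch64 does have a word rotate, so a single Salsa20 quarter-round on four 32-bit words in general registers can fold each left-rotate into the ROR-shifted operand of EOR, as sketched below; the w4-w7/w16 register assignment is purely illustrative. The vector code in the rest of this file has to synthesize the same rotates from shl/ushr/orr because there is no packed word-rotate instruction.)

	// One Salsa20 quarter-round: a, b, c, d in w4-w7; w16 is scratch.
	// A left-rotate by n is an ROR by 32 - n.
	add	w16, w4, w7		// b ^= (a + d) <<< 7
	eor	w5, w5, w16, ror #25
	add	w16, w5, w4		// c ^= (b + a) <<< 9
	eor	w6, w6, w16, ror #23
	add	w16, w6, w5		// d ^= (c + b) <<< 13
	eor	w7, w7, w16, ror #19
	add	w16, w7, w6		// a ^= (d + c) <<< 18
	eor	w4, w4, w16, ror #14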
+ +///-------------------------------------------------------------------------- +/// External definitions. + +#include "config.h" +#include "asm-common.h" + +///-------------------------------------------------------------------------- +/// Main.code. + + .arch armv8-a + .text + +FUNC(salsa20_core_arm64) + + // Arguments are in registers. + // w0 is the number of rounds to perform + // x1 points to the input matrix + // x2 points to the output matrix + + // First job is to slurp the matrix into the SIMD registers. The + // words have already been permuted conveniently to make them line up + // better for SIMD processing. + // + // The textbook arrangement of the matrix is this. + // + // [C K K K] + // [K C N N] + // [T T C K] + // [K K K C] + // + // But we've rotated the columns up so that the main diagonal with + // the constants on it end up in the first row, giving something more + // like + // + // [C C C C] + // [K T K K] + // [T K K N] + // [K K N K] + // + // so the transformation looks like this: + // + // [ 0 1 2 3] [ 0 5 10 15] (a, v4) + // [ 4 5 6 7] --> [ 4 9 14 3] (b, v5) + // [ 8 9 10 11] [ 8 13 2 7] (c, v6) + // [12 13 14 15] [12 1 6 11] (d, v7) + // + // We need a copy for later. Rather than waste time copying them by + // hand, we'll use the three-address nature of the instruction set. + // But this means that the main loop is offset by a bit. + ld1 {v0.4s-v3.4s}, [x1] + + // Apply a column quarterround to each of the columns simultaneously, + // moving the results to their working registers. Alas, there + // doesn't seem to be a packed word rotate, so we have to synthesize + // it. + + // b ^= (a + d) <<< 7 + add v16.4s, v0.4s, v3.4s + shl v17.4s, v16.4s, #7 + ushr v16.4s, v16.4s, #25 + orr v16.16b, v16.16b, v17.16b + eor v5.16b, v1.16b, v16.16b + + // c ^= (b + a) <<< 9 + add v16.4s, v5.4s, v0.4s + shl v17.4s, v16.4s, #9 + ushr v16.4s, v16.4s, #23 + orr v16.16b, v16.16b, v17.16b + eor v6.16b, v2.16b, v16.16b + + // d ^= (c + b) <<< 13 + add v16.4s, v6.4s, v5.4s + ext v5.16b, v5.16b, v5.16b, #12 + shl v17.4s, v16.4s, #13 + ushr v16.4s, v16.4s, #19 + orr v16.16b, v16.16b, v17.16b + eor v7.16b, v3.16b, v16.16b + + // a ^= (d + c) <<< 18 + add v16.4s, v7.4s, v6.4s + ext v6.16b, v6.16b, v6.16b, #8 + ext v7.16b, v7.16b, v7.16b, #4 + shl v17.4s, v16.4s, #18 + ushr v16.4s, v16.4s, #14 + orr v16.16b, v16.16b, v17.16b + eor v4.16b, v0.16b, v16.16b + +0: + // The transpose conveniently only involves reordering elements of + // individual rows, which can be done quite easily, and reordering + // the rows themselves, which is a trivial renaming. It doesn't + // involve any movement of elements between rows. + // + // [ 0 5 10 15] [ 0 5 10 15] (a, v4) + // [ 4 9 14 3] --> [ 1 6 11 12] (b, v7) + // [ 8 13 2 7] [ 2 7 8 13] (c, v6) + // [12 1 6 11] [ 3 4 9 14] (d, v5) + // + // The reorderings have been pushed upwards to reduce delays. + sub w0, w0, #2 + + // Apply the row quarterround to each of the columns (yes!) + // simultaneously. 
+ + // b ^= (a + d) <<< 7 + add v16.4s, v4.4s, v5.4s + shl v17.4s, v16.4s, #7 + ushr v16.4s, v16.4s, #25 + orr v16.16b, v16.16b, v17.16b + eor v7.16b, v7.16b, v16.16b + + // c ^= (b + a) <<< 9 + add v16.4s, v7.4s, v4.4s + shl v17.4s, v16.4s, #9 + ushr v16.4s, v16.4s, #23 + orr v16.16b, v16.16b, v17.16b + eor v6.16b, v6.16b, v16.16b + + // d ^= (c + b) <<< 13 + add v16.4s, v6.4s, v7.4s + ext v7.16b, v7.16b, v7.16b, #12 + shl v17.4s, v16.4s, #13 + ushr v16.4s, v16.4s, #19 + orr v16.16b, v16.16b, v17.16b + eor v5.16b, v5.16b, v16.16b + + // a ^= (d + c) <<< 18 + add v16.4s, v5.4s, v6.4s + ext v6.16b, v6.16b, v6.16b, #8 + ext v5.16b, v5.16b, v5.16b, #4 + shl v17.4s, v16.4s, #18 + ushr v16.4s, v16.4s, #14 + orr v16.16b, v16.16b, v17.16b + eor v4.16b, v4.16b, v16.16b + + // We had to undo the transpose ready for the next loop. Again, push + // back the reorderings to reduce latency. Decrement the loop + // counter and see if we should go round again. + cbz w0, 9f + + // Do the first half of the next round because this loop is offset. + + // b ^= (a + d) <<< 7 + add v16.4s, v4.4s, v7.4s + shl v17.4s, v16.4s, #7 + ushr v16.4s, v16.4s, #25 + orr v16.16b, v16.16b, v17.16b + eor v5.16b, v5.16b, v16.16b + + // c ^= (b + a) <<< 9 + add v16.4s, v5.4s, v4.4s + shl v17.4s, v16.4s, #9 + ushr v16.4s, v16.4s, #23 + orr v16.16b, v16.16b, v17.16b + eor v6.16b, v6.16b, v16.16b + + // d ^= (c + b) <<< 13 + add v16.4s, v6.4s, v5.4s + ext v5.16b, v5.16b, v5.16b, #12 + shl v17.4s, v16.4s, #13 + ushr v16.4s, v16.4s, #19 + orr v16.16b, v16.16b, v17.16b + eor v7.16b, v7.16b, v16.16b + + // a ^= (d + c) <<< 18 + add v16.4s, v7.4s, v6.4s + ext v6.16b, v6.16b, v6.16b, #8 + ext v7.16b, v7.16b, v7.16b, #4 + shl v17.4s, v16.4s, #18 + ushr v16.4s, v16.4s, #14 + orr v16.16b, v16.16b, v17.16b + eor v4.16b, v4.16b, v16.16b + + b 0b + + // Almost there. Firstly the feedfoward addition. Also, establish + // constants which will be useful later. +9: add v0.4s, v0.4s, v4.4s // 0, 5, 10, 15 + movi v16.2d, #0xffffffff // = (-1, 0, -1, 0) + movi d17, #-1 // = (-1, -1, 0, 0) + add v1.4s, v1.4s, v5.4s // 4, 9, 14, 3 + add v2.4s, v2.4s, v6.4s // 8, 13, 2, 7 + add v3.4s, v3.4s, v7.4s // 12, 1, 6, 11 + + // Next we must undo the permutation which was already applied to the + // input. The core trick is from Dan Bernstein's `armneon3' + // implementation, but with a lot of liposuction. + mov v4.16b, v0.16b + + // Sort out the columns by pairs. + bif v0.16b, v3.16b, v16.16b // 0, 1, 10, 11 + bif v3.16b, v2.16b, v16.16b // 12, 13, 6, 7 + bif v2.16b, v1.16b, v16.16b // 8, 9, 2, 3 + bif v1.16b, v4.16b, v16.16b // 4, 5, 14, 15 + mov v4.16b, v0.16b + mov v5.16b, v3.16b + + // Now fix up the remaining discrepancies. + bif v0.16b, v2.16b, v17.16b // 0, 1, 2, 3 + bif v3.16b, v1.16b, v17.16b // 12, 13, 14, 15 + bif v2.16b, v4.16b, v17.16b // 8, 9, 10, 11 + bif v1.16b, v5.16b, v17.16b // 4, 5, 6, 7 + + // And with that, we're done. 
+ st1 {v0.4s-v3.4s}, [x2] + ret + +ENDFUNC + +///----- That's all, folks -------------------------------------------------- diff --git a/symm/salsa20.c b/symm/salsa20.c index ff6efe39..03fcf469 100644 --- a/symm/salsa20.c +++ b/symm/salsa20.c @@ -78,6 +78,10 @@ extern core__functype salsa20_core_x86ish_sse2; extern core__functype salsa20_core_arm_neon; #endif +#if CPUFAM_ARM64 +extern core__functype salsa20_core_arm64; +#endif + static core__functype *pick_core(void) { #if CPUFAM_X86 || CPUFAM_AMD64 @@ -88,6 +92,9 @@ static core__functype *pick_core(void) DISPATCH_PICK_COND(salsa20_core, salsa20_core_arm_neon, cpu_feature_p(CPUFEAT_ARM_NEON)); #endif +#if CPUFAM_ARM64 + DISPATCH_PICK_COND(salsa20_core, salsa20_core_arm64, 1); +#endif DISPATCH_PICK_FALLBACK(salsa20_core, simple_core); } -- 2.11.0
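(A final aside, not part of the patch: for comparison with the SIMD code in chacha-arm64.S above, here is the ChaCha quarter-round written out for a single column in scalar registers; w4-w7 stand for one column's a, b, c, d and the assignment is illustrative. Each shl/ushr/orr triple in the vector code corresponds to a single ror here, since a left-rotate by n of a 32-bit word is an ROR by 32 - n.)

	// One ChaCha quarter-round on a single column: a, b, c, d in w4-w7.
	add	w4, w4, w5		// a += b; d ^= a; d <<<= 16
	eor	w7, w7, w4
	ror	w7, w7, #16
	add	w6, w6, w7		// c += d; b ^= c; b <<<= 12
	eor	w5, w5, w6
	ror	w5, w5, #20
	add	w4, w4, w5		// a += b; d ^= a; d <<<= 8
	eor	w7, w7, w4
	ror	w7, w7, #24
	add	w6, w6, w7		// c += d; b ^= c; b <<<= 7
	eor	w5, w5, w6
	ror	w5, w5, #25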