#endif
///--------------------------------------------------------------------------
+/// AArch64-specific hacking.
+
+#if CPUFAM_ARM64
+
+// Set the function hooks.
+#define FUNC_PREHOOK(_) .balign 4
+#define FUNC_POSTHOOK(_) .cfi_startproc; .L$_prologue_p = -1
+#define ENDFUNC_HOOK(_) .cfi_endproc
+
+// Call external subroutine at ADDR, possibly via PLT.
+.macro callext addr
+ bl \addr
+.endm
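+// (Example: `callext F(abort)' to call the C library's abort function.)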
+
+// Load address of external symbol ADDR into REG.
+.macro leaext reg, addr
+#if WANT_PIC
+ adrp \reg, :got:\addr
+ ldr \reg, [\reg, #:got_lo12:\addr]
+#else
+ adrp \reg, \addr
+ add \reg, \reg, #:lo12:\addr
+#endif
+.endm
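+// (Example: `leaext x5, rijndael_rcon' loads the address of the
+// round-constant table into x5.)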
+
+// Stack management and unwinding.
+.macro setfp fp, offset = 0
+ // If you're just going through the motions with a fixed-size stack frame,
+ // then you want to say `add x29, sp, #OFFSET' directly, which will avoid
+ // pointlessly restoring sp later.
+ .if \offset == 0
+ mov \fp, sp
+ .cfi_def_cfa_register \fp
+ .else
+ add \fp, sp, #\offset
+ .cfi_def_cfa_register \fp
+ .cfi_adjust_cfa_offset -\offset
+ .endif
+ .macro dropfp; _dropfp \fp, \offset; .endm
+ .L$_frameptr_p = -1
+.endm
+
+.macro _dropfp fp, offset = 0
+ .if \offset == 0
+ mov sp, \fp
+ .cfi_def_cfa_register sp
+ .else
+ sub sp, \fp, #\offset
+ .cfi_def_cfa_register sp
+ .cfi_adjust_cfa_offset +\offset
+ .endif
+ .purgem dropfp
+ .L$_frameptr_p = 0
+.endm
+
+.macro stalloc n
+ sub sp, sp, #\n
+ .cfi_adjust_cfa_offset +\n
+.endm
+
+.macro stfree n
+ add sp, sp, #\n
+ .cfi_adjust_cfa_offset -\n
+.endm
+
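+// Push and pop registers, with unwind annotations.  Each push consumes a
+// full 16-byte slot, even for a single register, since AArch64 insists
+// that sp stay 16-byte aligned whenever it's used to access memory.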
+.macro pushreg x, y=
+ .ifeqs "\y", ""
+ str \x, [sp, #-16]!
+ .cfi_adjust_cfa_offset +16
+ .cfi_rel_offset \x, 0
+ .else
+ stp \x, \y, [sp, #-16]!
+ .cfi_adjust_cfa_offset +16
+ .cfi_rel_offset \x, 0
+ .cfi_rel_offset \y, 8
+ .endif
+.endm
+
+.macro popreg x, y=
+ .ifeqs "\y", ""
+ ldr \x, [sp], #16
+ .cfi_restore \x
+ .cfi_adjust_cfa_offset -16
+ .else
+ ldp \x, \y, [sp], #16
+ .cfi_restore \x
+ .cfi_restore \y
+ .cfi_adjust_cfa_offset -16
+ .endif
+.endm
+
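+// Save and restore registers at fixed offsets from sp, for use once stack
+// space has already been allocated: `savereg R, OFF' stores R at
+// [sp, #OFF], and `savereg R0, R1, OFF' stores the pair; `rstrreg' takes
+// the same arguments and loads them back.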
+.macro savereg x, y, z=
+ .ifeqs "\z", ""
+ str \x, [sp, #\y]
+ .cfi_rel_offset \x, \y
+ .else
+ stp \x, \y, [sp, #\z]
+ .cfi_rel_offset \x, \z
+ .cfi_rel_offset \y, \z + 8
+ .endif
+.endm
+
+.macro rstrreg x, y, z=
+ .ifeqs "\z", ""
+ ldr \x, [sp, #\y]
+ .cfi_restore \x
+ .else
+ ldp \x, \y, [sp, #\z]
+ .cfi_restore \x
+ .cfi_restore \y
+ .endif
+.endm
+
+.macro endprologue
+.endm
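+
+// A purely illustrative sketch (no function in this patch is laid out
+// exactly like this) of how the pieces above fit together:
+//
+//	FUNC(example)
+//		pushreg	x29, x30
+//		setfp	x29
+//		endprologue
+//		...
+//		dropfp
+//		popreg	x29, x30
+//		ret
+//	ENDFUNC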
+
+#endif
+
+///--------------------------------------------------------------------------
/// Final stuff.
// Default values for the various hooks.
# define WANT_AT_HWCAP(_) _(AT_HWCAP, u, hwcap)
#endif
+#if defined(AT_HWCAP) && CPUFAM_ARM64
+# define WANT_ANY 1
+# define WANT_AT_HWCAP(_) _(AT_HWCAP, u, hwcap)
+#endif
+
#if defined(AT_HWCAP2) && CPUFAM_ARMEL
# define WANT_ANY 1
# define WANT_AT_HWCAP2(_) _(AT_HWCAP2, u, hwcap2)
_(ARM_D32, "arm:d32") \
_(ARM_AES, "arm:aes")
#endif
+#if CPUFAM_ARM64
+# define WANTAUX(_) \
+ WANT_AT_HWCAP(_)
+# define CAPMAP(_) \
+ _(ARM_AES, "arm:aes")
+#endif
/* Build the bitmask for `hwcaps' from the `CAPMAP' list. */
enum {
if (probed.hwcap2 & HWCAP2_AES) hw |= HF_ARM_AES;
# endif
#endif
+#if CPUFAM_ARM64
+ if (probed.hwcap & HWCAP_AES) hw |= HF_ARM_AES;
+#endif
/* Store the bitmask of features we probed for everyone to see. */
DISPATCH_STORE(hwcaps, hw);
$1([i[[3-6]]86,*], [x86], [sysv])
$1([x86_64,cygwin], [amd64], [win])
$1([x86_64,*], [amd64], [sysv])
- $1([arm,* | armv*,*], [armel], [std])])
+ $1([arm,* | armv*,*], [armel], [std])
+ $1([aarch64,*], [arm64], [std])])
dnl A utility to clear the `seen' flags, used so as to process each CPU or
dnl ABI once.
libsymm_la_SOURCES += rijndael-arm-crypto.S
endif
endif
+if CPUFAM_ARM64
+libsymm_la_SOURCES += rijndael-arm64-crypto.S
+endif
nodist_libsymm_la_SOURCES += ../precomp/symm/rijndael-tab.c
PRECOMPS += $(precomp)/symm/rijndael-tab.c
PRECOMP_PROGS += rijndael-mktab
if CPUFAM_ARMEL
libsymm_la_SOURCES += salsa20-arm-neon.S
endif
+if CPUFAM_ARM64
+libsymm_la_SOURCES += salsa20-arm64.S
+endif
TESTS += salsa20.t$(EXEEXT)
ALL_CIPHERS += salsa20 salsa2012 salsa208
ALL_CIPHERS += salsa20-ietf salsa2012-ietf salsa208-ietf
if CPUFAM_ARMEL
libsymm_la_SOURCES += chacha-arm-neon.S
endif
+if CPUFAM_ARM64
+libsymm_la_SOURCES += chacha-arm64.S
+endif
TESTS += chacha.t$(EXEEXT)
EXTRA_DIST += t/chacha
ALL_CIPHERS += chacha20 chacha12 chacha8
--- /dev/null
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// Fancy SIMD implementation of ChaCha for AArch64
+///
+/// (c) 2018 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+ .arch armv8-a
+ .text
+
+FUNC(chacha_core_arm64)
+
+ // Arguments are in registers.
+ // w0 is the number of rounds to perform
+ // x1 points to the input matrix
+ // x2 points to the output matrix
+
+ // First job is to slurp the matrix into the SIMD registers.
+ //
+ // [ 0 1 2 3] (a, v4)
+ // [ 4 5 6 7] (b, v5)
+ // [ 8 9 10 11] (c, v6)
+ // [12 13 14 15] (d, v7)
+ //
+ // We need a copy for later. Rather than waste time copying them by
+ // hand, we'll use the three-address nature of the instruction set.
+ // But this means that the main loop is offset by a bit.
+ ld1 {v0.4s-v3.4s}, [x1]
+
+ // a += b; d ^= a; d <<<= 16
+ add v4.4s, v0.4s, v1.4s
+ eor v7.16b, v3.16b, v4.16b
+ shl v16.4s, v7.4s, #16
+ ushr v7.4s, v7.4s, #16
+ orr v7.16b, v7.16b, v16.16b
+
+ // c += d; b ^= c; b <<<= 12
+ add v6.4s, v2.4s, v7.4s
+ eor v5.16b, v1.16b, v6.16b
+ shl v16.4s, v5.4s, #12
+ ushr v5.4s, v5.4s, #20
+ orr v5.16b, v5.16b, v16.16b
+
+0:
+ // Apply (the rest of) a column quarterround to each of the columns
+ // simultaneously. Alas, there doesn't seem to be a packed word
+ // rotate, so we have to synthesize it.
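+	// Throughout, a rotate x <<<= n is synthesized as three instructions:
+	// a shl of the word by n into a scratch register, a ushr of the
+	// original by 32 - n, and an orr to combine the two halves.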
+
+ // a += b; d ^= a; d <<<= 8
+ add v4.4s, v4.4s, v5.4s
+ eor v7.16b, v7.16b, v4.16b
+ shl v16.4s, v7.4s, #8
+ ushr v7.4s, v7.4s, #24
+ orr v7.16b, v7.16b, v16.16b
+
+ // c += d; b ^= c; b <<<= 7
+ add v6.4s, v6.4s, v7.4s
+ ext v7.16b, v7.16b, v7.16b, #12
+ eor v5.16b, v5.16b, v6.16b
+ ext v6.16b, v6.16b, v6.16b, #8
+ shl v16.4s, v5.4s, #7
+ ushr v5.4s, v5.4s, #25
+ orr v5.16b, v5.16b, v16.16b
+
+ // The not-quite-transpose conveniently only involves reordering
+ // elements of individual rows, which can be done quite easily. It
+ // doesn't involve any movement of elements between rows, or even
+ // renaming of the rows.
+ //
+ // [ 0 1 2 3] [ 0 1 2 3] (a, v4)
+ // [ 4 5 6 7] --> [ 5 6 7 4] (b, v5)
+ // [ 8 9 10 11] [10 11 8 9] (c, v6)
+ // [12 13 14 15] [15 12 13 14] (d, v7)
+ //
+ // The reorderings have for the most part been pushed upwards to
+ // reduce delays.
+ ext v5.16b, v5.16b, v5.16b, #4
+ sub w0, w0, #2
+
+ // Apply the diagonal quarterround to each of the columns
+ // simultaneously.
+
+ // a += b; d ^= a; d <<<= 16
+ add v4.4s, v4.4s, v5.4s
+ eor v7.16b, v7.16b, v4.16b
+ shl v16.4s, v7.4s, #16
+ ushr v7.4s, v7.4s, #16
+ orr v7.16b, v7.16b, v16.16b
+
+ // c += d; b ^= c; b <<<= 12
+ add v6.4s, v6.4s, v7.4s
+ eor v5.16b, v5.16b, v6.16b
+ shl v16.4s, v5.4s, #12
+ ushr v5.4s, v5.4s, #20
+ orr v5.16b, v5.16b, v16.16b
+
+ // a += b; d ^= a; d <<<= 8
+ add v4.4s, v4.4s, v5.4s
+ eor v7.16b, v7.16b, v4.16b
+ shl v16.4s, v7.4s, #8
+ ushr v7.4s, v7.4s, #24
+ orr v7.16b, v7.16b, v16.16b
+
+ // c += d; b ^= c; b <<<= 7
+ add v6.4s, v6.4s, v7.4s
+ ext v7.16b, v7.16b, v7.16b, #4
+ eor v5.16b, v5.16b, v6.16b
+ ext v6.16b, v6.16b, v6.16b, #8
+ shl v16.4s, v5.4s, #7
+ ushr v5.4s, v5.4s, #25
+ orr v5.16b, v5.16b, v16.16b
+
+ // Finally finish off undoing the transpose, and we're done for this
+ // doubleround. Again, most of this was done above so we don't have
+ // to wait for the reorderings.
+ ext v5.16b, v5.16b, v5.16b, #12
+
+	// The loop counter was decremented above; see if we should go round
+	// again.
+ cbz w0, 9f
+
+ // Do the first part of the next round because this loop is offset.
+
+ // a += b; d ^= a; d <<<= 16
+ add v4.4s, v4.4s, v5.4s
+ eor v7.16b, v7.16b, v4.16b
+ shl v16.4s, v7.4s, #16
+ ushr v7.4s, v7.4s, #16
+ orr v7.16b, v7.16b, v16.16b
+
+ // c += d; b ^= c; b <<<= 12
+ add v6.4s, v6.4s, v7.4s
+ eor v5.16b, v5.16b, v6.16b
+ shl v16.4s, v5.4s, #12
+ ushr v5.4s, v5.4s, #20
+ orr v5.16b, v5.16b, v16.16b
+
+ b 0b
+
+	// Almost there. Firstly the feedforward addition.
+9: add v0.4s, v0.4s, v4.4s
+ add v1.4s, v1.4s, v5.4s
+ add v2.4s, v2.4s, v6.4s
+ add v3.4s, v3.4s, v7.4s
+
+ // And now we write out the result.
+ st1 {v0.4s-v3.4s}, [x2]
+
+ // And with that, we're done.
+ ret
+
+ENDFUNC
+
+///----- That's all, folks --------------------------------------------------
extern core__functype chacha_core_arm_neon;
#endif
+#if CPUFAM_ARM64
+extern core__functype chacha_core_arm64;
+#endif
+
static core__functype *pick_core(void)
{
#if CPUFAM_X86 || CPUFAM_AMD64
DISPATCH_PICK_COND(chacha_core, chacha_core_arm_neon,
cpu_feature_p(CPUFEAT_ARM_NEON));
#endif
+#if CPUFAM_ARM64
+ DISPATCH_PICK_COND(chacha_core, chacha_core_arm64, 1);
+#endif
DISPATCH_PICK_FALLBACK(chacha_core, simple_core);
}
--- /dev/null
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// AArch64 crypto-extension-based implementation of Rijndael
+///
+/// (c) 2018 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+ .extern F(abort)
+ .extern F(rijndael_rcon)
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+ .arch armv8-a+crypto
+
+/// The ARM crypto extension implements a little-endian version of AES
+/// (though the manual doesn't actually spell this out and you have to
+/// experiment), but Catacomb's internal interface presents as big-endian so
+/// as to work better with things like GCM. We therefore maintain the round
+/// keys in little-endian form, and have to end-swap blocks in and out.
+///
+/// For added amusement, the crypto extension doesn't implement the larger-
+/// block versions of Rijndael, so we have to end-swap the keys if we're
+/// preparing for one of those.
+
+ // Useful constants.
+ .equ maxrounds, 16 // maximum number of rounds
+ .equ maxblksz, 32 // maximum block size, in bytes
+ .equ kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer
+
+ // Context structure.
+ .equ nr, 0 // number of rounds
+ .equ w, nr + 4 // encryption key words
+ .equ wi, w + kbufsz // decryption key words
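+	// (So kbufsz comes to 32*17 = 544 bytes, and the context is laid out
+	// as a four-byte round count followed by 544 bytes each of encryption
+	// and decryption key words.)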
+
+///--------------------------------------------------------------------------
+/// Key setup.
+
+FUNC(rijndael_setup_arm64_crypto)
+
+ // Arguments:
+ // x0 = pointer to context
+ // w1 = block size in 32-bit words
+ // x2 = pointer to key material
+ // x3 = key size in words
+
+ pushreg x29, x30
+ mov x29, sp
+
+ // The initial round key material is taken directly from the input
+ // key, so copy it over. Unfortunately, the key material is not
+ // guaranteed to be aligned in any especially useful way. Assume
+ // that alignment traps are not enabled. (Why would they be? On
+ // A32, alignment traps were part of a transition plan which changed
+ // the way unaligned loads and stores behaved, but there's never been
+ // any other behaviour on A64.)
+ mov x15, x3
+ add x4, x0, #w
+0: sub x15, x15, #1
+ ldr w14, [x2], #4
+ str w14, [x4], #4
+ cbnz x15, 0b
+
+ // Find out other useful things and prepare for the main loop.
+9: ldr w9, [x0, #nr] // number of rounds
+ madd w2, w1, w9, w1 // total key size in words
+ leaext x5, rijndael_rcon // round constants
+ sub x6, x2, x3 // minus what we've copied already
+ add x7, x0, #w // position in previous cycle
+ movi v1.4s, #0 // all-zero register for the key
+ mov x8, #0 // position in current cycle
+
+ // Main key expansion loop. Dispatch according to the position in
+ // the cycle.
+0: ldr w15, [x7], #4 // word from previous cycle
+ cbz x8, 1f // first word of the cycle?
+ cmp x8, #4 // fourth word of the cycle?
+ b.ne 2f
+ cmp x3, #7 // seven or eight words of key?
+ b.cc 2f
+
+ // Fourth word of the cycle, seven or eight words of key. We must do
+ // the byte substitution.
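+	// (The aese against an all-zero round key adds in nothing and then
+	// applies SubBytes and ShiftRows; because the same word is broadcast
+	// into all four columns, ShiftRows merely shuffles identical bytes
+	// around, so lane 0 comes out as plain SubBytes of our word.)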
+ dup v0.4s, w14
+ aese v0.16b, v1.16b // effectively, just SubBytes
+ mov w14, v0.4s[0]
+ b 2f
+
+ // First word of the cycle. Byte substitution, rotation, and round
+ // constant.
+1: ldrb w13, [x5], #1 // next round constant
+ dup v0.4s, w14
+ aese v0.16b, v1.16b // effectively, just SubBytes
+ mov w14, v0.4s[0]
+ eor w14, w13, w14, ror #8
+
+ // Common ending: mix in the word from the previous cycle and store.
+2: eor w14, w14, w15
+ str w14, [x4], #4
+
+ // Prepare for the next iteration. If we're done, then stop; if
+ // we've finished a cycle then reset the counter.
+ add x8, x8, #1
+ sub x6, x6, #1
+ cmp x8, x3
+ cbz x6, 9f
+ csel x8, x8, xzr, cc
+ b 0b
+
+ // Next job is to construct the decryption keys. The keys for the
+ // first and last rounds don't need to be mangled, but the remaining
+ // ones do -- and they all need to be reordered too.
+ //
+ // The plan of action, then, is to copy the final encryption round's
+ // keys into place first, then to do each of the intermediate rounds
+ // in reverse order, and finally do the first round.
+ //
+ // Do all the heavy lifting with the vector registers. The order
+ // we're doing this in means that it's OK if we read or write too
+ // much, and there's easily enough buffer space for the
+ // over-enthusiastic reads and writes because the context has space
+ // for 32-byte blocks, which is our maximum and an exact fit for two
+ // full-width registers.
+9: add x5, x0, #wi
+ add x4, x0, #w
+ add x4, x4, w2, uxtw #2
+ sub x4, x4, w1, uxtw #2 // last round's keys
+
+ // Copy the last encryption round's keys.
+ ld1 {v0.4s, v1.4s}, [x4]
+ st1 {v0.4s, v1.4s}, [x5]
+
+ // Update the loop variables and stop if we've finished.
+0: sub w9, w9, #1
+ add x5, x5, w1, uxtw #2
+ sub x4, x4, w1, uxtw #2
+ cbz w9, 9f
+
+ // Do another middle round's keys...
+ ld1 {v0.4s, v1.4s}, [x4]
+ aesimc v0.16b, v0.16b
+ aesimc v1.16b, v1.16b
+ st1 {v0.4s, v1.4s}, [x5]
+ b 0b
+
+	// Finally copy the first encryption round's keys.
+9: ld1 {v0.4s, v1.4s}, [x4]
+ st1 {v0.4s, v1.4s}, [x5]
+
+ // If the block size is not exactly four words then we must end-swap
+ // everything. We can use fancy vector toys for this.
+ cmp w1, #4
+ b.eq 9f
+
+ // End-swap the encryption keys.
+ add x1, x0, #w
+ bl endswap_block
+
+ // And the decryption keys
+ add x1, x0, #wi
+ bl endswap_block
+
+ // All done.
+9: popreg x29, x30
+ ret
+
+ENDFUNC
+
+INTFUNC(endswap_block)
+ // End-swap w2 words starting at x1. x1 is clobbered; w2 is not.
+ // It's OK to work in 16-byte chunks.
+
+ mov w3, w2
+0: subs w3, w3, #4
+ ld1 {v0.4s}, [x1]
+ rev32 v0.16b, v0.16b
+ st1 {v0.4s}, [x1], #16
+ b.hi 0b
+ ret
+
+ENDFUNC
+
+///--------------------------------------------------------------------------
+/// Encrypting and decrypting blocks.
+
+.macro encdec op, aes, mc, koff
+ FUNC(rijndael_\op\()_arm64_crypto)
+
+ // Arguments:
+ // x0 = pointer to context
+ // x1 = pointer to input block
+ // x2 = pointer to output block
+
+ // Set things up ready.
+ ldr w3, [x0, #nr]
+ add x0, x0, #\koff
+ ld1 {v0.4s}, [x1]
+ rev32 v0.16b, v0.16b
+
+ // Check the number of rounds and dispatch.
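+	// (Rijndael uses 6 + max(Nk, Nb) rounds, and the key and block sizes
+	// both run from four to eight words, so only ten to fourteen rounds
+	// can legitimately turn up here; anything else gets the abort
+	// treatment below.)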
+ cmp w3, #14
+ b.eq 14f
+ cmp w3, #10
+ b.eq 10f
+ cmp w3, #12
+ b.eq 12f
+ cmp w3, #13
+ b.eq 13f
+ cmp w3, #11
+ b.eq 11f
+ callext F(abort)
+
+ // Eleven rounds.
+11: ld1 {v16.4s}, [x0], #16
+ \aes v0.16b, v16.16b
+ \mc v0.16b, v0.16b
+ b 10f
+
+ // Twelve rounds.
+12: ld1 {v16.4s, v17.4s}, [x0], #32
+ \aes v0.16b, v16.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v17.16b
+ \mc v0.16b, v0.16b
+ b 10f
+
+ // Thirteen rounds.
+13: ld1 {v16.4s-v18.4s}, [x0], #48
+ \aes v0.16b, v16.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v17.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v18.16b
+ \mc v0.16b, v0.16b
+ b 10f
+
+ // Fourteen rounds. (Drops through to the ten round case because
+ // this is the next most common.)
+14: ld1 {v16.4s-v19.4s}, [x0], #64
+ \aes v0.16b, v16.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v17.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v18.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v19.16b
+ \mc v0.16b, v0.16b
+ // Drop through...
+
+ // Ten rounds.
+10: ld1 {v16.4s-v19.4s}, [x0], #64
+ ld1 {v20.4s-v23.4s}, [x0], #64
+ \aes v0.16b, v16.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v17.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v18.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v19.16b
+ \mc v0.16b, v0.16b
+
+ ld1 {v16.4s-v18.4s}, [x0], #48
+ \aes v0.16b, v20.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v21.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v22.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v23.16b
+ \mc v0.16b, v0.16b
+
+ // Final round has no MixColumns, but is followed by final whitening.
+ \aes v0.16b, v16.16b
+ \mc v0.16b, v0.16b
+ \aes v0.16b, v17.16b
+ eor v0.16b, v0.16b, v18.16b
+
+ // All done.
+ rev32 v0.16b, v0.16b
+ st1 {v0.4s}, [x2]
+ ret
+
+ ENDFUNC
+.endm
+
+ encdec eblk, aese, aesmc, w
+ encdec dblk, aesd, aesimc, wi
+
+///----- That's all, folks --------------------------------------------------
#if CPUFAM_ARMEL && HAVE_AS_ARMV8_CRYPTO
extern setup__functype rijndael_setup_arm_crypto;
#endif
+#if CPUFAM_ARM64
+extern setup__functype rijndael_setup_arm64_crypto;
+#endif
static setup__functype *pick_setup(void)
{
DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_arm_crypto,
cpu_feature_p(CPUFEAT_ARM_AES));
#endif
+#if CPUFAM_ARM64
+ DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_arm64_crypto,
+ cpu_feature_p(CPUFEAT_ARM_AES));
+#endif
DISPATCH_PICK_FALLBACK(rijndael_setup, simple_setup);
}
extern rijndael_eblk__functype rijndael_eblk_arm_crypto;
extern rijndael_dblk__functype rijndael_dblk_arm_crypto;
#endif
+#if CPUFAM_ARM64
+extern rijndael_eblk__functype rijndael_eblk_arm64_crypto;
+extern rijndael_dblk__functype rijndael_dblk_arm64_crypto;
+#endif
static rijndael_eblk__functype *pick_eblk(void)
{
DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_arm_crypto,
cpu_feature_p(CPUFEAT_ARM_AES));
#endif
+#if CPUFAM_ARM64
+ DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_arm64_crypto,
+ cpu_feature_p(CPUFEAT_ARM_AES));
+#endif
DISPATCH_PICK_FALLBACK(rijndael_eblk, simple_eblk);
}
DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_arm_crypto,
cpu_feature_p(CPUFEAT_ARM_AES));
#endif
+#if CPUFAM_ARM64
+ DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_arm64_crypto,
+ cpu_feature_p(CPUFEAT_ARM_AES));
+#endif
DISPATCH_PICK_FALLBACK(rijndael_dblk, simple_dblk);
}
--- /dev/null
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// Fancy SIMD implementation of Salsa20 for AArch64
+///
+/// (c) 2018 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+ .arch armv8-a
+ .text
+
+FUNC(salsa20_core_arm64)
+
+ // Arguments are in registers.
+ // w0 is the number of rounds to perform
+ // x1 points to the input matrix
+ // x2 points to the output matrix
+
+ // First job is to slurp the matrix into the SIMD registers. The
+ // words have already been permuted conveniently to make them line up
+ // better for SIMD processing.
+ //
+ // The textbook arrangement of the matrix is this.
+ //
+ // [C K K K]
+ // [K C N N]
+ // [T T C K]
+ // [K K K C]
+ //
+ // But we've rotated the columns up so that the main diagonal with
+	// the constants on it ends up in the first row, giving something more
+ // like
+ //
+ // [C C C C]
+ // [K T K K]
+ // [T K K N]
+ // [K K N K]
+ //
+ // so the transformation looks like this:
+ //
+ // [ 0 1 2 3] [ 0 5 10 15] (a, v4)
+ // [ 4 5 6 7] --> [ 4 9 14 3] (b, v5)
+ // [ 8 9 10 11] [ 8 13 2 7] (c, v6)
+ // [12 13 14 15] [12 1 6 11] (d, v7)
+ //
+ // We need a copy for later. Rather than waste time copying them by
+ // hand, we'll use the three-address nature of the instruction set.
+ // But this means that the main loop is offset by a bit.
+ ld1 {v0.4s-v3.4s}, [x1]
+
+ // Apply a column quarterround to each of the columns simultaneously,
+ // moving the results to their working registers. Alas, there
+ // doesn't seem to be a packed word rotate, so we have to synthesize
+ // it.
+
+ // b ^= (a + d) <<< 7
+ add v16.4s, v0.4s, v3.4s
+ shl v17.4s, v16.4s, #7
+ ushr v16.4s, v16.4s, #25
+ orr v16.16b, v16.16b, v17.16b
+ eor v5.16b, v1.16b, v16.16b
+
+ // c ^= (b + a) <<< 9
+ add v16.4s, v5.4s, v0.4s
+ shl v17.4s, v16.4s, #9
+ ushr v16.4s, v16.4s, #23
+ orr v16.16b, v16.16b, v17.16b
+ eor v6.16b, v2.16b, v16.16b
+
+ // d ^= (c + b) <<< 13
+ add v16.4s, v6.4s, v5.4s
+ ext v5.16b, v5.16b, v5.16b, #12
+ shl v17.4s, v16.4s, #13
+ ushr v16.4s, v16.4s, #19
+ orr v16.16b, v16.16b, v17.16b
+ eor v7.16b, v3.16b, v16.16b
+
+ // a ^= (d + c) <<< 18
+ add v16.4s, v7.4s, v6.4s
+ ext v6.16b, v6.16b, v6.16b, #8
+ ext v7.16b, v7.16b, v7.16b, #4
+ shl v17.4s, v16.4s, #18
+ ushr v16.4s, v16.4s, #14
+ orr v16.16b, v16.16b, v17.16b
+ eor v4.16b, v0.16b, v16.16b
+
+0:
+ // The transpose conveniently only involves reordering elements of
+ // individual rows, which can be done quite easily, and reordering
+ // the rows themselves, which is a trivial renaming. It doesn't
+ // involve any movement of elements between rows.
+ //
+ // [ 0 5 10 15] [ 0 5 10 15] (a, v4)
+ // [ 4 9 14 3] --> [ 1 6 11 12] (b, v7)
+ // [ 8 13 2 7] [ 2 7 8 13] (c, v6)
+ // [12 1 6 11] [ 3 4 9 14] (d, v5)
+ //
+ // The reorderings have been pushed upwards to reduce delays.
+ sub w0, w0, #2
+
+ // Apply the row quarterround to each of the columns (yes!)
+ // simultaneously.
+
+ // b ^= (a + d) <<< 7
+ add v16.4s, v4.4s, v5.4s
+ shl v17.4s, v16.4s, #7
+ ushr v16.4s, v16.4s, #25
+ orr v16.16b, v16.16b, v17.16b
+ eor v7.16b, v7.16b, v16.16b
+
+ // c ^= (b + a) <<< 9
+ add v16.4s, v7.4s, v4.4s
+ shl v17.4s, v16.4s, #9
+ ushr v16.4s, v16.4s, #23
+ orr v16.16b, v16.16b, v17.16b
+ eor v6.16b, v6.16b, v16.16b
+
+ // d ^= (c + b) <<< 13
+ add v16.4s, v6.4s, v7.4s
+ ext v7.16b, v7.16b, v7.16b, #12
+ shl v17.4s, v16.4s, #13
+ ushr v16.4s, v16.4s, #19
+ orr v16.16b, v16.16b, v17.16b
+ eor v5.16b, v5.16b, v16.16b
+
+ // a ^= (d + c) <<< 18
+ add v16.4s, v5.4s, v6.4s
+ ext v6.16b, v6.16b, v6.16b, #8
+ ext v5.16b, v5.16b, v5.16b, #4
+ shl v17.4s, v16.4s, #18
+ ushr v16.4s, v16.4s, #14
+ orr v16.16b, v16.16b, v17.16b
+ eor v4.16b, v4.16b, v16.16b
+
+	// We had to undo the transpose ready for the next loop. Again, the
+	// reorderings have been pushed back to reduce latency. The loop
+	// counter was decremented above; see if we should go round again.
+ cbz w0, 9f
+
+ // Do the first half of the next round because this loop is offset.
+
+ // b ^= (a + d) <<< 7
+ add v16.4s, v4.4s, v7.4s
+ shl v17.4s, v16.4s, #7
+ ushr v16.4s, v16.4s, #25
+ orr v16.16b, v16.16b, v17.16b
+ eor v5.16b, v5.16b, v16.16b
+
+ // c ^= (b + a) <<< 9
+ add v16.4s, v5.4s, v4.4s
+ shl v17.4s, v16.4s, #9
+ ushr v16.4s, v16.4s, #23
+ orr v16.16b, v16.16b, v17.16b
+ eor v6.16b, v6.16b, v16.16b
+
+ // d ^= (c + b) <<< 13
+ add v16.4s, v6.4s, v5.4s
+ ext v5.16b, v5.16b, v5.16b, #12
+ shl v17.4s, v16.4s, #13
+ ushr v16.4s, v16.4s, #19
+ orr v16.16b, v16.16b, v17.16b
+ eor v7.16b, v7.16b, v16.16b
+
+ // a ^= (d + c) <<< 18
+ add v16.4s, v7.4s, v6.4s
+ ext v6.16b, v6.16b, v6.16b, #8
+ ext v7.16b, v7.16b, v7.16b, #4
+ shl v17.4s, v16.4s, #18
+ ushr v16.4s, v16.4s, #14
+ orr v16.16b, v16.16b, v17.16b
+ eor v4.16b, v4.16b, v16.16b
+
+ b 0b
+
+	// Almost there. Firstly the feedforward addition. Also, establish
+ // constants which will be useful later.
+9: add v0.4s, v0.4s, v4.4s // 0, 5, 10, 15
+ movi v16.2d, #0xffffffff // = (-1, 0, -1, 0)
+ movi d17, #-1 // = (-1, -1, 0, 0)
+ add v1.4s, v1.4s, v5.4s // 4, 9, 14, 3
+ add v2.4s, v2.4s, v6.4s // 8, 13, 2, 7
+ add v3.4s, v3.4s, v7.4s // 12, 1, 6, 11
+
+ // Next we must undo the permutation which was already applied to the
+ // input. The core trick is from Dan Bernstein's `armneon3'
+ // implementation, but with a lot of liposuction.
+ mov v4.16b, v0.16b
+
+ // Sort out the columns by pairs.
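+	// (bif keeps the destination's bits wherever the mask in the last
+	// operand is set and takes bits from the middle operand where it's
+	// clear, so with these masks it merges 32- or 64-bit lanes from two
+	// registers.)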
+ bif v0.16b, v3.16b, v16.16b // 0, 1, 10, 11
+ bif v3.16b, v2.16b, v16.16b // 12, 13, 6, 7
+ bif v2.16b, v1.16b, v16.16b // 8, 9, 2, 3
+ bif v1.16b, v4.16b, v16.16b // 4, 5, 14, 15
+ mov v4.16b, v0.16b
+ mov v5.16b, v3.16b
+
+ // Now fix up the remaining discrepancies.
+ bif v0.16b, v2.16b, v17.16b // 0, 1, 2, 3
+ bif v3.16b, v1.16b, v17.16b // 12, 13, 14, 15
+ bif v2.16b, v4.16b, v17.16b // 8, 9, 10, 11
+ bif v1.16b, v5.16b, v17.16b // 4, 5, 6, 7
+
+ // And with that, we're done.
+ st1 {v0.4s-v3.4s}, [x2]
+ ret
+
+ENDFUNC
+
+///----- That's all, folks --------------------------------------------------
extern core__functype salsa20_core_arm_neon;
#endif
+#if CPUFAM_ARM64
+extern core__functype salsa20_core_arm64;
+#endif
+
static core__functype *pick_core(void)
{
#if CPUFAM_X86 || CPUFAM_AMD64
DISPATCH_PICK_COND(salsa20_core, salsa20_core_arm_neon,
cpu_feature_p(CPUFEAT_ARM_NEON));
#endif
+#if CPUFAM_ARM64
+ DISPATCH_PICK_COND(salsa20_core, salsa20_core_arm64, 1);
+#endif
DISPATCH_PICK_FALLBACK(salsa20_core, simple_core);
}