From e492db887af6561dd33aa18e3887efaeb219fd16 Mon Sep 17 00:00:00 2001 From: Mark Wooding Date: Fri, 22 Jun 2018 10:20:44 +0100 Subject: [PATCH] Add support for fancy AArch64 assembler code. It's a fun instruction set, and maybe this will improve my crypto on Raspberry Pi 3. --- base/asm-common.h | 119 ++++++++++++++++ base/dispatch.c | 14 ++ configure.ac | 3 +- symm/Makefile.am | 9 ++ symm/chacha-arm64.S | 181 +++++++++++++++++++++++++ symm/chacha.c | 7 + symm/rijndael-arm64-crypto.S | 314 +++++++++++++++++++++++++++++++++++++++++++ symm/rijndael-base.c | 7 + symm/rijndael.c | 12 ++ symm/salsa20-arm64.S | 236 ++++++++++++++++++++++++++++++++ symm/salsa20.c | 7 + 11 files changed, 908 insertions(+), 1 deletion(-) create mode 100644 symm/chacha-arm64.S create mode 100644 symm/rijndael-arm64-crypto.S create mode 100644 symm/salsa20-arm64.S diff --git a/base/asm-common.h b/base/asm-common.h index f4c4f6e3..d81e4098 100644 --- a/base/asm-common.h +++ b/base/asm-common.h @@ -1031,6 +1031,125 @@ name: #endif ///-------------------------------------------------------------------------- +/// AArch64-specific hacking. + +#if CPUFAM_ARM64 + +// Set the function hooks. +#define FUNC_PREHOOK(_) .balign 4 +#define FUNC_POSTHOOK(_) .cfi_startproc; .L$_prologue_p = -1 +#define ENDFUNC_HOOK(_) .cfi_endproc + +// Call external subroutine at ADDR, possibly via PLT. +.macro callext addr + bl \addr +.endm + +// Load address of external symbol ADDR into REG. +.macro leaext reg, addr +#if WANT_PIC + adrp \reg, :got:\addr + ldr \reg, [\reg, #:got_lo12:\addr] +#else + adrp \reg, \addr + add \reg, \reg, #:lo12:\addr +#endif +.endm + +// Stack management and unwinding. +.macro setfp fp, offset = 0 + // If you're just going through the motions with a fixed-size stack frame, + // then you want to say `add x29, sp, #OFFSET' directly, which will avoid + // pointlessly restoring sp later. + .if \offset == 0 + mov \fp, sp + .cfi_def_cfa_register \fp + .else + add \fp, sp, #\offset + .cfi_def_cfa_register \fp + .cfi_adjust_cfa_offset -\offset + .endif + .macro dropfp; _dropfp \fp, \offset; .endm + .L$_frameptr_p = -1 +.endm + +.macro _dropfp fp, offset = 0 + .if \offset == 0 + mov sp, \fp + .cfi_def_cfa_register sp + .else + sub sp, \fp, #\offset + .cfi_def_cfa_register sp + .cfi_adjust_cfa_offset +\offset + .endif + .purgem dropfp + .L$_frameptr_p = 0 +.endm + +.macro stalloc n + sub sp, sp, #\n + .cfi_adjust_cfa_offset +\n +.endm + +.macro stfree n + add sp, sp, #\n + .cfi_adjust_cfa_offset -\n +.endm + +.macro pushreg x, y= + .ifeqs "\y", "" + str \x, [sp, #-16]! + .cfi_adjust_cfa_offset +16 + .cfi_rel_offset \x, 0 + .else + stp \x, \y, [sp, #-16]! + .cfi_adjust_cfa_offset +16 + .cfi_rel_offset \x, 0 + .cfi_rel_offset \y, 8 + .endif +.endm + +.macro popreg x, y= + .ifeqs "\y", "" + ldr \x, [sp], #16 + .cfi_restore \x + .cfi_adjust_cfa_offset -16 + .else + ldp \x, \y, [sp], #16 + .cfi_restore \x + .cfi_restore \y + .cfi_adjust_cfa_offset -16 + .endif +.endm + +.macro savereg x, y, z= + .ifeqs "\z", "" + str \x, [sp, #\y] + .cfi_rel_offset \x, \y + .else + stp \x, \y, [sp, #\z] + .cfi_rel_offset \x, \z + .cfi_rel_offset \y, \z + 8 + .endif +.endm + +.macro rstrreg x, y, z= + .ifeqs "\z", "" + ldr \x, [sp, #\y] + .cfi_restore \x + .else + ldp \x, \y, [sp, #\z] + .cfi_restore \x + .cfi_restore \y + .endif +.endm + +.macro endprologue +.endm + +#endif + +///-------------------------------------------------------------------------- /// Final stuff. // Default values for the various hooks. 
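(For orientation, a minimal sketch, not part of the patch, of how the new AArch64 prologue/epilogue macros above are meant to compose inside the existing FUNC/ENDFUNC wrappers from asm-common.h; the function name is hypothetical and no particular frame layout is implied.)

	FUNC(example_arm64)		// hypothetical function name
		pushreg	x29, x30	// stp x29, x30, [sp, #-16]!, plus CFI
		setfp	x29		// mov x29, sp; CFA now tracked via x29
		endprologue		// currently a no-op in this port

		// ... function body; sp may move freely here, since the
		// unwinder follows x29 after setfp ...

		dropfp			// defined by setfp: mov sp, x29
		popreg	x29, x30	// ldp x29, x30, [sp], #16, plus CFI
		ret
	ENDFUNC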
diff --git a/base/dispatch.c b/base/dispatch.c index 50c94380..908a4e31 100644 --- a/base/dispatch.c +++ b/base/dispatch.c @@ -229,6 +229,11 @@ struct auxentry { unsigned long type; union auxval value; }; # define WANT_AT_HWCAP(_) _(AT_HWCAP, u, hwcap) #endif +#if defined(AT_HWCAP) && CPUFAM_ARM64 +# define WANT_ANY 1 +# define WANT_AT_HWCAP(_) _(AT_HWCAP, u, hwcap) +#endif + #if defined(AT_HWCAP2) && CPUFAM_ARMEL # define WANT_ANY 1 # define WANT_AT_HWCAP2(_) _(AT_HWCAP2, u, hwcap2) @@ -278,6 +283,12 @@ static unsigned hwcaps = 0; _(ARM_D32, "arm:d32") \ _(ARM_AES, "arm:aes") #endif +#if CPUFAM_ARM64 +# define WANTAUX(_) \ + WANT_AT_HWCAP(_) +# define CAPMAP(_) \ + _(ARM_AES, "arm:aes") +#endif /* Build the bitmask for `hwcaps' from the `CAPMAP' list. */ enum { @@ -391,6 +402,9 @@ static void probe_hwcaps(void) if (probed.hwcap2 & HWCAP2_AES) hw |= HF_ARM_AES; # endif #endif +#if CPUFAM_ARM64 + if (probed.hwcap & HWCAP_AES) hw |= HF_ARM_AES; +#endif /* Store the bitmask of features we probed for everyone to see. */ DISPATCH_STORE(hwcaps, hw); diff --git a/configure.ac b/configure.ac index 1643ad00..f8ad8b77 100644 --- a/configure.ac +++ b/configure.ac @@ -78,7 +78,8 @@ AC_DEFUN([catacomb_CPU_FAMILIES], $1([i[[3-6]]86,*], [x86], [sysv]) $1([x86_64,cygwin], [amd64], [win]) $1([x86_64,*], [amd64], [sysv]) - $1([arm,* | armv*,*], [armel], [std])]) + $1([arm,* | armv*,*], [armel], [std]) + $1([aarch64,*], [arm64], [std])]) dnl A utility to clear the `seen' flags, used so as to process each CPU or dnl ABI once. diff --git a/symm/Makefile.am b/symm/Makefile.am index 4441ecc4..0e56319d 100644 --- a/symm/Makefile.am +++ b/symm/Makefile.am @@ -193,6 +193,9 @@ if CPUFAM_ARMEL libsymm_la_SOURCES += rijndael-arm-crypto.S endif endif +if CPUFAM_ARM64 +libsymm_la_SOURCES += rijndael-arm64-crypto.S +endif nodist_libsymm_la_SOURCES += ../precomp/symm/rijndael-tab.c PRECOMPS += $(precomp)/symm/rijndael-tab.c PRECOMP_PROGS += rijndael-mktab @@ -477,6 +480,9 @@ endif if CPUFAM_ARMEL libsymm_la_SOURCES += salsa20-arm-neon.S endif +if CPUFAM_ARM64 +libsymm_la_SOURCES += salsa20-arm64.S +endif TESTS += salsa20.t$(EXEEXT) ALL_CIPHERS += salsa20 salsa2012 salsa208 ALL_CIPHERS += salsa20-ietf salsa2012-ietf salsa208-ietf @@ -516,6 +522,9 @@ endif if CPUFAM_ARMEL libsymm_la_SOURCES += chacha-arm-neon.S endif +if CPUFAM_ARM64 +libsymm_la_SOURCES += chacha-arm64.S +endif TESTS += chacha.t$(EXEEXT) EXTRA_DIST += t/chacha ALL_CIPHERS += chacha20 chacha12 chacha8 diff --git a/symm/chacha-arm64.S b/symm/chacha-arm64.S new file mode 100644 index 00000000..a423e9e5 --- /dev/null +++ b/symm/chacha-arm64.S @@ -0,0 +1,181 @@ +/// -*- mode: asm; asm-comment-char: ?/ -*- +/// +/// Fancy SIMD implementation of ChaCha for AArch64 +/// +/// (c) 2018 Straylight/Edgeware +/// + +///----- Licensing notice --------------------------------------------------- +/// +/// This file is part of Catacomb. +/// +/// Catacomb is free software; you can redistribute it and/or modify +/// it under the terms of the GNU Library General Public License as +/// published by the Free Software Foundation; either version 2 of the +/// License, or (at your option) any later version. +/// +/// Catacomb is distributed in the hope that it will be useful, +/// but WITHOUT ANY WARRANTY; without even the implied warranty of +/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +/// GNU Library General Public License for more details. 
+/// +/// You should have received a copy of the GNU Library General Public +/// License along with Catacomb; if not, write to the Free +/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +/// MA 02111-1307, USA. + +///-------------------------------------------------------------------------- +/// External definitions. + +#include "config.h" +#include "asm-common.h" + +///-------------------------------------------------------------------------- +/// Main.code. + + .arch armv8-a + .text + +FUNC(chacha_core_arm64) + + // Arguments are in registers. + // w0 is the number of rounds to perform + // x1 points to the input matrix + // x2 points to the output matrix + + // First job is to slurp the matrix into the SIMD registers. + // + // [ 0 1 2 3] (a, v4) + // [ 4 5 6 7] (b, v5) + // [ 8 9 10 11] (c, v6) + // [12 13 14 15] (d, v7) + // + // We need a copy for later. Rather than waste time copying them by + // hand, we'll use the three-address nature of the instruction set. + // But this means that the main loop is offset by a bit. + ld1 {v0.4s-v3.4s}, [x1] + + // a += b; d ^= a; d <<<= 16 + add v4.4s, v0.4s, v1.4s + eor v7.16b, v3.16b, v4.16b + shl v16.4s, v7.4s, #16 + ushr v7.4s, v7.4s, #16 + orr v7.16b, v7.16b, v16.16b + + // c += d; b ^= c; b <<<= 12 + add v6.4s, v2.4s, v7.4s + eor v5.16b, v1.16b, v6.16b + shl v16.4s, v5.4s, #12 + ushr v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v16.16b + +0: + // Apply (the rest of) a column quarterround to each of the columns + // simultaneously. Alas, there doesn't seem to be a packed word + // rotate, so we have to synthesize it. + + // a += b; d ^= a; d <<<= 8 + add v4.4s, v4.4s, v5.4s + eor v7.16b, v7.16b, v4.16b + shl v16.4s, v7.4s, #8 + ushr v7.4s, v7.4s, #24 + orr v7.16b, v7.16b, v16.16b + + // c += d; b ^= c; b <<<= 7 + add v6.4s, v6.4s, v7.4s + ext v7.16b, v7.16b, v7.16b, #12 + eor v5.16b, v5.16b, v6.16b + ext v6.16b, v6.16b, v6.16b, #8 + shl v16.4s, v5.4s, #7 + ushr v5.4s, v5.4s, #25 + orr v5.16b, v5.16b, v16.16b + + // The not-quite-transpose conveniently only involves reordering + // elements of individual rows, which can be done quite easily. It + // doesn't involve any movement of elements between rows, or even + // renaming of the rows. + // + // [ 0 1 2 3] [ 0 1 2 3] (a, v4) + // [ 4 5 6 7] --> [ 5 6 7 4] (b, v5) + // [ 8 9 10 11] [10 11 8 9] (c, v6) + // [12 13 14 15] [15 12 13 14] (d, v7) + // + // The reorderings have for the most part been pushed upwards to + // reduce delays. + ext v5.16b, v5.16b, v5.16b, #4 + sub w0, w0, #2 + + // Apply the diagonal quarterround to each of the columns + // simultaneously. + + // a += b; d ^= a; d <<<= 16 + add v4.4s, v4.4s, v5.4s + eor v7.16b, v7.16b, v4.16b + shl v16.4s, v7.4s, #16 + ushr v7.4s, v7.4s, #16 + orr v7.16b, v7.16b, v16.16b + + // c += d; b ^= c; b <<<= 12 + add v6.4s, v6.4s, v7.4s + eor v5.16b, v5.16b, v6.16b + shl v16.4s, v5.4s, #12 + ushr v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v16.16b + + // a += b; d ^= a; d <<<= 8 + add v4.4s, v4.4s, v5.4s + eor v7.16b, v7.16b, v4.16b + shl v16.4s, v7.4s, #8 + ushr v7.4s, v7.4s, #24 + orr v7.16b, v7.16b, v16.16b + + // c += d; b ^= c; b <<<= 7 + add v6.4s, v6.4s, v7.4s + ext v7.16b, v7.16b, v7.16b, #4 + eor v5.16b, v5.16b, v6.16b + ext v6.16b, v6.16b, v6.16b, #8 + shl v16.4s, v5.4s, #7 + ushr v5.4s, v5.4s, #25 + orr v5.16b, v5.16b, v16.16b + + // Finally finish off undoing the transpose, and we're done for this + // doubleround. Again, most of this was done above so we don't have + // to wait for the reorderings. 
+ ext v5.16b, v5.16b, v5.16b, #12 + + // Decrement the loop counter and see if we should go round again. + cbz w0, 9f + + // Do the first part of the next round because this loop is offset. + + // a += b; d ^= a; d <<<= 16 + add v4.4s, v4.4s, v5.4s + eor v7.16b, v7.16b, v4.16b + shl v16.4s, v7.4s, #16 + ushr v7.4s, v7.4s, #16 + orr v7.16b, v7.16b, v16.16b + + // c += d; b ^= c; b <<<= 12 + add v6.4s, v6.4s, v7.4s + eor v5.16b, v5.16b, v6.16b + shl v16.4s, v5.4s, #12 + ushr v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v16.16b + + b 0b + + // Almost there. Firstly the feedfoward addition. +9: add v0.4s, v0.4s, v4.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + + // And now we write out the result. + st1 {v0.4s-v3.4s}, [x2] + + // And with that, we're done. + ret + +ENDFUNC + +///----- That's all, folks -------------------------------------------------- diff --git a/symm/chacha.c b/symm/chacha.c index 2dcb1253..34198618 100644 --- a/symm/chacha.c +++ b/symm/chacha.c @@ -78,6 +78,10 @@ extern core__functype chacha_core_x86ish_sse2; extern core__functype chacha_core_arm_neon; #endif +#if CPUFAM_ARM64 +extern core__functype chacha_core_arm64; +#endif + static core__functype *pick_core(void) { #if CPUFAM_X86 || CPUFAM_AMD64 @@ -88,6 +92,9 @@ static core__functype *pick_core(void) DISPATCH_PICK_COND(chacha_core, chacha_core_arm_neon, cpu_feature_p(CPUFEAT_ARM_NEON)); #endif +#if CPUFAM_ARM64 + DISPATCH_PICK_COND(chacha_core, chacha_core_arm64, 1); +#endif DISPATCH_PICK_FALLBACK(chacha_core, simple_core); } diff --git a/symm/rijndael-arm64-crypto.S b/symm/rijndael-arm64-crypto.S new file mode 100644 index 00000000..8739c193 --- /dev/null +++ b/symm/rijndael-arm64-crypto.S @@ -0,0 +1,314 @@ +/// -*- mode: asm; asm-comment-char: ?/ -*- +/// +/// AArch64 crypto-extension-based implementation of Rijndael +/// +/// (c) 2018 Straylight/Edgeware +/// + +///----- Licensing notice --------------------------------------------------- +/// +/// This file is part of Catacomb. +/// +/// Catacomb is free software; you can redistribute it and/or modify +/// it under the terms of the GNU Library General Public License as +/// published by the Free Software Foundation; either version 2 of the +/// License, or (at your option) any later version. +/// +/// Catacomb is distributed in the hope that it will be useful, +/// but WITHOUT ANY WARRANTY; without even the implied warranty of +/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +/// GNU Library General Public License for more details. +/// +/// You should have received a copy of the GNU Library General Public +/// License along with Catacomb; if not, write to the Free +/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +/// MA 02111-1307, USA. + +///-------------------------------------------------------------------------- +/// External definitions. + +#include "config.h" +#include "asm-common.h" + + .extern F(abort) + .extern F(rijndael_rcon) + +///-------------------------------------------------------------------------- +/// Main code. + + .arch armv8-a+crypto + +/// The ARM crypto extension implements a little-endian version of AES +/// (though the manual doesn't actually spell this out and you have to +/// experiment), but Catacomb's internal interface presents as big-endian so +/// as to work better with things like GCM. We therefore maintain the round +/// keys in little-endian form, and have to end-swap blocks in and out. 
+/// +/// For added amusement, the crypto extension doesn't implement the larger- +/// block versions of Rijndael, so we have to end-swap the keys if we're +/// preparing for one of those. + + // Useful constants. + .equ maxrounds, 16 // maximum number of rounds + .equ maxblksz, 32 // maximum block size, in bytes + .equ kbufsz, maxblksz*(maxrounds + 1) // size of key-sched buffer + + // Context structure. + .equ nr, 0 // number of rounds + .equ w, nr + 4 // encryption key words + .equ wi, w + kbufsz // decryption key words + +///-------------------------------------------------------------------------- +/// Key setup. + +FUNC(rijndael_setup_arm64_crypto) + + // Arguments: + // x0 = pointer to context + // w1 = block size in 32-bit words + // x2 = pointer to key material + // x3 = key size in words + + pushreg x29, x30 + mov x29, sp + + // The initial round key material is taken directly from the input + // key, so copy it over. Unfortunately, the key material is not + // guaranteed to be aligned in any especially useful way. Assume + // that alignment traps are not enabled. (Why would they be? On + // A32, alignment traps were part of a transition plan which changed + // the way unaligned loads and stores behaved, but there's never been + // any other behaviour on A64.) + mov x15, x3 + add x4, x0, #w +0: sub x15, x15, #1 + ldr w14, [x2], #4 + str w14, [x4], #4 + cbnz x15, 0b + + // Find out other useful things and prepare for the main loop. +9: ldr w9, [x0, #nr] // number of rounds + madd w2, w1, w9, w1 // total key size in words + leaext x5, rijndael_rcon // round constants + sub x6, x2, x3 // minus what we've copied already + add x7, x0, #w // position in previous cycle + movi v1.4s, #0 // all-zero register for the key + mov x8, #0 // position in current cycle + + // Main key expansion loop. Dispatch according to the position in + // the cycle. +0: ldr w15, [x7], #4 // word from previous cycle + cbz x8, 1f // first word of the cycle? + cmp x8, #4 // fourth word of the cycle? + b.ne 2f + cmp x3, #7 // seven or eight words of key? + b.cc 2f + + // Fourth word of the cycle, seven or eight words of key. We must do + // the byte substitution. + dup v0.4s, w14 + aese v0.16b, v1.16b // effectively, just SubBytes + mov w14, v0.4s[0] + b 2f + + // First word of the cycle. Byte substitution, rotation, and round + // constant. +1: ldrb w13, [x5], #1 // next round constant + dup v0.4s, w14 + aese v0.16b, v1.16b // effectively, just SubBytes + mov w14, v0.4s[0] + eor w14, w13, w14, ror #8 + + // Common ending: mix in the word from the previous cycle and store. +2: eor w14, w14, w15 + str w14, [x4], #4 + + // Prepare for the next iteration. If we're done, then stop; if + // we've finished a cycle then reset the counter. + add x8, x8, #1 + sub x6, x6, #1 + cmp x8, x3 + cbz x6, 9f + csel x8, x8, xzr, cc + b 0b + + // Next job is to construct the decryption keys. The keys for the + // first and last rounds don't need to be mangled, but the remaining + // ones do -- and they all need to be reordered too. + // + // The plan of action, then, is to copy the final encryption round's + // keys into place first, then to do each of the intermediate rounds + // in reverse order, and finally do the first round. + // + // Do all the heavy lifting with the vector registers. 
The order + // we're doing this in means that it's OK if we read or write too + // much, and there's easily enough buffer space for the + // over-enthusiastic reads and writes because the context has space + // for 32-byte blocks, which is our maximum and an exact fit for two + // full-width registers. +9: add x5, x0, #wi + add x4, x0, #w + add x4, x4, w2, uxtw #2 + sub x4, x4, w1, uxtw #2 // last round's keys + + // Copy the last encryption round's keys. + ld1 {v0.4s, v1.4s}, [x4] + st1 {v0.4s, v1.4s}, [x5] + + // Update the loop variables and stop if we've finished. +0: sub w9, w9, #1 + add x5, x5, w1, uxtw #2 + sub x4, x4, w1, uxtw #2 + cbz w9, 9f + + // Do another middle round's keys... + ld1 {v0.4s, v1.4s}, [x4] + aesimc v0.16b, v0.16b + aesimc v1.16b, v1.16b + st1 {v0.4s, v1.4s}, [x5] + b 0b + + // Finally do the first encryption round. +9: ld1 {v0.4s, v1.4s}, [x4] + st1 {v0.4s, v1.4s}, [x5] + + // If the block size is not exactly four words then we must end-swap + // everything. We can use fancy vector toys for this. + cmp w1, #4 + b.eq 9f + + // End-swap the encryption keys. + add x1, x0, #w + bl endswap_block + + // And the decryption keys + add x1, x0, #wi + bl endswap_block + + // All done. +9: popreg x29, x30 + ret + +ENDFUNC + +INTFUNC(endswap_block) + // End-swap w2 words starting at x1. x1 is clobbered; w2 is not. + // It's OK to work in 16-byte chunks. + + mov w3, w2 +0: subs w3, w3, #4 + ld1 {v0.4s}, [x1] + rev32 v0.16b, v0.16b + st1 {v0.4s}, [x1], #16 + b.hi 0b + ret + +ENDFUNC + +///-------------------------------------------------------------------------- +/// Encrypting and decrypting blocks. + +.macro encdec op, aes, mc, koff + FUNC(rijndael_\op\()_arm64_crypto) + + // Arguments: + // x0 = pointer to context + // x1 = pointer to input block + // x2 = pointer to output block + + // Set things up ready. + ldr w3, [x0, #nr] + add x0, x0, #\koff + ld1 {v0.4s}, [x1] + rev32 v0.16b, v0.16b + + // Check the number of rounds and dispatch. + cmp w3, #14 + b.eq 14f + cmp w3, #10 + b.eq 10f + cmp w3, #12 + b.eq 12f + cmp w3, #13 + b.eq 13f + cmp w3, #11 + b.eq 11f + callext F(abort) + + // Eleven rounds. +11: ld1 {v16.4s}, [x0], #16 + \aes v0.16b, v16.16b + \mc v0.16b, v0.16b + b 10f + + // Twelve rounds. +12: ld1 {v16.4s, v17.4s}, [x0], #32 + \aes v0.16b, v16.16b + \mc v0.16b, v0.16b + \aes v0.16b, v17.16b + \mc v0.16b, v0.16b + b 10f + + // Thirteen rounds. +13: ld1 {v16.4s-v18.4s}, [x0], #48 + \aes v0.16b, v16.16b + \mc v0.16b, v0.16b + \aes v0.16b, v17.16b + \mc v0.16b, v0.16b + \aes v0.16b, v18.16b + \mc v0.16b, v0.16b + b 10f + + // Fourteen rounds. (Drops through to the ten round case because + // this is the next most common.) +14: ld1 {v16.4s-v19.4s}, [x0], #64 + \aes v0.16b, v16.16b + \mc v0.16b, v0.16b + \aes v0.16b, v17.16b + \mc v0.16b, v0.16b + \aes v0.16b, v18.16b + \mc v0.16b, v0.16b + \aes v0.16b, v19.16b + \mc v0.16b, v0.16b + // Drop through... + + // Ten rounds. +10: ld1 {v16.4s-v19.4s}, [x0], #64 + ld1 {v20.4s-v23.4s}, [x0], #64 + \aes v0.16b, v16.16b + \mc v0.16b, v0.16b + \aes v0.16b, v17.16b + \mc v0.16b, v0.16b + \aes v0.16b, v18.16b + \mc v0.16b, v0.16b + \aes v0.16b, v19.16b + \mc v0.16b, v0.16b + + ld1 {v16.4s-v18.4s}, [x0], #48 + \aes v0.16b, v20.16b + \mc v0.16b, v0.16b + \aes v0.16b, v21.16b + \mc v0.16b, v0.16b + \aes v0.16b, v22.16b + \mc v0.16b, v0.16b + \aes v0.16b, v23.16b + \mc v0.16b, v0.16b + + // Final round has no MixColumns, but is followed by final whitening. 
+ \aes v0.16b, v16.16b + \mc v0.16b, v0.16b + \aes v0.16b, v17.16b + eor v0.16b, v0.16b, v18.16b + + // All done. + rev32 v0.16b, v0.16b + st1 {v0.4s}, [x2] + ret + + ENDFUNC +.endm + + encdec eblk, aese, aesmc, w + encdec dblk, aesd, aesimc, wi + +///----- That's all, folks -------------------------------------------------- diff --git a/symm/rijndael-base.c b/symm/rijndael-base.c index 01f781df..83a49e92 100644 --- a/symm/rijndael-base.c +++ b/symm/rijndael-base.c @@ -122,6 +122,9 @@ extern setup__functype rijndael_setup_x86ish_aesni; #if CPUFAM_ARMEL && HAVE_AS_ARMV8_CRYPTO extern setup__functype rijndael_setup_arm_crypto; #endif +#if CPUFAM_ARM64 +extern setup__functype rijndael_setup_arm64_crypto; +#endif static setup__functype *pick_setup(void) { @@ -133,6 +136,10 @@ static setup__functype *pick_setup(void) DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_arm_crypto, cpu_feature_p(CPUFEAT_ARM_AES)); #endif +#if CPUFAM_ARM64 + DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_arm64_crypto, + cpu_feature_p(CPUFEAT_ARM_AES)); +#endif DISPATCH_PICK_FALLBACK(rijndael_setup, simple_setup); } diff --git a/symm/rijndael.c b/symm/rijndael.c index 4c8837d2..02cfb76b 100644 --- a/symm/rijndael.c +++ b/symm/rijndael.c @@ -88,6 +88,10 @@ extern rijndael_dblk__functype rijndael_dblk_x86ish_aesni; extern rijndael_eblk__functype rijndael_eblk_arm_crypto; extern rijndael_dblk__functype rijndael_dblk_arm_crypto; #endif +#if CPUFAM_ARM64 +extern rijndael_eblk__functype rijndael_eblk_arm64_crypto; +extern rijndael_dblk__functype rijndael_dblk_arm64_crypto; +#endif static rijndael_eblk__functype *pick_eblk(void) { @@ -99,6 +103,10 @@ static rijndael_eblk__functype *pick_eblk(void) DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_arm_crypto, cpu_feature_p(CPUFEAT_ARM_AES)); #endif +#if CPUFAM_ARM64 + DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_arm64_crypto, + cpu_feature_p(CPUFEAT_ARM_AES)); +#endif DISPATCH_PICK_FALLBACK(rijndael_eblk, simple_eblk); } @@ -112,6 +120,10 @@ static rijndael_dblk__functype *pick_dblk(void) DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_arm_crypto, cpu_feature_p(CPUFEAT_ARM_AES)); #endif +#if CPUFAM_ARM64 + DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_arm64_crypto, + cpu_feature_p(CPUFEAT_ARM_AES)); +#endif DISPATCH_PICK_FALLBACK(rijndael_dblk, simple_dblk); } diff --git a/symm/salsa20-arm64.S b/symm/salsa20-arm64.S new file mode 100644 index 00000000..821548e1 --- /dev/null +++ b/symm/salsa20-arm64.S @@ -0,0 +1,236 @@ +/// -*- mode: asm; asm-comment-char: ?/ -*- +/// +/// Fancy SIMD implementation of Salsa20 for AArch64 +/// +/// (c) 2018 Straylight/Edgeware +/// + +///----- Licensing notice --------------------------------------------------- +/// +/// This file is part of Catacomb. +/// +/// Catacomb is free software; you can redistribute it and/or modify +/// it under the terms of the GNU Library General Public License as +/// published by the Free Software Foundation; either version 2 of the +/// License, or (at your option) any later version. +/// +/// Catacomb is distributed in the hope that it will be useful, +/// but WITHOUT ANY WARRANTY; without even the implied warranty of +/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +/// GNU Library General Public License for more details. +/// +/// You should have received a copy of the GNU Library General Public +/// License along with Catacomb; if not, write to the Free +/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +/// MA 02111-1307, USA. 
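(An aside, not part of the patch: scalar AArch64 does have a word rotate, so a single Salsa20 quarter-round on four 32-bit words in general registers can fold each left-rotate into the ROR-shifted operand of EOR, as sketched below; the w4-w7/w16 register assignment is purely illustrative. The vector code in the rest of this file has to synthesize the same rotates from shl/ushr/orr because there is no packed word-rotate instruction.)

	// One Salsa20 quarter-round: a, b, c, d in w4-w7; w16 is scratch.
	// A left-rotate by n is an ROR by 32 - n.
	add	w16, w4, w7		// b ^= (a + d) <<< 7
	eor	w5, w5, w16, ror #25
	add	w16, w5, w4		// c ^= (b + a) <<< 9
	eor	w6, w6, w16, ror #23
	add	w16, w6, w5		// d ^= (c + b) <<< 13
	eor	w7, w7, w16, ror #19
	add	w16, w7, w6		// a ^= (d + c) <<< 18
	eor	w4, w4, w16, ror #14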
+ +///-------------------------------------------------------------------------- +/// External definitions. + +#include "config.h" +#include "asm-common.h" + +///-------------------------------------------------------------------------- +/// Main.code. + + .arch armv8-a + .text + +FUNC(salsa20_core_arm64) + + // Arguments are in registers. + // w0 is the number of rounds to perform + // x1 points to the input matrix + // x2 points to the output matrix + + // First job is to slurp the matrix into the SIMD registers. The + // words have already been permuted conveniently to make them line up + // better for SIMD processing. + // + // The textbook arrangement of the matrix is this. + // + // [C K K K] + // [K C N N] + // [T T C K] + // [K K K C] + // + // But we've rotated the columns up so that the main diagonal with + // the constants on it end up in the first row, giving something more + // like + // + // [C C C C] + // [K T K K] + // [T K K N] + // [K K N K] + // + // so the transformation looks like this: + // + // [ 0 1 2 3] [ 0 5 10 15] (a, v4) + // [ 4 5 6 7] --> [ 4 9 14 3] (b, v5) + // [ 8 9 10 11] [ 8 13 2 7] (c, v6) + // [12 13 14 15] [12 1 6 11] (d, v7) + // + // We need a copy for later. Rather than waste time copying them by + // hand, we'll use the three-address nature of the instruction set. + // But this means that the main loop is offset by a bit. + ld1 {v0.4s-v3.4s}, [x1] + + // Apply a column quarterround to each of the columns simultaneously, + // moving the results to their working registers. Alas, there + // doesn't seem to be a packed word rotate, so we have to synthesize + // it. + + // b ^= (a + d) <<< 7 + add v16.4s, v0.4s, v3.4s + shl v17.4s, v16.4s, #7 + ushr v16.4s, v16.4s, #25 + orr v16.16b, v16.16b, v17.16b + eor v5.16b, v1.16b, v16.16b + + // c ^= (b + a) <<< 9 + add v16.4s, v5.4s, v0.4s + shl v17.4s, v16.4s, #9 + ushr v16.4s, v16.4s, #23 + orr v16.16b, v16.16b, v17.16b + eor v6.16b, v2.16b, v16.16b + + // d ^= (c + b) <<< 13 + add v16.4s, v6.4s, v5.4s + ext v5.16b, v5.16b, v5.16b, #12 + shl v17.4s, v16.4s, #13 + ushr v16.4s, v16.4s, #19 + orr v16.16b, v16.16b, v17.16b + eor v7.16b, v3.16b, v16.16b + + // a ^= (d + c) <<< 18 + add v16.4s, v7.4s, v6.4s + ext v6.16b, v6.16b, v6.16b, #8 + ext v7.16b, v7.16b, v7.16b, #4 + shl v17.4s, v16.4s, #18 + ushr v16.4s, v16.4s, #14 + orr v16.16b, v16.16b, v17.16b + eor v4.16b, v0.16b, v16.16b + +0: + // The transpose conveniently only involves reordering elements of + // individual rows, which can be done quite easily, and reordering + // the rows themselves, which is a trivial renaming. It doesn't + // involve any movement of elements between rows. + // + // [ 0 5 10 15] [ 0 5 10 15] (a, v4) + // [ 4 9 14 3] --> [ 1 6 11 12] (b, v7) + // [ 8 13 2 7] [ 2 7 8 13] (c, v6) + // [12 1 6 11] [ 3 4 9 14] (d, v5) + // + // The reorderings have been pushed upwards to reduce delays. + sub w0, w0, #2 + + // Apply the row quarterround to each of the columns (yes!) + // simultaneously. 
+ + // b ^= (a + d) <<< 7 + add v16.4s, v4.4s, v5.4s + shl v17.4s, v16.4s, #7 + ushr v16.4s, v16.4s, #25 + orr v16.16b, v16.16b, v17.16b + eor v7.16b, v7.16b, v16.16b + + // c ^= (b + a) <<< 9 + add v16.4s, v7.4s, v4.4s + shl v17.4s, v16.4s, #9 + ushr v16.4s, v16.4s, #23 + orr v16.16b, v16.16b, v17.16b + eor v6.16b, v6.16b, v16.16b + + // d ^= (c + b) <<< 13 + add v16.4s, v6.4s, v7.4s + ext v7.16b, v7.16b, v7.16b, #12 + shl v17.4s, v16.4s, #13 + ushr v16.4s, v16.4s, #19 + orr v16.16b, v16.16b, v17.16b + eor v5.16b, v5.16b, v16.16b + + // a ^= (d + c) <<< 18 + add v16.4s, v5.4s, v6.4s + ext v6.16b, v6.16b, v6.16b, #8 + ext v5.16b, v5.16b, v5.16b, #4 + shl v17.4s, v16.4s, #18 + ushr v16.4s, v16.4s, #14 + orr v16.16b, v16.16b, v17.16b + eor v4.16b, v4.16b, v16.16b + + // We had to undo the transpose ready for the next loop. Again, push + // back the reorderings to reduce latency. Decrement the loop + // counter and see if we should go round again. + cbz w0, 9f + + // Do the first half of the next round because this loop is offset. + + // b ^= (a + d) <<< 7 + add v16.4s, v4.4s, v7.4s + shl v17.4s, v16.4s, #7 + ushr v16.4s, v16.4s, #25 + orr v16.16b, v16.16b, v17.16b + eor v5.16b, v5.16b, v16.16b + + // c ^= (b + a) <<< 9 + add v16.4s, v5.4s, v4.4s + shl v17.4s, v16.4s, #9 + ushr v16.4s, v16.4s, #23 + orr v16.16b, v16.16b, v17.16b + eor v6.16b, v6.16b, v16.16b + + // d ^= (c + b) <<< 13 + add v16.4s, v6.4s, v5.4s + ext v5.16b, v5.16b, v5.16b, #12 + shl v17.4s, v16.4s, #13 + ushr v16.4s, v16.4s, #19 + orr v16.16b, v16.16b, v17.16b + eor v7.16b, v7.16b, v16.16b + + // a ^= (d + c) <<< 18 + add v16.4s, v7.4s, v6.4s + ext v6.16b, v6.16b, v6.16b, #8 + ext v7.16b, v7.16b, v7.16b, #4 + shl v17.4s, v16.4s, #18 + ushr v16.4s, v16.4s, #14 + orr v16.16b, v16.16b, v17.16b + eor v4.16b, v4.16b, v16.16b + + b 0b + + // Almost there. Firstly the feedfoward addition. Also, establish + // constants which will be useful later. +9: add v0.4s, v0.4s, v4.4s // 0, 5, 10, 15 + movi v16.2d, #0xffffffff // = (-1, 0, -1, 0) + movi d17, #-1 // = (-1, -1, 0, 0) + add v1.4s, v1.4s, v5.4s // 4, 9, 14, 3 + add v2.4s, v2.4s, v6.4s // 8, 13, 2, 7 + add v3.4s, v3.4s, v7.4s // 12, 1, 6, 11 + + // Next we must undo the permutation which was already applied to the + // input. The core trick is from Dan Bernstein's `armneon3' + // implementation, but with a lot of liposuction. + mov v4.16b, v0.16b + + // Sort out the columns by pairs. + bif v0.16b, v3.16b, v16.16b // 0, 1, 10, 11 + bif v3.16b, v2.16b, v16.16b // 12, 13, 6, 7 + bif v2.16b, v1.16b, v16.16b // 8, 9, 2, 3 + bif v1.16b, v4.16b, v16.16b // 4, 5, 14, 15 + mov v4.16b, v0.16b + mov v5.16b, v3.16b + + // Now fix up the remaining discrepancies. + bif v0.16b, v2.16b, v17.16b // 0, 1, 2, 3 + bif v3.16b, v1.16b, v17.16b // 12, 13, 14, 15 + bif v2.16b, v4.16b, v17.16b // 8, 9, 10, 11 + bif v1.16b, v5.16b, v17.16b // 4, 5, 6, 7 + + // And with that, we're done. 
+ st1 {v0.4s-v3.4s}, [x2] + ret + +ENDFUNC + +///----- That's all, folks -------------------------------------------------- diff --git a/symm/salsa20.c b/symm/salsa20.c index ff6efe39..03fcf469 100644 --- a/symm/salsa20.c +++ b/symm/salsa20.c @@ -78,6 +78,10 @@ extern core__functype salsa20_core_x86ish_sse2; extern core__functype salsa20_core_arm_neon; #endif +#if CPUFAM_ARM64 +extern core__functype salsa20_core_arm64; +#endif + static core__functype *pick_core(void) { #if CPUFAM_X86 || CPUFAM_AMD64 @@ -88,6 +92,9 @@ static core__functype *pick_core(void) DISPATCH_PICK_COND(salsa20_core, salsa20_core_arm_neon, cpu_feature_p(CPUFEAT_ARM_NEON)); #endif +#if CPUFAM_ARM64 + DISPATCH_PICK_COND(salsa20_core, salsa20_core_arm64, 1); +#endif DISPATCH_PICK_FALLBACK(salsa20_core, simple_core); } -- 2.11.0
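(A final aside, not part of the patch: for comparison with the SIMD code in chacha-arm64.S above, here is the ChaCha quarter-round written out for a single column in scalar registers; w4-w7 stand for one column's a, b, c, d and the assignment is illustrative. Each shl/ushr/orr triple in the vector code corresponds to a single ror here, since a left-rotate by n of a 32-bit word is an ROR by 32 - n.)

	// One ChaCha quarter-round on a single column: a, b, c, d in w4-w7.
	add	w4, w4, w5		// a += b; d ^= a; d <<<= 16
	eor	w7, w7, w4
	ror	w7, w7, #16
	add	w6, w6, w7		// c += d; b ^= c; b <<<= 12
	eor	w5, w5, w6
	ror	w5, w5, #20
	add	w4, w4, w5		// a += b; d ^= a; d <<<= 8
	eor	w7, w7, w4
	ror	w7, w7, #24
	add	w6, w6, w7		// c += d; b ^= c; b <<<= 7
	eor	w5, w5, w6
	ror	w5, w5, #25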