From: Mark Wooding
Date: Wed, 18 May 2016 09:29:03 +0000 (+0100)
Subject: Merge branch 'mdw/cpu-dispatch'
X-Git-Tag: 2.2.3~1^2~22
X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/commitdiff_plain/1da1ed6a5815deef6c33d74f1eb3c856793df3e5?hp=1c8e76bd932668c8e26f9984ad008c0027dc1167

Merge branch 'mdw/cpu-dispatch'

* mdw/cpu-dispatch:
  Add support machinery for ARM hosts.
  base/dispatch.c: Add (unused) machinery for probing ELF auxiliary vector.
  Add support for AMD64 processors and Microsoft Windows.
  symm/rijndael-x86-aesni.S: Unify encryption and decryption with a macro.
  symm/rijndael-x86-aesni.S: Use xmm5 instead of xmm7.
  symm/*.S: Symbolic names for shuffles.
  symm/chacha-x86-sse2.S: Fix the register allocation comment.
  Preprocess the assembler files.
  configure.ac: Improve the host CPU family detection.
  base/dispatch.c: Indent some preprocessor definitions properly.
  Add a pile of debug output around the CPU dispatching machinery.
  base/dispatch.c: Add documentation for some internal functions.
  base/dispatch.c: Add in more useful section markers.
  Support Intel's AES Native Instructions where available on x86 hardware.
  symm/: New SSE2 implementations of Salsa20 and ChaCha.
  symm/salsa20.c, symm/salsa20-core.h: Permute input matrix for SIMD.
  debian/rules: Run tests twice, once without any detected CPU features.
  base/dispatch.c: Check operating system support for XMM registers.
  configure.ac, base/dispatch.[ch]: CPU-specific implementations.
  configure.ac: Arrange to have an assembler available.

Conflicts:
	configure.ac
	symm/Makefile.am
---

diff --git a/base/Makefile.am b/base/Makefile.am
index 35c86ff5..0ac43f2e 100644
--- a/base/Makefile.am
+++ b/base/Makefile.am
@@ -40,6 +40,10 @@ libbase_la_SOURCES += arena.c
 pkginclude_HEADERS += ct.h
 libbase_la_SOURCES += ct.c
 
+## CPU-specific dispatch.
+pkginclude_HEADERS += dispatch.h
+libbase_la_SOURCES += dispatch.c
+
 ## Acceptable key-size descriptions.
 pkginclude_HEADERS += keysz.h
 libbase_la_SOURCES += keysz.c keysz-conv.c
@@ -51,4 +55,7 @@ libbase_la_SOURCES += lmem.c
 ## Clearing secrets from memory.
 pkginclude_HEADERS += paranoia.h
 
+## Base definitions for assembler source.
+EXTRA_DIST += asm-common.h
+
 ###----- That's all, folks --------------------------------------------------

diff --git a/base/asm-common.h b/base/asm-common.h
new file mode 100644
index 00000000..0af9cb58
--- /dev/null
+++ b/base/asm-common.h
@@ -0,0 +1,221 @@
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// Base definitions for assembler source
+///
+/// (c) 2015 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// General definitions.
+
+// Announcing an external function.
+#define FUNC(name) \
+	.globl F(name); \
+	TYPE_FUNC(name); \
+	.macro ENDFUNC; _ENDFUNC(name); .endm; \
+	FUNC_PREHOOK(name); \
+F(name): \
+	FUNC_POSTHOOK(name)
+
+// Marking the end of a function.
+#define _ENDFUNC(name) \
+	.purgem ENDFUNC; \
+	SIZE_OBJ(name); \
+	ENDFUNC_HOOK(name)
+
+///--------------------------------------------------------------------------
+/// ELF-specific hacking.
+
+#if __ELF__
+
+#if __PIC__ || __PIE__
+# define WANT_PIC 1
+#endif
+
+#define TYPE_FUNC(name) .type name, STT_FUNC
+
+#define SIZE_OBJ(name) .size name, . - name
+
+#endif
+
+///--------------------------------------------------------------------------
+/// Windows-specific hacking.
+
+#if ABI_WIN
+
+#if CPUFAM_X86
+# define F(name) _##name
+#endif
+
+#endif
+
+///--------------------------------------------------------------------------
+/// x86- and amd64-specific hacking.
+///
+/// It's (slightly) easier to deal with both of these in one go.
+
+#if CPUFAM_X86 || CPUFAM_AMD64
+
+// Set the function hooks.
+#define FUNC_PREHOOK(_) .balign 16
+
+// Don't use the wretched AT&T syntax.  It's festooned with pointless
+// punctuation, and all of the data movement is backwards.  Ugh!
+	.intel_syntax noprefix
+
+// Call external subroutine at ADDR, possibly via PLT.
+	.macro callext addr
+#if WANT_PIC
+	call	\addr@PLT
+#else
+	call	\addr
+#endif
+	.endm
+
+// Do I need to arrange a spare GOT register?
+#if WANT_PIC && CPUFAM_X86
+# define NEED_GOT 1
+#endif
+#define GOTREG ebx		// Not needed in AMD64 so don't care.
+
+// Maybe load GOT address into GOT.
+	.macro ldgot got=GOTREG
+#if WANT_PIC && CPUFAM_X86
+	call	_where_am_i.\got
+	add	\got, offset _GLOBAL_OFFSET_TABLE_
+#endif
+	.endm
+
+// Maybe build a helper subroutine for `ldgot GOT'.
+	.macro gotaux got=GOTREG
+#if WANT_PIC && CPUFAM_X86
+	.align	16
+_where_am_i.\got :
+	mov	\got, [esp]
+	ret
+#endif
+	.endm
+
+// Load address of external symbol ADDR into REG, maybe using GOT.
+	.macro leaext reg, addr, got=GOTREG
+#if WANT_PIC
+# if CPUFAM_X86
+	mov	\reg, [\got + \addr@GOT]
+# endif
+# if CPUFAM_AMD64
+	mov	\reg, \addr@GOTPCREL[rip]
+# endif
+#else
+# if CPUFAM_X86
+	mov	\reg, offset \addr
+# endif
+# if CPUFAM_AMD64
+	lea	\reg, \addr[rip]
+# endif
+#endif
+	.endm
+
+// Address expression (possibly using a base register, and a displacement)
+// referring to ADDR, which is within our module, maybe using GOT.
+#define INTADDR(...) INTADDR__0(__VA_ARGS__, GOTREG, dummy)
+#define INTADDR__0(addr, got, ...) INTADDR__1(addr, got)
+#if CPUFAM_AMD64
+# define INTADDR__1(addr, got) addr + rip
+#elif WANT_PIC
+# define INTADDR__1(addr, got) got + addr@GOTOFF
+#else
+# define INTADDR__1(addr, got) addr
+#endif
+
+#endif
+
+///--------------------------------------------------------------------------
+/// ARM-specific hacking.
+
+#if CPUFAM_ARM
+
+// Set the function hooks.
+#define FUNC_PREHOOK(_) .balign 4
+#define ENDFUNC_HOOK(name) .ltorg
+
+// Call external subroutine at ADDR, possibly via PLT.
+	.macro callext addr, cond=
+#if WANT_PIC
+	bl\cond	\addr(PLT)
+#else
+	bl\cond	\addr
+#endif
+	.endm
+
+// Do I need to arrange a spare GOT register?
+#if WANT_PIC
+# define NEED_GOT 1
+#endif
+#define GOTREG r9
+
+// Maybe load GOT address into GOT.
+	.macro ldgot got=r9
+#if WANT_PIC
+	ldr	\got, =_GLOBAL_OFFSET_TABLE_ - . - 12
+	add	\got, pc, \got
+#endif
+	.endm
+
+// Load address of external symbol ADDR into REG, maybe using GOT.
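+// (For illustration: under PIC, `leaext r1, SYM' expands, per the macro
+// below, to
+//	ldr	r1, =SYM(GOT)
+//	ldr	r1, [r9, r1]
+// i.e., fetch the offset of SYM's GOT slot from the literal pool, then
+// load SYM's address out of the GOT itself.)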
+	.macro leaext reg, addr, cond=, got=GOTREG
+#if WANT_PIC
+	ldr\cond	\reg, =\addr(GOT)
+	ldr\cond	\reg, [\got, \reg]
+#else
+	ldr\cond	\reg, =\addr
+#endif
+	.endm
+
+#endif
+
+///--------------------------------------------------------------------------
+/// Final stuff.
+
+// Default values for the various hooks.
+#ifndef FUNC_PREHOOK
+# define FUNC_PREHOOK(name)
+#endif
+#ifndef FUNC_POSTHOOK
+# define FUNC_POSTHOOK(name)
+#endif
+#ifndef ENDFUNC_HOOK
+# define ENDFUNC_HOOK(name)
+#endif
+
+#ifndef F
+# define F(name) name
+#endif
+
+#ifndef TYPE_FUNC
+# define TYPE_FUNC(name)
+#endif
+
+#ifndef SIZE_OBJ
+# define SIZE_OBJ(name)
+#endif
+
+///----- That's all, folks --------------------------------------------------

diff --git a/base/dispatch.c b/base/dispatch.c
new file mode 100644
index 00000000..1b0ab2b1
--- /dev/null
+++ b/base/dispatch.c
@@ -0,0 +1,532 @@
+/* -*-c-*-
+ *
+ * CPU-specific dispatch
+ *
+ * (c) 2015 Straylight/Edgeware
+ */
+
+/*----- Licensing notice --------------------------------------------------*
+ *
+ * This file is part of Catacomb.
+ *
+ * Catacomb is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Library General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * Catacomb is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with Catacomb; if not, write to the Free
+ * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ * MA 02111-1307, USA.
+ */
+
+/*----- Header files ------------------------------------------------------*/
+
+#include "config.h"
+
+#include <ctype.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <mLib/macros.h>
+
+#include "dispatch.h"
+
+/*----- Intel x86/AMD64 feature probing -----------------------------------*/
+
+#if CPUFAM_X86 || CPUFAM_AMD64
+
+# define EFLAGS_ID (1u << 21)
+# define CPUID1D_SSE2 (1u << 26)
+# define CPUID1D_FXSR (1u << 24)
+# define CPUID1C_AESNI (1u << 25)
+
+struct cpuid { unsigned a, b, c, d; };
+
+/* --- @cpuid@ --- *
+ *
+ * Arguments:	@struct cpuid *cc@ = where to write the result
+ *		@unsigned a, c@ = EAX and ECX registers to set
+ *
+ * Returns:	---
+ *
+ * Use:		Minimal C wrapper around the x86 `CPUID' instruction.  Checks
+ *		that the instruction is actually available before invoking
+ *		it; fills the output structure with zero if it's not going to
+ *		work.
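+ *
+ *		(Illustrative use, as in @cpuid_features_p@ below:
+ *		@struct cpuid c; cpuid(&c, 1, 0);@ reads leaf 1, after
+ *		which @c.d & CPUID1D_SSE2@ is nonzero exactly when the CPU
+ *		claims SSE2.)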
+ */
+
+#ifdef __GNUC__
+# if CPUFAM_X86
+static __inline__ unsigned getflags(void)
+  { unsigned f; __asm__ ("pushf; popl %0" : "=g" (f)); return (f); }
+static __inline__ unsigned setflags(unsigned f)
+{
+  unsigned ff;
+  __asm__ ("pushf; pushl %1; popf; pushf; popl %0; popf"
+	   : "=g" (ff)
+	   : "g" (f));
+  return (ff);
+}
+# else
+static __inline__ unsigned long getflags(void)
+  { unsigned long f; __asm__ ("pushf; popq %0" : "=g" (f)); return (f); }
+static __inline__ unsigned long setflags(unsigned long f)
+{
+  unsigned long ff;
+  __asm__ ("pushf; pushq %1; popf; pushf; popq %0; popf"
+	   : "=g" (ff)
+	   : "g" (f));
+  return (ff);
+}
+# endif
+#endif
+
+static void cpuid(struct cpuid *cc, unsigned a, unsigned c)
+{
+#ifdef __GNUC__
+  unsigned f;
+#endif
+
+  cc->a = cc->b = cc->c = cc->d = 0;
+
+#ifdef __GNUC__
+  /* Stupid dance to detect whether the CPUID instruction is available. */
+  f = getflags();
+  if (!(setflags(f | EFLAGS_ID) & EFLAGS_ID) ||
+      setflags(f & ~EFLAGS_ID) & EFLAGS_ID) {
+    dispatch_debug("CPUID instruction not available");
+    return;
+  }
+  setflags(f);
+
+  /* Alas, EBX is magical in PIC code, so abuse ESI instead.  This isn't
+   * pretty, but it works.
+   */
+# if CPUFAM_X86
+  __asm__ ("pushl %%ebx; cpuid; movl %%ebx, %%esi; popl %%ebx"
+	   : "=a" (cc->a), "=S" (cc->b), "=c" (cc->c), "=d" (cc->d)
+	   : "a" (a) , "c" (c));
+# elif CPUFAM_AMD64
+  __asm__ ("pushq %%rbx; cpuid; movl %%ebx, %%esi; popq %%rbx"
+	   : "=a" (cc->a), "=S" (cc->b), "=c" (cc->c), "=d" (cc->d)
+	   : "a" (a) , "c" (c));
+# else
+#  error "I'm confused."
+# endif
+  dispatch_debug("CPUID(%08x, %08x) -> %08x, %08x, %08x, %08x",
+		 a, c, cc->a, cc->b, cc->c, cc->d);
+#else
+  dispatch_debug("GNU inline assembler not available; can't CPUID");
+#endif
+}
+
+static unsigned cpuid_maxleaf(void)
+  { struct cpuid c; cpuid(&c, 0, 0); return (c.a); }
+
+/* --- @cpuid_features_p@ --- *
+ *
+ * Arguments:	@unsigned dbits@ = bits to check in EDX
+ *		@unsigned cbits@ = bits to check in ECX
+ *
+ * Returns:	Nonzero if all the requested bits are set in the CPUID result
+ *		on leaf 1.
+ */
+
+static int cpuid_features_p(unsigned dbits, unsigned cbits)
+{
+  struct cpuid c;
+  if (cpuid_maxleaf() < 1) return (0);
+  cpuid(&c, 1, 0);
+  return ((c.d & dbits) == dbits && (c.c & cbits) == cbits);
+}
+
+/* --- @xmm_registers_available_p@ --- *
+ *
+ * Arguments:	---
+ *
+ * Returns:	Nonzero if the operating system has made the XMM registers
+ *		available for use.
+ */
+
+static int xmm_registers_available_p(void)
+{
+#ifdef __GNUC__
+  unsigned f;
+  /* This hack is by Agner Fog.  Use FXSAVE/FXRSTOR to figure out whether the
+   * XMM registers are actually alive.
+   */
+  if (!cpuid_features_p(CPUID1D_FXSR, 0)) return (0);
+# if CPUFAM_X86
+  __asm__ ("movl %%esp, %%edx; subl $512, %%esp; andl $~15, %%esp\n"
+	   "fxsave (%%esp)\n"
+	   "movl 160(%%esp), %%eax; xorl $0xaaaa5555, 160(%%esp)\n"
+	   "fxrstor (%%esp); fxsave (%%esp)\n"
+	   "movl 160(%%esp), %%ecx; movl %%eax, 160(%%esp)\n"
+	   "fxrstor (%%esp); movl %%edx, %%esp\n"
+	   "xorl %%ecx, %%eax"
+	   : "=a" (f)
+	   : /* no inputs */
+	   : "%ecx", "%edx");
+# elif CPUFAM_AMD64
+  __asm__ ("movq %%rsp, %%rdx; subq $512, %%rsp; andq $~15, %%rsp\n"
+	   "fxsave (%%rsp)\n"
+	   "movl 160(%%rsp), %%eax; xorl $0xaaaa5555, 160(%%rsp)\n"
+	   "fxrstor (%%rsp); fxsave (%%rsp)\n"
+	   "movl 160(%%rsp), %%ecx; movl %%eax, 160(%%rsp)\n"
+	   "fxrstor (%%rsp); movq %%rdx, %%rsp\n"
+	   "xorl %%ecx, %%eax"
+	   : "=a" (f)
+	   : /* no inputs */
+	   : "%ecx", "%rdx");
+# else
+#  error "I'm confused."
+# endif
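+  /* (Bytes 160--175 of the FXSAVE image hold %xmm0, so offset 160 is its
+   * low 32 bits.  The sequence above flips bits in the saved copy, reloads
+   * it, saves again, and then repairs the damage; @f@ ends up zero
+   * precisely when the second save didn't reflect the change, i.e., when
+   * the operating system isn't keeping the XMM registers alive.)
+   */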
+  dispatch_debug("XMM registers %savailable", f ? "" : "not ");
+  return (f);
+#else
+  dispatch_debug("GNU inline assembler not available; can't check for XMM");
+  return (0);
+#endif
+}
+
+#endif
+
+/*----- General feature probing using auxiliary vectors -------------------*/
+
+/* Try to find the system's definitions for auxiliary vector entries. */
+#ifdef HAVE_SYS_AUXV_H
+# include <sys/auxv.h>
+#else
+# ifdef HAVE_LINUX_AUXVEC_H
+#  include <linux/auxvec.h>
+# endif
+# ifdef HAVE_ASM_HWCAP_H
+#  include <asm/hwcap.h>
+# endif
+#endif
+
+/* The type of entries in the auxiliary vector.  I'm assuming that `unsigned
+ * long' matches each platform's word length; if this is false then we'll
+ * need some host-specific tweaking here.
+ */
+union auxval { long i; unsigned long u; const void *p; };
+struct auxentry { unsigned long type; union auxval value; };
+
+/* Register each CPU family's interest in the auxiliary vector.  Make sure
+ * that the necessary entry types are defined.  This is primarily ordered by
+ * entry type to minimize duplication.
+ */
+#if defined(AT_HWCAP) && CPUFAM_ARMEL
+# define WANT_ANY 1
+# define WANT_AT_HWCAP(_) _(AT_HWCAP, u, hwcap)
+#endif
+
+/* If we couldn't find any interesting entries then we can switch all of this
+ * machinery off.  Also do that if we have no means for atomic updates.
+ */
+#if WANT_ANY && CPU_DISPATCH_P
+
+/* The main output of this section is a bitmask of detected features.  The
+ * least significant bit will be set if we've tried to probe.  Always access
+ * this using `DISPATCH_LOAD' and `DISPATCH_STORE'.
+ */
+static unsigned hwcaps = 0;
+
+/* For each potentially interesting type which turned out not to exist or be
+ * wanted, define a dummy macro for the sake of the next step.
+ */
+#ifndef WANT_AT_HWCAP
+# define WANT_AT_HWCAP(_)
+#endif
+
+/* For each CPU family, define two lists.
+ *
+ *   * `WANTAUX' is a list of the `WANT_AT_MUMBLE' macros which the CPU
+ *     family tried to register interest in above.  Each entry contains the
+ *     interesting auxiliary vector entry type, the name of the union branch
+ *     for its value, and the name of the slot in `struct auxprobe' in which
+ *     to store the value.
+ *
+ *   * `CAPMAP' is a list describing the output features which the CPU family
+ *     intends to satisfy from the auxiliary vector.  Each entry contains a
+ *     feature name suffix, and the token name (for `check_env').
+ */
+#if CPUFAM_ARMEL
+# define WANTAUX(_) \
+	WANT_AT_HWCAP(_)
+# define CAPMAP(_) \
+	_(ARM_VFP, "arm:vfp") \
+	_(ARM_NEON, "arm:neon") \
+	_(ARM_V4, "arm:v4") \
+	_(ARM_D32, "arm:d32")
+#endif
+
+/* Build the bitmask for `hwcaps' from the `CAPMAP' list. */
+enum {
+  HFI_PROBED = 0,
+#define HFI__ENUM(feat, tok) HFI_##feat,
+  CAPMAP(HFI__ENUM)
+#undef HFI__ENUM
+  HFI__END
+};
+enum {
+  HF_PROBED = 1,
+#define HF__FLAG(feat, tok) HF_##feat = 1 << HFI_##feat,
+  CAPMAP(HF__FLAG)
+#undef HF__FLAG
+  HF__END
+};
+
+/* Build a structure in which we can capture the interesting data from the
+ * auxiliary vector.
+ */
+#define AUXUTYPE_i long
+#define AUXUTYPE_u unsigned long
+#define AUXUTYPE_p const void *
+struct auxprobe {
+#define AUXPROBE__SLOT(type, ubranch, slot) AUXUTYPE_##ubranch slot;
+  WANTAUX(AUXPROBE__SLOT)
+#undef AUXPROBE__SLOT
+};
+
+/* --- @probe_hwcaps@ --- *
+ *
+ * Arguments:	---
+ *
+ * Returns:	---
+ *
+ * Use:		Attempt to find the auxiliary vector (which is well hidden)
+ *		and discover interesting features from it.
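+ *
+ *		(The least significant bit, @HF_PROBED@, is set in the
+ *		result before anything else happens, so a mask with no
+ *		feature bits set is still distinguishable from `never
+ *		probed'.)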
+ */
+
+static void probe_hwcaps(void)
+{
+  unsigned hw = HF_PROBED;
+  struct auxprobe probed = { 0 };
+
+  /* Populate `probed' with the information we manage to retrieve from the
+   * auxiliary vector.  Slots we couldn't find are left zero-valued.
+   */
+#if defined(HAVE_GETAUXVAL)
+  /* Shiny new libc lets us request individual entry types.  This is almost
+   * too easy.
+   */
+# define CAP__GET(type, ubranch, slot) \
+	probed.slot = (AUXUTYPE_##ubranch)getauxval(type);
+  WANTAUX(CAP__GET)
+#else
+  /* Otherwise we're a bit stuck, really.  Modern Linux kernels make a copy
+   * of the vector available in `/proc' so we could try that.
+   *
+   * The usual place is stuck on the end of the environment vector, but that
+   * may well have moved, and we have no way of telling whether it has or
+   * whether there was ever an auxiliary vector there at all; so don't do
+   * that.
+   */
+  {
+    FILE *fp = 0;
+    unsigned char *p = 0, *q = 0;
+    const struct auxentry *a;
+    size_t sz, off, n;
+
+    /* Open the file and read it into a memory chunk. */
+    if ((fp = fopen("/proc/self/auxv", "rb")) == 0) goto clean;
+    sz = 4096; off = 0;
+    if ((p = malloc(sz)) == 0) goto clean;
+    for (;;) {
+      n = fread(p + off, 1, sz - off, fp);
+      off += n;
+      if (off < sz) break;
+      sz *= 2; if ((q = realloc(p, sz)) == 0) break;
+      p = q;
+    }
+
+    /* Work through the vector (or as much of it as we found) and extract the
+     * types we're interested in.
+     */
+    for (a = (const struct auxentry *)p,
+	   n = off/sizeof(struct auxentry);
+	 n--; a++) {
+      switch (a->type) {
+#define CAP__SWITCH(type, ubranch, slot) \
+	case type: probed.slot = a->value.ubranch; break;
+	WANTAUX(CAP__SWITCH)
+      }
+    }
+
+  clean:
+    if (p) free(p);
+    if (fp) fclose(fp);
+  }
+#endif
+
+  /* Each CPU family now has to pick through what was found and stashed in
+   * `probed', and set the appropriate flag bits in `hw'.
+   */
+#if CPUFAM_ARMEL
+  if (probed.hwcap & HWCAP_VFPv3) hw |= HF_ARM_VFP;
+  if (probed.hwcap & HWCAP_NEON) hw |= HF_ARM_NEON;
+  if (probed.hwcap & HWCAP_VFPD32) hw |= HF_ARM_D32;
+  if (probed.hwcap & HWCAP_VFPv4) hw |= HF_ARM_V4;
+#endif
+
+  /* Store the bitmask of features we probed for everyone to see. */
+  DISPATCH_STORE(hwcaps, hw);
+
+  /* Finally, make a report about the things we found.  (Doing this earlier
+   * will pointlessly widen the window in which multiple threads will do the
+   * above auxiliary-vector probing.)
+   */
+#define CAP__DEBUG(feat, tok) \
+  dispatch_debug("check auxv for feature `%s': %s", tok, \
+		 hw & HF_##feat ? "available" : "absent");
+  CAPMAP(CAP__DEBUG)
+#undef CAP__DEBUG
+}
+
+/* --- @get_hwcaps@ --- *
+ *
+ * Arguments:	---
+ *
+ * Returns:	A mask of hardware capabilities and other features, as probed
+ *		from the auxiliary vector.
+ */
+
+static unsigned get_hwcaps(void)
+{
+  unsigned hw;
+
+  DISPATCH_LOAD(hwcaps, hw);
+  if (!(hw & HF_PROBED)) { probe_hwcaps(); DISPATCH_LOAD(hwcaps, hw); }
+  return (hw);
+}
+
+#endif
+
+/*----- External interface ------------------------------------------------*/
+
+/* --- @dispatch_debug@ --- *
+ *
+ * Arguments:	@const char *fmt@ = a format string
+ *		@...@ = additional arguments
+ *
+ * Returns:	---
+ *
+ * Use:		Writes a formatted message to standard error if dispatch
+ *		debugging is enabled.
+ */
+
+void dispatch_debug(const char *fmt, ...)
+{
+  va_list ap;
+  const char *e = getenv("CATACOMB_CPUDISPATCH_DEBUG");
+
+  if (e && *e != 'n' && *e != '0') {
+    va_start(ap, fmt);
+    fputs("Catacomb CPUDISPATCH: ", stderr);
+    vfprintf(stderr, fmt, ap);
+    fputc('\n', stderr);
+    va_end(ap);
+  }
+}
+
+/* --- @check_env@ --- *
+ *
+ * Arguments:	@const char *ftok@ = feature token
+ *
+ * Returns:	Zero if the feature is forced off; positive if it's forced
+ *		on; negative if the user hasn't decided.
+ *
+ * Use:		Checks the environment variable `CATACOMB_CPUFEAT' for the
+ *		feature token @ftok@.  The variable, if it exists, should be
+ *		a space-separated sequence of `+tok' and `-tok' items.  These
+ *		tokens may end in `*', which matches any suffix.
+ */
+
+static int IGNORABLE check_env(const char *ftok)
+{
+  const char *p, *q, *pp;
+  int d;
+
+  p = getenv("CATACOMB_CPUFEAT");
+  if (!p) return (-1);
+
+  for (;;) {
+    while (isspace((unsigned char)*p)) p++;
+    if (!*p) return (-1);
+    switch (*p) {
+      case '+': d = +1; p++; break;
+      case '-': d = 0; p++; break;
+      default: d = -1; break;
+    }
+    for (q = p; *q && !isspace((unsigned char)*q); q++);
+    if (d >= 0) {
+      for (pp = ftok; p < q && *pp && *p == *pp; p++, pp++);
+      if ((p == q && !*pp) || (*p == '*' && p + 1 == q)) return (d);
+    }
+    p = q;
+  }
+  return (-1);
+}
+
+/* --- @cpu_feature_p@ --- *
+ *
+ * Arguments:	@unsigned feat@ = a @CPUFEAT_...@ code
+ *
+ * Returns:	Nonzero if the feature is available.
+ */
+
+static int IGNORABLE
+  feat_debug(const char *ftok, const char *check, int verdict)
+{
+  if (verdict >= 0) {
+    dispatch_debug("feature `%s': %s -> %s", ftok, check,
+		   verdict ? "available" : "absent");
+  }
+  return (verdict);
+}
+
+int cpu_feature_p(int feat)
+{
+  int IGNORABLE f;
+  IGNORE(f);
+#define CASE_CPUFEAT(feat, ftok, cond) case CPUFEAT_##feat: \
+    if ((f = feat_debug(ftok, "environment override", \
+			check_env(ftok))) >= 0) \
+      return (f); \
+    else \
+      return (feat_debug(ftok, "runtime probe", cond));
+
+  switch (feat) {
+#if CPUFAM_X86 || CPUFAM_AMD64
+    CASE_CPUFEAT(X86_SSE2, "x86:sse2",
+		 xmm_registers_available_p() &&
+		 cpuid_features_p(CPUID1D_SSE2, 0));
+    CASE_CPUFEAT(X86_AESNI, "x86:aesni",
+		 xmm_registers_available_p() &&
+		 cpuid_features_p(CPUID1D_SSE2, CPUID1C_AESNI));
+#endif
+#ifdef CAPMAP
+# define FEATP__CASE(feat, tok) \
+	CASE_CPUFEAT(feat, tok, get_hwcaps() & HF_##feat)
+  CAPMAP(FEATP__CASE)
+#undef FEATP__CASE
+#endif
+  default:
+    dispatch_debug("denying unknown feature %d", feat);
+    return (0);
+  }
+#undef CASE_CPUFEAT
+}
+
+/*----- That's all, folks -------------------------------------------------*/

diff --git a/base/dispatch.h b/base/dispatch.h
new file mode 100644
index 00000000..1e463bdc
--- /dev/null
+++ b/base/dispatch.h
@@ -0,0 +1,193 @@
+/* -*-c-*-
+ *
+ * CPU-specific dispatch
+ *
+ * (c) 2015 Straylight/Edgeware
+ */
+
+/*----- Licensing notice --------------------------------------------------*
+ *
+ * This file is part of Catacomb.
+ *
+ * Catacomb is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Library General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * Catacomb is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with Catacomb; if not, write to the Free
+ * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ * MA 02111-1307, USA.
+ */
+
+#ifndef CATACOMB_DISPATCH_H
+#define CATACOMB_DISPATCH_H
+
+#ifdef __cplusplus
+  extern "C" {
+#endif
+
+/*----- Header files ------------------------------------------------------*/
+
+#include <mLib/macros.h>
+
+/*----- Macros ------------------------------------------------------------*/
+
+/* --- Atomic data access machinery --- *
+ *
+ * If they're available, use GCC's `__atomic_*' intrinsics.  If that doesn't
+ * work and we're using one of a small number of processors I'm sure won't
+ * mind, then just stick with simple memory access.  Otherwise turn
+ * dispatching off, because it probably isn't thread-safe.
+ */
+
+#if GCC_VERSION_P(4, 7)
+# define CPU_DISPATCH_P 1
+# define DISPATCH_LOAD(g, v) \
+	((v) = __atomic_load_n(&(g), __ATOMIC_RELAXED))
+# define DISPATCH_STORE(g, v) \
+	(__atomic_store_n(&(g), (v), __ATOMIC_RELAXED))
+#elif defined(__i386__) || defined(__amd64__) || \
+      defined(__arm__) || defined(__aarch64__) || \
+      defined(__mips__)
+# define CPU_DISPATCH_P 1
+# define DISPATCH_LOAD(g, v) ((v) = (g))
+# define DISPATCH_STORE(g, v) ((g) = (v))
+#endif
+
+/* --- A simple hack --- */
+
+#ifndef EMPTY
+# define EMPTY
+#endif
+
+/* --- @CPU_DISPATCH@ --- *
+ *
+ * Arguments:	@stcls@ = storage class for the main @ext@ function
+ *			(typically either @static@ or @EMPTY@)
+ *		@rtn@ = prefix for tail-calling a function of the appropriate
+ *			type (either @(void)@ or @return@)
+ *		@ret@ = return type for the function
+ *		@ext@ = name for the main function (other names are derived
+ *			from this)
+ *		@argdecls@ = parenthesis-enclosed list of argument types
+ *		@args@ = parenthesis-enclosed list of argument names only
+ *		@pick@ = function to select appropriate implementation
+ *		@dflt@ = fallback implementation
+ *
+ * Use:		Main machinery for CPU-specific dispatching.
+ *
+ *		The macro defines a function
+ *
+ *			@stcls ret ext argdecls@
+ *
+ *		The first time @ext@ is called, it will invoke @pick@ to
+ *		select and return a pointer to an appropriate
+ *		implementation for the runtime environment.  Subsequent calls
+ *		to @ext@ will (usually) call this preferred implementation
+ *		directly.
+ *
+ *		Some target platforms may not be able to establish the
+ *		necessary function pointer in a threadsafe way.  On such
+ *		platforms, the dispatch machinery is disabled and @ext@ will
+ *		simply call @dflt@.
+ *
+ *		Some additional declarations are made.  As a convenience,
+ *		@ext__functype@ is the function type of @ext@.  Declarations
+ *		are made for @pick@ and @dflt@, as @static@ functions.
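+ *
+ *		By way of a minimal usage sketch (the names here are made up
+ *		for illustration; the real callers, e.g., in @symm/chacha.c@
+ *		below, have exactly this shape):
+ *
+ *		  CPU_DISPATCH(static, return, int, madd,
+ *			       (int x, int y, int z), (x, y, z),
+ *			       pick_madd, simple_madd)
+ *		  static int simple_madd(int x, int y, int z)
+ *		    { return (x*y + z); }
+ *		  extern madd__functype madd_sse2; /* hypothetical asm version */
+ *		  static madd__functype *pick_madd(void)
+ *		  {
+ *		    DISPATCH_PICK_COND(madd, madd_sse2,
+ *				       cpu_feature_p(CPUFEAT_X86_SSE2));
+ *		    DISPATCH_PICK_FALLBACK(madd, simple_madd);
+ *		  }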
+ */
+
+#ifdef CPU_DISPATCH_P
+
+#define CPU_DISPATCH(stcls, rtn, ret, ext, argdecls, args, pick, dflt) \
+									\
+typedef ret ext##__functype argdecls;					\
+static ret dflt argdecls;						\
+static ret ext##__dispatch argdecls;					\
+static ext##__functype *pick(void);					\
+static ext##__functype *ext##__ptr = ext##__dispatch;			\
+									\
+static ret ext##__dispatch argdecls					\
+{									\
+  ext##__functype *f = pick();						\
+  DISPATCH_STORE(ext##__ptr, f);					\
+  rtn f args;								\
+}									\
+									\
+stcls ret ext argdecls							\
+{									\
+  ext##__functype *f;							\
+  DISPATCH_LOAD(ext##__ptr, f);						\
+  rtn f args;								\
+}
+
+#else
+
+#define CPU_DISPATCH(stcls, rtn, ret, ext, argdecls, args, pick, dflt) \
+									\
+typedef ret ext##__functype argdecls;					\
+static ret dflt argdecls;						\
+static ext##__functype *pick(void) IGNORABLE;				\
+									\
+stcls ret ext argdecls { rtn dflt args; }
+
+#endif
+
+/* --- Some macros for producing useful debugging --- */
+
+#define DISPATCH_PICK_COND(what, func, cond) do {			\
+  if (cond) {								\
+    dispatch_debug("picked `%s' for `%s'", #func, #what);		\
+    return (func);							\
+  }									\
+} while (0)
+#define DISPATCH_PICK_FALLBACK(what, func) do {				\
+  dispatch_debug("using default `%s'", #what);				\
+  return (func);							\
+} while (0)
+
+/*----- Functions provided ------------------------------------------------*/
+
+/* --- @dispatch_debug@ --- *
+ *
+ * Arguments:	@const char *fmt@ = a format string
+ *		@...@ = additional arguments
+ *
+ * Returns:	---
+ *
+ * Use:		Writes a formatted message to standard error if dispatch
+ *		debugging is enabled.
+ */
+
+extern void dispatch_debug(const char */*fmt*/, ...);
+
+/* --- @cpu_feature_p@ --- *
+ *
+ * Arguments:	@unsigned feat@ = a @CPUFEAT_...@ code
+ *
+ * Returns:	Nonzero if the feature is available.
+ */
+
+enum {
+  CPUFEAT_X86_SSE2,			/* Streaming SIMD Extensions 2 */
+  CPUFEAT_X86_AESNI,			/* AES Native Instructions */
+  CPUFEAT_ARM_VFP,			/* VFP floating-point (v3 or v4) */
+  CPUFEAT_ARM_NEON,			/* Advanced SIMD (v1 or v2) */
+  CPUFEAT_ARM_V4,			/* VFPv4 and/or SIMD v2 */
+  CPUFEAT_ARM_D32			/* 32 double registers, not 16 */
+};
+
+extern int cpu_feature_p(int /*feat*/);
+
+/*----- That's all, folks -------------------------------------------------*/
+
+#ifdef __cplusplus
+  }
+#endif
+
+#endif

diff --git a/configure.ac b/configure.ac
index c3e55ca4..715a5c28 100644
--- a/configure.ac
+++ b/configure.ac
@@ -32,6 +32,7 @@ AC_INIT([catacomb], AUTO_VERSION, [mdw@distorted.org.uk])
 AC_CONFIG_SRCDIR([catacomb.pc.in])
 AC_CONFIG_AUX_DIR([config])
 AM_INIT_AUTOMAKE([foreign parallel-tests color-tests subdir-objects])
+AC_CANONICAL_HOST
 
 mdw_SILENT_RULES
 
 AC_PROG_CC
@@ -39,11 +40,102 @@ AX_CFLAGS_WARN_ALL
 AM_PROG_LIBTOOL
 mdw_LIBTOOL_VERSION_INFO
 
+AM_PROG_AS
+
 AC_PROG_YACC
 
 AC_SUBST(AM_CFLAGS)
 
 dnl--------------------------------------------------------------------------
+dnl Host-specific configuration.
+
+AC_MSG_CHECKING([CPU family and ABI])
+
+dnl The table of CPU families and ABIs which we might support.  Support is
+dnl not uniform: each dispatched function might or might not have an
+dnl implementation for any particular CPU/ABI combination.
+AC_DEFUN([catacomb_CPU_FAMILIES],
+  [$1([i[[3-6]]86,cygwin], [x86], [win])
+   $1([i[[3-6]]86,*], [x86], [sysv])
+   $1([x86_64,cygwin], [amd64], [win])
+   $1([x86_64,*], [amd64], [sysv])
+   $1([armv*,*-gnueabi | armv*,*-gnueabihf], [armel], [gnueabi])])
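+
+dnl (For illustration: on an `x86_64-unknown-linux-gnu' host, the case
+dnl statement generated from this table matches `x86_64,*' and sets
+dnl CPUFAM=amd64 and ABI=sysv; an `i686-pc-cygwin' host matches
+dnl `i[3-6]86,cygwin' and gets CPUFAM=x86 and ABI=win.)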
+
+dnl A utility to clear the `seen' flags, used so as to process each CPU or
+dnl ABI once.
+m4_define([catacomb_CLEAR_FLAGS],
+[m4_ifdef([catacomb_seen_cpu/$2],
+	  [m4_undefine([catacomb_seen_cpu/$2])])dnl
+m4_ifdef([catacomb_seen_abi/$3],
+	  [m4_undefine([catacomb_seen_abi/$3])])])
+
+dnl Identify the current host.
+case $host_cpu,$host_os in
+  m4_define([catacomb_CPU_CASE],
+    [$1) CPUFAM=$2 ABI=$3 ;;
+])
+  catacomb_CPU_FAMILIES([catacomb_CPU_CASE])
+  *) CPUFAM=nil ABI=nil ;;
+esac
+
+dnl Figure out the current CPU.
+catacomb_CPU_FAMILIES([catacomb_CLEAR_FLAGS])
+case $CPUFAM in
+  m4_define([catacomb_DEFINE_CPU],
+    [m4_ifdef([catacomb_seen_cpu/$2], [],
+      [$2)
+	AC_DEFINE([CPUFAM_]m4_translit([$2], [a-z], [A-Z]), [1],
+		  [Define if host CPU family is \`$2\'.])
+	;;m4_define([catacomb_seen_cpu/$2], [t])])])
+  catacomb_CPU_FAMILIES([catacomb_DEFINE_CPU])
+  nil) ;;
+  *) AC_MSG_ERROR([BUG: unexpected cpufam \`$CPUFAM']) ;;
+esac
+AC_SUBST([CPUFAM])
+
+dnl Figure out the current ABI.
+catacomb_CPU_FAMILIES([catacomb_CLEAR_FLAGS])
+case $ABI in
+  m4_define([catacomb_DEFINE_ABI],
+    [m4_ifdef([catacomb_seen_abi/$3], [],
+      [$3)
+	AC_DEFINE([ABI_]m4_translit([$3], [a-z], [A-Z]), [1],
+		  [Define if host ABI variant is \`$3\'.])
+	;;m4_define([catacomb_seen_abi/$3], [t])])])
+  catacomb_CPU_FAMILIES([catacomb_DEFINE_ABI])
+  nil) ;;
+  *) AC_MSG_ERROR([BUG: unexpected ABI \`$ABI']) ;;
+esac
+AC_SUBST([ABI])
+
+dnl Establish Automake conditions for things.
+catacomb_CPU_FAMILIES([catacomb_CLEAR_FLAGS])
+m4_define([catacomb_COND_CPU],
+[m4_define([_CPU], m4_translit([$2], [a-z], [A-Z]))
+m4_define([_ABI], m4_translit([$3], [a-z], [A-Z]))
+AM_CONDITIONAL([CPUABI_]_CPU[_]_ABI, [test x$CPUFAM/$ABI = x$2/$3])
+m4_ifdef([catacomb_seen_cpu/$2], [],
+[AM_CONDITIONAL([CPUFAM_]_CPU, [test x$CPUFAM = x$2])dnl
+m4_define([catacomb_seen_cpu/$2], [t])])
+m4_ifdef([catacomb_seen_abi/$3], [],
+[AM_CONDITIONAL([ABI_]_ABI, [test x$ABI = x$3])dnl
+m4_define([catacomb_seen_abi/$3], [t])])])
+catacomb_CPU_FAMILIES([catacomb_COND_CPU])
+AM_CONDITIONAL([KNOWN_CPUFAM], [test x$CPUFAM != xnil])
+
+dnl Report on what we found.
+case $CPUFAM in
+  nil) AC_MSG_RESULT([not supported]) ;;
+  *) AC_MSG_RESULT([$CPUFAM/$ABI]) ;;
+esac
+
+dnl Some equipment wanted for checking CPU features at runtime.
+AC_CHECK_HEADERS([asm/hwcap.h])
+AC_CHECK_HEADERS([sys/auxv.h])
+AC_CHECK_HEADERS([linux/auxvec.h])
+AC_CHECK_FUNCS([getauxval])
+
+dnl--------------------------------------------------------------------------
 dnl C programming environment.
 
 dnl Find out if we're cross-compiling.

diff --git a/debian/rules b/debian/rules
index cec98bbe..e64e6b43 100755
--- a/debian/rules
+++ b/debian/rules
@@ -1,2 +1,6 @@
 #! /usr/bin/make -f
 %:; dh $@ --parallel -Bdebian/build
+
+override_dh_auto_test:
+	dh_auto_test --parallel -Bdebian/build
+	CATACOMB_CPUFEAT="-*" dh_auto_test --parallel -Bdebian/build

diff --git a/symm/Makefile.am b/symm/Makefile.am
index 69c1013f..1d3374f5 100644
--- a/symm/Makefile.am
+++ b/symm/Makefile.am
@@ -182,6 +182,12 @@ BLKCS += rc5
 
 ## Daemen and Rijmen's `Rijndael' block cipher, selected as AES.
 BLKCS += rijndael rijndael192 rijndael256
 libsymm_la_SOURCES += rijndael-base.h rijndael-base.c
+if CPUFAM_X86
+libsymm_la_SOURCES += rijndael-x86ish-aesni.S
+endif
+if CPUFAM_AMD64
+libsymm_la_SOURCES += rijndael-x86ish-aesni.S
+endif
 nodist_libsymm_la_SOURCES += ../precomp/symm/rijndael-tab.c
 PRECOMPS += $(precomp)/symm/rijndael-tab.c
 PRECOMP_PROGS += rijndael-mktab
@@ -388,6 +394,12 @@ ALL_CIPHERS += seal
 EXTRA_DIST += salsa20-tvconv
 pkginclude_HEADERS += salsa20.h salsa20-core.h
 libsymm_la_SOURCES += salsa20.c
+if CPUFAM_X86
+libsymm_la_SOURCES += salsa20-x86ish-sse2.S
+endif
+if CPUFAM_AMD64
+libsymm_la_SOURCES += salsa20-x86ish-sse2.S
+endif
 TESTS += salsa20.t$(EXEEXT)
 ALL_CIPHERS += salsa20 salsa2012 salsa208
 ALL_CIPHERS += xsalsa20 xsalsa2012 xsalsa208
@@ -414,6 +426,12 @@ t/salsa20: salsa20-tvconv t/salsa20.local $(SALSA20_ESTREAM_TV)
 ## Bernstein's `ChaCha' stream cipher.
 pkginclude_HEADERS += chacha.h chacha-core.h
 libsymm_la_SOURCES += chacha.c
+if CPUFAM_X86
+libsymm_la_SOURCES += chacha-x86ish-sse2.S
+endif
+if CPUFAM_AMD64
+libsymm_la_SOURCES += chacha-x86ish-sse2.S
+endif
 TESTS += chacha.t$(EXEEXT)
 EXTRA_DIST += t/chacha
 ALL_CIPHERS += chacha20 chacha12 chacha8

diff --git a/symm/chacha-core.h b/symm/chacha-core.h
index ad6b05f5..1c0efcd9 100644
--- a/symm/chacha-core.h
+++ b/symm/chacha-core.h
@@ -95,9 +95,12 @@
 
 /* The ChaCha feedforward step, used at the end of the core function.  Here,
  * @y@ contains the original input matrix; @z@ contains the final one, and is
- * updated.  This is the same as Salsa20.
+ * updated.  This is the same as Salsa20, only without the final permutation.
  */
-#define CHACHA_FFWD(z, y) SALSA20_FFWD(z, y)
+#define CHACHA_FFWD(z, y) do {						\
+  int _i;								\
+  for (_i = 0; _i < 16; _i++) (z)[_i] += (y)[_i];			\
+} while (0)
 
 /* Various numbers of rounds, unrolled.  Read from @y@, and write to @z@. */
 #define CHACHA_4R(z, y)							\

diff --git a/symm/chacha-x86ish-sse2.S b/symm/chacha-x86ish-sse2.S
new file mode 100644
index 00000000..f36bf90f
--- /dev/null
+++ b/symm/chacha-x86ish-sse2.S
@@ -0,0 +1,260 @@
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// Fancy SIMD implementation of ChaCha
+///
+/// (c) 2015 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// Local utilities.
+
+// Magic constants for shuffling.
+#define ROTL 0x93
+#define ROT2 0x4e
+#define ROTR 0x39
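+
+// (A reminder of how these shuffles act on a register holding doubleword
+// lanes [x0, x1, x2, x3]: `pshufd ..., ROTL' yields [x3, x0, x1, x2],
+// `ROT2' yields [x2, x3, x0, x1], and `ROTR' yields [x1, x2, x3, x0].)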
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+	.arch pentium4
+	.section .text
+
+FUNC(chacha_core_x86ish_sse2)
+
+	// Initial setup.
+
+#if CPUFAM_X86
+	// Arguments come in on the stack, and will need to be collected.  We
+	// can get away with just the scratch registers for integer work,
+	// but we'll run out of XMM registers and will need some properly
+	// aligned space which we'll steal from the stack.  I don't trust the
+	// stack pointer's alignment, so I'll have to mask the stack pointer,
+	// which in turn means I'll need to keep track of the old value.
+	// Hence I'm making a full i386-style stack frame here.
+	//
+	// The Windows and SysV ABIs are sufficiently similar that we don't
+	// need to worry about the differences here.
+
+# define NR ecx
+# define IN eax
+# define OUT edx
+# define SAVE0 xmm5
+# define SAVE1 xmm6
+# define SAVE2 xmm7
+# define SAVE3 [esp]
+
+	push	ebp
+	mov	ebp, esp
+	sub	esp, 16
+	mov	IN, [ebp + 12]
+	mov	OUT, [ebp + 16]
+	and	esp, ~15
+	mov	NR, [ebp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+	// This is nice.  We have plenty of XMM registers, and the arguments
+	// are in useful places.  There's no need to spill anything and we
+	// can just get on with the code.
+
+# define NR edi
+# define IN rsi
+# define OUT rdx
+# define SAVE0 xmm5
+# define SAVE1 xmm6
+# define SAVE2 xmm7
+# define SAVE3 xmm8
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+	// Arguments come in registers, but they're different between Windows
+	// and everyone else (and everyone else is saner).
+	//
+	// The Windows ABI insists that we preserve some of the XMM
+	// registers, but we want more than we can use as scratch space.  We
+	// only need to save a copy of the input for the feedforward at the
+	// end, so we might as well use memory rather than spill extra
+	// registers.  (We need an extra 8 bytes to align the stack.)
+
+# define NR ecx
+# define IN rdx
+# define OUT r8
+# define SAVE0 xmm5
+# define SAVE1 [rsp + 0]
+# define SAVE2 [rsp + 16]
+# define SAVE3 [rsp + 32]
+
+	sub	rsp, 48 + 8
+#endif
+
+	// First job is to slurp the matrix into XMM registers.  Be careful:
+	// the input matrix isn't likely to be properly aligned.
+	//
+	//	[ 0  1  2  3] (a, xmm0)
+	//	[ 4  5  6  7] (b, xmm1)
+	//	[ 8  9 10 11] (c, xmm2)
+	//	[12 13 14 15] (d, xmm3)
+	movdqu	xmm0, [IN +  0]
+	movdqu	xmm1, [IN + 16]
+	movdqu	xmm2, [IN + 32]
+	movdqu	xmm3, [IN + 48]
+
+	// Take a copy for later.  This one is aligned properly, by
+	// construction.
+	movdqa	SAVE0, xmm0
+	movdqa	SAVE1, xmm1
+	movdqa	SAVE2, xmm2
+	movdqa	SAVE3, xmm3
+
+loop:
+	// Apply a column quarterround to each of the columns simultaneously.
+	// Alas, there doesn't seem to be a packed doubleword rotate, so we
+	// have to synthesize it.
+
+	// a += b; d ^= a; d <<<= 16
+	paddd	xmm0, xmm1
+	pxor	xmm3, xmm0
+	movdqa	xmm4, xmm3
+	pslld	xmm3, 16
+	psrld	xmm4, 16
+	por	xmm3, xmm4
+
+	// c += d; b ^= c; b <<<= 12
+	paddd	xmm2, xmm3
+	pxor	xmm1, xmm2
+	movdqa	xmm4, xmm1
+	pslld	xmm1, 12
+	psrld	xmm4, 20
+	por	xmm1, xmm4
+
+	// a += b; d ^= a; d <<<= 8
+	paddd	xmm0, xmm1
+	pxor	xmm3, xmm0
+	movdqa	xmm4, xmm3
+	pslld	xmm3, 8
+	psrld	xmm4, 24
+	por	xmm3, xmm4
+
+	// c += d; b ^= c; b <<<= 7
+	paddd	xmm2, xmm3
+	pshufd	xmm3, xmm3, ROTL
+	pxor	xmm1, xmm2
+	pshufd	xmm2, xmm2, ROT2
+	movdqa	xmm4, xmm1
+	pslld	xmm1, 7
+	psrld	xmm4, 25
+	por	xmm1, xmm4
+
+	// The not-quite-transpose conveniently only involves reordering
+	// elements of individual rows, which can be done quite easily.  It
+	// doesn't involve any movement of elements between rows, or even
+	// renaming of the rows.
+	//
+	//	[ 0  1  2  3]      [ 0  1  2  3] (a, xmm0)
+	//	[ 4  5  6  7] -->  [ 5  6  7  4] (b, xmm1)
+	//	[ 8  9 10 11]      [10 11  8  9] (c, xmm2)
+	//	[12 13 14 15]      [15 12 13 14] (d, xmm3)
+	//
+	// The shuffles have quite high latency, so they've mostly been
+	// pushed upwards.  The remaining one can't be moved, though.
+	pshufd	xmm1, xmm1, ROTR
+
+	// Apply the diagonal quarterround to each of the columns
+	// simultaneously.
+
+	// a += b; d ^= a; d <<<= 16
+	paddd	xmm0, xmm1
+	pxor	xmm3, xmm0
+	movdqa	xmm4, xmm3
+	pslld	xmm3, 16
+	psrld	xmm4, 16
+	por	xmm3, xmm4
+
+	// c += d; b ^= c; b <<<= 12
+	paddd	xmm2, xmm3
+	pxor	xmm1, xmm2
+	movdqa	xmm4, xmm1
+	pslld	xmm1, 12
+	psrld	xmm4, 20
+	por	xmm1, xmm4
+
+	// a += b; d ^= a; d <<<= 8
+	paddd	xmm0, xmm1
+	pxor	xmm3, xmm0
+	movdqa	xmm4, xmm3
+	pslld	xmm3, 8
+	psrld	xmm4, 24
+	por	xmm3, xmm4
+
+	// c += d; b ^= c; b <<<= 7
+	paddd	xmm2, xmm3
+	pshufd	xmm3, xmm3, ROTR
+	pxor	xmm1, xmm2
+	pshufd	xmm2, xmm2, ROT2
+	movdqa	xmm4, xmm1
+	pslld	xmm1, 7
+	psrld	xmm4, 25
+	por	xmm1, xmm4
+
+	// Finally, finish off undoing the transpose, and we're done for this
+	// doubleround.  Again, most of this was done above so we don't have
+	// to wait for the shuffles.
+	pshufd	xmm1, xmm1, ROTL
+
+	// Decrement the loop counter and see if we should go round again.
+	sub	NR, 2
+	ja	loop
+
+	// Almost there.  Firstly, the feedforward addition.
+	paddd	xmm0, SAVE0
+	paddd	xmm1, SAVE1
+	paddd	xmm2, SAVE2
+	paddd	xmm3, SAVE3
+
+	// And now we write out the result.  This one won't be aligned
+	// either.
+	movdqu	[OUT +  0], xmm0
+	movdqu	[OUT + 16], xmm1
+	movdqu	[OUT + 32], xmm2
+	movdqu	[OUT + 48], xmm3
+
+	// Tidy things up.
+#if CPUFAM_X86
+	mov	esp, ebp
+	pop	ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+	add	rsp, 48 + 8
+#endif
+
+	// And with that, we're done.
+	ret
+
+ENDFUNC
+
+///----- That's all, folks --------------------------------------------------

diff --git a/symm/chacha.c b/symm/chacha.c
index e694ad22..0c8aa003 100644
--- a/symm/chacha.c
+++ b/symm/chacha.c
@@ -27,6 +27,8 @@
 
 /*----- Header files ------------------------------------------------------*/
 
+#include "config.h"
+
 #include
 #include
 
@@ -34,6 +36,7 @@
 #include "arena.h"
 #include "chacha.h"
 #include "chacha-core.h"
+#include "dispatch.h"
 #include "gcipher.h"
 #include "grand.h"
 #include "keysz.h"
@@ -59,9 +62,29 @@ const octet chacha_keysz[] = { KSZ_SET, 32, 16, 10, 0 };
 * the feedforward step.
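+ *
+ *		(On the first call, the machinery below picks either the
+ *		portable @simple_core@ or, when @cpu_feature_p@ reports
+ *		SSE2, the assembler @chacha_core_x86ish_sse2@; later calls
+ *		go straight to the chosen implementation.)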
*/ -static void core(unsigned r, const chacha_matrix src, chacha_matrix dest) +CPU_DISPATCH(static, (void), + void, core, (unsigned r, const chacha_matrix src, + chacha_matrix dest), + (r, src, dest), + pick_core, simple_core); + +static void simple_core(unsigned r, const chacha_matrix src, + chacha_matrix dest) { CHACHA_nR(dest, src, r); CHACHA_FFWD(dest, src); } +#if CPUFAM_X86 || CPUFAM_AMD64 +extern core__functype chacha_core_x86ish_sse2; +#endif + +static core__functype *pick_core(void) +{ +#if CPUFAM_X86 || CPUFAM_AMD64 + DISPATCH_PICK_COND(chacha_core, chacha_core_x86ish_sse2, + cpu_feature_p(CPUFEAT_X86_SSE2)); +#endif + DISPATCH_PICK_FALLBACK(chacha_core, simple_core); +} + /* --- @populate@ --- * * * Arguments: @chacha_matrix a@ = a matrix to fill in diff --git a/symm/rijndael-base.c b/symm/rijndael-base.c index bfab63a2..b0505a66 100644 --- a/symm/rijndael-base.c +++ b/symm/rijndael-base.c @@ -27,12 +27,15 @@ /*----- Header files ------------------------------------------------------*/ +#include "config.h" + #include #include #include #include "blkc.h" +#include "dispatch.h" #include "gcipher.h" #include "rijndael.h" #include "rijndael-base.h" @@ -55,25 +58,14 @@ const octet rijndael_keysz[] = { KSZ_RANGE, RIJNDAEL_KEYSZ, 4, 32, 4 }; * Use: Low-level key-scheduling. */ -void rijndael_setup(rijndael_ctx *k, unsigned nb, const void *buf, size_t sz) +static void simple_setup(rijndael_ctx *k, unsigned nb, + const void *buf, unsigned nk) { - unsigned nk, nr, nw; + unsigned nr = k->nr, nw; unsigned i, j, jj; const octet *p; uint32 ww; - /* --- Sort out the key size --- */ - - KSZ_ASSERT(rijndael, sz); - nk = sz / 4; - - /* --- Select the number of rounds --- */ - - nr = (nk > nb ? nk : nb) + 6; - if (nr < 10) - nr = 10; - k->nr = nr; - /* --- Fetch the first key words out --- */ p = buf; @@ -120,4 +112,42 @@ void rijndael_setup(rijndael_ctx *k, unsigned nb, const void *buf, size_t sz) k->wi[i] = k->w[j + jj++]; } +CPU_DISPATCH(static, EMPTY, void, setup, (rijndael_ctx *k, unsigned nb, + const void *buf, unsigned nk), + (k, nb, buf, nk), pick_setup, simple_setup) + +#if CPUFAM_X86 || CPUFAM_AMD64 +extern setup__functype rijndael_setup_x86ish_aesni; +#endif + +static setup__functype *pick_setup(void) +{ +#if CPUFAM_X86 || CPUFAM_AMD64 + DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_x86ish_aesni, + cpu_feature_p(CPUFEAT_X86_AESNI)); +#endif + DISPATCH_PICK_FALLBACK(rijndael_setup, simple_setup); +} + +void rijndael_setup(rijndael_ctx *k, unsigned nb, const void *buf, size_t sz) +{ + unsigned nk, nr; + + /* --- Sort out the key size --- */ + + KSZ_ASSERT(rijndael, sz); + nk = sz / 4; + + /* --- Select the number of rounds --- */ + + nr = (nk > nb ? nk : nb) + 6; + if (nr < 10) + nr = 10; + k->nr = nr; + + /* --- Do the main setup --- */ + + setup(k, nb, buf, nk); +} + /*----- That's all, folks -------------------------------------------------*/ diff --git a/symm/rijndael-x86ish-aesni.S b/symm/rijndael-x86ish-aesni.S new file mode 100644 index 00000000..91fcc352 --- /dev/null +++ b/symm/rijndael-x86ish-aesni.S @@ -0,0 +1,604 @@ +/// -*- mode: asm; asm-comment-char: ?/ -*- +/// +/// AESNI-based implementation of Rijndael +/// +/// (c) 2015 Straylight/Edgeware +/// + +///----- Licensing notice --------------------------------------------------- +/// +/// This file is part of Catacomb. 
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// External objects.
+
+	.globl	F(abort)
+	.globl	F(rijndael_rcon)
+
+///--------------------------------------------------------------------------
+/// Local utilities.
+
+// Magic constants for shuffling.
+#define ROTL 0x93
+#define ROT2 0x4e
+#define ROTR 0x39
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+	.arch .aes
+	.section .text
+
+/// The AESNI instructions implement a little-endian version of AES, but
+/// Catacomb's internal interface presents as big-endian so as to work better
+/// with things like GCM.  We therefore maintain the round keys in
+/// little-endian form, and have to end-swap blocks in and out.
+///
+/// For added amusement, the AESNI instructions don't implement the
+/// larger-block versions of Rijndael, so we have to end-swap the keys if
+/// we're preparing for one of those.
+
+	// Useful constants.
+	.equ	maxrounds, 16		// maximum number of rounds
+	.equ	maxblksz, 32		// maximum block size, in bytes
+	.equ	kbufsz, maxblksz*(maxrounds + 1) // size of key-schedule buffer
+
+	// Context structure.
+	.equ	nr, 0			// number of rounds
+	.equ	w, nr + 4		// encryption key words
+	.equ	wi, w + kbufsz		// decryption key words
+
+///--------------------------------------------------------------------------
+/// Key setup.
+
+FUNC(rijndael_setup_x86ish_aesni)
+
+#if CPUFAM_X86
+	// Arguments are on the stack.  We'll need to stack the caller's
+	// register variables, but we'll manage.
+
+# define CTX ebp			// context pointer
+# define BLKSZ [esp + 24]		// block size
+
+# define SI esi				// source pointer
+# define DI edi				// destination pointer
+
+# define KSZ ebx			// key size
+# define KSZo ebx			// ... as address offset
+# define NKW edx			// total number of key words
+# define NKW_NEEDS_REFRESH 1		// ... needs recalculating
+# define RCON ecx			// round constants table
+# define LIM edx			// limit pointer
+# define LIMn edx			// ... as integer offset from base
+
+# define NR ecx				// number of rounds
+# define LRK eax			// distance to last key
+# define LRKo eax			// ... as address offset
+# define BLKOFF edx			// block size in bytes
+# define BLKOFFo edx			// ... as address offset
+
+	// Stack the caller's registers.
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+
+	// Set up our own variables.
+	mov	CTX, [esp + 20]		// context base pointer
+	mov	SI, [esp + 28]		// key material
+	mov	KSZ, [esp + 32]		// key size, in words
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+	// Arguments are in registers.  We have plenty, but, to be honest,
+	// the initial register allocation is a bit annoying.
+
+# define CTX r8				// context pointer
+# define BLKSZ r9d			// block size
+
+# define SI rsi				// source pointer
+# define DI rdi				// destination pointer
+
+# define KSZ edx			// key size
+# define KSZo rdx			// ... as address offset
+# define NKW r10d			// total number of key words
+# define RCON rdi			// round constants table
+# define LIM rcx			// limit pointer
+# define LIMn ecx			// ... as integer offset from base
+
+# define NR ecx				// number of rounds
+# define LRK eax			// distance to last key
+# define LRKo rax			// ... as address offset
+# define BLKOFF r9d			// block size in bytes
+# define BLKOFFo r9			// ... as address offset
+
+	// Move arguments to more useful places.
+	mov	CTX, rdi		// context base pointer
+	mov	BLKSZ, esi		// block size in words
+	mov	SI, rdx			// key material
+	mov	KSZ, ecx		// key size, in words
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+	// Arguments are in different registers, and they're a little tight.
+
+# define CTX r8				// context pointer
+# define BLKSZ edx			// block size
+
+# define SI rsi				// source pointer
+# define DI rdi				// destination pointer
+
+# define KSZ r9d			// key size
+# define KSZo r9			// ... as address offset
+# define NKW r10d			// total number of key words
+# define RCON rdi			// round constants table
+# define LIM rcx			// limit pointer
+# define LIMn ecx			// ... as integer offset from base
+
+# define NR ecx				// number of rounds
+# define LRK eax			// distance to last key
+# define LRKo rax			// ... as address offset
+# define BLKOFF edx			// block size in bytes
+# define BLKOFFo rdx			// ... as address offset
+
+	// We'll need the index registers, which belong to the caller in this
+	// ABI.
+	push	rsi
+	push	rdi
+
+	// Move arguments to more useful places.
+	mov	SI, r8			// key material
+	mov	CTX, rcx		// context base pointer
+#endif
+
+	// The initial round key material is taken directly from the input
+	// key, so copy it over.
+#if CPUFAM_AMD64 && ABI_SYSV
+	// We've been lucky.  We already have a copy of the context pointer
+	// in rdi, and the key size in ecx.
+	add	DI, w
+#else
+	lea	DI, [CTX + w]
+	mov	ecx, KSZ
+#endif
+	rep	movsd
+
+	// Find out other useful things.
+	mov	NKW, [CTX + nr]		// number of rounds
+	add	NKW, 1
+	imul	NKW, BLKSZ		// total key size in words
+#if !NKW_NEEDS_REFRESH
+	// If we can't keep NKW for later, then we use the same register for
+	// it and LIM, so this move is unnecessary.
+	mov	LIMn, NKW
+#endif
+	sub	LIMn, KSZ		// offset by the key size
+
+	// Find the round constants.
+	ldgot	ecx
+	leaext	RCON, rijndael_rcon, ecx
+
+	// Prepare for the main loop.
+	lea	SI, [CTX + w]
+	mov	eax, [SI + 4*KSZo - 4]	// most recent key word
+	lea	LIM, [SI + 4*LIM]	// limit, offset by one key expansion
+
+	// Main key expansion loop.  The first word of each key-length chunk
+	// needs special treatment.
+	//
+	// This is rather tedious because the Intel `AESKEYGENASSIST'
+	// instruction is very strangely shaped.  Firstly, it wants to
+	// operate on vast SSE registers, even though we're data-blocked from
+	// doing more than one operation at a time unless we're doing two key
+	// schedules simultaneously -- and even then we can't do more than
+	// two, because the instruction ignores two of its input words
+	// entirely, and produces two different outputs for each of the other
+	// two.  And secondly it insists on taking the magic round constant
+	// as an immediate, so it's kind of annoying if you're not
+	// open-coding the whole thing.  It's much easier to leave that as
+	// zero and XOR in the round constant by hand.
+9:	movd	xmm0, eax
+	pshufd	xmm0, xmm0, ROTR
+	aeskeygenassist xmm1, xmm0, 0
+	pshufd	xmm1, xmm1, ROTL
+	movd	eax, xmm1
+	xor	eax, [SI]
+	xor	al, [RCON]
+	inc	RCON
+	mov	[SI + 4*KSZo], eax
+	add	SI, 4
+	cmp	SI, LIM
+	jae	8f
+
+	// The next three words are simple...
+	xor	eax, [SI]
+	mov	[SI + 4*KSZo], eax
+	add	SI, 4
+	cmp	SI, LIM
+	jae	8f
+
+	// (Word 2...)
+	xor	eax, [SI]
+	mov	[SI + 4*KSZo], eax
+	add	SI, 4
+	cmp	SI, LIM
+	jae	8f
+
+	// (Word 3...)
+	xor	eax, [SI]
+	mov	[SI + 4*KSZo], eax
+	add	SI, 4
+	cmp	SI, LIM
+	jae	8f
+
+	// Word 4.  If the key is /more/ than 6 words long, then we must
+	// apply a substitution here.
+	cmp	KSZ, 5
+	jb	9b
+	cmp	KSZ, 7
+	jb	0f
+	movd	xmm0, eax
+	pshufd	xmm0, xmm0, ROTL
+	aeskeygenassist xmm1, xmm0, 0
+	movd	eax, xmm1
+0:	xor	eax, [SI]
+	mov	[SI + 4*KSZo], eax
+	add	SI, 4
+	cmp	SI, LIM
+	jae	8f
+
+	// (Word 5...)
+	cmp	KSZ, 6
+	jb	9b
+	xor	eax, [SI]
+	mov	[SI + 4*KSZo], eax
+	add	SI, 4
+	cmp	SI, LIM
+	jae	8f
+
+	// (Word 6...)
+	cmp	KSZ, 7
+	jb	9b
+	xor	eax, [SI]
+	mov	[SI + 4*KSZo], eax
+	add	SI, 4
+	cmp	SI, LIM
+	jae	8f
+
+	// (Word 7...)
+	cmp	KSZ, 8
+	jb	9b
+	xor	eax, [SI]
+	mov	[SI + 4*KSZo], eax
+	add	SI, 4
+	cmp	SI, LIM
+	jae	8f
+
+	// Must be done by now.
+	jmp	9b
+
+	// Next job is to construct the decryption keys.  The keys for the
+	// first and last rounds don't need to be mangled, but the remaining
+	// ones do -- and they all need to be reordered too.
+	//
+	// The plan of action, then, is to copy the final encryption round's
+	// keys into place first, then to do each of the intermediate rounds
+	// in reverse order, and finally do the first round.
+	//
+	// Do all of the heavy lifting with SSE registers.  The order we're
+	// doing this in means that it's OK if we read or write too much, and
+	// there's easily enough buffer space for the over-enthusiastic reads
+	// and writes because the context has space for 32-byte blocks, which
+	// is our maximum and an exact fit for two SSE registers.
+8:	mov	NR, [CTX + nr]		// number of rounds
+#if NKW_NEEDS_REFRESH
+	mov	BLKOFF, BLKSZ
+	mov	LRK, NR
+	imul	LRK, BLKOFF
+#else
+	// If we retain NKW, then BLKSZ and BLKOFF are the same register
+	// because we won't need the former again.
+	mov	LRK, NKW
+	sub	LRK, BLKSZ
+#endif
+	lea	DI, [CTX + wi]
+	lea	SI, [CTX + w + 4*LRKo]	// last round's keys
+	shl	BLKOFF, 2		// block size (in bytes now)
+
+	// Copy the last encryption round's keys.
+	movdqu	xmm0, [SI]
+	movdqu	[DI], xmm0
+	cmp	BLKOFF, 16
+	jbe	9f
+	movdqu	xmm0, [SI + 16]
+	movdqu	[DI + 16], xmm0
+
+	// Update the loop variables and stop if we've finished.
+9:	add	DI, BLKOFFo
+	sub	SI, BLKOFFo
+	sub	NR, 1
+	jbe	0f
+
+	// Do another middle round's keys...
+	movdqu	xmm0, [SI]
+	aesimc	xmm0, xmm0
+	movdqu	[DI], xmm0
+	cmp	BLKOFF, 16
+	jbe	9b
+	movdqu	xmm0, [SI + 16]
+	aesimc	xmm0, xmm0
+	movdqu	[DI + 16], xmm0
+	jmp	9b
+
+	// Finally do the first encryption round.
+0:	movdqu	xmm0, [SI]
+	movdqu	[DI], xmm0
+	cmp	BLKOFF, 16
+	jbe	0f
+	movdqu	xmm0, [SI + 16]
+	movdqu	[DI + 16], xmm0
+
+	// If the block size is not exactly four words then we must end-swap
+	// everything.  We can use fancy SSE toys for this.
+0:	cmp	BLKOFF, 16
+	je	0f
+
+	// Find the byte-reordering table.
+	ldgot	ecx
+	movdqa	xmm5, [INTADDR(endswap_tab, ecx)]
+
+#if NKW_NEEDS_REFRESH
+	// Calculate the number of subkey words again.  (It's a good job
+	// we've got a fast multiplier.)
+	mov	NKW, [CTX + nr]
+	add	NKW, 1
+	imul	NKW, BLKSZ
+#endif
+
+	// End-swap the encryption keys.
+	mov	ecx, NKW
+	lea	SI, [CTX + w]
+	call	endswap_block
+
+	// And the decryption keys.
+	mov	ecx, NKW
+	lea	SI, [CTX + wi]
+	call	endswap_block
+
+0:	// All done.
+#if CPUFAM_X86
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+	pop	rdi
+	pop	rsi
+#endif
+	ret
+
+	.align	16
+endswap_block:
+	// End-swap ECX words starting at SI.  The end-swapping table is
+	// already loaded into XMM5; and it's OK to work in 16-byte chunks.
+	movdqu	xmm1, [SI]
+	pshufb	xmm1, xmm5
+	movdqu	[SI], xmm1
+	add	SI, 16
+	sub	ecx, 4
+	ja	endswap_block
+	ret
+
+#undef CTX
+#undef BLKSZ
+#undef SI
+#undef DI
+#undef KSZ
+#undef KSZo
+#undef RCON
+#undef LIMn
+#undef LIM
+#undef NR
+#undef LRK
+#undef LRKo
+#undef BLKOFF
+#undef BLKOFFo
+
+ENDFUNC
+
+///--------------------------------------------------------------------------
+/// Encrypting and decrypting blocks.
+
+	.macro	encdec	op, aes, koff
+FUNC(rijndael_\op\()_x86ish_aesni)
+
+	// Find the magic endianness-swapping table.
+	ldgot	ecx
+	movdqa	xmm5, [INTADDR(endswap_tab, ecx)]
+
+#if CPUFAM_X86
+	// Arguments come in on the stack, and need to be collected.  We
+	// don't have a shortage of registers.
+
+# define K ecx
+# define SRC edx
+# define DST edx
+# define NR eax
+
+	mov	K, [esp + 4]
+	mov	SRC, [esp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+	// Arguments come in registers.  All is good.
+
+# define K rdi
+# define SRC rsi
+# define DST rdx
+# define NR eax
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+	// Arguments come in different registers.
+
+# define K rcx
+# define SRC rdx
+# define DST r8
+# define NR eax
+#endif
+
+	// Initial setup.
+	movdqu	xmm0, [SRC]
+	pshufb	xmm0, xmm5
+	mov	NR, [K + nr]
+	add	K, \koff
+
+	// Initial whitening.
+	movdqu	xmm1, [K]
+	add	K, 16
+	pxor	xmm0, xmm1
+
+	// Dispatch to the correct code.
+	cmp	NR, 10
+	je	10f
+	jb	bogus
+	cmp	NR, 14
+	je	14f
+	ja	bogus
+	cmp	NR, 12
+	je	12f
+	jb	11f
+	jmp	13f
+
+	.align	2
+
+	// 14 rounds...
+14:	movdqu	xmm1, [K]
+	add	K, 16
+	\aes	xmm0, xmm1
+
+	// 13 rounds...
+13:	movdqu	xmm1, [K]
+	add	K, 16
+	\aes	xmm0, xmm1
+
+	// 12 rounds...
+12:	movdqu	xmm1, [K]
+	add	K, 16
+	\aes	xmm0, xmm1
+
+	// 11 rounds...
+11:	movdqu	xmm1, [K]
+	add	K, 16
+	\aes	xmm0, xmm1
+
+	// 10 rounds...
+10:	movdqu	xmm1, [K]
+	\aes	xmm0, xmm1
+
+	// 9 rounds...
+	movdqu	xmm1, [K + 16]
+	\aes	xmm0, xmm1
+
+	// 8 rounds...
+	movdqu	xmm1, [K + 32]
+	\aes	xmm0, xmm1
+
+	// 7 rounds...
+	movdqu	xmm1, [K + 48]
+	\aes	xmm0, xmm1
+
+	// 6 rounds...
+	movdqu	xmm1, [K + 64]
+	\aes	xmm0, xmm1
+
+	// 5 rounds...
+	movdqu	xmm1, [K + 80]
+	\aes	xmm0, xmm1
+
+	// 4 rounds...
+	movdqu	xmm1, [K + 96]
+	\aes	xmm0, xmm1
+
+	// 3 rounds...
+	movdqu	xmm1, [K + 112]
+	\aes	xmm0, xmm1
+
+	// 2 rounds...
+	movdqu	xmm1, [K + 128]
+	\aes	xmm0, xmm1
+
+	// Final round...
+	movdqu	xmm1, [K + 144]
+	\aes\()last xmm0, xmm1
+
+	// Unpermute the ciphertext block and store it.
+	pshufb	xmm0, xmm5
+#if CPUFAM_X86
+	mov	DST, [esp + 12]
+#endif
+	movdqu	[DST], xmm0
+
+	// And we're done.
+	ret
+
+#undef K
+#undef SRC
+#undef DST
+#undef NR
+
+ENDFUNC
+	.endm
+
+	encdec	eblk, aesenc, w
+	encdec	dblk, aesdec, wi
+
+///--------------------------------------------------------------------------
+/// Random utilities.
+
+	.align	16
+	// Abort the process because of a programming error.  Indirecting
Indirecting + // through this point serves several purposes: (a) by CALLing, rather + // than branching to, `abort', we can save the return address, which + // might at least provide a hint as to what went wrong; (b) we don't + // have conditional CALLs (and they'd be big anyway); and (c) we can + // write a HLT here as a backstop against `abort' being mad. +bogus: callext F(abort) +0: hlt + jmp 0b + + gotaux ecx + +///-------------------------------------------------------------------------- +/// Data tables. + + .align 16 +endswap_tab: + .byte 3, 2, 1, 0 + .byte 7, 6, 5, 4 + .byte 11, 10, 9, 8 + .byte 15, 14, 13, 12 + +///----- That's all, folks -------------------------------------------------- diff --git a/symm/rijndael.c b/symm/rijndael.c index 9d8e739f..293f28da 100644 --- a/symm/rijndael.c +++ b/symm/rijndael.c @@ -27,12 +27,15 @@ /*----- Header files ------------------------------------------------------*/ +#include "config.h" + #include #include #include #include "blkc.h" +#include "dispatch.h" #include "gcipher.h" #include "rijndael.h" #include "rijndael-base.h" @@ -69,6 +72,39 @@ void rijndael_init(rijndael_ctx *k, const void *buf, size_t sz) * Use: Low-level block encryption and decryption. */ +CPU_DISPATCH(EMPTY, EMPTY, void, rijndael_eblk, (const rijndael_ctx *k, + const uint32 s[4], + uint32 d[4]), + (k, s, d), pick_eblk, simple_eblk) + +CPU_DISPATCH(EMPTY, EMPTY, void, rijndael_dblk, (const rijndael_ctx *k, + const uint32 s[4], + uint32 d[4]), + (k, s, d), pick_dblk, simple_dblk) + +#if CPUFAM_X86 || CPUFAM_AMD64 +extern rijndael_eblk__functype rijndael_eblk_x86ish_aesni; +extern rijndael_dblk__functype rijndael_dblk_x86ish_aesni; +#endif + +static rijndael_eblk__functype *pick_eblk(void) +{ +#if CPUFAM_X86 || CPUFAM_AMD64 + DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_x86ish_aesni, + cpu_feature_p(CPUFEAT_X86_AESNI)); +#endif + DISPATCH_PICK_FALLBACK(rijndael_eblk, simple_eblk); +} + +static rijndael_dblk__functype *pick_dblk(void) +{ +#if CPUFAM_X86 || CPUFAM_AMD64 + DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_x86ish_aesni, + cpu_feature_p(CPUFEAT_X86_AESNI)); +#endif + DISPATCH_PICK_FALLBACK(rijndael_dblk, simple_dblk); +} + #define DO(what, t, aa, bb, cc, dd, a, b, c, d, w) do { \ aa = what(t, a, b, c, d) ^ *w++; \ bb = what(t, b, c, d, a) ^ *w++; \ @@ -83,7 +119,7 @@ void rijndael_init(rijndael_ctx *k, const void *buf, size_t sz) dd = what(t, d, c, b, a) ^ *w++; \ } while (0) -void rijndael_eblk(const rijndael_ctx *k, const uint32 *s, uint32 *dst) +static void simple_eblk(const rijndael_ctx *k, const uint32 *s, uint32 *dst) { uint32 a = s[0], b = s[1], c = s[2], d = s[3]; uint32 aa, bb, cc, dd; @@ -118,7 +154,7 @@ void rijndael_eblk(const rijndael_ctx *k, const uint32 *s, uint32 *dst) dst[0] = a; dst[1] = b; dst[2] = c; dst[3] = d; } -void rijndael_dblk(const rijndael_ctx *k, const uint32 *s, uint32 *dst) +static void simple_dblk(const rijndael_ctx *k, const uint32 *s, uint32 *dst) { uint32 a = s[0], b = s[1], c = s[2], d = s[3]; uint32 aa, bb, cc, dd; diff --git a/symm/salsa20-core.h b/symm/salsa20-core.h index 98efa72e..b27f2220 100644 --- a/symm/salsa20-core.h +++ b/symm/salsa20-core.h @@ -43,6 +43,28 @@ /*----- The Salsa20 core function -----------------------------------------*/ +/* It makes life somewhat easier if we don't actually store and maintain the + * input matrix in the textbook order. Instead, we rotate the columns other + * than the leftmost one upwards, so that the constants which were originally + * along the diagonal end up on the top row. 
We'll need to undo this
+ * permutation on output, but that's not too terrible an imposition.
+ *
+ * The permutation we're applying to the matrix elements is this:
+ *
+ *	[  0  1  2  3 ]		[  0  5 10 15 ]
+ *	[  4  5  6  7 ]	 -->	[  4  9 14  3 ]
+ *	[  8  9 10 11 ]		[  8 13  2  7 ]
+ *	[ 12 13 14 15 ]		[ 12  1  6 11 ]
+ *
+ * and as a result, we need to apply this inverse permutation to figure out
+ * which indices to use in the double-round function and elsewhere.
+ *
+ *	[  0 13 10  7 ]
+ *	[  4  1 14 11 ]
+ *	[  8  5  2 15 ]
+ *	[ 12  9  6  3 ]
+ */
+
 /* The Salsa20 quarter-round.  Read from the matrix @y@ at indices @a@, @b@,
  * @c@, and @d@; and write to the corresponding elements of @z@.
  */
@@ -58,22 +80,31 @@
  */
 #define SALSA20_DR(z, y) do {						\
   SALSA20_QR(z, y,  0,  4,  8, 12);					\
-  SALSA20_QR(z, y,  5,  9, 13,  1);					\
-  SALSA20_QR(z, y, 10, 14,  2,  6);					\
-  SALSA20_QR(z, y, 15,  3,  7, 11);					\
-  SALSA20_QR(z, z,  0,  1,  2,  3);					\
-  SALSA20_QR(z, z,  5,  6,  7,  4);					\
-  SALSA20_QR(z, z, 10, 11,  8,  9);					\
-  SALSA20_QR(z, z, 15, 12, 13, 14);					\
+  SALSA20_QR(z, y,  1,  5,  9, 13);					\
+  SALSA20_QR(z, y,  2,  6, 10, 14);					\
+  SALSA20_QR(z, y,  3,  7, 11, 15);					\
+  SALSA20_QR(z, z,  0, 13, 10,  7);					\
+  SALSA20_QR(z, z,  1, 14, 11,  4);					\
+  SALSA20_QR(z, z,  2, 15,  8,  5);					\
+  SALSA20_QR(z, z,  3, 12,  9,  6);					\
 } while (0)
 
 /* The Salsa20 feedforward step, used at the end of the core function.  Here,
  * @y@ contains the original input matrix; @z@ contains the final one, and is
- * updated.
+ * updated.  The result is rendered in canonical order, ready for output.
  */
 #define SALSA20_FFWD(z, y) do {						\
-  int _i;								\
-  for (_i = 0; _i < 16; _i++) (z)[_i] += (y)[_i];			\
+  const uint32 *_y = (y);						\
+  uint32 *_z = (z);							\
+  int _t;								\
+  _z[ 0] = _z[ 0] + _y[ 0]; _z[ 4] = _z[ 4] + _y[ 4];			\
+  _z[ 8] = _z[ 8] + _y[ 8]; _z[12] = _z[12] + _y[12];			\
+  _t = _z[ 1] + _y[ 1]; _z[ 1] = _z[13] + _y[13];			\
+  _z[13] = _z[ 9] + _y[ 9]; _z[ 9] = _z[ 5] + _y[ 5]; _z[ 5] = _t;	\
+  _t = _z[ 2] + _y[ 2]; _z[ 2] = _z[10] + _y[10]; _z[10] = _t;		\
+  _t = _z[ 6] + _y[ 6]; _z[ 6] = _z[14] + _y[14]; _z[14] = _t;		\
+  _t = _z[ 3] + _y[ 3]; _z[ 3] = _z[ 7] + _y[ 7];			\
+  _z[ 7] = _z[11] + _y[11]; _z[11] = _z[15] + _y[15]; _z[15] = _t;	\
 } while (0)
 
 /* Various numbers of rounds, unrolled.  Read from @y@, and write to @z@. */
@@ -98,7 +129,7 @@
 
 /* Step the counter in the Salsa20 state matrix @a@. */
 #define SALSA20_STEP(a)							\
-  do { (a)[8] = U32((a)[8] + 1); (a)[9] += !(a)[8]; } while (0)
+  do { (a)[8] = U32((a)[8] + 1); (a)[5] += !(a)[8]; } while (0)
 
 /*----- Buffering and output ----------------------------------------------*
  *
diff --git a/symm/salsa20-x86ish-sse2.S b/symm/salsa20-x86ish-sse2.S
new file mode 100644
index 00000000..a168d79a
--- /dev/null
+++ b/symm/salsa20-x86ish-sse2.S
@@ -0,0 +1,331 @@
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// Fancy SIMD implementation of Salsa20
+///
+/// (c) 2015 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// Local utilities.
+
+// Magic constants for shuffling.
+#define ROTL 0x93
+#define ROT2 0x4e
+#define ROTR 0x39
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+	.arch pentium4
+	.section .text
+
+FUNC(salsa20_core_x86ish_sse2)
+
+	// Initial setup.
+
+#if CPUFAM_X86
+	// Arguments come in on the stack, and will need to be collected.  We
+	// can get away with just the scratch registers for integer work, but
+	// we'll run out of XMM registers and will need some properly aligned
+	// space which we'll steal from the stack.  I don't trust the stack
+	// pointer's alignment, so I'll have to mask the stack pointer, which
+	// in turn means I'll need to keep track of the old value.  Hence I'm
+	// making a full i386-style stack frame here.
+	//
+	// The Windows and SysV ABIs are sufficiently similar that we don't
+	// need to worry about the differences here.
+
+# define NR ecx
+# define IN eax
+# define OUT edx
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 [esp + 0]
+# define SAVE3 [esp + 16]
+
+	push	ebp
+	mov	ebp, esp
+	sub	esp, 32
+	mov	IN, [ebp + 12]
+	mov	OUT, [ebp + 16]
+	and	esp, ~15
+	mov	NR, [ebp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+	// This is nice.  We have plenty of XMM registers, and the arguments
+	// are in useful places.  There's no need to spill anything and we
+	// can just get on with the code.
+
+# define NR edi
+# define IN rsi
+# define OUT rdx
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 xmm8
+# define SAVE3 xmm9
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+	// Arguments come in registers, but they're different between Windows
+	// and everyone else (and everyone else is saner).
+	//
+	// The Windows ABI insists that we preserve some of the XMM
+	// registers, but we want more than we can use as scratch space.  Two
+	// of them we need only for a copy of the input for the feedforward
+	// at the end; but the other two we want for the final permutation,
+	// so save the old values on the stack.  (We need an extra 8 bytes to
+	// align the stack.)
+
+# define NR ecx
+# define IN rdx
+# define OUT r8
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 [rsp + 32]
+# define SAVE3 [rsp + 48]
+
+	sub	rsp, 64 + 8
+	movdqa	[rsp +  0], xmm6
+	movdqa	[rsp + 16], xmm7
+#endif
+
+	// First job is to slurp the matrix into XMM registers.  The words
+	// have already been permuted conveniently to make them line up
+	// better for SIMD processing.
+	//
+	// The textbook arrangement of the matrix is this.
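+	//
+	// (In these diagrams, C stands for a constant word, K for a key
+	// word, N for a nonce word, and T for a word of the 64-bit block
+	// counter.)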
+	//
+	//	[C K K K]
+	//	[K C N N]
+	//	[T T C K]
+	//	[K K K C]
+	//
+	// But we've rotated the columns up so that the main diagonal with
+	// the constants on it ends up in the first row, giving something
+	// more like
+	//
+	//	[C C C C]
+	//	[K T K K]
+	//	[T K K N]
+	//	[K K N K]
+	//
+	// so the transformation looks like this:
+	//
+	//	[ 0  1  2  3]		[ 0  5 10 15] (a, xmm0)
+	//	[ 4  5  6  7]	-->	[ 4  9 14  3] (b, xmm1)
+	//	[ 8  9 10 11]		[ 8 13  2  7] (c, xmm2)
+	//	[12 13 14 15]		[12  1  6 11] (d, xmm3)
+	movdqu	xmm0, [IN +  0]
+	movdqu	xmm1, [IN + 16]
+	movdqu	xmm2, [IN + 32]
+	movdqu	xmm3, [IN + 48]
+
+	// Take a copy for later.
+	movdqa	SAVE0, xmm0
+	movdqa	SAVE1, xmm1
+	movdqa	SAVE2, xmm2
+	movdqa	SAVE3, xmm3
+
+0:
+	// Apply a column quarterround to each of the columns simultaneously.
+	// Alas, there doesn't seem to be a packed doubleword rotate, so we
+	// have to synthesize it.
+
+	// b ^= (a + d) <<< 7
+	movdqa	xmm4, xmm0
+	paddd	xmm4, xmm3
+	movdqa	xmm5, xmm4
+	pslld	xmm4, 7
+	psrld	xmm5, 25
+	por	xmm4, xmm5
+	pxor	xmm1, xmm4
+
+	// c ^= (b + a) <<< 9
+	movdqa	xmm4, xmm1
+	paddd	xmm4, xmm0
+	movdqa	xmm5, xmm4
+	pslld	xmm4, 9
+	psrld	xmm5, 23
+	por	xmm4, xmm5
+	pxor	xmm2, xmm4
+
+	// d ^= (c + b) <<< 13
+	movdqa	xmm4, xmm2
+	paddd	xmm4, xmm1
+	pshufd	xmm1, xmm1, ROTL
+	movdqa	xmm5, xmm4
+	pslld	xmm4, 13
+	psrld	xmm5, 19
+	por	xmm4, xmm5
+	pxor	xmm3, xmm4
+
+	// a ^= (d + c) <<< 18
+	movdqa	xmm4, xmm3
+	pshufd	xmm3, xmm3, ROTR
+	paddd	xmm4, xmm2
+	pshufd	xmm2, xmm2, ROT2
+	movdqa	xmm5, xmm4
+	pslld	xmm4, 18
+	psrld	xmm5, 14
+	por	xmm4, xmm5
+	pxor	xmm0, xmm4
+
+	// The transpose conveniently only involves reordering elements of
+	// individual rows, which can be done quite easily, and reordering
+	// the rows themselves, which is a trivial renaming.  It doesn't
+	// involve any movement of elements between rows.
+	//
+	//	[ 0  5 10 15]		[ 0  5 10 15] (a, xmm0)
+	//	[ 4  9 14  3]	-->	[ 1  6 11 12] (b, xmm3)
+	//	[ 8 13  2  7]		[ 2  7  8 13] (c, xmm2)
+	//	[12  1  6 11]		[ 3  4  9 14] (d, xmm1)
+	//
+	// The shuffles have quite high latency, so they've been pushed
+	// backwards into the main instruction list.
+
+	// Apply the row quarterround to each of the columns (yes!)
+	// simultaneously.
+
+	// b ^= (a + d) <<< 7
+	movdqa	xmm4, xmm0
+	paddd	xmm4, xmm1
+	movdqa	xmm5, xmm4
+	pslld	xmm4, 7
+	psrld	xmm5, 25
+	por	xmm4, xmm5
+	pxor	xmm3, xmm4
+
+	// c ^= (b + a) <<< 9
+	movdqa	xmm4, xmm3
+	paddd	xmm4, xmm0
+	movdqa	xmm5, xmm4
+	pslld	xmm4, 9
+	psrld	xmm5, 23
+	por	xmm4, xmm5
+	pxor	xmm2, xmm4
+
+	// d ^= (c + b) <<< 13
+	movdqa	xmm4, xmm2
+	paddd	xmm4, xmm3
+	pshufd	xmm3, xmm3, ROTL
+	movdqa	xmm5, xmm4
+	pslld	xmm4, 13
+	psrld	xmm5, 19
+	por	xmm4, xmm5
+	pxor	xmm1, xmm4
+
+	// a ^= (d + c) <<< 18
+	movdqa	xmm4, xmm1
+	pshufd	xmm1, xmm1, ROTR
+	paddd	xmm4, xmm2
+	pshufd	xmm2, xmm2, ROT2
+	movdqa	xmm5, xmm4
+	pslld	xmm4, 18
+	psrld	xmm5, 14
+	por	xmm4, xmm5
+	pxor	xmm0, xmm4
+
+	// We had to undo the transpose ready for the next loop.  Again, push
+	// back the shuffles because they take a long time coming through.
+	// Decrement the loop counter and see if we should go round again.
+	// Later processors fuse this pair into a single uop.
+	sub	NR, 2
+	ja	0b
+
+	// Almost there.  Firstly, the feedforward addition, and then we have
+	// to write out the result.  Here we have to undo the permutation
+	// which was already applied to the input.  Shuffling has quite high
+	// latency, so arrange to start a new shuffle into a temporary as
+	// soon as we've written out the old value.
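+	//
+	// (In intrinsics terms, each four-instruction group below is,
+	// roughly,
+	//
+	//	xmm0 = _mm_add_epi32(xmm0, save0);	// feedforward
+	//	xmm4 = _mm_shuffle_epi32(xmm0, ROTR);	// start next shuffle
+	//	*(uint32 *)(out + 0) = _mm_cvtsi128_si32(xmm0);
+	//
+	// with the stores of the rotated copies interleaved afterwards, once
+	// their shuffles have completed.)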
+	paddd	xmm0, SAVE0
+	pshufd	xmm4, xmm0, ROTR
+	movd	[OUT +  0], xmm0
+
+	paddd	xmm1, SAVE1
+	pshufd	xmm5, xmm1, ROTL
+	movd	[OUT + 16], xmm1
+
+	paddd	xmm2, SAVE2
+	pshufd	xmm6, xmm2, ROT2
+	movd	[OUT + 32], xmm2
+
+	paddd	xmm3, SAVE3
+	pshufd	xmm7, xmm3, ROTR
+	movd	[OUT + 48], xmm3
+
+	movd	[OUT +  4], xmm7
+	pshufd	xmm7, xmm3, ROT2
+	movd	[OUT + 24], xmm7
+	pshufd	xmm3, xmm3, ROTL
+	movd	[OUT + 44], xmm3
+
+	movd	[OUT +  8], xmm6
+	pshufd	xmm6, xmm2, ROTL
+	movd	[OUT + 28], xmm6
+	pshufd	xmm2, xmm2, ROTR
+	movd	[OUT + 52], xmm2
+
+	movd	[OUT + 12], xmm5
+	pshufd	xmm5, xmm1, ROTR
+	movd	[OUT + 36], xmm5
+	pshufd	xmm1, xmm1, ROT2
+	movd	[OUT + 56], xmm1
+
+	movd	[OUT + 20], xmm4
+	pshufd	xmm4, xmm0, ROT2
+	movd	[OUT + 40], xmm4
+	pshufd	xmm0, xmm0, ROTL
+	movd	[OUT + 60], xmm0
+
+	// Tidy things up.
+
+#if CPUFAM_X86
+	mov	esp, ebp
+	pop	ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+	movdqa	xmm6, [rsp +  0]
+	movdqa	xmm7, [rsp + 16]
+	add	rsp, 64 + 8
+#endif
+
+	// And with that, we're done.
+	ret
+
+#undef NR
+#undef IN
+#undef OUT
+#undef SAVE0
+#undef SAVE1
+#undef SAVE2
+#undef SAVE3
+
+ENDFUNC
+
+///----- That's all, folks --------------------------------------------------
diff --git a/symm/salsa20.c b/symm/salsa20.c
index 4b35cbd7..40f28fc0 100644
--- a/symm/salsa20.c
+++ b/symm/salsa20.c
@@ -7,11 +7,14 @@
 
 /*----- Header files ------------------------------------------------------*/
 
+#include "config.h"
+
 #include 
 #include 
 
 #include "arena.h"
+#include "dispatch.h"
 #include "gcipher.h"
 #include "grand.h"
 #include "keysz.h"
@@ -39,9 +42,29 @@ const octet salsa20_keysz[] = { KSZ_SET, 32, 16, 10, 0 };
  * the feedforward step.
  */
 
-static void core(unsigned r, const salsa20_matrix src, salsa20_matrix dest)
+CPU_DISPATCH(static, (void),
+	     void, core, (unsigned r, const salsa20_matrix src,
+			  salsa20_matrix dest),
+	     (r, src, dest),
+	     pick_core, simple_core);
+
+static void simple_core(unsigned r, const salsa20_matrix src,
+			salsa20_matrix dest)
 {
   SALSA20_nR(dest, src, r);
   SALSA20_FFWD(dest, src);
 }
 
+#if CPUFAM_X86 || CPUFAM_AMD64
+extern core__functype salsa20_core_x86ish_sse2;
+#endif
+
+static core__functype *pick_core(void)
+{
+#if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86ish_sse2,
+		     cpu_feature_p(CPUFEAT_X86_SSE2));
+#endif
+  DISPATCH_PICK_FALLBACK(salsa20_core, simple_core);
+}
+
 /* --- @populate@ --- *
  *
  * Arguments:	@salsa20_matrix a@ = a matrix to fill in
  *
@@ -61,33 +84,42 @@
 
   KSZ_ASSERT(salsa20, ksz);
 
-  a[ 1] = LOAD32_L(k +  0);
-  a[ 2] = LOAD32_L(k +  4);
+  /* Here's the pattern of key, constant, nonce, and counter pieces in the
+   * matrix, before and after our permutation.
+   *
+   *	[ C0 K0 K1 K2 ]		[ C0 C1 C2 C3 ]
+   *	[ K3 C1 N0 N1 ]	 -->	[ K3 T1 K7 K2 ]
+   *	[ T0 T1 C2 K4 ]		[ T0 K6 K1 N1 ]
+   *	[ K5 K6 K7 C3 ]		[ K5 K0 N0 K4 ]
+   */
+
+  a[13] = LOAD32_L(k +  0);
+  a[10] = LOAD32_L(k +  4);
   if (ksz == 10) {
-    a[ 3] = LOAD16_L(k + 8);
+    a[ 7] = LOAD16_L(k + 8);
     a[ 4] = 0;
   } else {
-    a[ 3] = LOAD32_L(k + 8);
+    a[ 7] = LOAD32_L(k + 8);
     a[ 4] = LOAD32_L(k + 12);
   }
   if (ksz <= 16) {
-    a[11] = a[ 1];
-    a[12] = a[ 2];
-    a[13] = a[ 3];
-    a[14] = a[ 4];
+    a[15] = a[13];
+    a[12] = a[10];
+    a[ 9] = a[ 7];
+    a[ 6] = a[ 4];
     a[ 0] = SALSA20_A128;
-    a[ 5] = SALSA20_B128;
-    a[10] = ksz == 10 ? SALSA20_C80 : SALSA20_C128;
-    a[15] = SALSA20_D128;
+    a[ 1] = SALSA20_B128;
+    a[ 2] = ksz == 10 ? 
SALSA20_C80 : SALSA20_C128; + a[ 3] = SALSA20_D128; } else { - a[11] = LOAD32_L(k + 16); + a[15] = LOAD32_L(k + 16); a[12] = LOAD32_L(k + 20); - a[13] = LOAD32_L(k + 24); - a[14] = LOAD32_L(k + 28); + a[ 9] = LOAD32_L(k + 24); + a[ 6] = LOAD32_L(k + 28); a[ 0] = SALSA20_A256; - a[ 5] = SALSA20_B256; - a[10] = SALSA20_C256; - a[15] = SALSA20_D256; + a[ 1] = SALSA20_B256; + a[ 2] = SALSA20_C256; + a[ 3] = SALSA20_D256; } } @@ -130,8 +162,8 @@ void salsa20_setnonce(salsa20_ctx *ctx, const void *nonce) { const octet *n = nonce; - ctx->a[6] = LOAD32_L(n + 0); - ctx->a[7] = LOAD32_L(n + 4); + ctx->a[14] = LOAD32_L(n + 0); + ctx->a[11] = LOAD32_L(n + 4); salsa20_seek(ctx, 0); } @@ -153,7 +185,7 @@ void salsa20_seek(salsa20_ctx *ctx, unsigned long i) void salsa20_seeku64(salsa20_ctx *ctx, kludge64 i) { - ctx->a[8] = LO64(i); ctx->a[9] = HI64(i); + ctx->a[8] = LO64(i); ctx->a[5] = HI64(i); ctx->bufi = SALSA20_OUTSZ; } @@ -169,7 +201,7 @@ unsigned long salsa20_tell(salsa20_ctx *ctx) { kludge64 i = salsa20_tellu64(ctx); return (GET64(unsigned long, i)); } kludge64 salsa20_tellu64(salsa20_ctx *ctx) - { kludge64 i; SET64(i, ctx->a[9], ctx->a[8]); return (i); } + { kludge64 i; SET64(i, ctx->a[5], ctx->a[8]); return (i); } /* --- @salsa20{,12,8}_encrypt@ --- * * @@ -272,10 +304,10 @@ SALSA20_VARS(DEFENCRYPT) * speed critical, so we do it the harder way. \ */ \ \ - for (i = 0; i < 4; i++) k[i + 6] = src[i]; \ + for (i = 0; i < 4; i++) k[14 - 3*i] = src[i]; \ core(r, k, a); \ - for (i = 0; i < 4; i++) dest[i] = a[5*i] - k[5*i]; \ - for (i = 4; i < 8; i++) dest[i] = a[i + 2] - k[i + 2]; \ + for (i = 0; i < 4; i++) dest[i] = a[5*i] - k[i]; \ + for (i = 4; i < 8; i++) dest[i] = a[i + 2] - k[26 - 3*i]; \ } \ \ void HSALSA20_PRF(r, salsa20_ctx *ctx, const void *src, void *dest) \ @@ -340,9 +372,9 @@ SALSA20_VARS(DEFHSALSA20) \ populate(ctx->k, key, ksz); \ ctx->s.a[ 0] = SALSA20_A256; \ - ctx->s.a[ 5] = SALSA20_B256; \ - ctx->s.a[10] = SALSA20_C256; \ - ctx->s.a[15] = SALSA20_D256; \ + ctx->s.a[ 1] = SALSA20_B256; \ + ctx->s.a[ 2] = SALSA20_C256; \ + ctx->s.a[ 3] = SALSA20_D256; \ XSALSA20_SETNONCE(r, ctx, nonce ? nonce : zerononce); \ } SALSA20_VARS(DEFXINIT) @@ -371,8 +403,8 @@ SALSA20_VARS(DEFXINIT) \ for (i = 0; i < 4; i++) in[i] = LOAD32_L(n + 4*i); \ HSALSA20_RAW(r, ctx->k, in, out); \ - for (i = 0; i < 4; i++) ctx->s.a[i + 1] = out[i]; \ - for (i = 4; i < 8; i++) ctx->s.a[i + 7] = out[i]; \ + for (i = 0; i < 4; i++) ctx->s.a[13 - 3*i] = out[i]; \ + for (i = 4; i < 8; i++) ctx->s.a[27 - 3*i] = out[i]; \ salsa20_setnonce(&ctx->s, n + 16); \ } SALSA20_VARS(DEFXNONCE) @@ -730,23 +762,31 @@ SALSA20_VARS(DEFXGRAND) #include #include +static const int perm[] = { + 0, 13, 10, 7, + 4, 1, 14, 11, + 8, 5, 2, 15, + 12, 9, 6, 3 +}; + #define DEFVCORE(r) \ static int v_core_##r(dstr *v) \ { \ salsa20_matrix a, b; \ dstr d = DSTR_INIT; \ - int i, n; \ + int i, j, n; \ int ok = 1; \ \ DENSURE(&d, SALSA20_OUTSZ); d.len = SALSA20_OUTSZ; \ n = *(int *)v[0].buf; \ for (i = 0; i < SALSA20_OUTSZ/4; i++) \ - a[i] = LOAD32_L(v[1].buf + 4*i); \ + b[i] = LOAD32_L(v[1].buf + 4*i); \ for (i = 0; i < n; i++) { \ + for (j = 0; j < 16; j++) a[perm[j]] = b[j]; \ core(r, a, b); \ memcpy(a, b, sizeof(a)); \ } \ - for (i = 0; i < SALSA20_OUTSZ/4; i++) STORE32_L(d.buf + 4*i, a[i]); \ + for (i = 0; i < SALSA20_OUTSZ/4; i++) STORE32_L(d.buf + 4*i, b[i]); \ \ if (d.len != v[2].len || memcmp(d.buf, v[2].buf, v[2].len) != 0) { \ ok = 0; \