From 9e6a4409d58d1ed9dfe2de3c6ffaee822e051c9f Mon Sep 17 00:00:00 2001 From: Mark Wooding Date: Tue, 13 Nov 2018 11:28:53 +0000 Subject: [PATCH] symm/gcm-*.S: GCM acceleration using hardware polynomial multiplication. Add assembler implementations of the low-level GCM arithmetic which make use of polynomial multiplication instructions on x86 (the delightfully named `pclmul{l,h}q{l,h}dq' instructions) and ARM processors (the ARM32 `vmull.p64' and ARM64 `pmull{,2}' instructions). Of course, this involves adding the necessary CPU feature detection. GCM's bit and byte order is remarkably confusing. I've tried quite hard to write the code so as to help the reader keep track of which bits are where, but it's very difficult. There's also a Python implementation which has proven invaluable while debugging these things. --- base/dispatch.c | 18 +- base/dispatch.h | 5 +- symm/Makefile.am | 12 + symm/gcm-arm-crypto.S | 718 +++++++++++++++++++++++++++++++ symm/gcm-arm64-pmull.S | 661 ++++++++++++++++++++++++++++ symm/gcm-x86ish-pclmul.S | 1073 ++++++++++++++++++++++++++++++++++++++++++++++ symm/gcm.c | 263 +++++++++++- utils/gcm-ref | 502 ++++++++++++++++++++++ 8 files changed, 3246 insertions(+), 6 deletions(-) create mode 100644 symm/gcm-arm-crypto.S create mode 100644 symm/gcm-arm64-pmull.S create mode 100644 symm/gcm-x86ish-pclmul.S create mode 100755 utils/gcm-ref diff --git a/base/dispatch.c b/base/dispatch.c index 9ba6a7cd..9f2ac71b 100644 --- a/base/dispatch.c +++ b/base/dispatch.c @@ -46,6 +46,8 @@ # define EFLAGS_ID (1u << 21) # define CPUID1D_SSE2 (1u << 26) # define CPUID1D_FXSR (1u << 24) +# define CPUID1C_PCLMUL (1u << 1) +# define CPUID1C_SSSE3 (1u << 9) # define CPUID1C_AESNI (1u << 25) # define CPUID1C_AVX (1u << 28) # define CPUID1C_RDRAND (1u << 30) @@ -282,13 +284,15 @@ static unsigned hwcaps = 0; _(ARM_NEON, "arm:neon") \ _(ARM_V4, "arm:v4") \ _(ARM_D32, "arm:d32") \ - _(ARM_AES, "arm:aes") + _(ARM_AES, "arm:aes") \ + _(ARM_PMULL, "arm:pmull") #endif #if CPUFAM_ARM64 # define WANTAUX(_) \ WANT_AT_HWCAP(_) # define CAPMAP(_) \ - _(ARM_AES, "arm:aes") + _(ARM_AES, "arm:aes") \ + _(ARM_PMULL, "arm:pmull") #endif /* Build the bitmask for `hwcaps' from the `CAPMAP' list. */ @@ -402,9 +406,13 @@ static void probe_hwcaps(void) # ifdef HWCAP2_AES if (probed.hwcap2 & HWCAP2_AES) hw |= HF_ARM_AES; # endif +# ifdef HWCAP2_PMULL + if (probed.hwcap2 & HWCAP2_PMULL) hw |= HF_ARM_PMULL; +# endif #endif #if CPUFAM_ARM64 if (probed.hwcap & HWCAP_AES) hw |= HF_ARM_AES; + if (probed.hwcap & HWCAP_PMULL) hw |= HF_ARM_PMULL; #endif /* Store the bitmask of features we probed for everyone to see. 
*/ @@ -549,6 +557,12 @@ int cpu_feature_p(int feat) CASE_CPUFEAT(X86_AVX, "x86:avx", xmm_registers_available_p() && cpuid_features_p(0, CPUID1C_AVX)); + CASE_CPUFEAT(X86_SSSE3, "x86:ssse3", + xmm_registers_available_p() && + cpuid_features_p(0, CPUID1C_SSSE3)); + CASE_CPUFEAT(X86_PCLMUL, "x86:pclmul", + xmm_registers_available_p() && + cpuid_features_p(0, CPUID1C_PCLMUL)); #endif #ifdef CAPMAP # define FEATP__CASE(feat, tok) \ diff --git a/base/dispatch.h b/base/dispatch.h index dae6a689..7c083821 100644 --- a/base/dispatch.h +++ b/base/dispatch.h @@ -182,7 +182,10 @@ enum { CPUFEAT_ARM_D32, /* 32 double registers, not 16 */ CPUFEAT_X86_RDRAND, /* Built-in entropy source */ CPUFEAT_ARM_AES, /* AES instructions */ - CPUFEAT_X86_AVX /* AVX 1 (i.e., 256-bit YMM regs) */ + CPUFEAT_X86_AVX, /* AVX 1 (i.e., 256-bit YMM regs) */ + CPUFEAT_X86_SSSE3, /* Supplementary SSE 3 */ + CPUFEAT_X86_PCLMUL, /* Carry-less multiplication */ + CPUFEAT_ARM_PMULL /* Polynomial multiplication */ }; extern int cpu_feature_p(int /*feat*/); diff --git a/symm/Makefile.am b/symm/Makefile.am index cb20f3a9..a4f45e9d 100644 --- a/symm/Makefile.am +++ b/symm/Makefile.am @@ -323,6 +323,18 @@ BLKCMACMODES += cmac pmac1 pkginclude_HEADERS += ocb.h BLKCAEADMODES += ccm eax gcm ocb1 ocb3 libsymm_la_SOURCES += ccm.c gcm.c ocb.c +if CPUFAM_X86 +libsymm_la_SOURCES += gcm-x86ish-pclmul.S +endif +if CPUFAM_AMD64 +libsymm_la_SOURCES += gcm-x86ish-pclmul.S +endif +if CPUFAM_ARMEL +libsymm_la_SOURCES += gcm-arm-crypto.S +endif +if CPUFAM_ARM64 +libsymm_la_SOURCES += gcm-arm64-pmull.S +endif TESTS += gcm.t$(EXEEXT) EXTRA_DIST += t/gcm diff --git a/symm/gcm-arm-crypto.S b/symm/gcm-arm-crypto.S new file mode 100644 index 00000000..ddc714b3 --- /dev/null +++ b/symm/gcm-arm-crypto.S @@ -0,0 +1,718 @@ +/// -*- mode: asm; asm-comment-char: ?/ -*- +/// +/// GCM acceleration for ARM processors +/// +/// (c) 2019 Straylight/Edgeware +/// + +///----- Licensing notice --------------------------------------------------- +/// +/// This file is part of Catacomb. +/// +/// Catacomb is free software: you can redistribute it and/or modify it +/// under the terms of the GNU Library General Public License as published +/// by the Free Software Foundation; either version 2 of the License, or +/// (at your option) any later version. +/// +/// Catacomb is distributed in the hope that it will be useful, but +/// WITHOUT ANY WARRANTY; without even the implied warranty of +/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +/// Library General Public License for more details. +/// +/// You should have received a copy of the GNU Library General Public +/// License along with Catacomb. If not, write to the Free Software +/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, +/// USA. + +///-------------------------------------------------------------------------- +/// Preliminaries. + +#include "config.h" +#include "asm-common.h" + + .arch armv8-a + .fpu crypto-neon-fp-armv8 + + .text + +///-------------------------------------------------------------------------- +/// Multiplication macros. + + // The good news is that we have a fancy instruction to do the + // multiplications. The bad news is that it's not particularly well- + // suited to the job. + // + // For one thing, it only does a 64-bit multiplication, so in general + // we'll need to synthesize the full-width multiply by hand. For + // another thing, it doesn't help with the reduction, so we have to + // do that by hand too. 
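As a concrete picture of those first two chores, here is a standalone sketch in plain Python (separate from the patch; the helper names are invented for the illustration): it stitches a 128-bit carry-less product together from 64-bit pieces and then reduces it by folding, working in ordinary degree order and ignoring GCM's reflected bit convention, which is the subject of the next point.

def clmul(a, b):
  # Carry-less product of two integers viewed as GF(2) polynomials:
  # shift-and-XOR, with no carries anywhere.
  z = 0
  while b:
    if b & 1: z ^= a
    a <<= 1; b >>= 1
  return z

def clmul128(u, v):
  # A 128x128 -> 256-bit product stitched together from four 64-bit
  # multiplications, roughly the shape of the work the pclmulqdq or
  # vmull.p64 code has to do in registers.
  M64 = (1 << 64) - 1
  u1, u0 = u >> 64, u & M64
  v1, v0 = v >> 64, v & M64
  lo, hi = clmul(u0, v0), clmul(u1, v1)
  mid = clmul(u0, v1) ^ clmul(u1, v0)
  return (hi << 128) ^ (mid << 64) ^ lo

def reduce128(x):
  # Reduce a 256-bit product modulo t^128 + t^7 + t^2 + t + 1 (the usual
  # 128-bit GCM polynomial) by folding the high half back down; two folds
  # suffice because the remainder polynomial 0x87 has such low degree.
  M128 = (1 << 128) - 1
  while x >> 128: x = (x & M128) ^ clmul(x >> 128, 0x87)
  return x

# The pieced-together product agrees with a direct carry-less multiply.
u = 0x0123456789abcdef02468ace13579bdf
v = 0x0fedcba987654321fdb97531eca86420
assert clmul128(u, v) == clmul(u, v)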
And, finally, GCM has crazy bit ordering, + // and the instruction does nothing useful for that at all. + // + // Focusing on that last problem first: the bits aren't in monotonic + // significance order unless we permute them. If we reverse the byte + // order, then we'll have the bits in monotonic order, but backwards, + // so the degree-0 coefficient will be in the most-significant bit. + // + // This is less of a difficulty than it seems at first, because + // algebra. Suppose we are given u = SUM_{0<=in*p->n; i++) ktab[i] = ENDSWAP32(ktab[i]); } +#if CPUFAM_X86 || CPUFAM_AMD64 +static void pclmul_mktable(const gcm_params *p, + uint32 *ktab, const uint32 *k) +{ + unsigned n = p->n; + unsigned nz; + uint32 *t; + + /* We just need to store the value in a way which is convenient for the + * assembler code to read back. That involves reordering the words, and, + * in the case of 96-bit blocks, padding with zeroes to fill out a 128-bit + * chunk. + */ + + if (n == 3) nz = 1; + else nz = 0; + t = ktab + n + nz; + + if (p->f&GCMF_SWAP) while (n--) { *--t = ENDSWAP32(*k); k++; } + else while (n--) *--t = *k++; + while (nz--) *--t = 0; +} +#endif + +#if CPUFAM_ARMEL +static void arm_crypto_mktable(const gcm_params *p, + uint32 *ktab, const uint32 *k) +{ + unsigned n = p->n; + uint32 *t; + + /* We just need to store the value in a way which is convenient for the + * assembler code to read back. That involves swapping the bytes in each + * 64-bit lane. + */ + + t = ktab; + if (p->f&GCMF_SWAP) { + while (n >= 2) { + t[1] = ENDSWAP32(k[0]); t[0] = ENDSWAP32(k[1]); + t += 2; k += 2; n -= 2; + } + if (n) { t[1] = ENDSWAP32(k[0]); t[0] = 0; } + } else { + while (n >= 2) { + t[1] = k[0]; t[0] = k[1]; + t += 2; k += 2; n -= 2; + } + if (n) { t[1] = k[0]; t[0] = 0; } + } +} +#endif + +#if CPUFAM_ARM64 +static uint32 rbit32(uint32 x) +{ + uint32 z, t; + +#if GCC_VERSION_P(4, 3) + /* Two tricks here. Firstly, two separate steps, rather than a single + * block of assembler, to allow finer-grained instruction scheduling. + * Secondly, use `ENDSWAP32' so that the compiler can cancel it if the + * caller actually wants the bytes reordered. + */ + __asm__("rbit %w0, %w1" : "=r"(t) : "r"(x)); + z = ENDSWAP32(t); +#else + /* A generic but slightly clever implementation. */ +# define SWIZZLE(x, m, s) ((((x)&(m)) << (s)) | (((x)&~(m)) >> (s))) + /* 76543210 */ + t = SWIZZLE(x, 0x0f0f0f0f, 4); /* 32107654 -- swap nibbles */ + t = SWIZZLE(t, 0x33333333, 2); /* 10325476 -- swap bit pairs */ + z = SWIZZLE(t, 0x55555555, 1); /* 01234567 -- swap adjacent bits */ +# undef SWIZZLE +#endif + return (z); +} + +static void arm64_pmull_mktable(const gcm_params *p, + uint32 *ktab, const uint32 *k) +{ + unsigned n = p->n; + uint32 *t; + + /* We just need to store the value in a way which is convenient for the + * assembler code to read back. That involves two transformations: + * + * * firstly, reversing the order of the bits in each byte; and, + * + * * secondly, storing two copies of each 64-bit chunk. + * + * Note that, in this case, we /want/ the little-endian byte order of GCM, + * so endianness-swapping happens in the big-endian case. 
+ */ + + t = ktab; + if (p->f&GCMF_SWAP) { + while (n >= 2) { + t[0] = t[2] = rbit32(k[0]); + t[1] = t[3] = rbit32(k[1]); + t += 4; k += 2; n -= 2; + } + if (n) { t[0] = t[2] = rbit32(k[0]); t[1] = t[3] = 0; } + } else { + while (n >= 2) { + t[0] = t[2] = ENDSWAP32(rbit32(k[0])); + t[1] = t[3] = ENDSWAP32(rbit32(k[1])); + t += 4; k += 2; n -= 2; + } + if (n) { t[0] = t[2] = ENDSWAP32(rbit32(k[0])); t[1] = t[3] = 0; } + } +} +#endif + CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable, (const gcm_params *p, uint32 *ktab, const uint32 *k), (p, ktab, k), @@ -241,6 +355,19 @@ CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable, static gcm_mktable__functype *pick_mktable(void) { +#if CPUFAM_X86 || CPUFAM_AMD64 + DISPATCH_PICK_COND(gcm_mktable, pclmul_mktable, + cpu_feature_p(CPUFEAT_X86_SSSE3) && + cpu_feature_p(CPUFEAT_X86_PCLMUL)); +#endif +#if CPUFAM_ARMEL + DISPATCH_PICK_COND(gcm_mktable, arm_crypto_mktable, + cpu_feature_p(CPUFEAT_ARM_PMULL)); +#endif +#if CPUFAM_ARM64 + DISPATCH_PICK_COND(gcm_mktable, arm64_pmull_mktable, + cpu_feature_p(CPUFEAT_ARM_PMULL)); +#endif DISPATCH_PICK_FALLBACK(gcm_mktable, simple_mktable); } @@ -271,13 +398,84 @@ static void simple_recover_k(const gcm_params *p, else for (i = 0; i < p->n; i++) k[i] = ENDSWAP32(ktab[24*p->n + i]); } +#if CPUFAM_X86 || CPUFAM_AMD64 +static void pclmul_recover_k(const gcm_params *p, + uint32 *k, const uint32 *ktab) +{ + unsigned n = p->n; + unsigned nz; + const uint32 *t; + + /* The representation is already independent of the blockcipher endianness. + * We need to compensate for padding, and reorder the words. + */ + + if (n == 3) nz = 1; else nz = 0; + t = ktab + n + nz; + while (n--) *k++ = *--t; +} +#endif + +#if CPUFAM_ARMEL +static void arm_crypto_recover_k(const gcm_params *p, + uint32 *k, const uint32 *ktab) +{ + unsigned n = p->n; + const uint32 *t; + + /* The representation is already independent of the blockcipher endianness. + * We only need to reorder the words. + */ + + t = ktab; + while (n >= 2) { k[1] = t[0]; k[0] = t[1]; t += 2; k += 2; n -= 2; } + if (n) k[0] = t[1]; +} +#endif + +#if CPUFAM_ARM64 +static void arm64_pmull_recover_k(const gcm_params *p, + uint32 *k, const uint32 *ktab) +{ + unsigned n = p->n; + const uint32 *t; + + /* The representation is already independent of the blockcipher endianness. + * We need to skip the duplicate pieces, and unscramble the bytes. + */ + + t = ktab; + while (n >= 2) { + k[0] = ENDSWAP32(rbit32(t[0])); + k[1] = ENDSWAP32(rbit32(t[1])); + t += 4; k += 2; n -= 2; + } + if (n) k[0] = ENDSWAP32(rbit32(t[0])); +} +#endif + CPU_DISPATCH(static, EMPTY, void, recover_k, (const gcm_params *p, uint32 *k, const uint32 *ktab), (p, k, ktab), pick_recover_k, simple_recover_k) static recover_k__functype *pick_recover_k(void) - { DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k); } +{ +#if CPUFAM_X86 || CPUFAM_AMD64 + DISPATCH_PICK_COND(recover_k, pclmul_recover_k, + cpu_feature_p(CPUFEAT_X86_SSSE3) && + cpu_feature_p(CPUFEAT_X86_PCLMUL)); +#endif +#if CPUFAM_ARMEL + DISPATCH_PICK_COND(recover_k, arm_crypto_recover_k, + cpu_feature_p(CPUFEAT_ARM_PMULL)); +#endif +#if CPUFAM_ARM64 + DISPATCH_PICK_COND(recover_k, arm64_pmull_recover_k, + cpu_feature_p(CPUFEAT_ARM_PMULL)); +#endif + DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k); +} /* --- @gcm_mulk_N{b,l}@ --- * * @@ -292,6 +490,48 @@ static recover_k__functype *pick_recover_k(void) * function whose performance actually matters. 
*/ +#if CPUFAM_X86 || CPUFAM_AMD64 +# define DECL_MULK_X86ISH(var) extern gcm_mulk_##var##__functype \ + gcm_mulk_##var##_x86ish_pclmul_avx, \ + gcm_mulk_##var##_x86ish_pclmul; +# define PICK_MULK_X86ISH(var) do { \ + DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul_avx, \ + cpu_feature_p(CPUFEAT_X86_AVX) && \ + cpu_feature_p(CPUFEAT_X86_PCLMUL) && \ + cpu_feature_p(CPUFEAT_X86_SSSE3)); \ + DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul, \ + cpu_feature_p(CPUFEAT_X86_PCLMUL) && \ + cpu_feature_p(CPUFEAT_X86_SSSE3)); \ +} while (0) +#else +# define DECL_MULK_X86ISH(var) +# define PICK_MULK_X86ISH(var) do ; while (0) +#endif + +#if CPUFAM_ARMEL +# define DECL_MULK_ARM(var) \ + extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm_crypto; +# define PICK_MULK_ARM(var) do { \ + DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm_crypto, \ + cpu_feature_p(CPUFEAT_ARM_PMULL)); \ +} while (0) +#else +# define DECL_MULK_ARM(var) +# define PICK_MULK_ARM(var) do ; while (0) +#endif + +#if CPUFAM_ARM64 +# define DECL_MULK_ARM64(var) \ + extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm64_pmull; +# define PICK_MULK_ARM64(var) do { \ + DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm64_pmull, \ + cpu_feature_p(CPUFEAT_ARM_PMULL)); \ +} while (0) +#else +# define DECL_MULK_ARM64(var) +# define PICK_MULK_ARM64(var) do ; while (0) +#endif + #define DEF_MULK(nbits) \ \ CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##b, \ @@ -321,10 +561,27 @@ static void simple_mulk_##nbits(uint32 *a, const uint32 *ktab) \ for (i = 0; i < nbits/32; i++) a[i] = z[i]; \ } \ \ +DECL_MULK_X86ISH(nbits##b) \ +DECL_MULK_ARM(nbits##b) \ +DECL_MULK_ARM64(nbits##b) \ static gcm_mulk_##nbits##b##__functype *pick_mulk_##nbits##b(void) \ - { DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits); } \ +{ \ + PICK_MULK_X86ISH(nbits##b); \ + PICK_MULK_ARM(nbits##b); \ + PICK_MULK_ARM64(nbits##b); \ + DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits); \ +} \ + \ +DECL_MULK_X86ISH(nbits##l) \ +DECL_MULK_ARM(nbits##l) \ +DECL_MULK_ARM64(nbits##l) \ static gcm_mulk_##nbits##l##__functype *pick_mulk_##nbits##l(void) \ - { DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits); } +{ \ + PICK_MULK_X86ISH(nbits##l); \ + PICK_MULK_ARM(nbits##l); \ + PICK_MULK_ARM64(nbits##l); \ + DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits); \ +} GCM_WIDTHS(DEF_MULK) diff --git a/utils/gcm-ref b/utils/gcm-ref new file mode 100755 index 00000000..ccbf4321 --- /dev/null +++ b/utils/gcm-ref @@ -0,0 +1,502 @@ +#! /usr/bin/python +### -*- coding: utf-8 -*- + +from sys import argv, exit + +import catacomb as C + +###-------------------------------------------------------------------------- +### Random utilities. 
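Before the individual helpers, a condensed standalone sketch of the one-entry-per-bit strategy that the portable `simple_mulk_N' code in `gcm.c' relies on and that `table_common' below mirrors. In the sketch, the multiply scans the whole table in order and masks each entry by the corresponding operand bit, so the access pattern does not depend on the data; it is plain Python with bits in ordinary degree order, and `P128' and the function names are choices made for the illustration only.

P128 = (1 << 128) | 0x87                # t^128 + t^7 + t^2 + t + 1

def mul_by_t(x, p = P128, nbits = 128):
  # Multiply the field element X by t, reducing once if it overflows.
  x <<= 1
  if (x >> nbits) & 1: x ^= p
  return x

def make_table(k, nbits = 128):
  # Precompute k t^i for 0 <= i < nbits: one table entry per operand bit.
  tab = []
  for i in range(nbits):
    tab.append(k)
    k = mul_by_t(k)
  return tab

def mulk(a, tab, nbits = 128):
  # a k = SUM_i a_i (k t^i): accumulate the precomputed multiples for the
  # set bits of A, reading every entry and masking, rather than branching
  # on the data.
  z = 0
  for i in range(nbits):
    z ^= tab[i] * ((a >> i) & 1)
  return z

def gf_mul(u, v, p = P128, nbits = 128):
  # Straightforward shift-and-add multiply, used only to check mulk.
  z = 0
  for i in reversed(range(nbits)):
    z = mul_by_t(z) ^ (u * ((v >> i) & 1))
  return z

k = 0x2b7e151628aed2a6abf7158809cf4f3c
a = 0x3243f6a8885a308d313198a2e0370734
assert mulk(a, make_table(k)) == gf_mul(a, k)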
+ +def words(s): + """Split S into 32-bit pieces and report their values as hex.""" + return ' '.join('%08x' % C.MP.loadb(s[i:i + 4]) + for i in xrange(0, len(s), 4)) + +def words_64(s): + """Split S into 64-bit pieces and report their values as hex.""" + return ' '.join('%016x' % C.MP.loadb(s[i:i + 8]) + for i in xrange(0, len(s), 8)) + +def repmask(val, wd, n): + """Return a mask consisting of N copies of the WD-bit value VAL.""" + v = C.GF(val) + a = C.GF(0) + for i in xrange(n): a = (a << wd) | v + return a + +def combs(things, k): + """Iterate over all possible combinations of K of the THINGS.""" + ii = range(k) + n = len(things) + while True: + yield [things[i] for i in ii] + for j in xrange(k): + if j == k - 1: lim = n + else: lim = ii[j + 1] + i = ii[j] + 1 + if i < lim: + ii[j] = i + break + ii[j] = j + else: + return + +POLYMAP = {} + +def poly(nbits): + """ + Return the lexically first irreducible polynomial of degree NBITS of lowest + weight. + """ + try: return POLYMAP[nbits] + except KeyError: pass + base = C.GF(0).setbit(nbits).setbit(0) + for k in xrange(1, nbits, 2): + for cc in combs(range(1, nbits), k): + p = base + sum(C.GF(0).setbit(c) for c in cc) + if p.irreduciblep(): POLYMAP[nbits] = p; return p + raise ValueError, nbits + +def gcm_mangle(x): + """Flip the bits within each byte according to GCM's insane convention.""" + y = C.WriteBuffer() + for b in x: + b = ord(b) + bb = 0 + for i in xrange(8): + bb <<= 1 + if b&1: bb |= 1 + b >>= 1 + y.putu8(bb) + return y.contents + +def endswap_words_32(x): + """End-swap each 32-bit word of X.""" + x = C.ReadBuffer(x) + y = C.WriteBuffer() + while x.left: y.putu32l(x.getu32b()) + return y.contents + +def endswap_words_64(x): + """End-swap each 64-bit word of X.""" + x = C.ReadBuffer(x) + y = C.WriteBuffer() + while x.left: y.putu64l(x.getu64b()) + return y.contents + +def endswap_bytes(x): + """End-swap X by bytes.""" + y = C.WriteBuffer() + for ch in reversed(x): y.put(ch) + return y.contents + +def gfmask(n): + return C.GF(C.MP(0).setbit(n) - 1) + +def gcm_mul(x, y): + """Multiply X and Y according to the GCM rules.""" + w = len(x) + p = poly(8*w) + u, v = C.GF.loadl(gcm_mangle(x)), C.GF.loadl(gcm_mangle(y)) + z = (u*v)%p + return gcm_mangle(z.storel(w)) + +DEMOMAP = {} +def demo(func): + name = func.func_name + assert(name.startswith('demo_')) + DEMOMAP[name[5:].replace('_', '-')] = func + return func + +def iota(i = 0): + vi = [i] + def next(): vi[0] += 1; return vi[0] - 1 + return next + +###-------------------------------------------------------------------------- +### Portable table-driven implementation. + +def shift_left(x): + """Given a field element X (in external format), return X t.""" + w = len(x) + p = poly(8*w) + return gcm_mangle(C.GF.storel((C.GF.loadl(gcm_mangle(x)) << 1)%p)) + +def table_common(u, v, flip, getword, ixmask): + """ + Multiply U by V using table lookup; common for `table-b' and `table-l'. + + This matches the `simple_mulk_...' implementation in `gcm.c'. One-entry + per bit is the best we can manage if we want a constant-time + implementation: processing n bits at a time means we need to scan + (2^n - 1)/n times as much memory. + + * FLIP is a function (assumed to be an involution) on one argument X to + convert X from external format to table-entry format or back again. + + * GETWORD is a function on one argument B to retrieve the next 32-bit + chunk of a field element held in a `ReadBuffer'. Bits within a word + are processed most-significant first. 
+ + * IXMASK is a mask XORed into table indices to permute the table so that + its order matches that induced by GETWORD. + + The table is built such that tab[i XOR IXMASK] = U t^i. + """ + w = len(u); assert(w == len(v)) + a = C.ByteString.zero(w) + tab = [None]*(8*w) + for i in xrange(8*w): + print ';; %9s = %7s = %s' % ('utab[%d]' % i, 'u t^%d' % i, words(u)) + tab[i ^ ixmask] = flip(u) + u = shift_left(u) + v = C.ReadBuffer(v) + i = 0 + while v.left: + t = getword(v) + for j in xrange(32): + bit = (t >> 31)&1 + if bit: a ^= tab[i] + print ';; %6s = %d: a <- %s [%9s = %s]' % \ + ('v[%d]' % (i ^ ixmask), bit, words(a), + 'utab[%d]' % (i ^ ixmask), words(tab[i])) + i += 1; t <<= 1 + return flip(a) + +@demo +def demo_table_b(u, v): + """Big-endian table lookup.""" + return table_common(u, v, lambda x: x, lambda b: b.getu32b(), 0) + +@demo +def demo_table_l(u, v): + """Little-endian table lookup.""" + return table_common(u, v, endswap_words_32, lambda b: b.getu32l(), 0x18) + +###-------------------------------------------------------------------------- +### Implementation using 64×64->128-bit binary polynomial multiplication. + +_i = iota() +TAG_INPUT_U = _i() +TAG_INPUT_V = _i() +TAG_KPIECE_U = _i() +TAG_KPIECE_V = _i() +TAG_PRODPIECE = _i() +TAG_PRODSUM = _i() +TAG_PRODUCT = _i() +TAG_SHIFTED = _i() +TAG_REDCBITS = _i() +TAG_REDCFULL = _i() +TAG_REDCMIX = _i() +TAG_OUTPUT = _i() + +def split_gf(x, n): + n /= 8 + return [C.GF.loadb(x[i:i + n]) for i in xrange(0, len(x), n)] + +def join_gf(xx, n): + x = C.GF(0) + for i in xrange(len(xx)): x = (x << n) | xx[i] + return x + +def present_gf(x, w, n, what): + firstp = True + m = gfmask(n) + for i in xrange(0, w, 128): + print ';; %12s%c =%s' % \ + (firstp and what or '', + firstp and ':' or ' ', + ''.join([j < w + and ' 0x%s' % hex(((x >> j)&m).storeb(n/8)) + or '' + for j in xrange(i, i + 128, n)])) + firstp = False + +def present_gf_pclmul(tag, wd, x, w, n, what): + if tag != TAG_PRODPIECE: present_gf(x, w, n, what) + +def reverse(x, w): + return C.GF.loadl(x.storeb(w/8)) + +def rev32(x): + w = x.noctets + m_ffff = repmask(0xffff, 32, w/4) + m_ff = repmask(0xff, 16, w/2) + x = ((x&m_ffff) << 16) | ((x >> 16)&m_ffff) + x = ((x&m_ff) << 8) | ((x >> 8)&m_ff) + return x + +def rev8(x): + w = x.noctets + m_0f = repmask(0x0f, 8, w) + m_33 = repmask(0x33, 8, w) + m_55 = repmask(0x55, 8, w) + x = ((x&m_0f) << 4) | ((x >> 4)&m_0f) + x = ((x&m_33) << 2) | ((x >> 2)&m_33) + x = ((x&m_55) << 1) | ((x >> 1)&m_55) + return x + +def present_gf_mullp64(tag, wd, x, w, n, what): + if tag == TAG_PRODPIECE or tag == TAG_REDCFULL: + return + elif (wd == 128 or wd == 64) and TAG_PRODSUM <= tag <= TAG_PRODUCT: + y = x + elif (wd == 96 or wd == 192 or wd == 256) and \ + TAG_PRODSUM <= tag < TAG_OUTPUT: + y = x + else: + xx = x.storeb(w/8) + extra = len(xx)%8 + if extra: xx += C.ByteString.zero(8 - extra) + yb = C.WriteBuffer() + for i in xrange(len(xx), 0, -8): yb.put(xx[i - 8:i]) + y = C.GF.loadb(yb.contents) + present_gf(y, (w + 63)&~63, n, what) + +def present_gf_pmull(tag, wd, x, w, n, what): + if tag == TAG_PRODPIECE or tag == TAG_REDCFULL or tag == TAG_SHIFTED: + return + elif tag == TAG_INPUT_V or tag == TAG_KPIECE_V: + bx = C.ReadBuffer(x.storeb(w/8)) + by = C.WriteBuffer() + while bx.left: chunk = bx.get(8); by.put(chunk).put(chunk) + x = C.GF.loadb(by.contents) + w *= 2 + elif TAG_PRODSUM <= tag <= TAG_PRODUCT: + x <<= 1 + y = reverse(rev8(x), w) + present_gf(y, w, n, what) + +def poly64_mul_simple(u, v, presfn, wd, dispwd, mulwd, uwhat, vwhat): + """ +
Multiply U by V, returning the product. + + This is the fallback long multiplication. + """ + + uw, vw = 8*len(u), 8*len(v) + + ## We start by carving the operands into 64-bit pieces. This is + ## straightforward except for the 96-bit case, where we end up with two + ## short pieces which we pad at the beginning. + if uw%mulwd: pad = (-uw)%mulwd; u += C.ByteString.zero(pad); uw += pad + if vw%mulwd: pad = (-vw)%mulwd; v += C.ByteString.zero(pad); vw += pad + uu = split_gf(u, mulwd) + vv = split_gf(v, mulwd) + + ## Report and accumulate the individual product pieces. + x = C.GF(0) + ulim, vlim = uw/mulwd, vw/mulwd + for i in xrange(ulim + vlim - 2, -1, -1): + t = C.GF(0) + for j in xrange(max(0, i - vlim + 1), min(vlim, i + 1)): + s = uu[ulim - 1 - i + j]*vv[vlim - 1 - j] + presfn(TAG_PRODPIECE, wd, s, 2*mulwd, dispwd, + '%s_%d %s_%d' % (uwhat, i - j, vwhat, j)) + t += s + presfn(TAG_PRODSUM, wd, t, 2*mulwd, dispwd, + '(%s %s)_%d' % (uwhat, vwhat, ulim + vlim - 2 - i)) + x += t << (mulwd*i) + presfn(TAG_PRODUCT, wd, x, uw + vw, dispwd, '%s %s' % (uwhat, vwhat)) + + return x + +def poly64_mul_karatsuba(u, v, klimit, presfn, wd, + dispwd, mulwd, uwhat, vwhat): + """ + Multiply U by V, returning the product. + + If the length of U and V is at least KLIMIT, and the operands are otherwise + suitable, then do Karatsuba--Ofman multiplication; otherwise, delegate to + `poly64_mul_simple'. + """ + w = 8*len(u) + + if w < klimit or w != 8*len(v) or w%(2*mulwd) != 0: + return poly64_mul_simple(u, v, presfn, wd, dispwd, mulwd, uwhat, vwhat) + + hw = w/2 + u0, u1 = u[:hw/8], u[hw/8:] + v0, v1 = v[:hw/8], v[hw/8:] + uu, vv = u0 ^ u1, v0 ^ v1 + + presfn(TAG_KPIECE_U, wd, C.GF.loadb(uu), hw, dispwd, '%s*' % uwhat) + presfn(TAG_KPIECE_V, wd, C.GF.loadb(vv), hw, dispwd, '%s*' % vwhat) + uuvv = poly64_mul_karatsuba(uu, vv, klimit, presfn, wd, dispwd, mulwd, + '%s*' % uwhat, '%s*' % vwhat) + + presfn(TAG_KPIECE_U, wd, C.GF.loadb(u0), hw, dispwd, '%s0' % uwhat) + presfn(TAG_KPIECE_V, wd, C.GF.loadb(v0), hw, dispwd, '%s0' % vwhat) + u0v0 = poly64_mul_karatsuba(u0, v0, klimit, presfn, wd, dispwd, mulwd, + '%s0' % uwhat, '%s0' % vwhat) + + presfn(TAG_KPIECE_U, wd, C.GF.loadb(u1), hw, dispwd, '%s1' % uwhat) + presfn(TAG_KPIECE_V, wd, C.GF.loadb(v1), hw, dispwd, '%s1' % vwhat) + u1v1 = poly64_mul_karatsuba(u1, v1, klimit, presfn, wd, dispwd, mulwd, + '%s1' % uwhat, '%s1' % vwhat) + + uvuv = uuvv + u0v0 + u1v1 + presfn(TAG_PRODSUM, wd, uvuv, w, dispwd, '%s!%s' % (uwhat, vwhat)) + + x = u1v1 + (uvuv << hw) + (u0v0 << w) + presfn(TAG_PRODUCT, wd, x, 2*w, dispwd, '%s %s' % (uwhat, vwhat)) + return x + +def poly64_common(u, v, presfn, dispwd = 32, mulwd = 64, redcwd = 32, + klimit = 256): + """ + Multiply U by V using a primitive 64-bit binary polynomial multiplier. + + Such a multiplier exists as the appallingly-named `pclmul[lh]q[lh]qdq' on + x86, and as `vmull.p64'/`pmull' on ARM. + + Operands arrive in a `register format', which is a byte-swapped variant of + the external format. Implementations differ on the precise details, + though. + """ + + ## We work in two main phases: first, calculate the full double-width + ## product; and, second, reduce it modulo the field polynomial. + + w = 8*len(u); assert(w == 8*len(v)) + p = poly(w) + presfn(TAG_INPUT_U, w, C.GF.loadb(u), w, dispwd, 'u') + presfn(TAG_INPUT_V, w, C.GF.loadb(v), w, dispwd, 'v') + + ## So, on to the first part: the multiplication.
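  ## An aside on the Karatsuba step invoked just below: over GF(2) the
  ## classical identity needs no subtractions, because addition is XOR.
  ## With u0 and v0 as the high halves, as in `poly64_mul_karatsuba' above,
  ##
  ##    (u0 t^h + u1) (v0 t^h + v1)
  ##       = u0 v0 t^{2h} + ((u0 + u1)(v0 + v1) + u0 v0 + u1 v1) t^h + u1 v1
  ##
  ## so the three half-width products u0 v0, u1 v1 and (u0 + u1)(v0 + v1)
  ## are all that is needed; they are exactly the u0v0, u1v1 and uuvv that
  ## the function reports and recombines.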
+ x = poly64_mul_karatsuba(u, v, klimit, presfn, w, dispwd, mulwd, 'u', 'v') + + ## Now we have to shift everything up one bit to account for GCM's crazy + ## bit ordering. + y = x << 1 + if w == 96: y >>= 64 + presfn(TAG_SHIFTED, w, y, 2*w, dispwd, 'y') + + ## Now for the reduction. + ## + ## Our polynomial has the form p = t^d + r where r = SUM_{0<=i<d} r_i t^i + ## has only low-degree nonzero coefficients. Carve the product into n-bit + ## chunks, y = SUM_{0<=i<2m} y_i t^{ni}, with d = m n; the top half, i.e., + ## the chunks y_i with i >= m, needs reduction; but + ## y_i t^{ni} = y_i r t^{n(i-m)}, so we just multiply the top half by r and + ## add it to the bottom half. This all depends on r_i = 0 for all i >= + ## n/2. We process each nonzero coefficient of r separately, in two + ## passes. + ## + ## Multiplying a chunk y_i by some t^j is the same as shifting it left by j + ## bits (or would be if GCM weren't backwards, but let's not worry about + ## that right now). The high j bits will spill over into the next chunk, + ## while the low n - j bits will stay where they are. It's these high bits + ## which cause trouble -- particularly the high bits of the top chunk, + ## since we'll add them on to y_m, which will need further reduction. But + ## only the topmost j bits will do this. + ## + ## The trick is that we do all of the bits which spill over first -- all of + ## the top j bits in each chunk, for each j -- in one pass, and then a + ## second pass of all the bits which don't. Because j, j' < n/2 for any + ## two nonzero coefficient degrees j and j', we have j + j' < n whence j < + ## n - j' -- so all of the bits contributed to y_m will be handled in the + ## second pass when we handle the bits that don't spill over. + rr = [i for i in xrange(1, w) if p.testbit(i)] + m = gfmask(redcwd) + + ## Handle the spilling bits. + yy = split_gf(y.storeb(w/4), redcwd) + b = C.GF(0) + for rj in rr: + br = [(yi << (redcwd - rj))&m for yi in yy[w/redcwd:]] + presfn(TAG_REDCBITS, w, join_gf(br, redcwd), w, dispwd, 'b(%d)' % rj) + b += join_gf(br, redcwd) << (w - redcwd) + presfn(TAG_REDCFULL, w, b, 2*w, dispwd, 'b') + s = y + b + presfn(TAG_REDCMIX, w, s, 2*w, dispwd, 's') + + ## Handle the nonspilling bits. + ss = split_gf(s.storeb(w/4), redcwd) + a = C.GF(0) + for rj in rr: + ar = [si >> rj for si in ss[w/redcwd:]] + presfn(TAG_REDCBITS, w, join_gf(ar, redcwd), w, dispwd, 'a(%d)' % rj) + a += join_gf(ar, redcwd) + presfn(TAG_REDCFULL, w, a, w, dispwd, 'a') + + ## Mix everything together. + m = gfmask(w) + z = (s&m) + (s >> w) + a + presfn(TAG_OUTPUT, w, z, w, dispwd, 'z') + + ## And we're done. + return z.storeb(w/8) + +@demo +def demo_pclmul(u, v): + return poly64_common(u, v, presfn = present_gf_pclmul) + +@demo +def demo_vmullp64(u, v): + w = 8*len(u) + return poly64_common(u, v, presfn = present_gf_mullp64, + redcwd = w%64 == 32 and 32 or 64) + +@demo +def demo_pmull(u, v): + w = 8*len(u) + return poly64_common(u, v, presfn = present_gf_pmull, + redcwd = w%64 == 32 and 32 or 64) + +###-------------------------------------------------------------------------- +### @@@ Random debris to be deleted.
@@@ + +def cutting_room_floor(): + + x = C.bytes('cde4bef260d7bcda163547d348b7551195e77022907dd1df') + y = C.bytes('f7dac5c9941d26d0c6eb14ad568f86edd1dc9268eeee5332') + + u, v = C.GF.loadb(x), C.GF.loadb(y) + + g = u*v << 1 + print 'y = %s' % words(g.storeb(48)) + b1 = (g&repmask(0x01, 32, 6)) << 191 + b2 = (g&repmask(0x03, 32, 6)) << 190 + b7 = (g&repmask(0x7f, 32, 6)) << 185 + b = b1 + b2 + b7 + print 'b = %s' % words(b.storeb(48)[0:28]) + h = g + b + print 'w = %s' % words(h.storeb(48)) + + a0 = (h&repmask(0xffffffff, 32, 6)) << 192 + a1 = (h&repmask(0xfffffffe, 32, 6)) << 191 + a2 = (h&repmask(0xfffffffc, 32, 6)) << 190 + a7 = (h&repmask(0xffffff80, 32, 6)) << 185 + a = a0 + a1 + a2 + a7 + + print ' a_1 = %s' % words(a1.storeb(48)[0:24]) + print ' a_2 = %s' % words(a2.storeb(48)[0:24]) + print ' a_7 = %s' % words(a7.storeb(48)[0:24]) + + print 'low+unit = %s' % words((h + a0).storeb(48)[0:24]) + print ' low+0,2 = %s' % words((h + a0 + a2).storeb(48)[0:24]) + print ' 1,7 = %s' % words((a1 + a7).storeb(48)[0:24]) + + print 'a = %s' % words(a.storeb(48)[0:24]) + z = h + a + print 'z = %s' % words(z.storeb(48)) + + z = gcm_mul(x, y) + print 'u v mod p = %s' % words(z) + +###-------------------------------------------------------------------------- +### Main program. + +style = argv[1] +u = C.bytes(argv[2]) +v = C.bytes(argv[3]) +zz = DEMOMAP[style](u, v) +assert zz == gcm_mul(u, v) + +###----- That's all, folks -------------------------------------------------- -- 2.11.0
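A closing note on the `shift everything up one bit' step in `poly64_common': it rests on one identity about bit reversal. Writing rev_n for reversal of the low n bits, operands of degree below n satisfy rev_n(u) rev_n(v) = rev_{2n-1}(u v), so the double-width product of the reflected operands is the true product reflected into 2n - 1 bits, which sits one position low in a 2n-bit register; hence y = x << 1 before reducing. The standalone check below is plain Python, separate from the script (`clmul' and `rev' are names made up here); the script itself can be run as `gcm-ref STYLE U-HEX V-HEX' to trace the same computation step by step.

def clmul(a, b):
  # Carry-less (GF(2) polynomial) product of two integers.
  z = 0
  while b:
    if b & 1: z ^= a
    a <<= 1; b >>= 1
  return z

def rev(x, n):
  # Reverse the low N bits of X.
  z = 0
  for i in range(n):
    z = (z << 1) | ((x >> i) & 1)
  return z

n = 128
u = 0x0123456789abcdef02468ace13579bdf
v = 0x0fedcba987654321fdb97531eca86420
# Reflected operands multiply to the reflection of the product ...
assert clmul(rev(u, n), rev(v, n)) == rev(clmul(u, v), 2*n - 1)
# ... which in the full 2n-bit frame lands one bit low, hence the shift.
assert rev(clmul(u, v), 2*n) == clmul(rev(u, n), rev(v, n)) << 1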