From 3709fba58c13ab6b9242ea1c63f7f559901dac30 Mon Sep 17 00:00:00 2001
From: Mark Wooding
Date: Mon, 15 Jan 2024 18:41:30 +0000
Subject: [PATCH] @@@ mess

---
 base/asm-common.h        |  16 ++
 symm/gcm-arm-crypto.S    | 185 ++++++----------------
 symm/gcm-x86ish-pclmul.S | 422 +++++++++++++++--------------------------------
 symm/gcm.c               |  93 ++++++-----
 utils/gcm-ref            |  99 ++++++-----
 5 files changed, 310 insertions(+), 505 deletions(-)

diff --git a/base/asm-common.h b/base/asm-common.h
index b4d4a909..0c209580 100644
--- a/base/asm-common.h
+++ b/base/asm-common.h
@@ -666,6 +666,22 @@ name:
 	vext.8	\vd, \vn, \vz, #\nbit >> 3
 .endm
 
+.macro vrol128 vd, vn, nbit
+	// Set VD to VN rotated left by NBIT.  NBIT must be a multiple of 8.
+  .if \nbit&7 != 0
+	.error "shift quantity must be whole number of bytes"
+  .endif
+	vext.8	\vd, \vn, \vn, #16 - (\nbit >> 3)
+.endm
+
+.macro vror128 vd, vn, nbit
+	// Set VD to VN rotated right by NBIT.  NBIT must be a multiple of 8.
+  .if \nbit&7 != 0
+	.error "shift quantity must be whole number of bytes"
+  .endif
+	vext.8	\vd, \vn, \vn, #\nbit >> 3
+.endm
+
 // Apply decoration decor to register name reg.
 #define _REGFORM(reg, decor) _GLUE(_REGFORM_, reg)(decor)
 
diff --git a/symm/gcm-arm-crypto.S b/symm/gcm-arm-crypto.S
index ddc714b3..12d6aa42 100644
--- a/symm/gcm-arm-crypto.S
+++ b/symm/gcm-arm-crypto.S
@@ -60,14 +60,21 @@
 	// u v = SUM_{0<=i,j<n} u_i v_j t^{i+j}

diff --git a/symm/gcm.c b/symm/gcm.c
--- a/symm/gcm.c
+++ b/symm/gcm.c
   for (i = 0; i < p->n; i++) { t = x[i]; z[i] = (t >> 1) ^ c; c = t << 31; }
 }
 
+#if CPUFAM_X86 || CPUFAM_AMD64 || CPUFAM_ARMEL
+static void divt(const gcm_params *p, uint32 *z, const uint32 *x)
+{
+  uint32 m, c, t;
+  unsigned i;
+
+  t = x[0]; m = -((t >> 31)&1u); c = m&1u;
+  for (i = p->n - 1; i; i--) { t = x[i]; z[i] = (t << 1) | c; c = t >> 31; }
+  t = x[0]; z[0] = ((t ^ (m&p->poly)) << 1) | c;
+}
+#endif
+
 /* --- @mul@ --- *
  *
  * Arguments:	@const gcm_params *p@ = pointer to the parameters
 
@@ -238,22 +250,26 @@ static void simple_mktable(const gcm_params *p,
 static void pclmul_mktable(const gcm_params *p,
 			   uint32 *ktab, const uint32 *k)
 {
-  unsigned n = p->n;
+  unsigned i, n = p->n;
   unsigned nz;
-  uint32 *t;
+  uint32 k_over_t[GCM_NMAX], *t;
 
-  /* We just need to store the value in a way which is convenient for the
-   * assembler code to read back.  That involves reordering the words, and,
-   * in the case of 96-bit blocks, padding with zeroes to fill out a 128-bit
-   * chunk.
+  /* We need to divide the value by t (to compensate for the one-bit shift
+   * resulting from GCM's backwards bit ordering) and store the value in a
+   * way which is convenient for the assembler code to read back.  That
+   * involves reordering the words, and, in the case of 96-bit blocks,
+   * padding with zeroes to fill out a 128-bit chunk.
    */
+  if (!(p->f&GCMF_SWAP)) divt(p, k_over_t, k);
+  else {
+    for (i = 0; i < n; i++) k_over_t[i] = ENDSWAP32(k[i]);
+    divt(p, k_over_t, k_over_t);
+  }
+
   if (n == 3) nz = 1;
   else nz = 0;
-  t = ktab + n + nz;
-
-  if (p->f&GCMF_SWAP) while (n--) { *--t = ENDSWAP32(*k); k++; }
-  else while (n--) *--t = *k++;
+  k = k_over_t; t = ktab + n + nz;
+  while (n--) *--t = *k++;
   while (nz--) *--t = 0;
 }
 #endif
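As a sanity check on the arithmetic, here is a rough Python rendition of the
two primitives, separate from the patch itself: divt transcribes the new C
function above, while mult is a reconstruction of the multiply-by-t routine
in gcm.c, of which only the final loop is visible as context here, so treat
it as an assumption; the sample values are illustrative.  Elements are lists
of 32-bit words, word 0 carrying the low-degree coefficients at its most
significant end (GCM's reflected convention), and poly is the top word of
the reduction polynomial, 0xe1000000 for the 128-bit field.

    M32 = 0xffffffff

    def mult(poly, x):
      ## Multiply the field element X by t: shift the reflected bit string
      ## one place rightwards, folding the outgoing bit back in through POLY.
      m = -(x[-1]&1)&M32; c = m&poly; z = []
      for t in x: z.append(((t >> 1) ^ c)&M32); c = (t << 31)&M32
      return z

    def divt(poly, x):
      ## Divide the field element X by t, undoing mult() step for step:
      ## clear POLY out of the top word if a set bit is about to fall off
      ## the end, then shift one place leftwards.
      m = -((x[0] >> 31)&1)&M32; c = m&1; z = list(x)
      for i in range(len(x) - 1, 0, -1):
        t = x[i]; z[i] = ((t << 1) | c)&M32; c = t >> 31
      t = x[0]; z[0] = (((t ^ (m&poly)) << 1) | c)&M32
      return z

    X = [0xdeadbeef, 0x01234567, 0x89abcdef, 0xfedcba98]
    assert mult(0xe1000000, divt(0xe1000000, X)) == X

The round trip is the point: the tables hold K/t precisely so that a later
multiplication by t lands back on K.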
@@ -262,28 +278,27 @@ static void arm_crypto_mktable(const gcm_params *p,
 			      uint32 *ktab, const uint32 *k)
 {
-  unsigned n = p->n;
-  uint32 *t;
+  unsigned i, n = p->n;
+  uint32 k_over_t[GCM_NMAX], *t;
 
-  /* We just need to store the value in a way which is convenient for the
-   * assembler code to read back.  That involves swapping the bytes in each
-   * 64-bit lane.
+  /* We need to divide the value by t (to compensate for the one-bit shift
+   * resulting from GCM's backwards bit ordering) and store the value in a
+   * way which is convenient for the assembler code to read back.  That
+   * involves swapping the bytes in each 64-bit lane.
    */
-  t = ktab;
-  if (p->f&GCMF_SWAP) {
-    while (n >= 2) {
-      t[1] = ENDSWAP32(k[0]); t[0] = ENDSWAP32(k[1]);
-      t += 2; k += 2; n -= 2;
-    }
-    if (n) { t[1] = ENDSWAP32(k[0]); t[0] = 0; }
-  } else {
-    while (n >= 2) {
-      t[1] = k[0]; t[0] = k[1];
-      t += 2; k += 2; n -= 2;
-    }
-    if (n) { t[1] = k[0]; t[0] = 0; }
+  if (!(p->f&GCMF_SWAP)) divt(p, k_over_t, k);
+  else {
+    for (i = 0; i < n; i++) k_over_t[i] = ENDSWAP32(k[i]);
+    divt(p, k_over_t, k_over_t);
+  }
+
+  t = ktab; k = k_over_t;
+  while (n >= 2) {
+    t[1] = k[0]; t[0] = k[1];
+    t += 2; k += 2; n -= 2;
   }
+  if (n) { t[1] = k[0]; t[0] = 0; }
 }
 #endif
 
@@ -407,12 +422,14 @@ static void pclmul_recover_k(const gcm_params *p,
   const uint32 *t;
 
   /* The representation is already independent of the blockcipher endianness.
-   * We need to compensate for padding, and reorder the words.
+   * We need to compensate for padding, reorder the words, and multiply by t
+   * to compensate for the factor of t we divided out earlier.
    */
 
   if (n == 3) nz = 1;
   else nz = 0;
   t = ktab + n + nz;
   while (n--) *k++ = *--t;
+  mult(p, k - p->n, k - p->n);
 }
 #endif
 
@@ -424,12 +441,14 @@ static void arm_crypto_recover_k(const gcm_params *p,
   const uint32 *t;
 
   /* The representation is already independent of the blockcipher endianness.
-   * We only need to reorder the words.
+   * We only need to reorder the words, and multiply by t to compensate for
+   * the factor of t we divided out earlier.
    */
 
   t = ktab;
   while (n >= 2) { k[1] = t[0]; k[0] = t[1]; t += 2; k += 2; n -= 2; }
-  if (n) k[0] = t[1];
+  if (n) { k[0] = t[1]; k++; n--; }
+  mult(p, k - p->n, k - p->n);
 }
 #endif
 
diff --git a/utils/gcm-ref b/utils/gcm-ref
index 4a53737b..406c6595 100755
--- a/utils/gcm-ref
+++ b/utils/gcm-ref
@@ -123,11 +123,17 @@ def shift_left(x):
   p = poly(8*w)
   return gcm_mangle(C.GF.storel((C.GF.loadl(gcm_mangle(x)) << 1)%p))
 
+def shift_right(x):
+  """Given a field element X (in external format), return X/t."""
+  w = len(x)
+  p = poly(8*w)
+  return gcm_mangle(C.GF.storel((C.GF.loadl(gcm_mangle(x))*p.modinv(2))%p))
+
 def table_common(u, v, flip, getword, ixmask):
   """
   Multiply U by V using table lookup; common for `table-b' and `table-l'.
 
-  This matches the `simple_mulk_...' implementation in `gcm.c'.  One-entry
+  This matches the `simple_mulk_...' implementation in `gcm.c'.  One entry
   per bit is the best we can manage if we want a constant-time
   implementation: processing n bits at a time means we need to scan (2^n -
   1)/n times as much memory.
@@ -140,7 +146,7 @@ def table_common(u, v, flip, getword, ixmask):
   are processed most-significant first.
 
   * IXMASK is a mask XORed into table indices to permute the table so that
-    it's order matches that induced by GETWORD.
+    its order matches that induced by GETWORD.
 
   The table is built such that tab[i XOR IXMASK] = U t^i.
 
@@ -172,7 +178,7 @@ def demo_table_b(u, v):
 @demo
 def demo_table_l(u, v):
   """Little-endian table lookup."""
-  return table_common(u, v, endswap_words, lambda b: b.getu32l(), 0x18)
+  return table_common(u, v, endswap_words_32, lambda b: b.getu32l(), 0x18)
 
 ###--------------------------------------------------------------------------
 ### Implementation using 64×64->128-bit binary polynomial multiplication.
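The identity that lets the tables absorb the shift: carry-less multiplication
of bit-reflected operands produces the bit-reflected product one position
short of where GCM wants it.  For n-bit a and b, rev_n(a).rev_n(b) =
rev_{2n-1}(a.b), so a pclmul- or vmull.p64-style product of reflected
operands is a factor of t too small in the reflected view.  That is what the
old `y = x << 1' fix-up, removed further down, compensated for after every
multiplication, and what dividing K by t now cancels once, up front.  A
self-contained sketch, with illustrative values:

    def clmul(a, b):
      ## Carry-less (GF(2) polynomial) product of the integers A and B.
      x = 0
      while b:
        if b&1: x ^= a
        a <<= 1; b >>= 1
      return x

    def rev(x, n):
      ## Reverse the low N bits of X.
      return int(format(x, '0%db' % n)[::-1], 2)

    n, a, b = 8, 0x53, 0xd1
    assert clmul(rev(a, n), rev(b, n)) == rev(clmul(a, b), 2*n - 1)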
@@ -180,12 +186,12 @@ def demo_table_l(u, v):
 _i = iota()
 TAG_INPUT_U = _i()
 TAG_INPUT_V = _i()
+TAG_SHIFTED_V = _i()
 TAG_KPIECE_U = _i()
 TAG_KPIECE_V = _i()
 TAG_PRODPIECE = _i()
 TAG_PRODSUM = _i()
 TAG_PRODUCT = _i()
-TAG_SHIFTED = _i()
 TAG_REDCBITS = _i()
 TAG_REDCFULL = _i()
 TAG_REDCMIX = _i()
@@ -237,7 +243,7 @@ def rev8(x):
   x = ((x&m_55) << 1) | ((x >> 1)&m_55)
   return x
 
-def present_gf_mullp64(tag, wd, x, w, n, what):
+def present_gf_vmullp64(tag, wd, x, w, n, what):
   if tag == TAG_PRODPIECE or tag == TAG_REDCFULL:
     return
   elif (wd == 128 or wd == 64) and TAG_PRODSUM <= tag <= TAG_PRODUCT:
@@ -255,9 +261,10 @@ def present_gf_mullp64(tag, wd, x, w, n, what):
   present_gf(y, (w + 63)&~63, n, what)
 
 def present_gf_pmull(tag, wd, x, w, n, what):
-  if tag == TAG_PRODPIECE or tag == TAG_REDCFULL or tag == TAG_SHIFTED:
+  if tag == TAG_PRODPIECE or tag == TAG_REDCFULL:
     return
-  elif tag == TAG_INPUT_V or tag == TAG_KPIECE_V:
+  elif tag == TAG_INPUT_V or tag == TAG_SHIFTED_V or tag == TAG_KPIECE_V:
+    w = (w + 63)&~63
     bx = C.ReadBuffer(x.storeb(w/8))
     by = C.WriteBuffer()
     while bx.left: chunk = bx.get(8); by.put(chunk).put(chunk)
@@ -280,10 +287,9 @@ def poly64_mul_simple(u, v, presfn, wd, dispwd, mulwd, uwhat, vwhat):
   ## We start by carving the operands into 64-bit pieces.  This is
   ## straightforward except for the 96-bit case, where we end up with two
   ## short pieces which we pad at the beginning.
-  if uw%mulwd: pad = (-uw)%mulwd; u += C.ByteString.zero(pad); uw += pad
-  if vw%mulwd: pad = (-uw)%mulwd; v += C.ByteString.zero(pad); vw += pad
-  uu = split_gf(u, mulwd)
-  vv = split_gf(v, mulwd)
+  upad = (-uw)%mulwd; u += C.ByteString.zero(upad); uw += upad
+  vpad = (-vw)%mulwd; v += C.ByteString.zero(vpad); vw += vpad
+  uu = split_gf(u, mulwd); vv = split_gf(v, mulwd)
 
   ## Report and accumulate the individual product pieces.
   x = C.GF(0)
@@ -300,7 +306,7 @@ def poly64_mul_simple(u, v, presfn, wd, dispwd, mulwd, uwhat, vwhat):
     x += t << (mulwd*i)
   presfn(TAG_PRODUCT, wd, x, uw + vw, dispwd, '%s %s' % (uwhat, vwhat))
 
-  return x
+  return x >> (upad + vpad)
 
 def poly64_mul_karatsuba(u, v, klimit, presfn, wd,
                          dispwd, mulwd, uwhat, vwhat):
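For comparison with the poly64_mul_karatsuba hunk below, here is one level of
the Karatsuba split over GF(2) in the same plain-integer style, reusing clmul
from the sketch above; the width and operands are illustrative.  Because
addition is XOR, the cross term u0 v1 + u1 v0 equals
(u0 + u1)(v0 + v1) + u0 v0 + u1 v1, so three half-width products do the work
of four:

    def karatsuba(u, v, w):
      ## Carry-less product of U and V, each at most 2W bits wide, using
      ## three W-bit multiplications instead of four.
      m = (1 << w) - 1
      u0, u1 = u&m, u >> w
      v0, v1 = v&m, v >> w
      low, high = clmul(u0, v0), clmul(u1, v1)
      mid = clmul(u0 ^ u1, v0 ^ v1) ^ low ^ high
      return low ^ (mid << w) ^ (high << (2*w))

    assert karatsuba(0x53, 0xd1, 4) == clmul(0x53, 0xd1)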
@@ -343,8 +349,7 @@ def poly64_mul_karatsuba(u, v, klimit, presfn, wd,
   presfn(TAG_PRODUCT, wd, x, 2*w, dispwd, '%s %s' % (uwhat, vwhat))
   return x
 
-def poly64_common(u, v, presfn, dispwd = 32, mulwd = 64, redcwd = 32,
-                  klimit = 256):
+def poly64_mul(u, v, presfn, dispwd, mulwd, klimit, uwhat, vwhat):
   """
   Multiply U by V using a primitive 64-bit binary polynomial multiplier.
 
@@ -353,28 +358,27 @@ def poly64_common(u, v, presfn, dispwd = 32, mulwd = 64, redcwd = 32,
 
   Operands arrive in a `register format', which is a byte-swapped variant
   of the external format.  Implementations differ on the precise details,
-  though.
+  though.  Returns the double-precision product.
   """
 
-  ## We work in two main phases: first, calculate the full double-width
-  ## product; and, second, reduce it modulo the field polynomial.
-
   w = 8*len(u); assert(w == 8*len(v))
-  p = poly(w)
-  presfn(TAG_INPUT_U, w, C.GF.loadb(u), w, dispwd, 'u')
-  presfn(TAG_INPUT_V, w, C.GF.loadb(v), w, dispwd, 'v')
+  x = poly64_mul_karatsuba(u, v, klimit, presfn,
+                           w, dispwd, mulwd, uwhat, vwhat)
 
-  ## So, on to the first part: the multiplication.
-  x = poly64_mul_karatsuba(u, v, klimit, presfn, w, dispwd, mulwd, 'u', 'v')
+  return x.storeb(w/4)
 
-  ## Now we have to shift everything up one bit to account for GCM's crazy
-  ## bit ordering.
-  y = x << 1
-  if w == 96: y >>= 64
-  presfn(TAG_SHIFTED, w, y, 2*w, dispwd, 'y')
+def poly64_redc(y, presfn, dispwd, redcwd):
+  """
+  Reduce a double-precision product Y modulo the appropriate polynomial.
+
+  The operand arrives in a `register format', which is a byte-swapped variant
+  of the external format.  Implementations differ on the precise details,
+  though.  Returns the single-precision reduced value.
+  """
+
+  w = 4*len(y)
+  p = poly(w)
 
-  ## Now for the reduction.
-  ##
-  ## Our polynomial has the form p = t^d + r where r = SUM_{0<=i<d} r_i t^i.
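The comment being moved here introduces the standard reduction fold: with
p = t^d + r, a double-precision product y = y1 t^d + y0 is congruent to
y1 r + y0 (mod p), and because r has small degree a couple of folds finish
the job.  A plain-integer sketch of the unreflected version, again reusing
clmul from earlier; the reflected register format changes the shifts in the
real code, not the algebra:

    def redc(y, d, r):
      ## Reduce the double-precision product Y modulo p = t^d + r by
      ## repeatedly substituting t^d = r.
      m = (1 << d) - 1
      while y >> d:
        y = clmul(y >> d, r) ^ (y&m)
      return y

    d, r = 128, 0x87                    ## p = t^128 + t^7 + t^2 + t + 1
    assert redc((1 << d) ^ 1, d, r) == r ^ 1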