From ac4a43c1da1d1554247212289cf483934b9e6d1c Mon Sep 17 00:00:00 2001
From: Mark Wooding
Date: Tue, 16 Jan 2024 14:03:11 +0000
Subject: [PATCH] symm/gcm.c, symm/gcm-*.S, utils/gcm-ref: Replace one-bit
 shift with algebra.

The ARM32 and x86 instruction sets lack an instruction to reverse the
order of bits in each byte of a vector register.  Therefore, they
resolve the GCM bit-ordering nightmare by reordering the bytes and
working with reversed polynomials.  But when you multiply reversed
polynomials, the result ends up being shifted by one bit relative to
the answer you actually wanted -- and SIMD instruction sets are bad at
multiprecision bit shifts, so this involves a lot of work.

Instead, use algebra.  If the result is shifted by one bit position,
then it's been multiplied by the generator t.  We can therefore fix
this by dividing through by t.  Of course, this might not even be
possible working in the general ring of polynomials over GF(2), but
we're actually working in the GCM quotient field, and t definitely has
an inverse there.  Also, dividing by t will take time and effort -- but
in fact one of the operands is something we know ahead of time and get
to prepare in whatever way we like.  So, in particular, we could divide
it by t before we even start.

The result is that we get to delete a bunch of rather fiddly assembler
code, in favour of some fairly simple C setup (and extra compensation
work in `recover_k').

I stole this trick from PuTTY.
---
 symm/gcm-arm-crypto.S    | 179 +++++---------------
 symm/gcm-x86ish-pclmul.S | 416 ++++++++++++++---------------------------------
 symm/gcm.c               |  93 ++++++-----
 utils/gcm-ref            |  37 +++--
 4 files changed, 253 insertions(+), 472 deletions(-)

diff --git a/symm/gcm-arm-crypto.S b/symm/gcm-arm-crypto.S
index 8861e601..d5a58f89 100644
--- a/symm/gcm-arm-crypto.S
+++ b/symm/gcm-arm-crypto.S
@@ -67,7 +67,14 @@
 // = SUM_{0<=i,jn; i++) { t = x[i]; z[i] = (t >> 1) ^ c; c = t << 31; } }
+#if CPUFAM_X86 || CPUFAM_AMD64 || CPUFAM_ARMEL
+static void divt(const gcm_params *p, uint32 *z, const uint32 *x)
+{
+  uint32 m, c, t;
+  unsigned i;
+
+  t = x[0]; m = -((t >> 31)&1u); c = m&1u;
+  for (i = p->n - 1; i; i--) { t = x[i]; z[i] = (t << 1) | c; c = t >> 31; }
+  t = x[0]; z[0] = ((t ^ (m&p->poly)) << 1) | c;
+}
+#endif
+
 /* --- @mul@ --- *
  *
  * Arguments:   @const gcm_params *p@ = pointer to the parameters
@@ -238,22 +250,26 @@ static void simple_mktable(const gcm_params *p,
 static void pclmul_mktable(const gcm_params *p,
                            uint32 *ktab, const uint32 *k)
 {
-  unsigned n = p->n;
+  unsigned i, n = p->n;
   unsigned nz;
-  uint32 *t;
+  uint32 k_over_t[GCM_NMAX], *t;
 
-  /* We just need to store the value in a way which is convenient for the
-   * assembler code to read back.  That involves reordering the words, and,
-   * in the case of 96-bit blocks, padding with zeroes to fill out a 128-bit
-   * chunk.
+  /* We need to divide the value by t (to compensate for the one-bit shift
+   * resulting from GCM's backwards bit ordering) and store the value in a
+   * way which is convenient for the assembler code to read back.  That
+   * involves reordering the words, and, in the case of 96-bit blocks,
+   * padding with zeroes to fill out a 128-bit chunk.
    */
+  if (!(p->f&GCMF_SWAP)) divt(p, k_over_t, k);
+  else {
+    for (i = 0; i < n; i++) k_over_t[i] = ENDSWAP32(k[i]);
+    divt(p, k_over_t, k_over_t);
+  }
+
   if (n == 3) nz = 1;
   else nz = 0;
-  t = ktab + n + nz;
-
-  if (p->f&GCMF_SWAP) while (n--) { *--t = ENDSWAP32(*k); k++; }
-  else while (n--) *--t = *k++;
+  k = k_over_t; t = ktab + n + nz;
+  while (n--) *--t = *k++;
   while (nz--) *--t = 0;
 }
 #endif
@@ -262,28 +278,27 @@ static void pclmul_mktable(const gcm_params *p,
 static void arm_crypto_mktable(const gcm_params *p,
                                uint32 *ktab, const uint32 *k)
 {
-  unsigned n = p->n;
-  uint32 *t;
+  unsigned i, n = p->n;
+  uint32 k_over_t[GCM_NMAX], *t;
 
-  /* We just need to store the value in a way which is convenient for the
-   * assembler code to read back.  That involves swapping the bytes in each
-   * 64-bit lane.
+  /* We need to divide the value by t (to compensate for the one-bit shift
+   * resulting from GCM's backwards bit ordering) and store the value in a
+   * way which is convenient for the assembler code to read back.  That
+   * involves swapping the bytes in each 64-bit lane.
    */
-  t = ktab;
-  if (p->f&GCMF_SWAP) {
-    while (n >= 2) {
-      t[1] = ENDSWAP32(k[0]); t[0] = ENDSWAP32(k[1]);
-      t += 2; k += 2; n -= 2;
-    }
-    if (n) { t[1] = ENDSWAP32(k[0]); t[0] = 0; }
-  } else {
-    while (n >= 2) {
-      t[1] = k[0]; t[0] = k[1];
-      t += 2; k += 2; n -= 2;
-    }
-    if (n) { t[1] = k[0]; t[0] = 0; }
+  if (!(p->f&GCMF_SWAP)) divt(p, k_over_t, k);
+  else {
+    for (i = 0; i < n; i++) k_over_t[i] = ENDSWAP32(k[i]);
+    divt(p, k_over_t, k_over_t);
+  }
+
+  t = ktab; k = k_over_t;
+  while (n >= 2) {
+    t[1] = k[0]; t[0] = k[1];
+    t += 2; k += 2; n -= 2;
   }
+  if (n) { t[1] = k[0]; t[0] = 0; }
 }
 #endif
@@ -407,12 +422,14 @@ static void pclmul_recover_k(const gcm_params *p,
   const uint32 *t;
 
   /* The representation is already independent of the blockcipher endianness.
-   * We need to compensate for padding, and reorder the words.
+   * We need to compensate for padding, reorder the words, and multiply by t
+   * to compensate for the factor of t we divided out earlier.
    */
   if (n == 3) nz = 1; else nz = 0;
   t = ktab + n + nz;
   while (n--) *k++ = *--t;
+  mult(p, k - p->n, k - p->n);
 }
 #endif
@@ -424,12 +441,14 @@ static void arm_crypto_recover_k(const gcm_params *p,
   const uint32 *t;
 
   /* The representation is already independent of the blockcipher endianness.
-   * We only need to reorder the words.
+   * We only need to reorder the words, and multiply by t to compensate for
+   * the factor of t we divided out earlier.
    */
   t = ktab;
   while (n >= 2) { k[1] = t[0]; k[0] = t[1]; t += 2; k += 2; n -= 2; }
-  if (n) k[0] = t[1];
+  if (n) { k[0] = t[1]; k++; n--; }
+  mult(p, k - p->n, k - p->n);
 }
 #endif
diff --git a/utils/gcm-ref b/utils/gcm-ref
index 174e79e4..406c6595 100755
--- a/utils/gcm-ref
+++ b/utils/gcm-ref
@@ -123,6 +123,12 @@ def shift_left(x):
   p = poly(8*w)
   return gcm_mangle(C.GF.storel((C.GF.loadl(gcm_mangle(x)) << 1)%p))
 
+def shift_right(x):
+  """Given a field element X (in external format), return X/t."""
+  w = len(x)
+  p = poly(8*w)
+  return gcm_mangle(C.GF.storel((C.GF.loadl(gcm_mangle(x))*p.modinv(2))%p))
+
 def table_common(u, v, flip, getword, ixmask):
   """
   Multiply U by V using table lookup; common for `table-b' and `table-l'.
@@ -180,12 +186,12 @@ def demo_table_l(u, v):
 _i = iota()
 TAG_INPUT_U = _i()
 TAG_INPUT_V = _i()
+TAG_SHIFTED_V = _i()
 TAG_KPIECE_U = _i()
 TAG_KPIECE_V = _i()
 TAG_PRODPIECE = _i()
 TAG_PRODSUM = _i()
 TAG_PRODUCT = _i()
-TAG_SHIFTED = _i()
 TAG_REDCBITS = _i()
 TAG_REDCFULL = _i()
 TAG_REDCMIX = _i()
@@ -255,9 +261,9 @@ def present_gf_vmullp64(tag, wd, x, w, n, what):
   present_gf(y, (w + 63)&~63, n, what)
 
 def present_gf_pmull(tag, wd, x, w, n, what):
-  if tag == TAG_PRODPIECE or tag == TAG_REDCFULL or tag == TAG_SHIFTED:
+  if tag == TAG_PRODPIECE or tag == TAG_REDCFULL:
     return
-  elif tag == TAG_INPUT_V or tag == TAG_KPIECE_V:
+  elif tag == TAG_INPUT_V or tag == TAG_SHIFTED_V or tag == TAG_KPIECE_V:
     w = (w + 63)&~63
     bx = C.ReadBuffer(x.storeb(w/8))
     by = C.WriteBuffer()
@@ -431,8 +437,19 @@ def poly64_redc(y, presfn, dispwd, redcwd):
   ## And we're done.
   return z.storeb(w/8)
 
-def poly64_common(u, v, presfn, dispwd = 32, mulwd = 64,
-                  redcwd = 32, klimit = 256):
+def poly64_shiftcommon(u, v, presfn, dispwd = 32, mulwd = 64,
+                       redcwd = 32, klimit = 256):
+  w = 8*len(u)
+  presfn(TAG_INPUT_U, w, C.GF.loadb(u), w, dispwd, 'u')
+  presfn(TAG_INPUT_V, w, C.GF.loadb(v), w, dispwd, 'v')
+  vv = shift_right(v)
+  presfn(TAG_SHIFTED_V, w, C.GF.loadb(vv), w, dispwd, "v'")
+  y = poly64_mul(u, vv, presfn, dispwd, mulwd, klimit, "u", "v'")
+  z = poly64_redc(y, presfn, dispwd, redcwd)
+  return z
+
+def poly64_directcommon(u, v, presfn, dispwd = 32, mulwd = 64,
+                        redcwd = 32, klimit = 256):
   w = 8*len(u)
   presfn(TAG_INPUT_U, w, C.GF.loadb(u), w, dispwd, 'u')
   presfn(TAG_INPUT_V, w, C.GF.loadb(v), w, dispwd, 'v')
@@ -443,19 +460,19 @@ def poly64_common(u, v, presfn, dispwd = 32, mulwd = 64,
 
 @demo
 def demo_pclmul(u, v):
-  return poly64_common(u, v, presfn = present_gf_pclmul)
+  return poly64_shiftcommon(u, v, presfn = present_gf_pclmul)
 
 @demo
 def demo_vmullp64(u, v):
   w = 8*len(u)
-  return poly64_common(u, v, presfn = present_gf_vmullp64,
-                       redcwd = w%64 == 32 and 32 or 64)
+  return poly64_shiftcommon(u, v, presfn = present_gf_vmullp64,
+                            redcwd = w%64 == 32 and 32 or 64)
 
 @demo
 def demo_pmull(u, v):
   w = 8*len(u)
-  return poly64_common(u, v, presfn = present_gf_pmull,
-                       redcwd = w%64 == 32 and 32 or 64)
+  return poly64_directcommon(u, v, presfn = present_gf_pmull,
+                             redcwd = w%64 == 32 and 32 or 64)
 
 ###--------------------------------------------------------------------------
 ### @@@ Random debris to be deleted. @@@
-- 
2.11.0
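
As a rough illustration of the algebra the commit message leans on (not part
of the patch itself), here is a standalone Python sketch of the GCM field
GF(2^128) with p(t) = t^128 + t^7 + t^2 + t + 1, using plain ints with bit i
as the coefficient of t^i.  The names `gf_mul' and `shifted_mul' are
hypothetical helpers; `shifted_mul' merely models the reversed-polynomial
SIMD multiply that, as the commit message describes, picks up a stray factor
of t.  The sketch checks that t^{-1} = t^127 + t^6 + t + 1, and that feeding
the precomputed k/t to the shifted multiply gives back a*k, mirroring what
the new `*_mktable' functions arrange by calling `divt'.

P = (1 << 128) | (1 << 7) | (1 << 2) | (1 << 1) | 1   ## p(t)

def gf_mul(a, b):
  """Multiply A by B in GF(2)[t]/(p(t)) by schoolbook shift-and-reduce."""
  r = 0
  while b:
    if b & 1: r ^= a
    b >>= 1
    a <<= 1
    if a >> 128: a ^= P
  return r

## t is invertible because p(0) = 1: from p(t) = 0 we get
## t*(t^127 + t^6 + t + 1) = p(t) + 1 = 1.
T = 1 << 1
T_INV = (1 << 127) | (1 << 6) | (1 << 1) | 1
assert gf_mul(T, T_INV) == 1

## Model of the reversed-polynomial multiply: it returns a*b*t, not a*b.
def shifted_mul(a, b): return gf_mul(gf_mul(a, b), T)

a = 0x0123456789abcdef0123456789abcdef
k = 0xfedcba9876543210fedcba9876543210
k_over_t = gf_mul(k, T_INV)        ## precomputed once, as in `*_mktable'
assert shifted_mul(a, k_over_t) == gf_mul(a, k)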
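
Dividing by t does not need a general modular inverse: because p(t) has a
nonzero constant term, it reduces to a one-bit shift plus a conditional
reduction, which is essentially what the C `divt' above does word by word
(the reference `shift_right' gets the same effect by multiplying by
`p.modinv(2)').  The following self-contained sketch uses the same plain-int
convention as the previous one, with bit i as the coefficient of t^i, so the
shift directions are mirrored relative to the byte- and word-reversed layout
the real code handles; `mul_t' and `div_t' are illustrative names, not
Catacomb functions.

P = (1 << 128) | (1 << 7) | (1 << 2) | (1 << 1) | 1   ## p(t)

def mul_t(x):
  """Multiply X by t, reducing if the product spills past t^127."""
  x <<= 1
  if x >> 128: x ^= P
  return x

def div_t(x):
  """Divide X by t: add p(t) if needed to clear the t^0 bit, then shift."""
  if x & 1: x ^= P
  return x >> 1

x = 0x0123456789abcdef0123456789abcdef
assert div_t(mul_t(x)) == x and mul_t(div_t(x)) == x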