symm/gcm-*.S: GCM acceleration using hardware polynomial multiplication.

author Mark Wooding <mdw@distorted.org.uk>

Tue, 13 Nov 2018 11:28:53 +0000 (11:28 +0000)

committer Mark Wooding <mdw@distorted.org.uk>

Sun, 8 Sep 2019 17:35:39 +0000 (18:35 +0100)
author Mark Wooding <mdw@distorted.org.uk>
Tue, 13 Nov 2018 11:28:53 +0000 (11:28 +0000)
committer Mark Wooding <mdw@distorted.org.uk>
Sun, 8 Sep 2019 17:35:39 +0000 (18:35 +0100)
diff --git a/base/dispatch.c b/base/dispatch.c

index 9ba6a7c..9f2ac71 100644 (file)
--- a/base/dispatch.c
+++ b/base/dispatch.c
@@ -46,6 +46,8 @@
  #  define EFLAGS_ID (1u << 21)
  #  define CPUID1D_SSE2 (1u << 26)
  #  define CPUID1D_FXSR (1u << 24)
+#  define CPUID1C_PCLMUL (1u << 1)
+#  define CPUID1C_SSSE3 (1u << 9)
  #  define CPUID1C_AESNI (1u << 25)
  #  define CPUID1C_AVX (1u << 28)
  #  define CPUID1C_RDRAND (1u << 30)
@@ -282,13 +284,15 @@ static unsigned hwcaps = 0;
         _(ARM_NEON, "arm:neon")                                         \
         _(ARM_V4, "arm:v4")                                             \
         _(ARM_D32, "arm:d32")                                           \
-       _(ARM_AES, "arm:aes")
+       _(ARM_AES, "arm:aes")                                           \
+       _(ARM_PMULL, "arm:pmull")
  #endif
  #if CPUFAM_ARM64
  #  define WANTAUX(_)                                                   \
         WANT_AT_HWCAP(_)
  #  define CAPMAP(_)                                                    \
-       _(ARM_AES, "arm:aes")
+       _(ARM_AES, "arm:aes")                                           \
+       _(ARM_PMULL, "arm:pmull")
  #endif
  
  /* Build the bitmask for `hwcaps' from the `CAPMAP' list. */
@@ -402,9 +406,13 @@ static void probe_hwcaps(void)
  #  ifdef HWCAP2_AES
    if (probed.hwcap2 & HWCAP2_AES) hw |= HF_ARM_AES;
  #  endif
+#  ifdef HWCAP2_PMULL
+  if (probed.hwcap2 & HWCAP2_PMULL) hw |= HF_ARM_PMULL;
+#  endif
  #endif
  #if CPUFAM_ARM64
    if (probed.hwcap & HWCAP_AES) hw |= HF_ARM_AES;
+  if (probed.hwcap & HWCAP_PMULL) hw |= HF_ARM_PMULL;
  #endif
  
    /* Store the bitmask of features we probed for everyone to see. */
@@ -549,6 +557,12 @@ int cpu_feature_p(int feat)
      CASE_CPUFEAT(X86_AVX, "x86:avx",
                  xmm_registers_available_p() &&
                  cpuid_features_p(0, CPUID1C_AVX));
+    CASE_CPUFEAT(X86_SSSE3, "x86:ssse3",
+                xmm_registers_available_p() &&
+                cpuid_features_p(0, CPUID1C_SSSE3));
+    CASE_CPUFEAT(X86_PCLMUL, "x86:pclmul",
+                xmm_registers_available_p() &&
+                cpuid_features_p(0, CPUID1C_PCLMUL));
  #endif
  #ifdef CAPMAP
  #  define FEATP__CASE(feat, tok)                                       \
diff --git a/base/dispatch.h b/base/dispatch.h

index dae6a68..7c08382 100644 (file)
--- a/base/dispatch.h
+++ b/base/dispatch.h
@@ -182,7 +182,10 @@ enum {
    CPUFEAT_ARM_D32,                     /* 32 double registers, not 16 */
    CPUFEAT_X86_RDRAND,                  /* Built-in entropy source */
    CPUFEAT_ARM_AES,                     /* AES instructions */
-  CPUFEAT_X86_AVX                      /* AVX 1 (i.e., 256-bit YMM regs) */
+  CPUFEAT_X86_AVX,                     /* AVX 1 (i.e., 256-bit YMM regs) */
+  CPUFEAT_X86_SSSE3,                   /* Supplementary SSE 3 */
+  CPUFEAT_X86_PCLMUL,                  /* Carry-less multiplication */
+  CPUFEAT_ARM_PMULL                    /* Polynomial multiplication */
  };
  
  extern int cpu_feature_p(int /*feat*/);
diff --git a/symm/Makefile.am b/symm/Makefile.am

index cb20f3a..a4f45e9 100644 (file)
--- a/symm/Makefile.am
+++ b/symm/Makefile.am
@@ -323,6 +323,18 @@ BLKCMACMODES               += cmac pmac1
  pkginclude_HEADERS     += ocb.h
  BLKCAEADMODES          += ccm eax gcm ocb1 ocb3
  libsymm_la_SOURCES     += ccm.c gcm.c ocb.c
+if CPUFAM_X86
+libsymm_la_SOURCES     += gcm-x86ish-pclmul.S
+endif
+if CPUFAM_AMD64
+libsymm_la_SOURCES     += gcm-x86ish-pclmul.S
+endif
+if CPUFAM_ARMEL
+libsymm_la_SOURCES     += gcm-arm-crypto.S
+endif
+if CPUFAM_ARM64
+libsymm_la_SOURCES     += gcm-arm64-pmull.S
+endif
  
  TESTS                  += gcm.t$(EXEEXT)
  EXTRA_DIST             += t/gcm
diff --git a/symm/gcm-arm-crypto.S b/symm/gcm-arm-crypto.S

new file mode 100644 (file)

index 0000000..ddc714b
--- /dev/null
+++ b/symm/gcm-arm-crypto.S
@@ -0,0 +1,718 @@
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// GCM acceleration for ARM processors
+///
+/// (c) 2019 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software: you can redistribute it and/or modify it
+/// under the terms of the GNU Library General Public License as published
+/// by the Free Software Foundation; either version 2 of the License, or
+/// (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful, but
+/// WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+/// Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb.  If not, write to the Free Software
+/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+/// USA.
+
+///--------------------------------------------------------------------------
+/// Preliminaries.
+
+#include "config.h"
+#include "asm-common.h"
+
+       .arch   armv8-a
+       .fpu    crypto-neon-fp-armv8
+
+       .text
+
+///--------------------------------------------------------------------------
+/// Multiplication macros.
+
+       // The good news is that we have a fancy instruction to do the
+       // multiplications.  The bad news is that it's not particularly well-
+       // suited to the job.
+       //
+       // For one thing, it only does a 64-bit multiplication, so in general
+       // we'll need to synthesize the full-width multiply by hand.  For
+       // another thing, it doesn't help with the reduction, so we have to
+       // do that by hand too.  And, finally, GCM has crazy bit ordering,
+       // and the instruction does nothing useful for that at all.
+       //
+       // Focusing on that last problem first: the bits aren't in monotonic
+       // significance order unless we permute them.  If we reverse the byte
+       // order, then we'll have the bits in monotonic order, but backwards,
+       // so the degree-0 coefficient will be in the most-significant bit.
+       //
+       // This is less of a difficulty than it seems at first, because
+       // algebra.  Suppose we are given u = SUM_{0<=i<n} u_i t^i and v =
+       // SUM_{0<=j<n} v_j t^j; then
+       //
+       //      u v = SUM_{0<=i,j<n} u_i v_j t^{i+j}
+       //
+       // Suppose instead that we're given ũ = SUM_{0<=i<n} u_{n-i-1} t^i
+       // and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards.
+       // Then
+       //
+       //      ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j}
+       //          = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)}
+       //
+       // which is almost the bit-reversal of u v, only it's shifted right
+       // by one place.  Oh, well: we'll have to shift it back later.
+       //
+       // That was important to think about, but there's not a great deal to
+       // do about it yet other than to convert what we've got from the
+       // blockcipher's byte-ordering convention to our big-endian
+       // convention.  Since this depends on the blockcipher convention,
+       // we'll leave the caller to cope with this: the macros here will
+       // assume that the operands are in `register' format, which is the
+       // same as the external representation, except that the bytes within
+       // each 64-bit piece are reversed.  In the commentary, pieces of
+       // polynomial are numbered according to the degree of the
+       // coefficients, so the unit coefficient of some polynomial a is in
+       // a_0.
+       //
+       // The commentary for `mul128' is the most detailed.  The other
+       // macros assume that you've already read and understood that.
+
+.macro mul128
+       // Enter with u and v in q0 and q1 respectively; leave with z = u v
+       // in q0.  Clobbers q1--q3, q8, q9.
+
+       // First for the double-precision multiplication.  It's tempting to
+       // use Karatsuba's identity here, but I suspect that loses more in
+       // the shifting, bit-twiddling, and dependency chains that it gains
+       // in saving a multiplication which otherwise pipelines well.
+       // q0 =                         // (u_0; u_1)
+       // q1 =                         // (v_0; v_1)
+       vmull.p64 q2, d1, d2            // u_1 v_0
+       vmull.p64 q3, d0, d3            // u_0 v_1
+       vmull.p64 q8, d1, d3            // (x_3; t_1) = u_1 v_1
+       vmull.p64 q9, d0, d2            // (t_0; x_0) = u_0 v_0
+
+       // Arrange the pieces to form a double-precision polynomial.
+       veor    q2, q2, q3              // (m_1; m_0) = u_0 v_1 + u_1 v_0
+       veor    d17, d17, d4            // x_2 = t_1 + m_1
+       veor    d18, d18, d5            // x_1 = t_0 + m_0
+       // q8 =                         // (x_3; x_2)
+       // q9 =                         // (x_1; x_0)
+
+       // Two-and-a-half problems remain.  The first is that this product is
+       // shifted left by one place, which is annoying.  Let's take care of
+       // that now.  Once this is done, we'll be properly in GCM's backwards
+       // bit-ordering.
+       //
+       // The half a problem is that the result wants to have its 64-bit
+       // halves switched.  Here turns out to be the best place to arrange
+       // for that.
+       //
+       //                   q9                             q8
+       //      ,-------------.-------------.  ,-------------.-------------.
+       //      | 0  x_0-x_62 | x_63-x_126  |  | x_127-x_190 | x_191-x_254 |
+       //      `-------------^-------------'  `-------------^-------------'
+       //            d19           d18              d17           d16
+       //
+       // We start by shifting each 32-bit lane right (from GCM's point of
+       // view -- physically, left) by one place, which gives us this:
+       //
+       //                 low (q9)                      high (q8)
+       //      ,-------------.-------------.  ,-------------.-------------.
+       //      | x_0-x_62  0 |x_64-x_126 0 |  |x_128-x_190 0|x_192-x_254 0|
+       //      `-------------^-------------'  `-------------^-------------'
+       //            d19           d18              d17           d16
+       //
+       // but we've lost a bunch of bits.  We separately shift each lane
+       // left by 31 places to give us the bits we lost.
+       //
+       //                 low (q3)                      high (q2)
+       //      ,-------------.-------------.  ,-------------.-------------.
+       //      |    0...0    | 0...0  x_63 |  | 0...0 x_127 | 0...0 x_191 |
+       //      `-------------^-------------'  `-------------^-------------'
+       //                           d6              d5            d4
+       //
+       // Since we can address each of these pieces individually, putting
+       // them together is relatively straightforward.
+
+
+       vshr.u64 d6, d18, #63           // shifted left; just the carries
+       vshl.u64 q9, q9, #1             // shifted right, but dropped carries
+       vshr.u64 q2, q8, #63
+       vshl.u64 q8, q8, #1
+       vorr    d0, d19, d6             // y_0
+       vorr    d1, d18, d5             // y_1
+       vorr    d2, d17, d4             // y_2
+       vmov    d3, d16                 // y_3
+
+       // And the other one is that the result needs to be reduced modulo
+       // p(t) = t^128 + t^7 + t^2 + t + 1.  Let R = t^128 = t^7 + t^2 + t +
+       // 1 in our field.  So far, we've calculated z_0 and z_1 such that
+       // z_0 + z_1 R = u v using the identity R = t^128: now we must
+       // collapse the two halves of y together using the other identity R =
+       // t^7 + t^2 + t + 1.
+       //
+       // We do this by working on y_2 and y_3 separately, so consider y_i
+       // for i = 2 or 3.  Certainly, y_i t^{64i} = y_i R t^{64(i-2) =
+       // (t^7 + t^2 + t + 1) y_i t^{64(i-2)}, but we can't use that
+       // directly without breaking up the 64-bit word structure.  Instead,
+       // we start by considering just y_i t^7 t^{64(i-2)}, which again
+       // looks tricky.  Now, split y_i = a_i + t^57 b_i, with deg a_i < 57;
+       // then
+       //
+       //      y_i t^7 t^{64(i-2)} = a_i t^7 t^{64(i-2)} + b_i t^{64(i-1)}
+       //
+       // We can similarly decompose y_i t^2 and y_i t into a pair of 64-bit
+       // contributions to the t^{64(i-2)} and t^{64(i-1)} words, but the
+       // splits are different.  This is lovely, with one small snag: when
+       // we do this to y_3, we end up with a contribution back into the
+       // t^128 coefficient word.  But notice that only the low seven bits
+       // of this word are affected, so there's no knock-on contribution
+       // into the t^64 word.  Therefore, if we handle the high bits of each
+       // word together, and then the low bits, everything will be fine.
+
+       // First, shift the high bits down.
+       vshl.u64 q2, q1, #63            // the b_i for t
+       vshl.u64 q3, q1, #62            // the b_i for t^2
+       vshl.u64 q8, q1, #57            // the b_i for t^7
+       veor    q2, q2, q3              // add them all together
+       veor    q2, q2, q8
+       veor    d2, d2, d5              // contribution into low half
+       veor    d1, d1, d4              // and high half
+
+       // And then shift the low bits up.
+       vshr.u64 q2, q1, #1
+       vshr.u64 q3, q1, #2
+       vshr.u64 q8, q1, #7
+       veor    q0, q0, q1              // mix in the unit contribution
+       veor    q2, q2, q3              // t and t^2 contribs
+       veor    q0, q0, q8              // low, unit, and t^7 contribs
+       veor    q0, q0, q2              // mix them together and we're done
+.endm
+
+.macro mul64
+       // Enter with u and v in the low halves of d0 and d1 respectively;
+       // leave with z = u v in d0.  Clobbers d1--d5.
+
+       // The multiplication is thankfully easy.
+       vmull.p64 q0, d0, d1            // u v
+
+       // Shift the product up by one place, and swap the two halves.  After
+       // this, we're in GCM bizarro-world.
+       vshr.u64 d2, d0, #63            // shifted left; just the carries
+       vshl.u64 d3, d1, #1             // low half right
+       vshl.u64 d1, d0, #1             // high half shifted right
+       vorr    d0, d3, d2              // mix in the carries
+
+       // Now we must reduce.  This is essentially the same as the 128-bit
+       // case above, but mostly simpler because everything is smaller.  The
+       // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.
+
+       // First, shift the high bits down.
+       vshl.u64 d2, d1, #63            // b_i for t
+       vshl.u64 d3, d1, #61            // b_i for t^3
+       vshl.u64 d4, d1, #60            // b_i for t^4
+       veor    d2, d2, d3              // add them all together
+       veor    d2, d2, d4
+       veor    d1, d1, d2              // contribution back into high half
+
+       // And then shift the low bits up.
+       vshr.u64 d2, d1, #1
+       vshr.u64 d3, d1, #3
+       vshr.u64 d4, d1, #4
+       veor    d0, d0, d1              // mix in the unit contribution
+       veor    d2, d2, d3              // t and t^3 contribs
+       veor    d0, d0, d4              // low, unit, and t^4
+       veor    d0, d0, d2              // mix them together and we're done
+.endm
+
+.macro mul96
+       // Enter with u and v in the most-significant three words of q0 and
+       // q1 respectively, and zero in the low words, and zero in q15; leave
+       // with z = u v in the high three words of q0, and /junk/ in the low
+       // word.  Clobbers ???.
+
+       // This is an inconvenient size.  There's nothing for it but to do
+       // four multiplications, as if for the 128-bit case.  It's possible
+       // that there's cruft in the top 32 bits of the input registers, so
+       // shift both of them up by four bytes before we start.  This will
+       // mean that the high 64 bits of the result (from GCM's viewpoint)
+       // will be zero.
+       // q0 =                         // (u_0 + u_1 t^32; u_2)
+       // q1 =                         // (v_0 + v_1 t^32; v_2)
+       vmull.p64 q8, d1, d2            // u_2 (v_0 + v_1 t^32) = e_0
+       vmull.p64 q9, d0, d3            // v_2 (u_0 + u_1 t^32) = e_1
+       vmull.p64 q3, d1, d3            // u_2 v_2 t^64 = d = (0; d)
+       vmull.p64 q0, d0, d2            // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
+                                       //   + u_1 v_1 t^64 = f
+
+       // Extract the high and low halves of the 192-bit result.  The answer
+       // we want is d t^128 + e t^64 + f, where e = e_0 + e_1.  The low 96
+       // bits of the answer will end up in q0, and the high 96 bits will
+       // end up in q1; we'll need both of these to have zero in their
+       // bottom 32 bits.
+       //
+       // Here, bot(x) is the low 96 bits of a 192-bit quantity x, arranged
+       // in the low 96 bits of a SIMD register, with junk in the top 32
+       // bits; and top(x) is the high 96 bits, also arranged in the low 96
+       // bits of a register, with /zero/ in the top 32 bits.
+       veor    q8, q8, q9              // e_0 + e_1 = e
+       vshr128 q1, q3, 32              // top(d t^128)
+       vext.8  d19, d16, d17, #4       // top(e t^64)
+       vshl.u64 d16, d0, #32           // top(f), sort of
+       veor    d3, d3, d19             // q1 = top(d t^128 + e t^64)
+       veor    d0, d0, d17             // q0 = bot([d t^128] + e t^64 + f)
+       veor    d3, d3, d16             // q1 = top(d t^128 + e t^64 + f)
+
+       // Shift the product right by one place (from GCM's point of view),
+       // but, unusually, don't swap the halves, because we need to work on
+       // the 32-bit pieces later.  After this, we're in GCM bizarro-world.
+       // q0 =                         // (?, x_2; x_1, x_0)
+       // q1 =                         // (0, x_5; x_4, x_3)
+       vshr.u64 d4, d0, #63            // carry from d0 to d1
+       vshr.u64 d5, d2, #63            // carry from d2 to d3
+       vshr.u32 d6, d3, #31            // carry from d3 to d0
+       vshl.u64 q0, q0, #1             // shift low half
+       vshl.u64 q1, q1, #1             // shift high half
+       vorr    d1, d1, d4
+       vorr    d0, d0, d6
+       vorr    d3, d3, d5
+
+       // Finally, the reduction.  This is essentially the same as the
+       // 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
+       // t^9 + t^6 + 1.  The degrees are larger but not enough to cause
+       // trouble for the general approach.
+
+       // First, shift the high bits down.
+       vshl.u32 q2, q1, #26            // b_i for t^6
+       vshl.u32 q3, q1, #23            // b_i for t^9
+       vshl.u32 q8, q1, #22            // b_i for t^10
+       veor    q2, q2, q3              // add them all together
+       veor    q2, q2, q8
+       vshl128 q3, q2, 64              // contribution into high half
+       vshr128 q2, q2, 32              // and low half
+       veor    q1, q1, q3              // mix them in
+       veor    q0, q0, q2
+
+       // And then shift the low bits up.
+       vshr.u32 q2, q1, #6
+       vshr.u32 q3, q1, #9
+       veor    q0, q0, q1              // mix in the unit contribution
+       vshr.u32 q8, q1, #10
+       veor    q2, q2, q3              // mix together t^6 and t^9
+       veor    q0, q0, q8              // mix in t^10
+       veor    q0, q0, q2              // and the rest
+
+       // And finally swap the two halves.
+       vswp    d0, d1
+.endm
+
+.macro mul192
+       // Enter with u and v in d0--d2 and d3--d5 respectively; leave
+       // with z = u v in d0--d2.  Clobbers q8--q15.
+
+       // Start multiplying and accumulating pieces of product.
+       // (d0; d1; d2) =               // (u_0; u_1; u_2)
+       // (d3; d4; d5) =               // (v_0; v_1; v_2)
+       vmull.p64 q10, d0, d3           // e = u_0 v_0
+
+       vmull.p64 q12, d0, d4           //     u_0 v_1
+       vmull.p64 q13, d1, d3           //     u_1 v_0
+
+       vmull.p64 q9, d0, d5            //     u_0 v_2
+       vmull.p64 q14, d1, d4           //     u_1 v_1
+       vmull.p64 q15, d2, d3           //     u_2 v_0
+        veor   q12, q12, q13           // d = u_0 v_1 + u_1 v_0
+
+       vmull.p64 q11, d1, d5           //     u_1 v_2
+       vmull.p64 q13, d2, d4           //     u_2 v_1
+        veor   q9, q9, q14             //     u_0 v_2 + u_1 v_1
+
+       vmull.p64 q8, d2, d5            // a = u_2 v_2
+        veor   q9, q9, q15             // c = u_0 v_2 + u_1 v_1 + u_2 v_0
+        veor   q11, q11, q13           // b = u_1 v_2 + u_2 v_1
+
+       // Piece the product together.
+       veor    d17, d17, d22  //  q8 = // (x_5; x_4)
+       veor    d18, d18, d23
+       veor    d19, d19, d24  //  q9 = // (x_3; x_2)
+       veor    d20, d20, d25  // q10 = // (x_1; x_0)
+
+       // Shift the product right by one place (from GCM's point of view).
+       vshr.u64 q11, q8, #63           // carry from d16/d17 to d17/d18
+       vshr.u64 q12, q9, #63           // carry from d18/d19 to d19/d20
+       vshr.u64 d26, d20, #63          // carry from d20 to d21
+       vshl.u64 q8, q8, #1             // shift everything down
+       vshl.u64 q9, q9, #1
+       vshl.u64 q10, q10, #1
+       vorr    d17, d17, d22           // and mix in the carries
+       vorr    d18, d18, d23
+       vorr    d19, d19, d24
+       vorr    d20, d20, d25
+       vorr    d21, d21, d26
+
+       // Next, the reduction.  Our polynomial this time is p(x) = t^192 +
+       // t^7 + t^2 + t + 1.  Yes, the magic numbers are the same as the
+       // 128-bit case.  I don't know why.
+
+       // First, shift the high bits down.
+       // q8 =                         // (y_5; y_4)
+       // q9 =                         // (y_3; y_2)
+       // q10 =                        // (y_1; y_0)
+       vshl.u64 q11, q8, #63           // (y_5; y_4) b_i for t
+       vshl.u64 d28, d18, #63          // y_3 b_i for t
+       vshl.u64 q12, q8, #62           // (y_5; y_4) b_i for t^2
+       vshl.u64 d29, d18, #62          // y_3 b_i for t^2
+       vshl.u64 q13, q8, #57           // (y_5; y_4) b_i for t^7
+       vshl.u64 d30, d18, #57          // y_3 b_i for t^7
+       veor    q11, q11, q12           // mix them all together
+       veor    d28, d28, d29
+       veor    q11, q11, q13
+       veor    d28, d28, d30
+       veor    q9, q9, q11
+       veor    d20, d20, d28
+
+       // And finally shift the low bits up.  Also, switch the order of the
+       // pieces for output.
+       // q8 =                         // (y'_5; y'_4)
+       // q9 =                         // (y'_3; y'_2)
+       // q10 =                        // (y'_1; y'_0)
+       vshr.u64 q11, q8, #1            // (y_5; y_4) a_i for t
+       vshr.u64 d28, d18, #1           // y'_3 a_i for t
+       vshr.u64 q12, q8, #2            // (y_5; y_4) a_i for t^2
+       vshr.u64 d29, d18, #2           // y'_3 a_i for t^2
+       vshr.u64 q13, q8, #7            // (y_5; y_4) a_i for t^7
+       vshr.u64 d30, d18, #7           // y'_3 a_i for t^7
+       veor    q8, q8, q11
+       veor    d18, d18, d28
+       veor    q12, q12, q13
+       veor    d29, d29, d30
+       veor    q8, q8, q12
+       veor    d18, d18, d29
+       veor    d0, d21, d18
+       veor    d1, d20, d17
+       veor    d2, d19, d16
+.endm
+
+.macro mul256
+       // Enter with u and v in q0/q1 and q2/q3 respectively; leave
+       // with z = u v in q0/q1.  Clobbers q8--q15.
+
+       // Now it's starting to look worthwhile to do Karatsuba.  Suppose
+       // u = u_0 + u_1 B and v = v_0 + v_1 B.  Then
+       //
+       //      u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2
+       //
+       // Name these coefficients of B^i be a, b, and c, respectively, and
+       // let r = u_0 + u_1 and s = v_0 + v_1.  Then observe that
+       //
+       //      q = r s = (u_0 + u_1) (v_0 + v_1)
+       //        = (u_0 v_0) + (u1 v_1) + (u_0 v_1 + u_1 v_0)
+       //        = a + d + c
+       //
+       // The first two terms we've already calculated; the last is the
+       // remaining one we want.  We'll set B = t^128.  We know how to do
+       // 128-bit multiplications already, and Karatsuba is too annoying
+       // there, so there'll be 12 multiplications altogether, rather than
+       // the 16 we'd have if we did this the naïve way.
+       // q0 =                         // u_0 = (u_00; u_01)
+       // q1 =                         // u_1 = (u_10; u_11)
+       // q2 =                         // v_0 = (v_00; v_01)
+       // q3 =                         // v_1 = (v_10; v_11)
+
+       veor    q8, q0, q1              // u_* = (u_00 + u_10; u_01 + u_11)
+       veor    q9, q2, q3              // v_* = (v_00 + v_10; v_01 + v_11)
+
+       // Start by building the cross product, q = u_* v_*.
+       vmull.p64 q14, d16, d19         // u_*0 v_*1
+       vmull.p64 q15, d17, d18         // u_*1 v_*0
+       vmull.p64 q12, d17, d19         // u_*1 v_*1
+       vmull.p64 q13, d16, d18         // u_*0 v_*0
+       veor    q14, q14, q15           // u_*0 v_*1 + u_*1 v_*0
+       veor    d25, d25, d28  // q12 = // q_1
+       veor    d26, d26, d29  // q13 = // q_0
+
+       // Next, work on the low half, a = u_0 v_0.
+       vmull.p64 q14, d0, d5           // u_00 v_01
+       vmull.p64 q15, d1, d4           // u_01 v_00
+       vmull.p64 q10, d1, d5           // u_01 v_01
+       vmull.p64 q11, d0, d4           // u_00 v_00
+       veor    q14, q14, q15           // u_00 v_01 + u_01 v_00
+       veor    d21, d21, d28  // q10 = // a_1
+       veor    d22, d22, d29  // q11 = // a_0
+
+       // Mix the pieces we have so far.
+       veor    q12, q12, q10
+       veor    q13, q13, q11
+
+       // Finally, the high half, c = u_1 v_1.
+       vmull.p64 q14, d2, d7           // u_10 v_11
+       vmull.p64 q15, d3, d6           // u_11 v_10
+       vmull.p64 q8, d3, d7            // u_11 v_11
+       vmull.p64 q9, d2, d6            // u_10 v_10
+       veor    q14, q14, q15           // u_10 v_11 + u_11 v_10
+       veor    d17, d17, d28  //  q8 = // c_1
+       veor    d18, d18, d29  //  q9 = // c_0
+
+       // Finish mixing the product together.
+       veor    q12, q12, q8   // q12 = // b_1
+       veor    q13, q13, q9   // q13 = // b_0
+       veor    q9, q9, q12
+       veor    q10, q10, q13
+
+       // Shift the product right by one place (from GCM's point of view).
+       vshr.u64 q0, q8, #63            // carry from d16/d17 to d17/d18
+       vshr.u64 q1, q9, #63            // carry from d18/d19 to d19/d20
+       vshr.u64 q2, q10, #63           // carry from d20/d21 to d21/d22
+       vshr.u64 d6, d22, #63           // carry from d22 to d23
+       vshl.u64 q8, q8, #1             // shift everyting down
+       vshl.u64 q9, q9, #1
+       vshl.u64 q10, q10, #1
+       vshl.u64 q11, q11, #1
+       vorr    d17, d17, d0
+       vorr    d18, d18, d1
+       vorr    d19, d19, d2
+       vorr    d20, d20, d3
+       vorr    d21, d21, d4
+       vorr    d22, d22, d5
+       vorr    d23, d23, d6
+
+       // Now we must reduce.  This is essentially the same as the 192-bit
+       // case above, but more complicated because everything is bigger.
+       // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
+
+       // First, shift the high bits down.
+       // q8 =                         // (y_7; y_6)
+       // q9 =                         // (y_5; y_4)
+       // q10 =                        // (y_3; y_2)
+       // q11 =                        // (y_1; y_0)
+       vshl.u64 q0, q8, #62            // (y_7; y_6) b_i for t^2
+       vshl.u64 q12, q9, #62           // (y_5; y_4) b_i for t^2
+       vshl.u64 q1, q8, #59            // (y_7; y_6) b_i for t^5
+       vshl.u64 q13, q9, #59           // (y_5; y_4) b_i for t^5
+       vshl.u64 q2, q8, #54            // (y_7; y_6) b_i for t^10
+       vshl.u64 q14, q9, #54           // (y_5; y_4) b_i for t^10
+       veor    q0, q0, q1              // mix the contributions together
+       veor    q12, q12, q13
+       veor    q0, q0, q2
+       veor    q12, q12, q14
+       veor    d19, d19, d0            // and combine into the lower pieces
+       veor    d20, d20, d1
+       veor    d21, d21, d24
+       veor    d22, d22, d25
+
+       // And then shift the low bits up.  Also, switch the order of the
+       // pieces for output.
+       // q8 =                         // (y'_7; y'_6)
+       // q9 =                         // (y'_5; y'_4)
+       // q10 =                        // (y'_3; y'_2)
+       // q11 =                        // (y'_1; y'_0)
+       vshr.u64 q0, q8, #2             // (y_7; y_6) a_i for t^2
+       vshr.u64 q12, q9, #2            // (y_5; y'_4) a_i for t^2
+       vshr.u64 q1, q8, #5             // (y_7; y_6) a_i for t^5
+       vshr.u64 q13, q9, #5            // (y_5; y_4) a_i for t^5
+       vshr.u64 q2, q8, #10            // (y_7; y_6) a_i for t^10
+       vshr.u64 q14, q9, #10           // (y_5; y_4) a_i for t^10
+
+       veor    q8, q8, q0              // mix the contributions together
+       veor    q1, q1, q2
+       veor    q9, q9, q12
+       veor    q13, q13, q14
+       veor    q8, q8, q1
+       veor    q9, q9, q13
+       veor    d3, d20, d16            // and output
+       veor    d2, d21, d17
+       veor    d1, d22, d18
+       veor    d0, d23, d19
+.endm
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+// There are a number of representations of field elements in this code and
+// it can be confusing.
+//
+//   * The `external format' consists of a sequence of contiguous bytes in
+//     memory called a `block'.  The GCM spec explains how to interpret this
+//     block as an element of a finite field.  As discussed extensively, this
+//     representation is very annoying for a number of reasons.  On the other
+//     hand, this code never actually deals with it directly.
+//
+//   * The `register format' consists of one or more NEON registers,
+//     depending on the block size.  The bytes in each 64-bit lane of these
+//     registers are in reverse order, compared to the external format.
+//
+//   * The `words' format consists of a sequence of bytes, as in the
+//     `external format', but, according to the blockcipher in use, the bytes
+//     within each 32-bit word may be reversed (`big-endian') or not
+//     (`little-endian').  Accordingly, there are separate entry points for
+//     each variant, identified with `b' or `l'.
+
+FUNC(gcm_mulk_128b_arm_crypto)
+       // On entry, r0 points to a 128-bit field element A in big-endian
+       // words format; r1 points to a field-element K in register format.
+       // On exit, A is updated with the product A K.
+
+       vld1.8  {q0}, [r0]
+       vld1.8  {q1}, [r1]
+       vrev64.32 q0, q0
+       mul128
+       vrev64.32 q0, q0
+       vst1.8  {q0}, [r0]
+       bx      r14
+ENDFUNC
+
+FUNC(gcm_mulk_128l_arm_crypto)
+       // On entry, r0 points to a 128-bit field element A in little-endian
+       // words format; r1 points to a field-element K in register format.
+       // On exit, A is updated with the product A K.
+
+       vld1.8  {q0}, [r0]
+       vld1.8  {q1}, [r1]
+       vrev64.8 q0, q0
+       mul128
+       vrev64.8 q0, q0
+       vst1.8  {q0}, [r0]
+       bx      r14
+ENDFUNC
+
+FUNC(gcm_mulk_64b_arm_crypto)
+       // On entry, r0 points to a 64-bit field element A in big-endian
+       // words format; r1 points to a field-element K in register format.
+       // On exit, A is updated with the product A K.
+
+       vld1.8  {d0}, [r0]
+       vld1.8  {d1}, [r1]
+       vrev64.32 d0, d0
+       mul64
+       vrev64.32 d0, d0
+       vst1.8  {d0}, [r0]
+       bx      r14
+ENDFUNC
+
+FUNC(gcm_mulk_64l_arm_crypto)
+       // On entry, r0 points to a 64-bit field element A in little-endian
+       // words format; r1 points to a field-element K in register format.
+       // On exit, A is updated with the product A K.
+
+       vld1.8  {d0}, [r0]
+       vld1.8  {d1}, [r1]
+       vrev64.8 d0, d0
+       vzero
+       mul64
+       vrev64.8 d0, d0
+       vst1.8  {d0}, [r0]
+       bx      r14
+ENDFUNC
+
+FUNC(gcm_mulk_96b_arm_crypto)
+       // On entry, r0 points to a 96-bit field element A in big-endian
+       // words format; r1 points to a field-element K in register format.
+       // On exit, A is updated with the product A K.
+
+       ldr     r3, [r0, #8]
+       mov     r12, #0
+       vld1.8  {d0}, [r0]
+       vld1.8  {q1}, [r1]
+       vrev64.32 d0, d0
+       vmov    d1, r12, r3
+       vzero
+       mul96
+       vrev64.32 d0, d0
+       vmov    r3, d1[1]
+       vst1.8  {d0}, [r0]
+       str     r3, [r0, #8]
+       bx      r14
+ENDFUNC
+
+FUNC(gcm_mulk_96l_arm_crypto)
+       // On entry, r0 points to a 128-bit field element A in little-endian
+       // words format; r1 points to a field-element K in register format.
+       // On exit, A is updated with the product A K.
+
+       ldr     r3, [r0, #8]
+       mov     r12, #0
+       vld1.8  {d0}, [r0]
+       vld1.8  {q1}, [r1]
+       vmov    d1, r3, r12
+       vrev64.8 q0, q0
+       mul96
+       vrev64.8 q0, q0
+       vmov    r3, d1[0]
+       vst1.8  {d0}, [r0]
+       str     r3, [r0, #8]
+       bx      r14
+ENDFUNC
+
+FUNC(gcm_mulk_192b_arm_crypto)
+       // On entry, r0 points to a 192-bit field element A in big-endian
+       // words format; r1 points to a field-element K in register format.
+       // On exit, A is updated with the product A K.
+
+       vld1.8  {d0-d2}, [r0]
+       vld1.8  {d3-d5}, [r1]
+       vrev64.32 q0, q0
+       vrev64.32 d2, d2
+       mul192
+       vrev64.32 q0, q0
+       vrev64.32 d2, d2
+       vst1.8  {d0-d2}, [r0]
+       bx      r14
+ENDFUNC
+
+FUNC(gcm_mulk_192l_arm_crypto)
+       // On entry, r0 points to a 192-bit field element A in little-endian
+       // words format; r1 points to a field-element K in register format.
+       // On exit, A is updated with the product A K.
+
+       vld1.8  {d0-d2}, [r0]
+       vld1.8  {d3-d5}, [r1]
+       vrev64.8 q0, q0
+       vrev64.8 d2, d2
+       mul192
+       vrev64.8 q0, q0
+       vrev64.8 d2, d2
+       vst1.8  {d0-d2}, [r0]
+       bx      r14
+ENDFUNC
+
+FUNC(gcm_mulk_256b_arm_crypto)
+       // On entry, r0 points to a 256-bit field element A in big-endian
+       // words format; r1 points to a field-element K in register format.
+       // On exit, A is updated with the product A K.
+
+       vld1.8  {q0, q1}, [r0]
+       vld1.8  {q2, q3}, [r1]
+       vrev64.32 q0, q0
+       vrev64.32 q1, q1
+       mul256
+       vrev64.32 q0, q0
+       vrev64.32 q1, q1
+       vst1.8  {q0, q1}, [r0]
+       bx      r14
+ENDFUNC
+
+FUNC(gcm_mulk_256l_arm_crypto)
+       // On entry, r0 points to a 256-bit field element A in little-endian
+       // words format; r1 points to a field-element K in register format.
+       // On exit, A is updated with the product A K.
+
+       vld1.8  {q0, q1}, [r0]
+       vld1.8  {q2, q3}, [r1]
+       vrev64.8 q0, q0
+       vrev64.8 q1, q1
+       mul256
+       vrev64.8 q0, q0
+       vrev64.8 q1, q1
+       vst1.8  {q0, q1}, [r0]
+       bx      r14
+ENDFUNC
+
+///----- That's all, folks --------------------------------------------------
diff --git a/symm/gcm-arm64-pmull.S b/symm/gcm-arm64-pmull.S

new file mode 100644 (file)

index 0000000..97bb3bf
--- /dev/null
+++ b/symm/gcm-arm64-pmull.S
@@ -0,0 +1,661 @@
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// GCM acceleration for ARM64 processors
+///
+/// (c) 2019 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software: you can redistribute it and/or modify it
+/// under the terms of the GNU Library General Public License as published
+/// by the Free Software Foundation; either version 2 of the License, or
+/// (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful, but
+/// WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+/// Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb.  If not, write to the Free Software
+/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+/// USA.
+
+///--------------------------------------------------------------------------
+/// Preliminaries.
+
+#include "config.h"
+#include "asm-common.h"
+
+       .arch   armv8-a+crypto
+
+       .text
+
+///--------------------------------------------------------------------------
+/// Multiplication macros.
+
+       // The good news is that we have a fancy instruction to do the
+       // multiplications.  The bad news is that it's not particularly well-
+       // suited to the job.
+       //
+       // For one thing, it only does a 64-bit multiplication, so in general
+       // we'll need to synthesize the full-width multiply by hand.  For
+       // another thing, it doesn't help with the reduction, so we have to
+       // do that by hand too.  And, finally, GCM has crazy bit ordering,
+       // and the instruction does nothing useful for that at all.
+       //
+       // Focusing on that last problem first: the bits aren't in monotonic
+       // significance order unless we permute them.  Fortunately, ARM64 has
+       // an instruction which will just permute the bits in each byte for
+       // us, so we don't have to worry about this very much.
+       //
+       // Our main weapons, the `pmull' and `pmull2' instructions, work on
+       // 64-bit operands, in half of a vector register, and produce 128-bit
+       // results.  But neither of them will multiply the high half of one
+       // vector by the low half of a second one, so we have a problem,
+       // which we solve by representing one of the operands redundantly:
+       // rather than packing the 64-bit pieces together, we duplicate each
+       // 64-bit piece across both halves of a register.
+       //
+       // The commentary for `mul128' is the most detailed.  The other
+       // macros assume that you've already read and understood that.
+
+.macro mul128
+       // Enter with u and v in v0 and v1/v2 respectively, and 0 in v31;
+       // leave with z = u v in v0.  Clobbers v1--v6.
+
+       // First for the double-precision multiplication.  It's tempting to
+       // use Karatsuba's identity here, but I suspect that loses more in
+       // the shifting, bit-twiddling, and dependency chains that it gains
+       // in saving a multiplication which otherwise pipelines well.
+       // v0 =                         // (u_0; u_1)
+       // v1/v2 =                      // (v_0; v_1)
+       pmull2  v3.1q, v0.2d, v1.2d     // u_1 v_0
+       pmull   v4.1q, v0.1d, v2.1d     // u_0 v_1
+       pmull2  v5.1q, v0.2d, v2.2d     // (t_1; x_3) = u_1 v_1
+       pmull   v6.1q, v0.1d, v1.1d     // (x_0; t_0) = u_0 v_0
+
+       // Arrange the pieces to form a double-precision polynomial.
+       eor     v3.16b, v3.16b, v4.16b  // (m_0; m_1) = u_0 v_1 + u_1 v_0
+       vshr128 v4, v3, 64              // (m_1; 0)
+       vshl128 v3, v3, 64              // (0; m_0)
+       eor     v1.16b, v5.16b, v4.16b  // (x_2; x_3)
+       eor     v0.16b, v6.16b, v3.16b  // (x_0; x_1)
+
+       // And now the only remaining difficulty is that the result needs to
+       // be reduced modulo p(t) = t^128 + t^7 + t^2 + t + 1.  Let R = t^128
+       // = t^7 + t^2 + t + 1 in our field.  So far, we've calculated z_0
+       // and z_1 such that z_0 + z_1 R = u v using the identity R = t^128:
+       // now we must collapse the two halves of y together using the other
+       // identity R = t^7 + t^2 + t + 1.
+       //
+       // We do this by working on y_2 and y_3 separately, so consider y_i
+       // for i = 2 or 3.  Certainly, y_i t^{64i} = y_i R t^{64(i-2) =
+       // (t^7 + t^2 + t + 1) y_i t^{64(i-2)}, but we can't use that
+       // directly without breaking up the 64-bit word structure.  Instead,
+       // we start by considering just y_i t^7 t^{64(i-2)}, which again
+       // looks tricky.  Now, split y_i = a_i + t^57 b_i, with deg a_i < 57;
+       // then
+       //
+       //      y_i t^7 t^{64(i-2)} = a_i t^7 t^{64(i-2)} + b_i t^{64(i-1)}
+       //
+       // We can similarly decompose y_i t^2 and y_i t into a pair of 64-bit
+       // contributions to the t^{64(i-2)} and t^{64(i-1)} words, but the
+       // splits are different.  This is lovely, with one small snag: when
+       // we do this to y_3, we end up with a contribution back into the
+       // t^128 coefficient word.  But notice that only the low seven bits
+       // of this word are affected, so there's no knock-on contribution
+       // into the t^64 word.  Therefore, if we handle the high bits of each
+       // word together, and then the low bits, everything will be fine.
+
+       // First, shift the high bits down.
+       ushr    v2.2d, v1.2d, #63       // the b_i for t
+       ushr    v3.2d, v1.2d, #62       // the b_i for t^2
+       ushr    v4.2d, v1.2d, #57       // the b_i for t^7
+       eor     v2.16b, v2.16b, v3.16b  // add them all together
+       eor     v2.16b, v2.16b, v4.16b
+       vshr128 v3, v2, 64
+       vshl128 v4, v2, 64
+       eor     v1.16b, v1.16b, v3.16b  // contribution into high half
+       eor     v0.16b, v0.16b, v4.16b  // and low half
+
+       // And then shift the low bits up.
+       shl     v2.2d, v1.2d, #1
+       shl     v3.2d, v1.2d, #2
+       shl     v4.2d, v1.2d, #7
+       eor     v1.16b, v1.16b, v2.16b  // unit and t contribs
+       eor     v3.16b, v3.16b, v4.16b  // t^2 and t^7 contribs
+       eor     v0.16b, v0.16b, v1.16b  // mix everything together
+       eor     v0.16b, v0.16b, v3.16b  // ... and we're done
+.endm
+
+.macro mul64
+       // Enter with u and v in the low halves of v0 and v1, respectively;
+       // leave with z = u v in x2.  Clobbers x2--x4.
+
+       // The multiplication is thankfully easy.
+       // v0 =                                 // (u; ?)
+       // v1 =                                 // (v; ?)
+       pmull   v0.1q, v0.1d, v1.1d             // u v
+
+       // Now we must reduce.  This is essentially the same as the 128-bit
+       // case above, but mostly simpler because everything is smaller.  The
+       // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.
+
+       // Before we get stuck in, transfer the product to general-purpose
+       // registers.
+       mov     x3, v0.d[1]
+       mov     x2, v0.d[0]
+
+       // First, shift the high bits down.
+       eor     x4, x3, x3, lsr #1      // pre-mix t^3 and t^4
+       eor     x3, x3, x3, lsr #63     // mix in t contribution
+       eor     x3, x3, x4, lsr #60     // shift and mix in t^3 and t^4
+
+       // And then shift the low bits up.
+       eor     x3, x3, x3, lsl #1      // mix unit and t; pre-mix t^3, t^4
+       eor     x2, x2, x3              // fold them in
+       eor     x2, x2, x3, lsl #3      // and t^3 and t^4
+.endm
+
+.macro mul96
+       // Enter with u in the least-significant 96 bits of v0, with zero in
+       // the upper 32 bits, and with the least-significant 64 bits of v in
+       // both halves of v1, and the upper 32 bits of v in the low 32 bits
+       // of each half of v2, with zero in the upper 32 bits; and with zero
+       // in v31.  Yes, that's a bit hairy.  Leave with the product u v in
+       // the low 96 bits of v0, and /junk/ in the high 32 bits.  Clobbers
+       // v1--v6.
+
+       // This is an inconvenient size.  There's nothing for it but to do
+       // four multiplications, as if for the 128-bit case.  It's possible
+       // that there's cruft in the top 32 bits of the input registers, so
+       // shift both of them up by four bytes before we start.  This will
+       // mean that the high 64 bits of the result (from GCM's viewpoint)
+       // will be zero.
+       // v0 =                         // (u_0 + u_1 t^32; u_2)
+       // v1 =                         // (v_0 + v_1 t^32; v_0 + v_1 t^32)
+       // v2 =                         // (v_2; v_2)
+       pmull2  v5.1q, v0.2d, v1.2d     // u_2 (v_0 + v_1 t^32) t^32 = e_0
+       pmull   v4.1q, v0.1d, v2.1d     // v_2 (u_0 + u_1 t^32) t^32 = e_1
+       pmull2  v6.1q, v0.2d, v2.2d     // u_2 v_2 = d = (d; 0)
+       pmull   v3.1q, v0.1d, v1.1d     // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
+                                       //   + u_1 v_1 t^64 = f
+
+       // Extract the high and low halves of the 192-bit result.  The answer
+       // we want is d t^128 + e t^64 + f, where e = e_0 + e_1.  The low 96
+       // bits of the answer will end up in v0, with junk in the top 32
+       // bits; the high 96 bits will end up in v1, which must have zero in
+       // its top 32 bits.
+       //
+       // Here, bot(x) is the low 96 bits of a 192-bit quantity x, arranged
+       // in the low 96 bits of a SIMD register, with junk in the top 32
+       // bits; and top(x) is the high 96 bits, also arranged in the low 96
+       // bits of a register, with /zero/ in the top 32 bits.
+       eor     v4.16b, v4.16b, v5.16b  // e_0 + e_1 = e
+       vshl128 v6, v6, 32              // top(d t^128)
+       vshr128 v5, v4, 32              // top(e t^64)
+       vshl128 v4, v4, 64              // bot(e t^64)
+       vshr128 v1, v3, 96              // top(f)
+       eor     v6.16b, v6.16b, v5.16b  // top(d t^128 + e t^64)
+       eor     v0.16b, v3.16b, v4.16b  // bot([d t^128] + e t^64 + f)
+       eor     v1.16b, v1.16b, v6.16b  // top(e t^64 + d t^128 + f)
+
+       // Finally, the reduction.  This is essentially the same as the
+       // 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
+       // t^9 + t^6 + 1.  The degrees are larger but not enough to cause
+       // trouble for the general approach.  Unfortunately, we have to do
+       // this in 32-bit pieces rather than 64.
+
+       // First, shift the high bits down.
+       ushr    v2.4s, v1.4s, #26       // the b_i for t^6
+       ushr    v3.4s, v1.4s, #23       // the b_i for t^9
+       ushr    v4.4s, v1.4s, #22       // the b_i for t^10
+       eor     v2.16b, v2.16b, v3.16b  // add them all together
+       eor     v2.16b, v2.16b, v4.16b
+       vshr128 v3, v2, 64              // contribution for high half
+       vshl128 v2, v2, 32              // contribution for low half
+       eor     v1.16b, v1.16b, v3.16b  // apply to high half
+       eor     v0.16b, v0.16b, v2.16b  // and low half
+
+       // And then shift the low bits up.
+       shl     v2.4s, v1.4s, #6
+       shl     v3.4s, v1.4s, #9
+       shl     v4.4s, v1.4s, #10
+       eor     v1.16b, v1.16b, v2.16b  // unit and t^6 contribs
+       eor     v3.16b, v3.16b, v4.16b  // t^9 and t^10 contribs
+       eor     v0.16b, v0.16b, v1.16b  // mix everything together
+       eor     v0.16b, v0.16b, v3.16b  // ... and we're done
+.endm
+
+.macro mul192
+       // Enter with u in v0 and the less-significant half of v1, with v
+       // duplicated across both halves of v2/v3/v4, and with zero in v31.
+       // Leave with the product u v in v0 and the bottom half of v1.
+       // Clobbers v16--v25.
+
+       // Start multiplying and accumulating pieces of product.
+       // v0 =                         // (u_0; u_1)
+       // v1 =                         // (u_2; ?)
+       // v2 =                         // (v_0; v_0)
+       // v3 =                         // (v_1; v_1)
+       // v4 =                         // (v_2; v_2)
+       pmull   v16.1q, v0.1d, v2.1d    //   a = u_0 v_0
+
+       pmull   v19.1q, v0.1d, v3.1d    //       u_0 v_1
+       pmull2  v21.1q, v0.2d, v2.2d    //       u_1 v_0
+
+       pmull   v17.1q, v0.1d, v4.1d    //       u_0 v_2
+       pmull2  v22.1q, v0.2d, v3.2d    //       u_1 v_1
+       pmull   v23.1q, v1.1d, v2.1d    //       u_2 v_0
+        eor    v19.16b, v19.16b, v21.16b // b = u_0 v_1 + u_1 v_0
+
+       pmull2  v20.1q, v0.2d, v4.2d    //       u_1 v_2
+       pmull   v24.1q, v1.1d, v3.1d    //       u_2 v_1
+        eor    v17.16b, v17.16b, v22.16b //     u_0 v_2 + u_1 v_1
+
+       pmull   v18.1q, v1.1d, v4.1d    //   e = u_2 v_2
+        eor    v17.16b, v17.16b, v23.16b // c = u_0 v_2 + u_1 v_1 + u_2 v_1
+        eor    v20.16b, v20.16b, v24.16b // d = u_1 v_2 + u_2 v_1
+
+       // Piece the product together.
+       // v16 =                        // (a_0; a_1)
+       // v19 =                        // (b_0; b_1)
+       // v17 =                        // (c_0; c_1)
+       // v20 =                        // (d_0; d_1)
+       // v18 =                        // (e_0; e_1)
+       vshl128 v21, v19, 64            // (0; b_0)
+       ext     v22.16b, v19.16b, v20.16b, #8 // (b_1; d_0)
+       vshr128 v23, v20, 64            // (d_1; 0)
+       eor     v16.16b, v16.16b, v21.16b // (x_0; x_1)
+       eor     v17.16b, v17.16b, v22.16b // (x_2; x_3)
+       eor     v18.16b, v18.16b, v23.16b // (x_2; x_3)
+
+       // Next, the reduction.  Our polynomial this time is p(x) = t^192 +
+       // t^7 + t^2 + t + 1.  Yes, the magic numbers are the same as the
+       // 128-bit case.  I don't know why.
+
+       // First, shift the high bits down.
+       // v16 =                        // (y_0; y_1)
+       // v17 =                        // (y_2; y_3)
+       // v18 =                        // (y_4; y_5)
+       mov     v19.d[0], v17.d[1]      // (y_3; ?)
+
+       ushr    v23.2d, v18.2d, #63     // hi b_i for t
+       ushr    d20, d19, #63           // lo b_i for t
+       ushr    v24.2d, v18.2d, #62     // hi b_i for t^2
+       ushr    d21, d19, #62           // lo b_i for t^2
+       ushr    v25.2d, v18.2d, #57     // hi b_i for t^7
+       ushr    d22, d19, #57           // lo b_i for t^7
+       eor     v23.16b, v23.16b, v24.16b // mix them all together
+       eor     v20.8b, v20.8b, v21.8b
+       eor     v23.16b, v23.16b, v25.16b
+       eor     v20.8b, v20.8b, v22.8b
+
+       // Permute the high pieces while we fold in the b_i.
+       eor     v17.16b, v17.16b, v23.16b
+       vshl128 v20, v20, 64
+       mov     v19.d[0], v18.d[1]      // (y_5; ?)
+       ext     v18.16b, v17.16b, v18.16b, #8 // (y_3; y_4)
+       eor     v16.16b, v16.16b, v20.16b
+
+       // And finally shift the low bits up.
+       // v16 =                        // (y'_0; y'_1)
+       // v17 =                        // (y'_2; ?)
+       // v18 =                        // (y'_3; y'_4)
+       // v19 =                        // (y'_5; ?)
+       shl     v20.2d, v18.2d, #1
+       shl     d23, d19, #1
+       shl     v21.2d, v18.2d, #2
+       shl     d24, d19, #2
+       shl     v22.2d, v18.2d, #7
+       shl     d25, d19, #7
+       eor     v18.16b, v18.16b, v20.16b // unit and t contribs
+       eor     v19.8b, v19.8b, v23.8b
+       eor     v21.16b, v21.16b, v22.16b // t^2 and t^7 contribs
+       eor     v24.8b, v24.8b, v25.8b
+       eor     v18.16b, v18.16b, v21.16b // all contribs
+       eor     v19.8b, v19.8b, v24.8b
+       eor     v0.16b, v16.16b, v18.16b // mix them into the low half
+       eor     v1.8b, v17.8b, v19.8b
+.endm
+
+.macro mul256
+       // Enter with u in v0/v1, with v duplicated across both halves of
+       // v2--v5, and with zero in v31.  Leave with the product u v in
+       // v0/v1.  Clobbers ???.
+
+       // Now it's starting to look worthwhile to do Karatsuba.  Suppose
+       // u = u_0 + u_1 B and v = v_0 + v_1 B.  Then
+       //
+       //      u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2
+       //
+       // Name these coefficients of B^i be a, b, and c, respectively, and
+       // let r = u_0 + u_1 and s = v_0 + v_1.  Then observe that
+       //
+       //      q = r s = (u_0 + u_1) (v_0 + v_1)
+       //        = (u_0 v_0) + (u1 v_1) + (u_0 v_1 + u_1 v_0)
+       //        = a + d + c
+       //
+       // The first two terms we've already calculated; the last is the
+       // remaining one we want.  We'll set B = t^128.  We know how to do
+       // 128-bit multiplications already, and Karatsuba is too annoying
+       // there, so there'll be 12 multiplications altogether, rather than
+       // the 16 we'd have if we did this the naïve way.
+       // v0 =                         // u_0 = (u_00; u_01)
+       // v1 =                         // u_1 = (u_10; u_11)
+       // v2 =                         // (v_00; v_00)
+       // v3 =                         // (v_01; v_01)
+       // v4 =                         // (v_10; v_10)
+       // v5 =                         // (v_11; v_11)
+
+       eor     v28.16b, v0.16b, v1.16b // u_* = (u_00 + u_10; u_01 + u_11)
+       eor     v29.16b, v2.16b, v4.16b // v_*0 = v_00 + v_10
+       eor     v30.16b, v3.16b, v5.16b // v_*1 = v_01 + v_11
+
+       // Start by building the cross product, q = u_* v_*.
+       pmull   v24.1q, v28.1d, v30.1d  // u_*0 v_*1
+       pmull2  v25.1q, v28.2d, v29.2d  // u_*1 v_*0
+       pmull   v20.1q, v28.1d, v29.1d  // u_*0 v_*0
+       pmull2  v21.1q, v28.2d, v30.2d  // u_*1 v_*1
+       eor     v24.16b, v24.16b, v25.16b // u_*0 v_*1 + u_*1 v_*0
+       vshr128 v25, v24, 64
+       vshl128 v24, v24, 64
+       eor     v20.16b, v20.16b, v24.16b // q_0
+       eor     v21.16b, v21.16b, v25.16b // q_1
+
+       // Next, work on the low half, a = u_0 v_0
+       pmull   v24.1q, v0.1d, v3.1d    // u_00 v_01
+       pmull2  v25.1q, v0.2d, v2.2d    // u_01 v_00
+       pmull   v16.1q, v0.1d, v2.1d    // u_00 v_00
+       pmull2  v17.1q, v0.2d, v3.2d    // u_01 v_01
+       eor     v24.16b, v24.16b, v25.16b // u_00 v_01 + u_01 v_00
+       vshr128 v25, v24, 64
+       vshl128 v24, v24, 64
+       eor     v16.16b, v16.16b, v24.16b // a_0
+       eor     v17.16b, v17.16b, v25.16b // a_1
+
+       // Mix the pieces we have so far.
+       eor     v20.16b, v20.16b, v16.16b
+       eor     v21.16b, v21.16b, v17.16b
+
+       // Finally, work on the high half, c = u_1 v_1
+       pmull   v24.1q, v1.1d, v5.1d    // u_10 v_11
+       pmull2  v25.1q, v1.2d, v4.2d    // u_11 v_10
+       pmull   v18.1q, v1.1d, v4.1d    // u_10 v_10
+       pmull2  v19.1q, v1.2d, v5.2d    // u_11 v_11
+       eor     v24.16b, v24.16b, v25.16b // u_10 v_11 + u_11 v_10
+       vshr128 v25, v24, 64
+       vshl128 v24, v24, 64
+       eor     v18.16b, v18.16b, v24.16b // c_0
+       eor     v19.16b, v19.16b, v25.16b // c_1
+
+       // Finish mixing the product together.
+       eor     v20.16b, v20.16b, v18.16b
+       eor     v21.16b, v21.16b, v19.16b
+       eor     v17.16b, v17.16b, v20.16b
+       eor     v18.16b, v18.16b, v21.16b
+
+       // Now we must reduce.  This is essentially the same as the 192-bit
+       // case above, but more complicated because everything is bigger.
+       // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
+       // v16 =                        // (y_0; y_1)
+       // v17 =                        // (y_2; y_3)
+       // v18 =                        // (y_4; y_5)
+       // v19 =                        // (y_6; y_7)
+       ushr    v24.2d, v18.2d, #62     // (y_4; y_5) b_i for t^2
+       ushr    v25.2d, v19.2d, #62     // (y_6; y_7) b_i for t^2
+       ushr    v26.2d, v18.2d, #59     // (y_4; y_5) b_i for t^5
+       ushr    v27.2d, v19.2d, #59     // (y_6; y_7) b_i for t^5
+       ushr    v28.2d, v18.2d, #54     // (y_4; y_5) b_i for t^10
+       ushr    v29.2d, v19.2d, #54     // (y_6; y_7) b_i for t^10
+       eor     v24.16b, v24.16b, v26.16b // mix the contributions together
+       eor     v25.16b, v25.16b, v27.16b
+       eor     v24.16b, v24.16b, v28.16b
+       eor     v25.16b, v25.16b, v29.16b
+       vshr128 v26, v25, 64            // slide contribs into position
+       ext     v25.16b, v24.16b, v25.16b, #8
+       vshl128 v24, v24, 64
+       eor     v18.16b, v18.16b, v26.16b
+       eor     v17.16b, v17.16b, v25.16b
+       eor     v16.16b, v16.16b, v24.16b
+
+       // And then shift the low bits up.
+       // v16 =                        // (y'_0; y'_1)
+       // v17 =                        // (y'_2; y'_3)
+       // v18 =                        // (y'_4; y'_5)
+       // v19 =                        // (y'_6; y'_7)
+       shl     v24.2d, v18.2d, #2      // (y'_4; y_5) a_i for t^2
+       shl     v25.2d, v19.2d, #2      // (y_6; y_7) a_i for t^2
+       shl     v26.2d, v18.2d, #5      // (y'_4; y_5) a_i for t^5
+       shl     v27.2d, v19.2d, #5      // (y_6; y_7) a_i for t^5
+       shl     v28.2d, v18.2d, #10     // (y'_4; y_5) a_i for t^10
+       shl     v29.2d, v19.2d, #10     // (y_6; y_7) a_i for t^10
+       eor     v18.16b, v18.16b, v24.16b // mix the contributions together
+       eor     v19.16b, v19.16b, v25.16b
+       eor     v26.16b, v26.16b, v28.16b
+       eor     v27.16b, v27.16b, v29.16b
+       eor     v18.16b, v18.16b, v26.16b
+       eor     v19.16b, v19.16b, v27.16b
+       eor     v0.16b, v16.16b, v18.16b
+       eor     v1.16b, v17.16b, v19.16b
+.endm
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+// There are a number of representations of field elements in this code and
+// it can be confusing.
+//
+//   * The `external format' consists of a sequence of contiguous bytes in
+//     memory called a `block'.  The GCM spec explains how to interpret this
+//     block as an element of a finite field.  As discussed extensively, this
+//     representation is very annoying for a number of reasons.  On the other
+//     hand, this code never actually deals with it directly.
+//
+//   * The `register format' consists of one or more SIMD registers,
+//     depending on the block size.  The bits in each byte are reversed,
+//     compared to the external format, which makes the polynomials
+//     completely vanilla, unlike all of the other GCM implementations.
+//
+//   * The `table format' is just like the `register format', only the two
+//     halves of 128-bit SIMD register are the same, so we need twice as many
+//     registers.
+//
+//   * The `words' format consists of a sequence of bytes, as in the
+//     `external format', but, according to the blockcipher in use, the bytes
+//     within each 32-bit word may be reversed (`big-endian') or not
+//     (`little-endian').  Accordingly, there are separate entry points for
+//     each variant, identified with `b' or `l'.
+
+FUNC(gcm_mulk_128b_arm64_pmull)
+       // On entry, x0 points to a 128-bit field element A in big-endian
+       // words format; x1 points to a field-element K in table format.  On
+       // exit, A is updated with the product A K.
+
+       ldr     q0, [x0]
+       ldp     q1, q2, [x1]
+       rev32   v0.16b, v0.16b
+       vzero
+       rbit    v0.16b, v0.16b
+       mul128
+       rbit    v0.16b, v0.16b
+       rev32   v0.16b, v0.16b
+       str     q0, [x0]
+       ret
+ENDFUNC
+
+FUNC(gcm_mulk_128l_arm64_pmull)
+       // On entry, x0 points to a 128-bit field element A in little-endian
+       // words format; x1 points to a field-element K in table format.  On
+       // exit, A is updated with the product A K.
+
+       ldr     q0, [x0]
+       ldp     q1, q2, [x1]
+       vzero
+       rbit    v0.16b, v0.16b
+       mul128
+       rbit    v0.16b, v0.16b
+       str     q0, [x0]
+       ret
+ENDFUNC
+
+FUNC(gcm_mulk_64b_arm64_pmull)
+       // On entry, x0 points to a 64-bit field element A in big-endian
+       // words format; x1 points to a field-element K in table format.  On
+       // exit, A is updated with the product A K.
+
+       ldr     d0, [x0]
+       ldr     q1, [x1]
+       rev32   v0.8b, v0.8b
+       rbit    v0.8b, v0.8b
+       mul64
+       rbit    x2, x2
+       ror     x2, x2, #32
+       str     x2, [x0]
+       ret
+ENDFUNC
+
+FUNC(gcm_mulk_64l_arm64_pmull)
+       // On entry, x0 points to a 64-bit field element A in little-endian
+       // words format; x1 points to a field-element K in table format.  On
+       // exit, A is updated with the product A K.
+
+       ldr     d0, [x0]
+       ldr     q1, [x1]
+       rbit    v0.8b, v0.8b
+       mul64
+       rbit    x2, x2
+       rev     x2, x2
+       str     x2, [x0]
+       ret
+ENDFUNC
+
+FUNC(gcm_mulk_96b_arm64_pmull)
+       // On entry, x0 points to a 96-bit field element A in big-endian
+       // words format; x1 points to a field-element K in table format.  On
+       // exit, A is updated with the product A K.
+
+       ldr     w2, [x0, #8]
+       ldr     d0, [x0, #0]
+       mov     v0.d[1], x2
+       ldp     q1, q2, [x1]
+       rev32   v0.16b, v0.16b
+       vzero
+       rbit    v0.16b, v0.16b
+       mul96
+       rbit    v0.16b, v0.16b
+       rev32   v0.16b, v0.16b
+       mov     w2, v0.s[2]
+       str     d0, [x0, #0]
+       str     w2, [x0, #8]
+       ret
+ENDFUNC
+
+FUNC(gcm_mulk_96l_arm64_pmull)
+       // On entry, x0 points to a 96-bit field element A in little-endian
+       // words format; x1 points to a field-element K in table format.  On
+       // exit, A is updated with the product A K.
+
+       ldr     d0, [x0, #0]
+       ldr     w2, [x0, #8]
+       mov     v0.d[1], x2
+       ldp     q1, q2, [x1]
+       rbit    v0.16b, v0.16b
+       vzero
+       mul96
+       rbit    v0.16b, v0.16b
+       mov     w2, v0.s[2]
+       str     d0, [x0, #0]
+       str     w2, [x0, #8]
+       ret
+ENDFUNC
+
+FUNC(gcm_mulk_192b_arm64_pmull)
+       // On entry, x0 points to a 192-bit field element A in big-endian
+       // words format; x1 points to a field-element K in table format.  On
+       // exit, A is updated with the product A K.
+
+       ldr     q0, [x0, #0]
+       ldr     d1, [x0, #16]
+       ldp     q2, q3, [x1, #0]
+       ldr     q4, [x1, #32]
+       rev32   v0.16b, v0.16b
+       rev32   v1.8b, v1.8b
+       rbit    v0.16b, v0.16b
+       rbit    v1.8b, v1.8b
+       vzero
+       mul192
+       rev32   v0.16b, v0.16b
+       rev32   v1.8b, v1.8b
+       rbit    v0.16b, v0.16b
+       rbit    v1.8b, v1.8b
+       str     q0, [x0, #0]
+       str     d1, [x0, #16]
+       ret
+ENDFUNC
+
+FUNC(gcm_mulk_192l_arm64_pmull)
+       // On entry, x0 points to a 192-bit field element A in little-endian
+       // words format; x1 points to a field-element K in table format.  On
+       // exit, A is updated with the product A K.
+
+       ldr     q0, [x0, #0]
+       ldr     d1, [x0, #16]
+       ldp     q2, q3, [x1, #0]
+       ldr     q4, [x1, #32]
+       rbit    v0.16b, v0.16b
+       rbit    v1.8b, v1.8b
+       vzero
+       mul192
+       rbit    v0.16b, v0.16b
+       rbit    v1.8b, v1.8b
+       str     q0, [x0, #0]
+       str     d1, [x0, #16]
+       ret
+ENDFUNC
+
+FUNC(gcm_mulk_256b_arm64_pmull)
+       // On entry, x0 points to a 256-bit field element A in big-endian
+       // words format; x1 points to a field-element K in table format.  On
+       // exit, A is updated with the product A K.
+
+       ldp     q0, q1, [x0]
+       ldp     q2, q3, [x1, #0]
+       ldp     q4, q5, [x1, #32]
+       rev32   v0.16b, v0.16b
+       rev32   v1.16b, v1.16b
+       rbit    v0.16b, v0.16b
+       rbit    v1.16b, v1.16b
+       vzero
+       mul256
+       rev32   v0.16b, v0.16b
+       rev32   v1.16b, v1.16b
+       rbit    v0.16b, v0.16b
+       rbit    v1.16b, v1.16b
+       stp     q0, q1, [x0]
+       ret
+ENDFUNC
+
+FUNC(gcm_mulk_256l_arm64_pmull)
+       // On entry, x0 points to a 256-bit field element A in little-endian
+       // words format; x1 points to a field-element K in table format.  On
+       // exit, A is updated with the product A K.
+
+       ldp     q0, q1, [x0]
+       ldp     q2, q3, [x1, #0]
+       ldp     q4, q5, [x1, #32]
+       rbit    v0.16b, v0.16b
+       rbit    v1.16b, v1.16b
+       vzero
+       mul256
+       rbit    v0.16b, v0.16b
+       rbit    v1.16b, v1.16b
+       stp     q0, q1, [x0]
+       ret
+ENDFUNC
+
+///----- That's all, folks --------------------------------------------------
diff --git a/symm/gcm-x86ish-pclmul.S b/symm/gcm-x86ish-pclmul.S

new file mode 100644 (file)

index 0000000..e60b7ca
--- /dev/null
+++ b/symm/gcm-x86ish-pclmul.S
@@ -0,0 +1,1073 @@
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// GCM acceleration for x86 processors
+///
+/// (c) 2018 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software: you can redistribute it and/or modify it
+/// under the terms of the GNU Library General Public License as published
+/// by the Free Software Foundation; either version 2 of the License, or
+/// (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful, but
+/// WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+/// Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb.  If not, write to the Free Software
+/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+/// USA.
+
+///--------------------------------------------------------------------------
+/// Preliminaries.
+
+#include "config.h"
+#include "asm-common.h"
+
+       .arch   .pclmul
+
+       .text
+
+///--------------------------------------------------------------------------
+/// Common register allocation.
+
+#if CPUFAM_X86
+#  define A eax
+#  define K edx
+#elif CPUFAM_AMD64 && ABI_SYSV
+#  define A rdi
+#  define K rsi
+#elif CPUFAM_AMD64 && ABI_WIN
+#  define A rcx
+#  define K rdx
+#endif
+
+///--------------------------------------------------------------------------
+/// Multiplication macros.
+
+       // The good news is that we have a fancy instruction to do the
+       // multiplications.  The bad news is that it's not particularly well-
+       // suited to the job.
+       //
+       // For one thing, it only does a 64-bit multiplication, so in general
+       // we'll need to synthesize the full-width multiply by hand.  For
+       // another thing, it doesn't help with the reduction, so we have to
+       // do that by hand too.  And, finally, GCM has crazy bit ordering,
+       // and the instruction does nothing useful for that at all.
+       //
+       // Focusing on that last problem first: the bits aren't in monotonic
+       // significance order unless we permute them.  If we reverse the byte
+       // order, then we'll have the bits in monotonic order, but backwards,
+       // so the degree-0 coefficient will be in the most-significant bit.
+       //
+       // This is less of a difficulty than it seems at first, because
+       // algebra.  Suppose we are given u = SUM_{0<=i<n} u_i t^i and v =
+       // SUM_{0<=j<n} v_j t^j; then
+       //
+       //      u v = SUM_{0<=i,j<n} u_i v_j t^{i+j}
+       //
+       // Suppose instead that we're given ũ = SUM_{0<=i<n} u_{n-i-1} t^i
+       // and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards.
+       // Then
+       //
+       //      ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j}
+       //          = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)}
+       //
+       // which is almost the bit-reversal of u v, only it's shifted right
+       // by one place.  Oh, well: we'll have to shift it back later.
+       //
+       // That was important to think about, but there's not a great deal to
+       // do about it yet other than to convert what we've got from the
+       // blockcipher's byte-ordering convention to our big-endian
+       // convention.  Since this depends on the blockcipher convention,
+       // we'll leave the caller to cope with this: the macros here will
+       // assume that the operands are in `register' format, which is the
+       // byte-reversal of the external representation, padded at the
+       // most-significant end except for 96-bit blocks, which are
+       // zero-padded at the least-significant end (see `mul96' for the
+       // details).  In the commentary, pieces of polynomial are numbered
+       // according to the degree of the coefficients, so the unit
+       // coefficient of some polynomial a is in a_0.
+       //
+       // The commentary for `mul128' is the most detailed.  The other
+       // macros assume that you've already read and understood that.
+
+.macro mul128
+       // Enter with u and v in xmm0 and xmm1 respectively; leave with z =
+       // u v in xmm0.  Clobbers xmm1--xmm4.
+
+       // First for the double-precision multiplication.  It's tempting to
+       // use Karatsuba's identity here, but I suspect that loses more in
+       // the shifting, bit-twiddling, and dependency chains that it gains
+       // in saving a multiplication which otherwise pipelines well.
+       // xmm0 =                       // (u_1; u_0)
+       // xmm1 =                       // (v_1; v_0)
+       movdqa  xmm2, xmm1              // (v_1; v_0) again
+       movdqa  xmm3, xmm0              // (u_1; u_0) again
+       movdqa  xmm4, xmm0              // (u_1; u_0) yet again
+       pclmulhqlqdq xmm2, xmm0         // u_1 v_0
+       pclmullqlqdq xmm0, xmm1         // u_1 v_1
+       pclmulhqlqdq xmm3, xmm1         // u_0 v_1
+       pclmulhqhqdq xmm4, xmm1         // u_0 v_0
+
+       // Arrange the pieces to form a double-precision polynomial.
+       pxor    xmm2, xmm3              // (m_1; m_0) = u_1 v_0 + u_0 v_1
+       movdqa  xmm1, xmm2              // (m_1; m_0) again
+       pslldq  xmm2, 8                 // (0; m_1)
+       psrldq  xmm1, 8                 // (m_0; 0)
+       pxor    xmm0, xmm2              // x_1 = u_1 v_1 + m_1
+       pxor    xmm1, xmm4              // x_0 = u_0 v_0 + t^64 m_0
+
+       // Two problems remain.  The first is that this product is shifted
+       // left (from GCM's backwards perspective) by one place, which is
+       // annoying.  Let's take care of that now.  Once this is done, we'll
+       // be properly in GCM's backwards bit-ordering, so xmm1 will hold the
+       // low half of the product and xmm0 the high half.  (The following
+       // diagrams show bit 0 consistently on the right.)
+       //
+       //                               xmm1
+       //    ,-------------.-------------.-------------.-------------.
+       //    | 0  x_0-x_30 |  x_31-x_62  |  x_63-x_94  |  x_95-x_126 |
+       //    `-------------^-------------^-------------^-------------'
+       //
+       //                               xmm0
+       //    ,-------------.-------------.-------------.-------------.
+       //    | x_127-x_158 | x_159-x_190 | x_191-x_222 | x_223-x_254 |
+       //    `-------------^-------------^-------------^-------------'
+       //
+       // We start by shifting each 32-bit lane right (from GCM's point of
+       // view -- physically, left) by one place, which gives us this:
+       //
+       //                           low (xmm3)
+       //    ,-------------.-------------.-------------.-------------.
+       //    | x_0-x_30  0 | x_32-x_62 0 | x_64-x_94 0 | x_96-x_126 0|
+       //    `-------------^-------------^-------------^-------------'
+       //
+       //                           high (xmm2)
+       //    ,-------------.-------------.-------------.-------------.
+       //    |x_128-x_158 0|x_160-x_190 0|x_192-x_222 0|x_224-x_254 0|
+       //    `-------------^-------------^-------------^-------------'
+       //
+       // but we've lost a bunch of bits.  We separately shift each lane
+       // left by 31 places to give us the bits we lost.
+       //
+       //                           low (xmm1)
+       //    ,-------------.-------------.-------------.-------------.
+       //    |    0...0    | 0...0  x_31 | 0...0  x_63 | 0...0  x_95 |
+       //    `-------------^-------------^-------------^-------------'
+       //
+       //                           high (xmm0)
+       //    ,-------------.-------------.-------------.-------------.
+       //    | 0...0 x_127 | 0...0 x_159 | 0...0 x_191 | 0...0 x_223 |
+       //    `-------------^-------------^-------------^-------------'
+       //
+       // Which is close, but we don't get a cigar yet.  To get the missing
+       // bits into position, we shift each of these right by a lane, but,
+       // alas, the x_127 falls off, so, separately, we shift the high
+       // register left by three lanes, so that everything is lined up
+       // properly when we OR them all together:
+       //
+       //                           low (xmm1)
+       //    ,-------------.-------------.-------------.-------------.
+       //    ? 0...0  x_31 | 0...0  x_63 | 0...0  x_95 |    0...0    |
+       //    `-------------^-------------^-------------^-------------'
+       //
+       //                           wrap (xmm4)
+       //    ,-------------.-------------.-------------.-------------.
+       //    |    0...0    |    0...0    |    0...0    | 0...0 x_127 |
+       //    `-------------^-------------^-------------^-------------'
+       //
+       //                           high (xmm0)
+       //    ,-------------.-------------.-------------.-------------.
+       //    | 0...0 x_159 | 0...0 x_191 | 0...0 x_223 |    0...0    |
+       //    `-------------^-------------^-------------^-------------'
+       //
+       // The `low' and `wrap' registers (xmm1, xmm3, xmm4) then collect the
+       // low 128 coefficients, while the `high' registers (xmm0, xmm2)
+       // collect the high 127 registers, leaving a zero bit at the most
+       // significant end as we expect.
+
+       // xmm0 =                       // (x_7, x_6; x_5, x_4)
+       // xmm1 =                       // (x_3, x_2; x_1, x_0)
+       movdqa  xmm3, xmm1              // (x_3, x_2; x_1, x_0) again
+       movdqa  xmm2, xmm0              // (x_7, x_6; x_5, x_4) again
+       psrld   xmm1, 31                // shifted left; just the carries
+       psrld   xmm0, 31
+       pslld   xmm3, 1                 // shifted right, but dropped carries
+       pslld   xmm2, 1
+       movdqa  xmm4, xmm0              // another copy for the carry around
+       pslldq  xmm1, 4                 // move carries over
+       pslldq  xmm0, 4
+       psrldq  xmm4, 12                // the big carry wraps around
+       por     xmm1, xmm3
+       por     xmm0, xmm2              // (y_7, y_6; y_5, y_4)
+       por     xmm1, xmm4              // (y_3, y_2; y_1, y_0)
+
+       // And the other problem is that the result needs to be reduced
+       // modulo p(t) = t^128 + t^7 + t^2 + t + 1.  Let R = t^128 = t^7 +
+       // t^2 + t + 1 in our field.  So far, we've calculated z_0 and z_1
+       // such that z_0 + z_1 R = u v using the identity R = t^128: now we
+       // must collapse the two halves of z together using the other
+       // identity R = t^7 + t^2 + t + 1.
+       //
+       // We do this by working on each 32-bit word of the high half of z
+       // separately, so consider y_i, for some 4 <= i < 8.  Certainly, y_i
+       // t^{32i} = y_i R t^{32(i-4)} = (t^7 + t^2 + t + 1) y_i t^{32(i-4)},
+       // but we can't use that directly without breaking up the 32-bit word
+       // structure.  Instead, we start by considering just y_i t^7
+       // t^{32(i-4)}, which again looks tricky.  Now, split y_i = a_i +
+       // t^25 b_i, with deg a_i < 25; then
+       //
+       //      y_i t^7 t^{32(i-4)} = a_i t^7 t^{32(i-4)} + b_i t^{32(i-3)}
+       //
+       // We can similarly decompose y_i t^2 and y_i t into a pair of 32-bit
+       // contributions to the t^{32(i-4)} and t^{32(i-3)} words, but the
+       // splits are different.  This is lovely, with one small snag: when
+       // we do this to y_7, we end up with a contribution back into the
+       // t^128 coefficient word.  But notice that only the low seven bits
+       // of this word are affected, so there's no knock-on contribution
+       // into the t^32 word.  Therefore, if we handle the high bits of each
+       // word together, and then the low bits, everything will be fine.
+
+       // First, shift the high bits down.
+       movdqa  xmm2, xmm0              // (y_7, y_6; y_5, y_4) again
+       movdqa  xmm3, xmm0              // (y_7, y_6; y_5, y_4) yet again
+       movdqa  xmm4, xmm0              // (y_7, y_6; y_5, y_4) again again
+       pslld   xmm2, 31                // the b_i for t
+       pslld   xmm3, 30                // the b_i for t^2
+       pslld   xmm4, 25                // the b_i for t^7
+       pxor    xmm2, xmm3              // add them all together
+       pxor    xmm2, xmm4
+       movdqa  xmm3, xmm2              // and a copy for later
+       psrldq  xmm2, 4                 // contribution into low half
+       pslldq  xmm3, 12                // and high half
+       pxor    xmm1, xmm2
+       pxor    xmm0, xmm3
+
+       // And then shift the low bits up.
+       movdqa  xmm2, xmm0
+       movdqa  xmm3, xmm0
+       pxor    xmm1, xmm0              // mix in the unit contribution
+       psrld   xmm0, 1
+       psrld   xmm2, 2
+       psrld   xmm3, 7
+       pxor    xmm1, xmm2              // low half, unit, and t^2 contribs
+       pxor    xmm0, xmm3              // t and t^7 contribs
+       pxor    xmm0, xmm1              // mix them together and we're done
+.endm
+
+.macro mul64
+       // Enter with u and v in the low halves of xmm0 and xmm1
+       // respectively; leave with z = u v in xmm0.  Clobbers xmm1--xmm4.
+
+       // The multiplication is thankfully easy.
+       pclmullqlqdq xmm0, xmm1         // u v
+
+       // Shift the product up by one place.  After this, we're in GCM
+       // bizarro-world.
+       movdqa  xmm1, xmm0              // u v again
+       psrld   xmm0, 31                // shifted left; just the carries
+       pslld   xmm1, 1                 // shifted right, but dropped carries
+       pslldq  xmm0, 4                 // move carries over
+       por     xmm1, xmm0              // (y_3, y_2; y_1, y_0)
+
+       // Now we must reduce.  This is essentially the same as the 128-bit
+       // case above, but mostly simpler because everything is smaller.  The
+       // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.
+
+       // First, we must detach the top (`low'!) half of the result.
+       movdqa  xmm0, xmm1              // (y_3, y_2; y_1, y_0) again
+       psrldq  xmm1, 8                 // (y_1, y_0; 0, 0)
+
+       // Next, shift the high bits down.
+       movdqa  xmm2, xmm0              // (y_3, y_2; ?, ?) again
+       movdqa  xmm3, xmm0              // (y_3, y_2; ?, ?) yet again
+       movdqa  xmm4, xmm0              // (y_3, y_2; ?, ?) again again
+       pslld   xmm2, 31                // b_i for t
+       pslld   xmm3, 29                // b_i for t^3
+       pslld   xmm4, 28                // b_i for t^4
+       pxor    xmm2, xmm3              // add them all together
+       pxor    xmm2, xmm4
+       movdqa  xmm3, xmm2              // and a copy for later
+       movq    xmm2, xmm2              // zap high half
+       pslldq  xmm3, 4                 // contribution into high half
+       psrldq  xmm2, 4                 // and low half
+       pxor    xmm0, xmm3
+       pxor    xmm1, xmm2
+
+       // And then shift the low bits up.
+       movdqa  xmm2, xmm0
+       movdqa  xmm3, xmm0
+       pxor    xmm1, xmm0              // mix in the unit contribution
+       psrld   xmm0, 1
+       psrld   xmm2, 3
+       psrld   xmm3, 4
+       pxor    xmm1, xmm2              // low half, unit, and t^3 contribs
+       pxor    xmm0, xmm3              // t and t^4 contribs
+       pxor    xmm0, xmm1              // mix them together and we're done
+.endm
+
+.macro mul96
+       // Enter with u and v in the /high/ three words of xmm0 and xmm1
+       // respectively (and zero in the low word); leave with z = u v in the
+       // high three words of xmm0, and /junk/ in the low word.  Clobbers
+       // xmm1--xmm4.
+
+       // This is an inconvenient size.  There's nothing for it but to do
+       // four multiplications, as if for the 128-bit case.  It's possible
+       // that there's cruft in the top 32 bits of the input registers, so
+       // shift both of them up by four bytes before we start.  This will
+       // mean that the high 64 bits of the result (from GCM's viewpoint)
+       // will be zero.
+       // xmm0 =                       // (0, u_2; u_1, u_0)
+       // xmm1 =                       // (0, v_2; v_1, v_0)
+       movdqa  xmm2, xmm1              // (0, v_2; v_1, v_0) again
+       movdqa  xmm3, xmm0              // (0, u_2; u_1, u_0) again
+       movdqa  xmm4, xmm0              // (0, u_2; u_1, u_0) yet again
+       pclmulhqlqdq xmm2, xmm0         // u_2 (v_1 t^32 + v_0) = e_0
+       pclmullqlqdq xmm0, xmm1         // u_2 v_2 = d = (0; d)
+       pclmulhqlqdq xmm3, xmm1         // v_2 (u_1 t^32 + u_0) = e_1
+       pclmulhqhqdq xmm4, xmm1         // u_0 v_0 + (u_1 v_0 + u_0 v_1) t^32
+                                       //   + u_1 v_1 t^64 = f
+
+       // Extract the high and low halves of the 192-bit result.  We don't
+       // need be too picky about the unused high words of the result
+       // registers.  The answer we want is d t^128 + e t^64 + f, where e =
+       // e_0 + e_1.
+       //
+       // The place values for the two halves are (t^160, t^128; t^96, ?)
+       // and (?, t^64; t^32, 1).
+       psrldq  xmm0, 8                 // (d; 0) = d t^128
+       pxor    xmm2, xmm3              // e = (e_0 + e_1)
+       movdqa  xmm1, xmm4              // f again
+       pxor    xmm0, xmm2              // d t^128 + e t^64
+       psrldq  xmm2, 12                // e[31..0] t^64
+       psrldq  xmm1, 4                 // f[95..0]
+       pslldq  xmm4, 8                 // f[127..96]
+       pxor    xmm1, xmm2              // low 96 bits of result
+       pxor    xmm0, xmm4              // high 96 bits of result
+
+       // Next, shift everything one bit to the left to compensate for GCM's
+       // strange ordering.  This will be easier if we shift up the high
+       // half by a word before we start.  After this we're in GCM bizarro-
+       // world.
+       movdqa  xmm3, xmm1              // low half again
+       pslldq  xmm0, 4                 // shift high half
+       psrld   xmm1, 31                // shift low half down: just carries
+       movdqa  xmm2, xmm0              // copy high half
+       pslld   xmm3, 1                 // shift low half down: drop carries
+       psrld   xmm0, 31                // shift high half up: just carries
+       pslld   xmm2, 1                 // shift high half down: drop carries
+       movdqa  xmm4, xmm0              // copy high carries for carry-around
+       pslldq  xmm0, 4                 // shift carries down
+       pslldq  xmm1, 4
+       psrldq  xmm4, 12                // the big carry wraps around
+       por     xmm1, xmm3
+       por     xmm0, xmm2
+       por     xmm1, xmm4
+
+       // Finally, the reduction.  This is essentially the same as the
+       // 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
+       // t^9 + t^6 + 1.  The degrees are larger but not enough to cause
+       // trouble for the general approach.
+
+       // First, shift the high bits down.
+       movdqa  xmm2, xmm0              // copies of the high part
+       movdqa  xmm3, xmm0
+       movdqa  xmm4, xmm0
+       pslld   xmm2, 26                // b_i for t^6
+       pslld   xmm3, 23                // b_i for t^9
+       pslld   xmm4, 22                // b_i for t^10
+       pxor    xmm2, xmm3              // add them all together
+       pslldq  xmm1, 4                 // shift low part up to match
+       pxor    xmm2, xmm4
+       movdqa  xmm3, xmm2              // and a copy for later
+       pslldq  xmm2, 8                 // contribution to high half
+       psrldq  xmm3, 4                 // contribution to low half
+       pxor    xmm1, xmm3
+       pxor    xmm0, xmm2
+
+       // And then shift the low bits up.
+       movdqa  xmm2, xmm0              // copies of the high part
+       movdqa  xmm3, xmm0
+       pxor    xmm1, xmm0              // mix in the unit contribution
+       psrld   xmm0, 6
+       psrld   xmm2, 9
+       psrld   xmm3, 10
+       pxor    xmm1, xmm2              // low half, unit, and t^9 contribs
+       pxor    xmm0, xmm3              // t^6 and t^10 contribs
+       pxor    xmm0, xmm1              // mix them together and we're done
+.endm
+
+.macro mul192
+       // Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave
+       // with z = u v in xmm0/xmm1 -- the top halves of the high registers
+       // are unimportant.  Clobbers xmm2--xmm7.
+
+       // Start multiplying and accumulating pieces of product.
+       // xmm0 =                       // (u_2; u_1)
+       // xmm1 =                       // (u_0; ?)
+       // xmm2 =                       // (v_2; v_1)
+       // xmm3 =                       // (v_0; ?)
+       movdqa  xmm4, xmm0              // (u_2; u_1) again
+       movdqa  xmm5, xmm0              // (u_2; u_1) yet again
+       movdqa  xmm6, xmm0              // (u_2; u_1) again again
+       movdqa  xmm7, xmm1              // (u_0; ?) again
+       punpcklqdq xmm1, xmm3           // (u_0; v_0)
+       pclmulhqhqdq xmm4, xmm2         // u_1 v_1
+       pclmullqlqdq xmm3, xmm0         // u_2 v_0
+       pclmullqhqdq xmm5, xmm2         // u_2 v_1
+       pclmulhqlqdq xmm6, xmm2         // u_1 v_2
+       pxor    xmm4, xmm3              // u_2 v_0 + u_1 v_1
+       pclmullqlqdq xmm7, xmm2         // u_0 v_2
+       pxor    xmm5, xmm6              // b = u_2 v_1 + u_1 v_2
+       movdqa  xmm6, xmm0              // (u_2; u_1) like a bad penny
+       pxor    xmm4, xmm7              // c = u_0 v_2 + u_1 v_1 + u_2 v_0
+       pclmullqlqdq xmm0, xmm2         // a = u_2 v_2
+       pclmulhqhqdq xmm6, xmm1         // u_1 v_0
+       pclmulhqlqdq xmm2, xmm1         // u_0 v_1
+       pclmullqhqdq xmm1, xmm1         // e = u_0 v_0
+       pxor    xmm2, xmm6              // d = u_1 v_0 + u_0 v_1
+
+       // Next, the piecing together of the product.
+       // xmm0 =                       // (a_1; a_0) = a = u_2 v_2
+       // xmm5 =                       // (b_1; b_0) = b = u_1 v_2 + u_2 v_1
+       // xmm4 =                       // (c_1; c_0) = c = u_0 v_2 +
+                                       //      u_1 v_1 + u_2 v_0
+       // xmm2 =                       // (d_1; d_0) = d = u_0 v_1 + u_1 v_0
+       // xmm1 =                       // (e_1; e_0) = e = u_0 v_0
+       // xmm3, xmm6, xmm7 spare
+       movdqa  xmm3, xmm2              // (d_1; d_0) again
+       movdqa  xmm6, xmm5              // (b_1; b_0) again
+       pslldq  xmm2, 8                 // (0; d_1)
+       psrldq  xmm5, 8                 // (b_0; 0)
+       psrldq  xmm3, 8                 // (d_0; 0)
+       pslldq  xmm6, 8                 // (0; b_1)
+       pxor    xmm5, xmm2              // (b_0; d_1)
+       pxor    xmm0, xmm6              // x_2 = (a_1; a_0 + b_1)
+       pxor    xmm3, xmm1              // x_0 = (e_1 + d_0; e_0)
+       pxor    xmm4, xmm5              // x_1 = (b_0 + c_1; c_0 + d_1)
+
+       // Now, shift it right (from GCM's point of view) by one bit, and try
+       // to leave the result in less random registers.  After this, we'll
+       // be in GCM bizarro-world.
+       // xmm1, xmm2, xmm5, xmm6, xmm7 spare
+       movdqa  xmm5, xmm0              // copy x_2
+       movdqa  xmm1, xmm4              // copy x_1
+       movdqa  xmm2, xmm3              // copy x_0
+       psrld   xmm0, 31                // x_2 carries
+       psrld   xmm4, 31                // x_1 carries
+       psrld   xmm3, 31                // x_0 carries
+       pslld   xmm5, 1                 // x_2 shifted
+       pslld   xmm1, 1                 // x_1 shifted
+       pslld   xmm2, 1                 // x_0 shifted
+       movdqa  xmm6, xmm0              // x_2 carry copy
+       movdqa  xmm7, xmm4              // x_1 carry copy
+       pslldq  xmm0, 4                 // x_2 carry shifted
+       pslldq  xmm4, 4                 // x_1 carry shifted
+       pslldq  xmm3, 4                 // x_0 carry shifted
+       psrldq  xmm6, 12                // x_2 carry out
+       psrldq  xmm7, 12                // x_1 carry out
+       por     xmm0, xmm5              // (y_5; y_4)
+       por     xmm1, xmm4
+       por     xmm2, xmm3
+       por     xmm1, xmm6              // (y_3; y_2)
+       por     xmm2, xmm7              // (y_1; y_0)
+
+       // Next, the reduction.  Our polynomial this time is p(x) = t^192 +
+       // t^7 + t^2 + t + 1.  Yes, the magic numbers are the same as the
+       // 128-bit case.  I don't know why.
+
+       // First, shift the high bits down.
+       // xmm0 =                       // (y_5; y_4)
+       // xmm1 =                       // (y_3; y_2)
+       // xmm2 =                       // (y_1; y_0)
+       // xmm3--xmm7 spare
+       movdqa  xmm3, xmm0              // (y_5; y_4) copy
+       movdqa  xmm4, xmm0              // (y_5; y_4) copy
+       movdqa  xmm5, xmm0              // (y_5; y_4) copy
+       pslld   xmm3, 31                // (y_5; y_4) b_i for t
+       pslld   xmm4, 30                // (y_5; y_4) b_i for t^2
+       pslld   xmm5, 25                // (y_5; y_4) b_i for t^7
+        movq   xmm6, xmm1              // (y_3; 0) copy
+       pxor    xmm3, xmm4
+        movq   xmm7, xmm1              // (y_3; 0) copy
+       pxor    xmm3, xmm5
+        movq   xmm5, xmm1              // (y_3; 0) copy
+       movdqa  xmm4, xmm3              // (y_5; y_4) b_i combined
+        pslld  xmm6, 31                // (y_3; 0) b_i for t
+        pslld  xmm7, 30                // (y_3; 0) b_i for t^2
+        pslld  xmm5, 25                // (y_3; 0) b_i for t^7
+       psrldq  xmm3, 12                // (y_5; y_4) low contrib
+       pslldq  xmm4, 4                 // (y_5; y_4) high contrib
+        pxor   xmm6, xmm7
+       pxor    xmm2, xmm3
+        pxor   xmm6, xmm5
+       pxor    xmm1, xmm4
+        pslldq xmm6, 4
+        pxor   xmm2, xmm6
+
+       // And finally shift the low bits up.  Unfortunately, we also have to
+       // split the low bits out.
+       // xmm0 =                       // (y'_5; y'_4)
+       // xmm1 =                       // (y'_3; y'_2)
+       // xmm2 =                       // (y'_1; y'_0)
+        movdqa xmm5, xmm1              // copies of (y'_3; y'_2)
+        movdqa xmm6, xmm1
+        movdqa xmm7, xmm1
+         psrldq xmm1, 8                // bring down (y'_2; ?)
+       movdqa  xmm3, xmm0              // copies of (y'_5; y'_4)
+       movdqa  xmm4, xmm0
+         punpcklqdq  xmm1, xmm2        // (y'_2; y'_1)
+         psrldq xmm2, 8                // (y'_0; ?)
+        pxor   xmm2, xmm5              // low half and unit contrib
+       pxor    xmm1, xmm0
+        psrld  xmm5, 1
+       psrld   xmm0, 1
+        psrld  xmm6, 2
+       psrld   xmm3, 2
+        psrld  xmm7, 7
+       psrld   xmm4, 7
+        pxor   xmm2, xmm6              // low half, unit, t^2 contribs
+       pxor    xmm1, xmm3
+        pxor   xmm5, xmm7              // t and t^7 contribs
+       pxor    xmm0, xmm4
+        pxor   xmm5, xmm2              // mix everything together
+       pxor    xmm0, xmm1
+        movq   xmm1, xmm5              // shunt (z_0; ?) into proper place
+.endm
+
+.macro mul256
+       // Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave
+       // with z = u v in xmm0/xmm1.  Clobbers xmm2--xmm7.  On 32-bit x86,
+       // requires 16 bytes aligned space at SP; on amd64, also clobbers
+       // xmm8.
+
+       // Now it's starting to look worthwhile to do Karatsuba.  Suppose
+       // u = u_0 + u_1 B and v = v_0 + v_1 B.  Then
+       //
+       //      u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2
+       //
+       // Name these coefficients of B^i be a, b, and c, respectively, and
+       // let r = u_0 + u_1 and s = v_0 + v_1.  Then observe that
+       //
+       //      q = r s = (u_0 + u_1) (v_0 + v_1)
+       //        = (u_0 v_0) + (u1 v_1) + (u_0 v_1 + u_1 v_0)
+       //        = a + d + c
+       //
+       // The first two terms we've already calculated; the last is the
+       // remaining one we want.  We'll set B = t^128.  We know how to do
+       // 128-bit multiplications already, and Karatsuba is too annoying
+       // there, so there'll be 12 multiplications altogether, rather than
+       // the 16 we'd have if we did this the naïve way.
+       //
+       // On x86, there aren't quite enough registers, so spill one for a
+       // bit.  On AMD64, we can keep on going, so it's all good.
+
+       // xmm0 =                       // u_1 = (u_11; u_10)
+       // xmm1 =                       // u_0 = (u_01; u_00)
+       // xmm2 =                       // v_1 = (v_11; v_10)
+       // xmm3 =                       // v_0 = (v_01; v_00)
+       movdqa  xmm4, xmm0              // u_1 again
+#if CPUFAM_X86
+       movdqa  [esp + 0], xmm3
+#elif CPUFAM_AMD64
+       movdqa  xmm8, xmm3
+#  define V0 xmm8
+#endif
+       pxor    xmm4, xmm1              // u_* = (u_01 + u_11; u_00 + u_10)
+       pxor    xmm3, xmm2              // v_* = (v_01 + v_11; v_00 + v_10)
+
+       // Start by building the cross product, q = u_* v_*.
+       movdqa  xmm7, xmm4              // more copies of u_*
+       movdqa  xmm5, xmm4
+       movdqa  xmm6, xmm4
+       pclmullqhqdq xmm4, xmm3         // u_*1 v_*0
+       pclmulhqlqdq xmm7, xmm3         // u_*0 v_*1
+       pclmullqlqdq xmm5, xmm3         // u_*1 v_*1
+       pclmulhqhqdq xmm6, xmm3         // u_*0 v_*0
+       pxor    xmm4, xmm7              // u_*1 v_*0 + u_*0 v_*1
+       movdqa  xmm7, xmm4
+       pslldq  xmm4, 8
+       psrldq  xmm7, 8
+       pxor    xmm5, xmm4              // q_1
+       pxor    xmm6, xmm7              // q_0
+
+       // Next, work on the high half, a = u_1 v_1.
+       movdqa  xmm3, xmm0              // more copies of u_1
+       movdqa  xmm4, xmm0
+       movdqa  xmm7, xmm0
+       pclmullqhqdq xmm0, xmm2         // u_11 v_10
+       pclmulhqlqdq xmm3, xmm2         // u_10 v_11
+       pclmullqlqdq xmm4, xmm2         // u_11 v_11
+       pclmulhqhqdq xmm7, xmm2         // u_10 v_10
+#if CPUFAM_X86
+       movdqa  xmm2, [esp + 0]
+#  define V0 xmm2
+#endif
+       pxor    xmm0, xmm3              // u_10 v_11 + u_11 v_10
+       movdqa  xmm3, xmm0
+       pslldq  xmm0, 8
+       psrldq  xmm3, 8
+       pxor    xmm4, xmm0              // x_1 = a_1
+       pxor    xmm7, xmm3              // a_0
+
+       // Mix that into the product now forming in xmm4--xmm7.
+       pxor    xmm5, xmm4              // a_1 + q_1
+       pxor    xmm6, xmm7              // a_0 + q_0
+       pxor    xmm5, xmm7              // a_0 + (a_1 + q_1)
+
+       // Finally, the low half, c = u_0 v_0.
+       movdqa  xmm0, xmm1              // more copies of u_0
+       movdqa  xmm3, xmm1
+       movdqa  xmm7, xmm1
+       pclmullqhqdq xmm1, V0           // u_01 v_00
+       pclmulhqlqdq xmm0, V0           // u_00 v_01
+       pclmullqlqdq xmm3, V0           // u_01 v_01
+       pclmulhqhqdq xmm7, V0           // u_00 v_00
+       pxor    xmm0, xmm1              // u_10 v_11 + u_11 v_10
+       movdqa  xmm1, xmm0
+       pslldq  xmm0, 8
+       psrldq  xmm1, 8
+       pxor    xmm3, xmm0              // c_1
+       pxor    xmm7, xmm1              // x_0 = c_0
+
+       // And mix that in to complete the product.
+       pxor    xmm6, xmm3              // (a_0 + q_0) + c_1
+       pxor    xmm5, xmm3       // x_2 = a_0 + (a_1 + c_1 + q_1) = a_0 + b_1
+       pxor    xmm6, xmm7       // x_1 = (a_0 + c_0 + q_0) + c_1 = b_0 + c_1
+
+#undef V0
+
+       // Now we need to shift that whole lot one bit to the left.  This
+       // will also give us an opportunity to put the product back in
+       // xmm0--xmm3.  This is a slightly merry dance because it's nearly
+       // pipelined but we don't have enough registers.
+       //
+       // After this, we'll be in GCM bizarro-world.
+       movdqa  xmm0, xmm4              // x_3 again
+       psrld   xmm4, 31                // x_3 carries
+       pslld   xmm0, 1                 // x_3 shifted left
+       movdqa  xmm3, xmm4              // x_3 copy carries
+        movdqa xmm1, xmm5              // x_2 again
+       pslldq  xmm4, 4                 // x_3 carries shifted up
+        psrld  xmm5, 31                // x_2 carries
+       psrldq  xmm3, 12                // x_3 big carry out
+        pslld  xmm1, 1                 // x_2 shifted left
+       por     xmm0, xmm4              // x_3 mixed together
+        movdqa xmm4, xmm5              // x_2 copy carries
+         movdqa xmm2, xmm6             // x_1 again
+        pslldq xmm5, 4                 // x_2 carries shifted up
+         psrld xmm6, 31                // x_1 carries
+        psrldq xmm4, 12                // x_2 big carry out
+         pslld xmm2, 1                 // x_1 shifted
+        por    xmm1, xmm5              // x_2 mixed together
+         movdqa xmm5, xmm6             // x_1 copy carries
+        por    xmm1, xmm3              // x_2 with carry from x_3
+          movdqa xmm3, xmm7            // x_0 again
+         pslldq xmm6, 4                // x_1 carries shifted up
+          psrld xmm7, 31               // x_2 carries
+         psrldq xmm5, 12               // x_1 big carry out
+          pslld xmm3, 1                // x_0 shifted
+         por   xmm2, xmm6              // x_1 mixed together
+          pslldq xmm7, 4               // x_0 carries shifted up
+         por   xmm2, xmm4              // x_1 with carry from x_2
+          por  xmm3, xmm7              // x_0 mixed together
+          por  xmm3, xmm5              // x_0 with carry from x_1
+
+       // Now we must reduce.  This is essentially the same as the 128-bit
+       // case above, but more complicated because everything is bigger.
+       // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
+
+       // First, shift the high bits down.
+       movdqa  xmm4, xmm0              // y_3 again
+       movdqa  xmm5, xmm0              // y_3 yet again
+       movdqa  xmm6, xmm0              // y_3 again again
+       pslld   xmm4, 30                // y_3: b_i for t^2
+       pslld   xmm5, 27                // y_3: b_i for t^5
+       pslld   xmm6, 22                // y_3: b_i for t^10
+        movdqa xmm7, xmm1              // y_2 again
+       pxor    xmm4, xmm5
+        movdqa xmm5, xmm1              // y_2 again
+       pxor    xmm4, xmm6
+        movdqa xmm6, xmm1              // y_2 again
+        pslld  xmm7, 30                // y_2: b_i for t^2
+        pslld  xmm5, 27                // y_2: b_i for t^5
+        pslld  xmm6, 22                // y_2: b_i for t^10
+        pxor   xmm7, xmm5
+       movdqa  xmm5, xmm4
+        pxor   xmm7, xmm6
+       psrldq  xmm4, 4
+        movdqa xmm6, xmm7
+       pslldq  xmm5, 12
+        psrldq xmm7, 4
+       pxor    xmm2, xmm4
+        pslldq xmm6, 12
+        pxor   xmm3, xmm7
+       pxor    xmm1, xmm5
+        pxor   xmm2, xmm6
+
+       // And then shift the low bits up.
+       movdqa  xmm4, xmm0              // y_3 again
+        movdqa xmm5, xmm1              // y_2 again
+       movdqa  xmm6, xmm0              // y_3 yet again
+        movdqa xmm7, xmm1              // y_2 yet again
+       pxor    xmm2, xmm0              // y_1 and unit contrib from y_3
+        pxor   xmm3, xmm1              // y_0 and unit contrib from y_2
+       psrld   xmm0, 2
+        psrld  xmm1, 2
+       psrld   xmm4, 5
+        psrld  xmm5, 5
+       psrld   xmm6, 10
+        psrld  xmm7, 10
+       pxor    xmm0, xmm2              // y_1, with y_3 units and t^2
+        pxor   xmm1, xmm3              // y_0, with y_2 units and t^2
+       pxor    xmm4, xmm6              // y_3 t^5 and t^10 contribs
+        pxor   xmm5, xmm7              // y_2 t^5 and t^10 contribs
+       pxor    xmm0, xmm4              // high half of reduced result
+       pxor    xmm1, xmm5              // low half; all done
+.endm
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+// There are a number of representations of field elements in this code and
+// it can be confusing.
+//
+//   * The `external format' consists of a sequence of contiguous bytes in
+//     memory called a `block'.  The GCM spec explains how to interpret this
+//     block as an element of a finite field.  As discussed extensively, this
+//     representation is very annoying for a number of reasons.  On the other
+//     hand, this code never actually deals with it directly.
+//
+//   * The `register format' consists of one or more XMM registers, depending
+//     on the block size.  The bytes in these registers are in reverse order
+//     -- so the least-significant byte of the lowest-numbered register holds
+//     the /last/ byte in the block.  If the block size is not a multiple of
+//     16 bytes, then there must be padding.  96-bit blocks are weird: the
+//     padding is inserted at the /least/ significant end, so the register
+//     holds (0, x_0; x_1, x_2); otherwise, the padding goes at the most
+//     significant end.
+//
+//   * The `words' format consists of a sequence of bytes, as in the
+//     `external format', but, according to the blockcipher in use, the bytes
+//     within each 32-bit word may be reversed (`big-endian') or not
+//     (`little-endian').  Accordingly, there are separate entry points for
+//     each variant, identified with `b' or `l'.
+
+#define SSEFUNC(f)                                                     \
+       FUNC(f##_avx); vzeroupper; endprologue; ENDFUNC;                \
+       FUNC(f)
+
+SSEFUNC(gcm_mulk_128b_x86ish_pclmul)
+       // On entry, A points to a 128-bit field element in big-endian words
+       // format; K points to a field-element in register format.  On exit,
+       // A is updated with the product A K.
+
+#if CPUFAM_X86
+       mov     A, [esp + 4]
+       mov     K, [esp + 8]
+#endif
+  endprologue
+       movdqu  xmm0, [A]
+       movdqu  xmm1, [K]
+       pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
+       mul128
+       pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
+       movdqu  [A], xmm0
+       ret
+ENDFUNC
+
+SSEFUNC(gcm_mulk_128l_x86ish_pclmul)
+       // On entry, A points to a 128-bit field element in little-endian
+       // words format; K points to a field-element in register format.  On
+       // exit, A is updated with the product A K.
+
+#if CPUFAM_X86
+       mov     A, [esp + 4]
+       mov     K, [esp + 8]
+       ldgot   ecx
+#endif
+  endprologue
+       movdqa  xmm7, [INTADDR(swaptab_128l, ecx)]
+       movdqu  xmm0, [A]
+       movdqu  xmm1, [K]
+       pshufb  xmm0, xmm7
+       mul128
+       pshufb  xmm0, xmm7
+       movdqu  [A], xmm0
+       ret
+ENDFUNC
+
+SSEFUNC(gcm_mulk_64b_x86ish_pclmul)
+       // On entry, A points to a 64-bit field element in big-endian words
+       // format; K points to a field-element in register format.  On exit,
+       // A is updated with the product A K.
+
+#if CPUFAM_X86
+       mov     A, [esp + 4]
+       mov     K, [esp + 8]
+#endif
+  endprologue
+       movq    xmm0, [A]
+       movq    xmm1, [K]
+       pshufd  xmm0, xmm0, SHUF(1, 0, 3, 3)
+       mul64
+       pshufd  xmm0, xmm0, SHUF(1, 0, 3, 3)
+       movq    [A], xmm0
+       ret
+ENDFUNC
+
+SSEFUNC(gcm_mulk_64l_x86ish_pclmul)
+       // On entry, A points to a 64-bit field element in little-endian
+       // words format; K points to a field-element in register format.  On
+       // exit, A is updated with the product A K.
+
+#if CPUFAM_X86
+       mov     A, [esp + 4]
+       mov     K, [esp + 8]
+       ldgot   ecx
+#endif
+  endprologue
+       movdqa  xmm7, [INTADDR(swaptab_64l, ecx)]
+       movq    xmm0, [A]
+       movq    xmm1, [K]
+       pshufb  xmm0, xmm7
+       mul64
+       pshufb  xmm0, xmm7
+       movq    [A], xmm0
+       ret
+ENDFUNC
+
+SSEFUNC(gcm_mulk_96b_x86ish_pclmul)
+       // On entry, A points to a 96-bit field element in big-endian words
+       // format; K points to a field-element in register format (i.e., 16
+       // bytes, with the first four bytes zero).  On exit, A is updated
+       // with the product A K.
+
+#if CPUFAM_X86
+       mov     A, [esp + 4]
+       mov     K, [esp + 8]
+#endif
+  endprologue
+       movq    xmm0, [A + 0]
+       movd    xmm2, [A + 8]
+       movdqu  xmm1, [K]
+       punpcklqdq xmm0, xmm2
+       pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
+       mul96
+       pshufd  xmm1, xmm0, SHUF(3, 2, 1, 0)
+       psrldq  xmm0, 4
+       movq    [A + 0], xmm1
+       movd    [A + 8], xmm0
+       ret
+ENDFUNC
+
+SSEFUNC(gcm_mulk_96l_x86ish_pclmul)
+       // On entry, A points to a 96-bit field element in little-endian
+       // words format; K points to a field-element in register format
+       // (i.e., 16 bytes, with the first four bytes zero).  On exit, A is
+       // updated with the product A K.
+
+#if CPUFAM_X86
+       mov     A, [esp + 4]
+       mov     K, [esp + 8]
+       ldgot   ecx
+#endif
+  endprologue
+       movdqa  xmm7, [INTADDR(swaptab_128l, ecx)]
+       movq    xmm0, [A + 0]
+       movd    xmm2, [A + 8]
+       movdqu  xmm1, [K]
+       punpcklqdq xmm0, xmm2
+       pshufb  xmm0, xmm7
+       mul96
+       pshufb  xmm0, xmm7
+       movq    [A + 0], xmm0
+       psrldq  xmm0, 8
+       movd    [A + 8], xmm0
+       ret
+ENDFUNC
+
+SSEFUNC(gcm_mulk_192b_x86ish_pclmul)
+       // On entry, A points to a 192-bit field element in big-endian words
+       // format; K points to a field-element in register format.  On exit,
+       // A is updated with the product A K.
+
+#if CPUFAM_X86
+       mov     A, [esp + 4]
+       mov     K, [esp + 8]
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+       stalloc 2*16 + 8
+       savexmm xmm6, 0
+       savexmm xmm7, 16
+#endif
+  endprologue
+       movdqu  xmm0, [A + 8]
+       movq    xmm1, [A + 0]
+       movdqu  xmm2, [K + 0]
+       movq    xmm3, [K + 16]
+       pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
+       pshufd  xmm1, xmm1, SHUF(1, 0, 3, 3)
+       mul192
+       pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
+       pshufd  xmm1, xmm1, SHUF(1, 0, 3, 3)
+       movdqu  [A + 8], xmm0
+       movq    [A + 0], xmm1
+#if CPUFAM_AMD64 && ABI_WIN
+       rstrxmm xmm6, 0
+       rstrxmm xmm7, 16
+       stfree  2*16 + 8
+#endif
+       ret
+ENDFUNC
+
+SSEFUNC(gcm_mulk_192l_x86ish_pclmul)
+       // On entry, A points to a 192-bit field element in little-endian
+       // words format; K points to a field-element in register format.  On
+       // exit, A is updated with the product A K.
+
+#if CPUFAM_X86
+       mov     A, [esp + 4]
+       mov     K, [esp + 8]
+       ldgot   ecx
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+       stalloc 2*16 + 8
+       savexmm xmm6, 0
+       savexmm xmm7, 16
+#endif
+  endprologue
+       movdqu  xmm0, [A + 8]
+       movq    xmm1, [A + 0]
+       movdqu  xmm2, [K + 0]
+       movq    xmm3, [K + 16]
+       pshufb  xmm0, [INTADDR(swaptab_128l, ecx)]
+       pshufb  xmm1, [INTADDR(swaptab_64l, ecx)]
+       mul192
+       pshufb  xmm0, [INTADDR(swaptab_128l, ecx)]
+       pshufb  xmm1, [INTADDR(swaptab_64l, ecx)]
+       movdqu  [A + 8], xmm0
+       movq    [A + 0], xmm1
+#if CPUFAM_AMD64 && ABI_WIN
+       rstrxmm xmm6, 0
+       rstrxmm xmm7, 16
+       stfree  2*16 + 8
+#endif
+       ret
+ENDFUNC
+
+SSEFUNC(gcm_mulk_256b_x86ish_pclmul)
+       // On entry, A points to a 256-bit field element in big-endian words
+       // format; K points to a field-element in register format.  On exit,
+       // A is updated with the product A K.
+
+#if CPUFAM_X86
+       pushreg ebp
+       setfp
+       mov     A, [esp + 8]
+       mov     K, [esp + 12]
+       and     esp, ~15
+       sub     esp, 16
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+       stalloc 3*16 + 8
+       savexmm xmm6, 0
+       savexmm xmm7, 16
+       savexmm xmm8, 32
+#endif
+  endprologue
+       movdqu  xmm0, [A + 16]
+       movdqu  xmm1, [A + 0]
+       movdqu  xmm2, [K + 0]
+       movdqu  xmm3, [K + 16]
+       pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
+       pshufd  xmm1, xmm1, SHUF(3, 2, 1, 0)
+       mul256
+       pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
+       pshufd  xmm1, xmm1, SHUF(3, 2, 1, 0)
+       movdqu  [A + 16], xmm0
+       movdqu  [A + 0], xmm1
+#if CPUFAM_X86
+       dropfp
+       popreg  ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+       rstrxmm xmm6, 0
+       rstrxmm xmm7, 16
+       rstrxmm xmm8, 32
+       stfree  3*16 + 8
+#endif
+       ret
+ENDFUNC
+
+SSEFUNC(gcm_mulk_256l_x86ish_pclmul)
+       // On entry, A points to a 256-bit field element in little-endian
+       // words format; K points to a field-element in register format.  On
+       // exit, A is updated with the product A K.
+
+#if CPUFAM_X86
+       pushreg ebp
+       setfp
+       mov     A, [esp + 8]
+       mov     K, [esp + 12]
+       and     esp, ~15
+       ldgot   ecx
+       sub     esp, 16
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+       stalloc 3*16 + 8
+       savexmm xmm6, 0
+       savexmm xmm7, 16
+       savexmm xmm8, 32
+#endif
+  endprologue
+       movdqa  xmm7, [INTADDR(swaptab_128l, ecx)]
+       movdqu  xmm0, [A + 16]
+       movdqu  xmm1, [A + 0]
+       movdqu  xmm2, [K + 0]
+       movdqu  xmm3, [K + 16]
+       pshufb  xmm0, xmm7
+       pshufb  xmm1, xmm7
+       mul256
+       movdqa  xmm7, [INTADDR(swaptab_128l, ecx)]
+       pshufb  xmm0, xmm7
+       pshufb  xmm1, xmm7
+       movdqu  [A + 16], xmm0
+       movdqu  [A + 0], xmm1
+#if CPUFAM_X86
+       dropfp
+       popreg  ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+       rstrxmm xmm6, 0
+       rstrxmm xmm7, 16
+       rstrxmm xmm8, 32
+       stfree  3*16 + 8
+#endif
+       ret
+ENDFUNC
+
+       RODATA
+
+       .balign 16
+swaptab_128l:
+       // Table for byte-swapping little-endian words-format blocks larger
+       // than 64 bits.
+       .byte    15,  14,  13,  12,   11,  10,   9,   8
+       .byte     7,   6,   5,   4,    3,   2,   1,   0
+
+       .balign 16
+swaptab_64l:
+       // Table for byte-swapping 64-bit little-endian words-format blocks.
+       .byte     7,   6,   5,   4,    3,   2,   1,   0
+       .byte   255, 255, 255, 255,  255, 255, 255, 255
+
+///----- That's all, folks --------------------------------------------------
diff --git a/symm/gcm.c b/symm/gcm.c

index 12096ae..73b2851 100644 (file)
--- a/symm/gcm.c
+++ b/symm/gcm.c
@@ -234,6 +234,120 @@ static void simple_mktable(const gcm_params *p,
      for (i = 0; i < 32*p->n*p->n; i++) ktab[i] = ENDSWAP32(ktab[i]);
  }
  
+#if CPUFAM_X86 || CPUFAM_AMD64
+static void pclmul_mktable(const gcm_params *p,
+                          uint32 *ktab, const uint32 *k)
+{
+  unsigned n = p->n;
+  unsigned nz;
+  uint32 *t;
+
+  /* We just need to store the value in a way which is convenient for the
+   * assembler code to read back.  That involves reordering the words, and,
+   * in the case of 96-bit blocks, padding with zeroes to fill out a 128-bit
+   * chunk.
+   */
+
+  if (n == 3) nz = 1;
+  else nz = 0;
+  t = ktab + n + nz;
+
+  if (p->f&GCMF_SWAP) while (n--) { *--t = ENDSWAP32(*k); k++; }
+  else while (n--) *--t = *k++;
+  while (nz--) *--t = 0;
+}
+#endif
+
+#if CPUFAM_ARMEL
+static void arm_crypto_mktable(const gcm_params *p,
+                              uint32 *ktab, const uint32 *k)
+{
+  unsigned n = p->n;
+  uint32 *t;
+
+  /* We just need to store the value in a way which is convenient for the
+   * assembler code to read back.  That involves swapping the bytes in each
+   * 64-bit lane.
+   */
+
+  t = ktab;
+  if (p->f&GCMF_SWAP) {
+    while (n >= 2) {
+      t[1] = ENDSWAP32(k[0]); t[0] = ENDSWAP32(k[1]);
+      t += 2; k += 2; n -= 2;
+    }
+    if (n) { t[1] = ENDSWAP32(k[0]); t[0] = 0; }
+  } else {
+    while (n >= 2) {
+      t[1] = k[0]; t[0] = k[1];
+      t += 2; k += 2; n -= 2;
+    }
+    if (n) { t[1] = k[0]; t[0] = 0; }
+  }
+}
+#endif
+
+#if CPUFAM_ARM64
+static uint32 rbit32(uint32 x)
+{
+  uint32 z, t;
+
+#if GCC_VERSION_P(4, 3)
+  /* Two tricks here.  Firstly, two separate steps, rather than a single
+   * block of assembler, to allow finer-grained instruction scheduling.
+   * Secondly, use `ENDSWAP32' so that the compiler can cancel it if the
+   * caller actually wants the bytes reordered.
+   */
+  __asm__("rbit %w0, %w1" : "=r"(t) : "r"(x));
+  z = ENDSWAP32(t);
+#else
+  /* A generic but slightly clever implementation. */
+#  define SWIZZLE(x, m, s) ((((x)&(m)) << (s)) | (((x)&~(m)) >> (s)))
+                                       /* 76543210 */
+  t = SWIZZLE(x, 0x0f0f0f0f, 4);       /* 32107654 -- swap nibbles */
+  t = SWIZZLE(t, 0x33333333, 2);       /* 10325476 -- swap bit pairs */
+  z = SWIZZLE(t, 0x55555555, 1);       /* 01234567 -- swap adjacent bits */
+#  undef SWIZZLE
+#endif
+  return (z);
+}
+
+static void arm64_pmull_mktable(const gcm_params *p,
+                               uint32 *ktab, const uint32 *k)
+{
+  unsigned n = p->n;
+  uint32 *t;
+
+  /* We just need to store the value in a way which is convenient for the
+   * assembler code to read back.  That involves two transformations:
+   *
+   *   * firstly, reversing the order of the bits in each byte; and,
+   *
+   *   * secondly, storing two copies of each 64-bit chunk.
+   *
+   * Note that, in this case, we /want/ the little-endian byte order of GCM,
+   * so endianness-swapping happens in the big-endian case.
+   */
+
+  t = ktab;
+  if (p->f&GCMF_SWAP) {
+    while (n >= 2) {
+      t[0] = t[2] = rbit32(k[0]);
+      t[1] = t[3] = rbit32(k[1]);
+      t += 4; k += 2; n -= 2;
+    }
+    if (n) { t[0] = t[2] = rbit32(k[0]); t[1] = t[3] = 0; }
+  } else {
+    while (n >= 2) {
+      t[0] = t[2] = ENDSWAP32(rbit32(k[0]));
+      t[1] = t[3] = ENDSWAP32(rbit32(k[1]));
+      t += 4; k += 2; n -= 2;
+    }
+    if (n) { t[0] = t[2] = ENDSWAP32(rbit32(k[0])); t[1] = t[3] = 0; }
+  }
+}
+#endif
+
  CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable,
              (const gcm_params *p, uint32 *ktab, const uint32 *k),
              (p, ktab, k),
@@ -241,6 +355,19 @@ CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable,
  
  static gcm_mktable__functype *pick_mktable(void)
  {
+#if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(gcm_mktable, pclmul_mktable,
+                    cpu_feature_p(CPUFEAT_X86_SSSE3) &&
+                    cpu_feature_p(CPUFEAT_X86_PCLMUL));
+#endif
+#if CPUFAM_ARMEL
+  DISPATCH_PICK_COND(gcm_mktable, arm_crypto_mktable,
+                    cpu_feature_p(CPUFEAT_ARM_PMULL));
+#endif
+#if CPUFAM_ARM64
+  DISPATCH_PICK_COND(gcm_mktable, arm64_pmull_mktable,
+                    cpu_feature_p(CPUFEAT_ARM_PMULL));
+#endif
    DISPATCH_PICK_FALLBACK(gcm_mktable, simple_mktable);
  }
  
@@ -271,13 +398,84 @@ static void simple_recover_k(const gcm_params *p,
    else for (i = 0; i < p->n; i++) k[i] = ENDSWAP32(ktab[24*p->n + i]);
  }
  
+#if CPUFAM_X86 || CPUFAM_AMD64
+static void pclmul_recover_k(const gcm_params *p,
+                            uint32 *k, const uint32 *ktab)
+{
+  unsigned n = p->n;
+  unsigned nz;
+  const uint32 *t;
+
+  /* The representation is already independent of the blockcipher endianness.
+   * We need to compensate for padding, and reorder the words.
+   */
+
+  if (n == 3) nz = 1; else nz = 0;
+  t = ktab + n + nz;
+  while (n--) *k++ = *--t;
+}
+#endif
+
+#if CPUFAM_ARMEL
+static void arm_crypto_recover_k(const gcm_params *p,
+                                uint32 *k, const uint32 *ktab)
+{
+  unsigned n = p->n;
+  const uint32 *t;
+
+  /* The representation is already independent of the blockcipher endianness.
+   * We only need to reorder the words.
+   */
+
+  t = ktab;
+  while (n >= 2) { k[1] = t[0]; k[0] = t[1]; t += 2; k += 2; n -= 2; }
+  if (n) k[0] = t[1];
+}
+#endif
+
+#if CPUFAM_ARM64
+static void arm64_pmull_recover_k(const gcm_params *p,
+                                 uint32 *k, const uint32 *ktab)
+{
+  unsigned n = p->n;
+  const uint32 *t;
+
+  /* The representation is already independent of the blockcipher endianness.
+   * We need to skip the duplicate pieces, and unscramble the bytes.
+   */
+
+  t = ktab;
+  while (n >= 2) {
+    k[0] = ENDSWAP32(rbit32(t[0]));
+    k[1] = ENDSWAP32(rbit32(t[1]));
+    t += 4; k += 2; n -= 2;
+  }
+  if (n) k[0] = ENDSWAP32(rbit32(t[0]));
+}
+#endif
+
  CPU_DISPATCH(static, EMPTY, void, recover_k,
              (const gcm_params *p, uint32 *k, const uint32 *ktab),
              (p, k, ktab),
              pick_recover_k, simple_recover_k)
  
  static recover_k__functype *pick_recover_k(void)
-  { DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k); }
+{
+#if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(recover_k, pclmul_recover_k,
+                    cpu_feature_p(CPUFEAT_X86_SSSE3) &&
+                    cpu_feature_p(CPUFEAT_X86_PCLMUL));
+#endif
+#if CPUFAM_ARMEL
+  DISPATCH_PICK_COND(recover_k, arm_crypto_recover_k,
+                    cpu_feature_p(CPUFEAT_ARM_PMULL));
+#endif
+#if CPUFAM_ARM64
+  DISPATCH_PICK_COND(recover_k, arm64_pmull_recover_k,
+                    cpu_feature_p(CPUFEAT_ARM_PMULL));
+#endif
+  DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k);
+}
  
  /* --- @gcm_mulk_N{b,l}@ --- *
   *
@@ -292,6 +490,48 @@ static recover_k__functype *pick_recover_k(void)
   *             function whose performance actually matters.
   */
  
+#if CPUFAM_X86 || CPUFAM_AMD64
+#  define DECL_MULK_X86ISH(var) extern gcm_mulk_##var##__functype      \
+  gcm_mulk_##var##_x86ish_pclmul_avx,                                  \
+  gcm_mulk_##var##_x86ish_pclmul;
+#  define PICK_MULK_X86ISH(var) do {                                   \
+  DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul_avx, \
+                    cpu_feature_p(CPUFEAT_X86_AVX) &&                  \
+                    cpu_feature_p(CPUFEAT_X86_PCLMUL) &&               \
+                    cpu_feature_p(CPUFEAT_X86_SSSE3));                 \
+  DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul,   \
+                    cpu_feature_p(CPUFEAT_X86_PCLMUL) &&               \
+                    cpu_feature_p(CPUFEAT_X86_SSSE3));                 \
+} while (0)
+#else
+#  define DECL_MULK_X86ISH(var)
+#  define PICK_MULK_X86ISH(var) do ; while (0)
+#endif
+
+#if CPUFAM_ARMEL
+#  define DECL_MULK_ARM(var)                                           \
+  extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm_crypto;
+#  define PICK_MULK_ARM(var) do {                                      \
+  DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm_crypto,      \
+                    cpu_feature_p(CPUFEAT_ARM_PMULL));                 \
+} while (0)
+#else
+#  define DECL_MULK_ARM(var)
+#  define PICK_MULK_ARM(var) do ; while (0)
+#endif
+
+#if CPUFAM_ARM64
+#  define DECL_MULK_ARM64(var)                                         \
+  extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm64_pmull;
+#  define PICK_MULK_ARM64(var) do {                                    \
+  DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm64_pmull,     \
+                    cpu_feature_p(CPUFEAT_ARM_PMULL));                 \
+} while (0)
+#else
+#  define DECL_MULK_ARM64(var)
+#  define PICK_MULK_ARM64(var) do ; while (0)
+#endif
+
  #define DEF_MULK(nbits)                                                        \
                                                                         \
  CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##b,                  \
@@ -321,10 +561,27 @@ static void simple_mulk_##nbits(uint32 *a, const uint32 *ktab)            \
    for (i = 0; i < nbits/32; i++) a[i] = z[i];                          \
  }                                                                      \
                                                                         \
+DECL_MULK_X86ISH(nbits##b)                                             \
+DECL_MULK_ARM(nbits##b)                                                        \
+DECL_MULK_ARM64(nbits##b)                                              \
  static gcm_mulk_##nbits##b##__functype *pick_mulk_##nbits##b(void)     \
-  { DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits); } \
+{                                                                      \
+  PICK_MULK_X86ISH(nbits##b);                                          \
+  PICK_MULK_ARM(nbits##b);                                             \
+  PICK_MULK_ARM64(nbits##b);                                           \
+  DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits);    \
+}                                                                      \
+                                                                       \
+DECL_MULK_X86ISH(nbits##l)                                             \
+DECL_MULK_ARM(nbits##l)                                                        \
+DECL_MULK_ARM64(nbits##l)                                              \
  static gcm_mulk_##nbits##l##__functype *pick_mulk_##nbits##l(void)     \
-  { DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits); }
+{                                                                      \
+  PICK_MULK_X86ISH(nbits##l);                                          \
+  PICK_MULK_ARM(nbits##l);                                             \
+  PICK_MULK_ARM64(nbits##l);                                           \
+  DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits);    \
+}
  
  GCM_WIDTHS(DEF_MULK)
  
diff --git a/utils/gcm-ref b/utils/gcm-ref

new file mode 100755 (executable)

index 0000000..ccbf432
--- /dev/null
+++ b/utils/gcm-ref
@@ -0,0 +1,502 @@
+#! /usr/bin/python
+### -*- coding: utf-8 -*-
+
+from sys import argv, exit
+
+import catacomb as C
+
+###--------------------------------------------------------------------------
+### Random utilities.
+
+def words(s):
+  """Split S into 32-bit pieces and report their values as hex."""
+  return ' '.join('%08x' % C.MP.loadb(s[i:i + 4])
+                  for i in xrange(0, len(s), 4))
+
+def words_64(s):
+  """Split S into 64-bit pieces and report their values as hex."""
+  return ' '.join('%016x' % C.MP.loadb(s[i:i + 8])
+                  for i in xrange(0, len(s), 8))
+
+def repmask(val, wd, n):
+  """Return a mask consisting of N copies of the WD-bit value VAL."""
+  v = C.GF(val)
+  a = C.GF(0)
+  for i in xrange(n): a = (a << wd) | v
+  return a
+
+def combs(things, k):
+  """Iterate over all possible combinations of K of the THINGS."""
+  ii = range(k)
+  n = len(things)
+  while True:
+    yield [things[i] for i in ii]
+    for j in xrange(k):
+      if j == k - 1: lim = n
+      else: lim = ii[j + 1]
+      i = ii[j] + 1
+      if i < lim:
+        ii[j] = i
+        break
+      ii[j] = j
+    else:
+      return
+
+POLYMAP = {}
+
+def poly(nbits):
+  """
+  Return the lexically first irreducible polynomial of degree NBITS of lowest
+  weight.
+  """
+  try: return POLYMAP[nbits]
+  except KeyError: pass
+  base = C.GF(0).setbit(nbits).setbit(0)
+  for k in xrange(1, nbits, 2):
+    for cc in combs(range(1, nbits), k):
+      p = base + sum(C.GF(0).setbit(c) for c in cc)
+      if p.irreduciblep(): POLYMAP[nbits] = p; return p
+  raise ValueError, nbits
+
+def gcm_mangle(x):
+  """Flip the bits within each byte according to GCM's insane convention."""
+  y = C.WriteBuffer()
+  for b in x:
+    b = ord(b)
+    bb = 0
+    for i in xrange(8):
+      bb <<= 1
+      if b&1: bb |= 1
+      b >>= 1
+    y.putu8(bb)
+  return y.contents
+
+def endswap_words_32(x):
+  """End-swap each 32-bit word of X."""
+  x = C.ReadBuffer(x)
+  y = C.WriteBuffer()
+  while x.left: y.putu32l(x.getu32b())
+  return y.contents
+
+def endswap_words_64(x):
+  """End-swap each 64-bit word of X."""
+  x = C.ReadBuffer(x)
+  y = C.WriteBuffer()
+  while x.left: y.putu64l(x.getu64b())
+  return y.contents
+
+def endswap_bytes(x):
+  """End-swap X by bytes."""
+  y = C.WriteBuffer()
+  for ch in reversed(x): y.put(ch)
+  return y.contents
+
+def gfmask(n):
+  return C.GF(C.MP(0).setbit(n) - 1)
+
+def gcm_mul(x, y):
+  """Multiply X and Y according to the GCM rules."""
+  w = len(x)
+  p = poly(8*w)
+  u, v = C.GF.loadl(gcm_mangle(x)), C.GF.loadl(gcm_mangle(y))
+  z = (u*v)%p
+  return gcm_mangle(z.storel(w))
+
+DEMOMAP = {}
+def demo(func):
+  name = func.func_name
+  assert(name.startswith('demo_'))
+  DEMOMAP[name[5:].replace('_', '-')] = func
+  return func
+
+def iota(i = 0):
+  vi = [i]
+  def next(): vi[0] += 1; return vi[0] - 1
+  return next
+
+###--------------------------------------------------------------------------
+### Portable table-driven implementation.
+
+def shift_left(x):
+  """Given a field element X (in external format), return X t."""
+  w = len(x)
+  p = poly(8*w)
+  return gcm_mangle(C.GF.storel((C.GF.loadl(gcm_mangle(x)) << 1)%p))
+
+def table_common(u, v, flip, getword, ixmask):
+  """
+  Multiply U by V using table lookup; common for `table-b' and `table-l'.
+
+  This matches the `simple_mulk_...' implementation in `gcm.c'.  One-entry
+  per bit is the best we can manage if we want a constant-time
+  implementation: processing n bits at a time means we need to scan
+  (2^n - 1)/n times as much memory.
+
+    * FLIP is a function (assumed to be an involution) on one argument X to
+      convert X from external format to table-entry format or back again.
+
+    * GETWORD is a function on one argument B to retrieve the next 32-bit
+      chunk of a field element held in a `ReadBuffer'.  Bits within a word
+      are processed most-significant first.
+
+    * IXMASK is a mask XORed into table indices to permute the table so that
+      it's order matches that induced by GETWORD.
+
+  The table is built such that tab[i XOR IXMASK] = U t^i.
+  """
+  w = len(u); assert(w == len(v))
+  a = C.ByteString.zero(w)
+  tab = [None]*(8*w)
+  for i in xrange(8*w):
+    print ';; %9s = %7s = %s' % ('utab[%d]' % i, 'u t^%d' % i, words(u))
+    tab[i ^ ixmask] = flip(u)
+    u = shift_left(u)
+  v = C.ReadBuffer(v)
+  i = 0
+  while v.left:
+    t = getword(v)
+    for j in xrange(32):
+      bit = (t >> 31)&1
+      if bit: a ^= tab[i]
+      print ';; %6s = %d: a <- %s [%9s = %s]' % \
+        ('v[%d]' % (i ^ ixmask), bit, words(a),
+         'utab[%d]' % (i ^ ixmask), words(tab[i]))
+      i += 1; t <<= 1
+  return flip(a)
+
+@demo
+def demo_table_b(u, v):
+  """Big-endian table lookup."""
+  return table_common(u, v, lambda x: x, lambda b: b.getu32b(), 0)
+
+@demo
+def demo_table_l(u, v):
+  """Little-endian table lookup."""
+  return table_common(u, v, endswap_words, lambda b: b.getu32l(), 0x18)
+
+###--------------------------------------------------------------------------
+### Implementation using 64×64->128-bit binary polynomial multiplication.
+
+_i = iota()
+TAG_INPUT_U = _i()
+TAG_INPUT_V = _i()
+TAG_KPIECE_U = _i()
+TAG_KPIECE_V = _i()
+TAG_PRODPIECE = _i()
+TAG_PRODSUM = _i()
+TAG_PRODUCT = _i()
+TAG_SHIFTED = _i()
+TAG_REDCBITS = _i()
+TAG_REDCFULL = _i()
+TAG_REDCMIX = _i()
+TAG_OUTPUT = _i()
+
+def split_gf(x, n):
+  n /= 8
+  return [C.GF.loadb(x[i:i + n]) for i in xrange(0, len(x), n)]
+
+def join_gf(xx, n):
+  x = C.GF(0)
+  for i in xrange(len(xx)): x = (x << n) | xx[i]
+  return x
+
+def present_gf(x, w, n, what):
+  firstp = True
+  m = gfmask(n)
+  for i in xrange(0, w, 128):
+    print ';; %12s%c         =%s' % \
+      (firstp and what or '',
+       firstp and ':' or ' ',
+       ''.join([j < w
+                and '          0x%s' % hex(((x >> j)&m).storeb(n/8))
+                or ''
+                for j in xrange(i, i + 128, n)]))
+    firstp = False
+
+def present_gf_pclmul(tag, wd, x, w, n, what):
+  if tag != TAG_PRODPIECE: present_gf(x, w, n, what)
+
+def reverse(x, w):
+  return C.GF.loadl(x.storeb(w/8))
+
+def rev32(x):
+  w = x.noctets
+  m_ffff = repmask(0xffff, 32, w/4)
+  m_ff = repmask(0xff, 16, w/2)
+  x = ((x&m_ffff) << 16) | ((x >> 16)&m_ffff)
+  x = ((x&m_ff) << 8) | ((x >> 8)&m_ff)
+  return x
+
+def rev8(x):
+  w = x.noctets
+  m_0f = repmask(0x0f, 8, w)
+  m_33 = repmask(0x33, 8, w)
+  m_55 = repmask(0x55, 8, w)
+  x = ((x&m_0f) << 4) | ((x >> 4)&m_0f)
+  x = ((x&m_33) << 2) | ((x >> 2)&m_33)
+  x = ((x&m_55) << 1) | ((x >> 1)&m_55)
+  return x
+
+def present_gf_mullp64(tag, wd, x, w, n, what):
+  if tag == TAG_PRODPIECE or tag == TAG_REDCFULL:
+    return
+  elif (wd == 128 or wd == 64) and TAG_PRODSUM <= tag <= TAG_PRODUCT:
+    y = x
+  elif (wd == 96 or wd == 192 or wd == 256) and \
+       TAG_PRODSUM <= tag < TAG_OUTPUT:
+    y = x
+  else:
+    xx = x.storeb(w/8)
+    extra = len(xx)%8
+    if extra: xx += C.ByteString.zero(8 - extra)
+    yb = C.WriteBuffer()
+    for i in xrange(len(xx), 0, -8): yb.put(xx[i - 8:i])
+    y = C.GF.loadb(yb.contents)
+  present_gf(y, (w + 63)&~63, n, what)
+
+def present_gf_pmull(tag, wd, x, w, n, what):
+  if tag == TAG_PRODPIECE or tag == TAG_REDCFULL or tag == TAG_SHIFTED:
+    return
+  elif tag == TAG_INPUT_V or tag == TAG_KPIECE_V:
+    bx = C.ReadBuffer(x.storeb(w/8))
+    by = C.WriteBuffer()
+    while bx.left: chunk = bx.get(8); by.put(chunk).put(chunk)
+    x = C.GF.loadb(by.contents)
+    w *= 2
+  elif TAG_PRODSUM <= tag <= TAG_PRODUCT:
+    x <<= 1
+  y = reverse(rev8(x), w)
+  present_gf(y, w, n, what)
+
+def poly64_mul_simple(u, v, presfn, wd, dispwd, mulwd, uwhat, vwhat):
+  """
+  Multiply U by V, returning the product.
+
+  This is the fallback long multiplication.
+  """
+
+  uw, vw = 8*len(u), 8*len(v)
+
+  ## We start by carving the operands into 64-bit pieces.  This is
+  ## straightforward except for the 96-bit case, where we end up with two
+  ## short pieces which we pad at the beginning.
+  if uw%mulwd: pad = (-uw)%mulwd; u += C.ByteString.zero(pad); uw += pad
+  if vw%mulwd: pad = (-uw)%mulwd; v += C.ByteString.zero(pad); vw += pad
+  uu = split_gf(u, mulwd)
+  vv = split_gf(v, mulwd)
+
+  ## Report and accumulate the individual product pieces.
+  x = C.GF(0)
+  ulim, vlim = uw/mulwd, vw/mulwd
+  for i in xrange(ulim + vlim - 2, -1, -1):
+    t = C.GF(0)
+    for j in xrange(max(0, i - vlim + 1), min(vlim, i + 1)):
+      s = uu[ulim - 1 - i + j]*vv[vlim - 1 - j]
+      presfn(TAG_PRODPIECE, wd, s, 2*mulwd, dispwd,
+             '%s_%d %s_%d' % (uwhat, i - j, vwhat, j))
+      t += s
+    presfn(TAG_PRODSUM, wd, t, 2*mulwd, dispwd,
+           '(%s %s)_%d' % (uwhat, vwhat, ulim + vlim - 2 - i))
+    x += t << (mulwd*i)
+  presfn(TAG_PRODUCT, wd, x, uw + vw, dispwd, '%s %s' % (uwhat, vwhat))
+
+  return x
+
+def poly64_mul_karatsuba(u, v, klimit, presfn, wd,
+                         dispwd, mulwd, uwhat, vwhat):
+  """
+  Multiply U by V, returning the product.
+
+  If the length of U and V is at least KLIMIT, and the operands are otherwise
+  suitable, then do Karatsuba--Ofman multiplication; otherwise, delegate to
+  `poly64_mul_simple'.
+  """
+  w = 8*len(u)
+
+  if w < klimit or w != 8*len(v) or w%(2*mulwd) != 0:
+    return poly64_mul_simple(u, v, presfn, wd, dispwd, mulwd, uwhat, vwhat)
+
+  hw = w/2
+  u0, u1 = u[:hw/8], u[hw/8:]
+  v0, v1 = v[:hw/8], v[hw/8:]
+  uu, vv = u0 ^ u1, v0 ^ v1
+
+  presfn(TAG_KPIECE_U, wd, C.GF.loadb(uu), hw, dispwd, '%s*' % uwhat)
+  presfn(TAG_KPIECE_V, wd, C.GF.loadb(vv), hw, dispwd, '%s*' % vwhat)
+  uuvv = poly64_mul_karatsuba(uu, vv, klimit, presfn, wd, dispwd, mulwd,
+                              '%s*' % uwhat, '%s*' % vwhat)
+
+  presfn(TAG_KPIECE_U, wd, C.GF.loadb(u0), hw, dispwd, '%s0' % uwhat)
+  presfn(TAG_KPIECE_V, wd, C.GF.loadb(v0), hw, dispwd, '%s0' % vwhat)
+  u0v0 = poly64_mul_karatsuba(u0, v0, klimit, presfn, wd, dispwd, mulwd,
+                              '%s0' % uwhat, '%s0' % vwhat)
+
+  presfn(TAG_KPIECE_U, wd, C.GF.loadb(u1), hw, dispwd, '%s1' % uwhat)
+  presfn(TAG_KPIECE_V, wd, C.GF.loadb(v1), hw, dispwd, '%s1' % vwhat)
+  u1v1 = poly64_mul_karatsuba(u1, v1, klimit, presfn, wd, dispwd, mulwd,
+                              '%s1' % uwhat, '%s1' % vwhat)
+
+  uvuv = uuvv + u0v0 + u1v1
+  presfn(TAG_PRODSUM, wd, uvuv, w, dispwd, '%s!%s' % (uwhat, vwhat))
+
+  x = u1v1 + (uvuv << hw) + (u0v0 << w)
+  presfn(TAG_PRODUCT, wd, x, 2*w, dispwd, '%s %s' % (uwhat, vwhat))
+  return x
+
+def poly64_common(u, v, presfn, dispwd = 32, mulwd = 64, redcwd = 32,
+                  klimit = 256):
+  """
+  Multiply U by V using a primitive 64-bit binary polynomial mutliplier.
+
+  Such a multiplier exists as the appallingly-named `pclmul[lh]q[lh]qdq' on
+  x86, and as `vmull.p64'/`pmull' on ARM.
+
+  Operands arrive in a `register format', which is a byte-swapped variant of
+  the external format.  Implementations differ on the precise details,
+  though.
+  """
+
+  ## We work in two main phases: first, calculate the full double-width
+  ## product; and, second, reduce it modulo the field polynomial.
+
+  w = 8*len(u); assert(w == 8*len(v))
+  p = poly(w)
+  presfn(TAG_INPUT_U, w, C.GF.loadb(u), w, dispwd, 'u')
+  presfn(TAG_INPUT_V, w, C.GF.loadb(v), w, dispwd, 'v')
+
+  ## So, on to the first part: the multiplication.
+  x = poly64_mul_karatsuba(u, v, klimit, presfn, w, dispwd, mulwd, 'u', 'v')
+
+  ## Now we have to shift everything up one bit to account for GCM's crazy
+  ## bit ordering.
+  y = x << 1
+  if w == 96: y >>= 64
+  presfn(TAG_SHIFTED, w, y, 2*w, dispwd, 'y')
+
+  ## Now for the reduction.
+  ##
+  ## Our polynomial has the form p = t^d + r where r = SUM_{0<=i<d} r_i t^i,
+  ## with each r_i either 0 or 1.  Because we choose the lexically earliest
+  ## irreducible polynomial with the necessary degree, r_i = 1 happens only
+  ## for a small number of tiny i.  In our field, we have t^d = r.
+  ##
+  ## We carve the product into convenient n-bit pieces, for some n dividing d
+  ## -- typically n = 32 or 64.  Let d = m n, and write y = SUM_{0<=i<2m} y_i
+  ## t^{ni}.  The upper portion, the y_i with i >= m, needs reduction; but
+  ## y_i t^{ni} = y_i r t^{n(i-m)}, so we just multiply the top half by r and
+  ## add it to the bottom half.  This all depends on r_i = 0 for all i >=
+  ## n/2.  We process each nonzero coefficient of r separately, in two
+  ## passes.
+  ##
+  ## Multiplying a chunk y_i by some t^j is the same as shifting it left by j
+  ## bits (or would be if GCM weren't backwards, but let's not worry about
+  ## that right now).  The high j bits will spill over into the next chunk,
+  ## while the low n - j bits will stay where they are.  It's these high bits
+  ## which cause trouble -- particularly the high bits of the top chunk,
+  ## since we'll add them on to y_m, which will need further reduction.  But
+  ## only the topmost j bits will do this.
+  ##
+  ## The trick is that we do all of the bits which spill over first -- all of
+  ## the top j bits in each chunk, for each j -- in one pass, and then a
+  ## second pass of all the bits which don't.  Because j, j' < n/2 for any
+  ## two nonzero coefficient degrees j and j', we have j + j' < n whence j <
+  ## n - j' -- so all of the bits contributed to y_m will be handled in the
+  ## second pass when we handle the bits that don't spill over.
+  rr = [i for i in xrange(1, w) if p.testbit(i)]
+  m = gfmask(redcwd)
+
+  ## Handle the spilling bits.
+  yy = split_gf(y.storeb(w/4), redcwd)
+  b = C.GF(0)
+  for rj in rr:
+    br = [(yi << (redcwd - rj))&m for yi in yy[w/redcwd:]]
+    presfn(TAG_REDCBITS, w, join_gf(br, redcwd), w, dispwd, 'b(%d)' % rj)
+    b += join_gf(br, redcwd) << (w - redcwd)
+  presfn(TAG_REDCFULL, w, b, 2*w, dispwd, 'b')
+  s = y + b
+  presfn(TAG_REDCMIX, w, s, 2*w, dispwd, 's')
+
+  ## Handle the nonspilling bits.
+  ss = split_gf(s.storeb(w/4), redcwd)
+  a = C.GF(0)
+  for rj in rr:
+    ar = [si >> rj for si in ss[w/redcwd:]]
+    presfn(TAG_REDCBITS, w, join_gf(ar, redcwd), w, dispwd, 'a(%d)' % rj)
+    a += join_gf(ar, redcwd)
+  presfn(TAG_REDCFULL, w, a, w, dispwd, 'a')
+
+  ## Mix everything together.
+  m = gfmask(w)
+  z = (s&m) + (s >> w) + a
+  presfn(TAG_OUTPUT, w, z, w, dispwd, 'z')
+
+  ## And we're done.
+  return z.storeb(w/8)
+
+@demo
+def demo_pclmul(u, v):
+  return poly64_common(u, v, presfn = present_gf_pclmul)
+
+@demo
+def demo_vmullp64(u, v):
+  w = 8*len(u)
+  return poly64_common(u, v, presfn = present_gf_mullp64,
+                       redcwd = w%64 == 32 and 32 or 64)
+
+@demo
+def demo_pmull(u, v):
+  w = 8*len(u)
+  return poly64_common(u, v, presfn = present_gf_pmull,
+                       redcwd = w%64 == 32 and 32 or 64)
+
+###--------------------------------------------------------------------------
+### @@@ Random debris to be deleted. @@@
+
+def cutting_room_floor():
+
+  x = C.bytes('cde4bef260d7bcda163547d348b7551195e77022907dd1df')
+  y = C.bytes('f7dac5c9941d26d0c6eb14ad568f86edd1dc9268eeee5332')
+
+  u, v = C.GF.loadb(x), C.GF.loadb(y)
+
+  g = u*v << 1
+  print 'y = %s' % words(g.storeb(48))
+  b1 = (g&repmask(0x01, 32, 6)) << 191
+  b2 = (g&repmask(0x03, 32, 6)) << 190
+  b7 = (g&repmask(0x7f, 32, 6)) << 185
+  b = b1 + b2 + b7
+  print 'b = %s' % words(b.storeb(48)[0:28])
+  h = g + b
+  print 'w = %s' % words(h.storeb(48))
+
+  a0 = (h&repmask(0xffffffff, 32, 6)) << 192
+  a1 = (h&repmask(0xfffffffe, 32, 6)) << 191
+  a2 = (h&repmask(0xfffffffc, 32, 6)) << 190
+  a7 = (h&repmask(0xffffff80, 32, 6)) << 185
+  a = a0 + a1 + a2 + a7
+
+  print '     a_1 = %s' % words(a1.storeb(48)[0:24])
+  print '     a_2 = %s' % words(a2.storeb(48)[0:24])
+  print '     a_7 = %s' % words(a7.storeb(48)[0:24])
+
+  print 'low+unit = %s' % words((h + a0).storeb(48)[0:24])
+  print ' low+0,2 = %s' % words((h + a0 + a2).storeb(48)[0:24])
+  print '     1,7 = %s' % words((a1 + a7).storeb(48)[0:24])
+
+  print 'a = %s' % words(a.storeb(48)[0:24])
+  z = h + a
+  print 'z = %s' % words(z.storeb(48))
+
+  z = gcm_mul(x, y)
+  print 'u v mod p = %s' % words(z)
+
+###--------------------------------------------------------------------------
+### Main program.
+
+style = argv[1]
+u = C.bytes(argv[2])
+v = C.bytes(argv[3])
+zz = DEMOMAP[style](u, v)
+assert zz == gcm_mul(u, v)
+
+###----- That's all, folks --------------------------------------------------
author	Mark Wooding <mdw@distorted.org.uk>
	Tue, 13 Nov 2018 11:28:53 +0000 (11:28 +0000)
committer	Mark Wooding <mdw@distorted.org.uk>
	Sun, 8 Sep 2019 17:35:39 +0000 (18:35 +0100)
base/dispatch.c		patch \| blob \| blame \| history
base/dispatch.h		patch \| blob \| blame \| history
symm/Makefile.am		patch \| blob \| blame \| history
symm/gcm-arm-crypto.S	[new file with mode: 0644]	patch \| blob
symm/gcm-arm64-pmull.S	[new file with mode: 0644]	patch \| blob
symm/gcm-x86ish-pclmul.S	[new file with mode: 0644]	patch \| blob
symm/gcm.c		patch \| blob \| blame \| history
utils/gcm-ref	[new file with mode: 0755]	patch \| blob