#include <mLib/bits.h>
+#include "dispatch.h"
#include "gcm.h"
#include "gcm-def.h"
* multiply (vaguely) efficiently by @k@.
*/
-void gcm_mktable(const gcm_params *p, uint32 *ktab, const uint32 *k)
+/* Portable key-table builder: the fallback selected when no CPU-specific
+ * implementation is available (see `pick_mktable' below).  NOTE(review):
+ * the middle of this function is elided context in this hunk -- only the
+ * rename to internal linkage is new; the table layout is presumably as
+ * described in `gcm-def.h' -- confirm against the full file.
+ */
+static void simple_mktable(const gcm_params *p,
+ uint32 *ktab, const uint32 *k)
{
unsigned m = (p->f&GCMF_SWAP ? 0x18 : 0);
unsigned i, j, o = m*p->n;
for (i = 0; i < 32*p->n*p->n; i++) ktab[i] = ENDSWAP32(ktab[i]);
}
-/* --- @gcm_mulk_N@ --- *
+#if CPUFAM_X86 || CPUFAM_AMD64
+/* Key-table setup for the x86 `pclmul' assembler path. */
+static void pclmul_mktable(const gcm_params *p,
+ uint32 *ktab, const uint32 *k)
+{
+ unsigned n = p->n;
+ unsigned nz;
+ uint32 *t;
+
+ /* We just need to store the value in a way which is convenient for the
+ * assembler code to read back. That involves reordering the words, and,
+ * in the case of 96-bit blocks, padding with zeroes to fill out a 128-bit
+ * chunk.
+ */
+
+ /* n == 3 words is the 96-bit-block case: pad with one zero word. */
+ if (n == 3) nz = 1;
+ else nz = 0;
+ t = ktab + n + nz;
+
+ /* Store the words of k in reverse order, byte-swapping if the cipher
+ * presents them little-endian; then fill in the zero padding.
+ */
+ if (p->f&GCMF_SWAP) while (n--) { *--t = ENDSWAP32(*k); k++; }
+ else while (n--) *--t = *k++;
+ while (nz--) *--t = 0;
+}
+#endif
+
+#if CPUFAM_ARMEL
+/* Key-table setup for the 32-bit ARM crypto-extension assembler path. */
+static void arm_crypto_mktable(const gcm_params *p,
+ uint32 *ktab, const uint32 *k)
+{
+ unsigned n = p->n;
+ uint32 *t;
+
+ /* We just need to store the value in a way which is convenient for the
+ * assembler code to read back. That involves swapping the bytes in each
+ * 64-bit lane.
+ */
+
+ t = ktab;
+ if (p->f&GCMF_SWAP) {
+ while (n >= 2) {
+ t[1] = ENDSWAP32(k[0]); t[0] = ENDSWAP32(k[1]);
+ t += 2; k += 2; n -= 2;
+ }
+ /* Odd word count: pair the final word with a zero to fill the lane. */
+ if (n) { t[1] = ENDSWAP32(k[0]); t[0] = 0; }
+ } else {
+ while (n >= 2) {
+ t[1] = k[0]; t[0] = k[1];
+ t += 2; k += 2; n -= 2;
+ }
+ if (n) { t[1] = k[0]; t[0] = 0; }
+ }
+}
+#endif
+
+#if CPUFAM_ARM64
+/* --- @rbit32@ --- *
+ *
+ * Arguments: @uint32 x@ = a 32-bit word
+ *
+ * Returns: @x@ with the order of its bits reversed.
+ */
+static uint32 rbit32(uint32 x)
+{
+ uint32 z, t;
+
+#if GCC_VERSION_P(4, 3)
+ /* Two tricks here. Firstly, two separate steps, rather than a single
+ * block of assembler, to allow finer-grained instruction scheduling.
+ * Secondly, use `ENDSWAP32' so that the compiler can cancel it if the
+ * caller actually wants the bytes reordered.
+ */
+ __asm__("rbit %w0, %w1" : "=r"(t) : "r"(x));
+ z = ENDSWAP32(t);
+#else
+ /* A generic but slightly clever implementation. */
+# define SWIZZLE(x, m, s) ((((x)&(m)) << (s)) | (((x)&~(m)) >> (s)))
+ /* 76543210 */
+ t = SWIZZLE(x, 0x0f0f0f0f, 4); /* 32107654 -- swap nibbles */
+ t = SWIZZLE(t, 0x33333333, 2); /* 10325476 -- swap bit pairs */
+ z = SWIZZLE(t, 0x55555555, 1); /* 01234567 -- swap adjacent bits */
+# undef SWIZZLE
+#endif
+ return (z);
+}
+
+/* Key-table setup for the AArch64 `pmull' assembler path. */
+static void arm64_pmull_mktable(const gcm_params *p,
+ uint32 *ktab, const uint32 *k)
+{
+ unsigned n = p->n;
+ uint32 *t;
+
+ /* We just need to store the value in a way which is convenient for the
+ * assembler code to read back. That involves two transformations:
+ *
+ * * firstly, reversing the order of the bits in each byte; and,
+ *
+ * * secondly, storing two copies of each 64-bit chunk.
+ *
+ * Note that, in this case, we /want/ the little-endian byte order of GCM,
+ * so endianness-swapping happens in the big-endian case.
+ */
+
+ t = ktab;
+ if (p->f&GCMF_SWAP) {
+ while (n >= 2) {
+ t[0] = t[2] = rbit32(k[0]);
+ t[1] = t[3] = rbit32(k[1]);
+ t += 4; k += 2; n -= 2; /* t advances by 4: duplicated 64-bit lane */
+ }
+ if (n) { t[0] = t[2] = rbit32(k[0]); t[1] = t[3] = 0; }
+ } else {
+ while (n >= 2) {
+ t[0] = t[2] = ENDSWAP32(rbit32(k[0]));
+ t[1] = t[3] = ENDSWAP32(rbit32(k[1]));
+ t += 4; k += 2; n -= 2;
+ }
+ if (n) { t[0] = t[2] = ENDSWAP32(rbit32(k[0])); t[1] = t[3] = 0; }
+ }
+}
+#endif
+
+/* Declare `gcm_mktable' as a runtime-dispatched entry point: the first call
+ * runs `pick_mktable' to choose an implementation, falling back to
+ * `simple_mktable'.
+ */
+CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable,
+ (const gcm_params *p, uint32 *ktab, const uint32 *k),
+ (p, ktab, k),
+ pick_mktable, simple_mktable)
+
+/* Select a `gcm_mktable' implementation from the runtime CPU features;
+ * each candidate must match the table layout its `mulk' code expects.
+ */
+static gcm_mktable__functype *pick_mktable(void)
+{
+#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(gcm_mktable, pclmul_mktable,
+ cpu_feature_p(CPUFEAT_X86_SSSE3) &&
+ cpu_feature_p(CPUFEAT_X86_PCLMUL));
+#endif
+#if CPUFAM_ARMEL
+ DISPATCH_PICK_COND(gcm_mktable, arm_crypto_mktable,
+ cpu_feature_p(CPUFEAT_ARM_PMULL));
+#endif
+#if CPUFAM_ARM64
+ DISPATCH_PICK_COND(gcm_mktable, arm64_pmull_mktable,
+ cpu_feature_p(CPUFEAT_ARM_PMULL));
+#endif
+ DISPATCH_PICK_FALLBACK(gcm_mktable, simple_mktable);
+}
+
+/* --- @recover_k@ --- *
+ *
+ * Arguments: @const gcm_params *p@ = pointer to the parameters
+ * @uint32 *k@ = block-sized vector in which to store %$k$%
+ * @const uint32 *ktab@ = the table encoding %$k$%
+ *
+ * Returns: ---
+ *
+ * Use: Recovers %$k$%, the secret from which @ktab@ was made by
+ * @gcm_mktable@, from the table, and stores it in internal
+ * (big-endian) form in @k@.
+ */
+
+/* Recover k from a table built by `simple_mktable'. */
+static void simple_recover_k(const gcm_params *p,
+ uint32 *k, const uint32 *ktab)
+{
+ unsigned i;
+
+ /* If the blockcipher is big-endian, then the key is simply in the first
+ * table element, in the right format. If the blockcipher is little-endian
+ * then it's in element 24, and the bytes need swapping.
+ */
+
+ /* `Element 24' is in units of whole blocks, i.e., word offset 24*p->n. */
+ if (!(p->f&GCMF_SWAP)) for (i = 0; i < p->n; i++) k[i] = ktab[i];
+ else for (i = 0; i < p->n; i++) k[i] = ENDSWAP32(ktab[24*p->n + i]);
+}
+
+#if CPUFAM_X86 || CPUFAM_AMD64
+/* Recover k from a table built by `pclmul_mktable'. */
+static void pclmul_recover_k(const gcm_params *p,
+ uint32 *k, const uint32 *ktab)
+{
+ unsigned n = p->n;
+ unsigned nz;
+ const uint32 *t;
+
+ /* The representation is already independent of the blockcipher endianness.
+ * We need to compensate for padding, and reorder the words.
+ */
+
+ /* Skip the zero word used to pad 96-bit blocks, then read back in
+ * reverse, mirroring the store order in `pclmul_mktable'.
+ */
+ if (n == 3) nz = 1; else nz = 0;
+ t = ktab + n + nz;
+ while (n--) *k++ = *--t;
+}
+#endif
+
+#if CPUFAM_ARMEL
+/* Recover k from a table built by `arm_crypto_mktable'. */
+static void arm_crypto_recover_k(const gcm_params *p,
+ uint32 *k, const uint32 *ktab)
+{
+ unsigned n = p->n;
+ const uint32 *t;
+
+ /* The representation is already independent of the blockcipher endianness.
+ * We only need to reorder the words.
+ */
+
+ t = ktab;
+ while (n >= 2) { k[1] = t[0]; k[0] = t[1]; t += 2; k += 2; n -= 2; }
+ /* Odd word count: only t[1] holds data; its partner was stored as zero. */
+ if (n) k[0] = t[1];
+}
+#endif
+
+#if CPUFAM_ARM64
+/* Recover k from a table built by `arm64_pmull_mktable'. */
+static void arm64_pmull_recover_k(const gcm_params *p,
+ uint32 *k, const uint32 *ktab)
+{
+ unsigned n = p->n;
+ const uint32 *t;
+
+ /* The representation is already independent of the blockcipher endianness.
+ * We need to skip the duplicate pieces, and unscramble the bytes.
+ */
+
+ t = ktab;
+ while (n >= 2) {
+ k[0] = ENDSWAP32(rbit32(t[0]));
+ k[1] = ENDSWAP32(rbit32(t[1]));
+ t += 4; k += 2; n -= 2; /* step 4: each 64-bit lane is stored twice */
+ }
+ if (n) k[0] = ENDSWAP32(rbit32(t[0]));
+}
+#endif
+
+/* Declare `recover_k' as a runtime-dispatched internal function; the picked
+ * implementation must match whichever variant `pick_mktable' chose, which
+ * is why the feature tests below mirror `pick_mktable' exactly.
+ */
+CPU_DISPATCH(static, EMPTY, void, recover_k,
+ (const gcm_params *p, uint32 *k, const uint32 *ktab),
+ (p, k, ktab),
+ pick_recover_k, simple_recover_k)
+
+static recover_k__functype *pick_recover_k(void)
+{
+#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(recover_k, pclmul_recover_k,
+ cpu_feature_p(CPUFEAT_X86_SSSE3) &&
+ cpu_feature_p(CPUFEAT_X86_PCLMUL));
+#endif
+#if CPUFAM_ARMEL
+ DISPATCH_PICK_COND(recover_k, arm_crypto_recover_k,
+ cpu_feature_p(CPUFEAT_ARM_PMULL));
+#endif
+#if CPUFAM_ARM64
+ DISPATCH_PICK_COND(recover_k, arm64_pmull_recover_k,
+ cpu_feature_p(CPUFEAT_ARM_PMULL));
+#endif
+ DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k);
+}
+
+/* --- @gcm_mulk_N{b,l}@ --- *
*
* Arguments: @uint32 *a@ = accumulator to multiply
* @const uint32 *ktab@ = table constructed by @gcm_mktable@
*
* Use: Multiply @a@ by @k@ (implicitly represented in @ktab@),
* updating @a@ in-place. There are separate functions for each
- * supported block size because this is the function whose
- * performance actually matters.
+ * supported block size and endianness because this is the
+ * function whose performance actually matters.
*/
+/* Declaration and dispatch helpers for the assembler `mulk' variants.
+ *
+ * For each supported CPU family, `DECL_MULK_<fam>(var)' declares the
+ * external assembler implementation(s) for variant @var@ (a width-and-
+ * endianness tag such as `128b' or `96l'), and `PICK_MULK_<fam>(var)'
+ * expands to the matching `DISPATCH_PICK_COND' feature tests.  On other
+ * platforms the declaration macro expands to nothing and the picker to an
+ * empty statement, so `DEF_MULK' below needs no conditionalization.
+ */
+#if CPUFAM_X86 || CPUFAM_AMD64
+# define DECL_MULK_X86ISH(var) extern gcm_mulk_##var##__functype \
+ gcm_mulk_##var##_x86ish_pclmul_avx, \
+ gcm_mulk_##var##_x86ish_pclmul;
+# define PICK_MULK_X86ISH(var) do { \
+ DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul_avx, \
+ cpu_feature_p(CPUFEAT_X86_AVX) && \
+ cpu_feature_p(CPUFEAT_X86_PCLMUL) && \
+ cpu_feature_p(CPUFEAT_X86_SSSE3)); \
+ DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul, \
+ cpu_feature_p(CPUFEAT_X86_PCLMUL) && \
+ cpu_feature_p(CPUFEAT_X86_SSSE3)); \
+} while (0)
+#else
+# define DECL_MULK_X86ISH(var)
+# define PICK_MULK_X86ISH(var) do ; while (0)
+#endif
+
+#if CPUFAM_ARMEL
+# define DECL_MULK_ARM(var) \
+ extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm_crypto;
+# define PICK_MULK_ARM(var) do { \
+ DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm_crypto, \
+ cpu_feature_p(CPUFEAT_ARM_PMULL)); \
+} while (0)
+#else
+# define DECL_MULK_ARM(var)
+# define PICK_MULK_ARM(var) do ; while (0)
+#endif
+
+#if CPUFAM_ARM64
+# define DECL_MULK_ARM64(var) \
+ extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm64_pmull;
+# define PICK_MULK_ARM64(var) do { \
+ DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm64_pmull, \
+ cpu_feature_p(CPUFEAT_ARM_PMULL)); \
+} while (0)
+#else
+# define DECL_MULK_ARM64(var)
+# define PICK_MULK_ARM64(var) do ; while (0)
+#endif
+
+/* For block width @nbits@, define the dispatched public functions
+ * `gcm_mulk_<nbits>b' and `gcm_mulk_<nbits>l' (big- and little-endian
+ * presentations of the accumulator), the shared portable fallback
+ * `simple_mulk_<nbits>', and the two feature-based pickers.  NOTE(review):
+ * the interior of the fallback is elided context in this hunk; comments are
+ * deliberately placed outside the macro to avoid touching continuations.
+ */
#define DEF_MULK(nbits) \
-void gcm_mulk_##nbits(uint32 *a, const uint32 *ktab) \
+ \
+CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##b, \
+ (uint32 *a, const uint32 *ktab), (a, ktab), \
+ pick_mulk_##nbits##b, simple_mulk_##nbits) \
+CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##l, \
+ (uint32 *a, const uint32 *ktab), (a, ktab), \
+ pick_mulk_##nbits##l, simple_mulk_##nbits) \
+ \
+static void simple_mulk_##nbits(uint32 *a, const uint32 *ktab) \
{ \
uint32 m, t; \
uint32 z[nbits/32]; \
} \
\
for (i = 0; i < nbits/32; i++) a[i] = z[i]; \
+} \
+ \
+DECL_MULK_X86ISH(nbits##b) \
+DECL_MULK_ARM(nbits##b) \
+DECL_MULK_ARM64(nbits##b) \
+static gcm_mulk_##nbits##b##__functype *pick_mulk_##nbits##b(void) \
+{ \
+ PICK_MULK_X86ISH(nbits##b); \
+ PICK_MULK_ARM(nbits##b); \
+ PICK_MULK_ARM64(nbits##b); \
+ DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits); \
+} \
+ \
+DECL_MULK_X86ISH(nbits##l) \
+DECL_MULK_ARM(nbits##l) \
+DECL_MULK_ARM64(nbits##l) \
+static gcm_mulk_##nbits##l##__functype *pick_mulk_##nbits##l(void) \
+{ \
+ PICK_MULK_X86ISH(nbits##l); \
+ PICK_MULK_ARM(nbits##l); \
+ PICK_MULK_ARM64(nbits##l); \
+ DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits); \
}
+
+/* Instantiate the above for every supported block width. */
GCM_WIDTHS(DEF_MULK)
+/* `MULK(n, f, a, ktab)' multiplies the n-word accumulator @a@ in place by
+ * the key encoded in @ktab@, choosing the little-endian entry point when
+ * @f@ has `GCMF_SWAP' set and aborting on an unsupported width.
+ * NOTE(review): @n@ is expanded unparenthesized inside `switch (n)';
+ * callers currently pass simple expressions, but a comma expression in that
+ * position would misparse -- confirm if new callers are added.
+ */
+#define GCM_MULK_CASE(nbits) \
+ case nbits/32: \
+ if (_f&GCMF_SWAP) gcm_mulk_##nbits##l(_a, _ktab); \
+ else gcm_mulk_##nbits##b(_a, _ktab); \
+ break;
+#define MULK(n, f, a, ktab) do { \
+ uint32 *_a = (a); const uint32 *_ktab = (ktab); \
+ unsigned _f = (f); \
+ switch (n) { \
+ GCM_WIDTHS(GCM_MULK_CASE) \
+ default: abort(); \
+ } \
+} while (0)
+
/*----- Other utilities ---------------------------------------------------*/
/* --- @putlen@ --- *
{
unsigned i;
- /* Convert the block from bytes into words, using the appropriate
- * convention.
- */
if (p->f&GCMF_SWAP)
for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_L(q); q += 4; }
else
for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_B(q); q += 4; }
-
- /* Dispatch to the correct multiply-by-%$k$% function. */
- switch (p->n) {
-#define CASE(nbits) case nbits/32: gcm_mulk_##nbits(a, ktab); break;
- GCM_WIDTHS(CASE)
-#undef CASE
- default: abort();
- }
+ MULK(p->n, p->f, a, ktab);
}
/* --- @gcm_ghashdone@ --- *
/* Start by retrieving %$k$% from the table, and convert it to big-endian
* form.
*/
- if (!(p->f&GCMF_SWAP)) for (j = 0; j < p->n; j++) u[j] = ktab[j];
- else for (j = 0; j < p->n; j++) u[j] = ENDSWAP32(ktab[24*p->n + j]);
+ recover_k(p, u, ktab);
/* Now calculate %$k^n$%. */
i = ULONG_BITS;
}
}
+/*----- Test rig ----------------------------------------------------------*/
+
+#ifdef TEST_RIG
+
+#include <mLib/macros.h>
+#include <mLib/quis.h>
+#include <mLib/testrig.h>
+
+#ifdef ENABLE_ASM_DEBUG
+# include "regdump.h"
+#endif
+
+/* Print a failure report for test @test@ at width @nbits@: the three input
+ * vectors @v[0..2]@ and the computed value @d@, labelled with @ref@.
+ */
+static void report_failure(const char *test, unsigned nbits,
+ const char *ref, dstr v[], dstr *d)
+{
+ printf("test %s failed (nbits = %u)", test, nbits);
+ printf("\n\tx = "); type_hex.dump(&v[0], stdout);
+ printf("\n\ty = "); type_hex.dump(&v[1], stdout);
+ printf("\n\tz = "); type_hex.dump(&v[2], stdout);
+ printf("\n\t%s' = ", ref); type_hex.dump(d, stdout);
+ putchar('\n');
+}
+
+/* Trampoline from a bit count to the word-count-keyed `MULK' dispatcher. */
+static void mulk(unsigned nbits, unsigned f, uint32 *x, const uint32 *ktab)
+ { MULK(nbits/32, f, x, ktab); }
+
+/* Exercise GCM multiplication over the field defined by @poly@: the plain
+ * `mul', then `gcm_mktable'/`recover_k'/`mulk' round trips with both big-
+ * and little-endian key presentation.  The vectors are @v[0]@ = x,
+ * @v[1]@ = y, and @v[2]@ = the expected product z; returns nonzero on
+ * success.
+ */
+static int test_mul(uint32 poly, dstr v[])
+{
+ uint32 x[GCM_NMAX], y[GCM_NMAX], z[GCM_NMAX], ktab[32*GCM_NMAX*GCM_NMAX];
+ gcm_params p;
+ dstr d = DSTR_INIT;
+ unsigned i, nbits;
+ int ok = 1;
+ enum { I_x, I_y, I_z };
+
+ nbits = 8*v[0].len; p.f = 0; p.n = nbits/32; p.poly = poly;
+ dstr_ensure(&d, nbits/8); d.len = nbits/8;
+
+ /* Load x and y from the test vectors using byte order E. */
+#define LOADXY(E) do { \
+ for (i = 0; i < nbits/32; i++) { \
+ x[i] = LOAD32_##E(v[I_x].buf + 4*i); \
+ y[i] = LOAD32_##E(v[I_y].buf + 4*i); \
+ } \
+} while (0)
+
+ /* Copy the given operand into the working accumulator z. */
+#define INITZ(x) do { \
+ for (i = 0; i < nbits/32; i++) z[i] = (x)[i]; \
+} while (0)
+
+ /* Serialize z with byte order E and compare against vector `ref'. */
+#define CHECK(E, what, ref) do { \
+ for (i = 0; i < nbits/32; i++) STORE32_##E(d.buf + 4*i, z[i]); \
+ if (MEMCMP(d.buf, !=, v[I_##ref].buf, nbits/8)) \
+ { ok = 0; report_failure(what, nbits, #ref, v, &d); } \
+} while (0)
+
+ /* Build a table from y, check recover_k inverts it, then check x*y. */
+#define TEST_PREP_1(E, x, y, what) do { \
+ gcm_mktable(&p, ktab, y); \
+ recover_k(&p, z, ktab); CHECK(B, "mktable/recover_k (" #y ")", y); \
+ INITZ(x); mulk(nbits, p.f, z, ktab); CHECK(E, what " (k = " #y ")", z); \
+} while (0)
+
+ /* Multiplication is commutative: run with the key on each side. */
+#define TEST_PREP(E, what) do { \
+ TEST_PREP_1(E, x, y, what); \
+ TEST_PREP_1(E, y, x, what); \
+} while (0)
+
+ /* First, test plain multiply. */
+ LOADXY(B); mul(&p, z, x, y); CHECK(B, "gcm_mul", z);
+
+ /* Next, test big-endian prepared key. */
+ LOADXY(B); TEST_PREP(B, "gcm_kmul_b");
+
+ /* Finally, test little-endian prepared key. */
+ p.f = GCMF_SWAP; LOADXY(L);
+ TEST_PREP(L, "gcm_kmul_l");
+
+#undef LOADXY
+#undef INITZ
+#undef CHECK
+#undef TEST_PREP_1
+#undef TEST_PREP
+
+ /* All done. */
+ return (ok);
+}
+
+/* Generate a per-width test function and a `test_chunk' table entry for
+ * each supported block width.
+ */
+#define TEST(nbits) \
+static int test_mul_##nbits(dstr v[]) \
+ { return (test_mul(GCM_POLY_##nbits, v)); }
+GCM_WIDTHS(TEST)
+#undef TEST
+
+static test_chunk defs[] = {
+#define TEST(nbits) \
+ { "gcm-mul" #nbits, test_mul_##nbits, \
+ { &type_hex, &type_hex, &type_hex, 0 } },
+GCM_WIDTHS(TEST)
+#undef TEST
+ { 0, 0, { 0 } }
+};
+
+/* Test-rig entry point: run the table of tests against `t/gcm'. */
+int main(int argc, char *argv[])
+{
+ ego(argv[0]);
+#ifdef ENABLE_ASM_DEBUG
+ regdump_init();
+#endif
+ test_run(argc, argv, defs, SRCDIR"/t/gcm");
+ return (0);
+}
+
+#endif
+
/*----- That's all, folks -------------------------------------------------*/