From 1d30a9b905cb0d622934dd438117e0a1b354c3f8 Mon Sep 17 00:00:00 2001
From: Mark Wooding <mdw@distorted.org.uk>
Date: Tue, 13 Nov 2018 11:26:56 +0000
Subject: [PATCH] symm/gcm.c: Make `gcm_mktable' and `gcm_mulk_...' be
 CPU-dependent.

A couple of other changes to ease the way:

  * Split `gcm_mulk_...' into two endianness variants, so that
    CPU-specific variants don't have to track what's going on through
    the key table.

  * Abstract out `recover_k' to decode the key value from a table, for
    the use of `gcm_concat'.  This is, of course, necessary if the table
    format is CPU-dependent.

  * Add testing to make sure that `mktable'/`recover_k' agree with each
    other.

There are currently no fancy implementations, but you can tell what's
coming.  No actual functional change, except for logging if you set
`CATACOMB_CPUDISPATCH_DEBUG' in the environment.
---
 symm/gcm-def.h |  16 ++++--
 symm/gcm.c     | 170 +++++++++++++++++++++++++++++++++++++++------------------
 2 files changed, 129 insertions(+), 57 deletions(-)

diff --git a/symm/gcm-def.h b/symm/gcm-def.h
index f8688c4e..34f95aa1 100644
--- a/symm/gcm-def.h
+++ b/symm/gcm-def.h
@@ -107,7 +107,7 @@ typedef struct gcm_params {
 extern void gcm_mktable(const gcm_params */*p*/,
 			uint32 */*ktab*/, const uint32 */*k*/);
 
-/* --- @gcm_mulk_N@ --- *
+/* --- @gcm_mulk_N{b,l}@ --- *
  *
  * Arguments:	@uint32 *a@ = accumulator to multiply
  *		@const uint32 *ktab@ = table constructed by @gcm_mktable@
@@ -116,17 +116,23 @@ extern void gcm_mktable(const gcm_params */*p*/,
  *
  * Use:		Multiply @a@ by @k@ (implicitly represented in @ktab@),
  *		updating @a@ in-place.  There are separate functions for each
- *		supported block size because this is the function whose
- *		performance actually matters.
+ *		supported block size and endianness because this is the
+ *		function whose performance actually matters.
  */
 
 #define GCM_DECL_MULK(nbits)						\
-  extern void gcm_mulk_##nbits(uint32 */*a*/, const uint32 */*ktab*/);
+  extern void gcm_mulk_##nbits##b(uint32 */*a*/, const uint32 */*ktab*/); \
+  extern void gcm_mulk_##nbits##l(uint32 */*a*/, const uint32 */*ktab*/);
 GCM_WIDTHS(GCM_DECL_MULK)
 #undef GCM_DECL_MULK
 
 /* Dispatch to the appropriate variant of @gcm_mulk@. */
-#define GCM_MULK(PRE, a, ktab) BLKC_GLUE(gcm_mulk_, BLKC_BITS(PRE))(a, ktab)
+#define GCM_MULK(PRE, a, ktab)						\
+  BLKC_GLUE(GCM_MULK_, BLKC_ENDIAN(PRE))(BLKC_BITS(PRE), a, ktab)
+#define GCM_MULK_B(nbits, a, ktab)					\
+  BLKC_GLUE(BLKC_GLUE(gcm_mulk_, nbits), b)(a, ktab)
+#define GCM_MULK_L(nbits, a, ktab)					\
+  BLKC_GLUE(BLKC_GLUE(gcm_mulk_, nbits), l)(a, ktab)
 
 /* --- @gcm_ghashdone@ --- *
  *
diff --git a/symm/gcm.c b/symm/gcm.c
index 330ec7b9..12096aec 100644
--- a/symm/gcm.c
+++ b/symm/gcm.c
@@ -33,6 +33,7 @@
 
 #include <mLib/bits.h>
 
+#include "dispatch.h"
 #include "gcm.h"
 #include "gcm-def.h"
 
@@ -203,7 +204,8 @@ static void mul(const gcm_params *p, uint32 *z,
  *		multiply (vaguely) efficiently by @k@.
  */
 
-void gcm_mktable(const gcm_params *p, uint32 *ktab, const uint32 *k)
+static void simple_mktable(const gcm_params *p,
+			   uint32 *ktab, const uint32 *k)
 {
   unsigned m = (p->f&GCMF_SWAP ? 0x18 : 0);
   unsigned i, j, o = m*p->n;
@@ -232,7 +234,52 @@ void gcm_mktable(const gcm_params *p, uint32 *ktab, const uint32 *k)
     for (i = 0; i < 32*p->n*p->n; i++) ktab[i] = ENDSWAP32(ktab[i]);
 }
 
-/* --- @gcm_mulk_N@ --- *
+CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable,
+	     (const gcm_params *p, uint32 *ktab, const uint32 *k),
+	     (p, ktab, k),
+	     pick_mktable, simple_mktable)
+
+static gcm_mktable__functype *pick_mktable(void)
+{
+  DISPATCH_PICK_FALLBACK(gcm_mktable, simple_mktable);
+}
+
+/* --- @recover_k@ --- *
+ *
+ * Arguments:	@const gcm_params *p@ = pointer to the parameters
+ *		@uint32 *k@ = block-sized vector in which to store %$k$%
+ *		@const uint32 *ktab@ = the table encoding %$k$%
+ *
+ * Returns:	---
+ *
+ * Use:		Recovers %$k$%, the secret from which @ktab@ was by
+ *		@gcm_mktable@, from the table, and stores it in internal
+ *		(big-endian) form in @k@.
+ */
+
+static void simple_recover_k(const gcm_params *p,
+			     uint32 *k, const uint32 *ktab)
+{
+  unsigned i;
+
+  /* If the blockcipher is big-endian, then the key is simply in the first
+   * table element, in the right format.  If the blockcipher is little-endian
+   * then it's in element 24, and the bytes need swapping.
+   */
+
+  if (!(p->f&GCMF_SWAP)) for (i = 0; i < p->n; i++) k[i] = ktab[i];
+  else for (i = 0; i < p->n; i++) k[i] = ENDSWAP32(ktab[24*p->n + i]);
+}
+
+CPU_DISPATCH(static, EMPTY, void, recover_k,
+	     (const gcm_params *p, uint32 *k, const uint32 *ktab),
+	     (p, k, ktab),
+	     pick_recover_k, simple_recover_k)
+
+static recover_k__functype *pick_recover_k(void)
+  { DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k); }
+
+/* --- @gcm_mulk_N{b,l}@ --- *
  *
  * Arguments:	@uint32 *a@ = accumulator to multiply
  *		@const uint32 *ktab@ = table constructed by @gcm_mktable@
@@ -241,12 +288,20 @@ void gcm_mktable(const gcm_params *p, uint32 *ktab, const uint32 *k)
  *
  * Use:		Multiply @a@ by @k@ (implicitly represented in @ktab@),
  *		updating @a@ in-place.  There are separate functions for each
- *		supported block size because this is the function whose
- *		performance actually matters.
+ *		supported block size and endianness because this is the
+ *		function whose performance actually matters.
  */
 
 #define DEF_MULK(nbits)							\
-void gcm_mulk_##nbits(uint32 *a, const uint32 *ktab)			\
+									\
+CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##b,			\
+	     (uint32 *a, const uint32 *ktab), (a, ktab),		\
+	     pick_mulk_##nbits##b, simple_mulk_##nbits)			\
+CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##l,			\
+	     (uint32 *a, const uint32 *ktab), (a, ktab),		\
+	     pick_mulk_##nbits##l, simple_mulk_##nbits)			\
+									\
+static void simple_mulk_##nbits(uint32 *a, const uint32 *ktab)		\
 {									\
   uint32 m, t;								\
   uint32 z[nbits/32];							\
@@ -264,13 +319,23 @@ void gcm_mulk_##nbits(uint32 *a, const uint32 *ktab)			\
   }									\
 									\
   for (i = 0; i < nbits/32; i++) a[i] = z[i];				\
-}
+}									\
+									\
+static gcm_mulk_##nbits##b##__functype *pick_mulk_##nbits##b(void)	\
+  { DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits); } \
+static gcm_mulk_##nbits##l##__functype *pick_mulk_##nbits##l(void)	\
+  { DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits); }
+
 GCM_WIDTHS(DEF_MULK)
 
 #define GCM_MULK_CASE(nbits)						\
-  case nbits/32: gcm_mulk_##nbits(_a, _ktab); break;
-#define MULK(n, a, ktab) do {						\
+  case nbits/32:							\
+    if (_f&GCMF_SWAP) gcm_mulk_##nbits##l(_a, _ktab);			\
+    else gcm_mulk_##nbits##b(_a, _ktab);				\
+    break;
+#define MULK(n, f, a, ktab) do {					\
   uint32 *_a = (a); const uint32 *_ktab = (ktab);			\
+  unsigned _f = (f);							\
   switch (n) {								\
     GCM_WIDTHS(GCM_MULK_CASE)						\
     default: abort();							\
@@ -341,7 +406,7 @@ static void mix(const gcm_params *p, uint32 *a,
     for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_L(q); q += 4; }
   else
     for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_B(q); q += 4; }
-  MULK(p->n, a, ktab);
+  MULK(p->n, p->f, a, ktab);
 }
 
 /* --- @gcm_ghashdone@ --- *
@@ -424,8 +489,7 @@ void gcm_concat(const gcm_params *p, uint32 *z, const uint32 *x,
     /* Start by retrieving %$k$% from the table, and convert it to big-endian
      * form.
      */
-    if (!(p->f&GCMF_SWAP)) for (j = 0; j < p->n; j++) u[j] = ktab[j];
-    else for (j = 0; j < p->n; j++) u[j] = ENDSWAP32(ktab[24*p->n + j]);
+    recover_k(p, u, ktab);
 
     /* Now calculate %$k^n$%. */
     i = ULONG_BITS;
@@ -461,18 +525,18 @@ void gcm_concat(const gcm_params *p, uint32 *z, const uint32 *x,
 #include <mLib/testrig.h>
 
 static void report_failure(const char *test, unsigned nbits,
-			   dstr v[], dstr *d)
+			   const char *ref, dstr v[], dstr *d)
 {
   printf("test %s failed (nbits = %u)", test, nbits);
   printf("\n\tx  = "); type_hex.dump(&v[0], stdout);
   printf("\n\ty  = "); type_hex.dump(&v[1], stdout);
   printf("\n\tz  = "); type_hex.dump(&v[2], stdout);
-  printf("\n\tz' = "); type_hex.dump(d, stdout);
+  printf("\n\t%s' = ", ref); type_hex.dump(d, stdout);
   putchar('\n');
 }
 
-static void mulk(unsigned nbits, uint32 *x, const uint32 *ktab)
-  { MULK(nbits/32, x, ktab); }
+static void mulk(unsigned nbits, unsigned f, uint32 *x, const uint32 *ktab)
+  { MULK(nbits/32, f, x, ktab); }
 
 static int test_mul(uint32 poly, dstr v[])
 {
@@ -481,52 +545,54 @@ static int test_mul(uint32 poly, dstr v[])
   dstr d = DSTR_INIT;
   unsigned i, nbits;
   int ok = 1;
+  enum { I_x, I_y, I_z };
 
   nbits = 8*v[0].len; p.f = 0; p.n = nbits/32; p.poly = poly;
   dstr_ensure(&d, nbits/8); d.len = nbits/8;
 
+#define LOADXY(E) do {							\
+  for (i = 0; i < nbits/32; i++) {					\
+    x[i] = LOAD32_##E(v[I_x].buf + 4*i);				\
+    y[i] = LOAD32_##E(v[I_y].buf + 4*i);				\
+  }									\
+} while (0)
+
+#define INITZ(x) do {							\
+  for (i = 0; i < nbits/32; i++) z[i] = (x)[i];				\
+} while (0)
+
+#define CHECK(E, what, ref) do {					\
+  for (i = 0; i < nbits/32; i++) STORE32_##E(d.buf + 4*i, z[i]);	\
+  if (memcmp(d.buf, v[I_##ref].buf, nbits/8) != 0)			\
+    { ok = 0; report_failure(what, nbits, #ref, v, &d); }		\
+} while (0)
+
+#define TEST_PREP_1(E, x, y, what) do {					\
+  gcm_mktable(&p, ktab, y);						\
+  recover_k(&p, z, ktab); CHECK(B, "mktable/recover_k (" #y ")", y);	\
+  INITZ(x); mulk(nbits, p.f, z, ktab); CHECK(E, what " (k = " #y ")", z); \
+} while (0)
+
+#define TEST_PREP(E, what) do {						\
+  TEST_PREP_1(E, x, y, what);						\
+  TEST_PREP_1(E, y, x, what);						\
+} while (0)
+
   /* First, test plain multiply. */
-  for (i = 0; i < nbits/32; i++)
-    { x[i] = LOAD32_B(v[0].buf + 4*i); y[i] = LOAD32_B(v[1].buf + 4*i); }
-  mul(&p, z, x, y);
-  for (i = 0; i < nbits/32; i++) STORE32_B(d.buf + 4*i, z[i]);
-  if (memcmp(d.buf, v[2].buf, nbits/8) != 0)
-    { ok = 0; report_failure("gcm_mul", nbits, v, &d); }
+  LOADXY(B); mul(&p, z, x, y); CHECK(B, "gcm_mul", z);
 
   /* Next, test big-endian prepared key. */
-  for (i = 0; i < nbits/32; i++)
-    { x[i] = LOAD32_B(v[0].buf + 4*i); y[i] = LOAD32_B(v[1].buf + 4*i); }
-  gcm_mktable(&p, ktab, y);
-  mulk(nbits, x, ktab);
-  for (i = 0; i < nbits/32; i++) STORE32_B(d.buf + 4*i, x[i]);
-  if (memcmp(d.buf, v[2].buf, nbits/8) != 0)
-    { ok = 0; report_failure("gcm_kmul_b(k = y)", nbits, v, &d); }
-
-  for (i = 0; i < nbits/32; i++)
-    { x[i] = LOAD32_B(v[0].buf + 4*i); y[i] = LOAD32_B(v[1].buf + 4*i); }
-  gcm_mktable(&p, ktab, x);
-  mulk(nbits, y, ktab);
-  for (i = 0; i < nbits/32; i++) STORE32_B(d.buf + 4*i, y[i]);
-  if (memcmp(d.buf, v[2].buf, nbits/8) != 0)
-    { ok = 0; report_failure("gcm_kmul_b(k = x)", nbits, v, &d); }
+  LOADXY(B); TEST_PREP(B, "gcm_kmul_b");
 
   /* Finally, test little-endian prepared key. */
-  p.f = GCMF_SWAP;
-  for (i = 0; i < nbits/32; i++)
-    { x[i] = LOAD32_L(v[0].buf + 4*i); y[i] = LOAD32_L(v[1].buf + 4*i); }
-  gcm_mktable(&p, ktab, y);
-  mulk(nbits, x, ktab);
-  for (i = 0; i < nbits/32; i++) STORE32_L(d.buf + 4*i, x[i]);
-  if (memcmp(d.buf, v[2].buf, nbits/8) != 0)
-    { ok = 0; report_failure("gcm_kmul_l(k = y)", nbits, v, &d); }
-
-  for (i = 0; i < nbits/32; i++)
-    { x[i] = LOAD32_L(v[0].buf + 4*i); y[i] = LOAD32_L(v[1].buf + 4*i); }
-  gcm_mktable(&p, ktab, x);
-  mulk(nbits, y, ktab);
-  for (i = 0; i < nbits/32; i++) STORE32_L(d.buf + 4*i, y[i]);
-  if (memcmp(d.buf, v[2].buf, nbits/8) != 0)
-    { ok = 0; report_failure("gcm_kmul_l(k = x)", nbits, v, &d); }
+  p.f = GCMF_SWAP; LOADXY(L);
+  TEST_PREP(L, "gcm_kmul_l");
+
+#undef LOADXY
+#undef INITZ
+#undef CHECK
+#undef TEST_PREP_1
+#undef TEST_PREP
 
   /* All done. */
   return (ok);
-- 
2.11.0