#include <mLib/bits.h>
+#include "dispatch.h"
#include "gcm.h"
#include "gcm-def.h"
* multiply (vaguely) efficiently by @k@.
*/
-void gcm_mktable(const gcm_params *p, uint32 *ktab, const uint32 *k)
+static void simple_mktable(const gcm_params *p,
+ uint32 *ktab, const uint32 *k)
{
unsigned m = (p->f&GCMF_SWAP ? 0x18 : 0);
unsigned i, j, o = m*p->n;
for (i = 0; i < 32*p->n*p->n; i++) ktab[i] = ENDSWAP32(ktab[i]);
}
-/* --- @gcm_mulk_N@ --- *
+CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable,
+ (const gcm_params *p, uint32 *ktab, const uint32 *k),
+ (p, ktab, k),
+ pick_mktable, simple_mktable)
+/* --- @pick_mktable@ --- *
+ *
+ * Arguments:   ---
+ *
+ * Returns:     A pointer to a @gcm_mktable@ implementation (per the
+ *              @gcm_mktable__functype@ return type).
+ *
+ * Use:         Named as the picker in the @CPU_DISPATCH@ invocation for
+ *              @gcm_mktable@.  Only the portable @simple_mktable@ is
+ *              offered here; presumably @DISPATCH_PICK_FALLBACK@ selects
+ *              and returns it -- confirm against "dispatch.h".
+ */
+static gcm_mktable__functype *pick_mktable(void)
+{
+ DISPATCH_PICK_FALLBACK(gcm_mktable, simple_mktable);
+}
+
+/* --- @recover_k@ --- *
+ *
+ * Arguments: @const gcm_params *p@ = pointer to the parameters
+ * @uint32 *k@ = block-sized vector in which to store %$k$%
+ * @const uint32 *ktab@ = the table encoding %$k$%
+ *
+ * Returns: ---
+ *
+ * Use: Recovers %$k$%, the secret from which @ktab@ was built by
+ * @gcm_mktable@, from the table, and stores it in internal
+ * (big-endian) form in @k@.
+ */
+
+static void simple_recover_k(const gcm_params *p,
+                             uint32 *k, const uint32 *ktab)
+{
+  const uint32 *src;
+  unsigned w;
+
+  /* A table built for a little-endian blockcipher keeps the key in element
+   * 24, with its bytes swapped, so swap each word back on the way out.
+   * Otherwise the key is simply the first table element, already in the
+   * right format.
+   */
+
+  if (p->f&GCMF_SWAP) {
+    src = ktab + 24*p->n;
+    for (w = 0; w < p->n; w++) k[w] = ENDSWAP32(src[w]);
+  } else {
+    for (w = 0; w < p->n; w++) k[w] = ktab[w];
+  }
+}
+
+CPU_DISPATCH(static, EMPTY, void, recover_k,
+ (const gcm_params *p, uint32 *k, const uint32 *ktab),
+ (p, k, ktab),
+ pick_recover_k, simple_recover_k)
+/* --- @pick_recover_k@ --- *
+ *
+ * Arguments:   ---
+ *
+ * Returns:     A pointer to a @recover_k@ implementation (per the
+ *              @recover_k__functype@ return type).
+ *
+ * Use:         Named as the picker in the @CPU_DISPATCH@ invocation for
+ *              @recover_k@.  Only the portable @simple_recover_k@ is
+ *              offered here; presumably @DISPATCH_PICK_FALLBACK@ selects
+ *              and returns it -- confirm against "dispatch.h".
+ */
+static recover_k__functype *pick_recover_k(void)
+ { DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k); }
+
+/* --- @gcm_mulk_N{b,l}@ --- *
*
* Arguments: @uint32 *a@ = accumulator to multiply
* @const uint32 *ktab@ = table constructed by @gcm_mktable@
*
* Use: Multiply @a@ by @k@ (implicitly represented in @ktab@),
* updating @a@ in-place. There are separate functions for each
- * supported block size because this is the function whose
- * performance actually matters.
+ * supported block size and endianness because this is the
+ * function whose performance actually matters.
*/
#define DEF_MULK(nbits) \
-void gcm_mulk_##nbits(uint32 *a, const uint32 *ktab) \
+ \
+CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##b, \
+ (uint32 *a, const uint32 *ktab), (a, ktab), \
+ pick_mulk_##nbits##b, simple_mulk_##nbits) \
+CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##l, \
+ (uint32 *a, const uint32 *ktab), (a, ktab), \
+ pick_mulk_##nbits##l, simple_mulk_##nbits) \
+ \
+static void simple_mulk_##nbits(uint32 *a, const uint32 *ktab) \
{ \
uint32 m, t; \
uint32 z[nbits/32]; \
} \
\
for (i = 0; i < nbits/32; i++) a[i] = z[i]; \
-}
+} \
+ \
+static gcm_mulk_##nbits##b##__functype *pick_mulk_##nbits##b(void) \
+ { DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits); } \
+static gcm_mulk_##nbits##l##__functype *pick_mulk_##nbits##l(void) \
+ { DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits); }
+
GCM_WIDTHS(DEF_MULK)
#define GCM_MULK_CASE(nbits) \
- case nbits/32: gcm_mulk_##nbits(_a, _ktab); break;
-#define MULK(n, a, ktab) do { \
+ case nbits/32: \
+ if (_f&GCMF_SWAP) gcm_mulk_##nbits##l(_a, _ktab); \
+ else gcm_mulk_##nbits##b(_a, _ktab); \
+ break;
+#define MULK(n, f, a, ktab) do { \
uint32 *_a = (a); const uint32 *_ktab = (ktab); \
+ unsigned _f = (f); \
switch (n) { \
GCM_WIDTHS(GCM_MULK_CASE) \
default: abort(); \
for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_L(q); q += 4; }
else
for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_B(q); q += 4; }
- MULK(p->n, a, ktab);
+ MULK(p->n, p->f, a, ktab);
}
/* --- @gcm_ghashdone@ --- *
/* Start by retrieving %$k$% from the table, and convert it to big-endian
* form.
*/
- if (!(p->f&GCMF_SWAP)) for (j = 0; j < p->n; j++) u[j] = ktab[j];
- else for (j = 0; j < p->n; j++) u[j] = ENDSWAP32(ktab[24*p->n + j]);
+ recover_k(p, u, ktab);
/* Now calculate %$k^n$%. */
i = ULONG_BITS;
#include <mLib/testrig.h>
static void report_failure(const char *test, unsigned nbits,
- dstr v[], dstr *d)
+ const char *ref, dstr v[], dstr *d)
{
printf("test %s failed (nbits = %u)", test, nbits);
printf("\n\tx = "); type_hex.dump(&v[0], stdout);
printf("\n\ty = "); type_hex.dump(&v[1], stdout);
printf("\n\tz = "); type_hex.dump(&v[2], stdout);
- printf("\n\tz' = "); type_hex.dump(d, stdout);
+ printf("\n\t%s' = ", ref); type_hex.dump(d, stdout);
putchar('\n');
}
-static void mulk(unsigned nbits, uint32 *x, const uint32 *ktab)
- { MULK(nbits/32, x, ktab); }
+static void mulk(unsigned nbits, unsigned f, uint32 *x, const uint32 *ktab)
+ { MULK(nbits/32, f, x, ktab); } /* @MULK@ switches on 32-bit words */
static int test_mul(uint32 poly, dstr v[])
{
dstr d = DSTR_INIT;
unsigned i, nbits;
int ok = 1;
+ enum { I_x, I_y, I_z };
nbits = 8*v[0].len; p.f = 0; p.n = nbits/32; p.poly = poly;
dstr_ensure(&d, nbits/8); d.len = nbits/8;
+#define LOADXY(E) do { \
+ for (i = 0; i < nbits/32; i++) { \
+ x[i] = LOAD32_##E(v[I_x].buf + 4*i); \
+ y[i] = LOAD32_##E(v[I_y].buf + 4*i); \
+ } \
+} while (0)
+
+#define INITZ(x) do { \
+ for (i = 0; i < nbits/32; i++) z[i] = (x)[i]; \
+} while (0)
+
+#define CHECK(E, what, ref) do { \
+ for (i = 0; i < nbits/32; i++) STORE32_##E(d.buf + 4*i, z[i]); \
+ if (memcmp(d.buf, v[I_##ref].buf, nbits/8) != 0) \
+ { ok = 0; report_failure(what, nbits, #ref, v, &d); } \
+} while (0)
+
+#define TEST_PREP_1(E, x, y, what) do { \
+ gcm_mktable(&p, ktab, y); \
+ recover_k(&p, z, ktab); CHECK(B, "mktable/recover_k (" #y ")", y); \
+ INITZ(x); mulk(nbits, p.f, z, ktab); CHECK(E, what " (k = " #y ")", z); \
+} while (0)
+
+#define TEST_PREP(E, what) do { \
+ TEST_PREP_1(E, x, y, what); \
+ TEST_PREP_1(E, y, x, what); \
+} while (0)
+
/* First, test plain multiply. */
- for (i = 0; i < nbits/32; i++)
- { x[i] = LOAD32_B(v[0].buf + 4*i); y[i] = LOAD32_B(v[1].buf + 4*i); }
- mul(&p, z, x, y);
- for (i = 0; i < nbits/32; i++) STORE32_B(d.buf + 4*i, z[i]);
- if (memcmp(d.buf, v[2].buf, nbits/8) != 0)
- { ok = 0; report_failure("gcm_mul", nbits, v, &d); }
+ LOADXY(B); mul(&p, z, x, y); CHECK(B, "gcm_mul", z);
/* Next, test big-endian prepared key. */
- for (i = 0; i < nbits/32; i++)
- { x[i] = LOAD32_B(v[0].buf + 4*i); y[i] = LOAD32_B(v[1].buf + 4*i); }
- gcm_mktable(&p, ktab, y);
- mulk(nbits, x, ktab);
- for (i = 0; i < nbits/32; i++) STORE32_B(d.buf + 4*i, x[i]);
- if (memcmp(d.buf, v[2].buf, nbits/8) != 0)
- { ok = 0; report_failure("gcm_kmul_b(k = y)", nbits, v, &d); }
-
- for (i = 0; i < nbits/32; i++)
- { x[i] = LOAD32_B(v[0].buf + 4*i); y[i] = LOAD32_B(v[1].buf + 4*i); }
- gcm_mktable(&p, ktab, x);
- mulk(nbits, y, ktab);
- for (i = 0; i < nbits/32; i++) STORE32_B(d.buf + 4*i, y[i]);
- if (memcmp(d.buf, v[2].buf, nbits/8) != 0)
- { ok = 0; report_failure("gcm_kmul_b(k = x)", nbits, v, &d); }
+ LOADXY(B); TEST_PREP(B, "gcm_kmul_b");
/* Finally, test little-endian prepared key. */
- p.f = GCMF_SWAP;
- for (i = 0; i < nbits/32; i++)
- { x[i] = LOAD32_L(v[0].buf + 4*i); y[i] = LOAD32_L(v[1].buf + 4*i); }
- gcm_mktable(&p, ktab, y);
- mulk(nbits, x, ktab);
- for (i = 0; i < nbits/32; i++) STORE32_L(d.buf + 4*i, x[i]);
- if (memcmp(d.buf, v[2].buf, nbits/8) != 0)
- { ok = 0; report_failure("gcm_kmul_l(k = y)", nbits, v, &d); }
-
- for (i = 0; i < nbits/32; i++)
- { x[i] = LOAD32_L(v[0].buf + 4*i); y[i] = LOAD32_L(v[1].buf + 4*i); }
- gcm_mktable(&p, ktab, x);
- mulk(nbits, y, ktab);
- for (i = 0; i < nbits/32; i++) STORE32_L(d.buf + 4*i, y[i]);
- if (memcmp(d.buf, v[2].buf, nbits/8) != 0)
- { ok = 0; report_failure("gcm_kmul_l(k = x)", nbits, v, &d); }
+ p.f = GCMF_SWAP; LOADXY(L);
+ TEST_PREP(L, "gcm_kmul_l");
+
+#undef LOADXY
+#undef INITZ
+#undef CHECK
+#undef TEST_PREP_1
+#undef TEST_PREP
/* All done. */
return (ok);