From 1d30a9b905cb0d622934dd438117e0a1b354c3f8 Mon Sep 17 00:00:00 2001 From: Mark Wooding Date: Tue, 13 Nov 2018 11:26:56 +0000 Subject: [PATCH] symm/gcm.c: Make `gcm_mktable' and `gcm_mulk_...' be CPU-dependent. A couple of other changes to ease the way: * Split `gcm_mulk_...' into two endianness variants, so that CPU-specific variants don't have to track what's going on through the key table. * Abstract out `recover_k' to decode the key value from a table, for the use of `gcm_concat'. This is, of course, necessary if the table format is CPU-dependent. * Add testing to make sure that `mktable'/`recover_k' agree with each other. There are currently no fancy implementations, but you can tell what's coming. No actual functional change, except for logging if you set `CATACOMB_CPUDISPATCH_DEBUG' in the environment. --- symm/gcm-def.h | 16 ++++-- symm/gcm.c | 170 +++++++++++++++++++++++++++++++++++++++------------------ 2 files changed, 129 insertions(+), 57 deletions(-) diff --git a/symm/gcm-def.h b/symm/gcm-def.h index f8688c4e..34f95aa1 100644 --- a/symm/gcm-def.h +++ b/symm/gcm-def.h @@ -107,7 +107,7 @@ typedef struct gcm_params { extern void gcm_mktable(const gcm_params */*p*/, uint32 */*ktab*/, const uint32 */*k*/); -/* --- @gcm_mulk_N@ --- * +/* --- @gcm_mulk_N{b,l}@ --- * * * Arguments: @uint32 *a@ = accumulator to multiply * @const uint32 *ktab@ = table constructed by @gcm_mktable@ @@ -116,17 +116,23 @@ extern void gcm_mktable(const gcm_params */*p*/, * * Use: Multiply @a@ by @k@ (implicitly represented in @ktab@), * updating @a@ in-place. There are separate functions for each - * supported block size because this is the function whose - * performance actually matters. + * supported block size and endianness because this is the + * function whose performance actually matters. */ #define GCM_DECL_MULK(nbits) \ - extern void gcm_mulk_##nbits(uint32 */*a*/, const uint32 */*ktab*/); + extern void gcm_mulk_##nbits##b(uint32 */*a*/, const uint32 */*ktab*/); \ + extern void gcm_mulk_##nbits##l(uint32 */*a*/, const uint32 */*ktab*/); GCM_WIDTHS(GCM_DECL_MULK) #undef GCM_DECL_MULK /* Dispatch to the appropriate variant of @gcm_mulk@. */ -#define GCM_MULK(PRE, a, ktab) BLKC_GLUE(gcm_mulk_, BLKC_BITS(PRE))(a, ktab) +#define GCM_MULK(PRE, a, ktab) \ + BLKC_GLUE(GCM_MULK_, BLKC_ENDIAN(PRE))(BLKC_BITS(PRE), a, ktab) +#define GCM_MULK_B(nbits, a, ktab) \ + BLKC_GLUE(BLKC_GLUE(gcm_mulk_, nbits), b)(a, ktab) +#define GCM_MULK_L(nbits, a, ktab) \ + BLKC_GLUE(BLKC_GLUE(gcm_mulk_, nbits), l)(a, ktab) /* --- @gcm_ghashdone@ --- * * diff --git a/symm/gcm.c b/symm/gcm.c index 330ec7b9..12096aec 100644 --- a/symm/gcm.c +++ b/symm/gcm.c @@ -33,6 +33,7 @@ #include +#include "dispatch.h" #include "gcm.h" #include "gcm-def.h" @@ -203,7 +204,8 @@ static void mul(const gcm_params *p, uint32 *z, * multiply (vaguely) efficiently by @k@. */ -void gcm_mktable(const gcm_params *p, uint32 *ktab, const uint32 *k) +static void simple_mktable(const gcm_params *p, + uint32 *ktab, const uint32 *k) { unsigned m = (p->f&GCMF_SWAP ? 0x18 : 0); unsigned i, j, o = m*p->n; @@ -232,7 +234,52 @@ void gcm_mktable(const gcm_params *p, uint32 *ktab, const uint32 *k) for (i = 0; i < 32*p->n*p->n; i++) ktab[i] = ENDSWAP32(ktab[i]); } -/* --- @gcm_mulk_N@ --- * +CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable, + (const gcm_params *p, uint32 *ktab, const uint32 *k), + (p, ktab, k), + pick_mktable, simple_mktable) + +static gcm_mktable__functype *pick_mktable(void) +{ + DISPATCH_PICK_FALLBACK(gcm_mktable, simple_mktable); +} + +/* --- @recover_k@ --- * + * + * Arguments: @const gcm_params *p@ = pointer to the parameters + * @uint32 *k@ = block-sized vector in which to store %$k$% + * @const uint32 *ktab@ = the table encoding %$k$% + * + * Returns: --- + * + * Use: Recovers %$k$%, the secret from which @ktab@ was by + * @gcm_mktable@, from the table, and stores it in internal + * (big-endian) form in @k@. + */ + +static void simple_recover_k(const gcm_params *p, + uint32 *k, const uint32 *ktab) +{ + unsigned i; + + /* If the blockcipher is big-endian, then the key is simply in the first + * table element, in the right format. If the blockcipher is little-endian + * then it's in element 24, and the bytes need swapping. + */ + + if (!(p->f&GCMF_SWAP)) for (i = 0; i < p->n; i++) k[i] = ktab[i]; + else for (i = 0; i < p->n; i++) k[i] = ENDSWAP32(ktab[24*p->n + i]); +} + +CPU_DISPATCH(static, EMPTY, void, recover_k, + (const gcm_params *p, uint32 *k, const uint32 *ktab), + (p, k, ktab), + pick_recover_k, simple_recover_k) + +static recover_k__functype *pick_recover_k(void) + { DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k); } + +/* --- @gcm_mulk_N{b,l}@ --- * * * Arguments: @uint32 *a@ = accumulator to multiply * @const uint32 *ktab@ = table constructed by @gcm_mktable@ @@ -241,12 +288,20 @@ void gcm_mktable(const gcm_params *p, uint32 *ktab, const uint32 *k) * * Use: Multiply @a@ by @k@ (implicitly represented in @ktab@), * updating @a@ in-place. There are separate functions for each - * supported block size because this is the function whose - * performance actually matters. + * supported block size and endianness because this is the + * function whose performance actually matters. */ #define DEF_MULK(nbits) \ -void gcm_mulk_##nbits(uint32 *a, const uint32 *ktab) \ + \ +CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##b, \ + (uint32 *a, const uint32 *ktab), (a, ktab), \ + pick_mulk_##nbits##b, simple_mulk_##nbits) \ +CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##l, \ + (uint32 *a, const uint32 *ktab), (a, ktab), \ + pick_mulk_##nbits##l, simple_mulk_##nbits) \ + \ +static void simple_mulk_##nbits(uint32 *a, const uint32 *ktab) \ { \ uint32 m, t; \ uint32 z[nbits/32]; \ @@ -264,13 +319,23 @@ void gcm_mulk_##nbits(uint32 *a, const uint32 *ktab) \ } \ \ for (i = 0; i < nbits/32; i++) a[i] = z[i]; \ -} +} \ + \ +static gcm_mulk_##nbits##b##__functype *pick_mulk_##nbits##b(void) \ + { DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits); } \ +static gcm_mulk_##nbits##l##__functype *pick_mulk_##nbits##l(void) \ + { DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits); } + GCM_WIDTHS(DEF_MULK) #define GCM_MULK_CASE(nbits) \ - case nbits/32: gcm_mulk_##nbits(_a, _ktab); break; -#define MULK(n, a, ktab) do { \ + case nbits/32: \ + if (_f&GCMF_SWAP) gcm_mulk_##nbits##l(_a, _ktab); \ + else gcm_mulk_##nbits##b(_a, _ktab); \ + break; +#define MULK(n, f, a, ktab) do { \ uint32 *_a = (a); const uint32 *_ktab = (ktab); \ + unsigned _f = (f); \ switch (n) { \ GCM_WIDTHS(GCM_MULK_CASE) \ default: abort(); \ @@ -341,7 +406,7 @@ static void mix(const gcm_params *p, uint32 *a, for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_L(q); q += 4; } else for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_B(q); q += 4; } - MULK(p->n, a, ktab); + MULK(p->n, p->f, a, ktab); } /* --- @gcm_ghashdone@ --- * @@ -424,8 +489,7 @@ void gcm_concat(const gcm_params *p, uint32 *z, const uint32 *x, /* Start by retrieving %$k$% from the table, and convert it to big-endian * form. */ - if (!(p->f&GCMF_SWAP)) for (j = 0; j < p->n; j++) u[j] = ktab[j]; - else for (j = 0; j < p->n; j++) u[j] = ENDSWAP32(ktab[24*p->n + j]); + recover_k(p, u, ktab); /* Now calculate %$k^n$%. */ i = ULONG_BITS; @@ -461,18 +525,18 @@ void gcm_concat(const gcm_params *p, uint32 *z, const uint32 *x, #include static void report_failure(const char *test, unsigned nbits, - dstr v[], dstr *d) + const char *ref, dstr v[], dstr *d) { printf("test %s failed (nbits = %u)", test, nbits); printf("\n\tx = "); type_hex.dump(&v[0], stdout); printf("\n\ty = "); type_hex.dump(&v[1], stdout); printf("\n\tz = "); type_hex.dump(&v[2], stdout); - printf("\n\tz' = "); type_hex.dump(d, stdout); + printf("\n\t%s' = ", ref); type_hex.dump(d, stdout); putchar('\n'); } -static void mulk(unsigned nbits, uint32 *x, const uint32 *ktab) - { MULK(nbits/32, x, ktab); } +static void mulk(unsigned nbits, unsigned f, uint32 *x, const uint32 *ktab) + { MULK(nbits/32, f, x, ktab); } static int test_mul(uint32 poly, dstr v[]) { @@ -481,52 +545,54 @@ static int test_mul(uint32 poly, dstr v[]) dstr d = DSTR_INIT; unsigned i, nbits; int ok = 1; + enum { I_x, I_y, I_z }; nbits = 8*v[0].len; p.f = 0; p.n = nbits/32; p.poly = poly; dstr_ensure(&d, nbits/8); d.len = nbits/8; +#define LOADXY(E) do { \ + for (i = 0; i < nbits/32; i++) { \ + x[i] = LOAD32_##E(v[I_x].buf + 4*i); \ + y[i] = LOAD32_##E(v[I_y].buf + 4*i); \ + } \ +} while (0) + +#define INITZ(x) do { \ + for (i = 0; i < nbits/32; i++) z[i] = (x)[i]; \ +} while (0) + +#define CHECK(E, what, ref) do { \ + for (i = 0; i < nbits/32; i++) STORE32_##E(d.buf + 4*i, z[i]); \ + if (memcmp(d.buf, v[I_##ref].buf, nbits/8) != 0) \ + { ok = 0; report_failure(what, nbits, #ref, v, &d); } \ +} while (0) + +#define TEST_PREP_1(E, x, y, what) do { \ + gcm_mktable(&p, ktab, y); \ + recover_k(&p, z, ktab); CHECK(B, "mktable/recover_k (" #y ")", y); \ + INITZ(x); mulk(nbits, p.f, z, ktab); CHECK(E, what " (k = " #y ")", z); \ +} while (0) + +#define TEST_PREP(E, what) do { \ + TEST_PREP_1(E, x, y, what); \ + TEST_PREP_1(E, y, x, what); \ +} while (0) + /* First, test plain multiply. */ - for (i = 0; i < nbits/32; i++) - { x[i] = LOAD32_B(v[0].buf + 4*i); y[i] = LOAD32_B(v[1].buf + 4*i); } - mul(&p, z, x, y); - for (i = 0; i < nbits/32; i++) STORE32_B(d.buf + 4*i, z[i]); - if (memcmp(d.buf, v[2].buf, nbits/8) != 0) - { ok = 0; report_failure("gcm_mul", nbits, v, &d); } + LOADXY(B); mul(&p, z, x, y); CHECK(B, "gcm_mul", z); /* Next, test big-endian prepared key. */ - for (i = 0; i < nbits/32; i++) - { x[i] = LOAD32_B(v[0].buf + 4*i); y[i] = LOAD32_B(v[1].buf + 4*i); } - gcm_mktable(&p, ktab, y); - mulk(nbits, x, ktab); - for (i = 0; i < nbits/32; i++) STORE32_B(d.buf + 4*i, x[i]); - if (memcmp(d.buf, v[2].buf, nbits/8) != 0) - { ok = 0; report_failure("gcm_kmul_b(k = y)", nbits, v, &d); } - - for (i = 0; i < nbits/32; i++) - { x[i] = LOAD32_B(v[0].buf + 4*i); y[i] = LOAD32_B(v[1].buf + 4*i); } - gcm_mktable(&p, ktab, x); - mulk(nbits, y, ktab); - for (i = 0; i < nbits/32; i++) STORE32_B(d.buf + 4*i, y[i]); - if (memcmp(d.buf, v[2].buf, nbits/8) != 0) - { ok = 0; report_failure("gcm_kmul_b(k = x)", nbits, v, &d); } + LOADXY(B); TEST_PREP(B, "gcm_kmul_b"); /* Finally, test little-endian prepared key. */ - p.f = GCMF_SWAP; - for (i = 0; i < nbits/32; i++) - { x[i] = LOAD32_L(v[0].buf + 4*i); y[i] = LOAD32_L(v[1].buf + 4*i); } - gcm_mktable(&p, ktab, y); - mulk(nbits, x, ktab); - for (i = 0; i < nbits/32; i++) STORE32_L(d.buf + 4*i, x[i]); - if (memcmp(d.buf, v[2].buf, nbits/8) != 0) - { ok = 0; report_failure("gcm_kmul_l(k = y)", nbits, v, &d); } - - for (i = 0; i < nbits/32; i++) - { x[i] = LOAD32_L(v[0].buf + 4*i); y[i] = LOAD32_L(v[1].buf + 4*i); } - gcm_mktable(&p, ktab, x); - mulk(nbits, y, ktab); - for (i = 0; i < nbits/32; i++) STORE32_L(d.buf + 4*i, y[i]); - if (memcmp(d.buf, v[2].buf, nbits/8) != 0) - { ok = 0; report_failure("gcm_kmul_l(k = x)", nbits, v, &d); } + p.f = GCMF_SWAP; LOADXY(L); + TEST_PREP(L, "gcm_kmul_l"); + +#undef LOADXY +#undef INITZ +#undef CHECK +#undef TEST_PREP_1 +#undef TEST_PREP /* All done. */ return (ok); -- 2.11.0