X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/50df573383d76f5587ba5434c016fec9346d577a..416b88692ad45dca8b3ae4800916dd8b3e9c2551:/symm/gcm.c diff --git a/symm/gcm.c b/symm/gcm.c index 749f9d6f..b4384153 100644 --- a/symm/gcm.c +++ b/symm/gcm.c @@ -33,6 +33,7 @@ #include +#include "dispatch.h" #include "gcm.h" #include "gcm-def.h" @@ -203,7 +204,8 @@ static void mul(const gcm_params *p, uint32 *z, * multiply (vaguely) efficiently by @k@. */ -void gcm_mktable(const gcm_params *p, uint32 *ktab, const uint32 *k) +static void simple_mktable(const gcm_params *p, + uint32 *ktab, const uint32 *k) { unsigned m = (p->f&GCMF_SWAP ? 0x18 : 0); unsigned i, j, o = m*p->n; @@ -232,7 +234,250 @@ void gcm_mktable(const gcm_params *p, uint32 *ktab, const uint32 *k) for (i = 0; i < 32*p->n*p->n; i++) ktab[i] = ENDSWAP32(ktab[i]); } -/* --- @gcm_mulk_N@ --- * +#if CPUFAM_X86 || CPUFAM_AMD64 +static void pclmul_mktable(const gcm_params *p, + uint32 *ktab, const uint32 *k) +{ + unsigned n = p->n; + unsigned nz; + uint32 *t; + + /* We just need to store the value in a way which is convenient for the + * assembler code to read back. That involves reordering the words, and, + * in the case of 96-bit blocks, padding with zeroes to fill out a 128-bit + * chunk. + */ + + if (n == 3) nz = 1; + else nz = 0; + t = ktab + n + nz; + + if (p->f&GCMF_SWAP) while (n--) { *--t = ENDSWAP32(*k); k++; } + else while (n--) *--t = *k++; + while (nz--) *--t = 0; +} +#endif + +#if CPUFAM_ARMEL +static void arm_crypto_mktable(const gcm_params *p, + uint32 *ktab, const uint32 *k) +{ + unsigned n = p->n; + uint32 *t; + + /* We just need to store the value in a way which is convenient for the + * assembler code to read back. That involves swapping the bytes in each + * 64-bit lane. + */ + + t = ktab; + if (p->f&GCMF_SWAP) { + while (n >= 2) { + t[1] = ENDSWAP32(k[0]); t[0] = ENDSWAP32(k[1]); + t += 2; k += 2; n -= 2; + } + if (n) { t[1] = ENDSWAP32(k[0]); t[0] = 0; } + } else { + while (n >= 2) { + t[1] = k[0]; t[0] = k[1]; + t += 2; k += 2; n -= 2; + } + if (n) { t[1] = k[0]; t[0] = 0; } + } +} +#endif + +#if CPUFAM_ARM64 +static uint32 rbit32(uint32 x) +{ + uint32 z, t; + +#if GCC_VERSION_P(4, 3) + /* Two tricks here. Firstly, two separate steps, rather than a single + * block of assembler, to allow finer-grained instruction scheduling. + * Secondly, use `ENDSWAP32' so that the compiler can cancel it if the + * caller actually wants the bytes reordered. + */ + __asm__("rbit %w0, %w1" : "=r"(t) : "r"(x)); + z = ENDSWAP32(t); +#else + /* A generic but slightly clever implementation. */ +# define SWIZZLE(x, m, s) ((((x)&(m)) << (s)) | (((x)&~(m)) >> (s))) + /* 76543210 */ + t = SWIZZLE(x, 0x0f0f0f0f, 4); /* 32107654 -- swap nibbles */ + t = SWIZZLE(t, 0x33333333, 2); /* 10325476 -- swap bit pairs */ + z = SWIZZLE(t, 0x55555555, 1); /* 01234567 -- swap adjacent bits */ +# undef SWIZZLE +#endif + return (z); +} + +static void arm64_pmull_mktable(const gcm_params *p, + uint32 *ktab, const uint32 *k) +{ + unsigned n = p->n; + uint32 *t; + + /* We just need to store the value in a way which is convenient for the + * assembler code to read back. That involves two transformations: + * + * * firstly, reversing the order of the bits in each byte; and, + * + * * secondly, storing two copies of each 64-bit chunk. + * + * Note that, in this case, we /want/ the little-endian byte order of GCM, + * so endianness-swapping happens in the big-endian case. + */ + + t = ktab; + if (p->f&GCMF_SWAP) { + while (n >= 2) { + t[0] = t[2] = rbit32(k[0]); + t[1] = t[3] = rbit32(k[1]); + t += 4; k += 2; n -= 2; + } + if (n) { t[0] = t[2] = rbit32(k[0]); t[1] = t[3] = 0; } + } else { + while (n >= 2) { + t[0] = t[2] = ENDSWAP32(rbit32(k[0])); + t[1] = t[3] = ENDSWAP32(rbit32(k[1])); + t += 4; k += 2; n -= 2; + } + if (n) { t[0] = t[2] = ENDSWAP32(rbit32(k[0])); t[1] = t[3] = 0; } + } +} +#endif + +CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable, + (const gcm_params *p, uint32 *ktab, const uint32 *k), + (p, ktab, k), + pick_mktable, simple_mktable) + +static gcm_mktable__functype *pick_mktable(void) +{ +#if CPUFAM_X86 || CPUFAM_AMD64 + DISPATCH_PICK_COND(gcm_mktable, pclmul_mktable, + cpu_feature_p(CPUFEAT_X86_SSSE3) && + cpu_feature_p(CPUFEAT_X86_PCLMUL)); +#endif +#if CPUFAM_ARMEL + DISPATCH_PICK_COND(gcm_mktable, arm_crypto_mktable, + cpu_feature_p(CPUFEAT_ARM_PMULL)); +#endif +#if CPUFAM_ARM64 + DISPATCH_PICK_COND(gcm_mktable, arm64_pmull_mktable, + cpu_feature_p(CPUFEAT_ARM_PMULL)); +#endif + DISPATCH_PICK_FALLBACK(gcm_mktable, simple_mktable); +} + +/* --- @recover_k@ --- * + * + * Arguments: @const gcm_params *p@ = pointer to the parameters + * @uint32 *k@ = block-sized vector in which to store %$k$% + * @const uint32 *ktab@ = the table encoding %$k$% + * + * Returns: --- + * + * Use: Recovers %$k$%, the secret from which @ktab@ was by + * @gcm_mktable@, from the table, and stores it in internal + * (big-endian) form in @k@. + */ + +static void simple_recover_k(const gcm_params *p, + uint32 *k, const uint32 *ktab) +{ + unsigned i; + + /* If the blockcipher is big-endian, then the key is simply in the first + * table element, in the right format. If the blockcipher is little-endian + * then it's in element 24, and the bytes need swapping. + */ + + if (!(p->f&GCMF_SWAP)) for (i = 0; i < p->n; i++) k[i] = ktab[i]; + else for (i = 0; i < p->n; i++) k[i] = ENDSWAP32(ktab[24*p->n + i]); +} + +#if CPUFAM_X86 || CPUFAM_AMD64 +static void pclmul_recover_k(const gcm_params *p, + uint32 *k, const uint32 *ktab) +{ + unsigned n = p->n; + unsigned nz; + const uint32 *t; + + /* The representation is already independent of the blockcipher endianness. + * We need to compensate for padding, and reorder the words. + */ + + if (n == 3) nz = 1; else nz = 0; + t = ktab + n + nz; + while (n--) *k++ = *--t; +} +#endif + +#if CPUFAM_ARMEL +static void arm_crypto_recover_k(const gcm_params *p, + uint32 *k, const uint32 *ktab) +{ + unsigned n = p->n; + const uint32 *t; + + /* The representation is already independent of the blockcipher endianness. + * We only need to reorder the words. + */ + + t = ktab; + while (n >= 2) { k[1] = t[0]; k[0] = t[1]; t += 2; k += 2; n -= 2; } + if (n) k[0] = t[1]; +} +#endif + +#if CPUFAM_ARM64 +static void arm64_pmull_recover_k(const gcm_params *p, + uint32 *k, const uint32 *ktab) +{ + unsigned n = p->n; + const uint32 *t; + + /* The representation is already independent of the blockcipher endianness. + * We need to skip the duplicate pieces, and unscramble the bytes. + */ + + t = ktab; + while (n >= 2) { + k[0] = ENDSWAP32(rbit32(t[0])); + k[1] = ENDSWAP32(rbit32(t[1])); + t += 4; k += 2; n -= 2; + } + if (n) k[0] = ENDSWAP32(rbit32(t[0])); +} +#endif + +CPU_DISPATCH(static, EMPTY, void, recover_k, + (const gcm_params *p, uint32 *k, const uint32 *ktab), + (p, k, ktab), + pick_recover_k, simple_recover_k) + +static recover_k__functype *pick_recover_k(void) +{ +#if CPUFAM_X86 || CPUFAM_AMD64 + DISPATCH_PICK_COND(recover_k, pclmul_recover_k, + cpu_feature_p(CPUFEAT_X86_SSSE3) && + cpu_feature_p(CPUFEAT_X86_PCLMUL)); +#endif +#if CPUFAM_ARMEL + DISPATCH_PICK_COND(recover_k, arm_crypto_recover_k, + cpu_feature_p(CPUFEAT_ARM_PMULL)); +#endif +#if CPUFAM_ARM64 + DISPATCH_PICK_COND(recover_k, arm64_pmull_recover_k, + cpu_feature_p(CPUFEAT_ARM_PMULL)); +#endif + DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k); +} + +/* --- @gcm_mulk_N{b,l}@ --- * * * Arguments: @uint32 *a@ = accumulator to multiply * @const uint32 *ktab@ = table constructed by @gcm_mktable@ @@ -241,12 +486,62 @@ void gcm_mktable(const gcm_params *p, uint32 *ktab, const uint32 *k) * * Use: Multiply @a@ by @k@ (implicitly represented in @ktab@), * updating @a@ in-place. There are separate functions for each - * supported block size because this is the function whose - * performance actually matters. + * supported block size and endianness because this is the + * function whose performance actually matters. */ +#if CPUFAM_X86 || CPUFAM_AMD64 +# define DECL_MULK_X86ISH(var) extern gcm_mulk_##var##__functype \ + gcm_mulk_##var##_x86ish_pclmul_avx, \ + gcm_mulk_##var##_x86ish_pclmul; +# define PICK_MULK_X86ISH(var) do { \ + DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul_avx, \ + cpu_feature_p(CPUFEAT_X86_AVX) && \ + cpu_feature_p(CPUFEAT_X86_PCLMUL) && \ + cpu_feature_p(CPUFEAT_X86_SSSE3)); \ + DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul, \ + cpu_feature_p(CPUFEAT_X86_PCLMUL) && \ + cpu_feature_p(CPUFEAT_X86_SSSE3)); \ +} while (0) +#else +# define DECL_MULK_X86ISH(var) +# define PICK_MULK_X86ISH(var) do ; while (0) +#endif + +#if CPUFAM_ARMEL +# define DECL_MULK_ARM(var) \ + extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm_crypto; +# define PICK_MULK_ARM(var) do { \ + DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm_crypto, \ + cpu_feature_p(CPUFEAT_ARM_PMULL)); \ +} while (0) +#else +# define DECL_MULK_ARM(var) +# define PICK_MULK_ARM(var) do ; while (0) +#endif + +#if CPUFAM_ARM64 +# define DECL_MULK_ARM64(var) \ + extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm64_pmull; +# define PICK_MULK_ARM64(var) do { \ + DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm64_pmull, \ + cpu_feature_p(CPUFEAT_ARM_PMULL)); \ +} while (0) +#else +# define DECL_MULK_ARM64(var) +# define PICK_MULK_ARM64(var) do ; while (0) +#endif + #define DEF_MULK(nbits) \ -void gcm_mulk_##nbits(uint32 *a, const uint32 *ktab) \ + \ +CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##b, \ + (uint32 *a, const uint32 *ktab), (a, ktab), \ + pick_mulk_##nbits##b, simple_mulk_##nbits) \ +CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##l, \ + (uint32 *a, const uint32 *ktab), (a, ktab), \ + pick_mulk_##nbits##l, simple_mulk_##nbits) \ + \ +static void simple_mulk_##nbits(uint32 *a, const uint32 *ktab) \ { \ uint32 m, t; \ uint32 z[nbits/32]; \ @@ -264,9 +559,46 @@ void gcm_mulk_##nbits(uint32 *a, const uint32 *ktab) \ } \ \ for (i = 0; i < nbits/32; i++) a[i] = z[i]; \ +} \ + \ +DECL_MULK_X86ISH(nbits##b) \ +DECL_MULK_ARM(nbits##b) \ +DECL_MULK_ARM64(nbits##b) \ +static gcm_mulk_##nbits##b##__functype *pick_mulk_##nbits##b(void) \ +{ \ + PICK_MULK_X86ISH(nbits##b); \ + PICK_MULK_ARM(nbits##b); \ + PICK_MULK_ARM64(nbits##b); \ + DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits); \ +} \ + \ +DECL_MULK_X86ISH(nbits##l) \ +DECL_MULK_ARM(nbits##l) \ +DECL_MULK_ARM64(nbits##l) \ +static gcm_mulk_##nbits##l##__functype *pick_mulk_##nbits##l(void) \ +{ \ + PICK_MULK_X86ISH(nbits##l); \ + PICK_MULK_ARM(nbits##l); \ + PICK_MULK_ARM64(nbits##l); \ + DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits); \ } + GCM_WIDTHS(DEF_MULK) +#define GCM_MULK_CASE(nbits) \ + case nbits/32: \ + if (_f&GCMF_SWAP) gcm_mulk_##nbits##l(_a, _ktab); \ + else gcm_mulk_##nbits##b(_a, _ktab); \ + break; +#define MULK(n, f, a, ktab) do { \ + uint32 *_a = (a); const uint32 *_ktab = (ktab); \ + unsigned _f = (f); \ + switch (n) { \ + GCM_WIDTHS(GCM_MULK_CASE) \ + default: abort(); \ + } \ +} while (0) + /*----- Other utilities ---------------------------------------------------*/ /* --- @putlen@ --- * @@ -327,21 +659,11 @@ static void mix(const gcm_params *p, uint32 *a, { unsigned i; - /* Convert the block from bytes into words, using the appropriate - * convention. - */ if (p->f&GCMF_SWAP) for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_L(q); q += 4; } else for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_B(q); q += 4; } - - /* Dispatch to the correct multiply-by-%$k$% function. */ - switch (p->n) { -#define CASE(nbits) case nbits/32: gcm_mulk_##nbits(a, ktab); break; - GCM_WIDTHS(CASE) -#undef CASE - default: abort(); - } + MULK(p->n, p->f, a, ktab); } /* --- @gcm_ghashdone@ --- * @@ -424,8 +746,7 @@ void gcm_concat(const gcm_params *p, uint32 *z, const uint32 *x, /* Start by retrieving %$k$% from the table, and convert it to big-endian * form. */ - if (!(p->f&GCMF_SWAP)) for (j = 0; j < p->n; j++) u[j] = ktab[j]; - else for (j = 0; j < p->n; j++) u[j] = ENDSWAP32(ktab[24*p->n + j]); + recover_k(p, u, ktab); /* Now calculate %$k^n$%. */ i = ULONG_BITS; @@ -453,4 +774,117 @@ void gcm_concat(const gcm_params *p, uint32 *z, const uint32 *x, } } +/*----- Test rig ----------------------------------------------------------*/ + +#ifdef TEST_RIG + +#include +#include +#include + +#ifdef ENABLE_ASM_DEBUG +# include "regdump.h" +#endif + +static void report_failure(const char *test, unsigned nbits, + const char *ref, dstr v[], dstr *d) +{ + printf("test %s failed (nbits = %u)", test, nbits); + printf("\n\tx = "); type_hex.dump(&v[0], stdout); + printf("\n\ty = "); type_hex.dump(&v[1], stdout); + printf("\n\tz = "); type_hex.dump(&v[2], stdout); + printf("\n\t%s' = ", ref); type_hex.dump(d, stdout); + putchar('\n'); +} + +static void mulk(unsigned nbits, unsigned f, uint32 *x, const uint32 *ktab) + { MULK(nbits/32, f, x, ktab); } + +static int test_mul(uint32 poly, dstr v[]) +{ + uint32 x[GCM_NMAX], y[GCM_NMAX], z[GCM_NMAX], ktab[32*GCM_NMAX*GCM_NMAX]; + gcm_params p; + dstr d = DSTR_INIT; + unsigned i, nbits; + int ok = 1; + enum { I_x, I_y, I_z }; + + nbits = 8*v[0].len; p.f = 0; p.n = nbits/32; p.poly = poly; + dstr_ensure(&d, nbits/8); d.len = nbits/8; + +#define LOADXY(E) do { \ + for (i = 0; i < nbits/32; i++) { \ + x[i] = LOAD32_##E(v[I_x].buf + 4*i); \ + y[i] = LOAD32_##E(v[I_y].buf + 4*i); \ + } \ +} while (0) + +#define INITZ(x) do { \ + for (i = 0; i < nbits/32; i++) z[i] = (x)[i]; \ +} while (0) + +#define CHECK(E, what, ref) do { \ + for (i = 0; i < nbits/32; i++) STORE32_##E(d.buf + 4*i, z[i]); \ + if (MEMCMP(d.buf, !=, v[I_##ref].buf, nbits/8)) \ + { ok = 0; report_failure(what, nbits, #ref, v, &d); } \ +} while (0) + +#define TEST_PREP_1(E, x, y, what) do { \ + gcm_mktable(&p, ktab, y); \ + recover_k(&p, z, ktab); CHECK(B, "mktable/recover_k (" #y ")", y); \ + INITZ(x); mulk(nbits, p.f, z, ktab); CHECK(E, what " (k = " #y ")", z); \ +} while (0) + +#define TEST_PREP(E, what) do { \ + TEST_PREP_1(E, x, y, what); \ + TEST_PREP_1(E, y, x, what); \ +} while (0) + + /* First, test plain multiply. */ + LOADXY(B); mul(&p, z, x, y); CHECK(B, "gcm_mul", z); + + /* Next, test big-endian prepared key. */ + LOADXY(B); TEST_PREP(B, "gcm_kmul_b"); + + /* Finally, test little-endian prepared key. */ + p.f = GCMF_SWAP; LOADXY(L); + TEST_PREP(L, "gcm_kmul_l"); + +#undef LOADXY +#undef INITZ +#undef CHECK +#undef TEST_PREP_1 +#undef TEST_PREP + + /* All done. */ + return (ok); +} + +#define TEST(nbits) \ +static int test_mul_##nbits(dstr v[]) \ + { return (test_mul(GCM_POLY_##nbits, v)); } +GCM_WIDTHS(TEST) +#undef TEST + +static test_chunk defs[] = { +#define TEST(nbits) \ + { "gcm-mul" #nbits, test_mul_##nbits, \ + { &type_hex, &type_hex, &type_hex, 0 } }, +GCM_WIDTHS(TEST) +#undef TEST + { 0, 0, { 0 } } +}; + +int main(int argc, char *argv[]) +{ + ego(argv[0]); +#ifdef ENABLE_ASM_DEBUG + regdump_init(); +#endif + test_run(argc, argv, defs, SRCDIR"/t/gcm"); + return (0); +} + +#endif + /*----- That's all, folks -------------------------------------------------*/