X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/1d30a9b905cb0d622934dd438117e0a1b354c3f8..9e6a4409d58d1ed9dfe2de3c6ffaee822e051c9f:/symm/gcm.c

diff --git a/symm/gcm.c b/symm/gcm.c
index 12096aec..73b28517 100644
--- a/symm/gcm.c
+++ b/symm/gcm.c
@@ -234,6 +234,120 @@ static void simple_mktable(const gcm_params *p,
     for (i = 0; i < 32*p->n*p->n; i++) ktab[i] = ENDSWAP32(ktab[i]);
 }
 
+#if CPUFAM_X86 || CPUFAM_AMD64
+static void pclmul_mktable(const gcm_params *p,
+                           uint32 *ktab, const uint32 *k)
+{
+  unsigned n = p->n;
+  unsigned nz;
+  uint32 *t;
+
+  /* We just need to store the value in a way which is convenient for the
+   * assembler code to read back.  That involves reordering the words, and,
+   * in the case of 96-bit blocks, padding with zeroes to fill out a 128-bit
+   * chunk.
+   */
+
+  if (n == 3) nz = 1;
+  else nz = 0;
+  t = ktab + n + nz;
+
+  if (p->f&GCMF_SWAP) while (n--) { *--t = ENDSWAP32(*k); k++; }
+  else while (n--) *--t = *k++;
+  while (nz--) *--t = 0;
+}
+#endif
+
+#if CPUFAM_ARMEL
+static void arm_crypto_mktable(const gcm_params *p,
+                               uint32 *ktab, const uint32 *k)
+{
+  unsigned n = p->n;
+  uint32 *t;
+
+  /* We just need to store the value in a way which is convenient for the
+   * assembler code to read back.  That involves swapping the bytes in each
+   * 64-bit lane.
+   */
+
+  t = ktab;
+  if (p->f&GCMF_SWAP) {
+    while (n >= 2) {
+      t[1] = ENDSWAP32(k[0]); t[0] = ENDSWAP32(k[1]);
+      t += 2; k += 2; n -= 2;
+    }
+    if (n) { t[1] = ENDSWAP32(k[0]); t[0] = 0; }
+  } else {
+    while (n >= 2) {
+      t[1] = k[0]; t[0] = k[1];
+      t += 2; k += 2; n -= 2;
+    }
+    if (n) { t[1] = k[0]; t[0] = 0; }
+  }
+}
+#endif
+
+#if CPUFAM_ARM64
+static uint32 rbit32(uint32 x)
+{
+  uint32 z, t;
+
+#if GCC_VERSION_P(4, 3)
+  /* Two tricks here.  Firstly, two separate steps, rather than a single
+   * block of assembler, to allow finer-grained instruction scheduling.
+   * Secondly, use `ENDSWAP32' so that the compiler can cancel it if the
+   * caller actually wants the bytes reordered.
+   */
+  __asm__("rbit %w0, %w1" : "=r"(t) : "r"(x));
+  z = ENDSWAP32(t);
+#else
+  /* A generic but slightly clever implementation. */
+# define SWIZZLE(x, m, s) ((((x)&(m)) << (s)) | (((x)&~(m)) >> (s)))
+                                        /* 76543210 */
+  t = SWIZZLE(x, 0x0f0f0f0f, 4);        /* 32107654 -- swap nibbles */
+  t = SWIZZLE(t, 0x33333333, 2);        /* 10325476 -- swap bit pairs */
+  z = SWIZZLE(t, 0x55555555, 1);        /* 01234567 -- swap adjacent bits */
+# undef SWIZZLE
+#endif
+  return (z);
+}
+
+static void arm64_pmull_mktable(const gcm_params *p,
+                                uint32 *ktab, const uint32 *k)
+{
+  unsigned n = p->n;
+  uint32 *t;
+
+  /* We just need to store the value in a way which is convenient for the
+   * assembler code to read back.  That involves two transformations:
+   *
+   *   * firstly, reversing the order of the bits in each byte; and,
+   *
+   *   * secondly, storing two copies of each 64-bit chunk.
+   *
+   * Note that, in this case, we /want/ the little-endian byte order of GCM,
+   * so endianness-swapping happens in the big-endian case.
+   */
+
+  t = ktab;
+  if (p->f&GCMF_SWAP) {
+    while (n >= 2) {
+      t[0] = t[2] = rbit32(k[0]);
+      t[1] = t[3] = rbit32(k[1]);
+      t += 4; k += 2; n -= 2;
+    }
+    if (n) { t[0] = t[2] = rbit32(k[0]); t[1] = t[3] = 0; }
+  } else {
+    while (n >= 2) {
+      t[0] = t[2] = ENDSWAP32(rbit32(k[0]));
+      t[1] = t[3] = ENDSWAP32(rbit32(k[1]));
+      t += 4; k += 2; n -= 2;
+    }
+    if (n) { t[0] = t[2] = ENDSWAP32(rbit32(k[0])); t[1] = t[3] = 0; }
+  }
+}
+#endif
+
 CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable,
              (const gcm_params *p, uint32 *ktab, const uint32 *k),
              (p, ktab, k),
@@ -241,6 +355,19 @@ CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable,
              pick_mktable, simple_mktable)
 
 static gcm_mktable__functype *pick_mktable(void)
 {
+#if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(gcm_mktable, pclmul_mktable,
+                     cpu_feature_p(CPUFEAT_X86_SSSE3) &&
+                     cpu_feature_p(CPUFEAT_X86_PCLMUL));
+#endif
+#if CPUFAM_ARMEL
+  DISPATCH_PICK_COND(gcm_mktable, arm_crypto_mktable,
+                     cpu_feature_p(CPUFEAT_ARM_PMULL));
+#endif
+#if CPUFAM_ARM64
+  DISPATCH_PICK_COND(gcm_mktable, arm64_pmull_mktable,
+                     cpu_feature_p(CPUFEAT_ARM_PMULL));
+#endif
   DISPATCH_PICK_FALLBACK(gcm_mktable, simple_mktable);
 }
 
@@ -271,13 +398,84 @@ static void simple_recover_k(const gcm_params *p,
   else
     for (i = 0; i < p->n; i++) k[i] = ENDSWAP32(ktab[24*p->n + i]);
 }
 
+#if CPUFAM_X86 || CPUFAM_AMD64
+static void pclmul_recover_k(const gcm_params *p,
+                             uint32 *k, const uint32 *ktab)
+{
+  unsigned n = p->n;
+  unsigned nz;
+  const uint32 *t;
+
+  /* The representation is already independent of the blockcipher endianness.
+   * We need to compensate for padding, and reorder the words.
+   */
+
+  if (n == 3) nz = 1; else nz = 0;
+  t = ktab + n + nz;
+  while (n--) *k++ = *--t;
+}
+#endif
+
+#if CPUFAM_ARMEL
+static void arm_crypto_recover_k(const gcm_params *p,
+                                 uint32 *k, const uint32 *ktab)
+{
+  unsigned n = p->n;
+  const uint32 *t;
+
+  /* The representation is already independent of the blockcipher endianness.
+   * We only need to reorder the words.
+   */
+
+  t = ktab;
+  while (n >= 2) { k[1] = t[0]; k[0] = t[1]; t += 2; k += 2; n -= 2; }
+  if (n) k[0] = t[1];
+}
+#endif
+
+#if CPUFAM_ARM64
+static void arm64_pmull_recover_k(const gcm_params *p,
+                                  uint32 *k, const uint32 *ktab)
+{
+  unsigned n = p->n;
+  const uint32 *t;
+
+  /* The representation is already independent of the blockcipher endianness.
+   * We need to skip the duplicate pieces, and unscramble the bytes.
+   */
+
+  t = ktab;
+  while (n >= 2) {
+    k[0] = ENDSWAP32(rbit32(t[0]));
+    k[1] = ENDSWAP32(rbit32(t[1]));
+    t += 4; k += 2; n -= 2;
+  }
+  if (n) k[0] = ENDSWAP32(rbit32(t[0]));
+}
+#endif
+
 CPU_DISPATCH(static, EMPTY, void, recover_k,
              (const gcm_params *p, uint32 *k, const uint32 *ktab),
              (p, k, ktab),
              pick_recover_k, simple_recover_k)
 
 static recover_k__functype *pick_recover_k(void)
-  { DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k); }
+{
+#if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(recover_k, pclmul_recover_k,
+                     cpu_feature_p(CPUFEAT_X86_SSSE3) &&
+                     cpu_feature_p(CPUFEAT_X86_PCLMUL));
+#endif
+#if CPUFAM_ARMEL
+  DISPATCH_PICK_COND(recover_k, arm_crypto_recover_k,
+                     cpu_feature_p(CPUFEAT_ARM_PMULL));
+#endif
+#if CPUFAM_ARM64
+  DISPATCH_PICK_COND(recover_k, arm64_pmull_recover_k,
+                     cpu_feature_p(CPUFEAT_ARM_PMULL));
+#endif
+  DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k);
+}
 
 /* --- @gcm_mulk_N{b,l}@ --- *
  *
@@ -292,6 +490,48 @@ static recover_k__functype *pick_recover_k(void)
  * function whose performance actually matters.
  */
 
+#if CPUFAM_X86 || CPUFAM_AMD64
+# define DECL_MULK_X86ISH(var) extern gcm_mulk_##var##__functype       \
+    gcm_mulk_##var##_x86ish_pclmul_avx,                                \
+    gcm_mulk_##var##_x86ish_pclmul;
+# define PICK_MULK_X86ISH(var) do {                                    \
+  DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul_avx, \
+                     cpu_feature_p(CPUFEAT_X86_AVX) &&                  \
+                     cpu_feature_p(CPUFEAT_X86_PCLMUL) &&               \
+                     cpu_feature_p(CPUFEAT_X86_SSSE3));                 \
+  DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul,    \
+                     cpu_feature_p(CPUFEAT_X86_PCLMUL) &&               \
+                     cpu_feature_p(CPUFEAT_X86_SSSE3));                 \
+} while (0)
+#else
+# define DECL_MULK_X86ISH(var)
+# define PICK_MULK_X86ISH(var) do ; while (0)
+#endif
+
+#if CPUFAM_ARMEL
+# define DECL_MULK_ARM(var)                                            \
+    extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm_crypto;
+# define PICK_MULK_ARM(var) do {                                       \
+  DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm_crypto,      \
+                     cpu_feature_p(CPUFEAT_ARM_PMULL));                 \
+} while (0)
+#else
+# define DECL_MULK_ARM(var)
+# define PICK_MULK_ARM(var) do ; while (0)
+#endif
+
+#if CPUFAM_ARM64
+# define DECL_MULK_ARM64(var)                                          \
+    extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm64_pmull;
+# define PICK_MULK_ARM64(var) do {                                     \
+  DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm64_pmull,     \
+                     cpu_feature_p(CPUFEAT_ARM_PMULL));                 \
+} while (0)
+#else
+# define DECL_MULK_ARM64(var)
+# define PICK_MULK_ARM64(var) do ; while (0)
+#endif
+
 #define DEF_MULK(nbits)                                                 \
                                                                         \
 CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##b,                   \
@@ -321,10 +561,27 @@ static void simple_mulk_##nbits(uint32 *a, const uint32 *ktab)          \
     for (i = 0; i < nbits/32; i++) a[i] = z[i];                         \
 }                                                                       \
                                                                         \
+DECL_MULK_X86ISH(nbits##b)                                              \
+DECL_MULK_ARM(nbits##b)                                                 \
+DECL_MULK_ARM64(nbits##b)                                               \
 static gcm_mulk_##nbits##b##__functype *pick_mulk_##nbits##b(void)      \
-  { DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits); } \
+{                                                                       \
+  PICK_MULK_X86ISH(nbits##b);                                           \
+  PICK_MULK_ARM(nbits##b);                                              \
+  PICK_MULK_ARM64(nbits##b);                                            \
+  DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits);     \
+}                                                                       \
+                                                                        \
+DECL_MULK_X86ISH(nbits##l)                                              \
+DECL_MULK_ARM(nbits##l)                                                 \
+DECL_MULK_ARM64(nbits##l)                                               \
 static gcm_mulk_##nbits##l##__functype *pick_mulk_##nbits##l(void)      \
-  { DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits); }
+{                                                                       \
+  PICK_MULK_X86ISH(nbits##l);                                           \
+  PICK_MULK_ARM(nbits##l);                                              \
+  PICK_MULK_ARM64(nbits##l);                                            \
+  DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits);     \
+}
 
 GCM_WIDTHS(DEF_MULK)
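
Editorial notes follow; they are commentary on the patch, not part of it.

The table layout built by `pclmul_mktable' is easy to see in isolation.
The sketch below mimics only the no-swap branch, for a hypothetical 96-bit
(three-word) key; `mktable_layout' and the key value are invented for
illustration, and the standard `uint32_t' stands in for Catacomb's `uint32'.

#include <stdint.h>
#include <stdio.h>

/* Mimic the no-swap branch of pclmul_mktable: store the key words in
 * reverse order, zero-padding a 96-bit key to a full 128-bit lane.
 */
static void mktable_layout(uint32_t *ktab, const uint32_t *k, unsigned n)
{
  unsigned nz = (n == 3) ? 1 : 0;       /* pad 96-bit keys to 128 bits */
  uint32_t *t = ktab + n + nz;          /* start just past the last slot */

  while (n--) *--t = *k++;              /* store the words in reverse */
  while (nz--) *--t = 0;                /* zero-fill the padding word */
}

int main(void)
{
  const uint32_t k[3] = { 0x00112233, 0x44556677, 0x8899aabb };
  uint32_t ktab[4];
  unsigned i;

  mktable_layout(ktab, k, 3);
  for (i = 0; i < 4; i++) printf("ktab[%u] = %08x\n", i, (unsigned)ktab[i]);
  /* Prints 00000000, 8899aabb, 44556677, 00112233: the key words
   * reversed, with the zero padding word at the low end.
   */
  return 0;
}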
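
The generic `rbit32' fallback reverses the bits within each byte of a word
by three masked exchange passes, halving the group size each time.  Here is
a standalone sketch of the same technique with a couple of checks;
`rbit32_generic' is an invented name.

#include <assert.h>
#include <stdint.h>

/* Exchange the bits selected by mask M with the bits S places above them. */
#define SWIZZLE(x, m, s) ((((x)&(m)) << (s)) | (((x)&~(m)) >> (s)))

static uint32_t rbit32_generic(uint32_t x)
{
  uint32_t t;

  t = SWIZZLE(x, 0x0f0f0f0f, 4);        /* swap nibbles within each byte */
  t = SWIZZLE(t, 0x33333333, 2);        /* swap bit pairs within nibbles */
  return SWIZZLE(t, 0x55555555, 1);     /* swap adjacent bits */
}

int main(void)
{
  /* Bits are reversed within each byte; byte order is unchanged.  (The
   * assembler path gets the same effect from `rbit', which reverses all
   * 32 bits, followed by a byte swap.)
   */
  assert(rbit32_generic(0x01020304) == 0x8040c020);
  assert(rbit32_generic(0xf0000000) == 0x0f000000);

  /* Per-byte bit reversal is an involution. */
  assert(rbit32_generic(rbit32_generic(0xdeadbeef)) == 0xdeadbeef);
  return 0;
}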
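
Finally, the CPU_DISPATCH/DISPATCH_PICK_COND machinery used throughout the
patch amounts to lazy binding through a function pointer: the first call
probes the CPU's features, installs the best implementation, and completes
the call; later calls go straight to the chosen code.  The sketch below
shows only that general shape, with invented names and a stubbed-out
feature probe; the real macros handle further details.

#include <stdio.h>

typedef void mulk_fn(unsigned *a);

static void mulk_simple(unsigned *a) { (void)a; puts("portable code"); }
static void mulk_pclmul(unsigned *a) { (void)a; puts("PCLMUL code"); }

/* Stub feature probe; cpu_feature_p does the real work in Catacomb. */
static int have_pclmul(void) { return 0; }

static mulk_fn mulk_pick;               /* forward declaration */
static mulk_fn *mulk = mulk_pick;       /* current dispatch target */

static void mulk_pick(unsigned *a)
{
  /* Probe once, patch the pointer, and finish the interrupted call. */
  if (have_pclmul()) mulk = mulk_pclmul;
  else mulk = mulk_simple;
  mulk(a);
}

int main(void)
{
  unsigned a[4] = { 0 };
  mulk(a);                              /* first call picks an implementation */
  mulk(a);                              /* later calls go direct */
  return 0;
}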