for (i = 0; i < 32*p->n*p->n; i++) ktab[i] = ENDSWAP32(ktab[i]);
}
+#if CPUFAM_X86 || CPUFAM_AMD64
+/* Build the key table for the x86/AMD64 PCLMUL implementation: store the
+ * key words in reverse order (byte-swapping each word first when the
+ * blockcipher is big-endian, i.e., `GCMF_SWAP' is set), and pad a
+ * three-word (96-bit) key with a zero word so that the assembler can load
+ * a whole 128-bit chunk.
+ */
+static void pclmul_mktable(const gcm_params *p,
+ uint32 *ktab, const uint32 *k)
+{
+ unsigned n = p->n;
+ unsigned nz;
+ uint32 *t;
+
+ /* We just need to store the value in a way which is convenient for the
+ * assembler code to read back. That involves reordering the words, and,
+ * in the case of 96-bit blocks, padding with zeroes to fill out a 128-bit
+ * chunk.
+ */
+
+ if (n == 3) nz = 1; /* 96-bit block: one zero word of padding */
+ else nz = 0;
+ t = ktab + n + nz;
+
+ if (p->f&GCMF_SWAP) while (n--) { *--t = ENDSWAP32(*k); k++; }
+ else while (n--) *--t = *k++;
+ while (nz--) *--t = 0;
+}
+#endif
+
+#if CPUFAM_ARMEL
+/* Build the key table for the 32-bit ARM crypto-extension implementation:
+ * store the key with the two 32-bit words of each 64-bit lane exchanged
+ * (byte-swapping each word first when `GCMF_SWAP' is set), and, if the
+ * key has an odd number of words, fill the low word of the final lane
+ * with zero.
+ */
+static void arm_crypto_mktable(const gcm_params *p,
+ uint32 *ktab, const uint32 *k)
+{
+ unsigned n = p->n;
+ uint32 *t;
+
+ /* We just need to store the value in a way which is convenient for the
+ * assembler code to read back. That involves swapping the bytes in each
+ * 64-bit lane.
+ */
+
+ t = ktab;
+ if (p->f&GCMF_SWAP) {
+ while (n >= 2) {
+ t[1] = ENDSWAP32(k[0]); t[0] = ENDSWAP32(k[1]);
+ t += 2; k += 2; n -= 2;
+ }
+ if (n) { t[1] = ENDSWAP32(k[0]); t[0] = 0; }
+ } else {
+ while (n >= 2) {
+ t[1] = k[0]; t[0] = k[1];
+ t += 2; k += 2; n -= 2;
+ }
+ if (n) { t[1] = k[0]; t[0] = 0; }
+ }
+}
+#endif
+
+#if CPUFAM_ARM64
+/* Return `x' with the bits of each byte reversed, leaving the byte order
+ * alone.  (The AArch64 `rbit' instruction reverses all 32 bits, which
+ * also reverses the byte order, so the assembler path follows it with a
+ * byte swap to put the bytes back.)
+ */
+static uint32 rbit32(uint32 x)
+{
+ uint32 z, t;
+
+#if GCC_VERSION_P(4, 3)
+ /* Two tricks here. Firstly, two separate steps, rather than a single
+ * block of assembler, to allow finer-grained instruction scheduling.
+ * Secondly, use `ENDSWAP32' so that the compiler can cancel it if the
+ * caller actually wants the bytes reordered.
+ */
+ __asm__("rbit %w0, %w1" : "=r"(t) : "r"(x));
+ z = ENDSWAP32(t);
+#else
+ /* A generic but slightly clever implementation: three rounds of
+ * exchanging progressively smaller bit groups reverse each byte's bits
+ * in place.
+ */
+# define SWIZZLE(x, m, s) ((((x)&(m)) << (s)) | (((x)&~(m)) >> (s)))
+ /* 76543210 */
+ t = SWIZZLE(x, 0x0f0f0f0f, 4); /* 32107654 -- swap nibbles */
+ t = SWIZZLE(t, 0x33333333, 2); /* 10325476 -- swap bit pairs */
+ z = SWIZZLE(t, 0x55555555, 1); /* 01234567 -- swap adjacent bits */
+# undef SWIZZLE
+#endif
+ return (z);
+}
+
+/* Build the key table for the ARM64 PMULL implementation: reverse the
+ * bits within each byte of the key, and store two copies of each 64-bit
+ * chunk, padding an odd trailing word with zero.
+ */
+static void arm64_pmull_mktable(const gcm_params *p,
+ uint32 *ktab, const uint32 *k)
+{
+ unsigned n = p->n;
+ uint32 *t;
+
+ /* We just need to store the value in a way which is convenient for the
+ * assembler code to read back. That involves two transformations:
+ *
+ * * firstly, reversing the order of the bits in each byte; and,
+ *
+ * * secondly, storing two copies of each 64-bit chunk.
+ *
+ * Note that, in this case, we /want/ the little-endian byte order of GCM,
+ * so endianness-swapping happens in the big-endian case.
+ */
+
+ t = ktab;
+ if (p->f&GCMF_SWAP) {
+ while (n >= 2) {
+ t[0] = t[2] = rbit32(k[0]);
+ t[1] = t[3] = rbit32(k[1]);
+ t += 4; k += 2; n -= 2;
+ }
+ if (n) { t[0] = t[2] = rbit32(k[0]); t[1] = t[3] = 0; }
+ } else {
+ while (n >= 2) {
+ t[0] = t[2] = ENDSWAP32(rbit32(k[0]));
+ t[1] = t[3] = ENDSWAP32(rbit32(k[1]));
+ t += 4; k += 2; n -= 2;
+ }
+ if (n) { t[0] = t[2] = ENDSWAP32(rbit32(k[0])); t[1] = t[3] = 0; }
+ }
+}
+#endif
+
CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable,
(const gcm_params *p, uint32 *ktab, const uint32 *k),
(p, ktab, k),
static gcm_mktable__functype *pick_mktable(void)
{
+ /* Prefer a hardware-specific key-table builder when the CPU reports the
+ * necessary features; otherwise fall back to the portable version.
+ */
+#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(gcm_mktable, pclmul_mktable,
+ cpu_feature_p(CPUFEAT_X86_SSSE3) &&
+ cpu_feature_p(CPUFEAT_X86_PCLMUL));
+#endif
+#if CPUFAM_ARMEL
+ DISPATCH_PICK_COND(gcm_mktable, arm_crypto_mktable,
+ cpu_feature_p(CPUFEAT_ARM_PMULL));
+#endif
+#if CPUFAM_ARM64
+ DISPATCH_PICK_COND(gcm_mktable, arm64_pmull_mktable,
+ cpu_feature_p(CPUFEAT_ARM_PMULL));
+#endif
DISPATCH_PICK_FALLBACK(gcm_mktable, simple_mktable);
}
else for (i = 0; i < p->n; i++) k[i] = ENDSWAP32(ktab[24*p->n + i]);
}
+#if CPUFAM_X86 || CPUFAM_AMD64
+/* Recover the raw key from a table built by `pclmul_mktable': skip the
+ * zero padding and undo the word reversal.  No byte swap is needed
+ * because the stored form no longer depends on the blockcipher's
+ * endianness.
+ */
+static void pclmul_recover_k(const gcm_params *p,
+ uint32 *k, const uint32 *ktab)
+{
+ unsigned n = p->n;
+ unsigned nz;
+ const uint32 *t;
+
+ /* The representation is already independent of the blockcipher endianness.
+ * We need to compensate for padding, and reorder the words.
+ */
+
+ if (n == 3) nz = 1; else nz = 0;
+ t = ktab + n + nz;
+ while (n--) *k++ = *--t;
+}
+#endif
+
+#if CPUFAM_ARMEL
+/* Recover the raw key from a table built by `arm_crypto_mktable' by
+ * exchanging the two words of each 64-bit lane back again.
+ */
+static void arm_crypto_recover_k(const gcm_params *p,
+ uint32 *k, const uint32 *ktab)
+{
+ unsigned n = p->n;
+ const uint32 *t;
+
+ /* The representation is already independent of the blockcipher endianness.
+ * We only need to reorder the words.
+ */
+
+ t = ktab;
+ while (n >= 2) { k[1] = t[0]; k[0] = t[1]; t += 2; k += 2; n -= 2; }
+ if (n) k[0] = t[1];
+}
+#endif
+
+#if CPUFAM_ARM64
+/* Recover the raw key from a table built by `arm64_pmull_mktable': read
+ * one of the two copies of each 64-bit chunk, undoing the bit- and
+ * byte-scrambling applied when the table was made.
+ */
+static void arm64_pmull_recover_k(const gcm_params *p,
+ uint32 *k, const uint32 *ktab)
+{
+ unsigned n = p->n;
+ const uint32 *t;
+
+ /* The representation is already independent of the blockcipher endianness.
+ * We need to skip the duplicate pieces, and unscramble the bytes.
+ */
+
+ t = ktab;
+ while (n >= 2) {
+ k[0] = ENDSWAP32(rbit32(t[0]));
+ k[1] = ENDSWAP32(rbit32(t[1]));
+ t += 4; k += 2; n -= 2;
+ }
+ if (n) k[0] = ENDSWAP32(rbit32(t[0]));
+}
+#endif
+
CPU_DISPATCH(static, EMPTY, void, recover_k,
(const gcm_params *p, uint32 *k, const uint32 *ktab),
(p, k, ktab),
pick_recover_k, simple_recover_k)
static recover_k__functype *pick_recover_k(void)
- { DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k); }
+{
+ /* Each table format needs its matching `recover_k', so these feature
+ * tests mirror the selection logic in `pick_mktable' above.
+ */
+#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(recover_k, pclmul_recover_k,
+ cpu_feature_p(CPUFEAT_X86_SSSE3) &&
+ cpu_feature_p(CPUFEAT_X86_PCLMUL));
+#endif
+#if CPUFAM_ARMEL
+ DISPATCH_PICK_COND(recover_k, arm_crypto_recover_k,
+ cpu_feature_p(CPUFEAT_ARM_PMULL));
+#endif
+#if CPUFAM_ARM64
+ DISPATCH_PICK_COND(recover_k, arm64_pmull_recover_k,
+ cpu_feature_p(CPUFEAT_ARM_PMULL));
+#endif
+ DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k);
+}
/* --- @gcm_mulk_N{b,l}@ --- *
*
* function whose performance actually matters.
*/
+/* Declaration and dispatch helpers for the accelerated multiply
+ * functions.  For each architecture, `DECL_MULK_<ARCH>(var)' declares the
+ * external assembler entry points for the variant `var' (e.g. `128b'),
+ * and `PICK_MULK_<ARCH>(var)' selects one of them at dispatch time if the
+ * CPU reports the necessary features; on other architectures both expand
+ * to (effectively) nothing.
+ */
+#if CPUFAM_X86 || CPUFAM_AMD64
+# define DECL_MULK_X86ISH(var) extern gcm_mulk_##var##__functype \
+ gcm_mulk_##var##_x86ish_pclmul_avx, \
+ gcm_mulk_##var##_x86ish_pclmul;
+# define PICK_MULK_X86ISH(var) do { \
+ DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul_avx, \
+ cpu_feature_p(CPUFEAT_X86_AVX) && \
+ cpu_feature_p(CPUFEAT_X86_PCLMUL) && \
+ cpu_feature_p(CPUFEAT_X86_SSSE3)); \
+ DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul, \
+ cpu_feature_p(CPUFEAT_X86_PCLMUL) && \
+ cpu_feature_p(CPUFEAT_X86_SSSE3)); \
+} while (0)
+#else
+# define DECL_MULK_X86ISH(var)
+# define PICK_MULK_X86ISH(var) do ; while (0)
+#endif
+
+#if CPUFAM_ARMEL
+# define DECL_MULK_ARM(var) \
+ extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm_crypto;
+# define PICK_MULK_ARM(var) do { \
+ DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm_crypto, \
+ cpu_feature_p(CPUFEAT_ARM_PMULL)); \
+} while (0)
+#else
+# define DECL_MULK_ARM(var)
+# define PICK_MULK_ARM(var) do ; while (0)
+#endif
+
+#if CPUFAM_ARM64
+# define DECL_MULK_ARM64(var) \
+ extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm64_pmull;
+# define PICK_MULK_ARM64(var) do { \
+ DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm64_pmull, \
+ cpu_feature_p(CPUFEAT_ARM_PMULL)); \
+} while (0)
+#else
+# define DECL_MULK_ARM64(var)
+# define PICK_MULK_ARM64(var) do ; while (0)
+#endif
+
#define DEF_MULK(nbits) \
\
CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##b, \
for (i = 0; i < nbits/32; i++) a[i] = z[i]; \
} \
\
+DECL_MULK_X86ISH(nbits##b) \
+DECL_MULK_ARM(nbits##b) \
+DECL_MULK_ARM64(nbits##b) \
static gcm_mulk_##nbits##b##__functype *pick_mulk_##nbits##b(void) \
- { DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits); } \
+{ \
+ PICK_MULK_X86ISH(nbits##b); \
+ PICK_MULK_ARM(nbits##b); \
+ PICK_MULK_ARM64(nbits##b); \
+ DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits); \
+} \
+ \
+DECL_MULK_X86ISH(nbits##l) \
+DECL_MULK_ARM(nbits##l) \
+DECL_MULK_ARM64(nbits##l) \
static gcm_mulk_##nbits##l##__functype *pick_mulk_##nbits##l(void) \
- { DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits); }
+{ \
+ PICK_MULK_X86ISH(nbits##l); \
+ PICK_MULK_ARM(nbits##l); \
+ PICK_MULK_ARM64(nbits##l); \
+ DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits); \
+}
GCM_WIDTHS(DEF_MULK)
#ifdef TEST_RIG
+#include <mLib/macros.h>
#include <mLib/quis.h>
#include <mLib/testrig.h>
+#ifdef ENABLE_ASM_DEBUG
+# include "regdump.h"
+#endif
+
static void report_failure(const char *test, unsigned nbits,
const char *ref, dstr v[], dstr *d)
{
#define CHECK(E, what, ref) do { \
for (i = 0; i < nbits/32; i++) STORE32_##E(d.buf + 4*i, z[i]); \
- if (memcmp(d.buf, v[I_##ref].buf, nbits/8) != 0) \
+ if (MEMCMP(d.buf, !=, v[I_##ref].buf, nbits/8)) \
{ ok = 0; report_failure(what, nbits, #ref, v, &d); } \
} while (0)
int main(int argc, char *argv[])
{
ego(argv[0]);
+#ifdef ENABLE_ASM_DEBUG
+ /* Initialize register-dump support so the assembler code can be
+ * debugged (see regdump.h).
+ */
+ regdump_init();
+#endif
test_run(argc, argv, defs, SRCDIR"/t/gcm");
return (0);
}