symm/gcm-*.S: GCM acceleration using hardware polynomial multiplication.
diff --git a/symm/gcm.c b/symm/gcm.c
index 12096ae..73b2851 100644
@@ -234,6 +234,120 @@ static void simple_mktable(const gcm_params *p,
     for (i = 0; i < 32*p->n*p->n; i++) ktab[i] = ENDSWAP32(ktab[i]);
 }
 
+#if CPUFAM_X86 || CPUFAM_AMD64
+static void pclmul_mktable(const gcm_params *p,
+                          uint32 *ktab, const uint32 *k)
+{
+  unsigned n = p->n;
+  unsigned nz;
+  uint32 *t;
+
+  /* We just need to store the value in a way which is convenient for the
+   * assembler code to read back.  That involves reordering the words, and,
+   * in the case of 96-bit blocks, padding with zeroes to fill out a 128-bit
+   * chunk.
+   */
+
+  if (n == 3) nz = 1;
+  else nz = 0;
+  t = ktab + n + nz;
+
+  if (p->f&GCMF_SWAP) while (n--) { *--t = ENDSWAP32(*k); k++; }
+  else while (n--) *--t = *k++;
+  while (nz--) *--t = 0;
+}
+#endif
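
To make the layout concrete, here is a standalone sketch of the 96-bit
case (n == 3, so nz == 1) on a cipher that does not set GCMF_SWAP; the
key words are arbitrary example values and not part of the patch:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
      const uint32_t k[3] = { 0x11111111, 0x22222222, 0x33333333 };
      const uint32_t *kk = k;
      uint32_t ktab[4], *t = ktab + 3 + 1;
      unsigned n = 3, nz = 1;

      while (n--) *--t = *kk++;         /* reverse the word order */
      while (nz--) *--t = 0;            /* pad out the 128-bit chunk */

      /* Prints `00000000 33333333 22222222 11111111'. */
      printf("%08x %08x %08x %08x\n", ktab[0], ktab[1], ktab[2], ktab[3]);
      return (0);
    }
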
+
+#if CPUFAM_ARMEL
+static void arm_crypto_mktable(const gcm_params *p,
+                              uint32 *ktab, const uint32 *k)
+{
+  unsigned n = p->n;
+  uint32 *t;
+
+  /* We just need to store the value in a way which is convenient for the
+   * assembler code to read back.  That involves swapping the bytes in each
+   * 64-bit lane.
+   */
+
+  t = ktab;
+  if (p->f&GCMF_SWAP) {
+    while (n >= 2) {
+      t[1] = ENDSWAP32(k[0]); t[0] = ENDSWAP32(k[1]);
+      t += 2; k += 2; n -= 2;
+    }
+    if (n) { t[1] = ENDSWAP32(k[0]); t[0] = 0; }
+  } else {
+    while (n >= 2) {
+      t[1] = k[0]; t[0] = k[1];
+      t += 2; k += 2; n -= 2;
+    }
+    if (n) { t[1] = k[0]; t[0] = 0; }
+  }
+}
+#endif
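
As a concrete illustration of the non-SWAP branch: within each 64-bit
lane the two 32-bit words simply change places.  A standalone sketch
with hypothetical values, not part of the patch:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
      uint32_t k[4] = { 0, 1, 2, 3 }, t[4];
      unsigned i;

      /* The word swap from the non-SWAP loop above, for a 128-bit key. */
      for (i = 0; i < 4; i += 2) { t[i + 1] = k[i]; t[i] = k[i + 1]; }
      assert(t[0] == 1 && t[1] == 0 && t[2] == 3 && t[3] == 2);
      return (0);
    }
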
+
+#if CPUFAM_ARM64
+static uint32 rbit32(uint32 x)
+{
+  uint32 z, t;
+
+#if GCC_VERSION_P(4, 3)
+  /* Two tricks here.  Firstly, two separate steps, rather than a single
+   * block of assembler, to allow finer-grained instruction scheduling.
+   * Secondly, use `ENDSWAP32' so that the compiler can cancel it if the
+   * caller actually wants the bytes reordered.
+   */
+  __asm__("rbit %w0, %w1" : "=r"(t) : "r"(x));
+  z = ENDSWAP32(t);
+#else
+  /* A generic but slightly clever implementation. */
+#  define SWIZZLE(x, m, s) ((((x)&(m)) << (s)) | (((x)&~(m)) >> (s)))
+                                       /* 76543210 */
+  t = SWIZZLE(x, 0x0f0f0f0f, 4);       /* 32107654 -- swap nibbles */
+  t = SWIZZLE(t, 0x33333333, 2);       /* 10325476 -- swap bit pairs */
+  z = SWIZZLE(t, 0x55555555, 1);       /* 01234567 -- swap adjacent bits */
+#  undef SWIZZLE
+#endif
+  return (z);
+}
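
The generic ladder reverses the bits within each byte while leaving the
bytes in place, which is also the net effect of `rbit' followed by
`ENDSWAP32'.  A standalone sketch checking the ladder against a naive
per-byte loop (local names; not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    #define SWIZZLE(x, m, s) ((((x)&(m)) << (s)) | (((x)&~(m)) >> (s)))

    /* The generic ladder from `rbit32' above. */
    static uint32_t rbit32_generic(uint32_t x)
    {
      x = SWIZZLE(x, 0x0f0f0f0f, 4);
      x = SWIZZLE(x, 0x33333333, 2);
      return (SWIZZLE(x, 0x55555555, 1));
    }

    /* Naive reference: reflect each bit within its own byte. */
    static uint32_t rbit32_slow(uint32_t x)
    {
      uint32_t z = 0;
      unsigned i;

      for (i = 0; i < 32; i++)
        if ((x >> i)&1) z |= (uint32_t)1 << (8*(i/8) + 7 - i%8);
      return (z);
    }

    int main(void)
    {
      uint32_t x;

      for (x = 0; x < 0x100000; x++)
        assert(rbit32_generic(x) == rbit32_slow(x));
      return (0);
    }
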
+
+static void arm64_pmull_mktable(const gcm_params *p,
+                               uint32 *ktab, const uint32 *k)
+{
+  unsigned n = p->n;
+  uint32 *t;
+
+  /* We just need to store the value in a way which is convenient for the
+   * assembler code to read back.  That involves two transformations:
+   *
+   *   * firstly, reversing the order of the bits in each byte; and,
+   *
+   *   * secondly, storing two copies of each 64-bit chunk.
+   *
+   * Note that, in this case, we /want/ the little-endian byte order of GCM,
+   * so endianness-swapping happens in the big-endian case.
+   */
+
+  t = ktab;
+  if (p->f&GCMF_SWAP) {
+    while (n >= 2) {
+      t[0] = t[2] = rbit32(k[0]);
+      t[1] = t[3] = rbit32(k[1]);
+      t += 4; k += 2; n -= 2;
+    }
+    if (n) { t[0] = t[2] = rbit32(k[0]); t[1] = t[3] = 0; }
+  } else {
+    while (n >= 2) {
+      t[0] = t[2] = ENDSWAP32(rbit32(k[0]));
+      t[1] = t[3] = ENDSWAP32(rbit32(k[1]));
+      t += 4; k += 2; n -= 2;
+    }
+    if (n) { t[0] = t[2] = ENDSWAP32(rbit32(k[0])); t[1] = t[3] = 0; }
+  }
+}
+#endif
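
The duplicated 64-bit chunks aside, the per-word transformation in the
non-SWAP branch is ENDSWAP32(rbit32(.)).  Per-byte bit reversal and byte
swapping are commuting involutions, so the composite is its own inverse,
which is what `arm64_pmull_recover_k' below relies on.  A standalone
check, with local stand-ins for the two primitives (not part of the
patch):

    #include <assert.h>
    #include <stdint.h>

    #define SWIZZLE(x, m, s) ((((x)&(m)) << (s)) | (((x)&~(m)) >> (s)))

    static uint32_t xform(uint32_t x)   /* ENDSWAP32(rbit32(x)), locally */
    {
      x = SWIZZLE(x, 0x0f0f0f0f, 4);    /* rbit32: per-byte bit reverse */
      x = SWIZZLE(x, 0x33333333, 2);
      x = SWIZZLE(x, 0x55555555, 1);
      return ((x >> 24) | ((x >> 8)&0xff00u) |  /* ENDSWAP32: byte swap */
              ((x << 8)&0xff0000u) | (x << 24));
    }

    int main(void)
    {
      uint32_t x;

      for (x = 0; x < 0x100000; x++)
        assert(xform(xform(x)) == x);   /* the composite is self-inverse */
      return (0);
    }
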
+
 CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable,
             (const gcm_params *p, uint32 *ktab, const uint32 *k),
             (p, ktab, k),
@@ -241,6 +355,19 @@ CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable,
 
 static gcm_mktable__functype *pick_mktable(void)
 {
+#if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(gcm_mktable, pclmul_mktable,
+                    cpu_feature_p(CPUFEAT_X86_SSSE3) &&
+                    cpu_feature_p(CPUFEAT_X86_PCLMUL));
+#endif
+#if CPUFAM_ARMEL
+  DISPATCH_PICK_COND(gcm_mktable, arm_crypto_mktable,
+                    cpu_feature_p(CPUFEAT_ARM_PMULL));
+#endif
+#if CPUFAM_ARM64
+  DISPATCH_PICK_COND(gcm_mktable, arm64_pmull_mktable,
+                    cpu_feature_p(CPUFEAT_ARM_PMULL));
+#endif
   DISPATCH_PICK_FALLBACK(gcm_mktable, simple_mktable);
 }
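
For readers unfamiliar with the dispatch machinery: CPU_DISPATCH
arranges a function pointer which initially points at a resolver; the
first call runs the pick function, patches the pointer, and completes
the call through the chosen implementation.  A simplified standalone
model of the pattern with stand-in names (not the actual macro
expansion):

    #include <stdio.h>

    typedef void greet__functype(void);

    static void greet_resolve(void);
    static greet__functype *greet_ptr = greet_resolve;

    /* Two candidate implementations. */
    static void greet_fancy(void) { puts("fancy"); }
    static void greet_plain(void) { puts("plain"); }

    /* Stand-in for a `cpu_feature_p(...)' probe. */
    static int cpu_feature_fancy_p(void) { return (0); }

    /* Runs on the first call only: probe the CPU, patch the pointer,
     * and complete the interrupted call through it.
     */
    static void greet_resolve(void)
    {
      if (cpu_feature_fancy_p()) greet_ptr = greet_fancy;
      else greet_ptr = greet_plain;
      greet_ptr();
    }

    void greet(void) { greet_ptr(); }   /* public entry point */

    int main(void) { greet(); greet(); return (0); }
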
 
@@ -271,13 +398,84 @@ static void simple_recover_k(const gcm_params *p,
   else for (i = 0; i < p->n; i++) k[i] = ENDSWAP32(ktab[24*p->n + i]);
 }
 
+#if CPUFAM_X86 || CPUFAM_AMD64
+static void pclmul_recover_k(const gcm_params *p,
+                            uint32 *k, const uint32 *ktab)
+{
+  unsigned n = p->n;
+  unsigned nz;
+  const uint32 *t;
+
+  /* The representation is already independent of the blockcipher endianness.
+   * We need to compensate for padding, and reorder the words.
+   */
+
+  if (n == 3) nz = 1; else nz = 0;
+  t = ktab + n + nz;
+  while (n--) *k++ = *--t;
+}
+#endif
+
+#if CPUFAM_ARMEL
+static void arm_crypto_recover_k(const gcm_params *p,
+                                uint32 *k, const uint32 *ktab)
+{
+  unsigned n = p->n;
+  const uint32 *t;
+
+  /* The representation is already independent of the blockcipher endianness.
+   * We only need to reorder the words.
+   */
+
+  t = ktab;
+  while (n >= 2) { k[1] = t[0]; k[0] = t[1]; t += 2; k += 2; n -= 2; }
+  if (n) k[0] = t[1];
+}
+#endif
+
+#if CPUFAM_ARM64
+static void arm64_pmull_recover_k(const gcm_params *p,
+                                 uint32 *k, const uint32 *ktab)
+{
+  unsigned n = p->n;
+  const uint32 *t;
+
+  /* The representation is already independent of the blockcipher endianness.
+   * We need to skip the duplicate pieces, and unscramble the bytes.
+   */
+
+  t = ktab;
+  while (n >= 2) {
+    k[0] = ENDSWAP32(rbit32(t[0]));
+    k[1] = ENDSWAP32(rbit32(t[1]));
+    t += 4; k += 2; n -= 2;
+  }
+  if (n) k[0] = ENDSWAP32(rbit32(t[0]));
+}
+#endif
+
 CPU_DISPATCH(static, EMPTY, void, recover_k,
             (const gcm_params *p, uint32 *k, const uint32 *ktab),
             (p, k, ktab),
             pick_recover_k, simple_recover_k)
 
 static recover_k__functype *pick_recover_k(void)
-  { DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k); }
+{
+#if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(recover_k, pclmul_recover_k,
+                    cpu_feature_p(CPUFEAT_X86_SSSE3) &&
+                    cpu_feature_p(CPUFEAT_X86_PCLMUL));
+#endif
+#if CPUFAM_ARMEL
+  DISPATCH_PICK_COND(recover_k, arm_crypto_recover_k,
+                    cpu_feature_p(CPUFEAT_ARM_PMULL));
+#endif
+#if CPUFAM_ARM64
+  DISPATCH_PICK_COND(recover_k, arm64_pmull_recover_k,
+                    cpu_feature_p(CPUFEAT_ARM_PMULL));
+#endif
+  DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k);
+}
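
Whichever pair of implementations the dispatcher selects, `recover_k'
must invert `gcm_mktable'.  A hypothetical self-test sketch, placed
inside gcm.c so that the static `recover_k' is visible; it assumes
ktab[] is sized for the representation in use and that no block exceeds
256 bits (8 words):

    #include <assert.h>

    static void check_recover_k(const gcm_params *p,
                                const uint32 *k, uint32 *ktab)
    {
      uint32 kk[8];
      unsigned i;

      gcm_mktable(p, ktab, k);          /* build the table... */
      recover_k(p, kk, ktab);           /* ...and read the key back */
      for (i = 0; i < p->n; i++) assert(kk[i] == k[i]);
    }
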
 
 /* --- @gcm_mulk_N{b,l}@ --- *
  *
@@ -292,6 +490,48 @@ static recover_k__functype *pick_recover_k(void)
  *             function whose performance actually matters.
  */
 
+#if CPUFAM_X86 || CPUFAM_AMD64
+#  define DECL_MULK_X86ISH(var) extern gcm_mulk_##var##__functype      \
+  gcm_mulk_##var##_x86ish_pclmul_avx,                                  \
+  gcm_mulk_##var##_x86ish_pclmul;
+#  define PICK_MULK_X86ISH(var) do {                                   \
+  DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul_avx, \
+                    cpu_feature_p(CPUFEAT_X86_AVX) &&                  \
+                    cpu_feature_p(CPUFEAT_X86_PCLMUL) &&               \
+                    cpu_feature_p(CPUFEAT_X86_SSSE3));                 \
+  DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul,   \
+                    cpu_feature_p(CPUFEAT_X86_PCLMUL) &&               \
+                    cpu_feature_p(CPUFEAT_X86_SSSE3));                 \
+} while (0)
+#else
+#  define DECL_MULK_X86ISH(var)
+#  define PICK_MULK_X86ISH(var) do ; while (0)
+#endif
+
+#if CPUFAM_ARMEL
+#  define DECL_MULK_ARM(var)                                           \
+  extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm_crypto;
+#  define PICK_MULK_ARM(var) do {                                      \
+  DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm_crypto,      \
+                    cpu_feature_p(CPUFEAT_ARM_PMULL));                 \
+} while (0)
+#else
+#  define DECL_MULK_ARM(var)
+#  define PICK_MULK_ARM(var) do ; while (0)
+#endif
+
+#if CPUFAM_ARM64
+#  define DECL_MULK_ARM64(var)                                         \
+  extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm64_pmull;
+#  define PICK_MULK_ARM64(var) do {                                    \
+  DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm64_pmull,     \
+                    cpu_feature_p(CPUFEAT_ARM_PMULL));                 \
+} while (0)
+#else
+#  define DECL_MULK_ARM64(var)
+#  define PICK_MULK_ARM64(var) do ; while (0)
+#endif
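
To make the token pasting concrete: on an ARM64 build,
DECL_MULK_ARM64(128b) and PICK_MULK_ARM64(128b) expand to the following
(reindented):

    extern gcm_mulk_128b__functype gcm_mulk_128b_arm64_pmull;

    do {
      DISPATCH_PICK_COND(gcm_mulk_128b, gcm_mulk_128b_arm64_pmull,
                         cpu_feature_p(CPUFEAT_ARM_PMULL));
    } while (0);

On other architectures both macros vanish, leaving only the portable
fallback in the pick function.
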
+
 #define DEF_MULK(nbits)                                                        \
                                                                        \
 CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##b,                  \
@@ -321,10 +561,27 @@ static void simple_mulk_##nbits(uint32 *a, const uint32 *ktab)            \
   for (i = 0; i < nbits/32; i++) a[i] = z[i];                          \
 }                                                                      \
                                                                        \
+DECL_MULK_X86ISH(nbits##b)                                             \
+DECL_MULK_ARM(nbits##b)                                                        \
+DECL_MULK_ARM64(nbits##b)                                              \
 static gcm_mulk_##nbits##b##__functype *pick_mulk_##nbits##b(void)     \
-  { DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits); } \
+{                                                                      \
+  PICK_MULK_X86ISH(nbits##b);                                          \
+  PICK_MULK_ARM(nbits##b);                                             \
+  PICK_MULK_ARM64(nbits##b);                                           \
+  DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits);    \
+}                                                                      \
+                                                                       \
+DECL_MULK_X86ISH(nbits##l)                                             \
+DECL_MULK_ARM(nbits##l)                                                        \
+DECL_MULK_ARM64(nbits##l)                                              \
 static gcm_mulk_##nbits##l##__functype *pick_mulk_##nbits##l(void)     \
-  { DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits); }
+{                                                                      \
+  PICK_MULK_X86ISH(nbits##l);                                          \
+  PICK_MULK_ARM(nbits##l);                                             \
+  PICK_MULK_ARM64(nbits##l);                                           \
+  DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits);    \
+}
 
 GCM_WIDTHS(DEF_MULK)
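
Each width in GCM_WIDTHS thus gains a big-endian (`b') and a
little-endian (`l') multiply, dispatched independently.  A hedged usage
sketch for the 128-bit big-endian variant, assuming the
`(uint32 *a, const uint32 *ktab)' signature shown in the hunk above; the
include path and the helper name are illustrative, not part of catacomb:

    #include <catacomb/gcm.h>

    /* Hypothetical caller: one GHASH step for a 128-bit block cipher.
     * `xblk' holds the next block already unpacked into 32-bit words;
     * the accumulator `a' is updated in place.
     */
    static void ghash_step(uint32 a[4], const uint32 xblk[4],
                           const uint32 *ktab)
    {
      unsigned i;

      for (i = 0; i < 4; i++) a[i] ^= xblk[i];  /* absorb the block */
      gcm_mulk_128b(a, ktab);           /* a <- a * k, via dispatch */
    }
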