+#if CPUFAM_X86 || CPUFAM_AMD64
+static void pclmul_mktable(const gcm_params *p,
+ uint32 *ktab, const uint32 *k)
+{
+ unsigned n = p->n;
+ unsigned nz;
+ uint32 *t;
+
+ /* We just need to store the value in a way which is convenient for the
+ * assembler code to read back. That involves reordering the words, and,
+ * in the case of 96-bit blocks, padding with zeroes to fill out a 128-bit
+ * chunk.
+ */
+
+ if (n == 3) nz = 1;
+ else nz = 0;
+ t = ktab + n + nz;
+
+ if (p->f&GCMF_SWAP) while (n--) { *--t = ENDSWAP32(*k); k++; }
+ else while (n--) *--t = *k++;
+ while (nz--) *--t = 0;
+}
+#endif
+
+#if CPUFAM_ARMEL
+static void arm_crypto_mktable(const gcm_params *p,
+ uint32 *ktab, const uint32 *k)
+{
+ unsigned n = p->n;
+ uint32 *t;
+
+ /* We just need to store the value in a way which is convenient for the
+ * assembler code to read back. That involves swapping the bytes in each
+ * 64-bit lane.
+ */
+
+ t = ktab;
+ if (p->f&GCMF_SWAP) {
+ while (n >= 2) {
+ t[1] = ENDSWAP32(k[0]); t[0] = ENDSWAP32(k[1]);
+ t += 2; k += 2; n -= 2;
+ }
+ if (n) { t[1] = ENDSWAP32(k[0]); t[0] = 0; }
+ } else {
+ while (n >= 2) {
+ t[1] = k[0]; t[0] = k[1];
+ t += 2; k += 2; n -= 2;
+ }
+ if (n) { t[1] = k[0]; t[0] = 0; }
+ }
+}
+#endif
+
+#if CPUFAM_ARM64
/* Return `x' with the order of the bits reversed within each byte, leaving
 * the bytes themselves in their original positions: the asm path reverses
 * all 32 bits with `rbit' and then swaps the bytes back with `ENDSWAP32';
 * the generic path swizzles bits only within each byte (all of its masks
 * and shifts stay inside byte boundaries).
 */
static uint32 rbit32(uint32 x)
{
  uint32 z, t;

#if GCC_VERSION_P(4, 3)
  /* Two tricks here.  Firstly, two separate steps, rather than a single
   * block of assembler, to allow finer-grained instruction scheduling.
   * Secondly, use `ENDSWAP32' so that the compiler can cancel it if the
   * caller actually wants the bytes reordered.
   */
  __asm__("rbit %w0, %w1" : "=r"(t) : "r"(x));
  z = ENDSWAP32(t);
#else
  /* A generic but slightly clever implementation. */
# define SWIZZLE(x, m, s) ((((x)&(m)) << (s)) | (((x)&~(m)) >> (s)))
  /* 76543210 */
  t = SWIZZLE(x, 0x0f0f0f0f, 4); /* 32107654 -- swap nibbles */
  t = SWIZZLE(t, 0x33333333, 2); /* 10325476 -- swap bit pairs */
  z = SWIZZLE(t, 0x55555555, 1); /* 01234567 -- swap adjacent bits */
# undef SWIZZLE
#endif
  return (z);
}
+
+static void arm64_pmull_mktable(const gcm_params *p,
+ uint32 *ktab, const uint32 *k)
+{
+ unsigned n = p->n;
+ uint32 *t;
+
+ /* We just need to store the value in a way which is convenient for the
+ * assembler code to read back. That involves two transformations:
+ *
+ * * firstly, reversing the order of the bits in each byte; and,
+ *
+ * * secondly, storing two copies of each 64-bit chunk.
+ *
+ * Note that, in this case, we /want/ the little-endian byte order of GCM,
+ * so endianness-swapping happens in the big-endian case.
+ */
+
+ t = ktab;
+ if (p->f&GCMF_SWAP) {
+ while (n >= 2) {
+ t[0] = t[2] = rbit32(k[0]);
+ t[1] = t[3] = rbit32(k[1]);
+ t += 4; k += 2; n -= 2;
+ }
+ if (n) { t[0] = t[2] = rbit32(k[0]); t[1] = t[3] = 0; }
+ } else {
+ while (n >= 2) {
+ t[0] = t[2] = ENDSWAP32(rbit32(k[0]));
+ t[1] = t[3] = ENDSWAP32(rbit32(k[1]));
+ t += 4; k += 2; n -= 2;
+ }
+ if (n) { t[0] = t[2] = ENDSWAP32(rbit32(k[0])); t[1] = t[3] = 0; }
+ }
+}
+#endif
+
/* Declare the CPU-dispatched entry point `gcm_mktable': `pick_mktable'
 * (below) selects an implementation at runtime, with `simple_mktable' as
 * the portable fallback.
 */
CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable,
	     (const gcm_params *p, uint32 *ktab, const uint32 *k),
	     (p, ktab, k),
	     pick_mktable, simple_mktable)
+
static gcm_mktable__functype *pick_mktable(void)
{
  /* Choose the best available `gcm_mktable' implementation for the runtime
   * CPU.  Each candidate is guarded by the feature flags it requires; if
   * none applies, fall back to the portable `simple_mktable'.
   */
#if CPUFAM_X86 || CPUFAM_AMD64
  /* The x86 code needs both SSSE3 and carry-less multiplication. */
  DISPATCH_PICK_COND(gcm_mktable, pclmul_mktable,
		     cpu_feature_p(CPUFEAT_X86_SSSE3) &&
		     cpu_feature_p(CPUFEAT_X86_PCLMUL));
#endif
#if CPUFAM_ARMEL
  DISPATCH_PICK_COND(gcm_mktable, arm_crypto_mktable,
		     cpu_feature_p(CPUFEAT_ARM_PMULL));
#endif
#if CPUFAM_ARM64
  DISPATCH_PICK_COND(gcm_mktable, arm64_pmull_mktable,
		     cpu_feature_p(CPUFEAT_ARM_PMULL));
#endif
  DISPATCH_PICK_FALLBACK(gcm_mktable, simple_mktable);
}
+
+/* --- @recover_k@ --- *
+ *
+ * Arguments: @const gcm_params *p@ = pointer to the parameters
+ * @uint32 *k@ = block-sized vector in which to store %$k$%
+ * @const uint32 *ktab@ = the table encoding %$k$%
+ *
+ * Returns: ---
+ *
 * Use: Recovers %$k$%, the secret from which @ktab@ was made by
 * @gcm_mktable@, from the table, and stores it in internal
 * (big-endian) form in @k@.
+ */
+
+static void simple_recover_k(const gcm_params *p,
+ uint32 *k, const uint32 *ktab)
+{
+ unsigned i;
+
+ /* If the blockcipher is big-endian, then the key is simply in the first
+ * table element, in the right format. If the blockcipher is little-endian
+ * then it's in element 24, and the bytes need swapping.
+ */
+
+ if (!(p->f&GCMF_SWAP)) for (i = 0; i < p->n; i++) k[i] = ktab[i];
+ else for (i = 0; i < p->n; i++) k[i] = ENDSWAP32(ktab[24*p->n + i]);
+}
+
+#if CPUFAM_X86 || CPUFAM_AMD64
+static void pclmul_recover_k(const gcm_params *p,
+ uint32 *k, const uint32 *ktab)
+{
+ unsigned n = p->n;
+ unsigned nz;
+ const uint32 *t;
+
+ /* The representation is already independent of the blockcipher endianness.
+ * We need to compensate for padding, and reorder the words.
+ */
+
+ if (n == 3) nz = 1; else nz = 0;
+ t = ktab + n + nz;
+ while (n--) *k++ = *--t;
+}
+#endif
+
+#if CPUFAM_ARMEL
+static void arm_crypto_recover_k(const gcm_params *p,
+ uint32 *k, const uint32 *ktab)
+{
+ unsigned n = p->n;
+ const uint32 *t;
+
+ /* The representation is already independent of the blockcipher endianness.
+ * We only need to reorder the words.
+ */
+
+ t = ktab;
+ while (n >= 2) { k[1] = t[0]; k[0] = t[1]; t += 2; k += 2; n -= 2; }
+ if (n) k[0] = t[1];
+}
+#endif
+
+#if CPUFAM_ARM64
+static void arm64_pmull_recover_k(const gcm_params *p,
+ uint32 *k, const uint32 *ktab)
+{
+ unsigned n = p->n;
+ const uint32 *t;
+
+ /* The representation is already independent of the blockcipher endianness.
+ * We need to skip the duplicate pieces, and unscramble the bytes.
+ */
+
+ t = ktab;
+ while (n >= 2) {
+ k[0] = ENDSWAP32(rbit32(t[0]));
+ k[1] = ENDSWAP32(rbit32(t[1]));
+ t += 4; k += 2; n -= 2;
+ }
+ if (n) k[0] = ENDSWAP32(rbit32(t[0]));
+}
+#endif
+
/* Declare the CPU-dispatched (file-local) entry point `recover_k':
 * `pick_recover_k' (below) selects an implementation at runtime, with
 * `simple_recover_k' as the portable fallback.
 */
CPU_DISPATCH(static, EMPTY, void, recover_k,
	     (const gcm_params *p, uint32 *k, const uint32 *ktab),
	     (p, k, ktab),
	     pick_recover_k, simple_recover_k)
+
static recover_k__functype *pick_recover_k(void)
{
  /* Choose the `recover_k' implementation matching the `gcm_mktable'
   * choice: the feature conditions here mirror `pick_mktable' exactly, so
   * the table is always read back by the code that understands its layout.
   */
#if CPUFAM_X86 || CPUFAM_AMD64
  DISPATCH_PICK_COND(recover_k, pclmul_recover_k,
		     cpu_feature_p(CPUFEAT_X86_SSSE3) &&
		     cpu_feature_p(CPUFEAT_X86_PCLMUL));
#endif
#if CPUFAM_ARMEL
  DISPATCH_PICK_COND(recover_k, arm_crypto_recover_k,
		     cpu_feature_p(CPUFEAT_ARM_PMULL));
#endif
#if CPUFAM_ARM64
  DISPATCH_PICK_COND(recover_k, arm64_pmull_recover_k,
		     cpu_feature_p(CPUFEAT_ARM_PMULL));
#endif
  DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k);
}
+
+/* --- @gcm_mulk_N{b,l}@ --- *