+#if CPUFAM_X86 || CPUFAM_AMD64
+static void pclmul_mktable(const gcm_params *p,
+                           uint32 *ktab, const uint32 *k)
+{
+  /* Store the P->n key words from K into KTAB, arranged for the assembler
+   * code to read back.  The words are written in reverse order, each one
+   * passed through `ENDSWAP32' first if P->f has `GCMF_SWAP' set.  A
+   * three-word (96-bit) key also gets a single zero word stored below it,
+   * filling the table out to a 128-bit chunk.
+   */
+
+  unsigned nwords = p->n;
+  unsigned npad = nwords == 3 ? 1 : 0;
+  unsigned i = nwords + npad;
+
+  /* Work down from the top: the key words first, then any padding. */
+  while (i > npad) {
+    ktab[--i] = (p->f&GCMF_SWAP) ? ENDSWAP32(*k) : *k;
+    k++;
+  }
+  while (i) ktab[--i] = 0;
+}
+#endif
+
+#if CPUFAM_ARMEL
+static void arm_crypto_mktable(const gcm_params *p,
+                               uint32 *ktab, const uint32 *k)
+{
+  /* Store the P->n key words from K into KTAB, arranged for the assembler
+   * code to read back.  The words are taken two at a time, and the two
+   * words of each pair are stored in the opposite order.  A leftover odd
+   * word goes in the higher slot of a final pair whose lower slot is
+   * zero.  If P->f has `GCMF_SWAP' set, each key word is passed through
+   * `ENDSWAP32' as it is stored.
+   */
+
+  unsigned nwords = p->n;
+  unsigned i;
+
+  for (i = 0; nwords - i >= 2; i += 2) {
+    if (p->f&GCMF_SWAP) {
+      ktab[i + 1] = ENDSWAP32(k[i]); ktab[i] = ENDSWAP32(k[i + 1]);
+    } else {
+      ktab[i + 1] = k[i]; ktab[i] = k[i + 1];
+    }
+  }
+  /* An odd word count leaves one word over; pair it with zero. */
+  if (i < nwords) {
+    ktab[i + 1] = (p->f&GCMF_SWAP) ? ENDSWAP32(k[i]) : k[i]; ktab[i] = 0;
+  }
+}
+#endif
+
+#if CPUFAM_ARM64
+static uint32 rbit32(uint32 x)
+{
+ uint32 z, t;
+ /* Return X with the order of the bits within each byte reversed. */
+#if GCC_VERSION_P(4, 3)
+ /* `rbit' reverses the whole word, so `ENDSWAP32' follows to restore the
+ * byte order (assuming it is a plain 32-bit byte swap -- confirm). The
+ * steps are kept separate, rather than in one block of assembler, for
+ * finer-grained instruction scheduling, and so the compiler can cancel
+ * the `ENDSWAP32' if the caller actually wants the bytes reordered. */
+ __asm__("rbit %w0, %w1" : "=r"(t) : "r"(x));
+ z = ENDSWAP32(t);
+#else
+ /* Portable fallback: exchange ever-smaller bit groups in each byte. */
+# define SWIZZLE(x, m, s) ((((x)&(m)) << (s)) | (((x)&~(m)) >> (s)))
+ /* 76543210 -- bit order within each byte to begin with */
+ t = SWIZZLE(x, 0x0f0f0f0f, 4); /* 32107654 -- swap nibbles */
+ t = SWIZZLE(t, 0x33333333, 2); /* 10325476 -- swap bit pairs */
+ z = SWIZZLE(t, 0x55555555, 1); /* 01234567 -- swap adjacent bits */
+# undef SWIZZLE
+#endif
+ return (z);
+}
+
+static void arm64_pmull_mktable(const gcm_params *p,
+                                uint32 *ktab, const uint32 *k)
+{
+  /* Store the P->n key words from K into KTAB, arranged for the assembler
+   * code to read back.  Every word is put through `rbit32'; and then, if
+   * P->f does /not/ have `GCMF_SWAP' set, through `ENDSWAP32' as well.
+   * The words are taken in chunks of two, and each chunk is stored twice
+   * in succession, so every two input words fill four words of KTAB.  A
+   * leftover odd word becomes the first word of a final chunk whose
+   * second word is zero.
+   */
+
+  unsigned nwords = p->n;
+  unsigned i;
+  uint32 w;
+  uint32 *out = ktab;
+
+  /* Complete chunks: two transformed words, written out twice over. */
+  for (i = 0; nwords - i >= 2; i += 2, out += 4) {
+    w = rbit32(k[i]);
+    if (!(p->f&GCMF_SWAP)) w = ENDSWAP32(w);
+    out[0] = w; out[2] = w;
+    w = rbit32(k[i + 1]);
+    if (!(p->f&GCMF_SWAP)) w = ENDSWAP32(w);
+    out[1] = w; out[3] = w;
+  }
+
+  /* An odd word count leaves one word over; pad its chunk with zero. */
+  if (i < nwords) {
+    w = rbit32(k[i]);
+    if (!(p->f&GCMF_SWAP)) w = ENDSWAP32(w);
+    out[0] = w; out[2] = w; out[1] = 0; out[3] = 0;
+  }
+}
+#endif
+