[catacomb] / symm / gcm.c
1/* -*-c-*-
2 *
3 * The GCM authenticated encryption mode
4 *
5 * (c) 2017 Straylight/Edgeware
6 */
7
8/*----- Licensing notice --------------------------------------------------*
9 *
10 * This file is part of Catacomb.
11 *
12 * Catacomb is free software: you can redistribute it and/or modify it
13 * under the terms of the GNU Library General Public License as published
14 * by the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * Catacomb is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Library General Public License for more details.
21 *
22 * You should have received a copy of the GNU Library General Public
23 * License along with Catacomb. If not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
25 * USA.
26 */
27
28/*----- Header files ------------------------------------------------------*/
29
30#include "config.h"
31
32#include <stdio.h>
33
34#include <mLib/bits.h>
35
36#include "dispatch.h"
37#include "gcm.h"
38#include "gcm-def.h"
39
40/*----- Overall strategy --------------------------------------------------*
41 *
42 * GCM is pretty awful to implement in software. (This presentation is going
43 * to be somewhat different to that in the specification, but I think it
44 * makes more sense like this.)
45 *
46 * We're given a %$w$%-bit blockcipher %$E$% with a key %$K$%.
47 *
48 * The main part is arithmetic in the finite field %$k = \gf{2^w}$%, which we
49 * represent as the quotient ring %$\gf{2}[t]/(p_w(t))$% for some irreducible
50 * degree-%$w$% polynomial %$p_w(t)$%, whose precise value isn't very important
51 * right now. We choose a secret point %$x = E_K(0^w)$%.
52 *
53 * We choose a length size %$z$% as follows: if %$w < 96$% then %$z = w$%;
54 * otherwise %$z = w/2$%. Format a message pair as follows:
55 *
56 * %$F(a, b) = P_w(a) \cat P_w(b) \cat [\ell(a)]_z \cat [\ell(b)]_z$%
57 *
58 * where %$P_w(x) = x \cat 0^n$%, with %$0 \le n < w$% such that
59 * %$\ell(x) + n \equiv 0 \pmod{w}$%.
60 *
61 * Hash a (block-aligned) message %$u$% as follows. First, split %$u$% into
62 * %$w$%-bit blocks %$u_0$%, %$u_1$%, %$\ldots$%, %$u_{n-1}$%. Interpret
63 * these as elements of %$k$%. Then
64 *
65 * %$G_x(u) = u_0 t^n + u_1 t^{n-1} + \cdots + u_{n-1} t$%
66 *
67 * converted back to a %$w$%-bit string.
68 *
69 * We're ready to go now. Suppose we're to encrypt a message %$M$% with
70 * header %$H$% and nonce %$N$%. If %$\ell(N) + 32 = w$% then let
71 * %$N' = N$% and let %$i_0 = 1$%; otherwise, let %$U = G_x(F(\epsilon, N))$%
72 * and split this into %$N' = U[0 \bitsto w - 32]$% and
73 * %$[i_0]_{32} = U[w - 32 \bitsto w]$%.
74 *
75 * Let %$n = \lceil \ell(M)/w \rceil$%. Compute
76 *
77 * %$y_j = E_K(N' \cat [i_0 + j]_{32})$%
78 *
79 * for %$0 \le j \le n$%. Let
80 *
81 * %$s = (y_1 \cat y_2 \cat \cdots \cat y_n)[0 \bitsto \ell(M)]$%.
82 *
83 * Let %$C = M \xor s$% and let %$T = G_x(F(H, C)) \xor y_0$%. These are the
84 * ciphertext and tag respectively.
85 *
86 * So why is this awful?
87 *
88 * For one thing, the bits are in a completely terrible order. The bytes are
89 * arranged in little-endian order, so the unit coefficient is in the first
90 * byte, and the degree-127 coefficient is in the last byte. But within each
91 * byte, the lowest-degree coefficient is in the most significant bit. It's
92 * therefore better to think of GCM as using a big-endian byte-ordering
93 * convention, but with the bits backwards.
94 *
95 * But messing about with byte ordering is expensive, so let's not do that in
96 * the inner loop. But multiplication in %$k$% is not easy either. Some
97 * kind of precomputed table would be nice, but that will leak secrets
98 * through the cache.
99 *
100 * I choose a particularly simple table: given %$x$%, let %$X[i'] = x t^i$%.
101 * Then %$x y = \sum_{0\le i<w} y_i X[i']$%, which is just a bunch of
102 * bitmasking. But the natural order for examining bits of %$y$% is not
103 * necessarily the obvious one. We'll have already loaded %$y$% into
104 * internal form, as 32-bit words. The good order to process these is left
105 * to right, from high to low bits. But now the order of degrees depends on
106 * the endianness of our conversion of bytes to words. Oh, well.
107 *
108 * If we've adopted a big-endian convention, then we'll see the degrees in
109 * order, 0, 1, ..., all the way up to %$w - 1$% and everything is fine. If
110 * we've adopted a little-endian convention, though, we'll see an ordering
111 * like this:
112 *
113 * 24, 25, ..., 31, 16, 17, ..., 23, 8, 9, ..., 15, 0, 1, ..., 7,
114 * 56, 57, ..., 63, 48, 49, ..., 55, 40, 41, ..., 47, 32, 33, ..., 39,
115 * etc.
116 *
117 * which is the ordinary order with 0x18 = 24 XORed into the index. That is,
118 * %$i' = i$% if we've adopted a big-endian convention, and
119 * %$i' = i \xor 24$% if we've adopted a little-endian convention.
120 */
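
/* As a sanity check on the index mapping just described, here's an
 * illustrative helper (not used by the library; the name is invented for
 * this sketch).  If a word is loaded with a little-endian convention and
 * scanned from its most significant bit downwards, as the multiply loops
 * below do, then step %$j$% of the scan sees the coefficient of degree
 * %$j \xor 24$%.
 */

static unsigned le_degree_at_step(unsigned j)
{
  unsigned byte = 3 - j/8;	/* byte 0 ends up at the bottom of the word */
  unsigned bit = j%8;		/* within a byte, degree increases towards the low end */

  return (8*byte + bit);	/* equal to j ^ 24 for 0 <= j < 32 */
}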
121
122/*----- Low-level utilities -----------------------------------------------*/
123
124/* --- @mult@, @divt@ --- *
125 *
126 * Arguments: @const gcm_params *p@ = pointer to the parameters
127 * @uint32 *z@ = where to write the result
128 * @const uint32 *x@ = input field element
129 *
130 * Returns: ---
131 *
132 * Use: Multiply or divide the input field element by %$t$%, and
133 * write the product or quotient to @z@. It's safe for @x@ and
134 * @z@ to be equal, but they should not otherwise overlap. Both
135 * input and output are in big-endian form, i.e., with the
136 * lowest-degree coefficients in the most significant bits.
137 */
138
139static void mult(const gcm_params *p, uint32 *z, const uint32 *x)
140{
141 uint32 m, c, t;
142 unsigned i;
143
144 t = x[p->n - 1]; m = -(t&1u); c = m&p->poly;
145 for (i = 0; i < p->n; i++) { t = x[i]; z[i] = (t >> 1) ^ c; c = t << 31; }
146}
147
148#if CPUFAM_X86 || CPUFAM_AMD64 || CPUFAM_ARMEL
149static void divt(const gcm_params *p, uint32 *z, const uint32 *x)
150{
151 uint32 m, c, t;
152 unsigned i;
153
154 t = x[0]; m = -((t >> 31)&1u); c = m&1u;
155 for (i = p->n - 1; i; i--) { t = x[i]; z[i] = (t << 1) | c; c = t >> 31; }
156 t = x[0]; z[0] = ((t ^ (m&p->poly)) << 1) | c;
157}
158#endif
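
/* A quick sanity-check sketch, not part of the library (the function name
 * is invented for the example): @divt@ is the inverse of @mult@, so
 * multiplying an element by %$t$% and then dividing by %$t$% again should
 * return the original value.  Only meaningful on targets where @divt@ is
 * compiled in.
 */

#if CPUFAM_X86 || CPUFAM_AMD64 || CPUFAM_ARMEL
static int mult_divt_roundtrip_p(const gcm_params *p, const uint32 *x)
{
  uint32 t[GCM_NMAX], u[GCM_NMAX];
  unsigned i;

  mult(p, t, x); divt(p, u, t);
  for (i = 0; i < p->n; i++) if (u[i] != x[i]) return (0);
  return (1);
}
#endif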
159
160/* --- @mul@ --- *
161 *
162 * Arguments: @const gcm_params *p@ = pointer to the parameters
163 * @uint32 *z@ = where to write the result
164 * @const uint32 *x, *y@ = input field elements
165 *
166 * Returns: ---
167 *
168 * Use: Multiply the input field elements together, and write the
169 * product to @z@. It's safe for the operands to overlap. Both
170 * inputs and the output are in big-endian form, i.e., with the
171 * lowest-degree coefficients in the most significant bits.
172 */
173
174static void mul(const gcm_params *p, uint32 *z,
175 const uint32 *x, const uint32 *y)
176{
177 uint32 m, t, u[GCM_NMAX], v[GCM_NMAX];
178 unsigned i, j, k;
179
180 /* We can't do this in-place at all, so use temporary space. Make a copy
181 * of @x@ in @u@, where we can clobber it, and build the product in @v@.
182 */
183 for (i = 0; i < p->n; i++) { u[i] = x[i]; v[i] = 0; }
184
185 /* Repeatedly multiply @x@ (in @u@) by %$t$%, and add together those
186 * %$x t^i$% selected by the bits of @y@. This is basically what you get
187 * by streaming the result of @gcm_mktable@ into @gcm_mulk_...@.
188 */
189 for (i = 0; i < p->n; i++) {
190 t = y[i];
191 for (j = 0; j < 32; j++) {
192 m = -((t >> 31)&1u);
193 for (k = 0; k < p->n; k++) v[k] ^= u[k]&m;
194 mult(p, u, u); t <<= 1;
195 }
196 }
197
198 /* Write out the result now that it's ready. */
199 for (i = 0; i < p->n; i++) z[i] = v[i];
200}
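
/* An illustrative sketch, not used by the library (the function name is
 * invented for the example): multiplication in %$k$% distributes over
 * addition, which is just XOR, so %$x (y + z) = x y + x z$%; a mismatch
 * here would point to a bug in @mul@ or @mult@.
 */

static int mul_distributes_p(const gcm_params *p, const uint32 *x,
			     const uint32 *y, const uint32 *z)
{
  uint32 s[GCM_NMAX], u[GCM_NMAX], v[GCM_NMAX], w[GCM_NMAX];
  unsigned i;

  for (i = 0; i < p->n; i++) s[i] = y[i] ^ z[i];
  mul(p, u, x, s);			/* %$x (y + z)$% */
  mul(p, v, x, y); mul(p, w, x, z);	/* %$x y$% and %$x z$% */
  for (i = 0; i < p->n; i++) if (u[i] != (v[i] ^ w[i])) return (0);
  return (1);
}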
201
202/*----- Table-based multiplication ----------------------------------------*/
203
204/* --- @gcm_mktable@ --- *
205 *
206 * Arguments: @const gcm_params *p@ = pointer to the parameters
207 * @uint32 *ktab@ = where to write the table; there must be
208 * space for %$32 n$% %$n$%-word entries, i.e.,
209 * %$32 n^2$% 32-bit words in total, where %$n$% is
210 * @p->n@, the block size in words
211 * @const uint32 *k@ = input field element
212 *
213 * Returns: ---
214 *
215 * Use: Construct a table for use by @gcm_mulk_...@ below, to
216 * multiply (vaguely) efficiently by @k@.
217 */
218
219static void simple_mktable(const gcm_params *p,
220 uint32 *ktab, const uint32 *k)
221{
222 unsigned m = (p->f&GCMF_SWAP ? 0x18 : 0);
223 unsigned i, j, o = m*p->n;
224
225 /* As described above, the table stores entries %$K[i \xor m] = k t^i$%,
226 * where %$m = 0$% (big-endian cipher) or %$m = 24$% (little-endian).
227 * The first job is to store %$K[m] = k$%.
228 *
229 * We initially build the table with the entries in big-endian order, and
230 * then swap them if necessary. This makes the arithmetic functions more
231 * amenable for use by @gcm_concat@ below.
232 */
233 if (!(p->f&GCMF_SWAP)) for (i = 0; i < p->n; i++) ktab[o + i] = k[i];
234 else for (i = 0; i < p->n; i++) ktab[o + i] = ENDSWAP32(k[i]);
235
236 /* Fill in the rest of the table by repeatedly multiplying the previous
237 * entry by %$t$%.
238 */
239 for (i = 1; i < 32*p->n; i++)
240 { j = (i ^ m)*p->n; mult(p, ktab + j, ktab + o); o = j; }
241
242 /* Finally, if the cipher uses a little-endian convention, then swap all of
243 * the individual words.
244 */
245 if (p->f&GCMF_SWAP)
246 for (i = 0; i < 32*p->n*p->n; i++) ktab[i] = ENDSWAP32(ktab[i]);
247}
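
/* An illustrative check, not part of the library (the function name is
 * invented for the example): with a big-endian parameter set (no
 * @GCMF_SWAP@), entry %$i$%, for %$0 \le i < 32 n$%, starts at
 * @ktab[i*p->n]@ and holds %$k t^i$%, which we can recompute with repeated
 * @mult@.
 */

static int mktable_entry_ok_p(const gcm_params *p, const uint32 *ktab,
			      const uint32 *k, unsigned i)
{
  uint32 t[GCM_NMAX];
  const uint32 *e = ktab + i*p->n;
  unsigned j;

  for (j = 0; j < p->n; j++) t[j] = k[j];
  while (i--) mult(p, t, t);
  for (j = 0; j < p->n; j++) if (t[j] != e[j]) return (0);
  return (1);
}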
248
249#if CPUFAM_X86 || CPUFAM_AMD64
250static void pclmul_mktable(const gcm_params *p,
251 uint32 *ktab, const uint32 *k)
252{
253 unsigned i, n = p->n;
254 unsigned nz;
255 uint32 k_over_t[GCM_NMAX], *t;
256
257 /* We need to divide the value by t (to compensate for the one-bit shift
258 * resulting from GCM's backwards bit ordering) and store the value in a
259 * way which is convenient for the assembler code to read back. That
260 * involves reordering the words, and, in the case of 96-bit blocks,
261 * padding with zeroes to fill out a 128-bit chunk.
262 */
263
264 if (!(p->f&GCMF_SWAP)) divt(p, k_over_t, k);
265 else {
266 for (i = 0; i < n; i++) k_over_t[i] = ENDSWAP32(k[i]);
267 divt(p, k_over_t, k_over_t);
268 }
269
270 if (n == 3) nz = 1;
271 else nz = 0;
272 k = k_over_t; t = ktab + n + nz; while (n--) *--t = *k++;
273 while (nz--) *--t = 0;
274}
275#endif
276
277#if CPUFAM_ARMEL
278static void arm_crypto_mktable(const gcm_params *p,
279 uint32 *ktab, const uint32 *k)
280{
281 unsigned i, n = p->n;
282 uint32 k_over_t[GCM_NMAX], *t;
283
284 /* We need to divide the value by t (to compensate for the one-bit shift
285 * resulting from GCM's backwards bit ordering) and store the value in a
286 * way which is convenient for the assembler code to read back. That
287 * involves swapping the bytes in each 64-bit lane.
288 */
289
290 if (!(p->f&GCMF_SWAP)) divt(p, k_over_t, k);
291 else {
292 for (i = 0; i < n; i++) k_over_t[i] = ENDSWAP32(k[i]);
293 divt(p, k_over_t, k_over_t);
294 }
295
296 t = ktab; k = k_over_t;
297 while (n >= 2) {
298 t[1] = k[0]; t[0] = k[1];
299 t += 2; k += 2; n -= 2;
300 }
301 if (n) { t[1] = k[0]; t[0] = 0; }
302}
303#endif
304
305#if CPUFAM_ARM64
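/* Reverse the order of the bits within each byte of @x@: for example,
 * @rbit32(0x80402010)@ is @0x01020408@.
 */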
306static uint32 rbit32(uint32 x)
307{
308 uint32 z, t;
309
310#if GCC_VERSION_P(4, 3)
311 /* Two tricks here. Firstly, two separate steps, rather than a single
312 * block of assembler, to allow finer-grained instruction scheduling.
313 * Secondly, use `ENDSWAP32' so that the compiler can cancel it if the
314 * caller actually wants the bytes reordered.
315 */
316 __asm__("rbit %w0, %w1" : "=r"(t) : "r"(x));
317 z = ENDSWAP32(t);
318#else
319 /* A generic but slightly clever implementation. */
320# define SWIZZLE(x, m, s) ((((x)&(m)) << (s)) | (((x)&~(m)) >> (s)))
321 /* 76543210 */
322 t = SWIZZLE(x, 0x0f0f0f0f, 4); /* 32107654 -- swap nibbles */
323 t = SWIZZLE(t, 0x33333333, 2); /* 10325476 -- swap bit pairs */
324 z = SWIZZLE(t, 0x55555555, 1); /* 01234567 -- swap adjacent bits */
325# undef SWIZZLE
326#endif
327 return (z);
328}
329
330static void arm64_pmull_mktable(const gcm_params *p,
331 uint32 *ktab, const uint32 *k)
332{
333 unsigned n = p->n;
334 uint32 *t;
335
336 /* We just need to store the value in a way which is convenient for the
337 * assembler code to read back. That involves two transformations:
338 *
339 * * firstly, reversing the order of the bits in each byte; and,
340 *
341 * * secondly, storing two copies of each 64-bit chunk.
342 *
343 * Note that, in this case, we /want/ the little-endian byte order of GCM,
344 * so endianness-swapping happens in the big-endian case.
345 */
346
347 t = ktab;
348 if (p->f&GCMF_SWAP) {
349 while (n >= 2) {
350 t[0] = t[2] = rbit32(k[0]);
351 t[1] = t[3] = rbit32(k[1]);
352 t += 4; k += 2; n -= 2;
353 }
354 if (n) { t[0] = t[2] = rbit32(k[0]); t[1] = t[3] = 0; }
355 } else {
356 while (n >= 2) {
357 t[0] = t[2] = ENDSWAP32(rbit32(k[0]));
358 t[1] = t[3] = ENDSWAP32(rbit32(k[1]));
359 t += 4; k += 2; n -= 2;
360 }
361 if (n) { t[0] = t[2] = ENDSWAP32(rbit32(k[0])); t[1] = t[3] = 0; }
362 }
363}
364#endif
365
366CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable,
367 (const gcm_params *p, uint32 *ktab, const uint32 *k),
368 (p, ktab, k),
369 pick_mktable, simple_mktable)
370
371static gcm_mktable__functype *pick_mktable(void)
372{
373#if CPUFAM_X86 || CPUFAM_AMD64
374 DISPATCH_PICK_COND(gcm_mktable, pclmul_mktable,
375 cpu_feature_p(CPUFEAT_X86_SSSE3) &&
376 cpu_feature_p(CPUFEAT_X86_PCLMUL));
377#endif
378#if CPUFAM_ARMEL
379 DISPATCH_PICK_COND(gcm_mktable, arm_crypto_mktable,
380 cpu_feature_p(CPUFEAT_ARM_PMULL));
381#endif
382#if CPUFAM_ARM64
383 DISPATCH_PICK_COND(gcm_mktable, arm64_pmull_mktable,
384 cpu_feature_p(CPUFEAT_ARM_PMULL));
385#endif
386 DISPATCH_PICK_FALLBACK(gcm_mktable, simple_mktable);
387}
388
389/* --- @recover_k@ --- *
390 *
391 * Arguments: @const gcm_params *p@ = pointer to the parameters
392 * @uint32 *k@ = block-sized vector in which to store %$k$%
393 * @const uint32 *ktab@ = the table encoding %$k$%
394 *
395 * Returns: ---
396 *
397 * Use: Recovers %$k$%, the secret from which @ktab@ was built by
398 * @gcm_mktable@, from the table, and stores it in internal
399 * (big-endian) form in @k@.
400 */
401
402static void simple_recover_k(const gcm_params *p,
403 uint32 *k, const uint32 *ktab)
404{
405 unsigned i;
406
407 /* If the blockcipher is big-endian, then the key is simply in the first
408 * table element, in the right format. If the blockcipher is little-endian
409 * then it's in element 24, and the bytes need swapping.
410 */
411
412 if (!(p->f&GCMF_SWAP)) for (i = 0; i < p->n; i++) k[i] = ktab[i];
413 else for (i = 0; i < p->n; i++) k[i] = ENDSWAP32(ktab[24*p->n + i]);
414}
415
416#if CPUFAM_X86 || CPUFAM_AMD64
417static void pclmul_recover_k(const gcm_params *p,
418 uint32 *k, const uint32 *ktab)
419{
420 unsigned n = p->n;
421 unsigned nz;
422 const uint32 *t;
423
424 /* The representation is already independent of the blockcipher endianness.
425 * We need to compensate for padding, reorder the words, and multiply by t
426 * to compensate for the factor of t we divided out earlier.
427 */
428
429 if (n == 3) nz = 1; else nz = 0;
430 t = ktab + n + nz;
431 while (n--) *k++ = *--t;
432 mult(p, k - p->n, k - p->n);
433}
434#endif
435
436#if CPUFAM_ARMEL
437static void arm_crypto_recover_k(const gcm_params *p,
438 uint32 *k, const uint32 *ktab)
439{
440 unsigned n = p->n;
441 const uint32 *t;
442
443 /* The representation is already independent of the blockcipher endianness.
444 * We only need to reorder the words, and multiply by t to compensate for
445 * the factor of t we divided out earlier.
446 */
447
448 t = ktab;
449 while (n >= 2) { k[1] = t[0]; k[0] = t[1]; t += 2; k += 2; n -= 2; }
450 if (n) { k[0] = t[1]; k++; n--; }
451 mult(p, k - p->n, k - p->n);
452}
453#endif
454
455#if CPUFAM_ARM64
456static void arm64_pmull_recover_k(const gcm_params *p,
457 uint32 *k, const uint32 *ktab)
458{
459 unsigned n = p->n;
460 const uint32 *t;
461
462 /* The representation is already independent of the blockcipher endianness.
463 * We need to skip the duplicate pieces, and unscramble the bytes.
464 */
465
466 t = ktab;
467 while (n >= 2) {
468 k[0] = ENDSWAP32(rbit32(t[0]));
469 k[1] = ENDSWAP32(rbit32(t[1]));
470 t += 4; k += 2; n -= 2;
471 }
472 if (n) k[0] = ENDSWAP32(rbit32(t[0]));
473}
474#endif
475
476CPU_DISPATCH(static, EMPTY, void, recover_k,
477 (const gcm_params *p, uint32 *k, const uint32 *ktab),
478 (p, k, ktab),
479 pick_recover_k, simple_recover_k)
480
481static recover_k__functype *pick_recover_k(void)
482{
483#if CPUFAM_X86 || CPUFAM_AMD64
484 DISPATCH_PICK_COND(recover_k, pclmul_recover_k,
485 cpu_feature_p(CPUFEAT_X86_SSSE3) &&
486 cpu_feature_p(CPUFEAT_X86_PCLMUL));
487#endif
488#if CPUFAM_ARMEL
489 DISPATCH_PICK_COND(recover_k, arm_crypto_recover_k,
490 cpu_feature_p(CPUFEAT_ARM_PMULL));
491#endif
492#if CPUFAM_ARM64
493 DISPATCH_PICK_COND(recover_k, arm64_pmull_recover_k,
494 cpu_feature_p(CPUFEAT_ARM_PMULL));
495#endif
496 DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k);
497}
498
499/* --- @gcm_mulk_N{b,l}@ --- *
500 *
501 * Arguments: @uint32 *a@ = accumulator to multiply
502 * @const uint32 *ktab@ = table constructed by @gcm_mktable@
503 *
504 * Returns: ---
505 *
506 * Use: Multiply @a@ by @k@ (implicitly represented in @ktab@),
507 * updating @a@ in-place. There are separate functions for each
508 * supported block size and endianness because this is the
509 * function whose performance actually matters.
510 */
511
512#if CPUFAM_X86 || CPUFAM_AMD64
513# define DECL_MULK_X86ISH(var) extern gcm_mulk_##var##__functype \
514 gcm_mulk_##var##_x86ish_pclmul_avx, \
515 gcm_mulk_##var##_x86ish_pclmul;
516# define PICK_MULK_X86ISH(var) do { \
517 DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul_avx, \
518 cpu_feature_p(CPUFEAT_X86_AVX) && \
519 cpu_feature_p(CPUFEAT_X86_PCLMUL) && \
520 cpu_feature_p(CPUFEAT_X86_SSSE3)); \
521 DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul, \
522 cpu_feature_p(CPUFEAT_X86_PCLMUL) && \
523 cpu_feature_p(CPUFEAT_X86_SSSE3)); \
524} while (0)
525#else
526# define DECL_MULK_X86ISH(var)
527# define PICK_MULK_X86ISH(var) do ; while (0)
528#endif
529
530#if CPUFAM_ARMEL
531# define DECL_MULK_ARM(var) \
532 extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm_crypto;
533# define PICK_MULK_ARM(var) do { \
534 DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm_crypto, \
535 cpu_feature_p(CPUFEAT_ARM_PMULL)); \
536} while (0)
537#else
538# define DECL_MULK_ARM(var)
539# define PICK_MULK_ARM(var) do ; while (0)
540#endif
541
542#if CPUFAM_ARM64
543# define DECL_MULK_ARM64(var) \
544 extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm64_pmull;
545# define PICK_MULK_ARM64(var) do { \
546 DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm64_pmull, \
547 cpu_feature_p(CPUFEAT_ARM_PMULL)); \
548} while (0)
549#else
550# define DECL_MULK_ARM64(var)
551# define PICK_MULK_ARM64(var) do ; while (0)
552#endif
553
554#define DEF_MULK(nbits) \
555 \
556CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##b, \
557 (uint32 *a, const uint32 *ktab), (a, ktab), \
558 pick_mulk_##nbits##b, simple_mulk_##nbits) \
559CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##l, \
560 (uint32 *a, const uint32 *ktab), (a, ktab), \
561 pick_mulk_##nbits##l, simple_mulk_##nbits) \
562 \
563static void simple_mulk_##nbits(uint32 *a, const uint32 *ktab) \
564{ \
565 uint32 m, t; \
566 uint32 z[nbits/32]; \
567 unsigned i, j, k; \
568 \
569 for (i = 0; i < nbits/32; i++) z[i] = 0; \
570 \
571 for (i = 0; i < nbits/32; i++) { \
572 t = a[i]; \
573 for (j = 0; j < 32; j++) { \
574 m = -((t >> 31)&1u); \
575 for (k = 0; k < nbits/32; k++) z[k] ^= *ktab++&m; \
576 t <<= 1; \
577 } \
578 } \
579 \
580 for (i = 0; i < nbits/32; i++) a[i] = z[i]; \
581} \
582 \
583DECL_MULK_X86ISH(nbits##b) \
584DECL_MULK_ARM(nbits##b) \
585DECL_MULK_ARM64(nbits##b) \
586static gcm_mulk_##nbits##b##__functype *pick_mulk_##nbits##b(void) \
587{ \
588 PICK_MULK_X86ISH(nbits##b); \
589 PICK_MULK_ARM(nbits##b); \
590 PICK_MULK_ARM64(nbits##b); \
591 DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits); \
592} \
593 \
594DECL_MULK_X86ISH(nbits##l) \
595DECL_MULK_ARM(nbits##l) \
596DECL_MULK_ARM64(nbits##l) \
597static gcm_mulk_##nbits##l##__functype *pick_mulk_##nbits##l(void) \
598{ \
599 PICK_MULK_X86ISH(nbits##l); \
600 PICK_MULK_ARM(nbits##l); \
601 PICK_MULK_ARM64(nbits##l); \
602 DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits); \
603}
604
605GCM_WIDTHS(DEF_MULK)
606
607#define GCM_MULK_CASE(nbits) \
608 case nbits/32: \
609 if (_f&GCMF_SWAP) gcm_mulk_##nbits##l(_a, _ktab); \
610 else gcm_mulk_##nbits##b(_a, _ktab); \
611 break;
612#define MULK(n, f, a, ktab) do { \
613 uint32 *_a = (a); const uint32 *_ktab = (ktab); \
614 unsigned _f = (f); \
615 switch (n) { \
616 GCM_WIDTHS(GCM_MULK_CASE) \
617 default: abort(); \
618 } \
619} while (0)
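
/* An illustrative consistency sketch, not used by the library (the function
 * name is invented for the example): for a big-endian parameter set,
 * multiplying an accumulator by %$k$% through a table built by
 * @gcm_mktable@ should agree with the slow @mul@ function above.
 */

static int mulk_matches_mul_p(const gcm_params *p,
			      const uint32 *a, const uint32 *k)
{
  uint32 ktab[32*GCM_NMAX*GCM_NMAX], u[GCM_NMAX], v[GCM_NMAX];
  unsigned i;

  gcm_mktable(p, ktab, k);
  for (i = 0; i < p->n; i++) u[i] = a[i];
  MULK(p->n, p->f, u, ktab);		/* table-driven: now %$u = a k$% */
  mul(p, v, a, k);			/* slow reference multiplication */
  for (i = 0; i < p->n; i++) if (u[i] != v[i]) return (0);
  return (1);
}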
620
621/*----- Other utilities ---------------------------------------------------*/
622
623/* --- @putlen@ --- *
624 *
625 * Arguments: @octet *p@ = pointer to output buffer
626 * @unsigned w@ = size of output buffer
627 * @unsigned blksz@ = block size (assumed fairly small)
628 * @unsigned long nblocks@ = number of blocks
629 * @unsigned nbytes@ = tail size in bytes (assumed small)
630 *
631 * Returns: ---
632 *
633 * Use: Store the overall length in %$\emph{bits}$% (i.e.,
634 * @3*(nblocks*blksz + nbytes)@ in big-endian form in the
635 * buffer @p@.
636 */
637
638static void putlen(octet *p, unsigned w, unsigned blksz,
639 unsigned long nblocks, unsigned nbytes)
640{
641 unsigned long nblo = nblocks&((1ul << (ULONG_BITS/2)) - 1),
642 nbhi = nblocks >> ULONG_BITS/2;
643 unsigned long nlo = nblo*blksz + nbytes, nhi = nbhi*blksz;
644
645 /* This is fiddly. Split @nblocks@, which is the big number, into high and
646 * low halves, multiply those separately by @blksz@, propagate carries, and
647 * then multiply by eight.
648 */
649 nhi += nlo >> ULONG_BITS/2;
650 nlo &= (1ul << (ULONG_BITS/2)) - 1;
651 nlo <<= 3;
652
653 /* Now write out the size, feeding bits in from @nhi@ as necessary. */
654 p += w;
655 while (w--) {
656 *--p = U8(nlo);
657 nlo = (nlo >> 8) | ((nhi&0xff) << (ULONG_BITS/2 - 5));
658 nhi >>= 8;
659 }
660}
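
/* A worked example of the encoding above: with @blksz = 16@, @nblocks = 5@
 * and @nbytes = 3@, the total is 5*16 + 3 = 83 bytes, i.e., 664 = 0x298
 * bits, so with @w = 8@ the buffer ends up holding the bytes
 * 00 00 00 00 00 00 02 98.
 */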
661
662/* --- @mix@ --- *
663 *
664 * Arguments: @const gcm_params *p@ = pointer to the parameters
665 * @uint32 *a@ = GHASH accumulator
666 * @const octet *q@ = pointer to an input block
667 * @const uint32 *ktab@ = multiplication table, built by
668 * @gcm_mktable@
669 *
670 * Returns: ---
671 *
672 * Use: Fold the block @q@ into the GHASH accumulator. The
673 * calculation is %$a' = k (a + q)$%.
674 */
675
676static void mix(const gcm_params *p, uint32 *a,
677 const octet *q, const uint32 *ktab)
678{
679 unsigned i;
680
681 if (p->f&GCMF_SWAP)
682 for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_L(q); q += 4; }
683 else
684 for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_B(q); q += 4; }
685 MULK(p->n, p->f, a, ktab);
686}
687
688/* --- @gcm_ghashdone@ --- *
689 *
690 * Arguments: @const gcm_params *p@ = pointer to the parameters
691 * @uint32 *a@ = GHASH accumulator
692 * @const uint32 *ktab@ = multiplication table, built by
693 * @gcm_mktable@
694 * @unsigned long xblocks, yblocks@ = number of whole blocks in
695 * the two inputs
696 * @unsigned xbytes, ybytes@ = number of trailing bytes in the
697 * two inputs
698 *
699 * Returns: ---
700 *
701 * Use: Finishes a GHASH operation by appending the appropriately
702 * encoded lengths of the two constituent messages.
703 */
704
705void gcm_ghashdone(const gcm_params *p, uint32 *a, const uint32 *ktab,
706 unsigned long xblocks, unsigned xbytes,
707 unsigned long yblocks, unsigned ybytes)
708{
709 octet b[4*GCM_NMAX];
710 unsigned w = p->n < 3 ? 4*p->n : 2*p->n;
711
712 /* Construct the encoded lengths. Note that smaller-block versions of GCM
713 * encode the lengths in separate blocks. GCM is only officially defined
714 * for 64- and 128-bit blocks; I've placed the cutoff somewhat arbitrarily
715 * at 96 bits.
716 */
717 putlen(b, w, 4*p->n, xblocks, xbytes);
718 putlen(b + w, w, 4*p->n, yblocks, ybytes);
719
720 /* Feed the lengths into the accumulator. */
721 mix(p, a, b, ktab);
722 if (p->n < 3) mix(p, a, b + w, ktab);
723}
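
/* A usage sketch, not part of the library (the function name and arguments
 * are invented for the example): GHASH a header @h@ of @hblocks@ whole
 * blocks and a ciphertext @c@ of @cblocks@ whole blocks, assuming for
 * simplicity that neither has a partial trailing block.
 */

static void example_ghash(const gcm_params *p, uint32 *a, const uint32 *ktab,
			  const octet *h, unsigned long hblocks,
			  const octet *c, unsigned long cblocks)
{
  unsigned i;
  unsigned long j;

  for (i = 0; i < p->n; i++) a[i] = 0;
  for (j = 0; j < hblocks; j++) { mix(p, a, h, ktab); h += 4*p->n; }
  for (j = 0; j < cblocks; j++) { mix(p, a, c, ktab); c += 4*p->n; }
  gcm_ghashdone(p, a, ktab, hblocks, 0, cblocks, 0);
}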
724
725/* --- @gcm_concat@ --- *
726 *
727 * Arguments: @const gcm_params *p@ = pointer to the parameters
728 * @uint32 *z@ = GHASH accumulator for suffix, updated
729 * @const uint32 *x@ = GHASH accumulator for prefix
730 * @const uint32 *ktab@ = multiplication table, built by
731 * @gcm_mktable@
732 * @unsigned long n@ = length of suffix in whole blocks
733 *
734 * Returns: ---
735 *
736 * Use: On entry, @x@ and @z@ are the results of hashing two strings
737 * %$a$% and %$b$%, each a whole number of blocks long; in
738 * particular, %$b$% is @n@ blocks long. On exit, @z@ is
739 * updated to be the hash of %$a \cat b$%.
740 */
741
742void gcm_concat(const gcm_params *p, uint32 *z, const uint32 *x,
743 const uint32 *ktab, unsigned long n)
744{
745 uint32 t[GCM_NMAX], u[GCM_NMAX];
746 unsigned i, j;
747
748 if (!n) {
749 /* If @n@ is zero, then there's not much to do. The mathematics
750 * (explained below) still works, but the code takes a shortcut which
751 * doesn't handle this case: so set %$z' = z + x k^n = z + x$%.
752 */
753
754 for (j = 0; j < p->n; j++) z[j] ^= x[j];
755 } else {
756 /* We have %$x = a_0 t^m + \cdots + a_{m-2} t^2 + a_{m-1} t$% and
757 * %$z = b_0 t^n + \cdots + b_{n-2} t^2 + b_{n-1} t$%. What we'd like is
758 * the hash of %$a \cat b$%, which is %$z + x k^n$%.
759 *
760 * The first job, then, is to calculate %$k^n$%, and for this we use a
761 * simple left-to-right square-and-multiply algorithm. There's no need
762 * to keep %$n$% secret here.
763 */
764
765 /* Start by retrieving %$k$% from the table, and convert it to big-endian
766 * form.
767 */
768 recover_k(p, u, ktab);
769
770 /* Now calculate %$k^n$%. */
771 i = ULONG_BITS;
772#define BIT (1ul << (ULONG_BITS - 1))
773 while (!(n&BIT)) { n <<= 1; i--; }
774 n <<= 1; i--; for (j = 0; j < p->n; j++) t[j] = u[j];
775 while (i--) { mul(p, t, t, t); if (n&BIT) mul(p, t, t, u); n <<= 1; }
776#undef BIT
777
778 /* Next, calculate %$x k^n$%. If we're using a little-endian convention
779 * then we must convert %$x$%; otherwise we can just use it in place.
780 */
781 if (!(p->f&GCMF_SWAP))
782 mul(p, t, t, x);
783 else {
784 for (j = 0; j < p->n; j++) u[j] = ENDSWAP32(x[j]);
785 mul(p, t, t, u);
786 }
787
788 /* Finally, add %$x k^n$% onto %$z$%, converting back to little-endian if
789 * necessary.
790 */
791 if (!(p->f&GCMF_SWAP)) for (j = 0; j < p->n; j++) z[j] ^= t[j];
792 else for (j = 0; j < p->n; j++) z[j] ^= ENDSWAP32(t[j]);
793 }
794}
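
/* An illustrative consistency sketch, not used by the library (the function
 * name is invented for the example): the result of @gcm_concat@ should
 * match hashing the combined string directly.  Here @x@ is the hash of the
 * prefix, @z@ is the hash of the suffix alone, and @b@ points to the
 * suffix's @n@ whole blocks.
 */

static int concat_matches_direct_p(const gcm_params *p, const uint32 *x,
				   const uint32 *z, const octet *b,
				   unsigned long n, const uint32 *ktab)
{
  uint32 u[GCM_NMAX], v[GCM_NMAX];
  unsigned i;
  unsigned long j;

  for (i = 0; i < p->n; i++) { u[i] = x[i]; v[i] = z[i]; }
  for (j = 0; j < n; j++) { mix(p, u, b, ktab); b += 4*p->n; }
  gcm_concat(p, v, x, ktab, n);
  for (i = 0; i < p->n; i++) if (u[i] != v[i]) return (0);
  return (1);
}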
795
796/*----- Test rig ----------------------------------------------------------*/
797
798#ifdef TEST_RIG
799
800#include <mLib/macros.h>
801#include <mLib/quis.h>
802#include <mLib/testrig.h>
803
804#ifdef ENABLE_ASM_DEBUG
805# include "regdump.h"
806#endif
807
808static void report_failure(const char *test, unsigned nbits,
809 const char *ref, dstr v[], dstr *d)
810{
811 printf("test %s failed (nbits = %u)", test, nbits);
812 printf("\n\tx = "); type_hex.dump(&v[0], stdout);
813 printf("\n\ty = "); type_hex.dump(&v[1], stdout);
814 printf("\n\tz = "); type_hex.dump(&v[2], stdout);
815 printf("\n\t%s' = ", ref); type_hex.dump(d, stdout);
816 putchar('\n');
817}
818
819static void mulk(unsigned nbits, unsigned f, uint32 *x, const uint32 *ktab)
820 { MULK(nbits/32, f, x, ktab); }
821
822static int test_mul(uint32 poly, dstr v[])
823{
824 uint32 x[GCM_NMAX], y[GCM_NMAX], z[GCM_NMAX], ktab[32*GCM_NMAX*GCM_NMAX];
825 gcm_params p;
826 dstr d = DSTR_INIT;
827 unsigned i, nbits;
828 int ok = 1;
829 enum { I_x, I_y, I_z };
830
831 nbits = 8*v[0].len; p.f = 0; p.n = nbits/32; p.poly = poly;
832 dstr_ensure(&d, nbits/8); d.len = nbits/8;
833
834#define LOADXY(E) do { \
835 for (i = 0; i < nbits/32; i++) { \
836 x[i] = LOAD32_##E(v[I_x].buf + 4*i); \
837 y[i] = LOAD32_##E(v[I_y].buf + 4*i); \
838 } \
839} while (0)
840
841#define INITZ(x) do { \
842 for (i = 0; i < nbits/32; i++) z[i] = (x)[i]; \
843} while (0)
844
845#define CHECK(E, what, ref) do { \
846 for (i = 0; i < nbits/32; i++) STORE32_##E(d.buf + 4*i, z[i]); \
847 if (MEMCMP(d.buf, !=, v[I_##ref].buf, nbits/8)) \
848 { ok = 0; report_failure(what, nbits, #ref, v, &d); } \
849} while (0)
850
851#define TEST_PREP_1(E, x, y, what) do { \
852 gcm_mktable(&p, ktab, y); \
853 recover_k(&p, z, ktab); CHECK(B, "mktable/recover_k (" #y ")", y); \
854 INITZ(x); mulk(nbits, p.f, z, ktab); CHECK(E, what " (k = " #y ")", z); \
855} while (0)
856
857#define TEST_PREP(E, what) do { \
858 TEST_PREP_1(E, x, y, what); \
859 TEST_PREP_1(E, y, x, what); \
860} while (0)
861
862 /* First, test plain multiply. */
863 LOADXY(B); mul(&p, z, x, y); CHECK(B, "gcm_mul", z);
864
865 /* Next, test big-endian prepared key. */
866 LOADXY(B); TEST_PREP(B, "gcm_kmul_b");
867
868 /* Finally, test little-endian prepared key. */
869 p.f = GCMF_SWAP; LOADXY(L);
870 TEST_PREP(L, "gcm_kmul_l");
871
872#undef LOADXY
873#undef INITZ
874#undef CHECK
875#undef TEST_PREP_1
876#undef TEST_PREP
877
878 /* All done. */
879 return (ok);
880}
881
882#define TEST(nbits) \
883static int test_mul_##nbits(dstr v[]) \
884 { return (test_mul(GCM_POLY_##nbits, v)); }
885GCM_WIDTHS(TEST)
886#undef TEST
887
888static test_chunk defs[] = {
889#define TEST(nbits) \
890 { "gcm-mul" #nbits, test_mul_##nbits, \
891 { &type_hex, &type_hex, &type_hex, 0 } },
892GCM_WIDTHS(TEST)
893#undef TEST
894 { 0, 0, { 0 } }
895};
896
897int main(int argc, char *argv[])
898{
899 ego(argv[0]);
900#ifdef ENABLE_ASM_DEBUG
901 regdump_init();
902#endif
903 test_run(argc, argv, defs, SRCDIR"/t/gcm");
904 return (0);
905}
906
907#endif
908
909/*----- That's all, folks -------------------------------------------------*/