[catacomb] / symm / gcm.c

/* -*-c-*-
 *
 * The GCM authenticated encryption mode
 *
 * (c) 2017 Straylight/Edgeware
 */

/*----- Licensing notice --------------------------------------------------*
 *
 * This file is part of Catacomb.
 *
 * Catacomb is free software: you can redistribute it and/or modify it
 * under the terms of the GNU Library General Public License as published
 * by the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * Catacomb is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with Catacomb.  If not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
 * USA.
 */

/*----- Header files ------------------------------------------------------*/

#include "config.h"

#include <stdio.h>

#include <mLib/bits.h>

#include "gcm.h"
#include "gcm-def.h"

/*----- Overall strategy --------------------------------------------------*
 *
 * GCM is pretty awful to implement in software.  (This presentation is going
 * to be somewhat different to that in the specification, but I think it
 * makes more sense like this.)
 *
 * We're given a %$w$%-bit blockcipher %$E$% with a key %$K$%.
 *
 * The main part is arithmetic in the finite field %$k = \gf{2^w}$%, which we
 * represent as the quotient ring %$\gf{2}[t]/(p_w(t))$% for some irreducible
 * degree-%$w$% polynomial %$p_w(t)$%, whose precise value isn't very important
 * right now.  We choose a secret point %$x = E_K(0^w)$%.
 *
 * We choose a length size %$z$% as follows: if %$w < 96$% then %$z = w$%;
 * otherwise %$z = w/2$%.  Format a message pair as follows:
 *
 *	%$F(a, b) = P_w(a) \cat P_w(b) \cat [\ell(a)]_z \cat [\ell(b)]_z$%
 *
 * where %$P_w(x) = x \cat 0^n$%, with %$0 \le n < w$% chosen such that
 * %$\ell(x) + n \equiv 0 \pmod{w}$%.
 *
 * Hash a (block-aligned) message %$u$% as follows.  First, split %$u$% into
 * %$w$%-bit blocks %$u_0$%, %$u_1$%, %%\ldots%%, %$u_{n-1}$%.  Interpret
 * these as elements of %$k$%.  Then
 *
 *	%$G_x(u) = u_0 x^n + u_1 x^{n-1} + \cdots + u_{n-1} x$%
 *
 * converted back to a %$w$%-bit string.
 *
 * We're ready to go now.  Suppose we're to encrypt a message %$M$% with
 * header %$H$% and nonce %$N$%.  If %$\ell(N) + 32 = w$% then let
 * %$N' = N$% and let %$i_0 = 1$%; otherwise, let %$U = G_x(F(\epsilon, N))$%
 * and split this into %$N' = U[0 \bitsto w - 32]$% and
 * %$[i_0]_{32} = U[w - 32 \bitsto w]$%.
 *
 * Let %$n = \lceil \ell(M)/w \rceil$%.  Compute
 *
 *	%$y_j = E_K(N' \cat [i_0 + j]_{32})$%
 *
 * for %$0 \le j \le n$%.  Let
 *
 *	%$s = (y_1 \cat y_2 \cat \cdots \cat y_n)[0 \bitsto \ell(M)]$%
 *
 * Let %$C = M \xor s$% and let %$T = G_x(F(H, C)) \xor y_0$%.  These are the
 * ciphertext and tag respectively.
 *
 * So why is this awful?
 *
 * For one thing, the bits are in a completely terrible order.  The bytes are
 * arranged in little-endian order, so the unit coefficient is in the first
 * byte, and the degree-127 coefficient is in the last byte.  But within each
 * byte, the lowest-degree coefficient is in the most significant bit.  It's
 * therefore better to think of GCM as using a big-endian byte-ordering
 * convention, but with the bits backwards.
 *
 * But messing about with byte ordering is expensive, so let's not do that in
 * the inner loop.  But multiplication in %$k$% is not easy either.  Some
 * kind of precomputed table would be nice, but that will leak secrets
 * through the cache.
 *
 * I choose a particularly simple table: given %$x$%, let %$X[i'] = x t^i$%.
 * Then %$x y = \sum_{0\le i<w} y_i X[i']$% which is just a bunch of
 * bitmasking.  But the natural order for examining bits of %$y$% is not
 * necessarily the obvious one.  We'll have already loaded %$y$% into
 * internal form, as 32-bit words.  The good order to process these is left
 * to right, from high to low bits.  But now the order of degrees depends on
 * the endianness of our conversion of bytes to words.  Oh, well.
 *
 * If we've adopted a big-endian convention, then we'll see the degrees in
 * order, 0, 1, ..., all the way up to %$w - 1$% and everything is fine.  If
 * we've adopted a little-endian convention, though, we'll see an ordering
 * like this:
 *
 *	24, 25, ..., 31, 16, 17, ..., 23,  8,  9, ..., 15,  0,  1, ..., 7,
 *	56, 57, ..., 63, 48, 49, ..., 55, 40, 41, ..., 47, 32, 33, ..., 39,
 *	etc.
 *
 * which is the ordinary order with 0x18 = 24 XORed into the index.  That is,
 * %$i' = i$% if we've adopted a big-endian convention, and
 * %$i' = i \xor 24$% if we've adopted a little-endian convention.
 */

/*----- Low-level utilities -----------------------------------------------*/

/* --- @mult@ --- *
 *
 * Arguments:	@const gcm_params *p@ = pointer to the parameters
 *		@uint32 *z@ = where to write the result
 *		@const uint32 *x@ = input field element
 *
 * Returns:	---
 *
 * Use:		Multiply the input field element by %$t$%, and write the
 *		product to @z@.  It's safe for @x@ and @z@ to be equal, but
 *		they should not otherwise overlap.  Both input and output are
 *		in big-endian form, i.e., with the lowest-degree coefficients
 *		in the most significant bits.
 */

static void mult(const gcm_params *p, uint32 *z, const uint32 *x)
{
  const unsigned nw = p->n;
  uint32 mask, carry, w;
  unsigned i;

  /* Multiplying by %$t$% moves every coefficient one place towards the
   * less significant end of its word.  The coefficient which would fall off
   * the far end is the low bit of the last word: stretch it into an
   * all-ones or all-zeroes mask and use that to select @p->poly@ as the
   * carry into the first word, so that no branch depends on the data.
   */
  w = x[nw - 1];
  mask = -(w&1u);
  carry = mask&p->poly;

  /* Shift the words along, passing each word's low bit into the top of the
   * next one.  Every input word is read before the matching output word is
   * written, which is what makes @z == x@ safe.
   */
  for (i = 0; i < nw; i++) {
    w = x[i];
    z[i] = (w >> 1) ^ carry;
    carry = w << 31;
  }
}

/* --- @mul@ --- *
 *
 * Arguments:	@const gcm_params *p@ = pointer to the parameters
 *		@uint32 *z@ = where to write the result
 *		@const uint32 *x, *y@ = input field elements
 *
 * Returns:	---
 *
 * Use:		Multiply the input field elements together, and write the
 *		product to @z@.  It's safe for the operands to overlap.  Both
 *		inputs and the output are in big-endian form, i.e., with the
 *		lowest-degree coefficients in the most significant bits.
 */

static void mul(const gcm_params *p, uint32 *z,
		const uint32 *x, const uint32 *y)
{
  const unsigned nw = p->n;
  uint32 xt[GCM_NMAX], acc[GCM_NMAX];
  uint32 bits, sel;
  unsigned w, b, q;

  /* Work entirely in scratch space so that the operands may overlap the
   * output: @xt@ holds the running multiple %$x t^i$% and @acc@ collects
   * the product.
   */
  for (q = 0; q < nw; q++) { xt[q] = x[q]; acc[q] = 0; }

  /* Walk the bits of @y@ a word at a time, from the most significant bit of
   * each word downwards.  For each bit, fold the current multiple of @x@
   * into the accumulator under an all-ones/all-zeroes mask, and then
   * advance the multiple by one power of %$t$%.  This is @gcm_mktable@ and
   * @gcm_mulk_...@ rolled together, without the stored table.
   */
  for (w = 0; w < nw; w++) {
    bits = y[w];
    for (b = 32; b; b--) {
      sel = -((bits >> 31)&1u);
      for (q = 0; q < nw; q++) acc[q] ^= xt[q]&sel;
      mult(p, xt, xt);
      bits <<= 1;
    }
  }

  /* Only now is it safe to overwrite the destination. */
  for (q = 0; q < nw; q++) z[q] = acc[q];
}

/*----- Table-based multiplication ----------------------------------------*/

/* --- @gcm_mktable@ --- *
 *
 * Arguments:	@const gcm_params *p@ = pointer to the parameters
 *		@uint32 *ktab@ = where to write the table; there must be
 *			space for %$32 n$% %$n$%-word entries, i.e.,
 *			%$32 n^2$% 32-bit words in total, where %$n$% is
 *			@p->n@, the block size in words
 *		@const uint32 *k@ = input field element
 *
 * Returns:	---
 *
 * Use:		Construct a table for use by @gcm_mulk_...@ below, to
 *		multiply (vaguely) efficiently by @k@.
 */

void gcm_mktable(const gcm_params *p, uint32 *ktab, const uint32 *k)
{
  const unsigned nw = p->n, nent = 32*nw;
  const int swap = (p->f&GCMF_SWAP) != 0;
  const unsigned flip = swap ? 0x18 : 0;
  uint32 *prev, *cur;
  unsigned i;

  /* Entry %$i \xor m$% of the table holds %$k t^i$%, where %$m$% (here
   * @flip@) is zero for a big-endian cipher and 24 for a little-endian one.
   * Seed the table with %$k$% itself at entry %$m$%.
   *
   * All of the arithmetic is done on big-endian words, so a little-endian
   * input is byte-swapped on the way in, and the whole table is swapped
   * back at the end.
   */
  prev = ktab + flip*nw;
  for (i = 0; i < nw; i++) prev[i] = swap ? ENDSWAP32(k[i]) : k[i];

  /* Each remaining entry is the previous one multiplied by %$t$%; only the
   * slot it lands in is permuted.
   */
  for (i = 1; i < nent; i++) {
    cur = ktab + (i ^ flip)*nw;
    mult(p, cur, prev);
    prev = cur;
  }

  /* Put a little-endian table back into its native word order. */
  if (swap)
    for (i = 0; i < nent*nw; i++) ktab[i] = ENDSWAP32(ktab[i]);
}

/* --- @gcm_mulk_N@ --- *
 *
 * Arguments:	@uint32 *a@ = accumulator to multiply
 *		@const uint32 *ktab@ = table constructed by @gcm_mktable@
 *
 * Returns:	---
 *
 * Use:		Multiply @a@ by @k@ (implicitly represented in @ktab@),
 *		updating @a@ in-place.  There are separate functions for each
 *		supported block size because this is the function whose
 *		performance actually matters.
 */

#define DEF_MULK(nbits)							\
void gcm_mulk_##nbits(uint32 *a, const uint32 *ktab)			\
{									\
  const uint32 *ent = ktab;						\
  uint32 acc[nbits/32];							\
  uint32 bits, sel;							\
  unsigned w, b, q;							\
									\
  /* Start with an empty sum. */					\
  for (q = 0; q < nbits/32; q++) acc[q] = 0;				\
									\
  /* Take the bits of @a@ from the top of each word downwards; the	\
   * table entries are consumed strictly in order, one per bit, and	\
   * added in under an all-ones/all-zeroes mask.			\
   */									\
  for (w = 0; w < nbits/32; w++) {					\
    bits = a[w];							\
    for (b = 0; b < 32; b++, bits <<= 1) {				\
      sel = -((bits >> 31)&1u);						\
      for (q = 0; q < nbits/32; q++) acc[q] ^= ent[q]&sel;		\
      ent += nbits/32;							\
    }									\
  }									\
									\
  /* Replace the accumulator with the product. */			\
  for (q = 0; q < nbits/32; q++) a[q] = acc[q];				\
}
GCM_WIDTHS(DEF_MULK)

/*----- Other utilities ---------------------------------------------------*/

/* --- @putlen@ --- *
 *
 * Arguments:	@octet *p@ = pointer to output buffer
 *		@unsigned w@ = size of output buffer
 *		@unsigned blksz@ = block size (assumed fairly small)
 *		@unsigned long nblocks@ = number of blocks
 *		@unsigned nbytes@ = tail size in bytes (assumed small)
 *
 * Returns:	---
 *
 * Use:		Store the overall length in %$\emph{bits}$% (i.e.,
 *		@8*(nblocks*blksz + nbytes)@) in big-endian form in the
 *		buffer @p@.
 */

static void putlen(octet *p, unsigned w, unsigned blksz,
		   unsigned long nblocks, unsigned nbytes)
{
  const unsigned half = ULONG_BITS/2;
  const unsigned long lomask = (1ul << half) - 1;
  unsigned long lo, hi;
  unsigned i;

  /* The byte count might not fit in a single @unsigned long@ once it's
   * been turned into a bit count, so do the multiplication in two halves:
   * split @nblocks@ down the middle and scale each half by @blksz@
   * separately, adding the odd trailing bytes into the low half.
   */
  lo = (nblocks&lomask)*blksz + nbytes;
  hi = (nblocks >> half)*blksz;

  /* Push whatever overflowed the low half up into the high half, and then
   * convert the low half from bytes to bits.  It now occupies up to three
   * bits more than half a word.
   */
  hi += lo >> half;
  lo = (lo&lomask) << 3;

  /* Emit bytes from the least significant end, filling the buffer
   * backwards.  Each time eight bits leave the bottom of @lo@, top it up
   * with the next eight bits from @hi@.
   */
  i = w;
  while (i > 0) {
    p[--i] = U8(lo);
    lo = (lo >> 8) | ((hi&0xff) << (half - 5));
    hi >>= 8;
  }
}

/* --- @mix@ --- *
 *
 * Arguments:	@const gcm_params *p@ = pointer to the parameters
 *		@uint32 *a@ = GHASH accumulator
 *		@const octet *q@ = pointer to an input block
 *		@const uint32 *ktab@ = multiplication table, built by
 *			@gcm_mktable@
 *
 * Returns:	---
 *
 * Use:		Fold the block @q@ into the GHASH accumulator.  The
 *		calculation is %$a' = k (a + q)$%.
 */

static void mix(const gcm_params *p, uint32 *a,
		const octet *q, const uint32 *ktab)
{
  const octet *end = q + 4*p->n;
  uint32 *ap = a;

  /* Add the block into the accumulator a word at a time, reading the bytes
   * in whichever order the cipher's convention calls for.
   */
  if (!(p->f&GCMF_SWAP))
    for (; q < end; q += 4) *ap++ ^= LOAD32_B(q);
  else
    for (; q < end; q += 4) *ap++ ^= LOAD32_L(q);

  /* Multiply by %$k$% using the function specialized for this block size;
   * a size with no such function is a fatal error.
   */
  switch (p->n) {
#define CASE(nbits) case nbits/32: gcm_mulk_##nbits(a, ktab); break;
    GCM_WIDTHS(CASE)
#undef CASE
    default:
      abort();
  }
}

/* --- @gcm_ghashdone@ --- *
 *
 * Arguments:	@const gcm_params *p@ = pointer to the parameters
 *		@uint32 *a@ = GHASH accumulator
 *		@const uint32 *ktab@ = multiplication table, built by
 *			@gcm_mktable@
 *		@unsigned long xblocks, yblocks@ = number of whole blocks in
 *			the two inputs
 *		@unsigned xbytes, ybytes@ = number of trailing bytes in the
 *			two inputs
 *
 * Returns:	---
 *
 * Use:		Finishes a GHASH operation by appending the appropriately
 *		encoded lengths of the two constituent messages.
 */

void gcm_ghashdone(const gcm_params *p, uint32 *a, const uint32 *ktab,
		   unsigned long xblocks, unsigned xbytes,
		   unsigned long yblocks, unsigned ybytes)
{
  const unsigned blksz = 4*p->n;
  const int split = p->n < 3;
  const unsigned lensz = split ? blksz : blksz/2;
  octet buf[4*GCM_NMAX];

  /* Encode the two lengths back to back.  With blocks narrower than 96
   * bits (fewer than three words), each length gets a whole block to
   * itself; otherwise the two lengths share one block, half each.  (GCM is
   * only officially defined for 64- and 128-bit blocks, so the position of
   * the cutoff is somewhat arbitrary.)
   */
  putlen(buf, lensz, blksz, xblocks, xbytes);
  putlen(buf + lensz, lensz, blksz, yblocks, ybytes);

  /* Hash the length block, or both of them if they were split. */
  mix(p, a, buf, ktab);
  if (split) mix(p, a, buf + lensz, ktab);
}

/* --- @gcm_concat@ --- *
 *
 * Arguments:	@const gcm_params *p@ = pointer to the parameters
 *		@uint32 *z@ = GHASH accumulator for suffix, updated
 *		@const uint32 *x@ = GHASH accumulator for prefix
 *		@const uint32 *ktab@ = multiplication table, built by
 *			@gcm_mktable@
 *		@unsigned long n@ = length of suffix in whole blocks
 *
 * Returns:	---
 *
 * Use:		On entry, @x@ and @z@ are the results of hashing two strings
 *		%$a$% and %$b$%, each a whole number of blocks long; in
 *		particular, %$b$% is @n@ blocks long.  On exit, @z@ is
 *		updated to be the hash of %$a \cat b$%.
 */

void gcm_concat(const gcm_params *p, uint32 *z, const uint32 *x,
		const uint32 *ktab, unsigned long n)
{
  const unsigned nw = p->n;
  const int swap = (p->f&GCMF_SWAP) != 0;
  const unsigned long top = 1ul << (ULONG_BITS - 1);
  uint32 kn[GCM_NMAX], tmp[GCM_NMAX];
  unsigned nbits, j;

  /* The general rule is %$z' = z + x k^n$%.  The exponentiation below
   * assumes that @n@ has a leading one bit to find, so deal with the empty
   * suffix separately: then %$k^0 = 1$% and %$z' = z + x$%.
   */
  if (!n) {
    for (j = 0; j < nw; j++) z[j] ^= x[j];
    return;
  }

  /* Writing the hashes out in full, %$x = a_0 k^m + \cdots + a_{m-1} k$%
   * and %$z = b_0 k^n + \cdots + b_{n-1} k$%, so the hash of %$a \cat b$%
   * is %$z + x k^n$%.
   *
   * Recover %$k$% itself from the table, in big-endian form: it's entry 0
   * of a big-endian table, or entry 24 (with its words swapped) of a
   * little-endian one.
   */
  if (swap)
    for (j = 0; j < nw; j++) tmp[j] = ENDSWAP32(ktab[24*nw + j]);
  else
    for (j = 0; j < nw; j++) tmp[j] = ktab[j];

  /* Compute %$k^n$% by left-to-right square-and-multiply.  There's no need
   * to keep @n@ secret, so branching on its bits is fine.  First slide @n@
   * up until its leading one bit is at the top, leaving in @nbits@ the
   * number of significant bits.
   */
  for (nbits = ULONG_BITS; !(n&top); nbits--) n <<= 1;

  /* The leading bit accounts for the initial value %$k$%. */
  n <<= 1; nbits--;
  for (j = 0; j < nw; j++) kn[j] = tmp[j];

  /* Each remaining bit costs a squaring, and a further multiplication by
   * %$k$% if the bit is set.
   */
  for (; nbits; nbits--, n <<= 1) {
    mul(p, kn, kn, kn);
    if (n&top) mul(p, kn, kn, tmp);
  }

  /* Form %$x k^n$% and add it into @z@.  The multiplication works on
   * big-endian words, so under a little-endian convention @x@ is swapped
   * on the way in and the product swapped on the way out.
   */
  if (swap) {
    for (j = 0; j < nw; j++) tmp[j] = ENDSWAP32(x[j]);
    mul(p, kn, kn, tmp);
    for (j = 0; j < nw; j++) z[j] ^= ENDSWAP32(kn[j]);
  } else {
    mul(p, kn, kn, x);
    for (j = 0; j < nw; j++) z[j] ^= kn[j];
  }
}

/*----- That's all, folks -------------------------------------------------*/
Commit	Line	Data
50df5733 MW	1	/* --c--
	2	*
	3	* The GCM authenticated encryption mode
	4	*
	5	* (c) 2017 Straylight/Edgeware
	6	*/
	7
	8	/----- Licensing notice --------------------------------------------------
	9	*
	10	* This file is part of Catacomb.
	11	*
	12	* Catacomb is free software: you can redistribute it and/or modify it
	13	* under the terms of the GNU Library General Public License as published
	14	* by the Free Software Foundation; either version 2 of the License, or
	15	* (at your option) any later version.
	16	*
	17	* Catacomb is distributed in the hope that it will be useful, but
	18	* WITHOUT ANY WARRANTY; without even the implied warranty of
	19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	20	* Library General Public License for more details.
	21	*
	22	* You should have received a copy of the GNU Library General Public
	23	* License along with Catacomb. If not, write to the Free Software
	24	* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
	25	* USA.
	26	*/
	27
	28	/----- Header files ------------------------------------------------------/
	29
	30	#include "config.h"
	31
	32	#include <stdio.h>
	33
	34	#include <mLib/bits.h>
	35
	36	#include "gcm.h"
	37	#include "gcm-def.h"
	38
	39	/----- Overall strategy --------------------------------------------------
	40	*
	41	* GCM is pretty awful to implement in software. (This presentation is going
	42	* to be somewhat different to that in the specification, but I think it
	43	* makes more sense like this.)
	44	*
	45	* We're given a %$w$%-bit blockcipher %$E$% with a key %$K$%.
	46	*
	47	* The main part is arithmetic in the finite field %$k = \gf{2^w}$%, which we
	48	* represent as the quotient ring %$\gf{2}[t]/(p_w(t))$% for some irreducible
	49	* degree-%$w$% polynomial %$p(t)$%, whose precise value isn't very important
	50	* right now. We choose a secret point %$x = E_K(0^w)$%.
	51	*
	52	* We choose a length size %$z$% as follows: if %$w < 96%$ then %$z = w$%;
	53	* otherwise %$z = w/2$%. Format a message pair as follows:
	54	*
	55	* %$F(a, b) = P_w(a) \cat P_w(b) \cat [\ell(a)]_z \cat [\ell(b)]_z$%
	56	*
	57	* where %$P_w(x) = x \cat 0^n$% where $%0 \le n < w$% such that
	58	* %$\ell(x) + n \equiv 0 \pmod{w}$%.
	59	*
	60	* Hash a (block-aligned) message %$u$% as follows. First, split %$u$% into
	61	* %$w$%-bit blocks %$u_0$%, %$u_1$%, %%\ldots%%, %$u_{n-1}$%. Interpret
	62	* these as elements of %$k$%. Then
	63	*
	64	* %$G_x(u) = u_0 t^n + u_1 t^{n-1} + \cdots + u_{n-1} t$%
65	*
66	* converted back to a %$w$%-bit string.
67	*
68	* We're ready to go now. Suppose we're to encrypt a message %$M$% with
69	* header %$H$% and nonce %$N$%. If %$\ell(N) + 32 = w$% then let
70	* %$N' = N$% and let %$i_0 = 1$%; otherwise, let %$U = G_t(F(\epsilon, N))$%
71	* and split this into %$N' = U[0 \bitsto w - 32]$% and
72	* %$[i_0]_{32} = U[w - 32 \bitsto w]$%.
73	*
74	* Let %$n = \lceil \ell(M)/w \rceil$%. Compute
75	*
76	* %$y_j = E_K(N' \cat [i_0 + j]_{32})$%
77	*
78	* for %$0 \le j \le n$%. Let
79	*
80	* %$s = (y_1 \cat y_2 \cat \cdots \cat y_n)[0 \bitsto \ell(M)$%
81	*
82	* Let %$C = M \xor s$% and let %$T = G_x(F(H, C)) \xor y_0$%. These are the
83	* ciphertext and tag respectively.
84	*
85	* So why is this awful?
86	*
87	* For one thing, the bits are in a completely terrible order. The bytes are
88	* arranged in little-endian order, so the unit coefficient is in the first
89	* byte, and the degree-127 coefficient is in the last byte. But within each
90	* byte, the lowest-degree coefficient is in the most significant bit. It's
91	* therefore better to think of GCM as using a big-endian byte-ordering
92	* convention, but with the bits backwards.
93	*
94	* But messing about with byte ordering is expensive, so let's not do that in
95	* the inner loop. But multiplication in %$k$% is not easy either. Some
96	* kind of precomputed table would be nice, but that will leak secrets
97	* through the cache.
98	*
99	* I choose a particularly simple table: given %$x$%, let %$X[i'] = x t^i$%.
100	* Then $%$x y = \sum_{0\le i<w} y_i X[i']$% which is just a bunch of
101	* bitmasking. But the natural order for examining bits of %$y$% is not
102	* necessarily the obvious one. We'll have already loaded %$y$% into
103	* internal form, as 32-bit words. The good order to process these is left
104	* to right, from high to low bits. But now the order of degrees depends on
105	* the endianness of our conversion of bytes to words. Oh, well.
106	*
107	* If we've adopted a big-endian convention, then we'll see the degrees in
108	* order, 0, 1, ..., all the way up to %$w - 1$% and everything is fine. If
109	* we've adopted a little-endian convention, though, we'll see an ordering
110	* like this:
111	*
112	* 24, 25, ..., 31, 16, 17, ..., 23, 8, 9, ..., 15, 0, 1, ..., 7,
113	* 56, 57, ..., 63, 48, 49, ..., 55, 40, 41, ..., 47, 32, 33, ..., 39,
114	* etc.
115	*
116	* which is the ordinary order with 0x18 = 24 XORed into the index. That is,
117	* %$i' = i$% if we've adopted a big-endian convention, and
118	* %$i' = i \xor 24$% if we've adopted a little-endian convention.
119	*/
120
121	/----- Low-level utilities -----------------------------------------------/
122
123	/* --- @mult@ --- *
124	*
125	* Arguments: @const gcm_params *p@ = pointer to the parameters
126	* @uint32 *z@ = where to write the result
127	* @const uint32 *x@ = input field element
128	*
129	* Returns: ---
130	*
131	* Use: Multiply the input field element by %$t$%, and write the
132	* product to @z@. It's safe for @x@ and @z@ to be equal, but
133	* they should not otherwise overlap. Both input and output are
134	* in big-endian form, i.e., with the lowest-degree coefficients
135	* in the most significant bits.
136	*/
137
138	static void mult(const gcm_params p, uint32 z, const uint32 *x)
139	{
140	uint32 m, c, t;
141	unsigned i;
142
143	t = x[p->n - 1]; m = -(t&1u); c = m&p->poly;
144	for (i = 0; i < p->n; i++) { t = x[i]; z[i] = (t >> 1) ^ c; c = t << 31; }
145	}
146
147	/* --- @mul@ --- *
148	*
149	* Arguments: @const gcm_params *p@ = pointer to the parameters
150	* @uint32 *z@ = where to write the result
151	* @const uint32 x, y@ = input field elements
152	*
153	* Returns: ---
154	*
155	* Use: Multiply the input field elements together, and write the
156	* product to @z@. It's safe for the operands to overlap. Both
157	* inputs and the output are in big-endian form, i.e., with the
158	* lowest-degree coefficients in the most significant bits.
159	*/
160
161	static void mul(const gcm_params p, uint32 z,
162	const uint32 x, const uint32 y)
163	{
164	uint32 m, t, u[GCM_NMAX], v[GCM_NMAX];
165	unsigned i, j, k;
166
167	/* We can't do this in-place at all, so use temporary space. Make a copy
168	* of @x@ in @u@, where we can clobber it, and build the product in @v@.
169	*/
170	for (i = 0; i < p->n; i++) { u[i] = x[i]; v[i] = 0; }
171
172	/* Repeatedly multiply @x@ (in @u@) by %$t$%, and add together those
173	* %$x t^i$% selected by the bits of @y@. This is basically what you get
174	* by streaming the result of @gcm_mktable@ into @gcm_mulk_...@.
175	*/
176	for (i = 0; i < p->n; i++) {
177	t = y[i];
178	for (j = 0; j < 32; j++) {
179	m = -((t >> 31)&1u);
180	for (k = 0; k < p->n; k++) v[k] ^= u[k]&m;
181	mult(p, u, u); t <<= 1;
182	}
183	}
184
185	/* Write out the result now that it's ready. */
186	for (i = 0; i < p->n; i++) z[i] = v[i];
187	}
188
189	/----- Table-based multiplication ----------------------------------------/
190
191	/* --- @gcm_mktable@ --- *
192	*
193	* Arguments: @const gcm_params *p@ = pointer to the parameters
194	* @uint32 *ktab@ = where to write the table; there must be
195	* space for %$32 n$% $%n$%-word entries, i.e.,
196	* %$32 n^2$% 32-bit words in total, where %$n$% is
197	* @p->n@, the block size in words
198	* @const uint32 *k@ = input field element
199	*
200	* Returns: ---
201	*
202	* Use: Construct a table for use by @gcm_mulk_...@ below, to
203	* multiply (vaguely) efficiently by @k@.
204	*/
205
206	void gcm_mktable(const gcm_params p, uint32 ktab, const uint32 *k)
207	{
208	unsigned m = (p->f&GCMF_SWAP ? 0x18 : 0);
209	unsigned i, j, o = m*p->n;
210
211	/* As described above, the table stores entries %$K[i \xor m] = k t^i$%,
212	* where %$m = 0$% (big-endian cipher) or %$m = 24$% (little-endian).
213	* The first job is to store %$K[m] = k$%.
214	*
215	* We initially build the table with the entries in big-endian order, and
216	* then swap them if necessary. This makes the arithmetic functions more
217	* amenable for use by @gcm_concat@ below.
218	*/
219	if (!(p->f&GCMF_SWAP)) for (i = 0; i < p->n; i++) ktab[o + i] = k[i];
220	else for (i = 0; i < p->n; i++) ktab[o + i] = ENDSWAP32(k[i]);
221
222	/* Fill in the rest of the table by repeatedly multiplying the previous
223	* entry by %$t$%.
224	*/
225	for (i = 1; i < 32*p->n; i++)
226	{ j = (i ^ m)*p->n; mult(p, ktab + j, ktab + o); o = j; }
227
228	/* Finally, if the cipher uses a little-endian convention, then swap all of
229	* the individual words.
230	*/
231	if (p->f&GCMF_SWAP)
232	for (i = 0; i < 32p->np->n; i++) ktab[i] = ENDSWAP32(ktab[i]);
233	}
234
235	/* --- @gcm_mulk_N@ --- *
236	*
237	* Arguments: @uint32 *a@ = accumulator to multiply
238	* @const uint32 *ktab@ = table constructed by @gcm_mktable@
239	*
240	* Returns: ---
241	*
242	* Use: Multiply @a@ by @k@ (implicitly represented in @ktab@),
243	* updating @a@ in-place. There are separate functions for each
244	* supported block size because this is the function whose
245	* performance actually matters.
246	*/
247
248	#define DEF_MULK(nbits) \
249	void gcm_mulk_##nbits(uint32 a, const uint32 ktab) \
250	{ \
251	uint32 m, t; \
252	uint32 z[nbits/32]; \
253	unsigned i, j, k; \
254	\
255	for (i = 0; i < nbits/32; i++) z[i] = 0; \
256	\
257	for (i = 0; i < nbits/32; i++) { \
258	t = a[i]; \
259	for (j = 0; j < 32; j++) { \
260	m = -((t >> 31)&1u); \
261	for (k = 0; k < nbits/32; k++) z[k] ^= *ktab++&m; \
262	t <<= 1; \
263	} \
264	} \
265	\
266	for (i = 0; i < nbits/32; i++) a[i] = z[i]; \
267	}
268	GCM_WIDTHS(DEF_MULK)
269
270	/----- Other utilities ---------------------------------------------------/
271
272	/* --- @putlen@ --- *
273	*
274	* Arguments: @octet *p@ = pointer to output buffer
275	* @unsigned w@ = size of output buffer
276	* @unsigned blksz@ = block size (assumed fairly small)
277	* @unsigned long nblocks@ = number of blocks
278	* @unsigned nbytes@ = tail size in bytes (assumed small)
279	*
280	* Returns: ---
281	*
282	* Use: Store the overall length in %$\emph{bits}$% (i.e.,
283	* @3(nblocksblksz + nbytes)@ in big-endian form in the
284	* buffer @p@.
285	*/
286
287	static void putlen(octet *p, unsigned w, unsigned blksz,
288	unsigned long nblocks, unsigned nbytes)
289	{
290	unsigned long nblo = nblocks&((1ul << (ULONG_BITS/2)) - 1),
291	nbhi = nblocks >> ULONG_BITS/2;
292	unsigned long nlo = nbloblksz + nbytes, nhi = nbhiblksz;
293
294	/* This is fiddly. Split @nblocks@, which is the big number, into high and
295	* low halves, multiply those separately by @blksz@, propagate carries, and
296	* then multiply by eight.
297	*/
298	nhi += nlo >> ULONG_BITS/2;
299	nlo &= (1ul << (ULONG_BITS/2)) - 1;
300	nlo <<= 3;
301
302	/* Now write out the size, feeding bits in from @nhi@ as necessary. */
303	p += w;
304	while (w--) {
305	*--p = U8(nlo);
306	nlo = (nlo >> 8) \| ((nhi&0xff) << (ULONG_BITS/2 - 5));
307	nhi >>= 8;
308	}
309	}
310
311	/* --- @mix@ --- *
312	*
313	* Arguments: @const gcm_params *p@ = pointer to the parameters
314	* @uint32 *a@ = GHASH accumulator
315	* @const octet *q@ = pointer to an input block
316	* @const uint32 *ktab@ = multiplication table, built by
317	* @gcm_mktable@
318	*
319	* Returns: ---
320	*
321	* Use: Fold the block @q@ into the GHASH accumulator. The
322	* calculation is %$a' = k (a + q)$%.
323	*/
324
325	static void mix(const gcm_params p, uint32 a,
326	const octet q, const uint32 ktab)
327	{
328	unsigned i;
329
330	/* Convert the block from bytes into words, using the appropriate
331	* convention.
332	*/
333	if (p->f&GCMF_SWAP)
334	for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_L(q); q += 4; }
335	else
336	for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_B(q); q += 4; }
337
338	/* Dispatch to the correct multiply-by-%$k$% function. */
339	switch (p->n) {
340	#define CASE(nbits) case nbits/32: gcm_mulk_##nbits(a, ktab); break;
341	GCM_WIDTHS(CASE)
342	#undef CASE
343	default: abort();
344	}
345	}
346
347	/* --- @gcm_ghashdone@ --- *
348	*
349	* Arguments: @const gcm_params *p@ = pointer to the parameters
350	* @uint32 *a@ = GHASH accumulator
351	* @const uint32 *ktab@ = multiplication table, built by
352	* @gcm_mktable@
353	* @unsigned long xblocks, yblocks@ = number of whole blocks in
354	* the two inputs
355	* @unsigned xbytes, ybytes@ = number of trailing bytes in the
356	* two inputs
357	*
358	* Returns: ---
359	*
360	* Use: Finishes a GHASH operation by appending the appropriately
361	* encoded lengths of the two constituent messages.
362	*/
363
364	void gcm_ghashdone(const gcm_params p, uint32 a, const uint32 *ktab,
365	unsigned long xblocks, unsigned xbytes,
366	unsigned long yblocks, unsigned ybytes)
367	{
368	octet b[4*GCM_NMAX];
369	unsigned w = p->n < 3 ? 4p->n : 2p->n;
370
371	/* Construct the encoded lengths. Note that smaller-block versions of GCM
372	* encode the lengths in separate blocks. GCM is only officially defined
373	* for 64- and 128-bit blocks; I've placed the cutoff somewhat arbitrarily
374	* at 96 bits.
375	*/
376	putlen(b, w, 4*p->n, xblocks, xbytes);
377	putlen(b + w, w, 4*p->n, yblocks, ybytes);
378
379	/* Feed the lengths into the accumulator. */
380	mix(p, a, b, ktab);
381	if (p->n < 3) mix(p, a, b + w, ktab);
382	}
383
384	/* --- @gcm_concat@ --- *
385	*
386	* Arguments: @const gcm_params *p@ = pointer to the parameters
387	* @uint32 *z@ = GHASH accumulator for suffix, updated
388	* @const uint32 *x@ = GHASH accumulator for prefix
389	* @const uint32 *ktab@ = multiplication table, built by
390	* @gcm_mktable@
391	* @unsigned long n@ = length of suffix in whole blocks
392	*
393	* Returns: ---
394	*
395	* Use: On entry, @x@ and @z@ are the results of hashing two strings
396	* %$a$% and %$b$%, each a whole number of blocks long; in
397	* particular, %$b$% is @n@ blocks long. On exit, @z@ is
398	* updated to be the hash of %$a \cat b$%.
399	*/
400
401	void gcm_concat(const gcm_params p, uint32 z, const uint32 *x,
402	const uint32 *ktab, unsigned long n)
403	{
404	uint32 t[GCM_NMAX], u[GCM_NMAX];
405	unsigned i, j;
406
407	if (!n) {
408	/* If @n@ is zero, then there's not much to do. The mathematics
409	* (explained below) still works, but the code takes a shortcut which
410	* doesn't handle this case: so set %$z' = z + x k^n = z + x$%.
411	*/
412
413	for (j = 0; j < p->n; j++) z[j] ^= x[j];
414	} else {
415	/* We have %$x = a_0 t^m + \cdots + a_{m-2} t^2 + a_{m-1} t$% and
416	* %$z = b_0 t^n + \cdots + b_{n-2} t^2 + b_{n-1} t$%. What we'd like is
417	* the hash of %$a \cat b$%, which is %$z + x k^n$%.
418	*
419	* The first job, then, is to calculate %$k^n$%, and for this we use a
420	* simple left-to-right square-and-multiply algorithm. There's no need
421	* to keep %$n$% secret here.
422	*/
423
424	/* Start by retrieving %$k$% from the table, and convert it to big-endian
425	* form.
426	*/
427	if (!(p->f&GCMF_SWAP)) for (j = 0; j < p->n; j++) u[j] = ktab[j];
428	else for (j = 0; j < p->n; j++) u[j] = ENDSWAP32(ktab[24*p->n + j]);
429
430	/* Now calculate %$k^n$%. */
431	i = ULONG_BITS;
432	#define BIT (1ul << (ULONG_BITS - 1))
433	while (!(n&BIT)) { n <<= 1; i--; }
434	n <<= 1; i--; for (j = 0; j < p->n; j++) t[j] = u[j];
435	while (i--) { mul(p, t, t, t); if (n&BIT) mul(p, t, t, u); n <<= 1; }
436	#undef BIT
437
438	/* Next, calculate %$x k^n$%. If we're using a little-endian convention
439	* then we must convert %$x$%; otherwise we can just use it in place.
440	*/
441	if (!(p->f&GCMF_SWAP))
442	mul(p, t, t, x);
443	else {
444	for (j = 0; j < p->n; j++) u[j] = ENDSWAP32(x[j]);
445	mul(p, t, t, u);
446	}
447
448	/* Finally, add %$x k^n$% onto %$z$%, converting back to little-endian if
449	* necessary.
450	*/
451	if (!(p->f&GCMF_SWAP)) for (j = 0; j < p->n; j++) z[j] ^= t[j];
452	else for (j = 0; j < p->n; j++) z[j] ^= ENDSWAP32(t[j]);
453	}
454	}
455
456	/----- That's all, folks -------------------------------------------------/