[catacomb] / symm / poly1305.c

/* -*-c-*-
 *
 * Poly1305 message authentication code
 *
 * (c) 2017 Straylight/Edgeware
 */

/*----- Licensing notice --------------------------------------------------*
 *
 * This file is part of Catacomb.
 *
 * Catacomb is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Library General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * Catacomb is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with Catacomb; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
 * MA 02111-1307, USA.
 */

/*----- Header files ------------------------------------------------------*/

#include "config.h"

#include <assert.h>
#include <string.h>

#include "poly1305.h"

/*----- Global variables --------------------------------------------------*/

const octet poly1305_keysz[] = { KSZ_SET, 16, 0 };

/*----- Low-level implementation for 32/64-bit targets --------------------*/

#if !defined(POLY1305_IMPL) && defined(HAVE_UINT64)
#  define POLY1305_IMPL 26
#endif

#if POLY1305_IMPL == 26

/* Elements x of GF(2^130 - 5) are represented by five integers x_i: x =
 * SUM_{0<=i<5} x_i 2^{26i}.
 *
 * Not all elements are represented canonically.  We have 0 <= r_i, s_i <
 * 2^26 by construction.  We maintain 0 <= h_i < 2^27.  When we read a
 * message block m, we have 0 <= m_i < 2^26 by construction again.  When we
 * update the hash state, we calculate h' = r (h + m).  Addition is done
 * componentwise; let t = h + m, and we will have 0 <= t_i < 3*2^26.
 */
typedef uint32 felt[5];
#define M26 0x03ffffff
#define P p26

/* Convert 32-bit words into field-element pieces. */
#define P26W0(x)  (((x##0) <<  0)&0x03ffffff)
#define P26W1(x) ((((x##1) <<  6)&0x03ffffc0) | (((x##0) >> 26)&0x0000003f))
#define P26W2(x) ((((x##2) << 12)&0x03ffffff) | (((x##1) >> 20)&0x00000fff))
#define P26W3(x) ((((x##3) << 18)&0x03fc0000) | (((x##2) >> 14)&0x0003ffff))
#define P26W4(x)				(((x##3) >>  8)&0x00ffffff)

/* Propagate carries in parallel.  If 0 <= u_i < 2^26 c_i, then we shall have
 * 0 <= v_0 < 2^26 + 5 c_4, and 0 <= v_i < 2^26 + c_{i-1} for 1 <= i < 5.
 */
#define CARRY_REDUCE(v, u) do {						\
  (v##0) = ((u##0)&M26) + 5*((u##4) >> 26);				\
  (v##1) = ((u##1)&M26) +   ((u##0) >> 26);				\
  (v##2) = ((u##2)&M26) +   ((u##1) >> 26);				\
  (v##3) = ((u##3)&M26) +   ((u##2) >> 26);				\
  (v##4) = ((u##4)&M26) +   ((u##3) >> 26);				\
} while (0)

/* General multiplication, used by `concat'. */
static void mul(felt z, const felt x, const felt y)
{
  /* Initial bounds: we assume x_i, y_i < 2^27.  On exit, z_i < 2^27. */

  uint32 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4];
  uint32 y0 = y[0], y1 = y[1], y2 = y[2], y3 = y[3], y4 = y[4];
  uint64 u0, u1, u2, u3, u4;
  uint64 v0, v1, v2, v3, v4;
  uint32 z0, z1, z2, z3, z4;

  /* Do the multiplication: u = h x mod 2^130 - 5.  We will have u_i <
   * 2^27 (5 (4 - i) + i + 1) 2^27 = 2^54 (21 - 4 i) = 2^52 (84 - 16 i).  In
   * all cases we have u_i < 84*2^52 < 2^59.  Notably, u_4 < 5*2^54 =
   * 20*2^52.
   */
#define M(x, y) ((uint64)(x)*(y))
  u0 = M(x0, y0) + (M(x1, y4) +  M(x2, y3) +  M(x3, y2) +  M(x4, y1))*5;
  u1 = M(x0, y1) +  M(x1, y0) + (M(x2, y4) +  M(x3, y3) +  M(x4, y2))*5;
  u2 = M(x0, y2) +  M(x1, y1) +  M(x2, y0) + (M(x3, y4) +  M(x4, y3))*5;
  u3 = M(x0, y3) +  M(x1, y2) +  M(x2, y1) +  M(x3, y0) + (M(x4, y4))*5;
  u4 = M(x0, y4) +  M(x1, y3) +  M(x2, y2) +  M(x3, y1) +  M(x4, y0);
#undef M

  /* Now we must reduce the coefficients.  We do this in an approximate
   * manner which avoids long data-dependency chains, but requires two
   * passes.
   *
   * The reduced carry down from u_4 to u_0 in the first pass will be c_0 <
   * 100*2^26; the remaining c_i are smaller: c_i < 2^26 (84 - 16 i).  This
   * leaves 0 <= v_i < 101*2^26.  The carries in the second pass are bounded
   * above by 180.
   */
  CARRY_REDUCE(v, u); CARRY_REDUCE(z, v);
  z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4;
}

/* General squaring, used by `concat'. */
static void sqr(felt z, const felt x)
{
  /* Initial bounds: we assume x_i < 2^27.  On exit, z_i < 2^27. */

  uint32 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4];
  uint64 u0, u1, u2, u3, u4;
  uint64 v0, v1, v2, v3, v4;
  uint32 z0, z1, z2, z3, z4;

  /* Do the squaring.  See `mul' for bounds. */
#define M(x, y) ((uint64)(x)*(y))
  u0 = M(x0, x0) +			      10*(M(x1, x4) +   M(x2, x3));
  u1 =		   2* M(x0, x1) +	       5*(M(x3, x3) + 2*M(x2, x4));
  u2 = M(x1, x1) + 2* M(x0, x2) +	      10* M(x3, x4);
  u3 =		   2*(M(x0, x3) + M(x1, x2)) + 5* M(x4, x4);
  u4 = M(x2, x2) + 2*(M(x0, x4) + M(x1, x3));
#undef M

  /* Now we must reduce the coefficients.  See `mul' for bounds. */
  CARRY_REDUCE(v, u); CARRY_REDUCE(z, v);
  z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4;
}

/* Multiplication by r, using precomputation. */
static void mul_r(const poly1305_ctx *ctx, felt z, const felt x)
{
  /* Initial bounds: by construction, r_i < 2^26.  We assume x_i < 3*2^26.
   * On exit, z_i < 2^27.
   */

  uint32
    r0 = ctx->k.u.p26.r0,
    r1 = ctx->k.u.p26.r1, rr1 = ctx->k.u.p26.rr1,
    r2 = ctx->k.u.p26.r2, rr2 = ctx->k.u.p26.rr2,
    r3 = ctx->k.u.p26.r3, rr3 = ctx->k.u.p26.rr3,
    r4 = ctx->k.u.p26.r4, rr4 = ctx->k.u.p26.rr4;
  uint32 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4];
  uint64 u0, u1, u2, u3, u4;
  uint64 v0, v1, v2, v3, v4;
  uint32 z0, z1, z2, z3, z4;

  /* Do the multiplication: u = h x mod 2^130 - 5.  We will have u_i <
   * 2^26 (5 (4 - i) + i + 1) 3*2^26 = 2^52 (63 - 12 i).  In all cases
   * we have u_i < 63*2^52 < 2^58.  Notably, u_4 < 15*2^52.
   */
#define M(x, y) ((uint64)(x)*(y))
  u0 = M(x0, r0)  + M(x1, rr4) + M(x2, rr3) + M(x3, rr2) + M(x4, rr1);
  u1 = M(x0, r1)  + M(x1, r0)  + M(x2, rr4) + M(x3, rr3) + M(x4, rr2);
  u2 = M(x0, r2)  + M(x1, r1)  + M(x2, r0)  + M(x3, rr4) + M(x4, rr3);
  u3 = M(x0, r3)  + M(x1, r2)  + M(x2, r1)  + M(x3, r0)  + M(x4, rr4);
  u4 = M(x0, r4)  + M(x1, r3)  + M(x2, r2)  + M(x3, r1)  + M(x4, r0);
#undef M

  /* Now we must reduce the coefficients.  We do this in an approximate
   * manner which avoids long data-dependency chains, but requires two
   * passes.
   *
   * The reduced carry down from u_4 to u_0 in the first pass will be c_0 <
   * 75*2^26; the remaining c_i are smaller: c_i < 2^26 (63 - 12 i).  This
   * leaves 0 <= v_i < 76*2^26.  The carries in the second pass are bounded
   * above by 135.
   */
  CARRY_REDUCE(v, u); CARRY_REDUCE(z, v);
  z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4;
}

#endif

/*----- Low-level implementation for 32/64-bit targets --------------------*/

#ifndef POLY1305_IMPL
#  define POLY1305_IMPL 11
#endif

#if POLY1305_IMPL == 11

/* Elements x of GF(2^130 - 5) are represented by 12 integers x_i: x =
 * SUM_{0<=i<12} x_i 2^P_i, where P_i = SUM_{0<=j<i} w_j, and w_5 = w_11 =
 * 10, and w_i = 11 for i in { 0, 1, 2, 3, 4, 6, 7, 8, 9, 10 }.
 *
 * Not all elements are represented canonically.  We have 0 <= r_i, s_i <
 * 2^w_i <= 2^11 by construction.  We maintain 0 <= h_i < 2^12.  When we read
 * a message block m, we have 0 <= m_i < 2^w_i by construction again.  When
 * we update the hash state, we calculate h' = r (h + m).  Addition is done
 * componentwise; let t = h + m, and we will have 0 <= t_i < 3*2^11.
 */
typedef uint16 felt[12];
#define M10 0x3ff
#define M11 0x7ff
#define P p11

/* Load a field element from an octet string. */
static void load_p11(felt d, const octet *s)
{
  unsigned i, j, n, w;
  uint16 m;
  uint32 a;

  for (i = j = n = 0, a = 0; j < 12; j++) {
    if (j == 5 || j == 11) { w = 10; m = M10; }
    else		   { w = 11; m = M11; }
    while (n < w && i < 16) { a |= s[i++] << n; n += 8; }
    d[j] = a&m; a >>= w; n -= w;
  }
}

/* Reduce a field-element's pieces to manageable size. */
static void carry_reduce(uint32 u[12])
{
  /* Initial bounds: we assume u_i < 636*2^22.  On exit, u_i < 2^11. */

  unsigned i;
  uint32 c;

  /* Do sequential carry propagation (16-bit CPUs are less likely to benefit
   * from instruction-level parallelism).  Start at u_9; truncate it to 11
   * bits, and add the carry onto u_10.  Truncate u10 to 11 bits, and add the
   * carry onto u_11.  Truncate u_11 to 10 bits, and add five times the carry
   * onto u_0.  And so on.
   *
   * The carry is larger than the pieces we're leaving behind.  Let c_i be
   * the high portion of u_i, to be carried onto u_{i+1}.  I claim that c_i <
   * 2557*2^10.  Then the carry /into/ any u_i is at most 12785*2^10 < 2^24
   * (allowing for the reduction as we carry from u_11 to u_0), and u_i after
   * carry is bounded above by 636*2^22 + 12785*2^10 < 2557*2^20.  Hence, the
   * carry out is at most 2557*2^10, as claimed.
   *
   * Once we reach u_9 for the second time, we start with u_9 < 2^11.  The
   * carry into u_9 is at most 2557*2^10 < 1279*2^11 as calculated above; so
   * the carry out into u_10 is at most 1280.  Since u_10 < 2^11 prior to
   * this carry in, we now have u_10 < 2^11 + 1280 < 2^12; so the carry out
   * into u_11 is at most 1.  The final reduction therefore only needs a
   * conditional subtraction.
   */
			   {		   c =  u[9] >> 11;  u[9] &= M11; }
			   { u[10] +=   c; c = u[10] >> 11; u[10] &= M11; }
			   { u[11] +=   c; c = u[11] >> 10; u[11] &= M10; }
			   {  u[0] += 5*c; c =  u[0] >> 11;  u[0] &= M11; }
  for (i = 1; i <  5; i++) {  u[i] +=   c; c =  u[i] >> 11;  u[i] &= M11; }
			   {  u[5] +=   c; c =  u[5] >> 10;  u[5] &= M10; }
  for (i = 6; i < 11; i++) {  u[i] +=   c; c =  u[i] >> 11;  u[i] &= M11; }
  u[11] += c;
}

/* General multiplication. */
static void mul(felt z, const felt x, const felt y)
{
  /* Initial bounds: we assume x_i < 3*2^11, and y_i < 2^12.  On exit,
   * z_i < 2^12.
   */

  uint32 u[12];
  unsigned i, j, k;

  /* Do the main multiplication.  After this, we shall have
   *
   *	      { 2^22 (636 - 184 i)	for 0 <= i < 6
   *	u_i < {
   *	      { 2^22 (732 - 60 i)	for 6 <= i < 12
   *
   * In particular, u_0 < 636*2^22 < 2^32, and u_11 < 72*2^22.
   *
   * The irregularly positioned pieces are annoying.  Because we fold the
   * reduction into the multiplication, it's also important to see where the
   * reduced products fit.  Finally, products don't align with the piece
   * boundaries, and sometimes need to be doubled.  The following table
   * tracks all of this.
   *
   *  piece    width   offset  second
   *	 0	11	  0	130
   *	 1	11	 11	141
   *	 2	11	 22	152
   *	 3	11	 33	163
   *	 4	11	 44	174
   *	 5	10	 55	185
   *	 6	11	 65	195
   *	 7	11	 76	206
   *	 8	11	 87	217
   *	 9	11	 98	228
   *	10	11	109	239
   *	11	10	120	250
   *
   * The next table tracks exactly which products end up being multiplied by
   * which constants and accumulated into which destination pieces.
   *
   *   u_k =    t_i r_j +	 2 t_i r_j + 5 t_i r_j +  10 t_i r_j
   *	 0	0/0		   --	     6/6	  1-5/11-7 7-11/5-1
   *	 1	0-1/1-0		   --	     6-7/7-6	  2-5/11-8 8-11/5-2
   *	 2	0-2/2-0		   --	     6-8/8-6	  3-5/11-9 9-11/5-3
   *	 3	0-3/3-0		   --	     6-9/9-6	  4-5/11-10 10-11/5-4
   *	 4	0-4/4-0		   --	     6-10/10-6	  5/11 11/5
   *	 5	0-5/5-0		   --	     6-11/11-6	  --
   *	 6	0/6 6/0		   1-5/5-1   --		  7-11/11-7
   *	 7	0-1/7-6 6-7/1-0	   2-5/5-2   --		  8-11/11-8
   *	 8	0-2/8-6 6-8/2-0	   3-5/5-3   --		  9-11/11-9
   *	 9	0-3/9-6 6-9/3-0	   4-5/5-4   --		  10-11/11-10
   *	10	0-4/10-6 6-10/4-0  5/5	     --		  11/11
   *	11	0-11/11-0	   --	     --		  --
   *
   * And, finally, trying to bound the multiple of 6*2^22 in each destination
   * piece is fiddly, so here's a tableau showing the calculation.
   *
   *     k      1* + 2* + 5* +10*   =	 1* +  5*   =
   *     0	1   --    1   10	 1    21	106
   *	 1	2   --	  2    8	 2    18	 92
   *     2	3   --	  3    6	 3    15	 78
   *	 3	4   --	  4    4	 4    12	 64
   *     4	5   --	  5    2	 5     9	 50
   *	 5	6   --	  6   --	 6     6	 36
   *     6	2    5   --    5	12    10	 62
   *	 7	4    4	 --    4	12     8	 52
   *     8	6    3	 --    3	12     6	 42
   *	 9	8    2	 --    2	12     4	 32
   *    10     10    1	 --    1	12     2	 22
   *	11     12   --	 --   --	12     0	 12
   */

  for (i = 0; i < 12; i++) u[i] = 0;

#define M(i, j) ((uint32)x[i]*y[j])

  /* Product terms we must multiply by 10. */
  for (k = 0; k < 5; k++) {
    for (i = k + 1; i < 6; i++) {
      j = 12 + k - i;
      u[k] += M(i, j) + M(j, i);
      u[k + 6] += M(i + 6, j);
    }
  }
  for (k = 0; k < 5; k++) u[k] *= 2;
  for (k = 6; k < 11; k++) u[k] *= 5;

  /* Product terms we must multiply by 5. */
  for (k = 0; k < 6; k++) {
    for (i = k + 6; i >= 6; i--) {
      j = 12 + k - i;
      u[k] += M(i, j);
    }
  }
  for (k = 0; k < 6; k++) u[k] *= 5;

  /* Product terms we must multiply by 2. */
  for (k = 6; k < 11; k++) {
    for (i = k - 5; i < 6; i++) {
      j = k - i;
      u[k] += M(i, j);
    }
  }
  for (k = 6; k < 11; k++) u[k] *= 2;

  /* Remaining product terms. */
  for (k = 0; k < 6; k++) {
    for (i = k; i < 6; i--) {
      j = k - i;
      u[k] += M(i, j);
      u[k + 6] += M(i + 6, j) + M(i, j + 6);
    }
  }

#undef M

  /* Do the reduction.  Currently, `carry_reduce' does more than we need, but
   * that's fine.
   */
  carry_reduce(u);

  /* Done.  Write out the answer. */
  for (i = 0; i < 12; i++) z[i] = u[i];
}

/* General squaring, used by `concat'. */
static void sqr(felt z, const felt x)
  { mul(z, x, x); }

/* Multiplication by r. */
static void mul_r(const poly1305_ctx *ctx, felt z, const felt x)
  { mul(z, x, ctx->k.u.p11.r); }

#endif

/*----- Interface functions -----------------------------------------------*/

/* --- @poly1305_keyinit@ --- *
 *
 * Arguments:	@poly1305_key *key@ = key structure to fill in
 *		@const void *k@ = pointer to key material
 *		@size_t ksz@ = length of key (must be @POLY1305_KEYSZ == 16@)
 *
 * Returns:	---
 *
 * Use:		Records a Poly1305 key and performs (minimal)
 *		precomputations.
 */

void poly1305_keyinit(poly1305_key *key, const void *k, size_t ksz)
{
  const octet *r = k;
#if POLY1305_IMPL == 11
  octet rr[16];
#endif

  KSZ_ASSERT(poly1305, ksz);

#if POLY1305_IMPL == 26
  uint32 r0 = LOAD32_L(r +  0), r1 = LOAD32_L(r +  4),
	 r2 = LOAD32_L(r +  8), r3 = LOAD32_L(r + 12);

  r0 &= 0x0fffffff; r1 &= 0x0ffffffc; r2 &= 0x0ffffffc; r3 &= 0x0ffffffc;
  key->u.p26.r0 = P26W0(r); key->u.p26.r1 = P26W1(r);
  key->u.p26.r2 = P26W2(r); key->u.p26.r3 = P26W3(r);
  key->u.p26.r4 = P26W4(r);

  key->u.p26.rr1 = 5*key->u.p26.r1; key->u.p26.rr2 = 5*key->u.p26.r2;
  key->u.p26.rr3 = 5*key->u.p26.r3; key->u.p26.rr4 = 5*key->u.p26.r4;
#else
  memcpy(rr, r, 16);
		  rr[ 3] &= 0x0f;
  rr[ 4] &= 0xfc; rr[ 7] &= 0x0f;
  rr[ 8] &= 0xfc; rr[11] &= 0x0f;
  rr[12] &= 0xfc; rr[15] &= 0x0f;
  load_p11(key->u.p11.r, rr);
#endif
}

/* --- @poly1305_macinit@ --- *
 *
 * Arguments:	@poly1305_ctx *ctx@ = MAC context to fill in
 *		@const poly1305_key *key@ = pointer to key structure to use
 *		@const void *iv@ = pointer to mask string
 *
 * Returns:	---
 *
 * Use:		Initializes a MAC context for use.  The key can be discarded
 *		at any time.
 *
 *		It is permitted for @iv@ to be null, though it is not then
 *		possible to complete the MAC computation on @ctx@.  The
 *		resulting context may still be useful, e.g., as an operand to
 *		@poly1305_concat@.
 */

void poly1305_macinit(poly1305_ctx *ctx,
		      const poly1305_key *key, const void *iv)
{
  const octet *s = iv;
#if POLY1305_IMPL == 26
  uint32 s0, s1, s2, s3;
#else
  unsigned i;
#endif

#if POLY1305_IMPL == 26
  if (s) {
    s0 = LOAD32_L(s +  0); s1 = LOAD32_L(s +  4);
    s2 = LOAD32_L(s +  8); s3 = LOAD32_L(s + 12);
    ctx->u.p26.s0 = P26W0(s); ctx->u.p26.s1 = P26W1(s);
    ctx->u.p26.s2 = P26W2(s); ctx->u.p26.s3 = P26W3(s);
    ctx->u.p26.s4 = P26W4(s);
  }
  ctx->u.p26.h[0] = ctx->u.p26.h[1] = ctx->u.p26.h[2] =
    ctx->u.p26.h[3] = ctx->u.p26.h[4] = 0;
#else
  if (s) load_p11(ctx->u.p11.s, s);
  for (i = 0; i < 12; i++) ctx->u.p11.h[i] = 0;
#endif
  ctx->k = *key;
  ctx->nbuf = 0;
  ctx->count = 0;
}

/* --- @poly1305_copy@ --- *
 *
 * Arguments:	@poly1305_ctx *to@ = destination context
 *		@const poly1305_ctx *from@ = source context
 *
 * Returns:	---
 *
 * Use:		Duplicates a Poly1305 MAC context.  The destination need not
 *		have been initialized.  Both contexts can be used
 *		independently afterwards.
 */

void poly1305_copy(poly1305_ctx *ctx, const poly1305_ctx *from)
  { *ctx = *from; }

/* --- @poly1305_hash@ --- *
 *
 * Arguments:	@poly1305_ctx *ctx@ = MAC context to update
 *		@const void *p@ = pointer to message data
 *		@size_t sz@ = length of message data
 *
 * Returns:	---
 *
 * Use:		Processes a chunk of message.  The message pieces may have
 *		arbitrary lengths, and may be empty.
 */

static void update_full(poly1305_ctx *ctx, const octet *p)
{
  felt t;
#if POLY1305_IMPL == 26
  uint32
    m0 = LOAD32_L(p +  0), m1 = LOAD32_L(p +  4),
    m2 = LOAD32_L(p +  8), m3 = LOAD32_L(p + 12);

  t[0] = ctx->u.p26.h[0] + P26W0(m);
  t[1] = ctx->u.p26.h[1] + P26W1(m);
  t[2] = ctx->u.p26.h[2] + P26W2(m);
  t[3] = ctx->u.p26.h[3] + P26W3(m);
  t[4] = ctx->u.p26.h[4] + P26W4(m) + 0x01000000;
#else
  unsigned i;

  load_p11(t, p); t[11] += 0x100;
  for (i = 0; i < 12; i++) t[i] += ctx->u.p11.h[i];
#endif

  mul_r(ctx, ctx->u.P.h, t);
  ctx->count++;
}

void poly1305_hash(poly1305_ctx *ctx, const void *p, size_t sz)
{
  const octet *pp = p;
  size_t n;

  if (ctx->nbuf) {
    if (sz < 16 - ctx->nbuf) {
      memcpy(ctx->buf + ctx->nbuf, p, sz);
      ctx->nbuf += sz;
      return;
    }
    n = 16 - ctx->nbuf;
    memcpy(ctx->buf + ctx->nbuf, pp, n);
    update_full(ctx, ctx->buf);
    pp += n; sz -= n;
  }
  while (sz >= 16) {
    update_full(ctx, pp);
    pp += 16; sz -= 16;
  }
  if (sz) memcpy(ctx->buf, pp, sz);
  ctx->nbuf = sz;
}

/* --- @poly1305_flush@ --- *
 *
 * Arguments:	@poly1305_ctx *ctx@ = MAC context to flush
 *
 * Returns:	---
 *
 * Use:		Forces any buffered message data in the context to be
 *		processed.  This has no effect if the message processed so
 *		far is a whole number of blocks.  Flushing is performed
 *		automatically by @poly1305_done@, but it may be necessary to
 *		force it by hand when using @poly1305_concat@.
 *
 *		Flushing a partial block has an observable effect on the
 *		computation: the resulting state is (with high probability)
 *		dissimilar to any state reachable with a message which is a
 *		whole number of blocks long.
 */

void poly1305_flush(poly1305_ctx *ctx)
{
  felt t;
#if POLY1305_IMPL == 26
  uint32 m0, m1, m2, m3;
#else
  unsigned i;
#endif

  if (!ctx->nbuf) return;
  ctx->buf[ctx->nbuf++] = 1; memset(ctx->buf + ctx->nbuf, 0, 16 - ctx->nbuf);
#if POLY1305_IMPL == 26
  m0 = LOAD32_L(ctx->buf +  0); m1 = LOAD32_L(ctx->buf +  4);
  m2 = LOAD32_L(ctx->buf +  8); m3 = LOAD32_L(ctx->buf + 12);

  t[0] = ctx->u.p26.h[0] + P26W0(m);
  t[1] = ctx->u.p26.h[1] + P26W1(m);
  t[2] = ctx->u.p26.h[2] + P26W2(m);
  t[3] = ctx->u.p26.h[3] + P26W3(m);
  t[4] = ctx->u.p26.h[4] + P26W4(m);
#else
  load_p11(t, ctx->buf);
  for (i = 0; i < 12; i++) t[i] += ctx->u.p11.h[i];
#endif

  mul_r(ctx, ctx->u.P.h, t);
  ctx->count++;
  ctx->nbuf = 0;
}

/* --- @poly1305_concat@ --- *
 *
 * Arguments:	@poly1305_ctx *ctx@ = destination context
 *		@const poly1305_ctx *prefix, *suffix@ = two operand contexts
 *
 * Returns:	---
 *
 * Use:		The two operand contexts @prefix@ and @suffix@ represent
 *		processing of two messages %$m$% and %$m'$%; the effect is to
 *		set @ctx@ to the state corresponding to their concatenation
 *		%$m \cat m'$%.
 *
 *		All three contexts must have been initialized using the same
 *		key value (though not necessarily from the same key
 *		structure).  The mask values associated with the input
 *		contexts are irrelevant.  The @prefix@ message %$m$% must be
 *		a whole number of blocks long: this can be arranged by
 *		flushing the context.  The @suffix@ message need not be a
 *		whole number of blocks long.  All of the contexts remain
 *		operational and can be used independently afterwards.
 */

void poly1305_concat(poly1305_ctx *ctx,
		     const poly1305_ctx *prefix, const poly1305_ctx *suffix)
{
  /* Assume that lengths are public, so it's safe to behave conditionally on
   * the bits of ctx->count.
   */
  unsigned long n;
  unsigned i;
  felt x;
#if POLY1305_IMPL == 26
  uint32 x0, x1, x2, x3, x4, y0, y1, y2, y3, y4;
#else
  uint32 y[12];
#endif

  /* We can only concatenate if the prefix is block-aligned. */
  assert(!prefix->nbuf);

  /* The hash for a message m = m_{k-1} m_{k-2} ... m_1 m_0 is h_r(m) =
   * SUM_{0<=i<k} m_i r^{i+1}.  If we have two messages, m, m', of lengths k
   * and k' blocks respectively, then
   *
   *	h_r(m || m') = SUM_{0<=i<k} m_i r^{k'+i+1} +
   *			 SUM_{0<=i<k'} m'_i r^{i+1}
   *		     =  r^{k'} h_r(m) + h_r(m')
   *
   * This is simple left-to-right square-and-multiply exponentiation.
   */
  n = suffix->count;
  x[0] = 1;
#if POLY1305_IMPL == 26
  x[1] = x[2] = x[3] = x[4] = 0;
#else
  for (i = 1; i < 12; i++) x[i] = 0;
#endif
#define BIT (1ul << (ULONG_BITS - 1))
  if (n) {
    i = ULONG_BITS;
    while (!(n & BIT)) { n <<= 1; i--; }
    mul_r(prefix, x, x); n <<= 1; i--;
    while (i--) { sqr(x, x); if (n & BIT) mul_r(prefix, x, x); n <<= 1; }
  }
#undef BIT
  mul(x, prefix->u.P.h, x);

  /* Add on the suffix hash. */
#if POLY1305_IMPL == 26
  /* We're going to add the two hashes elementwise.  Both h' = h_r(m') and
   * x = r^{k'} h_r(m) are bounded above by 2^27, so the sum will be bounded
   * by 2^28; but this is too large to leave in the accumulator.  (Strictly,
   * we could get away with it, but the caller can in theory chain an
   * arbitrary number of concatenations and expect us to cope, and we'd
   * definitely overflow eventually.)  So we reduce.  Since the excess is so
   * small, a single round of `CARRY_REDUCE' is enough.
   */
  x0 = x[0] + suffix->u.p26.h[0]; x1 = x[1] + suffix->u.p26.h[1];
  x2 = x[2] + suffix->u.p26.h[2]; x3 = x[3] + suffix->u.p26.h[3];
  x4 = x[4] + suffix->u.p26.h[4];
  CARRY_REDUCE(y, x);
  ctx->u.p26.h[0] = y0; ctx->u.p26.h[1] = y1; ctx->u.p26.h[2] = y2;
  ctx->u.p26.h[3] = y3; ctx->u.p26.h[4] = y4;
#else
  /* We'll add the two hashes elementwise and have to reduce again.  The
   * numbers are different, but the reasoning is basically the same.
   */
  for (i = 0; i < 12; i++) y[i] = x[i] + suffix->u.p11.h[i];
  carry_reduce(y);
  for (i = 0; i < 12; i++) ctx->u.p11.h[i] = y[i];
#endif

  /* Copy the remaining pieces of the context to set up the result. */
  if (ctx != suffix) {
    memcpy(ctx->buf, suffix->buf, suffix->nbuf);
    ctx->nbuf = suffix->nbuf;
  }
  ctx->count = prefix->count + suffix->count;
}

/* --- @poly1305_done@ --- *
 *
 * Arguments:	@poly1305_ctx *ctx@ = MAC context to finish
 *		@void *h@ = buffer to write the tag to
 *
 * Returns:	---
 *
 * Use:		Completes a Poly1305 MAC tag computation.
 */

void poly1305_done(poly1305_ctx *ctx, void *h)
{
  octet *p = h;

#if POLY1305_IMPL == 26
  uint32 m_sub, t, c;
  uint32 h0, h1, h2, h3, h4, hh0, hh1, hh2, hh3, hh4;

  /* If there's anything left over in the buffer, pad it to form a final
   * coefficient and update the evaluation one last time.
   */
  poly1305_flush(ctx);

  /* Collect the final hash state. */
  h0 = ctx->u.p26.h[0];
  h1 = ctx->u.p26.h[1];
  h2 = ctx->u.p26.h[2];
  h3 = ctx->u.p26.h[3];
  h4 = ctx->u.p26.h[4];

  /* Reduce the final value mod 2^130 - 5.  First pass: set h <- h +
   * 5 floor(h/2^130).  After this, the low pieces of h will be normalized:
   * 0 <= h_i < 2^26 for 0 <= i < 4; and 0 <= h_4 < 2^26 + 1.  In the
   * (highly unlikely) event that h_4 >= 2^26, set c and truncate to 130
   * bits.
   */
	     c = h4 >> 26; h4 &= M26;
  h0 += 5*c; c = h0 >> 26; h0 &= M26;
  h1 +=   c; c = h1 >> 26; h1 &= M26;
  h2 +=   c; c = h2 >> 26; h2 &= M26;
  h3 +=   c; c = h3 >> 26; h3 &= M26;
  h4 +=   c; c = h4 >> 26; h4 &= M26;

  /* Calculate h' = h - (2^130 - 5).  If h' >= 0 then t ends up 1; otherwise
   * it's zero.
   */
  t  = h0 + 5; hh0 = t&M26; t >>= 26;
  t += h1;     hh1 = t&M26; t >>= 26;
  t += h2;     hh2 = t&M26; t >>= 26;
  t += h3;     hh3 = t&M26; t >>= 26;
  t += h4;     hh4 = t&M26; t >>= 26;

  /* Keep the subtraction result above if t or c is set. */
  m_sub = -(t | c);
  h0 = (hh0&m_sub) | (h0&~m_sub);
  h1 = (hh1&m_sub) | (h1&~m_sub);
  h2 = (hh2&m_sub) | (h2&~m_sub);
  h3 = (hh3&m_sub) | (h3&~m_sub);
  h4 = (hh4&m_sub) | (h4&~m_sub);

  /* Add the mask onto the hash result. */
  t  = h0 + ctx->u.p26.s0; h0 = t&M26; t >>= 26;
  t += h1 + ctx->u.p26.s1; h1 = t&M26; t >>= 26;
  t += h2 + ctx->u.p26.s2; h2 = t&M26; t >>= 26;
  t += h3 + ctx->u.p26.s3; h3 = t&M26; t >>= 26;
  t += h4 + ctx->u.p26.s4; h4 = t&M26; t >>= 26;

  /* Convert this mess back into 32-bit words.  We lose the top two bits,
   * but that's fine.
   */
  h0 = (h0 >>  0) | ((h1 & 0x0000003f) << 26);
  h1 = (h1 >>  6) | ((h2 & 0x00000fff) << 20);
  h2 = (h2 >> 12) | ((h3 & 0x0003ffff) << 14);
  h3 = (h3 >> 18) | ((h4 & 0x00ffffff) <<  8);

  /* All done. */
  STORE32_L(p +  0, h0); STORE32_L(p +  4, h1);
  STORE32_L(p +  8, h2); STORE32_L(p + 12, h3);
#else
  uint16 hh[12], hi[12], c, t, m_sub;
  uint32 a;
  unsigned i, j, n;

  /* If there's anything left over in the buffer, pad it to form a final
   * coefficient and update the evaluation one last time.
   */
  poly1305_flush(ctx);

  /* Collect the final hash state. */
  for (i = 0; i < 12; i++) hh[i] = ctx->u.p11.h[i];

  /* Reduce the final value mod 2^130 - 5.  First pass: set h <- h +
   * 5 floor(h/2^130).  After this, the low pieces of h will be normalized:
   * 0 <= h_i < 2^{w_i} for 0 <= i < 11; and 0 <= h_{11} < 2^10 + 1.  In the
   * (highly unlikely) event that h_{11} >= 2^10, set c and truncate to 130
   * bits.
   */
  c = 5*(hh[11] >> 10); hh[11] &= M10;
  for (i = 0; i < 12; i++) {
    if (i == 5 || i == 11) { c += hh[i]; hh[i] = c&M10; c >>= 10; }
    else		   { c += hh[i]; hh[i] = c&M11; c >>= 11; }
  }

  /* Calculate h' = h - (2^130 - 5).  If h' >= 0 then t ends up 1; otherwise
   * it's zero.
   */
  for (i = 0, t = 5; i < 12; i++) {
    t += hh[i];
    if (i == 5 || i == 11) { hi[i] = t&M10; t >>= 10; }
    else		   { hi[i] = t&M11; t >>= 11; }
  }

  /* Keep the subtraction result above if t or c is set. */
  m_sub = -(t | c);
  for (i = 0; i < 12; i++) hh[i] = (hi[i]&m_sub) | (hh[i]&~m_sub);

  /* Add the mask onto the hash result. */
  for (i = 0, t = 0; i < 12; i++) {
    t += hh[i] + ctx->u.p11.s[i];
    if (i == 5 || i == 11) { hh[i] = t&M10; t >>= 10; }
    else		   { hh[i] = t&M11; t >>= 11; }
  }

  /* Convert this mess back into bytes.  We lose the top two bits, but that's
   * fine.
   */
  for (i = j = n = 0, a = 0; i < 16; i++) {
    if (n < 8) {
      a |= hh[j] << n;
      n += (j == 5 || j == 11) ? 10 : 11;
      j++;
    }
    p[i] = a&0xff; a >>= 8; n -= 8;
  }

#endif
}

/*----- Test rig ----------------------------------------------------------*/

#ifdef TEST_RIG

#include <mLib/testrig.h>

static int vrf_hash(dstr v[])
{
  poly1305_key k;
  poly1305_ctx ctx;
  dstr t = DSTR_INIT;
  unsigned i, j;

  if (v[0].len != 16) { fprintf(stderr, "bad key length\n"); exit(2); }
  if (v[1].len != 16) { fprintf(stderr, "bad mask length\n"); exit(2); }
  if (v[3].len != 16) { fprintf(stderr, "bad tag length\n"); exit(2); }
  dstr_ensure(&t, 16); t.len = 16;

  poly1305_keyinit(&k, v[0].buf, v[0].len);
  for (i = 0; i < v[2].len; i++) {
    for (j = i; j < v[2].len; j++) {
      poly1305_macinit(&ctx, &k, v[1].buf);
      poly1305_hash(&ctx, v[2].buf, i);
      poly1305_hash(&ctx, v[2].buf + i, j - i);
      poly1305_hash(&ctx, v[2].buf + j, v[2].len - j);
      poly1305_done(&ctx, t.buf);
      if (memcmp(t.buf, v[3].buf, 16) != 0) {
	fprintf(stderr, "failed...");
	fprintf(stderr, "\n\tkey    = "); type_hex.dump(&v[0], stderr);
	fprintf(stderr, "\n\tmask   = "); type_hex.dump(&v[1], stderr);
	fprintf(stderr, "\n\tmsg    = "); type_hex.dump(&v[2], stderr);
	fprintf(stderr, "\n\texp    = "); type_hex.dump(&v[3], stderr);
	fprintf(stderr, "\n\tcalc   = "); type_hex.dump(&t, stderr);
	fprintf(stderr, "\n\tsplits = 0 .. %u .. %u .. %lu\n",
		i, j, (unsigned long)v[1].len);
	return (0);
      }
    }
  }
  return (1);
}

static int vrf_cat(dstr v[])
{
  poly1305_key k;
  poly1305_ctx ctx, cc[3];
  dstr t = DSTR_INIT;
  unsigned i;
  int ok = 1;

  if (v[0].len != 16) { fprintf(stderr, "bad key length\n"); exit(2); }
  if (v[1].len != 16) { fprintf(stderr, "bad mask length\n"); exit(2); }
  if (v[5].len != 16) { fprintf(stderr, "bad tag length\n"); exit(2); }
  dstr_ensure(&t, 16); t.len = 16;

  poly1305_keyinit(&k, v[0].buf, v[0].len);
  poly1305_macinit(&ctx, &k, v[1].buf);
  for (i = 0; i < 3; i++) {
    poly1305_macinit(&cc[i], &k, 0);
    poly1305_hash(&cc[i], v[i + 2].buf, v[i + 2].len);
  }
  for (i = 0; i < 2; i++) {
    if (!i) {
      poly1305_concat(&ctx, &cc[1], &cc[2]);
      poly1305_concat(&ctx, &cc[0], &ctx);
    } else {
      poly1305_concat(&ctx, &cc[0], &cc[1]);
      poly1305_concat(&ctx, &ctx, &cc[2]);
    }
    poly1305_done(&ctx, t.buf);
    if (memcmp(t.buf, v[5].buf, 16) != 0) {
      fprintf(stderr, "failed...");
      fprintf(stderr, "\n\tkey    = "); type_hex.dump(&v[0], stderr);
      fprintf(stderr, "\n\tmask   = "); type_hex.dump(&v[1], stderr);
      fprintf(stderr, "\n\tmsg[0] = "); type_hex.dump(&v[2], stderr);
      fprintf(stderr, "\n\tmsg[1] = "); type_hex.dump(&v[3], stderr);
      fprintf(stderr, "\n\tmsg[2] = "); type_hex.dump(&v[4], stderr);
      fprintf(stderr, "\n\texp    = "); type_hex.dump(&v[5], stderr);
      fprintf(stderr, "\n\tcalc   = "); type_hex.dump(&t, stderr);
      fprintf(stderr, "\n\tassoc  = %s\n",
	      !i ? "msg[0] || (msg[1] || msg[2])" :
		   "(msg[0] || msg[1]) || msg[2]");
      ok = 0;
    }
  }
  return (ok);
}

static const struct test_chunk tests[] = {
  { "poly1305-hash", vrf_hash,
    { &type_hex, &type_hex, &type_hex, &type_hex } },
  { "poly1305-cat", vrf_cat,
    { &type_hex, &type_hex, &type_hex, &type_hex, &type_hex, &type_hex } },
  { 0, 0, { 0 } }
};

int main(int argc, char *argv[])
{
  test_run(argc, argv, tests, SRCDIR "/t/poly1305");
  return (0);
}

#endif

/*----- That's all, folks -------------------------------------------------*/