--- /dev/null
+/* -*-c-*-
+ *
+ * Poly1305 message authentication code
+ *
+ * (c) 2017 Straylight/Edgeware
+ */
+
+/*----- Licensing notice --------------------------------------------------*
+ *
+ * This file is part of Catacomb.
+ *
+ * Catacomb is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Library General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * Catacomb is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with Catacomb; if not, write to the Free
+ * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ * MA 02111-1307, USA.
+ */
+
+/*----- Header files ------------------------------------------------------*/
+
+#include "config.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "poly1305.h"
+
+/*----- Global variables --------------------------------------------------*/
+
+const octet poly1305_keysz[] = { KSZ_SET, 16, 0 };
+
+/*----- Low-level implementation for 32/64-bit targets --------------------*/
+
+#if !defined(POLY1305_IMPL) && defined(HAVE_UINT64)
+# define POLY1305_IMPL 26
+#endif
+
+#if POLY1305_IMPL == 26
+
+/* Elements x of GF(2^130 - 5) are represented by five integers x_i: x =
+ * SUM_{0<=i<5} x_i 2^{26i}.
+ *
+ * Not all elements are represented canonically. We have 0 <= r_i, s_i <
+ * 2^26 by construction. We maintain 0 <= h_i < 2^27. When we read a
+ * message block m, we have 0 <= m_i < 2^26 by construction again. When we
+ * update the hash state, we calculate h' = r (h + m). Addition is done
+ * componentwise; let t = h + m, and we will have 0 <= t_i < 3*2^26.
+ */
+typedef uint32 felt[5];
+#define M26 0x03ffffff
+#define P p26
+
+/* Convert 32-bit words into field-element pieces. */
+#define P26W0(x) ((x##0)&0x03ffffff)
+#define P26W1(x) ((((x##1)&0x000fffff) << 6) | (((x##0) >> 26)&0x0000003f))
+#define P26W2(x) ((((x##2)&0x00003fff) << 12) | (((x##1) >> 20)&0x00000fff))
+#define P26W3(x) ((((x##3)&0x000000ff) << 18) | (((x##2) >> 14)&0x0003ffff))
+#define P26W4(x) (((x##3) >> 8)&0x00ffffff)
+
+/* Propagate carries in parallel. If 0 <= u_i < 2^26 c_i, then we shall have
+ * 0 <= v_0 < 2^26 + 5 c_4, and 0 <= v_i < 2^26 + c_{i-1} for 1 <= i < 5.
+ */
+#define CARRY_REDUCE(v, u) do { \
+ (v##0) = ((u##0)&M26) + 5*((u##4) >> 26); \
+ (v##1) = ((u##1)&M26) + ((u##0) >> 26); \
+ (v##2) = ((u##2)&M26) + ((u##1) >> 26); \
+ (v##3) = ((u##3)&M26) + ((u##2) >> 26); \
+ (v##4) = ((u##4)&M26) + ((u##3) >> 26); \
+} while (0)
+
+/* General multiplication, used by `concat'. */
+static void mul(felt z, const felt x, const felt y)
+{
+ /* Initial bounds: we assume x_i, y_i < 2^27. On exit, z_i < 2^27. */
+
+ uint32 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4];
+ uint32 y0 = y[0], y1 = y[1], y2 = y[2], y3 = y[3], y4 = y[4];
+ uint64 u0, u1, u2, u3, u4;
+ uint64 v0, v1, v2, v3, v4;
+ uint32 z0, z1, z2, z3, z4;
+
+ /* Do the multiplication: u = h x mod 2^130 - 5. We will have u_i <
+ * 2^27 (5 (4 - i) + i + 1) 2^27 = 2^54 (21 - 4 i) = 2^52 (84 - 16 i). In
+ * all cases we have u_i < 84*2^52 < 2^59. Notably, u_4 < 5*2^54 =
+ * 20*2^52.
+ */
+#define M(x, y) ((uint64)(x)*(y))
+ u0 = M(x0, y0) + (M(x1, y4) + M(x2, y3) + M(x3, y2) + M(x4, y1))*5;
+ u1 = M(x0, y1) + M(x1, y0) + (M(x2, y4) + M(x3, y3) + M(x4, y2))*5;
+ u2 = M(x0, y2) + M(x1, y1) + M(x2, y0) + (M(x3, y4) + M(x4, y3))*5;
+ u3 = M(x0, y3) + M(x1, y2) + M(x2, y1) + M(x3, y0) + (M(x4, y4))*5;
+ u4 = M(x0, y4) + M(x1, y3) + M(x2, y2) + M(x3, y1) + M(x4, y0);
+#undef M
+
+ /* Now we must reduce the coefficients. We do this in an approximate
+ * manner which avoids long data-dependency chains, but requires two
+ * passes.
+ *
+ * The reduced carry down from u_4 to u_0 in the first pass will be c_0 <
+ * 100*2^26; the remaining c_i are smaller: c_i < 2^26 (84 - 16 i). This
+ * leaves 0 <= v_i < 101*2^26. The carries in the second pass are bounded
+ * above by 180.
+ */
+ CARRY_REDUCE(v, u); CARRY_REDUCE(z, v);
+ z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4;
+}
+
+/* General squaring, used by `concat'. */
+static void sqr(felt z, const felt x)
+{
+ /* Initial bounds: we assume x_i < 2^27. On exit, z_i < 2^27. */
+
+ uint32 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4];
+ uint64 u0, u1, u2, u3, u4;
+ uint64 v0, v1, v2, v3, v4;
+ uint32 z0, z1, z2, z3, z4;
+
+ /* Do the squaring. See `mul' for bounds. */
+#define M(x, y) ((uint64)(x)*(y))
+ u0 = M(x0, x0) + 10*(M(x1, x4) + M(x2, x3));
+ u1 = 2* M(x0, x1) + 5*(M(x3, x3) + 2*M(x2, x4));
+ u2 = M(x1, x1) + 2* M(x0, x2) + 10* M(x3, x4);
+ u3 = 2*(M(x0, x3) + M(x1, x2)) + 5* M(x4, x4);
+ u4 = M(x2, x2) + 2*(M(x0, x4) + M(x1, x3));
+#undef M
+
+ /* Now we must reduce the coefficients. See `mul' for bounds. */
+ CARRY_REDUCE(v, u); CARRY_REDUCE(z, v);
+ z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4;
+}
+
+/* Multiplication by r, using precomputation. */
+static void mul_r(const poly1305_ctx *ctx, felt z, const felt x)
+{
+ /* Initial bounds: by construction, r_i < 2^26. We assume x_i < 3*2^26.
+ * On exit, z_i < 2^27.
+ */
+
+ uint32
+ r0 = ctx->k.u.p26.r0,
+ r1 = ctx->k.u.p26.r1, rr1 = ctx->k.u.p26.rr1,
+ r2 = ctx->k.u.p26.r2, rr2 = ctx->k.u.p26.rr2,
+ r3 = ctx->k.u.p26.r3, rr3 = ctx->k.u.p26.rr3,
+ r4 = ctx->k.u.p26.r4, rr4 = ctx->k.u.p26.rr4;
+ uint32 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4];
+ uint64 u0, u1, u2, u3, u4;
+ uint64 v0, v1, v2, v3, v4;
+ uint32 z0, z1, z2, z3, z4;
+
+ /* Do the multiplication: u = h x mod 2^130 - 5. We will have u_i <
+ * 2^26 (5 (4 - i) + i + 1) 3*2^26 = 2^52 (63 - 12 i). In all cases
+ * we have u_i < 63*2^52 < 2^58. Notably, u_4 < 15*2^52.
+ */
+#define M(x, y) ((uint64)(x)*(y))
+ u0 = M(x0, r0) + M(x1, rr4) + M(x2, rr3) + M(x3, rr2) + M(x4, rr1);
+ u1 = M(x0, r1) + M(x1, r0) + M(x2, rr4) + M(x3, rr3) + M(x4, rr2);
+ u2 = M(x0, r2) + M(x1, r1) + M(x2, r0) + M(x3, rr4) + M(x4, rr3);
+ u3 = M(x0, r3) + M(x1, r2) + M(x2, r1) + M(x3, r0) + M(x4, rr4);
+ u4 = M(x0, r4) + M(x1, r3) + M(x2, r2) + M(x3, r1) + M(x4, r0);
+#undef M
+
+ /* Now we must reduce the coefficients. We do this in an approximate
+ * manner which avoids long data-dependency chains, but requires two
+ * passes.
+ *
+ * The reduced carry down from u_4 to u_0 in the first pass will be c_0 <
+ * 75*2^26; the remaining c_i are smaller: c_i < 2^26 (63 - 12 i). This
+ * leaves 0 <= v_i < 76*2^26. The carries in the second pass are bounded
+ * above by 135.
+ */
+ CARRY_REDUCE(v, u); CARRY_REDUCE(z, v);
+ z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4;
+}
+
+#endif
+
+/*----- Low-level implementation for 32/64-bit targets --------------------*/
+
+#ifndef POLY1305_IMPL
+# define POLY1305_IMPL 11
+#endif
+
+#if POLY1305_IMPL == 11
+
+/* Elements x of GF(2^130 - 5) are represented by 12 integers x_i: x =
+ * SUM_{0<=i<12} x_i 2^P_i, where P_i = SUM_{0<=j<i} w_j, and w_5 = w_11 =
+ * 10, and w_i = 11 for i in { 0, 1, 2, 3, 4, 6, 7, 8, 9, 10 }.
+ *
+ * Not all elements are represented canonically. We have 0 <= r_i, s_i <
+ * 2^w_i <= 2^11 by construction. We maintain 0 <= h_i < 2^12. When we read
+ * a message block m, we have 0 <= m_i < 2^w_i by construction again. When
+ * we update the hash state, we calculate h' = r (h + m). Addition is done
+ * componentwise; let t = h + m, and we will have 0 <= t_i < 3*2^11.
+ */
+typedef uint16 felt[12];
+#define M10 0x3ff
+#define M11 0x7ff
+#define P p11
+
+/* Load a field element from an octet string. */
+static void load_p11(felt d, const octet *s)
+{
+ unsigned i, j, n, w;
+ uint16 m;
+ uint32 a;
+
+ for (i = j = n = 0, a = 0; j < 12; j++) {
+ if (j == 5 || j == 11) { w = 10; m = M10; }
+ else { w = 11; m = M11; }
+ while (n < w && i < 16) { a |= s[i++] << n; n += 8; }
+ d[j] = a&m; a >>= w; n -= w;
+ }
+}
+
+/* Reduce a field-element's pieces to manageable size. */
+static void carry_reduce(uint32 u[12])
+{
+ /* Initial bounds: we assume u_i < 636*2^22. On exit, u_i < 2^11. */
+
+ unsigned i;
+ uint32 c;
+
+ /* Do sequential carry propagation (16-bit CPUs are less likely to benefit
+ * from instruction-level parallelism). Start at u_10; truncate it to 11
+ * bits, and add the carry onto u_11. Truncate u_11 to 10 bits, and add
+ * five times the carry onto u_0. And so on.
+ *
+ * The carry is larger than the pieces we're leaving behind. Let c_i be
+ * the high portion of u_i, to be carried onto u_{i+1}. I claim that c_i <
+ * 2557*2^10. Then the carry /into/ any u_i is at most 12785*2^10 < 2^24
+ * (allowing for the reduction as we carry from u_11 to u_0), and u_i after
+ * carry is bounded above by 636*2^22 + 12785*2^10 < 2557*2^20. Hence, the
+ * carry out is at most 2557*2^10, as claimed.
+ *
+ * Once we reach u_10 for the second time, we start with u_10 < 2^11. The
+ * carry into u_10 is at most 2557*2^10 < 1279*2^11 as calculated above; so
+ * the carry out into u_11 is at most 1280. Since u_11 < 2^10 prior to
+ * this carry in, all of the u_i are now bounded above by 2^11. The final
+ * reduction therefore only needs a conditional subtraction.
+ */
+ { c = u[10] >> 11; u[10] &= M11; }
+ { u[11] += c; c = u[11] >> 10; u[11] &= M10; }
+ { u[0] += 5*c; c = u[0] >> 11; u[0] &= M11; }
+ for (i = 1; i < 5; i++) { u[i] += c; c = u[i] >> 11; u[i] &= M11; }
+ { u[5] += c; c = u[5] >> 10; u[5] &= M10; }
+ for (i = 6; i < 11; i++) { u[i] += c; c = u[i] >> 11; u[i] &= M11; }
+ u[11] += c;
+}
+
+/* General multiplication. */
+static void mul(felt z, const felt x, const felt y)
+{
+ /* Initial bounds: we assume x_i < 3*2^11, and y_i < 2^12. On exit,
+ * z_i < 2^12.
+ */
+
+ uint32 u[12];
+ unsigned i, j, k;
+
+ /* Do the main multiplication. After this, we shall have
+ *
+ * { 2^22 (636 - 184 i) for 0 <= i < 6
+ * u_i < {
+ * { 2^22 (732 - 60 i) for 6 <= i < 12
+ *
+ * In particular, u_0 < 636*2^22 < 2^32, and u_11 < 72*2^22.
+ *
+ * The irregularly positioned pieces are annoying. Because we fold the
+ * reduction into the multiplication, it's also important to see where the
+ * reduced products fit. Finally, products don't align with the piece
+ * boundaries, and sometimes need to be doubled. The following table
+ * tracks all of this.
+ *
+ * piece width offset second
+ * 0 11 0 130
+ * 1 11 11 141
+ * 2 11 22 152
+ * 3 11 33 163
+ * 4 11 44 174
+ * 5 10 55 185
+ * 6 11 65 195
+ * 7 11 76 206
+ * 8 11 87 217
+ * 9 11 98 228
+ * 10 11 109 239
+ * 11 10 120 250
+ *
+ * The next table tracks exactly which products end up being multiplied by
+ * which constants and accumulated into which destination pieces.
+ *
+ * u_k = t_i r_j + 2 t_i r_j + 5 t_i r_j + 10 t_i r_j
+ * 0 0/0 -- 6/6 1-5/11-7 7-11/5-1
+ * 1 0-1/1-0 -- 6-7/7-6 2-5/11-8 8-11/5-2
+ * 2 0-2/2-0 -- 6-8/8-6 3-5/11-9 9-11/5-3
+ * 3 0-3/3-0 -- 6-9/9-6 4-5/11-10 10-11/5-4
+ * 4 0-4/4-0 -- 6-10/10-6 5/11 11/5
+ * 5 0-5/5-0 -- 6-11/11-6 --
+ * 6 0/6 6/0 1-5/5-1 -- 7-11/11-7
+ * 7 0-1/7-6 6-7/1-0 2-5/5-2 -- 8-11/11-8
+ * 8 0-2/8-6 6-8/2-0 3-5/5-3 -- 9-11/11-9
+ * 9 0-3/9-6 6-9/3-0 4-5/5-4 -- 10-11/11-10
+ * 10 0-4/10-6 6-10/4-0 5/5 -- 11/11
+ * 11 0-11/11-0 -- -- --
+ *
+ * And, finally, trying to bound the multiple of 6*2^22 in each destination
+ * piece is fiddly, so here's a tableau showing the calculation.
+ *
+ * k 1* + 2* + 5* +10* = 1* + 5* =
+ * 0 1 -- 1 10 1 21 106
+ * 1 2 -- 2 8 2 18 92
+ * 2 3 -- 3 6 3 15 78
+ * 3 4 -- 4 4 4 12 64
+ * 4 5 -- 5 2 5 9 50
+ * 5 6 -- 6 -- 6 6 36
+ * 6 2 5 -- 5 12 10 62
+ * 7 4 4 -- 4 12 8 52
+ * 8 6 3 -- 3 12 6 42
+ * 9 8 2 -- 2 12 4 32
+ * 10 10 1 -- 1 12 2 22
+ * 11 12 -- -- -- 12 0 12
+ */
+
+ for (i = 0; i < 12; i++) u[i] = 0;
+
+#define M(i, j) ((uint32)x[i]*y[j])
+
+ /* Product terms we must multiply by 10. */
+ for (k = 0; k < 5; k++) {
+ for (i = k + 1; i < 6; i++) {
+ j = 12 + k - i;
+ u[k] += M(i, j) + M(j, i);
+ u[k + 6] += M(i + 6, j);
+ }
+ }
+ for (k = 0; k < 5; k++) u[k] *= 2;
+ for (k = 6; k < 11; k++) u[k] *= 5;
+
+ /* Product terms we must multiply by 5. */
+ for (k = 0; k < 6; k++) {
+ for (i = k + 6; i >= 6; i--) {
+ j = 12 + k - i;
+ u[k] += M(i, j);
+ }
+ }
+ for (k = 0; k < 6; k++) u[k] *= 5;
+
+ /* Product terms we must multiply by 2. */
+ for (k = 6; k < 11; k++) {
+ for (i = k - 5; i < 6; i++) {
+ j = k - i;
+ u[k] += M(i, j);
+ }
+ }
+ for (k = 6; k < 11; k++) u[k] *= 2;
+
+ /* Remaining product terms. */
+ for (k = 0; k < 6; k++) {
+ for (i = k; i < 6; i--) {
+ j = k - i;
+ u[k] += M(i, j);
+ u[k + 6] += M(i + 6, j) + M(i, j + 6);
+ }
+ }
+
+#undef M
+
+ /* Do the reduction. Currently, `carry_reduce' does more than we need, but
+ * that's fine.
+ */
+ carry_reduce(u);
+
+ /* Done. Write out the answer. */
+ for (i = 0; i < 12; i++) z[i] = u[i];
+}
+
+/* General squaring, used by `concat'. */
+static void sqr(felt z, const felt x)
+ { mul(z, x, x); }
+
+/* Multiplication by r. */
+static void mul_r(const poly1305_ctx *ctx, felt z, const felt x)
+ { mul(z, x, ctx->k.u.p11.r); }
+
+#endif
+
+/*----- Interface functions -----------------------------------------------*/
+
+/* --- @poly1305_keyinit@ --- *
+ *
+ * Arguments: @poly1305_key *key@ = key structure to fill in
+ * @const void *k@ = pointer to key material
+ * @size_t ksz@ = length of key (must be @POLY1305_KEYSZ == 16@)
+ *
+ * Returns: ---
+ *
+ * Use: Records a Poly1305 key and performs (minimal)
+ * precomputations.
+ */
+
+void poly1305_keyinit(poly1305_key *key, const void *k, size_t ksz)
+{
+ const octet *r = k;
+#if POLY1305_IMPL == 11
+ octet rr[16];
+#endif
+
+ KSZ_ASSERT(poly1305, ksz);
+
+#if POLY1305_IMPL == 26
+ uint32 r0 = LOAD32_L(r + 0), r1 = LOAD32_L(r + 4),
+ r2 = LOAD32_L(r + 8), r3 = LOAD32_L(r + 12);
+
+ r0 &= 0x0fffffff; r1 &= 0x0ffffffc; r2 &= 0x0ffffffc; r3 &= 0x0ffffffc;
+ key->u.p26.r0 = P26W0(r); key->u.p26.r1 = P26W1(r);
+ key->u.p26.r2 = P26W2(r); key->u.p26.r3 = P26W3(r);
+ key->u.p26.r4 = P26W4(r);
+
+ key->u.p26.rr1 = 5*key->u.p26.r1; key->u.p26.rr2 = 5*key->u.p26.r2;
+ key->u.p26.rr3 = 5*key->u.p26.r3; key->u.p26.rr4 = 5*key->u.p26.r4;
+#else
+ memcpy(rr, r, 16);
+ rr[ 3] &= 0x0f;
+ rr[ 4] &= 0xfc; rr[ 7] &= 0x0f;
+ rr[ 8] &= 0xfc; rr[11] &= 0x0f;
+ rr[12] &= 0xfc; rr[15] &= 0x0f;
+ load_p11(key->u.p11.r, rr);
+#endif
+}
+
+/* --- @poly1305_macinit@ --- *
+ *
+ * Arguments: @poly1305_ctx *ctx@ = MAC context to fill in
+ * @const poly1305_key *key@ = pointer to key structure to use
+ * @const void *iv@ = pointer to mask string
+ *
+ * Returns: ---
+ *
+ * Use: Initializes a MAC context for use. The key can be discarded
+ * at any time.
+ *
+ * It is permitted for @iv@ to be null, though it is not then
+ * possible to complete the MAC computation on @ctx@. The
+ * resulting context may still be useful, e.g., as an operand to
+ * @poly1305_concat@.
+ */
+
+void poly1305_macinit(poly1305_ctx *ctx,
+ const poly1305_key *key, const void *iv)
+{
+ const octet *s = iv;
+#if POLY1305_IMPL == 26
+ uint32 s0, s1, s2, s3;
+#else
+ unsigned i;
+#endif
+
+#if POLY1305_IMPL == 26
+ if (s) {
+ s0 = LOAD32_L(s + 0); s1 = LOAD32_L(s + 4);
+ s2 = LOAD32_L(s + 8); s3 = LOAD32_L(s + 12);
+ ctx->u.p26.s0 = P26W0(s); ctx->u.p26.s1 = P26W1(s);
+ ctx->u.p26.s2 = P26W2(s); ctx->u.p26.s3 = P26W3(s);
+ ctx->u.p26.s4 = P26W4(s);
+ }
+ ctx->u.p26.h[0] = ctx->u.p26.h[1] = ctx->u.p26.h[2] =
+ ctx->u.p26.h[3] = ctx->u.p26.h[4] = 0;
+#else
+ if (s) load_p11(ctx->u.p11.s, s);
+ for (i = 0; i < 12; i++) ctx->u.p11.h[i] = 0;
+#endif
+ ctx->k = *key;
+ ctx->nbuf = 0;
+ ctx->count = 0;
+}
+
+/* --- @poly1305_copy@ --- *
+ *
+ * Arguments: @poly1305_ctx *to@ = destination context
+ * @const poly1305_ctx *from@ = source context
+ *
+ * Returns: ---
+ *
+ * Use: Duplicates a Poly1305 MAC context. The destination need not
+ * have been initialized. Both contexts can be used
+ * independently afterwards.
+ */
+
+void poly1305_copy(poly1305_ctx *ctx, const poly1305_ctx *from)
+ { *ctx = *from; }
+
+/* --- @poly1305_hash@ --- *
+ *
+ * Arguments: @poly1305_ctx *ctx@ = MAC context to update
+ * @const void *p@ = pointer to message data
+ * @size_t sz@ = length of message data
+ *
+ * Returns: ---
+ *
+ * Use: Processes a chunk of message. The message pieces may have
+ * arbitrary lengths, and may be empty.
+ */
+
+static void update_full(poly1305_ctx *ctx, const octet *p)
+{
+ felt t;
+#if POLY1305_IMPL == 26
+ uint32
+ m0 = LOAD32_L(p + 0), m1 = LOAD32_L(p + 4),
+ m2 = LOAD32_L(p + 8), m3 = LOAD32_L(p + 12);
+
+ t[0] = ctx->u.p26.h[0] + P26W0(m);
+ t[1] = ctx->u.p26.h[1] + P26W1(m);
+ t[2] = ctx->u.p26.h[2] + P26W2(m);
+ t[3] = ctx->u.p26.h[3] + P26W3(m);
+ t[4] = ctx->u.p26.h[4] + P26W4(m) + 0x01000000;
+#else
+ unsigned i;
+
+ load_p11(t, p); t[11] += 0x100;
+ for (i = 0; i < 12; i++) t[i] += ctx->u.p11.h[i];
+#endif
+
+ mul_r(ctx, ctx->u.P.h, t);
+ ctx->count++;
+}
+
+void poly1305_hash(poly1305_ctx *ctx, const void *p, size_t sz)
+{
+ const octet *pp = p;
+ size_t n;
+
+ if (ctx->nbuf) {
+ if (sz < 16 - ctx->nbuf) {
+ memcpy(ctx->buf + ctx->nbuf, p, sz);
+ ctx->nbuf += sz;
+ return;
+ }
+ n = 16 - ctx->nbuf;
+ memcpy(ctx->buf + ctx->nbuf, pp, n);
+ update_full(ctx, ctx->buf);
+ pp += n; sz -= n;
+ }
+ while (sz >= 16) {
+ update_full(ctx, pp);
+ pp += 16; sz -= 16;
+ }
+ if (sz) memcpy(ctx->buf, pp, sz);
+ ctx->nbuf = sz;
+}
+
+/* --- @poly1305_flush@ --- *
+ *
+ * Arguments: @poly1305_ctx *ctx@ = MAC context to flush
+ *
+ * Returns: ---
+ *
+ * Use: Forces any buffered message data in the context to be
+ * processed. This has no effect if the message processed so
+ * far is a whole number of blocks. Flushing is performed
+ * automatically by @poly1305_done@, but it may be necessary to
+ * force it by hand when using @poly1305_concat@.
+ *
+ * Flushing a partial block has an observable effect on the
+ * computation: the resulting state is (with high probability)
+ * dissimilar to any state reachable with a message which is a
+ * whole number of blocks long.
+ */
+
+void poly1305_flush(poly1305_ctx *ctx)
+{
+ felt t;
+#if POLY1305_IMPL == 26
+ uint32 m0, m1, m2, m3;
+#else
+ unsigned i;
+#endif
+
+ if (!ctx->nbuf) return;
+ ctx->buf[ctx->nbuf++] = 1; memset(ctx->buf + ctx->nbuf, 0, 16 - ctx->nbuf);
+#if POLY1305_IMPL == 26
+ m0 = LOAD32_L(ctx->buf + 0); m1 = LOAD32_L(ctx->buf + 4);
+ m2 = LOAD32_L(ctx->buf + 8); m3 = LOAD32_L(ctx->buf + 12);
+
+ t[0] = ctx->u.p26.h[0] + P26W0(m);
+ t[1] = ctx->u.p26.h[1] + P26W1(m);
+ t[2] = ctx->u.p26.h[2] + P26W2(m);
+ t[3] = ctx->u.p26.h[3] + P26W3(m);
+ t[4] = ctx->u.p26.h[4] + P26W4(m);
+#else
+ load_p11(t, ctx->buf);
+ for (i = 0; i < 12; i++) t[i] += ctx->u.p11.h[i];
+#endif
+
+ mul_r(ctx, ctx->u.P.h, t);
+ ctx->count++;
+ ctx->nbuf = 0;
+}
+
+/* --- @poly1305_concat@ --- *
+ *
+ * Arguments: @poly1305_ctx *ctx@ = destination context
+ * @const poly1305_ctx *prefix, *suffix@ = two operand contexts
+ *
+ * Returns: ---
+ *
+ * Use: The two operand contexts @prefix@ and @suffix@ represent
+ * processing of two messages %$m$% and %$m'$%; the effect is to
+ * set @ctx@ to the state corresponding to their concatenation
+ * %$m \cat m'$%.
+ *
+ * All three contexts must have been initialized using the same
+ * key value (though not necessarily from the same key
+ * structure). The mask values associated with the input
+ * contexts are irrelevant. The @prefix@ message %$m$% must be
+ * a whole number of blocks long: this can be arranged by
+ * flushing the context. The @suffix@ message need not be a
+ * whole number of blocks long. All of the contexts remain
+ * operational and can be used independently afterwards.
+ */
+
+void poly1305_concat(poly1305_ctx *ctx,
+ const poly1305_ctx *prefix, const poly1305_ctx *suffix)
+{
+ /* Assume that lengths are public, so it's safe to behave conditionally on
+ * the bits of ctx->count.
+ */
+ unsigned long n;
+ unsigned i;
+ felt x;
+#if POLY1305_IMPL == 26
+ uint32 x0, x1, x2, x3, x4, y0, y1, y2, y3, y4;
+#else
+ uint32 y[12];
+#endif
+
+ /* We can only concatenate if the prefix is block-aligned. */
+ assert(!prefix->nbuf);
+
+ /* The hash for a message m = m_{k-1} m_{k-2} ... m_1 m_0 is h_r(m) =
+ * SUM_{0<=i<k} m_i r^{i+1}. If we have two messages, m, m', of lengths k
+ * and k' blocks respectively, then
+ *
+ * h_r(m || m') = SUM_{0<=i<k} m_i r^{k'+i+1} +
+ * SUM_{0<=i<k'} m'_i r^{i+1}
+ * = r^{k'} h_r(m) + h_r(m')
+ *
+ * This is simple left-to-right square-and-multiply exponentiation.
+ */
+ n = suffix->count;
+ x[0] = 1;
+#if POLY1305_IMPL == 26
+ x[1] = x[2] = x[3] = x[4] = 0;
+#else
+ for (i = 1; i < 12; i++) x[i] = 0;
+#endif
+#define BIT (1 << (ULONG_BITS - 1))
+ if (n) {
+ i = ULONG_BITS;
+ while (!(n & BIT)) { n <<= 1; i--; }
+ mul_r(prefix, x, x); n <<= 1; i--;
+ while (i--) { sqr(x, x); if (n & BIT) mul_r(prefix, x, x); n <<= 1; }
+ }
+#undef BIT
+ mul(x, prefix->u.P.h, x);
+
+ /* Add on the suffix hash. */
+#if POLY1305_IMPL == 26
+ /* We're going to add the two hashes elementwise. Both h' = h_r(m') and
+ * x = r^{k'} h_r(m) are bounded above by 2^27, so the sum will be bounded
+ * by 2^28; but this is too large to leave in the accumulator. (Strictly,
+ * we could get away with it, but the caller can in theory chain an
+ * arbitrary number of concatenations and expect us to cope, and we'd
+ * definitely overflow eventually.) So we reduce. Since the excess is so
+ * small, a single round of `CARRY_REDUCE' is enough.
+ */
+ x0 = x[0] + suffix->u.p26.h[0]; x1 = x[1] + suffix->u.p26.h[1];
+ x2 = x[2] + suffix->u.p26.h[2]; x3 = x[3] + suffix->u.p26.h[3];
+ x4 = x[4] + suffix->u.p26.h[4];
+ CARRY_REDUCE(y, x);
+ ctx->u.p26.h[0] = y0; ctx->u.p26.h[1] = y1; ctx->u.p26.h[2] = y2;
+ ctx->u.p26.h[3] = y3; ctx->u.p26.h[4] = y4;
+#else
+ /* We'll add the two hashes elementwise and have to reduce again. The
+ * numbers are different, but the reasoning is basically the same.
+ */
+ for (i = 0; i < 12; i++) y[i] = x[i] + suffix->u.p11.h[i];
+ carry_reduce(y);
+ for (i = 0; i < 12; i++) ctx->u.p11.h[i] = y[i];
+#endif
+
+ /* Copy the remaining pieces of the context to set up the result. */
+ if (ctx != suffix) {
+ memcpy(ctx->buf, suffix->buf, suffix->nbuf);
+ ctx->nbuf = suffix->nbuf;
+ }
+ ctx->count = prefix->count + suffix->count;
+}
+
+/* --- @poly1305_done@ --- *
+ *
+ * Arguments: @poly1305_ctx *ctx@ = MAC context to finish
+ * @void *h@ = buffer to write the tag to
+ *
+ * Returns: ---
+ *
+ * Use: Completes a Poly1305 MAC tag computation.
+ */
+
+void poly1305_done(poly1305_ctx *ctx, void *h)
+{
+ octet *p = h;
+
+#if POLY1305_IMPL == 26
+ uint32 m_sub, t, c;
+ uint32 h0, h1, h2, h3, h4, hh0, hh1, hh2, hh3, hh4;
+
+ /* If there's anything left over in the buffer, pad it to form a final
+ * coefficient and update the evaluation one last time.
+ */
+ poly1305_flush(ctx);
+
+ /* Collect the final hash state. */
+ h0 = ctx->u.p26.h[0];
+ h1 = ctx->u.p26.h[1];
+ h2 = ctx->u.p26.h[2];
+ h3 = ctx->u.p26.h[3];
+ h4 = ctx->u.p26.h[4];
+
+ /* Reduce the final value mod 2^130 - 5. First pass: set h <- h +
+ * 5 floor(h/2^130). After this, the low pieces of h will be normalized:
+ * 0 <= h_i < 2^26 for 0 <= i < 4; and 0 <= h_4 < 2^26 + 1. In the
+ * (highly unlikely) event that h_4 >= 2^26, set c and truncate to 130
+ * bits.
+ */
+ c = h4 >> 26; h4 &= M26;
+ h0 += 5*c; c = h0 >> 26; h0 &= M26;
+ h1 += c; c = h1 >> 26; h1 &= M26;
+ h2 += c; c = h2 >> 26; h2 &= M26;
+ h3 += c; c = h3 >> 26; h3 &= M26;
+ h4 += c; c = h4 >> 26; h4 &= M26;
+
+ /* Calculate h' = h - (2^130 - 5). If h' >= 0 then t ends up 1; otherwise
+ * it's zero.
+ */
+ t = h0 + 5; hh0 = t&M26; t >>= 26;
+ t += h1; hh1 = t&M26; t >>= 26;
+ t += h2; hh2 = t&M26; t >>= 26;
+ t += h3; hh3 = t&M26; t >>= 26;
+ t += h4; hh4 = t&M26; t >>= 26;
+
+ /* Keep the subtraction result above if t or c is set. */
+ m_sub = -(t | c);
+ h0 = (hh0&m_sub) | (h0&~m_sub);
+ h1 = (hh1&m_sub) | (h1&~m_sub);
+ h2 = (hh2&m_sub) | (h2&~m_sub);
+ h3 = (hh3&m_sub) | (h3&~m_sub);
+ h4 = (hh4&m_sub) | (h4&~m_sub);
+
+ /* Add the mask onto the hash result. */
+ t = h0 + ctx->u.p26.s0; h0 = t&M26; t >>= 26;
+ t += h1 + ctx->u.p26.s1; h1 = t&M26; t >>= 26;
+ t += h2 + ctx->u.p26.s2; h2 = t&M26; t >>= 26;
+ t += h3 + ctx->u.p26.s3; h3 = t&M26; t >>= 26;
+ t += h4 + ctx->u.p26.s4; h4 = t&M26; t >>= 26;
+
+ /* Convert this mess back into 32-bit words. We lose the top two bits,
+ * but that's fine.
+ */
+ h0 = (h0 >> 0) | ((h1 & 0x0000003f) << 26);
+ h1 = (h1 >> 6) | ((h2 & 0x00000fff) << 20);
+ h2 = (h2 >> 12) | ((h3 & 0x0003ffff) << 14);
+ h3 = (h3 >> 18) | ((h4 & 0x00ffffff) << 8);
+
+ /* All done. */
+ STORE32_L(p + 0, h0); STORE32_L(p + 4, h1);
+ STORE32_L(p + 8, h2); STORE32_L(p + 12, h3);
+#else
+ uint16 hh[12], hi[12], c, t, m_sub;
+ uint32 a;
+ unsigned i, j, n;
+
+ /* If there's anything left over in the buffer, pad it to form a final
+ * coefficient and update the evaluation one last time.
+ */
+ poly1305_flush(ctx);
+
+ /* Collect the final hash state. */
+ for (i = 0; i < 12; i++) hh[i] = ctx->u.p11.h[i];
+
+ /* Reduce the final value mod 2^130 - 5. First pass: set h <- h +
+ * 5 floor(h/2^130). After this, the low pieces of h will be normalized:
+ * 0 <= h_i < 2^{w_i} for 0 <= i < 11; and 0 <= h_{11} < 2^10 + 1. In the
+ * (highly unlikely) event that h_{11} >= 2^10, set c and truncate to 130
+ * bits.
+ */
+ c = 5*(hh[11] >> 10); hh[11] &= M10;
+ for (i = 0; i < 12; i++) {
+ if (i == 5 || i == 11) { c += hh[i]; hh[i] = c&M10; c >>= 10; }
+ else { c += hh[i]; hh[i] = c&M11; c >>= 11; }
+ }
+
+ /* Calculate h' = h - (2^130 - 5). If h' >= 0 then t ends up 1; otherwise
+ * it's zero.
+ */
+ for (i = 0, t = 5; i < 12; i++) {
+ t += hh[i];
+ if (i == 5 || i == 11) { hi[i] = t&M10; t >>= 10; }
+ else { hi[i] = t&M11; t >>= 11; }
+ }
+
+ /* Keep the subtraction result above if t or c is set. */
+ m_sub = -(t | c);
+ for (i = 0; i < 12; i++) hh[i] = (hi[i]&m_sub) | (hh[i]&~m_sub);
+
+ /* Add the mask onto the hash result. */
+ for (i = 0, t = 0; i < 12; i++) {
+ t += hh[i] + ctx->u.p11.s[i];
+ if (i == 5 || i == 11) { hh[i] = t&M10; t >>= 10; }
+ else { hh[i] = t&M11; t >>= 11; }
+ }
+
+ /* Convert this mess back into bytes. We lose the top two bits, but that's
+ * fine.
+ */
+ for (i = j = n = 0, a = 0; i < 16; i++) {
+ if (n < 8) {
+ a |= hh[j] << n;
+ n += (j == 5 || j == 11) ? 10 : 11;
+ j++;
+ }
+ p[i] = a&0xff; a >>= 8; n -= 8;
+ }
+
+#endif
+}
+
+/*----- Test rig ----------------------------------------------------------*/
+
+#ifdef TEST_RIG
+
+#include <mLib/testrig.h>
+
+static int vrf_hash(dstr v[])
+{
+ poly1305_key k;
+ poly1305_ctx ctx;
+ dstr t = DSTR_INIT;
+ unsigned i, j;
+
+ if (v[0].len != 16) { fprintf(stderr, "bad key length\n"); exit(2); }
+ if (v[1].len != 16) { fprintf(stderr, "bad mask length\n"); exit(2); }
+ if (v[3].len != 16) { fprintf(stderr, "bad tag length\n"); exit(2); }
+ dstr_ensure(&t, 16); t.len = 16;
+
+ poly1305_keyinit(&k, v[0].buf, v[0].len);
+ for (i = 0; i < v[2].len; i++) {
+ for (j = i; j < v[2].len; j++) {
+ poly1305_macinit(&ctx, &k, v[1].buf);
+ poly1305_hash(&ctx, v[2].buf, i);
+ poly1305_hash(&ctx, v[2].buf + i, j - i);
+ poly1305_hash(&ctx, v[2].buf + j, v[2].len - j);
+ poly1305_done(&ctx, t.buf);
+ if (memcmp(t.buf, v[3].buf, 16) != 0) {
+ fprintf(stderr, "failed...");
+ fprintf(stderr, "\n\tkey = "); type_hex.dump(&v[0], stderr);
+ fprintf(stderr, "\n\tmask = "); type_hex.dump(&v[1], stderr);
+ fprintf(stderr, "\n\tmsg = "); type_hex.dump(&v[2], stderr);
+ fprintf(stderr, "\n\texp = "); type_hex.dump(&v[3], stderr);
+ fprintf(stderr, "\n\tcalc = "); type_hex.dump(&t, stderr);
+ fprintf(stderr, "\n\tsplits = 0 .. %u .. %u .. %lu\n",
+ i, j, (unsigned long)v[1].len);
+ return (0);
+ }
+ }
+ }
+ return (1);
+}
+
+static int vrf_cat(dstr v[])
+{
+ poly1305_key k;
+ poly1305_ctx ctx, cc[3];
+ dstr t = DSTR_INIT;
+ unsigned i;
+ int ok = 1;
+
+ if (v[0].len != 16) { fprintf(stderr, "bad key length\n"); exit(2); }
+ if (v[1].len != 16) { fprintf(stderr, "bad mask length\n"); exit(2); }
+ if (v[5].len != 16) { fprintf(stderr, "bad tag length\n"); exit(2); }
+ dstr_ensure(&t, 16); t.len = 16;
+
+ poly1305_keyinit(&k, v[0].buf, v[0].len);
+ poly1305_macinit(&ctx, &k, v[1].buf);
+ for (i = 0; i < 3; i++) {
+ poly1305_macinit(&cc[i], &k, 0);
+ poly1305_hash(&cc[i], v[i + 2].buf, v[i + 2].len);
+ }
+ for (i = 0; i < 2; i++) {
+ if (!i) {
+ poly1305_concat(&ctx, &cc[1], &cc[2]);
+ poly1305_concat(&ctx, &cc[0], &ctx);
+ } else {
+ poly1305_concat(&ctx, &cc[0], &cc[1]);
+ poly1305_concat(&ctx, &ctx, &cc[2]);
+ }
+ poly1305_done(&ctx, t.buf);
+ if (memcmp(t.buf, v[5].buf, 16) != 0) {
+ fprintf(stderr, "failed...");
+ fprintf(stderr, "\n\tkey = "); type_hex.dump(&v[0], stderr);
+ fprintf(stderr, "\n\tmask = "); type_hex.dump(&v[1], stderr);
+ fprintf(stderr, "\n\tmsg[0] = "); type_hex.dump(&v[2], stderr);
+ fprintf(stderr, "\n\tmsg[1] = "); type_hex.dump(&v[3], stderr);
+ fprintf(stderr, "\n\tmsg[2] = "); type_hex.dump(&v[4], stderr);
+ fprintf(stderr, "\n\texp = "); type_hex.dump(&v[5], stderr);
+ fprintf(stderr, "\n\tcalc = "); type_hex.dump(&t, stderr);
+ fprintf(stderr, "\n\tassoc = %s\n",
+ !i ? "msg[0] || (msg[1] || msg[2])" :
+ "(msg[0] || msg[1]) || msg[2]");
+ ok = 0;
+ }
+ }
+ return (ok);
+}
+
+static const struct test_chunk tests[] = {
+ { "poly1305-hash", vrf_hash,
+ { &type_hex, &type_hex, &type_hex, &type_hex } },
+ { "poly1305-cat", vrf_cat,
+ { &type_hex, &type_hex, &type_hex, &type_hex, &type_hex, &type_hex } },
+ { 0, 0, { 0 } }
+};
+
+int main(int argc, char *argv[])
+{
+ test_run(argc, argv, tests, SRCDIR "/t/poly1305");
+ return (0);
+}
+
+#endif
+
+/*----- That's all, folks -------------------------------------------------*/
--- /dev/null
+/* -*-c-*-
+ *
+ * Poly1305 message authentication code
+ *
+ * (c) 2017 Straylight/Edgeware
+ */
+
+/*----- Licensing notice --------------------------------------------------*
+ *
+ * This file is part of Catacomb.
+ *
+ * Catacomb is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Library General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * Catacomb is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with Catacomb; if not, write to the Free
+ * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ * MA 02111-1307, USA.
+ */
+
+/*----- Notes on Poly1305 -------------------------------------------------*
+ *
+ * The Poly1305 message authentication code was designed by Daniel Bernstein
+ * in 2004. It's a heavily performance-engineered Carter--Wegman MAC, based
+ * on polynomial evaluation in %$\F = \mathrm{GF}(2^{130} - 5)$%. Some of
+ * the performance engineering is out-of-date, being there to support
+ * implementation techniques which are no longer relevant, but it still runs
+ * very quickly.
+ *
+ * The key %$r$% is an element of %$\F$%. Messages are encoded as a sequence
+ * %$m_0, m_1, \ldots, m_{n-1}$% of of elements of %$\F$%. A raw hash is
+ * calculated as %$h_0 = \sum_{0\le i<n} m_0 r^{n-i}$%. Finally, the raw
+ * hash is masked for output by adding to its canonical representative a mask
+ * value %$s$% modulo %$2^{128}$% and encoding the result as an octet string.
+ *
+ * As originally presented, Poly1305 generated the output mask by encrypting
+ * a nonce using AES. This has since been separated from the design, so that
+ * Poly1305 stands on its own. Poly1305 is highly key-agile, and most modern
+ * uses simply generate a fresh pseudorandom key and mask for each message.
+ * Note that both key and mask must be (at least) pseudorandom.
+ */
+
+#ifndef CATACOMB_POLY1305_H
+#define CATACOMB_POLY1305_H
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+/*----- Header files ------------------------------------------------------*/
+
+#include <mLib/bits.h>
+
+#ifndef CATACOMB_KEYSZ_H
+# include "keysz.h"
+#endif
+
+/*----- Constants ---------------------------------------------------------*/
+
+extern const octet poly1305_keysz[];
+
+#define POLY1305_BLKSZ 16u
+#define POLY1305_KEYSZ 16u
+#define POLY1305_MASKSZ 16u
+
+/*----- Data structures ---------------------------------------------------*/
+
+typedef struct poly1305_key {
+ union {
+ struct { uint32 r0, r1, r2, r3, r4, rr1, rr2, rr3, rr4; } p26;
+ struct { uint16 r[12]; } p11;
+ } u;
+} poly1305_key;
+
+typedef struct poly1305_ctx {
+ poly1305_key k;
+ union {
+ struct { uint32 s0, s1, s2, s3, s4; uint32 h[5]; } p26;
+ struct { uint16 s[12], h[12]; } p11;
+ } u;
+ unsigned long count;
+ unsigned nbuf;
+ octet buf[16];
+} poly1305_ctx;
+
+/*----- Functions provided ------------------------------------------------*/
+
+/* --- @poly1305_keyinit@ --- *
+ *
+ * Arguments: @poly1305_key *key@ = key structure to fill in
+ * @const void *k@ = pointer to key material
+ * @size_t ksz@ = length of key (must be @POLY1305_KEYSZ == 16@)
+ *
+ * Returns: ---
+ *
+ * Use: Records a Poly1305 key and performs (minimal)
+ * precomputations.
+ */
+
+extern void poly1305_keyinit(poly1305_key */*key*/,
+ const void */*k*/, size_t /*sz*/);
+
+/* --- @poly1305_macinit@ --- *
+ *
+ * Arguments: @poly1305_ctx *ctx@ = MAC context to fill in
+ * @const poly1305_key *key@ = pointer to key structure to use
+ * @const void *iv@ = pointer to mask string
+ *
+ * Returns: ---
+ *
+ * Use: Initializes a MAC context for use. The key can be discarded
+ * at any time.
+ *
+ * It is permitted for @iv@ to be null, though it is not then
+ * possible to complete the MAC computation on @ctx@. The
+ * resulting context may still be useful, e.g., as an operand to
+ * @poly1305_concat@.
+ */
+
+extern void poly1305_macinit(poly1305_ctx */*ctx*/,
+ const poly1305_key */*key*/,
+ const void */*iv*/);
+
+/* --- @poly1305_copy@ --- *
+ *
+ * Arguments: @poly1305_ctx *to@ = destination context
+ * @const poly1305_ctx *from@ = source context
+ *
+ * Returns: ---
+ *
+ * Use: Duplicates a Poly1305 MAC context. The destination need not
+ * have been initialized. Both contexts can be used
+ * independently afterwards.
+ */
+
+extern void poly1305_copy(poly1305_ctx */*to*/,
+ const poly1305_ctx */*from*/);
+
+/* --- @poly1305_hash@ --- *
+ *
+ * Arguments: @poly1305_ctx *ctx@ = MAC context to update
+ * @const void *p@ = pointer to message data
+ * @size_t sz@ = length of message data
+ *
+ * Returns: ---
+ *
+ * Use: Processes a chunk of message. The message pieces may have
+ * arbitrary lengths, and may be empty.
+ */
+
+extern void poly1305_hash(poly1305_ctx */*ctx*/,
+ const void */*p*/, size_t /*sz*/);
+
+/* --- @poly1305_flush@ --- *
+ *
+ * Arguments: @poly1305_ctx *ctx@ = MAC context to flush
+ *
+ * Returns: ---
+ *
+ * Use: Forces any buffered message data in the context to be
+ * processed. This has no effect if the message processed so
+ * far is a whole number of blocks. Flushing is performed
+ * automatically by @poly1305_done@, but it may be necessary to
+ * force it by hand when using @poly1305_concat@.
+ *
+ * Flushing a partial block has an observable effect on the
+ * computation: the resulting state is (with high probability)
+ * dissimilar to any state reachable with a message which is a
+ * whole number of blocks long.
+ */
+
+extern void poly1305_flush(poly1305_ctx */*ctx*/);
+
+/* --- @poly1305_concat@ --- *
+ *
+ * Arguments: @poly1305_ctx *ctx@ = destination context
+ * @const poly1305_ctx *prefix, *suffix@ = two operand contexts
+ *
+ * Returns: ---
+ *
+ * Use: The two operand contexts @prefix@ and @suffix@ represent
+ * processing of two messages %$m$% and %$m'$%; the effect is to
+ * set @ctx@ to the state corresponding to their concatenation
+ * %$m \cat m'$%.
+ *
+ * All three contexts must have been initialized using the same
+ * key value (though not necessarily from the same key
+ * structure). The mask values associated with the input
+ * contexts are irrelevant. The @prefix@ message %$m$% must be
+ * a whole number of blocks long: this can be arranged by
+ * flushing the context. The @suffix@ message need not be a
+ * whole number of blocks long. All of the contexts remain
+ * operational and can be used independently afterwards.
+ */
+
+extern void poly1305_concat(poly1305_ctx */*ctx*/,
+ const poly1305_ctx */*prefix*/,
+ const poly1305_ctx */*suffix*/);
+
+/* --- @poly1305_done@ --- *
+ *
+ * Arguments: @poly1305_ctx *ctx@ = MAC context to finish
+ * @void *h@ = buffer to write the tag to
+ *
+ * Returns: ---
+ *
+ * Use: Completes a Poly1305 MAC tag computation.
+ */
+
+extern void poly1305_done(poly1305_ctx */*ctx*/, void */*h*/);
+
+/*----- That's all, folks -------------------------------------------------*/
+
+#ifdef __cplusplus
+ }
+#endif
+
+#endif