mdw@git.distorted.org.uk Git - catacomb/blob - symm/poly1305.c

   1 /* -*-c-*-
   2  *
   3  * Poly1305 message authentication code
   4  *
   5  * (c) 2017 Straylight/Edgeware
   6  */
   7
   8 /*----- Licensing notice --------------------------------------------------*
   9  *
  10  * This file is part of Catacomb.
  11  *
  12  * Catacomb is free software; you can redistribute it and/or modify
  13  * it under the terms of the GNU Library General Public License as
  14  * published by the Free Software Foundation; either version 2 of the
  15  * License, or (at your option) any later version.
  16  *
  17  * Catacomb is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20  * GNU Library General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Library General Public
  23  * License along with Catacomb; if not, write to the Free
  24  * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25  * MA 02111-1307, USA.
  26  */
  27
  28 /*----- Header files ------------------------------------------------------*/
  29
  30 #include "config.h"
  31
  32 #include <assert.h>
  33 #include <string.h>
  34
  35 #include "poly1305.h"
  36 #include "rsvr.h"
  37
  38 /*----- Global variables --------------------------------------------------*/
  39
  40 const octet poly1305_keysz[] = { KSZ_SET, 16, 0 };
  41
  42 /*----- Low-level implementation for 32/64-bit targets --------------------*/
  43
  44 #if !defined(POLY1305_IMPL) && defined(HAVE_UINT64)
  45 #  define POLY1305_IMPL 26
  46 #endif
  47
  48 #if POLY1305_IMPL == 26
  49
  50 /* Elements x of GF(2^130 - 5) are represented by five integers x_i: x =
  51  * SUM_{0<=i<5} x_i 2^{26i}.
  52  *
  53  * Not all elements are represented canonically.  We have 0 <= r_i, s_i <
  54  * 2^26 by construction.  We maintain 0 <= h_i < 2^27.  When we read a
  55  * message block m, we have 0 <= m_i < 2^26 by construction again.  When we
  56  * update the hash state, we calculate h' = r (h + m).  Addition is done
  57  * componentwise; let t = h + m, and we will have 0 <= t_i < 3*2^26.
  58  */
  59 typedef uint32 felt[5];
  60 #define M26 0x03ffffff
  61 #define P p26
  62
  63 /* Convert 32-bit words into field-element pieces. */
  64 #define P26W0(x)  (((x##0) <<  0)&0x03ffffff)
  65 #define P26W1(x) ((((x##1) <<  6)&0x03ffffc0) | (((x##0) >> 26)&0x0000003f))
  66 #define P26W2(x) ((((x##2) << 12)&0x03ffffff) | (((x##1) >> 20)&0x00000fff))
  67 #define P26W3(x) ((((x##3) << 18)&0x03fc0000) | (((x##2) >> 14)&0x0003ffff))
  68 #define P26W4(x)                                (((x##3) >>  8)&0x00ffffff)
  69
  70 /* Propagate carries in parallel.  If 0 <= u_i < 2^26 c_i, then we shall have
  71  * 0 <= v_0 < 2^26 + 5 c_4, and 0 <= v_i < 2^26 + c_{i-1} for 1 <= i < 5.
  72  */
  73 #define CARRY_REDUCE(v, u) do {                                         \
  74   (v##0) = ((u##0)&M26) + 5*((u##4) >> 26);                             \
  75   (v##1) = ((u##1)&M26) +   ((u##0) >> 26);                             \
  76   (v##2) = ((u##2)&M26) +   ((u##1) >> 26);                             \
  77   (v##3) = ((u##3)&M26) +   ((u##2) >> 26);                             \
  78   (v##4) = ((u##4)&M26) +   ((u##3) >> 26);                             \
  79 } while (0)
  80
  81 /* General multiplication, used by `concat'. */
  82 static void mul(felt z, const felt x, const felt y)
  83 {
  84   /* Initial bounds: we assume x_i, y_i < 2^27.  On exit, z_i < 2^27. */
  85
  86   uint32 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4];
  87   uint32 y0 = y[0], y1 = y[1], y2 = y[2], y3 = y[3], y4 = y[4];
  88   uint64 u0, u1, u2, u3, u4;
  89   uint64 v0, v1, v2, v3, v4;
  90   uint32 z0, z1, z2, z3, z4;
  91
  92   /* Do the multiplication: u = h x mod 2^130 - 5.  We will have u_i <
  93    * 2^27 (5 (4 - i) + i + 1) 2^27 = 2^54 (21 - 4 i) = 2^52 (84 - 16 i).  In
  94    * all cases we have u_i < 84*2^52 < 2^59.  Notably, u_4 < 5*2^54 =
  95    * 20*2^52.
  96    */
  97 #define M(x, y) ((uint64)(x)*(y))
  98   u0 = M(x0, y0) + (M(x1, y4) +  M(x2, y3) +  M(x3, y2) +  M(x4, y1))*5;
  99   u1 = M(x0, y1) +  M(x1, y0) + (M(x2, y4) +  M(x3, y3) +  M(x4, y2))*5;
 100   u2 = M(x0, y2) +  M(x1, y1) +  M(x2, y0) + (M(x3, y4) +  M(x4, y3))*5;
 101   u3 = M(x0, y3) +  M(x1, y2) +  M(x2, y1) +  M(x3, y0) + (M(x4, y4))*5;
 102   u4 = M(x0, y4) +  M(x1, y3) +  M(x2, y2) +  M(x3, y1) +  M(x4, y0);
 103 #undef M
 104
 105   /* Now we must reduce the coefficients.  We do this in an approximate
 106    * manner which avoids long data-dependency chains, but requires two
 107    * passes.
 108    *
 109    * The reduced carry down from u_4 to u_0 in the first pass will be c_0 <
 110    * 100*2^26; the remaining c_i are smaller: c_i < 2^26 (84 - 16 i).  This
 111    * leaves 0 <= v_i < 101*2^26.  The carries in the second pass are bounded
 112    * above by 180.
 113    */
 114   CARRY_REDUCE(v, u); CARRY_REDUCE(z, v);
 115   z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4;
 116 }
 117
 118 /* General squaring, used by `concat'. */
 119 static void sqr(felt z, const felt x)
 120 {
 121   /* Initial bounds: we assume x_i < 2^27.  On exit, z_i < 2^27. */
 122
 123   uint32 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4];
 124   uint64 u0, u1, u2, u3, u4;
 125   uint64 v0, v1, v2, v3, v4;
 126   uint32 z0, z1, z2, z3, z4;
 127
 128   /* Do the squaring.  See `mul' for bounds. */
 129 #define M(x, y) ((uint64)(x)*(y))
 130   u0 = M(x0, x0) +                            10*(M(x1, x4) +   M(x2, x3));
 131   u1 =             2* M(x0, x1) +              5*(M(x3, x3) + 2*M(x2, x4));
 132   u2 = M(x1, x1) + 2* M(x0, x2) +             10* M(x3, x4);
 133   u3 =             2*(M(x0, x3) + M(x1, x2)) + 5* M(x4, x4);
 134   u4 = M(x2, x2) + 2*(M(x0, x4) + M(x1, x3));
 135 #undef M
 136
 137   /* Now we must reduce the coefficients.  See `mul' for bounds. */
 138   CARRY_REDUCE(v, u); CARRY_REDUCE(z, v);
 139   z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4;
 140 }
 141
 142 /* Multiplication by r, using precomputation. */
 143 static void mul_r(const poly1305_ctx *ctx, felt z, const felt x)
 144 {
 145   /* Initial bounds: by construction, r_i < 2^26.  We assume x_i < 3*2^26.
 146    * On exit, z_i < 2^27.
 147    */
 148
 149   uint32
 150     r0 = ctx->k.u.p26.r0,
 151     r1 = ctx->k.u.p26.r1, rr1 = ctx->k.u.p26.rr1,
 152     r2 = ctx->k.u.p26.r2, rr2 = ctx->k.u.p26.rr2,
 153     r3 = ctx->k.u.p26.r3, rr3 = ctx->k.u.p26.rr3,
 154     r4 = ctx->k.u.p26.r4, rr4 = ctx->k.u.p26.rr4;
 155   uint32 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4];
 156   uint64 u0, u1, u2, u3, u4;
 157   uint64 v0, v1, v2, v3, v4;
 158   uint32 z0, z1, z2, z3, z4;
 159
 160   /* Do the multiplication: u = h x mod 2^130 - 5.  We will have u_i <
 161    * 2^26 (5 (4 - i) + i + 1) 3*2^26 = 2^52 (63 - 12 i).  In all cases
 162    * we have u_i < 63*2^52 < 2^58.  Notably, u_4 < 15*2^52.
 163    */
 164 #define M(x, y) ((uint64)(x)*(y))
 165   u0 = M(x0, r0)  + M(x1, rr4) + M(x2, rr3) + M(x3, rr2) + M(x4, rr1);
 166   u1 = M(x0, r1)  + M(x1, r0)  + M(x2, rr4) + M(x3, rr3) + M(x4, rr2);
 167   u2 = M(x0, r2)  + M(x1, r1)  + M(x2, r0)  + M(x3, rr4) + M(x4, rr3);
 168   u3 = M(x0, r3)  + M(x1, r2)  + M(x2, r1)  + M(x3, r0)  + M(x4, rr4);
 169   u4 = M(x0, r4)  + M(x1, r3)  + M(x2, r2)  + M(x3, r1)  + M(x4, r0);
 170 #undef M
 171
 172   /* Now we must reduce the coefficients.  We do this in an approximate
 173    * manner which avoids long data-dependency chains, but requires two
 174    * passes.
 175    *
 176    * The reduced carry down from u_4 to u_0 in the first pass will be c_0 <
 177    * 75*2^26; the remaining c_i are smaller: c_i < 2^26 (63 - 12 i).  This
 178    * leaves 0 <= v_i < 76*2^26.  The carries in the second pass are bounded
 179    * above by 135.
 180    */
 181   CARRY_REDUCE(v, u); CARRY_REDUCE(z, v);
 182   z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4;
 183 }
 184
 185 #endif
 186
 187 /*----- Low-level implementation for 16/32-bit targets --------------------*/
 188
 189 #ifndef POLY1305_IMPL
 190 #  define POLY1305_IMPL 11
 191 #endif
 192
 193 #if POLY1305_IMPL == 11
 194
 195 /* Elements x of GF(2^130 - 5) are represented by 12 integers x_i: x =
 196  * SUM_{0<=i<12} x_i 2^P_i, where P_i = SUM_{0<=j<i} w_j, and w_5 = w_11 =
 197  * 10, and w_i = 11 for i in { 0, 1, 2, 3, 4, 6, 7, 8, 9, 10 }.
 198  *
 199  * Not all elements are represented canonically.  We have 0 <= r_i, s_i <
 200  * 2^w_i <= 2^11 by construction.  We maintain 0 <= h_i < 2^12.  When we read
 201  * a message block m, we have 0 <= m_i < 2^w_i by construction again.  When
 202  * we update the hash state, we calculate h' = r (h + m).  Addition is done
 203  * componentwise; let t = h + m, and we will have 0 <= t_i < 3*2^11.
 204  */
 205 typedef uint16 felt[12];
 206 #define M10 0x3ff
 207 #define M11 0x7ff
 208 #define P p11
 209
 210 /* Load a field element from an octet string. */
 211 static void load_p11(felt d, const octet *s)
 212 {
 213   unsigned i, j, n, w;
 214   uint16 m;
 215   uint32 a;
 216
 217   for (i = j = n = 0, a = 0; j < 12; j++) {
 218     if (j == 5 || j == 11) { w = 10; m = M10; }
 219     else                   { w = 11; m = M11; }
 220     while (n < w && i < 16) { a |= s[i++] << n; n += 8; }
 221     d[j] = a&m; a >>= w; n -= w;
 222   }
 223 }
 224
 225 /* Reduce a field-element's pieces to manageable size. */
 226 static void carry_reduce(uint32 u[12])
 227 {
 228   /* Initial bounds: we assume u_i < 636*2^22.  On exit, u_i < 2^11. */
 229
 230   unsigned i;
 231   uint32 c;
 232
 233   /* Do sequential carry propagation (16-bit CPUs are less likely to benefit
 234    * from instruction-level parallelism).  Start at u_9; truncate it to 11
 235    * bits, and add the carry onto u_10.  Truncate u10 to 11 bits, and add the
 236    * carry onto u_11.  Truncate u_11 to 10 bits, and add five times the carry
 237    * onto u_0.  And so on.
 238    *
 239    * The carry is larger than the pieces we're leaving behind.  Let c_i be
 240    * the high portion of u_i, to be carried onto u_{i+1}.  I claim that c_i <
 241    * 2557*2^10.  Then the carry /into/ any u_i is at most 12785*2^10 < 2^24
 242    * (allowing for the reduction as we carry from u_11 to u_0), and u_i after
 243    * carry is bounded above by 636*2^22 + 12785*2^10 < 2557*2^20.  Hence, the
 244    * carry out is at most 2557*2^10, as claimed.
 245    *
 246    * Once we reach u_9 for the second time, we start with u_9 < 2^11.  The
 247    * carry into u_9 is at most 2557*2^10 < 1279*2^11 as calculated above; so
 248    * the carry out into u_10 is at most 1280.  Since u_10 < 2^11 prior to
 249    * this carry in, we now have u_10 < 2^11 + 1280 < 2^12; so the carry out
 250    * into u_11 is at most 1.  The final reduction therefore only needs a
 251    * conditional subtraction.
 252    */
 253                            {               c =  u[9] >> 11;  u[9] &= M11; }
 254                            { u[10] +=   c; c = u[10] >> 11; u[10] &= M11; }
 255                            { u[11] +=   c; c = u[11] >> 10; u[11] &= M10; }
 256                            {  u[0] += 5*c; c =  u[0] >> 11;  u[0] &= M11; }
 257   for (i = 1; i <  5; i++) {  u[i] +=   c; c =  u[i] >> 11;  u[i] &= M11; }
 258                            {  u[5] +=   c; c =  u[5] >> 10;  u[5] &= M10; }
 259   for (i = 6; i < 11; i++) {  u[i] +=   c; c =  u[i] >> 11;  u[i] &= M11; }
 260   u[11] += c;
 261 }
 262
 263 /* General multiplication. */
 264 static void mul(felt z, const felt x, const felt y)
 265 {
 266   /* Initial bounds: we assume x_i < 3*2^11, and y_i < 2^12.  On exit,
 267    * z_i < 2^12.
 268    */
 269
 270   uint32 u[12];
 271   unsigned i, j, k;
 272
 273   /* Do the main multiplication.  After this, we shall have
 274    *
 275    *          { 2^22 (636 - 184 i)      for 0 <= i < 6
 276    *    u_i < {
 277    *          { 2^22 (732 - 60 i)       for 6 <= i < 12
 278    *
 279    * In particular, u_0 < 636*2^22 < 2^32, and u_11 < 72*2^22.
 280    *
 281    * The irregularly positioned pieces are annoying.  Because we fold the
 282    * reduction into the multiplication, it's also important to see where the
 283    * reduced products fit.  Finally, products don't align with the piece
 284    * boundaries, and sometimes need to be doubled.  The following table
 285    * tracks all of this.
 286    *
 287    *  piece    width   offset  second
 288    *     0      11        0     130
 289    *     1      11       11     141
 290    *     2      11       22     152
 291    *     3      11       33     163
 292    *     4      11       44     174
 293    *     5      10       55     185
 294    *     6      11       65     195
 295    *     7      11       76     206
 296    *     8      11       87     217
 297    *     9      11       98     228
 298    *    10      11      109     239
 299    *    11      10      120     250
 300    *
 301    * The next table tracks exactly which products end up being multiplied by
 302    * which constants and accumulated into which destination pieces.
 303    *
 304    *   u_k =    t_i r_j +        2 t_i r_j + 5 t_i r_j +  10 t_i r_j
 305    *     0      0/0                --        6/6          1-5/11-7 7-11/5-1
 306    *     1      0-1/1-0            --        6-7/7-6      2-5/11-8 8-11/5-2
 307    *     2      0-2/2-0            --        6-8/8-6      3-5/11-9 9-11/5-3
 308    *     3      0-3/3-0            --        6-9/9-6      4-5/11-10 10-11/5-4
 309    *     4      0-4/4-0            --        6-10/10-6    5/11 11/5
 310    *     5      0-5/5-0            --        6-11/11-6    --
 311    *     6      0/6 6/0            1-5/5-1   --           7-11/11-7
 312    *     7      0-1/7-6 6-7/1-0    2-5/5-2   --           8-11/11-8
 313    *     8      0-2/8-6 6-8/2-0    3-5/5-3   --           9-11/11-9
 314    *     9      0-3/9-6 6-9/3-0    4-5/5-4   --           10-11/11-10
 315    *    10      0-4/10-6 6-10/4-0  5/5       --           11/11
 316    *    11      0-11/11-0          --        --           --
 317    *
 318    * And, finally, trying to bound the multiple of 6*2^22 in each destination
 319    * piece is fiddly, so here's a tableau showing the calculation.
 320    *
 321    *     k      1* + 2* + 5* +10*   =    1* +  5*   =
 322    *     0      1   --    1   10         1    21        106
 323    *     1      2   --    2    8         2    18         92
 324    *     2      3   --    3    6         3    15         78
 325    *     3      4   --    4    4         4    12         64
 326    *     4      5   --    5    2         5     9         50
 327    *     5      6   --    6   --         6     6         36
 328    *     6      2    5   --    5        12    10         62
 329    *     7      4    4   --    4        12     8         52
 330    *     8      6    3   --    3        12     6         42
 331    *     9      8    2   --    2        12     4         32
 332    *    10     10    1   --    1        12     2         22
 333    *    11     12   --   --   --        12     0         12
 334    */
 335
 336   for (i = 0; i < 12; i++) u[i] = 0;
 337
 338 #define M(i, j) ((uint32)x[i]*y[j])
 339
 340   /* Product terms we must multiply by 10. */
 341   for (k = 0; k < 5; k++) {
 342     for (i = k + 1; i < 6; i++) {
 343       j = 12 + k - i;
 344       u[k] += M(i, j) + M(j, i);
 345       u[k + 6] += M(i + 6, j);
 346     }
 347   }
 348   for (k = 0; k < 5; k++) u[k] *= 2;
 349   for (k = 6; k < 11; k++) u[k] *= 5;
 350
 351   /* Product terms we must multiply by 5. */
 352   for (k = 0; k < 6; k++) {
 353     for (i = k + 6; i >= 6; i--) {
 354       j = 12 + k - i;
 355       u[k] += M(i, j);
 356     }
 357   }
 358   for (k = 0; k < 6; k++) u[k] *= 5;
 359
 360   /* Product terms we must multiply by 2. */
 361   for (k = 6; k < 11; k++) {
 362     for (i = k - 5; i < 6; i++) {
 363       j = k - i;
 364       u[k] += M(i, j);
 365     }
 366   }
 367   for (k = 6; k < 11; k++) u[k] *= 2;
 368
 369   /* Remaining product terms. */
 370   for (k = 0; k < 6; k++) {
 371     for (i = k; i < 6; i--) {
 372       j = k - i;
 373       u[k] += M(i, j);
 374       u[k + 6] += M(i + 6, j) + M(i, j + 6);
 375     }
 376   }
 377
 378 #undef M
 379
 380   /* Do the reduction.  Currently, `carry_reduce' does more than we need, but
 381    * that's fine.
 382    */
 383   carry_reduce(u);
 384
 385   /* Done.  Write out the answer. */
 386   for (i = 0; i < 12; i++) z[i] = u[i];
 387 }
 388
 389 /* General squaring, used by `concat'. */
 390 static void sqr(felt z, const felt x)
 391   { mul(z, x, x); }
 392
 393 /* Multiplication by r. */
 394 static void mul_r(const poly1305_ctx *ctx, felt z, const felt x)
 395   { mul(z, x, ctx->k.u.p11.r); }
 396
 397 #endif
 398
 399 /*----- Interface functions -----------------------------------------------*/
 400
 401 /* --- @poly1305_keyinit@ --- *
 402  *
 403  * Arguments:   @poly1305_key *key@ = key structure to fill in
 404  *              @const void *k@ = pointer to key material
 405  *              @size_t ksz@ = length of key (must be @POLY1305_KEYSZ == 16@)
 406  *
 407  * Returns:     ---
 408  *
 409  * Use:         Records a Poly1305 key and performs (minimal)
 410  *              precomputations.
 411  */
 412
 413 void poly1305_keyinit(poly1305_key *key, const void *k, size_t ksz)
 414 {
 415   const octet *r = k;
 416 #if POLY1305_IMPL == 11
 417   octet rr[16];
 418 #endif
 419
 420   KSZ_ASSERT(poly1305, ksz);
 421
 422 #if POLY1305_IMPL == 26
 423   uint32 r0 = LOAD32_L(r +  0), r1 = LOAD32_L(r +  4),
 424          r2 = LOAD32_L(r +  8), r3 = LOAD32_L(r + 12);
 425
 426   r0 &= 0x0fffffff; r1 &= 0x0ffffffc; r2 &= 0x0ffffffc; r3 &= 0x0ffffffc;
 427   key->u.p26.r0 = P26W0(r); key->u.p26.r1 = P26W1(r);
 428   key->u.p26.r2 = P26W2(r); key->u.p26.r3 = P26W3(r);
 429   key->u.p26.r4 = P26W4(r);
 430
 431   key->u.p26.rr1 = 5*key->u.p26.r1; key->u.p26.rr2 = 5*key->u.p26.r2;
 432   key->u.p26.rr3 = 5*key->u.p26.r3; key->u.p26.rr4 = 5*key->u.p26.r4;
 433 #else
 434   memcpy(rr, r, 16);
 435                   rr[ 3] &= 0x0f;
 436   rr[ 4] &= 0xfc; rr[ 7] &= 0x0f;
 437   rr[ 8] &= 0xfc; rr[11] &= 0x0f;
 438   rr[12] &= 0xfc; rr[15] &= 0x0f;
 439   load_p11(key->u.p11.r, rr);
 440 #endif
 441 }
 442
 443 /* --- @poly1305_macinit@ --- *
 444  *
 445  * Arguments:   @poly1305_ctx *ctx@ = MAC context to fill in
 446  *              @const poly1305_key *key@ = pointer to key structure to use
 447  *              @const void *iv@ = pointer to mask string
 448  *
 449  * Returns:     ---
 450  *
 451  * Use:         Initializes a MAC context for use.  The key can be discarded
 452  *              at any time.
 453  *
 454  *              It is permitted for @iv@ to be null, though it is not then
 455  *              possible to complete the MAC computation on @ctx@.  The
 456  *              resulting context may still be useful, e.g., as an operand to
 457  *              @poly1305_concat@.
 458  */
 459
 460 void poly1305_macinit(poly1305_ctx *ctx,
 461                       const poly1305_key *key, const void *iv)
 462 {
 463   const octet *s = iv;
 464 #if POLY1305_IMPL == 26
 465   uint32 s0, s1, s2, s3;
 466 #else
 467   unsigned i;
 468 #endif
 469
 470 #if POLY1305_IMPL == 26
 471   if (s) {
 472     s0 = LOAD32_L(s +  0); s1 = LOAD32_L(s +  4);
 473     s2 = LOAD32_L(s +  8); s3 = LOAD32_L(s + 12);
 474     ctx->u.p26.s0 = P26W0(s); ctx->u.p26.s1 = P26W1(s);
 475     ctx->u.p26.s2 = P26W2(s); ctx->u.p26.s3 = P26W3(s);
 476     ctx->u.p26.s4 = P26W4(s);
 477   }
 478   ctx->u.p26.h[0] = ctx->u.p26.h[1] = ctx->u.p26.h[2] =
 479     ctx->u.p26.h[3] = ctx->u.p26.h[4] = 0;
 480 #else
 481   if (s) load_p11(ctx->u.p11.s, s);
 482   for (i = 0; i < 12; i++) ctx->u.p11.h[i] = 0;
 483 #endif
 484   ctx->k = *key;
 485   ctx->nbuf = 0;
 486   ctx->count = 0;
 487 }
 488
 489 /* --- @poly1305_copy@ --- *
 490  *
 491  * Arguments:   @poly1305_ctx *to@ = destination context
 492  *              @const poly1305_ctx *from@ = source context
 493  *
 494  * Returns:     ---
 495  *
 496  * Use:         Duplicates a Poly1305 MAC context.  The destination need not
 497  *              have been initialized.  Both contexts can be used
 498  *              independently afterwards.
 499  */
 500
 501 void poly1305_copy(poly1305_ctx *ctx, const poly1305_ctx *from)
 502   { *ctx = *from; }
 503
 504 /* --- @poly1305_hash@ --- *
 505  *
 506  * Arguments:   @poly1305_ctx *ctx@ = MAC context to update
 507  *              @const void *p@ = pointer to message data
 508  *              @size_t sz@ = length of message data
 509  *
 510  * Returns:     ---
 511  *
 512  * Use:         Processes a chunk of message.  The message pieces may have
 513  *              arbitrary lengths, and may be empty.
 514  */
 515
 516 static void update_full(poly1305_ctx *ctx, const octet *p)
 517 {
 518   felt t;
 519 #if POLY1305_IMPL == 26
 520   uint32
 521     m0 = LOAD32_L(p +  0), m1 = LOAD32_L(p +  4),
 522     m2 = LOAD32_L(p +  8), m3 = LOAD32_L(p + 12);
 523
 524   t[0] = ctx->u.p26.h[0] + P26W0(m);
 525   t[1] = ctx->u.p26.h[1] + P26W1(m);
 526   t[2] = ctx->u.p26.h[2] + P26W2(m);
 527   t[3] = ctx->u.p26.h[3] + P26W3(m);
 528   t[4] = ctx->u.p26.h[4] + P26W4(m) + 0x01000000;
 529 #else
 530   unsigned i;
 531
 532   load_p11(t, p); t[11] += 0x100;
 533   for (i = 0; i < 12; i++) t[i] += ctx->u.p11.h[i];
 534 #endif
 535
 536   mul_r(ctx, ctx->u.P.h, t);
 537   ctx->count++;
 538 }
 539
 540 static const rsvr_policy pol = { 0, 16, 16 };
 541
 542 void poly1305_hash(poly1305_ctx *ctx, const void *p, size_t sz)
 543 {
 544   rsvr_state st;
 545   const octet *q = p;
 546
 547   rsvr_setup(&st, &pol, &ctx->buf, &ctx->nbuf, p, sz);
 548   RSVR_DO(&st) while ((q = RSVR_NEXT(&st, 16)) != 0) update_full(ctx, q);
 549 }
 550
 551 /* --- @poly1305_flush@ --- *
 552  *
 553  * Arguments:   @poly1305_ctx *ctx@ = MAC context to flush
 554  *
 555  * Returns:     ---
 556  *
 557  * Use:         Forces any buffered message data in the context to be
 558  *              processed.  This has no effect if the message processed so
 559  *              far is a whole number of blocks.  Flushing is performed
 560  *              automatically by @poly1305_done@, but it may be necessary to
 561  *              force it by hand when using @poly1305_concat@.
 562  *              (Alternatively, you might use @poly1305_flushzero@ instead.)
 563  *
 564  *              Flushing a partial block has an observable effect on the
 565  *              computation: the resulting state is (with high probability)
 566  *              dissimilar to any state reachable with a message which is a
 567  *              whole number of blocks long.
 568  */
 569
 570 void poly1305_flush(poly1305_ctx *ctx)
 571 {
 572   felt t;
 573 #if POLY1305_IMPL == 26
 574   uint32 m0, m1, m2, m3;
 575 #else
 576   unsigned i;
 577 #endif
 578
 579   if (!ctx->nbuf) return;
 580   ctx->buf[ctx->nbuf++] = 1; memset(ctx->buf + ctx->nbuf, 0, 16 - ctx->nbuf);
 581 #if POLY1305_IMPL == 26
 582   m0 = LOAD32_L(ctx->buf +  0); m1 = LOAD32_L(ctx->buf +  4);
 583   m2 = LOAD32_L(ctx->buf +  8); m3 = LOAD32_L(ctx->buf + 12);
 584
 585   t[0] = ctx->u.p26.h[0] + P26W0(m);
 586   t[1] = ctx->u.p26.h[1] + P26W1(m);
 587   t[2] = ctx->u.p26.h[2] + P26W2(m);
 588   t[3] = ctx->u.p26.h[3] + P26W3(m);
 589   t[4] = ctx->u.p26.h[4] + P26W4(m);
 590 #else
 591   load_p11(t, ctx->buf);
 592   for (i = 0; i < 12; i++) t[i] += ctx->u.p11.h[i];
 593 #endif
 594
 595   mul_r(ctx, ctx->u.P.h, t);
 596   ctx->nbuf = 0; ctx->count++;
 597 }
 598
 599 /* --- @poly1305_flushzero@ --- *
 600  *
 601  * Arguments:   @poly1305_ctx *ctx@ = MAC context to flush
 602  *
 603  * Returns:     ---
 604  *
 605  * Use:         Forces any buffered message data in the context to be
 606  *              processed, by hashing between zero and fifteen additional
 607  *              zero bytes.  Like @poly1305_flush@, this has no effect if the
 608  *              the message processed so far is a whole number of blocks.
 609  *              Unlike @poly1305_flush@, the behaviour if the message is not
 610  *              a whole number of blocks is equivalent to actually hashing
 611  *              some extra data.
 612  */
 613
 614 void poly1305_flushzero(poly1305_ctx *ctx)
 615 {
 616   if (!ctx->nbuf) return;
 617   memset(ctx->buf + ctx->nbuf, 0, 16 - ctx->nbuf);
 618   update_full(ctx, ctx->buf);
 619   ctx->nbuf = 0;
 620 }
 621
 622 /* --- @poly1305_concat@ --- *
 623  *
 624  * Arguments:   @poly1305_ctx *ctx@ = destination context
 625  *              @const poly1305_ctx *prefix, *suffix@ = two operand contexts
 626  *
 627  * Returns:     ---
 628  *
 629  * Use:         The two operand contexts @prefix@ and @suffix@ represent
 630  *              processing of two messages %$m$% and %$m'$%; the effect is to
 631  *              set @ctx@ to the state corresponding to their concatenation
 632  *              %$m \cat m'$%.
 633  *
 634  *              All three contexts must have been initialized using the same
 635  *              key value (though not necessarily from the same key
 636  *              structure).  The mask values associated with the input
 637  *              contexts are irrelevant.  The @prefix@ message %$m$% must be
 638  *              a whole number of blocks long: this can be arranged by
 639  *              flushing the context.  The @suffix@ message need not be a
 640  *              whole number of blocks long.  All of the contexts remain
 641  *              operational and can be used independently afterwards.
 642  */
 643
 644 void poly1305_concat(poly1305_ctx *ctx,
 645                      const poly1305_ctx *prefix, const poly1305_ctx *suffix)
 646 {
 647   /* Assume that lengths are public, so it's safe to behave conditionally on
 648    * the bits of ctx->count.
 649    */
 650   unsigned long n;
 651   unsigned i;
 652   felt x;
 653 #if POLY1305_IMPL == 26
 654   uint32 x0, x1, x2, x3, x4, y0, y1, y2, y3, y4;
 655 #else
 656   uint32 y[12];
 657 #endif
 658
 659   /* We can only concatenate if the prefix is block-aligned. */
 660   assert(!prefix->nbuf);
 661
 662   /* The hash for a message m = m_{k-1} m_{k-2} ... m_1 m_0 is h_r(m) =
 663    * SUM_{0<=i<k} m_i r^{i+1}.  If we have two messages, m, m', of lengths k
 664    * and k' blocks respectively, then
 665    *
 666    *    h_r(m || m') = SUM_{0<=i<k} m_i r^{k'+i+1} +
 667    *                     SUM_{0<=i<k'} m'_i r^{i+1}
 668    *                 =  r^{k'} h_r(m) + h_r(m')
 669    *
 670    * This is simple left-to-right square-and-multiply exponentiation.
 671    */
 672   n = suffix->count;
 673   x[0] = 1;
 674 #if POLY1305_IMPL == 26
 675   x[1] = x[2] = x[3] = x[4] = 0;
 676 #else
 677   for (i = 1; i < 12; i++) x[i] = 0;
 678 #endif
 679 #define BIT (1ul << (ULONG_BITS - 1))
 680   if (n) {
 681     i = ULONG_BITS;
 682     while (!(n & BIT)) { n <<= 1; i--; }
 683     mul_r(prefix, x, x); n <<= 1; i--;
 684     while (i--) { sqr(x, x); if (n & BIT) mul_r(prefix, x, x); n <<= 1; }
 685   }
 686 #undef BIT
 687   mul(x, prefix->u.P.h, x);
 688
 689   /* Add on the suffix hash. */
 690 #if POLY1305_IMPL == 26
 691   /* We're going to add the two hashes elementwise.  Both h' = h_r(m') and
 692    * x = r^{k'} h_r(m) are bounded above by 2^27, so the sum will be bounded
 693    * by 2^28; but this is too large to leave in the accumulator.  (Strictly,
 694    * we could get away with it, but the caller can in theory chain an
 695    * arbitrary number of concatenations and expect us to cope, and we'd
 696    * definitely overflow eventually.)  So we reduce.  Since the excess is so
 697    * small, a single round of `CARRY_REDUCE' is enough.
 698    */
 699   x0 = x[0] + suffix->u.p26.h[0]; x1 = x[1] + suffix->u.p26.h[1];
 700   x2 = x[2] + suffix->u.p26.h[2]; x3 = x[3] + suffix->u.p26.h[3];
 701   x4 = x[4] + suffix->u.p26.h[4];
 702   CARRY_REDUCE(y, x);
 703   ctx->u.p26.h[0] = y0; ctx->u.p26.h[1] = y1; ctx->u.p26.h[2] = y2;
 704   ctx->u.p26.h[3] = y3; ctx->u.p26.h[4] = y4;
 705 #else
 706   /* We'll add the two hashes elementwise and have to reduce again.  The
 707    * numbers are different, but the reasoning is basically the same.
 708    */
 709   for (i = 0; i < 12; i++) y[i] = x[i] + suffix->u.p11.h[i];
 710   carry_reduce(y);
 711   for (i = 0; i < 12; i++) ctx->u.p11.h[i] = y[i];
 712 #endif
 713
 714   /* Copy the remaining pieces of the context to set up the result. */
 715   if (ctx != suffix) {
 716     memcpy(ctx->buf, suffix->buf, suffix->nbuf);
 717     ctx->nbuf = suffix->nbuf;
 718   }
 719   ctx->count = prefix->count + suffix->count;
 720 }
 721
 722 /* --- @poly1305_done@ --- *
 723  *
 724  * Arguments:   @poly1305_ctx *ctx@ = MAC context to finish
 725  *              @void *h@ = buffer to write the tag to
 726  *
 727  * Returns:     ---
 728  *
 729  * Use:         Completes a Poly1305 MAC tag computation.
 730  */
 731
 732 void poly1305_done(poly1305_ctx *ctx, void *h)
 733 {
 734   octet *p = h;
 735
 736 #if POLY1305_IMPL == 26
 737   uint32 m_sub, t, c;
 738   uint32 h0, h1, h2, h3, h4, hh0, hh1, hh2, hh3, hh4;
 739
 740   /* If there's anything left over in the buffer, pad it to form a final
 741    * coefficient and update the evaluation one last time.
 742    */
 743   poly1305_flush(ctx);
 744
 745   /* Collect the final hash state. */
 746   h0 = ctx->u.p26.h[0];
 747   h1 = ctx->u.p26.h[1];
 748   h2 = ctx->u.p26.h[2];
 749   h3 = ctx->u.p26.h[3];
 750   h4 = ctx->u.p26.h[4];
 751
 752   /* Reduce the final value mod 2^130 - 5.  First pass: set h <- h +
 753    * 5 floor(h/2^130).  After this, the low pieces of h will be normalized:
 754    * 0 <= h_i < 2^26 for 0 <= i < 4; and 0 <= h_4 < 2^26 + 1.  In the
 755    * (highly unlikely) event that h_4 >= 2^26, set c and truncate to 130
 756    * bits.
 757    */
 758              c = h4 >> 26; h4 &= M26;
 759   h0 += 5*c; c = h0 >> 26; h0 &= M26;
 760   h1 +=   c; c = h1 >> 26; h1 &= M26;
 761   h2 +=   c; c = h2 >> 26; h2 &= M26;
 762   h3 +=   c; c = h3 >> 26; h3 &= M26;
 763   h4 +=   c; c = h4 >> 26; h4 &= M26;
 764
 765   /* Calculate h' = h - (2^130 - 5).  If h' >= 0 then t ends up 1; otherwise
 766    * it's zero.
 767    */
 768   t  = h0 + 5; hh0 = t&M26; t >>= 26;
 769   t += h1;     hh1 = t&M26; t >>= 26;
 770   t += h2;     hh2 = t&M26; t >>= 26;
 771   t += h3;     hh3 = t&M26; t >>= 26;
 772   t += h4;     hh4 = t&M26; t >>= 26;
 773
 774   /* Keep the subtraction result above if t or c is set. */
 775   m_sub = -(t | c);
 776   h0 = (hh0&m_sub) | (h0&~m_sub);
 777   h1 = (hh1&m_sub) | (h1&~m_sub);
 778   h2 = (hh2&m_sub) | (h2&~m_sub);
 779   h3 = (hh3&m_sub) | (h3&~m_sub);
 780   h4 = (hh4&m_sub) | (h4&~m_sub);
 781
 782   /* Add the mask onto the hash result. */
 783   t  = h0 + ctx->u.p26.s0; h0 = t&M26; t >>= 26;
 784   t += h1 + ctx->u.p26.s1; h1 = t&M26; t >>= 26;
 785   t += h2 + ctx->u.p26.s2; h2 = t&M26; t >>= 26;
 786   t += h3 + ctx->u.p26.s3; h3 = t&M26; t >>= 26;
 787   t += h4 + ctx->u.p26.s4; h4 = t&M26; t >>= 26;
 788
 789   /* Convert this mess back into 32-bit words.  We lose the top two bits,
 790    * but that's fine.
 791    */
 792   h0 = (h0 >>  0) | ((h1 & 0x0000003f) << 26);
 793   h1 = (h1 >>  6) | ((h2 & 0x00000fff) << 20);
 794   h2 = (h2 >> 12) | ((h3 & 0x0003ffff) << 14);
 795   h3 = (h3 >> 18) | ((h4 & 0x00ffffff) <<  8);
 796
 797   /* All done. */
 798   STORE32_L(p +  0, h0); STORE32_L(p +  4, h1);
 799   STORE32_L(p +  8, h2); STORE32_L(p + 12, h3);
 800 #else
 801   uint16 hh[12], hi[12], c, t, m_sub;
 802   uint32 a;
 803   unsigned i, j, n;
 804
 805   /* If there's anything left over in the buffer, pad it to form a final
 806    * coefficient and update the evaluation one last time.
 807    */
 808   poly1305_flush(ctx);
 809
 810   /* Collect the final hash state. */
 811   for (i = 0; i < 12; i++) hh[i] = ctx->u.p11.h[i];
 812
 813   /* Reduce the final value mod 2^130 - 5.  First pass: set h <- h +
 814    * 5 floor(h/2^130).  After this, the low pieces of h will be normalized:
 815    * 0 <= h_i < 2^{w_i} for 0 <= i < 11; and 0 <= h_{11} < 2^10 + 1.  In the
 816    * (highly unlikely) event that h_{11} >= 2^10, set c and truncate to 130
 817    * bits.
 818    */
 819   c = 5*(hh[11] >> 10); hh[11] &= M10;
 820   for (i = 0; i < 12; i++) {
 821     if (i == 5 || i == 11) { c += hh[i]; hh[i] = c&M10; c >>= 10; }
 822     else                   { c += hh[i]; hh[i] = c&M11; c >>= 11; }
 823   }
 824
 825   /* Calculate h' = h - (2^130 - 5).  If h' >= 0 then t ends up 1; otherwise
 826    * it's zero.
 827    */
 828   for (i = 0, t = 5; i < 12; i++) {
 829     t += hh[i];
 830     if (i == 5 || i == 11) { hi[i] = t&M10; t >>= 10; }
 831     else                   { hi[i] = t&M11; t >>= 11; }
 832   }
 833
 834   /* Keep the subtraction result above if t or c is set. */
 835   m_sub = -(t | c);
 836   for (i = 0; i < 12; i++) hh[i] = (hi[i]&m_sub) | (hh[i]&~m_sub);
 837
 838   /* Add the mask onto the hash result. */
 839   for (i = 0, t = 0; i < 12; i++) {
 840     t += hh[i] + ctx->u.p11.s[i];
 841     if (i == 5 || i == 11) { hh[i] = t&M10; t >>= 10; }
 842     else                   { hh[i] = t&M11; t >>= 11; }
 843   }
 844
 845   /* Convert this mess back into bytes.  We lose the top two bits, but that's
 846    * fine.
 847    */
 848   for (i = j = n = 0, a = 0; i < 16; i++) {
 849     if (n < 8) {
 850       a |= hh[j] << n;
 851       n += (j == 5 || j == 11) ? 10 : 11;
 852       j++;
 853     }
 854     p[i] = a&0xff; a >>= 8; n -= 8;
 855   }
 856
 857 #endif
 858 }
 859
 860 /*----- Test rig ----------------------------------------------------------*/
 861
 862 #ifdef TEST_RIG
 863
 864 #include <mLib/testrig.h>
 865
 866 #include "ct.h"
 867 #include "rijndael-ecb.h"
 868
 869 static int vrf_hash(dstr v[])
 870 {
 871   poly1305_key k;
 872   poly1305_ctx ctx;
 873   dstr t = DSTR_INIT;
 874   unsigned i, j;
 875
 876   if (v[0].len != 16) { fprintf(stderr, "bad key length\n"); exit(2); }
 877   if (v[1].len != 16) { fprintf(stderr, "bad mask length\n"); exit(2); }
 878   if (v[3].len != 16) { fprintf(stderr, "bad tag length\n"); exit(2); }
 879   dstr_ensure(&t, 16); t.len = 16;
 880
 881   ct_poison(v[0].buf, v[0].len);
 882   poly1305_keyinit(&k, v[0].buf, v[0].len);
 883   for (i = 0; i < v[2].len; i++) {
 884     for (j = i; j < v[2].len; j++) {
 885       poly1305_macinit(&ctx, &k, v[1].buf);
 886       poly1305_hash(&ctx, v[2].buf, i);
 887       poly1305_hash(&ctx, v[2].buf + i, j - i);
 888       poly1305_hash(&ctx, v[2].buf + j, v[2].len - j);
 889       poly1305_done(&ctx, t.buf);
 890       ct_remedy(t.buf, t.len);
 891       if (memcmp(t.buf, v[3].buf, 16) != 0) {
 892         fprintf(stderr, "failed...");
 893         fprintf(stderr, "\n\tkey    = "); type_hex.dump(&v[0], stderr);
 894         fprintf(stderr, "\n\tmask   = "); type_hex.dump(&v[1], stderr);
 895         fprintf(stderr, "\n\tmsg    = "); type_hex.dump(&v[2], stderr);
 896         fprintf(stderr, "\n\texp    = "); type_hex.dump(&v[3], stderr);
 897         fprintf(stderr, "\n\tcalc   = "); type_hex.dump(&t, stderr);
 898         fprintf(stderr, "\n\tsplits = 0 .. %u .. %u .. %lu\n",
 899                 i, j, (unsigned long)v[1].len);
 900         return (0);
 901       }
 902     }
 903   }
 904   return (1);
 905 }
 906
 907 static int vrf_cat(dstr v[])
 908 {
 909   poly1305_key k;
 910   poly1305_ctx ctx, cc[3];
 911   dstr t = DSTR_INIT;
 912   unsigned i;
 913   int ok = 1;
 914
 915   if (v[0].len != 16) { fprintf(stderr, "bad key length\n"); exit(2); }
 916   if (v[1].len != 16) { fprintf(stderr, "bad mask length\n"); exit(2); }
 917   if (v[5].len != 16) { fprintf(stderr, "bad tag length\n"); exit(2); }
 918   dstr_ensure(&t, 16); t.len = 16;
 919
 920   poly1305_keyinit(&k, v[0].buf, v[0].len);
 921   poly1305_macinit(&ctx, &k, v[1].buf);
 922   for (i = 0; i < 3; i++) {
 923     poly1305_macinit(&cc[i], &k, 0);
 924     poly1305_hash(&cc[i], v[i + 2].buf, v[i + 2].len);
 925   }
 926   for (i = 0; i < 2; i++) {
 927     if (!i) {
 928       poly1305_concat(&ctx, &cc[1], &cc[2]);
 929       poly1305_concat(&ctx, &cc[0], &ctx);
 930     } else {
 931       poly1305_concat(&ctx, &cc[0], &cc[1]);
 932       poly1305_concat(&ctx, &ctx, &cc[2]);
 933     }
 934     poly1305_done(&ctx, t.buf);
 935     if (memcmp(t.buf, v[5].buf, 16) != 0) {
 936       fprintf(stderr, "failed...");
 937       fprintf(stderr, "\n\tkey    = "); type_hex.dump(&v[0], stderr);
 938       fprintf(stderr, "\n\tmask   = "); type_hex.dump(&v[1], stderr);
 939       fprintf(stderr, "\n\tmsg[0] = "); type_hex.dump(&v[2], stderr);
 940       fprintf(stderr, "\n\tmsg[1] = "); type_hex.dump(&v[3], stderr);
 941       fprintf(stderr, "\n\tmsg[2] = "); type_hex.dump(&v[4], stderr);
 942       fprintf(stderr, "\n\texp    = "); type_hex.dump(&v[5], stderr);
 943       fprintf(stderr, "\n\tcalc   = "); type_hex.dump(&t, stderr);
 944       fprintf(stderr, "\n\tassoc  = %s\n",
 945               !i ? "msg[0] || (msg[1] || msg[2])" :
 946                    "(msg[0] || msg[1]) || msg[2]");
 947       ok = 0;
 948     }
 949   }
 950   return (ok);
 951 }
 952
 953 #define MSZMAX 1000
 954
 955 static int vrf_mct(dstr v[])
 956 {
 957   unsigned j, msz;
 958   unsigned long i, niter;
 959   rijndael_ecbctx rij;
 960   poly1305_key key;
 961   poly1305_ctx mac;
 962   dstr d = DSTR_INIT;
 963   octet k[16], r[16], n[16], s[16], *t, m[MSZMAX] = { 0 };
 964   int ok = 1;
 965
 966   if (v[0].len != sizeof(k)) { fprintf(stderr, "AES key len\n"); exit(2); }
 967   if (v[1].len != sizeof(r)) { fprintf(stderr, "poly key len\n"); exit(2); }
 968   if (v[2].len != sizeof(n)) { fprintf(stderr, "nonce len\n"); exit(2); }
 969   if (v[4].len != sizeof(n)) { fprintf(stderr, "result len\n"); exit(2); }
 970   memcpy(k, v[0].buf, sizeof(k));
 971   memcpy(r, v[1].buf, sizeof(k));
 972   memcpy(n, v[2].buf, sizeof(k));
 973   niter = *(unsigned long *)v[3].buf;
 974   dstr_ensure(&d, 16); d.len = 16; t = (octet *)d.buf;
 975
 976   rijndael_ecbinit(&rij, k, sizeof(k), 0);
 977   poly1305_keyinit(&key, r, sizeof(r));
 978   for (i = 0; i < niter; i++) {
 979     msz = 0;
 980     for (;;) {
 981       rijndael_ecbencrypt(&rij, n, s, 16);
 982       poly1305_macinit(&mac, &key, s);
 983       poly1305_hash(&mac, m, msz);
 984       poly1305_done(&mac, t);
 985       if (msz >= MSZMAX) break;
 986       n[0] ^= i&0xff;
 987       for (j = 0; j < 16; j++) n[j] ^= t[j];
 988       if (msz%2) {
 989         for (j = 0; j < 16; j++) k[j] ^= t[j];
 990         rijndael_ecbinit(&rij, k, sizeof(k), 0);
 991       }
 992       if (msz%3) {
 993         for (j = 0; j < 16; j++) r[j] ^= t[j];
 994         poly1305_keyinit(&key, r, sizeof(r));
 995       }
 996       m[msz++] ^= t[0];
 997     }
 998   }
 999
1000   if (memcmp(t, v[4].buf, 16) != 0) {
1001     ok = 0;
1002     fprintf(stderr, "failed...");
1003     fprintf(stderr, "\n\tinitial k = "); type_hex.dump(&v[0], stderr);
1004     fprintf(stderr, "\n\tinitial r = "); type_hex.dump(&v[1], stderr);
1005     fprintf(stderr, "\n\tinitial n = "); type_hex.dump(&v[2], stderr);
1006     fprintf(stderr, "\n\titerations = %lu", niter);
1007     fprintf(stderr, "\n\texpected = "); type_hex.dump(&v[4], stderr);
1008     fprintf(stderr, "\n\tcalculated = "); type_hex.dump(&d, stderr);
1009     fputc('\n', stderr);
1010   }
1011
1012   dstr_destroy(&d);
1013   return (ok);
1014 }
1015
1016 static const struct test_chunk tests[] = {
1017   { "poly1305-hash", vrf_hash,
1018     { &type_hex, &type_hex, &type_hex, &type_hex } },
1019   { "poly1305-cat", vrf_cat,
1020     { &type_hex, &type_hex, &type_hex, &type_hex, &type_hex, &type_hex } },
1021   { "poly1305-mct", vrf_mct,
1022     { &type_hex, &type_hex, &type_hex, &type_ulong, &type_hex } },
1023   { 0, 0, { 0 } }
1024 };
1025
1026 int main(int argc, char *argv[])
1027 {
1028   test_run(argc, argv, tests, SRCDIR "/t/poly1305");
1029   return (0);
1030 }
1031
1032 #endif
1033
1034 /*----- That's all, folks -------------------------------------------------*/