mdw@git.distorted.org.uk Git - catacomb/blame_incremental

... / ...

Commit	Line	Data
	1	/* --c--
	2	*
	3	* Poly1305 message authentication code
	4	*
	5	* (c) 2017 Straylight/Edgeware
	6	*/
	7
	8	/----- Licensing notice --------------------------------------------------
	9	*
	10	* This file is part of Catacomb.
	11	*
	12	* Catacomb is free software; you can redistribute it and/or modify
	13	* it under the terms of the GNU Library General Public License as
	14	* published by the Free Software Foundation; either version 2 of the
	15	* License, or (at your option) any later version.
	16	*
	17	* Catacomb is distributed in the hope that it will be useful,
	18	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	* GNU Library General Public License for more details.
	21	*
	22	* You should have received a copy of the GNU Library General Public
	23	* License along with Catacomb; if not, write to the Free
	24	* Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	* MA 02111-1307, USA.
	26	*/
	27
	28	/----- Header files ------------------------------------------------------/
	29
	30	#include "config.h"
	31
	32	#include <assert.h>
	33	#include <string.h>
	34
	35	#include "poly1305.h"
	36	#include "rsvr.h"
	37
	38	/----- Global variables --------------------------------------------------/
	39
	40	const octet poly1305_keysz[] = { KSZ_SET, 16, 0 };
	41
	42	/----- Low-level implementation for 32/64-bit targets --------------------/
	43
	44	#if !defined(POLY1305_IMPL) && defined(HAVE_UINT64)
	45	# define POLY1305_IMPL 26
	46	#endif
	47
	48	#if POLY1305_IMPL == 26
	49
	50	/* Elements x of GF(2^130 - 5) are represented by five integers x_i: x =
	51	* SUM_{0<=i<5} x_i 2^{26i}.
	52	*
	53	* Not all elements are represented canonically. We have 0 <= r_i, s_i <
	54	* 2^26 by construction. We maintain 0 <= h_i < 2^27. When we read a
	55	* message block m, we have 0 <= m_i < 2^26 by construction again. When we
	56	* update the hash state, we calculate h' = r (h + m). Addition is done
	57	* componentwise; let t = h + m, and we will have 0 <= t_i < 3*2^26.
	58	*/
	59	typedef uint32 felt[5];
	60	#define M26 0x03ffffff
	61	#define P p26
	62
	63	/* Convert 32-bit words into field-element pieces. */
	64	#define P26W0(x) (((x##0) << 0)&0x03ffffff)
	65	#define P26W1(x) ((((x##1) << 6)&0x03ffffc0) \| (((x##0) >> 26)&0x0000003f))
	66	#define P26W2(x) ((((x##2) << 12)&0x03ffffff) \| (((x##1) >> 20)&0x00000fff))
	67	#define P26W3(x) ((((x##3) << 18)&0x03fc0000) \| (((x##2) >> 14)&0x0003ffff))
	68	#define P26W4(x) (((x##3) >> 8)&0x00ffffff)
	69
	70	/* Propagate carries in parallel. If 0 <= u_i < 2^26 c_i, then we shall have
	71	* 0 <= v_0 < 2^26 + 5 c_4, and 0 <= v_i < 2^26 + c_{i-1} for 1 <= i < 5.
	72	*/
	73	#define CARRY_REDUCE(v, u) do { \
	74	(v##0) = ((u##0)&M26) + 5*((u##4) >> 26); \
	75	(v##1) = ((u##1)&M26) + ((u##0) >> 26); \
	76	(v##2) = ((u##2)&M26) + ((u##1) >> 26); \
	77	(v##3) = ((u##3)&M26) + ((u##2) >> 26); \
	78	(v##4) = ((u##4)&M26) + ((u##3) >> 26); \
	79	} while (0)
	80
	81	/* General multiplication, used by `concat'. */
	82	static void mul(felt z, const felt x, const felt y)
	83	{
	84	/* Initial bounds: we assume x_i, y_i < 2^27. On exit, z_i < 2^27. */
	85
	86	uint32 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4];
	87	uint32 y0 = y[0], y1 = y[1], y2 = y[2], y3 = y[3], y4 = y[4];
	88	uint64 u0, u1, u2, u3, u4;
	89	uint64 v0, v1, v2, v3, v4;
	90	uint32 z0, z1, z2, z3, z4;
	91
	92	/* Do the multiplication: u = h x mod 2^130 - 5. We will have u_i <
	93	* 2^27 (5 (4 - i) + i + 1) 2^27 = 2^54 (21 - 4 i) = 2^52 (84 - 16 i). In
	94	* all cases we have u_i < 842^52 < 2^59. Notably, u_4 < 52^54 =
	95	* 20*2^52.
	96	*/
	97	#define M(x, y) ((uint64)(x)*(y))
	98	u0 = M(x0, y0) + (M(x1, y4) + M(x2, y3) + M(x3, y2) + M(x4, y1))*5;
	99	u1 = M(x0, y1) + M(x1, y0) + (M(x2, y4) + M(x3, y3) + M(x4, y2))*5;
	100	u2 = M(x0, y2) + M(x1, y1) + M(x2, y0) + (M(x3, y4) + M(x4, y3))*5;
	101	u3 = M(x0, y3) + M(x1, y2) + M(x2, y1) + M(x3, y0) + (M(x4, y4))*5;
	102	u4 = M(x0, y4) + M(x1, y3) + M(x2, y2) + M(x3, y1) + M(x4, y0);
	103	#undef M
	104
	105	/* Now we must reduce the coefficients. We do this in an approximate
	106	* manner which avoids long data-dependency chains, but requires two
	107	* passes.
	108	*
	109	* The reduced carry down from u_4 to u_0 in the first pass will be c_0 <
	110	* 100*2^26; the remaining c_i are smaller: c_i < 2^26 (84 - 16 i). This
	111	* leaves 0 <= v_i < 101*2^26. The carries in the second pass are bounded
	112	* above by 180.
	113	*/
	114	CARRY_REDUCE(v, u); CARRY_REDUCE(z, v);
	115	z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4;
	116	}
	117
	118	/* General squaring, used by `concat'. */
	119	static void sqr(felt z, const felt x)
	120	{
	121	/* Initial bounds: we assume x_i < 2^27. On exit, z_i < 2^27. */
	122
	123	uint32 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4];
	124	uint64 u0, u1, u2, u3, u4;
	125	uint64 v0, v1, v2, v3, v4;
	126	uint32 z0, z1, z2, z3, z4;
	127
	128	/* Do the squaring. See `mul' for bounds. */
	129	#define M(x, y) ((uint64)(x)*(y))
	130	u0 = M(x0, x0) + 10*(M(x1, x4) + M(x2, x3));
	131	u1 = 2* M(x0, x1) + 5(M(x3, x3) + 2M(x2, x4));
	132	u2 = M(x1, x1) + 2* M(x0, x2) + 10* M(x3, x4);
	133	u3 = 2(M(x0, x3) + M(x1, x2)) + 5 M(x4, x4);
	134	u4 = M(x2, x2) + 2*(M(x0, x4) + M(x1, x3));
	135	#undef M
	136
	137	/* Now we must reduce the coefficients. See `mul' for bounds. */
	138	CARRY_REDUCE(v, u); CARRY_REDUCE(z, v);
	139	z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4;
	140	}
	141
	142	/* Multiplication by r, using precomputation. */
	143	static void mul_r(const poly1305_ctx *ctx, felt z, const felt x)
	144	{
	145	/* Initial bounds: by construction, r_i < 2^26. We assume x_i < 3*2^26.
	146	* On exit, z_i < 2^27.
	147	*/
	148
	149	uint32
	150	r0 = ctx->k.u.p26.r0,
	151	r1 = ctx->k.u.p26.r1, rr1 = ctx->k.u.p26.rr1,
	152	r2 = ctx->k.u.p26.r2, rr2 = ctx->k.u.p26.rr2,
	153	r3 = ctx->k.u.p26.r3, rr3 = ctx->k.u.p26.rr3,
	154	r4 = ctx->k.u.p26.r4, rr4 = ctx->k.u.p26.rr4;
	155	uint32 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4];
	156	uint64 u0, u1, u2, u3, u4;
	157	uint64 v0, v1, v2, v3, v4;
	158	uint32 z0, z1, z2, z3, z4;
	159
	160	/* Do the multiplication: u = h x mod 2^130 - 5. We will have u_i <
	161	* 2^26 (5 (4 - i) + i + 1) 3*2^26 = 2^52 (63 - 12 i). In all cases
	162	* we have u_i < 632^52 < 2^58. Notably, u_4 < 152^52.
	163	*/
	164	#define M(x, y) ((uint64)(x)*(y))
	165	u0 = M(x0, r0) + M(x1, rr4) + M(x2, rr3) + M(x3, rr2) + M(x4, rr1);
	166	u1 = M(x0, r1) + M(x1, r0) + M(x2, rr4) + M(x3, rr3) + M(x4, rr2);
	167	u2 = M(x0, r2) + M(x1, r1) + M(x2, r0) + M(x3, rr4) + M(x4, rr3);
	168	u3 = M(x0, r3) + M(x1, r2) + M(x2, r1) + M(x3, r0) + M(x4, rr4);
	169	u4 = M(x0, r4) + M(x1, r3) + M(x2, r2) + M(x3, r1) + M(x4, r0);
	170	#undef M
	171
	172	/* Now we must reduce the coefficients. We do this in an approximate
	173	* manner which avoids long data-dependency chains, but requires two
	174	* passes.
	175	*
	176	* The reduced carry down from u_4 to u_0 in the first pass will be c_0 <
	177	* 75*2^26; the remaining c_i are smaller: c_i < 2^26 (63 - 12 i). This
	178	* leaves 0 <= v_i < 76*2^26. The carries in the second pass are bounded
	179	* above by 135.
	180	*/
	181	CARRY_REDUCE(v, u); CARRY_REDUCE(z, v);
	182	z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4;
	183	}
	184
	185	#endif
	186
	187	/----- Low-level implementation for 16/32-bit targets --------------------/
	188
	189	#ifndef POLY1305_IMPL
	190	# define POLY1305_IMPL 11
	191	#endif
	192
	193	#if POLY1305_IMPL == 11
	194
	195	/* Elements x of GF(2^130 - 5) are represented by 12 integers x_i: x =
	196	* SUM_{0<=i<12} x_i 2^P_i, where P_i = SUM_{0<=j<i} w_j, and w_5 = w_11 =
	197	* 10, and w_i = 11 for i in { 0, 1, 2, 3, 4, 6, 7, 8, 9, 10 }.
	198	*
	199	* Not all elements are represented canonically. We have 0 <= r_i, s_i <
	200	* 2^w_i <= 2^11 by construction. We maintain 0 <= h_i < 2^12. When we read
	201	* a message block m, we have 0 <= m_i < 2^w_i by construction again. When
	202	* we update the hash state, we calculate h' = r (h + m). Addition is done
	203	* componentwise; let t = h + m, and we will have 0 <= t_i < 3*2^11.
	204	*/
	205	typedef uint16 felt[12];
	206	#define M10 0x3ff
	207	#define M11 0x7ff
	208	#define P p11
	209
	210	/* Load a field element from an octet string. */
	211	static void load_p11(felt d, const octet *s)
	212	{
	213	unsigned i, j, n, w;
	214	uint16 m;
	215	uint32 a;
	216
	217	for (i = j = n = 0, a = 0; j < 12; j++) {
	218	if (j == 5 \|\| j == 11) { w = 10; m = M10; }
	219	else { w = 11; m = M11; }
	220	while (n < w && i < 16) { a \|= s[i++] << n; n += 8; }
	221	d[j] = a&m; a >>= w; n -= w;
	222	}
	223	}
	224
	225	/* Reduce a field-element's pieces to manageable size. */
	226	static void carry_reduce(uint32 u[12])
	227	{
	228	/* Initial bounds: we assume u_i < 6362^22. On exit, u_i < 2^11. /
	229
	230	unsigned i;
	231	uint32 c;
	232
	233	/* Do sequential carry propagation (16-bit CPUs are less likely to benefit
	234	* from instruction-level parallelism). Start at u_9; truncate it to 11
	235	* bits, and add the carry onto u_10. Truncate u10 to 11 bits, and add the
	236	* carry onto u_11. Truncate u_11 to 10 bits, and add five times the carry
	237	* onto u_0. And so on.
	238	*
	239	* The carry is larger than the pieces we're leaving behind. Let c_i be
	240	* the high portion of u_i, to be carried onto u_{i+1}. I claim that c_i <
	241	* 25572^10. Then the carry /into/ any u_i is at most 127852^10 < 2^24
	242	* (allowing for the reduction as we carry from u_11 to u_0), and u_i after
	243	* carry is bounded above by 6362^22 + 127852^10 < 2557*2^20. Hence, the
	244	* carry out is at most 2557*2^10, as claimed.
	245	*
	246	* Once we reach u_9 for the second time, we start with u_9 < 2^11. The
	247	* carry into u_9 is at most 25572^10 < 12792^11 as calculated above; so
	248	* the carry out into u_10 is at most 1280. Since u_10 < 2^11 prior to
	249	* this carry in, we now have u_10 < 2^11 + 1280 < 2^12; so the carry out
	250	* into u_11 is at most 1. The final reduction therefore only needs a
	251	* conditional subtraction.
	252	*/
	253	{ c = u[9] >> 11; u[9] &= M11; }
	254	{ u[10] += c; c = u[10] >> 11; u[10] &= M11; }
	255	{ u[11] += c; c = u[11] >> 10; u[11] &= M10; }
	256	{ u[0] += 5*c; c = u[0] >> 11; u[0] &= M11; }
	257	for (i = 1; i < 5; i++) { u[i] += c; c = u[i] >> 11; u[i] &= M11; }
	258	{ u[5] += c; c = u[5] >> 10; u[5] &= M10; }
	259	for (i = 6; i < 11; i++) { u[i] += c; c = u[i] >> 11; u[i] &= M11; }
	260	u[11] += c;
	261	}
	262
	263	/* General multiplication. */
	264	static void mul(felt z, const felt x, const felt y)
	265	{
	266	/* Initial bounds: we assume x_i < 3*2^11, and y_i < 2^12. On exit,
	267	* z_i < 2^12.
	268	*/
	269
	270	uint32 u[12];
	271	unsigned i, j, k;
	272
	273	/* Do the main multiplication. After this, we shall have
	274	*
	275	* { 2^22 (636 - 184 i) for 0 <= i < 6
	276	* u_i < {
	277	* { 2^22 (732 - 60 i) for 6 <= i < 12
	278	*
	279	* In particular, u_0 < 6362^22 < 2^32, and u_11 < 722^22.
	280	*
	281	* The irregularly positioned pieces are annoying. Because we fold the
	282	* reduction into the multiplication, it's also important to see where the
	283	* reduced products fit. Finally, products don't align with the piece
	284	* boundaries, and sometimes need to be doubled. The following table
	285	* tracks all of this.
	286	*
	287	* piece width offset second
	288	* 0 11 0 130
	289	* 1 11 11 141
	290	* 2 11 22 152
	291	* 3 11 33 163
	292	* 4 11 44 174
	293	* 5 10 55 185
	294	* 6 11 65 195
	295	* 7 11 76 206
	296	* 8 11 87 217
	297	* 9 11 98 228
	298	* 10 11 109 239
	299	* 11 10 120 250
	300	*
	301	* The next table tracks exactly which products end up being multiplied by
	302	* which constants and accumulated into which destination pieces.
	303	*
	304	* u_k = t_i r_j + 2 t_i r_j + 5 t_i r_j + 10 t_i r_j
	305	* 0 0/0 -- 6/6 1-5/11-7 7-11/5-1
	306	* 1 0-1/1-0 -- 6-7/7-6 2-5/11-8 8-11/5-2
	307	* 2 0-2/2-0 -- 6-8/8-6 3-5/11-9 9-11/5-3
	308	* 3 0-3/3-0 -- 6-9/9-6 4-5/11-10 10-11/5-4
	309	* 4 0-4/4-0 -- 6-10/10-6 5/11 11/5
	310	* 5 0-5/5-0 -- 6-11/11-6 --
	311	* 6 0/6 6/0 1-5/5-1 -- 7-11/11-7
	312	* 7 0-1/7-6 6-7/1-0 2-5/5-2 -- 8-11/11-8
	313	* 8 0-2/8-6 6-8/2-0 3-5/5-3 -- 9-11/11-9
	314	* 9 0-3/9-6 6-9/3-0 4-5/5-4 -- 10-11/11-10
	315	* 10 0-4/10-6 6-10/4-0 5/5 -- 11/11
	316	* 11 0-11/11-0 -- -- --
	317	*
	318	* And, finally, trying to bound the multiple of 6*2^22 in each destination
	319	* piece is fiddly, so here's a tableau showing the calculation.
	320	*
	321	* k 1* + 2* + 5* +10* = 1* + 5* =
	322	* 0 1 -- 1 10 1 21 106
	323	* 1 2 -- 2 8 2 18 92
	324	* 2 3 -- 3 6 3 15 78
	325	* 3 4 -- 4 4 4 12 64
	326	* 4 5 -- 5 2 5 9 50
	327	* 5 6 -- 6 -- 6 6 36
	328	* 6 2 5 -- 5 12 10 62
	329	* 7 4 4 -- 4 12 8 52
	330	* 8 6 3 -- 3 12 6 42
	331	* 9 8 2 -- 2 12 4 32
	332	* 10 10 1 -- 1 12 2 22
	333	* 11 12 -- -- -- 12 0 12
	334	*/
	335
	336	for (i = 0; i < 12; i++) u[i] = 0;
	337
	338	#define M(i, j) ((uint32)x[i]*y[j])
	339
	340	/* Product terms we must multiply by 10. */
	341	for (k = 0; k < 5; k++) {
	342	for (i = k + 1; i < 6; i++) {
	343	j = 12 + k - i;
	344	u[k] += M(i, j) + M(j, i);
	345	u[k + 6] += M(i + 6, j);
	346	}
	347	}
	348	for (k = 0; k < 5; k++) u[k] *= 2;
	349	for (k = 6; k < 11; k++) u[k] *= 5;
	350
	351	/* Product terms we must multiply by 5. */
	352	for (k = 0; k < 6; k++) {
	353	for (i = k + 6; i >= 6; i--) {
	354	j = 12 + k - i;
	355	u[k] += M(i, j);
	356	}
	357	}
	358	for (k = 0; k < 6; k++) u[k] *= 5;
	359
	360	/* Product terms we must multiply by 2. */
	361	for (k = 6; k < 11; k++) {
	362	for (i = k - 5; i < 6; i++) {
	363	j = k - i;
	364	u[k] += M(i, j);
	365	}
	366	}
	367	for (k = 6; k < 11; k++) u[k] *= 2;
	368
	369	/* Remaining product terms. */
	370	for (k = 0; k < 6; k++) {
	371	for (i = k; i < 6; i--) {
	372	j = k - i;
	373	u[k] += M(i, j);
	374	u[k + 6] += M(i + 6, j) + M(i, j + 6);
	375	}
	376	}
	377
	378	#undef M
	379
	380	/* Do the reduction. Currently, `carry_reduce' does more than we need, but
	381	* that's fine.
	382	*/
	383	carry_reduce(u);
	384
	385	/* Done. Write out the answer. */
	386	for (i = 0; i < 12; i++) z[i] = u[i];
	387	}
	388
	389	/* General squaring, used by `concat'. */
	390	static void sqr(felt z, const felt x)
	391	{ mul(z, x, x); }
	392
	393	/* Multiplication by r. */
	394	static void mul_r(const poly1305_ctx *ctx, felt z, const felt x)
	395	{ mul(z, x, ctx->k.u.p11.r); }
	396
	397	#endif
	398
	399	/----- Interface functions -----------------------------------------------/
	400
	401	/* --- @poly1305_keyinit@ --- *
	402	*
	403	* Arguments: @poly1305_key *key@ = key structure to fill in
	404	* @const void *k@ = pointer to key material
	405	* @size_t ksz@ = length of key (must be @POLY1305_KEYSZ == 16@)
	406	*
	407	* Returns: ---
	408	*
	409	* Use: Records a Poly1305 key and performs (minimal)
	410	* precomputations.
	411	*/
	412
	413	void poly1305_keyinit(poly1305_key key, const void k, size_t ksz)
	414	{
	415	const octet *r = k;
	416	#if POLY1305_IMPL == 11
	417	octet rr[16];
	418	#endif
	419
	420	KSZ_ASSERT(poly1305, ksz);
	421
	422	#if POLY1305_IMPL == 26
	423	uint32 r0 = LOAD32_L(r + 0), r1 = LOAD32_L(r + 4),
	424	r2 = LOAD32_L(r + 8), r3 = LOAD32_L(r + 12);
	425
	426	r0 &= 0x0fffffff; r1 &= 0x0ffffffc; r2 &= 0x0ffffffc; r3 &= 0x0ffffffc;
	427	key->u.p26.r0 = P26W0(r); key->u.p26.r1 = P26W1(r);
	428	key->u.p26.r2 = P26W2(r); key->u.p26.r3 = P26W3(r);
	429	key->u.p26.r4 = P26W4(r);
	430
	431	key->u.p26.rr1 = 5key->u.p26.r1; key->u.p26.rr2 = 5key->u.p26.r2;
	432	key->u.p26.rr3 = 5key->u.p26.r3; key->u.p26.rr4 = 5key->u.p26.r4;
	433	#else
	434	memcpy(rr, r, 16);
	435	rr[ 3] &= 0x0f;
	436	rr[ 4] &= 0xfc; rr[ 7] &= 0x0f;
	437	rr[ 8] &= 0xfc; rr[11] &= 0x0f;
	438	rr[12] &= 0xfc; rr[15] &= 0x0f;
	439	load_p11(key->u.p11.r, rr);
	440	#endif
	441	}
	442
	443	/* --- @poly1305_macinit@ --- *
	444	*
	445	* Arguments: @poly1305_ctx *ctx@ = MAC context to fill in
	446	* @const poly1305_key *key@ = pointer to key structure to use
	447	* @const void *iv@ = pointer to mask string
	448	*
	449	* Returns: ---
	450	*
	451	* Use: Initializes a MAC context for use. The key can be discarded
	452	* at any time.
	453	*
	454	* It is permitted for @iv@ to be null, though it is not then
	455	* possible to complete the MAC computation on @ctx@. The
	456	* resulting context may still be useful, e.g., as an operand to
	457	* @poly1305_concat@.
	458	*/
	459
	460	void poly1305_macinit(poly1305_ctx *ctx,
	461	const poly1305_key key, const void iv)
	462	{
	463	const octet *s = iv;
	464	#if POLY1305_IMPL == 26
	465	uint32 s0, s1, s2, s3;
	466	#else
	467	unsigned i;
	468	#endif
	469
	470	#if POLY1305_IMPL == 26
	471	if (s) {
	472	s0 = LOAD32_L(s + 0); s1 = LOAD32_L(s + 4);
	473	s2 = LOAD32_L(s + 8); s3 = LOAD32_L(s + 12);
	474	ctx->u.p26.s0 = P26W0(s); ctx->u.p26.s1 = P26W1(s);
	475	ctx->u.p26.s2 = P26W2(s); ctx->u.p26.s3 = P26W3(s);
	476	ctx->u.p26.s4 = P26W4(s);
	477	}
	478	ctx->u.p26.h[0] = ctx->u.p26.h[1] = ctx->u.p26.h[2] =
	479	ctx->u.p26.h[3] = ctx->u.p26.h[4] = 0;
	480	#else
	481	if (s) load_p11(ctx->u.p11.s, s);
	482	for (i = 0; i < 12; i++) ctx->u.p11.h[i] = 0;
	483	#endif
	484	ctx->k = *key;
	485	ctx->nbuf = 0;
	486	ctx->count = 0;
	487	}
	488
	489	/* --- @poly1305_copy@ --- *
	490	*
	491	* Arguments: @poly1305_ctx *to@ = destination context
	492	* @const poly1305_ctx *from@ = source context
	493	*
	494	* Returns: ---
	495	*
	496	* Use: Duplicates a Poly1305 MAC context. The destination need not
	497	* have been initialized. Both contexts can be used
	498	* independently afterwards.
	499	*/
	500
	501	void poly1305_copy(poly1305_ctx ctx, const poly1305_ctx from)
	502	{ ctx = from; }
	503
	504	/* --- @poly1305_hash@ --- *
	505	*
	506	* Arguments: @poly1305_ctx *ctx@ = MAC context to update
	507	* @const void *p@ = pointer to message data
	508	* @size_t sz@ = length of message data
	509	*
	510	* Returns: ---
	511	*
	512	* Use: Processes a chunk of message. The message pieces may have
	513	* arbitrary lengths, and may be empty.
	514	*/
	515
	516	static void update_full(poly1305_ctx ctx, const octet p)
	517	{
	518	felt t;
	519	#if POLY1305_IMPL == 26
	520	uint32
	521	m0 = LOAD32_L(p + 0), m1 = LOAD32_L(p + 4),
	522	m2 = LOAD32_L(p + 8), m3 = LOAD32_L(p + 12);
	523
	524	t[0] = ctx->u.p26.h[0] + P26W0(m);
	525	t[1] = ctx->u.p26.h[1] + P26W1(m);
	526	t[2] = ctx->u.p26.h[2] + P26W2(m);
	527	t[3] = ctx->u.p26.h[3] + P26W3(m);
	528	t[4] = ctx->u.p26.h[4] + P26W4(m) + 0x01000000;
	529	#else
	530	unsigned i;
	531
	532	load_p11(t, p); t[11] += 0x100;
	533	for (i = 0; i < 12; i++) t[i] += ctx->u.p11.h[i];
	534	#endif
	535
	536	mul_r(ctx, ctx->u.P.h, t);
	537	ctx->count++;
	538	}
	539
	540	static const rsvr_policy pol = { 0, 16, 16 };
	541
	542	void poly1305_hash(poly1305_ctx ctx, const void p, size_t sz)
	543	{
	544	rsvr_state st;
	545	const octet *q = p;
	546
	547	rsvr_setup(&st, &pol, &ctx->buf, &ctx->nbuf, p, sz);
	548	RSVR_DO(&st) while ((q = RSVR_NEXT(&st, 16)) != 0) update_full(ctx, q);
	549	}
	550
	551	/* --- @poly1305_flush@ --- *
	552	*
	553	* Arguments: @poly1305_ctx *ctx@ = MAC context to flush
	554	*
	555	* Returns: ---
	556	*
	557	* Use: Forces any buffered message data in the context to be
	558	* processed. This has no effect if the message processed so
	559	* far is a whole number of blocks. Flushing is performed
	560	* automatically by @poly1305_done@, but it may be necessary to
	561	* force it by hand when using @poly1305_concat@.
	562	* (Alternatively, you might use @poly1305_flushzero@ instead.)
	563	*
	564	* Flushing a partial block has an observable effect on the
	565	* computation: the resulting state is (with high probability)
	566	* dissimilar to any state reachable with a message which is a
	567	* whole number of blocks long.
	568	*/
	569
	570	void poly1305_flush(poly1305_ctx *ctx)
	571	{
	572	felt t;
	573	#if POLY1305_IMPL == 26
	574	uint32 m0, m1, m2, m3;
	575	#else
	576	unsigned i;
	577	#endif
	578
	579	if (!ctx->nbuf) return;
	580	ctx->buf[ctx->nbuf++] = 1; memset(ctx->buf + ctx->nbuf, 0, 16 - ctx->nbuf);
	581	#if POLY1305_IMPL == 26
	582	m0 = LOAD32_L(ctx->buf + 0); m1 = LOAD32_L(ctx->buf + 4);
	583	m2 = LOAD32_L(ctx->buf + 8); m3 = LOAD32_L(ctx->buf + 12);
	584
	585	t[0] = ctx->u.p26.h[0] + P26W0(m);
	586	t[1] = ctx->u.p26.h[1] + P26W1(m);
	587	t[2] = ctx->u.p26.h[2] + P26W2(m);
	588	t[3] = ctx->u.p26.h[3] + P26W3(m);
	589	t[4] = ctx->u.p26.h[4] + P26W4(m);
	590	#else
	591	load_p11(t, ctx->buf);
	592	for (i = 0; i < 12; i++) t[i] += ctx->u.p11.h[i];
	593	#endif
	594
	595	mul_r(ctx, ctx->u.P.h, t);
	596	ctx->nbuf = 0; ctx->count++;
	597	}
	598
	599	/* --- @poly1305_flushzero@ --- *
	600	*
	601	* Arguments: @poly1305_ctx *ctx@ = MAC context to flush
	602	*
	603	* Returns: ---
	604	*
	605	* Use: Forces any buffered message data in the context to be
	606	* processed, by hashing between zero and fifteen additional
	607	* zero bytes. Like @poly1305_flush@, this has no effect if the
	608	* the message processed so far is a whole number of blocks.
	609	* Unlike @poly1305_flush@, the behaviour if the message is not
	610	* a whole number of blocks is equivalent to actually hashing
	611	* some extra data.
	612	*/
	613
	614	void poly1305_flushzero(poly1305_ctx *ctx)
	615	{
	616	if (!ctx->nbuf) return;
	617	memset(ctx->buf + ctx->nbuf, 0, 16 - ctx->nbuf);
	618	update_full(ctx, ctx->buf);
	619	ctx->nbuf = 0;
	620	}
	621
	622	/* --- @poly1305_concat@ --- *
	623	*
	624	* Arguments: @poly1305_ctx *ctx@ = destination context
	625	* @const poly1305_ctx prefix, suffix@ = two operand contexts
	626	*
	627	* Returns: ---
	628	*
	629	* Use: The two operand contexts @prefix@ and @suffix@ represent
	630	* processing of two messages %$m$% and %$m'$%; the effect is to
	631	* set @ctx@ to the state corresponding to their concatenation
	632	* %$m \cat m'$%.
	633	*
	634	* All three contexts must have been initialized using the same
	635	* key value (though not necessarily from the same key
	636	* structure). The mask values associated with the input
	637	* contexts are irrelevant. The @prefix@ message %$m$% must be
	638	* a whole number of blocks long: this can be arranged by
	639	* flushing the context. The @suffix@ message need not be a
	640	* whole number of blocks long. All of the contexts remain
	641	* operational and can be used independently afterwards.
	642	*/
	643
	644	void poly1305_concat(poly1305_ctx *ctx,
	645	const poly1305_ctx prefix, const poly1305_ctx suffix)
	646	{
	647	/* Assume that lengths are public, so it's safe to behave conditionally on
	648	* the bits of ctx->count.
	649	*/
	650	unsigned long n;
	651	unsigned i;
	652	felt x;
	653	#if POLY1305_IMPL == 26
	654	uint32 x0, x1, x2, x3, x4, y0, y1, y2, y3, y4;
	655	#else
	656	uint32 y[12];
	657	#endif
	658
	659	/* We can only concatenate if the prefix is block-aligned. */
	660	assert(!prefix->nbuf);
	661
	662	/* The hash for a message m = m_{k-1} m_{k-2} ... m_1 m_0 is h_r(m) =
	663	* SUM_{0<=i<k} m_i r^{i+1}. If we have two messages, m, m', of lengths k
	664	* and k' blocks respectively, then
	665	*
	666	* h_r(m \|\| m') = SUM_{0<=i<k} m_i r^{k'+i+1} +
	667	* SUM_{0<=i<k'} m'_i r^{i+1}
	668	* = r^{k'} h_r(m) + h_r(m')
	669	*
	670	* This is simple left-to-right square-and-multiply exponentiation.
	671	*/
	672	n = suffix->count;
	673	x[0] = 1;
	674	#if POLY1305_IMPL == 26
	675	x[1] = x[2] = x[3] = x[4] = 0;
	676	#else
	677	for (i = 1; i < 12; i++) x[i] = 0;
	678	#endif
	679	#define BIT (1ul << (ULONG_BITS - 1))
	680	if (n) {
	681	i = ULONG_BITS;
	682	while (!(n & BIT)) { n <<= 1; i--; }
	683	mul_r(prefix, x, x); n <<= 1; i--;
	684	while (i--) { sqr(x, x); if (n & BIT) mul_r(prefix, x, x); n <<= 1; }
	685	}
	686	#undef BIT
	687	mul(x, prefix->u.P.h, x);
	688
	689	/* Add on the suffix hash. */
	690	#if POLY1305_IMPL == 26
	691	/* We're going to add the two hashes elementwise. Both h' = h_r(m') and
	692	* x = r^{k'} h_r(m) are bounded above by 2^27, so the sum will be bounded
	693	* by 2^28; but this is too large to leave in the accumulator. (Strictly,
	694	* we could get away with it, but the caller can in theory chain an
	695	* arbitrary number of concatenations and expect us to cope, and we'd
	696	* definitely overflow eventually.) So we reduce. Since the excess is so
	697	* small, a single round of `CARRY_REDUCE' is enough.
	698	*/
	699	x0 = x[0] + suffix->u.p26.h[0]; x1 = x[1] + suffix->u.p26.h[1];
	700	x2 = x[2] + suffix->u.p26.h[2]; x3 = x[3] + suffix->u.p26.h[3];
	701	x4 = x[4] + suffix->u.p26.h[4];
	702	CARRY_REDUCE(y, x);
	703	ctx->u.p26.h[0] = y0; ctx->u.p26.h[1] = y1; ctx->u.p26.h[2] = y2;
	704	ctx->u.p26.h[3] = y3; ctx->u.p26.h[4] = y4;
	705	#else
	706	/* We'll add the two hashes elementwise and have to reduce again. The
	707	* numbers are different, but the reasoning is basically the same.
	708	*/
	709	for (i = 0; i < 12; i++) y[i] = x[i] + suffix->u.p11.h[i];
	710	carry_reduce(y);
	711	for (i = 0; i < 12; i++) ctx->u.p11.h[i] = y[i];
	712	#endif
	713
	714	/* Copy the remaining pieces of the context to set up the result. */
	715	if (ctx != suffix) {
	716	memcpy(ctx->buf, suffix->buf, suffix->nbuf);
	717	ctx->nbuf = suffix->nbuf;
	718	}
	719	ctx->count = prefix->count + suffix->count;
	720	}
	721
	722	/* --- @poly1305_done@ --- *
	723	*
	724	* Arguments: @poly1305_ctx *ctx@ = MAC context to finish
	725	* @void *h@ = buffer to write the tag to
	726	*
	727	* Returns: ---
	728	*
	729	* Use: Completes a Poly1305 MAC tag computation.
	730	*/
	731
	732	void poly1305_done(poly1305_ctx ctx, void h)
	733	{
	734	octet *p = h;
	735
	736	#if POLY1305_IMPL == 26
	737	uint32 m_sub, t, c;
	738	uint32 h0, h1, h2, h3, h4, hh0, hh1, hh2, hh3, hh4;
	739
	740	/* If there's anything left over in the buffer, pad it to form a final
	741	* coefficient and update the evaluation one last time.
	742	*/
	743	poly1305_flush(ctx);
	744
	745	/* Collect the final hash state. */
	746	h0 = ctx->u.p26.h[0];
	747	h1 = ctx->u.p26.h[1];
	748	h2 = ctx->u.p26.h[2];
	749	h3 = ctx->u.p26.h[3];
	750	h4 = ctx->u.p26.h[4];
	751
	752	/* Reduce the final value mod 2^130 - 5. First pass: set h <- h +
	753	* 5 floor(h/2^130). After this, the low pieces of h will be normalized:
	754	* 0 <= h_i < 2^26 for 0 <= i < 4; and 0 <= h_4 < 2^26 + 1. In the
	755	* (highly unlikely) event that h_4 >= 2^26, set c and truncate to 130
	756	* bits.
	757	*/
	758	c = h4 >> 26; h4 &= M26;
	759	h0 += 5*c; c = h0 >> 26; h0 &= M26;
	760	h1 += c; c = h1 >> 26; h1 &= M26;
	761	h2 += c; c = h2 >> 26; h2 &= M26;
	762	h3 += c; c = h3 >> 26; h3 &= M26;
	763	h4 += c; c = h4 >> 26; h4 &= M26;
	764
	765	/* Calculate h' = h - (2^130 - 5). If h' >= 0 then t ends up 1; otherwise
	766	* it's zero.
	767	*/
	768	t = h0 + 5; hh0 = t&M26; t >>= 26;
	769	t += h1; hh1 = t&M26; t >>= 26;
	770	t += h2; hh2 = t&M26; t >>= 26;
	771	t += h3; hh3 = t&M26; t >>= 26;
	772	t += h4; hh4 = t&M26; t >>= 26;
	773
	774	/* Keep the subtraction result above if t or c is set. */
	775	m_sub = -(t \| c);
	776	h0 = (hh0&m_sub) \| (h0&~m_sub);
	777	h1 = (hh1&m_sub) \| (h1&~m_sub);
	778	h2 = (hh2&m_sub) \| (h2&~m_sub);
	779	h3 = (hh3&m_sub) \| (h3&~m_sub);
	780	h4 = (hh4&m_sub) \| (h4&~m_sub);
	781
	782	/* Add the mask onto the hash result. */
	783	t = h0 + ctx->u.p26.s0; h0 = t&M26; t >>= 26;
	784	t += h1 + ctx->u.p26.s1; h1 = t&M26; t >>= 26;
	785	t += h2 + ctx->u.p26.s2; h2 = t&M26; t >>= 26;
	786	t += h3 + ctx->u.p26.s3; h3 = t&M26; t >>= 26;
	787	t += h4 + ctx->u.p26.s4; h4 = t&M26; t >>= 26;
	788
	789	/* Convert this mess back into 32-bit words. We lose the top two bits,
	790	* but that's fine.
	791	*/
	792	h0 = (h0 >> 0) \| ((h1 & 0x0000003f) << 26);
	793	h1 = (h1 >> 6) \| ((h2 & 0x00000fff) << 20);
	794	h2 = (h2 >> 12) \| ((h3 & 0x0003ffff) << 14);
	795	h3 = (h3 >> 18) \| ((h4 & 0x00ffffff) << 8);
	796
	797	/* All done. */
	798	STORE32_L(p + 0, h0); STORE32_L(p + 4, h1);
	799	STORE32_L(p + 8, h2); STORE32_L(p + 12, h3);
	800	#else
	801	uint16 hh[12], hi[12], c, t, m_sub;
	802	uint32 a;
	803	unsigned i, j, n;
	804
	805	/* If there's anything left over in the buffer, pad it to form a final
	806	* coefficient and update the evaluation one last time.
	807	*/
	808	poly1305_flush(ctx);
	809
	810	/* Collect the final hash state. */
	811	for (i = 0; i < 12; i++) hh[i] = ctx->u.p11.h[i];
	812
	813	/* Reduce the final value mod 2^130 - 5. First pass: set h <- h +
	814	* 5 floor(h/2^130). After this, the low pieces of h will be normalized:
	815	* 0 <= h_i < 2^{w_i} for 0 <= i < 11; and 0 <= h_{11} < 2^10 + 1. In the
	816	* (highly unlikely) event that h_{11} >= 2^10, set c and truncate to 130
	817	* bits.
	818	*/
	819	c = 5*(hh[11] >> 10); hh[11] &= M10;
	820	for (i = 0; i < 12; i++) {
	821	if (i == 5 \|\| i == 11) { c += hh[i]; hh[i] = c&M10; c >>= 10; }
	822	else { c += hh[i]; hh[i] = c&M11; c >>= 11; }
	823	}
	824
	825	/* Calculate h' = h - (2^130 - 5). If h' >= 0 then t ends up 1; otherwise
	826	* it's zero.
	827	*/
	828	for (i = 0, t = 5; i < 12; i++) {
	829	t += hh[i];
	830	if (i == 5 \|\| i == 11) { hi[i] = t&M10; t >>= 10; }
	831	else { hi[i] = t&M11; t >>= 11; }
	832	}
	833
	834	/* Keep the subtraction result above if t or c is set. */
	835	m_sub = -(t \| c);
	836	for (i = 0; i < 12; i++) hh[i] = (hi[i]&m_sub) \| (hh[i]&~m_sub);
	837
	838	/* Add the mask onto the hash result. */
	839	for (i = 0, t = 0; i < 12; i++) {
	840	t += hh[i] + ctx->u.p11.s[i];
	841	if (i == 5 \|\| i == 11) { hh[i] = t&M10; t >>= 10; }
	842	else { hh[i] = t&M11; t >>= 11; }
	843	}
	844
	845	/* Convert this mess back into bytes. We lose the top two bits, but that's
	846	* fine.
	847	*/
	848	for (i = j = n = 0, a = 0; i < 16; i++) {
	849	if (n < 8) {
	850	a \|= hh[j] << n;
	851	n += (j == 5 \|\| j == 11) ? 10 : 11;
	852	j++;
	853	}
	854	p[i] = a&0xff; a >>= 8; n -= 8;
	855	}
	856
	857	#endif
	858	}
	859
	860	/----- Test rig ----------------------------------------------------------/
	861
	862	#ifdef TEST_RIG
	863
	864	#include <mLib/macros.h>
	865	#include <mLib/testrig.h>
	866
	867	#include "ct.h"
	868	#include "rijndael-ecb.h"
	869
	870	static int vrf_hash(dstr v[])
	871	{
	872	poly1305_key k;
	873	poly1305_ctx ctx;
	874	dstr t = DSTR_INIT;
	875	unsigned i, j;
	876
	877	if (v[0].len != 16) { fprintf(stderr, "bad key length\n"); exit(2); }
	878	if (v[1].len != 16) { fprintf(stderr, "bad mask length\n"); exit(2); }
	879	if (v[3].len != 16) { fprintf(stderr, "bad tag length\n"); exit(2); }
	880	dstr_ensure(&t, 16); t.len = 16;
	881
	882	ct_poison(v[0].buf, v[0].len);
	883	poly1305_keyinit(&k, v[0].buf, v[0].len);
	884	for (i = 0; i < v[2].len; i++) {
	885	for (j = i; j < v[2].len; j++) {
	886	poly1305_macinit(&ctx, &k, v[1].buf);
	887	poly1305_hash(&ctx, v[2].buf, i);
	888	poly1305_hash(&ctx, v[2].buf + i, j - i);
	889	poly1305_hash(&ctx, v[2].buf + j, v[2].len - j);
	890	poly1305_done(&ctx, t.buf);
	891	ct_remedy(t.buf, t.len);
	892	if (MEMCMP(t.buf, !=, v[3].buf, 16)) {
	893	fprintf(stderr, "failed...");
	894	fprintf(stderr, "\n\tkey = "); type_hex.dump(&v[0], stderr);
	895	fprintf(stderr, "\n\tmask = "); type_hex.dump(&v[1], stderr);
	896	fprintf(stderr, "\n\tmsg = "); type_hex.dump(&v[2], stderr);
	897	fprintf(stderr, "\n\texp = "); type_hex.dump(&v[3], stderr);
	898	fprintf(stderr, "\n\tcalc = "); type_hex.dump(&t, stderr);
	899	fprintf(stderr, "\n\tsplits = 0 .. %u .. %u .. %lu\n",
	900	i, j, (unsigned long)v[1].len);
	901	return (0);
	902	}
	903	}
	904	}
	905	return (1);
	906	}
	907
	908	static int vrf_cat(dstr v[])
	909	{
	910	poly1305_key k;
	911	poly1305_ctx ctx, cc[3];
	912	dstr t = DSTR_INIT;
	913	unsigned i;
	914	int ok = 1;
	915
	916	if (v[0].len != 16) { fprintf(stderr, "bad key length\n"); exit(2); }
	917	if (v[1].len != 16) { fprintf(stderr, "bad mask length\n"); exit(2); }
	918	if (v[5].len != 16) { fprintf(stderr, "bad tag length\n"); exit(2); }
	919	dstr_ensure(&t, 16); t.len = 16;
	920
	921	poly1305_keyinit(&k, v[0].buf, v[0].len);
	922	poly1305_macinit(&ctx, &k, v[1].buf);
	923	for (i = 0; i < 3; i++) {
	924	poly1305_macinit(&cc[i], &k, 0);
	925	poly1305_hash(&cc[i], v[i + 2].buf, v[i + 2].len);
	926	}
	927	for (i = 0; i < 2; i++) {
	928	if (!i) {
	929	poly1305_concat(&ctx, &cc[1], &cc[2]);
	930	poly1305_concat(&ctx, &cc[0], &ctx);
	931	} else {
	932	poly1305_concat(&ctx, &cc[0], &cc[1]);
	933	poly1305_concat(&ctx, &ctx, &cc[2]);
	934	}
	935	poly1305_done(&ctx, t.buf);
	936	if (MEMCMP(t.buf, !=, v[5].buf, 16)) {
	937	fprintf(stderr, "failed...");
	938	fprintf(stderr, "\n\tkey = "); type_hex.dump(&v[0], stderr);
	939	fprintf(stderr, "\n\tmask = "); type_hex.dump(&v[1], stderr);
	940	fprintf(stderr, "\n\tmsg[0] = "); type_hex.dump(&v[2], stderr);
	941	fprintf(stderr, "\n\tmsg[1] = "); type_hex.dump(&v[3], stderr);
	942	fprintf(stderr, "\n\tmsg[2] = "); type_hex.dump(&v[4], stderr);
	943	fprintf(stderr, "\n\texp = "); type_hex.dump(&v[5], stderr);
	944	fprintf(stderr, "\n\tcalc = "); type_hex.dump(&t, stderr);
	945	fprintf(stderr, "\n\tassoc = %s\n",
	946	!i ? "msg[0] \|\| (msg[1] \|\| msg[2])" :
	947	"(msg[0] \|\| msg[1]) \|\| msg[2]");
	948	ok = 0;
	949	}
	950	}
	951	return (ok);
	952	}
	953
	954	#define MSZMAX 1000
	955
	956	static int vrf_mct(dstr v[])
	957	{
	958	unsigned j, msz;
	959	unsigned long i, start_iter, end_iter;
	960	rijndael_ecbctx rij;
	961	poly1305_key key;
	962	poly1305_ctx mac;
	963	dstr dk = DSTR_INIT, dr = DSTR_INIT, dn = DSTR_INIT,
	964	dt = DSTR_INIT, dm = DSTR_INIT;
	965	octet k, r, s[16], n, t, *m;
	966	int ok = 1;
	967
	968	DENSURE(&dk, 16); k = (octet *)dk.buf; dk.len = 16;
	969	DENSURE(&dr, 16); r = (octet *)dr.buf; dr.len = 16;
	970	DENSURE(&dn, 16); n = (octet *)dn.buf; dn.len = 16;
	971	DENSURE(&dt, 16); t = (octet *)dt.buf; dt.len = 16;
	972	DENSURE(&dm, MSZMAX); m = (octet *)dm.buf; dm.len = MSZMAX;
	973	memset(m, 0, MSZMAX);
	974
	975	if (v[0].len != 16) { fprintf(stderr, "AES key len\n"); exit(2); }
	976	if (v[1].len != 16) { fprintf(stderr, "poly key len\n"); exit(2); }
	977	if (v[2].len != 16) { fprintf(stderr, "nonce len\n"); exit(2); }
	978	if (v[3].len != MSZMAX) { fprintf(stderr, "msgbuf len\n"); exit(2); }
	979	if (v[6].len != 16) { fprintf(stderr, "result len\n"); exit(2); }
	980	memcpy(k, v[0].buf, 16);
	981	memcpy(r, v[1].buf, 16);
	982	memcpy(n, v[2].buf, 16);
	983	memcpy(m, v[3].buf, MSZMAX);
	984	start_iter = (unsigned long )v[4].buf;
	985	end_iter = (unsigned long )v[5].buf;
	986	if (end_iter < start_iter) { fprintf(stderr, "iter bounds\n"); exit(2); }
	987
	988	rijndael_ecbinit(&rij, k, 16, 0);
	989	poly1305_keyinit(&key, r, 16);
	990	for (i = start_iter; i < end_iter; i++) {
	991	msz = 0;
	992	for (;;) {
	993	rijndael_ecbencrypt(&rij, n, s, 16);
	994	poly1305_macinit(&mac, &key, s);
	995	poly1305_hash(&mac, m, msz);
	996	poly1305_done(&mac, t);
	997	if (msz >= MSZMAX) break;
	998	n[0] ^= i&0xff;
	999	for (j = 0; j < 16; j++) n[j] ^= t[j];
	1000	if (msz%2) {
	1001	for (j = 0; j < 16; j++) k[j] ^= t[j];
	1002	rijndael_ecbinit(&rij, k, 16, 0);
	1003	}
	1004	if (msz%3) {
	1005	for (j = 0; j < 16; j++) r[j] ^= t[j];
	1006	poly1305_keyinit(&key, r, 16);
	1007	}
	1008	m[msz++] ^= t[0];
	1009	}
	1010	}
	1011
	1012	if (MEMCMP(t, !=, v[6].buf, 16)) {
	1013	ok = 0;
	1014	fprintf(stderr, "failed...");
	1015	fprintf(stderr, "\n\tinitial k = "); type_hex.dump(&v[0], stderr);
	1016	fprintf(stderr, "\n\tinitial r = "); type_hex.dump(&v[1], stderr);
	1017	fprintf(stderr, "\n\tinitial n = "); type_hex.dump(&v[2], stderr);
	1018	fprintf(stderr, "\n\tinitial m = "); type_hex.dump(&v[3], stderr);
	1019	fprintf(stderr, "\n\tstart iter = %lu", start_iter);
	1020	fprintf(stderr, "\n\tend iter = %lu", end_iter);
	1021	fprintf(stderr, "\n\tfinal k = "); type_hex.dump(&dk, stderr);
	1022	fprintf(stderr, "\n\tfinal r = "); type_hex.dump(&dr, stderr);
	1023	fprintf(stderr, "\n\tfinal n = "); type_hex.dump(&dn, stderr);
	1024	fprintf(stderr, "\n\tfinal m = "); type_hex.dump(&dm, stderr);
	1025	fprintf(stderr, "\n\texpected = "); type_hex.dump(&v[6], stderr);
	1026	fprintf(stderr, "\n\tcalculated = "); type_hex.dump(&dt, stderr);
	1027	fputc('\n', stderr);
	1028	}
	1029
	1030	dstr_destroy(&dk);
	1031	dstr_destroy(&dr);
	1032	dstr_destroy(&dn);
	1033	dstr_destroy(&dt);
	1034	dstr_destroy(&dm);
	1035	return (ok);
	1036	}
	1037
	1038	static const struct test_chunk tests[] = {
	1039	{ "poly1305-hash", vrf_hash,
	1040	{ &type_hex, &type_hex, &type_hex, &type_hex } },
	1041	{ "poly1305-cat", vrf_cat,
	1042	{ &type_hex, &type_hex, &type_hex, &type_hex, &type_hex, &type_hex } },
	1043	{ "poly1305-mct", vrf_mct,
	1044	{ &type_hex, &type_hex, &type_hex, &type_hex,
	1045	&type_ulong, &type_ulong, &type_hex } },
	1046	{ 0, 0, { 0 } }
	1047	};
	1048
	1049	int main(int argc, char *argv[])
	1050	{
	1051	test_run(argc, argv, tests, SRCDIR "/t/poly1305");
	1052	return (0);
	1053	}
	1054
	1055	#endif
	1056
	1057	/----- That's all, folks -------------------------------------------------/