3 * Poly1305 message authentication code
5 * (c) 2017 Straylight/Edgeware
8 /*----- Licensing notice --------------------------------------------------*
10 * This file is part of Catacomb.
12 * Catacomb is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU Library General Public License as
14 * published by the Free Software Foundation; either version 2 of the
15 * License, or (at your option) any later version.
17 * Catacomb is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU Library General Public License for more details.
22 * You should have received a copy of the GNU Library General Public
23 * License along with Catacomb; if not, write to the Free
24 * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
28 /*----- Header files ------------------------------------------------------*/
38 /*----- Global variables --------------------------------------------------*/
40 const octet poly1305_keysz
[] = { KSZ_SET
, 16, 0 };
42 /*----- Low-level implementation for 32/64-bit targets --------------------*/
44 #if !defined(POLY1305_IMPL) && defined(HAVE_UINT64)
45 # define POLY1305_IMPL 26
48 #if POLY1305_IMPL == 26
50 /* Elements x of GF(2^130 - 5) are represented by five integers x_i: x =
51 * SUM_{0<=i<5} x_i 2^{26i}.
53 * Not all elements are represented canonically. We have 0 <= r_i, s_i <
54 * 2^26 by construction. We maintain 0 <= h_i < 2^27. When we read a
55 * message block m, we have 0 <= m_i < 2^26 by construction again. When we
56 * update the hash state, we calculate h' = r (h + m). Addition is done
57 * componentwise; let t = h + m, and we will have 0 <= t_i < 3*2^26.
59 typedef uint32 felt
[5];
60 #define M26 0x03ffffff
63 /* Convert 32-bit words into field-element pieces. */
64 #define P26W0(x) (((x##0) << 0)&0x03ffffff)
65 #define P26W1(x) ((((x##1) << 6)&0x03ffffc0) | (((x##0) >> 26)&0x0000003f))
66 #define P26W2(x) ((((x##2) << 12)&0x03ffffff) | (((x##1) >> 20)&0x00000fff))
67 #define P26W3(x) ((((x##3) << 18)&0x03fc0000) | (((x##2) >> 14)&0x0003ffff))
68 #define P26W4(x) (((x##3) >> 8)&0x00ffffff)
70 /* Propagate carries in parallel. If 0 <= u_i < 2^26 c_i, then we shall have
71 * 0 <= v_0 < 2^26 + 5 c_4, and 0 <= v_i < 2^26 + c_{i-1} for 1 <= i < 5.
73 #define CARRY_REDUCE(v, u) do { \
74 (v##0) = ((u##0)&M26) + 5*((u##4) >> 26); \
75 (v##1) = ((u##1)&M26) + ((u##0) >> 26); \
76 (v##2) = ((u##2)&M26) + ((u##1) >> 26); \
77 (v##3) = ((u##3)&M26) + ((u##2) >> 26); \
78 (v##4) = ((u##4)&M26) + ((u##3) >> 26); \
81 /* General multiplication, used by `concat'. */
82 static void mul(felt z
, const felt x
, const felt y
)
84 /* Initial bounds: we assume x_i, y_i < 2^27. On exit, z_i < 2^27. */
86 uint32 x0
= x
[0], x1
= x
[1], x2
= x
[2], x3
= x
[3], x4
= x
[4];
87 uint32 y0
= y
[0], y1
= y
[1], y2
= y
[2], y3
= y
[3], y4
= y
[4];
88 uint64 u0
, u1
, u2
, u3
, u4
;
89 uint64 v0
, v1
, v2
, v3
, v4
;
90 uint32 z0
, z1
, z2
, z3
, z4
;
92 /* Do the multiplication: u = h x mod 2^130 - 5. We will have u_i <
93 * 2^27 (5 (4 - i) + i + 1) 2^27 = 2^54 (21 - 4 i) = 2^52 (84 - 16 i). In
94 * all cases we have u_i < 84*2^52 < 2^59. Notably, u_4 < 5*2^54 =
97 #define M(x, y) ((uint64)(x)*(y))
98 u0
= M(x0
, y0
) + (M(x1
, y4
) + M(x2
, y3
) + M(x3
, y2
) + M(x4
, y1
))*5;
99 u1
= M(x0
, y1
) + M(x1
, y0
) + (M(x2
, y4
) + M(x3
, y3
) + M(x4
, y2
))*5;
100 u2
= M(x0
, y2
) + M(x1
, y1
) + M(x2
, y0
) + (M(x3
, y4
) + M(x4
, y3
))*5;
101 u3
= M(x0
, y3
) + M(x1
, y2
) + M(x2
, y1
) + M(x3
, y0
) + (M(x4
, y4
))*5;
102 u4
= M(x0
, y4
) + M(x1
, y3
) + M(x2
, y2
) + M(x3
, y1
) + M(x4
, y0
);
105 /* Now we must reduce the coefficients. We do this in an approximate
106 * manner which avoids long data-dependency chains, but requires two
109 * The reduced carry down from u_4 to u_0 in the first pass will be c_0 <
110 * 100*2^26; the remaining c_i are smaller: c_i < 2^26 (84 - 16 i). This
111 * leaves 0 <= v_i < 101*2^26. The carries in the second pass are bounded
114 CARRY_REDUCE(v
, u
); CARRY_REDUCE(z
, v
);
115 z
[0] = z0
; z
[1] = z1
; z
[2] = z2
; z
[3] = z3
; z
[4] = z4
;
118 /* General squaring, used by `concat'. */
119 static void sqr(felt z
, const felt x
)
121 /* Initial bounds: we assume x_i < 2^27. On exit, z_i < 2^27. */
123 uint32 x0
= x
[0], x1
= x
[1], x2
= x
[2], x3
= x
[3], x4
= x
[4];
124 uint64 u0
, u1
, u2
, u3
, u4
;
125 uint64 v0
, v1
, v2
, v3
, v4
;
126 uint32 z0
, z1
, z2
, z3
, z4
;
128 /* Do the squaring. See `mul' for bounds. */
129 #define M(x, y) ((uint64)(x)*(y))
130 u0
= M(x0
, x0
) + 10*(M(x1
, x4
) + M(x2
, x3
));
131 u1
= 2* M(x0
, x1
) + 5*(M(x3
, x3
) + 2*M(x2
, x4
));
132 u2
= M(x1
, x1
) + 2* M(x0
, x2
) + 10* M(x3
, x4
);
133 u3
= 2*(M(x0
, x3
) + M(x1
, x2
)) + 5* M(x4
, x4
);
134 u4
= M(x2
, x2
) + 2*(M(x0
, x4
) + M(x1
, x3
));
137 /* Now we must reduce the coefficients. See `mul' for bounds. */
138 CARRY_REDUCE(v
, u
); CARRY_REDUCE(z
, v
);
139 z
[0] = z0
; z
[1] = z1
; z
[2] = z2
; z
[3] = z3
; z
[4] = z4
;
142 /* Multiplication by r, using precomputation. */
143 static void mul_r(const poly1305_ctx
*ctx
, felt z
, const felt x
)
145 /* Initial bounds: by construction, r_i < 2^26. We assume x_i < 3*2^26.
146 * On exit, z_i < 2^27.
150 r0
= ctx
->k
.u
.p26
.r0
,
151 r1
= ctx
->k
.u
.p26
.r1
, rr1
= ctx
->k
.u
.p26
.rr1
,
152 r2
= ctx
->k
.u
.p26
.r2
, rr2
= ctx
->k
.u
.p26
.rr2
,
153 r3
= ctx
->k
.u
.p26
.r3
, rr3
= ctx
->k
.u
.p26
.rr3
,
154 r4
= ctx
->k
.u
.p26
.r4
, rr4
= ctx
->k
.u
.p26
.rr4
;
155 uint32 x0
= x
[0], x1
= x
[1], x2
= x
[2], x3
= x
[3], x4
= x
[4];
156 uint64 u0
, u1
, u2
, u3
, u4
;
157 uint64 v0
, v1
, v2
, v3
, v4
;
158 uint32 z0
, z1
, z2
, z3
, z4
;
160 /* Do the multiplication: u = h x mod 2^130 - 5. We will have u_i <
161 * 2^26 (5 (4 - i) + i + 1) 3*2^26 = 2^52 (63 - 12 i). In all cases
162 * we have u_i < 63*2^52 < 2^58. Notably, u_4 < 15*2^52.
164 #define M(x, y) ((uint64)(x)*(y))
165 u0
= M(x0
, r0
) + M(x1
, rr4
) + M(x2
, rr3
) + M(x3
, rr2
) + M(x4
, rr1
);
166 u1
= M(x0
, r1
) + M(x1
, r0
) + M(x2
, rr4
) + M(x3
, rr3
) + M(x4
, rr2
);
167 u2
= M(x0
, r2
) + M(x1
, r1
) + M(x2
, r0
) + M(x3
, rr4
) + M(x4
, rr3
);
168 u3
= M(x0
, r3
) + M(x1
, r2
) + M(x2
, r1
) + M(x3
, r0
) + M(x4
, rr4
);
169 u4
= M(x0
, r4
) + M(x1
, r3
) + M(x2
, r2
) + M(x3
, r1
) + M(x4
, r0
);
172 /* Now we must reduce the coefficients. We do this in an approximate
173 * manner which avoids long data-dependency chains, but requires two
176 * The reduced carry down from u_4 to u_0 in the first pass will be c_0 <
177 * 75*2^26; the remaining c_i are smaller: c_i < 2^26 (63 - 12 i). This
178 * leaves 0 <= v_i < 76*2^26. The carries in the second pass are bounded
181 CARRY_REDUCE(v
, u
); CARRY_REDUCE(z
, v
);
182 z
[0] = z0
; z
[1] = z1
; z
[2] = z2
; z
[3] = z3
; z
[4] = z4
;
187 /*----- Low-level implementation for 16/32-bit targets --------------------*/
189 #ifndef POLY1305_IMPL
190 # define POLY1305_IMPL 11
193 #if POLY1305_IMPL == 11
195 /* Elements x of GF(2^130 - 5) are represented by 12 integers x_i: x =
196 * SUM_{0<=i<12} x_i 2^P_i, where P_i = SUM_{0<=j<i} w_j, and w_5 = w_11 =
197 * 10, and w_i = 11 for i in { 0, 1, 2, 3, 4, 6, 7, 8, 9, 10 }.
199 * Not all elements are represented canonically. We have 0 <= r_i, s_i <
200 * 2^w_i <= 2^11 by construction. We maintain 0 <= h_i < 2^12. When we read
201 * a message block m, we have 0 <= m_i < 2^w_i by construction again. When
202 * we update the hash state, we calculate h' = r (h + m). Addition is done
203 * componentwise; let t = h + m, and we will have 0 <= t_i < 3*2^11.
205 typedef uint16 felt
[12];
210 /* Load a field element from an octet string. */
211 static void load_p11(felt d
, const octet
*s
)
217 for (i
= j
= n
= 0, a
= 0; j
< 12; j
++) {
218 if (j
== 5 || j
== 11) { w
= 10; m
= M10
; }
219 else { w
= 11; m
= M11
; }
220 while (n
< w
&& i
< 16) { a
|= s
[i
++] << n
; n
+= 8; }
221 d
[j
] = a
&m
; a
>>= w
; n
-= w
;
225 /* Reduce a field-element's pieces to manageable size. */
226 static void carry_reduce(uint32 u
[12])
228 /* Initial bounds: we assume u_i < 636*2^22. On exit, u_i < 2^11. */
233 /* Do sequential carry propagation (16-bit CPUs are less likely to benefit
234 * from instruction-level parallelism). Start at u_9; truncate it to 11
235 * bits, and add the carry onto u_10. Truncate u10 to 11 bits, and add the
236 * carry onto u_11. Truncate u_11 to 10 bits, and add five times the carry
237 * onto u_0. And so on.
239 * The carry is larger than the pieces we're leaving behind. Let c_i be
240 * the high portion of u_i, to be carried onto u_{i+1}. I claim that c_i <
241 * 2557*2^10. Then the carry /into/ any u_i is at most 12785*2^10 < 2^24
242 * (allowing for the reduction as we carry from u_11 to u_0), and u_i after
243 * carry is bounded above by 636*2^22 + 12785*2^10 < 2557*2^20. Hence, the
244 * carry out is at most 2557*2^10, as claimed.
246 * Once we reach u_9 for the second time, we start with u_9 < 2^11. The
247 * carry into u_9 is at most 2557*2^10 < 1279*2^11 as calculated above; so
248 * the carry out into u_10 is at most 1280. Since u_10 < 2^11 prior to
249 * this carry in, we now have u_10 < 2^11 + 1280 < 2^12; so the carry out
250 * into u_11 is at most 1. The final reduction therefore only needs a
251 * conditional subtraction.
253 { c
= u
[9] >> 11; u
[9] &= M11
; }
254 { u
[10] += c
; c
= u
[10] >> 11; u
[10] &= M11
; }
255 { u
[11] += c
; c
= u
[11] >> 10; u
[11] &= M10
; }
256 { u
[0] += 5*c
; c
= u
[0] >> 11; u
[0] &= M11
; }
257 for (i
= 1; i
< 5; i
++) { u
[i
] += c
; c
= u
[i
] >> 11; u
[i
] &= M11
; }
258 { u
[5] += c
; c
= u
[5] >> 10; u
[5] &= M10
; }
259 for (i
= 6; i
< 11; i
++) { u
[i
] += c
; c
= u
[i
] >> 11; u
[i
] &= M11
; }
263 /* General multiplication. */
264 static void mul(felt z
, const felt x
, const felt y
)
266 /* Initial bounds: we assume x_i < 3*2^11, and y_i < 2^12. On exit,
273 /* Do the main multiplication. After this, we shall have
275 * { 2^22 (636 - 184 i) for 0 <= i < 6
277 * { 2^22 (732 - 60 i) for 6 <= i < 12
279 * In particular, u_0 < 636*2^22 < 2^32, and u_11 < 72*2^22.
281 * The irregularly positioned pieces are annoying. Because we fold the
282 * reduction into the multiplication, it's also important to see where the
283 * reduced products fit. Finally, products don't align with the piece
284 * boundaries, and sometimes need to be doubled. The following table
285 * tracks all of this.
287 * piece width offset second
301 * The next table tracks exactly which products end up being multiplied by
302 * which constants and accumulated into which destination pieces.
304 * u_k = t_i r_j + 2 t_i r_j + 5 t_i r_j + 10 t_i r_j
305 * 0 0/0 -- 6/6 1-5/11-7 7-11/5-1
306 * 1 0-1/1-0 -- 6-7/7-6 2-5/11-8 8-11/5-2
307 * 2 0-2/2-0 -- 6-8/8-6 3-5/11-9 9-11/5-3
308 * 3 0-3/3-0 -- 6-9/9-6 4-5/11-10 10-11/5-4
309 * 4 0-4/4-0 -- 6-10/10-6 5/11 11/5
310 * 5 0-5/5-0 -- 6-11/11-6 --
311 * 6 0/6 6/0 1-5/5-1 -- 7-11/11-7
312 * 7 0-1/7-6 6-7/1-0 2-5/5-2 -- 8-11/11-8
313 * 8 0-2/8-6 6-8/2-0 3-5/5-3 -- 9-11/11-9
314 * 9 0-3/9-6 6-9/3-0 4-5/5-4 -- 10-11/11-10
315 * 10 0-4/10-6 6-10/4-0 5/5 -- 11/11
316 * 11 0-11/11-0 -- -- --
318 * And, finally, trying to bound the multiple of 6*2^22 in each destination
319 * piece is fiddly, so here's a tableau showing the calculation.
321 * k 1* + 2* + 5* +10* = 1* + 5* =
322 * 0 1 -- 1 10 1 21 106
328 * 6 2 5 -- 5 12 10 62
332 * 10 10 1 -- 1 12 2 22
333 * 11 12 -- -- -- 12 0 12
336 for (i
= 0; i
< 12; i
++) u
[i
] = 0;
338 #define M(i, j) ((uint32)x[i]*y[j])
340 /* Product terms we must multiply by 10. */
341 for (k
= 0; k
< 5; k
++) {
342 for (i
= k
+ 1; i
< 6; i
++) {
344 u
[k
] += M(i
, j
) + M(j
, i
);
345 u
[k
+ 6] += M(i
+ 6, j
);
348 for (k
= 0; k
< 5; k
++) u
[k
] *= 2;
349 for (k
= 6; k
< 11; k
++) u
[k
] *= 5;
351 /* Product terms we must multiply by 5. */
352 for (k
= 0; k
< 6; k
++) {
353 for (i
= k
+ 6; i
>= 6; i
--) {
358 for (k
= 0; k
< 6; k
++) u
[k
] *= 5;
360 /* Product terms we must multiply by 2. */
361 for (k
= 6; k
< 11; k
++) {
362 for (i
= k
- 5; i
< 6; i
++) {
367 for (k
= 6; k
< 11; k
++) u
[k
] *= 2;
369 /* Remaining product terms. */
370 for (k
= 0; k
< 6; k
++) {
371 for (i
= k
; i
< 6; i
--) {
374 u
[k
+ 6] += M(i
+ 6, j
) + M(i
, j
+ 6);
380 /* Do the reduction. Currently, `carry_reduce' does more than we need, but
385 /* Done. Write out the answer. */
386 for (i
= 0; i
< 12; i
++) z
[i
] = u
[i
];
389 /* General squaring, used by `concat'. */
390 static void sqr(felt z
, const felt x
)
393 /* Multiplication by r. */
394 static void mul_r(const poly1305_ctx
*ctx
, felt z
, const felt x
)
395 { mul(z
, x
, ctx
->k
.u
.p11
.r
); }
399 /*----- Interface functions -----------------------------------------------*/
401 /* --- @poly1305_keyinit@ --- *
403 * Arguments: @poly1305_key *key@ = key structure to fill in
404 * @const void *k@ = pointer to key material
405 * @size_t ksz@ = length of key (must be @POLY1305_KEYSZ == 16@)
409 * Use: Records a Poly1305 key and performs (minimal)
413 void poly1305_keyinit(poly1305_key
*key
, const void *k
, size_t ksz
)
416 #if POLY1305_IMPL == 11
420 KSZ_ASSERT(poly1305
, ksz
);
422 #if POLY1305_IMPL == 26
423 uint32 r0
= LOAD32_L(r
+ 0), r1
= LOAD32_L(r
+ 4),
424 r2
= LOAD32_L(r
+ 8), r3
= LOAD32_L(r
+ 12);
426 r0
&= 0x0fffffff; r1
&= 0x0ffffffc; r2
&= 0x0ffffffc; r3
&= 0x0ffffffc;
427 key
->u
.p26
.r0
= P26W0(r
); key
->u
.p26
.r1
= P26W1(r
);
428 key
->u
.p26
.r2
= P26W2(r
); key
->u
.p26
.r3
= P26W3(r
);
429 key
->u
.p26
.r4
= P26W4(r
);
431 key
->u
.p26
.rr1
= 5*key
->u
.p26
.r1
; key
->u
.p26
.rr2
= 5*key
->u
.p26
.r2
;
432 key
->u
.p26
.rr3
= 5*key
->u
.p26
.r3
; key
->u
.p26
.rr4
= 5*key
->u
.p26
.r4
;
436 rr
[ 4] &= 0xfc; rr
[ 7] &= 0x0f;
437 rr
[ 8] &= 0xfc; rr
[11] &= 0x0f;
438 rr
[12] &= 0xfc; rr
[15] &= 0x0f;
439 load_p11(key
->u
.p11
.r
, rr
);
443 /* --- @poly1305_macinit@ --- *
445 * Arguments: @poly1305_ctx *ctx@ = MAC context to fill in
446 * @const poly1305_key *key@ = pointer to key structure to use
447 * @const void *iv@ = pointer to mask string
451 * Use: Initializes a MAC context for use. The key can be discarded
454 * It is permitted for @iv@ to be null, though it is not then
455 * possible to complete the MAC computation on @ctx@. The
456 * resulting context may still be useful, e.g., as an operand to
460 void poly1305_macinit(poly1305_ctx
*ctx
,
461 const poly1305_key
*key
, const void *iv
)
464 #if POLY1305_IMPL == 26
465 uint32 s0
, s1
, s2
, s3
;
470 #if POLY1305_IMPL == 26
472 s0
= LOAD32_L(s
+ 0); s1
= LOAD32_L(s
+ 4);
473 s2
= LOAD32_L(s
+ 8); s3
= LOAD32_L(s
+ 12);
474 ctx
->u
.p26
.s0
= P26W0(s
); ctx
->u
.p26
.s1
= P26W1(s
);
475 ctx
->u
.p26
.s2
= P26W2(s
); ctx
->u
.p26
.s3
= P26W3(s
);
476 ctx
->u
.p26
.s4
= P26W4(s
);
478 ctx
->u
.p26
.h
[0] = ctx
->u
.p26
.h
[1] = ctx
->u
.p26
.h
[2] =
479 ctx
->u
.p26
.h
[3] = ctx
->u
.p26
.h
[4] = 0;
481 if (s
) load_p11(ctx
->u
.p11
.s
, s
);
482 for (i
= 0; i
< 12; i
++) ctx
->u
.p11
.h
[i
] = 0;
489 /* --- @poly1305_copy@ --- *
491 * Arguments: @poly1305_ctx *to@ = destination context
492 * @const poly1305_ctx *from@ = source context
496 * Use: Duplicates a Poly1305 MAC context. The destination need not
497 * have been initialized. Both contexts can be used
498 * independently afterwards.
501 void poly1305_copy(poly1305_ctx
*ctx
, const poly1305_ctx
*from
)
504 /* --- @poly1305_hash@ --- *
506 * Arguments: @poly1305_ctx *ctx@ = MAC context to update
507 * @const void *p@ = pointer to message data
508 * @size_t sz@ = length of message data
512 * Use: Processes a chunk of message. The message pieces may have
513 * arbitrary lengths, and may be empty.
516 static void update_full(poly1305_ctx
*ctx
, const octet
*p
)
519 #if POLY1305_IMPL == 26
521 m0
= LOAD32_L(p
+ 0), m1
= LOAD32_L(p
+ 4),
522 m2
= LOAD32_L(p
+ 8), m3
= LOAD32_L(p
+ 12);
524 t
[0] = ctx
->u
.p26
.h
[0] + P26W0(m
);
525 t
[1] = ctx
->u
.p26
.h
[1] + P26W1(m
);
526 t
[2] = ctx
->u
.p26
.h
[2] + P26W2(m
);
527 t
[3] = ctx
->u
.p26
.h
[3] + P26W3(m
);
528 t
[4] = ctx
->u
.p26
.h
[4] + P26W4(m
) + 0x01000000;
532 load_p11(t
, p
); t
[11] += 0x100;
533 for (i
= 0; i
< 12; i
++) t
[i
] += ctx
->u
.p11
.h
[i
];
536 mul_r(ctx
, ctx
->u
.P
.h
, t
);
540 static const rsvr_policy pol
= { 0, 16, 16 };
542 void poly1305_hash(poly1305_ctx
*ctx
, const void *p
, size_t sz
)
547 rsvr_setup(&st
, &pol
, &ctx
->buf
, &ctx
->nbuf
, p
, sz
);
548 RSVR_DO(&st
) while ((q
= RSVR_NEXT(&st
, 16)) != 0) update_full(ctx
, q
);
551 /* --- @poly1305_flush@ --- *
553 * Arguments: @poly1305_ctx *ctx@ = MAC context to flush
557 * Use: Forces any buffered message data in the context to be
558 * processed. This has no effect if the message processed so
559 * far is a whole number of blocks. Flushing is performed
560 * automatically by @poly1305_done@, but it may be necessary to
561 * force it by hand when using @poly1305_concat@.
562 * (Alternatively, you might use @poly1305_flushzero@ instead.)
564 * Flushing a partial block has an observable effect on the
565 * computation: the resulting state is (with high probability)
566 * dissimilar to any state reachable with a message which is a
567 * whole number of blocks long.
570 void poly1305_flush(poly1305_ctx
*ctx
)
573 #if POLY1305_IMPL == 26
574 uint32 m0
, m1
, m2
, m3
;
579 if (!ctx
->nbuf
) return;
580 ctx
->buf
[ctx
->nbuf
++] = 1; memset(ctx
->buf
+ ctx
->nbuf
, 0, 16 - ctx
->nbuf
);
581 #if POLY1305_IMPL == 26
582 m0
= LOAD32_L(ctx
->buf
+ 0); m1
= LOAD32_L(ctx
->buf
+ 4);
583 m2
= LOAD32_L(ctx
->buf
+ 8); m3
= LOAD32_L(ctx
->buf
+ 12);
585 t
[0] = ctx
->u
.p26
.h
[0] + P26W0(m
);
586 t
[1] = ctx
->u
.p26
.h
[1] + P26W1(m
);
587 t
[2] = ctx
->u
.p26
.h
[2] + P26W2(m
);
588 t
[3] = ctx
->u
.p26
.h
[3] + P26W3(m
);
589 t
[4] = ctx
->u
.p26
.h
[4] + P26W4(m
);
591 load_p11(t
, ctx
->buf
);
592 for (i
= 0; i
< 12; i
++) t
[i
] += ctx
->u
.p11
.h
[i
];
595 mul_r(ctx
, ctx
->u
.P
.h
, t
);
596 ctx
->nbuf
= 0; ctx
->count
++;
599 /* --- @poly1305_flushzero@ --- *
601 * Arguments: @poly1305_ctx *ctx@ = MAC context to flush
605 * Use: Forces any buffered message data in the context to be
606 * processed, by hashing between zero and fifteen additional
607 * zero bytes. Like @poly1305_flush@, this has no effect if the
608 * the message processed so far is a whole number of blocks.
609 * Unlike @poly1305_flush@, the behaviour if the message is not
610 * a whole number of blocks is equivalent to actually hashing
614 void poly1305_flushzero(poly1305_ctx
*ctx
)
616 if (!ctx
->nbuf
) return;
617 memset(ctx
->buf
+ ctx
->nbuf
, 0, 16 - ctx
->nbuf
);
618 update_full(ctx
, ctx
->buf
);
622 /* --- @poly1305_concat@ --- *
624 * Arguments: @poly1305_ctx *ctx@ = destination context
625 * @const poly1305_ctx *prefix, *suffix@ = two operand contexts
629 * Use: The two operand contexts @prefix@ and @suffix@ represent
630 * processing of two messages %$m$% and %$m'$%; the effect is to
631 * set @ctx@ to the state corresponding to their concatenation
634 * All three contexts must have been initialized using the same
635 * key value (though not necessarily from the same key
636 * structure). The mask values associated with the input
637 * contexts are irrelevant. The @prefix@ message %$m$% must be
638 * a whole number of blocks long: this can be arranged by
639 * flushing the context. The @suffix@ message need not be a
640 * whole number of blocks long. All of the contexts remain
641 * operational and can be used independently afterwards.
644 void poly1305_concat(poly1305_ctx
*ctx
,
645 const poly1305_ctx
*prefix
, const poly1305_ctx
*suffix
)
647 /* Assume that lengths are public, so it's safe to behave conditionally on
648 * the bits of ctx->count.
653 #if POLY1305_IMPL == 26
654 uint32 x0
, x1
, x2
, x3
, x4
, y0
, y1
, y2
, y3
, y4
;
659 /* We can only concatenate if the prefix is block-aligned. */
660 assert(!prefix
->nbuf
);
662 /* The hash for a message m = m_{k-1} m_{k-2} ... m_1 m_0 is h_r(m) =
663 * SUM_{0<=i<k} m_i r^{i+1}. If we have two messages, m, m', of lengths k
664 * and k' blocks respectively, then
666 * h_r(m || m') = SUM_{0<=i<k} m_i r^{k'+i+1} +
667 * SUM_{0<=i<k'} m'_i r^{i+1}
668 * = r^{k'} h_r(m) + h_r(m')
670 * This is simple left-to-right square-and-multiply exponentiation.
674 #if POLY1305_IMPL == 26
675 x
[1] = x
[2] = x
[3] = x
[4] = 0;
677 for (i
= 1; i
< 12; i
++) x
[i
] = 0;
679 #define BIT (1ul << (ULONG_BITS - 1))
682 while (!(n
& BIT
)) { n
<<= 1; i
--; }
683 mul_r(prefix
, x
, x
); n
<<= 1; i
--;
684 while (i
--) { sqr(x
, x
); if (n
& BIT
) mul_r(prefix
, x
, x
); n
<<= 1; }
687 mul(x
, prefix
->u
.P
.h
, x
);
689 /* Add on the suffix hash. */
690 #if POLY1305_IMPL == 26
691 /* We're going to add the two hashes elementwise. Both h' = h_r(m') and
692 * x = r^{k'} h_r(m) are bounded above by 2^27, so the sum will be bounded
693 * by 2^28; but this is too large to leave in the accumulator. (Strictly,
694 * we could get away with it, but the caller can in theory chain an
695 * arbitrary number of concatenations and expect us to cope, and we'd
696 * definitely overflow eventually.) So we reduce. Since the excess is so
697 * small, a single round of `CARRY_REDUCE' is enough.
699 x0
= x
[0] + suffix
->u
.p26
.h
[0]; x1
= x
[1] + suffix
->u
.p26
.h
[1];
700 x2
= x
[2] + suffix
->u
.p26
.h
[2]; x3
= x
[3] + suffix
->u
.p26
.h
[3];
701 x4
= x
[4] + suffix
->u
.p26
.h
[4];
703 ctx
->u
.p26
.h
[0] = y0
; ctx
->u
.p26
.h
[1] = y1
; ctx
->u
.p26
.h
[2] = y2
;
704 ctx
->u
.p26
.h
[3] = y3
; ctx
->u
.p26
.h
[4] = y4
;
706 /* We'll add the two hashes elementwise and have to reduce again. The
707 * numbers are different, but the reasoning is basically the same.
709 for (i
= 0; i
< 12; i
++) y
[i
] = x
[i
] + suffix
->u
.p11
.h
[i
];
711 for (i
= 0; i
< 12; i
++) ctx
->u
.p11
.h
[i
] = y
[i
];
714 /* Copy the remaining pieces of the context to set up the result. */
716 memcpy(ctx
->buf
, suffix
->buf
, suffix
->nbuf
);
717 ctx
->nbuf
= suffix
->nbuf
;
719 ctx
->count
= prefix
->count
+ suffix
->count
;
722 /* --- @poly1305_done@ --- *
724 * Arguments: @poly1305_ctx *ctx@ = MAC context to finish
725 * @void *h@ = buffer to write the tag to
729 * Use: Completes a Poly1305 MAC tag computation.
732 void poly1305_done(poly1305_ctx
*ctx
, void *h
)
736 #if POLY1305_IMPL == 26
738 uint32 h0
, h1
, h2
, h3
, h4
, hh0
, hh1
, hh2
, hh3
, hh4
;
740 /* If there's anything left over in the buffer, pad it to form a final
741 * coefficient and update the evaluation one last time.
745 /* Collect the final hash state. */
746 h0
= ctx
->u
.p26
.h
[0];
747 h1
= ctx
->u
.p26
.h
[1];
748 h2
= ctx
->u
.p26
.h
[2];
749 h3
= ctx
->u
.p26
.h
[3];
750 h4
= ctx
->u
.p26
.h
[4];
752 /* Reduce the final value mod 2^130 - 5. First pass: set h <- h +
753 * 5 floor(h/2^130). After this, the low pieces of h will be normalized:
754 * 0 <= h_i < 2^26 for 0 <= i < 4; and 0 <= h_4 < 2^26 + 1. In the
755 * (highly unlikely) event that h_4 >= 2^26, set c and truncate to 130
758 c
= h4
>> 26; h4
&= M26
;
759 h0
+= 5*c
; c
= h0
>> 26; h0
&= M26
;
760 h1
+= c
; c
= h1
>> 26; h1
&= M26
;
761 h2
+= c
; c
= h2
>> 26; h2
&= M26
;
762 h3
+= c
; c
= h3
>> 26; h3
&= M26
;
763 h4
+= c
; c
= h4
>> 26; h4
&= M26
;
765 /* Calculate h' = h - (2^130 - 5). If h' >= 0 then t ends up 1; otherwise
768 t
= h0
+ 5; hh0
= t
&M26
; t
>>= 26;
769 t
+= h1
; hh1
= t
&M26
; t
>>= 26;
770 t
+= h2
; hh2
= t
&M26
; t
>>= 26;
771 t
+= h3
; hh3
= t
&M26
; t
>>= 26;
772 t
+= h4
; hh4
= t
&M26
; t
>>= 26;
774 /* Keep the subtraction result above if t or c is set. */
776 h0
= (hh0
&m_sub
) | (h0
&~m_sub
);
777 h1
= (hh1
&m_sub
) | (h1
&~m_sub
);
778 h2
= (hh2
&m_sub
) | (h2
&~m_sub
);
779 h3
= (hh3
&m_sub
) | (h3
&~m_sub
);
780 h4
= (hh4
&m_sub
) | (h4
&~m_sub
);
782 /* Add the mask onto the hash result. */
783 t
= h0
+ ctx
->u
.p26
.s0
; h0
= t
&M26
; t
>>= 26;
784 t
+= h1
+ ctx
->u
.p26
.s1
; h1
= t
&M26
; t
>>= 26;
785 t
+= h2
+ ctx
->u
.p26
.s2
; h2
= t
&M26
; t
>>= 26;
786 t
+= h3
+ ctx
->u
.p26
.s3
; h3
= t
&M26
; t
>>= 26;
787 t
+= h4
+ ctx
->u
.p26
.s4
; h4
= t
&M26
; t
>>= 26;
789 /* Convert this mess back into 32-bit words. We lose the top two bits,
792 h0
= (h0
>> 0) | ((h1
& 0x0000003f) << 26);
793 h1
= (h1
>> 6) | ((h2
& 0x00000fff) << 20);
794 h2
= (h2
>> 12) | ((h3
& 0x0003ffff) << 14);
795 h3
= (h3
>> 18) | ((h4
& 0x00ffffff) << 8);
798 STORE32_L(p
+ 0, h0
); STORE32_L(p
+ 4, h1
);
799 STORE32_L(p
+ 8, h2
); STORE32_L(p
+ 12, h3
);
801 uint16 hh
[12], hi
[12], c
, t
, m_sub
;
805 /* If there's anything left over in the buffer, pad it to form a final
806 * coefficient and update the evaluation one last time.
810 /* Collect the final hash state. */
811 for (i
= 0; i
< 12; i
++) hh
[i
] = ctx
->u
.p11
.h
[i
];
813 /* Reduce the final value mod 2^130 - 5. First pass: set h <- h +
814 * 5 floor(h/2^130). After this, the low pieces of h will be normalized:
815 * 0 <= h_i < 2^{w_i} for 0 <= i < 11; and 0 <= h_{11} < 2^10 + 1. In the
816 * (highly unlikely) event that h_{11} >= 2^10, set c and truncate to 130
819 c
= 5*(hh
[11] >> 10); hh
[11] &= M10
;
820 for (i
= 0; i
< 12; i
++) {
821 if (i
== 5 || i
== 11) { c
+= hh
[i
]; hh
[i
] = c
&M10
; c
>>= 10; }
822 else { c
+= hh
[i
]; hh
[i
] = c
&M11
; c
>>= 11; }
825 /* Calculate h' = h - (2^130 - 5). If h' >= 0 then t ends up 1; otherwise
828 for (i
= 0, t
= 5; i
< 12; i
++) {
830 if (i
== 5 || i
== 11) { hi
[i
] = t
&M10
; t
>>= 10; }
831 else { hi
[i
] = t
&M11
; t
>>= 11; }
834 /* Keep the subtraction result above if t or c is set. */
836 for (i
= 0; i
< 12; i
++) hh
[i
] = (hi
[i
]&m_sub
) | (hh
[i
]&~m_sub
);
838 /* Add the mask onto the hash result. */
839 for (i
= 0, t
= 0; i
< 12; i
++) {
840 t
+= hh
[i
] + ctx
->u
.p11
.s
[i
];
841 if (i
== 5 || i
== 11) { hh
[i
] = t
&M10
; t
>>= 10; }
842 else { hh
[i
] = t
&M11
; t
>>= 11; }
845 /* Convert this mess back into bytes. We lose the top two bits, but that's
848 for (i
= j
= n
= 0, a
= 0; i
< 16; i
++) {
851 n
+= (j
== 5 || j
== 11) ?
10 : 11;
854 p
[i
] = a
&0xff; a
>>= 8; n
-= 8;
860 /*----- Test rig ----------------------------------------------------------*/
864 #include <mLib/testrig.h>
867 #include "rijndael-ecb.h"
869 static int vrf_hash(dstr v
[])
876 if (v
[0].len
!= 16) { fprintf(stderr
, "bad key length\n"); exit(2); }
877 if (v
[1].len
!= 16) { fprintf(stderr
, "bad mask length\n"); exit(2); }
878 if (v
[3].len
!= 16) { fprintf(stderr
, "bad tag length\n"); exit(2); }
879 dstr_ensure(&t
, 16); t
.len
= 16;
881 ct_poison(v
[0].buf
, v
[0].len
);
882 poly1305_keyinit(&k
, v
[0].buf
, v
[0].len
);
883 for (i
= 0; i
< v
[2].len
; i
++) {
884 for (j
= i
; j
< v
[2].len
; j
++) {
885 poly1305_macinit(&ctx
, &k
, v
[1].buf
);
886 poly1305_hash(&ctx
, v
[2].buf
, i
);
887 poly1305_hash(&ctx
, v
[2].buf
+ i
, j
- i
);
888 poly1305_hash(&ctx
, v
[2].buf
+ j
, v
[2].len
- j
);
889 poly1305_done(&ctx
, t
.buf
);
890 ct_remedy(t
.buf
, t
.len
);
891 if (memcmp(t
.buf
, v
[3].buf
, 16) != 0) {
892 fprintf(stderr
, "failed...");
893 fprintf(stderr
, "\n\tkey = "); type_hex
.dump(&v
[0], stderr
);
894 fprintf(stderr
, "\n\tmask = "); type_hex
.dump(&v
[1], stderr
);
895 fprintf(stderr
, "\n\tmsg = "); type_hex
.dump(&v
[2], stderr
);
896 fprintf(stderr
, "\n\texp = "); type_hex
.dump(&v
[3], stderr
);
897 fprintf(stderr
, "\n\tcalc = "); type_hex
.dump(&t
, stderr
);
898 fprintf(stderr
, "\n\tsplits = 0 .. %u .. %u .. %lu\n",
899 i
, j
, (unsigned long)v
[1].len
);
907 static int vrf_cat(dstr v
[])
910 poly1305_ctx ctx
, cc
[3];
915 if (v
[0].len
!= 16) { fprintf(stderr
, "bad key length\n"); exit(2); }
916 if (v
[1].len
!= 16) { fprintf(stderr
, "bad mask length\n"); exit(2); }
917 if (v
[5].len
!= 16) { fprintf(stderr
, "bad tag length\n"); exit(2); }
918 dstr_ensure(&t
, 16); t
.len
= 16;
920 poly1305_keyinit(&k
, v
[0].buf
, v
[0].len
);
921 poly1305_macinit(&ctx
, &k
, v
[1].buf
);
922 for (i
= 0; i
< 3; i
++) {
923 poly1305_macinit(&cc
[i
], &k
, 0);
924 poly1305_hash(&cc
[i
], v
[i
+ 2].buf
, v
[i
+ 2].len
);
926 for (i
= 0; i
< 2; i
++) {
928 poly1305_concat(&ctx
, &cc
[1], &cc
[2]);
929 poly1305_concat(&ctx
, &cc
[0], &ctx
);
931 poly1305_concat(&ctx
, &cc
[0], &cc
[1]);
932 poly1305_concat(&ctx
, &ctx
, &cc
[2]);
934 poly1305_done(&ctx
, t
.buf
);
935 if (memcmp(t
.buf
, v
[5].buf
, 16) != 0) {
936 fprintf(stderr
, "failed...");
937 fprintf(stderr
, "\n\tkey = "); type_hex
.dump(&v
[0], stderr
);
938 fprintf(stderr
, "\n\tmask = "); type_hex
.dump(&v
[1], stderr
);
939 fprintf(stderr
, "\n\tmsg[0] = "); type_hex
.dump(&v
[2], stderr
);
940 fprintf(stderr
, "\n\tmsg[1] = "); type_hex
.dump(&v
[3], stderr
);
941 fprintf(stderr
, "\n\tmsg[2] = "); type_hex
.dump(&v
[4], stderr
);
942 fprintf(stderr
, "\n\texp = "); type_hex
.dump(&v
[5], stderr
);
943 fprintf(stderr
, "\n\tcalc = "); type_hex
.dump(&t
, stderr
);
944 fprintf(stderr
, "\n\tassoc = %s\n",
945 !i ?
"msg[0] || (msg[1] || msg[2])" :
946 "(msg[0] || msg[1]) || msg[2]");
955 static int vrf_mct(dstr v
[])
958 unsigned long i
, niter
;
963 octet k
[16], r
[16], n
[16], s
[16], *t
, m
[MSZMAX
] = { 0 };
966 if (v
[0].len
!= sizeof(k
)) { fprintf(stderr
, "AES key len\n"); exit(2); }
967 if (v
[1].len
!= sizeof(r
)) { fprintf(stderr
, "poly key len\n"); exit(2); }
968 if (v
[2].len
!= sizeof(n
)) { fprintf(stderr
, "nonce len\n"); exit(2); }
969 if (v
[4].len
!= sizeof(n
)) { fprintf(stderr
, "result len\n"); exit(2); }
970 memcpy(k
, v
[0].buf
, sizeof(k
));
971 memcpy(r
, v
[1].buf
, sizeof(k
));
972 memcpy(n
, v
[2].buf
, sizeof(k
));
973 niter
= *(unsigned long *)v
[3].buf
;
974 dstr_ensure(&d
, 16); d
.len
= 16; t
= (octet
*)d
.buf
;
976 rijndael_ecbinit(&rij
, k
, sizeof(k
), 0);
977 poly1305_keyinit(&key
, r
, sizeof(r
));
978 for (i
= 0; i
< niter
; i
++) {
981 rijndael_ecbencrypt(&rij
, n
, s
, 16);
982 poly1305_macinit(&mac
, &key
, s
);
983 poly1305_hash(&mac
, m
, msz
);
984 poly1305_done(&mac
, t
);
985 if (msz
>= MSZMAX
) break;
987 for (j
= 0; j
< 16; j
++) n
[j
] ^= t
[j
];
989 for (j
= 0; j
< 16; j
++) k
[j
] ^= t
[j
];
990 rijndael_ecbinit(&rij
, k
, sizeof(k
), 0);
993 for (j
= 0; j
< 16; j
++) r
[j
] ^= t
[j
];
994 poly1305_keyinit(&key
, r
, sizeof(r
));
1000 if (memcmp(t
, v
[4].buf
, 16) != 0) {
1002 fprintf(stderr
, "failed...");
1003 fprintf(stderr
, "\n\tinitial k = "); type_hex
.dump(&v
[0], stderr
);
1004 fprintf(stderr
, "\n\tinitial r = "); type_hex
.dump(&v
[1], stderr
);
1005 fprintf(stderr
, "\n\tinitial n = "); type_hex
.dump(&v
[2], stderr
);
1006 fprintf(stderr
, "\n\titerations = %lu", niter
);
1007 fprintf(stderr
, "\n\texpected = "); type_hex
.dump(&v
[4], stderr
);
1008 fprintf(stderr
, "\n\tcalculated = "); type_hex
.dump(&d
, stderr
);
1009 fputc('\n', stderr
);
1016 static const struct test_chunk tests
[] = {
1017 { "poly1305-hash", vrf_hash
,
1018 { &type_hex
, &type_hex
, &type_hex
, &type_hex
} },
1019 { "poly1305-cat", vrf_cat
,
1020 { &type_hex
, &type_hex
, &type_hex
, &type_hex
, &type_hex
, &type_hex
} },
1021 { "poly1305-mct", vrf_mct
,
1022 { &type_hex
, &type_hex
, &type_hex
, &type_ulong
, &type_hex
} },
1026 int main(int argc
, char *argv
[])
1028 test_run(argc
, argv
, tests
, SRCDIR
"/t/poly1305");
1034 /*----- That's all, folks -------------------------------------------------*/