| 1 | /* -*-c-*- |
| 2 | * |
| 3 | * Poly1305 message authentication code |
| 4 | * |
| 5 | * (c) 2017 Straylight/Edgeware |
| 6 | */ |
| 7 | |
| 8 | /*----- Licensing notice --------------------------------------------------* |
| 9 | * |
| 10 | * This file is part of Catacomb. |
| 11 | * |
| 12 | * Catacomb is free software; you can redistribute it and/or modify |
| 13 | * it under the terms of the GNU Library General Public License as |
| 14 | * published by the Free Software Foundation; either version 2 of the |
| 15 | * License, or (at your option) any later version. |
| 16 | * |
| 17 | * Catacomb is distributed in the hope that it will be useful, |
| 18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 20 | * GNU Library General Public License for more details. |
| 21 | * |
| 22 | * You should have received a copy of the GNU Library General Public |
| 23 | * License along with Catacomb; if not, write to the Free |
| 24 | * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, |
| 25 | * MA 02111-1307, USA. |
| 26 | */ |
| 27 | |
| 28 | /*----- Header files ------------------------------------------------------*/ |
| 29 | |
| 30 | #include "config.h" |
| 31 | |
| 32 | #include <assert.h> |
| 33 | #include <string.h> |
| 34 | |
| 35 | #include "poly1305.h" |
| 36 | #include "rsvr.h" |
| 37 | |
| 38 | /*----- Global variables --------------------------------------------------*/ |
| 39 | |
| 40 | const octet poly1305_keysz[] = { KSZ_SET, 16, 0 }; |
| 41 | |
| 42 | /*----- Low-level implementation for 32/64-bit targets --------------------*/ |
| 43 | |
| 44 | #if !defined(POLY1305_IMPL) && defined(HAVE_UINT64) |
| 45 | # define POLY1305_IMPL 26 |
| 46 | #endif |
| 47 | |
| 48 | #if POLY1305_IMPL == 26 |
| 49 | |
| 50 | /* Elements x of GF(2^130 - 5) are represented by five integers x_i: x = |
| 51 | * SUM_{0<=i<5} x_i 2^{26i}. |
| 52 | * |
| 53 | * Not all elements are represented canonically. We have 0 <= r_i, s_i < |
| 54 | * 2^26 by construction. We maintain 0 <= h_i < 2^27. When we read a |
| 55 | * message block m, we have 0 <= m_i < 2^26 by construction again. When we |
| 56 | * update the hash state, we calculate h' = r (h + m). Addition is done |
| 57 | * componentwise; let t = h + m, and we will have 0 <= t_i < 3*2^26. |
| 58 | */ |
| 59 | typedef uint32 felt[5]; |
| 60 | #define M26 0x03ffffff |
| 61 | #define P p26 |
| 62 | |
| 63 | /* Convert 32-bit words into field-element pieces. */ |
| 64 | #define P26W0(x) (((x##0) << 0)&0x03ffffff) |
| 65 | #define P26W1(x) ((((x##1) << 6)&0x03ffffc0) | (((x##0) >> 26)&0x0000003f)) |
| 66 | #define P26W2(x) ((((x##2) << 12)&0x03ffffff) | (((x##1) >> 20)&0x00000fff)) |
| 67 | #define P26W3(x) ((((x##3) << 18)&0x03fc0000) | (((x##2) >> 14)&0x0003ffff)) |
| 68 | #define P26W4(x) (((x##3) >> 8)&0x00ffffff) |
| 69 | |
| 70 | /* Propagate carries in parallel. If 0 <= u_i < 2^26 c_i, then we shall have |
| 71 | * 0 <= v_0 < 2^26 + 5 c_4, and 0 <= v_i < 2^26 + c_{i-1} for 1 <= i < 5. |
| 72 | */ |
| 73 | #define CARRY_REDUCE(v, u) do { \ |
| 74 | (v##0) = ((u##0)&M26) + 5*((u##4) >> 26); \ |
| 75 | (v##1) = ((u##1)&M26) + ((u##0) >> 26); \ |
| 76 | (v##2) = ((u##2)&M26) + ((u##1) >> 26); \ |
| 77 | (v##3) = ((u##3)&M26) + ((u##2) >> 26); \ |
| 78 | (v##4) = ((u##4)&M26) + ((u##3) >> 26); \ |
| 79 | } while (0) |
| 80 | |
| 81 | /* General multiplication, used by `concat'. */ |
| 82 | static void mul(felt z, const felt x, const felt y) |
| 83 | { |
| 84 | /* Initial bounds: we assume x_i, y_i < 2^27. On exit, z_i < 2^27. */ |
| 85 | |
| 86 | uint32 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4]; |
| 87 | uint32 y0 = y[0], y1 = y[1], y2 = y[2], y3 = y[3], y4 = y[4]; |
| 88 | uint64 u0, u1, u2, u3, u4; |
| 89 | uint64 v0, v1, v2, v3, v4; |
| 90 | uint32 z0, z1, z2, z3, z4; |
| 91 | |
| 92 | /* Do the multiplication: u = h x mod 2^130 - 5. We will have u_i < |
| 93 | * 2^27 (5 (4 - i) + i + 1) 2^27 = 2^54 (21 - 4 i) = 2^52 (84 - 16 i). In |
| 94 | * all cases we have u_i < 84*2^52 < 2^59. Notably, u_4 < 5*2^54 = |
| 95 | * 20*2^52. |
| 96 | */ |
| 97 | #define M(x, y) ((uint64)(x)*(y)) |
| 98 | u0 = M(x0, y0) + (M(x1, y4) + M(x2, y3) + M(x3, y2) + M(x4, y1))*5; |
| 99 | u1 = M(x0, y1) + M(x1, y0) + (M(x2, y4) + M(x3, y3) + M(x4, y2))*5; |
| 100 | u2 = M(x0, y2) + M(x1, y1) + M(x2, y0) + (M(x3, y4) + M(x4, y3))*5; |
| 101 | u3 = M(x0, y3) + M(x1, y2) + M(x2, y1) + M(x3, y0) + (M(x4, y4))*5; |
| 102 | u4 = M(x0, y4) + M(x1, y3) + M(x2, y2) + M(x3, y1) + M(x4, y0); |
| 103 | #undef M |
| 104 | |
| 105 | /* Now we must reduce the coefficients. We do this in an approximate |
| 106 | * manner which avoids long data-dependency chains, but requires two |
| 107 | * passes. |
| 108 | * |
| 109 | * The reduced carry down from u_4 to u_0 in the first pass will be c_0 < |
| 110 | * 100*2^26; the remaining c_i are smaller: c_i < 2^26 (84 - 16 i). This |
| 111 | * leaves 0 <= v_i < 101*2^26. The carries in the second pass are bounded |
| 112 | * above by 180. |
| 113 | */ |
| 114 | CARRY_REDUCE(v, u); CARRY_REDUCE(z, v); |
| 115 | z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4; |
| 116 | } |
| 117 | |
| 118 | /* General squaring, used by `concat'. */ |
| 119 | static void sqr(felt z, const felt x) |
| 120 | { |
| 121 | /* Initial bounds: we assume x_i < 2^27. On exit, z_i < 2^27. */ |
| 122 | |
| 123 | uint32 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4]; |
| 124 | uint64 u0, u1, u2, u3, u4; |
| 125 | uint64 v0, v1, v2, v3, v4; |
| 126 | uint32 z0, z1, z2, z3, z4; |
| 127 | |
| 128 | /* Do the squaring. See `mul' for bounds. */ |
| 129 | #define M(x, y) ((uint64)(x)*(y)) |
| 130 | u0 = M(x0, x0) + 10*(M(x1, x4) + M(x2, x3)); |
| 131 | u1 = 2* M(x0, x1) + 5*(M(x3, x3) + 2*M(x2, x4)); |
| 132 | u2 = M(x1, x1) + 2* M(x0, x2) + 10* M(x3, x4); |
| 133 | u3 = 2*(M(x0, x3) + M(x1, x2)) + 5* M(x4, x4); |
| 134 | u4 = M(x2, x2) + 2*(M(x0, x4) + M(x1, x3)); |
| 135 | #undef M |
| 136 | |
| 137 | /* Now we must reduce the coefficients. See `mul' for bounds. */ |
| 138 | CARRY_REDUCE(v, u); CARRY_REDUCE(z, v); |
| 139 | z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4; |
| 140 | } |
| 141 | |
| 142 | /* Multiplication by r, using precomputation. */ |
| 143 | static void mul_r(const poly1305_ctx *ctx, felt z, const felt x) |
| 144 | { |
| 145 | /* Initial bounds: by construction, r_i < 2^26. We assume x_i < 3*2^26. |
| 146 | * On exit, z_i < 2^27. |
| 147 | */ |
| 148 | |
| 149 | uint32 |
| 150 | r0 = ctx->k.u.p26.r0, |
| 151 | r1 = ctx->k.u.p26.r1, rr1 = ctx->k.u.p26.rr1, |
| 152 | r2 = ctx->k.u.p26.r2, rr2 = ctx->k.u.p26.rr2, |
| 153 | r3 = ctx->k.u.p26.r3, rr3 = ctx->k.u.p26.rr3, |
| 154 | r4 = ctx->k.u.p26.r4, rr4 = ctx->k.u.p26.rr4; |
| 155 | uint32 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4]; |
| 156 | uint64 u0, u1, u2, u3, u4; |
| 157 | uint64 v0, v1, v2, v3, v4; |
| 158 | uint32 z0, z1, z2, z3, z4; |
| 159 | |
| 160 | /* Do the multiplication: u = h x mod 2^130 - 5. We will have u_i < |
| 161 | * 2^26 (5 (4 - i) + i + 1) 3*2^26 = 2^52 (63 - 12 i). In all cases |
| 162 | * we have u_i < 63*2^52 < 2^58. Notably, u_4 < 15*2^52. |
| 163 | */ |
| 164 | #define M(x, y) ((uint64)(x)*(y)) |
| 165 | u0 = M(x0, r0) + M(x1, rr4) + M(x2, rr3) + M(x3, rr2) + M(x4, rr1); |
| 166 | u1 = M(x0, r1) + M(x1, r0) + M(x2, rr4) + M(x3, rr3) + M(x4, rr2); |
| 167 | u2 = M(x0, r2) + M(x1, r1) + M(x2, r0) + M(x3, rr4) + M(x4, rr3); |
| 168 | u3 = M(x0, r3) + M(x1, r2) + M(x2, r1) + M(x3, r0) + M(x4, rr4); |
| 169 | u4 = M(x0, r4) + M(x1, r3) + M(x2, r2) + M(x3, r1) + M(x4, r0); |
| 170 | #undef M |
| 171 | |
| 172 | /* Now we must reduce the coefficients. We do this in an approximate |
| 173 | * manner which avoids long data-dependency chains, but requires two |
| 174 | * passes. |
| 175 | * |
| 176 | * The reduced carry down from u_4 to u_0 in the first pass will be c_0 < |
| 177 | * 75*2^26; the remaining c_i are smaller: c_i < 2^26 (63 - 12 i). This |
| 178 | * leaves 0 <= v_i < 76*2^26. The carries in the second pass are bounded |
| 179 | * above by 135. |
| 180 | */ |
| 181 | CARRY_REDUCE(v, u); CARRY_REDUCE(z, v); |
| 182 | z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4; |
| 183 | } |
| 184 | |
| 185 | #endif |
| 186 | |
| 187 | /*----- Low-level implementation for 16/32-bit targets --------------------*/ |
| 188 | |
| 189 | #ifndef POLY1305_IMPL |
| 190 | # define POLY1305_IMPL 11 |
| 191 | #endif |
| 192 | |
| 193 | #if POLY1305_IMPL == 11 |
| 194 | |
| 195 | /* Elements x of GF(2^130 - 5) are represented by 12 integers x_i: x = |
| 196 | * SUM_{0<=i<12} x_i 2^P_i, where P_i = SUM_{0<=j<i} w_j, and w_5 = w_11 = |
| 197 | * 10, and w_i = 11 for i in { 0, 1, 2, 3, 4, 6, 7, 8, 9, 10 }. |
| 198 | * |
| 199 | * Not all elements are represented canonically. We have 0 <= r_i, s_i < |
| 200 | * 2^w_i <= 2^11 by construction. We maintain 0 <= h_i < 2^12. When we read |
| 201 | * a message block m, we have 0 <= m_i < 2^w_i by construction again. When |
| 202 | * we update the hash state, we calculate h' = r (h + m). Addition is done |
| 203 | * componentwise; let t = h + m, and we will have 0 <= t_i < 3*2^11. |
| 204 | */ |
| 205 | typedef uint16 felt[12]; |
| 206 | #define M10 0x3ff |
| 207 | #define M11 0x7ff |
| 208 | #define P p11 |
| 209 | |
| 210 | /* Load a field element from an octet string. */ |
| 211 | static void load_p11(felt d, const octet *s) |
| 212 | { |
| 213 | unsigned i, j, n, w; |
| 214 | uint16 m; |
| 215 | uint32 a; |
| 216 | |
| 217 | for (i = j = n = 0, a = 0; j < 12; j++) { |
| 218 | if (j == 5 || j == 11) { w = 10; m = M10; } |
| 219 | else { w = 11; m = M11; } |
| 220 | while (n < w && i < 16) { a |= s[i++] << n; n += 8; } |
| 221 | d[j] = a&m; a >>= w; n -= w; |
| 222 | } |
| 223 | } |
| 224 | |
| 225 | /* Reduce a field-element's pieces to manageable size. */ |
| 226 | static void carry_reduce(uint32 u[12]) |
| 227 | { |
| 228 | /* Initial bounds: we assume u_i < 636*2^22. On exit, u_i < 2^11. */ |
| 229 | |
| 230 | unsigned i; |
| 231 | uint32 c; |
| 232 | |
| 233 | /* Do sequential carry propagation (16-bit CPUs are less likely to benefit |
| 234 | * from instruction-level parallelism). Start at u_9; truncate it to 11 |
| 235 | * bits, and add the carry onto u_10. Truncate u10 to 11 bits, and add the |
| 236 | * carry onto u_11. Truncate u_11 to 10 bits, and add five times the carry |
| 237 | * onto u_0. And so on. |
| 238 | * |
| 239 | * The carry is larger than the pieces we're leaving behind. Let c_i be |
| 240 | * the high portion of u_i, to be carried onto u_{i+1}. I claim that c_i < |
| 241 | * 2557*2^10. Then the carry /into/ any u_i is at most 12785*2^10 < 2^24 |
| 242 | * (allowing for the reduction as we carry from u_11 to u_0), and u_i after |
| 243 | * carry is bounded above by 636*2^22 + 12785*2^10 < 2557*2^20. Hence, the |
| 244 | * carry out is at most 2557*2^10, as claimed. |
| 245 | * |
| 246 | * Once we reach u_9 for the second time, we start with u_9 < 2^11. The |
| 247 | * carry into u_9 is at most 2557*2^10 < 1279*2^11 as calculated above; so |
| 248 | * the carry out into u_10 is at most 1280. Since u_10 < 2^11 prior to |
| 249 | * this carry in, we now have u_10 < 2^11 + 1280 < 2^12; so the carry out |
| 250 | * into u_11 is at most 1. The final reduction therefore only needs a |
| 251 | * conditional subtraction. |
| 252 | */ |
| 253 | { c = u[9] >> 11; u[9] &= M11; } |
| 254 | { u[10] += c; c = u[10] >> 11; u[10] &= M11; } |
| 255 | { u[11] += c; c = u[11] >> 10; u[11] &= M10; } |
| 256 | { u[0] += 5*c; c = u[0] >> 11; u[0] &= M11; } |
| 257 | for (i = 1; i < 5; i++) { u[i] += c; c = u[i] >> 11; u[i] &= M11; } |
| 258 | { u[5] += c; c = u[5] >> 10; u[5] &= M10; } |
| 259 | for (i = 6; i < 11; i++) { u[i] += c; c = u[i] >> 11; u[i] &= M11; } |
| 260 | u[11] += c; |
| 261 | } |
| 262 | |
| 263 | /* General multiplication. */ |
| 264 | static void mul(felt z, const felt x, const felt y) |
| 265 | { |
| 266 | /* Initial bounds: we assume x_i < 3*2^11, and y_i < 2^12. On exit, |
| 267 | * z_i < 2^12. |
| 268 | */ |
| 269 | |
| 270 | uint32 u[12]; |
| 271 | unsigned i, j, k; |
| 272 | |
| 273 | /* Do the main multiplication. After this, we shall have |
| 274 | * |
| 275 | * { 2^22 (636 - 184 i) for 0 <= i < 6 |
| 276 | * u_i < { |
| 277 | * { 2^22 (732 - 60 i) for 6 <= i < 12 |
| 278 | * |
| 279 | * In particular, u_0 < 636*2^22 < 2^32, and u_11 < 72*2^22. |
| 280 | * |
| 281 | * The irregularly positioned pieces are annoying. Because we fold the |
| 282 | * reduction into the multiplication, it's also important to see where the |
| 283 | * reduced products fit. Finally, products don't align with the piece |
| 284 | * boundaries, and sometimes need to be doubled. The following table |
| 285 | * tracks all of this. |
| 286 | * |
| 287 | * piece width offset second |
| 288 | * 0 11 0 130 |
| 289 | * 1 11 11 141 |
| 290 | * 2 11 22 152 |
| 291 | * 3 11 33 163 |
| 292 | * 4 11 44 174 |
| 293 | * 5 10 55 185 |
| 294 | * 6 11 65 195 |
| 295 | * 7 11 76 206 |
| 296 | * 8 11 87 217 |
| 297 | * 9 11 98 228 |
| 298 | * 10 11 109 239 |
| 299 | * 11 10 120 250 |
| 300 | * |
| 301 | * The next table tracks exactly which products end up being multiplied by |
| 302 | * which constants and accumulated into which destination pieces. |
| 303 | * |
| 304 | * u_k = t_i r_j + 2 t_i r_j + 5 t_i r_j + 10 t_i r_j |
| 305 | * 0 0/0 -- 6/6 1-5/11-7 7-11/5-1 |
| 306 | * 1 0-1/1-0 -- 6-7/7-6 2-5/11-8 8-11/5-2 |
| 307 | * 2 0-2/2-0 -- 6-8/8-6 3-5/11-9 9-11/5-3 |
| 308 | * 3 0-3/3-0 -- 6-9/9-6 4-5/11-10 10-11/5-4 |
| 309 | * 4 0-4/4-0 -- 6-10/10-6 5/11 11/5 |
| 310 | * 5 0-5/5-0 -- 6-11/11-6 -- |
| 311 | * 6 0/6 6/0 1-5/5-1 -- 7-11/11-7 |
| 312 | * 7 0-1/7-6 6-7/1-0 2-5/5-2 -- 8-11/11-8 |
| 313 | * 8 0-2/8-6 6-8/2-0 3-5/5-3 -- 9-11/11-9 |
| 314 | * 9 0-3/9-6 6-9/3-0 4-5/5-4 -- 10-11/11-10 |
| 315 | * 10 0-4/10-6 6-10/4-0 5/5 -- 11/11 |
| 316 | * 11 0-11/11-0 -- -- -- |
| 317 | * |
| 318 | * And, finally, trying to bound the multiple of 6*2^22 in each destination |
| 319 | * piece is fiddly, so here's a tableau showing the calculation. |
| 320 | * |
| 321 | * k 1* + 2* + 5* +10* = 1* + 5* = |
| 322 | * 0 1 -- 1 10 1 21 106 |
| 323 | * 1 2 -- 2 8 2 18 92 |
| 324 | * 2 3 -- 3 6 3 15 78 |
| 325 | * 3 4 -- 4 4 4 12 64 |
| 326 | * 4 5 -- 5 2 5 9 50 |
| 327 | * 5 6 -- 6 -- 6 6 36 |
| 328 | * 6 2 5 -- 5 12 10 62 |
| 329 | * 7 4 4 -- 4 12 8 52 |
| 330 | * 8 6 3 -- 3 12 6 42 |
| 331 | * 9 8 2 -- 2 12 4 32 |
| 332 | * 10 10 1 -- 1 12 2 22 |
| 333 | * 11 12 -- -- -- 12 0 12 |
| 334 | */ |
| 335 | |
| 336 | for (i = 0; i < 12; i++) u[i] = 0; |
| 337 | |
| 338 | #define M(i, j) ((uint32)x[i]*y[j]) |
| 339 | |
| 340 | /* Product terms we must multiply by 10. */ |
| 341 | for (k = 0; k < 5; k++) { |
| 342 | for (i = k + 1; i < 6; i++) { |
| 343 | j = 12 + k - i; |
| 344 | u[k] += M(i, j) + M(j, i); |
| 345 | u[k + 6] += M(i + 6, j); |
| 346 | } |
| 347 | } |
| 348 | for (k = 0; k < 5; k++) u[k] *= 2; |
| 349 | for (k = 6; k < 11; k++) u[k] *= 5; |
| 350 | |
| 351 | /* Product terms we must multiply by 5. */ |
| 352 | for (k = 0; k < 6; k++) { |
| 353 | for (i = k + 6; i >= 6; i--) { |
| 354 | j = 12 + k - i; |
| 355 | u[k] += M(i, j); |
| 356 | } |
| 357 | } |
| 358 | for (k = 0; k < 6; k++) u[k] *= 5; |
| 359 | |
| 360 | /* Product terms we must multiply by 2. */ |
| 361 | for (k = 6; k < 11; k++) { |
| 362 | for (i = k - 5; i < 6; i++) { |
| 363 | j = k - i; |
| 364 | u[k] += M(i, j); |
| 365 | } |
| 366 | } |
| 367 | for (k = 6; k < 11; k++) u[k] *= 2; |
| 368 | |
| 369 | /* Remaining product terms. */ |
| 370 | for (k = 0; k < 6; k++) { |
| 371 | for (i = k; i < 6; i--) { |
| 372 | j = k - i; |
| 373 | u[k] += M(i, j); |
| 374 | u[k + 6] += M(i + 6, j) + M(i, j + 6); |
| 375 | } |
| 376 | } |
| 377 | |
| 378 | #undef M |
| 379 | |
| 380 | /* Do the reduction. Currently, `carry_reduce' does more than we need, but |
| 381 | * that's fine. |
| 382 | */ |
| 383 | carry_reduce(u); |
| 384 | |
| 385 | /* Done. Write out the answer. */ |
| 386 | for (i = 0; i < 12; i++) z[i] = u[i]; |
| 387 | } |
| 388 | |
| 389 | /* General squaring, used by `concat'. */ |
| 390 | static void sqr(felt z, const felt x) |
| 391 | { mul(z, x, x); } |
| 392 | |
| 393 | /* Multiplication by r. */ |
| 394 | static void mul_r(const poly1305_ctx *ctx, felt z, const felt x) |
| 395 | { mul(z, x, ctx->k.u.p11.r); } |
| 396 | |
| 397 | #endif |
| 398 | |
| 399 | /*----- Interface functions -----------------------------------------------*/ |
| 400 | |
| 401 | /* --- @poly1305_keyinit@ --- * |
| 402 | * |
| 403 | * Arguments: @poly1305_key *key@ = key structure to fill in |
| 404 | * @const void *k@ = pointer to key material |
| 405 | * @size_t ksz@ = length of key (must be @POLY1305_KEYSZ == 16@) |
| 406 | * |
| 407 | * Returns: --- |
| 408 | * |
| 409 | * Use: Records a Poly1305 key and performs (minimal) |
| 410 | * precomputations. |
| 411 | */ |
| 412 | |
| 413 | void poly1305_keyinit(poly1305_key *key, const void *k, size_t ksz) |
| 414 | { |
| 415 | const octet *r = k; |
| 416 | #if POLY1305_IMPL == 11 |
| 417 | octet rr[16]; |
| 418 | #endif |
| 419 | |
| 420 | KSZ_ASSERT(poly1305, ksz); |
| 421 | |
| 422 | #if POLY1305_IMPL == 26 |
| 423 | uint32 r0 = LOAD32_L(r + 0), r1 = LOAD32_L(r + 4), |
| 424 | r2 = LOAD32_L(r + 8), r3 = LOAD32_L(r + 12); |
| 425 | |
| 426 | r0 &= 0x0fffffff; r1 &= 0x0ffffffc; r2 &= 0x0ffffffc; r3 &= 0x0ffffffc; |
| 427 | key->u.p26.r0 = P26W0(r); key->u.p26.r1 = P26W1(r); |
| 428 | key->u.p26.r2 = P26W2(r); key->u.p26.r3 = P26W3(r); |
| 429 | key->u.p26.r4 = P26W4(r); |
| 430 | |
| 431 | key->u.p26.rr1 = 5*key->u.p26.r1; key->u.p26.rr2 = 5*key->u.p26.r2; |
| 432 | key->u.p26.rr3 = 5*key->u.p26.r3; key->u.p26.rr4 = 5*key->u.p26.r4; |
| 433 | #else |
| 434 | memcpy(rr, r, 16); |
| 435 | rr[ 3] &= 0x0f; |
| 436 | rr[ 4] &= 0xfc; rr[ 7] &= 0x0f; |
| 437 | rr[ 8] &= 0xfc; rr[11] &= 0x0f; |
| 438 | rr[12] &= 0xfc; rr[15] &= 0x0f; |
| 439 | load_p11(key->u.p11.r, rr); |
| 440 | #endif |
| 441 | } |
| 442 | |
| 443 | /* --- @poly1305_macinit@ --- * |
| 444 | * |
| 445 | * Arguments: @poly1305_ctx *ctx@ = MAC context to fill in |
| 446 | * @const poly1305_key *key@ = pointer to key structure to use |
| 447 | * @const void *iv@ = pointer to mask string |
| 448 | * |
| 449 | * Returns: --- |
| 450 | * |
| 451 | * Use: Initializes a MAC context for use. The key can be discarded |
| 452 | * at any time. |
| 453 | * |
| 454 | * It is permitted for @iv@ to be null, though it is not then |
| 455 | * possible to complete the MAC computation on @ctx@. The |
| 456 | * resulting context may still be useful, e.g., as an operand to |
| 457 | * @poly1305_concat@. |
| 458 | */ |
| 459 | |
| 460 | void poly1305_macinit(poly1305_ctx *ctx, |
| 461 | const poly1305_key *key, const void *iv) |
| 462 | { |
| 463 | const octet *s = iv; |
| 464 | #if POLY1305_IMPL == 26 |
| 465 | uint32 s0, s1, s2, s3; |
| 466 | #else |
| 467 | unsigned i; |
| 468 | #endif |
| 469 | |
| 470 | #if POLY1305_IMPL == 26 |
| 471 | if (s) { |
| 472 | s0 = LOAD32_L(s + 0); s1 = LOAD32_L(s + 4); |
| 473 | s2 = LOAD32_L(s + 8); s3 = LOAD32_L(s + 12); |
| 474 | ctx->u.p26.s0 = P26W0(s); ctx->u.p26.s1 = P26W1(s); |
| 475 | ctx->u.p26.s2 = P26W2(s); ctx->u.p26.s3 = P26W3(s); |
| 476 | ctx->u.p26.s4 = P26W4(s); |
| 477 | } |
| 478 | ctx->u.p26.h[0] = ctx->u.p26.h[1] = ctx->u.p26.h[2] = |
| 479 | ctx->u.p26.h[3] = ctx->u.p26.h[4] = 0; |
| 480 | #else |
| 481 | if (s) load_p11(ctx->u.p11.s, s); |
| 482 | for (i = 0; i < 12; i++) ctx->u.p11.h[i] = 0; |
| 483 | #endif |
| 484 | ctx->k = *key; |
| 485 | ctx->nbuf = 0; |
| 486 | ctx->count = 0; |
| 487 | } |
| 488 | |
| 489 | /* --- @poly1305_copy@ --- * |
| 490 | * |
| 491 | * Arguments: @poly1305_ctx *to@ = destination context |
| 492 | * @const poly1305_ctx *from@ = source context |
| 493 | * |
| 494 | * Returns: --- |
| 495 | * |
| 496 | * Use: Duplicates a Poly1305 MAC context. The destination need not |
| 497 | * have been initialized. Both contexts can be used |
| 498 | * independently afterwards. |
| 499 | */ |
| 500 | |
| 501 | void poly1305_copy(poly1305_ctx *ctx, const poly1305_ctx *from) |
| 502 | { *ctx = *from; } |
| 503 | |
| 504 | /* --- @poly1305_hash@ --- * |
| 505 | * |
| 506 | * Arguments: @poly1305_ctx *ctx@ = MAC context to update |
| 507 | * @const void *p@ = pointer to message data |
| 508 | * @size_t sz@ = length of message data |
| 509 | * |
| 510 | * Returns: --- |
| 511 | * |
| 512 | * Use: Processes a chunk of message. The message pieces may have |
| 513 | * arbitrary lengths, and may be empty. |
| 514 | */ |
| 515 | |
| 516 | static void update_full(poly1305_ctx *ctx, const octet *p) |
| 517 | { |
| 518 | felt t; |
| 519 | #if POLY1305_IMPL == 26 |
| 520 | uint32 |
| 521 | m0 = LOAD32_L(p + 0), m1 = LOAD32_L(p + 4), |
| 522 | m2 = LOAD32_L(p + 8), m3 = LOAD32_L(p + 12); |
| 523 | |
| 524 | t[0] = ctx->u.p26.h[0] + P26W0(m); |
| 525 | t[1] = ctx->u.p26.h[1] + P26W1(m); |
| 526 | t[2] = ctx->u.p26.h[2] + P26W2(m); |
| 527 | t[3] = ctx->u.p26.h[3] + P26W3(m); |
| 528 | t[4] = ctx->u.p26.h[4] + P26W4(m) + 0x01000000; |
| 529 | #else |
| 530 | unsigned i; |
| 531 | |
| 532 | load_p11(t, p); t[11] += 0x100; |
| 533 | for (i = 0; i < 12; i++) t[i] += ctx->u.p11.h[i]; |
| 534 | #endif |
| 535 | |
| 536 | mul_r(ctx, ctx->u.P.h, t); |
| 537 | ctx->count++; |
| 538 | } |
| 539 | |
| 540 | static const rsvr_policy pol = { 0, 16, 16 }; |
| 541 | |
| 542 | void poly1305_hash(poly1305_ctx *ctx, const void *p, size_t sz) |
| 543 | { |
| 544 | rsvr_state st; |
| 545 | const octet *q = p; |
| 546 | |
| 547 | rsvr_setup(&st, &pol, &ctx->buf, &ctx->nbuf, p, sz); |
| 548 | RSVR_DO(&st) while ((q = RSVR_NEXT(&st, 16)) != 0) update_full(ctx, q); |
| 549 | } |
| 550 | |
| 551 | /* --- @poly1305_flush@ --- * |
| 552 | * |
| 553 | * Arguments: @poly1305_ctx *ctx@ = MAC context to flush |
| 554 | * |
| 555 | * Returns: --- |
| 556 | * |
| 557 | * Use: Forces any buffered message data in the context to be |
| 558 | * processed. This has no effect if the message processed so |
| 559 | * far is a whole number of blocks. Flushing is performed |
| 560 | * automatically by @poly1305_done@, but it may be necessary to |
| 561 | * force it by hand when using @poly1305_concat@. |
| 562 | * (Alternatively, you might use @poly1305_flushzero@ instead.) |
| 563 | * |
| 564 | * Flushing a partial block has an observable effect on the |
| 565 | * computation: the resulting state is (with high probability) |
| 566 | * dissimilar to any state reachable with a message which is a |
| 567 | * whole number of blocks long. |
| 568 | */ |
| 569 | |
| 570 | void poly1305_flush(poly1305_ctx *ctx) |
| 571 | { |
| 572 | felt t; |
| 573 | #if POLY1305_IMPL == 26 |
| 574 | uint32 m0, m1, m2, m3; |
| 575 | #else |
| 576 | unsigned i; |
| 577 | #endif |
| 578 | |
| 579 | if (!ctx->nbuf) return; |
| 580 | ctx->buf[ctx->nbuf++] = 1; memset(ctx->buf + ctx->nbuf, 0, 16 - ctx->nbuf); |
| 581 | #if POLY1305_IMPL == 26 |
| 582 | m0 = LOAD32_L(ctx->buf + 0); m1 = LOAD32_L(ctx->buf + 4); |
| 583 | m2 = LOAD32_L(ctx->buf + 8); m3 = LOAD32_L(ctx->buf + 12); |
| 584 | |
| 585 | t[0] = ctx->u.p26.h[0] + P26W0(m); |
| 586 | t[1] = ctx->u.p26.h[1] + P26W1(m); |
| 587 | t[2] = ctx->u.p26.h[2] + P26W2(m); |
| 588 | t[3] = ctx->u.p26.h[3] + P26W3(m); |
| 589 | t[4] = ctx->u.p26.h[4] + P26W4(m); |
| 590 | #else |
| 591 | load_p11(t, ctx->buf); |
| 592 | for (i = 0; i < 12; i++) t[i] += ctx->u.p11.h[i]; |
| 593 | #endif |
| 594 | |
| 595 | mul_r(ctx, ctx->u.P.h, t); |
| 596 | ctx->nbuf = 0; ctx->count++; |
| 597 | } |
| 598 | |
| 599 | /* --- @poly1305_flushzero@ --- * |
| 600 | * |
| 601 | * Arguments: @poly1305_ctx *ctx@ = MAC context to flush |
| 602 | * |
| 603 | * Returns: --- |
| 604 | * |
| 605 | * Use: Forces any buffered message data in the context to be |
| 606 | * processed, by hashing between zero and fifteen additional |
| 607 | * zero bytes. Like @poly1305_flush@, this has no effect if the |
| 608 | * the message processed so far is a whole number of blocks. |
| 609 | * Unlike @poly1305_flush@, the behaviour if the message is not |
| 610 | * a whole number of blocks is equivalent to actually hashing |
| 611 | * some extra data. |
| 612 | */ |
| 613 | |
| 614 | void poly1305_flushzero(poly1305_ctx *ctx) |
| 615 | { |
| 616 | if (!ctx->nbuf) return; |
| 617 | memset(ctx->buf + ctx->nbuf, 0, 16 - ctx->nbuf); |
| 618 | update_full(ctx, ctx->buf); |
| 619 | ctx->nbuf = 0; |
| 620 | } |
| 621 | |
| 622 | /* --- @poly1305_concat@ --- * |
| 623 | * |
| 624 | * Arguments: @poly1305_ctx *ctx@ = destination context |
| 625 | * @const poly1305_ctx *prefix, *suffix@ = two operand contexts |
| 626 | * |
| 627 | * Returns: --- |
| 628 | * |
| 629 | * Use: The two operand contexts @prefix@ and @suffix@ represent |
| 630 | * processing of two messages %$m$% and %$m'$%; the effect is to |
| 631 | * set @ctx@ to the state corresponding to their concatenation |
| 632 | * %$m \cat m'$%. |
| 633 | * |
| 634 | * All three contexts must have been initialized using the same |
| 635 | * key value (though not necessarily from the same key |
| 636 | * structure). The mask values associated with the input |
| 637 | * contexts are irrelevant. The @prefix@ message %$m$% must be |
| 638 | * a whole number of blocks long: this can be arranged by |
| 639 | * flushing the context. The @suffix@ message need not be a |
| 640 | * whole number of blocks long. All of the contexts remain |
| 641 | * operational and can be used independently afterwards. |
| 642 | */ |
| 643 | |
| 644 | void poly1305_concat(poly1305_ctx *ctx, |
| 645 | const poly1305_ctx *prefix, const poly1305_ctx *suffix) |
| 646 | { |
| 647 | /* Assume that lengths are public, so it's safe to behave conditionally on |
| 648 | * the bits of ctx->count. |
| 649 | */ |
| 650 | unsigned long n; |
| 651 | unsigned i; |
| 652 | felt x; |
| 653 | #if POLY1305_IMPL == 26 |
| 654 | uint32 x0, x1, x2, x3, x4, y0, y1, y2, y3, y4; |
| 655 | #else |
| 656 | uint32 y[12]; |
| 657 | #endif |
| 658 | |
| 659 | /* We can only concatenate if the prefix is block-aligned. */ |
| 660 | assert(!prefix->nbuf); |
| 661 | |
| 662 | /* The hash for a message m = m_{k-1} m_{k-2} ... m_1 m_0 is h_r(m) = |
| 663 | * SUM_{0<=i<k} m_i r^{i+1}. If we have two messages, m, m', of lengths k |
| 664 | * and k' blocks respectively, then |
| 665 | * |
| 666 | * h_r(m || m') = SUM_{0<=i<k} m_i r^{k'+i+1} + |
| 667 | * SUM_{0<=i<k'} m'_i r^{i+1} |
| 668 | * = r^{k'} h_r(m) + h_r(m') |
| 669 | * |
| 670 | * This is simple left-to-right square-and-multiply exponentiation. |
| 671 | */ |
| 672 | n = suffix->count; |
| 673 | x[0] = 1; |
| 674 | #if POLY1305_IMPL == 26 |
| 675 | x[1] = x[2] = x[3] = x[4] = 0; |
| 676 | #else |
| 677 | for (i = 1; i < 12; i++) x[i] = 0; |
| 678 | #endif |
| 679 | #define BIT (1ul << (ULONG_BITS - 1)) |
| 680 | if (n) { |
| 681 | i = ULONG_BITS; |
| 682 | while (!(n & BIT)) { n <<= 1; i--; } |
| 683 | mul_r(prefix, x, x); n <<= 1; i--; |
| 684 | while (i--) { sqr(x, x); if (n & BIT) mul_r(prefix, x, x); n <<= 1; } |
| 685 | } |
| 686 | #undef BIT |
| 687 | mul(x, prefix->u.P.h, x); |
| 688 | |
| 689 | /* Add on the suffix hash. */ |
| 690 | #if POLY1305_IMPL == 26 |
| 691 | /* We're going to add the two hashes elementwise. Both h' = h_r(m') and |
| 692 | * x = r^{k'} h_r(m) are bounded above by 2^27, so the sum will be bounded |
| 693 | * by 2^28; but this is too large to leave in the accumulator. (Strictly, |
| 694 | * we could get away with it, but the caller can in theory chain an |
| 695 | * arbitrary number of concatenations and expect us to cope, and we'd |
| 696 | * definitely overflow eventually.) So we reduce. Since the excess is so |
| 697 | * small, a single round of `CARRY_REDUCE' is enough. |
| 698 | */ |
| 699 | x0 = x[0] + suffix->u.p26.h[0]; x1 = x[1] + suffix->u.p26.h[1]; |
| 700 | x2 = x[2] + suffix->u.p26.h[2]; x3 = x[3] + suffix->u.p26.h[3]; |
| 701 | x4 = x[4] + suffix->u.p26.h[4]; |
| 702 | CARRY_REDUCE(y, x); |
| 703 | ctx->u.p26.h[0] = y0; ctx->u.p26.h[1] = y1; ctx->u.p26.h[2] = y2; |
| 704 | ctx->u.p26.h[3] = y3; ctx->u.p26.h[4] = y4; |
| 705 | #else |
| 706 | /* We'll add the two hashes elementwise and have to reduce again. The |
| 707 | * numbers are different, but the reasoning is basically the same. |
| 708 | */ |
| 709 | for (i = 0; i < 12; i++) y[i] = x[i] + suffix->u.p11.h[i]; |
| 710 | carry_reduce(y); |
| 711 | for (i = 0; i < 12; i++) ctx->u.p11.h[i] = y[i]; |
| 712 | #endif |
| 713 | |
| 714 | /* Copy the remaining pieces of the context to set up the result. */ |
| 715 | if (ctx != suffix) { |
| 716 | memcpy(ctx->buf, suffix->buf, suffix->nbuf); |
| 717 | ctx->nbuf = suffix->nbuf; |
| 718 | } |
| 719 | ctx->count = prefix->count + suffix->count; |
| 720 | } |
| 721 | |
| 722 | /* --- @poly1305_done@ --- * |
| 723 | * |
| 724 | * Arguments: @poly1305_ctx *ctx@ = MAC context to finish |
| 725 | * @void *h@ = buffer to write the tag to |
| 726 | * |
| 727 | * Returns: --- |
| 728 | * |
| 729 | * Use: Completes a Poly1305 MAC tag computation. |
| 730 | */ |
| 731 | |
| 732 | void poly1305_done(poly1305_ctx *ctx, void *h) |
| 733 | { |
| 734 | octet *p = h; |
| 735 | |
| 736 | #if POLY1305_IMPL == 26 |
| 737 | uint32 m_sub, t, c; |
| 738 | uint32 h0, h1, h2, h3, h4, hh0, hh1, hh2, hh3, hh4; |
| 739 | |
| 740 | /* If there's anything left over in the buffer, pad it to form a final |
| 741 | * coefficient and update the evaluation one last time. |
| 742 | */ |
| 743 | poly1305_flush(ctx); |
| 744 | |
| 745 | /* Collect the final hash state. */ |
| 746 | h0 = ctx->u.p26.h[0]; |
| 747 | h1 = ctx->u.p26.h[1]; |
| 748 | h2 = ctx->u.p26.h[2]; |
| 749 | h3 = ctx->u.p26.h[3]; |
| 750 | h4 = ctx->u.p26.h[4]; |
| 751 | |
| 752 | /* Reduce the final value mod 2^130 - 5. First pass: set h <- h + |
| 753 | * 5 floor(h/2^130). After this, the low pieces of h will be normalized: |
| 754 | * 0 <= h_i < 2^26 for 0 <= i < 4; and 0 <= h_4 < 2^26 + 1. In the |
| 755 | * (highly unlikely) event that h_4 >= 2^26, set c and truncate to 130 |
| 756 | * bits. |
| 757 | */ |
| 758 | c = h4 >> 26; h4 &= M26; |
| 759 | h0 += 5*c; c = h0 >> 26; h0 &= M26; |
| 760 | h1 += c; c = h1 >> 26; h1 &= M26; |
| 761 | h2 += c; c = h2 >> 26; h2 &= M26; |
| 762 | h3 += c; c = h3 >> 26; h3 &= M26; |
| 763 | h4 += c; c = h4 >> 26; h4 &= M26; |
| 764 | |
| 765 | /* Calculate h' = h - (2^130 - 5). If h' >= 0 then t ends up 1; otherwise |
| 766 | * it's zero. |
| 767 | */ |
| 768 | t = h0 + 5; hh0 = t&M26; t >>= 26; |
| 769 | t += h1; hh1 = t&M26; t >>= 26; |
| 770 | t += h2; hh2 = t&M26; t >>= 26; |
| 771 | t += h3; hh3 = t&M26; t >>= 26; |
| 772 | t += h4; hh4 = t&M26; t >>= 26; |
| 773 | |
| 774 | /* Keep the subtraction result above if t or c is set. */ |
| 775 | m_sub = -(t | c); |
| 776 | h0 = (hh0&m_sub) | (h0&~m_sub); |
| 777 | h1 = (hh1&m_sub) | (h1&~m_sub); |
| 778 | h2 = (hh2&m_sub) | (h2&~m_sub); |
| 779 | h3 = (hh3&m_sub) | (h3&~m_sub); |
| 780 | h4 = (hh4&m_sub) | (h4&~m_sub); |
| 781 | |
| 782 | /* Add the mask onto the hash result. */ |
| 783 | t = h0 + ctx->u.p26.s0; h0 = t&M26; t >>= 26; |
| 784 | t += h1 + ctx->u.p26.s1; h1 = t&M26; t >>= 26; |
| 785 | t += h2 + ctx->u.p26.s2; h2 = t&M26; t >>= 26; |
| 786 | t += h3 + ctx->u.p26.s3; h3 = t&M26; t >>= 26; |
| 787 | t += h4 + ctx->u.p26.s4; h4 = t&M26; t >>= 26; |
| 788 | |
| 789 | /* Convert this mess back into 32-bit words. We lose the top two bits, |
| 790 | * but that's fine. |
| 791 | */ |
| 792 | h0 = (h0 >> 0) | ((h1 & 0x0000003f) << 26); |
| 793 | h1 = (h1 >> 6) | ((h2 & 0x00000fff) << 20); |
| 794 | h2 = (h2 >> 12) | ((h3 & 0x0003ffff) << 14); |
| 795 | h3 = (h3 >> 18) | ((h4 & 0x00ffffff) << 8); |
| 796 | |
| 797 | /* All done. */ |
| 798 | STORE32_L(p + 0, h0); STORE32_L(p + 4, h1); |
| 799 | STORE32_L(p + 8, h2); STORE32_L(p + 12, h3); |
| 800 | #else |
| 801 | uint16 hh[12], hi[12], c, t, m_sub; |
| 802 | uint32 a; |
| 803 | unsigned i, j, n; |
| 804 | |
| 805 | /* If there's anything left over in the buffer, pad it to form a final |
| 806 | * coefficient and update the evaluation one last time. |
| 807 | */ |
| 808 | poly1305_flush(ctx); |
| 809 | |
| 810 | /* Collect the final hash state. */ |
| 811 | for (i = 0; i < 12; i++) hh[i] = ctx->u.p11.h[i]; |
| 812 | |
| 813 | /* Reduce the final value mod 2^130 - 5. First pass: set h <- h + |
| 814 | * 5 floor(h/2^130). After this, the low pieces of h will be normalized: |
| 815 | * 0 <= h_i < 2^{w_i} for 0 <= i < 11; and 0 <= h_{11} < 2^10 + 1. In the |
| 816 | * (highly unlikely) event that h_{11} >= 2^10, set c and truncate to 130 |
| 817 | * bits. |
| 818 | */ |
| 819 | c = 5*(hh[11] >> 10); hh[11] &= M10; |
| 820 | for (i = 0; i < 12; i++) { |
| 821 | if (i == 5 || i == 11) { c += hh[i]; hh[i] = c&M10; c >>= 10; } |
| 822 | else { c += hh[i]; hh[i] = c&M11; c >>= 11; } |
| 823 | } |
| 824 | |
| 825 | /* Calculate h' = h - (2^130 - 5). If h' >= 0 then t ends up 1; otherwise |
| 826 | * it's zero. |
| 827 | */ |
| 828 | for (i = 0, t = 5; i < 12; i++) { |
| 829 | t += hh[i]; |
| 830 | if (i == 5 || i == 11) { hi[i] = t&M10; t >>= 10; } |
| 831 | else { hi[i] = t&M11; t >>= 11; } |
| 832 | } |
| 833 | |
| 834 | /* Keep the subtraction result above if t or c is set. */ |
| 835 | m_sub = -(t | c); |
| 836 | for (i = 0; i < 12; i++) hh[i] = (hi[i]&m_sub) | (hh[i]&~m_sub); |
| 837 | |
| 838 | /* Add the mask onto the hash result. */ |
| 839 | for (i = 0, t = 0; i < 12; i++) { |
| 840 | t += hh[i] + ctx->u.p11.s[i]; |
| 841 | if (i == 5 || i == 11) { hh[i] = t&M10; t >>= 10; } |
| 842 | else { hh[i] = t&M11; t >>= 11; } |
| 843 | } |
| 844 | |
| 845 | /* Convert this mess back into bytes. We lose the top two bits, but that's |
| 846 | * fine. |
| 847 | */ |
| 848 | for (i = j = n = 0, a = 0; i < 16; i++) { |
| 849 | if (n < 8) { |
| 850 | a |= hh[j] << n; |
| 851 | n += (j == 5 || j == 11) ? 10 : 11; |
| 852 | j++; |
| 853 | } |
| 854 | p[i] = a&0xff; a >>= 8; n -= 8; |
| 855 | } |
| 856 | |
| 857 | #endif |
| 858 | } |
| 859 | |
| 860 | /*----- Test rig ----------------------------------------------------------*/ |
| 861 | |
| 862 | #ifdef TEST_RIG |
| 863 | |
| 864 | #include <mLib/macros.h> |
| 865 | #include <mLib/testrig.h> |
| 866 | |
| 867 | #include "ct.h" |
| 868 | #include "rijndael-ecb.h" |
| 869 | |
| 870 | static int vrf_hash(dstr v[]) |
| 871 | { |
| 872 | poly1305_key k; |
| 873 | poly1305_ctx ctx; |
| 874 | dstr t = DSTR_INIT; |
| 875 | unsigned i, j; |
| 876 | |
| 877 | if (v[0].len != 16) { fprintf(stderr, "bad key length\n"); exit(2); } |
| 878 | if (v[1].len != 16) { fprintf(stderr, "bad mask length\n"); exit(2); } |
| 879 | if (v[3].len != 16) { fprintf(stderr, "bad tag length\n"); exit(2); } |
| 880 | dstr_ensure(&t, 16); t.len = 16; |
| 881 | |
| 882 | ct_poison(v[0].buf, v[0].len); |
| 883 | poly1305_keyinit(&k, v[0].buf, v[0].len); |
| 884 | for (i = 0; i < v[2].len; i++) { |
| 885 | for (j = i; j < v[2].len; j++) { |
| 886 | poly1305_macinit(&ctx, &k, v[1].buf); |
| 887 | poly1305_hash(&ctx, v[2].buf, i); |
| 888 | poly1305_hash(&ctx, v[2].buf + i, j - i); |
| 889 | poly1305_hash(&ctx, v[2].buf + j, v[2].len - j); |
| 890 | poly1305_done(&ctx, t.buf); |
| 891 | ct_remedy(t.buf, t.len); |
| 892 | if (MEMCMP(t.buf, !=, v[3].buf, 16)) { |
| 893 | fprintf(stderr, "failed..."); |
| 894 | fprintf(stderr, "\n\tkey = "); type_hex.dump(&v[0], stderr); |
| 895 | fprintf(stderr, "\n\tmask = "); type_hex.dump(&v[1], stderr); |
| 896 | fprintf(stderr, "\n\tmsg = "); type_hex.dump(&v[2], stderr); |
| 897 | fprintf(stderr, "\n\texp = "); type_hex.dump(&v[3], stderr); |
| 898 | fprintf(stderr, "\n\tcalc = "); type_hex.dump(&t, stderr); |
| 899 | fprintf(stderr, "\n\tsplits = 0 .. %u .. %u .. %lu\n", |
| 900 | i, j, (unsigned long)v[1].len); |
| 901 | return (0); |
| 902 | } |
| 903 | } |
| 904 | } |
| 905 | return (1); |
| 906 | } |
| 907 | |
| 908 | static int vrf_cat(dstr v[]) |
| 909 | { |
| 910 | poly1305_key k; |
| 911 | poly1305_ctx ctx, cc[3]; |
| 912 | dstr t = DSTR_INIT; |
| 913 | unsigned i; |
| 914 | int ok = 1; |
| 915 | |
| 916 | if (v[0].len != 16) { fprintf(stderr, "bad key length\n"); exit(2); } |
| 917 | if (v[1].len != 16) { fprintf(stderr, "bad mask length\n"); exit(2); } |
| 918 | if (v[5].len != 16) { fprintf(stderr, "bad tag length\n"); exit(2); } |
| 919 | dstr_ensure(&t, 16); t.len = 16; |
| 920 | |
| 921 | poly1305_keyinit(&k, v[0].buf, v[0].len); |
| 922 | poly1305_macinit(&ctx, &k, v[1].buf); |
| 923 | for (i = 0; i < 3; i++) { |
| 924 | poly1305_macinit(&cc[i], &k, 0); |
| 925 | poly1305_hash(&cc[i], v[i + 2].buf, v[i + 2].len); |
| 926 | } |
| 927 | for (i = 0; i < 2; i++) { |
| 928 | if (!i) { |
| 929 | poly1305_concat(&ctx, &cc[1], &cc[2]); |
| 930 | poly1305_concat(&ctx, &cc[0], &ctx); |
| 931 | } else { |
| 932 | poly1305_concat(&ctx, &cc[0], &cc[1]); |
| 933 | poly1305_concat(&ctx, &ctx, &cc[2]); |
| 934 | } |
| 935 | poly1305_done(&ctx, t.buf); |
| 936 | if (MEMCMP(t.buf, !=, v[5].buf, 16)) { |
| 937 | fprintf(stderr, "failed..."); |
| 938 | fprintf(stderr, "\n\tkey = "); type_hex.dump(&v[0], stderr); |
| 939 | fprintf(stderr, "\n\tmask = "); type_hex.dump(&v[1], stderr); |
| 940 | fprintf(stderr, "\n\tmsg[0] = "); type_hex.dump(&v[2], stderr); |
| 941 | fprintf(stderr, "\n\tmsg[1] = "); type_hex.dump(&v[3], stderr); |
| 942 | fprintf(stderr, "\n\tmsg[2] = "); type_hex.dump(&v[4], stderr); |
| 943 | fprintf(stderr, "\n\texp = "); type_hex.dump(&v[5], stderr); |
| 944 | fprintf(stderr, "\n\tcalc = "); type_hex.dump(&t, stderr); |
| 945 | fprintf(stderr, "\n\tassoc = %s\n", |
| 946 | !i ? "msg[0] || (msg[1] || msg[2])" : |
| 947 | "(msg[0] || msg[1]) || msg[2]"); |
| 948 | ok = 0; |
| 949 | } |
| 950 | } |
| 951 | return (ok); |
| 952 | } |
| 953 | |
| 954 | #define MSZMAX 1000 |
| 955 | |
| 956 | static int vrf_mct(dstr v[]) |
| 957 | { |
| 958 | unsigned j, msz; |
| 959 | unsigned long i, start_iter, end_iter; |
| 960 | rijndael_ecbctx rij; |
| 961 | poly1305_key key; |
| 962 | poly1305_ctx mac; |
| 963 | dstr dk = DSTR_INIT, dr = DSTR_INIT, dn = DSTR_INIT, |
| 964 | dt = DSTR_INIT, dm = DSTR_INIT; |
| 965 | octet *k, *r, s[16], *n, *t, *m; |
| 966 | int ok = 1; |
| 967 | |
| 968 | DENSURE(&dk, 16); k = (octet *)dk.buf; dk.len = 16; |
| 969 | DENSURE(&dr, 16); r = (octet *)dr.buf; dr.len = 16; |
| 970 | DENSURE(&dn, 16); n = (octet *)dn.buf; dn.len = 16; |
| 971 | DENSURE(&dt, 16); t = (octet *)dt.buf; dt.len = 16; |
| 972 | DENSURE(&dm, MSZMAX); m = (octet *)dm.buf; dm.len = MSZMAX; |
| 973 | memset(m, 0, MSZMAX); |
| 974 | |
| 975 | if (v[0].len != 16) { fprintf(stderr, "AES key len\n"); exit(2); } |
| 976 | if (v[1].len != 16) { fprintf(stderr, "poly key len\n"); exit(2); } |
| 977 | if (v[2].len != 16) { fprintf(stderr, "nonce len\n"); exit(2); } |
| 978 | if (v[3].len != MSZMAX) { fprintf(stderr, "msgbuf len\n"); exit(2); } |
| 979 | if (v[6].len != 16) { fprintf(stderr, "result len\n"); exit(2); } |
| 980 | memcpy(k, v[0].buf, 16); |
| 981 | memcpy(r, v[1].buf, 16); |
| 982 | memcpy(n, v[2].buf, 16); |
| 983 | memcpy(m, v[3].buf, MSZMAX); |
| 984 | start_iter = *(unsigned long *)v[4].buf; |
| 985 | end_iter = *(unsigned long *)v[5].buf; |
| 986 | if (end_iter < start_iter) { fprintf(stderr, "iter bounds\n"); exit(2); } |
| 987 | |
| 988 | rijndael_ecbinit(&rij, k, 16, 0); |
| 989 | poly1305_keyinit(&key, r, 16); |
| 990 | for (i = start_iter; i < end_iter; i++) { |
| 991 | msz = 0; |
| 992 | for (;;) { |
| 993 | rijndael_ecbencrypt(&rij, n, s, 16); |
| 994 | poly1305_macinit(&mac, &key, s); |
| 995 | poly1305_hash(&mac, m, msz); |
| 996 | poly1305_done(&mac, t); |
| 997 | if (msz >= MSZMAX) break; |
| 998 | n[0] ^= i&0xff; |
| 999 | for (j = 0; j < 16; j++) n[j] ^= t[j]; |
| 1000 | if (msz%2) { |
| 1001 | for (j = 0; j < 16; j++) k[j] ^= t[j]; |
| 1002 | rijndael_ecbinit(&rij, k, 16, 0); |
| 1003 | } |
| 1004 | if (msz%3) { |
| 1005 | for (j = 0; j < 16; j++) r[j] ^= t[j]; |
| 1006 | poly1305_keyinit(&key, r, 16); |
| 1007 | } |
| 1008 | m[msz++] ^= t[0]; |
| 1009 | } |
| 1010 | } |
| 1011 | |
| 1012 | if (MEMCMP(t, !=, v[6].buf, 16)) { |
| 1013 | ok = 0; |
| 1014 | fprintf(stderr, "failed..."); |
| 1015 | fprintf(stderr, "\n\tinitial k = "); type_hex.dump(&v[0], stderr); |
| 1016 | fprintf(stderr, "\n\tinitial r = "); type_hex.dump(&v[1], stderr); |
| 1017 | fprintf(stderr, "\n\tinitial n = "); type_hex.dump(&v[2], stderr); |
| 1018 | fprintf(stderr, "\n\tinitial m = "); type_hex.dump(&v[3], stderr); |
| 1019 | fprintf(stderr, "\n\tstart iter = %lu", start_iter); |
| 1020 | fprintf(stderr, "\n\tend iter = %lu", end_iter); |
| 1021 | fprintf(stderr, "\n\tfinal k = "); type_hex.dump(&dk, stderr); |
| 1022 | fprintf(stderr, "\n\tfinal r = "); type_hex.dump(&dr, stderr); |
| 1023 | fprintf(stderr, "\n\tfinal n = "); type_hex.dump(&dn, stderr); |
| 1024 | fprintf(stderr, "\n\tfinal m = "); type_hex.dump(&dm, stderr); |
| 1025 | fprintf(stderr, "\n\texpected = "); type_hex.dump(&v[6], stderr); |
| 1026 | fprintf(stderr, "\n\tcalculated = "); type_hex.dump(&dt, stderr); |
| 1027 | fputc('\n', stderr); |
| 1028 | } |
| 1029 | |
| 1030 | dstr_destroy(&dk); |
| 1031 | dstr_destroy(&dr); |
| 1032 | dstr_destroy(&dn); |
| 1033 | dstr_destroy(&dt); |
| 1034 | dstr_destroy(&dm); |
| 1035 | return (ok); |
| 1036 | } |
| 1037 | |
| 1038 | static const struct test_chunk tests[] = { |
| 1039 | { "poly1305-hash", vrf_hash, |
| 1040 | { &type_hex, &type_hex, &type_hex, &type_hex } }, |
| 1041 | { "poly1305-cat", vrf_cat, |
| 1042 | { &type_hex, &type_hex, &type_hex, &type_hex, &type_hex, &type_hex } }, |
| 1043 | { "poly1305-mct", vrf_mct, |
| 1044 | { &type_hex, &type_hex, &type_hex, &type_hex, |
| 1045 | &type_ulong, &type_ulong, &type_hex } }, |
| 1046 | { 0, 0, { 0 } } |
| 1047 | }; |
| 1048 | |
| 1049 | int main(int argc, char *argv[]) |
| 1050 | { |
| 1051 | test_run(argc, argv, tests, SRCDIR "/t/poly1305"); |
| 1052 | return (0); |
| 1053 | } |
| 1054 | |
| 1055 | #endif |
| 1056 | |
| 1057 | /*----- That's all, folks -------------------------------------------------*/ |