X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/ac082cc94574e921813a11a50e60f0557af04c7c..4741bd9fa1f6dfc9f6482d6e1335ad046dbd6a78:/symm/poly1305.c diff --git a/symm/poly1305.c b/symm/poly1305.c index 9cc97eab..9deaf19f 100644 --- a/symm/poly1305.c +++ b/symm/poly1305.c @@ -33,6 +33,7 @@ #include #include "poly1305.h" +#include "rsvr.h" /*----- Global variables --------------------------------------------------*/ @@ -60,11 +61,11 @@ typedef uint32 felt[5]; #define P p26 /* Convert 32-bit words into field-element pieces. */ -#define P26W0(x) ((x##0)&0x03ffffff) -#define P26W1(x) ((((x##1)&0x000fffff) << 6) | (((x##0) >> 26)&0x0000003f)) -#define P26W2(x) ((((x##2)&0x00003fff) << 12) | (((x##1) >> 20)&0x00000fff)) -#define P26W3(x) ((((x##3)&0x000000ff) << 18) | (((x##2) >> 14)&0x0003ffff)) -#define P26W4(x) (((x##3) >> 8)&0x00ffffff) +#define P26W0(x) (((x##0) << 0)&0x03ffffff) +#define P26W1(x) ((((x##1) << 6)&0x03ffffc0) | (((x##0) >> 26)&0x0000003f)) +#define P26W2(x) ((((x##2) << 12)&0x03ffffff) | (((x##1) >> 20)&0x00000fff)) +#define P26W3(x) ((((x##3) << 18)&0x03fc0000) | (((x##2) >> 14)&0x0003ffff)) +#define P26W4(x) (((x##3) >> 8)&0x00ffffff) /* Propagate carries in parallel. If 0 <= u_i < 2^26 c_i, then we shall have * 0 <= v_0 < 2^26 + 5 c_4, and 0 <= v_i < 2^26 + c_{i-1} for 1 <= i < 5. @@ -183,7 +184,7 @@ static void mul_r(const poly1305_ctx *ctx, felt z, const felt x) #endif -/*----- Low-level implementation for 32/64-bit targets --------------------*/ +/*----- Low-level implementation for 16/32-bit targets --------------------*/ #ifndef POLY1305_IMPL # define POLY1305_IMPL 11 @@ -230,9 +231,10 @@ static void carry_reduce(uint32 u[12]) uint32 c; /* Do sequential carry propagation (16-bit CPUs are less likely to benefit - * from instruction-level parallelism). Start at u_10; truncate it to 11 - * bits, and add the carry onto u_11. Truncate u_11 to 10 bits, and add - * five times the carry onto u_0. And so on. + * from instruction-level parallelism). Start at u_9; truncate it to 11 + * bits, and add the carry onto u_10. Truncate u10 to 11 bits, and add the + * carry onto u_11. Truncate u_11 to 10 bits, and add five times the carry + * onto u_0. And so on. * * The carry is larger than the pieces we're leaving behind. Let c_i be * the high portion of u_i, to be carried onto u_{i+1}. I claim that c_i < @@ -241,13 +243,15 @@ static void carry_reduce(uint32 u[12]) * carry is bounded above by 636*2^22 + 12785*2^10 < 2557*2^20. Hence, the * carry out is at most 2557*2^10, as claimed. * - * Once we reach u_10 for the second time, we start with u_10 < 2^11. The - * carry into u_10 is at most 2557*2^10 < 1279*2^11 as calculated above; so - * the carry out into u_11 is at most 1280. Since u_11 < 2^10 prior to - * this carry in, all of the u_i are now bounded above by 2^11. The final - * reduction therefore only needs a conditional subtraction. + * Once we reach u_9 for the second time, we start with u_9 < 2^11. The + * carry into u_9 is at most 2557*2^10 < 1279*2^11 as calculated above; so + * the carry out into u_10 is at most 1280. Since u_10 < 2^11 prior to + * this carry in, we now have u_10 < 2^11 + 1280 < 2^12; so the carry out + * into u_11 is at most 1. The final reduction therefore only needs a + * conditional subtraction. */ - { c = u[10] >> 11; u[10] &= M11; } + { c = u[9] >> 11; u[9] &= M11; } + { u[10] += c; c = u[10] >> 11; u[10] &= M11; } { u[11] += c; c = u[11] >> 10; u[11] &= M10; } { u[0] += 5*c; c = u[0] >> 11; u[0] &= M11; } for (i = 1; i < 5; i++) { u[i] += c; c = u[i] >> 11; u[i] &= M11; } @@ -533,28 +537,15 @@ static void update_full(poly1305_ctx *ctx, const octet *p) ctx->count++; } +static const rsvr_policy pol = { 0, 16, 16 }; + void poly1305_hash(poly1305_ctx *ctx, const void *p, size_t sz) { - const octet *pp = p; - size_t n; - - if (ctx->nbuf) { - if (sz < 16 - ctx->nbuf) { - memcpy(ctx->buf + ctx->nbuf, p, sz); - ctx->nbuf += sz; - return; - } - n = 16 - ctx->nbuf; - memcpy(ctx->buf + ctx->nbuf, pp, n); - update_full(ctx, ctx->buf); - pp += n; sz -= n; - } - while (sz >= 16) { - update_full(ctx, pp); - pp += 16; sz -= 16; - } - if (sz) memcpy(ctx->buf, pp, sz); - ctx->nbuf = sz; + rsvr_state st; + const octet *q = p; + + rsvr_setup(&st, &pol, &ctx->buf, &ctx->nbuf, p, sz); + RSVR_DO(&st) while ((q = RSVR_NEXT(&st, 16)) != 0) update_full(ctx, q); } /* --- @poly1305_flush@ --- * @@ -568,6 +559,7 @@ void poly1305_hash(poly1305_ctx *ctx, const void *p, size_t sz) * far is a whole number of blocks. Flushing is performed * automatically by @poly1305_done@, but it may be necessary to * force it by hand when using @poly1305_concat@. + * (Alternatively, you might use @poly1305_flushzero@ instead.) * * Flushing a partial block has an observable effect on the * computation: the resulting state is (with high probability) @@ -601,7 +593,29 @@ void poly1305_flush(poly1305_ctx *ctx) #endif mul_r(ctx, ctx->u.P.h, t); - ctx->count++; + ctx->nbuf = 0; ctx->count++; +} + +/* --- @poly1305_flushzero@ --- * + * + * Arguments: @poly1305_ctx *ctx@ = MAC context to flush + * + * Returns: --- + * + * Use: Forces any buffered message data in the context to be + * processed, by hashing between zero and fifteen additional + * zero bytes. Like @poly1305_flush@, this has no effect if the + * the message processed so far is a whole number of blocks. + * Unlike @poly1305_flush@, the behaviour if the message is not + * a whole number of blocks is equivalent to actually hashing + * some extra data. + */ + +void poly1305_flushzero(poly1305_ctx *ctx) +{ + if (!ctx->nbuf) return; + memset(ctx->buf + ctx->nbuf, 0, 16 - ctx->nbuf); + update_full(ctx, ctx->buf); ctx->nbuf = 0; } @@ -847,8 +861,12 @@ void poly1305_done(poly1305_ctx *ctx, void *h) #ifdef TEST_RIG +#include #include +#include "ct.h" +#include "rijndael-ecb.h" + static int vrf_hash(dstr v[]) { poly1305_key k; @@ -861,6 +879,7 @@ static int vrf_hash(dstr v[]) if (v[3].len != 16) { fprintf(stderr, "bad tag length\n"); exit(2); } dstr_ensure(&t, 16); t.len = 16; + ct_poison(v[0].buf, v[0].len); poly1305_keyinit(&k, v[0].buf, v[0].len); for (i = 0; i < v[2].len; i++) { for (j = i; j < v[2].len; j++) { @@ -869,7 +888,8 @@ static int vrf_hash(dstr v[]) poly1305_hash(&ctx, v[2].buf + i, j - i); poly1305_hash(&ctx, v[2].buf + j, v[2].len - j); poly1305_done(&ctx, t.buf); - if (memcmp(t.buf, v[3].buf, 16) != 0) { + ct_remedy(t.buf, t.len); + if (MEMCMP(t.buf, !=, v[3].buf, 16)) { fprintf(stderr, "failed..."); fprintf(stderr, "\n\tkey = "); type_hex.dump(&v[0], stderr); fprintf(stderr, "\n\tmask = "); type_hex.dump(&v[1], stderr); @@ -913,7 +933,7 @@ static int vrf_cat(dstr v[]) poly1305_concat(&ctx, &ctx, &cc[2]); } poly1305_done(&ctx, t.buf); - if (memcmp(t.buf, v[5].buf, 16) != 0) { + if (MEMCMP(t.buf, !=, v[5].buf, 16)) { fprintf(stderr, "failed..."); fprintf(stderr, "\n\tkey = "); type_hex.dump(&v[0], stderr); fprintf(stderr, "\n\tmask = "); type_hex.dump(&v[1], stderr); @@ -931,11 +951,76 @@ static int vrf_cat(dstr v[]) return (ok); } +#define MSZMAX 1000 + +static int vrf_mct(dstr v[]) +{ + unsigned j, msz; + unsigned long i, niter; + rijndael_ecbctx rij; + poly1305_key key; + poly1305_ctx mac; + dstr d = DSTR_INIT; + octet k[16], r[16], n[16], s[16], *t, m[MSZMAX] = { 0 }; + int ok = 1; + + if (v[0].len != 16) { fprintf(stderr, "AES key len\n"); exit(2); } + if (v[1].len != 16) { fprintf(stderr, "poly key len\n"); exit(2); } + if (v[2].len != 16) { fprintf(stderr, "nonce len\n"); exit(2); } + if (v[4].len != 16) { fprintf(stderr, "result len\n"); exit(2); } + memcpy(k, v[0].buf, 16); + memcpy(r, v[1].buf, 16); + memcpy(n, v[2].buf, 16); + niter = *(unsigned long *)v[3].buf; + dstr_ensure(&d, 16); d.len = 16; t = (octet *)d.buf; + + rijndael_ecbinit(&rij, k, 16, 0); + poly1305_keyinit(&key, r, 16); + for (i = 0; i < niter; i++) { + msz = 0; + for (;;) { + rijndael_ecbencrypt(&rij, n, s, 16); + poly1305_macinit(&mac, &key, s); + poly1305_hash(&mac, m, msz); + poly1305_done(&mac, t); + if (msz >= MSZMAX) break; + n[0] ^= i&0xff; + for (j = 0; j < 16; j++) n[j] ^= t[j]; + if (msz%2) { + for (j = 0; j < 16; j++) k[j] ^= t[j]; + rijndael_ecbinit(&rij, k, 16, 0); + } + if (msz%3) { + for (j = 0; j < 16; j++) r[j] ^= t[j]; + poly1305_keyinit(&key, r, 16); + } + m[msz++] ^= t[0]; + } + } + + if (MEMCMP(t, !=, v[4].buf, 16)) { + ok = 0; + fprintf(stderr, "failed..."); + fprintf(stderr, "\n\tinitial k = "); type_hex.dump(&v[0], stderr); + fprintf(stderr, "\n\tinitial r = "); type_hex.dump(&v[1], stderr); + fprintf(stderr, "\n\tinitial n = "); type_hex.dump(&v[2], stderr); + fprintf(stderr, "\n\titerations = %lu", niter); + fprintf(stderr, "\n\texpected = "); type_hex.dump(&v[4], stderr); + fprintf(stderr, "\n\tcalculated = "); type_hex.dump(&d, stderr); + fputc('\n', stderr); + } + + dstr_destroy(&d); + return (ok); +} + static const struct test_chunk tests[] = { { "poly1305-hash", vrf_hash, { &type_hex, &type_hex, &type_hex, &type_hex } }, { "poly1305-cat", vrf_cat, { &type_hex, &type_hex, &type_hex, &type_hex, &type_hex, &type_hex } }, + { "poly1305-mct", vrf_mct, + { &type_hex, &type_hex, &type_hex, &type_ulong, &type_hex } }, { 0, 0, { 0 } } };