#include <string.h>
#include "poly1305.h"
+#include "rsvr.h"
/*----- Global variables --------------------------------------------------*/
#define P p26
/* Convert 32-bit words into field-element pieces. */
-#define P26W0(x) ((x##0)&0x03ffffff)
-#define P26W1(x) ((((x##1)&0x000fffff) << 6) | (((x##0) >> 26)&0x0000003f))
-#define P26W2(x) ((((x##2)&0x00003fff) << 12) | (((x##1) >> 20)&0x00000fff))
-#define P26W3(x) ((((x##3)&0x000000ff) << 18) | (((x##2) >> 14)&0x0003ffff))
-#define P26W4(x) (((x##3) >> 8)&0x00ffffff)
+#define P26W0(x) (((x##0) << 0)&0x03ffffff)
+#define P26W1(x) ((((x##1) << 6)&0x03ffffc0) | (((x##0) >> 26)&0x0000003f))
+#define P26W2(x) ((((x##2) << 12)&0x03ffffff) | (((x##1) >> 20)&0x00000fff))
+#define P26W3(x) ((((x##3) << 18)&0x03fc0000) | (((x##2) >> 14)&0x0003ffff))
+#define P26W4(x) (((x##3) >> 8)&0x00ffffff)
/* Propagate carries in parallel. If 0 <= u_i < 2^26 c_i, then we shall have
* 0 <= v_0 < 2^26 + 5 c_4, and 0 <= v_i < 2^26 + c_{i-1} for 1 <= i < 5.
#endif
-/*----- Low-level implementation for 32/64-bit targets --------------------*/
+/*----- Low-level implementation for 16/32-bit targets --------------------*/
#ifndef POLY1305_IMPL
# define POLY1305_IMPL 11
uint32 c;
/* Do sequential carry propagation (16-bit CPUs are less likely to benefit
- * from instruction-level parallelism). Start at u_10; truncate it to 11
- * bits, and add the carry onto u_11. Truncate u_11 to 10 bits, and add
- * five times the carry onto u_0. And so on.
+ * from instruction-level parallelism). Start at u_9; truncate it to 11
+ * bits, and add the carry onto u_10. Truncate u10 to 11 bits, and add the
+ * carry onto u_11. Truncate u_11 to 10 bits, and add five times the carry
+ * onto u_0. And so on.
*
* The carry is larger than the pieces we're leaving behind. Let c_i be
* the high portion of u_i, to be carried onto u_{i+1}. I claim that c_i <
* carry is bounded above by 636*2^22 + 12785*2^10 < 2557*2^20. Hence, the
* carry out is at most 2557*2^10, as claimed.
*
- * Once we reach u_10 for the second time, we start with u_10 < 2^11. The
- * carry into u_10 is at most 2557*2^10 < 1279*2^11 as calculated above; so
- * the carry out into u_11 is at most 1280. Since u_11 < 2^10 prior to
- * this carry in, all of the u_i are now bounded above by 2^11. The final
- * reduction therefore only needs a conditional subtraction.
+ * Once we reach u_9 for the second time, we start with u_9 < 2^11. The
+ * carry into u_9 is at most 2557*2^10 < 1279*2^11 as calculated above; so
+ * the carry out into u_10 is at most 1280. Since u_10 < 2^11 prior to
+ * this carry in, we now have u_10 < 2^11 + 1280 < 2^12; so the carry out
+ * into u_11 is at most 1. The final reduction therefore only needs a
+ * conditional subtraction.
*/
- { c = u[10] >> 11; u[10] &= M11; }
+ { c = u[9] >> 11; u[9] &= M11; }
+ { u[10] += c; c = u[10] >> 11; u[10] &= M11; }
{ u[11] += c; c = u[11] >> 10; u[11] &= M10; }
{ u[0] += 5*c; c = u[0] >> 11; u[0] &= M11; }
for (i = 1; i < 5; i++) { u[i] += c; c = u[i] >> 11; u[i] &= M11; }
ctx->count++;
}
+static const rsvr_policy pol = { 0, 16, 16 };
+
void poly1305_hash(poly1305_ctx *ctx, const void *p, size_t sz)
{
- const octet *pp = p;
- size_t n;
-
- if (ctx->nbuf) {
- if (sz < 16 - ctx->nbuf) {
- memcpy(ctx->buf + ctx->nbuf, p, sz);
- ctx->nbuf += sz;
- return;
- }
- n = 16 - ctx->nbuf;
- memcpy(ctx->buf + ctx->nbuf, pp, n);
- update_full(ctx, ctx->buf);
- pp += n; sz -= n;
- }
- while (sz >= 16) {
- update_full(ctx, pp);
- pp += 16; sz -= 16;
- }
- if (sz) memcpy(ctx->buf, pp, sz);
- ctx->nbuf = sz;
+ rsvr_state st;
+ const octet *q = p;
+
+ rsvr_setup(&st, &pol, &ctx->buf, &ctx->nbuf, p, sz);
+ RSVR_DO(&st) while ((q = RSVR_NEXT(&st, 16)) != 0) update_full(ctx, q);
}
/* --- @poly1305_flush@ --- *
* far is a whole number of blocks. Flushing is performed
* automatically by @poly1305_done@, but it may be necessary to
* force it by hand when using @poly1305_concat@.
+ * (Alternatively, you might use @poly1305_flushzero@ instead.)
*
* Flushing a partial block has an observable effect on the
* computation: the resulting state is (with high probability)
#endif
mul_r(ctx, ctx->u.P.h, t);
- ctx->count++;
+ ctx->nbuf = 0; ctx->count++;
+}
+
+/* --- @poly1305_flushzero@ --- *
+ *
+ * Arguments: @poly1305_ctx *ctx@ = MAC context to flush
+ *
+ * Returns: ---
+ *
+ * Use: Forces any buffered message data in the context to be
+ * processed, by hashing between zero and fifteen additional
+ * zero bytes. Like @poly1305_flush@, this has no effect if the
+ * the message processed so far is a whole number of blocks.
+ * Unlike @poly1305_flush@, the behaviour if the message is not
+ * a whole number of blocks is equivalent to actually hashing
+ * some extra data.
+ */
+
+void poly1305_flushzero(poly1305_ctx *ctx)
+{
+ if (!ctx->nbuf) return;
+ memset(ctx->buf + ctx->nbuf, 0, 16 - ctx->nbuf);
+ update_full(ctx, ctx->buf);
ctx->nbuf = 0;
}
#define BIT (1ul << (ULONG_BITS - 1))
if (n) {
i = ULONG_BITS;
- while (!(n & BIT)) { n <<= 1; i--; }
+ while (!(n&BIT)) { n <<= 1; i--; }
mul_r(prefix, x, x); n <<= 1; i--;
- while (i--) { sqr(x, x); if (n & BIT) mul_r(prefix, x, x); n <<= 1; }
+ while (i--) { sqr(x, x); if (n&BIT) mul_r(prefix, x, x); n <<= 1; }
}
#undef BIT
mul(x, prefix->u.P.h, x);
/* Convert this mess back into 32-bit words. We lose the top two bits,
* but that's fine.
*/
- h0 = (h0 >> 0) | ((h1 & 0x0000003f) << 26);
- h1 = (h1 >> 6) | ((h2 & 0x00000fff) << 20);
- h2 = (h2 >> 12) | ((h3 & 0x0003ffff) << 14);
- h3 = (h3 >> 18) | ((h4 & 0x00ffffff) << 8);
+ h0 = (h0 >> 0) | ((h1&0x0000003f) << 26);
+ h1 = (h1 >> 6) | ((h2&0x00000fff) << 20);
+ h2 = (h2 >> 12) | ((h3&0x0003ffff) << 14);
+ h3 = (h3 >> 18) | ((h4&0x00ffffff) << 8);
/* All done. */
STORE32_L(p + 0, h0); STORE32_L(p + 4, h1);
#ifdef TEST_RIG
+#include <mLib/macros.h>
#include <mLib/testrig.h>
+#include "ct.h"
+#include "rijndael-ecb.h"
+
static int vrf_hash(dstr v[])
{
poly1305_key k;
if (v[3].len != 16) { fprintf(stderr, "bad tag length\n"); exit(2); }
dstr_ensure(&t, 16); t.len = 16;
+ ct_poison(v[0].buf, v[0].len);
poly1305_keyinit(&k, v[0].buf, v[0].len);
for (i = 0; i < v[2].len; i++) {
for (j = i; j < v[2].len; j++) {
poly1305_hash(&ctx, v[2].buf + i, j - i);
poly1305_hash(&ctx, v[2].buf + j, v[2].len - j);
poly1305_done(&ctx, t.buf);
- if (memcmp(t.buf, v[3].buf, 16) != 0) {
+ ct_remedy(t.buf, t.len);
+ if (MEMCMP(t.buf, !=, v[3].buf, 16)) {
fprintf(stderr, "failed...");
fprintf(stderr, "\n\tkey = "); type_hex.dump(&v[0], stderr);
fprintf(stderr, "\n\tmask = "); type_hex.dump(&v[1], stderr);
poly1305_concat(&ctx, &ctx, &cc[2]);
}
poly1305_done(&ctx, t.buf);
- if (memcmp(t.buf, v[5].buf, 16) != 0) {
+ if (MEMCMP(t.buf, !=, v[5].buf, 16)) {
fprintf(stderr, "failed...");
fprintf(stderr, "\n\tkey = "); type_hex.dump(&v[0], stderr);
fprintf(stderr, "\n\tmask = "); type_hex.dump(&v[1], stderr);
return (ok);
}
+#define MSZMAX 1000
+
+static int vrf_mct(dstr v[])
+{
+ unsigned j, msz;
+ unsigned long i, start_iter, end_iter;
+ rijndael_ecbctx rij;
+ poly1305_key key;
+ poly1305_ctx mac;
+ dstr dk = DSTR_INIT, dr = DSTR_INIT, dn = DSTR_INIT,
+ dt = DSTR_INIT, dm = DSTR_INIT;
+ octet *k, *r, s[16], *n, *t, *m;
+ int ok = 1;
+
+ DENSURE(&dk, 16); k = (octet *)dk.buf; dk.len = 16;
+ DENSURE(&dr, 16); r = (octet *)dr.buf; dr.len = 16;
+ DENSURE(&dn, 16); n = (octet *)dn.buf; dn.len = 16;
+ DENSURE(&dt, 16); t = (octet *)dt.buf; dt.len = 16;
+ DENSURE(&dm, MSZMAX); m = (octet *)dm.buf; dm.len = MSZMAX;
+ memset(m, 0, MSZMAX);
+
+ if (v[0].len != 16) { fprintf(stderr, "AES key len\n"); exit(2); }
+ if (v[1].len != 16) { fprintf(stderr, "poly key len\n"); exit(2); }
+ if (v[2].len != 16) { fprintf(stderr, "nonce len\n"); exit(2); }
+ if (v[3].len != MSZMAX) { fprintf(stderr, "msgbuf len\n"); exit(2); }
+ if (v[6].len != 16) { fprintf(stderr, "result len\n"); exit(2); }
+ memcpy(k, v[0].buf, 16);
+ memcpy(r, v[1].buf, 16);
+ memcpy(n, v[2].buf, 16);
+ memcpy(m, v[3].buf, MSZMAX);
+ start_iter = *(unsigned long *)v[4].buf;
+ end_iter = *(unsigned long *)v[5].buf;
+ if (end_iter < start_iter) { fprintf(stderr, "iter bounds\n"); exit(2); }
+
+ rijndael_ecbinit(&rij, k, 16, 0);
+ poly1305_keyinit(&key, r, 16);
+ for (i = start_iter; i < end_iter; i++) {
+ msz = 0;
+ for (;;) {
+ rijndael_ecbencrypt(&rij, n, s, 16);
+ poly1305_macinit(&mac, &key, s);
+ poly1305_hash(&mac, m, msz);
+ poly1305_done(&mac, t);
+ if (msz >= MSZMAX) break;
+ n[0] ^= i&0xff;
+ for (j = 0; j < 16; j++) n[j] ^= t[j];
+ if (msz%2) {
+ for (j = 0; j < 16; j++) k[j] ^= t[j];
+ rijndael_ecbinit(&rij, k, 16, 0);
+ }
+ if (msz%3) {
+ for (j = 0; j < 16; j++) r[j] ^= t[j];
+ poly1305_keyinit(&key, r, 16);
+ }
+ m[msz++] ^= t[0];
+ }
+ }
+
+ if (MEMCMP(t, !=, v[6].buf, 16)) {
+ ok = 0;
+ fprintf(stderr, "failed...");
+ fprintf(stderr, "\n\tinitial k = "); type_hex.dump(&v[0], stderr);
+ fprintf(stderr, "\n\tinitial r = "); type_hex.dump(&v[1], stderr);
+ fprintf(stderr, "\n\tinitial n = "); type_hex.dump(&v[2], stderr);
+ fprintf(stderr, "\n\tinitial m = "); type_hex.dump(&v[3], stderr);
+ fprintf(stderr, "\n\tstart iter = %lu", start_iter);
+ fprintf(stderr, "\n\tend iter = %lu", end_iter);
+ fprintf(stderr, "\n\tfinal k = "); type_hex.dump(&dk, stderr);
+ fprintf(stderr, "\n\tfinal r = "); type_hex.dump(&dr, stderr);
+ fprintf(stderr, "\n\tfinal n = "); type_hex.dump(&dn, stderr);
+ fprintf(stderr, "\n\tfinal m = "); type_hex.dump(&dm, stderr);
+ fprintf(stderr, "\n\texpected = "); type_hex.dump(&v[6], stderr);
+ fprintf(stderr, "\n\tcalculated = "); type_hex.dump(&dt, stderr);
+ fputc('\n', stderr);
+ }
+
+ dstr_destroy(&dk);
+ dstr_destroy(&dr);
+ dstr_destroy(&dn);
+ dstr_destroy(&dt);
+ dstr_destroy(&dm);
+ return (ok);
+}
+
static const struct test_chunk tests[] = {
{ "poly1305-hash", vrf_hash,
{ &type_hex, &type_hex, &type_hex, &type_hex } },
{ "poly1305-cat", vrf_cat,
{ &type_hex, &type_hex, &type_hex, &type_hex, &type_hex, &type_hex } },
+ { "poly1305-mct", vrf_mct,
+ { &type_hex, &type_hex, &type_hex, &type_hex,
+ &type_ulong, &type_ulong, &type_hex } },
{ 0, 0, { 0 } }
};