#define P p26
/* Convert 32-bit words into field-element pieces. */
-#define P26W0(x) ((x##0)&0x03ffffff)
-#define P26W1(x) ((((x##1)&0x000fffff) << 6) | (((x##0) >> 26)&0x0000003f))
-#define P26W2(x) ((((x##2)&0x00003fff) << 12) | (((x##1) >> 20)&0x00000fff))
-#define P26W3(x) ((((x##3)&0x000000ff) << 18) | (((x##2) >> 14)&0x0003ffff))
+#define P26W0(x) (((x##0) << 0)&0x03ffffff)
+#define P26W1(x) ((((x##1) << 6)&0x03ffffc0) | (((x##0) >> 26)&0x0000003f))
+#define P26W2(x) ((((x##2) << 12)&0x03ffffff) | (((x##1) >> 20)&0x00000fff))
+#define P26W3(x) ((((x##3) << 18)&0x03fc0000) | (((x##2) >> 14)&0x0003ffff))
#define P26W4(x) (((x##3) >> 8)&0x00ffffff)
/* Propagate carries in parallel. If 0 <= u_i < 2^26 c_i, then we shall have
uint32 c;
/* Do sequential carry propagation (16-bit CPUs are less likely to benefit
- * from instruction-level parallelism). Start at u_10; truncate it to 11
- * bits, and add the carry onto u_11. Truncate u_11 to 10 bits, and add
- * five times the carry onto u_0. And so on.
+ * from instruction-level parallelism). Start at u_9; truncate it to 11
+ * bits, and add the carry onto u_10. Truncate u10 to 11 bits, and add the
+ * carry onto u_11. Truncate u_11 to 10 bits, and add five times the carry
+ * onto u_0. And so on.
*
* The carry is larger than the pieces we're leaving behind. Let c_i be
* the high portion of u_i, to be carried onto u_{i+1}. I claim that c_i <
* carry is bounded above by 636*2^22 + 12785*2^10 < 2557*2^20. Hence, the
* carry out is at most 2557*2^10, as claimed.
*
- * Once we reach u_10 for the second time, we start with u_10 < 2^11. The
- * carry into u_10 is at most 2557*2^10 < 1279*2^11 as calculated above; so
- * the carry out into u_11 is at most 1280. Since u_11 < 2^10 prior to
- * this carry in, all of the u_i are now bounded above by 2^11. The final
- * reduction therefore only needs a conditional subtraction.
+ * Once we reach u_9 for the second time, we start with u_9 < 2^11. The
+ * carry into u_9 is at most 2557*2^10 < 1279*2^11 as calculated above; so
+ * the carry out into u_10 is at most 1280. Since u_10 < 2^11 prior to
+ * this carry in, we now have u_10 < 2^11 + 1280 < 2^12; so the carry out
+ * into u_11 is at most 1. The final reduction therefore only needs a
+ * conditional subtraction.
*/
- { c = u[10] >> 11; u[10] &= M11; }
+ { c = u[9] >> 11; u[9] &= M11; }
+ { u[10] += c; c = u[10] >> 11; u[10] &= M11; }
{ u[11] += c; c = u[11] >> 10; u[11] &= M10; }
{ u[0] += 5*c; c = u[0] >> 11; u[0] &= M11; }
for (i = 1; i < 5; i++) { u[i] += c; c = u[i] >> 11; u[i] &= M11; }
* far is a whole number of blocks. Flushing is performed
* automatically by @poly1305_done@, but it may be necessary to
* force it by hand when using @poly1305_concat@.
+ * (Alternatively, you might use @poly1305_flushzero@ instead.)
*
* Flushing a partial block has an observable effect on the
* computation: the resulting state is (with high probability)
#endif
mul_r(ctx, ctx->u.P.h, t);
- ctx->count++;
+ ctx->nbuf = 0; ctx->count++;
+}
+
+/* --- @poly1305_flushzero@ --- *
+ *
+ * Arguments: @poly1305_ctx *ctx@ = MAC context to flush
+ *
+ * Returns: ---
+ *
+ * Use: Forces any buffered message data in the context to be
+ * processed, by hashing between zero and fifteen additional
+ * zero bytes. Like @poly1305_flush@, this has no effect if the
+ * the message processed so far is a whole number of blocks.
+ * Unlike @poly1305_flush@, the behaviour if the message is not
+ * a whole number of blocks is equivalent to actually hashing
+ * some extra data.
+ */
+
+void poly1305_flushzero(poly1305_ctx *ctx)
+{
+ if (!ctx->nbuf) return;
+ memset(ctx->buf + ctx->nbuf, 0, 16 - ctx->nbuf);
+ update_full(ctx, ctx->buf);
ctx->nbuf = 0;
}
#include <mLib/testrig.h>
+#include "ct.h"
+#include "rijndael-ecb.h"
+
static int vrf_hash(dstr v[])
{
poly1305_key k;
if (v[3].len != 16) { fprintf(stderr, "bad tag length\n"); exit(2); }
dstr_ensure(&t, 16); t.len = 16;
+ ct_poison(v[0].buf, v[0].len);
poly1305_keyinit(&k, v[0].buf, v[0].len);
for (i = 0; i < v[2].len; i++) {
for (j = i; j < v[2].len; j++) {
poly1305_hash(&ctx, v[2].buf + i, j - i);
poly1305_hash(&ctx, v[2].buf + j, v[2].len - j);
poly1305_done(&ctx, t.buf);
+ ct_remedy(t.buf, t.len);
if (memcmp(t.buf, v[3].buf, 16) != 0) {
fprintf(stderr, "failed...");
fprintf(stderr, "\n\tkey = "); type_hex.dump(&v[0], stderr);
return (ok);
}
+#define MSZMAX 1000
+
+static int vrf_mct(dstr v[])
+{
+ unsigned j, msz;
+ unsigned long i, niter;
+ rijndael_ecbctx rij;
+ poly1305_key key;
+ poly1305_ctx mac;
+ dstr d = DSTR_INIT;
+ octet k[16], r[16], n[16], s[16], *t, m[MSZMAX] = { 0 };
+ int ok = 1;
+
+ if (v[0].len != sizeof(k)) { fprintf(stderr, "AES key len\n"); exit(2); }
+ if (v[1].len != sizeof(r)) { fprintf(stderr, "poly key len\n"); exit(2); }
+ if (v[2].len != sizeof(n)) { fprintf(stderr, "nonce len\n"); exit(2); }
+ if (v[4].len != sizeof(n)) { fprintf(stderr, "result len\n"); exit(2); }
+ memcpy(k, v[0].buf, sizeof(k));
+ memcpy(r, v[1].buf, sizeof(k));
+ memcpy(n, v[2].buf, sizeof(k));
+ niter = *(unsigned long *)v[3].buf;
+ dstr_ensure(&d, 16); d.len = 16; t = (octet *)d.buf;
+
+ rijndael_ecbinit(&rij, k, sizeof(k), 0);
+ poly1305_keyinit(&key, r, sizeof(r));
+ for (i = 0; i < niter; i++) {
+ msz = 0;
+ for (;;) {
+ rijndael_ecbencrypt(&rij, n, s, 16);
+ poly1305_macinit(&mac, &key, s);
+ poly1305_hash(&mac, m, msz);
+ poly1305_done(&mac, t);
+ if (msz >= MSZMAX) break;
+ n[0] ^= i&0xff;
+ for (j = 0; j < 16; j++) n[j] ^= t[j];
+ if (msz%2) {
+ for (j = 0; j < 16; j++) k[j] ^= t[j];
+ rijndael_ecbinit(&rij, k, sizeof(k), 0);
+ }
+ if (msz%3) {
+ for (j = 0; j < 16; j++) r[j] ^= t[j];
+ poly1305_keyinit(&key, r, sizeof(r));
+ }
+ m[msz++] ^= t[0];
+ }
+ }
+
+ if (memcmp(t, v[4].buf, 16) != 0) {
+ ok = 0;
+ fprintf(stderr, "failed...");
+ fprintf(stderr, "\n\tinitial k = "); type_hex.dump(&v[0], stderr);
+ fprintf(stderr, "\n\tinitial r = "); type_hex.dump(&v[1], stderr);
+ fprintf(stderr, "\n\tinitial n = "); type_hex.dump(&v[2], stderr);
+ fprintf(stderr, "\n\titerations = %lu", niter);
+ fprintf(stderr, "\n\texpected = "); type_hex.dump(&v[4], stderr);
+ fprintf(stderr, "\n\tcalculated = "); type_hex.dump(&d, stderr);
+ fputc('\n', stderr);
+ }
+
+ dstr_destroy(&d);
+ return (ok);
+}
+
static const struct test_chunk tests[] = {
{ "poly1305-hash", vrf_hash,
{ &type_hex, &type_hex, &type_hex, &type_hex } },
{ "poly1305-cat", vrf_cat,
{ &type_hex, &type_hex, &type_hex, &type_hex, &type_hex, &type_hex } },
+ { "poly1305-mct", vrf_mct,
+ { &type_hex, &type_hex, &type_hex, &type_ulong, &type_hex } },
{ 0, 0, { 0 } }
};