X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/ac082cc94574e921813a11a50e60f0557af04c7c..4741bd9fa1f6dfc9f6482d6e1335ad046dbd6a78:/symm/poly1305.c

diff --git a/symm/poly1305.c b/symm/poly1305.c
index 9cc97eab..9deaf19f 100644
--- a/symm/poly1305.c
+++ b/symm/poly1305.c
@@ -33,6 +33,7 @@
 #include <string.h>
 
 #include "poly1305.h"
+#include "rsvr.h"
 
 /*----- Global variables --------------------------------------------------*/
 
@@ -60,11 +61,11 @@ typedef uint32 felt[5];
 #define P p26
 
 /* Convert 32-bit words into field-element pieces. */
-#define P26W0(x)   ((x##0)&0x03ffffff)
-#define P26W1(x) ((((x##1)&0x000fffff) <<  6) | (((x##0) >> 26)&0x0000003f))
-#define P26W2(x) ((((x##2)&0x00003fff) << 12) | (((x##1) >> 20)&0x00000fff))
-#define P26W3(x) ((((x##3)&0x000000ff) << 18) | (((x##2) >> 14)&0x0003ffff))
-#define P26W4(x)				 (((x##3) >>  8)&0x00ffffff)
+#define P26W0(x)  (((x##0) <<  0)&0x03ffffff)
+#define P26W1(x) ((((x##1) <<  6)&0x03ffffc0) | (((x##0) >> 26)&0x0000003f))
+#define P26W2(x) ((((x##2) << 12)&0x03ffffff) | (((x##1) >> 20)&0x00000fff))
+#define P26W3(x) ((((x##3) << 18)&0x03fc0000) | (((x##2) >> 14)&0x0003ffff))
+#define P26W4(x)				(((x##3) >>  8)&0x00ffffff)
 
 /* Propagate carries in parallel.  If 0 <= u_i < 2^26 c_i, then we shall have
  * 0 <= v_0 < 2^26 + 5 c_4, and 0 <= v_i < 2^26 + c_{i-1} for 1 <= i < 5.
@@ -183,7 +184,7 @@ static void mul_r(const poly1305_ctx *ctx, felt z, const felt x)
 
 #endif
 
-/*----- Low-level implementation for 32/64-bit targets --------------------*/
+/*----- Low-level implementation for 16/32-bit targets --------------------*/
 
 #ifndef POLY1305_IMPL
 #  define POLY1305_IMPL 11
@@ -230,9 +231,10 @@ static void carry_reduce(uint32 u[12])
   uint32 c;
 
   /* Do sequential carry propagation (16-bit CPUs are less likely to benefit
-   * from instruction-level parallelism).  Start at u_10; truncate it to 11
-   * bits, and add the carry onto u_11.  Truncate u_11 to 10 bits, and add
-   * five times the carry onto u_0.  And so on.
+   * from instruction-level parallelism).  Start at u_9; truncate it to 11
+   * bits, and add the carry onto u_10.  Truncate u10 to 11 bits, and add the
+   * carry onto u_11.  Truncate u_11 to 10 bits, and add five times the carry
+   * onto u_0.  And so on.
    *
    * The carry is larger than the pieces we're leaving behind.  Let c_i be
    * the high portion of u_i, to be carried onto u_{i+1}.  I claim that c_i <
@@ -241,13 +243,15 @@ static void carry_reduce(uint32 u[12])
    * carry is bounded above by 636*2^22 + 12785*2^10 < 2557*2^20.  Hence, the
    * carry out is at most 2557*2^10, as claimed.
    *
-   * Once we reach u_10 for the second time, we start with u_10 < 2^11.  The
-   * carry into u_10 is at most 2557*2^10 < 1279*2^11 as calculated above; so
-   * the carry out into u_11 is at most 1280.  Since u_11 < 2^10 prior to
-   * this carry in, all of the u_i are now bounded above by 2^11.  The final
-   * reduction therefore only needs a conditional subtraction.
+   * Once we reach u_9 for the second time, we start with u_9 < 2^11.  The
+   * carry into u_9 is at most 2557*2^10 < 1279*2^11 as calculated above; so
+   * the carry out into u_10 is at most 1280.  Since u_10 < 2^11 prior to
+   * this carry in, we now have u_10 < 2^11 + 1280 < 2^12; so the carry out
+   * into u_11 is at most 1.  The final reduction therefore only needs a
+   * conditional subtraction.
    */
-			   {		   c = u[10] >> 11; u[10] &= M11; }
+			   {		   c =  u[9] >> 11;  u[9] &= M11; }
+			   { u[10] +=   c; c = u[10] >> 11; u[10] &= M11; }
 			   { u[11] +=   c; c = u[11] >> 10; u[11] &= M10; }
 			   {  u[0] += 5*c; c =  u[0] >> 11;  u[0] &= M11; }
   for (i = 1; i <  5; i++) {  u[i] +=   c; c =  u[i] >> 11;  u[i] &= M11; }
@@ -533,28 +537,15 @@ static void update_full(poly1305_ctx *ctx, const octet *p)
   ctx->count++;
 }
 
+static const rsvr_policy pol = { 0, 16, 16 };
+
 void poly1305_hash(poly1305_ctx *ctx, const void *p, size_t sz)
 {
-  const octet *pp = p;
-  size_t n;
-
-  if (ctx->nbuf) {
-    if (sz < 16 - ctx->nbuf) {
-      memcpy(ctx->buf + ctx->nbuf, p, sz);
-      ctx->nbuf += sz;
-      return;
-    }
-    n = 16 - ctx->nbuf;
-    memcpy(ctx->buf + ctx->nbuf, pp, n);
-    update_full(ctx, ctx->buf);
-    pp += n; sz -= n;
-  }
-  while (sz >= 16) {
-    update_full(ctx, pp);
-    pp += 16; sz -= 16;
-  }
-  if (sz) memcpy(ctx->buf, pp, sz);
-  ctx->nbuf = sz;
+  rsvr_state st;
+  const octet *q = p;
+
+  rsvr_setup(&st, &pol, &ctx->buf, &ctx->nbuf, p, sz);
+  RSVR_DO(&st) while ((q = RSVR_NEXT(&st, 16)) != 0) update_full(ctx, q);
 }
 
 /* --- @poly1305_flush@ --- *
@@ -568,6 +559,7 @@ void poly1305_hash(poly1305_ctx *ctx, const void *p, size_t sz)
  *		far is a whole number of blocks.  Flushing is performed
  *		automatically by @poly1305_done@, but it may be necessary to
  *		force it by hand when using @poly1305_concat@.
+ *		(Alternatively, you might use @poly1305_flushzero@ instead.)
  *
  *		Flushing a partial block has an observable effect on the
  *		computation: the resulting state is (with high probability)
@@ -601,7 +593,29 @@ void poly1305_flush(poly1305_ctx *ctx)
 #endif
 
   mul_r(ctx, ctx->u.P.h, t);
-  ctx->count++;
+  ctx->nbuf = 0; ctx->count++;
+}
+
+/* --- @poly1305_flushzero@ --- *
+ *
+ * Arguments:	@poly1305_ctx *ctx@ = MAC context to flush
+ *
+ * Returns:	---
+ *
+ * Use:		Forces any buffered message data in the context to be
+ *		processed, by hashing between zero and fifteen additional
+ *		zero bytes.  Like @poly1305_flush@, this has no effect if the
+ *		the message processed so far is a whole number of blocks.
+ *		Unlike @poly1305_flush@, the behaviour if the message is not
+ *		a whole number of blocks is equivalent to actually hashing
+ *		some extra data.
+ */
+
+void poly1305_flushzero(poly1305_ctx *ctx)
+{
+  if (!ctx->nbuf) return;
+  memset(ctx->buf + ctx->nbuf, 0, 16 - ctx->nbuf);
+  update_full(ctx, ctx->buf);
   ctx->nbuf = 0;
 }
 
@@ -847,8 +861,12 @@ void poly1305_done(poly1305_ctx *ctx, void *h)
 
 #ifdef TEST_RIG
 
+#include <mLib/macros.h>
 #include <mLib/testrig.h>
 
+#include "ct.h"
+#include "rijndael-ecb.h"
+
 static int vrf_hash(dstr v[])
 {
   poly1305_key k;
@@ -861,6 +879,7 @@ static int vrf_hash(dstr v[])
   if (v[3].len != 16) { fprintf(stderr, "bad tag length\n"); exit(2); }
   dstr_ensure(&t, 16); t.len = 16;
 
+  ct_poison(v[0].buf, v[0].len);
   poly1305_keyinit(&k, v[0].buf, v[0].len);
   for (i = 0; i < v[2].len; i++) {
     for (j = i; j < v[2].len; j++) {
@@ -869,7 +888,8 @@ static int vrf_hash(dstr v[])
       poly1305_hash(&ctx, v[2].buf + i, j - i);
       poly1305_hash(&ctx, v[2].buf + j, v[2].len - j);
       poly1305_done(&ctx, t.buf);
-      if (memcmp(t.buf, v[3].buf, 16) != 0) {
+      ct_remedy(t.buf, t.len);
+      if (MEMCMP(t.buf, !=, v[3].buf, 16)) {
 	fprintf(stderr, "failed...");
 	fprintf(stderr, "\n\tkey    = "); type_hex.dump(&v[0], stderr);
 	fprintf(stderr, "\n\tmask   = "); type_hex.dump(&v[1], stderr);
@@ -913,7 +933,7 @@ static int vrf_cat(dstr v[])
       poly1305_concat(&ctx, &ctx, &cc[2]);
     }
     poly1305_done(&ctx, t.buf);
-    if (memcmp(t.buf, v[5].buf, 16) != 0) {
+    if (MEMCMP(t.buf, !=, v[5].buf, 16)) {
       fprintf(stderr, "failed...");
       fprintf(stderr, "\n\tkey    = "); type_hex.dump(&v[0], stderr);
       fprintf(stderr, "\n\tmask   = "); type_hex.dump(&v[1], stderr);
@@ -931,11 +951,76 @@ static int vrf_cat(dstr v[])
   return (ok);
 }
 
+#define MSZMAX 1000
+
+static int vrf_mct(dstr v[])
+{
+  unsigned j, msz;
+  unsigned long i, niter;
+  rijndael_ecbctx rij;
+  poly1305_key key;
+  poly1305_ctx mac;
+  dstr d = DSTR_INIT;
+  octet k[16], r[16], n[16], s[16], *t, m[MSZMAX] = { 0 };
+  int ok = 1;
+
+  if (v[0].len != 16) { fprintf(stderr, "AES key len\n"); exit(2); }
+  if (v[1].len != 16) { fprintf(stderr, "poly key len\n"); exit(2); }
+  if (v[2].len != 16) { fprintf(stderr, "nonce len\n"); exit(2); }
+  if (v[4].len != 16) { fprintf(stderr, "result len\n"); exit(2); }
+  memcpy(k, v[0].buf, 16);
+  memcpy(r, v[1].buf, 16);
+  memcpy(n, v[2].buf, 16);
+  niter = *(unsigned long *)v[3].buf;
+  dstr_ensure(&d, 16); d.len = 16; t = (octet *)d.buf;
+
+  rijndael_ecbinit(&rij, k, 16, 0);
+  poly1305_keyinit(&key, r, 16);
+  for (i = 0; i < niter; i++) {
+    msz = 0;
+    for (;;) {
+      rijndael_ecbencrypt(&rij, n, s, 16);
+      poly1305_macinit(&mac, &key, s);
+      poly1305_hash(&mac, m, msz);
+      poly1305_done(&mac, t);
+      if (msz >= MSZMAX) break;
+      n[0] ^= i&0xff;
+      for (j = 0; j < 16; j++) n[j] ^= t[j];
+      if (msz%2) {
+	for (j = 0; j < 16; j++) k[j] ^= t[j];
+	rijndael_ecbinit(&rij, k, 16, 0);
+      }
+      if (msz%3) {
+	for (j = 0; j < 16; j++) r[j] ^= t[j];
+	poly1305_keyinit(&key, r, 16);
+      }
+      m[msz++] ^= t[0];
+    }
+  }
+
+  if (MEMCMP(t, !=, v[4].buf, 16)) {
+    ok = 0;
+    fprintf(stderr, "failed...");
+    fprintf(stderr, "\n\tinitial k = "); type_hex.dump(&v[0], stderr);
+    fprintf(stderr, "\n\tinitial r = "); type_hex.dump(&v[1], stderr);
+    fprintf(stderr, "\n\tinitial n = "); type_hex.dump(&v[2], stderr);
+    fprintf(stderr, "\n\titerations = %lu", niter);
+    fprintf(stderr, "\n\texpected = "); type_hex.dump(&v[4], stderr);
+    fprintf(stderr, "\n\tcalculated = "); type_hex.dump(&d, stderr);
+    fputc('\n', stderr);
+  }
+
+  dstr_destroy(&d);
+  return (ok);
+}
+
 static const struct test_chunk tests[] = {
   { "poly1305-hash", vrf_hash,
     { &type_hex, &type_hex, &type_hex, &type_hex } },
   { "poly1305-cat", vrf_cat,
     { &type_hex, &type_hex, &type_hex, &type_hex, &type_hex, &type_hex } },
+  { "poly1305-mct", vrf_mct,
+    { &type_hex, &type_hex, &type_hex, &type_ulong, &type_hex } },
   { 0, 0, { 0 } }
 };