base/asm-common.h: Sink the `PIC' stuff a little.

[catacomb] / symm / poly1305.c
diff --git a/symm/poly1305.c b/symm/poly1305.c

index 9cc97ea..39c832f 100644 (file)
--- a/symm/poly1305.c
+++ b/symm/poly1305.c
@@ -33,6 +33,7 @@
  #include <string.h>
  
  #include "poly1305.h"
+#include "rsvr.h"
  
  /*----- Global variables --------------------------------------------------*/
  
@@ -60,11 +61,11 @@ typedef uint32 felt[5];
  #define P p26
  
  /* Convert 32-bit words into field-element pieces. */
-#define P26W0(x)   ((x##0)&0x03ffffff)
-#define P26W1(x) ((((x##1)&0x000fffff) <<  6) | (((x##0) >> 26)&0x0000003f))
-#define P26W2(x) ((((x##2)&0x00003fff) << 12) | (((x##1) >> 20)&0x00000fff))
-#define P26W3(x) ((((x##3)&0x000000ff) << 18) | (((x##2) >> 14)&0x0003ffff))
-#define P26W4(x)                                (((x##3) >>  8)&0x00ffffff)
+#define P26W0(x)  (((x##0) <<  0)&0x03ffffff)
+#define P26W1(x) ((((x##1) <<  6)&0x03ffffc0) | (((x##0) >> 26)&0x0000003f))
+#define P26W2(x) ((((x##2) << 12)&0x03ffffff) | (((x##1) >> 20)&0x00000fff))
+#define P26W3(x) ((((x##3) << 18)&0x03fc0000) | (((x##2) >> 14)&0x0003ffff))
+#define P26W4(x)                               (((x##3) >>  8)&0x00ffffff)
  
  /* Propagate carries in parallel.  If 0 <= u_i < 2^26 c_i, then we shall have
   * 0 <= v_0 < 2^26 + 5 c_4, and 0 <= v_i < 2^26 + c_{i-1} for 1 <= i < 5.
@@ -183,7 +184,7 @@ static void mul_r(const poly1305_ctx *ctx, felt z, const felt x)
  
  #endif
  
-/*----- Low-level implementation for 32/64-bit targets --------------------*/
+/*----- Low-level implementation for 16/32-bit targets --------------------*/
  
  #ifndef POLY1305_IMPL
  #  define POLY1305_IMPL 11
@@ -230,9 +231,10 @@ static void carry_reduce(uint32 u[12])
    uint32 c;
  
    /* Do sequential carry propagation (16-bit CPUs are less likely to benefit
-   * from instruction-level parallelism).  Start at u_10; truncate it to 11
-   * bits, and add the carry onto u_11.  Truncate u_11 to 10 bits, and add
-   * five times the carry onto u_0.  And so on.
+   * from instruction-level parallelism).  Start at u_9; truncate it to 11
+   * bits, and add the carry onto u_10.  Truncate u10 to 11 bits, and add the
+   * carry onto u_11.  Truncate u_11 to 10 bits, and add five times the carry
+   * onto u_0.  And so on.
     *
     * The carry is larger than the pieces we're leaving behind.  Let c_i be
     * the high portion of u_i, to be carried onto u_{i+1}.  I claim that c_i <
@@ -241,13 +243,15 @@ static void carry_reduce(uint32 u[12])
     * carry is bounded above by 636*2^22 + 12785*2^10 < 2557*2^20.  Hence, the
     * carry out is at most 2557*2^10, as claimed.
     *
-   * Once we reach u_10 for the second time, we start with u_10 < 2^11.  The
-   * carry into u_10 is at most 2557*2^10 < 1279*2^11 as calculated above; so
-   * the carry out into u_11 is at most 1280.  Since u_11 < 2^10 prior to
-   * this carry in, all of the u_i are now bounded above by 2^11.  The final
-   * reduction therefore only needs a conditional subtraction.
+   * Once we reach u_9 for the second time, we start with u_9 < 2^11.  The
+   * carry into u_9 is at most 2557*2^10 < 1279*2^11 as calculated above; so
+   * the carry out into u_10 is at most 1280.  Since u_10 < 2^11 prior to
+   * this carry in, we now have u_10 < 2^11 + 1280 < 2^12; so the carry out
+   * into u_11 is at most 1.  The final reduction therefore only needs a
+   * conditional subtraction.
     */
-                          {               c = u[10] >> 11; u[10] &= M11; }
+                          {               c =  u[9] >> 11;  u[9] &= M11; }
+                          { u[10] +=   c; c = u[10] >> 11; u[10] &= M11; }
                            { u[11] +=   c; c = u[11] >> 10; u[11] &= M10; }
                            {  u[0] += 5*c; c =  u[0] >> 11;  u[0] &= M11; }
    for (i = 1; i <  5; i++) {  u[i] +=   c; c =  u[i] >> 11;  u[i] &= M11; }
@@ -533,28 +537,15 @@ static void update_full(poly1305_ctx *ctx, const octet *p)
    ctx->count++;
  }
  
+static const rsvr_policy pol = { 0, 16, 16 };
+
  void poly1305_hash(poly1305_ctx *ctx, const void *p, size_t sz)
  {
-  const octet *pp = p;
-  size_t n;
-
-  if (ctx->nbuf) {
-    if (sz < 16 - ctx->nbuf) {
-      memcpy(ctx->buf + ctx->nbuf, p, sz);
-      ctx->nbuf += sz;
-      return;
-    }
-    n = 16 - ctx->nbuf;
-    memcpy(ctx->buf + ctx->nbuf, pp, n);
-    update_full(ctx, ctx->buf);
-    pp += n; sz -= n;
-  }
-  while (sz >= 16) {
-    update_full(ctx, pp);
-    pp += 16; sz -= 16;
-  }
-  if (sz) memcpy(ctx->buf, pp, sz);
-  ctx->nbuf = sz;
+  rsvr_state st;
+  const octet *q = p;
+
+  rsvr_setup(&st, &pol, &ctx->buf, &ctx->nbuf, p, sz);
+  RSVR_DO(&st) while ((q = RSVR_NEXT(&st, 16)) != 0) update_full(ctx, q);
  }
  
  /* --- @poly1305_flush@ --- *
@@ -568,6 +559,7 @@ void poly1305_hash(poly1305_ctx *ctx, const void *p, size_t sz)
   *             far is a whole number of blocks.  Flushing is performed
   *             automatically by @poly1305_done@, but it may be necessary to
   *             force it by hand when using @poly1305_concat@.
+ *             (Alternatively, you might use @poly1305_flushzero@ instead.)
   *
   *             Flushing a partial block has an observable effect on the
   *             computation: the resulting state is (with high probability)
@@ -601,7 +593,29 @@ void poly1305_flush(poly1305_ctx *ctx)
  #endif
  
    mul_r(ctx, ctx->u.P.h, t);
-  ctx->count++;
+  ctx->nbuf = 0; ctx->count++;
+}
+
+/* --- @poly1305_flushzero@ --- *
+ *
+ * Arguments:  @poly1305_ctx *ctx@ = MAC context to flush
+ *
+ * Returns:    ---
+ *
+ * Use:                Forces any buffered message data in the context to be
+ *             processed, by hashing between zero and fifteen additional
+ *             zero bytes.  Like @poly1305_flush@, this has no effect if the
+ *             the message processed so far is a whole number of blocks.
+ *             Unlike @poly1305_flush@, the behaviour if the message is not
+ *             a whole number of blocks is equivalent to actually hashing
+ *             some extra data.
+ */
+
+void poly1305_flushzero(poly1305_ctx *ctx)
+{
+  if (!ctx->nbuf) return;
+  memset(ctx->buf + ctx->nbuf, 0, 16 - ctx->nbuf);
+  update_full(ctx, ctx->buf);
    ctx->nbuf = 0;
  }
  
@@ -665,9 +679,9 @@ void poly1305_concat(poly1305_ctx *ctx,
  #define BIT (1ul << (ULONG_BITS - 1))
    if (n) {
      i = ULONG_BITS;
-    while (!(n & BIT)) { n <<= 1; i--; }
+    while (!(n&BIT)) { n <<= 1; i--; }
      mul_r(prefix, x, x); n <<= 1; i--;
-    while (i--) { sqr(x, x); if (n & BIT) mul_r(prefix, x, x); n <<= 1; }
+    while (i--) { sqr(x, x); if (n&BIT) mul_r(prefix, x, x); n <<= 1; }
    }
  #undef BIT
    mul(x, prefix->u.P.h, x);
@@ -775,10 +789,10 @@ void poly1305_done(poly1305_ctx *ctx, void *h)
    /* Convert this mess back into 32-bit words.  We lose the top two bits,
     * but that's fine.
     */
-  h0 = (h0 >>  0) | ((h1 & 0x0000003f) << 26);
-  h1 = (h1 >>  6) | ((h2 & 0x00000fff) << 20);
-  h2 = (h2 >> 12) | ((h3 & 0x0003ffff) << 14);
-  h3 = (h3 >> 18) | ((h4 & 0x00ffffff) <<  8);
+  h0 = (h0 >>  0) | ((h1&0x0000003f) << 26);
+  h1 = (h1 >>  6) | ((h2&0x00000fff) << 20);
+  h2 = (h2 >> 12) | ((h3&0x0003ffff) << 14);
+  h3 = (h3 >> 18) | ((h4&0x00ffffff) <<  8);
  
    /* All done. */
    STORE32_L(p +  0, h0); STORE32_L(p +  4, h1);
@@ -847,8 +861,12 @@ void poly1305_done(poly1305_ctx *ctx, void *h)
  
  #ifdef TEST_RIG
  
+#include <mLib/macros.h>
  #include <mLib/testrig.h>
  
+#include "ct.h"
+#include "rijndael-ecb.h"
+
  static int vrf_hash(dstr v[])
  {
    poly1305_key k;
@@ -861,6 +879,7 @@ static int vrf_hash(dstr v[])
    if (v[3].len != 16) { fprintf(stderr, "bad tag length\n"); exit(2); }
    dstr_ensure(&t, 16); t.len = 16;
  
+  ct_poison(v[0].buf, v[0].len);
    poly1305_keyinit(&k, v[0].buf, v[0].len);
    for (i = 0; i < v[2].len; i++) {
      for (j = i; j < v[2].len; j++) {
@@ -869,7 +888,8 @@ static int vrf_hash(dstr v[])
        poly1305_hash(&ctx, v[2].buf + i, j - i);
        poly1305_hash(&ctx, v[2].buf + j, v[2].len - j);
        poly1305_done(&ctx, t.buf);
-      if (memcmp(t.buf, v[3].buf, 16) != 0) {
+      ct_remedy(t.buf, t.len);
+      if (MEMCMP(t.buf, !=, v[3].buf, 16)) {
         fprintf(stderr, "failed...");
         fprintf(stderr, "\n\tkey    = "); type_hex.dump(&v[0], stderr);
         fprintf(stderr, "\n\tmask   = "); type_hex.dump(&v[1], stderr);
@@ -913,7 +933,7 @@ static int vrf_cat(dstr v[])
        poly1305_concat(&ctx, &ctx, &cc[2]);
      }
      poly1305_done(&ctx, t.buf);
-    if (memcmp(t.buf, v[5].buf, 16) != 0) {
+    if (MEMCMP(t.buf, !=, v[5].buf, 16)) {
        fprintf(stderr, "failed...");
        fprintf(stderr, "\n\tkey    = "); type_hex.dump(&v[0], stderr);
        fprintf(stderr, "\n\tmask   = "); type_hex.dump(&v[1], stderr);
@@ -931,11 +951,98 @@ static int vrf_cat(dstr v[])
    return (ok);
  }
  
+#define MSZMAX 1000
+
+static int vrf_mct(dstr v[])
+{
+  unsigned j, msz;
+  unsigned long i, start_iter, end_iter;
+  rijndael_ecbctx rij;
+  poly1305_key key;
+  poly1305_ctx mac;
+  dstr dk = DSTR_INIT, dr = DSTR_INIT, dn = DSTR_INIT,
+    dt = DSTR_INIT, dm = DSTR_INIT;
+  octet *k, *r, s[16], *n, *t, *m;
+  int ok = 1;
+
+  DENSURE(&dk, 16);     k = (octet *)dk.buf; dk.len = 16;
+  DENSURE(&dr, 16);     r = (octet *)dr.buf; dr.len = 16;
+  DENSURE(&dn, 16);     n = (octet *)dn.buf; dn.len = 16;
+  DENSURE(&dt, 16);     t = (octet *)dt.buf; dt.len = 16;
+  DENSURE(&dm, MSZMAX); m = (octet *)dm.buf; dm.len = MSZMAX;
+  memset(m, 0, MSZMAX);
+
+  if (v[0].len != 16)     { fprintf(stderr, "AES key len\n"); exit(2); }
+  if (v[1].len != 16)     { fprintf(stderr, "poly key len\n"); exit(2); }
+  if (v[2].len != 16)     { fprintf(stderr, "nonce len\n"); exit(2); }
+  if (v[3].len != MSZMAX) { fprintf(stderr, "msgbuf len\n"); exit(2); }
+  if (v[6].len != 16)     { fprintf(stderr, "result len\n"); exit(2); }
+  memcpy(k, v[0].buf, 16);
+  memcpy(r, v[1].buf, 16);
+  memcpy(n, v[2].buf, 16);
+  memcpy(m, v[3].buf, MSZMAX);
+  start_iter = *(unsigned long *)v[4].buf;
+  end_iter = *(unsigned long *)v[5].buf;
+  if (end_iter < start_iter) { fprintf(stderr, "iter bounds\n"); exit(2); }
+
+  rijndael_ecbinit(&rij, k, 16, 0);
+  poly1305_keyinit(&key, r, 16);
+  for (i = start_iter; i < end_iter; i++) {
+    msz = 0;
+    for (;;) {
+      rijndael_ecbencrypt(&rij, n, s, 16);
+      poly1305_macinit(&mac, &key, s);
+      poly1305_hash(&mac, m, msz);
+      poly1305_done(&mac, t);
+      if (msz >= MSZMAX) break;
+      n[0] ^= i&0xff;
+      for (j = 0; j < 16; j++) n[j] ^= t[j];
+      if (msz%2) {
+       for (j = 0; j < 16; j++) k[j] ^= t[j];
+       rijndael_ecbinit(&rij, k, 16, 0);
+      }
+      if (msz%3) {
+       for (j = 0; j < 16; j++) r[j] ^= t[j];
+       poly1305_keyinit(&key, r, 16);
+      }
+      m[msz++] ^= t[0];
+    }
+  }
+
+  if (MEMCMP(t, !=, v[6].buf, 16)) {
+    ok = 0;
+    fprintf(stderr, "failed...");
+    fprintf(stderr, "\n\tinitial k = "); type_hex.dump(&v[0], stderr);
+    fprintf(stderr, "\n\tinitial r = "); type_hex.dump(&v[1], stderr);
+    fprintf(stderr, "\n\tinitial n = "); type_hex.dump(&v[2], stderr);
+    fprintf(stderr, "\n\tinitial m = "); type_hex.dump(&v[3], stderr);
+    fprintf(stderr, "\n\tstart iter = %lu", start_iter);
+    fprintf(stderr, "\n\tend iter = %lu", end_iter);
+    fprintf(stderr, "\n\tfinal k = "); type_hex.dump(&dk, stderr);
+    fprintf(stderr, "\n\tfinal r = "); type_hex.dump(&dr, stderr);
+    fprintf(stderr, "\n\tfinal n = "); type_hex.dump(&dn, stderr);
+    fprintf(stderr, "\n\tfinal m = "); type_hex.dump(&dm, stderr);
+    fprintf(stderr, "\n\texpected = "); type_hex.dump(&v[6], stderr);
+    fprintf(stderr, "\n\tcalculated = "); type_hex.dump(&dt, stderr);
+    fputc('\n', stderr);
+  }
+
+  dstr_destroy(&dk);
+  dstr_destroy(&dr);
+  dstr_destroy(&dn);
+  dstr_destroy(&dt);
+  dstr_destroy(&dm);
+  return (ok);
+}
+
  static const struct test_chunk tests[] = {
    { "poly1305-hash", vrf_hash,
      { &type_hex, &type_hex, &type_hex, &type_hex } },
    { "poly1305-cat", vrf_cat,
      { &type_hex, &type_hex, &type_hex, &type_hex, &type_hex, &type_hex } },
+  { "poly1305-mct", vrf_mct,
+    { &type_hex, &type_hex, &type_hex, &type_hex,
+      &type_ulong, &type_ulong, &type_hex } },
    { 0, 0, { 0 } }
  };