mdw@git.distorted.org.uk Git - catacomb/blob - symm/gcm.c

   1 /* -*-c-*-
   2  *
   3  * The GCM authenticated encryption mode
   4  *
   5  * (c) 2017 Straylight/Edgeware
   6  */
   7
   8 /*----- Licensing notice --------------------------------------------------*
   9  *
  10  * This file is part of Catacomb.
  11  *
  12  * Catacomb is free software: you can redistribute it and/or modify it
  13  * under the terms of the GNU Library General Public License as published
  14  * by the Free Software Foundation; either version 2 of the License, or
  15  * (at your option) any later version.
  16  *
  17  * Catacomb is distributed in the hope that it will be useful, but
  18  * WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Library General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Library General Public
  23  * License along with Catacomb.  If not, write to the Free Software
  24  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  25  * USA.
  26  */
  27
  28 /*----- Header files ------------------------------------------------------*/
  29
  30 #include "config.h"
  31
  32 #include <stdio.h>
  33
  34 #include <mLib/bits.h>
  35
  36 #include "dispatch.h"
  37 #include "gcm.h"
  38 #include "gcm-def.h"
  39
  40 /*----- Overall strategy --------------------------------------------------*
  41  *
  42  * GCM is pretty awful to implement in software.  (This presentation is going
  43  * to be somewhat different to that in the specification, but I think it
  44  * makes more sense like this.)
  45  *
  46  * We're given a %$w$%-bit blockcipher %$E$% with a key %$K$%.
  47  *
  48  * The main part is arithmetic in the finite field %$k = \gf{2^w}$%, which we
  49  * represent as the quotient ring %$\gf{2}[t]/(p_w(t))$% for some irreducible
  50  * degree-%$w$% polynomial %$p(t)$%, whose precise value isn't very important
  51  * right now.  We choose a secret point %$x = E_K(0^w)$%.
  52  *
  53  * We choose a length size %$z$% as follows: if %$w < 96%$ then %$z = w$%;
  54  * otherwise %$z = w/2$%.  Format a message pair as follows:
  55  *
  56  *      %$F(a, b) = P_w(a) \cat P_w(b) \cat [\ell(a)]_z \cat [\ell(b)]_z$%
  57  *
  58  * where %$P_w(x) = x \cat 0^n$% where $%0 \le n < w$% such that
  59  * %$\ell(x) + n \equiv 0 \pmod{w}$%.
  60  *
  61  * Hash a (block-aligned) message %$u$% as follows.  First, split %$u$% into
  62  * %$w$%-bit blocks %$u_0$%, %$u_1$%, %%\ldots%%, %$u_{n-1}$%.  Interpret
  63  * these as elements of %$k$%.  Then
  64  *
  65  *      %$G_x(u) = u_0 t^n + u_1 t^{n-1} + \cdots + u_{n-1} t$%
  66  *
  67  * converted back to a %$w$%-bit string.
  68  *
  69  * We're ready to go now.  Suppose we're to encrypt a message %$M$% with
  70  * header %$H$% and nonce %$N$%.  If %$\ell(N) + 32 = w$% then let
  71  * %$N' = N$% and let %$i_0 = 1$%; otherwise, let %$U = G_t(F(\epsilon, N))$%
  72  * and split this into %$N' = U[0 \bitsto w - 32]$% and
  73  * %$[i_0]_{32} = U[w - 32 \bitsto w]$%.
  74  *
  75  * Let %$n = \lceil \ell(M)/w \rceil$%.  Compute
  76  *
  77  *      %$y_j = E_K(N' \cat [i_0 + j]_{32})$%
  78  *
  79  * for %$0 \le j \le n$%.  Let
  80  *
  81  *      %$s = (y_1 \cat y_2 \cat \cdots \cat y_n)[0 \bitsto \ell(M)$%
  82  *
  83  * Let %$C = M \xor s$% and let %$T = G_x(F(H, C)) \xor y_0$%.  These are the
  84  * ciphertext and tag respectively.
  85  *
  86  * So why is this awful?
  87  *
  88  * For one thing, the bits are in a completely terrible order.  The bytes are
  89  * arranged in little-endian order, so the unit coefficient is in the first
  90  * byte, and the degree-127 coefficient is in the last byte.  But within each
  91  * byte, the lowest-degree coefficient is in the most significant bit.  It's
  92  * therefore better to think of GCM as using a big-endian byte-ordering
  93  * convention, but with the bits backwards.
  94  *
  95  * But messing about with byte ordering is expensive, so let's not do that in
  96  * the inner loop.  But multiplication in %$k$% is not easy either.  Some
  97  * kind of precomputed table would be nice, but that will leak secrets
  98  * through the cache.
  99  *
 100  * I choose a particularly simple table: given %$x$%, let %$X[i'] = x t^i$%.
 101  * Then $%$x y = \sum_{0\le i<w} y_i X[i']$% which is just a bunch of
 102  * bitmasking.  But the natural order for examining bits of %$y$% is not
 103  * necessarily the obvious one.  We'll have already loaded %$y$% into
 104  * internal form, as 32-bit words.  The good order to process these is left
 105  * to right, from high to low bits.  But now the order of degrees depends on
 106  * the endianness of our conversion of bytes to words.  Oh, well.
 107  *
 108  * If we've adopted a big-endian convention, then we'll see the degrees in
 109  * order, 0, 1, ..., all the way up to %$w - 1$% and everything is fine.  If
 110  * we've adopted a little-endian convention, though, we'll see an ordering
 111  * like this:
 112  *
 113  *      24, 25, ..., 31, 16, 17, ..., 23,  8,  9, ..., 15,  0,  1, ..., 7,
 114  *      56, 57, ..., 63, 48, 49, ..., 55, 40, 41, ..., 47, 32, 33, ..., 39,
 115  *      etc.
 116  *
 117  * which is the ordinary order with 0x18 = 24 XORed into the index.  That is,
 118  * %$i' = i$% if we've adopted a big-endian convention, and
 119  * %$i' = i \xor 24$% if we've adopted a little-endian convention.
 120  */
 121
 122 /*----- Low-level utilities -----------------------------------------------*/
 123
 124 /* --- @mult@ --- *
 125  *
 126  * Arguments:   @const gcm_params *p@ = pointer to the parameters
 127  *              @uint32 *z@ = where to write the result
 128  *              @const uint32 *x@ = input field element
 129  *
 130  * Returns:     ---
 131  *
 132  * Use:         Multiply the input field element by %$t$%, and write the
 133  *              product to @z@.  It's safe for @x@ and @z@ to be equal, but
 134  *              they should not otherwise overlap.  Both input and output are
 135  *              in big-endian form, i.e., with the lowest-degree coefficients
 136  *              in the most significant bits.
 137  */
 138
 139 static void mult(const gcm_params *p, uint32 *z, const uint32 *x)
 140 {
 141   uint32 m, c, t;
 142   unsigned i;
 143
 144   t = x[p->n - 1]; m = -(t&1u); c = m&p->poly;
 145   for (i = 0; i < p->n; i++) { t = x[i]; z[i] = (t >> 1) ^ c; c = t << 31; }
 146 }
 147
 148 /* --- @mul@ --- *
 149  *
 150  * Arguments:   @const gcm_params *p@ = pointer to the parameters
 151  *              @uint32 *z@ = where to write the result
 152  *              @const uint32 *x, *y@ = input field elements
 153  *
 154  * Returns:     ---
 155  *
 156  * Use:         Multiply the input field elements together, and write the
 157  *              product to @z@.  It's safe for the operands to overlap.  Both
 158  *              inputs and the output are in big-endian form, i.e., with the
 159  *              lowest-degree coefficients in the most significant bits.
 160  */
 161
 162 static void mul(const gcm_params *p, uint32 *z,
 163                 const uint32 *x, const uint32 *y)
 164 {
 165   uint32 m, t, u[GCM_NMAX], v[GCM_NMAX];
 166   unsigned i, j, k;
 167
 168   /* We can't do this in-place at all, so use temporary space.  Make a copy
 169    * of @x@ in @u@, where we can clobber it, and build the product in @v@.
 170    */
 171   for (i = 0; i < p->n; i++) { u[i] = x[i]; v[i] = 0; }
 172
 173   /* Repeatedly multiply @x@ (in @u@) by %$t$%, and add together those
 174    * %$x t^i$% selected by the bits of @y@.  This is basically what you get
 175    * by streaming the result of @gcm_mktable@ into @gcm_mulk_...@.
 176    */
 177   for (i = 0; i < p->n; i++) {
 178     t = y[i];
 179     for (j = 0; j < 32; j++) {
 180       m = -((t >> 31)&1u);
 181       for (k = 0; k < p->n; k++) v[k] ^= u[k]&m;
 182       mult(p, u, u); t <<= 1;
 183     }
 184   }
 185
 186   /* Write out the result now that it's ready. */
 187   for (i = 0; i < p->n; i++) z[i] = v[i];
 188 }
 189
 190 /*----- Table-based multiplication ----------------------------------------*/
 191
 192 /* --- @gcm_mktable@ --- *
 193  *
 194  * Arguments:   @const gcm_params *p@ = pointer to the parameters
 195  *              @uint32 *ktab@ = where to write the table; there must be
 196  *                      space for %$32 n$% $%n$%-word entries, i.e.,
 197  *                      %$32 n^2$% 32-bit words in total, where %$n$% is
 198  *                      @p->n@, the block size in words
 199  *              @const uint32 *k@ = input field element
 200  *
 201  * Returns:     ---
 202  *
 203  * Use:         Construct a table for use by @gcm_mulk_...@ below, to
 204  *              multiply (vaguely) efficiently by @k@.
 205  */
 206
 207 static void simple_mktable(const gcm_params *p,
 208                            uint32 *ktab, const uint32 *k)
 209 {
 210   unsigned m = (p->f&GCMF_SWAP ? 0x18 : 0);
 211   unsigned i, j, o = m*p->n;
 212
 213   /* As described above, the table stores entries %$K[i \xor m] = k t^i$%,
 214    * where %$m = 0$% (big-endian cipher) or %$m = 24$% (little-endian).
 215    * The first job is to store %$K[m] = k$%.
 216    *
 217    * We initially build the table with the entries in big-endian order, and
 218    * then swap them if necessary.  This makes the arithmetic functions more
 219    * amenable for use by @gcm_concat@ below.
 220    */
 221   if (!(p->f&GCMF_SWAP)) for (i = 0; i < p->n; i++) ktab[o + i] = k[i];
 222   else for (i = 0; i < p->n; i++) ktab[o + i] = ENDSWAP32(k[i]);
 223
 224   /* Fill in the rest of the table by repeatedly multiplying the previous
 225    * entry by %$t$%.
 226    */
 227   for (i = 1; i < 32*p->n; i++)
 228     { j = (i ^ m)*p->n; mult(p, ktab + j, ktab + o); o = j; }
 229
 230   /* Finally, if the cipher uses a little-endian convention, then swap all of
 231    * the individual words.
 232    */
 233   if (p->f&GCMF_SWAP)
 234     for (i = 0; i < 32*p->n*p->n; i++) ktab[i] = ENDSWAP32(ktab[i]);
 235 }
 236
 237 CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable,
 238              (const gcm_params *p, uint32 *ktab, const uint32 *k),
 239              (p, ktab, k),
 240              pick_mktable, simple_mktable)
 241
 242 static gcm_mktable__functype *pick_mktable(void)
 243 {
 244   DISPATCH_PICK_FALLBACK(gcm_mktable, simple_mktable);
 245 }
 246
 247 /* --- @recover_k@ --- *
 248  *
 249  * Arguments:   @const gcm_params *p@ = pointer to the parameters
 250  *              @uint32 *k@ = block-sized vector in which to store %$k$%
 251  *              @const uint32 *ktab@ = the table encoding %$k$%
 252  *
 253  * Returns:     ---
 254  *
 255  * Use:         Recovers %$k$%, the secret from which @ktab@ was by
 256  *              @gcm_mktable@, from the table, and stores it in internal
 257  *              (big-endian) form in @k@.
 258  */
 259
 260 static void simple_recover_k(const gcm_params *p,
 261                              uint32 *k, const uint32 *ktab)
 262 {
 263   unsigned i;
 264
 265   /* If the blockcipher is big-endian, then the key is simply in the first
 266    * table element, in the right format.  If the blockcipher is little-endian
 267    * then it's in element 24, and the bytes need swapping.
 268    */
 269
 270   if (!(p->f&GCMF_SWAP)) for (i = 0; i < p->n; i++) k[i] = ktab[i];
 271   else for (i = 0; i < p->n; i++) k[i] = ENDSWAP32(ktab[24*p->n + i]);
 272 }
 273
 274 CPU_DISPATCH(static, EMPTY, void, recover_k,
 275              (const gcm_params *p, uint32 *k, const uint32 *ktab),
 276              (p, k, ktab),
 277              pick_recover_k, simple_recover_k)
 278
 279 static recover_k__functype *pick_recover_k(void)
 280   { DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k); }
 281
 282 /* --- @gcm_mulk_N{b,l}@ --- *
 283  *
 284  * Arguments:   @uint32 *a@ = accumulator to multiply
 285  *              @const uint32 *ktab@ = table constructed by @gcm_mktable@
 286  *
 287  * Returns:     ---
 288  *
 289  * Use:         Multiply @a@ by @k@ (implicitly represented in @ktab@),
 290  *              updating @a@ in-place.  There are separate functions for each
 291  *              supported block size and endianness because this is the
 292  *              function whose performance actually matters.
 293  */
 294
 295 #define DEF_MULK(nbits)                                                 \
 296                                                                         \
 297 CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##b,                   \
 298              (uint32 *a, const uint32 *ktab), (a, ktab),                \
 299              pick_mulk_##nbits##b, simple_mulk_##nbits)                 \
 300 CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##l,                   \
 301              (uint32 *a, const uint32 *ktab), (a, ktab),                \
 302              pick_mulk_##nbits##l, simple_mulk_##nbits)                 \
 303                                                                         \
 304 static void simple_mulk_##nbits(uint32 *a, const uint32 *ktab)          \
 305 {                                                                       \
 306   uint32 m, t;                                                          \
 307   uint32 z[nbits/32];                                                   \
 308   unsigned i, j, k;                                                     \
 309                                                                         \
 310   for (i = 0; i < nbits/32; i++) z[i] = 0;                              \
 311                                                                         \
 312   for (i = 0; i < nbits/32; i++) {                                      \
 313     t = a[i];                                                           \
 314     for (j = 0; j < 32; j++) {                                          \
 315       m = -((t >> 31)&1u);                                              \
 316       for (k = 0; k < nbits/32; k++) z[k] ^= *ktab++&m;                 \
 317       t <<= 1;                                                          \
 318     }                                                                   \
 319   }                                                                     \
 320                                                                         \
 321   for (i = 0; i < nbits/32; i++) a[i] = z[i];                           \
 322 }                                                                       \
 323                                                                         \
 324 static gcm_mulk_##nbits##b##__functype *pick_mulk_##nbits##b(void)      \
 325   { DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits); } \
 326 static gcm_mulk_##nbits##l##__functype *pick_mulk_##nbits##l(void)      \
 327   { DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits); }
 328
 329 GCM_WIDTHS(DEF_MULK)
 330
 331 #define GCM_MULK_CASE(nbits)                                            \
 332   case nbits/32:                                                        \
 333     if (_f&GCMF_SWAP) gcm_mulk_##nbits##l(_a, _ktab);                   \
 334     else gcm_mulk_##nbits##b(_a, _ktab);                                \
 335     break;
 336 #define MULK(n, f, a, ktab) do {                                        \
 337   uint32 *_a = (a); const uint32 *_ktab = (ktab);                       \
 338   unsigned _f = (f);                                                    \
 339   switch (n) {                                                          \
 340     GCM_WIDTHS(GCM_MULK_CASE)                                           \
 341     default: abort();                                                   \
 342   }                                                                     \
 343 } while (0)
 344
 345 /*----- Other utilities ---------------------------------------------------*/
 346
 347 /* --- @putlen@ --- *
 348  *
 349  * Arguments:   @octet *p@ = pointer to output buffer
 350  *              @unsigned w@ = size of output buffer
 351  *              @unsigned blksz@ = block size (assumed fairly small)
 352  *              @unsigned long nblocks@ = number of blocks
 353  *              @unsigned nbytes@ = tail size in bytes (assumed small)
 354  *
 355  * Returns:     ---
 356  *
 357  * Use:         Store the overall length in %$\emph{bits}$% (i.e.,
 358  *              @3*(nblocks*blksz + nbytes)@ in big-endian form in the
 359  *              buffer @p@.
 360  */
 361
 362 static void putlen(octet *p, unsigned w, unsigned blksz,
 363                    unsigned long nblocks, unsigned nbytes)
 364 {
 365   unsigned long nblo = nblocks&((1ul << (ULONG_BITS/2)) - 1),
 366     nbhi = nblocks >> ULONG_BITS/2;
 367   unsigned long nlo = nblo*blksz + nbytes, nhi = nbhi*blksz;
 368
 369   /* This is fiddly.  Split @nblocks@, which is the big number, into high and
 370    * low halves, multiply those separately by @blksz@, propagate carries, and
 371    * then multiply by eight.
 372    */
 373   nhi += nlo >> ULONG_BITS/2;
 374   nlo &= (1ul << (ULONG_BITS/2)) - 1;
 375   nlo <<= 3;
 376
 377   /* Now write out the size, feeding bits in from @nhi@ as necessary. */
 378   p += w;
 379   while (w--) {
 380     *--p = U8(nlo);
 381     nlo = (nlo >> 8) | ((nhi&0xff) << (ULONG_BITS/2 - 5));
 382     nhi >>= 8;
 383   }
 384 }
 385
 386 /* --- @mix@ --- *
 387  *
 388  * Arguments:   @const gcm_params *p@ = pointer to the parameters
 389  *              @uint32 *a@ = GHASH accumulator
 390  *              @const octet *q@ = pointer to an input block
 391  *              @const uint32 *ktab@ = multiplication table, built by
 392  *                      @gcm_mktable@
 393  *
 394  * Returns:     ---
 395  *
 396  * Use:         Fold the block @q@ into the GHASH accumulator.  The
 397  *              calculation is %$a' = k (a + q)$%.
 398  */
 399
 400 static void mix(const gcm_params *p, uint32 *a,
 401                 const octet *q, const uint32 *ktab)
 402 {
 403   unsigned i;
 404
 405   if (p->f&GCMF_SWAP)
 406     for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_L(q); q += 4; }
 407   else
 408     for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_B(q); q += 4; }
 409   MULK(p->n, p->f, a, ktab);
 410 }
 411
 412 /* --- @gcm_ghashdone@ --- *
 413  *
 414  * Arguments:   @const gcm_params *p@ = pointer to the parameters
 415  *              @uint32 *a@ = GHASH accumulator
 416  *              @const uint32 *ktab@ = multiplication table, built by
 417  *                      @gcm_mktable@
 418  *              @unsigned long xblocks, yblocks@ = number of whole blocks in
 419  *                      the two inputs
 420  *              @unsigned xbytes, ybytes@ = number of trailing bytes in the
 421  *                      two inputs
 422  *
 423  * Returns:     ---
 424  *
 425  * Use:         Finishes a GHASH operation by appending the appropriately
 426  *              encoded lengths of the two constituent messages.
 427  */
 428
 429 void gcm_ghashdone(const gcm_params *p, uint32 *a, const uint32 *ktab,
 430                    unsigned long xblocks, unsigned xbytes,
 431                    unsigned long yblocks, unsigned ybytes)
 432 {
 433   octet b[4*GCM_NMAX];
 434   unsigned w = p->n < 3 ? 4*p->n : 2*p->n;
 435
 436   /* Construct the encoded lengths.  Note that smaller-block versions of GCM
 437    * encode the lengths in separate blocks.  GCM is only officially defined
 438    * for 64- and 128-bit blocks; I've placed the cutoff somewhat arbitrarily
 439    * at 96 bits.
 440    */
 441   putlen(b,     w, 4*p->n, xblocks, xbytes);
 442   putlen(b + w, w, 4*p->n, yblocks, ybytes);
 443
 444   /* Feed the lengths into the accumulator. */
 445   mix(p, a, b, ktab);
 446   if (p->n < 3) mix(p, a, b + w, ktab);
 447 }
 448
 449 /* --- @gcm_concat@ --- *
 450  *
 451  * Arguments:   @const gcm_params *p@ = pointer to the parameters
 452  *              @uint32 *z@ = GHASH accumulator for suffix, updated
 453  *              @const uint32 *x@ = GHASH accumulator for prefix
 454  *              @const uint32 *ktab@ = multiplication table, built by
 455  *                      @gcm_mktable@
 456  *              @unsigned long n@ = length of suffix in whole blocks
 457  *
 458  * Returns:     ---
 459  *
 460  * Use:         On entry, @x@ and @z@ are the results of hashing two strings
 461  *              %$a$% and %$b$%, each a whole number of blocks long; in
 462  *              particular, %$b$% is @n@ blocks long.  On exit, @z@ is
 463  *              updated to be the hash of %$a \cat b$%.
 464  */
 465
 466 void gcm_concat(const gcm_params *p, uint32 *z, const uint32 *x,
 467                 const uint32 *ktab, unsigned long n)
 468 {
 469   uint32 t[GCM_NMAX], u[GCM_NMAX];
 470   unsigned i, j;
 471
 472   if (!n) {
 473     /* If @n@ is zero, then there's not much to do.  The mathematics
 474      * (explained below) still works, but the code takes a shortcut which
 475      * doesn't handle this case: so set %$z' = z + x k^n = z + x$%.
 476      */
 477
 478     for (j = 0; j < p->n; j++) z[j] ^= x[j];
 479   } else {
 480     /* We have %$x = a_0 t^m + \cdots + a_{m-2} t^2 + a_{m-1} t$% and
 481      * %$z = b_0 t^n + \cdots + b_{n-2} t^2 + b_{n-1} t$%.  What we'd like is
 482      * the hash of %$a \cat b$%, which is %$z + x k^n$%.
 483      *
 484      * The first job, then, is to calculate %$k^n$%, and for this we use a
 485      * simple left-to-right square-and-multiply algorithm.  There's no need
 486      * to keep %$n$% secret here.
 487      */
 488
 489     /* Start by retrieving %$k$% from the table, and convert it to big-endian
 490      * form.
 491      */
 492     recover_k(p, u, ktab);
 493
 494     /* Now calculate %$k^n$%. */
 495     i = ULONG_BITS;
 496 #define BIT (1ul << (ULONG_BITS - 1))
 497     while (!(n&BIT)) { n <<= 1; i--; }
 498     n <<= 1; i--; for (j = 0; j < p->n; j++) t[j] = u[j];
 499     while (i--) { mul(p, t, t, t); if (n&BIT) mul(p, t, t, u); n <<= 1; }
 500 #undef BIT
 501
 502     /* Next, calculate %$x k^n$%.  If we're using a little-endian convention
 503      * then we must convert %$x$%; otherwise we can just use it in place.
 504      */
 505     if (!(p->f&GCMF_SWAP))
 506       mul(p, t, t, x);
 507     else {
 508       for (j = 0; j < p->n; j++) u[j] = ENDSWAP32(x[j]);
 509       mul(p, t, t, u);
 510     }
 511
 512     /* Finally, add %$x k^n$% onto %$z$%, converting back to little-endian if
 513      * necessary.
 514      */
 515     if (!(p->f&GCMF_SWAP)) for (j = 0; j < p->n; j++) z[j] ^= t[j];
 516     else for (j = 0; j < p->n; j++) z[j] ^= ENDSWAP32(t[j]);
 517   }
 518 }
 519
 520 /*----- Test rig ----------------------------------------------------------*/
 521
 522 #ifdef TEST_RIG
 523
 524 #include <mLib/quis.h>
 525 #include <mLib/testrig.h>
 526
 527 static void report_failure(const char *test, unsigned nbits,
 528                            const char *ref, dstr v[], dstr *d)
 529 {
 530   printf("test %s failed (nbits = %u)", test, nbits);
 531   printf("\n\tx  = "); type_hex.dump(&v[0], stdout);
 532   printf("\n\ty  = "); type_hex.dump(&v[1], stdout);
 533   printf("\n\tz  = "); type_hex.dump(&v[2], stdout);
 534   printf("\n\t%s' = ", ref); type_hex.dump(d, stdout);
 535   putchar('\n');
 536 }
 537
 538 static void mulk(unsigned nbits, unsigned f, uint32 *x, const uint32 *ktab)
 539   { MULK(nbits/32, f, x, ktab); }
 540
 541 static int test_mul(uint32 poly, dstr v[])
 542 {
 543   uint32 x[GCM_NMAX], y[GCM_NMAX], z[GCM_NMAX], ktab[32*GCM_NMAX*GCM_NMAX];
 544   gcm_params p;
 545   dstr d = DSTR_INIT;
 546   unsigned i, nbits;
 547   int ok = 1;
 548   enum { I_x, I_y, I_z };
 549
 550   nbits = 8*v[0].len; p.f = 0; p.n = nbits/32; p.poly = poly;
 551   dstr_ensure(&d, nbits/8); d.len = nbits/8;
 552
 553 #define LOADXY(E) do {                                                  \
 554   for (i = 0; i < nbits/32; i++) {                                      \
 555     x[i] = LOAD32_##E(v[I_x].buf + 4*i);                                \
 556     y[i] = LOAD32_##E(v[I_y].buf + 4*i);                                \
 557   }                                                                     \
 558 } while (0)
 559
 560 #define INITZ(x) do {                                                   \
 561   for (i = 0; i < nbits/32; i++) z[i] = (x)[i];                         \
 562 } while (0)
 563
 564 #define CHECK(E, what, ref) do {                                        \
 565   for (i = 0; i < nbits/32; i++) STORE32_##E(d.buf + 4*i, z[i]);        \
 566   if (memcmp(d.buf, v[I_##ref].buf, nbits/8) != 0)                      \
 567     { ok = 0; report_failure(what, nbits, #ref, v, &d); }               \
 568 } while (0)
 569
 570 #define TEST_PREP_1(E, x, y, what) do {                                 \
 571   gcm_mktable(&p, ktab, y);                                             \
 572   recover_k(&p, z, ktab); CHECK(B, "mktable/recover_k (" #y ")", y);    \
 573   INITZ(x); mulk(nbits, p.f, z, ktab); CHECK(E, what " (k = " #y ")", z); \
 574 } while (0)
 575
 576 #define TEST_PREP(E, what) do {                                         \
 577   TEST_PREP_1(E, x, y, what);                                           \
 578   TEST_PREP_1(E, y, x, what);                                           \
 579 } while (0)
 580
 581   /* First, test plain multiply. */
 582   LOADXY(B); mul(&p, z, x, y); CHECK(B, "gcm_mul", z);
 583
 584   /* Next, test big-endian prepared key. */
 585   LOADXY(B); TEST_PREP(B, "gcm_kmul_b");
 586
 587   /* Finally, test little-endian prepared key. */
 588   p.f = GCMF_SWAP; LOADXY(L);
 589   TEST_PREP(L, "gcm_kmul_l");
 590
 591 #undef LOADXY
 592 #undef INITZ
 593 #undef CHECK
 594 #undef TEST_PREP_1
 595 #undef TEST_PREP
 596
 597   /* All done. */
 598   return (ok);
 599 }
 600
 601 #define TEST(nbits)                                                     \
 602 static int test_mul_##nbits(dstr v[])                                   \
 603   { return (test_mul(GCM_POLY_##nbits, v)); }
 604 GCM_WIDTHS(TEST)
 605 #undef TEST
 606
 607 static test_chunk defs[] = {
 608 #define TEST(nbits)                                                     \
 609   { "gcm-mul" #nbits, test_mul_##nbits,                                 \
 610     { &type_hex, &type_hex, &type_hex, 0 } },
 611 GCM_WIDTHS(TEST)
 612 #undef TEST
 613   { 0, 0, { 0 } }
 614 };
 615
 616 int main(int argc,  char *argv[])
 617 {
 618   ego(argv[0]);
 619   test_run(argc, argv, defs, SRCDIR"/t/gcm");
 620   return (0);
 621 }
 622
 623 #endif
 624
 625 /*----- That's all, folks -------------------------------------------------*/