3 * The GCM authenticated encryption mode
5 * (c) 2017 Straylight/Edgeware
8 /*----- Licensing notice --------------------------------------------------*
10 * This file is part of Catacomb.
12 * Catacomb is free software: you can redistribute it and/or modify it
13 * under the terms of the GNU Library General Public License as published
14 * by the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
17 * Catacomb is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Library General Public License for more details.
22 * You should have received a copy of the GNU Library General Public
23 * License along with Catacomb. If not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
28 /*----- Header files ------------------------------------------------------*/
34 #include <mLib/bits.h>
40 /*----- Overall strategy --------------------------------------------------*
42 * GCM is pretty awful to implement in software. (This presentation is going
43 * to be somewhat different to that in the specification, but I think it
44 * makes more sense like this.)
46 * We're given a %$w$%-bit blockcipher %$E$% with a key %$K$%.
48 * The main part is arithmetic in the finite field %$k = \gf{2^w}$%, which we
49 * represent as the quotient ring %$\gf{2}[t]/(p_w(t))$% for some irreducible
50 * degree-%$w$% polynomial %$p(t)$%, whose precise value isn't very important
51 * right now. We choose a secret point %$x = E_K(0^w)$%.
 * We choose a length size %$z$% as follows: if %$w < 96$% then %$z = w$%;
54 * otherwise %$z = w/2$%. Format a message pair as follows:
56 * %$F(a, b) = P_w(a) \cat P_w(b) \cat [\ell(a)]_z \cat [\ell(b)]_z$%
 * where %$P_w(x) = x \cat 0^n$% where %$0 \le n < w$% such that
59 * %$\ell(x) + n \equiv 0 \pmod{w}$%.
61 * Hash a (block-aligned) message %$u$% as follows. First, split %$u$% into
62 * %$w$%-bit blocks %$u_0$%, %$u_1$%, %%\ldots%%, %$u_{n-1}$%. Interpret
63 * these as elements of %$k$%. Then
65 * %$G_x(u) = u_0 t^n + u_1 t^{n-1} + \cdots + u_{n-1} t$%
67 * converted back to a %$w$%-bit string.
69 * We're ready to go now. Suppose we're to encrypt a message %$M$% with
70 * header %$H$% and nonce %$N$%. If %$\ell(N) + 32 = w$% then let
71 * %$N' = N$% and let %$i_0 = 1$%; otherwise, let %$U = G_t(F(\epsilon, N))$%
72 * and split this into %$N' = U[0 \bitsto w - 32]$% and
73 * %$[i_0]_{32} = U[w - 32 \bitsto w]$%.
75 * Let %$n = \lceil \ell(M)/w \rceil$%. Compute
77 * %$y_j = E_K(N' \cat [i_0 + j]_{32})$%
79 * for %$0 \le j \le n$%. Let
 * %$s = (y_1 \cat y_2 \cat \cdots \cat y_n)[0 \bitsto \ell(M)]$%
83 * Let %$C = M \xor s$% and let %$T = G_x(F(H, C)) \xor y_0$%. These are the
84 * ciphertext and tag respectively.
86 * So why is this awful?
88 * For one thing, the bits are in a completely terrible order. The bytes are
89 * arranged in little-endian order, so the unit coefficient is in the first
90 * byte, and the degree-127 coefficient is in the last byte. But within each
91 * byte, the lowest-degree coefficient is in the most significant bit. It's
92 * therefore better to think of GCM as using a big-endian byte-ordering
93 * convention, but with the bits backwards.
95 * But messing about with byte ordering is expensive, so let's not do that in
96 * the inner loop. But multiplication in %$k$% is not easy either. Some
97 * kind of precomputed table would be nice, but that will leak secrets
100 * I choose a particularly simple table: given %$x$%, let %$X[i'] = x t^i$%.
 * Then %$x y = \sum_{0\le i<w} y_i X[i']$%, which is just a bunch of
102 * bitmasking. But the natural order for examining bits of %$y$% is not
103 * necessarily the obvious one. We'll have already loaded %$y$% into
104 * internal form, as 32-bit words. The good order to process these is left
105 * to right, from high to low bits. But now the order of degrees depends on
106 * the endianness of our conversion of bytes to words. Oh, well.
108 * If we've adopted a big-endian convention, then we'll see the degrees in
109 * order, 0, 1, ..., all the way up to %$w - 1$% and everything is fine. If
110 * we've adopted a little-endian convention, though, we'll see an ordering
113 * 24, 25, ..., 31, 16, 17, ..., 23, 8, 9, ..., 15, 0, 1, ..., 7,
114 * 56, 57, ..., 63, 48, 49, ..., 55, 40, 41, ..., 47, 32, 33, ..., 39,
117 * which is the ordinary order with 0x18 = 24 XORed into the index. That is,
118 * %$i' = i$% if we've adopted a big-endian convention, and
119 * %$i' = i \xor 24$% if we've adopted a little-endian convention.
122 /*----- Low-level utilities -----------------------------------------------*/
126 * Arguments: @const gcm_params *p@ = pointer to the parameters
127 * @uint32 *z@ = where to write the result
128 * @const uint32 *x@ = input field element
132 * Use: Multiply the input field element by %$t$%, and write the
133 * product to @z@. It's safe for @x@ and @z@ to be equal, but
134 * they should not otherwise overlap. Both input and output are
135 * in big-endian form, i.e., with the lowest-degree coefficients
136 * in the most significant bits.
139 static void mult(const gcm_params
*p
, uint32
*z
, const uint32
*x
)
144 t
= x
[p
->n
- 1]; m
= -(t
&1u); c
= m
&p
->poly
;
145 for (i
= 0; i
< p
->n
; i
++) { t
= x
[i
]; z
[i
] = (t
>> 1) ^ c
; c
= t
<< 31; }
150 * Arguments: @const gcm_params *p@ = pointer to the parameters
151 * @uint32 *z@ = where to write the result
152 * @const uint32 *x, *y@ = input field elements
156 * Use: Multiply the input field elements together, and write the
157 * product to @z@. It's safe for the operands to overlap. Both
158 * inputs and the output are in big-endian form, i.e., with the
159 * lowest-degree coefficients in the most significant bits.
162 static void mul(const gcm_params
*p
, uint32
*z
,
163 const uint32
*x
, const uint32
*y
)
165 uint32 m
, t
, u
[GCM_NMAX
], v
[GCM_NMAX
];
168 /* We can't do this in-place at all, so use temporary space. Make a copy
169 * of @x@ in @u@, where we can clobber it, and build the product in @v@.
171 for (i
= 0; i
< p
->n
; i
++) { u
[i
] = x
[i
]; v
[i
] = 0; }
173 /* Repeatedly multiply @x@ (in @u@) by %$t$%, and add together those
174 * %$x t^i$% selected by the bits of @y@. This is basically what you get
175 * by streaming the result of @gcm_mktable@ into @gcm_mulk_...@.
177 for (i
= 0; i
< p
->n
; i
++) {
179 for (j
= 0; j
< 32; j
++) {
181 for (k
= 0; k
< p
->n
; k
++) v
[k
] ^= u
[k
]&m
;
182 mult(p
, u
, u
); t
<<= 1;
186 /* Write out the result now that it's ready. */
187 for (i
= 0; i
< p
->n
; i
++) z
[i
] = v
[i
];
190 /*----- Table-based multiplication ----------------------------------------*/
192 /* --- @gcm_mktable@ --- *
194 * Arguments: @const gcm_params *p@ = pointer to the parameters
195 * @uint32 *ktab@ = where to write the table; there must be
196 * space for %$32 n$% $%n$%-word entries, i.e.,
197 * %$32 n^2$% 32-bit words in total, where %$n$% is
198 * @p->n@, the block size in words
199 * @const uint32 *k@ = input field element
203 * Use: Construct a table for use by @gcm_mulk_...@ below, to
204 * multiply (vaguely) efficiently by @k@.
207 static void simple_mktable(const gcm_params
*p
,
208 uint32
*ktab
, const uint32
*k
)
210 unsigned m
= (p
->f
&GCMF_SWAP ?
0x18 : 0);
211 unsigned i
, j
, o
= m
*p
->n
;
213 /* As described above, the table stores entries %$K[i \xor m] = k t^i$%,
214 * where %$m = 0$% (big-endian cipher) or %$m = 24$% (little-endian).
215 * The first job is to store %$K[m] = k$%.
217 * We initially build the table with the entries in big-endian order, and
218 * then swap them if necessary. This makes the arithmetic functions more
219 * amenable for use by @gcm_concat@ below.
221 if (!(p
->f
&GCMF_SWAP
)) for (i
= 0; i
< p
->n
; i
++) ktab
[o
+ i
] = k
[i
];
222 else for (i
= 0; i
< p
->n
; i
++) ktab
[o
+ i
] = ENDSWAP32(k
[i
]);
224 /* Fill in the rest of the table by repeatedly multiplying the previous
227 for (i
= 1; i
< 32*p
->n
; i
++)
228 { j
= (i
^ m
)*p
->n
; mult(p
, ktab
+ j
, ktab
+ o
); o
= j
; }
230 /* Finally, if the cipher uses a little-endian convention, then swap all of
231 * the individual words.
234 for (i
= 0; i
< 32*p
->n
*p
->n
; i
++) ktab
[i
] = ENDSWAP32(ktab
[i
]);
#if CPUFAM_X86 || CPUFAM_AMD64
/* Build the key table used by the x86 PCLMUL assembler implementation.
 *
 * NOTE(review): the declarations and initialization of @n@, @nz@, and @t@
 * appear to have been lost from this extract; from the loops below, @t@
 * evidently starts just past the output region, @n@ counts key words, and
 * @nz@ counts zero-padding words -- confirm against the upstream source.
 */
static void pclmul_mktable(const gcm_params *p,
                           uint32 *ktab, const uint32 *k)

  /* We just need to store the value in a way which is convenient for the
   * assembler code to read back.  That involves reordering the words, and,
   * in the case of 96-bit blocks, padding with zeroes to fill out a
   * 128-bit chunk.
   */

  /* Copy the key a word at a time, reversing the word order; byte-swap
   * each word too if the blockcipher convention is little-endian.
   */
  if (p->f&GCMF_SWAP) while (n--) { *--t = ENDSWAP32(*k); k++; }
  else while (n--) *--t = *k++;
  while (nz--) *--t = 0;	/* zero padding (96-bit blocks) */
/* Build the key table used by the 32-bit ARM crypto assembler code.
 *
 * NOTE(review): the declarations of @t@ and @n@, the loop headers around
 * the paired copies, and the close of the @if@/@else@ branches are missing
 * from this extract; the second group of statements below presumably
 * belongs to the @else@ (big-endian) branch -- confirm against the
 * upstream source.
 */
static void arm_crypto_mktable(const gcm_params *p,
                               uint32 *ktab, const uint32 *k)

  /* We just need to store the value in a way which is convenient for the
   * assembler code to read back.  That involves swapping the bytes in each
   * word, and swapping the words within each pair.
   */
  if (p->f&GCMF_SWAP) {
    /* Little-endian cipher: swap bytes within words, and words within
     * pairs.  */
    t[1] = ENDSWAP32(k[0]); t[0] = ENDSWAP32(k[1]);
    t += 2; k += 2; n -= 2;
    /* Odd trailing word: store it high, zero its partner. */
    if (n) { t[1] = ENDSWAP32(k[0]); t[0] = 0; }
    /* (Presumed @else@ branch.)  Big-endian cipher: just swap the words
     * within each pair.  */
    t[1] = k[0]; t[0] = k[1];
    t += 2; k += 2; n -= 2;
    if (n) { t[1] = k[0]; t[0] = 0; }
/* --- @rbit32@ --- *
 *
 * Reverse the order of the bits in a 32-bit word.
 *
 * NOTE(review): the `#else'/`#endif' lines, the `return', and the
 * declarations of @t@ and @z@ are missing from this extract.
 */
static uint32 rbit32(uint32 x)

#if GCC_VERSION_P(4, 3)
  /* Two tricks here.  Firstly, two separate steps, rather than a single
   * block of assembler, to allow finer-grained instruction scheduling.
   * Secondly, use `ENDSWAP32' so that the compiler can cancel it if the
   * caller actually wants the bytes reordered.
   */
  __asm__("rbit %w0, %w1" : "=r"(t) : "r"(x));

  /* A generic but slightly clever implementation: swap progressively
   * smaller groups of bits.  */
# define SWIZZLE(x, m, s) ((((x)&(m)) << (s)) | (((x)&~(m)) >> (s)))
  t = SWIZZLE(x, 0x0f0f0f0f, 4);	/* 32107654 -- swap nibbles */
  t = SWIZZLE(t, 0x33333333, 2);	/* 10325476 -- swap bit pairs */
  z = SWIZZLE(t, 0x55555555, 1);	/* 01234567 -- swap adjacent bits */
/* Build the key table used by the AArch64 PMULL assembler code.
 *
 * NOTE(review): the declarations of @t@ and @n@, the loop headers, and the
 * @if@/@else@ bracketing around the two copies below are missing from this
 * extract; the second group of statements presumably belongs to the @else@
 * (big-endian) branch -- confirm against the upstream source.
 */
static void arm64_pmull_mktable(const gcm_params *p,
                                uint32 *ktab, const uint32 *k)

  /* We just need to store the value in a way which is convenient for the
   * assembler code to read back.  That involves two transformations:
   *
   *   * firstly, reversing the order of the bits in each byte; and,
   *
   *   * secondly, storing two copies of each 64-bit chunk.
   *
   * Note that, in this case, we /want/ the little-endian byte order of
   * GCM, so endianness-swapping happens in the big-endian case.
   */
  if (p->f&GCMF_SWAP) {
    /* Little-endian cipher: bit-reverse each byte; duplicate each 64-bit
     * chunk.  */
    t[0] = t[2] = rbit32(k[0]);
    t[1] = t[3] = rbit32(k[1]);
    t += 4; k += 2; n -= 2;
    /* Odd trailing word: duplicate it, zero its partner. */
    if (n) { t[0] = t[2] = rbit32(k[0]); t[1] = t[3] = 0; }
    /* (Presumed @else@ branch.)  Big-endian cipher: additionally swap the
     * bytes in each word.  */
    t[0] = t[2] = ENDSWAP32(rbit32(k[0]));
    t[1] = t[3] = ENDSWAP32(rbit32(k[1]));
    t += 4; k += 2; n -= 2;
    if (n) { t[0] = t[2] = ENDSWAP32(rbit32(k[0])); t[1] = t[3] = 0; }
/* Dispatch @gcm_mktable@ to the best implementation available at runtime,
 * falling back to the portable @simple_mktable@.
 *
 * NOTE(review): the CPU_DISPATCH argument-forwarding line, the function
 * braces, and the `#elif CPUFAM_ARM' / `#elif CPUFAM_ARM64' / `#endif'
 * preprocessor lines separating the picker's sections are missing from
 * this extract.
 */
CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable,
             (const gcm_params *p, uint32 *ktab, const uint32 *k),
             pick_mktable, simple_mktable)

static gcm_mktable__functype *pick_mktable(void)
#if CPUFAM_X86 || CPUFAM_AMD64
  /* Prefer the PCLMUL-based implementation when the CPU supports it. */
  DISPATCH_PICK_COND(gcm_mktable, pclmul_mktable,
                     cpu_feature_p(CPUFEAT_X86_SSSE3) &&
                     cpu_feature_p(CPUFEAT_X86_PCLMUL));
  DISPATCH_PICK_COND(gcm_mktable, arm_crypto_mktable,
                     cpu_feature_p(CPUFEAT_ARM_PMULL));
  DISPATCH_PICK_COND(gcm_mktable, arm64_pmull_mktable,
                     cpu_feature_p(CPUFEAT_ARM_PMULL));
  DISPATCH_PICK_FALLBACK(gcm_mktable, simple_mktable);
374 /* --- @recover_k@ --- *
376 * Arguments: @const gcm_params *p@ = pointer to the parameters
377 * @uint32 *k@ = block-sized vector in which to store %$k$%
378 * @const uint32 *ktab@ = the table encoding %$k$%
382 * Use: Recovers %$k$%, the secret from which @ktab@ was by
383 * @gcm_mktable@, from the table, and stores it in internal
384 * (big-endian) form in @k@.
387 static void simple_recover_k(const gcm_params
*p
,
388 uint32
*k
, const uint32
*ktab
)
392 /* If the blockcipher is big-endian, then the key is simply in the first
393 * table element, in the right format. If the blockcipher is little-endian
394 * then it's in element 24, and the bytes need swapping.
397 if (!(p
->f
&GCMF_SWAP
)) for (i
= 0; i
< p
->n
; i
++) k
[i
] = ktab
[i
];
398 else for (i
= 0; i
< p
->n
; i
++) k
[i
] = ENDSWAP32(ktab
[24*p
->n
+ i
]);
#if CPUFAM_X86 || CPUFAM_AMD64
/* Recover @k@ from the x86 PCLMUL key table.
 *
 * NOTE(review): the declarations of @n@, @nz@, and @t@, and the line
 * pointing @t@ just past the stored key, are missing from this extract --
 * confirm against the upstream source.
 */
static void pclmul_recover_k(const gcm_params *p,
                             uint32 *k, const uint32 *ktab)

  /* The representation is already independent of the blockcipher
   * endianness.  We need to compensate for padding, and reorder the words.
   */
  if (n == 3) nz = 1; else nz = 0;	/* one padding word for 96-bit blocks */
  while (n--) *k++ = *--t;		/* copy back, reversing word order */
/* Recover @k@ from the 32-bit ARM crypto key table.
 *
 * NOTE(review): the declarations of @t@ and @n@, and any handling of an
 * odd trailing word after the loop, are missing from this extract.
 */
static void arm_crypto_recover_k(const gcm_params *p,
                                 uint32 *k, const uint32 *ktab)

  /* The representation is already independent of the blockcipher
   * endianness.  We only need to reorder the words (pairs were swapped by
   * @arm_crypto_mktable@).
   */
  while (n >= 2) { k[1] = t[0]; k[0] = t[1]; t += 2; k += 2; n -= 2; }
/* Recover @k@ from the AArch64 PMULL key table.
 *
 * NOTE(review): the declarations of @t@ and @n@ are missing from this
 * extract -- confirm against the upstream source.
 */
static void arm64_pmull_recover_k(const gcm_params *p,
                                  uint32 *k, const uint32 *ktab)

  /* The representation is already independent of the blockcipher
   * endianness.  We need to skip the duplicate pieces, and unscramble the
   * bytes (undoing the per-byte bit reversal and the byte swap).
   */
  k[0] = ENDSWAP32(rbit32(t[0]));
  k[1] = ENDSWAP32(rbit32(t[1]));
  t += 4; k += 2; n -= 2;		/* step over the duplicated copy */
  if (n) k[0] = ENDSWAP32(rbit32(t[0]));	/* odd trailing word */
/* Dispatch @recover_k@ to the implementation matching the key-table layout
 * chosen by @gcm_mktable@, falling back to @simple_recover_k@.
 *
 * NOTE(review): the CPU_DISPATCH argument-forwarding line, the function
 * braces, and the `#elif CPUFAM_ARM' / `#elif CPUFAM_ARM64' / `#endif'
 * preprocessor lines are missing from this extract.
 */
CPU_DISPATCH(static, EMPTY, void, recover_k,
             (const gcm_params *p, uint32 *k, const uint32 *ktab),
             pick_recover_k, simple_recover_k)

static recover_k__functype *pick_recover_k(void)
#if CPUFAM_X86 || CPUFAM_AMD64
  /* The picked variant must agree with @pick_mktable@, since the table
   * layouts differ between implementations.  */
  DISPATCH_PICK_COND(recover_k, pclmul_recover_k,
                     cpu_feature_p(CPUFEAT_X86_SSSE3) &&
                     cpu_feature_p(CPUFEAT_X86_PCLMUL));
  DISPATCH_PICK_COND(recover_k, arm_crypto_recover_k,
                     cpu_feature_p(CPUFEAT_ARM_PMULL));
  DISPATCH_PICK_COND(recover_k, arm64_pmull_recover_k,
                     cpu_feature_p(CPUFEAT_ARM_PMULL));
  DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k);
480 /* --- @gcm_mulk_N{b,l}@ --- *
482 * Arguments: @uint32 *a@ = accumulator to multiply
483 * @const uint32 *ktab@ = table constructed by @gcm_mktable@
487 * Use: Multiply @a@ by @k@ (implicitly represented in @ktab@),
488 * updating @a@ in-place. There are separate functions for each
489 * supported block size and endianness because this is the
490 * function whose performance actually matters.
/* Declaration and dispatch-picking macro pairs for the width-specialized
 * assembler @gcm_mulk_...@ implementations, one pair per CPU family; the
 * empty variants are used when the family's assembler isn't available.
 *
 * NOTE(review): the `#else'/`#endif' lines, the ARM and ARM64 `#if's, and
 * the `} while (0)' terminator of PICK_MULK_X86ISH are missing from this
 * extract.
 */
#if CPUFAM_X86 || CPUFAM_AMD64
# define DECL_MULK_X86ISH(var) extern gcm_mulk_##var##__functype \
    gcm_mulk_##var##_x86ish_pclmul_avx, \
    gcm_mulk_##var##_x86ish_pclmul;
# define PICK_MULK_X86ISH(var) do { \
    DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul_avx, \
                       cpu_feature_p(CPUFEAT_X86_AVX) && \
                       cpu_feature_p(CPUFEAT_X86_PCLMUL) && \
                       cpu_feature_p(CPUFEAT_X86_SSSE3)); \
    DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul, \
                       cpu_feature_p(CPUFEAT_X86_PCLMUL) && \
                       cpu_feature_p(CPUFEAT_X86_SSSE3)); \
# define DECL_MULK_X86ISH(var)
# define PICK_MULK_X86ISH(var) do ; while (0)
# define DECL_MULK_ARM(var) \
    extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm_crypto;
# define PICK_MULK_ARM(var) do { \
    DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm_crypto, \
                       cpu_feature_p(CPUFEAT_ARM_PMULL)); \
# define DECL_MULK_ARM(var)
# define PICK_MULK_ARM(var) do ; while (0)
# define DECL_MULK_ARM64(var) \
    extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm64_pmull;
# define PICK_MULK_ARM64(var) do { \
    DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm64_pmull, \
                       cpu_feature_p(CPUFEAT_ARM_PMULL)); \
# define DECL_MULK_ARM64(var)
# define PICK_MULK_ARM64(var) do ; while (0)
/* Define, for width @nbits@, the portable bitwise multiply-by-@k@
 * implementation and the CPU dispatchers for both the big-endian (`b') and
 * little-endian (`l') entry points.  The inner loop mirrors @mul@ above,
 * but reads the precomputed %$k t^i$% entries from @ktab@.
 *
 * NOTE(review): several continuation lines are missing from this extract --
 * the simple implementation's declarations (apparently @m@, @t@, and loop
 * counters), the load of @t@ from @a[i]@, and assorted braces.
 */
#define DEF_MULK(nbits) \
CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##b, \
             (uint32 *a, const uint32 *ktab), (a, ktab), \
             pick_mulk_##nbits##b, simple_mulk_##nbits) \
CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##l, \
             (uint32 *a, const uint32 *ktab), (a, ktab), \
             pick_mulk_##nbits##l, simple_mulk_##nbits) \
static void simple_mulk_##nbits(uint32 *a, const uint32 *ktab) \
  uint32 z[nbits/32]; \
  for (i = 0; i < nbits/32; i++) z[i] = 0; \
  for (i = 0; i < nbits/32; i++) { \
    for (j = 0; j < 32; j++) { \
      m = -((t >> 31)&1u); \
      for (k = 0; k < nbits/32; k++) z[k] ^= *ktab++&m; \
  for (i = 0; i < nbits/32; i++) a[i] = z[i]; \
DECL_MULK_X86ISH(nbits##b) \
DECL_MULK_ARM(nbits##b) \
DECL_MULK_ARM64(nbits##b) \
static gcm_mulk_##nbits##b##__functype *pick_mulk_##nbits##b(void) \
  PICK_MULK_X86ISH(nbits##b); \
  PICK_MULK_ARM(nbits##b); \
  PICK_MULK_ARM64(nbits##b); \
  DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits); \
DECL_MULK_X86ISH(nbits##l) \
DECL_MULK_ARM(nbits##l) \
DECL_MULK_ARM64(nbits##l) \
static gcm_mulk_##nbits##l##__functype *pick_mulk_##nbits##l(void) \
  PICK_MULK_X86ISH(nbits##l); \
  PICK_MULK_ARM(nbits##l); \
  PICK_MULK_ARM64(nbits##l); \
  DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits); \
/* Dispatch a multiply-by-@k@ to the entry point for width @n@ (in words)
 * and endianness flag @f@, expanding one case per supported width via
 * @GCM_WIDTHS@.
 *
 * NOTE(review): the `switch' header and `case' label inside
 * GCM_MULK_CASE, the `default' case, and the closing `} } while (0)' of
 * MULK are missing from this extract.
 */
#define GCM_MULK_CASE(nbits) \
    if (_f&GCMF_SWAP) gcm_mulk_##nbits##l(_a, _ktab); \
    else gcm_mulk_##nbits##b(_a, _ktab); \
#define MULK(n, f, a, ktab) do { \
  uint32 *_a = (a); const uint32 *_ktab = (ktab); \
    GCM_WIDTHS(GCM_MULK_CASE) \
602 /*----- Other utilities ---------------------------------------------------*/
604 /* --- @putlen@ --- *
606 * Arguments: @octet *p@ = pointer to output buffer
607 * @unsigned w@ = size of output buffer
608 * @unsigned blksz@ = block size (assumed fairly small)
609 * @unsigned long nblocks@ = number of blocks
610 * @unsigned nbytes@ = tail size in bytes (assumed small)
614 * Use: Store the overall length in %$\emph{bits}$% (i.e.,
 * @8*(nblocks*blksz + nbytes)@ in big-endian form in the
/* Encode the overall message length, in bits, into @w@ octets at @p@,
 * big-endian.
 *
 * NOTE(review): this extract appears to be missing the multiply-by-eight
 * step and the loop which actually stores the octets; the surviving
 * shift/merge line looks like the per-octet step from inside that loop.
 * Confirm against the upstream source.
 */
static void putlen(octet *p, unsigned w, unsigned blksz,
                   unsigned long nblocks, unsigned nbytes)
  /* Split @nblocks@ into half-word pieces so the multiplications by
   * @blksz@ can't overflow an @unsigned long@.
   */
  unsigned long nblo = nblocks&((1ul << (ULONG_BITS/2)) - 1),
    nbhi = nblocks >> ULONG_BITS/2;
  unsigned long nlo = nblo*blksz + nbytes, nhi = nbhi*blksz;

  /* This is fiddly.  Split @nblocks@, which is the big number, into high
   * and low halves, multiply those separately by @blksz@, propagate
   * carries, and then multiply by eight.
   */
  nhi += nlo >> ULONG_BITS/2;
  nlo &= (1ul << (ULONG_BITS/2)) - 1;

  /* Now write out the size, feeding bits in from @nhi@ as necessary. */
  nlo = (nlo >> 8) | ((nhi&0xff) << (ULONG_BITS/2 - 5));
645 * Arguments: @const gcm_params *p@ = pointer to the parameters
646 * @uint32 *a@ = GHASH accumulator
647 * @const octet *q@ = pointer to an input block
648 * @const uint32 *ktab@ = multiplication table, built by
653 * Use: Fold the block @q@ into the GHASH accumulator. The
654 * calculation is %$a' = k (a + q)$%.
657 static void mix(const gcm_params
*p
, uint32
*a
,
658 const octet
*q
, const uint32
*ktab
)
663 for (i
= 0; i
< p
->n
; i
++) { a
[i
] ^= LOAD32_L(q
); q
+= 4; }
665 for (i
= 0; i
< p
->n
; i
++) { a
[i
] ^= LOAD32_B(q
); q
+= 4; }
666 MULK(p
->n
, p
->f
, a
, ktab
);
669 /* --- @gcm_ghashdone@ --- *
671 * Arguments: @const gcm_params *p@ = pointer to the parameters
672 * @uint32 *a@ = GHASH accumulator
673 * @const uint32 *ktab@ = multiplication table, built by
675 * @unsigned long xblocks, yblocks@ = number of whole blocks in
677 * @unsigned xbytes, ybytes@ = number of trailing bytes in the
682 * Use: Finishes a GHASH operation by appending the appropriately
683 * encoded lengths of the two constituent messages.
686 void gcm_ghashdone(const gcm_params
*p
, uint32
*a
, const uint32
*ktab
,
687 unsigned long xblocks
, unsigned xbytes
,
688 unsigned long yblocks
, unsigned ybytes
)
691 unsigned w
= p
->n
< 3 ?
4*p
->n
: 2*p
->n
;
693 /* Construct the encoded lengths. Note that smaller-block versions of GCM
694 * encode the lengths in separate blocks. GCM is only officially defined
695 * for 64- and 128-bit blocks; I've placed the cutoff somewhat arbitrarily
698 putlen(b
, w
, 4*p
->n
, xblocks
, xbytes
);
699 putlen(b
+ w
, w
, 4*p
->n
, yblocks
, ybytes
);
701 /* Feed the lengths into the accumulator. */
703 if (p
->n
< 3) mix(p
, a
, b
+ w
, ktab
);
706 /* --- @gcm_concat@ --- *
708 * Arguments: @const gcm_params *p@ = pointer to the parameters
709 * @uint32 *z@ = GHASH accumulator for suffix, updated
710 * @const uint32 *x@ = GHASH accumulator for prefix
711 * @const uint32 *ktab@ = multiplication table, built by
713 * @unsigned long n@ = length of suffix in whole blocks
717 * Use: On entry, @x@ and @z@ are the results of hashing two strings
718 * %$a$% and %$b$%, each a whole number of blocks long; in
719 * particular, %$b$% is @n@ blocks long. On exit, @z@ is
720 * updated to be the hash of %$a \cat b$%.
723 void gcm_concat(const gcm_params
*p
, uint32
*z
, const uint32
*x
,
724 const uint32
*ktab
, unsigned long n
)
726 uint32 t
[GCM_NMAX
], u
[GCM_NMAX
];
730 /* If @n@ is zero, then there's not much to do. The mathematics
731 * (explained below) still works, but the code takes a shortcut which
732 * doesn't handle this case: so set %$z' = z + x k^n = z + x$%.
735 for (j
= 0; j
< p
->n
; j
++) z
[j
] ^= x
[j
];
737 /* We have %$x = a_0 t^m + \cdots + a_{m-2} t^2 + a_{m-1} t$% and
738 * %$z = b_0 t^n + \cdots + b_{n-2} t^2 + b_{n-1} t$%. What we'd like is
739 * the hash of %$a \cat b$%, which is %$z + x k^n$%.
741 * The first job, then, is to calculate %$k^n$%, and for this we use a
742 * simple left-to-right square-and-multiply algorithm. There's no need
743 * to keep %$n$% secret here.
746 /* Start by retrieving %$k$% from the table, and convert it to big-endian
749 recover_k(p
, u
, ktab
);
751 /* Now calculate %$k^n$%. */
753 #define BIT (1ul << (ULONG_BITS - 1))
754 while (!(n
&BIT
)) { n
<<= 1; i
--; }
755 n
<<= 1; i
--; for (j
= 0; j
< p
->n
; j
++) t
[j
] = u
[j
];
756 while (i
--) { mul(p
, t
, t
, t
); if (n
&BIT
) mul(p
, t
, t
, u
); n
<<= 1; }
759 /* Next, calculate %$x k^n$%. If we're using a little-endian convention
760 * then we must convert %$x$%; otherwise we can just use it in place.
762 if (!(p
->f
&GCMF_SWAP
))
765 for (j
= 0; j
< p
->n
; j
++) u
[j
] = ENDSWAP32(x
[j
]);
769 /* Finally, add %$x k^n$% onto %$z$%, converting back to little-endian if
772 if (!(p
->f
&GCMF_SWAP
)) for (j
= 0; j
< p
->n
; j
++) z
[j
] ^= t
[j
];
773 else for (j
= 0; j
< p
->n
; j
++) z
[j
] ^= ENDSWAP32(t
[j
]);
777 /*----- Test rig ----------------------------------------------------------*/
781 #include <mLib/macros.h>
782 #include <mLib/quis.h>
783 #include <mLib/testrig.h>
/* Print a failure report for @test@ at width @nbits@: dump the operand
 * vectors from @v@ and the value @d@ actually computed for reference
 * @ref@.
 *
 * NOTE(review): the function braces, and presumably a final newline
 * print, are missing from this extract.
 */
static void report_failure(const char *test, unsigned nbits,
                           const char *ref, dstr v[], dstr *d)
  printf("test %s failed (nbits = %u)", test, nbits);
  printf("\n\tx = "); type_hex.dump(&v[0], stdout);
  printf("\n\ty = "); type_hex.dump(&v[1], stdout);
  printf("\n\tz = "); type_hex.dump(&v[2], stdout);
  printf("\n\t%s' = ", ref); type_hex.dump(d, stdout);
796 static void mulk(unsigned nbits
, unsigned f
, uint32
*x
, const uint32
*ktab
)
797 { MULK(nbits
/32, f
, x
, ktab
); }
/* Exercise the multiplication machinery over the reduction polynomial
 * @poly@ against the test vector @v@ (operands x and y, expected product
 * z); reports failures via @report_failure@.
 *
 * NOTE(review): this extract appears to be missing the declarations of
 * @p@, @nbits@, @i@, @d@, and @ok@, the `} while (0)' terminators of the
 * local macros (and their `#undef's), and the cleanup/return code --
 * confirm against the upstream source.
 */
static int test_mul(uint32 poly, dstr v[])
  uint32 x[GCM_NMAX], y[GCM_NMAX], z[GCM_NMAX], ktab[32*GCM_NMAX*GCM_NMAX];
  enum { I_x, I_y, I_z };		/* test-vector argument indices */

  /* Derive the parameters from the vector length: initially big-endian. */
  nbits = 8*v[0].len; p.f = 0; p.n = nbits/32; p.poly = poly;
  dstr_ensure(&d, nbits/8); d.len = nbits/8;

  /* Load the operands x and y from the vector with endianness E. */
#define LOADXY(E) do { \
  for (i = 0; i < nbits/32; i++) { \
    x[i] = LOAD32_##E(v[I_x].buf + 4*i); \
    y[i] = LOAD32_##E(v[I_y].buf + 4*i); \
  /* Initialize the working accumulator z from operand x. */
#define INITZ(x) do { \
  for (i = 0; i < nbits/32; i++) z[i] = (x)[i]; \
  /* Store z with endianness E and compare against reference `ref'. */
#define CHECK(E, what, ref) do { \
  for (i = 0; i < nbits/32; i++) STORE32_##E(d.buf + 4*i, z[i]); \
  if (MEMCMP(d.buf, !=, v[I_##ref].buf, nbits/8)) \
    { ok = 0; report_failure(what, nbits, #ref, v, &d); } \
  /* Build a key table from y, check it round-trips through recover_k,
   * then check that multiplying x by the table gives the product. */
#define TEST_PREP_1(E, x, y, what) do { \
  gcm_mktable(&p, ktab, y); \
  recover_k(&p, z, ktab); CHECK(B, "mktable/recover_k (" #y ")", y); \
  INITZ(x); mulk(nbits, p.f, z, ktab); CHECK(E, what " (k = " #y ")", z); \
  /* Run the prepared-key test in both operand orders. */
#define TEST_PREP(E, what) do { \
  TEST_PREP_1(E, x, y, what); \
  TEST_PREP_1(E, y, x, what); \

  /* First, test plain multiply. */
  LOADXY(B); mul(&p, z, x, y); CHECK(B, "gcm_mul", z);

  /* Next, test big-endian prepared key. */
  LOADXY(B); TEST_PREP(B, "gcm_kmul_b");

  /* Finally, test little-endian prepared key. */
  p.f = GCMF_SWAP; LOADXY(L);
  TEST_PREP(L, "gcm_kmul_l");
/* Define a per-width test driver calling @test_mul@ with the matching
 * reduction polynomial.
 * NOTE(review): the `GCM_WIDTHS(TEST)' expansion and matching `#undef
 * TEST' appear to be missing from this extract.
 */
#define TEST(nbits) \
  static int test_mul_##nbits(dstr v[]) \
  { return (test_mul(GCM_POLY_##nbits, v)); }
/* Table of test chunks for the mLib test rig: one `gcm-mulNNN' entry per
 * supported block width, each taking three hex arguments (x, y, z).
 * NOTE(review): the `GCM_WIDTHS(TEST)' expansion and the terminating
 * sentinel entry appear to be missing from this extract.
 */
static test_chunk defs[] = {
#define TEST(nbits) \
  { "gcm-mul" #nbits, test_mul_##nbits, \
    { &type_hex, &type_hex, &type_hex, 0 } },
874 int main(int argc
, char *argv
[])
877 test_run(argc
, argv
, defs
, SRCDIR
"/t/gcm");
883 /*----- That's all, folks -------------------------------------------------*/