mdw@git.distorted.org.uk Git - catacomb/blob - symm/gcm.c

   1 /* -*-c-*-
   2  *
   3  * The GCM authenticated encryption mode
   4  *
   5  * (c) 2017 Straylight/Edgeware
   6  */
   7
   8 /*----- Licensing notice --------------------------------------------------*
   9  *
  10  * This file is part of Catacomb.
  11  *
  12  * Catacomb is free software: you can redistribute it and/or modify it
  13  * under the terms of the GNU Library General Public License as published
  14  * by the Free Software Foundation; either version 2 of the License, or
  15  * (at your option) any later version.
  16  *
  17  * Catacomb is distributed in the hope that it will be useful, but
  18  * WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Library General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Library General Public
  23  * License along with Catacomb.  If not, write to the Free Software
  24  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  25  * USA.
  26  */
  27
  28 /*----- Header files ------------------------------------------------------*/
  29
  30 #include "config.h"
  31
  32 #include <stdio.h>
  33
  34 #include <mLib/bits.h>
  35
  36 #include "gcm.h"
  37 #include "gcm-def.h"
  38
  39 /*----- Overall strategy --------------------------------------------------*
  40  *
  41  * GCM is pretty awful to implement in software.  (This presentation is going
  42  * to be somewhat different to that in the specification, but I think it
  43  * makes more sense like this.)
  44  *
  45  * We're given a %$w$%-bit blockcipher %$E$% with a key %$K$%.
  46  *
  47  * The main part is arithmetic in the finite field %$k = \gf{2^w}$%, which we
  48  * represent as the quotient ring %$\gf{2}[t]/(p_w(t))$% for some irreducible
  49  * degree-%$w$% polynomial %$p(t)$%, whose precise value isn't very important
  50  * right now.  We choose a secret point %$x = E_K(0^w)$%.
  51  *
  52  * We choose a length size %$z$% as follows: if %$w < 96%$ then %$z = w$%;
  53  * otherwise %$z = w/2$%.  Format a message pair as follows:
  54  *
  55  *      %$F(a, b) = P_w(a) \cat P_w(b) \cat [\ell(a)]_z \cat [\ell(b)]_z$%
  56  *
  57  * where %$P_w(x) = x \cat 0^n$% where $%0 \le n < w$% such that
  58  * %$\ell(x) + n \equiv 0 \pmod{w}$%.
  59  *
  60  * Hash a (block-aligned) message %$u$% as follows.  First, split %$u$% into
  61  * %$w$%-bit blocks %$u_0$%, %$u_1$%, %%\ldots%%, %$u_{n-1}$%.  Interpret
  62  * these as elements of %$k$%.  Then
  63  *
  64  *      %$G_x(u) = u_0 t^n + u_1 t^{n-1} + \cdots + u_{n-1} t$%
  65  *
  66  * converted back to a %$w$%-bit string.
  67  *
  68  * We're ready to go now.  Suppose we're to encrypt a message %$M$% with
  69  * header %$H$% and nonce %$N$%.  If %$\ell(N) + 32 = w$% then let
  70  * %$N' = N$% and let %$i_0 = 1$%; otherwise, let %$U = G_t(F(\epsilon, N))$%
  71  * and split this into %$N' = U[0 \bitsto w - 32]$% and
  72  * %$[i_0]_{32} = U[w - 32 \bitsto w]$%.
  73  *
  74  * Let %$n = \lceil \ell(M)/w \rceil$%.  Compute
  75  *
  76  *      %$y_j = E_K(N' \cat [i_0 + j]_{32})$%
  77  *
  78  * for %$0 \le j \le n$%.  Let
  79  *
  80  *      %$s = (y_1 \cat y_2 \cat \cdots \cat y_n)[0 \bitsto \ell(M)$%
  81  *
  82  * Let %$C = M \xor s$% and let %$T = G_x(F(H, C)) \xor y_0$%.  These are the
  83  * ciphertext and tag respectively.
  84  *
  85  * So why is this awful?
  86  *
  87  * For one thing, the bits are in a completely terrible order.  The bytes are
  88  * arranged in little-endian order, so the unit coefficient is in the first
  89  * byte, and the degree-127 coefficient is in the last byte.  But within each
  90  * byte, the lowest-degree coefficient is in the most significant bit.  It's
  91  * therefore better to think of GCM as using a big-endian byte-ordering
  92  * convention, but with the bits backwards.
  93  *
  94  * But messing about with byte ordering is expensive, so let's not do that in
  95  * the inner loop.  But multiplication in %$k$% is not easy either.  Some
  96  * kind of precomputed table would be nice, but that will leak secrets
  97  * through the cache.
  98  *
  99  * I choose a particularly simple table: given %$x$%, let %$X[i'] = x t^i$%.
 100  * Then $%$x y = \sum_{0\le i<w} y_i X[i']$% which is just a bunch of
 101  * bitmasking.  But the natural order for examining bits of %$y$% is not
 102  * necessarily the obvious one.  We'll have already loaded %$y$% into
 103  * internal form, as 32-bit words.  The good order to process these is left
 104  * to right, from high to low bits.  But now the order of degrees depends on
 105  * the endianness of our conversion of bytes to words.  Oh, well.
 106  *
 107  * If we've adopted a big-endian convention, then we'll see the degrees in
 108  * order, 0, 1, ..., all the way up to %$w - 1$% and everything is fine.  If
 109  * we've adopted a little-endian convention, though, we'll see an ordering
 110  * like this:
 111  *
 112  *      24, 25, ..., 31, 16, 17, ..., 23,  8,  9, ..., 15,  0,  1, ..., 7,
 113  *      56, 57, ..., 63, 48, 49, ..., 55, 40, 41, ..., 47, 32, 33, ..., 39,
 114  *      etc.
 115  *
 116  * which is the ordinary order with 0x18 = 24 XORed into the index.  That is,
 117  * %$i' = i$% if we've adopted a big-endian convention, and
 118  * %$i' = i \xor 24$% if we've adopted a little-endian convention.
 119  */
 120
 121 /*----- Low-level utilities -----------------------------------------------*/
 122
 123 /* --- @mult@ --- *
 124  *
 125  * Arguments:   @const gcm_params *p@ = pointer to the parameters
 126  *              @uint32 *z@ = where to write the result
 127  *              @const uint32 *x@ = input field element
 128  *
 129  * Returns:     ---
 130  *
 131  * Use:         Multiply the input field element by %$t$%, and write the
 132  *              product to @z@.  It's safe for @x@ and @z@ to be equal, but
 133  *              they should not otherwise overlap.  Both input and output are
 134  *              in big-endian form, i.e., with the lowest-degree coefficients
 135  *              in the most significant bits.
 136  */
 137
 138 static void mult(const gcm_params *p, uint32 *z, const uint32 *x)
 139 {
 140   uint32 m, c, t;
 141   unsigned i;
 142
 143   t = x[p->n - 1]; m = -(t&1u); c = m&p->poly;
 144   for (i = 0; i < p->n; i++) { t = x[i]; z[i] = (t >> 1) ^ c; c = t << 31; }
 145 }
 146
 147 /* --- @mul@ --- *
 148  *
 149  * Arguments:   @const gcm_params *p@ = pointer to the parameters
 150  *              @uint32 *z@ = where to write the result
 151  *              @const uint32 *x, *y@ = input field elements
 152  *
 153  * Returns:     ---
 154  *
 155  * Use:         Multiply the input field elements together, and write the
 156  *              product to @z@.  It's safe for the operands to overlap.  Both
 157  *              inputs and the output are in big-endian form, i.e., with the
 158  *              lowest-degree coefficients in the most significant bits.
 159  */
 160
 161 static void mul(const gcm_params *p, uint32 *z,
 162                 const uint32 *x, const uint32 *y)
 163 {
 164   uint32 m, t, u[GCM_NMAX], v[GCM_NMAX];
 165   unsigned i, j, k;
 166
 167   /* We can't do this in-place at all, so use temporary space.  Make a copy
 168    * of @x@ in @u@, where we can clobber it, and build the product in @v@.
 169    */
 170   for (i = 0; i < p->n; i++) { u[i] = x[i]; v[i] = 0; }
 171
 172   /* Repeatedly multiply @x@ (in @u@) by %$t$%, and add together those
 173    * %$x t^i$% selected by the bits of @y@.  This is basically what you get
 174    * by streaming the result of @gcm_mktable@ into @gcm_mulk_...@.
 175    */
 176   for (i = 0; i < p->n; i++) {
 177     t = y[i];
 178     for (j = 0; j < 32; j++) {
 179       m = -((t >> 31)&1u);
 180       for (k = 0; k < p->n; k++) v[k] ^= u[k]&m;
 181       mult(p, u, u); t <<= 1;
 182     }
 183   }
 184
 185   /* Write out the result now that it's ready. */
 186   for (i = 0; i < p->n; i++) z[i] = v[i];
 187 }
 188
 189 /*----- Table-based multiplication ----------------------------------------*/
 190
 191 /* --- @gcm_mktable@ --- *
 192  *
 193  * Arguments:   @const gcm_params *p@ = pointer to the parameters
 194  *              @uint32 *ktab@ = where to write the table; there must be
 195  *                      space for %$32 n$% $%n$%-word entries, i.e.,
 196  *                      %$32 n^2$% 32-bit words in total, where %$n$% is
 197  *                      @p->n@, the block size in words
 198  *              @const uint32 *k@ = input field element
 199  *
 200  * Returns:     ---
 201  *
 202  * Use:         Construct a table for use by @gcm_mulk_...@ below, to
 203  *              multiply (vaguely) efficiently by @k@.
 204  */
 205
 206 void gcm_mktable(const gcm_params *p, uint32 *ktab, const uint32 *k)
 207 {
 208   unsigned m = (p->f&GCMF_SWAP ? 0x18 : 0);
 209   unsigned i, j, o = m*p->n;
 210
 211   /* As described above, the table stores entries %$K[i \xor m] = k t^i$%,
 212    * where %$m = 0$% (big-endian cipher) or %$m = 24$% (little-endian).
 213    * The first job is to store %$K[m] = k$%.
 214    *
 215    * We initially build the table with the entries in big-endian order, and
 216    * then swap them if necessary.  This makes the arithmetic functions more
 217    * amenable for use by @gcm_concat@ below.
 218    */
 219   if (!(p->f&GCMF_SWAP)) for (i = 0; i < p->n; i++) ktab[o + i] = k[i];
 220   else for (i = 0; i < p->n; i++) ktab[o + i] = ENDSWAP32(k[i]);
 221
 222   /* Fill in the rest of the table by repeatedly multiplying the previous
 223    * entry by %$t$%.
 224    */
 225   for (i = 1; i < 32*p->n; i++)
 226     { j = (i ^ m)*p->n; mult(p, ktab + j, ktab + o); o = j; }
 227
 228   /* Finally, if the cipher uses a little-endian convention, then swap all of
 229    * the individual words.
 230    */
 231   if (p->f&GCMF_SWAP)
 232     for (i = 0; i < 32*p->n*p->n; i++) ktab[i] = ENDSWAP32(ktab[i]);
 233 }
 234
 235 /* --- @gcm_mulk_N@ --- *
 236  *
 237  * Arguments:   @uint32 *a@ = accumulator to multiply
 238  *              @const uint32 *ktab@ = table constructed by @gcm_mktable@
 239  *
 240  * Returns:     ---
 241  *
 242  * Use:         Multiply @a@ by @k@ (implicitly represented in @ktab@),
 243  *              updating @a@ in-place.  There are separate functions for each
 244  *              supported block size because this is the function whose
 245  *              performance actually matters.
 246  */
 247
 248 #define DEF_MULK(nbits)                                                 \
 249 void gcm_mulk_##nbits(uint32 *a, const uint32 *ktab)                    \
 250 {                                                                       \
 251   uint32 m, t;                                                          \
 252   uint32 z[nbits/32];                                                   \
 253   unsigned i, j, k;                                                     \
 254                                                                         \
 255   for (i = 0; i < nbits/32; i++) z[i] = 0;                              \
 256                                                                         \
 257   for (i = 0; i < nbits/32; i++) {                                      \
 258     t = a[i];                                                           \
 259     for (j = 0; j < 32; j++) {                                          \
 260       m = -((t >> 31)&1u);                                              \
 261       for (k = 0; k < nbits/32; k++) z[k] ^= *ktab++&m;                 \
 262       t <<= 1;                                                          \
 263     }                                                                   \
 264   }                                                                     \
 265                                                                         \
 266   for (i = 0; i < nbits/32; i++) a[i] = z[i];                           \
 267 }
 268 GCM_WIDTHS(DEF_MULK)
 269
 270 /*----- Other utilities ---------------------------------------------------*/
 271
 272 /* --- @putlen@ --- *
 273  *
 274  * Arguments:   @octet *p@ = pointer to output buffer
 275  *              @unsigned w@ = size of output buffer
 276  *              @unsigned blksz@ = block size (assumed fairly small)
 277  *              @unsigned long nblocks@ = number of blocks
 278  *              @unsigned nbytes@ = tail size in bytes (assumed small)
 279  *
 280  * Returns:     ---
 281  *
 282  * Use:         Store the overall length in %$\emph{bits}$% (i.e.,
 283  *              @3*(nblocks*blksz + nbytes)@ in big-endian form in the
 284  *              buffer @p@.
 285  */
 286
 287 static void putlen(octet *p, unsigned w, unsigned blksz,
 288                    unsigned long nblocks, unsigned nbytes)
 289 {
 290   unsigned long nblo = nblocks&((1ul << (ULONG_BITS/2)) - 1),
 291     nbhi = nblocks >> ULONG_BITS/2;
 292   unsigned long nlo = nblo*blksz + nbytes, nhi = nbhi*blksz;
 293
 294   /* This is fiddly.  Split @nblocks@, which is the big number, into high and
 295    * low halves, multiply those separately by @blksz@, propagate carries, and
 296    * then multiply by eight.
 297    */
 298   nhi += nlo >> ULONG_BITS/2;
 299   nlo &= (1ul << (ULONG_BITS/2)) - 1;
 300   nlo <<= 3;
 301
 302   /* Now write out the size, feeding bits in from @nhi@ as necessary. */
 303   p += w;
 304   while (w--) {
 305     *--p = U8(nlo);
 306     nlo = (nlo >> 8) | ((nhi&0xff) << (ULONG_BITS/2 - 5));
 307     nhi >>= 8;
 308   }
 309 }
 310
 311 /* --- @mix@ --- *
 312  *
 313  * Arguments:   @const gcm_params *p@ = pointer to the parameters
 314  *              @uint32 *a@ = GHASH accumulator
 315  *              @const octet *q@ = pointer to an input block
 316  *              @const uint32 *ktab@ = multiplication table, built by
 317  *                      @gcm_mktable@
 318  *
 319  * Returns:     ---
 320  *
 321  * Use:         Fold the block @q@ into the GHASH accumulator.  The
 322  *              calculation is %$a' = k (a + q)$%.
 323  */
 324
 325 static void mix(const gcm_params *p, uint32 *a,
 326                 const octet *q, const uint32 *ktab)
 327 {
 328   unsigned i;
 329
 330   /* Convert the block from bytes into words, using the appropriate
 331    * convention.
 332    */
 333   if (p->f&GCMF_SWAP)
 334     for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_L(q); q += 4; }
 335   else
 336     for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_B(q); q += 4; }
 337
 338   /* Dispatch to the correct multiply-by-%$k$% function. */
 339   switch (p->n) {
 340 #define CASE(nbits) case nbits/32: gcm_mulk_##nbits(a, ktab); break;
 341     GCM_WIDTHS(CASE)
 342 #undef CASE
 343     default: abort();
 344   }
 345 }
 346
 347 /* --- @gcm_ghashdone@ --- *
 348  *
 349  * Arguments:   @const gcm_params *p@ = pointer to the parameters
 350  *              @uint32 *a@ = GHASH accumulator
 351  *              @const uint32 *ktab@ = multiplication table, built by
 352  *                      @gcm_mktable@
 353  *              @unsigned long xblocks, yblocks@ = number of whole blocks in
 354  *                      the two inputs
 355  *              @unsigned xbytes, ybytes@ = number of trailing bytes in the
 356  *                      two inputs
 357  *
 358  * Returns:     ---
 359  *
 360  * Use:         Finishes a GHASH operation by appending the appropriately
 361  *              encoded lengths of the two constituent messages.
 362  */
 363
 364 void gcm_ghashdone(const gcm_params *p, uint32 *a, const uint32 *ktab,
 365                    unsigned long xblocks, unsigned xbytes,
 366                    unsigned long yblocks, unsigned ybytes)
 367 {
 368   octet b[4*GCM_NMAX];
 369   unsigned w = p->n < 3 ? 4*p->n : 2*p->n;
 370
 371   /* Construct the encoded lengths.  Note that smaller-block versions of GCM
 372    * encode the lengths in separate blocks.  GCM is only officially defined
 373    * for 64- and 128-bit blocks; I've placed the cutoff somewhat arbitrarily
 374    * at 96 bits.
 375    */
 376   putlen(b,     w, 4*p->n, xblocks, xbytes);
 377   putlen(b + w, w, 4*p->n, yblocks, ybytes);
 378
 379   /* Feed the lengths into the accumulator. */
 380   mix(p, a, b, ktab);
 381   if (p->n < 3) mix(p, a, b + w, ktab);
 382 }
 383
 384 /* --- @gcm_concat@ --- *
 385  *
 386  * Arguments:   @const gcm_params *p@ = pointer to the parameters
 387  *              @uint32 *z@ = GHASH accumulator for suffix, updated
 388  *              @const uint32 *x@ = GHASH accumulator for prefix
 389  *              @const uint32 *ktab@ = multiplication table, built by
 390  *                      @gcm_mktable@
 391  *              @unsigned long n@ = length of suffix in whole blocks
 392  *
 393  * Returns:     ---
 394  *
 395  * Use:         On entry, @x@ and @z@ are the results of hashing two strings
 396  *              %$a$% and %$b$%, each a whole number of blocks long; in
 397  *              particular, %$b$% is @n@ blocks long.  On exit, @z@ is
 398  *              updated to be the hash of %$a \cat b$%.
 399  */
 400
 401 void gcm_concat(const gcm_params *p, uint32 *z, const uint32 *x,
 402                 const uint32 *ktab, unsigned long n)
 403 {
 404   uint32 t[GCM_NMAX], u[GCM_NMAX];
 405   unsigned i, j;
 406
 407   if (!n) {
 408     /* If @n@ is zero, then there's not much to do.  The mathematics
 409      * (explained below) still works, but the code takes a shortcut which
 410      * doesn't handle this case: so set %$z' = z + x k^n = z + x$%.
 411      */
 412
 413     for (j = 0; j < p->n; j++) z[j] ^= x[j];
 414   } else {
 415     /* We have %$x = a_0 t^m + \cdots + a_{m-2} t^2 + a_{m-1} t$% and
 416      * %$z = b_0 t^n + \cdots + b_{n-2} t^2 + b_{n-1} t$%.  What we'd like is
 417      * the hash of %$a \cat b$%, which is %$z + x k^n$%.
 418      *
 419      * The first job, then, is to calculate %$k^n$%, and for this we use a
 420      * simple left-to-right square-and-multiply algorithm.  There's no need
 421      * to keep %$n$% secret here.
 422      */
 423
 424     /* Start by retrieving %$k$% from the table, and convert it to big-endian
 425      * form.
 426      */
 427     if (!(p->f&GCMF_SWAP)) for (j = 0; j < p->n; j++) u[j] = ktab[j];
 428     else for (j = 0; j < p->n; j++) u[j] = ENDSWAP32(ktab[24*p->n + j]);
 429
 430     /* Now calculate %$k^n$%. */
 431     i = ULONG_BITS;
 432 #define BIT (1ul << (ULONG_BITS - 1))
 433     while (!(n&BIT)) { n <<= 1; i--; }
 434     n <<= 1; i--; for (j = 0; j < p->n; j++) t[j] = u[j];
 435     while (i--) { mul(p, t, t, t); if (n&BIT) mul(p, t, t, u); n <<= 1; }
 436 #undef BIT
 437
 438     /* Next, calculate %$x k^n$%.  If we're using a little-endian convention
 439      * then we must convert %$x$%; otherwise we can just use it in place.
 440      */
 441     if (!(p->f&GCMF_SWAP))
 442       mul(p, t, t, x);
 443     else {
 444       for (j = 0; j < p->n; j++) u[j] = ENDSWAP32(x[j]);
 445       mul(p, t, t, u);
 446     }
 447
 448     /* Finally, add %$x k^n$% onto %$z$%, converting back to little-endian if
 449      * necessary.
 450      */
 451     if (!(p->f&GCMF_SWAP)) for (j = 0; j < p->n; j++) z[j] ^= t[j];
 452     else for (j = 0; j < p->n; j++) z[j] ^= ENDSWAP32(t[j]);
 453   }
 454 }
 455
 456 /*----- That's all, folks -------------------------------------------------*/