symm/ocb3.h, symm/ocb3-def.h: Implement the OCB3 auth'ned encryption mode.
[catacomb] / symm / gcm.c
CommitLineData
50df5733
MW
1/* -*-c-*-
2 *
3 * The GCM authenticated encryption mode
4 *
5 * (c) 2017 Straylight/Edgeware
6 */
7
8/*----- Licensing notice --------------------------------------------------*
9 *
10 * This file is part of Catacomb.
11 *
12 * Catacomb is free software: you can redistribute it and/or modify it
13 * under the terms of the GNU Library General Public License as published
14 * by the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * Catacomb is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Library General Public License for more details.
21 *
22 * You should have received a copy of the GNU Library General Public
23 * License along with Catacomb. If not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
25 * USA.
26 */
27
28/*----- Header files ------------------------------------------------------*/
29
30#include "config.h"
31
32#include <stdio.h>
33
34#include <mLib/bits.h>
35
36#include "gcm.h"
37#include "gcm-def.h"
38
39/*----- Overall strategy --------------------------------------------------*
40 *
41 * GCM is pretty awful to implement in software. (This presentation is going
42 * to be somewhat different to that in the specification, but I think it
43 * makes more sense like this.)
44 *
45 * We're given a %$w$%-bit blockcipher %$E$% with a key %$K$%.
46 *
47 * The main part is arithmetic in the finite field %$k = \gf{2^w}$%, which we
48 * represent as the quotient ring %$\gf{2}[t]/(p_w(t))$% for some irreducible
49 * degree-%$w$% polynomial %$p(t)$%, whose precise value isn't very important
50 * right now. We choose a secret point %$x = E_K(0^w)$%.
51 *
52 * We choose a length size %$z$% as follows: if %$w < 96%$ then %$z = w$%;
53 * otherwise %$z = w/2$%. Format a message pair as follows:
54 *
55 * %$F(a, b) = P_w(a) \cat P_w(b) \cat [\ell(a)]_z \cat [\ell(b)]_z$%
56 *
57 * where %$P_w(x) = x \cat 0^n$% where $%0 \le n < w$% such that
58 * %$\ell(x) + n \equiv 0 \pmod{w}$%.
59 *
60 * Hash a (block-aligned) message %$u$% as follows. First, split %$u$% into
61 * %$w$%-bit blocks %$u_0$%, %$u_1$%, %%\ldots%%, %$u_{n-1}$%. Interpret
62 * these as elements of %$k$%. Then
63 *
64 * %$G_x(u) = u_0 t^n + u_1 t^{n-1} + \cdots + u_{n-1} t$%
65 *
66 * converted back to a %$w$%-bit string.
67 *
68 * We're ready to go now. Suppose we're to encrypt a message %$M$% with
69 * header %$H$% and nonce %$N$%. If %$\ell(N) + 32 = w$% then let
70 * %$N' = N$% and let %$i_0 = 1$%; otherwise, let %$U = G_t(F(\epsilon, N))$%
71 * and split this into %$N' = U[0 \bitsto w - 32]$% and
72 * %$[i_0]_{32} = U[w - 32 \bitsto w]$%.
73 *
74 * Let %$n = \lceil \ell(M)/w \rceil$%. Compute
75 *
76 * %$y_j = E_K(N' \cat [i_0 + j]_{32})$%
77 *
78 * for %$0 \le j \le n$%. Let
79 *
80 * %$s = (y_1 \cat y_2 \cat \cdots \cat y_n)[0 \bitsto \ell(M)$%
81 *
82 * Let %$C = M \xor s$% and let %$T = G_x(F(H, C)) \xor y_0$%. These are the
83 * ciphertext and tag respectively.
84 *
85 * So why is this awful?
86 *
87 * For one thing, the bits are in a completely terrible order. The bytes are
88 * arranged in little-endian order, so the unit coefficient is in the first
89 * byte, and the degree-127 coefficient is in the last byte. But within each
90 * byte, the lowest-degree coefficient is in the most significant bit. It's
91 * therefore better to think of GCM as using a big-endian byte-ordering
92 * convention, but with the bits backwards.
93 *
94 * But messing about with byte ordering is expensive, so let's not do that in
95 * the inner loop. But multiplication in %$k$% is not easy either. Some
96 * kind of precomputed table would be nice, but that will leak secrets
97 * through the cache.
98 *
99 * I choose a particularly simple table: given %$x$%, let %$X[i'] = x t^i$%.
100 * Then $%$x y = \sum_{0\le i<w} y_i X[i']$% which is just a bunch of
101 * bitmasking. But the natural order for examining bits of %$y$% is not
102 * necessarily the obvious one. We'll have already loaded %$y$% into
103 * internal form, as 32-bit words. The good order to process these is left
104 * to right, from high to low bits. But now the order of degrees depends on
105 * the endianness of our conversion of bytes to words. Oh, well.
106 *
107 * If we've adopted a big-endian convention, then we'll see the degrees in
108 * order, 0, 1, ..., all the way up to %$w - 1$% and everything is fine. If
109 * we've adopted a little-endian convention, though, we'll see an ordering
110 * like this:
111 *
112 * 24, 25, ..., 31, 16, 17, ..., 23, 8, 9, ..., 15, 0, 1, ..., 7,
113 * 56, 57, ..., 63, 48, 49, ..., 55, 40, 41, ..., 47, 32, 33, ..., 39,
114 * etc.
115 *
116 * which is the ordinary order with 0x18 = 24 XORed into the index. That is,
117 * %$i' = i$% if we've adopted a big-endian convention, and
118 * %$i' = i \xor 24$% if we've adopted a little-endian convention.
119 */
120
121/*----- Low-level utilities -----------------------------------------------*/
122
123/* --- @mult@ --- *
124 *
125 * Arguments: @const gcm_params *p@ = pointer to the parameters
126 * @uint32 *z@ = where to write the result
127 * @const uint32 *x@ = input field element
128 *
129 * Returns: ---
130 *
131 * Use: Multiply the input field element by %$t$%, and write the
132 * product to @z@. It's safe for @x@ and @z@ to be equal, but
133 * they should not otherwise overlap. Both input and output are
134 * in big-endian form, i.e., with the lowest-degree coefficients
135 * in the most significant bits.
136 */
137
138static void mult(const gcm_params *p, uint32 *z, const uint32 *x)
139{
140 uint32 m, c, t;
141 unsigned i;
142
143 t = x[p->n - 1]; m = -(t&1u); c = m&p->poly;
144 for (i = 0; i < p->n; i++) { t = x[i]; z[i] = (t >> 1) ^ c; c = t << 31; }
145}
146
147/* --- @mul@ --- *
148 *
149 * Arguments: @const gcm_params *p@ = pointer to the parameters
150 * @uint32 *z@ = where to write the result
151 * @const uint32 *x, *y@ = input field elements
152 *
153 * Returns: ---
154 *
155 * Use: Multiply the input field elements together, and write the
156 * product to @z@. It's safe for the operands to overlap. Both
157 * inputs and the output are in big-endian form, i.e., with the
158 * lowest-degree coefficients in the most significant bits.
159 */
160
161static void mul(const gcm_params *p, uint32 *z,
162 const uint32 *x, const uint32 *y)
163{
164 uint32 m, t, u[GCM_NMAX], v[GCM_NMAX];
165 unsigned i, j, k;
166
167 /* We can't do this in-place at all, so use temporary space. Make a copy
168 * of @x@ in @u@, where we can clobber it, and build the product in @v@.
169 */
170 for (i = 0; i < p->n; i++) { u[i] = x[i]; v[i] = 0; }
171
172 /* Repeatedly multiply @x@ (in @u@) by %$t$%, and add together those
173 * %$x t^i$% selected by the bits of @y@. This is basically what you get
174 * by streaming the result of @gcm_mktable@ into @gcm_mulk_...@.
175 */
176 for (i = 0; i < p->n; i++) {
177 t = y[i];
178 for (j = 0; j < 32; j++) {
179 m = -((t >> 31)&1u);
180 for (k = 0; k < p->n; k++) v[k] ^= u[k]&m;
181 mult(p, u, u); t <<= 1;
182 }
183 }
184
185 /* Write out the result now that it's ready. */
186 for (i = 0; i < p->n; i++) z[i] = v[i];
187}
188
189/*----- Table-based multiplication ----------------------------------------*/
190
191/* --- @gcm_mktable@ --- *
192 *
193 * Arguments: @const gcm_params *p@ = pointer to the parameters
194 * @uint32 *ktab@ = where to write the table; there must be
195 * space for %$32 n$% $%n$%-word entries, i.e.,
196 * %$32 n^2$% 32-bit words in total, where %$n$% is
197 * @p->n@, the block size in words
198 * @const uint32 *k@ = input field element
199 *
200 * Returns: ---
201 *
202 * Use: Construct a table for use by @gcm_mulk_...@ below, to
203 * multiply (vaguely) efficiently by @k@.
204 */
205
206void gcm_mktable(const gcm_params *p, uint32 *ktab, const uint32 *k)
207{
208 unsigned m = (p->f&GCMF_SWAP ? 0x18 : 0);
209 unsigned i, j, o = m*p->n;
210
211 /* As described above, the table stores entries %$K[i \xor m] = k t^i$%,
212 * where %$m = 0$% (big-endian cipher) or %$m = 24$% (little-endian).
213 * The first job is to store %$K[m] = k$%.
214 *
215 * We initially build the table with the entries in big-endian order, and
216 * then swap them if necessary. This makes the arithmetic functions more
217 * amenable for use by @gcm_concat@ below.
218 */
219 if (!(p->f&GCMF_SWAP)) for (i = 0; i < p->n; i++) ktab[o + i] = k[i];
220 else for (i = 0; i < p->n; i++) ktab[o + i] = ENDSWAP32(k[i]);
221
222 /* Fill in the rest of the table by repeatedly multiplying the previous
223 * entry by %$t$%.
224 */
225 for (i = 1; i < 32*p->n; i++)
226 { j = (i ^ m)*p->n; mult(p, ktab + j, ktab + o); o = j; }
227
228 /* Finally, if the cipher uses a little-endian convention, then swap all of
229 * the individual words.
230 */
231 if (p->f&GCMF_SWAP)
232 for (i = 0; i < 32*p->n*p->n; i++) ktab[i] = ENDSWAP32(ktab[i]);
233}
234
235/* --- @gcm_mulk_N@ --- *
236 *
237 * Arguments: @uint32 *a@ = accumulator to multiply
238 * @const uint32 *ktab@ = table constructed by @gcm_mktable@
239 *
240 * Returns: ---
241 *
242 * Use: Multiply @a@ by @k@ (implicitly represented in @ktab@),
243 * updating @a@ in-place. There are separate functions for each
244 * supported block size because this is the function whose
245 * performance actually matters.
246 */
247
248#define DEF_MULK(nbits) \
249void gcm_mulk_##nbits(uint32 *a, const uint32 *ktab) \
250{ \
251 uint32 m, t; \
252 uint32 z[nbits/32]; \
253 unsigned i, j, k; \
254 \
255 for (i = 0; i < nbits/32; i++) z[i] = 0; \
256 \
257 for (i = 0; i < nbits/32; i++) { \
258 t = a[i]; \
259 for (j = 0; j < 32; j++) { \
260 m = -((t >> 31)&1u); \
261 for (k = 0; k < nbits/32; k++) z[k] ^= *ktab++&m; \
262 t <<= 1; \
263 } \
264 } \
265 \
266 for (i = 0; i < nbits/32; i++) a[i] = z[i]; \
267}
268GCM_WIDTHS(DEF_MULK)
269
270/*----- Other utilities ---------------------------------------------------*/
271
272/* --- @putlen@ --- *
273 *
274 * Arguments: @octet *p@ = pointer to output buffer
275 * @unsigned w@ = size of output buffer
276 * @unsigned blksz@ = block size (assumed fairly small)
277 * @unsigned long nblocks@ = number of blocks
278 * @unsigned nbytes@ = tail size in bytes (assumed small)
279 *
280 * Returns: ---
281 *
282 * Use: Store the overall length in %$\emph{bits}$% (i.e.,
283 * @3*(nblocks*blksz + nbytes)@ in big-endian form in the
284 * buffer @p@.
285 */
286
287static void putlen(octet *p, unsigned w, unsigned blksz,
288 unsigned long nblocks, unsigned nbytes)
289{
290 unsigned long nblo = nblocks&((1ul << (ULONG_BITS/2)) - 1),
291 nbhi = nblocks >> ULONG_BITS/2;
292 unsigned long nlo = nblo*blksz + nbytes, nhi = nbhi*blksz;
293
294 /* This is fiddly. Split @nblocks@, which is the big number, into high and
295 * low halves, multiply those separately by @blksz@, propagate carries, and
296 * then multiply by eight.
297 */
298 nhi += nlo >> ULONG_BITS/2;
299 nlo &= (1ul << (ULONG_BITS/2)) - 1;
300 nlo <<= 3;
301
302 /* Now write out the size, feeding bits in from @nhi@ as necessary. */
303 p += w;
304 while (w--) {
305 *--p = U8(nlo);
306 nlo = (nlo >> 8) | ((nhi&0xff) << (ULONG_BITS/2 - 5));
307 nhi >>= 8;
308 }
309}
310
311/* --- @mix@ --- *
312 *
313 * Arguments: @const gcm_params *p@ = pointer to the parameters
314 * @uint32 *a@ = GHASH accumulator
315 * @const octet *q@ = pointer to an input block
316 * @const uint32 *ktab@ = multiplication table, built by
317 * @gcm_mktable@
318 *
319 * Returns: ---
320 *
321 * Use: Fold the block @q@ into the GHASH accumulator. The
322 * calculation is %$a' = k (a + q)$%.
323 */
324
325static void mix(const gcm_params *p, uint32 *a,
326 const octet *q, const uint32 *ktab)
327{
328 unsigned i;
329
330 /* Convert the block from bytes into words, using the appropriate
331 * convention.
332 */
333 if (p->f&GCMF_SWAP)
334 for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_L(q); q += 4; }
335 else
336 for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_B(q); q += 4; }
337
338 /* Dispatch to the correct multiply-by-%$k$% function. */
339 switch (p->n) {
340#define CASE(nbits) case nbits/32: gcm_mulk_##nbits(a, ktab); break;
341 GCM_WIDTHS(CASE)
342#undef CASE
343 default: abort();
344 }
345}
346
347/* --- @gcm_ghashdone@ --- *
348 *
349 * Arguments: @const gcm_params *p@ = pointer to the parameters
350 * @uint32 *a@ = GHASH accumulator
351 * @const uint32 *ktab@ = multiplication table, built by
352 * @gcm_mktable@
353 * @unsigned long xblocks, yblocks@ = number of whole blocks in
354 * the two inputs
355 * @unsigned xbytes, ybytes@ = number of trailing bytes in the
356 * two inputs
357 *
358 * Returns: ---
359 *
360 * Use: Finishes a GHASH operation by appending the appropriately
361 * encoded lengths of the two constituent messages.
362 */
363
364void gcm_ghashdone(const gcm_params *p, uint32 *a, const uint32 *ktab,
365 unsigned long xblocks, unsigned xbytes,
366 unsigned long yblocks, unsigned ybytes)
367{
368 octet b[4*GCM_NMAX];
369 unsigned w = p->n < 3 ? 4*p->n : 2*p->n;
370
371 /* Construct the encoded lengths. Note that smaller-block versions of GCM
372 * encode the lengths in separate blocks. GCM is only officially defined
373 * for 64- and 128-bit blocks; I've placed the cutoff somewhat arbitrarily
374 * at 96 bits.
375 */
376 putlen(b, w, 4*p->n, xblocks, xbytes);
377 putlen(b + w, w, 4*p->n, yblocks, ybytes);
378
379 /* Feed the lengths into the accumulator. */
380 mix(p, a, b, ktab);
381 if (p->n < 3) mix(p, a, b + w, ktab);
382}
383
384/* --- @gcm_concat@ --- *
385 *
386 * Arguments: @const gcm_params *p@ = pointer to the parameters
387 * @uint32 *z@ = GHASH accumulator for suffix, updated
388 * @const uint32 *x@ = GHASH accumulator for prefix
389 * @const uint32 *ktab@ = multiplication table, built by
390 * @gcm_mktable@
391 * @unsigned long n@ = length of suffix in whole blocks
392 *
393 * Returns: ---
394 *
395 * Use: On entry, @x@ and @z@ are the results of hashing two strings
396 * %$a$% and %$b$%, each a whole number of blocks long; in
397 * particular, %$b$% is @n@ blocks long. On exit, @z@ is
398 * updated to be the hash of %$a \cat b$%.
399 */
400
401void gcm_concat(const gcm_params *p, uint32 *z, const uint32 *x,
402 const uint32 *ktab, unsigned long n)
403{
404 uint32 t[GCM_NMAX], u[GCM_NMAX];
405 unsigned i, j;
406
407 if (!n) {
408 /* If @n@ is zero, then there's not much to do. The mathematics
409 * (explained below) still works, but the code takes a shortcut which
410 * doesn't handle this case: so set %$z' = z + x k^n = z + x$%.
411 */
412
413 for (j = 0; j < p->n; j++) z[j] ^= x[j];
414 } else {
415 /* We have %$x = a_0 t^m + \cdots + a_{m-2} t^2 + a_{m-1} t$% and
416 * %$z = b_0 t^n + \cdots + b_{n-2} t^2 + b_{n-1} t$%. What we'd like is
417 * the hash of %$a \cat b$%, which is %$z + x k^n$%.
418 *
419 * The first job, then, is to calculate %$k^n$%, and for this we use a
420 * simple left-to-right square-and-multiply algorithm. There's no need
421 * to keep %$n$% secret here.
422 */
423
424 /* Start by retrieving %$k$% from the table, and convert it to big-endian
425 * form.
426 */
427 if (!(p->f&GCMF_SWAP)) for (j = 0; j < p->n; j++) u[j] = ktab[j];
428 else for (j = 0; j < p->n; j++) u[j] = ENDSWAP32(ktab[24*p->n + j]);
429
430 /* Now calculate %$k^n$%. */
431 i = ULONG_BITS;
432#define BIT (1ul << (ULONG_BITS - 1))
433 while (!(n&BIT)) { n <<= 1; i--; }
434 n <<= 1; i--; for (j = 0; j < p->n; j++) t[j] = u[j];
435 while (i--) { mul(p, t, t, t); if (n&BIT) mul(p, t, t, u); n <<= 1; }
436#undef BIT
437
438 /* Next, calculate %$x k^n$%. If we're using a little-endian convention
439 * then we must convert %$x$%; otherwise we can just use it in place.
440 */
441 if (!(p->f&GCMF_SWAP))
442 mul(p, t, t, x);
443 else {
444 for (j = 0; j < p->n; j++) u[j] = ENDSWAP32(x[j]);
445 mul(p, t, t, u);
446 }
447
448 /* Finally, add %$x k^n$% onto %$z$%, converting back to little-endian if
449 * necessary.
450 */
451 if (!(p->f&GCMF_SWAP)) for (j = 0; j < p->n; j++) z[j] ^= t[j];
452 else for (j = 0; j < p->n; j++) z[j] ^= ENDSWAP32(t[j]);
453 }
454}
455
456/*----- That's all, folks -------------------------------------------------*/