mdw@git.distorted.org.uk Git - catacomb/blame_incremental

... / ...

Commit	Line	Data
	1	/* -*-c-*-
	2	*
	3	* The GCM authenticated encryption mode
	4	*
	5	* (c) 2017 Straylight/Edgeware
	6	*/
	7
	8	/*----- Licensing notice --------------------------------------------------*
	9	*
	10	* This file is part of Catacomb.
	11	*
	12	* Catacomb is free software: you can redistribute it and/or modify it
	13	* under the terms of the GNU Library General Public License as published
	14	* by the Free Software Foundation; either version 2 of the License, or
	15	* (at your option) any later version.
	16	*
	17	* Catacomb is distributed in the hope that it will be useful, but
	18	* WITHOUT ANY WARRANTY; without even the implied warranty of
	19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	20	* Library General Public License for more details.
	21	*
	22	* You should have received a copy of the GNU Library General Public
	23	* License along with Catacomb. If not, write to the Free Software
	24	* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
	25	* USA.
	26	*/
	27
	28	/*----- Header files ------------------------------------------------------*/
	29
	30	#include "config.h"
	31
	32	#include <stdio.h>
	33
	34	#include <mLib/bits.h>
	35
	36	#include "gcm.h"
	37	#include "gcm-def.h"
	38
	39	/*----- Overall strategy --------------------------------------------------*
	40	*
	41	* GCM is pretty awful to implement in software. (This presentation is going
	42	* to be somewhat different to that in the specification, but I think it
	43	* makes more sense like this.)
	44	*
	45	* We're given a %$w$%-bit blockcipher %$E$% with a key %$K$%.
	46	*
	47	* The main part is arithmetic in the finite field %$k = \gf{2^w}$%, which we
	48	* represent as the quotient ring %$\gf{2}[t]/(p_w(t))$% for some irreducible
	49	* degree-%$w$% polynomial %$p(t)$%, whose precise value isn't very important
	50	* right now. We choose a secret point %$x = E_K(0^w)$%.
	51	*
	52	* We choose a length size %$z$% as follows: if %$w < 96$% then %$z = w$%;
	53	* otherwise %$z = w/2$%. Format a message pair as follows:
	54	*
	55	* %$F(a, b) = P_w(a) \cat P_w(b) \cat [\ell(a)]_z \cat [\ell(b)]_z$%
	56	*
	57	* where %$P_w(x) = x \cat 0^n$% where %$0 \le n < w$% such that
	58	* %$\ell(x) + n \equiv 0 \pmod{w}$%.
	59	*
	60	* Hash a (block-aligned) message %$u$% as follows. First, split %$u$% into
	61	* %$w$%-bit blocks %$u_0$%, %$u_1$%, %%\ldots%%, %$u_{n-1}$%. Interpret
	62	* these as elements of %$k$%. Then
	63	*
	64	* %$G_x(u) = u_0 x^n + u_1 x^{n-1} + \cdots + u_{n-1} x$%
	65	*
	66	* converted back to a %$w$%-bit string.
	67	*
	68	* We're ready to go now. Suppose we're to encrypt a message %$M$% with
	69	* header %$H$% and nonce %$N$%. If %$\ell(N) + 32 = w$% then let
	70	* %$N' = N$% and let %$i_0 = 1$%; otherwise, let %$U = G_x(F(\epsilon, N))$%
	71	* and split this into %$N' = U[0 \bitsto w - 32]$% and
	72	* %$[i_0]_{32} = U[w - 32 \bitsto w]$%.
	73	*
	74	* Let %$n = \lceil \ell(M)/w \rceil$%. Compute
	75	*
	76	* %$y_j = E_K(N' \cat [i_0 + j]_{32})$%
	77	*
	78	* for %$0 \le j \le n$%. Let
	79	*
	80	* %$s = (y_1 \cat y_2 \cat \cdots \cat y_n)[0 \bitsto \ell(M)]$%
	81	*
	82	* Let %$C = M \xor s$% and let %$T = G_x(F(H, C)) \xor y_0$%. These are the
	83	* ciphertext and tag respectively.
	84	*
	85	* So why is this awful?
	86	*
	87	* For one thing, the bits are in a completely terrible order. The bytes are
	88	* arranged in little-endian order, so the unit coefficient is in the first
	89	* byte, and the degree-127 coefficient is in the last byte. But within each
	90	* byte, the lowest-degree coefficient is in the most significant bit. It's
	91	* therefore better to think of GCM as using a big-endian byte-ordering
	92	* convention, but with the bits backwards.
	93	*
	94	* But messing about with byte ordering is expensive, so let's not do that in
	95	* the inner loop. But multiplication in %$k$% is not easy either. Some
	96	* kind of precomputed table would be nice, but that will leak secrets
	97	* through the cache.
	98	*
	99	* I choose a particularly simple table: given %$x$%, let %$X[i'] = x t^i$%.
	100	* Then %$x y = \sum_{0\le i<w} y_i X[i']$% which is just a bunch of
	101	* bitmasking. But the natural order for examining bits of %$y$% is not
	102	* necessarily the obvious one. We'll have already loaded %$y$% into
	103	* internal form, as 32-bit words. The good order to process these is left
	104	* to right, from high to low bits. But now the order of degrees depends on
	105	* the endianness of our conversion of bytes to words. Oh, well.
	106	*
	107	* If we've adopted a big-endian convention, then we'll see the degrees in
	108	* order, 0, 1, ..., all the way up to %$w - 1$% and everything is fine. If
	109	* we've adopted a little-endian convention, though, we'll see an ordering
	110	* like this:
	111	*
	112	* 24, 25, ..., 31, 16, 17, ..., 23, 8, 9, ..., 15, 0, 1, ..., 7,
	113	* 56, 57, ..., 63, 48, 49, ..., 55, 40, 41, ..., 47, 32, 33, ..., 39,
	114	* etc.
	115	*
	116	* which is the ordinary order with 0x18 = 24 XORed into the index. That is,
	117	* %$i' = i$% if we've adopted a big-endian convention, and
	118	* %$i' = i \xor 24$% if we've adopted a little-endian convention.
	119	*/
	120
	121	/*----- Low-level utilities -----------------------------------------------*/
	122
	123	/* --- @mult@ --- *
	124	*
	125	* Arguments: @const gcm_params *p@ = pointer to the parameters
	126	* @uint32 *z@ = where to write the result
	127	* @const uint32 *x@ = input field element
	128	*
	129	* Returns: ---
	130	*
	131	* Use: Multiply the input field element by %$t$%, and write the
	132	* product to @z@. It's safe for @x@ and @z@ to be equal, but
	133	* they should not otherwise overlap. Both input and output are
	134	* in big-endian form, i.e., with the lowest-degree coefficients
	135	* in the most significant bits.
	136	*/
	137
	138	static void mult(const gcm_params p, uint32 z, const uint32 *x)
	139	{
	140	uint32 m, c, t;
	141	unsigned i;
	142
	143	t = x[p->n - 1]; m = -(t&1u); c = m&p->poly;
	144	for (i = 0; i < p->n; i++) { t = x[i]; z[i] = (t >> 1) ^ c; c = t << 31; }
	145	}
	146
	147	/* --- @mul@ --- *
	148	*
	149	* Arguments: @const gcm_params *p@ = pointer to the parameters
	150	* @uint32 *z@ = where to write the result
	151	* @const uint32 x, y@ = input field elements
	152	*
	153	* Returns: ---
	154	*
	155	* Use: Multiply the input field elements together, and write the
	156	* product to @z@. It's safe for the operands to overlap. Both
	157	* inputs and the output are in big-endian form, i.e., with the
	158	* lowest-degree coefficients in the most significant bits.
	159	*/
	160
	161	static void mul(const gcm_params p, uint32 z,
	162	const uint32 x, const uint32 y)
	163	{
	164	uint32 m, t, u[GCM_NMAX], v[GCM_NMAX];
	165	unsigned i, j, k;
	166
	167	/* We can't do this in-place at all, so use temporary space. Make a copy
	168	* of @x@ in @u@, where we can clobber it, and build the product in @v@.
	169	*/
	170	for (i = 0; i < p->n; i++) { u[i] = x[i]; v[i] = 0; }
	171
	172	/* Repeatedly multiply @x@ (in @u@) by %$t$%, and add together those
	173	* %$x t^i$% selected by the bits of @y@. This is basically what you get
	174	* by streaming the result of @gcm_mktable@ into @gcm_mulk_...@.
	175	*/
	176	for (i = 0; i < p->n; i++) {
	177	t = y[i];
	178	for (j = 0; j < 32; j++) {
	179	m = -((t >> 31)&1u);
	180	for (k = 0; k < p->n; k++) v[k] ^= u[k]&m;
	181	mult(p, u, u); t <<= 1;
	182	}
	183	}
	184
	185	/* Write out the result now that it's ready. */
	186	for (i = 0; i < p->n; i++) z[i] = v[i];
	187	}
	188
	189	/*----- Table-based multiplication ----------------------------------------*/
	190
	191	/* --- @gcm_mktable@ --- *
	192	*
	193	* Arguments: @const gcm_params *p@ = pointer to the parameters
	194	* @uint32 *ktab@ = where to write the table; there must be
	195	* space for %$32 n$% $%n$%-word entries, i.e.,
	196	* %$32 n^2$% 32-bit words in total, where %$n$% is
	197	* @p->n@, the block size in words
	198	* @const uint32 *k@ = input field element
	199	*
	200	* Returns: ---
	201	*
	202	* Use: Construct a table for use by @gcm_mulk_...@ below, to
	203	* multiply (vaguely) efficiently by @k@.
	204	*/
	205
	206	void gcm_mktable(const gcm_params p, uint32 ktab, const uint32 *k)
	207	{
	208	unsigned m = (p->f&GCMF_SWAP ? 0x18 : 0);
	209	unsigned i, j, o = m*p->n;
	210
	211	/* As described above, the table stores entries %$K[i \xor m] = k t^i$%,
	212	* where %$m = 0$% (big-endian cipher) or %$m = 24$% (little-endian).
	213	* The first job is to store %$K[m] = k$%.
	214	*
	215	* We initially build the table with the entries in big-endian order, and
	216	* then swap them if necessary. This makes the arithmetic functions more
	217	* amenable for use by @gcm_concat@ below.
	218	*/
	219	if (!(p->f&GCMF_SWAP)) for (i = 0; i < p->n; i++) ktab[o + i] = k[i];
	220	else for (i = 0; i < p->n; i++) ktab[o + i] = ENDSWAP32(k[i]);
	221
	222	/* Fill in the rest of the table by repeatedly multiplying the previous
	223	* entry by %$t$%.
	224	*/
	225	for (i = 1; i < 32*p->n; i++)
	226	{ j = (i ^ m)*p->n; mult(p, ktab + j, ktab + o); o = j; }
	227
	228	/* Finally, if the cipher uses a little-endian convention, then swap all of
	229	* the individual words.
	230	*/
	231	if (p->f&GCMF_SWAP)
	232	for (i = 0; i < 32p->np->n; i++) ktab[i] = ENDSWAP32(ktab[i]);
	233	}
	234
	235	/* --- @gcm_mulk_N@ --- *
	236	*
	237	* Arguments: @uint32 *a@ = accumulator to multiply
	238	* @const uint32 *ktab@ = table constructed by @gcm_mktable@
	239	*
	240	* Returns: ---
	241	*
	242	* Use: Multiply @a@ by @k@ (implicitly represented in @ktab@),
	243	* updating @a@ in-place. There are separate functions for each
	244	* supported block size because this is the function whose
	245	* performance actually matters.
	246	*/
	247
	248	#define DEF_MULK(nbits) \
	249	void gcm_mulk_##nbits(uint32 a, const uint32 ktab) \
	250	{ \
	251	uint32 m, t; \
	252	uint32 z[nbits/32]; \
	253	unsigned i, j, k; \
	254	\
	255	for (i = 0; i < nbits/32; i++) z[i] = 0; \
	256	\
	257	for (i = 0; i < nbits/32; i++) { \
	258	t = a[i]; \
	259	for (j = 0; j < 32; j++) { \
	260	m = -((t >> 31)&1u); \
	261	for (k = 0; k < nbits/32; k++) z[k] ^= *ktab++&m; \
	262	t <<= 1; \
	263	} \
	264	} \
	265	\
	266	for (i = 0; i < nbits/32; i++) a[i] = z[i]; \
	267	}
	268	GCM_WIDTHS(DEF_MULK)
	269
	270	/*----- Other utilities ---------------------------------------------------*/
	271
	272	/* --- @putlen@ --- *
	273	*
	274	* Arguments: @octet *p@ = pointer to output buffer
	275	* @unsigned w@ = size of output buffer
	276	* @unsigned blksz@ = block size (assumed fairly small)
	277	* @unsigned long nblocks@ = number of blocks
	278	* @unsigned nbytes@ = tail size in bytes (assumed small)
	279	*
	280	* Returns: ---
	281	*
	282	* Use: Store the overall length in %$\emph{bits}$% (i.e.,
	283	* @3(nblocksblksz + nbytes)@ in big-endian form in the
	284	* buffer @p@.
	285	*/
	286
	287	static void putlen(octet *p, unsigned w, unsigned blksz,
	288	unsigned long nblocks, unsigned nbytes)
	289	{
	290	unsigned long nblo = nblocks&((1ul << (ULONG_BITS/2)) - 1),
	291	nbhi = nblocks >> ULONG_BITS/2;
	292	unsigned long nlo = nbloblksz + nbytes, nhi = nbhiblksz;
	293
	294	/* This is fiddly. Split @nblocks@, which is the big number, into high and
	295	* low halves, multiply those separately by @blksz@, propagate carries, and
	296	* then multiply by eight.
	297	*/
	298	nhi += nlo >> ULONG_BITS/2;
	299	nlo &= (1ul << (ULONG_BITS/2)) - 1;
	300	nlo <<= 3;
	301
	302	/* Now write out the size, feeding bits in from @nhi@ as necessary. */
	303	p += w;
	304	while (w--) {
	305	*--p = U8(nlo);
	306	nlo = (nlo >> 8) \| ((nhi&0xff) << (ULONG_BITS/2 - 5));
	307	nhi >>= 8;
	308	}
	309	}
	310
	311	/* --- @mix@ --- *
	312	*
	313	* Arguments: @const gcm_params *p@ = pointer to the parameters
	314	* @uint32 *a@ = GHASH accumulator
	315	* @const octet *q@ = pointer to an input block
	316	* @const uint32 *ktab@ = multiplication table, built by
	317	* @gcm_mktable@
	318	*
	319	* Returns: ---
	320	*
	321	* Use: Fold the block @q@ into the GHASH accumulator. The
	322	* calculation is %$a' = k (a + q)$%.
	323	*/
	324
	325	static void mix(const gcm_params p, uint32 a,
	326	const octet q, const uint32 ktab)
	327	{
	328	unsigned i;
	329
	330	/* Convert the block from bytes into words, using the appropriate
	331	* convention.
	332	*/
	333	if (p->f&GCMF_SWAP)
	334	for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_L(q); q += 4; }
	335	else
	336	for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_B(q); q += 4; }
	337
	338	/* Dispatch to the correct multiply-by-%$k$% function. */
	339	switch (p->n) {
	340	#define CASE(nbits) case nbits/32: gcm_mulk_##nbits(a, ktab); break;
	341	GCM_WIDTHS(CASE)
	342	#undef CASE
	343	default: abort();
	344	}
	345	}
	346
	347	/* --- @gcm_ghashdone@ --- *
	348	*
	349	* Arguments: @const gcm_params *p@ = pointer to the parameters
	350	* @uint32 *a@ = GHASH accumulator
	351	* @const uint32 *ktab@ = multiplication table, built by
	352	* @gcm_mktable@
	353	* @unsigned long xblocks, yblocks@ = number of whole blocks in
	354	* the two inputs
	355	* @unsigned xbytes, ybytes@ = number of trailing bytes in the
	356	* two inputs
	357	*
	358	* Returns: ---
	359	*
	360	* Use: Finishes a GHASH operation by appending the appropriately
	361	* encoded lengths of the two constituent messages.
	362	*/
	363
	364	void gcm_ghashdone(const gcm_params p, uint32 a, const uint32 *ktab,
	365	unsigned long xblocks, unsigned xbytes,
	366	unsigned long yblocks, unsigned ybytes)
	367	{
	368	octet b[4*GCM_NMAX];
	369	unsigned w = p->n < 3 ? 4p->n : 2p->n;
	370
	371	/* Construct the encoded lengths. Note that smaller-block versions of GCM
	372	* encode the lengths in separate blocks. GCM is only officially defined
	373	* for 64- and 128-bit blocks; I've placed the cutoff somewhat arbitrarily
	374	* at 96 bits.
	375	*/
	376	putlen(b, w, 4*p->n, xblocks, xbytes);
	377	putlen(b + w, w, 4*p->n, yblocks, ybytes);
	378
	379	/* Feed the lengths into the accumulator. */
	380	mix(p, a, b, ktab);
	381	if (p->n < 3) mix(p, a, b + w, ktab);
	382	}
	383
	384	/* --- @gcm_concat@ --- *
	385	*
	386	* Arguments: @const gcm_params *p@ = pointer to the parameters
	387	* @uint32 *z@ = GHASH accumulator for suffix, updated
	388	* @const uint32 *x@ = GHASH accumulator for prefix
	389	* @const uint32 *ktab@ = multiplication table, built by
	390	* @gcm_mktable@
	391	* @unsigned long n@ = length of suffix in whole blocks
	392	*
	393	* Returns: ---
	394	*
	395	* Use: On entry, @x@ and @z@ are the results of hashing two strings
	396	* %$a$% and %$b$%, each a whole number of blocks long; in
	397	* particular, %$b$% is @n@ blocks long. On exit, @z@ is
	398	* updated to be the hash of %$a \cat b$%.
	399	*/
	400
	401	void gcm_concat(const gcm_params p, uint32 z, const uint32 *x,
	402	const uint32 *ktab, unsigned long n)
	403	{
	404	uint32 t[GCM_NMAX], u[GCM_NMAX];
	405	unsigned i, j;
	406
	407	if (!n) {
	408	/* If @n@ is zero, then there's not much to do. The mathematics
	409	* (explained below) still works, but the code takes a shortcut which
	410	* doesn't handle this case: so set %$z' = z + x k^n = z + x$%.
	411	*/
	412
	413	for (j = 0; j < p->n; j++) z[j] ^= x[j];
	414	} else {
	415	/* We have %$x = a_0 t^m + \cdots + a_{m-2} t^2 + a_{m-1} t$% and
	416	* %$z = b_0 t^n + \cdots + b_{n-2} t^2 + b_{n-1} t$%. What we'd like is
	417	* the hash of %$a \cat b$%, which is %$z + x k^n$%.
	418	*
	419	* The first job, then, is to calculate %$k^n$%, and for this we use a
	420	* simple left-to-right square-and-multiply algorithm. There's no need
	421	* to keep %$n$% secret here.
	422	*/
	423
	424	/* Start by retrieving %$k$% from the table, and convert it to big-endian
	425	* form.
	426	*/
	427	if (!(p->f&GCMF_SWAP)) for (j = 0; j < p->n; j++) u[j] = ktab[j];
	428	else for (j = 0; j < p->n; j++) u[j] = ENDSWAP32(ktab[24*p->n + j]);
	429
	430	/* Now calculate %$k^n$%. */
	431	i = ULONG_BITS;
	432	#define BIT (1ul << (ULONG_BITS - 1))
	433	while (!(n&BIT)) { n <<= 1; i--; }
	434	n <<= 1; i--; for (j = 0; j < p->n; j++) t[j] = u[j];
	435	while (i--) { mul(p, t, t, t); if (n&BIT) mul(p, t, t, u); n <<= 1; }
	436	#undef BIT
	437
	438	/* Next, calculate %$x k^n$%. If we're using a little-endian convention
	439	* then we must convert %$x$%; otherwise we can just use it in place.
	440	*/
	441	if (!(p->f&GCMF_SWAP))
	442	mul(p, t, t, x);
	443	else {
	444	for (j = 0; j < p->n; j++) u[j] = ENDSWAP32(x[j]);
	445	mul(p, t, t, u);
	446	}
	447
	448	/* Finally, add %$x k^n$% onto %$z$%, converting back to little-endian if
	449	* necessary.
	450	*/
	451	if (!(p->f&GCMF_SWAP)) for (j = 0; j < p->n; j++) z[j] ^= t[j];
	452	else for (j = 0; j < p->n; j++) z[j] ^= ENDSWAP32(t[j]);
	453	}
	454	}
	455
	456	/*----- That's all, folks -------------------------------------------------*/