mdw@git.distorted.org.uk Git - catacomb/blame_incremental

... / ...

Commit	Line	Data
	1	/* --c--
	2	*
	3	* The GCM authenticated encryption mode
	4	*
	5	* (c) 2017 Straylight/Edgeware
	6	*/
	7
	8	/----- Licensing notice --------------------------------------------------
	9	*
	10	* This file is part of Catacomb.
	11	*
	12	* Catacomb is free software: you can redistribute it and/or modify it
	13	* under the terms of the GNU Library General Public License as published
	14	* by the Free Software Foundation; either version 2 of the License, or
	15	* (at your option) any later version.
	16	*
	17	* Catacomb is distributed in the hope that it will be useful, but
	18	* WITHOUT ANY WARRANTY; without even the implied warranty of
	19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	20	* Library General Public License for more details.
	21	*
	22	* You should have received a copy of the GNU Library General Public
	23	* License along with Catacomb. If not, write to the Free Software
	24	* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
	25	* USA.
	26	*/
	27
	28	/----- Header files ------------------------------------------------------/
	29
	30	#include "config.h"
	31
	32	#include <stdio.h>
	33
	34	#include <mLib/bits.h>
	35
	36	#include "dispatch.h"
	37	#include "gcm.h"
	38	#include "gcm-def.h"
	39
	40	/----- Overall strategy --------------------------------------------------
	41	*
	42	* GCM is pretty awful to implement in software. (This presentation is going
	43	* to be somewhat different to that in the specification, but I think it
	44	* makes more sense like this.)
	45	*
	46	* We're given a %$w$%-bit blockcipher %$E$% with a key %$K$%.
	47	*
	48	* The main part is arithmetic in the finite field %$k = \gf{2^w}$%, which we
	49	* represent as the quotient ring %$\gf{2}[t]/(p_w(t))$% for some irreducible
	50	* degree-%$w$% polynomial %$p(t)$%, whose precise value isn't very important
	51	* right now. We choose a secret point %$x = E_K(0^w)$%.
	52	*
	53	* We choose a length size %$z$% as follows: if %$w < 96%$ then %$z = w$%;
	54	* otherwise %$z = w/2$%. Format a message pair as follows:
	55	*
	56	* %$F(a, b) = P_w(a) \cat P_w(b) \cat [\ell(a)]_z \cat [\ell(b)]_z$%
	57	*
	58	* where %$P_w(x) = x \cat 0^n$% where $%0 \le n < w$% such that
	59	* %$\ell(x) + n \equiv 0 \pmod{w}$%.
	60	*
	61	* Hash a (block-aligned) message %$u$% as follows. First, split %$u$% into
	62	* %$w$%-bit blocks %$u_0$%, %$u_1$%, %%\ldots%%, %$u_{n-1}$%. Interpret
	63	* these as elements of %$k$%. Then
	64	*
	65	* %$G_x(u) = u_0 t^n + u_1 t^{n-1} + \cdots + u_{n-1} t$%
	66	*
	67	* converted back to a %$w$%-bit string.
	68	*
	69	* We're ready to go now. Suppose we're to encrypt a message %$M$% with
	70	* header %$H$% and nonce %$N$%. If %$\ell(N) + 32 = w$% then let
	71	* %$N' = N$% and let %$i_0 = 1$%; otherwise, let %$U = G_t(F(\epsilon, N))$%
	72	* and split this into %$N' = U[0 \bitsto w - 32]$% and
	73	* %$[i_0]_{32} = U[w - 32 \bitsto w]$%.
	74	*
	75	* Let %$n = \lceil \ell(M)/w \rceil$%. Compute
	76	*
	77	* %$y_j = E_K(N' \cat [i_0 + j]_{32})$%
	78	*
	79	* for %$0 \le j \le n$%. Let
	80	*
	81	* %$s = (y_1 \cat y_2 \cat \cdots \cat y_n)[0 \bitsto \ell(M)$%
	82	*
	83	* Let %$C = M \xor s$% and let %$T = G_x(F(H, C)) \xor y_0$%. These are the
	84	* ciphertext and tag respectively.
	85	*
	86	* So why is this awful?
	87	*
	88	* For one thing, the bits are in a completely terrible order. The bytes are
	89	* arranged in little-endian order, so the unit coefficient is in the first
	90	* byte, and the degree-127 coefficient is in the last byte. But within each
	91	* byte, the lowest-degree coefficient is in the most significant bit. It's
	92	* therefore better to think of GCM as using a big-endian byte-ordering
	93	* convention, but with the bits backwards.
	94	*
	95	* But messing about with byte ordering is expensive, so let's not do that in
	96	* the inner loop. But multiplication in %$k$% is not easy either. Some
	97	* kind of precomputed table would be nice, but that will leak secrets
	98	* through the cache.
	99	*
	100	* I choose a particularly simple table: given %$x$%, let %$X[i'] = x t^i$%.
	101	* Then $%$x y = \sum_{0\le i<w} y_i X[i']$% which is just a bunch of
	102	* bitmasking. But the natural order for examining bits of %$y$% is not
	103	* necessarily the obvious one. We'll have already loaded %$y$% into
	104	* internal form, as 32-bit words. The good order to process these is left
	105	* to right, from high to low bits. But now the order of degrees depends on
	106	* the endianness of our conversion of bytes to words. Oh, well.
	107	*
	108	* If we've adopted a big-endian convention, then we'll see the degrees in
	109	* order, 0, 1, ..., all the way up to %$w - 1$% and everything is fine. If
	110	* we've adopted a little-endian convention, though, we'll see an ordering
	111	* like this:
	112	*
	113	* 24, 25, ..., 31, 16, 17, ..., 23, 8, 9, ..., 15, 0, 1, ..., 7,
	114	* 56, 57, ..., 63, 48, 49, ..., 55, 40, 41, ..., 47, 32, 33, ..., 39,
	115	* etc.
	116	*
	117	* which is the ordinary order with 0x18 = 24 XORed into the index. That is,
	118	* %$i' = i$% if we've adopted a big-endian convention, and
	119	* %$i' = i \xor 24$% if we've adopted a little-endian convention.
	120	*/
	121
	122	/----- Low-level utilities -----------------------------------------------/
	123
	124	/* --- @mult@ --- *
	125	*
	126	* Arguments: @const gcm_params *p@ = pointer to the parameters
	127	* @uint32 *z@ = where to write the result
	128	* @const uint32 *x@ = input field element
	129	*
	130	* Returns: ---
	131	*
	132	* Use: Multiply the input field element by %$t$%, and write the
	133	* product to @z@. It's safe for @x@ and @z@ to be equal, but
	134	* they should not otherwise overlap. Both input and output are
	135	* in big-endian form, i.e., with the lowest-degree coefficients
	136	* in the most significant bits.
	137	*/
	138
	139	static void mult(const gcm_params p, uint32 z, const uint32 *x)
	140	{
	141	uint32 m, c, t;
	142	unsigned i;
	143
	144	t = x[p->n - 1]; m = -(t&1u); c = m&p->poly;
	145	for (i = 0; i < p->n; i++) { t = x[i]; z[i] = (t >> 1) ^ c; c = t << 31; }
	146	}
	147
	148	/* --- @mul@ --- *
	149	*
	150	* Arguments: @const gcm_params *p@ = pointer to the parameters
	151	* @uint32 *z@ = where to write the result
	152	* @const uint32 x, y@ = input field elements
	153	*
	154	* Returns: ---
	155	*
	156	* Use: Multiply the input field elements together, and write the
	157	* product to @z@. It's safe for the operands to overlap. Both
	158	* inputs and the output are in big-endian form, i.e., with the
	159	* lowest-degree coefficients in the most significant bits.
	160	*/
	161
	162	static void mul(const gcm_params p, uint32 z,
	163	const uint32 x, const uint32 y)
	164	{
	165	uint32 m, t, u[GCM_NMAX], v[GCM_NMAX];
	166	unsigned i, j, k;
	167
	168	/* We can't do this in-place at all, so use temporary space. Make a copy
	169	* of @x@ in @u@, where we can clobber it, and build the product in @v@.
	170	*/
	171	for (i = 0; i < p->n; i++) { u[i] = x[i]; v[i] = 0; }
	172
	173	/* Repeatedly multiply @x@ (in @u@) by %$t$%, and add together those
	174	* %$x t^i$% selected by the bits of @y@. This is basically what you get
	175	* by streaming the result of @gcm_mktable@ into @gcm_mulk_...@.
	176	*/
	177	for (i = 0; i < p->n; i++) {
	178	t = y[i];
	179	for (j = 0; j < 32; j++) {
	180	m = -((t >> 31)&1u);
	181	for (k = 0; k < p->n; k++) v[k] ^= u[k]&m;
	182	mult(p, u, u); t <<= 1;
	183	}
	184	}
	185
	186	/* Write out the result now that it's ready. */
	187	for (i = 0; i < p->n; i++) z[i] = v[i];
	188	}
	189
	190	/----- Table-based multiplication ----------------------------------------/
	191
	192	/* --- @gcm_mktable@ --- *
	193	*
	194	* Arguments: @const gcm_params *p@ = pointer to the parameters
	195	* @uint32 *ktab@ = where to write the table; there must be
	196	* space for %$32 n$% $%n$%-word entries, i.e.,
	197	* %$32 n^2$% 32-bit words in total, where %$n$% is
	198	* @p->n@, the block size in words
	199	* @const uint32 *k@ = input field element
	200	*
	201	* Returns: ---
	202	*
	203	* Use: Construct a table for use by @gcm_mulk_...@ below, to
	204	* multiply (vaguely) efficiently by @k@.
	205	*/
	206
	207	static void simple_mktable(const gcm_params *p,
	208	uint32 ktab, const uint32 k)
	209	{
	210	unsigned m = (p->f&GCMF_SWAP ? 0x18 : 0);
	211	unsigned i, j, o = m*p->n;
	212
	213	/* As described above, the table stores entries %$K[i \xor m] = k t^i$%,
	214	* where %$m = 0$% (big-endian cipher) or %$m = 24$% (little-endian).
	215	* The first job is to store %$K[m] = k$%.
	216	*
	217	* We initially build the table with the entries in big-endian order, and
	218	* then swap them if necessary. This makes the arithmetic functions more
	219	* amenable for use by @gcm_concat@ below.
	220	*/
	221	if (!(p->f&GCMF_SWAP)) for (i = 0; i < p->n; i++) ktab[o + i] = k[i];
	222	else for (i = 0; i < p->n; i++) ktab[o + i] = ENDSWAP32(k[i]);
	223
	224	/* Fill in the rest of the table by repeatedly multiplying the previous
	225	* entry by %$t$%.
	226	*/
	227	for (i = 1; i < 32*p->n; i++)
	228	{ j = (i ^ m)*p->n; mult(p, ktab + j, ktab + o); o = j; }
	229
	230	/* Finally, if the cipher uses a little-endian convention, then swap all of
	231	* the individual words.
	232	*/
	233	if (p->f&GCMF_SWAP)
	234	for (i = 0; i < 32p->np->n; i++) ktab[i] = ENDSWAP32(ktab[i]);
	235	}
	236
	237	CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable,
	238	(const gcm_params p, uint32 ktab, const uint32 *k),
	239	(p, ktab, k),
	240	pick_mktable, simple_mktable)
	241
	242	static gcm_mktable__functype *pick_mktable(void)
	243	{
	244	DISPATCH_PICK_FALLBACK(gcm_mktable, simple_mktable);
	245	}
	246
	247	/* --- @recover_k@ --- *
	248	*
	249	* Arguments: @const gcm_params *p@ = pointer to the parameters
	250	* @uint32 *k@ = block-sized vector in which to store %$k$%
	251	* @const uint32 *ktab@ = the table encoding %$k$%
	252	*
	253	* Returns: ---
	254	*
	255	* Use: Recovers %$k$%, the secret from which @ktab@ was by
	256	* @gcm_mktable@, from the table, and stores it in internal
	257	* (big-endian) form in @k@.
	258	*/
	259
	260	static void simple_recover_k(const gcm_params *p,
	261	uint32 k, const uint32 ktab)
	262	{
	263	unsigned i;
	264
	265	/* If the blockcipher is big-endian, then the key is simply in the first
	266	* table element, in the right format. If the blockcipher is little-endian
	267	* then it's in element 24, and the bytes need swapping.
	268	*/
	269
	270	if (!(p->f&GCMF_SWAP)) for (i = 0; i < p->n; i++) k[i] = ktab[i];
	271	else for (i = 0; i < p->n; i++) k[i] = ENDSWAP32(ktab[24*p->n + i]);
	272	}
	273
	274	CPU_DISPATCH(static, EMPTY, void, recover_k,
	275	(const gcm_params p, uint32 k, const uint32 *ktab),
	276	(p, k, ktab),
	277	pick_recover_k, simple_recover_k)
	278
	279	static recover_k__functype *pick_recover_k(void)
	280	{ DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k); }
	281
	282	/* --- @gcm_mulk_N{b,l}@ --- *
	283	*
	284	* Arguments: @uint32 *a@ = accumulator to multiply
	285	* @const uint32 *ktab@ = table constructed by @gcm_mktable@
	286	*
	287	* Returns: ---
	288	*
	289	* Use: Multiply @a@ by @k@ (implicitly represented in @ktab@),
	290	* updating @a@ in-place. There are separate functions for each
	291	* supported block size and endianness because this is the
	292	* function whose performance actually matters.
	293	*/
	294
	295	#define DEF_MULK(nbits) \
	296	\
	297	CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##b, \
	298	(uint32 a, const uint32 ktab), (a, ktab), \
	299	pick_mulk_##nbits##b, simple_mulk_##nbits) \
	300	CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##l, \
	301	(uint32 a, const uint32 ktab), (a, ktab), \
	302	pick_mulk_##nbits##l, simple_mulk_##nbits) \
	303	\
	304	static void simple_mulk_##nbits(uint32 a, const uint32 ktab) \
	305	{ \
	306	uint32 m, t; \
	307	uint32 z[nbits/32]; \
	308	unsigned i, j, k; \
	309	\
	310	for (i = 0; i < nbits/32; i++) z[i] = 0; \
	311	\
	312	for (i = 0; i < nbits/32; i++) { \
	313	t = a[i]; \
	314	for (j = 0; j < 32; j++) { \
	315	m = -((t >> 31)&1u); \
	316	for (k = 0; k < nbits/32; k++) z[k] ^= *ktab++&m; \
	317	t <<= 1; \
	318	} \
	319	} \
	320	\
	321	for (i = 0; i < nbits/32; i++) a[i] = z[i]; \
	322	} \
	323	\
	324	static gcm_mulk_##nbits##b##__functype *pick_mulk_##nbits##b(void) \
	325	{ DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits); } \
	326	static gcm_mulk_##nbits##l##__functype *pick_mulk_##nbits##l(void) \
	327	{ DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits); }
	328
	329	GCM_WIDTHS(DEF_MULK)
	330
	331	#define GCM_MULK_CASE(nbits) \
	332	case nbits/32: \
	333	if (_f&GCMF_SWAP) gcm_mulk_##nbits##l(_a, _ktab); \
	334	else gcm_mulk_##nbits##b(_a, _ktab); \
	335	break;
	336	#define MULK(n, f, a, ktab) do { \
	337	uint32 _a = (a); const uint32 _ktab = (ktab); \
	338	unsigned _f = (f); \
	339	switch (n) { \
	340	GCM_WIDTHS(GCM_MULK_CASE) \
	341	default: abort(); \
	342	} \
	343	} while (0)
	344
	345	/----- Other utilities ---------------------------------------------------/
	346
	347	/* --- @putlen@ --- *
	348	*
	349	* Arguments: @octet *p@ = pointer to output buffer
	350	* @unsigned w@ = size of output buffer
	351	* @unsigned blksz@ = block size (assumed fairly small)
	352	* @unsigned long nblocks@ = number of blocks
	353	* @unsigned nbytes@ = tail size in bytes (assumed small)
	354	*
	355	* Returns: ---
	356	*
	357	* Use: Store the overall length in %$\emph{bits}$% (i.e.,
	358	* @3(nblocksblksz + nbytes)@ in big-endian form in the
	359	* buffer @p@.
	360	*/
	361
	362	static void putlen(octet *p, unsigned w, unsigned blksz,
	363	unsigned long nblocks, unsigned nbytes)
	364	{
	365	unsigned long nblo = nblocks&((1ul << (ULONG_BITS/2)) - 1),
	366	nbhi = nblocks >> ULONG_BITS/2;
	367	unsigned long nlo = nbloblksz + nbytes, nhi = nbhiblksz;
	368
	369	/* This is fiddly. Split @nblocks@, which is the big number, into high and
	370	* low halves, multiply those separately by @blksz@, propagate carries, and
	371	* then multiply by eight.
	372	*/
	373	nhi += nlo >> ULONG_BITS/2;
	374	nlo &= (1ul << (ULONG_BITS/2)) - 1;
	375	nlo <<= 3;
	376
	377	/* Now write out the size, feeding bits in from @nhi@ as necessary. */
	378	p += w;
	379	while (w--) {
	380	*--p = U8(nlo);
	381	nlo = (nlo >> 8) \| ((nhi&0xff) << (ULONG_BITS/2 - 5));
	382	nhi >>= 8;
	383	}
	384	}
	385
	386	/* --- @mix@ --- *
	387	*
	388	* Arguments: @const gcm_params *p@ = pointer to the parameters
	389	* @uint32 *a@ = GHASH accumulator
	390	* @const octet *q@ = pointer to an input block
	391	* @const uint32 *ktab@ = multiplication table, built by
	392	* @gcm_mktable@
	393	*
	394	* Returns: ---
	395	*
	396	* Use: Fold the block @q@ into the GHASH accumulator. The
	397	* calculation is %$a' = k (a + q)$%.
	398	*/
	399
	400	static void mix(const gcm_params p, uint32 a,
	401	const octet q, const uint32 ktab)
	402	{
	403	unsigned i;
	404
	405	if (p->f&GCMF_SWAP)
	406	for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_L(q); q += 4; }
	407	else
	408	for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_B(q); q += 4; }
	409	MULK(p->n, p->f, a, ktab);
	410	}
	411
	412	/* --- @gcm_ghashdone@ --- *
	413	*
	414	* Arguments: @const gcm_params *p@ = pointer to the parameters
	415	* @uint32 *a@ = GHASH accumulator
	416	* @const uint32 *ktab@ = multiplication table, built by
	417	* @gcm_mktable@
	418	* @unsigned long xblocks, yblocks@ = number of whole blocks in
	419	* the two inputs
	420	* @unsigned xbytes, ybytes@ = number of trailing bytes in the
	421	* two inputs
	422	*
	423	* Returns: ---
	424	*
	425	* Use: Finishes a GHASH operation by appending the appropriately
	426	* encoded lengths of the two constituent messages.
	427	*/
	428
	429	void gcm_ghashdone(const gcm_params p, uint32 a, const uint32 *ktab,
	430	unsigned long xblocks, unsigned xbytes,
	431	unsigned long yblocks, unsigned ybytes)
	432	{
	433	octet b[4*GCM_NMAX];
	434	unsigned w = p->n < 3 ? 4p->n : 2p->n;
	435
	436	/* Construct the encoded lengths. Note that smaller-block versions of GCM
	437	* encode the lengths in separate blocks. GCM is only officially defined
	438	* for 64- and 128-bit blocks; I've placed the cutoff somewhat arbitrarily
	439	* at 96 bits.
	440	*/
	441	putlen(b, w, 4*p->n, xblocks, xbytes);
	442	putlen(b + w, w, 4*p->n, yblocks, ybytes);
	443
	444	/* Feed the lengths into the accumulator. */
	445	mix(p, a, b, ktab);
	446	if (p->n < 3) mix(p, a, b + w, ktab);
	447	}
	448
	449	/* --- @gcm_concat@ --- *
	450	*
	451	* Arguments: @const gcm_params *p@ = pointer to the parameters
	452	* @uint32 *z@ = GHASH accumulator for suffix, updated
	453	* @const uint32 *x@ = GHASH accumulator for prefix
	454	* @const uint32 *ktab@ = multiplication table, built by
	455	* @gcm_mktable@
	456	* @unsigned long n@ = length of suffix in whole blocks
	457	*
	458	* Returns: ---
	459	*
	460	* Use: On entry, @x@ and @z@ are the results of hashing two strings
	461	* %$a$% and %$b$%, each a whole number of blocks long; in
	462	* particular, %$b$% is @n@ blocks long. On exit, @z@ is
	463	* updated to be the hash of %$a \cat b$%.
	464	*/
	465
	466	void gcm_concat(const gcm_params p, uint32 z, const uint32 *x,
	467	const uint32 *ktab, unsigned long n)
	468	{
	469	uint32 t[GCM_NMAX], u[GCM_NMAX];
	470	unsigned i, j;
	471
	472	if (!n) {
	473	/* If @n@ is zero, then there's not much to do. The mathematics
	474	* (explained below) still works, but the code takes a shortcut which
	475	* doesn't handle this case: so set %$z' = z + x k^n = z + x$%.
	476	*/
	477
	478	for (j = 0; j < p->n; j++) z[j] ^= x[j];
	479	} else {
	480	/* We have %$x = a_0 t^m + \cdots + a_{m-2} t^2 + a_{m-1} t$% and
	481	* %$z = b_0 t^n + \cdots + b_{n-2} t^2 + b_{n-1} t$%. What we'd like is
	482	* the hash of %$a \cat b$%, which is %$z + x k^n$%.
	483	*
	484	* The first job, then, is to calculate %$k^n$%, and for this we use a
	485	* simple left-to-right square-and-multiply algorithm. There's no need
	486	* to keep %$n$% secret here.
	487	*/
	488
	489	/* Start by retrieving %$k$% from the table, and convert it to big-endian
	490	* form.
	491	*/
	492	recover_k(p, u, ktab);
	493
	494	/* Now calculate %$k^n$%. */
	495	i = ULONG_BITS;
	496	#define BIT (1ul << (ULONG_BITS - 1))
	497	while (!(n&BIT)) { n <<= 1; i--; }
	498	n <<= 1; i--; for (j = 0; j < p->n; j++) t[j] = u[j];
	499	while (i--) { mul(p, t, t, t); if (n&BIT) mul(p, t, t, u); n <<= 1; }
	500	#undef BIT
	501
	502	/* Next, calculate %$x k^n$%. If we're using a little-endian convention
	503	* then we must convert %$x$%; otherwise we can just use it in place.
	504	*/
	505	if (!(p->f&GCMF_SWAP))
	506	mul(p, t, t, x);
	507	else {
	508	for (j = 0; j < p->n; j++) u[j] = ENDSWAP32(x[j]);
	509	mul(p, t, t, u);
	510	}
	511
	512	/* Finally, add %$x k^n$% onto %$z$%, converting back to little-endian if
	513	* necessary.
	514	*/
	515	if (!(p->f&GCMF_SWAP)) for (j = 0; j < p->n; j++) z[j] ^= t[j];
	516	else for (j = 0; j < p->n; j++) z[j] ^= ENDSWAP32(t[j]);
	517	}
	518	}
	519
	520	/----- Test rig ----------------------------------------------------------/
	521
	522	#ifdef TEST_RIG
	523
	524	#include <mLib/quis.h>
	525	#include <mLib/testrig.h>
	526
	527	static void report_failure(const char *test, unsigned nbits,
	528	const char ref, dstr v[], dstr d)
	529	{
	530	printf("test %s failed (nbits = %u)", test, nbits);
	531	printf("\n\tx = "); type_hex.dump(&v[0], stdout);
	532	printf("\n\ty = "); type_hex.dump(&v[1], stdout);
	533	printf("\n\tz = "); type_hex.dump(&v[2], stdout);
	534	printf("\n\t%s' = ", ref); type_hex.dump(d, stdout);
	535	putchar('\n');
	536	}
	537
	538	static void mulk(unsigned nbits, unsigned f, uint32 x, const uint32 ktab)
	539	{ MULK(nbits/32, f, x, ktab); }
	540
	541	static int test_mul(uint32 poly, dstr v[])
	542	{
	543	uint32 x[GCM_NMAX], y[GCM_NMAX], z[GCM_NMAX], ktab[32GCM_NMAXGCM_NMAX];
	544	gcm_params p;
	545	dstr d = DSTR_INIT;
	546	unsigned i, nbits;
	547	int ok = 1;
	548	enum { I_x, I_y, I_z };
	549
	550	nbits = 8*v[0].len; p.f = 0; p.n = nbits/32; p.poly = poly;
	551	dstr_ensure(&d, nbits/8); d.len = nbits/8;
	552
	553	#define LOADXY(E) do { \
	554	for (i = 0; i < nbits/32; i++) { \
	555	x[i] = LOAD32_##E(v[I_x].buf + 4*i); \
	556	y[i] = LOAD32_##E(v[I_y].buf + 4*i); \
	557	} \
	558	} while (0)
	559
	560	#define INITZ(x) do { \
	561	for (i = 0; i < nbits/32; i++) z[i] = (x)[i]; \
	562	} while (0)
	563
	564	#define CHECK(E, what, ref) do { \
	565	for (i = 0; i < nbits/32; i++) STORE32_##E(d.buf + 4*i, z[i]); \
	566	if (memcmp(d.buf, v[I_##ref].buf, nbits/8) != 0) \
	567	{ ok = 0; report_failure(what, nbits, #ref, v, &d); } \
	568	} while (0)
	569
	570	#define TEST_PREP_1(E, x, y, what) do { \
	571	gcm_mktable(&p, ktab, y); \
	572	recover_k(&p, z, ktab); CHECK(B, "mktable/recover_k (" #y ")", y); \
	573	INITZ(x); mulk(nbits, p.f, z, ktab); CHECK(E, what " (k = " #y ")", z); \
	574	} while (0)
	575
	576	#define TEST_PREP(E, what) do { \
	577	TEST_PREP_1(E, x, y, what); \
	578	TEST_PREP_1(E, y, x, what); \
	579	} while (0)
	580
	581	/* First, test plain multiply. */
	582	LOADXY(B); mul(&p, z, x, y); CHECK(B, "gcm_mul", z);
	583
	584	/* Next, test big-endian prepared key. */
	585	LOADXY(B); TEST_PREP(B, "gcm_kmul_b");
	586
	587	/* Finally, test little-endian prepared key. */
	588	p.f = GCMF_SWAP; LOADXY(L);
	589	TEST_PREP(L, "gcm_kmul_l");
	590
	591	#undef LOADXY
	592	#undef INITZ
	593	#undef CHECK
	594	#undef TEST_PREP_1
	595	#undef TEST_PREP
	596
	597	/* All done. */
	598	return (ok);
	599	}
	600
	601	#define TEST(nbits) \
	602	static int test_mul_##nbits(dstr v[]) \
	603	{ return (test_mul(GCM_POLY_##nbits, v)); }
	604	GCM_WIDTHS(TEST)
	605	#undef TEST
	606
	607	static test_chunk defs[] = {
	608	#define TEST(nbits) \
	609	{ "gcm-mul" #nbits, test_mul_##nbits, \
	610	{ &type_hex, &type_hex, &type_hex, 0 } },
	611	GCM_WIDTHS(TEST)
	612	#undef TEST
	613	{ 0, 0, { 0 } }
	614	};
	615
	616	int main(int argc, char *argv[])
	617	{
	618	ego(argv[0]);
	619	test_run(argc, argv, defs, SRCDIR"/t/gcm");
	620	return (0);
	621	}
	622
	623	#endif
	624
	625	/----- That's all, folks -------------------------------------------------/