mdw@git.distorted.org.uk Git - catacomb/blame_incremental

... / ...

Commit	Line	Data
	1	/* --c--
	2	*
	3	* The GCM authenticated encryption mode
	4	*
	5	* (c) 2017 Straylight/Edgeware
	6	*/
	7
	8	/----- Licensing notice --------------------------------------------------
	9	*
	10	* This file is part of Catacomb.
	11	*
	12	* Catacomb is free software: you can redistribute it and/or modify it
	13	* under the terms of the GNU Library General Public License as published
	14	* by the Free Software Foundation; either version 2 of the License, or
	15	* (at your option) any later version.
	16	*
	17	* Catacomb is distributed in the hope that it will be useful, but
	18	* WITHOUT ANY WARRANTY; without even the implied warranty of
	19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	20	* Library General Public License for more details.
	21	*
	22	* You should have received a copy of the GNU Library General Public
	23	* License along with Catacomb. If not, write to the Free Software
	24	* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
	25	* USA.
	26	*/
	27
	28	/----- Header files ------------------------------------------------------/
	29
	30	#include "config.h"
	31
	32	#include <stdio.h>
	33
	34	#include <mLib/bits.h>
	35
	36	#include "dispatch.h"
	37	#include "gcm.h"
	38	#include "gcm-def.h"
	39
	40	/----- Overall strategy --------------------------------------------------
	41	*
	42	* GCM is pretty awful to implement in software. (This presentation is going
	43	* to be somewhat different to that in the specification, but I think it
	44	* makes more sense like this.)
	45	*
	46	* We're given a %$w$%-bit blockcipher %$E$% with a key %$K$%.
	47	*
	48	* The main part is arithmetic in the finite field %$k = \gf{2^w}$%, which we
	49	* represent as the quotient ring %$\gf{2}[t]/(p_w(t))$% for some irreducible
	50	* degree-%$w$% polynomial %$p(t)$%, whose precise value isn't very important
	51	* right now. We choose a secret point %$x = E_K(0^w)$%.
	52	*
	53	* We choose a length size %$z$% as follows: if %$w < 96%$ then %$z = w$%;
	54	* otherwise %$z = w/2$%. Format a message pair as follows:
	55	*
	56	* %$F(a, b) = P_w(a) \cat P_w(b) \cat [\ell(a)]_z \cat [\ell(b)]_z$%
	57	*
	58	* where %$P_w(x) = x \cat 0^n$% where $%0 \le n < w$% such that
	59	* %$\ell(x) + n \equiv 0 \pmod{w}$%.
	60	*
	61	* Hash a (block-aligned) message %$u$% as follows. First, split %$u$% into
	62	* %$w$%-bit blocks %$u_0$%, %$u_1$%, %%\ldots%%, %$u_{n-1}$%. Interpret
	63	* these as elements of %$k$%. Then
	64	*
	65	* %$G_x(u) = u_0 t^n + u_1 t^{n-1} + \cdots + u_{n-1} t$%
	66	*
	67	* converted back to a %$w$%-bit string.
	68	*
	69	* We're ready to go now. Suppose we're to encrypt a message %$M$% with
	70	* header %$H$% and nonce %$N$%. If %$\ell(N) + 32 = w$% then let
	71	* %$N' = N$% and let %$i_0 = 1$%; otherwise, let %$U = G_t(F(\epsilon, N))$%
	72	* and split this into %$N' = U[0 \bitsto w - 32]$% and
	73	* %$[i_0]_{32} = U[w - 32 \bitsto w]$%.
	74	*
	75	* Let %$n = \lceil \ell(M)/w \rceil$%. Compute
	76	*
	77	* %$y_j = E_K(N' \cat [i_0 + j]_{32})$%
	78	*
	79	* for %$0 \le j \le n$%. Let
	80	*
	81	* %$s = (y_1 \cat y_2 \cat \cdots \cat y_n)[0 \bitsto \ell(M)$%
	82	*
	83	* Let %$C = M \xor s$% and let %$T = G_x(F(H, C)) \xor y_0$%. These are the
	84	* ciphertext and tag respectively.
	85	*
	86	* So why is this awful?
	87	*
	88	* For one thing, the bits are in a completely terrible order. The bytes are
	89	* arranged in little-endian order, so the unit coefficient is in the first
	90	* byte, and the degree-127 coefficient is in the last byte. But within each
	91	* byte, the lowest-degree coefficient is in the most significant bit. It's
	92	* therefore better to think of GCM as using a big-endian byte-ordering
	93	* convention, but with the bits backwards.
	94	*
	95	* But messing about with byte ordering is expensive, so let's not do that in
	96	* the inner loop. But multiplication in %$k$% is not easy either. Some
	97	* kind of precomputed table would be nice, but that will leak secrets
	98	* through the cache.
	99	*
	100	* I choose a particularly simple table: given %$x$%, let %$X[i'] = x t^i$%.
	101	* Then $%$x y = \sum_{0\le i<w} y_i X[i']$% which is just a bunch of
	102	* bitmasking. But the natural order for examining bits of %$y$% is not
	103	* necessarily the obvious one. We'll have already loaded %$y$% into
	104	* internal form, as 32-bit words. The good order to process these is left
	105	* to right, from high to low bits. But now the order of degrees depends on
	106	* the endianness of our conversion of bytes to words. Oh, well.
	107	*
	108	* If we've adopted a big-endian convention, then we'll see the degrees in
	109	* order, 0, 1, ..., all the way up to %$w - 1$% and everything is fine. If
	110	* we've adopted a little-endian convention, though, we'll see an ordering
	111	* like this:
	112	*
	113	* 24, 25, ..., 31, 16, 17, ..., 23, 8, 9, ..., 15, 0, 1, ..., 7,
	114	* 56, 57, ..., 63, 48, 49, ..., 55, 40, 41, ..., 47, 32, 33, ..., 39,
	115	* etc.
	116	*
	117	* which is the ordinary order with 0x18 = 24 XORed into the index. That is,
	118	* %$i' = i$% if we've adopted a big-endian convention, and
	119	* %$i' = i \xor 24$% if we've adopted a little-endian convention.
	120	*/
	121
	122	/----- Low-level utilities -----------------------------------------------/
	123
	124	/* --- @mult@ --- *
	125	*
	126	* Arguments: @const gcm_params *p@ = pointer to the parameters
	127	* @uint32 *z@ = where to write the result
	128	* @const uint32 *x@ = input field element
	129	*
	130	* Returns: ---
	131	*
	132	* Use: Multiply the input field element by %$t$%, and write the
	133	* product to @z@. It's safe for @x@ and @z@ to be equal, but
	134	* they should not otherwise overlap. Both input and output are
	135	* in big-endian form, i.e., with the lowest-degree coefficients
	136	* in the most significant bits.
	137	*/
	138
	139	static void mult(const gcm_params p, uint32 z, const uint32 *x)
	140	{
	141	uint32 m, c, t;
	142	unsigned i;
	143
	144	t = x[p->n - 1]; m = -(t&1u); c = m&p->poly;
	145	for (i = 0; i < p->n; i++) { t = x[i]; z[i] = (t >> 1) ^ c; c = t << 31; }
	146	}
	147
	148	/* --- @mul@ --- *
	149	*
	150	* Arguments: @const gcm_params *p@ = pointer to the parameters
	151	* @uint32 *z@ = where to write the result
	152	* @const uint32 x, y@ = input field elements
	153	*
	154	* Returns: ---
	155	*
	156	* Use: Multiply the input field elements together, and write the
	157	* product to @z@. It's safe for the operands to overlap. Both
	158	* inputs and the output are in big-endian form, i.e., with the
	159	* lowest-degree coefficients in the most significant bits.
	160	*/
	161
	162	static void mul(const gcm_params p, uint32 z,
	163	const uint32 x, const uint32 y)
	164	{
	165	uint32 m, t, u[GCM_NMAX], v[GCM_NMAX];
	166	unsigned i, j, k;
	167
	168	/* We can't do this in-place at all, so use temporary space. Make a copy
	169	* of @x@ in @u@, where we can clobber it, and build the product in @v@.
	170	*/
	171	for (i = 0; i < p->n; i++) { u[i] = x[i]; v[i] = 0; }
	172
	173	/* Repeatedly multiply @x@ (in @u@) by %$t$%, and add together those
	174	* %$x t^i$% selected by the bits of @y@. This is basically what you get
	175	* by streaming the result of @gcm_mktable@ into @gcm_mulk_...@.
	176	*/
	177	for (i = 0; i < p->n; i++) {
	178	t = y[i];
	179	for (j = 0; j < 32; j++) {
	180	m = -((t >> 31)&1u);
	181	for (k = 0; k < p->n; k++) v[k] ^= u[k]&m;
	182	mult(p, u, u); t <<= 1;
	183	}
	184	}
	185
	186	/* Write out the result now that it's ready. */
	187	for (i = 0; i < p->n; i++) z[i] = v[i];
	188	}
	189
	190	/----- Table-based multiplication ----------------------------------------/
	191
	192	/* --- @gcm_mktable@ --- *
	193	*
	194	* Arguments: @const gcm_params *p@ = pointer to the parameters
	195	* @uint32 *ktab@ = where to write the table; there must be
	196	* space for %$32 n$% $%n$%-word entries, i.e.,
	197	* %$32 n^2$% 32-bit words in total, where %$n$% is
	198	* @p->n@, the block size in words
	199	* @const uint32 *k@ = input field element
	200	*
	201	* Returns: ---
	202	*
	203	* Use: Construct a table for use by @gcm_mulk_...@ below, to
	204	* multiply (vaguely) efficiently by @k@.
	205	*/
	206
	207	static void simple_mktable(const gcm_params *p,
	208	uint32 ktab, const uint32 k)
	209	{
	210	unsigned m = (p->f&GCMF_SWAP ? 0x18 : 0);
	211	unsigned i, j, o = m*p->n;
	212
	213	/* As described above, the table stores entries %$K[i \xor m] = k t^i$%,
	214	* where %$m = 0$% (big-endian cipher) or %$m = 24$% (little-endian).
	215	* The first job is to store %$K[m] = k$%.
	216	*
	217	* We initially build the table with the entries in big-endian order, and
	218	* then swap them if necessary. This makes the arithmetic functions more
	219	* amenable for use by @gcm_concat@ below.
	220	*/
	221	if (!(p->f&GCMF_SWAP)) for (i = 0; i < p->n; i++) ktab[o + i] = k[i];
	222	else for (i = 0; i < p->n; i++) ktab[o + i] = ENDSWAP32(k[i]);
	223
	224	/* Fill in the rest of the table by repeatedly multiplying the previous
	225	* entry by %$t$%.
	226	*/
	227	for (i = 1; i < 32*p->n; i++)
	228	{ j = (i ^ m)*p->n; mult(p, ktab + j, ktab + o); o = j; }
	229
	230	/* Finally, if the cipher uses a little-endian convention, then swap all of
	231	* the individual words.
	232	*/
	233	if (p->f&GCMF_SWAP)
	234	for (i = 0; i < 32p->np->n; i++) ktab[i] = ENDSWAP32(ktab[i]);
	235	}
	236
	237	#if CPUFAM_X86 \|\| CPUFAM_AMD64
	238	static void pclmul_mktable(const gcm_params *p,
	239	uint32 ktab, const uint32 k)
	240	{
	241	unsigned n = p->n;
	242	unsigned nz;
	243	uint32 *t;
	244
	245	/* We just need to store the value in a way which is convenient for the
	246	* assembler code to read back. That involves reordering the words, and,
	247	* in the case of 96-bit blocks, padding with zeroes to fill out a 128-bit
	248	* chunk.
	249	*/
	250
	251	if (n == 3) nz = 1;
	252	else nz = 0;
	253	t = ktab + n + nz;
	254
	255	if (p->f&GCMF_SWAP) while (n--) { --t = ENDSWAP32(k); k++; }
	256	else while (n--) --t = k++;
	257	while (nz--) *--t = 0;
	258	}
	259	#endif
	260
	261	#if CPUFAM_ARMEL
	262	static void arm_crypto_mktable(const gcm_params *p,
	263	uint32 ktab, const uint32 k)
	264	{
	265	unsigned n = p->n;
	266	uint32 *t;
	267
	268	/* We just need to store the value in a way which is convenient for the
	269	* assembler code to read back. That involves swapping the bytes in each
	270	* 64-bit lane.
	271	*/
	272
	273	t = ktab;
	274	if (p->f&GCMF_SWAP) {
	275	while (n >= 2) {
	276	t[1] = ENDSWAP32(k[0]); t[0] = ENDSWAP32(k[1]);
	277	t += 2; k += 2; n -= 2;
	278	}
	279	if (n) { t[1] = ENDSWAP32(k[0]); t[0] = 0; }
	280	} else {
	281	while (n >= 2) {
	282	t[1] = k[0]; t[0] = k[1];
	283	t += 2; k += 2; n -= 2;
	284	}
	285	if (n) { t[1] = k[0]; t[0] = 0; }
	286	}
	287	}
	288	#endif
	289
	290	#if CPUFAM_ARM64
	291	static uint32 rbit32(uint32 x)
	292	{
	293	uint32 z, t;
	294
	295	#if GCC_VERSION_P(4, 3)
	296	/* Two tricks here. Firstly, two separate steps, rather than a single
	297	* block of assembler, to allow finer-grained instruction scheduling.
	298	* Secondly, use `ENDSWAP32' so that the compiler can cancel it if the
	299	* caller actually wants the bytes reordered.
	300	*/
	301	__asm__("rbit %w0, %w1" : "=r"(t) : "r"(x));
	302	z = ENDSWAP32(t);
	303	#else
	304	/* A generic but slightly clever implementation. */
	305	# define SWIZZLE(x, m, s) ((((x)&(m)) << (s)) \| (((x)&~(m)) >> (s)))
	306	/* 76543210 */
	307	t = SWIZZLE(x, 0x0f0f0f0f, 4); /* 32107654 -- swap nibbles */
	308	t = SWIZZLE(t, 0x33333333, 2); /* 10325476 -- swap bit pairs */
	309	z = SWIZZLE(t, 0x55555555, 1); /* 01234567 -- swap adjacent bits */
	310	# undef SWIZZLE
	311	#endif
	312	return (z);
	313	}
	314
	315	static void arm64_pmull_mktable(const gcm_params *p,
	316	uint32 ktab, const uint32 k)
	317	{
	318	unsigned n = p->n;
	319	uint32 *t;
	320
	321	/* We just need to store the value in a way which is convenient for the
	322	* assembler code to read back. That involves two transformations:
	323	*
	324	* * firstly, reversing the order of the bits in each byte; and,
	325	*
	326	* * secondly, storing two copies of each 64-bit chunk.
	327	*
	328	* Note that, in this case, we /want/ the little-endian byte order of GCM,
	329	* so endianness-swapping happens in the big-endian case.
	330	*/
	331
	332	t = ktab;
	333	if (p->f&GCMF_SWAP) {
	334	while (n >= 2) {
	335	t[0] = t[2] = rbit32(k[0]);
	336	t[1] = t[3] = rbit32(k[1]);
	337	t += 4; k += 2; n -= 2;
	338	}
	339	if (n) { t[0] = t[2] = rbit32(k[0]); t[1] = t[3] = 0; }
	340	} else {
	341	while (n >= 2) {
	342	t[0] = t[2] = ENDSWAP32(rbit32(k[0]));
	343	t[1] = t[3] = ENDSWAP32(rbit32(k[1]));
	344	t += 4; k += 2; n -= 2;
	345	}
	346	if (n) { t[0] = t[2] = ENDSWAP32(rbit32(k[0])); t[1] = t[3] = 0; }
	347	}
	348	}
	349	#endif
	350
	351	CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable,
	352	(const gcm_params p, uint32 ktab, const uint32 *k),
	353	(p, ktab, k),
	354	pick_mktable, simple_mktable)
	355
	356	static gcm_mktable__functype *pick_mktable(void)
	357	{
	358	#if CPUFAM_X86 \|\| CPUFAM_AMD64
	359	DISPATCH_PICK_COND(gcm_mktable, pclmul_mktable,
	360	cpu_feature_p(CPUFEAT_X86_SSSE3) &&
	361	cpu_feature_p(CPUFEAT_X86_PCLMUL));
	362	#endif
	363	#if CPUFAM_ARMEL
	364	DISPATCH_PICK_COND(gcm_mktable, arm_crypto_mktable,
	365	cpu_feature_p(CPUFEAT_ARM_PMULL));
	366	#endif
	367	#if CPUFAM_ARM64
	368	DISPATCH_PICK_COND(gcm_mktable, arm64_pmull_mktable,
	369	cpu_feature_p(CPUFEAT_ARM_PMULL));
	370	#endif
	371	DISPATCH_PICK_FALLBACK(gcm_mktable, simple_mktable);
	372	}
	373
	374	/* --- @recover_k@ --- *
	375	*
	376	* Arguments: @const gcm_params *p@ = pointer to the parameters
	377	* @uint32 *k@ = block-sized vector in which to store %$k$%
	378	* @const uint32 *ktab@ = the table encoding %$k$%
	379	*
	380	* Returns: ---
	381	*
	382	* Use: Recovers %$k$%, the secret from which @ktab@ was by
	383	* @gcm_mktable@, from the table, and stores it in internal
	384	* (big-endian) form in @k@.
	385	*/
	386
	387	static void simple_recover_k(const gcm_params *p,
	388	uint32 k, const uint32 ktab)
	389	{
	390	unsigned i;
	391
	392	/* If the blockcipher is big-endian, then the key is simply in the first
	393	* table element, in the right format. If the blockcipher is little-endian
	394	* then it's in element 24, and the bytes need swapping.
	395	*/
	396
	397	if (!(p->f&GCMF_SWAP)) for (i = 0; i < p->n; i++) k[i] = ktab[i];
	398	else for (i = 0; i < p->n; i++) k[i] = ENDSWAP32(ktab[24*p->n + i]);
	399	}
	400
	401	#if CPUFAM_X86 \|\| CPUFAM_AMD64
	402	static void pclmul_recover_k(const gcm_params *p,
	403	uint32 k, const uint32 ktab)
	404	{
	405	unsigned n = p->n;
	406	unsigned nz;
	407	const uint32 *t;
	408
	409	/* The representation is already independent of the blockcipher endianness.
	410	* We need to compensate for padding, and reorder the words.
	411	*/
	412
	413	if (n == 3) nz = 1; else nz = 0;
	414	t = ktab + n + nz;
	415	while (n--) k++ = --t;
	416	}
	417	#endif
	418
	419	#if CPUFAM_ARMEL
	420	static void arm_crypto_recover_k(const gcm_params *p,
	421	uint32 k, const uint32 ktab)
	422	{
	423	unsigned n = p->n;
	424	const uint32 *t;
	425
	426	/* The representation is already independent of the blockcipher endianness.
	427	* We only need to reorder the words.
	428	*/
	429
	430	t = ktab;
	431	while (n >= 2) { k[1] = t[0]; k[0] = t[1]; t += 2; k += 2; n -= 2; }
	432	if (n) k[0] = t[1];
	433	}
	434	#endif
	435
	436	#if CPUFAM_ARM64
	437	static void arm64_pmull_recover_k(const gcm_params *p,
	438	uint32 k, const uint32 ktab)
	439	{
	440	unsigned n = p->n;
	441	const uint32 *t;
	442
	443	/* The representation is already independent of the blockcipher endianness.
	444	* We need to skip the duplicate pieces, and unscramble the bytes.
	445	*/
	446
	447	t = ktab;
	448	while (n >= 2) {
	449	k[0] = ENDSWAP32(rbit32(t[0]));
	450	k[1] = ENDSWAP32(rbit32(t[1]));
	451	t += 4; k += 2; n -= 2;
	452	}
	453	if (n) k[0] = ENDSWAP32(rbit32(t[0]));
	454	}
	455	#endif
	456
	457	CPU_DISPATCH(static, EMPTY, void, recover_k,
	458	(const gcm_params p, uint32 k, const uint32 *ktab),
	459	(p, k, ktab),
	460	pick_recover_k, simple_recover_k)
	461
	462	static recover_k__functype *pick_recover_k(void)
	463	{
	464	#if CPUFAM_X86 \|\| CPUFAM_AMD64
	465	DISPATCH_PICK_COND(recover_k, pclmul_recover_k,
	466	cpu_feature_p(CPUFEAT_X86_SSSE3) &&
	467	cpu_feature_p(CPUFEAT_X86_PCLMUL));
	468	#endif
	469	#if CPUFAM_ARMEL
	470	DISPATCH_PICK_COND(recover_k, arm_crypto_recover_k,
	471	cpu_feature_p(CPUFEAT_ARM_PMULL));
	472	#endif
	473	#if CPUFAM_ARM64
	474	DISPATCH_PICK_COND(recover_k, arm64_pmull_recover_k,
	475	cpu_feature_p(CPUFEAT_ARM_PMULL));
	476	#endif
	477	DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k);
	478	}
	479
	480	/* --- @gcm_mulk_N{b,l}@ --- *
	481	*
	482	* Arguments: @uint32 *a@ = accumulator to multiply
	483	* @const uint32 *ktab@ = table constructed by @gcm_mktable@
	484	*
	485	* Returns: ---
	486	*
	487	* Use: Multiply @a@ by @k@ (implicitly represented in @ktab@),
	488	* updating @a@ in-place. There are separate functions for each
	489	* supported block size and endianness because this is the
	490	* function whose performance actually matters.
	491	*/
	492
	493	#if CPUFAM_X86 \|\| CPUFAM_AMD64
	494	# define DECL_MULK_X86ISH(var) extern gcm_mulk_##var##__functype \
	495	gcm_mulk_##var##_x86ish_pclmul_avx, \
	496	gcm_mulk_##var##_x86ish_pclmul;
	497	# define PICK_MULK_X86ISH(var) do { \
	498	DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul_avx, \
	499	cpu_feature_p(CPUFEAT_X86_AVX) && \
	500	cpu_feature_p(CPUFEAT_X86_PCLMUL) && \
	501	cpu_feature_p(CPUFEAT_X86_SSSE3)); \
	502	DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul, \
	503	cpu_feature_p(CPUFEAT_X86_PCLMUL) && \
	504	cpu_feature_p(CPUFEAT_X86_SSSE3)); \
	505	} while (0)
	506	#else
	507	# define DECL_MULK_X86ISH(var)
	508	# define PICK_MULK_X86ISH(var) do ; while (0)
	509	#endif
	510
	511	#if CPUFAM_ARMEL
	512	# define DECL_MULK_ARM(var) \
	513	extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm_crypto;
	514	# define PICK_MULK_ARM(var) do { \
	515	DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm_crypto, \
	516	cpu_feature_p(CPUFEAT_ARM_PMULL)); \
	517	} while (0)
	518	#else
	519	# define DECL_MULK_ARM(var)
	520	# define PICK_MULK_ARM(var) do ; while (0)
	521	#endif
	522
	523	#if CPUFAM_ARM64
	524	# define DECL_MULK_ARM64(var) \
	525	extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm64_pmull;
	526	# define PICK_MULK_ARM64(var) do { \
	527	DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm64_pmull, \
	528	cpu_feature_p(CPUFEAT_ARM_PMULL)); \
	529	} while (0)
	530	#else
	531	# define DECL_MULK_ARM64(var)
	532	# define PICK_MULK_ARM64(var) do ; while (0)
	533	#endif
	534
	535	#define DEF_MULK(nbits) \
	536	\
	537	CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##b, \
	538	(uint32 a, const uint32 ktab), (a, ktab), \
	539	pick_mulk_##nbits##b, simple_mulk_##nbits) \
	540	CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##l, \
	541	(uint32 a, const uint32 ktab), (a, ktab), \
	542	pick_mulk_##nbits##l, simple_mulk_##nbits) \
	543	\
	544	static void simple_mulk_##nbits(uint32 a, const uint32 ktab) \
	545	{ \
	546	uint32 m, t; \
	547	uint32 z[nbits/32]; \
	548	unsigned i, j, k; \
	549	\
	550	for (i = 0; i < nbits/32; i++) z[i] = 0; \
	551	\
	552	for (i = 0; i < nbits/32; i++) { \
	553	t = a[i]; \
	554	for (j = 0; j < 32; j++) { \
	555	m = -((t >> 31)&1u); \
	556	for (k = 0; k < nbits/32; k++) z[k] ^= *ktab++&m; \
	557	t <<= 1; \
	558	} \
	559	} \
	560	\
	561	for (i = 0; i < nbits/32; i++) a[i] = z[i]; \
	562	} \
	563	\
	564	DECL_MULK_X86ISH(nbits##b) \
	565	DECL_MULK_ARM(nbits##b) \
	566	DECL_MULK_ARM64(nbits##b) \
	567	static gcm_mulk_##nbits##b##__functype *pick_mulk_##nbits##b(void) \
	568	{ \
	569	PICK_MULK_X86ISH(nbits##b); \
	570	PICK_MULK_ARM(nbits##b); \
	571	PICK_MULK_ARM64(nbits##b); \
	572	DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits); \
	573	} \
	574	\
	575	DECL_MULK_X86ISH(nbits##l) \
	576	DECL_MULK_ARM(nbits##l) \
	577	DECL_MULK_ARM64(nbits##l) \
	578	static gcm_mulk_##nbits##l##__functype *pick_mulk_##nbits##l(void) \
	579	{ \
	580	PICK_MULK_X86ISH(nbits##l); \
	581	PICK_MULK_ARM(nbits##l); \
	582	PICK_MULK_ARM64(nbits##l); \
	583	DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits); \
	584	}
	585
	586	GCM_WIDTHS(DEF_MULK)
	587
	588	#define GCM_MULK_CASE(nbits) \
	589	case nbits/32: \
	590	if (_f&GCMF_SWAP) gcm_mulk_##nbits##l(_a, _ktab); \
	591	else gcm_mulk_##nbits##b(_a, _ktab); \
	592	break;
	593	#define MULK(n, f, a, ktab) do { \
	594	uint32 _a = (a); const uint32 _ktab = (ktab); \
	595	unsigned _f = (f); \
	596	switch (n) { \
	597	GCM_WIDTHS(GCM_MULK_CASE) \
	598	default: abort(); \
	599	} \
	600	} while (0)
	601
	602	/----- Other utilities ---------------------------------------------------/
	603
	604	/* --- @putlen@ --- *
	605	*
	606	* Arguments: @octet *p@ = pointer to output buffer
	607	* @unsigned w@ = size of output buffer
	608	* @unsigned blksz@ = block size (assumed fairly small)
	609	* @unsigned long nblocks@ = number of blocks
	610	* @unsigned nbytes@ = tail size in bytes (assumed small)
	611	*
	612	* Returns: ---
	613	*
	614	* Use: Store the overall length in %$\emph{bits}$% (i.e.,
	615	* @3(nblocksblksz + nbytes)@ in big-endian form in the
	616	* buffer @p@.
	617	*/
	618
	619	static void putlen(octet *p, unsigned w, unsigned blksz,
	620	unsigned long nblocks, unsigned nbytes)
	621	{
	622	unsigned long nblo = nblocks&((1ul << (ULONG_BITS/2)) - 1),
	623	nbhi = nblocks >> ULONG_BITS/2;
	624	unsigned long nlo = nbloblksz + nbytes, nhi = nbhiblksz;
	625
	626	/* This is fiddly. Split @nblocks@, which is the big number, into high and
	627	* low halves, multiply those separately by @blksz@, propagate carries, and
	628	* then multiply by eight.
	629	*/
	630	nhi += nlo >> ULONG_BITS/2;
	631	nlo &= (1ul << (ULONG_BITS/2)) - 1;
	632	nlo <<= 3;
	633
	634	/* Now write out the size, feeding bits in from @nhi@ as necessary. */
	635	p += w;
	636	while (w--) {
	637	*--p = U8(nlo);
	638	nlo = (nlo >> 8) \| ((nhi&0xff) << (ULONG_BITS/2 - 5));
	639	nhi >>= 8;
	640	}
	641	}
	642
	643	/* --- @mix@ --- *
	644	*
	645	* Arguments: @const gcm_params *p@ = pointer to the parameters
	646	* @uint32 *a@ = GHASH accumulator
	647	* @const octet *q@ = pointer to an input block
	648	* @const uint32 *ktab@ = multiplication table, built by
	649	* @gcm_mktable@
	650	*
	651	* Returns: ---
	652	*
	653	* Use: Fold the block @q@ into the GHASH accumulator. The
	654	* calculation is %$a' = k (a + q)$%.
	655	*/
	656
	657	static void mix(const gcm_params p, uint32 a,
	658	const octet q, const uint32 ktab)
	659	{
	660	unsigned i;
	661
	662	if (p->f&GCMF_SWAP)
	663	for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_L(q); q += 4; }
	664	else
	665	for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_B(q); q += 4; }
	666	MULK(p->n, p->f, a, ktab);
	667	}
	668
	669	/* --- @gcm_ghashdone@ --- *
	670	*
	671	* Arguments: @const gcm_params *p@ = pointer to the parameters
	672	* @uint32 *a@ = GHASH accumulator
	673	* @const uint32 *ktab@ = multiplication table, built by
	674	* @gcm_mktable@
	675	* @unsigned long xblocks, yblocks@ = number of whole blocks in
	676	* the two inputs
	677	* @unsigned xbytes, ybytes@ = number of trailing bytes in the
	678	* two inputs
	679	*
	680	* Returns: ---
	681	*
	682	* Use: Finishes a GHASH operation by appending the appropriately
	683	* encoded lengths of the two constituent messages.
	684	*/
	685
	686	void gcm_ghashdone(const gcm_params p, uint32 a, const uint32 *ktab,
	687	unsigned long xblocks, unsigned xbytes,
	688	unsigned long yblocks, unsigned ybytes)
	689	{
	690	octet b[4*GCM_NMAX];
	691	unsigned w = p->n < 3 ? 4p->n : 2p->n;
	692
	693	/* Construct the encoded lengths. Note that smaller-block versions of GCM
	694	* encode the lengths in separate blocks. GCM is only officially defined
	695	* for 64- and 128-bit blocks; I've placed the cutoff somewhat arbitrarily
	696	* at 96 bits.
	697	*/
	698	putlen(b, w, 4*p->n, xblocks, xbytes);
	699	putlen(b + w, w, 4*p->n, yblocks, ybytes);
	700
	701	/* Feed the lengths into the accumulator. */
	702	mix(p, a, b, ktab);
	703	if (p->n < 3) mix(p, a, b + w, ktab);
	704	}
	705
	706	/* --- @gcm_concat@ --- *
	707	*
	708	* Arguments: @const gcm_params *p@ = pointer to the parameters
	709	* @uint32 *z@ = GHASH accumulator for suffix, updated
	710	* @const uint32 *x@ = GHASH accumulator for prefix
	711	* @const uint32 *ktab@ = multiplication table, built by
	712	* @gcm_mktable@
	713	* @unsigned long n@ = length of suffix in whole blocks
	714	*
	715	* Returns: ---
	716	*
	717	* Use: On entry, @x@ and @z@ are the results of hashing two strings
	718	* %$a$% and %$b$%, each a whole number of blocks long; in
	719	* particular, %$b$% is @n@ blocks long. On exit, @z@ is
	720	* updated to be the hash of %$a \cat b$%.
	721	*/
	722
	723	void gcm_concat(const gcm_params p, uint32 z, const uint32 *x,
	724	const uint32 *ktab, unsigned long n)
	725	{
	726	uint32 t[GCM_NMAX], u[GCM_NMAX];
	727	unsigned i, j;
	728
	729	if (!n) {
	730	/* If @n@ is zero, then there's not much to do. The mathematics
	731	* (explained below) still works, but the code takes a shortcut which
	732	* doesn't handle this case: so set %$z' = z + x k^n = z + x$%.
	733	*/
	734
	735	for (j = 0; j < p->n; j++) z[j] ^= x[j];
	736	} else {
	737	/* We have %$x = a_0 t^m + \cdots + a_{m-2} t^2 + a_{m-1} t$% and
	738	* %$z = b_0 t^n + \cdots + b_{n-2} t^2 + b_{n-1} t$%. What we'd like is
	739	* the hash of %$a \cat b$%, which is %$z + x k^n$%.
	740	*
	741	* The first job, then, is to calculate %$k^n$%, and for this we use a
	742	* simple left-to-right square-and-multiply algorithm. There's no need
	743	* to keep %$n$% secret here.
	744	*/
	745
	746	/* Start by retrieving %$k$% from the table, and convert it to big-endian
	747	* form.
	748	*/
	749	recover_k(p, u, ktab);
	750
	751	/* Now calculate %$k^n$%. */
	752	i = ULONG_BITS;
	753	#define BIT (1ul << (ULONG_BITS - 1))
	754	while (!(n&BIT)) { n <<= 1; i--; }
	755	n <<= 1; i--; for (j = 0; j < p->n; j++) t[j] = u[j];
	756	while (i--) { mul(p, t, t, t); if (n&BIT) mul(p, t, t, u); n <<= 1; }
	757	#undef BIT
	758
	759	/* Next, calculate %$x k^n$%. If we're using a little-endian convention
	760	* then we must convert %$x$%; otherwise we can just use it in place.
	761	*/
	762	if (!(p->f&GCMF_SWAP))
	763	mul(p, t, t, x);
	764	else {
	765	for (j = 0; j < p->n; j++) u[j] = ENDSWAP32(x[j]);
	766	mul(p, t, t, u);
	767	}
	768
	769	/* Finally, add %$x k^n$% onto %$z$%, converting back to little-endian if
	770	* necessary.
	771	*/
	772	if (!(p->f&GCMF_SWAP)) for (j = 0; j < p->n; j++) z[j] ^= t[j];
	773	else for (j = 0; j < p->n; j++) z[j] ^= ENDSWAP32(t[j]);
	774	}
	775	}
	776
	777	/----- Test rig ----------------------------------------------------------/
	778
	779	#ifdef TEST_RIG
	780
	781	#include <mLib/macros.h>
	782	#include <mLib/quis.h>
	783	#include <mLib/testrig.h>
	784
	785	#ifdef ENABLE_ASM_DEBUG
	786	# include "regdump.h"
	787	#endif
	788
	789	static void report_failure(const char *test, unsigned nbits,
	790	const char ref, dstr v[], dstr d)
	791	{
	792	printf("test %s failed (nbits = %u)", test, nbits);
	793	printf("\n\tx = "); type_hex.dump(&v[0], stdout);
	794	printf("\n\ty = "); type_hex.dump(&v[1], stdout);
	795	printf("\n\tz = "); type_hex.dump(&v[2], stdout);
	796	printf("\n\t%s' = ", ref); type_hex.dump(d, stdout);
	797	putchar('\n');
	798	}
	799
	800	static void mulk(unsigned nbits, unsigned f, uint32 x, const uint32 ktab)
	801	{ MULK(nbits/32, f, x, ktab); }
	802
	803	static int test_mul(uint32 poly, dstr v[])
	804	{
	805	uint32 x[GCM_NMAX], y[GCM_NMAX], z[GCM_NMAX], ktab[32GCM_NMAXGCM_NMAX];
	806	gcm_params p;
	807	dstr d = DSTR_INIT;
	808	unsigned i, nbits;
	809	int ok = 1;
	810	enum { I_x, I_y, I_z };
	811
	812	nbits = 8*v[0].len; p.f = 0; p.n = nbits/32; p.poly = poly;
	813	dstr_ensure(&d, nbits/8); d.len = nbits/8;
	814
	815	#define LOADXY(E) do { \
	816	for (i = 0; i < nbits/32; i++) { \
	817	x[i] = LOAD32_##E(v[I_x].buf + 4*i); \
	818	y[i] = LOAD32_##E(v[I_y].buf + 4*i); \
	819	} \
	820	} while (0)
	821
	822	#define INITZ(x) do { \
	823	for (i = 0; i < nbits/32; i++) z[i] = (x)[i]; \
	824	} while (0)
	825
	826	#define CHECK(E, what, ref) do { \
	827	for (i = 0; i < nbits/32; i++) STORE32_##E(d.buf + 4*i, z[i]); \
	828	if (MEMCMP(d.buf, !=, v[I_##ref].buf, nbits/8)) \
	829	{ ok = 0; report_failure(what, nbits, #ref, v, &d); } \
	830	} while (0)
	831
	832	#define TEST_PREP_1(E, x, y, what) do { \
	833	gcm_mktable(&p, ktab, y); \
	834	recover_k(&p, z, ktab); CHECK(B, "mktable/recover_k (" #y ")", y); \
	835	INITZ(x); mulk(nbits, p.f, z, ktab); CHECK(E, what " (k = " #y ")", z); \
	836	} while (0)
	837
	838	#define TEST_PREP(E, what) do { \
	839	TEST_PREP_1(E, x, y, what); \
	840	TEST_PREP_1(E, y, x, what); \
	841	} while (0)
	842
	843	/* First, test plain multiply. */
	844	LOADXY(B); mul(&p, z, x, y); CHECK(B, "gcm_mul", z);
	845
	846	/* Next, test big-endian prepared key. */
	847	LOADXY(B); TEST_PREP(B, "gcm_kmul_b");
	848
	849	/* Finally, test little-endian prepared key. */
	850	p.f = GCMF_SWAP; LOADXY(L);
	851	TEST_PREP(L, "gcm_kmul_l");
	852
	853	#undef LOADXY
	854	#undef INITZ
	855	#undef CHECK
	856	#undef TEST_PREP_1
	857	#undef TEST_PREP
	858
	859	/* All done. */
	860	return (ok);
	861	}
	862
	863	#define TEST(nbits) \
	864	static int test_mul_##nbits(dstr v[]) \
	865	{ return (test_mul(GCM_POLY_##nbits, v)); }
	866	GCM_WIDTHS(TEST)
	867	#undef TEST
	868
	869	static test_chunk defs[] = {
	870	#define TEST(nbits) \
	871	{ "gcm-mul" #nbits, test_mul_##nbits, \
	872	{ &type_hex, &type_hex, &type_hex, 0 } },
	873	GCM_WIDTHS(TEST)
	874	#undef TEST
	875	{ 0, 0, { 0 } }
	876	};
	877
	878	int main(int argc, char *argv[])
	879	{
	880	ego(argv[0]);
	881	#ifdef ENABLE_ASM_DEBUG
	882	regdump_init();
	883	#endif
	884	test_run(argc, argv, defs, SRCDIR"/t/gcm");
	885	return (0);
	886	}
	887
	888	#endif
	889
	890	/----- That's all, folks -------------------------------------------------/