mdw@git.distorted.org.uk Git - catacomb/blame

Commit	Line	Data
50df5733 MW	1	/* --c--
	2	*
	3	* The GCM authenticated encryption mode
	4	*
	5	* (c) 2017 Straylight/Edgeware
	6	*/
	7
	8	/----- Licensing notice --------------------------------------------------
	9	*
	10	* This file is part of Catacomb.
	11	*
	12	* Catacomb is free software: you can redistribute it and/or modify it
	13	* under the terms of the GNU Library General Public License as published
	14	* by the Free Software Foundation; either version 2 of the License, or
	15	* (at your option) any later version.
	16	*
	17	* Catacomb is distributed in the hope that it will be useful, but
	18	* WITHOUT ANY WARRANTY; without even the implied warranty of
	19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	20	* Library General Public License for more details.
	21	*
	22	* You should have received a copy of the GNU Library General Public
	23	* License along with Catacomb. If not, write to the Free Software
	24	* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
	25	* USA.
	26	*/
	27
	28	/----- Header files ------------------------------------------------------/
	29
	30	#include "config.h"
	31
	32	#include <stdio.h>
	33
	34	#include <mLib/bits.h>
	35
1d30a9b9	36	#include "dispatch.h"
50df5733 MW	37	#include "gcm.h"
	38	#include "gcm-def.h"
	39
	40	/----- Overall strategy --------------------------------------------------
	41	*
	42	* GCM is pretty awful to implement in software. (This presentation is going
	43	* to be somewhat different to that in the specification, but I think it
	44	* makes more sense like this.)
	45	*
	46	* We're given a %$w$%-bit blockcipher %$E$% with a key %$K$%.
	47	*
	48	* The main part is arithmetic in the finite field %$k = \gf{2^w}$%, which we
	49	* represent as the quotient ring %$\gf{2}[t]/(p_w(t))$% for some irreducible
	50	* degree-%$w$% polynomial %$p(t)$%, whose precise value isn't very important
	51	* right now. We choose a secret point %$x = E_K(0^w)$%.
	52	*
	53	* We choose a length size %$z$% as follows: if %$w < 96%$ then %$z = w$%;
	54	* otherwise %$z = w/2$%. Format a message pair as follows:
	55	*
	56	* %$F(a, b) = P_w(a) \cat P_w(b) \cat [\ell(a)]_z \cat [\ell(b)]_z$%
	57	*
	58	* where %$P_w(x) = x \cat 0^n$% where $%0 \le n < w$% such that
	59	* %$\ell(x) + n \equiv 0 \pmod{w}$%.
	60	*
	61	* Hash a (block-aligned) message %$u$% as follows. First, split %$u$% into
	62	* %$w$%-bit blocks %$u_0$%, %$u_1$%, %%\ldots%%, %$u_{n-1}$%. Interpret
	63	* these as elements of %$k$%. Then
	64	*
	65	* %$G_x(u) = u_0 t^n + u_1 t^{n-1} + \cdots + u_{n-1} t$%
	66	*
	67	* converted back to a %$w$%-bit string.
	68	*
	69	* We're ready to go now. Suppose we're to encrypt a message %$M$% with
	70	* header %$H$% and nonce %$N$%. If %$\ell(N) + 32 = w$% then let
	71	* %$N' = N$% and let %$i_0 = 1$%; otherwise, let %$U = G_t(F(\epsilon, N))$%
	72	* and split this into %$N' = U[0 \bitsto w - 32]$% and
	73	* %$[i_0]_{32} = U[w - 32 \bitsto w]$%.
	74	*
	75	* Let %$n = \lceil \ell(M)/w \rceil$%. Compute
	76	*
	77	* %$y_j = E_K(N' \cat [i_0 + j]_{32})$%
	78	*
	79	* for %$0 \le j \le n$%. Let
	80	*
	81	* %$s = (y_1 \cat y_2 \cat \cdots \cat y_n)[0 \bitsto \ell(M)$%
	82	*
	83	* Let %$C = M \xor s$% and let %$T = G_x(F(H, C)) \xor y_0$%. These are the
	84	* ciphertext and tag respectively.
	85	*
	86	* So why is this awful?
	87	*
	88	* For one thing, the bits are in a completely terrible order. The bytes are
	89	* arranged in little-endian order, so the unit coefficient is in the first
	90	* byte, and the degree-127 coefficient is in the last byte. But within each
	91	* byte, the lowest-degree coefficient is in the most significant bit. It's
	92	* therefore better to think of GCM as using a big-endian byte-ordering
	93	* convention, but with the bits backwards.
	94	*
	95	* But messing about with byte ordering is expensive, so let's not do that in
	96	* the inner loop. But multiplication in %$k$% is not easy either. Some
	97	* kind of precomputed table would be nice, but that will leak secrets
	98	* through the cache.
	99	*
	100	* I choose a particularly simple table: given %$x$%, let %$X[i'] = x t^i$%.
101	* Then $%$x y = \sum_{0\le i<w} y_i X[i']$% which is just a bunch of
102	* bitmasking. But the natural order for examining bits of %$y$% is not
103	* necessarily the obvious one. We'll have already loaded %$y$% into
104	* internal form, as 32-bit words. The good order to process these is left
105	* to right, from high to low bits. But now the order of degrees depends on
106	* the endianness of our conversion of bytes to words. Oh, well.
107	*
108	* If we've adopted a big-endian convention, then we'll see the degrees in
109	* order, 0, 1, ..., all the way up to %$w - 1$% and everything is fine. If
110	* we've adopted a little-endian convention, though, we'll see an ordering
111	* like this:
112	*
113	* 24, 25, ..., 31, 16, 17, ..., 23, 8, 9, ..., 15, 0, 1, ..., 7,
114	* 56, 57, ..., 63, 48, 49, ..., 55, 40, 41, ..., 47, 32, 33, ..., 39,
115	* etc.
116	*
117	* which is the ordinary order with 0x18 = 24 XORed into the index. That is,
118	* %$i' = i$% if we've adopted a big-endian convention, and
119	* %$i' = i \xor 24$% if we've adopted a little-endian convention.
120	*/
121
122	/----- Low-level utilities -----------------------------------------------/
123
ac4a43c1	124	/* --- @mult@, @divt@ --- *
50df5733 MW	125	*
	126	* Arguments: @const gcm_params *p@ = pointer to the parameters
	127	* @uint32 *z@ = where to write the result
	128	* @const uint32 *x@ = input field element
	129	*
	130	* Returns: ---
	131	*
ac4a43c1 MW	132	* Use: Multiply or divide the input field element by %$t$%, and
	133	* write the product or quotient to @z@. It's safe for @x@ and
	134	* @z@ to be equal, but they should not otherwise overlap. Both
	135	* input and output are in big-endian form, i.e., with the
	136	* lowest-degree coefficients in the most significant bits.
50df5733 MW	137	*/
	138
	139	static void mult(const gcm_params p, uint32 z, const uint32 *x)
	140	{
	141	uint32 m, c, t;
	142	unsigned i;
	143
	144	t = x[p->n - 1]; m = -(t&1u); c = m&p->poly;
	145	for (i = 0; i < p->n; i++) { t = x[i]; z[i] = (t >> 1) ^ c; c = t << 31; }
	146	}
	147
ac4a43c1 MW	148	#if CPUFAM_X86 \|\| CPUFAM_AMD64 \|\| CPUFAM_ARMEL
	149	static void divt(const gcm_params p, uint32 z, const uint32 *x)
	150	{
	151	uint32 m, c, t;
	152	unsigned i;
	153
	154	t = x[0]; m = -((t >> 31)&1u); c = m&1u;
	155	for (i = p->n - 1; i; i--) { t = x[i]; z[i] = (t << 1) \| c; c = t >> 31; }
	156	t = x[0]; z[0] = ((t ^ (m&p->poly)) << 1) \| c;
	157	}
	158	#endif
	159
50df5733 MW	160	/* --- @mul@ --- *
	161	*
	162	* Arguments: @const gcm_params *p@ = pointer to the parameters
	163	* @uint32 *z@ = where to write the result
	164	* @const uint32 x, y@ = input field elements
	165	*
	166	* Returns: ---
	167	*
	168	* Use: Multiply the input field elements together, and write the
	169	* product to @z@. It's safe for the operands to overlap. Both
	170	* inputs and the output are in big-endian form, i.e., with the
	171	* lowest-degree coefficients in the most significant bits.
	172	*/
	173
	174	static void mul(const gcm_params p, uint32 z,
	175	const uint32 x, const uint32 y)
	176	{
	177	uint32 m, t, u[GCM_NMAX], v[GCM_NMAX];
	178	unsigned i, j, k;
	179
	180	/* We can't do this in-place at all, so use temporary space. Make a copy
	181	* of @x@ in @u@, where we can clobber it, and build the product in @v@.
	182	*/
	183	for (i = 0; i < p->n; i++) { u[i] = x[i]; v[i] = 0; }
	184
	185	/* Repeatedly multiply @x@ (in @u@) by %$t$%, and add together those
	186	* %$x t^i$% selected by the bits of @y@. This is basically what you get
	187	* by streaming the result of @gcm_mktable@ into @gcm_mulk_...@.
	188	*/
	189	for (i = 0; i < p->n; i++) {
	190	t = y[i];
	191	for (j = 0; j < 32; j++) {
	192	m = -((t >> 31)&1u);
	193	for (k = 0; k < p->n; k++) v[k] ^= u[k]&m;
	194	mult(p, u, u); t <<= 1;
	195	}
	196	}
	197
	198	/* Write out the result now that it's ready. */
	199	for (i = 0; i < p->n; i++) z[i] = v[i];
	200	}
	201
	202	/----- Table-based multiplication ----------------------------------------/
	203
	204	/* --- @gcm_mktable@ --- *
	205	*
	206	* Arguments: @const gcm_params *p@ = pointer to the parameters
	207	* @uint32 *ktab@ = where to write the table; there must be
	208	* space for %$32 n$% $%n$%-word entries, i.e.,
	209	* %$32 n^2$% 32-bit words in total, where %$n$% is
	210	* @p->n@, the block size in words
	211	* @const uint32 *k@ = input field element
	212	*
	213	* Returns: ---
	214	*
	215	* Use: Construct a table for use by @gcm_mulk_...@ below, to
	216	* multiply (vaguely) efficiently by @k@.
	217	*/
	218
1d30a9b9 MW	219	static void simple_mktable(const gcm_params *p,
1d30a9b9 MW	220	uint32 ktab, const uint32 k)
50df5733 MW	221	{
	222	unsigned m = (p->f&GCMF_SWAP ? 0x18 : 0);
	223	unsigned i, j, o = m*p->n;
	224
	225	/* As described above, the table stores entries %$K[i \xor m] = k t^i$%,
	226	* where %$m = 0$% (big-endian cipher) or %$m = 24$% (little-endian).
	227	* The first job is to store %$K[m] = k$%.
	228	*
	229	* We initially build the table with the entries in big-endian order, and
	230	* then swap them if necessary. This makes the arithmetic functions more
	231	* amenable for use by @gcm_concat@ below.
	232	*/
	233	if (!(p->f&GCMF_SWAP)) for (i = 0; i < p->n; i++) ktab[o + i] = k[i];
	234	else for (i = 0; i < p->n; i++) ktab[o + i] = ENDSWAP32(k[i]);
	235
	236	/* Fill in the rest of the table by repeatedly multiplying the previous
	237	* entry by %$t$%.
	238	*/
	239	for (i = 1; i < 32*p->n; i++)
	240	{ j = (i ^ m)*p->n; mult(p, ktab + j, ktab + o); o = j; }
	241
	242	/* Finally, if the cipher uses a little-endian convention, then swap all of
	243	* the individual words.
	244	*/
	245	if (p->f&GCMF_SWAP)
	246	for (i = 0; i < 32p->np->n; i++) ktab[i] = ENDSWAP32(ktab[i]);
	247	}
	248
9e6a4409 MW	249	#if CPUFAM_X86 \|\| CPUFAM_AMD64
	250	static void pclmul_mktable(const gcm_params *p,
	251	uint32 ktab, const uint32 k)
	252	{
ac4a43c1	253	unsigned i, n = p->n;
9e6a4409	254	unsigned nz;
ac4a43c1	255	uint32 k_over_t[GCM_NMAX], *t;
9e6a4409	256
ac4a43c1 MW	257	/* We need to divide the value by t (to compensate for the one-bit shift
	258	* resulting from GCM's backwards bit ordering) and store the value in a
	259	* way which is convenient for the assembler code to read back. That
	260	* involves reordering the words, and, in the case of 96-bit blocks,
	261	* padding with zeroes to fill out a 128-bit chunk.
9e6a4409 MW	262	*/
9e6a4409 MW	263
ac4a43c1 MW	264	if (!(p->f&GCMF_SWAP)) divt(p, k_over_t, k);
	265	else {
	266	for (i = 0; i < n; i++) k_over_t[i] = ENDSWAP32(k[i]);
	267	divt(p, k_over_t, k_over_t);
	268	}
	269
9e6a4409 MW	270	if (n == 3) nz = 1;
9e6a4409 MW	271	else nz = 0;
ac4a43c1	272	k = k_over_t; t = ktab + n + nz; while (n--) --t = k++;
9e6a4409 MW	273	while (nz--) *--t = 0;
	274	}
	275	#endif
	276
	277	#if CPUFAM_ARMEL
	278	static void arm_crypto_mktable(const gcm_params *p,
	279	uint32 ktab, const uint32 k)
	280	{
ac4a43c1 MW	281	unsigned i, n = p->n;
ac4a43c1 MW	282	uint32 k_over_t[GCM_NMAX], *t;
9e6a4409	283
ac4a43c1 MW	284	/* We need to divide the value by t (to compensate for the one-bit shift
	285	* resulting from GCM's backwards bit ordering) and store the value in a
	286	* way which is convenient for the assembler code to read back. That
	287	* involves swapping the bytes in each 64-bit lane.
9e6a4409 MW	288	*/
9e6a4409 MW	289
ac4a43c1 MW	290	if (!(p->f&GCMF_SWAP)) divt(p, k_over_t, k);
	291	else {
	292	for (i = 0; i < n; i++) k_over_t[i] = ENDSWAP32(k[i]);
	293	divt(p, k_over_t, k_over_t);
	294	}
	295
	296	t = ktab; k = k_over_t;
	297	while (n >= 2) {
	298	t[1] = k[0]; t[0] = k[1];
	299	t += 2; k += 2; n -= 2;
9e6a4409	300	}
ac4a43c1	301	if (n) { t[1] = k[0]; t[0] = 0; }
9e6a4409 MW	302	}
	303	#endif
	304
	305	#if CPUFAM_ARM64
	306	static uint32 rbit32(uint32 x)
	307	{
	308	uint32 z, t;
	309
	310	#if GCC_VERSION_P(4, 3)
	311	/* Two tricks here. Firstly, two separate steps, rather than a single
	312	* block of assembler, to allow finer-grained instruction scheduling.
	313	* Secondly, use `ENDSWAP32' so that the compiler can cancel it if the
	314	* caller actually wants the bytes reordered.
	315	*/
	316	__asm__("rbit %w0, %w1" : "=r"(t) : "r"(x));
	317	z = ENDSWAP32(t);
	318	#else
	319	/* A generic but slightly clever implementation. */
	320	# define SWIZZLE(x, m, s) ((((x)&(m)) << (s)) \| (((x)&~(m)) >> (s)))
	321	/* 76543210 */
	322	t = SWIZZLE(x, 0x0f0f0f0f, 4); /* 32107654 -- swap nibbles */
	323	t = SWIZZLE(t, 0x33333333, 2); /* 10325476 -- swap bit pairs */
	324	z = SWIZZLE(t, 0x55555555, 1); /* 01234567 -- swap adjacent bits */
	325	# undef SWIZZLE
	326	#endif
	327	return (z);
	328	}
	329
	330	static void arm64_pmull_mktable(const gcm_params *p,
	331	uint32 ktab, const uint32 k)
	332	{
	333	unsigned n = p->n;
	334	uint32 *t;
	335
	336	/* We just need to store the value in a way which is convenient for the
	337	* assembler code to read back. That involves two transformations:
	338	*
	339	* * firstly, reversing the order of the bits in each byte; and,
	340	*
	341	* * secondly, storing two copies of each 64-bit chunk.
	342	*
	343	* Note that, in this case, we /want/ the little-endian byte order of GCM,
	344	* so endianness-swapping happens in the big-endian case.
	345	*/
	346
	347	t = ktab;
	348	if (p->f&GCMF_SWAP) {
	349	while (n >= 2) {
	350	t[0] = t[2] = rbit32(k[0]);
	351	t[1] = t[3] = rbit32(k[1]);
	352	t += 4; k += 2; n -= 2;
	353	}
	354	if (n) { t[0] = t[2] = rbit32(k[0]); t[1] = t[3] = 0; }
	355	} else {
	356	while (n >= 2) {
	357	t[0] = t[2] = ENDSWAP32(rbit32(k[0]));
	358	t[1] = t[3] = ENDSWAP32(rbit32(k[1]));
	359	t += 4; k += 2; n -= 2;
	360	}
	361	if (n) { t[0] = t[2] = ENDSWAP32(rbit32(k[0])); t[1] = t[3] = 0; }
	362	}
	363	}
	364	#endif
	365
1d30a9b9 MW	366	CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable,
	367	(const gcm_params p, uint32 ktab, const uint32 *k),
	368	(p, ktab, k),
	369	pick_mktable, simple_mktable)
	370
	371	static gcm_mktable__functype *pick_mktable(void)
	372	{
9e6a4409 MW	373	#if CPUFAM_X86 \|\| CPUFAM_AMD64
	374	DISPATCH_PICK_COND(gcm_mktable, pclmul_mktable,
	375	cpu_feature_p(CPUFEAT_X86_SSSE3) &&
	376	cpu_feature_p(CPUFEAT_X86_PCLMUL));
	377	#endif
	378	#if CPUFAM_ARMEL
	379	DISPATCH_PICK_COND(gcm_mktable, arm_crypto_mktable,
	380	cpu_feature_p(CPUFEAT_ARM_PMULL));
	381	#endif
	382	#if CPUFAM_ARM64
	383	DISPATCH_PICK_COND(gcm_mktable, arm64_pmull_mktable,
	384	cpu_feature_p(CPUFEAT_ARM_PMULL));
	385	#endif
1d30a9b9 MW	386	DISPATCH_PICK_FALLBACK(gcm_mktable, simple_mktable);
	387	}
	388
	389	/* --- @recover_k@ --- *
	390	*
	391	* Arguments: @const gcm_params *p@ = pointer to the parameters
	392	* @uint32 *k@ = block-sized vector in which to store %$k$%
	393	* @const uint32 *ktab@ = the table encoding %$k$%
	394	*
	395	* Returns: ---
	396	*
	397	* Use: Recovers %$k$%, the secret from which @ktab@ was by
	398	* @gcm_mktable@, from the table, and stores it in internal
	399	* (big-endian) form in @k@.
	400	*/
	401
	402	static void simple_recover_k(const gcm_params *p,
	403	uint32 k, const uint32 ktab)
	404	{
	405	unsigned i;
	406
	407	/* If the blockcipher is big-endian, then the key is simply in the first
	408	* table element, in the right format. If the blockcipher is little-endian
	409	* then it's in element 24, and the bytes need swapping.
	410	*/
	411
	412	if (!(p->f&GCMF_SWAP)) for (i = 0; i < p->n; i++) k[i] = ktab[i];
	413	else for (i = 0; i < p->n; i++) k[i] = ENDSWAP32(ktab[24*p->n + i]);
	414	}
	415
9e6a4409 MW	416	#if CPUFAM_X86 \|\| CPUFAM_AMD64
	417	static void pclmul_recover_k(const gcm_params *p,
	418	uint32 k, const uint32 ktab)
	419	{
	420	unsigned n = p->n;
	421	unsigned nz;
	422	const uint32 *t;
	423
	424	/* The representation is already independent of the blockcipher endianness.
ac4a43c1 MW	425	* We need to compensate for padding, reorder the words, and multiply by t
ac4a43c1 MW	426	* to compensate for the factor of t we divided out earlier.
9e6a4409 MW	427	*/
	428
	429	if (n == 3) nz = 1; else nz = 0;
	430	t = ktab + n + nz;
	431	while (n--) k++ = --t;
ac4a43c1	432	mult(p, k - p->n, k - p->n);
9e6a4409 MW	433	}
	434	#endif
	435
	436	#if CPUFAM_ARMEL
	437	static void arm_crypto_recover_k(const gcm_params *p,
	438	uint32 k, const uint32 ktab)
	439	{
	440	unsigned n = p->n;
	441	const uint32 *t;
	442
	443	/* The representation is already independent of the blockcipher endianness.
ac4a43c1 MW	444	* We only need to reorder the words, and multiply by t to compensate for
ac4a43c1 MW	445	* the factor of t we divided out earlier.
9e6a4409 MW	446	*/
	447
	448	t = ktab;
	449	while (n >= 2) { k[1] = t[0]; k[0] = t[1]; t += 2; k += 2; n -= 2; }
ac4a43c1 MW	450	if (n) { k[0] = t[1]; k++; n--; }
ac4a43c1 MW	451	mult(p, k - p->n, k - p->n);
9e6a4409 MW	452	}
	453	#endif
	454
	455	#if CPUFAM_ARM64
	456	static void arm64_pmull_recover_k(const gcm_params *p,
	457	uint32 k, const uint32 ktab)
	458	{
	459	unsigned n = p->n;
	460	const uint32 *t;
	461
	462	/* The representation is already independent of the blockcipher endianness.
	463	* We need to skip the duplicate pieces, and unscramble the bytes.
	464	*/
	465
	466	t = ktab;
	467	while (n >= 2) {
	468	k[0] = ENDSWAP32(rbit32(t[0]));
	469	k[1] = ENDSWAP32(rbit32(t[1]));
	470	t += 4; k += 2; n -= 2;
	471	}
	472	if (n) k[0] = ENDSWAP32(rbit32(t[0]));
	473	}
	474	#endif
	475
1d30a9b9 MW	476	CPU_DISPATCH(static, EMPTY, void, recover_k,
	477	(const gcm_params p, uint32 k, const uint32 *ktab),
	478	(p, k, ktab),
	479	pick_recover_k, simple_recover_k)
	480
	481	static recover_k__functype *pick_recover_k(void)
9e6a4409 MW	482	{
	483	#if CPUFAM_X86 \|\| CPUFAM_AMD64
	484	DISPATCH_PICK_COND(recover_k, pclmul_recover_k,
	485	cpu_feature_p(CPUFEAT_X86_SSSE3) &&
	486	cpu_feature_p(CPUFEAT_X86_PCLMUL));
	487	#endif
	488	#if CPUFAM_ARMEL
	489	DISPATCH_PICK_COND(recover_k, arm_crypto_recover_k,
	490	cpu_feature_p(CPUFEAT_ARM_PMULL));
	491	#endif
	492	#if CPUFAM_ARM64
	493	DISPATCH_PICK_COND(recover_k, arm64_pmull_recover_k,
	494	cpu_feature_p(CPUFEAT_ARM_PMULL));
	495	#endif
	496	DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k);
	497	}
1d30a9b9 MW	498
1d30a9b9 MW	499	/* --- @gcm_mulk_N{b,l}@ --- *
50df5733 MW	500	*
	501	* Arguments: @uint32 *a@ = accumulator to multiply
	502	* @const uint32 *ktab@ = table constructed by @gcm_mktable@
	503	*
	504	* Returns: ---
	505	*
	506	* Use: Multiply @a@ by @k@ (implicitly represented in @ktab@),
	507	* updating @a@ in-place. There are separate functions for each
1d30a9b9 MW	508	* supported block size and endianness because this is the
1d30a9b9 MW	509	* function whose performance actually matters.
50df5733 MW	510	*/
50df5733 MW	511
9e6a4409 MW	512	#if CPUFAM_X86 \|\| CPUFAM_AMD64
	513	# define DECL_MULK_X86ISH(var) extern gcm_mulk_##var##__functype \
	514	gcm_mulk_##var##_x86ish_pclmul_avx, \
	515	gcm_mulk_##var##_x86ish_pclmul;
	516	# define PICK_MULK_X86ISH(var) do { \
	517	DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul_avx, \
	518	cpu_feature_p(CPUFEAT_X86_AVX) && \
	519	cpu_feature_p(CPUFEAT_X86_PCLMUL) && \
	520	cpu_feature_p(CPUFEAT_X86_SSSE3)); \
	521	DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul, \
	522	cpu_feature_p(CPUFEAT_X86_PCLMUL) && \
	523	cpu_feature_p(CPUFEAT_X86_SSSE3)); \
	524	} while (0)
	525	#else
	526	# define DECL_MULK_X86ISH(var)
	527	# define PICK_MULK_X86ISH(var) do ; while (0)
	528	#endif
	529
	530	#if CPUFAM_ARMEL
	531	# define DECL_MULK_ARM(var) \
	532	extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm_crypto;
	533	# define PICK_MULK_ARM(var) do { \
	534	DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm_crypto, \
	535	cpu_feature_p(CPUFEAT_ARM_PMULL)); \
	536	} while (0)
	537	#else
	538	# define DECL_MULK_ARM(var)
	539	# define PICK_MULK_ARM(var) do ; while (0)
	540	#endif
	541
	542	#if CPUFAM_ARM64
	543	# define DECL_MULK_ARM64(var) \
	544	extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm64_pmull;
	545	# define PICK_MULK_ARM64(var) do { \
	546	DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm64_pmull, \
	547	cpu_feature_p(CPUFEAT_ARM_PMULL)); \
	548	} while (0)
	549	#else
	550	# define DECL_MULK_ARM64(var)
	551	# define PICK_MULK_ARM64(var) do ; while (0)
	552	#endif
	553
50df5733	554	#define DEF_MULK(nbits) \
1d30a9b9 MW	555	\
	556	CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##b, \
	557	(uint32 a, const uint32 ktab), (a, ktab), \
	558	pick_mulk_##nbits##b, simple_mulk_##nbits) \
	559	CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##l, \
	560	(uint32 a, const uint32 ktab), (a, ktab), \
	561	pick_mulk_##nbits##l, simple_mulk_##nbits) \
	562	\
	563	static void simple_mulk_##nbits(uint32 a, const uint32 ktab) \
50df5733 MW	564	{ \
	565	uint32 m, t; \
	566	uint32 z[nbits/32]; \
	567	unsigned i, j, k; \
	568	\
	569	for (i = 0; i < nbits/32; i++) z[i] = 0; \
	570	\
	571	for (i = 0; i < nbits/32; i++) { \
	572	t = a[i]; \
	573	for (j = 0; j < 32; j++) { \
	574	m = -((t >> 31)&1u); \
	575	for (k = 0; k < nbits/32; k++) z[k] ^= *ktab++&m; \
	576	t <<= 1; \
	577	} \
	578	} \
	579	\
	580	for (i = 0; i < nbits/32; i++) a[i] = z[i]; \
1d30a9b9 MW	581	} \
1d30a9b9 MW	582	\
9e6a4409 MW	583	DECL_MULK_X86ISH(nbits##b) \
	584	DECL_MULK_ARM(nbits##b) \
	585	DECL_MULK_ARM64(nbits##b) \
1d30a9b9	586	static gcm_mulk_##nbits##b##__functype *pick_mulk_##nbits##b(void) \
9e6a4409 MW	587	{ \
	588	PICK_MULK_X86ISH(nbits##b); \
	589	PICK_MULK_ARM(nbits##b); \
	590	PICK_MULK_ARM64(nbits##b); \
	591	DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits); \
	592	} \
	593	\
	594	DECL_MULK_X86ISH(nbits##l) \
	595	DECL_MULK_ARM(nbits##l) \
	596	DECL_MULK_ARM64(nbits##l) \
1d30a9b9	597	static gcm_mulk_##nbits##l##__functype *pick_mulk_##nbits##l(void) \
9e6a4409 MW	598	{ \
	599	PICK_MULK_X86ISH(nbits##l); \
	600	PICK_MULK_ARM(nbits##l); \
	601	PICK_MULK_ARM64(nbits##l); \
	602	DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits); \
	603	}
1d30a9b9	604
50df5733 MW	605	GCM_WIDTHS(DEF_MULK)
50df5733 MW	606
8f6a5276	607	#define GCM_MULK_CASE(nbits) \
1d30a9b9 MW	608	case nbits/32: \
	609	if (_f&GCMF_SWAP) gcm_mulk_##nbits##l(_a, _ktab); \
	610	else gcm_mulk_##nbits##b(_a, _ktab); \
	611	break;
	612	#define MULK(n, f, a, ktab) do { \
8f6a5276	613	uint32 _a = (a); const uint32 _ktab = (ktab); \
1d30a9b9	614	unsigned _f = (f); \
8f6a5276 MW	615	switch (n) { \
	616	GCM_WIDTHS(GCM_MULK_CASE) \
	617	default: abort(); \
	618	} \
	619	} while (0)
	620
50df5733 MW	621	/----- Other utilities ---------------------------------------------------/
	622
	623	/* --- @putlen@ --- *
	624	*
	625	* Arguments: @octet *p@ = pointer to output buffer
	626	* @unsigned w@ = size of output buffer
	627	* @unsigned blksz@ = block size (assumed fairly small)
	628	* @unsigned long nblocks@ = number of blocks
	629	* @unsigned nbytes@ = tail size in bytes (assumed small)
	630	*
	631	* Returns: ---
	632	*
	633	* Use: Store the overall length in %$\emph{bits}$% (i.e.,
	634	* @3(nblocksblksz + nbytes)@ in big-endian form in the
	635	* buffer @p@.
	636	*/
	637
	638	static void putlen(octet *p, unsigned w, unsigned blksz,
	639	unsigned long nblocks, unsigned nbytes)
	640	{
	641	unsigned long nblo = nblocks&((1ul << (ULONG_BITS/2)) - 1),
	642	nbhi = nblocks >> ULONG_BITS/2;
	643	unsigned long nlo = nbloblksz + nbytes, nhi = nbhiblksz;
	644
	645	/* This is fiddly. Split @nblocks@, which is the big number, into high and
	646	* low halves, multiply those separately by @blksz@, propagate carries, and
	647	* then multiply by eight.
	648	*/
	649	nhi += nlo >> ULONG_BITS/2;
	650	nlo &= (1ul << (ULONG_BITS/2)) - 1;
	651	nlo <<= 3;
	652
	653	/* Now write out the size, feeding bits in from @nhi@ as necessary. */
	654	p += w;
	655	while (w--) {
	656	*--p = U8(nlo);
	657	nlo = (nlo >> 8) \| ((nhi&0xff) << (ULONG_BITS/2 - 5));
	658	nhi >>= 8;
	659	}
	660	}
	661
	662	/* --- @mix@ --- *
	663	*
	664	* Arguments: @const gcm_params *p@ = pointer to the parameters
	665	* @uint32 *a@ = GHASH accumulator
	666	* @const octet *q@ = pointer to an input block
	667	* @const uint32 *ktab@ = multiplication table, built by
	668	* @gcm_mktable@
	669	*
	670	* Returns: ---
	671	*
	672	* Use: Fold the block @q@ into the GHASH accumulator. The
	673	* calculation is %$a' = k (a + q)$%.
	674	*/
	675
	676	static void mix(const gcm_params p, uint32 a,
	677	const octet q, const uint32 ktab)
	678	{
	679	unsigned i;
	680
50df5733 MW	681	if (p->f&GCMF_SWAP)
	682	for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_L(q); q += 4; }
	683	else
	684	for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_B(q); q += 4; }
1d30a9b9	685	MULK(p->n, p->f, a, ktab);
50df5733 MW	686	}
	687
	688	/* --- @gcm_ghashdone@ --- *
	689	*
	690	* Arguments: @const gcm_params *p@ = pointer to the parameters
	691	* @uint32 *a@ = GHASH accumulator
	692	* @const uint32 *ktab@ = multiplication table, built by
	693	* @gcm_mktable@
	694	* @unsigned long xblocks, yblocks@ = number of whole blocks in
	695	* the two inputs
	696	* @unsigned xbytes, ybytes@ = number of trailing bytes in the
	697	* two inputs
	698	*
	699	* Returns: ---
	700	*
	701	* Use: Finishes a GHASH operation by appending the appropriately
	702	* encoded lengths of the two constituent messages.
	703	*/
	704
	705	void gcm_ghashdone(const gcm_params p, uint32 a, const uint32 *ktab,
	706	unsigned long xblocks, unsigned xbytes,
	707	unsigned long yblocks, unsigned ybytes)
	708	{
	709	octet b[4*GCM_NMAX];
	710	unsigned w = p->n < 3 ? 4p->n : 2p->n;
	711
	712	/* Construct the encoded lengths. Note that smaller-block versions of GCM
	713	* encode the lengths in separate blocks. GCM is only officially defined
	714	* for 64- and 128-bit blocks; I've placed the cutoff somewhat arbitrarily
	715	* at 96 bits.
	716	*/
	717	putlen(b, w, 4*p->n, xblocks, xbytes);
	718	putlen(b + w, w, 4*p->n, yblocks, ybytes);
	719
	720	/* Feed the lengths into the accumulator. */
	721	mix(p, a, b, ktab);
	722	if (p->n < 3) mix(p, a, b + w, ktab);
	723	}
	724
	725	/* --- @gcm_concat@ --- *
	726	*
	727	* Arguments: @const gcm_params *p@ = pointer to the parameters
	728	* @uint32 *z@ = GHASH accumulator for suffix, updated
	729	* @const uint32 *x@ = GHASH accumulator for prefix
	730	* @const uint32 *ktab@ = multiplication table, built by
	731	* @gcm_mktable@
	732	* @unsigned long n@ = length of suffix in whole blocks
	733	*
	734	* Returns: ---
	735	*
	736	* Use: On entry, @x@ and @z@ are the results of hashing two strings
	737	* %$a$% and %$b$%, each a whole number of blocks long; in
	738	* particular, %$b$% is @n@ blocks long. On exit, @z@ is
	739	* updated to be the hash of %$a \cat b$%.
	740	*/
	741
	742	void gcm_concat(const gcm_params p, uint32 z, const uint32 *x,
	743	const uint32 *ktab, unsigned long n)
	744	{
	745	uint32 t[GCM_NMAX], u[GCM_NMAX];
	746	unsigned i, j;
	747
	748	if (!n) {
	749	/* If @n@ is zero, then there's not much to do. The mathematics
750	* (explained below) still works, but the code takes a shortcut which
751	* doesn't handle this case: so set %$z' = z + x k^n = z + x$%.
752	*/
753
754	for (j = 0; j < p->n; j++) z[j] ^= x[j];
755	} else {
756	/* We have %$x = a_0 t^m + \cdots + a_{m-2} t^2 + a_{m-1} t$% and
757	* %$z = b_0 t^n + \cdots + b_{n-2} t^2 + b_{n-1} t$%. What we'd like is
758	* the hash of %$a \cat b$%, which is %$z + x k^n$%.
759	*
760	* The first job, then, is to calculate %$k^n$%, and for this we use a
761	* simple left-to-right square-and-multiply algorithm. There's no need
762	* to keep %$n$% secret here.
763	*/
764
765	/* Start by retrieving %$k$% from the table, and convert it to big-endian
766	* form.
767	*/
1d30a9b9	768	recover_k(p, u, ktab);
50df5733 MW	769
	770	/* Now calculate %$k^n$%. */
	771	i = ULONG_BITS;
	772	#define BIT (1ul << (ULONG_BITS - 1))
	773	while (!(n&BIT)) { n <<= 1; i--; }
	774	n <<= 1; i--; for (j = 0; j < p->n; j++) t[j] = u[j];
	775	while (i--) { mul(p, t, t, t); if (n&BIT) mul(p, t, t, u); n <<= 1; }
	776	#undef BIT
	777
	778	/* Next, calculate %$x k^n$%. If we're using a little-endian convention
	779	* then we must convert %$x$%; otherwise we can just use it in place.
	780	*/
	781	if (!(p->f&GCMF_SWAP))
	782	mul(p, t, t, x);
	783	else {
	784	for (j = 0; j < p->n; j++) u[j] = ENDSWAP32(x[j]);
	785	mul(p, t, t, u);
	786	}
	787
	788	/* Finally, add %$x k^n$% onto %$z$%, converting back to little-endian if
	789	* necessary.
	790	*/
	791	if (!(p->f&GCMF_SWAP)) for (j = 0; j < p->n; j++) z[j] ^= t[j];
	792	else for (j = 0; j < p->n; j++) z[j] ^= ENDSWAP32(t[j]);
	793	}
	794	}
	795
8f6a5276 MW	796	/----- Test rig ----------------------------------------------------------/
	797
	798	#ifdef TEST_RIG
	799
141c1284	800	#include <mLib/macros.h>
8f6a5276 MW	801	#include <mLib/quis.h>
	802	#include <mLib/testrig.h>
	803
416b8869 MW	804	#ifdef ENABLE_ASM_DEBUG
	805	# include "regdump.h"
	806	#endif
	807
8f6a5276	808	static void report_failure(const char *test, unsigned nbits,
1d30a9b9	809	const char ref, dstr v[], dstr d)
8f6a5276 MW	810	{
	811	printf("test %s failed (nbits = %u)", test, nbits);
	812	printf("\n\tx = "); type_hex.dump(&v[0], stdout);
	813	printf("\n\ty = "); type_hex.dump(&v[1], stdout);
	814	printf("\n\tz = "); type_hex.dump(&v[2], stdout);
1d30a9b9	815	printf("\n\t%s' = ", ref); type_hex.dump(d, stdout);
8f6a5276 MW	816	putchar('\n');
	817	}
	818
1d30a9b9 MW	819	static void mulk(unsigned nbits, unsigned f, uint32 x, const uint32 ktab)
1d30a9b9 MW	820	{ MULK(nbits/32, f, x, ktab); }
8f6a5276 MW	821
	822	static int test_mul(uint32 poly, dstr v[])
	823	{
	824	uint32 x[GCM_NMAX], y[GCM_NMAX], z[GCM_NMAX], ktab[32GCM_NMAXGCM_NMAX];
	825	gcm_params p;
	826	dstr d = DSTR_INIT;
	827	unsigned i, nbits;
	828	int ok = 1;
1d30a9b9	829	enum { I_x, I_y, I_z };
8f6a5276 MW	830
	831	nbits = 8*v[0].len; p.f = 0; p.n = nbits/32; p.poly = poly;
	832	dstr_ensure(&d, nbits/8); d.len = nbits/8;
	833
1d30a9b9 MW	834	#define LOADXY(E) do { \
	835	for (i = 0; i < nbits/32; i++) { \
	836	x[i] = LOAD32_##E(v[I_x].buf + 4*i); \
	837	y[i] = LOAD32_##E(v[I_y].buf + 4*i); \
	838	} \
	839	} while (0)
	840
	841	#define INITZ(x) do { \
	842	for (i = 0; i < nbits/32; i++) z[i] = (x)[i]; \
	843	} while (0)
	844
	845	#define CHECK(E, what, ref) do { \
	846	for (i = 0; i < nbits/32; i++) STORE32_##E(d.buf + 4*i, z[i]); \
141c1284	847	if (MEMCMP(d.buf, !=, v[I_##ref].buf, nbits/8)) \
1d30a9b9 MW	848	{ ok = 0; report_failure(what, nbits, #ref, v, &d); } \
	849	} while (0)
	850
	851	#define TEST_PREP_1(E, x, y, what) do { \
	852	gcm_mktable(&p, ktab, y); \
	853	recover_k(&p, z, ktab); CHECK(B, "mktable/recover_k (" #y ")", y); \
	854	INITZ(x); mulk(nbits, p.f, z, ktab); CHECK(E, what " (k = " #y ")", z); \
	855	} while (0)
	856
	857	#define TEST_PREP(E, what) do { \
	858	TEST_PREP_1(E, x, y, what); \
	859	TEST_PREP_1(E, y, x, what); \
	860	} while (0)
	861
8f6a5276	862	/* First, test plain multiply. */
1d30a9b9	863	LOADXY(B); mul(&p, z, x, y); CHECK(B, "gcm_mul", z);
8f6a5276 MW	864
8f6a5276 MW	865	/* Next, test big-endian prepared key. */
1d30a9b9	866	LOADXY(B); TEST_PREP(B, "gcm_kmul_b");
8f6a5276 MW	867
8f6a5276 MW	868	/* Finally, test little-endian prepared key. */
1d30a9b9 MW	869	p.f = GCMF_SWAP; LOADXY(L);
	870	TEST_PREP(L, "gcm_kmul_l");
	871
	872	#undef LOADXY
	873	#undef INITZ
	874	#undef CHECK
	875	#undef TEST_PREP_1
	876	#undef TEST_PREP
8f6a5276 MW	877
	878	/* All done. */
	879	return (ok);
	880	}
	881
	882	#define TEST(nbits) \
	883	static int test_mul_##nbits(dstr v[]) \
	884	{ return (test_mul(GCM_POLY_##nbits, v)); }
	885	GCM_WIDTHS(TEST)
	886	#undef TEST
	887
	888	static test_chunk defs[] = {
	889	#define TEST(nbits) \
	890	{ "gcm-mul" #nbits, test_mul_##nbits, \
	891	{ &type_hex, &type_hex, &type_hex, 0 } },
	892	GCM_WIDTHS(TEST)
	893	#undef TEST
	894	{ 0, 0, { 0 } }
	895	};
	896
	897	int main(int argc, char *argv[])
	898	{
	899	ego(argv[0]);
416b8869 MW	900	#ifdef ENABLE_ASM_DEBUG
	901	regdump_init();
	902	#endif
8f6a5276 MW	903	test_run(argc, argv, defs, SRCDIR"/t/gcm");
	904	return (0);
	905	}
	906
	907	#endif
	908
50df5733	909	/----- That's all, folks -------------------------------------------------/