mdw@git.distorted.org.uk Git - catacomb/blame_incremental

... / ...

Commit	Line	Data
	1	/// -- mode: asm; asm-comment-char: ?/ --
	2	///
	3	/// GCM acceleration for x86 processors
	4	///
	5	/// (c) 2018 Straylight/Edgeware
	6	///
	7
	8	///----- Licensing notice ---------------------------------------------------
	9	///
	10	/// This file is part of Catacomb.
	11	///
	12	/// Catacomb is free software: you can redistribute it and/or modify it
	13	/// under the terms of the GNU Library General Public License as published
	14	/// by the Free Software Foundation; either version 2 of the License, or
	15	/// (at your option) any later version.
	16	///
	17	/// Catacomb is distributed in the hope that it will be useful, but
	18	/// WITHOUT ANY WARRANTY; without even the implied warranty of
	19	/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	20	/// Library General Public License for more details.
	21	///
	22	/// You should have received a copy of the GNU Library General Public
	23	/// License along with Catacomb. If not, write to the Free Software
	24	/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
	25	/// USA.
	26
	27	///--------------------------------------------------------------------------
	28	/// Preliminaries.
	29
	30	#include "config.h"
	31	#include "asm-common.h"
	32
	33	.arch .pclmul
	34
	35	.text
	36
	37	///--------------------------------------------------------------------------
	38	/// Common register allocation.
	39
	40	#if CPUFAM_X86
	41	# define A eax
	42	# define K edx
	43	#elif CPUFAM_AMD64 && ABI_SYSV
	44	# define A rdi
	45	# define K rsi
	46	#elif CPUFAM_AMD64 && ABI_WIN
	47	# define A rcx
	48	# define K rdx
	49	#endif
	50
	51	///--------------------------------------------------------------------------
	52	/// Multiplication macros.
	53
	54	// The good news is that we have a fancy instruction to do the
	55	// multiplications. The bad news is that it's not particularly well-
	56	// suited to the job.
	57	//
	58	// For one thing, it only does a 64-bit multiplication, so in general
	59	// we'll need to synthesize the full-width multiply by hand. For
	60	// another thing, it doesn't help with the reduction, so we have to
	61	// do that by hand too. And, finally, GCM has crazy bit ordering,
	62	// and the instruction does nothing useful for that at all.
	63	//
	64	// Focusing on that last problem first: the bits aren't in monotonic
	65	// significance order unless we permute them. If we reverse the byte
	66	// order, then we'll have the bits in monotonic order, but backwards,
	67	// so the degree-0 coefficient will be in the most-significant bit.
	68	//
	69	// This is less of a difficulty than it seems at first, because
	70	// algebra. Suppose we are given u = SUM_{0<=i<n} u_i t^i and v =
	71	// SUM_{0<=j<n} v_j t^j; then
	72	//
	73	// u v = SUM_{0<=i,j<n} u_i v_j t^{i+j}
	74	//
	75	// Suppose instead that we're given ũ = SUM_{0<=i<n} u_{n-i-1} t^i
	76	// and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards.
	77	// Then
	78	//
	79	// ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j}
	80	// = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)}
	81	//
	82	// which is almost the bit-reversal of u v, only it's shifted right
	83	// by one place. Putting this another way, what we have is actually
	84	// the bit reversal of the product u v t. We could get the correct
	85	// answer (modulo p(t)) if we'd sneakily divided one of the operands
	86	// by t before we started. Conveniently, v is actually the secret
	87	// value k set up by the GCM `mktable' function, so we can arrange to
	88	// actually store k/t (mod p(t)) and then the product will come out
	89	// correct (modulo p(t)) and we won't have anything more to worry
	90	// about here.
	91	//
	92	// That was important to think about, but there's not a great deal to
	93	// do about it yet other than to convert what we've got from the
	94	// blockcipher's byte-ordering convention to our big-endian
	95	// convention. Since this depends on the blockcipher convention,
	96	// we'll leave the caller to cope with this: the macros here will
	97	// assume that the operands are in `register' format, which is the
	98	// byte-reversal of the external representation, padded at the
	99	// most-significant end except for 96-bit blocks, which are
	100	// zero-padded at the least-significant end (see `mul96' for the
	101	// details). In the commentary, pieces of polynomial are numbered
	102	// according to the degree of the coefficients, so the unit
	103	// coefficient of some polynomial a is in a_0.
	104	//
	105	// The commentary for `mul128' is the most detailed. The other
	106	// macros assume that you've already read and understood that.
	107
	108	.macro mul128
	109	// Enter with u and v in xmm0 and xmm1 respectively; leave with z =
	110	// u v in xmm0. Clobbers xmm1--xmm4.
	111
	112	// First for the double-precision multiplication. It's tempting to
	113	// use Karatsuba's identity here, but I suspect that loses more in
	114	// the shifting, bit-twiddling, and dependency chains that it gains
	115	// in saving a multiplication which otherwise pipelines well.
	116	// xmm0 = // (u_0; u_1)
	117	// xmm1 = // (v_0; v_1)
	118	movdqa xmm2, xmm1 // (v_0; v_1) again
	119	movdqa xmm3, xmm0 // (u_0; u_1) again
	120	movdqa xmm4, xmm0 // (u_0; u_1) yet again
	121	pclmulhqlqdq xmm2, xmm0 // u_1 v_0
	122	pclmullqlqdq xmm0, xmm1 // u_1 v_1
	123	pclmulhqlqdq xmm3, xmm1 // u_0 v_1
	124	pclmulhqhqdq xmm4, xmm1 // u_0 v_0
	125
	126	// Arrange the pieces to form a double-precision polynomial.
	127	pxor xmm2, xmm3 // (m_0; m_1) = u_1 v_0 + u_0 v_1
	128	movdqa xmm1, xmm2 // (m_0; m_1) again
	129	pslldq xmm2, 8 // (m_1; 0)
	130	psrldq xmm1, 8 // (0; m_0)
	131	pxor xmm0, xmm2 // z_1 = u_1 v_1 + m_1
	132	pxor xmm1, xmm4 // z_0 = u_0 v_0 + t^64 m_0
	133
	134	// The remaining problem is that the result needs to be reduced
	135	// modulo p(t) = t^128 + t^7 + t^2 + t + 1. Let R = t^128 = t^7 +
	136	// t^2 + t + 1 in our field. So far, we've calculated z_0 and z_1
	137	// such that z_0 + z_1 R = u v using the identity R = t^128: now we
	138	// must collapse the two halves of z together using the other
	139	// identity R = t^7 + t^2 + t + 1.
	140	//
	141	// We do this by working on each 32-bit word of the high half of z
	142	// separately, so consider x_i, for some 4 <= i < 8. Certainly, x_i
	143	// t^{32i} = x_i R t^{32(i-4)} = (t^7 + t^2 + t + 1) x_i t^{32(i-4)},
	144	// but we can't use that directly without breaking up the 32-bit word
	145	// structure. Instead, we start by considering just x_i t^7
	146	// t^{32(i-4)}, which again looks tricky. Now, split x_i = a_i +
	147	// t^25 b_i, with deg a_i < 25; then
	148	//
	149	// x_i t^7 t^{32(i-4)} = a_i t^7 t^{32(i-4)} + b_i t^{32(i-3)}
	150	//
	151	// We can similarly decompose x_i t^2 and x_i t into a pair of 32-bit
	152	// contributions to the t^{32(i-4)} and t^{32(i-3)} words, but the
	153	// splits are different. This is lovely, with one small snag: when
	154	// we do this to x_7, we end up with a contribution back into the
	155	// t^128 coefficient word. But notice that only the low seven bits
	156	// of this word are affected, so there's no knock-on contribution
	157	// into the t^32 word. Therefore, if we handle the high bits of each
	158	// word together, and then the low bits, everything will be fine.
	159
	160	// First, shift the high bits down.
	161	movdqa xmm2, xmm0 // (x_4, x_5; x_6, x_7) again
	162	movdqa xmm3, xmm0 // (x_4, x_5; x_6, x_7) yet again
	163	movdqa xmm4, xmm0 // (x_4, x_5; x_6, x_7) again again
	164	pslld xmm2, 31 // the b_i for t
	165	pslld xmm3, 30 // the b_i for t^2
	166	pslld xmm4, 25 // the b_i for t^7
	167	pxor xmm2, xmm3 // add them all together
	168	pxor xmm2, xmm4
	169	movdqa xmm3, xmm2 // and a copy for later
	170	psrldq xmm2, 4 // contribution into low half
	171	pslldq xmm3, 12 // and high half
	172	pxor xmm1, xmm2
	173	pxor xmm0, xmm3
	174
	175	// And then shift the low bits up.
	176	movdqa xmm2, xmm0
	177	movdqa xmm3, xmm0
	178	pxor xmm1, xmm0 // mix in the unit contribution
	179	psrld xmm0, 1
	180	psrld xmm2, 2
	181	psrld xmm3, 7
	182	pxor xmm1, xmm2 // low half, unit, and t^2 contribs
	183	pxor xmm0, xmm3 // t and t^7 contribs
	184	pxor xmm0, xmm1 // mix them together and we're done
	185	.endm
	186
	187	.macro mul64
	188	// Enter with u and v in the low halves of xmm0 and xmm1
	189	// respectively; leave with z = u v in xmm0. Clobbers xmm1--xmm4.
	190
	191	// The multiplication is thankfully easy.
	192	pclmullqlqdq xmm1, xmm0 // u v
	193
	194	// Now we must reduce. This is essentially the same as the 128-bit
	195	// case above, but mostly simpler because everything is smaller. The
	196	// polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.
	197
	198	// First, we must detach the top (`low'!) half of the result.
	199	movdqa xmm0, xmm1 // (x_0, x_1; x_2, x_3) again
	200	psrldq xmm1, 8 // (0, 0; x_0, x_1)
	201
	202	// Next, shift the high bits down.
	203	movdqa xmm2, xmm0 // (?, ?; x_2, x_3) again
	204	movdqa xmm3, xmm0 // (?, ?; x_2, x_3) yet again
	205	movdqa xmm4, xmm0 // (?, ?; x_2, x_3) again again
	206	pslld xmm2, 31 // b_i for t
	207	pslld xmm3, 29 // b_i for t^3
	208	pslld xmm4, 28 // b_i for t^4
	209	pxor xmm2, xmm3 // add them all together
	210	pxor xmm2, xmm4
	211	movdqa xmm3, xmm2 // and a copy for later
	212	movq xmm2, xmm2 // zap high half
	213	pslldq xmm3, 4 // contribution into high half
	214	psrldq xmm2, 4 // and low half
	215	pxor xmm0, xmm3
	216	pxor xmm1, xmm2
	217
	218	// And then shift the low bits up.
	219	movdqa xmm2, xmm0
	220	movdqa xmm3, xmm0
	221	pxor xmm1, xmm0 // mix in the unit contribution
	222	psrld xmm0, 1
	223	psrld xmm2, 3
	224	psrld xmm3, 4
	225	pxor xmm1, xmm2 // low half, unit, and t^3 contribs
	226	pxor xmm0, xmm3 // t and t^4 contribs
	227	pxor xmm0, xmm1 // mix them together and we're done
	228	.endm
	229
	230	.macro mul96
	231	// Enter with u and v in the /high/ three words of xmm0 and xmm1
	232	// respectively (and zero in the low word); leave with z = u v in the
	233	// high three words of xmm0, and /junk/ in the low word. Clobbers
	234	// xmm1--xmm4.
	235
	236	// This is an inconvenient size. There's nothing for it but to do
	237	// four multiplications, as if for the 128-bit case. It's possible
	238	// that there's cruft in the top 32 bits of the input registers, so
	239	// shift both of them up by four bytes before we start. This will
	240	// mean that the high 64 bits of the result (from GCM's viewpoint)
	241	// will be zero.
	242	// xmm0 = // (u_0, u_1; u_2, 0)
	243	// xmm1 = // (v_0, v_1; v_2, 0)
	244	movdqa xmm2, xmm1 // (v_0, v_1; v_2, 0) again
	245	movdqa xmm3, xmm0 // (u_0, u_1; u_2, 0) again
	246	movdqa xmm4, xmm0 // (u_0, u_1; u_2, 0) yet again
	247	pclmulhqlqdq xmm2, xmm0 // u_2 (v_1 t^32 + v_0) = e_0
	248	pclmullqlqdq xmm0, xmm1 // u_2 v_2 = d = (0; d)
	249	pclmulhqlqdq xmm3, xmm1 // v_2 (u_1 t^32 + u_0) = e_1
	250	pclmulhqhqdq xmm4, xmm1 // u_0 v_0 + (u_1 v_0 + u_0 v_1) t^32
	251	// + u_1 v_1 t^64 = f
	252
	253	// Extract the high and low halves of the 192-bit result. We don't
	254	// need be too picky about the unused high words of the result
	255	// registers. The answer we want is d t^128 + e t^64 + f, where e =
	256	// e_0 + e_1.
	257	//
	258	// The place values for the two halves are (?, t^96; t^128, t^160)
	259	// and (1, t^32; t^64, ?). But we also want to shift the high part
	260	// left by a word, for symmetry's sake.
	261	psrldq xmm0, 8 // (0; d) = d t^128
	262	pxor xmm2, xmm3 // e = (e_0 + e_1)
	263	movdqa xmm1, xmm4 // f again
	264	pxor xmm0, xmm2 // d t^128 + e t^64
	265	psrldq xmm2, 12 // e[31..0] t^64
	266	psrldq xmm1, 4 // f[95..0]
	267	pslldq xmm4, 12 // f[127..96], shifted
	268	pslldq xmm0, 4 // shift high 96 bits
	269	pxor xmm1, xmm2 // low 96 bits of result
	270	pxor xmm0, xmm4 // high 96 bits of result
	271
	272	// Finally, the reduction. This is essentially the same as the
	273	// 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
	274	// t^9 + t^6 + 1. The degrees are larger but not enough to cause
	275	// trouble for the general approach.
	276
	277	// First, shift the high bits down.
	278	movdqa xmm2, xmm0 // copies of the high part
	279	movdqa xmm3, xmm0
	280	movdqa xmm4, xmm0
	281	pslld xmm2, 26 // b_i for t^6
	282	pslld xmm3, 23 // b_i for t^9
	283	pslld xmm4, 22 // b_i for t^10
	284	pxor xmm2, xmm3 // add them all together
	285	pslldq xmm1, 4 // shift low part up to match
	286	pxor xmm2, xmm4
	287	movdqa xmm3, xmm2 // and a copy for later
	288	pslldq xmm2, 8 // contribution to high half
	289	psrldq xmm3, 4 // contribution to low half
	290	pxor xmm1, xmm3
	291	pxor xmm0, xmm2
	292
	293	// And then shift the low bits up.
	294	movdqa xmm2, xmm0 // copies of the high part
	295	movdqa xmm3, xmm0
	296	pxor xmm1, xmm0 // mix in the unit contribution
	297	psrld xmm0, 6
	298	psrld xmm2, 9
	299	psrld xmm3, 10
	300	pxor xmm1, xmm2 // low half, unit, and t^9 contribs
	301	pxor xmm0, xmm3 // t^6 and t^10 contribs
	302	pxor xmm0, xmm1 // mix them together and we're done
	303	.endm
	304
	305	.macro mul192
	306	// Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave
	307	// with z = u v in xmm0/xmm1 -- the top halves of the high registers
	308	// are unimportant. Clobbers xmm2--xmm7.
	309
	310	// Start multiplying and accumulating pieces of product.
	311	// xmm0 = // (u_1; u_2)
	312	// xmm1 = // (?; u_0)
	313	// xmm2 = // (v_1; v_2)
	314	// xmm3 = // (?; v_0)
	315	movdqa xmm4, xmm0 // (u_1; u_2) again
	316	movdqa xmm5, xmm0 // (u_1; u_2) yet again
	317	movdqa xmm6, xmm0 // (u_1; u_2) again again
	318	movdqa xmm7, xmm3 // (?; v_0) again
	319	punpcklqdq xmm3, xmm1 // (u_0; v_0)
	320	pclmulhqhqdq xmm4, xmm2 // u_1 v_1
	321	pclmullqlqdq xmm1, xmm2 // u_0 v_2
	322	pclmullqhqdq xmm5, xmm2 // u_2 v_1
	323	pclmulhqlqdq xmm6, xmm2 // u_1 v_2
	324	pxor xmm1, xmm4 // u_0 v_2 + u_1 v_1
	325	pclmullqlqdq xmm7, xmm0 // u_2 v_0
	326	pxor xmm5, xmm6 // b = u_2 v_1 + u_1 v_2
	327	movdqa xmm6, xmm0 // (u_1; u_2) like a bad penny
	328	pxor xmm1, xmm7 // c = u_0 v_2 + u_1 v_1 + u_2 v_0
	329	pclmullqlqdq xmm0, xmm2 // a = u_2 v_2
	330	pclmulhqlqdq xmm6, xmm3 // u_1 v_0
	331	pclmulhqhqdq xmm2, xmm3 // u_0 v_1
	332	pclmullqhqdq xmm3, xmm3 // e = u_0 v_0
	333	pxor xmm6, xmm2 // d = u_1 v_0 + u_0 v_1
	334
	335	// Next, the piecing together of the product. There's significant
	336	// work here to leave the completed pieces in sensible registers.
	337	// xmm0 = // (a_0; a_1) = a = u_2 v_2
	338	// xmm5 = // (b_0; b_1) = b = u_1 v_2 + u_2 v_1
	339	// xmm1 = // (c_0; c_1) = c = u_0 v_2 +
	340	// u_1 v_1 + u_2 v_0
	341	// xmm6 = // (d_0; d_1) = d = u_0 v_1 + u_1 v_0
	342	// xmm3 = // (e_0; e_1) = e = u_0 v_0
	343	// xmm2, xmm4, xmm7 spare
	344	movdqa xmm2, xmm6 // (d_0; d_1) again
	345	movdqa xmm4, xmm5 // (b_0; b_1) again
	346	pslldq xmm6, 8 // (d_1; 0)
	347	psrldq xmm5, 8 // (0; b_0)
	348	psrldq xmm2, 8 // (0; d_0)
	349	pslldq xmm4, 8 // (b_1; 0)
	350	pxor xmm5, xmm6 // (d_1; b_0)
	351	pxor xmm0, xmm4 // (x_4; x_5) = (a_0 + b_1; a_1)
	352	pxor xmm2, xmm3 // (x_0; x_1) = (e_0; e_1 + d_0)
	353	pxor xmm1, xmm5 // (x_2; x_3) = (c_0 + d_1; b_0 + c_1)
	354
	355	// Next, the reduction. Our polynomial this time is p(x) = t^192 +
	356	// t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the
	357	// 128-bit case. I don't know why.
	358
	359	// First, shift the high bits down.
	360	// xmm0 = // (x_4; x_5)
	361	// xmm1 = // (x_2; x_3)
	362	// xmm2 = // (x_0; x_1)
	363	// xmm3--xmm7 spare
	364	movdqa xmm3, xmm0 // (x_4; x_5) copy
	365	movdqa xmm4, xmm0 // (x_4; x_5) copy
	366	movdqa xmm5, xmm0 // (x_4; x_5) copy
	367	pslld xmm3, 31 // (x_4; x_5) b_i for t
	368	pslld xmm4, 30 // (x_4; x_5) b_i for t^2
	369	pslld xmm5, 25 // (x_4; x_5) b_i for t^7
	370	movq xmm6, xmm1 // (0; x_3) copy
	371	pxor xmm3, xmm4
	372	movq xmm7, xmm1 // (0; x_3) copy
	373	pxor xmm3, xmm5
	374	movq xmm5, xmm1 // (0; x_3) copy
	375	movdqa xmm4, xmm3 // (x_4; x_5) b_i combined
	376	pslld xmm6, 31 // (0; x_3) b_i for t
	377	pslld xmm7, 30 // (0; x_3) b_i for t^2
	378	pslld xmm5, 25 // (0; x_3) b_i for t^7
	379	psrldq xmm3, 12 // (x_4; x_5) low contrib
	380	pslldq xmm4, 4 // (x_4; x_5) high contrib
	381	pxor xmm6, xmm7
	382	pxor xmm2, xmm3
	383	pxor xmm6, xmm5
	384	pxor xmm1, xmm4
	385	pslldq xmm6, 4
	386	pxor xmm2, xmm6
	387
	388	// And finally shift the low bits up. Unfortunately, we also have to
	389	// split the low bits out.
	390	// xmm0 = // (x'_4; x'_5)
	391	// xmm1 = // (x'_2; x'_3)
	392	// xmm2 = // (x'_0; x'_1)
	393	movdqa xmm5, xmm1 // copies of (x'_2; x'_3)
	394	movdqa xmm6, xmm1
	395	movdqa xmm7, xmm1
	396	psrldq xmm1, 8 // bring down (?; x'_2)
	397	movdqa xmm3, xmm0 // copies of (x'_4; x'_5)
	398	movdqa xmm4, xmm0
	399	punpcklqdq xmm1, xmm2 // (x'_1; x'_2)
	400	psrldq xmm2, 8 // (?; x'_0)
	401	pxor xmm2, xmm5 // low half and unit contrib
	402	pxor xmm1, xmm0
	403	psrld xmm5, 1
	404	psrld xmm0, 1
	405	psrld xmm6, 2
	406	psrld xmm3, 2
	407	psrld xmm7, 7
	408	psrld xmm4, 7
	409	pxor xmm2, xmm6 // low half, unit, t^2 contribs
	410	pxor xmm1, xmm3
	411	pxor xmm5, xmm7 // t and t^7 contribs
	412	pxor xmm0, xmm4
	413	pxor xmm5, xmm2 // mix everything together
	414	pxor xmm0, xmm1
	415	movq xmm1, xmm5 // shunt (?; z_0) into proper place
	416	.endm
	417
	418	.macro mul256
	419	// Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave
	420	// with z = u v in xmm0/xmm1. Clobbers xmm2--xmm7. On 32-bit x86,
	421	// requires 16 bytes aligned space at SP; on amd64, also clobbers
	422	// xmm8.
	423
	424	// Now it's starting to look worthwhile to do Karatsuba. Suppose
	425	// u = u_0 + u_1 B and v = v_0 + v_1 B. Then
	426	//
	427	// u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2
	428	//
	429	// Name these coefficients of B^i be a, b, and c, respectively, and
	430	// let r = u_0 + u_1 and s = v_0 + v_1. Then observe that
	431	//
	432	// q = r s = (u_0 + u_1) (v_0 + v_1)
	433	// = (u_0 v_0) + (u1 v_1) + (u_0 v_1 + u_1 v_0)
	434	// = a + c + b
	435	//
	436	// The first two terms we've already calculated; the last is the
	437	// remaining one we want. We'll set B = t^128. We know how to do
	438	// 128-bit multiplications already, and Karatsuba is too annoying
	439	// there, so there'll be 12 multiplications altogether, rather than
	440	// the 16 we'd have if we did this the naïve way.
	441	//
	442	// On x86, there aren't quite enough registers, so spill one for a
	443	// bit. On AMD64, we can keep on going, so it's all good.
	444
	445	// xmm0 = // u_1 = (u_10; u_11)
	446	// xmm1 = // u_0 = (u_00; u_01)
	447	// xmm2 = // v_1 = (v_10; v_11)
	448	// xmm3 = // v_0 = (v_00; v_01)
	449	movdqa xmm4, xmm0 // u_1 again
	450	#if CPUFAM_X86
	451	movdqa [SP + 0], xmm3
	452	#elif CPUFAM_AMD64
	453	movdqa xmm8, xmm3
	454	# define V0 xmm8
	455	#endif
	456	pxor xmm4, xmm1 // u_* = (u_00 + u_10; u_01 + u_11)
	457	pxor xmm3, xmm2 // v_* = (v_00 + v_10; v_01 + v_11)
	458
	459	// Start by building the cross product, q = u_* v_*.
	460	movdqa xmm7, xmm4 // more copies of u_*
	461	movdqa xmm5, xmm4
	462	movdqa xmm6, xmm4
	463	pclmullqhqdq xmm4, xmm3 // u_1 v_0
	464	pclmulhqlqdq xmm7, xmm3 // u_0 v_1
	465	pclmullqlqdq xmm5, xmm3 // u_1 v_1
	466	pclmulhqhqdq xmm6, xmm3 // u_0 v_0
	467	pxor xmm4, xmm7 // u_1 v_0 + u_0 v_1
	468	movdqa xmm7, xmm4
	469	pslldq xmm4, 8
	470	psrldq xmm7, 8
	471	pxor xmm5, xmm4 // q_1
	472	pxor xmm6, xmm7 // q_0
	473
	474	// Next, work on the high half, a = u_1 v_1.
	475	movdqa xmm3, xmm0 // more copies of u_1
	476	movdqa xmm4, xmm0
	477	movdqa xmm7, xmm0
	478	pclmullqhqdq xmm0, xmm2 // u_11 v_10
	479	pclmulhqlqdq xmm3, xmm2 // u_10 v_11
	480	pclmullqlqdq xmm4, xmm2 // u_11 v_11
	481	pclmulhqhqdq xmm7, xmm2 // u_10 v_10
	482	#if CPUFAM_X86
	483	movdqa xmm2, [SP + 0]
	484	# define V0 xmm2
	485	#endif
	486	pxor xmm0, xmm3 // u_10 v_11 + u_11 v_10
	487	movdqa xmm3, xmm0
	488	pslldq xmm0, 8
	489	psrldq xmm3, 8
	490	pxor xmm4, xmm0 // x_3 = a_1
	491	pxor xmm7, xmm3 // a_0
	492
	493	// Mix that into the product now forming in xmm4--xmm7.
	494	pxor xmm5, xmm4 // a_1 + q_1
	495	pxor xmm6, xmm7 // a_0 + q_0
	496	pxor xmm5, xmm7 // a_0 + (a_1 + q_1)
	497
	498	// Finally, the low half, c = u_0 v_0.
	499	movdqa xmm0, xmm1 // more copies of u_0
	500	movdqa xmm3, xmm1
	501	movdqa xmm7, xmm1
	502	pclmullqhqdq xmm1, V0 // u_01 v_00
	503	pclmulhqlqdq xmm0, V0 // u_00 v_01
	504	pclmullqlqdq xmm3, V0 // u_01 v_01
	505	pclmulhqhqdq xmm7, V0 // u_00 v_00
	506	pxor xmm0, xmm1 // u_10 v_11 + u_11 v_10
	507	movdqa xmm1, xmm0
	508	pslldq xmm0, 8
	509	psrldq xmm1, 8
	510	pxor xmm3, xmm0 // c_1
	511	pxor xmm7, xmm1 // x_0 = c_0
	512
	513	// And mix that in to complete the product.
	514	pxor xmm6, xmm3 // (a_0 + q_0) + c_1
	515	pxor xmm5, xmm3 // x_2 = a_0 + (a_1 + c_1 + q_1) = a_0 + b_1
	516	pxor xmm6, xmm7 // x_1 = (a_0 + c_0 + q_0) + c_1 = b_0 + c_1
	517
	518	#undef V0
	519
	520	// Now we must reduce. This is essentially the same as the 128-bit
	521	// case above, but more complicated because everything is bigger.
	522	// The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
	523
	524	// First, shift the high bits down.
	525	movdqa xmm0, xmm4 // x_3 again
	526	movdqa xmm1, xmm4 // x_3 yet again
	527	movdqa xmm2, xmm4 // x_3 again again
	528	pslld xmm0, 30 // x_3: b_i for t^2
	529	pslld xmm1, 27 // x_3: b_i for t^5
	530	pslld xmm2, 22 // x_3: b_i for t^10
	531	movdqa xmm3, xmm5 // x_2 again
	532	pxor xmm0, xmm1
	533	movdqa xmm1, xmm5 // x_2 again
	534	pxor xmm0, xmm2 // b_3
	535	movdqa xmm2, xmm5 // x_2 again
	536	pslld xmm3, 30 // x_2: b_i for t^2
	537	pslld xmm1, 27 // x_2: b_i for t^5
	538	pslld xmm2, 22 // x_2: b_i for t^10
	539	pxor xmm3, xmm1
	540	movdqa xmm1, xmm0
	541	pxor xmm3, xmm2 // b_2
	542	psrldq xmm0, 4
	543	movdqa xmm2, xmm3
	544	pslldq xmm1, 12
	545	psrldq xmm3, 4
	546	pxor xmm6, xmm0
	547	pslldq xmm2, 12
	548	pxor xmm7, xmm3
	549	pxor xmm5, xmm1
	550	pxor xmm6, xmm2
	551
	552	// And then shift the low bits up.
	553	movdqa xmm0, xmm4 // x_3 again
	554	movdqa xmm1, xmm5 // x_2 again
	555	movdqa xmm2, xmm4 // x_3 yet again
	556	movdqa xmm3, xmm5 // x_2 yet again
	557	pxor xmm6, xmm4 // x_1 and unit contrib from x_3
	558	pxor xmm7, xmm5 // x_0 and unit contrib from x_2
	559	psrld xmm4, 2
	560	psrld xmm5, 2
	561	psrld xmm0, 5
	562	psrld xmm1, 5
	563	psrld xmm2, 10
	564	psrld xmm3, 10
	565	pxor xmm4, xmm6 // x_1, with x_3 units and t^2
	566	pxor xmm5, xmm7 // x_0, with x_2 units and t^2
	567	pxor xmm0, xmm2 // x_3 t^5 and t^10 contribs
	568	pxor xmm1, xmm3 // x_2 t^5 and t^10 contribs
	569	pxor xmm0, xmm4 // high half of reduced result
	570	pxor xmm1, xmm5 // low half; all done
	571	.endm
	572
	573	///--------------------------------------------------------------------------
	574	/// Main code.
	575
	576	// There are a number of representations of field elements in this code and
	577	// it can be confusing.
	578	//
	579	// * The `external format' consists of a sequence of contiguous bytes in
	580	// memory called a `block'. The GCM spec explains how to interpret this
	581	// block as an element of a finite field. As discussed extensively, this
	582	// representation is very annoying for a number of reasons. On the other
	583	// hand, this code never actually deals with it directly.
	584	//
	585	// * The `register format' consists of one or more XMM registers, depending
	586	// on the block size. The bytes in these registers are in reverse order
	587	// -- so the least-significant byte of the lowest-numbered register holds
	588	// the /last/ byte in the block. If the block size is not a multiple of
	589	// 16 bytes, then there must be padding. 96-bit blocks are weird: the
	590	// padding is inserted at the /least/ significant end, so the register
	591	// holds (x_2, x_1; x_0, 0); otherwise, the padding goes at the most
	592	// significant end.
	593	//
	594	// * The `words' format consists of a sequence of bytes, as in the
	595	// `external format', but, according to the blockcipher in use, the bytes
	596	// within each 32-bit word may be reversed (`big-endian') or not
	597	// (`little-endian'). Accordingly, there are separate entry points for
	598	// each variant, identified with `b' or `l'.
	599
	600	#define SSEFUNC(f) \
	601	FUNC(f##_avx); vzeroupper; endprologue; ENDFUNC; \
	602	FUNC(f)
	603
	604	SSEFUNC(gcm_mulk_128b_x86ish_pclmul)
	605	// On entry, A points to a 128-bit field element in big-endian words
	606	// format; K points to a field-element in register format. On exit,
	607	// A is updated with the product A K.
	608
	609	#if CPUFAM_X86
	610	mov A, [SP + 4]
	611	mov K, [SP + 8]
	612	#endif
	613	endprologue
	614	movdqu xmm0, [A]
	615	movdqu xmm1, [K]
	616	pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
	617	mul128
	618	pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
	619	movdqu [A], xmm0
	620	ret
	621	ENDFUNC
	622
	623	SSEFUNC(gcm_mulk_128l_x86ish_pclmul)
	624	// On entry, A points to a 128-bit field element in little-endian
	625	// words format; K points to a field-element in register format. On
	626	// exit, A is updated with the product A K.
	627
	628	#if CPUFAM_X86
	629	mov A, [SP + 4]
	630	mov K, [SP + 8]
	631	ldgot ecx
	632	#endif
	633	endprologue
	634	movdqa xmm7, [INTADDR(swaptab_128l, ecx)]
	635	movdqu xmm0, [A]
	636	movdqu xmm1, [K]
	637	pshufb xmm0, xmm7
	638	mul128
	639	pshufb xmm0, xmm7
	640	movdqu [A], xmm0
	641	ret
	642	ENDFUNC
	643
	644	SSEFUNC(gcm_mulk_64b_x86ish_pclmul)
	645	// On entry, A points to a 64-bit field element in big-endian words
	646	// format; K points to a field-element in register format. On exit,
	647	// A is updated with the product A K.
	648
	649	#if CPUFAM_X86
	650	mov A, [SP + 4]
	651	mov K, [SP + 8]
	652	#endif
	653	endprologue
	654	movq xmm0, [A]
	655	movq xmm1, [K]
	656	pshufd xmm0, xmm0, SHUF(3, 3, 0, 1)
	657	mul64
	658	pshufd xmm0, xmm0, SHUF(3, 3, 0, 1)
	659	movq [A], xmm0
	660	ret
	661	ENDFUNC
	662
	663	SSEFUNC(gcm_mulk_64l_x86ish_pclmul)
	664	// On entry, A points to a 64-bit field element in little-endian
	665	// words format; K points to a field-element in register format. On
	666	// exit, A is updated with the product A K.
	667
	668	#if CPUFAM_X86
	669	mov A, [SP + 4]
	670	mov K, [SP + 8]
	671	ldgot ecx
	672	#endif
	673	endprologue
	674	movdqa xmm7, [INTADDR(swaptab_64l, ecx)]
	675	movq xmm0, [A]
	676	movq xmm1, [K]
	677	pshufb xmm0, xmm7
	678	mul64
	679	pshufb xmm0, xmm7
	680	movq [A], xmm0
	681	ret
	682	ENDFUNC
	683
	684	SSEFUNC(gcm_mulk_96b_x86ish_pclmul)
	685	// On entry, A points to a 96-bit field element in big-endian words
	686	// format; K points to a field-element in register format (i.e., 16
	687	// bytes, with the first four bytes zero). On exit, A is updated
	688	// with the product A K.
	689
	690	#if CPUFAM_X86
	691	mov A, [SP + 4]
	692	mov K, [SP + 8]
	693	#endif
	694	endprologue
	695	movq xmm0, [A + 0]
	696	movd xmm2, [A + 8]
	697	movdqu xmm1, [K]
	698	punpcklqdq xmm0, xmm2
	699	pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
	700	mul96
	701	pshufd xmm1, xmm0, SHUF(0, 1, 2, 3)
	702	psrldq xmm0, 4
	703	movq [A + 0], xmm1
	704	movd [A + 8], xmm0
	705	ret
	706	ENDFUNC
	707
	708	SSEFUNC(gcm_mulk_96l_x86ish_pclmul)
	709	// On entry, A points to a 96-bit field element in little-endian
	710	// words format; K points to a field-element in register format
	711	// (i.e., 16 bytes, with the first four bytes zero). On exit, A is
	712	// updated with the product A K.
	713
	714	#if CPUFAM_X86
	715	mov A, [SP + 4]
	716	mov K, [SP + 8]
	717	ldgot ecx
	718	#endif
	719	endprologue
	720	movdqa xmm7, [INTADDR(swaptab_128l, ecx)]
	721	movq xmm0, [A + 0]
	722	movd xmm2, [A + 8]
	723	movdqu xmm1, [K]
	724	punpcklqdq xmm0, xmm2
	725	pshufb xmm0, xmm7
	726	mul96
	727	pshufb xmm0, xmm7
	728	movq [A + 0], xmm0
	729	psrldq xmm0, 8
	730	movd [A + 8], xmm0
	731	ret
	732	ENDFUNC
	733
	734	SSEFUNC(gcm_mulk_192b_x86ish_pclmul)
	735	// On entry, A points to a 192-bit field element in big-endian words
	736	// format; K points to a field-element in register format. On exit,
	737	// A is updated with the product A K.
	738
	739	#if CPUFAM_X86
	740	mov A, [SP + 4]
	741	mov K, [SP + 8]
	742	#endif
	743	#if CPUFAM_AMD64 && ABI_WIN
	744	stalloc 2*16 + 8
	745	savexmm xmm6, 0
	746	savexmm xmm7, 16
	747	#endif
	748	endprologue
	749	movdqu xmm0, [A + 8]
	750	movq xmm1, [A + 0]
	751	movdqu xmm2, [K + 0]
	752	movq xmm3, [K + 16]
	753	pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
	754	pshufd xmm1, xmm1, SHUF(3, 3, 0, 1)
	755	mul192
	756	pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
	757	pshufd xmm1, xmm1, SHUF(3, 3, 0, 1)
	758	movdqu [A + 8], xmm0
	759	movq [A + 0], xmm1
	760	#if CPUFAM_AMD64 && ABI_WIN
	761	rstrxmm xmm6, 0
	762	rstrxmm xmm7, 16
	763	stfree 2*16 + 8
	764	#endif
	765	ret
	766	ENDFUNC
	767
	768	SSEFUNC(gcm_mulk_192l_x86ish_pclmul)
	769	// On entry, A points to a 192-bit field element in little-endian
	770	// words format; K points to a field-element in register format. On
	771	// exit, A is updated with the product A K.
	772
	773	#if CPUFAM_X86
	774	mov A, [SP + 4]
	775	mov K, [SP + 8]
	776	ldgot ecx
	777	#endif
	778	#if CPUFAM_AMD64 && ABI_WIN
	779	stalloc 2*16 + 8
	780	savexmm xmm6, 0
	781	savexmm xmm7, 16
	782	#endif
	783	endprologue
	784	movdqu xmm0, [A + 8]
	785	movq xmm1, [A + 0]
	786	movdqu xmm2, [K + 0]
	787	movq xmm3, [K + 16]
	788	pshufb xmm0, [INTADDR(swaptab_128l, ecx)]
	789	pshufb xmm1, [INTADDR(swaptab_64l, ecx)]
	790	mul192
	791	pshufb xmm0, [INTADDR(swaptab_128l, ecx)]
	792	pshufb xmm1, [INTADDR(swaptab_64l, ecx)]
	793	movdqu [A + 8], xmm0
	794	movq [A + 0], xmm1
	795	#if CPUFAM_AMD64 && ABI_WIN
	796	rstrxmm xmm6, 0
	797	rstrxmm xmm7, 16
	798	stfree 2*16 + 8
	799	#endif
	800	ret
	801	ENDFUNC
	802
	803	SSEFUNC(gcm_mulk_256b_x86ish_pclmul)
	804	// On entry, A points to a 256-bit field element in big-endian words
	805	// format; K points to a field-element in register format. On exit,
	806	// A is updated with the product A K.
	807
	808	#if CPUFAM_X86
	809	pushreg BP
	810	setfp
	811	mov A, [SP + 8]
	812	mov K, [SP + 12]
	813	stalloc 16
	814	and SP, ~15
	815	#endif
	816	#if CPUFAM_AMD64 && ABI_WIN
	817	stalloc 3*16 + 8
	818	savexmm xmm6, 0
	819	savexmm xmm7, 16
	820	savexmm xmm8, 32
	821	#endif
	822	endprologue
	823	movdqu xmm0, [A + 16]
	824	movdqu xmm1, [A + 0]
	825	movdqu xmm2, [K + 0]
	826	movdqu xmm3, [K + 16]
	827	pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
	828	pshufd xmm1, xmm1, SHUF(0, 1, 2, 3)
	829	mul256
	830	pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
	831	pshufd xmm1, xmm1, SHUF(0, 1, 2, 3)
	832	movdqu [A + 16], xmm0
	833	movdqu [A + 0], xmm1
	834	#if CPUFAM_X86
	835	dropfp
	836	popreg BP
	837	#endif
	838	#if CPUFAM_AMD64 && ABI_WIN
	839	rstrxmm xmm6, 0
	840	rstrxmm xmm7, 16
	841	rstrxmm xmm8, 32
	842	stfree 3*16 + 8
	843	#endif
	844	ret
	845	ENDFUNC
	846
	847	SSEFUNC(gcm_mulk_256l_x86ish_pclmul)
	848	// On entry, A points to a 256-bit field element in little-endian
	849	// words format; K points to a field-element in register format. On
	850	// exit, A is updated with the product A K.
	851
	852	#if CPUFAM_X86
	853	pushreg BP
	854	setfp
	855	mov A, [SP + 8]
	856	mov K, [SP + 12]
	857	stalloc 16
	858	ldgot ecx
	859	and SP, ~15
	860	#endif
	861	#if CPUFAM_AMD64 && ABI_WIN
	862	stalloc 3*16 + 8
	863	savexmm xmm6, 0
	864	savexmm xmm7, 16
	865	savexmm xmm8, 32
	866	#endif
	867	endprologue
	868	movdqa xmm7, [INTADDR(swaptab_128l, ecx)]
	869	movdqu xmm0, [A + 16]
	870	movdqu xmm1, [A + 0]
	871	movdqu xmm2, [K + 0]
	872	movdqu xmm3, [K + 16]
	873	pshufb xmm0, xmm7
	874	pshufb xmm1, xmm7
	875	mul256
	876	movdqa xmm7, [INTADDR(swaptab_128l, ecx)]
	877	pshufb xmm0, xmm7
	878	pshufb xmm1, xmm7
	879	movdqu [A + 16], xmm0
	880	movdqu [A + 0], xmm1
	881	#if CPUFAM_X86
	882	dropfp
	883	popreg BP
	884	#endif
	885	#if CPUFAM_AMD64 && ABI_WIN
	886	rstrxmm xmm6, 0
	887	rstrxmm xmm7, 16
	888	rstrxmm xmm8, 32
	889	stfree 3*16 + 8
	890	#endif
	891	ret
	892	ENDFUNC
	893
	894	RODATA
	895
	896	.balign 16
	897	swaptab_128l:
	898	// Table for byte-swapping little-endian words-format blocks larger
	899	// than 64 bits.
	900	.byte 15, 14, 13, 12, 11, 10, 9, 8
	901	.byte 7, 6, 5, 4, 3, 2, 1, 0
	902
	903	.balign 16
	904	swaptab_64l:
	905	// Table for byte-swapping 64-bit little-endian words-format blocks.
	906	.byte 7, 6, 5, 4, 3, 2, 1, 0
	907	.byte 255, 255, 255, 255, 255, 255, 255, 255
	908
	909	///----- That's all, folks --------------------------------------------------