/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// GCM acceleration for x86 processors
///
/// (c) 2018 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software: you can redistribute it and/or modify it
/// under the terms of the GNU Library General Public License as published
/// by the Free Software Foundation; either version 2 of the License, or
/// (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful, but
/// WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
/// Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb.  If not, write to the Free Software
/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
/// USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

        .arch   .pclmul

        .text

///--------------------------------------------------------------------------
/// Common register allocation.

#if CPUFAM_X86
# define A eax
# define K edx
#elif CPUFAM_AMD64 && ABI_SYSV
# define A rdi
# define K rsi
#elif CPUFAM_AMD64 && ABI_WIN
# define A rcx
# define K rdx
#endif

///--------------------------------------------------------------------------
/// Multiplication macros.

// The good news is that we have a fancy instruction to do the
// multiplications.  The bad news is that it's not particularly well-
// suited to the job.
//
// For one thing, it only does a 64-bit multiplication, so in general
// we'll need to synthesize the full-width multiply by hand.  For
// another thing, it doesn't help with the reduction, so we have to
// do that by hand too.  And, finally, GCM has crazy bit ordering,
// and the instruction does nothing useful for that at all.
//
// Focusing on that last problem first: the bits aren't in monotonic
// significance order unless we permute them.  If we reverse the byte
// order, then we'll have the bits in monotonic order, but backwards,
// so the degree-0 coefficient will be in the most-significant bit.
//
// This is less of a difficulty than it seems at first, because
// algebra.  Suppose we are given u = SUM_{0<=i<n} u_i t^i and v =
// SUM_{0<=j<n} v_j t^j; then
//
//      u v = SUM_{0<=i,j<n} u_i v_j t^{i+j}
//
// Suppose instead that we're given ũ = SUM_{0<=i<n} u_{n-i-1} t^i
// and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards.
// Then
//
//      ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j}
//          = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)}
//
// which is almost the bit-reversal of u v, only it's shifted right
// by one place.  Oh, well: we'll have to shift it back later.
//
// That was important to think about, but there's not a great deal to
// do about it yet other than to convert what we've got from the
// blockcipher's byte-ordering convention to our big-endian
// convention.  Since this depends on the blockcipher convention,
// we'll leave the caller to cope with this: the macros here will
// assume that the operands are in `register' format, which is the
// byte-reversal of the external representation, padded at the
// most-significant end except for 96-bit blocks, which are
// zero-padded at the least-significant end (see `mul96' for the
// details).  In the commentary, pieces of polynomial are numbered
// according to the degree of the coefficients, so the unit
// coefficient of some polynomial a is in a_0.
//
// The commentary for `mul128' is the most detailed.  The other
// macros assume that you've already read and understood that.
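//
// A concrete way to convince yourself of the algebra above: the
// following Python sketch (illustrative only; it's no part of the
// build) checks the claim for a tiny case.
//
//      def clmul(a, b):                # carry-less multiplication
//          z = 0
//          i = 0
//          while b >> i:
//              if (b >> i) & 1: z ^= a << i
//              i += 1
//          return z
//
//      def rev(x, w):                  # bit-reverse x within w bits
//          return int(format(x, '0%db' % w)[::-1], 2)
//
//      n = 4
//      u, v = 0b1011, 0b0110
//      # The product of the reversed operands is exactly the
//      # (2n-1)-bit reversal of u v ...
//      assert clmul(rev(u, n), rev(v, n)) == rev(clmul(u, v), 2*n - 1)
//      # ... which, viewed in a 2n-bit register, sits one place low.
//      assert clmul(rev(u, n), rev(v, n)) == rev(clmul(u, v), 2*n) >> 1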

.macro mul128
        // Enter with u and v in xmm0 and xmm1 respectively; leave with z =
        // u v in xmm0.  Clobbers xmm1--xmm4.

        // First for the double-precision multiplication.  It's tempting to
        // use Karatsuba's identity here, but I suspect that loses more in
        // the shifting, bit-twiddling, and dependency chains that it gains
        // in saving a multiplication which otherwise pipelines well.
        // xmm0 =                       // (u_1; u_0)
        // xmm1 =                       // (v_1; v_0)
        movdqa  xmm2, xmm1              // (v_1; v_0) again
        movdqa  xmm3, xmm0              // (u_1; u_0) again
        movdqa  xmm4, xmm0              // (u_1; u_0) yet again
        pclmulhqlqdq xmm2, xmm0         // u_1 v_0
        pclmullqlqdq xmm0, xmm1         // u_1 v_1
        pclmulhqlqdq xmm3, xmm1         // u_0 v_1
        pclmulhqhqdq xmm4, xmm1         // u_0 v_0

        // Arrange the pieces to form a double-precision polynomial.
        pxor    xmm2, xmm3              // (m_1; m_0) = u_1 v_0 + u_0 v_1
        movdqa  xmm1, xmm2              // (m_1; m_0) again
        pslldq  xmm2, 8                 // (0; m_1)
        psrldq  xmm1, 8                 // (m_0; 0)
        pxor    xmm0, xmm2              // x_1 = u_1 v_1 + m_1
        pxor    xmm1, xmm4              // x_0 = u_0 v_0 + t^64 m_0

        // Two problems remain.  The first is that this product is shifted
        // left (from GCM's backwards perspective) by one place, which is
        // annoying.  Let's take care of that now.  Once this is done, we'll
        // be properly in GCM's backwards bit-ordering, so xmm1 will hold the
        // low half of the product and xmm0 the high half.  (The following
        // diagrams show bit 0 consistently on the right.)
        //
        //                              xmm1
        //      ,-------------.-------------.-------------.-------------.
        //      | 0 x_0-x_30  |  x_31-x_62  |  x_63-x_94  | x_95-x_126  |
        //      `-------------^-------------^-------------^-------------'
        //
        //                              xmm0
        //      ,-------------.-------------.-------------.-------------.
        //      | x_127-x_158 | x_159-x_190 | x_191-x_222 | x_223-x_254 |
        //      `-------------^-------------^-------------^-------------'
        //
        // We start by shifting each 32-bit lane right (from GCM's point of
        // view -- physically, left) by one place, which gives us this:
        //
        //                          low (xmm3)
        //      ,-------------.-------------.-------------.-------------.
        //      | x_0-x_30 0  | x_32-x_62 0 | x_64-x_94 0 | x_96-x_126 0|
        //      `-------------^-------------^-------------^-------------'
        //
        //                          high (xmm2)
        //      ,-------------.-------------.-------------.-------------.
        //      |x_128-x_158 0|x_160-x_190 0|x_192-x_222 0|x_224-x_254 0|
        //      `-------------^-------------^-------------^-------------'
        //
        // but we've lost a bunch of bits.  We separately shift each lane
        // left by 31 places to give us the bits we lost.
        //
        //                          low (xmm1)
        //      ,-------------.-------------.-------------.-------------.
        //      |    0...0    | 0...0 x_31  | 0...0 x_63  | 0...0 x_95  |
        //      `-------------^-------------^-------------^-------------'
        //
        //                          high (xmm0)
        //      ,-------------.-------------.-------------.-------------.
        //      | 0...0 x_127 | 0...0 x_159 | 0...0 x_191 | 0...0 x_223 |
        //      `-------------^-------------^-------------^-------------'
        //
        // Which is close, but we don't get a cigar yet.  To get the missing
        // bits into position, we shift each of these right by a lane, but,
        // alas, the x_127 falls off, so, separately, we shift the high
        // register left by three lanes, so that everything is lined up
        // properly when we OR them all together:
        //
        //                          low (xmm1)
        //      ,-------------.-------------.-------------.-------------.
        //      | 0...0 x_31  | 0...0 x_63  | 0...0 x_95  |    0...0    |
        //      `-------------^-------------^-------------^-------------'
        //
        //                          wrap (xmm4)
        //      ,-------------.-------------.-------------.-------------.
        //      |    0...0    |    0...0    |    0...0    | 0...0 x_127 |
        //      `-------------^-------------^-------------^-------------'
        //
        //                          high (xmm0)
        //      ,-------------.-------------.-------------.-------------.
        //      | 0...0 x_159 | 0...0 x_191 | 0...0 x_223 |    0...0    |
        //      `-------------^-------------^-------------^-------------'
        //
        // The `low' and `wrap' registers (xmm1, xmm3, xmm4) then collect the
        // low 128 coefficients, while the `high' registers (xmm0, xmm2)
        // collect the high 127 coefficients, leaving a zero bit at the most
        // significant end as we expect.

        // xmm0 =                       // (x_7, x_6; x_5, x_4)
        // xmm1 =                       // (x_3, x_2; x_1, x_0)
        movdqa  xmm3, xmm1              // (x_3, x_2; x_1, x_0) again
        movdqa  xmm2, xmm0              // (x_7, x_6; x_5, x_4) again
        psrld   xmm1, 31                // shifted left; just the carries
        psrld   xmm0, 31
        pslld   xmm3, 1                 // shifted right, but dropped carries
        pslld   xmm2, 1
        movdqa  xmm4, xmm0              // another copy for the carry around
        pslldq  xmm1, 4                 // move carries over
        pslldq  xmm0, 4
        psrldq  xmm4, 12                // the big carry wraps around
        por     xmm1, xmm3
        por     xmm0, xmm2              // (y_7, y_6; y_5, y_4)
        por     xmm1, xmm4              // (y_3, y_2; y_1, y_0)

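        // The lane juggling above is easier to follow in scalar form.
        // Here's a Python sketch of the same trick -- a whole-register
        // one-bit shift built from 32-bit lane operations, just as the
        // psrld/pslld/pslldq/por sequence does it.  (Illustrative only;
        // it's no part of the build.)
        //
        //      def perlane(x, f):      # apply f to each 32-bit lane
        //          return sum((f((x >> 32*i) & 0xffffffff) & 0xffffffff)
        //                     << 32*i for i in range(4))
        //
        //      def shift_up_1(x):      # x: a 128-bit register value
        //          main  = perlane(x, lambda w: w << 1)    # pslld #1
        //          carry = perlane(x, lambda w: w >> 31)   # psrld #31
        //          lo = (main | (carry << 32)) & ((1 << 128) - 1)
        //                                  # pslldq #4, then por
        //          return lo, carry >> 96  # psrldq #12: the big carry
        //
        //      x = 0x80000001800000018000000180000001
        //      assert shift_up_1(x) == ((x << 1) & ((1 << 128) - 1),
        //                               x >> 127)
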
        // And the other problem is that the result needs to be reduced
        // modulo p(t) = t^128 + t^7 + t^2 + t + 1.  Let R = t^128 = t^7 +
        // t^2 + t + 1 in our field.  So far, we've calculated z_0 and z_1
        // such that z_0 + z_1 R = u v using the identity R = t^128: now we
        // must collapse the two halves of z together using the other
        // identity R = t^7 + t^2 + t + 1.
        //
        // We do this by working on each 32-bit word of the high half of z
        // separately, so consider y_i, for some 4 <= i < 8.  Certainly, y_i
        // t^{32i} = y_i R t^{32(i-4)} = (t^7 + t^2 + t + 1) y_i t^{32(i-4)},
        // but we can't use that directly without breaking up the 32-bit word
        // structure.  Instead, we start by considering just y_i t^7
        // t^{32(i-4)}, which again looks tricky.  Now, split y_i = a_i +
        // t^25 b_i, with deg a_i < 25; then
        //
        //      y_i t^7 t^{32(i-4)} = a_i t^7 t^{32(i-4)} + b_i t^{32(i-3)}
        //
        // We can similarly decompose y_i t^2 and y_i t into a pair of 32-bit
        // contributions to the t^{32(i-4)} and t^{32(i-3)} words, but the
        // splits are different.  This is lovely, with one small snag: when
        // we do this to y_7, we end up with a contribution back into the
        // t^128 coefficient word.  But notice that only the low seven bits
        // of this word are affected, so there's no knock-on contribution
        // into the t^32 word.  Therefore, if we handle the high bits of each
        // word together, and then the low bits, everything will be fine.

        // First, shift the high bits down.
        movdqa  xmm2, xmm0              // (y_7, y_6; y_5, y_4) again
        movdqa  xmm3, xmm0              // (y_7, y_6; y_5, y_4) yet again
        movdqa  xmm4, xmm0              // (y_7, y_6; y_5, y_4) again again
        pslld   xmm2, 31                // the b_i for t
        pslld   xmm3, 30                // the b_i for t^2
        pslld   xmm4, 25                // the b_i for t^7
        pxor    xmm2, xmm3              // add them all together
        pxor    xmm2, xmm4
        movdqa  xmm3, xmm2              // and a copy for later
        psrldq  xmm2, 4                 // contribution into low half
        pslldq  xmm3, 12                // and high half
        pxor    xmm1, xmm2
        pxor    xmm0, xmm3

        // And then shift the low bits up.
        movdqa  xmm2, xmm0
        movdqa  xmm3, xmm0
        pxor    xmm1, xmm0              // mix in the unit contribution
        psrld   xmm0, 1
        psrld   xmm2, 2
        psrld   xmm3, 7
        pxor    xmm1, xmm2              // low half, unit, and t^2 contribs
        pxor    xmm0, xmm3              // t and t^7 contribs
        pxor    xmm0, xmm1              // mix them together and we're done
.endm
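
// For reference, here's what `mul128' computes, as a Python sketch
// (illustrative only; `clmul' and `rev' are as in the sketch near the
// top of the file).  The operands and result are 128-bit integers in
// `register format', i.e., bit-reversed, and the reduction repeatedly
// folds using t^128 = t^7 + t^2 + t + 1.
//
//      def mul128_model(u, v):
//          z = clmul(rev(u, 128), rev(v, 128))     # 255-bit product
//          for _ in range(2):          # second pass catches the fold's
//              hi, z = z >> 128, z & ((1 << 128) - 1)      # own spill
//              z ^= hi ^ (hi << 1) ^ (hi << 2) ^ (hi << 7)
//          return rev(z, 128)
//
// The macro gets the same answer without ever leaving the reversed
// world: the one-bit shift stands in for the final `rev', and the
// fold is carried out 32 bits at a time.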

.macro mul64
        // Enter with u and v in the low halves of xmm0 and xmm1
        // respectively; leave with z = u v in xmm0.  Clobbers xmm1--xmm4.

        // The multiplication is thankfully easy.
        pclmullqlqdq xmm0, xmm1         // u v

        // Shift the product up by one place.  After this, we're in GCM
        // bizarro-world.
        movdqa  xmm1, xmm0              // u v again
        psrld   xmm0, 31                // shifted left; just the carries
        pslld   xmm1, 1                 // shifted right, but dropped carries
        pslldq  xmm0, 4                 // move carries over
        por     xmm1, xmm0              // (y_3, y_2; y_1, y_0)

        // Now we must reduce.  This is essentially the same as the 128-bit
        // case above, but mostly simpler because everything is smaller.  The
        // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.

        // First, we must detach the top (`low'!) half of the result.
        movdqa  xmm0, xmm1              // (y_3, y_2; y_1, y_0) again
        psrldq  xmm1, 8                 // (y_1, y_0; 0, 0)

        // Next, shift the high bits down.
        movdqa  xmm2, xmm0              // (y_3, y_2; ?, ?) again
        movdqa  xmm3, xmm0              // (y_3, y_2; ?, ?) yet again
        movdqa  xmm4, xmm0              // (y_3, y_2; ?, ?) again again
        pslld   xmm2, 31                // b_i for t
        pslld   xmm3, 29                // b_i for t^3
        pslld   xmm4, 28                // b_i for t^4
        pxor    xmm2, xmm3              // add them all together
        pxor    xmm2, xmm4
        movdqa  xmm3, xmm2              // and a copy for later
        movq    xmm2, xmm2              // zap high half
        pslldq  xmm3, 4                 // contribution into high half
        psrldq  xmm2, 4                 // and low half
        pxor    xmm0, xmm3
        pxor    xmm1, xmm2

        // And then shift the low bits up.
        movdqa  xmm2, xmm0
        movdqa  xmm3, xmm0
        pxor    xmm1, xmm0              // mix in the unit contribution
        psrld   xmm0, 1
        psrld   xmm2, 3
        psrld   xmm3, 4
        pxor    xmm1, xmm2              // low half, unit, and t^3 contribs
        pxor    xmm0, xmm3              // t and t^4 contribs
        pxor    xmm0, xmm1              // mix them together and we're done
.endm
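
// The matching model at this size (illustrative only; `clmul' and
// `rev' as before) just changes the width and the fold:
//
//      def mul64_model(u, v):
//          z = clmul(rev(u, 64), rev(v, 64))
//          for _ in range(2):          # fold t^64 = t^4 + t^3 + t + 1
//              hi, z = z >> 64, z & ((1 << 64) - 1)
//              z ^= hi ^ (hi << 1) ^ (hi << 3) ^ (hi << 4)
//          return rev(z, 64)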

.macro mul96
        // Enter with u and v in the /high/ three words of xmm0 and xmm1
        // respectively (and zero in the low word); leave with z = u v in the
        // high three words of xmm0, and /junk/ in the low word.  Clobbers
        // xmm1--xmm4.

        // This is an inconvenient size.  There's nothing for it but to do
        // four multiplications, as if for the 128-bit case.  It's possible
        // that there'd be cruft in the top 32 bits of the input registers,
        // so the operands are kept shifted up by four bytes, with zero in
        // the low word (hence the odd 96-bit register format).  This will
        // mean that the high 64 bits of the result (from GCM's viewpoint)
        // will be zero.
        // xmm0 =                       // (0, u_2; u_1, u_0)
        // xmm1 =                       // (0, v_2; v_1, v_0)
        movdqa  xmm2, xmm1              // (0, v_2; v_1, v_0) again
        movdqa  xmm3, xmm0              // (0, u_2; u_1, u_0) again
        movdqa  xmm4, xmm0              // (0, u_2; u_1, u_0) yet again
        pclmulhqlqdq xmm2, xmm0         // u_2 (v_1 t^32 + v_0) = e_0
        pclmullqlqdq xmm0, xmm1         // u_2 v_2 = d = (0; d)
        pclmulhqlqdq xmm3, xmm1         // v_2 (u_1 t^32 + u_0) = e_1
        pclmulhqhqdq xmm4, xmm1         // u_0 v_0 + (u_1 v_0 + u_0 v_1) t^32
                                        //   + u_1 v_1 t^64 = f

        // Extract the high and low halves of the 192-bit result.  We don't
        // need to be too picky about the unused high words of the result
        // registers.  The answer we want is d t^128 + e t^64 + f, where e =
        // e_0 + e_1.
        //
        // The place values for the two halves are (t^160, t^128; t^96, ?)
        // and (?, t^64; t^32, 1).
        psrldq  xmm0, 8                 // (d; 0) = d t^128
        pxor    xmm2, xmm3              // e = (e_0 + e_1)
        movdqa  xmm1, xmm4              // f again
        pxor    xmm0, xmm2              // d t^128 + e t^64
        psrldq  xmm2, 12                // e[31..0] t^64
        psrldq  xmm1, 4                 // f[95..0]
        pslldq  xmm4, 8                 // f[127..96]
        pxor    xmm1, xmm2              // low 96 bits of result
        pxor    xmm0, xmm4              // high 96 bits of result

        // Next, shift everything one bit to the right (from GCM's point of
        // view -- physically, left) to compensate for GCM's strange
        // ordering.  This will be easier if we shift up the high half by a
        // word before we start.  After this we're in GCM bizarro-world.
        movdqa  xmm3, xmm1              // low half again
        pslldq  xmm0, 4                 // shift high half
        psrld   xmm1, 31                // shift low half up: just carries
        movdqa  xmm2, xmm0              // copy high half
        pslld   xmm3, 1                 // shift low half down: drop carries
        psrld   xmm0, 31                // shift high half up: just carries
        pslld   xmm2, 1                 // shift high half down: drop carries
        movdqa  xmm4, xmm0              // copy high carries for carry-around
        pslldq  xmm0, 4                 // shift carries down
        pslldq  xmm1, 4
        psrldq  xmm4, 12                // the big carry wraps around
        por     xmm1, xmm3
        por     xmm0, xmm2
        por     xmm1, xmm4

        // Finally, the reduction.  This is essentially the same as the
        // 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
        // t^9 + t^6 + 1.  The degrees are larger but not enough to cause
        // trouble for the general approach.

        // First, shift the high bits down.
        movdqa  xmm2, xmm0              // copies of the high part
        movdqa  xmm3, xmm0
        movdqa  xmm4, xmm0
        pslld   xmm2, 26                // b_i for t^6
        pslld   xmm3, 23                // b_i for t^9
        pslld   xmm4, 22                // b_i for t^10
        pxor    xmm2, xmm3              // add them all together
        pslldq  xmm1, 4                 // shift low part up to match
        pxor    xmm2, xmm4
        movdqa  xmm3, xmm2              // and a copy for later
        pslldq  xmm2, 8                 // contribution to high half
        psrldq  xmm3, 4                 // contribution to low half
        pxor    xmm1, xmm3
        pxor    xmm0, xmm2

        // And then shift the low bits up.
        movdqa  xmm2, xmm0              // copies of the high part
        movdqa  xmm3, xmm0
        pxor    xmm1, xmm0              // mix in the unit contribution
        psrld   xmm0, 6
        psrld   xmm2, 9
        psrld   xmm3, 10
        pxor    xmm1, xmm2              // low half, unit, and t^9 contribs
        pxor    xmm0, xmm3              // t^6 and t^10 contribs
        pxor    xmm0, xmm1              // mix them together and we're done
.endm
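
// The four-multiplication split used above, checked in Python
// (illustrative only; plain monotonic bit order, `clmul' as before):
//
//      def mul96_parts(u, v):          # u, v: 96-bit polynomials
//          W = (1 << 32) - 1
//          u0, u1, u2 = u & W, (u >> 32) & W, u >> 64
//          v0, v1, v2 = v & W, (v >> 32) & W, v >> 64
//          f = clmul(u0 | (u1 << 32), v0 | (v1 << 32))
//          e = clmul(u2, v0 | (v1 << 32)) ^ clmul(v2, u0 | (u1 << 32))
//          d = clmul(u2, v2)
//          z = f ^ (e << 64) ^ (d << 128)
//          assert z == clmul(u, v)     # z = d t^128 + e t^64 + f
//          return z
//
// The reduction then folds t^96 = t^10 + t^9 + t^6 + 1, just as the
// other sizes fold their own polynomials.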

.macro mul192
        // Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave
        // with z = u v in xmm0/xmm1 -- the top halves of the high registers
        // are unimportant.  Clobbers xmm2--xmm7.

        // Start multiplying and accumulating pieces of product.
        // xmm0 =                       // (u_2; u_1)
        // xmm1 =                       // (u_0; ?)
        // xmm2 =                       // (v_2; v_1)
        // xmm3 =                       // (v_0; ?)
        movdqa  xmm4, xmm0              // (u_2; u_1) again
        movdqa  xmm5, xmm0              // (u_2; u_1) yet again
        movdqa  xmm6, xmm0              // (u_2; u_1) again again
        movdqa  xmm7, xmm1              // (u_0; ?) again
        punpcklqdq xmm1, xmm3           // (u_0; v_0)
        pclmulhqhqdq xmm4, xmm2         // u_1 v_1
        pclmullqlqdq xmm3, xmm0         // u_2 v_0
        pclmullqhqdq xmm5, xmm2         // u_2 v_1
        pclmulhqlqdq xmm6, xmm2         // u_1 v_2
        pxor    xmm4, xmm3              // u_2 v_0 + u_1 v_1
        pclmullqlqdq xmm7, xmm2         // u_0 v_2
        pxor    xmm5, xmm6              // b = u_2 v_1 + u_1 v_2
        movdqa  xmm6, xmm0              // (u_2; u_1) like a bad penny
        pxor    xmm4, xmm7              // c = u_0 v_2 + u_1 v_1 + u_2 v_0
        pclmullqlqdq xmm0, xmm2         // a = u_2 v_2
        pclmulhqhqdq xmm6, xmm1         // u_1 v_0
        pclmulhqlqdq xmm2, xmm1         // u_0 v_1
        pclmullqhqdq xmm1, xmm1         // e = u_0 v_0
        pxor    xmm2, xmm6              // d = u_1 v_0 + u_0 v_1

        // Next, the piecing together of the product.
        // xmm0 =                       // (a_1; a_0) = a = u_2 v_2
        // xmm5 =                       // (b_1; b_0) = b = u_1 v_2 + u_2 v_1
        // xmm4 =                       // (c_1; c_0) = c = u_0 v_2 +
        //                              //      u_1 v_1 + u_2 v_0
        // xmm2 =                       // (d_1; d_0) = d = u_0 v_1 + u_1 v_0
        // xmm1 =                       // (e_1; e_0) = e = u_0 v_0
        // xmm3, xmm6, xmm7 spare
        movdqa  xmm3, xmm2              // (d_1; d_0) again
        movdqa  xmm6, xmm5              // (b_1; b_0) again
        pslldq  xmm2, 8                 // (0; d_1)
        psrldq  xmm5, 8                 // (b_0; 0)
        psrldq  xmm3, 8                 // (d_0; 0)
        pslldq  xmm6, 8                 // (0; b_1)
        pxor    xmm5, xmm2              // (b_0; d_1)
        pxor    xmm0, xmm6              // x_2 = (a_1; a_0 + b_1)
        pxor    xmm3, xmm1              // x_0 = (e_1 + d_0; e_0)
        pxor    xmm4, xmm5              // x_1 = (b_0 + c_1; c_0 + d_1)

        // Now, shift it right (from GCM's point of view) by one bit, and try
        // to leave the result in less random registers.  After this, we'll
        // be in GCM bizarro-world.
        // xmm1, xmm2, xmm5, xmm6, xmm7 spare
        movdqa  xmm5, xmm0              // copy x_2
        movdqa  xmm1, xmm4              // copy x_1
        movdqa  xmm2, xmm3              // copy x_0
        psrld   xmm0, 31                // x_2 carries
        psrld   xmm4, 31                // x_1 carries
        psrld   xmm3, 31                // x_0 carries
        pslld   xmm5, 1                 // x_2 shifted
        pslld   xmm1, 1                 // x_1 shifted
        pslld   xmm2, 1                 // x_0 shifted
        movdqa  xmm6, xmm0              // x_2 carry copy
        movdqa  xmm7, xmm4              // x_1 carry copy
        pslldq  xmm0, 4                 // x_2 carry shifted
        pslldq  xmm4, 4                 // x_1 carry shifted
        pslldq  xmm3, 4                 // x_0 carry shifted
        psrldq  xmm6, 12                // x_2 carry out
        psrldq  xmm7, 12                // x_1 carry out
        por     xmm0, xmm5              // (y_5; y_4)
        por     xmm1, xmm4
        por     xmm2, xmm3
        por     xmm1, xmm6              // (y_3; y_2)
        por     xmm2, xmm7              // (y_1; y_0)

        // Next, the reduction.  Our polynomial this time is p(t) = t^192 +
        // t^7 + t^2 + t + 1.  Yes, the magic numbers are the same as the
        // 128-bit case.  I don't know why.

        // First, shift the high bits down.
        // xmm0 =                       // (y_5; y_4)
        // xmm1 =                       // (y_3; y_2)
        // xmm2 =                       // (y_1; y_0)
        // xmm3--xmm7 spare
        movdqa  xmm3, xmm0              // (y_5; y_4) copy
        movdqa  xmm4, xmm0              // (y_5; y_4) copy
        movdqa  xmm5, xmm0              // (y_5; y_4) copy
        pslld   xmm3, 31                // (y_5; y_4) b_i for t
        pslld   xmm4, 30                // (y_5; y_4) b_i for t^2
        pslld   xmm5, 25                // (y_5; y_4) b_i for t^7
        movq    xmm6, xmm1              // (y_3; 0) copy
        pxor    xmm3, xmm4
        movq    xmm7, xmm1              // (y_3; 0) copy
        pxor    xmm3, xmm5
        movq    xmm5, xmm1              // (y_3; 0) copy
        movdqa  xmm4, xmm3              // (y_5; y_4) b_i combined
        pslld   xmm6, 31                // (y_3; 0) b_i for t
        pslld   xmm7, 30                // (y_3; 0) b_i for t^2
        pslld   xmm5, 25                // (y_3; 0) b_i for t^7
        psrldq  xmm3, 12                // (y_5; y_4) low contrib
        pslldq  xmm4, 4                 // (y_5; y_4) high contrib
        pxor    xmm6, xmm7
        pxor    xmm2, xmm3
        pxor    xmm6, xmm5
        pxor    xmm1, xmm4
        pslldq  xmm6, 4
        pxor    xmm2, xmm6

        // And finally shift the low bits up.  Unfortunately, we also have to
        // split the low bits out.
        // xmm0 =                       // (y'_5; y'_4)
        // xmm1 =                       // (y'_3; y'_2)
        // xmm2 =                       // (y'_1; y'_0)
        movdqa  xmm5, xmm1              // copies of (y'_3; y'_2)
        movdqa  xmm6, xmm1
        movdqa  xmm7, xmm1
        psrldq  xmm1, 8                 // bring down (y'_2; ?)
        movdqa  xmm3, xmm0              // copies of (y'_5; y'_4)
        movdqa  xmm4, xmm0
        punpcklqdq xmm1, xmm2           // (y'_2; y'_1)
        psrldq  xmm2, 8                 // (y'_0; ?)
        pxor    xmm2, xmm5              // low half and unit contrib
        pxor    xmm1, xmm0
        psrld   xmm5, 1
        psrld   xmm0, 1
        psrld   xmm6, 2
        psrld   xmm3, 2
        psrld   xmm7, 7
        psrld   xmm4, 7
        pxor    xmm2, xmm6              // low half, unit, t^2 contribs
        pxor    xmm1, xmm3
        pxor    xmm5, xmm7              // t and t^7 contribs
        pxor    xmm0, xmm4
        pxor    xmm5, xmm2              // mix everything together
        pxor    xmm0, xmm1
        movq    xmm1, xmm5              // shunt (z_0; ?) into proper place
.endm
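
// In model form (illustrative only; plain bit order, `clmul' as
// before), this is a full 3x3 schoolbook multiplication -- nine
// clmuls -- followed by the fold t^192 = t^7 + t^2 + t + 1:
//
//      def mul192_model(u, v):
//          L = (1 << 64) - 1
//          us = [(u >> 64*i) & L for i in range(3)]
//          vs = [(v >> 64*i) & L for i in range(3)]
//          z = 0
//          for i in range(3):
//              for j in range(3):
//                  z ^= clmul(us[i], vs[j]) << 64*(i + j)
//          for _ in range(2):
//              hi, z = z >> 192, z & ((1 << 192) - 1)
//              z ^= hi ^ (hi << 1) ^ (hi << 2) ^ (hi << 7)
//          return z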

.macro mul256
        // Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave
        // with z = u v in xmm0/xmm1.  Clobbers xmm2--xmm7.  On 32-bit x86,
        // requires 16 bytes aligned space at SP; on amd64, also clobbers
        // xmm8.

        // Now it's starting to look worthwhile to do Karatsuba.  Suppose
        // u = u_0 + u_1 B and v = v_0 + v_1 B.  Then
        //
        //      u v = (u_1 v_1) B^2 + (u_0 v_1 + u_1 v_0) B + (u_0 v_0)
        //
        // Call the coefficients of B^2, B, and 1 a, b, and c respectively,
        // and let r = u_0 + u_1 and s = v_0 + v_1.  Then observe that
        //
        //      q = r s = (u_0 + u_1) (v_0 + v_1)
        //          = (u_0 v_0) + (u_1 v_1) + (u_0 v_1 + u_1 v_0)
        //          = c + a + b
        //
        // The first two terms we must calculate anyway; the last is the
        // remaining one we want, and we recover it as b = q + a + c.  We'll
        // set B = t^128.  We know how to do 128-bit multiplications already,
        // and Karatsuba is too annoying there, so there'll be 12
        // multiplications altogether, rather than the 16 we'd have if we did
        // this the naïve way.
        //
        // On x86, there aren't quite enough registers, so spill one for a
        // bit.  On AMD64, we can keep on going, so it's all good.
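        //
        // In model form (a Python sketch, illustrative only, with
        // `clmul' as in the sketches above), one Karatsuba level looks
        // like this:
        //
        //      def karatsuba256(u, v):
        //          M = (1 << 128) - 1
        //          u0, u1 = u & M, u >> 128
        //          v0, v1 = v & M, v >> 128
        //          a = clmul(u1, v1)           # high outer product
        //          c = clmul(u0, v0)           # low outer product
        //          q = clmul(u0 ^ u1, v0 ^ v1) # product of sums
        //          b = q ^ a ^ c               # cross term, for free
        //          z = c ^ (b << 128) ^ (a << 256)
        //          assert z == clmul(u, v)
        //          return z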

        // xmm0 =                       // u_1 = (u_11; u_10)
        // xmm1 =                       // u_0 = (u_01; u_00)
        // xmm2 =                       // v_1 = (v_11; v_10)
        // xmm3 =                       // v_0 = (v_01; v_00)
        movdqa  xmm4, xmm0              // u_1 again
#if CPUFAM_X86
        movdqa  [esp + 0], xmm3
#elif CPUFAM_AMD64
        movdqa  xmm8, xmm3
# define V0 xmm8
#endif
        pxor    xmm4, xmm1              // u_* = (u_01 + u_11; u_00 + u_10)
        pxor    xmm3, xmm2              // v_* = (v_01 + v_11; v_00 + v_10)

        // Start by building the cross product, q = u_* v_*.
        movdqa  xmm7, xmm4              // more copies of u_*
        movdqa  xmm5, xmm4
        movdqa  xmm6, xmm4
        pclmullqhqdq xmm4, xmm3         // u_*1 v_*0
        pclmulhqlqdq xmm7, xmm3         // u_*0 v_*1
        pclmullqlqdq xmm5, xmm3         // u_*1 v_*1
        pclmulhqhqdq xmm6, xmm3         // u_*0 v_*0
        pxor    xmm4, xmm7              // u_*1 v_*0 + u_*0 v_*1
        movdqa  xmm7, xmm4
        pslldq  xmm4, 8
        psrldq  xmm7, 8
        pxor    xmm5, xmm4              // q_1
        pxor    xmm6, xmm7              // q_0

        // Next, work on the high half, a = u_1 v_1.
        movdqa  xmm3, xmm0              // more copies of u_1
        movdqa  xmm4, xmm0
        movdqa  xmm7, xmm0
        pclmullqhqdq xmm0, xmm2         // u_11 v_10
        pclmulhqlqdq xmm3, xmm2         // u_10 v_11
        pclmullqlqdq xmm4, xmm2         // u_11 v_11
        pclmulhqhqdq xmm7, xmm2         // u_10 v_10
#if CPUFAM_X86
        movdqa  xmm2, [esp + 0]
# define V0 xmm2
#endif
        pxor    xmm0, xmm3              // u_10 v_11 + u_11 v_10
        movdqa  xmm3, xmm0
        pslldq  xmm0, 8
        psrldq  xmm3, 8
        pxor    xmm4, xmm0              // x_3 = a_1
        pxor    xmm7, xmm3              // a_0

        // Mix that into the product now forming in xmm4--xmm7.
        pxor    xmm5, xmm4              // a_1 + q_1
        pxor    xmm6, xmm7              // a_0 + q_0
        pxor    xmm5, xmm7              // a_0 + (a_1 + q_1)

        // Finally, the low half, c = u_0 v_0.
        movdqa  xmm0, xmm1              // more copies of u_0
        movdqa  xmm3, xmm1
        movdqa  xmm7, xmm1
        pclmullqhqdq xmm1, V0           // u_01 v_00
        pclmulhqlqdq xmm0, V0           // u_00 v_01
        pclmullqlqdq xmm3, V0           // u_01 v_01
        pclmulhqhqdq xmm7, V0           // u_00 v_00
        pxor    xmm0, xmm1              // u_01 v_00 + u_00 v_01
        movdqa  xmm1, xmm0
        pslldq  xmm0, 8
        psrldq  xmm1, 8
        pxor    xmm3, xmm0              // c_1
        pxor    xmm7, xmm1              // x_0 = c_0

        // And mix that in to complete the product.
        pxor    xmm6, xmm3              // (a_0 + q_0) + c_1
        pxor    xmm5, xmm3              // x_2 = a_0 + (a_1 + c_1 + q_1) = a_0 + b_1
        pxor    xmm6, xmm7              // x_1 = (a_0 + c_0 + q_0) + c_1 = b_0 + c_1

#undef V0

        // Now we need to shift that whole lot one bit to the left.  This
        // will also give us an opportunity to put the product back in
        // xmm0--xmm3.  This is a slightly merry dance because it's nearly
        // pipelined but we don't have enough registers.
        //
        // After this, we'll be in GCM bizarro-world.
        movdqa  xmm0, xmm4              // x_3 again
        psrld   xmm4, 31                // x_3 carries
        pslld   xmm0, 1                 // x_3 shifted left
        movdqa  xmm3, xmm4              // x_3 copy carries
        movdqa  xmm1, xmm5              // x_2 again
        pslldq  xmm4, 4                 // x_3 carries shifted up
        psrld   xmm5, 31                // x_2 carries
        psrldq  xmm3, 12                // x_3 big carry out
        pslld   xmm1, 1                 // x_2 shifted left
        por     xmm0, xmm4              // x_3 mixed together
        movdqa  xmm4, xmm5              // x_2 copy carries
        movdqa  xmm2, xmm6              // x_1 again
        pslldq  xmm5, 4                 // x_2 carries shifted up
        psrld   xmm6, 31                // x_1 carries
        psrldq  xmm4, 12                // x_2 big carry out
        pslld   xmm2, 1                 // x_1 shifted
        por     xmm1, xmm5              // x_2 mixed together
        movdqa  xmm5, xmm6              // x_1 copy carries
        por     xmm1, xmm3              // x_2 with carry from x_3
        movdqa  xmm3, xmm7              // x_0 again
        pslldq  xmm6, 4                 // x_1 carries shifted up
        psrld   xmm7, 31                // x_0 carries
        psrldq  xmm5, 12                // x_1 big carry out
        pslld   xmm3, 1                 // x_0 shifted
        por     xmm2, xmm6              // x_1 mixed together
        pslldq  xmm7, 4                 // x_0 carries shifted up
        por     xmm2, xmm4              // x_1 with carry from x_2
        por     xmm3, xmm7              // x_0 mixed together
        por     xmm3, xmm5              // x_0 with carry from x_1

        // Now we must reduce.  This is essentially the same as the 128-bit
        // case above, but more complicated because everything is bigger.
        // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.

        // First, shift the high bits down.
        movdqa  xmm4, xmm0              // y_3 again
        movdqa  xmm5, xmm0              // y_3 yet again
        movdqa  xmm6, xmm0              // y_3 again again
        pslld   xmm4, 30                // y_3: b_i for t^2
        pslld   xmm5, 27                // y_3: b_i for t^5
        pslld   xmm6, 22                // y_3: b_i for t^10
        movdqa  xmm7, xmm1              // y_2 again
        pxor    xmm4, xmm5
        movdqa  xmm5, xmm1              // y_2 again
        pxor    xmm4, xmm6
        movdqa  xmm6, xmm1              // y_2 again
        pslld   xmm7, 30                // y_2: b_i for t^2
        pslld   xmm5, 27                // y_2: b_i for t^5
        pslld   xmm6, 22                // y_2: b_i for t^10
        pxor    xmm7, xmm5
        movdqa  xmm5, xmm4
        pxor    xmm7, xmm6
        psrldq  xmm4, 4
        movdqa  xmm6, xmm7
        pslldq  xmm5, 12
        psrldq  xmm7, 4
        pxor    xmm2, xmm4
        pslldq  xmm6, 12
        pxor    xmm3, xmm7
        pxor    xmm1, xmm5
        pxor    xmm2, xmm6

        // And then shift the low bits up.
        movdqa  xmm4, xmm0              // y_3 again
        movdqa  xmm5, xmm1              // y_2 again
        movdqa  xmm6, xmm0              // y_3 yet again
        movdqa  xmm7, xmm1              // y_2 yet again
        pxor    xmm2, xmm0              // y_1 and unit contrib from y_3
        pxor    xmm3, xmm1              // y_0 and unit contrib from y_2
        psrld   xmm0, 2
        psrld   xmm1, 2
        psrld   xmm4, 5
        psrld   xmm5, 5
        psrld   xmm6, 10
        psrld   xmm7, 10
        pxor    xmm0, xmm2              // y_1, with y_3 units and t^2
        pxor    xmm1, xmm3              // y_0, with y_2 units and t^2
        pxor    xmm4, xmm6              // y_3 t^5 and t^10 contribs
        pxor    xmm5, xmm7              // y_2 t^5 and t^10 contribs
        pxor    xmm0, xmm4              // high half of reduced result
        pxor    xmm1, xmm5              // low half; all done
.endm
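
// In model form, the reduction is the usual fold, this time with
// t^256 = t^10 + t^5 + t^2 + 1 (a Python fragment, illustrative only,
// with z the 511-bit product in plain bit order):
//
//      for _ in range(2):
//          hi, z = z >> 256, z & ((1 << 256) - 1)
//          z ^= hi ^ (hi << 2) ^ (hi << 5) ^ (hi << 10)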

///--------------------------------------------------------------------------
/// Main code.

// There are a number of representations of field elements in this code and
// it can be confusing.
//
//   * The `external format' consists of a sequence of contiguous bytes in
//     memory called a `block'.  The GCM spec explains how to interpret this
//     block as an element of a finite field.  As discussed extensively, this
//     representation is very annoying for a number of reasons.  On the other
//     hand, this code never actually deals with it directly.
//
//   * The `register format' consists of one or more XMM registers, depending
//     on the block size.  The bytes in these registers are in reverse order
//     -- so the least-significant byte of the lowest-numbered register holds
//     the /last/ byte in the block.  If the block size is not a multiple of
//     16 bytes, then there must be padding.  96-bit blocks are weird: the
//     padding is inserted at the /least/ significant end, so the register
//     holds (0, x_2; x_1, x_0); otherwise, the padding goes at the most
//     significant end.
//
//   * The `words' format consists of a sequence of bytes, as in the
//     `external format', but, according to the blockcipher in use, the bytes
//     within each 32-bit word may be reversed (`big-endian') or not
//     (`little-endian').  Accordingly, there are separate entry points for
//     each variant, identified with `b' or `l'.
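
// In model form (a Python sketch, illustrative only), the journey from
// stored block to register contents looks like this:
//
//      def register_bytes(mem, variant):
//          """Bytes an XMM register ends up holding, LSB first."""
//          if variant == 'l':          # words format == external format
//              ext = mem
//          else:                       # `b': words stored byte-reversed
//              ext = b''.join(mem[i:i + 4][::-1]
//                             for i in range(0, len(mem), 4))
//          reg = ext[::-1]             # register format: reverse the lot
//          if len(mem) == 12:          # 96 bits: pad at the /least/
//              reg = b'\x00'*4 + reg   #   significant end
//          return reg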

#define SSEFUNC(f)                                                      \
        FUNC(f##_avx); vzeroupper; endprologue; ENDFUNC;                \
        FUNC(f)
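
// Each SSEFUNC gives two entry points: the `_avx' variant issues
// `vzeroupper' -- presumably to dodge AVX-to-SSE transition penalties
// on cores that charge for them -- and then simply falls through into
// the plain SSE version below it.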

SSEFUNC(gcm_mulk_128b_x86ish_pclmul)
        // On entry, A points to a 128-bit field element in big-endian words
        // format; K points to a field-element in register format.  On exit,
        // A is updated with the product A K.

#if CPUFAM_X86
        mov     A, [esp + 4]
        mov     K, [esp + 8]
#endif
        endprologue
        movdqu  xmm0, [A]
        movdqu  xmm1, [K]
        pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
        mul128
        pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
        movdqu  [A], xmm0
        ret
ENDFUNC

SSEFUNC(gcm_mulk_128l_x86ish_pclmul)
        // On entry, A points to a 128-bit field element in little-endian
        // words format; K points to a field-element in register format.  On
        // exit, A is updated with the product A K.

#if CPUFAM_X86
        mov     A, [esp + 4]
        mov     K, [esp + 8]
        ldgot   ecx
#endif
        endprologue
        movdqa  xmm7, [INTADDR(swaptab_128l, ecx)]
        movdqu  xmm0, [A]
        movdqu  xmm1, [K]
        pshufb  xmm0, xmm7
        mul128
        pshufb  xmm0, xmm7
        movdqu  [A], xmm0
        ret
ENDFUNC

SSEFUNC(gcm_mulk_64b_x86ish_pclmul)
        // On entry, A points to a 64-bit field element in big-endian words
        // format; K points to a field-element in register format.  On exit,
        // A is updated with the product A K.

#if CPUFAM_X86
        mov     A, [esp + 4]
        mov     K, [esp + 8]
#endif
        endprologue
        movq    xmm0, [A]
        movq    xmm1, [K]
        pshufd  xmm0, xmm0, SHUF(1, 0, 3, 3)
        mul64
        pshufd  xmm0, xmm0, SHUF(1, 0, 3, 3)
        movq    [A], xmm0
        ret
ENDFUNC

SSEFUNC(gcm_mulk_64l_x86ish_pclmul)
        // On entry, A points to a 64-bit field element in little-endian
        // words format; K points to a field-element in register format.  On
        // exit, A is updated with the product A K.

#if CPUFAM_X86
        mov     A, [esp + 4]
        mov     K, [esp + 8]
        ldgot   ecx
#endif
        endprologue
        movdqa  xmm7, [INTADDR(swaptab_64l, ecx)]
        movq    xmm0, [A]
        movq    xmm1, [K]
        pshufb  xmm0, xmm7
        mul64
        pshufb  xmm0, xmm7
        movq    [A], xmm0
        ret
ENDFUNC

SSEFUNC(gcm_mulk_96b_x86ish_pclmul)
        // On entry, A points to a 96-bit field element in big-endian words
        // format; K points to a field-element in register format (i.e., 16
        // bytes, with the first four bytes zero).  On exit, A is updated
        // with the product A K.

#if CPUFAM_X86
        mov     A, [esp + 4]
        mov     K, [esp + 8]
#endif
        endprologue
        movq    xmm0, [A + 0]
        movd    xmm2, [A + 8]
        movdqu  xmm1, [K]
        punpcklqdq xmm0, xmm2
        pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
        mul96
        pshufd  xmm1, xmm0, SHUF(3, 2, 1, 0)
        psrldq  xmm0, 4
        movq    [A + 0], xmm1
        movd    [A + 8], xmm0
        ret
ENDFUNC

SSEFUNC(gcm_mulk_96l_x86ish_pclmul)
        // On entry, A points to a 96-bit field element in little-endian
        // words format; K points to a field-element in register format
        // (i.e., 16 bytes, with the first four bytes zero).  On exit, A is
        // updated with the product A K.

#if CPUFAM_X86
        mov     A, [esp + 4]
        mov     K, [esp + 8]
        ldgot   ecx
#endif
        endprologue
        movdqa  xmm7, [INTADDR(swaptab_128l, ecx)]
        movq    xmm0, [A + 0]
        movd    xmm2, [A + 8]
        movdqu  xmm1, [K]
        punpcklqdq xmm0, xmm2
        pshufb  xmm0, xmm7
        mul96
        pshufb  xmm0, xmm7
        movq    [A + 0], xmm0
        psrldq  xmm0, 8
        movd    [A + 8], xmm0
        ret
ENDFUNC

SSEFUNC(gcm_mulk_192b_x86ish_pclmul)
        // On entry, A points to a 192-bit field element in big-endian words
        // format; K points to a field-element in register format.  On exit,
        // A is updated with the product A K.

#if CPUFAM_X86
        mov     A, [esp + 4]
        mov     K, [esp + 8]
#endif
#if CPUFAM_AMD64 && ABI_WIN
        stalloc 2*16 + 8
        savexmm xmm6, 0
        savexmm xmm7, 16
#endif
        endprologue
        movdqu  xmm0, [A + 8]
        movq    xmm1, [A + 0]
        movdqu  xmm2, [K + 0]
        movq    xmm3, [K + 16]
        pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
        pshufd  xmm1, xmm1, SHUF(1, 0, 3, 3)
        mul192
        pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
        pshufd  xmm1, xmm1, SHUF(1, 0, 3, 3)
        movdqu  [A + 8], xmm0
        movq    [A + 0], xmm1
#if CPUFAM_AMD64 && ABI_WIN
        rstrxmm xmm6, 0
        rstrxmm xmm7, 16
        stfree  2*16 + 8
#endif
        ret
ENDFUNC

SSEFUNC(gcm_mulk_192l_x86ish_pclmul)
        // On entry, A points to a 192-bit field element in little-endian
        // words format; K points to a field-element in register format.  On
        // exit, A is updated with the product A K.

#if CPUFAM_X86
        mov     A, [esp + 4]
        mov     K, [esp + 8]
        ldgot   ecx
#endif
#if CPUFAM_AMD64 && ABI_WIN
        stalloc 2*16 + 8
        savexmm xmm6, 0
        savexmm xmm7, 16
#endif
        endprologue
        movdqu  xmm0, [A + 8]
        movq    xmm1, [A + 0]
        movdqu  xmm2, [K + 0]
        movq    xmm3, [K + 16]
        pshufb  xmm0, [INTADDR(swaptab_128l, ecx)]
        pshufb  xmm1, [INTADDR(swaptab_64l, ecx)]
        mul192
        pshufb  xmm0, [INTADDR(swaptab_128l, ecx)]
        pshufb  xmm1, [INTADDR(swaptab_64l, ecx)]
        movdqu  [A + 8], xmm0
        movq    [A + 0], xmm1
#if CPUFAM_AMD64 && ABI_WIN
        rstrxmm xmm6, 0
        rstrxmm xmm7, 16
        stfree  2*16 + 8
#endif
        ret
ENDFUNC

SSEFUNC(gcm_mulk_256b_x86ish_pclmul)
        // On entry, A points to a 256-bit field element in big-endian words
        // format; K points to a field-element in register format.  On exit,
        // A is updated with the product A K.

#if CPUFAM_X86
        pushreg ebp
        setfp
        mov     A, [esp + 8]
        mov     K, [esp + 12]
        and     esp, ~15
        sub     esp, 16
#endif
#if CPUFAM_AMD64 && ABI_WIN
        stalloc 3*16 + 8
        savexmm xmm6, 0
        savexmm xmm7, 16
        savexmm xmm8, 32
#endif
        endprologue
        movdqu  xmm0, [A + 16]
        movdqu  xmm1, [A + 0]
        movdqu  xmm2, [K + 0]
        movdqu  xmm3, [K + 16]
        pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
        pshufd  xmm1, xmm1, SHUF(3, 2, 1, 0)
        mul256
        pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
        pshufd  xmm1, xmm1, SHUF(3, 2, 1, 0)
        movdqu  [A + 16], xmm0
        movdqu  [A + 0], xmm1
#if CPUFAM_X86
        dropfp
        popreg  ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
        rstrxmm xmm6, 0
        rstrxmm xmm7, 16
        rstrxmm xmm8, 32
        stfree  3*16 + 8
#endif
        ret
ENDFUNC

SSEFUNC(gcm_mulk_256l_x86ish_pclmul)
        // On entry, A points to a 256-bit field element in little-endian
        // words format; K points to a field-element in register format.  On
        // exit, A is updated with the product A K.

#if CPUFAM_X86
        pushreg ebp
        setfp
        mov     A, [esp + 8]
        mov     K, [esp + 12]
        and     esp, ~15
        ldgot   ecx
        sub     esp, 16
#endif
#if CPUFAM_AMD64 && ABI_WIN
        stalloc 3*16 + 8
        savexmm xmm6, 0
        savexmm xmm7, 16
        savexmm xmm8, 32
#endif
        endprologue
        movdqa  xmm7, [INTADDR(swaptab_128l, ecx)]
        movdqu  xmm0, [A + 16]
        movdqu  xmm1, [A + 0]
        movdqu  xmm2, [K + 0]
        movdqu  xmm3, [K + 16]
        pshufb  xmm0, xmm7
        pshufb  xmm1, xmm7
        mul256
        movdqa  xmm7, [INTADDR(swaptab_128l, ecx)]
        pshufb  xmm0, xmm7
        pshufb  xmm1, xmm7
        movdqu  [A + 16], xmm0
        movdqu  [A + 0], xmm1
#if CPUFAM_X86
        dropfp
        popreg  ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
        rstrxmm xmm6, 0
        rstrxmm xmm7, 16
        rstrxmm xmm8, 32
        stfree  3*16 + 8
#endif
        ret
ENDFUNC

        RODATA

        .balign 16
swaptab_128l:
        // Table for byte-swapping little-endian words-format blocks larger
        // than 64 bits.
        .byte   15, 14, 13, 12, 11, 10,  9,  8
        .byte    7,  6,  5,  4,  3,  2,  1,  0

        .balign 16
swaptab_64l:
        // Table for byte-swapping 64-bit little-endian words-format blocks.
        .byte    7,  6,  5,  4,  3,  2,  1,  0
        .byte   255, 255, 255, 255, 255, 255, 255, 255
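
// For reference (a Python one-liner, illustrative only), PSHUFB reads
// these tables like so -- an index with its top bit set selects zero,
// which is why `swaptab_64l' pads with 255s:
//
//      def pshufb(x, tab):             # x, tab: 16-byte sequences
//          return bytes(0 if t & 0x80 else x[t & 15] for t in tab)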

///----- That's all, folks --------------------------------------------------