Commit | Line | Data |
---|---|---|
9e6a4409 MW |
1 | /// -*- mode: asm; asm-comment-char: ?/ -*- |
2 | /// | |
3 | /// GCM acceleration for x86 processors | |
4 | /// | |
5 | /// (c) 2018 Straylight/Edgeware | |
6 | /// | |
7 | ||
8 | ///----- Licensing notice --------------------------------------------------- | |
9 | /// | |
10 | /// This file is part of Catacomb. | |
11 | /// | |
12 | /// Catacomb is free software: you can redistribute it and/or modify it | |
13 | /// under the terms of the GNU Library General Public License as published | |
14 | /// by the Free Software Foundation; either version 2 of the License, or | |
15 | /// (at your option) any later version. | |
16 | /// | |
17 | /// Catacomb is distributed in the hope that it will be useful, but | |
18 | /// WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
20 | /// Library General Public License for more details. | |
21 | /// | |
22 | /// You should have received a copy of the GNU Library General Public | |
23 | /// License along with Catacomb. If not, write to the Free Software | |
24 | /// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, | |
25 | /// USA. | |
26 | ||
27 | ///-------------------------------------------------------------------------- | |
28 | /// Preliminaries. | |
29 | ||
30 | #include "config.h" | |
31 | #include "asm-common.h" | |
32 | ||
33 | .arch .pclmul | |
34 | ||
35 | .text | |
36 | ||
37 | ///-------------------------------------------------------------------------- | |
38 | /// Common register allocation. | |
39 | ||
40 | #if CPUFAM_X86 | |
41 | # define A eax | |
42 | # define K edx | |
43 | #elif CPUFAM_AMD64 && ABI_SYSV | |
44 | # define A rdi | |
45 | # define K rsi | |
46 | #elif CPUFAM_AMD64 && ABI_WIN | |
47 | # define A rcx | |
48 | # define K rdx | |
49 | #endif | |
50 | ||
51 | ///-------------------------------------------------------------------------- | |
52 | /// Multiplication macros. | |
53 | ||
54 | // The good news is that we have a fancy instruction to do the | |
55 | // multiplications. The bad news is that it's not particularly well- | |
56 | // suited to the job. | |
57 | // | |
58 | // For one thing, it only does a 64-bit multiplication, so in general | |
59 | // we'll need to synthesize the full-width multiply by hand. For | |
60 | // another thing, it doesn't help with the reduction, so we have to | |
61 | // do that by hand too. And, finally, GCM has crazy bit ordering, | |
62 | // and the instruction does nothing useful for that at all. | |
63 | // | |
64 | // Focusing on that last problem first: the bits aren't in monotonic | |
65 | // significance order unless we permute them. If we reverse the byte | |
66 | // order, then we'll have the bits in monotonic order, but backwards, | |
67 | // so the degree-0 coefficient will be in the most-significant bit. | |
68 | // | |
69 | // This is less of a difficulty than it seems at first, because | |
70 | // algebra. Suppose we are given u = SUM_{0<=i<n} u_i t^i and v = | |
71 | // SUM_{0<=j<n} v_j t^j; then | |
72 | // | |
73 | // u v = SUM_{0<=i,j<n} u_i v_j t^{i+j} | |
74 | // | |
75 | // Suppose instead that we're given ũ = SUM_{0<=i<n} u_{n-i-1} t^i | |
d7c0f9a7 | 76 | // and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards. |
9e6a4409 MW |
77 | // Then |
78 | // | |
d7c0f9a7 | 79 | // ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j} |
9e6a4409 MW |
80 | // = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)} |
81 | // | |
82 | // which is almost the bit-reversal of u v, only it's shifted right | |
ac4a43c1 MW |
83 | // by one place. Putting this another way, what we have is actually |
84 | // the bit reversal of the product u v t. We could get the correct | |
85 | // answer (modulo p(t)) if we'd sneakily divided one of the operands | |
86 | // by t before we started. Conveniently, v is actually the secret | |
87 | // value k set up by the GCM `mktable' function, so we can arrange to | |
88 | // actually store k/t (mod p(t)) and then the product will come out | |
89 | // correct (modulo p(t)) and we won't have anything more to worry | |
90 | // about here. | |
9e6a4409 MW |
91 | // |
92 | // That was important to think about, but there's not a great deal to | |
93 | // do about it yet other than to convert what we've got from the | |
94 | // blockcipher's byte-ordering convention to our big-endian | |
95 | // convention. Since this depends on the blockcipher convention, | |
96 | // we'll leave the caller to cope with this: the macros here will | |
97 | // assume that the operands are in `register' format, which is the | |
98 | // byte-reversal of the external representation, padded at the | |
99 | // most-significant end except for 96-bit blocks, which are | |
100 | // zero-padded at the least-significant end (see `mul96' for the | |
101 | // details). In the commentary, pieces of polynomial are numbered | |
102 | // according to the degree of the coefficients, so the unit | |
103 | // coefficient of some polynomial a is in a_0. | |
104 | // | |
105 | // The commentary for `mul128' is the most detailed. The other | |
106 | // macros assume that you've already read and understood that. | |
107 | ||
108 | .macro mul128 | |
109 | // Enter with u and v in xmm0 and xmm1 respectively; leave with z = | |
110 | // u v in xmm0. Clobbers xmm1--xmm4. | |
111 | ||
112 | // First for the double-precision multiplication. It's tempting to | |
113 | // use Karatsuba's identity here, but I suspect that loses more in | |
114 | // the shifting, bit-twiddling, and dependency chains that it gains | |
115 | // in saving a multiplication which otherwise pipelines well. | |
116 | // xmm0 = // (u_1; u_0) | |
117 | // xmm1 = // (v_1; v_0) | |
118 | movdqa xmm2, xmm1 // (v_1; v_0) again | |
119 | movdqa xmm3, xmm0 // (u_1; u_0) again | |
120 | movdqa xmm4, xmm0 // (u_1; u_0) yet again | |
121 | pclmulhqlqdq xmm2, xmm0 // u_1 v_0 | |
122 | pclmullqlqdq xmm0, xmm1 // u_1 v_1 | |
123 | pclmulhqlqdq xmm3, xmm1 // u_0 v_1 | |
124 | pclmulhqhqdq xmm4, xmm1 // u_0 v_0 | |
125 | ||
126 | // Arrange the pieces to form a double-precision polynomial. | |
127 | pxor xmm2, xmm3 // (m_1; m_0) = u_1 v_0 + u_0 v_1 | |
128 | movdqa xmm1, xmm2 // (m_1; m_0) again | |
129 | pslldq xmm2, 8 // (0; m_1) | |
130 | psrldq xmm1, 8 // (m_0; 0) | |
ac4a43c1 MW |
131 | pxor xmm0, xmm2 // z_1 = u_1 v_1 + m_1 |
132 | pxor xmm1, xmm4 // z_0 = u_0 v_0 + t^64 m_0 | |
133 | ||
134 | // The remaining problem is that the result needs to be reduced | |
9e6a4409 MW |
135 | // modulo p(t) = t^128 + t^7 + t^2 + t + 1. Let R = t^128 = t^7 + |
136 | // t^2 + t + 1 in our field. So far, we've calculated z_0 and z_1 | |
137 | // such that z_0 + z_1 R = u v using the identity R = t^128: now we | |
138 | // must collapse the two halves of z together using the other | |
139 | // identity R = t^7 + t^2 + t + 1. | |
140 | // | |
141 | // We do this by working on each 32-bit word of the high half of z | |
ac4a43c1 MW |
142 | // separately, so consider x_i, for some 4 <= i < 8. Certainly, x_i |
143 | // t^{32i} = x_i R t^{32(i-4)} = (t^7 + t^2 + t + 1) x_i t^{32(i-4)}, | |
9e6a4409 | 144 | // but we can't use that directly without breaking up the 32-bit word |
ac4a43c1 MW |
145 | // structure. Instead, we start by considering just x_i t^7 |
146 | // t^{32(i-4)}, which again looks tricky. Now, split x_i = a_i + | |
9e6a4409 MW |
147 | // t^25 b_i, with deg a_i < 25; then |
148 | // | |
ac4a43c1 | 149 | // x_i t^7 t^{32(i-4)} = a_i t^7 t^{32(i-4)} + b_i t^{32(i-3)} |
9e6a4409 | 150 | // |
ac4a43c1 | 151 | // We can similarly decompose x_i t^2 and x_i t into a pair of 32-bit |
9e6a4409 MW |
152 | // contributions to the t^{32(i-4)} and t^{32(i-3)} words, but the |
153 | // splits are different. This is lovely, with one small snag: when | |
ac4a43c1 | 154 | // we do this to x_7, we end up with a contribution back into the |
9e6a4409 MW |
155 | // t^128 coefficient word. But notice that only the low seven bits |
156 | // of this word are affected, so there's no knock-on contribution | |
157 | // into the t^32 word. Therefore, if we handle the high bits of each | |
158 | // word together, and then the low bits, everything will be fine. | |
159 | ||
160 | // First, shift the high bits down. | |
ac4a43c1 MW |
161 | movdqa xmm2, xmm0 // (x_7, x_6; x_5, x_4) again |
162 | movdqa xmm3, xmm0 // (x_7, x_6; x_5, x_4) yet again | |
163 | movdqa xmm4, xmm0 // (x_7, x_6; x_5, x_4) again again | |
9e6a4409 MW |
164 | pslld xmm2, 31 // the b_i for t |
165 | pslld xmm3, 30 // the b_i for t^2 | |
166 | pslld xmm4, 25 // the b_i for t^7 | |
167 | pxor xmm2, xmm3 // add them all together | |
168 | pxor xmm2, xmm4 | |
169 | movdqa xmm3, xmm2 // and a copy for later | |
170 | psrldq xmm2, 4 // contribution into low half | |
171 | pslldq xmm3, 12 // and high half | |
172 | pxor xmm1, xmm2 | |
173 | pxor xmm0, xmm3 | |
174 | ||
175 | // And then shift the low bits up. | |
176 | movdqa xmm2, xmm0 | |
177 | movdqa xmm3, xmm0 | |
178 | pxor xmm1, xmm0 // mix in the unit contribution | |
179 | psrld xmm0, 1 | |
180 | psrld xmm2, 2 | |
181 | psrld xmm3, 7 | |
182 | pxor xmm1, xmm2 // low half, unit, and t^2 contribs | |
183 | pxor xmm0, xmm3 // t and t^7 contribs | |
184 | pxor xmm0, xmm1 // mix them together and we're done | |
185 | .endm | |
186 | ||
187 | .macro mul64 | |
188 | // Enter with u and v in the low halves of xmm0 and xmm1 | |
189 | // respectively; leave with z = u v in xmm0. Clobbers xmm1--xmm4. | |
190 | ||
191 | // The multiplication is thankfully easy. | |
ac4a43c1 | 192 | pclmullqlqdq xmm1, xmm0 // u v |
9e6a4409 MW |
193 | |
194 | // Now we must reduce. This is essentially the same as the 128-bit | |
195 | // case above, but mostly simpler because everything is smaller. The | |
196 | // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1. | |
197 | ||
198 | // First, we must detach the top (`low'!) half of the result. | |
ac4a43c1 MW |
199 | movdqa xmm0, xmm1 // (x_3, x_2; x_1, x_0) again |
200 | psrldq xmm1, 8 // (x_1, x_0; 0, 0) | |
9e6a4409 MW |
201 | |
202 | // Next, shift the high bits down. | |
ac4a43c1 MW |
203 | movdqa xmm2, xmm0 // (x_3, x_2; ?, ?) again |
204 | movdqa xmm3, xmm0 // (x_3, x_2; ?, ?) yet again | |
205 | movdqa xmm4, xmm0 // (x_3, x_2; ?, ?) again again | |
9e6a4409 MW |
206 | pslld xmm2, 31 // b_i for t |
207 | pslld xmm3, 29 // b_i for t^3 | |
208 | pslld xmm4, 28 // b_i for t^4 | |
209 | pxor xmm2, xmm3 // add them all together | |
210 | pxor xmm2, xmm4 | |
211 | movdqa xmm3, xmm2 // and a copy for later | |
212 | movq xmm2, xmm2 // zap high half | |
213 | pslldq xmm3, 4 // contribution into high half | |
214 | psrldq xmm2, 4 // and low half | |
215 | pxor xmm0, xmm3 | |
216 | pxor xmm1, xmm2 | |
217 | ||
218 | // And then shift the low bits up. | |
219 | movdqa xmm2, xmm0 | |
220 | movdqa xmm3, xmm0 | |
221 | pxor xmm1, xmm0 // mix in the unit contribution | |
222 | psrld xmm0, 1 | |
223 | psrld xmm2, 3 | |
224 | psrld xmm3, 4 | |
225 | pxor xmm1, xmm2 // low half, unit, and t^3 contribs | |
226 | pxor xmm0, xmm3 // t and t^4 contribs | |
227 | pxor xmm0, xmm1 // mix them together and we're done | |
228 | .endm | |
229 | ||
230 | .macro mul96 | |
231 | // Enter with u and v in the /high/ three words of xmm0 and xmm1 | |
232 | // respectively (and zero in the low word); leave with z = u v in the | |
233 | // high three words of xmm0, and /junk/ in the low word. Clobbers | |
234 | // xmm1--xmm4. | |
235 | ||
236 | // This is an inconvenient size. There's nothing for it but to do | |
237 | // four multiplications, as if for the 128-bit case. It's possible | |
238 | // that there's cruft in the top 32 bits of the input registers, so | |
239 | // shift both of them up by four bytes before we start. This will | |
240 | // mean that the high 64 bits of the result (from GCM's viewpoint) | |
241 | // will be zero. | |
242 | // xmm0 = // (0, u_2; u_1, u_0) | |
243 | // xmm1 = // (0, v_2; v_1, v_0) | |
244 | movdqa xmm2, xmm1 // (0, v_2; v_1, v_0) again | |
245 | movdqa xmm3, xmm0 // (0, u_2; u_1, u_0) again | |
246 | movdqa xmm4, xmm0 // (0, u_2; u_1, u_0) yet again | |
247 | pclmulhqlqdq xmm2, xmm0 // u_2 (v_1 t^32 + v_0) = e_0 | |
248 | pclmullqlqdq xmm0, xmm1 // u_2 v_2 = d = (0; d) | |
249 | pclmulhqlqdq xmm3, xmm1 // v_2 (u_1 t^32 + u_0) = e_1 | |
250 | pclmulhqhqdq xmm4, xmm1 // u_0 v_0 + (u_1 v_0 + u_0 v_1) t^32 | |
251 | // + u_1 v_1 t^64 = f | |
252 | ||
253 | // Extract the high and low halves of the 192-bit result. We don't | |
254 | // need be too picky about the unused high words of the result | |
255 | // registers. The answer we want is d t^128 + e t^64 + f, where e = | |
256 | // e_0 + e_1. | |
257 | // | |
258 | // The place values for the two halves are (t^160, t^128; t^96, ?) | |
ac4a43c1 MW |
259 | // and (?, t^64; t^32, 1). But we also want to shift the high part |
260 | // left by a word, for symmetry's sake. | |
9e6a4409 MW |
261 | psrldq xmm0, 8 // (d; 0) = d t^128 |
262 | pxor xmm2, xmm3 // e = (e_0 + e_1) | |
263 | movdqa xmm1, xmm4 // f again | |
264 | pxor xmm0, xmm2 // d t^128 + e t^64 | |
265 | psrldq xmm2, 12 // e[31..0] t^64 | |
266 | psrldq xmm1, 4 // f[95..0] | |
ac4a43c1 MW |
267 | pslldq xmm4, 12 // f[127..96], shifted |
268 | pslldq xmm0, 4 // shift high 96 bits | |
9e6a4409 MW |
269 | pxor xmm1, xmm2 // low 96 bits of result |
270 | pxor xmm0, xmm4 // high 96 bits of result | |
271 | ||
9e6a4409 MW |
272 | // Finally, the reduction. This is essentially the same as the |
273 | // 128-bit case, except that the polynomial is p(t) = t^96 + t^10 + | |
274 | // t^9 + t^6 + 1. The degrees are larger but not enough to cause | |
275 | // trouble for the general approach. | |
276 | ||
277 | // First, shift the high bits down. | |
278 | movdqa xmm2, xmm0 // copies of the high part | |
279 | movdqa xmm3, xmm0 | |
280 | movdqa xmm4, xmm0 | |
281 | pslld xmm2, 26 // b_i for t^6 | |
282 | pslld xmm3, 23 // b_i for t^9 | |
283 | pslld xmm4, 22 // b_i for t^10 | |
284 | pxor xmm2, xmm3 // add them all together | |
285 | pslldq xmm1, 4 // shift low part up to match | |
286 | pxor xmm2, xmm4 | |
287 | movdqa xmm3, xmm2 // and a copy for later | |
288 | pslldq xmm2, 8 // contribution to high half | |
289 | psrldq xmm3, 4 // contribution to low half | |
290 | pxor xmm1, xmm3 | |
291 | pxor xmm0, xmm2 | |
292 | ||
293 | // And then shift the low bits up. | |
294 | movdqa xmm2, xmm0 // copies of the high part | |
295 | movdqa xmm3, xmm0 | |
296 | pxor xmm1, xmm0 // mix in the unit contribution | |
297 | psrld xmm0, 6 | |
298 | psrld xmm2, 9 | |
299 | psrld xmm3, 10 | |
300 | pxor xmm1, xmm2 // low half, unit, and t^9 contribs | |
301 | pxor xmm0, xmm3 // t^6 and t^10 contribs | |
302 | pxor xmm0, xmm1 // mix them together and we're done | |
303 | .endm | |
304 | ||
305 | .macro mul192 | |
306 | // Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave | |
307 | // with z = u v in xmm0/xmm1 -- the top halves of the high registers | |
308 | // are unimportant. Clobbers xmm2--xmm7. | |
309 | ||
310 | // Start multiplying and accumulating pieces of product. | |
311 | // xmm0 = // (u_2; u_1) | |
312 | // xmm1 = // (u_0; ?) | |
313 | // xmm2 = // (v_2; v_1) | |
314 | // xmm3 = // (v_0; ?) | |
315 | movdqa xmm4, xmm0 // (u_2; u_1) again | |
316 | movdqa xmm5, xmm0 // (u_2; u_1) yet again | |
317 | movdqa xmm6, xmm0 // (u_2; u_1) again again | |
ac4a43c1 MW |
318 | movdqa xmm7, xmm3 // (v_0; ?) again |
319 | punpcklqdq xmm3, xmm1 // (v_0; u_0) | |
9e6a4409 | 320 | pclmulhqhqdq xmm4, xmm2 // u_1 v_1 |
ac4a43c1 | 321 | pclmullqlqdq xmm1, xmm2 // u_0 v_2 |
9e6a4409 MW |
322 | pclmullqhqdq xmm5, xmm2 // u_2 v_1 |
323 | pclmulhqlqdq xmm6, xmm2 // u_1 v_2 | |
ac4a43c1 MW |
324 | pxor xmm1, xmm4 // u_0 v_2 + u_1 v_1 |
325 | pclmullqlqdq xmm7, xmm0 // u_2 v_0 | |
9e6a4409 MW |
326 | pxor xmm5, xmm6 // b = u_2 v_1 + u_1 v_2 |
327 | movdqa xmm6, xmm0 // (u_2; u_1) like a bad penny | |
ac4a43c1 | 328 | pxor xmm1, xmm7 // c = u_0 v_2 + u_1 v_1 + u_2 v_0 |
9e6a4409 | 329 | pclmullqlqdq xmm0, xmm2 // a = u_2 v_2 |
ac4a43c1 MW |
330 | pclmulhqlqdq xmm6, xmm3 // u_1 v_0 |
331 | pclmulhqhqdq xmm2, xmm3 // u_0 v_1 | |
332 | pclmullqhqdq xmm3, xmm3 // e = u_0 v_0 | |
333 | pxor xmm6, xmm2 // d = u_1 v_0 + u_0 v_1 | |
9e6a4409 | 334 | |
ac4a43c1 MW |
335 | // Next, the piecing together of the product. There's significant |
336 | // work here to leave the completed pieces in sensible registers. | |
9e6a4409 MW |
337 | // xmm0 = // (a_1; a_0) = a = u_2 v_2 |
338 | // xmm5 = // (b_1; b_0) = b = u_1 v_2 + u_2 v_1 | |
ac4a43c1 | 339 | // xmm1 = // (c_1; c_0) = c = u_0 v_2 + |
9e6a4409 | 340 | // u_1 v_1 + u_2 v_0 |
ac4a43c1 MW |
341 | // xmm6 = // (d_1; d_0) = d = u_0 v_1 + u_1 v_0 |
342 | // xmm3 = // (e_1; e_0) = e = u_0 v_0 | |
343 | // xmm2, xmm4, xmm7 spare | |
344 | movdqa xmm2, xmm6 // (d_1; d_0) again | |
345 | movdqa xmm4, xmm5 // (b_1; b_0) again | |
346 | pslldq xmm6, 8 // (0; d_1) | |
9e6a4409 | 347 | psrldq xmm5, 8 // (b_0; 0) |
ac4a43c1 MW |
348 | psrldq xmm2, 8 // (d_0; 0) |
349 | pslldq xmm4, 8 // (0; b_1) | |
350 | pxor xmm5, xmm6 // (b_0; d_1) | |
351 | pxor xmm0, xmm4 // (x_5; x_4) = (a_1; a_0 + b_1) | |
352 | pxor xmm2, xmm3 // (x_1; x_0) = (e_1 + d_0; e_0) | |
353 | pxor xmm1, xmm5 // (x_3; x_2) = (b_0 + c_1; c_0 + d_1) | |
9e6a4409 MW |
354 | |
355 | // Next, the reduction. Our polynomial this time is p(t) = t^192 + | |
356 | // t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the | |
357 | // 128-bit case. I don't know why. | |
358 | ||
359 | // First, shift the high bits down. | |
ac4a43c1 MW |
360 | // xmm0 = // (x_5; x_4) |
361 | // xmm1 = // (x_3; x_2) | |
362 | // xmm2 = // (x_1; x_0) | |
9e6a4409 | 363 | // xmm3--xmm7 spare |
ac4a43c1 MW |
364 | movdqa xmm3, xmm0 // (x_5; x_4) copy |
365 | movdqa xmm4, xmm0 // (x_5; x_4) copy | |
366 | movdqa xmm5, xmm0 // (x_5; x_4) copy | |
367 | pslld xmm3, 31 // (x_5; x_4) b_i for t | |
368 | pslld xmm4, 30 // (x_5; x_4) b_i for t^2 | |
369 | pslld xmm5, 25 // (x_5; x_4) b_i for t^7 | |
370 | movq xmm6, xmm1 // (x_3; 0) copy | |
9e6a4409 | 371 | pxor xmm3, xmm4 |
ac4a43c1 | 372 | movq xmm7, xmm1 // (x_3; 0) copy |
9e6a4409 | 373 | pxor xmm3, xmm5 |
ac4a43c1 MW |
374 | movq xmm5, xmm1 // (x_3; 0) copy |
375 | movdqa xmm4, xmm3 // (x_5; x_4) b_i combined | |
376 | pslld xmm6, 31 // (x_3; 0) b_i for t | |
377 | pslld xmm7, 30 // (x_3; 0) b_i for t^2 | |
378 | pslld xmm5, 25 // (x_3; 0) b_i for t^7 | |
379 | psrldq xmm3, 12 // (x_5; x_4) low contrib | |
380 | pslldq xmm4, 4 // (x_5; x_4) high contrib | |
9e6a4409 MW |
381 | pxor xmm6, xmm7 |
382 | pxor xmm2, xmm3 | |
383 | pxor xmm6, xmm5 | |
384 | pxor xmm1, xmm4 | |
385 | pslldq xmm6, 4 | |
386 | pxor xmm2, xmm6 | |
387 | ||
388 | // And finally shift the low bits up. Unfortunately, we also have to | |
389 | // split the low bits out. | |
ac4a43c1 MW |
390 | // xmm0 = // (x'_5; x'_4) |
391 | // xmm1 = // (x'_3; x'_2) | |
392 | // xmm2 = // (x'_1; x'_0) | |
393 | movdqa xmm5, xmm1 // copies of (x'_3; x'_2) | |
9e6a4409 MW |
394 | movdqa xmm6, xmm1 |
395 | movdqa xmm7, xmm1 | |
ac4a43c1 MW |
396 | psrldq xmm1, 8 // bring down (x'_2; ?) |
397 | movdqa xmm3, xmm0 // copies of (x'_5; x'_4) | |
9e6a4409 | 398 | movdqa xmm4, xmm0 |
ac4a43c1 MW |
399 | punpcklqdq xmm1, xmm2 // (x'_2; x'_1) |
400 | psrldq xmm2, 8 // (x'_0; ?) | |
9e6a4409 MW |
401 | pxor xmm2, xmm5 // low half and unit contrib |
402 | pxor xmm1, xmm0 | |
403 | psrld xmm5, 1 | |
404 | psrld xmm0, 1 | |
405 | psrld xmm6, 2 | |
406 | psrld xmm3, 2 | |
407 | psrld xmm7, 7 | |
408 | psrld xmm4, 7 | |
409 | pxor xmm2, xmm6 // low half, unit, t^2 contribs | |
410 | pxor xmm1, xmm3 | |
411 | pxor xmm5, xmm7 // t and t^7 contribs | |
412 | pxor xmm0, xmm4 | |
413 | pxor xmm5, xmm2 // mix everything together | |
414 | pxor xmm0, xmm1 | |
415 | movq xmm1, xmm5 // shunt (z_0; ?) into proper place | |
416 | .endm | |
417 | ||
418 | .macro mul256 | |
419 | // Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave | |
420 | // with z = u v in xmm0/xmm1. Clobbers xmm2--xmm7. On 32-bit x86, | |
421 | // requires 16 bytes aligned space at SP; on amd64, also clobbers | |
422 | // xmm8. | |
423 | ||
424 | // Now it's starting to look worthwhile to do Karatsuba. Suppose | |
425 | // u = u_0 + u_1 B and v = v_0 + v_1 B. Then | |
426 | // | |
427 | // u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2 | |
428 | // | |
429 | // Let the coefficients of B^i be a, b, and c, respectively, and | |
430 | // let r = u_0 + u_1 and s = v_0 + v_1. Then observe that | |
431 | // | |
432 | // q = r s = (u_0 + u_1) (v_0 + v_1) | |
433 | // = (u_0 v_0) + (u_1 v_1) + (u_0 v_1 + u_1 v_0) | |
9f4db500 | 434 | // = a + c + b |
9e6a4409 MW |
435 | // |
436 | // The first two terms we've already calculated; the last is the | |
437 | // remaining one we want. We'll set B = t^128. We know how to do | |
438 | // 128-bit multiplications already, and Karatsuba is too annoying | |
439 | // there, so there'll be 12 multiplications altogether, rather than | |
440 | // the 16 we'd have if we did this the naïve way. | |
441 | // | |
442 | // On x86, there aren't quite enough registers, so spill one for a | |
443 | // bit. On AMD64, we can keep on going, so it's all good. | |
444 | ||
445 | // xmm0 = // u_1 = (u_11; u_10) | |
446 | // xmm1 = // u_0 = (u_01; u_00) | |
447 | // xmm2 = // v_1 = (v_11; v_10) | |
448 | // xmm3 = // v_0 = (v_01; v_00) | |
449 | movdqa xmm4, xmm0 // u_1 again | |
450 | #if CPUFAM_X86 | |
a90d420c | 451 | movdqa [SP + 0], xmm3 |
9e6a4409 MW |
452 | #elif CPUFAM_AMD64 |
453 | movdqa xmm8, xmm3 | |
454 | # define V0 xmm8 | |
455 | #endif | |
456 | pxor xmm4, xmm1 // u_* = (u_01 + u_11; u_00 + u_10) | |
457 | pxor xmm3, xmm2 // v_* = (v_01 + v_11; v_00 + v_10) | |
458 | ||
459 | // Start by building the cross product, q = u_* v_*. | |
460 | movdqa xmm7, xmm4 // more copies of u_* | |
461 | movdqa xmm5, xmm4 | |
462 | movdqa xmm6, xmm4 | |
463 | pclmullqhqdq xmm4, xmm3 // u_*1 v_*0 | |
464 | pclmulhqlqdq xmm7, xmm3 // u_*0 v_*1 | |
465 | pclmullqlqdq xmm5, xmm3 // u_*1 v_*1 | |
466 | pclmulhqhqdq xmm6, xmm3 // u_*0 v_*0 | |
467 | pxor xmm4, xmm7 // u_*1 v_*0 + u_*0 v_*1 | |
468 | movdqa xmm7, xmm4 | |
469 | pslldq xmm4, 8 | |
470 | psrldq xmm7, 8 | |
471 | pxor xmm5, xmm4 // q_1 | |
472 | pxor xmm6, xmm7 // q_0 | |
473 | ||
474 | // Next, work on the high half, a = u_1 v_1. | |
475 | movdqa xmm3, xmm0 // more copies of u_1 | |
476 | movdqa xmm4, xmm0 | |
477 | movdqa xmm7, xmm0 | |
478 | pclmullqhqdq xmm0, xmm2 // u_11 v_10 | |
479 | pclmulhqlqdq xmm3, xmm2 // u_10 v_11 | |
480 | pclmullqlqdq xmm4, xmm2 // u_11 v_11 | |
481 | pclmulhqhqdq xmm7, xmm2 // u_10 v_10 | |
482 | #if CPUFAM_X86 | |
a90d420c | 483 | movdqa xmm2, [SP + 0] |
9e6a4409 MW |
484 | # define V0 xmm2 |
485 | #endif | |
486 | pxor xmm0, xmm3 // u_10 v_11 + u_11 v_10 | |
487 | movdqa xmm3, xmm0 | |
488 | pslldq xmm0, 8 | |
489 | psrldq xmm3, 8 | |
ac4a43c1 | 490 | pxor xmm4, xmm0 // x_3 = a_1 |
9e6a4409 MW |
491 | pxor xmm7, xmm3 // a_0 |
492 | ||
493 | // Mix that into the product now forming in xmm4--xmm7. | |
494 | pxor xmm5, xmm4 // a_1 + q_1 | |
495 | pxor xmm6, xmm7 // a_0 + q_0 | |
496 | pxor xmm5, xmm7 // a_0 + (a_1 + q_1) | |
497 | ||
498 | // Finally, the low half, c = u_0 v_0. | |
499 | movdqa xmm0, xmm1 // more copies of u_0 | |
500 | movdqa xmm3, xmm1 | |
501 | movdqa xmm7, xmm1 | |
502 | pclmullqhqdq xmm1, V0 // u_01 v_00 | |
503 | pclmulhqlqdq xmm0, V0 // u_00 v_01 | |
504 | pclmullqlqdq xmm3, V0 // u_01 v_01 | |
505 | pclmulhqhqdq xmm7, V0 // u_00 v_00 | |
506 | pxor xmm0, xmm1 // u_01 v_00 + u_00 v_01 | |
507 | movdqa xmm1, xmm0 | |
508 | pslldq xmm0, 8 | |
509 | psrldq xmm1, 8 | |
510 | pxor xmm3, xmm0 // c_1 | |
511 | pxor xmm7, xmm1 // x_0 = c_0 | |
512 | ||
513 | // And mix that in to complete the product. | |
514 | pxor xmm6, xmm3 // (a_0 + q_0) + c_1 | |
515 | pxor xmm5, xmm3 // x_2 = a_0 + (a_1 + c_1 + q_1) = a_0 + b_1 | |
516 | pxor xmm6, xmm7 // x_1 = (a_0 + c_0 + q_0) + c_1 = b_0 + c_1 | |
517 | ||
518 | #undef V0 | |
519 | ||
9e6a4409 MW |
520 | // Now we must reduce. This is essentially the same as the 128-bit |
521 | // case above, but more complicated because everything is bigger. | |
522 | // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1. | |
523 | ||
524 | // First, shift the high bits down. | |
ac4a43c1 MW |
525 | movdqa xmm0, xmm4 // x_3 again |
526 | movdqa xmm1, xmm4 // x_3 yet again | |
527 | movdqa xmm2, xmm4 // x_3 again again | |
528 | pslld xmm0, 30 // x_3: b_i for t^2 | |
529 | pslld xmm1, 27 // x_3: b_i for t^5 | |
530 | pslld xmm2, 22 // x_3: b_i for t^10 | |
531 | movdqa xmm3, xmm5 // x_2 again | |
532 | pxor xmm0, xmm1 | |
533 | movdqa xmm1, xmm5 // x_2 again | |
534 | pxor xmm0, xmm2 // b_3 | |
535 | movdqa xmm2, xmm5 // x_2 again | |
536 | pslld xmm3, 30 // x_2: b_i for t^2 | |
537 | pslld xmm1, 27 // x_2: b_i for t^5 | |
538 | pslld xmm2, 22 // x_2: b_i for t^10 | |
539 | pxor xmm3, xmm1 | |
540 | movdqa xmm1, xmm0 | |
541 | pxor xmm3, xmm2 // b_2 | |
542 | psrldq xmm0, 4 | |
543 | movdqa xmm2, xmm3 | |
544 | pslldq xmm1, 12 | |
545 | psrldq xmm3, 4 | |
546 | pxor xmm6, xmm0 | |
547 | pslldq xmm2, 12 | |
548 | pxor xmm7, xmm3 | |
549 | pxor xmm5, xmm1 | |
550 | pxor xmm6, xmm2 | |
9e6a4409 MW |
551 | |
552 | // And then shift the low bits up. | |
ac4a43c1 MW |
553 | movdqa xmm0, xmm4 // x_3 again |
554 | movdqa xmm1, xmm5 // x_2 again | |
555 | movdqa xmm2, xmm4 // x_3 yet again | |
556 | movdqa xmm3, xmm5 // x_2 yet again | |
557 | pxor xmm6, xmm4 // x_1 and unit contrib from x_3 | |
558 | pxor xmm7, xmm5 // x_0 and unit contrib from x_2 | |
559 | psrld xmm4, 2 | |
560 | psrld xmm5, 2 | |
561 | psrld xmm0, 5 | |
562 | psrld xmm1, 5 | |
563 | psrld xmm2, 10 | |
564 | psrld xmm3, 10 | |
565 | pxor xmm4, xmm6 // x_1, with x_3 units and t^2 | |
566 | pxor xmm5, xmm7 // x_0, with x_2 units and t^2 | |
567 | pxor xmm0, xmm2 // x_3 t^5 and t^10 contribs | |
568 | pxor xmm1, xmm3 // x_2 t^5 and t^10 contribs | |
9e6a4409 MW |
569 | pxor xmm0, xmm4 // high half of reduced result |
570 | pxor xmm1, xmm5 // low half; all done | |
571 | .endm | |
572 | ||
573 | ///-------------------------------------------------------------------------- | |
574 | /// Main code. | |
575 | ||
576 | // There are a number of representations of field elements in this code and | |
577 | // it can be confusing. | |
578 | // | |
579 | // * The `external format' consists of a sequence of contiguous bytes in | |
580 | // memory called a `block'. The GCM spec explains how to interpret this | |
581 | // block as an element of a finite field. As discussed extensively, this | |
582 | // representation is very annoying for a number of reasons. On the other | |
583 | // hand, this code never actually deals with it directly. | |
584 | // | |
585 | // * The `register format' consists of one or more XMM registers, depending | |
586 | // on the block size. The bytes in these registers are in reverse order | |
587 | // -- so the least-significant byte of the lowest-numbered register holds | |
588 | // the /last/ byte in the block. If the block size is not a multiple of | |
589 | // 16 bytes, then there must be padding. 96-bit blocks are weird: the | |
590 | // padding is inserted at the /least/ significant end, so the register | |
591 | // holds (0, x_0; x_1, x_2); otherwise, the padding goes at the most | |
592 | // significant end. | |
593 | // | |
594 | // * The `words' format consists of a sequence of bytes, as in the | |
595 | // `external format', but, according to the blockcipher in use, the bytes | |
596 | // within each 32-bit word may be reversed (`big-endian') or not | |
597 | // (`little-endian'). Accordingly, there are separate entry points for | |
598 | // each variant, identified with `b' or `l'. | |
599 | ||
600 | #define SSEFUNC(f) \ | |
601 | FUNC(f##_avx); vzeroupper; endprologue; ENDFUNC; \ | |
602 | FUNC(f) | |
603 | ||
604 | SSEFUNC(gcm_mulk_128b_x86ish_pclmul) | |
605 | // On entry, A points to a 128-bit field element in big-endian words | |
606 | // format; K points to a field-element in register format. On exit, | |
607 | // A is updated with the product A K. | |
608 | ||
609 | #if CPUFAM_X86 | |
a90d420c MW |
610 | mov A, [SP + 4] |
611 | mov K, [SP + 8] | |
9e6a4409 MW |
612 | #endif |
613 | endprologue | |
614 | movdqu xmm0, [A] | |
615 | movdqu xmm1, [K] | |
616 | pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) | |
617 | mul128 | |
618 | pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) | |
619 | movdqu [A], xmm0 | |
620 | ret | |
621 | ENDFUNC | |
622 | ||
623 | SSEFUNC(gcm_mulk_128l_x86ish_pclmul) | |
624 | // On entry, A points to a 128-bit field element in little-endian | |
625 | // words format; K points to a field-element in register format. On | |
626 | // exit, A is updated with the product A K. | |
627 | ||
628 | #if CPUFAM_X86 | |
a90d420c MW |
629 | mov A, [SP + 4] |
630 | mov K, [SP + 8] | |
9e6a4409 MW |
631 | ldgot ecx |
632 | #endif | |
633 | endprologue | |
634 | movdqa xmm7, [INTADDR(swaptab_128l, ecx)] | |
635 | movdqu xmm0, [A] | |
636 | movdqu xmm1, [K] | |
637 | pshufb xmm0, xmm7 | |
638 | mul128 | |
639 | pshufb xmm0, xmm7 | |
640 | movdqu [A], xmm0 | |
641 | ret | |
642 | ENDFUNC | |
643 | ||
644 | SSEFUNC(gcm_mulk_64b_x86ish_pclmul) | |
645 | // On entry, A points to a 64-bit field element in big-endian words | |
646 | // format; K points to a field-element in register format. On exit, | |
647 | // A is updated with the product A K. | |
648 | ||
649 | #if CPUFAM_X86 | |
a90d420c MW |
650 | mov A, [SP + 4] |
651 | mov K, [SP + 8] | |
9e6a4409 MW |
652 | #endif |
653 | endprologue | |
654 | movq xmm0, [A] | |
655 | movq xmm1, [K] | |
656 | pshufd xmm0, xmm0, SHUF(1, 0, 3, 3) | |
657 | mul64 | |
658 | pshufd xmm0, xmm0, SHUF(1, 0, 3, 3) | |
659 | movq [A], xmm0 | |
660 | ret | |
661 | ENDFUNC | |
662 | ||
663 | SSEFUNC(gcm_mulk_64l_x86ish_pclmul) | |
664 | // On entry, A points to a 64-bit field element in little-endian | |
665 | // words format; K points to a field-element in register format. On | |
666 | // exit, A is updated with the product A K. | |
667 | ||
668 | #if CPUFAM_X86 | |
a90d420c MW |
669 | mov A, [SP + 4] |
670 | mov K, [SP + 8] | |
9e6a4409 MW |
671 | ldgot ecx |
672 | #endif | |
673 | endprologue | |
674 | movdqa xmm7, [INTADDR(swaptab_64l, ecx)] | |
675 | movq xmm0, [A] | |
676 | movq xmm1, [K] | |
677 | pshufb xmm0, xmm7 | |
678 | mul64 | |
679 | pshufb xmm0, xmm7 | |
680 | movq [A], xmm0 | |
681 | ret | |
682 | ENDFUNC | |
683 | ||
684 | SSEFUNC(gcm_mulk_96b_x86ish_pclmul) | |
685 | // On entry, A points to a 96-bit field element in big-endian words | |
686 | // format; K points to a field-element in register format (i.e., 16 | |
687 | // bytes, with the first four bytes zero). On exit, A is updated | |
688 | // with the product A K. | |
689 | ||
690 | #if CPUFAM_X86 | |
a90d420c MW |
691 | mov A, [SP + 4] |
692 | mov K, [SP + 8] | |
9e6a4409 MW |
693 | #endif |
694 | endprologue | |
695 | movq xmm0, [A + 0] | |
696 | movd xmm2, [A + 8] | |
697 | movdqu xmm1, [K] | |
698 | punpcklqdq xmm0, xmm2 | |
699 | pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) | |
700 | mul96 | |
701 | pshufd xmm1, xmm0, SHUF(3, 2, 1, 0) | |
702 | psrldq xmm0, 4 | |
703 | movq [A + 0], xmm1 | |
704 | movd [A + 8], xmm0 | |
705 | ret | |
706 | ENDFUNC | |
707 | ||
708 | SSEFUNC(gcm_mulk_96l_x86ish_pclmul) | 
709 | // On entry, A points to a 96-bit field element in little-endian | 
710 | // words format; K points to a field-element in register format | 
711 | // (i.e., 16 bytes, with the first four bytes zero). On exit, A is | 
712 | // updated with the product A K. | 
713 | ||
714 | #if CPUFAM_X86 | 
	// On plain x86, fetch the stack arguments, and set up ecx as the
	// GOT pointer so that INTADDR below can find the swap table in
	// position-independent code.
a90d420c MW |
715 | mov A, [SP + 4] |
716 | mov K, [SP + 8] | 
9e6a4409 MW |
717 | ldgot ecx |
718 | #endif | 
719 | endprologue | 
	// Keep the byte-swap table in xmm7: it's needed both before and
	// after the multiplication.
720 | movdqa xmm7, [INTADDR(swaptab_128l, ecx)] | 
	// Fetch the operand as a 64-bit piece plus a 32-bit piece, merged
	// into the low three words of xmm0; fetch all 16 bytes of K.
721 | movq xmm0, [A + 0] | 
722 | movd xmm2, [A + 8] | 
723 | movdqu xmm1, [K] | 
724 | punpcklqdq xmm0, xmm2 | 
	// Byte-swap little-endian words format into the internal register
	// format, multiply, and swap back.
725 | pshufb xmm0, xmm7 | 
726 | mul96 | 
727 | pshufb xmm0, xmm7 | 
	// Store the 96-bit product as a 64-bit piece plus a 32-bit piece.
728 | movq [A + 0], xmm0 | 
729 | psrldq xmm0, 8 | 
730 | movd [A + 8], xmm0 | 
731 | ret | 
732 | ENDFUNC | 
733 | ||
734 | SSEFUNC(gcm_mulk_192b_x86ish_pclmul) | 
735 | // On entry, A points to a 192-bit field element in big-endian words | 
736 | // format; K points to a field-element in register format. On exit, | 
737 | // A is updated with the product A K. | 
738 | ||
739 | #if CPUFAM_X86 | 
	// On plain x86, the arguments arrive on the stack.
a90d420c MW |
740 | mov A, [SP + 4] |
741 | mov K, [SP + 8] | 
9e6a4409 MW |
742 | #endif |
743 | #if CPUFAM_AMD64 && ABI_WIN | 
	// xmm6 and xmm7 are callee-saved in the Windows ABI and are
	// evidently used by mul192, so spill them; the extra 8 bytes keep
	// the stack 16-byte aligned.
744 | stalloc 2*16 + 8 | 
745 | savexmm xmm6, 0 | 
746 | savexmm xmm7, 16 | 
747 | #endif | 
748 | endprologue | 
	// Fetch the operand as a 128-bit piece (high) and a 64-bit piece
	// (low), and K as 16 + 8 bytes.
749 | movdqu xmm0, [A + 8] | 
750 | movq xmm1, [A + 0] | 
751 | movdqu xmm2, [K + 0] | 
752 | movq xmm3, [K + 16] | 
	// Shuffle the 32-bit words of both pieces from big-endian words
	// format into the internal register format for mul192.
753 | pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) | 
754 | pshufd xmm1, xmm1, SHUF(1, 0, 3, 3) | 
755 | mul192 | 
	// Shuffle the product back, and store it in the same 16 + 8 split.
756 | pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) | 
757 | pshufd xmm1, xmm1, SHUF(1, 0, 3, 3) | 
758 | movdqu [A + 8], xmm0 | 
759 | movq [A + 0], xmm1 | 
760 | #if CPUFAM_AMD64 && ABI_WIN | 
	// Restore the callee-saved xmm registers.
761 | rstrxmm xmm6, 0 | 
762 | rstrxmm xmm7, 16 | 
763 | stfree 2*16 + 8 | 
764 | #endif | 
765 | ret | 
766 | ENDFUNC | 
767 | ||
768 | SSEFUNC(gcm_mulk_192l_x86ish_pclmul) | 
769 | // On entry, A points to a 192-bit field element in little-endian | 
770 | // words format; K points to a field-element in register format. On | 
771 | // exit, A is updated with the product A K. | 
772 | ||
773 | #if CPUFAM_X86 | 
	// On plain x86, fetch the stack arguments and set up ecx as the GOT
	// pointer for the INTADDR table references below.
a90d420c MW |
774 | mov A, [SP + 4] |
775 | mov K, [SP + 8] | 
9e6a4409 MW |
776 | ldgot ecx |
777 | #endif | 
778 | #if CPUFAM_AMD64 && ABI_WIN | 
	// xmm6 and xmm7 are callee-saved in the Windows ABI and are
	// evidently used by mul192, so spill them; the extra 8 bytes keep
	// the stack 16-byte aligned.
779 | stalloc 2*16 + 8 | 
780 | savexmm xmm6, 0 | 
781 | savexmm xmm7, 16 | 
782 | #endif | 
783 | endprologue | 
	// Fetch the operand as a 128-bit piece (high) and a 64-bit piece
	// (low), and K as 16 + 8 bytes.
784 | movdqu xmm0, [A + 8] | 
785 | movq xmm1, [A + 0] | 
786 | movdqu xmm2, [K + 0] | 
787 | movq xmm3, [K + 16] | 
	// Byte-swap both pieces from little-endian words format into the
	// internal register format: the 128-bit piece uses the full-width
	// table, the 64-bit piece the 64-bit one.  (The tables are fetched
	// from memory each time rather than cached in a register.)
788 | pshufb xmm0, [INTADDR(swaptab_128l, ecx)] | 
789 | pshufb xmm1, [INTADDR(swaptab_64l, ecx)] | 
790 | mul192 | 
	// Swap the product back, and store it in the same 16 + 8 split.
791 | pshufb xmm0, [INTADDR(swaptab_128l, ecx)] | 
792 | pshufb xmm1, [INTADDR(swaptab_64l, ecx)] | 
793 | movdqu [A + 8], xmm0 | 
794 | movq [A + 0], xmm1 | 
795 | #if CPUFAM_AMD64 && ABI_WIN | 
	// Restore the callee-saved xmm registers.
796 | rstrxmm xmm6, 0 | 
797 | rstrxmm xmm7, 16 | 
798 | stfree 2*16 + 8 | 
799 | #endif | 
800 | ret | 
801 | ENDFUNC | 
802 | ||
803 | SSEFUNC(gcm_mulk_256b_x86ish_pclmul) | 
804 | // On entry, A points to a 256-bit field element in big-endian words | 
805 | // format; K points to a field-element in register format. On exit, | 
806 | // A is updated with the product A K. | 
807 | ||
808 | #if CPUFAM_X86 | 
	// On plain x86, establish a frame pointer so that SP can be
	// realigned: the arguments are now at [SP + 8] and [SP + 12]
	// because of the pushed BP.  The 16 bytes of 16-aligned stack
	// scratch are presumably needed by mul256 -- TODO confirm against
	// the macro definition earlier in the file.
a90d420c | 809 | pushreg BP |
9e6a4409 | 810 | setfp |
a90d420c MW |
811 | mov A, [SP + 8] |
812 | mov K, [SP + 12] | 
6d2bd7f1 | 813 | stalloc 16 |
a90d420c | 814 | and SP, ~15 |
9e6a4409 MW |
815 | #endif |
816 | #if CPUFAM_AMD64 && ABI_WIN | 
	// xmm6--xmm8 are callee-saved in the Windows ABI and are evidently
	// used by mul256, so spill them; the extra 8 bytes keep the stack
	// 16-byte aligned.
817 | stalloc 3*16 + 8 | 
818 | savexmm xmm6, 0 | 
819 | savexmm xmm7, 16 | 
820 | savexmm xmm8, 32 | 
821 | #endif | 
822 | endprologue | 
	// Fetch the operand and K, each as two 128-bit pieces (high piece
	// first into xmm0/xmm2).
823 | movdqu xmm0, [A + 16] | 
824 | movdqu xmm1, [A + 0] | 
825 | movdqu xmm2, [K + 0] | 
826 | movdqu xmm3, [K + 16] | 
	// Reverse the 32-bit words of both halves to convert big-endian
	// words format into the internal register format, multiply, and
	// convert back.
827 | pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) | 
828 | pshufd xmm1, xmm1, SHUF(3, 2, 1, 0) | 
829 | mul256 | 
830 | pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) | 
831 | pshufd xmm1, xmm1, SHUF(3, 2, 1, 0) | 
	// Store the 256-bit product as two 128-bit pieces.
832 | movdqu [A + 16], xmm0 | 
833 | movdqu [A + 0], xmm1 | 
834 | #if CPUFAM_X86 | 
	// Tear down the x86 frame.
835 | dropfp | 
a90d420c | 836 | popreg BP |
9e6a4409 MW |
837 | #endif |
838 | #if CPUFAM_AMD64 && ABI_WIN | 
	// Restore the callee-saved xmm registers.
839 | rstrxmm xmm6, 0 | 
840 | rstrxmm xmm7, 16 | 
841 | rstrxmm xmm8, 32 | 
842 | stfree 3*16 + 8 | 
843 | #endif | 
844 | ret | 
845 | ENDFUNC | 
846 | ||
847 | SSEFUNC(gcm_mulk_256l_x86ish_pclmul) | 
848 | // On entry, A points to a 256-bit field element in little-endian | 
849 | // words format; K points to a field-element in register format. On | 
850 | // exit, A is updated with the product A K. | 
851 | ||
852 | #if CPUFAM_X86 | 
	// On plain x86, establish a frame pointer so that SP can be
	// realigned (arguments then at [SP + 8] and [SP + 12]), load the
	// GOT pointer into ecx for INTADDR, and leave 16 bytes of
	// 16-aligned stack scratch -- presumably for mul256; TODO confirm
	// against the macro definition earlier in the file.
a90d420c | 853 | pushreg BP |
9e6a4409 | 854 | setfp |
a90d420c MW |
855 | mov A, [SP + 8] |
856 | mov K, [SP + 12] | 
6d2bd7f1 | 857 | stalloc 16 |
9e6a4409 | 858 | ldgot ecx |
6d2bd7f1 | 859 | and SP, ~15 |
9e6a4409 MW |
860 | #endif |
861 | #if CPUFAM_AMD64 && ABI_WIN | 
	// xmm6--xmm8 are callee-saved in the Windows ABI and are evidently
	// used by mul256, so spill them; the extra 8 bytes keep the stack
	// 16-byte aligned.
862 | stalloc 3*16 + 8 | 
863 | savexmm xmm6, 0 | 
864 | savexmm xmm7, 16 | 
865 | savexmm xmm8, 32 | 
866 | #endif | 
867 | endprologue | 
	// Load the byte-swap table, and fetch the operand and K, each as
	// two 128-bit pieces (high piece first into xmm0/xmm2).
868 | movdqa xmm7, [INTADDR(swaptab_128l, ecx)] | 
869 | movdqu xmm0, [A + 16] | 
870 | movdqu xmm1, [A + 0] | 
871 | movdqu xmm2, [K + 0] | 
872 | movdqu xmm3, [K + 16] | 
	// Byte-swap both halves into the internal register format and
	// multiply.
873 | pshufb xmm0, xmm7 | 
874 | pshufb xmm1, xmm7 | 
875 | mul256 | 
	// mul256 overwrites xmm7 (hence the save on Win64 above), so the
	// swap table must be reloaded before converting the product back.
876 | movdqa xmm7, [INTADDR(swaptab_128l, ecx)] | 
877 | pshufb xmm0, xmm7 | 
878 | pshufb xmm1, xmm7 | 
	// Store the 256-bit product as two 128-bit pieces.
879 | movdqu [A + 16], xmm0 | 
880 | movdqu [A + 0], xmm1 | 
881 | #if CPUFAM_X86 | 
	// Tear down the x86 frame.
882 | dropfp | 
a90d420c | 883 | popreg BP |
9e6a4409 MW |
884 | #endif |
885 | #if CPUFAM_AMD64 && ABI_WIN | 
	// Restore the callee-saved xmm registers.
886 | rstrxmm xmm6, 0 | 
887 | rstrxmm xmm7, 16 | 
888 | rstrxmm xmm8, 32 | 
889 | stfree 3*16 + 8 | 
890 | #endif | 
891 | ret | 
892 | ENDFUNC | 
893 | ||
894 | RODATA | 
895 | ||
	// PSHUFB shuffle-control tables for converting between
	// little-endian words format and the internal register format.
896 | .balign 16 | 
897 | swaptab_128l: | 
898 | // Table for byte-swapping little-endian words-format blocks larger | 
899 | // than 64 bits. | 
900 | .byte 15, 14, 13, 12, 11, 10, 9, 8 | 
901 | .byte 7, 6, 5, 4, 3, 2, 1, 0 | 
902 | ||
903 | .balign 16 | 
904 | swaptab_64l: | 
905 | // Table for byte-swapping 64-bit little-endian words-format blocks. | 
	// A PSHUFB index with its top bit set writes a zero byte, so the
	// 255 entries clear the top eight bytes of the destination.
906 | .byte 7, 6, 5, 4, 3, 2, 1, 0 | 
907 | .byte 255, 255, 255, 255, 255, 255, 255, 255 | 
909 | ///----- That's all, folks -------------------------------------------------- |