x86ish *.S: Use `stalloc' consistently to allocate space on the stack.
[catacomb] / symm / gcm-x86ish-pclmul.S
/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// GCM acceleration for x86 processors
///
/// (c) 2018 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software: you can redistribute it and/or modify it
/// under the terms of the GNU Library General Public License as published
/// by the Free Software Foundation; either version 2 of the License, or
/// (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful, but
/// WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
/// Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb.  If not, write to the Free Software
/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
/// USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

	.arch	.pclmul

	.text

///--------------------------------------------------------------------------
/// Common register allocation.

#if CPUFAM_X86
# define A eax
# define K edx
#elif CPUFAM_AMD64 && ABI_SYSV
# define A rdi
# define K rsi
#elif CPUFAM_AMD64 && ABI_WIN
# define A rcx
# define K rdx
#endif

///--------------------------------------------------------------------------
/// Multiplication macros.

	// The good news is that we have a fancy instruction to do the
	// multiplications.  The bad news is that it's not particularly well-
	// suited to the job.
	//
	// For one thing, it only does a 64-bit multiplication, so in general
	// we'll need to synthesize the full-width multiply by hand.  For
	// another thing, it doesn't help with the reduction, so we have to
	// do that by hand too.  And, finally, GCM has crazy bit ordering,
	// and the instruction does nothing useful for that at all.
	//
	// Focusing on that last problem first: the bits aren't in monotonic
	// significance order unless we permute them.  If we reverse the byte
	// order, then we'll have the bits in monotonic order, but backwards,
	// so the degree-0 coefficient will be in the most-significant bit.
	//
	// This is less of a difficulty than it seems at first, because
	// algebra.  Suppose we are given u = SUM_{0<=i<n} u_i t^i and v =
	// SUM_{0<=j<n} v_j t^j; then
	//
	//	u v = SUM_{0<=i,j<n} u_i v_j t^{i+j}
	//
	// Suppose instead that we're given ũ = SUM_{0<=i<n} u_{n-i-1} t^i
	// and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards.
	// Then
	//
	//	ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j}
	//	    = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)}
	//
	// which is almost the bit-reversal of u v, only it's shifted right
	// by one place.  Oh, well: we'll have to shift it back later.
	//
	// That was important to think about, but there's not a great deal to
	// do about it yet other than to convert what we've got from the
	// blockcipher's byte-ordering convention to our big-endian
	// convention.  Since this depends on the blockcipher convention,
	// we'll leave the caller to cope with this: the macros here will
	// assume that the operands are in `register' format, which is the
	// byte-reversal of the external representation, padded at the
	// most-significant end except for 96-bit blocks, which are
	// zero-padded at the least-significant end (see `mul96' for the
	// details).  In the commentary, pieces of polynomial are numbered
	// according to the degree of their coefficients, so the unit
	// coefficient of some polynomial a is in a_0.
	//
	// The commentary for `mul128' is the most detailed.  The other
	// macros assume that you've already read and understood that.

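	// If you want to convince yourself of the shifted-reversal
	// identity above, it's quick to check exhaustively on tiny
	// operands.  The following standalone C program (a sketch for
	// exposition -- it's not part of Catacomb) multiplies 8-bit
	// polynomials both ways and compares:
	//
	//	#include <assert.h>
	//	#include <stdint.h>
	//
	//	/* Carryless (GF(2)[t]) product of 8-bit polynomials. */
	//	static uint16_t clmul8(uint8_t u, uint8_t v)
	//	{
	//	  uint16_t z = 0;
	//	  for (int i = 0; i < 8; i++)
	//	    if ((u >> i)&1) z ^= (uint16_t)v << i;
	//	  return (z);
	//	}
	//
	//	static uint8_t rev8(uint8_t x)	/* bit-reverse a byte */
	//	{
	//	  uint8_t y = 0;
	//	  for (int i = 0; i < 8; i++) y = (y << 1) | ((x >> i)&1);
	//	  return (y);
	//	}
	//
	//	static uint16_t rev16(uint16_t x) /* bit-reverse 16 bits */
	//	  { return ((uint16_t)(rev8(x&0xff) << 8) | rev8(x >> 8)); }
	//
	//	int main(void)
	//	{
	//	  for (unsigned u = 0; u < 256; u++)
	//	    for (unsigned v = 0; v < 256; v++)
	//	      assert(clmul8(rev8(u), rev8(v)) ==
	//		     rev16(clmul8(u, v)) >> 1);
	//	  return (0);
	//	}
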
.macro mul128
	// Enter with u and v in xmm0 and xmm1 respectively; leave with z =
	// u v in xmm0.  Clobbers xmm1--xmm4.

	// First for the double-precision multiplication.  It's tempting to
	// use Karatsuba's identity here, but I suspect that loses more in
	// the shifting, bit-twiddling, and dependency chains that it gains
	// in saving a multiplication which otherwise pipelines well.
	//   xmm0 =			// (u_1; u_0)
	//   xmm1 =			// (v_1; v_0)
	movdqa	xmm2, xmm1		// (v_1; v_0) again
	movdqa	xmm3, xmm0		// (u_1; u_0) again
	movdqa	xmm4, xmm0		// (u_1; u_0) yet again
	pclmulhqlqdq xmm2, xmm0		// u_1 v_0
	pclmullqlqdq xmm0, xmm1		// u_1 v_1
	pclmulhqlqdq xmm3, xmm1		// u_0 v_1
	pclmulhqhqdq xmm4, xmm1		// u_0 v_0

	// Arrange the pieces to form a double-precision polynomial.
	pxor	xmm2, xmm3		// (m_1; m_0) = u_1 v_0 + u_0 v_1
	movdqa	xmm1, xmm2		// (m_1; m_0) again
	pslldq	xmm2, 8			// (0; m_1)
	psrldq	xmm1, 8			// (m_0; 0)
	pxor	xmm0, xmm2		// x_1 = u_1 v_1 + m_1
	pxor	xmm1, xmm4		// x_0 = u_0 v_0 + t^64 m_0

	// Two problems remain.  The first is that this product is shifted
	// left (from GCM's backwards perspective) by one place, which is
	// annoying.  Let's take care of that now.  Once this is done, we'll
	// be properly in GCM's backwards bit-ordering, so xmm1 will hold the
	// low half of the product and xmm0 the high half.  (The following
	// diagrams show bit 0 consistently on the right.)
	//
	//	                           xmm1
	//	,-------------.-------------.-------------.-------------.
	//	| 0  x_0-x_30 |  x_31-x_62  |  x_63-x_94  | x_95-x_126  |
	//	`-------------^-------------^-------------^-------------'
	//
	//	                           xmm0
	//	,-------------.-------------.-------------.-------------.
	//	| x_127-x_158 | x_159-x_190 | x_191-x_222 | x_223-x_254 |
	//	`-------------^-------------^-------------^-------------'
	//
	// We start by shifting each 32-bit lane right (from GCM's point of
	// view -- physically, left) by one place, which gives us this:
	//
	//	                        low (xmm3)
	//	,-------------.-------------.-------------.-------------.
	//	| x_0-x_30  0 | x_32-x_62 0 | x_64-x_94 0 |x_96-x_126 0 |
	//	`-------------^-------------^-------------^-------------'
	//
	//	                        high (xmm2)
	//	,-------------.-------------.-------------.-------------.
	//	|x_128-x_158 0|x_160-x_190 0|x_192-x_222 0|x_224-x_254 0|
	//	`-------------^-------------^-------------^-------------'
	//
	// but we've lost a bunch of bits.  We separately shift each lane
	// left by 31 places to give us the bits we lost.
	//
	//	                        low (xmm1)
	//	,-------------.-------------.-------------.-------------.
	//	|    0...0    | 0...0  x_31 | 0...0  x_63 | 0...0  x_95 |
	//	`-------------^-------------^-------------^-------------'
	//
	//	                        high (xmm0)
	//	,-------------.-------------.-------------.-------------.
	//	| 0...0 x_127 | 0...0 x_159 | 0...0 x_191 | 0...0 x_223 |
	//	`-------------^-------------^-------------^-------------'
	//
	// Which is close, but we don't get a cigar yet.  To get the missing
	// bits into position, we shift each of these right by a lane, but,
	// alas, the x_127 falls off, so, separately, we shift the high
	// register left by three lanes, so that everything is lined up
	// properly when we OR them all together:
	//
	//	                        low (xmm1)
	//	,-------------.-------------.-------------.-------------.
	//	| 0...0  x_31 | 0...0  x_63 | 0...0  x_95 |    0...0    |
	//	`-------------^-------------^-------------^-------------'
	//
	//	                        wrap (xmm4)
	//	,-------------.-------------.-------------.-------------.
	//	|    0...0    |    0...0    |    0...0    | 0...0 x_127 |
	//	`-------------^-------------^-------------^-------------'
	//
	//	                        high (xmm0)
	//	,-------------.-------------.-------------.-------------.
	//	| 0...0 x_159 | 0...0 x_191 | 0...0 x_223 |    0...0    |
	//	`-------------^-------------^-------------^-------------'
	//
	// The `low' and `wrap' registers (xmm1, xmm3, xmm4) then collect the
	// low 128 coefficients, while the `high' registers (xmm0, xmm2)
	// collect the high 127 coefficients, leaving a zero bit at the most
	// significant end as we expect.

	//   xmm0 =			// (x_7, x_6; x_5, x_4)
	//   xmm1 =			// (x_3, x_2; x_1, x_0)
	movdqa	xmm3, xmm1		// (x_3, x_2; x_1, x_0) again
	movdqa	xmm2, xmm0		// (x_7, x_6; x_5, x_4) again
	psrld	xmm1, 31		// shifted left; just the carries
	psrld	xmm0, 31
	pslld	xmm3, 1			// shifted right, but dropped carries
	pslld	xmm2, 1
	movdqa	xmm4, xmm0		// another copy for the carry around
	pslldq	xmm1, 4			// move carries over
	pslldq	xmm0, 4
	psrldq	xmm4, 12		// the big carry wraps around
	por	xmm1, xmm3
	por	xmm0, xmm2		// (y_7, y_6; y_5, y_4)
	por	xmm1, xmm4		// (y_3, y_2; y_1, y_0)

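	// In portable C, this lane-shift-with-carry trick looks like the
	// following sketch (one 128-bit half, held as four 32-bit lanes
	// with lane 0 least significant; a hypothetical helper, for
	// exposition only):
	//
	//	#include <stdint.h>
	//
	//	/* y = x << 1, across lanes; return the bit shifted out. */
	//	static uint32_t shl128_1(uint32_t y[4], const uint32_t x[4])
	//	{
	//	  uint32_t c = 0, t;
	//	  for (int i = 0; i < 4; i++)
	//	    { t = x[i] >> 31; y[i] = (x[i] << 1) | c; c = t; }
	//	  return (c);
	//	}
	//
	// The vectorized version above computes all four `x[i] << 1' with
	// one PSLLD and all four carries with one PSRLD, uses PSLLDQ to
	// move each carry into the next lane up, and PSRLDQ to catch the
	// carry out of the top lane so that it can wrap into the other
	// register.
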
	// And the other problem is that the result needs to be reduced
	// modulo p(t) = t^128 + t^7 + t^2 + t + 1.  Let R = t^128 = t^7 +
	// t^2 + t + 1 in our field.  So far, we've calculated z_0 and z_1
	// such that z_0 + z_1 R = u v using the identity R = t^128: now we
	// must collapse the two halves of z together using the other
	// identity R = t^7 + t^2 + t + 1.
	//
	// We do this by working on each 32-bit word of the high half of z
	// separately, so consider y_i, for some 4 <= i < 8.  Certainly, y_i
	// t^{32i} = y_i R t^{32(i-4)} = (t^7 + t^2 + t + 1) y_i t^{32(i-4)},
	// but we can't use that directly without breaking up the 32-bit word
	// structure.  Instead, we start by considering just y_i t^7
	// t^{32(i-4)}, which again looks tricky.  Now, split y_i = a_i +
	// t^25 b_i, with deg a_i < 25; then
	//
	//	y_i t^7 t^{32(i-4)} = a_i t^7 t^{32(i-4)} + b_i t^{32(i-3)}
	//
	// We can similarly decompose y_i t^2 and y_i t into a pair of 32-bit
	// contributions to the t^{32(i-4)} and t^{32(i-3)} words, but the
	// splits are different.  This is lovely, with one small snag: when
	// we do this to y_7, we end up with a contribution back into the
	// t^128 coefficient word.  But notice that only the low seven bits
	// of this word are affected, so there's no knock-on contribution
	// into the t^32 word.  Therefore, if we handle the high bits of each
	// word together, and then the low bits, everything will be fine.

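	// For a concrete model of the two phases, here is the same
	// reduction written in C over 64-bit words in the natural bit
	// order (a sketch for exposition: the code below works on 32-bit
	// lanes in the byte-reversed representation instead).
	//
	//	#include <stdint.h>
	//
	//	/* Reduce the 256-bit product z (z[0] least significant)
	//	 * modulo p(t) = t^128 + t^7 + t^2 + t + 1, leaving the
	//	 * result in z[0] and z[1]. */
	//	static void gf128_reduce(uint64_t z[4])
	//	{
	//	  for (int i = 3; i >= 2; i--) {
	//	    uint64_t y = z[i];
	//	    z[i] = 0;
	//	    z[i - 2] ^= y ^ (y << 1) ^ (y << 2) ^ (y << 7);
	//	    z[i - 1] ^= (y >> 63) ^ (y >> 62) ^ (y >> 57);
	//	  }
	//	}
	//
	// Note how the i = 3 step leaves only seven low bits in z[2], so
	// the i = 2 step's carry term for them vanishes: that's the `no
	// knock-on contribution' property mentioned above.
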
	// First, shift the high bits down.
	movdqa	xmm2, xmm0		// (y_7, y_6; y_5, y_4) again
	movdqa	xmm3, xmm0		// (y_7, y_6; y_5, y_4) yet again
	movdqa	xmm4, xmm0		// (y_7, y_6; y_5, y_4) again again
	pslld	xmm2, 31		// the b_i for t
	pslld	xmm3, 30		// the b_i for t^2
	pslld	xmm4, 25		// the b_i for t^7
	pxor	xmm2, xmm3		// add them all together
	pxor	xmm2, xmm4
	movdqa	xmm3, xmm2		// and a copy for later
	psrldq	xmm2, 4			// contribution into low half
	pslldq	xmm3, 12		// and high half
	pxor	xmm1, xmm2
	pxor	xmm0, xmm3

	// And then shift the low bits up.
	movdqa	xmm2, xmm0
	movdqa	xmm3, xmm0
	pxor	xmm1, xmm0		// mix in the unit contribution
	psrld	xmm0, 1
	psrld	xmm2, 2
	psrld	xmm3, 7
	pxor	xmm1, xmm2		// low half, unit, and t^2 contribs
	pxor	xmm0, xmm3		// t and t^7 contribs
	pxor	xmm0, xmm1		// mix them together and we're done
.endm

.macro mul64
	// Enter with u and v in the low halves of xmm0 and xmm1
	// respectively; leave with z = u v in xmm0.  Clobbers xmm1--xmm4.

	// The multiplication is thankfully easy.
	pclmullqlqdq xmm0, xmm1		// u v

	// Shift the product up by one place.  After this, we're in GCM
	// bizarro-world.
	movdqa	xmm1, xmm0		// u v again
	psrld	xmm0, 31		// shifted left; just the carries
	pslld	xmm1, 1			// shifted right, but dropped carries
	pslldq	xmm0, 4			// move carries over
	por	xmm1, xmm0		// (y_3, y_2; y_1, y_0)

	// Now we must reduce.  This is essentially the same as the 128-bit
	// case above, but mostly simpler because everything is smaller.  The
	// polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.

	// First, we must detach the top (`low'!) half of the result.
	movdqa	xmm0, xmm1		// (y_3, y_2; y_1, y_0) again
	psrldq	xmm1, 8			// (y_1, y_0; 0, 0)

	// Next, shift the high bits down.
	movdqa	xmm2, xmm0		// (y_3, y_2; ?, ?) again
	movdqa	xmm3, xmm0		// (y_3, y_2; ?, ?) yet again
	movdqa	xmm4, xmm0		// (y_3, y_2; ?, ?) again again
	pslld	xmm2, 31		// b_i for t
	pslld	xmm3, 29		// b_i for t^3
	pslld	xmm4, 28		// b_i for t^4
	pxor	xmm2, xmm3		// add them all together
	pxor	xmm2, xmm4
	movdqa	xmm3, xmm2		// and a copy for later
	movq	xmm2, xmm2		// zap high half
	pslldq	xmm3, 4			// contribution into high half
	psrldq	xmm2, 4			// and low half
	pxor	xmm0, xmm3
	pxor	xmm1, xmm2

	// And then shift the low bits up.
	movdqa	xmm2, xmm0
	movdqa	xmm3, xmm0
	pxor	xmm1, xmm0		// mix in the unit contribution
	psrld	xmm0, 1
	psrld	xmm2, 3
	psrld	xmm3, 4
	pxor	xmm1, xmm2		// low half, unit, and t^3 contribs
	pxor	xmm0, xmm3		// t and t^4 contribs
	pxor	xmm0, xmm1		// mix them together and we're done
.endm

.macro mul96
	// Enter with u and v in the /high/ three words of xmm0 and xmm1
	// respectively (and zero in the low word); leave with z = u v in the
	// high three words of xmm0, and /junk/ in the low word.  Clobbers
	// xmm1--xmm4.

	// This is an inconvenient size.  There's nothing for it but to do
	// four multiplications, as if for the 128-bit case.  It's possible
	// that there's cruft in the top 32 bits of the input registers, so
	// shift both of them up by four bytes before we start.  This will
	// mean that the high 64 bits of the result (from GCM's viewpoint)
	// will be zero.
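	//
	// Explicitly: write u = u_2 t^64 + u' and v = v_2 t^64 + v', where
	// u' = u_1 t^32 + u_0 and v' = v_1 t^32 + v_0.  Then (this is just
	// the schoolbook expansion, spelled out for clarity)
	//
	//	u v = (u_2 v_2) t^128 + (u_2 v' + v_2 u') t^64 + u' v'
	//	    = d t^128 + (e_0 + e_1) t^64 + f
	//
	// which accounts for the four multiplications issued below.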
	//   xmm0 =			// (0, u_2; u_1, u_0)
	//   xmm1 =			// (0, v_2; v_1, v_0)
	movdqa	xmm2, xmm1		// (0, v_2; v_1, v_0) again
	movdqa	xmm3, xmm0		// (0, u_2; u_1, u_0) again
	movdqa	xmm4, xmm0		// (0, u_2; u_1, u_0) yet again
	pclmulhqlqdq xmm2, xmm0		// u_2 (v_1 t^32 + v_0) = e_0
	pclmullqlqdq xmm0, xmm1		// u_2 v_2 = d = (0; d)
	pclmulhqlqdq xmm3, xmm1		// v_2 (u_1 t^32 + u_0) = e_1
	pclmulhqhqdq xmm4, xmm1		// u_0 v_0 + (u_1 v_0 + u_0 v_1) t^32
					//   + u_1 v_1 t^64 = f

	// Extract the high and low halves of the 192-bit result.  We don't
	// need to be too picky about the unused high words of the result
	// registers.  The answer we want is d t^128 + e t^64 + f, where e =
	// e_0 + e_1.
	//
	// The place values for the two halves are (t^160, t^128; t^96, ?)
	// and (?, t^64; t^32, 1).
	psrldq	xmm0, 8			// (d; 0) = d t^128
	pxor	xmm2, xmm3		// e = (e_0 + e_1)
	movdqa	xmm1, xmm4		// f again
	pxor	xmm0, xmm2		// d t^128 + e t^64
	psrldq	xmm2, 12		// e[31..0] t^64
	psrldq	xmm1, 4			// f[95..0]
	pslldq	xmm4, 8			// f[127..96]
	pxor	xmm1, xmm2		// low 96 bits of result
	pxor	xmm0, xmm4		// high 96 bits of result

	// Next, shift everything one bit to the left to compensate for GCM's
	// strange ordering.  This will be easier if we shift up the high
	// half by a word before we start.  After this we're in GCM bizarro-
	// world.
	movdqa	xmm3, xmm1		// low half again
	pslldq	xmm0, 4			// shift high half up by a word
	psrld	xmm1, 31		// shift low half: just the carries
	movdqa	xmm2, xmm0		// copy high half
	pslld	xmm3, 1			// shift low half: drop the carries
	psrld	xmm0, 31		// shift high half: just the carries
	pslld	xmm2, 1			// shift high half: drop the carries
	movdqa	xmm4, xmm0		// copy high carries for carry-around
	pslldq	xmm0, 4			// shift carries down
	pslldq	xmm1, 4
	psrldq	xmm4, 12		// the big carry wraps around
	por	xmm1, xmm3
	por	xmm0, xmm2
	por	xmm1, xmm4

	// Finally, the reduction.  This is essentially the same as the
	// 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
	// t^9 + t^6 + 1.  The degrees are larger but not enough to cause
	// trouble for the general approach.

	// First, shift the high bits down.
	movdqa	xmm2, xmm0		// copies of the high part
	movdqa	xmm3, xmm0
	movdqa	xmm4, xmm0
	pslld	xmm2, 26		// b_i for t^6
	pslld	xmm3, 23		// b_i for t^9
	pslld	xmm4, 22		// b_i for t^10
	pxor	xmm2, xmm3		// add them all together
	pslldq	xmm1, 4			// shift low part up to match
	pxor	xmm2, xmm4
	movdqa	xmm3, xmm2		// and a copy for later
	pslldq	xmm2, 8			// contribution to high half
	psrldq	xmm3, 4			// contribution to low half
	pxor	xmm1, xmm3
	pxor	xmm0, xmm2

	// And then shift the low bits up.
	movdqa	xmm2, xmm0		// copies of the high part
	movdqa	xmm3, xmm0
	pxor	xmm1, xmm0		// mix in the unit contribution
	psrld	xmm0, 6
	psrld	xmm2, 9
	psrld	xmm3, 10
	pxor	xmm1, xmm2		// low half, unit, and t^9 contribs
	pxor	xmm0, xmm3		// t^6 and t^10 contribs
	pxor	xmm0, xmm1		// mix them together and we're done
.endm

.macro mul192
	// Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave
	// with z = u v in xmm0/xmm1 -- the top halves of the high registers
	// are unimportant.  Clobbers xmm2--xmm7.

	// Start multiplying and accumulating pieces of product.
	//   xmm0 =			// (u_2; u_1)
	//   xmm1 =			// (u_0; ?)
	//   xmm2 =			// (v_2; v_1)
	//   xmm3 =			// (v_0; ?)
	movdqa	xmm4, xmm0		// (u_2; u_1) again
	movdqa	xmm5, xmm0		// (u_2; u_1) yet again
	movdqa	xmm6, xmm0		// (u_2; u_1) again again
	movdqa	xmm7, xmm1		// (u_0; ?) again
	punpcklqdq xmm1, xmm3		// (u_0; v_0)
	pclmulhqhqdq xmm4, xmm2		// u_1 v_1
	pclmullqlqdq xmm3, xmm0		// u_2 v_0
	pclmullqhqdq xmm5, xmm2		// u_2 v_1
	pclmulhqlqdq xmm6, xmm2		// u_1 v_2
	pxor	xmm4, xmm3		// u_2 v_0 + u_1 v_1
	pclmullqlqdq xmm7, xmm2		// u_0 v_2
	pxor	xmm5, xmm6		// b = u_2 v_1 + u_1 v_2
	movdqa	xmm6, xmm0		// (u_2; u_1) like a bad penny
	pxor	xmm4, xmm7		// c = u_0 v_2 + u_1 v_1 + u_2 v_0
	pclmullqlqdq xmm0, xmm2		// a = u_2 v_2
	pclmulhqhqdq xmm6, xmm1		// u_1 v_0
	pclmulhqlqdq xmm2, xmm1		// u_0 v_1
	pclmullqhqdq xmm1, xmm1		// e = u_0 v_0
	pxor	xmm2, xmm6		// d = u_1 v_0 + u_0 v_1

	// Next, the piecing together of the product.
	//   xmm0 =			// (a_1; a_0) = a = u_2 v_2
	//   xmm5 =			// (b_1; b_0) = b = u_1 v_2 + u_2 v_1
	//   xmm4 =			// (c_1; c_0) = c = u_0 v_2 +
	//				//	u_1 v_1 + u_2 v_0
	//   xmm2 =			// (d_1; d_0) = d = u_0 v_1 + u_1 v_0
	//   xmm1 =			// (e_1; e_0) = e = u_0 v_0
	//   xmm3, xmm6, xmm7 spare
	movdqa	xmm3, xmm2		// (d_1; d_0) again
	movdqa	xmm6, xmm5		// (b_1; b_0) again
	pslldq	xmm2, 8			// (0; d_1)
	psrldq	xmm5, 8			// (b_0; 0)
	psrldq	xmm3, 8			// (d_0; 0)
	pslldq	xmm6, 8			// (0; b_1)
	pxor	xmm5, xmm2		// (b_0; d_1)
	pxor	xmm0, xmm6		// x_2 = (a_1; a_0 + b_1)
	pxor	xmm3, xmm1		// x_0 = (e_1 + d_0; e_0)
	pxor	xmm4, xmm5		// x_1 = (b_0 + c_1; c_0 + d_1)

	// Now, shift it right (from GCM's point of view) by one bit, and try
	// to leave the result in less random registers.  After this, we'll
	// be in GCM bizarro-world.
	//   xmm1, xmm2, xmm5, xmm6, xmm7 spare
	movdqa	xmm5, xmm0		// copy x_2
	movdqa	xmm1, xmm4		// copy x_1
	movdqa	xmm2, xmm3		// copy x_0
	psrld	xmm0, 31		// x_2 carries
	psrld	xmm4, 31		// x_1 carries
	psrld	xmm3, 31		// x_0 carries
	pslld	xmm5, 1			// x_2 shifted
	pslld	xmm1, 1			// x_1 shifted
	pslld	xmm2, 1			// x_0 shifted
	movdqa	xmm6, xmm0		// x_2 carry copy
	movdqa	xmm7, xmm4		// x_1 carry copy
	pslldq	xmm0, 4			// x_2 carry shifted
	pslldq	xmm4, 4			// x_1 carry shifted
	pslldq	xmm3, 4			// x_0 carry shifted
	psrldq	xmm6, 12		// x_2 carry out
	psrldq	xmm7, 12		// x_1 carry out
	por	xmm0, xmm5		// (y_5; y_4)
	por	xmm1, xmm4
	por	xmm2, xmm3
	por	xmm1, xmm6		// (y_3; y_2)
	por	xmm2, xmm7		// (y_1; y_0)

	// Next, the reduction.  Our polynomial this time is p(t) = t^192 +
	// t^7 + t^2 + t + 1.  Yes, the magic numbers are the same as the
	// 128-bit case.  I don't know why.

	// First, shift the high bits down.
	//   xmm0 =			// (y_5; y_4)
	//   xmm1 =			// (y_3; y_2)
	//   xmm2 =			// (y_1; y_0)
	//   xmm3--xmm7 spare
	movdqa	xmm3, xmm0		// (y_5; y_4) copy
	movdqa	xmm4, xmm0		// (y_5; y_4) copy
	movdqa	xmm5, xmm0		// (y_5; y_4) copy
	pslld	xmm3, 31		// (y_5; y_4) b_i for t
	pslld	xmm4, 30		// (y_5; y_4) b_i for t^2
	pslld	xmm5, 25		// (y_5; y_4) b_i for t^7
	movq	xmm6, xmm1		// (y_3; 0) copy
	pxor	xmm3, xmm4
	movq	xmm7, xmm1		// (y_3; 0) copy
	pxor	xmm3, xmm5
	movq	xmm5, xmm1		// (y_3; 0) copy
	movdqa	xmm4, xmm3		// (y_5; y_4) b_i combined
	pslld	xmm6, 31		// (y_3; 0) b_i for t
	pslld	xmm7, 30		// (y_3; 0) b_i for t^2
	pslld	xmm5, 25		// (y_3; 0) b_i for t^7
	psrldq	xmm3, 12		// (y_5; y_4) low contrib
	pslldq	xmm4, 4			// (y_5; y_4) high contrib
	pxor	xmm6, xmm7
	pxor	xmm2, xmm3
	pxor	xmm6, xmm5
	pxor	xmm1, xmm4
	pslldq	xmm6, 4
	pxor	xmm2, xmm6

	// And finally shift the low bits up.  Unfortunately, we also have to
	// split the low bits out.
	//   xmm0 =			// (y'_5; y'_4)
	//   xmm1 =			// (y'_3; y'_2)
	//   xmm2 =			// (y'_1; y'_0)
	movdqa	xmm5, xmm1		// copies of (y'_3; y'_2)
	movdqa	xmm6, xmm1
	movdqa	xmm7, xmm1
	psrldq	xmm1, 8			// bring down (y'_2; ?)
	movdqa	xmm3, xmm0		// copies of (y'_5; y'_4)
	movdqa	xmm4, xmm0
	punpcklqdq xmm1, xmm2		// (y'_2; y'_1)
	psrldq	xmm2, 8			// (y'_0; ?)
	pxor	xmm2, xmm5		// low half and unit contrib
	pxor	xmm1, xmm0
	psrld	xmm5, 1
	psrld	xmm0, 1
	psrld	xmm6, 2
	psrld	xmm3, 2
	psrld	xmm7, 7
	psrld	xmm4, 7
	pxor	xmm2, xmm6		// low half, unit, t^2 contribs
	pxor	xmm1, xmm3
	pxor	xmm5, xmm7		// t and t^7 contribs
	pxor	xmm0, xmm4
	pxor	xmm5, xmm2		// mix everything together
	pxor	xmm0, xmm1
	movq	xmm1, xmm5		// shunt (z_0; ?) into proper place
.endm

.macro mul256
	// Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave
	// with z = u v in xmm0/xmm1.  Clobbers xmm2--xmm7.  On 32-bit x86,
	// requires 16 bytes aligned space at SP; on amd64, also clobbers
	// xmm8.

	// Now it's starting to look worthwhile to do Karatsuba.  Suppose
	// u = u_0 + u_1 B and v = v_0 + v_1 B.  Then
	//
	//	u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2
	//
	// Write a, b, and c for the coefficients of B^2, B^1, and B^0,
	// respectively, and let r = u_0 + u_1 and s = v_0 + v_1.  Then
	// observe that
	//
	//	q = r s = (u_0 + u_1) (v_0 + v_1)
	//	        = (u_0 v_0) + (u_1 v_1) + (u_0 v_1 + u_1 v_0)
	//	        = c + a + b
	//
	// The first two terms we've already calculated; the last is the
	// remaining one we want.  We'll set B = t^128.  We know how to do
	// 128-bit multiplications already, and Karatsuba is too annoying
	// there, so there'll be 12 multiplications altogether, rather than
	// the 16 we'd have if we did this the naïve way.
	//
	// On x86, there aren't quite enough registers, so spill one for a
	// bit.  On AMD64, we can keep on going, so it's all good.

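	// As a sketch of the plan in C-flavoured pseudocode, using a
	// hypothetical clmul128 for the three 128-bit carryless
	// multiplications (for exposition only -- the code below
	// interleaves these steps):
	//
	//	a = clmul128(u1, v1);		/* high half */
	//	c = clmul128(u0, v0);		/* low half */
	//	q = clmul128(u1 ^ u0, v1 ^ v0);	/* cross product */
	//	b = q ^ a ^ c;			/* = u_0 v_1 + u_1 v_0 */
	//	z = (a << 256) ^ (b << 128) ^ c;
	//
	// Each clmul128 costs four PCLMULQDQs, which is where the count of
	// twelve multiplications comes from.
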
	//   xmm0 =			// u_1 = (u_11; u_10)
	//   xmm1 =			// u_0 = (u_01; u_00)
	//   xmm2 =			// v_1 = (v_11; v_10)
	//   xmm3 =			// v_0 = (v_01; v_00)
	movdqa	xmm4, xmm0		// u_1 again
#if CPUFAM_X86
	movdqa	[SP + 0], xmm3
#elif CPUFAM_AMD64
	movdqa	xmm8, xmm3
# define V0 xmm8
#endif
	pxor	xmm4, xmm1		// u_* = (u_01 + u_11; u_00 + u_10)
	pxor	xmm3, xmm2		// v_* = (v_01 + v_11; v_00 + v_10)

	// Start by building the cross product, q = u_* v_*.
	movdqa	xmm7, xmm4		// more copies of u_*
	movdqa	xmm5, xmm4
	movdqa	xmm6, xmm4
	pclmullqhqdq xmm4, xmm3		// u_*1 v_*0
	pclmulhqlqdq xmm7, xmm3		// u_*0 v_*1
	pclmullqlqdq xmm5, xmm3		// u_*1 v_*1
	pclmulhqhqdq xmm6, xmm3		// u_*0 v_*0
	pxor	xmm4, xmm7		// u_*1 v_*0 + u_*0 v_*1
	movdqa	xmm7, xmm4
	pslldq	xmm4, 8
	psrldq	xmm7, 8
	pxor	xmm5, xmm4		// q_1
	pxor	xmm6, xmm7		// q_0

	// Next, work on the high half, a = u_1 v_1.
	movdqa	xmm3, xmm0		// more copies of u_1
	movdqa	xmm4, xmm0
	movdqa	xmm7, xmm0
	pclmullqhqdq xmm0, xmm2		// u_11 v_10
	pclmulhqlqdq xmm3, xmm2		// u_10 v_11
	pclmullqlqdq xmm4, xmm2		// u_11 v_11
	pclmulhqhqdq xmm7, xmm2		// u_10 v_10
#if CPUFAM_X86
	movdqa	xmm2, [SP + 0]
# define V0 xmm2
#endif
	pxor	xmm0, xmm3		// u_10 v_11 + u_11 v_10
	movdqa	xmm3, xmm0
	pslldq	xmm0, 8
	psrldq	xmm3, 8
	pxor	xmm4, xmm0		// x_3 = a_1
	pxor	xmm7, xmm3		// a_0

	// Mix that into the product now forming in xmm4--xmm7.
	pxor	xmm5, xmm4		// a_1 + q_1
	pxor	xmm6, xmm7		// a_0 + q_0
	pxor	xmm5, xmm7		// a_0 + (a_1 + q_1)

	// Finally, the low half, c = u_0 v_0.
	movdqa	xmm0, xmm1		// more copies of u_0
	movdqa	xmm3, xmm1
	movdqa	xmm7, xmm1
	pclmullqhqdq xmm1, V0		// u_01 v_00
	pclmulhqlqdq xmm0, V0		// u_00 v_01
	pclmullqlqdq xmm3, V0		// u_01 v_01
	pclmulhqhqdq xmm7, V0		// u_00 v_00
	pxor	xmm0, xmm1		// u_01 v_00 + u_00 v_01
	movdqa	xmm1, xmm0
	pslldq	xmm0, 8
	psrldq	xmm1, 8
	pxor	xmm3, xmm0		// c_1
	pxor	xmm7, xmm1		// x_0 = c_0

	// And mix that in to complete the product.
	pxor	xmm6, xmm3		// (a_0 + q_0) + c_1
	pxor	xmm5, xmm3		// x_2 = a_0 + (a_1 + c_1 + q_1) = a_0 + b_1
	pxor	xmm6, xmm7		// x_1 = (a_0 + c_0 + q_0) + c_1 = b_0 + c_1

#undef V0

	// Now we need to shift that whole lot one bit to the left.  This
	// will also give us an opportunity to put the product back in
	// xmm0--xmm3.  This is a slightly merry dance because it's nearly
	// pipelined but we don't have enough registers.
	//
	// After this, we'll be in GCM bizarro-world.
	movdqa	xmm0, xmm4		// x_3 again
	psrld	xmm4, 31		// x_3 carries
	pslld	xmm0, 1			// x_3 shifted left
	movdqa	xmm3, xmm4		// x_3 copy carries
	movdqa	xmm1, xmm5		// x_2 again
	pslldq	xmm4, 4			// x_3 carries shifted up
	psrld	xmm5, 31		// x_2 carries
	psrldq	xmm3, 12		// x_3 big carry out
	pslld	xmm1, 1			// x_2 shifted left
	por	xmm0, xmm4		// x_3 mixed together
	movdqa	xmm4, xmm5		// x_2 copy carries
	movdqa	xmm2, xmm6		// x_1 again
	pslldq	xmm5, 4			// x_2 carries shifted up
	psrld	xmm6, 31		// x_1 carries
	psrldq	xmm4, 12		// x_2 big carry out
	pslld	xmm2, 1			// x_1 shifted
	por	xmm1, xmm5		// x_2 mixed together
	movdqa	xmm5, xmm6		// x_1 copy carries
	por	xmm1, xmm3		// x_2 with carry from x_3
	movdqa	xmm3, xmm7		// x_0 again
	pslldq	xmm6, 4			// x_1 carries shifted up
	psrld	xmm7, 31		// x_0 carries
	psrldq	xmm5, 12		// x_1 big carry out
	pslld	xmm3, 1			// x_0 shifted
	por	xmm2, xmm6		// x_1 mixed together
	pslldq	xmm7, 4			// x_0 carries shifted up
	por	xmm2, xmm4		// x_1 with carry from x_2
	por	xmm3, xmm7		// x_0 mixed together
	por	xmm3, xmm5		// x_0 with carry from x_1

	// Now we must reduce.  This is essentially the same as the 128-bit
	// case above, but more complicated because everything is bigger.
	// The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.

	// First, shift the high bits down.
	movdqa	xmm4, xmm0		// y_3 again
	movdqa	xmm5, xmm0		// y_3 yet again
	movdqa	xmm6, xmm0		// y_3 again again
	pslld	xmm4, 30		// y_3: b_i for t^2
	pslld	xmm5, 27		// y_3: b_i for t^5
	pslld	xmm6, 22		// y_3: b_i for t^10
	movdqa	xmm7, xmm1		// y_2 again
	pxor	xmm4, xmm5
	movdqa	xmm5, xmm1		// y_2 again
	pxor	xmm4, xmm6
	movdqa	xmm6, xmm1		// y_2 again
	pslld	xmm7, 30		// y_2: b_i for t^2
	pslld	xmm5, 27		// y_2: b_i for t^5
	pslld	xmm6, 22		// y_2: b_i for t^10
	pxor	xmm7, xmm5
	movdqa	xmm5, xmm4
	pxor	xmm7, xmm6
	psrldq	xmm4, 4
	movdqa	xmm6, xmm7
	pslldq	xmm5, 12
	psrldq	xmm7, 4
	pxor	xmm2, xmm4
	pslldq	xmm6, 12
	pxor	xmm3, xmm7
	pxor	xmm1, xmm5
	pxor	xmm2, xmm6

	// And then shift the low bits up.
	movdqa	xmm4, xmm0		// y_3 again
	movdqa	xmm5, xmm1		// y_2 again
	movdqa	xmm6, xmm0		// y_3 yet again
	movdqa	xmm7, xmm1		// y_2 yet again
	pxor	xmm2, xmm0		// y_1 and unit contrib from y_3
	pxor	xmm3, xmm1		// y_0 and unit contrib from y_2
	psrld	xmm0, 2
	psrld	xmm1, 2
	psrld	xmm4, 5
	psrld	xmm5, 5
	psrld	xmm6, 10
	psrld	xmm7, 10
	pxor	xmm0, xmm2		// y_1, with y_3 units and t^2
	pxor	xmm1, xmm3		// y_0, with y_2 units and t^2
	pxor	xmm4, xmm6		// y_3 t^5 and t^10 contribs
	pxor	xmm5, xmm7		// y_2 t^5 and t^10 contribs
	pxor	xmm0, xmm4		// high half of reduced result
	pxor	xmm1, xmm5		// low half; all done
.endm

///--------------------------------------------------------------------------
/// Main code.

// There are a number of representations of field elements in this code and
// it can be confusing.
//
//   * The `external format' consists of a sequence of contiguous bytes in
//     memory called a `block'.  The GCM spec explains how to interpret this
//     block as an element of a finite field.  As discussed extensively, this
//     representation is very annoying for a number of reasons.  On the other
//     hand, this code never actually deals with it directly.
//
//   * The `register format' consists of one or more XMM registers, depending
//     on the block size.  The bytes in these registers are in reverse order
//     -- so the least-significant byte of the lowest-numbered register holds
//     the /last/ byte in the block.  If the block size is not a multiple of
//     16 bytes, then there must be padding.  96-bit blocks are weird: the
//     padding is inserted at the /least/ significant end, so the register
//     holds (0, x_0; x_1, x_2); otherwise, the padding goes at the most
//     significant end.
//
//   * The `words' format consists of a sequence of bytes, as in the
//     `external format', but, according to the blockcipher in use, the bytes
//     within each 32-bit word may be reversed (`big-endian') or not
//     (`little-endian').  Accordingly, there are separate entry points for
//     each variant, identified with `b' or `l'.

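// Since the register format is just the byte-reversal of the block, a
// plain C model of loading it is simply the following (a hypothetical
// helper for exposition; the entry points below do the equivalent with
// PSHUFB or PSHUFD, according to the words format in use):
//
//	#include <stdint.h>
//
//	/* Load a 16-byte block b into register format r. */
//	static void load_reg(uint8_t r[16], const uint8_t b[16])
//	  { for (int i = 0; i < 16; i++) r[i] = b[15 - i]; }
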
#define SSEFUNC(f)							\
	FUNC(f##_avx); vzeroupper; endprologue; ENDFUNC;		\
	FUNC(f)

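// `SSEFUNC(f)' defines two entry points: `f_avx', which issues VZEROUPPER
// (avoiding the SSE/AVX transition penalty on AVX-capable CPUs) and then
// falls straight through into the plain SSE implementation `f'.
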
SSEFUNC(gcm_mulk_128b_x86ish_pclmul)
	// On entry, A points to a 128-bit field element in big-endian words
	// format; K points to a field-element in register format.  On exit,
	// A is updated with the product A K.

#if CPUFAM_X86
	mov	A, [SP + 4]
	mov	K, [SP + 8]
#endif
	endprologue
	movdqu	xmm0, [A]
	movdqu	xmm1, [K]
	pshufd	xmm0, xmm0, SHUF(3, 2, 1, 0)
	mul128
	pshufd	xmm0, xmm0, SHUF(3, 2, 1, 0)
	movdqu	[A], xmm0
	ret
ENDFUNC

SSEFUNC(gcm_mulk_128l_x86ish_pclmul)
	// On entry, A points to a 128-bit field element in little-endian
	// words format; K points to a field-element in register format.  On
	// exit, A is updated with the product A K.

#if CPUFAM_X86
	mov	A, [SP + 4]
	mov	K, [SP + 8]
	ldgot	ecx
#endif
	endprologue
	movdqa	xmm7, [INTADDR(swaptab_128l, ecx)]
	movdqu	xmm0, [A]
	movdqu	xmm1, [K]
	pshufb	xmm0, xmm7
	mul128
	pshufb	xmm0, xmm7
	movdqu	[A], xmm0
	ret
ENDFUNC

SSEFUNC(gcm_mulk_64b_x86ish_pclmul)
	// On entry, A points to a 64-bit field element in big-endian words
	// format; K points to a field-element in register format.  On exit,
	// A is updated with the product A K.

#if CPUFAM_X86
	mov	A, [SP + 4]
	mov	K, [SP + 8]
#endif
	endprologue
	movq	xmm0, [A]
	movq	xmm1, [K]
	pshufd	xmm0, xmm0, SHUF(1, 0, 3, 3)
	mul64
	pshufd	xmm0, xmm0, SHUF(1, 0, 3, 3)
	movq	[A], xmm0
	ret
ENDFUNC

SSEFUNC(gcm_mulk_64l_x86ish_pclmul)
	// On entry, A points to a 64-bit field element in little-endian
	// words format; K points to a field-element in register format.  On
	// exit, A is updated with the product A K.

#if CPUFAM_X86
	mov	A, [SP + 4]
	mov	K, [SP + 8]
	ldgot	ecx
#endif
	endprologue
	movdqa	xmm7, [INTADDR(swaptab_64l, ecx)]
	movq	xmm0, [A]
	movq	xmm1, [K]
	pshufb	xmm0, xmm7
	mul64
	pshufb	xmm0, xmm7
	movq	[A], xmm0
	ret
ENDFUNC

SSEFUNC(gcm_mulk_96b_x86ish_pclmul)
	// On entry, A points to a 96-bit field element in big-endian words
	// format; K points to a field-element in register format (i.e., 16
	// bytes, with the first four bytes zero).  On exit, A is updated
	// with the product A K.

#if CPUFAM_X86
	mov	A, [SP + 4]
	mov	K, [SP + 8]
#endif
	endprologue
	movq	xmm0, [A + 0]
	movd	xmm2, [A + 8]
	movdqu	xmm1, [K]
	punpcklqdq xmm0, xmm2
	pshufd	xmm0, xmm0, SHUF(3, 2, 1, 0)
	mul96
	pshufd	xmm1, xmm0, SHUF(3, 2, 1, 0)
	psrldq	xmm0, 4
	movq	[A + 0], xmm1
	movd	[A + 8], xmm0
	ret
ENDFUNC

SSEFUNC(gcm_mulk_96l_x86ish_pclmul)
	// On entry, A points to a 96-bit field element in little-endian
	// words format; K points to a field-element in register format
	// (i.e., 16 bytes, with the first four bytes zero).  On exit, A is
	// updated with the product A K.

#if CPUFAM_X86
	mov	A, [SP + 4]
	mov	K, [SP + 8]
	ldgot	ecx
#endif
	endprologue
	movdqa	xmm7, [INTADDR(swaptab_128l, ecx)]
	movq	xmm0, [A + 0]
	movd	xmm2, [A + 8]
	movdqu	xmm1, [K]
	punpcklqdq xmm0, xmm2
	pshufb	xmm0, xmm7
	mul96
	pshufb	xmm0, xmm7
	movq	[A + 0], xmm0
	psrldq	xmm0, 8
	movd	[A + 8], xmm0
	ret
ENDFUNC

SSEFUNC(gcm_mulk_192b_x86ish_pclmul)
	// On entry, A points to a 192-bit field element in big-endian words
	// format; K points to a field-element in register format.  On exit,
	// A is updated with the product A K.

#if CPUFAM_X86
	mov	A, [SP + 4]
	mov	K, [SP + 8]
#endif
#if CPUFAM_AMD64 && ABI_WIN
	stalloc	2*16 + 8
	savexmm	xmm6, 0
	savexmm	xmm7, 16
#endif
	endprologue
	movdqu	xmm0, [A + 8]
	movq	xmm1, [A + 0]
	movdqu	xmm2, [K + 0]
	movq	xmm3, [K + 16]
	pshufd	xmm0, xmm0, SHUF(3, 2, 1, 0)
	pshufd	xmm1, xmm1, SHUF(1, 0, 3, 3)
	mul192
	pshufd	xmm0, xmm0, SHUF(3, 2, 1, 0)
	pshufd	xmm1, xmm1, SHUF(1, 0, 3, 3)
	movdqu	[A + 8], xmm0
	movq	[A + 0], xmm1
#if CPUFAM_AMD64 && ABI_WIN
	rstrxmm	xmm6, 0
	rstrxmm	xmm7, 16
	stfree	2*16 + 8
#endif
	ret
ENDFUNC

SSEFUNC(gcm_mulk_192l_x86ish_pclmul)
	// On entry, A points to a 192-bit field element in little-endian
	// words format; K points to a field-element in register format.  On
	// exit, A is updated with the product A K.

#if CPUFAM_X86
	mov	A, [SP + 4]
	mov	K, [SP + 8]
	ldgot	ecx
#endif
#if CPUFAM_AMD64 && ABI_WIN
	stalloc	2*16 + 8
	savexmm	xmm6, 0
	savexmm	xmm7, 16
#endif
	endprologue
	movdqu	xmm0, [A + 8]
	movq	xmm1, [A + 0]
	movdqu	xmm2, [K + 0]
	movq	xmm3, [K + 16]
	pshufb	xmm0, [INTADDR(swaptab_128l, ecx)]
	pshufb	xmm1, [INTADDR(swaptab_64l, ecx)]
	mul192
	pshufb	xmm0, [INTADDR(swaptab_128l, ecx)]
	pshufb	xmm1, [INTADDR(swaptab_64l, ecx)]
	movdqu	[A + 8], xmm0
	movq	[A + 0], xmm1
#if CPUFAM_AMD64 && ABI_WIN
	rstrxmm	xmm6, 0
	rstrxmm	xmm7, 16
	stfree	2*16 + 8
#endif
	ret
ENDFUNC

SSEFUNC(gcm_mulk_256b_x86ish_pclmul)
	// On entry, A points to a 256-bit field element in big-endian words
	// format; K points to a field-element in register format.  On exit,
	// A is updated with the product A K.

#if CPUFAM_X86
	pushreg	BP
	setfp
	mov	A, [SP + 8]
	mov	K, [SP + 12]
	stalloc	16
	and	SP, ~15
#endif
#if CPUFAM_AMD64 && ABI_WIN
	stalloc	3*16 + 8
	savexmm	xmm6, 0
	savexmm	xmm7, 16
	savexmm	xmm8, 32
#endif
	endprologue
	movdqu	xmm0, [A + 16]
	movdqu	xmm1, [A + 0]
	movdqu	xmm2, [K + 0]
	movdqu	xmm3, [K + 16]
	pshufd	xmm0, xmm0, SHUF(3, 2, 1, 0)
	pshufd	xmm1, xmm1, SHUF(3, 2, 1, 0)
	mul256
	pshufd	xmm0, xmm0, SHUF(3, 2, 1, 0)
	pshufd	xmm1, xmm1, SHUF(3, 2, 1, 0)
	movdqu	[A + 16], xmm0
	movdqu	[A + 0], xmm1
#if CPUFAM_X86
	dropfp
	popreg	BP
#endif
#if CPUFAM_AMD64 && ABI_WIN
	rstrxmm	xmm6, 0
	rstrxmm	xmm7, 16
	rstrxmm	xmm8, 32
	stfree	3*16 + 8
#endif
	ret
ENDFUNC

SSEFUNC(gcm_mulk_256l_x86ish_pclmul)
	// On entry, A points to a 256-bit field element in little-endian
	// words format; K points to a field-element in register format.  On
	// exit, A is updated with the product A K.

#if CPUFAM_X86
	pushreg	BP
	setfp
	mov	A, [SP + 8]
	mov	K, [SP + 12]
	stalloc	16
	ldgot	ecx
	and	SP, ~15
#endif
#if CPUFAM_AMD64 && ABI_WIN
	stalloc	3*16 + 8
	savexmm	xmm6, 0
	savexmm	xmm7, 16
	savexmm	xmm8, 32
#endif
	endprologue
	movdqa	xmm7, [INTADDR(swaptab_128l, ecx)]
	movdqu	xmm0, [A + 16]
	movdqu	xmm1, [A + 0]
	movdqu	xmm2, [K + 0]
	movdqu	xmm3, [K + 16]
	pshufb	xmm0, xmm7
	pshufb	xmm1, xmm7
	mul256
	movdqa	xmm7, [INTADDR(swaptab_128l, ecx)]
	pshufb	xmm0, xmm7
	pshufb	xmm1, xmm7
	movdqu	[A + 16], xmm0
	movdqu	[A + 0], xmm1
#if CPUFAM_X86
	dropfp
	popreg	BP
#endif
#if CPUFAM_AMD64 && ABI_WIN
	rstrxmm	xmm6, 0
	rstrxmm	xmm7, 16
	rstrxmm	xmm8, 32
	stfree	3*16 + 8
#endif
	ret
ENDFUNC

	RODATA

	.balign	16
swaptab_128l:
	// Table for byte-swapping little-endian words-format blocks larger
	// than 64 bits.
	.byte	15, 14, 13, 12, 11, 10, 9, 8
	.byte	7, 6, 5, 4, 3, 2, 1, 0

	.balign	16
swaptab_64l:
	// Table for byte-swapping 64-bit little-endian words-format blocks.
	.byte	7, 6, 5, 4, 3, 2, 1, 0
	.byte	255, 255, 255, 255, 255, 255, 255, 255

///----- That's all, folks --------------------------------------------------