/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
///
/// Large SIMD-based multiplications
///
/// (c) 2016 Straylight/Edgeware
///
///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

	.arch	pentium4

	.text

///--------------------------------------------------------------------------
/// Theory.
///
/// We define a number of primitive fixed-size multipliers from which we can
/// construct more general variable-length multipliers.
///
/// The basic trick is the same throughout.  In an operand-scanning
/// multiplication, the inner multiplication loop multiplies a multiple-
/// precision operand by a single-precision factor, and adds the result,
/// appropriately shifted, to the result.  A `finely integrated operand
/// scanning' implementation of Montgomery multiplication also adds the
/// product of a single-precision `Montgomery factor' and the modulus,
/// calculated in the same pass.  The more common `coarsely integrated
/// operand scanning' alternates main multiplication and Montgomery passes,
/// which requires additional carry propagation.
///
/// Throughout both plain-multiplication and Montgomery stages, then, one of
/// the factors remains constant throughout the operation, so we can afford
/// to take a little time to preprocess it.  The transformation we perform is
/// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
/// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
/// operands, as follows.
///
///	Offset	   12	    8	    4	    0
///	    0	v''_1	v''_0	 v'_1	 v'_0
///	   16	v''_3	v''_2	 v'_3	 v'_2
///
/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
/// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
/// results in 64-bit fields.  The sixteen bits of headroom allows us to add
/// many products together before we must deal with carrying; it also allows
/// for some calculations to be performed on the above expanded form.
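///
/// As a rough illustration (not part of the build, and with ad-hoc names),
/// the following C fragment models the splitting trick for a single 32-bit
/// word v_i of the preprocessed factor and a 32-bit word r of the other
/// operand: each half-product fits in 48 bits, leaving sixteen bits of
/// headroom in a 64-bit lane, and a single `pmuludq' against the expanded
/// form performs exactly this pair of multiplications in one instruction.
///
///	#include <stdint.h>
///
///	/* Multiply r by v_i = v'_i + v''_i b, where b = 2^16. */
///	static inline void mul_step(uint32_t r, uint32_t v_i,
///				    uint64_t *lo, uint64_t *hi)
///	{
///		uint32_t v_lo = v_i & 0xffff, v_hi = v_i >> 16;
///
///		*lo = (uint64_t)r*v_lo;		/* r v'_i  < 2^48 */
///		*hi = (uint64_t)r*v_hi;		/* r v''_i < 2^48 */
///
///		/* The full product r v_i is *lo + (*hi << 16). */
///	}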
/// We maintain four `carry' registers XMM12--XMM15 accumulating intermediate
/// results.  The registers' precise roles rotate during the computation; we
/// name them `c0', `c1', `c2', and `c3'.  Each carry register holds two
/// 64-bit halves: the register c0, for example, holds c'_0 (low half) and
/// c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the
/// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2 +
/// c_3 B^3.  The `pmuludq' instruction acting on a scalar operand (broadcast
/// across all lanes of its vector) and an operand in the expanded form above
/// produces a result which can be added directly to the appropriate carry
/// register.  Following a pass of four multiplications, we perform some
/// limited carry propagation: let t = c''_0 mod B, and let d = c'_0 + t b;
/// then we output z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and
/// cycle the carry registers around, so that c1 becomes c0, and the old
/// (implicitly) zeroed c0 becomes c3.
///
/// On 64-bit AMD64, we have a reasonable number of registers: the expanded
/// operands are kept in registers.  The packed operands are read from memory
/// into working registers XMM4 and XMM5; XMM0--XMM3 are used for the actual
/// multiplications; and XMM6 and XMM7 are used for combining the results.
/// The following conventional argument names and locations are used
/// throughout.
///
///	Arg	Format	    Location	Notes
///
///	U	packed	    [RAX]
///	X	packed	    [RBX]	In Montgomery multiplication, X = N
///	V	expanded    XMM8/XMM9
///	Y	expanded    XMM10/XMM11	In Montgomery multiplication, Y = (A + U V) M
///	M	expanded    (see below)	Montgomery factor, M = -N^{-1} (mod B^4)
///	N			Modulus, for Montgomery multiplication
///	A	packed	    [RDI]	Destination/accumulator
///	C	carry	    XMM12--XMM15
///
/// The calculation is some variant of
///
///	A' + C' B^4 <- U V + X Y + A + C
///
/// The low-level functions fit into a fairly traditional (finely-integrated)
/// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y)
/// (indexed by i).
///
/// The variants are as follows.
///
///	Function	Variant			Use	    i	j
///
///	mmul4		A = C = 0, Y = M	Montgomery  0	0
///	dmul4		A = 0			Montgomery  0	+
///	mmla4		C = 0, Y = M		Montgomery  +	0
///	dmla4		exactly as shown	Montgomery  +	+
///	mont4		U = C = 0, V = M	Montgomery  any	0
///
///	mul4zc		U = V = A = C = 0	Plain	    0	0
///	mul4		U = V = A = 0		Plain	    0	+
///	mla4zc		U = V = C = 0		Plain	    +	0
///	mla4		U = V = 0		Plain	    +	+
///
/// The `mmul4' and `mmla4' functions are also responsible for calculating
/// the Montgomery reduction factor Y = (A + U V) M used by the rest of the
/// inner loop.

///--------------------------------------------------------------------------
/// Macro definitions.

.macro	mulcore	r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
	// Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
	// of the product in registers D0, D1, D2, D3.
	pshufd	\d0, \r, SHUF(3, \i, 3, \i) // (?, r_i; ?, r_i)
  .ifnes "\d1", "nil"
	movdqa	\d1, \slo		// (s''_1, s''_0; s'_1, s'_0)
  .endif
  .ifnes "\d3", "nil"
	movdqa	\d3, \shi		// (s''_3, s''_2; s'_3, s'_2)
  .endif
  .ifnes "\d1", "nil"
	psrldq	\d1, 4			// (0, s''_1; s''_0, s'_1)
  .endif
  .ifnes "\d2", "nil"
	movdqa	\d2, \d0		// another copy of (?, r_i; ?, r_i)
  .endif
  .ifnes "\d3", "nil"
	psrldq	\d3, 4			// (0, s''_3; s''_2, s'_3)
  .endif
  .ifnes "\d1", "nil"
	pmuludq	\d1, \d0		// (r_i s''_1; r_i s'_1)
  .endif
  .ifnes "\d3", "nil"
	pmuludq	\d3, \d0		// (r_i s''_3; r_i s'_3)
  .endif
  .ifnes "\d2", "nil"
	pmuludq	\d2, \shi		// (r_i s''_2; r_i s'_2)
  .endif
	pmuludq	\d0, \slo		// (r_i s''_0; r_i s'_0)
.endm

.macro	accum	c0, c1=nil, c2=nil, c3=nil
	// Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
	// carry registers C0--C3.
Any or all of C1--C3 may be `nil' to skip // updating that register. paddq \c0, xmm0 .ifnes "\c1", "nil" paddq \c1, xmm1 .endif .ifnes "\c2", "nil" paddq \c2, xmm2 .endif .ifnes "\c3", "nil" paddq \c3, xmm3 .endif .endm .macro mulacc r, i, slo, shi, c0=nil, c1=nil, c2=nil, c3=nil, z3p=nil // Multiply R_I by the expanded operand SLO/SHI, and accumulate in // carry registers C0, C1, C2, C3. If Z3P is `t' then C3 notionally // contains zero, but needs clearing; in practice, we store the // product directly rather than attempting to add. On completion, // XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P is not `t'. .ifeqs "\z3p", "t" mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, \c3 accum \c0, \c1, \c2 .else mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, xmm3 accum \c0, \c1, \c2, \c3 .endif .endm .macro propout d, pos, c, cc=nil // Calculate an output word from C, and store it at POS in D; // propagate carries out from C to CC in preparation for a rotation // of the carry registers. D is an XMM register; the POS is either // `lo' or `hi' according to whether the output word should be in // lane 0 or 1 of D; the high two lanes of D are clobbered. On // completion, XMM3 is clobbered. If CC is `nil', then the // contribution which would have been added to it is left in C. pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (t = c'' mod B, ?; ?, ?) psrldq xmm3, 12 // (0, 0; 0, t) = (0; t) pslldq xmm3, 2 // (0; t b) paddq \c, xmm3 // (c''; c' + t b) .ifeqs "\pos", "lo" movdqa \d, \c .else punpckldq \d, \c .endif psrlq \c, 32 // floor(c/B) .ifnes "\cc", "nil" paddq \cc, \c // propagate up .endif .endm .macro endprop d, pos, c, t // On entry, C contains a carry register. On exit, the low 32 bits // of the value represented in C are written at POS in D, and the // remaining bits are left at the bottom of T. movdqa \t, \c psllq \t, 16 // (c'' b; ?) pslldq \c, 8 // (c'; 0) paddq \t, \c // (c' + c'' b; ?) psrldq \t, 8 // (0; c' + c'' b) = (0; c) .ifeqs "\pos", "lo" movdqa \d, \t .else punpckldq \d, \t .endif psrldq \t, 4 // (floor(c/B); 0) .endm .macro expand z, a, b, c=nil, d=nil // On entry, A and C hold packed 128-bit values, and Z is zero. On // exit, A:B and C:D together hold the same values in expanded // form. If C is `nil', then only expand A to A:B. movdqa \b, \a // (a_3, a_2; a_1, a_0) .ifnes "\c", "nil" movdqa \d, \c // (c_3, c_2; c_1, c_0) .endif punpcklwd \a, \z // (a''_1, a'_1; a''_0, a'_0) punpckhwd \b, \z // (a''_3, a'_3; a''_2, a'_2) .ifnes "\c", "nil" punpcklwd \c, \z // (c''_1, c'_1; c''_0, c'_0) punpckhwd \d, \z // (c''_3, c'_3; c''_2, c'_2) .endif pshufd \a, \a, SHUF(3, 1, 2, 0) // (a''_1, a''_0; a'_1, a'_0) pshufd \b, \b, SHUF(3, 1, 2, 0) // (a''_3, a''_2; a'_3, a'_2) .ifnes "\c", "nil" pshufd \c, \c, SHUF(3, 1, 2, 0) // (c''_1, c''_0; c'_1, c'_0) pshufd \d, \d, SHUF(3, 1, 2, 0) // (c''_3, c''_2; c'_3, c'_2) .endif .endm .macro squash c0, c1, c2, c3, t, u, lo, hi=nil // On entry, C0, C1, C2, C3 are carry registers representing a value // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2, // C3, T, and U are clobbered; and the high bits of Y are stored in // HI, if this is not `nil'. // The first step is to eliminate the `double-prime' pieces -- i.e., // the ones offset by 16 bytes from a 32-bit boundary -- by carrying // them into the 32-bit-aligned pieces above and below. But before // we can do that, we must gather them together. 
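	// (In scalar terms, and purely for illustration: a double-prime
	// piece ypp contributes ypp * 2^16 to its 32-bit position, and the
	// two pieces
	//
	//	uint64_t down = (ypp & 0xffff) << 16;	/* stays in place */
	//	uint64_t up   = ypp >> 16;		/* moves up one position */
	//
	// satisfy ypp * 2^16 = down + up * 2^32, which is exactly the split
	// that the shifts below compute.)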
movdqa \t, \c0 movdqa \u, \c1 punpcklqdq \t, \c2 // (y'_2; y'_0) punpckhqdq \c0, \c2 // (y''_2; y''_0) punpcklqdq \u, \c3 // (y'_3; y'_1) punpckhqdq \c1, \c3 // (y''_3; y''_1) // Now split the double-prime pieces. The high (up to) 48 bits will // go up; the low 16 bits go down. movdqa \c2, \c0 movdqa \c3, \c1 psllq \c2, 48 psllq \c3, 48 psrlq \c0, 16 // high parts of (y''_2; y''_0) psrlq \c1, 16 // high parts of (y''_3; y''_1) psrlq \c2, 32 // low parts of (y''_2; y''_0) psrlq \c3, 32 // low parts of (y''_3; y''_1) .ifnes "\hi", "nil" movdqa \hi, \c1 .endif pslldq \c1, 8 // high part of (y''_1; 0) paddq \t, \c2 // propagate down paddq \u, \c3 paddq \t, \c1 // and up: (y_2; y_0) paddq \u, \c0 // (y_3; y_1) .ifnes "\hi", "nil" psrldq \hi, 8 // high part of (0; y''_3) .endif // Finally extract the answer. This complicated dance is better than // storing to memory and loading, because the piecemeal stores // inhibit store forwarding. movdqa \c3, \t // (?; y_0) movdqa \lo, \t // (?, ?; ?, y^*_0) psrldq \t, 8 // (0; y_2) psrlq \c3, 32 // (floor(y_0/B); ?) paddq \c3, \u // (y_1 + floor(y_0/B); ?) movdqa \c1, \c3 // (?, ?; ?, y^*_1) psrldq \u, 8 // (0; y_3) psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2; ?) paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2; ?) punpckldq \lo, \c3 // (?, ?; y^*_2, y^*_0) psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3; ?) paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3; ?) .ifnes "\hi", "nil" movdqa \t, \c3 pxor \u, \u .endif punpckldq \c1, \c3 // (?, ?; y^*_3, y^*_1) .ifnes "\hi", "nil" psrlq \t, 32 // very high bits of y paddq \hi, \t punpcklqdq \hi, \u // carry up .endif punpckldq \lo, \c1 // y mod B^4 .endm .macro carryadd // On entry, RDI points to a packed addend A, and XMM12, XMM13, XMM14 // hold the incoming carry registers c0, c1, and c2 representing a // carry-in C. // // On exit, the carry registers, including XMM15, are updated to hold // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other // registers are preserved. movd xmm0, [rdi + 0] // (0; a_0) movd xmm1, [rdi + 4] // (0; a_1) movd xmm2, [rdi + 8] // (0; a_2) movd xmm15, [rdi + 12] // (0; a_3) paddq xmm12, xmm0 // (c''_0; c'_0 + a_0) paddq xmm13, xmm1 // (c''_1; c'_1 + a_1) paddq xmm14, xmm2 // (c''_2 + a_3 b; c'_2 + a_2) .endm ///-------------------------------------------------------------------------- /// Primitive multipliers and related utilities. INTFUNC(carryprop) // On entry, XMM12, XMM13, and XMM14 hold a 144-bit carry in an // expanded form. Store the low 128 bits of the represented carry to // [RDI] as a packed 128-bit value, and leave the remaining 16 bits // in the low 32 bits of XMM12. On exit, XMM0, XMM1, XMM3, XMM13 and // XMM14 are clobbered. endprologue propout xmm0, lo, xmm12, xmm13 propout xmm1, lo, xmm13, xmm14 propout xmm0, hi, xmm14, nil endprop xmm1, hi, xmm14, xmm12 punpckldq xmm0, xmm1 movdqu [rdi], xmm0 ret ENDFUNC INTFUNC(dmul4) // On entry, RDI points to the destination buffer; RAX and RBX point // to the packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the // incoming carry registers c0, c1, and c2; c3 is assumed to be zero. // // On exit, we write the low 128 bits of the sum C + U V + X Y to // [RDI], and update the carry registers with the carry out. The // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose // registers are preserved. 
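	// (Not part of the build: the `propout' step used repeatedly below
	// amounts to the following scalar computation on one carry register,
	// whose halves c_lo and c_hi represent c_lo + (c_hi << 16).  The
	// names are ad hoc, <stdint.h> is assumed, and the algorithm's
	// headroom invariants keep the sums within 64 bits.
	//
	//	static uint32_t propout_model(uint64_t c_lo, uint64_t c_hi,
	//				      uint64_t *next_lo,
	//				      uint64_t *next_hi)
	//	{
	//		uint64_t t = c_hi & 0xffffffff;	/* t = c'' mod B */
	//		uint64_t d = c_lo + (t << 16);	/* d = c' + t b */
	//
	//		*next_lo += d >> 32;	/* floor(d/B), up to c'_1 */
	//		*next_hi += c_hi >> 32;	/* floor(c''/B), up to c''_1 */
	//		return (uint32_t)d;	/* output word z = d mod B */
	//	}
	// )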
endprologue movdqu xmm4, [rax] movdqu xmm5, [rbx] mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15, t mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 propout xmm6, lo, xmm12, xmm13 mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12 propout xmm7, lo, xmm13, xmm14 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13 propout xmm6, hi, xmm14, xmm15 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14 propout xmm7, hi, xmm15, xmm12 punpckldq xmm6, xmm7 movdqu [rdi], xmm6 ret ENDFUNC INTFUNC(dmla4) // On entry, RDI points to the destination buffer, which also // contains an addend A to accumulate; RAX and RBX point to the // packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the // incoming carry registers c0, c1, and c2 representing a carry-in C; // c3 is assumed to be zero. // // On exit, we write the low 128 bits of the sum A + C + U V + X Y to // [RDI], and update the carry registers with the carry out. The // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose // registers are preserved. endprologue movdqu xmm4, [rax] movdqu xmm5, [rbx] carryadd mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 propout xmm6, lo, xmm12, xmm13 mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12 propout xmm7, lo, xmm13, xmm14 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13 propout xmm6, hi, xmm14, xmm15 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14 propout xmm7, hi, xmm15, xmm12 punpckldq xmm6, xmm7 movdqu [rdi], xmm6 ret ENDFUNC INTFUNC(mul4zc) // On entry, RDI points to the destination buffer; RBX points to a // packed operand X; and XMM10/XMM11 hold an expanded operand Y. // // On exit, we write the low 128 bits of the product X Y to [RDI], // and set the carry registers XMM12, XMM13, XMM14 to the carry out. // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the // general-purpose registers are preserved. endprologue movdqu xmm5, [rbx] mulcore xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 propout xmm6, lo, xmm12, xmm13 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t propout xmm7, lo, xmm13, xmm14 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t propout xmm6, hi, xmm14, xmm15 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t propout xmm7, hi, xmm15, xmm12 punpckldq xmm6, xmm7 movdqu [rdi], xmm6 ret ENDFUNC INTFUNC(mul4) // On entry, RDI points to the destination buffer; RBX points to a // packed operand X; XMM10/XMM11 hold an expanded operand Y; and // XMM12, XMM13, XMM14 hold the incoming carry registers c0, c1, and // c2, representing a carry-in C; c3 is assumed to be zero. // // On exit, we write the low 128 bits of the sum C + X Y to [RDI], // and update the carry registers with the carry out. The registers // XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the // general-purpose registers are preserved. 
endprologue movdqu xmm5, [rbx] mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, t propout xmm6, lo, xmm12, xmm13 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t propout xmm7, lo, xmm13, xmm14 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t propout xmm6, hi, xmm14, xmm15 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t propout xmm7, hi, xmm15, xmm12 punpckldq xmm6, xmm7 movdqu [rdi], xmm6 ret ENDFUNC INTFUNC(mla4zc) // On entry, RDI points to the destination buffer, which also // contains an addend A to accumulate; RBX points to a packed operand // X; and XMM10/XMM11 points to an expanded operand Y. // // On exit, we write the low 128 bits of the sum A + X Y to [RDI], // and set the carry registers XMM12, XMM13, XMM14 to the carry out. // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the // general-purpose registers are preserved. endprologue movdqu xmm5, [rbx] movd xmm12, [rdi + 0] movd xmm13, [rdi + 4] movd xmm14, [rdi + 8] movd xmm15, [rdi + 12] mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 propout xmm6, lo, xmm12, xmm13 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t propout xmm7, lo, xmm13, xmm14 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t propout xmm6, hi, xmm14, xmm15 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t propout xmm7, hi, xmm15, xmm12 punpckldq xmm6, xmm7 movdqu [rdi], xmm6 ret ENDFUNC INTFUNC(mla4) // On entry, RDI points to the destination buffer, which also // contains an addend A to accumulate; RBX points to a packed operand // X; XMM10/XMM11 holds an expanded operand Y; and XMM12, XMM13, // XMM14 hold the incoming carry registers c0, c1, and c2, // representing a carry-in C; c3 is assumed to be zero. // // On exit, we write the low 128 bits of the sum A + C + X Y to // [RDI], and update the carry registers with the carry out. The // registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the // general-purpose registers are preserved. endprologue movdqu xmm5, [rbx] carryadd mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 propout xmm6, lo, xmm12, xmm13 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t propout xmm7, lo, xmm13, xmm14 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t propout xmm6, hi, xmm14, xmm15 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t propout xmm7, hi, xmm15, xmm12 punpckldq xmm6, xmm7 movdqu [rdi], xmm6 ret ENDFUNC INTFUNC(mmul4) // On entry, RDI points to the destination buffer; RAX and RBX point // to the packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold // the expanded operands V and M. The stack pointer must be 8 modulo // 16 (as usual for AMD64 ABIs). // // On exit, we store Y = U V M mod B in XMM10/XMM11, and write the // low 128 bits of the sum U V + N Y to [RDI], leaving the remaining // carry in XMM12, XMM13, and XMM14. The registers XMM0--XMM7, and // XMM15 are clobbered; the general-purpose registers are preserved. movdqu xmm4, [rax] #if ABI_WIN stalloc 48 + 8 // space for the carries #endif endprologue // Calculate W = U V, and leave it in XMM7. Stash the carry pieces // for later. mulcore xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15 propout xmm7, lo, xmm12, xmm13 jmp 5f ENDFUNC INTFUNC(mmla4) // On entry, RDI points to the destination buffer, which also // contains an addend A to accumulate; RAX and RBX point to the // packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold the // expanded operands V and M. 
	// The stack pointer must be 8 modulo 16 (as usual for AMD64 ABIs).
	//
	// On exit, we store Y = (A + U V) M mod B in XMM10/XMM11, and write
	// the low 128 bits of the sum A + U V + N Y to [RDI], leaving the
	// remaining carry in XMM12, XMM13, and XMM14.  The registers
	// XMM0--XMM7, and XMM15 are clobbered; the general-purpose registers
	// are preserved.

	movdqu	xmm4, [rax]
#if ABI_WIN
	stalloc	48 + 8			// space for the carries
#  define STKTMP(i) [SP + i]
#endif
#if ABI_SYSV
#  define STKTMP(i) [SP + i - 48 - 8]	// use red zone
#endif
	endprologue

	movd	xmm12, [rdi +  0]
	movd	xmm13, [rdi +  4]
	movd	xmm14, [rdi +  8]
	movd	xmm15, [rdi + 12]

	// Calculate W = U V, and leave it in XMM7.  Stash the carry pieces
	// for later.
	mulacc	xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
	propout	xmm7, lo, xmm12, xmm13

5:	mulacc	xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
	propout	xmm6, lo, xmm13, xmm14

	mulacc	xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
	propout	xmm7, hi, xmm14, xmm15

	mulacc	xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
	propout	xmm6, hi, xmm15, xmm12

	// Prepare W, and stash carries for later.
	punpckldq xmm7, xmm6
	movdqa	STKTMP( 0), xmm12
	movdqa	STKTMP(16), xmm13
	movdqa	STKTMP(32), xmm14

	// Calculate Y = W M.  We just about have enough spare registers to
	// make this work.
	mulcore	xmm7, 0, xmm10, xmm11, xmm3, xmm4, xmm5, xmm6

	// Start expanding W back into the main carry registers...
	pxor	xmm15, xmm15
	movdqa	xmm12, xmm7
	movdqa	xmm14, xmm7

	mulcore	xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2
	accum	xmm4, xmm5, xmm6

	punpckldq xmm12, xmm15		// (0, w_1; 0, w_0)
	punpckhdq xmm14, xmm15		// (0, w_3; 0, w_2)

	mulcore	xmm7, 2, xmm10, xmm11, xmm0, xmm1
	accum	xmm5, xmm6

	pxor	xmm2, xmm2
	movdqa	xmm13, xmm12
	movdqa	xmm15, xmm14

	mulcore	xmm7, 3, xmm10, xmm11, xmm0
	accum	xmm6

	punpckldq xmm12, xmm2		// (0, 0; 0, w_0)
	punpckldq xmm14, xmm2		// (0, 0; 0, w_2)
	punpckhdq xmm13, xmm2		// (0, 0; 0, w_1)
	punpckhdq xmm15, xmm2		// (0, 0; 0, w_3)

	// That's lots of pieces.  Now we have to assemble the answer.
	squash	xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10

	// Expand it.
	movdqu	xmm5, [rbx]
	expand	xmm2, xmm10, xmm11

	// Finish the calculation by adding the Montgomery product.
	mulacc	xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
	propout	xmm6, lo, xmm12, xmm13

	mulacc	xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
	propout	xmm7, lo, xmm13, xmm14

	mulacc	xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
	propout	xmm6, hi, xmm14, xmm15

	mulacc	xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
	propout	xmm7, hi, xmm15, xmm12

	punpckldq xmm6, xmm7

	// Add on the carry we calculated earlier.
	paddq	xmm12, STKTMP( 0)
	paddq	xmm13, STKTMP(16)
	paddq	xmm14, STKTMP(32)

	// And, with that, we're done.
	movdqu	[rdi], xmm6
#if ABI_WIN
	stfree	56
#endif
	ret

#undef STKTMP

ENDFUNC
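/// (Illustrative only, not part of the build: the Montgomery factor computed
/// by `mmul4' and `mmla4' above, and by `mont4' below, amounts to the
/// following scalar computation, with ad-hoc names.  Given the low 128 bits
/// w of the partial sum and m = -n^{-1} (mod B^4), the product
/// y = w m mod B^4 satisfies w + n y == 0 (mod B^4), so adding n y clears
/// the bottom four 32-bit words and only the carry out of them survives.
///
///	#include <stdint.h>
///
///	static void mont_factor(uint32_t y[4], const uint32_t w[4],
///				const uint32_t m[4])
///	{
///		uint32_t acc[4] = { 0 };
///
///		for (int j = 0; j < 4; j++) {
///			uint64_t c = 0;
///			for (int i = 0; i < 4 - j; i++) {
///				uint64_t t = (uint64_t)acc[i + j] +
///					(uint64_t)w[j]*m[i] + c;
///				acc[i + j] = (uint32_t)t;  c = t >> 32;
///			}
///		}
///		for (int i = 0; i < 4; i++) y[i] = acc[i];
///	}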
INTFUNC(mont4)
	// On entry, RDI points to the destination buffer holding a packed
	// value W; RBX points to a packed operand N; and XMM8/XMM9 hold an
	// expanded operand M.
	//
	// On exit, we store Y = W M mod B in XMM10/XMM11, and write the low
	// 128 bits of the sum W + N Y to [RDI], leaving the remaining carry
	// in XMM12, XMM13, and XMM14.  The registers XMM0--XMM3, XMM5--XMM7,
	// and XMM15 are clobbered; the general-purpose registers are
	// preserved.
	endprologue

	movdqu	xmm7, [rdi]

	// Calculate Y = W M.  Avoid the standard carry registers, because
	// we're setting something else up there.
	mulcore	xmm7, 0, xmm8, xmm9, xmm3, xmm4, xmm5, xmm6

	// Start expanding W back into the main carry registers...
	pxor	xmm15, xmm15
	movdqa	xmm12, xmm7
	movdqa	xmm14, xmm7

	mulcore	xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2
	accum	xmm4, xmm5, xmm6

	punpckldq xmm12, xmm15		// (0, w_1; 0, w_0)
	punpckhdq xmm14, xmm15		// (0, w_3; 0, w_2)

	mulcore	xmm7, 2, xmm8, xmm9, xmm0, xmm1
	accum	xmm5, xmm6

	pxor	xmm2, xmm2
	movdqa	xmm13, xmm12
	movdqa	xmm15, xmm14

	mulcore	xmm7, 3, xmm8, xmm9, xmm0
	accum	xmm6

	punpckldq xmm12, xmm2		// (0, 0; 0, w_0)
	punpckldq xmm14, xmm2		// (0, 0; 0, w_2)
	punpckhdq xmm13, xmm2		// (0, 0; 0, w_1)
	punpckhdq xmm15, xmm2		// (0, 0; 0, w_3)

	// That's lots of pieces.  Now we have to assemble the answer.
	squash	xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10

	// Expand it.
	movdqu	xmm5, [rbx]
	expand	xmm2, xmm10, xmm11

	// Finish the calculation by adding the Montgomery product.
	mulacc	xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
	propout	xmm6, lo, xmm12, xmm13

	mulacc	xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
	propout	xmm7, lo, xmm13, xmm14

	mulacc	xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
	propout	xmm6, hi, xmm14, xmm15

	mulacc	xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
	propout	xmm7, hi, xmm15, xmm12

	punpckldq xmm6, xmm7

	// And, with that, we're done.
	movdqu	[rdi], xmm6
	ret

ENDFUNC

///--------------------------------------------------------------------------
/// Bulk multipliers.

FUNC(mpx_umul4_amd64_avx)
	.arch	.avx
	vzeroupper
	endprologue
	.arch	pentium4
ENDFUNC

FUNC(mpx_umul4_amd64_sse2)
	// void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
	//			     const mpw *bv, const mpw *bvl);

	// Establish the arguments and do initial setup.
	//
	//		   sysv	win
	// inner loop dv   rdi	rdi*
	// inner loop av   rbx*	rbx*
	// outer loop dv   r10	rcx
	// outer loop bv   rcx	r9
	// av base	   rsi	rdx
	// av limit	   rdx	r8
	// bv limit	   r8	r10

#if ABI_SYSV
#  define DV r10
#  define AV rsi
#  define AVL rdx
#  define BV rcx
#  define BVL r8

	pushreg	rbx
	endprologue

	mov	DV, rdi
#endif

#if ABI_WIN
#  define DV rcx
#  define AV rdx
#  define AVL r8
#  define BV r9
#  define BVL r10

	pushreg	rbx
	pushreg	rdi
	stalloc	160 + 8

	savexmm	xmm6,	 0
	savexmm	xmm7,	16
	savexmm	xmm8,	32
	savexmm	xmm9,	48
	savexmm	xmm10,	64
	savexmm	xmm11,	80
	savexmm	xmm12,	96
	savexmm	xmm13, 112
	savexmm	xmm14, 128
	savexmm	xmm15, 144

	endprologue

	mov	rdi, DV
	mov	BVL, [SP + 224]
#endif

	// Prepare for the first iteration.
	pxor	xmm0, xmm0
	movdqu	xmm10, [BV]		// bv[0]
	mov	rbx, AV
	add	DV, 16
	add	BV, 16
	expand	xmm0, xmm10, xmm11
	call	mul4zc
	add	rbx, 16
	add	rdi, 16
	cmp	rbx, AVL		// all done?
	jae	8f

	.p2align 4
	// Continue with the first iteration.
0:	call	mul4
	add	rbx, 16
	add	rdi, 16
	cmp	rbx, AVL		// all done?
	jb	0b

	// Write out the leftover carry.  There can be no tail here.
8:	call	carryprop
	cmp	BV, BVL			// more passes to do?
	jae	9f

	.p2align 4
	// Set up for the next pass.
1:	movdqu	xmm10, [BV]		// bv[i]
	mov	rdi, DV			// -> dv[i]
	pxor	xmm0, xmm0
	expand	xmm0, xmm10, xmm11
	mov	rbx, AV			// -> av[0]
	add	DV, 16
	add	BV, 16
	call	mla4zc
	add	rbx, 16
	add	rdi, 16
	cmp	rbx, AVL		// done yet?
	jae	8f

	.p2align 4
	// Continue...
0:	call	mla4
	add	rbx, 16
	add	rdi, 16
	cmp	rbx, AVL
	jb	0b

	// Finish off this pass.  There was no tail on the previous pass, and
	// there can be none on this pass.
8:	call	carryprop
	cmp	BV, BVL
	jb	1b

	// All over.
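	// (For the record, and not part of the build: the loop structure
	// above is the classic operand-scanning multiply, four words at a
	// time.  An ad-hoc scalar model, one word at a time and assuming
	// <stdint.h> and <stddef.h>, is
	//
	//	static void umul_model(uint32_t *dv,
	//			       const uint32_t *av, size_t na,
	//			       const uint32_t *bv, size_t nb)
	//	{
	//		for (size_t k = 0; k < na + nb; k++) dv[k] = 0;
	//		for (size_t i = 0; i < nb; i++) {
	//			uint64_t c = 0;
	//			for (size_t j = 0; j < na; j++) {
	//				uint64_t t = (uint64_t)dv[i + j] +
	//					(uint64_t)av[j]*bv[i] + c;
	//				dv[i + j] = (uint32_t)t;
	//				c = t >> 32;
	//			}
	//			dv[i + na] = (uint32_t)c;
	//		}
	//	}
	//
	// with `mul4zc'/`mul4' covering the first pass, `mla4zc'/`mla4' the
	// later passes, and `carryprop' writing out each pass's carry.)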
9: #if ABI_SYSV popreg rbx #endif #if ABI_WIN rstrxmm xmm6, 0 rstrxmm xmm7, 16 rstrxmm xmm8, 32 rstrxmm xmm9, 48 rstrxmm xmm10, 64 rstrxmm xmm11, 80 rstrxmm xmm12, 96 rstrxmm xmm13, 112 rstrxmm xmm14, 128 rstrxmm xmm15, 144 stfree 160 + 8 popreg rdi popreg rbx #endif ret #undef DV #undef AV #undef AVL #undef BV #undef BVL ENDFUNC FUNC(mpxmont_mul4_amd64_avx) .arch .avx vzeroupper endprologue .arch pentium4 ENDFUNC FUNC(mpxmont_mul4_amd64_sse2) // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv, // const mpw *nv, size_t n, const mpw *mi); // Establish the arguments and do initial setup. // // sysv win // inner loop dv rdi rdi* // inner loop av rax rax // inner loop nv rbx* rbx* // mi r9 r10 // outer loop dv r10 rcx // outer loop bv rdx r8 // av base rsi rdx // av limit r11 r11 // bv limit r8 r12* // nv base rcx r9 // n r8 r12* #if ABI_SYSV # define DV r10 # define AV rsi # define AVL r11 # define BV rdx # define BVL r8 # define NV rcx # define N r8 # define MI r9 pushreg rbx endprologue mov DV, rdi #endif #if ABI_WIN # define DV rcx # define AV rdx # define AVL r11 # define BV r8 # define BVL r12 # define NV r9 # define N r12 # define MI r10 pushreg rbx pushreg rdi pushreg r12 stalloc 160 savexmm xmm6, 0 savexmm xmm7, 16 savexmm xmm8, 32 savexmm xmm9, 48 savexmm xmm10, 64 savexmm xmm11, 80 savexmm xmm12, 96 savexmm xmm13, 112 savexmm xmm14, 128 savexmm xmm15, 144 endprologue mov rdi, DV mov N, [SP + 224] mov MI, [SP + 232] #endif // Establish the expanded operands. pxor xmm0, xmm0 movdqu xmm8, [BV] // bv[0] movdqu xmm10, [MI] // mi expand xmm0, xmm8, xmm9, xmm10, xmm11 // Set up the outer loop state and prepare for the first iteration. mov rax, AV // -> U = av[0] mov rbx, NV // -> X = nv[0] lea AVL, [AV + 4*N] // -> av[n/4] = av limit lea BVL, [BV + 4*N] // -> bv[n/4] = bv limit add BV, 16 add DV, 16 call mmul4 add rdi, 16 add rax, 16 add rbx, 16 cmp rax, AVL // done already? jae 8f .p2align 4 // Complete the first inner loop. 0: call dmul4 add rdi, 16 add rax, 16 add rbx, 16 cmp rax, AVL // done yet? jb 0b // Still have carries left to propagate. call carryprop movd [rdi + 16], xmm12 .p2align 4 // Embark on the next iteration. (There must be one. If n = 1, then // we would have bailed above, to label 8. Similarly, the subsequent // iterations can fall into the inner loop immediately.) 1: pxor xmm0, xmm0 movdqu xmm8, [BV] // bv[i] movdqu xmm10, [MI] // mi mov rdi, DV // -> Z = dv[i] mov rax, AV // -> U = av[0] mov rbx, NV // -> X = nv[0] expand xmm0, xmm8, xmm9, xmm10, xmm11 add BV, 16 add DV, 16 call mmla4 add rdi, 16 add rax, 16 add rbx, 16 .p2align 4 // Complete the next inner loop. 0: call dmla4 add rdi, 16 add rax, 16 add rbx, 16 cmp rax, AVL jb 0b // Still have carries left to propagate, and they overlap the // previous iteration's final tail, so read that in and add it. movd xmm0, [rdi] paddq xmm12, xmm0 call carryprop movd [rdi + 16], xmm12 // Back again, maybe. cmp BV, BVL jb 1b // All done. 9: #if ABI_SYSV popreg rbx #endif #if ABI_WIN rstrxmm xmm6, 0 rstrxmm xmm7, 16 rstrxmm xmm8, 32 rstrxmm xmm9, 48 rstrxmm xmm10, 64 rstrxmm xmm11, 80 rstrxmm xmm12, 96 rstrxmm xmm13, 112 rstrxmm xmm14, 128 rstrxmm xmm15, 144 stfree 160 popreg r12 popreg rdi popreg rbx #endif ret // First iteration was short. Write out the carries and we're done. // (This could be folded into the main loop structure, but that would // penalize small numbers more.) 
8: call carryprop movd [rdi + 16], xmm12 #if ABI_SYSV popreg rbx ret #endif #if ABI_WIN jmp 9b #endif #undef DV #undef AV #undef AVL #undef BV #undef BVL #undef NV #undef N #undef MI ENDFUNC FUNC(mpxmont_redc4_amd64_avx) .arch .avx vzeroupper endprologue .arch pentium4 ENDFUNC FUNC(mpxmont_redc4_amd64_sse2) // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv, // size_t n, const mpw *mi); // Establish the arguments and do initial setup. // // sysv win // inner loop dv rdi rdi* // dv limit rax rax // blocks-of-4 dv limit rsi rdx // inner loop nv rbx* rbx* // mi r8 r10 // outer loop dv r10 rcx // outer loop dv limit r11 r11 // nv base rdx r8 // nv limit r9 r10* // n rcx r9 // c rcx r9 #if ABI_SYSV # define DVL rax # define DVL4 rsi # define MI r8 # define DV r10 # define DVLO r11 # define NV rdx # define NVL r9 # define N rcx # define C ecx pushreg rbx endprologue mov DV, rdi #endif #if ABI_WIN # define DVL rax # define DVL4 rdx # define MI r10 # define DV rcx # define DVLO r11 # define NV r8 # define NVL r10 # define N r9 # define C r9d pushreg rbx pushreg rdi stalloc 168 savexmm xmm6, 0 savexmm xmm7, 16 savexmm xmm8, 32 savexmm xmm9, 48 savexmm xmm10, 64 savexmm xmm11, 80 savexmm xmm12, 96 savexmm xmm13, 112 savexmm xmm14, 128 savexmm xmm15, 144 endprologue mov rdi, DV mov MI, [SP + 224] #endif // Establish the expanded operands and the blocks-of-4 dv limit. pxor xmm0, xmm0 mov DVL, DVL4 // -> dv[n] = dv limit sub DVL4, DV // length of dv in bytes movdqu xmm8, [MI] // mi and DVL4, ~15 // mask off the tail end expand xmm0, xmm8, xmm9 add DVL4, DV // find limit // Set up the outer loop state and prepare for the first iteration. mov rbx, NV // -> X = nv[0] lea DVLO, [DV + 4*N] // -> dv[n/4] = outer dv limit lea NVL, [NV + 4*N] // -> nv[n/4] = nv limit add DV, 16 call mont4 add rbx, 16 add rdi, 16 cmp rbx, NVL // done already? jae 8f .p2align 4 // Complete the first inner loop. 5: call mla4 add rbx, 16 add rdi, 16 cmp rbx, NVL // done yet? jb 5b // Still have carries left to propagate. 8: carryadd psllq xmm15, 16 pslldq xmm15, 8 paddq xmm14, xmm15 call carryprop movd C, xmm12 add rdi, 16 cmp rdi, DVL4 jae 7f .p2align 4 // Continue carry propagation until the end of the buffer. 0: add [rdi], C mov C, 0 // preserves flags adc dword ptr [rdi + 4], 0 adc dword ptr [rdi + 8], 0 adc dword ptr [rdi + 12], 0 adc C, 0 add rdi, 16 cmp rdi, DVL4 jb 0b // Deal with the tail end. Note that the actual destination length // won't be an exacty number of blocks of four, so it's safe to just // drop through here. 7: add [rdi], C mov C, 0 add rdi, 4 adc C, 0 cmp rdi, DVL jb 7b // All done for this iteration. Start the next. cmp DV, DVLO // all done yet? jae 9f mov rdi, DV // -> Z = dv[i] mov rbx, NV // -> X = nv[0] add DV, 16 call mont4 add rdi, 16 add rbx, 16 jmp 5b // All over. 9: #if ABI_SYSV popreg rbx #endif #if ABI_WIN rstrxmm xmm6, 0 rstrxmm xmm7, 16 rstrxmm xmm8, 32 rstrxmm xmm9, 48 rstrxmm xmm10, 64 rstrxmm xmm11, 80 rstrxmm xmm12, 96 rstrxmm xmm13, 112 rstrxmm xmm14, 128 rstrxmm xmm15, 144 stfree 168 popreg rdi popreg rbx #endif ret #undef DVL #undef DVL4 #undef MI #undef DV #undef DVLO #undef NV #undef NVL #undef N #undef C ENDFUNC ///-------------------------------------------------------------------------- /// Testing and performance measurement. 
#ifdef TEST_MUL4 #if ABI_SYSV # define ARG0 rdi # define ARG1 rsi # define ARG2 rdx # define ARG3 rcx # define ARG4 r8 # define ARG5 r9 # define ARG6 STKARG(0) # define ARG7 STKARG(1) # define ARG8 STKARG(2) # define STKARG_OFFSET 16 #endif #if ABI_WIN # define ARG0 rcx # define ARG1 rdx # define ARG2 r8 # define ARG3 r9 # define ARG4 STKARG(0) # define ARG5 STKARG(1) # define ARG6 STKARG(2) # define ARG7 STKARG(3) # define ARG8 STKARG(4) # define STKARG_OFFSET 224 #endif #define STKARG(i) [SP + STKARG_OFFSET + 8*(i)] // sysv win // dmul smul mmul mont dmul smul mmul mont // A rax // D rdx // z rdi rdi rdi rdi rdi rcx rcx rcx rcx // c rcx rsi rsi rsi rsi rdx rdx rdx rdx // y r10 -- -- rdx rdx -- -- r8 r8 // u r11 rdx -- rcx -- r8 -- r9 -- // x rbx rcx rdx r8 rcx r9 r8 stk0 r9 // vv xmm8/9 r8 -- r9 r8 stk0 -- stk1 stk0 // yy xmm10/11 r9 rcx stk0 -- stk1 r9 stk2 -- // n r8 stk0 r8 stk1 r9 stk2 stk0 stk3 stk1 // cyv r9 stk1 r9 stk2 stk0 stk3 stk1 stk4 stk2 .macro cysetup v, n rdtsc shl rdx, 32 or rax, rdx mov [\v + 8*\n - 8], rax .endm .macro cystore v, n rdtsc shl rdx, 32 or rax, rdx sub rax, [\v + 8*\n - 8] mov [\v + 8*\n - 8], rax dec \n .endm .macro testprologue mode pushreg rbx #if ABI_SYSV endprologue .ifeqs "\mode", "dmul" mov rbx, rcx movdqu xmm8, [r8] movdqu xmm10, [r9] mov r8d, STKARG(0) mov r9, STKARG(1) mov r11, rdx mov rcx, rsi .endif .ifeqs "\mode", "smul" mov rbx, rdx movdqu xmm10, [rcx] mov rcx, rsi .endif .ifeqs "\mode", "mmul" mov rax, STKARG(0) mov rbx, r8 movdqu xmm8, [r9] movdqu xmm10, [rax] mov r8d, STKARG(1) mov r9, STKARG(2) mov r10, rdx mov r11, rcx mov rcx, rsi .endif .ifeqs "\mode", "mont" mov rbx, rcx movdqu xmm8, [r8] mov r8d, r9d mov r9, STKARG(0) mov r10, rdx mov rcx, rsi .endif #endif #if ABI_WIN pushreg rdi stalloc 168 savexmm xmm6, 0 savexmm xmm7, 16 savexmm xmm8, 32 savexmm xmm9, 48 savexmm xmm10, 64 savexmm xmm11, 80 savexmm xmm12, 96 savexmm xmm13, 112 savexmm xmm14, 128 savexmm xmm15, 144 endprologue .ifeqs "\mode", "dmul" mov r10, STKARG(0) mov r11, STKARG(1) mov rdi, rcx mov rcx, rdx mov rbx, r9 movdqu xmm8, [r10] movdqu xmm10, [r11] mov r11, r8 mov r8d, STKARG(2) mov r9, STKARG(3) .endif .ifeqs "\mode", "smul" mov rdi, rcx mov rcx, rdx mov rbx, r8 movdqu xmm10, [r9] mov r8d, STKARG(0) mov r9, STKARG(1) .endif .ifeqs "\mode", "mmul" mov r10, STKARG(1) mov r11, STKARG(2) mov rdi, rcx mov rcx, rdx mov rbx, STKARG(0) movdqu xmm8, [r10] movdqu xmm10, [r11] mov r10, r8 mov r11, r9 mov r8d, STKARG(3) mov r9, STKARG(4) .endif .ifeqs "\mode", "mont" mov r10, STKARG(0) mov rdi, rcx mov rcx, rdx mov rbx, r9 movdqu xmm8, [r10] mov r10, r8 mov r8d, STKARG(1) mov r9, STKARG(2) .endif #endif pxor xmm0, xmm0 .ifeqs "\mode", "dmul" expand xmm0, xmm8, xmm9, xmm10, xmm11 .endif .ifeqs "\mode", "smul" expand xmm0, xmm10, xmm11 .endif .ifeqs "\mode", "mmul" expand xmm0, xmm8, xmm9, xmm10, xmm11 .endif .ifeqs "\mode", "mont" expand xmm0, xmm8, xmm9 .endif .endm .macro testepilogue #if ABI_WIN rstrxmm xmm6, 0 rstrxmm xmm7, 16 rstrxmm xmm8, 32 rstrxmm xmm9, 48 rstrxmm xmm10, 64 rstrxmm xmm11, 80 rstrxmm xmm12, 96 rstrxmm xmm13, 112 rstrxmm xmm14, 128 rstrxmm xmm15, 144 stfree 168 popreg rdi #endif popreg rbx ret .endm .macro testldcarry movdqu xmm12, [rcx + 0] // (c''_0; c'_0) movdqu xmm13, [rcx + 16] // (c''_1; c'_1) movdqu xmm14, [rcx + 32] // (c''_2; c'_2) .endm .macro testtop u=nil .p2align 4 0: cysetup r9, r8 .ifnes "\u", "nil" mov rax, \u .endif .endm .macro testtail cystore r9, r8 jnz 0b .endm .macro testcarryout movdqu [rcx + 0], xmm12 movdqu [rcx + 16], xmm13 movdqu 
[rcx + 32], xmm14 .endm FUNC(test_dmul4) testprologue dmul testldcarry testtop r11 call dmul4 testtail testcarryout testepilogue ENDFUNC FUNC(test_dmla4) testprologue dmul testldcarry testtop r11 call dmla4 testtail testcarryout testepilogue ENDFUNC FUNC(test_mul4) testprologue smul testldcarry testtop nil call mul4 testtail testcarryout testepilogue ENDFUNC FUNC(test_mul4zc) testprologue smul testldcarry testtop nil call mul4zc testtail testcarryout testepilogue ENDFUNC FUNC(test_mla4) testprologue smul testldcarry testtop nil call mla4 testtail testcarryout testepilogue ENDFUNC FUNC(test_mla4zc) testprologue smul testldcarry testtop nil call mla4zc testtail testcarryout testepilogue ENDFUNC FUNC(test_mmul4) testprologue mmul testtop r11 call mmul4 testtail pshufd xmm10, xmm10, SHUF(3, 1, 2, 0) pshufd xmm11, xmm11, SHUF(3, 1, 2, 0) movdqu [r10 + 0], xmm10 movdqu [r10 + 16], xmm11 testcarryout testepilogue ENDFUNC FUNC(test_mmla4) testprologue mmul testtop r11 call mmla4 testtail pshufd xmm10, xmm10, SHUF(3, 1, 2, 0) pshufd xmm11, xmm11, SHUF(3, 1, 2, 0) movdqu [r10 + 0], xmm10 movdqu [r10 + 16], xmm11 testcarryout testepilogue ENDFUNC FUNC(test_mont4) testprologue mont testtop call mont4 testtail pshufd xmm10, xmm10, SHUF(3, 1, 2, 0) pshufd xmm11, xmm11, SHUF(3, 1, 2, 0) movdqu [r10 + 0], xmm10 movdqu [r10 + 16], xmm11 testcarryout testepilogue ENDFUNC #endif ///----- That's all, folks --------------------------------------------------