/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
///
/// Large SIMD-based multiplications
///
/// (c) 2016 Straylight/Edgeware
///
///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

	.arch	pentium4

	.text

///--------------------------------------------------------------------------
/// Theory.
///
/// We define a number of primitive fixed-size multipliers from which we can
/// construct more general variable-length multipliers.
///
/// The basic trick is the same throughout.  In an operand-scanning
/// multiplication, the inner multiplication loop multiplies a
/// multiple-precision operand by a single-precision factor, and adds the
/// product, appropriately shifted, to the result.  A `finely integrated
/// operand scanning' implementation of Montgomery multiplication also adds
/// the product of a single-precision `Montgomery factor' and the modulus,
/// calculated in the same pass.  The more common `coarsely integrated
/// operand scanning' alternates main multiplication and Montgomery passes,
/// which requires additional carry propagation.
///
/// Throughout both plain-multiplication and Montgomery stages, then, one of
/// the factors remains constant throughout the operation, so we can afford
/// to take a little time to preprocess it.  The transformation we perform is
/// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
/// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
/// operands, as follows.
///
///   Offset      0       4       8      12
///      0      v'_0    v'_1   v''_0   v''_1
///     16      v'_2    v'_3   v''_2   v''_3
///
/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
/// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
/// results in 64-bit fields.  The sixteen bits of headroom allows us to add
/// many products together before we must deal with carrying; it also allows
/// for some calculations to be performed on the above expanded form.
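///
/// As a concrete illustration, here is a small C sketch (informative only;
/// it is not assembled into the library) of the expansion just described.
/// The helper name `expand_ref' and the use of bare uint32_t words are our
/// own; the real code does this with punpcklwd/punpckhwd and pshufd in the
/// `expand' macro below.
///
///	#include <stdint.h>
///
///	/* Expand v[0..3] into the two SSE images described above:
///	 * lo = (v'_0, v'_1; v''_0, v''_1), hi = (v'_2, v'_3; v''_2, v''_3),
///	 * one 16-bit piece per 32-bit lane. */
///	static void expand_ref(uint32_t lo[4], uint32_t hi[4],
///			       const uint32_t v[4])
///	{
///		lo[0] = v[0] & 0xffff; lo[1] = v[1] & 0xffff;
///		lo[2] = v[0] >> 16;    lo[3] = v[1] >> 16;
///		hi[0] = v[2] & 0xffff; hi[1] = v[3] & 0xffff;
///		hi[2] = v[2] >> 16;    hi[3] = v[3] >> 16;
///	}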
///
/// On 32-bit x86, we are register starved: the expanded operands are kept in
/// memory, typically in warm L1 cache.
///
/// We maintain four `carry' registers accumulating intermediate results.
/// The registers' precise roles rotate during the computation; we name them
/// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
/// `pmuludq' instruction acting on a scalar operand (broadcast across all
/// lanes of its vector) and an operand in the expanded form above produces a
/// result which can be added directly to the appropriate carry register.
/// Following a pass of four multiplications, we perform some limited carry
/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
/// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
/// zeroed and becomes c3.

///--------------------------------------------------------------------------
/// Macro definitions.

.macro	mulcore	r, s, d0, d1=nil, d2=nil, d3=nil
	// Load a word r_i from R, multiply by the expanded operand [S], and
	// leave the pieces of the product in registers D0, D1, D2, D3.
	movd	\d0, \r			// (r_i, 0; 0, 0)
  .ifnes "\d1", "nil"
	movdqa	\d1, [\s]		// (s'_0, s'_1; s''_0, s''_1)
  .endif
  .ifnes "\d3", "nil"
	movdqa	\d3, [\s + 16]		// (s'_2, s'_3; s''_2, s''_3)
  .endif
	pshufd	\d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?)
  .ifnes "\d1", "nil"
	psrldq	\d1, 4			// (s'_1, s''_0; s''_1, 0)
  .endif
  .ifnes "\d2", "nil"
    .ifnes "\d3", "nil"
	movdqa	\d2, \d3		// another copy of (s'_2, s'_3; ...)
    .else
	movdqa	\d2, \d0		// another copy of (r_i, ?; r_i, ?)
    .endif
  .endif
  .ifnes "\d3", "nil"
	psrldq	\d3, 4			// (s'_3, s''_2; s''_3, 0)
  .endif
  .ifnes "\d1", "nil"
	pmuludq	\d1, \d0		// (r_i s'_1; r_i s''_1)
  .endif
  .ifnes "\d3", "nil"
	pmuludq	\d3, \d0		// (r_i s'_3; r_i s''_3)
  .endif
  .ifnes "\d2", "nil"
    .ifnes "\d3", "nil"
	pmuludq	\d2, \d0		// (r_i s'_2; r_i s''_2)
    .else
	pmuludq	\d2, [\s + 16]
    .endif
  .endif
	pmuludq	\d0, [\s]		// (r_i s'_0; r_i s''_0)
.endm

.macro	accum	c0, c1=nil, c2=nil, c3=nil
	// Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
	// carry registers C0--C3.  Any or all of C1--C3 may be `nil' to skip
	// updating that register.
	paddq	\c0, xmm0
  .ifnes "\c1", "nil"
	paddq	\c1, xmm1
  .endif
  .ifnes "\c2", "nil"
	paddq	\c2, xmm2
  .endif
  .ifnes "\c3", "nil"
	paddq	\c3, xmm3
  .endif
.endm

.macro	mulacc	r, s, c0, c1, c2, c3, z3p=nil
	// Load a word r_i from R, multiply by the expanded operand [S],
	// and accumulate in carry registers C0, C1, C2, C3.  If Z3P is `t'
	// then C3 notionally contains zero, but needs clearing; in practice,
	// we store the product directly rather than attempting to add.  On
	// completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
	// is not `t'.
  .ifeqs "\z3p", "t"
	mulcore	\r, \s, xmm0, xmm1, xmm2, \c3
	accum	\c0, \c1, \c2
  .else
	mulcore	\r, \s, xmm0, xmm1, xmm2, xmm3
	accum	\c0, \c1, \c2, \c3
  .endif
.endm

.macro	propout	d, c, cc=nil
	// Calculate an output word from C, and store it in D; propagate
	// carries out from C to CC in preparation for a rotation of the
	// carry registers.  On completion, XMM3 is clobbered.  If CC is
	// `nil', then the contribution which would have been added to it is
	// left in C.
	pshufd	xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
	psrldq	xmm3, 12		// (t, 0; 0, 0) = (t, 0)
	pslldq	xmm3, 2			// (t b; 0)
	paddq	\c, xmm3		// (c' + t b; c'')
	movd	\d, \c
	psrlq	\c, 32			// floor(c/B)
  .ifnes "\cc", "nil"
	paddq	\cc, \c			// propagate up
  .endif
.endm
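
/// As an informative aside (not assembled), the limited carry propagation
/// performed by `propout' can be modelled in scalar C as follows.  The name
/// `propout_ref' and the flat uint64_t representation of the two halves of a
/// carry register are our own illustrative choices; the sixteen bits of
/// headroom discussed under `Theory' keep the intermediate sums comfortably
/// below 2^64.
///
///	#include <stdint.h>
///
///	/* A carry register holds c = lo + hi*2^16, lo and hi being 64-bit
///	 * accumulators.  Emit the output word z = d mod B and add the
///	 * carry-out into the next register (*nlo; *nhi). */
///	static uint32_t propout_ref(uint64_t lo, uint64_t hi,
///				    uint64_t *nlo, uint64_t *nhi)
///	{
///		uint64_t t = hi & 0xffffffff;	/* t = c'' mod B */
///		uint64_t d = lo + (t << 16);	/* d = c' + t b */
///		*nlo += d >> 32;		/* floor(d/B) */
///		*nhi += hi >> 32;		/* floor(c''/B) */
///		return (uint32_t)d;		/* z = d mod B */
///	}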

.macro	endprop	d, c, t
	// On entry, C contains a carry register.  On exit, the low 32 bits
	// of the value represented in C are written to D, and the remaining
	// bits are left at the bottom of T.
	movdqa	\t, \c
	psllq	\t, 16			// (?; c'' b)
	pslldq	\c, 8			// (0; c')
	paddq	\t, \c			// (?; c' + c'' b)
	psrldq	\t, 8			// (c' + c'' b; 0) = (c; 0)
	movd	\d, \t
	psrldq	\t, 4			// (floor(c/B); 0)
.endm

.macro	expand	z, a, b, c=nil, d=nil
	// On entry, A and C hold packed 128-bit values, and Z is zero.  On
	// exit, A:B and C:D together hold the same values in expanded
	// form.  If C is `nil', then only expand A to A:B.
	movdqa	\b, \a			// (a_0, a_1; a_2, a_3)
  .ifnes "\c", "nil"
	movdqa	\d, \c			// (c_0, c_1; c_2, c_3)
  .endif
	punpcklwd \a, \z		// (a'_0, a''_0; a'_1, a''_1)
	punpckhwd \b, \z		// (a'_2, a''_2; a'_3, a''_3)
  .ifnes "\c", "nil"
	punpcklwd \c, \z		// (c'_0, c''_0; c'_1, c''_1)
	punpckhwd \d, \z		// (c'_2, c''_2; c'_3, c''_3)
  .endif
	pshufd	\a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
	pshufd	\b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
  .ifnes "\c", "nil"
	pshufd	\c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
	pshufd	\d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
  .endif
.endm

.macro	squash	c0, c1, c2, c3, t, u, lo, hi=nil
	// On entry, C0, C1, C2, C3 are carry registers representing a value
	// Y.  On exit, LO holds the low 128 bits of the carry value; C1, C2,
	// C3, T, and U are clobbered; and the high bits of Y are stored in
	// HI, if this is not `nil'.

	// The first step is to eliminate the `double-prime' pieces -- i.e.,
	// the ones offset by 16 bytes from a 32-bit boundary -- by carrying
	// them into the 32-bit-aligned pieces above and below.  But before
	// we can do that, we must gather them together.
	movdqa	\t, \c0
	movdqa	\u, \c1
	punpcklqdq \t, \c2		// (y'_0; y'_2)
	punpckhqdq \c0, \c2		// (y''_0; y''_2)
	punpcklqdq \u, \c3		// (y'_1; y'_3)
	punpckhqdq \c1, \c3		// (y''_1; y''_3)

	// Now split the double-prime pieces.  The high (up to) 48 bits will
	// go up; the low 16 bits go down.
	movdqa	\c2, \c0
	movdqa	\c3, \c1
	psllq	\c2, 48
	psllq	\c3, 48
	psrlq	\c0, 16			// high parts of (y''_0; y''_2)
	psrlq	\c1, 16			// high parts of (y''_1; y''_3)
	psrlq	\c2, 32			// low parts of (y''_0; y''_2)
	psrlq	\c3, 32			// low parts of (y''_1; y''_3)
  .ifnes "\hi", "nil"
	movdqa	\hi, \c1
  .endif
	pslldq	\c1, 8			// high part of (0; y''_1)

	paddq	\t, \c2			// propagate down
	paddq	\u, \c3
	paddq	\t, \c1			// and up: (y_0; y_2)
	paddq	\u, \c0			// (y_1; y_3)
  .ifnes "\hi", "nil"
	psrldq	\hi, 8			// high part of (y''_3; 0)
  .endif

	// Finally extract the answer.  This complicated dance is better than
	// storing to memory and loading, because the piecemeal stores
	// inhibit store forwarding.
	movdqa	\c3, \t			// (y_0; ?)
	movdqa	\lo, \t			// (y^*_0, ?; ?, ?)
	psrldq	\t, 8			// (y_2; 0)
	psrlq	\c3, 32			// (floor(y_0/B); ?)
	paddq	\c3, \u			// (y_1 + floor(y_0/B); ?)
	movdqa	\c1, \c3		// (y^*_1, ?; ?, ?)
	psrldq	\u, 8			// (y_3; 0)
	psrlq	\c3, 32			// (floor((y_1 B + y_0)/B^2); ?)
	paddq	\c3, \t			// (y_2 + floor((y_1 B + y_0)/B^2); ?)
	punpckldq \lo, \c3		// (y^*_0, y^*_2; ?, ?)
	psrlq	\c3, 32			// (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
	paddq	\c3, \u			// (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
  .ifnes "\hi", "nil"
	movdqa	\t, \c3
	pxor	\u, \u
  .endif
	punpckldq \c1, \c3		// (y^*_1, y^*_3; ?, ?)
  .ifnes "\hi", "nil"
	psrlq	\t, 32			// very high bits of y
	paddq	\hi, \t
	punpcklqdq \hi, \u		// carry up
  .endif
	punpckldq \lo, \c1		// y mod B^4
.endm

.macro	carryadd
	// On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
	// hold the incoming carry registers c0, c1, and c2 representing a
	// carry-in C.
	//
	// On exit, the carry registers, including XMM7, are updated to hold
	// C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered.  The other
	// registers are preserved.
	movd	xmm0, [edi + 0]		// (a_0; 0)
	movd	xmm1, [edi + 4]		// (a_1; 0)
	movd	xmm2, [edi + 8]		// (a_2; 0)
	movd	xmm7, [edi + 12]	// (a_3; 0)
	paddq	xmm4, xmm0		// (c'_0 + a_0; c''_0)
	paddq	xmm5, xmm1		// (c'_1 + a_1; c''_1)
	paddq	xmm6, xmm2		// (c'_2 + a_2; c''_2 + a_3 b)
.endm

///--------------------------------------------------------------------------
/// Primitive multipliers and related utilities.

INTFUNC(carryprop)
	// On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
	// form.  Store the low 128 bits of the represented carry to [EDI] as
	// a packed 128-bit value, and leave the remaining 16 bits in the low
	// 32 bits of XMM4.  On exit, XMM3, XMM5 and XMM6 are clobbered.
	endprologue

	propout	[edi + 0], xmm4, xmm5
	propout	[edi + 4], xmm5, xmm6
	propout	[edi + 8], xmm6, nil
	endprop	[edi + 12], xmm6, xmm4

	ret
ENDFUNC

INTFUNC(dmul4)
	// On entry, EDI points to the destination buffer; EAX and EBX point
	// to the packed operands U and X; ECX and EDX point to the expanded
	// operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
	// registers c0, c1, and c2; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum C + U V + X Y to
	// [EDI], and update the carry registers with the carry out.  The
	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	mulacc	[eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

	mulacc	[eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
	propout	[edi + 4], xmm5, xmm6

	mulacc	[eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
	propout	[edi + 8], xmm6, xmm7

	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
	propout	[edi + 12], xmm7, xmm4

	ret
ENDFUNC

INTFUNC(dmla4)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EAX and EBX point to the
	// packed operands U and X; ECX and EDX point to the expanded
	// operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
	// registers c0, c1, and c2 representing a carry-in C; c3 is assumed
	// to be zero.
	//
	// On exit, we write the low 128 bits of the sum A + C + U V + X Y to
	// [EDI], and update the carry registers with the carry out.  The
	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	carryadd

	mulacc	[eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

	mulacc	[eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
	propout	[edi + 4], xmm5, xmm6

	mulacc	[eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
	propout	[edi + 8], xmm6, xmm7

	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
	propout	[edi + 12], xmm7, xmm4

	ret
ENDFUNC
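
/// A scalar picture of the arithmetic may help here.  The following C sketch
/// (informative only; the name `mac4x4_ref' and the nine-word accumulator
/// are our own) adds one 4x4-word product into an accumulator, with the
/// ninth word standing in for the carry-out which the real routines keep in
/// the XMM carry registers.  A dmul4 call then amounts to two such
/// accumulations, u v and x y, over the same window, plus the incoming
/// carry; dmla4 additionally starts from the addend found at [EDI].
///
///	#include <stdint.h>
///
///	/* Add the 4x4-word product x*y into the 9-word accumulator acc;
///	 * acc[8] collects the carry out of the low eight words. */
///	static void mac4x4_ref(uint32_t acc[9], const uint32_t x[4],
///			       const uint32_t y[4])
///	{
///		for (int i = 0; i < 4; i++) {
///			uint64_t c = 0;
///			for (int j = 0; j < 4; j++) {
///				uint64_t t = (uint64_t)x[j]*y[i] +
///					acc[i + j] + c;
///				acc[i + j] = (uint32_t)t; c = t >> 32;
///			}
///			for (int k = i + 4; c && k < 9; k++) {
///				uint64_t t = (uint64_t)acc[k] + c;
///				acc[k] = (uint32_t)t; c = t >> 32;
///			}
///		}
///	}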

INTFUNC(mul4zc)
	// On entry, EDI points to the destination buffer; EBX points to a
	// packed operand X; and EDX points to an expanded operand Y.
	//
	// On exit, we write the low 128 bits of the product X Y to [EDI],
	// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
	// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	mulcore	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi + 4], xmm5, xmm6

	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi + 8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret
ENDFUNC

INTFUNC(mul4)
	// On entry, EDI points to the destination buffer; EBX points to a
	// packed operand X; EDX points to an expanded operand Y; and XMM4,
	// XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
	// representing a carry-in C; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum C + X Y to [EDI],
	// and update the carry registers with the carry out.  The registers
	// XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
	propout	[edi + 0], xmm4, xmm5

	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi + 4], xmm5, xmm6

	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi + 8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret
ENDFUNC

INTFUNC(mla4zc)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EBX points to a packed operand
	// X; and EDX points to an expanded operand Y.
	//
	// On exit, we write the low 128 bits of the sum A + X Y to [EDI],
	// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
	// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	movd	xmm4, [edi + 0]
	movd	xmm5, [edi + 4]
	movd	xmm6, [edi + 8]
	movd	xmm7, [edi + 12]

	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi + 4], xmm5, xmm6

	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi + 8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret
ENDFUNC

INTFUNC(mla4)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EBX points to a packed operand
	// X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
	// the incoming carry registers c0, c1, and c2, representing a
	// carry-in C; c3 is assumed to be zero.
	//
	// On exit, we write the low 128 bits of the sum A + C + X Y to
	// [EDI], and update the carry registers with the carry out.  The
	// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
	// general-purpose registers are preserved.
	endprologue

	carryadd

	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi + 4], xmm5, xmm6

	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi + 8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	ret
ENDFUNC

INTFUNC(mmul4)
	// On entry, EDI points to the destination buffer; EAX and EBX point
	// to the packed operands U and N; ECX and ESI point to the expanded
	// operands V and M; and EDX points to a place to store an expanded
	// result Y (32 bytes, at a 16-byte boundary).  The stack pointer
	// must be 12 modulo 16, as is usual for modern x86 ABIs.
	//
	// On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
	// of the sum U V + N Y to [EDI], leaving the remaining carry in
	// XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
	// XMM7 are clobbered; the general-purpose registers are preserved.
	stalloc	48 + 12			// space for the carries
	endprologue

	// Calculate W = U V, and leave it in the destination.  Stash the
	// carry pieces for later.
	mulcore	[eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5
	jmp	5f
ENDFUNC

INTFUNC(mmla4)
	// On entry, EDI points to the destination buffer, which also
	// contains an addend A to accumulate; EAX and EBX point to the
	// packed operands U and N; ECX and ESI point to the expanded
	// operands V and M; and EDX points to a place to store an expanded
	// result Y (32 bytes, at a 16-byte boundary).  The stack pointer
	// must be 12 modulo 16, as is usual for modern x86 ABIs.
	//
	// On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
	// bits of the sum A + U V + N Y to [EDI], leaving the remaining
	// carry in XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2,
	// XMM3, and XMM7 are clobbered; the general-purpose registers are
	// preserved.
	stalloc	48 + 12			// space for the carries
	endprologue

	movd	xmm4, [edi + 0]
	movd	xmm5, [edi + 4]
	movd	xmm6, [edi + 8]
	movd	xmm7, [edi + 12]

	// Calculate W = U V, and leave it in the destination.  Stash the
	// carry pieces for later.
	mulacc	[eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

5:	mulacc	[eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi + 4], xmm5, xmm6

	mulacc	[eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi + 8], xmm6, xmm7

	mulacc	[eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	movdqa	[esp + 0], xmm4
	movdqa	[esp + 16], xmm5
	movdqa	[esp + 32], xmm6

	// Calculate Y = W M.
	mulcore	[edi + 0], esi, xmm4, xmm5, xmm6, xmm7

	mulcore	[edi + 4], esi, xmm0, xmm1, xmm2
	accum	xmm5, xmm6, xmm7

	mulcore	[edi + 8], esi, xmm0, xmm1
	accum	xmm6, xmm7

	mulcore	[edi + 12], esi, xmm0
	accum	xmm7

	// That's lots of pieces.  Now we have to assemble the answer.
	squash	xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4

	// Expand it.
	pxor	xmm2, xmm2
	expand	xmm2, xmm4, xmm1
	movdqa	[edx + 0], xmm4
	movdqa	[edx + 16], xmm1

	// Initialize the carry from the value for W we calculated earlier.
	movd	xmm4, [edi + 0]
	movd	xmm5, [edi + 4]
	movd	xmm6, [edi + 8]
	movd	xmm7, [edi + 12]

	// Finish the calculation by adding the Montgomery product.
	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi + 4], xmm5, xmm6

	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi + 8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	// Add on the carry we calculated earlier.
	paddq	xmm4, [esp + 0]
	paddq	xmm5, [esp + 16]
	paddq	xmm6, [esp + 32]

	// And, with that, we're done.
	stfree	48 + 12
	ret
ENDFUNC
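
/// The Montgomery primitives here (mmul4 and mmla4 above, mont4 below) work
/// four words at a time, but the underlying step is easiest to see one word
/// at a time.  The following C sketch is informative only; the name
/// `mont_step_ref' and the single-word granularity are ours, and it assumes
/// the usual convention that the stored factor is m = -n^{-1} mod b.
///
///	#include <stdint.h>
///
///	/* With b = 2^32 and m = -n^{-1} mod b, choose y = w m mod b; then
///	 * w + n y is divisible by b, so it can be shifted down one word
///	 * without changing its residue mod n.  Return (w + n y)/b. */
///	static uint64_t mont_step_ref(uint32_t w, uint32_t n, uint32_t m)
///	{
///		uint32_t y = (uint32_t)((uint64_t)w*m);
///		uint64_t s = w + (uint64_t)n*y;	/* low word is now zero */
///		return s >> 32;
///	}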

INTFUNC(mont4)
	// On entry, EDI points to the destination buffer holding a packed
	// value W; EBX points to a packed operand N; ESI points to an
	// expanded operand M; and EDX points to a place to store an expanded
	// result Y (32 bytes, at a 16-byte boundary).
	//
	// On exit, we write Y = W M mod B to [EDX], and the low 128 bits
	// of the sum W + N Y to [EDI], leaving the remaining carry in
	// XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
	// XMM7 are clobbered; the general-purpose registers are preserved.
	endprologue

	// Calculate Y = W M.
	mulcore	[edi + 0], esi, xmm4, xmm5, xmm6, xmm7

	mulcore	[edi + 4], esi, xmm0, xmm1, xmm2
	accum	xmm5, xmm6, xmm7

	mulcore	[edi + 8], esi, xmm0, xmm1
	accum	xmm6, xmm7

	mulcore	[edi + 12], esi, xmm0
	accum	xmm7

	// That's lots of pieces.  Now we have to assemble the answer.
	squash	xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4

	// Expand it.
	pxor	xmm2, xmm2
	expand	xmm2, xmm4, xmm1
	movdqa	[edx + 0], xmm4
	movdqa	[edx + 16], xmm1

	// Initialize the carry from W.
	movd	xmm4, [edi + 0]
	movd	xmm5, [edi + 4]
	movd	xmm6, [edi + 8]
	movd	xmm7, [edi + 12]

	// Finish the calculation by adding the Montgomery product.
	mulacc	[ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
	propout	[edi + 0], xmm4, xmm5

	mulacc	[ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
	propout	[edi + 4], xmm5, xmm6

	mulacc	[ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
	propout	[edi + 8], xmm6, xmm7

	mulacc	[ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
	propout	[edi + 12], xmm7, xmm4

	// And, with that, we're done.
	ret
ENDFUNC

///--------------------------------------------------------------------------
/// Bulk multipliers.

FUNC(mpx_umul4_x86_avx)
	.arch	.avx
	vzeroupper
	endprologue
	// and drop through...
	.arch	pentium4
ENDFUNC

FUNC(mpx_umul4_x86_sse2)
	// void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
	//			   const mpw *bv, const mpw *bvl);

	// Build a stack frame.  Arguments will be relative to EBP, as
	// follows.
	//
	//	ebp + 20	dv
	//	ebp + 24	av
	//	ebp + 28	avl
	//	ebp + 32	bv
	//	ebp + 36	bvl
	//
	// Locals are relative to ESP, as follows.
	//
	//	esp +  0	expanded Y (32 bytes)
	//	esp + 32	(top of locals)
	pushreg	ebp
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp
	and	esp, ~15
	sub	esp, 32
	endprologue

	// Prepare for the first iteration.
	mov	esi, [ebp + 32]		// -> bv[0]
	pxor	xmm7, xmm7
	movdqu	xmm0, [esi]		// bv[0]
	mov	edi, [ebp + 20]		// -> dv[0]
	mov	ecx, edi		// outer loop dv cursor
	expand	xmm7, xmm0, xmm1
	mov	ebx, [ebp + 24]		// -> av[0]
	mov	eax, [ebp + 28]		// -> av[m] = av limit
	mov	edx, esp		// -> expanded Y = bv[0]
	movdqa	[esp + 0], xmm0		// bv[0] expanded low
	movdqa	[esp + 16], xmm1	// bv[0] expanded high
	call	mul4zc
	add	ebx, 16
	add	edi, 16
	add	ecx, 16
	add	esi, 16
	cmp	ebx, eax		// all done?
	jae	8f

	.p2align 4
	// Continue with the first iteration.
0:	call	mul4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, eax		// all done?
	jb	0b

	// Write out the leftover carry.  There can be no tail here.
8:	call	carryprop
	cmp	esi, [ebp + 36]		// more passes to do?
	jae	9f

	.p2align 4
	// Set up for the next pass.
1:	movdqu	xmm0, [esi]		// bv[i]
	mov	edi, ecx		// -> dv[i]
	pxor	xmm7, xmm7
	expand	xmm7, xmm0, xmm1
	mov	ebx, [ebp + 24]		// -> av[0]
	movdqa	[esp + 0], xmm0		// bv[i] expanded low
	movdqa	[esp + 16], xmm1	// bv[i] expanded high
	call	mla4zc
	add	edi, 16
	add	ebx, 16
	add	ecx, 16
	add	esi, 16
	cmp	ebx, eax		// done yet?
	jae	8f

	.p2align 4
	// Continue...
0:	call	mla4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, eax
	jb	0b

	// Finish off this pass.  There was no tail on the previous pass, and
	// there can be none on this pass.
8:	call	carryprop
	cmp	esi, [ebp + 36]
	jb	1b

	// All over.
9:	dropfp
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
ENDFUNC
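
/// For reference, the whole of mpx_umul4_x86_sse2 computes an ordinary long
/// multiplication; an informative word-at-a-time C equivalent is sketched
/// below.  The name `umul_ref' is ours, and the sketch ignores the real
/// routine's blocking: the assembler code consumes whole four-word blocks of
/// both operands per step, and its callers are expected to arrange suitably
/// sized and padded buffers.
///
///	#include <stddef.h>
///	#include <stdint.h>
///
///	/* dv[0..m+n) := av[0..m) * bv[0..n), schoolbook, 32-bit words. */
///	static void umul_ref(uint32_t *dv, const uint32_t *av, size_t m,
///			     const uint32_t *bv, size_t n)
///	{
///		for (size_t k = 0; k < m + n; k++) dv[k] = 0;
///		for (size_t i = 0; i < n; i++) {
///			uint64_t c = 0;
///			for (size_t j = 0; j < m; j++) {
///				uint64_t t = (uint64_t)av[j]*bv[i] +
///					dv[i + j] + c;
///				dv[i + j] = (uint32_t)t; c = t >> 32;
///			}
///			dv[i + m] = (uint32_t)c;
///		}
///	}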

FUNC(mpxmont_mul4_x86_avx)
	.arch	.avx
	vzeroupper
	endprologue
	// and drop through...
	.arch	pentium4
ENDFUNC

FUNC(mpxmont_mul4_x86_sse2)
	// void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
	//			      const mpw *nv, size_t n, const mpw *mi);

	// Build a stack frame.  Arguments will be relative to EBP, as
	// follows.
	//
	//	ebp + 20	dv
	//	ebp + 24	av
	//	ebp + 28	bv
	//	ebp + 32	nv
	//	ebp + 36	n (nonzero multiple of 4)
	//	ebp + 40	mi
	//
	// Locals are relative to ESP, which is 16-byte aligned, as follows.
	//
	//	esp +   0	expanded V (32 bytes)
	//	esp +  32	expanded M (32 bytes)
	//	esp +  64	expanded Y (32 bytes)
	//	esp +  96	outer loop dv
	//	esp + 100	outer loop bv
	//	esp + 104	av limit (mostly in ESI)
	//	esp + 108	bv limit
	//	esp + 112	(top of locals)
	pushreg	ebp
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp
	and	esp, ~15
	sub	esp, 112
	endprologue

	// Establish the expanded operands.
	pxor	xmm7, xmm7
	mov	ecx, [ebp + 28]		// -> bv
	mov	edx, [ebp + 40]		// -> mi
	movdqu	xmm0, [ecx]		// bv[0]
	movdqu	xmm2, [edx]		// mi
	expand	xmm7, xmm0, xmm1, xmm2, xmm3
	movdqa	[esp + 0], xmm0		// bv[0] expanded low
	movdqa	[esp + 16], xmm1	// bv[0] expanded high
	movdqa	[esp + 32], xmm2	// mi expanded low
	movdqa	[esp + 48], xmm3	// mi expanded high

	// Set up the outer loop state and prepare for the first iteration.
	mov	edx, [ebp + 36]		// n
	mov	eax, [ebp + 24]		// -> U = av[0]
	mov	ebx, [ebp + 32]		// -> X = nv[0]
	mov	edi, [ebp + 20]		// -> Z = dv[0]
	mov	[esp + 100], ecx
	lea	ecx, [ecx + 4*edx]	// -> bv[n/4] = bv limit
	lea	edx, [eax + 4*edx]	// -> av[n/4] = av limit
	mov	[esp + 96], edi
	mov	[esp + 104], edx
	mov	[esp + 108], ecx
	lea	ecx, [esp + 0]		// -> expanded V = bv[0]
	lea	esi, [esp + 32]		// -> expanded M = mi
	lea	edx, [esp + 64]		// -> space for Y
	call	mmul4
	mov	esi, [esp + 104]	// recover av limit
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	cmp	eax, esi		// done already?
	jae	8f
	mov	[esp + 96], edi

	.p2align 4
	// Complete the first inner loop.
0:	call	dmul4
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	cmp	eax, esi		// done yet?
	jb	0b

	// Still have carries left to propagate.
	call	carryprop
	movd	[edi + 16], xmm4

	.p2align 4
	// Embark on the next iteration.  (There must be one.  If n = 1, then
	// we would have bailed above, to label 8.  Similarly, the subsequent
	// iterations can fall into the inner loop immediately.)
1:	mov	eax, [esp + 100]	// -> bv[i - 1]
	mov	edi, [esp + 96]		// -> Z = dv[i]
	add	eax, 16			// -> bv[i]
	pxor	xmm7, xmm7
	mov	[esp + 100], eax
	cmp	eax, [esp + 108]	// done yet?
	jae	9f
	movdqu	xmm0, [eax]		// bv[i]
	mov	ebx, [ebp + 32]		// -> X = nv[0]
	lea	esi, [esp + 32]		// -> expanded M = mi
	mov	eax, [ebp + 24]		// -> U = av[0]
	expand	xmm7, xmm0, xmm1
	movdqa	[esp + 0], xmm0		// bv[i] expanded low
	movdqa	[esp + 16], xmm1	// bv[i] expanded high
	call	mmla4
	mov	esi, [esp + 104]	// recover av limit
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	mov	[esp + 96], edi

	.p2align 4
	// Complete the next inner loop.
0:	call	dmla4
	add	edi, 16
	add	eax, 16
	add	ebx, 16
	cmp	eax, esi
	jb	0b

	// Still have carries left to propagate, and they overlap the
	// previous iteration's final tail, so read that in and add it.
	movd	xmm0, [edi]
	paddq	xmm4, xmm0
	call	carryprop
	movd	[edi + 16], xmm4

	// Back again.
	jmp	1b

	// First iteration was short.  Write out the carries and we're done.
	// (This could be folded into the main loop structure, but that would
	// penalize small numbers more.)
8:	call	carryprop
	movd	[edi + 16], xmm4

	// All done.
9:	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	ebp
	ret
ENDFUNC
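
/// Again for reference, the following informative C sketch is a textbook
/// coarsely-integrated operand-scanning Montgomery multiplication over
/// 32-bit words -- the scalar analogue of the four-word-per-pass structure
/// above.  The name `monty_mul_ref', the single-word factor
/// m0 = -nv[0]^{-1} mod 2^32 and the C99 variable-length temporary are our
/// own simplifications: the real routine takes a four-word factor mi, and
/// leaves tail carries and the final adjustment to its caller.
///
///	#include <stddef.h>
///	#include <stdint.h>
///
///	/* dv must have n + 1 words; the result is congruent to
///	 * av bv 2^{-32 n} mod nv and may need one final subtraction of nv,
///	 * as usual. */
///	static void monty_mul_ref(uint32_t *dv, const uint32_t *av,
///				  const uint32_t *bv, const uint32_t *nv,
///				  size_t n, uint32_t m0)
///	{
///		uint32_t t[n + 2];
///		size_t i, j;
///		uint64_t s, c;
///
///		for (i = 0; i < n + 2; i++) t[i] = 0;
///		for (i = 0; i < n; i++) {
///			/* t += av bv[i] */
///			for (c = 0, j = 0; j < n; j++) {
///				s = (uint64_t)av[j]*bv[i] + t[j] + c;
///				t[j] = (uint32_t)s; c = s >> 32;
///			}
///			s = (uint64_t)t[n] + c; t[n] = (uint32_t)s;
///			t[n + 1] += (uint32_t)(s >> 32);
///
///			/* pick y so the bottom word of t + nv y vanishes */
///			uint32_t y = (uint32_t)((uint64_t)t[0]*m0);
///			for (c = 0, j = 0; j < n; j++) {
///				s = (uint64_t)nv[j]*y + t[j] + c;
///				t[j] = (uint32_t)s; c = s >> 32;
///			}
///			s = (uint64_t)t[n] + c; t[n] = (uint32_t)s;
///			t[n + 1] += (uint32_t)(s >> 32);
///
///			/* t[0] is now zero: shift everything down a word */
///			for (j = 0; j <= n; j++) t[j] = t[j + 1];
///			t[n + 1] = 0;
///		}
///		for (i = 0; i <= n; i++) dv[i] = t[i];
///	}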

FUNC(mpxmont_redc4_x86_avx)
	.arch	.avx
	vzeroupper
	endprologue
	// and drop through...
	.arch	pentium4
ENDFUNC

FUNC(mpxmont_redc4_x86_sse2)
	// void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
	//			       size_t n, const mpw *mi);

	// Build a stack frame.  Arguments will be relative to EBP, as
	// follows.
	//
	//	ebp + 20	dv
	//	ebp + 24	dvl
	//	ebp + 28	nv
	//	ebp + 32	n (nonzero multiple of 4)
	//	ebp + 36	mi
	//
	// Locals are relative to ESP, as follows.
	//
	//	esp +  0	outer loop dv
	//	esp +  4	outer dv limit
	//	esp +  8	blocks-of-4 dv limit
	//	esp + 12	expanded M (32 bytes)
	//	esp + 44	expanded Y (32 bytes)
	//	esp + 76	(top of locals)
	pushreg	ebp
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp
	and	esp, ~15
	sub	esp, 76
	endprologue

	// Establish the expanded operands and the blocks-of-4 dv limit.
	mov	edi, [ebp + 20]		// -> Z = dv[0]
	pxor	xmm7, xmm7
	mov	eax, [ebp + 24]		// -> dv[n] = dv limit
	sub	eax, edi		// length of dv in bytes
	mov	edx, [ebp + 36]		// -> mi
	movdqu	xmm0, [edx]		// mi
	and	eax, ~15		// mask off the tail end
	expand	xmm7, xmm0, xmm1
	add	eax, edi		// find limit
	movdqa	[esp + 12], xmm0	// mi expanded low
	movdqa	[esp + 28], xmm1	// mi expanded high
	mov	[esp + 8], eax

	// Set up the outer loop state and prepare for the first iteration.
	mov	ecx, [ebp + 32]		// n
	mov	ebx, [ebp + 28]		// -> X = nv[0]
	lea	edx, [edi + 4*ecx]	// -> dv[n/4] = outer dv limit
	lea	ecx, [ebx + 4*ecx]	// -> nv[n/4] = nv limit
	mov	[esp + 0], edi
	mov	[esp + 4], edx
	lea	esi, [esp + 12]		// -> expanded M = mi
	lea	edx, [esp + 44]		// -> space for Y
	call	mont4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, ecx		// done already?
	jae	8f

	.p2align 4
	// Complete the first inner loop.
5:	call	mla4
	add	ebx, 16
	add	edi, 16
	cmp	ebx, ecx		// done yet?
	jb	5b

	// Still have carries left to propagate.
8:	carryadd
	mov	esi, [esp + 8]		// -> dv blocks limit
	mov	edx, [ebp + 24]		// dv limit
	psllq	xmm7, 16
	pslldq	xmm7, 8
	paddq	xmm6, xmm7
	call	carryprop
	movd	eax, xmm4
	add	edi, 16
	cmp	edi, esi
	jae	7f

	.p2align 4
	// Continue carry propagation until the end of the buffer.
0:	add	[edi], eax
	mov	eax, 0			// preserves flags
	adcd	[edi + 4], 0
	adcd	[edi + 8], 0
	adcd	[edi + 12], 0
	adc	eax, 0
	add	edi, 16
	cmp	edi, esi
	jb	0b

	// Deal with the tail end.
7:	add	[edi], eax
	mov	eax, 0			// preserves flags
	add	edi, 4
	adc	eax, 0
	cmp	edi, edx
	jb	7b

	// All done for this iteration.  Start the next.  (This must have at
	// least one follow-on iteration, or we'd not have started this outer
	// loop.)
8:	mov	edi, [esp + 0]		// -> dv[i - 1]
	mov	ebx, [ebp + 28]		// -> X = nv[0]
	lea	edx, [esp + 44]		// -> space for Y
	lea	esi, [esp + 12]		// -> expanded M = mi
	add	edi, 16			// -> Z = dv[i]
	cmp	edi, [esp + 4]		// all done yet?
	jae	9f
	mov	[esp + 0], edi
	call	mont4
	add	edi, 16
	add	ebx, 16
	jmp	5b

	// All over.
9:	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	ebp
	ret
ENDFUNC
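
/// Similarly, an informative scalar model of the Montgomery reduction
/// performed by mpxmont_redc4_x86_sse2.  The name `monty_redc_ref' and the
/// single-word factor m0 = -nv[0]^{-1} mod 2^32 are ours; the real code uses
/// a four-word mi and works in four-word blocks.
///
///	#include <stddef.h>
///	#include <stdint.h>
///
///	/* Reduce the dvl-word value dv (dvl >= 2 n) by R = 2^{32 n} modulo
///	 * nv.  Afterwards dv[n..dvl) is congruent to the original value
///	 * times R^{-1} mod nv; the caller shifts it down and makes the final
///	 * conditional subtraction, and must make dv long enough to absorb
///	 * the carries. */
///	static void monty_redc_ref(uint32_t *dv, size_t dvl,
///				   const uint32_t *nv, size_t n, uint32_t m0)
///	{
///		for (size_t i = 0; i < n; i++) {
///			uint32_t y = (uint32_t)((uint64_t)dv[i]*m0);
///			uint64_t c = 0;
///			for (size_t j = 0; j < n; j++) {
///				uint64_t s = (uint64_t)nv[j]*y +
///					dv[i + j] + c;
///				dv[i + j] = (uint32_t)s; c = s >> 32;
///			}
///			for (size_t k = i + n; c && k < dvl; k++) {
///				uint64_t s = (uint64_t)dv[k] + c;
///				dv[k] = (uint32_t)s; c = s >> 32;
///			}
///		}
///	}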

///--------------------------------------------------------------------------
/// Testing and performance measurement.

#ifdef TEST_MUL4

.macro	cysetup	c
	rdtsc
	mov	[\c], eax
	mov	[\c + 4], edx
.endm

.macro	cystore	c, v, n
	rdtsc
	sub	eax, [\c]
	sbb	edx, [\c + 4]
	mov	ebx, [\v]
	mov	ecx, [\n]
	dec	ecx
	mov	[\n], ecx
	mov	[ebx + ecx*8], eax
	mov	[ebx + ecx*8 + 4], edx
.endm

.macro	testprologue	n
	pushreg	ebp
	pushreg	ebx
	pushreg	esi
	pushreg	edi
	setfp
	and	esp, ~15
	sub	esp, 3*32 + 4*4
	endprologue
	mov	eax, \n
	mov	[esp + 104], eax
	// vars:
	//	esp +   0 = v expanded
	//	esp +  32 = y expanded
	//	esp +  64 = ? expanded
	//	esp +  96 = cycles
	//	esp + 104 = count
.endm

.macro	testepilogue
	dropfp
	popreg	edi
	popreg	esi
	popreg	ebx
	popreg	ebp
	ret
.endm

.macro	testldcarry	c
	mov	ecx, \c			// -> c
	movdqu	xmm4, [ecx + 0]		// (c'_0; c''_0)
	movdqu	xmm5, [ecx + 16]	// (c'_1; c''_1)
	movdqu	xmm6, [ecx + 32]	// (c'_2; c''_2)
.endm

.macro	testexpand	v=nil, y=nil
	pxor	xmm7, xmm7
  .ifnes "\v", "nil"
	mov	ecx, \v
	movdqu	xmm0, [ecx]
	expand	xmm7, xmm0, xmm1
	movdqa	[esp + 0], xmm0
	movdqa	[esp + 16], xmm1
  .endif
  .ifnes "\y", "nil"
	mov	edx, \y
	movdqu	xmm2, [edx]
	expand	xmm7, xmm2, xmm3
	movdqa	[esp + 32], xmm2
	movdqa	[esp + 48], xmm3
  .endif
.endm

.macro	testtop	u=nil, x=nil, mode=nil
	.p2align 4
0:
  .ifnes "\u", "nil"
	lea	ecx, [esp + 0]
  .endif
	mov	ebx, \x
  .ifeqs "\mode", "mont"
	lea	esi, [esp + 32]
  .endif
	cysetup	esp + 96
  .ifnes "\u", "nil"
	mov	eax, \u
  .endif
  .ifeqs "\mode", "mont"
	lea	edx, [esp + 64]
  .else
	lea	edx, [esp + 32]
  .endif
.endm

.macro	testtail	cyv
	cystore	esp + 96, \cyv, esp + 104
	jnz	0b
.endm

.macro	testcarryout	c
	mov	ecx, \c
	movdqu	[ecx + 0], xmm4
	movdqu	[ecx + 16], xmm5
	movdqu	[ecx + 32], xmm6
.endm

FUNC(test_dmul4)
	testprologue [ebp + 44]
	testldcarry [ebp + 24]
	testexpand [ebp + 36], [ebp + 40]
	mov	edi, [ebp + 20]
	testtop	[ebp + 28], [ebp + 32]
	call	dmul4
	testtail [ebp + 48]
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

FUNC(test_dmla4)
	testprologue [ebp + 44]
	testldcarry [ebp + 24]
	testexpand [ebp + 36], [ebp + 40]
	mov	edi, [ebp + 20]
	testtop	[ebp + 28], [ebp + 32]
	call	dmla4
	testtail [ebp + 48]
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

FUNC(test_mul4)
	testprologue [ebp + 36]
	testldcarry [ebp + 24]
	testexpand nil, [ebp + 32]
	mov	edi, [ebp + 20]
	testtop	nil, [ebp + 28]
	call	mul4
	testtail [ebp + 40]
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

FUNC(test_mla4)
	testprologue [ebp + 36]
	testldcarry [ebp + 24]
	testexpand nil, [ebp + 32]
	mov	edi, [ebp + 20]
	testtop	nil, [ebp + 28]
	call	mla4
	testtail [ebp + 40]
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

FUNC(test_mmul4)
	testprologue [ebp + 48]
	testexpand [ebp + 40], [ebp + 44]
	mov	edi, [ebp + 20]
	testtop	[ebp + 32], [ebp + 36], mont
	call	mmul4
	testtail [ebp + 52]
	mov	edi, [ebp + 28]
	movdqa	xmm0, [esp + 64]
	movdqa	xmm1, [esp + 80]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

FUNC(test_mmla4)
	testprologue [ebp + 48]
	testexpand [ebp + 40], [ebp + 44]
	mov	edi, [ebp + 20]
	testtop	[ebp + 32], [ebp + 36], mont
	call	mmla4
	testtail [ebp + 52]
	mov	edi, [ebp + 28]
	movdqa	xmm0, [esp + 64]
	movdqa	xmm1, [esp + 80]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

FUNC(test_mont4)
	testprologue [ebp + 40]
	testexpand nil, [ebp + 36]
	mov	edi, [ebp + 20]
	testtop	nil, [ebp + 32], mont
	call	mont4
	testtail [ebp + 44]
	mov	edi, [ebp + 28]
	movdqa	xmm0, [esp + 64]
	movdqa	xmm1, [esp + 80]
	movdqu	[edi], xmm0
	movdqu	[edi + 16], xmm1
	testcarryout [ebp + 24]
	testepilogue
ENDFUNC

#endif

///----- That's all, folks --------------------------------------------------