From 3119b3ae038717e3860a7ee0fda55098af389809 Mon Sep 17 00:00:00 2001 From: Mark Wooding Date: Wed, 4 Jan 2017 01:42:16 +0000 Subject: [PATCH] math/mpx-mul4-amd64-sse2.S: SSE2 multipliers for AMD64. Plus the various hangers on. --- math/Makefile.am | 3 + math/mpmont.c | 16 + math/mpx-mul4-amd64-sse2.S | 1577 ++++++++++++++++++++++++++++++++++++++++++++ math/mpx.c | 8 + math/t/mpmont | 5 + 5 files changed, 1609 insertions(+) create mode 100644 math/mpx-mul4-amd64-sse2.S diff --git a/math/Makefile.am b/math/Makefile.am index 69940ba5..0afee1f1 100644 --- a/math/Makefile.am +++ b/math/Makefile.am @@ -184,6 +184,9 @@ EXTRA_DIST += t/mpx if CPUFAM_X86 libmath_la_SOURCES += mpx-mul4-x86-sse2.S endif +if CPUFAM_AMD64 +libmath_la_SOURCES += mpx-mul4-amd64-sse2.S +endif ## A quick-and-dirty parser, used for parsing descriptions of groups, fields, ## etc. diff --git a/math/mpmont.c b/math/mpmont.c index 774e8523..fe1e6457 100644 --- a/math/mpmont.c +++ b/math/mpmont.c @@ -92,12 +92,20 @@ static void simple_redccore(mpw *dv, mpw *dvl, const mpw *mv, MAYBE_REDC4(x86_sse2) #endif +#if CPUFAM_AMD64 + MAYBE_REDC4(amd64_sse2) +#endif + static redccore__functype *pick_redccore(void) { #if CPUFAM_X86 DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_x86_sse2, cpu_feature_p(CPUFEAT_X86_SSE2)); #endif +#if CPUFAM_AMD64 + DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_amd64_sse2, + cpu_feature_p(CPUFEAT_X86_SSE2)); +#endif DISPATCH_PICK_FALLBACK(mpmont_reduce, simple_redccore); } @@ -184,12 +192,20 @@ static void simple_mulcore(mpw *dv, mpw *dvl, MAYBE_MUL4(x86_sse2) #endif +#if CPUFAM_AMD64 + MAYBE_MUL4(amd64_sse2) +#endif + static mulcore__functype *pick_mulcore(void) { #if CPUFAM_X86 DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_x86_sse2, cpu_feature_p(CPUFEAT_X86_SSE2)); #endif +#if CPUFAM_AMD64 + DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_amd64_sse2, + cpu_feature_p(CPUFEAT_X86_SSE2)); +#endif DISPATCH_PICK_FALLBACK(mpmont_mul, simple_mulcore); } diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S new file mode 100644 index 00000000..2d78a992 --- /dev/null +++ b/math/mpx-mul4-amd64-sse2.S @@ -0,0 +1,1577 @@ +/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*- +/// +/// Large SIMD-based multiplications +/// +/// (c) 2016 Straylight/Edgeware +/// + +///----- Licensing notice --------------------------------------------------- +/// +/// This file is part of Catacomb. +/// +/// Catacomb is free software; you can redistribute it and/or modify +/// it under the terms of the GNU Library General Public License as +/// published by the Free Software Foundation; either version 2 of the +/// License, or (at your option) any later version. +/// +/// Catacomb is distributed in the hope that it will be useful, +/// but WITHOUT ANY WARRANTY; without even the implied warranty of +/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +/// GNU Library General Public License for more details. +/// +/// You should have received a copy of the GNU Library General Public +/// License along with Catacomb; if not, write to the Free +/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +/// MA 02111-1307, USA. + +///-------------------------------------------------------------------------- +/// External definitions. + +#include "config.h" +#include "asm-common.h" + +///-------------------------------------------------------------------------- +/// Prologue. + + .arch pentium4 + .text + +///-------------------------------------------------------------------------- +/// Theory. 
+///
+/// We define a number of primitive fixed-size multipliers from which we can
+/// construct more general variable-length multipliers.
+///
+/// The basic trick is the same throughout.  In an operand-scanning
+/// multiplication, the inner multiplication loop multiplies a
+/// multiple-precision operand by a single-precision factor, and adds the
+/// product, appropriately shifted, to the result.  A `finely integrated
+/// operand scanning' implementation of Montgomery multiplication also adds
+/// the product of a single-precision `Montgomery factor' and the modulus,
+/// calculated in the same pass.  The more common `coarsely integrated
+/// operand scanning' alternates main multiplication and Montgomery passes,
+/// which requires additional carry propagation.
+///
+/// Throughout both plain-multiplication and Montgomery stages, then, one of
+/// the factors remains constant throughout the operation, so we can afford
+/// to take a little time to preprocess it.  The transformation we perform is
+/// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
+/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
+/// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
+/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
+/// operands, as follows.
+///
+///         Offset     0        4        8       12
+///            0     v'_0     v'_1    v''_0    v''_1
+///           16     v'_2     v'_3    v''_2    v''_3
+///
+/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
+/// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
+/// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
+/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
+/// results in 64-bit fields.  The sixteen bits of headroom allows us to add
+/// many products together before we must deal with carrying; it also allows
+/// for some calculations to be performed on the above expanded form.
+///
+/// ...
+///
+/// We maintain four `carry' registers accumulating intermediate results.
+/// The registers' precise roles rotate during the computation; we name them
+/// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
+/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
+/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
+/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
+/// `pmuludq' instruction acting on a scalar operand (broadcast across all
+/// lanes of its vector) and an operand in the expanded form above produces a
+/// result which can be added directly to the appropriate carry register.
+/// Following a pass of four multiplications, we perform some limited carry
+/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
+/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
+/// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
+/// zeroed and becomes c3.
+
+///--------------------------------------------------------------------------
+/// Macro definitions.
+
+.macro	mulcore	r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
+	// Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
+	// of the product in registers D0, D1, D2, D3.
+	pshufd	\d0, \r, SHUF(3, \i, 3, \i) // (r_i, ?, r_i, ?)
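+	// (Lanes 0 and 2 are the ones `pmuludq' reads, so broadcasting r_i
+	// into both lets a single multiplication handle the low and high
+	// halves of the expanded operand together.)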
+ .ifnes "\d1", "nil" + movdqa \d1, \slo // (s'_0, s'_1, s''_0, s''_1) + .endif + .ifnes "\d3", "nil" + movdqa \d3, \shi // (s'_2, s'_3, s''_2, s''_3) + .endif + .ifnes "\d1", "nil" + psrldq \d1, 4 // (s'_1, s''_0, s''_1, 0) + .endif + .ifnes "\d2", "nil" + movdqa \d2, \d0 // another copy of (r_i, ?, r_i, ?) + .endif + .ifnes "\d3", "nil" + psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0) + .endif + .ifnes "\d1", "nil" + pmuludq \d1, \d0 // (r_i s'_1, r_i s''_1) + .endif + .ifnes "\d3", "nil" + pmuludq \d3, \d0 // (r_i s'_3, r_i s''_3) + .endif + .ifnes "\d2", "nil" + pmuludq \d2, \shi // (r_i s'_2, r_i s''_2) + .endif + pmuludq \d0, \slo // (r_i s'_0, r_i s''_0) +.endm + +.macro accum c0, c1=nil, c2=nil, c3=nil + // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding + // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip + // updating that register. + paddq \c0, xmm0 + .ifnes "\c1", "nil" + paddq \c1, xmm1 + .endif + .ifnes "\c2", "nil" + paddq \c2, xmm2 + .endif + .ifnes "\c3", "nil" + paddq \c3, xmm3 + .endif +.endm + +.macro mulacc r, i, slo, shi, c0=nil, c1=nil, c2=nil, c3=nil, z3p=nil + // Multiply R_I by the expanded operand SLO/SHI, and accumulate in + // carry registers C0, C1, C2, C3. If Z3P is `t' then C3 notionally + // contains zero, but needs clearing; in practice, we store the + // product directly rather than attempting to add. On completion, + // XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P is not `t'. + .ifeqs "\z3p", "t" + mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, \c3 + accum \c0, \c1, \c2 + .else + mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, xmm3 + accum \c0, \c1, \c2, \c3 + .endif +.endm + +.macro propout d, pos, c, cc=nil + // Calculate an output word from C, and store it at POS in D; + // propagate carries out from C to CC in preparation for a rotation + // of the carry registers. D is an XMM register; the POS is either + // `lo' or `hi' according to whether the output word should be in + // lane 0 or 1 of D; the high two lanes of D are clobbered. On + // completion, XMM3 is clobbered. If CC is `nil', then the + // contribution which would have been added to it is left in C. + pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B) + psrldq xmm3, 12 // (t, 0, 0, 0) = (t, 0) + pslldq xmm3, 2 // (t b, 0) + paddq \c, xmm3 // (c' + t b, c'') + .ifeqs "\pos", "lo" + movdqa \d, \c + .else + punpckldq \d, \c + .endif + psrlq \c, 32 // floor(c/B) + .ifnes "\cc", "nil" + paddq \cc, \c // propagate up + .endif +.endm + +.macro endprop d, pos, c, t + // On entry, C contains a carry register. On exit, the low 32 bits + // of the value represented in C are written at POS in D, and the + // remaining bits are left at the bottom of T. + movdqa \t, \c + psllq \t, 16 // (?, c'' b) + pslldq \c, 8 // (0, c') + paddq \t, \c // (?, c' + c'' b) + psrldq \t, 8 // c' + c'' b + .ifeqs "\pos", "lo" + movdqa \d, \t + .else + punpckldq \d, \t + .endif + psrldq \t, 4 // floor((c' + c'' b)/B) +.endm + +.macro expand z, a, b, c=nil, d=nil + // On entry, A and C hold packed 128-bit values, and Z is zero. On + // exit, A:B and C:D together hold the same values in expanded + // form. If C is `nil', then only expand A to A:B. 
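+	//
+	// For example, if a_0 = 0x89ab4567 then, on exit, lane 0 of A holds
+	// a'_0 = 0x00004567 and lane 2 holds a''_0 = 0x000089ab, matching
+	// the expanded layout described under `Theory' above.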
+ movdqa \b, \a // (a_0, a_1, a_2, a_3) + .ifnes "\c", "nil" + movdqa \d, \c // (c_0, c_1, c_2, c_3) + .endif + punpcklwd \a, \z // (a'_0, a''_0, a'_1, a''_1) + punpckhwd \b, \z // (a'_2, a''_2, a'_3, a''_3) + .ifnes "\c", "nil" + punpcklwd \c, \z // (c'_0, c''_0, c'_1, c''_1) + punpckhwd \d, \z // (c'_2, c''_2, c'_3, c''_3) + .endif + pshufd \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1) + pshufd \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3) + .ifnes "\c", "nil" + pshufd \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1) + pshufd \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3) + .endif +.endm + +.macro squash c0, c1, c2, c3, t, u, lo, hi=nil + // On entry, C0, C1, C2, C3 are carry registers representing a value + // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2, + // C3, T, and U are clobbered; and the high bits of Y are stored in + // HI, if this is not `nil'. + + // The first step is to eliminate the `double-prime' pieces -- i.e., + // the ones offset by 16 bytes from a 32-bit boundary -- by carrying + // them into the 32-bit-aligned pieces above and below. But before + // we can do that, we must gather them together. + movdqa \t, \c0 + movdqa \u, \c1 + punpcklqdq \t, \c2 // (y'_0, y'_2) + punpckhqdq \c0, \c2 // (y''_0, y''_2) + punpcklqdq \u, \c3 // (y'_1, y'_3) + punpckhqdq \c1, \c3 // (y''_1, y''_3) + + // Now split the double-prime pieces. The high (up to) 48 bits will + // go up; the low 16 bits go down. + movdqa \c2, \c0 + movdqa \c3, \c1 + psllq \c2, 48 + psllq \c3, 48 + psrlq \c0, 16 // high parts of (y''_0, y''_2) + psrlq \c1, 16 // high parts of (y''_1, y''_3) + psrlq \c2, 32 // low parts of (y''_0, y''_2) + psrlq \c3, 32 // low parts of (y''_1, y''_3) + .ifnes "\hi", "nil" + movdqa \hi, \c1 + .endif + pslldq \c1, 8 // high part of (0, y''_1) + + paddq \t, \c2 // propagate down + paddq \u, \c3 + paddq \t, \c1 // and up: (y_0, y_2) + paddq \u, \c0 // (y_1, y_3) + .ifnes "\hi", "nil" + psrldq \hi, 8 // high part of (y''_3, 0) + .endif + + // Finally extract the answer. This complicated dance is better than + // storing to memory and loading, because the piecemeal stores + // inhibit store forwarding. + movdqa \c3, \t // (y_0, y_1) + movdqa \lo, \t // (y^*_0, ?, ?, ?) + psrldq \t, 8 // (y_2, 0) + psrlq \c3, 32 // (floor(y_0/B), ?) + paddq \c3, \u // (y_1 + floor(y_0/B), ?) + movdqa \c1, \c3 // (y^*_1, ?, ?, ?) + psrldq \u, 8 // (y_3, 0) + psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2, ?) + paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2, ?) + punpckldq \lo, \c3 // (y^*_0, y^*_2, ?, ?) + psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?) + paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?) + .ifnes "\hi", "nil" + movdqa \t, \c3 + pxor \u, \u + .endif + punpckldq \c1, \c3 // (y^*_1, y^*_3, ?, ?) + .ifnes "\hi", "nil" + psrlq \t, 32 // very high bits of y + paddq \hi, \t + punpcklqdq \hi, \u // carry up + .endif + punpckldq \lo, \c1 // y mod B^4 +.endm + +.macro carryadd + // On entry, RDI points to a packed addend A, and XMM12, XMM13, XMM14 + // hold the incoming carry registers c0, c1, and c2 representing a + // carry-in C. + // + // On exit, the carry registers, including XMM15, are updated to hold + // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other + // registers are preserved. 
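+	//
+	// The high addend word a_3 is left in XMM15: callers either use it
+	// as the initial value of the fourth carry register c3 (dmla4,
+	// mla4), or later fold it into the high half of c2 (the final
+	// carry fix-up in mpxmont_redc4_amd64_sse2).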
+ movd xmm0, [rdi + 0] // (a_0, 0) + movd xmm1, [rdi + 4] // (a_1, 0) + movd xmm2, [rdi + 8] // (a_2, 0) + movd xmm15, [rdi + 12] // (a_3, 0) + paddq xmm12, xmm0 // (c'_0 + a_0, c''_0) + paddq xmm13, xmm1 // (c'_1 + a_1, c''_1) + paddq xmm14, xmm2 // (c'_2 + a_2, c''_2 + a_3 b) +.endm + +///-------------------------------------------------------------------------- +/// Primitive multipliers and related utilities. + +INTFUNC(carryprop) + // On entry, XMM12, XMM13, and XMM14 hold a 144-bit carry in an + // expanded form. Store the low 128 bits of the represented carry to + // [RDI] as a packed 128-bit value, and leave the remaining 16 bits + // in the low 32 bits of XMM12. On exit, XMM0, XMM1, XMM3, XMM13 and + // XMM14 are clobbered. + endprologue + + propout xmm0, lo, xmm12, xmm13 + propout xmm1, lo, xmm13, xmm14 + propout xmm0, hi, xmm14, nil + endprop xmm1, hi, xmm14, xmm12 + punpckldq xmm0, xmm1 + movdqu [rdi], xmm0 + + ret + +ENDFUNC + +INTFUNC(dmul4) + // On entry, RDI points to the destination buffer; RAX and RBX point + // to the packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the + // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the + // incoming carry registers c0, c1, and c2; c3 is assumed to be zero. + // + // On exit, we write the low 128 bits of the sum C + U V + X Y to + // [RDI], and update the carry registers with the carry out. The + // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose + // registers are preserved. + endprologue + + movdqu xmm4, [rax] + movdqu xmm5, [rbx] + + mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15, t + mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 + propout xmm6, lo, xmm12, xmm13 + + mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t + mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12 + propout xmm7, lo, xmm13, xmm14 + + mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t + mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13 + propout xmm6, hi, xmm14, xmm15 + + mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t + mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14 + propout xmm7, hi, xmm15, xmm12 + + punpckldq xmm6, xmm7 + movdqu [rdi], xmm6 + + ret + +ENDFUNC + +INTFUNC(dmla4) + // On entry, RDI points to the destination buffer, which also + // contains an addend A to accumulate; RAX and RBX point to the + // packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the + // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the + // incoming carry registers c0, c1, and c2 representing a carry-in C; + // c3 is assumed to be zero. + // + // On exit, we write the low 128 bits of the sum A + C + U V + X Y to + // [RDI], and update the carry registers with the carry out. The + // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose + // registers are preserved. 
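+	//
+	// Note that, unlike dmul4, the first multiply-accumulate pass below
+	// doesn't clear c3: carryadd has already left XMM15 = (a_3, 0)
+	// there as the initial fourth carry.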
+ endprologue + + movdqu xmm4, [rax] + movdqu xmm5, [rbx] + carryadd + + mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15 + mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 + propout xmm6, lo, xmm12, xmm13 + + mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t + mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12 + propout xmm7, lo, xmm13, xmm14 + + mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t + mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13 + propout xmm6, hi, xmm14, xmm15 + + mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t + mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14 + propout xmm7, hi, xmm15, xmm12 + + punpckldq xmm6, xmm7 + movdqu [rdi], xmm6 + + ret + +ENDFUNC + +INTFUNC(mul4zc) + // On entry, RDI points to the destination buffer; RBX points to a + // packed operand X; and XMM10/XMM11 hold an expanded operand Y. + // + // On exit, we write the low 128 bits of the product X Y to [RDI], + // and set the carry registers XMM12, XMM13, XMM14 to the carry out. + // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the + // general-purpose registers are preserved. + endprologue + + movdqu xmm5, [rbx] + + mulcore xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 + propout xmm6, lo, xmm12, xmm13 + + mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t + propout xmm7, lo, xmm13, xmm14 + + mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t + propout xmm6, hi, xmm14, xmm15 + + mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t + propout xmm7, hi, xmm15, xmm12 + + punpckldq xmm6, xmm7 + movdqu [rdi], xmm6 + + ret + +ENDFUNC + +INTFUNC(mul4) + // On entry, RDI points to the destination buffer; RBX points to a + // packed operand X; XMM10/XMM11 hold an expanded operand Y; and + // XMM12, XMM13, XMM14 hold the incoming carry registers c0, c1, and + // c2, representing a carry-in C; c3 is assumed to be zero. + // + // On exit, we write the low 128 bits of the sum C + X Y to [RDI], + // and update the carry registers with the carry out. The registers + // XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the + // general-purpose registers are preserved. + endprologue + + movdqu xmm5, [rbx] + + mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, t + propout xmm6, lo, xmm12, xmm13 + + mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t + propout xmm7, lo, xmm13, xmm14 + + mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t + propout xmm6, hi, xmm14, xmm15 + + mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t + propout xmm7, hi, xmm15, xmm12 + + punpckldq xmm6, xmm7 + movdqu [rdi], xmm6 + + ret + +ENDFUNC + +INTFUNC(mla4zc) + // On entry, RDI points to the destination buffer, which also + // contains an addend A to accumulate; RBX points to a packed operand + // X; and XMM10/XMM11 points to an expanded operand Y. + // + // On exit, we write the low 128 bits of the sum A + X Y to [RDI], + // and set the carry registers XMM12, XMM13, XMM14 to the carry out. + // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the + // general-purpose registers are preserved. 
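+	//
+	// Since there is no carry-in, the addend words are loaded straight
+	// into XMM12--XMM15, which then serve as the initial carry
+	// registers for the multiply-accumulate passes below.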
+ endprologue + + movdqu xmm5, [rbx] + movd xmm12, [rdi + 0] + movd xmm13, [rdi + 4] + movd xmm14, [rdi + 8] + movd xmm15, [rdi + 12] + + mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 + propout xmm6, lo, xmm12, xmm13 + + mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t + propout xmm7, lo, xmm13, xmm14 + + mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t + propout xmm6, hi, xmm14, xmm15 + + mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t + propout xmm7, hi, xmm15, xmm12 + + punpckldq xmm6, xmm7 + movdqu [rdi], xmm6 + + ret + +ENDFUNC + +INTFUNC(mla4) + // On entry, RDI points to the destination buffer, which also + // contains an addend A to accumulate; RBX points to a packed operand + // X; XMM10/XMM11 holds an expanded operand Y; and XMM12, XMM13, + // XMM14 hold the incoming carry registers c0, c1, and c2, + // representing a carry-in C; c3 is assumed to be zero. + // + // On exit, we write the low 128 bits of the sum A + C + X Y to + // [RDI], and update the carry registers with the carry out. The + // registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the + // general-purpose registers are preserved. + endprologue + + movdqu xmm5, [rbx] + carryadd + + mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 + propout xmm6, lo, xmm12, xmm13 + + mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t + propout xmm7, lo, xmm13, xmm14 + + mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t + propout xmm6, hi, xmm14, xmm15 + + mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t + propout xmm7, hi, xmm15, xmm12 + + punpckldq xmm6, xmm7 + movdqu [rdi], xmm6 + + ret + +ENDFUNC + +INTFUNC(mmul4) + // On entry, RDI points to the destination buffer; RAX and RBX point + // to the packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold + // the expanded operands V and M. The stack pointer must be 8 modulo 16 + // (as usual for AMD64 ABIs). + // + // On exit, we store Y = U V M mod B in XMM10/XMM11, and write the + // low 128 bits of the sum U V + N Y to [RDI], leaving the remaining + // carry in XMM12, XMM13, and XMM14. The registers XMM0--XMM7, and + // XMM15 are clobbered; the general-purpose registers are preserved. + movdqu xmm4, [rax] +#if ABI_WIN + stalloc 48 + 8 // space for the carries +#endif + endprologue + + // Calculate W = U V, and leave it in XMM7. Stash the carry pieces + // for later. + mulcore xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15 + propout xmm7, lo, xmm12, xmm13 + jmp 5f + +ENDFUNC + +INTFUNC(mmla4) + // On entry, RDI points to the destination buffer, which also + // contains an addend A to accumulate; RAX and RBX point to the + // packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold the + // expanded operands V and M. The stack pointer must be 8 modulo 16 + // (as usual for AMD64 ABIs). + // + // On exit, we store Y = (A + U V) M mod B in XMM10/XMM11, and write + // the low 128 bits of the sum A + U V + N Y to [RDI], leaving the + // remaining carry in XMM12, XMM13, and XMM14. The registers + // XMM0--XMM7, and XMM15 are clobbered; the general-purpose registers + // are preserved. + movdqu xmm4, [rax] +#if ABI_WIN + stalloc 48 + 8 // space for the carries +# define STKTMP(i) [rsp + i] +#endif +#if ABI_SYSV +# define STKTMP(i) [rsp + i - 48 - 8] // use red zone +#endif + endprologue + + movd xmm12, [rdi + 0] + movd xmm13, [rdi + 4] + movd xmm14, [rdi + 8] + movd xmm15, [rdi + 12] + + // Calculate W = U V, and leave it in XMM7. Stash the carry pieces + // for later. 
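+	//
+	// (The carry registers already hold the addend A, with a_3 in
+	// XMM15, so the first pass below accumulates rather than clearing
+	// c3.  From label 5 onwards the code is shared with mmul4 above,
+	// which jumps in once it has produced its first output word.)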
+	mulacc	xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
+	propout	xmm7, lo, xmm12, xmm13
+
+5:	mulacc	xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
+	propout	xmm6, lo, xmm13, xmm14
+
+	mulacc	xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
+	propout	xmm7, hi, xmm14, xmm15
+
+	mulacc	xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
+	propout	xmm6, hi, xmm15, xmm12
+
+	// Prepare W, and stash carries for later.
+	punpckldq xmm7, xmm6
+	movdqa	STKTMP( 0), xmm12
+	movdqa	STKTMP(16), xmm13
+	movdqa	STKTMP(32), xmm14
+
+	// Calculate Y = W M.  We just about have enough spare registers to
+	// make this work.
+	mulcore	xmm7, 0, xmm10, xmm11, xmm3, xmm4, xmm5, xmm6
+
+	// Start expanding W back into the main carry registers...
+	pxor	xmm15, xmm15
+	movdqa	xmm12, xmm7
+	movdqa	xmm14, xmm7
+
+	mulcore	xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2
+	accum	xmm4, xmm5, xmm6
+
+	punpckldq xmm12, xmm15		// (w_0, 0, w_1, 0)
+	punpckhdq xmm14, xmm15		// (w_2, 0, w_3, 0)
+
+	mulcore	xmm7, 2, xmm10, xmm11, xmm0, xmm1
+	accum	xmm5, xmm6
+
+	pxor	xmm2, xmm2
+	movdqa	xmm13, xmm12
+	movdqa	xmm15, xmm14
+
+	mulcore	xmm7, 3, xmm10, xmm11, xmm0
+	accum	xmm6
+
+	punpckldq xmm12, xmm2		// (w_0, 0, 0, 0)
+	punpckldq xmm14, xmm2		// (w_2, 0, 0, 0)
+	punpckhdq xmm13, xmm2		// (w_1, 0, 0, 0)
+	punpckhdq xmm15, xmm2		// (w_3, 0, 0, 0)
+
+	// That's lots of pieces.  Now we have to assemble the answer.
+	squash	xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
+
+	// Expand it.
+	movdqu	xmm5, [rbx]
+	expand	xmm2, xmm10, xmm11
+
+	// Finish the calculation by adding the Montgomery product.
+	mulacc	xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
+	propout	xmm6, lo, xmm12, xmm13
+
+	mulacc	xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
+	propout	xmm7, lo, xmm13, xmm14
+
+	mulacc	xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
+	propout	xmm6, hi, xmm14, xmm15
+
+	mulacc	xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
+	propout	xmm7, hi, xmm15, xmm12
+
+	punpckldq xmm6, xmm7
+
+	// Add on the carry we calculated earlier.
+	paddq	xmm12, STKTMP( 0)
+	paddq	xmm13, STKTMP(16)
+	paddq	xmm14, STKTMP(32)
+
+	// And, with that, we're done.
+	movdqu	[rdi], xmm6
+#if ABI_WIN
+	stfree	56
+#endif
+	ret
+
+#undef STKTMP
+
+ENDFUNC
+
+INTFUNC(mont4)
+	// On entry, RDI points to the destination buffer holding a packed
+	// value W; RBX points to a packed operand N; and XMM8/XMM9 hold an
+	// expanded operand M.
+	//
+	// On exit, we store Y = W M mod B in XMM10/XMM11, and write the low
+	// 128 bits of the sum W + N Y to [RDI], leaving the remaining carry
+	// in XMM12, XMM13, and XMM14.  The registers XMM0--XMM3, XMM5--XMM7,
+	// and XMM15 are clobbered; the general-purpose registers are
+	// preserved.
+	endprologue
+
+	movdqu	xmm7, [rdi]
+
+	// Calculate Y = W M.  Avoid the standard carry registers, because
+	// we're setting something else up there.
+	mulcore	xmm7, 0, xmm8, xmm9, xmm3, xmm4, xmm5, xmm6
+
+	// Start expanding W back into the main carry registers...
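+	// The unpacked words w_0, ..., w_3 end up in XMM12--XMM15 as the
+	// initial carries for the final pass, so that accumulating the
+	// product N Y onto them yields W + N Y as required.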
+ pxor xmm15, xmm15 + movdqa xmm12, xmm7 + movdqa xmm14, xmm7 + + mulcore xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2 + accum xmm4, xmm5, xmm6 + + punpckldq xmm12, xmm15 // (w_0, 0, w_1, 0) + punpckhdq xmm14, xmm15 // (w_2, 0, w_3, 0) + + mulcore xmm7, 2, xmm8, xmm9, xmm0, xmm1 + accum xmm5, xmm6 + + pxor xmm2, xmm2 + movdqa xmm13, xmm12 + movdqa xmm15, xmm14 + + mulcore xmm7, 3, xmm8, xmm9, xmm0 + accum xmm6 + + punpckldq xmm12, xmm2 // (w_0, 0, 0, 0) + punpckldq xmm14, xmm2 // (w_2, 0, 0, 0) + punpckhdq xmm13, xmm2 // (w_1, 0, 0, 0) + punpckhdq xmm15, xmm2 // (w_3, 0, 0, 0) + + // That's lots of pieces. Now we have to assemble the answer. + squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10 + + // Expand it. + movdqu xmm5, [rbx] + expand xmm2, xmm10, xmm11 + + // Finish the calculation by adding the Montgomery product. + mulacc xmm5, 0 xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 + propout xmm6, lo, xmm12, xmm13 + + mulacc xmm5, 1 xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t + propout xmm7, lo, xmm13, xmm14 + + mulacc xmm5, 2 xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t + propout xmm6, hi, xmm14, xmm15 + + mulacc xmm5, 3 xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t + propout xmm7, hi, xmm15, xmm12 + + punpckldq xmm6, xmm7 + + // And, with that, we're done. + movdqu [rdi], xmm6 + ret + +ENDFUNC + +///-------------------------------------------------------------------------- +/// Bulk multipliers. + +FUNC(mpx_umul4_amd64_sse2) + // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl, + // const mpw *bv, const mpw *bvl); + + // Establish the arguments and do initial setup. + // + // sysv win + // inner loop dv rdi rdi* + // inner loop av rbx* rbx* + // outer loop dv r10 rcx + // outer loop bv rcx r9 + // av base rsi rdx + // av limit rdx r8 + // bv limit r8 r10 + +#if ABI_SYSV +# define DV r10 +# define AV rsi +# define AVL rdx +# define BV rcx +# define BVL r8 + + pushreg rbx + endprologue + + mov DV, rdi + +#endif + +#if ABI_WIN +# define DV rcx +# define AV rdx +# define AVL r8 +# define BV r9 +# define BVL r10 + + pushreg rbx + pushreg rdi + stalloc 160 + 8 + + savexmm xmm6, 0 + savexmm xmm7, 16 + savexmm xmm8, 32 + savexmm xmm9, 48 + savexmm xmm10, 64 + savexmm xmm11, 80 + savexmm xmm12, 96 + savexmm xmm13, 112 + savexmm xmm14, 128 + savexmm xmm15, 144 + + endprologue + + mov rdi, DV + mov BVL, [rsp + 224] + +#endif + + // Prepare for the first iteration. + pxor xmm0, xmm0 + movdqu xmm10, [BV] // bv[0] + mov rbx, AV + add DV, 16 + add BV, 16 + expand xmm0, xmm10, xmm11 + call mul4zc + add rbx, 16 + add rdi, 16 + cmp rbx, AVL // all done? + jae 8f + + .p2align 4 + // Continue with the first iteration. +0: call mul4 + add rbx, 16 + add rdi, 16 + cmp rbx, AVL // all done? + jb 0b + + // Write out the leftover carry. There can be no tail here. +8: call carryprop + cmp BV, BVL // more passes to do? + jae 9f + + .p2align 4 + // Set up for the next pass. +1: movdqu xmm10, [BV] // bv[i] + mov rdi, DV // -> dv[i] + pxor xmm0, xmm0 + expand xmm0, xmm10, xmm11 + mov rbx, AV // -> av[0] + add DV, 16 + add BV, 16 + call mla4zc + add rbx, 16 + add rdi, 16 + cmp rbx, AVL // done yet? + jae 8f + + .p2align 4 + // Continue... +0: call mla4 + add rbx, 16 + add rdi, 16 + cmp rbx, AVL + jb 0b + + // Finish off this pass. There was no tail on the previous pass, and + // there can be none on this pass. +8: call carryprop + cmp BV, BVL + jb 1b + + // All over. 
+9: + +#if ABI_SYSV + popreg rbx +#endif + +#if ABI_WIN + + rstrxmm xmm6, 0 + rstrxmm xmm7, 16 + rstrxmm xmm8, 32 + rstrxmm xmm9, 48 + rstrxmm xmm10, 64 + rstrxmm xmm11, 80 + rstrxmm xmm12, 96 + rstrxmm xmm13, 112 + rstrxmm xmm14, 128 + rstrxmm xmm15, 144 + + stfree 160 + 8 + popreg rdi + popreg rbx + +#endif + + ret + +#undef DV +#undef AV +#undef AVL +#undef BV +#undef BVL + +ENDFUNC + +FUNC(mpxmont_mul4_amd64_sse2) + // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv, + // const mpw *nv, size_t n, const mpw *mi); + + // Establish the arguments and do initial setup. + // + // sysv win + // inner loop dv rdi rdi* + // inner loop av rax rax + // inner loop nv rbx* rbx* + // mi r9 r10 + // outer loop dv r10 rcx + // outer loop bv rdx r8 + // av base rsi rdx + // av limit r11 r11 + // bv limit r8 r12* + // nv base rcx r9 + // n r8 r12* + +#if ABI_SYSV +# define DV r10 +# define AV rsi +# define AVL r11 +# define BV rdx +# define BVL r8 +# define NV rcx +# define N r8 +# define MI r9 + + pushreg rbx + endprologue + + mov DV, rdi + +#endif + +#if ABI_WIN +# define DV rcx +# define AV rdx +# define AVL r11 +# define BV r8 +# define BVL r12 +# define NV r9 +# define N r12 +# define MI r10 + + pushreg rbx + pushreg rdi + pushreg r12 + stalloc 160 + + savexmm xmm6, 0 + savexmm xmm7, 16 + savexmm xmm8, 32 + savexmm xmm9, 48 + savexmm xmm10, 64 + savexmm xmm11, 80 + savexmm xmm12, 96 + savexmm xmm13, 112 + savexmm xmm14, 128 + savexmm xmm15, 144 + + endprologue + + mov rdi, DV + mov N, [rsp + 224] + mov MI, [rsp + 232] + +#endif + + // Establish the expanded operands. + pxor xmm0, xmm0 + movdqu xmm8, [BV] // bv[0] + movdqu xmm10, [MI] // mi + expand xmm0, xmm8, xmm9, xmm10, xmm11 + + // Set up the outer loop state and prepare for the first iteration. + mov rax, AV // -> U = av[0] + mov rbx, NV // -> X = nv[0] + lea AVL, [AV + 4*N] // -> av[n/4] = av limit + lea BVL, [BV + 4*N] // -> bv[n/4] = bv limit + add BV, 16 + add DV, 16 + call mmul4 + add rdi, 16 + add rax, 16 + add rbx, 16 + cmp rax, AVL // done already? + jae 8f + + .p2align 4 + // Complete the first inner loop. +0: call dmul4 + add rdi, 16 + add rax, 16 + add rbx, 16 + cmp rax, AVL // done yet? + jb 0b + + // Still have carries left to propagate. + call carryprop + movd [rdi + 16], xmm12 + + .p2align 4 + // Embark on the next iteration. (There must be one. If n = 1, then + // we would have bailed above, to label 8. Similarly, the subsequent + // iterations can fall into the inner loop immediately.) +1: pxor xmm0, xmm0 + movdqu xmm8, [BV] // bv[i] + movdqu xmm10, [MI] // mi + mov rdi, DV // -> Z = dv[i] + mov rax, AV // -> U = av[0] + mov rbx, NV // -> X = nv[0] + expand xmm0, xmm8, xmm9, xmm10, xmm11 + add BV, 16 + add DV, 16 + call mmla4 + add rdi, 16 + add rax, 16 + add rbx, 16 + + .p2align 4 + // Complete the next inner loop. +0: call dmla4 + add rdi, 16 + add rax, 16 + add rbx, 16 + cmp rax, AVL + jb 0b + + // Still have carries left to propagate, and they overlap the + // previous iteration's final tail, so read that in and add it. + movd xmm0, [rdi] + paddq xmm12, xmm0 + call carryprop + movd [rdi + 16], xmm12 + + // Back again, maybe. + cmp BV, BVL + jb 1b + + // All done. 
+9: + +#if ABI_SYSV + popreg rbx +#endif + +#if ABI_WIN + + rstrxmm xmm6, 0 + rstrxmm xmm7, 16 + rstrxmm xmm8, 32 + rstrxmm xmm9, 48 + rstrxmm xmm10, 64 + rstrxmm xmm11, 80 + rstrxmm xmm12, 96 + rstrxmm xmm13, 112 + rstrxmm xmm14, 128 + rstrxmm xmm15, 144 + + stfree 160 + popreg r12 + popreg rdi + popreg rbx + +#endif + + ret + + // First iteration was short. Write out the carries and we're done. + // (This could be folded into the main loop structure, but that would + // penalize small numbers more.) +8: call carryprop + movd [rdi + 16], xmm12 +#if ABI_SYSV + popreg rbx + ret +#endif +#if ABI_WIN + jmp 9b +#endif + +#undef DV +#undef AV +#undef AVL +#undef BV +#undef BVL +#undef NV +#undef N +#undef MI + +ENDFUNC + +FUNC(mpxmont_redc4_amd64_sse2) + // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv, + // size_t n, const mpw *mi); + + // Establish the arguments and do initial setup. + // + // sysv win + // inner loop dv rdi rdi* + // dv limit rax rax + // blocks-of-4 dv limit rsi rdx + // inner loop nv rbx* rbx* + // mi r8 r10 + // outer loop dv r10 rcx + // outer loop dv limit r11 r11 + // nv base rdx r8 + // nv limit r9 r12* + // n rcx r9 + // c rcx r9 + +#if ABI_SYSV + +# define DVL rax +# define DVL4 rsi +# define MI r8 +# define DV r10 +# define DVLO r11 +# define NV rdx +# define NVL r9 +# define N rcx +# define C ecx + + pushreg rbx + endprologue + + mov DV, rdi + +#endif + +#if ABI_WIN + +# define DVL rax +# define DVL4 rdx +# define MI r10 +# define DV rcx +# define DVLO r11 +# define NV r8 +# define NVL r12 +# define N r9 +# define C r9d + + pushreg rbx + pushreg rdi + pushreg r12 + stalloc 160 + + savexmm xmm6, 0 + savexmm xmm7, 16 + savexmm xmm8, 32 + savexmm xmm9, 48 + savexmm xmm10, 64 + savexmm xmm11, 80 + savexmm xmm12, 96 + savexmm xmm13, 112 + savexmm xmm14, 128 + savexmm xmm15, 144 + + endprologue + + mov rdi, DV + mov MI, [rsp + 224] + +#endif + + // Establish the expanded operands and the blocks-of-4 dv limit. + pxor xmm0, xmm0 + mov DVL, DVL4 // -> dv[n] = dv limit + sub DVL4, DV // length of dv in bytes + movdqu xmm8, [MI] // mi + and DVL4, ~15 // mask off the tail end + expand xmm0, xmm8, xmm9 + add DVL4, DV // find limit + + // Set up the outer loop state and prepare for the first iteration. + mov rbx, NV // -> X = nv[0] + lea DVLO, [DV + 4*N] // -> dv[n/4] = outer dv limit + lea NVL, [NV + 4*N] // -> nv[n/4] = nv limit + add DV, 16 + call mont4 + add rbx, 16 + add rdi, 16 + cmp rbx, NVL // done already? + jae 8f + + .p2align 4 + // Complete the first inner loop. +5: call mla4 + add rbx, 16 + add rdi, 16 + cmp rbx, NVL // done yet? + jb 5b + + // Still have carries left to propagate. +8: carryadd + psllq xmm15, 16 + pslldq xmm15, 8 + paddq xmm14, xmm15 + call carryprop + movd C, xmm12 + add rdi, 16 + cmp rdi, DVL4 + jae 7f + + .p2align 4 + // Continue carry propagation until the end of the buffer. +0: add [rdi], C + mov C, 0 // preserves flags + adcd [rdi + 4], 0 + adcd [rdi + 8], 0 + adcd [rdi + 12], 0 + adc C, 0 + add rdi, 16 + cmp rdi, DVL4 + jb 0b + + // Deal with the tail end. +7: add [rdi], C + mov C, 0 // preserves flags + add rdi, 4 + adc C, 0 + cmp rdi, DVL + jb 7b + + // All done for this iteration. Start the next. (This must have at + // least one follow-on iteration, or we'd not have started this outer + // loop.) +8: mov rdi, DV // -> Z = dv[i] + mov rbx, NV // -> X = nv[0] + cmp rdi, DVLO // all done yet? + jae 9f + add DV, 16 + call mont4 + add rdi, 16 + add rbx, 16 + jmp 5b + + // All over. 
+9: + +#if ABI_SYSV + popreg rbx +#endif + +#if ABI_WIN + + rstrxmm xmm6, 0 + rstrxmm xmm7, 16 + rstrxmm xmm8, 32 + rstrxmm xmm9, 48 + rstrxmm xmm10, 64 + rstrxmm xmm11, 80 + rstrxmm xmm12, 96 + rstrxmm xmm13, 112 + rstrxmm xmm14, 128 + rstrxmm xmm15, 144 + + stfree 160 + popreg r12 + popreg rdi + popreg rbx + +#endif + + ret + +#undef DVL +#undef DVL4 +#undef MI +#undef DV +#undef DVLO +#undef NV +#undef NVL +#undef N +#undef C + +ENDFUNC + +///-------------------------------------------------------------------------- +/// Testing and performance measurement. + +#ifdef TEST_MUL4 + +#if ABI_SYSV +# define ARG0 rdi +# define ARG1 rsi +# define ARG2 rdx +# define ARG3 rcx +# define ARG4 r8 +# define ARG5 r9 +# define ARG6 STKARG(0) +# define ARG7 STKARG(1) +# define ARG8 STKARG(2) +# define STKARG_OFFSET 16 +#endif +#if ABI_WIN +# define ARG0 rcx +# define ARG1 rdx +# define ARG2 r8 +# define ARG3 r9 +# define ARG4 STKARG(0) +# define ARG5 STKARG(1) +# define ARG6 STKARG(2) +# define ARG7 STKARG(3) +# define ARG8 STKARG(4) +# define STKARG_OFFSET 40 +#endif +#define STKARG(i) [rsp + STKARG_OFFSET + 8*(i)] + +// sysv win +// dmul smul mmul mont dmul smul mmul mont +// A rax +// D rdx +// z rdi rdi rdi rdi rdi rcx rcx rcx rcx +// c rcx rsi rsi rsi rsi rdx rdx rdx rdx +// y r10 -- -- rdx rdx -- -- r8 r8 +// u r11 rdx -- rcx -- r8 -- r9 -- +// x rbx rcx rdx r8 rcx r9 r8 stk0 r9 +// vv xmm8/9 r8 -- r9 r8 stk0 -- stk1 stk0 +// yy xmm10/11 r9 rcx stk0 -- stk1 r9 stk2 -- +// n r8 stk0 r8 stk1 r9 stk2 stk0 stk3 stk1 +// cyv r9 stk1 r9 stk2 stk0 stk3 stk1 stk4 stk2 + +.macro cysetup v, n + rdtsc + shl rdx, 32 + or rax, rdx + mov [\v + 8*\n - 8], rax +.endm + +.macro cystore v, n + rdtsc + shl rdx, 32 + or rax, rdx + sub rax, [\v + 8*\n - 8] + mov [\v + 8*\n - 8], rax + dec \n +.endm + +.macro testprologue mode + pushreg rbx +#if ABI_SYSV + endprologue + .ifeqs "\mode", "dmul" + mov rbx, rcx + movdqu xmm8, [r8] + movdqu xmm10, [r9] + mov r8d, STKARG(0) + mov r9, STKARG(1) + mov r11, rdx + mov rcx, rsi + .endif + .ifeqs "\mode", "smul" + mov rbx, rdx + movdqu xmm10, [rcx] + mov rcx, rsi + .endif + .ifeqs "\mode", "mmul" + mov rax, STKARG(0) + mov rbx, r8 + movdqu xmm8, [r9] + movdqu xmm10, [rax] + mov r8, STKARG(1) + mov r9, STKARG(2) + mov r10, rdx + mov r11, rcx + mov rcx, rsi + .endif + .ifeqs "\mode", "mont" + mov rbx, rcx + movdqu xmm8, [r8] + mov r8, r9 + mov r9, STKARG(0) + mov r10, rdx + mov rcx, rsi + .endif +#endif +#if ABI_WIN + pushreg rdi + stalloc 168 + savexmm xmm6, 0 + savexmm xmm7, 16 + savexmm xmm8, 32 + savexmm xmm9, 48 + savexmm xmm10, 64 + savexmm xmm11, 80 + savexmm xmm12, 96 + savexmm xmm13, 112 + savexmm xmm14, 128 + savexmm xmm15, 144 + endprologue + .ifeqs "\mode", "dmul" + mov r10, STKARG(0) + mov r11, STKARG(1) + mov rdi, rcx + mov rcx, rdx + mov rbx, r9 + movdqu xmm8, [r10] + movdqu xmm10, [r11] + mov r8, STKARG(2) + mov r9, STKARG(3) + mov r11, r8 + .endif + .ifeqs "\mode", "smul" + mov rdi, rcx + mov rcx, rdx + mov rbx, r8 + movdqu xmm10, [r9] + mov r8, STKARG(0) + mov r9, STKARG(1) + .endif + .ifeqs "\mode", "mmul" + mov r10, STKARG(1) + mov r11, STKARG(2) + mov rdi, rcx + mov rcx, rdx + mov rbx, STKARG(0) + movdqu xmm8, [r10] + movdqu xmm10, [r11] + mov r8, STKARG(3) + mov r9, STKARG(4) + mov r10, r8 + mov r11, r9 + .endif + .ifeqs "\mode", "mont" + mov r10, STKARG(0) + mov rdi, rcx + mov rcx, rdx + mov rbx, r9 + movdqu xmm8, [r10] + mov r8, STKARG(1) + mov r9, STKARG(2) + mov r10, r8 + .endif +#endif + + pxor xmm0, xmm0 + .ifeqs "\mode", "dmul" + expand xmm0, xmm8, 
xmm9, xmm10, xmm11 + .endif + .ifeqs "\mode", "smul" + expand xmm0, xmm10, xmm11 + .endif + .ifeqs "\mode", "mmul" + expand xmm0, xmm8, xmm9, xmm10, xmm11 + .endif + .ifeqs "\mode", "mont" + expand xmm0, xmm8, xmm9 + .endif +.endm + +.macro testepilogue +#if ABI_WIN + rstrxmm xmm6, 0 + rstrxmm xmm7, 16 + rstrxmm xmm8, 32 + rstrxmm xmm9, 48 + rstrxmm xmm10, 64 + rstrxmm xmm11, 80 + rstrxmm xmm12, 96 + rstrxmm xmm13, 112 + rstrxmm xmm14, 128 + rstrxmm xmm15, 144 + stfree 168 + popreg rdi +#endif + popreg rbx + ret +.endm + +.macro testldcarry + movdqu xmm12, [rcx + 0] // (c'_0, c''_0) + movdqu xmm13, [rcx + 16] // (c'_1, c''_1) + movdqu xmm14, [rcx + 32] // (c'_2, c''_2) +.endm + +.macro testtop u=nil + .p2align 4 +0: + cysetup r9, r8 + .ifnes "\u", "nil" + mov rax, \u + .endif +.endm + +.macro testtail + cystore r9, r8 + jnz 0b +.endm + +.macro testcarryout + movdqu [rcx + 0], xmm12 + movdqu [rcx + 16], xmm13 + movdqu [rcx + 32], xmm14 +.endm + +FUNC(test_dmul4) + testprologue dmul + testldcarry + testtop r11 + call dmul4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_dmla4) + testprologue dmul + testldcarry + testtop r11 + call dmla4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mul4) + testprologue smul + testldcarry + testtop nil + call mul4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mla4) + testprologue smul + testldcarry + testtop nil + call mla4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mmul4) + testprologue mmul + testtop r11 + call mmul4 + testtail + movdqu [r10 + 0], xmm10 + movdqu [r10 + 16], xmm11 + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mmla4) + testprologue mmul + testtop r11 + call mmla4 + testtail + movdqu [r10 + 0], xmm10 + movdqu [r10 + 16], xmm11 + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mont4) + testprologue mont + testtop + call mont4 + testtail + movdqu [r10 + 0], xmm10 + movdqu [r10 + 16], xmm11 + testcarryout + testepilogue +ENDFUNC + +#endif + +///----- That's all, folks -------------------------------------------------- diff --git a/math/mpx.c b/math/mpx.c index 8f1dcede..e759c5f2 100644 --- a/math/mpx.c +++ b/math/mpx.c @@ -914,12 +914,20 @@ static void simple_umul(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl, MAYBE_UMUL4(x86_sse2) #endif +#if CPUFAM_AMD64 + MAYBE_UMUL4(amd64_sse2) +#endif + static mpx_umul__functype *pick_umul(void) { #if CPUFAM_X86 DISPATCH_PICK_COND(mpx_umul, maybe_umul4_x86_sse2, cpu_feature_p(CPUFEAT_X86_SSE2)); #endif +#if CPUFAM_AMD64 + DISPATCH_PICK_COND(mpx_umul, maybe_umul4_amd64_sse2, + cpu_feature_p(CPUFEAT_X86_SSE2)); +#endif DISPATCH_PICK_FALLBACK(mpx_umul, simple_umul); } diff --git a/math/t/mpmont b/math/t/mpmont index cd2569b7..d6539636 100644 --- a/math/t/mpmont +++ b/math/t/mpmont @@ -25,6 +25,11 @@ mul { 6456542564 10807149256; + 4325987397987458979875737589783 + 1 + 4309747041023999857206910900081 + 4309747041023999857206910900081; + 6277101735386680763835789423207666416083908700390324961279 2455155546008943817740293915197451784769108058161191238065 340282366920938463500268095579187314689 -- 2.11.0