From 444083aef7e70ce9afe893a36d72e1a1a976f1ed Mon Sep 17 00:00:00 2001
From: Mark Wooding
Date: Mon, 12 Sep 2016 22:32:37 +0100
Subject: [PATCH] math/: SSE2-based high-performance multipliers.

---
 math/Makefile.am         |    3 +
 math/mpmont.c            |   89 +++-
 math/mpx-mul4-x86-sse2.S | 1224 ++++++++++++++++++++++++++++++++++++++++++++++
 math/mpx.c               |   43 +-
 math/t/mpmont            |    7 +
 5 files changed, 1354 insertions(+), 12 deletions(-)
 create mode 100644 math/mpx-mul4-x86-sse2.S

diff --git a/math/Makefile.am b/math/Makefile.am
index 908d268b..69940ba5 100644
--- a/math/Makefile.am
+++ b/math/Makefile.am
@@ -181,6 +181,9 @@ TESTS += mpx-kmul.t$(EXEEXT) mpx-ksqr.t$(EXEEXT)
 noinst_PROGRAMS += bittest
 TESTS += bittest
 EXTRA_DIST += t/mpx
+if CPUFAM_X86
+libmath_la_SOURCES += mpx-mul4-x86-sse2.S
+endif
 
 ## A quick-and-dirty parser, used for parsing descriptions of groups, fields,
 ## etc.
diff --git a/math/mpmont.c b/math/mpmont.c
index a86623b2..968766d3 100644
--- a/math/mpmont.c
+++ b/math/mpmont.c
@@ -27,6 +27,8 @@
 
 /*----- Header files ------------------------------------------------------*/
 
+#include "config.h"
+#include "dispatch.h"
 #include "mp.h"
 #include "mpmont.h"
 
@@ -58,8 +60,12 @@
  *		least %$2 n + 1$% words of result.
  */
 
-static void redccore(mpw *dv, mpw *dvl, const mpw *mv,
-		     size_t n, const mpw *mi)
+CPU_DISPATCH(static, (void), void, redccore,
+	     (mpw *dv, mpw *dvl, const mpw *mv, size_t n, const mpw *mi),
+	     (dv, dvl, mv, n, mi), pick_redccore, simple_redccore);
+
+static void simple_redccore(mpw *dv, mpw *dvl, const mpw *mv,
+			    size_t n, const mpw *mi)
 {
   mpw mi0 = *mi;
   size_t i;
@@ -70,6 +76,29 @@ static void redccore(mpw *dv, mpw *dvl, const mpw *mv,
   }
 }
 
+#define MAYBE_REDC4(impl)						\
+  extern void mpxmont_redc4_##impl(mpw *dv, mpw *dvl, const mpw *mv,	\
+				   size_t n, const mpw *mi);		\
+  static void maybe_redc4_##impl(mpw *dv, mpw *dvl, const mpw *mv,	\
+				 size_t n, const mpw *mi)		\
+  {									\
+    if (n%4) simple_redccore(dv, dvl, mv, n, mi);			\
+    else mpxmont_redc4_##impl(dv, dvl, mv, n, mi);			\
+  }
+
+#if CPUFAM_X86
+  MAYBE_REDC4(x86_sse2)
+#endif
+
+static redccore__functype *pick_redccore(void)
+{
+#if CPUFAM_X86
+  DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_x86_sse2,
+		     cpu_feature_p(CPUFEAT_X86_SSE2));
+#endif
+  DISPATCH_PICK_FALLBACK(mpmont_reduce, simple_redccore);
+}
+
 /* --- @redccore@ --- *
  *
  * Arguments:	@mpw *dv, *dvl@ = base and limit of source/destination
@@ -85,10 +114,17 @@ static void redccore(mpw *dv, mpw *dvl, const mpw *mv,
  *		Store in %$d$% the value %$a b + (m' a b \bmod R) m$%.
  */
 
-static void mulcore(mpw *dv, mpw *dvl,
-		    const mpw *av, const mpw *avl,
-		    const mpw *bv, const mpw *bvl,
-		    const mpw *mv, size_t n, const mpw *mi)
+CPU_DISPATCH(static, (void), void, mulcore,
+	     (mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
+	      const mpw *bv, const mpw *bvl, const mpw *mv,
+	      size_t n, const mpw *mi),
+	     (dv, dvl, av, avl, bv, bvl, mv, n, mi),
+	     pick_mulcore, simple_mulcore);
+
+static void simple_mulcore(mpw *dv, mpw *dvl,
+			   const mpw *av, const mpw *avl,
+			   const mpw *bv, const mpw *bvl,
+			   const mpw *mv, size_t n, const mpw *mi)
 {
   mpw ai, b0, y, mi0 = *mi;
   const mpw *tv, *tvl;
@@ -123,6 +159,38 @@ static void mulcore(mpw *dv, mpw *dvl,
   }
 }
 
+#define MAYBE_MUL4(impl)						\
+  extern void mpxmont_mul4_##impl(mpw *dv,				\
+				  const mpw *av, const mpw *bv,		\
+				  const mpw *mv,			\
+				  size_t n, const mpw *mi);		\
+  static void maybe_mul4_##impl(mpw *dv, mpw *dvl,			\
+				const mpw *av, const mpw *avl,		\
+				const mpw *bv, const mpw *bvl,		\
+				const mpw *mv, size_t n, const mpw *mi)	\
+  {									\
+    size_t an = avl - av, bn = bvl - bv;				\
+    if (n%4 || an != n || bn != n)					\
+      simple_mulcore(dv, dvl, av, avl, bv, bvl, mv, n, mi);		\
+    else {								\
+      mpxmont_mul4_##impl(dv, av, bv, mv, n, mi);			\
+      MPX_ZERO(dv + 2*n + 1, dvl);					\
+    }									\
+  }
+
+#if CPUFAM_X86
+  MAYBE_MUL4(x86_sse2)
+#endif
+
+static mulcore__functype *pick_mulcore(void)
+{
+#if CPUFAM_X86
+  DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_x86_sse2,
+		     cpu_feature_p(CPUFEAT_X86_SSE2));
+#endif
+  DISPATCH_PICK_FALLBACK(mpmont_mul, simple_mulcore);
+}
+
 /* --- @finish@ --- *
  *
  * Arguments:	@mpmont *mm@ = pointer to a Montgomery reduction context
@@ -321,13 +389,14 @@ mp *mpmont_mul(mpmont *mm, mp *d, mp *a, mp *b)
 
 mp *mpmont_mul(mpmont *mm, mp *d, mp *a, mp *b)
 {
-  if (mm->n > MPK_THRESH * 3) {
+  size_t n = mm->n;
+
+  if (n > MPK_THRESH * 3) {
     d = mp_mul(d, a, b);
     d = mpmont_reduce(mm, d, d);
   } else {
-    a = MP_COPY(a);
-    b = MP_COPY(b);
-    MP_DEST(d, 2*mm->n + 1, a->f | b->f | MP_UNDEF);
+    a = MP_COPY(a); b = MP_COPY(b);
+    MP_DEST(d, 2*n + 1, a->f | b->f | MP_UNDEF);
     mulcore(d->v, d->vl, a->v, a->vl, b->v, b->vl,
 	    mm->m->v, mm->n, mm->mi->v);
     d->f = ((a->f | b->f) & MP_BURN) | ((a->f ^ b->f) & MP_NEG);
diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S
new file mode 100644
index 00000000..c0e1a788
--- /dev/null
+++ b/math/mpx-mul4-x86-sse2.S
@@ -0,0 +1,1224 @@
+/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
+///
+/// Large SIMD-based multiplications
+///
+/// (c) 2016 Straylight/Edgeware
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// Prologue.
+
+	.arch	pentium4
+	.text
+
+///--------------------------------------------------------------------------
+/// Theory.
+///
+/// We define a number of primitive fixed-size multipliers from which we can
+/// construct more general variable-length multipliers.
+///
+/// The basic trick is the same throughout.  In an operand-scanning
+/// multiplication, the inner multiplication loop multiplies a
+/// multiple-precision operand by a single-precision factor, and adds the
+/// product, appropriately shifted, to the result.  A `finely integrated
+/// operand scanning' implementation of Montgomery multiplication also adds
+/// the product of a single-precision `Montgomery factor' and the modulus,
+/// calculated in the same pass.  The more common `coarsely integrated
+/// operand scanning' alternates main multiplication and Montgomery passes,
+/// which requires additional carry propagation.
+///
+/// In both the plain-multiplication and Montgomery stages, then, one of
+/// the factors remains constant throughout the operation, so we can afford
+/// to take a little time to preprocess it.  The transformation we perform
+/// is as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given
+/// a 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i
+/// into two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight
+/// 16-bit pieces are placed into 32-bit cells, and arranged as two 128-bit
+/// SSE operands, as follows.
+///
+///	Offset	   0	   4	   8	  12
+///	   0	 v'_0	 v'_1	v''_0	v''_1
+///	  16	 v'_2	 v'_3	v''_2	v''_3
+///
+/// A `pmuludqd' instruction ignores the odd positions in its operands;
+/// thus, it will act on (say) v'_0 and v''_0 in a single instruction.
+/// Shifting this vector right by 4 bytes brings v'_1 and v''_1 into
+/// position.  We can multiply such a vector by a full 32-bit scalar to
+/// produce two 48-bit results in 64-bit fields.  The sixteen bits of
+/// headroom allow us to add many products together before we must deal
+/// with carrying; they also allow for some calculations to be performed
+/// on the above expanded form.
+///
+/// On 32-bit x86, we are register-starved: the expanded operands are kept
+/// in memory, typically in warm L1 cache.
+///
+/// We maintain four `carry' registers accumulating intermediate results.
+/// The registers' precise roles rotate during the computation; we name
+/// them `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit
+/// halves: the register c0, for example, holds c'_0 (low half) and c''_0
+/// (high half), and represents the value c_0 = c'_0 + c''_0 b; the carry
+/// registers collectively represent the value c_0 + c_1 B + c_2 B^2 +
+/// c_3 B^3.  The `pmuludqd' instruction acting on a scalar operand
+/// (broadcast across all lanes of its vector) and an operand in the
+/// expanded form above produces a result which can be added directly to
+/// the appropriate carry register.  Following a pass of four
+/// multiplications, we perform some limited carry propagation: let
+/// t = c''_0 mod B, and let d = c'_0 + t b; then we output z = d mod B,
+/// add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry registers
+/// around, so that c1 becomes c0, and the old c0, (implicitly) zeroed,
+/// becomes c3.
+
+///--------------------------------------------------------------------------
+/// Macro definitions.
+ +.macro mulcore r, s, d0, d1, d2, d3 + // Load a word r_i from R, multiply by the expanded operand [S], and + // leave the pieces of the product in registers D0, D1, D2, D3. + movd \d0, \r // (r_i, 0, 0, 0) + .ifnes "\d1", "nil" + movdqa \d1, [\s] // (s'_0, s'_1, s''_0, s''_1) + .endif + .ifnes "\d3", "nil" + movdqa \d3, [\s + 16] // (s'_2, s'_3, s''_2, s''_3) + .endif + pshufd \d0, \d0, 0b11001100 // (r_i, ?, r_i, ?) + .ifnes "\d1", "nil" + psrldq \d1, 4 // (s'_1, s''_0, s''_1, 0) + .endif + .ifnes "\d2", "nil" + .ifnes "\d3", "nil" + movdqa \d2, \d3 // another copy of (s'_2, s'_3, ...) + .else + movdqa \d2, \d0 // another copy of (r_i, ?, r_i, ?) + .endif + .endif + .ifnes "\d3", "nil" + psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0) + .endif + .ifnes "\d1", "nil" + pmuludqd \d1, \d0 // (r_i s'_1, r_i s''_1) + .endif + .ifnes "\d3", "nil" + pmuludqd \d3, \d0 // (r_i s'_3, r_i s''_3) + .endif + .ifnes "\d2", "nil" + .ifnes "\d3", "nil" + pmuludqd \d2, \d0 // (r_i s'_2, r_i s''_2) + .else + pmuludqd \d2, [\s + 16] + .endif + .endif + pmuludqd \d0, [\s] // (r_i s'_0, r_i s''_0) +.endm + +.macro accum c0, c1, c2, c3 + paddq \c0, xmm0 + .ifnes "\c1", "nil" + paddq \c1, xmm1 + .endif + .ifnes "\c2", "nil" + paddq \c2, xmm2 + .endif + .ifnes "\c3", "nil" + paddq \c3, xmm3 + .endif +.endm + +.macro mulacc r, s, c0, c1, c2, c3, z3p + // Load a word r_i from R, multiply by the expanded operand [S], + // and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t' + // then C3 notionally contains zero, but needs clearing; in practice, + // we store the product directly rather than attempting to add. On + // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P + // is not `t'. + .ifeqs "\z3p", "t" + mulcore \r, \s, xmm0, xmm1, xmm2, \c3 + accum \c0, \c1, \c2, nil + .else + mulcore \r, \s, xmm0, xmm1, xmm2, xmm3 + accum \c0, \c1, \c2, \c3 + .endif +.endm + +.macro propout d, c, cc + // Calculate an output word from C, and store it in D; propagate + // carries out from C to CC in preparation for a rotation of the + // carry registers. On completion, XMM3 is clobbered. If CC is + // `nil', then the contribution which would have been added to it is + // left in C. + pshufd xmm3, \c, 0b10111111 // (?, ?, ?, t = c'' mod B) + psrldq xmm3, 12 // (t, 0, 0, 0) = (t, 0) + pslldq xmm3, 2 // (t b, 0) + paddq \c, xmm3 // (c' + t b, c'') + movd \d, \c + psrlq \c, 32 // floor(c/B) + .ifnes "\cc", "nil" + paddq \cc, \c // propagate up + .endif +.endm + +.macro endprop d, c, t + // On entry, C contains a carry register. On exit, the low 32 bits + // of the value represented in C are written to D, and the remaining + // bits are left at the bottom of T. + movdqa \t, \c + psllq \t, 16 // (?, c'' b) + pslldq \c, 8 // (0, c') + paddq \t, \c // (?, c' + c'' b) + psrldq \t, 8 // c' + c'' b + movd \d, \t + psrldq \t, 4 // floor((c' + c'' b)/B) +.endm + +.macro expand a, b, c, d, z + // On entry, A and C hold packed 128-bit values, and Z is zero. On + // exit, A:B and C:D together hold the same values in expanded + // form. If C is `nil', then only expand A to A:B. 
+ movdqa \b, \a // (a_0, a_1, a_2, a_3) + .ifnes "\c", "nil" + movdqa \d, \c // (c_0, c_1, c_2, c_3) + .endif + punpcklwd \a, \z // (a'_0, a''_0, a'_1, a''_1) + punpckhwd \b, \z // (a'_2, a''_2, a'_3, a''_3) + .ifnes "\c", "nil" + punpcklwd \c, \z // (c'_0, c''_0, c'_1, c''_1) + punpckhwd \d, \z // (c'_2, c''_2, c'_3, c''_3) + .endif + pshufd \a, \a, 0b11011000 // (a'_0, a'_1, a''_0, a''_1) + pshufd \b, \b, 0b11011000 // (a'_2, a'_3, a''_2, a''_3) + .ifnes "\c", "nil" + pshufd \c, \c, 0b11011000 // (c'_0, c'_1, c''_0, c''_1) + pshufd \d, \d, 0b11011000 // (c'_2, c'_3, c''_2, c''_3) + .endif +.endm + +.macro squash c0, c1, c2, c3, h, t, u + // On entry, C0, C1, C2, C3 are carry registers representing a value + // Y. On exit, C0 holds the low 128 bits of the carry value; C1, C2, + // C3, T, and U are clobbered; and the high bits of Y are stored in + // H, if this is not `nil'. + + // The first step is to eliminate the `double-prime' pieces -- i.e., + // the ones offset by 16 bytes from a 32-bit boundary -- by carrying + // them into the 32-bit-aligned pieces above and below. But before + // we can do that, we must gather them together. + movdqa \t, \c0 + movdqa \u, \c1 + punpcklqdq \t, \c2 // (y'_0, y'_2) + punpckhqdq \c0, \c2 // (y''_0, y''_2) + punpcklqdq \u, \c3 // (y'_1, y'_3) + punpckhqdq \c1, \c3 // (y''_1, y''_3) + + // Now split the double-prime pieces. The high (up to) 48 bits will + // go up; the low 16 bits go down. + movdqa \c2, \c0 + movdqa \c3, \c1 + psllq \c2, 48 + psllq \c3, 48 + psrlq \c0, 16 // high parts of (y''_0, y''_2) + psrlq \c1, 16 // high parts of (y''_1, y''_3) + psrlq \c2, 32 // low parts of (y''_0, y''_2) + psrlq \c3, 32 // low parts of (y''_1, y''_3) + .ifnes "\h", "nil" + movdqa \h, \c1 + .endif + pslldq \c1, 8 // high part of (0, y''_1) + + paddq \t, \c2 // propagate down + paddq \u, \c3 + paddq \t, \c1 // and up: (y_0, y_2) + paddq \u, \c0 // (y_1, y_3) + .ifnes "\h", "nil" + psrldq \h, 8 // high part of (y''_3, 0) + .endif + + // Finally extract the answer. This complicated dance is better than + // storing to memory and loading, because the piecemeal stores + // inhibit store forwarding. + movdqa \c3, \t // (y_0, y_1) + movdqa \c0, \t // (y^*_0, ?, ?, ?) + psrldq \t, 8 // (y_2, 0) + psrlq \c3, 32 // (floor(y_0/B), ?) + paddq \c3, \u // (y_1 + floor(y_0/B), ?) + pslldq \c0, 12 // (0, 0, 0, y^*_0) + movdqa \c1, \c3 // (y^*_1, ?, ?, ?) + psrldq \u, 8 // (y_3, 0) + psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2, ?) + paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2, ?) + pslldq \c1, 12 // (0, 0, 0, y^*_1) + psrldq \c0, 12 // (y^*_0, 0, 0, 0) + movdqa \c2, \c3 // (y^*_2, ?, ?, ?) + psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?) + paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?) + pslldq \c2, 12 // (0, 0, 0, y^*_2) + psrldq \c1, 8 // (0, y^*_1, 0, 0) + psrldq \c2, 4 // (0, 0, y^*_2, 0) + .ifnes "\h", "nil" + movdqu \t, \c3 + pxor \u, \u + .endif + pslldq \c3, 12 // (0, 0, 0, y^*_3) + por \c0, \c1 // (y^*_0, y^*_1, 0, 0) + por \c2, \c3 // (0, 0, y^*_2, y^*_3) + por \c0, \c2 // y mod B^4 + .ifnes "\h", "nil" + psrlq \t, 32 // very high bits of y + paddq \h, \t + punpcklqdq \h, \u // carry up + .endif +.endm + +.macro carryadd + // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6 + // hold the incoming carry registers c0, c1, and c2 representing a + // carry-in C. + // + // On exit, the carry registers, including XMM7, are updated to hold + // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other + // registers are preserved. 
+ movd xmm0, [edi + 0] // (a_0, 0) + movd xmm1, [edi + 4] // (a_1, 0) + movd xmm2, [edi + 8] // (a_2, 0) + movd xmm7, [edi + 12] // (a_3, 0) + paddq xmm4, xmm0 // (c'_0 + a_0, c''_0) + paddq xmm5, xmm1 // (c'_1 + a_1, c''_1) + paddq xmm6, xmm2 // (c'_2 + a_2, c''_2 + a_3 b) +.endm + +///-------------------------------------------------------------------------- +/// Primitive multipliers and related utilities. + + .p2align 4 +carryprop: + // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded + // form. Store the low 128 bits of the represented carry to [EDI] as + // a packed 128-bit value, and leave the remaining 16 bits in the low + // 32 bits of XMM4. On exit, XMM3, XMM5 and XMM6 are clobbered. + propout [edi + 0], xmm4, xmm5 + propout [edi + 4], xmm5, xmm6 + propout [edi + 8], xmm6, nil + endprop [edi + 12], xmm6, xmm4 + ret + + .p2align 4 +dmul4: + // On entry, EDI points to the destination buffer; EAX and EBX point + // to the packed operands U and X; ECX and EDX point to the expanded + // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry + // registers c0, c1, and c2; c3 is assumed to be zero. + // + // On exit, we write the low 128 bits of the sum C + U V + X Y to + // [EDI], and update the carry registers with the carry out. The + // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the + // general-purpose registers are preserved. + mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + propout [edi + 0], xmm4, xmm5 + + mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t + mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil + propout [edi + 4], xmm5, xmm6 + + mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t + mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil + propout [edi + 8], xmm6, xmm7 + + mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t + mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil + propout [edi + 12], xmm7, xmm4 + + ret + + .p2align 4 +dmla4: + // On entry, EDI points to the destination buffer, which also + // contains an addend A to accumulate; EAX and EBX point to the + // packed operands U and X; ECX and EDX point to the expanded + // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry + // registers c0, c1, and c2 representing a carry-in C; c3 is assumed + // to be zero. + // + // On exit, we write the low 128 bits of the sum A + C + U V + X Y to + // [EDI], and update the carry registers with the carry out. The + // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the + // general-purpose registers are preserved. + carryadd + + mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + propout [edi + 0], xmm4, xmm5 + + mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t + mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil + propout [edi + 4], xmm5, xmm6 + + mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t + mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil + propout [edi + 8], xmm6, xmm7 + + mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t + mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil + propout [edi + 12], xmm7, xmm4 + + ret + + .p2align 4 +mul4zc: + // On entry, EDI points to the destination buffer; EBX points to a + // packed operand X; and EDX points to an expanded operand Y. + // + // On exit, we write the low 128 bits of the product X Y to [EDI], + // and set the carry registers XMM4, XMM5, XMM6 to the carry out. 
+ // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the + // general-purpose registers are preserved. + mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 + propout [edi + 0], xmm4, xmm5 + + mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t + propout [edi + 4], xmm5, xmm6 + + mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t + propout [edi + 8], xmm6, xmm7 + + mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t + propout [edi + 12], xmm7, xmm4 + + ret + + .p2align 4 +mul4: + // On entry, EDI points to the destination buffer; EBX points to a + // packed operand X; EDX points to an expanded operand Y; and XMM4, + // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2, + // representing a carry-in C; c3 is assumed to be zero. + // + // On exit, we write the low 128 bits of the sum C + X Y to [EDI], + // and update the carry registers with the carry out. The registers + // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the + // general-purpose registers are preserved. + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t + propout [edi + 0], xmm4, xmm5 + + mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t + propout [edi + 4], xmm5, xmm6 + + mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t + propout [edi + 8], xmm6, xmm7 + + mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t + propout [edi + 12], xmm7, xmm4 + + ret + + .p2align 4 +mla4zc: + // On entry, EDI points to the destination buffer, which also + // contains an addend A to accumulate; EBX points to a packed operand + // X; and EDX points to an expanded operand Y. + // + // On exit, we write the low 128 bits of the sum A + X Y to [EDI], + // and set the carry registers XMM4, XMM5, XMM6 to the carry out. + // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the + // general-purpose registers are preserved. + movd xmm4, [edi + 0] + movd xmm5, [edi + 4] + movd xmm6, [edi + 8] + movd xmm7, [edi + 12] + + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + propout [edi + 0], xmm4, xmm5 + + mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t + propout [edi + 4], xmm5, xmm6 + + mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t + propout [edi + 8], xmm6, xmm7 + + mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t + propout [edi + 12], xmm7, xmm4 + + ret + + .p2align 4 +mla4: + // On entry, EDI points to the destination buffer, which also + // contains an addend A to accumulate; EBX points to a packed operand + // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold + // the incoming carry registers c0, c1, and c2, representing a + // carry-in C; c3 is assumed to be zero. + // + // On exit, we write the low 128 bits of the sum A + C + X Y to + // [EDI], and update the carry registers with the carry out. The + // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the + // general-purpose registers are preserved. + carryadd + + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + propout [edi + 0], xmm4, xmm5 + + mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t + propout [edi + 4], xmm5, xmm6 + + mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t + propout [edi + 8], xmm6, xmm7 + + mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t + propout [edi + 12], xmm7, xmm4 + + ret + + .p2align 4 +mmul4: + // On entry, EDI points to the destination buffer; EAX and EBX point + // to the packed operands U and N; ECX and ESI point to the expanded + // operands V and M; and EDX points to a place to store an expanded + // result Y (32 bytes, at a 16-byte boundary). The stack pointer + // must be 16-byte aligned. 
(This is not the usual convention, which + // requires alignment before the call.) + // + // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits + // of the sum U V + N Y to [EDI], leaving the remaining carry in + // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and + // XMM7 are clobbered; the general-purpose registers are preserved. + sub esp, 64 // space for the carries + + // Calculate W = U V, and leave it in the destination. Stash the + // carry pieces for later. + mulcore [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7 + propout [edi + 0], xmm4, xmm5 + jmp 5f + + .p2align 4 +mmla4: + // On entry, EDI points to the destination buffer, which also + // contains an addend A to accumulate; EAX and EBX point + // to the packed operands U and N; ECX and ESI point to the expanded + // operands V and M; and EDX points to a place to store an expanded + // result Y (32 bytes, at a 16-byte boundary). The stack pointer + // must be 16-byte aligned. (This is not the usual convention, which + // requires alignment before the call.) + // + // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128 + // bits of the sum A + U V + N Y to [EDI], leaving the remaining + // carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, + // XMM3, and XMM7 are clobbered; the general-purpose registers are + // preserved. + sub esp, 64 // space for the carries + movd xmm4, [edi + 0] + movd xmm5, [edi + 4] + movd xmm6, [edi + 8] + movd xmm7, [edi + 12] + mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil + propout [edi + 0], xmm4, xmm5 + +5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t + propout [edi + 4], xmm5, xmm6 + + mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t + propout [edi + 8], xmm6, xmm7 + + mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t + propout [edi + 12], xmm7, xmm4 + + movdqa [esp + 0], xmm4 + movdqa [esp + 16], xmm5 + movdqa [esp + 32], xmm6 + + // Calculate Y = W M. + mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7 + + mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil + accum xmm5, xmm6, xmm7, nil + + mulcore [edi + 8], esi, xmm0, xmm1, nil, nil + accum xmm6, xmm7, nil, nil + + mulcore [edi + 12], esi, xmm0, nil, nil, nil + accum xmm7, nil, nil, nil + + // That's lots of pieces. Now we have to assemble the answer. + squash xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1 + + // Expand it. + pxor xmm2, xmm2 + expand xmm4, xmm1, nil, nil, xmm2 + movdqa [edx + 0], xmm4 + movdqa [edx + 16], xmm1 + + // Initialize the carry from the value for W we calculated earlier. + movd xmm4, [edi + 0] + movd xmm5, [edi + 4] + movd xmm6, [edi + 8] + movd xmm7, [edi + 12] + + // Finish the calculation by adding the Montgomery product. + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + propout [edi + 0], xmm4, xmm5 + + mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t + propout [edi + 4], xmm5, xmm6 + + mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t + propout [edi + 8], xmm6, xmm7 + + mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t + propout [edi + 12], xmm7, xmm4 + + // Add add on the carry we calculated earlier. + paddq xmm4, [esp + 0] + paddq xmm5, [esp + 16] + paddq xmm6, [esp + 32] + + // And, with that, we're done. + add esp, 64 + ret + + .p2align 4 +mont4: + // On entry, EDI points to the destination buffer holding a packed + // value A; EBX points to a packed operand N; ESI points to an + // expanded operand M; and EDX points to a place to store an expanded + // result Y (32 bytes, at a 16-byte boundary). 
+ // + // On exit, we write Y = W M mod B to [EDX], and the low 128 bits + // of the sum W + N Y to [EDI], leaving the remaining carry in + // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and + // XMM7 are clobbered; the general-purpose registers are preserved. + + // Calculate Y = W M. + mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7 + + mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil + accum xmm5, xmm6, xmm7, nil + + mulcore [edi + 8], esi, xmm0, xmm1, nil, nil + accum xmm6, xmm7, nil, nil + + mulcore [edi + 12], esi, xmm0, nil, nil, nil + accum xmm7, nil, nil, nil + + // That's lots of pieces. Now we have to assemble the answer. + squash xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1 + + // Expand it. + pxor xmm2, xmm2 + expand xmm4, xmm1, nil, nil, xmm2 + movdqa [edx + 0], xmm4 + movdqa [edx + 16], xmm1 + + // Initialize the carry from W. + movd xmm4, [edi + 0] + movd xmm5, [edi + 4] + movd xmm6, [edi + 8] + movd xmm7, [edi + 12] + + // Finish the calculation by adding the Montgomery product. + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + propout [edi + 0], xmm4, xmm5 + + mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t + propout [edi + 4], xmm5, xmm6 + + mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t + propout [edi + 8], xmm6, xmm7 + + mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t + propout [edi + 12], xmm7, xmm4 + + // And, with that, we're done. + ret + +///-------------------------------------------------------------------------- +/// Bulk multipliers. + +FUNC(mpx_umul4_x86_sse2) + // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl, + // const mpw *bv, const mpw *bvl); + + // Build a stack frame. Arguments will be relative to EBP, as + // follows. + // + // ebp + 20 dv + // ebp + 24 av + // ebp + 28 avl + // ebp + 32 bv + // ebp + 36 bvl + // + // Locals are relative to ESP, as follows. + // + // esp + 0 expanded Y (32 bytes) + // esp + 32 (top of locals) + push ebp + push ebx + push esi + push edi + mov ebp, esp + and esp, ~15 + sub esp, 32 + + // Prepare for the first iteration. + mov esi, [ebp + 32] // -> bv[0] + pxor xmm7, xmm7 + movdqu xmm0, [esi] // bv[0] + mov edi, [ebp + 20] // -> dv[0] + mov ecx, edi // outer loop dv cursor + expand xmm0, xmm1, nil, nil, xmm7 + mov ebx, [ebp + 24] // -> av[0] + mov eax, [ebp + 28] // -> av[m] = av limit + mov edx, esp // -> expanded Y = bv[0] + movdqa [esp + 0], xmm0 // bv[0] expanded low + movdqa [esp + 16], xmm1 // bv[0] expanded high + call mul4zc + add ebx, 16 + add edi, 16 + add ecx, 16 + add esi, 16 + cmp ebx, eax // all done? + jae 8f + + .p2align 4 + // Continue with the first iteration. +0: call mul4 + add ebx, 16 + add edi, 16 + cmp ebx, eax // all done? + jb 0b + + // Write out the leftover carry. There can be no tail here. +8: call carryprop + cmp esi, [ebp + 36] // more passes to do? + jae 9f + + .p2align 4 + // Set up for the next pass. +1: movdqu xmm0, [esi] // bv[i] + mov edi, ecx // -> dv[i] + pxor xmm7, xmm7 + expand xmm0, xmm1, nil, nil, xmm7 + mov ebx, [ebp + 24] // -> av[0] + movdqa [esp + 0], xmm0 // bv[i] expanded low + movdqa [esp + 16], xmm1 // bv[i] expanded high + call mla4zc + add edi, 16 + add ebx, 16 + add ecx, 16 + add esi, 16 + cmp ebx, eax // done yet? + jae 8f + + .p2align 4 + // Continue... +0: call mla4 + add ebx, 16 + add edi, 16 + cmp ebx, eax + jb 0b + + // Finish off this pass. There was no tail on the previous pass, and + // there can be none on this pass. +8: call carryprop + cmp esi, [ebp + 36] + jb 1b + + // All over. 
+9: mov esp, ebp + pop edi + pop esi + pop ebx + pop ebp + ret + +ENDFUNC + +FUNC(mpxmont_mul4_x86_sse2) + // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv, + // const mpw *nv, size_t n, const mpw *mi); + + // Build a stack frame. Arguments will be relative to EBP, as + // follows. + // + // ebp + 20 dv + // ebp + 24 av + // ebp + 28 bv + // ebp + 32 nv + // ebp + 36 n (nonzero multiple of 4) + // ebp + 40 mi + // + // Locals are relative to ESP, which is 4 mod 16, as follows. + // + // esp + 0 outer loop dv + // esp + 4 outer loop bv + // esp + 8 av limit (mostly in ESI) + // esp + 12 expanded V (32 bytes) + // esp + 44 expanded M (32 bytes) + // esp + 76 expanded Y (32 bytes) + // esp + 108 bv limit + // esp + 112 (gap) + // esp + 124 (top of locals) + push ebp + push ebx + push esi + push edi + mov ebp, esp + and esp, ~15 + sub esp, 124 + + // Establish the expanded operands. + pxor xmm7, xmm7 + mov ecx, [ebp + 28] // -> bv + mov edx, [ebp + 40] // -> mi + movdqu xmm0, [ecx] // bv[0] + movdqu xmm2, [edx] // mi + expand xmm0, xmm1, xmm2, xmm3, xmm7 + movdqa [esp + 12], xmm0 // bv[0] expanded low + movdqa [esp + 28], xmm1 // bv[0] expanded high + movdqa [esp + 44], xmm2 // mi expanded low + movdqa [esp + 60], xmm3 // mi expanded high + + // Set up the outer loop state and prepare for the first iteration. + mov edx, [ebp + 36] // n + mov eax, [ebp + 24] // -> U = av[0] + mov ebx, [ebp + 32] // -> X = nv[0] + mov edi, [ebp + 20] // -> Z = dv[0] + mov [esp + 4], ecx + lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit + lea edx, [eax + 4*edx] // -> av[n/4] = av limit + mov [esp + 0], edi + mov [esp + 108], ecx + mov [esp + 8], edx + lea ecx, [esp + 12] // -> expanded V = bv[0] + lea esi, [esp + 44] // -> expanded M = mi + lea edx, [esp + 76] // -> space for Y + call mmul4 + mov esi, [esp + 8] // recover av limit + add edi, 16 + add eax, 16 + add ebx, 16 + cmp eax, esi // done already? + jae 8f + mov [esp + 0], edi + + .p2align 4 + // Complete the first inner loop. +0: call dmul4 + add edi, 16 + add eax, 16 + add ebx, 16 + cmp eax, esi // done yet? + jb 0b + + // Still have carries left to propagate. + call carryprop + movd [edi + 16], xmm4 + + .p2align 4 + // Embark on the next iteration. (There must be one. If n = 1, then + // we would have bailed above, to label 8. Similarly, the subsequent + // iterations can fall into the inner loop immediately.) +1: mov eax, [esp + 4] // -> bv[i - 1] + mov edi, [esp + 0] // -> Z = dv[i] + add eax, 16 // -> bv[i] + pxor xmm7, xmm7 + movdqu xmm0, [eax] // bv[i] + mov [esp + 4], eax + cmp eax, [esp + 108] // done yet? + jae 9f + mov ebx, [ebp + 32] // -> X = nv[0] + lea esi, [esp + 44] // -> expanded M = mi + mov eax, [ebp + 24] // -> U = av[0] + expand xmm0, xmm1, nil, nil, xmm7 + movdqa [esp + 12], xmm0 // bv[i] expanded low + movdqa [esp + 28], xmm1 // bv[i] expanded high + call mmla4 + mov esi, [esp + 8] // recover av limit + add edi, 16 + add eax, 16 + add ebx, 16 + mov [esp + 0], edi + + .p2align 4 + // Complete the next inner loop. +0: call dmla4 + add edi, 16 + add eax, 16 + add ebx, 16 + cmp eax, esi + jb 0b + + // Still have carries left to propagate, and they overlap the + // previous iteration's final tail, so read that in and add it. + movd xmm0, [edi] + paddq xmm4, xmm0 + call carryprop + movd [edi + 16], xmm4 + + // Back again. + jmp 1b + + // First iteration was short. Write out the carries and we're done. + // (This could be folded into the main loop structure, but that would + // penalize small numbers more.) 
+8: call carryprop + movd [edi + 16], xmm4 + + // All done. +9: mov esp, ebp + pop edi + pop esi + pop ebx + pop ebp + ret + +ENDFUNC + +FUNC(mpxmont_redc4_x86_sse2) + // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv, + // size_t n, const mpw *mi); + + // Build a stack frame. Arguments will be relative to EBP, as + // follows. + // + // ebp + 20 dv + // ebp + 24 dvl + // ebp + 28 nv + // ebp + 32 n (nonzero multiple of 4) + // ebp + 36 mi + // + // Locals are relative to ESP, as follows. + // + // esp + 0 outer loop dv + // esp + 4 outer dv limit + // esp + 8 blocks-of-4 dv limit + // esp + 12 expanded M (32 bytes) + // esp + 44 expanded Y (32 bytes) + // esp + 76 (top of locals) + push ebp + push ebx + push esi + push edi + mov ebp, esp + and esp, ~15 + sub esp, 76 + + // Establish the expanded operands and the blocks-of-4 dv limit. + mov edi, [ebp + 20] // -> Z = dv[0] + pxor xmm7, xmm7 + mov eax, [ebp + 24] // -> dv[n] = dv limit + sub eax, edi // length of dv in bytes + mov edx, [ebp + 36] // -> mi + movdqu xmm0, [edx] // mi + and eax, ~15 // mask off the tail end + expand xmm0, xmm1, nil, nil, xmm7 + add eax, edi // find limit + movdqa [esp + 12], xmm0 // mi expanded low + movdqa [esp + 28], xmm1 // mi expanded high + mov [esp + 8], eax + + // Set up the outer loop state and prepare for the first iteration. + mov ecx, [ebp + 32] // n + mov ebx, [ebp + 28] // -> X = nv[0] + lea edx, [edi + 4*ecx] // -> dv[n/4] = outer dv limit + lea ecx, [ebx + 4*ecx] // -> nv[n/4] = nv limit + mov [esp + 0], edi + mov [esp + 4], edx + lea esi, [esp + 12] // -> expanded M = mi + lea edx, [esp + 44] // -> space for Y + call mont4 + add edi, 16 + add ebx, 16 + cmp ebx, ecx // done already? + jae 8f + + .p2align 4 + // Complete the first inner loop. +5: call mla4 + add ebx, 16 + add edi, 16 + cmp ebx, ecx // done yet? + jb 5b + + // Still have carries left to propagate. +8: carryadd + mov esi, [esp + 8] // -> dv blocks limit + mov edx, [ebp + 24] // dv limit + psllq xmm7, 16 + pslldq xmm7, 8 + paddq xmm6, xmm7 + call carryprop + movd eax, xmm4 + add edi, 16 + cmp edi, esi + jae 7f + + .p2align 4 + // Continue carry propagation until the end of the buffer. +0: add [edi], eax + mov eax, 0 // preserves flags + adcd [edi + 4], 0 + adcd [edi + 8], 0 + adcd [edi + 12], 0 + adc eax, 0 + add edi, 16 + cmp edi, esi + jb 0b + + // Deal with the tail end. +7: add [edi], eax + mov eax, 0 // preserves flags + add edi, 4 + adc eax, 0 + cmp edi, edx + jb 7b + + // All done for this iteration. Start the next. (This must have at + // least one follow-on iteration, or we'd not have started this outer + // loop.) +8: mov edi, [esp + 0] // -> dv[i - 1] + mov ebx, [ebp + 28] // -> X = nv[0] + lea edx, [esp + 44] // -> space for Y + lea esi, [esp + 12] // -> expanded M = mi + add edi, 16 // -> Z = dv[i] + cmp edi, [esp + 4] // all done yet? + jae 9f + mov [esp + 0], edi + call mont4 + add edi, 16 + add ebx, 16 + jmp 5b + + // All over. +9: mov esp, ebp + pop edi + pop esi + pop ebx + pop ebp + ret + +ENDFUNC + +///-------------------------------------------------------------------------- +/// Testing and performance measurement. 
+ +#ifdef TEST_MUL4 + +.macro cysetup c + rdtsc + mov [\c], eax + mov [\c + 4], edx +.endm + +.macro cystore c, v, n + rdtsc + sub eax, [\c] + sbb edx, [\c + 4] + mov ebx, [\v] + mov ecx, [\n] + dec ecx + mov [\n], ecx + mov [ebx + ecx*8], eax + mov [ebx + ecx*8 + 4], edx +.endm + +.macro testprologue + push ebp + push ebx + push esi + push edi + mov ebp, esp + and esp, ~15 + sub esp, 3*32 + 12 + // vars: + // esp + 0 = cycles + // esp + 12 = v expanded + // esp + 44 = y expanded + // esp + 72 = ? expanded +.endm + +.macro testepilogue + mov esp, ebp + pop edi + pop esi + pop ebx + pop ebp + ret +.endm + +.macro testldcarry c + mov ecx, \c // -> c + movdqu xmm4, [ecx + 0] // (c'_0, c''_0) + movdqu xmm5, [ecx + 16] // (c'_1, c''_1) + movdqu xmm6, [ecx + 32] // (c'_2, c''_2) +.endm + +.macro testexpand v, y + pxor xmm7, xmm7 + .ifnes "\v", "nil" + mov ecx, \v + movdqu xmm0, [ecx] + expand xmm0, xmm1, nil, nil, xmm7 + movdqa [esp + 12], xmm0 + movdqa [esp + 28], xmm1 + .endif + .ifnes "\y", "nil" + mov edx, \y + movdqu xmm2, [edx] + expand xmm2, xmm3, nil, nil, xmm7 + movdqa [esp + 44], xmm2 + movdqa [esp + 60], xmm3 + .endif +.endm + +.macro testtop u, x, mode + .p2align 4 +0: + .ifnes "\u", "nil" + lea ecx, [esp + 12] + .endif + mov ebx, \x + .ifeqs "\mode", "mont" + lea esi, [esp + 44] + .endif + cysetup esp + 0 + .ifnes "\u", "nil" + mov eax, \u + .endif + .ifeqs "\mode", "mont" + lea edx, [esp + 76] + .else + lea edx, [esp + 44] + .endif +.endm + +.macro testtail cyv, n + cystore esp + 0, \cyv, \n + jnz 0b +.endm + +.macro testcarryout c + mov ecx, \c + movdqu [ecx + 0], xmm4 + movdqu [ecx + 16], xmm5 + movdqu [ecx + 32], xmm6 +.endm + + .globl test_dmul4 +test_dmul4: + testprologue + testldcarry [ebp + 24] + testexpand [ebp + 36], [ebp + 40] + mov edi, [ebp + 20] + testtop [ebp + 28], [ebp + 32] + call dmul4 + testtail [ebp + 48], [ebp + 44] + testcarryout [ebp + 24] + testepilogue + + .globl test_dmla4 +test_dmla4: + testprologue + testldcarry [ebp + 24] + testexpand [ebp + 36], [ebp + 40] + mov edi, [ebp + 20] + testtop [ebp + 28], [ebp + 32] + call dmla4 + testtail [ebp + 48], [ebp + 44] + testcarryout [ebp + 24] + testepilogue + + .globl test_mul4 +test_mul4: + testprologue + testldcarry [ebp + 24] + testexpand nil, [ebp + 32] + mov edi, [ebp + 20] + testtop nil, [ebp + 28] + call mul4 + testtail [ebp + 40], [ebp + 36] + testcarryout [ebp + 24] + testepilogue + + .globl test_mla4 +test_mla4: + testprologue + testldcarry [ebp + 24] + testexpand nil, [ebp + 32] + mov edi, [ebp + 20] + testtop nil, [ebp + 28] + call mla4 + testtail [ebp + 40], [ebp + 36] + testcarryout [ebp + 24] + testepilogue + + .globl test_mmul4 +test_mmul4: + testprologue + testexpand [ebp + 40], [ebp + 44] + mov edi, [ebp + 20] + testtop [ebp + 32], [ebp + 36], mont + call mmul4 + testtail [ebp + 52], [ebp + 48] + mov edi, [ebp + 28] + movdqa xmm0, [esp + 76] + movdqa xmm1, [esp + 92] + movdqu [edi], xmm0 + movdqu [edi + 16], xmm1 + testcarryout [ebp + 24] + testepilogue + + .globl test_mmla4 +test_mmla4: + testprologue + testexpand [ebp + 40], [ebp + 44] + mov edi, [ebp + 20] + testtop [ebp + 32], [ebp + 36], mont + call mmla4 + testtail [ebp + 52], [ebp + 48] + mov edi, [ebp + 28] + movdqa xmm0, [esp + 76] + movdqa xmm1, [esp + 92] + movdqu [edi], xmm0 + movdqu [edi + 16], xmm1 + testcarryout [ebp + 24] + testepilogue + + .globl test_mont4 +test_mont4: + testprologue + testexpand nil, [ebp + 36] + mov edi, [ebp + 20] + testtop nil, [ebp + 32], mont + call mont4 + testtail [ebp + 44], [ebp + 40] + mov edi, [ebp 
+ 28]
+	movdqa	xmm0, [esp + 76]
+	movdqa	xmm1, [esp + 92]
+	movdqu	[edi], xmm0
+	movdqu	[edi + 16], xmm1
+	testcarryout [ebp + 24]
+	testepilogue
+
+#endif
+
+///----- That's all, folks --------------------------------------------------
diff --git a/math/mpx.c b/math/mpx.c
index 2745fe0f..8f1dcede 100644
--- a/math/mpx.c
+++ b/math/mpx.c
@@ -27,6 +27,8 @@
 
 /*----- Header files ------------------------------------------------------*/
 
+#include "config.h"
+
 #include 
 #include 
 #include 
@@ -35,6 +37,7 @@
 #include 
 #include 
 
+#include "dispatch.h"
 #include "mptypes.h"
 #include "mpx.h"
 #include "bitops.h"
@@ -845,8 +848,13 @@ void mpx_usubnlsl(mpw *dv, mpw *dvl, mpw a, unsigned o)
  *		vectors in any way.
  */
 
-void mpx_umul(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
-	      const mpw *bv, const mpw *bvl)
+CPU_DISPATCH(EMPTY, (void), void, mpx_umul,
+	     (mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
+	      const mpw *bv, const mpw *bvl),
+	     (dv, dvl, av, avl, bv, bvl), pick_umul, simple_umul);
+
+static void simple_umul(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
+			const mpw *bv, const mpw *bvl)
 {
   /* --- This is probably worthwhile on a multiply --- */
 
@@ -885,6 +893,36 @@ void mpx_umul(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
   }
 }
 
+#define MAYBE_UMUL4(impl)						\
+  extern void mpx_umul4_##impl(mpw */*dv*/,				\
+			       const mpw */*av*/, const mpw */*avl*/,	\
+			       const mpw */*bv*/, const mpw */*bvl*/);	\
+  static void maybe_umul4_##impl(mpw *dv, mpw *dvl,			\
+				 const mpw *av, const mpw *avl,		\
+				 const mpw *bv, const mpw *bvl)		\
+  {									\
+    size_t an = avl - av, bn = bvl - bv, dn = dvl - dv;		\
+    if (!an || an%4 != 0 || !bn || bn%4 != 0 || dn < an + bn)		\
+      simple_umul(dv, dvl, av, avl, bv, bvl);				\
+    else {								\
+      mpx_umul4_##impl(dv, av, avl, bv, bvl);				\
+      MPX_ZERO(dv + an + bn, dvl);					\
+    }									\
+  }
+
+#if CPUFAM_X86
+  MAYBE_UMUL4(x86_sse2)
+#endif
+
+static mpx_umul__functype *pick_umul(void)
+{
+#if CPUFAM_X86
+  DISPATCH_PICK_COND(mpx_umul, maybe_umul4_x86_sse2,
+		     cpu_feature_p(CPUFEAT_X86_SSE2));
+#endif
+  DISPATCH_PICK_FALLBACK(mpx_umul, simple_umul);
+}
+
 /* --- @mpx_umuln@ --- *
  *
  * Arguments:	@mpw *dv, *dvl@ = destination vector base and limit
@@ -1220,6 +1258,7 @@ mpw mpx_udivn(mpw *qv, mpw *qvl, const mpw *rv, const mpw *rvl, mpw d)
   size_t _sz = (sz);							\
   mpw *_vv = xmalloc(MPWS(_sz));					\
   mpw *_vvl = _vv + _sz;						\
+  memset(_vv, 0xa5, MPWS(_sz));						\
   (v) = _vv;								\
   (vl) = _vvl;								\
 } while (0)
diff --git a/math/t/mpmont b/math/t/mpmont
index 81fbe634..cd2569b7 100644
--- a/math/t/mpmont
+++ b/math/t/mpmont
@@ -1,5 +1,12 @@
 # Test vectors for Montgomery reduction
 
+mul {
+  6277101735386680763835789423207666416083908700390324961279
+  2455155546008943817740293915197451784769108058161191238065
+  340282366920938463500268095579187314689
+  5646741895976341600220572388698743135318229029826089708489;
+}
+
 create {
   340809809850981098423498794792349		# m
   266454859					# -m^{-1} mod b
-- 
2.11.0
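
The dispatch pattern used throughout the patch (CPU_DISPATCH, DISPATCH_PICK_COND,
DISPATCH_PICK_FALLBACK) comes from Catacomb's dispatch.h.  For readers unfamiliar
with it, here is a minimal, self-contained C sketch of the general idea -- a lazily
chosen function pointer plus a per-call guard on operand shape.  This is not part of
the patch, and every name in it (cpu_has_sse2, sse2_umul4, and friends) is an
illustrative stand-in, not Catacomb's real API.

#include <stddef.h>

typedef unsigned int mpw;

typedef void umul_fn(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
                     const mpw *bv, const mpw *bvl);

static void portable_umul(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
                          const mpw *bv, const mpw *bvl)
  { /* portable schoolbook multiply would go here */ }

static void sse2_umul4(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
                       const mpw *bv, const mpw *bvl)
  { /* call into the 4x4-word assembler kernel would go here */ }

static int cpu_has_sse2(void)
  { return 0; /* CPUID probe would go here */ }

static umul_fn *umul_impl;              /* cached choice, set on first use */

static void umul(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
                 const mpw *bv, const mpw *bvl)
{
  size_t an = avl - av, bn = bvl - bv, dn = dvl - dv;

  if (!umul_impl)                       /* first call: pick an implementation */
    umul_impl = cpu_has_sse2() ? sse2_umul4 : portable_umul;

  /* The SSE2 path wants nonzero multiples of four words and room for the
   * full an + bn word product; anything else takes the portable path. */
  if (umul_impl == sse2_umul4 &&
      (!an || an%4 || !bn || bn%4 || dn < an + bn))
    portable_umul(dv, dvl, av, avl, bv, bvl);
  else
    umul_impl(dv, dvl, av, avl, bv, bvl);
}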