From: Mark Wooding Date: Mon, 4 Nov 2019 12:22:00 +0000 (+0000) Subject: math/mpx-mul4-{arm-neon,arm64-simd}.S, etc.: Add ARM versions of `mul4'. X-Git-Tag: 2.6.0~9 X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/commitdiff_plain/ea1b3cec199052eda3a0054d86c70e948c6e7580 math/mpx-mul4-{arm-neon,arm64-simd}.S, etc.: Add ARM versions of `mul4'. With this, I think we (finally) have parity across the various premier target platforms. --- diff --git a/debian/catacomb2.symbols b/debian/catacomb2.symbols index 23ca209b..84ff8938 100644 --- a/debian/catacomb2.symbols +++ b/debian/catacomb2.symbols @@ -128,14 +128,20 @@ libcatacomb.so.2 catacomb2 #MINVER# (optional|arch=i386)mpx_umul4_x86_avx@Base 2.3.0 (optional|arch=amd64)mpx_umul4_amd64_sse2@Base 2.3.0 (optional|arch=amd64)mpx_umul4_amd64_avx@Base 2.3.0 + (optional|arch=armel armhf)mpx_umul4_arm_neon@Base 2.5.99~ + (optional|arch=arm64)mpx_umul4_arm64_simd@Base 2.5.99~ (optional|arch=i386)mpxmont_mul4_x86_sse2@Base 2.3.0 (optional|arch=i386)mpxmont_mul4_x86_avx@Base 2.3.0 (optional|arch=amd64)mpxmont_mul4_amd64_sse2@Base 2.3.0 (optional|arch=amd64)mpxmont_mul4_amd64_avx@Base 2.3.0 + (optional|arch=armel armhf)mpxmont_mul4_arm_neon@Base 2.5.99~ + (optional|arch=arm64)mpxmont_mul4_arm64_simd@Base 2.5.99~ (optional|arch=i386)mpxmont_redc4_x86_sse2@Base 2.3.0 (optional|arch=i386)mpxmont_redc4_x86_avx@Base 2.3.0 (optional|arch=amd64)mpxmont_redc4_amd64_sse2@Base 2.3.0 (optional|arch=amd64)mpxmont_redc4_amd64_avx@Base 2.3.0 + (optional|arch=armel armhf)mpxmont_redc4_arm_neon@Base 2.5.99~ + (optional|arch=arm64)mpxmont_redc4_arm64_simd@Base 2.5.99~ ## mparena mparena_create@Base 2.0.0 diff --git a/math/Makefile.am b/math/Makefile.am index 7c4ffed4..37d88d1a 100644 --- a/math/Makefile.am +++ b/math/Makefile.am @@ -192,6 +192,16 @@ MPX_MUL4_SOURCES = mpx-mul4-amd64-sse2.S check_PROGRAMS += mpx-mul4.t TESTS += mpx-mul4.t$(EXEEXT) endif +if CPUFAM_ARMEL +MPX_MUL4_SOURCES = mpx-mul4-arm-neon.S +check_PROGRAMS += mpx-mul4.t +TESTS += mpx-mul4.t$(EXEEXT) +endif +if CPUFAM_ARM64 +MPX_MUL4_SOURCES = mpx-mul4-arm64-simd.S +check_PROGRAMS += mpx-mul4.t +TESTS += mpx-mul4.t$(EXEEXT) +endif libmath_la_SOURCES += $(MPX_MUL4_SOURCES) mpx_mul4_t_SOURCES = mpx-mul4-test.c $(MPX_MUL4_SOURCES) mpx_mul4_t_CPPFLAGS = \ diff --git a/math/mpmont.c b/math/mpmont.c index 3edb58fb..6109c856 100644 --- a/math/mpmont.c +++ b/math/mpmont.c @@ -98,6 +98,14 @@ static void simple_redccore(mpw *dv, mpw *dvl, const mpw *mv, MAYBE_REDC4(amd64_avx) #endif +#if CPUFAM_ARMEL + MAYBE_REDC4(arm_neon) +#endif + +#if CPUFAM_ARM64 + MAYBE_REDC4(arm64_simd) +#endif + static redccore__functype *pick_redccore(void) { #if CPUFAM_X86 @@ -112,6 +120,13 @@ static redccore__functype *pick_redccore(void) DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_amd64_sse2, cpu_feature_p(CPUFEAT_X86_SSE2)); #endif +#if CPUFAM_ARMEL + DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_arm_neon, + cpu_feature_p(CPUFEAT_ARM_NEON)); +#endif +#if CPUFAM_ARM64 + DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_arm64_simd, 1); +#endif DISPATCH_PICK_FALLBACK(mpmont_reduce, simple_redccore); } @@ -204,6 +219,14 @@ static void simple_mulcore(mpw *dv, mpw *dvl, MAYBE_MUL4(amd64_avx) #endif +#if CPUFAM_ARMEL + MAYBE_MUL4(arm_neon) +#endif + +#if CPUFAM_ARM64 + MAYBE_MUL4(arm64_simd) +#endif + static mulcore__functype *pick_mulcore(void) { #if CPUFAM_X86 @@ -218,6 +241,13 @@ static mulcore__functype *pick_mulcore(void) DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_amd64_sse2, cpu_feature_p(CPUFEAT_X86_SSE2)); #endif +#if CPUFAM_ARMEL + 
DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_arm_neon, + cpu_feature_p(CPUFEAT_ARM_NEON)); +#endif +#if CPUFAM_ARM64 + DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_arm64_simd, 1); +#endif DISPATCH_PICK_FALLBACK(mpmont_mul, simple_mulcore); } diff --git a/math/mpx-mul4-arm-neon.S b/math/mpx-mul4-arm-neon.S new file mode 100644 index 00000000..efca7902 --- /dev/null +++ b/math/mpx-mul4-arm-neon.S @@ -0,0 +1,1189 @@ +/// -*- mode: asm; asm-comment-char: ?/ -*- +/// +/// Large SIMD-based multiplications +/// +/// (c) 2019 Straylight/Edgeware +/// + +///----- Licensing notice --------------------------------------------------- +/// +/// This file is part of Catacomb. +/// +/// Catacomb is free software: you can redistribute it and/or modify it +/// under the terms of the GNU Library General Public License as published +/// by the Free Software Foundation; either version 2 of the License, or +/// (at your option) any later version. +/// +/// Catacomb is distributed in the hope that it will be useful, but +/// WITHOUT ANY WARRANTY; without even the implied warranty of +/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +/// Library General Public License for more details. +/// +/// You should have received a copy of the GNU Library General Public +/// License along with Catacomb. If not, write to the Free Software +/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, +/// USA. + +///-------------------------------------------------------------------------- +/// Preliminaries. + +#include "config.h" +#include "asm-common.h" + + .arch armv7-a + .fpu neon + + .text + +///-------------------------------------------------------------------------- +/// Theory. +/// +/// We define a number of primitive fixed-size multipliers from which we can +/// construct more general variable-length multipliers. +/// +/// The basic trick is the same throughout. In an operand-scanning +/// multiplication, the inner multiplication loop multiplies a multiple- +/// precision operand by a single precision factor, and adds the result, +/// appropriately shifted, to the result. A `finely integrated operand +/// scanning' implementation of Montgomery multiplication also adds the +/// product of a single-precision `Montgomery factor' and the modulus, +/// calculated in the same pass. The more common `coarsely integrated +/// operand scanning' alternates main multiplication and Montgomery passes, +/// which requires additional carry propagation. +/// +/// Throughout both plain-multiplication and Montgomery stages, then, one of +/// the factors remains constant throughout the operation, so we can afford +/// to take a little time to preprocess it. The transformation we perform is +/// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a +/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into +/// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit +/// pieces are placed into 32-bit cells, and arranged as two 128-bit NEON +/// operands, as follows. +/// +/// Offset 0 4 8 12 +/// 0 v'_0 v''_0 v'_1 v''_1 +/// 16 v'_2 v''_2 v'_3 v''_3 +/// +/// The `vmull' and `vmlal' instructions can multiply a vector of two 32-bit +/// values by a 32-bit scalar, giving two 64-bit results; thus, it will act +/// on (say) v'_0 and v''_0 in a single instruction, to produce two 48-bit +/// results in 64-bit fields. 
The sixteen bits of headroom allows us to add +/// many products together before we must deal with carrying; it also allows +/// for some calculations to be performed on the above expanded form. +/// +/// We maintain three `carry' registers, q12--q14, accumulating intermediate +/// results; we name them `c0', `c1', and `c2'. Each carry register holds +/// two 64-bit halves: the register c0, for example, holds c'_0 (low half) +/// and c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the +/// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2. +/// The `vmull' or `vmlal' instruction acting on a scalar operand and an +/// operand in the expanded form above produces a result which can be added +/// directly to the appropriate carry register. +/// +/// Multiplication is performed in product-scanning order, since ARM +/// processors commonly implement result forwarding for consecutive multiply- +/// and-accumulate instructions specifying the same destination. +/// Experimentally, this runs faster than operand-scanning in an attempt to +/// hide instruction latencies. +/// +/// On 32-bit ARM, we have a reasonable number of registers: the expanded +/// operands are kept in registers. The packed operands are read from memory +/// into working registers q0 and q1. The following conventional argument +/// names and locations are used throughout. +/// +/// Arg Format Location Notes +/// +/// U packed [r1] +/// X packed [r2] In Montgomery multiplication, X = N +/// V expanded q2/q3 +/// Y expanded q4/q5 In Montgomery multiplication, Y = (A + U V) M +/// M expanded q4/q5 -N^{-1} (mod B^4) +/// N Modulus, for Montgomery multiplication +/// A packed [r0] Destination/accumulator +/// C carry q13--q15 +/// +/// The calculation is some variant of +/// +/// A' + C' B^4 <- U V + X Y + A + C +/// +/// The low-level functions fit into a fairly traditional (finely-integrated) +/// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y) +/// (indexed by i). +/// +/// The variants are as follows. +/// +/// Function Variant Use i j +/// +/// mmul4 A = C = 0 Montgomery 0 0 +/// dmul4 A = 0 Montgomery 0 + +/// mmla4 C = 0 Montgomery + 0 +/// dmla4 exactly as shown Montgomery + + +/// +/// mul4zc U = V = A = C = 0 Plain 0 0 +/// mul4 U = V = A = 0 Plain 0 + +/// mla4zc U = V = C = 0 Plain + 0 +/// mla4 U = V = 0 Plain + + +/// +/// The `mmul4' and `mmla4' functions are also responsible for calculating +/// the Montgomery reduction factor Y = (A + U V) M used by the rest of the +/// inner loop. + +///-------------------------------------------------------------------------- +/// Macro definitions. + +.macro mulacc z, u, v, x=nil, y=nil + // Set Z = Z + U V + X Y. X may be `nil' to omit the second + // operand. Z should be a 128-bit `qN' register; V and Y should be + // 64-bit `dN' registers; and U and X should be 32-bit `dN[I]' + // scalars; the multiplications produce two 64-bit elementwise + // products, which are added elementwise to Z. + + vmlal.u32 \z, \v, \u + .ifnes "\x", "nil" + vmlal.u32 \z, \y, \x + .endif +.endm + +.macro mulinit z, zinitp, u, v, x, y + // If ZINITP then set Z = Z + U V + X Y, as for `mulacc'; otherwise, + // set Z = U V + X Y. Operand requirements and detailed operation + // are as for `mulacc'. 
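+	// (ZINITP is matched textually: the `MULI' wrappers below pass the
+	// literal symbol `t' to accumulate into Z, and `nil' to overwrite
+	// it.)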
+ + .ifeqs "\zinitp", "t" + mulacc \z, \u, \v, \x, \y + .else + vmull.u32 \z, \v, \u + .ifnes "\x", "nil" + vmlal.u32 \z, \y, \x + .endif + .endif +.endm + +// `MULI': accumulate the B^I and b B^i terms of the polynomial product sum U +// V + X Y, given that U = u_0 + B u_1 + B^2 u_2 + B^3 u_3 (and similarly for +// x), and V = v'_0 + b v''_0 + B (v'_1 + b v''_1) + B^2 (v'_2 + b v''_2) + +// B^3 (v'_3 + b v''_3) (and similarly for Y). The 64-bit coefficients are +// added into the low and high halves of the 128-bit register Z (if ZINIT is +// `nil' then simply set Z, as if it were initially zero). +#define MUL0(z, zinitp, u, v0, v1, x, y0, y1) \ + mulinit z, zinitp, QW(u, 0), D0(v0), QW(x, 0), D0(y0) +#define MUL1(z, zinitp, u, v0, v1, x, y0, y1) \ + mulinit z, zinitp, QW(u, 0), D1(v0), QW(x, 0), D1(y0); \ + mulacc z, QW(u, 1), D0(v0), QW(x, 1), D0(y0) +#define MUL2(z, zinitp, u, v0, v1, x, y0, y1) \ + mulinit z, zinitp, QW(u, 0), D0(v1), QW(x, 0), D0(y1); \ + mulacc z, QW(u, 1), D1(v0), QW(x, 1), D1(y0); \ + mulacc z, QW(u, 2), D0(v0), QW(x, 2), D0(y0) +#define MUL3(z, zinitp, u, v0, v1, x, y0, y1) \ + mulinit z, zinitp, QW(u, 0), D1(v1), QW(x, 0), D1(y1); \ + mulacc z, QW(u, 1), D0(v1), QW(x, 1), D0(y1); \ + mulacc z, QW(u, 2), D1(v0), QW(x, 2), D1(y0); \ + mulacc z, QW(u, 3), D0(v0), QW(x, 3), D0(y0) +#define MUL4(z, zinitp, u, v0, v1, x, y0, y1) \ + mulinit z, zinitp, QW(u, 1), D1(v1), QW(x, 1), D1(y1); \ + mulacc z, QW(u, 2), D0(v1), QW(x, 2), D0(y1); \ + mulacc z, QW(u, 3), D1(v0), QW(x, 3), D1(y0) +#define MUL5(z, zinitp, u, v0, v1, x, y0, y1) \ + mulinit z, zinitp, QW(u, 2), D1(v1), QW(x, 2), D1(y1); \ + mulacc z, QW(u, 3), D0(v1), QW(x, 3), D0(y1) +#define MUL6(z, zinitp, u, v0, v1, x, y0, y1) \ + mulinit z, zinitp, QW(u, 3), D1(v1), QW(x, 3), D1(y1) + +// Steps in the process of propagating carry bits from ZLO to ZHI (both +// 128-bit `qN' registers). Here, T is a 128-bit `qN' temporary register. +// Set the low 32 bits of the 64-bit `dN' register ZOUT to the completed +// coefficient z_i. +// +// In detail, what happens is as follows. Suppose initially that ZLO = +// (z'_i; z''_i) and ZHI = (z'_{i+1}; z''_{i+1}). Let t = z'_i + b z''_i; +// observe that floor(t/b) = floor(z'_i/b) + z''_i. Let z_i = t mod B, and +// add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}. This has +// a circuit depth of 4; I don't know how to do better. +.macro _carry0 zout, zlo0, zlo1, t0, t1 + // ZLO0 and ZLO1 are the low and high halves of a carry register. + // Extract a 32-bit output, in the bottom 32 bits of ZOUT, and set T1 + // so as to continue processing using `_carry1'. All operands are + // 64-bit `dN' registers. If ZOUT is `nil' then no output is + // produced; if T1 is `nil' then no further processing will be + // possible. + .ifnes "\zout", "nil" + vshl.i64 \t0, \zlo1, #16 + .endif + .ifnes "\t1", "nil" + vshr.u64 \t1, \zlo0, #16 + .endif + .ifnes "\zout", "nil" + vadd.i64 \zout, \zlo0, \t0 + .endif + .ifnes "\t1", "nil" + vadd.i64 \t1, \t1, \zlo1 + .endif +.endm +.macro _carry1 u, zhi0, t1 + // ZHI0 is the low half of a carry register, and T1 is the result of + // applying `_carry0' to the previous carry register. Set U to the + // result of propagating the carry into ZHI0. + vshr.u64 \t1, \t1, #16 + vadd.i64 \u, \zhi0, \t1 +.endm + +// More convenient wrappers for `_carry0' and `_carry1'. +// +// CARRY0(ZOUT, ZLO, T) +// Store a 32-bit output in ZOUT from carry ZLO, using T as a +// temporary. ZOUT is a 64-bit `dN' register; ZLO and T are 128-bit +// `qN' registers. 
+// +// CARRY1(ZHI, T) +// Propagate carry from T into ZHI. Both are 128-bit `qN' registers; +// ZHI is updated. +#define CARRY0(zout, zlo, t) \ + CASIDE0(zout, D0(zlo), zlo, t) +#define CARRY1(zhi, t) \ + CASIDE1(D0(zhi), zhi, t) + +// Versions of `CARRY0' and `CARRY1' which don't mutate their operands. +// +// CASIDE0(ZOUT, U, ZLO, T) +// As for `CARRY0', but the low half of ZLO is actually in U (a 64-bit +// `dN' register). +// +// CASIDE0E(ZOUT, U, ZLO, T) +// As for `CASIDE0', but only calculate the output word, and no +// carry-out. +// +// CASIDE1(U, ZHI, T) +// As for `CARRY1', but write the updated low half of ZHI to U. +#define CASIDE0(zout, u, zlo, t) \ + _carry0 zout, u, D1(zlo), D0(t), D1(t) +#define CASIDE0E(zout, u, zlo, t) \ + _carry0 zout, u, D1(zlo), D0(t), nil +#define CASIDE1(u, zhi, t) \ + _carry1 u, D0(zhi), D1(t) + +// Steps in spreading a packed 128-bit operand in A0 across A0, A1, A2, A3 in +// carry format. +#define SPREADACC0(a0, a1, a2, a3) \ + vmov.i32 a1, #0; \ + vmov.i32 a2, #0; \ + vmov.i32 a3, #0 +#define SPREADACC1(a0, a1, a2, a3) \ + vswp D1(a0), D0(a2); \ + vtrn.32 D0(a0), D0(a1); \ + vtrn.32 D0(a2), D0(a3) + +// Add the carry-format values A0, A1, A2 into the existing carries C0, C1, +// C2 (leaving A3 where it is). +#define CARRYACC(a0, a1, a2, a3, c0, c1, c2) \ + vadd.i64 c0, c0, a0; \ + vadd.i64 c1, c1, a1; \ + vadd.i64 c2, c2, a2 + +///-------------------------------------------------------------------------- +/// Primitive multipliers and related utilities. + +INTFUNC(carryprop) + // On entry, r0 points to a destination, and q13--q15 hold incoming + // carries c0--c2. On exit, the low 128 bits of the carry value are + // stored at [r0]; the remaining 16 bits of carry are left in d30; r0 + // is advanced by 16; and q10--q14 are clobbered. + endprologue + + CARRY0(D0(q10), q13, q12) + CARRY1(q14, q12) + CARRY0(D0(q11), q14, q12) + CARRY1(q15, q12) + CARRY0(D1(q10), q15, q12) + vshr.u64 D1(q11), D1(q12), #16 + vshr.u64 D0(q15), D1(q12), #48 + vtrn.32 q10, q11 + vst1.32 {q10}, [r0]! + bx r14 +ENDFUNC + +INTFUNC(dmul4) + // On entry, r0 points to the destination; r1 and r2 point to packed + // operands U and X; q2/q3 and q4/q5 hold expanded operands V and Y; + // and q13--q15 hold incoming carries c0--c2. On exit, the + // destination and carries are updated; r0, r1, r2 are each advanced + // by 16; q2--q5 are preserved; and the other NEON registers are + // clobbered. + endprologue + + // Start by loading the operand words from memory. + vld1.32 {q0}, [r1]! + vld1.32 {q1}, [r2]! + + // Do the multiplication. + MUL0(q13, t, q0, q2, q3, q1, q4, q5) + MUL1(q14, t, q0, q2, q3, q1, q4, q5) + CARRY0(D0(q8), q13, q6) + MUL2(q15, t, q0, q2, q3, q1, q4, q5) + CARRY1(q14, q6) + CARRY0(D0(q9), q14, q6) + MUL3(q12, nil, q0, q2, q3, q1, q4, q5) + CARRY1(q15, q6) + CARRY0(D1(q8), q15, q6) + MUL4(q13, nil, q0, q2, q3, q1, q4, q5) + CARRY1(q12, q6) + CARRY0(D1(q9), q12, q6) + MUL5(q14, nil, q0, q2, q3, q1, q4, q5) + CARRY1(q13, q6) + MUL6(q15, nil, q0, q2, q3, q1, q4, q5) + + // Finish up and store the result. + vtrn.32 q8, q9 + vst1.32 {q8}, [r0]! + + // All done. + bx r14 +ENDFUNC + +INTFUNC(dmla4) + // On entry, r0 points to the destination/accumulator; r1 and r2 + // point to packed operands U and X; q2/q3 and q4/q5 hold expanded + // operands V and Y; and q13--q15 hold incoming carries c0--c2. On + // exit, the accumulator and carries are updated; r0, r1, r2 are each + // advanced by 16; q2--q5 are preserved; and the other NEON registers + // are clobbered. 
+ endprologue + + // Start by loading the operand words from memory. + vld1.32 {q9}, [r0] + SPREADACC0(q9, q10, q11, q12) + vld1.32 {q0}, [r1]! + vld1.32 {q1}, [r2]! + + // Add the accumulator input to the incoming carries. Split the + // accumulator into four pieces and add the carries onto them. + SPREADACC1(q9, q10, q11, q12) + CARRYACC(q9, q10, q11, q12, q13, q14, q15) + + // Do the multiplication. + MUL0(q13, t, q0, q2, q3, q1, q4, q5) + MUL1(q14, t, q0, q2, q3, q1, q4, q5) + CARRY0(D0(q8), q13, q6) + MUL2(q15, t, q0, q2, q3, q1, q4, q5) + CARRY1(q14, q6) + CARRY0(D0(q9), q14, q6) + MUL3(q12, t, q0, q2, q3, q1, q4, q5) + CARRY1(q15, q6) + CARRY0(D1(q8), q15, q6) + MUL4(q13, nil, q0, q2, q3, q1, q4, q5) + CARRY1(q12, q6) + CARRY0(D1(q9), q12, q6) + MUL5(q14, nil, q0, q2, q3, q1, q4, q5) + CARRY1(q13, q6) + MUL6(q15, nil, q0, q2, q3, q1, q4, q5) + + // Finish up and store the result. + vtrn.32 q8, q9 + vst1.32 {q8}, [r0]! + + // All done. + bx r14 +ENDFUNC + +INTFUNC(mul4) + // On entry, r0 points to the destination; r2 points to a packed + // operand X; q4/q5 holds an expanded operand Y; and q13--q15 hold + // incoming carries c0--c2. On exit, the destination and carries are + // updated; r0 and r2 are each advanced by 16; q4 and q5 are + // preserved; and the other NEON registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + vld1.32 {q1}, [r2]! + + // Do the multiplication. + MUL0(q13, t, q1, q4, q5, nil, nil, nil) + MUL1(q14, t, q1, q4, q5, nil, nil, nil) + CARRY0(D0(q8), q13, q6) + MUL2(q15, t, q1, q4, q5, nil, nil, nil) + CARRY1(q14, q6) + CARRY0(D0(q9), q14, q6) + MUL3(q12, nil, q1, q4, q5, nil, nil, nil) + CARRY1(q15, q6) + CARRY0(D1(q8), q15, q6) + MUL4(q13, nil, q1, q4, q5, nil, nil, nil) + CARRY1(q12, q6) + CARRY0(D1(q9), q12, q6) + MUL5(q14, nil, q1, q4, q5, nil, nil, nil) + CARRY1(q13, q6) + MUL6(q15, nil, q1, q4, q5, nil, nil, nil) + + // Finish up and store the result. + vtrn.32 q8, q9 + vst1.32 {q8}, [r0]! + + // All done. + bx r14 +ENDFUNC + +INTFUNC(mul4zc) + // On entry, r0 points to the destination; r2 points to a packed + // operand X; and q4/q5 holds an expanded operand Y. On exit, the + // destination is updated; q13--q15 hold outgoing carries c0--c2; r0 + // and r2 are each advanced by 16; q4 and q5 are preserved; and the + // other NEON registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + vld1.32 {q1}, [r2]! + + // Do the multiplication. + MUL0(q13, nil, q1, q4, q5, nil, nil, nil) + MUL1(q14, nil, q1, q4, q5, nil, nil, nil) + CARRY0(D0(q8), q13, q6) + MUL2(q15, nil, q1, q4, q5, nil, nil, nil) + CARRY1(q14, q6) + CARRY0(D0(q9), q14, q6) + MUL3(q12, nil, q1, q4, q5, nil, nil, nil) + CARRY1(q15, q6) + CARRY0(D1(q8), q15, q6) + MUL4(q13, nil, q1, q4, q5, nil, nil, nil) + CARRY1(q12, q6) + CARRY0(D1(q9), q12, q6) + MUL5(q14, nil, q1, q4, q5, nil, nil, nil) + CARRY1(q13, q6) + MUL6(q15, nil, q1, q4, q5, nil, nil, nil) + + // Finish up and store the result. + vtrn.32 q8, q9 + vst1.32 {q8}, [r0]! + + // All done. + bx r14 +ENDFUNC + +INTFUNC(mla4) + // On entry, r0 points to the destination/accumulator; r2 points to a + // packed operand X; q4/q5 holds an expanded operand Y; and q13--q15 + // hold incoming carries c0--c2. On exit, the accumulator and + // carries are updated; r0 and r2 are each advanced by 16; q4 and q5 + // are preserved; and the other NEON registers are clobbered. + endprologue + + // Start by loading the operand words from memory. 
+ vld1.32 {q9}, [r0] + SPREADACC0(q9, q10, q11, q12) + vld1.32 {q1}, [r2]! + + // Add the accumulator input to the incoming carries. Split the + // accumulator into four pieces and add the carries onto them. + SPREADACC1(q9, q10, q11, q12) + CARRYACC(q9, q10, q11, q12, q13, q14, q15) + + // Do the multiplication. +mla4_common: + MUL0(q13, t, q1, q4, q5, nil, nil, nil) + MUL1(q14, t, q1, q4, q5, nil, nil, nil) + CARRY0(D0(q8), q13, q6) + MUL2(q15, t, q1, q4, q5, nil, nil, nil) + CARRY1(q14, q6) + CARRY0(D0(q9), q14, q6) + MUL3(q12, t, q1, q4, q5, nil, nil, nil) + CARRY1(q15, q6) + CARRY0(D1(q8), q15, q6) + MUL4(q13, nil, q1, q4, q5, nil, nil, nil) + CARRY1(q12, q6) + CARRY0(D1(q9), q12, q6) + MUL5(q14, nil, q1, q4, q5, nil, nil, nil) + CARRY1(q13, q6) + MUL6(q15, nil, q1, q4, q5, nil, nil, nil) + + // Finish up and store the result. + vtrn.32 q8, q9 + vst1.32 {q8}, [r0]! + + // All done. + bx r14 +ENDFUNC + +INTFUNC(mla4zc) + // On entry, r0 points to the destination/accumulator; r2 points to a + // packed operand X; and q4/q5 holds an expanded operand Y. On exit, + // the accumulator is updated; q13--q15 hold outgoing carries c0--c2; + // r0 and r2 are each advanced by 16; q4 and q5 are preserved; and + // the other NEON registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + vld1.32 {q13}, [r0] + SPREADACC0(q13, q14, q15, q12) + vld1.32 {q1}, [r2]! + + // Move the accumulator input to the incoming carry slots. Split the + // accumulator into four pieces. + SPREADACC1(q13, q14, q15, q12) + + b mla4_common +ENDFUNC + +INTFUNC(mmul4) + // On entry, r0 points to the destination; r1 points to a packed + // operand U; r2 points to a packed operand X (the modulus); q2/q3 + // holds an expanded operand V; and q4/q5 holds an expanded operand M + // (the Montgomery factor -N^{-1} (mod B)). On exit, the destination + // is updated (to zero); q4/q5 hold an expanded factor Y = U V M (mod + // B); q13--q15 hold outgoing carries c0--c2; r0, r1, and r2 are each + // advanced by 16; q2 and q3 are preserved; and the other NEON + // registers are clobbered. + + // Start by loading the operand words from memory. + vld1.32 {q0}, [r1]! + + // Calculate the low half of W = A + U V, being careful to leave the + // carries in place. + MUL0(q13, nil, q0, q2, q3, nil, nil, nil) + MUL1(q14, nil, q0, q2, q3, nil, nil, nil) + CARRY0(D0(q6), q13, q8) + MUL2(q15, nil, q0, q2, q3, nil, nil, nil) + CASIDE1(D0(q9), q14, q8) + CASIDE0(D0(q7), D0(q9), q14, q8) + MUL3(q12, nil, q0, q2, q3, nil, nil, nil) + b mmla4_common +ENDFUNC + +INTFUNC(mmla4) + // On entry, r0 points to the destination/accumulator A; r1 points to + // a packed operand U; r2 points to a packed operand X (the modulus); + // q2/q3 holds an expanded operand V; and q4/q5 holds an expanded + // operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the + // accumulator is updated (to zero); q4/q5 hold an expanded factor Y + // = (A + U V) M (mod B); q13--q15 hold outgoing carries c0--c2; r0, + // r1, and r2 are each advanced by 16; q2 and q3 are preserved; and + // the other NEON registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + vld1.32 {q13}, [r0] + SPREADACC0(q13, q14, q15, q12) + vld1.32 {q0}, [r1]! + + // Move the accumulator input to the incoming carry slots. Split the + // accumulator into four pieces. + SPREADACC1(q13, q14, q15, q12) + + // Calculate the low half of W = A + U V, being careful to leave the + // carries in place. 
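+	// (The Montgomery factor Y computed further down satisfies
+	// W + X Y == 0 (mod B^4), so the four low output words are zero;
+	// hence the accumulator is finally written back as zero.)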
+ MUL0(q13, t, q0, q2, q3, nil, nil, nil) + MUL1(q14, t, q0, q2, q3, nil, nil, nil) + CARRY0(D0(q6), q13, q8) + MUL2(q15, t, q0, q2, q3, nil, nil, nil) + CASIDE1(D0(q9), q14, q8) + CASIDE0(D0(q7), D0(q9), q14, q8) + MUL3(q12, t, q0, q2, q3, nil, nil, nil) +mmla4_common: + CASIDE1(D0(q9), q15, q8) + CASIDE0(D1(q6), D0(q9), q15, q8) + CASIDE1(D0(q9), q12, q8) + CASIDE0E(D1(q7), D0(q9), q12, q8) + vtrn.32 q6, q7 + + // Calculate the low half of the Montgomery factor Y = W M. At this + // point, registers are a little tight. + MUL0( q8, nil, q6, q4, q5, nil, nil, nil) + MUL1( q9, nil, q6, q4, q5, nil, nil, nil) + CARRY0(D0(q8), q8, q1) + MUL2(q10, nil, q6, q4, q5, nil, nil, nil) + CARRY1(q9, q1) + CARRY0(D0(q9), q9, q1) + MUL3(q11, nil, q6, q4, q5, nil, nil, nil) + CARRY1(q10, q1) + CARRY0(D1(q8), q10, q1) + vmov.i32 q5, #0 + CARRY1(q11, q1) + CARRY0(D1(q9), q11, q1) + vld1.32 {q1}, [r2]! + vtrn.32 q8, q9 + + // Expand Y. We'll put it in its proper place a bit later. + vzip.16 q8, q5 + + // Build up the product X Y in the carry slots. + MUL0(q13, t, q1, q8, q5, nil, nil, nil) + MUL1(q14, t, q1, q8, q5, nil, nil, nil) + CARRY0(nil, q13, q9) + MUL2(q15, t, q1, q8, q5, nil, nil, nil) + CARRY1(q14, q9) + vmov q4, q8 + CARRY0(nil, q14, q9) + MUL3(q12, t, q1, q8, q5, nil, nil, nil) + CARRY1(q15, q9) + CARRY0(nil, q15, q9) + vmov.u32 q6, #0 + + // And complete the calculation. + MUL4(q13, nil, q0, q2, q3, q1, q4, q5) + CARRY1(q12, q9) + CARRY0(nil, q12, q9) + MUL5(q14, nil, q0, q2, q3, q1, q4, q5) + CARRY1(q13, q9) + MUL6(q15, nil, q0, q2, q3, q1, q4, q5) + + // Finish up and store the result. + vst1.32 {q6}, [r0]! + + // All done. + bx r14 +ENDFUNC + +INTFUNC(mont4) + // On entry, r0 points to the destination/accumulator A; r2 points to + // a packed operand X (the modulus); and q2/q3 holds an expanded + // operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the + // accumulator is updated (to zero); q4/q5 hold an expanded factor Y + // = A M (mod B); q13--q15 hold outgoing carries c0--c2; r0 and r2 + // are each advanced by 16; q2 and q3 are preserved; and the other + // NEON registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + vld1.32 {q0}, [r0] + vld1.32 {q1}, [r2]! + + // Calculate Y = A M (mod B). + MUL0(q8, nil, q0, q2, q3, nil, nil, nil) + MUL1(q9, nil, q0, q2, q3, nil, nil, nil) + CARRY0(D0(q4), q8, q6) + MUL2(q10, nil, q0, q2, q3, nil, nil, nil) + CARRY1(q9, q6) + vmov q13, q0 + CARRY0(D0(q7), q9, q6) + MUL3(q11, nil, q0, q2, q3, nil, nil, nil) + CARRY1(q10, q6) + CARRY0(D1(q4), q10, q6) + SPREADACC0(q13, q14, q15, q12) + CARRY1(q11, q6) + CARRY0(D1(q7), q11, q6) + SPREADACC1(q13, q14, q15, q12) + vmov.i32 q5, #0 + vtrn.32 q4, q7 + vzip.16 q4, q5 + + // Calculate the actual result. Well, the carries, at least. + vmov.i32 q8, #0 + MUL0(q13, t, q1, q4, q5, nil, nil, nil) + MUL1(q14, t, q1, q4, q5, nil, nil, nil) + CARRY0(nil, q13, q6) + MUL2(q15, t, q1, q4, q5, nil, nil, nil) + CARRY1(q14, q6) + CARRY0(nil, q14, q6) + MUL3(q12, t, q1, q4, q5, nil, nil, nil) + CARRY1(q15, q6) + CARRY0(nil, q15, q6) + MUL4(q13, nil, q1, q4, q5, nil, nil, nil) + CARRY1(q12, q6) + CARRY0(nil, q12, q6) + MUL5(q14, nil, q1, q4, q5, nil, nil, nil) + CARRY1(q13, q6) + MUL6(q15, nil, q1, q4, q5, nil, nil, nil) + + // Finish up and store the result. + vst1.32 {q8}, [r0]! + + // All done. + bx r14 +ENDFUNC + +///-------------------------------------------------------------------------- +/// Bulk multipliers. 
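+///
+/// The functions below drive the four-word primitives above across
+/// variable-length operands.  As a rough orientation only -- the real
+/// control flow is the assembler which follows, and the names refer to
+/// the internal subroutines above, not C functions -- the plain
+/// multiplication driver behaves like this pseudocode:
+///
+///	for (i = 0; i < len(bv); i += 4) {	// one pass per block of bv
+///		expand bv[i .. i+3] into Y; rewind av; point dv at dv0 + i;
+///		(i == 0 ? mul4zc : mla4zc);	// first block of av, set carries
+///		for (j = 4; j < len(av); j += 4)
+///			(i == 0 ? mul4 : mla4);	// remaining blocks of av
+///		carryprop;			// write out the leftover carry
+///	}
+///
+/// The Montgomery multiplier has the same shape, using mmul4/dmul4 on the
+/// first pass and mmla4/dmla4 on later ones; the reduce-only function
+/// drives mont4/mla4 the same way, followed by a scalar carry-propagation
+/// pass over the rest of dv.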
+ +FUNC(mpx_umul4_arm_neon) + // void mpx_umul4_arm_neon(mpw *dv, const mpw *av, const mpw *avl, + // const mpw *bv, const mpw *bvl); + + // Establish the arguments and do initial setup. + // + // inner loop dv r0 + // inner loop av r2 + // outer loop dv r5 + // outer loop bv r3 + // av base r1 + // av limit r12 + // bv limit r4 + pushreg r4, r5, r14 + pushvfp QQ(q4, q7) + endprologue + + // Prepare for the first iteration. + vld1.32 {q4}, [r3]! // = Y = bv[0] + vmov.i32 q5, #0 + // r0 // = dv for inner loop + // r1 // = av base + // r3 // = bv for outer loop + ldr r4, [sp, #76] // = bv limit + mov r12, r2 // = av limit + mov r2, r1 // = av for inner loop + add r5, r0, #16 // = dv for outer loop + vzip.16 q4, q5 // expand Y + bl mul4zc + cmp r2, r12 // all done? + bhs 8f + + // Continue with the first iteration. +0: bl mul4 + cmp r2, r12 // all done? + blo 0b + + // Write out the leftover carry. There can be no tail here. +8: bl carryprop + cmp r3, r4 // more passes to do? + bhs 9f + + // Set up for the next pass. +1: vld1.32 {q4}, [r3]! // = Y = bv[i] + vmov.i32 q5, #0 + mov r0, r5 // -> dv[i] + mov r2, r1 // -> av[0] + add r5, r5, #16 + vzip.16 q4, q5 // expand Y + bl mla4zc + cmp r2, r12 // done yet? + bhs 8f + + // Continue... +0: bl mla4 + cmp r2, r12 + blo 0b + + // Finish off this pass. There was no tail on the previous pass, and + // there can be done on this pass. +8: bl carryprop + cmp r3, r4 + blo 1b + + // All over. +9: popvfp QQ(q4, q7) + popreg r4, r5, r14 + bx r14 +ENDFUNC + +FUNC(mpxmont_mul4_arm_neon) + // void mpxmont_mul4_arm_neon(mpw *dv, const mpw *av, const mpw *bv, + // const mpw *nv, size_t n, const mpw *mi); + + // Establish the arguments and do initial setup. + // + // inner loop dv r0 + // inner loop av r1 + // inner loop nv r2 + // mi r5 + // outer loop dv r6 + // outer loop bv r7 + // av base r8 + // av limit r9 + // bv limit r4 + // nv base r3 + // n r4 + // c r10 + // 0 r12 + + pushreg r4-r10, r14 + pushvfp QQ(q4, q7) + endprologue + + // Establish the expanded operands. + ldrd r4, r5, [sp, #96] // r4 = n; r5 -> mi + vld1.32 {q2}, [r2] // = V = bv[0] + vmov.i32 q3, #0 + vmov.i32 q5, #0 + vld1.32 {q4}, [r5] // = M + + // Set up the outer loop state and prepare for the first iteration. + // r0 // -> dv for inner loop + // r1 // -> av for inner loop + add r7, r2, #16 // -> bv + // r3 // -> nv + add r6, r0, #16 // -> dv + mov r8, r1 // -> av + add r9, r1, r4, lsl #2 // -> av limit + add r4, r2, r4, lsl #2 // -> bv limit + mov r2, r3 // -> nv for inner loop + mov r12, #0 // = 0 + + vzip.16 q2, q3 // expand V + vzip.16 q4, q5 // expand M + bl mmul4 + cmp r1, r9 // done already? + bhs 8f + + // Complete the first inner loop. +0: bl dmul4 + cmp r1, r9 // done yet? + blo 0b + + // Still have carries left to propagate. Rather than store the tail + // end in memory, keep it in a general-purpose register for later. + bl carryprop + vmov.32 r10, QW(q15, 0) + + // Embark on the next iteration. (There must be one. If n = 1 then + // we would have bailed above, to label 8. Similarly, the subsequent + // iterations can fall into the inner loop immediately.) +1: vld1.32 {q2}, [r7]! // = V = bv[i] + vld1.32 {q4}, [r5] // = M + vmov.i32 q3, #0 + vmov.i32 q5, #0 + mov r0, r6 // -> dv[i] + add r6, r6, #16 + mov r1, r8 // -> av[0] + mov r2, r3 // -> nv[0] + vzip.16 q2, q3 // expand V + vzip.16 q4, q5 // expand M + bl mmla4 + + // Complete the next inner loop. +0: bl dmla4 + cmp r1, r9 // done yet? 
+ blo 0b + + // Still have carries left to propagate, and they overlap the + // previous iteration's final tail, so read that and add it. + cmp r7, r4 + vmov.32 D0(q12), r10, r12 + vadd.i64 D0(q13), D0(q13), D0(q12) + bl carryprop + vmov.32 r10, QW(q15, 0) + + // Back again, maybe. + blo 1b + + // All done, almost. + str r10, [r0], #4 + popvfp QQ(q4, q7) + popreg r4-r10, r14 + bx r14 + + // First iteration was short. Write out the carries and we're done. + // (This could be folded into the main loop structure, but that would + // penalize small numbers more.) +8: bl carryprop + vst1.32 {QW(q15, 0)}, [r0]! + popvfp QQ(q4, q7) + popreg r4-r10, r14 + bx r14 +ENDFUNC + +FUNC(mpxmont_redc4_arm_neon) + // void mpxmont_redc4_arm_neon(mpw *dv, mpw *dvl, const mpw *nv, + // size_t n, const mpw *mi); + + // Establish the arguments and do initial setup. + // + // inner loop dv r0 + // dv limit r1 + // inner loop nv r2 + // blocks-of-4 dv limit r3 + // mi (r14) + // outer loop dv r4 + // outer loop dv limit r5 + // nv base r6 + // nv limit r7 + // n r3 + // c (r14) + // t0, t1, t2, t3 r2, r8, r9, r10 + // 0 r12 + + pushreg r4-r10, r14 + pushvfp QQ(q4, q7) + endprologue + + // Set up the outer loop state and prepare for the first iteration. + ldr r14, [sp, #96] // -> mi + vmov.i32 q3, #0 + sub r12, r1, r0 // total dv bytes + // r0 // -> dv for inner loop + // r1 // -> overall dv limit + // r2 // -> nv for inner loop + // r3 // = n (for now) + add r4, r0, #16 // -> dv for outer loop + add r5, r0, r3, lsl #2 // -> dv limit + bic r12, r12, #15 // dv blocks of 4 + vld1.32 {q2}, [r14] // = M + mov r6, r2 // -> nv + add r7, r2, r3, lsl #2 // -> nv limit + add r3, r0, r12 // -> dv blocks-of-4 limit + vzip.16 q2, q3 // expand M + mov r12, #0 // = 0 + bl mont4 + cmp r2, r7 // done already? + bhs 8f + +5: bl mla4 + cmp r2, r7 // done yet? + blo 5b + + // Still have carries left to propagate. Adding the accumulator + // block into the carries is a little different this time, because + // all four accumulator limbs have to be squished into the three + // carry registers for `carryprop' to do its thing. +8: vld1.32 {q9}, [r0] + SPREADACC0(q9, q10, q11, q12) + SPREADACC1(q9, q10, q11, q12) + vshl.u64 D0(q12), D0(q12), #16 + CARRYACC(q9, q10, q11, q12, q13, q14, q15) + vadd.u64 D1(q15), D1(q15), D0(q12) + + bl carryprop + vmov.32 r14, QW(q15, 0) + cmp r0, r3 + bhs 7f + + // Propagate the first group of carries. + ldmia r0, {r2, r8-r10} + adds r2, r2, r14 + adcs r8, r8, #0 + adcs r9, r9, #0 + adcs r10, r10, #0 + stmia r0!, {r2, r8-r10} + teq r0, r3 + beq 6f + + // Continue carry propagation until the end of the buffer. +0: ldmia r0, {r2, r8-r10} + adcs r2, r2, #0 + adcs r8, r8, #0 + adcs r9, r9, #0 + adcs r10, r10, #0 + stmia r0!, {r2, r8-r10} + teq r0, r3 + bne 0b + + // Deal with the tail end. Note that the actual destination length + // won't be an exacty number of blocks of four, so it's safe to just + // drop through here. +6: adc r14, r12, #0 +7: ldr r2, [r0] + adds r2, r2, r14 + str r2, [r0], #4 + teq r0, r1 + beq 8f +0: ldr r2, [r0] + adcs r2, r2, #0 + str r2, [r0], #4 + teq r0, r1 + bne 0b + + // All done for this iteration. Start the next. +8: cmp r4, r5 + bhs 9f + mov r0, r4 + add r4, r4, #16 + mov r2, r6 + bl mont4 + b 5b + + // All over. +9: popvfp QQ(q4, q7) + popreg r4-r10, r14 + bx r14 +ENDFUNC + +///-------------------------------------------------------------------------- +/// Testing and performance measurement. 
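+///
+/// When built with `TEST_MUL4', the wrappers below expose the internal
+/// subroutines above to the test rig in `mpx-mul4-test.c'.  The table
+/// below records where each wrapper finds its arguments; reading the
+/// `dmul' column, for example, `test_dmul4' receives the destination z
+/// and the carry block c (three 128-bit carries) in r0 and r1, the packed
+/// operands u and x in r2 and r3, and vv, yy, n, and cyv on the stack --
+/// i.e. a prototype along the lines of (types assumed here purely for
+/// illustration)
+///
+///	void test_dmul4(mpw *z, mpw *c, const mpw *u, const mpw *x,
+///			const mpw *vv, const mpw *yy,
+///			size_t n, mpw *cyv);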
+ +#ifdef TEST_MUL4 + +// dmul smul mmul mont +// z r0 r0 r0 r0 r0 +// c r4 r1 r1 r1 r1 +// y r3 -- -- r2 r2 +// u r1 r2 -- r3 -- +// x r2 r3 r2 stk0 r3 +// vv q2/q3 stk0 -- stk1 stk0 +// yy q4/q5 stk1 r3 stk2 -- +// n r5 stk2 stk0 stk3 stk1 +// cyv r6 stk3 stk1 stk4 stk2 + +#define STKARG(i) sp, #80 + 4*(i) + +.macro testprologue mode + pushreg r4-r6, r14 + pushvfp QQ(q4, q7) + endprologue + + .ifeqs "\mode", "dmul" + mov r4, r1 // -> c + mov r1, r2 // -> u + mov r2, r3 // -> x + + ldr r14, [STKARG(0)] // -> vv + vld1.32 {q2}, [r14] + vmov.i32 q3, #0 + vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1) + + ldr r14, [STKARG(1)] // -> yy + vld1.32 {q4}, [r14] + vmov.i32 q5, #0 + vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1) + + ldr r5, [STKARG(2)] // = n + ldr r6, [STKARG(3)] // -> cyv + .endif + + .ifeqs "\mode", "smul" + mov r4, r1 // -> c + // r2 // -> x + + vld1.32 {q4}, [r3] + vmov.i32 q5, #0 + vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1) + + ldr r5, [STKARG(0)] // = n + ldr r6, [STKARG(1)] // -> cyv + .endif + + .ifeqs "\mode", "mmul" + mov r4, r1 // -> c + mov r1, r3 // -> u + mov r3, r2 // -> y + ldr r2, [STKARG(0)] // -> x + + ldr r14, [STKARG(1)] // -> vv + vld1.32 {q2}, [r14] + vmov.i32 q3, #0 + vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1) + + ldr r14, [STKARG(2)] // -> yy + vld1.32 {q4}, [r14] + vmov.i32 q5, #0 + vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1) + + ldr r5, [STKARG(3)] // = n + ldr r6, [STKARG(4)] // -> cyv + .endif + + .ifeqs "\mode", "mont" + mov r4, r1 // -> c + mov r1, r3 // -> u + mov r14, r2 + mov r2, r3 // -> x + mov r3, r14 // -> y + + ldr r14, [STKARG(0)] // -> vv + vld1.32 {q2}, [r14] + vmov.i32 q3, #0 + vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1) + + ldr r5, [STKARG(1)] // = n + ldr r6, [STKARG(2)] // -> cyv + .endif +.endm + +.macro testldcarry + vldmia r4, {QQ(q13, q15)} // c0, c1, c2 +.endm + +.macro testtop +0: subs r5, r5, #1 +.endm + +.macro testtail + bne 0b +.endm + +.macro testcarryout + vstmia r4, {QQ(q13, q15)} +.endm + +.macro testepilogue + popvfp QQ(q4, q7) + popreg r4-r6, r14 + bx r14 +.endm + +FUNC(test_dmul4) + testprologue dmul + testldcarry + testtop + bl dmul4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_dmla4) + testprologue dmul + testldcarry + testtop + bl dmla4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mul4) + testprologue smul + testldcarry + testtop + bl mul4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mul4zc) + testprologue smul + testldcarry + testtop + bl mul4zc + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mla4) + testprologue smul + testldcarry + testtop + bl mla4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mla4zc) + testprologue smul + testldcarry + testtop + bl mla4zc + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mmul4) + testprologue mmul + testtop + bl mmul4 + testtail + vst1.32 {q4, q5}, [r3] + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mmla4) + testprologue mmul + testtop + bl mmla4 + testtail + vst1.32 {q4, q5}, [r3] + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mont4) + testprologue mont + testtop + bl mont4 + testtail + vst1.32 {q4, q5}, [r3] + testcarryout + testepilogue +ENDFUNC + +#endif + +///----- That's all, folks -------------------------------------------------- diff --git a/math/mpx-mul4-arm64-simd.S b/math/mpx-mul4-arm64-simd.S new file mode 100644 index 00000000..8f482dd8 --- /dev/null +++ b/math/mpx-mul4-arm64-simd.S @@ -0,0 +1,1217 @@ +/// -*- mode: asm; asm-comment-char: ?/ -*- +/// +/// 
Large SIMD-based multiplications +/// +/// (c) 2019 Straylight/Edgeware +/// + +///----- Licensing notice --------------------------------------------------- +/// +/// This file is part of Catacomb. +/// +/// Catacomb is free software: you can redistribute it and/or modify it +/// under the terms of the GNU Library General Public License as published +/// by the Free Software Foundation; either version 2 of the License, or +/// (at your option) any later version. +/// +/// Catacomb is distributed in the hope that it will be useful, but +/// WITHOUT ANY WARRANTY; without even the implied warranty of +/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +/// Library General Public License for more details. +/// +/// You should have received a copy of the GNU Library General Public +/// License along with Catacomb. If not, write to the Free Software +/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, +/// USA. + +///-------------------------------------------------------------------------- +/// Preliminaries. + +#include "config.h" +#include "asm-common.h" + + .text + +///-------------------------------------------------------------------------- +/// Theory. +/// +/// We define a number of primitive fixed-size multipliers from which we can +/// construct more general variable-length multipliers. +/// +/// The basic trick is the same throughout. In an operand-scanning +/// multiplication, the inner multiplication loop multiplies a multiple- +/// precision operand by a single precision factor, and adds the result, +/// appropriately shifted, to the result. A `finely integrated operand +/// scanning' implementation of Montgomery multiplication also adds the +/// product of a single-precision `Montgomery factor' and the modulus, +/// calculated in the same pass. The more common `coarsely integrated +/// operand scanning' alternates main multiplication and Montgomery passes, +/// which requires additional carry propagation. +/// +/// Throughout both plain-multiplication and Montgomery stages, then, one of +/// the factors remains constant throughout the operation, so we can afford +/// to take a little time to preprocess it. The transformation we perform is +/// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a +/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into +/// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit +/// pieces are placed into 32-bit cells, and arranged as two 128-bit SIMD +/// operands, as follows. +/// +/// Offset 0 4 8 12 +/// 0 v'_0 v''_0 v'_1 v''_1 +/// 16 v'_2 v''_2 v'_3 v''_3 +/// +/// The `umull' and `umlal' instructions can multiply a vector of two 32-bit +/// values by a 32-bit scalar, giving two 64-bit results; thus, it will act +/// on (say) v'_0 and v''_0 in a single instruction, to produce two 48-bit +/// results in 64-bit fields. The sixteen bits of headroom allows us to add +/// many products together before we must deal with carrying; it also allows +/// for some calculations to be performed on the above expanded form. +/// +/// We maintain three `carry' registers, v28--v30, accumulating intermediate +/// results; we name them `c0', `c1', and `c2'. Each carry register holds +/// two 64-bit halves: the register c0, for example, holds c'_0 (low half) +/// and c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the +/// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2. 
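+/// (For instance, c0 lives in v28, so `umlal v28.2d, v2.2s, v0.s[0]'
+/// adds the two products u_0 v'_0 and u_0 v''_0 elementwise into its two
+/// halves.)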
+/// The `umull' or `umlal' instruction acting on a scalar operand and an +/// operand in the expanded form above produces a result which can be added +/// directly to the appropriate carry register. +/// +/// An unusual feature of this code, as compared to the other `mul4' +/// implementations, is that it makes extensive use of the ARM64 +/// general-purpose registers for carry resolution and output construction. +/// As a result, an additional general-purpose register (typically x15) is +/// used as an additional carry, with the carry value in bits 16--63. +/// +/// Multiplication is performed in product-scanning order, since ARM +/// processors commonly implement result forwarding for consecutive multiply- +/// and-accumulate instructions specifying the same destination. +/// Experimentally, this runs faster than operand-scanning in an attempt to +/// hide instruction latencies. +/// +/// On 64-bit ARM, we have a vast supply number of registers: the expanded +/// operands are kept in registers. The packed operands are read from memory +/// into working registers v0 and v1. The following conventional argument +/// names and locations are used throughout. +/// +/// Arg Format Location Notes +/// +/// U packed [x1] +/// X packed [x2] In Montgomery multiplication, X = N +/// V expanded v2/v3 +/// Y expanded v4/v5 In Montgomery multiplication, Y = (A + U V) M +/// M expanded v6/v7 -N^{-1} (mod B^4) +/// N Modulus, for Montgomery multiplication +/// A packed [x0] Destination/accumulator +/// C carry v28--v30 +/// 0 v31 128-bit zero +/// +/// The calculation is some variant of +/// +/// A' + C' B^4 <- U V + X Y + A + C +/// +/// The low-level functions fit into a fairly traditional (finely-integrated) +/// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y) +/// (indexed by i). +/// +/// The variants are as follows. +/// +/// Function Variant Use i j +/// +/// mmul4 A = C = 0 Montgomery 0 0 +/// dmul4 A = 0 Montgomery 0 + +/// mmla4 C = 0 Montgomery + 0 +/// dmla4 exactly as shown Montgomery + + +/// +/// mul4zc U = V = A = C = 0 Plain 0 0 +/// mul4 U = V = A = 0 Plain 0 + +/// mla4zc U = V = C = 0 Plain + 0 +/// mla4 U = V = 0 Plain + + +/// +/// The `mmul4' and `mmla4' functions are also responsible for calculating +/// the Montgomery reduction factor Y = (A + U V) M used by the rest of the +/// inner loop. + +///-------------------------------------------------------------------------- +/// Macro definitions. + +.macro mulacc z, u, v, x=nil, y=nil + // Set Z = Z + U V + X Y, using the low halves of V and Y. Y may be + // `nil' to omit the second operand. Z, V, and Y should be 128-bit + // `vN' registers; and U and X should be 32-bit `vN.s[I]' scalars; + // the multiplications produce two 64-bit elementwise products, which + // are added elementwise to Z. + + umlal \z\().2d, \v\().2s, \u + .ifnes "\y", "nil" + umlal \z\().2d, \y\().2s, \x + .endif +.endm + +.macro mulacc2 z, u, v, x=nil, y=nil + // Set Z = Z + U V + X Y, using the high halves of V and Y; see + // `mulacc'. + + umlal2 \z\().2d, \v\().4s, \u + .ifnes "\y", "nil" + umlal2 \z\().2d, \y\().4s, \x + .endif +.endm + +.macro mulinit z, zinitp, u, v=nil, x=nil, y=nil + // If ZINITP then set Z = Z + U V + X Y, as for `mulacc'; otherwise, + // set Z = U V + X Y. Operand requirements and detailed operation + // are as for `mulacc'. 
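+	// (ZINITP is matched textually: the `mulI' wrappers below pass the
+	// literal symbol `t' to accumulate into Z, and `nil' to overwrite
+	// it.)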
+ + .ifeqs "\zinitp", "t" + mulacc \z, \u, \v, \x, \y + .else + umull \z\().2d, \v\().2s, \u + .ifnes "\y", "nil" + umlal \z\().2d, \y\().2s, \x + .endif + .endif +.endm + +.macro mulini2 z, zinitp, u, v=nil, x=nil, y=nil + // As `mulinit', but with the high halves of V and Y. + + .ifeqs "\zinitp", "t" + mulacc2 \z, \u, \v, \x, \y + .else + umull2 \z\().2d, \v\().4s, \u + .ifnes "\y", "nil" + umlal2 \z\().2d, \y\().4s, \x + .endif + .endif +.endm + +// `mulI': accumulate the B^I and b B^i terms of the polynomial product sum U +// V + X Y, given that U = u_0 + B u_1 + B^2 u_2 + B^3 u_3 (and similarly for +// x), and V = v'_0 + b v''_0 + B (v'_1 + b v''_1) + B^2 (v'_2 + b v''_2) + +// B^3 (v'_3 + b v''_3) (and similarly for Y). The 64-bit coefficients are +// added into the low and high halves of the 128-bit register Z (if ZINIT is +// `nil' then simply set Z, as if it were initially zero). +.macro mul0 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil + mulinit \z, \zinitp, \u\().s[0], \v0, \x\().s[0], \y0 +.endm +.macro mul1 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil + mulini2 \z, \zinitp, \u\().s[0], \v0, \x\().s[0], \y0 + mulacc \z, \u\().s[1], \v0, \x\().s[1], \y0 +.endm +.macro mul2 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil + mulinit \z, \zinitp, \u\().s[0], \v1, \x\().s[0], \y1 + mulacc2 \z, \u\().s[1], \v0, \x\().s[1], \y0 + mulacc \z, \u\().s[2], \v0, \x\().s[2], \y0 +.endm +.macro mul3 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil + mulini2 \z, \zinitp, \u\().s[0], \v1, \x\().s[0], \y1 + mulacc \z, \u\().s[1], \v1, \x\().s[1], \y1 + mulacc2 \z, \u\().s[2], \v0, \x\().s[2], \y0 + mulacc \z, \u\().s[3], \v0, \x\().s[3], \y0 +.endm +.macro mul4 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil + mulini2 \z, \zinitp, \u\().s[1], \v1, \x\().s[1], \y1 + mulacc \z, \u\().s[2], \v1, \x\().s[2], \y1 + mulacc2 \z, \u\().s[3], \v0, \x\().s[3], \y0 +.endm +.macro mul5 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil + mulini2 \z, \zinitp, \u\().s[2], \v1, \x\().s[2], \y1 + mulacc \z, \u\().s[3], \v1, \x\().s[3], \y1 +.endm +.macro mul6 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil + mulini2 \z, \zinitp, \u\().s[3], \v1, \x\().s[3], \y1 +.endm + +// Steps in the process of propagating carry bits upwards from ZLO (a 128-bit +// `vN' register). Here, T0, T1, and CG are 64-bit `xN' general-purpose +// registers clobbered in the process. Set the low 32 bits of the 64-bit +// `xN' general-purpose register ZOUT to the completed coefficient z_1, +// leaving a carry in CG. +// +// In detail, what happens is as follows. Suppose initially that ZLO = +// (z'_i; z''_i) and ZHI = (z'_{i+1}; z''_{i+1}). Let t = z'_i + b z''_i; +// observe that floor(t/b) = floor(z'_i/b) + z''_i. Let z_i = t mod B, and +// add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}. This has +// a circuit depth of 3; I don't know how to do better. +// +// Output words are left in the low half of a 64-bit register, with rubbish +// in the high half. Two such results can be combined using the `bfi' +// instruction. +.macro carry0 zlo, cg=x15, t0=x16, t1=x17 + // Capture the values of carry-register ZLO and CG (if not `nil') in + // general-purpose registers T0 and T1, suitable for use in `carry1'. + mov \t0, \zlo\().d[0] + mov \t1, \zlo\().d[1] + .ifnes "\cg", "nil" + add \t0, \t0, \cg, lsr #16 + .endif +.endm +.macro carry1 zout, cg=x15, t0=x16, t1=x17 + // Collect a 32-bit output word in the low 32 bits of ZOUT (leaving + // rubbish in the high 32 bits), and update CG suitably to continue + // processing with the next carry register. 
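+	// (The `...4' routines below use this pattern heavily: successive
+	// carry0/carry1 pairs leave output words in x11--x14, which are
+	// then glued together with `bfi' and stored in pairs with `stp'.)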
+ .ifnes "\zout", "nil" + add \zout, \t0, \t1, lsl #16 + .endif + .ifnes "\cg", "nil" + add \cg, \t1, \t0, lsr #16 + .endif +.endm + +.macro expand vlo, vhi, vz=v31 + // Expand the packed 128-bit operand in VLO to an expanded operand in + // VLO and VHI, assuming that VZ is all-bits-zero. All three are + // `vN' 128-bit SIMD registers. + zip2 \vhi\().8h, \vlo\().8h, \vz\().8h + zip1 \vlo\().8h, \vlo\().8h, \vz\().8h +.endm + +.macro sprdacc a0, a1, a2, a3=nil + // Spread the packed 128-bit operand in A0 into carry-format values + // in A0, A1, A2, A3. If A3 is `nil', then spread the same value + // into A0, A1, A2 only, clobbering x16. + .ifeqs "\a3", "nil" + mov w16, \a0\().s[3] + .endif + trn2 \a2\().2d, \a0\().2d, v31.2d + trn2 \a1\().2s, \a0\().2s, v31.2s + .ifeqs "\a3", "nil" + lsl x16, x16, #16 + .endif + trn1 \a0\().2s, \a0\().2s, v31.2s + .ifeqs "\a3", "nil" + mov \a2\().d[1], x16 + .else + trn2 \a3\().2s, \a2\().2s, v31.2s + .endif + mov \a2\().s[1], wzr +.endm + +.macro crryacc a0, a1, a2, a3, c0, c1, c2 + // Add the carry-format values A0, A1, A2 into the existing carries + // C0, C1, C2 (leaving A3 where it is). + add \c0\().2d, \c0\().2d, \a0\().2d + add \c1\().2d, \c1\().2d, \a1\().2d + add \c2\().2d, \c2\().2d, \a2\().2d +.endm + +///-------------------------------------------------------------------------- +/// Primitive multipliers and related utilities. + +INTFUNC(carryprop) + // On entry, x0 points to a destination, and v28--v30 and x15 hold + // incoming carries c0--c2 and cg. On exit, the low 128 bits of the + // carry value are stored at [x0]; the remaining 16 bits of carry are + // left in x10; x0 is advanced by 16; and x11--x17 are clobbered. + endprologue + + carry0 v28 + carry1 x11 + carry0 v29 + carry1 x13 + carry0 v30 + carry1 x12 + bfi x11, x13, #32, #32 + lsr x14, x15, #16 + lsr x10, x15, #48 + bfi x12, x14, #32, #32 + stp x11, x12, [x0], #16 + ret +ENDFUNC + +INTFUNC(dmul4) + // On entry, x0 points to the destination; x1 and x2 point to packed + // operands U and X; v2/v3 and v4/v5 hold expanded operands V and Y; + // v28--v30 and x15 hold incoming carries c0--c2 and cg; and v31 is + // zero. On exit, the destination and carries are updated; x0, x1, + // x2 are each advanced by 16; v2--v5 and v8--v15 are preserved; and + // x11--x14, x16, x17 and the other SIMD registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + ldr q0, [x1], #16 + ldr q1, [x2], #16 + + // Do the multiplication. + mul0 v28, t, v0, v2, v3, v1, v4, v5 + mul1 v29, t, v0, v2, v3, v1, v4, v5 + carry0 v28 + mul2 v30, t, v0, v2, v3, v1, v4, v5 + carry1 x11 + carry0 v29 + mul3 v27, nil, v0, v2, v3, v1, v4, v5 + carry1 x13 + carry0 v30 + mul4 v28, nil, v0, v2, v3, v1, v4, v5 + carry1 x12 + carry0 v27 + mul5 v29, nil, v0, v2, v3, v1, v4, v5 + carry1 x14 + mul6 v30, nil, v0, v2, v3, v1, v4, v5 + + // Finish up and store the result. + bfi x11, x13, #32, #32 + bfi x12, x14, #32, #32 + stp x11, x12, [x0], #16 + + // All done. + ret +ENDFUNC + +INTFUNC(dmla4) + // On entry, x0 points to the destination/accumulator A; x1 and x2 + // point to packed operands U and X; v2/v3 and v4/v5 hold expanded + // operands V and Y; v28--v30 and x15 hold incoming carries c0--c2 + // and cg; and v31 is zero. On exit, the accumulator and carries are + // updated; x0, x1, x2 are each advanced by 16; v2--v5 and v8--v15 + // are preserved; and x11--x14, x16, x17 and the other SIMD registers + // are clobbered. + endprologue + + // Start by loading the operand words from memory. 
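+	// (The accumulator A is loaded into v24; `sprdacc' splits it into
+	// carry format and `crryacc' folds the pieces into the incoming
+	// carries v28--v30 before the multiplication proper begins.)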
+ ldr q24, [x0] + ldr q0, [x1], #16 + ldr q1, [x2], #16 + sprdacc v24, v25, v26, v27 + crryacc v24, v25, v26, v27, v28, v29, v30 + + // Do the multiplication. + mul0 v28, t, v0, v2, v3, v1, v4, v5 + mul1 v29, t, v0, v2, v3, v1, v4, v5 + carry0 v28 + mul2 v30, t, v0, v2, v3, v1, v4, v5 + carry1 x11 + carry0 v29 + mul3 v27, t, v0, v2, v3, v1, v4, v5 + carry1 x13 + carry0 v30 + mul4 v28, nil, v0, v2, v3, v1, v4, v5 + carry1 x12 + carry0 v27 + mul5 v29, nil, v0, v2, v3, v1, v4, v5 + carry1 x14 + mul6 v30, nil, v0, v2, v3, v1, v4, v5 + + // Finish up and store the result. + bfi x11, x13, #32, #32 + bfi x12, x14, #32, #32 + stp x11, x12, [x0], #16 + + // All done. + ret +ENDFUNC + +INTFUNC(mul4) + // On entry, x0 points to the destination; x2 points to a packed + // operand X; v4/v5 holds an expanded operand Y; v13--v15 and x15 + // hold incoming carries c0--c2 and cg; and v31 is zero. On exit, + // the destination and carries are updated; x0 and x2 are each + // advanced by 16; v4 and v5 and v8--v15 are preserved; and x11--x14, + // x16, x17 and the other SIMD registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + ldr q1, [x2], #16 + + // Do the multiplication. + mul0 v28, t, v1, v4, v5 + mul1 v29, t, v1, v4, v5 + carry0 v28 + mul2 v30, t, v1, v4, v5 + carry1 x11 + carry0 v29 + mul3 v27, nil, v1, v4, v5 + carry1 x13 + carry0 v30 + mul4 v28, nil, v1, v4, v5 + carry1 x12 + carry0 v27 + mul5 v29, nil, v1, v4, v5 + carry1 x14 + mul6 v30, nil, v1, v4, v5 + + // Finish up and store the result. + bfi x11, x13, #32, #32 + bfi x12, x14, #32, #32 + stp x11, x12, [x0], #16 + + // All done. + ret +ENDFUNC + +INTFUNC(mul4zc) + // On entry, x0 points to the destination; x2 points to a packed + // operand X; v4/v5 holds an expanded operand Y; and v31 is zero. On + // exit, the destination is updated; v28--v30 and x15 hold outgoing + // carries c0--c2 and cg; x0 and x2 are each advanced by 16; v4 and + // v5 and v8--v15 are preserved; and x11--x14, x16, x17 and the other + // SIMD registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + ldr q1, [x2], #16 + + // Do the multiplication. + mul0 v28, nil, v1, v4, v5 + mul1 v29, nil, v1, v4, v5 + carry0 v28, nil + mul2 v30, nil, v1, v4, v5 + carry1 x11 + carry0 v29 + mul3 v27, nil, v1, v4, v5 + carry1 x13 + carry0 v30 + mul4 v28, nil, v1, v4, v5 + carry1 x12 + carry0 v27 + mul5 v29, nil, v1, v4, v5 + carry1 x14 + mul6 v30, nil, v1, v4, v5 + + // Finish up and store the result. + bfi x11, x13, #32, #32 + bfi x12, x14, #32, #32 + stp x11, x12, [x0], #16 + + // All done. + ret +ENDFUNC + +INTFUNC(mla4) + // On entry, x0 points to the destination/accumulator A; x2 points to + // a packed operand X; v4/v5 holds an expanded operand Y; v13--v15 + // and x15 hold incoming carries c0--c2 and cg; and v31 is zero. On + // exit, the accumulator and carries are updated; x0 and x2 are each + // advanced by 16; v4 and v5 and v8--v15 are preserved; and x11--x14, + // x16, x17 and the other SIMD registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + ldr q24, [x0] + ldr q1, [x2], #16 + sprdacc v24, v25, v26, v27 + crryacc v24, v25, v26, v27, v28, v29, v30 + + // Do the multiplication. 
+ mul0 v28, t, v1, v4, v5 + mul1 v29, t, v1, v4, v5 + carry0 v28 + mul2 v30, t, v1, v4, v5 + carry1 x11 + carry0 v29 + mul3 v27, t, v1, v4, v5 + carry1 x13 + carry0 v30 + mul4 v28, nil, v1, v4, v5 + carry1 x12 + carry0 v27 + mul5 v29, nil, v1, v4, v5 + carry1 x14 + mul6 v30, nil, v1, v4, v5 + + // Finish up and store the result. + bfi x11, x13, #32, #32 + bfi x12, x14, #32, #32 + stp x11, x12, [x0], #16 + + // All done. + ret +ENDFUNC + +INTFUNC(mla4zc) + // On entry, x0 points to the destination/accumulator A; x2 points to + // a packed operand X; v4/v5 holds an expanded operand Y; and v31 is + // zero. On exit, the accumulator is updated; v28--v30 and x15 hold + // outgoing carries c0--c2 and cg; x0 and x2 are each advanced by 16; + // v4, v5, and v8--v15 are preserved; and x11--x14, x16, x17 and the + // other SIMD registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + ldr q28, [x0] + ldr q1, [x2], #16 + sprdacc v28, v29, v30, v27 + + // Do the multiplication. + mul0 v28, t, v1, v4, v5 + mul1 v29, t, v1, v4, v5 + carry0 v28, nil + mul2 v30, t, v1, v4, v5 + carry1 x11 + carry0 v29 + mul3 v27, t, v1, v4, v5 + carry1 x13 + carry0 v30 + mul4 v28, nil, v1, v4, v5 + carry1 x12 + carry0 v27 + mul5 v29, nil, v1, v4, v5 + carry1 x14 + mul6 v30, nil, v1, v4, v5 + + // Finish up and store the result. + bfi x11, x13, #32, #32 + bfi x12, x14, #32, #32 + stp x11, x12, [x0], #16 + + // All done. + ret +ENDFUNC + +INTFUNC(mmul4) + // On entry, x0 points to the destination; x1 points to a packed + // operand U; x2 points to a packed operand X (the modulus); v2/v3 + // holds an expanded operand V; and v6/v7 holds an expanded operand M + // (the Montgomery factor -N^{-1} (mod B)). On exit, the destination + // is updated (to zero); v4/v5 hold an expanded factor Y = U V M (mod + // B); v28--v30 and x15 hold outgoing carries c0--c2 and cg; x0, x1, + // and x2 are each advanced by 16; v2, v3, and v8--v15 are preserved; + // and x11--x14, x16, x17 and the other SIMD registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + ldr q0, [x1], #16 + ldr q1, [x2], #16 + + // Calculate the low half of W = A + U V, being careful to leave the + // carries in place. + mul0 v28, nil, v0, v2, v3 + mul1 v29, nil, v0, v2, v3 + carry0 v28, nil + mul2 v30, nil, v0, v2, v3 + carry1 x11 + carry0 v29 + mul3 v27, nil, v0, v2, v3 + b mmla4_common +ENDFUNC + +INTFUNC(mmla4) + // On entry, x0 points to the destination/accumulator A; x1 points to + // a packed operand U; x2 points to a packed operand X (the modulus); + // v2/v3 holds an expanded operand V; and v6/v7 holds an expanded + // operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the + // accumulator is updated (to zero); v4/v5 hold an expanded factor Y + // = (A + U V) M (mod B); v28--v30 and x15 hold outgoing carries + // c0--c2 and cg; x0, x1, and x2 are each advanced by 16; v2, v3, v6, + // v7, and v8--v15 are preserved; and x11--x14, x16, x17 and the + // other SIMD registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + ldr q28, [x0] + ldr q0, [x1], #16 + ldr q1, [x2], #16 + sprdacc v28, v29, v30, v27 + + // Calculate the low half of W = A + U V, being careful to leave the + // carries in place. 
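+	// (The Montgomery factor Y computed further down satisfies
+	// W + X Y == 0 (mod B^4), so the four low output words are zero;
+	// hence the accumulator is finally written back as zero with
+	// `stp xzr, xzr'.)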
+ mul0 v28, t, v0, v2, v3 + mul1 v29, t, v0, v2, v3 + carry0 v28, nil + mul2 v30, t, v0, v2, v3 + carry1 x11 + carry0 v29 + mul3 v27, t, v0, v2, v3 +mmla4_common: + carry1 x13 + carry0 v30 + carry1 x12 + carry0 v27 + carry1 x14, nil + + // Piece the result together and ship it back. + bfi x11, x13, #32, #32 + bfi x12, x14, #32, #32 + mov v16.d[0], x11 + mov v16.d[1], x12 + + // Calculate the low half of the Montgomery factor Y = W M. + mul0 v18, nil, v16, v6, v7 + mul1 v19, nil, v16, v6, v7 + carry0 v18, nil + mul2 v20, nil, v16, v6, v7 + carry1 x11 + carry0 v19 + mul3 v21, nil, v16, v6, v7 + carry1 x13 + carry0 v20 + carry1 x12 + carry0 v21 + carry1 x14, nil + + // Piece the result together, ship it back, and expand. + bfi x11, x13, #32, #32 + bfi x12, x14, #32, #32 + mov v4.d[0], x11 + mov v4.d[1], x12 + expand v4, v5 + + // Build up the product X Y in the carry slots. + mul0 v28, t, v1, v4, v5 + mul1 v29, t, v1, v4, v5 + carry0 v28, nil + mul2 v30, t, v1, v4, v5 + carry1 nil + carry0 v29 + mul3 v27, t, v1, v4, v5 + carry1 nil + carry0 v30 + + // And complete the calculation. + mul4 v28, nil, v0, v2, v3, v1, v4, v5 + carry1 nil + carry0 v27 + mul5 v29, nil, v0, v2, v3, v1, v4, v5 + carry1 nil + mul6 v30, nil, v0, v2, v3, v1, v4, v5 + + // Finish up and store the result. + stp xzr, xzr, [x0], #16 + + // All done. + ret +ENDFUNC + +INTFUNC(mont4) + // On entry, x0 points to the destination/accumulator A; x2 points to + // a packed operand X (the modulus); and v6/v7 holds an expanded + // operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the + // accumulator is updated (to zero); v4/v5 hold an expanded factor Y + // = A M (mod B); v28--v30 and x15 hold outgoing carries c0--c2 and + // cg; x0 and x2 are each advanced by 16; v6, v7, and v8--v15 are + // preserved; and x11--x14, x16, x17 and the other SIMD registers are + // clobbered. + endprologue + + // Start by loading the operand words from memory. + ldr q28, [x0] + ldr q1, [x2], #16 + + // Calculate Y = A M (mod B). + mul0 v18, nil, v28, v6, v7 + mul1 v19, nil, v28, v6, v7 + carry0 v18, nil + mul2 v20, nil, v28, v6, v7 + carry1 x11 + carry0 v19 + mul3 v21, nil, v28, v6, v7 + carry1 x13 + carry0 v20 + sprdacc v28, v29, v30, v27 + carry1 x12 + carry0 v21 + carry1 x14, nil + + // Piece the result together, ship it back, and expand. + bfi x11, x13, #32, #32 + bfi x12, x14, #32, #32 + mov v4.d[0], x11 + mov v4.d[1], x12 + expand v4, v5 + + // Calculate the actual result. Well, the carries, at least. + mul0 v28, t, v1, v4, v5 + mul1 v29, t, v1, v4, v5 + carry0 v28, nil + mul2 v30, t, v1, v4, v5 + carry1 nil + carry0 v29 + mul3 v27, t, v1, v4, v5 + carry1 nil + carry0 v30 + + // And complete the calculation. + mul4 v28, nil, v1, v4, v5 + carry1 nil + carry0 v27 + mul5 v29, nil, v1, v4, v5 + carry1 nil + mul6 v30, nil, v1, v4, v5 + + // Finish up and store the result. + stp xzr, xzr, [x0], #16 + + // All done. + ret +ENDFUNC + +///-------------------------------------------------------------------------- +/// Bulk multipliers. + +FUNC(mpx_umul4_arm64_simd) + // void mpx_umul4_arm64_simd(mpw *dv, const mpw *av, const mpw *avl, + // const mpw *bv, const mpw *bvl); + + // Establish the arguments and do initial setup. + // + // inner loop dv x0 + // inner loop av x2 + // outer loop dv x5 + // outer loop bv x3 + // av base x1 + // inner n x6 + // n base x7 + // outer n x4 + pushreg x29, x30 + setfp + endprologue + + // Prepare for the first iteration. 
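+ // (Each outer-loop pass multiplies the whole of av by one four-word
+ // block of bv and adds the result into dv: the first pass goes
+ // through mul4zc/mul4, which overwrite the destination, and later
+ // passes go through mla4zc/mla4, which accumulate into it.)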
+ ldr q4, [x3], #16 // Y = bv[0]
+ movi v31.4s, #0
+ sub x7, x2, x1 // = inner loop count base
+ // x0 // = dv for inner loop
+ // x1 // = av base
+ // x3 // = bv for outer loop
+ sub x4, x4, x3 // = outer loop count (decremented)
+ sub x6, x7, #16 // = inner loop count (decremented)
+ mov x2, x1 // = av for inner loop
+ add x5, x0, #16 // = dv for outer loop
+ expand v4, v5 // expand Y
+ bl mul4zc
+ cbz x6, 8f // all done?
+
+ // Continue with the first iteration.
+0: sub x6, x6, #16
+ bl mul4
+ cbnz x6, 0b
+
+ // Write out the leftover carry. There can be no tail here.
+8: bl carryprop
+ cbz x4, 9f
+
+ // Set up for the next pass.
+1: ldr q4, [x3], #16 // Y = bv[i]
+ mov x0, x5 // -> dv[i]
+ mov x2, x1 // -> av[0]
+ add x5, x5, #16
+ sub x6, x7, #16 // = inner loop count (decremented)
+ sub x4, x4, #16 // outer loop count
+ expand v4, v5 // expand Y
+ bl mla4zc
+ cbz x6, 8f
+
+ // Continue...
+0: sub x6, x6, #16
+ bl mla4
+ cbnz x6, 0b
+
+ // Finish off this pass. There was no tail on the previous pass, and
+ // there can be none on this pass.
+8: bl carryprop
+ cbnz x4, 1b
+
+ // All over.
+9: popreg x29, x30
+ ret
+ENDFUNC
+
+FUNC(mpxmont_mul4_arm64_simd)
+ // void mpxmont_mul4_arm64_simd(mpw *dv,
+ // const mpw *av, const mpw *bv,
+ // const mpw *nv, size_t n,
+ // const mpw *mi);
+
+ // Establish the arguments and do initial setup.
+ //
+ // inner loop dv x0
+ // inner loop av x1
+ // inner loop nv x2
+ // nv base x3
+ // base n x4
+ // mi (x5)
+ // outer loop dv x5
+ // outer loop bv x6
+ // av base x7
+ // inner n x8
+ // outer n x9
+ // c x10
+ pushreg x29, x30
+ setfp
+ endprologue
+
+ // Set up the outer loop state and prepare for the first iteration.
+ ldr q2, [x2] // = V = bv[0]
+ ldr q6, [x5] // = M
+ movi v31.4s, #0
+ // x0 // -> dv for inner loop
+ // x1 // -> av for inner loop
+ // x3 // -> nv base
+ // x4 // = n base
+ add x5, x0, #16 // -> dv
+ add x6, x2, #16 // -> bv
+ mov x2, x3 // -> nv[0]
+ mov x7, x1 // -> av base
+ sub x8, x4, #4 // = inner n (decremented)
+ sub x9, x4, #4 // = outer n (decremented)
+ expand v2, v3 // expand V
+ expand v6, v7 // expand M
+ bl mmul4
+ cbz x8, 8f // done already?
+
+ // Complete the first inner loop.
+0: sub x8, x8, #4
+ bl dmul4
+ cbnz x8, 0b // done yet?
+
+ // Still have carries left to propagate. Rather than store the tail
+ // end in memory, keep it in x10 for later.
+ bl carryprop
+
+ // Embark on the next iteration. (There must be one. If n = 1 then
+ // we would have bailed above, to label 8. Similarly, the subsequent
+ // iterations can fall into the inner loop immediately.)
+1: ldr q2, [x6], #16 // = V = bv[i]
+ mov x0, x5 // -> dv[i]
+ mov x1, x7 // -> av[0]
+ mov x2, x3 // -> nv[0]
+ add x5, x5, #16
+ sub x8, x4, #4
+ sub x9, x9, #4
+ expand v2, v3
+ bl mmla4
+
+ // Complete the next inner loop.
+0: sub x8, x8, #4
+ bl dmla4
+ cbnz x8, 0b
+
+ // Still have carries left to propagate, and they overlap the
+ // previous iteration's final tail, so read that and add it.
+ add x15, x15, x10, lsl #16
+ bl carryprop
+
+ // Back again, maybe.
+ cbnz x9, 1b
+
+ // All done, almost.
+ str w10, [x0], #4
+ popreg x29, x30
+ ret
+
+ // First iteration was short. Write out the carries and we're done.
+ // (This could be folded into the main loop structure, but that would
+ // penalize small numbers more.)
+8: bl carryprop
+ str w10, [x0], #4
+ popreg x29, x30
+ ret
+ENDFUNC
+
+FUNC(mpxmont_redc4_arm64_simd)
+ // void mpxmont_redc4_arm64_simd(mpw *dv, mpw *dvl, const mpw *nv,
+ // size_t n, const mpw *mi);
+
+ // Establish the arguments and do initial setup.
+ //
+ // inner loop dv x0
+ // inner loop nv x2
+ // blocks-of-4 count x6
+ // tail count x7
+ // mi (x4)
+ // outer loop dv x4
+ // outer loop count x8
+ // nv base x5
+ // inner loop count x1
+ // n x3
+ // c x10
+ // t0, t1 x11, x12
+
+ pushreg x29, x30
+ setfp
+ endprologue
+
+ // Set up the outer loop state and prepare for the first iteration.
+ ldr q6, [x4] // = M
+ movi v31.4s, #0
+ // x0 // -> dv for inner loop
+ sub x6, x1, x0 // total dv bytes
+ sub x1, x3, #4 // inner loop counter
+ // x2 // -> nv for inner loop
+ // x3 // = n
+ add x4, x0, #16 // -> dv for outer loop
+ mov x5, x2 // -> nv base
+ sub x6, x6, x3, lsl #2 // dv carry range bytes
+ sub x8, x3, #4 // outer loop counter
+ sub x6, x6, #16 // dv steam-powered carry bytes
+ expand v6, v7 // expand M
+ and x7, x6, #15 // dv tail length in bytes
+ bic x6, x6, #15 // dv blocks-of-four length in bytes
+
+ bl mont4
+ cbz x1, 8f // done already?
+
+5: sub x1, x1, #4
+ bl mla4
+ cbnz x1, 5b // done yet?
+
+ // Still have carries left to propagate. Adding the accumulator
+ // block into the carries is a little different this time, because
+ // all four accumulator limbs have to be squished into the three
+ // carry registers for `carryprop' to do its thing.
+8: ldr q24, [x0]
+ sprdacc v24, v25, v26
+ add v28.2d, v28.2d, v24.2d
+ add v29.2d, v29.2d, v25.2d
+ add v30.2d, v30.2d, v26.2d
+ bl carryprop
+ cbz x6, 7f
+
+ // Propagate the first group of carries.
+ ldp x16, x17, [x0]
+ sub x1, x6, #16
+ adds x16, x16, x10
+ adcs x17, x17, xzr
+ stp x16, x17, [x0], #16
+ cbz x1, 6f
+
+ // Continue carry propagation until the end of the buffer.
+0: ldp x16, x17, [x0]
+ sub x1, x1, #16
+ adcs x16, x16, xzr
+ adcs x17, x17, xzr
+ stp x16, x17, [x0], #16
+ cbnz x1, 0b
+
+ // Deal with the tail end. Note that the actual destination length
+ // won't be an exact number of blocks of four, so it's safe to just
+ // drop through here.
+6: adc w10, wzr, wzr
+7: ldr w16, [x0]
+ sub x1, x7, #4
+ adds w16, w16, w10
+ str w16, [x0], #4
+ cbz x1, 8f
+0: ldr w16, [x0]
+ sub x1, x1, #4
+ adcs w16, w16, wzr
+ str w16, [x0], #4
+ cbnz x1, 0b
+
+ // All done for this iteration. Start the next.
+8: cbz x8, 9f
+ mov x0, x4
+ add x4, x4, #16
+ sub x1, x3, #4
+ mov x2, x5
+ sub x8, x8, #4
+ sub x6, x6, #16
+ bl mont4
+ b 5b
+
+ // All over.
+9: popreg x29, x30
+ ret
+ENDFUNC
+
+///--------------------------------------------------------------------------
+/// Testing and performance measurement.
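+
+// The wrappers below are built only when TEST_MUL4 is defined: they let
+// the test driver (mpx-mul4-test.c) exercise the internal block
+// operations one at a time. Each `testprologue' variant shuffles the
+// incoming C arguments into the fixed register assignment tabulated
+// below.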
+ +#ifdef TEST_MUL4 + +// dmul smul mmul mont +// z x0 x0 x0 x0 x0 +// c x3 x1 x1 x1 x1 +// y x4 -- -- x2 x2 +// u x1 x2 -- x3 -- +// x x2 x3 x2 x4 x3 +// vv v2/v3 x4 -- x5 -- +// yy v4/v5 x5 x3 x6 -- +// mm v6/v7 -- -- -- x4 +// n x5 x6 x4 x7 x5 +// cyv x6 x7 x5 stk0 x6 + +#define STKARG(i) sp, #16 + i + +.macro testprologue mode + pushreg x29, x30 + setfp + endprologue + movi v31.4s, #0 + + .ifeqs "\mode", "dmul" + ldr q2, [x4] + zip2 v3.8h, v2.8h, v31.8h // (v'_2, v''_2; v'_3, v''_3) + zip1 v2.8h, v2.8h, v31.8h // (v'_0, v''_0; v'_1, v''_1) + + ldr q4, [x5] + zip2 v5.8h, v4.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3) + zip1 v4.8h, v4.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1) + + mov x16, x1 + mov x1, x2 // -> u + mov x2, x3 // -> x + mov x3, x16 // -> c + + mov x5, x6 // = n + mov x6, x7 // -> cyv + .endif + + .ifeqs "\mode", "smul" + ldr q4, [x3] + zip2 v5.8h, v4.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3) + zip1 v4.8h, v4.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1) + + // x2 // -> x + mov x3, x1 // -> c + mov x6, x5 // -> cyv + mov x5, x4 // = n + .endif + + .ifeqs "\mode", "mmul" + ldr q2, [x5] + zip2 v3.8h, v2.8h, v31.8h // (v'_2, v''_2; v'_3, v''_3) + zip1 v2.8h, v2.8h, v31.8h // (v'_0, v''_0; v'_1, v''_1) + + ldr q6, [x6] + zip2 v7.8h, v6.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3) + zip1 v6.8h, v6.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1) + + mov x16, x1 + mov x1, x3 // -> u + mov x3, x16 // -> c + + mov x16, x2 + mov x2, x4 // -> x + mov x4, x16 // -> y + + mov x5, x7 // = n + ldr x6, [STKARG(0)] // -> cyv + .endif + + .ifeqs "\mode", "mont" + ldr q6, [x4] + zip2 v7.8h, v6.8h, v31.8h // (m'_2, m''_2; m'_3, m''_3) + zip1 v6.8h, v6.8h, v31.8h // (m'_0, m''_0; m'_1, m''_1) + + mov x4, x2 // -> y + mov x2, x3 // -> x + mov x3, x1 // -> c + + // x5 // = n + // x6 // -> cyv + .endif +.endm + +.macro testldcarry + ld1 {v28.2d-v30.2d}, [x3] + mov x15, #0 +.endm + +.macro testtop +0: sub x5, x5, #1 +.endm + +.macro testtail + cbnz x5, 0b +.endm + +.macro testcarryout + // More complicated than usual because we must mix the general- + // purpose carry back in. 
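+ // The general-purpose part of the carry (cg) lives in x15; fold it
+ // into the low-order SIMD carry register and then write all three
+ // carries back through x3 for the driver to check.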
+ lsr x15, x15, #16 + mov v0.d[0], x15 + mov v0.d[1], xzr + add v28.2d, v28.2d, v0.2d + st1 {v28.2d-v30.2d}, [x3] +.endm + +.macro testepilogue + popreg x29, x30 + ret +.endm + +FUNC(test_dmul4) + testprologue dmul + testldcarry + testtop + bl dmul4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_dmla4) + testprologue dmul + testldcarry + testtop + bl dmla4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mul4) + testprologue smul + testldcarry + testtop + bl mul4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mul4zc) + testprologue smul + testldcarry + testtop + bl mul4zc + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mla4) + testprologue smul + testldcarry + testtop + bl mla4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mla4zc) + testprologue smul + testldcarry + testtop + bl mla4zc + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mmul4) + testprologue mmul + testtop + bl mmul4 + testtail + stp q4, q5, [x4] + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mmla4) + testprologue mmul + testtop + bl mmla4 + testtail + stp q4, q5, [x4] + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mont4) + testprologue mont + testtop + bl mont4 + testtail + stp q4, q5, [x4] + testcarryout + testepilogue +ENDFUNC + +#endif + +///----- That's all, folks -------------------------------------------------- diff --git a/math/mpx-mul4-test.c b/math/mpx-mul4-test.c index 85a15327..5325db0f 100644 --- a/math/mpx-mul4-test.c +++ b/math/mpx-mul4-test.c @@ -61,6 +61,18 @@ static int cpu_features_p(void) { return (cpu_feature_p(CPUFEAT_X86_SSE2)); } static int cpu_features_p(void) { return (cpu_feature_p(CPUFEAT_X86_SSE2)); } #endif +#if CPUFAM_ARMEL +# define VARIANT _arm_neon +# define REPR_32 +static int cpu_features_p(void) { return (cpu_feature_p(CPUFEAT_ARM_NEON)); } +#endif + +#if CPUFAM_ARM64 +# define VARIANT _arm64_simd +# define REPR_32 +static int cpu_features_p(void) { return (1); } +#endif + #ifndef VARIANT # error "Unsupported CPU family." #endif diff --git a/math/mpx.c b/math/mpx.c index f761003f..d3d0a04a 100644 --- a/math/mpx.c +++ b/math/mpx.c @@ -931,6 +931,14 @@ static void simple_umul(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl, MAYBE_UMUL4(amd64_avx) #endif +#if CPUFAM_ARMEL + MAYBE_UMUL4(arm_neon) +#endif + +#if CPUFAM_ARM64 + MAYBE_UMUL4(arm64_simd) +#endif + static mpx_umul__functype *pick_umul(void) { #if CPUFAM_X86 @@ -945,6 +953,13 @@ static mpx_umul__functype *pick_umul(void) DISPATCH_PICK_COND(mpx_umul, maybe_umul4_amd64_sse2, cpu_feature_p(CPUFEAT_X86_SSE2)); #endif +#if CPUFAM_ARMEL + DISPATCH_PICK_COND(mpx_umul, maybe_umul4_arm_neon, + cpu_feature_p(CPUFEAT_ARM_NEON)); +#endif +#if CPUFAM_ARM64 + DISPATCH_PICK_COND(mpx_umul, maybe_umul4_arm64_simd, 1); +#endif DISPATCH_PICK_FALLBACK(mpx_umul, simple_umul); }
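
For reference, the job these `umul4' entry points take over from `simple_umul' is plain long multiplication over 32-bit words. The sketch below is only an illustration of that contract, not Catacomb's code (the name `umul_ref' is made up here): it assumes 32-bit `mpw'-style limbs, half-open [av, avl) operand ranges, and a destination large enough for the full product, and it leaves out the dispatch and fallback logic in mpx.c.

    #include <stddef.h>
    #include <stdint.h>

    /* Reference long multiplication: write the product of [av, avl) and
     * [bv, bvl) to dv, least-significant word first.  dv must not overlap
     * the operands and must have (avl - av) + (bvl - bv) words of space. */
    static void umul_ref(uint32_t *dv,
                         const uint32_t *av, const uint32_t *avl,
                         const uint32_t *bv, const uint32_t *bvl)
    {
      size_t an = avl - av, bn = bvl - bv, i, j;
      uint64_t t, c;

      for (i = 0; i < an + bn; i++) dv[i] = 0;
      for (j = 0; j < bn; j++) {
        c = 0;
        for (i = 0; i < an; i++) {
          /* 32x32 -> 64-bit partial product, plus the word already in the
           * destination, plus the running carry: this cannot overflow. */
          t = (uint64_t)av[i]*bv[j] + dv[i + j] + c;
          dv[i + j] = (uint32_t)t;
          c = t >> 32;
        }
        dv[j + an] = (uint32_t)c;
      }
    }

The assembler version does the same thing four words at a time: each outer pass takes a four-word block of bv (expanded into v4/v5) across the whole of av, carrying between the four-word inner steps in v28--v30 and x15.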