mdw@git.distorted.org.uk Git - catacomb/blame_incremental

... / ...

Commit	Line	Data
	1	/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
	2	///
	3	/// Large SIMD-based multiplications
	4	///
	5	/// (c) 2016 Straylight/Edgeware
	6	///
	7
	8	///----- Licensing notice ---------------------------------------------------
	9	///
	10	/// This file is part of Catacomb.
	11	///
	12	/// Catacomb is free software; you can redistribute it and/or modify
	13	/// it under the terms of the GNU Library General Public License as
	14	/// published by the Free Software Foundation; either version 2 of the
	15	/// License, or (at your option) any later version.
	16	///
	17	/// Catacomb is distributed in the hope that it will be useful,
	18	/// but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	/// GNU Library General Public License for more details.
	21	///
	22	/// You should have received a copy of the GNU Library General Public
	23	/// License along with Catacomb; if not, write to the Free
	24	/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	/// MA 02111-1307, USA.
	26
	27	///--------------------------------------------------------------------------
	28	/// External definitions.
	29
	30	#include "config.h"
	31	#include "asm-common.h"
	32
	33	///--------------------------------------------------------------------------
	34	/// Prologue.
	35
	36	.arch pentium4
	37	.text
	38
	39	///--------------------------------------------------------------------------
	40	/// Theory.
	41	///
	42	/// We define a number of primitive fixed-size multipliers from which we can
	43	/// construct more general variable-length multipliers.
	44	///
	45	/// The basic trick is the same throughout. In an operand-scanning
	46	/// multiplication, the inner multiplication loop multiplies a
	47	/// multiple-precision operand by a single precision factor, and adds the
	48	/// result, appropriately shifted, to the result. A `finely integrated
	49	/// operand scanning' implementation of Montgomery multiplication also adds
	50	/// the product of a single-precision `Montgomery factor' and the modulus,
	51	/// calculated in the same pass. The more common `coarsely integrated
	52	/// operand scanning' alternates main multiplication and Montgomery passes,
	53	/// which requires additional carry propagation.
	54	///
	55	/// Throughout both plain-multiplication and Montgomery stages, then, one of
	56	/// the factors remains constant throughout the operation, so we can afford
	57	/// to take a little time to preprocess it. The transformation we perform is
	58	/// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
	59	/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
	60	/// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
	61	/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
	62	/// operands, as follows.
	63	///
	64	/// Offset 0 4 8 12
	65	/// 0 v'_0 v'_1 v''_0 v''_1
	66	/// 16 v'_2 v'_3 v''_2 v''_3
	67	///
	68	/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
	69	/// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
	70	/// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
	71	/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
	72	/// results in 64-bit fields. The sixteen bits of headroom allows us to add
	73	/// many products together before we must deal with carrying; it also allows
	74	/// for some calculations to be performed on the above expanded form.
	75	///
	76	/// ...
	77	///
	78	/// We maintain four `carry' registers accumulating intermediate results.
	79	/// The registers' precise roles rotate during the computation; we name them
	80	/// `c0', `c1', `c2', and `c3'. Each carry register holds two 64-bit halves:
	81	/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
	82	/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
	83	/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
	84	/// `pmuludq' instruction acting on a scalar operand (broadcast across all
	85	/// lanes of its vector) and an operand in the expanded form above produces a
	86	/// result which can be added directly to the appropriate carry register.
	87	/// Following a pass of four multiplications, we perform some limited carry
	88	/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
	89	/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
	90	/// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
	91	/// zeroed and becomes c3.
	92
	93	///--------------------------------------------------------------------------
	94	/// Macro definitions.
	95
	96	.macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
	97	// Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
	98	// of the product in registers D0, D1, D2, D3.
		//
		// R is a packed operand and I selects which of its 32-bit lanes is
		// used as the scalar factor. Each of D1, D2, and D3 may be `nil',
		// in which case the corresponding piece is simply not computed; D0
		// is always written. R, SLO, and SHI are only read.
	99	pshufd \d0, \r, SHUF(3, \i, 3, \i) // (r_i, ?, r_i, ?)
	100	.ifnes "\d1", "nil"
	101	movdqa \d1, \slo // (s'_0, s'_1, s''_0, s''_1)
	102	.endif
	103	.ifnes "\d3", "nil"
	104	movdqa \d3, \shi // (s'_2, s'_3, s''_2, s''_3)
	105	.endif
	106	.ifnes "\d1", "nil"
	107	psrldq \d1, 4 // (s'_1, s''_0, s''_1, 0): odd pieces now in even lanes
	108	.endif
	109	.ifnes "\d2", "nil"
	110	movdqa \d2, \d0 // another copy of (r_i, ?, r_i, ?)
	111	.endif
	112	.ifnes "\d3", "nil"
	113	psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0): odd pieces now in even lanes
	114	.endif
	115	.ifnes "\d1", "nil"
	116	pmuludq \d1, \d0 // (r_i s'_1, r_i s''_1)
	117	.endif
	118	.ifnes "\d3", "nil"
	119	pmuludq \d3, \d0 // (r_i s'_3, r_i s''_3)
	120	.endif
	121	.ifnes "\d2", "nil"
	122	pmuludq \d2, \shi // (r_i s'_2, r_i s''_2)
	123	.endif
		// D0 is multiplied last: the steps above still needed it as the
		// broadcast scalar.
	124	pmuludq \d0, \slo // (r_i s'_0, r_i s''_0)
	125	.endm
	126
	127	.macro accum c0, c1=nil, c2=nil, c3=nil
	128	// Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
	129	// carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
	130	// updating that register.
		//
		// The source registers are fixed: XMM0 is always added into C0,
		// XMM1 into C1, and so on. Each addition is a `paddq', i.e., it
		// acts independently on the two 64-bit halves. XMM0--XMM3 are left
		// unchanged.
	131	paddq \c0, xmm0
	132	.ifnes "\c1", "nil"
	133	paddq \c1, xmm1
	134	.endif
	135	.ifnes "\c2", "nil"
	136	paddq \c2, xmm2
	137	.endif
	138	.ifnes "\c3", "nil"
	139	paddq \c3, xmm3
	140	.endif
	141	.endm
	142
	143	.macro mulacc r, i, slo, shi, c0=nil, c1=nil, c2=nil, c3=nil, z3p=nil
	144	// Multiply R_I by the expanded operand SLO/SHI, and accumulate in
	145	// carry registers C0, C1, C2, C3. If Z3P is `t' then C3 notionally
	146	// contains zero, but needs clearing; in practice, we store the
	147	// product directly rather than attempting to add. On completion,
	148	// XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P is not `t'.
		//
		// Although C0 has a default of `nil', it must always be supplied:
		// `accum' adds into it unconditionally.
	149	.ifeqs "\z3p", "t"
		// C3 is used directly as the fourth product register, so its old
		// contents are overwritten rather than added to.
	150	mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, \c3
	151	accum \c0, \c1, \c2
	152	.else
		// All four pieces go via XMM0--XMM3 and are added to C0--C3.
	153	mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, xmm3
	154	accum \c0, \c1, \c2, \c3
	155	.endif
	156	.endm
	157
	158	.macro propout d, pos, c, cc=nil
	159	// Calculate an output word from C, and store it at POS in D;
	160	// propagate carries out from C to CC in preparation for a rotation
	161	// of the carry registers. D is an XMM register; the POS is either
	162	// `lo' or `hi' according to whether the output word should be in
	163	// lane 0 or 1 of D; the high two lanes of D are clobbered. On
	164	// completion, XMM3 is clobbered. If CC is `nil', then the
	165	// contribution which would have been added to it is left in C.
		//
		// When POS is `lo', the whole of D is overwritten with a copy of C
		// (only lane 0 is meaningful afterwards). When POS is `hi', lane 0
		// of D is preserved and the output word is interleaved into lane 1.
		// In either case C itself is modified.
	166	pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
	167	psrldq xmm3, 12 // (t, 0, 0, 0) = (t, 0)
	168	pslldq xmm3, 2 // (t b, 0)
	169	paddq \c, xmm3 // (c' + t b, c'')
	170	.ifeqs "\pos", "lo"
	171	movdqa \d, \c // lane 0 = output word z
	172	.else
	173	punpckldq \d, \c // (d_0, z, ?, ?)
	174	.endif
	175	psrlq \c, 32 // (floor((c' + t b)/B), floor(c''/B))
	176	.ifnes "\cc", "nil"
	177	paddq \cc, \c // propagate up
	178	.endif
	179	.endm
	180
	181	.macro endprop d, pos, c, t
	182	// On entry, C contains a carry register. On exit, the low 32 bits
	183	// of the value represented in C are written at POS in D, and the
	184	// remaining bits are left at the bottom of T.
		//
		// POS is `lo' or `hi': for `lo', D is overwritten by a copy of T
		// whose lane 0 is the output word; for `hi', the output word is
		// interleaved into lane 1 of D, preserving lane 0. C is clobbered.
	185	movdqa \t, \c
	186	psllq \t, 16 // (?, c'' b)
	187	pslldq \c, 8 // (0, c')
	188	paddq \t, \c // (?, c' + c'' b)
	189	psrldq \t, 8 // c' + c'' b
	190	.ifeqs "\pos", "lo"
	191	movdqa \d, \t
	192	.else
	193	punpckldq \d, \t
	194	.endif
	195	psrldq \t, 4 // floor((c' + c'' b)/B)
	196	.endm
	197
	198	.macro expand z, a, b, c=nil, d=nil
	199	// On entry, A and C hold packed 128-bit values, and Z is zero. On
	200	// exit, A:B and C:D together hold the same values in expanded
	201	// form. If C is `nil', then only expand A to A:B.
		//
		// Each 16-bit piece is zero-extended into a 32-bit cell by
		// interleaving with Z, and the cells are then permuted so that the
		// low halves come first. Z is only read. D is written only if C is
		// supplied.
	202	movdqa \b, \a // (a_0, a_1, a_2, a_3)
	203	.ifnes "\c", "nil"
	204	movdqa \d, \c // (c_0, c_1, c_2, c_3)
	205	.endif
	206	punpcklwd \a, \z // (a'_0, a''_0, a'_1, a''_1)
	207	punpckhwd \b, \z // (a'_2, a''_2, a'_3, a''_3)
	208	.ifnes "\c", "nil"
	209	punpcklwd \c, \z // (c'_0, c''_0, c'_1, c''_1)
	210	punpckhwd \d, \z // (c'_2, c''_2, c'_3, c''_3)
	211	.endif
	212	pshufd \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
	213	pshufd \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
	214	.ifnes "\c", "nil"
	215	pshufd \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
	216	pshufd \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
	217	.endif
	218	.endm
	219
	220	.macro squash c0, c1, c2, c3, t, u, lo, hi=nil
	221	// On entry, C0, C1, C2, C3 are carry registers representing a value
	222	// Y. On exit, LO holds the low 128 bits of the carry value; C0, C1,
	223	// C2, C3, T, and U are clobbered; and the high bits of Y are stored
	224	// in HI, if this is not `nil'.
	225
	226	// The first step is to eliminate the `double-prime' pieces -- i.e.,
	227	// the ones offset by 16 bits from a 32-bit boundary -- by carrying
	228	// them into the 32-bit-aligned pieces above and below. But before
	229	// we can do that, we must gather them together.
	230	movdqa \t, \c0
	231	movdqa \u, \c1
	232	punpcklqdq \t, \c2 // (y'_0, y'_2)
	233	punpckhqdq \c0, \c2 // (y''_0, y''_2)
	234	punpcklqdq \u, \c3 // (y'_1, y'_3)
	235	punpckhqdq \c1, \c3 // (y''_1, y''_3)
	236
	237	// Now split the double-prime pieces. The high (up to) 48 bits will
	238	// go up; the low 16 bits go down.
	239	movdqa \c2, \c0
	240	movdqa \c3, \c1
	241	psllq \c2, 48
	242	psllq \c3, 48
	243	psrlq \c0, 16 // high parts of (y''_0, y''_2)
	244	psrlq \c1, 16 // high parts of (y''_1, y''_3)
	245	psrlq \c2, 32 // low parts of (y''_0, y''_2)
	246	psrlq \c3, 32 // low parts of (y''_1, y''_3)
	247	.ifnes "\hi", "nil"
	248	movdqa \hi, \c1
	249	.endif
	250	pslldq \c1, 8 // high part of (0, y''_1)
	251
	252	paddq \t, \c2 // propagate down
	253	paddq \u, \c3
	254	paddq \t, \c1 // and up: (y_0, y_2)
	255	paddq \u, \c0 // (y_1, y_3)
	256	.ifnes "\hi", "nil"
	257	psrldq \hi, 8 // high part of (y''_3, 0)
	258	.endif
	259
	260	// Finally extract the answer. This complicated dance is better than
	261	// storing to memory and loading, because the piecemeal stores
	262	// inhibit store forwarding.
	263	movdqa \c3, \t // (y_0, y_2)
	264	movdqa \lo, \t // (y^*_0, ?, ?, ?)
	265	psrldq \t, 8 // (y_2, 0)
	266	psrlq \c3, 32 // (floor(y_0/B), ?)
	267	paddq \c3, \u // (y_1 + floor(y_0/B), ?)
	268	movdqa \c1, \c3 // (y^*_1, ?, ?, ?)
	269	psrldq \u, 8 // (y_3, 0)
	270	psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2), ?)
	271	paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2), ?)
	272	punpckldq \lo, \c3 // (y^*_0, y^*_2, ?, ?)
	273	psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
	274	paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
	275	.ifnes "\hi", "nil"
	276	movdqa \t, \c3
	277	pxor \u, \u
	278	.endif
	279	punpckldq \c1, \c3 // (y^*_1, y^*_3, ?, ?)
	280	.ifnes "\hi", "nil"
	281	psrlq \t, 32 // very high bits of y
	282	paddq \hi, \t
	283	punpcklqdq \hi, \u // carry up
	284	.endif
	285	punpckldq \lo, \c1 // y mod B^4
	286	.endm
	287
	288	.macro carryadd
	289	// On entry, RDI points to a packed addend A, and XMM12, XMM13, XMM14
	290	// hold the incoming carry registers c0, c1, and c2 representing a
	291	// carry-in C.
	292	//
	293	// On exit, the carry registers, including XMM15, are updated to hold
	294	// C + A; XMM0, XMM1, and XMM2 are clobbered. The other
	295	// registers are preserved.
		//
		// Note that XMM15 (c3) is loaded with a_3 rather than added to: its
		// previous contents are discarded.
	296	movd xmm0, [rdi + 0] // (a_0, 0)
	297	movd xmm1, [rdi + 4] // (a_1, 0)
	298	movd xmm2, [rdi + 8] // (a_2, 0)
	299	movd xmm15, [rdi + 12] // c3 = (a_3, 0)
	300	paddq xmm12, xmm0 // (c'_0 + a_0, c''_0)
	301	paddq xmm13, xmm1 // (c'_1 + a_1, c''_1)
	302	paddq xmm14, xmm2 // (c'_2 + a_2, c''_2)
	303	.endm
	304
	305	///--------------------------------------------------------------------------
	306	/// Primitive multipliers and related utilities.
	307
	308	INTFUNC(carryprop)
	309	// On entry, XMM12, XMM13, and XMM14 hold a 144-bit carry in an
	310	// expanded form. Store the low 128 bits of the represented carry to
	311	// [RDI] as a packed 128-bit value, and leave the remaining 16 bits
	312	// in the low 32 bits of XMM12. On exit, XMM0, XMM1, XMM3, XMM13 and
	313	// XMM14 are clobbered.
	314	endprologue
	315
		// Words 0 and 2 are collected in XMM0, and words 1 and 3 in XMM1;
		// the final interleave puts them in order.
	316	propout xmm0, lo, xmm12, xmm13
	317	propout xmm1, lo, xmm13, xmm14
	318	propout xmm0, hi, xmm14, nil // nothing above: leave the rest in XMM14
	319	endprop xmm1, hi, xmm14, xmm12 // last word; leftover bits to XMM12
	320	punpckldq xmm0, xmm1 // (z_0, z_1, z_2, z_3)
	321	movdqu [rdi], xmm0
	322
	323	ret
	324
	325	ENDFUNC
	326
	327	INTFUNC(dmul4)
	328	// On entry, RDI points to the destination buffer; RAX and RBX point
	329	// to the packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
	330	// expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
	331	// incoming carry registers c0, c1, and c2; c3 is assumed to be zero.
	332	//
	333	// On exit, we write the low 128 bits of the sum C + U V + X Y to
	334	// [RDI], and update the carry registers with the carry out. The
	335	// registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
	336	// registers are preserved.
	337	endprologue
	338
	339	movdqu xmm4, [rax] // packed U
	340	movdqu xmm5, [rbx] // packed X
	341
		// Word 0. The first `mulacc' of each step initializes the register
		// playing c3 (Z3P = `t'); the second adds to all four. The carry
		// registers' roles rotate by one after each step.
	342	mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15, t
	343	mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
	344	propout xmm6, lo, xmm12, xmm13
	345
		// Word 1.
	346	mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
	347	mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
	348	propout xmm7, lo, xmm13, xmm14
	349
		// Word 2.
	350	mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
	351	mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
	352	propout xmm6, hi, xmm14, xmm15
	353
		// Word 3. Afterwards the carry is back in XMM12, XMM13, XMM14.
	354	mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
	355	mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
	356	propout xmm7, hi, xmm15, xmm12
	357
		// XMM6 holds words 0 and 2, XMM7 holds words 1 and 3: interleave.
	358	punpckldq xmm6, xmm7
	359	movdqu [rdi], xmm6
	360
	361	ret
	362
	363	ENDFUNC
	364
	365	INTFUNC(dmla4)
	366	// On entry, RDI points to the destination buffer, which also
	367	// contains an addend A to accumulate; RAX and RBX point to the
	368	// packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
	369	// expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
	370	// incoming carry registers c0, c1, and c2 representing a carry-in C;
	371	// c3 is assumed to be zero.
	372	//
	373	// On exit, we write the low 128 bits of the sum A + C + U V + X Y to
	374	// [RDI], and update the carry registers with the carry out. The
	375	// registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
	376	// registers are preserved.
	377	endprologue
	378
	379	movdqu xmm4, [rax] // packed U
	380	movdqu xmm5, [rbx] // packed X
	381	carryadd // fold A into the carry; XMM15 = a_3
	382
		// Word 0. XMM15 already holds a_3 from `carryadd', so both
		// products must be added to it: no Z3P here.
	383	mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
	384	mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
	385	propout xmm6, lo, xmm12, xmm13
	386
		// Word 1. From here on the register playing c3 is initialized by
		// the first `mulacc' of each step.
	387	mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
	388	mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
	389	propout xmm7, lo, xmm13, xmm14
	390
		// Word 2.
	391	mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
	392	mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
	393	propout xmm6, hi, xmm14, xmm15
	394
		// Word 3. Afterwards the carry is back in XMM12, XMM13, XMM14.
	395	mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
	396	mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
	397	propout xmm7, hi, xmm15, xmm12
	398
		// XMM6 holds words 0 and 2, XMM7 holds words 1 and 3: interleave.
	399	punpckldq xmm6, xmm7
	400	movdqu [rdi], xmm6
	401
	402	ret
	403
	404	ENDFUNC
	405
	406	INTFUNC(mul4zc)
	407	// On entry, RDI points to the destination buffer; RBX points to a
	408	// packed operand X; and XMM10/XMM11 hold an expanded operand Y.
	409	//
	410	// On exit, we write the low 128 bits of the product X Y to [RDI],
	411	// and set the carry registers XMM12, XMM13, XMM14 to the carry out.
	412	// The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
	413	// general-purpose registers are preserved.
	414	endprologue
	415
	416	movdqu xmm5, [rbx] // packed X
	417
		// Word 0. There is no carry-in, so the product pieces are written
		// straight into the carry registers with `mulcore'.
	418	mulcore xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
	419	propout xmm6, lo, xmm12, xmm13
	420
		// Word 1.
	421	mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
	422	propout xmm7, lo, xmm13, xmm14
	423
		// Word 2.
	424	mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
	425	propout xmm6, hi, xmm14, xmm15
	426
		// Word 3. Afterwards the carry is in XMM12, XMM13, XMM14.
	427	mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
	428	propout xmm7, hi, xmm15, xmm12
	429
		// XMM6 holds words 0 and 2, XMM7 holds words 1 and 3: interleave.
	430	punpckldq xmm6, xmm7
	431	movdqu [rdi], xmm6
	432
	433	ret
	434
	435	ENDFUNC
	436
	437	INTFUNC(mul4)
	438	// On entry, RDI points to the destination buffer; RBX points to a
	439	// packed operand X; XMM10/XMM11 hold an expanded operand Y; and
	440	// XMM12, XMM13, XMM14 hold the incoming carry registers c0, c1, and
	441	// c2, representing a carry-in C; c3 is assumed to be zero.
	442	//
	443	// On exit, we write the low 128 bits of the sum C + X Y to [RDI],
	444	// and update the carry registers with the carry out. The registers
	445	// XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
	446	// general-purpose registers are preserved.
	447	endprologue
	448
	449	movdqu xmm5, [rbx] // packed X
	450
		// Word 0. In every step the register playing c3 is initialized
		// from the product (Z3P = `t') rather than added to.
	451	mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, t
	452	propout xmm6, lo, xmm12, xmm13
	453
		// Word 1.
	454	mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
	455	propout xmm7, lo, xmm13, xmm14
	456
		// Word 2.
	457	mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
	458	propout xmm6, hi, xmm14, xmm15
	459
		// Word 3. Afterwards the carry is back in XMM12, XMM13, XMM14.
	460	mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
	461	propout xmm7, hi, xmm15, xmm12
	462
		// XMM6 holds words 0 and 2, XMM7 holds words 1 and 3: interleave.
	463	punpckldq xmm6, xmm7
	464	movdqu [rdi], xmm6
	465
	466	ret
	467
	468	ENDFUNC
	469
	470	INTFUNC(mla4zc)
	471	// On entry, RDI points to the destination buffer, which also
	472	// contains an addend A to accumulate; RBX points to a packed operand
	473	// X; and XMM10/XMM11 hold an expanded operand Y.
	474	//
	475	// On exit, we write the low 128 bits of the sum A + X Y to [RDI],
	476	// and set the carry registers XMM12, XMM13, XMM14 to the carry out.
	477	// The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
	478	// general-purpose registers are preserved.
	479	endprologue
	480
	481	movdqu xmm5, [rbx] // packed X
		// There is no carry-in, so the addend's words become the initial
		// carry registers directly: c_i = (a_i, 0).
	482	movd xmm12, [rdi + 0]
	483	movd xmm13, [rdi + 4]
	484	movd xmm14, [rdi + 8]
	485	movd xmm15, [rdi + 12]
	486
		// Word 0. XMM15 holds a_3, so the product must be added to it: no
		// Z3P here.
	487	mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
	488	propout xmm6, lo, xmm12, xmm13
	489
		// Word 1.
	490	mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
	491	propout xmm7, lo, xmm13, xmm14
	492
		// Word 2.
	493	mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
	494	propout xmm6, hi, xmm14, xmm15
	495
		// Word 3. Afterwards the carry is in XMM12, XMM13, XMM14.
	496	mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
	497	propout xmm7, hi, xmm15, xmm12
	498
		// XMM6 holds words 0 and 2, XMM7 holds words 1 and 3: interleave.
	499	punpckldq xmm6, xmm7
	500	movdqu [rdi], xmm6
	501
	502	ret
	503
	504	ENDFUNC
	505
	506	INTFUNC(mla4)
	507	// On entry, RDI points to the destination buffer, which also
	508	// contains an addend A to accumulate; RBX points to a packed operand
	509	// X; XMM10/XMM11 holds an expanded operand Y; and XMM12, XMM13,
	510	// XMM14 hold the incoming carry registers c0, c1, and c2,
	511	// representing a carry-in C; c3 is assumed to be zero.
	512	//
	513	// On exit, we write the low 128 bits of the sum A + C + X Y to
	514	// [RDI], and update the carry registers with the carry out. The
	515	// registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
	516	// general-purpose registers are preserved.
	517	endprologue
	518
	519	movdqu xmm5, [rbx] // packed X
	520	carryadd // fold A into the carry; XMM15 = a_3
	521
		// Word 0. XMM15 already holds a_3 from `carryadd', so the product
		// must be added to it: no Z3P here.
	522	mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
	523	propout xmm6, lo, xmm12, xmm13
	524
		// Word 1.
	525	mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
	526	propout xmm7, lo, xmm13, xmm14
	527
		// Word 2.
	528	mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
	529	propout xmm6, hi, xmm14, xmm15
	530
		// Word 3. Afterwards the carry is back in XMM12, XMM13, XMM14.
	531	mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
	532	propout xmm7, hi, xmm15, xmm12
	533
		// XMM6 holds words 0 and 2, XMM7 holds words 1 and 3: interleave.
	534	punpckldq xmm6, xmm7
	535	movdqu [rdi], xmm6
	536
	537	ret
	538
	539	ENDFUNC
	540
	541	INTFUNC(mmul4)
	542	// On entry, RDI points to the destination buffer; RAX and RBX point
	543	// to the packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold
	544	// the expanded operands V and M. The stack pointer must be 8 modulo 16
	545	// (as usual for AMD64 ABIs).
	546	//
	547	// On exit, we store Y = U V M mod B in XMM10/XMM11, and write the
	548	// low 128 bits of the sum U V + N Y to [RDI], leaving the remaining
	549	// carry in XMM12, XMM13, and XMM14. The registers XMM0--XMM7, and
	550	// XMM15 are clobbered; the general-purpose registers are preserved.
		//
		// This entry point only performs the first multiplication step and
		// then jumps to label 5 inside `mmla4', which does the rest of the
		// work, releases the stack space allocated here, and returns.
	551	movdqu xmm4, [rax]
	552	#if ABI_WIN
	553	stalloc 48 + 8 // space for the carries
	554	#endif
	555	endprologue
	556
	557	// Calculate W = U V, and leave it in XMM7. Stash the carry pieces
	558	// for later.
		// There is no addend, so the first product is written straight
		// into the carry registers.
	559	mulcore xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
	560	propout xmm7, lo, xmm12, xmm13
	561	jmp 5f // join the common code in mmla4
	562
	563	ENDFUNC
	564
	565	INTFUNC(mmla4)
	566	// On entry, RDI points to the destination buffer, which also
	567	// contains an addend A to accumulate; RAX and RBX point to the
	568	// packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold the
	569	// expanded operands V and M. The stack pointer must be 8 modulo 16
	570	// (as usual for AMD64 ABIs).
	571	//
	572	// On exit, we store Y = (A + U V) M mod B in XMM10/XMM11, and write
	573	// the low 128 bits of the sum A + U V + N Y to [RDI], leaving the
	574	// remaining carry in XMM12, XMM13, and XMM14. The registers
	575	// XMM0--XMM7, and XMM15 are clobbered; the general-purpose registers
	576	// are preserved.
		//
		// `mmul4' jumps into this function at label 5, so everything from
		// that label onwards is shared between the two entry points and
		// must not assume that the addend was loaded.
	577	movdqu xmm4, [rax]
	578	#if ABI_WIN
	579	stalloc 48 + 8 // space for the carries
	580	# define STKTMP(i) [rsp + i]
	581	#endif
	582	#if ABI_SYSV
	583	# define STKTMP(i) [rsp + i - 48 - 8] // use red zone
	584	#endif
	585	endprologue
	586
		// There is no carry-in, so the addend's words become the initial
		// carry registers directly: c_i = (a_i, 0).
	587	movd xmm12, [rdi + 0]
	588	movd xmm13, [rdi + 4]
	589	movd xmm14, [rdi + 8]
	590	movd xmm15, [rdi + 12]
	591
	592	// Calculate W = U V, and leave it in XMM7. Stash the carry pieces
	593	// for later.
	594	mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
	595	propout xmm7, lo, xmm12, xmm13
	596
		// Common code, shared with `mmul4'.
	597	5: mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
	598	propout xmm6, lo, xmm13, xmm14
	599
	600	mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
	601	propout xmm7, hi, xmm14, xmm15
	602
	603	mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
	604	propout xmm6, hi, xmm15, xmm12
	605
	606	// Prepare W, and stash carries for later.
	607	punpckldq xmm7, xmm6
	608	movdqa STKTMP( 0), xmm12
	609	movdqa STKTMP(16), xmm13
	610	movdqa STKTMP(32), xmm14
	611
	612	// Calculate Y = W M. We just about have enough spare registers to
	613	// make this work.
		// Only the low four words of the product are wanted, so each
		// successive `mulcore' computes one fewer piece.
	614	mulcore xmm7, 0, xmm10, xmm11, xmm3, xmm4, xmm5, xmm6
	615
	616	// Start expanding W back into the main carry registers...
	617	pxor xmm15, xmm15
	618	movdqa xmm12, xmm7
	619	movdqa xmm14, xmm7
	620
	621	mulcore xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2
	622	accum xmm4, xmm5, xmm6
	623
	624	punpckldq xmm12, xmm15 // (w_0, 0, w_1, 0)
	625	punpckhdq xmm14, xmm15 // (w_2, 0, w_3, 0)
	626
	627	mulcore xmm7, 2, xmm10, xmm11, xmm0, xmm1
	628	accum xmm5, xmm6
	629
		// XMM2 stays zero from here until `expand' below uses it.
	630	pxor xmm2, xmm2
	631	movdqa xmm13, xmm12
	632	movdqa xmm15, xmm14
	633
	634	mulcore xmm7, 3, xmm10, xmm11, xmm0
	635	accum xmm6
	636
	637	punpckldq xmm12, xmm2 // (w_0, 0, 0, 0)
	638	punpckldq xmm14, xmm2 // (w_2, 0, 0, 0)
	639	punpckhdq xmm13, xmm2 // (w_1, 0, 0, 0)
	640	punpckhdq xmm15, xmm2 // (w_3, 0, 0, 0)
	641
	642	// That's lots of pieces. Now we have to assemble the answer.
	643	squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
	644
	645	// Expand it.
	646	movdqu xmm5, [rbx]
	647	expand xmm2, xmm10, xmm11
	648
	649	// Finish the calculation by adding the Montgomery product.
		// XMM15 holds w_3, so the first product must be added to it: no
		// Z3P on the first step.
	650	mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
	651	propout xmm6, lo, xmm12, xmm13
	652
	653	mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
	654	propout xmm7, lo, xmm13, xmm14
	655
	656	mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
	657	propout xmm6, hi, xmm14, xmm15
	658
	659	mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
	660	propout xmm7, hi, xmm15, xmm12
	661
	662	punpckldq xmm6, xmm7
	663
	664	// Add on the carry we calculated earlier.
	665	paddq xmm12, STKTMP( 0)
	666	paddq xmm13, STKTMP(16)
	667	paddq xmm14, STKTMP(32)
	668
	669	// And, with that, we're done.
	670	movdqu [rdi], xmm6
	671	#if ABI_WIN
	672	stfree 48 + 8 // matches the `stalloc' in both entry points
	673	#endif
	674	ret
	675
	676	#undef STKTMP
	677
	678	ENDFUNC
	679
	680	INTFUNC(mont4)
	681	// On entry, RDI points to the destination buffer holding a packed
	682	// value W; RBX points to a packed operand N; and XMM8/XMM9 hold an
	683	// expanded operand M.
	684	//
	685	// On exit, we store Y = W M mod B in XMM10/XMM11, and write the low
	686	// 128 bits of the sum W + N Y to [RDI], leaving the remaining carry
	687	// in XMM12, XMM13, and XMM14. The registers XMM0--XMM7, and XMM15
	688	// are clobbered (XMM4 is used as scratch while calculating Y); the
	689	// general-purpose registers are preserved.
	690	endprologue
	691
	692	movdqu xmm7, [rdi]
	693
	694	// Calculate Y = W M. Avoid the standard carry registers, because
	695	// we're setting something else up there.
		// Only the low four words of the product are wanted, so each
		// successive `mulcore' computes one fewer piece.
	696	mulcore xmm7, 0, xmm8, xmm9, xmm3, xmm4, xmm5, xmm6
	697
	698	// Start expanding W back into the main carry registers...
	699	pxor xmm15, xmm15
	700	movdqa xmm12, xmm7
	701	movdqa xmm14, xmm7
	702
	703	mulcore xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2
	704	accum xmm4, xmm5, xmm6
	705
	706	punpckldq xmm12, xmm15 // (w_0, 0, w_1, 0)
	707	punpckhdq xmm14, xmm15 // (w_2, 0, w_3, 0)
	708
	709	mulcore xmm7, 2, xmm8, xmm9, xmm0, xmm1
	710	accum xmm5, xmm6
	711
		// XMM2 stays zero from here until `expand' below uses it.
	712	pxor xmm2, xmm2
	713	movdqa xmm13, xmm12
	714	movdqa xmm15, xmm14
	715
	716	mulcore xmm7, 3, xmm8, xmm9, xmm0
	717	accum xmm6
	718
	719	punpckldq xmm12, xmm2 // (w_0, 0, 0, 0)
	720	punpckldq xmm14, xmm2 // (w_2, 0, 0, 0)
	721	punpckhdq xmm13, xmm2 // (w_1, 0, 0, 0)
	722	punpckhdq xmm15, xmm2 // (w_3, 0, 0, 0)
	723
	724	// That's lots of pieces. Now we have to assemble the answer.
	725	squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
	726
	727	// Expand it.
	728	movdqu xmm5, [rbx]
	729	expand xmm2, xmm10, xmm11
	730
	731	// Finish the calculation by adding the Montgomery product.
		// XMM15 holds w_3, so the first product must be added to it: no
		// Z3P on the first step.
	732	mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
	733	propout xmm6, lo, xmm12, xmm13
	734
	735	mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
	736	propout xmm7, lo, xmm13, xmm14
	737
	738	mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
	739	propout xmm6, hi, xmm14, xmm15
	740
	741	mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
	742	propout xmm7, hi, xmm15, xmm12
	743
	744	punpckldq xmm6, xmm7
	745
	746	// And, with that, we're done.
	747	movdqu [rdi], xmm6
	748	ret
	749
	750	ENDFUNC
	751
	752	///--------------------------------------------------------------------------
	753	/// Bulk multipliers.
	754
	755	FUNC(mpx_umul4_amd64_sse2)
	756	// void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
	757	// const mpw *bv, const mpw *bvl);
		//
		// Multiply the operand at [av, avl) by the operand at [bv, bvl) and
		// write the product at dv. Both operands are processed 16 bytes at
		// a time, and the loops below make no provision for a shorter
		// tail.
	758
	759	// Establish the arguments and do initial setup.
	760	//
	761	//                      sysv    win
	762	// inner loop dv        rdi     rdi*
	763	// inner loop av        rbx*    rbx*
	764	// outer loop dv        r10     rcx
	765	// outer loop bv        rcx     r9
	766	// av base              rsi     rdx
	767	// av limit             rdx     r8
	768	// bv limit             r8      r10
		//
		// Registers marked `*' are saved in the prologue and restored in the
		// epilogue below.
	769
	770	#if ABI_SYSV
	771	# define DV r10
	772	# define AV rsi
	773	# define AVL rdx
	774	# define BV rcx
	775	# define BVL r8
	776
	777	pushreg rbx
	778	endprologue
	779
	780	mov DV, rdi
	781
	782	#endif
	783
	784	#if ABI_WIN
	785	# define DV rcx
	786	# define AV rdx
	787	# define AVL r8
	788	# define BV r9
	789	# define BVL r10
	790
	791	pushreg rbx
	792	pushreg rdi
	793	stalloc 160 + 8
	794
		// XMM6--XMM15 are saved here and restored in the epilogue.
	795	savexmm xmm6, 0
	796	savexmm xmm7, 16
	797	savexmm xmm8, 32
	798	savexmm xmm9, 48
	799	savexmm xmm10, 64
	800	savexmm xmm11, 80
	801	savexmm xmm12, 96
	802	savexmm xmm13, 112
	803	savexmm xmm14, 128
	804	savexmm xmm15, 144
	805
	806	endprologue
	807
	808	mov rdi, DV
		// The fifth argument is on the stack: 168 bytes of locals, two
		// pushed registers, the return address, and 32 bytes of shadow
		// space lie below it, making 224.
	809	mov BVL, [rsp + 224]
	810
	811	#endif
	812
	813	// Prepare for the first iteration.
	814	pxor xmm0, xmm0
	815	movdqu xmm10, [BV] // bv[0]
	816	mov rbx, AV
	817	add DV, 16
	818	add BV, 16
	819	expand xmm0, xmm10, xmm11
		// The first pass has neither an addend nor a carry-in.
	820	call mul4zc
	821	add rbx, 16
	822	add rdi, 16
	823	cmp rbx, AVL // all done?
	824	jae 8f
	825
	826	.p2align 4
	827	// Continue with the first iteration.
	828	0: call mul4
	829	add rbx, 16
	830	add rdi, 16
	831	cmp rbx, AVL // all done?
	832	jb 0b
	833
	834	// Write out the leftover carry. There can be no tail here.
	835	8: call carryprop
	836	cmp BV, BVL // more passes to do?
	837	jae 9f
	838
	839	.p2align 4
	840	// Set up for the next pass.
	841	1: movdqu xmm10, [BV] // bv[i]
	842	mov rdi, DV // -> dv[i]
	843	pxor xmm0, xmm0
	844	expand xmm0, xmm10, xmm11
	845	mov rbx, AV // -> av[0]
	846	add DV, 16
	847	add BV, 16
		// Later passes add to what is already in the destination.
	848	call mla4zc
	849	add rbx, 16
	850	add rdi, 16
	851	cmp rbx, AVL // done yet?
	852	jae 8f
	853
	854	.p2align 4
	855	// Continue...
	856	0: call mla4
	857	add rbx, 16
	858	add rdi, 16
	859	cmp rbx, AVL
	860	jb 0b
	861
	862	// Finish off this pass. There was no tail on the previous pass, and
	863	// there can be none on this pass.
	864	8: call carryprop
	865	cmp BV, BVL
	866	jb 1b
	867
	868	// All over.
	869	9:
	870
	871	#if ABI_SYSV
	872	popreg rbx
	873	#endif
	874
	875	#if ABI_WIN
	876
	877	rstrxmm xmm6, 0
	878	rstrxmm xmm7, 16
	879	rstrxmm xmm8, 32
	880	rstrxmm xmm9, 48
	881	rstrxmm xmm10, 64
	882	rstrxmm xmm11, 80
	883	rstrxmm xmm12, 96
	884	rstrxmm xmm13, 112
	885	rstrxmm xmm14, 128
	886	rstrxmm xmm15, 144
	887
	888	stfree 160 + 8
	889	popreg rdi
	890	popreg rbx
	891
	892	#endif
	893
	894	ret
	895
	896	#undef DV
	897	#undef AV
	898	#undef AVL
	899	#undef BV
	900	#undef BVL
	901
	902	ENDFUNC
	903
	904	FUNC(mpxmont_mul4_amd64_sse2)
	905	// void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
	906	// const mpw *nv, size_t n, const mpw *mi);
	907	// Structure: one outer pass per 16 bytes of bv; inner loops step av, nv, dv.
	908	// Establish the arguments and do initial setup. The table gives each
	909	// role's register per ABI; `*' marks registers the prologue pushes.
	910	// sysv win
	911	// inner loop dv rdi rdi*
	912	// inner loop av rax rax
	913	// inner loop nv rbx* rbx*
	914	// mi r9 r10
	915	// outer loop dv r10 rcx
	916	// outer loop bv rdx r8
	917	// av base rsi rdx
	918	// av limit r11 r11
	919	// bv limit r8 r12*
	920	// nv base rcx r9
	921	// n r8 r12*
	922
	923	#if ABI_SYSV
	924	# define DV r10
	925	# define AV rsi
	926	# define AVL r11
	927	# define BV rdx
	928	# define BVL r8
	929	# define NV rcx
	930	# define N r8
	931	# define MI r9
	932
	933	pushreg rbx // rbx carries the inner-loop nv pointer
	934	endprologue
	935
	936	mov DV, rdi // rdi stays the inner-loop dv; DV tracks the outer loop
	937
	938	#endif
	939
	940	#if ABI_WIN
	941	# define DV rcx
	942	# define AV rdx
	943	# define AVL r11
	944	# define BV r8
	945	# define BVL r12
	946	# define NV r9
	947	# define N r12
	948	# define MI r10
	949
	950	pushreg rbx
	951	pushreg rdi
	952	pushreg r12
	953	stalloc 160 // ten save slots of 16 bytes for xmm6--xmm15
	954
	955	savexmm xmm6, 0
	956	savexmm xmm7, 16
	957	savexmm xmm8, 32
	958	savexmm xmm9, 48
	959	savexmm xmm10, 64
	960	savexmm xmm11, 80
	961	savexmm xmm12, 96
	962	savexmm xmm13, 112
	963	savexmm xmm14, 128
	964	savexmm xmm15, 144
	965
	966	endprologue
	967
	968	mov rdi, DV // inner-loop dv starts at the dv argument
	969	mov N, [rsp + 224] // 5th arg: 8 (return) + 32 (shadow) + 24 (pushes) + 160
	970	mov MI, [rsp + 232] // 6th arg: the next stack slot
	971
	972	#endif
	973
	974	// Establish the expanded operands.
	975	pxor xmm0, xmm0
	976	movdqu xmm8, [BV] // bv[0]
	977	movdqu xmm10, [MI] // mi
	978	expand xmm0, xmm8, xmm9, xmm10, xmm11
	979
	980	// Set up the outer loop state and prepare for the first iteration.
	981	mov rax, AV // -> U = av[0]
	982	mov rbx, NV // -> X = nv[0]
	983	lea AVL, [AV + 4*N] // -> av[n/4] = av limit
	984	lea BVL, [BV + 4*N] // -> bv[n/4] = bv limit (BVL shares N's register: N is dead now)
	985	add BV, 16 // outer pointers already address the next pass
	986	add DV, 16
	987	call mmul4
	988	add rdi, 16
	989	add rax, 16
	990	add rbx, 16
	991	cmp rax, AVL // done already?
	992	jae 8f
	993
	994	.p2align 4
	995	// Complete the first inner loop.
	996	0: call dmul4
	997	add rdi, 16
	998	add rax, 16
	999	add rbx, 16
	1000	cmp rax, AVL // done yet?
	1001	jb 0b
	1002
	1003	// Still have carries left to propagate.
	1004	call carryprop
	1005	movd [rdi + 16], xmm12 // low 32 bits of xmm12, one block beyond rdi
	1006
	1007	.p2align 4
	1008	// Embark on the next iteration. (There must be one. If n = 1, then
	1009	// we would have bailed above, to label 8. Similarly, the subsequent
	1010	// iterations can fall into the inner loop immediately.)
	1011	1: pxor xmm0, xmm0
	1012	movdqu xmm8, [BV] // bv[i]
	1013	movdqu xmm10, [MI] // mi
	1014	mov rdi, DV // -> Z = dv[i]
	1015	mov rax, AV // -> U = av[0]
	1016	mov rbx, NV // -> X = nv[0]
	1017	expand xmm0, xmm8, xmm9, xmm10, xmm11
	1018	add BV, 16
	1019	add DV, 16
	1020	call mmla4
	1021	add rdi, 16
	1022	add rax, 16
	1023	add rbx, 16
	1024
	1025	.p2align 4
	1026	// Complete the next inner loop.
	1027	0: call dmla4
	1028	add rdi, 16
	1029	add rax, 16
	1030	add rbx, 16
	1031	cmp rax, AVL // end of av reached?
	1032	jb 0b
	1033
	1034	// Still have carries left to propagate, and they overlap the
	1035	// previous iteration's final tail, so read that in and add it.
	1036	movd xmm0, [rdi]
	1037	paddq xmm12, xmm0
	1038	call carryprop
	1039	movd [rdi + 16], xmm12 // low 32 bits of xmm12, one block beyond rdi
	1040
	1041	// Back again, maybe.
	1042	cmp BV, BVL // more of bv left?
	1043	jb 1b
	1044
	1045	// All done.
	1046	9:
	1047
	1048	#if ABI_SYSV
	1049	popreg rbx
	1050	#endif
	1051
	1052	#if ABI_WIN
	1053
	1054	rstrxmm xmm6, 0
	1055	rstrxmm xmm7, 16
	1056	rstrxmm xmm8, 32
	1057	rstrxmm xmm9, 48
	1058	rstrxmm xmm10, 64
	1059	rstrxmm xmm11, 80
	1060	rstrxmm xmm12, 96
	1061	rstrxmm xmm13, 112
	1062	rstrxmm xmm14, 128
	1063	rstrxmm xmm15, 144
	1064
	1065	stfree 160
	1066	popreg r12 // pops mirror the prologue pushes in reverse
	1067	popreg rdi
	1068	popreg rbx
	1069
	1070	#endif
	1071
	1072	ret
	1073
	1074	// First iteration was short. Write out the carries and we're done.
	1075	// (This could be folded into the main loop structure, but that would
	1076	// penalize small numbers more.)
	1077	8: call carryprop
	1078	movd [rdi + 16], xmm12
	1079	#if ABI_SYSV
	1080	popreg rbx // the sysv epilogue is short enough to repeat here
	1081	ret
	1082	#endif
	1083	#if ABI_WIN
	1084	jmp 9b // reuse the register-restoring epilogue at label 9
	1085	#endif
	1086
	1087	#undef DV
	1088	#undef AV
	1089	#undef AVL
	1090	#undef BV
	1091	#undef BVL
	1092	#undef NV
	1093	#undef N
	1094	#undef MI
	1095
	1096	ENDFUNC
	1097
	1098	FUNC(mpxmont_redc4_amd64_sse2)
	1099	// void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
	1100	// size_t n, const mpw *mi);
	1101	// Works on dv in place; dvl is its limit and n counts the 4-byte words of nv.
	1102	// Establish the arguments and do initial setup. The table gives each
	1103	// role's register per ABI; `*' marks registers the prologue pushes.
	1104	// sysv win
	1105	// inner loop dv rdi rdi*
	1106	// dv limit rax rax
	1107	// blocks-of-4 dv limit rsi rdx
	1108	// inner loop nv rbx* rbx*
	1109	// mi r8 r10
	1110	// outer loop dv r10 rcx
	1111	// outer loop dv limit r11 r11
	1112	// nv base rdx r8
	1113	// nv limit r9 r12*
	1114	// n rcx r9
	1115	// c rcx r9
	1116
	1117	#if ABI_SYSV
	1118
	1119	# define DVL rax
	1120	# define DVL4 rsi
	1121	# define MI r8
	1122	# define DV r10
	1123	# define DVLO r11
	1124	# define NV rdx
	1125	# define NVL r9
	1126	# define N rcx
	1127	# define C ecx
	1128
	1129	pushreg rbx // rbx carries the inner-loop nv pointer
	1130	endprologue
	1131
	1132	mov DV, rdi // rdi stays the inner-loop dv; DV tracks the outer loop
	1133
	1134	#endif
	1135
	1136	#if ABI_WIN
	1137
	1138	# define DVL rax
	1139	# define DVL4 rdx
	1140	# define MI r10
	1141	# define DV rcx
	1142	# define DVLO r11
	1143	# define NV r8
	1144	# define NVL r12
	1145	# define N r9
	1146	# define C r9d
	1147
	1148	pushreg rbx
	1149	pushreg rdi
	1150	pushreg r12
	1151	stalloc 160 // ten save slots of 16 bytes for xmm6--xmm15
	1152
	1153	savexmm xmm6, 0
	1154	savexmm xmm7, 16
	1155	savexmm xmm8, 32
	1156	savexmm xmm9, 48
	1157	savexmm xmm10, 64
	1158	savexmm xmm11, 80
	1159	savexmm xmm12, 96
	1160	savexmm xmm13, 112
	1161	savexmm xmm14, 128
	1162	savexmm xmm15, 144
	1163
	1164	endprologue
	1165
	1166	mov rdi, DV // inner-loop dv starts at the dv argument
	1167	mov MI, [rsp + 224] // 5th arg: 8 (return) + 32 (shadow) + 24 (pushes) + 160
	1168
	1169	#endif
	1170
	1171	// Establish the expanded operands and the blocks-of-4 dv limit.
	1172	pxor xmm0, xmm0
	1173	mov DVL, DVL4 // -> dv[n] = dv limit
	1174	sub DVL4, DV // length of dv in bytes
	1175	movdqu xmm8, [MI] // mi
	1176	and DVL4, ~15 // mask off the tail end
	1177	expand xmm0, xmm8, xmm9
	1178	add DVL4, DV // find limit
	1179
	1180	// Set up the outer loop state and prepare for the first iteration.
	1181	mov rbx, NV // -> X = nv[0]
	1182	lea DVLO, [DV + 4*N] // -> dv[n/4] = outer dv limit
	1183	lea NVL, [NV + 4*N] // -> nv[n/4] = nv limit (last use of N; C reuses its register)
	1184	add DV, 16
	1185	call mont4
	1186	add rbx, 16
	1187	add rdi, 16
	1188	cmp rbx, NVL // done already?
	1189	jae 8f
	1190
	1191	.p2align 4
	1192	// Complete the first inner loop.
	1193	5: call mla4
	1194	add rbx, 16
	1195	add rdi, 16
	1196	cmp rbx, NVL // done yet?
	1197	jb 5b
	1198
	1199	// Still have carries left to propagate.
	1200	8: carryadd
	1201	psllq xmm15, 16
	1202	pslldq xmm15, 8
	1203	paddq xmm14, xmm15
	1204	call carryprop
	1205	movd C, xmm12 // C = low 32 bits of xmm12: the carry to ripple onwards
	1206	add rdi, 16
	1207	cmp rdi, DVL4
	1208	jae 7f
	1209
	1210	.p2align 4
	1211	// Continue carry propagation until the end of the buffer.
	1212	0: add [rdi], C
	1213	mov C, 0 // preserves flags
	1214	adcd [rdi + 4], 0
	1215	adcd [rdi + 8], 0
	1216	adcd [rdi + 12], 0
	1217	adc C, 0 // C = carry out of this block, taken before CF is disturbed
	1218	add rdi, 16
	1219	cmp rdi, DVL4
	1220	jb 0b
	1221
	1222	// Deal with the tail end, one word at a time.
	1223	7: add [rdi], C // NOTE(review): also runs once if rdi = DVL on entry; confirm dv always has a tail
	1224	mov C, 0 // preserves flags
	1225	adc C, 0 // take the carry now: the pointer add below overwrites CF
	1226	add rdi, 4
	1227	cmp rdi, DVL
	1228	jb 7b
	1229
	1230	// All done for this iteration. Start the next. (This must have at
	1231	// least one follow-on iteration, or we'd not have started this outer
	1232	// loop.)
	1233	8: mov rdi, DV // -> Z = dv[i]
	1234	mov rbx, NV // -> X = nv[0]
	1235	cmp rdi, DVLO // all done yet?
	1236	jae 9f
	1237	add DV, 16
	1238	call mont4
	1239	add rdi, 16
	1240	add rbx, 16
	1241	jmp 5b
	1242
	1243	// All over.
	1244	9:
	1245
	1246	#if ABI_SYSV
	1247	popreg rbx
	1248	#endif
	1249
	1250	#if ABI_WIN
	1251
	1252	rstrxmm xmm6, 0
	1253	rstrxmm xmm7, 16
	1254	rstrxmm xmm8, 32
	1255	rstrxmm xmm9, 48
	1256	rstrxmm xmm10, 64
	1257	rstrxmm xmm11, 80
	1258	rstrxmm xmm12, 96
	1259	rstrxmm xmm13, 112
	1260	rstrxmm xmm14, 128
	1261	rstrxmm xmm15, 144
	1262
	1263	stfree 160
	1264	popreg r12 // pops mirror the prologue pushes in reverse
	1265	popreg rdi
	1266	popreg rbx
	1267
	1268	#endif
	1269
	1270	ret
	1271
	1272	#undef DVL
	1273	#undef DVL4
	1274	#undef MI
	1275	#undef DV
	1276	#undef DVLO
	1277	#undef NV
	1278	#undef NVL
	1279	#undef N
	1280	#undef C
	1281
	1282	ENDFUNC
	1283
	1284	///--------------------------------------------------------------------------
	1285	/// Testing and performance measurement.
	1286
	1287	#ifdef TEST_MUL4
	1288
	1289	#if ABI_SYSV
	1290	# define ARG0 rdi
	1291	# define ARG1 rsi
	1292	# define ARG2 rdx
	1293	# define ARG3 rcx
	1294	# define ARG4 r8
	1295	# define ARG5 r9
	1296	# define ARG6 STKARG(0) // seventh and later arguments come from the stack
	1297	# define ARG7 STKARG(1)
	1298	# define ARG8 STKARG(2)
	1299	# define STKARG_OFFSET 16 // 8 (return address) + 8 (rbx pushed by testprologue)
	1300	#endif
	1301	#if ABI_WIN
	1302	# define ARG0 rcx
	1303	# define ARG1 rdx
	1304	# define ARG2 r8
	1305	# define ARG3 r9
	1306	# define ARG4 STKARG(0) // fifth and later arguments come from the stack
	1307	# define ARG5 STKARG(1)
	1308	# define ARG6 STKARG(2)
	1309	# define ARG7 STKARG(3)
	1310	# define ARG8 STKARG(4)
	1311	# define STKARG_OFFSET 224 // 8 (return) + 32 (shadow) + 16 (two pushes) + 168 (stalloc)
	1312	#endif
	1313	#define STKARG(i) [rsp + STKARG_OFFSET + 8*(i)] // i-th stack argument; offsets assume testprologue has run
	1314
	1315	// Columns: role, internal register, then the sysv and win sources:
	1316	// (sysv) dmul smul mmul mont (win) dmul smul mmul mont
	1317	// A rax
	1318	// D rdx
	1319	// z rdi rdi rdi rdi rdi rcx rcx rcx rcx
	1320	// c rcx rsi rsi rsi rsi rdx rdx rdx rdx
	1321	// y r10 -- -- rdx rdx -- -- r8 r8
	1322	// u r11 rdx -- rcx -- r8 -- r9 --
	1323	// x rbx rcx rdx r8 rcx r9 r8 stk0 r9
	1324	// vv xmm8/9 r8 -- r9 r8 stk0 -- stk1 stk0
	1325	// yy xmm10/11 r9 rcx stk0 -- stk1 r9 stk2 --
	1326	// n r8 stk0 r8 stk1 r9 stk2 stk0 stk3 stk1
	1327	// cyv r9 stk1 r9 stk2 stk0 stk3 stk1 stk4 stk2
	1328
	1329	.macro cysetup v, n // store a start timestamp in qword slot n - 1 of the array at v
	1330	rdtsc // edx:eax = time-stamp counter
	1331	shl rdx, 32
	1332	or rax, rdx // rax = 64-bit count; rax and rdx are both clobbered
	1333	mov [\v + 8*\n - 8], rax
	1334	.endm
	1335
	1336	.macro cystore v, n // turn slot n - 1 of v from a start time into an elapsed count
	1337	rdtsc // edx:eax = time-stamp counter
	1338	shl rdx, 32
	1339	or rax, rdx // rax = 64-bit count; rax and rdx are both clobbered
	1340	sub rax, [\v + 8*\n - 8] // now minus the start time left by cysetup
	1341	mov [\v + 8*\n - 8], rax
	1342	dec \n // step to the next slot down; testtail branches on the ZF this sets
	1343	.endm
	1344
	1345	.macro testprologue mode // save registers, then shuffle the C arguments into the internal registers tabulated above
	1346	pushreg rbx
	1347	#if ABI_SYSV
	1348	endprologue
	1349	.ifeqs "\mode", "dmul" // args: z, c, u, x, vv, yy in registers; n, cyv on the stack
	1350	mov rbx, rcx // x (taken before rcx is reused for c)
	1351	movdqu xmm8, [r8] // vv
	1352	movdqu xmm10, [r9] // yy
	1353	mov r8d, STKARG(0) // n (32-bit load, zero-extended)
	1354	mov r9, STKARG(1) // cyv
	1355	mov r11, rdx // u
	1356	mov rcx, rsi // c
	1357	.endif
	1358	.ifeqs "\mode", "smul" // args: z, c, x, yy, n, cyv; NOTE(review): n is used from r8 as passed, not reloaded as 32 bits like the other modes -- confirm the prototype
	1359	mov rbx, rdx // x
	1360	movdqu xmm10, [rcx] // yy (read before rcx is reused for c)
	1361	mov rcx, rsi // c
	1362	.endif
	1363	.ifeqs "\mode", "mmul" // args: z, c, y, u, x, vv in registers; yy, n, cyv on the stack
	1364	mov rax, STKARG(0) // -> yy
	1365	mov rbx, r8 // x
	1366	movdqu xmm8, [r9] // vv
	1367	movdqu xmm10, [rax] // yy
	1368	mov r8d, STKARG(1) // n (32-bit load, zero-extended)
	1369	mov r9, STKARG(2) // cyv
	1370	mov r10, rdx // y
	1371	mov r11, rcx // u
	1372	mov rcx, rsi // c
	1373	.endif
	1374	.ifeqs "\mode", "mont" // args: z, c, y, x, vv, n in registers; cyv on the stack
	1375	mov rbx, rcx // x
	1376	movdqu xmm8, [r8] // vv (read before r8 is reused for n)
	1377	mov r8d, r9d // n, zero-extended
	1378	mov r9, STKARG(0) // cyv
	1379	mov r10, rdx // y
	1380	mov rcx, rsi // c
	1381	.endif
	1382	#endif
	1383	#if ABI_WIN
	1384	pushreg rdi
	1385	stalloc 168 // 160 bytes of xmm save slots plus 8 more
	1386	savexmm xmm6, 0
	1387	savexmm xmm7, 16
	1388	savexmm xmm8, 32
	1389	savexmm xmm9, 48
	1390	savexmm xmm10, 64
	1391	savexmm xmm11, 80
	1392	savexmm xmm12, 96
	1393	savexmm xmm13, 112
	1394	savexmm xmm14, 128
	1395	savexmm xmm15, 144
	1396	endprologue
	1397	.ifeqs "\mode", "dmul" // args: z, c, u, x in registers; vv, yy, n, cyv on the stack
	1398	mov r10, STKARG(0) // -> vv
	1399	mov r11, STKARG(1) // -> yy
	1400	mov rdi, rcx // z
	1401	mov rcx, rdx // c
	1402	mov rbx, r9 // x
	1403	movdqu xmm8, [r10] // vv
	1404	movdqu xmm10, [r11] // yy
	1405	mov r11, r8 // u
	1406	mov r8d, STKARG(2) // n (32-bit load, zero-extended)
	1407	mov r9, STKARG(3) // cyv
	1408	.endif
	1409	.ifeqs "\mode", "smul" // args: z, c, x, yy in registers; n, cyv on the stack
	1410	mov rdi, rcx // z
	1411	mov rcx, rdx // c
	1412	mov rbx, r8 // x
	1413	movdqu xmm10, [r9] // yy
	1414	mov r8d, STKARG(0) // n (32-bit load, zero-extended)
	1415	mov r9, STKARG(1) // cyv
	1416	.endif
	1417	.ifeqs "\mode", "mmul" // args: z, c, y, u in registers; x, vv, yy, n, cyv on the stack
	1418	mov r10, STKARG(1) // -> vv
	1419	mov r11, STKARG(2) // -> yy
	1420	mov rdi, rcx // z
	1421	mov rcx, rdx // c
	1422	mov rbx, STKARG(0) // x
	1423	movdqu xmm8, [r10] // vv
	1424	movdqu xmm10, [r11] // yy
	1425	mov r10, r8 // y
	1426	mov r11, r9 // u
	1427	mov r8d, STKARG(3) // n (32-bit load, zero-extended)
	1428	mov r9, STKARG(4) // cyv
	1429	.endif
	1430	.ifeqs "\mode", "mont" // args: z, c, y, x in registers; vv, n, cyv on the stack
	1431	mov r10, STKARG(0) // -> vv
	1432	mov rdi, rcx // z
	1433	mov rcx, rdx // c
	1434	mov rbx, r9 // x
	1435	movdqu xmm8, [r10] // vv
	1436	mov r10, r8 // y
	1437	mov r8d, STKARG(1) // n (32-bit load, zero-extended)
	1438	mov r9, STKARG(2) // cyv
	1439	.endif
	1440	#endif
	1441
	1442	pxor xmm0, xmm0 // zero register handed to expand as its first operand
	1443	.ifeqs "\mode", "dmul"
	1444	expand xmm0, xmm8, xmm9, xmm10, xmm11
	1445	.endif
	1446	.ifeqs "\mode", "smul"
	1447	expand xmm0, xmm10, xmm11
	1448	.endif
	1449	.ifeqs "\mode", "mmul"
	1450	expand xmm0, xmm8, xmm9, xmm10, xmm11
	1451	.endif
	1452	.ifeqs "\mode", "mont"
	1453	expand xmm0, xmm8, xmm9
	1454	.endif
	1455	.endm
	1456
	1457	.macro testepilogue // undo testprologue in reverse order and return
	1458	#if ABI_WIN
	1459	rstrxmm xmm6, 0
	1460	rstrxmm xmm7, 16
	1461	rstrxmm xmm8, 32
	1462	rstrxmm xmm9, 48
	1463	rstrxmm xmm10, 64
	1464	rstrxmm xmm11, 80
	1465	rstrxmm xmm12, 96
	1466	rstrxmm xmm13, 112
	1467	rstrxmm xmm14, 128
	1468	rstrxmm xmm15, 144
	1469	stfree 168 // matches the stalloc 168 in testprologue
	1470	popreg rdi
	1471	#endif
	1472	popreg rbx // pushed first by testprologue under both ABIs, so popped last
	1473	ret
	1474	.endm
	1475
	1476	.macro testldcarry // load xmm12--xmm14 from the 48-byte buffer at rcx (c)
	1477	movdqu xmm12, [rcx + 0] // (c'_0, c''_0)
	1478	movdqu xmm13, [rcx + 16] // (c'_1, c''_1)
	1479	movdqu xmm14, [rcx + 32] // (c'_2, c''_2)
	1480	.endm
	1481
	1482	.macro testtop u=nil // head of the timing loop; u names the register to copy into rax, if any
	1483	.p2align 4
	1484	0:
	1485	cysetup r9, r8 // start timestamp into cyv[n - 1]
	1486	.ifnes "\u", "nil"
	1487	mov rax, \u // set after cysetup, since rdtsc overwrites rax
	1488	.endif
	1489	.endm
	1490
	1491	.macro testtail // foot of the timing loop opened by testtop
	1492	cystore r9, r8 // elapsed count into cyv[n - 1], then n = n - 1
	1493	jnz 0b // loop until n reaches zero (flags come from the dec in cystore)
	1494	.endm
	1495
	1496	.macro testcarryout // store xmm12--xmm14 back to the 48-byte buffer at rcx (c)
	1497	movdqu [rcx + 0], xmm12
	1498	movdqu [rcx + 16], xmm13
	1499	movdqu [rcx + 32], xmm14
	1500	.endm
	1501
	1502	FUNC(test_dmul4) // timing harness: calls dmul4 n times, recording a cycle count per call
	1503	testprologue dmul
	1504	testldcarry // carry registers in from the buffer at rcx
	1505	testtop r11 // loop head: timestamp, then rax = u
	1506	call dmul4
	1507	testtail // record the elapsed count; repeat until n is zero
	1508	testcarryout // carry registers back out through rcx
	1509	testepilogue
	1510	ENDFUNC
	1511
	1512	FUNC(test_dmla4) // timing harness: calls dmla4 n times, recording a cycle count per call
	1513	testprologue dmul // same argument layout as test_dmul4
	1514	testldcarry // carry registers in from the buffer at rcx
	1515	testtop r11 // loop head: timestamp, then rax = u
	1516	call dmla4
	1517	testtail // record the elapsed count; repeat until n is zero
	1518	testcarryout // carry registers back out through rcx
	1519	testepilogue
	1520	ENDFUNC
	1521
	1522	FUNC(test_mul4) // timing harness: calls mul4 n times, recording a cycle count per call
	1523	testprologue smul
	1524	testldcarry // carry registers in from the buffer at rcx
	1525	testtop nil // loop head: timestamp only, rax is not loaded
	1526	call mul4
	1527	testtail // record the elapsed count; repeat until n is zero
	1528	testcarryout // carry registers back out through rcx
	1529	testepilogue
	1530	ENDFUNC
	1531
	1532	FUNC(test_mla4) // timing harness: calls mla4 n times, recording a cycle count per call
	1533	testprologue smul // same argument layout as test_mul4
	1534	testldcarry // carry registers in from the buffer at rcx
	1535	testtop nil // loop head: timestamp only, rax is not loaded
	1536	call mla4
	1537	testtail // record the elapsed count; repeat until n is zero
	1538	testcarryout // carry registers back out through rcx
	1539	testepilogue
	1540	ENDFUNC
	1541
	1542	FUNC(test_mmul4) // timing harness: calls mmul4 n times, recording a cycle count per call
	1543	testprologue mmul // no testldcarry here: xmm12--xmm14 are only written out afterwards
	1544	testtop r11 // loop head: timestamp, then rax = u
	1545	call mmul4
	1546	testtail // record the elapsed count; repeat until n is zero
	1547	movdqu [r10 + 0], xmm10 // hand xmm10/xmm11 (the yy pair) back through y
	1548	movdqu [r10 + 16], xmm11
	1549	testcarryout // carry registers out through rcx
	1550	testepilogue
	1551	ENDFUNC
	1552
	1553	FUNC(test_mmla4) // timing harness: calls mmla4 n times, recording a cycle count per call
	1554	testprologue mmul // same argument layout as test_mmul4; no carry-in load
	1555	testtop r11 // loop head: timestamp, then rax = u
	1556	call mmla4
	1557	testtail // record the elapsed count; repeat until n is zero
	1558	movdqu [r10 + 0], xmm10 // hand xmm10/xmm11 (the yy pair) back through y
	1559	movdqu [r10 + 16], xmm11
	1560	testcarryout // carry registers out through rcx
	1561	testepilogue
	1562	ENDFUNC
	1563
	1564	FUNC(test_mont4) // timing harness: calls mont4 n times, recording a cycle count per call
	1565	testprologue mont // loads vv only; this mode has no yy input and no carry-in load
	1566	testtop // default u=nil: timestamp only, rax is not loaded
	1567	call mont4
	1568	testtail // record the elapsed count; repeat until n is zero
	1569	movdqu [r10 + 0], xmm10 // hand xmm10/xmm11 back through y
	1570	movdqu [r10 + 16], xmm11
	1571	testcarryout // carry registers out through rcx
	1572	testepilogue
	1573	ENDFUNC
	1574
	1575	#endif
	1576
	1577	///----- That's all, folks --------------------------------------------------