mdw@git.distorted.org.uk Git - catacomb/blame_incremental

... / ...

Commit	Line	Data
	1	/// -*- mode: asm; asm-comment-char: ?/ -*-
	2	///
	3	/// Large SIMD-based multiplications
	4	///
	5	/// (c) 2019 Straylight/Edgeware
	6	///
	7
	8	///----- Licensing notice ---------------------------------------------------
	9	///
	10	/// This file is part of Catacomb.
	11	///
	12	/// Catacomb is free software: you can redistribute it and/or modify it
	13	/// under the terms of the GNU Library General Public License as published
	14	/// by the Free Software Foundation; either version 2 of the License, or
	15	/// (at your option) any later version.
	16	///
	17	/// Catacomb is distributed in the hope that it will be useful, but
	18	/// WITHOUT ANY WARRANTY; without even the implied warranty of
	19	/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	20	/// Library General Public License for more details.
	21	///
	22	/// You should have received a copy of the GNU Library General Public
	23	/// License along with Catacomb. If not, write to the Free Software
	24	/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
	25	/// USA.
	26
	27	///--------------------------------------------------------------------------
	28	/// Preliminaries.
	29
	30	#include "config.h"
	31	#include "asm-common.h"
	32
	33	.text
	34
	35	///--------------------------------------------------------------------------
	36	/// Theory.
	37	///
	38	/// We define a number of primitive fixed-size multipliers from which we can
	39	/// construct more general variable-length multipliers.
	40	///
	41	/// The basic trick is the same throughout. In an operand-scanning
	42	/// multiplication, the inner multiplication loop multiplies a multiple-
	43	/// precision operand by a single precision factor, and adds the result,
	44	/// appropriately shifted, to the result. A `finely integrated operand
	45	/// scanning' implementation of Montgomery multiplication also adds the
	46	/// product of a single-precision `Montgomery factor' and the modulus,
	47	/// calculated in the same pass. The more common `coarsely integrated
	48	/// operand scanning' alternates main multiplication and Montgomery passes,
	49	/// which requires additional carry propagation.
	50	///
	51	/// Throughout both plain-multiplication and Montgomery stages, then, one of
	52	/// the factors remains constant throughout the operation, so we can afford
	53	/// to take a little time to preprocess it. The transformation we perform is
	54	/// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
	55	/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
	56	/// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
	57	/// pieces are placed into 32-bit cells, and arranged as two 128-bit SIMD
	58	/// operands, as follows.
	59	///
	60	///	Offset      0       4       8      12
	61	///	     0    v'_0   v''_0    v'_1   v''_1
	62	///	    16    v'_2   v''_2    v'_3   v''_3
	63	///
	64	/// The `umull' and `umlal' instructions can multiply a vector of two 32-bit
	65	/// values by a 32-bit scalar, giving two 64-bit results; thus, it will act
	66	/// on (say) v'_0 and v''_0 in a single instruction, to produce two 48-bit
	67	/// results in 64-bit fields. The sixteen bits of headroom allows us to add
	68	/// many products together before we must deal with carrying; it also allows
	69	/// for some calculations to be performed on the above expanded form.
	70	///
	71	/// We maintain three `carry' registers, v28--v30, accumulating intermediate
	72	/// results; we name them `c0', `c1', and `c2'. Each carry register holds
	73	/// two 64-bit halves: the register c0, for example, holds c'_0 (low half)
	74	/// and c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the
	75	/// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2.
	76	/// The `umull' or `umlal' instruction acting on a scalar operand and an
	77	/// operand in the expanded form above produces a result which can be added
	78	/// directly to the appropriate carry register.
	79	///
	80	/// An unusual feature of this code, as compared to the other `mul4'
	81	/// implementations, is that it makes extensive use of the ARM64
	82	/// general-purpose registers for carry resolution and output construction.
	83	/// As a result, an additional general-purpose register (typically x15) is
	84	/// used as an additional carry, with the carry value in bits 16--63.
	85	///
	86	/// Multiplication is performed in product-scanning order, since ARM
	87	/// processors commonly implement result forwarding for consecutive multiply-
	88	/// and-accumulate instructions specifying the same destination.
	89	/// Experimentally, this runs faster than operand-scanning in an attempt to
	90	/// hide instruction latencies.
	91	///
	92	/// On 64-bit ARM, we have a vast supply of registers: the expanded
	93	/// operands are kept in registers. The packed operands are read from memory
	94	/// into working registers v0 and v1. The following conventional argument
	95	/// names and locations are used throughout.
	96	///
	97	///	Arg  Format    Location   Notes
	98	///
	99	///	U    packed    [x1]
	100	///	X    packed    [x2]       In Montgomery multiplication, X = N
	101	///	V    expanded  v2/v3
	102	///	Y    expanded  v4/v5      In Montgomery multiplication, Y = (A + U V) M
	103	///	M    expanded  v6/v7      -N^{-1} (mod B^4)
	104	///	N                         Modulus, for Montgomery multiplication
	105	///	A    packed    [x0]       Destination/accumulator
	106	///	C    carry     v28--v30
	107	///	0              v31        128-bit zero
	108	///
	109	/// The calculation is some variant of
	110	///
	111	/// A' + C' B^4 <- U V + X Y + A + C
	112	///
	113	/// The low-level functions fit into a fairly traditional (finely-integrated)
	114	/// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y)
	115	/// (indexed by i).
	116	///
	117	/// The variants are as follows.
	118	///
	119	///	Function  Variant            Use         i  j
	120	///
	121	///	mmul4     A = C = 0          Montgomery  0  0
	122	///	dmul4     A = 0              Montgomery  0  +
	123	///	mmla4     C = 0              Montgomery  +  0
	124	///	dmla4     exactly as shown   Montgomery  +  +
	125	///
	126	///	mul4zc    U = V = A = C = 0  Plain       0  0
	127	///	mul4      U = V = A = 0      Plain       0  +
	128	///	mla4zc    U = V = C = 0      Plain       +  0
	129	///	mla4      U = V = 0          Plain       +  +
	130	///
	131	/// The `mmul4' and `mmla4' functions are also responsible for calculating
	132	/// the Montgomery reduction factor Y = (A + U V) M used by the rest of the
	133	/// inner loop.
	134
	135	///--------------------------------------------------------------------------
	136	/// Macro definitions.
	137
	138	.macro mulacc z, u, v, x=nil, y=nil
	139	// Set Z = Z + U V + X Y, using the low halves of V and Y. Y may be
	140	// `nil' to omit the second operand. Z, V, and Y should be 128-bit
	141	// `vN' registers; and U and X should be 32-bit `vN.s[I]' scalars;
	142	// the multiplications produce two 64-bit elementwise products, which
	143	// are added elementwise to Z. X is only read when Y is supplied.
	144
	145	umlal \z\().2d, \v\().2s, \u // Z += U * (32-bit lanes 0 and 1 of V)
	146	.ifnes "\y", "nil"
	147	umlal \z\().2d, \y\().2s, \x // Z += X * (32-bit lanes 0 and 1 of Y)
	148	.endif
	149	.endm
	150
	151	.macro mulacc2 z, u, v, x=nil, y=nil
	152	// Set Z = Z + U V + X Y, using the high halves of V and Y; see
	153	// `mulacc'. As there, Y may be `nil' to omit the second product.
	154
	155	umlal2 \z\().2d, \v\().4s, \u // Z += U * (32-bit lanes 2 and 3 of V)
	156	.ifnes "\y", "nil"
	157	umlal2 \z\().2d, \y\().4s, \x // Z += X * (32-bit lanes 2 and 3 of Y)
	158	.endif
	159	.endm
	160
	161	.macro mulinit z, zinitp, u, v=nil, x=nil, y=nil
	162	// Low-half product step: when ZINITP is `t', accumulate, giving
	163	// Z = Z + U V + X Y; for any other ZINITP, overwrite, giving
	164	// Z = U V + X Y. Y may be `nil' to leave out the X Y product.
	165
	166	.ifnes "\zinitp", "t"
	167	umull \z\().2d, \v\().2s, \u // fresh start: Z = U V
	168	.else
	169	umlal \z\().2d, \v\().2s, \u // accumulate: Z += U V
	170	.endif
	171	.ifnes "\y", "nil"
	172	umlal \z\().2d, \y\().2s, \x // either way, the second product is added
	173	.endif
	174	.endm
	175
	176	.macro mulini2 z, zinitp, u, v=nil, x=nil, y=nil
	177	// High-half counterpart of `mulinit': same rules, using lanes 2 and 3 of V and Y.
	178
	179	.ifnes "\zinitp", "t"
	180	umull2 \z\().2d, \v\().4s, \u // fresh start: Z = U V
	181	.else
	182	umlal2 \z\().2d, \v\().4s, \u // accumulate: Z += U V
	183	.endif
	184	.ifnes "\y", "nil"
	185	umlal2 \z\().2d, \y\().4s, \x // either way, the second product is added
	186	.endif
	187	.endm
	188
	189	// `mulI': accumulate the B^I and b B^I terms of the polynomial product sum
	190	// U V + X Y, given that U = u_0 + B u_1 + B^2 u_2 + B^3 u_3 (and similarly
	191	// for X), and V = v'_0 + b v''_0 + B (v'_1 + b v''_1) + B^2 (v'_2 + b v''_2)
	192	// + B^3 (v'_3 + b v''_3) (and similarly for Y). The 64-bit coefficients are
	193	// added into the low and high halves of the 128-bit register Z (if ZINITP is
	194	// not `t' then simply set Z, as if it were initially zero).
	195	.macro mul0 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
	196	mulinit \z, \zinitp, \u\().s[0], \v0, \x\().s[0], \y0 // B^0 term: u_0 v_0 (+ x_0 y_0); V1 and Y1 are not used
	197	.endm
	198	.macro mul1 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
	199	mulini2 \z, \zinitp, \u\().s[0], \v0, \x\().s[0], \y0 // B^1 terms: u_0 v_1 (+ x_0 y_1), from the high half of V0/Y0
	200	mulacc \z, \u\().s[1], \v0, \x\().s[1], \y0 // + u_1 v_0 (+ x_1 y_0)
	201	.endm
	202	.macro mul2 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
	203	mulinit \z, \zinitp, \u\().s[0], \v1, \x\().s[0], \y1 // B^2 terms: u_0 v_2 (+ x_0 y_2), from the low half of V1/Y1
	204	mulacc2 \z, \u\().s[1], \v0, \x\().s[1], \y0 // + u_1 v_1 (+ x_1 y_1)
	205	mulacc \z, \u\().s[2], \v0, \x\().s[2], \y0 // + u_2 v_0 (+ x_2 y_0)
	206	.endm
	207	.macro mul3 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
	208	mulini2 \z, \zinitp, \u\().s[0], \v1, \x\().s[0], \y1 // B^3 terms: u_0 v_3 (+ x_0 y_3)
	209	mulacc \z, \u\().s[1], \v1, \x\().s[1], \y1 // + u_1 v_2 (+ x_1 y_2)
	210	mulacc2 \z, \u\().s[2], \v0, \x\().s[2], \y0 // + u_2 v_1 (+ x_2 y_1)
	211	mulacc \z, \u\().s[3], \v0, \x\().s[3], \y0 // + u_3 v_0 (+ x_3 y_0)
	212	.endm
	213	.macro mul4 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
	214	mulini2 \z, \zinitp, \u\().s[1], \v1, \x\().s[1], \y1 // B^4 terms: u_1 v_3 (+ x_1 y_3)
	215	mulacc \z, \u\().s[2], \v1, \x\().s[2], \y1 // + u_2 v_2 (+ x_2 y_2)
	216	mulacc2 \z, \u\().s[3], \v0, \x\().s[3], \y0 // + u_3 v_1 (+ x_3 y_1)
	217	.endm
	218	.macro mul5 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
	219	mulini2 \z, \zinitp, \u\().s[2], \v1, \x\().s[2], \y1 // B^5 terms: u_2 v_3 (+ x_2 y_3)
	220	mulacc \z, \u\().s[3], \v1, \x\().s[3], \y1 // + u_3 v_2 (+ x_3 y_2)
	221	.endm
	222	.macro mul6 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
	223	mulini2 \z, \zinitp, \u\().s[3], \v1, \x\().s[3], \y1 // B^6 term: u_3 v_3 (+ x_3 y_3); V0 and Y0 are not used
	224	.endm
	225
	226	// Steps in the process of propagating carry bits upwards from ZLO (a 128-bit
	227	// `vN' register). Here, T0, T1, and CG are 64-bit `xN' general-purpose
	228	// registers clobbered in the process. Set the low 32 bits of the 64-bit
	229	// `xN' general-purpose register ZOUT to the completed coefficient z_i,
	230	// leaving a carry in CG.
	231	//
	232	// In detail, what happens is as follows. Suppose initially that ZLO =
	233	// (z'_i; z''_i) and ZHI = (z'_{i+1}; z''_{i+1}). Let t = z'_i + b z''_i;
	234	// observe that floor(t/b) = floor(z'_i/b) + z''_i. Let z_i = t mod B, and
	235	// add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}. This has
	236	// a circuit depth of 3; I don't know how to do better.
	237	//
	238	// Output words are left in the low half of a 64-bit register, with rubbish
	239	// in the high half. Two such results can be combined using the `bfi'
	240	// instruction.
	241	.macro carry0 zlo, cg=x15, t0=x16, t1=x17
	242	// Capture the values of carry-register ZLO and CG (if not `nil') in
	243	// general-purpose registers T0 and T1, suitable for use in `carry1'. ZLO and CG are only read.
	244	mov \t0, \zlo\().d[0] // T0 = low 64-bit half of ZLO
	245	mov \t1, \zlo\().d[1] // T1 = high 64-bit half of ZLO
	246	.ifnes "\cg", "nil"
	247	add \t0, \t0, \cg, lsr #16 // add the incoming carry, which CG keeps in bits 16--63
	248	.endif
	249	.endm
	250	.macro carry1 zout, cg=x15, t0=x16, t1=x17
	251	// Collect a 32-bit output word in the low 32 bits of ZOUT (leaving
	252	// rubbish in the high 32 bits), and update CG suitably to continue
	253	// processing with the next carry register. Either of ZOUT and CG may be `nil' to skip that step.
	254	.ifnes "\zout", "nil"
	255	add \zout, \t0, \t1, lsl #16 // ZOUT = T0 + b T1; only the low 32 bits are meaningful
	256	.endif
	257	.ifnes "\cg", "nil"
	258	add \cg, \t1, \t0, lsr #16 // CG = T1 + floor(T0/b); the next carry is CG >> 16
	259	.endif
	260	.endm
	261
	262	.macro expand vlo, vhi, vz=v31
	263	// Expand the packed 128-bit operand in VLO to an expanded operand in
	264	// VLO and VHI, assuming that VZ is all-bits-zero. All three are
	265	// `vN' 128-bit SIMD registers.
	266	zip2 \vhi\().8h, \vlo\().8h, \vz\().8h // VHI = upper four halfwords of VLO, each widened to 32 bits; done first because the next line overwrites VLO
	267	zip1 \vlo\().8h, \vlo\().8h, \vz\().8h // VLO = lower four halfwords of VLO, each widened to 32 bits
	268	.endm
	269
	270	.macro sprdacc a0, a1, a2, a3=nil
	271	// Spread the packed 128-bit operand in A0 into carry-format values
	272	// in A0, A1, A2, A3. If A3 is `nil', then spread the same value
	273	// into A0, A1, A2 only, clobbering x16. Uses v31 directly as the zero register.
	274	.ifeqs "\a3", "nil"
	275	mov w16, \a0\().s[3] // save word 3 before A0 is overwritten
	276	.endif
	277	trn2 \a2\().2d, \a0\().2d, v31.2d // A2 = (words 2 and 3 of A0; zero)
	278	trn2 \a1\().2s, \a0\().2s, v31.2s // A1 = (word 1; zero)
	279	.ifeqs "\a3", "nil"
	280	lsl x16, x16, #16 // word 3 goes in a high half, which has weight b, so pre-shift it by 16 to give it weight B
	281	.endif
	282	trn1 \a0\().2s, \a0\().2s, v31.2s // A0 = (word 0; zero)
	283	.ifeqs "\a3", "nil"
	284	mov \a2\().d[1], x16 // high half of A2 = word 3 << 16
	285	.else
	286	trn2 \a3\().2s, \a2\().2s, v31.2s // A3 = (word 3; zero)
	287	.endif
	288	mov \a2\().s[1], wzr // clear word 3 out of the low half of A2, leaving word 2 alone
	289	.endm
	290
	291	.macro crryacc a0, a1, a2, a3, c0, c1, c2
	292	// Add the carry-format values A0, A1, A2 into the existing carries
	293	// C0, C1, C2 (leaving A3 where it is). A3 is accepted but not referenced in the body.
	294	add \c0\().2d, \c0\().2d, \a0\().2d // C0 += A0, as two 64-bit lanes
	295	add \c1\().2d, \c1\().2d, \a1\().2d // C1 += A1
	296	add \c2\().2d, \c2\().2d, \a2\().2d // C2 += A2
	297	.endm
	298
	299	///--------------------------------------------------------------------------
	300	/// Primitive multipliers and related utilities.
	301
	302	INTFUNC(carryprop)
	303	// On entry, x0 points to a destination, and v28--v30 and x15 hold
	304	// incoming carries c0--c2 and cg. On exit, the low 128 bits of the
	305	// carry value are stored at [x0]; the remaining 16 bits of carry are
	306	// left in x10; x0 is advanced by 16; and x11--x17 are clobbered.
	307	endprologue
	308
	309	carry0 v28 // x16/x17 = halves of c0, with cg folded in
	310	carry1 x11 // x11 = output word 0; x15 = carry onwards
	311	carry0 v29
	312	carry1 x13 // x13 = output word 1
	313	carry0 v30
	314	carry1 x12 // x12 = output word 2
	315	bfi x11, x13, #32, #32 // x11 = word 1 : word 0
	316	lsr x14, x15, #16 // what is left in cg becomes output word 3 (low 32 bits of x14)
	317	lsr x10, x15, #48 // and anything above those 32 bits is the leftover carry
	318	bfi x12, x14, #32, #32 // x12 = word 3 : word 2
	319	stp x11, x12, [x0], #16 // store four words and advance x0
	320	ret
	321	ENDFUNC
	322
	323	INTFUNC(dmul4)
	324	// On entry, x0 points to the destination; x1 and x2 point to packed
	325	// operands U and X; v2/v3 and v4/v5 hold expanded operands V and Y;
	326	// v28--v30 and x15 hold incoming carries c0--c2 and cg; and v31 is
	327	// zero. On exit, the destination and carries are updated; x0, x1,
	328	// x2 are each advanced by 16; v2--v5 and v8--v15 are preserved; and
	329	// x11--x14, x16, x17 and the other SIMD registers are clobbered.
	330	endprologue
	331
	332	// Start by loading the operand words from memory.
	333	ldr q0, [x1], #16 // v0 = packed U
	334	ldr q1, [x2], #16 // v1 = packed X
	335
	336	// Do the multiplication. Carry resolution is interleaved with the multiplies; each accumulator is captured by `carry0' before it is reused.
	337	mul0 v28, t, v0, v2, v3, v1, v4, v5 // c0 += B^0 terms of U V + X Y
	338	mul1 v29, t, v0, v2, v3, v1, v4, v5 // c1 += B^1 terms
	339	carry0 v28
	340	mul2 v30, t, v0, v2, v3, v1, v4, v5 // c2 += B^2 terms
	341	carry1 x11 // x11 = output word 0
	342	carry0 v29
	343	mul3 v27, nil, v0, v2, v3, v1, v4, v5 // v27 = B^3 terms (fresh accumulator)
	344	carry1 x13 // x13 = output word 1
	345	carry0 v30
	346	mul4 v28, nil, v0, v2, v3, v1, v4, v5 // outgoing c0 = B^4 terms (old c0 already captured)
	347	carry1 x12 // x12 = output word 2
	348	carry0 v27
	349	mul5 v29, nil, v0, v2, v3, v1, v4, v5 // outgoing c1 = B^5 terms
	350	carry1 x14 // x14 = output word 3; x15 = outgoing cg
	351	mul6 v30, nil, v0, v2, v3, v1, v4, v5 // outgoing c2 = B^6 term
	352
	353	// Finish up and store the result.
	354	bfi x11, x13, #32, #32 // x11 = word 1 : word 0
	355	bfi x12, x14, #32, #32 // x12 = word 3 : word 2
	356	stp x11, x12, [x0], #16
	357
	358	// All done.
	359	ret
	360	ENDFUNC
	361
	362	INTFUNC(dmla4)
	363	// On entry, x0 points to the destination/accumulator A; x1 and x2
	364	// point to packed operands U and X; v2/v3 and v4/v5 hold expanded
	365	// operands V and Y; v28--v30 and x15 hold incoming carries c0--c2
	366	// and cg; and v31 is zero. On exit, the accumulator and carries are
	367	// updated; x0, x1, x2 are each advanced by 16; v2--v5 and v8--v15
	368	// are preserved; and x11--x14, x16, x17 and the other SIMD registers
	369	// are clobbered.
	370	endprologue
	371
	372	// Start by loading the operand words from memory.
	373	ldr q24, [x0] // v24 = packed A (x0 is advanced only by the final store)
	374	ldr q0, [x1], #16 // v0 = packed U
	375	ldr q1, [x2], #16 // v1 = packed X
	376	sprdacc v24, v25, v26, v27 // A words 0--3 into v24, v25, v26, v27 in carry format
	377	crryacc v24, v25, v26, v27, v28, v29, v30 // c0--c2 += A words 0--2; word 3 stays in v27
	378
	379	// Do the multiplication.
	380	mul0 v28, t, v0, v2, v3, v1, v4, v5 // c0 += B^0 terms of U V + X Y
	381	mul1 v29, t, v0, v2, v3, v1, v4, v5 // c1 += B^1 terms
	382	carry0 v28
	383	mul2 v30, t, v0, v2, v3, v1, v4, v5 // c2 += B^2 terms
	384	carry1 x11 // x11 = output word 0
	385	carry0 v29
	386	mul3 v27, t, v0, v2, v3, v1, v4, v5 // v27 (A word 3) += B^3 terms
	387	carry1 x13 // x13 = output word 1
	388	carry0 v30
	389	mul4 v28, nil, v0, v2, v3, v1, v4, v5 // outgoing c0 = B^4 terms
	390	carry1 x12 // x12 = output word 2
	391	carry0 v27
	392	mul5 v29, nil, v0, v2, v3, v1, v4, v5 // outgoing c1 = B^5 terms
	393	carry1 x14 // x14 = output word 3; x15 = outgoing cg
	394	mul6 v30, nil, v0, v2, v3, v1, v4, v5 // outgoing c2 = B^6 term
	395
	396	// Finish up and store the result.
	397	bfi x11, x13, #32, #32 // x11 = word 1 : word 0
	398	bfi x12, x14, #32, #32 // x12 = word 3 : word 2
	399	stp x11, x12, [x0], #16
	400
	401	// All done.
	402	ret
	403	ENDFUNC
	404
	405	INTFUNC(mul4)
	406	// On entry, x0 points to the destination; x2 points to a packed
	407	// operand X; v4/v5 holds an expanded operand Y; v28--v30 and x15
	408	// hold incoming carries c0--c2 and cg; and v31 is zero. On exit,
	409	// the destination and carries are updated; x0 and x2 are each
	410	// advanced by 16; v4 and v5 and v8--v15 are preserved; and x11--x14,
	411	// x16, x17 and the other SIMD registers are clobbered.
	412	endprologue
	413
	414	// Start by loading the operand words from memory.
	415	ldr q1, [x2], #16 // v1 = packed X
	416
	417	// Do the multiplication. Only the X Y product is formed: the macros' second operand pair is left as `nil'.
	418	mul0 v28, t, v1, v4, v5 // c0 += B^0 term of X Y
	419	mul1 v29, t, v1, v4, v5 // c1 += B^1 terms
	420	carry0 v28
	421	mul2 v30, t, v1, v4, v5 // c2 += B^2 terms
	422	carry1 x11 // x11 = output word 0
	423	carry0 v29
	424	mul3 v27, nil, v1, v4, v5 // v27 = B^3 terms (fresh accumulator)
	425	carry1 x13 // x13 = output word 1
	426	carry0 v30
	427	mul4 v28, nil, v1, v4, v5 // outgoing c0 = B^4 terms
	428	carry1 x12 // x12 = output word 2
	429	carry0 v27
	430	mul5 v29, nil, v1, v4, v5 // outgoing c1 = B^5 terms
	431	carry1 x14 // x14 = output word 3; x15 = outgoing cg
	432	mul6 v30, nil, v1, v4, v5 // outgoing c2 = B^6 term
	433
	434	// Finish up and store the result.
	435	bfi x11, x13, #32, #32 // x11 = word 1 : word 0
	436	bfi x12, x14, #32, #32 // x12 = word 3 : word 2
	437	stp x11, x12, [x0], #16
	438
	439	// All done.
	440	ret
	441	ENDFUNC
	442
	443	INTFUNC(mul4zc)
	444	// On entry, x0 points to the destination; x2 points to a packed
	445	// operand X; v4/v5 holds an expanded operand Y; and v31 is zero. On
	446	// exit, the destination is updated; v28--v30 and x15 hold outgoing
	447	// carries c0--c2 and cg; x0 and x2 are each advanced by 16; v4 and
	448	// v5 and v8--v15 are preserved; and x11--x14, x16, x17 and the other
	449	// SIMD registers are clobbered.
	450	endprologue
	451
	452	// Start by loading the operand words from memory.
	453	ldr q1, [x2], #16 // v1 = packed X
	454
	455	// Do the multiplication. There are no incoming carries, so every accumulator is initialized rather than added to.
	456	mul0 v28, nil, v1, v4, v5 // v28 = B^0 term of X Y
	457	mul1 v29, nil, v1, v4, v5 // v29 = B^1 terms
	458	carry0 v28, nil // no incoming cg to fold in
	459	mul2 v30, nil, v1, v4, v5 // v30 = B^2 terms
	460	carry1 x11 // x11 = output word 0; this is the first write to x15
	461	carry0 v29
	462	mul3 v27, nil, v1, v4, v5 // v27 = B^3 terms
	463	carry1 x13 // x13 = output word 1
	464	carry0 v30
	465	mul4 v28, nil, v1, v4, v5 // outgoing c0 = B^4 terms
	466	carry1 x12 // x12 = output word 2
	467	carry0 v27
	468	mul5 v29, nil, v1, v4, v5 // outgoing c1 = B^5 terms
	469	carry1 x14 // x14 = output word 3; x15 = outgoing cg
	470	mul6 v30, nil, v1, v4, v5 // outgoing c2 = B^6 term
	471
	472	// Finish up and store the result.
	473	bfi x11, x13, #32, #32 // x11 = word 1 : word 0
	474	bfi x12, x14, #32, #32 // x12 = word 3 : word 2
	475	stp x11, x12, [x0], #16
	476
	477	// All done.
	478	ret
	479	ENDFUNC
	480
	481	INTFUNC(mla4)
	482	// On entry, x0 points to the destination/accumulator A; x2 points to
	483	// a packed operand X; v4/v5 holds an expanded operand Y; v28--v30
	484	// and x15 hold incoming carries c0--c2 and cg; and v31 is zero. On
	485	// exit, the accumulator and carries are updated; x0 and x2 are each
	486	// advanced by 16; v4 and v5 and v8--v15 are preserved; and x11--x14,
	487	// x16, x17 and the other SIMD registers are clobbered.
	488	endprologue
	489
	490	// Start by loading the operand words from memory.
	491	ldr q24, [x0] // v24 = packed A
	492	ldr q1, [x2], #16 // v1 = packed X
	493	sprdacc v24, v25, v26, v27 // A words 0--3 into v24, v25, v26, v27 in carry format
	494	crryacc v24, v25, v26, v27, v28, v29, v30 // c0--c2 += A words 0--2; word 3 stays in v27
	495
	496	// Do the multiplication.
	497	mul0 v28, t, v1, v4, v5 // c0 += B^0 term of X Y
	498	mul1 v29, t, v1, v4, v5 // c1 += B^1 terms
	499	carry0 v28
	500	mul2 v30, t, v1, v4, v5 // c2 += B^2 terms
	501	carry1 x11 // x11 = output word 0
	502	carry0 v29
	503	mul3 v27, t, v1, v4, v5 // v27 (A word 3) += B^3 terms
	504	carry1 x13 // x13 = output word 1
	505	carry0 v30
	506	mul4 v28, nil, v1, v4, v5 // outgoing c0 = B^4 terms
	507	carry1 x12 // x12 = output word 2
	508	carry0 v27
	509	mul5 v29, nil, v1, v4, v5 // outgoing c1 = B^5 terms
	510	carry1 x14 // x14 = output word 3; x15 = outgoing cg
	511	mul6 v30, nil, v1, v4, v5 // outgoing c2 = B^6 term
	512
	513	// Finish up and store the result.
	514	bfi x11, x13, #32, #32 // x11 = word 1 : word 0
	515	bfi x12, x14, #32, #32 // x12 = word 3 : word 2
	516	stp x11, x12, [x0], #16
	517
	518	// All done.
	519	ret
	520	ENDFUNC
	521
	522	INTFUNC(mla4zc)
	523	// On entry, x0 points to the destination/accumulator A; x2 points to
	524	// a packed operand X; v4/v5 holds an expanded operand Y; and v31 is
	525	// zero. On exit, the accumulator is updated; v28--v30 and x15 hold
	526	// outgoing carries c0--c2 and cg; x0 and x2 are each advanced by 16;
	527	// v4, v5, and v8--v15 are preserved; and x11--x14, x16, x17 and the
	528	// other SIMD registers are clobbered.
	529	endprologue
	530
	531	// Start by loading the operand words from memory.
	532	ldr q28, [x0] // v28 = packed A
	533	ldr q1, [x2], #16 // v1 = packed X
	534	sprdacc v28, v29, v30, v27 // with no incoming carries, A is spread straight into the carry registers
	535
	536	// Do the multiplication.
	537	mul0 v28, t, v1, v4, v5 // c0 += B^0 term of X Y
	538	mul1 v29, t, v1, v4, v5 // c1 += B^1 terms
	539	carry0 v28, nil // no incoming cg to fold in
	540	mul2 v30, t, v1, v4, v5 // c2 += B^2 terms
	541	carry1 x11 // x11 = output word 0; this is the first write to x15
	542	carry0 v29
	543	mul3 v27, t, v1, v4, v5 // v27 (A word 3) += B^3 terms
	544	carry1 x13 // x13 = output word 1
	545	carry0 v30
	546	mul4 v28, nil, v1, v4, v5 // outgoing c0 = B^4 terms
	547	carry1 x12 // x12 = output word 2
	548	carry0 v27
	549	mul5 v29, nil, v1, v4, v5 // outgoing c1 = B^5 terms
	550	carry1 x14 // x14 = output word 3; x15 = outgoing cg
	551	mul6 v30, nil, v1, v4, v5 // outgoing c2 = B^6 term
	552
	553	// Finish up and store the result.
	554	bfi x11, x13, #32, #32 // x11 = word 1 : word 0
	555	bfi x12, x14, #32, #32 // x12 = word 3 : word 2
	556	stp x11, x12, [x0], #16
	557
	558	// All done.
	559	ret
	560	ENDFUNC
	561
	562	INTFUNC(mmul4)
	563	// On entry, x0 points to the destination; x1 points to a packed
	564	// operand U; x2 points to a packed operand X (the modulus); v2/v3
	565	// holds an expanded operand V; and v6/v7 holds an expanded operand M
	566	// (the Montgomery factor -N^{-1} (mod B)). On exit, the destination
	567	// is updated (to zero); v4/v5 hold an expanded factor Y = U V M (mod
	568	// B); v28--v30 and x15 hold outgoing carries c0--c2 and cg; x0, x1,
	569	// and x2 are each advanced by 16; v2, v3, and v8--v15 are preserved;
	570	// and x11--x14, x16, x17 and the other SIMD registers are clobbered.
	571	endprologue
	572
	573	// Start by loading the operand words from memory.
	574	ldr q0, [x1], #16 // v0 = packed U
	575	ldr q1, [x2], #16 // v1 = packed X
	576
	577	// Calculate the low half of W = A + U V, being careful to leave the
	578	// carries in place. (No accumulator is loaded here, so v28--v30 and v27 are initialized rather than added to.)
	579	mul0 v28, nil, v0, v2, v3 // v28 = B^0 term of U V
	580	mul1 v29, nil, v0, v2, v3 // v29 = B^1 terms
	581	carry0 v28, nil // no incoming cg to fold in
	582	mul2 v30, nil, v0, v2, v3 // v30 = B^2 terms
	583	carry1 x11 // x11 = word 0 of W
	584	carry0 v29
	585	mul3 v27, nil, v0, v2, v3 // v27 = B^3 terms
	586	b mmla4_common // the rest is shared with `mmla4', which executes the `ret'
	587	ENDFUNC
	588
	589	INTFUNC(mmla4)
	590	// On entry, x0 points to the destination/accumulator A; x1 points to
	591	// a packed operand U; x2 points to a packed operand X (the modulus);
	592	// v2/v3 holds an expanded operand V; and v6/v7 holds an expanded
	593	// operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the
	594	// accumulator is updated (to zero); v4/v5 hold an expanded factor Y
	595	// = (A + U V) M (mod B); v28--v30 and x15 hold outgoing carries
	596	// c0--c2 and cg; x0, x1, and x2 are each advanced by 16; v2, v3, v6,
	597	// v7, and v8--v15 are preserved; and x11--x14, x16, x17 and the
	598	// other SIMD registers are clobbered.
	599	endprologue
	600
	601	// Start by loading the operand words from memory.
	602	ldr q28, [x0] // v28 = packed A
	603	ldr q0, [x1], #16 // v0 = packed U
	604	ldr q1, [x2], #16 // v1 = packed X
	605	sprdacc v28, v29, v30, v27 // A words 0--3 into v28, v29, v30, v27 in carry format
	606
	607	// Calculate the low half of W = A + U V, being careful to leave the
	608	// carries in place. (`carry0' only reads the vector accumulators, so v28--v30 and v27 keep their sums.)
	609	mul0 v28, t, v0, v2, v3 // v28 += B^0 term of U V
	610	mul1 v29, t, v0, v2, v3 // v29 += B^1 terms
	611	carry0 v28, nil // no incoming cg to fold in
	612	mul2 v30, t, v0, v2, v3 // v30 += B^2 terms
	613	carry1 x11 // x11 = word 0 of W
	614	carry0 v29
	615	mul3 v27, t, v0, v2, v3 // v27 += B^3 terms
	616	mmla4_common: // `mmul4' joins here
	617	carry1 x13 // x13 = word 1 of W
	618	carry0 v30
	619	carry1 x12 // x12 = word 2 of W
	620	carry0 v27
	621	carry1 x14, nil // x14 = word 3 of W; the carry out is not needed
	622
	623	// Piece the result together and ship it back.
	624	bfi x11, x13, #32, #32 // x11 = word 1 : word 0
	625	bfi x12, x14, #32, #32 // x12 = word 3 : word 2
	626	mov v16.d[0], x11
	627	mov v16.d[1], x12 // v16 = packed low half of W
	628
	629	// Calculate the low half of the Montgomery factor Y = W M.
	630	mul0 v18, nil, v16, v6, v7 // scratch accumulators v18--v21; only the B^0--B^3 terms are formed
	631	mul1 v19, nil, v16, v6, v7
	632	carry0 v18, nil
	633	mul2 v20, nil, v16, v6, v7
	634	carry1 x11 // x11 = word 0 of Y
	635	carry0 v19
	636	mul3 v21, nil, v16, v6, v7
	637	carry1 x13 // x13 = word 1 of Y
	638	carry0 v20
	639	carry1 x12 // x12 = word 2 of Y
	640	carry0 v21
	641	carry1 x14, nil // x14 = word 3 of Y; the carry out is discarded
	642
	643	// Piece the result together, ship it back, and expand.
	644	bfi x11, x13, #32, #32
	645	bfi x12, x14, #32, #32
	646	mov v4.d[0], x11
	647	mov v4.d[1], x12 // v4 = packed Y
	648	expand v4, v5 // v4/v5 = expanded Y
	649
	650	// Build up the product X Y in the carry slots.
	651	mul0 v28, t, v1, v4, v5 // added on top of the A + U V sums already there
	652	mul1 v29, t, v1, v4, v5
	653	carry0 v28, nil // restart the carry chain from scratch
	654	mul2 v30, t, v1, v4, v5
	655	carry1 nil // no output word is collected; only x15 is updated
	656	carry0 v29
	657	mul3 v27, t, v1, v4, v5
	658	carry1 nil
	659	carry0 v30
	660
	661	// And complete the calculation.
	662	mul4 v28, nil, v0, v2, v3, v1, v4, v5 // outgoing c0 = B^4 terms of U V + X Y
	663	carry1 nil
	664	carry0 v27
	665	mul5 v29, nil, v0, v2, v3, v1, v4, v5 // outgoing c1 = B^5 terms
	666	carry1 nil // x15 = outgoing cg
	667	mul6 v30, nil, v0, v2, v3, v1, v4, v5 // outgoing c2 = B^6 term
	668
	669	// Finish up and store the result.
	670	stp xzr, xzr, [x0], #16 // four zero words are stored directly instead of computed ones
	671
	672	// All done.
	673	ret
	674	ENDFUNC
	675
	676	INTFUNC(mont4)
	677	// On entry, x0 points to the destination/accumulator A; x2 points to
	678	// a packed operand X (the modulus); and v6/v7 holds an expanded
	679	// operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the
	680	// accumulator is updated (to zero); v4/v5 hold an expanded factor Y
	681	// = A M (mod B); v28--v30 and x15 hold outgoing carries c0--c2 and
	682	// cg; x0 and x2 are each advanced by 16; v6, v7, and v8--v15 are
	683	// preserved; and x11--x14, x16, x17 and the other SIMD registers are
	684	// clobbered.
	685	endprologue
	686
	687	// Start by loading the operand words from memory.
	688	ldr q28, [x0] // v28 = packed A
	689	ldr q1, [x2], #16 // v1 = packed X
	690
	691	// Calculate Y = A M (mod B).
	692	mul0 v18, nil, v28, v6, v7 // packed A in v28 supplies the scalars; scratch accumulators v18--v21
	693	mul1 v19, nil, v28, v6, v7
	694	carry0 v18, nil
	695	mul2 v20, nil, v28, v6, v7
	696	carry1 x11 // x11 = word 0 of Y
	697	carry0 v19
	698	mul3 v21, nil, v28, v6, v7 // last use of packed A
	699	carry1 x13 // x13 = word 1 of Y
	700	carry0 v20
	701	sprdacc v28, v29, v30, v27 // only now spread A into carry format; this overwrites the packed copy
	702	carry1 x12 // x12 = word 2 of Y
	703	carry0 v21
	704	carry1 x14, nil // x14 = word 3 of Y; the carry out is discarded
	705
	706	// Piece the result together, ship it back, and expand.
	707	bfi x11, x13, #32, #32
	708	bfi x12, x14, #32, #32
	709	mov v4.d[0], x11
	710	mov v4.d[1], x12 // v4 = packed Y
	711	expand v4, v5 // v4/v5 = expanded Y
	712
	713	// Calculate the actual result. Well, the carries, at least.
	714	mul0 v28, t, v1, v4, v5 // X Y is added on top of the spread A
	715	mul1 v29, t, v1, v4, v5
	716	carry0 v28, nil // restart the carry chain from scratch
	717	mul2 v30, t, v1, v4, v5
	718	carry1 nil // no output word is collected; only x15 is updated
	719	carry0 v29
	720	mul3 v27, t, v1, v4, v5
	721	carry1 nil
	722	carry0 v30
	723
	724	// And complete the calculation.
	725	mul4 v28, nil, v1, v4, v5 // outgoing c0 = B^4 terms of X Y
	726	carry1 nil
	727	carry0 v27
	728	mul5 v29, nil, v1, v4, v5 // outgoing c1 = B^5 terms
	729	carry1 nil // x15 = outgoing cg
	730	mul6 v30, nil, v1, v4, v5 // outgoing c2 = B^6 term
	731
	732	// Finish up and store the result.
	733	stp xzr, xzr, [x0], #16 // four zero words are stored directly instead of computed ones
	734
	735	// All done.
	736	ret
	737	ENDFUNC
	738
	739	///--------------------------------------------------------------------------
	740	/// Bulk multipliers.
	741
	742	FUNC(mpx_umul4_arm64_simd)
	743	// void mpx_umul4_arm64_simd(mpw *dv, const mpw *av, const mpw *avl,
	744	// const mpw *bv, const mpw *bvl);
	745
	746	// Establish the arguments and do initial setup.
	747	//
	748	// inner loop dv x0
	749	// inner loop av x2
	750	// outer loop dv x5
	751	// outer loop bv x3
	752	// av base x1
	753	// inner n x6
	754	// n base x7
	755	// outer n x4
	756	pushreg x29, x30
	757	setfp
	758	endprologue
	759
	760	// Prepare for the first iteration. Both loop counts are byte counts stepped by 16 and tested for exactly zero (NOTE(review): so both operand lengths are presumably multiples of four words -- confirm with the caller).
	761	ldr q4, [x3], #16 // Y = bv[0]
	762	movi v31.4s, #0
	763	sub x7, x2, x1 // = inner loop count base
	764	// x0 // = dv for inner loop
	765	// x1 // = av base
	766	// x3 // = bv for outer loop
	767	sub x4, x4, x3 // = outer loop count (decremented)
	768	sub x6, x7, #16 // = inner loop count (decremented)
	769	mov x2, x1 // = av for inner loop
	770	add x5, x0, #16 // = dv for outer loop
	771	expand v4, v5 // expand Y
	772	bl mul4zc
	773	cbz x6, 8f // all done?
	774
	775	// Continue with the first iteration.
	776	0: sub x6, x6, #16
	777	bl mul4
	778	cbnz x6, 0b
	779
	780	// Write out the leftover carry. There can be no tail here.
	781	8: bl carryprop
	782	cbz x4, 9f
	783
	784	// Set up for the next pass.
	785	1: ldr q4, [x3], #16 // Y = bv[i]
	786	mov x0, x5 // -> dv[i]
	787	mov x2, x1 // -> av[0]
	788	add x5, x5, #16
	789	sub x6, x7, #16 // = inner loop count (decremented)
	790	sub x4, x4, #16 // outer loop count
	791	expand v4, v5 // expand Y
	792	bl mla4zc
	793	cbz x6, 8f
	794
	795	// Continue...
	796	0: sub x6, x6, #16
	797	bl mla4
	798	cbnz x6, 0b
	799
	800	// Finish off this pass. There was no tail on the previous pass, and
	801	// there can be none on this pass.
	802	8: bl carryprop
	803	cbnz x4, 1b
	804
	805	// All over.
	806	9: popreg x29, x30
	807	ret
	808	ENDFUNC
	809
	810	FUNC(mpxmont_mul4_arm64_simd)
	811	// void mpxmont_mul4_arm64_simd(mpw *dv,
	812	// const mpw *av, const mpw *bv,
	813	// const mpw *nv, size_t n,
	814	// const mpw *mi);
	815
	816	// Establish the arguments and do initial setup.
	817	//
	818	// inner loop dv x0
	819	// inner loop av x1
	820	// inner loop nv x2
	821	// nv base x3
	822	// base n x4
	823	// mi (x5)
	824	// outer loop dv x5
	825	// outer loop bv x6
	826	// av base x7
	827	// inner n x8
	828	// outer n x9
	829	// c x10
	830	pushreg x29, x30
	831	setfp
	832	endprologue
	833
	834	// Set up the outer loop state and prepare for the first iteration.
	835	ldr q2, [x2] // = V = bv[0]
	836	ldr q6, [x5] // = M
	837	movi v31.4s, #0
	838	// x0 // -> dv for inner loop
	839	// x1 // -> av for inner loop
	840	// x3 // -> nv base
	841	// x4 // = n base
	842	add x5, x0, #16 // -> dv
	843	add x6, x2, #16 // -> bv
	844	mov x2, x3 // -> nv[0]
	845	mov x7, x1 // -> av base
	846	sub x8, x4, #4 // = inner n (decremented)
	847	sub x9, x4, #4 // = outer n (decremented)
	848	expand v2, v3 // expand V
	849	expand v6, v7 // expand M
	850	bl mmul4
	851	cbz x8, 8f // done already?
	852
	853	// Complete the first inner loop.
	854	0: sub x8, x8, #4
	855	bl dmul4
	856	cbnz x8, 0b // done yet?
	857
	858	// Still have carries left to propagate. Rather than store the tail
	859	// end in memory, keep it in x10 for later.
	860	bl carryprop
	861
	862	// Embark on the next iteration. (There must be one. If n = 1 then
	863	// we would have bailed above, to label 8. Similarly, the subsequent
	864	// iterations can fall into the inner loop immediately.)
	865	1: ldr q2, [x6], #16 // = V = bv[i]
	866	mov x0, x5 // -> dv[i]
	867	mov x1, x7 // -> av[0]
	868	mov x2, x3 // -> nv[0]
	869	add x5, x5, #16
	870	sub x8, x4, #4 // = inner n (decremented)
	871	sub x9, x9, #4 // outer n
	872	expand v2, v3 // expand V
	873	bl mmla4
	874
	875	// Complete the next inner loop.
	876	0: sub x8, x8, #4
	877	bl dmla4
	878	cbnz x8, 0b
	879
	880	// Still have carries left to propagate, and they overlap the
	881	// previous iteration's final tail, so read that and add it.
	882	add x15, x15, x10, lsl #16 // cg keeps its carry in bits 16--63, hence the shift
	883	bl carryprop
	884
	885	// Back again, maybe.
	886	cbnz x9, 1b
	887
	888	// All done, almost.
	889	str w10, [x0], #4 // write the final leftover carry as one more word
	890	popreg x29, x30
	891	ret
	892
	893	// First iteration was short. Write out the carries and we're done.
	894	// (This could be folded into the main loop structure, but that would
	895	// penalize small numbers more.)
	896	8: bl carryprop
	897	str w10, [x0], #4
	898	popreg x29, x30
	899	ret
	900	ENDFUNC
	901
	902	FUNC(mpxmont_redc4_arm64_simd)
	903	// void mpxmont_redc4_arm64_simd(mpw *dv, mpw *dvl, const mpw *nv,
	904	// size_t n, const mpw *mi);
	905
	906	// Establish the arguments and do initial setup.
	907	//
	908	// inner loop dv x0
	909	// inner loop nv x2
	910	// blocks-of-4 count x6
	911	// tail count x7
	912	// mi (x4)
	913	// outer loop dv x4
	914	// outer loop count x8
	915	// nv base x5
	916	// inner loop count x1
	917	// n x3
	918	// c x10
	919	// t0, t1 x11, x12
	920
	921	pushreg x29, x30
	922	setfp
	923	endprologue
	924
	925	// Set up the outer loop state and prepare for the first iteration.
	926	ldr q6, [x4] // = M
	927	movi v31.4s, #0
	928	// x0 // -> dv for inner loop
	929	sub x6, x1, x0 // total dv bytes
	930	sub x1, x3, #4 // inner loop counter
	931	// x2 // -> nv for inner loop
	932	// x3 // = n
	933	add x4, x0, #16 // -> dv for outer loop
	934	mov x5, x2 // -> nv base
	935	sub x6, x6, x3, lsl #2 // dv carry range bytes
	936	sub x8, x3, #4 // outer loop counter
	937	sub x6, x6, #16 // dv steam-powered carry bytes
	938	expand v6, v7 // expand M
	939	and x7, x6, #15 // dv tail length in bytes
	940	bic x6, x6, #15 // dv blocks-of-four length in bytes
	941
	942	bl mont4
	943	cbz x1, 8f // done already?
	944
	945	5: sub x1, x1, #4
	946	bl mla4
	947	cbnz x1, 5b // done yet?
	948
	949	// Still have carries left to propagate. Adding the accumulator
	950	// block into the carries is a little different this time, because
	951	// all four accumulator limbs have to be squished into the three
	952	// carry registers for `carryprop' to do its thing.
	953	8: ldr q24, [x0]
	954	sprdacc v24, v25, v26 // three-register form: word 3 goes into the high half of v26
	955	add v28.2d, v28.2d, v24.2d
	956	add v29.2d, v29.2d, v25.2d
	957	add v30.2d, v30.2d, v26.2d
	958	bl carryprop // stores four words and leaves the leftover carry in x10
	959	cbz x6, 7f // no whole blocks of four left: go straight to the tail
	960
	961	// Propagate the first group of carries.
	962	ldp x16, x17, [x0]
	963	sub x1, x6, #16 // plain `sub', so the carry flag is left alone
	964	adds x16, x16, x10
	965	adcs x17, x17, xzr
	966	stp x16, x17, [x0], #16
	967	cbz x1, 6f
	968
	969	// Continue carry propagation until the end of the buffer. Nothing in the loop other than the `adcs' pair writes the flags, so the carry flag passes from one block to the next.
	970	0: ldp x16, x17, [x0]
	971	sub x1, x1, #16
	972	adcs x16, x16, xzr
	973	adcs x17, x17, xzr
	974	stp x16, x17, [x0], #16
	975	cbnz x1, 0b
	976
	977	// Deal with the tail end. Note that the actual destination length
	978	// won't be an exactly number of blocks of four, so it's safe to just
	979	// drop through here. (NOTE(review): this relies on the tail length in x7 being nonzero -- confirm with the caller.)
	980	6: adc w10, wzr, wzr // w10 = carry flag out of the block loop
	981	7: ldr w16, [x0]
	982	sub x1, x7, #4
	983	adds w16, w16, w10
	984	str w16, [x0], #4
	985	cbz x1, 8f
	986	0: ldr w16, [x0]
	987	sub x1, x1, #4
	988	adcs w16, w16, wzr
	989	str w16, [x0], #4
	990	cbnz x1, 0b
	991
	992	// All done for this iteration. Start the next.
	993	8: cbz x8, 9f
	994	mov x0, x4 // -> dv for this pass
	995	add x4, x4, #16
	996	sub x1, x3, #4 // reset the inner loop counter
	997	mov x2, x5 // -> nv[0]
	998	sub x8, x8, #4
	999	sub x6, x6, #16 // one block fewer to carry through on this pass
	1000	bl mont4
	1001	b 5b
	1002
	1003	// All over.
	1004	9: popreg x29, x30
	1005	ret
	1006	ENDFUNC
	1007
	1008	///--------------------------------------------------------------------------
	1009	/// Testing and performance measurement.
	1010
	1011	#ifdef TEST_MUL4
	1012
	1013	// name internal dmul smul mmul mont
	1014	// z x0 x0 x0 x0 x0
	1015	// c x3 x1 x1 x1 x1
	1016	// y x4 -- -- x2 x2
	1017	// u x1 x2 -- x3 --
	1018	// x x2 x3 x2 x4 x3
	1019	// vv v2/v3 x4 -- x5 --
	1020	// yy v4/v5 x5 x3 x6 --
	1021	// mm v6/v7 -- -- -- x4
	1022	// n x5 x6 x4 x7 x5
	1023	// cyv x6 x7 x5 stk0 x6
	1024
		// STKARG(i) expands to the base-register/offset pair for a stack
		// argument, for use inside a load's brackets as in
		// `ldr x6, [STKARG(0)]'.  The 16-byte bias presumably skips the
		// x29/x30 pair saved by `pushreg' -- confirm against asm-common.h.
		// Keep this comment off the #define line so that it cannot become
		// part of the macro's expansion.
	1025	#define STKARG(i) sp, #16 + i
	1026
	1027	.macro testprologue mode
		// Common entry sequence for the test wrappers.  Saves the frame,
		// zeroes v31, then, according to \mode (dmul, smul, mmul or mont),
		// loads the 128-bit operands into vector registers and shuffles the
		// incoming arguments into the registers listed in the `internal'
		// column of the table above.
		//
		// Each `zip1'/`zip2' against the zero vector v31 interleaves the
		// operand's 16-bit pieces with zero halfwords, so every 32-bit lane
		// of the result holds one 16-bit piece, zero-extended.
		//
		// The register moves are order-sensitive: each source register is
		// read before it is overwritten.
	1028	pushreg x29, x30
	1029	setfp
	1030	endprologue
	1031	movi v31.4s, #0
	1032
	1033	.ifeqs "\mode", "dmul"
	1034	ldr q2, [x4]
	1035	zip2 v3.8h, v2.8h, v31.8h // (v'_2, v''_2; v'_3, v''_3)
	1036	zip1 v2.8h, v2.8h, v31.8h // (v'_0, v''_0; v'_1, v''_1)
	1037
	1038	ldr q4, [x5]
	1039	zip2 v5.8h, v4.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3)
	1040	zip1 v4.8h, v4.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1)
	1041
	1042	mov x16, x1 // stash incoming c while x1..x3 rotate
	1043	mov x1, x2 // -> u
	1044	mov x2, x3 // -> x
	1045	mov x3, x16 // -> c
	1046
	1047	mov x5, x6 // = n
	1048	mov x6, x7 // -> cyv
	1049	.endif
	1050
	1051	.ifeqs "\mode", "smul"
	1052	ldr q4, [x3]
	1053	zip2 v5.8h, v4.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3)
	1054	zip1 v4.8h, v4.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1)
	1055
	1056	// x2 // -> x
	1057	mov x3, x1 // -> c
	1058	mov x6, x5 // -> cyv
	1059	mov x5, x4 // = n
	1060	.endif
	1061
	1062	.ifeqs "\mode", "mmul"
	1063	ldr q2, [x5]
	1064	zip2 v3.8h, v2.8h, v31.8h // (v'_2, v''_2; v'_3, v''_3)
	1065	zip1 v2.8h, v2.8h, v31.8h // (v'_0, v''_0; v'_1, v''_1)
	1066
		// NOTE(review): the argument in x6 is loaded into v6/v7, which the
		// table above assigns to `mm', although the table and the two
		// comments below call it `yy'/y.  Presumably this operand is the
		// Montgomery factor -- confirm.
	1067	ldr q6, [x6]
	1068	zip2 v7.8h, v6.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3)
	1069	zip1 v6.8h, v6.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1)
	1070
	1071	mov x16, x1 // swap x1 and x3 through x16
	1072	mov x1, x3 // -> u
	1073	mov x3, x16 // -> c
	1074
	1075	mov x16, x2 // swap x2 and x4 through x16
	1076	mov x2, x4 // -> x
	1077	mov x4, x16 // -> y
	1078
	1079	mov x5, x7 // = n
	1080	ldr x6, [STKARG(0)] // -> cyv
	1081	.endif
	1082
	1083	.ifeqs "\mode", "mont"
	1084	ldr q6, [x4]
	1085	zip2 v7.8h, v6.8h, v31.8h // (m'_2, m''_2; m'_3, m''_3)
	1086	zip1 v6.8h, v6.8h, v31.8h // (m'_0, m''_0; m'_1, m''_1)
	1087
	1088	mov x4, x2 // -> y
	1089	mov x2, x3 // -> x
	1090	mov x3, x1 // -> c
	1091
	1092	// x5 // = n
	1093	// x6 // -> cyv
	1094	.endif
	1095	.endm
	1096
	1097	.macro testldcarry
		// Load the three vector carry registers v28-v30 from the buffer at
		// x3 (c) and clear the general-purpose register x15, which
		// `testcarryout' later folds back into v28.
	1098	ld1 {v28.2d-v30.2d}, [x3]
	1099	mov x15, #0
	1100	.endm
	1101
	1102	.macro testtop
		// Head of the repeat loop: defines local label 0 and decrements the
		// iteration counter x5.  Must be paired with `testtail', whose
		// `cbnz x5, 0b' closes the loop.
	1103	0: sub x5, x5, #1
	1104	.endm
	1105
	1106	.macro testtail
		// Foot of the repeat loop: branch back to the nearest preceding
		// label 0 (the one defined by `testtop') until x5 reaches zero.
	1107	cbnz x5, 0b
	1108	.endm
	1109
	1110	.macro testcarryout
	1111	// More complicated than usual because we must mix the general-
	1112	// purpose carry back in.
		// x15 is shifted right by 16 bits, placed in the low 64-bit lane of
		// v0 with the high lane zeroed, and added into v28.  Then v28-v30
		// are stored to the buffer at x3.  Clobbers x15 and v0.
	1113	lsr x15, x15, #16
	1114	mov v0.d[0], x15
	1115	mov v0.d[1], xzr
	1116	add v28.2d, v28.2d, v0.2d
	1117	st1 {v28.2d-v30.2d}, [x3]
	1118	.endm
	1119
	1120	.macro testepilogue
		// Restore x29/x30 (saved by `testprologue') and return to the
		// caller.
	1121	popreg x29, x30
	1122	ret
	1123	.endm
	1124
	1125	FUNC(test_dmul4)
		// Test wrapper for `dmul4'.  Arguments arrive in the `dmul' column
		// of the register table above.  The carry block is loaded from c,
		// `dmul4' is called repeatedly with x5 (n) as the loop counter, and
		// the resulting carries are written back to c.
	1126	testprologue dmul
	1127	testldcarry
	1128	testtop
	1129	bl dmul4
	1130	testtail
	1131	testcarryout
	1132	testepilogue
	1133	ENDFUNC
	1134
	1135	FUNC(test_dmla4)
		// Test wrapper for `dmla4'.  Arguments arrive in the `dmul' column
		// of the register table above.  The carry block is loaded from c,
		// `dmla4' is called repeatedly with x5 (n) as the loop counter, and
		// the resulting carries are written back to c.
	1136	testprologue dmul
	1137	testldcarry
	1138	testtop
	1139	bl dmla4
	1140	testtail
	1141	testcarryout
	1142	testepilogue
	1143	ENDFUNC
	1144
	1145	FUNC(test_mul4)
		// Test wrapper for `mul4'.  Arguments arrive in the `smul' column
		// of the register table above.  The carry block is loaded from c,
		// `mul4' is called repeatedly with x5 (n) as the loop counter, and
		// the resulting carries are written back to c.
	1146	testprologue smul
	1147	testldcarry
	1148	testtop
	1149	bl mul4
	1150	testtail
	1151	testcarryout
	1152	testepilogue
	1153	ENDFUNC
	1154
	1155	FUNC(test_mul4zc)
		// Test wrapper for `mul4zc'.  Arguments arrive in the `smul' column
		// of the register table above.  The carry block is loaded from c,
		// `mul4zc' is called repeatedly with x5 (n) as the loop counter,
		// and the resulting carries are written back to c.
	1156	testprologue smul
	1157	testldcarry
	1158	testtop
	1159	bl mul4zc
	1160	testtail
	1161	testcarryout
	1162	testepilogue
	1163	ENDFUNC
	1164
	1165	FUNC(test_mla4)
		// Test wrapper for `mla4'.  Arguments arrive in the `smul' column
		// of the register table above.  The carry block is loaded from c,
		// `mla4' is called repeatedly with x5 (n) as the loop counter, and
		// the resulting carries are written back to c.
	1166	testprologue smul
	1167	testldcarry
	1168	testtop
	1169	bl mla4
	1170	testtail
	1171	testcarryout
	1172	testepilogue
	1173	ENDFUNC
	1174
	1175	FUNC(test_mla4zc)
		// Test wrapper for `mla4zc'.  Arguments arrive in the `smul' column
		// of the register table above.  The carry block is loaded from c,
		// `mla4zc' is called repeatedly with x5 (n) as the loop counter,
		// and the resulting carries are written back to c.
	1176	testprologue smul
	1177	testldcarry
	1178	testtop
	1179	bl mla4zc
	1180	testtail
	1181	testcarryout
	1182	testepilogue
	1183	ENDFUNC
	1184
	1185	FUNC(test_mmul4)
		// Test wrapper for `mmul4'.  Arguments arrive in the `mmul' column
		// of the register table above.  `mmul4' is called repeatedly with
		// x5 (n) as the loop counter.  Afterwards v4/v5 are stored to the
		// buffer at x4 (y) and the carries are written back to c.
		//
		// NOTE(review): there is no `testldcarry' here, so this wrapper
		// initializes neither v28-v30 nor x15; presumably `mmul4' sets
		// them up itself -- confirm.
	1186	testprologue mmul
	1187	testtop
	1188	bl mmul4
	1189	testtail
	1190	stp q4, q5, [x4]
	1191	testcarryout
	1192	testepilogue
	1193	ENDFUNC
	1194
	1195	FUNC(test_mmla4)
		// Test wrapper for `mmla4'.  Arguments arrive in the `mmul' column
		// of the register table above.  `mmla4' is called repeatedly with
		// x5 (n) as the loop counter.  Afterwards v4/v5 are stored to the
		// buffer at x4 (y) and the carries are written back to c.
		//
		// NOTE(review): there is no `testldcarry' here, so this wrapper
		// initializes neither v28-v30 nor x15; presumably `mmla4' sets
		// them up itself -- confirm.
	1196	testprologue mmul
	1197	testtop
	1198	bl mmla4
	1199	testtail
	1200	stp q4, q5, [x4]
	1201	testcarryout
	1202	testepilogue
	1203	ENDFUNC
	1204
	1205	FUNC(test_mont4)
		// Test wrapper for `mont4'.  Arguments arrive in the `mont' column
		// of the register table above.  `mont4' is called repeatedly with
		// x5 (n) as the loop counter.  Afterwards v4/v5 are stored to the
		// buffer at x4 (y) and the carries are written back to c.
		//
		// NOTE(review): there is no `testldcarry' here, so this wrapper
		// initializes neither v28-v30 nor x15; presumably `mont4' sets
		// them up itself -- confirm.
	1206	testprologue mont
	1207	testtop
	1208	bl mont4
	1209	testtail
	1210	stp q4, q5, [x4]
	1211	testcarryout
	1212	testepilogue
	1213	ENDFUNC
	1214
	1215	#endif
	1216
	1217	///----- That's all, folks --------------------------------------------------