mdw@git.distorted.org.uk Git - catacomb/blame_incremental

... / ...

Commit	Line	Data
	1	/// -*- mode: asm; asm-comment-char: ?/ -*-
	2	///
	3	/// Large SIMD-based multiplications
	4	///
	5	/// (c) 2019 Straylight/Edgeware
	6	///
	7
	8	///----- Licensing notice ---------------------------------------------------
	9	///
	10	/// This file is part of Catacomb.
	11	///
	12	/// Catacomb is free software: you can redistribute it and/or modify it
	13	/// under the terms of the GNU Library General Public License as published
	14	/// by the Free Software Foundation; either version 2 of the License, or
	15	/// (at your option) any later version.
	16	///
	17	/// Catacomb is distributed in the hope that it will be useful, but
	18	/// WITHOUT ANY WARRANTY; without even the implied warranty of
	19	/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	20	/// Library General Public License for more details.
	21	///
	22	/// You should have received a copy of the GNU Library General Public
	23	/// License along with Catacomb. If not, write to the Free Software
	24	/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
	25	/// USA.
	26
	27	///--------------------------------------------------------------------------
	28	/// Preliminaries.
	29
	30	#include "config.h"
	31	#include "asm-common.h"
	32
	33	.arch armv7-a
	34	.fpu neon
	35
	36	.text
	37
	38	///--------------------------------------------------------------------------
	39	/// Theory.
	40	///
	41	/// We define a number of primitive fixed-size multipliers from which we can
	42	/// construct more general variable-length multipliers.
	43	///
	44	/// The basic trick is the same throughout. In an operand-scanning
	45	/// multiplication, the inner multiplication loop multiplies a multiple-
	46	/// precision operand by a single precision factor, and adds the result,
	47	/// appropriately shifted, to the result. A `finely integrated operand
	48	/// scanning' implementation of Montgomery multiplication also adds the
	49	/// product of a single-precision `Montgomery factor' and the modulus,
	50	/// calculated in the same pass. The more common `coarsely integrated
	51	/// operand scanning' alternates main multiplication and Montgomery passes,
	52	/// which requires additional carry propagation.
	53	///
	54	/// Throughout both plain-multiplication and Montgomery stages, then, one of
	55	/// the factors remains constant throughout the operation, so we can afford
	56	/// to take a little time to preprocess it. The transformation we perform is
	57	/// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
	58	/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
	59	/// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
	60	/// pieces are placed into 32-bit cells, and arranged as two 128-bit NEON
	61	/// operands, as follows.
	62	///
	63	/// Offset 12 8 4 0
	64	/// 0 v''_1 v'_1 v''_0 v'_0
	65	/// 16 v''_3 v'_3 v''_2 v'_2
	66	///
	67	/// The `vmull' and `vmlal' instructions can multiply a vector of two 32-bit
	68	/// values by a 32-bit scalar, giving two 64-bit results; thus, it will act
	69	/// on (say) v'_0 and v''_0 in a single instruction, to produce two 48-bit
	70	/// results in 64-bit fields. The sixteen bits of headroom allows us to add
	71	/// many products together before we must deal with carrying; it also allows
	72	/// for some calculations to be performed on the above expanded form.
	73	///
	74	/// We maintain three `carry' registers, q13--q15, accumulating intermediate
	75	/// results; we name them `c0', `c1', and `c2'. Each carry register holds
	76	/// two 64-bit halves: the register c0, for example, holds c'_0 (low half)
	77	/// and c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the
	78	/// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2.
	79	/// The `vmull' or `vmlal' instruction acting on a scalar operand and an
	80	/// operand in the expanded form above produces a result which can be added
	81	/// directly to the appropriate carry register.
	82	///
	83	/// Multiplication is performed in product-scanning order, since ARM
	84	/// processors commonly implement result forwarding for consecutive multiply-
	85	/// and-accumulate instructions specifying the same destination.
	86	/// Experimentally, this runs faster than operand-scanning in an attempt to
	87	/// hide instruction latencies.
	88	///
	89	/// On 32-bit ARM, we have a reasonable number of registers: the expanded
	90	/// operands are kept in registers. The packed operands are read from memory
	91	/// into working registers q0 and q1. The following conventional argument
	92	/// names and locations are used throughout.
	93	///
	94	/// Arg Format Location Notes
	95	///
	96	/// U packed [r1]
	97	/// X packed [r2] In Montgomery multiplication, X = N
	98	/// V expanded q2/q3
	99	/// Y expanded q4/q5 In Montgomery multiplication, Y = (A + U V) M
	100	/// M expanded q4/q5 -N^{-1} (mod B^4)
	101	/// N Modulus, for Montgomery multiplication
	102	/// A packed [r0] Destination/accumulator
	103	/// C carry q13--q15
	104	///
	105	/// The calculation is some variant of
	106	///
	107	/// A' + C' B^4 <- U V + X Y + A + C
	108	///
	109	/// The low-level functions fit into a fairly traditional (finely-integrated)
	110	/// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y)
	111	/// (indexed by i).
	112	///
	113	/// The variants are as follows.
	114	///
	115	/// Function Variant Use i j
	116	///
	117	/// mmul4 A = C = 0 Montgomery 0 0
	118	/// dmul4 A = 0 Montgomery 0 +
	119	/// mmla4 C = 0 Montgomery + 0
	120	/// dmla4 exactly as shown Montgomery + +
	121	///
	122	/// mul4zc U = V = A = C = 0 Plain 0 0
	123	/// mul4 U = V = A = 0 Plain 0 +
	124	/// mla4zc U = V = C = 0 Plain + 0
	125	/// mla4 U = V = 0 Plain + +
	126	///
	127	/// The `mmul4' and `mmla4' functions are also responsible for calculating
	128	/// the Montgomery reduction factor Y = (A + U V) M used by the rest of the
	129	/// inner loop.
	130
	131	///--------------------------------------------------------------------------
	132	/// Macro definitions.
	133
	134	.macro mulacc z, u, v, x=nil, y=nil
	135	// Set Z = Z + U V + X Y. X may be `nil' (the default) to omit the
	136	// second product, in which case Y is not used. Z should be a 128-bit
	137	// `qN' register; V and Y should be 64-bit `dN' registers; and U and X
	138	// should be 32-bit `dN[I]' scalars; the multiplications produce two
	139	// 64-bit elementwise products, which are added elementwise to Z.
	140
		// Z += U V (always).
	141	vmlal.u32 \z, \v, \u
		// Z += X Y, unless the caller declined the second product.
	142	.ifnes "\x", "nil"
	143	vmlal.u32 \z, \y, \x
	144	.endif
	145	.endm
	146
	147	.macro mulinit z, zinitp, u, v, x, y
	148	// If ZINITP is `t' then set Z = Z + U V + X Y, as for `mulacc';
	149	// otherwise (conventionally `nil') set Z = U V + X Y, discarding the
	150	// old Z. Operands and detailed operation are as for `mulacc'.
	151
	152	.ifeqs "\zinitp", "t"
	153	mulacc \z, \u, \v, \x, \y
	154	.else
		// Start afresh: the plain widening multiply overwrites Z.
	155	vmull.u32 \z, \v, \u
		// Then add X Y on top, unless X is `nil'.
	156	.ifnes "\x", "nil"
	157	vmlal.u32 \z, \y, \x
	158	.endif
	159	.endif
	160	.endm
	161
	162	// `MULI': accumulate the B^I and b B^I terms of the polynomial product sum U
	163	// V + X Y, given that U = u_0 + B u_1 + B^2 u_2 + B^3 u_3 (and similarly for
	164	// X), and V = v'_0 + b v''_0 + B (v'_1 + b v''_1) + B^2 (v'_2 + b v''_2) +
	165	// B^3 (v'_3 + b v''_3) (and similarly for Y). The 64-bit coefficients are
	166	// added into the low and high halves of the 128-bit register Z (if ZINITP is
	167	// not `t' then simply set Z, as if it were initially zero).
		//
		// U and X are `qN' registers holding packed operands; V0/V1 and Y0/Y1 are
		// the `qN' register pairs holding the expanded operands. Column I sums
		// every u_j v_k with j + k = I: word j of U is selected with QW(u, j), and
		// expanded word k of V with D0/D1 of V0 (k = 0, 1) or V1 (k = 2, 3). The
		// first product of each column goes through `mulinit' so that ZINITP can
		// take effect; the rest use `mulacc'. Callers in this file pass `nil' for
		// X, Y0 and Y1 to omit the X Y product (this presumably relies on QW, D0
		// and D1, defined elsewhere, passing `nil' through -- confirm there).
	168	#define MUL0(z, zinitp, u, v0, v1, x, y0, y1) \
	169	mulinit z, zinitp, QW(u, 0), D0(v0), QW(x, 0), D0(y0)
	170	#define MUL1(z, zinitp, u, v0, v1, x, y0, y1) \
	171	mulinit z, zinitp, QW(u, 0), D1(v0), QW(x, 0), D1(y0); \
	172	mulacc z, QW(u, 1), D0(v0), QW(x, 1), D0(y0)
	173	#define MUL2(z, zinitp, u, v0, v1, x, y0, y1) \
	174	mulinit z, zinitp, QW(u, 0), D0(v1), QW(x, 0), D0(y1); \
	175	mulacc z, QW(u, 1), D1(v0), QW(x, 1), D1(y0); \
	176	mulacc z, QW(u, 2), D0(v0), QW(x, 2), D0(y0)
	177	#define MUL3(z, zinitp, u, v0, v1, x, y0, y1) \
	178	mulinit z, zinitp, QW(u, 0), D1(v1), QW(x, 0), D1(y1); \
	179	mulacc z, QW(u, 1), D0(v1), QW(x, 1), D0(y1); \
	180	mulacc z, QW(u, 2), D1(v0), QW(x, 2), D1(y0); \
	181	mulacc z, QW(u, 3), D0(v0), QW(x, 3), D0(y0)
	182	#define MUL4(z, zinitp, u, v0, v1, x, y0, y1) \
	183	mulinit z, zinitp, QW(u, 1), D1(v1), QW(x, 1), D1(y1); \
	184	mulacc z, QW(u, 2), D0(v1), QW(x, 2), D0(y1); \
	185	mulacc z, QW(u, 3), D1(v0), QW(x, 3), D1(y0)
	186	#define MUL5(z, zinitp, u, v0, v1, x, y0, y1) \
	187	mulinit z, zinitp, QW(u, 2), D1(v1), QW(x, 2), D1(y1); \
	188	mulacc z, QW(u, 3), D0(v1), QW(x, 3), D0(y1)
	189	#define MUL6(z, zinitp, u, v0, v1, x, y0, y1) \
	190	mulinit z, zinitp, QW(u, 3), D1(v1), QW(x, 3), D1(y1)
	191
	192	// Steps in the process of propagating carry bits from ZLO to ZHI (both
	193	// 128-bit `qN' registers). Here, T is a 128-bit `qN' temporary register.
	194	// Set the low 32 bits of the 64-bit `dN' register ZOUT to the completed
	195	// coefficient z_i.
	196	//
	197	// In detail, what happens is as follows. Suppose initially that ZLO =
	198	// (z'_i; z''_i) and ZHI = (z'_{i+1}; z''_{i+1}). Let t = z'_i + b z''_i;
	199	// observe that floor(t/b) = floor(z'_i/b) + z''_i. Let z_i = t mod B, and
	200	// add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}. This has
	201	// a circuit depth of 4; I don't know how to do better.
	202	.macro _carry0 zout, zlo0, zlo1, t0, t1
	203	// ZLO0 and ZLO1 are the low and high halves of a carry register.
	204	// Extract a 32-bit output, in the bottom 32 bits of ZOUT, and set T1
	205	// so as to continue processing using `_carry1'. All operands are
	206	// 64-bit `dN' registers. If ZOUT is `nil' then no output is
	207	// produced; if T1 is `nil' then no further processing will be
	208	// possible.
		//
		// The output and carry computations are independent of each other,
		// and their instructions are interleaved below.
	209	.ifnes "\zout", "nil"
		// T0 = b z''_i
	210	vshl.i64 \t0, \zlo1, #16
	211	.endif
	212	.ifnes "\t1", "nil"
		// T1 = floor(z'_i/b)
	213	vshr.u64 \t1, \zlo0, #16
	214	.endif
	215	.ifnes "\zout", "nil"
		// ZOUT = z'_i + b z''_i = t; only its low 32 bits (z_i) are wanted.
	216	vadd.i64 \zout, \zlo0, \t0
	217	.endif
	218	.ifnes "\t1", "nil"
		// T1 = floor(z'_i/b) + z''_i = floor(t/b)
	219	vadd.i64 \t1, \t1, \zlo1
	220	.endif
	221	.endm
	222	.macro _carry1 u, zhi0, t1
	223	// ZHI0 is the low half of a carry register, and T1 is the result of
	224	// applying `_carry0' to the previous carry register. Set U to the
	225	// result of propagating the carry into ZHI0. T1 is overwritten
	226	// with floor(t/B) in the process.
		// T1 = floor(floor(t/b)/b) = floor(t/B)
	226	vshr.u64 \t1, \t1, #16
		// U = z'_{i+1} + floor(t/B)
	227	vadd.i64 \u, \zhi0, \t1
	228	.endm
	229
	230	// More convenient wrappers for `_carry0' and `_carry1'.
	231	//
	232	// CARRY0(ZOUT, ZLO, T)
	233	// Store a 32-bit output in ZOUT from carry ZLO, using T as a
	234	// temporary. ZOUT is a 64-bit `dN' register; ZLO and T are 128-bit
	235	// `qN' registers. ZLO itself is not modified unless ZOUT aliases
	236	// one of its halves; both halves of T are written.
	236	//
	237	// CARRY1(ZHI, T)
	238	// Propagate carry from T into ZHI. Both are 128-bit `qN' registers;
	239	// ZHI is updated (its low half only), and the high half of T is
		// overwritten. T must be the temporary left by the preceding
		// `CARRY0'.
	240	#define CARRY0(zout, zlo, t) \
	241	CASIDE0(zout, D0(zlo), zlo, t)
	242	#define CARRY1(zhi, t) \
	243	CASIDE1(D0(zhi), zhi, t)
	244
	245	// Versions of `CARRY0' and `CARRY1' which don't mutate their operands.
	246	//
	247	// CASIDE0(ZOUT, U, ZLO, T)
	248	// As for `CARRY0', but the low half of ZLO is actually in U (a 64-bit
	249	// `dN' register). Only the high half of ZLO is read from ZLO.
	250	//
	251	// CASIDE0E(ZOUT, U, ZLO, T)
	252	// As for `CASIDE0', but only calculate the output word, and no
	253	// carry-out. Only the low half of T is used as scratch.
	254	//
	255	// CASIDE1(U, ZHI, T)
	256	// As for `CARRY1', but write the updated low half of ZHI to U.
		// ZHI is left alone; the high half of T is still overwritten.
	257	#define CASIDE0(zout, u, zlo, t) \
	258	_carry0 zout, u, D1(zlo), D0(t), D1(t)
	259	#define CASIDE0E(zout, u, zlo, t) \
	260	_carry0 zout, u, D1(zlo), D0(t), nil
	261	#define CASIDE1(u, zhi, t) \
	262	_carry1 u, D0(zhi), D1(t)
	263
	264	// Steps in spreading a packed 128-bit operand in A0 across A0, A1, A2, A3 in
	265	// carry format.
		//
		// All four arguments are 128-bit `qN' registers. SPREADACC0 zeroes A1, A2
		// and A3, and does not touch A0, so it can be issued while the load into
		// A0 is still in flight. SPREADACC1 then moves words 2 and 3 of A0 into
		// the low half of A2, and splits each pair of words so that, afterwards,
		// the low 64-bit half of Ai holds word i zero-extended and the high half
		// of Ai is zero.
	266	#define SPREADACC0(a0, a1, a2, a3) \
	267	vmov.i32 a1, #0; \
	268	vmov.i32 a2, #0; \
	269	vmov.i32 a3, #0
	270	#define SPREADACC1(a0, a1, a2, a3) \
	271	vswp D1(a0), D0(a2); \
	272	vtrn.32 D0(a0), D0(a1); \
	273	vtrn.32 D0(a2), D0(a3)
	274
	275	// Add the carry-format values A0, A1, A2 into the existing carries C0, C1,
	276	// C2 (leaving A3 where it is).
		//
		// All arguments are 128-bit `qN' registers; the additions are done on
		// 64-bit lanes, so both halves of each carry register are updated. A3 is
		// accepted only for symmetry with `SPREADACC0'/`SPREADACC1' and is not
		// referenced.
	277	#define CARRYACC(a0, a1, a2, a3, c0, c1, c2) \
	278	vadd.i64 c0, c0, a0; \
	279	vadd.i64 c1, c1, a1; \
	280	vadd.i64 c2, c2, a2
	281
	282	///--------------------------------------------------------------------------
	283	/// Primitive multipliers and related utilities.
	284
	285	INTFUNC(carryprop)
	286	// On entry, r0 points to a destination, and q13--q15 hold incoming
	287	// carries c0--c2. On exit, the low 128 bits of the carry value are
	288	// stored at [r0]; the remaining 16 bits of carry are left in d30; r0
	289	// is advanced by 16; and q10--q14 are clobbered.
		//
		// No flag-setting instructions are used, so condition flags set by
		// the caller survive the call.
	290	endprologue
	291
		// Resolve c0, c1, c2 in turn: each `CARRY0' extracts one output
		// word, and each `CARRY1' pushes the overflow into the low half of
		// the next carry register.
	292	CARRY0(D0(q10), q13, q12)
	293	CARRY1(q14, q12)
	294	CARRY0(D0(q11), q14, q12)
	295	CARRY1(q15, q12)
	296	CARRY0(D1(q10), q15, q12)
		// The high half of q12 now holds floor(t/b) for c2. The fourth
		// output word is its bits 16--47, and whatever remains above bit 48
		// is the final carry, left in the low half of q15 (= d30).
	297	vshr.u64 D1(q11), D1(q12), #16
	298	vshr.u64 D0(q15), D1(q12), #48
		// Words 0 and 2 are in the even lanes of q10, words 1 and 3 in the
		// even lanes of q11; transpose to gather all four, in order, in q10.
	299	vtrn.32 q10, q11
	300	vst1.32 {q10}, [r0]!
	301	bx r14
	302	ENDFUNC
	303
	304	INTFUNC(dmul4)
	305	// On entry, r0 points to the destination; r1 and r2 point to packed
	306	// operands U and X; q2/q3 and q4/q5 hold expanded operands V and Y;
	307	// and q13--q15 hold incoming carries c0--c2. On exit, the
	308	// destination and carries are updated; r0, r1, r2 are each advanced
	309	// by 16; q2--q5 are preserved; and the other NEON registers are
	310	// clobbered.
		//
		// This is the variant with A = 0: the previous contents of the
		// destination are not read.
	311	endprologue
	312
	313	// Start by loading the operand words from memory.
	314	vld1.32 {q0}, [r1]!
	315	vld1.32 {q1}, [r2]!
	316
	317	// Do the multiplication.
		//
		// Columns 0--2 accumulate onto the incoming carries (`t'); column 3
		// starts afresh in q12. Carry resolution is interleaved with the
		// multiplications: as each of q13, q14, q15 is resolved to an output
		// word (collected in q8/q9), it is reused to start column 4, 5, 6
		// respectively, so that q13--q15 end up holding the outgoing
		// carries.
	318	MUL0(q13, t, q0, q2, q3, q1, q4, q5)
	319	MUL1(q14, t, q0, q2, q3, q1, q4, q5)
	320	CARRY0(D0(q8), q13, q6)
	321	MUL2(q15, t, q0, q2, q3, q1, q4, q5)
	322	CARRY1(q14, q6)
	323	CARRY0(D0(q9), q14, q6)
	324	MUL3(q12, nil, q0, q2, q3, q1, q4, q5)
	325	CARRY1(q15, q6)
	326	CARRY0(D1(q8), q15, q6)
	327	MUL4(q13, nil, q0, q2, q3, q1, q4, q5)
	328	CARRY1(q12, q6)
	329	CARRY0(D1(q9), q12, q6)
	330	MUL5(q14, nil, q0, q2, q3, q1, q4, q5)
	331	CARRY1(q13, q6)
	332	MUL6(q15, nil, q0, q2, q3, q1, q4, q5)
	333
	334	// Finish up and store the result.
		// (q8 has output words 0 and 2 in its even lanes, q9 has words 1 and
		// 3; the transpose gathers all four into q8.)
	335	vtrn.32 q8, q9
	336	vst1.32 {q8}, [r0]!
	337
	338	// All done.
	339	bx r14
	340	ENDFUNC
	341
	342	INTFUNC(dmla4)
	343	// On entry, r0 points to the destination/accumulator; r1 and r2
	344	// point to packed operands U and X; q2/q3 and q4/q5 hold expanded
	345	// operands V and Y; and q13--q15 hold incoming carries c0--c2. On
	346	// exit, the accumulator and carries are updated; r0, r1, r2 are each
	347	// advanced by 16; q2--q5 are preserved; and the other NEON registers
	348	// are clobbered.
		//
		// This is the full calculation A' + C' B^4 <- U V + X Y + A + C.
	349	endprologue
	350
	351	// Start by loading the operand words from memory.
		// (The accumulator is read without writeback: r0 is advanced only by
		// the final store.)
	352	vld1.32 {q9}, [r0]
	353	SPREADACC0(q9, q10, q11, q12)
	354	vld1.32 {q0}, [r1]!
	355	vld1.32 {q1}, [r2]!
	356
	357	// Add the accumulator input to the incoming carries. Split the
	358	// accumulator into four pieces and add the carries onto them.
		// (The fourth accumulator word stays in q12, where column 3 of the
		// product is accumulated onto it below.)
	359	SPREADACC1(q9, q10, q11, q12)
	360	CARRYACC(q9, q10, q11, q12, q13, q14, q15)
	361
	362	// Do the multiplication.
		//
		// As for `dmul4', except that column 3 accumulates (`t') onto the
		// accumulator word already in q12 rather than starting afresh.
	363	MUL0(q13, t, q0, q2, q3, q1, q4, q5)
	364	MUL1(q14, t, q0, q2, q3, q1, q4, q5)
	365	CARRY0(D0(q8), q13, q6)
	366	MUL2(q15, t, q0, q2, q3, q1, q4, q5)
	367	CARRY1(q14, q6)
	368	CARRY0(D0(q9), q14, q6)
	369	MUL3(q12, t, q0, q2, q3, q1, q4, q5)
	370	CARRY1(q15, q6)
	371	CARRY0(D1(q8), q15, q6)
	372	MUL4(q13, nil, q0, q2, q3, q1, q4, q5)
	373	CARRY1(q12, q6)
	374	CARRY0(D1(q9), q12, q6)
	375	MUL5(q14, nil, q0, q2, q3, q1, q4, q5)
	376	CARRY1(q13, q6)
	377	MUL6(q15, nil, q0, q2, q3, q1, q4, q5)
	378
	379	// Finish up and store the result.
	380	vtrn.32 q8, q9
	381	vst1.32 {q8}, [r0]!
	382
	383	// All done.
	384	bx r14
	385	ENDFUNC
	386
	387	INTFUNC(mul4)
	388	// On entry, r0 points to the destination; r2 points to a packed
	389	// operand X; q4/q5 holds an expanded operand Y; and q13--q15 hold
	390	// incoming carries c0--c2. On exit, the destination and carries are
	391	// updated; r0 and r2 are each advanced by 16; q4 and q5 are
	392	// preserved; and the other NEON registers are clobbered.
		//
		// This is the plain-multiplication variant with A = 0: only the
		// single product X Y is formed, and the previous contents of the
		// destination are not read. r1 is not used.
	393	endprologue
	394
	395	// Start by loading the operand words from memory.
	396	vld1.32 {q1}, [r2]!
	397
	398	// Do the multiplication.
		//
		// X (in q1) and Y (in q4/q5) are passed in the U and V positions of
		// the `MULn' macros, with the second product omitted. Columns 0--2
		// accumulate onto the incoming carries; column 3 starts afresh in
		// q12; columns 4--6 become the outgoing carries in q13--q15.
	399	MUL0(q13, t, q1, q4, q5, nil, nil, nil)
	400	MUL1(q14, t, q1, q4, q5, nil, nil, nil)
	401	CARRY0(D0(q8), q13, q6)
	402	MUL2(q15, t, q1, q4, q5, nil, nil, nil)
	403	CARRY1(q14, q6)
	404	CARRY0(D0(q9), q14, q6)
	405	MUL3(q12, nil, q1, q4, q5, nil, nil, nil)
	406	CARRY1(q15, q6)
	407	CARRY0(D1(q8), q15, q6)
	408	MUL4(q13, nil, q1, q4, q5, nil, nil, nil)
	409	CARRY1(q12, q6)
	410	CARRY0(D1(q9), q12, q6)
	411	MUL5(q14, nil, q1, q4, q5, nil, nil, nil)
	412	CARRY1(q13, q6)
	413	MUL6(q15, nil, q1, q4, q5, nil, nil, nil)
	414
	415	// Finish up and store the result.
	416	vtrn.32 q8, q9
	417	vst1.32 {q8}, [r0]!
	418
	419	// All done.
	420	bx r14
	421	ENDFUNC
	422
	423	INTFUNC(mul4zc)
	424	// On entry, r0 points to the destination; r2 points to a packed
	425	// operand X; and q4/q5 holds an expanded operand Y. On exit, the
	426	// destination is updated; q13--q15 hold outgoing carries c0--c2; r0
	427	// and r2 are each advanced by 16; q4 and q5 are preserved; and the
	428	// other NEON registers are clobbered.
		//
		// This is the plain-multiplication variant with A = C = 0: the
		// incoming contents of q13--q15 and of the destination are ignored.
	429	endprologue
	430
	431	// Start by loading the operand words from memory.
	432	vld1.32 {q1}, [r2]!
	433
	434	// Do the multiplication.
		//
		// Every column starts afresh (`nil'), which is what makes the
		// incoming carries irrelevant; otherwise this is as for `mul4'.
	435	MUL0(q13, nil, q1, q4, q5, nil, nil, nil)
	436	MUL1(q14, nil, q1, q4, q5, nil, nil, nil)
	437	CARRY0(D0(q8), q13, q6)
	438	MUL2(q15, nil, q1, q4, q5, nil, nil, nil)
	439	CARRY1(q14, q6)
	440	CARRY0(D0(q9), q14, q6)
	441	MUL3(q12, nil, q1, q4, q5, nil, nil, nil)
	442	CARRY1(q15, q6)
	443	CARRY0(D1(q8), q15, q6)
	444	MUL4(q13, nil, q1, q4, q5, nil, nil, nil)
	445	CARRY1(q12, q6)
	446	CARRY0(D1(q9), q12, q6)
	447	MUL5(q14, nil, q1, q4, q5, nil, nil, nil)
	448	CARRY1(q13, q6)
	449	MUL6(q15, nil, q1, q4, q5, nil, nil, nil)
	450
	451	// Finish up and store the result.
	452	vtrn.32 q8, q9
	453	vst1.32 {q8}, [r0]!
	454
	455	// All done.
	456	bx r14
	457	ENDFUNC
	458
	459	INTFUNC(mla4)
	460	// On entry, r0 points to the destination/accumulator; r2 points to a
	461	// packed operand X; q4/q5 holds an expanded operand Y; and q13--q15
	462	// hold incoming carries c0--c2. On exit, the accumulator and
	463	// carries are updated; r0 and r2 are each advanced by 16; q4 and q5
	464	// are preserved; and the other NEON registers are clobbered.
		//
		// This is the plain-multiplication variant which computes X Y + A +
		// C. The tail from `mla4_common' onwards is shared with `mla4zc',
		// which branches into it.
	465	endprologue
	466
	467	// Start by loading the operand words from memory.
	468	vld1.32 {q9}, [r0]
	469	SPREADACC0(q9, q10, q11, q12)
	470	vld1.32 {q1}, [r2]!
	471
	472	// Add the accumulator input to the incoming carries. Split the
	473	// accumulator into four pieces and add the carries onto them.
	474	SPREADACC1(q9, q10, q11, q12)
	475	CARRYACC(q9, q10, q11, q12, q13, q14, q15)
	476
	477	// Do the multiplication.
		//
		// On arrival at `mla4_common', q13--q15 and q12 must hold, in carry
		// format, the values onto which columns 0--3 are to be accumulated;
		// q1 must hold packed X; and r0 must still point to the destination.
	478	mla4_common:
	479	MUL0(q13, t, q1, q4, q5, nil, nil, nil)
	480	MUL1(q14, t, q1, q4, q5, nil, nil, nil)
	481	CARRY0(D0(q8), q13, q6)
	482	MUL2(q15, t, q1, q4, q5, nil, nil, nil)
	483	CARRY1(q14, q6)
	484	CARRY0(D0(q9), q14, q6)
	485	MUL3(q12, t, q1, q4, q5, nil, nil, nil)
	486	CARRY1(q15, q6)
	487	CARRY0(D1(q8), q15, q6)
	488	MUL4(q13, nil, q1, q4, q5, nil, nil, nil)
	489	CARRY1(q12, q6)
	490	CARRY0(D1(q9), q12, q6)
	491	MUL5(q14, nil, q1, q4, q5, nil, nil, nil)
	492	CARRY1(q13, q6)
	493	MUL6(q15, nil, q1, q4, q5, nil, nil, nil)
	494
	495	// Finish up and store the result.
	496	vtrn.32 q8, q9
	497	vst1.32 {q8}, [r0]!
	498
	499	// All done.
	500	bx r14
	501	ENDFUNC
	502
	503	INTFUNC(mla4zc)
	504	// On entry, r0 points to the destination/accumulator; r2 points to a
	505	// packed operand X; and q4/q5 holds an expanded operand Y. On exit,
	506	// the accumulator is updated; q13--q15 hold outgoing carries c0--c2;
	507	// r0 and r2 are each advanced by 16; q4 and q5 are preserved; and
	508	// the other NEON registers are clobbered.
		//
		// This is the plain-multiplication variant with C = 0: the incoming
		// contents of q13--q15 are ignored.
	509	endprologue
	510
	511	// Start by loading the operand words from memory.
		// (The accumulator is loaded directly into the first carry register,
		// so no separate addition is needed.)
	512	vld1.32 {q13}, [r0]
	513	SPREADACC0(q13, q14, q15, q12)
	514	vld1.32 {q1}, [r2]!
	515
	516	// Move the accumulator input to the incoming carry slots. Split the
	517	// accumulator into four pieces.
	518	SPREADACC1(q13, q14, q15, q12)
	519
		// The rest is identical to `mla4'; the `bx r14' there returns to our
		// caller.
	520	b mla4_common
	521	ENDFUNC
	522
	523	INTFUNC(mmul4)
	524	// On entry, r0 points to the destination; r1 points to a packed
	525	// operand U; r2 points to a packed operand X (the modulus); q2/q3
	526	// holds an expanded operand V; and q4/q5 holds an expanded operand M
	527	// (the Montgomery factor -N^{-1} (mod B)). On exit, the destination
	528	// is updated (to zero); q4/q5 hold an expanded factor Y = U V M (mod
	529	// B); q13--q15 hold outgoing carries c0--c2; r0, r1, and r2 are each
	530	// advanced by 16; q2 and q3 are preserved; and the other NEON
	531	// registers are clobbered.
		//
		// This is the Montgomery variant with A = C = 0. Only the first part
		// of the work is done here; the function then branches into the tail
		// shared with `mmla4', whose `bx r14' returns to our caller.
	532	endprologue
	533	// Start by loading the operand words from memory.
	534	vld1.32 {q0}, [r1]!
	535
	536	// Calculate the low half of W = A + U V, being careful to leave the
	537	// carries in place.
		//
		// Every column starts afresh (`nil') because there is no accumulator
		// or carry input. From column 1 onwards the `CASIDE' forms are used,
		// so the carry registers themselves are not modified: the running
		// carry lives in the low half of q9, and the output words of W are
		// collected in q6/q7.
	538	MUL0(q13, nil, q0, q2, q3, nil, nil, nil)
	539	MUL1(q14, nil, q0, q2, q3, nil, nil, nil)
	540	CARRY0(D0(q6), q13, q8)
	541	MUL2(q15, nil, q0, q2, q3, nil, nil, nil)
	542	CASIDE1(D0(q9), q14, q8)
	543	CASIDE0(D0(q7), D0(q9), q14, q8)
	544	MUL3(q12, nil, q0, q2, q3, nil, nil, nil)
	545	b mmla4_common
	546	ENDFUNC
	547
	548	INTFUNC(mmla4)
	549	// On entry, r0 points to the destination/accumulator A; r1 points to
	550	// a packed operand U; r2 points to a packed operand X (the modulus);
	551	// q2/q3 holds an expanded operand V; and q4/q5 holds an expanded
	552	// operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the
	553	// accumulator is updated (to zero); q4/q5 hold an expanded factor Y
	554	// = (A + U V) M (mod B); q13--q15 hold outgoing carries c0--c2; r0,
	555	// r1, and r2 are each advanced by 16; q2 and q3 are preserved; and
	556	// the other NEON registers are clobbered.
		//
		// This is the Montgomery variant with C = 0. The tail from
		// `mmla4_common' onwards is shared with `mmul4', which branches into
		// it.
	557	endprologue
	558
	559	// Start by loading the operand words from memory.
	560	vld1.32 {q13}, [r0]
	561	SPREADACC0(q13, q14, q15, q12)
	562	vld1.32 {q0}, [r1]!
	563
	564	// Move the accumulator input to the incoming carry slots. Split the
	565	// accumulator into four pieces.
	566	SPREADACC1(q13, q14, q15, q12)
	567
	568	// Calculate the low half of W = A + U V, being careful to leave the
	569	// carries in place.
		//
		// From column 1 onwards the `CASIDE' forms are used, so the carry
		// registers themselves are not modified: the running carry lives in
		// the low half of q9, and the output words of W are collected in
		// q6/q7.
	570	MUL0(q13, t, q0, q2, q3, nil, nil, nil)
	571	MUL1(q14, t, q0, q2, q3, nil, nil, nil)
	572	CARRY0(D0(q6), q13, q8)
	573	MUL2(q15, t, q0, q2, q3, nil, nil, nil)
	574	CASIDE1(D0(q9), q14, q8)
	575	CASIDE0(D0(q7), D0(q9), q14, q8)
	576	MUL3(q12, t, q0, q2, q3, nil, nil, nil)
		// On arrival here, q13--q15 and q12 hold columns 0--3 of W, still
		// unresolved; q6 and q7 hold output words 0 and 1 of W in their low
		// lanes; q8 holds the temporary from resolving column 1; q0 holds
		// packed U; and r0 and r2 have not yet been advanced.
	577	mmla4_common:
	578	CASIDE1(D0(q9), q15, q8)
	579	CASIDE0(D1(q6), D0(q9), q15, q8)
	580	CASIDE1(D0(q9), q12, q8)
		// Only the output word is wanted from the last column.
	581	CASIDE0E(D1(q7), D0(q9), q12, q8)
		// Gather the four words of W (mod B^4), packed, into q6.
	582	vtrn.32 q6, q7
	583
	584	// Calculate the low half of the Montgomery factor Y = W M. At this
	585	// point, registers are a little tight.
		//
		// Packed W (q6) takes the scalar role and expanded M (q4/q5) the
		// vector role. Only columns 0--3 are computed. q1 is used as the
		// carry temporary here, because packed X is not loaded into it until
		// afterwards; and q5 is cleared as soon as `MUL3' has finished with
		// M, ready for expanding Y.
	586	MUL0( q8, nil, q6, q4, q5, nil, nil, nil)
	587	MUL1( q9, nil, q6, q4, q5, nil, nil, nil)
	588	CARRY0(D0(q8), q8, q1)
	589	MUL2(q10, nil, q6, q4, q5, nil, nil, nil)
	590	CARRY1(q9, q1)
	591	CARRY0(D0(q9), q9, q1)
	592	MUL3(q11, nil, q6, q4, q5, nil, nil, nil)
	593	CARRY1(q10, q1)
	594	CARRY0(D1(q8), q10, q1)
	595	vmov.i32 q5, #0
	596	CARRY1(q11, q1)
	597	CARRY0(D1(q9), q11, q1)
	598	vld1.32 {q1}, [r2]!
	599	vtrn.32 q8, q9
	600
	601	// Expand Y. We'll put it in its proper place a bit later.
		// (Interleaving the 16-bit pieces of packed Y in q8 with the zeroes
		// in q5 leaves the expanded form in q8/q5.)
	602	vzip.16 q8, q5
	603
	604	// Build up the product X Y in the carry slots.
		//
		// This time the carries are resolved in place, but no output words
		// are extracted (`nil'): the function contract above says the
		// destination is set to zero.
	605	MUL0(q13, t, q1, q8, q5, nil, nil, nil)
	606	MUL1(q14, t, q1, q8, q5, nil, nil, nil)
	607	CARRY0(nil, q13, q9)
	608	MUL2(q15, t, q1, q8, q5, nil, nil, nil)
	609	CARRY1(q14, q9)
		// Here's the proper place: q4/q5 now hold expanded Y, as promised.
	610	vmov q4, q8
	611	CARRY0(nil, q14, q9)
	612	MUL3(q12, t, q1, q8, q5, nil, nil, nil)
	613	CARRY1(q15, q9)
	614	CARRY0(nil, q15, q9)
		// The zero block to be stored as the result.
	615	vmov.u32 q6, #0
	616
	617	// And complete the calculation.
		//
		// The high columns need both products, U V and X Y, and start afresh
		// in the carry registers, which have now all been resolved.
	618	MUL4(q13, nil, q0, q2, q3, q1, q4, q5)
	619	CARRY1(q12, q9)
	620	CARRY0(nil, q12, q9)
	621	MUL5(q14, nil, q0, q2, q3, q1, q4, q5)
	622	CARRY1(q13, q9)
	623	MUL6(q15, nil, q0, q2, q3, q1, q4, q5)
	624
	625	// Finish up and store the result.
	626	vst1.32 {q6}, [r0]!
	627
	628	// All done.
	629	bx r14
	630	ENDFUNC
	631
	632	INTFUNC(mont4)
	633	// On entry, r0 points to the destination/accumulator A; r2 points to
	634	// a packed operand X (the modulus); and q2/q3 holds an expanded
	635	// operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the
	636	// accumulator is updated (to zero); q4/q5 hold an expanded factor Y
	637	// = A M (mod B); q13--q15 hold outgoing carries c0--c2; r0 and r2
	638	// are each advanced by 16; q2 and q3 are preserved; and the other
	639	// NEON registers are clobbered.
		//
		// Note that M is expected in q2/q3 here, not in q4/q5 as for `mmul4'
		// and `mmla4'. r1 is not used.
	640	endprologue
	641
	642	// Start by loading the operand words from memory.
	643	vld1.32 {q0}, [r0]
	644	vld1.32 {q1}, [r2]!
	645
	646	// Calculate Y = A M (mod B).
		//
		// Packed A (q0) takes the scalar role and expanded M (q2/q3) the
		// vector role; only columns 0--3 are computed, with output words
		// collected in q4/q7. Interleaved with this, a copy of A is spread
		// into carry format in q13, q14, q15, q12, ready to have X Y added to
		// it.
	647	MUL0(q8, nil, q0, q2, q3, nil, nil, nil)
	648	MUL1(q9, nil, q0, q2, q3, nil, nil, nil)
	649	CARRY0(D0(q4), q8, q6)
	650	MUL2(q10, nil, q0, q2, q3, nil, nil, nil)
	651	CARRY1(q9, q6)
	652	vmov q13, q0
	653	CARRY0(D0(q7), q9, q6)
	654	MUL3(q11, nil, q0, q2, q3, nil, nil, nil)
	655	CARRY1(q10, q6)
	656	CARRY0(D1(q4), q10, q6)
	657	SPREADACC0(q13, q14, q15, q12)
	658	CARRY1(q11, q6)
	659	CARRY0(D1(q7), q11, q6)
	660	SPREADACC1(q13, q14, q15, q12)
		// Gather packed Y into q4, then expand it into q4/q5.
	661	vmov.i32 q5, #0
	662	vtrn.32 q4, q7
	663	vzip.16 q4, q5
	664
	665	// Calculate the actual result. Well, the carries, at least.
		//
		// Columns 0--3 of X Y are accumulated onto the spread-out A, and the
		// carries resolved without extracting output words (`nil'); columns
		// 4--6 start afresh and become the outgoing carries. The block
		// stored to the destination is the zero in q8.
	666	vmov.i32 q8, #0
	667	MUL0(q13, t, q1, q4, q5, nil, nil, nil)
	668	MUL1(q14, t, q1, q4, q5, nil, nil, nil)
	669	CARRY0(nil, q13, q6)
	670	MUL2(q15, t, q1, q4, q5, nil, nil, nil)
	671	CARRY1(q14, q6)
	672	CARRY0(nil, q14, q6)
	673	MUL3(q12, t, q1, q4, q5, nil, nil, nil)
	674	CARRY1(q15, q6)
	675	CARRY0(nil, q15, q6)
	676	MUL4(q13, nil, q1, q4, q5, nil, nil, nil)
	677	CARRY1(q12, q6)
	678	CARRY0(nil, q12, q6)
	679	MUL5(q14, nil, q1, q4, q5, nil, nil, nil)
	680	CARRY1(q13, q6)
	681	MUL6(q15, nil, q1, q4, q5, nil, nil, nil)
	682
	683	// Finish up and store the result.
	684	vst1.32 {q8}, [r0]!
	685
	686	// All done.
	687	bx r14
	688	ENDFUNC
	689
	690	///--------------------------------------------------------------------------
	691	/// Bulk multipliers.
	692
	693	FUNC(mpx_umul4_arm_neon)
	694	// void mpx_umul4_arm_neon(mpw *dv, const mpw *av, const mpw *avl,
	695	// const mpw *bv, const mpw *bvl);
		//
		// Multiply the integer at [av, avl) by the integer at [bv, bvl),
		// writing the product at dv. Both operands are processed in blocks
		// of four words (16 bytes); presumably the caller guarantees that
		// their lengths are multiples of four words and that dv is large
		// enough -- confirm against the C callers.
	696
	697	// Establish the arguments and do initial setup.
	698	//
	699	// inner loop dv r0
	700	// inner loop av r2
	701	// outer loop dv r5
	702	// outer loop bv r3
	703	// av base r1
	704	// av limit r12
	705	// bv limit r4
	706	pushreg r4, r5, r14
	707	pushvfp QQ(q4, q7)
	708	endprologue
	709
	710	// Prepare for the first iteration.
	711	vld1.32 {q4}, [r3]! // = Y = bv[0]
	712	vmov.i32 q5, #0
	713	// r0 // = dv for inner loop
	714	// r1 // = av base
	715	// r3 // = bv for outer loop
		// The fifth argument is on the stack, above the saved registers:
		// three core registers (12 bytes) plus q4--q7 (64 bytes), assuming
		// `pushreg' and `pushvfp' push exactly the registers named.
	716	ldr r4, [sp, #76] // = bv limit
	717	mov r12, r2 // = av limit
	718	mov r2, r1 // = av for inner loop
	719	add r5, r0, #16 // = dv for outer loop
	720	vzip.16 q4, q5 // expand Y
		// First pass: nothing has been written to dv yet, so use the
		// variants which don't read the destination.
	721	bl mul4zc
	722	cmp r2, r12 // all done?
	723	bhs 8f
	724
	725	// Continue with the first iteration.
	726	0: bl mul4
	727	cmp r2, r12 // all done?
	728	blo 0b
	729
	730	// Write out the leftover carry. There can be no tail here.
	731	8: bl carryprop
	732	cmp r3, r4 // more passes to do?
	733	bhs 9f
	734
	735	// Set up for the next pass.
		// Each later pass starts one block further along dv, and
		// accumulates onto what the earlier passes left there.
	736	1: vld1.32 {q4}, [r3]! // = Y = bv[i]
	737	vmov.i32 q5, #0
	738	mov r0, r5 // -> dv[i]
	739	mov r2, r1 // -> av[0]
	740	add r5, r5, #16
	741	vzip.16 q4, q5 // expand Y
	742	bl mla4zc
	743	cmp r2, r12 // done yet?
	744	bhs 8f
	745
	746	// Continue...
	747	0: bl mla4
	748	cmp r2, r12
	749	blo 0b
	750
	751	// Finish off this pass. There was no tail on the previous pass, and
	752	// there can be none on this pass.
	753	8: bl carryprop
	754	cmp r3, r4
	755	blo 1b
	756
	757	// All over.
	758	9: popvfp QQ(q4, q7)
	759	popreg r4, r5, r14
	760	bx r14
	761	ENDFUNC
	762
	763	FUNC(mpxmont_mul4_arm_neon)
	764	// void mpxmont_mul4_arm_neon(mpw *dv, const mpw *av, const mpw *bv,
	765	// const mpw *nv, size_t n, const mpw *mi);
		//
		// Montgomery multiplication: av, bv and nv are each n words long and
		// are processed in blocks of four words, and mi points to four words
		// of Montgomery factor. Presumably n is a positive multiple of four
		// -- confirm against the C callers.
	766
	767	// Establish the arguments and do initial setup.
	768	//
	769	// inner loop dv r0
	770	// inner loop av r1
	771	// inner loop nv r2
	772	// mi r5
	773	// outer loop dv r6
	774	// outer loop bv r7
	775	// av base r8
	776	// av limit r9
	777	// bv limit r4
	778	// nv base r3
	779	// n r4
	780	// c r10
	781	// 0 r12
	782
	783	pushreg r4-r10, r14
	784	pushvfp QQ(q4, q7)
	785	endprologue
	786
	787	// Establish the expanded operands.
		// The fifth and sixth arguments are on the stack, above the saved
		// registers: eight core registers (32 bytes) plus q4--q7 (64
		// bytes), assuming `pushreg' and `pushvfp' push exactly the
		// registers named.
	788	ldrd r4, r5, [sp, #96] // r4 = n; r5 -> mi
	789	vld1.32 {q2}, [r2] // = V = bv[0]
	790	vmov.i32 q3, #0
	791	vmov.i32 q5, #0
	792	vld1.32 {q4}, [r5] // = M
	793
	794	// Set up the outer loop state and prepare for the first iteration.
	795	// r0 // -> dv for inner loop
	796	// r1 // -> av for inner loop
	797	add r7, r2, #16 // -> bv
	798	// r3 // -> nv
	799	add r6, r0, #16 // -> dv
	800	mov r8, r1 // -> av
	801	add r9, r1, r4, lsl #2 // -> av limit
	802	add r4, r2, r4, lsl #2 // -> bv limit
	803	mov r2, r3 // -> nv for inner loop
	804	mov r12, #0 // = 0
	805
	806	vzip.16 q2, q3 // expand V
	807	vzip.16 q4, q5 // expand M
	808	bl mmul4
	809	cmp r1, r9 // done already?
	810	bhs 8f
	811
	812	// Complete the first inner loop.
	813	0: bl dmul4
	814	cmp r1, r9 // done yet?
	815	blo 0b
	816
	817	// Still have carries left to propagate. Rather than store the tail
	818	// end in memory, keep it in a general-purpose register for later.
	819	bl carryprop
	820	vmov.32 r10, QW(q15, 0)
	821
	822	// Embark on the next iteration. (There must be one. If n = 1 then
	823	// we would have bailed above, to label 8. Similarly, the subsequent
	824	// iterations can fall into the inner loop immediately.)
		// M has to be reloaded and re-expanded each time round, because
		// `mmla4' leaves Y in q4/q5.
	825	1: vld1.32 {q2}, [r7]! // = V = bv[i]
	826	vld1.32 {q4}, [r5] // = M
	827	vmov.i32 q3, #0
	828	vmov.i32 q5, #0
	829	mov r0, r6 // -> dv[i]
	830	add r6, r6, #16
	831	mov r1, r8 // -> av[0]
	832	mov r2, r3 // -> nv[0]
	833	vzip.16 q2, q3 // expand V
	834	vzip.16 q4, q5 // expand M
	835	bl mmla4
	836
	837	// Complete the next inner loop.
	838	0: bl dmla4
	839	cmp r1, r9 // done yet?
	840	blo 0b
	841
	842	// Still have carries left to propagate, and they overlap the
	843	// previous iteration's final tail, so read that and add it.
		// The end-of-loop comparison is done early: nothing between here
		// and the `blo' below (including `carryprop') sets the flags.
	844	cmp r7, r4
		// Zero-extend the saved tail (r12 = 0 supplies the high word) and
		// add it to the low half of c0.
	845	vmov.32 D0(q12), r10, r12
	846	vadd.i64 D0(q13), D0(q13), D0(q12)
	847	bl carryprop
	848	vmov.32 r10, QW(q15, 0)
	849
	850	// Back again, maybe.
	851	blo 1b
	852
	853	// All done, almost.
		// Write out the final tail word.
	854	str r10, [r0], #4
	855	popvfp QQ(q4, q7)
	856	popreg r4-r10, r14
	857	bx r14
	858
	859	// First iteration was short. Write out the carries and we're done.
	860	// (This could be folded into the main loop structure, but that would
	861	// penalize small numbers more.)
	862	8: bl carryprop
	863	vst1.32 {QW(q15, 0)}, [r0]!
	864	popvfp QQ(q4, q7)
	865	popreg r4-r10, r14
	866	bx r14
	867	ENDFUNC
	868
	869	FUNC(mpxmont_redc4_arm_neon)
	870	// void mpxmont_redc4_arm_neon(mpw *dv, mpw *dvl, const mpw *nv,
	871	// size_t n, const mpw *mi);
		//
		// Montgomery reduction, in place, of the integer at [dv, dvl) by the
		// n-word modulus at nv; mi points to four words of Montgomery
		// factor. The modulus is processed in blocks of four words;
		// presumably n is a positive multiple of four -- confirm against the
		// C callers.
	872
	873	// Establish the arguments and do initial setup.
	874	//
	875	// inner loop dv r0
	876	// dv limit r1
	877	// inner loop nv r2
	878	// blocks-of-4 dv limit r3
	879	// mi (r14)
	880	// outer loop dv r4
	881	// outer loop dv limit r5
	882	// nv base r6
	883	// nv limit r7
	884	// n r3
	885	// c (r14)
	886	// t0, t1, t2, t3 r2, r8, r9, r10
	887	// 0 r12
	888
	889	pushreg r4-r10, r14
	890	pushvfp QQ(q4, q7)
	891	endprologue
	892
	893	// Set up the outer loop state and prepare for the first iteration.
		// The fifth argument is on the stack, above the saved registers:
		// eight core registers (32 bytes) plus q4--q7 (64 bytes), assuming
		// `pushreg' and `pushvfp' push exactly the registers named. r14 is
		// free for use as scratch between calls, since it has been saved.
	894	ldr r14, [sp, #96] // -> mi
	895	vmov.i32 q3, #0
	896	sub r12, r1, r0 // total dv bytes
	897	// r0 // -> dv for inner loop
	898	// r1 // -> overall dv limit
	899	// r2 // -> nv for inner loop
	900	// r3 // = n (for now)
	901	add r4, r0, #16 // -> dv for outer loop
	902	add r5, r0, r3, lsl #2 // -> dv limit
	903	bic r12, r12, #15 // dv blocks of 4
	904	vld1.32 {q2}, [r14] // = M
	905	mov r6, r2 // -> nv
	906	add r7, r2, r3, lsl #2 // -> nv limit
	907	add r3, r0, r12 // -> dv blocks-of-4 limit
	908	vzip.16 q2, q3 // expand M
	909	mov r12, #0 // = 0
	910	bl mont4
	911	cmp r2, r7 // done already?
	912	bhs 8f
	913
	914	5: bl mla4
	915	cmp r2, r7 // done yet?
	916	blo 5b
	917
	918	// Still have carries left to propagate. Adding the accumulator
	919	// block into the carries is a little different this time, because
	920	// all four accumulator limbs have to be squished into the three
	921	// carry registers for `carryprop' to do its thing.
		// (The fourth limb is scaled by b and added to the high half of c2,
		// which carries a weight of b B^2, so that it counts for B^3.)
	922	8: vld1.32 {q9}, [r0]
	923	SPREADACC0(q9, q10, q11, q12)
	924	SPREADACC1(q9, q10, q11, q12)
	925	vshl.u64 D0(q12), D0(q12), #16
	926	CARRYACC(q9, q10, q11, q12, q13, q14, q15)
	927	vadd.u64 D1(q15), D1(q15), D0(q12)
	928
	929	bl carryprop
	930	vmov.32 r14, QW(q15, 0)
	931	cmp r0, r3
	932	bhs 7f
	933
	934	// Propagate the first group of carries.
	935	ldmia r0, {r2, r8-r10}
	936	adds r2, r2, r14
	937	adcs r8, r8, #0
	938	adcs r9, r9, #0
	939	adcs r10, r10, #0
	940	stmia r0!, {r2, r8-r10}
		// Use `teq' rather than `cmp' for the end test, here and below, so
		// that the carry flag from the `adcs' chain survives into the next
		// group.
	941	teq r0, r3
	942	beq 6f
	943
	944	// Continue carry propagation until the end of the buffer.
	945	0: ldmia r0, {r2, r8-r10}
	946	adcs r2, r2, #0
	947	adcs r8, r8, #0
	948	adcs r9, r9, #0
	949	adcs r10, r10, #0
	950	stmia r0!, {r2, r8-r10}
	951	teq r0, r3
	952	bne 0b
	953
	954	// Deal with the tail end. Note that the actual destination length
	955	// won't be an exact number of blocks of four, so it's safe to just
	956	// drop through here.
		// Turn the carry flag into a carry word (r12 = 0), so that the
		// word-at-a-time code at 7 can be entered either from here or
		// directly from above with the `carryprop' output in r14.
	957	6: adc r14, r12, #0
	958	7: ldr r2, [r0]
	959	adds r2, r2, r14
	960	str r2, [r0], #4
	961	teq r0, r1
	962	beq 8f
	963	0: ldr r2, [r0]
	964	adcs r2, r2, #0
	965	str r2, [r0], #4
	966	teq r0, r1
	967	bne 0b
	968
	969	// All done for this iteration. Start the next.
		// (`mont4' handles the first block of the modulus; the branch back
		// to 5 handles the rest. This assumes the modulus has more than one
		// block; the single-block case is only catered for on the very
		// first pass, by the `bhs 8f' above.)
	970	8: cmp r4, r5
	971	bhs 9f
	972	mov r0, r4
	973	add r4, r4, #16
	974	mov r2, r6
	975	bl mont4
	976	b 5b
	977
	978	// All over.
	979	9: popvfp QQ(q4, q7)
	980	popreg r4-r10, r14
	981	bx r14
	982	ENDFUNC
	983
	984	///--------------------------------------------------------------------------
	985	/// Testing and performance measurement.
	986
	987	#ifdef TEST_MUL4
	988
	989	// arg internal dmul smul mmul mont
	990	// z r0 r0 r0 r0 r0
	991	// c r4 r1 r1 r1 r1
	992	// y r3 -- -- r2 r2
	993	// u r1 r2 -- r3 --
	994	// x r2 r3 r2 stk0 r3
	995	// vv q2/q3 stk0 -- stk1 stk0
	996	// yy q4/q5 stk1 r3 stk2 --
	997	// n r5 stk2 stk0 stk3 stk1
	998	// cyv r6 stk3 stk1 stk4 stk2
	999	// STKARG(i): operand for stack argument i, skipping the 80 bytes (4 words + q4-q7) testprologue saves.
	1000	#define STKARG(i) sp, #80 + 4*(i)
	1001
	1002	.macro testprologue mode
	1003	pushreg r4-r6, r14
	1004	pushvfp QQ(q4, q7)
	1005	endprologue
	1006	// Move the C-level arguments for the given mode into the internal register layout tabulated above.
	1007	.ifeqs "\mode", "dmul"
	1008	mov r4, r1 // -> c
	1009	mov r1, r2 // -> u
	1010	mov r2, r3 // -> x
	1011
	1012	ldr r14, [STKARG(0)] // -> vv
	1013	vld1.32 {q2}, [r14]
	1014	vmov.i32 q3, #0 // zeroes for the zip
	1015	vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
	1016
	1017	ldr r14, [STKARG(1)] // -> yy
	1018	vld1.32 {q4}, [r14]
	1019	vmov.i32 q5, #0 // zeroes for the zip
	1020	vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
	1021
	1022	ldr r5, [STKARG(2)] // = n
	1023	ldr r6, [STKARG(3)] // -> cyv
	1024	.endif
	1025
	1026	.ifeqs "\mode", "smul"
	1027	mov r4, r1 // -> c
	1028	// r2 // -> x (already in place)
	1029
	1030	vld1.32 {q4}, [r3] // yy arrives in r3 for this mode
	1031	vmov.i32 q5, #0 // zeroes for the zip
	1032	vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
	1033
	1034	ldr r5, [STKARG(0)] // = n
	1035	ldr r6, [STKARG(1)] // -> cyv
	1036	.endif
	1037
	1038	.ifeqs "\mode", "mmul"
	1039	mov r4, r1 // -> c
	1040	mov r1, r3 // -> u
	1041	mov r3, r2 // -> y
	1042	ldr r2, [STKARG(0)] // -> x
	1043
	1044	ldr r14, [STKARG(1)] // -> vv
	1045	vld1.32 {q2}, [r14]
	1046	vmov.i32 q3, #0 // zeroes for the zip
	1047	vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
	1048
	1049	ldr r14, [STKARG(2)] // -> yy
	1050	vld1.32 {q4}, [r14]
	1051	vmov.i32 q5, #0 // zeroes for the zip
	1052	vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
	1053
	1054	ldr r5, [STKARG(3)] // = n
	1055	ldr r6, [STKARG(4)] // -> cyv
	1056	.endif
	1057
	1058	.ifeqs "\mode", "mont"
	1059	mov r4, r1 // -> c
	1060	mov r1, r3 // -> u (same pointer as x; NOTE(review): the table lists no u for mont -- confirm intended)
	1061	mov r14, r2 // stash y while r2 is overwritten
	1062	mov r2, r3 // -> x
	1063	mov r3, r14 // -> y
	1064
	1065	ldr r14, [STKARG(0)] // -> vv
	1066	vld1.32 {q2}, [r14]
	1067	vmov.i32 q3, #0 // zeroes for the zip
	1068	vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
	1069
	1070	ldr r5, [STKARG(1)] // = n
	1071	ldr r6, [STKARG(2)] // -> cyv
	1072	.endif
	1073	.endm
	1074
	1075	.macro testldcarry
	1076	vldmia r4, {QQ(q13, q15)} // c0, c1, c2: carry-in loaded from c (r4) into q13-q15
	1077	.endm
	1078
	1079	.macro testtop
	1080	0: subs r5, r5, #1 // loop head: count n down first (n = 0 would wrap); Z must survive to testtail
	1081	.endm
	1082
	1083	.macro testtail
	1084	bne 0b // loop back to testtop's `0:' until its countdown reached zero
	1085	.endm
	1086
	1087	.macro testcarryout
	1088	vstmia r4, {QQ(q13, q15)} // store q13-q15 back through c (r4), mirroring testldcarry
	1089	.endm
	1090
	1091	.macro testepilogue
	1092	popvfp QQ(q4, q7)
	1093	popreg r4-r6, r14
	1094	bx r14 // return, after undoing testprologue's saves in reverse order
	1095	.endm
	1096
	1097	FUNC(test_dmul4)
	1098	testprologue dmul
	1099	testldcarry
	1100	testtop
	1101	bl dmul4 // primitive under test; repeated while the r5 countdown is nonzero
	1102	testtail
	1103	testcarryout
	1104	testepilogue
	1105	ENDFUNC
	1106
	1107	FUNC(test_dmla4)
	1108	testprologue dmul
	1109	testldcarry
	1110	testtop
	1111	bl dmla4 // primitive under test; repeated while the r5 countdown is nonzero
	1112	testtail
	1113	testcarryout
	1114	testepilogue
	1115	ENDFUNC
	1116
	1117	FUNC(test_mul4)
	1118	testprologue smul
	1119	testldcarry
	1120	testtop
	1121	bl mul4 // primitive under test; repeated while the r5 countdown is nonzero
	1122	testtail
	1123	testcarryout
	1124	testepilogue
	1125	ENDFUNC
	1126
	1127	FUNC(test_mul4zc)
	1128	testprologue smul
	1129	testldcarry
	1130	testtop
	1131	bl mul4zc // primitive under test; repeated while the r5 countdown is nonzero
	1132	testtail
	1133	testcarryout
	1134	testepilogue
	1135	ENDFUNC
	1136
	1137	FUNC(test_mla4)
	1138	testprologue smul
	1139	testldcarry
	1140	testtop
	1141	bl mla4 // primitive under test; repeated while the r5 countdown is nonzero
	1142	testtail
	1143	testcarryout
	1144	testepilogue
	1145	ENDFUNC
	1146
	1147	FUNC(test_mla4zc)
	1148	testprologue smul
	1149	testldcarry
	1150	testtop
	1151	bl mla4zc // primitive under test; repeated while the r5 countdown is nonzero
	1152	testtail
	1153	testcarryout
	1154	testepilogue
	1155	ENDFUNC
	1156
	1157	FUNC(test_mmul4)
	1158	testprologue mmul
	1159	testtop
	1160	bl mmul4 // primitive under test; note that no carry-in is loaded before the loop
	1161	testtail
	1162	vst1.32 {q4, q5}, [r3] // write q4, q5 out through r3 (-> y in this mode)
	1163	testcarryout
	1164	testepilogue
	1165	ENDFUNC
	1166
	1167	FUNC(test_mmla4)
	1168	testprologue mmul
	1169	testtop
	1170	bl mmla4 // primitive under test; note that no carry-in is loaded before the loop
	1171	testtail
	1172	vst1.32 {q4, q5}, [r3] // write q4, q5 out through r3 (-> y in this mode)
	1173	testcarryout
	1174	testepilogue
	1175	ENDFUNC
	1176
	1177	FUNC(test_mont4)
	1178	testprologue mont
	1179	testtop
	1180	bl mont4 // primitive under test; note that no carry-in is loaded before the loop
	1181	testtail
	1182	vst1.32 {q4, q5}, [r3] // write q4, q5 out through r3 (-> y in this mode)
	1183	testcarryout
	1184	testepilogue
	1185	ENDFUNC
	1186
	1187	#endif
	1188
	1189	///----- That's all, folks --------------------------------------------------