/// -*- mode: asm; asm-comment-char: ?/ -*- /// /// Large SIMD-based multiplications /// /// (c) 2019 Straylight/Edgeware /// ///----- Licensing notice --------------------------------------------------- /// /// This file is part of Catacomb. /// /// Catacomb is free software: you can redistribute it and/or modify it /// under the terms of the GNU Library General Public License as published /// by the Free Software Foundation; either version 2 of the License, or /// (at your option) any later version. /// /// Catacomb is distributed in the hope that it will be useful, but /// WITHOUT ANY WARRANTY; without even the implied warranty of /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU /// Library General Public License for more details. /// /// You should have received a copy of the GNU Library General Public /// License along with Catacomb. If not, write to the Free Software /// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, /// USA. ///-------------------------------------------------------------------------- /// Preliminaries. #include "config.h" #include "asm-common.h" .arch armv7-a .fpu neon .text ///-------------------------------------------------------------------------- /// Theory. /// /// We define a number of primitive fixed-size multipliers from which we can /// construct more general variable-length multipliers. /// /// The basic trick is the same throughout. In an operand-scanning /// multiplication, the inner multiplication loop multiplies a multiple- /// precision operand by a single-precision factor, and adds the result, /// appropriately shifted, to the result accumulator. A `finely integrated /// operand scanning' implementation of Montgomery multiplication also adds /// the product of a single-precision `Montgomery factor' and the modulus, /// calculated in the same pass. The more common `coarsely integrated /// operand scanning' alternates main multiplication and Montgomery passes, /// which requires additional carry propagation. /// /// Throughout both plain-multiplication and Montgomery stages, then, one of /// the factors remains constant throughout the operation, so we can afford /// to take a little time to preprocess it. The transformation we perform is /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit /// pieces are placed into 32-bit cells, and arranged as two 128-bit NEON /// operands, as follows. /// /// Offset 12 8 4 0 /// 0 v''_1 v'_1 v''_0 v'_0 /// 16 v''_3 v'_3 v''_2 v'_2 /// /// The `vmull' and `vmlal' instructions can multiply a vector of two 32-bit /// values by a 32-bit scalar, giving two 64-bit results; thus, they will act /// on (say) v'_0 and v''_0 in a single instruction, to produce two 48-bit /// results in 64-bit fields. The sixteen bits of headroom allows us to add /// many products together before we must deal with carrying; it also allows /// for some calculations to be performed on the above expanded form. /// /// We maintain three `carry' registers, q13--q15, accumulating intermediate /// results; we name them `c0', `c1', and `c2'. Each carry register holds /// two 64-bit halves: the register c0, for example, holds c'_0 (low half) /// and c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the /// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2.
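/// For concreteness, here is a minimal C model (illustrative only, not part
/// of the build) of the expansion step just described, assuming 32-bit
/// words; below, the same job is done by `vzip.16' against a register of
/// zeroes.
///
///	static void expand4(uint32_t out[8], const uint32_t v[4])
///	{
///	  for (int i = 0; i < 4; i++) {
///	    out[2*i] = v[i]&0xffff;	/* v'_i, in a 32-bit cell */
///	    out[2*i + 1] = v[i] >> 16;	/* v''_i, likewise */
///	  }
///	}
///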
/// The `vmull' or `vmlal' instruction acting on a scalar operand and an /// operand in the expanded form above produces a result which can be added /// directly to the appropriate carry register. /// /// Multiplication is performed in product-scanning order, since ARM /// processors commonly implement result forwarding for consecutive multiply- /// and-accumulate instructions specifying the same destination. /// Experimentally, this runs faster than an operand-scanning arrangement /// which attempts to hide instruction latencies. /// /// On 32-bit ARM, we have a reasonable number of registers: the expanded /// operands are kept in registers. The packed operands are read from memory /// into working registers q0 and q1. The following conventional argument /// names and locations are used throughout. /// /// Arg Format Location Notes /// /// U packed [r1] /// X packed [r2] In Montgomery multiplication, X = N /// V expanded q2/q3 /// Y expanded q4/q5 In Montgomery multiplication, Y = (A + U V) M /// M expanded q4/q5 -N^{-1} (mod B^4) /// N Modulus, for Montgomery multiplication /// A packed [r0] Destination/accumulator /// C carry q13--q15 /// /// The calculation is some variant of /// /// A' + C' B^4 <- U V + X Y + A + C /// /// The low-level functions fit into a fairly traditional (finely-integrated) /// operand-scanning loop over operand pairs (U, X) (indexed by j) and (V, Y) /// (indexed by i). /// /// The variants are as follows. /// /// Function Variant Use i j /// /// mmul4 A = C = 0 Montgomery 0 0 /// dmul4 A = 0 Montgomery 0 + /// mmla4 C = 0 Montgomery + 0 /// dmla4 exactly as shown Montgomery + + /// /// mul4zc U = V = A = C = 0 Plain 0 0 /// mul4 U = V = A = 0 Plain 0 + /// mla4zc U = V = C = 0 Plain + 0 /// mla4 U = V = 0 Plain + + /// /// The `mmul4' and `mmla4' functions are also responsible for calculating /// the Montgomery reduction factor Y = (A + U V) M used by the rest of the /// inner loop. ///-------------------------------------------------------------------------- /// Macro definitions. .macro mulacc z, u, v, x=nil, y=nil // Set Z = Z + U V + X Y. X may be `nil' to omit the second // operand pair. Z should be a 128-bit `qN' register; V and Y should be // 64-bit `dN' registers; and U and X should be 32-bit `dN[I]' // scalars; the multiplications produce two 64-bit elementwise // products, which are added elementwise to Z. vmlal.u32 \z, \v, \u .ifnes "\x", "nil" vmlal.u32 \z, \y, \x .endif .endm .macro mulinit z, zinitp, u, v, x, y // If ZINITP then set Z = Z + U V + X Y, as for `mulacc'; otherwise, // set Z = U V + X Y. Operand requirements and detailed operation // are as for `mulacc'. .ifeqs "\zinitp", "t" mulacc \z, \u, \v, \x, \y .else vmull.u32 \z, \v, \u .ifnes "\x", "nil" vmlal.u32 \z, \y, \x .endif .endif .endm // `MULI': accumulate the B^I and b B^I terms of the polynomial product sum U // V + X Y, given that U = u_0 + B u_1 + B^2 u_2 + B^3 u_3 (and similarly for // X), and V = v'_0 + b v''_0 + B (v'_1 + b v''_1) + B^2 (v'_2 + b v''_2) + // B^3 (v'_3 + b v''_3) (and similarly for Y). The 64-bit coefficients are // added into the low and high halves of the 128-bit register Z (if ZINITP is // `nil' then simply set Z, as if it were initially zero).
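// As a cross-check, here is a scalar C model (illustrative only, not part
// of the build) of one `MULk' step: it accumulates the B^k coefficient of
// U V into the two 64-bit halves of a carry, with V supplied in expanded
// form as vx[2 j] = v'_j and vx[2 j + 1] = v''_j.  The X Y terms, when
// present, follow exactly the same pattern.
//
//	static void mulk(uint64_t *zlo, uint64_t *zhi, int k,
//			 const uint32_t u[4], const uint32_t vx[8])
//	{
//	  for (int i = 0; i <= 3; i++) {
//	    int j = k - i;
//	    if (j < 0 || j > 3) continue;
//	    *zlo += (uint64_t)u[i]*vx[2*j];	/* u_i v'_j */
//	    *zhi += (uint64_t)u[i]*vx[2*j + 1];	/* u_i v''_j, worth b per unit */
//	  }
//	}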
#define MUL0(z, zinitp, u, v0, v1, x, y0, y1) \ mulinit z, zinitp, QW(u, 0), D0(v0), QW(x, 0), D0(y0) #define MUL1(z, zinitp, u, v0, v1, x, y0, y1) \ mulinit z, zinitp, QW(u, 0), D1(v0), QW(x, 0), D1(y0); \ mulacc z, QW(u, 1), D0(v0), QW(x, 1), D0(y0) #define MUL2(z, zinitp, u, v0, v1, x, y0, y1) \ mulinit z, zinitp, QW(u, 0), D0(v1), QW(x, 0), D0(y1); \ mulacc z, QW(u, 1), D1(v0), QW(x, 1), D1(y0); \ mulacc z, QW(u, 2), D0(v0), QW(x, 2), D0(y0) #define MUL3(z, zinitp, u, v0, v1, x, y0, y1) \ mulinit z, zinitp, QW(u, 0), D1(v1), QW(x, 0), D1(y1); \ mulacc z, QW(u, 1), D0(v1), QW(x, 1), D0(y1); \ mulacc z, QW(u, 2), D1(v0), QW(x, 2), D1(y0); \ mulacc z, QW(u, 3), D0(v0), QW(x, 3), D0(y0) #define MUL4(z, zinitp, u, v0, v1, x, y0, y1) \ mulinit z, zinitp, QW(u, 1), D1(v1), QW(x, 1), D1(y1); \ mulacc z, QW(u, 2), D0(v1), QW(x, 2), D0(y1); \ mulacc z, QW(u, 3), D1(v0), QW(x, 3), D1(y0) #define MUL5(z, zinitp, u, v0, v1, x, y0, y1) \ mulinit z, zinitp, QW(u, 2), D1(v1), QW(x, 2), D1(y1); \ mulacc z, QW(u, 3), D0(v1), QW(x, 3), D0(y1) #define MUL6(z, zinitp, u, v0, v1, x, y0, y1) \ mulinit z, zinitp, QW(u, 3), D1(v1), QW(x, 3), D1(y1) // Steps in the process of propagating carry bits from ZLO to ZHI (both // 128-bit `qN' registers). Here, T is a 128-bit `qN' temporary register. // Set the low 32 bits of the 64-bit `dN' register ZOUT to the completed // coefficient z_i. // // In detail, what happens is as follows. Suppose initially that ZLO = // (z'_i; z''_i) and ZHI = (z'_{i+1}; z''_{i+1}). Let t = z'_i + b z''_i; // observe that floor(t/b) = floor(z'_i/b) + z''_i. Let z_i = t mod B, and // add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}. This has // a circuit depth of 4; I don't know how to do better. .macro _carry0 zout, zlo0, zlo1, t0, t1 // ZLO0 and ZLO1 are the low and high halves of a carry register. // Extract a 32-bit output, in the bottom 32 bits of ZOUT, and set T1 // so as to continue processing using `_carry1'. All operands are // 64-bit `dN' registers. If ZOUT is `nil' then no output is // produced; if T1 is `nil' then no further processing will be // possible. .ifnes "\zout", "nil" vshl.i64 \t0, \zlo1, #16 .endif .ifnes "\t1", "nil" vshr.u64 \t1, \zlo0, #16 .endif .ifnes "\zout", "nil" vadd.i64 \zout, \zlo0, \t0 .endif .ifnes "\t1", "nil" vadd.i64 \t1, \t1, \zlo1 .endif .endm .macro _carry1 u, zhi0, t1 // ZHI0 is the low half of a carry register, and T1 is the result of // applying `_carry0' to the previous carry register. Set U to the // result of propagating the carry into ZHI0. vshr.u64 \t1, \t1, #16 vadd.i64 \u, \zhi0, \t1 .endm // More convenient wrappers for `_carry0' and `_carry1'. // // CARRY0(ZOUT, ZLO, T) // Store a 32-bit output in ZOUT from carry ZLO, using T as a // temporary. ZOUT is a 64-bit `dN' register; ZLO and T are 128-bit // `qN' registers. // // CARRY1(ZHI, T) // Propagate carry from T into ZHI. Both are 128-bit `qN' registers; // ZHI is updated. #define CARRY0(zout, zlo, t) \ CASIDE0(zout, D0(zlo), zlo, t) #define CARRY1(zhi, t) \ CASIDE1(D0(zhi), zhi, t) // Versions of `CARRY0' and `CARRY1' which don't mutate their operands. // // CASIDE0(ZOUT, U, ZLO, T) // As for `CARRY0', but the low half of ZLO is actually in U (a 64-bit // `dN' register). // // CASIDE0E(ZOUT, U, ZLO, T) // As for `CASIDE0', but only calculate the output word, and no // carry-out. // // CASIDE1(U, ZHI, T) // As for `CARRY1', but write the updated low half of ZHI to U. 
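// In the same spirit, a C model (again illustrative, not part of the
// build) of one complete carry step, i.e., `_carry0' followed by
// `_carry1': it returns the completed coefficient z_i and folds the
// carry-out into the low half of the next carry register.  The staged
// shifts match the macros above, so that no intermediate overflows its 64
// bits; the truncated sum needs only the low 32 bits, so the wrap in
// zlo1 << 16 is harmless.
//
//	static uint32_t carrystep(uint64_t zlo0, uint64_t zlo1,
//				  uint64_t *zhi0)
//	{
//	  uint32_t z = (uint32_t)(zlo0 + (zlo1 << 16)); /* z_i = t mod B */
//	  *zhi0 += ((zlo0 >> 16) + zlo1) >> 16;	/* floor(t/B) */
//	  return (z);
//	}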
#define CASIDE0(zout, u, zlo, t) \ _carry0 zout, u, D1(zlo), D0(t), D1(t) #define CASIDE0E(zout, u, zlo, t) \ _carry0 zout, u, D1(zlo), D0(t), nil #define CASIDE1(u, zhi, t) \ _carry1 u, D0(zhi), D1(t) // Steps in spreading a packed 128-bit operand in A0 across A0, A1, A2, A3 in // carry format. #define SPREADACC0(a0, a1, a2, a3) \ vmov.i32 a1, #0; \ vmov.i32 a2, #0; \ vmov.i32 a3, #0 #define SPREADACC1(a0, a1, a2, a3) \ vswp D1(a0), D0(a2); \ vtrn.32 D0(a0), D0(a1); \ vtrn.32 D0(a2), D0(a3) // Add the carry-format values A0, A1, A2 into the existing carries C0, C1, // C2 (leaving A3 where it is). #define CARRYACC(a0, a1, a2, a3, c0, c1, c2) \ vadd.i64 c0, c0, a0; \ vadd.i64 c1, c1, a1; \ vadd.i64 c2, c2, a2 ///-------------------------------------------------------------------------- /// Primitive multipliers and related utilities. INTFUNC(carryprop) // On entry, r0 points to a destination, and q13--q15 hold incoming // carries c0--c2. On exit, the low 128 bits of the carry value are // stored at [r0]; the remaining 16 bits of carry are left in d30; r0 // is advanced by 16; and q10--q14 are clobbered. endprologue CARRY0(D0(q10), q13, q12) CARRY1(q14, q12) CARRY0(D0(q11), q14, q12) CARRY1(q15, q12) CARRY0(D1(q10), q15, q12) vshr.u64 D1(q11), D1(q12), #16 vshr.u64 D0(q15), D1(q12), #48 vtrn.32 q10, q11 vst1.32 {q10}, [r0]! bx r14 ENDFUNC INTFUNC(dmul4) // On entry, r0 points to the destination; r1 and r2 point to packed // operands U and X; q2/q3 and q4/q5 hold expanded operands V and Y; // and q13--q15 hold incoming carries c0--c2. On exit, the // destination and carries are updated; r0, r1, r2 are each advanced // by 16; q2--q5 are preserved; and the other NEON registers are // clobbered. endprologue // Start by loading the operand words from memory. vld1.32 {q0}, [r1]! vld1.32 {q1}, [r2]! // Do the multiplication. MUL0(q13, t, q0, q2, q3, q1, q4, q5) MUL1(q14, t, q0, q2, q3, q1, q4, q5) CARRY0(D0(q8), q13, q6) MUL2(q15, t, q0, q2, q3, q1, q4, q5) CARRY1(q14, q6) CARRY0(D0(q9), q14, q6) MUL3(q12, nil, q0, q2, q3, q1, q4, q5) CARRY1(q15, q6) CARRY0(D1(q8), q15, q6) MUL4(q13, nil, q0, q2, q3, q1, q4, q5) CARRY1(q12, q6) CARRY0(D1(q9), q12, q6) MUL5(q14, nil, q0, q2, q3, q1, q4, q5) CARRY1(q13, q6) MUL6(q15, nil, q0, q2, q3, q1, q4, q5) // Finish up and store the result. vtrn.32 q8, q9 vst1.32 {q8}, [r0]! // All done. bx r14 ENDFUNC INTFUNC(dmla4) // On entry, r0 points to the destination/accumulator; r1 and r2 // point to packed operands U and X; q2/q3 and q4/q5 hold expanded // operands V and Y; and q13--q15 hold incoming carries c0--c2. On // exit, the accumulator and carries are updated; r0, r1, r2 are each // advanced by 16; q2--q5 are preserved; and the other NEON registers // are clobbered. endprologue // Start by loading the operand words from memory. vld1.32 {q9}, [r0] SPREADACC0(q9, q10, q11, q12) vld1.32 {q0}, [r1]! vld1.32 {q1}, [r2]! // Add the accumulator input to the incoming carries. Split the // accumulator into four pieces and add the carries onto them. SPREADACC1(q9, q10, q11, q12) CARRYACC(q9, q10, q11, q12, q13, q14, q15) // Do the multiplication. 
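// As in `dmul4' above, the carry-propagation steps are interleaved between the multiply runs, rather than being done all at the end, which helps to keep the NEON pipeline busy while the `vmlal' results are forwarded.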
MUL0(q13, t, q0, q2, q3, q1, q4, q5) MUL1(q14, t, q0, q2, q3, q1, q4, q5) CARRY0(D0(q8), q13, q6) MUL2(q15, t, q0, q2, q3, q1, q4, q5) CARRY1(q14, q6) CARRY0(D0(q9), q14, q6) MUL3(q12, t, q0, q2, q3, q1, q4, q5) CARRY1(q15, q6) CARRY0(D1(q8), q15, q6) MUL4(q13, nil, q0, q2, q3, q1, q4, q5) CARRY1(q12, q6) CARRY0(D1(q9), q12, q6) MUL5(q14, nil, q0, q2, q3, q1, q4, q5) CARRY1(q13, q6) MUL6(q15, nil, q0, q2, q3, q1, q4, q5) // Finish up and store the result. vtrn.32 q8, q9 vst1.32 {q8}, [r0]! // All done. bx r14 ENDFUNC INTFUNC(mul4) // On entry, r0 points to the destination; r2 points to a packed // operand X; q4/q5 holds an expanded operand Y; and q13--q15 hold // incoming carries c0--c2. On exit, the destination and carries are // updated; r0 and r2 are each advanced by 16; q4 and q5 are // preserved; and the other NEON registers are clobbered. endprologue // Start by loading the operand words from memory. vld1.32 {q1}, [r2]! // Do the multiplication. MUL0(q13, t, q1, q4, q5, nil, nil, nil) MUL1(q14, t, q1, q4, q5, nil, nil, nil) CARRY0(D0(q8), q13, q6) MUL2(q15, t, q1, q4, q5, nil, nil, nil) CARRY1(q14, q6) CARRY0(D0(q9), q14, q6) MUL3(q12, nil, q1, q4, q5, nil, nil, nil) CARRY1(q15, q6) CARRY0(D1(q8), q15, q6) MUL4(q13, nil, q1, q4, q5, nil, nil, nil) CARRY1(q12, q6) CARRY0(D1(q9), q12, q6) MUL5(q14, nil, q1, q4, q5, nil, nil, nil) CARRY1(q13, q6) MUL6(q15, nil, q1, q4, q5, nil, nil, nil) // Finish up and store the result. vtrn.32 q8, q9 vst1.32 {q8}, [r0]! // All done. bx r14 ENDFUNC INTFUNC(mul4zc) // On entry, r0 points to the destination; r2 points to a packed // operand X; and q4/q5 holds an expanded operand Y. On exit, the // destination is updated; q13--q15 hold outgoing carries c0--c2; r0 // and r2 are each advanced by 16; q4 and q5 are preserved; and the // other NEON registers are clobbered. endprologue // Start by loading the operand words from memory. vld1.32 {q1}, [r2]! // Do the multiplication. MUL0(q13, nil, q1, q4, q5, nil, nil, nil) MUL1(q14, nil, q1, q4, q5, nil, nil, nil) CARRY0(D0(q8), q13, q6) MUL2(q15, nil, q1, q4, q5, nil, nil, nil) CARRY1(q14, q6) CARRY0(D0(q9), q14, q6) MUL3(q12, nil, q1, q4, q5, nil, nil, nil) CARRY1(q15, q6) CARRY0(D1(q8), q15, q6) MUL4(q13, nil, q1, q4, q5, nil, nil, nil) CARRY1(q12, q6) CARRY0(D1(q9), q12, q6) MUL5(q14, nil, q1, q4, q5, nil, nil, nil) CARRY1(q13, q6) MUL6(q15, nil, q1, q4, q5, nil, nil, nil) // Finish up and store the result. vtrn.32 q8, q9 vst1.32 {q8}, [r0]! // All done. bx r14 ENDFUNC INTFUNC(mla4) // On entry, r0 points to the destination/accumulator; r2 points to a // packed operand X; q4/q5 holds an expanded operand Y; and q13--q15 // hold incoming carries c0--c2. On exit, the accumulator and // carries are updated; r0 and r2 are each advanced by 16; q4 and q5 // are preserved; and the other NEON registers are clobbered. endprologue // Start by loading the operand words from memory. vld1.32 {q9}, [r0] SPREADACC0(q9, q10, q11, q12) vld1.32 {q1}, [r2]! // Add the accumulator input to the incoming carries. Split the // accumulator into four pieces and add the carries onto them. SPREADACC1(q9, q10, q11, q12) CARRYACC(q9, q10, q11, q12, q13, q14, q15) // Do the multiplication. 
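// The multiply/carry sequence from `mla4_common' onwards is shared with `mla4zc' below, which seeds the carry registers directly from the accumulator and then branches here.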
mla4_common: MUL0(q13, t, q1, q4, q5, nil, nil, nil) MUL1(q14, t, q1, q4, q5, nil, nil, nil) CARRY0(D0(q8), q13, q6) MUL2(q15, t, q1, q4, q5, nil, nil, nil) CARRY1(q14, q6) CARRY0(D0(q9), q14, q6) MUL3(q12, t, q1, q4, q5, nil, nil, nil) CARRY1(q15, q6) CARRY0(D1(q8), q15, q6) MUL4(q13, nil, q1, q4, q5, nil, nil, nil) CARRY1(q12, q6) CARRY0(D1(q9), q12, q6) MUL5(q14, nil, q1, q4, q5, nil, nil, nil) CARRY1(q13, q6) MUL6(q15, nil, q1, q4, q5, nil, nil, nil) // Finish up and store the result. vtrn.32 q8, q9 vst1.32 {q8}, [r0]! // All done. bx r14 ENDFUNC INTFUNC(mla4zc) // On entry, r0 points to the destination/accumulator; r2 points to a // packed operand X; and q4/q5 holds an expanded operand Y. On exit, // the accumulator is updated; q13--q15 hold outgoing carries c0--c2; // r0 and r2 are each advanced by 16; q4 and q5 are preserved; and // the other NEON registers are clobbered. endprologue // Start by loading the operand words from memory. vld1.32 {q13}, [r0] SPREADACC0(q13, q14, q15, q12) vld1.32 {q1}, [r2]! // Move the accumulator input to the incoming carry slots. Split the // accumulator into four pieces. SPREADACC1(q13, q14, q15, q12) b mla4_common ENDFUNC INTFUNC(mmul4) // On entry, r0 points to the destination; r1 points to a packed // operand U; r2 points to a packed operand X (the modulus); q2/q3 // holds an expanded operand V; and q4/q5 holds an expanded operand M // (the Montgomery factor -N^{-1} (mod B^4)). On exit, the destination // is updated (to zero); q4/q5 hold an expanded factor Y = U V M (mod // B^4); q13--q15 hold outgoing carries c0--c2; r0, r1, and r2 are each // advanced by 16; q2 and q3 are preserved; and the other NEON // registers are clobbered. endprologue // Start by loading the operand words from memory. vld1.32 {q0}, [r1]! // Calculate the low half of W = A + U V, being careful to leave the // carries in place. MUL0(q13, nil, q0, q2, q3, nil, nil, nil) MUL1(q14, nil, q0, q2, q3, nil, nil, nil) CARRY0(D0(q6), q13, q8) MUL2(q15, nil, q0, q2, q3, nil, nil, nil) CASIDE1(D0(q9), q14, q8) CASIDE0(D0(q7), D0(q9), q14, q8) MUL3(q12, nil, q0, q2, q3, nil, nil, nil) b mmla4_common ENDFUNC INTFUNC(mmla4) // On entry, r0 points to the destination/accumulator A; r1 points to // a packed operand U; r2 points to a packed operand X (the modulus); // q2/q3 holds an expanded operand V; and q4/q5 holds an expanded // operand M (the Montgomery factor -N^{-1} (mod B^4)). On exit, the // accumulator is updated (to zero); q4/q5 hold an expanded factor Y // = (A + U V) M (mod B^4); q13--q15 hold outgoing carries c0--c2; r0, // r1, and r2 are each advanced by 16; q2 and q3 are preserved; and // the other NEON registers are clobbered. endprologue // Start by loading the operand words from memory. vld1.32 {q13}, [r0] SPREADACC0(q13, q14, q15, q12) vld1.32 {q0}, [r1]! // Move the accumulator input to the incoming carry slots. Split the // accumulator into four pieces. SPREADACC1(q13, q14, q15, q12) // Calculate the low half of W = A + U V, being careful to leave the // carries in place. MUL0(q13, t, q0, q2, q3, nil, nil, nil) MUL1(q14, t, q0, q2, q3, nil, nil, nil) CARRY0(D0(q6), q13, q8) MUL2(q15, t, q0, q2, q3, nil, nil, nil) CASIDE1(D0(q9), q14, q8) CASIDE0(D0(q7), D0(q9), q14, q8) MUL3(q12, t, q0, q2, q3, nil, nil, nil) mmla4_common: CASIDE1(D0(q9), q15, q8) CASIDE0(D1(q6), D0(q9), q15, q8) CASIDE1(D0(q9), q12, q8) CASIDE0E(D1(q7), D0(q9), q12, q8) vtrn.32 q6, q7 // Calculate the low half of the Montgomery factor Y = W M. At this // point, registers are a little tight.
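	// (This is the heart of the Montgomery trick: with M = -N^{-1} (mod B^4) and Y = W M (mod B^4), and X = N, we get W + X Y == W - N N^{-1} W == 0 (mod B^4), so the low four words of the sum vanish and only carries remain.)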
MUL0( q8, nil, q6, q4, q5, nil, nil, nil) MUL1( q9, nil, q6, q4, q5, nil, nil, nil) CARRY0(D0(q8), q8, q1) MUL2(q10, nil, q6, q4, q5, nil, nil, nil) CARRY1(q9, q1) CARRY0(D0(q9), q9, q1) MUL3(q11, nil, q6, q4, q5, nil, nil, nil) CARRY1(q10, q1) CARRY0(D1(q8), q10, q1) vmov.i32 q5, #0 CARRY1(q11, q1) CARRY0(D1(q9), q11, q1) vld1.32 {q1}, [r2]! vtrn.32 q8, q9 // Expand Y. We'll put it in its proper place a bit later. vzip.16 q8, q5 // Build up the product X Y in the carry slots. MUL0(q13, t, q1, q8, q5, nil, nil, nil) MUL1(q14, t, q1, q8, q5, nil, nil, nil) CARRY0(nil, q13, q9) MUL2(q15, t, q1, q8, q5, nil, nil, nil) CARRY1(q14, q9) vmov q4, q8 CARRY0(nil, q14, q9) MUL3(q12, t, q1, q8, q5, nil, nil, nil) CARRY1(q15, q9) CARRY0(nil, q15, q9) vmov.i32 q6, #0 // And complete the calculation. MUL4(q13, nil, q0, q2, q3, q1, q4, q5) CARRY1(q12, q9) CARRY0(nil, q12, q9) MUL5(q14, nil, q0, q2, q3, q1, q4, q5) CARRY1(q13, q9) MUL6(q15, nil, q0, q2, q3, q1, q4, q5) // Finish up and store the result. vst1.32 {q6}, [r0]! // All done. bx r14 ENDFUNC INTFUNC(mont4) // On entry, r0 points to the destination/accumulator A; r2 points to // a packed operand X (the modulus); and q2/q3 holds an expanded // operand M (the Montgomery factor -N^{-1} (mod B^4)). On exit, the // accumulator is updated (to zero); q4/q5 hold an expanded factor Y // = A M (mod B^4); q13--q15 hold outgoing carries c0--c2; r0 and r2 // are each advanced by 16; q2 and q3 are preserved; and the other // NEON registers are clobbered. endprologue // Start by loading the operand words from memory. vld1.32 {q0}, [r0] vld1.32 {q1}, [r2]! // Calculate Y = A M (mod B^4). MUL0(q8, nil, q0, q2, q3, nil, nil, nil) MUL1(q9, nil, q0, q2, q3, nil, nil, nil) CARRY0(D0(q4), q8, q6) MUL2(q10, nil, q0, q2, q3, nil, nil, nil) CARRY1(q9, q6) vmov q13, q0 CARRY0(D0(q7), q9, q6) MUL3(q11, nil, q0, q2, q3, nil, nil, nil) CARRY1(q10, q6) CARRY0(D1(q4), q10, q6) SPREADACC0(q13, q14, q15, q12) CARRY1(q11, q6) CARRY0(D1(q7), q11, q6) SPREADACC1(q13, q14, q15, q12) vmov.i32 q5, #0 vtrn.32 q4, q7 vzip.16 q4, q5 // Calculate the actual result. Well, the carries, at least. vmov.i32 q8, #0 MUL0(q13, t, q1, q4, q5, nil, nil, nil) MUL1(q14, t, q1, q4, q5, nil, nil, nil) CARRY0(nil, q13, q6) MUL2(q15, t, q1, q4, q5, nil, nil, nil) CARRY1(q14, q6) CARRY0(nil, q14, q6) MUL3(q12, t, q1, q4, q5, nil, nil, nil) CARRY1(q15, q6) CARRY0(nil, q15, q6) MUL4(q13, nil, q1, q4, q5, nil, nil, nil) CARRY1(q12, q6) CARRY0(nil, q12, q6) MUL5(q14, nil, q1, q4, q5, nil, nil, nil) CARRY1(q13, q6) MUL6(q15, nil, q1, q4, q5, nil, nil, nil) // Finish up and store the result. vst1.32 {q8}, [r0]! // All done. bx r14 ENDFUNC ///-------------------------------------------------------------------------- /// Bulk multipliers. FUNC(mpx_umul4_arm_neon) // void mpx_umul4_arm_neon(mpw *dv, const mpw *av, const mpw *avl, // const mpw *bv, const mpw *bvl); // Establish the arguments and do initial setup. // // inner loop dv r0 // inner loop av r2 // outer loop dv r5 // outer loop bv r3 // av base r1 // av limit r12 // bv limit r4 pushreg r4, r5, r14 pushvfp QQ(q4, q7) endprologue // Prepare for the first iteration. vld1.32 {q4}, [r3]! // = Y = bv[0] vmov.i32 q5, #0 // r0 // -> dv for inner loop // r1 // -> av base // r3 // -> bv for outer loop ldr r4, [sp, #76] // -> bv limit mov r12, r2 // -> av limit mov r2, r1 // -> av for inner loop add r5, r0, #16 // -> dv for outer loop vzip.16 q4, q5 // expand Y bl mul4zc cmp r2, r12 // all done? bhs 8f // Continue with the first iteration. 0: bl mul4 cmp r2, r12 // all done?
blo 0b // Write out the leftover carry. There can be no tail here. 8: bl carryprop cmp r3, r4 // more passes to do? bhs 9f // Set up for the next pass. 1: vld1.32 {q4}, [r3]! // = Y = bv[i] vmov.i32 q5, #0 mov r0, r5 // -> dv[i] mov r2, r1 // -> av[0] add r5, r5, #16 vzip.16 q4, q5 // expand Y bl mla4zc cmp r2, r12 // done yet? bhs 8f // Continue... 0: bl mla4 cmp r2, r12 blo 0b // Finish off this pass. There was no tail on the previous pass, and // there can be none on this pass. 8: bl carryprop cmp r3, r4 blo 1b // All over. 9: popvfp QQ(q4, q7) popreg r4, r5, r14 bx r14 ENDFUNC FUNC(mpxmont_mul4_arm_neon) // void mpxmont_mul4_arm_neon(mpw *dv, const mpw *av, const mpw *bv, // const mpw *nv, size_t n, const mpw *mi); // Establish the arguments and do initial setup. // // inner loop dv r0 // inner loop av r1 // inner loop nv r2 // mi r5 // outer loop dv r6 // outer loop bv r7 // av base r8 // av limit r9 // bv limit r4 // nv base r3 // n r4 // c r10 // 0 r12 pushreg r4-r10, r14 pushvfp QQ(q4, q7) endprologue // Establish the expanded operands. ldrd r4, r5, [sp, #96] // r4 = n; r5 -> mi vld1.32 {q2}, [r2] // = V = bv[0] vmov.i32 q3, #0 vmov.i32 q5, #0 vld1.32 {q4}, [r5] // = M // Set up the outer loop state and prepare for the first iteration. // r0 // -> dv for inner loop // r1 // -> av for inner loop add r7, r2, #16 // -> bv // r3 // -> nv add r6, r0, #16 // -> dv mov r8, r1 // -> av add r9, r1, r4, lsl #2 // -> av limit add r4, r2, r4, lsl #2 // -> bv limit mov r2, r3 // -> nv for inner loop mov r12, #0 // = 0 vzip.16 q2, q3 // expand V vzip.16 q4, q5 // expand M bl mmul4 cmp r1, r9 // done already? bhs 8f // Complete the first inner loop. 0: bl dmul4 cmp r1, r9 // done yet? blo 0b // Still have carries left to propagate. Rather than store the tail // end in memory, keep it in a general-purpose register for later. bl carryprop vmov.32 r10, QW(q15, 0) // Embark on the next iteration. (There must be one. If n = 1 then // we would have bailed above, to label 8. Similarly, the subsequent // iterations can fall into the inner loop immediately.) 1: vld1.32 {q2}, [r7]! // = V = bv[i] vld1.32 {q4}, [r5] // = M vmov.i32 q3, #0 vmov.i32 q5, #0 mov r0, r6 // -> dv[i] add r6, r6, #16 mov r1, r8 // -> av[0] mov r2, r3 // -> nv[0] vzip.16 q2, q3 // expand V vzip.16 q4, q5 // expand M bl mmla4 // Complete the next inner loop. 0: bl dmla4 cmp r1, r9 // done yet? blo 0b // Still have carries left to propagate, and they overlap the // previous iteration's final tail, so read that and add it. cmp r7, r4 vmov.32 D0(q12), r10, r12 vadd.i64 D0(q13), D0(q13), D0(q12) bl carryprop vmov.32 r10, QW(q15, 0) // Back again, maybe. blo 1b // All done, almost. str r10, [r0], #4 popvfp QQ(q4, q7) popreg r4-r10, r14 bx r14 // First iteration was short. Write out the carries and we're done. // (This could be folded into the main loop structure, but that would // penalize small numbers more.) 8: bl carryprop vst1.32 {QW(q15, 0)}, [r0]! popvfp QQ(q4, q7) popreg r4-r10, r14 bx r14 ENDFUNC FUNC(mpxmont_redc4_arm_neon) // void mpxmont_redc4_arm_neon(mpw *dv, mpw *dvl, const mpw *nv, // size_t n, const mpw *mi); // Establish the arguments and do initial setup. // // inner loop dv r0 // dv limit r1 // inner loop nv r2 // blocks-of-4 dv limit r3 // mi (r14) // outer loop dv r4 // outer loop dv limit r5 // nv base r6 // nv limit r7 // n r3 // c (r14) // t0, t1, t2, t3 r2, r8, r9, r10 // 0 r12 pushreg r4-r10, r14 pushvfp QQ(q4, q7) endprologue // Set up the outer loop state and prepare for the first iteration.
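	// Each pass of the outer loop computes Y = dv[i] M (mod B^4) and accumulates N Y over the buffer, clearing the bottom four words of the current block; the carries left over are then propagated up the rest of the buffer with plain 32-bit arithmetic.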
ldr r14, [sp, #96] // -> mi vmov.i32 q3, #0 sub r12, r1, r0 // total dv bytes // r0 // -> dv for inner loop // r1 // -> overall dv limit // r2 // -> nv for inner loop // r3 // = n (for now) add r4, r0, #16 // -> dv for outer loop add r5, r0, r3, lsl #2 // -> dv limit bic r12, r12, #15 // dv blocks of 4 vld1.32 {q2}, [r14] // = M mov r6, r2 // -> nv add r7, r2, r3, lsl #2 // -> nv limit add r3, r0, r12 // -> dv blocks-of-4 limit vzip.16 q2, q3 // expand M mov r12, #0 // = 0 bl mont4 cmp r2, r7 // done already? bhs 8f 5: bl mla4 cmp r2, r7 // done yet? blo 5b // Still have carries left to propagate. Adding the accumulator // block into the carries is a little different this time, because // all four accumulator limbs have to be squished into the three // carry registers for `carryprop' to do its thing. 8: vld1.32 {q9}, [r0] SPREADACC0(q9, q10, q11, q12) SPREADACC1(q9, q10, q11, q12) vshl.u64 D0(q12), D0(q12), #16 CARRYACC(q9, q10, q11, q12, q13, q14, q15) vadd.u64 D1(q15), D1(q15), D0(q12) bl carryprop vmov.32 r14, QW(q15, 0) cmp r0, r3 bhs 7f // Propagate the first group of carries. ldmia r0, {r2, r8-r10} adds r2, r2, r14 adcs r8, r8, #0 adcs r9, r9, #0 adcs r10, r10, #0 stmia r0!, {r2, r8-r10} teq r0, r3 beq 6f // Continue carry propagation until the end of the buffer. 0: ldmia r0, {r2, r8-r10} adcs r2, r2, #0 adcs r8, r8, #0 adcs r9, r9, #0 adcs r10, r10, #0 stmia r0!, {r2, r8-r10} teq r0, r3 bne 0b // Deal with the tail end. Note that the actual destination length // won't be an exact number of blocks of four, so it's safe to just // drop through here. 6: adc r14, r12, #0 7: ldr r2, [r0] adds r2, r2, r14 str r2, [r0], #4 teq r0, r1 beq 8f 0: ldr r2, [r0] adcs r2, r2, #0 str r2, [r0], #4 teq r0, r1 bne 0b // All done for this iteration. Start the next. 8: cmp r4, r5 bhs 9f mov r0, r4 add r4, r4, #16 mov r2, r6 bl mont4 b 5b // All over. 9: popvfp QQ(q4, q7) popreg r4-r10, r14 bx r14 ENDFUNC ///-------------------------------------------------------------------------- /// Testing and performance measurement.
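///
/// Before the harness itself, a note on calling the bulk multiplier from C.
/// The following is a minimal sketch, not taken from the Catacomb sources:
/// it assumes that `mpw' is a 32-bit word here, that both operand lengths
/// are whole numbers of four-word blocks (callers are presumably
/// responsible for any padding), and that the destination has space for
/// the full product.
///
///	mpw a[8], b[8], d[16];
///	/* ... fill in a and b ... */
///	mpx_umul4_arm_neon(d, a, a + 8, b, b + 8);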
#ifdef TEST_MUL4 // dmul smul mmul mont // z r0 r0 r0 r0 r0 // c r4 r1 r1 r1 r1 // y r3 -- -- r2 r2 // u r1 r2 -- r3 -- // x r2 r3 r2 stk0 r3 // vv q2/q3 stk0 -- stk1 stk0 // yy q4/q5 stk1 r3 stk2 -- // n r5 stk2 stk0 stk3 stk1 // cyv r6 stk3 stk1 stk4 stk2 #define STKARG(i) sp, #80 + 4*(i) .macro testprologue mode pushreg r4-r6, r14 pushvfp QQ(q4, q7) endprologue .ifeqs "\mode", "dmul" mov r4, r1 // -> c mov r1, r2 // -> u mov r2, r3 // -> x ldr r14, [STKARG(0)] // -> vv vld1.32 {q2}, [r14] vmov.i32 q3, #0 vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0) ldr r14, [STKARG(1)] // -> yy vld1.32 {q4}, [r14] vmov.i32 q5, #0 vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0) ldr r5, [STKARG(2)] // = n ldr r6, [STKARG(3)] // -> cyv .endif .ifeqs "\mode", "smul" mov r4, r1 // -> c // r2 // -> x vld1.32 {q4}, [r3] vmov.i32 q5, #0 vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0) ldr r5, [STKARG(0)] // = n ldr r6, [STKARG(1)] // -> cyv .endif .ifeqs "\mode", "mmul" mov r4, r1 // -> c mov r1, r3 // -> u mov r3, r2 // -> y ldr r2, [STKARG(0)] // -> x ldr r14, [STKARG(1)] // -> vv vld1.32 {q2}, [r14] vmov.i32 q3, #0 vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0) ldr r14, [STKARG(2)] // -> yy vld1.32 {q4}, [r14] vmov.i32 q5, #0 vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0) ldr r5, [STKARG(3)] // = n ldr r6, [STKARG(4)] // -> cyv .endif .ifeqs "\mode", "mont" mov r4, r1 // -> c mov r14, r2 mov r2, r3 // -> x mov r3, r14 // -> y ldr r14, [STKARG(0)] // -> vv vld1.32 {q2}, [r14] vmov.i32 q3, #0 vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0) ldr r5, [STKARG(1)] // = n ldr r6, [STKARG(2)] // -> cyv .endif .endm .macro testldcarry vldmia r4, {QQ(q13, q15)} // c0, c1, c2 .endm .macro testtop 0: subs r5, r5, #1 .endm .macro testtail bne 0b .endm .macro testcarryout vstmia r4, {QQ(q13, q15)} .endm .macro testepilogue popvfp QQ(q4, q7) popreg r4-r6, r14 bx r14 .endm FUNC(test_dmul4) testprologue dmul testldcarry testtop bl dmul4 testtail testcarryout testepilogue ENDFUNC FUNC(test_dmla4) testprologue dmul testldcarry testtop bl dmla4 testtail testcarryout testepilogue ENDFUNC FUNC(test_mul4) testprologue smul testldcarry testtop bl mul4 testtail testcarryout testepilogue ENDFUNC FUNC(test_mul4zc) testprologue smul testldcarry testtop bl mul4zc testtail testcarryout testepilogue ENDFUNC FUNC(test_mla4) testprologue smul testldcarry testtop bl mla4 testtail testcarryout testepilogue ENDFUNC FUNC(test_mla4zc) testprologue smul testldcarry testtop bl mla4zc testtail testcarryout testepilogue ENDFUNC FUNC(test_mmul4) testprologue mmul testtop bl mmul4 testtail vst1.32 {q4, q5}, [r3] testcarryout testepilogue ENDFUNC FUNC(test_mmla4) testprologue mmul testtop bl mmla4 testtail vst1.32 {q4, q5}, [r3] testcarryout testepilogue ENDFUNC FUNC(test_mont4) testprologue mont testtop bl mont4 testtail vst1.32 {q4, q5}, [r3] testcarryout testepilogue ENDFUNC #endif ///----- That's all, folks --------------------------------------------------