/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Large SIMD-based multiplications
///
/// (c) 2019 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software: you can redistribute it and/or modify it
/// under the terms of the GNU Library General Public License as published
/// by the Free Software Foundation; either version 2 of the License, or
/// (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful, but
/// WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
/// Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb.  If not, write to the Free Software
/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
/// USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

        .text

///--------------------------------------------------------------------------
/// Theory.
///
/// We define a number of primitive fixed-size multipliers from which we can
/// construct more general variable-length multipliers.
///
/// The basic trick is the same throughout.  In an operand-scanning
/// multiplication, the inner multiplication loop multiplies a multiple-
/// precision operand by a single-precision factor, and adds the result,
/// appropriately shifted, to the accumulated result.  A `finely integrated
/// operand scanning' implementation of Montgomery multiplication also adds
/// the product of a single-precision `Montgomery factor' and the modulus,
/// calculated in the same pass.  The more common `coarsely integrated
/// operand scanning' alternates main multiplication and Montgomery passes,
/// which requires additional carry propagation.
///
/// In both the plain-multiplication and Montgomery stages, then, one of the
/// factors remains constant throughout the operation, so we can afford to
/// take a little time to preprocess it.  The transformation we perform is
/// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
/// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SIMD
/// operands, as follows.
///
///       Offset     12        8         4         0
///          0     v''_1     v'_1     v''_0     v'_0
///         16     v''_3     v'_3     v''_2     v'_2
///
/// The `umull' and `umlal' instructions can multiply a vector of two 32-bit
/// values by a 32-bit scalar, giving two 64-bit results; thus, a single
/// instruction will act on (say) v'_0 and v''_0, to produce two 48-bit
/// results in 64-bit fields.  The sixteen bits of headroom allow us to add
/// many products together before we must deal with carrying; they also
/// allow for some calculations to be performed on the above expanded form.
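///
/// As a concrete model of this expansion, here is a C sketch (for
/// exposition only; the function name is ours, and the real code does the
/// job with the `expand' macro below):
///
///        #include <stdint.h>
///
///        /* Split packed v[0..3] into eight 16-bit pieces held in 32-bit
///         * cells, matching the layout table above. */
///        static void expand_model(const uint32_t v[4], uint32_t out[2][4])
///        {
///            for (int i = 0; i < 4; i++) {
///                out[i/2][2*(i%2)]     = v[i] & 0xffff;   /* v'_i */
///                out[i/2][2*(i%2) + 1] = v[i] >> 16;      /* v''_i */
///            }
///        }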
///
/// We maintain three `carry' registers, v28--v30, accumulating intermediate
/// results; we name them `c0', `c1', and `c2'.  Each carry register holds
/// two 64-bit halves: the register c0, for example, holds c'_0 (low half)
/// and c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the
/// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2.
/// The `umull' or `umlal' instruction acting on a scalar operand and an
/// operand in the expanded form above produces a result which can be added
/// directly to the appropriate carry register.
///
/// An unusual feature of this code, as compared to the other `mul4'
/// implementations, is that it makes extensive use of the ARM64
/// general-purpose registers for carry resolution and output construction.
/// As a result, a further general-purpose register (typically x15) is used
/// as an additional carry, with the carry value held in bits 16--63.
///
/// Multiplication is performed in product-scanning order, since ARM
/// processors commonly implement result forwarding for consecutive multiply-
/// and-accumulate instructions specifying the same destination.
/// Experimentally, this runs faster than an operand-scanning arrangement
/// that attempts to hide instruction latencies.
///
/// On 64-bit ARM, we have a vast supply of registers: the expanded operands
/// are kept in registers.  The packed operands are read from memory into
/// working registers v0 and v1.  The following conventional argument names
/// and locations are used throughout.
///
/// Arg  Format     Location   Notes
///
/// U    packed     [x1]
/// X    packed     [x2]       In Montgomery multiplication, X = N
/// V    expanded   v2/v3
/// Y    expanded   v4/v5      In Montgomery multiplication, Y = (A + U V) M
/// M    expanded   v6/v7      -N^{-1} (mod B^4)
/// N                          Modulus, for Montgomery multiplication
/// A    packed     [x0]       Destination/accumulator
/// C    carry      v28--v30
/// 0               v31        128-bit zero
///
/// The calculation is some variant of
///
///     A' + C' B^4 <- U V + X Y + A + C
///
/// The low-level functions fit into a fairly traditional (finely-integrated)
/// operand-scanning loop over operand pairs (U, X) (indexed by j) and (V, Y)
/// (indexed by i).
///
/// The variants are as follows.
///
/// Function    Variant                 Use             i       j
///
/// mmul4       A = C = 0               Montgomery      0       0
/// dmul4       A = 0                   Montgomery      0       +
/// mmla4       C = 0                   Montgomery      +       0
/// dmla4       exactly as shown        Montgomery      +       +
///
/// mul4zc      U = V = A = C = 0       Plain           0       0
/// mul4        U = V = A = 0           Plain           0       +
/// mla4zc      U = V = C = 0           Plain           +       0
/// mla4        U = V = 0               Plain           +       +
///
/// The `mmul4' and `mmla4' functions are also responsible for calculating
/// the Montgomery reduction factor Y = (A + U V) M used by the rest of the
/// inner loop.

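/// As a plain C model of the step A' + C' B^4 <- U V + X Y + A + C (a
/// sketch for exposition only: the names are ours, and the carry is shown
/// as five 32-bit limbs where the SIMD code packs the same value into
/// v28--v30 and x15):
///
///        #include <stdint.h>
///
///        /* Update a[] in place with the low 128 bits of the result, and
///         * replace c[] by the outgoing carry C'. */
///        static void step_model(uint32_t a[4],
///                               const uint32_t u[4], const uint32_t v[4],
///                               const uint32_t x[4], const uint32_t y[4],
///                               uint32_t c[5])
///        {
///            uint64_t w[8] = { 0 }, t = 0;
///            int i, j, k;
///
///            /* column sums, in product-scanning order */
///            for (i = 0; i < 4; i++)
///                for (j = 0; j < 4; j++) {
///                    uint64_t p = (uint64_t)u[i]*v[j];
///                    w[i + j] += (uint32_t)p; w[i + j + 1] += p >> 32;
///                    p = (uint64_t)x[i]*y[j];
///                    w[i + j] += (uint32_t)p; w[i + j + 1] += p >> 32;
///                }
///
///            /* resolve the low four columns into A' ... */
///            for (k = 0; k < 4; k++) {
///                t += w[k] + a[k] + c[k];
///                a[k] = (uint32_t)t; t >>= 32;
///            }
///
///            /* ... and the rest into the outgoing carry C' */
///            t += c[4];
///            for (k = 4; k < 8; k++) {
///                t += w[k];
///                c[k - 4] = (uint32_t)t; t >>= 32;
///            }
///            c[4] = (uint32_t)t;
///        }
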
///--------------------------------------------------------------------------
/// Macro definitions.

.macro  mulacc  z, u, v, x=nil, y=nil
        // Set Z = Z + U V + X Y, using the low halves of V and Y.  Y may be
        // `nil' to omit the second operand.  Z, V, and Y should be 128-bit
        // `vN' registers; and U and X should be 32-bit `vN.s[I]' scalars;
        // the multiplications produce two 64-bit elementwise products, which
        // are added elementwise to Z.

        umlal   \z\().2d, \v\().2s, \u
  .ifnes "\y", "nil"
        umlal   \z\().2d, \y\().2s, \x
  .endif
.endm

.macro  mulacc2 z, u, v, x=nil, y=nil
        // Set Z = Z + U V + X Y, using the high halves of V and Y; see
        // `mulacc'.

        umlal2  \z\().2d, \v\().4s, \u
  .ifnes "\y", "nil"
        umlal2  \z\().2d, \y\().4s, \x
  .endif
.endm

.macro  mulinit z, zinitp, u, v=nil, x=nil, y=nil
        // If ZINITP then set Z = Z + U V + X Y, as for `mulacc'; otherwise,
        // set Z = U V + X Y.  Operand requirements and detailed operation
        // are as for `mulacc'.

  .ifeqs "\zinitp", "t"
        mulacc  \z, \u, \v, \x, \y
  .else
        umull   \z\().2d, \v\().2s, \u
    .ifnes "\y", "nil"
        umlal   \z\().2d, \y\().2s, \x
    .endif
  .endif
.endm

.macro  mulini2 z, zinitp, u, v=nil, x=nil, y=nil
        // As `mulinit', but with the high halves of V and Y.

  .ifeqs "\zinitp", "t"
        mulacc2 \z, \u, \v, \x, \y
  .else
        umull2  \z\().2d, \v\().4s, \u
    .ifnes "\y", "nil"
        umlal2  \z\().2d, \y\().4s, \x
    .endif
  .endif
.endm

// `mulI': accumulate the B^I and b B^I terms of the polynomial product sum
// U V + X Y, given that U = u_0 + B u_1 + B^2 u_2 + B^3 u_3 (and similarly
// for X), and V = v'_0 + b v''_0 + B (v'_1 + b v''_1) + B^2 (v'_2 + b v''_2)
// + B^3 (v'_3 + b v''_3) (and similarly for Y).  The 64-bit coefficients are
// added into the low and high halves of the 128-bit register Z (if ZINITP
// is `nil' then simply set Z, as if it were initially zero).
.macro  mul0    z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
        mulinit \z, \zinitp, \u\().s[0], \v0, \x\().s[0], \y0
.endm
.macro  mul1    z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
        mulini2 \z, \zinitp, \u\().s[0], \v0, \x\().s[0], \y0
        mulacc  \z, \u\().s[1], \v0, \x\().s[1], \y0
.endm
.macro  mul2    z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
        mulinit \z, \zinitp, \u\().s[0], \v1, \x\().s[0], \y1
        mulacc2 \z, \u\().s[1], \v0, \x\().s[1], \y0
        mulacc  \z, \u\().s[2], \v0, \x\().s[2], \y0
.endm
.macro  mul3    z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
        mulini2 \z, \zinitp, \u\().s[0], \v1, \x\().s[0], \y1
        mulacc  \z, \u\().s[1], \v1, \x\().s[1], \y1
        mulacc2 \z, \u\().s[2], \v0, \x\().s[2], \y0
        mulacc  \z, \u\().s[3], \v0, \x\().s[3], \y0
.endm
.macro  mul4    z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
        mulini2 \z, \zinitp, \u\().s[1], \v1, \x\().s[1], \y1
        mulacc  \z, \u\().s[2], \v1, \x\().s[2], \y1
        mulacc2 \z, \u\().s[3], \v0, \x\().s[3], \y0
.endm
.macro  mul5    z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
        mulini2 \z, \zinitp, \u\().s[2], \v1, \x\().s[2], \y1
        mulacc  \z, \u\().s[3], \v1, \x\().s[3], \y1
.endm
.macro  mul6    z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
        mulini2 \z, \zinitp, \u\().s[3], \v1, \x\().s[3], \y1
.endm

// Steps in the process of propagating carry bits upwards from ZLO (a 128-bit
// `vN' register).  Here, T0, T1, and CG are 64-bit `xN' general-purpose
// registers clobbered in the process.  Set the low 32 bits of the 64-bit
// `xN' general-purpose register ZOUT to the completed coefficient z_i,
// leaving a carry in CG.
//
// In detail, what happens is as follows.  Suppose initially that ZLO =
// (z''_i; z'_i) and ZHI = (z''_{i+1}; z'_{i+1}).  Let t = z'_i + b z''_i;
// observe that floor(t/b) = floor(z'_i/b) + z''_i.  Let z_i = t mod B, and
// add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}.  This has
// a circuit depth of 3; I don't know how to do better.
//
// Output words are left in the low half of a 64-bit register, with rubbish
// in the high half.  Two such results can be combined using the `bfi'
// instruction.
.macro  carry0  zlo, cg=x15, t0=x16, t1=x17
        // Capture the values of carry-register ZLO and CG (if not `nil') in
        // general-purpose registers T0 and T1, suitable for use in `carry1'.
        mov     \t0, \zlo\().d[0]
        mov     \t1, \zlo\().d[1]
  .ifnes "\cg", "nil"
        add     \t0, \t0, \cg, lsr #16
  .endif
.endm
.macro  carry1  zout, cg=x15, t0=x16, t1=x17
        // Collect a 32-bit output word in the low 32 bits of ZOUT (leaving
        // rubbish in the high 32 bits), and update CG suitably to continue
        // processing with the next carry register.
  .ifnes "\zout", "nil"
        add     \zout, \t0, \t1, lsl #16
  .endif
  .ifnes "\cg", "nil"
        add     \cg, \t1, \t0, lsr #16
  .endif
.endm
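
// As a C model of one `carry0'/`carry1' pair (a sketch for exposition only;
// the function name is ours):
//
//        #include <stdint.h>
//
//        /* Given the two halves of a carry register and the incoming carry
//         * *cg (held in bits 16--63), return the completed 32-bit output
//         * word and update *cg for the next carry register. */
//        static uint32_t carry_step(uint64_t zlo, uint64_t zhi, uint64_t *cg)
//        {
//            uint64_t t0 = zlo + (*cg >> 16);            /* z'_i + carry in */
//            uint32_t z = (uint32_t)(t0 + (zhi << 16));  /* z_i = t mod B */
//            *cg = zhi + (t0 >> 16);                     /* floor(t/b) */
//            return z;
//        }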

.macro  expand  vlo, vhi, vz=v31
        // Expand the packed 128-bit operand in VLO to an expanded operand
        // in VLO and VHI, assuming that VZ is all-bits-zero.  All three are
        // `vN' 128-bit SIMD registers.
        zip2    \vhi\().8h, \vlo\().8h, \vz\().8h
        zip1    \vlo\().8h, \vlo\().8h, \vz\().8h
.endm

.macro  sprdacc a0, a1, a2, a3=nil
        // Spread the packed 128-bit operand in A0 into carry-format values
        // in A0, A1, A2, A3.  If A3 is `nil', then spread the same value
        // into A0, A1, A2 only, clobbering x16.
  .ifeqs "\a3", "nil"
        mov     w16, \a0\().s[3]
  .endif
        trn2    \a2\().2d, \a0\().2d, v31.2d
        trn2    \a1\().2s, \a0\().2s, v31.2s
  .ifeqs "\a3", "nil"
        lsl     x16, x16, #16
  .endif
        trn1    \a0\().2s, \a0\().2s, v31.2s
  .ifeqs "\a3", "nil"
        mov     \a2\().d[1], x16
  .else
        trn2    \a3\().2s, \a2\().2s, v31.2s
  .endif
        mov     \a2\().s[1], wzr
.endm

.macro  crryacc a0, a1, a2, a3, c0, c1, c2
        // Add the carry-format values A0, A1, A2 into the existing carries
        // C0, C1, C2 (leaving A3 where it is).
        add     \c0\().2d, \c0\().2d, \a0\().2d
        add     \c1\().2d, \c1\().2d, \a1\().2d
        add     \c2\().2d, \c2\().2d, \a2\().2d
.endm

///--------------------------------------------------------------------------
/// Primitive multipliers and related utilities.

INTFUNC(carryprop)
        // On entry, x0 points to a destination, and v28--v30 and x15 hold
        // incoming carries c0--c2 and cg.  On exit, the low 128 bits of the
        // carry value are stored at [x0]; the remaining 16 bits of carry are
        // left in x10; x0 is advanced by 16; and x11--x17 are clobbered.
        endprologue

        carry0  v28
        carry1  x11
        carry0  v29
        carry1  x13
        carry0  v30
        carry1  x12
        bfi     x11, x13, #32, #32
        lsr     x14, x15, #16
        lsr     x10, x15, #48
        bfi     x12, x14, #32, #32
        stp     x11, x12, [x0], #16
        ret
ENDFUNC
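
// A C model of `carryprop' (a sketch for exposition only, reusing the
// hypothetical carry_step() from above):
//
//        /* Flush carries c[i] = (c'_i, c''_i) and cg into four output
//         * words, returning the leftover 16 bits of carry. */
//        static uint32_t carryprop_model(uint32_t z[4],
//                                        const uint64_t c[3][2], uint64_t cg)
//        {
//            for (int i = 0; i < 3; i++)
//                z[i] = carry_step(c[i][0], c[i][1], &cg);
//            z[3] = (uint32_t)(cg >> 16);
//            return (uint32_t)(cg >> 48);
//        }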

INTFUNC(dmul4)
        // On entry, x0 points to the destination; x1 and x2 point to packed
        // operands U and X; v2/v3 and v4/v5 hold expanded operands V and Y;
        // v28--v30 and x15 hold incoming carries c0--c2 and cg; and v31 is
        // zero.  On exit, the destination and carries are updated; x0, x1,
        // x2 are each advanced by 16; v2--v5 and v8--v15 are preserved; and
        // x11--x14, x16, x17 and the other SIMD registers are clobbered.
        endprologue

        // Start by loading the operand words from memory.
        ldr     q0, [x1], #16
        ldr     q1, [x2], #16

        // Do the multiplication.
        mul0    v28, t,   v0, v2, v3, v1, v4, v5
        mul1    v29, t,   v0, v2, v3, v1, v4, v5
        carry0  v28
        mul2    v30, t,   v0, v2, v3, v1, v4, v5
        carry1  x11
        carry0  v29
        mul3    v27, nil, v0, v2, v3, v1, v4, v5
        carry1  x13
        carry0  v30
        mul4    v28, nil, v0, v2, v3, v1, v4, v5
        carry1  x12
        carry0  v27
        mul5    v29, nil, v0, v2, v3, v1, v4, v5
        carry1  x14
        mul6    v30, nil, v0, v2, v3, v1, v4, v5

        // Finish up and store the result.
        bfi     x11, x13, #32, #32
        bfi     x12, x14, #32, #32
        stp     x11, x12, [x0], #16

        // All done.
        ret
ENDFUNC

INTFUNC(dmla4)
        // On entry, x0 points to the destination/accumulator A; x1 and x2
        // point to packed operands U and X; v2/v3 and v4/v5 hold expanded
        // operands V and Y; v28--v30 and x15 hold incoming carries c0--c2
        // and cg; and v31 is zero.  On exit, the accumulator and carries
        // are updated; x0, x1, x2 are each advanced by 16; v2--v5 and
        // v8--v15 are preserved; and x11--x14, x16, x17 and the other SIMD
        // registers are clobbered.
        endprologue

        // Start by loading the operand words from memory.
        ldr     q24, [x0]
        ldr     q0, [x1], #16
        ldr     q1, [x2], #16
        sprdacc v24, v25, v26, v27
        crryacc v24, v25, v26, v27, v28, v29, v30

        // Do the multiplication.
        mul0    v28, t,   v0, v2, v3, v1, v4, v5
        mul1    v29, t,   v0, v2, v3, v1, v4, v5
        carry0  v28
        mul2    v30, t,   v0, v2, v3, v1, v4, v5
        carry1  x11
        carry0  v29
        mul3    v27, t,   v0, v2, v3, v1, v4, v5
        carry1  x13
        carry0  v30
        mul4    v28, nil, v0, v2, v3, v1, v4, v5
        carry1  x12
        carry0  v27
        mul5    v29, nil, v0, v2, v3, v1, v4, v5
        carry1  x14
        mul6    v30, nil, v0, v2, v3, v1, v4, v5

        // Finish up and store the result.
        bfi     x11, x13, #32, #32
        bfi     x12, x14, #32, #32
        stp     x11, x12, [x0], #16

        // All done.
        ret
ENDFUNC

INTFUNC(mul4)
        // On entry, x0 points to the destination; x2 points to a packed
        // operand X; v4/v5 holds an expanded operand Y; v28--v30 and x15
        // hold incoming carries c0--c2 and cg; and v31 is zero.  On exit,
        // the destination and carries are updated; x0 and x2 are each
        // advanced by 16; v4 and v5 and v8--v15 are preserved; and
        // x11--x14, x16, x17 and the other SIMD registers are clobbered.
        endprologue

        // Start by loading the operand words from memory.
        ldr     q1, [x2], #16

        // Do the multiplication.
        mul0    v28, t,   v1, v4, v5
        mul1    v29, t,   v1, v4, v5
        carry0  v28
        mul2    v30, t,   v1, v4, v5
        carry1  x11
        carry0  v29
        mul3    v27, nil, v1, v4, v5
        carry1  x13
        carry0  v30
        mul4    v28, nil, v1, v4, v5
        carry1  x12
        carry0  v27
        mul5    v29, nil, v1, v4, v5
        carry1  x14
        mul6    v30, nil, v1, v4, v5

        // Finish up and store the result.
        bfi     x11, x13, #32, #32
        bfi     x12, x14, #32, #32
        stp     x11, x12, [x0], #16

        // All done.
        ret
ENDFUNC

INTFUNC(mul4zc)
        // On entry, x0 points to the destination; x2 points to a packed
        // operand X; v4/v5 holds an expanded operand Y; and v31 is zero.
        // On exit, the destination is updated; v28--v30 and x15 hold
        // outgoing carries c0--c2 and cg; x0 and x2 are each advanced by
        // 16; v4 and v5 and v8--v15 are preserved; and x11--x14, x16, x17
        // and the other SIMD registers are clobbered.
        endprologue

        // Start by loading the operand words from memory.
        ldr     q1, [x2], #16

        // Do the multiplication.
        mul0    v28, nil, v1, v4, v5
        mul1    v29, nil, v1, v4, v5
        carry0  v28, nil
        mul2    v30, nil, v1, v4, v5
        carry1  x11
        carry0  v29
        mul3    v27, nil, v1, v4, v5
        carry1  x13
        carry0  v30
        mul4    v28, nil, v1, v4, v5
        carry1  x12
        carry0  v27
        mul5    v29, nil, v1, v4, v5
        carry1  x14
        mul6    v30, nil, v1, v4, v5

        // Finish up and store the result.
        bfi     x11, x13, #32, #32
        bfi     x12, x14, #32, #32
        stp     x11, x12, [x0], #16

        // All done.
        ret
ENDFUNC

INTFUNC(mla4)
        // On entry, x0 points to the destination/accumulator A; x2 points
        // to a packed operand X; v4/v5 holds an expanded operand Y;
        // v28--v30 and x15 hold incoming carries c0--c2 and cg; and v31 is
        // zero.  On exit, the accumulator and carries are updated; x0 and
        // x2 are each advanced by 16; v4 and v5 and v8--v15 are preserved;
        // and x11--x14, x16, x17 and the other SIMD registers are
        // clobbered.
        endprologue

        // Start by loading the operand words from memory.
        ldr     q24, [x0]
        ldr     q1, [x2], #16
        sprdacc v24, v25, v26, v27
        crryacc v24, v25, v26, v27, v28, v29, v30

        // Do the multiplication.
        mul0    v28, t,   v1, v4, v5
        mul1    v29, t,   v1, v4, v5
        carry0  v28
        mul2    v30, t,   v1, v4, v5
        carry1  x11
        carry0  v29
        mul3    v27, t,   v1, v4, v5
        carry1  x13
        carry0  v30
        mul4    v28, nil, v1, v4, v5
        carry1  x12
        carry0  v27
        mul5    v29, nil, v1, v4, v5
        carry1  x14
        mul6    v30, nil, v1, v4, v5

        // Finish up and store the result.
        bfi     x11, x13, #32, #32
        bfi     x12, x14, #32, #32
        stp     x11, x12, [x0], #16

        // All done.
        ret
ENDFUNC

INTFUNC(mla4zc)
        // On entry, x0 points to the destination/accumulator A; x2 points
        // to a packed operand X; v4/v5 holds an expanded operand Y; and
        // v31 is zero.  On exit, the accumulator is updated; v28--v30 and
        // x15 hold outgoing carries c0--c2 and cg; x0 and x2 are each
        // advanced by 16; v4, v5, and v8--v15 are preserved; and x11--x14,
        // x16, x17 and the other SIMD registers are clobbered.
        endprologue

        // Start by loading the operand words from memory.
        ldr     q28, [x0]
        ldr     q1, [x2], #16
        sprdacc v28, v29, v30, v27

        // Do the multiplication.
        mul0    v28, t,   v1, v4, v5
        mul1    v29, t,   v1, v4, v5
        carry0  v28, nil
        mul2    v30, t,   v1, v4, v5
        carry1  x11
        carry0  v29
        mul3    v27, t,   v1, v4, v5
        carry1  x13
        carry0  v30
        mul4    v28, nil, v1, v4, v5
        carry1  x12
        carry0  v27
        mul5    v29, nil, v1, v4, v5
        carry1  x14
        mul6    v30, nil, v1, v4, v5

        // Finish up and store the result.
        bfi     x11, x13, #32, #32
        bfi     x12, x14, #32, #32
        stp     x11, x12, [x0], #16

        // All done.
        ret
ENDFUNC

INTFUNC(mmul4)
        // On entry, x0 points to the destination; x1 points to a packed
        // operand U; x2 points to a packed operand X (the modulus); v2/v3
        // holds an expanded operand V; and v6/v7 holds an expanded operand
        // M (the Montgomery factor -N^{-1} (mod B^4)).  On exit, the
        // destination is updated (to zero); v4/v5 hold an expanded factor
        // Y = U V M (mod B^4); v28--v30 and x15 hold outgoing carries
        // c0--c2 and cg; x0, x1, and x2 are each advanced by 16; v2, v3,
        // and v8--v15 are preserved; and x11--x14, x16, x17 and the other
        // SIMD registers are clobbered.
        endprologue

        // Start by loading the operand words from memory.
        ldr     q0, [x1], #16
        ldr     q1, [x2], #16

        // Calculate the low half of W = A + U V, being careful to leave
        // the carries in place.
        mul0    v28, nil, v0, v2, v3
        mul1    v29, nil, v0, v2, v3
        carry0  v28, nil
        mul2    v30, nil, v0, v2, v3
        carry1  x11
        carry0  v29
        mul3    v27, nil, v0, v2, v3
        b       mmla4_common
ENDFUNC

INTFUNC(mmla4)
        // On entry, x0 points to the destination/accumulator A; x1 points
        // to a packed operand U; x2 points to a packed operand X (the
        // modulus); v2/v3 holds an expanded operand V; and v6/v7 holds an
        // expanded operand M (the Montgomery factor -N^{-1} (mod B^4)).
        // On exit, the accumulator is updated (to zero); v4/v5 hold an
        // expanded factor Y = (A + U V) M (mod B^4); v28--v30 and x15 hold
        // outgoing carries c0--c2 and cg; x0, x1, and x2 are each advanced
        // by 16; v2, v3, v6, v7, and v8--v15 are preserved; and x11--x14,
        // x16, x17 and the other SIMD registers are clobbered.
        endprologue

        // Start by loading the operand words from memory.
        ldr     q28, [x0]
        ldr     q0, [x1], #16
        ldr     q1, [x2], #16
        sprdacc v28, v29, v30, v27

        // Calculate the low half of W = A + U V, being careful to leave
        // the carries in place.
        mul0    v28, t, v0, v2, v3
        mul1    v29, t, v0, v2, v3
        carry0  v28, nil
        mul2    v30, t, v0, v2, v3
        carry1  x11
        carry0  v29
        mul3    v27, t, v0, v2, v3
mmla4_common:
        carry1  x13
        carry0  v30
        carry1  x12
        carry0  v27
        carry1  x14, nil

        // Piece the result together and ship it back.
        bfi     x11, x13, #32, #32
        bfi     x12, x14, #32, #32
        mov     v16.d[0], x11
        mov     v16.d[1], x12

        // Calculate the low half of the Montgomery factor Y = W M.
        mul0    v18, nil, v16, v6, v7
        mul1    v19, nil, v16, v6, v7
        carry0  v18, nil
        mul2    v20, nil, v16, v6, v7
        carry1  x11
        carry0  v19
        mul3    v21, nil, v16, v6, v7
        carry1  x13
        carry0  v20
        carry1  x12
        carry0  v21
        carry1  x14, nil

        // Piece the result together, ship it back, and expand.
        bfi     x11, x13, #32, #32
        bfi     x12, x14, #32, #32
        mov     v4.d[0], x11
        mov     v4.d[1], x12
        expand  v4, v5

        // Build up the product X Y in the carry slots.
        mul0    v28, t, v1, v4, v5
        mul1    v29, t, v1, v4, v5
        carry0  v28, nil
        mul2    v30, t, v1, v4, v5
        carry1  nil
        carry0  v29
        mul3    v27, t, v1, v4, v5
        carry1  nil
        carry0  v30

        // And complete the calculation.
        mul4    v28, nil, v0, v2, v3, v1, v4, v5
        carry1  nil
        carry0  v27
        mul5    v29, nil, v0, v2, v3, v1, v4, v5
        carry1  nil
        mul6    v30, nil, v0, v2, v3, v1, v4, v5

        // Finish up and store the result.
        stp     xzr, xzr, [x0], #16

        // All done.
        ret
ENDFUNC

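// The key identity behind the Montgomery factor, at single-limb scale (a C
// sketch for exposition only; the names and the Newton/Hensel inversion
// helper are ours, not part of this file):
//
//        #include <assert.h>
//        #include <stdint.h>
//
//        /* m = -n^{-1} (mod 2^32), for odd n, by Newton/Hensel lifting. */
//        static uint32_t mont_mi(uint32_t n)
//        {
//            uint32_t x = n;     /* n^{-1} (mod 2^3), since n^2 = 1 (mod 8) */
//            for (int i = 0; i < 4; i++)
//                x *= 2 - n*x;   /* each step doubles the precision */
//            return -x;
//        }
//
//        int main(void)
//        {
//            uint32_t n = 0xffffffefu;   /* arbitrary odd modulus limb */
//            uint32_t a = 0x12345678u, u = 0xdeadbeefu, v = 0x0badf00du;
//            uint64_t w = a + (uint64_t)u*v;
//            uint32_t y = (uint32_t)(w*mont_mi(n));  /* Montgomery factor */
//            /* adding y n makes the low limb vanish, so dividing by B
//             * needs no masking */
//            assert((uint32_t)(w + y*(uint64_t)n) == 0);
//            return 0;
//        }
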
INTFUNC(mont4)
        // On entry, x0 points to the destination/accumulator A; x2 points
        // to a packed operand X (the modulus); and v6/v7 holds an expanded
        // operand M (the Montgomery factor -N^{-1} (mod B^4)).  On exit,
        // the accumulator is updated (to zero); v4/v5 hold an expanded
        // factor Y = A M (mod B^4); v28--v30 and x15 hold outgoing carries
        // c0--c2 and cg; x0 and x2 are each advanced by 16; v6, v7, and
        // v8--v15 are preserved; and x11--x14, x16, x17 and the other SIMD
        // registers are clobbered.
        endprologue

        // Start by loading the operand words from memory.
        ldr     q28, [x0]
        ldr     q1, [x2], #16

        // Calculate Y = A M (mod B^4).
        mul0    v18, nil, v28, v6, v7
        mul1    v19, nil, v28, v6, v7
        carry0  v18, nil
        mul2    v20, nil, v28, v6, v7
        carry1  x11
        carry0  v19
        mul3    v21, nil, v28, v6, v7
        carry1  x13
        carry0  v20
        sprdacc v28, v29, v30, v27
        carry1  x12
        carry0  v21
        carry1  x14, nil

        // Piece the result together, ship it back, and expand.
        bfi     x11, x13, #32, #32
        bfi     x12, x14, #32, #32
        mov     v4.d[0], x11
        mov     v4.d[1], x12
        expand  v4, v5

        // Calculate the actual result.  Well, the carries, at least.
        mul0    v28, t, v1, v4, v5
        mul1    v29, t, v1, v4, v5
        carry0  v28, nil
        mul2    v30, t, v1, v4, v5
        carry1  nil
        carry0  v29
        mul3    v27, t, v1, v4, v5
        carry1  nil
        carry0  v30

        // And complete the calculation.
        mul4    v28, nil, v1, v4, v5
        carry1  nil
        carry0  v27
        mul5    v29, nil, v1, v4, v5
        carry1  nil
        mul6    v30, nil, v1, v4, v5

        // Finish up and store the result.
        stp     xzr, xzr, [x0], #16

        // All done.
        ret
ENDFUNC

///--------------------------------------------------------------------------
/// Bulk multipliers.

FUNC(mpx_umul4_arm64_simd)
        // void mpx_umul4_arm64_simd(mpw *dv, const mpw *av, const mpw *avl,
        //                           const mpw *bv, const mpw *bvl);

        // Establish the arguments and do initial setup.
        //
        //      inner loop dv   x0
        //      inner loop av   x2
        //      outer loop dv   x5
        //      outer loop bv   x3
        //      av base         x1
        //      inner n         x6
        //      n base          x7
        //      outer n         x4
        pushreg x29, x30
        setfp
        endprologue

        // Prepare for the first iteration.
        ldr     q4, [x3], #16           // Y = bv[0]
        movi    v31.4s, #0
        sub     x7, x2, x1              // = inner loop count base
        // x0                           // = dv for inner loop
        // x1                           // = av base
        // x3                           // = bv for outer loop
        sub     x4, x4, x3              // = outer loop count (decremented)
        sub     x6, x7, #16             // = inner loop count (decremented)
        mov     x2, x1                  // = av for inner loop
        add     x5, x0, #16             // = dv for outer loop
        expand  v4, v5                  // expand Y
        bl      mul4zc
        cbz     x6, 8f                  // all done?

        // Continue with the first iteration.
0:      sub     x6, x6, #16
        bl      mul4
        cbnz    x6, 0b

        // Write out the leftover carry.  There can be no tail here.
8:      bl      carryprop
        cbz     x4, 9f

        // Set up for the next pass.
1:      ldr     q4, [x3], #16           // Y = bv[i]
        mov     x0, x5                  // -> dv[i]
        mov     x2, x1                  // -> av[0]
        add     x5, x5, #16
        sub     x6, x7, #16             // = inner loop count (decremented)
        sub     x4, x4, #16             // outer loop count
        expand  v4, v5                  // expand Y
        bl      mla4zc
        cbz     x6, 8f

        // Continue...
0:      sub     x6, x6, #16
        bl      mla4
        cbnz    x6, 0b

        // Finish off this pass.  There was no tail on the previous pass,
        // and there can be none on this pass.
8:      bl      carryprop
        cbnz    x4, 1b

        // All over.
9:      popreg  x29, x30
        ret
ENDFUNC

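// For reference, the arithmetic this routine implements, as a plain C model
// (a sketch for exposition only, with hypothetical names; the real code
// works in blocks of four 32-bit words and writes the first pass directly
// rather than zeroing dv):
//
//        #include <stdint.h>
//
//        /* dv[0..m+n) = av[0..m) * bv[0..n), operand-scanning order. */
//        static void umul_model(uint32_t *dv, const uint32_t *av, size_t m,
//                               const uint32_t *bv, size_t n)
//        {
//            for (size_t k = 0; k < m + n; k++) dv[k] = 0;
//            for (size_t i = 0; i < n; i++) {
//                uint64_t c = 0;
//                for (size_t j = 0; j < m; j++) {
//                    c += dv[i + j] + (uint64_t)av[j]*bv[i];
//                    dv[i + j] = (uint32_t)c; c >>= 32;
//                }
//                dv[i + m] = (uint32_t)c;        // leftover carry; no tail
//            }
//        }
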
FUNC(mpxmont_mul4_arm64_simd)
        // void mpxmont_mul4_arm64_simd(mpw *dv,
        //                              const mpw *av, const mpw *bv,
        //                              const mpw *nv, size_t n,
        //                              const mpw *mi);

        // Establish the arguments and do initial setup.
        //
        //      inner loop dv   x0
        //      inner loop av   x1
        //      inner loop nv   x2
        //      nv base         x3
        //      base n          x4
        //      mi              (x5)
        //      outer loop dv   x5
        //      outer loop bv   x6
        //      av base         x7
        //      inner n         x8
        //      outer n         x9
        //      c               x10
        pushreg x29, x30
        setfp
        endprologue

        // Set up the outer loop state and prepare for the first iteration.
        ldr     q2, [x2]                // = V = bv[0]
        ldr     q6, [x5]                // = M
        movi    v31.4s, #0
        // x0                           // -> dv for inner loop
        // x1                           // -> av for inner loop
        // x3                           // -> nv base
        // x4                           // = n base
        add     x5, x0, #16             // -> dv
        add     x6, x2, #16             // -> bv
        mov     x2, x3                  // -> nv[0]
        mov     x7, x1                  // -> av base
        sub     x8, x4, #4              // = inner n (decremented)
        sub     x9, x4, #4              // = outer n (decremented)
        expand  v2, v3                  // expand V
        expand  v6, v7                  // expand M
        bl      mmul4
        cbz     x8, 8f                  // done already?

        // Complete the first inner loop.
0:      sub     x8, x8, #4
        bl      dmul4
        cbnz    x8, 0b                  // done yet?

        // Still have carries left to propagate.  Rather than store the
        // tail end in memory, keep it in x10 for later.
        bl      carryprop

        // Embark on the next iteration.  (There must be one.  If n = 1
        // then we would have bailed above, to label 8.  Similarly, the
        // subsequent iterations can fall into the inner loop immediately.)
1:      ldr     q2, [x6], #16           // = V = bv[i]
        mov     x0, x5                  // -> dv[i]
        mov     x1, x7                  // -> av[0]
        mov     x2, x3                  // -> nv[0]
        add     x5, x5, #16
        sub     x8, x4, #4
        sub     x9, x9, #4
        expand  v2, v3
        bl      mmla4

        // Complete the next inner loop.
0:      sub     x8, x8, #4
        bl      dmla4
        cbnz    x8, 0b

        // Still have carries left to propagate, and they overlap the
        // previous iteration's final tail, so read that and add it.
        add     x15, x15, x10, lsl #16
        bl      carryprop

        // Back again, maybe.
        cbnz    x9, 1b

        // All done, almost.
        str     w10, [x0], #4
        popreg  x29, x30
        ret

        // First iteration was short.  Write out the carries and we're
        // done.  (This could be folded into the main loop structure, but
        // that would penalize small numbers more.)
8:      bl      carryprop
        str     w10, [x0], #4
        popreg  x29, x30
        ret
ENDFUNC

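// For reference, the arithmetic implemented here, as a plain C model (a
// sketch for exposition only: word-by-word finely-integrated Montgomery
// multiplication with hypothetical names; the real code works four words
// at a time and keeps its carries in SIMD registers):
//
//        #include <stdint.h>
//        #include <string.h>
//
//        /* dv[0..n] = av bv / 2^(32 n) (mod nv), possibly plus one extra
//         * multiple of nv; mi = -nv^{-1} (mod 2^32); w is n + 1 words of
//         * zeroed scratch. */
//        static void mont_mul_model(uint32_t *dv,
//                                   const uint32_t *av, const uint32_t *bv,
//                                   const uint32_t *nv, size_t n,
//                                   uint32_t mi, uint32_t *w)
//        {
//            for (size_t i = 0; i < n; i++) {
//                uint32_t y =    /* Montgomery factor, mod 2^32 */
//                    (uint32_t)((w[0] + (uint64_t)av[0]*bv[i])*mi);
//                uint64_t c0 = 0, c1 = 0, t;
//                for (size_t j = 0; j < n; j++) {
//                    t = w[j] + (uint64_t)av[j]*bv[i] + c0; c0 = t >> 32;
//                    t = (uint32_t)t + (uint64_t)nv[j]*y + c1; c1 = t >> 32;
//                    if (j) w[j - 1] = (uint32_t)t;
//                    /* j = 0: the low word is zero by construction of y */
//                }
//                t = w[n] + c0 + c1;
//                w[n - 1] = (uint32_t)t; w[n] = (uint32_t)(t >> 32);
//            }
//            memcpy(dv, w, (n + 1)*sizeof(uint32_t));
//        }
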
FUNC(mpxmont_redc4_arm64_simd)
        // void mpxmont_redc4_arm64_simd(mpw *dv, mpw *dvl, const mpw *nv,
        //                               size_t n, const mpw *mi);

        // Establish the arguments and do initial setup.
        //
        //      inner loop dv           x0
        //      inner loop nv           x2
        //      blocks-of-4 count       x6
        //      tail count              x7
        //      mi                      (x4)
        //      outer loop dv           x4
        //      outer loop count        x8
        //      nv base                 x5
        //      inner loop count        x1
        //      n                       x3
        //      c                       x10
        //      t0, t1                  x11, x12

        pushreg x29, x30
        setfp
        endprologue

        // Set up the outer loop state and prepare for the first iteration.
        ldr     q6, [x4]                // = M
        movi    v31.4s, #0
        // x0                           // -> dv for inner loop
        sub     x6, x1, x0              // total dv bytes
        sub     x1, x3, #4              // inner loop counter
        // x2                           // -> nv for inner loop
        // x3                           // = n
        add     x4, x0, #16             // -> dv for outer loop
        mov     x5, x2                  // -> nv base
        sub     x6, x6, x3, lsl #2      // dv carry range bytes
        sub     x8, x3, #4              // outer loop counter
        sub     x6, x6, #16             // dv steam-powered carry bytes
        expand  v6, v7                  // expand M
        and     x7, x6, #15             // dv tail length in bytes
        bic     x6, x6, #15             // dv blocks-of-four length in bytes

        bl      mont4
        cbz     x1, 8f                  // done already?

5:      sub     x1, x1, #4
        bl      mla4
        cbnz    x1, 5b                  // done yet?

        // Still have carries left to propagate.  Adding the accumulator
        // block into the carries is a little different this time, because
        // all four accumulator limbs have to be squished into the three
        // carry registers for `carryprop' to do its thing.
8:      ldr     q24, [x0]
        sprdacc v24, v25, v26
        add     v28.2d, v28.2d, v24.2d
        add     v29.2d, v29.2d, v25.2d
        add     v30.2d, v30.2d, v26.2d
        bl      carryprop
        cbz     x6, 7f

        // Propagate the first group of carries.
        ldp     x16, x17, [x0]
        sub     x1, x6, #16
        adds    x16, x16, x10
        adcs    x17, x17, xzr
        stp     x16, x17, [x0], #16
        cbz     x1, 6f

        // Continue carry propagation until the end of the buffer.
0:      ldp     x16, x17, [x0]
        sub     x1, x1, #16
        adcs    x16, x16, xzr
        adcs    x17, x17, xzr
        stp     x16, x17, [x0], #16
        cbnz    x1, 0b

        // Deal with the tail end.  Note that the actual destination
        // length won't be an exact number of blocks of four, so it's safe
        // to just drop through here.
6:      adc     w10, wzr, wzr
7:      ldr     w16, [x0]
        sub     x1, x7, #4
        adds    w16, w16, w10
        str     w16, [x0], #4
        cbz     x1, 8f
0:      ldr     w16, [x0]
        sub     x1, x1, #4
        adcs    w16, w16, wzr
        str     w16, [x0], #4
        cbnz    x1, 0b

        // All done for this iteration.  Start the next.
8:      cbz     x8, 9f
        mov     x0, x4
        add     x4, x4, #16
        sub     x1, x3, #4
        mov     x2, x5
        sub     x8, x8, #4
        sub     x6, x6, #16
        bl      mont4
        b       5b

        // All over.
9:      popreg  x29, x30
        ret
ENDFUNC

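// And a matching C model of the reduction (a sketch for exposition only,
// hypothetical names, includes as in the previous sketch; dvn counts the
// words of dv, which holds the product to be reduced, and is assumed long
// enough that the final carry fits):
//
//        /* Replace dv[0..dvn) by dv / 2^(32 n) (mod nv), possibly plus a
//         * multiple of nv; mi = -nv^{-1} (mod 2^32). */
//        static void mont_redc_model(uint32_t *dv, size_t dvn,
//                                    const uint32_t *nv, size_t n,
//                                    uint32_t mi)
//        {
//            for (size_t i = 0; i < n; i++) {
//                uint32_t y = (uint32_t)(dv[i]*(uint64_t)mi);
//                uint64_t c = 0;
//                for (size_t j = 0; j < n; j++) {
//                    c += dv[i + j] + (uint64_t)nv[j]*y;
//                    dv[i + j] = (uint32_t)c; c >>= 32;
//                }
//                /* propagate the carry to the end of the buffer */
//                for (size_t k = i + n; c && k < dvn; k++) {
//                    c += dv[k]; dv[k] = (uint32_t)c; c >>= 32;
//                }
//            }
//            memmove(dv, dv + n, (dvn - n)*sizeof(uint32_t));
//        }
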
///--------------------------------------------------------------------------
/// Testing and performance measurement.

#ifdef TEST_MUL4

//                                dmul    smul    mmul    mont
//      z       x0                  x0      x0      x0      x0
//      c       x3                  x1      x1      x1      x1
//      y       x4                  --      --      x2      x2
//      u       x1                  x2      --      x3      --
//      x       x2                  x3      x2      x4      x3
//      vv      v2/v3               x4      --      x5      --
//      yy      v4/v5               x5      x3      x6      --
//      mm      v6/v7               --      --      --      x4
//      n       x5                  x6      x4      x7      x5
//      cyv     x6                  x7      x5      stk0    x6

#define STKARG(i) sp, #16 + i

.macro  testprologue mode
        pushreg x29, x30
        setfp
        endprologue
        movi    v31.4s, #0

  .ifeqs "\mode", "dmul"
        ldr     q2, [x4]
        zip2    v3.8h, v2.8h, v31.8h    // (v''_3, v'_3; v''_2, v'_2)
        zip1    v2.8h, v2.8h, v31.8h    // (v''_1, v'_1; v''_0, v'_0)

        ldr     q4, [x5]
        zip2    v5.8h, v4.8h, v31.8h    // (y''_3, y'_3; y''_2, y'_2)
        zip1    v4.8h, v4.8h, v31.8h    // (y''_1, y'_1; y''_0, y'_0)

        mov     x16, x1
        mov     x1, x2                  // -> u
        mov     x2, x3                  // -> x
        mov     x3, x16                 // -> c

        mov     x5, x6                  // = n
        mov     x6, x7                  // -> cyv
  .endif

  .ifeqs "\mode", "smul"
        ldr     q4, [x3]
        zip2    v5.8h, v4.8h, v31.8h    // (y''_3, y'_3; y''_2, y'_2)
        zip1    v4.8h, v4.8h, v31.8h    // (y''_1, y'_1; y''_0, y'_0)

        // x2                           // -> x
        mov     x3, x1                  // -> c
        mov     x6, x5                  // -> cyv
        mov     x5, x4                  // = n
  .endif

  .ifeqs "\mode", "mmul"
        ldr     q2, [x5]
        zip2    v3.8h, v2.8h, v31.8h    // (v''_3, v'_3; v''_2, v'_2)
        zip1    v2.8h, v2.8h, v31.8h    // (v''_1, v'_1; v''_0, v'_0)

        ldr     q6, [x6]
        zip2    v7.8h, v6.8h, v31.8h    // (m''_3, m'_3; m''_2, m'_2)
        zip1    v6.8h, v6.8h, v31.8h    // (m''_1, m'_1; m''_0, m'_0)

        mov     x16, x1
        mov     x1, x3                  // -> u
        mov     x3, x16                 // -> c

        mov     x16, x2
        mov     x2, x4                  // -> x
        mov     x4, x16                 // -> y

        mov     x5, x7                  // = n
        ldr     x6, [STKARG(0)]         // -> cyv
  .endif

  .ifeqs "\mode", "mont"
        ldr     q6, [x4]
        zip2    v7.8h, v6.8h, v31.8h    // (m''_3, m'_3; m''_2, m'_2)
        zip1    v6.8h, v6.8h, v31.8h    // (m''_1, m'_1; m''_0, m'_0)

        mov     x4, x2                  // -> y
        mov     x2, x3                  // -> x
        mov     x3, x1                  // -> c

        // x5                           // = n
        // x6                           // -> cyv
  .endif
.endm

.macro  testldcarry
        ld1     {v28.2d-v30.2d}, [x3]
        mov     x15, #0
.endm

.macro  testtop
0:      sub     x5, x5, #1
.endm

.macro  testtail
        cbnz    x5, 0b
.endm

.macro  testcarryout
        // More complicated than usual because we must mix the general-
        // purpose carry back in.
        lsr     x15, x15, #16
        mov     v0.d[0], x15
        mov     v0.d[1], xzr
        add     v28.2d, v28.2d, v0.2d
        st1     {v28.2d-v30.2d}, [x3]
.endm

.macro  testepilogue
        popreg  x29, x30
        ret
.endm

FUNC(test_dmul4)
        testprologue dmul
        testldcarry
        testtop
        bl      dmul4
        testtail
        testcarryout
        testepilogue
ENDFUNC

FUNC(test_dmla4)
        testprologue dmul
        testldcarry
        testtop
        bl      dmla4
        testtail
        testcarryout
        testepilogue
ENDFUNC

FUNC(test_mul4)
        testprologue smul
        testldcarry
        testtop
        bl      mul4
        testtail
        testcarryout
        testepilogue
ENDFUNC

FUNC(test_mul4zc)
        testprologue smul
        testldcarry
        testtop
        bl      mul4zc
        testtail
        testcarryout
        testepilogue
ENDFUNC

FUNC(test_mla4)
        testprologue smul
        testldcarry
        testtop
        bl      mla4
        testtail
        testcarryout
        testepilogue
ENDFUNC

FUNC(test_mla4zc)
        testprologue smul
        testldcarry
        testtop
        bl      mla4zc
        testtail
        testcarryout
        testepilogue
ENDFUNC

FUNC(test_mmul4)
        testprologue mmul
        testtop
        bl      mmul4
        testtail
        stp     q4, q5, [x4]
        testcarryout
        testepilogue
ENDFUNC

FUNC(test_mmla4)
        testprologue mmul
        testtop
        bl      mmla4
        testtail
        stp     q4, q5, [x4]
        testcarryout
        testepilogue
ENDFUNC

FUNC(test_mont4)
        testprologue mont
        testtop
        bl      mont4
        testtail
        stp     q4, q5, [x4]
        testcarryout
        testepilogue
ENDFUNC

#endif

///----- That's all, folks --------------------------------------------------