--- /dev/null
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// Large SIMD-based multiplications
+///
+/// (c) 2019 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software: you can redistribute it and/or modify it
+/// under the terms of the GNU Library General Public License as published
+/// by the Free Software Foundation; either version 2 of the License, or
+/// (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful, but
+/// WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+/// Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb. If not, write to the Free Software
+/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+/// USA.
+
+///--------------------------------------------------------------------------
+/// Preliminaries.
+
+#include "config.h"
+#include "asm-common.h"
+
+ .arch armv7-a
+ .fpu neon
+
+ .text
+
+///--------------------------------------------------------------------------
+/// Theory.
+///
+/// We define a number of primitive fixed-size multipliers from which we can
+/// construct more general variable-length multipliers.
+///
+/// The basic trick is the same throughout. In an operand-scanning
+/// multiplication, the inner multiplication loop multiplies a multiple-
+/// precision operand by a single-precision factor, and adds the product,
+/// appropriately shifted, to the result.  A `finely integrated operand
+/// scanning' implementation of Montgomery multiplication also adds the
+/// product of a single-precision `Montgomery factor' and the modulus,
+/// calculated in the same pass. The more common `coarsely integrated
+/// operand scanning' alternates main multiplication and Montgomery passes,
+/// which requires additional carry propagation.
+///
+/// In both plain-multiplication and Montgomery stages, then, one of
+/// the factors remains constant throughout the operation, so we can afford
+/// to take a little time to preprocess it. The transformation we perform is
+/// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
+/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
+/// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
+/// pieces are placed into 32-bit cells, and arranged as two 128-bit NEON
+/// operands, as follows.
+///
+/// Offset 0 4 8 12
+/// 0 v'_0 v''_0 v'_1 v''_1
+/// 16 v'_2 v''_2 v'_3 v''_3
+///
+/// The `vmull' and `vmlal' instructions can multiply a vector of two 32-bit
+/// values by a 32-bit scalar, giving two 64-bit results; thus, a single
+/// instruction will act on (say) v'_0 and v''_0 to produce two 48-bit
+/// results in 64-bit fields.  The sixteen bits of headroom allow us to add
+/// many products together before we must deal with carrying; it also allows
+/// for some calculations to be performed on the above expanded form.
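+///
+/// As a hypothetical C sketch (for exposition only; `u' and the arrays
+/// here are not part of this file), the expansion and the effect of a
+/// single `vmull.u32' look like this:
+///
+///	uint32_t v[4], vx[8];		/* packed in; expanded out */
+///	for (unsigned i = 0; i < 4; i++) {
+///		vx[2*i] = v[i] & 0xffff;	/* v'_i */
+///		vx[2*i + 1] = v[i] >> 16;	/* v''_i */
+///	}
+///	/* one `vmull.u32' by a 32-bit scalar u: both products at once */
+///	uint64_t lo = (uint64_t)u*vx[0], hi = (uint64_t)u*vx[1];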
+///
+/// We maintain three `carry' registers, q13--q15, accumulating intermediate
+/// results; we name them `c0', `c1', and `c2'. Each carry register holds
+/// two 64-bit halves: the register c0, for example, holds c'_0 (low half)
+/// and c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the
+/// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2.
+/// The `vmull' or `vmlal' instruction acting on a scalar operand and an
+/// operand in the expanded form above produces a result which can be added
+/// directly to the appropriate carry register.
+///
+/// Multiplication is performed in product-scanning order, since ARM
+/// processors commonly implement result forwarding for consecutive multiply-
+/// and-accumulate instructions specifying the same destination.
+/// Experimentally, this runs faster than an operand-scanning arrangement
+/// which attempts to hide instruction latencies.
+///
+/// On 32-bit ARM, we have a reasonable number of registers: the expanded
+/// operands are kept in registers. The packed operands are read from memory
+/// into working registers q0 and q1. The following conventional argument
+/// names and locations are used throughout.
+///
+/// Arg Format Location Notes
+///
+/// U packed [r1]
+/// X packed [r2] In Montgomery multiplication, X = N
+/// V expanded q2/q3
+/// Y expanded q4/q5 In Montgomery multiplication, Y = (A + U V) M
+/// M expanded q4/q5 -N^{-1} (mod B^4)
+/// N Modulus, for Montgomery multiplication
+/// A packed [r0] Destination/accumulator
+/// C carry q13--q15
+///
+/// The calculation is some variant of
+///
+/// A' + C' B^4 <- U V + X Y + A + C
+///
+/// The low-level functions fit into a fairly traditional (finely-integrated)
+/// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y)
+/// (indexed by i).
+///
+/// The variants are as follows.
+///
+/// Function Variant Use i j
+///
+/// mmul4 A = C = 0 Montgomery 0 0
+/// dmul4 A = 0 Montgomery 0 +
+/// mmla4 C = 0 Montgomery + 0
+/// dmla4 exactly as shown Montgomery + +
+///
+/// mul4zc U = V = A = C = 0 Plain 0 0
+/// mul4 U = V = A = 0 Plain 0 +
+/// mla4zc U = V = C = 0 Plain + 0
+/// mla4 U = V = 0 Plain + +
+///
+/// The `mmul4' and `mmla4' functions are also responsible for calculating
+/// the Montgomery reduction factor Y = (A + U V) M used by the rest of the
+/// inner loop.
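+///
+/// For exposition, here is a hypothetical word-level C model of the
+/// generic step A' + C' B^4 <- U V + X Y + A + C; it ignores the
+/// redundant carry format and just uses 64-bit accumulators with eager
+/// splitting:
+///
+///	uint32_t u[4], v[4], x[4], y[4], a[4];	/* operands */
+///	uint64_t c[8];		/* carry C in c[0..3]; c[4..7] zero */
+///	unsigned i, j;
+///
+///	for (i = 0; i < 4; i++) c[i] += a[i];
+///	for (i = 0; i < 4; i++)
+///		for (j = 0; j < 4; j++) {
+///			uint64_t t = (uint64_t)u[i]*v[j];
+///			c[i + j] += (uint32_t)t; c[i + j + 1] += t >> 32;
+///			t = (uint64_t)x[i]*y[j];
+///			c[i + j] += (uint32_t)t; c[i + j + 1] += t >> 32;
+///		}
+///	for (i = 0; i < 4; i++)
+///		{ c[i + 1] += c[i] >> 32; a[i] = (uint32_t)c[i]; }
+///	/* now a[] holds A', and c[4..7] hold the outgoing carry C' */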
+
+///--------------------------------------------------------------------------
+/// Macro definitions.
+
+.macro mulacc z, u, v, x=nil, y=nil
+ // Set Z = Z + U V + X Y. X may be `nil' to omit the second
+ // operand. Z should be a 128-bit `qN' register; V and Y should be
+ // 64-bit `dN' registers; and U and X should be 32-bit `dN[I]'
+ // scalars; the multiplications produce two 64-bit elementwise
+ // products, which are added elementwise to Z.
+
+ vmlal.u32 \z, \v, \u
+ .ifnes "\x", "nil"
+ vmlal.u32 \z, \y, \x
+ .endif
+.endm
+
+.macro mulinit z, zinitp, u, v, x, y
+ // If ZINITP then set Z = Z + U V + X Y, as for `mulacc'; otherwise,
+ // set Z = U V + X Y. Operand requirements and detailed operation
+ // are as for `mulacc'.
+
+ .ifeqs "\zinitp", "t"
+ mulacc \z, \u, \v, \x, \y
+ .else
+ vmull.u32 \z, \v, \u
+ .ifnes "\x", "nil"
+ vmlal.u32 \z, \y, \x
+ .endif
+ .endif
+.endm
+
+// `MULI': accumulate the B^I and b B^I terms of the polynomial product sum U
+// V + X Y, given that U = u_0 + B u_1 + B^2 u_2 + B^3 u_3 (and similarly for
+// x), and V = v'_0 + b v''_0 + B (v'_1 + b v''_1) + B^2 (v'_2 + b v''_2) +
+// B^3 (v'_3 + b v''_3) (and similarly for Y). The 64-bit coefficients are
+// added into the low and high halves of the 128-bit register Z (if ZINITP
+// is `nil' then simply set Z, as if it were initially zero).
+#define MUL0(z, zinitp, u, v0, v1, x, y0, y1) \
+ mulinit z, zinitp, QW(u, 0), D0(v0), QW(x, 0), D0(y0)
+#define MUL1(z, zinitp, u, v0, v1, x, y0, y1) \
+ mulinit z, zinitp, QW(u, 0), D1(v0), QW(x, 0), D1(y0); \
+ mulacc z, QW(u, 1), D0(v0), QW(x, 1), D0(y0)
+#define MUL2(z, zinitp, u, v0, v1, x, y0, y1) \
+ mulinit z, zinitp, QW(u, 0), D0(v1), QW(x, 0), D0(y1); \
+ mulacc z, QW(u, 1), D1(v0), QW(x, 1), D1(y0); \
+ mulacc z, QW(u, 2), D0(v0), QW(x, 2), D0(y0)
+#define MUL3(z, zinitp, u, v0, v1, x, y0, y1) \
+ mulinit z, zinitp, QW(u, 0), D1(v1), QW(x, 0), D1(y1); \
+ mulacc z, QW(u, 1), D0(v1), QW(x, 1), D0(y1); \
+ mulacc z, QW(u, 2), D1(v0), QW(x, 2), D1(y0); \
+ mulacc z, QW(u, 3), D0(v0), QW(x, 3), D0(y0)
+#define MUL4(z, zinitp, u, v0, v1, x, y0, y1) \
+ mulinit z, zinitp, QW(u, 1), D1(v1), QW(x, 1), D1(y1); \
+ mulacc z, QW(u, 2), D0(v1), QW(x, 2), D0(y1); \
+ mulacc z, QW(u, 3), D1(v0), QW(x, 3), D1(y0)
+#define MUL5(z, zinitp, u, v0, v1, x, y0, y1) \
+ mulinit z, zinitp, QW(u, 2), D1(v1), QW(x, 2), D1(y1); \
+ mulacc z, QW(u, 3), D0(v1), QW(x, 3), D0(y1)
+#define MUL6(z, zinitp, u, v0, v1, x, y0, y1) \
+ mulinit z, zinitp, QW(u, 3), D1(v1), QW(x, 3), D1(y1)
+
+// Steps in the process of propagating carry bits from ZLO to ZHI (both
+// 128-bit `qN' registers). Here, T is a 128-bit `qN' temporary register.
+// Set the low 32 bits of the 64-bit `dN' register ZOUT to the completed
+// coefficient z_i.
+//
+// In detail, what happens is as follows. Suppose initially that ZLO =
+// (z'_i; z''_i) and ZHI = (z'_{i+1}; z''_{i+1}). Let t = z'_i + b z''_i;
+// observe that floor(t/b) = floor(z'_i/b) + z''_i. Let z_i = t mod B, and
+// add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}. This has
+// a circuit depth of 4; I don't know how to do better.
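+//
+// A hypothetical C sketch of one round (for exposition only): with the
+// two 64-bit halves in ZLO0 = z'_i and ZLO1 = z''_i,
+//
+//	uint64_t z = zlo0 + (zlo1 << 16);  /* z_i = t mod B, low 32 bits */
+//	uint64_t c = (zlo0 >> 16) + zlo1;  /* floor(t/b) */
+//	zhi0 += c >> 16;		   /* carry floor(t/B) upwards */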
+.macro _carry0 zout, zlo0, zlo1, t0, t1
+ // ZLO0 and ZLO1 are the low and high halves of a carry register.
+ // Extract a 32-bit output, in the bottom 32 bits of ZOUT, and set T1
+ // so as to continue processing using `_carry1'. All operands are
+ // 64-bit `dN' registers. If ZOUT is `nil' then no output is
+ // produced; if T1 is `nil' then no further processing will be
+ // possible.
+ .ifnes "\zout", "nil"
+ vshl.i64 \t0, \zlo1, #16
+ .endif
+ .ifnes "\t1", "nil"
+ vshr.u64 \t1, \zlo0, #16
+ .endif
+ .ifnes "\zout", "nil"
+ vadd.i64 \zout, \zlo0, \t0
+ .endif
+ .ifnes "\t1", "nil"
+ vadd.i64 \t1, \t1, \zlo1
+ .endif
+.endm
+.macro _carry1 u, zhi0, t1
+ // ZHI0 is the low half of a carry register, and T1 is the result of
+ // applying `_carry0' to the previous carry register. Set U to the
+ // result of propagating the carry into ZHI0.
+ vshr.u64 \t1, \t1, #16
+ vadd.i64 \u, \zhi0, \t1
+.endm
+
+// More convenient wrappers for `_carry0' and `_carry1'.
+//
+// CARRY0(ZOUT, ZLO, T)
+// Store a 32-bit output in ZOUT from carry ZLO, using T as a
+// temporary. ZOUT is a 64-bit `dN' register; ZLO and T are 128-bit
+// `qN' registers.
+//
+// CARRY1(ZHI, T)
+// Propagate carry from T into ZHI. Both are 128-bit `qN' registers;
+// ZHI is updated.
+#define CARRY0(zout, zlo, t) \
+ CASIDE0(zout, D0(zlo), zlo, t)
+#define CARRY1(zhi, t) \
+ CASIDE1(D0(zhi), zhi, t)
+
+// Versions of `CARRY0' and `CARRY1' which don't mutate their operands.
+//
+// CASIDE0(ZOUT, U, ZLO, T)
+// As for `CARRY0', but the low half of ZLO is actually in U (a 64-bit
+// `dN' register).
+//
+// CASIDE0E(ZOUT, U, ZLO, T)
+// As for `CASIDE0', but only calculate the output word, and no
+// carry-out.
+//
+// CASIDE1(U, ZHI, T)
+// As for `CARRY1', but write the updated low half of ZHI to U.
+#define CASIDE0(zout, u, zlo, t) \
+ _carry0 zout, u, D1(zlo), D0(t), D1(t)
+#define CASIDE0E(zout, u, zlo, t) \
+ _carry0 zout, u, D1(zlo), D0(t), nil
+#define CASIDE1(u, zhi, t) \
+ _carry1 u, D0(zhi), D1(t)
+
+// Steps in spreading a packed 128-bit operand in A0 across A0, A1, A2, A3 in
+// carry format.
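+// For example, if A0 = (a_0, a_1, a_2, a_3) as packed 32-bit words, then
+// after `SPREADACC0' and `SPREADACC1' we have A0 = (a_0; 0), A1 =
+// (a_1; 0), A2 = (a_2; 0), and A3 = (a_3; 0): each a_i becomes a
+// carry-format value with zero high half.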
+#define SPREADACC0(a0, a1, a2, a3) \
+ vmov.i32 a1, #0; \
+ vmov.i32 a2, #0; \
+ vmov.i32 a3, #0
+#define SPREADACC1(a0, a1, a2, a3) \
+ vswp D1(a0), D0(a2); \
+ vtrn.32 D0(a0), D0(a1); \
+ vtrn.32 D0(a2), D0(a3)
+
+// Add the carry-format values A0, A1, A2 into the existing carries C0, C1,
+// C2 (leaving A3 where it is).
+#define CARRYACC(a0, a1, a2, a3, c0, c1, c2) \
+ vadd.i64 c0, c0, a0; \
+ vadd.i64 c1, c1, a1; \
+ vadd.i64 c2, c2, a2
+
+///--------------------------------------------------------------------------
+/// Primitive multipliers and related utilities.
+
+INTFUNC(carryprop)
+ // On entry, r0 points to a destination, and q13--q15 hold incoming
+ // carries c0--c2. On exit, the low 128 bits of the carry value are
+ // stored at [r0]; the remaining 16 bits of carry are left in d30; r0
+ // is advanced by 16; and q10--q14 are clobbered.
+ endprologue
+
+ CARRY0(D0(q10), q13, q12)
+ CARRY1(q14, q12)
+ CARRY0(D0(q11), q14, q12)
+ CARRY1(q15, q12)
+ CARRY0(D1(q10), q15, q12)
+ vshr.u64 D1(q11), D1(q12), #16
+ vshr.u64 D0(q15), D1(q12), #48
+ vtrn.32 q10, q11
+ vst1.32 {q10}, [r0]!
+ bx r14
+ENDFUNC
+
+INTFUNC(dmul4)
+ // On entry, r0 points to the destination; r1 and r2 point to packed
+ // operands U and X; q2/q3 and q4/q5 hold expanded operands V and Y;
+ // and q13--q15 hold incoming carries c0--c2. On exit, the
+ // destination and carries are updated; r0, r1, r2 are each advanced
+ // by 16; q2--q5 are preserved; and the other NEON registers are
+ // clobbered.
+ endprologue
+
+ // Start by loading the operand words from memory.
+ vld1.32 {q0}, [r1]!
+ vld1.32 {q1}, [r2]!
+
+ // Do the multiplication.
+ MUL0(q13, t, q0, q2, q3, q1, q4, q5)
+ MUL1(q14, t, q0, q2, q3, q1, q4, q5)
+ CARRY0(D0(q8), q13, q6)
+ MUL2(q15, t, q0, q2, q3, q1, q4, q5)
+ CARRY1(q14, q6)
+ CARRY0(D0(q9), q14, q6)
+ MUL3(q12, nil, q0, q2, q3, q1, q4, q5)
+ CARRY1(q15, q6)
+ CARRY0(D1(q8), q15, q6)
+ MUL4(q13, nil, q0, q2, q3, q1, q4, q5)
+ CARRY1(q12, q6)
+ CARRY0(D1(q9), q12, q6)
+ MUL5(q14, nil, q0, q2, q3, q1, q4, q5)
+ CARRY1(q13, q6)
+ MUL6(q15, nil, q0, q2, q3, q1, q4, q5)
+
+ // Finish up and store the result.
+ vtrn.32 q8, q9
+ vst1.32 {q8}, [r0]!
+
+ // All done.
+ bx r14
+ENDFUNC
+
+INTFUNC(dmla4)
+ // On entry, r0 points to the destination/accumulator; r1 and r2
+ // point to packed operands U and X; q2/q3 and q4/q5 hold expanded
+ // operands V and Y; and q13--q15 hold incoming carries c0--c2. On
+ // exit, the accumulator and carries are updated; r0, r1, r2 are each
+ // advanced by 16; q2--q5 are preserved; and the other NEON registers
+ // are clobbered.
+ endprologue
+
+ // Start by loading the operand words from memory.
+ vld1.32 {q9}, [r0]
+ SPREADACC0(q9, q10, q11, q12)
+ vld1.32 {q0}, [r1]!
+ vld1.32 {q1}, [r2]!
+
+ // Add the accumulator input to the incoming carries. Split the
+ // accumulator into four pieces and add the carries onto them.
+ SPREADACC1(q9, q10, q11, q12)
+ CARRYACC(q9, q10, q11, q12, q13, q14, q15)
+
+ // Do the multiplication.
+ MUL0(q13, t, q0, q2, q3, q1, q4, q5)
+ MUL1(q14, t, q0, q2, q3, q1, q4, q5)
+ CARRY0(D0(q8), q13, q6)
+ MUL2(q15, t, q0, q2, q3, q1, q4, q5)
+ CARRY1(q14, q6)
+ CARRY0(D0(q9), q14, q6)
+ MUL3(q12, t, q0, q2, q3, q1, q4, q5)
+ CARRY1(q15, q6)
+ CARRY0(D1(q8), q15, q6)
+ MUL4(q13, nil, q0, q2, q3, q1, q4, q5)
+ CARRY1(q12, q6)
+ CARRY0(D1(q9), q12, q6)
+ MUL5(q14, nil, q0, q2, q3, q1, q4, q5)
+ CARRY1(q13, q6)
+ MUL6(q15, nil, q0, q2, q3, q1, q4, q5)
+
+ // Finish up and store the result.
+ vtrn.32 q8, q9
+ vst1.32 {q8}, [r0]!
+
+ // All done.
+ bx r14
+ENDFUNC
+
+INTFUNC(mul4)
+ // On entry, r0 points to the destination; r2 points to a packed
+ // operand X; q4/q5 holds an expanded operand Y; and q13--q15 hold
+ // incoming carries c0--c2. On exit, the destination and carries are
+ // updated; r0 and r2 are each advanced by 16; q4 and q5 are
+ // preserved; and the other NEON registers are clobbered.
+ endprologue
+
+ // Start by loading the operand words from memory.
+ vld1.32 {q1}, [r2]!
+
+ // Do the multiplication.
+ MUL0(q13, t, q1, q4, q5, nil, nil, nil)
+ MUL1(q14, t, q1, q4, q5, nil, nil, nil)
+ CARRY0(D0(q8), q13, q6)
+ MUL2(q15, t, q1, q4, q5, nil, nil, nil)
+ CARRY1(q14, q6)
+ CARRY0(D0(q9), q14, q6)
+ MUL3(q12, nil, q1, q4, q5, nil, nil, nil)
+ CARRY1(q15, q6)
+ CARRY0(D1(q8), q15, q6)
+ MUL4(q13, nil, q1, q4, q5, nil, nil, nil)
+ CARRY1(q12, q6)
+ CARRY0(D1(q9), q12, q6)
+ MUL5(q14, nil, q1, q4, q5, nil, nil, nil)
+ CARRY1(q13, q6)
+ MUL6(q15, nil, q1, q4, q5, nil, nil, nil)
+
+ // Finish up and store the result.
+ vtrn.32 q8, q9
+ vst1.32 {q8}, [r0]!
+
+ // All done.
+ bx r14
+ENDFUNC
+
+INTFUNC(mul4zc)
+ // On entry, r0 points to the destination; r2 points to a packed
+ // operand X; and q4/q5 holds an expanded operand Y. On exit, the
+ // destination is updated; q13--q15 hold outgoing carries c0--c2; r0
+ // and r2 are each advanced by 16; q4 and q5 are preserved; and the
+ // other NEON registers are clobbered.
+ endprologue
+
+ // Start by loading the operand words from memory.
+ vld1.32 {q1}, [r2]!
+
+ // Do the multiplication.
+ MUL0(q13, nil, q1, q4, q5, nil, nil, nil)
+ MUL1(q14, nil, q1, q4, q5, nil, nil, nil)
+ CARRY0(D0(q8), q13, q6)
+ MUL2(q15, nil, q1, q4, q5, nil, nil, nil)
+ CARRY1(q14, q6)
+ CARRY0(D0(q9), q14, q6)
+ MUL3(q12, nil, q1, q4, q5, nil, nil, nil)
+ CARRY1(q15, q6)
+ CARRY0(D1(q8), q15, q6)
+ MUL4(q13, nil, q1, q4, q5, nil, nil, nil)
+ CARRY1(q12, q6)
+ CARRY0(D1(q9), q12, q6)
+ MUL5(q14, nil, q1, q4, q5, nil, nil, nil)
+ CARRY1(q13, q6)
+ MUL6(q15, nil, q1, q4, q5, nil, nil, nil)
+
+ // Finish up and store the result.
+ vtrn.32 q8, q9
+ vst1.32 {q8}, [r0]!
+
+ // All done.
+ bx r14
+ENDFUNC
+
+INTFUNC(mla4)
+ // On entry, r0 points to the destination/accumulator; r2 points to a
+ // packed operand X; q4/q5 holds an expanded operand Y; and q13--q15
+ // hold incoming carries c0--c2. On exit, the accumulator and
+ // carries are updated; r0 and r2 are each advanced by 16; q4 and q5
+ // are preserved; and the other NEON registers are clobbered.
+ endprologue
+
+ // Start by loading the operand words from memory.
+ vld1.32 {q9}, [r0]
+ SPREADACC0(q9, q10, q11, q12)
+ vld1.32 {q1}, [r2]!
+
+ // Add the accumulator input to the incoming carries. Split the
+ // accumulator into four pieces and add the carries onto them.
+ SPREADACC1(q9, q10, q11, q12)
+ CARRYACC(q9, q10, q11, q12, q13, q14, q15)
+
+ // Do the multiplication.
+mla4_common:
+ MUL0(q13, t, q1, q4, q5, nil, nil, nil)
+ MUL1(q14, t, q1, q4, q5, nil, nil, nil)
+ CARRY0(D0(q8), q13, q6)
+ MUL2(q15, t, q1, q4, q5, nil, nil, nil)
+ CARRY1(q14, q6)
+ CARRY0(D0(q9), q14, q6)
+ MUL3(q12, t, q1, q4, q5, nil, nil, nil)
+ CARRY1(q15, q6)
+ CARRY0(D1(q8), q15, q6)
+ MUL4(q13, nil, q1, q4, q5, nil, nil, nil)
+ CARRY1(q12, q6)
+ CARRY0(D1(q9), q12, q6)
+ MUL5(q14, nil, q1, q4, q5, nil, nil, nil)
+ CARRY1(q13, q6)
+ MUL6(q15, nil, q1, q4, q5, nil, nil, nil)
+
+ // Finish up and store the result.
+ vtrn.32 q8, q9
+ vst1.32 {q8}, [r0]!
+
+ // All done.
+ bx r14
+ENDFUNC
+
+INTFUNC(mla4zc)
+ // On entry, r0 points to the destination/accumulator; r2 points to a
+ // packed operand X; and q4/q5 holds an expanded operand Y. On exit,
+ // the accumulator is updated; q13--q15 hold outgoing carries c0--c2;
+ // r0 and r2 are each advanced by 16; q4 and q5 are preserved; and
+ // the other NEON registers are clobbered.
+ endprologue
+
+ // Start by loading the operand words from memory.
+ vld1.32 {q13}, [r0]
+ SPREADACC0(q13, q14, q15, q12)
+ vld1.32 {q1}, [r2]!
+
+ // Move the accumulator input to the incoming carry slots. Split the
+ // accumulator into four pieces.
+ SPREADACC1(q13, q14, q15, q12)
+
+ b mla4_common
+ENDFUNC
+
+INTFUNC(mmul4)
+ // On entry, r0 points to the destination; r1 points to a packed
+ // operand U; r2 points to a packed operand X (the modulus); q2/q3
+ // holds an expanded operand V; and q4/q5 holds an expanded operand M
+ // (the Montgomery factor -N^{-1} (mod B)). On exit, the destination
+ // is updated (to zero); q4/q5 hold an expanded factor Y = U V M (mod
+ // B); q13--q15 hold outgoing carries c0--c2; r0, r1, and r2 are each
+ // advanced by 16; q2 and q3 are preserved; and the other NEON
+ // registers are clobbered.
+	endprologue
+
+ // Start by loading the operand words from memory.
+ vld1.32 {q0}, [r1]!
+
+ // Calculate the low half of W = A + U V, being careful to leave the
+ // carries in place.
+ MUL0(q13, nil, q0, q2, q3, nil, nil, nil)
+ MUL1(q14, nil, q0, q2, q3, nil, nil, nil)
+ CARRY0(D0(q6), q13, q8)
+ MUL2(q15, nil, q0, q2, q3, nil, nil, nil)
+ CASIDE1(D0(q9), q14, q8)
+ CASIDE0(D0(q7), D0(q9), q14, q8)
+ MUL3(q12, nil, q0, q2, q3, nil, nil, nil)
+ b mmla4_common
+ENDFUNC
+
+INTFUNC(mmla4)
+ // On entry, r0 points to the destination/accumulator A; r1 points to
+ // a packed operand U; r2 points to a packed operand X (the modulus);
+ // q2/q3 holds an expanded operand V; and q4/q5 holds an expanded
+ // operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the
+ // accumulator is updated (to zero); q4/q5 hold an expanded factor Y
+ // = (A + U V) M (mod B); q13--q15 hold outgoing carries c0--c2; r0,
+ // r1, and r2 are each advanced by 16; q2 and q3 are preserved; and
+ // the other NEON registers are clobbered.
+ endprologue
+
+ // Start by loading the operand words from memory.
+ vld1.32 {q13}, [r0]
+ SPREADACC0(q13, q14, q15, q12)
+ vld1.32 {q0}, [r1]!
+
+ // Move the accumulator input to the incoming carry slots. Split the
+ // accumulator into four pieces.
+ SPREADACC1(q13, q14, q15, q12)
+
+ // Calculate the low half of W = A + U V, being careful to leave the
+ // carries in place.
+ MUL0(q13, t, q0, q2, q3, nil, nil, nil)
+ MUL1(q14, t, q0, q2, q3, nil, nil, nil)
+ CARRY0(D0(q6), q13, q8)
+ MUL2(q15, t, q0, q2, q3, nil, nil, nil)
+ CASIDE1(D0(q9), q14, q8)
+ CASIDE0(D0(q7), D0(q9), q14, q8)
+ MUL3(q12, t, q0, q2, q3, nil, nil, nil)
+mmla4_common:
+ CASIDE1(D0(q9), q15, q8)
+ CASIDE0(D1(q6), D0(q9), q15, q8)
+ CASIDE1(D0(q9), q12, q8)
+ CASIDE0E(D1(q7), D0(q9), q12, q8)
+ vtrn.32 q6, q7
+
+ // Calculate the low half of the Montgomery factor Y = W M. At this
+ // point, registers are a little tight.
+ MUL0( q8, nil, q6, q4, q5, nil, nil, nil)
+ MUL1( q9, nil, q6, q4, q5, nil, nil, nil)
+ CARRY0(D0(q8), q8, q1)
+ MUL2(q10, nil, q6, q4, q5, nil, nil, nil)
+ CARRY1(q9, q1)
+ CARRY0(D0(q9), q9, q1)
+ MUL3(q11, nil, q6, q4, q5, nil, nil, nil)
+ CARRY1(q10, q1)
+ CARRY0(D1(q8), q10, q1)
+ vmov.i32 q5, #0
+ CARRY1(q11, q1)
+ CARRY0(D1(q9), q11, q1)
+ vld1.32 {q1}, [r2]!
+ vtrn.32 q8, q9
+
+ // Expand Y. We'll put it in its proper place a bit later.
+ vzip.16 q8, q5
+
+ // Build up the product X Y in the carry slots.
+ MUL0(q13, t, q1, q8, q5, nil, nil, nil)
+ MUL1(q14, t, q1, q8, q5, nil, nil, nil)
+ CARRY0(nil, q13, q9)
+ MUL2(q15, t, q1, q8, q5, nil, nil, nil)
+ CARRY1(q14, q9)
+ vmov q4, q8
+ CARRY0(nil, q14, q9)
+ MUL3(q12, t, q1, q8, q5, nil, nil, nil)
+ CARRY1(q15, q9)
+ CARRY0(nil, q15, q9)
+ vmov.u32 q6, #0
+
+ // And complete the calculation.
+ MUL4(q13, nil, q0, q2, q3, q1, q4, q5)
+ CARRY1(q12, q9)
+ CARRY0(nil, q12, q9)
+ MUL5(q14, nil, q0, q2, q3, q1, q4, q5)
+ CARRY1(q13, q9)
+ MUL6(q15, nil, q0, q2, q3, q1, q4, q5)
+
+ // Finish up and store the result.
+ vst1.32 {q6}, [r0]!
+
+ // All done.
+ bx r14
+ENDFUNC
+
+INTFUNC(mont4)
+ // On entry, r0 points to the destination/accumulator A; r2 points to
+ // a packed operand X (the modulus); and q2/q3 holds an expanded
+ // operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the
+ // accumulator is updated (to zero); q4/q5 hold an expanded factor Y
+ // = A M (mod B); q13--q15 hold outgoing carries c0--c2; r0 and r2
+ // are each advanced by 16; q2 and q3 are preserved; and the other
+ // NEON registers are clobbered.
+ endprologue
+
+ // Start by loading the operand words from memory.
+ vld1.32 {q0}, [r0]
+ vld1.32 {q1}, [r2]!
+
+ // Calculate Y = A M (mod B).
+ MUL0(q8, nil, q0, q2, q3, nil, nil, nil)
+ MUL1(q9, nil, q0, q2, q3, nil, nil, nil)
+ CARRY0(D0(q4), q8, q6)
+ MUL2(q10, nil, q0, q2, q3, nil, nil, nil)
+ CARRY1(q9, q6)
+ vmov q13, q0
+ CARRY0(D0(q7), q9, q6)
+ MUL3(q11, nil, q0, q2, q3, nil, nil, nil)
+ CARRY1(q10, q6)
+ CARRY0(D1(q4), q10, q6)
+ SPREADACC0(q13, q14, q15, q12)
+ CARRY1(q11, q6)
+ CARRY0(D1(q7), q11, q6)
+ SPREADACC1(q13, q14, q15, q12)
+ vmov.i32 q5, #0
+ vtrn.32 q4, q7
+ vzip.16 q4, q5
+
+ // Calculate the actual result. Well, the carries, at least.
+ vmov.i32 q8, #0
+ MUL0(q13, t, q1, q4, q5, nil, nil, nil)
+ MUL1(q14, t, q1, q4, q5, nil, nil, nil)
+ CARRY0(nil, q13, q6)
+ MUL2(q15, t, q1, q4, q5, nil, nil, nil)
+ CARRY1(q14, q6)
+ CARRY0(nil, q14, q6)
+ MUL3(q12, t, q1, q4, q5, nil, nil, nil)
+ CARRY1(q15, q6)
+ CARRY0(nil, q15, q6)
+ MUL4(q13, nil, q1, q4, q5, nil, nil, nil)
+ CARRY1(q12, q6)
+ CARRY0(nil, q12, q6)
+ MUL5(q14, nil, q1, q4, q5, nil, nil, nil)
+ CARRY1(q13, q6)
+ MUL6(q15, nil, q1, q4, q5, nil, nil, nil)
+
+ // Finish up and store the result.
+ vst1.32 {q8}, [r0]!
+
+ // All done.
+ bx r14
+ENDFUNC
+
+///--------------------------------------------------------------------------
+/// Bulk multipliers.
+
+FUNC(mpx_umul4_arm_neon)
+ // void mpx_umul4_arm_neon(mpw *dv, const mpw *av, const mpw *avl,
+ // const mpw *bv, const mpw *bvl);
+
+ // Establish the arguments and do initial setup.
+ //
+ // inner loop dv r0
+ // inner loop av r2
+ // outer loop dv r5
+ // outer loop bv r3
+ // av base r1
+ // av limit r12
+ // bv limit r4
+ pushreg r4, r5, r14
+ pushvfp QQ(q4, q7)
+ endprologue
+
+ // Prepare for the first iteration.
+ vld1.32 {q4}, [r3]! // = Y = bv[0]
+ vmov.i32 q5, #0
+ // r0 // = dv for inner loop
+ // r1 // = av base
+ // r3 // = bv for outer loop
+ ldr r4, [sp, #76] // = bv limit
+ mov r12, r2 // = av limit
+ mov r2, r1 // = av for inner loop
+ add r5, r0, #16 // = dv for outer loop
+ vzip.16 q4, q5 // expand Y
+ bl mul4zc
+ cmp r2, r12 // all done?
+ bhs 8f
+
+ // Continue with the first iteration.
+0: bl mul4
+ cmp r2, r12 // all done?
+ blo 0b
+
+ // Write out the leftover carry. There can be no tail here.
+8: bl carryprop
+ cmp r3, r4 // more passes to do?
+ bhs 9f
+
+ // Set up for the next pass.
+1: vld1.32 {q4}, [r3]! // = Y = bv[i]
+ vmov.i32 q5, #0
+ mov r0, r5 // -> dv[i]
+ mov r2, r1 // -> av[0]
+ add r5, r5, #16
+ vzip.16 q4, q5 // expand Y
+ bl mla4zc
+ cmp r2, r12 // done yet?
+ bhs 8f
+
+ // Continue...
+0: bl mla4
+ cmp r2, r12
+ blo 0b
+
+ // Finish off this pass. There was no tail on the previous pass, and
+	// there can be none on this pass.
+8: bl carryprop
+ cmp r3, r4
+ blo 1b
+
+ // All over.
+9: popvfp QQ(q4, q7)
+ popreg r4, r5, r14
+ bx r14
+ENDFUNC
+
+FUNC(mpxmont_mul4_arm_neon)
+ // void mpxmont_mul4_arm_neon(mpw *dv, const mpw *av, const mpw *bv,
+ // const mpw *nv, size_t n, const mpw *mi);
+
+ // Establish the arguments and do initial setup.
+ //
+ // inner loop dv r0
+ // inner loop av r1
+ // inner loop nv r2
+ // mi r5
+ // outer loop dv r6
+ // outer loop bv r7
+ // av base r8
+ // av limit r9
+ // bv limit r4
+ // nv base r3
+ // n r4
+ // c r10
+ // 0 r12
+
+ pushreg r4-r10, r14
+ pushvfp QQ(q4, q7)
+ endprologue
+
+ // Establish the expanded operands.
+ ldrd r4, r5, [sp, #96] // r4 = n; r5 -> mi
+ vld1.32 {q2}, [r2] // = V = bv[0]
+ vmov.i32 q3, #0
+ vmov.i32 q5, #0
+ vld1.32 {q4}, [r5] // = M
+
+ // Set up the outer loop state and prepare for the first iteration.
+ // r0 // -> dv for inner loop
+ // r1 // -> av for inner loop
+ add r7, r2, #16 // -> bv
+ // r3 // -> nv
+ add r6, r0, #16 // -> dv
+ mov r8, r1 // -> av
+ add r9, r1, r4, lsl #2 // -> av limit
+ add r4, r2, r4, lsl #2 // -> bv limit
+ mov r2, r3 // -> nv for inner loop
+ mov r12, #0 // = 0
+
+ vzip.16 q2, q3 // expand V
+ vzip.16 q4, q5 // expand M
+ bl mmul4
+ cmp r1, r9 // done already?
+ bhs 8f
+
+ // Complete the first inner loop.
+0: bl dmul4
+ cmp r1, r9 // done yet?
+ blo 0b
+
+ // Still have carries left to propagate. Rather than store the tail
+ // end in memory, keep it in a general-purpose register for later.
+ bl carryprop
+ vmov.32 r10, QW(q15, 0)
+
+ // Embark on the next iteration. (There must be one. If n = 1 then
+ // we would have bailed above, to label 8. Similarly, the subsequent
+ // iterations can fall into the inner loop immediately.)
+1: vld1.32 {q2}, [r7]! // = V = bv[i]
+ vld1.32 {q4}, [r5] // = M
+ vmov.i32 q3, #0
+ vmov.i32 q5, #0
+ mov r0, r6 // -> dv[i]
+ add r6, r6, #16
+ mov r1, r8 // -> av[0]
+ mov r2, r3 // -> nv[0]
+ vzip.16 q2, q3 // expand V
+ vzip.16 q4, q5 // expand M
+ bl mmla4
+
+ // Complete the next inner loop.
+0: bl dmla4
+ cmp r1, r9 // done yet?
+ blo 0b
+
+ // Still have carries left to propagate, and they overlap the
+ // previous iteration's final tail, so read that and add it.
+ cmp r7, r4
+ vmov.32 D0(q12), r10, r12
+ vadd.i64 D0(q13), D0(q13), D0(q12)
+ bl carryprop
+ vmov.32 r10, QW(q15, 0)
+
+ // Back again, maybe.
+ blo 1b
+
+ // All done, almost.
+ str r10, [r0], #4
+ popvfp QQ(q4, q7)
+ popreg r4-r10, r14
+ bx r14
+
+ // First iteration was short. Write out the carries and we're done.
+ // (This could be folded into the main loop structure, but that would
+ // penalize small numbers more.)
+8: bl carryprop
+ vst1.32 {QW(q15, 0)}, [r0]!
+ popvfp QQ(q4, q7)
+ popreg r4-r10, r14
+ bx r14
+ENDFUNC
+
+FUNC(mpxmont_redc4_arm_neon)
+ // void mpxmont_redc4_arm_neon(mpw *dv, mpw *dvl, const mpw *nv,
+ // size_t n, const mpw *mi);
+
+ // Establish the arguments and do initial setup.
+ //
+ // inner loop dv r0
+ // dv limit r1
+ // inner loop nv r2
+ // blocks-of-4 dv limit r3
+ // mi (r14)
+ // outer loop dv r4
+ // outer loop dv limit r5
+ // nv base r6
+ // nv limit r7
+ // n r3
+ // c (r14)
+ // t0, t1, t2, t3 r2, r8, r9, r10
+ // 0 r12
+
+ pushreg r4-r10, r14
+ pushvfp QQ(q4, q7)
+ endprologue
+
+ // Set up the outer loop state and prepare for the first iteration.
+ ldr r14, [sp, #96] // -> mi
+ vmov.i32 q3, #0
+ sub r12, r1, r0 // total dv bytes
+ // r0 // -> dv for inner loop
+ // r1 // -> overall dv limit
+ // r2 // -> nv for inner loop
+ // r3 // = n (for now)
+ add r4, r0, #16 // -> dv for outer loop
+ add r5, r0, r3, lsl #2 // -> dv limit
+ bic r12, r12, #15 // dv blocks of 4
+ vld1.32 {q2}, [r14] // = M
+ mov r6, r2 // -> nv
+ add r7, r2, r3, lsl #2 // -> nv limit
+ add r3, r0, r12 // -> dv blocks-of-4 limit
+ vzip.16 q2, q3 // expand M
+ mov r12, #0 // = 0
+ bl mont4
+ cmp r2, r7 // done already?
+ bhs 8f
+
+5: bl mla4
+ cmp r2, r7 // done yet?
+ blo 5b
+
+ // Still have carries left to propagate. Adding the accumulator
+ // block into the carries is a little different this time, because
+ // all four accumulator limbs have to be squished into the three
+ // carry registers for `carryprop' to do its thing.
+8: vld1.32 {q9}, [r0]
+ SPREADACC0(q9, q10, q11, q12)
+ SPREADACC1(q9, q10, q11, q12)
+ vshl.u64 D0(q12), D0(q12), #16
+ CARRYACC(q9, q10, q11, q12, q13, q14, q15)
+ vadd.u64 D1(q15), D1(q15), D0(q12)
+
+ bl carryprop
+ vmov.32 r14, QW(q15, 0)
+ cmp r0, r3
+ bhs 7f
+
+ // Propagate the first group of carries.
+ ldmia r0, {r2, r8-r10}
+ adds r2, r2, r14
+ adcs r8, r8, #0
+ adcs r9, r9, #0
+ adcs r10, r10, #0
+ stmia r0!, {r2, r8-r10}
+ teq r0, r3
+ beq 6f
+
+ // Continue carry propagation until the end of the buffer.
+0: ldmia r0, {r2, r8-r10}
+ adcs r2, r2, #0
+ adcs r8, r8, #0
+ adcs r9, r9, #0
+ adcs r10, r10, #0
+ stmia r0!, {r2, r8-r10}
+ teq r0, r3
+ bne 0b
+
+ // Deal with the tail end. Note that the actual destination length
+	// won't be an exact number of blocks of four, so it's safe to just
+ // drop through here.
+6: adc r14, r12, #0
+7: ldr r2, [r0]
+ adds r2, r2, r14
+ str r2, [r0], #4
+ teq r0, r1
+ beq 8f
+0: ldr r2, [r0]
+ adcs r2, r2, #0
+ str r2, [r0], #4
+ teq r0, r1
+ bne 0b
+
+ // All done for this iteration. Start the next.
+8: cmp r4, r5
+ bhs 9f
+ mov r0, r4
+ add r4, r4, #16
+ mov r2, r6
+ bl mont4
+ b 5b
+
+ // All over.
+9: popvfp QQ(q4, q7)
+ popreg r4-r10, r14
+ bx r14
+ENDFUNC
+
+///--------------------------------------------------------------------------
+/// Testing and performance measurement.
+
+#ifdef TEST_MUL4
+
+//			local	dmul	smul	mmul	mont
+// z		r0	r0	r0	r0	r0
+// c		r4	r1	r1	r1	r1
+// y		r3	--	--	r2	r2
+// u		r1	r2	--	r3	--
+// x		r2	r3	r2	stk0	r3
+// vv		q2/q3	stk0	--	stk1	stk0
+// yy		q4/q5	stk1	r3	stk2	--
+// n		r5	stk2	stk0	stk3	stk1
+// cyv		r6	stk3	stk1	stk4	stk2
+
+#define STKARG(i) sp, #80 + 4*(i)
+
+.macro testprologue mode
+ pushreg r4-r6, r14
+ pushvfp QQ(q4, q7)
+ endprologue
+
+ .ifeqs "\mode", "dmul"
+ mov r4, r1 // -> c
+ mov r1, r2 // -> u
+ mov r2, r3 // -> x
+
+ ldr r14, [STKARG(0)] // -> vv
+ vld1.32 {q2}, [r14]
+ vmov.i32 q3, #0
+ vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1)
+
+ ldr r14, [STKARG(1)] // -> yy
+ vld1.32 {q4}, [r14]
+ vmov.i32 q5, #0
+ vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1)
+
+ ldr r5, [STKARG(2)] // = n
+ ldr r6, [STKARG(3)] // -> cyv
+ .endif
+
+ .ifeqs "\mode", "smul"
+ mov r4, r1 // -> c
+ // r2 // -> x
+
+ vld1.32 {q4}, [r3]
+ vmov.i32 q5, #0
+ vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1)
+
+ ldr r5, [STKARG(0)] // = n
+ ldr r6, [STKARG(1)] // -> cyv
+ .endif
+
+ .ifeqs "\mode", "mmul"
+ mov r4, r1 // -> c
+ mov r1, r3 // -> u
+ mov r3, r2 // -> y
+ ldr r2, [STKARG(0)] // -> x
+
+ ldr r14, [STKARG(1)] // -> vv
+ vld1.32 {q2}, [r14]
+ vmov.i32 q3, #0
+ vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1)
+
+ ldr r14, [STKARG(2)] // -> yy
+ vld1.32 {q4}, [r14]
+ vmov.i32 q5, #0
+ vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1)
+
+ ldr r5, [STKARG(3)] // = n
+ ldr r6, [STKARG(4)] // -> cyv
+ .endif
+
+ .ifeqs "\mode", "mont"
+ mov r4, r1 // -> c
+ mov r1, r3 // -> u
+ mov r14, r2
+ mov r2, r3 // -> x
+ mov r3, r14 // -> y
+
+ ldr r14, [STKARG(0)] // -> vv
+ vld1.32 {q2}, [r14]
+ vmov.i32 q3, #0
+ vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1)
+
+ ldr r5, [STKARG(1)] // = n
+ ldr r6, [STKARG(2)] // -> cyv
+ .endif
+.endm
+
+.macro testldcarry
+ vldmia r4, {QQ(q13, q15)} // c0, c1, c2
+.endm
+
+.macro testtop
+0: subs r5, r5, #1
+.endm
+
+.macro testtail
+ bne 0b
+.endm
+
+.macro testcarryout
+ vstmia r4, {QQ(q13, q15)}
+.endm
+
+.macro testepilogue
+ popvfp QQ(q4, q7)
+ popreg r4-r6, r14
+ bx r14
+.endm
+
+FUNC(test_dmul4)
+ testprologue dmul
+ testldcarry
+ testtop
+ bl dmul4
+ testtail
+ testcarryout
+ testepilogue
+ENDFUNC
+
+FUNC(test_dmla4)
+ testprologue dmul
+ testldcarry
+ testtop
+ bl dmla4
+ testtail
+ testcarryout
+ testepilogue
+ENDFUNC
+
+FUNC(test_mul4)
+ testprologue smul
+ testldcarry
+ testtop
+ bl mul4
+ testtail
+ testcarryout
+ testepilogue
+ENDFUNC
+
+FUNC(test_mul4zc)
+ testprologue smul
+ testldcarry
+ testtop
+ bl mul4zc
+ testtail
+ testcarryout
+ testepilogue
+ENDFUNC
+
+FUNC(test_mla4)
+ testprologue smul
+ testldcarry
+ testtop
+ bl mla4
+ testtail
+ testcarryout
+ testepilogue
+ENDFUNC
+
+FUNC(test_mla4zc)
+ testprologue smul
+ testldcarry
+ testtop
+ bl mla4zc
+ testtail
+ testcarryout
+ testepilogue
+ENDFUNC
+
+FUNC(test_mmul4)
+ testprologue mmul
+ testtop
+ bl mmul4
+ testtail
+ vst1.32 {q4, q5}, [r3]
+ testcarryout
+ testepilogue
+ENDFUNC
+
+FUNC(test_mmla4)
+ testprologue mmul
+ testtop
+ bl mmla4
+ testtail
+ vst1.32 {q4, q5}, [r3]
+ testcarryout
+ testepilogue
+ENDFUNC
+
+FUNC(test_mont4)
+ testprologue mont
+ testtop
+ bl mont4
+ testtail
+ vst1.32 {q4, q5}, [r3]
+ testcarryout
+ testepilogue
+ENDFUNC
+
+#endif
+
+///----- That's all, folks --------------------------------------------------
--- /dev/null
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// Large SIMD-based multiplications
+///
+/// (c) 2019 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software: you can redistribute it and/or modify it
+/// under the terms of the GNU Library General Public License as published
+/// by the Free Software Foundation; either version 2 of the License, or
+/// (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful, but
+/// WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+/// Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb. If not, write to the Free Software
+/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+/// USA.
+
+///--------------------------------------------------------------------------
+/// Preliminaries.
+
+#include "config.h"
+#include "asm-common.h"
+
+ .text
+
+///--------------------------------------------------------------------------
+/// Theory.
+///
+/// We define a number of primitive fixed-size multipliers from which we can
+/// construct more general variable-length multipliers.
+///
+/// The basic trick is the same throughout. In an operand-scanning
+/// multiplication, the inner multiplication loop multiplies a multiple-
+/// precision operand by a single-precision factor, and adds the product,
+/// appropriately shifted, to the result.  A `finely integrated operand
+/// scanning' implementation of Montgomery multiplication also adds the
+/// product of a single-precision `Montgomery factor' and the modulus,
+/// calculated in the same pass. The more common `coarsely integrated
+/// operand scanning' alternates main multiplication and Montgomery passes,
+/// which requires additional carry propagation.
+///
+/// In both plain-multiplication and Montgomery stages, then, one of
+/// the factors remains constant throughout the operation, so we can afford
+/// to take a little time to preprocess it. The transformation we perform is
+/// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
+/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
+/// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
+/// pieces are placed into 32-bit cells, and arranged as two 128-bit SIMD
+/// operands, as follows.
+///
+/// Offset 0 4 8 12
+/// 0 v'_0 v''_0 v'_1 v''_1
+/// 16 v'_2 v''_2 v'_3 v''_3
+///
+/// The `umull' and `umlal' instructions can multiply a vector of two 32-bit
+/// values by a 32-bit scalar, giving two 64-bit results; thus, a single
+/// instruction will act on (say) v'_0 and v''_0 to produce two 48-bit
+/// results in 64-bit fields.  The sixteen bits of headroom allows us to add
+/// many products together before we must deal with carrying; it also allows
+/// for some calculations to be performed on the above expanded form.
+///
+/// We maintain three `carry' registers, v28--v30, accumulating intermediate
+/// results; we name them `c0', `c1', and `c2'. Each carry register holds
+/// two 64-bit halves: the register c0, for example, holds c'_0 (low half)
+/// and c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the
+/// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2.
+/// The `umull' or `umlal' instruction acting on a scalar operand and an
+/// operand in the expanded form above produces a result which can be added
+/// directly to the appropriate carry register.
+///
+/// An unusual feature of this code, as compared to the other `mul4'
+/// implementations, is that it makes extensive use of the ARM64
+/// general-purpose registers for carry resolution and output construction.
+/// As a result, a general-purpose register (typically x15) is used as an
+/// extra carry, with the carry value in bits 16--63.
+///
+/// Multiplication is performed in product-scanning order, since ARM
+/// processors commonly implement result forwarding for consecutive multiply-
+/// and-accumulate instructions specifying the same destination.
+/// Experimentally, this runs faster than an operand-scanning arrangement
+/// which attempts to hide instruction latencies.
+///
+/// On 64-bit ARM, we have a vast supply of registers: the expanded
+/// operands are kept in registers. The packed operands are read from memory
+/// into working registers v0 and v1. The following conventional argument
+/// names and locations are used throughout.
+///
+/// Arg Format Location Notes
+///
+/// U packed [x1]
+/// X packed [x2] In Montgomery multiplication, X = N
+/// V expanded v2/v3
+/// Y expanded v4/v5 In Montgomery multiplication, Y = (A + U V) M
+/// M expanded v6/v7 -N^{-1} (mod B^4)
+/// N Modulus, for Montgomery multiplication
+/// A packed [x0] Destination/accumulator
+/// C carry v28--v30
+/// 0 v31 128-bit zero
+///
+/// The calculation is some variant of
+///
+/// A' + C' B^4 <- U V + X Y + A + C
+///
+/// The low-level functions fit into a fairly traditional (finely-integrated)
+/// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y)
+/// (indexed by i).
+///
+/// The variants are as follows.
+///
+/// Function Variant Use i j
+///
+/// mmul4 A = C = 0 Montgomery 0 0
+/// dmul4 A = 0 Montgomery 0 +
+/// mmla4 C = 0 Montgomery + 0
+/// dmla4 exactly as shown Montgomery + +
+///
+/// mul4zc U = V = A = C = 0 Plain 0 0
+/// mul4 U = V = A = 0 Plain 0 +
+/// mla4zc U = V = C = 0 Plain + 0
+/// mla4 U = V = 0 Plain + +
+///
+/// The `mmul4' and `mmla4' functions are also responsible for calculating
+/// the Montgomery reduction factor Y = (A + U V) M used by the rest of the
+/// inner loop.
+
+///--------------------------------------------------------------------------
+/// Macro definitions.
+
+.macro mulacc z, u, v, x=nil, y=nil
+ // Set Z = Z + U V + X Y, using the low halves of V and Y. Y may be
+ // `nil' to omit the second operand. Z, V, and Y should be 128-bit
+ // `vN' registers; and U and X should be 32-bit `vN.s[I]' scalars;
+ // the multiplications produce two 64-bit elementwise products, which
+ // are added elementwise to Z.
+
+ umlal \z\().2d, \v\().2s, \u
+ .ifnes "\y", "nil"
+ umlal \z\().2d, \y\().2s, \x
+ .endif
+.endm
+
+.macro mulacc2 z, u, v, x=nil, y=nil
+ // Set Z = Z + U V + X Y, using the high halves of V and Y; see
+ // `mulacc'.
+
+ umlal2 \z\().2d, \v\().4s, \u
+ .ifnes "\y", "nil"
+ umlal2 \z\().2d, \y\().4s, \x
+ .endif
+.endm
+
+.macro mulinit z, zinitp, u, v=nil, x=nil, y=nil
+ // If ZINITP then set Z = Z + U V + X Y, as for `mulacc'; otherwise,
+ // set Z = U V + X Y. Operand requirements and detailed operation
+ // are as for `mulacc'.
+
+ .ifeqs "\zinitp", "t"
+ mulacc \z, \u, \v, \x, \y
+ .else
+ umull \z\().2d, \v\().2s, \u
+ .ifnes "\y", "nil"
+ umlal \z\().2d, \y\().2s, \x
+ .endif
+ .endif
+.endm
+
+.macro mulini2 z, zinitp, u, v=nil, x=nil, y=nil
+ // As `mulinit', but with the high halves of V and Y.
+
+ .ifeqs "\zinitp", "t"
+ mulacc2 \z, \u, \v, \x, \y
+ .else
+ umull2 \z\().2d, \v\().4s, \u
+ .ifnes "\y", "nil"
+ umlal2 \z\().2d, \y\().4s, \x
+ .endif
+ .endif
+.endm
+
+// `mulI': accumulate the B^I and b B^I terms of the polynomial product sum U
+// V + X Y, given that U = u_0 + B u_1 + B^2 u_2 + B^3 u_3 (and similarly for
+// x), and V = v'_0 + b v''_0 + B (v'_1 + b v''_1) + B^2 (v'_2 + b v''_2) +
+// B^3 (v'_3 + b v''_3) (and similarly for Y). The 64-bit coefficients are
+// added into the low and high halves of the 128-bit register Z (if ZINITP
+// is `nil' then simply set Z, as if it were initially zero).
+.macro mul0 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
+ mulinit \z, \zinitp, \u\().s[0], \v0, \x\().s[0], \y0
+.endm
+.macro mul1 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
+ mulini2 \z, \zinitp, \u\().s[0], \v0, \x\().s[0], \y0
+ mulacc \z, \u\().s[1], \v0, \x\().s[1], \y0
+.endm
+.macro mul2 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
+ mulinit \z, \zinitp, \u\().s[0], \v1, \x\().s[0], \y1
+ mulacc2 \z, \u\().s[1], \v0, \x\().s[1], \y0
+ mulacc \z, \u\().s[2], \v0, \x\().s[2], \y0
+.endm
+.macro mul3 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
+ mulini2 \z, \zinitp, \u\().s[0], \v1, \x\().s[0], \y1
+ mulacc \z, \u\().s[1], \v1, \x\().s[1], \y1
+ mulacc2 \z, \u\().s[2], \v0, \x\().s[2], \y0
+ mulacc \z, \u\().s[3], \v0, \x\().s[3], \y0
+.endm
+.macro mul4 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
+ mulini2 \z, \zinitp, \u\().s[1], \v1, \x\().s[1], \y1
+ mulacc \z, \u\().s[2], \v1, \x\().s[2], \y1
+ mulacc2 \z, \u\().s[3], \v0, \x\().s[3], \y0
+.endm
+.macro mul5 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
+ mulini2 \z, \zinitp, \u\().s[2], \v1, \x\().s[2], \y1
+ mulacc \z, \u\().s[3], \v1, \x\().s[3], \y1
+.endm
+.macro mul6 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
+ mulini2 \z, \zinitp, \u\().s[3], \v1, \x\().s[3], \y1
+.endm
+
+// Steps in the process of propagating carry bits upwards from ZLO (a 128-bit
+// `vN' register). Here, T0, T1, and CG are 64-bit `xN' general-purpose
+// registers clobbered in the process. Set the low 32 bits of the 64-bit
+// `xN' general-purpose register ZOUT to the completed coefficient z_i,
+// leaving a carry in CG.
+//
+// In detail, what happens is as follows.  Suppose initially that ZLO =
+// (z'_i; z''_i).  Let t = z'_i + b z''_i; observe that floor(t/b) =
+// floor(z'_i/b) + z''_i.  Let z_i = t mod B, and carry floor(t/B) =
+// floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1} via CG.  This has a
+// circuit depth of 3; I don't know how to do better.
+//
+// Output words are left in the low half of a 64-bit register, with rubbish
+// in the high half. Two such results can be combined using the `bfi'
+// instruction.
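+//
+// A hypothetical C sketch of one `carry0'/`carry1' round (for exposition
+// only), with T0 = z'_i, T1 = z''_i, and the running carry in CG:
+//
+//	t0 += cg >> 16;			/* carry in: floor(c/B) */
+//	uint32_t z = (uint32_t)(t0 + (t1 << 16));  /* output word z_i */
+//	cg = t1 + (t0 >> 16);		/* carry out: floor(t/b) */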
+.macro carry0 zlo, cg=x15, t0=x16, t1=x17
+ // Capture the values of carry-register ZLO and CG (if not `nil') in
+ // general-purpose registers T0 and T1, suitable for use in `carry1'.
+ mov \t0, \zlo\().d[0]
+ mov \t1, \zlo\().d[1]
+ .ifnes "\cg", "nil"
+ add \t0, \t0, \cg, lsr #16
+ .endif
+.endm
+.macro carry1 zout, cg=x15, t0=x16, t1=x17
+ // Collect a 32-bit output word in the low 32 bits of ZOUT (leaving
+ // rubbish in the high 32 bits), and update CG suitably to continue
+ // processing with the next carry register.
+ .ifnes "\zout", "nil"
+ add \zout, \t0, \t1, lsl #16
+ .endif
+ .ifnes "\cg", "nil"
+ add \cg, \t1, \t0, lsr #16
+ .endif
+.endm
+
+.macro expand vlo, vhi, vz=v31
+ // Expand the packed 128-bit operand in VLO to an expanded operand in
+ // VLO and VHI, assuming that VZ is all-bits-zero. All three are
+ // `vN' 128-bit SIMD registers.
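+	// For example, if VLO = (y_0, y_1, y_2, y_3) as packed 32-bit
+	// words, then afterwards VLO = (y'_0, y''_0, y'_1, y''_1) and
+	// VHI = (y'_2, y''_2, y'_3, y''_3), each 16-bit piece in its own
+	// 32-bit lane.  (VHI must be computed first, because the `zip1'
+	// overwrites VLO.)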
+ zip2 \vhi\().8h, \vlo\().8h, \vz\().8h
+ zip1 \vlo\().8h, \vlo\().8h, \vz\().8h
+.endm
+
+.macro sprdacc a0, a1, a2, a3=nil
+ // Spread the packed 128-bit operand in A0 into carry-format values
+ // in A0, A1, A2, A3. If A3 is `nil', then spread the same value
+ // into A0, A1, A2 only, clobbering x16.
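+	// For example, if A0 = (a_0, a_1, a_2, a_3) as packed 32-bit
+	// words, then the four-register form gives A0 = (a_0; 0), A1 =
+	// (a_1; 0), A2 = (a_2; 0), A3 = (a_3; 0); the three-register form
+	// instead leaves A2 = (a_2; a_3 b), representing a_2 + a_3 B.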
+ .ifeqs "\a3", "nil"
+ mov w16, \a0\().s[3]
+ .endif
+ trn2 \a2\().2d, \a0\().2d, v31.2d
+ trn2 \a1\().2s, \a0\().2s, v31.2s
+ .ifeqs "\a3", "nil"
+ lsl x16, x16, #16
+ .endif
+ trn1 \a0\().2s, \a0\().2s, v31.2s
+ .ifeqs "\a3", "nil"
+ mov \a2\().d[1], x16
+ .else
+ trn2 \a3\().2s, \a2\().2s, v31.2s
+ .endif
+ mov \a2\().s[1], wzr
+.endm
+
+.macro crryacc a0, a1, a2, a3, c0, c1, c2
+ // Add the carry-format values A0, A1, A2 into the existing carries
+ // C0, C1, C2 (leaving A3 where it is).
+ add \c0\().2d, \c0\().2d, \a0\().2d
+ add \c1\().2d, \c1\().2d, \a1\().2d
+ add \c2\().2d, \c2\().2d, \a2\().2d
+.endm
+
+///--------------------------------------------------------------------------
+/// Primitive multipliers and related utilities.
+
+INTFUNC(carryprop)
+ // On entry, x0 points to a destination, and v28--v30 and x15 hold
+ // incoming carries c0--c2 and cg. On exit, the low 128 bits of the
+ // carry value are stored at [x0]; the remaining 16 bits of carry are
+ // left in x10; x0 is advanced by 16; and x11--x17 are clobbered.
+ endprologue
+
+ carry0 v28
+ carry1 x11
+ carry0 v29
+ carry1 x13
+ carry0 v30
+ carry1 x12
+ bfi x11, x13, #32, #32
+ lsr x14, x15, #16
+ lsr x10, x15, #48
+ bfi x12, x14, #32, #32
+ stp x11, x12, [x0], #16
+ ret
+ENDFUNC
+
+INTFUNC(dmul4)
+ // On entry, x0 points to the destination; x1 and x2 point to packed
+ // operands U and X; v2/v3 and v4/v5 hold expanded operands V and Y;
+ // v28--v30 and x15 hold incoming carries c0--c2 and cg; and v31 is
+ // zero. On exit, the destination and carries are updated; x0, x1,
+ // x2 are each advanced by 16; v2--v5 and v8--v15 are preserved; and
+ // x11--x14, x16, x17 and the other SIMD registers are clobbered.
+ endprologue
+
+ // Start by loading the operand words from memory.
+ ldr q0, [x1], #16
+ ldr q1, [x2], #16
+
+ // Do the multiplication.
+ mul0 v28, t, v0, v2, v3, v1, v4, v5
+ mul1 v29, t, v0, v2, v3, v1, v4, v5
+ carry0 v28
+ mul2 v30, t, v0, v2, v3, v1, v4, v5
+ carry1 x11
+ carry0 v29
+ mul3 v27, nil, v0, v2, v3, v1, v4, v5
+ carry1 x13
+ carry0 v30
+ mul4 v28, nil, v0, v2, v3, v1, v4, v5
+ carry1 x12
+ carry0 v27
+ mul5 v29, nil, v0, v2, v3, v1, v4, v5
+ carry1 x14
+ mul6 v30, nil, v0, v2, v3, v1, v4, v5
+
+ // Finish up and store the result.
+ bfi x11, x13, #32, #32
+ bfi x12, x14, #32, #32
+ stp x11, x12, [x0], #16
+
+ // All done.
+ ret
+ENDFUNC
+
+INTFUNC(dmla4)
+ // On entry, x0 points to the destination/accumulator A; x1 and x2
+ // point to packed operands U and X; v2/v3 and v4/v5 hold expanded
+ // operands V and Y; v28--v30 and x15 hold incoming carries c0--c2
+ // and cg; and v31 is zero. On exit, the accumulator and carries are
+ // updated; x0, x1, x2 are each advanced by 16; v2--v5 and v8--v15
+ // are preserved; and x11--x14, x16, x17 and the other SIMD registers
+ // are clobbered.
+ endprologue
+
+ // Start by loading the operand words from memory.
+ ldr q24, [x0]
+ ldr q0, [x1], #16
+ ldr q1, [x2], #16
+ sprdacc v24, v25, v26, v27
+ crryacc v24, v25, v26, v27, v28, v29, v30
+
+ // Do the multiplication.
+ mul0 v28, t, v0, v2, v3, v1, v4, v5
+ mul1 v29, t, v0, v2, v3, v1, v4, v5
+ carry0 v28
+ mul2 v30, t, v0, v2, v3, v1, v4, v5
+ carry1 x11
+ carry0 v29
+ mul3 v27, t, v0, v2, v3, v1, v4, v5
+ carry1 x13
+ carry0 v30
+ mul4 v28, nil, v0, v2, v3, v1, v4, v5
+ carry1 x12
+ carry0 v27
+ mul5 v29, nil, v0, v2, v3, v1, v4, v5
+ carry1 x14
+ mul6 v30, nil, v0, v2, v3, v1, v4, v5
+
+ // Finish up and store the result.
+ bfi x11, x13, #32, #32
+ bfi x12, x14, #32, #32
+ stp x11, x12, [x0], #16
+
+ // All done.
+ ret
+ENDFUNC
+
+INTFUNC(mul4)
+ // On entry, x0 points to the destination; x2 points to a packed
+	// operand X; v4/v5 holds an expanded operand Y; v28--v30 and x15
+ // hold incoming carries c0--c2 and cg; and v31 is zero. On exit,
+ // the destination and carries are updated; x0 and x2 are each
+ // advanced by 16; v4 and v5 and v8--v15 are preserved; and x11--x14,
+ // x16, x17 and the other SIMD registers are clobbered.
+ endprologue
+
+ // Start by loading the operand words from memory.
+ ldr q1, [x2], #16
+
+ // Do the multiplication.
+ mul0 v28, t, v1, v4, v5
+ mul1 v29, t, v1, v4, v5
+ carry0 v28
+ mul2 v30, t, v1, v4, v5
+ carry1 x11
+ carry0 v29
+ mul3 v27, nil, v1, v4, v5
+ carry1 x13
+ carry0 v30
+ mul4 v28, nil, v1, v4, v5
+ carry1 x12
+ carry0 v27
+ mul5 v29, nil, v1, v4, v5
+ carry1 x14
+ mul6 v30, nil, v1, v4, v5
+
+ // Finish up and store the result.
+ bfi x11, x13, #32, #32
+ bfi x12, x14, #32, #32
+ stp x11, x12, [x0], #16
+
+ // All done.
+ ret
+ENDFUNC
+
+INTFUNC(mul4zc)
+ // On entry, x0 points to the destination; x2 points to a packed
+ // operand X; v4/v5 holds an expanded operand Y; and v31 is zero. On
+ // exit, the destination is updated; v28--v30 and x15 hold outgoing
+ // carries c0--c2 and cg; x0 and x2 are each advanced by 16; v4 and
+ // v5 and v8--v15 are preserved; and x11--x14, x16, x17 and the other
+ // SIMD registers are clobbered.
+ endprologue
+
+ // Start by loading the operand words from memory.
+ ldr q1, [x2], #16
+
+ // Do the multiplication.
+ mul0 v28, nil, v1, v4, v5
+ mul1 v29, nil, v1, v4, v5
+ carry0 v28, nil
+ mul2 v30, nil, v1, v4, v5
+ carry1 x11
+ carry0 v29
+ mul3 v27, nil, v1, v4, v5
+ carry1 x13
+ carry0 v30
+ mul4 v28, nil, v1, v4, v5
+ carry1 x12
+ carry0 v27
+ mul5 v29, nil, v1, v4, v5
+ carry1 x14
+ mul6 v30, nil, v1, v4, v5
+
+ // Finish up and store the result.
+ bfi x11, x13, #32, #32
+ bfi x12, x14, #32, #32
+ stp x11, x12, [x0], #16
+
+ // All done.
+ ret
+ENDFUNC
+
+INTFUNC(mla4)
+ // On entry, x0 points to the destination/accumulator A; x2 points to
+	// a packed operand X; v4/v5 holds an expanded operand Y; v28--v30
+ // and x15 hold incoming carries c0--c2 and cg; and v31 is zero. On
+ // exit, the accumulator and carries are updated; x0 and x2 are each
+ // advanced by 16; v4 and v5 and v8--v15 are preserved; and x11--x14,
+ // x16, x17 and the other SIMD registers are clobbered.
+ endprologue
+
+ // Start by loading the operand words from memory.
+ ldr q24, [x0]
+ ldr q1, [x2], #16
+ sprdacc v24, v25, v26, v27
+ crryacc v24, v25, v26, v27, v28, v29, v30
+
+ // Do the multiplication.
+ mul0 v28, t, v1, v4, v5
+ mul1 v29, t, v1, v4, v5
+ carry0 v28
+ mul2 v30, t, v1, v4, v5
+ carry1 x11
+ carry0 v29
+ mul3 v27, t, v1, v4, v5
+ carry1 x13
+ carry0 v30
+ mul4 v28, nil, v1, v4, v5
+ carry1 x12
+ carry0 v27
+ mul5 v29, nil, v1, v4, v5
+ carry1 x14
+ mul6 v30, nil, v1, v4, v5
+
+ // Finish up and store the result.
+ bfi x11, x13, #32, #32
+ bfi x12, x14, #32, #32
+ stp x11, x12, [x0], #16
+
+ // All done.
+ ret
+ENDFUNC
+
+INTFUNC(mla4zc)
+ // On entry, x0 points to the destination/accumulator A; x2 points to
+ // a packed operand X; v4/v5 holds an expanded operand Y; and v31 is
+ // zero. On exit, the accumulator is updated; v28--v30 and x15 hold
+ // outgoing carries c0--c2 and cg; x0 and x2 are each advanced by 16;
+ // v4, v5, and v8--v15 are preserved; and x11--x14, x16, x17 and the
+ // other SIMD registers are clobbered.
+ endprologue
+
+ // Start by loading the operand words from memory.
+ ldr q28, [x0]
+ ldr q1, [x2], #16
+ sprdacc v28, v29, v30, v27
+
+ // Do the multiplication.
+ mul0 v28, t, v1, v4, v5
+ mul1 v29, t, v1, v4, v5
+ carry0 v28, nil
+ mul2 v30, t, v1, v4, v5
+ carry1 x11
+ carry0 v29
+ mul3 v27, t, v1, v4, v5
+ carry1 x13
+ carry0 v30
+ mul4 v28, nil, v1, v4, v5
+ carry1 x12
+ carry0 v27
+ mul5 v29, nil, v1, v4, v5
+ carry1 x14
+ mul6 v30, nil, v1, v4, v5
+
+ // Finish up and store the result.
+ bfi x11, x13, #32, #32
+ bfi x12, x14, #32, #32
+ stp x11, x12, [x0], #16
+
+ // All done.
+ ret
+ENDFUNC
+
+INTFUNC(mmul4)
+ // On entry, x0 points to the destination; x1 points to a packed
+ // operand U; x2 points to a packed operand X (the modulus); v2/v3
+ // holds an expanded operand V; and v6/v7 holds an expanded operand M
+ // (the Montgomery factor -N^{-1} (mod B)). On exit, the destination
+ // is updated (to zero); v4/v5 hold an expanded factor Y = U V M (mod
+ // B); v28--v30 and x15 hold outgoing carries c0--c2 and cg; x0, x1,
+ // and x2 are each advanced by 16; v2, v3, and v8--v15 are preserved;
+ // and x11--x14, x16, x17 and the other SIMD registers are clobbered.
+ endprologue
+
+ // Start by loading the operand words from memory.
+ ldr q0, [x1], #16
+ ldr q1, [x2], #16
+
+ // Calculate the low half of W = A + U V, being careful to leave the
+ // carries in place.
+ mul0 v28, nil, v0, v2, v3
+ mul1 v29, nil, v0, v2, v3
+ carry0 v28, nil
+ mul2 v30, nil, v0, v2, v3
+ carry1 x11
+ carry0 v29
+ mul3 v27, nil, v0, v2, v3
+ b mmla4_common
+ENDFUNC
+
+INTFUNC(mmla4)
+ // On entry, x0 points to the destination/accumulator A; x1 points to
+ // a packed operand U; x2 points to a packed operand X (the modulus);
+ // v2/v3 holds an expanded operand V; and v6/v7 holds an expanded
+ // operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the
+ // accumulator is updated (to zero); v4/v5 hold an expanded factor Y
+ // = (A + U V) M (mod B); v28--v30 and x15 hold outgoing carries
+ // c0--c2 and cg; x0, x1, and x2 are each advanced by 16; v2, v3, v6,
+ // v7, and v8--v15 are preserved; and x11--x14, x16, x17 and the
+ // other SIMD registers are clobbered.
+ endprologue
+
+ // Start by loading the operand words from memory.
+ ldr q28, [x0]
+ ldr q0, [x1], #16
+ ldr q1, [x2], #16
+ sprdacc v28, v29, v30, v27
+
+ // Calculate the low half of W = A + U V, being careful to leave the
+ // carries in place.
+ mul0 v28, t, v0, v2, v3
+ mul1 v29, t, v0, v2, v3
+ carry0 v28, nil
+ mul2 v30, t, v0, v2, v3
+ carry1 x11
+ carry0 v29
+ mul3 v27, t, v0, v2, v3
+mmla4_common:
+ carry1 x13
+ carry0 v30
+ carry1 x12
+ carry0 v27
+ carry1 x14, nil
+
+ // Piece the result together and ship it back.
+ bfi x11, x13, #32, #32
+ bfi x12, x14, #32, #32
+ mov v16.d[0], x11
+ mov v16.d[1], x12
+
+ // Calculate the low half of the Montgomery factor Y = W M.
+ mul0 v18, nil, v16, v6, v7
+ mul1 v19, nil, v16, v6, v7
+ carry0 v18, nil
+ mul2 v20, nil, v16, v6, v7
+ carry1 x11
+ carry0 v19
+ mul3 v21, nil, v16, v6, v7
+ carry1 x13
+ carry0 v20
+ carry1 x12
+ carry0 v21
+ carry1 x14, nil
+
+ // Piece the result together, ship it back, and expand.
+ bfi x11, x13, #32, #32
+ bfi x12, x14, #32, #32
+ mov v4.d[0], x11
+ mov v4.d[1], x12
+ expand v4, v5
+
+ // Build up the product X Y in the carry slots.
+ mul0 v28, t, v1, v4, v5
+ mul1 v29, t, v1, v4, v5
+ carry0 v28, nil
+ mul2 v30, t, v1, v4, v5
+ carry1 nil
+ carry0 v29
+ mul3 v27, t, v1, v4, v5
+ carry1 nil
+ carry0 v30
+
+ // And complete the calculation.
+ mul4 v28, nil, v0, v2, v3, v1, v4, v5
+ carry1 nil
+ carry0 v27
+ mul5 v29, nil, v0, v2, v3, v1, v4, v5
+ carry1 nil
+ mul6 v30, nil, v0, v2, v3, v1, v4, v5
+
+ // Finish up and store the result.
+ stp xzr, xzr, [x0], #16
+
+ // All done.
+ ret
+ENDFUNC
+
+INTFUNC(mont4)
+ // On entry, x0 points to the destination/accumulator A; x2 points to
+ // a packed operand X (the modulus); and v6/v7 holds an expanded
+ // operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the
+ // accumulator is updated (to zero); v4/v5 hold an expanded factor Y
+ // = A M (mod B); v28--v30 and x15 hold outgoing carries c0--c2 and
+ // cg; x0 and x2 are each advanced by 16; v6, v7, and v8--v15 are
+ // preserved; and x11--x14, x16, x17 and the other SIMD registers are
+ // clobbered.
+ endprologue
+
+ // Start by loading the operand words from memory.
+ ldr q28, [x0]
+ ldr q1, [x2], #16
+
+ // Calculate Y = A M (mod B).
+ mul0 v18, nil, v28, v6, v7
+ mul1 v19, nil, v28, v6, v7
+ carry0 v18, nil
+ mul2 v20, nil, v28, v6, v7
+ carry1 x11
+ carry0 v19
+ mul3 v21, nil, v28, v6, v7
+ carry1 x13
+ carry0 v20
+ sprdacc v28, v29, v30, v27
+ carry1 x12
+ carry0 v21
+ carry1 x14, nil
+
+ // Piece the result together, ship it back, and expand.
+ bfi x11, x13, #32, #32
+ bfi x12, x14, #32, #32
+ mov v4.d[0], x11
+ mov v4.d[1], x12
+ expand v4, v5
+
+ // Calculate the actual result. Well, the carries, at least.
+ mul0 v28, t, v1, v4, v5
+ mul1 v29, t, v1, v4, v5
+ carry0 v28, nil
+ mul2 v30, t, v1, v4, v5
+ carry1 nil
+ carry0 v29
+ mul3 v27, t, v1, v4, v5
+ carry1 nil
+ carry0 v30
+
+ // And complete the calculation.
+ mul4 v28, nil, v1, v4, v5
+ carry1 nil
+ carry0 v27
+ mul5 v29, nil, v1, v4, v5
+ carry1 nil
+ mul6 v30, nil, v1, v4, v5
+
+ // Finish up and store the result.
+ stp xzr, xzr, [x0], #16
+
+ // All done.
+ ret
+ENDFUNC
+
+///--------------------------------------------------------------------------
+/// Bulk multipliers.
+
+FUNC(mpx_umul4_arm64_simd)
+ // void mpx_umul4_arm64_simd(mpw *dv, const mpw *av, const mpw *avl,
+ // const mpw *bv, const mpw *bvl);
+
+ // Establish the arguments and do initial setup.
+ //
+ // inner loop dv      x0
+ // inner loop av      x2
+ // outer loop dv      x5
+ // outer loop bv      x3
+ // av base            x1
+ // inner n            x6
+ // n base             x7
+ // outer n            x4
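+ //
+ // Loosely, in C-ish pseudocode, with the subroutine names below
+ // standing in for functions and the byte-count bookkeeping elided,
+ // this is plain operand scanning on blocks of four words:
+ //
+ //	for (i = 0; i < len(bv); i += 4) {
+ //		y = expand(bv[i .. i+3]);
+ //		c = mul4zc(dv + i, av, y);	// mla4zc on later passes
+ //		for (j = 4; j < len(av); j += 4)
+ //			c = mul4(dv + i + j, av + j, y, c);	// mla4 later
+ //		carryprop(dv + i + len(av), c);	// no tail to add here
+ //	}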
+ pushreg x29, x30
+ setfp
+ endprologue
+
+ // Prepare for the first iteration.
+ ldr q4, [x3], #16 // Y = bv[0]
+ movi v31.4s, #0
+ sub x7, x2, x1 // = inner loop count base
+ // x0 // = dv for inner loop
+ // x1 // = av base
+ // x3 // = bv for outer loop
+ sub x4, x4, x3 // = outer loop count (decremented)
+ sub x6, x7, #16 // = inner loop count (decremented)
+ mov x2, x1 // = av for inner loop
+ add x5, x0, #16 // = dv for outer loop
+ expand v4, v5 // expand Y
+ bl mul4zc
+ cbz x6, 8f // all done?
+
+ // Continue with the first iteration.
+0: sub x6, x6, #16
+ bl mul4
+ cbnz x6, 0b
+
+ // Write out the leftover carry. There can be no tail here.
+8: bl carryprop
+ cbz x4, 9f
+
+ // Set up for the next pass.
+1: ldr q4, [x3], #16 // Y = bv[i]
+ mov x0, x5 // -> dv[i]
+ mov x2, x1 // -> av[0]
+ add x5, x5, #16
+ sub x6, x7, #16 // = inner loop count (decremented)
+ sub x4, x4, #16 // outer loop count
+ expand v4, v5 // expand Y
+ bl mla4zc
+ cbz x6, 8f
+
+ // Continue...
+0: sub x6, x6, #16
+ bl mla4
+ cbnz x6, 0b
+
+ // Finish off this pass. There was no tail on the previous pass, and
+ // there can be none on this pass.
+8: bl carryprop
+ cbnz x4, 1b
+
+ // All over.
+9: popreg x29, x30
+ ret
+ENDFUNC
+
+FUNC(mpxmont_mul4_arm64_simd)
+ // void mpxmont_mul4_arm64_simd(mpw *dv,
+ // const mpw *av, const mpw *bv,
+ // const mpw *nv, size_t n,
+ // const mpw *mi);
+
+ // Establish the arguments and do initial setup.
+ //
+ // inner loop dv      x0
+ // inner loop av      x1
+ // inner loop nv      x2
+ // nv base            x3
+ // base n             x4
+ // mi                 (x5)
+ // outer loop dv      x5
+ // outer loop bv      x6
+ // av base            x7
+ // inner n            x8
+ // outer n            x9
+ // c                  x10
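+ //
+ // Loosely, in C-ish pseudocode, with the subroutine names below
+ // standing in for functions and the bookkeeping elided: each pass
+ // handles four words of bv, and the tail word c0 left over from
+ // `carryprop' is folded into the next pass's final carries.
+ //
+ //	c0 = 0;
+ //	for (i = 0; i < n; i += 4) {
+ //		v = expand(bv[i .. i+3]);
+ //		y, c = mmul4(dv + i, av, nv, v, m);	// mmla4 later
+ //		for (j = 4; j < n; j += 4)
+ //			c = dmul4(dv + i + j, av + j, nv + j, v, y, c);
+ //						// dmla4 on later passes
+ //		c0 = carryprop(dv + i + n, c + c0);	// kept in x10
+ //	}
+ //	// finally store c0 as the last output word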
+ pushreg x29, x30
+ setfp
+ endprologue
+
+ // Set up the outer loop state and prepare for the first iteration.
+ ldr q2, [x2] // = V = bv[0]
+ ldr q6, [x5] // = M
+ movi v31.4s, #0
+ // x0 // -> dv for inner loop
+ // x1 // -> av for inner loop
+ // x3 // -> nv base
+ // x4 // = n base
+ add x5, x0, #16 // -> dv
+ add x6, x2, #16 // -> bv
+ mov x2, x3 // -> nv[0]
+ mov x7, x1 // -> av base
+ sub x8, x4, #4 // = inner n (decremented)
+ sub x9, x4, #4 // = outer n (decremented)
+ expand v2, v3 // expand V
+ expand v6, v7 // expand M
+ bl mmul4
+ cbz x8, 8f // done already?
+
+ // Complete the first inner loop.
+0: sub x8, x8, #4
+ bl dmul4
+ cbnz x8, 0b // done yet?
+
+ // Still have carries left to propagate. Rather than store the tail
+ // end in memory, keep it in x10 for later.
+ bl carryprop
+
+ // Embark on the next iteration. (There must be one. If n = 1 then
+ // we would have bailed above, to label 8. Similarly, the subsequent
+ // iterations can fall into the inner loop immediately.)
+1: ldr q2, [x6], #16 // = V = bv[i]
+ mov x0, x5 // -> dv[i]
+ mov x1, x7 // -> av[0]
+ mov x2, x3 // -> nv[0]
+ add x5, x5, #16
+ sub x8, x4, #4
+ sub x9, x9, #4
+ expand v2, v3
+ bl mmla4
+
+ // Complete the next inner loop.
+0: sub x8, x8, #4
+ bl dmla4
+ cbnz x8, 0b
+
+ // Still have carries left to propagate; they overlap the previous
+ // iteration's final tail, which we kept in x10, so mix that in
+ // before propagating.
+ add x15, x15, x10, lsl #16
+ bl carryprop
+
+ // Back again, maybe.
+ cbnz x9, 1b
+
+ // All done, almost.
+ str w10, [x0], #4
+ popreg x29, x30
+ ret
+
+ // First iteration was short. Write out the carries and we're done.
+ // (This could be folded into the main loop structure, but that would
+ // penalize small numbers more.)
+8: bl carryprop
+ str w10, [x0], #4
+ popreg x29, x30
+ ret
+ENDFUNC
+
+FUNC(mpxmont_redc4_arm64_simd)
+ // void mpxmont_redc4_arm64_simd(mpw *dv, mpw *dvl, const mpw *nv,
+ // size_t n, const mpw *mi);
+
+ // Establish the arguments and do initial setup.
+ //
+ // inner loop dv      x0
+ // inner loop nv      x2
+ // blocks-of-4 count  x6
+ // tail count         x7
+ // mi                 (x4)
+ // outer loop dv      x4
+ // outer loop count   x8
+ // nv base            x5
+ // inner loop count   x1
+ // n                  x3
+ // c                  x10
+ // t0, t1             x11, x12
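+ //
+ // Loosely, in C-ish pseudocode, with the subroutine names below
+ // standing in for functions and the bookkeeping elided:
+ //
+ //	for (i = 0; i < n; i += 4) {
+ //		y, c = mont4(dv + i, nv, m);
+ //		for (j = 4; j < n; j += 4)
+ //			c = mla4(dv + i + j, nv + j, y, c);
+ //		carryprop(dv + i + n, c);	// folding in the next
+ //						// accumulator block first
+ //		ripple the carry-out along dv as far as dvl;
+ //						// plain `adds'/`adcs' chain
+ //	}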
+
+ pushreg x29, x30
+ setfp
+ endprologue
+
+ // Set up the outer loop state and prepare for the first iteration.
+ ldr q6, [x4] // = M
+ movi v31.4s, #0
+ // x0 // -> dv for inner loop
+ sub x6, x1, x0 // total dv bytes
+ sub x1, x3, #4 // inner loop counter
+ // x2 // -> nv for inner loop
+ // x3 // = n
+ add x4, x0, #16 // -> dv for outer loop
+ mov x5, x2 // -> nv base
+ sub x6, x6, x3, lsl #2 // dv carry range bytes
+ sub x8, x3, #4 // outer loop counter
+ sub x6, x6, #16 // dv steam-powered carry bytes
+ expand v6, v7 // expand M
+ and x7, x6, #15 // dv tail length in bytes
+ bic x6, x6, #15 // dv blocks-of-four length in bytes
+
+ bl mont4
+ cbz x1, 8f // done already?
+
+5: sub x1, x1, #4
+ bl mla4
+ cbnz x1, 5b // done yet?
+
+ // Still have carries left to propagate. Adding the accumulator
+ // block into the carries is a little different this time, because
+ // all four accumulator limbs have to be squished into the three
+ // carry registers for `carryprop' to do its thing.
+8: ldr q24, [x0]
+ sprdacc v24, v25, v26
+ add v28.2d, v28.2d, v24.2d
+ add v29.2d, v29.2d, v25.2d
+ add v30.2d, v30.2d, v26.2d
+ bl carryprop
+ cbz x6, 7f
+
+ // Propagate the first group of carries.
+ ldp x16, x17, [x0]
+ sub x1, x6, #16
+ adds x16, x16, x10
+ adcs x17, x17, xzr
+ stp x16, x17, [x0], #16
+ cbz x1, 6f
+
+ // Continue carry propagation until the end of the buffer.
+0: ldp x16, x17, [x0]
+ sub x1, x1, #16
+ adcs x16, x16, xzr
+ adcs x17, x17, xzr
+ stp x16, x17, [x0], #16
+ cbnz x1, 0b
+
+ // Deal with the tail end. Note that the actual destination length
+ // won't be an exact number of blocks of four, so it's safe to just
+ // drop through here.
+6: adc w10, wzr, wzr
+7: ldr w16, [x0]
+ sub x1, x7, #4
+ adds w16, w16, w10
+ str w16, [x0], #4
+ cbz x1, 8f
+0: ldr w16, [x0]
+ sub x1, x1, #4
+ adcs w16, w16, wzr
+ str w16, [x0], #4
+ cbnz x1, 0b
+
+ // All done for this iteration. Start the next.
+8: cbz x8, 9f
+ mov x0, x4
+ add x4, x4, #16
+ sub x1, x3, #4
+ mov x2, x5
+ sub x8, x8, #4
+ sub x6, x6, #16
+ bl mont4
+ b 5b
+
+ // All over.
+9: popreg x29, x30
+ ret
+ENDFUNC
+
+///--------------------------------------------------------------------------
+/// Testing and performance measurement.
+
+#ifdef TEST_MUL4
+
+//                             dmul    smul    mmul    mont
+// z    x0                     x0      x0      x0      x0
+// c    x3                     x1      x1      x1      x1
+// y    x4                     --      --      x2      x2
+// u    x1                     x2      --      x3      --
+// x    x2                     x3      x2      x4      x3
+// vv   v2/v3                  x4      --      x5      --
+// yy   v4/v5                  x5      x3      x6      --
+// mm   v6/v7                  --      --      --      x4
+// n    x5                     x6      x4      x7      x5
+// cyv  x6                     x7      x5      stk0    x6
+
+#define STKARG(i) sp, #16 + i
+
+.macro testprologue mode
+ pushreg x29, x30
+ setfp
+ endprologue
+ movi v31.4s, #0
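+
+ // The `zip1'/`zip2' pairs below interleave an operand's sixteen-bit
+ // pieces with zeros from v31, leaving each piece in its own 32-bit
+ // cell: the same expansion that the `expand' macro performs in the
+ // code proper.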
+
+ .ifeqs "\mode", "dmul"
+ ldr q2, [x4]
+ zip2 v3.8h, v2.8h, v31.8h // (v'_2, v''_2; v'_3, v''_3)
+ zip1 v2.8h, v2.8h, v31.8h // (v'_0, v''_0; v'_1, v''_1)
+
+ ldr q4, [x5]
+ zip2 v5.8h, v4.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3)
+ zip1 v4.8h, v4.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1)
+
+ mov x16, x1
+ mov x1, x2 // -> u
+ mov x2, x3 // -> x
+ mov x3, x16 // -> c
+
+ mov x5, x6 // = n
+ mov x6, x7 // -> cyv
+ .endif
+
+ .ifeqs "\mode", "smul"
+ ldr q4, [x3]
+ zip2 v5.8h, v4.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3)
+ zip1 v4.8h, v4.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1)
+
+ // x2 // -> x
+ mov x3, x1 // -> c
+ mov x6, x5 // -> cyv
+ mov x5, x4 // = n
+ .endif
+
+ .ifeqs "\mode", "mmul"
+ ldr q2, [x5]
+ zip2 v3.8h, v2.8h, v31.8h // (v'_2, v''_2; v'_3, v''_3)
+ zip1 v2.8h, v2.8h, v31.8h // (v'_0, v''_0; v'_1, v''_1)
+
+ ldr q6, [x6]
+ zip2 v7.8h, v6.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3)
+ zip1 v6.8h, v6.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1)
+
+ mov x16, x1
+ mov x1, x3 // -> u
+ mov x3, x16 // -> c
+
+ mov x16, x2
+ mov x2, x4 // -> x
+ mov x4, x16 // -> y
+
+ mov x5, x7 // = n
+ ldr x6, [STKARG(0)] // -> cyv
+ .endif
+
+ .ifeqs "\mode", "mont"
+ ldr q6, [x4]
+ zip2 v7.8h, v6.8h, v31.8h // (m'_2, m''_2; m'_3, m''_3)
+ zip1 v6.8h, v6.8h, v31.8h // (m'_0, m''_0; m'_1, m''_1)
+
+ mov x4, x2 // -> y
+ mov x2, x3 // -> x
+ mov x3, x1 // -> c
+
+ // x5 // = n
+ // x6 // -> cyv
+ .endif
+.endm
+
+.macro testldcarry
+ ld1 {v28.2d-v30.2d}, [x3]
+ mov x15, #0
+.endm
+
+.macro testtop
+0: sub x5, x5, #1
+.endm
+
+.macro testtail
+ cbnz x5, 0b
+.endm
+
+.macro testcarryout
+ // More complicated than usual because we must mix the general-
+ // purpose carry back in.
+ lsr x15, x15, #16
+ mov v0.d[0], x15
+ mov v0.d[1], xzr
+ add v28.2d, v28.2d, v0.2d
+ st1 {v28.2d-v30.2d}, [x3]
+.endm
+
+.macro testepilogue
+ popreg x29, x30
+ ret
+.endm
+
+FUNC(test_dmul4)
+ testprologue dmul
+ testldcarry
+ testtop
+ bl dmul4
+ testtail
+ testcarryout
+ testepilogue
+ENDFUNC
+
+FUNC(test_dmla4)
+ testprologue dmul
+ testldcarry
+ testtop
+ bl dmla4
+ testtail
+ testcarryout
+ testepilogue
+ENDFUNC
+
+FUNC(test_mul4)
+ testprologue smul
+ testldcarry
+ testtop
+ bl mul4
+ testtail
+ testcarryout
+ testepilogue
+ENDFUNC
+
+FUNC(test_mul4zc)
+ testprologue smul
+ testldcarry
+ testtop
+ bl mul4zc
+ testtail
+ testcarryout
+ testepilogue
+ENDFUNC
+
+FUNC(test_mla4)
+ testprologue smul
+ testldcarry
+ testtop
+ bl mla4
+ testtail
+ testcarryout
+ testepilogue
+ENDFUNC
+
+FUNC(test_mla4zc)
+ testprologue smul
+ testldcarry
+ testtop
+ bl mla4zc
+ testtail
+ testcarryout
+ testepilogue
+ENDFUNC
+
+FUNC(test_mmul4)
+ testprologue mmul
+ testtop
+ bl mmul4
+ testtail
+ stp q4, q5, [x4]
+ testcarryout
+ testepilogue
+ENDFUNC
+
+FUNC(test_mmla4)
+ testprologue mmul
+ testtop
+ bl mmla4
+ testtail
+ stp q4, q5, [x4]
+ testcarryout
+ testepilogue
+ENDFUNC
+
+FUNC(test_mont4)
+ testprologue mont
+ testtop
+ bl mont4
+ testtail
+ stp q4, q5, [x4]
+ testcarryout
+ testepilogue
+ENDFUNC
+
+#endif
+
+///----- That's all, folks --------------------------------------------------