X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/cd303963aec7291de8aabf9aa04d1423fe7dcac4..ea1b3cec199052eda3a0054d86c70e948c6e7580:/math/mpx-mul4-arm64-simd.S diff --git a/math/mpx-mul4-arm64-simd.S b/math/mpx-mul4-arm64-simd.S new file mode 100644 index 00000000..8f482dd8 --- /dev/null +++ b/math/mpx-mul4-arm64-simd.S @@ -0,0 +1,1217 @@ +/// -*- mode: asm; asm-comment-char: ?/ -*- +/// +/// Large SIMD-based multiplications +/// +/// (c) 2019 Straylight/Edgeware +/// + +///----- Licensing notice --------------------------------------------------- +/// +/// This file is part of Catacomb. +/// +/// Catacomb is free software: you can redistribute it and/or modify it +/// under the terms of the GNU Library General Public License as published +/// by the Free Software Foundation; either version 2 of the License, or +/// (at your option) any later version. +/// +/// Catacomb is distributed in the hope that it will be useful, but +/// WITHOUT ANY WARRANTY; without even the implied warranty of +/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +/// Library General Public License for more details. +/// +/// You should have received a copy of the GNU Library General Public +/// License along with Catacomb. If not, write to the Free Software +/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, +/// USA. + +///-------------------------------------------------------------------------- +/// Preliminaries. + +#include "config.h" +#include "asm-common.h" + + .text + +///-------------------------------------------------------------------------- +/// Theory. +/// +/// We define a number of primitive fixed-size multipliers from which we can +/// construct more general variable-length multipliers. +/// +/// The basic trick is the same throughout. In an operand-scanning +/// multiplication, the inner multiplication loop multiplies a multiple- +/// precision operand by a single precision factor, and adds the result, +/// appropriately shifted, to the result. A `finely integrated operand +/// scanning' implementation of Montgomery multiplication also adds the +/// product of a single-precision `Montgomery factor' and the modulus, +/// calculated in the same pass. The more common `coarsely integrated +/// operand scanning' alternates main multiplication and Montgomery passes, +/// which requires additional carry propagation. +/// +/// Throughout both plain-multiplication and Montgomery stages, then, one of +/// the factors remains constant throughout the operation, so we can afford +/// to take a little time to preprocess it. The transformation we perform is +/// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a +/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into +/// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit +/// pieces are placed into 32-bit cells, and arranged as two 128-bit SIMD +/// operands, as follows. +/// +/// Offset 0 4 8 12 +/// 0 v'_0 v''_0 v'_1 v''_1 +/// 16 v'_2 v''_2 v'_3 v''_3 +/// +/// The `umull' and `umlal' instructions can multiply a vector of two 32-bit +/// values by a 32-bit scalar, giving two 64-bit results; thus, it will act +/// on (say) v'_0 and v''_0 in a single instruction, to produce two 48-bit +/// results in 64-bit fields. The sixteen bits of headroom allows us to add +/// many products together before we must deal with carrying; it also allows +/// for some calculations to be performed on the above expanded form. 
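+///
+/// For example, if v_0 = 0x89abcdef then v'_0 = 0xcdef and v''_0 = 0x89ab,
+/// and these two pieces occupy the first two 32-bit lanes of the low
+/// expanded register.  Multiplying that pair of lanes by a 32-bit scalar u
+/// with `umull' gives the 64-bit lanes (u v'_0; u v''_0), each strictly
+/// less than 2^48, which is where the sixteen bits of headroom come from.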
+///
+/// We maintain three `carry' registers, v28--v30, accumulating intermediate
+/// results; we name them `c0', `c1', and `c2'.  Each carry register holds
+/// two 64-bit halves: the register c0, for example, holds c'_0 (low half)
+/// and c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the
+/// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2.
+/// The `umull' or `umlal' instruction acting on a scalar operand and an
+/// operand in the expanded form above produces a result which can be added
+/// directly to the appropriate carry register.
+///
+/// An unusual feature of this code, as compared to the other `mul4'
+/// implementations, is that it makes extensive use of the ARM64
+/// general-purpose registers for carry resolution and output construction.
+/// As a result, an additional general-purpose register (typically x15) is
+/// used as an additional carry, with the carry value in bits 16--63.
+///
+/// Multiplication is performed in product-scanning order, since ARM
+/// processors commonly implement result forwarding for consecutive multiply-
+/// and-accumulate instructions specifying the same destination.
+/// Experimentally, this runs faster than an operand-scanning arrangement
+/// intended to hide instruction latencies.
+///
+/// On 64-bit ARM, we have a vast supply of registers: the expanded
+/// operands are kept in registers.  The packed operands are read from memory
+/// into working registers v0 and v1.  The following conventional argument
+/// names and locations are used throughout.
+///
+/// Arg  Format      Location    Notes
+///
+/// U    packed      [x1]
+/// X    packed      [x2]        In Montgomery multiplication, X = N
+/// V    expanded    v2/v3
+/// Y    expanded    v4/v5       In Montgomery multiplication, Y = (A + U V) M
+/// M    expanded    v6/v7       -N^{-1} (mod B^4)
+/// N                            Modulus, for Montgomery multiplication
+/// A    packed      [x0]        Destination/accumulator
+/// C    carry       v28--v30
+/// 0                v31         128-bit zero
+///
+/// The calculation is some variant of
+///
+///     A' + C' B^4 <- U V + X Y + A + C
+///
+/// The low-level functions fit into a fairly traditional (finely-integrated)
+/// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y)
+/// (indexed by i).
+///
+/// The variants are as follows.
+///
+/// Function    Variant                 Use             i       j
+///
+/// mmul4       A = C = 0               Montgomery      0       0
+/// dmul4       A = 0                   Montgomery      0       +
+/// mmla4       C = 0                   Montgomery      +       0
+/// dmla4       exactly as shown        Montgomery      +       +
+///
+/// mul4zc      U = V = A = C = 0       Plain           0       0
+/// mul4        U = V = A = 0           Plain           0       +
+/// mla4zc      U = V = C = 0           Plain           +       0
+/// mla4        U = V = 0               Plain           +       +
+///
+/// The `mmul4' and `mmla4' functions are also responsible for calculating
+/// the Montgomery reduction factor Y = (A + U V) M used by the rest of the
+/// inner loop.
+
+///--------------------------------------------------------------------------
+/// Macro definitions.
+
+.macro  mulacc  z, u, v, x=nil, y=nil
+       // Set Z = Z + U V + X Y, using the low halves of V and Y.  Y may be
+       // `nil' to omit the second operand.  Z, V, and Y should be 128-bit
+       // `vN' registers; and U and X should be 32-bit `vN.s[I]' scalars;
+       // the multiplications produce two 64-bit elementwise products, which
+       // are added elementwise to Z.
+
+       umlal   \z\().2d, \v\().2s, \u
+       .ifnes "\y", "nil"
+        umlal  \z\().2d, \y\().2s, \x
+       .endif
+.endm
+
+.macro  mulacc2 z, u, v, x=nil, y=nil
+       // Set Z = Z + U V + X Y, using the high halves of V and Y; see
+       // `mulacc'.
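+       // The `2' instruction forms (`umlal2' here, and `umull2' in
+       // `mulini2' below) read elements 2 and 3 -- the upper 64-bit half --
+       // of the `.4s' source vector, which is why V and Y are written as
+       // full `.4s' vectors here rather than as `.2s' halves.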
+ + umlal2 \z\().2d, \v\().4s, \u + .ifnes "\y", "nil" + umlal2 \z\().2d, \y\().4s, \x + .endif +.endm + +.macro mulinit z, zinitp, u, v=nil, x=nil, y=nil + // If ZINITP then set Z = Z + U V + X Y, as for `mulacc'; otherwise, + // set Z = U V + X Y. Operand requirements and detailed operation + // are as for `mulacc'. + + .ifeqs "\zinitp", "t" + mulacc \z, \u, \v, \x, \y + .else + umull \z\().2d, \v\().2s, \u + .ifnes "\y", "nil" + umlal \z\().2d, \y\().2s, \x + .endif + .endif +.endm + +.macro mulini2 z, zinitp, u, v=nil, x=nil, y=nil + // As `mulinit', but with the high halves of V and Y. + + .ifeqs "\zinitp", "t" + mulacc2 \z, \u, \v, \x, \y + .else + umull2 \z\().2d, \v\().4s, \u + .ifnes "\y", "nil" + umlal2 \z\().2d, \y\().4s, \x + .endif + .endif +.endm + +// `mulI': accumulate the B^I and b B^i terms of the polynomial product sum U +// V + X Y, given that U = u_0 + B u_1 + B^2 u_2 + B^3 u_3 (and similarly for +// x), and V = v'_0 + b v''_0 + B (v'_1 + b v''_1) + B^2 (v'_2 + b v''_2) + +// B^3 (v'_3 + b v''_3) (and similarly for Y). The 64-bit coefficients are +// added into the low and high halves of the 128-bit register Z (if ZINIT is +// `nil' then simply set Z, as if it were initially zero). +.macro mul0 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil + mulinit \z, \zinitp, \u\().s[0], \v0, \x\().s[0], \y0 +.endm +.macro mul1 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil + mulini2 \z, \zinitp, \u\().s[0], \v0, \x\().s[0], \y0 + mulacc \z, \u\().s[1], \v0, \x\().s[1], \y0 +.endm +.macro mul2 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil + mulinit \z, \zinitp, \u\().s[0], \v1, \x\().s[0], \y1 + mulacc2 \z, \u\().s[1], \v0, \x\().s[1], \y0 + mulacc \z, \u\().s[2], \v0, \x\().s[2], \y0 +.endm +.macro mul3 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil + mulini2 \z, \zinitp, \u\().s[0], \v1, \x\().s[0], \y1 + mulacc \z, \u\().s[1], \v1, \x\().s[1], \y1 + mulacc2 \z, \u\().s[2], \v0, \x\().s[2], \y0 + mulacc \z, \u\().s[3], \v0, \x\().s[3], \y0 +.endm +.macro mul4 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil + mulini2 \z, \zinitp, \u\().s[1], \v1, \x\().s[1], \y1 + mulacc \z, \u\().s[2], \v1, \x\().s[2], \y1 + mulacc2 \z, \u\().s[3], \v0, \x\().s[3], \y0 +.endm +.macro mul5 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil + mulini2 \z, \zinitp, \u\().s[2], \v1, \x\().s[2], \y1 + mulacc \z, \u\().s[3], \v1, \x\().s[3], \y1 +.endm +.macro mul6 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil + mulini2 \z, \zinitp, \u\().s[3], \v1, \x\().s[3], \y1 +.endm + +// Steps in the process of propagating carry bits upwards from ZLO (a 128-bit +// `vN' register). Here, T0, T1, and CG are 64-bit `xN' general-purpose +// registers clobbered in the process. Set the low 32 bits of the 64-bit +// `xN' general-purpose register ZOUT to the completed coefficient z_1, +// leaving a carry in CG. +// +// In detail, what happens is as follows. Suppose initially that ZLO = +// (z'_i; z''_i) and ZHI = (z'_{i+1}; z''_{i+1}). Let t = z'_i + b z''_i; +// observe that floor(t/b) = floor(z'_i/b) + z''_i. Let z_i = t mod B, and +// add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}. This has +// a circuit depth of 3; I don't know how to do better. +// +// Output words are left in the low half of a 64-bit register, with rubbish +// in the high half. Two such results can be combined using the `bfi' +// instruction. +.macro carry0 zlo, cg=x15, t0=x16, t1=x17 + // Capture the values of carry-register ZLO and CG (if not `nil') in + // general-purpose registers T0 and T1, suitable for use in `carry1'. 
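+       // In terms of the description above, T0 receives z'_i plus the
+       // incoming carry (CG keeps its value in bits 16--63, so it is
+       // shifted down by 16 before being added in), and T1 receives z''_i
+       // unchanged.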
+ mov \t0, \zlo\().d[0] + mov \t1, \zlo\().d[1] + .ifnes "\cg", "nil" + add \t0, \t0, \cg, lsr #16 + .endif +.endm +.macro carry1 zout, cg=x15, t0=x16, t1=x17 + // Collect a 32-bit output word in the low 32 bits of ZOUT (leaving + // rubbish in the high 32 bits), and update CG suitably to continue + // processing with the next carry register. + .ifnes "\zout", "nil" + add \zout, \t0, \t1, lsl #16 + .endif + .ifnes "\cg", "nil" + add \cg, \t1, \t0, lsr #16 + .endif +.endm + +.macro expand vlo, vhi, vz=v31 + // Expand the packed 128-bit operand in VLO to an expanded operand in + // VLO and VHI, assuming that VZ is all-bits-zero. All three are + // `vN' 128-bit SIMD registers. + zip2 \vhi\().8h, \vlo\().8h, \vz\().8h + zip1 \vlo\().8h, \vlo\().8h, \vz\().8h +.endm + +.macro sprdacc a0, a1, a2, a3=nil + // Spread the packed 128-bit operand in A0 into carry-format values + // in A0, A1, A2, A3. If A3 is `nil', then spread the same value + // into A0, A1, A2 only, clobbering x16. + .ifeqs "\a3", "nil" + mov w16, \a0\().s[3] + .endif + trn2 \a2\().2d, \a0\().2d, v31.2d + trn2 \a1\().2s, \a0\().2s, v31.2s + .ifeqs "\a3", "nil" + lsl x16, x16, #16 + .endif + trn1 \a0\().2s, \a0\().2s, v31.2s + .ifeqs "\a3", "nil" + mov \a2\().d[1], x16 + .else + trn2 \a3\().2s, \a2\().2s, v31.2s + .endif + mov \a2\().s[1], wzr +.endm + +.macro crryacc a0, a1, a2, a3, c0, c1, c2 + // Add the carry-format values A0, A1, A2 into the existing carries + // C0, C1, C2 (leaving A3 where it is). + add \c0\().2d, \c0\().2d, \a0\().2d + add \c1\().2d, \c1\().2d, \a1\().2d + add \c2\().2d, \c2\().2d, \a2\().2d +.endm + +///-------------------------------------------------------------------------- +/// Primitive multipliers and related utilities. + +INTFUNC(carryprop) + // On entry, x0 points to a destination, and v28--v30 and x15 hold + // incoming carries c0--c2 and cg. On exit, the low 128 bits of the + // carry value are stored at [x0]; the remaining 16 bits of carry are + // left in x10; x0 is advanced by 16; and x11--x17 are clobbered. + endprologue + + carry0 v28 + carry1 x11 + carry0 v29 + carry1 x13 + carry0 v30 + carry1 x12 + bfi x11, x13, #32, #32 + lsr x14, x15, #16 + lsr x10, x15, #48 + bfi x12, x14, #32, #32 + stp x11, x12, [x0], #16 + ret +ENDFUNC + +INTFUNC(dmul4) + // On entry, x0 points to the destination; x1 and x2 point to packed + // operands U and X; v2/v3 and v4/v5 hold expanded operands V and Y; + // v28--v30 and x15 hold incoming carries c0--c2 and cg; and v31 is + // zero. On exit, the destination and carries are updated; x0, x1, + // x2 are each advanced by 16; v2--v5 and v8--v15 are preserved; and + // x11--x14, x16, x17 and the other SIMD registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + ldr q0, [x1], #16 + ldr q1, [x2], #16 + + // Do the multiplication. + mul0 v28, t, v0, v2, v3, v1, v4, v5 + mul1 v29, t, v0, v2, v3, v1, v4, v5 + carry0 v28 + mul2 v30, t, v0, v2, v3, v1, v4, v5 + carry1 x11 + carry0 v29 + mul3 v27, nil, v0, v2, v3, v1, v4, v5 + carry1 x13 + carry0 v30 + mul4 v28, nil, v0, v2, v3, v1, v4, v5 + carry1 x12 + carry0 v27 + mul5 v29, nil, v0, v2, v3, v1, v4, v5 + carry1 x14 + mul6 v30, nil, v0, v2, v3, v1, v4, v5 + + // Finish up and store the result. + bfi x11, x13, #32, #32 + bfi x12, x14, #32, #32 + stp x11, x12, [x0], #16 + + // All done. 
+       ret
+ENDFUNC
+
+INTFUNC(dmla4)
+       // On entry, x0 points to the destination/accumulator A; x1 and x2
+       // point to packed operands U and X; v2/v3 and v4/v5 hold expanded
+       // operands V and Y; v28--v30 and x15 hold incoming carries c0--c2
+       // and cg; and v31 is zero.  On exit, the accumulator and carries are
+       // updated; x0, x1, x2 are each advanced by 16; v2--v5 and v8--v15
+       // are preserved; and x11--x14, x16, x17 and the other SIMD registers
+       // are clobbered.
+       endprologue
+
+       // Start by loading the operand words from memory.
+       ldr     q24, [x0]
+       ldr     q0, [x1], #16
+       ldr     q1, [x2], #16
+       sprdacc v24, v25, v26, v27
+       crryacc v24, v25, v26, v27, v28, v29, v30
+
+       // Do the multiplication.
+       mul0    v28, t, v0, v2, v3, v1, v4, v5
+       mul1    v29, t, v0, v2, v3, v1, v4, v5
+       carry0  v28
+       mul2    v30, t, v0, v2, v3, v1, v4, v5
+       carry1  x11
+       carry0  v29
+       mul3    v27, t, v0, v2, v3, v1, v4, v5
+       carry1  x13
+       carry0  v30
+       mul4    v28, nil, v0, v2, v3, v1, v4, v5
+       carry1  x12
+       carry0  v27
+       mul5    v29, nil, v0, v2, v3, v1, v4, v5
+       carry1  x14
+       mul6    v30, nil, v0, v2, v3, v1, v4, v5
+
+       // Finish up and store the result.
+       bfi     x11, x13, #32, #32
+       bfi     x12, x14, #32, #32
+       stp     x11, x12, [x0], #16
+
+       // All done.
+       ret
+ENDFUNC
+
+INTFUNC(mul4)
+       // On entry, x0 points to the destination; x2 points to a packed
+       // operand X; v4/v5 holds an expanded operand Y; v28--v30 and x15
+       // hold incoming carries c0--c2 and cg; and v31 is zero.  On exit,
+       // the destination and carries are updated; x0 and x2 are each
+       // advanced by 16; v4 and v5 and v8--v15 are preserved; and x11--x14,
+       // x16, x17 and the other SIMD registers are clobbered.
+       endprologue
+
+       // Start by loading the operand words from memory.
+       ldr     q1, [x2], #16
+
+       // Do the multiplication.
+       mul0    v28, t, v1, v4, v5
+       mul1    v29, t, v1, v4, v5
+       carry0  v28
+       mul2    v30, t, v1, v4, v5
+       carry1  x11
+       carry0  v29
+       mul3    v27, nil, v1, v4, v5
+       carry1  x13
+       carry0  v30
+       mul4    v28, nil, v1, v4, v5
+       carry1  x12
+       carry0  v27
+       mul5    v29, nil, v1, v4, v5
+       carry1  x14
+       mul6    v30, nil, v1, v4, v5
+
+       // Finish up and store the result.
+       bfi     x11, x13, #32, #32
+       bfi     x12, x14, #32, #32
+       stp     x11, x12, [x0], #16
+
+       // All done.
+       ret
+ENDFUNC
+
+INTFUNC(mul4zc)
+       // On entry, x0 points to the destination; x2 points to a packed
+       // operand X; v4/v5 holds an expanded operand Y; and v31 is zero.  On
+       // exit, the destination is updated; v28--v30 and x15 hold outgoing
+       // carries c0--c2 and cg; x0 and x2 are each advanced by 16; v4 and
+       // v5 and v8--v15 are preserved; and x11--x14, x16, x17 and the other
+       // SIMD registers are clobbered.
+       endprologue
+
+       // Start by loading the operand words from memory.
+       ldr     q1, [x2], #16
+
+       // Do the multiplication.
+       mul0    v28, nil, v1, v4, v5
+       mul1    v29, nil, v1, v4, v5
+       carry0  v28, nil
+       mul2    v30, nil, v1, v4, v5
+       carry1  x11
+       carry0  v29
+       mul3    v27, nil, v1, v4, v5
+       carry1  x13
+       carry0  v30
+       mul4    v28, nil, v1, v4, v5
+       carry1  x12
+       carry0  v27
+       mul5    v29, nil, v1, v4, v5
+       carry1  x14
+       mul6    v30, nil, v1, v4, v5
+
+       // Finish up and store the result.
+       bfi     x11, x13, #32, #32
+       bfi     x12, x14, #32, #32
+       stp     x11, x12, [x0], #16
+
+       // All done.
+       ret
+ENDFUNC
+
+INTFUNC(mla4)
+       // On entry, x0 points to the destination/accumulator A; x2 points to
+       // a packed operand X; v4/v5 holds an expanded operand Y; v28--v30
+       // and x15 hold incoming carries c0--c2 and cg; and v31 is zero. 
On + // exit, the accumulator and carries are updated; x0 and x2 are each + // advanced by 16; v4 and v5 and v8--v15 are preserved; and x11--x14, + // x16, x17 and the other SIMD registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + ldr q24, [x0] + ldr q1, [x2], #16 + sprdacc v24, v25, v26, v27 + crryacc v24, v25, v26, v27, v28, v29, v30 + + // Do the multiplication. + mul0 v28, t, v1, v4, v5 + mul1 v29, t, v1, v4, v5 + carry0 v28 + mul2 v30, t, v1, v4, v5 + carry1 x11 + carry0 v29 + mul3 v27, t, v1, v4, v5 + carry1 x13 + carry0 v30 + mul4 v28, nil, v1, v4, v5 + carry1 x12 + carry0 v27 + mul5 v29, nil, v1, v4, v5 + carry1 x14 + mul6 v30, nil, v1, v4, v5 + + // Finish up and store the result. + bfi x11, x13, #32, #32 + bfi x12, x14, #32, #32 + stp x11, x12, [x0], #16 + + // All done. + ret +ENDFUNC + +INTFUNC(mla4zc) + // On entry, x0 points to the destination/accumulator A; x2 points to + // a packed operand X; v4/v5 holds an expanded operand Y; and v31 is + // zero. On exit, the accumulator is updated; v28--v30 and x15 hold + // outgoing carries c0--c2 and cg; x0 and x2 are each advanced by 16; + // v4, v5, and v8--v15 are preserved; and x11--x14, x16, x17 and the + // other SIMD registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + ldr q28, [x0] + ldr q1, [x2], #16 + sprdacc v28, v29, v30, v27 + + // Do the multiplication. + mul0 v28, t, v1, v4, v5 + mul1 v29, t, v1, v4, v5 + carry0 v28, nil + mul2 v30, t, v1, v4, v5 + carry1 x11 + carry0 v29 + mul3 v27, t, v1, v4, v5 + carry1 x13 + carry0 v30 + mul4 v28, nil, v1, v4, v5 + carry1 x12 + carry0 v27 + mul5 v29, nil, v1, v4, v5 + carry1 x14 + mul6 v30, nil, v1, v4, v5 + + // Finish up and store the result. + bfi x11, x13, #32, #32 + bfi x12, x14, #32, #32 + stp x11, x12, [x0], #16 + + // All done. + ret +ENDFUNC + +INTFUNC(mmul4) + // On entry, x0 points to the destination; x1 points to a packed + // operand U; x2 points to a packed operand X (the modulus); v2/v3 + // holds an expanded operand V; and v6/v7 holds an expanded operand M + // (the Montgomery factor -N^{-1} (mod B)). On exit, the destination + // is updated (to zero); v4/v5 hold an expanded factor Y = U V M (mod + // B); v28--v30 and x15 hold outgoing carries c0--c2 and cg; x0, x1, + // and x2 are each advanced by 16; v2, v3, and v8--v15 are preserved; + // and x11--x14, x16, x17 and the other SIMD registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + ldr q0, [x1], #16 + ldr q1, [x2], #16 + + // Calculate the low half of W = A + U V, being careful to leave the + // carries in place. + mul0 v28, nil, v0, v2, v3 + mul1 v29, nil, v0, v2, v3 + carry0 v28, nil + mul2 v30, nil, v0, v2, v3 + carry1 x11 + carry0 v29 + mul3 v27, nil, v0, v2, v3 + b mmla4_common +ENDFUNC + +INTFUNC(mmla4) + // On entry, x0 points to the destination/accumulator A; x1 points to + // a packed operand U; x2 points to a packed operand X (the modulus); + // v2/v3 holds an expanded operand V; and v6/v7 holds an expanded + // operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the + // accumulator is updated (to zero); v4/v5 hold an expanded factor Y + // = (A + U V) M (mod B); v28--v30 and x15 hold outgoing carries + // c0--c2 and cg; x0, x1, and x2 are each advanced by 16; v2, v3, v6, + // v7, and v8--v15 are preserved; and x11--x14, x16, x17 and the + // other SIMD registers are clobbered. + endprologue + + // Start by loading the operand words from memory. 
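+       // (Here A comes from [x0] and is spread into carry format so that
+       // the multiplication below can accumulate on top of it; U and X --
+       // the modulus N -- come from [x1] and [x2] respectively.)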
+ ldr q28, [x0] + ldr q0, [x1], #16 + ldr q1, [x2], #16 + sprdacc v28, v29, v30, v27 + + // Calculate the low half of W = A + U V, being careful to leave the + // carries in place. + mul0 v28, t, v0, v2, v3 + mul1 v29, t, v0, v2, v3 + carry0 v28, nil + mul2 v30, t, v0, v2, v3 + carry1 x11 + carry0 v29 + mul3 v27, t, v0, v2, v3 +mmla4_common: + carry1 x13 + carry0 v30 + carry1 x12 + carry0 v27 + carry1 x14, nil + + // Piece the result together and ship it back. + bfi x11, x13, #32, #32 + bfi x12, x14, #32, #32 + mov v16.d[0], x11 + mov v16.d[1], x12 + + // Calculate the low half of the Montgomery factor Y = W M. + mul0 v18, nil, v16, v6, v7 + mul1 v19, nil, v16, v6, v7 + carry0 v18, nil + mul2 v20, nil, v16, v6, v7 + carry1 x11 + carry0 v19 + mul3 v21, nil, v16, v6, v7 + carry1 x13 + carry0 v20 + carry1 x12 + carry0 v21 + carry1 x14, nil + + // Piece the result together, ship it back, and expand. + bfi x11, x13, #32, #32 + bfi x12, x14, #32, #32 + mov v4.d[0], x11 + mov v4.d[1], x12 + expand v4, v5 + + // Build up the product X Y in the carry slots. + mul0 v28, t, v1, v4, v5 + mul1 v29, t, v1, v4, v5 + carry0 v28, nil + mul2 v30, t, v1, v4, v5 + carry1 nil + carry0 v29 + mul3 v27, t, v1, v4, v5 + carry1 nil + carry0 v30 + + // And complete the calculation. + mul4 v28, nil, v0, v2, v3, v1, v4, v5 + carry1 nil + carry0 v27 + mul5 v29, nil, v0, v2, v3, v1, v4, v5 + carry1 nil + mul6 v30, nil, v0, v2, v3, v1, v4, v5 + + // Finish up and store the result. + stp xzr, xzr, [x0], #16 + + // All done. + ret +ENDFUNC + +INTFUNC(mont4) + // On entry, x0 points to the destination/accumulator A; x2 points to + // a packed operand X (the modulus); and v6/v7 holds an expanded + // operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the + // accumulator is updated (to zero); v4/v5 hold an expanded factor Y + // = A M (mod B); v28--v30 and x15 hold outgoing carries c0--c2 and + // cg; x0 and x2 are each advanced by 16; v6, v7, and v8--v15 are + // preserved; and x11--x14, x16, x17 and the other SIMD registers are + // clobbered. + endprologue + + // Start by loading the operand words from memory. + ldr q28, [x0] + ldr q1, [x2], #16 + + // Calculate Y = A M (mod B). + mul0 v18, nil, v28, v6, v7 + mul1 v19, nil, v28, v6, v7 + carry0 v18, nil + mul2 v20, nil, v28, v6, v7 + carry1 x11 + carry0 v19 + mul3 v21, nil, v28, v6, v7 + carry1 x13 + carry0 v20 + sprdacc v28, v29, v30, v27 + carry1 x12 + carry0 v21 + carry1 x14, nil + + // Piece the result together, ship it back, and expand. + bfi x11, x13, #32, #32 + bfi x12, x14, #32, #32 + mov v4.d[0], x11 + mov v4.d[1], x12 + expand v4, v5 + + // Calculate the actual result. Well, the carries, at least. + mul0 v28, t, v1, v4, v5 + mul1 v29, t, v1, v4, v5 + carry0 v28, nil + mul2 v30, t, v1, v4, v5 + carry1 nil + carry0 v29 + mul3 v27, t, v1, v4, v5 + carry1 nil + carry0 v30 + + // And complete the calculation. + mul4 v28, nil, v1, v4, v5 + carry1 nil + carry0 v27 + mul5 v29, nil, v1, v4, v5 + carry1 nil + mul6 v30, nil, v1, v4, v5 + + // Finish up and store the result. + stp xzr, xzr, [x0], #16 + + // All done. + ret +ENDFUNC + +///-------------------------------------------------------------------------- +/// Bulk multipliers. + +FUNC(mpx_umul4_arm64_simd) + // void mpx_umul4_arm64_simd(mpw *dv, const mpw *av, const mpw *avl, + // const mpw *bv, const mpw *bvl); + + // Establish the arguments and do initial setup. 
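+       //
+       // In outline (a sketch only, writing na = avl - av and
+       // nb = bvl - bv, both multiples of four 32-bit words):
+       //
+       //      for (i = 0; i < nb; i += 4) {
+       //        carry = 0;
+       //        for (j = 0; j < na; j += 4)
+       //          { carry, dv[i+j .. i+j+3] } =
+       //              dv[i+j .. i+j+3] + av[j .. j+3]*bv[i .. i+3] + carry;
+       //        dv[i+na .. i+na+3] = carry;
+       //      }
+       //
+       // except that on the first outer pass (i = 0) the old contents of dv
+       // are ignored rather than accumulated.  On entry, per the AAPCS64,
+       // x0 = dv, x1 = av, x2 = avl, x3 = bv, and x4 = bvl; these are
+       // reshuffled into the assignments below.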
+ // + // inner loop dv x0 + // inner loop av x2 + // outer loop dv x5 + // outer loop bv x3 + // av base x1 + // inner n x6 + // n base x7 + // outer n x4 + pushreg x29, x30 + setfp + endprologue + + // Prepare for the first iteration. + ldr q4, [x3], #16 // Y = bv[0] + movi v31.4s, #0 + sub x7, x2, x1 // = inner loop count base + // x0 // = dv for inner loop + // x1 // = av base + // x3 // = bv for outer loop + sub x4, x4, x3 // = outer loop count (decremented) + sub x6, x7, #16 // = inner loop count (decremented) + mov x2, x1 // = av for inner loop + add x5, x0, #16 // = dv for outer loop + expand v4, v5 // expand Y + bl mul4zc + cbz x6, 8f // all done? + + // Continue with the first iteration. +0: sub x6, x6, #16 + bl mul4 + cbnz x6, 0b + + // Write out the leftover carry. There can be no tail here. +8: bl carryprop + cbz x4, 9f + + // Set up for the next pass. +1: ldr q4, [x3], #16 // Y = bv[i] + mov x0, x5 // -> dv[i] + mov x2, x1 // -> av[0] + add x5, x5, #16 + sub x6, x7, #16 // = inner loop count (decremented) + sub x4, x4, #16 // outer loop count + expand v4, v5 // expand Y + bl mla4zc + cbz x6, 8f + + // Continue... +0: sub x6, x6, #16 + bl mla4 + cbnz x6, 0b + + // Finish off this pass. There was no tail on the previous pass, and + // there can be done on this pass. +8: bl carryprop + cbnz x4, 1b + + // All over. +9: popreg x29, x30 + ret +ENDFUNC + +FUNC(mpxmont_mul4_arm64_simd) + // void mpxmont_mul4_arm64_simd(mpw *dv, + // const mpw *av, const mpw *bv, + // const mpw *nv, size_t n, + // const mpw *mi); + + // Establish the arguments and do initial setup. + // + // inner loop dv x0 + // inner loop av x1 + // inner loop nv x2 + // nv base x3 + // base n x4 + // mi (x5) + // outer loop dv x5 + // outer loop bv x6 + // av base x7 + // inner n x8 + // outer n x9 + // c x10 + pushreg x29, x30 + setfp + endprologue + + // Set up the outer loop state and prepare for the first iteration. + ldr q2, [x2] // = V = bv[0] + ldr q6, [x5] // = M + movi v31.4s, #0 + // x0 // -> dv for inner loop + // x1 // -> av for inner loop + // x3 // -> nv base + // x4 // = n base + add x5, x0, #16 // -> dv + add x6, x2, #16 // -> bv + mov x2, x3 // -> nv[0] + mov x7, x1 // -> av base + sub x8, x4, #4 // = inner n (decremented) + sub x9, x4, #4 // = outer n (decremented) + expand v2, v3 // expand V + expand v6, v7 // expand M + bl mmul4 + cbz x8, 8f // done already? + + // Complete the first inner loop. +0: sub x8, x8, #4 + bl dmul4 + cbnz x8, 0b // done yet? + + // Still have carries left to propagate. Rather than store the tail + // end in memory, keep it in x10 for later. + bl carryprop + + // Embark on the next iteration. (There must be one. If n = 1 then + // we would have bailed above, to label 8. Similarly, the subsequent + // iterations can fall into the inner loop immediately.) +1: ldr q2, [x6], #16 // = Y = bv[i] + mov x0, x5 // -> dv[i] + mov x1, x7 // -> av[0] + mov x2, x3 // -> nv[0] + add x5, x5, #16 + sub x8, x4, #4 + sub x9, x9, #4 + expand v2, v3 + bl mmla4 + + // Complete the next inner loop. +0: sub x8, x8, #4 + bl dmla4 + cbnz x8, 0b + + // Still have carries left to propagate, and they overlap the + // previous iteration's final tail, so read that and add it. + add x15, x15, x10, lsl #16 + bl carryprop + + // Back again, maybe. + cbnz x9, 1b + + // All done, almost. + str w10, [x0], #4 + popreg x29, x30 + ret + + // First iteration was short. Write out the carries and we're done. 
+ // (This could be folded into the main loop structure, but that would + // penalize small numbers more.) +8: bl carryprop + str w10, [x0], #4 + popreg x29, x30 + ret +ENDFUNC + +FUNC(mpxmont_redc4_arm64_simd) + // void mpxmont_redc4_arm64_simd(mpw *dv, mpw *dvl, const mpw *nv, + // size_t n, const mpw *mi); + + // Establish the arguments and do initial setup. + // + // inner loop dv x0 + // inner loop nv x2 + // blocks-of-4 count x6 + // tail count x7 + // mi (x4) + // outer loop dv x4 + // outer loop count x8 + // nv base x5 + // inner loop count x1 + // n x3 + // c x10 + // t0, t1 x11, x12 + + pushreg x29, x30 + setfp + endprologue + + // Set up the outer loop state and prepare for the first iteration. + ldr q6, [x4] // = M + movi v31.4s, #0 + // x0 // -> dv for inner loop + sub x6, x1, x0 // total dv bytes + sub x1, x3, #4 // inner loop counter + // x2 // -> nv for inner loop + // x3 // = n + add x4, x0, #16 // -> dv for outer loop + mov x5, x2 // -> nv base + sub x6, x6, x3, lsl #2 // dv carry range bytes + sub x8, x3, #4 // outer loop counter + sub x6, x6, #16 // dv steam-powered carry bytes + expand v6, v7 // expand M + and x7, x6, #15 // dv tail length in bytes + bic x6, x6, #15 // dv blocks-of-four length in bytes + + bl mont4 + cbz x1, 8f // done already? + +5: sub x1, x1, #4 + bl mla4 + cbnz x1, 5b // done yet? + + // Still have carries left to propagate. Adding the accumulator + // block into the carries is a little different this time, because + // all four accumulator limbs have to be squished into the three + // carry registers for `carryprop' to do its thing. +8: ldr q24, [x0] + sprdacc v24, v25, v26 + add v28.2d, v28.2d, v24.2d + add v29.2d, v29.2d, v25.2d + add v30.2d, v30.2d, v26.2d + bl carryprop + cbz x6, 7f + + // Propagate the first group of carries. + ldp x16, x17, [x0] + sub x1, x6, #16 + adds x16, x16, x10 + adcs x17, x17, xzr + stp x16, x17, [x0], #16 + cbz x1, 6f + + // Continue carry propagation until the end of the buffer. +0: ldp x16, x17, [x0] + sub x1, x1, #16 + adcs x16, x16, xzr + adcs x17, x17, xzr + stp x16, x17, [x0], #16 + cbnz x1, 0b + + // Deal with the tail end. Note that the actual destination length + // won't be an exacty number of blocks of four, so it's safe to just + // drop through here. +6: adc w10, wzr, wzr +7: ldr w16, [x0] + sub x1, x7, #4 + adds w16, w16, w10 + str w16, [x0], #4 + cbz x1, 8f +0: ldr w16, [x0] + sub x1, x1, #4 + adcs w16, w16, wzr + str w16, [x0], #4 + cbnz x1, 0b + + // All done for this iteration. Start the next. +8: cbz x8, 9f + mov x0, x4 + add x4, x4, #16 + sub x1, x3, #4 + mov x2, x5 + sub x8, x8, #4 + sub x6, x6, #16 + bl mont4 + b 5b + + // All over. +9: popreg x29, x30 + ret +ENDFUNC + +///-------------------------------------------------------------------------- +/// Testing and performance measurement. 
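+///
+/// The functions below are compiled only when `TEST_MUL4' is defined.  Each
+/// is a small wrapper which marshals its arguments from the test harness's
+/// calling convention, runs one of the primitives above in a counted loop,
+/// and passes the final carries (and, where applicable, the computed factor
+/// Y) back through memory.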
+ +#ifdef TEST_MUL4 + +// dmul smul mmul mont +// z x0 x0 x0 x0 x0 +// c x3 x1 x1 x1 x1 +// y x4 -- -- x2 x2 +// u x1 x2 -- x3 -- +// x x2 x3 x2 x4 x3 +// vv v2/v3 x4 -- x5 -- +// yy v4/v5 x5 x3 x6 -- +// mm v6/v7 -- -- -- x4 +// n x5 x6 x4 x7 x5 +// cyv x6 x7 x5 stk0 x6 + +#define STKARG(i) sp, #16 + i + +.macro testprologue mode + pushreg x29, x30 + setfp + endprologue + movi v31.4s, #0 + + .ifeqs "\mode", "dmul" + ldr q2, [x4] + zip2 v3.8h, v2.8h, v31.8h // (v'_2, v''_2; v'_3, v''_3) + zip1 v2.8h, v2.8h, v31.8h // (v'_0, v''_0; v'_1, v''_1) + + ldr q4, [x5] + zip2 v5.8h, v4.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3) + zip1 v4.8h, v4.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1) + + mov x16, x1 + mov x1, x2 // -> u + mov x2, x3 // -> x + mov x3, x16 // -> c + + mov x5, x6 // = n + mov x6, x7 // -> cyv + .endif + + .ifeqs "\mode", "smul" + ldr q4, [x3] + zip2 v5.8h, v4.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3) + zip1 v4.8h, v4.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1) + + // x2 // -> x + mov x3, x1 // -> c + mov x6, x5 // -> cyv + mov x5, x4 // = n + .endif + + .ifeqs "\mode", "mmul" + ldr q2, [x5] + zip2 v3.8h, v2.8h, v31.8h // (v'_2, v''_2; v'_3, v''_3) + zip1 v2.8h, v2.8h, v31.8h // (v'_0, v''_0; v'_1, v''_1) + + ldr q6, [x6] + zip2 v7.8h, v6.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3) + zip1 v6.8h, v6.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1) + + mov x16, x1 + mov x1, x3 // -> u + mov x3, x16 // -> c + + mov x16, x2 + mov x2, x4 // -> x + mov x4, x16 // -> y + + mov x5, x7 // = n + ldr x6, [STKARG(0)] // -> cyv + .endif + + .ifeqs "\mode", "mont" + ldr q6, [x4] + zip2 v7.8h, v6.8h, v31.8h // (m'_2, m''_2; m'_3, m''_3) + zip1 v6.8h, v6.8h, v31.8h // (m'_0, m''_0; m'_1, m''_1) + + mov x4, x2 // -> y + mov x2, x3 // -> x + mov x3, x1 // -> c + + // x5 // = n + // x6 // -> cyv + .endif +.endm + +.macro testldcarry + ld1 {v28.2d-v30.2d}, [x3] + mov x15, #0 +.endm + +.macro testtop +0: sub x5, x5, #1 +.endm + +.macro testtail + cbnz x5, 0b +.endm + +.macro testcarryout + // More complicated than usual because we must mix the general- + // purpose carry back in. + lsr x15, x15, #16 + mov v0.d[0], x15 + mov v0.d[1], xzr + add v28.2d, v28.2d, v0.2d + st1 {v28.2d-v30.2d}, [x3] +.endm + +.macro testepilogue + popreg x29, x30 + ret +.endm + +FUNC(test_dmul4) + testprologue dmul + testldcarry + testtop + bl dmul4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_dmla4) + testprologue dmul + testldcarry + testtop + bl dmla4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mul4) + testprologue smul + testldcarry + testtop + bl mul4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mul4zc) + testprologue smul + testldcarry + testtop + bl mul4zc + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mla4) + testprologue smul + testldcarry + testtop + bl mla4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mla4zc) + testprologue smul + testldcarry + testtop + bl mla4zc + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mmul4) + testprologue mmul + testtop + bl mmul4 + testtail + stp q4, q5, [x4] + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mmla4) + testprologue mmul + testtop + bl mmla4 + testtail + stp q4, q5, [x4] + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mont4) + testprologue mont + testtop + bl mont4 + testtail + stp q4, q5, [x4] + testcarryout + testepilogue +ENDFUNC + +#endif + +///----- That's all, folks --------------------------------------------------