/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Large SIMD-based multiplications
///
/// (c) 2019 Straylight/Edgeware
///
///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software: you can redistribute it and/or modify it
/// under the terms of the GNU Library General Public License as published
/// by the Free Software Foundation; either version 2 of the License, or
/// (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful, but
/// WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
/// Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb.  If not, write to the Free Software
/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
/// USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

	.text

///--------------------------------------------------------------------------
/// Theory.
///
/// We define a number of primitive fixed-size multipliers from which we can
/// construct more general variable-length multipliers.
///
/// The basic trick is the same throughout.  In an operand-scanning
/// multiplication, the inner multiplication loop multiplies a multiple-
/// precision operand by a single-precision factor, and adds the result,
/// appropriately shifted, to the result accumulator.  A `finely integrated
/// operand scanning' implementation of Montgomery multiplication also adds
/// the product of a single-precision `Montgomery factor' and the modulus,
/// calculated in the same pass.  The more common `coarsely integrated
/// operand scanning' alternates main multiplication and Montgomery passes,
/// which requires additional carry propagation.
///
/// Throughout both plain-multiplication and Montgomery stages, then, one of
/// the factors remains constant throughout the operation, so we can afford
/// to take a little time to preprocess it.  The transformation we perform
/// is as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
/// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SIMD
/// operands, as follows.
///
///	Offset	   12	    8	    4	    0
///	    0	v''_1	 v'_1	v''_0	 v'_0
///	   16	v''_3	 v'_3	v''_2	 v'_2
///
/// The `umull' and `umlal' instructions can multiply a vector of two 32-bit
/// values by a 32-bit scalar, giving two 64-bit results; thus, a single
/// instruction can act on (say) v'_0 and v''_0 together, to produce two
/// 48-bit results in 64-bit fields.  The sixteen bits of headroom allow us
/// to add many products together before we must deal with carrying; they
/// also allow for some calculations to be performed on the above expanded
/// form.
///
/// We maintain three `carry' registers, v28--v30, accumulating intermediate
/// results; we name them `c0', `c1', and `c2'.  Each carry register holds
/// two 64-bit halves: the register c0, for example, holds c'_0 (low half)
/// and c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b;
/// the carry registers collectively represent the value c_0 + c_1 B +
/// c_2 B^2.
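///
/// As a concrete illustration, here is a minimal C sketch of the expansion
/// step just described (the helper name `expand128' and the plain-array
/// representation of the SIMD operands are ours, for exposition only):
///
///	#include <stdint.h>
///
///	/* Split the packed 128-bit factor v[0], ..., v[3] (32-bit limbs,
///	 * least significant first) into eight 16-bit pieces, each widened
///	 * to a 32-bit cell: lo[] and hi[] model the two expanded SIMD
///	 * operands shown in the table above. */
///	static void expand128(const uint32_t v[4],
///			      uint32_t lo[4], uint32_t hi[4])
///	{
///	  int i;
///
///	  for (i = 0; i < 2; i++) {
///	    lo[2*i] = v[i]&0xffff;		/* v'_i */
///	    lo[2*i + 1] = v[i] >> 16;		/* v''_i */
///	    hi[2*i] = v[i + 2]&0xffff;		/* v'_{i+2} */
///	    hi[2*i + 1] = v[i + 2] >> 16;	/* v''_{i+2} */
///	  }
///	}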
/// The `umull' or `umlal' instruction acting on a scalar operand and an
/// operand in the expanded form above produces a result which can be added
/// directly to the appropriate carry register.
///
/// An unusual feature of this code, as compared to the other `mul4'
/// implementations, is that it makes extensive use of the ARM64
/// general-purpose registers for carry resolution and output construction.
/// As a result, an additional general-purpose register (typically x15) is
/// used as an additional carry, with the carry value in bits 16--63.
///
/// Multiplication is performed in product-scanning order, since ARM
/// processors commonly implement result forwarding for consecutive
/// multiply-and-accumulate instructions specifying the same destination.
/// Experimentally, this runs faster than operand-scanning orderings which
/// attempt to hide instruction latencies.
///
/// On 64-bit ARM, we have a vast supply of registers: the expanded
/// operands are kept in registers.  The packed operands are read from
/// memory into working registers v0 and v1.  The following conventional
/// argument names and locations are used throughout.
///
///	Arg	Format	 Location  Notes
///
///	U	packed	 [x1]
///	X	packed	 [x2]	   In Montgomery multiplication, X = N
///	V	expanded v2/v3
///	Y	expanded v4/v5	   In Montgomery multiplication,
///					Y = (A + U V) M
///	M	expanded v6/v7	   -N^{-1} (mod B^4)
///	N			   Modulus, for Montgomery multiplication
///	A	packed	 [x0]	   Destination/accumulator
///	C	carry	 v28--v30
///	0		 v31	   128-bit zero
///
/// The calculation is some variant of
///
///	A' + C' B^4 <- U V + X Y + A + C
///
/// The low-level functions fit into a fairly traditional (finely-
/// integrated) operand scanning loop over operand pairs (U, X) (indexed by
/// j) and (V, Y) (indexed by i).
///
/// The variants are as follows.
///
///	Function  Variant		Use		i	j
///
///	mmul4	  A = C = 0		Montgomery	0	0
///	dmul4	  A = 0			Montgomery	0	+
///	mmla4	  C = 0			Montgomery	+	0
///	dmla4	  exactly as shown	Montgomery	+	+
///
///	mul4zc	  U = V = A = C = 0	Plain		0	0
///	mul4	  U = V = A = 0		Plain		0	+
///	mla4zc	  U = V = C = 0		Plain		+	0
///	mla4	  U = V = 0		Plain		+	+
///
/// The `mmul4' and `mmla4' functions are also responsible for calculating
/// the Montgomery reduction factor Y = (A + U V) M used by the rest of the
/// inner loop.
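///
/// For reference, the following plain C sketch (ours, not part of
/// Catacomb's API; `step4' is a hypothetical name) computes the same
/// A' + C' B^4 <- U V + X Y + A + C step with ordinary 32-bit words,
/// ignoring the b/B split and the SIMD packing:
///
///	#include <stdint.h>
///
///	/* U, V, X, Y, A are four 32-bit words each; the carry C is kept
///	 * as five 32-bit words, which is more than wide enough for the
///	 * carries this step can produce. */
///	static void step4(uint32_t a[4],
///			  const uint32_t u[4], const uint32_t v[4],
///			  const uint32_t x[4], const uint32_t y[4],
///			  uint32_t c[5])
///	{
///	  uint64_t w[9] = { 0 }, p, t;
///	  int i, j;
///
///	  /* Accumulate the 32x32 -> 64-bit partial products at 32-bit
///	   * column spacing.  Each 64-bit cell can absorb many products
///	   * before overflowing, just as the 16-bit headroom does in the
///	   * expanded SIMD form. */
///	  for (i = 0; i < 4; i++)
///	    for (j = 0; j < 4; j++) {
///	      p = (uint64_t)u[i]*v[j];
///	      w[i + j] += p&0xffffffff; w[i + j + 1] += p >> 32;
///	      p = (uint64_t)x[i]*y[j];
///	      w[i + j] += p&0xffffffff; w[i + j + 1] += p >> 32;
///	    }
///
///	  /* Fold in the accumulator and incoming carry, and resolve the
///	   * low four columns into output words. */
///	  t = 0;
///	  for (i = 0; i < 4; i++) {
///	    t += w[i] + a[i] + c[i];
///	    a[i] = (uint32_t)t; t >>= 32;
///	  }
///
///	  /* The remaining columns become the outgoing carry. */
///	  t += c[4];
///	  for (i = 0; i < 5; i++) {
///	    t += w[i + 4];
///	    c[i] = (uint32_t)t; t >>= 32;
///	  }
///	}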
///--------------------------------------------------------------------------
/// Macro definitions.

.macro	mulacc	z, u, v, x=nil, y=nil
	// Set Z = Z + U V + X Y, using the low halves of V and Y.  Y may
	// be `nil' to omit the second operand.  Z, V, and Y should be
	// 128-bit `vN' registers; and U and X should be 32-bit `vN.s[I]'
	// scalars; the multiplications produce two 64-bit elementwise
	// products, which are added elementwise to Z.
	umlal	\z\().2d, \v\().2s, \u
  .ifnes "\y", "nil"
	umlal	\z\().2d, \y\().2s, \x
  .endif
.endm

.macro	mulacc2	z, u, v, x=nil, y=nil
	// Set Z = Z + U V + X Y, using the high halves of V and Y; see
	// `mulacc'.
	umlal2	\z\().2d, \v\().4s, \u
  .ifnes "\y", "nil"
	umlal2	\z\().2d, \y\().4s, \x
  .endif
.endm

.macro	mulinit	z, zinitp, u, v=nil, x=nil, y=nil
	// If ZINITP then set Z = Z + U V + X Y, as for `mulacc';
	// otherwise, set Z = U V + X Y.  Operand requirements and
	// detailed operation are as for `mulacc'.
  .ifeqs "\zinitp", "t"
	mulacc	\z, \u, \v, \x, \y
  .else
	umull	\z\().2d, \v\().2s, \u
    .ifnes "\y", "nil"
	umlal	\z\().2d, \y\().2s, \x
    .endif
  .endif
.endm

.macro	mulini2	z, zinitp, u, v=nil, x=nil, y=nil
	// As `mulinit', but with the high halves of V and Y.
  .ifeqs "\zinitp", "t"
	mulacc2	\z, \u, \v, \x, \y
  .else
	umull2	\z\().2d, \v\().4s, \u
    .ifnes "\y", "nil"
	umlal2	\z\().2d, \y\().4s, \x
    .endif
  .endif
.endm

// `mulI': accumulate the B^I and b B^I terms of the polynomial product sum
// U V + X Y, given that U = u_0 + B u_1 + B^2 u_2 + B^3 u_3 (and similarly
// for X), and V = v'_0 + b v''_0 + B (v'_1 + b v''_1) + B^2 (v'_2 +
// b v''_2) + B^3 (v'_3 + b v''_3) (and similarly for Y).  The 64-bit
// coefficients are added into the low and high halves of the 128-bit
// register Z (if ZINIT is `nil' then simply set Z, as if it were initially
// zero).

.macro	mul0	z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
	mulinit	\z, \zinitp, \u\().s[0], \v0, \x\().s[0], \y0
.endm

.macro	mul1	z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
	mulini2	\z, \zinitp, \u\().s[0], \v0, \x\().s[0], \y0
	mulacc	\z, \u\().s[1], \v0, \x\().s[1], \y0
.endm

.macro	mul2	z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
	mulinit	\z, \zinitp, \u\().s[0], \v1, \x\().s[0], \y1
	mulacc2	\z, \u\().s[1], \v0, \x\().s[1], \y0
	mulacc	\z, \u\().s[2], \v0, \x\().s[2], \y0
.endm

.macro	mul3	z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
	mulini2	\z, \zinitp, \u\().s[0], \v1, \x\().s[0], \y1
	mulacc	\z, \u\().s[1], \v1, \x\().s[1], \y1
	mulacc2	\z, \u\().s[2], \v0, \x\().s[2], \y0
	mulacc	\z, \u\().s[3], \v0, \x\().s[3], \y0
.endm

.macro	mul4	z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
	mulini2	\z, \zinitp, \u\().s[1], \v1, \x\().s[1], \y1
	mulacc	\z, \u\().s[2], \v1, \x\().s[2], \y1
	mulacc2	\z, \u\().s[3], \v0, \x\().s[3], \y0
.endm

.macro	mul5	z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
	mulini2	\z, \zinitp, \u\().s[2], \v1, \x\().s[2], \y1
	mulacc	\z, \u\().s[3], \v1, \x\().s[3], \y1
.endm

.macro	mul6	z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil
	mulini2	\z, \zinitp, \u\().s[3], \v1, \x\().s[3], \y1
.endm

// Steps in the process of propagating carry bits upwards from ZLO (a
// 128-bit `vN' register).  Here, T0, T1, and CG are 64-bit `xN' general-
// purpose registers clobbered in the process.  Set the low 32 bits of the
// 64-bit `xN' general-purpose register ZOUT to the completed coefficient
// z_i, leaving a carry in CG.
//
// In detail, what happens is as follows.  Suppose initially that ZLO =
// (z''_i; z'_i) and ZHI = (z''_{i+1}; z'_{i+1}).  Let t = z'_i + b z''_i;
// observe that floor(t/b) = floor(z'_i/b) + z''_i.  Let z_i = t mod B, and
// add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}.  This
// has a circuit depth of 3; I don't know how to do better.
//
// Output words are left in the low half of a 64-bit register, with rubbish
// in the high half.  Two such results can be combined using the `bfi'
// instruction.

.macro	carry0	zlo, cg=x15, t0=x16, t1=x17
	// Capture the values of carry-register ZLO and CG (if not `nil')
	// in general-purpose registers T0 and T1, suitable for use in
	// `carry1'.
	mov	\t0, \zlo\().d[0]
	mov	\t1, \zlo\().d[1]
  .ifnes "\cg", "nil"
	add	\t0, \t0, \cg, lsr #16
  .endif
.endm

.macro	carry1	zout, cg=x15, t0=x16, t1=x17
	// Collect a 32-bit output word in the low 32 bits of ZOUT (leaving
	// rubbish in the high 32 bits), and update CG suitably to continue
	// processing with the next carry register.
  .ifnes "\zout", "nil"
	add	\zout, \t0, \t1, lsl #16
  .endif
  .ifnes "\cg", "nil"
	add	\cg, \t1, \t0, lsr #16
  .endif
.endm
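
/// For reference, one `carry0'/`carry1' pair amounts to the following C
/// step (a model of ours, not part of the library; `carrystep' is a
/// hypothetical name):
///
///	#include <stdint.h>
///
///	/* Given the halves zlo = z'_i and zhi = z''_i of a carry
///	 * register, and the running carry *cg (with its value in bits
///	 * 16--63), return the completed 32-bit coefficient z_i and
///	 * update *cg ready for the next carry register. */
///	static uint32_t carrystep(uint64_t zlo, uint64_t zhi, uint64_t *cg)
///	{
///	  uint64_t t0 = zlo + (*cg >> 16);	/* z'_i plus incoming carry */
///	  uint64_t t1 = zhi;			/* z''_i */
///	  uint32_t z = (uint32_t)(t0 + (t1 << 16)); /* z_i = t mod B */
///	  *cg = t1 + (t0 >> 16);		/* floor(t/b) */
///	  return z;
///	}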

.macro	expand	vlo, vhi, vz=v31
	// Expand the packed 128-bit operand in VLO to an expanded operand
	// in VLO and VHI, assuming that VZ is all-bits-zero.  All three
	// are `vN' 128-bit SIMD registers.
	zip2	\vhi\().8h, \vlo\().8h, \vz\().8h
	zip1	\vlo\().8h, \vlo\().8h, \vz\().8h
.endm

.macro	sprdacc	a0, a1, a2, a3=nil
	// Spread the packed 128-bit operand in A0 into carry-format values
	// in A0, A1, A2, A3.  If A3 is `nil', then spread the same value
	// into A0, A1, A2 only, clobbering x16.
  .ifeqs "\a3", "nil"
	mov	w16, \a0\().s[3]
  .endif
	trn2	\a2\().2d, \a0\().2d, v31.2d
	trn2	\a1\().2s, \a0\().2s, v31.2s
  .ifeqs "\a3", "nil"
	lsl	x16, x16, #16
  .endif
	trn1	\a0\().2s, \a0\().2s, v31.2s
  .ifeqs "\a3", "nil"
	mov	\a2\().d[1], x16
  .else
	trn2	\a3\().2s, \a2\().2s, v31.2s
  .endif
	mov	\a2\().s[1], wzr
.endm

.macro	crryacc	a0, a1, a2, a3, c0, c1, c2
	// Add the carry-format values A0, A1, A2 into the existing
	// carries C0, C1, C2 (leaving A3 where it is).
	add	\c0\().2d, \c0\().2d, \a0\().2d
	add	\c1\().2d, \c1\().2d, \a1\().2d
	add	\c2\().2d, \c2\().2d, \a2\().2d
.endm

///--------------------------------------------------------------------------
/// Primitive multipliers and related utilities.

INTFUNC(carryprop)
	// On entry, x0 points to a destination, and v28--v30 and x15 hold
	// incoming carries c0--c2 and cg.  On exit, the low 128 bits of
	// the carry value are stored at [x0]; the remaining 16 bits of
	// carry are left in x10; x0 is advanced by 16; and x11--x17 are
	// clobbered.
  endprologue

	carry0	v28
	carry1	x11
	carry0	v29
	carry1	x13
	carry0	v30
	carry1	x12
	bfi	x11, x13, #32, #32
	lsr	x14, x15, #16
	lsr	x10, x15, #48
	bfi	x12, x14, #32, #32
	stp	x11, x12, [x0], #16
	ret
ENDFUNC
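
/// In the same C model as before, `carryprop' resolves the three carry
/// registers and cg into four output words and a 16-bit residue (again,
/// `carryprop_model' and the array representation of the carries are ours;
/// it reuses `carrystep' from the sketch next to `carry0'):
///
///	/* c[i][0] and c[i][1] model the low and high halves of carry
///	 * register i; cg is the general-purpose carry.  Store the low
///	 * 128 bits of the carry value in a, returning the remaining
///	 * 16 bits. */
///	static uint32_t carryprop_model(uint32_t a[4], uint64_t c[3][2],
///					uint64_t cg)
///	{
///	  int i;
///
///	  for (i = 0; i < 3; i++)
///	    a[i] = carrystep(c[i][0], c[i][1], &cg);
///	  a[3] = (uint32_t)(cg >> 16);
///	  return (uint32_t)(cg >> 48);
///	}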

INTFUNC(dmul4)
	// On entry, x0 points to the destination; x1 and x2 point to
	// packed operands U and X; v2/v3 and v4/v5 hold expanded operands
	// V and Y; v28--v30 and x15 hold incoming carries c0--c2 and cg;
	// and v31 is zero.  On exit, the destination and carries are
	// updated; x0, x1, x2 are each advanced by 16; v2--v5 and v8--v15
	// are preserved; and x11--x14, x16, x17 and the other SIMD
	// registers are clobbered.
  endprologue

	// Start by loading the operand words from memory.
	ldr	q0, [x1], #16
	ldr	q1, [x2], #16

	// Do the multiplication.
	mul0	v28, t,   v0, v2, v3, v1, v4, v5
	mul1	v29, t,   v0, v2, v3, v1, v4, v5
	carry0	v28
	mul2	v30, t,   v0, v2, v3, v1, v4, v5
	carry1	x11
	carry0	v29
	mul3	v27, nil, v0, v2, v3, v1, v4, v5
	carry1	x13
	carry0	v30
	mul4	v28, nil, v0, v2, v3, v1, v4, v5
	carry1	x12
	carry0	v27
	mul5	v29, nil, v0, v2, v3, v1, v4, v5
	carry1	x14
	mul6	v30, nil, v0, v2, v3, v1, v4, v5

	// Finish up and store the result.
	bfi	x11, x13, #32, #32
	bfi	x12, x14, #32, #32
	stp	x11, x12, [x0], #16

	// All done.
	ret
ENDFUNC

INTFUNC(dmla4)
	// On entry, x0 points to the destination/accumulator A; x1 and x2
	// point to packed operands U and X; v2/v3 and v4/v5 hold expanded
	// operands V and Y; v28--v30 and x15 hold incoming carries c0--c2
	// and cg; and v31 is zero.  On exit, the accumulator and carries
	// are updated; x0, x1, x2 are each advanced by 16; v2--v5 and
	// v8--v15 are preserved; and x11--x14, x16, x17 and the other SIMD
	// registers are clobbered.
  endprologue

	// Start by loading the operand words from memory.
	ldr	q24, [x0]
	ldr	q0, [x1], #16
	ldr	q1, [x2], #16
	sprdacc	v24, v25, v26, v27
	crryacc	v24, v25, v26, v27, v28, v29, v30

	// Do the multiplication.
	mul0	v28, t,   v0, v2, v3, v1, v4, v5
	mul1	v29, t,   v0, v2, v3, v1, v4, v5
	carry0	v28
	mul2	v30, t,   v0, v2, v3, v1, v4, v5
	carry1	x11
	carry0	v29
	mul3	v27, t,   v0, v2, v3, v1, v4, v5
	carry1	x13
	carry0	v30
	mul4	v28, nil, v0, v2, v3, v1, v4, v5
	carry1	x12
	carry0	v27
	mul5	v29, nil, v0, v2, v3, v1, v4, v5
	carry1	x14
	mul6	v30, nil, v0, v2, v3, v1, v4, v5

	// Finish up and store the result.
	bfi	x11, x13, #32, #32
	bfi	x12, x14, #32, #32
	stp	x11, x12, [x0], #16

	// All done.
	ret
ENDFUNC

INTFUNC(mul4)
	// On entry, x0 points to the destination; x2 points to a packed
	// operand X; v4/v5 holds an expanded operand Y; v28--v30 and x15
	// hold incoming carries c0--c2 and cg; and v31 is zero.  On exit,
	// the destination and carries are updated; x0 and x2 are each
	// advanced by 16; v4, v5, and v8--v15 are preserved; and x11--x14,
	// x16, x17 and the other SIMD registers are clobbered.
  endprologue

	// Start by loading the operand words from memory.
	ldr	q1, [x2], #16

	// Do the multiplication.
	mul0	v28, t,   v1, v4, v5
	mul1	v29, t,   v1, v4, v5
	carry0	v28
	mul2	v30, t,   v1, v4, v5
	carry1	x11
	carry0	v29
	mul3	v27, nil, v1, v4, v5
	carry1	x13
	carry0	v30
	mul4	v28, nil, v1, v4, v5
	carry1	x12
	carry0	v27
	mul5	v29, nil, v1, v4, v5
	carry1	x14
	mul6	v30, nil, v1, v4, v5

	// Finish up and store the result.
	bfi	x11, x13, #32, #32
	bfi	x12, x14, #32, #32
	stp	x11, x12, [x0], #16

	// All done.
	ret
ENDFUNC

INTFUNC(mul4zc)
	// On entry, x0 points to the destination; x2 points to a packed
	// operand X; v4/v5 holds an expanded operand Y; and v31 is zero.
	// On exit, the destination is updated; v28--v30 and x15 hold
	// outgoing carries c0--c2 and cg; x0 and x2 are each advanced by
	// 16; v4, v5, and v8--v15 are preserved; and x11--x14, x16, x17
	// and the other SIMD registers are clobbered.
  endprologue

	// Start by loading the operand words from memory.
	ldr	q1, [x2], #16

	// Do the multiplication.
	mul0	v28, nil, v1, v4, v5
	mul1	v29, nil, v1, v4, v5
	carry0	v28, nil
	mul2	v30, nil, v1, v4, v5
	carry1	x11
	carry0	v29
	mul3	v27, nil, v1, v4, v5
	carry1	x13
	carry0	v30
	mul4	v28, nil, v1, v4, v5
	carry1	x12
	carry0	v27
	mul5	v29, nil, v1, v4, v5
	carry1	x14
	mul6	v30, nil, v1, v4, v5

	// Finish up and store the result.
	bfi	x11, x13, #32, #32
	bfi	x12, x14, #32, #32
	stp	x11, x12, [x0], #16

	// All done.
	ret
ENDFUNC

INTFUNC(mla4)
	// On entry, x0 points to the destination/accumulator A; x2 points
	// to a packed operand X; v4/v5 holds an expanded operand Y;
	// v28--v30 and x15 hold incoming carries c0--c2 and cg; and v31 is
	// zero.  On exit, the accumulator and carries are updated; x0 and
	// x2 are each advanced by 16; v4, v5, and v8--v15 are preserved;
	// and x11--x14, x16, x17 and the other SIMD registers are
	// clobbered.
  endprologue

	// Start by loading the operand words from memory.
	ldr	q24, [x0]
	ldr	q1, [x2], #16
	sprdacc	v24, v25, v26, v27
	crryacc	v24, v25, v26, v27, v28, v29, v30

	// Do the multiplication.
	mul0	v28, t,   v1, v4, v5
	mul1	v29, t,   v1, v4, v5
	carry0	v28
	mul2	v30, t,   v1, v4, v5
	carry1	x11
	carry0	v29
	mul3	v27, t,   v1, v4, v5
	carry1	x13
	carry0	v30
	mul4	v28, nil, v1, v4, v5
	carry1	x12
	carry0	v27
	mul5	v29, nil, v1, v4, v5
	carry1	x14
	mul6	v30, nil, v1, v4, v5

	// Finish up and store the result.
	bfi	x11, x13, #32, #32
	bfi	x12, x14, #32, #32
	stp	x11, x12, [x0], #16

	// All done.
	ret
ENDFUNC
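
/// The accumulator-spreading step at the top of `mla4' (and friends) has
/// the following C equivalent (our model; names hypothetical).  In carry
/// format each 32-bit accumulator word simply occupies the low half of its
/// own carry register, whence `crryacc' can add it in with plain 64-bit
/// vector additions:
///
///	/* `sprdacc' with four outputs: word i of the packed accumulator
///	 * a goes into the low half of carry-format register i. */
///	static void sprdacc_model(const uint32_t a[4], uint64_t c[4][2])
///	{
///	  int i;
///
///	  for (i = 0; i < 4; i++) { c[i][0] = a[i]; c[i][1] = 0; }
///	}
///
///	/* `sprdacc' with three outputs, as used before `carryprop':
///	 * word 3 is squished into the high half of c2, where it counts
///	 * with weight 2^16 b = 2^32, i.e., c2 = a[2] + a[3] B. */
///	static void sprdacc3_model(const uint32_t a[4], uint64_t c[3][2])
///	{
///	  c[0][0] = a[0]; c[0][1] = 0;
///	  c[1][0] = a[1]; c[1][1] = 0;
///	  c[2][0] = a[2]; c[2][1] = (uint64_t)a[3] << 16;
///	}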

INTFUNC(mla4zc)
	// On entry, x0 points to the destination/accumulator A; x2 points
	// to a packed operand X; v4/v5 holds an expanded operand Y; and
	// v31 is zero.  On exit, the accumulator is updated; v28--v30 and
	// x15 hold outgoing carries c0--c2 and cg; x0 and x2 are each
	// advanced by 16; v4, v5, and v8--v15 are preserved; and x11--x14,
	// x16, x17 and the other SIMD registers are clobbered.
  endprologue

	// Start by loading the operand words from memory.
	ldr	q28, [x0]
	ldr	q1, [x2], #16
	sprdacc	v28, v29, v30, v27

	// Do the multiplication.
	mul0	v28, t,   v1, v4, v5
	mul1	v29, t,   v1, v4, v5
	carry0	v28, nil
	mul2	v30, t,   v1, v4, v5
	carry1	x11
	carry0	v29
	mul3	v27, t,   v1, v4, v5
	carry1	x13
	carry0	v30
	mul4	v28, nil, v1, v4, v5
	carry1	x12
	carry0	v27
	mul5	v29, nil, v1, v4, v5
	carry1	x14
	mul6	v30, nil, v1, v4, v5

	// Finish up and store the result.
	bfi	x11, x13, #32, #32
	bfi	x12, x14, #32, #32
	stp	x11, x12, [x0], #16

	// All done.
	ret
ENDFUNC

INTFUNC(mmul4)
	// On entry, x0 points to the destination; x1 points to a packed
	// operand U; x2 points to a packed operand X (the modulus); v2/v3
	// holds an expanded operand V; and v6/v7 holds an expanded operand
	// M (the Montgomery factor -N^{-1} (mod B^4)).  On exit, the
	// destination is updated (to zero); v4/v5 hold an expanded factor
	// Y = U V M (mod B^4); v28--v30 and x15 hold outgoing carries
	// c0--c2 and cg; x0, x1, and x2 are each advanced by 16; v2, v3,
	// and v8--v15 are preserved; and x11--x14, x16, x17 and the other
	// SIMD registers are clobbered.
  endprologue

	// Start by loading the operand words from memory.
	ldr	q0, [x1], #16
	ldr	q1, [x2], #16

	// Calculate the low half of W = A + U V, being careful to leave
	// the carries in place.
	mul0	v28, nil, v0, v2, v3
	mul1	v29, nil, v0, v2, v3
	carry0	v28, nil
	mul2	v30, nil, v0, v2, v3
	carry1	x11
	carry0	v29
	mul3	v27, nil, v0, v2, v3
	b	mmla4_common
ENDFUNC

INTFUNC(mmla4)
	// On entry, x0 points to the destination/accumulator A; x1 points
	// to a packed operand U; x2 points to a packed operand X (the
	// modulus); v2/v3 holds an expanded operand V; and v6/v7 holds an
	// expanded operand M (the Montgomery factor -N^{-1} (mod B^4)).
	// On exit, the accumulator is updated (to zero); v4/v5 hold an
	// expanded factor Y = (A + U V) M (mod B^4); v28--v30 and x15 hold
	// outgoing carries c0--c2 and cg; x0, x1, and x2 are each advanced
	// by 16; v2, v3, v6, v7, and v8--v15 are preserved; and x11--x14,
	// x16, x17 and the other SIMD registers are clobbered.
  endprologue

	// Start by loading the operand words from memory.
	ldr	q28, [x0]
	ldr	q0, [x1], #16
	ldr	q1, [x2], #16
	sprdacc	v28, v29, v30, v27

	// Calculate the low half of W = A + U V, being careful to leave
	// the carries in place.
	mul0	v28, t,   v0, v2, v3
	mul1	v29, t,   v0, v2, v3
	carry0	v28, nil
	mul2	v30, t,   v0, v2, v3
	carry1	x11
	carry0	v29
	mul3	v27, t,   v0, v2, v3
mmla4_common:
	carry1	x13
	carry0	v30
	carry1	x12
	carry0	v27
	carry1	x14, nil

	// Piece the result together and ship it back.
	bfi	x11, x13, #32, #32
	bfi	x12, x14, #32, #32
	mov	v16.d[0], x11
	mov	v16.d[1], x12

	// Calculate the low half of the Montgomery factor Y = W M.
	mul0	v18, nil, v16, v6, v7
	mul1	v19, nil, v16, v6, v7
	carry0	v18, nil
	mul2	v20, nil, v16, v6, v7
	carry1	x11
	carry0	v19
	mul3	v21, nil, v16, v6, v7
	carry1	x13
	carry0	v20
	carry1	x12
	carry0	v21
	carry1	x14, nil

	// Piece the result together, ship it back, and expand.
	bfi	x11, x13, #32, #32
	bfi	x12, x14, #32, #32
	mov	v4.d[0], x11
	mov	v4.d[1], x12
	expand	v4, v5

	// Build up the product X Y in the carry slots.
	mul0	v28, t,   v1, v4, v5
	mul1	v29, t,   v1, v4, v5
	carry0	v28, nil
	mul2	v30, t,   v1, v4, v5
	carry1	nil
	carry0	v29
	mul3	v27, t,   v1, v4, v5
	carry1	nil
	carry0	v30

	// And complete the calculation.
	mul4	v28, nil, v0, v2, v3, v1, v4, v5
	carry1	nil
	carry0	v27
	mul5	v29, nil, v0, v2, v3, v1, v4, v5
	carry1	nil
	mul6	v30, nil, v0, v2, v3, v1, v4, v5

	// Finish up and store the result.
	stp	xzr, xzr, [x0], #16

	// All done.
	ret
ENDFUNC
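
/// The Montgomery step at the heart of `mmul4'/`mmla4' can be modelled in
/// C as follows (our sketch; `montfactor' is a hypothetical name).  The
/// point is that w + N y == 0 (mod B^4) by construction of m, which is why
/// these functions can store four zero words and keep only the carries:
///
///	/* Given the low 128 bits w of A + U V, and the precomputed
///	 * factor m = -N^{-1} mod B^4, compute y = w m mod B^4; then
///	 * A + U V + N y vanishes mod B^4. */
///	static void montfactor(uint32_t y[4], const uint32_t w[4],
///			       const uint32_t m[4])
///	{
///	  uint64_t t;
///	  int i, j;
///
///	  for (i = 0; i < 4; i++) y[i] = 0;
///	  for (i = 0; i < 4; i++) {
///	    t = 0;
///	    for (j = 0; i + j < 4; j++) {
///	      t += y[i + j] + (uint64_t)w[i]*m[j];
///	      y[i + j] = (uint32_t)t; t >>= 32;
///	    }
///	  }
///	}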

INTFUNC(mont4)
	// On entry, x0 points to the destination/accumulator A; x2 points
	// to a packed operand X (the modulus); and v6/v7 holds an expanded
	// operand M (the Montgomery factor -N^{-1} (mod B^4)).  On exit,
	// the accumulator is updated (to zero); v4/v5 hold an expanded
	// factor Y = A M (mod B^4); v28--v30 and x15 hold outgoing carries
	// c0--c2 and cg; x0 and x2 are each advanced by 16; v6, v7, and
	// v8--v15 are preserved; and x11--x14, x16, x17 and the other SIMD
	// registers are clobbered.
  endprologue

	// Start by loading the operand words from memory.
	ldr	q28, [x0]
	ldr	q1, [x2], #16

	// Calculate Y = A M (mod B^4).
	mul0	v18, nil, v28, v6, v7
	mul1	v19, nil, v28, v6, v7
	carry0	v18, nil
	mul2	v20, nil, v28, v6, v7
	carry1	x11
	carry0	v19
	mul3	v21, nil, v28, v6, v7
	carry1	x13
	carry0	v20
	sprdacc	v28, v29, v30, v27
	carry1	x12
	carry0	v21
	carry1	x14, nil

	// Piece the result together, ship it back, and expand.
	bfi	x11, x13, #32, #32
	bfi	x12, x14, #32, #32
	mov	v4.d[0], x11
	mov	v4.d[1], x12
	expand	v4, v5

	// Calculate the actual result.  Well, the carries, at least.
	mul0	v28, t,   v1, v4, v5
	mul1	v29, t,   v1, v4, v5
	carry0	v28, nil
	mul2	v30, t,   v1, v4, v5
	carry1	nil
	carry0	v29
	mul3	v27, t,   v1, v4, v5
	carry1	nil
	carry0	v30

	// And complete the calculation.
	mul4	v28, nil, v1, v4, v5
	carry1	nil
	carry0	v27
	mul5	v29, nil, v1, v4, v5
	carry1	nil
	mul6	v30, nil, v1, v4, v5

	// Finish up and store the result.
	stp	xzr, xzr, [x0], #16

	// All done.
	ret
ENDFUNC

///--------------------------------------------------------------------------
/// Bulk multipliers.

FUNC(mpx_umul4_arm64_simd)
	// void mpx_umul4_arm64_simd(mpw *dv, const mpw *av, const mpw *avl,
	//			     const mpw *bv, const mpw *bvl);

	// Establish the arguments and do initial setup.
	//
	//	inner loop dv	x0
	//	inner loop av	x2
	//	outer loop dv	x5
	//	outer loop bv	x3
	//	av base		x1
	//	inner n		x6
	//	n base		x7
	//	outer n		x4
	pushreg	x29, x30
	setfp
  endprologue

	// Prepare for the first iteration.
	ldr	q4, [x3], #16		// = Y = bv[0]
	movi	v31.4s, #0
	sub	x7, x2, x1		// = inner loop count base
					// x0 // = dv for inner loop
					// x1 // = av base
					// x3 // = bv for outer loop
	sub	x4, x4, x3		// = outer loop count (decremented)
	sub	x6, x7, #16		// = inner loop count (decremented)
	mov	x2, x1			// = av for inner loop
	add	x5, x0, #16		// = dv for outer loop
	expand	v4, v5			// expand Y
	bl	mul4zc
	cbz	x6, 8f			// all done?

	// Continue with the first iteration.
0:	sub	x6, x6, #16
	bl	mul4
	cbnz	x6, 0b

	// Write out the leftover carry.  There can be no tail here.
8:	bl	carryprop
	cbz	x4, 9f

	// Set up for the next pass.
1:	ldr	q4, [x3], #16		// = Y = bv[i]
	mov	x0, x5			// -> dv[i]
	mov	x2, x1			// -> av[0]
	add	x5, x5, #16
	sub	x6, x7, #16		// = inner loop count (decremented)
	sub	x4, x4, #16		// update outer loop count
	expand	v4, v5			// expand Y
	bl	mla4zc
	cbz	x6, 8f

	// Continue...
0:	sub	x6, x6, #16
	bl	mla4
	cbnz	x6, 0b

	// Finish off this pass.  There was no tail on the previous pass,
	// and there can be none on this pass.
8:	bl	carryprop
	cbnz	x4, 1b

	// All over.
9:	popreg	x29, x30
	ret
ENDFUNC
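
/// In outline, `mpx_umul4_arm64_simd' is the usual operand-scanning
/// multiply, with the inner loop handled four words at a time by the
/// primitives above.  A self-contained C reference (ours; the real
/// function requires the operand lengths to be nonzero multiples of four
/// 32-bit words, with `mpw' standing in for uint32_t):
///
///	#include <stddef.h>
///	#include <stdint.h>
///
///	static void umul_ref(uint32_t *dv,
///			     const uint32_t *av, const uint32_t *avl,
///			     const uint32_t *bv, const uint32_t *bvl)
///	{
///	  size_t n = avl - av, m = bvl - bv, i, j;
///	  uint64_t t;
///
///	  for (i = 0; i < n + m; i++) dv[i] = 0;
///	  for (i = 0; i < m; i++) {
///	    t = 0;
///	    for (j = 0; j < n; j++) {
///	      t += dv[i + j] + (uint64_t)av[j]*bv[i];
///	      dv[i + j] = (uint32_t)t; t >>= 32;
///	    }
///	    dv[i + n] = (uint32_t)t;
///	  }
///	}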

FUNC(mpxmont_mul4_arm64_simd)
	// void mpxmont_mul4_arm64_simd(mpw *dv,
	//				const mpw *av, const mpw *bv,
	//				const mpw *nv, size_t n,
	//				const mpw *mi);

	// Establish the arguments and do initial setup.
	//
	//	inner loop dv	x0
	//	inner loop av	x1
	//	inner loop nv	x2
	//	nv base		x3
	//	base n		x4
	//	mi		(x5)
	//	outer loop dv	x5
	//	outer loop bv	x6
	//	av base		x7
	//	inner n		x8
	//	outer n		x9
	//	c		x10
	pushreg	x29, x30
	setfp
  endprologue

	// Set up the outer loop state and prepare for the first iteration.
	ldr	q2, [x2]		// = V = bv[0]
	ldr	q6, [x5]		// = M
	movi	v31.4s, #0
					// x0 // -> dv for inner loop
					// x1 // -> av for inner loop
					// x3 // -> nv base
					// x4 // = n base
	add	x5, x0, #16		// -> dv
	add	x6, x2, #16		// -> bv
	mov	x2, x3			// -> nv[0]
	mov	x7, x1			// -> av base
	sub	x8, x4, #4		// = inner n (decremented)
	sub	x9, x4, #4		// = outer n (decremented)
	expand	v2, v3			// expand V
	expand	v6, v7			// expand M
	bl	mmul4
	cbz	x8, 8f			// done already?

	// Complete the first inner loop.
0:	sub	x8, x8, #4
	bl	dmul4
	cbnz	x8, 0b			// done yet?

	// Still have carries left to propagate.  Rather than store the
	// tail end in memory, keep it in x10 for later.
	bl	carryprop

	// Embark on the next iteration.  (There must be one.  If n = 1
	// then we would have bailed above, to label 8.  Similarly, the
	// subsequent iterations can fall into the inner loop immediately.)
1:	ldr	q2, [x6], #16		// = V = bv[i]
	mov	x0, x5			// -> dv[i]
	mov	x1, x7			// -> av[0]
	mov	x2, x3			// -> nv[0]
	add	x5, x5, #16
	sub	x8, x4, #4
	sub	x9, x9, #4
	expand	v2, v3
	bl	mmla4

	// Complete the next inner loop.
0:	sub	x8, x8, #4
	bl	dmla4
	cbnz	x8, 0b

	// Still have carries left to propagate, and they overlap the
	// previous iteration's final tail, so read that and add it.
	add	x15, x15, x10, lsl #16
	bl	carryprop

	// Back again, maybe.
	cbnz	x9, 1b

	// All done, almost.
	str	w10, [x0], #4
	popreg	x29, x30
	ret

	// First iteration was short.  Write out the carries and we're
	// done.  (This could be folded into the main loop structure, but
	// that would penalize small numbers more.)
8:	bl	carryprop
	str	w10, [x0], #4
	popreg	x29, x30
	ret
ENDFUNC
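
/// For reference, a word-at-a-time C model of the whole function (ours,
/// simplified: it uses only the single-word factor mi0 = -N^{-1} mod 2^32
/// rather than the four-word M, makes no attempt to mimic the blocked FIOS
/// structure, and needs n + 2 words of space at dv; any final conditional
/// subtraction is left to the caller, as no subtraction appears in the
/// assembler version either):
///
///	#include <stddef.h>
///	#include <stdint.h>
///
///	static void montmul_ref(uint32_t *dv, const uint32_t *av,
///				const uint32_t *bv, const uint32_t *nv,
///				size_t n, uint32_t mi0)
///	{
///	  size_t i, j;
///	  uint64_t c;
///	  uint32_t y;
///
///	  for (i = 0; i < n + 2; i++) dv[i] = 0;
///	  for (i = 0; i < n; i++) {
///	    /* dv += av bv[i] */
///	    c = 0;
///	    for (j = 0; j < n; j++) {
///	      c += dv[j] + (uint64_t)av[j]*bv[i];
///	      dv[j] = (uint32_t)c; c >>= 32;
///	    }
///	    c += dv[n]; dv[n] = (uint32_t)c;
///	    dv[n + 1] += (uint32_t)(c >> 32);
///
///	    /* pick y so that dv + y nv is divisible by 2^32 */
///	    y = dv[0]*mi0;
///	    c = 0;
///	    for (j = 0; j < n; j++) {
///	      c += dv[j] + (uint64_t)nv[j]*y;
///	      dv[j] = (uint32_t)c; c >>= 32;
///	    }
///	    c += dv[n]; dv[n] = (uint32_t)c;
///	    dv[n + 1] += (uint32_t)(c >> 32);
///
///	    /* dv[0] is now zero: discard it and shift down a word */
///	    for (j = 0; j <= n; j++) dv[j] = dv[j + 1];
///	    dv[n + 1] = 0;
///	  }
///	}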

FUNC(mpxmont_redc4_arm64_simd)
	// void mpxmont_redc4_arm64_simd(mpw *dv, mpw *dvl, const mpw *nv,
	//				 size_t n, const mpw *mi);

	// Establish the arguments and do initial setup.
	//
	//	inner loop dv		x0
	//	inner loop nv		x2
	//	blocks-of-4 count	x6
	//	tail count		x7
	//	mi			(x4)
	//	outer loop dv		x4
	//	outer loop count	x8
	//	nv base			x5
	//	inner loop count	x1
	//	n			x3
	//	c			x10
	//	t0, t1			x11, x12
	pushreg	x29, x30
	setfp
  endprologue

	// Set up the outer loop state and prepare for the first iteration.
	ldr	q6, [x4]		// = M
	movi	v31.4s, #0
					// x0 // -> dv for inner loop
	sub	x6, x1, x0		// total dv bytes
	sub	x1, x3, #4		// inner loop counter
					// x2 // -> nv for inner loop
					// x3 // = n
	add	x4, x0, #16		// -> dv for outer loop
	mov	x5, x2			// -> nv base
	sub	x6, x6, x3, lsl #2	// dv carry range bytes
	sub	x8, x3, #4		// outer loop counter
	sub	x6, x6, #16		// dv steam-powered carry bytes
	expand	v6, v7			// expand M
	and	x7, x6, #15		// dv tail length in bytes
	bic	x6, x6, #15		// dv blocks-of-four length in bytes
	bl	mont4
	cbz	x1, 8f			// done already?

5:	sub	x1, x1, #4
	bl	mla4
	cbnz	x1, 5b			// done yet?

	// Still have carries left to propagate.  Adding the accumulator
	// block into the carries is a little different this time, because
	// all four accumulator limbs have to be squished into the three
	// carry registers for `carryprop' to do its thing.
8:	ldr	q24, [x0]
	sprdacc	v24, v25, v26
	add	v28.2d, v28.2d, v24.2d
	add	v29.2d, v29.2d, v25.2d
	add	v30.2d, v30.2d, v26.2d
	bl	carryprop
	cbz	x6, 7f

	// Propagate the first group of carries.
	ldp	x16, x17, [x0]
	sub	x1, x6, #16
	adds	x16, x16, x10
	adcs	x17, x17, xzr
	stp	x16, x17, [x0], #16
	cbz	x1, 6f

	// Continue carry propagation until the end of the buffer.
0:	ldp	x16, x17, [x0]
	sub	x1, x1, #16
	adcs	x16, x16, xzr
	adcs	x17, x17, xzr
	stp	x16, x17, [x0], #16
	cbnz	x1, 0b

	// Deal with the tail end.  Note that the actual destination
	// length won't be an exact number of blocks of four, so it's safe
	// to just drop through here.
6:	adc	w10, wzr, wzr
7:	ldr	w16, [x0]
	sub	x1, x7, #4
	adds	w16, w16, w10
	str	w16, [x0], #4
	cbz	x1, 8f
0:	ldr	w16, [x0]
	sub	x1, x1, #4
	adcs	w16, w16, wzr
	str	w16, [x0], #4
	cbnz	x1, 0b

	// All done for this iteration.  Start the next.
8:	cbz	x8, 9f
	mov	x0, x4
	add	x4, x4, #16
	sub	x1, x3, #4
	mov	x2, x5
	sub	x8, x8, #4
	sub	x6, x6, #16
	bl	mont4
	b	5b

	// All over.
9:	popreg	x29, x30
	ret
ENDFUNC
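
/// A plain C model of the reduction (ours, again using only the
/// single-word factor mi0 = -N^{-1} mod 2^32): dv[0..dvl) holds the value
/// to reduce, and the quotient by R = 2^{32n} modulo N is left at the
/// start of the buffer, not finally reduced:
///
///	#include <stddef.h>
///	#include <stdint.h>
///
///	static void montredc_ref(uint32_t *dv, uint32_t *dvl,
///				 const uint32_t *nv, size_t n,
///				 uint32_t mi0)
///	{
///	  size_t dvn = dvl - dv, i, j;
///	  uint64_t c;
///	  uint32_t y;
///
///	  for (i = 0; i < n; i++) {
///	    /* choose y so that adding y nv B^i clears dv[i] */
///	    y = dv[i]*mi0;
///	    c = 0;
///	    for (j = 0; j < n; j++) {
///	      c += dv[i + j] + (uint64_t)nv[j]*y;
///	      dv[i + j] = (uint32_t)c; c >>= 32;
///	    }
///	    /* propagate the carry up to the end of the buffer */
///	    for (j = i + n; c && j < dvn; j++) {
///	      c += dv[j]; dv[j] = (uint32_t)c; c >>= 32;
///	    }
///	  }
///	  /* the result now lives in dv[n], ..., dv[dvn - 1]: shift it
///	   * down to the start of the buffer */
///	  for (i = 0; i + n < dvn; i++) dv[i] = dv[i + n];
///	}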

///--------------------------------------------------------------------------
/// Testing and performance measurement.

#ifdef TEST_MUL4

//			  dmul	smul	mmul	mont
// z	x0		  x0	x0	x0	x0
// c	x3		  x1	x1	x1	x1
// y	x4		  --	--	x2	x2
// u	x1		  x2	--	x3	--
// x	x2		  x3	x2	x4	x3
// vv	v2/v3		  x4	--	x5	--
// yy	v4/v5		  x5	x3	x6	--
// mm	v6/v7		  --	--	--	x4
// n	x5		  x6	x4	x7	x5
// cyv	x6		  x7	x5	stk0	x6

#define STKARG(i) sp, #16 + i

.macro	testprologue mode
	pushreg	x29, x30
	setfp
  endprologue
	movi	v31.4s, #0

  .ifeqs "\mode", "dmul"
	ldr	q2, [x4]
	zip2	v3.8h, v2.8h, v31.8h	// (v''_3, v'_3; v''_2, v'_2)
	zip1	v2.8h, v2.8h, v31.8h	// (v''_1, v'_1; v''_0, v'_0)

	ldr	q4, [x5]
	zip2	v5.8h, v4.8h, v31.8h	// (y''_3, y'_3; y''_2, y'_2)
	zip1	v4.8h, v4.8h, v31.8h	// (y''_1, y'_1; y''_0, y'_0)

	mov	x16, x1
	mov	x1, x2			// -> u
	mov	x2, x3			// -> x
	mov	x3, x16			// -> c
	mov	x5, x6			// = n
	mov	x6, x7			// -> cyv
  .endif

  .ifeqs "\mode", "smul"
	ldr	q4, [x3]
	zip2	v5.8h, v4.8h, v31.8h	// (y''_3, y'_3; y''_2, y'_2)
	zip1	v4.8h, v4.8h, v31.8h	// (y''_1, y'_1; y''_0, y'_0)

					// x2 // -> x
	mov	x3, x1			// -> c
	mov	x6, x5			// -> cyv
	mov	x5, x4			// = n
  .endif

  .ifeqs "\mode", "mmul"
	ldr	q2, [x5]
	zip2	v3.8h, v2.8h, v31.8h	// (v''_3, v'_3; v''_2, v'_2)
	zip1	v2.8h, v2.8h, v31.8h	// (v''_1, v'_1; v''_0, v'_0)

	ldr	q6, [x6]
	zip2	v7.8h, v6.8h, v31.8h	// (y''_3, y'_3; y''_2, y'_2)
	zip1	v6.8h, v6.8h, v31.8h	// (y''_1, y'_1; y''_0, y'_0)

	mov	x16, x1
	mov	x1, x3			// -> u
	mov	x3, x16			// -> c
	mov	x16, x2
	mov	x2, x4			// -> x
	mov	x4, x16			// -> y
	mov	x5, x7			// = n
	ldr	x6, [STKARG(0)]		// -> cyv
  .endif

  .ifeqs "\mode", "mont"
	ldr	q6, [x4]
	zip2	v7.8h, v6.8h, v31.8h	// (m''_3, m'_3; m''_2, m'_2)
	zip1	v6.8h, v6.8h, v31.8h	// (m''_1, m'_1; m''_0, m'_0)

	mov	x4, x2			// -> y
	mov	x2, x3			// -> x
	mov	x3, x1			// -> c
					// x5 // = n
					// x6 // -> cyv
  .endif
.endm

.macro	testldcarry
	ld1	{v28.2d-v30.2d}, [x3]
	mov	x15, #0
.endm

.macro	testtop
0:	sub	x5, x5, #1
.endm

.macro	testtail
	cbnz	x5, 0b
.endm

.macro	testcarryout
	// More complicated than usual because we must mix the general-
	// purpose carry back in.
	lsr	x15, x15, #16
	mov	v0.d[0], x15
	mov	v0.d[1], xzr
	add	v28.2d, v28.2d, v0.2d
	st1	{v28.2d-v30.2d}, [x3]
.endm

.macro	testepilogue
	popreg	x29, x30
	ret
.endm

FUNC(test_dmul4)
	testprologue dmul
	testldcarry
	testtop
	bl	dmul4
	testtail
	testcarryout
	testepilogue
ENDFUNC

FUNC(test_dmla4)
	testprologue dmul
	testldcarry
	testtop
	bl	dmla4
	testtail
	testcarryout
	testepilogue
ENDFUNC

FUNC(test_mul4)
	testprologue smul
	testldcarry
	testtop
	bl	mul4
	testtail
	testcarryout
	testepilogue
ENDFUNC

FUNC(test_mul4zc)
	testprologue smul
	testldcarry
	testtop
	bl	mul4zc
	testtail
	testcarryout
	testepilogue
ENDFUNC

FUNC(test_mla4)
	testprologue smul
	testldcarry
	testtop
	bl	mla4
	testtail
	testcarryout
	testepilogue
ENDFUNC

FUNC(test_mla4zc)
	testprologue smul
	testldcarry
	testtop
	bl	mla4zc
	testtail
	testcarryout
	testepilogue
ENDFUNC

FUNC(test_mmul4)
	testprologue mmul
	testtop
	bl	mmul4
	testtail
	stp	q4, q5, [x4]
	testcarryout
	testepilogue
ENDFUNC

FUNC(test_mmla4)
	testprologue mmul
	testtop
	bl	mmla4
	testtail
	stp	q4, q5, [x4]
	testcarryout
	testepilogue
ENDFUNC

FUNC(test_mont4)
	testprologue mont
	testtop
	bl	mont4
	testtail
	stp	q4, q5, [x4]
	testcarryout
	testepilogue
ENDFUNC

#endif

///----- That's all, folks --------------------------------------------------