/// -*- mode: asm; asm-comment-char: ?/ -*- /// /// Large SIMD-based multiplications /// /// (c) 2019 Straylight/Edgeware /// ///----- Licensing notice --------------------------------------------------- /// /// This file is part of Catacomb. /// /// Catacomb is free software: you can redistribute it and/or modify it /// under the terms of the GNU Library General Public License as published /// by the Free Software Foundation; either version 2 of the License, or /// (at your option) any later version. /// /// Catacomb is distributed in the hope that it will be useful, but /// WITHOUT ANY WARRANTY; without even the implied warranty of /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU /// Library General Public License for more details. /// /// You should have received a copy of the GNU Library General Public /// License along with Catacomb. If not, write to the Free Software /// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, /// USA. ///-------------------------------------------------------------------------- /// Preliminaries. #include "config.h" #include "asm-common.h" .arch armv7-a .fpu neon .text ///-------------------------------------------------------------------------- /// Theory. /// /// We define a number of primitive fixed-size multipliers from which we can /// construct more general variable-length multipliers. /// /// The basic trick is the same throughout. In an operand-scanning /// multiplication, the inner multiplication loop multiplies a multiple- /// precision operand by a single-precision factor, and adds the result, /// appropriately shifted, to the result accumulator. A `finely integrated /// operand scanning' implementation of Montgomery multiplication also adds /// the product of a single-precision `Montgomery factor' and the modulus, /// calculated in the same pass. The more common `coarsely integrated /// operand scanning' alternates main multiplication and Montgomery passes, /// which requires additional carry propagation. /// /// Throughout both plain-multiplication and Montgomery stages, then, one of /// the factors remains constant throughout the operation, so we can afford /// to take a little time to preprocess it. The transformation we perform is /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit /// pieces are placed into 32-bit cells, and arranged as two 128-bit NEON /// operands, as follows. /// /// Offset 12 8 4 0 /// 0 v''_1 v'_1 v''_0 v'_0 /// 16 v''_3 v'_3 v''_2 v'_2 /// /// The `vmull' and `vmlal' instructions can multiply a vector of two 32-bit /// values by a 32-bit scalar, giving two 64-bit results; thus, they will act /// on (say) v'_0 and v''_0 in a single instruction, to produce two 48-bit /// results in 64-bit fields. The sixteen bits of headroom allows us to add /// many products together before we must deal with carrying; it also allows /// for some calculations to be performed on the above expanded form. /// /// We maintain three `carry' registers, q13--q15, accumulating intermediate /// results; we name them `c0', `c1', and `c2'. Each carry register holds /// two 64-bit halves: the register c0, for example, holds c'_0 (low half) /// and c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the /// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2.
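/// For concreteness, here is a minimal C model (illustrative only, not part
/// of the build) of the expansion step just described, assuming 32-bit
/// words; below, the same job is done by `vzip.16' against a register of
/// zeroes.
///
///	static void expand4(uint32_t out[8], const uint32_t v[4])
///	{
///	  for (int i = 0; i < 4; i++) {
///	    out[2*i] = v[i]&0xffff;	/* v'_i, in a 32-bit cell */
///	    out[2*i + 1] = v[i] >> 16;	/* v''_i, likewise */
///	  }
///	}
///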
/// The `vmull' or `vmlal' instruction acting on a scalar operand and an /// operand in the expanded form above produces a result which can be added /// directly to the appropriate carry register. /// /// Multiplication is performed in product-scanning order, since ARM /// processors commonly implement result forwarding for consecutive multiply- /// and-accumulate instructions specifying the same destination. /// Experimentally, this runs faster than an operand-scanning arrangement /// which attempts to hide instruction latencies. /// /// On 32-bit ARM, we have a reasonable number of registers: the expanded /// operands are kept in registers. The packed operands are read from memory /// into working registers q0 and q1. The following conventional argument /// names and locations are used throughout. /// /// Arg Format Location Notes /// /// U packed [r1] /// X packed [r2] In Montgomery multiplication, X = N /// V expanded q2/q3 /// Y expanded q4/q5 In Montgomery multiplication, Y = (A + U V) M /// M expanded q4/q5 -N^{-1} (mod B^4) /// N Modulus, for Montgomery multiplication /// A packed [r0] Destination/accumulator /// C carry q13--q15 /// /// The calculation is some variant of /// /// A' + C' B^4 <- U V + X Y + A + C /// /// The low-level functions fit into a fairly traditional (finely-integrated) /// operand-scanning loop over operand pairs (U, X) (indexed by j) and (V, Y) /// (indexed by i). /// /// The variants are as follows. /// /// Function Variant Use i j /// /// mmul4 A = C = 0 Montgomery 0 0 /// dmul4 A = 0 Montgomery 0 + /// mmla4 C = 0 Montgomery + 0 /// dmla4 exactly as shown Montgomery + + /// /// mul4zc U = V = A = C = 0 Plain 0 0 /// mul4 U = V = A = 0 Plain 0 + /// mla4zc U = V = C = 0 Plain + 0 /// mla4 U = V = 0 Plain + + /// /// The `mmul4' and `mmla4' functions are also responsible for calculating /// the Montgomery reduction factor Y = (A + U V) M used by the rest of the /// inner loop. ///-------------------------------------------------------------------------- /// Macro definitions. .macro mulacc z, u, v, x=nil, y=nil // Set Z = Z + U V + X Y. X may be `nil' to omit the second // operand pair. Z should be a 128-bit `qN' register; V and Y should be // 64-bit `dN' registers; and U and X should be 32-bit `dN[I]' // scalars; the multiplications produce two 64-bit elementwise // products, which are added elementwise to Z. vmlal.u32 \z, \v, \u .ifnes "\x", "nil" vmlal.u32 \z, \y, \x .endif .endm .macro mulinit z, zinitp, u, v, x, y // If ZINITP then set Z = Z + U V + X Y, as for `mulacc'; otherwise, // set Z = U V + X Y. Operand requirements and detailed operation // are as for `mulacc'. .ifeqs "\zinitp", "t" mulacc \z, \u, \v, \x, \y .else vmull.u32 \z, \v, \u .ifnes "\x", "nil" vmlal.u32 \z, \y, \x .endif .endif .endm // `MULI': accumulate the B^I and b B^I terms of the polynomial product sum U // V + X Y, given that U = u_0 + B u_1 + B^2 u_2 + B^3 u_3 (and similarly for // X), and V = v'_0 + b v''_0 + B (v'_1 + b v''_1) + B^2 (v'_2 + b v''_2) + // B^3 (v'_3 + b v''_3) (and similarly for Y). The 64-bit coefficients are // added into the low and high halves of the 128-bit register Z (if ZINITP is // `nil' then simply set Z, as if it were initially zero).
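// As a cross-check, here is a scalar C model (illustrative only, not part
// of the build) of one `MULk' step: it accumulates the B^k coefficient of
// U V into the two 64-bit halves of a carry, with V supplied in expanded
// form as vx[2 j] = v'_j and vx[2 j + 1] = v''_j.  The X Y terms, when
// present, follow exactly the same pattern.
//
//	static void mulk(uint64_t *zlo, uint64_t *zhi, int k,
//			 const uint32_t u[4], const uint32_t vx[8])
//	{
//	  for (int i = 0; i <= 3; i++) {
//	    int j = k - i;
//	    if (j < 0 || j > 3) continue;
//	    *zlo += (uint64_t)u[i]*vx[2*j];	/* u_i v'_j */
//	    *zhi += (uint64_t)u[i]*vx[2*j + 1];	/* u_i v''_j, worth b per unit */
//	  }
//	}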
#define MUL0(z, zinitp, u, v0, v1, x, y0, y1) \ mulinit z, zinitp, QW(u, 0), D0(v0), QW(x, 0), D0(y0) #define MUL1(z, zinitp, u, v0, v1, x, y0, y1) \ mulinit z, zinitp, QW(u, 0), D1(v0), QW(x, 0), D1(y0); \ mulacc z, QW(u, 1), D0(v0), QW(x, 1), D0(y0) #define MUL2(z, zinitp, u, v0, v1, x, y0, y1) \ mulinit z, zinitp, QW(u, 0), D0(v1), QW(x, 0), D0(y1); \ mulacc z, QW(u, 1), D1(v0), QW(x, 1), D1(y0); \ mulacc z, QW(u, 2), D0(v0), QW(x, 2), D0(y0) #define MUL3(z, zinitp, u, v0, v1, x, y0, y1) \ mulinit z, zinitp, QW(u, 0), D1(v1), QW(x, 0), D1(y1); \ mulacc z, QW(u, 1), D0(v1), QW(x, 1), D0(y1); \ mulacc z, QW(u, 2), D1(v0), QW(x, 2), D1(y0); \ mulacc z, QW(u, 3), D0(v0), QW(x, 3), D0(y0) #define MUL4(z, zinitp, u, v0, v1, x, y0, y1) \ mulinit z, zinitp, QW(u, 1), D1(v1), QW(x, 1), D1(y1); \ mulacc z, QW(u, 2), D0(v1), QW(x, 2), D0(y1); \ mulacc z, QW(u, 3), D1(v0), QW(x, 3), D1(y0) #define MUL5(z, zinitp, u, v0, v1, x, y0, y1) \ mulinit z, zinitp, QW(u, 2), D1(v1), QW(x, 2), D1(y1); \ mulacc z, QW(u, 3), D0(v1), QW(x, 3), D0(y1) #define MUL6(z, zinitp, u, v0, v1, x, y0, y1) \ mulinit z, zinitp, QW(u, 3), D1(v1), QW(x, 3), D1(y1) // Steps in the process of propagating carry bits from ZLO to ZHI (both // 128-bit `qN' registers). Here, T is a 128-bit `qN' temporary register. // Set the low 32 bits of the 64-bit `dN' register ZOUT to the completed // coefficient z_i. // // In detail, what happens is as follows. Suppose initially that ZLO = // (z'_i; z''_i) and ZHI = (z'_{i+1}; z''_{i+1}). Let t = z'_i + b z''_i; // observe that floor(t/b) = floor(z'_i/b) + z''_i. Let z_i = t mod B, and // add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}. This has // a circuit depth of 4; I don't know how to do better. .macro _carry0 zout, zlo0, zlo1, t0, t1 // ZLO0 and ZLO1 are the low and high halves of a carry register. // Extract a 32-bit output, in the bottom 32 bits of ZOUT, and set T1 // so as to continue processing using `_carry1'. All operands are // 64-bit `dN' registers. If ZOUT is `nil' then no output is // produced; if T1 is `nil' then no further processing will be // possible. .ifnes "\zout", "nil" vshl.i64 \t0, \zlo1, #16 .endif .ifnes "\t1", "nil" vshr.u64 \t1, \zlo0, #16 .endif .ifnes "\zout", "nil" vadd.i64 \zout, \zlo0, \t0 .endif .ifnes "\t1", "nil" vadd.i64 \t1, \t1, \zlo1 .endif .endm .macro _carry1 u, zhi0, t1 // ZHI0 is the low half of a carry register, and T1 is the result of // applying `_carry0' to the previous carry register. Set U to the // result of propagating the carry into ZHI0. vshr.u64 \t1, \t1, #16 vadd.i64 \u, \zhi0, \t1 .endm // More convenient wrappers for `_carry0' and `_carry1'. // // CARRY0(ZOUT, ZLO, T) // Store a 32-bit output in ZOUT from carry ZLO, using T as a // temporary. ZOUT is a 64-bit `dN' register; ZLO and T are 128-bit // `qN' registers. // // CARRY1(ZHI, T) // Propagate carry from T into ZHI. Both are 128-bit `qN' registers; // ZHI is updated. #define CARRY0(zout, zlo, t) \ CASIDE0(zout, D0(zlo), zlo, t) #define CARRY1(zhi, t) \ CASIDE1(D0(zhi), zhi, t) // Versions of `CARRY0' and `CARRY1' which don't mutate their operands. // // CASIDE0(ZOUT, U, ZLO, T) // As for `CARRY0', but the low half of ZLO is actually in U (a 64-bit // `dN' register). // // CASIDE0E(ZOUT, U, ZLO, T) // As for `CASIDE0', but only calculate the output word, and no // carry-out. // // CASIDE1(U, ZHI, T) // As for `CARRY1', but write the updated low half of ZHI to U. 
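// In the same spirit, a C model (again illustrative, not part of the
// build) of one complete carry step, i.e., `_carry0' followed by
// `_carry1': it returns the completed coefficient z_i and folds the
// carry-out into the low half of the next carry register.  The staged
// shifts match the macros above, so that no intermediate overflows its 64
// bits; the truncated sum needs only the low 32 bits, so the wrap in
// zlo1 << 16 is harmless.
//
//	static uint32_t carrystep(uint64_t zlo0, uint64_t zlo1,
//				  uint64_t *zhi0)
//	{
//	  uint32_t z = (uint32_t)(zlo0 + (zlo1 << 16)); /* z_i = t mod B */
//	  *zhi0 += ((zlo0 >> 16) + zlo1) >> 16;	/* floor(t/B) */
//	  return (z);
//	}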
#define CASIDE0(zout, u, zlo, t) \ _carry0 zout, u, D1(zlo), D0(t), D1(t) #define CASIDE0E(zout, u, zlo, t) \ _carry0 zout, u, D1(zlo), D0(t), nil #define CASIDE1(u, zhi, t) \ _carry1 u, D0(zhi), D1(t) // Steps in spreading a packed 128-bit operand in A0 across A0, A1, A2, A3 in // carry format. #define SPREADACC0(a0, a1, a2, a3) \ vmov.i32 a1, #0; \ vmov.i32 a2, #0; \ vmov.i32 a3, #0 #define SPREADACC1(a0, a1, a2, a3) \ vswp D1(a0), D0(a2); \ vtrn.32 D0(a0), D0(a1); \ vtrn.32 D0(a2), D0(a3) // Add the carry-format values A0, A1, A2 into the existing carries C0, C1, // C2 (leaving A3 where it is). #define CARRYACC(a0, a1, a2, a3, c0, c1, c2) \ vadd.i64 c0, c0, a0; \ vadd.i64 c1, c1, a1; \ vadd.i64 c2, c2, a2 ///-------------------------------------------------------------------------- /// Primitive multipliers and related utilities. INTFUNC(carryprop) // On entry, r0 points to a destination, and q13--q15 hold incoming // carries c0--c2. On exit, the low 128 bits of the carry value are // stored at [r0]; the remaining 16 bits of carry are left in d30; r0 // is advanced by 16; and q10--q14 are clobbered. endprologue CARRY0(D0(q10), q13, q12) CARRY1(q14, q12) CARRY0(D0(q11), q14, q12) CARRY1(q15, q12) CARRY0(D1(q10), q15, q12) vshr.u64 D1(q11), D1(q12), #16 vshr.u64 D0(q15), D1(q12), #48 vtrn.32 q10, q11 vst1.32 {q10}, [r0]! bx r14 ENDFUNC INTFUNC(dmul4) // On entry, r0 points to the destination; r1 and r2 point to packed // operands U and X; q2/q3 and q4/q5 hold expanded operands V and Y; // and q13--q15 hold incoming carries c0--c2. On exit, the // destination and carries are updated; r0, r1, r2 are each advanced // by 16; q2--q5 are preserved; and the other NEON registers are // clobbered. endprologue // Start by loading the operand words from memory. vld1.32 {q0}, [r1]! vld1.32 {q1}, [r2]! // Do the multiplication. MUL0(q13, t, q0, q2, q3, q1, q4, q5) MUL1(q14, t, q0, q2, q3, q1, q4, q5) CARRY0(D0(q8), q13, q6) MUL2(q15, t, q0, q2, q3, q1, q4, q5) CARRY1(q14, q6) CARRY0(D0(q9), q14, q6) MUL3(q12, nil, q0, q2, q3, q1, q4, q5) CARRY1(q15, q6) CARRY0(D1(q8), q15, q6) MUL4(q13, nil, q0, q2, q3, q1, q4, q5) CARRY1(q12, q6) CARRY0(D1(q9), q12, q6) MUL5(q14, nil, q0, q2, q3, q1, q4, q5) CARRY1(q13, q6) MUL6(q15, nil, q0, q2, q3, q1, q4, q5) // Finish up and store the result. vtrn.32 q8, q9 vst1.32 {q8}, [r0]! // All done. bx r14 ENDFUNC INTFUNC(dmla4) // On entry, r0 points to the destination/accumulator; r1 and r2 // point to packed operands U and X; q2/q3 and q4/q5 hold expanded // operands V and Y; and q13--q15 hold incoming carries c0--c2. On // exit, the accumulator and carries are updated; r0, r1, r2 are each // advanced by 16; q2--q5 are preserved; and the other NEON registers // are clobbered. endprologue // Start by loading the operand words from memory. vld1.32 {q9}, [r0] SPREADACC0(q9, q10, q11, q12) vld1.32 {q0}, [r1]! vld1.32 {q1}, [r2]! // Add the accumulator input to the incoming carries. Split the // accumulator into four pieces and add the carries onto them. SPREADACC1(q9, q10, q11, q12) CARRYACC(q9, q10, q11, q12, q13, q14, q15) // Do the multiplication. 
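// As in `dmul4' above, the carry-propagation steps are interleaved between the multiply runs, rather than being done all at the end, which helps to keep the NEON pipeline busy while the `vmlal' results are forwarded.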
MUL0(q13, t, q0, q2, q3, q1, q4, q5) MUL1(q14, t, q0, q2, q3, q1, q4, q5) CARRY0(D0(q8), q13, q6) MUL2(q15, t, q0, q2, q3, q1, q4, q5) CARRY1(q14, q6) CARRY0(D0(q9), q14, q6) MUL3(q12, t, q0, q2, q3, q1, q4, q5) CARRY1(q15, q6) CARRY0(D1(q8), q15, q6) MUL4(q13, nil, q0, q2, q3, q1, q4, q5) CARRY1(q12, q6) CARRY0(D1(q9), q12, q6) MUL5(q14, nil, q0, q2, q3, q1, q4, q5) CARRY1(q13, q6) MUL6(q15, nil, q0, q2, q3, q1, q4, q5) // Finish up and store the result. vtrn.32 q8, q9 vst1.32 {q8}, [r0]! // All done. bx r14 ENDFUNC INTFUNC(mul4) // On entry, r0 points to the destination; r2 points to a packed // operand X; q4/q5 holds an expanded operand Y; and q13--q15 hold // incoming carries c0--c2. On exit, the destination and carries are // updated; r0 and r2 are each advanced by 16; q4 and q5 are // preserved; and the other NEON registers are clobbered. endprologue // Start by loading the operand words from memory. vld1.32 {q1}, [r2]! // Do the multiplication. MUL0(q13, t, q1, q4, q5, nil, nil, nil) MUL1(q14, t, q1, q4, q5, nil, nil, nil) CARRY0(D0(q8), q13, q6) MUL2(q15, t, q1, q4, q5, nil, nil, nil) CARRY1(q14, q6) CARRY0(D0(q9), q14, q6) MUL3(q12, nil, q1, q4, q5, nil, nil, nil) CARRY1(q15, q6) CARRY0(D1(q8), q15, q6) MUL4(q13, nil, q1, q4, q5, nil, nil, nil) CARRY1(q12, q6) CARRY0(D1(q9), q12, q6) MUL5(q14, nil, q1, q4, q5, nil, nil, nil) CARRY1(q13, q6) MUL6(q15, nil, q1, q4, q5, nil, nil, nil) // Finish up and store the result. vtrn.32 q8, q9 vst1.32 {q8}, [r0]! // All done. bx r14 ENDFUNC INTFUNC(mul4zc) // On entry, r0 points to the destination; r2 points to a packed // operand X; and q4/q5 holds an expanded operand Y. On exit, the // destination is updated; q13--q15 hold outgoing carries c0--c2; r0 // and r2 are each advanced by 16; q4 and q5 are preserved; and the // other NEON registers are clobbered. endprologue // Start by loading the operand words from memory. vld1.32 {q1}, [r2]! // Do the multiplication. MUL0(q13, nil, q1, q4, q5, nil, nil, nil) MUL1(q14, nil, q1, q4, q5, nil, nil, nil) CARRY0(D0(q8), q13, q6) MUL2(q15, nil, q1, q4, q5, nil, nil, nil) CARRY1(q14, q6) CARRY0(D0(q9), q14, q6) MUL3(q12, nil, q1, q4, q5, nil, nil, nil) CARRY1(q15, q6) CARRY0(D1(q8), q15, q6) MUL4(q13, nil, q1, q4, q5, nil, nil, nil) CARRY1(q12, q6) CARRY0(D1(q9), q12, q6) MUL5(q14, nil, q1, q4, q5, nil, nil, nil) CARRY1(q13, q6) MUL6(q15, nil, q1, q4, q5, nil, nil, nil) // Finish up and store the result. vtrn.32 q8, q9 vst1.32 {q8}, [r0]! // All done. bx r14 ENDFUNC INTFUNC(mla4) // On entry, r0 points to the destination/accumulator; r2 points to a // packed operand X; q4/q5 holds an expanded operand Y; and q13--q15 // hold incoming carries c0--c2. On exit, the accumulator and // carries are updated; r0 and r2 are each advanced by 16; q4 and q5 // are preserved; and the other NEON registers are clobbered. endprologue // Start by loading the operand words from memory. vld1.32 {q9}, [r0] SPREADACC0(q9, q10, q11, q12) vld1.32 {q1}, [r2]! // Add the accumulator input to the incoming carries. Split the // accumulator into four pieces and add the carries onto them. SPREADACC1(q9, q10, q11, q12) CARRYACC(q9, q10, q11, q12, q13, q14, q15) // Do the multiplication. 
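// The multiply/carry sequence from `mla4_common' onwards is shared with `mla4zc' below, which seeds the carry registers directly from the accumulator and then branches here.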
mla4_common: MUL0(q13, t, q1, q4, q5, nil, nil, nil) MUL1(q14, t, q1, q4, q5, nil, nil, nil) CARRY0(D0(q8), q13, q6) MUL2(q15, t, q1, q4, q5, nil, nil, nil) CARRY1(q14, q6) CARRY0(D0(q9), q14, q6) MUL3(q12, t, q1, q4, q5, nil, nil, nil) CARRY1(q15, q6) CARRY0(D1(q8), q15, q6) MUL4(q13, nil, q1, q4, q5, nil, nil, nil) CARRY1(q12, q6) CARRY0(D1(q9), q12, q6) MUL5(q14, nil, q1, q4, q5, nil, nil, nil) CARRY1(q13, q6) MUL6(q15, nil, q1, q4, q5, nil, nil, nil) // Finish up and store the result. vtrn.32 q8, q9 vst1.32 {q8}, [r0]! // All done. bx r14 ENDFUNC INTFUNC(mla4zc) // On entry, r0 points to the destination/accumulator; r2 points to a // packed operand X; and q4/q5 holds an expanded operand Y. On exit, // the accumulator is updated; q13--q15 hold outgoing carries c0--c2; // r0 and r2 are each advanced by 16; q4 and q5 are preserved; and // the other NEON registers are clobbered. endprologue // Start by loading the operand words from memory. vld1.32 {q13}, [r0] SPREADACC0(q13, q14, q15, q12) vld1.32 {q1}, [r2]! // Move the accumulator input to the incoming carry slots. Split the // accumulator into four pieces. SPREADACC1(q13, q14, q15, q12) b mla4_common ENDFUNC INTFUNC(mmul4) // On entry, r0 points to the destination; r1 points to a packed // operand U; r2 points to a packed operand X (the modulus); q2/q3 // holds an expanded operand V; and q4/q5 holds an expanded operand M // (the Montgomery factor -N^{-1} (mod B^4)). On exit, the destination // is updated (to zero); q4/q5 hold an expanded factor Y = U V M (mod // B^4); q13--q15 hold outgoing carries c0--c2; r0, r1, and r2 are each // advanced by 16; q2 and q3 are preserved; and the other NEON // registers are clobbered. endprologue // Start by loading the operand words from memory. vld1.32 {q0}, [r1]! // Calculate the low half of W = A + U V, being careful to leave the // carries in place. MUL0(q13, nil, q0, q2, q3, nil, nil, nil) MUL1(q14, nil, q0, q2, q3, nil, nil, nil) CARRY0(D0(q6), q13, q8) MUL2(q15, nil, q0, q2, q3, nil, nil, nil) CASIDE1(D0(q9), q14, q8) CASIDE0(D0(q7), D0(q9), q14, q8) MUL3(q12, nil, q0, q2, q3, nil, nil, nil) b mmla4_common ENDFUNC INTFUNC(mmla4) // On entry, r0 points to the destination/accumulator A; r1 points to // a packed operand U; r2 points to a packed operand X (the modulus); // q2/q3 holds an expanded operand V; and q4/q5 holds an expanded // operand M (the Montgomery factor -N^{-1} (mod B^4)). On exit, the // accumulator is updated (to zero); q4/q5 hold an expanded factor Y // = (A + U V) M (mod B^4); q13--q15 hold outgoing carries c0--c2; r0, // r1, and r2 are each advanced by 16; q2 and q3 are preserved; and // the other NEON registers are clobbered. endprologue // Start by loading the operand words from memory. vld1.32 {q13}, [r0] SPREADACC0(q13, q14, q15, q12) vld1.32 {q0}, [r1]! // Move the accumulator input to the incoming carry slots. Split the // accumulator into four pieces. SPREADACC1(q13, q14, q15, q12) // Calculate the low half of W = A + U V, being careful to leave the // carries in place. MUL0(q13, t, q0, q2, q3, nil, nil, nil) MUL1(q14, t, q0, q2, q3, nil, nil, nil) CARRY0(D0(q6), q13, q8) MUL2(q15, t, q0, q2, q3, nil, nil, nil) CASIDE1(D0(q9), q14, q8) CASIDE0(D0(q7), D0(q9), q14, q8) MUL3(q12, t, q0, q2, q3, nil, nil, nil) mmla4_common: CASIDE1(D0(q9), q15, q8) CASIDE0(D1(q6), D0(q9), q15, q8) CASIDE1(D0(q9), q12, q8) CASIDE0E(D1(q7), D0(q9), q12, q8) vtrn.32 q6, q7 // Calculate the low half of the Montgomery factor Y = W M. At this // point, registers are a little tight.
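	// (This is the heart of the Montgomery trick: with M = -N^{-1} (mod B^4) and Y = W M (mod B^4), and X = N, we get W + X Y == W - N N^{-1} W == 0 (mod B^4), so the low four words of the sum vanish and only carries remain.)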
MUL0( q8, nil, q6, q4, q5, nil, nil, nil) MUL1( q9, nil, q6, q4, q5, nil, nil, nil) CARRY0(D0(q8), q8, q1) MUL2(q10, nil, q6, q4, q5, nil, nil, nil) CARRY1(q9, q1) CARRY0(D0(q9), q9, q1) MUL3(q11, nil, q6, q4, q5, nil, nil, nil) CARRY1(q10, q1) CARRY0(D1(q8), q10, q1) vmov.i32 q5, #0 CARRY1(q11, q1) CARRY0(D1(q9), q11, q1) vld1.32 {q1}, [r2]! vtrn.32 q8, q9 // Expand Y. We'll put it in its proper place a bit later. vzip.16 q8, q5 // Build up the product X Y in the carry slots. MUL0(q13, t, q1, q8, q5, nil, nil, nil) MUL1(q14, t, q1, q8, q5, nil, nil, nil) CARRY0(nil, q13, q9) MUL2(q15, t, q1, q8, q5, nil, nil, nil) CARRY1(q14, q9) vmov q4, q8 CARRY0(nil, q14, q9) MUL3(q12, t, q1, q8, q5, nil, nil, nil) CARRY1(q15, q9) CARRY0(nil, q15, q9) vmov.i32 q6, #0 // And complete the calculation. MUL4(q13, nil, q0, q2, q3, q1, q4, q5) CARRY1(q12, q9) CARRY0(nil, q12, q9) MUL5(q14, nil, q0, q2, q3, q1, q4, q5) CARRY1(q13, q9) MUL6(q15, nil, q0, q2, q3, q1, q4, q5) // Finish up and store the result. vst1.32 {q6}, [r0]! // All done. bx r14 ENDFUNC INTFUNC(mont4) // On entry, r0 points to the destination/accumulator A; r2 points to // a packed operand X (the modulus); and q2/q3 holds an expanded // operand M (the Montgomery factor -N^{-1} (mod B^4)). On exit, the // accumulator is updated (to zero); q4/q5 hold an expanded factor Y // = A M (mod B^4); q13--q15 hold outgoing carries c0--c2; r0 and r2 // are each advanced by 16; q2 and q3 are preserved; and the other // NEON registers are clobbered. endprologue // Start by loading the operand words from memory. vld1.32 {q0}, [r0] vld1.32 {q1}, [r2]! // Calculate Y = A M (mod B^4). MUL0(q8, nil, q0, q2, q3, nil, nil, nil) MUL1(q9, nil, q0, q2, q3, nil, nil, nil) CARRY0(D0(q4), q8, q6) MUL2(q10, nil, q0, q2, q3, nil, nil, nil) CARRY1(q9, q6) vmov q13, q0 CARRY0(D0(q7), q9, q6) MUL3(q11, nil, q0, q2, q3, nil, nil, nil) CARRY1(q10, q6) CARRY0(D1(q4), q10, q6) SPREADACC0(q13, q14, q15, q12) CARRY1(q11, q6) CARRY0(D1(q7), q11, q6) SPREADACC1(q13, q14, q15, q12) vmov.i32 q5, #0 vtrn.32 q4, q7 vzip.16 q4, q5 // Calculate the actual result. Well, the carries, at least. vmov.i32 q8, #0 MUL0(q13, t, q1, q4, q5, nil, nil, nil) MUL1(q14, t, q1, q4, q5, nil, nil, nil) CARRY0(nil, q13, q6) MUL2(q15, t, q1, q4, q5, nil, nil, nil) CARRY1(q14, q6) CARRY0(nil, q14, q6) MUL3(q12, t, q1, q4, q5, nil, nil, nil) CARRY1(q15, q6) CARRY0(nil, q15, q6) MUL4(q13, nil, q1, q4, q5, nil, nil, nil) CARRY1(q12, q6) CARRY0(nil, q12, q6) MUL5(q14, nil, q1, q4, q5, nil, nil, nil) CARRY1(q13, q6) MUL6(q15, nil, q1, q4, q5, nil, nil, nil) // Finish up and store the result. vst1.32 {q8}, [r0]! // All done. bx r14 ENDFUNC ///-------------------------------------------------------------------------- /// Bulk multipliers. FUNC(mpx_umul4_arm_neon) // void mpx_umul4_arm_neon(mpw *dv, const mpw *av, const mpw *avl, // const mpw *bv, const mpw *bvl); // Establish the arguments and do initial setup. // // inner loop dv r0 // inner loop av r2 // outer loop dv r5 // outer loop bv r3 // av base r1 // av limit r12 // bv limit r4 pushreg r4, r5, r14 pushvfp QQ(q4, q7) endprologue // Prepare for the first iteration. vld1.32 {q4}, [r3]! // = Y = bv[0] vmov.i32 q5, #0 // r0 // -> dv for inner loop // r1 // -> av base // r3 // -> bv for outer loop ldr r4, [sp, #76] // -> bv limit mov r12, r2 // -> av limit mov r2, r1 // -> av for inner loop add r5, r0, #16 // -> dv for outer loop vzip.16 q4, q5 // expand Y bl mul4zc cmp r2, r12 // all done? bhs 8f // Continue with the first iteration. 0: bl mul4 cmp r2, r12 // all done?
blo 0b // Write out the leftover carry. There can be no tail here. 8: bl carryprop cmp r3, r4 // more passes to do? bhs 9f // Set up for the next pass. 1: vld1.32 {q4}, [r3]! // = Y = bv[i] vmov.i32 q5, #0 mov r0, r5 // -> dv[i] mov r2, r1 // -> av[0] add r5, r5, #16 vzip.16 q4, q5 // expand Y bl mla4zc cmp r2, r12 // done yet? bhs 8f // Continue... 0: bl mla4 cmp r2, r12 blo 0b // Finish off this pass. There was no tail on the previous pass, and // there can be none on this pass. 8: bl carryprop cmp r3, r4 blo 1b // All over. 9: popvfp QQ(q4, q7) popreg r4, r5, r14 bx r14 ENDFUNC FUNC(mpxmont_mul4_arm_neon) // void mpxmont_mul4_arm_neon(mpw *dv, const mpw *av, const mpw *bv, // const mpw *nv, size_t n, const mpw *mi); // Establish the arguments and do initial setup. // // inner loop dv r0 // inner loop av r1 // inner loop nv r2 // mi r5 // outer loop dv r6 // outer loop bv r7 // av base r8 // av limit r9 // bv limit r4 // nv base r3 // n r4 // c r10 // 0 r12 pushreg r4-r10, r14 pushvfp QQ(q4, q7) endprologue // Establish the expanded operands. ldrd r4, r5, [sp, #96] // r4 = n; r5 -> mi vld1.32 {q2}, [r2] // = V = bv[0] vmov.i32 q3, #0 vmov.i32 q5, #0 vld1.32 {q4}, [r5] // = M // Set up the outer loop state and prepare for the first iteration. // r0 // -> dv for inner loop // r1 // -> av for inner loop add r7, r2, #16 // -> bv // r3 // -> nv add r6, r0, #16 // -> dv mov r8, r1 // -> av add r9, r1, r4, lsl #2 // -> av limit add r4, r2, r4, lsl #2 // -> bv limit mov r2, r3 // -> nv for inner loop mov r12, #0 // = 0 vzip.16 q2, q3 // expand V vzip.16 q4, q5 // expand M bl mmul4 cmp r1, r9 // done already? bhs 8f // Complete the first inner loop. 0: bl dmul4 cmp r1, r9 // done yet? blo 0b // Still have carries left to propagate. Rather than store the tail // end in memory, keep it in a general-purpose register for later. bl carryprop vmov.32 r10, QW(q15, 0) // Embark on the next iteration. (There must be one. If n = 1 then // we would have bailed above, to label 8. Similarly, the subsequent // iterations can fall into the inner loop immediately.) 1: vld1.32 {q2}, [r7]! // = V = bv[i] vld1.32 {q4}, [r5] // = M vmov.i32 q3, #0 vmov.i32 q5, #0 mov r0, r6 // -> dv[i] add r6, r6, #16 mov r1, r8 // -> av[0] mov r2, r3 // -> nv[0] vzip.16 q2, q3 // expand V vzip.16 q4, q5 // expand M bl mmla4 // Complete the next inner loop. 0: bl dmla4 cmp r1, r9 // done yet? blo 0b // Still have carries left to propagate, and they overlap the // previous iteration's final tail, so read that and add it. cmp r7, r4 vmov.32 D0(q12), r10, r12 vadd.i64 D0(q13), D0(q13), D0(q12) bl carryprop vmov.32 r10, QW(q15, 0) // Back again, maybe. blo 1b // All done, almost. str r10, [r0], #4 popvfp QQ(q4, q7) popreg r4-r10, r14 bx r14 // First iteration was short. Write out the carries and we're done. // (This could be folded into the main loop structure, but that would // penalize small numbers more.) 8: bl carryprop vst1.32 {QW(q15, 0)}, [r0]! popvfp QQ(q4, q7) popreg r4-r10, r14 bx r14 ENDFUNC FUNC(mpxmont_redc4_arm_neon) // void mpxmont_redc4_arm_neon(mpw *dv, mpw *dvl, const mpw *nv, // size_t n, const mpw *mi); // Establish the arguments and do initial setup. // // inner loop dv r0 // dv limit r1 // inner loop nv r2 // blocks-of-4 dv limit r3 // mi (r14) // outer loop dv r4 // outer loop dv limit r5 // nv base r6 // nv limit r7 // n r3 // c (r14) // t0, t1, t2, t3 r2, r8, r9, r10 // 0 r12 pushreg r4-r10, r14 pushvfp QQ(q4, q7) endprologue // Set up the outer loop state and prepare for the first iteration.
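	// Each pass of the outer loop computes Y = dv[i] M (mod B^4) and accumulates N Y over the buffer, clearing the bottom four words of the current block; the carries left over are then propagated up the rest of the buffer with plain 32-bit arithmetic.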
ldr r14, [sp, #96] // -> mi vmov.i32 q3, #0 sub r12, r1, r0 // total dv bytes // r0 // -> dv for inner loop // r1 // -> overall dv limit // r2 // -> nv for inner loop // r3 // = n (for now) add r4, r0, #16 // -> dv for outer loop add r5, r0, r3, lsl #2 // -> dv limit bic r12, r12, #15 // dv blocks of 4 vld1.32 {q2}, [r14] // = M mov r6, r2 // -> nv add r7, r2, r3, lsl #2 // -> nv limit add r3, r0, r12 // -> dv blocks-of-4 limit vzip.16 q2, q3 // expand M mov r12, #0 // = 0 bl mont4 cmp r2, r7 // done already? bhs 8f 5: bl mla4 cmp r2, r7 // done yet? blo 5b // Still have carries left to propagate. Adding the accumulator // block into the carries is a little different this time, because // all four accumulator limbs have to be squished into the three // carry registers for `carryprop' to do its thing. 8: vld1.32 {q9}, [r0] SPREADACC0(q9, q10, q11, q12) SPREADACC1(q9, q10, q11, q12) vshl.u64 D0(q12), D0(q12), #16 CARRYACC(q9, q10, q11, q12, q13, q14, q15) vadd.u64 D1(q15), D1(q15), D0(q12) bl carryprop vmov.32 r14, QW(q15, 0) cmp r0, r3 bhs 7f // Propagate the first group of carries. ldmia r0, {r2, r8-r10} adds r2, r2, r14 adcs r8, r8, #0 adcs r9, r9, #0 adcs r10, r10, #0 stmia r0!, {r2, r8-r10} teq r0, r3 beq 6f // Continue carry propagation until the end of the buffer. 0: ldmia r0, {r2, r8-r10} adcs r2, r2, #0 adcs r8, r8, #0 adcs r9, r9, #0 adcs r10, r10, #0 stmia r0!, {r2, r8-r10} teq r0, r3 bne 0b // Deal with the tail end. Note that the actual destination length // won't be an exact number of blocks of four, so it's safe to just // drop through here. 6: adc r14, r12, #0 7: ldr r2, [r0] adds r2, r2, r14 str r2, [r0], #4 teq r0, r1 beq 8f 0: ldr r2, [r0] adcs r2, r2, #0 str r2, [r0], #4 teq r0, r1 bne 0b // All done for this iteration. Start the next. 8: cmp r4, r5 bhs 9f mov r0, r4 add r4, r4, #16 mov r2, r6 bl mont4 b 5b // All over. 9: popvfp QQ(q4, q7) popreg r4-r10, r14 bx r14 ENDFUNC ///-------------------------------------------------------------------------- /// Testing and performance measurement.
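///
/// Before the harness itself, a note on calling the bulk multiplier from C.
/// The following is a minimal sketch, not taken from the Catacomb sources:
/// it assumes that `mpw' is a 32-bit word here, that both operand lengths
/// are whole numbers of four-word blocks (callers are presumably
/// responsible for any padding), and that the destination has space for
/// the full product.
///
///	mpw a[8], b[8], d[16];
///	/* ... fill in a and b ... */
///	mpx_umul4_arm_neon(d, a, a + 8, b, b + 8);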
#ifdef TEST_MUL4 // dmul smul mmul mont // z r0 r0 r0 r0 r0 // c r4 r1 r1 r1 r1 // y r3 -- -- r2 r2 // u r1 r2 -- r3 -- // x r2 r3 r2 stk0 r3 // vv q2/q3 stk0 -- stk1 stk0 // yy q4/q5 stk1 r3 stk2 -- // n r5 stk2 stk0 stk3 stk1 // cyv r6 stk3 stk1 stk4 stk2 #define STKARG(i) sp, #80 + 4*(i) .macro testprologue mode pushreg r4-r6, r14 pushvfp QQ(q4, q7) endprologue .ifeqs "\mode", "dmul" mov r4, r1 // -> c mov r1, r2 // -> u mov r2, r3 // -> x ldr r14, [STKARG(0)] // -> vv vld1.32 {q2}, [r14] vmov.i32 q3, #0 vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0) ldr r14, [STKARG(1)] // -> yy vld1.32 {q4}, [r14] vmov.i32 q5, #0 vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0) ldr r5, [STKARG(2)] // = n ldr r6, [STKARG(3)] // -> cyv .endif .ifeqs "\mode", "smul" mov r4, r1 // -> c // r2 // -> x vld1.32 {q4}, [r3] vmov.i32 q5, #0 vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0) ldr r5, [STKARG(0)] // = n ldr r6, [STKARG(1)] // -> cyv .endif .ifeqs "\mode", "mmul" mov r4, r1 // -> c mov r1, r3 // -> u mov r3, r2 // -> y ldr r2, [STKARG(0)] // -> x ldr r14, [STKARG(1)] // -> vv vld1.32 {q2}, [r14] vmov.i32 q3, #0 vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0) ldr r14, [STKARG(2)] // -> yy vld1.32 {q4}, [r14] vmov.i32 q5, #0 vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0) ldr r5, [STKARG(3)] // = n ldr r6, [STKARG(4)] // -> cyv .endif .ifeqs "\mode", "mont" mov r4, r1 // -> c mov r14, r2 mov r2, r3 // -> x mov r3, r14 // -> y ldr r14, [STKARG(0)] // -> vv vld1.32 {q2}, [r14] vmov.i32 q3, #0 vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0) ldr r5, [STKARG(1)] // = n ldr r6, [STKARG(2)] // -> cyv .endif .endm .macro testldcarry vldmia r4, {QQ(q13, q15)} // c0, c1, c2 .endm .macro testtop 0: subs r5, r5, #1 .endm .macro testtail bne 0b .endm .macro testcarryout vstmia r4, {QQ(q13, q15)} .endm .macro testepilogue popvfp QQ(q4, q7) popreg r4-r6, r14 bx r14 .endm FUNC(test_dmul4) testprologue dmul testldcarry testtop bl dmul4 testtail testcarryout testepilogue ENDFUNC FUNC(test_dmla4) testprologue dmul testldcarry testtop bl dmla4 testtail testcarryout testepilogue ENDFUNC FUNC(test_mul4) testprologue smul testldcarry testtop bl mul4 testtail testcarryout testepilogue ENDFUNC FUNC(test_mul4zc) testprologue smul testldcarry testtop bl mul4zc testtail testcarryout testepilogue ENDFUNC FUNC(test_mla4) testprologue smul testldcarry testtop bl mla4 testtail testcarryout testepilogue ENDFUNC FUNC(test_mla4zc) testprologue smul testldcarry testtop bl mla4zc testtail testcarryout testepilogue ENDFUNC FUNC(test_mmul4) testprologue mmul testtop bl mmul4 testtail vst1.32 {q4, q5}, [r3] testcarryout testepilogue ENDFUNC FUNC(test_mmla4) testprologue mmul testtop bl mmla4 testtail vst1.32 {q4, q5}, [r3] testcarryout testepilogue ENDFUNC FUNC(test_mont4) testprologue mont testtop bl mont4 testtail vst1.32 {q4, q5}, [r3] testcarryout testepilogue ENDFUNC #endif ///----- That's all, folks --------------------------------------------------