From: Mark Wooding Date: Mon, 4 Nov 2019 12:22:00 +0000 (+0000) Subject: math/mpx-mul4-{arm-neon,arm64-simd}.S, etc.: Add ARM versions of `mul4'. X-Git-Tag: 2.6.0~9 X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/commitdiff_plain/ea1b3cec199052eda3a0054d86c70e948c6e7580 math/mpx-mul4-{arm-neon,arm64-simd}.S, etc.: Add ARM versions of `mul4'. With this, I think we (finally) have parity across the various premier target platforms. --- diff --git a/debian/catacomb2.symbols b/debian/catacomb2.symbols index 23ca209b..84ff8938 100644 --- a/debian/catacomb2.symbols +++ b/debian/catacomb2.symbols @@ -128,14 +128,20 @@ libcatacomb.so.2 catacomb2 #MINVER# (optional|arch=i386)mpx_umul4_x86_avx@Base 2.3.0 (optional|arch=amd64)mpx_umul4_amd64_sse2@Base 2.3.0 (optional|arch=amd64)mpx_umul4_amd64_avx@Base 2.3.0 + (optional|arch=armel armhf)mpx_umul4_arm_neon@Base 2.5.99~ + (optional|arch=arm64)mpx_umul4_arm64_simd@Base 2.5.99~ (optional|arch=i386)mpxmont_mul4_x86_sse2@Base 2.3.0 (optional|arch=i386)mpxmont_mul4_x86_avx@Base 2.3.0 (optional|arch=amd64)mpxmont_mul4_amd64_sse2@Base 2.3.0 (optional|arch=amd64)mpxmont_mul4_amd64_avx@Base 2.3.0 + (optional|arch=armel armhf)mpxmont_mul4_arm_neon@Base 2.5.99~ + (optional|arch=arm64)mpxmont_mul4_arm64_simd@Base 2.5.99~ (optional|arch=i386)mpxmont_redc4_x86_sse2@Base 2.3.0 (optional|arch=i386)mpxmont_redc4_x86_avx@Base 2.3.0 (optional|arch=amd64)mpxmont_redc4_amd64_sse2@Base 2.3.0 (optional|arch=amd64)mpxmont_redc4_amd64_avx@Base 2.3.0 + (optional|arch=armel armhf)mpxmont_redc4_arm_neon@Base 2.5.99~ + (optional|arch=arm64)mpxmont_redc4_arm64_simd@Base 2.5.99~ ## mparena mparena_create@Base 2.0.0 diff --git a/math/Makefile.am b/math/Makefile.am index 7c4ffed4..37d88d1a 100644 --- a/math/Makefile.am +++ b/math/Makefile.am @@ -192,6 +192,16 @@ MPX_MUL4_SOURCES = mpx-mul4-amd64-sse2.S check_PROGRAMS += mpx-mul4.t TESTS += mpx-mul4.t$(EXEEXT) endif +if CPUFAM_ARMEL +MPX_MUL4_SOURCES = mpx-mul4-arm-neon.S +check_PROGRAMS += mpx-mul4.t +TESTS += mpx-mul4.t$(EXEEXT) +endif +if CPUFAM_ARM64 +MPX_MUL4_SOURCES = mpx-mul4-arm64-simd.S +check_PROGRAMS += mpx-mul4.t +TESTS += mpx-mul4.t$(EXEEXT) +endif libmath_la_SOURCES += $(MPX_MUL4_SOURCES) mpx_mul4_t_SOURCES = mpx-mul4-test.c $(MPX_MUL4_SOURCES) mpx_mul4_t_CPPFLAGS = \ diff --git a/math/mpmont.c b/math/mpmont.c index 3edb58fb..6109c856 100644 --- a/math/mpmont.c +++ b/math/mpmont.c @@ -98,6 +98,14 @@ static void simple_redccore(mpw *dv, mpw *dvl, const mpw *mv, MAYBE_REDC4(amd64_avx) #endif +#if CPUFAM_ARMEL + MAYBE_REDC4(arm_neon) +#endif + +#if CPUFAM_ARM64 + MAYBE_REDC4(arm64_simd) +#endif + static redccore__functype *pick_redccore(void) { #if CPUFAM_X86 @@ -112,6 +120,13 @@ static redccore__functype *pick_redccore(void) DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_amd64_sse2, cpu_feature_p(CPUFEAT_X86_SSE2)); #endif +#if CPUFAM_ARMEL + DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_arm_neon, + cpu_feature_p(CPUFEAT_ARM_NEON)); +#endif +#if CPUFAM_ARM64 + DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_arm64_simd, 1); +#endif DISPATCH_PICK_FALLBACK(mpmont_reduce, simple_redccore); } @@ -204,6 +219,14 @@ static void simple_mulcore(mpw *dv, mpw *dvl, MAYBE_MUL4(amd64_avx) #endif +#if CPUFAM_ARMEL + MAYBE_MUL4(arm_neon) +#endif + +#if CPUFAM_ARM64 + MAYBE_MUL4(arm64_simd) +#endif + static mulcore__functype *pick_mulcore(void) { #if CPUFAM_X86 @@ -218,6 +241,13 @@ static mulcore__functype *pick_mulcore(void) DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_amd64_sse2, cpu_feature_p(CPUFEAT_X86_SSE2)); #endif +#if CPUFAM_ARMEL + 
DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_arm_neon, + cpu_feature_p(CPUFEAT_ARM_NEON)); +#endif +#if CPUFAM_ARM64 + DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_arm64_simd, 1); +#endif DISPATCH_PICK_FALLBACK(mpmont_mul, simple_mulcore); } diff --git a/math/mpx-mul4-arm-neon.S b/math/mpx-mul4-arm-neon.S new file mode 100644 index 00000000..efca7902 --- /dev/null +++ b/math/mpx-mul4-arm-neon.S @@ -0,0 +1,1189 @@ +/// -*- mode: asm; asm-comment-char: ?/ -*- +/// +/// Large SIMD-based multiplications +/// +/// (c) 2019 Straylight/Edgeware +/// + +///----- Licensing notice --------------------------------------------------- +/// +/// This file is part of Catacomb. +/// +/// Catacomb is free software: you can redistribute it and/or modify it +/// under the terms of the GNU Library General Public License as published +/// by the Free Software Foundation; either version 2 of the License, or +/// (at your option) any later version. +/// +/// Catacomb is distributed in the hope that it will be useful, but +/// WITHOUT ANY WARRANTY; without even the implied warranty of +/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +/// Library General Public License for more details. +/// +/// You should have received a copy of the GNU Library General Public +/// License along with Catacomb. If not, write to the Free Software +/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, +/// USA. + +///-------------------------------------------------------------------------- +/// Preliminaries. + +#include "config.h" +#include "asm-common.h" + + .arch armv7-a + .fpu neon + + .text + +///-------------------------------------------------------------------------- +/// Theory. +/// +/// We define a number of primitive fixed-size multipliers from which we can +/// construct more general variable-length multipliers. +/// +/// The basic trick is the same throughout. In an operand-scanning +/// multiplication, the inner multiplication loop multiplies a multiple- +/// precision operand by a single precision factor, and adds the result, +/// appropriately shifted, to the result. A `finely integrated operand +/// scanning' implementation of Montgomery multiplication also adds the +/// product of a single-precision `Montgomery factor' and the modulus, +/// calculated in the same pass. The more common `coarsely integrated +/// operand scanning' alternates main multiplication and Montgomery passes, +/// which requires additional carry propagation. +/// +/// Throughout both plain-multiplication and Montgomery stages, then, one of +/// the factors remains constant throughout the operation, so we can afford +/// to take a little time to preprocess it. The transformation we perform is +/// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a +/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into +/// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit +/// pieces are placed into 32-bit cells, and arranged as two 128-bit NEON +/// operands, as follows. +/// +/// Offset 0 4 8 12 +/// 0 v'_0 v''_0 v'_1 v''_1 +/// 16 v'_2 v''_2 v'_3 v''_3 +/// +/// The `vmull' and `vmlal' instructions can multiply a vector of two 32-bit +/// values by a 32-bit scalar, giving two 64-bit results; thus, it will act +/// on (say) v'_0 and v''_0 in a single instruction, to produce two 48-bit +/// results in 64-bit fields. 
The sixteen bits of headroom allows us to add +/// many products together before we must deal with carrying; it also allows +/// for some calculations to be performed on the above expanded form. +/// +/// We maintain three `carry' registers, q12--q14, accumulating intermediate +/// results; we name them `c0', `c1', and `c2'. Each carry register holds +/// two 64-bit halves: the register c0, for example, holds c'_0 (low half) +/// and c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the +/// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2. +/// The `vmull' or `vmlal' instruction acting on a scalar operand and an +/// operand in the expanded form above produces a result which can be added +/// directly to the appropriate carry register. +/// +/// Multiplication is performed in product-scanning order, since ARM +/// processors commonly implement result forwarding for consecutive multiply- +/// and-accumulate instructions specifying the same destination. +/// Experimentally, this runs faster than operand-scanning in an attempt to +/// hide instruction latencies. +/// +/// On 32-bit ARM, we have a reasonable number of registers: the expanded +/// operands are kept in registers. The packed operands are read from memory +/// into working registers q0 and q1. The following conventional argument +/// names and locations are used throughout. +/// +/// Arg Format Location Notes +/// +/// U packed [r1] +/// X packed [r2] In Montgomery multiplication, X = N +/// V expanded q2/q3 +/// Y expanded q4/q5 In Montgomery multiplication, Y = (A + U V) M +/// M expanded q4/q5 -N^{-1} (mod B^4) +/// N Modulus, for Montgomery multiplication +/// A packed [r0] Destination/accumulator +/// C carry q13--q15 +/// +/// The calculation is some variant of +/// +/// A' + C' B^4 <- U V + X Y + A + C +/// +/// The low-level functions fit into a fairly traditional (finely-integrated) +/// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y) +/// (indexed by i). +/// +/// The variants are as follows. +/// +/// Function Variant Use i j +/// +/// mmul4 A = C = 0 Montgomery 0 0 +/// dmul4 A = 0 Montgomery 0 + +/// mmla4 C = 0 Montgomery + 0 +/// dmla4 exactly as shown Montgomery + + +/// +/// mul4zc U = V = A = C = 0 Plain 0 0 +/// mul4 U = V = A = 0 Plain 0 + +/// mla4zc U = V = C = 0 Plain + 0 +/// mla4 U = V = 0 Plain + + +/// +/// The `mmul4' and `mmla4' functions are also responsible for calculating +/// the Montgomery reduction factor Y = (A + U V) M used by the rest of the +/// inner loop. + +///-------------------------------------------------------------------------- +/// Macro definitions. + +.macro mulacc z, u, v, x=nil, y=nil + // Set Z = Z + U V + X Y. X may be `nil' to omit the second + // operand. Z should be a 128-bit `qN' register; V and Y should be + // 64-bit `dN' registers; and U and X should be 32-bit `dN[I]' + // scalars; the multiplications produce two 64-bit elementwise + // products, which are added elementwise to Z. + + vmlal.u32 \z, \v, \u + .ifnes "\x", "nil" + vmlal.u32 \z, \y, \x + .endif +.endm + +.macro mulinit z, zinitp, u, v, x, y + // If ZINITP then set Z = Z + U V + X Y, as for `mulacc'; otherwise, + // set Z = U V + X Y. Operand requirements and detailed operation + // are as for `mulacc'. 
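+	// (ZINITP is matched textually: the `MULI' wrappers below pass the
+	// literal symbol `t' to accumulate into Z, and `nil' to overwrite
+	// it.)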
+ + .ifeqs "\zinitp", "t" + mulacc \z, \u, \v, \x, \y + .else + vmull.u32 \z, \v, \u + .ifnes "\x", "nil" + vmlal.u32 \z, \y, \x + .endif + .endif +.endm + +// `MULI': accumulate the B^I and b B^i terms of the polynomial product sum U +// V + X Y, given that U = u_0 + B u_1 + B^2 u_2 + B^3 u_3 (and similarly for +// x), and V = v'_0 + b v''_0 + B (v'_1 + b v''_1) + B^2 (v'_2 + b v''_2) + +// B^3 (v'_3 + b v''_3) (and similarly for Y). The 64-bit coefficients are +// added into the low and high halves of the 128-bit register Z (if ZINIT is +// `nil' then simply set Z, as if it were initially zero). +#define MUL0(z, zinitp, u, v0, v1, x, y0, y1) \ + mulinit z, zinitp, QW(u, 0), D0(v0), QW(x, 0), D0(y0) +#define MUL1(z, zinitp, u, v0, v1, x, y0, y1) \ + mulinit z, zinitp, QW(u, 0), D1(v0), QW(x, 0), D1(y0); \ + mulacc z, QW(u, 1), D0(v0), QW(x, 1), D0(y0) +#define MUL2(z, zinitp, u, v0, v1, x, y0, y1) \ + mulinit z, zinitp, QW(u, 0), D0(v1), QW(x, 0), D0(y1); \ + mulacc z, QW(u, 1), D1(v0), QW(x, 1), D1(y0); \ + mulacc z, QW(u, 2), D0(v0), QW(x, 2), D0(y0) +#define MUL3(z, zinitp, u, v0, v1, x, y0, y1) \ + mulinit z, zinitp, QW(u, 0), D1(v1), QW(x, 0), D1(y1); \ + mulacc z, QW(u, 1), D0(v1), QW(x, 1), D0(y1); \ + mulacc z, QW(u, 2), D1(v0), QW(x, 2), D1(y0); \ + mulacc z, QW(u, 3), D0(v0), QW(x, 3), D0(y0) +#define MUL4(z, zinitp, u, v0, v1, x, y0, y1) \ + mulinit z, zinitp, QW(u, 1), D1(v1), QW(x, 1), D1(y1); \ + mulacc z, QW(u, 2), D0(v1), QW(x, 2), D0(y1); \ + mulacc z, QW(u, 3), D1(v0), QW(x, 3), D1(y0) +#define MUL5(z, zinitp, u, v0, v1, x, y0, y1) \ + mulinit z, zinitp, QW(u, 2), D1(v1), QW(x, 2), D1(y1); \ + mulacc z, QW(u, 3), D0(v1), QW(x, 3), D0(y1) +#define MUL6(z, zinitp, u, v0, v1, x, y0, y1) \ + mulinit z, zinitp, QW(u, 3), D1(v1), QW(x, 3), D1(y1) + +// Steps in the process of propagating carry bits from ZLO to ZHI (both +// 128-bit `qN' registers). Here, T is a 128-bit `qN' temporary register. +// Set the low 32 bits of the 64-bit `dN' register ZOUT to the completed +// coefficient z_i. +// +// In detail, what happens is as follows. Suppose initially that ZLO = +// (z'_i; z''_i) and ZHI = (z'_{i+1}; z''_{i+1}). Let t = z'_i + b z''_i; +// observe that floor(t/b) = floor(z'_i/b) + z''_i. Let z_i = t mod B, and +// add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}. This has +// a circuit depth of 4; I don't know how to do better. +.macro _carry0 zout, zlo0, zlo1, t0, t1 + // ZLO0 and ZLO1 are the low and high halves of a carry register. + // Extract a 32-bit output, in the bottom 32 bits of ZOUT, and set T1 + // so as to continue processing using `_carry1'. All operands are + // 64-bit `dN' registers. If ZOUT is `nil' then no output is + // produced; if T1 is `nil' then no further processing will be + // possible. + .ifnes "\zout", "nil" + vshl.i64 \t0, \zlo1, #16 + .endif + .ifnes "\t1", "nil" + vshr.u64 \t1, \zlo0, #16 + .endif + .ifnes "\zout", "nil" + vadd.i64 \zout, \zlo0, \t0 + .endif + .ifnes "\t1", "nil" + vadd.i64 \t1, \t1, \zlo1 + .endif +.endm +.macro _carry1 u, zhi0, t1 + // ZHI0 is the low half of a carry register, and T1 is the result of + // applying `_carry0' to the previous carry register. Set U to the + // result of propagating the carry into ZHI0. + vshr.u64 \t1, \t1, #16 + vadd.i64 \u, \zhi0, \t1 +.endm + +// More convenient wrappers for `_carry0' and `_carry1'. +// +// CARRY0(ZOUT, ZLO, T) +// Store a 32-bit output in ZOUT from carry ZLO, using T as a +// temporary. ZOUT is a 64-bit `dN' register; ZLO and T are 128-bit +// `qN' registers. 
+// +// CARRY1(ZHI, T) +// Propagate carry from T into ZHI. Both are 128-bit `qN' registers; +// ZHI is updated. +#define CARRY0(zout, zlo, t) \ + CASIDE0(zout, D0(zlo), zlo, t) +#define CARRY1(zhi, t) \ + CASIDE1(D0(zhi), zhi, t) + +// Versions of `CARRY0' and `CARRY1' which don't mutate their operands. +// +// CASIDE0(ZOUT, U, ZLO, T) +// As for `CARRY0', but the low half of ZLO is actually in U (a 64-bit +// `dN' register). +// +// CASIDE0E(ZOUT, U, ZLO, T) +// As for `CASIDE0', but only calculate the output word, and no +// carry-out. +// +// CASIDE1(U, ZHI, T) +// As for `CARRY1', but write the updated low half of ZHI to U. +#define CASIDE0(zout, u, zlo, t) \ + _carry0 zout, u, D1(zlo), D0(t), D1(t) +#define CASIDE0E(zout, u, zlo, t) \ + _carry0 zout, u, D1(zlo), D0(t), nil +#define CASIDE1(u, zhi, t) \ + _carry1 u, D0(zhi), D1(t) + +// Steps in spreading a packed 128-bit operand in A0 across A0, A1, A2, A3 in +// carry format. +#define SPREADACC0(a0, a1, a2, a3) \ + vmov.i32 a1, #0; \ + vmov.i32 a2, #0; \ + vmov.i32 a3, #0 +#define SPREADACC1(a0, a1, a2, a3) \ + vswp D1(a0), D0(a2); \ + vtrn.32 D0(a0), D0(a1); \ + vtrn.32 D0(a2), D0(a3) + +// Add the carry-format values A0, A1, A2 into the existing carries C0, C1, +// C2 (leaving A3 where it is). +#define CARRYACC(a0, a1, a2, a3, c0, c1, c2) \ + vadd.i64 c0, c0, a0; \ + vadd.i64 c1, c1, a1; \ + vadd.i64 c2, c2, a2 + +///-------------------------------------------------------------------------- +/// Primitive multipliers and related utilities. + +INTFUNC(carryprop) + // On entry, r0 points to a destination, and q13--q15 hold incoming + // carries c0--c2. On exit, the low 128 bits of the carry value are + // stored at [r0]; the remaining 16 bits of carry are left in d30; r0 + // is advanced by 16; and q10--q14 are clobbered. + endprologue + + CARRY0(D0(q10), q13, q12) + CARRY1(q14, q12) + CARRY0(D0(q11), q14, q12) + CARRY1(q15, q12) + CARRY0(D1(q10), q15, q12) + vshr.u64 D1(q11), D1(q12), #16 + vshr.u64 D0(q15), D1(q12), #48 + vtrn.32 q10, q11 + vst1.32 {q10}, [r0]! + bx r14 +ENDFUNC + +INTFUNC(dmul4) + // On entry, r0 points to the destination; r1 and r2 point to packed + // operands U and X; q2/q3 and q4/q5 hold expanded operands V and Y; + // and q13--q15 hold incoming carries c0--c2. On exit, the + // destination and carries are updated; r0, r1, r2 are each advanced + // by 16; q2--q5 are preserved; and the other NEON registers are + // clobbered. + endprologue + + // Start by loading the operand words from memory. + vld1.32 {q0}, [r1]! + vld1.32 {q1}, [r2]! + + // Do the multiplication. + MUL0(q13, t, q0, q2, q3, q1, q4, q5) + MUL1(q14, t, q0, q2, q3, q1, q4, q5) + CARRY0(D0(q8), q13, q6) + MUL2(q15, t, q0, q2, q3, q1, q4, q5) + CARRY1(q14, q6) + CARRY0(D0(q9), q14, q6) + MUL3(q12, nil, q0, q2, q3, q1, q4, q5) + CARRY1(q15, q6) + CARRY0(D1(q8), q15, q6) + MUL4(q13, nil, q0, q2, q3, q1, q4, q5) + CARRY1(q12, q6) + CARRY0(D1(q9), q12, q6) + MUL5(q14, nil, q0, q2, q3, q1, q4, q5) + CARRY1(q13, q6) + MUL6(q15, nil, q0, q2, q3, q1, q4, q5) + + // Finish up and store the result. + vtrn.32 q8, q9 + vst1.32 {q8}, [r0]! + + // All done. + bx r14 +ENDFUNC + +INTFUNC(dmla4) + // On entry, r0 points to the destination/accumulator; r1 and r2 + // point to packed operands U and X; q2/q3 and q4/q5 hold expanded + // operands V and Y; and q13--q15 hold incoming carries c0--c2. On + // exit, the accumulator and carries are updated; r0, r1, r2 are each + // advanced by 16; q2--q5 are preserved; and the other NEON registers + // are clobbered. 
+ endprologue + + // Start by loading the operand words from memory. + vld1.32 {q9}, [r0] + SPREADACC0(q9, q10, q11, q12) + vld1.32 {q0}, [r1]! + vld1.32 {q1}, [r2]! + + // Add the accumulator input to the incoming carries. Split the + // accumulator into four pieces and add the carries onto them. + SPREADACC1(q9, q10, q11, q12) + CARRYACC(q9, q10, q11, q12, q13, q14, q15) + + // Do the multiplication. + MUL0(q13, t, q0, q2, q3, q1, q4, q5) + MUL1(q14, t, q0, q2, q3, q1, q4, q5) + CARRY0(D0(q8), q13, q6) + MUL2(q15, t, q0, q2, q3, q1, q4, q5) + CARRY1(q14, q6) + CARRY0(D0(q9), q14, q6) + MUL3(q12, t, q0, q2, q3, q1, q4, q5) + CARRY1(q15, q6) + CARRY0(D1(q8), q15, q6) + MUL4(q13, nil, q0, q2, q3, q1, q4, q5) + CARRY1(q12, q6) + CARRY0(D1(q9), q12, q6) + MUL5(q14, nil, q0, q2, q3, q1, q4, q5) + CARRY1(q13, q6) + MUL6(q15, nil, q0, q2, q3, q1, q4, q5) + + // Finish up and store the result. + vtrn.32 q8, q9 + vst1.32 {q8}, [r0]! + + // All done. + bx r14 +ENDFUNC + +INTFUNC(mul4) + // On entry, r0 points to the destination; r2 points to a packed + // operand X; q4/q5 holds an expanded operand Y; and q13--q15 hold + // incoming carries c0--c2. On exit, the destination and carries are + // updated; r0 and r2 are each advanced by 16; q4 and q5 are + // preserved; and the other NEON registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + vld1.32 {q1}, [r2]! + + // Do the multiplication. + MUL0(q13, t, q1, q4, q5, nil, nil, nil) + MUL1(q14, t, q1, q4, q5, nil, nil, nil) + CARRY0(D0(q8), q13, q6) + MUL2(q15, t, q1, q4, q5, nil, nil, nil) + CARRY1(q14, q6) + CARRY0(D0(q9), q14, q6) + MUL3(q12, nil, q1, q4, q5, nil, nil, nil) + CARRY1(q15, q6) + CARRY0(D1(q8), q15, q6) + MUL4(q13, nil, q1, q4, q5, nil, nil, nil) + CARRY1(q12, q6) + CARRY0(D1(q9), q12, q6) + MUL5(q14, nil, q1, q4, q5, nil, nil, nil) + CARRY1(q13, q6) + MUL6(q15, nil, q1, q4, q5, nil, nil, nil) + + // Finish up and store the result. + vtrn.32 q8, q9 + vst1.32 {q8}, [r0]! + + // All done. + bx r14 +ENDFUNC + +INTFUNC(mul4zc) + // On entry, r0 points to the destination; r2 points to a packed + // operand X; and q4/q5 holds an expanded operand Y. On exit, the + // destination is updated; q13--q15 hold outgoing carries c0--c2; r0 + // and r2 are each advanced by 16; q4 and q5 are preserved; and the + // other NEON registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + vld1.32 {q1}, [r2]! + + // Do the multiplication. + MUL0(q13, nil, q1, q4, q5, nil, nil, nil) + MUL1(q14, nil, q1, q4, q5, nil, nil, nil) + CARRY0(D0(q8), q13, q6) + MUL2(q15, nil, q1, q4, q5, nil, nil, nil) + CARRY1(q14, q6) + CARRY0(D0(q9), q14, q6) + MUL3(q12, nil, q1, q4, q5, nil, nil, nil) + CARRY1(q15, q6) + CARRY0(D1(q8), q15, q6) + MUL4(q13, nil, q1, q4, q5, nil, nil, nil) + CARRY1(q12, q6) + CARRY0(D1(q9), q12, q6) + MUL5(q14, nil, q1, q4, q5, nil, nil, nil) + CARRY1(q13, q6) + MUL6(q15, nil, q1, q4, q5, nil, nil, nil) + + // Finish up and store the result. + vtrn.32 q8, q9 + vst1.32 {q8}, [r0]! + + // All done. + bx r14 +ENDFUNC + +INTFUNC(mla4) + // On entry, r0 points to the destination/accumulator; r2 points to a + // packed operand X; q4/q5 holds an expanded operand Y; and q13--q15 + // hold incoming carries c0--c2. On exit, the accumulator and + // carries are updated; r0 and r2 are each advanced by 16; q4 and q5 + // are preserved; and the other NEON registers are clobbered. + endprologue + + // Start by loading the operand words from memory. 
+ vld1.32 {q9}, [r0] + SPREADACC0(q9, q10, q11, q12) + vld1.32 {q1}, [r2]! + + // Add the accumulator input to the incoming carries. Split the + // accumulator into four pieces and add the carries onto them. + SPREADACC1(q9, q10, q11, q12) + CARRYACC(q9, q10, q11, q12, q13, q14, q15) + + // Do the multiplication. +mla4_common: + MUL0(q13, t, q1, q4, q5, nil, nil, nil) + MUL1(q14, t, q1, q4, q5, nil, nil, nil) + CARRY0(D0(q8), q13, q6) + MUL2(q15, t, q1, q4, q5, nil, nil, nil) + CARRY1(q14, q6) + CARRY0(D0(q9), q14, q6) + MUL3(q12, t, q1, q4, q5, nil, nil, nil) + CARRY1(q15, q6) + CARRY0(D1(q8), q15, q6) + MUL4(q13, nil, q1, q4, q5, nil, nil, nil) + CARRY1(q12, q6) + CARRY0(D1(q9), q12, q6) + MUL5(q14, nil, q1, q4, q5, nil, nil, nil) + CARRY1(q13, q6) + MUL6(q15, nil, q1, q4, q5, nil, nil, nil) + + // Finish up and store the result. + vtrn.32 q8, q9 + vst1.32 {q8}, [r0]! + + // All done. + bx r14 +ENDFUNC + +INTFUNC(mla4zc) + // On entry, r0 points to the destination/accumulator; r2 points to a + // packed operand X; and q4/q5 holds an expanded operand Y. On exit, + // the accumulator is updated; q13--q15 hold outgoing carries c0--c2; + // r0 and r2 are each advanced by 16; q4 and q5 are preserved; and + // the other NEON registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + vld1.32 {q13}, [r0] + SPREADACC0(q13, q14, q15, q12) + vld1.32 {q1}, [r2]! + + // Move the accumulator input to the incoming carry slots. Split the + // accumulator into four pieces. + SPREADACC1(q13, q14, q15, q12) + + b mla4_common +ENDFUNC + +INTFUNC(mmul4) + // On entry, r0 points to the destination; r1 points to a packed + // operand U; r2 points to a packed operand X (the modulus); q2/q3 + // holds an expanded operand V; and q4/q5 holds an expanded operand M + // (the Montgomery factor -N^{-1} (mod B)). On exit, the destination + // is updated (to zero); q4/q5 hold an expanded factor Y = U V M (mod + // B); q13--q15 hold outgoing carries c0--c2; r0, r1, and r2 are each + // advanced by 16; q2 and q3 are preserved; and the other NEON + // registers are clobbered. + + // Start by loading the operand words from memory. + vld1.32 {q0}, [r1]! + + // Calculate the low half of W = A + U V, being careful to leave the + // carries in place. + MUL0(q13, nil, q0, q2, q3, nil, nil, nil) + MUL1(q14, nil, q0, q2, q3, nil, nil, nil) + CARRY0(D0(q6), q13, q8) + MUL2(q15, nil, q0, q2, q3, nil, nil, nil) + CASIDE1(D0(q9), q14, q8) + CASIDE0(D0(q7), D0(q9), q14, q8) + MUL3(q12, nil, q0, q2, q3, nil, nil, nil) + b mmla4_common +ENDFUNC + +INTFUNC(mmla4) + // On entry, r0 points to the destination/accumulator A; r1 points to + // a packed operand U; r2 points to a packed operand X (the modulus); + // q2/q3 holds an expanded operand V; and q4/q5 holds an expanded + // operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the + // accumulator is updated (to zero); q4/q5 hold an expanded factor Y + // = (A + U V) M (mod B); q13--q15 hold outgoing carries c0--c2; r0, + // r1, and r2 are each advanced by 16; q2 and q3 are preserved; and + // the other NEON registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + vld1.32 {q13}, [r0] + SPREADACC0(q13, q14, q15, q12) + vld1.32 {q0}, [r1]! + + // Move the accumulator input to the incoming carry slots. Split the + // accumulator into four pieces. + SPREADACC1(q13, q14, q15, q12) + + // Calculate the low half of W = A + U V, being careful to leave the + // carries in place. 
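+	// (The Montgomery factor Y computed further down satisfies
+	// W + X Y == 0 (mod B^4), so the four low output words are zero;
+	// hence the accumulator is finally written back as zero.)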
+ MUL0(q13, t, q0, q2, q3, nil, nil, nil) + MUL1(q14, t, q0, q2, q3, nil, nil, nil) + CARRY0(D0(q6), q13, q8) + MUL2(q15, t, q0, q2, q3, nil, nil, nil) + CASIDE1(D0(q9), q14, q8) + CASIDE0(D0(q7), D0(q9), q14, q8) + MUL3(q12, t, q0, q2, q3, nil, nil, nil) +mmla4_common: + CASIDE1(D0(q9), q15, q8) + CASIDE0(D1(q6), D0(q9), q15, q8) + CASIDE1(D0(q9), q12, q8) + CASIDE0E(D1(q7), D0(q9), q12, q8) + vtrn.32 q6, q7 + + // Calculate the low half of the Montgomery factor Y = W M. At this + // point, registers are a little tight. + MUL0( q8, nil, q6, q4, q5, nil, nil, nil) + MUL1( q9, nil, q6, q4, q5, nil, nil, nil) + CARRY0(D0(q8), q8, q1) + MUL2(q10, nil, q6, q4, q5, nil, nil, nil) + CARRY1(q9, q1) + CARRY0(D0(q9), q9, q1) + MUL3(q11, nil, q6, q4, q5, nil, nil, nil) + CARRY1(q10, q1) + CARRY0(D1(q8), q10, q1) + vmov.i32 q5, #0 + CARRY1(q11, q1) + CARRY0(D1(q9), q11, q1) + vld1.32 {q1}, [r2]! + vtrn.32 q8, q9 + + // Expand Y. We'll put it in its proper place a bit later. + vzip.16 q8, q5 + + // Build up the product X Y in the carry slots. + MUL0(q13, t, q1, q8, q5, nil, nil, nil) + MUL1(q14, t, q1, q8, q5, nil, nil, nil) + CARRY0(nil, q13, q9) + MUL2(q15, t, q1, q8, q5, nil, nil, nil) + CARRY1(q14, q9) + vmov q4, q8 + CARRY0(nil, q14, q9) + MUL3(q12, t, q1, q8, q5, nil, nil, nil) + CARRY1(q15, q9) + CARRY0(nil, q15, q9) + vmov.u32 q6, #0 + + // And complete the calculation. + MUL4(q13, nil, q0, q2, q3, q1, q4, q5) + CARRY1(q12, q9) + CARRY0(nil, q12, q9) + MUL5(q14, nil, q0, q2, q3, q1, q4, q5) + CARRY1(q13, q9) + MUL6(q15, nil, q0, q2, q3, q1, q4, q5) + + // Finish up and store the result. + vst1.32 {q6}, [r0]! + + // All done. + bx r14 +ENDFUNC + +INTFUNC(mont4) + // On entry, r0 points to the destination/accumulator A; r2 points to + // a packed operand X (the modulus); and q2/q3 holds an expanded + // operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the + // accumulator is updated (to zero); q4/q5 hold an expanded factor Y + // = A M (mod B); q13--q15 hold outgoing carries c0--c2; r0 and r2 + // are each advanced by 16; q2 and q3 are preserved; and the other + // NEON registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + vld1.32 {q0}, [r0] + vld1.32 {q1}, [r2]! + + // Calculate Y = A M (mod B). + MUL0(q8, nil, q0, q2, q3, nil, nil, nil) + MUL1(q9, nil, q0, q2, q3, nil, nil, nil) + CARRY0(D0(q4), q8, q6) + MUL2(q10, nil, q0, q2, q3, nil, nil, nil) + CARRY1(q9, q6) + vmov q13, q0 + CARRY0(D0(q7), q9, q6) + MUL3(q11, nil, q0, q2, q3, nil, nil, nil) + CARRY1(q10, q6) + CARRY0(D1(q4), q10, q6) + SPREADACC0(q13, q14, q15, q12) + CARRY1(q11, q6) + CARRY0(D1(q7), q11, q6) + SPREADACC1(q13, q14, q15, q12) + vmov.i32 q5, #0 + vtrn.32 q4, q7 + vzip.16 q4, q5 + + // Calculate the actual result. Well, the carries, at least. + vmov.i32 q8, #0 + MUL0(q13, t, q1, q4, q5, nil, nil, nil) + MUL1(q14, t, q1, q4, q5, nil, nil, nil) + CARRY0(nil, q13, q6) + MUL2(q15, t, q1, q4, q5, nil, nil, nil) + CARRY1(q14, q6) + CARRY0(nil, q14, q6) + MUL3(q12, t, q1, q4, q5, nil, nil, nil) + CARRY1(q15, q6) + CARRY0(nil, q15, q6) + MUL4(q13, nil, q1, q4, q5, nil, nil, nil) + CARRY1(q12, q6) + CARRY0(nil, q12, q6) + MUL5(q14, nil, q1, q4, q5, nil, nil, nil) + CARRY1(q13, q6) + MUL6(q15, nil, q1, q4, q5, nil, nil, nil) + + // Finish up and store the result. + vst1.32 {q8}, [r0]! + + // All done. + bx r14 +ENDFUNC + +///-------------------------------------------------------------------------- +/// Bulk multipliers. 
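+///
+/// The functions below drive the four-word primitives above across
+/// variable-length operands.  As a rough orientation only -- the real
+/// control flow is the assembler which follows, and the names refer to
+/// the internal subroutines above, not C functions -- the plain
+/// multiplication driver behaves like this pseudocode:
+///
+///	for (i = 0; i < len(bv); i += 4) {	// one pass per block of bv
+///		expand bv[i .. i+3] into Y; rewind av; point dv at dv0 + i;
+///		(i == 0 ? mul4zc : mla4zc);	// first block of av, set carries
+///		for (j = 4; j < len(av); j += 4)
+///			(i == 0 ? mul4 : mla4);	// remaining blocks of av
+///		carryprop;			// write out the leftover carry
+///	}
+///
+/// The Montgomery multiplier has the same shape, using mmul4/dmul4 on the
+/// first pass and mmla4/dmla4 on later ones; the reduce-only function
+/// drives mont4/mla4 the same way, followed by a scalar carry-propagation
+/// pass over the rest of dv.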
+ +FUNC(mpx_umul4_arm_neon) + // void mpx_umul4_arm_neon(mpw *dv, const mpw *av, const mpw *avl, + // const mpw *bv, const mpw *bvl); + + // Establish the arguments and do initial setup. + // + // inner loop dv r0 + // inner loop av r2 + // outer loop dv r5 + // outer loop bv r3 + // av base r1 + // av limit r12 + // bv limit r4 + pushreg r4, r5, r14 + pushvfp QQ(q4, q7) + endprologue + + // Prepare for the first iteration. + vld1.32 {q4}, [r3]! // = Y = bv[0] + vmov.i32 q5, #0 + // r0 // = dv for inner loop + // r1 // = av base + // r3 // = bv for outer loop + ldr r4, [sp, #76] // = bv limit + mov r12, r2 // = av limit + mov r2, r1 // = av for inner loop + add r5, r0, #16 // = dv for outer loop + vzip.16 q4, q5 // expand Y + bl mul4zc + cmp r2, r12 // all done? + bhs 8f + + // Continue with the first iteration. +0: bl mul4 + cmp r2, r12 // all done? + blo 0b + + // Write out the leftover carry. There can be no tail here. +8: bl carryprop + cmp r3, r4 // more passes to do? + bhs 9f + + // Set up for the next pass. +1: vld1.32 {q4}, [r3]! // = Y = bv[i] + vmov.i32 q5, #0 + mov r0, r5 // -> dv[i] + mov r2, r1 // -> av[0] + add r5, r5, #16 + vzip.16 q4, q5 // expand Y + bl mla4zc + cmp r2, r12 // done yet? + bhs 8f + + // Continue... +0: bl mla4 + cmp r2, r12 + blo 0b + + // Finish off this pass. There was no tail on the previous pass, and + // there can be done on this pass. +8: bl carryprop + cmp r3, r4 + blo 1b + + // All over. +9: popvfp QQ(q4, q7) + popreg r4, r5, r14 + bx r14 +ENDFUNC + +FUNC(mpxmont_mul4_arm_neon) + // void mpxmont_mul4_arm_neon(mpw *dv, const mpw *av, const mpw *bv, + // const mpw *nv, size_t n, const mpw *mi); + + // Establish the arguments and do initial setup. + // + // inner loop dv r0 + // inner loop av r1 + // inner loop nv r2 + // mi r5 + // outer loop dv r6 + // outer loop bv r7 + // av base r8 + // av limit r9 + // bv limit r4 + // nv base r3 + // n r4 + // c r10 + // 0 r12 + + pushreg r4-r10, r14 + pushvfp QQ(q4, q7) + endprologue + + // Establish the expanded operands. + ldrd r4, r5, [sp, #96] // r4 = n; r5 -> mi + vld1.32 {q2}, [r2] // = V = bv[0] + vmov.i32 q3, #0 + vmov.i32 q5, #0 + vld1.32 {q4}, [r5] // = M + + // Set up the outer loop state and prepare for the first iteration. + // r0 // -> dv for inner loop + // r1 // -> av for inner loop + add r7, r2, #16 // -> bv + // r3 // -> nv + add r6, r0, #16 // -> dv + mov r8, r1 // -> av + add r9, r1, r4, lsl #2 // -> av limit + add r4, r2, r4, lsl #2 // -> bv limit + mov r2, r3 // -> nv for inner loop + mov r12, #0 // = 0 + + vzip.16 q2, q3 // expand V + vzip.16 q4, q5 // expand M + bl mmul4 + cmp r1, r9 // done already? + bhs 8f + + // Complete the first inner loop. +0: bl dmul4 + cmp r1, r9 // done yet? + blo 0b + + // Still have carries left to propagate. Rather than store the tail + // end in memory, keep it in a general-purpose register for later. + bl carryprop + vmov.32 r10, QW(q15, 0) + + // Embark on the next iteration. (There must be one. If n = 1 then + // we would have bailed above, to label 8. Similarly, the subsequent + // iterations can fall into the inner loop immediately.) +1: vld1.32 {q2}, [r7]! // = V = bv[i] + vld1.32 {q4}, [r5] // = M + vmov.i32 q3, #0 + vmov.i32 q5, #0 + mov r0, r6 // -> dv[i] + add r6, r6, #16 + mov r1, r8 // -> av[0] + mov r2, r3 // -> nv[0] + vzip.16 q2, q3 // expand V + vzip.16 q4, q5 // expand M + bl mmla4 + + // Complete the next inner loop. +0: bl dmla4 + cmp r1, r9 // done yet? 
+ blo 0b + + // Still have carries left to propagate, and they overlap the + // previous iteration's final tail, so read that and add it. + cmp r7, r4 + vmov.32 D0(q12), r10, r12 + vadd.i64 D0(q13), D0(q13), D0(q12) + bl carryprop + vmov.32 r10, QW(q15, 0) + + // Back again, maybe. + blo 1b + + // All done, almost. + str r10, [r0], #4 + popvfp QQ(q4, q7) + popreg r4-r10, r14 + bx r14 + + // First iteration was short. Write out the carries and we're done. + // (This could be folded into the main loop structure, but that would + // penalize small numbers more.) +8: bl carryprop + vst1.32 {QW(q15, 0)}, [r0]! + popvfp QQ(q4, q7) + popreg r4-r10, r14 + bx r14 +ENDFUNC + +FUNC(mpxmont_redc4_arm_neon) + // void mpxmont_redc4_arm_neon(mpw *dv, mpw *dvl, const mpw *nv, + // size_t n, const mpw *mi); + + // Establish the arguments and do initial setup. + // + // inner loop dv r0 + // dv limit r1 + // inner loop nv r2 + // blocks-of-4 dv limit r3 + // mi (r14) + // outer loop dv r4 + // outer loop dv limit r5 + // nv base r6 + // nv limit r7 + // n r3 + // c (r14) + // t0, t1, t2, t3 r2, r8, r9, r10 + // 0 r12 + + pushreg r4-r10, r14 + pushvfp QQ(q4, q7) + endprologue + + // Set up the outer loop state and prepare for the first iteration. + ldr r14, [sp, #96] // -> mi + vmov.i32 q3, #0 + sub r12, r1, r0 // total dv bytes + // r0 // -> dv for inner loop + // r1 // -> overall dv limit + // r2 // -> nv for inner loop + // r3 // = n (for now) + add r4, r0, #16 // -> dv for outer loop + add r5, r0, r3, lsl #2 // -> dv limit + bic r12, r12, #15 // dv blocks of 4 + vld1.32 {q2}, [r14] // = M + mov r6, r2 // -> nv + add r7, r2, r3, lsl #2 // -> nv limit + add r3, r0, r12 // -> dv blocks-of-4 limit + vzip.16 q2, q3 // expand M + mov r12, #0 // = 0 + bl mont4 + cmp r2, r7 // done already? + bhs 8f + +5: bl mla4 + cmp r2, r7 // done yet? + blo 5b + + // Still have carries left to propagate. Adding the accumulator + // block into the carries is a little different this time, because + // all four accumulator limbs have to be squished into the three + // carry registers for `carryprop' to do its thing. +8: vld1.32 {q9}, [r0] + SPREADACC0(q9, q10, q11, q12) + SPREADACC1(q9, q10, q11, q12) + vshl.u64 D0(q12), D0(q12), #16 + CARRYACC(q9, q10, q11, q12, q13, q14, q15) + vadd.u64 D1(q15), D1(q15), D0(q12) + + bl carryprop + vmov.32 r14, QW(q15, 0) + cmp r0, r3 + bhs 7f + + // Propagate the first group of carries. + ldmia r0, {r2, r8-r10} + adds r2, r2, r14 + adcs r8, r8, #0 + adcs r9, r9, #0 + adcs r10, r10, #0 + stmia r0!, {r2, r8-r10} + teq r0, r3 + beq 6f + + // Continue carry propagation until the end of the buffer. +0: ldmia r0, {r2, r8-r10} + adcs r2, r2, #0 + adcs r8, r8, #0 + adcs r9, r9, #0 + adcs r10, r10, #0 + stmia r0!, {r2, r8-r10} + teq r0, r3 + bne 0b + + // Deal with the tail end. Note that the actual destination length + // won't be an exacty number of blocks of four, so it's safe to just + // drop through here. +6: adc r14, r12, #0 +7: ldr r2, [r0] + adds r2, r2, r14 + str r2, [r0], #4 + teq r0, r1 + beq 8f +0: ldr r2, [r0] + adcs r2, r2, #0 + str r2, [r0], #4 + teq r0, r1 + bne 0b + + // All done for this iteration. Start the next. +8: cmp r4, r5 + bhs 9f + mov r0, r4 + add r4, r4, #16 + mov r2, r6 + bl mont4 + b 5b + + // All over. +9: popvfp QQ(q4, q7) + popreg r4-r10, r14 + bx r14 +ENDFUNC + +///-------------------------------------------------------------------------- +/// Testing and performance measurement. 
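+///
+/// When built with `TEST_MUL4', the wrappers below expose the internal
+/// subroutines above to the test rig in `mpx-mul4-test.c'.  The table
+/// below records where each wrapper finds its arguments; reading the
+/// `dmul' column, for example, `test_dmul4' receives the destination z
+/// and the carry block c (three 128-bit carries) in r0 and r1, the packed
+/// operands u and x in r2 and r3, and vv, yy, n, and cyv on the stack --
+/// i.e. a prototype along the lines of (types assumed here purely for
+/// illustration)
+///
+///	void test_dmul4(mpw *z, mpw *c, const mpw *u, const mpw *x,
+///			const mpw *vv, const mpw *yy,
+///			size_t n, mpw *cyv);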
+ +#ifdef TEST_MUL4 + +// dmul smul mmul mont +// z r0 r0 r0 r0 r0 +// c r4 r1 r1 r1 r1 +// y r3 -- -- r2 r2 +// u r1 r2 -- r3 -- +// x r2 r3 r2 stk0 r3 +// vv q2/q3 stk0 -- stk1 stk0 +// yy q4/q5 stk1 r3 stk2 -- +// n r5 stk2 stk0 stk3 stk1 +// cyv r6 stk3 stk1 stk4 stk2 + +#define STKARG(i) sp, #80 + 4*(i) + +.macro testprologue mode + pushreg r4-r6, r14 + pushvfp QQ(q4, q7) + endprologue + + .ifeqs "\mode", "dmul" + mov r4, r1 // -> c + mov r1, r2 // -> u + mov r2, r3 // -> x + + ldr r14, [STKARG(0)] // -> vv + vld1.32 {q2}, [r14] + vmov.i32 q3, #0 + vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1) + + ldr r14, [STKARG(1)] // -> yy + vld1.32 {q4}, [r14] + vmov.i32 q5, #0 + vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1) + + ldr r5, [STKARG(2)] // = n + ldr r6, [STKARG(3)] // -> cyv + .endif + + .ifeqs "\mode", "smul" + mov r4, r1 // -> c + // r2 // -> x + + vld1.32 {q4}, [r3] + vmov.i32 q5, #0 + vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1) + + ldr r5, [STKARG(0)] // = n + ldr r6, [STKARG(1)] // -> cyv + .endif + + .ifeqs "\mode", "mmul" + mov r4, r1 // -> c + mov r1, r3 // -> u + mov r3, r2 // -> y + ldr r2, [STKARG(0)] // -> x + + ldr r14, [STKARG(1)] // -> vv + vld1.32 {q2}, [r14] + vmov.i32 q3, #0 + vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1) + + ldr r14, [STKARG(2)] // -> yy + vld1.32 {q4}, [r14] + vmov.i32 q5, #0 + vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1) + + ldr r5, [STKARG(3)] // = n + ldr r6, [STKARG(4)] // -> cyv + .endif + + .ifeqs "\mode", "mont" + mov r4, r1 // -> c + mov r1, r3 // -> u + mov r14, r2 + mov r2, r3 // -> x + mov r3, r14 // -> y + + ldr r14, [STKARG(0)] // -> vv + vld1.32 {q2}, [r14] + vmov.i32 q3, #0 + vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1) + + ldr r5, [STKARG(1)] // = n + ldr r6, [STKARG(2)] // -> cyv + .endif +.endm + +.macro testldcarry + vldmia r4, {QQ(q13, q15)} // c0, c1, c2 +.endm + +.macro testtop +0: subs r5, r5, #1 +.endm + +.macro testtail + bne 0b +.endm + +.macro testcarryout + vstmia r4, {QQ(q13, q15)} +.endm + +.macro testepilogue + popvfp QQ(q4, q7) + popreg r4-r6, r14 + bx r14 +.endm + +FUNC(test_dmul4) + testprologue dmul + testldcarry + testtop + bl dmul4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_dmla4) + testprologue dmul + testldcarry + testtop + bl dmla4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mul4) + testprologue smul + testldcarry + testtop + bl mul4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mul4zc) + testprologue smul + testldcarry + testtop + bl mul4zc + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mla4) + testprologue smul + testldcarry + testtop + bl mla4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mla4zc) + testprologue smul + testldcarry + testtop + bl mla4zc + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mmul4) + testprologue mmul + testtop + bl mmul4 + testtail + vst1.32 {q4, q5}, [r3] + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mmla4) + testprologue mmul + testtop + bl mmla4 + testtail + vst1.32 {q4, q5}, [r3] + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mont4) + testprologue mont + testtop + bl mont4 + testtail + vst1.32 {q4, q5}, [r3] + testcarryout + testepilogue +ENDFUNC + +#endif + +///----- That's all, folks -------------------------------------------------- diff --git a/math/mpx-mul4-arm64-simd.S b/math/mpx-mul4-arm64-simd.S new file mode 100644 index 00000000..8f482dd8 --- /dev/null +++ b/math/mpx-mul4-arm64-simd.S @@ -0,0 +1,1217 @@ +/// -*- mode: asm; asm-comment-char: ?/ -*- +/// +/// 
Large SIMD-based multiplications +/// +/// (c) 2019 Straylight/Edgeware +/// + +///----- Licensing notice --------------------------------------------------- +/// +/// This file is part of Catacomb. +/// +/// Catacomb is free software: you can redistribute it and/or modify it +/// under the terms of the GNU Library General Public License as published +/// by the Free Software Foundation; either version 2 of the License, or +/// (at your option) any later version. +/// +/// Catacomb is distributed in the hope that it will be useful, but +/// WITHOUT ANY WARRANTY; without even the implied warranty of +/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +/// Library General Public License for more details. +/// +/// You should have received a copy of the GNU Library General Public +/// License along with Catacomb. If not, write to the Free Software +/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, +/// USA. + +///-------------------------------------------------------------------------- +/// Preliminaries. + +#include "config.h" +#include "asm-common.h" + + .text + +///-------------------------------------------------------------------------- +/// Theory. +/// +/// We define a number of primitive fixed-size multipliers from which we can +/// construct more general variable-length multipliers. +/// +/// The basic trick is the same throughout. In an operand-scanning +/// multiplication, the inner multiplication loop multiplies a multiple- +/// precision operand by a single precision factor, and adds the result, +/// appropriately shifted, to the result. A `finely integrated operand +/// scanning' implementation of Montgomery multiplication also adds the +/// product of a single-precision `Montgomery factor' and the modulus, +/// calculated in the same pass. The more common `coarsely integrated +/// operand scanning' alternates main multiplication and Montgomery passes, +/// which requires additional carry propagation. +/// +/// Throughout both plain-multiplication and Montgomery stages, then, one of +/// the factors remains constant throughout the operation, so we can afford +/// to take a little time to preprocess it. The transformation we perform is +/// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a +/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into +/// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit +/// pieces are placed into 32-bit cells, and arranged as two 128-bit SIMD +/// operands, as follows. +/// +/// Offset 0 4 8 12 +/// 0 v'_0 v''_0 v'_1 v''_1 +/// 16 v'_2 v''_2 v'_3 v''_3 +/// +/// The `umull' and `umlal' instructions can multiply a vector of two 32-bit +/// values by a 32-bit scalar, giving two 64-bit results; thus, it will act +/// on (say) v'_0 and v''_0 in a single instruction, to produce two 48-bit +/// results in 64-bit fields. The sixteen bits of headroom allows us to add +/// many products together before we must deal with carrying; it also allows +/// for some calculations to be performed on the above expanded form. +/// +/// We maintain three `carry' registers, v28--v30, accumulating intermediate +/// results; we name them `c0', `c1', and `c2'. Each carry register holds +/// two 64-bit halves: the register c0, for example, holds c'_0 (low half) +/// and c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the +/// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2. 
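+/// (For instance, c0 lives in v28, so `umlal v28.2d, v2.2s, v0.s[0]'
+/// adds the two products u_0 v'_0 and u_0 v''_0 elementwise into its two
+/// halves.)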
+/// The `umull' or `umlal' instruction acting on a scalar operand and an +/// operand in the expanded form above produces a result which can be added +/// directly to the appropriate carry register. +/// +/// An unusual feature of this code, as compared to the other `mul4' +/// implementations, is that it makes extensive use of the ARM64 +/// general-purpose registers for carry resolution and output construction. +/// As a result, an additional general-purpose register (typically x15) is +/// used as an additional carry, with the carry value in bits 16--63. +/// +/// Multiplication is performed in product-scanning order, since ARM +/// processors commonly implement result forwarding for consecutive multiply- +/// and-accumulate instructions specifying the same destination. +/// Experimentally, this runs faster than operand-scanning in an attempt to +/// hide instruction latencies. +/// +/// On 64-bit ARM, we have a vast supply number of registers: the expanded +/// operands are kept in registers. The packed operands are read from memory +/// into working registers v0 and v1. The following conventional argument +/// names and locations are used throughout. +/// +/// Arg Format Location Notes +/// +/// U packed [x1] +/// X packed [x2] In Montgomery multiplication, X = N +/// V expanded v2/v3 +/// Y expanded v4/v5 In Montgomery multiplication, Y = (A + U V) M +/// M expanded v6/v7 -N^{-1} (mod B^4) +/// N Modulus, for Montgomery multiplication +/// A packed [x0] Destination/accumulator +/// C carry v28--v30 +/// 0 v31 128-bit zero +/// +/// The calculation is some variant of +/// +/// A' + C' B^4 <- U V + X Y + A + C +/// +/// The low-level functions fit into a fairly traditional (finely-integrated) +/// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y) +/// (indexed by i). +/// +/// The variants are as follows. +/// +/// Function Variant Use i j +/// +/// mmul4 A = C = 0 Montgomery 0 0 +/// dmul4 A = 0 Montgomery 0 + +/// mmla4 C = 0 Montgomery + 0 +/// dmla4 exactly as shown Montgomery + + +/// +/// mul4zc U = V = A = C = 0 Plain 0 0 +/// mul4 U = V = A = 0 Plain 0 + +/// mla4zc U = V = C = 0 Plain + 0 +/// mla4 U = V = 0 Plain + + +/// +/// The `mmul4' and `mmla4' functions are also responsible for calculating +/// the Montgomery reduction factor Y = (A + U V) M used by the rest of the +/// inner loop. + +///-------------------------------------------------------------------------- +/// Macro definitions. + +.macro mulacc z, u, v, x=nil, y=nil + // Set Z = Z + U V + X Y, using the low halves of V and Y. Y may be + // `nil' to omit the second operand. Z, V, and Y should be 128-bit + // `vN' registers; and U and X should be 32-bit `vN.s[I]' scalars; + // the multiplications produce two 64-bit elementwise products, which + // are added elementwise to Z. + + umlal \z\().2d, \v\().2s, \u + .ifnes "\y", "nil" + umlal \z\().2d, \y\().2s, \x + .endif +.endm + +.macro mulacc2 z, u, v, x=nil, y=nil + // Set Z = Z + U V + X Y, using the high halves of V and Y; see + // `mulacc'. + + umlal2 \z\().2d, \v\().4s, \u + .ifnes "\y", "nil" + umlal2 \z\().2d, \y\().4s, \x + .endif +.endm + +.macro mulinit z, zinitp, u, v=nil, x=nil, y=nil + // If ZINITP then set Z = Z + U V + X Y, as for `mulacc'; otherwise, + // set Z = U V + X Y. Operand requirements and detailed operation + // are as for `mulacc'. 
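+	// (ZINITP is matched textually: the `mulI' wrappers below pass the
+	// literal symbol `t' to accumulate into Z, and `nil' to overwrite
+	// it.)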
+ + .ifeqs "\zinitp", "t" + mulacc \z, \u, \v, \x, \y + .else + umull \z\().2d, \v\().2s, \u + .ifnes "\y", "nil" + umlal \z\().2d, \y\().2s, \x + .endif + .endif +.endm + +.macro mulini2 z, zinitp, u, v=nil, x=nil, y=nil + // As `mulinit', but with the high halves of V and Y. + + .ifeqs "\zinitp", "t" + mulacc2 \z, \u, \v, \x, \y + .else + umull2 \z\().2d, \v\().4s, \u + .ifnes "\y", "nil" + umlal2 \z\().2d, \y\().4s, \x + .endif + .endif +.endm + +// `mulI': accumulate the B^I and b B^i terms of the polynomial product sum U +// V + X Y, given that U = u_0 + B u_1 + B^2 u_2 + B^3 u_3 (and similarly for +// x), and V = v'_0 + b v''_0 + B (v'_1 + b v''_1) + B^2 (v'_2 + b v''_2) + +// B^3 (v'_3 + b v''_3) (and similarly for Y). The 64-bit coefficients are +// added into the low and high halves of the 128-bit register Z (if ZINIT is +// `nil' then simply set Z, as if it were initially zero). +.macro mul0 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil + mulinit \z, \zinitp, \u\().s[0], \v0, \x\().s[0], \y0 +.endm +.macro mul1 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil + mulini2 \z, \zinitp, \u\().s[0], \v0, \x\().s[0], \y0 + mulacc \z, \u\().s[1], \v0, \x\().s[1], \y0 +.endm +.macro mul2 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil + mulinit \z, \zinitp, \u\().s[0], \v1, \x\().s[0], \y1 + mulacc2 \z, \u\().s[1], \v0, \x\().s[1], \y0 + mulacc \z, \u\().s[2], \v0, \x\().s[2], \y0 +.endm +.macro mul3 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil + mulini2 \z, \zinitp, \u\().s[0], \v1, \x\().s[0], \y1 + mulacc \z, \u\().s[1], \v1, \x\().s[1], \y1 + mulacc2 \z, \u\().s[2], \v0, \x\().s[2], \y0 + mulacc \z, \u\().s[3], \v0, \x\().s[3], \y0 +.endm +.macro mul4 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil + mulini2 \z, \zinitp, \u\().s[1], \v1, \x\().s[1], \y1 + mulacc \z, \u\().s[2], \v1, \x\().s[2], \y1 + mulacc2 \z, \u\().s[3], \v0, \x\().s[3], \y0 +.endm +.macro mul5 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil + mulini2 \z, \zinitp, \u\().s[2], \v1, \x\().s[2], \y1 + mulacc \z, \u\().s[3], \v1, \x\().s[3], \y1 +.endm +.macro mul6 z, zinitp, u, v0, v1, x=nil, y0=nil, y1=nil + mulini2 \z, \zinitp, \u\().s[3], \v1, \x\().s[3], \y1 +.endm + +// Steps in the process of propagating carry bits upwards from ZLO (a 128-bit +// `vN' register). Here, T0, T1, and CG are 64-bit `xN' general-purpose +// registers clobbered in the process. Set the low 32 bits of the 64-bit +// `xN' general-purpose register ZOUT to the completed coefficient z_1, +// leaving a carry in CG. +// +// In detail, what happens is as follows. Suppose initially that ZLO = +// (z'_i; z''_i) and ZHI = (z'_{i+1}; z''_{i+1}). Let t = z'_i + b z''_i; +// observe that floor(t/b) = floor(z'_i/b) + z''_i. Let z_i = t mod B, and +// add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}. This has +// a circuit depth of 3; I don't know how to do better. +// +// Output words are left in the low half of a 64-bit register, with rubbish +// in the high half. Two such results can be combined using the `bfi' +// instruction. +.macro carry0 zlo, cg=x15, t0=x16, t1=x17 + // Capture the values of carry-register ZLO and CG (if not `nil') in + // general-purpose registers T0 and T1, suitable for use in `carry1'. + mov \t0, \zlo\().d[0] + mov \t1, \zlo\().d[1] + .ifnes "\cg", "nil" + add \t0, \t0, \cg, lsr #16 + .endif +.endm +.macro carry1 zout, cg=x15, t0=x16, t1=x17 + // Collect a 32-bit output word in the low 32 bits of ZOUT (leaving + // rubbish in the high 32 bits), and update CG suitably to continue + // processing with the next carry register. 
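+	// (The `...4' routines below use this pattern heavily: successive
+	// carry0/carry1 pairs leave output words in x11--x14, which are
+	// then glued together with `bfi' and stored in pairs with `stp'.)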
+ .ifnes "\zout", "nil" + add \zout, \t0, \t1, lsl #16 + .endif + .ifnes "\cg", "nil" + add \cg, \t1, \t0, lsr #16 + .endif +.endm + +.macro expand vlo, vhi, vz=v31 + // Expand the packed 128-bit operand in VLO to an expanded operand in + // VLO and VHI, assuming that VZ is all-bits-zero. All three are + // `vN' 128-bit SIMD registers. + zip2 \vhi\().8h, \vlo\().8h, \vz\().8h + zip1 \vlo\().8h, \vlo\().8h, \vz\().8h +.endm + +.macro sprdacc a0, a1, a2, a3=nil + // Spread the packed 128-bit operand in A0 into carry-format values + // in A0, A1, A2, A3. If A3 is `nil', then spread the same value + // into A0, A1, A2 only, clobbering x16. + .ifeqs "\a3", "nil" + mov w16, \a0\().s[3] + .endif + trn2 \a2\().2d, \a0\().2d, v31.2d + trn2 \a1\().2s, \a0\().2s, v31.2s + .ifeqs "\a3", "nil" + lsl x16, x16, #16 + .endif + trn1 \a0\().2s, \a0\().2s, v31.2s + .ifeqs "\a3", "nil" + mov \a2\().d[1], x16 + .else + trn2 \a3\().2s, \a2\().2s, v31.2s + .endif + mov \a2\().s[1], wzr +.endm + +.macro crryacc a0, a1, a2, a3, c0, c1, c2 + // Add the carry-format values A0, A1, A2 into the existing carries + // C0, C1, C2 (leaving A3 where it is). + add \c0\().2d, \c0\().2d, \a0\().2d + add \c1\().2d, \c1\().2d, \a1\().2d + add \c2\().2d, \c2\().2d, \a2\().2d +.endm + +///-------------------------------------------------------------------------- +/// Primitive multipliers and related utilities. + +INTFUNC(carryprop) + // On entry, x0 points to a destination, and v28--v30 and x15 hold + // incoming carries c0--c2 and cg. On exit, the low 128 bits of the + // carry value are stored at [x0]; the remaining 16 bits of carry are + // left in x10; x0 is advanced by 16; and x11--x17 are clobbered. + endprologue + + carry0 v28 + carry1 x11 + carry0 v29 + carry1 x13 + carry0 v30 + carry1 x12 + bfi x11, x13, #32, #32 + lsr x14, x15, #16 + lsr x10, x15, #48 + bfi x12, x14, #32, #32 + stp x11, x12, [x0], #16 + ret +ENDFUNC + +INTFUNC(dmul4) + // On entry, x0 points to the destination; x1 and x2 point to packed + // operands U and X; v2/v3 and v4/v5 hold expanded operands V and Y; + // v28--v30 and x15 hold incoming carries c0--c2 and cg; and v31 is + // zero. On exit, the destination and carries are updated; x0, x1, + // x2 are each advanced by 16; v2--v5 and v8--v15 are preserved; and + // x11--x14, x16, x17 and the other SIMD registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + ldr q0, [x1], #16 + ldr q1, [x2], #16 + + // Do the multiplication. + mul0 v28, t, v0, v2, v3, v1, v4, v5 + mul1 v29, t, v0, v2, v3, v1, v4, v5 + carry0 v28 + mul2 v30, t, v0, v2, v3, v1, v4, v5 + carry1 x11 + carry0 v29 + mul3 v27, nil, v0, v2, v3, v1, v4, v5 + carry1 x13 + carry0 v30 + mul4 v28, nil, v0, v2, v3, v1, v4, v5 + carry1 x12 + carry0 v27 + mul5 v29, nil, v0, v2, v3, v1, v4, v5 + carry1 x14 + mul6 v30, nil, v0, v2, v3, v1, v4, v5 + + // Finish up and store the result. + bfi x11, x13, #32, #32 + bfi x12, x14, #32, #32 + stp x11, x12, [x0], #16 + + // All done. + ret +ENDFUNC + +INTFUNC(dmla4) + // On entry, x0 points to the destination/accumulator A; x1 and x2 + // point to packed operands U and X; v2/v3 and v4/v5 hold expanded + // operands V and Y; v28--v30 and x15 hold incoming carries c0--c2 + // and cg; and v31 is zero. On exit, the accumulator and carries are + // updated; x0, x1, x2 are each advanced by 16; v2--v5 and v8--v15 + // are preserved; and x11--x14, x16, x17 and the other SIMD registers + // are clobbered. + endprologue + + // Start by loading the operand words from memory. 
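+	// (The accumulator A is loaded into v24; `sprdacc' splits it into
+	// carry format and `crryacc' folds the pieces into the incoming
+	// carries v28--v30 before the multiplication proper begins.)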
+ ldr q24, [x0] + ldr q0, [x1], #16 + ldr q1, [x2], #16 + sprdacc v24, v25, v26, v27 + crryacc v24, v25, v26, v27, v28, v29, v30 + + // Do the multiplication. + mul0 v28, t, v0, v2, v3, v1, v4, v5 + mul1 v29, t, v0, v2, v3, v1, v4, v5 + carry0 v28 + mul2 v30, t, v0, v2, v3, v1, v4, v5 + carry1 x11 + carry0 v29 + mul3 v27, t, v0, v2, v3, v1, v4, v5 + carry1 x13 + carry0 v30 + mul4 v28, nil, v0, v2, v3, v1, v4, v5 + carry1 x12 + carry0 v27 + mul5 v29, nil, v0, v2, v3, v1, v4, v5 + carry1 x14 + mul6 v30, nil, v0, v2, v3, v1, v4, v5 + + // Finish up and store the result. + bfi x11, x13, #32, #32 + bfi x12, x14, #32, #32 + stp x11, x12, [x0], #16 + + // All done. + ret +ENDFUNC + +INTFUNC(mul4) + // On entry, x0 points to the destination; x2 points to a packed + // operand X; v4/v5 holds an expanded operand Y; v13--v15 and x15 + // hold incoming carries c0--c2 and cg; and v31 is zero. On exit, + // the destination and carries are updated; x0 and x2 are each + // advanced by 16; v4 and v5 and v8--v15 are preserved; and x11--x14, + // x16, x17 and the other SIMD registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + ldr q1, [x2], #16 + + // Do the multiplication. + mul0 v28, t, v1, v4, v5 + mul1 v29, t, v1, v4, v5 + carry0 v28 + mul2 v30, t, v1, v4, v5 + carry1 x11 + carry0 v29 + mul3 v27, nil, v1, v4, v5 + carry1 x13 + carry0 v30 + mul4 v28, nil, v1, v4, v5 + carry1 x12 + carry0 v27 + mul5 v29, nil, v1, v4, v5 + carry1 x14 + mul6 v30, nil, v1, v4, v5 + + // Finish up and store the result. + bfi x11, x13, #32, #32 + bfi x12, x14, #32, #32 + stp x11, x12, [x0], #16 + + // All done. + ret +ENDFUNC + +INTFUNC(mul4zc) + // On entry, x0 points to the destination; x2 points to a packed + // operand X; v4/v5 holds an expanded operand Y; and v31 is zero. On + // exit, the destination is updated; v28--v30 and x15 hold outgoing + // carries c0--c2 and cg; x0 and x2 are each advanced by 16; v4 and + // v5 and v8--v15 are preserved; and x11--x14, x16, x17 and the other + // SIMD registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + ldr q1, [x2], #16 + + // Do the multiplication. + mul0 v28, nil, v1, v4, v5 + mul1 v29, nil, v1, v4, v5 + carry0 v28, nil + mul2 v30, nil, v1, v4, v5 + carry1 x11 + carry0 v29 + mul3 v27, nil, v1, v4, v5 + carry1 x13 + carry0 v30 + mul4 v28, nil, v1, v4, v5 + carry1 x12 + carry0 v27 + mul5 v29, nil, v1, v4, v5 + carry1 x14 + mul6 v30, nil, v1, v4, v5 + + // Finish up and store the result. + bfi x11, x13, #32, #32 + bfi x12, x14, #32, #32 + stp x11, x12, [x0], #16 + + // All done. + ret +ENDFUNC + +INTFUNC(mla4) + // On entry, x0 points to the destination/accumulator A; x2 points to + // a packed operand X; v4/v5 holds an expanded operand Y; v13--v15 + // and x15 hold incoming carries c0--c2 and cg; and v31 is zero. On + // exit, the accumulator and carries are updated; x0 and x2 are each + // advanced by 16; v4 and v5 and v8--v15 are preserved; and x11--x14, + // x16, x17 and the other SIMD registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + ldr q24, [x0] + ldr q1, [x2], #16 + sprdacc v24, v25, v26, v27 + crryacc v24, v25, v26, v27, v28, v29, v30 + + // Do the multiplication. 
+ mul0 v28, t, v1, v4, v5 + mul1 v29, t, v1, v4, v5 + carry0 v28 + mul2 v30, t, v1, v4, v5 + carry1 x11 + carry0 v29 + mul3 v27, t, v1, v4, v5 + carry1 x13 + carry0 v30 + mul4 v28, nil, v1, v4, v5 + carry1 x12 + carry0 v27 + mul5 v29, nil, v1, v4, v5 + carry1 x14 + mul6 v30, nil, v1, v4, v5 + + // Finish up and store the result. + bfi x11, x13, #32, #32 + bfi x12, x14, #32, #32 + stp x11, x12, [x0], #16 + + // All done. + ret +ENDFUNC + +INTFUNC(mla4zc) + // On entry, x0 points to the destination/accumulator A; x2 points to + // a packed operand X; v4/v5 holds an expanded operand Y; and v31 is + // zero. On exit, the accumulator is updated; v28--v30 and x15 hold + // outgoing carries c0--c2 and cg; x0 and x2 are each advanced by 16; + // v4, v5, and v8--v15 are preserved; and x11--x14, x16, x17 and the + // other SIMD registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + ldr q28, [x0] + ldr q1, [x2], #16 + sprdacc v28, v29, v30, v27 + + // Do the multiplication. + mul0 v28, t, v1, v4, v5 + mul1 v29, t, v1, v4, v5 + carry0 v28, nil + mul2 v30, t, v1, v4, v5 + carry1 x11 + carry0 v29 + mul3 v27, t, v1, v4, v5 + carry1 x13 + carry0 v30 + mul4 v28, nil, v1, v4, v5 + carry1 x12 + carry0 v27 + mul5 v29, nil, v1, v4, v5 + carry1 x14 + mul6 v30, nil, v1, v4, v5 + + // Finish up and store the result. + bfi x11, x13, #32, #32 + bfi x12, x14, #32, #32 + stp x11, x12, [x0], #16 + + // All done. + ret +ENDFUNC + +INTFUNC(mmul4) + // On entry, x0 points to the destination; x1 points to a packed + // operand U; x2 points to a packed operand X (the modulus); v2/v3 + // holds an expanded operand V; and v6/v7 holds an expanded operand M + // (the Montgomery factor -N^{-1} (mod B)). On exit, the destination + // is updated (to zero); v4/v5 hold an expanded factor Y = U V M (mod + // B); v28--v30 and x15 hold outgoing carries c0--c2 and cg; x0, x1, + // and x2 are each advanced by 16; v2, v3, and v8--v15 are preserved; + // and x11--x14, x16, x17 and the other SIMD registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + ldr q0, [x1], #16 + ldr q1, [x2], #16 + + // Calculate the low half of W = A + U V, being careful to leave the + // carries in place. + mul0 v28, nil, v0, v2, v3 + mul1 v29, nil, v0, v2, v3 + carry0 v28, nil + mul2 v30, nil, v0, v2, v3 + carry1 x11 + carry0 v29 + mul3 v27, nil, v0, v2, v3 + b mmla4_common +ENDFUNC + +INTFUNC(mmla4) + // On entry, x0 points to the destination/accumulator A; x1 points to + // a packed operand U; x2 points to a packed operand X (the modulus); + // v2/v3 holds an expanded operand V; and v6/v7 holds an expanded + // operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the + // accumulator is updated (to zero); v4/v5 hold an expanded factor Y + // = (A + U V) M (mod B); v28--v30 and x15 hold outgoing carries + // c0--c2 and cg; x0, x1, and x2 are each advanced by 16; v2, v3, v6, + // v7, and v8--v15 are preserved; and x11--x14, x16, x17 and the + // other SIMD registers are clobbered. + endprologue + + // Start by loading the operand words from memory. + ldr q28, [x0] + ldr q0, [x1], #16 + ldr q1, [x2], #16 + sprdacc v28, v29, v30, v27 + + // Calculate the low half of W = A + U V, being careful to leave the + // carries in place. 
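+	// (The Montgomery factor Y computed further down satisfies
+	// W + X Y == 0 (mod B^4), so the four low output words are zero;
+	// hence the accumulator is finally written back as zero with
+	// `stp xzr, xzr'.)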
+ mul0 v28, t, v0, v2, v3 + mul1 v29, t, v0, v2, v3 + carry0 v28, nil + mul2 v30, t, v0, v2, v3 + carry1 x11 + carry0 v29 + mul3 v27, t, v0, v2, v3 +mmla4_common: + carry1 x13 + carry0 v30 + carry1 x12 + carry0 v27 + carry1 x14, nil + + // Piece the result together and ship it back. + bfi x11, x13, #32, #32 + bfi x12, x14, #32, #32 + mov v16.d[0], x11 + mov v16.d[1], x12 + + // Calculate the low half of the Montgomery factor Y = W M. + mul0 v18, nil, v16, v6, v7 + mul1 v19, nil, v16, v6, v7 + carry0 v18, nil + mul2 v20, nil, v16, v6, v7 + carry1 x11 + carry0 v19 + mul3 v21, nil, v16, v6, v7 + carry1 x13 + carry0 v20 + carry1 x12 + carry0 v21 + carry1 x14, nil + + // Piece the result together, ship it back, and expand. + bfi x11, x13, #32, #32 + bfi x12, x14, #32, #32 + mov v4.d[0], x11 + mov v4.d[1], x12 + expand v4, v5 + + // Build up the product X Y in the carry slots. + mul0 v28, t, v1, v4, v5 + mul1 v29, t, v1, v4, v5 + carry0 v28, nil + mul2 v30, t, v1, v4, v5 + carry1 nil + carry0 v29 + mul3 v27, t, v1, v4, v5 + carry1 nil + carry0 v30 + + // And complete the calculation. + mul4 v28, nil, v0, v2, v3, v1, v4, v5 + carry1 nil + carry0 v27 + mul5 v29, nil, v0, v2, v3, v1, v4, v5 + carry1 nil + mul6 v30, nil, v0, v2, v3, v1, v4, v5 + + // Finish up and store the result. + stp xzr, xzr, [x0], #16 + + // All done. + ret +ENDFUNC + +INTFUNC(mont4) + // On entry, x0 points to the destination/accumulator A; x2 points to + // a packed operand X (the modulus); and v6/v7 holds an expanded + // operand M (the Montgomery factor -N^{-1} (mod B)). On exit, the + // accumulator is updated (to zero); v4/v5 hold an expanded factor Y + // = A M (mod B); v28--v30 and x15 hold outgoing carries c0--c2 and + // cg; x0 and x2 are each advanced by 16; v6, v7, and v8--v15 are + // preserved; and x11--x14, x16, x17 and the other SIMD registers are + // clobbered. + endprologue + + // Start by loading the operand words from memory. + ldr q28, [x0] + ldr q1, [x2], #16 + + // Calculate Y = A M (mod B). + mul0 v18, nil, v28, v6, v7 + mul1 v19, nil, v28, v6, v7 + carry0 v18, nil + mul2 v20, nil, v28, v6, v7 + carry1 x11 + carry0 v19 + mul3 v21, nil, v28, v6, v7 + carry1 x13 + carry0 v20 + sprdacc v28, v29, v30, v27 + carry1 x12 + carry0 v21 + carry1 x14, nil + + // Piece the result together, ship it back, and expand. + bfi x11, x13, #32, #32 + bfi x12, x14, #32, #32 + mov v4.d[0], x11 + mov v4.d[1], x12 + expand v4, v5 + + // Calculate the actual result. Well, the carries, at least. + mul0 v28, t, v1, v4, v5 + mul1 v29, t, v1, v4, v5 + carry0 v28, nil + mul2 v30, t, v1, v4, v5 + carry1 nil + carry0 v29 + mul3 v27, t, v1, v4, v5 + carry1 nil + carry0 v30 + + // And complete the calculation. + mul4 v28, nil, v1, v4, v5 + carry1 nil + carry0 v27 + mul5 v29, nil, v1, v4, v5 + carry1 nil + mul6 v30, nil, v1, v4, v5 + + // Finish up and store the result. + stp xzr, xzr, [x0], #16 + + // All done. + ret +ENDFUNC + +///-------------------------------------------------------------------------- +/// Bulk multipliers. + +FUNC(mpx_umul4_arm64_simd) + // void mpx_umul4_arm64_simd(mpw *dv, const mpw *av, const mpw *avl, + // const mpw *bv, const mpw *bvl); + + // Establish the arguments and do initial setup. + // + // inner loop dv x0 + // inner loop av x2 + // outer loop dv x5 + // outer loop bv x3 + // av base x1 + // inner n x6 + // n base x7 + // outer n x4 + pushreg x29, x30 + setfp + endprologue + + // Prepare for the first iteration. 
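+ // (Each outer-loop pass multiplies the whole of av by one four-word
+ // block of bv and adds the result into dv: the first pass goes
+ // through mul4zc/mul4, which overwrite the destination, and later
+ // passes go through mla4zc/mla4, which accumulate into it.)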
+ ldr q4, [x3], #16 // Y = bv[0]
+ movi v31.4s, #0
+ sub x7, x2, x1 // = inner loop count base
+ // x0 // = dv for inner loop
+ // x1 // = av base
+ // x3 // = bv for outer loop
+ sub x4, x4, x3 // = outer loop count (decremented)
+ sub x6, x7, #16 // = inner loop count (decremented)
+ mov x2, x1 // = av for inner loop
+ add x5, x0, #16 // = dv for outer loop
+ expand v4, v5 // expand Y
+ bl mul4zc
+ cbz x6, 8f // all done?
+
+ // Continue with the first iteration.
+0: sub x6, x6, #16
+ bl mul4
+ cbnz x6, 0b
+
+ // Write out the leftover carry. There can be no tail here.
+8: bl carryprop
+ cbz x4, 9f
+
+ // Set up for the next pass.
+1: ldr q4, [x3], #16 // Y = bv[i]
+ mov x0, x5 // -> dv[i]
+ mov x2, x1 // -> av[0]
+ add x5, x5, #16
+ sub x6, x7, #16 // = inner loop count (decremented)
+ sub x4, x4, #16 // outer loop count
+ expand v4, v5 // expand Y
+ bl mla4zc
+ cbz x6, 8f
+
+ // Continue...
+0: sub x6, x6, #16
+ bl mla4
+ cbnz x6, 0b
+
+ // Finish off this pass. There was no tail on the previous pass, and
+ // there can be none on this pass.
+8: bl carryprop
+ cbnz x4, 1b
+
+ // All over.
+9: popreg x29, x30
+ ret
+ENDFUNC
+
+FUNC(mpxmont_mul4_arm64_simd)
+ // void mpxmont_mul4_arm64_simd(mpw *dv,
+ // const mpw *av, const mpw *bv,
+ // const mpw *nv, size_t n,
+ // const mpw *mi);
+
+ // Establish the arguments and do initial setup.
+ //
+ // inner loop dv x0
+ // inner loop av x1
+ // inner loop nv x2
+ // nv base x3
+ // base n x4
+ // mi (x5)
+ // outer loop dv x5
+ // outer loop bv x6
+ // av base x7
+ // inner n x8
+ // outer n x9
+ // c x10
+ pushreg x29, x30
+ setfp
+ endprologue
+
+ // Set up the outer loop state and prepare for the first iteration.
+ ldr q2, [x2] // = V = bv[0]
+ ldr q6, [x5] // = M
+ movi v31.4s, #0
+ // x0 // -> dv for inner loop
+ // x1 // -> av for inner loop
+ // x3 // -> nv base
+ // x4 // = n base
+ add x5, x0, #16 // -> dv
+ add x6, x2, #16 // -> bv
+ mov x2, x3 // -> nv[0]
+ mov x7, x1 // -> av base
+ sub x8, x4, #4 // = inner n (decremented)
+ sub x9, x4, #4 // = outer n (decremented)
+ expand v2, v3 // expand V
+ expand v6, v7 // expand M
+ bl mmul4
+ cbz x8, 8f // done already?
+
+ // Complete the first inner loop.
+0: sub x8, x8, #4
+ bl dmul4
+ cbnz x8, 0b // done yet?
+
+ // Still have carries left to propagate. Rather than store the tail
+ // end in memory, keep it in x10 for later.
+ bl carryprop
+
+ // Embark on the next iteration. (There must be one. If n = 1 then
+ // we would have bailed above, to label 8. Similarly, the subsequent
+ // iterations can fall into the inner loop immediately.)
+1: ldr q2, [x6], #16 // = V = bv[i]
+ mov x0, x5 // -> dv[i]
+ mov x1, x7 // -> av[0]
+ mov x2, x3 // -> nv[0]
+ add x5, x5, #16
+ sub x8, x4, #4
+ sub x9, x9, #4
+ expand v2, v3
+ bl mmla4
+
+ // Complete the next inner loop.
+0: sub x8, x8, #4
+ bl dmla4
+ cbnz x8, 0b
+
+ // Still have carries left to propagate, and they overlap the
+ // previous iteration's final tail, so read that and add it.
+ add x15, x15, x10, lsl #16
+ bl carryprop
+
+ // Back again, maybe.
+ cbnz x9, 1b
+
+ // All done, almost.
+ str w10, [x0], #4
+ popreg x29, x30
+ ret
+
+ // First iteration was short. Write out the carries and we're done.
+ // (This could be folded into the main loop structure, but that would
+ // penalize small numbers more.)
+8: bl carryprop
+ str w10, [x0], #4
+ popreg x29, x30
+ ret
+ENDFUNC
+
+FUNC(mpxmont_redc4_arm64_simd)
+ // void mpxmont_redc4_arm64_simd(mpw *dv, mpw *dvl, const mpw *nv,
+ // size_t n, const mpw *mi);
+
+ // Establish the arguments and do initial setup.
+ //
+ // inner loop dv x0
+ // inner loop nv x2
+ // blocks-of-4 count x6
+ // tail count x7
+ // mi (x4)
+ // outer loop dv x4
+ // outer loop count x8
+ // nv base x5
+ // inner loop count x1
+ // n x3
+ // c x10
+ // t0, t1 x11, x12
+
+ pushreg x29, x30
+ setfp
+ endprologue
+
+ // Set up the outer loop state and prepare for the first iteration.
+ ldr q6, [x4] // = M
+ movi v31.4s, #0
+ // x0 // -> dv for inner loop
+ sub x6, x1, x0 // total dv bytes
+ sub x1, x3, #4 // inner loop counter
+ // x2 // -> nv for inner loop
+ // x3 // = n
+ add x4, x0, #16 // -> dv for outer loop
+ mov x5, x2 // -> nv base
+ sub x6, x6, x3, lsl #2 // dv carry range bytes
+ sub x8, x3, #4 // outer loop counter
+ sub x6, x6, #16 // dv steam-powered carry bytes
+ expand v6, v7 // expand M
+ and x7, x6, #15 // dv tail length in bytes
+ bic x6, x6, #15 // dv blocks-of-four length in bytes
+
+ bl mont4
+ cbz x1, 8f // done already?
+
+5: sub x1, x1, #4
+ bl mla4
+ cbnz x1, 5b // done yet?
+
+ // Still have carries left to propagate. Adding the accumulator
+ // block into the carries is a little different this time, because
+ // all four accumulator limbs have to be squished into the three
+ // carry registers for `carryprop' to do its thing.
+8: ldr q24, [x0]
+ sprdacc v24, v25, v26
+ add v28.2d, v28.2d, v24.2d
+ add v29.2d, v29.2d, v25.2d
+ add v30.2d, v30.2d, v26.2d
+ bl carryprop
+ cbz x6, 7f
+
+ // Propagate the first group of carries.
+ ldp x16, x17, [x0]
+ sub x1, x6, #16
+ adds x16, x16, x10
+ adcs x17, x17, xzr
+ stp x16, x17, [x0], #16
+ cbz x1, 6f
+
+ // Continue carry propagation until the end of the buffer.
+0: ldp x16, x17, [x0]
+ sub x1, x1, #16
+ adcs x16, x16, xzr
+ adcs x17, x17, xzr
+ stp x16, x17, [x0], #16
+ cbnz x1, 0b
+
+ // Deal with the tail end. Note that the actual destination length
+ // won't be an exact number of blocks of four, so it's safe to just
+ // drop through here.
+6: adc w10, wzr, wzr
+7: ldr w16, [x0]
+ sub x1, x7, #4
+ adds w16, w16, w10
+ str w16, [x0], #4
+ cbz x1, 8f
+0: ldr w16, [x0]
+ sub x1, x1, #4
+ adcs w16, w16, wzr
+ str w16, [x0], #4
+ cbnz x1, 0b
+
+ // All done for this iteration. Start the next.
+8: cbz x8, 9f
+ mov x0, x4
+ add x4, x4, #16
+ sub x1, x3, #4
+ mov x2, x5
+ sub x8, x8, #4
+ sub x6, x6, #16
+ bl mont4
+ b 5b
+
+ // All over.
+9: popreg x29, x30
+ ret
+ENDFUNC
+
+///--------------------------------------------------------------------------
+/// Testing and performance measurement.
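+
+// The wrappers below are built only when TEST_MUL4 is defined: they let
+// the test driver (mpx-mul4-test.c) exercise the internal block
+// operations one at a time. Each `testprologue' variant shuffles the
+// incoming C arguments into the fixed register assignment tabulated
+// below.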
+ +#ifdef TEST_MUL4 + +// dmul smul mmul mont +// z x0 x0 x0 x0 x0 +// c x3 x1 x1 x1 x1 +// y x4 -- -- x2 x2 +// u x1 x2 -- x3 -- +// x x2 x3 x2 x4 x3 +// vv v2/v3 x4 -- x5 -- +// yy v4/v5 x5 x3 x6 -- +// mm v6/v7 -- -- -- x4 +// n x5 x6 x4 x7 x5 +// cyv x6 x7 x5 stk0 x6 + +#define STKARG(i) sp, #16 + i + +.macro testprologue mode + pushreg x29, x30 + setfp + endprologue + movi v31.4s, #0 + + .ifeqs "\mode", "dmul" + ldr q2, [x4] + zip2 v3.8h, v2.8h, v31.8h // (v'_2, v''_2; v'_3, v''_3) + zip1 v2.8h, v2.8h, v31.8h // (v'_0, v''_0; v'_1, v''_1) + + ldr q4, [x5] + zip2 v5.8h, v4.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3) + zip1 v4.8h, v4.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1) + + mov x16, x1 + mov x1, x2 // -> u + mov x2, x3 // -> x + mov x3, x16 // -> c + + mov x5, x6 // = n + mov x6, x7 // -> cyv + .endif + + .ifeqs "\mode", "smul" + ldr q4, [x3] + zip2 v5.8h, v4.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3) + zip1 v4.8h, v4.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1) + + // x2 // -> x + mov x3, x1 // -> c + mov x6, x5 // -> cyv + mov x5, x4 // = n + .endif + + .ifeqs "\mode", "mmul" + ldr q2, [x5] + zip2 v3.8h, v2.8h, v31.8h // (v'_2, v''_2; v'_3, v''_3) + zip1 v2.8h, v2.8h, v31.8h // (v'_0, v''_0; v'_1, v''_1) + + ldr q6, [x6] + zip2 v7.8h, v6.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3) + zip1 v6.8h, v6.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1) + + mov x16, x1 + mov x1, x3 // -> u + mov x3, x16 // -> c + + mov x16, x2 + mov x2, x4 // -> x + mov x4, x16 // -> y + + mov x5, x7 // = n + ldr x6, [STKARG(0)] // -> cyv + .endif + + .ifeqs "\mode", "mont" + ldr q6, [x4] + zip2 v7.8h, v6.8h, v31.8h // (m'_2, m''_2; m'_3, m''_3) + zip1 v6.8h, v6.8h, v31.8h // (m'_0, m''_0; m'_1, m''_1) + + mov x4, x2 // -> y + mov x2, x3 // -> x + mov x3, x1 // -> c + + // x5 // = n + // x6 // -> cyv + .endif +.endm + +.macro testldcarry + ld1 {v28.2d-v30.2d}, [x3] + mov x15, #0 +.endm + +.macro testtop +0: sub x5, x5, #1 +.endm + +.macro testtail + cbnz x5, 0b +.endm + +.macro testcarryout + // More complicated than usual because we must mix the general- + // purpose carry back in. 
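+ // The general-purpose part of the carry (cg) lives in x15; fold it
+ // into the low-order SIMD carry register and then write all three
+ // carries back through x3 for the driver to check.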
+ lsr x15, x15, #16 + mov v0.d[0], x15 + mov v0.d[1], xzr + add v28.2d, v28.2d, v0.2d + st1 {v28.2d-v30.2d}, [x3] +.endm + +.macro testepilogue + popreg x29, x30 + ret +.endm + +FUNC(test_dmul4) + testprologue dmul + testldcarry + testtop + bl dmul4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_dmla4) + testprologue dmul + testldcarry + testtop + bl dmla4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mul4) + testprologue smul + testldcarry + testtop + bl mul4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mul4zc) + testprologue smul + testldcarry + testtop + bl mul4zc + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mla4) + testprologue smul + testldcarry + testtop + bl mla4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mla4zc) + testprologue smul + testldcarry + testtop + bl mla4zc + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mmul4) + testprologue mmul + testtop + bl mmul4 + testtail + stp q4, q5, [x4] + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mmla4) + testprologue mmul + testtop + bl mmla4 + testtail + stp q4, q5, [x4] + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mont4) + testprologue mont + testtop + bl mont4 + testtail + stp q4, q5, [x4] + testcarryout + testepilogue +ENDFUNC + +#endif + +///----- That's all, folks -------------------------------------------------- diff --git a/math/mpx-mul4-test.c b/math/mpx-mul4-test.c index 85a15327..5325db0f 100644 --- a/math/mpx-mul4-test.c +++ b/math/mpx-mul4-test.c @@ -61,6 +61,18 @@ static int cpu_features_p(void) { return (cpu_feature_p(CPUFEAT_X86_SSE2)); } static int cpu_features_p(void) { return (cpu_feature_p(CPUFEAT_X86_SSE2)); } #endif +#if CPUFAM_ARMEL +# define VARIANT _arm_neon +# define REPR_32 +static int cpu_features_p(void) { return (cpu_feature_p(CPUFEAT_ARM_NEON)); } +#endif + +#if CPUFAM_ARM64 +# define VARIANT _arm64_simd +# define REPR_32 +static int cpu_features_p(void) { return (1); } +#endif + #ifndef VARIANT # error "Unsupported CPU family." #endif diff --git a/math/mpx.c b/math/mpx.c index f761003f..d3d0a04a 100644 --- a/math/mpx.c +++ b/math/mpx.c @@ -931,6 +931,14 @@ static void simple_umul(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl, MAYBE_UMUL4(amd64_avx) #endif +#if CPUFAM_ARMEL + MAYBE_UMUL4(arm_neon) +#endif + +#if CPUFAM_ARM64 + MAYBE_UMUL4(arm64_simd) +#endif + static mpx_umul__functype *pick_umul(void) { #if CPUFAM_X86 @@ -945,6 +953,13 @@ static mpx_umul__functype *pick_umul(void) DISPATCH_PICK_COND(mpx_umul, maybe_umul4_amd64_sse2, cpu_feature_p(CPUFEAT_X86_SSE2)); #endif +#if CPUFAM_ARMEL + DISPATCH_PICK_COND(mpx_umul, maybe_umul4_arm_neon, + cpu_feature_p(CPUFEAT_ARM_NEON)); +#endif +#if CPUFAM_ARM64 + DISPATCH_PICK_COND(mpx_umul, maybe_umul4_arm64_simd, 1); +#endif DISPATCH_PICK_FALLBACK(mpx_umul, simple_umul); }
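
For reference, the job these `umul4' entry points take over from `simple_umul' is plain long multiplication over 32-bit words. The sketch below is only an illustration of that contract, not Catacomb's code (the name `umul_ref' is made up here): it assumes 32-bit `mpw'-style limbs, half-open [av, avl) operand ranges, and a destination large enough for the full product, and it leaves out the dispatch and fallback logic in mpx.c.

    #include <stddef.h>
    #include <stdint.h>

    /* Reference long multiplication: write the product of [av, avl) and
     * [bv, bvl) to dv, least-significant word first.  dv must not overlap
     * the operands and must have (avl - av) + (bvl - bv) words of space. */
    static void umul_ref(uint32_t *dv,
                         const uint32_t *av, const uint32_t *avl,
                         const uint32_t *bv, const uint32_t *bvl)
    {
      size_t an = avl - av, bn = bvl - bv, i, j;
      uint64_t t, c;

      for (i = 0; i < an + bn; i++) dv[i] = 0;
      for (j = 0; j < bn; j++) {
        c = 0;
        for (i = 0; i < an; i++) {
          /* 32x32 -> 64-bit partial product, plus the word already in the
           * destination, plus the running carry: this cannot overflow. */
          t = (uint64_t)av[i]*bv[j] + dv[i + j] + c;
          dv[i + j] = (uint32_t)t;
          c = t >> 32;
        }
        dv[j + an] = (uint32_t)c;
      }
    }

The assembler version does the same thing four words at a time: each outer pass takes a four-word block of bv (expanded into v4/v5) across the whole of av, carrying between the four-word inner steps in v28--v30 and x15.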