X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/d0d41c6ebfbfebca8dbb516a1de4107c82b1bc6b..bd6d65e32b835551677456bf286d09ced6859882:/math/mpx-mul4-amd64-sse2.S

diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S
index 94befa4d..5a748c60 100644
--- a/math/mpx-mul4-amd64-sse2.S
+++ b/math/mpx-mul4-amd64-sse2.S
@@ -25,15 +25,13 @@
 /// MA 02111-1307, USA.
 
 ///--------------------------------------------------------------------------
-/// External definitions.
+/// Preliminaries.
 
 #include "config.h"
 #include "asm-common.h"
 
-///--------------------------------------------------------------------------
-/// Prologue.
-
 	.arch	pentium4
+
 	.text
 
 ///--------------------------------------------------------------------------
@@ -43,11 +41,11 @@
 /// construct more general variable-length multipliers.
 ///
 /// The basic trick is the same throughout.  In an operand-scanning
-/// multiplication, the inner multiplication loop multiplies a
-/// multiple-precision operand by a single precision factor, and adds the
-/// result, appropriately shifted, to the result.  A `finely integrated
-/// operand scanning' implementation of Montgomery multiplication also adds
-/// the product of a single-precision `Montgomery factor' and the modulus,
+/// multiplication, the inner multiplication loop multiplies a multiple-
+/// precision operand by a single-precision factor, and adds the product,
+/// appropriately shifted, to the result.  A `finely integrated operand
+/// scanning' implementation of Montgomery multiplication also adds the
+/// product of a single-precision `Montgomery factor' and the modulus,
 /// calculated in the same pass.  The more common `coarsely integrated
 /// operand scanning' alternates main multiplication and Montgomery passes,
 /// which requires additional carry propagation.
@@ -73,22 +71,65 @@
 /// many products together before we must deal with carrying; it also allows
 /// for some calculations to be performed on the above expanded form.
 ///
-/// ...
+/// We maintain four `carry' registers XMM12--XMM15 accumulating intermediate
+/// results.  The registers' precise roles rotate during the computation; we
+/// name them `c0', `c1', `c2', and `c3'.  Each carry register holds two
+/// 64-bit halves: the register c0, for example, holds c'_0 (low half) and
+/// c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the
+/// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2 +
+/// c_3 B^3.  The `pmuludq' instruction acting on a scalar operand
+/// (broadcast across all lanes of its vector) and an operand in the expanded
+/// form above produces a result which can be added directly to the
+/// appropriate carry register.  Following a pass of four multiplications, we
+/// perform some limited carry propagation: let t = c''_0 mod B, and let d =
+/// c'_0 + t b; then we output z = d mod B, add (floor(d/B), floor(c''_0/B))
+/// to c1, and cycle the carry registers around, so that c1 becomes c0, and
+/// the old (implicitly) zeroed c0 becomes c3.
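+///
+/// To make that carry step concrete, here is a rough C model (an
+/// illustrative sketch only: the name `carrystep' is invented, b = 2^16 and
+/// B = 2^32 as above, and a carry register is modelled as a pair of 64-bit
+/// halves from <stdint.h>):
+///
+///	uint32_t carrystep(uint64_t c0[2], uint64_t c1[2])
+///	{
+///		uint64_t t = c0[1]&0xffffffff;	/* t = c''_0 mod B */
+///		uint64_t d = c0[0] + (t << 16);	/* d = c'_0 + t b */
+///		c1[0] += d >> 32;		/* add floor(d/B) ... */
+///		c1[1] += c0[1] >> 32;		/* ... and floor(c''_0/B) to c1 */
+///		return d&0xffffffff;		/* output z = d mod B */
+///	}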
+///
+/// On 64-bit AMD64, we have a reasonable number of registers: the expanded
+/// operands are kept in registers.  The packed operands are read from memory
+/// into working registers XMM4 and XMM5; XMM0--XMM3 are used for the actual
+/// multiplications; and XMM6 and XMM7 are used for combining the results.
+/// The following conventional argument names and locations are used
+/// throughout.
+///
+///	Arg	Format		Location	Notes
+///
+///	U	packed		[RAX]
+///	X	packed		[RBX]		In Montgomery multiplication, X = N
+///	V	expanded	XMM8/XMM9
+///	Y	expanded	XMM10/XMM11	In Montgomery multiplication, Y = (A + U V) M
+///	M	expanded	(see below)	Montgomery factor, M = -N^{-1} (mod B^4)
+///	N					Modulus, for Montgomery multiplication
+///	A	packed		[RDI]		Destination/accumulator
+///	C	carry		XMM12--XMM15
+///
+/// The calculation is some variant of
+///
+///	A' + C' B^4 <- U V + X Y + A + C
 ///
-/// We maintain four `carry' registers accumulating intermediate results.
-/// The registers' precise roles rotate during the computation; we name them
-/// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
-/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
-/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
-/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
-/// `pmuluqdq' instruction acting on a scalar operand (broadcast across all
-/// lanes of its vector) and an operand in the expanded form above produces a
-/// result which can be added directly to the appropriate carry register.
-/// Following a pass of four multiplications, we perform some limited carry
-/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
-/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
-/// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
-/// zeroed becomes c3.
+/// The low-level functions fit into a fairly traditional (finely-integrated)
+/// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y)
+/// (indexed by i).
+///
+/// The variants are as follows.
+///
+///	Function	Variant			Use		i	j
+///
+///	mmul4		A = C = 0, Y = M	Montgomery	0	0
+///	dmul4		A = 0			Montgomery	0	+
+///	mmla4		C = 0, Y = M		Montgomery	+	0
+///	dmla4		exactly as shown	Montgomery	+	+
+///	mont4		U = C = 0, V = M	Montgomery	any	0
+///
+///	mul4zc		U = V = A = C = 0	Plain		0	0
+///	mul4		U = V = A = 0		Plain		0	+
+///	mla4zc		U = V = C = 0		Plain		+	0
+///	mla4		U = V = 0		Plain		+	+
+///
+/// The `mmul4' and `mmla4' functions are also responsible for calculating
+/// the Montgomery reduction factor Y = (A + U V) M used by the rest of the
+/// inner loop.
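+///
+/// In pseudo-C, the Montgomery case of that loop is knitted together
+/// roughly as follows (a sketch only, with arguments and carry handling
+/// elided, and n a multiple of four):
+///
+///	for (i = 0; i < n; i += 4) {		/* step the (V, Y) pair */
+///		if (i) mmla4(...); else mmul4(...);	/* j = 0 */
+///		for (j = 4; j < n; j += 4)		/* j > 0 */
+///			if (i) dmla4(...); else dmul4(...);
+///		/* ... propagate the carries out of this pass ... */
+///	}
+///
+/// while `mpxmont_redc4' (below) instead uses `mont4' for the j = 0 step of
+/// each pass.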
 
 ///--------------------------------------------------------------------------
 /// Macro definitions.
 
@@ -96,32 +137,32 @@
 .macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
 	// Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
 	// of the product in registers D0, D1, D2, D3.
-	pshufd	\d0, \r, SHUF(3, \i, 3, \i) // (r_i, ?, r_i, ?)
+	pshufd	\d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?)
 .ifnes "\d1", "nil"
-	movdqa	\d1, \slo		// (s'_0, s'_1, s''_0, s''_1)
+	movdqa	\d1, \slo		// (s'_0, s'_1; s''_0, s''_1)
 .endif
 .ifnes "\d3", "nil"
-	movdqa	\d3, \shi		// (s'_2, s'_3, s''_2, s''_3)
+	movdqa	\d3, \shi		// (s'_2, s'_3; s''_2, s''_3)
 .endif
 .ifnes "\d1", "nil"
-	psrldq	\d1, 4			// (s'_1, s''_0, s''_1, 0)
+	psrldq	\d1, 4			// (s'_1, s''_0; s''_1, 0)
 .endif
 .ifnes "\d2", "nil"
-	movdqa	\d2, \d0		// another copy of (r_i, ?, r_i, ?)
+	movdqa	\d2, \d0		// another copy of (r_i, ?; r_i, ?)
 .endif
 .ifnes "\d3", "nil"
-	psrldq	\d3, 4			// (s'_3, s''_2, s''_3, 0)
+	psrldq	\d3, 4			// (s'_3, s''_2; s''_3, 0)
 .endif
 .ifnes "\d1", "nil"
-	pmuludq	\d1, \d0		// (r_i s'_1, r_i s''_1)
+	pmuludq	\d1, \d0		// (r_i s'_1; r_i s''_1)
 .endif
 .ifnes "\d3", "nil"
-	pmuludq	\d3, \d0		// (r_i s'_3, r_i s''_3)
+	pmuludq	\d3, \d0		// (r_i s'_3; r_i s''_3)
 .endif
 .ifnes "\d2", "nil"
-	pmuludq	\d2, \shi		// (r_i s'_2, r_i s''_2)
+	pmuludq	\d2, \shi		// (r_i s'_2; r_i s''_2)
 .endif
-	pmuludq	\d0, \slo		// (r_i s'_0, r_i s''_0)
+	pmuludq	\d0, \slo		// (r_i s'_0; r_i s''_0)
 .endm
 
 .macro accum c0, c1=nil, c2=nil, c3=nil
@@ -163,10 +204,10 @@
 	// lane 0 or 1 of D; the high two lanes of D are clobbered.  On
 	// completion, XMM3 is clobbered.  If CC is `nil', then the
 	// contribution which would have been added to it is left in C.
-	pshufd	xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
-	psrldq	xmm3, 12		// (t, 0, 0, 0) = (t, 0)
-	pslldq	xmm3, 2			// (t b, 0)
-	paddq	\c, xmm3		// (c' + t b, c'')
+	pshufd	xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
+	psrldq	xmm3, 12		// (t, 0; 0, 0) = (t; 0)
+	pslldq	xmm3, 2			// (t b; 0)
+	paddq	\c, xmm3		// (c' + t b; c'')
 .ifeqs "\pos", "lo"
 	movdqa	\d, \c
 .else
@@ -183,37 +224,37 @@
 	// of the value represented in C are written at POS in D, and the
 	// remaining bits are left at the bottom of T.
 	movdqa	\t, \c
-	psllq	\t, 16			// (?, c'' b)
-	pslldq	\c, 8			// (0, c')
-	paddq	\t, \c			// (?, c' + c'' b)
-	psrldq	\t, 8			// c' + c'' b
+	psllq	\t, 16			// (?; c'' b)
+	pslldq	\c, 8			// (0; c')
+	paddq	\t, \c			// (?; c' + c'' b)
+	psrldq	\t, 8			// (c' + c'' b; 0) = (c; 0)
 .ifeqs "\pos", "lo"
 	movdqa	\d, \t
 .else
 	punpckldq \d, \t
 .endif
-	psrldq	\t, 4			// floor((c' + c'' b)/B)
+	psrldq	\t, 4			// (floor(c/B); 0)
 .endm
 
 .macro expand z, a, b, c=nil, d=nil
 	// On entry, A and C hold packed 128-bit values, and Z is zero.  On
 	// exit, A:B and C:D together hold the same values in expanded
 	// form.  If C is `nil', then only expand A to A:B.
-	movdqa	\b, \a			// (a_0, a_1, a_2, a_3)
+	movdqa	\b, \a			// (a_0, a_1; a_2, a_3)
 .ifnes "\c", "nil"
-	movdqa	\d, \c			// (c_0, c_1, c_2, c_3)
+	movdqa	\d, \c			// (c_0, c_1; c_2, c_3)
 .endif
-	punpcklwd \a, \z		// (a'_0, a''_0, a'_1, a''_1)
-	punpckhwd \b, \z		// (a'_2, a''_2, a'_3, a''_3)
+	punpcklwd \a, \z		// (a'_0, a''_0; a'_1, a''_1)
+	punpckhwd \b, \z		// (a'_2, a''_2; a'_3, a''_3)
 .ifnes "\c", "nil"
-	punpcklwd \c, \z		// (c'_0, c''_0, c'_1, c''_1)
-	punpckhwd \d, \z		// (c'_2, c''_2, c'_3, c''_3)
+	punpcklwd \c, \z		// (c'_0, c''_0; c'_1, c''_1)
+	punpckhwd \d, \z		// (c'_2, c''_2; c'_3, c''_3)
 .endif
-	pshufd	\a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
-	pshufd	\b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
+	pshufd	\a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
+	pshufd	\b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
 .ifnes "\c", "nil"
-	pshufd	\c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
-	pshufd	\d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
+	pshufd	\c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
+	pshufd	\d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
 .endif
 .endm
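 
 /// In C, the expansion of one packed value works out as follows (an
 /// illustrative sketch; the function name `expand_c' is invented, and the
 /// comments give the lane layout produced above):
 ///
 ///	void expand_c(const uint32_t a[4], uint32_t lo[4], uint32_t hi[4])
 ///	{
 ///		lo[0] = a[0]&0xffff; lo[1] = a[1]&0xffff; /* a'_0, a'_1 */
 ///		lo[2] = a[0] >> 16;  lo[3] = a[1] >> 16;  /* a''_0, a''_1 */
 ///		hi[0] = a[2]&0xffff; hi[1] = a[3]&0xffff; /* a'_2, a'_3 */
 ///		hi[2] = a[2] >> 16;  hi[3] = a[3] >> 16;  /* a''_2, a''_3 */
 ///	}
 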
@@ -229,10 +270,10 @@
 	// we can do that, we must gather them together.
 	movdqa	\t, \c0
 	movdqa	\u, \c1
-	punpcklqdq \t, \c2		// (y'_0, y'_2)
-	punpckhqdq \c0, \c2		// (y''_0, y''_2)
-	punpcklqdq \u, \c3		// (y'_1, y'_3)
-	punpckhqdq \c1, \c3		// (y''_1, y''_3)
+	punpcklqdq \t, \c2		// (y'_0; y'_2)
+	punpckhqdq \c0, \c2		// (y''_0; y''_2)
+	punpcklqdq \u, \c3		// (y'_1; y'_3)
+	punpckhqdq \c1, \c3		// (y''_1; y''_3)
 
 	// Now split the double-prime pieces.  The high (up to) 48 bits will
 	// go up; the low 16 bits go down.
@@ -240,43 +281,43 @@
 	movdqa	\c3, \c1
 	psllq	\c2, 48
 	psllq	\c3, 48
-	psrlq	\c0, 16			// high parts of (y''_0, y''_2)
-	psrlq	\c1, 16			// high parts of (y''_1, y''_3)
-	psrlq	\c2, 32			// low parts of (y''_0, y''_2)
-	psrlq	\c3, 32			// low parts of (y''_1, y''_3)
+	psrlq	\c0, 16			// high parts of (y''_0; y''_2)
+	psrlq	\c1, 16			// high parts of (y''_1; y''_3)
+	psrlq	\c2, 32			// low parts of (y''_0; y''_2)
+	psrlq	\c3, 32			// low parts of (y''_1; y''_3)
 .ifnes "\hi", "nil"
 	movdqa	\hi, \c1
 .endif
-	pslldq	\c1, 8			// high part of (0, y''_1)
+	pslldq	\c1, 8			// high part of (0; y''_1)
 
 	paddq	\t, \c2			// propagate down
 	paddq	\u, \c3
-	paddq	\t, \c1			// and up: (y_0, y_2)
-	paddq	\u, \c0			// (y_1, y_3)
+	paddq	\t, \c1			// and up: (y_0; y_2)
+	paddq	\u, \c0			// (y_1; y_3)
 .ifnes "\hi", "nil"
-	psrldq	\hi, 8			// high part of (y''_3, 0)
+	psrldq	\hi, 8			// high part of (y''_3; 0)
 .endif
 
 	// Finally extract the answer.  This complicated dance is better than
 	// storing to memory and loading, because the piecemeal stores
 	// inhibit store forwarding.
-	movdqa	\c3, \t			// (y_0, y_1)
-	movdqa	\lo, \t			// (y^*_0, ?, ?, ?)
-	psrldq	\t, 8			// (y_2, 0)
-	psrlq	\c3, 32			// (floor(y_0/B), ?)
-	paddq	\c3, \u			// (y_1 + floor(y_0/B), ?)
-	movdqa	\c1, \c3		// (y^*_1, ?, ?, ?)
-	psrldq	\u, 8			// (y_3, 0)
-	psrlq	\c3, 32			// (floor((y_1 B + y_0)/B^2, ?)
-	paddq	\c3, \t			// (y_2 + floor((y_1 B + y_0)/B^2, ?)
-	punpckldq \lo, \c3		// (y^*_0, y^*_2, ?, ?)
-	psrlq	\c3, 32			// (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
-	paddq	\c3, \u			// (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
+	movdqa	\c3, \t			// (y_0; ?)
+	movdqa	\lo, \t			// (y^*_0, ?; ?, ?)
+	psrldq	\t, 8			// (y_2; 0)
+	psrlq	\c3, 32			// (floor(y_0/B); ?)
+	paddq	\c3, \u			// (y_1 + floor(y_0/B); ?)
+	movdqa	\c1, \c3		// (y^*_1, ?; ?, ?)
+	psrldq	\u, 8			// (y_3; 0)
+	psrlq	\c3, 32			// (floor((y_1 B + y_0)/B^2); ?)
+	paddq	\c3, \t			// (y_2 + floor((y_1 B + y_0)/B^2); ?)
+	punpckldq \lo, \c3		// (y^*_0, y^*_2; ?, ?)
+	psrlq	\c3, 32			// (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
+	paddq	\c3, \u			// (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
 .ifnes "\hi", "nil"
 	movdqa	\t, \c3
 	pxor	\u, \u
 .endif
-	punpckldq \c1, \c3		// (y^*_1, y^*_3, ?, ?)
+	punpckldq \c1, \c3		// (y^*_1, y^*_3; ?, ?)
 .ifnes "\hi", "nil"
 	psrlq	\t, 32			// very high bits of y
 	paddq	\hi, \t
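 
 /// In C, that extraction dance computes the following (an illustrative
 /// sketch; y[0..3] hold the up-to-48-bit pieces y_0, ..., y_3 as uint64_t
 /// values, and z[0..3] receive the packed 32-bit output words):
 ///
 ///	uint64_t c = 0;
 ///	unsigned i;
 ///	for (i = 0; i < 4; i++) {
 ///		c += y[i];		/* y_i + floor(... /B^i) */
 ///		z[i] = c&0xffffffff;	/* y^*_i */
 ///		c >>= 32;		/* carry up to the next piece */
 ///	}
 ///
 /// with the final value of c supplying the `very high bits of y' that are
 /// folded into HI.
 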
@@ -293,13 +334,13 @@
 	// On exit, the carry registers, including XMM15, are updated to hold
 	// C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered.  The other
 	// registers are preserved.
-	movd	xmm0, [rdi +  0]	// (a_0, 0)
-	movd	xmm1, [rdi +  4]	// (a_1, 0)
-	movd	xmm2, [rdi +  8]	// (a_2, 0)
-	movd	xmm15, [rdi + 12]	// (a_3, 0)
-	paddq	xmm12, xmm0		// (c'_0 + a_0, c''_0)
-	paddq	xmm13, xmm1		// (c'_1 + a_1, c''_1)
-	paddq	xmm14, xmm2		// (c'_2 + a_2, c''_2 + a_3 b)
+	movd	xmm0, [rdi +  0]	// (a_0; 0)
+	movd	xmm1, [rdi +  4]	// (a_1; 0)
+	movd	xmm2, [rdi +  8]	// (a_2; 0)
+	movd	xmm15, [rdi + 12]	// (a_3; 0)
+	paddq	xmm12, xmm0		// (c'_0 + a_0; c''_0)
+	paddq	xmm13, xmm1		// (c'_1 + a_1; c''_1)
+	paddq	xmm14, xmm2		// (c'_2 + a_2; c''_2 + a_3 b)
 .endm
 
 ///--------------------------------------------------------------------------
@@ -321,7 +362,6 @@ INTFUNC(carryprop)
 	movdqu	[rdi], xmm0
 	ret
-
 ENDFUNC
 
 INTFUNC(dmul4)
@@ -359,7 +399,6 @@ INTFUNC(dmul4)
 	movdqu	[rdi], xmm6
 	ret
-
 ENDFUNC
 
 INTFUNC(dmla4)
@@ -400,7 +439,6 @@ INTFUNC(dmla4)
 	movdqu	[rdi], xmm6
 	ret
-
 ENDFUNC
 
 INTFUNC(mul4zc)
@@ -431,7 +469,6 @@ INTFUNC(mul4zc)
 	movdqu	[rdi], xmm6
 	ret
-
 ENDFUNC
 
 INTFUNC(mul4)
@@ -464,7 +501,6 @@ INTFUNC(mul4)
 	movdqu	[rdi], xmm6
 	ret
-
 ENDFUNC
 
 INTFUNC(mla4zc)
@@ -500,7 +536,6 @@ INTFUNC(mla4zc)
 	movdqu	[rdi], xmm6
 	ret
-
 ENDFUNC
 
 INTFUNC(mla4)
@@ -535,14 +570,13 @@ INTFUNC(mla4)
 	movdqu	[rdi], xmm6
 	ret
-
 ENDFUNC
 
 INTFUNC(mmul4)
 	// On entry, RDI points to the destination buffer; RAX and RBX point
 	// to the packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold
-	// the expanded operands V and M.  The stack pointer must be 8 modulo 16
-	// (as usual for AMD64 ABIs).
+	// the expanded operands V and M.  The stack pointer must be 8 modulo
+	// 16 (as usual for AMD64 ABIs).
 	//
 	// On exit, we store Y = U V M mod B^4 in XMM10/XMM11, and write the
 	// low 128 bits of the sum U V + N Y to [RDI], leaving the remaining
@@ -559,7 +593,6 @@ INTFUNC(mmul4)
 	mulcore	xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
 	propout	xmm7, lo, xmm12, xmm13
 	jmp	5f
-
 ENDFUNC
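 
 /// As a sanity check on that contract: M = -N^{-1} (mod B^4) and, with
 /// A = 0 here, Y = U V M (mod B^4), so
 ///
 ///	U V + N Y == U V (1 + N M) == 0	(mod B^4)
 ///
 /// since N M == -1 (mod B^4); the bottom four words of the sum therefore
 /// vanish, leaving a value exactly divisible by B^4, as Montgomery
 /// reduction requires.  The same argument covers `mmla4' below, with
 /// A != 0 and Y = (A + U V) M.
 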
 INTFUNC(mmla4)
@@ -577,10 +610,10 @@ INTFUNC(mmla4)
 	movdqu	xmm4, [rax]
 #if ABI_WIN
 	stalloc	48 + 8			// space for the carries
-#  define STKTMP(i) [rsp + i]
+#  define STKTMP(i) [SP + i]
 #endif
 #if ABI_SYSV
-#  define STKTMP(i) [rsp + i - 48 - 8]	// use red zone
+#  define STKTMP(i) [SP + i - 48 - 8]	// use red zone
 #endif
 
 	endprologue
 
@@ -621,8 +654,8 @@ INTFUNC(mmla4)
 	mulcore	xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2
 	accum	xmm4, xmm5, xmm6
 
-	punpckldq xmm12, xmm15		// (w_0, 0, w_1, 0)
-	punpckhdq xmm14, xmm15		// (w_2, 0, w_3, 0)
+	punpckldq xmm12, xmm15		// (w_0, 0; w_1, 0)
+	punpckhdq xmm14, xmm15		// (w_2, 0; w_3, 0)
 
 	mulcore	xmm7, 2, xmm10, xmm11, xmm0, xmm1
 	accum	xmm5, xmm6
@@ -634,10 +667,10 @@ INTFUNC(mmla4)
 	mulcore	xmm7, 3, xmm10, xmm11, xmm0
 	accum	xmm6
 
-	punpckldq xmm12, xmm2		// (w_0, 0, 0, 0)
-	punpckldq xmm14, xmm2		// (w_2, 0, 0, 0)
-	punpckhdq xmm13, xmm2		// (w_1, 0, 0, 0)
-	punpckhdq xmm15, xmm2		// (w_3, 0, 0, 0)
+	punpckldq xmm12, xmm2		// (w_0, 0; 0, 0)
+	punpckldq xmm14, xmm2		// (w_2, 0; 0, 0)
+	punpckhdq xmm13, xmm2		// (w_1, 0; 0, 0)
+	punpckhdq xmm15, xmm2		// (w_3, 0; 0, 0)
 
 	// That's lots of pieces.  Now we have to assemble the answer.
 	squash	xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
@@ -703,8 +736,8 @@ INTFUNC(mont4)
 	mulcore	xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2
 	accum	xmm4, xmm5, xmm6
 
-	punpckldq xmm12, xmm15		// (w_0, 0, w_1, 0)
-	punpckhdq xmm14, xmm15		// (w_2, 0, w_3, 0)
+	punpckldq xmm12, xmm15		// (w_0, 0; w_1, 0)
+	punpckhdq xmm14, xmm15		// (w_2, 0; w_3, 0)
 
 	mulcore	xmm7, 2, xmm8, xmm9, xmm0, xmm1
 	accum	xmm5, xmm6
@@ -716,10 +749,10 @@ INTFUNC(mont4)
 	mulcore	xmm7, 3, xmm8, xmm9, xmm0
 	accum	xmm6
 
-	punpckldq xmm12, xmm2		// (w_0, 0, 0, 0)
-	punpckldq xmm14, xmm2		// (w_2, 0, 0, 0)
-	punpckhdq xmm13, xmm2		// (w_1, 0, 0, 0)
-	punpckhdq xmm15, xmm2		// (w_3, 0, 0, 0)
+	punpckldq xmm12, xmm2		// (w_0, 0; 0, 0)
+	punpckldq xmm14, xmm2		// (w_2, 0; 0, 0)
+	punpckhdq xmm13, xmm2		// (w_1, 0; 0, 0)
+	punpckhdq xmm15, xmm2		// (w_3, 0; 0, 0)
 
 	// That's lots of pieces.  Now we have to assemble the answer.
 	squash	xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
@@ -746,15 +779,21 @@ INTFUNC(mont4)
 	// And, with that, we're done.
 	movdqu	[rdi], xmm6
 	ret
-
 ENDFUNC
 
 ///--------------------------------------------------------------------------
 /// Bulk multipliers.
 
+FUNC(mpx_umul4_amd64_avx)
+	.arch	.avx
+	vzeroupper
+	endprologue
+	.arch	pentium4
+ENDFUNC
+
 FUNC(mpx_umul4_amd64_sse2)
 	// void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
-	//			   const mpw *bv, const mpw *bvl);
+	//			     const mpw *bv, const mpw *bvl);
 
 	// Establish the arguments and do initial setup.
 	//
@@ -778,7 +817,6 @@ FUNC(mpx_umul4_amd64_sse2)
 	endprologue
 
 	mov	DV, rdi
-
 #endif
 
 #if ABI_WIN
@@ -806,8 +844,7 @@ FUNC(mpx_umul4_amd64_sse2)
 	endprologue
 
 	mov	rdi, DV
-	mov	BVL, [rsp + 224]
-
+	mov	BVL, [SP + 224]
 #endif
 
 	// Prepare for the first iteration.
@@ -873,7 +910,6 @@ FUNC(mpx_umul4_amd64_sse2)
 #endif
 
 #if ABI_WIN
-
 	rstrxmm	xmm6, 0
 	rstrxmm	xmm7, 16
 	rstrxmm	xmm8, 32
@@ -888,7 +924,6 @@ FUNC(mpx_umul4_amd64_sse2)
 	stfree	160 + 8
 	popreg	rdi
 	popreg	rbx
-
 #endif
 
 	ret
@@ -901,6 +936,13 @@ FUNC(mpx_umul4_amd64_sse2)
 
 ENDFUNC
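 
+/// For reference, the overall effect of `mpx_umul4_amd64_sse2' above is the
+/// schoolbook product, as in the following C equivalent (an illustrative
+/// sketch: the name `umul_ref' is invented, and mpw is taken to be a 32-bit
+/// word from <stdint.h>).  Note also that the AVX entry points in this
+/// patch contain no `ret' of their own: after `vzeroupper' -- presumably to
+/// avoid AVX-to-SSE transition penalties -- they fall straight through into
+/// the SSE2 code that follows them.
+///
+///	void umul_ref(uint32_t *dv, const uint32_t *av, const uint32_t *avl,
+///		      const uint32_t *bv, const uint32_t *bvl)
+///	{
+///		size_t m = avl - av, n = bvl - bv, i, j;
+///		uint64_t c;
+///		for (i = 0; i < m + n; i++) dv[i] = 0;
+///		for (i = 0; i < n; i++) {
+///			for (j = 0, c = 0; j < m; j++) {
+///				c += (uint64_t)av[j]*bv[i] + dv[i + j];
+///				dv[i + j] = (uint32_t)c; c >>= 32;
+///			}
+///			dv[i + m] = (uint32_t)c;
+///		}
+///	}
+///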
+FUNC(mpxmont_mul4_amd64_avx)
+	.arch	.avx
+	vzeroupper
+	endprologue
+	.arch	pentium4
+ENDFUNC
+
 FUNC(mpxmont_mul4_amd64_sse2)
 	// void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
 	//				const mpw *nv, size_t n, const mpw *mi);
@@ -934,7 +976,6 @@ FUNC(mpxmont_mul4_amd64_sse2)
 	endprologue
 
 	mov	DV, rdi
-
 #endif
 
 #if ABI_WIN
@@ -966,9 +1007,8 @@ FUNC(mpxmont_mul4_amd64_sse2)
 	endprologue
 
 	mov	rdi, DV
-	mov	N, [rsp + 224]
-	mov	MI, [rsp + 232]
-
+	mov	N, [SP + 224]
+	mov	MI, [SP + 232]
 #endif
 
 	// Establish the expanded operands.
@@ -1050,7 +1090,6 @@ FUNC(mpxmont_mul4_amd64_sse2)
 #endif
 
 #if ABI_WIN
-
 	rstrxmm	xmm6, 0
 	rstrxmm	xmm7, 16
 	rstrxmm	xmm8, 32
@@ -1066,7 +1105,6 @@ FUNC(mpxmont_mul4_amd64_sse2)
 	popreg	r12
 	popreg	rdi
 	popreg	rbx
-
 #endif
 
 	ret
@@ -1095,6 +1133,13 @@ FUNC(mpxmont_mul4_amd64_sse2)
 
 ENDFUNC
 
+FUNC(mpxmont_redc4_amd64_avx)
+	.arch	.avx
+	vzeroupper
+	endprologue
+	.arch	pentium4
+ENDFUNC
+
 FUNC(mpxmont_redc4_amd64_sse2)
 	// void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
 	//				size_t n, const mpw *mi);
@@ -1110,12 +1155,11 @@ FUNC(mpxmont_redc4_amd64_sse2)
 	// outer loop dv	r10	rcx
 	// outer loop dv limit	r11	r11
 	// nv base		rdx	r8
-	// nv limit		r9	r12*
+	// nv limit		r9	r10*
 	// n			rcx	r9
 	// c			rcx	r9
 
 #if ABI_SYSV
-
 # define DVL rax
 # define DVL4 rsi
 # define MI r8
@@ -1130,25 +1174,22 @@ FUNC(mpxmont_redc4_amd64_sse2)
 	endprologue
 
 	mov	DV, rdi
-
 #endif
 
 #if ABI_WIN
-
 # define DVL rax
 # define DVL4 rdx
 # define MI r10
 # define DV rcx
 # define DVLO r11
 # define NV r8
-# define NVL r12
+# define NVL r10
 # define N r9
 # define C r9d
 
 	pushreg	rbx
 	pushreg	rdi
-	pushreg	r12
-	stalloc	160
+	stalloc	168
 
 	savexmm	xmm6, 0
 	savexmm	xmm7, 16
@@ -1164,8 +1205,7 @@ FUNC(mpxmont_redc4_amd64_sse2)
 	endprologue
 
 	mov	rdi, DV
-	mov	MI, [rsp + 224]
-
+	mov	MI, [SP + 224]
 #endif
 
 	// Establish the expanded operands and the blocks-of-4 dv limit.
@@ -1211,29 +1251,29 @@ FUNC(mpxmont_redc4_amd64_sse2)
 	// Continue carry propagation until the end of the buffer.
 0:	add	[rdi], C
 	mov	C, 0			// preserves flags
-	adcd	[rdi + 4], 0
-	adcd	[rdi + 8], 0
-	adcd	[rdi + 12], 0
+	adc	dword ptr [rdi + 4], 0
+	adc	dword ptr [rdi + 8], 0
+	adc	dword ptr [rdi + 12], 0
 	adc	C, 0
 	add	rdi, 16
 	cmp	rdi, DVL4
 	jb	0b
 
-	// Deal with the tail end.
+	// Deal with the tail end.  Note that the actual destination length
+	// won't be an exact number of blocks of four, so it's safe to just
+	// drop through here.
 7:	add	[rdi], C
-	mov	C, 0			// preserves flags
+	mov	C, 0
 	adc	C, 0
 	add	rdi, 4
 	cmp	rdi, DVL
 	jb	7b
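 
+	// In C terms, the two loops above together just ripple the single
+	// carry word C through the rest of the buffer, as in this sketch
+	// (p and lim standing for RDI and the appropriate limit):
+	//
+	//	while (p < lim) {
+	//		uint64_t t = (uint64_t)*p + c;
+	//		*p++ = (uint32_t)t;
+	//		c = (uint32_t)(t >> 32);
+	//	}
+	//
+	// with the blocks-of-four loop peeling off four words per iteration
+	// using add-with-carry.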
 
-	// All done for this iteration.  Start the next.  (This must have at
-	// least one follow-on iteration, or we'd not have started this outer
-	// loop.)
-8:	mov	rdi, DV			// -> Z = dv[i]
-	mov	rbx, NV			// -> X = nv[0]
-	cmp	rdi, DVLO		// all done yet?
+	// All done for this iteration.  Start the next.
+	cmp	DV, DVLO		// all done yet?
 	jae	9f
+	mov	rdi, DV			// -> Z = dv[i]
+	mov	rbx, NV			// -> X = nv[0]
 	add	DV, 16
 	call	mont4
 	add	rdi, 16
@@ -1248,7 +1288,6 @@ FUNC(mpxmont_redc4_amd64_sse2)
 #endif
 
 #if ABI_WIN
-
 	rstrxmm	xmm6, 0
 	rstrxmm	xmm7, 16
 	rstrxmm	xmm8, 32
@@ -1260,11 +1299,9 @@ FUNC(mpxmont_redc4_amd64_sse2)
 	rstrxmm	xmm14, 128
 	rstrxmm	xmm15, 144
 
-	stfree	160
-	popreg	r12
+	stfree	168
 	popreg	rdi
 	popreg	rbx
-
 #endif
 
 	ret
@@ -1310,7 +1347,7 @@ ENDFUNC
 # define ARG8 STKARG(4)
 # define STKARG_OFFSET 224
 #endif
-#define STKARG(i) [rsp + STKARG_OFFSET + 8*(i)]
+#define STKARG(i) [SP + STKARG_OFFSET + 8*(i)]
 
 //				sysv				win
 //		dmul	smul	mmul	mont		dmul	smul	mmul	mont
@@ -1474,9 +1511,9 @@
 .endm
 
 .macro testldcarry
-	movdqu	xmm12, [rcx +  0]	// (c'_0, c''_0)
-	movdqu	xmm13, [rcx + 16]	// (c'_1, c''_1)
-	movdqu	xmm14, [rcx + 32]	// (c'_2, c''_2)
+	movdqu	xmm12, [rcx +  0]	// (c'_0; c''_0)
+	movdqu	xmm13, [rcx + 16]	// (c'_1; c''_1)
+	movdqu	xmm14, [rcx + 32]	// (c'_2; c''_2)
 .endm
 
 .macro testtop u=nil
@@ -1564,6 +1601,8 @@ FUNC(test_mmul4)
 	testtop	r11
 	call	mmul4
 	testtail
+	pshufd	xmm10, xmm10, SHUF(0, 2, 1, 3)
+	pshufd	xmm11, xmm11, SHUF(0, 2, 1, 3)
 	movdqu	[r10 +  0], xmm10
 	movdqu	[r10 + 16], xmm11
 	testcarryout
@@ -1575,6 +1614,8 @@ FUNC(test_mmla4)
 	testtop	r11
 	call	mmla4
 	testtail
+	pshufd	xmm10, xmm10, SHUF(0, 2, 1, 3)
+	pshufd	xmm11, xmm11, SHUF(0, 2, 1, 3)
 	movdqu	[r10 +  0], xmm10
 	movdqu	[r10 + 16], xmm11
 	testcarryout
@@ -1586,6 +1627,8 @@ FUNC(test_mont4)
 	testtop
 	call	mont4
 	testtail
+	pshufd	xmm10, xmm10, SHUF(0, 2, 1, 3)
+	pshufd	xmm11, xmm11, SHUF(0, 2, 1, 3)
 	movdqu	[r10 +  0], xmm10
 	movdqu	[r10 + 16], xmm11
 	testcarryout