X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/8e91d6e5b9c82efb626869d6618b240d2ae8ad05..8c5956c14f5834a072e1a9345ae1f356b14164ca:/math/mpx-mul4-amd64-sse2.S

diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S
index 8b8cd414..d313765f 100644
--- a/math/mpx-mul4-amd64-sse2.S
+++ b/math/mpx-mul4-amd64-sse2.S
@@ -25,15 +25,13 @@
 /// MA 02111-1307, USA.
 
 ///--------------------------------------------------------------------------
-/// External definitions.
+/// Preliminaries.
 
 #include "config.h"
 #include "asm-common.h"
 
-///--------------------------------------------------------------------------
-/// Prologue.
-
 	.arch	pentium4
+
 	.text
 
 ///--------------------------------------------------------------------------
@@ -96,7 +94,7 @@
 .macro	mulcore	r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
 	// Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
 	// of the product in registers D0, D1, D2, D3.
-	pshufd	\d0, \r, SHUF(3, \i, 3, \i)	// (r_i, ?; r_i, ?)
+	pshufd	\d0, \r, SHUF(\i, 3, \i, 3)	// (r_i, ?; r_i, ?)
   .ifnes "\d1", "nil"
 	movdqa	\d1, \slo		// (s'_0, s'_1; s''_0, s''_1)
   .endif
@@ -163,7 +161,7 @@
 	// lane 0 or 1 of D; the high two lanes of D are clobbered.  On
 	// completion, XMM3 is clobbered.  If CC is `nil', then the
 	// contribution which would have been added to it is left in C.
-	pshufd	xmm3, \c, SHUF(2, 3, 3, 3)	// (?, ?; ?, t = c'' mod B)
+	pshufd	xmm3, \c, SHUF(3, 3, 3, 2)	// (?, ?; ?, t = c'' mod B)
 	psrldq	xmm3, 12		// (t, 0; 0, 0) = (t; 0)
 	pslldq	xmm3, 2			// (t b; 0)
 	paddq	\c, xmm3		// (c' + t b; c'')
@@ -209,11 +207,11 @@
 	punpcklwd \c, \z		// (c'_0, c''_0; c'_1, c''_1)
 	punpckhwd \d, \z		// (c'_2, c''_2; c'_3, c''_3)
   .endif
-	pshufd	\a, \a, SHUF(3, 1, 2, 0)	// (a'_0, a'_1; a''_0, a''_1)
-	pshufd	\b, \b, SHUF(3, 1, 2, 0)	// (a'_2, a'_3; a''_2, a''_3)
+	pshufd	\a, \a, SHUF(0, 2, 1, 3)	// (a'_0, a'_1; a''_0, a''_1)
+	pshufd	\b, \b, SHUF(0, 2, 1, 3)	// (a'_2, a'_3; a''_2, a''_3)
   .ifnes "\c", "nil"
-	pshufd	\c, \c, SHUF(3, 1, 2, 0)	// (c'_0, c'_1; c''_0, c''_1)
-	pshufd	\d, \d, SHUF(3, 1, 2, 0)	// (c'_2, c'_3; c''_2, c''_3)
+	pshufd	\c, \c, SHUF(0, 2, 1, 3)	// (c'_0, c'_1; c''_0, c''_1)
+	pshufd	\d, \d, SHUF(0, 2, 1, 3)	// (c'_2, c'_3; c''_2, c''_3)
   .endif
 .endm
 
@@ -1329,7 +1327,7 @@ ENDFUNC
 #  define ARG6 STKARG(2)
 #  define ARG7 STKARG(3)
 #  define ARG8 STKARG(4)
-#  define STKARG_OFFSET 40
+#  define STKARG_OFFSET 224
 #endif
 
 #define STKARG(i) [rsp + STKARG_OFFSET + 8*(i)]
@@ -1386,7 +1384,7 @@ ENDFUNC
 	mov	rbx, r8
 	movdqu	xmm8, [r9]
 	movdqu	xmm10, [rax]
-	mov	r8, STKARG(1)
+	mov	r8d, STKARG(1)
 	mov	r9, STKARG(2)
 	mov	r10, rdx
 	mov	r11, rcx
@@ -1395,7 +1393,7 @@ ENDFUNC
   .ifeqs "\mode", "mont"
 	mov	rbx, rcx
 	movdqu	xmm8, [r8]
-	mov	r8, r9
+	mov	r8d, r9d
 	mov	r9, STKARG(0)
 	mov	r10, rdx
 	mov	rcx, rsi
@@ -1423,16 +1421,16 @@ ENDFUNC
 	mov	rbx, r9
 	movdqu	xmm8, [r10]
 	movdqu	xmm10, [r11]
-	mov	r8, STKARG(2)
-	mov	r9, STKARG(3)
 	mov	r11, r8
+	mov	r8d, STKARG(2)
+	mov	r9, STKARG(3)
   .endif
   .ifeqs "\mode", "smul"
 	mov	rdi, rcx
 	mov	rcx, rdx
 	mov	rbx, r8
 	movdqu	xmm10, [r9]
-	mov	r8, STKARG(0)
+	mov	r8d, STKARG(0)
 	mov	r9, STKARG(1)
   .endif
   .ifeqs "\mode", "mmul"
@@ -1443,10 +1441,10 @@ ENDFUNC
 	mov	rbx, STKARG(0)
 	movdqu	xmm8, [r10]
 	movdqu	xmm10, [r11]
-	mov	r8, STKARG(3)
-	mov	r9, STKARG(4)
 	mov	r10, r8
 	mov	r11, r9
+	mov	r8d, STKARG(3)
+	mov	r9, STKARG(4)
   .endif
   .ifeqs "\mode", "mont"
 	mov	r10, STKARG(0)
@@ -1454,9 +1452,9 @@ ENDFUNC
 	mov	rcx, rdx
 	mov	rbx, r9
 	movdqu	xmm8, [r10]
-	mov	r8, STKARG(1)
-	mov	r9, STKARG(2)
 	mov	r10, r8
+	mov	r8d, STKARG(1)
+	mov	r9, STKARG(2)
   .endif
 #endif
 
@@ -1550,6 +1548,16 @@ FUNC(test_mul4)
 	testepilogue
 ENDFUNC
 
+FUNC(test_mul4zc)
+	testprologue smul
+	testldcarry
+	testtop nil
+	call	mul4zc
+	testtail
+	testcarryout
+	testepilogue
+ENDFUNC
+
 FUNC(test_mla4)
 	testprologue smul
 	testldcarry
@@ -1560,6 +1568,16 @@ FUNC(test_mla4)
 	testepilogue
 ENDFUNC
 
+FUNC(test_mla4zc)
+	testprologue smul
+	testldcarry
+	testtop nil
+	call	mla4zc
+	testtail
+	testcarryout
+	testepilogue
+ENDFUNC
+
 FUNC(test_mmul4)
 	testprologue mmul
 	testtop r11
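
Note on the SHUF hunks above: every use of SHUF has its arguments reversed while the result comments stay the same, which is consistent with the SHUF helper in asm-common.h (included by this file, but not shown in this diff) switching from the x86 immediate convention (most significant lane first, as in Intel's _MM_SHUFFLE) to listing lanes in result order (least significant lane first). That reading is an inference from this diff, not something the diff states; the C sketch below only checks that, under those assumed conventions, the old and new spellings from the `mulcore' hunk encode the same pshufd immediate. The names OLD_SHUF and NEW_SHUF are made up for illustration.

#include <stdio.h>

/* pshufd's immediate packs four 2-bit source-lane selectors: bits 1:0
 * choose the source dword copied to destination dword 0, bits 3:2 the
 * source for dword 1, and so on. */

/* Assumed old convention: arguments most significant lane first, like
 * Intel's _MM_SHUFFLE(d, c, b, a). */
#define OLD_SHUF(d, c, b, a) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a))

/* Assumed new convention: arguments in result order, so NEW_SHUF(a, b, c, d)
 * sends source dword a to destination dword 0, b to 1, c to 2, d to 3. */
#define NEW_SHUF(a, b, c, d) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a))

int main(void)
{
	int i;

	/* The `mulcore' hunk: SHUF(3, i, 3, i) became SHUF(i, 3, i, 3).
	 * Under the assumed macro definitions both spellings yield the same
	 * immediate for every lane index i, so the emitted pshufd
	 * instructions would be unchanged. */
	for (i = 0; i < 4; i++)
		printf("i = %d: old = %#04x, new = %#04x\n",
		       i, OLD_SHUF(3, i, 3, i), NEW_SHUF(i, 3, i, 3));
	return (0);
}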