X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/8e91d6e5b9c82efb626869d6618b240d2ae8ad05..8c5956c14f5834a072e1a9345ae1f356b14164ca:/math/mpx-mul4-x86-sse2.S diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S index 591a7a8f..904c0d0a 100644 --- a/math/mpx-mul4-x86-sse2.S +++ b/math/mpx-mul4-x86-sse2.S @@ -24,15 +24,13 @@ /// MA 02111-1307, USA. ///-------------------------------------------------------------------------- -/// External definitions. +/// Preliminaries. #include "config.h" #include "asm-common.h" -///-------------------------------------------------------------------------- -/// Prologue. - .arch pentium4 + .text ///-------------------------------------------------------------------------- @@ -103,7 +101,7 @@ .ifnes "\d3", "nil" movdqa \d3, [\s + 16] // (s'_2, s'_3; s''_2, s''_3) .endif - pshufd \d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?; r_i, ?) + pshufd \d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?) .ifnes "\d1", "nil" psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0) .endif @@ -171,7 +169,7 @@ // carry registers. On completion, XMM3 is clobbered. If CC is // `nil', then the contribution which would have been added to it is // left in C. - pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?; ?, t = c'' mod B) + pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B) psrldq xmm3, 12 // (t, 0; 0, 0) = (t, 0) pslldq xmm3, 2 // (t b; 0) paddq \c, xmm3 // (c' + t b; c'') @@ -209,11 +207,11 @@ punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1) punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3) .endif - pshufd \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1; a''_0, a''_1) - pshufd \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3; a''_2, a''_3) + pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1) + pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3) .ifnes "\c", "nil" - pshufd \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1; c''_0, c''_1) - pshufd \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3; c''_2, c''_3) + pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1) + pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3) .endif .endm @@ -707,7 +705,7 @@ FUNC(mpx_umul4_x86_sse2) pushreg ebx pushreg esi pushreg edi - setfp ebp + setfp and esp, ~15 sub esp, 32 endprologue @@ -822,7 +820,7 @@ FUNC(mpxmont_mul4_x86_sse2) pushreg ebx pushreg esi pushreg edi - setfp ebp + setfp and esp, ~15 sub esp, 112 endprologue @@ -968,7 +966,7 @@ FUNC(mpxmont_redc4_x86_sse2) pushreg ebx pushreg esi pushreg edi - setfp ebp + setfp and esp, ~15 sub esp, 76 endprologue @@ -1097,7 +1095,7 @@ ENDFUNC pushreg ebx pushreg esi pushreg edi - setfp ebp + setfp and esp, ~15 sub esp, 3*32 + 4*4 endprologue @@ -1214,6 +1212,18 @@ FUNC(test_mul4) testepilogue ENDFUNC +FUNC(test_mul4zc) + testprologue [ebp + 36] + testldcarry [ebp + 24] + testexpand nil, [ebp + 32] + mov edi, [ebp + 20] + testtop nil, [ebp + 28] + call mul4zc + testtail [ebp + 40] + testcarryout [ebp + 24] + testepilogue +ENDFUNC + FUNC(test_mla4) testprologue [ebp + 36] testldcarry [ebp + 24] @@ -1226,6 +1236,18 @@ FUNC(test_mla4) testepilogue ENDFUNC +FUNC(test_mla4zc) + testprologue [ebp + 36] + testldcarry [ebp + 24] + testexpand nil, [ebp + 32] + mov edi, [ebp + 20] + testtop nil, [ebp + 28] + call mla4zc + testtail [ebp + 40] + testcarryout [ebp + 24] + testepilogue +ENDFUNC + FUNC(test_mmul4) testprologue [ebp + 48] testexpand [ebp + 40], [ebp + 44]