X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/866fb27f61c9de2b09d8febad41286fc3f953a4e..a117c06f5ee62cbe7812769703eada01843f76ca:/math/mpx-mul4-amd64-sse2.S diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S index 8b8cd414..64460ca9 100644 --- a/math/mpx-mul4-amd64-sse2.S +++ b/math/mpx-mul4-amd64-sse2.S @@ -96,7 +96,7 @@ .macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil // Multiply R_I by the expanded operand SLO/SHI, and leave the pieces // of the product in registers D0, D1, D2, D3. - pshufd \d0, \r, SHUF(3, \i, 3, \i) // (r_i, ?; r_i, ?) + pshufd \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?) .ifnes "\d1", "nil" movdqa \d1, \slo // (s'_0, s'_1; s''_0, s''_1) .endif @@ -163,7 +163,7 @@ // lane 0 or 1 of D; the high two lanes of D are clobbered. On // completion, XMM3 is clobbered. If CC is `nil', then the // contribution which would have been added to it is left in C. - pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?; ?, t = c'' mod B) + pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B) psrldq xmm3, 12 // (t, 0; 0, 0) = (t; 0) pslldq xmm3, 2 // (t b; 0) paddq \c, xmm3 // (c' + t b; c'') @@ -209,11 +209,11 @@ punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1) punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3) .endif - pshufd \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1; a''_0, a''_1) - pshufd \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3; a''_2, a''_3) + pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1) + pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3) .ifnes "\c", "nil" - pshufd \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1; c''_0, c''_1) - pshufd \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3; c''_2, c''_3) + pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1) + pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3) .endif .endm