.macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
// Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
// of the product in registers D0, D1, D2, D3.
.macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
// Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
// of the product in registers D0, D1, D2, D3.
- pshufd \d0, \r, SHUF(3, \i, 3, \i) // (r_i, ?; r_i, ?)
+ pshufd \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?)
.ifnes "\d1", "nil"
movdqa \d1, \slo // (s'_0, s'_1; s''_0, s''_1)
.endif
.ifnes "\d1", "nil"
movdqa \d1, \slo // (s'_0, s'_1; s''_0, s''_1)
.endif
// lane 0 or 1 of D; the high two lanes of D are clobbered. On
// completion, XMM3 is clobbered. If CC is `nil', then the
// contribution which would have been added to it is left in C.
// lane 0 or 1 of D; the high two lanes of D are clobbered. On
// completion, XMM3 is clobbered. If CC is `nil', then the
// contribution which would have been added to it is left in C.
- pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?; ?, t = c'' mod B)
+ pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
psrldq xmm3, 12 // (t, 0; 0, 0) = (t; 0)
pslldq xmm3, 2 // (t b; 0)
paddq \c, xmm3 // (c' + t b; c'')
psrldq xmm3, 12 // (t, 0; 0, 0) = (t; 0)
pslldq xmm3, 2 // (t b; 0)
paddq \c, xmm3 // (c' + t b; c'')
punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
.endif
punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
.endif
- pshufd \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1; a''_0, a''_1)
- pshufd \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3; a''_2, a''_3)
+ pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
+ pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
- pshufd \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1; c''_0, c''_1)
- pshufd \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3; c''_2, c''_3)
+ pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
+ pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
mulcore xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
propout xmm7, lo, xmm12, xmm13
jmp 5f
mulcore xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
propout xmm7, lo, xmm12, xmm13
jmp 5f