/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
/// operands, as follows.
///
/// A `pmuludqd' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
.macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
// Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
// of the product in registers D0, D1, D2, D3.
- pshufd \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?)
+ pshufd \d0, \r, SHUF(3, \i, 3, \i) // (?, r_i; ?, r_i)
- movdqa \d1, \slo // (s'_0, s'_1; s''_0, s''_1)
+ movdqa \d1, \slo // (s''_1, s''_0; s'_1, s'_0)
- movdqa \d3, \shi // (s'_2, s'_3; s''_2, s''_3)
+ movdqa \d3, \shi // (s''_3, s''_2; s'_3, s'_2)
- psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0)
+ psrldq \d1, 4 // (0, s''_1; s''_0, s'_1)
- movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?)
+ movdqa \d2, \d0 // another copy of (?, r_i; ?, r_i)
- psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0)
+ psrldq \d3, 4 // (0, s''_3; s''_2, s'_3)
.endm
.macro accum c0, c1=nil, c2=nil, c3=nil
// lane 0 or 1 of D; the high two lanes of D are clobbered. On
// completion, XMM3 is clobbered. If CC is `nil', then the
// contribution which would have been added to it is left in C.
- pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
- psrldq xmm3, 12 // (t, 0; 0, 0) = (t; 0)
- pslldq xmm3, 2 // (t b; 0)
- paddq \c, xmm3 // (c' + t b; c'')
+ pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (t = c'' mod B, ?; ?, ?)
+ psrldq xmm3, 12 // (0, 0; 0, t) = (0; t)
+ pslldq xmm3, 2 // (0; t b)
+ paddq \c, xmm3 // (c''; c' + t b)
// of the value represented in C are written at POS in D, and the
// remaining bits are left at the bottom of T.
movdqa \t, \c
- psllq \t, 16 // (?; c'' b)
- pslldq \c, 8 // (0; c')
- paddq \t, \c // (?; c' + c'' b)
- psrldq \t, 8 // (c' + c'' b; 0) = (c; 0)
+ psllq \t, 16 // (c'' b; ?)
+ pslldq \c, 8 // (c'; 0)
+ paddq \t, \c // (c' + c'' b; ?)
+ psrldq \t, 8 // (0; c' + c'' b) = (0; c)
// On entry, A and C hold packed 128-bit values, and Z is zero. On
// exit, A:B and C:D together hold the same values in expanded
// form. If C is `nil', then only expand A to A:B.
- movdqa \b, \a // (a_0, a_1; a_2, a_3)
+ movdqa \b, \a // (a_3, a_2; a_1, a_0)
- movdqa \d, \c // (c_0, c_1; c_2, c_3)
+ movdqa \d, \c // (c_3, c_2; c_1, c_0)
- punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1)
- punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3)
+ punpcklwd \a, \z // (a''_1, a'_1; a''_0, a'_0)
+ punpckhwd \b, \z // (a''_3, a'_3; a''_2, a'_2)
- punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
- punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
+ punpcklwd \c, \z // (c''_1, c'_1; c''_0, c'_0)
+ punpckhwd \d, \z // (c''_3, c'_3; c''_2, c'_2)
- pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
- pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
+ pshufd \a, \a, SHUF(3, 1, 2, 0) // (a''_1, a''_0; a'_1, a'_0)
+ pshufd \b, \b, SHUF(3, 1, 2, 0) // (a''_3, a''_2; a'_3, a'_2)
- pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
- pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
+ pshufd \c, \c, SHUF(3, 1, 2, 0) // (c''_1, c''_0; c'_1, c'_0)
+ pshufd \d, \d, SHUF(3, 1, 2, 0) // (c''_3, c''_2; c'_3, c'_2)
- punpcklqdq \t, \c2 // (y'_0; y'_2)
- punpckhqdq \c0, \c2 // (y''_0; y''_2)
- punpcklqdq \u, \c3 // (y'_1; y'_3)
- punpckhqdq \c1, \c3 // (y''_1; y''_3)
+ punpcklqdq \t, \c2 // (y'_2; y'_0)
+ punpckhqdq \c0, \c2 // (y''_2; y''_0)
+ punpcklqdq \u, \c3 // (y'_3; y'_1)
+ punpckhqdq \c1, \c3 // (y''_3; y''_1)
// Now split the double-prime pieces. The high (up to) 48 bits will
// go up; the low 16 bits go down.
- psrlq \c0, 16 // high parts of (y''_0; y''_2)
- psrlq \c1, 16 // high parts of (y''_1; y''_3)
- psrlq \c2, 32 // low parts of (y''_0; y''_2)
- psrlq \c3, 32 // low parts of (y''_1; y''_3)
+ psrlq \c0, 16 // high parts of (y''_2; y''_0)
+ psrlq \c1, 16 // high parts of (y''_3; y''_1)
+ psrlq \c2, 32 // low parts of (y''_2; y''_0)
+ psrlq \c3, 32 // low parts of (y''_3; y''_1)
- paddq \t, \c1 // and up: (y_0; y_2)
- paddq \u, \c0 // (y_1; y_3)
+ paddq \t, \c1 // and up: (y_2; y_0)
+ paddq \u, \c0 // (y_3; y_1)
.endif
// Finally extract the answer. This complicated dance is better than
// storing to memory and loading, because the piecemeal stores
// inhibit store forwarding.
- movdqa \c3, \t // (y_0; ?)
- movdqa \lo, \t // (y^*_0, ?; ?, ?)
- psrldq \t, 8 // (y_2; 0)
+ movdqa \c3, \t // (?; y_0)
+ movdqa \lo, \t // (?, ?; ?, y^*_0)
+ psrldq \t, 8 // (0; y_2)
psrlq \c3, 32 // (floor(y_0/B); ?)
paddq \c3, \u // (y_1 + floor(y_0/B); ?)
- movdqa \c1, \c3 // (y^*_1, ?; ?, ?)
- psrldq \u, 8 // (y_3; 0)
+ movdqa \c1, \c3 // (?, ?; ?, y^*_1)
+ psrldq \u, 8 // (0; y_3)
psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2); ?)
paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2); ?)
- punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?)
+ punpckldq \lo, \c3 // (?, ?; y^*_2, y^*_0)
psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
.ifnes "\hi", "nil"
movdqa \t, \c3
pxor \u, \u
.endif
- punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?)
+ punpckldq \c1, \c3 // (?, ?; y^*_3, y^*_1)
// On exit, the carry registers, including XMM15, are updated to hold
// C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
// registers are preserved.
- movd xmm0, [rdi + 0] // (a_0; 0)
- movd xmm1, [rdi + 4] // (a_1; 0)
- movd xmm2, [rdi + 8] // (a_2; 0)
- movd xmm15, [rdi + 12] // (a_3; 0)
- paddq xmm12, xmm0 // (c'_0 + a_0; c''_0)
- paddq xmm13, xmm1 // (c'_1 + a_1; c''_1)
- paddq xmm14, xmm2 // (c'_2 + a_2; c''_2 + a_3 b)
+ movd xmm0, [rdi + 0] // (0; a_0)
+ movd xmm1, [rdi + 4] // (0; a_1)
+ movd xmm2, [rdi + 8] // (0; a_2)
+ movd xmm15, [rdi + 12] // (0; a_3)
+ paddq xmm12, xmm0 // (c''_0; c'_0 + a_0)
+ paddq xmm13, xmm1 // (c''_1; c'_1 + a_1)
+ paddq xmm14, xmm2 // (c''_2 + a_3 b; c'_2 + a_2)
mulcore xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2
accum xmm4, xmm5, xmm6
- punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0)
- punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0)
+ punpckldq xmm12, xmm15 // (0, w_1; 0, w_0)
+ punpckhdq xmm14, xmm15 // (0, w_3; 0, w_2)
mulcore xmm7, 2, xmm10, xmm11, xmm0, xmm1
accum xmm5, xmm6
mulcore xmm7, 3, xmm10, xmm11, xmm0
accum xmm6
- punpckldq xmm12, xmm2 // (w_0, 0; 0, 0)
- punpckldq xmm14, xmm2 // (w_2, 0; 0, 0)
- punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0)
- punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0)
+ punpckldq xmm12, xmm2 // (0, 0; 0, w_0)
+ punpckldq xmm14, xmm2 // (0, 0; 0, w_2)
+ punpckhdq xmm13, xmm2 // (0, 0; 0, w_1)
+ punpckhdq xmm15, xmm2 // (0, 0; 0, w_3)
// That's lots of pieces. Now we have to assemble the answer.
squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
mulcore xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2
accum xmm4, xmm5, xmm6
- punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0)
- punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0)
+ punpckldq xmm12, xmm15 // (0, w_1; 0, w_0)
+ punpckhdq xmm14, xmm15 // (0, w_3; 0, w_2)
mulcore xmm7, 2, xmm8, xmm9, xmm0, xmm1
accum xmm5, xmm6
mulcore xmm7, 3, xmm8, xmm9, xmm0
accum xmm6
- punpckldq xmm12, xmm2 // (w_0, 0; 0, 0)
- punpckldq xmm14, xmm2 // (w_2, 0; 0, 0)
- punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0)
- punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0)
+ punpckldq xmm12, xmm2 // (0, 0; 0, w_0)
+ punpckldq xmm14, xmm2 // (0, 0; 0, w_2)
+ punpckhdq xmm13, xmm2 // (0, 0; 0, w_1)
+ punpckhdq xmm15, xmm2 // (0, 0; 0, w_3)
// That's lots of pieces. Now we have to assemble the answer.
squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
- movdqu xmm12, [rcx + 0] // (c'_0; c''_0)
- movdqu xmm13, [rcx + 16] // (c'_1; c''_1)
- movdqu xmm14, [rcx + 32] // (c'_2; c''_2)
+ movdqu xmm12, [rcx + 0] // (c''_0; c'_0)
+ movdqu xmm13, [rcx + 16] // (c''_1; c'_1)
+ movdqu xmm14, [rcx + 32] // (c''_2; c'_2)
- pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
- pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
+ pshufd xmm10, xmm10, SHUF(3, 1, 2, 0)
+ pshufd xmm11, xmm11, SHUF(3, 1, 2, 0)
- pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
- pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
+ pshufd xmm10, xmm10, SHUF(3, 1, 2, 0)
+ pshufd xmm11, xmm11, SHUF(3, 1, 2, 0)
- pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
- pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
+ pshufd xmm10, xmm10, SHUF(3, 1, 2, 0)
+ pshufd xmm11, xmm11, SHUF(3, 1, 2, 0)