/// 0 v'_0 v'_1 v''_0 v''_1
/// 16 v'_2 v'_3 v''_2 v''_3
///
-/// A `pmuludqd' instruction ignores the odd positions in its operands; thus,
+/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
/// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
-/// `pmuluqdq' instruction acting on a scalar operand (broadcast across all
+/// `pmuludq' instruction acting on a scalar operand (broadcast across all
/// lanes of its vector) and an operand in the expanded form above produces a
/// result which can be added directly to the appropriate carry register.
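///
/// As a minimal plain-C sketch of this representation (illustrative only:
/// `expand_model' is a made-up name, and b = 2^16, B = 2^32 are read off
/// the zero-padded 16-bit halves in the layout table above), the expanded
/// form of an operand v = v_0 + v_1 B + v_2 B^2 + v_3 B^3 can be built as
/// follows; the even 32-bit lanes of each 16-byte row are then exactly the
/// inputs `pmuludq' consumes.

#include <stdint.h>

/* Sketch only: split each 32-bit piece as v_i = v'_i + v''_i b with
 * b = 2^16, and zero-pad the 16-bit halves to 32 bits in the layout shown
 * in the table above. */
static void expand_model(const uint32_t v[4], uint32_t out[8])
{
	/* row at offset 0:  v'_0, v'_1, v''_0, v''_1 */
	out[0] = v[0] & 0xffff;  out[1] = v[1] & 0xffff;
	out[2] = v[0] >> 16;     out[3] = v[1] >> 16;

	/* row at offset 16: v'_2, v'_3, v''_2, v''_3 */
	out[4] = v[2] & 0xffff;  out[5] = v[3] & 0xffff;
	out[6] = v[2] >> 16;     out[7] = v[3] >> 16;
}
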
/// Following a pass of four multiplications, we perform some limited carry
psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0)
.endif
.ifnes "\d1", "nil"
- pmuludqd \d1, \d0 // (r_i s'_1, r_i s''_1)
+ pmuludq \d1, \d0 // (r_i s'_1, r_i s''_1)
.endif
.ifnes "\d3", "nil"
- pmuludqd \d3, \d0 // (r_i s'_3, r_i s''_3)
+ pmuludq \d3, \d0 // (r_i s'_3, r_i s''_3)
.endif
.ifnes "\d2", "nil"
.ifnes "\d3", "nil"
- pmuludqd \d2, \d0 // (r_i s'_2, r_i s''_2)
+ pmuludq \d2, \d0 // (r_i s'_2, r_i s''_2)
.else
- pmuludqd \d2, [\s + 16]
+ pmuludq \d2, [\s + 16] // (r_i s'_2, r_i s''_2)
.endif
.endif
- pmuludqd \d0, [\s] // (r_i s'_0, r_i s''_0)
+ pmuludq \d0, [\s] // (r_i s'_0, r_i s''_0)
.endm
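
/// As a self-contained sketch of the pass above in C, using the intrinsics
/// that correspond to these instructions (`_mm_mul_epu32' is `pmuludq',
/// `_mm_srli_si128' is `psrldq'); the name `mulcore_model' and the
/// in-memory argument layout are assumptions for the example, not part of
/// this code.  Each result is a pair of 48-bit products in 64-bit fields,
/// so it can be added straight onto a carry register with `paddq'
/// (`_mm_add_epi64').

#include <emmintrin.h>
#include <stdint.h>

/* Sketch only: multiply the 32-bit scalar r by an operand s held in the
 * expanded form above (eight zero-padded 16-bit pieces) and return the
 * four product vectors. */
static void mulcore_model(uint32_t r, const uint32_t s[8], __m128i d[4])
{
	__m128i ri = _mm_set1_epi32((int)r);	/* r in every 32-bit lane */
	__m128i lo = _mm_loadu_si128((const __m128i *)(s + 0)); /* (s'_0, s'_1, s''_0, s''_1) */
	__m128i hi = _mm_loadu_si128((const __m128i *)(s + 4)); /* (s'_2, s'_3, s''_2, s''_3) */

	/* pmuludq reads only the even 32-bit lanes, so each multiply yields
	 * one prime and one double-prime product; shifting right by 4 bytes
	 * first exposes the odd-numbered pieces. */
	d[0] = _mm_mul_epu32(lo, ri);				/* (r s'_0, r s''_0) */
	d[1] = _mm_mul_epu32(_mm_srli_si128(lo, 4), ri);	/* (r s'_1, r s''_1) */
	d[2] = _mm_mul_epu32(hi, ri);				/* (r s'_2, r s''_2) */
	d[3] = _mm_mul_epu32(_mm_srli_si128(hi, 4), ri);	/* (r s'_3, r s''_3) */
}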
.macro accum c0, c1, c2, c3
.endif
.endm
-.macro squash c0, c1, c2, c3, h, t, u
+.macro squash lo, hi, c0, c1, c2, c3, t, u
// On entry, C0, C1, C2, C3 are carry registers representing a value
- // Y. On exit, C0 holds the low 128 bits of the carry value; C1, C2,
+ // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
// C3, T, and U are clobbered; and the high bits of Y are stored in
- // H, if this is not `nil'.
+ // HI, if this is not `nil'.
// The first step is to eliminate the `double-prime' pieces -- i.e.,
// the ones offset by 16 bits from a 32-bit boundary -- by carrying
psrlq \c1, 16 // high parts of (y''_1, y''_3)
psrlq \c2, 32 // low parts of (y''_0, y''_2)
psrlq \c3, 32 // low parts of (y''_1, y''_3)
- .ifnes "\h", "nil"
- movdqa \h, \c1
+ .ifnes "\hi", "nil"
+ movdqa \hi, \c1
.endif
pslldq \c1, 8 // high part of (0, y''_1)
paddq \u, \c3
paddq \t, \c1 // and up: (y_0, y_2)
paddq \u, \c0 // (y_1, y_3)
- .ifnes "\h", "nil"
- psrldq \h, 8 // high part of (y''_3, 0)
+ .ifnes "\hi", "nil"
+ psrldq \hi, 8 // high part of (y''_3, 0)
.endif
// Finally extract the answer. This complicated dance is better than
// storing to memory and loading, because the piecemeal stores
// inhibit store forwarding.
movdqa \c3, \t // (y_0, y_1)
- movdqa \c0, \t // (y^*_0, ?, ?, ?)
+ movdqa \lo, \t // (y^*_0, ?, ?, ?)
psrldq \t, 8 // (y_2, 0)
psrlq \c3, 32 // (floor(y_0/B), ?)
paddq \c3, \u // (y_1 + floor(y_0/B), ?)
- pslldq \c0, 12 // (0, 0, 0, y^*_0)
movdqa \c1, \c3 // (y^*_1, ?, ?, ?)
psrldq \u, 8 // (y_3, 0)
psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2), ?)
paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2), ?)
- pslldq \c1, 12 // (0, 0, 0, y^*_1)
- psrldq \c0, 12 // (y^*_0, 0, 0, 0)
- movdqa \c2, \c3 // (y^*_2, ?, ?, ?)
+ punpckldq \lo, \c3 // (y^*_0, y^*_2, ?, ?)
psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
- pslldq \c2, 12 // (0, 0, 0, y^*_2)
- psrldq \c1, 8 // (0, y^*_1, 0, 0)
- psrldq \c2, 4 // (0, 0, y^*_2, 0)
- .ifnes "\h", "nil"
- movdqu \t, \c3
+ .ifnes "\hi", "nil"
+ movdqa \t, \c3
pxor \u, \u
.endif
- pslldq \c3, 12 // (0, 0, 0, y^*_3)
- por \c0, \c1 // (y^*_0, y^*_1, 0, 0)
- por \c2, \c3 // (0, 0, y^*_2, y^*_3)
- por \c0, \c2 // y mod B^4
- .ifnes "\h", "nil"
+ punpckldq \c1, \c3 // (y^*_1, y^*_3, ?, ?)
+ .ifnes "\hi", "nil"
psrlq \t, 32 // very high bits of y
- paddq \h, \t
- punpcklqdq \h, \u // carry up
+ paddq \hi, \t
+ punpcklqdq \hi, \u // carry up
.endif
+ punpckldq \lo, \c1 // y mod B^4
.endm
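
/// As a plain-C reference model of what `squash' computes (illustrative
/// only: `squash_model' is a made-up name, and the GCC/Clang `unsigned
/// __int128' extension is used for brevity).  Carry register i contributes
/// c'_i + c''_i 2^16 at weight 2^(32 i); the macro reduces that sum to the
/// four 32-bit words y^*_0, ..., y^*_3 that end up in LO, plus the high
/// bits that go to HI.

#include <stdint.h>

/* Sketch only: clo[i] and chi[i] are the low and high 64-bit halves of
 * carry register i, i.e. c'_i and c''_i.  Writes the low 128 bits of
 * Y = sum_i (c'_i + c''_i 2^16) 2^(32 i) into y[0..3] and returns the
 * remaining high bits of Y. */
static uint64_t squash_model(const uint64_t clo[4], const uint64_t chi[4],
			     uint32_t y[4])
{
	unsigned __int128 acc = 0;	/* running carry between positions */

	for (int i = 0; i < 4; i++) {
		acc += clo[i];
		acc += (unsigned __int128)chi[i] << 16;
		y[i] = (uint32_t)acc;	/* low 32 bits are final: y^*_i */
		acc >>= 32;		/* carry the rest up one position */
	}
	return (uint64_t)acc;		/* high bits of Y */
}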
.macro carryadd
movd xmm1, [edi + 4] // (a_1, 0)
movd xmm2, [edi + 8] // (a_2, 0)
movd xmm7, [edi + 12] // (a_3, 0)
+
paddq xmm4, xmm0 // (c'_0 + a_0, c''_0)
paddq xmm5, xmm1 // (c'_1 + a_1, c''_1)
paddq xmm6, xmm2 // (c'_2 + a_2, c''_2)
accum xmm7, nil, nil, nil
// That's lots of pieces. Now we have to assemble the answer.
- squash xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
+ squash xmm4, nil, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1
// Expand it.
pxor xmm2, xmm2
accum xmm7, nil, nil, nil
// That's lots of pieces. Now we have to assemble the answer.
- squash xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
+ squash xmm4, nil, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1
// Expand it.
pxor xmm2, xmm2