~mdw
/
catacomb
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
math/mpx-mul4-x86-sse2.S: Additional piece of commentary.
[catacomb]
/
math
/
mpx-mul4-x86-sse2.S
diff --git
a/math/mpx-mul4-x86-sse2.S
b/math/mpx-mul4-x86-sse2.S
index
ab23886
..
5d80860
100644
(file)
--- a/
math/mpx-mul4-x86-sse2.S
+++ b/
math/mpx-mul4-x86-sse2.S
@@
-64,7
+64,7
@@
/// 0 v'_0 v'_1 v''_0 v''_1
/// 16 v'_2 v'_3 v''_2 v''_3
///
/// 0 v'_0 v'_1 v''_0 v''_1
/// 16 v'_2 v'_3 v''_2 v''_3
///
-/// A `pmuludqd' instruction ignores the odd positions in its operands; thus,
+/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
/// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
/// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
/// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
@@
-81,7
+81,7
@@
/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
-/// `pmuluqdq' instruction acting on a scalar operand (broadcast across all
+/// `pmuludq' instruction acting on a scalar operand (broadcast across all
/// lanes of its vector) and an operand in the expanded form above produces a
/// result which can be added directly to the appropriate carry register.
/// Following a pass of four multiplications, we perform some limited carry
/// lanes of its vector) and an operand in the expanded form above produces a
/// result which can be added directly to the appropriate carry register.
/// Following a pass of four multiplications, we perform some limited carry
@@
-93,7
+93,7
@@
///--------------------------------------------------------------------------
/// Macro definitions.
///--------------------------------------------------------------------------
/// Macro definitions.
-.macro mulcore r, s, d0, d1
, d2, d3
+.macro mulcore r, s, d0, d1
=nil, d2=nil, d3=nil
// Load a word r_i from R, multiply by the expanded operand [S], and
// leave the pieces of the product in registers D0, D1, D2, D3.
movd \d0, \r // (r_i, 0, 0, 0)
// Load a word r_i from R, multiply by the expanded operand [S], and
// leave the pieces of the product in registers D0, D1, D2, D3.
movd \d0, \r // (r_i, 0, 0, 0)
@@
-118,22
+118,25
@@
psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0)
.endif
.ifnes "\d1", "nil"
psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0)
.endif
.ifnes "\d1", "nil"
- pmuludq
d \d1, \d0
// (r_i s'_1, r_i s''_1)
+ pmuludq
\d1, \d0
// (r_i s'_1, r_i s''_1)
.endif
.ifnes "\d3", "nil"
.endif
.ifnes "\d3", "nil"
- pmuludq
d \d3, \d0
// (r_i s'_3, r_i s''_3)
+ pmuludq
\d3, \d0
// (r_i s'_3, r_i s''_3)
.endif
.ifnes "\d2", "nil"
.ifnes "\d3", "nil"
.endif
.ifnes "\d2", "nil"
.ifnes "\d3", "nil"
- pmuludq
d \d2, \d0
// (r_i s'_2, r_i s''_2)
+ pmuludq
\d2, \d0
// (r_i s'_2, r_i s''_2)
.else
.else
- pmuludq
d
\d2, [\s + 16]
+ pmuludq \d2, [\s + 16]
.endif
.endif
.endif
.endif
- pmuludq
d \d0, [\s]
// (r_i s'_0, r_i s''_0)
+ pmuludq
\d0, [\s]
// (r_i s'_0, r_i s''_0)
.endm
.endm
-.macro accum c0, c1, c2, c3
+.macro accum c0, c1=nil, c2=nil, c3=nil
+ // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
+ // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
+ // updating that register.
paddq \c0, xmm0
.ifnes "\c1", "nil"
paddq \c1, xmm1
paddq \c0, xmm0
.ifnes "\c1", "nil"
paddq \c1, xmm1
@@
-146,7
+149,7
@@
.endif
.endm
.endif
.endm
-.macro mulacc r, s, c0, c1, c2, c3, z3p
+.macro mulacc r, s, c0, c1, c2, c3, z3p
=nil
// Load a word r_i from R, multiply by the expanded operand [S],
// and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t'
// then C3 notionally contains zero, but needs clearing; in practice,
// Load a word r_i from R, multiply by the expanded operand [S],
// and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t'
// then C3 notionally contains zero, but needs clearing; in practice,
@@
-155,14
+158,14
@@
// is not `t'.
.ifeqs "\z3p", "t"
mulcore \r, \s, xmm0, xmm1, xmm2, \c3
// is not `t'.
.ifeqs "\z3p", "t"
mulcore \r, \s, xmm0, xmm1, xmm2, \c3
- accum \c0, \c1, \c2
, nil
+ accum \c0, \c1, \c2
.else
mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
accum \c0, \c1, \c2, \c3
.endif
.endm
.else
mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
accum \c0, \c1, \c2, \c3
.endif
.endm
-.macro propout d, c, cc
+.macro propout d, c, cc
=nil
// Calculate an output word from C, and store it in D; propagate
// carries out from C to CC in preparation for a rotation of the
// carry registers. On completion, XMM3 is clobbered. If CC is
// Calculate an output word from C, and store it in D; propagate
// carries out from C to CC in preparation for a rotation of the
// carry registers. On completion, XMM3 is clobbered. If CC is
@@
-192,7
+195,7
@@
psrldq \t, 4 // floor((c' + c'' b)/B)
.endm
psrldq \t, 4 // floor((c' + c'' b)/B)
.endm
-.macro expand
a, b, c, d, z
+.macro expand
z, a, b, c=nil, d=nil
// On entry, A and C hold packed 128-bit values, and Z is zero. On
// exit, A:B and C:D together hold the same values in expanded
// form. If C is `nil', then only expand A to A:B.
// On entry, A and C hold packed 128-bit values, and Z is zero. On
// exit, A:B and C:D together hold the same values in expanded
// form. If C is `nil', then only expand A to A:B.
@@
-214,11
+217,11
@@
.endif
.endm
.endif
.endm
-.macro squash c0, c1, c2, c3,
h, t, u
+.macro squash c0, c1, c2, c3,
t, u, lo, hi=nil
// On entry, C0, C1, C2, C3 are carry registers representing a value
// On entry, C0, C1, C2, C3 are carry registers representing a value
- // Y. On exit, C0 holds the low 128 bits of the carry value; C1, C2,
+ // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
// C3, T, and U are clobbered; and the high bits of Y are stored in
// C3, T, and U are clobbered; and the high bits of Y are stored in
- // H, if this is not `nil'.
+ // HI, if this is not `nil'.
// The first step is to eliminate the `double-prime' pieces -- i.e.,
// the ones offset by 16 bytes from a 32-bit boundary -- by carrying
// The first step is to eliminate the `double-prime' pieces -- i.e.,
// the ones offset by 16 bytes from a 32-bit boundary -- by carrying
@@
-241,8
+244,8
@@
psrlq \c1, 16 // high parts of (y''_1, y''_3)
psrlq \c2, 32 // low parts of (y''_0, y''_2)
psrlq \c3, 32 // low parts of (y''_1, y''_3)
psrlq \c1, 16 // high parts of (y''_1, y''_3)
psrlq \c2, 32 // low parts of (y''_0, y''_2)
psrlq \c3, 32 // low parts of (y''_1, y''_3)
- .ifnes "\h", "nil"
- movdqa \h, \c1
+ .ifnes "\h
i
", "nil"
+ movdqa \h
i
, \c1
.endif
pslldq \c1, 8 // high part of (0, y''_1)
.endif
pslldq \c1, 8 // high part of (0, y''_1)
@@
-250,44
+253,36
@@
paddq \u, \c3
paddq \t, \c1 // and up: (y_0, y_2)
paddq \u, \c0 // (y_1, y_3)
paddq \u, \c3
paddq \t, \c1 // and up: (y_0, y_2)
paddq \u, \c0 // (y_1, y_3)
- .ifnes "\h", "nil"
- psrldq \h
, 8
// high part of (y''_3, 0)
+ .ifnes "\h
i
", "nil"
+ psrldq \h
i, 8
// high part of (y''_3, 0)
.endif
// Finally extract the answer. This complicated dance is better than
// storing to memory and loading, because the piecemeal stores
// inhibit store forwarding.
movdqa \c3, \t // (y_0, y_1)
.endif
// Finally extract the answer. This complicated dance is better than
// storing to memory and loading, because the piecemeal stores
// inhibit store forwarding.
movdqa \c3, \t // (y_0, y_1)
- movdqa \
c0
, \t // (y^*_0, ?, ?, ?)
+ movdqa \
lo
, \t // (y^*_0, ?, ?, ?)
psrldq \t, 8 // (y_2, 0)
psrlq \c3, 32 // (floor(y_0/B), ?)
paddq \c3, \u // (y_1 + floor(y_0/B), ?)
psrldq \t, 8 // (y_2, 0)
psrlq \c3, 32 // (floor(y_0/B), ?)
paddq \c3, \u // (y_1 + floor(y_0/B), ?)
- pslldq \c0, 12 // (0, 0, 0, y^*_0)
movdqa \c1, \c3 // (y^*_1, ?, ?, ?)
psrldq \u, 8 // (y_3, 0)
psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2, ?)
paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2, ?)
movdqa \c1, \c3 // (y^*_1, ?, ?, ?)
psrldq \u, 8 // (y_3, 0)
psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2, ?)
paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2, ?)
- pslldq \c1, 12 // (0, 0, 0, y^*_1)
- psrldq \c0, 12 // (y^*_0, 0, 0, 0)
- movdqa \c2, \c3 // (y^*_2, ?, ?, ?)
+ punpckldq \lo, \c3 // (y^*_0, y^*_2, ?, ?)
psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
- pslldq \c2, 12 // (0, 0, 0, y^*_2)
- psrldq \c1, 8 // (0, y^*_1, 0, 0)
- psrldq \c2, 4 // (0, 0, y^*_2, 0)
- .ifnes "\h", "nil"
- movdqu \t, \c3
+ .ifnes "\hi", "nil"
+ movdqa \t, \c3
pxor \u, \u
.endif
pxor \u, \u
.endif
- pslldq \c3, 12 // (0, 0, 0, y^*_3)
- por \c0, \c1 // (y^*_0, y^*_1, 0, 0)
- por \c2, \c3 // (0, 0, y^*_2, y^*_3)
- por \c0, \c2 // y mod B^4
- .ifnes "\h", "nil"
+ punpckldq \c1, \c3 // (y^*_1, y^*_3, ?, ?)
+ .ifnes "\hi", "nil"
psrlq \t, 32 // very high bits of y
psrlq \t, 32 // very high bits of y
- paddq \h, \t
- punpcklqdq \h
, \u
// carry up
+ paddq \h
i
, \t
+ punpcklqdq \h
i, \u
// carry up
.endif
.endif
+ punpckldq \lo, \c1 // y mod B^4
.endm
.macro carryadd
.endm
.macro carryadd
@@
-339,19
+334,19
@@
INTFUNC(dmul4)
endprologue
mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
endprologue
mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
propout [edi + 0], xmm4, xmm5
mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
- mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
, nil
+ mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
propout [edi + 4], xmm5, xmm6
mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
propout [edi + 4], xmm5, xmm6
mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
- mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
, nil
+ mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
propout [edi + 8], xmm6, xmm7
mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
propout [edi + 8], xmm6, xmm7
mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
- mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
, nil
+ mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
propout [edi + 12], xmm7, xmm4
ret
propout [edi + 12], xmm7, xmm4
ret
@@
-374,20
+369,20
@@
INTFUNC(dmla4)
carryadd
carryadd
- mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
, nil
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
, nil
+ mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
propout [edi + 0], xmm4, xmm5
mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
- mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
, nil
+ mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
propout [edi + 4], xmm5, xmm6
mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
propout [edi + 4], xmm5, xmm6
mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
- mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
, nil
+ mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
propout [edi + 8], xmm6, xmm7
mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
propout [edi + 8], xmm6, xmm7
mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
- mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
, nil
+ mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
propout [edi + 12], xmm7, xmm4
ret
propout [edi + 12], xmm7, xmm4
ret
@@
-464,7
+459,7
@@
INTFUNC(mla4zc)
movd xmm6, [edi + 8]
movd xmm7, [edi + 12]
movd xmm6, [edi + 8]
movd xmm7, [edi + 12]
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
@@
-495,7
+490,7
@@
INTFUNC(mla4)
carryadd
carryadd
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
@@
-555,7
+550,10
@@
INTFUNC(mmla4)
movd xmm5, [edi + 4]
movd xmm6, [edi + 8]
movd xmm7, [edi + 12]
movd xmm5, [edi + 4]
movd xmm6, [edi + 8]
movd xmm7, [edi + 12]
- mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
+
+ // Calculate W = U V, and leave it in the destination. Stash the
+ // carry pieces for later.
+ mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
propout [edi + 0], xmm4, xmm5
5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
@@
-574,21
+572,21
@@
INTFUNC(mmla4)
// Calculate Y = W M.
mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
// Calculate Y = W M.
mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
- mulcore [edi + 4], esi, xmm0, xmm1, xmm2
, nil
- accum xmm5, xmm6, xmm7
, nil
+ mulcore [edi + 4], esi, xmm0, xmm1, xmm2
+ accum xmm5, xmm6, xmm7
- mulcore [edi + 8], esi, xmm0, xmm1
, nil, nil
- accum xmm6, xmm7
, nil, nil
+ mulcore [edi + 8], esi, xmm0, xmm1
+ accum xmm6, xmm7
- mulcore [edi + 12], esi, xmm0
, nil, nil, nil
- accum xmm7
, nil, nil, nil
+ mulcore [edi + 12], esi, xmm0
+ accum xmm7
// That's lots of pieces. Now we have to assemble the answer.
// That's lots of pieces. Now we have to assemble the answer.
- squash xmm4, xmm5, xmm6, xmm7,
nil, xmm0, xmm1
+ squash xmm4, xmm5, xmm6, xmm7,
xmm0, xmm1, xmm4
// Expand it.
pxor xmm2, xmm2
// Expand it.
pxor xmm2, xmm2
- expand xmm
4, xmm1, nil, nil, xmm2
+ expand xmm
2, xmm4, xmm1
movdqa [edx + 0], xmm4
movdqa [edx + 16], xmm1
movdqa [edx + 0], xmm4
movdqa [edx + 16], xmm1
@@
-599,7
+597,7
@@
INTFUNC(mmla4)
movd xmm7, [edi + 12]
// Finish the calculation by adding the Montgomery product.
movd xmm7, [edi + 12]
// Finish the calculation by adding the Montgomery product.
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
@@
-637,21
+635,21
@@
INTFUNC(mont4)
// Calculate Y = W M.
mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
// Calculate Y = W M.
mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
- mulcore [edi + 4], esi, xmm0, xmm1, xmm2
, nil
- accum xmm5, xmm6, xmm7
, nil
+ mulcore [edi + 4], esi, xmm0, xmm1, xmm2
+ accum xmm5, xmm6, xmm7
- mulcore [edi + 8], esi, xmm0, xmm1
, nil, nil
- accum xmm6, xmm7
, nil, nil
+ mulcore [edi + 8], esi, xmm0, xmm1
+ accum xmm6, xmm7
- mulcore [edi + 12], esi, xmm0
, nil, nil, nil
- accum xmm7
, nil, nil, nil
+ mulcore [edi + 12], esi, xmm0
+ accum xmm7
// That's lots of pieces. Now we have to assemble the answer.
// That's lots of pieces. Now we have to assemble the answer.
- squash xmm4, xmm5, xmm6, xmm7,
nil, xmm0, xmm1
+ squash xmm4, xmm5, xmm6, xmm7,
xmm0, xmm1, xmm4
// Expand it.
pxor xmm2, xmm2
// Expand it.
pxor xmm2, xmm2
- expand xmm
4, xmm1, nil, nil, xmm2
+ expand xmm
2, xmm4, xmm1
movdqa [edx + 0], xmm4
movdqa [edx + 16], xmm1
movdqa [edx + 0], xmm4
movdqa [edx + 16], xmm1
@@
-662,7
+660,7
@@
INTFUNC(mont4)
movd xmm7, [edi + 12]
// Finish the calculation by adding the Montgomery product.
movd xmm7, [edi + 12]
// Finish the calculation by adding the Montgomery product.
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
@@
-714,7
+712,7
@@
FUNC(mpx_umul4_x86_sse2)
movdqu xmm0, [esi] // bv[0]
mov edi, [ebp + 20] // -> dv[0]
mov ecx, edi // outer loop dv cursor
movdqu xmm0, [esi] // bv[0]
mov edi, [ebp + 20] // -> dv[0]
mov ecx, edi // outer loop dv cursor
- expand xmm
0, xmm1, nil, nil, xmm7
+ expand xmm
7, xmm0, xmm1
mov ebx, [ebp + 24] // -> av[0]
mov eax, [ebp + 28] // -> av[m] = av limit
mov edx, esp // -> expanded Y = bv[0]
mov ebx, [ebp + 24] // -> av[0]
mov eax, [ebp + 28] // -> av[m] = av limit
mov edx, esp // -> expanded Y = bv[0]
@@
-746,7
+744,7
@@
FUNC(mpx_umul4_x86_sse2)
1: movdqu xmm0, [esi] // bv[i]
mov edi, ecx // -> dv[i]
pxor xmm7, xmm7
1: movdqu xmm0, [esi] // bv[i]
mov edi, ecx // -> dv[i]
pxor xmm7, xmm7
- expand xmm
0, xmm1, nil, nil, xmm7
+ expand xmm
7, xmm0, xmm1
mov ebx, [ebp + 24] // -> av[0]
movdqa [esp + 0], xmm0 // bv[i] expanded low
movdqa [esp + 16], xmm1 // bv[i] expanded high
mov ebx, [ebp + 24] // -> av[0]
movdqa [esp + 0], xmm0 // bv[i] expanded low
movdqa [esp + 16], xmm1 // bv[i] expanded high
@@
-822,7
+820,7
@@
FUNC(mpxmont_mul4_x86_sse2)
mov edx, [ebp + 40] // -> mi
movdqu xmm0, [ecx] // bv[0]
movdqu xmm2, [edx] // mi
mov edx, [ebp + 40] // -> mi
movdqu xmm0, [ecx] // bv[0]
movdqu xmm2, [edx] // mi
- expand xmm
0, xmm1, xmm2, xmm3, xmm7
+ expand xmm
7, xmm0, xmm1, xmm2, xmm3
movdqa [esp + 12], xmm0 // bv[0] expanded low
movdqa [esp + 28], xmm1 // bv[0] expanded high
movdqa [esp + 44], xmm2 // mi expanded low
movdqa [esp + 12], xmm0 // bv[0] expanded low
movdqa [esp + 28], xmm1 // bv[0] expanded high
movdqa [esp + 44], xmm2 // mi expanded low
@@
-879,7
+877,7
@@
FUNC(mpxmont_mul4_x86_sse2)
mov ebx, [ebp + 32] // -> X = nv[0]
lea esi, [esp + 44] // -> expanded M = mi
mov eax, [ebp + 24] // -> U = av[0]
mov ebx, [ebp + 32] // -> X = nv[0]
lea esi, [esp + 44] // -> expanded M = mi
mov eax, [ebp + 24] // -> U = av[0]
- expand xmm
0, xmm1, nil, nil, xmm7
+ expand xmm
7, xmm0, xmm1
movdqa [esp + 12], xmm0 // bv[i] expanded low
movdqa [esp + 28], xmm1 // bv[i] expanded high
call mmla4
movdqa [esp + 12], xmm0 // bv[i] expanded low
movdqa [esp + 28], xmm1 // bv[i] expanded high
call mmla4
@@
-962,7
+960,7
@@
FUNC(mpxmont_redc4_x86_sse2)
mov edx, [ebp + 36] // -> mi
movdqu xmm0, [edx] // mi
and eax, ~15 // mask off the tail end
mov edx, [ebp + 36] // -> mi
movdqu xmm0, [edx] // mi
and eax, ~15 // mask off the tail end
- expand xmm
0, xmm1, nil, nil, xmm7
+ expand xmm
7, xmm0, xmm1
add eax, edi // find limit
movdqa [esp + 12], xmm0 // mi expanded low
movdqa [esp + 28], xmm1 // mi expanded high
add eax, edi // find limit
movdqa [esp + 12], xmm0 // mi expanded low
movdqa [esp + 28], xmm1 // mi expanded high
@@
-1105,25
+1103,25
@@
ENDFUNC
movdqu xmm6, [ecx + 32] // (c'_2, c''_2)
.endm
movdqu xmm6, [ecx + 32] // (c'_2, c''_2)
.endm
-.macro testexpand v
, y
+.macro testexpand v
=nil, y=nil
pxor xmm7, xmm7
.ifnes "\v", "nil"
mov ecx, \v
movdqu xmm0, [ecx]
pxor xmm7, xmm7
.ifnes "\v", "nil"
mov ecx, \v
movdqu xmm0, [ecx]
- expand xmm
0, xmm1, nil, nil, xmm7
+ expand xmm
7, xmm0, xmm1
movdqa [esp + 12], xmm0
movdqa [esp + 28], xmm1
.endif
.ifnes "\y", "nil"
mov edx, \y
movdqu xmm2, [edx]
movdqa [esp + 12], xmm0
movdqa [esp + 28], xmm1
.endif
.ifnes "\y", "nil"
mov edx, \y
movdqu xmm2, [edx]
- expand xmm
2, xmm3, nil, nil, xmm7
+ expand xmm
7, xmm2, xmm3
movdqa [esp + 44], xmm2
movdqa [esp + 60], xmm3
.endif
.endm
movdqa [esp + 44], xmm2
movdqa [esp + 60], xmm3
.endif
.endm
-.macro testtop u
, x, mode
+.macro testtop u
=nil, x=nil, mode=nil
.p2align 4
0:
.ifnes "\u", "nil"
.p2align 4
0:
.ifnes "\u", "nil"