~mdw
/
catacomb
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
math/mpx-mul4-*: Test the `...zc' variants too.
[catacomb]
/
math
/
mpx-mul4-x86-sse2.S
diff --git
a/math/mpx-mul4-x86-sse2.S
b/math/mpx-mul4-x86-sse2.S
index
8f69a55
..
baf7cc5
100644
(file)
--- a/
math/mpx-mul4-x86-sse2.S
+++ b/
math/mpx-mul4-x86-sse2.S
@@
-64,7
+64,7
@@
/// 0 v'_0 v'_1 v''_0 v''_1
/// 16 v'_2 v'_3 v''_2 v''_3
///
/// 0 v'_0 v'_1 v''_0 v''_1
/// 16 v'_2 v'_3 v''_2 v''_3
///
-/// A `pmuludq
d
' instruction ignores the odd positions in its operands; thus,
+/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
/// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
/// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
/// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
@@
-81,7
+81,7
@@
/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
-/// `pmuluqd
q
' instruction acting on a scalar operand (broadcast across all
+/// `pmuludq' instruction acting on a scalar operand (broadcast across all
/// lanes of its vector) and an operand in the expanded form above produces a
/// result which can be added directly to the appropriate carry register.
/// Following a pass of four multiplications, we perform some limited carry
/// lanes of its vector) and an operand in the expanded form above produces a
/// result which can be added directly to the appropriate carry register.
/// Following a pass of four multiplications, we perform some limited carry
@@
-93,7
+93,7
@@
///--------------------------------------------------------------------------
/// Macro definitions.
///--------------------------------------------------------------------------
/// Macro definitions.
-.macro mulcore r, s, d0, d1
, d2, d3
+.macro mulcore r, s, d0, d1
=nil, d2=nil, d3=nil
// Load a word r_i from R, multiply by the expanded operand [S], and
// leave the pieces of the product in registers D0, D1, D2, D3.
movd \d0, \r // (r_i, 0, 0, 0)
// Load a word r_i from R, multiply by the expanded operand [S], and
// leave the pieces of the product in registers D0, D1, D2, D3.
movd \d0, \r // (r_i, 0, 0, 0)
@@
-118,22
+118,25
@@
psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0)
.endif
.ifnes "\d1", "nil"
psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0)
.endif
.ifnes "\d1", "nil"
- pmuludq
d \d1, \d0
// (r_i s'_1, r_i s''_1)
+ pmuludq
\d1, \d0
// (r_i s'_1, r_i s''_1)
.endif
.ifnes "\d3", "nil"
.endif
.ifnes "\d3", "nil"
- pmuludq
d \d3, \d0
// (r_i s'_3, r_i s''_3)
+ pmuludq
\d3, \d0
// (r_i s'_3, r_i s''_3)
.endif
.ifnes "\d2", "nil"
.ifnes "\d3", "nil"
.endif
.ifnes "\d2", "nil"
.ifnes "\d3", "nil"
- pmuludq
d \d2, \d0
// (r_i s'_2, r_i s''_2)
+ pmuludq
\d2, \d0
// (r_i s'_2, r_i s''_2)
.else
.else
- pmuludq
d
\d2, [\s + 16]
+ pmuludq \d2, [\s + 16]
.endif
.endif
.endif
.endif
- pmuludq
d \d0, [\s]
// (r_i s'_0, r_i s''_0)
+ pmuludq
\d0, [\s]
// (r_i s'_0, r_i s''_0)
.endm
.endm
-.macro accum c0, c1, c2, c3
+.macro accum c0, c1=nil, c2=nil, c3=nil
+ // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
+ // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
+ // updating that register.
paddq \c0, xmm0
.ifnes "\c1", "nil"
paddq \c1, xmm1
paddq \c0, xmm0
.ifnes "\c1", "nil"
paddq \c1, xmm1
@@
-146,7
+149,7
@@
.endif
.endm
.endif
.endm
-.macro mulacc r, s, c0, c1, c2, c3, z3p
+.macro mulacc r, s, c0, c1, c2, c3, z3p
=nil
// Load a word r_i from R, multiply by the expanded operand [S],
// and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t'
// then C3 notionally contains zero, but needs clearing; in practice,
// Load a word r_i from R, multiply by the expanded operand [S],
// and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t'
// then C3 notionally contains zero, but needs clearing; in practice,
@@
-155,14
+158,14
@@
// is not `t'.
.ifeqs "\z3p", "t"
mulcore \r, \s, xmm0, xmm1, xmm2, \c3
// is not `t'.
.ifeqs "\z3p", "t"
mulcore \r, \s, xmm0, xmm1, xmm2, \c3
- accum \c0, \c1, \c2
, nil
+ accum \c0, \c1, \c2
.else
mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
accum \c0, \c1, \c2, \c3
.endif
.endm
.else
mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
accum \c0, \c1, \c2, \c3
.endif
.endm
-.macro propout d, c, cc
+.macro propout d, c, cc
=nil
// Calculate an output word from C, and store it in D; propagate
// carries out from C to CC in preparation for a rotation of the
// carry registers. On completion, XMM3 is clobbered. If CC is
// Calculate an output word from C, and store it in D; propagate
// carries out from C to CC in preparation for a rotation of the
// carry registers. On completion, XMM3 is clobbered. If CC is
@@
-192,7
+195,7
@@
psrldq \t, 4 // floor((c' + c'' b)/B)
.endm
psrldq \t, 4 // floor((c' + c'' b)/B)
.endm
-.macro expand
a, b, c, d, z
+.macro expand
z, a, b, c=nil, d=nil
// On entry, A and C hold packed 128-bit values, and Z is zero. On
// exit, A:B and C:D together hold the same values in expanded
// form. If C is `nil', then only expand A to A:B.
// On entry, A and C hold packed 128-bit values, and Z is zero. On
// exit, A:B and C:D together hold the same values in expanded
// form. If C is `nil', then only expand A to A:B.
@@
-214,11
+217,11
@@
.endif
.endm
.endif
.endm
-.macro squash c0, c1, c2, c3,
h, t, u
+.macro squash c0, c1, c2, c3,
t, u, lo, hi=nil
// On entry, C0, C1, C2, C3 are carry registers representing a value
// On entry, C0, C1, C2, C3 are carry registers representing a value
- // Y. On exit,
C0
holds the low 128 bits of the carry value; C1, C2,
+ // Y. On exit,
LO
holds the low 128 bits of the carry value; C1, C2,
// C3, T, and U are clobbered; and the high bits of Y are stored in
// C3, T, and U are clobbered; and the high bits of Y are stored in
- // H, if this is not `nil'.
+ // H
I
, if this is not `nil'.
// The first step is to eliminate the `double-prime' pieces -- i.e.,
// the ones offset by 16 bytes from a 32-bit boundary -- by carrying
// The first step is to eliminate the `double-prime' pieces -- i.e.,
// the ones offset by 16 bytes from a 32-bit boundary -- by carrying
@@
-241,8
+244,8
@@
psrlq \c1, 16 // high parts of (y''_1, y''_3)
psrlq \c2, 32 // low parts of (y''_0, y''_2)
psrlq \c3, 32 // low parts of (y''_1, y''_3)
psrlq \c1, 16 // high parts of (y''_1, y''_3)
psrlq \c2, 32 // low parts of (y''_0, y''_2)
psrlq \c3, 32 // low parts of (y''_1, y''_3)
- .ifnes "\h", "nil"
- movdqa \h, \c1
+ .ifnes "\h
i
", "nil"
+ movdqa \h
i
, \c1
.endif
pslldq \c1, 8 // high part of (0, y''_1)
.endif
pslldq \c1, 8 // high part of (0, y''_1)
@@
-250,44
+253,36
@@
paddq \u, \c3
paddq \t, \c1 // and up: (y_0, y_2)
paddq \u, \c0 // (y_1, y_3)
paddq \u, \c3
paddq \t, \c1 // and up: (y_0, y_2)
paddq \u, \c0 // (y_1, y_3)
- .ifnes "\h", "nil"
- psrldq \h
, 8
// high part of (y''_3, 0)
+ .ifnes "\h
i
", "nil"
+ psrldq \h
i, 8
// high part of (y''_3, 0)
.endif
// Finally extract the answer. This complicated dance is better than
// storing to memory and loading, because the piecemeal stores
// inhibit store forwarding.
movdqa \c3, \t // (y_0, y_1)
.endif
// Finally extract the answer. This complicated dance is better than
// storing to memory and loading, because the piecemeal stores
// inhibit store forwarding.
movdqa \c3, \t // (y_0, y_1)
- movdqa \
c0
, \t // (y^*_0, ?, ?, ?)
+ movdqa \
lo
, \t // (y^*_0, ?, ?, ?)
psrldq \t, 8 // (y_2, 0)
psrlq \c3, 32 // (floor(y_0/B), ?)
paddq \c3, \u // (y_1 + floor(y_0/B), ?)
psrldq \t, 8 // (y_2, 0)
psrlq \c3, 32 // (floor(y_0/B), ?)
paddq \c3, \u // (y_1 + floor(y_0/B), ?)
- pslldq \c0, 12 // (0, 0, 0, y^*_0)
movdqa \c1, \c3 // (y^*_1, ?, ?, ?)
psrldq \u, 8 // (y_3, 0)
psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2, ?)
paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2, ?)
movdqa \c1, \c3 // (y^*_1, ?, ?, ?)
psrldq \u, 8 // (y_3, 0)
psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2, ?)
paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2, ?)
- pslldq \c1, 12 // (0, 0, 0, y^*_1)
- psrldq \c0, 12 // (y^*_0, 0, 0, 0)
- movdqa \c2, \c3 // (y^*_2, ?, ?, ?)
+ punpckldq \lo, \c3 // (y^*_0, y^*_2, ?, ?)
psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
- pslldq \c2, 12 // (0, 0, 0, y^*_2)
- psrldq \c1, 8 // (0, y^*_1, 0, 0)
- psrldq \c2, 4 // (0, 0, y^*_2, 0)
- .ifnes "\h", "nil"
- movdqu \t, \c3
+ .ifnes "\hi", "nil"
+ movdqa \t, \c3
pxor \u, \u
.endif
pxor \u, \u
.endif
- pslldq \c3, 12 // (0, 0, 0, y^*_3)
- por \c0, \c1 // (y^*_0, y^*_1, 0, 0)
- por \c2, \c3 // (0, 0, y^*_2, y^*_3)
- por \c0, \c2 // y mod B^4
- .ifnes "\h", "nil"
+ punpckldq \c1, \c3 // (y^*_1, y^*_3, ?, ?)
+ .ifnes "\hi", "nil"
psrlq \t, 32 // very high bits of y
psrlq \t, 32 // very high bits of y
- paddq \h, \t
- punpcklqdq \h
, \u
// carry up
+ paddq \h
i
, \t
+ punpcklqdq \h
i, \u
// carry up
.endif
.endif
+ punpckldq \lo, \c1 // y mod B^4
.endm
.macro carryadd
.endm
.macro carryadd
@@
-302,6
+297,7
@@
movd xmm1, [edi + 4] // (a_1, 0)
movd xmm2, [edi + 8] // (a_2, 0)
movd xmm7, [edi + 12] // (a_3, 0)
movd xmm1, [edi + 4] // (a_1, 0)
movd xmm2, [edi + 8] // (a_2, 0)
movd xmm7, [edi + 12] // (a_3, 0)
+
paddq xmm4, xmm0 // (c'_0 + a_0, c''_0)
paddq xmm5, xmm1 // (c'_1 + a_1, c''_1)
paddq xmm6, xmm2 // (c'_2 + a_2, c''_2 + a_3 b)
paddq xmm4, xmm0 // (c'_0 + a_0, c''_0)
paddq xmm5, xmm1 // (c'_1 + a_1, c''_1)
paddq xmm6, xmm2 // (c'_2 + a_2, c''_2 + a_3 b)
@@
-315,6
+311,8
@@
INTFUNC(carryprop)
// form. Store the low 128 bits of the represented carry to [EDI] as
// a packed 128-bit value, and leave the remaining 16 bits in the low
// 32 bits of XMM4. On exit, XMM3, XMM5 and XMM6 are clobbered.
// form. Store the low 128 bits of the represented carry to [EDI] as
// a packed 128-bit value, and leave the remaining 16 bits in the low
// 32 bits of XMM4. On exit, XMM3, XMM5 and XMM6 are clobbered.
+ endprologue
+
propout [edi + 0], xmm4, xmm5
propout [edi + 4], xmm5, xmm6
propout [edi + 8], xmm6, nil
propout [edi + 0], xmm4, xmm5
propout [edi + 4], xmm5, xmm6
propout [edi + 8], xmm6, nil
@@
-333,20
+331,22
@@
INTFUNC(dmul4)
// [EDI], and update the carry registers with the carry out. The
// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
// [EDI], and update the carry registers with the carry out. The
// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
+ endprologue
+
mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
propout [edi + 0], xmm4, xmm5
mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
- mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
, nil
+ mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
propout [edi + 4], xmm5, xmm6
mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
propout [edi + 4], xmm5, xmm6
mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
- mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
, nil
+ mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
propout [edi + 8], xmm6, xmm7
mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
propout [edi + 8], xmm6, xmm7
mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
- mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
, nil
+ mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
propout [edi + 12], xmm7, xmm4
ret
propout [edi + 12], xmm7, xmm4
ret
@@
-365,22
+365,24
@@
INTFUNC(dmla4)
// [EDI], and update the carry registers with the carry out. The
// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
// [EDI], and update the carry registers with the carry out. The
// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
+ endprologue
+
carryadd
carryadd
- mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
, nil
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
, nil
+ mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
propout [edi + 0], xmm4, xmm5
mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
- mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
, nil
+ mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
propout [edi + 4], xmm5, xmm6
mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
propout [edi + 4], xmm5, xmm6
mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
- mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
, nil
+ mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
propout [edi + 8], xmm6, xmm7
mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
propout [edi + 8], xmm6, xmm7
mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
- mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
, nil
+ mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
propout [edi + 12], xmm7, xmm4
ret
propout [edi + 12], xmm7, xmm4
ret
@@
-395,6
+397,8
@@
INTFUNC(mul4zc)
// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
+ endprologue
+
mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
@@
-421,6
+425,8
@@
INTFUNC(mul4)
// and update the carry registers with the carry out. The registers
// XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
// and update the carry registers with the carry out. The registers
// XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
+ endprologue
+
mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
propout [edi + 0], xmm4, xmm5
@@
-446,12
+452,14
@@
INTFUNC(mla4zc)
// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
// and set the carry registers XMM4, XMM5, XMM6 to the carry out.
// The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
+ endprologue
+
movd xmm4, [edi + 0]
movd xmm5, [edi + 4]
movd xmm6, [edi + 8]
movd xmm7, [edi + 12]
movd xmm4, [edi + 0]
movd xmm5, [edi + 4]
movd xmm6, [edi + 8]
movd xmm7, [edi + 12]
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
@@
-478,9
+486,11
@@
INTFUNC(mla4)
// [EDI], and update the carry registers with the carry out. The
// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
// [EDI], and update the carry registers with the carry out. The
// registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
// general-purpose registers are preserved.
+ endprologue
+
carryadd
carryadd
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
@@
-501,14
+511,14
@@
INTFUNC(mmul4)
// to the packed operands U and N; ECX and ESI point to the expanded
// operands V and M; and EDX points to a place to store an expanded
// result Y (32 bytes, at a 16-byte boundary). The stack pointer
// to the packed operands U and N; ECX and ESI point to the expanded
// operands V and M; and EDX points to a place to store an expanded
// result Y (32 bytes, at a 16-byte boundary). The stack pointer
- // must be 16-byte aligned. (This is not the usual convention, which
- // requires alignment before the call.)
+ // must be 12 modulo 16, as is usual for modern x86 ABIs.
//
// On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
// of the sum U V + N Y to [EDI], leaving the remaining carry in
// XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
// XMM7 are clobbered; the general-purpose registers are preserved.
//
// On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
// of the sum U V + N Y to [EDI], leaving the remaining carry in
// XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
// XMM7 are clobbered; the general-purpose registers are preserved.
- sub esp, 64 // space for the carries
+ stalloc 48 + 12 // space for the carries
+ endprologue
// Calculate W = U V, and leave it in the destination. Stash the
// carry pieces for later.
// Calculate W = U V, and leave it in the destination. Stash the
// carry pieces for later.
@@
-520,24
+530,28
@@
ENDFUNC
INTFUNC(mmla4)
// On entry, EDI points to the destination buffer, which also
INTFUNC(mmla4)
// On entry, EDI points to the destination buffer, which also
- // contains an addend A to accumulate; EAX and EBX point
- //
to the
packed operands U and N; ECX and ESI point to the expanded
+ // contains an addend A to accumulate; EAX and EBX point
to the
+ // packed operands U and N; ECX and ESI point to the expanded
// operands V and M; and EDX points to a place to store an expanded
// result Y (32 bytes, at a 16-byte boundary). The stack pointer
// operands V and M; and EDX points to a place to store an expanded
// result Y (32 bytes, at a 16-byte boundary). The stack pointer
- // must be 16-byte aligned. (This is not the usual convention, which
- // requires alignment before the call.)
+ // must be 12 modulo 16, as is usual for modern x86 ABIs.
//
// On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
// bits of the sum A + U V + N Y to [EDI], leaving the remaining
// carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
// XMM3, and XMM7 are clobbered; the general-purpose registers are
// preserved.
//
// On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
// bits of the sum A + U V + N Y to [EDI], leaving the remaining
// carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
// XMM3, and XMM7 are clobbered; the general-purpose registers are
// preserved.
- sub esp, 64 // space for the carries
+ stalloc 48 + 12 // space for the carries
+ endprologue
+
movd xmm4, [edi + 0]
movd xmm5, [edi + 4]
movd xmm6, [edi + 8]
movd xmm7, [edi + 12]
movd xmm4, [edi + 0]
movd xmm5, [edi + 4]
movd xmm6, [edi + 8]
movd xmm7, [edi + 12]
- mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
+
+ // Calculate W = U V, and leave it in the destination. Stash the
+ // carry pieces for later.
+ mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
propout [edi + 0], xmm4, xmm5
5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
@@
-556,21
+570,21
@@
INTFUNC(mmla4)
// Calculate Y = W M.
mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
// Calculate Y = W M.
mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
- mulcore [edi + 4], esi, xmm0, xmm1, xmm2
, nil
- accum xmm5, xmm6, xmm7
, nil
+ mulcore [edi + 4], esi, xmm0, xmm1, xmm2
+ accum xmm5, xmm6, xmm7
- mulcore [edi + 8], esi, xmm0, xmm1
, nil, nil
- accum xmm6, xmm7
, nil, nil
+ mulcore [edi + 8], esi, xmm0, xmm1
+ accum xmm6, xmm7
- mulcore [edi + 12], esi, xmm0
, nil, nil, nil
- accum xmm7
, nil, nil, nil
+ mulcore [edi + 12], esi, xmm0
+ accum xmm7
// That's lots of pieces. Now we have to assemble the answer.
// That's lots of pieces. Now we have to assemble the answer.
- squash xmm4, xmm5, xmm6, xmm7,
nil, xmm0, xmm1
+ squash xmm4, xmm5, xmm6, xmm7,
xmm0, xmm1, xmm4
// Expand it.
pxor xmm2, xmm2
// Expand it.
pxor xmm2, xmm2
- expand xmm
4, xmm1, nil, nil, xmm2
+ expand xmm
2, xmm4, xmm1
movdqa [edx + 0], xmm4
movdqa [edx + 16], xmm1
movdqa [edx + 0], xmm4
movdqa [edx + 16], xmm1
@@
-581,7
+595,7
@@
INTFUNC(mmla4)
movd xmm7, [edi + 12]
// Finish the calculation by adding the Montgomery product.
movd xmm7, [edi + 12]
// Finish the calculation by adding the Montgomery product.
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
@@
-599,14
+613,14
@@
INTFUNC(mmla4)
paddq xmm6, [esp + 32]
// And, with that, we're done.
paddq xmm6, [esp + 32]
// And, with that, we're done.
- add esp, 64
+ stfree 48 + 12
ret
ENDFUNC
INTFUNC(mont4)
// On entry, EDI points to the destination buffer holding a packed
ret
ENDFUNC
INTFUNC(mont4)
// On entry, EDI points to the destination buffer holding a packed
- // value
A
; EBX points to a packed operand N; ESI points to an
+ // value
W
; EBX points to a packed operand N; ESI points to an
// expanded operand M; and EDX points to a place to store an expanded
// result Y (32 bytes, at a 16-byte boundary).
//
// expanded operand M; and EDX points to a place to store an expanded
// result Y (32 bytes, at a 16-byte boundary).
//
@@
-614,25
+628,26
@@
INTFUNC(mont4)
// of the sum W + N Y to [EDI], leaving the remaining carry in
// XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
// XMM7 are clobbered; the general-purpose registers are preserved.
// of the sum W + N Y to [EDI], leaving the remaining carry in
// XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
// XMM7 are clobbered; the general-purpose registers are preserved.
+ endprologue
// Calculate Y = W M.
mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
// Calculate Y = W M.
mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
- mulcore [edi + 4], esi, xmm0, xmm1, xmm2
, nil
- accum xmm5, xmm6, xmm7
, nil
+ mulcore [edi + 4], esi, xmm0, xmm1, xmm2
+ accum xmm5, xmm6, xmm7
- mulcore [edi + 8], esi, xmm0, xmm1
, nil, nil
- accum xmm6, xmm7
, nil, nil
+ mulcore [edi + 8], esi, xmm0, xmm1
+ accum xmm6, xmm7
- mulcore [edi + 12], esi, xmm0
, nil, nil, nil
- accum xmm7
, nil, nil, nil
+ mulcore [edi + 12], esi, xmm0
+ accum xmm7
// That's lots of pieces. Now we have to assemble the answer.
// That's lots of pieces. Now we have to assemble the answer.
- squash xmm4, xmm5, xmm6, xmm7,
nil, xmm0, xmm1
+ squash xmm4, xmm5, xmm6, xmm7,
xmm0, xmm1, xmm4
// Expand it.
pxor xmm2, xmm2
// Expand it.
pxor xmm2, xmm2
- expand xmm
4, xmm1, nil, nil, xmm2
+ expand xmm
2, xmm4, xmm1
movdqa [edx + 0], xmm4
movdqa [edx + 16], xmm1
movdqa [edx + 0], xmm4
movdqa [edx + 16], xmm1
@@
-643,7
+658,7
@@
INTFUNC(mont4)
movd xmm7, [edi + 12]
// Finish the calculation by adding the Montgomery product.
movd xmm7, [edi + 12]
// Finish the calculation by adding the Montgomery product.
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
@@
-680,13
+695,14
@@
FUNC(mpx_umul4_x86_sse2)
//
// esp + 0 expanded Y (32 bytes)
// esp + 32 (top of locals)
//
// esp + 0 expanded Y (32 bytes)
// esp + 32 (top of locals)
- push
ebp
- push
ebx
- push
esi
- push
edi
-
mov ebp, es
p
+ push
reg
ebp
+ push
reg
ebx
+ push
reg
esi
+ push
reg
edi
+
setfp eb
p
and esp, ~15
sub esp, 32
and esp, ~15
sub esp, 32
+ endprologue
// Prepare for the first iteration.
mov esi, [ebp + 32] // -> bv[0]
// Prepare for the first iteration.
mov esi, [ebp + 32] // -> bv[0]
@@
-694,7
+710,7
@@
FUNC(mpx_umul4_x86_sse2)
movdqu xmm0, [esi] // bv[0]
mov edi, [ebp + 20] // -> dv[0]
mov ecx, edi // outer loop dv cursor
movdqu xmm0, [esi] // bv[0]
mov edi, [ebp + 20] // -> dv[0]
mov ecx, edi // outer loop dv cursor
- expand xmm
0, xmm1, nil, nil, xmm7
+ expand xmm
7, xmm0, xmm1
mov ebx, [ebp + 24] // -> av[0]
mov eax, [ebp + 28] // -> av[m] = av limit
mov edx, esp // -> expanded Y = bv[0]
mov ebx, [ebp + 24] // -> av[0]
mov eax, [ebp + 28] // -> av[m] = av limit
mov edx, esp // -> expanded Y = bv[0]
@@
-726,7
+742,7
@@
FUNC(mpx_umul4_x86_sse2)
1: movdqu xmm0, [esi] // bv[i]
mov edi, ecx // -> dv[i]
pxor xmm7, xmm7
1: movdqu xmm0, [esi] // bv[i]
mov edi, ecx // -> dv[i]
pxor xmm7, xmm7
- expand xmm
0, xmm1, nil, nil, xmm7
+ expand xmm
7, xmm0, xmm1
mov ebx, [ebp + 24] // -> av[0]
movdqa [esp + 0], xmm0 // bv[i] expanded low
movdqa [esp + 16], xmm1 // bv[i] expanded high
mov ebx, [ebp + 24] // -> av[0]
movdqa [esp + 0], xmm0 // bv[i] expanded low
movdqa [esp + 16], xmm1 // bv[i] expanded high
@@
-753,7
+769,7
@@
FUNC(mpx_umul4_x86_sse2)
jb 1b
// All over.
jb 1b
// All over.
-9:
mov esp, eb
p
+9:
dropf
p
pop edi
pop esi
pop ebx
pop edi
pop esi
pop ebx
@@
-776,24
+792,24
@@
FUNC(mpxmont_mul4_x86_sse2)
// ebp + 36 n (nonzero multiple of 4)
// ebp + 40 mi
//
// ebp + 36 n (nonzero multiple of 4)
// ebp + 40 mi
//
- // Locals are relative to ESP, which
is 4 mod 16
, as follows.
+ // Locals are relative to ESP, which is
16-byte aligned
, as follows.
//
//
- // esp + 0
outer loop dv
- // esp +
4 outer loop bv
- // esp +
8 av limit (mostly in ESI
)
- // esp +
12 expanded V (32 bytes)
- // esp +
44 expanded M (32 bytes)
- // esp +
76 expanded Y (32 bytes
)
+ // esp + 0
expanded V (32 bytes)
+ // esp +
32 expanded M (32 bytes)
+ // esp +
64 expanded Y (32 bytes
)
+ // esp +
96 outer loop dv
+ // esp +
100 outer loop bv
+ // esp +
104 av limit (mostly in ESI
)
// esp + 108 bv limit
// esp + 108 bv limit
- // esp + 112 (gap)
- // esp + 124 (top of locals)
- push ebp
- push ebx
- push esi
- push edi
- mov ebp, esp
+ // esp + 112 (top of locals)
+ pushreg ebp
+ pushreg ebx
+ pushreg esi
+ pushreg edi
+ setfp ebp
and esp, ~15
and esp, ~15
- sub esp, 124
+ sub esp, 112
+ endprologue
// Establish the expanded operands.
pxor xmm7, xmm7
// Establish the expanded operands.
pxor xmm7, xmm7
@@
-801,34
+817,34
@@
FUNC(mpxmont_mul4_x86_sse2)
mov edx, [ebp + 40] // -> mi
movdqu xmm0, [ecx] // bv[0]
movdqu xmm2, [edx] // mi
mov edx, [ebp + 40] // -> mi
movdqu xmm0, [ecx] // bv[0]
movdqu xmm2, [edx] // mi
- expand xmm
0, xmm1, xmm2, xmm3, xmm7
- movdqa [esp +
12
], xmm0 // bv[0] expanded low
- movdqa [esp +
28
], xmm1 // bv[0] expanded high
- movdqa [esp +
44
], xmm2 // mi expanded low
- movdqa [esp +
60
], xmm3 // mi expanded high
+ expand xmm
7, xmm0, xmm1, xmm2, xmm3
+ movdqa [esp +
0
], xmm0 // bv[0] expanded low
+ movdqa [esp +
16
], xmm1 // bv[0] expanded high
+ movdqa [esp +
32
], xmm2 // mi expanded low
+ movdqa [esp +
48
], xmm3 // mi expanded high
// Set up the outer loop state and prepare for the first iteration.
mov edx, [ebp + 36] // n
mov eax, [ebp + 24] // -> U = av[0]
mov ebx, [ebp + 32] // -> X = nv[0]
mov edi, [ebp + 20] // -> Z = dv[0]
// Set up the outer loop state and prepare for the first iteration.
mov edx, [ebp + 36] // n
mov eax, [ebp + 24] // -> U = av[0]
mov ebx, [ebp + 32] // -> X = nv[0]
mov edi, [ebp + 20] // -> Z = dv[0]
- mov [esp +
4
], ecx
+ mov [esp +
100
], ecx
lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit
lea edx, [eax + 4*edx] // -> av[n/4] = av limit
lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit
lea edx, [eax + 4*edx] // -> av[n/4] = av limit
- mov [esp + 0], edi
+ mov [esp + 96], edi
+ mov [esp + 104], edx
mov [esp + 108], ecx
mov [esp + 108], ecx
- mov [esp + 8], edx
- lea ecx, [esp + 12] // -> expanded V = bv[0]
- lea esi, [esp + 44] // -> expanded M = mi
- lea edx, [esp + 76] // -> space for Y
+ lea ecx, [esp + 0] // -> expanded V = bv[0]
+ lea esi, [esp + 32] // -> expanded M = mi
+ lea edx, [esp + 64] // -> space for Y
call mmul4
call mmul4
- mov esi, [esp +
8]
// recover av limit
+ mov esi, [esp +
104]
// recover av limit
add edi, 16
add eax, 16
add ebx, 16
cmp eax, esi // done already?
jae 8f
add edi, 16
add eax, 16
add ebx, 16
cmp eax, esi // done already?
jae 8f
- mov [esp +
0
], edi
+ mov [esp +
96
], edi
.p2align 4
// Complete the first inner loop.
.p2align 4
// Complete the first inner loop.
@@
-847,26
+863,26
@@
FUNC(mpxmont_mul4_x86_sse2)
// Embark on the next iteration. (There must be one. If n = 1, then
// we would have bailed above, to label 8. Similarly, the subsequent
// iterations can fall into the inner loop immediately.)
// Embark on the next iteration. (There must be one. If n = 1, then
// we would have bailed above, to label 8. Similarly, the subsequent
// iterations can fall into the inner loop immediately.)
-1: mov eax, [esp +
4]
// -> bv[i - 1]
- mov edi, [esp +
0]
// -> Z = dv[i]
+1: mov eax, [esp +
100]
// -> bv[i - 1]
+ mov edi, [esp +
96]
// -> Z = dv[i]
add eax, 16 // -> bv[i]
pxor xmm7, xmm7
add eax, 16 // -> bv[i]
pxor xmm7, xmm7
- movdqu xmm0, [eax] // bv[i]
- mov [esp + 4], eax
+ mov [esp + 100], eax
cmp eax, [esp + 108] // done yet?
jae 9f
cmp eax, [esp + 108] // done yet?
jae 9f
+ movdqu xmm0, [eax] // bv[i]
mov ebx, [ebp + 32] // -> X = nv[0]
mov ebx, [ebp + 32] // -> X = nv[0]
- lea esi, [esp +
44
] // -> expanded M = mi
+ lea esi, [esp +
32
] // -> expanded M = mi
mov eax, [ebp + 24] // -> U = av[0]
mov eax, [ebp + 24] // -> U = av[0]
- expand xmm
0, xmm1, nil, nil, xmm7
- movdqa [esp +
12], xmm0
// bv[i] expanded low
- movdqa [esp +
28
], xmm1 // bv[i] expanded high
+ expand xmm
7, xmm0, xmm1
+ movdqa [esp +
0], xmm0
// bv[i] expanded low
+ movdqa [esp +
16
], xmm1 // bv[i] expanded high
call mmla4
call mmla4
- mov esi, [esp +
8]
// recover av limit
+ mov esi, [esp +
104]
// recover av limit
add edi, 16
add eax, 16
add ebx, 16
add edi, 16
add eax, 16
add ebx, 16
- mov [esp +
0
], edi
+ mov [esp +
96
], edi
.p2align 4
// Complete the next inner loop.
.p2align 4
// Complete the next inner loop.
@@
-894,11
+910,11
@@
FUNC(mpxmont_mul4_x86_sse2)
movd [edi + 16], xmm4
// All done.
movd [edi + 16], xmm4
// All done.
-9:
mov esp, eb
p
- pop
edi
- pop
esi
- pop
ebx
- pop
ebp
+9:
dropf
p
+ pop
reg
edi
+ pop
reg
esi
+ pop
reg
ebx
+ pop
reg
ebp
ret
ENDFUNC
ret
ENDFUNC
@@
-924,13
+940,14
@@
FUNC(mpxmont_redc4_x86_sse2)
// esp + 12 expanded M (32 bytes)
// esp + 44 expanded Y (32 bytes)
// esp + 76 (top of locals)
// esp + 12 expanded M (32 bytes)
// esp + 44 expanded Y (32 bytes)
// esp + 76 (top of locals)
- push
ebp
- push
ebx
- push
esi
- push
edi
-
mov ebp, es
p
+ push
reg
ebp
+ push
reg
ebx
+ push
reg
esi
+ push
reg
edi
+
setfp eb
p
and esp, ~15
sub esp, 76
and esp, ~15
sub esp, 76
+ endprologue
// Establish the expanded operands and the blocks-of-4 dv limit.
mov edi, [ebp + 20] // -> Z = dv[0]
// Establish the expanded operands and the blocks-of-4 dv limit.
mov edi, [ebp + 20] // -> Z = dv[0]
@@
-940,7
+957,7
@@
FUNC(mpxmont_redc4_x86_sse2)
mov edx, [ebp + 36] // -> mi
movdqu xmm0, [edx] // mi
and eax, ~15 // mask off the tail end
mov edx, [ebp + 36] // -> mi
movdqu xmm0, [edx] // mi
and eax, ~15 // mask off the tail end
- expand xmm
0, xmm1, nil, nil, xmm7
+ expand xmm
7, xmm0, xmm1
add eax, edi // find limit
movdqa [esp + 12], xmm0 // mi expanded low
movdqa [esp + 28], xmm1 // mi expanded high
add eax, edi // find limit
movdqa [esp + 12], xmm0 // mi expanded low
movdqa [esp + 28], xmm1 // mi expanded high
@@
-956,8
+973,8
@@
FUNC(mpxmont_redc4_x86_sse2)
lea esi, [esp + 12] // -> expanded M = mi
lea edx, [esp + 44] // -> space for Y
call mont4
lea esi, [esp + 12] // -> expanded M = mi
lea edx, [esp + 44] // -> space for Y
call mont4
- add edi, 16
add ebx, 16
add ebx, 16
+ add edi, 16
cmp ebx, ecx // done already?
jae 8f
cmp ebx, ecx // done already?
jae 8f
@@
-1019,11
+1036,11
@@
FUNC(mpxmont_redc4_x86_sse2)
jmp 5b
// All over.
jmp 5b
// All over.
-9:
mov esp, eb
p
- pop
edi
- pop
esi
- pop
ebx
- pop
ebp
+9:
dropf
p
+ pop
reg
edi
+ pop
reg
esi
+ pop
reg
ebx
+ pop
reg
ebp
ret
ENDFUNC
ret
ENDFUNC
@@
-1051,27
+1068,31
@@
ENDFUNC
mov [ebx + ecx*8 + 4], edx
.endm
mov [ebx + ecx*8 + 4], edx
.endm
-.macro testprologue
- push
ebp
- push
ebx
- push
esi
- push
edi
-
mov ebp, es
p
+.macro testprologue
n
+ push
reg
ebp
+ push
reg
ebx
+ push
reg
esi
+ push
reg
edi
+
setfp eb
p
and esp, ~15
and esp, ~15
- sub esp, 3*32 + 12
+ sub esp, 3*32 + 4*4
+ endprologue
+ mov eax, \n
+ mov [esp + 104], eax
// vars:
// vars:
- // esp + 0 = cycles
- // esp + 12 = v expanded
- // esp + 44 = y expanded
- // esp + 72 = ? expanded
+ // esp + 0 = v expanded
+ // esp + 32 = y expanded
+ // esp + 64 = ? expanded
+ // esp + 96 = cycles
+ // esp + 104 = count
.endm
.macro testepilogue
.endm
.macro testepilogue
-
mov esp, eb
p
- pop
edi
- pop
esi
- pop
ebx
- pop
ebp
+
dropf
p
+ pop
reg
edi
+ pop
reg
esi
+ pop
reg
ebx
+ pop
reg
ebp
ret
.endm
ret
.endm
@@
-1082,47
+1103,47
@@
ENDFUNC
movdqu xmm6, [ecx + 32] // (c'_2, c''_2)
.endm
movdqu xmm6, [ecx + 32] // (c'_2, c''_2)
.endm
-.macro testexpand v
, y
+.macro testexpand v
=nil, y=nil
pxor xmm7, xmm7
.ifnes "\v", "nil"
mov ecx, \v
movdqu xmm0, [ecx]
pxor xmm7, xmm7
.ifnes "\v", "nil"
mov ecx, \v
movdqu xmm0, [ecx]
- expand xmm
0, xmm1, nil, nil, xmm7
- movdqa [esp +
12
], xmm0
- movdqa [esp +
28
], xmm1
+ expand xmm
7, xmm0, xmm1
+ movdqa [esp +
0
], xmm0
+ movdqa [esp +
16
], xmm1
.endif
.ifnes "\y", "nil"
mov edx, \y
movdqu xmm2, [edx]
.endif
.ifnes "\y", "nil"
mov edx, \y
movdqu xmm2, [edx]
- expand xmm
2, xmm3, nil, nil, xmm7
- movdqa [esp +
44
], xmm2
- movdqa [esp +
60
], xmm3
+ expand xmm
7, xmm2, xmm3
+ movdqa [esp +
32
], xmm2
+ movdqa [esp +
48
], xmm3
.endif
.endm
.endif
.endm
-.macro testtop u
, x, mode
+.macro testtop u
=nil, x=nil, mode=nil
.p2align 4
0:
.ifnes "\u", "nil"
.p2align 4
0:
.ifnes "\u", "nil"
- lea ecx, [esp +
12
]
+ lea ecx, [esp +
0
]
.endif
mov ebx, \x
.ifeqs "\mode", "mont"
.endif
mov ebx, \x
.ifeqs "\mode", "mont"
- lea esi, [esp +
44
]
+ lea esi, [esp +
32
]
.endif
.endif
- cysetup esp +
0
+ cysetup esp +
96
.ifnes "\u", "nil"
mov eax, \u
.endif
.ifeqs "\mode", "mont"
.ifnes "\u", "nil"
mov eax, \u
.endif
.ifeqs "\mode", "mont"
- lea edx, [esp +
76
]
+ lea edx, [esp +
64
]
.else
.else
- lea edx, [esp +
44
]
+ lea edx, [esp +
32
]
.endif
.endm
.endif
.endm
-.macro testtail cyv
, n
- cystore esp +
0, \cyv, \n
+.macro testtail cyv
+ cystore esp +
96, \cyv, esp + 104
jnz 0b
.endm
jnz 0b
.endm
@@
-1133,101
+1154,125
@@
ENDFUNC
movdqu [ecx + 32], xmm6
.endm
movdqu [ecx + 32], xmm6
.endm
- .globl test_dmul4
-test_dmul4:
- testprologue
+FUNC(test_dmul4)
+ testprologue [ebp + 44]
testldcarry [ebp + 24]
testexpand [ebp + 36], [ebp + 40]
mov edi, [ebp + 20]
testtop [ebp + 28], [ebp + 32]
call dmul4
testldcarry [ebp + 24]
testexpand [ebp + 36], [ebp + 40]
mov edi, [ebp + 20]
testtop [ebp + 28], [ebp + 32]
call dmul4
- testtail [ebp + 48]
, [ebp + 44]
+ testtail [ebp + 48]
testcarryout [ebp + 24]
testepilogue
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
- .globl test_dmla4
-test_dmla4:
- testprologue
+FUNC(test_dmla4)
+ testprologue [ebp + 44]
testldcarry [ebp + 24]
testexpand [ebp + 36], [ebp + 40]
mov edi, [ebp + 20]
testtop [ebp + 28], [ebp + 32]
call dmla4
testldcarry [ebp + 24]
testexpand [ebp + 36], [ebp + 40]
mov edi, [ebp + 20]
testtop [ebp + 28], [ebp + 32]
call dmla4
- testtail [ebp + 48]
, [ebp + 44]
+ testtail [ebp + 48]
testcarryout [ebp + 24]
testepilogue
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
- .globl test_mul4
-test_mul4:
- testprologue
+FUNC(test_mul4)
+ testprologue [ebp + 36]
testldcarry [ebp + 24]
testexpand nil, [ebp + 32]
mov edi, [ebp + 20]
testtop nil, [ebp + 28]
call mul4
testldcarry [ebp + 24]
testexpand nil, [ebp + 32]
mov edi, [ebp + 20]
testtop nil, [ebp + 28]
call mul4
- testtail [ebp + 40], [ebp + 36]
+ testtail [ebp + 40]
+ testcarryout [ebp + 24]
+ testepilogue
+ENDFUNC
+
+FUNC(test_mul4zc)
+ testprologue [ebp + 36]
+ testldcarry [ebp + 24]
+ testexpand nil, [ebp + 32]
+ mov edi, [ebp + 20]
+ testtop nil, [ebp + 28]
+ call mul4zc
+ testtail [ebp + 40]
testcarryout [ebp + 24]
testepilogue
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
- .globl test_mla4
-test_mla4:
- testprologue
+FUNC(test_mla4)
+ testprologue [ebp + 36]
testldcarry [ebp + 24]
testexpand nil, [ebp + 32]
mov edi, [ebp + 20]
testtop nil, [ebp + 28]
call mla4
testldcarry [ebp + 24]
testexpand nil, [ebp + 32]
mov edi, [ebp + 20]
testtop nil, [ebp + 28]
call mla4
- testtail [ebp + 40]
, [ebp + 36]
+ testtail [ebp + 40]
testcarryout [ebp + 24]
testepilogue
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
- .globl test_mmul4
-test_mmul4:
- testprologue
+FUNC(test_mla4zc)
+ testprologue [ebp + 36]
+ testldcarry [ebp + 24]
+ testexpand nil, [ebp + 32]
+ mov edi, [ebp + 20]
+ testtop nil, [ebp + 28]
+ call mla4zc
+ testtail [ebp + 40]
+ testcarryout [ebp + 24]
+ testepilogue
+ENDFUNC
+
+FUNC(test_mmul4)
+ testprologue [ebp + 48]
testexpand [ebp + 40], [ebp + 44]
mov edi, [ebp + 20]
testtop [ebp + 32], [ebp + 36], mont
call mmul4
testexpand [ebp + 40], [ebp + 44]
mov edi, [ebp + 20]
testtop [ebp + 32], [ebp + 36], mont
call mmul4
- testtail [ebp + 52]
, [ebp + 48]
+ testtail [ebp + 52]
mov edi, [ebp + 28]
mov edi, [ebp + 28]
- movdqa xmm0, [esp +
76
]
- movdqa xmm1, [esp +
92
]
+ movdqa xmm0, [esp +
64
]
+ movdqa xmm1, [esp +
80
]
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [ebp + 24]
testepilogue
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
- .globl test_mmla4
-test_mmla4:
- testprologue
+FUNC(test_mmla4)
+ testprologue [ebp + 48]
testexpand [ebp + 40], [ebp + 44]
mov edi, [ebp + 20]
testtop [ebp + 32], [ebp + 36], mont
call mmla4
testexpand [ebp + 40], [ebp + 44]
mov edi, [ebp + 20]
testtop [ebp + 32], [ebp + 36], mont
call mmla4
- testtail [ebp + 52]
, [ebp + 48]
+ testtail [ebp + 52]
mov edi, [ebp + 28]
mov edi, [ebp + 28]
- movdqa xmm0, [esp +
76
]
- movdqa xmm1, [esp +
92
]
+ movdqa xmm0, [esp +
64
]
+ movdqa xmm1, [esp +
80
]
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [ebp + 24]
testepilogue
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
- .globl test_mont4
-test_mont4:
- testprologue
+FUNC(test_mont4)
+ testprologue [ebp + 40]
testexpand nil, [ebp + 36]
mov edi, [ebp + 20]
testtop nil, [ebp + 32], mont
call mont4
testexpand nil, [ebp + 36]
mov edi, [ebp + 20]
testtop nil, [ebp + 32], mont
call mont4
- testtail [ebp + 44]
, [ebp + 40]
+ testtail [ebp + 44]
mov edi, [ebp + 28]
mov edi, [ebp + 28]
- movdqa xmm0, [esp +
76
]
- movdqa xmm1, [esp +
92
]
+ movdqa xmm0, [esp +
64
]
+ movdqa xmm1, [esp +
80
]
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [ebp + 24]
testepilogue
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [ebp + 24]
testepilogue
+ENDFUNC
#endif
#endif