~mdw
/
catacomb
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
math/Makefile.am, symm/Makefile.am: Use `--no-install' on oddball tests.
[catacomb]
/
math
/
mpx-mul4-x86-sse2.S
diff --git
a/math/mpx-mul4-x86-sse2.S
b/math/mpx-mul4-x86-sse2.S
index
0e87ff5
..
f6c8167
100644
(file)
--- a/
math/mpx-mul4-x86-sse2.S
+++ b/
math/mpx-mul4-x86-sse2.S
@@
-64,7
+64,7
@@
/// 0 v'_0 v'_1 v''_0 v''_1
/// 16 v'_2 v'_3 v''_2 v''_3
///
/// 0 v'_0 v'_1 v''_0 v''_1
/// 16 v'_2 v'_3 v''_2 v''_3
///
-/// A `pmuludq
d
' instruction ignores the odd positions in its operands; thus,
+/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
/// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
/// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
/// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
@@
-81,7
+81,7
@@
/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
-/// `pmuluqd
q
' instruction acting on a scalar operand (broadcast across all
+/// `pmuludq' instruction acting on a scalar operand (broadcast across all
/// lanes of its vector) and an operand in the expanded form above produces a
/// result which can be added directly to the appropriate carry register.
/// Following a pass of four multiplications, we perform some limited carry
/// lanes of its vector) and an operand in the expanded form above produces a
/// result which can be added directly to the appropriate carry register.
/// Following a pass of four multiplications, we perform some limited carry
@@
-93,7
+93,7
@@
///--------------------------------------------------------------------------
/// Macro definitions.
///--------------------------------------------------------------------------
/// Macro definitions.
-.macro mulcore r, s, d0, d1
, d2, d3
+.macro mulcore r, s, d0, d1
=nil, d2=nil, d3=nil
// Load a word r_i from R, multiply by the expanded operand [S], and
// leave the pieces of the product in registers D0, D1, D2, D3.
movd \d0, \r // (r_i, 0, 0, 0)
// Load a word r_i from R, multiply by the expanded operand [S], and
// leave the pieces of the product in registers D0, D1, D2, D3.
movd \d0, \r // (r_i, 0, 0, 0)
@@
-118,22
+118,25
@@
psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0)
.endif
.ifnes "\d1", "nil"
psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0)
.endif
.ifnes "\d1", "nil"
- pmuludq
d \d1, \d0
// (r_i s'_1, r_i s''_1)
+ pmuludq
\d1, \d0
// (r_i s'_1, r_i s''_1)
.endif
.ifnes "\d3", "nil"
.endif
.ifnes "\d3", "nil"
- pmuludq
d \d3, \d0
// (r_i s'_3, r_i s''_3)
+ pmuludq
\d3, \d0
// (r_i s'_3, r_i s''_3)
.endif
.ifnes "\d2", "nil"
.ifnes "\d3", "nil"
.endif
.ifnes "\d2", "nil"
.ifnes "\d3", "nil"
- pmuludq
d \d2, \d0
// (r_i s'_2, r_i s''_2)
+ pmuludq
\d2, \d0
// (r_i s'_2, r_i s''_2)
.else
.else
- pmuludq
d
\d2, [\s + 16]
+ pmuludq \d2, [\s + 16]
.endif
.endif
.endif
.endif
- pmuludq
d \d0, [\s]
// (r_i s'_0, r_i s''_0)
+ pmuludq
\d0, [\s]
// (r_i s'_0, r_i s''_0)
.endm
.endm
-.macro accum c0, c1, c2, c3
+.macro accum c0, c1=nil, c2=nil, c3=nil
+ // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
+ // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
+ // updating that register.
paddq \c0, xmm0
.ifnes "\c1", "nil"
paddq \c1, xmm1
paddq \c0, xmm0
.ifnes "\c1", "nil"
paddq \c1, xmm1
@@
-146,7
+149,7
@@
.endif
.endm
.endif
.endm
-.macro mulacc r, s, c0, c1, c2, c3, z3p
+.macro mulacc r, s, c0, c1, c2, c3, z3p
=nil
// Load a word r_i from R, multiply by the expanded operand [S],
// and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t'
// then C3 notionally contains zero, but needs clearing; in practice,
// Load a word r_i from R, multiply by the expanded operand [S],
// and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t'
// then C3 notionally contains zero, but needs clearing; in practice,
@@
-155,14
+158,14
@@
// is not `t'.
.ifeqs "\z3p", "t"
mulcore \r, \s, xmm0, xmm1, xmm2, \c3
// is not `t'.
.ifeqs "\z3p", "t"
mulcore \r, \s, xmm0, xmm1, xmm2, \c3
- accum \c0, \c1, \c2
, nil
+ accum \c0, \c1, \c2
.else
mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
accum \c0, \c1, \c2, \c3
.endif
.endm
.else
mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
accum \c0, \c1, \c2, \c3
.endif
.endm
-.macro propout d, c, cc
+.macro propout d, c, cc
=nil
// Calculate an output word from C, and store it in D; propagate
// carries out from C to CC in preparation for a rotation of the
// carry registers. On completion, XMM3 is clobbered. If CC is
// Calculate an output word from C, and store it in D; propagate
// carries out from C to CC in preparation for a rotation of the
// carry registers. On completion, XMM3 is clobbered. If CC is
@@
-192,7
+195,7
@@
psrldq \t, 4 // floor((c' + c'' b)/B)
.endm
psrldq \t, 4 // floor((c' + c'' b)/B)
.endm
-.macro expand
a, b, c, d, z
+.macro expand
z, a, b, c=nil, d=nil
// On entry, A and C hold packed 128-bit values, and Z is zero. On
// exit, A:B and C:D together hold the same values in expanded
// form. If C is `nil', then only expand A to A:B.
// On entry, A and C hold packed 128-bit values, and Z is zero. On
// exit, A:B and C:D together hold the same values in expanded
// form. If C is `nil', then only expand A to A:B.
@@
-214,11
+217,11
@@
.endif
.endm
.endif
.endm
-.macro squash c0, c1, c2, c3,
h, t, u
+.macro squash c0, c1, c2, c3,
t, u, lo, hi=nil
// On entry, C0, C1, C2, C3 are carry registers representing a value
// On entry, C0, C1, C2, C3 are carry registers representing a value
- // Y. On exit,
C0
holds the low 128 bits of the carry value; C1, C2,
+ // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
// C3, T, and U are clobbered; and the high bits of Y are stored in
// C3, T, and U are clobbered; and the high bits of Y are stored in
- // H, if this is not `nil'.
+ // HI, if this is not `nil'.
// The first step is to eliminate the `double-prime' pieces -- i.e.,
// the ones offset by 16 bytes from a 32-bit boundary -- by carrying
// The first step is to eliminate the `double-prime' pieces -- i.e.,
// the ones offset by 16 bytes from a 32-bit boundary -- by carrying
@@
-241,8
+244,8
@@
psrlq \c1, 16 // high parts of (y''_1, y''_3)
psrlq \c2, 32 // low parts of (y''_0, y''_2)
psrlq \c3, 32 // low parts of (y''_1, y''_3)
psrlq \c1, 16 // high parts of (y''_1, y''_3)
psrlq \c2, 32 // low parts of (y''_0, y''_2)
psrlq \c3, 32 // low parts of (y''_1, y''_3)
- .ifnes "\h", "nil"
- movdqa \h, \c1
+ .ifnes "\h
i
", "nil"
+ movdqa \h
i
, \c1
.endif
pslldq \c1, 8 // high part of (0, y''_1)
.endif
pslldq \c1, 8 // high part of (0, y''_1)
@@
-250,44
+253,36
@@
paddq \u, \c3
paddq \t, \c1 // and up: (y_0, y_2)
paddq \u, \c0 // (y_1, y_3)
paddq \u, \c3
paddq \t, \c1 // and up: (y_0, y_2)
paddq \u, \c0 // (y_1, y_3)
- .ifnes "\h", "nil"
- psrldq \h
, 8
// high part of (y''_3, 0)
+ .ifnes "\h
i
", "nil"
+ psrldq \h
i, 8
// high part of (y''_3, 0)
.endif
// Finally extract the answer. This complicated dance is better than
// storing to memory and loading, because the piecemeal stores
// inhibit store forwarding.
movdqa \c3, \t // (y_0, y_1)
.endif
// Finally extract the answer. This complicated dance is better than
// storing to memory and loading, because the piecemeal stores
// inhibit store forwarding.
movdqa \c3, \t // (y_0, y_1)
- movdqa \
c0
, \t // (y^*_0, ?, ?, ?)
+ movdqa \
lo
, \t // (y^*_0, ?, ?, ?)
psrldq \t, 8 // (y_2, 0)
psrlq \c3, 32 // (floor(y_0/B), ?)
paddq \c3, \u // (y_1 + floor(y_0/B), ?)
psrldq \t, 8 // (y_2, 0)
psrlq \c3, 32 // (floor(y_0/B), ?)
paddq \c3, \u // (y_1 + floor(y_0/B), ?)
- pslldq \c0, 12 // (0, 0, 0, y^*_0)
movdqa \c1, \c3 // (y^*_1, ?, ?, ?)
psrldq \u, 8 // (y_3, 0)
psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2, ?)
paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2, ?)
movdqa \c1, \c3 // (y^*_1, ?, ?, ?)
psrldq \u, 8 // (y_3, 0)
psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2, ?)
paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2, ?)
- pslldq \c1, 12 // (0, 0, 0, y^*_1)
- psrldq \c0, 12 // (y^*_0, 0, 0, 0)
- movdqa \c2, \c3 // (y^*_2, ?, ?, ?)
+ punpckldq \lo, \c3 // (y^*_0, y^*_2, ?, ?)
psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
- pslldq \c2, 12 // (0, 0, 0, y^*_2)
- psrldq \c1, 8 // (0, y^*_1, 0, 0)
- psrldq \c2, 4 // (0, 0, y^*_2, 0)
- .ifnes "\h", "nil"
+ .ifnes "\hi", "nil"
movdqa \t, \c3
pxor \u, \u
.endif
movdqa \t, \c3
pxor \u, \u
.endif
- pslldq \c3, 12 // (0, 0, 0, y^*_3)
- por \c0, \c1 // (y^*_0, y^*_1, 0, 0)
- por \c2, \c3 // (0, 0, y^*_2, y^*_3)
- por \c0, \c2 // y mod B^4
- .ifnes "\h", "nil"
+ punpckldq \c1, \c3 // (y^*_1, y^*_3, ?, ?)
+ .ifnes "\hi", "nil"
psrlq \t, 32 // very high bits of y
psrlq \t, 32 // very high bits of y
- paddq \h, \t
- punpcklqdq \h
, \u
// carry up
+ paddq \h
i
, \t
+ punpcklqdq \h
i, \u
// carry up
.endif
.endif
+ punpckldq \lo, \c1 // y mod B^4
.endm
.macro carryadd
.endm
.macro carryadd
@@
-339,19
+334,19
@@
INTFUNC(dmul4)
endprologue
mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
endprologue
mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
propout [edi + 0], xmm4, xmm5
mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
- mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
, nil
+ mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
propout [edi + 4], xmm5, xmm6
mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
propout [edi + 4], xmm5, xmm6
mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
- mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
, nil
+ mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
propout [edi + 8], xmm6, xmm7
mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
propout [edi + 8], xmm6, xmm7
mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
- mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
, nil
+ mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
propout [edi + 12], xmm7, xmm4
ret
propout [edi + 12], xmm7, xmm4
ret
@@
-374,20
+369,20
@@
INTFUNC(dmla4)
carryadd
carryadd
- mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
, nil
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
, nil
+ mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
propout [edi + 0], xmm4, xmm5
mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
- mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
, nil
+ mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
propout [edi + 4], xmm5, xmm6
mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
propout [edi + 4], xmm5, xmm6
mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
- mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
, nil
+ mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
propout [edi + 8], xmm6, xmm7
mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
propout [edi + 8], xmm6, xmm7
mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
- mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
, nil
+ mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
propout [edi + 12], xmm7, xmm4
ret
propout [edi + 12], xmm7, xmm4
ret
@@
-464,7
+459,7
@@
INTFUNC(mla4zc)
movd xmm6, [edi + 8]
movd xmm7, [edi + 12]
movd xmm6, [edi + 8]
movd xmm7, [edi + 12]
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
@@
-495,7
+490,7
@@
INTFUNC(mla4)
carryadd
carryadd
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
@@
-516,14
+511,13
@@
INTFUNC(mmul4)
// to the packed operands U and N; ECX and ESI point to the expanded
// operands V and M; and EDX points to a place to store an expanded
// result Y (32 bytes, at a 16-byte boundary). The stack pointer
// to the packed operands U and N; ECX and ESI point to the expanded
// operands V and M; and EDX points to a place to store an expanded
// result Y (32 bytes, at a 16-byte boundary). The stack pointer
- // must be 16-byte aligned. (This is not the usual convention, which
- // requires alignment before the call.)
+ // must be 12 modulo 16, as is usual for modern x86 ABIs.
//
// On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
// of the sum U V + N Y to [EDI], leaving the remaining carry in
// XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
// XMM7 are clobbered; the general-purpose registers are preserved.
//
// On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
// of the sum U V + N Y to [EDI], leaving the remaining carry in
// XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
// XMM7 are clobbered; the general-purpose registers are preserved.
- stalloc 48
// space for the carries
+ stalloc 48
+ 12
// space for the carries
endprologue
// Calculate W = U V, and leave it in the destination. Stash the
endprologue
// Calculate W = U V, and leave it in the destination. Stash the
@@
-536,26
+530,28
@@
ENDFUNC
INTFUNC(mmla4)
// On entry, EDI points to the destination buffer, which also
INTFUNC(mmla4)
// On entry, EDI points to the destination buffer, which also
- // contains an addend A to accumulate; EAX and EBX point
- //
to the
packed operands U and N; ECX and ESI point to the expanded
+ // contains an addend A to accumulate; EAX and EBX point
to the
+ // packed operands U and N; ECX and ESI point to the expanded
// operands V and M; and EDX points to a place to store an expanded
// result Y (32 bytes, at a 16-byte boundary). The stack pointer
// operands V and M; and EDX points to a place to store an expanded
// result Y (32 bytes, at a 16-byte boundary). The stack pointer
- // must be 16-byte aligned. (This is not the usual convention, which
- // requires alignment before the call.)
+ // must be 12 modulo 16, as is usual for modern x86 ABIs.
//
// On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
// bits of the sum A + U V + N Y to [EDI], leaving the remaining
// carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
// XMM3, and XMM7 are clobbered; the general-purpose registers are
// preserved.
//
// On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
// bits of the sum A + U V + N Y to [EDI], leaving the remaining
// carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
// XMM3, and XMM7 are clobbered; the general-purpose registers are
// preserved.
- stalloc 48
// space for the carries
+ stalloc 48
+ 12
// space for the carries
endprologue
movd xmm4, [edi + 0]
movd xmm5, [edi + 4]
movd xmm6, [edi + 8]
movd xmm7, [edi + 12]
endprologue
movd xmm4, [edi + 0]
movd xmm5, [edi + 4]
movd xmm6, [edi + 8]
movd xmm7, [edi + 12]
- mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil
+
+ // Calculate W = A + U V, and leave it in the destination. Stash
+ // the carry pieces for later.
+ mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
propout [edi + 0], xmm4, xmm5
5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
@@
-574,21
+570,21
@@
INTFUNC(mmla4)
// Calculate Y = W M.
mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
// Calculate Y = W M.
mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
- mulcore [edi + 4], esi, xmm0, xmm1, xmm2
, nil
- accum xmm5, xmm6, xmm7
, nil
+ mulcore [edi + 4], esi, xmm0, xmm1, xmm2
+ accum xmm5, xmm6, xmm7
- mulcore [edi + 8], esi, xmm0, xmm1
, nil, nil
- accum xmm6, xmm7
, nil, nil
+ mulcore [edi + 8], esi, xmm0, xmm1
+ accum xmm6, xmm7
- mulcore [edi + 12], esi, xmm0
, nil, nil, nil
- accum xmm7
, nil, nil, nil
+ mulcore [edi + 12], esi, xmm0
+ accum xmm7
// That's lots of pieces. Now we have to assemble the answer.
// That's lots of pieces. Now we have to assemble the answer.
- squash xmm4, xmm5, xmm6, xmm7,
nil, xmm0, xmm1
+ squash xmm4, xmm5, xmm6, xmm7,
xmm0, xmm1, xmm4
// Expand it.
pxor xmm2, xmm2
// Expand it.
pxor xmm2, xmm2
- expand xmm
4, xmm1, nil, nil, xmm2
+ expand xmm
2, xmm4, xmm1
movdqa [edx + 0], xmm4
movdqa [edx + 16], xmm1
movdqa [edx + 0], xmm4
movdqa [edx + 16], xmm1
@@
-599,7
+595,7
@@
INTFUNC(mmla4)
movd xmm7, [edi + 12]
// Finish the calculation by adding the Montgomery product.
movd xmm7, [edi + 12]
// Finish the calculation by adding the Montgomery product.
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
@@
-617,7
+613,7
@@
INTFUNC(mmla4)
paddq xmm6, [esp + 32]
// And, with that, we're done.
paddq xmm6, [esp + 32]
// And, with that, we're done.
- stfree 48
+ stfree 48
+ 12
ret
ENDFUNC
ret
ENDFUNC
@@
-637,21
+633,21
@@
INTFUNC(mont4)
// Calculate Y = W M.
mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
// Calculate Y = W M.
mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
- mulcore [edi + 4], esi, xmm0, xmm1, xmm2
, nil
- accum xmm5, xmm6, xmm7
, nil
+ mulcore [edi + 4], esi, xmm0, xmm1, xmm2
+ accum xmm5, xmm6, xmm7
- mulcore [edi + 8], esi, xmm0, xmm1
, nil, nil
- accum xmm6, xmm7
, nil, nil
+ mulcore [edi + 8], esi, xmm0, xmm1
+ accum xmm6, xmm7
- mulcore [edi + 12], esi, xmm0
, nil, nil, nil
- accum xmm7
, nil, nil, nil
+ mulcore [edi + 12], esi, xmm0
+ accum xmm7
// That's lots of pieces. Now we have to assemble the answer.
// That's lots of pieces. Now we have to assemble the answer.
- squash xmm4, xmm5, xmm6, xmm7,
nil, xmm0, xmm1
+ squash xmm4, xmm5, xmm6, xmm7,
xmm0, xmm1, xmm4
// Expand it.
pxor xmm2, xmm2
// Expand it.
pxor xmm2, xmm2
- expand xmm
4, xmm1, nil, nil, xmm2
+ expand xmm
2, xmm4, xmm1
movdqa [edx + 0], xmm4
movdqa [edx + 16], xmm1
movdqa [edx + 0], xmm4
movdqa [edx + 16], xmm1
@@
-662,7
+658,7
@@
INTFUNC(mont4)
movd xmm7, [edi + 12]
// Finish the calculation by adding the Montgomery product.
movd xmm7, [edi + 12]
// Finish the calculation by adding the Montgomery product.
- mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
, nil
+ mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
propout [edi + 0], xmm4, xmm5
mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
@@
-714,7
+710,7
@@
FUNC(mpx_umul4_x86_sse2)
movdqu xmm0, [esi] // bv[0]
mov edi, [ebp + 20] // -> dv[0]
mov ecx, edi // outer loop dv cursor
movdqu xmm0, [esi] // bv[0]
mov edi, [ebp + 20] // -> dv[0]
mov ecx, edi // outer loop dv cursor
- expand xmm
0, xmm1, nil, nil, xmm7
+ expand xmm
7, xmm0, xmm1
mov ebx, [ebp + 24] // -> av[0]
mov eax, [ebp + 28] // -> av[m] = av limit
mov edx, esp // -> expanded Y = bv[0]
mov ebx, [ebp + 24] // -> av[0]
mov eax, [ebp + 28] // -> av[m] = av limit
mov edx, esp // -> expanded Y = bv[0]
@@
-746,7
+742,7
@@
FUNC(mpx_umul4_x86_sse2)
1: movdqu xmm0, [esi] // bv[i]
mov edi, ecx // -> dv[i]
pxor xmm7, xmm7
1: movdqu xmm0, [esi] // bv[i]
mov edi, ecx // -> dv[i]
pxor xmm7, xmm7
- expand xmm
0, xmm1, nil, nil, xmm7
+ expand xmm
7, xmm0, xmm1
mov ebx, [ebp + 24] // -> av[0]
movdqa [esp + 0], xmm0 // bv[i] expanded low
movdqa [esp + 16], xmm1 // bv[i] expanded high
mov ebx, [ebp + 24] // -> av[0]
movdqa [esp + 0], xmm0 // bv[i] expanded low
movdqa [esp + 16], xmm1 // bv[i] expanded high
@@
-796,24
+792,23
@@
FUNC(mpxmont_mul4_x86_sse2)
// ebp + 36 n (nonzero multiple of 4)
// ebp + 40 mi
//
// ebp + 36 n (nonzero multiple of 4)
// ebp + 40 mi
//
- // Locals are relative to ESP, which
is 4 mod 16
, as follows.
+ // Locals are relative to ESP, which is 16-byte aligned, as follows.
//
//
- // esp + 0
outer loop dv
- // esp +
4 outer loop bv
- // esp +
8 av limit (mostly in ESI
)
- // esp +
12 expanded V (32 bytes)
- // esp +
44 expanded M (32 bytes)
- // esp +
76 expanded Y (32 bytes
)
+ // esp + 0
expanded V (32 bytes)
+ // esp +
32 expanded M (32 bytes)
+ // esp +
64 expanded Y (32 bytes
)
+ // esp +
96 outer loop dv
+ // esp +
100 outer loop bv
+ // esp +
104 av limit (mostly in ESI
)
// esp + 108 bv limit
// esp + 108 bv limit
- // esp + 112 (gap)
- // esp + 124 (top of locals)
+ // esp + 112 (top of locals)
pushreg ebp
pushreg ebx
pushreg esi
pushreg edi
setfp ebp
and esp, ~15
pushreg ebp
pushreg ebx
pushreg esi
pushreg edi
setfp ebp
and esp, ~15
- sub esp, 1
24
+ sub esp, 1
12
endprologue
// Establish the expanded operands.
endprologue
// Establish the expanded operands.
@@
-822,34
+817,34
@@
FUNC(mpxmont_mul4_x86_sse2)
mov edx, [ebp + 40] // -> mi
movdqu xmm0, [ecx] // bv[0]
movdqu xmm2, [edx] // mi
mov edx, [ebp + 40] // -> mi
movdqu xmm0, [ecx] // bv[0]
movdqu xmm2, [edx] // mi
- expand xmm
0, xmm1, xmm2, xmm3, xmm7
- movdqa [esp +
12
], xmm0 // bv[0] expanded low
- movdqa [esp +
28
], xmm1 // bv[0] expanded high
- movdqa [esp +
44
], xmm2 // mi expanded low
- movdqa [esp +
60
], xmm3 // mi expanded high
+ expand xmm
7, xmm0, xmm1, xmm2, xmm3
+ movdqa [esp +
0
], xmm0 // bv[0] expanded low
+ movdqa [esp +
16
], xmm1 // bv[0] expanded high
+ movdqa [esp +
32
], xmm2 // mi expanded low
+ movdqa [esp +
48
], xmm3 // mi expanded high
// Set up the outer loop state and prepare for the first iteration.
mov edx, [ebp + 36] // n
mov eax, [ebp + 24] // -> U = av[0]
mov ebx, [ebp + 32] // -> X = nv[0]
mov edi, [ebp + 20] // -> Z = dv[0]
// Set up the outer loop state and prepare for the first iteration.
mov edx, [ebp + 36] // n
mov eax, [ebp + 24] // -> U = av[0]
mov ebx, [ebp + 32] // -> X = nv[0]
mov edi, [ebp + 20] // -> Z = dv[0]
- mov [esp +
4
], ecx
+ mov [esp +
100
], ecx
lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit
lea edx, [eax + 4*edx] // -> av[n/4] = av limit
lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit
lea edx, [eax + 4*edx] // -> av[n/4] = av limit
- mov [esp + 0], edi
+ mov [esp + 96], edi
+ mov [esp + 104], edx
mov [esp + 108], ecx
mov [esp + 108], ecx
- mov [esp + 8], edx
- lea ecx, [esp + 12] // -> expanded V = bv[0]
- lea esi, [esp + 44] // -> expanded M = mi
- lea edx, [esp + 76] // -> space for Y
+ lea ecx, [esp + 0] // -> expanded V = bv[0]
+ lea esi, [esp + 32] // -> expanded M = mi
+ lea edx, [esp + 64] // -> space for Y
call mmul4
call mmul4
- mov esi, [esp +
8]
// recover av limit
+ mov esi, [esp +
104]
// recover av limit
add edi, 16
add eax, 16
add ebx, 16
cmp eax, esi // done already?
jae 8f
add edi, 16
add eax, 16
add ebx, 16
cmp eax, esi // done already?
jae 8f
- mov [esp +
0
], edi
+ mov [esp +
96
], edi
.p2align 4
// Complete the first inner loop.
.p2align 4
// Complete the first inner loop.
@@
-868,26
+863,26
@@
FUNC(mpxmont_mul4_x86_sse2)
// Embark on the next iteration. (There must be one. If n = 1, then
// we would have bailed above, to label 8. Similarly, the subsequent
// iterations can fall into the inner loop immediately.)
// Embark on the next iteration. (There must be one. If n = 1, then
// we would have bailed above, to label 8. Similarly, the subsequent
// iterations can fall into the inner loop immediately.)
-1: mov eax, [esp +
4]
// -> bv[i - 1]
- mov edi, [esp +
0]
// -> Z = dv[i]
+1: mov eax, [esp +
100]
// -> bv[i - 1]
+ mov edi, [esp +
96]
// -> Z = dv[i]
add eax, 16 // -> bv[i]
pxor xmm7, xmm7
add eax, 16 // -> bv[i]
pxor xmm7, xmm7
- movdqu xmm0, [eax] // bv[i]
- mov [esp + 4], eax
+ mov [esp + 100], eax
cmp eax, [esp + 108] // done yet?
jae 9f
cmp eax, [esp + 108] // done yet?
jae 9f
+ movdqu xmm0, [eax] // bv[i]
mov ebx, [ebp + 32] // -> X = nv[0]
mov ebx, [ebp + 32] // -> X = nv[0]
- lea esi, [esp +
44
] // -> expanded M = mi
+ lea esi, [esp +
32
] // -> expanded M = mi
mov eax, [ebp + 24] // -> U = av[0]
mov eax, [ebp + 24] // -> U = av[0]
- expand xmm
0, xmm1, nil, nil, xmm7
- movdqa [esp +
12], xmm0
// bv[i] expanded low
- movdqa [esp +
28
], xmm1 // bv[i] expanded high
+ expand xmm
7, xmm0, xmm1
+ movdqa [esp +
0], xmm0
// bv[i] expanded low
+ movdqa [esp +
16
], xmm1 // bv[i] expanded high
call mmla4
call mmla4
- mov esi, [esp +
8]
// recover av limit
+ mov esi, [esp +
104]
// recover av limit
add edi, 16
add eax, 16
add ebx, 16
add edi, 16
add eax, 16
add ebx, 16
- mov [esp +
0
], edi
+ mov [esp +
96
], edi
.p2align 4
// Complete the next inner loop.
.p2align 4
// Complete the next inner loop.
@@
-962,7
+957,7
@@
FUNC(mpxmont_redc4_x86_sse2)
mov edx, [ebp + 36] // -> mi
movdqu xmm0, [edx] // mi
and eax, ~15 // mask off the tail end
mov edx, [ebp + 36] // -> mi
movdqu xmm0, [edx] // mi
and eax, ~15 // mask off the tail end
- expand xmm
0, xmm1, nil, nil, xmm7
+ expand xmm
7, xmm0, xmm1
add eax, edi // find limit
movdqa [esp + 12], xmm0 // mi expanded low
movdqa [esp + 28], xmm1 // mi expanded high
add eax, edi // find limit
movdqa [esp + 12], xmm0 // mi expanded low
movdqa [esp + 28], xmm1 // mi expanded high
@@
-978,8
+973,8
@@
FUNC(mpxmont_redc4_x86_sse2)
lea esi, [esp + 12] // -> expanded M = mi
lea edx, [esp + 44] // -> space for Y
call mont4
lea esi, [esp + 12] // -> expanded M = mi
lea edx, [esp + 44] // -> space for Y
call mont4
- add edi, 16
add ebx, 16
add ebx, 16
+ add edi, 16
cmp ebx, ecx // done already?
jae 8f
cmp ebx, ecx // done already?
jae 8f
@@
-1073,20
+1068,23
@@
ENDFUNC
mov [ebx + ecx*8 + 4], edx
.endm
mov [ebx + ecx*8 + 4], edx
.endm
-.macro testprologue
+.macro testprologue
n
pushreg ebp
pushreg ebx
pushreg esi
pushreg edi
setfp ebp
and esp, ~15
pushreg ebp
pushreg ebx
pushreg esi
pushreg edi
setfp ebp
and esp, ~15
- sub esp, 3*32 +
12
+ sub esp, 3*32 +
4*4
endprologue
endprologue
+ mov eax, \n
+ mov [esp + 104], eax
// vars:
// vars:
- // esp + 0 = cycles
- // esp + 12 = v expanded
- // esp + 44 = y expanded
- // esp + 72 = ? expanded
+ // esp + 0 = v expanded
+ // esp + 32 = y expanded
+ // esp + 64 = ? expanded
+ // esp + 96 = cycles
+ // esp + 104 = count
.endm
.macro testepilogue
.endm
.macro testepilogue
@@
-1105,47
+1103,47
@@
ENDFUNC
movdqu xmm6, [ecx + 32] // (c'_2, c''_2)
.endm
movdqu xmm6, [ecx + 32] // (c'_2, c''_2)
.endm
-.macro testexpand v
, y
+.macro testexpand v
=nil, y=nil
pxor xmm7, xmm7
.ifnes "\v", "nil"
mov ecx, \v
movdqu xmm0, [ecx]
pxor xmm7, xmm7
.ifnes "\v", "nil"
mov ecx, \v
movdqu xmm0, [ecx]
- expand xmm
0, xmm1, nil, nil, xmm7
- movdqa [esp +
12
], xmm0
- movdqa [esp +
28
], xmm1
+ expand xmm
7, xmm0, xmm1
+ movdqa [esp +
0
], xmm0
+ movdqa [esp +
16
], xmm1
.endif
.ifnes "\y", "nil"
mov edx, \y
movdqu xmm2, [edx]
.endif
.ifnes "\y", "nil"
mov edx, \y
movdqu xmm2, [edx]
- expand xmm
2, xmm3, nil, nil, xmm7
- movdqa [esp +
44
], xmm2
- movdqa [esp +
60
], xmm3
+ expand xmm
7, xmm2, xmm3
+ movdqa [esp +
32
], xmm2
+ movdqa [esp +
48
], xmm3
.endif
.endm
.endif
.endm
-.macro testtop u
, x, mode
+.macro testtop u
=nil, x=nil, mode=nil
.p2align 4
0:
.ifnes "\u", "nil"
.p2align 4
0:
.ifnes "\u", "nil"
- lea ecx, [esp +
12
]
+ lea ecx, [esp +
0
]
.endif
mov ebx, \x
.ifeqs "\mode", "mont"
.endif
mov ebx, \x
.ifeqs "\mode", "mont"
- lea esi, [esp +
44
]
+ lea esi, [esp +
32
]
.endif
.endif
- cysetup esp +
0
+ cysetup esp +
96
.ifnes "\u", "nil"
mov eax, \u
.endif
.ifeqs "\mode", "mont"
.ifnes "\u", "nil"
mov eax, \u
.endif
.ifeqs "\mode", "mont"
- lea edx, [esp +
76
]
+ lea edx, [esp +
64
]
.else
.else
- lea edx, [esp +
44
]
+ lea edx, [esp +
32
]
.endif
.endm
.endif
.endm
-.macro testtail cyv
, n
- cystore esp +
0, \cyv, \n
+.macro testtail cyv
+ cystore esp +
96, \cyv, esp + 104
jnz 0b
.endm
jnz 0b
.endm
@@
-1157,63
+1155,63
@@
ENDFUNC
.endm
FUNC(test_dmul4)
.endm
FUNC(test_dmul4)
- testprologue
+ testprologue
[ebp + 44]
testldcarry [ebp + 24]
testexpand [ebp + 36], [ebp + 40]
mov edi, [ebp + 20]
testtop [ebp + 28], [ebp + 32]
call dmul4
testldcarry [ebp + 24]
testexpand [ebp + 36], [ebp + 40]
mov edi, [ebp + 20]
testtop [ebp + 28], [ebp + 32]
call dmul4
- testtail [ebp + 48]
, [ebp + 44]
+ testtail [ebp + 48]
testcarryout [ebp + 24]
testepilogue
ENDFUNC
FUNC(test_dmla4)
testcarryout [ebp + 24]
testepilogue
ENDFUNC
FUNC(test_dmla4)
- testprologue
+ testprologue
[ebp + 44]
testldcarry [ebp + 24]
testexpand [ebp + 36], [ebp + 40]
mov edi, [ebp + 20]
testtop [ebp + 28], [ebp + 32]
call dmla4
testldcarry [ebp + 24]
testexpand [ebp + 36], [ebp + 40]
mov edi, [ebp + 20]
testtop [ebp + 28], [ebp + 32]
call dmla4
- testtail [ebp + 48]
, [ebp + 44]
+ testtail [ebp + 48]
testcarryout [ebp + 24]
testepilogue
ENDFUNC
FUNC(test_mul4)
testcarryout [ebp + 24]
testepilogue
ENDFUNC
FUNC(test_mul4)
- testprologue
+ testprologue
[ebp + 36]
testldcarry [ebp + 24]
testexpand nil, [ebp + 32]
mov edi, [ebp + 20]
testtop nil, [ebp + 28]
call mul4
testldcarry [ebp + 24]
testexpand nil, [ebp + 32]
mov edi, [ebp + 20]
testtop nil, [ebp + 28]
call mul4
- testtail [ebp + 40]
, [ebp + 36]
+ testtail [ebp + 40]
testcarryout [ebp + 24]
testepilogue
ENDFUNC
FUNC(test_mla4)
testcarryout [ebp + 24]
testepilogue
ENDFUNC
FUNC(test_mla4)
- testprologue
+ testprologue
[ebp + 36]
testldcarry [ebp + 24]
testexpand nil, [ebp + 32]
mov edi, [ebp + 20]
testtop nil, [ebp + 28]
call mla4
testldcarry [ebp + 24]
testexpand nil, [ebp + 32]
mov edi, [ebp + 20]
testtop nil, [ebp + 28]
call mla4
- testtail [ebp + 40]
, [ebp + 36]
+ testtail [ebp + 40]
testcarryout [ebp + 24]
testepilogue
ENDFUNC
FUNC(test_mmul4)
testcarryout [ebp + 24]
testepilogue
ENDFUNC
FUNC(test_mmul4)
- testprologue
+ testprologue
[ebp + 48]
testexpand [ebp + 40], [ebp + 44]
mov edi, [ebp + 20]
testtop [ebp + 32], [ebp + 36], mont
call mmul4
testexpand [ebp + 40], [ebp + 44]
mov edi, [ebp + 20]
testtop [ebp + 32], [ebp + 36], mont
call mmul4
- testtail [ebp + 52]
, [ebp + 48]
+ testtail [ebp + 52]
mov edi, [ebp + 28]
mov edi, [ebp + 28]
- movdqa xmm0, [esp +
76
]
- movdqa xmm1, [esp +
92
]
+ movdqa xmm0, [esp +
64
]
+ movdqa xmm1, [esp +
80
]
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [ebp + 24]
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [ebp + 24]
@@
-1221,15
+1219,15
@@
FUNC(test_mmul4)
ENDFUNC
FUNC(test_mmla4)
ENDFUNC
FUNC(test_mmla4)
- testprologue
+ testprologue
[ebp + 48]
testexpand [ebp + 40], [ebp + 44]
mov edi, [ebp + 20]
testtop [ebp + 32], [ebp + 36], mont
call mmla4
testexpand [ebp + 40], [ebp + 44]
mov edi, [ebp + 20]
testtop [ebp + 32], [ebp + 36], mont
call mmla4
- testtail [ebp + 52]
, [ebp + 48]
+ testtail [ebp + 52]
mov edi, [ebp + 28]
mov edi, [ebp + 28]
- movdqa xmm0, [esp +
76
]
- movdqa xmm1, [esp +
92
]
+ movdqa xmm0, [esp +
64
]
+ movdqa xmm1, [esp +
80
]
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [ebp + 24]
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [ebp + 24]
@@
-1237,15
+1235,15
@@
FUNC(test_mmla4)
ENDFUNC
FUNC(test_mont4)
ENDFUNC
FUNC(test_mont4)
- testprologue
+ testprologue
[ebp + 40]
testexpand nil, [ebp + 36]
mov edi, [ebp + 20]
testtop nil, [ebp + 32], mont
call mont4
testexpand nil, [ebp + 36]
mov edi, [ebp + 20]
testtop nil, [ebp + 32], mont
call mont4
- testtail [ebp + 44]
, [ebp + 40]
+ testtail [ebp + 44]
mov edi, [ebp + 28]
mov edi, [ebp + 28]
- movdqa xmm0, [esp +
76
]
- movdqa xmm1, [esp +
92
]
+ movdqa xmm0, [esp +
64
]
+ movdqa xmm1, [esp +
80
]
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [ebp + 24]
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [ebp + 24]