X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/71ac8e5eb7dcaf08a9db60c7b460120f3f43d8a1..a1a9ee0a7240087e202a7855e470573de0e59c09:/math/mpx-mul4-x86-sse2.S diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S index 5d5714da..2f7b5ec9 100644 --- a/math/mpx-mul4-x86-sse2.S +++ b/math/mpx-mul4-x86-sse2.S @@ -24,15 +24,13 @@ /// MA 02111-1307, USA. ///-------------------------------------------------------------------------- -/// External definitions. +/// Preliminaries. #include "config.h" #include "asm-common.h" -///-------------------------------------------------------------------------- -/// Prologue. - .arch pentium4 + .text ///-------------------------------------------------------------------------- @@ -96,41 +94,41 @@ .macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil // Load a word r_i from R, multiply by the expanded operand [S], and // leave the pieces of the product in registers D0, D1, D2, D3. - movd \d0, \r // (r_i, 0, 0, 0) + movd \d0, \r // (r_i, 0; 0, 0) .ifnes "\d1", "nil" - movdqa \d1, [\s] // (s'_0, s'_1, s''_0, s''_1) + movdqa \d1, [\s] // (s'_0, s'_1; s''_0, s''_1) .endif .ifnes "\d3", "nil" - movdqa \d3, [\s + 16] // (s'_2, s'_3, s''_2, s''_3) + movdqa \d3, [\s + 16] // (s'_2, s'_3; s''_2, s''_3) .endif - pshufd \d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?, r_i, ?) + pshufd \d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?) .ifnes "\d1", "nil" - psrldq \d1, 4 // (s'_1, s''_0, s''_1, 0) + psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0) .endif .ifnes "\d2", "nil" .ifnes "\d3", "nil" - movdqa \d2, \d3 // another copy of (s'_2, s'_3, ...) + movdqa \d2, \d3 // another copy of (s'_2, s'_3; ...) .else - movdqa \d2, \d0 // another copy of (r_i, ?, r_i, ?) + movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?) .endif .endif .ifnes "\d3", "nil" - psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0) + psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0) .endif .ifnes "\d1", "nil" - pmuludq \d1, \d0 // (r_i s'_1, r_i s''_1) + pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1) .endif .ifnes "\d3", "nil" - pmuludq \d3, \d0 // (r_i s'_3, r_i s''_3) + pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3) .endif .ifnes "\d2", "nil" .ifnes "\d3", "nil" - pmuludq \d2, \d0 // (r_i s'_2, r_i s''_2) + pmuludq \d2, \d0 // (r_i s'_2; r_i s''_2) .else pmuludq \d2, [\s + 16] .endif .endif - pmuludq \d0, [\s] // (r_i s'_0, r_i s''_0) + pmuludq \d0, [\s] // (r_i s'_0; r_i s''_0) .endm .macro accum c0, c1=nil, c2=nil, c3=nil @@ -171,10 +169,10 @@ // carry registers. On completion, XMM3 is clobbered. If CC is // `nil', then the contribution which would have been added to it is // left in C. - pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B) - psrldq xmm3, 12 // (t, 0, 0, 0) = (t, 0) - pslldq xmm3, 2 // (t b, 0) - paddq \c, xmm3 // (c' + t b, c'') + pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B) + psrldq xmm3, 12 // (t, 0; 0, 0) = (t, 0) + pslldq xmm3, 2 // (t b; 0) + paddq \c, xmm3 // (c' + t b; c'') movd \d, \c psrlq \c, 32 // floor(c/B) .ifnes "\cc", "nil" @@ -187,33 +185,33 @@ // of the value represented in C are written to D, and the remaining // bits are left at the bottom of T. 
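//
// (Illustrative aside: this redundant carry convention is easiest to
// see in scalar C.  A carry C is a pair of 64-bit halves c' and c'',
// standing for the value c' + c'' b, with B = 2^32 and b = 2^16.  A
// hedged sketch, with hypothetical names:
//
//	uint64_t c = clo + (chi << 16);	/* gather c' + c'' b */
//	uint32_t d = (uint32_t)c;	/* low 32 bits out */
//	c >>= 32;			/* keep floor(c/B) */
//
// Keeping the halves apart leaves 16 bits of headroom in each, so
// many 32x16-bit partial products can be accumulated before an
// explicit carry step like this one is forced.)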
movdqa \t, \c - psllq \t, 16 // (?, c'' b) - pslldq \c, 8 // (0, c') - paddq \t, \c // (?, c' + c'' b) - psrldq \t, 8 // c' + c'' b + psllq \t, 16 // (?; c'' b) + pslldq \c, 8 // (0; c') + paddq \t, \c // (?; c' + c'' b) + psrldq \t, 8 // (c' + c'' b; 0) = (c; 0) movd \d, \t - psrldq \t, 4 // floor((c' + c'' b)/B) + psrldq \t, 4 // (floor(c/B); 0) .endm .macro expand z, a, b, c=nil, d=nil // On entry, A and C hold packed 128-bit values, and Z is zero. On // exit, A:B and C:D together hold the same values in expanded // form. If C is `nil', then only expand A to A:B. - movdqa \b, \a // (a_0, a_1, a_2, a_3) + movdqa \b, \a // (a_0, a_1; a_2, a_3) .ifnes "\c", "nil" - movdqa \d, \c // (c_0, c_1, c_2, c_3) + movdqa \d, \c // (c_0, c_1; c_2, c_3) .endif - punpcklwd \a, \z // (a'_0, a''_0, a'_1, a''_1) - punpckhwd \b, \z // (a'_2, a''_2, a'_3, a''_3) + punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1) + punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3) .ifnes "\c", "nil" - punpcklwd \c, \z // (c'_0, c''_0, c'_1, c''_1) - punpckhwd \d, \z // (c'_2, c''_2, c'_3, c''_3) + punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1) + punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3) .endif - pshufd \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1) - pshufd \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3) + pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1) + pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3) .ifnes "\c", "nil" - pshufd \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1) - pshufd \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3) + pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1) + pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3) .endif .endm @@ -229,10 +227,10 @@ // we can do that, we must gather them together. movdqa \t, \c0 movdqa \u, \c1 - punpcklqdq \t, \c2 // (y'_0, y'_2) - punpckhqdq \c0, \c2 // (y''_0, y''_2) - punpcklqdq \u, \c3 // (y'_1, y'_3) - punpckhqdq \c1, \c3 // (y''_1, y''_3) + punpcklqdq \t, \c2 // (y'_0; y'_2) + punpckhqdq \c0, \c2 // (y''_0; y''_2) + punpcklqdq \u, \c3 // (y'_1; y'_3) + punpckhqdq \c1, \c3 // (y''_1; y''_3) // Now split the double-prime pieces. The high (up to) 48 bits will // go up; the low 16 bits go down. @@ -240,43 +238,43 @@ movdqa \c3, \c1 psllq \c2, 48 psllq \c3, 48 - psrlq \c0, 16 // high parts of (y''_0, y''_2) - psrlq \c1, 16 // high parts of (y''_1, y''_3) - psrlq \c2, 32 // low parts of (y''_0, y''_2) - psrlq \c3, 32 // low parts of (y''_1, y''_3) + psrlq \c0, 16 // high parts of (y''_0; y''_2) + psrlq \c1, 16 // high parts of (y''_1; y''_3) + psrlq \c2, 32 // low parts of (y''_0; y''_2) + psrlq \c3, 32 // low parts of (y''_1; y''_3) .ifnes "\hi", "nil" movdqa \hi, \c1 .endif - pslldq \c1, 8 // high part of (0, y''_1) + pslldq \c1, 8 // high part of (0; y''_1) paddq \t, \c2 // propagate down paddq \u, \c3 - paddq \t, \c1 // and up: (y_0, y_2) - paddq \u, \c0 // (y_1, y_3) + paddq \t, \c1 // and up: (y_0; y_2) + paddq \u, \c0 // (y_1; y_3) .ifnes "\hi", "nil" - psrldq \hi, 8 // high part of (y''_3, 0) + psrldq \hi, 8 // high part of (y''_3; 0) .endif // Finally extract the answer. This complicated dance is better than // storing to memory and loading, because the piecemeal stores // inhibit store forwarding. - movdqa \c3, \t // (y_0, y_1) - movdqa \lo, \t // (y^*_0, ?, ?, ?) - psrldq \t, 8 // (y_2, 0) - psrlq \c3, 32 // (floor(y_0/B), ?) - paddq \c3, \u // (y_1 + floor(y_0/B), ?) - movdqa \c1, \c3 // (y^*_1, ?, ?, ?) - psrldq \u, 8 // (y_3, 0) - psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2, ?) 
- paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2, ?)
- punpckldq \lo, \c3 // (y^*_0, y^*_2, ?, ?)
- psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
- paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
+ movdqa \c3, \t // (y_0; ?)
+ movdqa \lo, \t // (y^*_0, ?; ?, ?)
+ psrldq \t, 8 // (y_2; 0)
+ psrlq \c3, 32 // (floor(y_0/B); ?)
+ paddq \c3, \u // (y_1 + floor(y_0/B); ?)
+ movdqa \c1, \c3 // (y^*_1, ?; ?, ?)
+ psrldq \u, 8 // (y_3; 0)
+ psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2); ?)
+ paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2); ?)
+ punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?)
+ psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
+ paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
.ifnes "\hi", "nil"
movdqa \t, \c3
pxor \u, \u
.endif
- punpckldq \c1, \c3 // (y^*_1, y^*_3, ?, ?)
+ punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?)
.ifnes "\hi", "nil"
psrlq \t, 32 // very high bits of y
paddq \hi, \t
@@ -293,14 +291,14 @@
// On exit, the carry registers, including XMM7, are updated to hold
// C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
// registers are preserved.
- movd xmm0, [edi + 0] // (a_0, 0)
- movd xmm1, [edi + 4] // (a_1, 0)
- movd xmm2, [edi + 8] // (a_2, 0)
- movd xmm7, [edi + 12] // (a_3, 0)
-
- paddq xmm4, xmm0 // (c'_0 + a_0, c''_0)
- paddq xmm5, xmm1 // (c'_1 + a_1, c''_1)
- paddq xmm6, xmm2 // (c'_2 + a_2, c''_2 + a_3 b)
+ movd xmm0, [edi + 0] // (a_0; 0)
+ movd xmm1, [edi + 4] // (a_1; 0)
+ movd xmm2, [edi + 8] // (a_2; 0)
+ movd xmm7, [edi + 12] // (a_3; 0)
+
+ paddq xmm4, xmm0 // (c'_0 + a_0; c''_0)
+ paddq xmm5, xmm1 // (c'_1 + a_1; c''_1)
+ paddq xmm6, xmm2 // (c'_2 + a_2; c''_2 + a_3 b)
.endm

///--------------------------------------------------------------------------

@@ -318,7 +316,6 @@ INTFUNC(carryprop)
propout [edi + 8], xmm6, nil
endprop [edi + 12], xmm6, xmm4
ret
-
ENDFUNC

INTFUNC(dmul4)

@@ -350,7 +347,6 @@ INTFUNC(dmul4)
propout [edi + 12], xmm7, xmm4
ret
-
ENDFUNC

INTFUNC(dmla4)

@@ -386,7 +382,6 @@ INTFUNC(dmla4)
propout [edi + 12], xmm7, xmm4
ret
-
ENDFUNC

INTFUNC(mul4zc)

@@ -412,7 +407,6 @@ INTFUNC(mul4zc)
propout [edi + 12], xmm7, xmm4
ret
-
ENDFUNC

INTFUNC(mul4)

@@ -440,7 +434,6 @@ INTFUNC(mul4)
propout [edi + 12], xmm7, xmm4
ret
-
ENDFUNC

INTFUNC(mla4zc)

@@ -472,7 +465,6 @@ INTFUNC(mla4zc)
propout [edi + 12], xmm7, xmm4
ret
-
ENDFUNC

INTFUNC(mla4)

@@ -503,7 +495,6 @@ INTFUNC(mla4)
propout [edi + 12], xmm7, xmm4
ret
-
ENDFUNC

INTFUNC(mmul4)

@@ -511,14 +502,13 @@ INTFUNC(mmul4)
// to the packed operands U and N; ECX and ESI point to the expanded
// operands V and M; and EDX points to a place to store an expanded
// result Y (32 bytes, at a 16-byte boundary). The stack pointer
- // must be 16-byte aligned. (This is not the usual convention, which
- // requires alignment before the call.)
+ // must be 12 modulo 16, as is usual for modern x86 ABIs.
//
// On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
// of the sum U V + N Y to [EDI], leaving the remaining carry in
// XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
// XMM7 are clobbered; the general-purpose registers are preserved.
- stalloc 48 // space for the carries
+ stalloc 48 + 12 // space for the carries
endprologue

// Calculate W = U V, and leave it in the destination. Stash the
@@ -526,30 +516,31 @@ INTFUNC(mmul4)
mulcore [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5
jmp 5f
-
ENDFUNC

INTFUNC(mmla4)
// On entry, EDI points to the destination buffer, which also
- // contains an addend A to accumulate; EAX and EBX point
- // to the packed operands U and N; ECX and ESI point to the expanded
+ // contains an addend A to accumulate; EAX and EBX point to the
+ // packed operands U and N; ECX and ESI point to the expanded
// operands V and M; and EDX points to a place to store an expanded
// result Y (32 bytes, at a 16-byte boundary). The stack pointer
- // must be 16-byte aligned. (This is not the usual convention, which
- // requires alignment before the call.)
+ // must be 12 modulo 16, as is usual for modern x86 ABIs.
//
// On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
// bits of the sum A + U V + N Y to [EDI], leaving the remaining
// carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
// XMM3, and XMM7 are clobbered; the general-purpose registers are
// preserved.
- stalloc 48 // space for the carries
+ stalloc 48 + 12 // space for the carries
endprologue

movd xmm4, [edi + 0]
movd xmm5, [edi + 4]
movd xmm6, [edi + 8]
movd xmm7, [edi + 12]
+
+ // Calculate W = U V, and leave it in the destination. Stash the
+ // carry pieces for later.
mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
propout [edi + 0], xmm4, xmm5

@@ -562,9 +553,9 @@ INTFUNC(mmla4)
mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
propout [edi + 12], xmm7, xmm4

- movdqa [esp + 0], xmm4
- movdqa [esp + 16], xmm5
- movdqa [esp + 32], xmm6
+ movdqa [SP + 0], xmm4
+ movdqa [SP + 16], xmm5
+ movdqa [SP + 32], xmm6

// Calculate Y = W M.
mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7

@@ -607,14 +598,13 @@ INTFUNC(mmla4)
propout [edi + 12], xmm7, xmm4

// Add on the carry we calculated earlier.
- paddq xmm4, [esp + 0]
- paddq xmm5, [esp + 16]
- paddq xmm6, [esp + 32]
+ paddq xmm4, [SP + 0]
+ paddq xmm5, [SP + 16]
+ paddq xmm6, [SP + 32]

// And, with that, we're done.
- stfree 48
+ stfree 48 + 12
ret
-
ENDFUNC

INTFUNC(mont4)

@@ -671,50 +661,57 @@ INTFUNC(mont4)
// And, with that, we're done.
ret
-
ENDFUNC

///--------------------------------------------------------------------------
/// Bulk multipliers.

+FUNC(mpx_umul4_x86_avx)
+ .arch .avx
+ vzeroupper
+ endprologue
+ // and drop through...
+ .arch pentium4
+ENDFUNC
+
FUNC(mpx_umul4_x86_sse2)
// void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
// const mpw *bv, const mpw *bvl);

- // Build a stack frame. Arguments will be relative to EBP, as
+ // Build a stack frame. Arguments will be relative to BP, as
// follows.
//
- // ebp + 20 dv
- // ebp + 24 av
- // ebp + 28 avl
- // ebp + 32 bv
- // ebp + 36 bvl
+ // BP + 20 dv
+ // BP + 24 av
+ // BP + 28 avl
+ // BP + 32 bv
+ // BP + 36 bvl
//
- // Locals are relative to ESP, as follows.
+ // Locals are relative to SP, as follows.
//
- // esp + 0 expanded Y (32 bytes)
- // esp + 32 (top of locals)
- pushreg ebp
+ // SP + 0 expanded Y (32 bytes)
+ // SP + 32 (top of locals)
+ pushreg BP
pushreg ebx
pushreg esi
pushreg edi
- setfp ebp
- and esp, ~15
- sub esp, 32
+ setfp
+ stalloc 32
+ and SP, ~15
endprologue
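// (Illustrative aside: stripped of the SIMD detail, this routine is
// plain schoolbook multiplication, accumulating one word of bv at a
// time.  A hedged scalar C model -- hypothetical, and assuming dv is
// zeroed first, which the real code avoids by making its first pass
// write rather than accumulate:
//
//	for (const mpw *b = bv; b < bvl; b++, dv++) {
//		uint64_t c = 0;
//		mpw *d = dv;
//		for (const mpw *a = av; a < avl; a++, d++) {
//			c += *d + (uint64_t)*a * *b;
//			*d = (mpw)c; c >>= 32;
//		}
//		*d = (mpw)c;	/* final carry is a single word */
//	}
//
// The code below does the same job four words of bv per outer pass
// and four words of av per inner step, with carries kept in the
// redundant form described earlier.)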
// Prepare for the first iteration.
- mov esi, [ebp + 32] // -> bv[0]
+ mov esi, [BP + 32] // -> bv[0]
pxor xmm7, xmm7
movdqu xmm0, [esi] // bv[0]
- mov edi, [ebp + 20] // -> dv[0]
+ mov edi, [BP + 20] // -> dv[0]
mov ecx, edi // outer loop dv cursor
expand xmm7, xmm0, xmm1
- mov ebx, [ebp + 24] // -> av[0]
- mov eax, [ebp + 28] // -> av[m] = av limit
- mov edx, esp // -> expanded Y = bv[0]
- movdqa [esp + 0], xmm0 // bv[0] expanded low
- movdqa [esp + 16], xmm1 // bv[0] expanded high
+ mov ebx, [BP + 24] // -> av[0]
+ mov eax, [BP + 28] // -> av[m] = av limit
+ mov edx, SP // -> expanded Y = bv[0]
+ movdqa [SP + 0], xmm0 // bv[0] expanded low
+ movdqa [SP + 16], xmm1 // bv[0] expanded high
call mul4zc
add ebx, 16
add edi, 16

@@ -733,7 +730,7 @@ FUNC(mpx_umul4_x86_sse2)
// Write out the leftover carry. There can be no tail here.
8: call carryprop
- cmp esi, [ebp + 36] // more passes to do?
+ cmp esi, [BP + 36] // more passes to do?
jae 9f

.p2align 4

@@ -742,9 +739,9 @@ FUNC(mpx_umul4_x86_sse2)
mov edi, ecx // -> dv[i]
pxor xmm7, xmm7
expand xmm7, xmm0, xmm1
- mov ebx, [ebp + 24] // -> av[0]
- movdqa [esp + 0], xmm0 // bv[i] expanded low
- movdqa [esp + 16], xmm1 // bv[i] expanded high
+ mov ebx, [BP + 24] // -> av[0]
+ movdqa [SP + 0], xmm0 // bv[i] expanded low
+ movdqa [SP + 16], xmm1 // bv[i] expanded high
call mla4zc
add edi, 16
add ebx, 16

@@ -764,7 +761,7 @@ FUNC(mpx_umul4_x86_sse2)
// Finish off this pass. There was no tail on the previous pass, and
// there can be none on this pass.
8: call carryprop
- cmp esi, [ebp + 36]
+ cmp esi, [BP + 36]
jb 1b

// All over.
@@ -772,79 +769,85 @@ FUNC(mpx_umul4_x86_sse2)
pop edi
pop esi
pop ebx
- pop ebp
+ pop BP
ret
+ENDFUNC
+
+FUNC(mpxmont_mul4_x86_avx)
+ .arch .avx
+ vzeroupper
+ endprologue
+ // and drop through...
+ .arch pentium4
ENDFUNC

FUNC(mpxmont_mul4_x86_sse2)
// void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
// const mpw *nv, size_t n, const mpw *mi);

- // Build a stack frame. Arguments will be relative to EBP, as
+ // Build a stack frame. Arguments will be relative to BP, as
// follows.
//
- // ebp + 20 dv
- // ebp + 24 av
- // ebp + 28 bv
- // ebp + 32 nv
- // ebp + 36 n (nonzero multiple of 4)
- // ebp + 40 mi
+ // BP + 20 dv
+ // BP + 24 av
+ // BP + 28 bv
+ // BP + 32 nv
+ // BP + 36 n (nonzero multiple of 4)
+ // BP + 40 mi
//
- // Locals are relative to ESP, which is 4 mod 16, as follows.
+ // Locals are relative to SP, which is 16-byte aligned, as follows.
//
- // esp + 0 outer loop dv
- // esp + 4 outer loop bv
- // esp + 8 av limit (mostly in ESI)
- // esp + 12 expanded V (32 bytes)
- // esp + 44 expanded M (32 bytes)
- // esp + 76 expanded Y (32 bytes)
- // esp + 108 bv limit
- // esp + 112 (gap)
- // esp + 124 (top of locals)
- pushreg ebp
+ // SP + 0 expanded V (32 bytes)
+ // SP + 32 expanded M (32 bytes)
+ // SP + 64 expanded Y (32 bytes)
+ // SP + 96 outer loop dv
+ // SP + 100 outer loop bv
+ // SP + 104 av limit (mostly in ESI)
+ // SP + 108 bv limit
+ // SP + 112 (top of locals)
+ pushreg BP
pushreg ebx
pushreg esi
pushreg edi
- setfp ebp
- and esp, ~15
- sub esp, 124
+ setfp
+ stalloc 112
+ and SP, ~15
endprologue

// Establish the expanded operands.
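// (Illustrative aside: `expand' splits four packed 32-bit digits
// x[4] into their 16-bit halves, spread across eight 32-bit lanes
// ready for pmuludq.  In hedged scalar C, the two output registers
// hold
//
//	uint32_t e01[4] = { x[0] & 0xffff, x[1] & 0xffff,
//			    x[0] >> 16,    x[1] >> 16 };
//	uint32_t e23[4] = { x[2] & 0xffff, x[3] & 0xffff,
//			    x[2] >> 16,    x[3] >> 16 };
//
// matching the (x'_0, x'_1; x''_0, x''_1) lane pattern used in the
// comments above: each 64-bit multiply lane then sees a full 32-bit
// word on one side but only 16 bits on the other.)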
pxor xmm7, xmm7 - mov ecx, [ebp + 28] // -> bv - mov edx, [ebp + 40] // -> mi + mov ecx, [BP + 28] // -> bv + mov edx, [BP + 40] // -> mi movdqu xmm0, [ecx] // bv[0] movdqu xmm2, [edx] // mi expand xmm7, xmm0, xmm1, xmm2, xmm3 - movdqa [esp + 12], xmm0 // bv[0] expanded low - movdqa [esp + 28], xmm1 // bv[0] expanded high - movdqa [esp + 44], xmm2 // mi expanded low - movdqa [esp + 60], xmm3 // mi expanded high + movdqa [SP + 0], xmm0 // bv[0] expanded low + movdqa [SP + 16], xmm1 // bv[0] expanded high + movdqa [SP + 32], xmm2 // mi expanded low + movdqa [SP + 48], xmm3 // mi expanded high // Set up the outer loop state and prepare for the first iteration. - mov edx, [ebp + 36] // n - mov eax, [ebp + 24] // -> U = av[0] - mov ebx, [ebp + 32] // -> X = nv[0] - mov edi, [ebp + 20] // -> Z = dv[0] - mov [esp + 4], ecx + mov edx, [BP + 36] // n + mov eax, [BP + 24] // -> U = av[0] + mov ebx, [BP + 32] // -> X = nv[0] + mov edi, [BP + 20] // -> Z = dv[0] + mov [SP + 100], ecx lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit lea edx, [eax + 4*edx] // -> av[n/4] = av limit - mov [esp + 0], edi - mov [esp + 108], ecx - mov [esp + 8], edx - lea ecx, [esp + 12] // -> expanded V = bv[0] - lea esi, [esp + 44] // -> expanded M = mi - lea edx, [esp + 76] // -> space for Y + mov [SP + 96], edi + mov [SP + 104], edx + mov [SP + 108], ecx + lea ecx, [SP + 0] // -> expanded V = bv[0] + lea esi, [SP + 32] // -> expanded M = mi + lea edx, [SP + 64] // -> space for Y call mmul4 - mov esi, [esp + 8] // recover av limit + mov esi, [SP + 104] // recover av limit add edi, 16 add eax, 16 add ebx, 16 cmp eax, esi // done already? jae 8f - mov [esp + 0], edi + mov [SP + 96], edi .p2align 4 // Complete the first inner loop. @@ -863,26 +866,26 @@ FUNC(mpxmont_mul4_x86_sse2) // Embark on the next iteration. (There must be one. If n = 1, then // we would have bailed above, to label 8. Similarly, the subsequent // iterations can fall into the inner loop immediately.) -1: mov eax, [esp + 4] // -> bv[i - 1] - mov edi, [esp + 0] // -> Z = dv[i] +1: mov eax, [SP + 100] // -> bv[i - 1] + mov edi, [SP + 96] // -> Z = dv[i] add eax, 16 // -> bv[i] pxor xmm7, xmm7 - movdqu xmm0, [eax] // bv[i] - mov [esp + 4], eax - cmp eax, [esp + 108] // done yet? + mov [SP + 100], eax + cmp eax, [SP + 108] // done yet? jae 9f - mov ebx, [ebp + 32] // -> X = nv[0] - lea esi, [esp + 44] // -> expanded M = mi - mov eax, [ebp + 24] // -> U = av[0] + movdqu xmm0, [eax] // bv[i] + mov ebx, [BP + 32] // -> X = nv[0] + lea esi, [SP + 32] // -> expanded M = mi + mov eax, [BP + 24] // -> U = av[0] expand xmm7, xmm0, xmm1 - movdqa [esp + 12], xmm0 // bv[i] expanded low - movdqa [esp + 28], xmm1 // bv[i] expanded high + movdqa [SP + 0], xmm0 // bv[i] expanded low + movdqa [SP + 16], xmm1 // bv[i] expanded high call mmla4 - mov esi, [esp + 8] // recover av limit + mov esi, [SP + 104] // recover av limit add edi, 16 add eax, 16 add ebx, 16 - mov [esp + 0], edi + mov [SP + 96], edi .p2align 4 // Complete the next inner loop. @@ -914,67 +917,74 @@ FUNC(mpxmont_mul4_x86_sse2) popreg edi popreg esi popreg ebx - popreg ebp + popreg BP ret +ENDFUNC +FUNC(mpxmont_redc4_x86_avx) + .arch .avx + vzeroupper + endprologue + // and drop through... + .arch pentium4 ENDFUNC FUNC(mpxmont_redc4_x86_sse2) // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv, // size_t n, const mpw *mi); - // Build a stack frame. Arguments will be relative to EBP, as + // Build a stack frame. Arguments will be relative to BP, as // follows. 
// - // ebp + 20 dv - // ebp + 24 dvl - // ebp + 28 nv - // ebp + 32 n (nonzero multiple of 4) - // ebp + 36 mi + // BP + 20 dv + // BP + 24 dvl + // BP + 28 nv + // BP + 32 n (nonzero multiple of 4) + // BP + 36 mi // - // Locals are relative to ESP, as follows. + // Locals are relative to SP, as follows. // - // esp + 0 outer loop dv - // esp + 4 outer dv limit - // esp + 8 blocks-of-4 dv limit - // esp + 12 expanded M (32 bytes) - // esp + 44 expanded Y (32 bytes) - // esp + 76 (top of locals) - pushreg ebp + // SP + 0 outer loop dv + // SP + 4 outer dv limit + // SP + 8 blocks-of-4 dv limit + // SP + 12 expanded M (32 bytes) + // SP + 44 expanded Y (32 bytes) + // SP + 76 (top of locals) + pushreg BP pushreg ebx pushreg esi pushreg edi - setfp ebp - and esp, ~15 - sub esp, 76 + setfp + and SP, ~15 + stalloc 76 endprologue // Establish the expanded operands and the blocks-of-4 dv limit. - mov edi, [ebp + 20] // -> Z = dv[0] + mov edi, [BP + 20] // -> Z = dv[0] pxor xmm7, xmm7 - mov eax, [ebp + 24] // -> dv[n] = dv limit + mov eax, [BP + 24] // -> dv[n] = dv limit sub eax, edi // length of dv in bytes - mov edx, [ebp + 36] // -> mi + mov edx, [BP + 36] // -> mi movdqu xmm0, [edx] // mi and eax, ~15 // mask off the tail end expand xmm7, xmm0, xmm1 add eax, edi // find limit - movdqa [esp + 12], xmm0 // mi expanded low - movdqa [esp + 28], xmm1 // mi expanded high - mov [esp + 8], eax + movdqa [SP + 12], xmm0 // mi expanded low + movdqa [SP + 28], xmm1 // mi expanded high + mov [SP + 8], eax // Set up the outer loop state and prepare for the first iteration. - mov ecx, [ebp + 32] // n - mov ebx, [ebp + 28] // -> X = nv[0] + mov ecx, [BP + 32] // n + mov ebx, [BP + 28] // -> X = nv[0] lea edx, [edi + 4*ecx] // -> dv[n/4] = outer dv limit lea ecx, [ebx + 4*ecx] // -> nv[n/4] = nv limit - mov [esp + 0], edi - mov [esp + 4], edx - lea esi, [esp + 12] // -> expanded M = mi - lea edx, [esp + 44] // -> space for Y + mov [SP + 0], edi + mov [SP + 4], edx + lea esi, [SP + 12] // -> expanded M = mi + lea edx, [SP + 44] // -> space for Y call mont4 - add edi, 16 add ebx, 16 + add edi, 16 cmp ebx, ecx // done already? jae 8f @@ -988,8 +998,8 @@ FUNC(mpxmont_redc4_x86_sse2) // Still have carries left to propagate. 8: carryadd - mov esi, [esp + 8] // -> dv blocks limit - mov edx, [ebp + 24] // dv limit + mov esi, [SP + 8] // -> dv blocks limit + mov edx, [BP + 24] // dv limit psllq xmm7, 16 pslldq xmm7, 8 paddq xmm6, xmm7 @@ -1022,14 +1032,14 @@ FUNC(mpxmont_redc4_x86_sse2) // All done for this iteration. Start the next. (This must have at // least one follow-on iteration, or we'd not have started this outer // loop.) -8: mov edi, [esp + 0] // -> dv[i - 1] - mov ebx, [ebp + 28] // -> X = nv[0] - lea edx, [esp + 44] // -> space for Y - lea esi, [esp + 12] // -> expanded M = mi +8: mov edi, [SP + 0] // -> dv[i - 1] + mov ebx, [BP + 28] // -> X = nv[0] + lea edx, [SP + 44] // -> space for Y + lea esi, [SP + 12] // -> expanded M = mi add edi, 16 // -> Z = dv[i] - cmp edi, [esp + 4] // all done yet? + cmp edi, [SP + 4] // all done yet? 
jae 9f - mov [esp + 0], edi + mov [SP + 0], edi call mont4 add edi, 16 add ebx, 16 @@ -1040,9 +1050,8 @@ FUNC(mpxmont_redc4_x86_sse2) popreg edi popreg esi popreg ebx - popreg ebp + popreg BP ret - ENDFUNC ///-------------------------------------------------------------------------- @@ -1068,20 +1077,23 @@ ENDFUNC mov [ebx + ecx*8 + 4], edx .endm -.macro testprologue - pushreg ebp +.macro testprologue n + pushreg BP pushreg ebx pushreg esi pushreg edi - setfp ebp - and esp, ~15 - sub esp, 3*32 + 12 + setfp + stalloc 3*32 + 4*4 + and SP, ~15 endprologue + mov eax, \n + mov [SP + 104], eax // vars: - // esp + 0 = cycles - // esp + 12 = v expanded - // esp + 44 = y expanded - // esp + 72 = ? expanded + // SP + 0 = v expanded + // SP + 32 = y expanded + // SP + 64 = ? expanded + // SP + 96 = cycles + // SP + 104 = count .endm .macro testepilogue @@ -1089,15 +1101,15 @@ ENDFUNC popreg edi popreg esi popreg ebx - popreg ebp + popreg BP ret .endm .macro testldcarry c mov ecx, \c // -> c - movdqu xmm4, [ecx + 0] // (c'_0, c''_0) - movdqu xmm5, [ecx + 16] // (c'_1, c''_1) - movdqu xmm6, [ecx + 32] // (c'_2, c''_2) + movdqu xmm4, [ecx + 0] // (c'_0; c''_0) + movdqu xmm5, [ecx + 16] // (c'_1; c''_1) + movdqu xmm6, [ecx + 32] // (c'_2; c''_2) .endm .macro testexpand v=nil, y=nil @@ -1106,15 +1118,15 @@ ENDFUNC mov ecx, \v movdqu xmm0, [ecx] expand xmm7, xmm0, xmm1 - movdqa [esp + 12], xmm0 - movdqa [esp + 28], xmm1 + movdqa [SP + 0], xmm0 + movdqa [SP + 16], xmm1 .endif .ifnes "\y", "nil" mov edx, \y movdqu xmm2, [edx] expand xmm7, xmm2, xmm3 - movdqa [esp + 44], xmm2 - movdqa [esp + 60], xmm3 + movdqa [SP + 32], xmm2 + movdqa [SP + 48], xmm3 .endif .endm @@ -1122,25 +1134,25 @@ ENDFUNC .p2align 4 0: .ifnes "\u", "nil" - lea ecx, [esp + 12] + lea ecx, [SP + 0] .endif mov ebx, \x .ifeqs "\mode", "mont" - lea esi, [esp + 44] + lea esi, [SP + 32] .endif - cysetup esp + 0 + cysetup SP + 96 .ifnes "\u", "nil" mov eax, \u .endif .ifeqs "\mode", "mont" - lea edx, [esp + 76] + lea edx, [SP + 64] .else - lea edx, [esp + 44] + lea edx, [SP + 32] .endif .endm -.macro testtail cyv, n - cystore esp + 0, \cyv, \n +.macro testtail cyv + cystore SP + 96, \cyv, SP + 104 jnz 0b .endm @@ -1152,98 +1164,122 @@ ENDFUNC .endm FUNC(test_dmul4) - testprologue - testldcarry [ebp + 24] - testexpand [ebp + 36], [ebp + 40] - mov edi, [ebp + 20] - testtop [ebp + 28], [ebp + 32] + testprologue [BP + 44] + testldcarry [BP + 24] + testexpand [BP + 36], [BP + 40] + mov edi, [BP + 20] + testtop [BP + 28], [BP + 32] call dmul4 - testtail [ebp + 48], [ebp + 44] - testcarryout [ebp + 24] + testtail [BP + 48] + testcarryout [BP + 24] testepilogue ENDFUNC FUNC(test_dmla4) - testprologue - testldcarry [ebp + 24] - testexpand [ebp + 36], [ebp + 40] - mov edi, [ebp + 20] - testtop [ebp + 28], [ebp + 32] + testprologue [BP + 44] + testldcarry [BP + 24] + testexpand [BP + 36], [BP + 40] + mov edi, [BP + 20] + testtop [BP + 28], [BP + 32] call dmla4 - testtail [ebp + 48], [ebp + 44] - testcarryout [ebp + 24] + testtail [BP + 48] + testcarryout [BP + 24] testepilogue ENDFUNC FUNC(test_mul4) - testprologue - testldcarry [ebp + 24] - testexpand nil, [ebp + 32] - mov edi, [ebp + 20] - testtop nil, [ebp + 28] + testprologue [BP + 36] + testldcarry [BP + 24] + testexpand nil, [BP + 32] + mov edi, [BP + 20] + testtop nil, [BP + 28] call mul4 - testtail [ebp + 40], [ebp + 36] - testcarryout [ebp + 24] + testtail [BP + 40] + testcarryout [BP + 24] + testepilogue +ENDFUNC + +FUNC(test_mul4zc) + testprologue [BP + 36] + testldcarry [BP + 24] + testexpand 
nil, [BP + 32] + mov edi, [BP + 20] + testtop nil, [BP + 28] + call mul4zc + testtail [BP + 40] + testcarryout [BP + 24] testepilogue ENDFUNC FUNC(test_mla4) - testprologue - testldcarry [ebp + 24] - testexpand nil, [ebp + 32] - mov edi, [ebp + 20] - testtop nil, [ebp + 28] + testprologue [BP + 36] + testldcarry [BP + 24] + testexpand nil, [BP + 32] + mov edi, [BP + 20] + testtop nil, [BP + 28] call mla4 - testtail [ebp + 40], [ebp + 36] - testcarryout [ebp + 24] + testtail [BP + 40] + testcarryout [BP + 24] + testepilogue +ENDFUNC + +FUNC(test_mla4zc) + testprologue [BP + 36] + testldcarry [BP + 24] + testexpand nil, [BP + 32] + mov edi, [BP + 20] + testtop nil, [BP + 28] + call mla4zc + testtail [BP + 40] + testcarryout [BP + 24] testepilogue ENDFUNC FUNC(test_mmul4) - testprologue - testexpand [ebp + 40], [ebp + 44] - mov edi, [ebp + 20] - testtop [ebp + 32], [ebp + 36], mont + testprologue [BP + 48] + testexpand [BP + 40], [BP + 44] + mov edi, [BP + 20] + testtop [BP + 32], [BP + 36], mont call mmul4 - testtail [ebp + 52], [ebp + 48] - mov edi, [ebp + 28] - movdqa xmm0, [esp + 76] - movdqa xmm1, [esp + 92] + testtail [BP + 52] + mov edi, [BP + 28] + movdqa xmm0, [SP + 64] + movdqa xmm1, [SP + 80] movdqu [edi], xmm0 movdqu [edi + 16], xmm1 - testcarryout [ebp + 24] + testcarryout [BP + 24] testepilogue ENDFUNC FUNC(test_mmla4) - testprologue - testexpand [ebp + 40], [ebp + 44] - mov edi, [ebp + 20] - testtop [ebp + 32], [ebp + 36], mont + testprologue [BP + 48] + testexpand [BP + 40], [BP + 44] + mov edi, [BP + 20] + testtop [BP + 32], [BP + 36], mont call mmla4 - testtail [ebp + 52], [ebp + 48] - mov edi, [ebp + 28] - movdqa xmm0, [esp + 76] - movdqa xmm1, [esp + 92] + testtail [BP + 52] + mov edi, [BP + 28] + movdqa xmm0, [SP + 64] + movdqa xmm1, [SP + 80] movdqu [edi], xmm0 movdqu [edi + 16], xmm1 - testcarryout [ebp + 24] + testcarryout [BP + 24] testepilogue ENDFUNC FUNC(test_mont4) - testprologue - testexpand nil, [ebp + 36] - mov edi, [ebp + 20] - testtop nil, [ebp + 32], mont + testprologue [BP + 40] + testexpand nil, [BP + 36] + mov edi, [BP + 20] + testtop nil, [BP + 32], mont call mont4 - testtail [ebp + 44], [ebp + 40] - mov edi, [ebp + 28] - movdqa xmm0, [esp + 76] - movdqa xmm1, [esp + 92] + testtail [BP + 44] + mov edi, [BP + 28] + movdqa xmm0, [SP + 64] + movdqa xmm1, [SP + 80] movdqu [edi], xmm0 movdqu [edi + 16], xmm1 - testcarryout [ebp + 24] + testcarryout [BP + 24] testepilogue ENDFUNC
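///--------------------------------------------------------------------------
/// Postscript: a scalar model of the Montgomery routines.

The Montgomery routines above are easiest to follow against a word-serial
model.  The following C sketch is illustrative only: the real code works
four digits per step, keeps its carries in the redundant SSE2 form
described earlier, and takes mi as a pointer to an expanded operand rather
than a bare word.  It shows the multiply and reduction strategies that
mpxmont_mul4_x86_sse2 and mpxmont_redc4_x86_sse2 implement.

#include <stdint.h>
#include <stddef.h>

typedef uint32_t mpw;			/* a 32-bit digit; B = 2^32 */

/* Montgomery multiply: fold in one word of bv at a time, and immediately
 * cancel the bottom word of the accumulator with a multiple of nv, so the
 * running total stays divisible by B.  Here mi = -N^{-1} mod B, and dv has
 * n + 2 words, zeroed on entry.  On exit, dv[0], ..., dv[n] hold
 * av bv / B^n mod N, possibly plus one extra N; the final conditional
 * subtraction is the caller's business. */
static void scalar_mont_mul(mpw *dv, const mpw *av, const mpw *bv,
			    const mpw *nv, size_t n, mpw mi)
{
  size_t i, j;

  for (i = 0; i < n; i++) {
    uint64_t c = 0;
    mpw u = bv[i];

    for (j = 0; j < n; j++) {		/* dv += u av */
      c += dv[j] + (uint64_t)u*av[j];
      dv[j] = (mpw)c; c >>= 32;
    }
    c += dv[n]; dv[n] = (mpw)c; dv[n + 1] += (mpw)(c >> 32);

    mpw y = dv[0]*mi;			/* dv[0] + y nv[0] == 0 (mod B) */
    c = (dv[0] + (uint64_t)y*nv[0]) >> 32;
    for (j = 1; j < n; j++) {		/* dv = (dv + y nv)/B */
      c += dv[j] + (uint64_t)y*nv[j];
      dv[j - 1] = (mpw)c; c >>= 32;
    }
    c += dv[n]; dv[n - 1] = (mpw)c;
    dv[n] = dv[n + 1] + (mpw)(c >> 32); dv[n + 1] = 0;
  }
}

/* Montgomery reduction of the double-length number dv..dvl in place:
 * after the loop, dv[0], ..., dv[n - 1] are zero and the (possibly not
 * fully reduced) result sits above them; again, any final subtraction of
 * N is left to the caller. */
static void scalar_mont_redc(mpw *dv, mpw *dvl, const mpw *nv,
			     size_t n, mpw mi)
{
  size_t i, j;

  for (i = 0; i < n; i++) {
    mpw y = dv[i]*mi;			/* cancel digit i */
    uint64_t c = 0;

    for (j = 0; j < n; j++) {
      c += dv[i + j] + (uint64_t)y*nv[j];
      dv[i + j] = (mpw)c; c >>= 32;
    }
    for (mpw *d = dv + i + n; c && d < dvl; d++)
      { c += *d; *d = (mpw)c; c >>= 32; }	/* propagate the carry */
  }
}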