X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/444083aef7e70ce9afe893a36d72e1a1a976f1ed..d0d41c6ebfbfebca8dbb516a1de4107c82b1bc6b:/math/mpx-mul4-x86-sse2.S diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S index c0e1a788..baf7cc50 100644 --- a/math/mpx-mul4-x86-sse2.S +++ b/math/mpx-mul4-x86-sse2.S @@ -64,7 +64,7 @@ /// 0 v'_0 v'_1 v''_0 v''_1 /// 16 v'_2 v'_3 v''_2 v''_3 /// -/// A `pmuludqd' instruction ignores the odd positions in its operands; thus, +/// A `pmuludq' instruction ignores the odd positions in its operands; thus, /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting /// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can /// multiply such a vector by a full 32-bit scalar to produce two 48-bit @@ -81,7 +81,7 @@ /// the register c0, for example, holds c'_0 (low half) and c''_0 (high /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The -/// `pmuluqdq' instruction acting on a scalar operand (broadcast across all +/// `pmuludq' instruction acting on a scalar operand (broadcast across all /// lanes of its vector) and an operand in the expanded form above produces a /// result which can be added directly to the appropriate carry register. /// Following a pass of four multiplications, we perform some limited carry @@ -93,7 +93,7 @@ ///-------------------------------------------------------------------------- /// Macro definitions. -.macro mulcore r, s, d0, d1, d2, d3 +.macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil // Load a word r_i from R, multiply by the expanded operand [S], and // leave the pieces of the product in registers D0, D1, D2, D3. movd \d0, \r // (r_i, 0, 0, 0) @@ -103,7 +103,7 @@ .ifnes "\d3", "nil" movdqa \d3, [\s + 16] // (s'_2, s'_3, s''_2, s''_3) .endif - pshufd \d0, \d0, 0b11001100 // (r_i, ?, r_i, ?) + pshufd \d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?, r_i, ?) 
.ifnes "\d1", "nil" psrldq \d1, 4 // (s'_1, s''_0, s''_1, 0) .endif @@ -118,22 +118,25 @@ psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0) .endif .ifnes "\d1", "nil" - pmuludqd \d1, \d0 // (r_i s'_1, r_i s''_1) + pmuludq \d1, \d0 // (r_i s'_1, r_i s''_1) .endif .ifnes "\d3", "nil" - pmuludqd \d3, \d0 // (r_i s'_3, r_i s''_3) + pmuludq \d3, \d0 // (r_i s'_3, r_i s''_3) .endif .ifnes "\d2", "nil" .ifnes "\d3", "nil" - pmuludqd \d2, \d0 // (r_i s'_2, r_i s''_2) + pmuludq \d2, \d0 // (r_i s'_2, r_i s''_2) .else - pmuludqd \d2, [\s + 16] + pmuludq \d2, [\s + 16] .endif .endif - pmuludqd \d0, [\s] // (r_i s'_0, r_i s''_0) + pmuludq \d0, [\s] // (r_i s'_0, r_i s''_0) .endm -.macro accum c0, c1, c2, c3 +.macro accum c0, c1=nil, c2=nil, c3=nil + // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding + // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip + // updating that register. paddq \c0, xmm0 .ifnes "\c1", "nil" paddq \c1, xmm1 @@ -146,7 +149,7 @@ .endif .endm -.macro mulacc r, s, c0, c1, c2, c3, z3p +.macro mulacc r, s, c0, c1, c2, c3, z3p=nil // Load a word r_i from R, multiply by the expanded operand [S], // and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t' // then C3 notionally contains zero, but needs clearing; in practice, @@ -155,20 +158,20 @@ // is not `t'. .ifeqs "\z3p", "t" mulcore \r, \s, xmm0, xmm1, xmm2, \c3 - accum \c0, \c1, \c2, nil + accum \c0, \c1, \c2 .else mulcore \r, \s, xmm0, xmm1, xmm2, xmm3 accum \c0, \c1, \c2, \c3 .endif .endm -.macro propout d, c, cc +.macro propout d, c, cc=nil // Calculate an output word from C, and store it in D; propagate // carries out from C to CC in preparation for a rotation of the // carry registers. On completion, XMM3 is clobbered. If CC is // `nil', then the contribution which would have been added to it is // left in C. 
- pshufd xmm3, \c, 0b10111111 // (?, ?, ?, t = c'' mod B) + pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B) psrldq xmm3, 12 // (t, 0, 0, 0) = (t, 0) pslldq xmm3, 2 // (t b, 0) paddq \c, xmm3 // (c' + t b, c'') @@ -192,7 +195,7 @@ psrldq \t, 4 // floor((c' + c'' b)/B) .endm -.macro expand a, b, c, d, z +.macro expand z, a, b, c=nil, d=nil // On entry, A and C hold packed 128-bit values, and Z is zero. On // exit, A:B and C:D together hold the same values in expanded // form. If C is `nil', then only expand A to A:B. @@ -206,19 +209,19 @@ punpcklwd \c, \z // (c'_0, c''_0, c'_1, c''_1) punpckhwd \d, \z // (c'_2, c''_2, c'_3, c''_3) .endif - pshufd \a, \a, 0b11011000 // (a'_0, a'_1, a''_0, a''_1) - pshufd \b, \b, 0b11011000 // (a'_2, a'_3, a''_2, a''_3) + pshufd \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1) + pshufd \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3) .ifnes "\c", "nil" - pshufd \c, \c, 0b11011000 // (c'_0, c'_1, c''_0, c''_1) - pshufd \d, \d, 0b11011000 // (c'_2, c'_3, c''_2, c''_3) + pshufd \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1) + pshufd \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3) .endif .endm -.macro squash c0, c1, c2, c3, h, t, u +.macro squash c0, c1, c2, c3, t, u, lo, hi=nil // On entry, C0, C1, C2, C3 are carry registers representing a value - // Y. On exit, C0 holds the low 128 bits of the carry value; C1, C2, + // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2, // C3, T, and U are clobbered; and the high bits of Y are stored in - // H, if this is not `nil'. + // HI, if this is not `nil'. 
// The first step is to eliminate the `double-prime' pieces -- i.e., // the ones offset by 16 bytes from a 32-bit boundary -- by carrying @@ -241,8 +244,8 @@ psrlq \c1, 16 // high parts of (y''_1, y''_3) psrlq \c2, 32 // low parts of (y''_0, y''_2) psrlq \c3, 32 // low parts of (y''_1, y''_3) - .ifnes "\h", "nil" - movdqa \h, \c1 + .ifnes "\hi", "nil" + movdqa \hi, \c1 .endif pslldq \c1, 8 // high part of (0, y''_1) @@ -250,44 +253,36 @@ paddq \u, \c3 paddq \t, \c1 // and up: (y_0, y_2) paddq \u, \c0 // (y_1, y_3) - .ifnes "\h", "nil" - psrldq \h, 8 // high part of (y''_3, 0) + .ifnes "\hi", "nil" + psrldq \hi, 8 // high part of (y''_3, 0) .endif // Finally extract the answer. This complicated dance is better than // storing to memory and loading, because the piecemeal stores // inhibit store forwarding. movdqa \c3, \t // (y_0, y_1) - movdqa \c0, \t // (y^*_0, ?, ?, ?) + movdqa \lo, \t // (y^*_0, ?, ?, ?) psrldq \t, 8 // (y_2, 0) psrlq \c3, 32 // (floor(y_0/B), ?) paddq \c3, \u // (y_1 + floor(y_0/B), ?) - pslldq \c0, 12 // (0, 0, 0, y^*_0) movdqa \c1, \c3 // (y^*_1, ?, ?, ?) psrldq \u, 8 // (y_3, 0) psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2, ?) paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2, ?) - pslldq \c1, 12 // (0, 0, 0, y^*_1) - psrldq \c0, 12 // (y^*_0, 0, 0, 0) - movdqa \c2, \c3 // (y^*_2, ?, ?, ?) + punpckldq \lo, \c3 // (y^*_0, y^*_2, ?, ?) psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?) paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?) - pslldq \c2, 12 // (0, 0, 0, y^*_2) - psrldq \c1, 8 // (0, y^*_1, 0, 0) - psrldq \c2, 4 // (0, 0, y^*_2, 0) - .ifnes "\h", "nil" - movdqu \t, \c3 + .ifnes "\hi", "nil" + movdqa \t, \c3 pxor \u, \u .endif - pslldq \c3, 12 // (0, 0, 0, y^*_3) - por \c0, \c1 // (y^*_0, y^*_1, 0, 0) - por \c2, \c3 // (0, 0, y^*_2, y^*_3) - por \c0, \c2 // y mod B^4 - .ifnes "\h", "nil" + punpckldq \c1, \c3 // (y^*_1, y^*_3, ?, ?) 
+ .ifnes "\hi", "nil" psrlq \t, 32 // very high bits of y - paddq \h, \t - punpcklqdq \h, \u // carry up + paddq \hi, \t + punpcklqdq \hi, \u // carry up .endif + punpckldq \lo, \c1 // y mod B^4 .endm .macro carryadd @@ -302,6 +297,7 @@ movd xmm1, [edi + 4] // (a_1, 0) movd xmm2, [edi + 8] // (a_2, 0) movd xmm7, [edi + 12] // (a_3, 0) + paddq xmm4, xmm0 // (c'_0 + a_0, c''_0) paddq xmm5, xmm1 // (c'_1 + a_1, c''_1) paddq xmm6, xmm2 // (c'_2 + a_2, c''_2 + a_3 b) @@ -310,20 +306,22 @@ ///-------------------------------------------------------------------------- /// Primitive multipliers and related utilities. - .p2align 4 -carryprop: +INTFUNC(carryprop) // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded // form. Store the low 128 bits of the represented carry to [EDI] as // a packed 128-bit value, and leave the remaining 16 bits in the low // 32 bits of XMM4. On exit, XMM3, XMM5 and XMM6 are clobbered. + endprologue + propout [edi + 0], xmm4, xmm5 propout [edi + 4], xmm5, xmm6 propout [edi + 8], xmm6, nil endprop [edi + 12], xmm6, xmm4 ret - .p2align 4 -dmul4: +ENDFUNC + +INTFUNC(dmul4) // On entry, EDI points to the destination buffer; EAX and EBX point // to the packed operands U and X; ECX and EDX point to the expanded // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry @@ -333,26 +331,29 @@ dmul4: // [EDI], and update the carry registers with the carry out. The // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the // general-purpose registers are preserved. 
+ endprologue + mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t - mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 propout [edi + 0], xmm4, xmm5 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t - mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil + mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4 propout [edi + 4], xmm5, xmm6 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t - mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil + mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5 propout [edi + 8], xmm6, xmm7 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t - mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil + mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6 propout [edi + 12], xmm7, xmm4 ret - .p2align 4 -dmla4: +ENDFUNC + +INTFUNC(dmla4) // On entry, EDI points to the destination buffer, which also // contains an addend A to accumulate; EAX and EBX point to the // packed operands U and X; ECX and EDX point to the expanded @@ -364,28 +365,31 @@ dmla4: // [EDI], and update the carry registers with the carry out. The // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the // general-purpose registers are preserved. 
+ endprologue + carryadd - mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil - mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7 + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 propout [edi + 0], xmm4, xmm5 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t - mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil + mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4 propout [edi + 4], xmm5, xmm6 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t - mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil + mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5 propout [edi + 8], xmm6, xmm7 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t - mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil + mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6 propout [edi + 12], xmm7, xmm4 ret - .p2align 4 -mul4zc: +ENDFUNC + +INTFUNC(mul4zc) // On entry, EDI points to the destination buffer; EBX points to a // packed operand X; and EDX points to an expanded operand Y. // @@ -393,6 +397,8 @@ mul4zc: // and set the carry registers XMM4, XMM5, XMM6 to the carry out. // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the // general-purpose registers are preserved. + endprologue + mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 propout [edi + 0], xmm4, xmm5 @@ -407,8 +413,9 @@ mul4zc: ret - .p2align 4 -mul4: +ENDFUNC + +INTFUNC(mul4) // On entry, EDI points to the destination buffer; EBX points to a // packed operand X; EDX points to an expanded operand Y; and XMM4, // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2, @@ -418,6 +425,8 @@ mul4: // and update the carry registers with the carry out. The registers // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the // general-purpose registers are preserved. 
+ endprologue + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t propout [edi + 0], xmm4, xmm5 @@ -432,8 +441,9 @@ mul4: ret - .p2align 4 -mla4zc: +ENDFUNC + +INTFUNC(mla4zc) // On entry, EDI points to the destination buffer, which also // contains an addend A to accumulate; EBX points to a packed operand // X; and EDX points to an expanded operand Y. @@ -442,12 +452,14 @@ mla4zc: // and set the carry registers XMM4, XMM5, XMM6 to the carry out. // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the // general-purpose registers are preserved. + endprologue + movd xmm4, [edi + 0] movd xmm5, [edi + 4] movd xmm6, [edi + 8] movd xmm7, [edi + 12] - mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 propout [edi + 0], xmm4, xmm5 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t @@ -461,8 +473,9 @@ mla4zc: ret - .p2align 4 -mla4: +ENDFUNC + +INTFUNC(mla4) // On entry, EDI points to the destination buffer, which also // contains an addend A to accumulate; EBX points to a packed operand // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold @@ -473,9 +486,11 @@ mla4: // [EDI], and update the carry registers with the carry out. The // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the // general-purpose registers are preserved. + endprologue + carryadd - mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 propout [edi + 0], xmm4, xmm5 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t @@ -489,20 +504,21 @@ mla4: ret - .p2align 4 -mmul4: +ENDFUNC + +INTFUNC(mmul4) // On entry, EDI points to the destination buffer; EAX and EBX point // to the packed operands U and N; ECX and ESI point to the expanded // operands V and M; and EDX points to a place to store an expanded // result Y (32 bytes, at a 16-byte boundary). The stack pointer - // must be 16-byte aligned. (This is not the usual convention, which - // requires alignment before the call.) 
+ // must be 12 modulo 16, as is usual for modern x86 ABIs. // // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits // of the sum U V + N Y to [EDI], leaving the remaining carry in // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and // XMM7 are clobbered; the general-purpose registers are preserved. - sub esp, 64 // space for the carries + stalloc 48 + 12 // space for the carries + endprologue // Calculate W = U V, and leave it in the destination. Stash the // carry pieces for later. @@ -510,27 +526,32 @@ mmul4: propout [edi + 0], xmm4, xmm5 jmp 5f - .p2align 4 -mmla4: +ENDFUNC + +INTFUNC(mmla4) // On entry, EDI points to the destination buffer, which also - // contains an addend A to accumulate; EAX and EBX point - // to the packed operands U and N; ECX and ESI point to the expanded + // contains an addend A to accumulate; EAX and EBX point to the + // packed operands U and N; ECX and ESI point to the expanded // operands V and M; and EDX points to a place to store an expanded // result Y (32 bytes, at a 16-byte boundary). The stack pointer - // must be 16-byte aligned. (This is not the usual convention, which - // requires alignment before the call.) + // must be 12 modulo 16, as is usual for modern x86 ABIs. // // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128 // bits of the sum A + U V + N Y to [EDI], leaving the remaining // carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, // XMM3, and XMM7 are clobbered; the general-purpose registers are // preserved. - sub esp, 64 // space for the carries + stalloc 48 + 12 // space for the carries + endprologue + movd xmm4, [edi + 0] movd xmm5, [edi + 4] movd xmm6, [edi + 8] movd xmm7, [edi + 12] - mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil + + // Calculate W = A + U V, and leave it in the destination. Stash the + // carry pieces for later. 
+ mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7 propout [edi + 0], xmm4, xmm5 5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t @@ -549,21 +570,21 @@ mmla4: // Calculate Y = W M. mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7 - mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil - accum xmm5, xmm6, xmm7, nil + mulcore [edi + 4], esi, xmm0, xmm1, xmm2 + accum xmm5, xmm6, xmm7 - mulcore [edi + 8], esi, xmm0, xmm1, nil, nil - accum xmm6, xmm7, nil, nil + mulcore [edi + 8], esi, xmm0, xmm1 + accum xmm6, xmm7 - mulcore [edi + 12], esi, xmm0, nil, nil, nil - accum xmm7, nil, nil, nil + mulcore [edi + 12], esi, xmm0 + accum xmm7 // That's lots of pieces. Now we have to assemble the answer. - squash xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1 + squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4 // Expand it. pxor xmm2, xmm2 - expand xmm4, xmm1, nil, nil, xmm2 + expand xmm2, xmm4, xmm1 movdqa [edx + 0], xmm4 movdqa [edx + 16], xmm1 @@ -574,7 +595,7 @@ mmla4: movd xmm7, [edi + 12] // Finish the calculation by adding the Montgomery product. - mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 propout [edi + 0], xmm4, xmm5 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t @@ -592,13 +613,14 @@ mmla4: paddq xmm6, [esp + 32] // And, with that, we're done. - add esp, 64 + stfree 48 + 12 ret - .p2align 4 -mont4: +ENDFUNC + +INTFUNC(mont4) // On entry, EDI points to the destination buffer holding a packed - // value A; EBX points to a packed operand N; ESI points to an + // value W; EBX points to a packed operand N; ESI points to an // expanded operand M; and EDX points to a place to store an expanded // result Y (32 bytes, at a 16-byte boundary). // @@ -606,25 +628,26 @@ mont4: // of the sum W + N Y to [EDI], leaving the remaining carry in // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and // XMM7 are clobbered; the general-purpose registers are preserved. + endprologue // Calculate Y = W M. 
mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7 - mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil - accum xmm5, xmm6, xmm7, nil + mulcore [edi + 4], esi, xmm0, xmm1, xmm2 + accum xmm5, xmm6, xmm7 - mulcore [edi + 8], esi, xmm0, xmm1, nil, nil - accum xmm6, xmm7, nil, nil + mulcore [edi + 8], esi, xmm0, xmm1 + accum xmm6, xmm7 - mulcore [edi + 12], esi, xmm0, nil, nil, nil - accum xmm7, nil, nil, nil + mulcore [edi + 12], esi, xmm0 + accum xmm7 // That's lots of pieces. Now we have to assemble the answer. - squash xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1 + squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4 // Expand it. pxor xmm2, xmm2 - expand xmm4, xmm1, nil, nil, xmm2 + expand xmm2, xmm4, xmm1 movdqa [edx + 0], xmm4 movdqa [edx + 16], xmm1 @@ -635,7 +658,7 @@ mont4: movd xmm7, [edi + 12] // Finish the calculation by adding the Montgomery product. - mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 propout [edi + 0], xmm4, xmm5 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t @@ -650,6 +673,8 @@ mont4: // And, with that, we're done. ret +ENDFUNC + ///-------------------------------------------------------------------------- /// Bulk multipliers. @@ -670,13 +695,14 @@ FUNC(mpx_umul4_x86_sse2) // // esp + 0 expanded Y (32 bytes) // esp + 32 (top of locals) - push ebp - push ebx - push esi - push edi - mov ebp, esp + pushreg ebp + pushreg ebx + pushreg esi + pushreg edi + setfp ebp and esp, ~15 sub esp, 32 + endprologue // Prepare for the first iteration. 
mov esi, [ebp + 32] // -> bv[0] @@ -684,7 +710,7 @@ FUNC(mpx_umul4_x86_sse2) movdqu xmm0, [esi] // bv[0] mov edi, [ebp + 20] // -> dv[0] mov ecx, edi // outer loop dv cursor - expand xmm0, xmm1, nil, nil, xmm7 + expand xmm7, xmm0, xmm1 mov ebx, [ebp + 24] // -> av[0] mov eax, [ebp + 28] // -> av[m] = av limit mov edx, esp // -> expanded Y = bv[0] @@ -716,7 +742,7 @@ FUNC(mpx_umul4_x86_sse2) 1: movdqu xmm0, [esi] // bv[i] mov edi, ecx // -> dv[i] pxor xmm7, xmm7 - expand xmm0, xmm1, nil, nil, xmm7 + expand xmm7, xmm0, xmm1 mov ebx, [ebp + 24] // -> av[0] movdqa [esp + 0], xmm0 // bv[i] expanded low movdqa [esp + 16], xmm1 // bv[i] expanded high @@ -743,7 +769,7 @@ FUNC(mpx_umul4_x86_sse2) jb 1b // All over. -9: mov esp, ebp +9: dropfp pop edi pop esi pop ebx @@ -766,24 +792,24 @@ FUNC(mpxmont_mul4_x86_sse2) // ebp + 36 n (nonzero multiple of 4) // ebp + 40 mi // - // Locals are relative to ESP, which is 4 mod 16, as follows. + // Locals are relative to ESP, which is 16-byte aligned, as follows. // - // esp + 0 outer loop dv - // esp + 4 outer loop bv - // esp + 8 av limit (mostly in ESI) - // esp + 12 expanded V (32 bytes) - // esp + 44 expanded M (32 bytes) - // esp + 76 expanded Y (32 bytes) + // esp + 0 expanded V (32 bytes) + // esp + 32 expanded M (32 bytes) + // esp + 64 expanded Y (32 bytes) + // esp + 96 outer loop dv + // esp + 100 outer loop bv + // esp + 104 av limit (mostly in ESI) // esp + 108 bv limit - // esp + 112 (gap) - // esp + 124 (top of locals) - push ebp - push ebx - push esi - push edi - mov ebp, esp + // esp + 112 (top of locals) + pushreg ebp + pushreg ebx + pushreg esi + pushreg edi + setfp ebp and esp, ~15 - sub esp, 124 + sub esp, 112 + endprologue // Establish the expanded operands. 
pxor xmm7, xmm7 @@ -791,34 +817,34 @@ FUNC(mpxmont_mul4_x86_sse2) mov edx, [ebp + 40] // -> mi movdqu xmm0, [ecx] // bv[0] movdqu xmm2, [edx] // mi - expand xmm0, xmm1, xmm2, xmm3, xmm7 - movdqa [esp + 12], xmm0 // bv[0] expanded low - movdqa [esp + 28], xmm1 // bv[0] expanded high - movdqa [esp + 44], xmm2 // mi expanded low - movdqa [esp + 60], xmm3 // mi expanded high + expand xmm7, xmm0, xmm1, xmm2, xmm3 + movdqa [esp + 0], xmm0 // bv[0] expanded low + movdqa [esp + 16], xmm1 // bv[0] expanded high + movdqa [esp + 32], xmm2 // mi expanded low + movdqa [esp + 48], xmm3 // mi expanded high // Set up the outer loop state and prepare for the first iteration. mov edx, [ebp + 36] // n mov eax, [ebp + 24] // -> U = av[0] mov ebx, [ebp + 32] // -> X = nv[0] mov edi, [ebp + 20] // -> Z = dv[0] - mov [esp + 4], ecx + mov [esp + 100], ecx lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit lea edx, [eax + 4*edx] // -> av[n/4] = av limit - mov [esp + 0], edi + mov [esp + 96], edi + mov [esp + 104], edx mov [esp + 108], ecx - mov [esp + 8], edx - lea ecx, [esp + 12] // -> expanded V = bv[0] - lea esi, [esp + 44] // -> expanded M = mi - lea edx, [esp + 76] // -> space for Y + lea ecx, [esp + 0] // -> expanded V = bv[0] + lea esi, [esp + 32] // -> expanded M = mi + lea edx, [esp + 64] // -> space for Y call mmul4 - mov esi, [esp + 8] // recover av limit + mov esi, [esp + 104] // recover av limit add edi, 16 add eax, 16 add ebx, 16 cmp eax, esi // done already? jae 8f - mov [esp + 0], edi + mov [esp + 96], edi .p2align 4 // Complete the first inner loop. @@ -837,26 +863,26 @@ FUNC(mpxmont_mul4_x86_sse2) // Embark on the next iteration. (There must be one. If n = 1, then // we would have bailed above, to label 8. Similarly, the subsequent // iterations can fall into the inner loop immediately.) 
-1: mov eax, [esp + 4] // -> bv[i - 1] - mov edi, [esp + 0] // -> Z = dv[i] +1: mov eax, [esp + 100] // -> bv[i - 1] + mov edi, [esp + 96] // -> Z = dv[i] add eax, 16 // -> bv[i] pxor xmm7, xmm7 - movdqu xmm0, [eax] // bv[i] - mov [esp + 4], eax + mov [esp + 100], eax cmp eax, [esp + 108] // done yet? jae 9f + movdqu xmm0, [eax] // bv[i] mov ebx, [ebp + 32] // -> X = nv[0] - lea esi, [esp + 44] // -> expanded M = mi + lea esi, [esp + 32] // -> expanded M = mi mov eax, [ebp + 24] // -> U = av[0] - expand xmm0, xmm1, nil, nil, xmm7 - movdqa [esp + 12], xmm0 // bv[i] expanded low - movdqa [esp + 28], xmm1 // bv[i] expanded high + expand xmm7, xmm0, xmm1 + movdqa [esp + 0], xmm0 // bv[i] expanded low + movdqa [esp + 16], xmm1 // bv[i] expanded high call mmla4 - mov esi, [esp + 8] // recover av limit + mov esi, [esp + 104] // recover av limit add edi, 16 add eax, 16 add ebx, 16 - mov [esp + 0], edi + mov [esp + 96], edi .p2align 4 // Complete the next inner loop. @@ -884,11 +910,11 @@ FUNC(mpxmont_mul4_x86_sse2) movd [edi + 16], xmm4 // All done. -9: mov esp, ebp - pop edi - pop esi - pop ebx - pop ebp +9: dropfp + popreg edi + popreg esi + popreg ebx + popreg ebp ret ENDFUNC @@ -914,13 +940,14 @@ FUNC(mpxmont_redc4_x86_sse2) // esp + 12 expanded M (32 bytes) // esp + 44 expanded Y (32 bytes) // esp + 76 (top of locals) - push ebp - push ebx - push esi - push edi - mov ebp, esp + pushreg ebp + pushreg ebx + pushreg esi + pushreg edi + setfp ebp and esp, ~15 sub esp, 76 + endprologue // Establish the expanded operands and the blocks-of-4 dv limit. 
mov edi, [ebp + 20] // -> Z = dv[0] @@ -930,7 +957,7 @@ FUNC(mpxmont_redc4_x86_sse2) mov edx, [ebp + 36] // -> mi movdqu xmm0, [edx] // mi and eax, ~15 // mask off the tail end - expand xmm0, xmm1, nil, nil, xmm7 + expand xmm7, xmm0, xmm1 add eax, edi // find limit movdqa [esp + 12], xmm0 // mi expanded low movdqa [esp + 28], xmm1 // mi expanded high @@ -946,8 +973,8 @@ FUNC(mpxmont_redc4_x86_sse2) lea esi, [esp + 12] // -> expanded M = mi lea edx, [esp + 44] // -> space for Y call mont4 - add edi, 16 add ebx, 16 + add edi, 16 cmp ebx, ecx // done already? jae 8f @@ -1009,11 +1036,11 @@ FUNC(mpxmont_redc4_x86_sse2) jmp 5b // All over. -9: mov esp, ebp - pop edi - pop esi - pop ebx - pop ebp +9: dropfp + popreg edi + popreg esi + popreg ebx + popreg ebp ret ENDFUNC @@ -1041,27 +1068,31 @@ ENDFUNC mov [ebx + ecx*8 + 4], edx .endm -.macro testprologue - push ebp - push ebx - push esi - push edi - mov ebp, esp +.macro testprologue n + pushreg ebp + pushreg ebx + pushreg esi + pushreg edi + setfp ebp and esp, ~15 - sub esp, 3*32 + 12 + sub esp, 3*32 + 4*4 + endprologue + mov eax, \n + mov [esp + 104], eax // vars: - // esp + 0 = cycles - // esp + 12 = v expanded - // esp + 44 = y expanded - // esp + 72 = ? expanded + // esp + 0 = v expanded + // esp + 32 = y expanded + // esp + 64 = ? 
expanded + // esp + 96 = cycles + // esp + 104 = count .endm .macro testepilogue - mov esp, ebp - pop edi - pop esi - pop ebx - pop ebp + dropfp + popreg edi + popreg esi + popreg ebx + popreg ebp ret .endm @@ -1072,47 +1103,47 @@ ENDFUNC movdqu xmm6, [ecx + 32] // (c'_2, c''_2) .endm -.macro testexpand v, y +.macro testexpand v=nil, y=nil pxor xmm7, xmm7 .ifnes "\v", "nil" mov ecx, \v movdqu xmm0, [ecx] - expand xmm0, xmm1, nil, nil, xmm7 - movdqa [esp + 12], xmm0 - movdqa [esp + 28], xmm1 + expand xmm7, xmm0, xmm1 + movdqa [esp + 0], xmm0 + movdqa [esp + 16], xmm1 .endif .ifnes "\y", "nil" mov edx, \y movdqu xmm2, [edx] - expand xmm2, xmm3, nil, nil, xmm7 - movdqa [esp + 44], xmm2 - movdqa [esp + 60], xmm3 + expand xmm7, xmm2, xmm3 + movdqa [esp + 32], xmm2 + movdqa [esp + 48], xmm3 .endif .endm -.macro testtop u, x, mode +.macro testtop u=nil, x=nil, mode=nil .p2align 4 0: .ifnes "\u", "nil" - lea ecx, [esp + 12] + lea ecx, [esp + 0] .endif mov ebx, \x .ifeqs "\mode", "mont" - lea esi, [esp + 44] + lea esi, [esp + 32] .endif - cysetup esp + 0 + cysetup esp + 96 .ifnes "\u", "nil" mov eax, \u .endif .ifeqs "\mode", "mont" - lea edx, [esp + 76] + lea edx, [esp + 64] .else - lea edx, [esp + 44] + lea edx, [esp + 32] .endif .endm -.macro testtail cyv, n - cystore esp + 0, \cyv, \n +.macro testtail cyv + cystore esp + 96, \cyv, esp + 104 jnz 0b .endm @@ -1123,101 +1154,125 @@ ENDFUNC movdqu [ecx + 32], xmm6 .endm - .globl test_dmul4 -test_dmul4: - testprologue +FUNC(test_dmul4) + testprologue [ebp + 44] testldcarry [ebp + 24] testexpand [ebp + 36], [ebp + 40] mov edi, [ebp + 20] testtop [ebp + 28], [ebp + 32] call dmul4 - testtail [ebp + 48], [ebp + 44] + testtail [ebp + 48] testcarryout [ebp + 24] testepilogue +ENDFUNC - .globl test_dmla4 -test_dmla4: - testprologue +FUNC(test_dmla4) + testprologue [ebp + 44] testldcarry [ebp + 24] testexpand [ebp + 36], [ebp + 40] mov edi, [ebp + 20] testtop [ebp + 28], [ebp + 32] call dmla4 - testtail [ebp + 48], [ebp + 44] + 
testtail [ebp + 48] testcarryout [ebp + 24] testepilogue +ENDFUNC - .globl test_mul4 -test_mul4: - testprologue +FUNC(test_mul4) + testprologue [ebp + 36] testldcarry [ebp + 24] testexpand nil, [ebp + 32] mov edi, [ebp + 20] testtop nil, [ebp + 28] call mul4 - testtail [ebp + 40], [ebp + 36] + testtail [ebp + 40] testcarryout [ebp + 24] testepilogue +ENDFUNC - .globl test_mla4 -test_mla4: - testprologue +FUNC(test_mul4zc) + testprologue [ebp + 36] + testldcarry [ebp + 24] + testexpand nil, [ebp + 32] + mov edi, [ebp + 20] + testtop nil, [ebp + 28] + call mul4zc + testtail [ebp + 40] + testcarryout [ebp + 24] + testepilogue +ENDFUNC + +FUNC(test_mla4) + testprologue [ebp + 36] testldcarry [ebp + 24] testexpand nil, [ebp + 32] mov edi, [ebp + 20] testtop nil, [ebp + 28] call mla4 - testtail [ebp + 40], [ebp + 36] + testtail [ebp + 40] + testcarryout [ebp + 24] + testepilogue +ENDFUNC + +FUNC(test_mla4zc) + testprologue [ebp + 36] + testldcarry [ebp + 24] + testexpand nil, [ebp + 32] + mov edi, [ebp + 20] + testtop nil, [ebp + 28] + call mla4zc + testtail [ebp + 40] testcarryout [ebp + 24] testepilogue +ENDFUNC - .globl test_mmul4 -test_mmul4: - testprologue +FUNC(test_mmul4) + testprologue [ebp + 48] testexpand [ebp + 40], [ebp + 44] mov edi, [ebp + 20] testtop [ebp + 32], [ebp + 36], mont call mmul4 - testtail [ebp + 52], [ebp + 48] + testtail [ebp + 52] mov edi, [ebp + 28] - movdqa xmm0, [esp + 76] - movdqa xmm1, [esp + 92] + movdqa xmm0, [esp + 64] + movdqa xmm1, [esp + 80] movdqu [edi], xmm0 movdqu [edi + 16], xmm1 testcarryout [ebp + 24] testepilogue +ENDFUNC - .globl test_mmla4 -test_mmla4: - testprologue +FUNC(test_mmla4) + testprologue [ebp + 48] testexpand [ebp + 40], [ebp + 44] mov edi, [ebp + 20] testtop [ebp + 32], [ebp + 36], mont call mmla4 - testtail [ebp + 52], [ebp + 48] + testtail [ebp + 52] mov edi, [ebp + 28] - movdqa xmm0, [esp + 76] - movdqa xmm1, [esp + 92] + movdqa xmm0, [esp + 64] + movdqa xmm1, [esp + 80] movdqu [edi], xmm0 movdqu [edi + 
16], xmm1 testcarryout [ebp + 24] testepilogue +ENDFUNC - .globl test_mont4 -test_mont4: - testprologue +FUNC(test_mont4) + testprologue [ebp + 40] testexpand nil, [ebp + 36] mov edi, [ebp + 20] testtop nil, [ebp + 32], mont call mont4 - testtail [ebp + 44], [ebp + 40] + testtail [ebp + 44] mov edi, [ebp + 28] - movdqa xmm0, [esp + 76] - movdqa xmm1, [esp + 92] + movdqa xmm0, [esp + 64] + movdqa xmm1, [esp + 80] movdqu [edi], xmm0 movdqu [edi + 16], xmm1 testcarryout [ebp + 24] testepilogue +ENDFUNC #endif