X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/a1a9ee0a7240087e202a7855e470573de0e59c09..HEAD:/math/mpx-mul4-x86-sse2.S diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S index 2f7b5ec9..0964de9f 100644 --- a/math/mpx-mul4-x86-sse2.S +++ b/math/mpx-mul4-x86-sse2.S @@ -40,11 +40,11 @@ /// construct more general variable-length multipliers. /// /// The basic trick is the same throughout. In an operand-scanning -/// multiplication, the inner multiplication loop multiplies a -/// multiple-precision operand by a single precision factor, and adds the -/// result, appropriately shifted, to the result. A `finely integrated -/// operand scanning' implementation of Montgomery multiplication also adds -/// the product of a single-precision `Montgomery factor' and the modulus, +/// multiplication, the inner multiplication loop multiplies a multiple- +/// precision operand by a single precision factor, and adds the result, +/// appropriately shifted, to the result. A `finely integrated operand +/// scanning' implementation of Montgomery multiplication also adds the +/// product of a single-precision `Montgomery factor' and the modulus, /// calculated in the same pass. The more common `coarsely integrated /// operand scanning' alternates main multiplication and Montgomery passes, /// which requires additional carry propagation. @@ -58,9 +58,9 @@ /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE /// operands, as follows. /// -/// Offset 0 4 8 12 -/// 0 v'_0 v'_1 v''_0 v''_1 -/// 16 v'_2 v'_3 v''_2 v''_3 +/// Offset 12 8 4 0 +/// 0 v''_1 v''_0 v'_1 v'_0 +/// 16 v''_3 v''_2 v'_3 v'_2 /// /// A `pmuludq' instruction ignores the odd positions in its operands; thus, /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting @@ -70,23 +70,64 @@ /// many products together before we must deal with carrying; it also allows /// for some calculations to be performed on the above expanded form. /// +/// We maintain four `carry' registers XMM4--XMM7 accumulating intermediate +/// results. The registers' precise roles rotate during the computation; we +/// name them `c0', `c1', `c2', and `c3'. Each carry register holds two +/// 64-bit halves: the register c0, for example, holds c'_0 (low half) and +/// c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the +/// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2 + +/// c_3 B^3. The `pmuluqd' instruction acting on a scalar operand (broadcast +/// across all lanes of its vector) and an operand in the expanded form above +/// produces a result which can be added directly to the appropriate carry +/// register. Following a pass of four multiplications, we perform some +/// limited carry propagation: let t = c''_0 mod B, and let d = c'_0 + t b; +/// then we output z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and +/// cycle the carry registers around, so that c1 becomes c0, and the old +/// (implicitly) zeroed c0 becomes c3. +/// /// On 32-bit x86, we are register starved: the expanded operands are kept in -/// memory, typically in warm L1 cache. +/// memory, typically in warm L1 cache. The packed operands are read from +/// memory into working registers XMM0--XMM3 and processed immediately. +/// The following conventional argument names and locations are used +/// throughout. 
+/// +/// Arg Format Location Notes +/// +/// U packed [EAX] +/// X packed [EBX] In Montgomery multiplication, X = N +/// V expanded [ECX] +/// Y expanded [EDX] In Montgomery multiplication, Y = (A + U V) M +/// M expanded [ESI] -N^{-1} (mod B^4) +/// N Modulus, for Montgomery multiplication +/// A packed [EDI] Destination/accumulator +/// C carry XMM4--XMM7 +/// +/// The calculation is some variant of +/// +/// A' + C' B^4 <- U V + X Y + A + C +/// +/// The low-level functions fit into a fairly traditional (finely-integrated) +/// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y) +/// (indexed by i). /// -/// We maintain four `carry' registers accumulating intermediate results. -/// The registers' precise roles rotate during the computation; we name them -/// `c0', `c1', `c2', and `c3'. Each carry register holds two 64-bit halves: -/// the register c0, for example, holds c'_0 (low half) and c''_0 (high -/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers -/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The -/// `pmuluqd' instruction acting on a scalar operand (broadcast across all -/// lanes of its vector) and an operand in the expanded form above produces a -/// result which can be added directly to the appropriate carry register. -/// Following a pass of four multiplications, we perform some limited carry -/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output -/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry -/// registers around, so that c1 becomes c0, and the old c0 is (implicitly) -/// zeroed becomes c3. +/// The variants are as follows. +/// +/// Function Variant Use i j +/// +/// mmul4 A = C = 0 Montgomery 0 0 +/// dmul4 A = 0 Montgomery 0 + +/// mmla4 C = 0 Montgomery + 0 +/// dmla4 exactly as shown Montgomery + + +/// mont4 U = V = C = 0 Montgomery any 0 +/// +/// mul4zc U = V = A = C = 0 Plain 0 0 +/// mul4 U = V = A = 0 Plain 0 + +/// mla4zc U = V = C = 0 Plain + 0 +/// mla4 U = V = 0 Plain + + +/// +/// The `mmul4' and `mmla4' functions are also responsible for calculating +/// the Montgomery reduction factor Y = (A + U V) M used by the rest of the +/// inner loop. ///-------------------------------------------------------------------------- /// Macro definitions. @@ -94,41 +135,41 @@ .macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil // Load a word r_i from R, multiply by the expanded operand [S], and // leave the pieces of the product in registers D0, D1, D2, D3. - movd \d0, \r // (r_i, 0; 0, 0) + movd \d0, \r // (0, 0; 0, r_i) .ifnes "\d1", "nil" - movdqa \d1, [\s] // (s'_0, s'_1; s''_0, s''_1) + movdqa \d1, [\s] // (s''_1, s''_0; s'_1, s'_0) .endif .ifnes "\d3", "nil" - movdqa \d3, [\s + 16] // (s'_2, s'_3; s''_2, s''_3) + movdqa \d3, [\s + 16] // (s''_3, s''_2; s'_3, s'_2) .endif - pshufd \d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?) + pshufd \d0, \d0, SHUF(3, 0, 3, 0) // (?, r_i; ?, r_i) .ifnes "\d1", "nil" - psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0) + psrldq \d1, 4 // (0, s''_1; s''_0, s'_1) .endif .ifnes "\d2", "nil" .ifnes "\d3", "nil" movdqa \d2, \d3 // another copy of (s'_2, s'_3; ...) .else - movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?) 
+ movdqa \d2, \d0 // another copy of (?, r_i; ?, r_i) .endif .endif .ifnes "\d3", "nil" - psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0) + psrldq \d3, 4 // (0, s''_3; s''_2, s'_3) .endif .ifnes "\d1", "nil" - pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1) + pmuludq \d1, \d0 // (r_i s''_1; r_i s'_1) .endif .ifnes "\d3", "nil" - pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3) + pmuludq \d3, \d0 // (r_i s''_3; r_i s'_3) .endif .ifnes "\d2", "nil" .ifnes "\d3", "nil" - pmuludq \d2, \d0 // (r_i s'_2; r_i s''_2) + pmuludq \d2, \d0 // (r_i s''_2; r_i s'_2) .else pmuludq \d2, [\s + 16] .endif .endif - pmuludq \d0, [\s] // (r_i s'_0; r_i s''_0) + pmuludq \d0, [\s] // (r_i s''_0; r_i s'_0) .endm .macro accum c0, c1=nil, c2=nil, c3=nil @@ -169,10 +210,10 @@ // carry registers. On completion, XMM3 is clobbered. If CC is // `nil', then the contribution which would have been added to it is // left in C. - pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B) - psrldq xmm3, 12 // (t, 0; 0, 0) = (t, 0) - pslldq xmm3, 2 // (t b; 0) - paddq \c, xmm3 // (c' + t b; c'') + pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (t = c'' mod B, ?; ?, ?) + psrldq xmm3, 12 // (0, 0; 0, t) = (0; t) + pslldq xmm3, 2 // (0; t b) + paddq \c, xmm3 // (c''; c' + t b) movd \d, \c psrlq \c, 32 // floor(c/B) .ifnes "\cc", "nil" @@ -185,10 +226,10 @@ // of the value represented in C are written to D, and the remaining // bits are left at the bottom of T. movdqa \t, \c - psllq \t, 16 // (?; c'' b) - pslldq \c, 8 // (0; c') - paddq \t, \c // (?; c' + c'' b) - psrldq \t, 8 // (c' + c'' b; 0) = (c; 0) + psllq \t, 16 // (c'' b; ?) + pslldq \c, 8 // (c'; 0) + paddq \t, \c // (c' + c'' b; ?) + psrldq \t, 8 // (0; c' + c'' b) = (0; c) movd \d, \t psrldq \t, 4 // (floor(c/B); 0) .endm @@ -197,21 +238,21 @@ // On entry, A and C hold packed 128-bit values, and Z is zero. On // exit, A:B and C:D together hold the same values in expanded // form. If C is `nil', then only expand A to A:B. - movdqa \b, \a // (a_0, a_1; a_2, a_3) + movdqa \b, \a // (a_3, a_2; a_1, a_0) .ifnes "\c", "nil" - movdqa \d, \c // (c_0, c_1; c_2, c_3) + movdqa \d, \c // (c_3, c_2; c_1, c_0) .endif - punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1) - punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3) + punpcklwd \a, \z // (a''_1, a'_1; a''_0, a'_0) + punpckhwd \b, \z // (a''_3, a'_3; a''_2, a'_2) .ifnes "\c", "nil" - punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1) - punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3) + punpcklwd \c, \z // (c''_1, c'_1; c''_0, c'_0) + punpckhwd \d, \z // (c''_3, c'_3; c''_2, c'_2) .endif - pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1) - pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3) + pshufd \a, \a, SHUF(3, 1, 2, 0) // (a''_1, a''_0; a'_1, a'_0) + pshufd \b, \b, SHUF(3, 1, 2, 0) // (a''_3, a''_2; a'_3, a'_2) .ifnes "\c", "nil" - pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1) - pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3) + pshufd \c, \c, SHUF(3, 1, 2, 0) // (c''_1, c''_0; c'_1, c'_0) + pshufd \d, \d, SHUF(3, 1, 2, 0) // (c''_3, c''_2; c'_3, c'_2) .endif .endm @@ -227,10 +268,10 @@ // we can do that, we must gather them together. movdqa \t, \c0 movdqa \u, \c1 - punpcklqdq \t, \c2 // (y'_0; y'_2) - punpckhqdq \c0, \c2 // (y''_0; y''_2) - punpcklqdq \u, \c3 // (y'_1; y'_3) - punpckhqdq \c1, \c3 // (y''_1; y''_3) + punpcklqdq \t, \c2 // (y'_2; y'_0) + punpckhqdq \c0, \c2 // (y''_2; y''_0) + punpcklqdq \u, \c3 // (y'_3; y'_1) + punpckhqdq \c1, \c3 // (y''_3; y''_1) // Now split the double-prime pieces. 
The high (up to) 48 bits will // go up; the low 16 bits go down. @@ -238,43 +279,43 @@ movdqa \c3, \c1 psllq \c2, 48 psllq \c3, 48 - psrlq \c0, 16 // high parts of (y''_0; y''_2) - psrlq \c1, 16 // high parts of (y''_1; y''_3) - psrlq \c2, 32 // low parts of (y''_0; y''_2) - psrlq \c3, 32 // low parts of (y''_1; y''_3) + psrlq \c0, 16 // high parts of (y''_2; y''_0) + psrlq \c1, 16 // high parts of (y''_3; y''_1) + psrlq \c2, 32 // low parts of (y''_2; y''_0) + psrlq \c3, 32 // low parts of (y''_3; y''_1) .ifnes "\hi", "nil" movdqa \hi, \c1 .endif - pslldq \c1, 8 // high part of (0; y''_1) + pslldq \c1, 8 // high part of (y''_1; 0) paddq \t, \c2 // propagate down paddq \u, \c3 - paddq \t, \c1 // and up: (y_0; y_2) - paddq \u, \c0 // (y_1; y_3) + paddq \t, \c1 // and up: (y_2; y_0) + paddq \u, \c0 // (y_3; y_1) .ifnes "\hi", "nil" - psrldq \hi, 8 // high part of (y''_3; 0) + psrldq \hi, 8 // high part of (0; y''_3) .endif // Finally extract the answer. This complicated dance is better than // storing to memory and loading, because the piecemeal stores // inhibit store forwarding. - movdqa \c3, \t // (y_0; ?) - movdqa \lo, \t // (y^*_0, ?; ?, ?) - psrldq \t, 8 // (y_2; 0) + movdqa \c3, \t // (?; y_0) + movdqa \lo, \t // (?, ?; ?, y^*_0) + psrldq \t, 8 // (0; y_2) psrlq \c3, 32 // (floor(y_0/B); ?) paddq \c3, \u // (y_1 + floor(y_0/B); ?) - movdqa \c1, \c3 // (y^*_1, ?; ?, ?) - psrldq \u, 8 // (y_3; 0) + movdqa \c1, \c3 // (?, ?; ?, y^*_1) + psrldq \u, 8 // (0; y_3) psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2; ?) paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2; ?) - punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?) + punpckldq \lo, \c3 // (?, ?; y^*_2, y^*_0) psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3; ?) paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3; ?) .ifnes "\hi", "nil" movdqa \t, \c3 pxor \u, \u .endif - punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?) + punpckldq \c1, \c3 // (?, ?; y^*_3, y^*_1) .ifnes "\hi", "nil" psrlq \t, 32 // very high bits of y paddq \hi, \t @@ -291,14 +332,14 @@ // On exit, the carry registers, including XMM7, are updated to hold // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other // registers are preserved. - movd xmm0, [edi + 0] // (a_0; 0) - movd xmm1, [edi + 4] // (a_1; 0) - movd xmm2, [edi + 8] // (a_2; 0) - movd xmm7, [edi + 12] // (a_3; 0) - - paddq xmm4, xmm0 // (c'_0 + a_0; c''_0) - paddq xmm5, xmm1 // (c'_1 + a_1; c''_1) - paddq xmm6, xmm2 // (c'_2 + a_2; c''_2 + a_3 b) + movd xmm0, [edi + 0] // (0; a_0) + movd xmm1, [edi + 4] // (0; a_1) + movd xmm2, [edi + 8] // (0; a_2) + movd xmm7, [edi + 12] // (0; a_3) + + paddq xmm4, xmm0 // (c''_0; c'_0 + a_0) + paddq xmm5, xmm1 // (c''_1; c'_1 + a_1) + paddq xmm6, xmm2 // (c''_2 + a_3 b; c'_2 + a_2) .endm ///-------------------------------------------------------------------------- @@ -1013,25 +1054,25 @@ FUNC(mpxmont_redc4_x86_sse2) // Continue carry propagation until the end of the buffer. 0: add [edi], eax mov eax, 0 // preserves flags - adcd [edi + 4], 0 - adcd [edi + 8], 0 - adcd [edi + 12], 0 + adc dword ptr [edi + 4], 0 + adc dword ptr [edi + 8], 0 + adc dword ptr [edi + 12], 0 adc eax, 0 add edi, 16 cmp edi, esi jb 0b - // Deal with the tail end. + // Deal with the tail end. Note that the actual destination length + // won't be an exact number of blocks of four, so it's safe to just + // drop through here. 7: add [edi], eax - mov eax, 0 // preserves flags + mov eax, 0 add edi, 4 adc eax, 0 cmp edi, edx jb 7b - // All done for this iteration. Start the next. 
(This must have at - // least one follow-on iteration, or we'd not have started this outer - // loop.) + // All done for this iteration. Start the next. 8: mov edi, [SP + 0] // -> dv[i - 1] mov ebx, [BP + 28] // -> X = nv[0] lea edx, [SP + 44] // -> space for Y @@ -1107,9 +1148,9 @@ ENDFUNC .macro testldcarry c mov ecx, \c // -> c - movdqu xmm4, [ecx + 0] // (c'_0; c''_0) - movdqu xmm5, [ecx + 16] // (c'_1; c''_1) - movdqu xmm6, [ecx + 32] // (c'_2; c''_2) + movdqu xmm4, [ecx + 0] // (c''_0; c'_0) + movdqu xmm5, [ecx + 16] // (c''_1; c'_1) + movdqu xmm6, [ecx + 32] // (c''_2; c'_2) .endm .macro testexpand v=nil, y=nil @@ -1245,6 +1286,8 @@ FUNC(test_mmul4) mov edi, [BP + 28] movdqa xmm0, [SP + 64] movdqa xmm1, [SP + 80] + pshufd xmm0, xmm0, SHUF(3, 1, 2, 0) + pshufd xmm1, xmm1, SHUF(3, 1, 2, 0) movdqu [edi], xmm0 movdqu [edi + 16], xmm1 testcarryout [BP + 24] @@ -1261,6 +1304,8 @@ FUNC(test_mmla4) mov edi, [BP + 28] movdqa xmm0, [SP + 64] movdqa xmm1, [SP + 80] + pshufd xmm0, xmm0, SHUF(3, 1, 2, 0) + pshufd xmm1, xmm1, SHUF(3, 1, 2, 0) movdqu [edi], xmm0 movdqu [edi + 16], xmm1 testcarryout [BP + 24] @@ -1277,6 +1322,8 @@ FUNC(test_mont4) mov edi, [BP + 28] movdqa xmm0, [SP + 64] movdqa xmm1, [SP + 80] + pshufd xmm0, xmm0, SHUF(3, 1, 2, 0) + pshufd xmm1, xmm1, SHUF(3, 1, 2, 0) movdqu [edi], xmm0 movdqu [edi + 16], xmm1 testcarryout [BP + 24]
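
The expanded-operand layout described in the header comments (and reordered by the hunks above) is easiest to see in a small scalar model.  The following C sketch is illustrative only and not part of Catacomb; the name `expand4' is invented here.  It builds, for a packed four-word operand, the eight 32-bit cells that the `expand' macro assembles with punpcklwd/punpckhwd/pshufd: the low pieces v'_i land in cells 0, 1, 4, 5 and the high pieces v''_i in cells 2, 3, 6, 7, so that a single `pmuludq' (which reads only the even cells) picks up both pieces of one source word.

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar model of the `expand' layout: split each 32-bit word
     * v_i = v'_i + 2^16 v''_i into its 16-bit pieces and zero-extend
     * them into 32-bit cells, arranged as in the offset table above. */
    static void expand4(uint32_t x[8], const uint32_t v[4])
    {
      for (int k = 0; k < 2; k++) {
        x[4*k + 0] = v[2*k + 0] & 0xffff;   /* v'_{2k} */
        x[4*k + 1] = v[2*k + 1] & 0xffff;   /* v'_{2k+1} */
        x[4*k + 2] = v[2*k + 0] >> 16;      /* v''_{2k} */
        x[4*k + 3] = v[2*k + 1] >> 16;      /* v''_{2k+1} */
      }
    }

    int main(void)
    {
      uint32_t v[4] = { 0x11112222, 0x33334444, 0x55556666, 0x77778888 };
      uint32_t x[8];

      expand4(x, v);
      for (int i = 0; i < 8; i++)
        printf("cell %d (offset %2d): %08x\n", i, 4*i, (unsigned)x[i]);
      return 0;
    }

Shifting the expanded vector right by one 32-bit cell (psrldq by 4, as `mulcore' does) then brings v'_1 and v''_1 into the even cells for the next `pmuludq'.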
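
The carry-register discipline (accumulate a few 48-bit partial products per 64-bit half, then emit one output word and fold the remainder into the next register) can be modelled the same way.  This is a rough scalar sketch under the conventions of the comments, with B = 2^32 and b = 2^16; the helper names `mulacc' and `carrystep' are invented, and the loop in main() follows the plain `mul4zc'/`mul4' pattern rather than the Montgomery variants.

    #include <stdint.h>
    #include <stdio.h>

    typedef struct { uint64_t lo, hi; } carry;  /* value = lo + hi*2^16 */

    /* One `mulcore'/`accum' step: multiply the expanded word (vlo, vhi)
     * by the 32-bit factor r and add the (at most 48-bit) partial
     * products into the halves of c, as a pmuludq/paddq pair does. */
    static void mulacc(carry *c, uint32_t r, uint16_t vlo, uint16_t vhi)
    {
      c->lo += (uint64_t)r*vlo;
      c->hi += (uint64_t)r*vhi;
    }

    /* The limited carry propagation following a pass of four
     * multiplications: emit one 32-bit word from c0, fold the rest
     * into c1. */
    static uint32_t carrystep(const carry *c0, carry *c1)
    {
      uint64_t t = c0->hi & 0xffffffffu;  /* t = c''_0 mod B */
      uint64_t d = c0->lo + (t << 16);    /* d = c'_0 + t b */
      c1->lo += d >> 32;                  /* add floor(d/B) ... */
      c1->hi += c0->hi >> 32;             /* ... and floor(c''_0/B) to c_1 */
      return (uint32_t)d;                 /* z = d mod B */
    }

    int main(void)
    {
      uint32_t u[4] = { 0x89abcdef, 0x01234567, 0xdeadbeef, 0xfeedface };
      uint32_t v[4] = { 0x76543210, 0xfedcba98, 0x0f1e2d3c, 0xc001d00d };
      uint32_t z[4], ref[8] = { 0 };
      carry c[4] = { { 0, 0 } };

      /* Each pass multiplies one word u_i by the whole expanded V,
       * outputs one word, and rotates the carry registers. */
      for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 4; j++)
          mulacc(&c[j], u[i], v[j] & 0xffff, v[j] >> 16);
        z[i] = carrystep(&c[0], &c[1]);
        c[0] = c[1]; c[1] = c[2]; c[2] = c[3];  /* c1 becomes c0, ... */
        c[3].lo = c[3].hi = 0;                  /* ... and c3 is zeroed */
      }

      /* Schoolbook reference for the low four output words. */
      for (int i = 0; i < 4; i++) {
        uint64_t cy = 0;
        for (int j = 0; j < 4; j++) {
          uint64_t t = (uint64_t)u[i]*v[j] + ref[i + j] + cy;
          ref[i + j] = (uint32_t)t; cy = t >> 32;
        }
        ref[i + 4] = (uint32_t)cy;
      }
      for (int i = 0; i < 4; i++)
        printf("z_%d = %08x (expected %08x)\n",
               i, (unsigned)z[i], (unsigned)ref[i]);
      return 0;
    }

Each 64-bit half accumulates at most four products below 2^48 plus a small propagated carry, so it cannot overflow; that bound is what lets the SSE2 code defer carry handling until the end of each pass.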
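
Finally, the Montgomery reduction factor mentioned after the variants table can be checked symbolically: with M = -N^{-1} (mod B^4), setting Y = (A + U V) M makes A + U V + N Y vanish modulo B^4, which is what allows the inner loop to discard the low four words.  The sketch below only demonstrates that identity: it uses unsigned __int128 (a GCC/Clang extension, typically available on 64-bit hosts only) to stand in for arithmetic modulo B^4, and the Hensel-lifting inversion is merely a convenient way to obtain M for the test; the assembler routines receive M ready-made at [ESI].

    #include <assert.h>

    typedef unsigned __int128 u128;

    /* Inverse of an odd n modulo 2^128 by Hensel lifting: x = n is
     * correct to three bits (odd squares are 1 mod 8), and each step
     * doubles the number of correct low-order bits. */
    static u128 inv128(u128 n)
    {
      u128 x = n;
      for (int i = 0; i < 6; i++) x *= 2 - n*x;
      return x;                     /* n*x == 1 (mod 2^128) */
    }

    int main(void)
    {
      /* Arbitrary test values; n must be odd, as a Montgomery modulus is. */
      u128 n = ((u128)0x0123456789abcdefULL << 64) | 0xfedcba9876543211ULL;
      u128 u = ((u128)0xdeadbeefcafef00dULL << 64) | 0x0f1e2d3c4b5a6978ULL;
      u128 v = ((u128)0x243f6a8885a308d3ULL << 64) | 0x13198a2e03707345ULL;
      u128 a = 42;

      u128 m = -inv128(n);          /* M = -N^{-1} (mod B^4) */
      u128 y = (a + u*v)*m;         /* Y = (A + U V) M (mod B^4) */

      assert(a + u*v + n*y == 0);   /* the low four words all cancel */
      return 0;
    }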