X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/6d2bd7f11dcd292461bbd66e487e4367f05f9fe8..9599917f4a0aa31e2dafd15a7f0e4993bdedf715:/math/mpx-mul4-x86-sse2.S diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S index 0e90541e..b1072ff2 100644 --- a/math/mpx-mul4-x86-sse2.S +++ b/math/mpx-mul4-x86-sse2.S @@ -40,11 +40,11 @@ /// construct more general variable-length multipliers. /// /// The basic trick is the same throughout. In an operand-scanning -/// multiplication, the inner multiplication loop multiplies a -/// multiple-precision operand by a single precision factor, and adds the -/// result, appropriately shifted, to the result. A `finely integrated -/// operand scanning' implementation of Montgomery multiplication also adds -/// the product of a single-precision `Montgomery factor' and the modulus, +/// multiplication, the inner multiplication loop multiplies a multiple- +/// precision operand by a single precision factor, and adds the result, +/// appropriately shifted, to the result. A `finely integrated operand +/// scanning' implementation of Montgomery multiplication also adds the +/// product of a single-precision `Montgomery factor' and the modulus, /// calculated in the same pass. The more common `coarsely integrated /// operand scanning' alternates main multiplication and Montgomery passes, /// which requires additional carry propagation. @@ -70,23 +70,64 @@ /// many products together before we must deal with carrying; it also allows /// for some calculations to be performed on the above expanded form. /// +/// We maintain four `carry' registers XMM4--XMM7 accumulating intermediate +/// results. The registers' precise roles rotate during the computation; we +/// name them `c0', `c1', `c2', and `c3'. Each carry register holds two +/// 64-bit halves: the register c0, for example, holds c'_0 (low half) and +/// c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the +/// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2 + +/// c_3 B^3. The `pmuluqd' instruction acting on a scalar operand (broadcast +/// across all lanes of its vector) and an operand in the expanded form above +/// produces a result which can be added directly to the appropriate carry +/// register. Following a pass of four multiplications, we perform some +/// limited carry propagation: let t = c''_0 mod B, and let d = c'_0 + t b; +/// then we output z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and +/// cycle the carry registers around, so that c1 becomes c0, and the old +/// (implicitly) zeroed c0 becomes c3. +/// /// On 32-bit x86, we are register starved: the expanded operands are kept in -/// memory, typically in warm L1 cache. +/// memory, typically in warm L1 cache. The packed operands are read from +/// memory into working registers XMM0--XMM3 and processed immediately. +/// The following conventional argument names and locations are used +/// throughout. 
+/// +/// Arg Format Location Notes +/// +/// U packed [EAX] +/// X packed [EBX] In Montgomery multiplication, X = N +/// V expanded [ECX] +/// Y expanded [EDX] In Montgomery multiplication, Y = (A + U V) M +/// M expanded [ESI] -N^{-1} (mod B^4) +/// N Modulus, for Montgomery multiplication +/// A packed [EDI] Destination/accumulator +/// C carry XMM4--XMM7 +/// +/// The calculation is some variant of +/// +/// A' + C' B^4 <- U V + X Y + A + C +/// +/// The low-level functions fit into a fairly traditional (finely-integrated) +/// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y) +/// (indexed by i). /// -/// We maintain four `carry' registers accumulating intermediate results. -/// The registers' precise roles rotate during the computation; we name them -/// `c0', `c1', `c2', and `c3'. Each carry register holds two 64-bit halves: -/// the register c0, for example, holds c'_0 (low half) and c''_0 (high -/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers -/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The -/// `pmuluqd' instruction acting on a scalar operand (broadcast across all -/// lanes of its vector) and an operand in the expanded form above produces a -/// result which can be added directly to the appropriate carry register. -/// Following a pass of four multiplications, we perform some limited carry -/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output -/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry -/// registers around, so that c1 becomes c0, and the old c0 is (implicitly) -/// zeroed becomes c3. +/// The variants are as follows. +/// +/// Function Variant Use i j +/// +/// mmul4 A = C = 0 Montgomery 0 0 +/// dmul4 A = 0 Montgomery 0 + +/// mmla4 C = 0 Montgomery + 0 +/// dmla4 exactly as shown Montgomery + + +/// mont4 U = V = C = 0 Montgomery any 0 +/// +/// mul4zc U = V = A = C = 0 Plain 0 0 +/// mul4 U = V = A = 0 Plain 0 + +/// mla4zc U = V = C = 0 Plain + 0 +/// mla4 U = V = 0 Plain + + +/// +/// The `mmul4' and `mmla4' functions are also responsible for calculating +/// the Montgomery reduction factor Y = (A + U V) M used by the rest of the +/// inner loop. ///-------------------------------------------------------------------------- /// Macro definitions. @@ -316,7 +357,6 @@ INTFUNC(carryprop) propout [edi + 8], xmm6, nil endprop [edi + 12], xmm6, xmm4 ret - ENDFUNC INTFUNC(dmul4) @@ -348,7 +388,6 @@ INTFUNC(dmul4) propout [edi + 12], xmm7, xmm4 ret - ENDFUNC INTFUNC(dmla4) @@ -384,7 +423,6 @@ INTFUNC(dmla4) propout [edi + 12], xmm7, xmm4 ret - ENDFUNC INTFUNC(mul4zc) @@ -410,7 +448,6 @@ INTFUNC(mul4zc) propout [edi + 12], xmm7, xmm4 ret - ENDFUNC INTFUNC(mul4) @@ -438,7 +475,6 @@ INTFUNC(mul4) propout [edi + 12], xmm7, xmm4 ret - ENDFUNC INTFUNC(mla4zc) @@ -470,7 +506,6 @@ INTFUNC(mla4zc) propout [edi + 12], xmm7, xmm4 ret - ENDFUNC INTFUNC(mla4) @@ -501,7 +536,6 @@ INTFUNC(mla4) propout [edi + 12], xmm7, xmm4 ret - ENDFUNC INTFUNC(mmul4) @@ -523,7 +557,6 @@ INTFUNC(mmul4) mulcore [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7 propout [edi + 0], xmm4, xmm5 jmp 5f - ENDFUNC INTFUNC(mmla4) @@ -613,7 +646,6 @@ INTFUNC(mmla4) // And, with that, we're done. stfree 48 + 12 ret - ENDFUNC INTFUNC(mont4) @@ -670,7 +702,6 @@ INTFUNC(mont4) // And, with that, we're done. 
ret - ENDFUNC ///-------------------------------------------------------------------------- @@ -781,7 +812,6 @@ FUNC(mpx_umul4_x86_sse2) pop ebx pop BP ret - ENDFUNC FUNC(mpxmont_mul4_x86_avx) @@ -930,7 +960,6 @@ FUNC(mpxmont_mul4_x86_sse2) popreg ebx popreg BP ret - ENDFUNC FUNC(mpxmont_redc4_x86_avx) @@ -1064,7 +1093,6 @@ FUNC(mpxmont_redc4_x86_sse2) popreg ebx popreg BP ret - ENDFUNC ///--------------------------------------------------------------------------
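
As a supplementary illustration of the scheme described in the revised header
comment: the carry representation can be modelled in scalar C.  The sketch
below assumes B = 2^32 (the 32-bit output word size) and b = 2^16 (the split
used by the expanded operand form), which is what the surrounding comments
imply for the x86 SSE2 code; the names `carry', `step_mul', `carry_step' and
`propout' are illustrative only and do not correspond to symbols in the
assembler.  It multiplies one four-word group by another, doing a pass of four
products per scalar word followed by the limited carry propagation, and checks
the result against a plain schoolbook multiplication.

#include <stdint.h>
#include <stdio.h>

struct carry { uint64_t lo, hi; };      /* represents lo + (hi << 16) */

/* One `pmuludq'-style step: the 32-bit scalar v times one operand word y
 * held in expanded form (16-bit halves in separate lanes), accumulated
 * into the carry register c. */
static void step_mul(struct carry *c, uint32_t y, uint32_t v)
{
  c->lo += (uint64_t)(y & 0xffff)*v;    /* v y_lo adds to c' */
  c->hi += (uint64_t)(y >> 16)*v;       /* v y_hi adds to c'' */
}

/* Limited carry propagation: t = c''_0 mod B, d = c'_0 + t b; emit
 * z = d mod B and add (floor(d/B), floor(c''_0/B)) to the next carry. */
static uint32_t carry_step(struct carry *c0, struct carry *c1)
{
  uint64_t t = (uint32_t)c0->hi, d = c0->lo + (t << 16);
  c1->lo += d >> 32; c1->hi += c0->hi >> 32;
  return (uint32_t)d;
}

/* Emit one output word and cycle the carries: c1 becomes c0, and the old
 * c0 comes back, zeroed, as c3. */
static uint32_t propout(struct carry c[4])
{
  uint32_t z = carry_step(&c[0], &c[1]);
  c[0] = c[1]; c[1] = c[2]; c[2] = c[3];
  c[3].lo = c[3].hi = 0;
  return z;
}

int main(void)
{
  uint32_t x[4] = { 0xdeadbeef, 0x01234567, 0x89abcdef, 0xfedcba98 },
           y[4] = { 0xcafef00d, 0x76543210, 0x0f1e2d3c, 0x4b5a6978 },
           z[8], r[8];
  struct carry c[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
  uint64_t t;
  int i, j;

  /* The scheme under test: one pass per word of x, four products per pass,
   * then a single limited carry propagation (cf. `mul4zc'/`mul4'). */
  for (i = 0; i < 4; i++) {
    for (j = 0; j < 4; j++) step_mul(&c[j], y[j], x[i]);
    z[i] = propout(c);
  }
  for (i = 4; i < 7; i++) z[i] = propout(c);    /* cf. `carryprop' */
  z[7] = (uint32_t)(c[0].lo + (c[0].hi << 16));

  /* Reference: plain schoolbook multiplication in 64-bit arithmetic. */
  for (i = 0; i < 8; i++) r[i] = 0;
  for (i = 0; i < 4; i++) {
    for (j = 0, t = 0; j < 4; j++) {
      t += (uint64_t)x[i]*y[j] + r[i + j];
      r[i + j] = (uint32_t)t; t >>= 32;
    }
    r[i + 4] = (uint32_t)t;
  }

  for (i = 0; i < 8; i++)
    printf("z[%d] = %08x   r[%d] = %08x%s\n", i, (unsigned)z[i],
           i, (unsigned)r[i], z[i] == r[i] ? "" : "  MISMATCH");
  return 0;
}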
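
The variants table likewise pins down which primitive a finely-integrated
operand-scanning driver invokes at each point.  The fragment below is a
control-flow sketch only, with hypothetical stand-in functions that merely
report the choice for each (i, j) pair; argument passing, operand expansion
and the end-of-pass carry handling done by the real drivers
(mpx_umul4_x86_sse2, mpxmont_mul4_x86_sse2) are left out.

#include <stdio.h>

/* Stand-ins for the assembler primitives: each only reports which variant a
 * Montgomery-multiplication driver would pick for a given (i, j). */
static void mmul4(void) { puts("mmul4: i = 0, j = 0; A = C = 0, computes Y"); }
static void dmul4(void) { puts("dmul4: i = 0, j > 0; A = 0"); }
static void mmla4(void) { puts("mmla4: i > 0, j = 0; C = 0, computes Y"); }
static void dmla4(void) { puts("dmla4: i > 0, j > 0; full U V + X Y + A + C"); }

int main(void)
{
  unsigned n = 8;                       /* two four-word chunks each way */

  for (unsigned i = 0; i < n; i += 4) { /* i indexes chunks of (V, Y) */
    for (unsigned j = 0; j < n; j += 4) {       /* j indexes chunks of (U, X) */
      if (i == 0 && j == 0)  mmul4();
      else if (i == 0)       dmul4();
      else if (j == 0)       mmla4();
      else                   dmla4();
    }
    /* The leftover carry from this pass would be folded into the high
     * words of the accumulator here; the plain-multiplication variants
     * (mul4zc, mul4, mla4zc, mla4) follow the same pattern with U = V = 0. */
  }
  return 0;
}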