X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/a1a9ee0a7240087e202a7855e470573de0e59c09..bd6d65e32b835551677456bf286d09ced6859882:/math/mpx-mul4-amd64-sse2.S

diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S
index bd8ff2f9..5a748c60 100644
--- a/math/mpx-mul4-amd64-sse2.S
+++ b/math/mpx-mul4-amd64-sse2.S
@@ -41,11 +41,11 @@
 /// construct more general variable-length multipliers.
 ///
 /// The basic trick is the same throughout. In an operand-scanning
-/// multiplication, the inner multiplication loop multiplies a
-/// multiple-precision operand by a single precision factor, and adds the
-/// result, appropriately shifted, to the result. A `finely integrated
-/// operand scanning' implementation of Montgomery multiplication also adds
-/// the product of a single-precision `Montgomery factor' and the modulus,
+/// multiplication, the inner multiplication loop multiplies a multiple-
+/// precision operand by a single precision factor, and adds the result,
+/// appropriately shifted, to the result. A `finely integrated operand
+/// scanning' implementation of Montgomery multiplication also adds the
+/// product of a single-precision `Montgomery factor' and the modulus,
 /// calculated in the same pass. The more common `coarsely integrated
 /// operand scanning' alternates main multiplication and Montgomery passes,
 /// which requires additional carry propagation.
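
To make the finely-integrated operand-scanning idea above concrete, here is a
minimal C model of one inner pass, z += v u + m n, carried out digit by digit
in a single sweep. It is a sketch only: the digit type, the function name and
the deferred-carry bookkeeping are this example's own rather than Catacomb's
interfaces, and the Montgomery-specific steps (choosing m so that the low
digit vanishes, and the subsequent shift down) are omitted.

#include <stdint.h>
#include <stddef.h>

typedef uint32_t digit;         /* one base-2^32 digit */

/* Accumulate z[j] += v*u[j] + m*n[j] across len digits in one pass, as a
 * finely-integrated operand-scanning inner loop does; v is the current
 * multiplier digit and m the Montgomery factor. Returns the carry out of
 * the top position, which can approach 2^33 and so needs more than one
 * digit. */
uint64_t fios_pass(digit *z, const digit *u, const digit *n,
                   digit v, digit m, size_t len)
{
  uint64_t c = 0;               /* deferred carry: the low 32 bits are owed
                                 * to the current position, the rest to the
                                 * next one up */

  for (size_t j = 0; j < len; j++) {
    uint64_t t = (uint64_t)v*u[j] + z[j] + (uint32_t)c;
    uint64_t s = (uint64_t)m*n[j] + (uint32_t)t;
    z[j] = (digit)s;
    c = (c >> 32) + (t >> 32) + (s >> 32);
  }
  return c;
}

Splitting the carry like this keeps every intermediate sum within 64 bits; the
SSE2 code in this file pushes the same idea further by accumulating partial
products in 64-bit vector lanes with headroom to spare, as the header comment
goes on to describe.
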
@@ -71,22 +71,65 @@
 /// many products together before we must deal with carrying; it also allows
 /// for some calculations to be performed on the above expanded form.
 ///
-/// ...
+/// We maintain four `carry' registers XMM12--XMM15 accumulating intermediate
+/// results. The registers' precise roles rotate during the computation; we
+/// name them `c0', `c1', `c2', and `c3'. Each carry register holds two
+/// 64-bit halves: the register c0, for example, holds c'_0 (low half) and
+/// c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the
+/// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2 +
+/// c_3 B^3. The `pmuludq' instruction acting on a scalar operand
+/// (broadcast across all lanes of its vector) and an operand in the expanded
+/// form above produces a result which can be added directly to the
+/// appropriate carry register. Following a pass of four multiplications, we
+/// perform some limited carry propagation: let t = c''_0 mod B, and let d =
+/// c'_0 + t b; then we output z = d mod B, add (floor(d/B), floor(c''_0/B))
+/// to c1, and cycle the carry registers around, so that c1 becomes c0, and
+/// the old (implicitly) zeroed c0 becomes c3.
 ///
-/// We maintain four `carry' registers accumulating intermediate results.
-/// The registers' precise roles rotate during the computation; we name them
-/// `c0', `c1', `c2', and `c3'. Each carry register holds two 64-bit halves:
-/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
-/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
-/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
-/// `pmuludq' instruction acting on a scalar operand (broadcast across all
-/// lanes of its vector) and an operand in the expanded form above produces a
-/// result which can be added directly to the appropriate carry register.
-/// Following a pass of four multiplications, we perform some limited carry
-/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
-/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
-/// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
-/// zeroed becomes c3.
+/// On 64-bit AMD64, we have a reasonable number of registers: the expanded
+/// operands are kept in registers. The packed operands are read from memory
+/// into working registers XMM4 and XMM5; XMM0--XMM3 are used for the actual
+/// multiplications; and XMM6 and XMM7 are used for combining the results.
+/// The following conventional argument names and locations are used
+/// throughout.
+///
+///   Arg  Format    Location      Notes
+///
+///   U    packed    [RAX]
+///   X    packed    [RBX]         In Montgomery multiplication, X = N
+///   V    expanded  XMM8/XMM9
+///   Y    expanded  XMM10/XMM11   In Montgomery multiplication, Y = (A + U V) M
+///   M    expanded  (see below)   Montgomery factor, M = -N^{-1} (mod B^4)
+///   N                            Modulus, for Montgomery multiplication
+///   A    packed    [RDI]         Destination/accumulator
+///   C    carry     XMM12--XMM15
+///
+/// The calculation is some variant of
+///
+///   A' + C' B^4 <- U V + X Y + A + C
+///
+/// The low-level functions fit into a fairly traditional (finely-integrated)
+/// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y)
+/// (indexed by i).
+///
+/// The variants are as follows.
+///
+///   Function  Variant            Use         i    j
+///
+///   mmul4     A = C = 0, Y = M   Montgomery  0    0
+///   dmul4     A = 0              Montgomery  0    +
+///   mmla4     C = 0, Y = M       Montgomery  +    0
+///   dmla4     exactly as shown   Montgomery  +    +
+///   mont4     U = C = 0, V = M   Montgomery  any  0
+///
+///   mul4zc    U = V = A = C = 0  Plain       0    0
+///   mul4      U = V = A = 0      Plain       0    +
+///   mla4zc    U = V = C = 0      Plain       +    0
+///   mla4      U = V = 0          Plain       +    +
+///
+/// The `mmul4' and `mmla4' functions are also responsible for calculating
+/// the Montgomery reduction factor Y = (A + U V) M used by the rest of the
+/// inner loop.
 
 ///--------------------------------------------------------------------------
 /// Macro definitions.
@@ -532,8 +575,8 @@ ENDFUNC
 INTFUNC(mmul4)
         // On entry, RDI points to the destination buffer; RAX and RBX point
         // to the packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold
-        // the expanded operands V and M. The stack pointer must be 8 modulo 16
-        // (as usual for AMD64 ABIs).
+        // the expanded operands V and M. The stack pointer must be 8 modulo
+        // 16 (as usual for AMD64 ABIs).
         //
         // On exit, we store Y = U V M mod B in XMM10/XMM11, and write the
         // low 128 bits of the sum U V + N Y to [RDI], leaving the remaining
@@ -750,7 +793,7 @@ ENDFUNC
 
 FUNC(mpx_umul4_amd64_sse2)
         // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
-        //                         const mpw *bv, const mpw *bvl);
+        //                           const mpw *bv, const mpw *bvl);
 
         // Establish the arguments and do initial setup.
         //
@@ -1112,7 +1155,7 @@ FUNC(mpxmont_redc4_amd64_sse2)
         //      outer loop dv           r10     rcx
         //      outer loop dv limit     r11     r11
         //      nv base                 rdx     r8
-        //      nv limit                r9      r12*
+        //      nv limit                r9      r10*
         //      n                       rcx     r9
         //      c                       rcx     r9
 
@@ -1140,14 +1183,13 @@ FUNC(mpxmont_redc4_amd64_sse2)
 #  define DV rcx
 #  define DVLO r11
 #  define NV r8
-#  define NVL r12
+#  define NVL r10
 #  define N r9
 #  define C r9d
 
         pushreg rbx
         pushreg rdi
-        pushreg r12
-        stalloc 160
+        stalloc 168
 
         savexmm xmm6, 0
         savexmm xmm7, 16
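
The expanded-operand representation and the carry registers described in the
header comment above can be sketched with the C intrinsic for `pmuludq'. The
fragment below is a standalone illustration, not Catacomb's code: it expands a
single 32-bit digit into its 16-bit halves (this sketch's reading of the
expanded form, taking the b of the comment to be 2^16), multiplies by a
broadcast 32-bit scalar, and accumulates into one carry register whose two
64-bit halves stand for c' + c'' b. Each product is at most 48 bits wide, so
many of them can be absorbed by plain 64-bit lane additions before any carry
propagation is needed.

#include <emmintrin.h>
#include <assert.h>
#include <stdint.h>

int main(void)
{
  uint32_t s = 0xdeadbeef;      /* one 32-bit digit of an operand */
  uint32_t r = 0x12345678;      /* a 32-bit scalar multiplier */

  /* Expanded form of s: its 16-bit halves, each given its own 64-bit lane
   * here so that PMULUDQ can reach them. */
  __m128i sexp = _mm_set_epi64x((int64_t)(s >> 16), (int64_t)(s & 0xffff));

  /* Broadcast the scalar into the low dword of both 64-bit lanes. */
  __m128i rbrd = _mm_set1_epi64x((int64_t)r);

  /* PMULUDQ: the two products r*s' and r*s'', side by side, each with 16
   * bits of headroom to spare in its lane. */
  __m128i prod = _mm_mul_epu32(sexp, rbrd);

  /* Accumulate into a carry register with an ordinary 64-bit addition. */
  __m128i carry = _mm_setzero_si128();
  carry = _mm_add_epi64(carry, prod);

  /* Read back the two halves; the register as a whole stands for
   * c' + c''*2^16, and indeed recovers the full product r*s. */
  uint64_t clo = (uint64_t)_mm_cvtsi128_si64(carry);
  uint64_t chi = (uint64_t)_mm_cvtsi128_si64(_mm_srli_si128(carry, 8));
  assert(clo + (chi << 16) == (uint64_t)r*s);

  return 0;
}

In the real code the expansion covers four digits at a time (V into XMM8/XMM9,
Y into XMM10/XMM11, as tabulated above) and the four carry registers rotate as
described; only the central multiply-and-accumulate step is shown here.
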
@@ -1209,29 +1251,29 @@ FUNC(mpxmont_redc4_amd64_sse2)
         // Continue carry propagation until the end of the buffer.
 0:      add [rdi], C
         mov C, 0                // preserves flags
-        adcd [rdi + 4], 0
-        adcd [rdi + 8], 0
-        adcd [rdi + 12], 0
+        adc dword ptr [rdi + 4], 0
+        adc dword ptr [rdi + 8], 0
+        adc dword ptr [rdi + 12], 0
         adc C, 0
         add rdi, 16
         cmp rdi, DVL4
         jb 0b
 
-        // Deal with the tail end.
+        // Deal with the tail end. Note that the actual destination length
+        // won't be an exact number of blocks of four, so it's safe to just
+        // drop through here.
 7:      add [rdi], C
-        mov C, 0                // preserves flags
+        mov C, 0
         add rdi, 4
         adc C, 0
         cmp rdi, DVL
         jb 7b
 
-        // All done for this iteration. Start the next. (This must have at
-        // least one follow-on iteration, or we'd not have started this outer
-        // loop.)
-8:      mov rdi, DV             // -> Z = dv[i]
-        mov rbx, NV             // -> X = nv[0]
-        cmp rdi, DVLO           // all done yet?
+        // All done for this iteration. Start the next.
+        cmp DV, DVLO            // all done yet?
         jae 9f
+        mov rdi, DV             // -> Z = dv[i]
+        mov rbx, NV             // -> X = nv[0]
         add DV, 16
         call mont4
         add rdi, 16
@@ -1257,8 +1299,7 @@ FUNC(mpxmont_redc4_amd64_sse2)
         rstrxmm xmm14, 128
         rstrxmm xmm15, 144
 
-        stfree 160
-        popreg r12
+        stfree 168
         popreg rdi
         popreg rbx
 #endif
@@ -1560,6 +1601,8 @@ FUNC(test_mmul4)
         testtop r11
         call mmul4
         testtail
+        pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
+        pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
         movdqu [r10 + 0], xmm10
         movdqu [r10 + 16], xmm11
         testcarryout
@@ -1571,6 +1614,8 @@ FUNC(test_mmla4)
         testtop r11
         call mmla4
         testtail
+        pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
+        pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
         movdqu [r10 + 0], xmm10
         movdqu [r10 + 16], xmm11
         testcarryout
@@ -1582,6 +1627,8 @@ FUNC(test_mont4)
         testtop
         call mont4
         testtail
+        pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
+        pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
         movdqu [r10 + 0], xmm10
         movdqu [r10 + 16], xmm11
         testcarryout
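
The carry-propagation loops at labels 0 and 7 above have a straightforward
scalar reading, sketched in C below. The function and argument names are
illustrative only; the point is the shape of the computation: ripple a single
carry word up the destination, first over whole blocks of four words and then
one word at a time over the tail.

#include <stdint.h>
#include <stddef.h>

/* Add the carry word c into dv[i] and propagate it towards the end of the
 * buffer. dvl4 is the limit of the whole blocks of four and dvl the true
 * end, mirroring the DVL4 and DVL limits used by the assembler. */
void propagate_carry(uint32_t *dv, size_t i, size_t dvl4, size_t dvl,
                     uint32_t c)
{
  /* Whole blocks of four words, as in the loop at label 0. */
  for (; i < dvl4; i += 4) {
    for (size_t k = 0; k < 4; k++) {
      uint64_t t = (uint64_t)dv[i + k] + c;
      dv[i + k] = (uint32_t)t;
      c = (uint32_t)(t >> 32);          /* 0 or 1 from here on */
    }
  }

  /* Word-at-a-time tail, as in the loop at label 7. */
  for (; i < dvl; i++) {
    uint64_t t = (uint64_t)dv[i] + c;
    dv[i] = (uint32_t)t;
    c = (uint32_t)(t >> 32);
  }
}

The assembler's tail loop adds before it tests, so falling straight into it
from the block loop is only safe if at least one word remains, which is
exactly the point of the new comment about the destination length not being
an exact number of blocks of four.
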