X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/8e91d6e5b9c82efb626869d6618b240d2ae8ad05..9599917f4a0aa31e2dafd15a7f0e4993bdedf715:/math/mpx-mul4-amd64-sse2.S diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S index 8b8cd414..0f2dbd32 100644 --- a/math/mpx-mul4-amd64-sse2.S +++ b/math/mpx-mul4-amd64-sse2.S @@ -25,15 +25,13 @@ /// MA 02111-1307, USA. ///-------------------------------------------------------------------------- -/// External definitions. +/// Preliminaries. #include "config.h" #include "asm-common.h" -///-------------------------------------------------------------------------- -/// Prologue. - .arch pentium4 + .text ///-------------------------------------------------------------------------- @@ -43,11 +41,11 @@ /// construct more general variable-length multipliers. /// /// The basic trick is the same throughout. In an operand-scanning -/// multiplication, the inner multiplication loop multiplies a -/// multiple-precision operand by a single precision factor, and adds the -/// result, appropriately shifted, to the result. A `finely integrated -/// operand scanning' implementation of Montgomery multiplication also adds -/// the product of a single-precision `Montgomery factor' and the modulus, +/// multiplication, the inner multiplication loop multiplies a multiple- +/// precision operand by a single precision factor, and adds the result, +/// appropriately shifted, to the result. A `finely integrated operand +/// scanning' implementation of Montgomery multiplication also adds the +/// product of a single-precision `Montgomery factor' and the modulus, /// calculated in the same pass. The more common `coarsely integrated /// operand scanning' alternates main multiplication and Montgomery passes, /// which requires additional carry propagation. @@ -73,22 +71,65 @@ /// many products together before we must deal with carrying; it also allows /// for some calculations to be performed on the above expanded form. /// -/// ... +/// We maintain four `carry' registers XMM12--XMM15 accumulating intermediate +/// results. The registers' precise roles rotate during the computation; we +/// name them `c0', `c1', `c2', and `c3'. Each carry register holds two +/// 64-bit halves: the register c0, for example, holds c'_0 (low half) and +/// c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the +/// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2 + +/// c_3 B^3. The `pmuluqdq' instruction acting on a scalar operand +/// (broadcast across all lanes of its vector) and an operand in the expanded +/// form above produces a result which can be added directly to the +/// appropriate carry register. Following a pass of four multiplications, we +/// perform some limited carry propagation: let t = c''_0 mod B, and let d = +/// c'_0 + t b; then we output z = d mod B, add (floor(d/B), floor(c''_0/B)) +/// to c1, and cycle the carry registers around, so that c1 becomes c0, and +/// the old (implicitly) zeroed c0 becomes c3. +/// +/// On 64-bit AMD64, we have a reasonable number of registers: the expanded +/// operands are kept in registers. The packed operands are read from memory +/// into working registers XMM4 and XMM5; XMM0--XMM3 are used for the actual +/// multiplications; and XMM6 and XMM7 are used for combining the results. +/// The following conventional argument names and locations are used +/// throughout. 
+/// +/// Arg Format Location Notes +/// +/// U packed [RAX] +/// X packed [RBX] In Montgomery multiplication, X = N +/// V expanded XMM8/XMM9 +/// Y expanded XMM10/XMM11 In Montgomery multiplication, Y = (A + U V) M +/// M expanded (see below) Montgomery factor, M = -N^{-1} (mod B^4) +/// N Modulus, for Montgomery multiplication +/// A packed [RDI] Destination/accumulator +/// C carry XMM12--XMM15 +/// +/// The calculation is some variant of +/// +/// A' + C' B^4 <- U V + X Y + A + C /// -/// We maintain four `carry' registers accumulating intermediate results. -/// The registers' precise roles rotate during the computation; we name them -/// `c0', `c1', `c2', and `c3'. Each carry register holds two 64-bit halves: -/// the register c0, for example, holds c'_0 (low half) and c''_0 (high -/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers -/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The -/// `pmuluqdq' instruction acting on a scalar operand (broadcast across all -/// lanes of its vector) and an operand in the expanded form above produces a -/// result which can be added directly to the appropriate carry register. -/// Following a pass of four multiplications, we perform some limited carry -/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output -/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry -/// registers around, so that c1 becomes c0, and the old c0 is (implicitly) -/// zeroed becomes c3. +/// The low-level functions fit into a fairly traditional (finely-integrated) +/// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y) +/// (indexed by i). +/// +/// The variants are as follows. +/// +/// Function Variant Use i j +/// +/// mmul4 A = C = 0, Y = M Montgomery 0 0 +/// dmul4 A = 0 Montgomery 0 + +/// mmla4 C = 0, Y = M Montgomery + 0 +/// dmla4 exactly as shown Montgomery + + +/// mont4 U = C = 0, V = M Montgomery any 0 +/// +/// mul4zc U = V = A = C = 0 Plain 0 0 +/// mul4 U = V = A = 0 Plain 0 + +/// mla4zc U = V = C = 0 Plain + 0 +/// mla4 U = V = 0 Plain + + +/// +/// The `mmul4' and `mmla4' functions are also responsible for calculating +/// the Montgomery reduction factor Y = (A + U V) M used by the rest of the +/// inner loop. ///-------------------------------------------------------------------------- /// Macro definitions. @@ -96,7 +137,7 @@ .macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil // Multiply R_I by the expanded operand SLO/SHI, and leave the pieces // of the product in registers D0, D1, D2, D3. - pshufd \d0, \r, SHUF(3, \i, 3, \i) // (r_i, ?; r_i, ?) + pshufd \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?) .ifnes "\d1", "nil" movdqa \d1, \slo // (s'_0, s'_1; s''_0, s''_1) .endif @@ -163,7 +204,7 @@ // lane 0 or 1 of D; the high two lanes of D are clobbered. On // completion, XMM3 is clobbered. If CC is `nil', then the // contribution which would have been added to it is left in C. 
- pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?; ?, t = c'' mod B) + pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B) psrldq xmm3, 12 // (t, 0; 0, 0) = (t; 0) pslldq xmm3, 2 // (t b; 0) paddq \c, xmm3 // (c' + t b; c'') @@ -209,11 +250,11 @@ punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1) punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3) .endif - pshufd \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1; a''_0, a''_1) - pshufd \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3; a''_2, a''_3) + pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1) + pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3) .ifnes "\c", "nil" - pshufd \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1; c''_0, c''_1) - pshufd \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3; c''_2, c''_3) + pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1) + pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3) .endif .endm @@ -321,7 +362,6 @@ INTFUNC(carryprop) movdqu [rdi], xmm0 ret - ENDFUNC INTFUNC(dmul4) @@ -359,7 +399,6 @@ INTFUNC(dmul4) movdqu [rdi], xmm6 ret - ENDFUNC INTFUNC(dmla4) @@ -400,7 +439,6 @@ INTFUNC(dmla4) movdqu [rdi], xmm6 ret - ENDFUNC INTFUNC(mul4zc) @@ -431,7 +469,6 @@ INTFUNC(mul4zc) movdqu [rdi], xmm6 ret - ENDFUNC INTFUNC(mul4) @@ -464,7 +501,6 @@ INTFUNC(mul4) movdqu [rdi], xmm6 ret - ENDFUNC INTFUNC(mla4zc) @@ -500,7 +536,6 @@ INTFUNC(mla4zc) movdqu [rdi], xmm6 ret - ENDFUNC INTFUNC(mla4) @@ -535,14 +570,13 @@ INTFUNC(mla4) movdqu [rdi], xmm6 ret - ENDFUNC INTFUNC(mmul4) // On entry, RDI points to the destination buffer; RAX and RBX point // to the packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold - // the expanded operands V and M. The stack pointer must be 8 modulo 16 - // (as usual for AMD64 ABIs). + // the expanded operands V and M. The stack pointer must be 8 modulo + // 16 (as usual for AMD64 ABIs). // // On exit, we store Y = U V M mod B in XMM10/XMM11, and write the // low 128 bits of the sum U V + N Y to [RDI], leaving the remaining @@ -559,7 +593,6 @@ INTFUNC(mmul4) mulcore xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15 propout xmm7, lo, xmm12, xmm13 jmp 5f - ENDFUNC INTFUNC(mmla4) @@ -577,10 +610,10 @@ INTFUNC(mmla4) movdqu xmm4, [rax] #if ABI_WIN stalloc 48 + 8 // space for the carries -# define STKTMP(i) [rsp + i] +# define STKTMP(i) [SP + i] #endif #if ABI_SYSV -# define STKTMP(i) [rsp + i - 48 - 8] // use red zone +# define STKTMP(i) [SP + i - 48 - 8] // use red zone #endif endprologue @@ -746,7 +779,6 @@ INTFUNC(mont4) // And, with that, we're done. movdqu [rdi], xmm6 ret - ENDFUNC ///-------------------------------------------------------------------------- @@ -761,7 +793,7 @@ ENDFUNC FUNC(mpx_umul4_amd64_sse2) // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl, - // const mpw *bv, const mpw *bvl); + // const mpw *bv, const mpw *bvl); // Establish the arguments and do initial setup. // @@ -785,7 +817,6 @@ FUNC(mpx_umul4_amd64_sse2) endprologue mov DV, rdi - #endif #if ABI_WIN @@ -813,8 +844,7 @@ FUNC(mpx_umul4_amd64_sse2) endprologue mov rdi, DV - mov BVL, [rsp + 224] - + mov BVL, [SP + 224] #endif // Prepare for the first iteration. 
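
As a rough illustration of the `expanded' operand representation and the PMULUDQ trick that the header comment above relies on, here is a small standalone C sketch using SSE2 intrinsics. It is simplified to a single 32-bit digit (the real code interleaves two digits per register and juggles four carry registers), and the variable names are invented for the example rather than taken from the patch.

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>

int main(void)
{
        /* Take b = 2^16 and B = 2^32.  A digit s = s' + s'' b is `expanded'
         * by zero-extending its 16-bit halves into the even 32-bit lanes,
         * which are the lanes PMULUDQ reads.
         */
        uint32_t s = 0xdeadbeef, r = 0x12345678;
        __m128i sx = _mm_set_epi32(0, s >> 16, 0, s & 0xffff); /* (s', ?; s'', ?) */
        __m128i rr = _mm_set_epi32(0, r, 0, r);                /* (r, ?; r, ?) */

        /* PMULUDQ forms two full 32x32 -> 64-bit products, one per half:
         * the result is (r s'; r s''), i.e. the product r s already split
         * in the c' + c'' b form used by the carry registers, so it can be
         * added straight in with PADDQ.
         */
        __m128i p = _mm_mul_epu32(rr, sx);
        uint64_t plo = (uint64_t)_mm_cvtsi128_si64(p);
        uint64_t phi = (uint64_t)_mm_cvtsi128_si64(_mm_srli_si128(p, 8));

        printf("recombined: %#018llx\n", (unsigned long long)(plo + (phi << 16)));
        printf("direct:     %#018llx\n", (unsigned long long)((uint64_t)r*s));
        return 0;
}
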
@@ -880,7 +910,6 @@ FUNC(mpx_umul4_amd64_sse2) #endif #if ABI_WIN - rstrxmm xmm6, 0 rstrxmm xmm7, 16 rstrxmm xmm8, 32 @@ -895,7 +924,6 @@ FUNC(mpx_umul4_amd64_sse2) stfree 160 + 8 popreg rdi popreg rbx - #endif ret @@ -948,7 +976,6 @@ FUNC(mpxmont_mul4_amd64_sse2) endprologue mov DV, rdi - #endif #if ABI_WIN @@ -980,9 +1007,8 @@ FUNC(mpxmont_mul4_amd64_sse2) endprologue mov rdi, DV - mov N, [rsp + 224] - mov MI, [rsp + 232] - + mov N, [SP + 224] + mov MI, [SP + 232] #endif // Establish the expanded operands. @@ -1064,7 +1090,6 @@ FUNC(mpxmont_mul4_amd64_sse2) #endif #if ABI_WIN - rstrxmm xmm6, 0 rstrxmm xmm7, 16 rstrxmm xmm8, 32 @@ -1080,7 +1105,6 @@ FUNC(mpxmont_mul4_amd64_sse2) popreg r12 popreg rdi popreg rbx - #endif ret @@ -1136,7 +1160,6 @@ FUNC(mpxmont_redc4_amd64_sse2) // c rcx r9 #if ABI_SYSV - # define DVL rax # define DVL4 rsi # define MI r8 @@ -1151,11 +1174,9 @@ FUNC(mpxmont_redc4_amd64_sse2) endprologue mov DV, rdi - #endif #if ABI_WIN - # define DVL rax # define DVL4 rdx # define MI r10 @@ -1185,8 +1206,7 @@ FUNC(mpxmont_redc4_amd64_sse2) endprologue mov rdi, DV - mov MI, [rsp + 224] - + mov MI, [SP + 224] #endif // Establish the expanded operands and the blocks-of-4 dv limit. @@ -1269,7 +1289,6 @@ FUNC(mpxmont_redc4_amd64_sse2) #endif #if ABI_WIN - rstrxmm xmm6, 0 rstrxmm xmm7, 16 rstrxmm xmm8, 32 @@ -1285,7 +1304,6 @@ FUNC(mpxmont_redc4_amd64_sse2) popreg r12 popreg rdi popreg rbx - #endif ret @@ -1329,9 +1347,9 @@ ENDFUNC # define ARG6 STKARG(2) # define ARG7 STKARG(3) # define ARG8 STKARG(4) -# define STKARG_OFFSET 40 +# define STKARG_OFFSET 224 #endif -#define STKARG(i) [rsp + STKARG_OFFSET + 8*(i)] +#define STKARG(i) [SP + STKARG_OFFSET + 8*(i)] // sysv win // dmul smul mmul mont dmul smul mmul mont @@ -1386,7 +1404,7 @@ ENDFUNC mov rbx, r8 movdqu xmm8, [r9] movdqu xmm10, [rax] - mov r8, STKARG(1) + mov r8d, STKARG(1) mov r9, STKARG(2) mov r10, rdx mov r11, rcx @@ -1395,7 +1413,7 @@ ENDFUNC .ifeqs "\mode", "mont" mov rbx, rcx movdqu xmm8, [r8] - mov r8, r9 + mov r8d, r9d mov r9, STKARG(0) mov r10, rdx mov rcx, rsi @@ -1423,16 +1441,16 @@ ENDFUNC mov rbx, r9 movdqu xmm8, [r10] movdqu xmm10, [r11] - mov r8, STKARG(2) - mov r9, STKARG(3) mov r11, r8 + mov r8d, STKARG(2) + mov r9, STKARG(3) .endif .ifeqs "\mode", "smul" mov rdi, rcx mov rcx, rdx mov rbx, r8 movdqu xmm10, [r9] - mov r8, STKARG(0) + mov r8d, STKARG(0) mov r9, STKARG(1) .endif .ifeqs "\mode", "mmul" @@ -1443,10 +1461,10 @@ ENDFUNC mov rbx, STKARG(0) movdqu xmm8, [r10] movdqu xmm10, [r11] - mov r8, STKARG(3) - mov r9, STKARG(4) mov r10, r8 mov r11, r9 + mov r8d, STKARG(3) + mov r9, STKARG(4) .endif .ifeqs "\mode", "mont" mov r10, STKARG(0) @@ -1454,9 +1472,9 @@ ENDFUNC mov rcx, rdx mov rbx, r9 movdqu xmm8, [r10] - mov r8, STKARG(1) - mov r9, STKARG(2) mov r10, r8 + mov r8d, STKARG(1) + mov r9, STKARG(2) .endif #endif @@ -1550,6 +1568,16 @@ FUNC(test_mul4) testepilogue ENDFUNC +FUNC(test_mul4zc) + testprologue smul + testldcarry + testtop nil + call mul4zc + testtail + testcarryout + testepilogue +ENDFUNC + FUNC(test_mla4) testprologue smul testldcarry @@ -1560,6 +1588,16 @@ FUNC(test_mla4) testepilogue ENDFUNC +FUNC(test_mla4zc) + testprologue smul + testldcarry + testtop nil + call mla4zc + testtail + testcarryout + testepilogue +ENDFUNC + FUNC(test_mmul4) testprologue mmul testtop r11
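
The carry representation and the limited carry propagation described in the header comment (emit z = d mod B, push floor(d/B) and floor(c''_0/B) into the next carry, then cycle the carry registers) can also be modelled in portable C. The following scalar sketch mirrors the structure of a `mul4zc'-style pass (or `mul4' when carries are preloaded) -- four multiply-accumulate passes, each followed by one propagation step -- and checks itself against an ordinary schoolbook product. All of the names (`carry', `mulacc', `carry_step', `mul4_model', `ref_mul') are invented for the example; they are not symbols from the patch, and no attempt is made to reproduce the register allocation or the Montgomery variants.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* A carry holds c = lo + hi * 2^16, matching one XMM carry register
 * (lo = c', hi = c'').  Four of them represent c_0 + c_1 B + c_2 B^2 +
 * c_3 B^3, with b = 2^16 and B = 2^32.
 */
struct carry { uint64_t lo, hi; };

/* Accumulate the partial product r s, with s = s' + s'' b, into *c: the
 * scalar analogue of one PMULUDQ/PADDQ pair.
 */
static void mulacc(struct carry *c, uint32_t r, uint32_t s)
{
        c->lo += (uint64_t)r*(s & 0xffff);      /* r s' */
        c->hi += (uint64_t)r*(s >> 16);         /* r s'' */
}

/* Emit the next output digit from c[0], push what's left into c[1], and
 * cycle the carries so that c[1] becomes c[0] and a fresh zero becomes c[3].
 */
static uint32_t carry_step(struct carry c[4])
{
        uint64_t t = (uint32_t)c[0].hi;         /* t = c''_0 mod B */
        uint64_t d = c[0].lo + (t << 16);       /* d = c'_0 + t b */
        uint32_t z = (uint32_t)d;               /* z = d mod B */

        c[1].lo += d >> 32;                     /* floor(d/B) */
        c[1].hi += c[0].hi >> 32;               /* floor(c''_0/B) */
        c[0] = c[1]; c[1] = c[2]; c[2] = c[3];
        c[3].lo = c[3].hi = 0;
        return z;
}

/* Four passes: pass i multiplies digit x_i by all four digits of y, then
 * emits one digit.  With zero initial carries this produces the low four
 * digits of x y, leaving the remaining value in the carries.
 */
static void mul4_model(uint32_t z[4], const uint32_t x[4],
                       const uint32_t y[4], struct carry c[4])
{
        for (int i = 0; i < 4; i++) {
                for (int j = 0; j < 4; j++) mulacc(&c[j], x[i], y[j]);
                z[i] = carry_step(c);
        }
}

/* Plain schoolbook reference: z[0..7] = x[0..3] * y[0..3]. */
static void ref_mul(uint32_t z[8], const uint32_t x[4], const uint32_t y[4])
{
        memset(z, 0, 8*sizeof(uint32_t));
        for (int i = 0; i < 4; i++) {
                uint64_t t = 0;
                for (int j = 0; j < 4; j++) {
                        t += (uint64_t)x[i]*y[j] + z[i + j];
                        z[i + j] = (uint32_t)t; t >>= 32;
                }
                z[i + 4] = (uint32_t)t;
        }
}

int main(void)
{
        static const uint32_t x[4] = { 0x89abcdef, 0x01234567, 0xdeadbeef, 0x0badf00d };
        static const uint32_t y[4] = { 0xfedcba98, 0x76543210, 0xcafebabe, 0x00c0ffee };
        struct carry c[4] = { { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } };
        uint32_t z[8], zref[8];

        mul4_model(z, x, y, c);                         /* low four digits */
        for (int i = 4; i < 8; i++) z[i] = carry_step(c); /* drain the high half */

        ref_mul(zref, x, y);
        printf("%s\n", memcmp(z, zref, sizeof(z)) ? "mismatch" : "match");
        return 0;
}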