From 3119b3ae038717e3860a7ee0fda55098af389809 Mon Sep 17 00:00:00 2001 From: Mark Wooding Date: Wed, 4 Jan 2017 01:42:16 +0000 Subject: [PATCH] math/mpx-mul4-amd64-sse2.S: SSE2 multipliers for AMD64. Plus the various hangers on. --- math/Makefile.am | 3 + math/mpmont.c | 16 + math/mpx-mul4-amd64-sse2.S | 1577 ++++++++++++++++++++++++++++++++++++++++++++ math/mpx.c | 8 + math/t/mpmont | 5 + 5 files changed, 1609 insertions(+) create mode 100644 math/mpx-mul4-amd64-sse2.S diff --git a/math/Makefile.am b/math/Makefile.am index 69940ba5..0afee1f1 100644 --- a/math/Makefile.am +++ b/math/Makefile.am @@ -184,6 +184,9 @@ EXTRA_DIST += t/mpx if CPUFAM_X86 libmath_la_SOURCES += mpx-mul4-x86-sse2.S endif +if CPUFAM_AMD64 +libmath_la_SOURCES += mpx-mul4-amd64-sse2.S +endif ## A quick-and-dirty parser, used for parsing descriptions of groups, fields, ## etc. diff --git a/math/mpmont.c b/math/mpmont.c index 774e8523..fe1e6457 100644 --- a/math/mpmont.c +++ b/math/mpmont.c @@ -92,12 +92,20 @@ static void simple_redccore(mpw *dv, mpw *dvl, const mpw *mv, MAYBE_REDC4(x86_sse2) #endif +#if CPUFAM_AMD64 + MAYBE_REDC4(amd64_sse2) +#endif + static redccore__functype *pick_redccore(void) { #if CPUFAM_X86 DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_x86_sse2, cpu_feature_p(CPUFEAT_X86_SSE2)); #endif +#if CPUFAM_AMD64 + DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_amd64_sse2, + cpu_feature_p(CPUFEAT_X86_SSE2)); +#endif DISPATCH_PICK_FALLBACK(mpmont_reduce, simple_redccore); } @@ -184,12 +192,20 @@ static void simple_mulcore(mpw *dv, mpw *dvl, MAYBE_MUL4(x86_sse2) #endif +#if CPUFAM_AMD64 + MAYBE_MUL4(amd64_sse2) +#endif + static mulcore__functype *pick_mulcore(void) { #if CPUFAM_X86 DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_x86_sse2, cpu_feature_p(CPUFEAT_X86_SSE2)); #endif +#if CPUFAM_AMD64 + DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_amd64_sse2, + cpu_feature_p(CPUFEAT_X86_SSE2)); +#endif DISPATCH_PICK_FALLBACK(mpmont_mul, simple_mulcore); } diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S new file mode 100644 index 00000000..2d78a992 --- /dev/null +++ b/math/mpx-mul4-amd64-sse2.S @@ -0,0 +1,1577 @@ +/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*- +/// +/// Large SIMD-based multiplications +/// +/// (c) 2016 Straylight/Edgeware +/// + +///----- Licensing notice --------------------------------------------------- +/// +/// This file is part of Catacomb. +/// +/// Catacomb is free software; you can redistribute it and/or modify +/// it under the terms of the GNU Library General Public License as +/// published by the Free Software Foundation; either version 2 of the +/// License, or (at your option) any later version. +/// +/// Catacomb is distributed in the hope that it will be useful, +/// but WITHOUT ANY WARRANTY; without even the implied warranty of +/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +/// GNU Library General Public License for more details. +/// +/// You should have received a copy of the GNU Library General Public +/// License along with Catacomb; if not, write to the Free +/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +/// MA 02111-1307, USA. + +///-------------------------------------------------------------------------- +/// External definitions. + +#include "config.h" +#include "asm-common.h" + +///-------------------------------------------------------------------------- +/// Prologue. + + .arch pentium4 + .text + +///-------------------------------------------------------------------------- +/// Theory. 
+///
+/// We define a number of primitive fixed-size multipliers from which we can
+/// construct more general variable-length multipliers.
+///
+/// The basic trick is the same throughout.  In an operand-scanning
+/// multiplication, the inner multiplication loop multiplies a
+/// multiple-precision operand by a single-precision factor, and adds the
+/// product, appropriately shifted, to the result.  A `finely integrated
+/// operand scanning' implementation of Montgomery multiplication also adds
+/// the product of a single-precision `Montgomery factor' and the modulus,
+/// calculated in the same pass.  The more common `coarsely integrated
+/// operand scanning' alternates main multiplication and Montgomery passes,
+/// which requires additional carry propagation.
+///
+/// Throughout both plain-multiplication and Montgomery stages, then, one of
+/// the factors remains constant throughout the operation, so we can afford
+/// to take a little time to preprocess it.  The transformation we perform is
+/// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
+/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
+/// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
+/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
+/// operands, as follows.
+///
+///         Offset     0        4        8       12
+///            0     v'_0     v'_1    v''_0    v''_1
+///           16     v'_2     v'_3    v''_2    v''_3
+///
+/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
+/// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
+/// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
+/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
+/// results in 64-bit fields.  The sixteen bits of headroom allows us to add
+/// many products together before we must deal with carrying; it also allows
+/// for some calculations to be performed on the above expanded form.
+///
+/// ...
+///
+/// We maintain four `carry' registers accumulating intermediate results.
+/// The registers' precise roles rotate during the computation; we name them
+/// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
+/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
+/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
+/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
+/// `pmuludq' instruction acting on a scalar operand (broadcast across all
+/// lanes of its vector) and an operand in the expanded form above produces a
+/// result which can be added directly to the appropriate carry register.
+/// Following a pass of four multiplications, we perform some limited carry
+/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
+/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
+/// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
+/// zeroed and becomes c3.
+
+///--------------------------------------------------------------------------
+/// Macro definitions.
+
+.macro	mulcore	r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
+	// Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
+	// of the product in registers D0, D1, D2, D3.
+	pshufd	\d0, \r, SHUF(3, \i, 3, \i) // (r_i, ?, r_i, ?)
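+	// (Lanes 0 and 2 are the ones `pmuludq' reads, so broadcasting r_i
+	// into both lets a single multiplication handle the low and high
+	// halves of the expanded operand together.)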
+ .ifnes "\d1", "nil" + movdqa \d1, \slo // (s'_0, s'_1, s''_0, s''_1) + .endif + .ifnes "\d3", "nil" + movdqa \d3, \shi // (s'_2, s'_3, s''_2, s''_3) + .endif + .ifnes "\d1", "nil" + psrldq \d1, 4 // (s'_1, s''_0, s''_1, 0) + .endif + .ifnes "\d2", "nil" + movdqa \d2, \d0 // another copy of (r_i, ?, r_i, ?) + .endif + .ifnes "\d3", "nil" + psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0) + .endif + .ifnes "\d1", "nil" + pmuludq \d1, \d0 // (r_i s'_1, r_i s''_1) + .endif + .ifnes "\d3", "nil" + pmuludq \d3, \d0 // (r_i s'_3, r_i s''_3) + .endif + .ifnes "\d2", "nil" + pmuludq \d2, \shi // (r_i s'_2, r_i s''_2) + .endif + pmuludq \d0, \slo // (r_i s'_0, r_i s''_0) +.endm + +.macro accum c0, c1=nil, c2=nil, c3=nil + // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding + // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip + // updating that register. + paddq \c0, xmm0 + .ifnes "\c1", "nil" + paddq \c1, xmm1 + .endif + .ifnes "\c2", "nil" + paddq \c2, xmm2 + .endif + .ifnes "\c3", "nil" + paddq \c3, xmm3 + .endif +.endm + +.macro mulacc r, i, slo, shi, c0=nil, c1=nil, c2=nil, c3=nil, z3p=nil + // Multiply R_I by the expanded operand SLO/SHI, and accumulate in + // carry registers C0, C1, C2, C3. If Z3P is `t' then C3 notionally + // contains zero, but needs clearing; in practice, we store the + // product directly rather than attempting to add. On completion, + // XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P is not `t'. + .ifeqs "\z3p", "t" + mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, \c3 + accum \c0, \c1, \c2 + .else + mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, xmm3 + accum \c0, \c1, \c2, \c3 + .endif +.endm + +.macro propout d, pos, c, cc=nil + // Calculate an output word from C, and store it at POS in D; + // propagate carries out from C to CC in preparation for a rotation + // of the carry registers. D is an XMM register; the POS is either + // `lo' or `hi' according to whether the output word should be in + // lane 0 or 1 of D; the high two lanes of D are clobbered. On + // completion, XMM3 is clobbered. If CC is `nil', then the + // contribution which would have been added to it is left in C. + pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B) + psrldq xmm3, 12 // (t, 0, 0, 0) = (t, 0) + pslldq xmm3, 2 // (t b, 0) + paddq \c, xmm3 // (c' + t b, c'') + .ifeqs "\pos", "lo" + movdqa \d, \c + .else + punpckldq \d, \c + .endif + psrlq \c, 32 // floor(c/B) + .ifnes "\cc", "nil" + paddq \cc, \c // propagate up + .endif +.endm + +.macro endprop d, pos, c, t + // On entry, C contains a carry register. On exit, the low 32 bits + // of the value represented in C are written at POS in D, and the + // remaining bits are left at the bottom of T. + movdqa \t, \c + psllq \t, 16 // (?, c'' b) + pslldq \c, 8 // (0, c') + paddq \t, \c // (?, c' + c'' b) + psrldq \t, 8 // c' + c'' b + .ifeqs "\pos", "lo" + movdqa \d, \t + .else + punpckldq \d, \t + .endif + psrldq \t, 4 // floor((c' + c'' b)/B) +.endm + +.macro expand z, a, b, c=nil, d=nil + // On entry, A and C hold packed 128-bit values, and Z is zero. On + // exit, A:B and C:D together hold the same values in expanded + // form. If C is `nil', then only expand A to A:B. 
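+	//
+	// For example, if a_0 = 0x89ab4567 then, on exit, lane 0 of A holds
+	// a'_0 = 0x00004567 and lane 2 holds a''_0 = 0x000089ab, matching
+	// the expanded layout described under `Theory' above.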
+ movdqa \b, \a // (a_0, a_1, a_2, a_3) + .ifnes "\c", "nil" + movdqa \d, \c // (c_0, c_1, c_2, c_3) + .endif + punpcklwd \a, \z // (a'_0, a''_0, a'_1, a''_1) + punpckhwd \b, \z // (a'_2, a''_2, a'_3, a''_3) + .ifnes "\c", "nil" + punpcklwd \c, \z // (c'_0, c''_0, c'_1, c''_1) + punpckhwd \d, \z // (c'_2, c''_2, c'_3, c''_3) + .endif + pshufd \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1) + pshufd \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3) + .ifnes "\c", "nil" + pshufd \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1) + pshufd \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3) + .endif +.endm + +.macro squash c0, c1, c2, c3, t, u, lo, hi=nil + // On entry, C0, C1, C2, C3 are carry registers representing a value + // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2, + // C3, T, and U are clobbered; and the high bits of Y are stored in + // HI, if this is not `nil'. + + // The first step is to eliminate the `double-prime' pieces -- i.e., + // the ones offset by 16 bytes from a 32-bit boundary -- by carrying + // them into the 32-bit-aligned pieces above and below. But before + // we can do that, we must gather them together. + movdqa \t, \c0 + movdqa \u, \c1 + punpcklqdq \t, \c2 // (y'_0, y'_2) + punpckhqdq \c0, \c2 // (y''_0, y''_2) + punpcklqdq \u, \c3 // (y'_1, y'_3) + punpckhqdq \c1, \c3 // (y''_1, y''_3) + + // Now split the double-prime pieces. The high (up to) 48 bits will + // go up; the low 16 bits go down. + movdqa \c2, \c0 + movdqa \c3, \c1 + psllq \c2, 48 + psllq \c3, 48 + psrlq \c0, 16 // high parts of (y''_0, y''_2) + psrlq \c1, 16 // high parts of (y''_1, y''_3) + psrlq \c2, 32 // low parts of (y''_0, y''_2) + psrlq \c3, 32 // low parts of (y''_1, y''_3) + .ifnes "\hi", "nil" + movdqa \hi, \c1 + .endif + pslldq \c1, 8 // high part of (0, y''_1) + + paddq \t, \c2 // propagate down + paddq \u, \c3 + paddq \t, \c1 // and up: (y_0, y_2) + paddq \u, \c0 // (y_1, y_3) + .ifnes "\hi", "nil" + psrldq \hi, 8 // high part of (y''_3, 0) + .endif + + // Finally extract the answer. This complicated dance is better than + // storing to memory and loading, because the piecemeal stores + // inhibit store forwarding. + movdqa \c3, \t // (y_0, y_1) + movdqa \lo, \t // (y^*_0, ?, ?, ?) + psrldq \t, 8 // (y_2, 0) + psrlq \c3, 32 // (floor(y_0/B), ?) + paddq \c3, \u // (y_1 + floor(y_0/B), ?) + movdqa \c1, \c3 // (y^*_1, ?, ?, ?) + psrldq \u, 8 // (y_3, 0) + psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2, ?) + paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2, ?) + punpckldq \lo, \c3 // (y^*_0, y^*_2, ?, ?) + psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?) + paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?) + .ifnes "\hi", "nil" + movdqa \t, \c3 + pxor \u, \u + .endif + punpckldq \c1, \c3 // (y^*_1, y^*_3, ?, ?) + .ifnes "\hi", "nil" + psrlq \t, 32 // very high bits of y + paddq \hi, \t + punpcklqdq \hi, \u // carry up + .endif + punpckldq \lo, \c1 // y mod B^4 +.endm + +.macro carryadd + // On entry, RDI points to a packed addend A, and XMM12, XMM13, XMM14 + // hold the incoming carry registers c0, c1, and c2 representing a + // carry-in C. + // + // On exit, the carry registers, including XMM15, are updated to hold + // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other + // registers are preserved. 
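+	//
+	// The high addend word a_3 is left in XMM15: callers either use it
+	// as the initial value of the fourth carry register c3 (dmla4,
+	// mla4), or later fold it into the high half of c2 (the final
+	// carry fix-up in mpxmont_redc4_amd64_sse2).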
+ movd xmm0, [rdi + 0] // (a_0, 0) + movd xmm1, [rdi + 4] // (a_1, 0) + movd xmm2, [rdi + 8] // (a_2, 0) + movd xmm15, [rdi + 12] // (a_3, 0) + paddq xmm12, xmm0 // (c'_0 + a_0, c''_0) + paddq xmm13, xmm1 // (c'_1 + a_1, c''_1) + paddq xmm14, xmm2 // (c'_2 + a_2, c''_2 + a_3 b) +.endm + +///-------------------------------------------------------------------------- +/// Primitive multipliers and related utilities. + +INTFUNC(carryprop) + // On entry, XMM12, XMM13, and XMM14 hold a 144-bit carry in an + // expanded form. Store the low 128 bits of the represented carry to + // [RDI] as a packed 128-bit value, and leave the remaining 16 bits + // in the low 32 bits of XMM12. On exit, XMM0, XMM1, XMM3, XMM13 and + // XMM14 are clobbered. + endprologue + + propout xmm0, lo, xmm12, xmm13 + propout xmm1, lo, xmm13, xmm14 + propout xmm0, hi, xmm14, nil + endprop xmm1, hi, xmm14, xmm12 + punpckldq xmm0, xmm1 + movdqu [rdi], xmm0 + + ret + +ENDFUNC + +INTFUNC(dmul4) + // On entry, RDI points to the destination buffer; RAX and RBX point + // to the packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the + // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the + // incoming carry registers c0, c1, and c2; c3 is assumed to be zero. + // + // On exit, we write the low 128 bits of the sum C + U V + X Y to + // [RDI], and update the carry registers with the carry out. The + // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose + // registers are preserved. + endprologue + + movdqu xmm4, [rax] + movdqu xmm5, [rbx] + + mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15, t + mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 + propout xmm6, lo, xmm12, xmm13 + + mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t + mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12 + propout xmm7, lo, xmm13, xmm14 + + mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t + mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13 + propout xmm6, hi, xmm14, xmm15 + + mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t + mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14 + propout xmm7, hi, xmm15, xmm12 + + punpckldq xmm6, xmm7 + movdqu [rdi], xmm6 + + ret + +ENDFUNC + +INTFUNC(dmla4) + // On entry, RDI points to the destination buffer, which also + // contains an addend A to accumulate; RAX and RBX point to the + // packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the + // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the + // incoming carry registers c0, c1, and c2 representing a carry-in C; + // c3 is assumed to be zero. + // + // On exit, we write the low 128 bits of the sum A + C + U V + X Y to + // [RDI], and update the carry registers with the carry out. The + // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose + // registers are preserved. 
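+	//
+	// Note that, unlike dmul4, the first multiply-accumulate pass below
+	// doesn't clear c3: carryadd has already left XMM15 = (a_3, 0)
+	// there as the initial fourth carry.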
+ endprologue + + movdqu xmm4, [rax] + movdqu xmm5, [rbx] + carryadd + + mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15 + mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 + propout xmm6, lo, xmm12, xmm13 + + mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t + mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12 + propout xmm7, lo, xmm13, xmm14 + + mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t + mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13 + propout xmm6, hi, xmm14, xmm15 + + mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t + mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14 + propout xmm7, hi, xmm15, xmm12 + + punpckldq xmm6, xmm7 + movdqu [rdi], xmm6 + + ret + +ENDFUNC + +INTFUNC(mul4zc) + // On entry, RDI points to the destination buffer; RBX points to a + // packed operand X; and XMM10/XMM11 hold an expanded operand Y. + // + // On exit, we write the low 128 bits of the product X Y to [RDI], + // and set the carry registers XMM12, XMM13, XMM14 to the carry out. + // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the + // general-purpose registers are preserved. + endprologue + + movdqu xmm5, [rbx] + + mulcore xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 + propout xmm6, lo, xmm12, xmm13 + + mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t + propout xmm7, lo, xmm13, xmm14 + + mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t + propout xmm6, hi, xmm14, xmm15 + + mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t + propout xmm7, hi, xmm15, xmm12 + + punpckldq xmm6, xmm7 + movdqu [rdi], xmm6 + + ret + +ENDFUNC + +INTFUNC(mul4) + // On entry, RDI points to the destination buffer; RBX points to a + // packed operand X; XMM10/XMM11 hold an expanded operand Y; and + // XMM12, XMM13, XMM14 hold the incoming carry registers c0, c1, and + // c2, representing a carry-in C; c3 is assumed to be zero. + // + // On exit, we write the low 128 bits of the sum C + X Y to [RDI], + // and update the carry registers with the carry out. The registers + // XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the + // general-purpose registers are preserved. + endprologue + + movdqu xmm5, [rbx] + + mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, t + propout xmm6, lo, xmm12, xmm13 + + mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t + propout xmm7, lo, xmm13, xmm14 + + mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t + propout xmm6, hi, xmm14, xmm15 + + mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t + propout xmm7, hi, xmm15, xmm12 + + punpckldq xmm6, xmm7 + movdqu [rdi], xmm6 + + ret + +ENDFUNC + +INTFUNC(mla4zc) + // On entry, RDI points to the destination buffer, which also + // contains an addend A to accumulate; RBX points to a packed operand + // X; and XMM10/XMM11 points to an expanded operand Y. + // + // On exit, we write the low 128 bits of the sum A + X Y to [RDI], + // and set the carry registers XMM12, XMM13, XMM14 to the carry out. + // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the + // general-purpose registers are preserved. 
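+	//
+	// Since there is no carry-in, the addend words are loaded straight
+	// into XMM12--XMM15, which then serve as the initial carry
+	// registers for the multiply-accumulate passes below.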
+ endprologue + + movdqu xmm5, [rbx] + movd xmm12, [rdi + 0] + movd xmm13, [rdi + 4] + movd xmm14, [rdi + 8] + movd xmm15, [rdi + 12] + + mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 + propout xmm6, lo, xmm12, xmm13 + + mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t + propout xmm7, lo, xmm13, xmm14 + + mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t + propout xmm6, hi, xmm14, xmm15 + + mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t + propout xmm7, hi, xmm15, xmm12 + + punpckldq xmm6, xmm7 + movdqu [rdi], xmm6 + + ret + +ENDFUNC + +INTFUNC(mla4) + // On entry, RDI points to the destination buffer, which also + // contains an addend A to accumulate; RBX points to a packed operand + // X; XMM10/XMM11 holds an expanded operand Y; and XMM12, XMM13, + // XMM14 hold the incoming carry registers c0, c1, and c2, + // representing a carry-in C; c3 is assumed to be zero. + // + // On exit, we write the low 128 bits of the sum A + C + X Y to + // [RDI], and update the carry registers with the carry out. The + // registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the + // general-purpose registers are preserved. + endprologue + + movdqu xmm5, [rbx] + carryadd + + mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 + propout xmm6, lo, xmm12, xmm13 + + mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t + propout xmm7, lo, xmm13, xmm14 + + mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t + propout xmm6, hi, xmm14, xmm15 + + mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t + propout xmm7, hi, xmm15, xmm12 + + punpckldq xmm6, xmm7 + movdqu [rdi], xmm6 + + ret + +ENDFUNC + +INTFUNC(mmul4) + // On entry, RDI points to the destination buffer; RAX and RBX point + // to the packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold + // the expanded operands V and M. The stack pointer must be 8 modulo 16 + // (as usual for AMD64 ABIs). + // + // On exit, we store Y = U V M mod B in XMM10/XMM11, and write the + // low 128 bits of the sum U V + N Y to [RDI], leaving the remaining + // carry in XMM12, XMM13, and XMM14. The registers XMM0--XMM7, and + // XMM15 are clobbered; the general-purpose registers are preserved. + movdqu xmm4, [rax] +#if ABI_WIN + stalloc 48 + 8 // space for the carries +#endif + endprologue + + // Calculate W = U V, and leave it in XMM7. Stash the carry pieces + // for later. + mulcore xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15 + propout xmm7, lo, xmm12, xmm13 + jmp 5f + +ENDFUNC + +INTFUNC(mmla4) + // On entry, RDI points to the destination buffer, which also + // contains an addend A to accumulate; RAX and RBX point to the + // packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold the + // expanded operands V and M. The stack pointer must be 8 modulo 16 + // (as usual for AMD64 ABIs). + // + // On exit, we store Y = (A + U V) M mod B in XMM10/XMM11, and write + // the low 128 bits of the sum A + U V + N Y to [RDI], leaving the + // remaining carry in XMM12, XMM13, and XMM14. The registers + // XMM0--XMM7, and XMM15 are clobbered; the general-purpose registers + // are preserved. + movdqu xmm4, [rax] +#if ABI_WIN + stalloc 48 + 8 // space for the carries +# define STKTMP(i) [rsp + i] +#endif +#if ABI_SYSV +# define STKTMP(i) [rsp + i - 48 - 8] // use red zone +#endif + endprologue + + movd xmm12, [rdi + 0] + movd xmm13, [rdi + 4] + movd xmm14, [rdi + 8] + movd xmm15, [rdi + 12] + + // Calculate W = U V, and leave it in XMM7. Stash the carry pieces + // for later. 
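+	//
+	// (The carry registers already hold the addend A, with a_3 in
+	// XMM15, so the first pass below accumulates rather than clearing
+	// c3.  From label 5 onwards the code is shared with mmul4 above,
+	// which jumps in once it has produced its first output word.)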
+	mulacc	xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
+	propout	xmm7, lo, xmm12, xmm13
+
+5:	mulacc	xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
+	propout	xmm6, lo, xmm13, xmm14
+
+	mulacc	xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
+	propout	xmm7, hi, xmm14, xmm15
+
+	mulacc	xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
+	propout	xmm6, hi, xmm15, xmm12
+
+	// Prepare W, and stash carries for later.
+	punpckldq xmm7, xmm6
+	movdqa	STKTMP( 0), xmm12
+	movdqa	STKTMP(16), xmm13
+	movdqa	STKTMP(32), xmm14
+
+	// Calculate Y = W M.  We just about have enough spare registers to
+	// make this work.
+	mulcore	xmm7, 0, xmm10, xmm11, xmm3, xmm4, xmm5, xmm6
+
+	// Start expanding W back into the main carry registers...
+	pxor	xmm15, xmm15
+	movdqa	xmm12, xmm7
+	movdqa	xmm14, xmm7
+
+	mulcore	xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2
+	accum	xmm4, xmm5, xmm6
+
+	punpckldq xmm12, xmm15		// (w_0, 0, w_1, 0)
+	punpckhdq xmm14, xmm15		// (w_2, 0, w_3, 0)
+
+	mulcore	xmm7, 2, xmm10, xmm11, xmm0, xmm1
+	accum	xmm5, xmm6
+
+	pxor	xmm2, xmm2
+	movdqa	xmm13, xmm12
+	movdqa	xmm15, xmm14
+
+	mulcore	xmm7, 3, xmm10, xmm11, xmm0
+	accum	xmm6
+
+	punpckldq xmm12, xmm2		// (w_0, 0, 0, 0)
+	punpckldq xmm14, xmm2		// (w_2, 0, 0, 0)
+	punpckhdq xmm13, xmm2		// (w_1, 0, 0, 0)
+	punpckhdq xmm15, xmm2		// (w_3, 0, 0, 0)
+
+	// That's lots of pieces.  Now we have to assemble the answer.
+	squash	xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
+
+	// Expand it.
+	movdqu	xmm5, [rbx]
+	expand	xmm2, xmm10, xmm11
+
+	// Finish the calculation by adding the Montgomery product.
+	mulacc	xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
+	propout	xmm6, lo, xmm12, xmm13
+
+	mulacc	xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
+	propout	xmm7, lo, xmm13, xmm14
+
+	mulacc	xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
+	propout	xmm6, hi, xmm14, xmm15
+
+	mulacc	xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
+	propout	xmm7, hi, xmm15, xmm12
+
+	punpckldq xmm6, xmm7
+
+	// Add on the carry we calculated earlier.
+	paddq	xmm12, STKTMP( 0)
+	paddq	xmm13, STKTMP(16)
+	paddq	xmm14, STKTMP(32)
+
+	// And, with that, we're done.
+	movdqu	[rdi], xmm6
+#if ABI_WIN
+	stfree	56
+#endif
+	ret
+
+#undef STKTMP
+
+ENDFUNC
+
+INTFUNC(mont4)
+	// On entry, RDI points to the destination buffer holding a packed
+	// value W; RBX points to a packed operand N; and XMM8/XMM9 hold an
+	// expanded operand M.
+	//
+	// On exit, we store Y = W M mod B in XMM10/XMM11, and write the low
+	// 128 bits of the sum W + N Y to [RDI], leaving the remaining carry
+	// in XMM12, XMM13, and XMM14.  The registers XMM0--XMM3, XMM5--XMM7,
+	// and XMM15 are clobbered; the general-purpose registers are
+	// preserved.
+	endprologue
+
+	movdqu	xmm7, [rdi]
+
+	// Calculate Y = W M.  Avoid the standard carry registers, because
+	// we're setting something else up there.
+	mulcore	xmm7, 0, xmm8, xmm9, xmm3, xmm4, xmm5, xmm6
+
+	// Start expanding W back into the main carry registers...
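+	// The unpacked words w_0, ..., w_3 end up in XMM12--XMM15 as the
+	// initial carries for the final pass, so that accumulating the
+	// product N Y onto them yields W + N Y as required.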
+ pxor xmm15, xmm15 + movdqa xmm12, xmm7 + movdqa xmm14, xmm7 + + mulcore xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2 + accum xmm4, xmm5, xmm6 + + punpckldq xmm12, xmm15 // (w_0, 0, w_1, 0) + punpckhdq xmm14, xmm15 // (w_2, 0, w_3, 0) + + mulcore xmm7, 2, xmm8, xmm9, xmm0, xmm1 + accum xmm5, xmm6 + + pxor xmm2, xmm2 + movdqa xmm13, xmm12 + movdqa xmm15, xmm14 + + mulcore xmm7, 3, xmm8, xmm9, xmm0 + accum xmm6 + + punpckldq xmm12, xmm2 // (w_0, 0, 0, 0) + punpckldq xmm14, xmm2 // (w_2, 0, 0, 0) + punpckhdq xmm13, xmm2 // (w_1, 0, 0, 0) + punpckhdq xmm15, xmm2 // (w_3, 0, 0, 0) + + // That's lots of pieces. Now we have to assemble the answer. + squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10 + + // Expand it. + movdqu xmm5, [rbx] + expand xmm2, xmm10, xmm11 + + // Finish the calculation by adding the Montgomery product. + mulacc xmm5, 0 xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 + propout xmm6, lo, xmm12, xmm13 + + mulacc xmm5, 1 xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t + propout xmm7, lo, xmm13, xmm14 + + mulacc xmm5, 2 xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t + propout xmm6, hi, xmm14, xmm15 + + mulacc xmm5, 3 xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t + propout xmm7, hi, xmm15, xmm12 + + punpckldq xmm6, xmm7 + + // And, with that, we're done. + movdqu [rdi], xmm6 + ret + +ENDFUNC + +///-------------------------------------------------------------------------- +/// Bulk multipliers. + +FUNC(mpx_umul4_amd64_sse2) + // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl, + // const mpw *bv, const mpw *bvl); + + // Establish the arguments and do initial setup. + // + // sysv win + // inner loop dv rdi rdi* + // inner loop av rbx* rbx* + // outer loop dv r10 rcx + // outer loop bv rcx r9 + // av base rsi rdx + // av limit rdx r8 + // bv limit r8 r10 + +#if ABI_SYSV +# define DV r10 +# define AV rsi +# define AVL rdx +# define BV rcx +# define BVL r8 + + pushreg rbx + endprologue + + mov DV, rdi + +#endif + +#if ABI_WIN +# define DV rcx +# define AV rdx +# define AVL r8 +# define BV r9 +# define BVL r10 + + pushreg rbx + pushreg rdi + stalloc 160 + 8 + + savexmm xmm6, 0 + savexmm xmm7, 16 + savexmm xmm8, 32 + savexmm xmm9, 48 + savexmm xmm10, 64 + savexmm xmm11, 80 + savexmm xmm12, 96 + savexmm xmm13, 112 + savexmm xmm14, 128 + savexmm xmm15, 144 + + endprologue + + mov rdi, DV + mov BVL, [rsp + 224] + +#endif + + // Prepare for the first iteration. + pxor xmm0, xmm0 + movdqu xmm10, [BV] // bv[0] + mov rbx, AV + add DV, 16 + add BV, 16 + expand xmm0, xmm10, xmm11 + call mul4zc + add rbx, 16 + add rdi, 16 + cmp rbx, AVL // all done? + jae 8f + + .p2align 4 + // Continue with the first iteration. +0: call mul4 + add rbx, 16 + add rdi, 16 + cmp rbx, AVL // all done? + jb 0b + + // Write out the leftover carry. There can be no tail here. +8: call carryprop + cmp BV, BVL // more passes to do? + jae 9f + + .p2align 4 + // Set up for the next pass. +1: movdqu xmm10, [BV] // bv[i] + mov rdi, DV // -> dv[i] + pxor xmm0, xmm0 + expand xmm0, xmm10, xmm11 + mov rbx, AV // -> av[0] + add DV, 16 + add BV, 16 + call mla4zc + add rbx, 16 + add rdi, 16 + cmp rbx, AVL // done yet? + jae 8f + + .p2align 4 + // Continue... +0: call mla4 + add rbx, 16 + add rdi, 16 + cmp rbx, AVL + jb 0b + + // Finish off this pass. There was no tail on the previous pass, and + // there can be none on this pass. +8: call carryprop + cmp BV, BVL + jb 1b + + // All over. 
+9: + +#if ABI_SYSV + popreg rbx +#endif + +#if ABI_WIN + + rstrxmm xmm6, 0 + rstrxmm xmm7, 16 + rstrxmm xmm8, 32 + rstrxmm xmm9, 48 + rstrxmm xmm10, 64 + rstrxmm xmm11, 80 + rstrxmm xmm12, 96 + rstrxmm xmm13, 112 + rstrxmm xmm14, 128 + rstrxmm xmm15, 144 + + stfree 160 + 8 + popreg rdi + popreg rbx + +#endif + + ret + +#undef DV +#undef AV +#undef AVL +#undef BV +#undef BVL + +ENDFUNC + +FUNC(mpxmont_mul4_amd64_sse2) + // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv, + // const mpw *nv, size_t n, const mpw *mi); + + // Establish the arguments and do initial setup. + // + // sysv win + // inner loop dv rdi rdi* + // inner loop av rax rax + // inner loop nv rbx* rbx* + // mi r9 r10 + // outer loop dv r10 rcx + // outer loop bv rdx r8 + // av base rsi rdx + // av limit r11 r11 + // bv limit r8 r12* + // nv base rcx r9 + // n r8 r12* + +#if ABI_SYSV +# define DV r10 +# define AV rsi +# define AVL r11 +# define BV rdx +# define BVL r8 +# define NV rcx +# define N r8 +# define MI r9 + + pushreg rbx + endprologue + + mov DV, rdi + +#endif + +#if ABI_WIN +# define DV rcx +# define AV rdx +# define AVL r11 +# define BV r8 +# define BVL r12 +# define NV r9 +# define N r12 +# define MI r10 + + pushreg rbx + pushreg rdi + pushreg r12 + stalloc 160 + + savexmm xmm6, 0 + savexmm xmm7, 16 + savexmm xmm8, 32 + savexmm xmm9, 48 + savexmm xmm10, 64 + savexmm xmm11, 80 + savexmm xmm12, 96 + savexmm xmm13, 112 + savexmm xmm14, 128 + savexmm xmm15, 144 + + endprologue + + mov rdi, DV + mov N, [rsp + 224] + mov MI, [rsp + 232] + +#endif + + // Establish the expanded operands. + pxor xmm0, xmm0 + movdqu xmm8, [BV] // bv[0] + movdqu xmm10, [MI] // mi + expand xmm0, xmm8, xmm9, xmm10, xmm11 + + // Set up the outer loop state and prepare for the first iteration. + mov rax, AV // -> U = av[0] + mov rbx, NV // -> X = nv[0] + lea AVL, [AV + 4*N] // -> av[n/4] = av limit + lea BVL, [BV + 4*N] // -> bv[n/4] = bv limit + add BV, 16 + add DV, 16 + call mmul4 + add rdi, 16 + add rax, 16 + add rbx, 16 + cmp rax, AVL // done already? + jae 8f + + .p2align 4 + // Complete the first inner loop. +0: call dmul4 + add rdi, 16 + add rax, 16 + add rbx, 16 + cmp rax, AVL // done yet? + jb 0b + + // Still have carries left to propagate. + call carryprop + movd [rdi + 16], xmm12 + + .p2align 4 + // Embark on the next iteration. (There must be one. If n = 1, then + // we would have bailed above, to label 8. Similarly, the subsequent + // iterations can fall into the inner loop immediately.) +1: pxor xmm0, xmm0 + movdqu xmm8, [BV] // bv[i] + movdqu xmm10, [MI] // mi + mov rdi, DV // -> Z = dv[i] + mov rax, AV // -> U = av[0] + mov rbx, NV // -> X = nv[0] + expand xmm0, xmm8, xmm9, xmm10, xmm11 + add BV, 16 + add DV, 16 + call mmla4 + add rdi, 16 + add rax, 16 + add rbx, 16 + + .p2align 4 + // Complete the next inner loop. +0: call dmla4 + add rdi, 16 + add rax, 16 + add rbx, 16 + cmp rax, AVL + jb 0b + + // Still have carries left to propagate, and they overlap the + // previous iteration's final tail, so read that in and add it. + movd xmm0, [rdi] + paddq xmm12, xmm0 + call carryprop + movd [rdi + 16], xmm12 + + // Back again, maybe. + cmp BV, BVL + jb 1b + + // All done. 
+9: + +#if ABI_SYSV + popreg rbx +#endif + +#if ABI_WIN + + rstrxmm xmm6, 0 + rstrxmm xmm7, 16 + rstrxmm xmm8, 32 + rstrxmm xmm9, 48 + rstrxmm xmm10, 64 + rstrxmm xmm11, 80 + rstrxmm xmm12, 96 + rstrxmm xmm13, 112 + rstrxmm xmm14, 128 + rstrxmm xmm15, 144 + + stfree 160 + popreg r12 + popreg rdi + popreg rbx + +#endif + + ret + + // First iteration was short. Write out the carries and we're done. + // (This could be folded into the main loop structure, but that would + // penalize small numbers more.) +8: call carryprop + movd [rdi + 16], xmm12 +#if ABI_SYSV + popreg rbx + ret +#endif +#if ABI_WIN + jmp 9b +#endif + +#undef DV +#undef AV +#undef AVL +#undef BV +#undef BVL +#undef NV +#undef N +#undef MI + +ENDFUNC + +FUNC(mpxmont_redc4_amd64_sse2) + // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv, + // size_t n, const mpw *mi); + + // Establish the arguments and do initial setup. + // + // sysv win + // inner loop dv rdi rdi* + // dv limit rax rax + // blocks-of-4 dv limit rsi rdx + // inner loop nv rbx* rbx* + // mi r8 r10 + // outer loop dv r10 rcx + // outer loop dv limit r11 r11 + // nv base rdx r8 + // nv limit r9 r12* + // n rcx r9 + // c rcx r9 + +#if ABI_SYSV + +# define DVL rax +# define DVL4 rsi +# define MI r8 +# define DV r10 +# define DVLO r11 +# define NV rdx +# define NVL r9 +# define N rcx +# define C ecx + + pushreg rbx + endprologue + + mov DV, rdi + +#endif + +#if ABI_WIN + +# define DVL rax +# define DVL4 rdx +# define MI r10 +# define DV rcx +# define DVLO r11 +# define NV r8 +# define NVL r12 +# define N r9 +# define C r9d + + pushreg rbx + pushreg rdi + pushreg r12 + stalloc 160 + + savexmm xmm6, 0 + savexmm xmm7, 16 + savexmm xmm8, 32 + savexmm xmm9, 48 + savexmm xmm10, 64 + savexmm xmm11, 80 + savexmm xmm12, 96 + savexmm xmm13, 112 + savexmm xmm14, 128 + savexmm xmm15, 144 + + endprologue + + mov rdi, DV + mov MI, [rsp + 224] + +#endif + + // Establish the expanded operands and the blocks-of-4 dv limit. + pxor xmm0, xmm0 + mov DVL, DVL4 // -> dv[n] = dv limit + sub DVL4, DV // length of dv in bytes + movdqu xmm8, [MI] // mi + and DVL4, ~15 // mask off the tail end + expand xmm0, xmm8, xmm9 + add DVL4, DV // find limit + + // Set up the outer loop state and prepare for the first iteration. + mov rbx, NV // -> X = nv[0] + lea DVLO, [DV + 4*N] // -> dv[n/4] = outer dv limit + lea NVL, [NV + 4*N] // -> nv[n/4] = nv limit + add DV, 16 + call mont4 + add rbx, 16 + add rdi, 16 + cmp rbx, NVL // done already? + jae 8f + + .p2align 4 + // Complete the first inner loop. +5: call mla4 + add rbx, 16 + add rdi, 16 + cmp rbx, NVL // done yet? + jb 5b + + // Still have carries left to propagate. +8: carryadd + psllq xmm15, 16 + pslldq xmm15, 8 + paddq xmm14, xmm15 + call carryprop + movd C, xmm12 + add rdi, 16 + cmp rdi, DVL4 + jae 7f + + .p2align 4 + // Continue carry propagation until the end of the buffer. +0: add [rdi], C + mov C, 0 // preserves flags + adcd [rdi + 4], 0 + adcd [rdi + 8], 0 + adcd [rdi + 12], 0 + adc C, 0 + add rdi, 16 + cmp rdi, DVL4 + jb 0b + + // Deal with the tail end. +7: add [rdi], C + mov C, 0 // preserves flags + add rdi, 4 + adc C, 0 + cmp rdi, DVL + jb 7b + + // All done for this iteration. Start the next. (This must have at + // least one follow-on iteration, or we'd not have started this outer + // loop.) +8: mov rdi, DV // -> Z = dv[i] + mov rbx, NV // -> X = nv[0] + cmp rdi, DVLO // all done yet? + jae 9f + add DV, 16 + call mont4 + add rdi, 16 + add rbx, 16 + jmp 5b + + // All over. 
+9: + +#if ABI_SYSV + popreg rbx +#endif + +#if ABI_WIN + + rstrxmm xmm6, 0 + rstrxmm xmm7, 16 + rstrxmm xmm8, 32 + rstrxmm xmm9, 48 + rstrxmm xmm10, 64 + rstrxmm xmm11, 80 + rstrxmm xmm12, 96 + rstrxmm xmm13, 112 + rstrxmm xmm14, 128 + rstrxmm xmm15, 144 + + stfree 160 + popreg r12 + popreg rdi + popreg rbx + +#endif + + ret + +#undef DVL +#undef DVL4 +#undef MI +#undef DV +#undef DVLO +#undef NV +#undef NVL +#undef N +#undef C + +ENDFUNC + +///-------------------------------------------------------------------------- +/// Testing and performance measurement. + +#ifdef TEST_MUL4 + +#if ABI_SYSV +# define ARG0 rdi +# define ARG1 rsi +# define ARG2 rdx +# define ARG3 rcx +# define ARG4 r8 +# define ARG5 r9 +# define ARG6 STKARG(0) +# define ARG7 STKARG(1) +# define ARG8 STKARG(2) +# define STKARG_OFFSET 16 +#endif +#if ABI_WIN +# define ARG0 rcx +# define ARG1 rdx +# define ARG2 r8 +# define ARG3 r9 +# define ARG4 STKARG(0) +# define ARG5 STKARG(1) +# define ARG6 STKARG(2) +# define ARG7 STKARG(3) +# define ARG8 STKARG(4) +# define STKARG_OFFSET 40 +#endif +#define STKARG(i) [rsp + STKARG_OFFSET + 8*(i)] + +// sysv win +// dmul smul mmul mont dmul smul mmul mont +// A rax +// D rdx +// z rdi rdi rdi rdi rdi rcx rcx rcx rcx +// c rcx rsi rsi rsi rsi rdx rdx rdx rdx +// y r10 -- -- rdx rdx -- -- r8 r8 +// u r11 rdx -- rcx -- r8 -- r9 -- +// x rbx rcx rdx r8 rcx r9 r8 stk0 r9 +// vv xmm8/9 r8 -- r9 r8 stk0 -- stk1 stk0 +// yy xmm10/11 r9 rcx stk0 -- stk1 r9 stk2 -- +// n r8 stk0 r8 stk1 r9 stk2 stk0 stk3 stk1 +// cyv r9 stk1 r9 stk2 stk0 stk3 stk1 stk4 stk2 + +.macro cysetup v, n + rdtsc + shl rdx, 32 + or rax, rdx + mov [\v + 8*\n - 8], rax +.endm + +.macro cystore v, n + rdtsc + shl rdx, 32 + or rax, rdx + sub rax, [\v + 8*\n - 8] + mov [\v + 8*\n - 8], rax + dec \n +.endm + +.macro testprologue mode + pushreg rbx +#if ABI_SYSV + endprologue + .ifeqs "\mode", "dmul" + mov rbx, rcx + movdqu xmm8, [r8] + movdqu xmm10, [r9] + mov r8d, STKARG(0) + mov r9, STKARG(1) + mov r11, rdx + mov rcx, rsi + .endif + .ifeqs "\mode", "smul" + mov rbx, rdx + movdqu xmm10, [rcx] + mov rcx, rsi + .endif + .ifeqs "\mode", "mmul" + mov rax, STKARG(0) + mov rbx, r8 + movdqu xmm8, [r9] + movdqu xmm10, [rax] + mov r8, STKARG(1) + mov r9, STKARG(2) + mov r10, rdx + mov r11, rcx + mov rcx, rsi + .endif + .ifeqs "\mode", "mont" + mov rbx, rcx + movdqu xmm8, [r8] + mov r8, r9 + mov r9, STKARG(0) + mov r10, rdx + mov rcx, rsi + .endif +#endif +#if ABI_WIN + pushreg rdi + stalloc 168 + savexmm xmm6, 0 + savexmm xmm7, 16 + savexmm xmm8, 32 + savexmm xmm9, 48 + savexmm xmm10, 64 + savexmm xmm11, 80 + savexmm xmm12, 96 + savexmm xmm13, 112 + savexmm xmm14, 128 + savexmm xmm15, 144 + endprologue + .ifeqs "\mode", "dmul" + mov r10, STKARG(0) + mov r11, STKARG(1) + mov rdi, rcx + mov rcx, rdx + mov rbx, r9 + movdqu xmm8, [r10] + movdqu xmm10, [r11] + mov r8, STKARG(2) + mov r9, STKARG(3) + mov r11, r8 + .endif + .ifeqs "\mode", "smul" + mov rdi, rcx + mov rcx, rdx + mov rbx, r8 + movdqu xmm10, [r9] + mov r8, STKARG(0) + mov r9, STKARG(1) + .endif + .ifeqs "\mode", "mmul" + mov r10, STKARG(1) + mov r11, STKARG(2) + mov rdi, rcx + mov rcx, rdx + mov rbx, STKARG(0) + movdqu xmm8, [r10] + movdqu xmm10, [r11] + mov r8, STKARG(3) + mov r9, STKARG(4) + mov r10, r8 + mov r11, r9 + .endif + .ifeqs "\mode", "mont" + mov r10, STKARG(0) + mov rdi, rcx + mov rcx, rdx + mov rbx, r9 + movdqu xmm8, [r10] + mov r8, STKARG(1) + mov r9, STKARG(2) + mov r10, r8 + .endif +#endif + + pxor xmm0, xmm0 + .ifeqs "\mode", "dmul" + expand xmm0, xmm8, 
xmm9, xmm10, xmm11 + .endif + .ifeqs "\mode", "smul" + expand xmm0, xmm10, xmm11 + .endif + .ifeqs "\mode", "mmul" + expand xmm0, xmm8, xmm9, xmm10, xmm11 + .endif + .ifeqs "\mode", "mont" + expand xmm0, xmm8, xmm9 + .endif +.endm + +.macro testepilogue +#if ABI_WIN + rstrxmm xmm6, 0 + rstrxmm xmm7, 16 + rstrxmm xmm8, 32 + rstrxmm xmm9, 48 + rstrxmm xmm10, 64 + rstrxmm xmm11, 80 + rstrxmm xmm12, 96 + rstrxmm xmm13, 112 + rstrxmm xmm14, 128 + rstrxmm xmm15, 144 + stfree 168 + popreg rdi +#endif + popreg rbx + ret +.endm + +.macro testldcarry + movdqu xmm12, [rcx + 0] // (c'_0, c''_0) + movdqu xmm13, [rcx + 16] // (c'_1, c''_1) + movdqu xmm14, [rcx + 32] // (c'_2, c''_2) +.endm + +.macro testtop u=nil + .p2align 4 +0: + cysetup r9, r8 + .ifnes "\u", "nil" + mov rax, \u + .endif +.endm + +.macro testtail + cystore r9, r8 + jnz 0b +.endm + +.macro testcarryout + movdqu [rcx + 0], xmm12 + movdqu [rcx + 16], xmm13 + movdqu [rcx + 32], xmm14 +.endm + +FUNC(test_dmul4) + testprologue dmul + testldcarry + testtop r11 + call dmul4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_dmla4) + testprologue dmul + testldcarry + testtop r11 + call dmla4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mul4) + testprologue smul + testldcarry + testtop nil + call mul4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mla4) + testprologue smul + testldcarry + testtop nil + call mla4 + testtail + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mmul4) + testprologue mmul + testtop r11 + call mmul4 + testtail + movdqu [r10 + 0], xmm10 + movdqu [r10 + 16], xmm11 + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mmla4) + testprologue mmul + testtop r11 + call mmla4 + testtail + movdqu [r10 + 0], xmm10 + movdqu [r10 + 16], xmm11 + testcarryout + testepilogue +ENDFUNC + +FUNC(test_mont4) + testprologue mont + testtop + call mont4 + testtail + movdqu [r10 + 0], xmm10 + movdqu [r10 + 16], xmm11 + testcarryout + testepilogue +ENDFUNC + +#endif + +///----- That's all, folks -------------------------------------------------- diff --git a/math/mpx.c b/math/mpx.c index 8f1dcede..e759c5f2 100644 --- a/math/mpx.c +++ b/math/mpx.c @@ -914,12 +914,20 @@ static void simple_umul(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl, MAYBE_UMUL4(x86_sse2) #endif +#if CPUFAM_AMD64 + MAYBE_UMUL4(amd64_sse2) +#endif + static mpx_umul__functype *pick_umul(void) { #if CPUFAM_X86 DISPATCH_PICK_COND(mpx_umul, maybe_umul4_x86_sse2, cpu_feature_p(CPUFEAT_X86_SSE2)); #endif +#if CPUFAM_AMD64 + DISPATCH_PICK_COND(mpx_umul, maybe_umul4_amd64_sse2, + cpu_feature_p(CPUFEAT_X86_SSE2)); +#endif DISPATCH_PICK_FALLBACK(mpx_umul, simple_umul); } diff --git a/math/t/mpmont b/math/t/mpmont index cd2569b7..d6539636 100644 --- a/math/t/mpmont +++ b/math/t/mpmont @@ -25,6 +25,11 @@ mul { 6456542564 10807149256; + 4325987397987458979875737589783 + 1 + 4309747041023999857206910900081 + 4309747041023999857206910900081; + 6277101735386680763835789423207666416083908700390324961279 2455155546008943817740293915197451784769108058161191238065 340282366920938463500268095579187314689 -- 2.11.0