From 444083aef7e70ce9afe893a36d72e1a1a976f1ed Mon Sep 17 00:00:00 2001
From: Mark Wooding
Date: Mon, 12 Sep 2016 22:32:37 +0100
Subject: [PATCH] math/: SSE2-based high-performance multipliers.

---
 math/Makefile.am         |    3 +
 math/mpmont.c            |   89 +++-
 math/mpx-mul4-x86-sse2.S | 1224 ++++++++++++++++++++++++++++++++++++++++++++++
 math/mpx.c               |   43 +-
 math/t/mpmont            |    7 +
 5 files changed, 1354 insertions(+), 12 deletions(-)
 create mode 100644 math/mpx-mul4-x86-sse2.S

diff --git a/math/Makefile.am b/math/Makefile.am
index 908d268b..69940ba5 100644
--- a/math/Makefile.am
+++ b/math/Makefile.am
@@ -181,6 +181,9 @@ TESTS += mpx-kmul.t$(EXEEXT) mpx-ksqr.t$(EXEEXT)
 noinst_PROGRAMS += bittest
 TESTS += bittest
 EXTRA_DIST += t/mpx
+if CPUFAM_X86
+libmath_la_SOURCES += mpx-mul4-x86-sse2.S
+endif
 
 ## A quick-and-dirty parser, used for parsing descriptions of groups, fields,
 ## etc.
diff --git a/math/mpmont.c b/math/mpmont.c
index a86623b2..968766d3 100644
--- a/math/mpmont.c
+++ b/math/mpmont.c
@@ -27,6 +27,8 @@
 
 /*----- Header files ------------------------------------------------------*/
 
+#include "config.h"
+#include "dispatch.h"
 #include "mp.h"
 #include "mpmont.h"
 
@@ -58,8 +60,12 @@
  *		least %$2 n + 1$% words of result.
  */
 
-static void redccore(mpw *dv, mpw *dvl, const mpw *mv,
-		     size_t n, const mpw *mi)
+CPU_DISPATCH(static, (void), void, redccore,
+	     (mpw *dv, mpw *dvl, const mpw *mv, size_t n, const mpw *mi),
+	     (dv, dvl, mv, n, mi), pick_redccore, simple_redccore);
+
+static void simple_redccore(mpw *dv, mpw *dvl, const mpw *mv,
+			    size_t n, const mpw *mi)
 {
   mpw mi0 = *mi;
   size_t i;
@@ -70,6 +76,29 @@ static void redccore(mpw *dv, mpw *dvl, const mpw *mv,
   }
 }
 
+#define MAYBE_REDC4(impl)						\
+  extern void mpxmont_redc4_##impl(mpw *dv, mpw *dvl, const mpw *mv,	\
+				   size_t n, const mpw *mi);		\
+  static void maybe_redc4_##impl(mpw *dv, mpw *dvl, const mpw *mv,	\
+				 size_t n, const mpw *mi)		\
+  {									\
+    if (n%4) simple_redccore(dv, dvl, mv, n, mi);			\
+    else mpxmont_redc4_##impl(dv, dvl, mv, n, mi);			\
+  }
+
+#if CPUFAM_X86
+  MAYBE_REDC4(x86_sse2)
+#endif
+
+static redccore__functype *pick_redccore(void)
+{
+#if CPUFAM_X86
+  DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_x86_sse2,
+		     cpu_feature_p(CPUFEAT_X86_SSE2));
+#endif
+  DISPATCH_PICK_FALLBACK(mpmont_reduce, simple_redccore);
+}
+
 /* --- @redccore@ --- *
  *
  * Arguments:	@mpw *dv, *dvl@ = base and limit of source/destination
@@ -85,10 +114,17 @@ static void redccore(mpw *dv, mpw *dvl, const mpw *mv,
  *		Store in %$d$% the value %$a b + (m' a b \bmod R) m$%.
  */
 
-static void mulcore(mpw *dv, mpw *dvl,
-		    const mpw *av, const mpw *avl,
-		    const mpw *bv, const mpw *bvl,
-		    const mpw *mv, size_t n, const mpw *mi)
+CPU_DISPATCH(static, (void), void, mulcore,
+	     (mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
+	      const mpw *bv, const mpw *bvl, const mpw *mv,
+	      size_t n, const mpw *mi),
+	     (dv, dvl, av, avl, bv, bvl, mv, n, mi),
+	     pick_mulcore, simple_mulcore);
+
+static void simple_mulcore(mpw *dv, mpw *dvl,
+			   const mpw *av, const mpw *avl,
+			   const mpw *bv, const mpw *bvl,
+			   const mpw *mv, size_t n, const mpw *mi)
 {
   mpw ai, b0, y, mi0 = *mi;
   const mpw *tv, *tvl;
@@ -123,6 +159,38 @@ static void mulcore(mpw *dv, mpw *dvl,
   }
 }
 
+#define MAYBE_MUL4(impl)						\
+  extern void mpxmont_mul4_##impl(mpw *dv,				\
+				  const mpw *av, const mpw *bv,		\
+				  const mpw *mv,			\
+				  size_t n, const mpw *mi);		\
+  static void maybe_mul4_##impl(mpw *dv, mpw *dvl,			\
+				const mpw *av, const mpw *avl,		\
+				const mpw *bv, const mpw *bvl,		\
+				const mpw *mv, size_t n, const mpw *mi)	\
+  {									\
+    size_t an = avl - av, bn = bvl - bv;				\
+    if (n%4 || an != n || bn != n)					\
+      simple_mulcore(dv, dvl, av, avl, bv, bvl, mv, n, mi);		\
+    else {								\
+      mpxmont_mul4_##impl(dv, av, bv, mv, n, mi);			\
+      MPX_ZERO(dv + 2*n + 1, dvl);					\
+    }									\
+  }
+
+#if CPUFAM_X86
+  MAYBE_MUL4(x86_sse2)
+#endif
+
+static mulcore__functype *pick_mulcore(void)
+{
+#if CPUFAM_X86
+  DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_x86_sse2,
+		     cpu_feature_p(CPUFEAT_X86_SSE2));
+#endif
+  DISPATCH_PICK_FALLBACK(mpmont_mul, simple_mulcore);
+}
+
 /* --- @finish@ --- *
  *
  * Arguments:	@mpmont *mm@ = pointer to a Montgomery reduction context
@@ -321,13 +389,14 @@ mp *mpmont_mul(mpmont *mm, mp *d, mp *a, mp *b)
 
 mp *mpmont_mul(mpmont *mm, mp *d, mp *a, mp *b)
 {
-  if (mm->n > MPK_THRESH * 3) {
+  size_t n = mm->n;
+
+  if (n > MPK_THRESH * 3) {
     d = mp_mul(d, a, b);
     d = mpmont_reduce(mm, d, d);
   } else {
-    a = MP_COPY(a);
-    b = MP_COPY(b);
-    MP_DEST(d, 2*mm->n + 1, a->f | b->f | MP_UNDEF);
+    a = MP_COPY(a); b = MP_COPY(b);
+    MP_DEST(d, 2*n + 1, a->f | b->f | MP_UNDEF);
     mulcore(d->v, d->vl, a->v, a->vl, b->v, b->vl,
 	    mm->m->v, mm->n, mm->mi->v);
     d->f = ((a->f | b->f) & MP_BURN) | ((a->f ^ b->f) & MP_NEG);
diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S
new file mode 100644
index 00000000..c0e1a788
--- /dev/null
+++ b/math/mpx-mul4-x86-sse2.S
@@ -0,0 +1,1224 @@
+/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
+///
+/// Large SIMD-based multiplications
+///
+/// (c) 2016 Straylight/Edgeware
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// Prologue.
+
+	.arch	pentium4
+	.text
+
+///--------------------------------------------------------------------------
+/// Theory.
+///
+/// We define a number of primitive fixed-size multipliers from which we can
+/// construct more general variable-length multipliers.
+///
+/// The basic trick is the same throughout.  In an operand-scanning
+/// multiplication, the inner multiplication loop multiplies a
+/// multiple-precision operand by a single-precision factor, and adds the
+/// product, appropriately shifted, to the result.  A `finely integrated
+/// operand scanning' implementation of Montgomery multiplication also adds
+/// the product of a single-precision `Montgomery factor' and the modulus,
+/// calculated in the same pass.  The more common `coarsely integrated
+/// operand scanning' alternates main multiplication and Montgomery passes,
+/// which requires additional carry propagation.
+///
+/// In both the plain-multiplication and Montgomery stages, then, one of
+/// the factors remains constant throughout the operation, so we can afford
+/// to take a little time to preprocess it.  The transformation we perform
+/// is as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given
+/// a 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i
+/// into two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight
+/// 16-bit pieces are placed into 32-bit cells, and arranged as two 128-bit
+/// SSE operands, as follows.
+///
+///	Offset	   0	   4	   8	  12
+///	   0	 v'_0	 v'_1	v''_0	v''_1
+///	  16	 v'_2	 v'_3	v''_2	v''_3
+///
+/// A `pmuludqd' instruction ignores the odd positions in its operands;
+/// thus, it will act on (say) v'_0 and v''_0 in a single instruction.
+/// Shifting this vector right by 4 bytes brings v'_1 and v''_1 into
+/// position.  We can multiply such a vector by a full 32-bit scalar to
+/// produce two 48-bit results in 64-bit fields.  The sixteen bits of
+/// headroom allow us to add many products together before we must deal
+/// with carrying; they also allow for some calculations to be performed
+/// on the above expanded form.
+///
+/// On 32-bit x86, we are register-starved: the expanded operands are kept
+/// in memory, typically in warm L1 cache.
+///
+/// We maintain four `carry' registers accumulating intermediate results.
+/// The registers' precise roles rotate during the computation; we name
+/// them `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit
+/// halves: the register c0, for example, holds c'_0 (low half) and c''_0
+/// (high half), and represents the value c_0 = c'_0 + c''_0 b; the carry
+/// registers collectively represent the value c_0 + c_1 B + c_2 B^2 +
+/// c_3 B^3.  The `pmuludqd' instruction acting on a scalar operand
+/// (broadcast across all lanes of its vector) and an operand in the
+/// expanded form above produces a result which can be added directly to
+/// the appropriate carry register.  Following a pass of four
+/// multiplications, we perform some limited carry propagation: let
+/// t = c''_0 mod B, and let d = c'_0 + t b; then we output z = d mod B,
+/// add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry registers
+/// around, so that c1 becomes c0, and the old c0, (implicitly) zeroed,
+/// becomes c3.
+
+///--------------------------------------------------------------------------
+/// Macro definitions.
+ +.macro mulcore r, s, d0, d1, d2, d3 + // Load a word r_i from R, multiply by the expanded operand [S], and + // leave the pieces of the product in registers D0, D1, D2, D3. + movd \d0, \r // (r_i, 0, 0, 0) + .ifnes "\d1", "nil" + movdqa \d1, [\s] // (s'_0, s'_1, s''_0, s''_1) + .endif + .ifnes "\d3", "nil" + movdqa \d3, [\s + 16] // (s'_2, s'_3, s''_2, s''_3) + .endif + pshufd \d0, \d0, 0b11001100 // (r_i, ?, r_i, ?) + .ifnes "\d1", "nil" + psrldq \d1, 4 // (s'_1, s''_0, s''_1, 0) + .endif + .ifnes "\d2", "nil" + .ifnes "\d3", "nil" + movdqa \d2, \d3 // another copy of (s'_2, s'_3, ...) + .else + movdqa \d2, \d0 // another copy of (r_i, ?, r_i, ?) + .endif + .endif + .ifnes "\d3", "nil" + psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0) + .endif + .ifnes "\d1", "nil" + pmuludqd \d1, \d0 // (r_i s'_1, r_i s''_1) + .endif + .ifnes "\d3", "nil" + pmuludqd \d3, \d0 // (r_i s'_3, r_i s''_3) + .endif + .ifnes "\d2", "nil" + .ifnes "\d3", "nil" + pmuludqd \d2, \d0 // (r_i s'_2, r_i s''_2) + .else + pmuludqd \d2, [\s + 16] + .endif + .endif + pmuludqd \d0, [\s] // (r_i s'_0, r_i s''_0) +.endm + +.macro accum c0, c1, c2, c3 + paddq \c0, xmm0 + .ifnes "\c1", "nil" + paddq \c1, xmm1 + .endif + .ifnes "\c2", "nil" + paddq \c2, xmm2 + .endif + .ifnes "\c3", "nil" + paddq \c3, xmm3 + .endif +.endm + +.macro mulacc r, s, c0, c1, c2, c3, z3p + // Load a word r_i from R, multiply by the expanded operand [S], + // and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t' + // then C3 notionally contains zero, but needs clearing; in practice, + // we store the product directly rather than attempting to add. On + // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P + // is not `t'. + .ifeqs "\z3p", "t" + mulcore \r, \s, xmm0, xmm1, xmm2, \c3 + accum \c0, \c1, \c2, nil + .else + mulcore \r, \s, xmm0, xmm1, xmm2, xmm3 + accum \c0, \c1, \c2, \c3 + .endif +.endm + +.macro propout d, c, cc + // Calculate an output word from C, and store it in D; propagate + // carries out from C to CC in preparation for a rotation of the + // carry registers. On completion, XMM3 is clobbered. If CC is + // `nil', then the contribution which would have been added to it is + // left in C. + pshufd xmm3, \c, 0b10111111 // (?, ?, ?, t = c'' mod B) + psrldq xmm3, 12 // (t, 0, 0, 0) = (t, 0) + pslldq xmm3, 2 // (t b, 0) + paddq \c, xmm3 // (c' + t b, c'') + movd \d, \c + psrlq \c, 32 // floor(c/B) + .ifnes "\cc", "nil" + paddq \cc, \c // propagate up + .endif +.endm + +.macro endprop d, c, t + // On entry, C contains a carry register. On exit, the low 32 bits + // of the value represented in C are written to D, and the remaining + // bits are left at the bottom of T. + movdqa \t, \c + psllq \t, 16 // (?, c'' b) + pslldq \c, 8 // (0, c') + paddq \t, \c // (?, c' + c'' b) + psrldq \t, 8 // c' + c'' b + movd \d, \t + psrldq \t, 4 // floor((c' + c'' b)/B) +.endm + +.macro expand a, b, c, d, z + // On entry, A and C hold packed 128-bit values, and Z is zero. On + // exit, A:B and C:D together hold the same values in expanded + // form. If C is `nil', then only expand A to A:B. 
+ movdqa \b, \a // (a_0, a_1, a_2, a_3) + .ifnes "\c", "nil" + movdqa \d, \c // (c_0, c_1, c_2, c_3) + .endif + punpcklwd \a, \z // (a'_0, a''_0, a'_1, a''_1) + punpckhwd \b, \z // (a'_2, a''_2, a'_3, a''_3) + .ifnes "\c", "nil" + punpcklwd \c, \z // (c'_0, c''_0, c'_1, c''_1) + punpckhwd \d, \z // (c'_2, c''_2, c'_3, c''_3) + .endif + pshufd \a, \a, 0b11011000 // (a'_0, a'_1, a''_0, a''_1) + pshufd \b, \b, 0b11011000 // (a'_2, a'_3, a''_2, a''_3) + .ifnes "\c", "nil" + pshufd \c, \c, 0b11011000 // (c'_0, c'_1, c''_0, c''_1) + pshufd \d, \d, 0b11011000 // (c'_2, c'_3, c''_2, c''_3) + .endif +.endm + +.macro squash c0, c1, c2, c3, h, t, u + // On entry, C0, C1, C2, C3 are carry registers representing a value + // Y. On exit, C0 holds the low 128 bits of the carry value; C1, C2, + // C3, T, and U are clobbered; and the high bits of Y are stored in + // H, if this is not `nil'. + + // The first step is to eliminate the `double-prime' pieces -- i.e., + // the ones offset by 16 bytes from a 32-bit boundary -- by carrying + // them into the 32-bit-aligned pieces above and below. But before + // we can do that, we must gather them together. + movdqa \t, \c0 + movdqa \u, \c1 + punpcklqdq \t, \c2 // (y'_0, y'_2) + punpckhqdq \c0, \c2 // (y''_0, y''_2) + punpcklqdq \u, \c3 // (y'_1, y'_3) + punpckhqdq \c1, \c3 // (y''_1, y''_3) + + // Now split the double-prime pieces. The high (up to) 48 bits will + // go up; the low 16 bits go down. + movdqa \c2, \c0 + movdqa \c3, \c1 + psllq \c2, 48 + psllq \c3, 48 + psrlq \c0, 16 // high parts of (y''_0, y''_2) + psrlq \c1, 16 // high parts of (y''_1, y''_3) + psrlq \c2, 32 // low parts of (y''_0, y''_2) + psrlq \c3, 32 // low parts of (y''_1, y''_3) + .ifnes "\h", "nil" + movdqa \h, \c1 + .endif + pslldq \c1, 8 // high part of (0, y''_1) + + paddq \t, \c2 // propagate down + paddq \u, \c3 + paddq \t, \c1 // and up: (y_0, y_2) + paddq \u, \c0 // (y_1, y_3) + .ifnes "\h", "nil" + psrldq \h, 8 // high part of (y''_3, 0) + .endif + + // Finally extract the answer. This complicated dance is better than + // storing to memory and loading, because the piecemeal stores + // inhibit store forwarding. + movdqa \c3, \t // (y_0, y_1) + movdqa \c0, \t // (y^*_0, ?, ?, ?) + psrldq \t, 8 // (y_2, 0) + psrlq \c3, 32 // (floor(y_0/B), ?) + paddq \c3, \u // (y_1 + floor(y_0/B), ?) + pslldq \c0, 12 // (0, 0, 0, y^*_0) + movdqa \c1, \c3 // (y^*_1, ?, ?, ?) + psrldq \u, 8 // (y_3, 0) + psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2, ?) + paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2, ?) + pslldq \c1, 12 // (0, 0, 0, y^*_1) + psrldq \c0, 12 // (y^*_0, 0, 0, 0) + movdqa \c2, \c3 // (y^*_2, ?, ?, ?) + psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?) + paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?) + pslldq \c2, 12 // (0, 0, 0, y^*_2) + psrldq \c1, 8 // (0, y^*_1, 0, 0) + psrldq \c2, 4 // (0, 0, y^*_2, 0) + .ifnes "\h", "nil" + movdqu \t, \c3 + pxor \u, \u + .endif + pslldq \c3, 12 // (0, 0, 0, y^*_3) + por \c0, \c1 // (y^*_0, y^*_1, 0, 0) + por \c2, \c3 // (0, 0, y^*_2, y^*_3) + por \c0, \c2 // y mod B^4 + .ifnes "\h", "nil" + psrlq \t, 32 // very high bits of y + paddq \h, \t + punpcklqdq \h, \u // carry up + .endif +.endm + +.macro carryadd + // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6 + // hold the incoming carry registers c0, c1, and c2 representing a + // carry-in C. + // + // On exit, the carry registers, including XMM7, are updated to hold + // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other + // registers are preserved. 
+ movd xmm0, [edi + 0] // (a_0, 0) + movd xmm1, [edi + 4] // (a_1, 0) + movd xmm2, [edi + 8] // (a_2, 0) + movd xmm7, [edi + 12] // (a_3, 0) + paddq xmm4, xmm0 // (c'_0 + a_0, c''_0) + paddq xmm5, xmm1 // (c'_1 + a_1, c''_1) + paddq xmm6, xmm2 // (c'_2 + a_2, c''_2 + a_3 b) +.endm + +///-------------------------------------------------------------------------- +/// Primitive multipliers and related utilities. + + .p2align 4 +carryprop: + // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded + // form. Store the low 128 bits of the represented carry to [EDI] as + // a packed 128-bit value, and leave the remaining 16 bits in the low + // 32 bits of XMM4. On exit, XMM3, XMM5 and XMM6 are clobbered. + propout [edi + 0], xmm4, xmm5 + propout [edi + 4], xmm5, xmm6 + propout [edi + 8], xmm6, nil + endprop [edi + 12], xmm6, xmm4 + ret + + .p2align 4 +dmul4: + // On entry, EDI points to the destination buffer; EAX and EBX point + // to the packed operands U and X; ECX and EDX point to the expanded + // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry + // registers c0, c1, and c2; c3 is assumed to be zero. + // + // On exit, we write the low 128 bits of the sum C + U V + X Y to + // [EDI], and update the carry registers with the carry out. The + // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the + // general-purpose registers are preserved. + mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + propout [edi + 0], xmm4, xmm5 + + mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t + mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil + propout [edi + 4], xmm5, xmm6 + + mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t + mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil + propout [edi + 8], xmm6, xmm7 + + mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t + mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil + propout [edi + 12], xmm7, xmm4 + + ret + + .p2align 4 +dmla4: + // On entry, EDI points to the destination buffer, which also + // contains an addend A to accumulate; EAX and EBX point to the + // packed operands U and X; ECX and EDX point to the expanded + // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry + // registers c0, c1, and c2 representing a carry-in C; c3 is assumed + // to be zero. + // + // On exit, we write the low 128 bits of the sum A + C + U V + X Y to + // [EDI], and update the carry registers with the carry out. The + // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the + // general-purpose registers are preserved. + carryadd + + mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + propout [edi + 0], xmm4, xmm5 + + mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t + mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, nil + propout [edi + 4], xmm5, xmm6 + + mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t + mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, nil + propout [edi + 8], xmm6, xmm7 + + mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t + mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil + propout [edi + 12], xmm7, xmm4 + + ret + + .p2align 4 +mul4zc: + // On entry, EDI points to the destination buffer; EBX points to a + // packed operand X; and EDX points to an expanded operand Y. + // + // On exit, we write the low 128 bits of the product X Y to [EDI], + // and set the carry registers XMM4, XMM5, XMM6 to the carry out. 
+ // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the + // general-purpose registers are preserved. + mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 + propout [edi + 0], xmm4, xmm5 + + mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t + propout [edi + 4], xmm5, xmm6 + + mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t + propout [edi + 8], xmm6, xmm7 + + mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t + propout [edi + 12], xmm7, xmm4 + + ret + + .p2align 4 +mul4: + // On entry, EDI points to the destination buffer; EBX points to a + // packed operand X; EDX points to an expanded operand Y; and XMM4, + // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2, + // representing a carry-in C; c3 is assumed to be zero. + // + // On exit, we write the low 128 bits of the sum C + X Y to [EDI], + // and update the carry registers with the carry out. The registers + // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the + // general-purpose registers are preserved. + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t + propout [edi + 0], xmm4, xmm5 + + mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t + propout [edi + 4], xmm5, xmm6 + + mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t + propout [edi + 8], xmm6, xmm7 + + mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t + propout [edi + 12], xmm7, xmm4 + + ret + + .p2align 4 +mla4zc: + // On entry, EDI points to the destination buffer, which also + // contains an addend A to accumulate; EBX points to a packed operand + // X; and EDX points to an expanded operand Y. + // + // On exit, we write the low 128 bits of the sum A + X Y to [EDI], + // and set the carry registers XMM4, XMM5, XMM6 to the carry out. + // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the + // general-purpose registers are preserved. + movd xmm4, [edi + 0] + movd xmm5, [edi + 4] + movd xmm6, [edi + 8] + movd xmm7, [edi + 12] + + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + propout [edi + 0], xmm4, xmm5 + + mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t + propout [edi + 4], xmm5, xmm6 + + mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t + propout [edi + 8], xmm6, xmm7 + + mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t + propout [edi + 12], xmm7, xmm4 + + ret + + .p2align 4 +mla4: + // On entry, EDI points to the destination buffer, which also + // contains an addend A to accumulate; EBX points to a packed operand + // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold + // the incoming carry registers c0, c1, and c2, representing a + // carry-in C; c3 is assumed to be zero. + // + // On exit, we write the low 128 bits of the sum A + C + X Y to + // [EDI], and update the carry registers with the carry out. The + // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the + // general-purpose registers are preserved. + carryadd + + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + propout [edi + 0], xmm4, xmm5 + + mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t + propout [edi + 4], xmm5, xmm6 + + mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t + propout [edi + 8], xmm6, xmm7 + + mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t + propout [edi + 12], xmm7, xmm4 + + ret + + .p2align 4 +mmul4: + // On entry, EDI points to the destination buffer; EAX and EBX point + // to the packed operands U and N; ECX and ESI point to the expanded + // operands V and M; and EDX points to a place to store an expanded + // result Y (32 bytes, at a 16-byte boundary). The stack pointer + // must be 16-byte aligned. 
(This is not the usual convention, which + // requires alignment before the call.) + // + // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits + // of the sum U V + N Y to [EDI], leaving the remaining carry in + // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and + // XMM7 are clobbered; the general-purpose registers are preserved. + sub esp, 64 // space for the carries + + // Calculate W = U V, and leave it in the destination. Stash the + // carry pieces for later. + mulcore [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7 + propout [edi + 0], xmm4, xmm5 + jmp 5f + + .p2align 4 +mmla4: + // On entry, EDI points to the destination buffer, which also + // contains an addend A to accumulate; EAX and EBX point + // to the packed operands U and N; ECX and ESI point to the expanded + // operands V and M; and EDX points to a place to store an expanded + // result Y (32 bytes, at a 16-byte boundary). The stack pointer + // must be 16-byte aligned. (This is not the usual convention, which + // requires alignment before the call.) + // + // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128 + // bits of the sum A + U V + N Y to [EDI], leaving the remaining + // carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, + // XMM3, and XMM7 are clobbered; the general-purpose registers are + // preserved. + sub esp, 64 // space for the carries + movd xmm4, [edi + 0] + movd xmm5, [edi + 4] + movd xmm6, [edi + 8] + movd xmm7, [edi + 12] + mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, nil + propout [edi + 0], xmm4, xmm5 + +5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t + propout [edi + 4], xmm5, xmm6 + + mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t + propout [edi + 8], xmm6, xmm7 + + mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t + propout [edi + 12], xmm7, xmm4 + + movdqa [esp + 0], xmm4 + movdqa [esp + 16], xmm5 + movdqa [esp + 32], xmm6 + + // Calculate Y = W M. + mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7 + + mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil + accum xmm5, xmm6, xmm7, nil + + mulcore [edi + 8], esi, xmm0, xmm1, nil, nil + accum xmm6, xmm7, nil, nil + + mulcore [edi + 12], esi, xmm0, nil, nil, nil + accum xmm7, nil, nil, nil + + // That's lots of pieces. Now we have to assemble the answer. + squash xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1 + + // Expand it. + pxor xmm2, xmm2 + expand xmm4, xmm1, nil, nil, xmm2 + movdqa [edx + 0], xmm4 + movdqa [edx + 16], xmm1 + + // Initialize the carry from the value for W we calculated earlier. + movd xmm4, [edi + 0] + movd xmm5, [edi + 4] + movd xmm6, [edi + 8] + movd xmm7, [edi + 12] + + // Finish the calculation by adding the Montgomery product. + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + propout [edi + 0], xmm4, xmm5 + + mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t + propout [edi + 4], xmm5, xmm6 + + mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t + propout [edi + 8], xmm6, xmm7 + + mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t + propout [edi + 12], xmm7, xmm4 + + // Add add on the carry we calculated earlier. + paddq xmm4, [esp + 0] + paddq xmm5, [esp + 16] + paddq xmm6, [esp + 32] + + // And, with that, we're done. + add esp, 64 + ret + + .p2align 4 +mont4: + // On entry, EDI points to the destination buffer holding a packed + // value A; EBX points to a packed operand N; ESI points to an + // expanded operand M; and EDX points to a place to store an expanded + // result Y (32 bytes, at a 16-byte boundary). 
+ // + // On exit, we write Y = W M mod B to [EDX], and the low 128 bits + // of the sum W + N Y to [EDI], leaving the remaining carry in + // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and + // XMM7 are clobbered; the general-purpose registers are preserved. + + // Calculate Y = W M. + mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7 + + mulcore [edi + 4], esi, xmm0, xmm1, xmm2, nil + accum xmm5, xmm6, xmm7, nil + + mulcore [edi + 8], esi, xmm0, xmm1, nil, nil + accum xmm6, xmm7, nil, nil + + mulcore [edi + 12], esi, xmm0, nil, nil, nil + accum xmm7, nil, nil, nil + + // That's lots of pieces. Now we have to assemble the answer. + squash xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1 + + // Expand it. + pxor xmm2, xmm2 + expand xmm4, xmm1, nil, nil, xmm2 + movdqa [edx + 0], xmm4 + movdqa [edx + 16], xmm1 + + // Initialize the carry from W. + movd xmm4, [edi + 0] + movd xmm5, [edi + 4] + movd xmm6, [edi + 8] + movd xmm7, [edi + 12] + + // Finish the calculation by adding the Montgomery product. + mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, nil + propout [edi + 0], xmm4, xmm5 + + mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t + propout [edi + 4], xmm5, xmm6 + + mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t + propout [edi + 8], xmm6, xmm7 + + mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t + propout [edi + 12], xmm7, xmm4 + + // And, with that, we're done. + ret + +///-------------------------------------------------------------------------- +/// Bulk multipliers. + +FUNC(mpx_umul4_x86_sse2) + // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl, + // const mpw *bv, const mpw *bvl); + + // Build a stack frame. Arguments will be relative to EBP, as + // follows. + // + // ebp + 20 dv + // ebp + 24 av + // ebp + 28 avl + // ebp + 32 bv + // ebp + 36 bvl + // + // Locals are relative to ESP, as follows. + // + // esp + 0 expanded Y (32 bytes) + // esp + 32 (top of locals) + push ebp + push ebx + push esi + push edi + mov ebp, esp + and esp, ~15 + sub esp, 32 + + // Prepare for the first iteration. + mov esi, [ebp + 32] // -> bv[0] + pxor xmm7, xmm7 + movdqu xmm0, [esi] // bv[0] + mov edi, [ebp + 20] // -> dv[0] + mov ecx, edi // outer loop dv cursor + expand xmm0, xmm1, nil, nil, xmm7 + mov ebx, [ebp + 24] // -> av[0] + mov eax, [ebp + 28] // -> av[m] = av limit + mov edx, esp // -> expanded Y = bv[0] + movdqa [esp + 0], xmm0 // bv[0] expanded low + movdqa [esp + 16], xmm1 // bv[0] expanded high + call mul4zc + add ebx, 16 + add edi, 16 + add ecx, 16 + add esi, 16 + cmp ebx, eax // all done? + jae 8f + + .p2align 4 + // Continue with the first iteration. +0: call mul4 + add ebx, 16 + add edi, 16 + cmp ebx, eax // all done? + jb 0b + + // Write out the leftover carry. There can be no tail here. +8: call carryprop + cmp esi, [ebp + 36] // more passes to do? + jae 9f + + .p2align 4 + // Set up for the next pass. +1: movdqu xmm0, [esi] // bv[i] + mov edi, ecx // -> dv[i] + pxor xmm7, xmm7 + expand xmm0, xmm1, nil, nil, xmm7 + mov ebx, [ebp + 24] // -> av[0] + movdqa [esp + 0], xmm0 // bv[i] expanded low + movdqa [esp + 16], xmm1 // bv[i] expanded high + call mla4zc + add edi, 16 + add ebx, 16 + add ecx, 16 + add esi, 16 + cmp ebx, eax // done yet? + jae 8f + + .p2align 4 + // Continue... +0: call mla4 + add ebx, 16 + add edi, 16 + cmp ebx, eax + jb 0b + + // Finish off this pass. There was no tail on the previous pass, and + // there can be none on this pass. +8: call carryprop + cmp esi, [ebp + 36] + jb 1b + + // All over. 
+9: mov esp, ebp + pop edi + pop esi + pop ebx + pop ebp + ret + +ENDFUNC + +FUNC(mpxmont_mul4_x86_sse2) + // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv, + // const mpw *nv, size_t n, const mpw *mi); + + // Build a stack frame. Arguments will be relative to EBP, as + // follows. + // + // ebp + 20 dv + // ebp + 24 av + // ebp + 28 bv + // ebp + 32 nv + // ebp + 36 n (nonzero multiple of 4) + // ebp + 40 mi + // + // Locals are relative to ESP, which is 4 mod 16, as follows. + // + // esp + 0 outer loop dv + // esp + 4 outer loop bv + // esp + 8 av limit (mostly in ESI) + // esp + 12 expanded V (32 bytes) + // esp + 44 expanded M (32 bytes) + // esp + 76 expanded Y (32 bytes) + // esp + 108 bv limit + // esp + 112 (gap) + // esp + 124 (top of locals) + push ebp + push ebx + push esi + push edi + mov ebp, esp + and esp, ~15 + sub esp, 124 + + // Establish the expanded operands. + pxor xmm7, xmm7 + mov ecx, [ebp + 28] // -> bv + mov edx, [ebp + 40] // -> mi + movdqu xmm0, [ecx] // bv[0] + movdqu xmm2, [edx] // mi + expand xmm0, xmm1, xmm2, xmm3, xmm7 + movdqa [esp + 12], xmm0 // bv[0] expanded low + movdqa [esp + 28], xmm1 // bv[0] expanded high + movdqa [esp + 44], xmm2 // mi expanded low + movdqa [esp + 60], xmm3 // mi expanded high + + // Set up the outer loop state and prepare for the first iteration. + mov edx, [ebp + 36] // n + mov eax, [ebp + 24] // -> U = av[0] + mov ebx, [ebp + 32] // -> X = nv[0] + mov edi, [ebp + 20] // -> Z = dv[0] + mov [esp + 4], ecx + lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit + lea edx, [eax + 4*edx] // -> av[n/4] = av limit + mov [esp + 0], edi + mov [esp + 108], ecx + mov [esp + 8], edx + lea ecx, [esp + 12] // -> expanded V = bv[0] + lea esi, [esp + 44] // -> expanded M = mi + lea edx, [esp + 76] // -> space for Y + call mmul4 + mov esi, [esp + 8] // recover av limit + add edi, 16 + add eax, 16 + add ebx, 16 + cmp eax, esi // done already? + jae 8f + mov [esp + 0], edi + + .p2align 4 + // Complete the first inner loop. +0: call dmul4 + add edi, 16 + add eax, 16 + add ebx, 16 + cmp eax, esi // done yet? + jb 0b + + // Still have carries left to propagate. + call carryprop + movd [edi + 16], xmm4 + + .p2align 4 + // Embark on the next iteration. (There must be one. If n = 1, then + // we would have bailed above, to label 8. Similarly, the subsequent + // iterations can fall into the inner loop immediately.) +1: mov eax, [esp + 4] // -> bv[i - 1] + mov edi, [esp + 0] // -> Z = dv[i] + add eax, 16 // -> bv[i] + pxor xmm7, xmm7 + movdqu xmm0, [eax] // bv[i] + mov [esp + 4], eax + cmp eax, [esp + 108] // done yet? + jae 9f + mov ebx, [ebp + 32] // -> X = nv[0] + lea esi, [esp + 44] // -> expanded M = mi + mov eax, [ebp + 24] // -> U = av[0] + expand xmm0, xmm1, nil, nil, xmm7 + movdqa [esp + 12], xmm0 // bv[i] expanded low + movdqa [esp + 28], xmm1 // bv[i] expanded high + call mmla4 + mov esi, [esp + 8] // recover av limit + add edi, 16 + add eax, 16 + add ebx, 16 + mov [esp + 0], edi + + .p2align 4 + // Complete the next inner loop. +0: call dmla4 + add edi, 16 + add eax, 16 + add ebx, 16 + cmp eax, esi + jb 0b + + // Still have carries left to propagate, and they overlap the + // previous iteration's final tail, so read that in and add it. + movd xmm0, [edi] + paddq xmm4, xmm0 + call carryprop + movd [edi + 16], xmm4 + + // Back again. + jmp 1b + + // First iteration was short. Write out the carries and we're done. + // (This could be folded into the main loop structure, but that would + // penalize small numbers more.) 
+8: call carryprop + movd [edi + 16], xmm4 + + // All done. +9: mov esp, ebp + pop edi + pop esi + pop ebx + pop ebp + ret + +ENDFUNC + +FUNC(mpxmont_redc4_x86_sse2) + // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv, + // size_t n, const mpw *mi); + + // Build a stack frame. Arguments will be relative to EBP, as + // follows. + // + // ebp + 20 dv + // ebp + 24 dvl + // ebp + 28 nv + // ebp + 32 n (nonzero multiple of 4) + // ebp + 36 mi + // + // Locals are relative to ESP, as follows. + // + // esp + 0 outer loop dv + // esp + 4 outer dv limit + // esp + 8 blocks-of-4 dv limit + // esp + 12 expanded M (32 bytes) + // esp + 44 expanded Y (32 bytes) + // esp + 76 (top of locals) + push ebp + push ebx + push esi + push edi + mov ebp, esp + and esp, ~15 + sub esp, 76 + + // Establish the expanded operands and the blocks-of-4 dv limit. + mov edi, [ebp + 20] // -> Z = dv[0] + pxor xmm7, xmm7 + mov eax, [ebp + 24] // -> dv[n] = dv limit + sub eax, edi // length of dv in bytes + mov edx, [ebp + 36] // -> mi + movdqu xmm0, [edx] // mi + and eax, ~15 // mask off the tail end + expand xmm0, xmm1, nil, nil, xmm7 + add eax, edi // find limit + movdqa [esp + 12], xmm0 // mi expanded low + movdqa [esp + 28], xmm1 // mi expanded high + mov [esp + 8], eax + + // Set up the outer loop state and prepare for the first iteration. + mov ecx, [ebp + 32] // n + mov ebx, [ebp + 28] // -> X = nv[0] + lea edx, [edi + 4*ecx] // -> dv[n/4] = outer dv limit + lea ecx, [ebx + 4*ecx] // -> nv[n/4] = nv limit + mov [esp + 0], edi + mov [esp + 4], edx + lea esi, [esp + 12] // -> expanded M = mi + lea edx, [esp + 44] // -> space for Y + call mont4 + add edi, 16 + add ebx, 16 + cmp ebx, ecx // done already? + jae 8f + + .p2align 4 + // Complete the first inner loop. +5: call mla4 + add ebx, 16 + add edi, 16 + cmp ebx, ecx // done yet? + jb 5b + + // Still have carries left to propagate. +8: carryadd + mov esi, [esp + 8] // -> dv blocks limit + mov edx, [ebp + 24] // dv limit + psllq xmm7, 16 + pslldq xmm7, 8 + paddq xmm6, xmm7 + call carryprop + movd eax, xmm4 + add edi, 16 + cmp edi, esi + jae 7f + + .p2align 4 + // Continue carry propagation until the end of the buffer. +0: add [edi], eax + mov eax, 0 // preserves flags + adcd [edi + 4], 0 + adcd [edi + 8], 0 + adcd [edi + 12], 0 + adc eax, 0 + add edi, 16 + cmp edi, esi + jb 0b + + // Deal with the tail end. +7: add [edi], eax + mov eax, 0 // preserves flags + add edi, 4 + adc eax, 0 + cmp edi, edx + jb 7b + + // All done for this iteration. Start the next. (This must have at + // least one follow-on iteration, or we'd not have started this outer + // loop.) +8: mov edi, [esp + 0] // -> dv[i - 1] + mov ebx, [ebp + 28] // -> X = nv[0] + lea edx, [esp + 44] // -> space for Y + lea esi, [esp + 12] // -> expanded M = mi + add edi, 16 // -> Z = dv[i] + cmp edi, [esp + 4] // all done yet? + jae 9f + mov [esp + 0], edi + call mont4 + add edi, 16 + add ebx, 16 + jmp 5b + + // All over. +9: mov esp, ebp + pop edi + pop esi + pop ebx + pop ebp + ret + +ENDFUNC + +///-------------------------------------------------------------------------- +/// Testing and performance measurement. 
+ +#ifdef TEST_MUL4 + +.macro cysetup c + rdtsc + mov [\c], eax + mov [\c + 4], edx +.endm + +.macro cystore c, v, n + rdtsc + sub eax, [\c] + sbb edx, [\c + 4] + mov ebx, [\v] + mov ecx, [\n] + dec ecx + mov [\n], ecx + mov [ebx + ecx*8], eax + mov [ebx + ecx*8 + 4], edx +.endm + +.macro testprologue + push ebp + push ebx + push esi + push edi + mov ebp, esp + and esp, ~15 + sub esp, 3*32 + 12 + // vars: + // esp + 0 = cycles + // esp + 12 = v expanded + // esp + 44 = y expanded + // esp + 72 = ? expanded +.endm + +.macro testepilogue + mov esp, ebp + pop edi + pop esi + pop ebx + pop ebp + ret +.endm + +.macro testldcarry c + mov ecx, \c // -> c + movdqu xmm4, [ecx + 0] // (c'_0, c''_0) + movdqu xmm5, [ecx + 16] // (c'_1, c''_1) + movdqu xmm6, [ecx + 32] // (c'_2, c''_2) +.endm + +.macro testexpand v, y + pxor xmm7, xmm7 + .ifnes "\v", "nil" + mov ecx, \v + movdqu xmm0, [ecx] + expand xmm0, xmm1, nil, nil, xmm7 + movdqa [esp + 12], xmm0 + movdqa [esp + 28], xmm1 + .endif + .ifnes "\y", "nil" + mov edx, \y + movdqu xmm2, [edx] + expand xmm2, xmm3, nil, nil, xmm7 + movdqa [esp + 44], xmm2 + movdqa [esp + 60], xmm3 + .endif +.endm + +.macro testtop u, x, mode + .p2align 4 +0: + .ifnes "\u", "nil" + lea ecx, [esp + 12] + .endif + mov ebx, \x + .ifeqs "\mode", "mont" + lea esi, [esp + 44] + .endif + cysetup esp + 0 + .ifnes "\u", "nil" + mov eax, \u + .endif + .ifeqs "\mode", "mont" + lea edx, [esp + 76] + .else + lea edx, [esp + 44] + .endif +.endm + +.macro testtail cyv, n + cystore esp + 0, \cyv, \n + jnz 0b +.endm + +.macro testcarryout c + mov ecx, \c + movdqu [ecx + 0], xmm4 + movdqu [ecx + 16], xmm5 + movdqu [ecx + 32], xmm6 +.endm + + .globl test_dmul4 +test_dmul4: + testprologue + testldcarry [ebp + 24] + testexpand [ebp + 36], [ebp + 40] + mov edi, [ebp + 20] + testtop [ebp + 28], [ebp + 32] + call dmul4 + testtail [ebp + 48], [ebp + 44] + testcarryout [ebp + 24] + testepilogue + + .globl test_dmla4 +test_dmla4: + testprologue + testldcarry [ebp + 24] + testexpand [ebp + 36], [ebp + 40] + mov edi, [ebp + 20] + testtop [ebp + 28], [ebp + 32] + call dmla4 + testtail [ebp + 48], [ebp + 44] + testcarryout [ebp + 24] + testepilogue + + .globl test_mul4 +test_mul4: + testprologue + testldcarry [ebp + 24] + testexpand nil, [ebp + 32] + mov edi, [ebp + 20] + testtop nil, [ebp + 28] + call mul4 + testtail [ebp + 40], [ebp + 36] + testcarryout [ebp + 24] + testepilogue + + .globl test_mla4 +test_mla4: + testprologue + testldcarry [ebp + 24] + testexpand nil, [ebp + 32] + mov edi, [ebp + 20] + testtop nil, [ebp + 28] + call mla4 + testtail [ebp + 40], [ebp + 36] + testcarryout [ebp + 24] + testepilogue + + .globl test_mmul4 +test_mmul4: + testprologue + testexpand [ebp + 40], [ebp + 44] + mov edi, [ebp + 20] + testtop [ebp + 32], [ebp + 36], mont + call mmul4 + testtail [ebp + 52], [ebp + 48] + mov edi, [ebp + 28] + movdqa xmm0, [esp + 76] + movdqa xmm1, [esp + 92] + movdqu [edi], xmm0 + movdqu [edi + 16], xmm1 + testcarryout [ebp + 24] + testepilogue + + .globl test_mmla4 +test_mmla4: + testprologue + testexpand [ebp + 40], [ebp + 44] + mov edi, [ebp + 20] + testtop [ebp + 32], [ebp + 36], mont + call mmla4 + testtail [ebp + 52], [ebp + 48] + mov edi, [ebp + 28] + movdqa xmm0, [esp + 76] + movdqa xmm1, [esp + 92] + movdqu [edi], xmm0 + movdqu [edi + 16], xmm1 + testcarryout [ebp + 24] + testepilogue + + .globl test_mont4 +test_mont4: + testprologue + testexpand nil, [ebp + 36] + mov edi, [ebp + 20] + testtop nil, [ebp + 32], mont + call mont4 + testtail [ebp + 44], [ebp + 40] + mov edi, [ebp 
+ 28]
+	movdqa	xmm0, [esp + 76]
+	movdqa	xmm1, [esp + 92]
+	movdqu	[edi], xmm0
+	movdqu	[edi + 16], xmm1
+	testcarryout [ebp + 24]
+	testepilogue
+
+#endif
+
+///----- That's all, folks --------------------------------------------------
diff --git a/math/mpx.c b/math/mpx.c
index 2745fe0f..8f1dcede 100644
--- a/math/mpx.c
+++ b/math/mpx.c
@@ -27,6 +27,8 @@
 
 /*----- Header files ------------------------------------------------------*/
 
+#include "config.h"
+
 #include 
 #include 
 #include 
@@ -35,6 +37,7 @@
 #include 
 #include 
 
+#include "dispatch.h"
 #include "mptypes.h"
 #include "mpx.h"
 #include "bitops.h"
@@ -845,8 +848,13 @@ void mpx_usubnlsl(mpw *dv, mpw *dvl, mpw a, unsigned o)
  *		vectors in any way.
  */
 
-void mpx_umul(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
-	      const mpw *bv, const mpw *bvl)
+CPU_DISPATCH(EMPTY, (void), void, mpx_umul,
+	     (mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
+	      const mpw *bv, const mpw *bvl),
+	     (dv, dvl, av, avl, bv, bvl), pick_umul, simple_umul);
+
+static void simple_umul(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
+			const mpw *bv, const mpw *bvl)
 {
   /* --- This is probably worthwhile on a multiply --- */
 
@@ -885,6 +893,36 @@ void mpx_umul(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
   }
 }
 
+#define MAYBE_UMUL4(impl)						\
+  extern void mpx_umul4_##impl(mpw */*dv*/,				\
+			       const mpw */*av*/, const mpw */*avl*/,	\
+			       const mpw */*bv*/, const mpw */*bvl*/);	\
+  static void maybe_umul4_##impl(mpw *dv, mpw *dvl,			\
+				 const mpw *av, const mpw *avl,		\
+				 const mpw *bv, const mpw *bvl)		\
+  {									\
+    size_t an = avl - av, bn = bvl - bv, dn = dvl - dv;		\
+    if (!an || an%4 != 0 || !bn || bn%4 != 0 || dn < an + bn)		\
+      simple_umul(dv, dvl, av, avl, bv, bvl);				\
+    else {								\
+      mpx_umul4_##impl(dv, av, avl, bv, bvl);				\
+      MPX_ZERO(dv + an + bn, dvl);					\
+    }									\
+  }
+
+#if CPUFAM_X86
+  MAYBE_UMUL4(x86_sse2)
+#endif
+
+static mpx_umul__functype *pick_umul(void)
+{
+#if CPUFAM_X86
+  DISPATCH_PICK_COND(mpx_umul, maybe_umul4_x86_sse2,
+		     cpu_feature_p(CPUFEAT_X86_SSE2));
+#endif
+  DISPATCH_PICK_FALLBACK(mpx_umul, simple_umul);
+}
+
 /* --- @mpx_umuln@ --- *
  *
  * Arguments:	@mpw *dv, *dvl@ = destination vector base and limit
@@ -1220,6 +1258,7 @@ mpw mpx_udivn(mpw *qv, mpw *qvl, const mpw *rv, const mpw *rvl, mpw d)
   size_t _sz = (sz);							\
   mpw *_vv = xmalloc(MPWS(_sz));					\
   mpw *_vvl = _vv + _sz;						\
+  memset(_vv, 0xa5, MPWS(_sz));						\
   (v) = _vv;								\
   (vl) = _vvl;								\
 } while (0)
diff --git a/math/t/mpmont b/math/t/mpmont
index 81fbe634..cd2569b7 100644
--- a/math/t/mpmont
+++ b/math/t/mpmont
@@ -1,5 +1,12 @@
 # Test vectors for Montgomery reduction
 
+mul {
+  6277101735386680763835789423207666416083908700390324961279
+  2455155546008943817740293915197451784769108058161191238065
+  340282366920938463500268095579187314689
+  5646741895976341600220572388698743135318229029826089708489;
+}
+
 create {
   340809809850981098423498794792349		# m
   266454859					# -m^{-1} mod b
-- 
2.11.0
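
The dispatch pattern used throughout the patch (CPU_DISPATCH, DISPATCH_PICK_COND,
DISPATCH_PICK_FALLBACK) comes from Catacomb's dispatch.h.  For readers unfamiliar
with it, here is a minimal, self-contained C sketch of the general idea -- a lazily
chosen function pointer plus a per-call guard on operand shape.  This is not part of
the patch, and every name in it (cpu_has_sse2, sse2_umul4, and friends) is an
illustrative stand-in, not Catacomb's real API.

#include <stddef.h>

typedef unsigned int mpw;

typedef void umul_fn(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
                     const mpw *bv, const mpw *bvl);

static void portable_umul(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
                          const mpw *bv, const mpw *bvl)
  { /* portable schoolbook multiply would go here */ }

static void sse2_umul4(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
                       const mpw *bv, const mpw *bvl)
  { /* call into the 4x4-word assembler kernel would go here */ }

static int cpu_has_sse2(void)
  { return 0; /* CPUID probe would go here */ }

static umul_fn *umul_impl;              /* cached choice, set on first use */

static void umul(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
                 const mpw *bv, const mpw *bvl)
{
  size_t an = avl - av, bn = bvl - bv, dn = dvl - dv;

  if (!umul_impl)                       /* first call: pick an implementation */
    umul_impl = cpu_has_sse2() ? sse2_umul4 : portable_umul;

  /* The SSE2 path wants nonzero multiples of four words and room for the
   * full an + bn word product; anything else takes the portable path. */
  if (umul_impl == sse2_umul4 &&
      (!an || an%4 || !bn || bn%4 || dn < an + bn))
    portable_umul(dv, dvl, av, avl, bv, bvl);
  else
    umul_impl(dv, dvl, av, avl, bv, bvl);
}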