math/mpx-mul4-*.S: Fix up some of the commentary.
catacomb/math/mpx-mul4-amd64-sse2.S
1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
2 ///
3 /// Large SIMD-based multiplications
4 ///
5 /// (c) 2016 Straylight/Edgeware
6 ///
7
8 ///----- Licensing notice ---------------------------------------------------
9 ///
10 /// This file is part of Catacomb.
11 ///
12 /// Catacomb is free software; you can redistribute it and/or modify
13 /// it under the terms of the GNU Library General Public License as
14 /// published by the Free Software Foundation; either version 2 of the
15 /// License, or (at your option) any later version.
16 ///
17 /// Catacomb is distributed in the hope that it will be useful,
18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 /// GNU Library General Public License for more details.
21 ///
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb; if not, write to the Free
24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 /// MA 02111-1307, USA.
26
27 ///--------------------------------------------------------------------------
28 /// Preliminaries.
29
30 #include "config.h"
31 #include "asm-common.h"
32
33 .arch pentium4
34
35 .text
36
37 ///--------------------------------------------------------------------------
38 /// Theory.
39 ///
40 /// We define a number of primitive fixed-size multipliers from which we can
41 /// construct more general variable-length multipliers.
42 ///
43 /// The basic trick is the same throughout. In an operand-scanning
44 /// multiplication, the inner multiplication loop multiplies a multiple-
45 /// precision operand by a single-precision factor, and adds the product,
46 /// appropriately shifted, to the accumulator.  A `finely integrated operand
47 /// scanning' implementation of Montgomery multiplication also adds the
48 /// product of a single-precision `Montgomery factor' and the modulus,
49 /// calculated in the same pass. The more common `coarsely integrated
50 /// operand scanning' alternates main multiplication and Montgomery passes,
51 /// which requires additional carry propagation.
52 ///
53 /// Throughout both plain-multiplication and Montgomery stages, then, one of
54 /// the factors remains constant throughout the operation, so we can afford
55 /// to take a little time to preprocess it. The transformation we perform is
56 /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
57 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
58 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
59 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
60 /// operands, as follows.
61 ///
62 /// Offset 0 4 8 12
63 /// 0 v'_0 v'_1 v''_0 v''_1
64 /// 16 v'_2 v'_3 v''_2 v''_3
65 ///
66 /// A `pmuludq' instruction ignores the odd positions in its operands; thus,
67 /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
68 /// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
69 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
70 /// results in 64-bit fields. The sixteen bits of headroom allows us to add
71 /// many products together before we must deal with carrying; it also allows
72 /// for some calculations to be performed on the above expanded form.
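///
/// For example, if v_0 = 0x01234567 then v'_0 = 0x4567 and v''_0 = 0x0123,
/// since v_0 = v'_0 + v''_0 b with b = 2^16.  Multiplying the expanded
/// vector by a 32-bit scalar r then yields r v'_0 and r v''_0 in the two
/// 64-bit halves, which together represent r v'_0 + r v''_0 b = r v_0.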
73 ///
74 /// We maintain four `carry' registers XMM12--XMM15 accumulating intermediate
75 /// results. The registers' precise roles rotate during the computation; we
76 /// name them `c0', `c1', `c2', and `c3'. Each carry register holds two
77 /// 64-bit halves: the register c0, for example, holds c'_0 (low half) and
78 /// c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the
79 /// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2 +
80 /// c_3 B^3.  The `pmuludq' instruction acting on a scalar operand
81 /// (broadcast across all lanes of its vector) and an operand in the expanded
82 /// form above produces a result which can be added directly to the
83 /// appropriate carry register. Following a pass of four multiplications, we
84 /// perform some limited carry propagation: let t = c''_0 mod B, and let d =
85 /// c'_0 + t b; then we output z = d mod B, add (floor(d/B), floor(c''_0/B))
86 /// to c1, and cycle the carry registers around, so that c1 becomes c0, and
87 /// the old (implicitly) zeroed c0 becomes c3.
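///
/// To see that this carry step is sound, write c''_0 = t + B floor(c''_0/B);
/// then c_0 = c'_0 + c''_0 b = d + B (b floor(c''_0/B)), so z = d mod B is
/// indeed c_0 mod B, and the carry into the next position, floor(c_0/B) =
/// floor(d/B) + b floor(c''_0/B), is exactly what the two halves of c1 pick
/// up.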
88 ///
89 /// On 64-bit AMD64, we have a reasonable number of registers: the expanded
90 /// operands are kept in registers. The packed operands are read from memory
91 /// into working registers XMM4 and XMM5; XMM0--XMM3 are used for the actual
92 /// multiplications; and XMM6 and XMM7 are used for combining the results.
93 /// The following conventional argument names and locations are used
94 /// throughout.
95 ///
96 /// Arg Format Location Notes
97 ///
98 /// U packed [RAX]
99 /// X packed [RBX] In Montgomery multiplication, X = N
100 /// V expanded XMM8/XMM9
101 /// Y expanded XMM10/XMM11 In Montgomery multiplication, Y = (A + U V) M
102 /// M expanded (see below) Montgomery factor, M = -N^{-1} (mod B^4)
103 /// N Modulus, for Montgomery multiplication
104 /// A packed [RDI] Destination/accumulator
105 /// C carry XMM12--XMM15
106 ///
107 /// The calculation is some variant of
108 ///
109 /// A' + C' B^4 <- U V + X Y + A + C
110 ///
111 /// The low-level functions fit into a fairly traditional (finely-integrated)
112 /// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y)
113 /// (indexed by i).
114 ///
115 /// The variants are as follows.
116 ///
117 /// Function Variant Use i j
118 ///
119 /// mmul4 A = C = 0, Y = M Montgomery 0 0
120 /// dmul4 A = 0 Montgomery 0 +
121 /// mmla4 C = 0, Y = M Montgomery + 0
122 /// dmla4 exactly as shown Montgomery + +
123 /// mont4 U = C = 0, V = M Montgomery any 0
124 ///
125 /// mul4zc U = V = A = C = 0 Plain 0 0
126 /// mul4 U = V = A = 0 Plain 0 +
127 /// mla4zc U = V = C = 0 Plain + 0
128 /// mla4 U = V = 0 Plain + +
129 ///
130 /// The `mmul4' and `mmla4' functions are also responsible for calculating
131 /// the Montgomery reduction factor Y = (A + U V) M used by the rest of the
132 /// inner loop.
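///
/// As a rough model (not part of the build; the names here are illustrative
/// only), each variant computes, on 128-bit chunks viewed as plain integers,
/// something like this Python sketch:
///
///	def step4(u, v, x, y, a, c, B4=1 << 128):
///	    t = u*v + x*y + a + c	# full product-and-accumulate
///	    return t % B4, t // B4	# (A', C'): output chunk and carry out
///
/// with the appropriate operands forced to zero (or set to M) according to
/// the table of variants above.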
133
134 ///--------------------------------------------------------------------------
135 /// Macro definitions.
136
137 .macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
138 // Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
139 // of the product in registers D0, D1, D2, D3.
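	// Each D_k ends up holding the pair (r_i s'_k; r_i s''_k), i.e., the
	// product r_i s_k in the usual carry-register representation, so that
	// collectively D0 + D1 B + D2 B^2 + D3 B^3 = r_i s, where s is the
	// value represented by SLO/SHI.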
140 pshufd \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?)
141 .ifnes "\d1", "nil"
142 movdqa \d1, \slo // (s'_0, s'_1; s''_0, s''_1)
143 .endif
144 .ifnes "\d3", "nil"
145 movdqa \d3, \shi // (s'_2, s'_3; s''_2, s''_3)
146 .endif
147 .ifnes "\d1", "nil"
148 psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0)
149 .endif
150 .ifnes "\d2", "nil"
151 movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?)
152 .endif
153 .ifnes "\d3", "nil"
154 psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0)
155 .endif
156 .ifnes "\d1", "nil"
157 pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1)
158 .endif
159 .ifnes "\d3", "nil"
160 pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3)
161 .endif
162 .ifnes "\d2", "nil"
163 pmuludq \d2, \shi // (r_i s'_2; r_i s''_2)
164 .endif
165 pmuludq \d0, \slo // (r_i s'_0; r_i s''_0)
166 .endm
167
168 .macro accum c0, c1=nil, c2=nil, c3=nil
169 // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
170 // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
171 // updating that register.
172 paddq \c0, xmm0
173 .ifnes "\c1", "nil"
174 paddq \c1, xmm1
175 .endif
176 .ifnes "\c2", "nil"
177 paddq \c2, xmm2
178 .endif
179 .ifnes "\c3", "nil"
180 paddq \c3, xmm3
181 .endif
182 .endm
183
184 .macro mulacc r, i, slo, shi, c0=nil, c1=nil, c2=nil, c3=nil, z3p=nil
185 // Multiply R_I by the expanded operand SLO/SHI, and accumulate in
186 // carry registers C0, C1, C2, C3. If Z3P is `t' then C3 notionally
187 // contains zero, but needs clearing; in practice, we store the
188 // product directly rather than attempting to add. On completion,
189 // XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P is not `t'.
190 .ifeqs "\z3p", "t"
191 mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, \c3
192 accum \c0, \c1, \c2
193 .else
194 mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, xmm3
195 accum \c0, \c1, \c2, \c3
196 .endif
197 .endm
198
199 .macro propout d, pos, c, cc=nil
200 // Calculate an output word from C, and store it at POS in D;
201 // propagate carries out from C to CC in preparation for a rotation
202 // of the carry registers. D is an XMM register; the POS is either
203 // `lo' or `hi' according to whether the output word should be in
204 // lane 0 or 1 of D; the high two lanes of D are clobbered. On
205 // completion, XMM3 is clobbered. If CC is `nil', then the
206 // contribution which would have been added to it is left in C.
207 pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
208 psrldq xmm3, 12 // (t, 0; 0, 0) = (t; 0)
209 pslldq xmm3, 2 // (t b; 0)
210 paddq \c, xmm3 // (c' + t b; c'')
211 .ifeqs "\pos", "lo"
212 movdqa \d, \c
213 .else
214 punpckldq \d, \c
215 .endif
216 psrlq \c, 32 // floor(c/B)
217 .ifnes "\cc", "nil"
218 paddq \cc, \c // propagate up
219 .endif
220 .endm
221
222 .macro endprop d, pos, c, t
223 // On entry, C contains a carry register. On exit, the low 32 bits
224 // of the value represented in C are written at POS in D, and the
225 // remaining bits are left at the bottom of T.
226 movdqa \t, \c
227 psllq \t, 16 // (?; c'' b)
228 pslldq \c, 8 // (0; c')
229 paddq \t, \c // (?; c' + c'' b)
230 psrldq \t, 8 // (c' + c'' b; 0) = (c; 0)
231 .ifeqs "\pos", "lo"
232 movdqa \d, \t
233 .else
234 punpckldq \d, \t
235 .endif
236 psrldq \t, 4 // (floor(c/B); 0)
237 .endm
238
239 .macro expand z, a, b, c=nil, d=nil
240 // On entry, A and C hold packed 128-bit values, and Z is zero. On
241 // exit, A:B and C:D together hold the same values in expanded
242 // form. If C is `nil', then only expand A to A:B.
243 movdqa \b, \a // (a_0, a_1; a_2, a_3)
244 .ifnes "\c", "nil"
245 movdqa \d, \c // (c_0, c_1; c_2, c_3)
246 .endif
247 punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1)
248 punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3)
249 .ifnes "\c", "nil"
250 punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
251 punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
252 .endif
253 pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
254 pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
255 .ifnes "\c", "nil"
256 pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
257 pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
258 .endif
259 .endm
260
261 .macro squash c0, c1, c2, c3, t, u, lo, hi=nil
262 // On entry, C0, C1, C2, C3 are carry registers representing a value
263 // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
264 // C3, T, and U are clobbered; and the high bits of Y are stored in
265 // HI, if this is not `nil'.
266
267 // The first step is to eliminate the `double-prime' pieces -- i.e.,
268 // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
269 // them into the 32-bit-aligned pieces above and below. But before
270 // we can do that, we must gather them together.
271 movdqa \t, \c0
272 movdqa \u, \c1
273 punpcklqdq \t, \c2 // (y'_0; y'_2)
274 punpckhqdq \c0, \c2 // (y''_0; y''_2)
275 punpcklqdq \u, \c3 // (y'_1; y'_3)
276 punpckhqdq \c1, \c3 // (y''_1; y''_3)
277
278 // Now split the double-prime pieces. The high (up to) 48 bits will
279 // go up; the low 16 bits go down.
280 movdqa \c2, \c0
281 movdqa \c3, \c1
282 psllq \c2, 48
283 psllq \c3, 48
284 psrlq \c0, 16 // high parts of (y''_0; y''_2)
285 psrlq \c1, 16 // high parts of (y''_1; y''_3)
286 psrlq \c2, 32 // low parts of (y''_0; y''_2)
287 psrlq \c3, 32 // low parts of (y''_1; y''_3)
288 .ifnes "\hi", "nil"
289 movdqa \hi, \c1
290 .endif
291 pslldq \c1, 8 // high part of (0; y''_1)
292
293 paddq \t, \c2 // propagate down
294 paddq \u, \c3
295 paddq \t, \c1 // and up: (y_0; y_2)
296 paddq \u, \c0 // (y_1; y_3)
297 .ifnes "\hi", "nil"
298 psrldq \hi, 8 // high part of (y''_3; 0)
299 .endif
300
301 // Finally extract the answer. This complicated dance is better than
302 // storing to memory and loading, because the piecemeal stores
303 // inhibit store forwarding.
304 movdqa \c3, \t // (y_0; ?)
305 movdqa \lo, \t // (y^*_0, ?; ?, ?)
306 psrldq \t, 8 // (y_2; 0)
307 psrlq \c3, 32 // (floor(y_0/B); ?)
308 paddq \c3, \u // (y_1 + floor(y_0/B); ?)
309 movdqa \c1, \c3 // (y^*_1, ?; ?, ?)
310 psrldq \u, 8 // (y_3; 0)
311 psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2); ?)
312 paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2); ?)
313 punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?)
314 psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
315 paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
316 .ifnes "\hi", "nil"
317 movdqa \t, \c3
318 pxor \u, \u
319 .endif
320 punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?)
321 .ifnes "\hi", "nil"
322 psrlq \t, 32 // very high bits of y
323 paddq \hi, \t
324 punpcklqdq \hi, \u // carry up
325 .endif
326 punpckldq \lo, \c1 // y mod B^4
327 .endm
328
329 .macro carryadd
330 // On entry, RDI points to a packed addend A, and XMM12, XMM13, XMM14
331 // hold the incoming carry registers c0, c1, and c2 representing a
332 // carry-in C.
333 //
334 // On exit, the carry registers, including XMM15, are updated to hold
335 // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
336 // registers are preserved.
337 movd xmm0, [rdi + 0] // (a_0; 0)
338 movd xmm1, [rdi + 4] // (a_1; 0)
339 movd xmm2, [rdi + 8] // (a_2; 0)
340 movd xmm15, [rdi + 12] // (a_3; 0)
341 paddq xmm12, xmm0 // (c'_0 + a_0; c''_0)
342 paddq xmm13, xmm1 // (c'_1 + a_1; c''_1)
343 paddq xmm14, xmm2 // (c'_2 + a_2; c''_2)
344 .endm
345
346 ///--------------------------------------------------------------------------
347 /// Primitive multipliers and related utilities.
348
349 INTFUNC(carryprop)
350 // On entry, XMM12, XMM13, and XMM14 hold a 144-bit carry in an
351 // expanded form. Store the low 128 bits of the represented carry to
352 // [RDI] as a packed 128-bit value, and leave the remaining 16 bits
353 // in the low 32 bits of XMM12. On exit, XMM0, XMM1, XMM3, XMM13 and
354 // XMM14 are clobbered.
355 endprologue
356
357 propout xmm0, lo, xmm12, xmm13
358 propout xmm1, lo, xmm13, xmm14
359 propout xmm0, hi, xmm14, nil
360 endprop xmm1, hi, xmm14, xmm12
361 punpckldq xmm0, xmm1
362 movdqu [rdi], xmm0
363
364 ret
365 ENDFUNC
366
367 INTFUNC(dmul4)
368 // On entry, RDI points to the destination buffer; RAX and RBX point
369 // to the packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
370 // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
371 // incoming carry registers c0, c1, and c2; c3 is assumed to be zero.
372 //
373 // On exit, we write the low 128 bits of the sum C + U V + X Y to
374 // [RDI], and update the carry registers with the carry out. The
375 // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
376 // registers are preserved.
377 endprologue
378
379 movdqu xmm4, [rax]
380 movdqu xmm5, [rbx]
381
382 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15, t
383 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
384 propout xmm6, lo, xmm12, xmm13
385
386 mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
387 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
388 propout xmm7, lo, xmm13, xmm14
389
390 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
391 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
392 propout xmm6, hi, xmm14, xmm15
393
394 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
395 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
396 propout xmm7, hi, xmm15, xmm12
397
398 punpckldq xmm6, xmm7
399 movdqu [rdi], xmm6
400
401 ret
402 ENDFUNC
403
404 INTFUNC(dmla4)
405 // On entry, RDI points to the destination buffer, which also
406 // contains an addend A to accumulate; RAX and RBX point to the
407 // packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
408 // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
409 // incoming carry registers c0, c1, and c2 representing a carry-in C;
410 // c3 is assumed to be zero.
411 //
412 // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
413 // [RDI], and update the carry registers with the carry out. The
414 // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
415 // registers are preserved.
416 endprologue
417
418 movdqu xmm4, [rax]
419 movdqu xmm5, [rbx]
420 carryadd
421
422 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
423 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
424 propout xmm6, lo, xmm12, xmm13
425
426 mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
427 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
428 propout xmm7, lo, xmm13, xmm14
429
430 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
431 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
432 propout xmm6, hi, xmm14, xmm15
433
434 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
435 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
436 propout xmm7, hi, xmm15, xmm12
437
438 punpckldq xmm6, xmm7
439 movdqu [rdi], xmm6
440
441 ret
442 ENDFUNC
443
444 INTFUNC(mul4zc)
445 // On entry, RDI points to the destination buffer; RBX points to a
446 // packed operand X; and XMM10/XMM11 hold an expanded operand Y.
447 //
448 // On exit, we write the low 128 bits of the product X Y to [RDI],
449 // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
450 // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
451 // general-purpose registers are preserved.
452 endprologue
453
454 movdqu xmm5, [rbx]
455
456 mulcore xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
457 propout xmm6, lo, xmm12, xmm13
458
459 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
460 propout xmm7, lo, xmm13, xmm14
461
462 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
463 propout xmm6, hi, xmm14, xmm15
464
465 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
466 propout xmm7, hi, xmm15, xmm12
467
468 punpckldq xmm6, xmm7
469 movdqu [rdi], xmm6
470
471 ret
472 ENDFUNC
473
474 INTFUNC(mul4)
475 // On entry, RDI points to the destination buffer; RBX points to a
476 // packed operand X; XMM10/XMM11 hold an expanded operand Y; and
477 // XMM12, XMM13, XMM14 hold the incoming carry registers c0, c1, and
478 // c2, representing a carry-in C; c3 is assumed to be zero.
479 //
480 // On exit, we write the low 128 bits of the sum C + X Y to [RDI],
481 // and update the carry registers with the carry out. The registers
482 // XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
483 // general-purpose registers are preserved.
484 endprologue
485
486 movdqu xmm5, [rbx]
487
488 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, t
489 propout xmm6, lo, xmm12, xmm13
490
491 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
492 propout xmm7, lo, xmm13, xmm14
493
494 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
495 propout xmm6, hi, xmm14, xmm15
496
497 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
498 propout xmm7, hi, xmm15, xmm12
499
500 punpckldq xmm6, xmm7
501 movdqu [rdi], xmm6
502
503 ret
504 ENDFUNC
505
506 INTFUNC(mla4zc)
507 // On entry, RDI points to the destination buffer, which also
508 // contains an addend A to accumulate; RBX points to a packed operand
509 // X; and XMM10/XMM11 hold an expanded operand Y.
510 //
511 // On exit, we write the low 128 bits of the sum A + X Y to [RDI],
512 // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
513 // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
514 // general-purpose registers are preserved.
515 endprologue
516
517 movdqu xmm5, [rbx]
518 movd xmm12, [rdi + 0]
519 movd xmm13, [rdi + 4]
520 movd xmm14, [rdi + 8]
521 movd xmm15, [rdi + 12]
522
523 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
524 propout xmm6, lo, xmm12, xmm13
525
526 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
527 propout xmm7, lo, xmm13, xmm14
528
529 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
530 propout xmm6, hi, xmm14, xmm15
531
532 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
533 propout xmm7, hi, xmm15, xmm12
534
535 punpckldq xmm6, xmm7
536 movdqu [rdi], xmm6
537
538 ret
539 ENDFUNC
540
541 INTFUNC(mla4)
542 // On entry, RDI points to the destination buffer, which also
543 // contains an addend A to accumulate; RBX points to a packed operand
544 // X; XMM10/XMM11 holds an expanded operand Y; and XMM12, XMM13,
545 // XMM14 hold the incoming carry registers c0, c1, and c2,
546 // representing a carry-in C; c3 is assumed to be zero.
547 //
548 // On exit, we write the low 128 bits of the sum A + C + X Y to
549 // [RDI], and update the carry registers with the carry out. The
550 // registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
551 // general-purpose registers are preserved.
552 endprologue
553
554 movdqu xmm5, [rbx]
555 carryadd
556
557 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
558 propout xmm6, lo, xmm12, xmm13
559
560 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
561 propout xmm7, lo, xmm13, xmm14
562
563 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
564 propout xmm6, hi, xmm14, xmm15
565
566 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
567 propout xmm7, hi, xmm15, xmm12
568
569 punpckldq xmm6, xmm7
570 movdqu [rdi], xmm6
571
572 ret
573 ENDFUNC
574
575 INTFUNC(mmul4)
576 // On entry, RDI points to the destination buffer; RAX and RBX point
577 // to the packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold
578 // the expanded operands V and M. The stack pointer must be 8 modulo
579 // 16 (as usual for AMD64 ABIs).
580 //
581 // On exit, we store Y = U V M mod B^4 in XMM10/XMM11, and write the
582 // low 128 bits of the sum U V + N Y to [RDI], leaving the remaining
583 // carry in XMM12, XMM13, and XMM14. The registers XMM0--XMM7, and
584 // XMM15 are clobbered; the general-purpose registers are preserved.
585 movdqu xmm4, [rax]
586 #if ABI_WIN
587 stalloc 48 + 8 // space for the carries
588 #endif
589 endprologue
590
591 // Calculate W = U V, and leave it in XMM7. Stash the carry pieces
592 // for later.
593 mulcore xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
594 propout xmm7, lo, xmm12, xmm13
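	// The rest of this pass is identical to the tail of `mmla4' below, so
	// continue there (at label 5) rather than duplicating it.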
595 jmp 5f
596 ENDFUNC
597
598 INTFUNC(mmla4)
599 // On entry, RDI points to the destination buffer, which also
600 // contains an addend A to accumulate; RAX and RBX point to the
601 // packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold the
602 // expanded operands V and M. The stack pointer must be 8 modulo 16
603 // (as usual for AMD64 ABIs).
604 //
605 // On exit, we store Y = (A + U V) M mod B^4 in XMM10/XMM11, and write
606 // the low 128 bits of the sum A + U V + N Y to [RDI], leaving the
607 // remaining carry in XMM12, XMM13, and XMM14. The registers
608 // XMM0--XMM7, and XMM15 are clobbered; the general-purpose registers
609 // are preserved.
610 movdqu xmm4, [rax]
611 #if ABI_WIN
612 stalloc 48 + 8 // space for the carries
613 # define STKTMP(i) [SP + i]
614 #endif
615 #if ABI_SYSV
616 # define STKTMP(i) [SP + i - 48 - 8] // use red zone
617 #endif
618 endprologue
619
620 movd xmm12, [rdi + 0]
621 movd xmm13, [rdi + 4]
622 movd xmm14, [rdi + 8]
623 movd xmm15, [rdi + 12]
624
625 // Calculate W = A + U V, and leave it in XMM7. Stash the carry pieces
626 // for later.
627 mulacc xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
628 propout xmm7, lo, xmm12, xmm13
629
630 5: mulacc xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
631 propout xmm6, lo, xmm13, xmm14
632
633 mulacc xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
634 propout xmm7, hi, xmm14, xmm15
635
636 mulacc xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
637 propout xmm6, hi, xmm15, xmm12
638
639 // Prepare W, and stash carries for later.
640 punpckldq xmm7, xmm6
641 movdqa STKTMP( 0), xmm12
642 movdqa STKTMP(16), xmm13
643 movdqa STKTMP(32), xmm14
644
645 // Calculate Y = W M. We just about have enough spare registers to
646 // make this work.
647 mulcore xmm7, 0, xmm10, xmm11, xmm3, xmm4, xmm5, xmm6
648
649 // Start expanding W back into the main carry registers...
650 pxor xmm15, xmm15
651 movdqa xmm12, xmm7
652 movdqa xmm14, xmm7
653
654 mulcore xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2
655 accum xmm4, xmm5, xmm6
656
657 punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0)
658 punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0)
659
660 mulcore xmm7, 2, xmm10, xmm11, xmm0, xmm1
661 accum xmm5, xmm6
662
663 pxor xmm2, xmm2
664 movdqa xmm13, xmm12
665 movdqa xmm15, xmm14
666
667 mulcore xmm7, 3, xmm10, xmm11, xmm0
668 accum xmm6
669
670 punpckldq xmm12, xmm2 // (w_0, 0; 0, 0)
671 punpckldq xmm14, xmm2 // (w_2, 0; 0, 0)
672 punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0)
673 punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0)
674
675 // That's lots of pieces. Now we have to assemble the answer.
676 squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
677
678 // Expand it.
679 movdqu xmm5, [rbx]
680 expand xmm2, xmm10, xmm11
681
682 // Finish the calculation by adding the Montgomery product.
683 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
684 propout xmm6, lo, xmm12, xmm13
685
686 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
687 propout xmm7, lo, xmm13, xmm14
688
689 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
690 propout xmm6, hi, xmm14, xmm15
691
692 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
693 propout xmm7, hi, xmm15, xmm12
694
695 punpckldq xmm6, xmm7
696
697 // Add on the carry we calculated earlier.
698 paddq xmm12, STKTMP( 0)
699 paddq xmm13, STKTMP(16)
700 paddq xmm14, STKTMP(32)
701
702 // And, with that, we're done.
703 movdqu [rdi], xmm6
704 #if ABI_WIN
705 stfree 56
706 #endif
707 ret
708
709 #undef STKTMP
710
711 ENDFUNC
712
713 INTFUNC(mont4)
714 // On entry, RDI points to the destination buffer holding a packed
715 // value W; RBX points to a packed operand N; and XMM8/XMM9 hold an
716 // expanded operand M.
717 //
718 // On exit, we store Y = W M mod B^4 in XMM10/XMM11, and write the low
719 // 128 bits of the sum W + N Y to [RDI], leaving the remaining carry
720 // in XMM12, XMM13, and XMM14. The registers XMM0--XMM3, XMM5--XMM7,
721 // and XMM15 are clobbered; the general-purpose registers are
722 // preserved.
723 endprologue
724
725 movdqu xmm7, [rdi]
726
727 // Calculate Y = W M. Avoid the standard carry registers, because
728 // we're setting something else up there.
729 mulcore xmm7, 0, xmm8, xmm9, xmm3, xmm4, xmm5, xmm6
730
731 // Start expanding W back into the main carry registers...
732 pxor xmm15, xmm15
733 movdqa xmm12, xmm7
734 movdqa xmm14, xmm7
735
736 mulcore xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2
737 accum xmm4, xmm5, xmm6
738
739 punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0)
740 punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0)
741
742 mulcore xmm7, 2, xmm8, xmm9, xmm0, xmm1
743 accum xmm5, xmm6
744
745 pxor xmm2, xmm2
746 movdqa xmm13, xmm12
747 movdqa xmm15, xmm14
748
749 mulcore xmm7, 3, xmm8, xmm9, xmm0
750 accum xmm6
751
752 punpckldq xmm12, xmm2 // (w_0, 0; 0, 0)
753 punpckldq xmm14, xmm2 // (w_2, 0; 0, 0)
754 punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0)
755 punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0)
756
757 // That's lots of pieces. Now we have to assemble the answer.
758 squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
759
760 // Expand it.
761 movdqu xmm5, [rbx]
762 expand xmm2, xmm10, xmm11
763
764 // Finish the calculation by adding the Montgomery product.
765 mulacc xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
766 propout xmm6, lo, xmm12, xmm13
767
768 mulacc xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
769 propout xmm7, lo, xmm13, xmm14
770
771 mulacc xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
772 propout xmm6, hi, xmm14, xmm15
773
774 mulacc xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
775 propout xmm7, hi, xmm15, xmm12
776
777 punpckldq xmm6, xmm7
778
779 // And, with that, we're done.
780 movdqu [rdi], xmm6
781 ret
782 ENDFUNC
783
784 ///--------------------------------------------------------------------------
785 /// Bulk multipliers.
786
787 FUNC(mpx_umul4_amd64_avx)
788 .arch .avx
789 vzeroupper
790 endprologue
791 .arch pentium4
792 ENDFUNC
793
794 FUNC(mpx_umul4_amd64_sse2)
795 // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
796 // const mpw *bv, const mpw *bvl);
797
798 // Establish the arguments and do initial setup.
799 //
800 // sysv win
801 // inner loop dv rdi rdi*
802 // inner loop av rbx* rbx*
803 // outer loop dv r10 rcx
804 // outer loop bv rcx r9
805 // av base rsi rdx
806 // av limit rdx r8
807 // bv limit r8 r10
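	//
	// As a rough big-integer model (not part of the build; the names
	// here are illustrative only), the routine computes dv = av bv,
	// consuming four 32-bit words of bv per outer-loop pass:
	//
	//	def umul4(av, bv):		# operands as plain integers
	//	    z, shift, B4 = 0, 0, 1 << 128
	//	    while bv:
	//	        z += (av*(bv % B4)) << shift	# one outer-loop pass
	//	        bv >>= 128; shift += 128
	//	    return z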
808
809 #if ABI_SYSV
810 # define DV r10
811 # define AV rsi
812 # define AVL rdx
813 # define BV rcx
814 # define BVL r8
815
816 pushreg rbx
817 endprologue
818
819 mov DV, rdi
820 #endif
821
822 #if ABI_WIN
823 # define DV rcx
824 # define AV rdx
825 # define AVL r8
826 # define BV r9
827 # define BVL r10
828
829 pushreg rbx
830 pushreg rdi
831 stalloc 160 + 8
832
833 savexmm xmm6, 0
834 savexmm xmm7, 16
835 savexmm xmm8, 32
836 savexmm xmm9, 48
837 savexmm xmm10, 64
838 savexmm xmm11, 80
839 savexmm xmm12, 96
840 savexmm xmm13, 112
841 savexmm xmm14, 128
842 savexmm xmm15, 144
843
844 endprologue
845
846 mov rdi, DV
847 mov BVL, [SP + 224]
848 #endif
849
850 // Prepare for the first iteration.
851 pxor xmm0, xmm0
852 movdqu xmm10, [BV] // bv[0]
853 mov rbx, AV
854 add DV, 16
855 add BV, 16
856 expand xmm0, xmm10, xmm11
857 call mul4zc
858 add rbx, 16
859 add rdi, 16
860 cmp rbx, AVL // all done?
861 jae 8f
862
863 .p2align 4
864 // Continue with the first iteration.
865 0: call mul4
866 add rbx, 16
867 add rdi, 16
868 cmp rbx, AVL // all done?
869 jb 0b
870
871 // Write out the leftover carry. There can be no tail here.
872 8: call carryprop
873 cmp BV, BVL // more passes to do?
874 jae 9f
875
876 .p2align 4
877 // Set up for the next pass.
878 1: movdqu xmm10, [BV] // bv[i]
879 mov rdi, DV // -> dv[i]
880 pxor xmm0, xmm0
881 expand xmm0, xmm10, xmm11
882 mov rbx, AV // -> av[0]
883 add DV, 16
884 add BV, 16
885 call mla4zc
886 add rbx, 16
887 add rdi, 16
888 cmp rbx, AVL // done yet?
889 jae 8f
890
891 .p2align 4
892 // Continue...
893 0: call mla4
894 add rbx, 16
895 add rdi, 16
896 cmp rbx, AVL
897 jb 0b
898
899 // Finish off this pass. There was no tail on the previous pass, and
900 // there can be none on this pass.
901 8: call carryprop
902 cmp BV, BVL
903 jb 1b
904
905 // All over.
906 9:
907
908 #if ABI_SYSV
909 popreg rbx
910 #endif
911
912 #if ABI_WIN
913 rstrxmm xmm6, 0
914 rstrxmm xmm7, 16
915 rstrxmm xmm8, 32
916 rstrxmm xmm9, 48
917 rstrxmm xmm10, 64
918 rstrxmm xmm11, 80
919 rstrxmm xmm12, 96
920 rstrxmm xmm13, 112
921 rstrxmm xmm14, 128
922 rstrxmm xmm15, 144
923
924 stfree 160 + 8
925 popreg rdi
926 popreg rbx
927 #endif
928
929 ret
930
931 #undef DV
932 #undef AV
933 #undef AVL
934 #undef BV
935 #undef BVL
936
937 ENDFUNC
938
939 FUNC(mpxmont_mul4_amd64_avx)
940 .arch .avx
941 vzeroupper
942 endprologue
943 .arch pentium4
944 ENDFUNC
945
946 FUNC(mpxmont_mul4_amd64_sse2)
947 // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
948 // const mpw *nv, size_t n, const mpw *mi);
949
950 // Establish the arguments and do initial setup.
951 //
952 // sysv win
953 // inner loop dv rdi rdi*
954 // inner loop av rax rax
955 // inner loop nv rbx* rbx*
956 // mi r9 r10
957 // outer loop dv r10 rcx
958 // outer loop bv rdx r8
959 // av base rsi rdx
960 // av limit r11 r11
961 // bv limit r8 r12*
962 // nv base rcx r9
963 // n r8 r12*
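	//
	// As a rough big-integer model (not part of the build; the names are
	// illustrative only, and any final adjustment of the result is left
	// to the caller), the routine computes something like:
	//
	//	def mont_mul4(av, bv, nv, mi, n):  # ints; n = length in words
	//	    B4 = 1 << 128
	//	    z = 0
	//	    for i in range(0, 32*n, 128):
	//	        z += av*((bv >> i) % B4)	# z += U V for this pass
	//	        y = (z*mi) % B4			# Montgomery factor Y
	//	        z = (z + nv*y) >> 128		# add N Y; exact division by B^4
	//	    return z	# == av bv / B^n (mod nv), possibly still >= nv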
964
965 #if ABI_SYSV
966 # define DV r10
967 # define AV rsi
968 # define AVL r11
969 # define BV rdx
970 # define BVL r8
971 # define NV rcx
972 # define N r8
973 # define MI r9
974
975 pushreg rbx
976 endprologue
977
978 mov DV, rdi
979 #endif
980
981 #if ABI_WIN
982 # define DV rcx
983 # define AV rdx
984 # define AVL r11
985 # define BV r8
986 # define BVL r12
987 # define NV r9
988 # define N r12
989 # define MI r10
990
991 pushreg rbx
992 pushreg rdi
993 pushreg r12
994 stalloc 160
995
996 savexmm xmm6, 0
997 savexmm xmm7, 16
998 savexmm xmm8, 32
999 savexmm xmm9, 48
1000 savexmm xmm10, 64
1001 savexmm xmm11, 80
1002 savexmm xmm12, 96
1003 savexmm xmm13, 112
1004 savexmm xmm14, 128
1005 savexmm xmm15, 144
1006
1007 endprologue
1008
1009 mov rdi, DV
1010 mov N, [SP + 224]
1011 mov MI, [SP + 232]
1012 #endif
1013
1014 // Establish the expanded operands.
1015 pxor xmm0, xmm0
1016 movdqu xmm8, [BV] // bv[0]
1017 movdqu xmm10, [MI] // mi
1018 expand xmm0, xmm8, xmm9, xmm10, xmm11
1019
1020 // Set up the outer loop state and prepare for the first iteration.
1021 mov rax, AV // -> U = av[0]
1022 mov rbx, NV // -> X = nv[0]
1023 lea AVL, [AV + 4*N] // -> av[n/4] = av limit
1024 lea BVL, [BV + 4*N] // -> bv[n/4] = bv limit
1025 add BV, 16
1026 add DV, 16
1027 call mmul4
1028 add rdi, 16
1029 add rax, 16
1030 add rbx, 16
1031 cmp rax, AVL // done already?
1032 jae 8f
1033
1034 .p2align 4
1035 // Complete the first inner loop.
1036 0: call dmul4
1037 add rdi, 16
1038 add rax, 16
1039 add rbx, 16
1040 cmp rax, AVL // done yet?
1041 jb 0b
1042
1043 // Still have carries left to propagate.
1044 call carryprop
1045 movd [rdi + 16], xmm12
1046
1047 .p2align 4
1048 // Embark on the next iteration. (There must be one. If n = 1, then
1049 // we would have bailed above, to label 8. Similarly, the subsequent
1050 // iterations can fall into the inner loop immediately.)
1051 1: pxor xmm0, xmm0
1052 movdqu xmm8, [BV] // bv[i]
1053 movdqu xmm10, [MI] // mi
1054 mov rdi, DV // -> Z = dv[i]
1055 mov rax, AV // -> U = av[0]
1056 mov rbx, NV // -> X = nv[0]
1057 expand xmm0, xmm8, xmm9, xmm10, xmm11
1058 add BV, 16
1059 add DV, 16
1060 call mmla4
1061 add rdi, 16
1062 add rax, 16
1063 add rbx, 16
1064
1065 .p2align 4
1066 // Complete the next inner loop.
1067 0: call dmla4
1068 add rdi, 16
1069 add rax, 16
1070 add rbx, 16
1071 cmp rax, AVL
1072 jb 0b
1073
1074 // Still have carries left to propagate, and they overlap the
1075 // previous iteration's final tail, so read that in and add it.
1076 movd xmm0, [rdi]
1077 paddq xmm12, xmm0
1078 call carryprop
1079 movd [rdi + 16], xmm12
1080
1081 // Back again, maybe.
1082 cmp BV, BVL
1083 jb 1b
1084
1085 // All done.
1086 9:
1087
1088 #if ABI_SYSV
1089 popreg rbx
1090 #endif
1091
1092 #if ABI_WIN
1093 rstrxmm xmm6, 0
1094 rstrxmm xmm7, 16
1095 rstrxmm xmm8, 32
1096 rstrxmm xmm9, 48
1097 rstrxmm xmm10, 64
1098 rstrxmm xmm11, 80
1099 rstrxmm xmm12, 96
1100 rstrxmm xmm13, 112
1101 rstrxmm xmm14, 128
1102 rstrxmm xmm15, 144
1103
1104 stfree 160
1105 popreg r12
1106 popreg rdi
1107 popreg rbx
1108 #endif
1109
1110 ret
1111
1112 // First iteration was short. Write out the carries and we're done.
1113 // (This could be folded into the main loop structure, but that would
1114 // penalize small numbers more.)
1115 8: call carryprop
1116 movd [rdi + 16], xmm12
1117 #if ABI_SYSV
1118 popreg rbx
1119 ret
1120 #endif
1121 #if ABI_WIN
1122 jmp 9b
1123 #endif
1124
1125 #undef DV
1126 #undef AV
1127 #undef AVL
1128 #undef BV
1129 #undef BVL
1130 #undef NV
1131 #undef N
1132 #undef MI
1133
1134 ENDFUNC
1135
1136 FUNC(mpxmont_redc4_amd64_avx)
1137 .arch .avx
1138 vzeroupper
1139 endprologue
1140 .arch pentium4
1141 ENDFUNC
1142
1143 FUNC(mpxmont_redc4_amd64_sse2)
1144 // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
1145 // size_t n, const mpw *mi);
1146
1147 // Establish the arguments and do initial setup.
1148 //
1149 // sysv win
1150 // inner loop dv rdi rdi*
1151 // dv limit rax rax
1152 // blocks-of-4 dv limit rsi rdx
1153 // inner loop nv rbx* rbx*
1154 // mi r8 r10
1155 // outer loop dv r10 rcx
1156 // outer loop dv limit r11 r11
1157 // nv base rdx r8
1158 // nv limit r9 r12*
1159 // n rcx r9
1160 // c rcx r9
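	//
	// As a rough big-integer model (not part of the build; the names are
	// illustrative only, and any final adjustment of the result is left
	// to the caller), the routine computes something like:
	//
	//	def mont_redc4(dv, nv, n, mi):	# ints; n = length of nv in words
	//	    B4 = 1 << 128
	//	    for i in range(0, 32*n, 128):
	//	        y = ((dv >> i)*mi) % B4	# Montgomery factor for this chunk
	//	        dv += (nv*y) << i	# clears dv's 128 bits at position i
	//	    return dv >> (32*n)	# == dv B^{-n} (mod nv), maybe still >= nv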
1161
1162 #if ABI_SYSV
1163 # define DVL rax
1164 # define DVL4 rsi
1165 # define MI r8
1166 # define DV r10
1167 # define DVLO r11
1168 # define NV rdx
1169 # define NVL r9
1170 # define N rcx
1171 # define C ecx
1172
1173 pushreg rbx
1174 endprologue
1175
1176 mov DV, rdi
1177 #endif
1178
1179 #if ABI_WIN
1180 # define DVL rax
1181 # define DVL4 rdx
1182 # define MI r10
1183 # define DV rcx
1184 # define DVLO r11
1185 # define NV r8
1186 # define NVL r12
1187 # define N r9
1188 # define C r9d
1189
1190 pushreg rbx
1191 pushreg rdi
1192 pushreg r12
1193 stalloc 160
1194
1195 savexmm xmm6, 0
1196 savexmm xmm7, 16
1197 savexmm xmm8, 32
1198 savexmm xmm9, 48
1199 savexmm xmm10, 64
1200 savexmm xmm11, 80
1201 savexmm xmm12, 96
1202 savexmm xmm13, 112
1203 savexmm xmm14, 128
1204 savexmm xmm15, 144
1205
1206 endprologue
1207
1208 mov rdi, DV
1209 mov MI, [SP + 224]
1210 #endif
1211
1212 // Establish the expanded operands and the blocks-of-4 dv limit.
1213 pxor xmm0, xmm0
1214 mov DVL, DVL4 // -> dv[n] = dv limit
1215 sub DVL4, DV // length of dv in bytes
1216 movdqu xmm8, [MI] // mi
1217 and DVL4, ~15 // mask off the tail end
1218 expand xmm0, xmm8, xmm9
1219 add DVL4, DV // find limit
1220
1221 // Set up the outer loop state and prepare for the first iteration.
1222 mov rbx, NV // -> X = nv[0]
1223 lea DVLO, [DV + 4*N] // -> dv[n/4] = outer dv limit
1224 lea NVL, [NV + 4*N] // -> nv[n/4] = nv limit
1225 add DV, 16
1226 call mont4
1227 add rbx, 16
1228 add rdi, 16
1229 cmp rbx, NVL // done already?
1230 jae 8f
1231
1232 .p2align 4
1233 // Complete the first inner loop.
1234 5: call mla4
1235 add rbx, 16
1236 add rdi, 16
1237 cmp rbx, NVL // done yet?
1238 jb 5b
1239
1240 // Still have carries left to propagate.
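	// (`carryadd' leaves the top addend word a_3 in XMM15; the shifts
	// below fold it into c2's high half as a_3 b, i.e., a_3 B^3 overall,
	// so that `carryprop', which only looks at XMM12--XMM14, sees the
	// whole addend.)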
1241 8: carryadd
1242 psllq xmm15, 16
1243 pslldq xmm15, 8
1244 paddq xmm14, xmm15
1245 call carryprop
1246 movd C, xmm12
1247 add rdi, 16
1248 cmp rdi, DVL4
1249 jae 7f
1250
1251 .p2align 4
1252 // Continue carry propagation until the end of the buffer.
1253 0: add [rdi], C
1254 mov C, 0 // preserves flags
1255 adcd [rdi + 4], 0
1256 adcd [rdi + 8], 0
1257 adcd [rdi + 12], 0
1258 adc C, 0
1259 add rdi, 16
1260 cmp rdi, DVL4
1261 jb 0b
1262
1263 // Deal with the tail end.
1264 7: add [rdi], C
1265 mov C, 0 // preserves flags
1266 add rdi, 4
1267 adc C, 0
1268 cmp rdi, DVL
1269 jb 7b
1270
1271 // All done for this iteration. Start the next. (This must have at
1272 // least one follow-on iteration, or we'd not have started this outer
1273 // loop.)
1274 8: mov rdi, DV // -> Z = dv[i]
1275 mov rbx, NV // -> X = nv[0]
1276 cmp rdi, DVLO // all done yet?
1277 jae 9f
1278 add DV, 16
1279 call mont4
1280 add rdi, 16
1281 add rbx, 16
1282 jmp 5b
1283
1284 // All over.
1285 9:
1286
1287 #if ABI_SYSV
1288 popreg rbx
1289 #endif
1290
1291 #if ABI_WIN
1292 rstrxmm xmm6, 0
1293 rstrxmm xmm7, 16
1294 rstrxmm xmm8, 32
1295 rstrxmm xmm9, 48
1296 rstrxmm xmm10, 64
1297 rstrxmm xmm11, 80
1298 rstrxmm xmm12, 96
1299 rstrxmm xmm13, 112
1300 rstrxmm xmm14, 128
1301 rstrxmm xmm15, 144
1302
1303 stfree 160
1304 popreg r12
1305 popreg rdi
1306 popreg rbx
1307 #endif
1308
1309 ret
1310
1311 #undef DVL
1312 #undef DVL4
1313 #undef MI
1314 #undef DV
1315 #undef DVLO
1316 #undef NV
1317 #undef NVL
1318 #undef N
1319 #undef C
1320
1321 ENDFUNC
1322
1323 ///--------------------------------------------------------------------------
1324 /// Testing and performance measurement.
1325
1326 #ifdef TEST_MUL4
1327
1328 #if ABI_SYSV
1329 # define ARG0 rdi
1330 # define ARG1 rsi
1331 # define ARG2 rdx
1332 # define ARG3 rcx
1333 # define ARG4 r8
1334 # define ARG5 r9
1335 # define ARG6 STKARG(0)
1336 # define ARG7 STKARG(1)
1337 # define ARG8 STKARG(2)
1338 # define STKARG_OFFSET 16
1339 #endif
1340 #if ABI_WIN
1341 # define ARG0 rcx
1342 # define ARG1 rdx
1343 # define ARG2 r8
1344 # define ARG3 r9
1345 # define ARG4 STKARG(0)
1346 # define ARG5 STKARG(1)
1347 # define ARG6 STKARG(2)
1348 # define ARG7 STKARG(3)
1349 # define ARG8 STKARG(4)
1350 # define STKARG_OFFSET 224
1351 #endif
1352 #define STKARG(i) [SP + STKARG_OFFSET + 8*(i)]
1353
1354 // sysv win
1355 // dmul smul mmul mont dmul smul mmul mont
1356 // A rax
1357 // D rdx
1358 // z rdi rdi rdi rdi rdi rcx rcx rcx rcx
1359 // c rcx rsi rsi rsi rsi rdx rdx rdx rdx
1360 // y r10 -- -- rdx rdx -- -- r8 r8
1361 // u r11 rdx -- rcx -- r8 -- r9 --
1362 // x rbx rcx rdx r8 rcx r9 r8 stk0 r9
1363 // vv xmm8/9 r8 -- r9 r8 stk0 -- stk1 stk0
1364 // yy xmm10/11 r9 rcx stk0 -- stk1 r9 stk2 --
1365 // n r8 stk0 r8 stk1 r9 stk2 stk0 stk3 stk1
1366 // cyv r9 stk1 r9 stk2 stk0 stk3 stk1 stk4 stk2
1367
1368 .macro cysetup v, n
1369 rdtsc
1370 shl rdx, 32
1371 or rax, rdx
1372 mov [\v + 8*\n - 8], rax
1373 .endm
1374
1375 .macro cystore v, n
1376 rdtsc
1377 shl rdx, 32
1378 or rax, rdx
1379 sub rax, [\v + 8*\n - 8]
1380 mov [\v + 8*\n - 8], rax
1381 dec \n
1382 .endm
1383
1384 .macro testprologue mode
1385 pushreg rbx
1386 #if ABI_SYSV
1387 endprologue
1388 .ifeqs "\mode", "dmul"
1389 mov rbx, rcx
1390 movdqu xmm8, [r8]
1391 movdqu xmm10, [r9]
1392 mov r8d, STKARG(0)
1393 mov r9, STKARG(1)
1394 mov r11, rdx
1395 mov rcx, rsi
1396 .endif
1397 .ifeqs "\mode", "smul"
1398 mov rbx, rdx
1399 movdqu xmm10, [rcx]
1400 mov rcx, rsi
1401 .endif
1402 .ifeqs "\mode", "mmul"
1403 mov rax, STKARG(0)
1404 mov rbx, r8
1405 movdqu xmm8, [r9]
1406 movdqu xmm10, [rax]
1407 mov r8d, STKARG(1)
1408 mov r9, STKARG(2)
1409 mov r10, rdx
1410 mov r11, rcx
1411 mov rcx, rsi
1412 .endif
1413 .ifeqs "\mode", "mont"
1414 mov rbx, rcx
1415 movdqu xmm8, [r8]
1416 mov r8d, r9d
1417 mov r9, STKARG(0)
1418 mov r10, rdx
1419 mov rcx, rsi
1420 .endif
1421 #endif
1422 #if ABI_WIN
1423 pushreg rdi
1424 stalloc 168
1425 savexmm xmm6, 0
1426 savexmm xmm7, 16
1427 savexmm xmm8, 32
1428 savexmm xmm9, 48
1429 savexmm xmm10, 64
1430 savexmm xmm11, 80
1431 savexmm xmm12, 96
1432 savexmm xmm13, 112
1433 savexmm xmm14, 128
1434 savexmm xmm15, 144
1435 endprologue
1436 .ifeqs "\mode", "dmul"
1437 mov r10, STKARG(0)
1438 mov r11, STKARG(1)
1439 mov rdi, rcx
1440 mov rcx, rdx
1441 mov rbx, r9
1442 movdqu xmm8, [r10]
1443 movdqu xmm10, [r11]
1444 mov r11, r8
1445 mov r8d, STKARG(2)
1446 mov r9, STKARG(3)
1447 .endif
1448 .ifeqs "\mode", "smul"
1449 mov rdi, rcx
1450 mov rcx, rdx
1451 mov rbx, r8
1452 movdqu xmm10, [r9]
1453 mov r8d, STKARG(0)
1454 mov r9, STKARG(1)
1455 .endif
1456 .ifeqs "\mode", "mmul"
1457 mov r10, STKARG(1)
1458 mov r11, STKARG(2)
1459 mov rdi, rcx
1460 mov rcx, rdx
1461 mov rbx, STKARG(0)
1462 movdqu xmm8, [r10]
1463 movdqu xmm10, [r11]
1464 mov r10, r8
1465 mov r11, r9
1466 mov r8d, STKARG(3)
1467 mov r9, STKARG(4)
1468 .endif
1469 .ifeqs "\mode", "mont"
1470 mov r10, STKARG(0)
1471 mov rdi, rcx
1472 mov rcx, rdx
1473 mov rbx, r9
1474 movdqu xmm8, [r10]
1475 mov r10, r8
1476 mov r8d, STKARG(1)
1477 mov r9, STKARG(2)
1478 .endif
1479 #endif
1480
1481 pxor xmm0, xmm0
1482 .ifeqs "\mode", "dmul"
1483 expand xmm0, xmm8, xmm9, xmm10, xmm11
1484 .endif
1485 .ifeqs "\mode", "smul"
1486 expand xmm0, xmm10, xmm11
1487 .endif
1488 .ifeqs "\mode", "mmul"
1489 expand xmm0, xmm8, xmm9, xmm10, xmm11
1490 .endif
1491 .ifeqs "\mode", "mont"
1492 expand xmm0, xmm8, xmm9
1493 .endif
1494 .endm
1495
1496 .macro testepilogue
1497 #if ABI_WIN
1498 rstrxmm xmm6, 0
1499 rstrxmm xmm7, 16
1500 rstrxmm xmm8, 32
1501 rstrxmm xmm9, 48
1502 rstrxmm xmm10, 64
1503 rstrxmm xmm11, 80
1504 rstrxmm xmm12, 96
1505 rstrxmm xmm13, 112
1506 rstrxmm xmm14, 128
1507 rstrxmm xmm15, 144
1508 stfree 168
1509 popreg rdi
1510 #endif
1511 popreg rbx
1512 ret
1513 .endm
1514
1515 .macro testldcarry
1516 movdqu xmm12, [rcx + 0] // (c'_0; c''_0)
1517 movdqu xmm13, [rcx + 16] // (c'_1; c''_1)
1518 movdqu xmm14, [rcx + 32] // (c'_2; c''_2)
1519 .endm
1520
1521 .macro testtop u=nil
1522 .p2align 4
1523 0:
1524 cysetup r9, r8
1525 .ifnes "\u", "nil"
1526 mov rax, \u
1527 .endif
1528 .endm
1529
1530 .macro testtail
1531 cystore r9, r8
1532 jnz 0b
1533 .endm
1534
1535 .macro testcarryout
1536 movdqu [rcx + 0], xmm12
1537 movdqu [rcx + 16], xmm13
1538 movdqu [rcx + 32], xmm14
1539 .endm
1540
1541 FUNC(test_dmul4)
1542 testprologue dmul
1543 testldcarry
1544 testtop r11
1545 call dmul4
1546 testtail
1547 testcarryout
1548 testepilogue
1549 ENDFUNC
1550
1551 FUNC(test_dmla4)
1552 testprologue dmul
1553 testldcarry
1554 testtop r11
1555 call dmla4
1556 testtail
1557 testcarryout
1558 testepilogue
1559 ENDFUNC
1560
1561 FUNC(test_mul4)
1562 testprologue smul
1563 testldcarry
1564 testtop nil
1565 call mul4
1566 testtail
1567 testcarryout
1568 testepilogue
1569 ENDFUNC
1570
1571 FUNC(test_mul4zc)
1572 testprologue smul
1573 testldcarry
1574 testtop nil
1575 call mul4zc
1576 testtail
1577 testcarryout
1578 testepilogue
1579 ENDFUNC
1580
1581 FUNC(test_mla4)
1582 testprologue smul
1583 testldcarry
1584 testtop nil
1585 call mla4
1586 testtail
1587 testcarryout
1588 testepilogue
1589 ENDFUNC
1590
1591 FUNC(test_mla4zc)
1592 testprologue smul
1593 testldcarry
1594 testtop nil
1595 call mla4zc
1596 testtail
1597 testcarryout
1598 testepilogue
1599 ENDFUNC
1600
1601 FUNC(test_mmul4)
1602 testprologue mmul
1603 testtop r11
1604 call mmul4
1605 testtail
1606 movdqu [r10 + 0], xmm10
1607 movdqu [r10 + 16], xmm11
1608 testcarryout
1609 testepilogue
1610 ENDFUNC
1611
1612 FUNC(test_mmla4)
1613 testprologue mmul
1614 testtop r11
1615 call mmla4
1616 testtail
1617 movdqu [r10 + 0], xmm10
1618 movdqu [r10 + 16], xmm11
1619 testcarryout
1620 testepilogue
1621 ENDFUNC
1622
1623 FUNC(test_mont4)
1624 testprologue mont
1625 testtop
1626 call mont4
1627 testtail
1628 movdqu [r10 + 0], xmm10
1629 movdqu [r10 + 16], xmm11
1630 testcarryout
1631 testepilogue
1632 ENDFUNC
1633
1634 #endif
1635
1636 ///----- That's all, folks --------------------------------------------------