/// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
///
/// Large SIMD-based multiplications
///
/// (c) 2016 Straylight/Edgeware
///
///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.
///
///--------------------------------------------------------------------------

#include "asm-common.h"
///--------------------------------------------------------------------------
/// Theory.
///
/// We define a number of primitive fixed-size multipliers from which we can
/// construct more general variable-length multipliers.
///
/// The basic trick is the same throughout.  In an operand-scanning
/// multiplication, the inner multiplication loop multiplies a
/// multiple-precision operand by a single-precision factor, and adds the
/// product, appropriately shifted, to the result.  A `finely integrated
/// operand scanning' implementation of Montgomery multiplication also adds
/// the product of a single-precision `Montgomery factor' and the modulus,
/// calculated in the same pass.  The more common `coarsely integrated
/// operand scanning' alternates main multiplication and Montgomery passes,
/// which requires additional carry propagation.
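///
/// To see the Montgomery factor at work on a single digit, here is a
/// standalone C sketch (not part of this file's build; the Newton--Hensel
/// loop for the inverse is an assumption of the example, not something the
/// code below relies on):
///
///        #include <stdint.h>
///        #include <assert.h>
///
///        int main(void)
///        {
///            uint32_t n = 0x89abcdefu;   /* modulus digit; must be odd */
///            uint32_t mi = 1;            /* Newton--Hensel: mi = n^{-1} */
///            for (int i = 0; i < 5; i++) mi *= 2 - n*mi;
///            mi = -mi;                   /* negate: mi = -n^{-1} mod 2^32 */
///            uint32_t a = 0x12345678u;   /* digit we want to clear */
///            uint32_t q = a*mi;          /* the Montgomery factor */
///            uint64_t t = a + (uint64_t)q*n; /* a + q n == 0 (mod 2^32) */
///            assert((uint32_t)t == 0);   /* low digit vanishes */
///            return 0;
///        }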
/// In both the plain-multiplication and Montgomery stages, one of the
/// factors remains constant throughout the operation, so we can afford to
/// take a little time to preprocess it.  The transformation we perform is
/// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
/// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
/// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
/// operands, as follows.
///
///          0  v'_0    v'_1    v''_0   v''_1
///         16  v'_2    v'_3    v''_2   v''_3
///
/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
/// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
/// multiply such a vector by a full 32-bit scalar to produce two 48-bit
/// results in 64-bit fields.  The sixteen bits of headroom allow us to add
/// many products together before we must deal with carrying; they also allow
/// for some calculations to be performed on the above expanded form.
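///
/// As a quick check of the headroom claim, in standalone C (a sketch, not
/// part of this file's build):
///
///        #include <stdint.h>
///        #include <assert.h>
///
///        int main(void)
///        {
///            uint32_t r = 0xffffffffu;   /* worst-case 32-bit factor */
///            uint32_t v = 0x89abcdefu;   /* arbitrary piece v_i */
///            uint32_t vlo = v & 0xffffu; /* v'_i */
///            uint32_t vhi = v >> 16;     /* v''_i */
///            uint64_t plo = (uint64_t)r*vlo, phi = (uint64_t)r*vhi;
///            assert(!(plo >> 48) && !(phi >> 48));   /* 16 bits spare */
///            assert(plo + (phi << 16) == (uint64_t)r*v); /* v = v' + v'' b */
///            return 0;
///        }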
/// We maintain four `carry' registers accumulating intermediate results.
/// The registers' precise roles rotate during the computation; we name them
/// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
/// the register c0, for example, holds c'_0 (low half) and c''_0 (high
/// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
/// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
/// `pmuludq' instruction acting on a scalar operand (broadcast across all
/// lanes of its vector) and an operand in the expanded form above produces a
/// result which can be added directly to the appropriate carry register.
/// Following a pass of four multiplications, we perform some limited carry
/// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
/// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
/// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
/// zeroed and becomes c3.
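///
/// In scalar terms, one such propagation step looks like this (a standalone
/// C sketch; the concrete values are arbitrary 48-bit-ish halves):
///
///        #include <stdint.h>
///        #include <assert.h>
///
///        int main(void)
///        {
///            uint64_t c0lo = 0x000123456789abcdull;  /* c'_0 */
///            uint64_t c0hi = 0x0000fedcba987654ull;  /* c''_0 */
///            uint64_t c1lo = 42, c1hi = 17;          /* c'_1, c''_1 */
///            uint64_t t = c0hi & 0xffffffffu;        /* c''_0 mod B */
///            uint64_t d = c0lo + (t << 16);          /* c'_0 + t b */
///            uint32_t z = (uint32_t)d;               /* output word */
///
///            c1lo += d >> 32;                        /* floor(d/B) */
///            c1hi += c0hi >> 32;                     /* floor(c''_0/B) */
///
///            /* check against arithmetic on the represented values */
///            uint64_t c0 = c0lo + (c0hi << 16);
///            assert(z == (uint32_t)c0);
///            assert(c1lo + (c1hi << 16) ==
///                   42 + (17ull << 16) + (c0 >> 32));
///            return 0;
///        }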
///--------------------------------------------------------------------------
/// Macro definitions.
.macro  mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
        // Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
        // of the product in registers D0, D1, D2, D3.
        pshufd  \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?)
  .ifnes "\d1", "nil"
        movdqa  \d1, \slo               // (s'_0, s'_1; s''_0, s''_1)
  .endif
  .ifnes "\d3", "nil"
        movdqa  \d3, \shi               // (s'_2, s'_3; s''_2, s''_3)
  .endif
  .ifnes "\d1", "nil"
        psrldq  \d1, 4                  // (s'_1, s''_0; s''_1, 0)
  .endif
  .ifnes "\d2", "nil"
        movdqa  \d2, \d0                // another copy of (r_i, ?; r_i, ?)
  .endif
  .ifnes "\d3", "nil"
        psrldq  \d3, 4                  // (s'_3, s''_2; s''_3, 0)
  .endif
  .ifnes "\d1", "nil"
        pmuludq \d1, \d0                // (r_i s'_1; r_i s''_1)
  .endif
  .ifnes "\d3", "nil"
        pmuludq \d3, \d0                // (r_i s'_3; r_i s''_3)
  .endif
  .ifnes "\d2", "nil"
        pmuludq \d2, \shi               // (r_i s'_2; r_i s''_2)
  .endif
        pmuludq \d0, \slo               // (r_i s'_0; r_i s''_0)
.endm
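
/// For reference, the same dance in SSE2 intrinsics (a sketch: R_I arrives
/// ready-broadcast rather than extracted with `pshufd', SLO/SHI are the two
/// expanded halves, and the function name is illustrative, not Catacomb's):
///
///        #include <emmintrin.h>
///        #include <stdint.h>
///        #include <assert.h>
///
///        static void mulcore_c(uint32_t r_i, __m128i slo, __m128i shi,
///                              __m128i d[4])
///        {
///            __m128i r = _mm_set1_epi32((int)r_i);
///            d[0] = _mm_mul_epu32(r, slo);   /* (r_i s'_0; r_i s''_0) */
///            d[1] = _mm_mul_epu32(r, _mm_srli_si128(slo, 4));
///                                            /* (r_i s'_1; r_i s''_1) */
///            d[2] = _mm_mul_epu32(r, shi);   /* (r_i s'_2; r_i s''_2) */
///            d[3] = _mm_mul_epu32(r, _mm_srli_si128(shi, 4));
///                                            /* (r_i s'_3; r_i s''_3) */
///        }
///
///        int main(void)
///        {
///            __m128i slo = _mm_setr_epi32(0x2222, 0x4444, 0x1111, 0x3333);
///            __m128i shi = _mm_setr_epi32(0x6666, 0x8888, 0x5555, 0x7777);
///            __m128i d[4];
///            uint64_t p[2];
///            mulcore_c(3, slo, shi, d);
///            _mm_storeu_si128((__m128i *)p, d[0]);
///            assert(p[0] == 3*0x2222 && p[1] == 3*0x1111);
///            return 0;
///        }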
.macro  accum   c0, c1=nil, c2=nil, c3=nil
        // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
        // carry registers C0--C3.  Any or all of C1--C3 may be `nil' to skip
        // updating that register.
        paddq   \c0, xmm0
  .ifnes "\c1", "nil"
        paddq   \c1, xmm1
  .endif
  .ifnes "\c2", "nil"
        paddq   \c2, xmm2
  .endif
  .ifnes "\c3", "nil"
        paddq   \c3, xmm3
  .endif
.endm
.macro  mulacc  r, i, slo, shi, c0=nil, c1=nil, c2=nil, c3=nil, z3p=nil
        // Multiply R_I by the expanded operand SLO/SHI, and accumulate in
        // carry registers C0, C1, C2, C3.  If Z3P is `t' then C3 notionally
        // contains zero, but needs clearing; in practice, we store the
        // product directly rather than attempting to add.  On completion,
        // XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P is not `t'.
  .ifeqs "\z3p", "t"
        mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, \c3
        accum   \c0, \c1, \c2
  .else
        mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, xmm3
        accum   \c0, \c1, \c2, \c3
  .endif
.endm
.macro  propout d, pos, c, cc=nil
        // Calculate an output word from C, and store it at POS in D;
        // propagate carries out from C to CC in preparation for a rotation
        // of the carry registers.  D is an XMM register; the POS is either
        // `lo' or `hi' according to whether the output word should be in
        // lane 0 or 1 of D; the high two lanes of D are clobbered.  On
        // completion, XMM3 is clobbered.  If CC is `nil', then the
        // contribution which would have been added to it is left in C.
        pshufd  xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
        psrldq  xmm3, 12                // (t, 0; 0, 0) = (t; 0)
        pslldq  xmm3, 2                 // (t b; 0)
        paddq   \c, xmm3                // (c' + t b; c'')
  .ifeqs "\pos", "lo"
        movdqa  \d, \c                  // copy the word into lane 0
  .else
        punpckldq \d, \c                // splice the word into lane 1
  .endif
        psrlq   \c, 32                  // floor(c/B)
  .ifnes "\cc", "nil"
        paddq   \cc, \c                 // propagate up
  .endif
.endm
.macro  endprop d, pos, c, t
        // On entry, C contains a carry register.  On exit, the low 32 bits
        // of the value represented in C are written at POS in D, and the
        // remaining bits are left at the bottom of T.
        movdqa  \t, \c                  // (c'; c'')
        psllq   \t, 16                  // (?; c'' b)
        pslldq  \c, 8                   // (0; c')
        paddq   \t, \c                  // (?; c' + c'' b)
        psrldq  \t, 8                   // (c' + c'' b; 0) = (c; 0)
  .ifeqs "\pos", "lo"
        movdqa  \d, \t                  // copy the word into lane 0
  .else
        punpckldq \d, \t                // splice the word into lane 1
  .endif
        psrldq  \t, 4                   // (floor(c/B); 0)
.endm
.macro  expand  z, a, b, c=nil, d=nil
        // On entry, A and C hold packed 128-bit values, and Z is zero.  On
        // exit, A:B and C:D together hold the same values in expanded
        // form.  If C is `nil', then only expand A to A:B.
        movdqa  \b, \a                  // (a_0, a_1; a_2, a_3)
  .ifnes "\c", "nil"
        movdqa  \d, \c                  // (c_0, c_1; c_2, c_3)
  .endif
        punpcklwd \a, \z                // (a'_0, a''_0; a'_1, a''_1)
        punpckhwd \b, \z                // (a'_2, a''_2; a'_3, a''_3)
  .ifnes "\c", "nil"
        punpcklwd \c, \z                // (c'_0, c''_0; c'_1, c''_1)
        punpckhwd \d, \z                // (c'_2, c''_2; c'_3, c''_3)
  .endif
        pshufd  \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
        pshufd  \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
  .ifnes "\c", "nil"
        pshufd  \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
        pshufd  \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
  .endif
.endm
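
/// The same transformation in SSE2 intrinsics, for a single operand (a
/// standalone sketch; the helper name is illustrative):
///
///        #include <emmintrin.h>
///        #include <stdint.h>
///        #include <assert.h>
///
///        static void expand1(__m128i a, __m128i *lo, __m128i *hi)
///        {
///            __m128i z = _mm_setzero_si128();
///            __m128i l = _mm_unpacklo_epi16(a, z);
///            __m128i h = _mm_unpackhi_epi16(a, z);
///            /* (a'_0, a''_0; a'_1, a''_1) -> (a'_0, a'_1; a''_0, a''_1) */
///            *lo = _mm_shuffle_epi32(l, _MM_SHUFFLE(3, 1, 2, 0));
///            /* (a'_2, a''_2; a'_3, a''_3) -> (a'_2, a'_3; a''_2, a''_3) */
///            *hi = _mm_shuffle_epi32(h, _MM_SHUFFLE(3, 1, 2, 0));
///        }
///
///        int main(void)
///        {
///            uint32_t a[4] = { 0x11112222u, 0x33334444u,
///                              0x55556666u, 0x77778888u };
///            uint32_t lo[4], hi[4];
///            __m128i l, h;
///            expand1(_mm_loadu_si128((const __m128i *)a), &l, &h);
///            _mm_storeu_si128((__m128i *)lo, l);
///            _mm_storeu_si128((__m128i *)hi, h);
///            assert(lo[0] == 0x2222 && lo[1] == 0x4444 &&
///                   lo[2] == 0x1111 && lo[3] == 0x3333);
///            assert(hi[0] == 0x6666 && hi[1] == 0x8888 &&
///                   hi[2] == 0x5555 && hi[3] == 0x7777);
///            return 0;
///        }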
.macro  squash  c0, c1, c2, c3, t, u, lo, hi=nil
        // On entry, C0, C1, C2, C3 are carry registers representing a value
        // Y.  On exit, LO holds the low 128 bits of the carry value; C1, C2,
        // C3, T, and U are clobbered; and the high bits of Y are stored in
        // HI, if this is not `nil'.

        // The first step is to eliminate the `double-prime' pieces -- i.e.,
        // the ones offset by 16 bits from a 32-bit boundary -- by carrying
        // them into the 32-bit-aligned pieces above and below.  But before
        // we can do that, we must gather them together.
        movdqa  \t, \c0
        movdqa  \u, \c1
        punpcklqdq \t, \c2              // (y'_0; y'_2)
        punpckhqdq \c0, \c2             // (y''_0; y''_2)
        punpcklqdq \u, \c3              // (y'_1; y'_3)
        punpckhqdq \c1, \c3             // (y''_1; y''_3)

        // Now split the double-prime pieces.  The high (up to) 48 bits will
        // go up; the low 16 bits go down.
        movdqa  \c2, \c0
        movdqa  \c3, \c1
        psllq   \c2, 48
        psllq   \c3, 48
        psrlq   \c0, 16                 // high parts of (y''_0; y''_2)
        psrlq   \c1, 16                 // high parts of (y''_1; y''_3)
        psrlq   \c2, 32                 // low parts of (y''_0; y''_2)
        psrlq   \c3, 32                 // low parts of (y''_1; y''_3)
  .ifnes "\hi", "nil"
        movdqa  \hi, \c1
  .endif
        pslldq  \c1, 8                  // high part of (0; y''_1)

        paddq   \t, \c2                 // propagate down
        paddq   \u, \c3
        paddq   \t, \c1                 // and up: (y_0; y_2)
        paddq   \u, \c0                 // (y_1; y_3)
  .ifnes "\hi", "nil"
        psrldq  \hi, 8                  // high part of (y''_3; 0)
  .endif
        // Finally extract the answer.  This complicated dance is better than
        // storing to memory and loading, because the piecemeal stores
        // inhibit store forwarding.
        movdqa  \c3, \t                 // (y_0; ?)
        movdqa  \lo, \t                 // (y^*_0, ?; ?, ?)
        psrldq  \t, 8                   // (y_2; 0)
        psrlq   \c3, 32                 // (floor(y_0/B); ?)
        paddq   \c3, \u                 // (y_1 + floor(y_0/B); ?)
        movdqa  \c1, \c3                // (y^*_1, ?; ?, ?)
        psrldq  \u, 8                   // (y_3; 0)
        psrlq   \c3, 32                 // (floor((y_1 B + y_0)/B^2); ?)
        paddq   \c3, \t                 // (y_2 + floor((y_1 B + y_0)/B^2); ?)
        punpckldq \lo, \c3              // (y^*_0, y^*_2; ?, ?)
        psrlq   \c3, 32                 // (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
        paddq   \c3, \u                 // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
  .ifnes "\hi", "nil"
        movdqa  \t, \c3
        pxor    \u, \u
  .endif
        punpckldq \c1, \c3              // (y^*_1, y^*_3; ?, ?)
  .ifnes "\hi", "nil"
        psrlq   \t, 32                  // very high bits of y
        paddq   \hi, \t
        punpcklqdq \hi, \u              // carry up
  .endif
        punpckldq \lo, \c1              // y mod B^4
.endm
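
/// In scalar terms, `squash' amounts to the following (a standalone C
/// sketch over the (lo, hi) pair representation, where each carry register
/// has value lo + hi*2^16; the names are illustrative):
///
///        #include <stdint.h>
///        #include <assert.h>
///
///        struct creg { uint64_t lo, hi; };   /* value = lo + hi*2^16 */
///
///        static uint64_t squash_c(uint32_t out[4], const struct creg c[4])
///        {
///            uint64_t acc = 0, up = 0;
///            for (int j = 0; j < 4; j++) {
///                /* double-prime piece: low 16 bits go down, ... */
///                acc += c[j].lo + ((c[j].hi & 0xffffu) << 16) + up;
///                up = c[j].hi >> 16;     /* ... the rest goes up */
///                out[j] = (uint32_t)acc; /* y^*_j */
///                acc >>= 32;
///            }
///            return acc + up;            /* high bits of Y */
///        }
///
///        int main(void)
///        {
///            struct creg c[4] = { { 5, 7 }, { 1, 2 }, { 3, 0 }, { 0, 0 } };
///            uint32_t out[4];
///            uint64_t hi = squash_c(out, c);
///            assert(out[0] == 5 + (7u << 16)); /* y_0 = y'_0 + y''_0 b */
///            assert(out[1] == 1 + (2u << 16) && out[2] == 3 && !out[3]);
///            assert(!hi);
///            return 0;
///        }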
        // On entry, RDI points to a packed addend A, and XMM12, XMM13, and
        // XMM14 hold the incoming carry registers c0, c1, and c2,
        // representing a carry-in C.
        //
        // On exit, the carry registers, including XMM15, are updated to hold
        // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered.  The other
        // registers are preserved.
        movd    xmm0, [rdi +  0]        // (a_0; 0)
        movd    xmm1, [rdi +  4]        // (a_1; 0)
        movd    xmm2, [rdi +  8]        // (a_2; 0)
        movd    xmm15, [rdi + 12]       // (a_3; 0)
        paddq   xmm12, xmm0             // (c'_0 + a_0; c''_0)
        paddq   xmm13, xmm1             // (c'_1 + a_1; c''_1)
        paddq   xmm14, xmm2             // (c'_2 + a_2; c''_2)
///--------------------------------------------------------------------------
/// Primitive multipliers and related utilities.

        // On entry, XMM12, XMM13, and XMM14 hold a 144-bit carry in an
        // expanded form.  Store the low 128 bits of the represented carry to
        // [RDI] as a packed 128-bit value, and leave the remaining 16 bits
        // in the low 32 bits of XMM12.  On exit, XMM0, XMM1, XMM3, XMM13,
        // and XMM14 are clobbered.
        propout xmm0, lo, xmm12, xmm13
        propout xmm1, lo, xmm13, xmm14
        propout xmm0, hi, xmm14, nil
        endprop xmm1, hi, xmm14, xmm12
        // On entry, RDI points to the destination buffer; RAX and RBX point
        // to the packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
        // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
        // incoming carry registers c0, c1, and c2; c3 is assumed to be zero.
        //
        // On exit, we write the low 128 bits of the sum C + U V + X Y to
        // [RDI], and update the carry registers with the carry out.  The
        // registers XMM0--XMM7 and XMM15 are clobbered; the general-purpose
        // registers are preserved.

        mulacc  xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15, t
        mulacc  xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
        propout xmm6, lo, xmm12, xmm13

        mulacc  xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
        mulacc  xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
        propout xmm7, lo, xmm13, xmm14

        mulacc  xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
        mulacc  xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
        propout xmm6, hi, xmm14, xmm15

        mulacc  xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
        mulacc  xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
        propout xmm7, hi, xmm15, xmm12
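
/// Putting the macros together, this step amounts to the following scalar
/// model (standalone C; the helper names are illustrative, and an array
/// stands in for the register rotation):
///
///        #include <stdint.h>
///        #include <assert.h>
///
///        struct creg { uint64_t lo, hi; };   /* value = lo + hi*2^16 */
///
///        static uint32_t propout_c(struct creg *c, struct creg *cc)
///        {
///            uint64_t t = c->hi & 0xffffffffu, d = c->lo + (t << 16);
///            cc->lo += d >> 32; cc->hi += c->hi >> 32;
///            return (uint32_t)d;             /* output word */
///        }
///
///        static void mulacc_c(uint32_t r, const uint32_t s[4],
///                             struct creg c[4])
///        {
///            for (int j = 0; j < 4; j++) {
///                c[j].lo += (uint64_t)r*(s[j] & 0xffffu); /* r s'_j */
///                c[j].hi += (uint64_t)r*(s[j] >> 16);     /* r s''_j */
///            }
///        }
///
///        static void dmul4_c(uint32_t z[4],
///                            const uint32_t u[4], const uint32_t x[4],
///                            const uint32_t v[4], const uint32_t y[4],
///                            struct creg c[4])  /* c[3] zero on entry */
///        {
///            for (int i = 0; i < 4; i++) {
///                mulacc_c(u[i], v, c);
///                mulacc_c(x[i], y, c);
///                z[i] = propout_c(&c[0], &c[1]);
///                c[0] = c[1]; c[1] = c[2]; c[2] = c[3]; /* rotate */
///                c[3].lo = c[3].hi = 0;
///            }
///        }
///
///        int main(void)
///        {
///            uint32_t u[4] = { 1, 2, 3, 4 }, x[4] = { 5, 6, 7, 8 };
///            uint32_t v[4] = { ~0u, ~0u, ~0u, ~0u };
///            uint32_t y[4] = { 0x0f0f0f0fu, 0, ~0u, 0x12345678u };
///            struct creg c[4] = { { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } };
///            uint32_t z[4];
///            dmul4_c(z, u, x, v, y, c);
///            /* spot-check the low digit: (u_0 v_0 + x_0 y_0) mod B */
///            assert(z[0] == (uint32_t)((uint64_t)u[0]*v[0] +
///                                      (uint64_t)x[0]*y[0]));
///            return 0;
///        }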
        // On entry, RDI points to the destination buffer, which also
        // contains an addend A to accumulate; RAX and RBX point to the
        // packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
        // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
        // incoming carry registers c0, c1, and c2 representing a carry-in C;
        // c3 is assumed to be zero.
        //
        // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
        // [RDI], and update the carry registers with the carry out.  The
        // registers XMM0--XMM7 and XMM15 are clobbered; the general-purpose
        // registers are preserved.

        mulacc  xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
        mulacc  xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
        propout xmm6, lo, xmm12, xmm13

        mulacc  xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
        mulacc  xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
        propout xmm7, lo, xmm13, xmm14

        mulacc  xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
        mulacc  xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
        propout xmm6, hi, xmm14, xmm15

        mulacc  xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
        mulacc  xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
        propout xmm7, hi, xmm15, xmm12
        // On entry, RDI points to the destination buffer; RBX points to a
        // packed operand X; and XMM10/XMM11 hold an expanded operand Y.
        //
        // On exit, we write the low 128 bits of the product X Y to [RDI],
        // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
        // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
        // general-purpose registers are preserved.

        mulcore xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
        propout xmm6, lo, xmm12, xmm13

        mulacc  xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
        propout xmm7, lo, xmm13, xmm14

        mulacc  xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
        propout xmm6, hi, xmm14, xmm15

        mulacc  xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
        propout xmm7, hi, xmm15, xmm12
        // On entry, RDI points to the destination buffer; RBX points to a
        // packed operand X; XMM10/XMM11 hold an expanded operand Y; and
        // XMM12, XMM13, XMM14 hold the incoming carry registers c0, c1, and
        // c2, representing a carry-in C; c3 is assumed to be zero.
        //
        // On exit, we write the low 128 bits of the sum C + X Y to [RDI],
        // and update the carry registers with the carry out.  The registers
        // XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
        // general-purpose registers are preserved.

        mulacc  xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, t
        propout xmm6, lo, xmm12, xmm13

        mulacc  xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
        propout xmm7, lo, xmm13, xmm14

        mulacc  xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
        propout xmm6, hi, xmm14, xmm15

        mulacc  xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
        propout xmm7, hi, xmm15, xmm12
        // On entry, RDI points to the destination buffer, which also
        // contains an addend A to accumulate; RBX points to a packed operand
        // X; and XMM10/XMM11 hold an expanded operand Y.
        //
        // On exit, we write the low 128 bits of the sum A + X Y to [RDI],
        // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
        // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
        // general-purpose registers are preserved.

        movd    xmm12, [rdi +  0]
        movd    xmm13, [rdi +  4]
        movd    xmm14, [rdi +  8]
        movd    xmm15, [rdi + 12]

        mulacc  xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
        propout xmm6, lo, xmm12, xmm13

        mulacc  xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
        propout xmm7, lo, xmm13, xmm14

        mulacc  xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
        propout xmm6, hi, xmm14, xmm15

        mulacc  xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
        propout xmm7, hi, xmm15, xmm12
        // On entry, RDI points to the destination buffer, which also
        // contains an addend A to accumulate; RBX points to a packed operand
        // X; XMM10/XMM11 hold an expanded operand Y; and XMM12, XMM13,
        // XMM14 hold the incoming carry registers c0, c1, and c2,
        // representing a carry-in C; c3 is assumed to be zero.
        //
        // On exit, we write the low 128 bits of the sum A + C + X Y to
        // [RDI], and update the carry registers with the carry out.  The
        // registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
        // general-purpose registers are preserved.

        mulacc  xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
        propout xmm6, lo, xmm12, xmm13

        mulacc  xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
        propout xmm7, lo, xmm13, xmm14

        mulacc  xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
        propout xmm6, hi, xmm14, xmm15

        mulacc  xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
        propout xmm7, hi, xmm15, xmm12
        // On entry, RDI points to the destination buffer; RAX and RBX point
        // to the packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold
        // the expanded operands V and M.  The stack pointer must be 8 modulo
        // 16 (as usual for AMD64 ABIs).
        //
        // On exit, we store Y = U V M mod B^4 in XMM10/XMM11, and write the
        // low 128 bits of the sum U V + N Y to [RDI], leaving the remaining
        // carry in XMM12, XMM13, and XMM14.  The registers XMM0--XMM7 and
        // XMM15 are clobbered; the general-purpose registers are preserved.
        stalloc 48 + 8                  // space for the carries

        // Calculate W = U V, and leave it in XMM7.  Stash the carry pieces
        // for later.
        mulcore xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
        propout xmm7, lo, xmm12, xmm13
        // On entry, RDI points to the destination buffer, which also
        // contains an addend A to accumulate; RAX and RBX point to the
        // packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold the
        // expanded operands V and M.  The stack pointer must be 8 modulo 16
        // (as usual for AMD64 ABIs).
        //
        // On exit, we store Y = (A + U V) M mod B^4 in XMM10/XMM11, and
        // write the low 128 bits of the sum A + U V + N Y to [RDI], leaving
        // the remaining carry in XMM12, XMM13, and XMM14.  The registers
        // XMM0--XMM7 and XMM15 are clobbered; the general-purpose registers
        // are preserved.
#if ABI_WIN
        stalloc 48 + 8                  // space for the carries
#  define STKTMP(i) [rsp + i]
#else
#  define STKTMP(i) [rsp + i - 48 - 8]  // use red zone
#endif
        movd    xmm12, [rdi +  0]
        movd    xmm13, [rdi +  4]
        movd    xmm14, [rdi +  8]
        movd    xmm15, [rdi + 12]

        // Calculate W = U V, and leave it in XMM7.  Stash the carry pieces
        // for later.
        mulacc  xmm4, 0, xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
        propout xmm7, lo, xmm12, xmm13

5:      mulacc  xmm4, 1, xmm8, xmm9, xmm13, xmm14, xmm15, xmm12, t
        propout xmm6, lo, xmm13, xmm14

        mulacc  xmm4, 2, xmm8, xmm9, xmm14, xmm15, xmm12, xmm13, t
        propout xmm7, hi, xmm14, xmm15

        mulacc  xmm4, 3, xmm8, xmm9, xmm15, xmm12, xmm13, xmm14, t
        propout xmm6, hi, xmm15, xmm12

        // Prepare W, and stash carries for later.
        punpckldq xmm7, xmm6            // (w_0, w_1; w_2, w_3)
        movdqa  STKTMP( 0), xmm12
        movdqa  STKTMP(16), xmm13
        movdqa  STKTMP(32), xmm14
        // Calculate Y = W M.  We just about have enough spare registers to
        // make this work.
        mulcore xmm7, 0, xmm10, xmm11, xmm3, xmm4, xmm5, xmm6

        // Start expanding W back into the main carry registers...
        pxor    xmm15, xmm15
        movdqa  xmm12, xmm7
        movdqa  xmm14, xmm7

        mulcore xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2
        accum   xmm4, xmm5, xmm6

        punpckldq xmm12, xmm15          // (w_0, 0; w_1, 0)
        punpckhdq xmm14, xmm15          // (w_2, 0; w_3, 0)

        mulcore xmm7, 2, xmm10, xmm11, xmm0, xmm1
        accum   xmm5, xmm6

        mulcore xmm7, 3, xmm10, xmm11, xmm0
        accum   xmm6

        punpckldq xmm12, xmm2           // (w_0, 0; 0, 0)
        punpckldq xmm14, xmm2           // (w_2, 0; 0, 0)
        punpckhdq xmm13, xmm2           // (w_1, 0; 0, 0)
        punpckhdq xmm15, xmm2           // (w_3, 0; 0, 0)

        // That's lots of pieces.  Now we have to assemble the answer.
        squash  xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10

        expand  xmm2, xmm10, xmm11
        // Finish the calculation by adding the Montgomery product.
        mulacc  xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
        propout xmm6, lo, xmm12, xmm13

        mulacc  xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
        propout xmm7, lo, xmm13, xmm14

        mulacc  xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
        propout xmm6, hi, xmm14, xmm15

        mulacc  xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
        propout xmm7, hi, xmm15, xmm12

        // Add on the carry we calculated earlier.
        paddq   xmm12, STKTMP( 0)
        paddq   xmm13, STKTMP(16)
        paddq   xmm14, STKTMP(32)

        // And, with that, we're done.
        // On entry, RDI points to the destination buffer holding a packed
        // value W; RBX points to a packed operand N; and XMM8/XMM9 hold an
        // expanded operand M.
        //
        // On exit, we store Y = W M mod B^4 in XMM10/XMM11, and write the
        // low 128 bits of the sum W + N Y to [RDI], leaving the remaining
        // carry in XMM12, XMM13, and XMM14.  The registers XMM0--XMM3,
        // XMM5--XMM7, and XMM15 are clobbered; the general-purpose registers
        // are preserved.

        // Calculate Y = W M.  Avoid the standard carry registers, because
        // we're setting something else up there.
        mulcore xmm7, 0, xmm8, xmm9, xmm3, xmm4, xmm5, xmm6

        // Start expanding W back into the main carry registers...
        pxor    xmm15, xmm15
        movdqa  xmm12, xmm7
        movdqa  xmm14, xmm7

        mulcore xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2
        accum   xmm4, xmm5, xmm6

        punpckldq xmm12, xmm15          // (w_0, 0; w_1, 0)
        punpckhdq xmm14, xmm15          // (w_2, 0; w_3, 0)

        mulcore xmm7, 2, xmm8, xmm9, xmm0, xmm1
        accum   xmm5, xmm6

        mulcore xmm7, 3, xmm8, xmm9, xmm0
        accum   xmm6

        punpckldq xmm12, xmm2           // (w_0, 0; 0, 0)
        punpckldq xmm14, xmm2           // (w_2, 0; 0, 0)
        punpckhdq xmm13, xmm2           // (w_1, 0; 0, 0)
        punpckhdq xmm15, xmm2           // (w_3, 0; 0, 0)

        // That's lots of pieces.  Now we have to assemble the answer.
        squash  xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10

        expand  xmm2, xmm10, xmm11

        // Finish the calculation by adding the Montgomery product.
        mulacc  xmm5, 0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
        propout xmm6, lo, xmm12, xmm13

        mulacc  xmm5, 1, xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
        propout xmm7, lo, xmm13, xmm14

        mulacc  xmm5, 2, xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
        propout xmm6, hi, xmm14, xmm15

        mulacc  xmm5, 3, xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
        propout xmm7, hi, xmm15, xmm12

        // And, with that, we're done.
///--------------------------------------------------------------------------
/// Bulk multipliers.

FUNC(mpx_umul4_amd64_avx)

FUNC(mpx_umul4_amd64_sse2)
        // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
        //                           const mpw *bv, const mpw *bvl);

        // Establish the arguments and do initial setup.
        //
        //                              sysv    win
        // inner loop dv                rdi     rdi*
        // inner loop av                rbx*    rbx*
        // outer loop dv                r10     rcx
        // outer loop bv                rcx     r9
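
        // For orientation, this is what the routine computes, written as a
        // plain operand-scanning reference in C (a sketch: scalar and one
        // word at a time, where the code below does four words per pass;
        // the function name is illustrative, not part of Catacomb):
        //
        //      #include <stddef.h>
        //      #include <stdint.h>
        //
        //      static void umul_ref(uint32_t *dv, const uint32_t *av,
        //                           size_t n, const uint32_t *bv, size_t m)
        //      {
        //          uint64_t c;
        //          size_t i, j;
        //
        //          for (i = 0; i < n + m; i++) dv[i] = 0;
        //          for (j = 0; j < m; j++) {   /* one pass per word of bv */
        //              for (i = 0, c = 0; i < n; i++) {
        //                  c += dv[i + j] + (uint64_t)av[i]*bv[j];
        //                  dv[i + j] = (uint32_t)c;
        //                  c >>= 32;
        //              }
        //              dv[j + n] = (uint32_t)c; /* leftover carry */
        //          }
        //      }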
        // Prepare for the first iteration.
        pxor    xmm0, xmm0
        movdqu  xmm10, [BV]             // bv[0]
        expand  xmm0, xmm10, xmm11

        cmp     rbx, AVL                // all done?

        // Continue with the first iteration.
        cmp     rbx, AVL                // all done?

        // Write out the leftover carry.  There can be no tail here.
        cmp     BV, BVL                 // more passes to do?

        // Set up for the next pass.
1:      movdqu  xmm10, [BV]             // bv[i]
        mov     rdi, DV                 // -> dv[i]
        pxor    xmm0, xmm0
        expand  xmm0, xmm10, xmm11
        mov     rbx, AV                 // -> av[0]

        cmp     rbx, AVL                // done yet?

        // Finish off this pass.  There was no tail on the previous pass, and
        // there can be none on this pass.
FUNC(mpxmont_mul4_amd64_avx)

FUNC(mpxmont_mul4_amd64_sse2)
        // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
        //                              const mpw *nv, size_t n,
        //                              const mpw *mi);

        // Establish the arguments and do initial setup.
        //
        //                              sysv    win
        // inner loop dv                rdi     rdi*
        // inner loop av                rax     rax
        // inner loop nv                rbx*    rbx*
        // outer loop dv                r10     rcx
        // outer loop bv                rdx     r8
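
        // The `coarsely integrated operand scanning' structure described in
        // the notes above, as a plain scalar C reference (a sketch: one
        // word at a time, with no final conditional subtraction; ZV is an
        // (n + 2)-word working buffer, initially zero, and MI is
        // -NV[0]^{-1} mod B; the names are illustrative):
        //
        //      #include <stddef.h>
        //      #include <stdint.h>
        //
        //      static void mont_ref(uint32_t *zv, const uint32_t *av,
        //                           const uint32_t *bv, const uint32_t *nv,
        //                           size_t n, uint32_t mi)
        //      {
        //          uint64_t c;
        //          uint32_t y;
        //          size_t i, j;
        //
        //          for (j = 0; j < n; j++) {
        //              /* main multiplication pass: zv += av bv[j] */
        //              for (i = 0, c = 0; i < n; i++) {
        //                  c += zv[i] + (uint64_t)av[i]*bv[j];
        //                  zv[i] = (uint32_t)c; c >>= 32;
        //              }
        //              c += zv[n]; zv[n] = (uint32_t)c;
        //              zv[n + 1] += (uint32_t)(c >> 32);
        //
        //              /* Montgomery pass: y clears the bottom word */
        //              y = zv[0]*mi;
        //              for (i = 0, c = 0; i < n; i++) {
        //                  c += zv[i] + (uint64_t)nv[i]*y;
        //                  zv[i] = (uint32_t)c; c >>= 32;
        //              }
        //              c += zv[n]; zv[n] = (uint32_t)c;
        //              zv[n + 1] += (uint32_t)(c >> 32);
        //
        //              /* divide by B: shift everything down a word */
        //              for (i = 0; i <= n; i++) zv[i] = zv[i + 1];
        //              zv[n + 1] = 0;
        //          }
        //      }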
        // Establish the expanded operands.
        pxor    xmm0, xmm0
        movdqu  xmm8, [BV]              // bv[0]
        movdqu  xmm10, [MI]             // mi
        expand  xmm0, xmm8, xmm9, xmm10, xmm11

        // Set up the outer loop state and prepare for the first iteration.
        mov     rax, AV                 // -> U = av[0]
        mov     rbx, NV                 // -> X = nv[0]
        lea     AVL, [AV + 4*N]         // -> av[n/4] = av limit
        lea     BVL, [BV + 4*N]         // -> bv[n/4] = bv limit

        cmp     rax, AVL                // done already?

        // Complete the first inner loop.
        cmp     rax, AVL                // done yet?

        // Still have carries left to propagate.
        movd    [rdi + 16], xmm12

        // Embark on the next iteration.  (There must be one.  If n = 1, then
        // we would have bailed above, to label 8.  Similarly, the subsequent
        // iterations can fall into the inner loop immediately.)
        movdqu  xmm8, [BV]              // bv[i]
        movdqu  xmm10, [MI]             // mi
        mov     rdi, DV                 // -> Z = dv[i]
        mov     rax, AV                 // -> U = av[0]
        mov     rbx, NV                 // -> X = nv[0]
        expand  xmm0, xmm8, xmm9, xmm10, xmm11

        // Complete the next inner loop.

        // Still have carries left to propagate, and they overlap the
        // previous iteration's final tail, so read that in and add it.
        movd    [rdi + 16], xmm12

        // Back again, maybe.

        // First iteration was short.  Write out the carries and we're done.
        // (This could be folded into the main loop structure, but that would
        // penalize small numbers more.)
        movd    [rdi + 16], xmm12
FUNC(mpxmont_redc4_amd64_avx)

FUNC(mpxmont_redc4_amd64_sse2)
        // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
        //                               size_t n, const mpw *mi);

        // Establish the arguments and do initial setup.
        //
        //                              sysv    win
        // inner loop dv                rdi     rdi*
        // blocks-of-4 dv limit         rsi     rdx
        // inner loop nv                rbx*    rbx*
        // outer loop dv                r10     rcx
        // outer loop dv limit          r11     r11
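
        // And the reduction-only case, as a scalar C reference (a sketch
        // with illustrative names; DV is a (2n + 1)-word buffer whose top
        // word absorbs carries, MI is -NV[0]^{-1} mod B, and the final
        // conditional subtraction is omitted):
        //
        //      #include <stddef.h>
        //      #include <stdint.h>
        //
        //      static void redc_ref(uint32_t *dv, const uint32_t *nv,
        //                           size_t n, uint32_t mi)
        //      {
        //          uint64_t c;
        //          uint32_t y;
        //          size_t i, j;
        //
        //          for (j = 0; j < n; j++) {
        //              y = dv[j]*mi;   /* clear word j */
        //              for (i = 0, c = 0; i < n; i++) {
        //                  c += dv[j + i] + (uint64_t)nv[i]*y;
        //                  dv[j + i] = (uint32_t)c; c >>= 32;
        //              }
        //              /* carry on propagating to the end of the buffer */
        //              for (i = j + n; c && i < 2*n + 1; i++) {
        //                  c += dv[i]; dv[i] = (uint32_t)c; c >>= 32;
        //              }
        //          }
        //          for (i = 0; i <= n; i++)    /* divide by B^n */
        //              dv[i] = dv[n + i];
        //      }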
        // Establish the expanded operands and the blocks-of-4 dv limit.
        pxor    xmm0, xmm0
        mov     DVL, DVL4               // -> dv[n] = dv limit
        sub     DVL4, DV                // length of dv in bytes
        movdqu  xmm8, [MI]              // mi
        and     DVL4, ~15               // mask off the tail end
        expand  xmm0, xmm8, xmm9
        add     DVL4, DV                // find limit

        // Set up the outer loop state and prepare for the first iteration.
        mov     rbx, NV                 // -> X = nv[0]
        lea     DVLO, [DV + 4*N]        // -> dv[n/4] = outer dv limit
        lea     NVL, [NV + 4*N]         // -> nv[n/4] = nv limit

        cmp     rbx, NVL                // done already?

        // Complete the first inner loop.
        cmp     rbx, NVL                // done yet?

        // Still have carries left to propagate.

        // Continue carry propagation until the end of the buffer.
        mov     C, 0                    // preserves flags

        // Deal with the tail end.
        mov     C, 0                    // preserves flags

        // All done for this iteration.  Start the next.  (This must have at
        // least one follow-on iteration, or we'd not have started this outer
        // iteration.)
8:      mov     rdi, DV                 // -> Z = dv[i]
        mov     rbx, NV                 // -> X = nv[0]
        cmp     rdi, DVLO               // all done yet?
///--------------------------------------------------------------------------
/// Testing and performance measurement.

#if ABI_SYSV
#  define ARG6 STKARG(0)
#  define ARG7 STKARG(1)
#  define ARG8 STKARG(2)
#  define STKARG_OFFSET 16
#endif
#if ABI_WIN
#  define ARG4 STKARG(0)
#  define ARG5 STKARG(1)
#  define ARG6 STKARG(2)
#  define ARG7 STKARG(3)
#  define ARG8 STKARG(4)
#  define STKARG_OFFSET 224
#endif
#define STKARG(i) [rsp + STKARG_OFFSET + 8*(i)]
        //                      sysv                            win
        //              dmul    smul    mmul    mont    dmul    smul    mmul    mont
        // z    rdi     rdi     rdi     rdi     rdi     rcx     rcx     rcx     rcx
        // c    rcx     rsi     rsi     rsi     rsi     rdx     rdx     rdx     rdx
        // y    r10     --      --      rdx     rdx     --      --      r8      r8
        // u    r11     rdx     --      rcx     --      r8      --      r9      --
        // x    rbx     rcx     rdx     r8      rcx     r9      r8      stk0    r9
        // vv   xmm8/9  r8      --      r9      r8      stk0    --      stk1    stk0
        // yy   xmm10/11 r9     rcx     stk0    --      stk1    r9      stk2    --
        // n    r8      stk0    r8      stk1    r9      stk2    stk0    stk3    stk1
        // cyv  r9      stk1    r9      stk2    stk0    stk3    stk1    stk4    stk2
        mov     [\v + 8*\n - 8], rax

        sub     rax, [\v + 8*\n - 8]
        mov     [\v + 8*\n - 8], rax
.macro  testprologue mode
  .ifeqs "\mode", "dmul"
  .ifeqs "\mode", "smul"
  .ifeqs "\mode", "mmul"
  .ifeqs "\mode", "mont"

  .ifeqs "\mode", "dmul"
  .ifeqs "\mode", "smul"
  .ifeqs "\mode", "mmul"
  .ifeqs "\mode", "mont"

  .ifeqs "\mode", "dmul"
        expand  xmm0, xmm8, xmm9, xmm10, xmm11
  .endif
  .ifeqs "\mode", "smul"
        expand  xmm0, xmm10, xmm11
  .endif
  .ifeqs "\mode", "mmul"
        expand  xmm0, xmm8, xmm9, xmm10, xmm11
  .endif
  .ifeqs "\mode", "mont"
        expand  xmm0, xmm8, xmm9
  .endif

        movdqu  xmm12, [rcx +  0]       // (c'_0; c''_0)
        movdqu  xmm13, [rcx + 16]       // (c'_1; c''_1)
        movdqu  xmm14, [rcx + 32]       // (c'_2; c''_2)
.macro  testtop u=nil

        movdqu  [rcx +  0], xmm12
        movdqu  [rcx + 16], xmm13
        movdqu  [rcx + 32], xmm14

        movdqu  [r10 +  0], xmm10
        movdqu  [r10 + 16], xmm11

        movdqu  [r10 +  0], xmm10
        movdqu  [r10 + 16], xmm11

        movdqu  [r10 +  0], xmm10
        movdqu  [r10 + 16], xmm11

///----- That's all, folks --------------------------------------------------