mdw@git.distorted.org.uk Git - catacomb/blob - math/mpx-mul4-amd64-sse2.S

   1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
   2 ///
   3 /// Large SIMD-based multiplications
   4 ///
   5 /// (c) 2016 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software; you can redistribute it and/or modify
  13 /// it under the terms of the GNU Library General Public License as
  14 /// published by the Free Software Foundation; either version 2 of the
  15 /// License, or (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful,
  18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 /// GNU Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb; if not, write to the Free
  24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25 /// MA 02111-1307, USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// External definitions.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33 ///--------------------------------------------------------------------------
  34 /// Prologue.
  35
  36         .arch   pentium4
  37         .text
  38
  39 ///--------------------------------------------------------------------------
  40 /// Theory.
  41 ///
  42 /// We define a number of primitive fixed-size multipliers from which we can
  43 /// construct more general variable-length multipliers.
  44 ///
  45 /// The basic trick is the same throughout.  In an operand-scanning
  46 /// multiplication, the inner multiplication loop multiplies a
  47 /// multiple-precision operand by a single precision factor, and adds the
  48 /// result, appropriately shifted, to the result.  A `finely integrated
  49 /// operand scanning' implementation of Montgomery multiplication also adds
  50 /// the product of a single-precision `Montgomery factor' and the modulus,
  51 /// calculated in the same pass.  The more common `coarsely integrated
  52 /// operand scanning' alternates main multiplication and Montgomery passes,
  53 /// which requires additional carry propagation.
  54 ///
  55 /// Throughout both plain-multiplication and Montgomery stages, then, one of
  56 /// the factors remains constant throughout the operation, so we can afford
  57 /// to take a little time to preprocess it.  The transformation we perform is
  58 /// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
  59 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
  60 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
  61 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
  62 /// operands, as follows.
  63 ///
  64 ///     Offset     0       4        8      12
  65 ///        0    v'_0    v'_1    v''_0   v''_1
  66 ///       16    v'_2    v'_3    v''_2   v''_3
  67 ///
  68 /// A `pmuludq' instruction ignores the odd positions in its operands; thus,
  69 /// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
  70 /// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
  71 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
  72 /// results in 64-bit fields.  The sixteen bits of headroom allows us to add
  73 /// many products together before we must deal with carrying; it also allows
  74 /// for some calculations to be performed on the above expanded form.
  75 ///
  76 /// ...
  77 ///
  78 /// We maintain four `carry' registers accumulating intermediate results.
  79 /// The registers' precise roles rotate during the computation; we name them
  80 /// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
  81 /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
  82 /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
  83 /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
  84 /// `pmuludq' instruction acting on a scalar operand (broadcast across all
  85 /// lanes of its vector) and an operand in the expanded form above produces a
  86 /// result which can be added directly to the appropriate carry register.
  87 /// Following a pass of four multiplications, we perform some limited carry
  88 /// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
  89 /// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
  90 /// registers around, so that c1 becomes c0, and the old c0 (implicitly
  91 /// zeroed) becomes c3.
  92
  93 ///--------------------------------------------------------------------------
  94 /// Macro definitions.
  95
  96 .macro  mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
  97         // Multiply R_I (lane I of the packed register R) by the expanded
  98         // operand SLO/SHI, and leave the pieces of the product in registers
             // D0, D1, D2, D3: piece D_j is the pair (r_i s'_j; r_i s''_j) of
             // 64-bit products.
             //
             // Any of D1, D2, D3 may be `nil', in which case that piece is simply
             // not computed.  SLO and SHI are only read.  D0 is overwritten
             // before SLO and SHI are consulted, so it must not alias SLO (or
             // SHI, when D2 or D3 is wanted).  The per-piece instruction
             // sequences are interleaved rather than grouped (presumably to help
             // instruction scheduling).
  99         pshufd  \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?)
 100   .ifnes "\d1", "nil"
 101         movdqa  \d1, \slo               // (s'_0, s'_1; s''_0, s''_1)
 102   .endif
 103   .ifnes "\d3", "nil"
 104         movdqa  \d3, \shi               // (s'_2, s'_3; s''_2, s''_3)
 105   .endif
 106   .ifnes "\d1", "nil"
 107         psrldq  \d1, 4                  // (s'_1, s''_0; s''_1, 0)
 108   .endif
 109   .ifnes "\d2", "nil"
 110         movdqa  \d2, \d0                // another copy of (r_i, ?; r_i, ?)
 111   .endif
 112   .ifnes "\d3", "nil"
 113         psrldq  \d3, 4                  // (s'_3, s''_2; s''_3, 0)
 114   .endif
 115   .ifnes "\d1", "nil"
 116         pmuludq \d1, \d0                // (r_i s'_1; r_i s''_1)
 117   .endif
 118   .ifnes "\d3", "nil"
 119         pmuludq \d3, \d0                // (r_i s'_3; r_i s''_3)
 120   .endif
 121   .ifnes "\d2", "nil"
 122         pmuludq \d2, \shi               // (r_i s'_2; r_i s''_2)
 123   .endif
 124         pmuludq \d0, \slo               // (r_i s'_0; r_i s''_0)
 125 .endm
 126
 127 .macro  accum   c0, c1=nil, c2=nil, c3=nil
 128         // Add the 64-bit pieces held in XMM0--XMM3 lane-wise into the
 129         // corresponding carry registers C0--C3.  C0 is always updated; any
 130         // or all of C1--C3 may be `nil' to skip updating that register.
             // XMM0--XMM3 themselves are only read.
 131         paddq   \c0, xmm0               // c0 += piece 0
 132   .ifnes "\c1", "nil"
 133         paddq   \c1, xmm1               // c1 += piece 1
 134   .endif
 135   .ifnes "\c2", "nil"
 136         paddq   \c2, xmm2               // c2 += piece 2
 137   .endif
 138   .ifnes "\c3", "nil"
 139         paddq   \c3, xmm3               // c3 += piece 3
 140   .endif
 141 .endm
 142
 143 .macro  mulacc  r, i, slo, shi, c0=nil, c1=nil, c2=nil, c3=nil, z3p=nil
 144         // Multiply R_I by the expanded operand SLO/SHI, and accumulate in
 145         // carry registers C0, C1, C2, C3.  If Z3P is `t' then C3 notionally
 146         // contains zero, but needs clearing; in practice, we store the
 147         // product directly rather than attempting to add.  On completion,
 148         // XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P is not `t'.
             //
             // Although C0 defaults to `nil', it must always be given: `accum'
             // adds into it unconditionally.
 149   .ifeqs "\z3p", "t"
 150         mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, \c3 // piece 3 written straight to C3
 151         accum                       \c0,  \c1,  \c2
 152   .else
 153         mulcore \r, \i, \slo, \shi, xmm0, xmm1, xmm2, xmm3 // all four pieces to scratch
 154         accum                       \c0,  \c1,  \c2,  \c3
 155   .endif
 156 .endm
 157
 158 .macro  propout d, pos, c, cc=nil
 159         // Calculate an output word from C, and store it at POS in D;
 160         // propagate carries out from C to CC in preparation for a rotation
 161         // of the carry registers.  D is an XMM register; the POS is either
 162         // `lo' or `hi' according to whether the output word should be in
 163         // lane 0 or 1 of D; the high two lanes of D are clobbered.  On
 164         // completion, XMM3 is clobbered.  If CC is `nil', then the
 165         // contribution which would have been added to it is left in C.
             //
             // C itself is modified in either case: on exit it holds the
             // shifted-down carry rather than its original contents.  With POS
             // `hi', lane 0 of D is preserved, so a `lo' output stored earlier
             // survives.
 166         pshufd  xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
 167         psrldq  xmm3, 12                // (t, 0; 0, 0) = (t; 0)
 168         pslldq  xmm3, 2                 // (t b; 0)
 169         paddq   \c, xmm3                // (c' + t b; c'')
 170   .ifeqs "\pos", "lo"
 171         movdqa  \d, \c                  // output word now in lane 0 of D
 172   .else
 173         punpckldq \d, \c                // (d_0, output word; d_1, ?)
 174   .endif
 175         psrlq   \c, 32                  // floor(c/B)
 176   .ifnes "\cc", "nil"
 177         paddq   \cc, \c                 // propagate up
 178   .endif
 179 .endm
 180
 181 .macro endprop d, pos, c, t
 182         // On entry, C contains a carry register.  On exit, the low 32 bits
 183         // of the value represented in C are written at POS in D, and the
 184         // remaining bits are left at the bottom of T.
             //
             // POS is `lo' or `hi', selecting lane 0 or lane 1 of D; the other
             // lanes of D are affected as the trailing comments below show.  C is
             // clobbered (it is shifted in place).
 185         movdqa  \t, \c
 186         psllq   \t, 16                  // (?; c'' b)
 187         pslldq  \c, 8                   // (0; c')
 188         paddq   \t, \c                  // (?; c' + c'' b)
 189         psrldq  \t, 8                   // (c' + c'' b; 0) = (c; 0)
 190   .ifeqs "\pos", "lo"
 191         movdqa  \d, \t                  // output word now in lane 0 of D
 192   .else
 193         punpckldq \d, \t                // (d_0, output word; d_1, ?)
 194   .endif
 195         psrldq  \t, 4                   // (floor(c/B); 0)
 196 .endm
 197
 198 .macro  expand  z, a, b, c=nil, d=nil
 199         // On entry, A and C hold packed 128-bit values, and Z is zero.  On
 200         // exit, A:B and C:D together hold the same values in expanded
 201         // form.  If C is `nil', then only expand A to A:B.
             //
             // `Expanded form' means each 32-bit word is split into two 16-bit
             // halves, each zero-extended to a 32-bit lane, with the low halves
             // in lanes 0 and 1 and the high halves in lanes 2 and 3.  Z is only
             // read; the previous contents of B (and D) are overwritten.
 202         movdqa  \b, \a                  // (a_0, a_1; a_2, a_3)
 203   .ifnes "\c", "nil"
 204         movdqa  \d, \c                  // (c_0, c_1; c_2, c_3)
 205   .endif
 206         punpcklwd \a, \z                // (a'_0, a''_0; a'_1, a''_1)
 207         punpckhwd \b, \z                // (a'_2, a''_2; a'_3, a''_3)
 208   .ifnes "\c", "nil"
 209         punpcklwd \c, \z                // (c'_0, c''_0; c'_1, c''_1)
 210         punpckhwd \d, \z                // (c'_2, c''_2; c'_3, c''_3)
 211   .endif
 212         pshufd  \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
 213         pshufd  \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
 214   .ifnes "\c", "nil"
 215         pshufd  \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
 216         pshufd  \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
 217   .endif
 218 .endm
 219
 220 .macro  squash  c0, c1, c2, c3, t, u, lo, hi=nil
 221         // On entry, C0, C1, C2, C3 are carry registers representing a value
 222         // Y.  On exit, LO holds the low 128 bits of the carry value; C0, C1,
 223         // C2, C3, T, and U are clobbered; and the high bits of Y are stored
 224         // in HI, if this is not `nil'.
 225
 226         // The first step is to eliminate the `double-prime' pieces -- i.e.,
 227         // the ones offset by 16 bits from a 32-bit boundary -- by carrying
 228         // them into the 32-bit-aligned pieces above and below.  But before
 229         // we can do that, we must gather them together.
 230         movdqa  \t, \c0
 231         movdqa  \u, \c1
 232         punpcklqdq \t, \c2              // (y'_0; y'_2)
 233         punpckhqdq \c0, \c2             // (y''_0; y''_2)
 234         punpcklqdq \u, \c3              // (y'_1; y'_3)
 235         punpckhqdq \c1, \c3             // (y''_1; y''_3)
 236
 237         // Now split the double-prime pieces.  The high (up to) 48 bits will
 238         // go up; the low 16 bits go down.
 239         movdqa  \c2, \c0
 240         movdqa  \c3, \c1
 241         psllq   \c2, 48                 // keep only the low 16 bits, at the top
 242         psllq   \c3, 48
 243         psrlq   \c0, 16                 // high parts of (y''_0; y''_2)
 244         psrlq   \c1, 16                 // high parts of (y''_1; y''_3)
 245         psrlq   \c2, 32                 // low parts of (y''_0; y''_2)
 246         psrlq   \c3, 32                 // low parts of (y''_1; y''_3)
 247   .ifnes "\hi", "nil"
 248         movdqa  \hi, \c1
 249   .endif
 250         pslldq  \c1, 8                  // high part of (0; y''_1)
 251
 252         paddq   \t, \c2                 // propagate down
 253         paddq   \u, \c3
 254         paddq   \t, \c1                 // and up: (y_0; y_2)
 255         paddq   \u, \c0                 // (y_1; y_3)
 256   .ifnes "\hi", "nil"
 257         psrldq  \hi, 8                  // high part of (y''_3; 0)
 258   .endif
 259
 260         // Finally extract the answer.  This complicated dance is better than
 261         // storing to memory and loading, because the piecemeal stores
 262         // inhibit store forwarding.
 263         movdqa  \c3, \t                 // (y_0; ?)
 264         movdqa  \lo, \t                 // (y^*_0, ?; ?, ?)
 265         psrldq  \t, 8                   // (y_2; 0)
 266         psrlq   \c3, 32                 // (floor(y_0/B); ?)
 267         paddq   \c3, \u                 // (y_1 + floor(y_0/B); ?)
 268         movdqa  \c1, \c3                // (y^*_1, ?; ?, ?)
 269         psrldq  \u, 8                   // (y_3; 0)
 270         psrlq   \c3, 32                 // (floor((y_1 B + y_0)/B^2; ?)
 271         paddq   \c3, \t                 // (y_2 + floor((y_1 B + y_0)/B^2; ?)
 272         punpckldq \lo, \c3              // (y^*_0, y^*_2; ?, ?)
 273         psrlq   \c3, 32             // (floor((y_2 B^2 + y_1 B + y_0)/B^3; ?)
 274         paddq   \c3, \u       // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3; ?)
 275   .ifnes "\hi", "nil"
 276         movdqa  \t, \c3
 277         pxor    \u, \u
 278   .endif
 279         punpckldq \c1, \c3              // (y^*_1, y^*_3; ?, ?)
 280   .ifnes "\hi", "nil"
 281         psrlq   \t, 32                  // very high bits of y
 282         paddq   \hi, \t
 283         punpcklqdq \hi, \u              // carry up
 284   .endif
 285         punpckldq \lo, \c1              // y mod B^4
 286 .endm
 287
 288 .macro  carryadd
 289         // On entry, RDI points to a packed addend A, and XMM12, XMM13, XMM14
 290         // hold the incoming carry registers c0, c1, and c2 representing a
 291         // carry-in C.
 292         //
 293         // On exit, the carry registers, including XMM15, are updated to hold
 294         // C + A; XMM0, XMM1, and XMM2 are clobbered.  The other registers
 295         // are preserved.  XMM15 is loaded with a_3 rather than added to, so
             // whatever it held on entry is discarded.
 296         movd    xmm0, [rdi +  0]        // (a_0; 0)
 297         movd    xmm1, [rdi +  4]        // (a_1; 0)
 298         movd    xmm2, [rdi +  8]        // (a_2; 0)
 299         movd    xmm15, [rdi + 12]       // (a_3; 0): this becomes c3
 300         paddq   xmm12, xmm0             // (c'_0 + a_0; c''_0)
 301         paddq   xmm13, xmm1             // (c'_1 + a_1; c''_1)
 302         paddq   xmm14, xmm2             // (c'_2 + a_2; c''_2)
 303 .endm
 304
 305 ///--------------------------------------------------------------------------
 306 /// Primitive multipliers and related utilities.
 307
 308 INTFUNC(carryprop)
 309         // On entry, XMM12, XMM13, and XMM14 hold a 144-bit carry in an
 310         // expanded form.  Store the low 128 bits of the represented carry to
 311         // [RDI] as a packed 128-bit value, and leave the remaining 16 bits
 312         // in the low 32 bits of XMM12.  On exit, XMM0, XMM1, XMM3, XMM13 and
 313         // XMM14 are clobbered.
 314   endprologue
 315
 316         propout xmm0, lo, xmm12, xmm13  // word 0 -> lane 0 of xmm0
 317         propout xmm1, lo, xmm13, xmm14  // word 1 -> lane 0 of xmm1
 318         propout xmm0, hi, xmm14, nil    // word 2 -> lane 1 of xmm0
 319         endprop xmm1, hi, xmm14, xmm12  // word 3 -> lane 1 of xmm1; rest -> xmm12
 320         punpckldq xmm0, xmm1            // (word 0, word 1; word 2, word 3)
 321         movdqu  [rdi], xmm0
 322
 323         ret
 324
 325 ENDFUNC
 326
 327 INTFUNC(dmul4)
 328         // On entry, RDI points to the destination buffer; RAX and RBX point
 329         // to the packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
 330         // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
 331         // incoming carry registers c0, c1, and c2; c3 is assumed to be zero.
 332         //
 333         // On exit, we write the low 128 bits of the sum C + U V + X Y to
 334         // [RDI], and update the carry registers with the carry out.  The
 335         // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
 336         // registers are preserved.
 337   endprologue
 338
 339         movdqu  xmm4, [rax]             // packed U
 340         movdqu  xmm5, [rbx]             // packed X
 341
             // Each group below handles one 32-bit lane of U and X: both products
             // are accumulated into the carry registers, one output word is
             // extracted, and the carry registers' roles rotate by one place.
             // The first `mulacc' of each group passes `t' because the register
             // serving as c3 is notionally zero at that point.
 342         mulacc  xmm4, 0,   xmm8,  xmm9,  xmm12, xmm13, xmm14, xmm15, t
 343         mulacc  xmm5, 0,   xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
 344         propout xmm6, lo,                xmm12, xmm13
 345
 346         mulacc  xmm4, 1,   xmm8,  xmm9,  xmm13, xmm14, xmm15, xmm12, t
 347         mulacc  xmm5, 1,   xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
 348         propout xmm7, lo,                xmm13, xmm14
 349
 350         mulacc  xmm4, 2,   xmm8,  xmm9,  xmm14, xmm15, xmm12, xmm13, t
 351         mulacc  xmm5, 2,   xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
 352         propout xmm6, hi,                xmm14, xmm15
 353
 354         mulacc  xmm4, 3,   xmm8,  xmm9,  xmm15, xmm12, xmm13, xmm14, t
 355         mulacc  xmm5, 3,   xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
 356         propout xmm7, hi,                xmm15, xmm12
 357
 358         punpckldq xmm6, xmm7            // merge words (0, 2) and (1, 3) in order
 359         movdqu  [rdi], xmm6
 360
 361         ret
 362
 363 ENDFUNC
 364
 365 INTFUNC(dmla4)
 366         // On entry, RDI points to the destination buffer, which also
 367         // contains an addend A to accumulate; RAX and RBX point to the
 368         // packed operands U and X; XMM8/XMM9 and XMM10/XMM11 hold the
 369         // expanded operands V and Y; and XMM12, XMM13, XMM14 hold the
 370         // incoming carry registers c0, c1, and c2 representing a carry-in C;
 371         // c3 is assumed to be zero.
 372         //
 373         // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
 374         // [RDI], and update the carry registers with the carry out.  The
 375         // registers XMM0--XMM7, and XMM15 are clobbered; the general-purpose
 376         // registers are preserved.
 377   endprologue
 378
 379         movdqu  xmm4, [rax]             // packed U
 380         movdqu  xmm5, [rbx]             // packed X
 381         carryadd
 382
             // `carryadd' has just loaded a_3 into XMM15, so c3 is live here and
             // the first `mulacc' must add into it rather than pass `t'.  In the
             // later groups the register serving as c3 is notionally zero again,
             // so the first `mulacc' of each passes `t'.
 383         mulacc  xmm4, 0,   xmm8,  xmm9,  xmm12, xmm13, xmm14, xmm15
 384         mulacc  xmm5, 0,   xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
 385         propout xmm6, lo,                xmm12, xmm13
 386
 387         mulacc  xmm4, 1,   xmm8,  xmm9,  xmm13, xmm14, xmm15, xmm12, t
 388         mulacc  xmm5, 1,   xmm10, xmm11, xmm13, xmm14, xmm15, xmm12
 389         propout xmm7, lo,                xmm13, xmm14
 390
 391         mulacc  xmm4, 2,   xmm8,  xmm9,  xmm14, xmm15, xmm12, xmm13, t
 392         mulacc  xmm5, 2,   xmm10, xmm11, xmm14, xmm15, xmm12, xmm13
 393         propout xmm6, hi,                xmm14, xmm15
 394
 395         mulacc  xmm4, 3,   xmm8,  xmm9,  xmm15, xmm12, xmm13, xmm14, t
 396         mulacc  xmm5, 3,   xmm10, xmm11, xmm15, xmm12, xmm13, xmm14
 397         propout xmm7, hi,                xmm15, xmm12
 398
 399         punpckldq xmm6, xmm7            // merge words (0, 2) and (1, 3) in order
 400         movdqu  [rdi], xmm6
 401
 402         ret
 403
 404 ENDFUNC
 405
 406 INTFUNC(mul4zc)
 407         // On entry, RDI points to the destination buffer; RBX points to a
 408         // packed operand X; and XMM10/XMM11 hold an expanded operand Y.
 409         //
 410         // On exit, we write the low 128 bits of the product X Y to [RDI],
 411         // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
 412         // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
 413         // general-purpose registers are preserved.
 414   endprologue
 415
 416         movdqu  xmm5, [rbx]             // packed X
 417
             // There is no carry-in, so the first product is written straight
             // into the carry registers by `mulcore' instead of being added.
 418         mulcore xmm5, 0,   xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
 419         propout xmm6, lo,                xmm12, xmm13
 420
 421         mulacc  xmm5, 1,   xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
 422         propout xmm7, lo,                xmm13, xmm14
 423
 424         mulacc  xmm5, 2,   xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
 425         propout xmm6, hi,                xmm14, xmm15
 426
 427         mulacc  xmm5, 3,   xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
 428         propout xmm7, hi,                xmm15, xmm12
 429
 430         punpckldq xmm6, xmm7            // merge words (0, 2) and (1, 3) in order
 431         movdqu  [rdi], xmm6
 432
 433         ret
 434
 435 ENDFUNC
 436
 437 INTFUNC(mul4)
 438         // On entry, RDI points to the destination buffer; RBX points to a
 439         // packed operand X; XMM10/XMM11 hold an expanded operand Y; and
 440         // XMM12, XMM13, XMM14 hold the incoming carry registers c0, c1, and
 441         // c2, representing a carry-in C; c3 is assumed to be zero.
 442         //
 443         // On exit, we write the low 128 bits of the sum C + X Y to [RDI],
 444         // and update the carry registers with the carry out.  The registers
 445         // XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
 446         // general-purpose registers are preserved.
 447   endprologue
 448
 449         movdqu  xmm5, [rbx]             // packed X
 450
             // Every `mulacc' passes `t': the register serving as c3 is
             // notionally zero each time, so its product piece is stored rather
             // than added.
 451         mulacc  xmm5, 0,   xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, t
 452         propout xmm6, lo,                xmm12, xmm13
 453
 454         mulacc  xmm5, 1,   xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
 455         propout xmm7, lo,                xmm13, xmm14
 456
 457         mulacc  xmm5, 2,   xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
 458         propout xmm6, hi,                xmm14, xmm15
 459
 460         mulacc  xmm5, 3,   xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
 461         propout xmm7, hi,                xmm15, xmm12
 462
 463         punpckldq xmm6, xmm7            // merge words (0, 2) and (1, 3) in order
 464         movdqu  [rdi], xmm6
 465
 466         ret
 467
 468 ENDFUNC
 469
 470 INTFUNC(mla4zc)
 471         // On entry, RDI points to the destination buffer, which also
 472         // contains an addend A to accumulate; RBX points to a packed operand
 473         // X; and XMM10/XMM11 hold an expanded operand Y.
 474         //
 475         // On exit, we write the low 128 bits of the sum A + X Y to [RDI],
 476         // and set the carry registers XMM12, XMM13, XMM14 to the carry out.
 477         // The registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
 478         // general-purpose registers are preserved.
 479   endprologue
 480
 481         movdqu  xmm5, [rbx]             // packed X
 482         movd    xmm12, [rdi +  0]       // c0 = (a_0; 0)
 483         movd    xmm13, [rdi +  4]       // c1 = (a_1; 0)
 484         movd    xmm14, [rdi +  8]       // c2 = (a_2; 0)
 485         movd    xmm15, [rdi + 12]       // c3 = (a_3; 0)
 486
             // All four carry registers are live with the addend, so the first
             // `mulacc' adds into c3 rather than passing `t'.
 487         mulacc  xmm5, 0,   xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
 488         propout xmm6, lo,                xmm12, xmm13
 489
 490         mulacc  xmm5, 1,   xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
 491         propout xmm7, lo,                xmm13, xmm14
 492
 493         mulacc  xmm5, 2,   xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
 494         propout xmm6, hi,                xmm14, xmm15
 495
 496         mulacc  xmm5, 3,   xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
 497         propout xmm7, hi,                xmm15, xmm12
 498
 499         punpckldq xmm6, xmm7            // merge words (0, 2) and (1, 3) in order
 500         movdqu  [rdi], xmm6
 501
 502         ret
 503
 504 ENDFUNC
 505
 506 INTFUNC(mla4)
 507         // On entry, RDI points to the destination buffer, which also
 508         // contains an addend A to accumulate; RBX points to a packed operand
 509         // X; XMM10/XMM11 holds an expanded operand Y; and XMM12, XMM13,
 510         // XMM14 hold the incoming carry registers c0, c1, and c2,
 511         // representing a carry-in C; c3 is assumed to be zero.
 512         //
 513         // On exit, we write the low 128 bits of the sum A + C + X Y to
 514         // [RDI], and update the carry registers with the carry out.  The
 515         // registers XMM0--XMM3, XMM5--XMM7, and XMM15 are clobbered; the
 516         // general-purpose registers are preserved.
 517   endprologue
 518
 519         movdqu  xmm5, [rbx]             // packed X
 520         carryadd
 521
             // `carryadd' has just loaded a_3 into XMM15, so the first `mulacc'
             // adds into c3 rather than passing `t'.
 522         mulacc  xmm5, 0,    xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
 523         propout xmm6, lo,                xmm12, xmm13
 524
 525         mulacc  xmm5, 1,    xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
 526         propout xmm7, lo,                xmm13, xmm14
 527
 528         mulacc  xmm5, 2,    xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
 529         propout xmm6, hi,                xmm14, xmm15
 530
 531         mulacc  xmm5, 3,    xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
 532         propout xmm7, hi,                xmm15, xmm12
 533
 534         punpckldq xmm6, xmm7            // merge words (0, 2) and (1, 3) in order
 535         movdqu  [rdi], xmm6
 536
 537         ret
 538
 539 ENDFUNC
 540
 541 INTFUNC(mmul4)
 542         // On entry, RDI points to the destination buffer; RAX and RBX point
 543         // to the packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold
 544         // the expanded operands V and M.  The stack pointer must be 8 modulo 16
 545         // (as usual for AMD64 ABIs).
 546         //
 547         // On exit, we store Y = U V M mod B in XMM10/XMM11, and write the
 548         // low 128 bits of the sum U V + N Y to [RDI], leaving the remaining
 549         // carry in XMM12, XMM13, and XMM14.  The registers XMM0--XMM7, and
 550         // XMM15 are clobbered; the general-purpose registers are preserved.
             //
             // Only the first multiply-and-propagate step is done here; the code
             // then jumps to local label 5 inside `mmla4', which does the rest of
             // the work, including (on Windows) releasing the stack space
             // allocated below, and returns to our caller.
 551         movdqu  xmm4, [rax]             // packed U
 552 #if ABI_WIN
 553         stalloc 48 + 8                  // space for the carries
 554 #endif
 555   endprologue
 556
 557         // Calculate W = U V, and leave it in XMM7.  Stash the carry pieces
 558         // for later.
 559         mulcore xmm4, 0,   xmm8,  xmm9,  xmm12, xmm13, xmm14, xmm15
 560         propout xmm7, lo,                xmm12, xmm13
 561         jmp     5f                      // join the shared tail in `mmla4'
 562
 563 ENDFUNC
 564
 565 INTFUNC(mmla4)
 566         // On entry, RDI points to the destination buffer, which also
 567         // contains an addend A to accumulate; RAX and RBX point to the
 568         // packed operands U and N; and XMM8/XMM9 and XMM10/XMM11 hold the
 569         // expanded operands V and M.  The stack pointer must be 8 modulo 16
 570         // (as usual for AMD64 ABIs).
 571         //
 572         // On exit, we store Y = (A + U V) M mod B in XMM10/XMM11, and write
 573         // the low 128 bits of the sum A + U V + N Y to [RDI], leaving the
 574         // remaining carry in XMM12, XMM13, and XMM14.  The registers
 575         // XMM0--XMM7, and XMM15 are clobbered; the general-purpose registers
 576         // are preserved.
             //
             // `mmul4' joins this code at local label 5, having already done its
             // own version of the first multiply step.
             //
             // Three carry registers are spilled to STKTMP(0), STKTMP(16) and
             // STKTMP(32) using aligned moves; this is why the stack pointer's
             // alignment matters (with the SysV definition below, RSP - 56 is
             // 16-byte aligned when RSP is 8 modulo 16).
 577         movdqu  xmm4, [rax]             // packed U
 578 #if ABI_WIN
 579         stalloc 48 + 8                  // space for the carries
 580 #  define STKTMP(i) [rsp + i]
 581 #endif
 582 #if ABI_SYSV
 583 #  define STKTMP(i) [rsp + i - 48 - 8]  // use red zone
 584 #endif
 585   endprologue
 586
 587         movd    xmm12, [rdi +  0]       // c0 = (a_0; 0)
 588         movd    xmm13, [rdi +  4]       // c1 = (a_1; 0)
 589         movd    xmm14, [rdi +  8]       // c2 = (a_2; 0)
 590         movd    xmm15, [rdi + 12]       // c3 = (a_3; 0)
 591
 592         // Calculate W = U V, and leave it in XMM7.  Stash the carry pieces
 593         // for later.
 594         mulacc  xmm4, 0,   xmm8,  xmm9,  xmm12, xmm13, xmm14, xmm15
 595         propout xmm7, lo,                xmm12, xmm13
 596
 597 5:      mulacc  xmm4, 1,   xmm8,  xmm9,  xmm13, xmm14, xmm15, xmm12, t
 598         propout xmm6, lo,                xmm13, xmm14
 599
 600         mulacc  xmm4, 2,   xmm8,  xmm9,  xmm14, xmm15, xmm12, xmm13, t
 601         propout xmm7, hi,                xmm14, xmm15
 602
 603         mulacc  xmm4, 3,   xmm8,  xmm9,  xmm15, xmm12, xmm13, xmm14, t
 604         propout xmm6, hi,                xmm15, xmm12
 605
 606         // Prepare W, and stash carries for later.
 607         punpckldq xmm7, xmm6            // merge words (0, 2) and (1, 3) in order
 608         movdqa  STKTMP( 0), xmm12
 609         movdqa  STKTMP(16), xmm13
 610         movdqa  STKTMP(32), xmm14
 611
 612         // Calculate Y = W M.  We just about have enough spare registers to
 613         // make this work.
 614         mulcore xmm7, 0,   xmm10, xmm11, xmm3,  xmm4,  xmm5,  xmm6
 615
 616         // Start expanding W back into the main carry registers...
 617         pxor    xmm15, xmm15
 618         movdqa  xmm12, xmm7
 619         movdqa  xmm14, xmm7
 620
 621         mulcore xmm7, 1,   xmm10, xmm11, xmm0,  xmm1,  xmm2
 622         accum                            xmm4,  xmm5,  xmm6
 623
 624         punpckldq xmm12, xmm15          // (w_0, 0; w_1, 0)
 625         punpckhdq xmm14, xmm15          // (w_2, 0; w_3, 0)
 626
 627         mulcore xmm7, 2,   xmm10, xmm11, xmm0,  xmm1
 628         accum                            xmm5,  xmm6
 629
 630         pxor    xmm2, xmm2              // zero; also serves as Z for `expand' below
 631         movdqa  xmm13, xmm12
 632         movdqa  xmm15, xmm14
 633
 634         mulcore xmm7, 3,   xmm10, xmm11, xmm0
 635         accum                            xmm6
 636
 637         punpckldq xmm12, xmm2           // (w_0, 0; 0, 0)
 638         punpckldq xmm14, xmm2           // (w_2, 0; 0, 0)
 639         punpckhdq xmm13, xmm2           // (w_1, 0; 0, 0)
 640         punpckhdq xmm15, xmm2           // (w_3, 0; 0, 0)
 641
 642         // That's lots of pieces.  Now we have to assemble the answer.
 643         squash  xmm3, xmm4, xmm5, xmm6,  xmm0, xmm1,  xmm10
 644
 645         // Expand it.
 646         movdqu  xmm5, [rbx]             // packed N
 647         expand  xmm2, xmm10, xmm11
 648
 649         // Finish the calculation by adding the Montgomery product.  The
             // macro arguments are comma-separated throughout, matching every
             // other `mulacc' invocation and not relying on whitespace being
             // accepted as an argument separator.
 650         mulacc  xmm5, 0,   xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
 651         propout xmm6, lo,                xmm12, xmm13
 652
 653         mulacc  xmm5, 1,   xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
 654         propout xmm7, lo,                xmm13, xmm14
 655
 656         mulacc  xmm5, 2,   xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
 657         propout xmm6, hi,                xmm14, xmm15
 658
 659         mulacc  xmm5, 3,   xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
 660         propout xmm7, hi,                xmm15, xmm12
 661
 662         punpckldq xmm6, xmm7            // merge words (0, 2) and (1, 3) in order
 663
 664         // Add on the carry we calculated earlier.
 665         paddq   xmm12, STKTMP( 0)
 666         paddq   xmm13, STKTMP(16)
 667         paddq   xmm14, STKTMP(32)
 668
 669         // And, with that, we're done.
 670         movdqu  [rdi], xmm6
 671 #if ABI_WIN
 672         stfree  56
 673 #endif
 674         ret
 675
 676 #undef STKTMP
 677
 678 ENDFUNC
 679
 680 INTFUNC(mont4)
 681         // On entry, RDI points to the destination buffer holding a packed
 682         // value W; RBX points to a packed operand N; and XMM8/XMM9 hold an
 683         // expanded operand M.
 684         //
 685         // On exit, we store Y = W M mod B in XMM10/XMM11, and write the low
 686         // 128 bits of the sum W + N Y to [RDI], leaving the remaining carry
 687         // in XMM12, XMM13, and XMM14.  The registers XMM0--XMM3, XMM5--XMM7,
 688         // and XMM15 are clobbered; the general-purpose registers are
 689         // preserved.
 690   endprologue
 691
 692         movdqu  xmm7, [rdi]             // packed W
 693
 694         // Calculate Y = W M.  Avoid the standard carry registers, because
 695         // we're setting something else up there.
 696         mulcore xmm7, 0,   xmm8,  xmm9,  xmm3,  xmm4,  xmm5,  xmm6
 697
 698         // Start expanding W back into the main carry registers...
 699         pxor    xmm15, xmm15
 700         movdqa  xmm12, xmm7
 701         movdqa  xmm14, xmm7
 702
 703         mulcore xmm7, 1,   xmm8,  xmm9,  xmm0,  xmm1,  xmm2
 704         accum                            xmm4,  xmm5,  xmm6
 705
 706         punpckldq xmm12, xmm15          // (w_0, 0; w_1, 0)
 707         punpckhdq xmm14, xmm15          // (w_2, 0; w_3, 0)
 708
 709         mulcore xmm7, 2,   xmm8,  xmm9,  xmm0,  xmm1
 710         accum                            xmm5,  xmm6
 711
 712         pxor    xmm2, xmm2              // zero; also serves as Z for `expand' below
 713         movdqa  xmm13, xmm12
 714         movdqa  xmm15, xmm14
 715
 716         mulcore xmm7, 3,   xmm8,  xmm9,  xmm0
 717         accum                            xmm6
 718
 719         punpckldq xmm12, xmm2           // (w_0, 0; 0, 0)
 720         punpckldq xmm14, xmm2           // (w_2, 0; 0, 0)
 721         punpckhdq xmm13, xmm2           // (w_1, 0; 0, 0)
 722         punpckhdq xmm15, xmm2           // (w_3, 0; 0, 0)
 723
 724         // That's lots of pieces.  Now we have to assemble the answer.
 725         squash  xmm3, xmm4, xmm5, xmm6,  xmm0, xmm1,  xmm10
 726
 727         // Expand it.
 728         movdqu  xmm5, [rbx]             // packed N
 729         expand  xmm2, xmm10, xmm11
 730
 731         // Finish the calculation by adding the Montgomery product.  The
             // macro arguments are comma-separated throughout, matching every
             // other `mulacc' invocation and not relying on whitespace being
             // accepted as an argument separator.
 732         mulacc  xmm5, 0,   xmm10, xmm11, xmm12, xmm13, xmm14, xmm15
 733         propout xmm6, lo,                xmm12, xmm13
 734
 735         mulacc  xmm5, 1,   xmm10, xmm11, xmm13, xmm14, xmm15, xmm12, t
 736         propout xmm7, lo,                xmm13, xmm14
 737
 738         mulacc  xmm5, 2,   xmm10, xmm11, xmm14, xmm15, xmm12, xmm13, t
 739         propout xmm6, hi,                xmm14, xmm15
 740
 741         mulacc  xmm5, 3,   xmm10, xmm11, xmm15, xmm12, xmm13, xmm14, t
 742         propout xmm7, hi,                xmm15, xmm12
 743
 744         punpckldq xmm6, xmm7            // merge words (0, 2) and (1, 3) in order
 745
 746         // And, with that, we're done.
 747         movdqu  [rdi], xmm6
 748         ret
 749
 750 ENDFUNC
 751
 752 ///--------------------------------------------------------------------------
 753 /// Bulk multipliers.
 754
 755 FUNC(mpx_umul4_amd64_avx)
             // AVX entry point for the bulk multiplier.  It takes the same
             // arguments as the SSE2 entry point that follows, issues a single
             // VZEROUPPER (clearing the upper halves of the YMM registers,
             // presumably to avoid SSE/AVX transition penalties), and touches
             // nothing else.
             //
             // There is no RET here: control runs straight on into the code
             // assembled next, which is the SSE2 implementation.  The .arch
             // directives enable AVX only for the one instruction that needs it.
 756         .arch   .avx
 757         vzeroupper
 758   endprologue
 759         .arch   pentium4
 760 ENDFUNC
 761
 762 FUNC(mpx_umul4_amd64_sse2)
 763         // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
 764         //                         const mpw *bv, const mpw *bvl);
             //
             // Bulk multiplier, working in blocks of four words (16 bytes).
             // The outer loop takes one block of bv per pass; the inner loop
             // walks av from its base to avl, calling one of the fixed-size
             // primitives per block.  The first pass uses mul4zc/mul4 and the
             // later passes use mla4zc/mla4 -- judging by the names, the first
             // pass writes dv and the later passes accumulate into it.
             //
             // Every loop handles its first block before comparing against a
             // limit, so both operands are assumed to be at least one block
             // long; the comments below also state that there is never a
             // partial block (`tail').
 765
 766         // Establish the arguments and do initial setup.
 767         //
 768         //                      sysv    win
 769         // inner loop dv        rdi     rdi*
 770         // inner loop av        rbx*    rbx*
 771         // outer loop dv        r10     rcx
 772         // outer loop bv        rcx     r9
 773         // av base              rsi     rdx
 774         // av limit             rdx     r8
 775         // bv limit             r8      r10
             //
             // (An asterisk presumably marks a register which is callee-saved
             // under that ABI: those are exactly the ones pushed below.)
 776
 777 #if ABI_SYSV
 778 #  define DV r10
 779 #  define AV rsi
 780 #  define AVL rdx
 781 #  define BV rcx
 782 #  define BVL r8
 783
 784         pushreg rbx
 785   endprologue
 786
             // The inner-loop destination pointer stays in RDI, where the
             // first argument arrived; keep a second copy as the outer-loop
             // cursor.
 787         mov     DV, rdi
 788
 789 #endif
 790
 791 #if ABI_WIN
 792 #  define DV rcx
 793 #  define AV rdx
 794 #  define AVL r8
 795 #  define BV r9
 796 #  define BVL r10
 797
 798         pushreg rbx
 799         pushreg rdi
             // 160 bytes hold the ten XMM registers saved below.  The extra 8
             // presumably restores 16-byte stack alignment after the return
             // address and two pushes -- assuming pushreg and stalloc are
             // plain push/sub wrappers.
 800         stalloc 160 + 8
 801
 802         savexmm xmm6,    0
 803         savexmm xmm7,   16
 804         savexmm xmm8,   32
 805         savexmm xmm9,   48
 806         savexmm xmm10,  64
 807         savexmm xmm11,  80
 808         savexmm xmm12,  96
 809         savexmm xmm13, 112
 810         savexmm xmm14, 128
 811         savexmm xmm15, 144
 812
 813   endprologue
 814
             // The fifth argument (bvl) is on the stack.  Its offset is
             // 168 (stalloc) + 16 (two pushes) + 8 (return address) +
             // 32 (caller's shadow space) = 224.
 815         mov     rdi, DV
 816         mov     BVL, [rsp + 224]
 817
 818 #endif
 819
 820         // Prepare for the first iteration.
             // XMM0 is zeroed for expand, which turns the packed block bv[0]
             // in XMM10 into its expanded form in XMM10/XMM11.  The outer
             // cursors DV and BV are advanced ready for the next pass.
 821         pxor    xmm0, xmm0
 822         movdqu  xmm10, [BV]             // bv[0]
 823         mov     rbx, AV
 824         add     DV, 16
 825         add     BV, 16
 826         expand  xmm0, xmm10, xmm11
 827         call    mul4zc
 828         add     rbx, 16
 829         add     rdi, 16
 830         cmp     rbx, AVL                // all done?
 831         jae     8f
 832
 833         .p2align 4
 834         // Continue with the first iteration.
 835 0:      call    mul4
 836         add     rbx, 16
 837         add     rdi, 16
 838         cmp     rbx, AVL                // all done?
 839         jb      0b
 840
 841         // Write out the leftover carry.  There can be no tail here.
 842 8:      call    carryprop
 843         cmp     BV, BVL                 // more passes to do?
 844         jae     9f
 845
 846         .p2align 4
 847         // Set up for the next pass.
             // Same shape as the first pass, except that the inner pointers
             // are reloaded from the outer cursors and the mla4zc/mla4
             // primitives are used.
 848 1:      movdqu  xmm10, [BV]             // bv[i]
 849         mov     rdi, DV                 // -> dv[i]
 850         pxor    xmm0, xmm0
 851         expand  xmm0, xmm10, xmm11
 852         mov     rbx, AV                 // -> av[0]
 853         add     DV, 16
 854         add     BV, 16
 855         call    mla4zc
 856         add     rbx, 16
 857         add     rdi, 16
 858         cmp     rbx, AVL                // done yet?
 859         jae     8f
 860
 861         .p2align 4
 862         // Continue...
 863 0:      call    mla4
 864         add     rbx, 16
 865         add     rdi, 16
 866         cmp     rbx, AVL
 867         jb      0b
 868
 869         // Finish off this pass.  There was no tail on the previous pass, and
 870         // there can be none on this pass.
 871 8:      call    carryprop
 872         cmp     BV, BVL
 873         jb      1b
 874
 875         // All over.
             // Undo the prologue in reverse order for whichever ABI is in use.
 876 9:
 877
 878 #if ABI_SYSV
 879         popreg  rbx
 880 #endif
 881
 882 #if ABI_WIN
 883
 884         rstrxmm xmm6,    0
 885         rstrxmm xmm7,   16
 886         rstrxmm xmm8,   32
 887         rstrxmm xmm9,   48
 888         rstrxmm xmm10,  64
 889         rstrxmm xmm11,  80
 890         rstrxmm xmm12,  96
 891         rstrxmm xmm13, 112
 892         rstrxmm xmm14, 128
 893         rstrxmm xmm15, 144
 894
 895         stfree  160 + 8
 896         popreg  rdi
 897         popreg  rbx
 898
 899 #endif
 900
 901         ret
 902
 903 #undef DV
 904 #undef AV
 905 #undef AVL
 906 #undef BV
 907 #undef BVL
 908
 909 ENDFUNC
 910
 911 FUNC(mpxmont_mul4_amd64_avx)
             // AVX entry point for the Montgomery multiplier.  It issues a
             // single VZEROUPPER (clearing the upper halves of the YMM
             // registers, presumably to avoid SSE/AVX transition penalties)
             // and leaves the arguments untouched.
             //
             // There is no RET here: control runs straight on into the code
             // assembled next, which is the SSE2 implementation.  The .arch
             // directives enable AVX only for the one instruction that needs it.
 912         .arch   .avx
 913         vzeroupper
 914   endprologue
 915         .arch   pentium4
 916 ENDFUNC
 917
 918 FUNC(mpxmont_mul4_amd64_sse2)
 919         // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
 920         //                           const mpw *nv, size_t n, const mpw *mi);
             //
             // Montgomery multiplication driver, working in blocks of four
             // words (16 bytes).  The outer loop takes one block of bv per
             // pass; the inner loop walks av and nv together.  The first block
             // of each pass goes through mmul4 (first pass) or mmla4 (later
             // passes), and the remaining blocks through dmul4 or dmla4.
             //
             // n is a count of words: the limits are formed as base + 4*n
             // bytes.  It is assumed to be a nonzero multiple of four, since
             // each loop handles a whole block before testing its limit.
 921
 922         // Establish the arguments and do initial setup.
 923         //
 924         //                      sysv    win
 925         // inner loop dv        rdi     rdi*
 926         // inner loop av        rax     rax
 927         // inner loop nv        rbx*    rbx*
 928         // mi                   r9      r10
 929         // outer loop dv        r10     rcx
 930         // outer loop bv        rdx     r8
 931         // av base              rsi     rdx
 932         // av limit             r11     r11
 933         // bv limit             r8      r12*
 934         // nv base              rcx     r9
 935         // n                    r8      r12*
             //
             // (An asterisk presumably marks a register which is callee-saved
             // under that ABI: those are exactly the ones pushed below.  Note
             // that n and the bv limit share a register; n is not needed once
             // the limits have been computed.)
 936
 937 #if ABI_SYSV
 938 #  define DV r10
 939 #  define AV rsi
 940 #  define AVL r11
 941 #  define BV rdx
 942 #  define BVL r8
 943 #  define NV rcx
 944 #  define N r8
 945 #  define MI r9
 946
 947         pushreg rbx
 948   endprologue
 949
             // The inner-loop destination pointer stays in RDI, where the
             // first argument arrived; keep a second copy as the outer-loop
             // cursor.
 950         mov     DV, rdi
 951
 952 #endif
 953
 954 #if ABI_WIN
 955 #  define DV rcx
 956 #  define AV rdx
 957 #  define AVL r11
 958 #  define BV r8
 959 #  define BVL r12
 960 #  define NV r9
 961 #  define N r12
 962 #  define MI r10
 963
 964         pushreg rbx
 965         pushreg rdi
 966         pushreg r12
             // 160 bytes hold the ten XMM registers saved below.  No padding
             // is needed here: the return address plus three pushes already
             // leave the stack 16-byte aligned -- assuming pushreg and stalloc
             // are plain push/sub wrappers.
 967         stalloc 160
 968
 969         savexmm xmm6,    0
 970         savexmm xmm7,   16
 971         savexmm xmm8,   32
 972         savexmm xmm9,   48
 973         savexmm xmm10,  64
 974         savexmm xmm11,  80
 975         savexmm xmm12,  96
 976         savexmm xmm13, 112
 977         savexmm xmm14, 128
 978         savexmm xmm15, 144
 979
 980   endprologue
 981
             // The fifth and sixth arguments (n and mi) are on the stack.  The
             // first of them is at 160 (stalloc) + 24 (three pushes) +
             // 8 (return address) + 32 (caller's shadow space) = 224.
 982         mov     rdi, DV
 983         mov     N, [rsp + 224]
 984         mov     MI, [rsp + 232]
 985
 986 #endif
 987
 988         // Establish the expanded operands.
             // XMM0 is zeroed for expand.  bv[0] ends up expanded in
             // XMM8/XMM9 and mi in XMM10/XMM11.
 989         pxor    xmm0, xmm0
 990         movdqu  xmm8, [BV]              // bv[0]
 991         movdqu  xmm10, [MI]             // mi
 992         expand  xmm0, xmm8, xmm9, xmm10, xmm11
 993
 994         // Set up the outer loop state and prepare for the first iteration.
             // The AVL computation must come before the BVL one: BVL lives in
             // the same register as N, so the second LEA destroys n.
 995         mov     rax, AV                 // -> U = av[0]
 996         mov     rbx, NV                 // -> X = nv[0]
 997         lea     AVL, [AV + 4*N]         // -> av[n/4] = av limit
 998         lea     BVL, [BV + 4*N]         // -> bv[n/4] = bv limit
 999         add     BV, 16
1000         add     DV, 16
1001         call    mmul4
1002         add     rdi, 16
1003         add     rax, 16
1004         add     rbx, 16
1005         cmp     rax, AVL                // done already?
1006         jae     8f
1007
1008         .p2align 4
1009         // Complete the first inner loop.
1010 0:      call    dmul4
1011         add     rdi, 16
1012         add     rax, 16
1013         add     rbx, 16
1014         cmp     rax, AVL                // done yet?
1015         jb      0b
1016
1017         // Still have carries left to propagate.
             // After carryprop has done its work, the low word of XMM12 is
             // stored one block further on as the final tail word.
1018         call    carryprop
1019         movd    [rdi + 16], xmm12
1020
1021         .p2align 4
1022         // Embark on the next iteration.  (There must be one.  If n = 1, then
1023         // we would have bailed above, to label 8.  Similarly, the subsequent
1024         // iterations can fall into the inner loop immediately.)
             // Both bv[i] and mi are expanded afresh on every pass, because
             // XMM10/XMM11 do not hold mi any more by this point (mont4, for
             // one, documents that it leaves Y there).
1025 1:      pxor    xmm0, xmm0
1026         movdqu  xmm8, [BV]              // bv[i]
1027         movdqu  xmm10, [MI]             // mi
1028         mov     rdi, DV                 // -> Z = dv[i]
1029         mov     rax, AV                 // -> U = av[0]
1030         mov     rbx, NV                 // -> X = nv[0]
1031         expand  xmm0, xmm8, xmm9, xmm10, xmm11
1032         add     BV, 16
1033         add     DV, 16
1034         call    mmla4
1035         add     rdi, 16
1036         add     rax, 16
1037         add     rbx, 16
1038
1039         .p2align 4
1040         // Complete the next inner loop.
1041 0:      call    dmla4
1042         add     rdi, 16
1043         add     rax, 16
1044         add     rbx, 16
1045         cmp     rax, AVL
1046         jb      0b
1047
1048         // Still have carries left to propagate, and they overlap the
1049         // previous iteration's final tail, so read that in and add it.
1050         movd    xmm0, [rdi]
1051         paddq   xmm12, xmm0
1052         call    carryprop
1053         movd    [rdi + 16], xmm12
1054
1055         // Back again, maybe.
1056         cmp     BV, BVL
1057         jb      1b
1058
1059         // All done.
             // Undo the prologue in reverse order for whichever ABI is in use.
1060 9:
1061
1062 #if ABI_SYSV
1063         popreg  rbx
1064 #endif
1065
1066 #if ABI_WIN
1067
1068         rstrxmm xmm6,    0
1069         rstrxmm xmm7,   16
1070         rstrxmm xmm8,   32
1071         rstrxmm xmm9,   48
1072         rstrxmm xmm10,  64
1073         rstrxmm xmm11,  80
1074         rstrxmm xmm12,  96
1075         rstrxmm xmm13, 112
1076         rstrxmm xmm14, 128
1077         rstrxmm xmm15, 144
1078
1079         stfree  160
1080         popreg  r12
1081         popreg  rdi
1082         popreg  rbx
1083
1084 #endif
1085
1086         ret
1087
1088         // First iteration was short.  Write out the carries and we're done.
1089         // (This could be folded into the main loop structure, but that would
1090         // penalize small numbers more.)
             // The SysV epilogue is short enough to duplicate here; the
             // Windows one is long, so that build jumps back to the shared
             // copy at label 9 instead.
1091 8:      call    carryprop
1092         movd    [rdi + 16], xmm12
1093 #if ABI_SYSV
1094         popreg  rbx
1095         ret
1096 #endif
1097 #if ABI_WIN
1098         jmp     9b
1099 #endif
1100
1101 #undef DV
1102 #undef AV
1103 #undef AVL
1104 #undef BV
1105 #undef BVL
1106 #undef NV
1107 #undef N
1108 #undef MI
1109
1110 ENDFUNC
1111
1112 FUNC(mpxmont_redc4_amd64_avx)
             // AVX entry point for the Montgomery reduction.  It issues a
             // single VZEROUPPER (clearing the upper halves of the YMM
             // registers, presumably to avoid SSE/AVX transition penalties)
             // and leaves the arguments untouched.
             //
             // There is no RET here: control runs straight on into the code
             // assembled next, which is the SSE2 implementation.  The .arch
             // directives enable AVX only for the one instruction that needs it.
1113         .arch   .avx
1114         vzeroupper
1115   endprologue
1116         .arch   pentium4
1117 ENDFUNC
1118
1119 FUNC(mpxmont_redc4_amd64_sse2)
1120         // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
1121         //                             size_t n, const mpw *mi);
             //
             // Montgomery reduction driver, working in place on the buffer
             // dv..dvl in blocks of four words (16 bytes).  Each outer pass
             // calls mont4 on the current block of dv, runs mla4 over the
             // remaining blocks of nv, and then ripples the leftover carry
             // through the rest of dv: first a block at a time, then a word
             // at a time for whatever is left over before dvl.
             //
             // n is a count of words: the limits are formed as base + 4*n
             // bytes.  It is assumed to be a nonzero multiple of four, since
             // mont4 and mla4 each handle a whole block.
1122
1123         // Establish the arguments and do initial setup.
1124         //
1125         //                      sysv    win
1126         // inner loop dv        rdi     rdi*
1127         // dv limit             rax     rax
1128         // blocks-of-4 dv limit rsi     rdx
1129         // inner loop nv        rbx*    rbx*
1130         // mi                   r8      r10
1131         // outer loop dv        r10     rcx
1132         // outer loop dv limit  r11     r11
1133         // nv base              rdx     r8
1134         // nv limit             r9      r12*
1135         // n                    rcx     r9
1136         // c                    rcx     r9
             //
             // (An asterisk presumably marks a register which is callee-saved
             // under that ABI: those are exactly the ones pushed below.  The
             // carry c reuses the register of n, as a 32-bit value; n is not
             // needed once the limits have been computed.)
1137
1138 #if ABI_SYSV
1139
1140 #  define DVL rax
1141 #  define DVL4 rsi
1142 #  define MI r8
1143 #  define DV r10
1144 #  define DVLO r11
1145 #  define NV rdx
1146 #  define NVL r9
1147 #  define N rcx
1148 #  define C ecx
1149
1150         pushreg rbx
1151   endprologue
1152
             // The inner-loop destination pointer stays in RDI, where the
             // first argument arrived; keep a second copy as the outer-loop
             // cursor.
1153         mov     DV, rdi
1154
1155 #endif
1156
1157 #if ABI_WIN
1158
1159 #  define DVL rax
1160 #  define DVL4 rdx
1161 #  define MI r10
1162 #  define DV rcx
1163 #  define DVLO r11
1164 #  define NV r8
1165 #  define NVL r12
1166 #  define N r9
1167 #  define C r9d
1168
1169         pushreg rbx
1170         pushreg rdi
1171         pushreg r12
             // 160 bytes hold the ten XMM registers saved below.  No padding
             // is needed here: the return address plus three pushes already
             // leave the stack 16-byte aligned -- assuming pushreg and stalloc
             // are plain push/sub wrappers.
1172         stalloc 160
1173
1174         savexmm xmm6,    0
1175         savexmm xmm7,   16
1176         savexmm xmm8,   32
1177         savexmm xmm9,   48
1178         savexmm xmm10,  64
1179         savexmm xmm11,  80
1180         savexmm xmm12,  96
1181         savexmm xmm13, 112
1182         savexmm xmm14, 128
1183         savexmm xmm15, 144
1184
1185   endprologue
1186
             // The fifth argument (mi) is on the stack, at 160 (stalloc) +
             // 24 (three pushes) + 8 (return address) + 32 (caller's shadow
             // space) = 224.
1187         mov     rdi, DV
1188         mov     MI, [rsp + 224]
1189
1190 #endif
1191
1192         // Establish the expanded operands and the blocks-of-4 dv limit.
             // The second argument (dvl) arrives in the register named DVL4.
             // Copy it to DVL first, then round the byte length of dv down to
             // a multiple of 16 and turn that back into a pointer.  The
             // expansion of mi into XMM8/XMM9 is interleaved with this.
1193         pxor    xmm0, xmm0
1194         mov     DVL, DVL4               // -> dv[n] = dv limit
1195         sub     DVL4, DV                // length of dv in bytes
1196         movdqu  xmm8, [MI]              // mi
1197         and     DVL4, ~15               // mask off the tail end
1198         expand  xmm0, xmm8, xmm9
1199         add     DVL4, DV                // find limit
1200
1201         // Set up the outer loop state and prepare for the first iteration.
1202         mov     rbx, NV                 // -> X = nv[0]
1203         lea     DVLO, [DV + 4*N]        // -> dv[n/4] = outer dv limit
1204         lea     NVL, [NV + 4*N]         // -> nv[n/4] = nv limit
1205         add     DV, 16
1206         call    mont4
1207         add     rbx, 16
1208         add     rdi, 16
1209         cmp     rbx, NVL                // done already?
1210         jae     8f
1211
1212         .p2align 4
1213         // Complete the first inner loop.
1214 5:      call    mla4
1215         add     rbx, 16
1216         add     rdi, 16
1217         cmp     rbx, NVL                // done yet?
1218         jb      5b
1219
1220         // Still have carries left to propagate.
             // After carryadd, XMM15 is shifted left by 16 bits within each
             // 64-bit lane, its low lane is moved up into the high lane, and
             // the result is added into XMM14 before carryprop runs.  The low
             // word of XMM12 is then taken as the scalar carry C.
1221 8:      carryadd
1222         psllq   xmm15, 16
1223         pslldq  xmm15, 8
1224         paddq   xmm14, xmm15
1225         call    carryprop
1226         movd    C, xmm12
1227         add     rdi, 16
1228         cmp     rdi, DVL4
1229         jae     7f
1230
1231         .p2align 4
1232         // Continue carry propagation until the end of the buffer.
             // C is added into the first word of the block and rippled through
             // the other three.  MOV does not alter the flags, so C can be
             // cleared in the middle of the chain and then pick up the final
             // carry-out before the pointer arithmetic disturbs CF.
1233 0:      add     [rdi], C
1234         mov     C, 0                    // preserves flags
1235         adcd    [rdi + 4], 0
1236         adcd    [rdi + 8], 0
1237         adcd    [rdi + 12], 0
1238         adc     C, 0
1239         add     rdi, 16
1240         cmp     rdi, DVL4
1241         jb      0b
1242
1243         // Deal with the tail end.
             // One word at a time.  The carry out of each word must be
             // captured in C before RDI is advanced: ADD rewrites CF, so
             // bumping the pointer first would always leave C zero and the
             // carry into the next tail word would be lost.
1244 7:      add     [rdi], C
1245         mov     C, 0                    // preserves flags
1246         adc     C, 0
1247         add     rdi, 4
1248         cmp     rdi, DVL
1249         jb      7b
1250
1251         // All done for this iteration.  Start the next.  (This must have at
1252         // least one follow-on iteration, or we'd not have started this outer
1253         // loop.)
             // Execution simply falls into this point from the tail loop.  The
             // jump back to label 5 skips the `done already?' test made on the
             // very first pass: that is safe because we only get here when the
             // outer limit has not been reached, i.e. when there are at least
             // two blocks.
1254 8:      mov     rdi, DV                 // -> Z = dv[i]
1255         mov     rbx, NV                 // -> X = nv[0]
1256         cmp     rdi, DVLO               // all done yet?
1257         jae     9f
1258         add     DV, 16
1259         call    mont4
1260         add     rdi, 16
1261         add     rbx, 16
1262         jmp     5b
1263
1264         // All over.
             // Undo the prologue in reverse order for whichever ABI is in use.
1265 9:
1266
1267 #if ABI_SYSV
1268         popreg  rbx
1269 #endif
1270
1271 #if ABI_WIN
1272
1273         rstrxmm xmm6,    0
1274         rstrxmm xmm7,   16
1275         rstrxmm xmm8,   32
1276         rstrxmm xmm9,   48
1277         rstrxmm xmm10,  64
1278         rstrxmm xmm11,  80
1279         rstrxmm xmm12,  96
1280         rstrxmm xmm13, 112
1281         rstrxmm xmm14, 128
1282         rstrxmm xmm15, 144
1283
1284         stfree  160
1285         popreg  r12
1286         popreg  rdi
1287         popreg  rbx
1288
1289 #endif
1290
1291         ret
1292
1293 #undef DVL
1294 #undef DVL4
1295 #undef MI
1296 #undef DV
1297 #undef DVLO
1298 #undef NV
1299 #undef NVL
1300 #undef N
1301 #undef C
1302
1303 ENDFUNC
1304
1305 ///--------------------------------------------------------------------------
1306 /// Testing and performance measurement.
1307
1308 #ifdef TEST_MUL4
1309
1310 #if ABI_SYSV
     // Argument locations for the test entry points.  ARGn is where the n-th
     // integer argument is found under the current ABI: a register for the
     // first six (SysV) or four (Windows), and a stack slot after that.
     //
     // STKARG_OFFSET is the distance from RSP to the first stack argument once
     // testprologue has built its frame.  For SysV this is the return address
     // plus the saved RBX, 16 bytes in all.
     //
     // NOTE(review): the code visible in this part of the file uses STKARG
     // directly and does not refer to the ARGn names.
1311 #  define ARG0 rdi
1312 #  define ARG1 rsi
1313 #  define ARG2 rdx
1314 #  define ARG3 rcx
1315 #  define ARG4 r8
1316 #  define ARG5 r9
1317 #  define ARG6 STKARG(0)
1318 #  define ARG7 STKARG(1)
1319 #  define ARG8 STKARG(2)
1320 #  define STKARG_OFFSET 16
1321 #endif
1322 #if ABI_WIN
     // For Windows the offset is the return address, the saved RBX and RDI,
     // the 168 bytes allocated by testprologue, and the 32 bytes of shadow
     // space provided by the caller: 8 + 16 + 168 + 32 = 224.
1323 #  define ARG0 rcx
1324 #  define ARG1 rdx
1325 #  define ARG2 r8
1326 #  define ARG3 r9
1327 #  define ARG4 STKARG(0)
1328 #  define ARG5 STKARG(1)
1329 #  define ARG6 STKARG(2)
1330 #  define ARG7 STKARG(3)
1331 #  define ARG8 STKARG(4)
1332 #  define STKARG_OFFSET 224
1333 #endif
     // The i-th stack argument (counting from zero), as a memory operand.
     // Stack slots are 8 bytes each.
1334 #define STKARG(i) [rsp + STKARG_OFFSET + 8*(i)]
1335
1336 //                sysv                          win
1337 //                dmul  smul  mmul  mont        dmul  smul  mmul  mont
1338 // A    rax
1339 // D    rdx
1340 // z    rdi       rdi   rdi   rdi    rdi        rcx   rcx   rcx    rcx
1341 // c    rcx       rsi   rsi   rsi    rsi        rdx   rdx   rdx    rdx
1342 // y    r10       --    --    rdx    rdx        --    --    r8     r8
1343 // u    r11       rdx   --    rcx    --         r8    --    r9     --
1344 // x    rbx       rcx   rdx   r8     rcx        r9    r8    stk0   r9
1345 // vv   xmm8/9    r8    --    r9     r8         stk0  --    stk1   stk0
1346 // yy   xmm10/11  r9    rcx   stk0   --         stk1  r9    stk2   --
1347 // n    r8        stk0  r8    stk1   r9         stk2  stk0  stk3   stk1
1348 // cyv  r9        stk1  r9    stk2   stk0       stk3  stk1  stk4   stk2
1349
1350 .macro  cysetup v, n
             // Start a cycle measurement.  Read the time-stamp counter,
             // combine the two halves returned in EDX:EAX into one 64-bit
             // value, and store it in slot n - 1 of the array of 8-byte
             // entries at v.  Both arguments are registers.  Clobbers RAX,
             // RDX and the flags.
1351         rdtsc
1352         shl     rdx, 32
1353         or      rax, rdx
1354         mov     [\v + 8*\n - 8], rax
1355 .endm
1356
1357 .macro  cystore v, n
             // Finish a cycle measurement.  Read the time-stamp counter again,
             // subtract the start value which cysetup left in slot n - 1 of
             // the array at v, and overwrite that slot with the difference.
             // Then decrement n.  The final DEC leaves ZF set exactly when n
             // has reached zero, which is what testtail branches on.
             // Clobbers RAX, RDX and the flags.
1358         rdtsc
1359         shl     rdx, 32
1360         or      rax, rdx
1361         sub     rax, [\v + 8*\n - 8]
1362         mov     [\v + 8*\n - 8], rax
1363         dec     \n
1364 .endm
1365
1366 .macro  testprologue mode
             // Common entry sequence for the test functions.  It saves the
             // callee-saved registers the test needs and moves the C-level
             // arguments into the registers listed in the leftmost column of
             // the table above.  It then loads the packed operands through
             // their pointers and expands them.
             //
             // The mode argument is one of dmul, smul, mmul or mont, and
             // selects which column of the table describes the incoming
             // arguments.  In every mode the iteration count n is read as a
             // 32-bit value into R8D, or is already in R8 (SysV smul).
             //
             // The order of the moves matters: each incoming register is
             // copied out, or the memory it points to is read, before that
             // register is overwritten with its new contents.
1367         pushreg rbx
1368 #if ABI_SYSV
1369   endprologue
1370   .ifeqs "\mode", "dmul"
1371         mov     rbx, rcx
1372         movdqu  xmm8, [r8]
1373         movdqu  xmm10, [r9]
1374         mov     r8d, STKARG(0)
1375         mov     r9, STKARG(1)
1376         mov     r11, rdx
1377         mov     rcx, rsi
1378   .endif
1379   .ifeqs "\mode", "smul"
1380         mov     rbx, rdx
1381         movdqu  xmm10, [rcx]
1382         mov     rcx, rsi
1383   .endif
1384   .ifeqs "\mode", "mmul"
1385         mov     rax, STKARG(0)
1386         mov     rbx, r8
1387         movdqu  xmm8, [r9]
1388         movdqu  xmm10, [rax]
1389         mov     r8d, STKARG(1)
1390         mov     r9, STKARG(2)
1391         mov     r10, rdx
1392         mov     r11, rcx
1393         mov     rcx, rsi
1394   .endif
1395   .ifeqs "\mode", "mont"
1396         mov     rbx, rcx
1397         movdqu  xmm8, [r8]
1398         mov     r8d, r9d
1399         mov     r9, STKARG(0)
1400         mov     r10, rdx
1401         mov     rcx, rsi
1402   .endif
1403 #endif
1404 #if ABI_WIN
             // Windows also requires RDI and XMM6--XMM15 to be preserved.
             // The 168 bytes are 160 for the ten XMM slots plus 8 which
             // presumably keep the stack 16-byte aligned after the return
             // address and two pushes.
1405         pushreg rdi
1406         stalloc 168
1407         savexmm xmm6,    0
1408         savexmm xmm7,   16
1409         savexmm xmm8,   32
1410         savexmm xmm9,   48
1411         savexmm xmm10,  64
1412         savexmm xmm11,  80
1413         savexmm xmm12,  96
1414         savexmm xmm13, 112
1415         savexmm xmm14, 128
1416         savexmm xmm15, 144
1417   endprologue
1418   .ifeqs "\mode", "dmul"
1419         mov     r10, STKARG(0)
1420         mov     r11, STKARG(1)
1421         mov     rdi, rcx
1422         mov     rcx, rdx
1423         mov     rbx, r9
1424         movdqu  xmm8, [r10]
1425         movdqu  xmm10, [r11]
1426         mov     r11, r8
1427         mov     r8d, STKARG(2)
1428         mov     r9, STKARG(3)
1429   .endif
1430   .ifeqs "\mode", "smul"
1431         mov     rdi, rcx
1432         mov     rcx, rdx
1433         mov     rbx, r8
1434         movdqu  xmm10, [r9]
1435         mov     r8d, STKARG(0)
1436         mov     r9, STKARG(1)
1437   .endif
1438   .ifeqs "\mode", "mmul"
1439         mov     r10, STKARG(1)
1440         mov     r11, STKARG(2)
1441         mov     rdi, rcx
1442         mov     rcx, rdx
1443         mov     rbx, STKARG(0)
1444         movdqu  xmm8, [r10]
1445         movdqu  xmm10, [r11]
1446         mov     r10, r8
1447         mov     r11, r9
1448         mov     r8d, STKARG(3)
1449         mov     r9, STKARG(4)
1450   .endif
1451   .ifeqs "\mode", "mont"
1452         mov     r10, STKARG(0)
1453         mov     rdi, rcx
1454         mov     rcx, rdx
1455         mov     rbx, r9
1456         movdqu  xmm8, [r10]
1457         mov     r10, r8
1458         mov     r8d, STKARG(1)
1459         mov     r9, STKARG(2)
1460   .endif
1461 #endif
1462
             // Expand whichever packed operands this mode loaded above:
             // XMM8 into XMM8/XMM9 and/or XMM10 into XMM10/XMM11.  XMM0 is
             // zeroed first and passed to expand.
1463         pxor    xmm0, xmm0
1464   .ifeqs "\mode", "dmul"
1465         expand  xmm0, xmm8, xmm9, xmm10, xmm11
1466   .endif
1467   .ifeqs "\mode", "smul"
1468         expand  xmm0, xmm10, xmm11
1469   .endif
1470   .ifeqs "\mode", "mmul"
1471         expand  xmm0, xmm8, xmm9, xmm10, xmm11
1472   .endif
1473   .ifeqs "\mode", "mont"
1474         expand  xmm0, xmm8, xmm9
1475   .endif
1476 .endm
1477
1478 .macro  testepilogue
             // Common exit sequence for the test functions: the mirror image
             // of the register saving done by testprologue, followed by the
             // return.  On Windows the XMM registers, the 168-byte frame and
             // RDI are restored first; RBX is restored on both ABIs.
1479 #if ABI_WIN
1480         rstrxmm xmm6,    0
1481         rstrxmm xmm7,   16
1482         rstrxmm xmm8,   32
1483         rstrxmm xmm9,   48
1484         rstrxmm xmm10,  64
1485         rstrxmm xmm11,  80
1486         rstrxmm xmm12,  96
1487         rstrxmm xmm13, 112
1488         rstrxmm xmm14, 128
1489         rstrxmm xmm15, 144
1490         stfree  168
1491         popreg  rdi
1492 #endif
1493         popreg  rbx
1494         ret
1495 .endm
1496
1497 .macro  testldcarry
             // Load the three carry registers XMM12--XMM14 from the 48-byte
             // buffer at RCX (the `c' argument, as placed by testprologue).
             // Unaligned loads are used, so the buffer needs no particular
             // alignment.
1498         movdqu  xmm12, [rcx +  0]       // (c'_0; c''_0)
1499         movdqu  xmm13, [rcx + 16]       // (c'_1; c''_1)
1500         movdqu  xmm14, [rcx + 32]       // (c'_2; c''_2)
1501 .endm
1502
1503 .macro  testtop u=nil
             // Head of the measurement loop.  It defines the local label 0,
             // which testtail branches back to, and records the starting
             // cycle count for this round in the array at R9, indexed by the
             // remaining count in R8.
             //
             // If a register u is given, it is copied into RAX afterwards.
             // This has to be redone on every round because cysetup
             // overwrites RAX.  With the default, nil, nothing is loaded.
1504         .p2align 4
1505 0:
1506         cysetup r9, r8
1507   .ifnes "\u", "nil"
1508         mov     rax, \u
1509   .endif
1510 .endm
1511
1512 .macro  testtail
             // Foot of the measurement loop.  It replaces the start value for
             // this round with the elapsed cycle count, counts R8 down, and
             // goes back to the label 0 defined by testtop until R8 reaches
             // zero.  The branch relies on the flags left by the DEC at the
             // end of cystore.
1513         cystore r9, r8
1514         jnz     0b
1515 .endm
1516
1517 .macro  testcarryout
             // Store the three carry registers XMM12--XMM14 back into the
             // 48-byte buffer at RCX, using the same layout as testldcarry
             // reads.
1518         movdqu  [rcx +  0], xmm12
1519         movdqu  [rcx + 16], xmm13
1520         movdqu  [rcx + 32], xmm14
1521 .endm
1522
1523 FUNC(test_dmul4)
             // Test and timing wrapper for dmul4.  Arguments follow the dmul
             // column of the table above.  The carry registers are loaded from
             // the carry buffer, dmul4 is called n times on the same operands
             // with a cycle count recorded for each call, and the final carry
             // is written back.  R11 holds the `u' pointer and is copied to
             // RAX before each call.
1524         testprologue dmul
1525         testldcarry
1526         testtop r11
1527         call    dmul4
1528         testtail
1529         testcarryout
1530         testepilogue
1531 ENDFUNC
1532
1533 FUNC(test_dmla4)
             // Test and timing wrapper for dmla4.  Arguments follow the dmul
             // column of the table above.  The carry registers are loaded from
             // the carry buffer, dmla4 is called n times on the same operands
             // with a cycle count recorded for each call, and the final carry
             // is written back.  R11 holds the `u' pointer and is copied to
             // RAX before each call.
1534         testprologue dmul
1535         testldcarry
1536         testtop r11
1537         call    dmla4
1538         testtail
1539         testcarryout
1540         testepilogue
1541 ENDFUNC
1542
1543 FUNC(test_mul4)
             // Test and timing wrapper for mul4.  Arguments follow the smul
             // column of the table above.  The carry registers are loaded from
             // the carry buffer, mul4 is called n times on the same operands
             // with a cycle count recorded for each call, and the final carry
             // is written back.  No `u' operand is involved, so RAX is not
             // set up.
1544         testprologue smul
1545         testldcarry
1546         testtop nil
1547         call    mul4
1548         testtail
1549         testcarryout
1550         testepilogue
1551 ENDFUNC
1552
1553 FUNC(test_mla4)
             // Test and timing wrapper for mla4.  Arguments follow the smul
             // column of the table above.  The carry registers are loaded from
             // the carry buffer, mla4 is called n times on the same operands
             // with a cycle count recorded for each call, and the final carry
             // is written back.  No `u' operand is involved, so RAX is not
             // set up.
1554         testprologue smul
1555         testldcarry
1556         testtop nil
1557         call    mla4
1558         testtail
1559         testcarryout
1560         testepilogue
1561 ENDFUNC
1562
1563 FUNC(test_mmul4)
             // Test and timing wrapper for mmul4.  Arguments follow the mmul
             // column of the table above.  No incoming carry is loaded.  After
             // n timed calls, the contents of XMM10/XMM11 are written to the
             // 32-byte buffer at R10 (the `y' argument) and the carry
             // registers to the carry buffer.
1564         testprologue mmul
1565         testtop r11
1566         call    mmul4
1567         testtail
1568         movdqu  [r10 +  0], xmm10
1569         movdqu  [r10 + 16], xmm11
1570         testcarryout
1571         testepilogue
1572 ENDFUNC
1573
1574 FUNC(test_mmla4)
             // Test and timing wrapper for mmla4.  Arguments follow the mmul
             // column of the table above.  No incoming carry is loaded.  After
             // n timed calls, the contents of XMM10/XMM11 are written to the
             // 32-byte buffer at R10 (the `y' argument) and the carry
             // registers to the carry buffer.
1575         testprologue mmul
1576         testtop r11
1577         call    mmla4
1578         testtail
1579         movdqu  [r10 +  0], xmm10
1580         movdqu  [r10 + 16], xmm11
1581         testcarryout
1582         testepilogue
1583 ENDFUNC
1584
1585 FUNC(test_mont4)
             // Test and timing wrapper for mont4.  Arguments follow the mont
             // column of the table above.  No incoming carry is loaded and no
             // `u' operand is set up.  After n timed calls, Y (which mont4
             // leaves in XMM10/XMM11) is written to the 32-byte buffer at R10
             // and the carry registers to the carry buffer.
1586         testprologue mont
1587         testtop
1588         call    mont4
1589         testtail
1590         movdqu  [r10 +  0], xmm10
1591         movdqu  [r10 + 16], xmm11
1592         testcarryout
1593         testepilogue
1594 ENDFUNC
1595
1596 #endif
1597
1598 ///----- That's all, folks --------------------------------------------------