mdw@git.distorted.org.uk Git - catacomb/blob - math/mpx-mul4-x86-sse2.S

   1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
   2 ///
   3 /// Large SIMD-based multiplications
   4 ///
   5 /// (c) 2016 Straylight/Edgeware
   6
   7 ///----- Licensing notice ---------------------------------------------------
   8 ///
   9 /// This file is part of Catacomb.
  10 ///
  11 /// Catacomb is free software; you can redistribute it and/or modify
  12 /// it under the terms of the GNU Library General Public License as
  13 /// published by the Free Software Foundation; either version 2 of the
  14 /// License, or (at your option) any later version.
  15 ///
  16 /// Catacomb is distributed in the hope that it will be useful,
  17 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 /// GNU Library General Public License for more details.
  20 ///
  21 /// You should have received a copy of the GNU Library General Public
  22 /// License along with Catacomb; if not, write to the Free
  23 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  24 /// MA 02111-1307, USA.
  25
  26 ///--------------------------------------------------------------------------
  27 /// External definitions.
  28
  29 #include "config.h"
  30 #include "asm-common.h"
  31
  32 ///--------------------------------------------------------------------------
  33 /// Prologue.
  34
  35         .arch   pentium4                // accept Pentium 4 instructions, SSE2 included
  36         .text                           // everything below is code
  37
  38 ///--------------------------------------------------------------------------
  39 /// Theory.
  40 ///
  41 /// We define a number of primitive fixed-size multipliers from which we can
  42 /// construct more general variable-length multipliers.
  43 ///
  44 /// The basic trick is the same throughout.  In an operand-scanning
  45 /// multiplication, the inner multiplication loop multiplies a
  46 /// multiple-precision operand by a single precision factor, and adds the
  47 /// result, appropriately shifted, to the result.  A `finely integrated
  48 /// operand scanning' implementation of Montgomery multiplication also adds
  49 /// the product of a single-precision `Montgomery factor' and the modulus,
  50 /// calculated in the same pass.  The more common `coarsely integrated
  51 /// operand scanning' alternates main multiplication and Montgomery passes,
  52 /// which requires additional carry propagation.
  53 ///
  54 /// Throughout both plain-multiplication and Montgomery stages, then, one of
  55 /// the factors remains constant throughout the operation, so we can afford
  56 /// to take a little time to preprocess it.  The transformation we perform is
  57 /// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
  58 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
  59 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
  60 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
  61 /// operands, as follows.
  62 ///
  63 ///     Offset     0       4        8      12
  64 ///        0    v'_0    v'_1    v''_0   v''_1
  65 ///       16    v'_2    v'_3    v''_2   v''_3
  66 ///
  67 /// A `pmuludq' instruction ignores the odd positions in its operands; thus,
  68 /// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
  69 /// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
  70 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
  71 /// results in 64-bit fields.  The sixteen bits of headroom allows us to add
  72 /// many products together before we must deal with carrying; it also allows
  73 /// for some calculations to be performed on the above expanded form.
  74 ///
  75 /// On 32-bit x86, we are register starved: the expanded operands are kept in
  76 /// memory, typically in warm L1 cache.
  77 ///
  78 /// We maintain four `carry' registers accumulating intermediate results.
  79 /// The registers' precise roles rotate during the computation; we name them
  80 /// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
  81 /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
  82 /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
  83 /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
  84 /// `pmuludq' instruction acting on a scalar operand (broadcast across all
  85 /// lanes of its vector) and an operand in the expanded form above produces a
  86 /// result which can be added directly to the appropriate carry register.
  87 /// Following a pass of four multiplications, we perform some limited carry
  88 /// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
  89 /// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
  90 /// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
  91 /// zeroed and becomes c3.
  92
  93 ///--------------------------------------------------------------------------
  94 /// Macro definitions.
  95
  96 .macro  mulcore r, s, d0, d1=nil, d2=nil, d3=nil
  97         // Load a word r_i from R, multiply by the expanded operand [S], and
  98         // leave the pieces of the product in registers D0, D1, D2, D3.
             //
             // D0 is always produced.  Each of D1, D2, D3 may be `nil', in which
             // case that piece of the product is not calculated.  S names a
             // register pointing to the expanded operand; the operand is read
             // with `movdqa' and as a `pmuludq' memory operand, so it must lie
             // on a 16-byte boundary.
  99         movd    \d0, \r                 // (r_i, 0, 0, 0)
 100   .ifnes "\d1", "nil"
 101         movdqa  \d1, [\s]               // (s'_0, s'_1, s''_0, s''_1)
 102   .endif
 103   .ifnes "\d3", "nil"
 104         movdqa  \d3, [\s + 16]          // (s'_2, s'_3, s''_2, s''_3)
 105   .endif
 106         pshufd  \d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?, r_i, ?)
 107   .ifnes "\d1", "nil"
 108         psrldq  \d1, 4                  // (s'_1, s''_0, s''_1, 0)
 109   .endif
 110   .ifnes "\d2", "nil"
 111     .ifnes "\d3", "nil"
 112         movdqa  \d2, \d3                // another copy of (s'_2, s'_3, ...)
 113     .else
 114         movdqa  \d2, \d0                // another copy of (r_i, ?, r_i, ?)
 115     .endif
 116   .endif
 117   .ifnes "\d3", "nil"
 118         psrldq  \d3, 4                  // (s'_3, s''_2, s''_3, 0)
 119   .endif
 120   .ifnes "\d1", "nil"
 121         pmuludq \d1, \d0                // (r_i s'_1, r_i s''_1)
 122   .endif
 123   .ifnes "\d3", "nil"
 124         pmuludq \d3, \d0                // (r_i s'_3, r_i s''_3)
 125   .endif
 126   .ifnes "\d2", "nil"
 127     .ifnes "\d3", "nil"
 128         pmuludq \d2, \d0                // (r_i s'_2, r_i s''_2)
 129     .else
 130         pmuludq \d2, [\s + 16]          // (r_i s'_2, r_i s''_2), from memory
 131     .endif
 132   .endif
             // D0 is multiplied last because it held the broadcast copy of r_i
             // which the other products needed.
 133         pmuludq \d0, [\s]               // (r_i s'_0, r_i s''_0)
 134 .endm
 135
 136 .macro  accum   c0, c1=nil, c2=nil, c3=nil
 137         // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
 138         // carry registers C0--C3.  Any or all of C1--C3 may be `nil' to skip
 139         // updating that register.
             //
             // C0 is always updated.  The pairing is fixed: XMM0 goes into C0,
             // XMM1 into C1, and so on.  Both 64-bit halves are added
             // independently.
 140         paddq   \c0, xmm0               // c0 += piece 0
 141   .ifnes "\c1", "nil"
 142         paddq   \c1, xmm1               // c1 += piece 1
 143   .endif
 144   .ifnes "\c2", "nil"
 145         paddq   \c2, xmm2               // c2 += piece 2
 146   .endif
 147   .ifnes "\c3", "nil"
 148         paddq   \c3, xmm3               // c3 += piece 3
 149   .endif
 150 .endm
 151
 152 .macro  mulacc  r, s, c0, c1, c2, c3, z3p=nil
 153         // Load a word r_i from R, multiply by the expanded operand [S],
 154         // and accumulate in carry registers C0, C1, C2, C3.  If Z3P is `t'
 155         // then C3 notionally contains zero, but needs clearing; in practice,
 156         // we store the product directly rather than attempting to add.  On
 157         // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
 158         // is not `t'.
 159   .ifeqs "\z3p", "t"
             // Piece 3 of the product is built directly in C3, so whatever C3
             // held before is discarded.
 160         mulcore \r, \s, xmm0, xmm1, xmm2, \c3
 161         accum           \c0,  \c1,  \c2
 162   .else
             // All four pieces go through XMM0--XMM3 and are added in.
 163         mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
 164         accum           \c0,  \c1,  \c2,  \c3
 165   .endif
 166 .endm
 167
 168 .macro  propout d, c, cc=nil
 169         // Calculate an output word from C, and store it in D; propagate
 170         // carries out from C to CC in preparation for a rotation of the
 171         // carry registers.  On completion, XMM3 is clobbered.  If CC is
 172         // `nil', then the contribution which would have been added to it is
 173         // left in C.
             //
             // In either case C itself is overwritten with that contribution,
             // namely both of its 64-bit halves shifted right by 32 bits.
 174         pshufd  xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
 175         psrldq  xmm3, 12                // (t, 0, 0, 0) = (t, 0)
 176         pslldq  xmm3, 2                 // (t b, 0)
 177         paddq   \c, xmm3                // (c' + t b, c'')
 178         movd    \d, \c                  // output the low 32 bits of c' + t b
 179         psrlq   \c, 32                  // floor(c/B)
 180   .ifnes "\cc", "nil"
 181         paddq   \cc, \c                 // propagate up
 182   .endif
 183 .endm
 184
 185 .macro  endprop d, c, t
 186         // On entry, C contains a carry register.  On exit, the low 32 bits
 187         // of the value represented in C are written to D, and the remaining
 188         // bits are left at the bottom of T.
             //
             // C is clobbered: its low half is moved into its high half while
             // the two halves are being combined.
 189         movdqa  \t, \c
 190         psllq   \t, 16                  // (?, c'' b)
 191         pslldq  \c, 8                   // (0, c')
 192         paddq   \t, \c                  // (?, c' + c'' b)
 193         psrldq  \t, 8                   // c' + c'' b
 194         movd    \d, \t                  // output the low 32 bits
 195         psrldq  \t, 4                   // floor((c' + c'' b)/B)
 196 .endm
 197
 198 .macro  expand  z, a, b, c=nil, d=nil
 199         // On entry, A and C hold packed 128-bit values, and Z is zero.  On
 200         // exit, A:B and C:D together hold the same values in expanded
 201         // form.  If C is `nil', then only expand A to A:B.
             //
             // B (and D, when C is given) are pure outputs.  Z is only read: it
             // supplies the zero words interleaved between the 16-bit pieces.
 202         movdqa  \b, \a                  // (a_0, a_1, a_2, a_3)
 203   .ifnes "\c", "nil"
 204         movdqa  \d, \c                  // (c_0, c_1, c_2, c_3)
 205   .endif
             // Widen each 16-bit piece to a 32-bit cell.
 206         punpcklwd \a, \z                // (a'_0, a''_0, a'_1, a''_1)
 207         punpckhwd \b, \z                // (a'_2, a''_2, a'_3, a''_3)
 208   .ifnes "\c", "nil"
 209         punpcklwd \c, \z                // (c'_0, c''_0, c'_1, c''_1)
 210         punpckhwd \d, \z                // (c'_2, c''_2, c'_3, c''_3)
 211   .endif
             // Swap the middle two cells to group the low pieces together.
 212         pshufd  \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
 213         pshufd  \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
 214   .ifnes "\c", "nil"
 215         pshufd  \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
 216         pshufd  \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
 217   .endif
 218 .endm
 219
 220 .macro  squash  c0, c1, c2, c3, t, u, lo, hi=nil
 221         // On entry, C0, C1, C2, C3 are carry registers representing a value
 222         // Y.  On exit, LO holds the low 128 bits of the carry value; C1, C2,
 223         // C3, T, and U are clobbered; and the high bits of Y are stored in
 224         // HI, if this is not `nil'.
             //
             // C0 is overwritten as well.  LO may be the same register as C0:
             // C0 is last read before LO is first written, and the callers
             // visible in this file (mmla4 and mont4) pass XMM4 for both.
 225
 226         // The first step is to eliminate the `double-prime' pieces -- i.e.,
 227         // the ones offset by 16 bits from a 32-bit boundary -- by carrying
 228         // them into the 32-bit-aligned pieces above and below.  But before
 229         // we can do that, we must gather them together.
 230         movdqa  \t, \c0
 231         movdqa  \u, \c1
 232         punpcklqdq \t, \c2              // (y'_0, y'_2)
 233         punpckhqdq \c0, \c2             // (y''_0, y''_2)
 234         punpcklqdq \u, \c3              // (y'_1, y'_3)
 235         punpckhqdq \c1, \c3             // (y''_1, y''_3)
 236
 237         // Now split the double-prime pieces.  The high (up to) 48 bits will
 238         // go up; the low 16 bits go down.
 239         movdqa  \c2, \c0
 240         movdqa  \c3, \c1
 241         psllq   \c2, 48                 // keep only the low 16 bits, at the top
 242         psllq   \c3, 48
 243         psrlq   \c0, 16                 // high parts of (y''_0, y''_2)
 244         psrlq   \c1, 16                 // high parts of (y''_1, y''_3)
 245         psrlq   \c2, 32                 // low parts of (y''_0, y''_2)
 246         psrlq   \c3, 32                 // low parts of (y''_1, y''_3)
 247   .ifnes "\hi", "nil"
 248         movdqa  \hi, \c1
 249   .endif
 250         pslldq  \c1, 8                  // high part of (0, y''_1)
 251
 252         paddq   \t, \c2                 // propagate down
 253         paddq   \u, \c3
 254         paddq   \t, \c1                 // and up: (y_0, y_2)
 255         paddq   \u, \c0                 // (y_1, y_3)
 256   .ifnes "\hi", "nil"
 257         psrldq  \hi, 8                  // high part of (y''_3, 0)
 258   .endif
 259
 260         // Finally extract the answer.  This complicated dance is better than
 261         // storing to memory and loading, because the piecemeal stores
 262         // inhibit store forwarding.
 263         movdqa  \c3, \t                 // (y_0, y_2)
 264         movdqa  \lo, \t                 // (y^*_0, ?, ?, ?)
 265         psrldq  \t, 8                   // (y_2, 0)
 266         psrlq   \c3, 32                 // (floor(y_0/B), ?)
 267         paddq   \c3, \u                 // (y_1 + floor(y_0/B), ?)
 268         movdqa  \c1, \c3                // (y^*_1, ?, ?, ?)
 269         psrldq  \u, 8                   // (y_3, 0)
 270         psrlq   \c3, 32                 // (floor((y_1 B + y_0)/B^2), ?)
 271         paddq   \c3, \t                 // (y_2 + floor((y_1 B + y_0)/B^2), ?)
 272         punpckldq \lo, \c3              // (y^*_0, y^*_2, ?, ?)
 273         psrlq   \c3, 32             // (floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
 274         paddq   \c3, \u       // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
 275   .ifnes "\hi", "nil"
 276         movdqa  \t, \c3
 277         pxor    \u, \u
 278   .endif
 279         punpckldq \c1, \c3              // (y^*_1, y^*_3, ?, ?)
 280   .ifnes "\hi", "nil"
 281         psrlq   \t, 32                  // very high bits of y
 282         paddq   \hi, \t
 283         punpcklqdq \hi, \u              // carry up
 284   .endif
 285         punpckldq \lo, \c1              // y mod B^4
 286 .endm
 287
 288 .macro  carryadd
 289         // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
 290         // hold the incoming carry registers c0, c1, and c2 representing a
 291         // carry-in C.
 292         //
 293         // On exit, the carry registers, including XMM7, are updated to hold
 294         // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered.  The other
 295         // registers are preserved.
             //
             // The top word a_3 is not added to anything: it is loaded straight
             // into XMM7, which thereby becomes c3 = (a_3, 0).  Whatever XMM7
             // held on entry is discarded.  The macro itself writes only
             // XMM0--XMM2 and XMM4--XMM7; XMM3 is listed as clobbered above but
             // is not touched here.
 296         movd    xmm0, [edi +  0]        // (a_0, 0)
 297         movd    xmm1, [edi +  4]        // (a_1, 0)
 298         movd    xmm2, [edi +  8]        // (a_2, 0)
 299         movd    xmm7, [edi + 12]        // (a_3, 0)
 300
 301         paddq   xmm4, xmm0              // (c'_0 + a_0, c''_0)
 302         paddq   xmm5, xmm1              // (c'_1 + a_1, c''_1)
 303         paddq   xmm6, xmm2              // (c'_2 + a_2, c''_2)
 304 .endm
 305
 306 ///--------------------------------------------------------------------------
 307 /// Primitive multipliers and related utilities.
 308
 309 INTFUNC(carryprop)
 310         // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
 311         // form.  Store the low 128 bits of the represented carry to [EDI] as
 312         // a packed 128-bit value, and leave the remaining 16 bits in the low
 313         // 32 bits of XMM4.  On exit, XMM3, XMM5 and XMM6 are clobbered.
             //
             // No general-purpose register is written; in particular EDI is not
             // advanced.
 314   endprologue
 315
 316         propout [edi +  0], xmm4, xmm5  // word 0 out; excess carried into c1
 317         propout [edi +  4], xmm5, xmm6  // word 1 out; excess carried into c2
 318         propout [edi +  8], xmm6, nil   // word 2 out; excess stays in XMM6
 319         endprop [edi + 12], xmm6, xmm4  // word 3 out; leftover into XMM4
 320         ret
 321
 322 ENDFUNC
 323
 324 INTFUNC(dmul4)
 325         // On entry, EDI points to the destination buffer; EAX and EBX point
 326         // to the packed operands U and X; ECX and EDX point to the expanded
 327         // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
 328         // registers c0, c1, and c2; c3 is assumed to be zero.
 329         //
 330         // On exit, we write the low 128 bits of the sum C + U V + X Y to
 331         // [EDI], and update the carry registers with the carry out.  The
 332         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 333         // general-purpose registers are preserved.
             //
             // Each group of three lines below handles one word of U and of X:
             // both products are accumulated, one output word is written, and
             // the roles of the carry registers then rotate by one place.  The
             // first `mulacc' of each group passes `t', so the register taking
             // the role of c3 is overwritten rather than added to; XMM7
             // therefore need not actually be zero on entry.  After four
             // rotations the carry out is back in XMM4, XMM5, XMM6.
 334   endprologue
 335
             // Word 0: carries are (XMM4, XMM5, XMM6, XMM7).
 336         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, t
 337         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 338         propout [edi +  0],      xmm4, xmm5
 339
             // Word 1: carries are (XMM5, XMM6, XMM7, XMM4).
 340         mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
 341         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4
 342         propout [edi +  4],      xmm5, xmm6
 343
             // Word 2: carries are (XMM6, XMM7, XMM4, XMM5).
 344         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
 345         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5
 346         propout [edi +  8],      xmm6, xmm7
 347
             // Word 3: carries are (XMM7, XMM4, XMM5, XMM6).
 348         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
 349         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
 350         propout [edi + 12],      xmm7, xmm4
 351
 352         ret
 353
 354 ENDFUNC
 355
 356 INTFUNC(dmla4)
 357         // On entry, EDI points to the destination buffer, which also
 358         // contains an addend A to accumulate; EAX and EBX point to the
 359         // packed operands U and X; ECX and EDX point to the expanded
 360         // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
 361         // registers c0, c1, and c2 representing a carry-in C; c3 is assumed
 362         // to be zero.
 363         //
 364         // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
 365         // [EDI], and update the carry registers with the carry out.  The
 366         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 367         // general-purpose registers are preserved.
             //
             // The structure is that of dmul4, except that the addend is folded
             // into the carry registers first.  All four words of A are read
             // before any output word is written, so updating [EDI] in place
             // is safe.
 368   endprologue
 369
             // Fold in A.  This sets XMM7 = (a_3, 0), which is why the first
             // `mulacc' below must add to c3 and does not pass `t'.
 370         carryadd
 371
             // Word 0: carries are (XMM4, XMM5, XMM6, XMM7).
 372         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
 373         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 374         propout [edi +  0],      xmm4, xmm5
 375
             // Word 1: carries are (XMM5, XMM6, XMM7, XMM4).
 376         mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
 377         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4
 378         propout [edi +  4],      xmm5, xmm6
 379
             // Word 2: carries are (XMM6, XMM7, XMM4, XMM5).
 380         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
 381         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5
 382         propout [edi +  8],      xmm6, xmm7
 383
             // Word 3: carries are (XMM7, XMM4, XMM5, XMM6).
 384         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
 385         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
 386         propout [edi + 12],      xmm7, xmm4
 387
 388         ret
 389
 390 ENDFUNC
 391
 392 INTFUNC(mul4zc)
 393         // On entry, EDI points to the destination buffer; EBX points to a
 394         // packed operand X; and EDX points to an expanded operand Y.
 395         //
 396         // On exit, we write the low 128 bits of the product X Y to [EDI],
 397         // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
 398         // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 399         // general-purpose registers are preserved.
             //
             // `zc' here means there is no carry in: the incoming contents of
             // XMM4--XMM7 are ignored.
 400   endprologue
 401
             // Word 0: with no carry to add to, build the product pieces
             // directly in the four carry registers.
 402         mulcore [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 403         propout [edi +  0],      xmm4, xmm5
 404
             // Words 1--3: accumulate, output a word, and rotate the carry
             // registers; `t' makes the new c3 a fresh product, not a sum.
 405         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 406         propout [edi +  4],      xmm5, xmm6
 407
 408         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 409         propout [edi +  8],      xmm6, xmm7
 410
 411         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 412         propout [edi + 12],      xmm7, xmm4
 413
 414         ret
 415
 416 ENDFUNC
 417
 418 INTFUNC(mul4)
 419         // On entry, EDI points to the destination buffer; EBX points to a
 420         // packed operand X; EDX points to an expanded operand Y; and XMM4,
 421         // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
 422         // representing a carry-in C; c3 is assumed to be zero.
 423         //
 424         // On exit, we write the low 128 bits of the sum C + X Y to [EDI],
 425         // and update the carry registers with the carry out.  The registers
 426         // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 427         // general-purpose registers are preserved.
             //
             // Every `mulacc' passes `t', so the register taking the role of
             // c3 is always overwritten with a fresh product piece; XMM7
             // therefore need not actually be zero on entry.
 428   endprologue
 429
             // Word 0: carries are (XMM4, XMM5, XMM6, XMM7).
 430         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, t
 431         propout [edi +  0],      xmm4, xmm5
 432
             // Word 1: carries are (XMM5, XMM6, XMM7, XMM4).
 433         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 434         propout [edi +  4],      xmm5, xmm6
 435
             // Word 2: carries are (XMM6, XMM7, XMM4, XMM5).
 436         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 437         propout [edi +  8],      xmm6, xmm7
 438
             // Word 3: carries are (XMM7, XMM4, XMM5, XMM6).
 439         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 440         propout [edi + 12],      xmm7, xmm4
 441
 442         ret
 443
 444 ENDFUNC
 445
 446 INTFUNC(mla4zc)
 447         // On entry, EDI points to the destination buffer, which also
 448         // contains an addend A to accumulate; EBX points to a packed operand
 449         // X; and EDX points to an expanded operand Y.
 450         //
 451         // On exit, we write the low 128 bits of the sum A + X Y to [EDI],
 452         // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
 453         // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 454         // general-purpose registers are preserved.
             //
             // `zc' here means there is no carry in: the incoming contents of
             // XMM4--XMM7 are ignored.
 455   endprologue
 456
             // Start the carry registers off as the addend: c_i = (a_i, 0).
             // All of A is read before any of the destination is written.
 457         movd    xmm4, [edi +  0]
 458         movd    xmm5, [edi +  4]
 459         movd    xmm6, [edi +  8]
 460         movd    xmm7, [edi + 12]
 461
             // Word 0: c3 holds a_3, so add to it (no `t').
 462         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 463         propout [edi +  0],      xmm4, xmm5
 464
             // Words 1--3: accumulate, output a word, and rotate the carry
             // registers; `t' makes the new c3 a fresh product, not a sum.
 465         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 466         propout [edi +  4],      xmm5, xmm6
 467
 468         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 469         propout [edi +  8],      xmm6, xmm7
 470
 471         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 472         propout [edi + 12],      xmm7, xmm4
 473
 474         ret
 475
 476 ENDFUNC
 477
 478 INTFUNC(mla4)
 479         // On entry, EDI points to the destination buffer, which also
 480         // contains an addend A to accumulate; EBX points to a packed operand
 481         // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
 482         // the incoming carry registers c0, c1, and c2, representing a
 483         // carry-in C; c3 is assumed to be zero.
 484         //
 485         // On exit, we write the low 128 bits of the sum A + C + X Y to
 486         // [EDI], and update the carry registers with the carry out.  The
 487         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 488         // general-purpose registers are preserved.
 489   endprologue
 490
             // Fold in A.  This sets XMM7 = (a_3, 0), which is why the first
             // `mulacc' below must add to c3 and does not pass `t'.
 491         carryadd
 492
             // Word 0: carries are (XMM4, XMM5, XMM6, XMM7).
 493         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 494         propout [edi +  0],      xmm4, xmm5
 495
             // Words 1--3: accumulate, output a word, and rotate the carry
             // registers; `t' makes the new c3 a fresh product, not a sum.
 496         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 497         propout [edi +  4],      xmm5, xmm6
 498
 499         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 500         propout [edi +  8],      xmm6, xmm7
 501
 502         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 503         propout [edi + 12],      xmm7, xmm4
 504
 505         ret
 506
 507 ENDFUNC
 508
 509 INTFUNC(mmul4)
 510         // On entry, EDI points to the destination buffer; EAX and EBX point
 511         // to the packed operands U and N; ECX and ESI point to the expanded
 512         // operands V and M; and EDX points to a place to store an expanded
 513         // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
 514         // must be 12 modulo 16, as is usual for modern x86 ABIs.
 515         //
 516         // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
 517         // of the sum U V + N Y to [EDI], leaving the remaining carry in
 518         // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
 519         // XMM7 are clobbered; the general-purpose registers are preserved.
             //
             // NOTE(review): Y occupies four words, so `mod B' above appears to
             // mean modulo 2^128, not the B = 2^32 of the theory section --
             // confirm.
             //
             // This function has no tail of its own: after the first word it
             // jumps to label 5 inside mmla4, which finishes the job, frees the
             // stack space allocated here, and returns.
             //
             // The allocation is three 16-byte slots for the carry registers.
             // The extra 12 bytes bring a stack pointer which is 12 modulo 16
             // down to a 16-byte boundary, as the `movdqa' stores to the slots
             // need.  This assumes `stalloc', which is defined outside this
             // file, simply lowers ESP by its argument.
 520         stalloc 48 + 12                 // space for the carries
 521   endprologue
 522
 523         // Calculate W = U V, and leave it in the destination.  Stash the
 524         // carry pieces for later.
             // Word 0 needs no accumulation, so build the product pieces
             // directly in the carry registers.
 525         mulcore [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
 526         propout [edi +  0],      xmm4, xmm5
 527         jmp     5f                      // join mmla4 for words 1--3 onwards
 528
 529 ENDFUNC
 530
 531 INTFUNC(mmla4)
 532         // On entry, EDI points to the destination buffer, which also
 533         // contains an addend A to accumulate; EAX and EBX point to the
 534         // packed operands U and N; ECX and ESI point to the expanded
 535         // operands V and M; and EDX points to a place to store an expanded
 536         // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
 537         // must be 12 modulo 16, as is usual for modern x86 ABIs.
 538         //
 539         // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
 540         // bits of the sum A + U V + N Y to [EDI], leaving the remaining
 541         // carry in XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2,
 542         // XMM3, and XMM7 are clobbered; the general-purpose registers are
 543         // preserved.
             //
             // NOTE(review): Y occupies four words, so `mod B' above appears to
             // mean modulo 2^128, not the B = 2^32 of the theory section --
             // confirm.
             //
             // mmul4 jumps into this function at label 5, so both must allocate
             // the same amount of stack.  The allocation is three 16-byte slots
             // for the carry registers.  The extra 12 bytes bring a stack
             // pointer which is 12 modulo 16 down to a 16-byte boundary, as the
             // `movdqa' stores to the slots need.  This assumes `stalloc',
             // which is defined outside this file, simply lowers ESP by its
             // argument.
 544         stalloc 48 + 12                 // space for the carries
 545   endprologue
 546
             // Start the carry registers off as the addend: c_i = (a_i, 0).
 547         movd    xmm4, [edi +  0]
 548         movd    xmm5, [edi +  4]
 549         movd    xmm6, [edi +  8]
 550         movd    xmm7, [edi + 12]
 551
 552         // Calculate W = A + U V, and leave it in the destination.  Stash the
 553         // carry pieces for later.
             // (When entered from mmul4 at label 5, there is no addend and
             // W = U V.)
 554         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
 555         propout [edi +  0],      xmm4, xmm5
 556
 557 5:      mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
 558         propout [edi +  4],      xmm5, xmm6
 559
 560         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
 561         propout [edi +  8],      xmm6, xmm7
 562
 563         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
 564         propout [edi + 12],      xmm7, xmm4
 565
             // Save the carry out of W; XMM4--XMM7 are needed for computing Y.
 566         movdqa  [esp +  0], xmm4
 567         movdqa  [esp + 16], xmm5
 568         movdqa  [esp + 32], xmm6
 569
 570         // Calculate Y = W M.
             // Only the low four words of the product are wanted, so each
             // successive word of W is multiplied by one fewer word of M.
 571         mulcore [edi +  0], esi, xmm4, xmm5, xmm6, xmm7
 572
 573         mulcore [edi +  4], esi, xmm0, xmm1, xmm2
 574         accum                    xmm5, xmm6, xmm7
 575
 576         mulcore [edi +  8], esi, xmm0, xmm1
 577         accum                    xmm6, xmm7
 578
 579         mulcore [edi + 12], esi, xmm0
 580         accum                    xmm7
 581
 582         // That's lots of pieces.  Now we have to assemble the answer.
             // The packed result lands in XMM4; the high part is not requested.
 583         squash  xmm4, xmm5, xmm6, xmm7,  xmm0, xmm1,  xmm4
 584
 585         // Expand it.
 586         pxor    xmm2, xmm2              // the zero register `expand' needs
 587         expand  xmm2, xmm4, xmm1
 588         movdqa  [edx +  0], xmm4        // Y expanded low
 589         movdqa  [edx + 16], xmm1        // Y expanded high
 590
 591         // Initialize the carry from the value for W we calculated earlier.
 592         movd    xmm4, [edi +  0]
 593         movd    xmm5, [edi +  4]
 594         movd    xmm6, [edi +  8]
 595         movd    xmm7, [edi + 12]
 596
 597         // Finish the calculation by adding the Montgomery product.
             // The first `mulacc' omits `t' because c3 holds the top word of W.
 598         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 599         propout [edi +  0],      xmm4, xmm5
 600
 601         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 602         propout [edi +  4],      xmm5, xmm6
 603
 604         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 605         propout [edi +  8],      xmm6, xmm7
 606
 607         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 608         propout [edi + 12],      xmm7, xmm4
 609
 610         // Add on the carry we calculated earlier.
 611         paddq   xmm4, [esp +  0]
 612         paddq   xmm5, [esp + 16]
 613         paddq   xmm6, [esp + 32]
 614
 615         // And, with that, we're done.
 616         stfree  48 + 12
 617         ret
 618
 619 ENDFUNC
 620
 621 INTFUNC(mont4)
 622         // On entry, EDI points to the destination buffer holding a packed
 623         // value W; EBX points to a packed operand N; ESI points to an
 624         // expanded operand M; and EDX points to a place to store an expanded
 625         // result Y (32 bytes, at a 16-byte boundary).
 626         //
 627         // On exit, we write Y = W M mod B to [EDX], and the low 128 bits
 628         // of the sum W + N Y to [EDI], leaving the remaining carry in
 629         // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
 630         // XMM7 are clobbered; the general-purpose registers are preserved.
             //
             // NOTE(review): Y occupies four words, so `mod B' above appears to
             // mean modulo 2^128, not the B = 2^32 of the theory section --
             // confirm.
             //
             // There is no carry in, and no stack space is used.
 631   endprologue
 632
 633         // Calculate Y = W M.
             // Only the low four words of the product are wanted, so each
             // successive word of W is multiplied by one fewer word of M.
 634         mulcore [edi +  0], esi, xmm4, xmm5, xmm6, xmm7
 635
 636         mulcore [edi +  4], esi, xmm0, xmm1, xmm2
 637         accum                    xmm5, xmm6, xmm7
 638
 639         mulcore [edi +  8], esi, xmm0, xmm1
 640         accum                    xmm6, xmm7
 641
 642         mulcore [edi + 12], esi, xmm0
 643         accum                    xmm7
 644
 645         // That's lots of pieces.  Now we have to assemble the answer.
             // The packed result lands in XMM4; the high part is not requested.
 646         squash  xmm4, xmm5, xmm6, xmm7,  xmm0, xmm1,  xmm4
 647
 648         // Expand it.
 649         pxor    xmm2, xmm2              // the zero register `expand' needs
 650         expand  xmm2, xmm4, xmm1
 651         movdqa  [edx +  0], xmm4        // Y expanded low
 652         movdqa  [edx + 16], xmm1        // Y expanded high
 653
 654         // Initialize the carry from W.
 655         movd    xmm4, [edi +  0]
 656         movd    xmm5, [edi +  4]
 657         movd    xmm6, [edi +  8]
 658         movd    xmm7, [edi + 12]
 659
 660         // Finish the calculation by adding the Montgomery product.
             // The first `mulacc' omits `t' because c3 holds the top word of W.
 661         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 662         propout [edi +  0],      xmm4, xmm5
 663
 664         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 665         propout [edi +  4],      xmm5, xmm6
 666
 667         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 668         propout [edi +  8],      xmm6, xmm7
 669
 670         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 671         propout [edi + 12],      xmm7, xmm4
 672
 673         // And, with that, we're done.
 674         ret
 675
 676 ENDFUNC
 677
 678 ///--------------------------------------------------------------------------
 679 /// Bulk multipliers.
 680
 681 FUNC(mpx_umul4_x86_sse2)
 682         // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
 683         //                         const mpw *bv, const mpw *bvl);
             //
             // Multiply the number at [av, avl) by the number at [bv, bvl),
             // writing the product at dv.  Both operands are walked 16 bytes
             // (four words) at a time, and the first block of each is processed
             // before any limit is tested, so the code relies on each operand
             // being a nonzero whole number of four-word blocks.
             // NOTE(review): confirm that callers guarantee this, and that dv
             // has room for the combined length of the operands.
             //
             // Register roles in the loops below:
             //
             //      eax             av limit (avl)
             //      ebx             cursor into av (inner loop)
             //      ecx             start of the current pass's window in dv
             //      edx             -> expanded Y on the stack
             //      esi             cursor into bv (outer loop)
             //      edi             cursor into dv (inner loop)
 684
 685         // Build a stack frame.  Arguments will be relative to EBP, as
 686         // follows.
 687         //
 688         //      ebp + 20        dv
 689         //      ebp + 24        av
 690         //      ebp + 28        avl
 691         //      ebp + 32        bv
 692         //      ebp + 36        bvl
 693         //
 694         // Locals are relative to ESP, as follows.
 695         //
 696         //      esp +  0        expanded Y (32 bytes)
 697         //      esp + 32        (top of locals)
 698         pushreg ebp
 699         pushreg ebx
 700         pushreg esi
 701         pushreg edi
 702         setfp   ebp
 703         and     esp, ~15                // align, so `movdqa' works on locals
 704         sub     esp, 32                 // room for the expanded Y
 705   endprologue
 706
 707         // Prepare for the first iteration.
 708         mov     esi, [ebp + 32]         // -> bv[0]
 709         pxor    xmm7, xmm7
 710         movdqu  xmm0, [esi]             // bv[0]
 711         mov     edi, [ebp + 20]         // -> dv[0]
 712         mov     ecx, edi                // outer loop dv cursor
 713         expand  xmm7, xmm0, xmm1
 714         mov     ebx, [ebp + 24]         // -> av[0]
 715         mov     eax, [ebp + 28]         // -> av[m] = av limit
 716         mov     edx, esp                // -> expanded Y = bv[0]
 717         movdqa  [esp + 0], xmm0         // bv[0] expanded low
 718         movdqa  [esp + 16], xmm1        // bv[0] expanded high
             // First block of the first pass: nothing to add, no carry in.
 719         call    mul4zc
 720         add     ebx, 16
 721         add     edi, 16
 722         add     ecx, 16
 723         add     esi, 16
 724         cmp     ebx, eax                // all done?
 725         jae     8f
 726
 727         .p2align 4
 728         // Continue with the first iteration.
 729 0:      call    mul4
 730         add     ebx, 16
 731         add     edi, 16
 732         cmp     ebx, eax                // all done?
 733         jb      0b
 734
 735         // Write out the leftover carry.  There can be no tail here.
 736 8:      call    carryprop
 737         cmp     esi, [ebp + 36]         // more passes to do?
 738         jae     9f
 739
 740         .p2align 4
 741         // Set up for the next pass.
 742 1:      movdqu  xmm0, [esi]             // bv[i]
 743         mov     edi, ecx                // -> dv[i]
 744         pxor    xmm7, xmm7
 745         expand  xmm7, xmm0, xmm1
 746         mov     ebx, [ebp + 24]         // -> av[0]
 747         movdqa  [esp + 0], xmm0         // bv[i] expanded low
 748         movdqa  [esp + 16], xmm1        // bv[i] expanded high
             // First block of a later pass: add to what is already in dv, but
             // there is no carry in yet.
 749         call    mla4zc
 750         add     edi, 16
 751         add     ebx, 16
 752         add     ecx, 16
 753         add     esi, 16
 754         cmp     ebx, eax                // done yet?
 755         jae     8f
 756
 757         .p2align 4
 758         // Continue...
 759 0:      call    mla4
 760         add     ebx, 16
 761         add     edi, 16
 762         cmp     ebx, eax
 763         jb      0b
 764
 765         // Finish off this pass.  There was no tail on the previous pass, and
 766         // there can be none on this pass.
 767 8:      call    carryprop
 768         cmp     esi, [ebp + 36]
 769         jb      1b
 770
 771         // All over.
 772 9:      dropfp
 773         pop     edi
 774         pop     esi
 775         pop     ebx
 776         pop     ebp
 777         ret
 778
 779 ENDFUNC
 780
 781 FUNC(mpxmont_mul4_x86_sse2)
 782         // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
 783         //                           const mpw *nv, size_t n, const mpw *mi);
             //
             // Driver for Montgomery multiplication (going by the name; the
             // arithmetic itself is done by the primitives called below, which
             // are defined elsewhere in the file).  For each 16-byte block of
             // bv it makes one pass over av and nv in lockstep, 16 bytes at a
             // time: the first pass calls `mmul4' then `dmul4', later passes
             // call `mmla4' then `dmla4', and every pass ends with
             // `carryprop' followed by storing the low word of xmm4 at
             // [edi + 16].
             //
             // Register roles during a pass:
             //
             //      eax     av cursor (U)
             //      ebx     nv cursor (X)
             //      ecx     -> expanded V on the stack
             //      edx     -> space for Y on the stack
             //      esi     -> expanded M for `mmul4'/`mmla4', then av limit
             //      edi     dv cursor (Z)
             //
             // ecx and edx are set up once before `mmul4' and are not
             // reloaded for later passes, so the code relies on the primitives
             // leaving them intact.
 784
 785         // Build a stack frame.  Arguments will be relative to EBP, as
 786         // follows.
 787         //
 788         //      ebp + 20        dv
 789         //      ebp + 24        av
 790         //      ebp + 28        bv
 791         //      ebp + 32        nv
 792         //      ebp + 36        n (nonzero multiple of 4)
 793         //      ebp + 40        mi
 794         //
 795         // Locals are relative to ESP, which is 16-byte aligned, as follows.
 796         //
 797         //      esp +   0       expanded V (32 bytes)
 798         //      esp +  32       expanded M (32 bytes)
 799         //      esp +  64       expanded Y (32 bytes)
 800         //      esp +  96       outer loop dv
 801         //      esp + 100       outer loop bv
 802         //      esp + 104       av limit (mostly in ESI)
 803         //      esp + 108       bv limit
 804         //      esp + 112       (top of locals)
 805         pushreg ebp
 806         pushreg ebx
 807         pushreg esi
 808         pushreg edi
 809         setfp   ebp
 810         and     esp, ~15
 811         sub     esp, 112
 812   endprologue
 813
 814         // Establish the expanded operands.
 815         pxor    xmm7, xmm7
 816         mov     ecx, [ebp + 28]         // -> bv
 817         mov     edx, [ebp + 40]         // -> mi
 818         movdqu  xmm0, [ecx]             // bv[0]
 819         movdqu  xmm2, [edx]             // mi
 820         expand  xmm7, xmm0, xmm1, xmm2, xmm3
 821         movdqa  [esp +  0], xmm0        // bv[0] expanded low
 822         movdqa  [esp + 16], xmm1        // bv[0] expanded high
 823         movdqa  [esp + 32], xmm2        // mi expanded low
 824         movdqa  [esp + 48], xmm3        // mi expanded high
 825
 826         // Set up the outer loop state and prepare for the first iteration.
 827         mov     edx, [ebp + 36]         // n
 828         mov     eax, [ebp + 24]         // -> U = av[0]
 829         mov     ebx, [ebp + 32]         // -> X = nv[0]
 830         mov     edi, [ebp + 20]         // -> Z = dv[0]
 831         mov     [esp + 100], ecx        // outer loop bv = bv[0]
 832         lea     ecx, [ecx + 4*edx]      // bv + 4 n bytes = bv limit
 833         lea     edx, [eax + 4*edx]      // av + 4 n bytes = av limit
 834         mov     [esp + 96], edi         // outer loop dv = dv[0]
 835         mov     [esp + 104], edx        // save av limit
 836         mov     [esp + 108], ecx        // save bv limit
 837         lea     ecx, [esp + 0]          // -> expanded V = bv[0]
 838         lea     esi, [esp + 32]         // -> expanded M = mi
 839         lea     edx, [esp + 64]         // -> space for Y
 840         call    mmul4
 841         mov     esi, [esp + 104]        // recover av limit
 842         add     edi, 16
 843         add     eax, 16
 844         add     ebx, 16
 845         cmp     eax, esi                // done already?
 846         jae     8f
 847         mov     [esp + 96], edi         // next pass starts one block on
 848
 849         .p2align 4
 850         // Complete the first inner loop.
 851 0:      call    dmul4
 852         add     edi, 16
 853         add     eax, 16
 854         add     ebx, 16
 855         cmp     eax, esi                // done yet?
 856         jb      0b
 857
 858         // Still have carries left to propagate.
 859         call    carryprop
 860         movd    [edi + 16], xmm4        // low word of xmm4, 16 bytes on
 861
 862         .p2align 4
 863         // Embark on the next iteration.  (There must be one.  If n = 1, then
 864         // we would have bailed above, to label 8.  Similarly, the subsequent
 865         // iterations can fall into the inner loop immediately.)
 866 1:      mov     eax, [esp + 100]        // -> bv[i - 1]
 867         mov     edi, [esp + 96]         // -> Z = dv[i]
 868         add     eax, 16                 // -> bv[i]
 869         pxor    xmm7, xmm7
 870         mov     [esp + 100], eax
 871         cmp     eax, [esp + 108]        // done yet?
 872         jae     9f
 873         movdqu  xmm0, [eax]             // bv[i]
 874         mov     ebx, [ebp + 32]         // -> X = nv[0]
 875         lea     esi, [esp + 32]         // -> expanded M = mi
 876         mov     eax, [ebp + 24]         // -> U = av[0]
 877         expand  xmm7, xmm0, xmm1
 878         movdqa  [esp + 0], xmm0         // bv[i] expanded low
 879         movdqa  [esp + 16], xmm1        // bv[i] expanded high
 880         call    mmla4
 881         mov     esi, [esp + 104]        // recover av limit
 882         add     edi, 16
 883         add     eax, 16
 884         add     ebx, 16
 885         mov     [esp + 96], edi         // next pass starts one block on
 886
 887         .p2align 4
 888         // Complete the next inner loop.
 889 0:      call    dmla4
 890         add     edi, 16
 891         add     eax, 16
 892         add     ebx, 16
 893         cmp     eax, esi
 894         jb      0b
 895
 896         // Still have carries left to propagate, and they overlap the
 897         // previous iteration's final tail, so read that in and add it.
 898         movd    xmm0, [edi]
 899         paddq   xmm4, xmm0
 900         call    carryprop
 901         movd    [edi + 16], xmm4
 902
 903         // Back again.
 904         jmp     1b
 905
 906         // First iteration was short.  Write out the carries and we're done.
 907         // (This could be folded into the main loop structure, but that would
 908         // penalize small numbers more.)
 909 8:      call    carryprop
 910         movd    [edi + 16], xmm4
 911
 912         // All done.
 913 9:      dropfp
 914         popreg  edi
 915         popreg  esi
 916         popreg  ebx
 917         popreg  ebp
 918         ret
 919
 920 ENDFUNC
 921
 922 FUNC(mpxmont_redc4_x86_sse2)
 923         // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
 924         //                             size_t n, const mpw *mi);
             //
             // Driver for Montgomery reduction (going by the name; the
             // arithmetic is done by the primitives called below, which are
             // defined elsewhere in the file).  The outer loop steps through
             // dv 16 bytes at a time for 4 n bytes.  Each outer iteration
             // calls `mont4' on the first block of nv, then `mla4' on each
             // remaining block of nv, and finally pushes the outstanding
             // carry through the rest of dv: first in whole 16-byte blocks up
             // to the blocks-of-4 limit, then a word at a time up to dvl.
             //
             // Register roles:
             //
             //      ebx     nv cursor (X)
             //      ecx     nv limit (set once; relied on to survive calls)
             //      edx     -> space for Y for `mont4', then dv limit
             //      esi     -> expanded M for `mont4', then dv blocks limit
             //      edi     dv cursor (Z)
             //      eax     carry word during the final propagation
 925
 926         // Build a stack frame.  Arguments will be relative to EBP, as
 927         // follows.
 928         //
 929         //      ebp + 20        dv
 930         //      ebp + 24        dvl
 931         //      ebp + 28        nv
 932         //      ebp + 32        n (nonzero multiple of 4)
 933         //      ebp + 36        mi
 934         //
 935         // Locals are relative to ESP, as follows.
 936         //
 937         //      esp +  0        outer loop dv
 938         //      esp +  4        outer dv limit
 939         //      esp +  8        blocks-of-4 dv limit
 940         //      esp + 12        expanded M (32 bytes)
 941         //      esp + 44        expanded Y (32 bytes)
 942         //      esp + 76        (top of locals)
             //
             // ESP is first rounded down to a multiple of 16 and then lowered
             // by 76, which leaves it congruent to 4 (mod 16).  Hence the
             // vector slots at esp + 12, + 28, + 44 and + 60 are 16-byte
             // aligned, as the `movdqa' stores below require.
 943         pushreg ebp
 944         pushreg ebx
 945         pushreg esi
 946         pushreg edi
 947         setfp   ebp
 948         and     esp, ~15
 949         sub     esp, 76
 950   endprologue
 951
 952         // Establish the expanded operands and the blocks-of-4 dv limit.
 953         mov     edi, [ebp + 20]         // -> Z = dv[0]
 954         pxor    xmm7, xmm7
 955         mov     eax, [ebp + 24]         // -> dv[n] = dv limit
 956         sub     eax, edi                // length of dv in bytes
 957         mov     edx, [ebp + 36]         // -> mi
 958         movdqu  xmm0, [edx]             // mi
 959         and     eax, ~15                // mask off the tail end
 960         expand  xmm7, xmm0, xmm1
 961         add     eax, edi                // find limit
 962         movdqa  [esp + 12], xmm0        // mi expanded low
 963         movdqa  [esp + 28], xmm1        // mi expanded high
 964         mov     [esp + 8], eax          // save blocks-of-4 dv limit
 965
 966         // Set up the outer loop state and prepare for the first iteration.
 967         mov     ecx, [ebp + 32]         // n
 968         mov     ebx, [ebp + 28]         // -> X = nv[0]
 969         lea     edx, [edi + 4*ecx]      // dv + 4 n bytes = outer dv limit
 970         lea     ecx, [ebx + 4*ecx]      // nv + 4 n bytes = nv limit
 971         mov     [esp + 0], edi          // outer loop dv = dv[0]
 972         mov     [esp + 4], edx          // save outer dv limit
 973         lea     esi, [esp + 12]         // -> expanded M = mi
 974         lea     edx, [esp + 44]         // -> space for Y
 975         call    mont4
 976         add     ebx, 16
 977         add     edi, 16
 978         cmp     ebx, ecx                // done already?
 979         jae     8f                      // (the nearer `8:', just below)
 980
 981         .p2align 4
 982         // Complete the first inner loop.
 983 5:      call    mla4
 984         add     ebx, 16
 985         add     edi, 16
 986         cmp     ebx, ecx                // done yet?
 987         jb      5b
 988
 989         // Still have carries left to propagate.
 990 8:      carryadd
 991         mov     esi, [esp + 8]          // -> dv blocks limit
 992         mov     edx, [ebp + 24]         // dv limit
 993         psllq   xmm7, 16
 994         pslldq  xmm7, 8
 995         paddq   xmm6, xmm7
 996         call    carryprop
 997         movd    eax, xmm4               // carry word to push onwards
 998         add     edi, 16
 999         cmp     edi, esi
1000         jae     7f
1001
1002         .p2align 4
1003         // Continue carry propagation until the end of the buffer.
1004 0:      add     [edi], eax
1005         mov     eax, 0                  // preserves flags
1006         adcd    [edi + 4], 0
1007         adcd    [edi + 8], 0
1008         adcd    [edi + 12], 0
1009         adc     eax, 0                  // carry out of this block
1010         add     edi, 16
1011         cmp     edi, esi
1012         jb      0b
1013
1014         // Deal with the tail end.  The carry out of each word must be
             // captured in EAX before the cursor is advanced: `add edi, 4'
             // rewrites CF, so doing it first would lose the carry and later
             // tail words would never see it.  (Same order as the block loop
             // above.)
             //
             // NOTE(review): the store to [edi] happens before the limit is
             // tested, so this assumes at least one word remains below the dv
             // limit here -- confirm against the callers.
1015 7:      add     [edi], eax
1016         mov     eax, 0                  // preserves flags
1017         adc     eax, 0                  // carry out of this word
1018         add     edi, 4
1019         cmp     edi, edx
1020         jb      7b
1021
1022         // All done for this iteration.  Start the next.  (This must have at
1023         // least one follow-on iteration, or we'd not have started this outer
1024         // loop.)
1025 8:      mov     edi, [esp + 0]          // -> dv[i - 1]
1026         mov     ebx, [ebp + 28]         // -> X = nv[0]
1027         lea     edx, [esp + 44]         // -> space for Y
1028         lea     esi, [esp + 12]         // -> expanded M = mi
1029         add     edi, 16                 // -> Z = dv[i]
1030         cmp     edi, [esp + 4]          // all done yet?
1031         jae     9f
1032         mov     [esp + 0], edi
1033         call    mont4
1034         add     edi, 16
1035         add     ebx, 16
1036         jmp     5b
1037
1038         // All over.
1039 9:      dropfp
1040         popreg  edi
1041         popreg  esi
1042         popreg  ebx
1043         popreg  ebp
1044         ret
1045
1046 ENDFUNC
1047
1048 ///--------------------------------------------------------------------------
1049 /// Testing and performance measurement.
1050
1051 #ifdef TEST_MUL4
1052
1053 .macro  cysetup c
             // Start a cycle measurement: read the time-stamp counter and
             // store it as a 64-bit value, low half first, at address \c.
             // Clobbers eax and edx.
1054         rdtsc
1055         mov     [\c], eax
1056         mov     [\c + 4], edx
1057 .endm
1058
1059 .macro  cystore c, v, n
             // Finish a cycle measurement started by `cysetup \c'.  Computes
             // the 64-bit difference between the current time-stamp counter
             // and the value saved at \c, decrements the counter stored at
             // \n, and writes the difference into the 8-byte slot indexed by
             // the new counter value in the array whose address is held at
             // \v.
             //
             // On exit ZF reflects the `dec': it is set iff the counter
             // reached zero (the `mov's that follow do not alter flags).
             // Clobbers eax, ebx, ecx and edx.
1060         rdtsc
1061         sub     eax, [\c]
1062         sbb     edx, [\c + 4]
1063         mov     ebx, [\v]
1064         mov     ecx, [\n]
1065         dec     ecx
1066         mov     [\n], ecx
1067         mov     [ebx + ecx*8], eax
1068         mov     [ebx + ecx*8 + 4], edx
1069 .endm
1070
1071 .macro  testprologue n
             // Common entry sequence for the test functions.  Saves the
             // callee-saved registers, sets up EBP, aligns ESP to 16 bytes and
             // reserves 112 bytes of locals, then copies the operand \n (the
             // iteration count) into the local at esp + 104.  Clobbers eax.
1072         pushreg ebp
1073         pushreg ebx
1074         pushreg esi
1075         pushreg edi
1076         setfp   ebp
1077         and     esp, ~15
1078         sub     esp, 3*32 + 4*4
1079   endprologue
1080         mov     eax, \n
1081         mov     [esp + 104], eax
1082         // vars:
1083         //      esp +   0 = v expanded
1084         //      esp +  32 = y expanded
1085         //      esp +  64 = ? expanded
1086         //      esp +  96 = cycles
1087         //      esp + 104 = count
1088 .endm
1089
1090 .macro  testepilogue
             // Common exit sequence for the test functions: drop the frame,
             // restore the registers saved by `testprologue' in reverse order
             // and return.
1091         dropfp
1092         popreg  edi
1093         popreg  esi
1094         popreg  ebx
1095         popreg  ebp
1096         ret
1097 .endm
1098
1099 .macro  testldcarry c
             // Load the three carry registers xmm4, xmm5 and xmm6 from 48
             // consecutive bytes at the address given by operand \c (no
             // alignment is assumed).  Clobbers ecx.
1100         mov     ecx, \c                 // -> c
1101         movdqu  xmm4, [ecx +  0]        // (c'_0, c''_0)
1102         movdqu  xmm5, [ecx + 16]        // (c'_1, c''_1)
1103         movdqu  xmm6, [ecx + 32]        // (c'_2, c''_2)
1104 .endm
1105
1106 .macro  testexpand v=nil, y=nil
             // Expand test operands into the stack frame.  Always zeroes xmm7.
             // If \v is given, loads 16 bytes from the address it yields, runs
             // `expand' on them and stores the two resulting registers at
             // esp + 0 and esp + 16.  If \y is given, does the same for it,
             // storing at esp + 32 and esp + 48.  An operand left as `nil' is
             // skipped.  Clobbers ecx, xmm0, xmm1 (for \v) and edx, xmm2,
             // xmm3 (for \y).
1107         pxor    xmm7, xmm7
1108   .ifnes "\v", "nil"
1109         mov     ecx, \v
1110         movdqu  xmm0, [ecx]
1111         expand  xmm7, xmm0, xmm1
1112         movdqa  [esp +  0], xmm0
1113         movdqa  [esp + 16], xmm1
1114   .endif
1115   .ifnes "\y", "nil"
1116         mov     edx, \y
1117         movdqu  xmm2, [edx]
1118         expand  xmm7, xmm2, xmm3
1119         movdqa  [esp + 32], xmm2
1120         movdqa  [esp + 48], xmm3
1121   .endif
1122 .endm
1123
1124 .macro  testtop u=nil, x=nil, mode=nil
             // Head of the timed loop.  Defines the local label `0:' that
             // `testtail' branches back to, loads the registers for the
             // primitive under test, and starts the cycle count at esp + 96.
             //
             //      ecx     -> esp + 0 (expanded v), only if \u is given
             //      ebx     operand \x (loaded unconditionally, so callers
             //              must always supply it)
             //      esi     -> esp + 32, only in `mont' mode
             //      eax     operand \u, only if \u is given
             //      edx     -> esp + 64 in `mont' mode, else -> esp + 32
             //
             // eax and edx are loaded only after `cysetup', because the
             // `rdtsc' inside it overwrites both.
1125         .p2align 4
1126 0:
1127   .ifnes "\u", "nil"
1128         lea     ecx, [esp + 0]
1129   .endif
1130         mov     ebx, \x
1131   .ifeqs "\mode", "mont"
1132         lea     esi, [esp + 32]
1133   .endif
1134         cysetup esp + 96
1135   .ifnes "\u", "nil"
1136         mov     eax, \u
1137   .endif
1138   .ifeqs "\mode", "mont"
1139         lea     edx, [esp + 64]
1140   .else
1141         lea     edx, [esp + 32]
1142   .endif
1143 .endm
1144
1145 .macro  testtail cyv
             // Foot of the timed loop.  Records the cycle count for this
             // iteration via `cystore' (cycles at esp + 96, count at
             // esp + 104, results array pointer taken from \cyv), then loops
             // back to the `0:' defined by `testtop' until the count reaches
             // zero.  The branch uses the ZF left by the `dec' inside
             // `cystore'.
1146         cystore esp + 96, \cyv, esp + 104
1147         jnz     0b
1148 .endm
1149
1150 .macro  testcarryout c
             // Store the three carry registers xmm4, xmm5 and xmm6 to 48
             // consecutive bytes at the address given by operand \c (no
             // alignment is assumed).  Clobbers ecx.
1151         mov     ecx, \c
1152         movdqu  [ecx +  0], xmm4
1153         movdqu  [ecx + 16], xmm5
1154         movdqu  [ecx + 32], xmm6
1155 .endm
1156
1157 FUNC(test_dmul4)
             // Test and timing harness for `dmul4'.  Stack arguments, as used
             // below:
             //
             //      ebp + 20        loaded into edi before the loop
             //      ebp + 24        -> 48 bytes of carry state: loaded into
             //                      xmm4--xmm6 on entry, stored back on exit
             //      ebp + 28        loaded into eax each iteration (\u)
             //      ebp + 32        loaded into ebx each iteration (\x)
             //      ebp + 36        -> 16 bytes, expanded to esp + 0 (\v)
             //      ebp + 40        -> 16 bytes, expanded to esp + 32 (\y)
             //      ebp + 44        iteration count
             //      ebp + 48        -> array receiving one 64-bit cycle count
             //                      per iteration
             //
             // The carry registers are loaded once, before the loop, and are
             // not reloaded between iterations.
1158         testprologue [ebp + 44]
1159         testldcarry [ebp + 24]
1160         testexpand [ebp + 36], [ebp + 40]
1161         mov     edi, [ebp + 20]
1162         testtop [ebp + 28], [ebp + 32]
1163         call    dmul4
1164         testtail [ebp + 48]
1165         testcarryout [ebp + 24]
1166         testepilogue
1167 ENDFUNC
1168
1169 FUNC(test_dmla4)
             // Test and timing harness for `dmla4'.  Stack arguments, as used
             // below:
             //
             //      ebp + 20        loaded into edi before the loop
             //      ebp + 24        -> 48 bytes of carry state: loaded into
             //                      xmm4--xmm6 on entry, stored back on exit
             //      ebp + 28        loaded into eax each iteration (\u)
             //      ebp + 32        loaded into ebx each iteration (\x)
             //      ebp + 36        -> 16 bytes, expanded to esp + 0 (\v)
             //      ebp + 40        -> 16 bytes, expanded to esp + 32 (\y)
             //      ebp + 44        iteration count
             //      ebp + 48        -> array receiving one 64-bit cycle count
             //                      per iteration
             //
             // The carry registers are loaded once, before the loop, and are
             // not reloaded between iterations.
1170         testprologue [ebp + 44]
1171         testldcarry [ebp + 24]
1172         testexpand [ebp + 36], [ebp + 40]
1173         mov     edi, [ebp + 20]
1174         testtop [ebp + 28], [ebp + 32]
1175         call    dmla4
1176         testtail [ebp + 48]
1177         testcarryout [ebp + 24]
1178         testepilogue
1179 ENDFUNC
1180
1181 FUNC(test_mul4)
             // Test and timing harness for `mul4'.  Stack arguments, as used
             // below:
             //
             //      ebp + 20        loaded into edi before the loop
             //      ebp + 24        -> 48 bytes of carry state: loaded into
             //                      xmm4--xmm6 on entry, stored back on exit
             //      ebp + 28        loaded into ebx each iteration (\x)
             //      ebp + 32        -> 16 bytes, expanded to esp + 32 (\y);
             //                      edx points there during the call
             //      ebp + 36        iteration count
             //      ebp + 40        -> array receiving one 64-bit cycle count
             //                      per iteration
1182         testprologue [ebp + 36]
1183         testldcarry [ebp + 24]
1184         testexpand nil, [ebp + 32]
1185         mov     edi, [ebp + 20]
1186         testtop nil, [ebp + 28]
1187         call    mul4
1188         testtail [ebp + 40]
1189         testcarryout [ebp + 24]
1190         testepilogue
1191 ENDFUNC
1192
1193 FUNC(test_mla4)
             // Test and timing harness for `mla4'.  Stack arguments, as used
             // below:
             //
             //      ebp + 20        loaded into edi before the loop
             //      ebp + 24        -> 48 bytes of carry state: loaded into
             //                      xmm4--xmm6 on entry, stored back on exit
             //      ebp + 28        loaded into ebx each iteration (\x)
             //      ebp + 32        -> 16 bytes, expanded to esp + 32 (\y);
             //                      edx points there during the call
             //      ebp + 36        iteration count
             //      ebp + 40        -> array receiving one 64-bit cycle count
             //                      per iteration
1194         testprologue [ebp + 36]
1195         testldcarry [ebp + 24]
1196         testexpand nil, [ebp + 32]
1197         mov     edi, [ebp + 20]
1198         testtop nil, [ebp + 28]
1199         call    mla4
1200         testtail [ebp + 40]
1201         testcarryout [ebp + 24]
1202         testepilogue
1203 ENDFUNC
1204
1205 FUNC(test_mmul4)
             // Test and timing harness for `mmul4'.  Stack arguments, as used
             // below:
             //
             //      ebp + 20        loaded into edi before the loop
             //      ebp + 24        -> 48 bytes receiving xmm4--xmm6 on exit
             //                      (no carry state is loaded on entry)
             //      ebp + 28        -> 32 bytes receiving a copy of the
             //                      frame slot at esp + 64 on exit
             //      ebp + 32        loaded into eax each iteration (\u)
             //      ebp + 36        loaded into ebx each iteration (\x)
             //      ebp + 40        -> 16 bytes, expanded to esp + 0 (\v)
             //      ebp + 44        -> 16 bytes, expanded to esp + 32 (\y)
             //      ebp + 48        iteration count
             //      ebp + 52        -> array receiving one 64-bit cycle count
             //                      per iteration
             //
             // In `mont' mode esi points at esp + 32 and edx at esp + 64
             // during the call.
1206         testprologue [ebp + 48]
1207         testexpand [ebp + 40], [ebp + 44]
1208         mov     edi, [ebp + 20]
1209         testtop [ebp + 32], [ebp + 36], mont
1210         call    mmul4
1211         testtail [ebp + 52]
1212         mov     edi, [ebp + 28]         // -> caller's 32-byte output
1213         movdqa  xmm0, [esp + 64]        // frame slot edx pointed at, low
1214         movdqa  xmm1, [esp + 80]        // ... and high
1215         movdqu  [edi], xmm0
1216         movdqu  [edi + 16], xmm1
1217         testcarryout [ebp + 24]
1218         testepilogue
1219 ENDFUNC
1220
1221 FUNC(test_mmla4)
             // Test and timing harness for `mmla4'.  Stack arguments, as used
             // below:
             //
             //      ebp + 20        loaded into edi before the loop
             //      ebp + 24        -> 48 bytes receiving xmm4--xmm6 on exit
             //                      (no carry state is loaded on entry)
             //      ebp + 28        -> 32 bytes receiving a copy of the
             //                      frame slot at esp + 64 on exit
             //      ebp + 32        loaded into eax each iteration (\u)
             //      ebp + 36        loaded into ebx each iteration (\x)
             //      ebp + 40        -> 16 bytes, expanded to esp + 0 (\v)
             //      ebp + 44        -> 16 bytes, expanded to esp + 32 (\y)
             //      ebp + 48        iteration count
             //      ebp + 52        -> array receiving one 64-bit cycle count
             //                      per iteration
             //
             // In `mont' mode esi points at esp + 32 and edx at esp + 64
             // during the call.
1222         testprologue [ebp + 48]
1223         testexpand [ebp + 40], [ebp + 44]
1224         mov     edi, [ebp + 20]
1225         testtop [ebp + 32], [ebp + 36], mont
1226         call    mmla4
1227         testtail [ebp + 52]
1228         mov     edi, [ebp + 28]         // -> caller's 32-byte output
1229         movdqa  xmm0, [esp + 64]        // frame slot edx pointed at, low
1230         movdqa  xmm1, [esp + 80]        // ... and high
1231         movdqu  [edi], xmm0
1232         movdqu  [edi + 16], xmm1
1233         testcarryout [ebp + 24]
1234         testepilogue
1235 ENDFUNC
1236
1237 FUNC(test_mont4)
             // Test and timing harness for `mont4'.  Stack arguments, as used
             // below:
             //
             //      ebp + 20        loaded into edi before the loop
             //      ebp + 24        -> 48 bytes receiving xmm4--xmm6 on exit
             //                      (no carry state is loaded on entry)
             //      ebp + 28        -> 32 bytes receiving a copy of the
             //                      frame slot at esp + 64 on exit
             //      ebp + 32        loaded into ebx each iteration (\x)
             //      ebp + 36        -> 16 bytes, expanded to esp + 32 (\y)
             //      ebp + 40        iteration count
             //      ebp + 44        -> array receiving one 64-bit cycle count
             //                      per iteration
             //
             // In `mont' mode esi points at esp + 32 and edx at esp + 64
             // during the call; eax and ecx are not set up.
1238         testprologue [ebp + 40]
1239         testexpand nil, [ebp + 36]
1240         mov     edi, [ebp + 20]
1241         testtop nil, [ebp + 32], mont
1242         call    mont4
1243         testtail [ebp + 44]
1244         mov     edi, [ebp + 28]         // -> caller's 32-byte output
1245         movdqa  xmm0, [esp + 64]        // frame slot edx pointed at, low
1246         movdqa  xmm1, [esp + 80]        // ... and high
1247         movdqu  [edi], xmm0
1248         movdqu  [edi + 16], xmm1
1249         testcarryout [ebp + 24]
1250         testepilogue
1251 ENDFUNC
1252
1253 #endif
1254
1255 ///----- That's all, folks --------------------------------------------------