1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
3 /// Large SIMD-based multiplications
5 /// (c) 2016 Straylight/Edgeware
7 ///----- Licensing notice ---------------------------------------------------
9 /// This file is part of Catacomb.
11 /// Catacomb is free software; you can redistribute it and/or modify
12 /// it under the terms of the GNU Library General Public License as
13 /// published by the Free Software Foundation; either version 2 of the
14 /// License, or (at your option) any later version.
16 /// Catacomb is distributed in the hope that it will be useful,
17 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
18 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 /// GNU Library General Public License for more details.
21 /// You should have received a copy of the GNU Library General Public
22 /// License along with Catacomb; if not, write to the Free
23 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
24 /// MA 02111-1307, USA.
26 ///--------------------------------------------------------------------------
30 #include "asm-common.h"
36 ///--------------------------------------------------------------------------
39 /// We define a number of primitive fixed-size multipliers from which we can
40 /// construct more general variable-length multipliers.
42 /// The basic trick is the same throughout. In an operand-scanning
43 /// multiplication, the inner multiplication loop multiplies a
44 /// multiple-precision operand by a single precision factor, and adds the
45 /// result, appropriately shifted, to the result. A `finely integrated
46 /// operand scanning' implementation of Montgomery multiplication also adds
47 /// the product of a single-precision `Montgomery factor' and the modulus,
48 /// calculated in the same pass. The more common `coarsely integrated
49 /// operand scanning' alternates main multiplication and Montgomery passes,
50 /// which requires additional carry propagation.
52 /// Throughout both plain-multiplication and Montgomery stages, then, one of
53 /// the factors remains constant throughout the operation, so we can afford
54 /// to take a little time to preprocess it. The transformation we perform is
55 /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
56 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
57 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
58 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
59 /// operands, as follows.
62 /// 0 v'_0 v'_1 v''_0 v''_1
63 /// 16 v'_2 v'_3 v''_2 v''_3
65 /// A `pmuludq' instruction ignores the odd positions in its operands; thus,
66 /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
67 /// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
68 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
69 /// results in 64-bit fields. The sixteen bits of headroom allows us to add
70 /// many products together before we must deal with carrying; it also allows
71 /// for some calculations to be performed on the above expanded form.
73 /// On 32-bit x86, we are register starved: the expanded operands are kept in
74 /// memory, typically in warm L1 cache.
76 /// We maintain four `carry' registers accumulating intermediate results.
77 /// The registers' precise roles rotate during the computation; we name them
78 /// `c0', `c1', `c2', and `c3'. Each carry register holds two 64-bit halves:
79 /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
80 /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
81 /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
82 /// `pmuludq' instruction acting on a scalar operand (broadcast across all
83 /// lanes of its vector) and an operand in the expanded form above produces a
84 /// result which can be added directly to the appropriate carry register.
85 /// Following a pass of four multiplications, we perform some limited carry
86 /// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
87 /// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
88 /// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
89 /// zeroed and becomes c3.
91 ///--------------------------------------------------------------------------
92 /// Macro definitions.
// NOTE(review): this view of `mulcore' is incomplete -- the `.ifnes'
// guards implied by the nil-able D1--D3 arguments and the closing
// `.endm' appear to be elided; confirm against the complete source.
// The two writes to \d2 below presumably belong to different
// conditional branches (memory-operand vs register-operand multiply).
94 .macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil
95 // Load a word r_i from R, multiply by the expanded operand [S], and
96 // leave the pieces of the product in registers D0, D1, D2, D3.
97 movd \d0, \r // (r_i, 0; 0, 0)
99 movdqa \d1, [\s] // (s'_0, s'_1; s''_0, s''_1)
102 movdqa \d3, [\s + 16] // (s'_2, s'_3; s''_2, s''_3)
104 pshufd \d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?)
106 psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0)
110 movdqa \d2, \d3 // another copy of (s'_2, s'_3; ...)
112 movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?)
116 psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0)
119 pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1)
122 pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3)
126 pmuludq \d2, \d0 // (r_i s'_2; r_i s''_2)
128 pmuludq \d2, [\s + 16]
131 pmuludq \d0, [\s] // (r_i s'_0; r_i s''_0)
134 .macro accum c0, c1=nil, c2=nil, c3=nil
135 // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
136 // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
137 // updating that register.
// NOTE(review): the body of this macro (presumably `paddq' of
// XMM0--XMM3 into C0--C3, under `.ifnes' nil-guards) and its `.endm'
// are elided in this view -- confirm against the complete source.
150 .macro mulacc r, s, c0, c1, c2, c3, z3p=nil
151 // Load a word r_i from R, multiply by the expanded operand [S],
152 // and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t'
153 // then C3 notionally contains zero, but needs clearing; in practice,
154 // we store the product directly rather than attempting to add. On
155 // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
// NOTE(review): the two `mulcore' invocations below are presumably
// alternative branches of an elided `.ifeqs "\z3p", "t"' conditional
// (product stored into \c3 directly vs. accumulated via XMM3); the
// conditional directives and `.endm' are not visible here -- confirm.
158 mulcore \r, \s, xmm0, xmm1, xmm2, \c3
161 mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
162 accum \c0, \c1, \c2, \c3
166 .macro propout d, c, cc=nil
167 // Calculate an output word from C, and store it in D; propagate
168 // carries out from C to CC in preparation for a rotation of the
169 // carry registers. On completion, XMM3 is clobbered. If CC is
170 // `nil', then the contribution which would have been added to it is
// NOTE(review): the header comment is cut off mid-sentence and the
// store of the output word to D is not visible -- lines appear to be
// elided from this view; confirm against the complete source.
172 pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
173 psrldq xmm3, 12 // (t, 0; 0, 0) = (t, 0)
174 pslldq xmm3, 2 // (t b; 0)
175 paddq \c, xmm3 // (c' + t b; c'')
177 psrlq \c, 32 // floor(c/B)
179 paddq \cc, \c // propagate up
183 .macro endprop d, c, t
184 // On entry, C contains a carry register. On exit, the low 32 bits
185 // of the value represented in C are written to D, and the remaining
186 // bits are left at the bottom of T.
// NOTE(review): the store of the low word to D and the closing `.endm'
// are not visible in this view -- confirm against the complete source.
188 psllq \t, 16 // (?; c'' b)
189 pslldq \c, 8 // (0; c')
190 paddq \t, \c // (?; c' + c'' b)
191 psrldq \t, 8 // (c' + c'' b; 0) = (c; 0)
193 psrldq \t, 4 // (floor(c/B); 0)
196 .macro expand z, a, b, c=nil, d=nil
197 // On entry, A and C hold packed 128-bit values, and Z is zero. On
198 // exit, A:B and C:D together hold the same values in expanded
199 // form. If C is `nil', then only expand A to A:B.
// NOTE(review): the `.ifnes "\c", "nil"' guards around the C/D lines
// and the `.endm' appear to be elided from this view -- confirm.
200 movdqa \b, \a // (a_0, a_1; a_2, a_3)
202 movdqa \d, \c // (c_0, c_1; c_2, c_3)
204 punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1)
205 punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3)
207 punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
208 punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
210 pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
211 pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
213 pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
214 pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
218 .macro squash c0, c1, c2, c3, t, u, lo, hi=nil
219 // On entry, C0, C1, C2, C3 are carry registers representing a value
220 // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
221 // C3, T, and U are clobbered; and the high bits of Y are stored in
222 // HI, if this is not `nil'.
// NOTE(review): several lines of this macro appear elided in this view
// (e.g. the initial copies of C0/C1 into T/U, and shifts preparing the
// low parts before the `psrlq ..., 32' pair below) -- the comments on
// the surviving lines describe the original intent; confirm against
// the complete source before editing.
224 // The first step is to eliminate the `double-prime' pieces -- i.e.,
225 // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
226 // them into the 32-bit-aligned pieces above and below. But before
227 // we can do that, we must gather them together.
230 punpcklqdq \t, \c2 // (y'_0; y'_2)
231 punpckhqdq \c0, \c2 // (y''_0; y''_2)
232 punpcklqdq \u, \c3 // (y'_1; y'_3)
233 punpckhqdq \c1, \c3 // (y''_1; y''_3)
235 // Now split the double-prime pieces. The high (up to) 48 bits will
236 // go up; the low 16 bits go down.
241 psrlq \c0, 16 // high parts of (y''_0; y''_2)
242 psrlq \c1, 16 // high parts of (y''_1; y''_3)
243 psrlq \c2, 32 // low parts of (y''_0; y''_2)
244 psrlq \c3, 32 // low parts of (y''_1; y''_3)
248 pslldq \c1, 8 // high part of (0; y''_1)
250 paddq \t, \c2 // propagate down
252 paddq \t, \c1 // and up: (y_0; y_2)
253 paddq \u, \c0 // (y_1; y_3)
255 psrldq \hi, 8 // high part of (y''_3; 0)
258 // Finally extract the answer. This complicated dance is better than
259 // storing to memory and loading, because the piecemeal stores
260 // inhibit store forwarding.
261 movdqa \c3, \t // (y_0; ?)
262 movdqa \lo, \t // (y^*_0, ?; ?, ?)
263 psrldq \t, 8 // (y_2; 0)
264 psrlq \c3, 32 // (floor(y_0/B); ?)
265 paddq \c3, \u // (y_1 + floor(y_0/B); ?)
266 movdqa \c1, \c3 // (y^*_1, ?; ?, ?)
267 psrldq \u, 8 // (y_3; 0)
268 psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2); ?)
269 paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2); ?)
270 punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?)
271 psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
272 paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
277 punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?)
279 psrlq \t, 32 // very high bits of y
281 punpcklqdq \hi, \u // carry up
283 punpckldq \lo, \c1 // y mod B^4
// NOTE(review): the `.macro' header line is elided from this view --
// from the comment this is presumably the carry-add helper macro
// (upstream name `carryadd'); confirm against the complete source.
// A shift of XMM7 and its accumulation into XMM6 (giving the
// `c''_2 + a_3 b' noted below) also appear to be elided.
287 // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
288 // hold the incoming carry registers c0, c1, and c2 representing a
291 // On exit, the carry registers, including XMM7, are updated to hold
292 // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
293 // registers are preserved.
294 movd xmm0, [edi + 0] // (a_0; 0)
295 movd xmm1, [edi + 4] // (a_1; 0)
296 movd xmm2, [edi + 8] // (a_2; 0)
297 movd xmm7, [edi + 12] // (a_3; 0)
299 paddq xmm4, xmm0 // (c'_0 + a_0; c''_0)
300 paddq xmm5, xmm1 // (c'_1 + a_1; c''_1)
301 paddq xmm6, xmm2 // (c'_2 + a_2; c''_2 + a_3 b)
304 ///--------------------------------------------------------------------------
305 /// Primitive multipliers and related utilities.
// NOTE(review): the function label (presumably `INTFUNC(carryprop)'),
// `endprologue', and the trailing `ret'/`ENDFUNC' are elided from this
// view -- confirm against the complete source.
308 // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
309 // form. Store the low 128 bits of the represented carry to [EDI] as
310 // a packed 128-bit value, and leave the remaining 16 bits in the low
311 // 32 bits of XMM4. On exit, XMM3, XMM5 and XMM6 are clobbered.
314 propout [edi + 0], xmm4, xmm5
315 propout [edi + 4], xmm5, xmm6
316 propout [edi + 8], xmm6, nil
317 endprop [edi + 12], xmm6, xmm4
// NOTE(review): the function label (presumably `INTFUNC(dmul4)': dual
// four-word multiply-accumulate, C + U V + X Y) and its prologue/
// epilogue lines are elided from this view -- confirm upstream.
// Each word of output follows the same three-step rhythm: multiply by
// V (with `t' marking the notionally-zero rotating carry register),
// multiply by Y, then emit one word and rotate the carries.
323 // On entry, EDI points to the destination buffer; EAX and EBX point
324 // to the packed operands U and X; ECX and EDX point to the expanded
325 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
326 // registers c0, c1, and c2; c3 is assumed to be zero.
328 // On exit, we write the low 128 bits of the sum C + U V + X Y to
329 // [EDI], and update the carry registers with the carry out. The
330 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
331 // general-purpose registers are preserved.
334 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
335 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
336 propout [edi + 0], xmm4, xmm5
338 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
339 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
340 propout [edi + 4], xmm5, xmm6
342 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
343 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
344 propout [edi + 8], xmm6, xmm7
346 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
347 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
348 propout [edi + 12], xmm7, xmm4
// NOTE(review): the function label (presumably `INTFUNC(dmla4)') is
// elided, as is the carry-add of the addend A into the carry registers
// before the first `mulacc' -- which is why, unlike the sibling
// routine above, the first `mulacc' here has no `t' flag (XMM7 already
// holds live data).  Confirm against the complete source.
355 // On entry, EDI points to the destination buffer, which also
356 // contains an addend A to accumulate; EAX and EBX point to the
357 // packed operands U and X; ECX and EDX point to the expanded
358 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
359 // registers c0, c1, and c2 representing a carry-in C; c3 is assumed
362 // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
363 // [EDI], and update the carry registers with the carry out. The
364 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
365 // general-purpose registers are preserved.
370 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
371 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
372 propout [edi + 0], xmm4, xmm5
374 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
375 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
376 propout [edi + 4], xmm5, xmm6
378 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
379 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
380 propout [edi + 8], xmm6, xmm7
382 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
383 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
384 propout [edi + 12], xmm7, xmm4
// NOTE(review): the function label (presumably `INTFUNC(mul4zc)':
// four-word multiply with zero carry-in) and prologue/epilogue lines
// are elided from this view -- confirm against the complete source.
// `mulcore' (not `mulacc') starts the pipeline since there is no
// incoming carry to accumulate.
391 // On entry, EDI points to the destination buffer; EBX points to a
392 // packed operand X; and EDX points to an expanded operand Y.
394 // On exit, we write the low 128 bits of the product X Y to [EDI],
395 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
396 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
397 // general-purpose registers are preserved.
400 mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
401 propout [edi + 0], xmm4, xmm5
403 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
404 propout [edi + 4], xmm5, xmm6
406 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
407 propout [edi + 8], xmm6, xmm7
409 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
410 propout [edi + 12], xmm7, xmm4
// NOTE(review): the function label (presumably `INTFUNC(mul4)') and
// prologue/epilogue lines are elided from this view -- confirm.
417 // On entry, EDI points to the destination buffer; EBX points to a
418 // packed operand X; EDX points to an expanded operand Y; and XMM4,
419 // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
420 // representing a carry-in C; c3 is assumed to be zero.
422 // On exit, we write the low 128 bits of the sum C + X Y to [EDI],
423 // and update the carry registers with the carry out. The registers
424 // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
425 // general-purpose registers are preserved.
428 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
429 propout [edi + 0], xmm4, xmm5
431 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
432 propout [edi + 4], xmm5, xmm6
434 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
435 propout [edi + 8], xmm6, xmm7
437 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
438 propout [edi + 12], xmm7, xmm4
// NOTE(review): the function label (presumably `INTFUNC(mla4zc)') is
// elided, as are the `movd' loads of A's first three words into
// XMM4--XMM6 that must precede the visible load of [edi + 12] into
// XMM7 -- which is why the first `mulacc' carries no `t' flag.
// Confirm against the complete source.
445 // On entry, EDI points to the destination buffer, which also
446 // contains an addend A to accumulate; EBX points to a packed operand
447 // X; and EDX points to an expanded operand Y.
449 // On exit, we write the low 128 bits of the sum A + X Y to [EDI],
450 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
451 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
452 // general-purpose registers are preserved.
458 movd xmm7, [edi + 12]
460 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
461 propout [edi + 0], xmm4, xmm5
463 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
464 propout [edi + 4], xmm5, xmm6
466 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
467 propout [edi + 8], xmm6, xmm7
469 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
470 propout [edi + 12], xmm7, xmm4
// NOTE(review): the function label (presumably `INTFUNC(mla4)') and
// the carry-add of the addend A into the carry registers (loading
// XMM7, hence no `t' flag on the first `mulacc') appear to be elided
// from this view -- confirm against the complete source.
477 // On entry, EDI points to the destination buffer, which also
478 // contains an addend A to accumulate; EBX points to a packed operand
479 // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
480 // the incoming carry registers c0, c1, and c2, representing a
481 // carry-in C; c3 is assumed to be zero.
483 // On exit, we write the low 128 bits of the sum A + C + X Y to
484 // [EDI], and update the carry registers with the carry out. The
485 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
486 // general-purpose registers are preserved.
491 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
492 propout [edi + 0], xmm4, xmm5
494 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
495 propout [edi + 4], xmm5, xmm6
497 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
498 propout [edi + 8], xmm6, xmm7
500 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
501 propout [edi + 12], xmm7, xmm4
// NOTE(review): the function label (presumably `INTFUNC(mmul4)') is
// elided, and only the first word of the U V pass is visible here --
// the remainder of the routine (upstream it jumps into the shared tail
// of the following routine) is not shown in this view; confirm against
// the complete source.
508 // On entry, EDI points to the destination buffer; EAX and EBX point
509 // to the packed operands U and N; ECX and ESI point to the expanded
510 // operands V and M; and EDX points to a place to store an expanded
511 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
512 // must be 12 modulo 16, as is usual for modern x86 ABIs.
514 // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
515 // of the sum U V + N Y to [EDI], leaving the remaining carry in
516 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
517 // XMM7 are clobbered; the general-purpose registers are preserved.
518 stalloc 48 + 12 // space for the carries
521 // Calculate W = U V, and leave it in the destination. Stash the
522 // carry pieces for later.
523 mulcore [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
524 propout [edi + 0], xmm4, xmm5
// NOTE(review): the function label (presumably `INTFUNC(mmla4)') is
// elided, as are the loads of XMM4--XMM6 before the visible loads of
// [edi + 12] into XMM7, the `stfree'/`ret' epilogue, and assorted
// interior lines -- confirm against the complete source.  The `5:'
// label below is presumably the shared tail entered by the preceding
// routine.
530 // On entry, EDI points to the destination buffer, which also
531 // contains an addend A to accumulate; EAX and EBX point to the
532 // packed operands U and N; ECX and ESI point to the expanded
533 // operands V and M; and EDX points to a place to store an expanded
534 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
535 // must be 12 modulo 16, as is usual for modern x86 ABIs.
537 // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
538 // bits of the sum A + U V + N Y to [EDI], leaving the remaining
539 // carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
540 // XMM3, and XMM7 are clobbered; the general-purpose registers are
542 stalloc 48 + 12 // space for the carries
548 movd xmm7, [edi + 12]
550 // Calculate W = U V, and leave it in the destination. Stash the
551 // carry pieces for later.
552 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
553 propout [edi + 0], xmm4, xmm5
555 5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
556 propout [edi + 4], xmm5, xmm6
558 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
559 propout [edi + 8], xmm6, xmm7
561 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
562 propout [edi + 12], xmm7, xmm4
564 movdqa [esp + 0], xmm4 // stash carry c0 for after the M pass
565 movdqa [esp + 16], xmm5 // ... c1 ...
566 movdqa [esp + 32], xmm6 // ... c2 ...
568 // Calculate Y = W M.
569 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
571 mulcore [edi + 4], esi, xmm0, xmm1, xmm2
572 accum xmm5, xmm6, xmm7
574 mulcore [edi + 8], esi, xmm0, xmm1
577 mulcore [edi + 12], esi, xmm0
580 // That's lots of pieces. Now we have to assemble the answer.
581 squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4
585 expand xmm2, xmm4, xmm1
586 movdqa [edx + 0], xmm4 // write out expanded Y, low half
587 movdqa [edx + 16], xmm1 // ... and high half
589 // Initialize the carry from the value for W we calculated earlier.
593 movd xmm7, [edi + 12]
595 // Finish the calculation by adding the Montgomery product.
596 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
597 propout [edi + 0], xmm4, xmm5
599 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
600 propout [edi + 4], xmm5, xmm6
602 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
603 propout [edi + 8], xmm6, xmm7
605 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
606 propout [edi + 12], xmm7, xmm4
608 // Add on the carry we calculated earlier.
609 paddq xmm4, [esp + 0]
610 paddq xmm5, [esp + 16]
611 paddq xmm6, [esp + 32]
613 // And, with that, we're done.
// NOTE(review): the function label (presumably `INTFUNC(mont4)':
// Montgomery reduction step, no multiplication pass) is elided, as are
// the loads of XMM4--XMM6 before the visible load of [edi + 12], and
// the `ret'/`ENDFUNC' epilogue -- confirm against the complete source.
620 // On entry, EDI points to the destination buffer holding a packed
621 // value W; EBX points to a packed operand N; ESI points to an
622 // expanded operand M; and EDX points to a place to store an expanded
623 // result Y (32 bytes, at a 16-byte boundary).
625 // On exit, we write Y = W M mod B to [EDX], and the low 128 bits
626 // of the sum W + N Y to [EDI], leaving the remaining carry in
627 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
628 // XMM7 are clobbered; the general-purpose registers are preserved.
631 // Calculate Y = W M.
632 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
634 mulcore [edi + 4], esi, xmm0, xmm1, xmm2
635 accum xmm5, xmm6, xmm7
637 mulcore [edi + 8], esi, xmm0, xmm1
640 mulcore [edi + 12], esi, xmm0
643 // That's lots of pieces. Now we have to assemble the answer.
644 squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4
648 expand xmm2, xmm4, xmm1
649 movdqa [edx + 0], xmm4 // write out expanded Y, low half
650 movdqa [edx + 16], xmm1 // ... and high half
652 // Initialize the carry from W.
656 movd xmm7, [edi + 12]
658 // Finish the calculation by adding the Montgomery product.
659 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
660 propout [edi + 0], xmm4, xmm5
662 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
663 propout [edi + 4], xmm5, xmm6
665 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
666 propout [edi + 8], xmm6, xmm7
668 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
669 propout [edi + 12], xmm7, xmm4
671 // And, with that, we're done.
676 ///--------------------------------------------------------------------------
677 /// Bulk multipliers.
// NOTE(review): large portions of these two functions are elided from
// this view -- the AVX entry's body (presumably `vzeroupper' plus fall
// through), the stack-frame setup, the inner-loop `call' sites,
// conditional branches, and the epilogue.  Only the data-flow skeleton
// survives; confirm against the complete source before changing.
679 FUNC(mpx_umul4_x86_avx)
683 // and drop through...
687 FUNC(mpx_umul4_x86_sse2)
688 // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
689 // const mpw *bv, const mpw *bvl);
691 // Build a stack frame. Arguments will be relative to EBP, as
700 // Locals are relative to ESP, as follows.
702 // esp + 0 expanded Y (32 bytes)
703 // esp + 32 (top of locals)
713 // Prepare for the first iteration.
714 mov esi, [ebp + 32] // -> bv[0]
716 movdqu xmm0, [esi] // bv[0]
717 mov edi, [ebp + 20] // -> dv[0]
718 mov ecx, edi // outer loop dv cursor
719 expand xmm7, xmm0, xmm1
720 mov ebx, [ebp + 24] // -> av[0]
721 mov eax, [ebp + 28] // -> av[m] = av limit
722 mov edx, esp // -> expanded Y = bv[0]
723 movdqa [esp + 0], xmm0 // bv[0] expanded low
724 movdqa [esp + 16], xmm1 // bv[0] expanded high
730 cmp ebx, eax // all done?
734 // Continue with the first iteration.
738 cmp ebx, eax // all done?
741 // Write out the leftover carry. There can be no tail here.
743 cmp esi, [ebp + 36] // more passes to do?
747 // Set up for the next pass.
748 1: movdqu xmm0, [esi] // bv[i]
749 mov edi, ecx // -> dv[i]
751 expand xmm7, xmm0, xmm1
752 mov ebx, [ebp + 24] // -> av[0]
753 movdqa [esp + 0], xmm0 // bv[i] expanded low
754 movdqa [esp + 16], xmm1 // bv[i] expanded high
760 cmp ebx, eax // done yet?
771 // Finish off this pass. There was no tail on the previous pass, and
772 // there can be none on this pass.
// NOTE(review): large portions of these two functions are elided from
// this view -- the AVX entry's body, the stack-frame setup, the stores
// that initialize the `outer loop'/`limit' locals read back below, the
// inner-loop `call' sites, conditional branches, labels, and the
// epilogue.  Confirm against the complete source before changing.
787 FUNC(mpxmont_mul4_x86_avx)
791 // and drop through...
795 FUNC(mpxmont_mul4_x86_sse2)
796 // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
797 // const mpw *nv, size_t n, const mpw *mi);
799 // Build a stack frame. Arguments will be relative to EBP, as
806 // ebp + 36 n (nonzero multiple of 4)
809 // Locals are relative to ESP, which is 16-byte aligned, as follows.
811 // esp + 0 expanded V (32 bytes)
812 // esp + 32 expanded M (32 bytes)
813 // esp + 64 expanded Y (32 bytes)
814 // esp + 96 outer loop dv
815 // esp + 100 outer loop bv
816 // esp + 104 av limit (mostly in ESI)
817 // esp + 108 bv limit
818 // esp + 112 (top of locals)
828 // Establish the expanded operands.
830 mov ecx, [ebp + 28] // -> bv
831 mov edx, [ebp + 40] // -> mi
832 movdqu xmm0, [ecx] // bv[0]
833 movdqu xmm2, [edx] // mi
834 expand xmm7, xmm0, xmm1, xmm2, xmm3
835 movdqa [esp + 0], xmm0 // bv[0] expanded low
836 movdqa [esp + 16], xmm1 // bv[0] expanded high
837 movdqa [esp + 32], xmm2 // mi expanded low
838 movdqa [esp + 48], xmm3 // mi expanded high
840 // Set up the outer loop state and prepare for the first iteration.
841 mov edx, [ebp + 36] // n
842 mov eax, [ebp + 24] // -> U = av[0]
843 mov ebx, [ebp + 32] // -> X = nv[0]
844 mov edi, [ebp + 20] // -> Z = dv[0]
846 lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit
847 lea edx, [eax + 4*edx] // -> av[n/4] = av limit
851 lea ecx, [esp + 0] // -> expanded V = bv[0]
852 lea esi, [esp + 32] // -> expanded M = mi
853 lea edx, [esp + 64] // -> space for Y
855 mov esi, [esp + 104] // recover av limit
859 cmp eax, esi // done already?
864 // Complete the first inner loop.
869 cmp eax, esi // done yet?
872 // Still have carries left to propagate.
874 movd [edi + 16], xmm4
877 // Embark on the next iteration. (There must be one. If n = 1, then
878 // we would have bailed above, to label 8. Similarly, the subsequent
879 // iterations can fall into the inner loop immediately.)
880 1: mov eax, [esp + 100] // -> bv[i - 1]
881 mov edi, [esp + 96] // -> Z = dv[i]
882 add eax, 16 // -> bv[i]
885 cmp eax, [esp + 108] // done yet?
887 movdqu xmm0, [eax] // bv[i]
888 mov ebx, [ebp + 32] // -> X = nv[0]
889 lea esi, [esp + 32] // -> expanded M = mi
890 mov eax, [ebp + 24] // -> U = av[0]
891 expand xmm7, xmm0, xmm1
892 movdqa [esp + 0], xmm0 // bv[i] expanded low
893 movdqa [esp + 16], xmm1 // bv[i] expanded high
895 mov esi, [esp + 104] // recover av limit
902 // Complete the next inner loop.
910 // Still have carries left to propagate, and they overlap the
911 // previous iteration's final tail, so read that in and add it.
915 movd [edi + 16], xmm4
920 // First iteration was short. Write out the carries and we're done.
921 // (This could be folded into the main loop structure, but that would
922 // penalize small numbers more.)
924 movd [edi + 16], xmm4
// NOTE(review): large portions of these two functions are elided from
// this view -- the AVX entry's body, the stack-frame setup, the stores
// initializing the `outer loop'/`limit' locals read back below, the
// inner-loop `call' sites, the carry-propagation loop bodies around
// the `mov eax, 0' lines, labels, and the epilogue.  Confirm against
// the complete source before changing.
936 FUNC(mpxmont_redc4_x86_avx)
940 // and drop through...
944 FUNC(mpxmont_redc4_x86_sse2)
945 // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
946 // size_t n, const mpw *mi);
948 // Build a stack frame. Arguments will be relative to EBP, as
954 // ebp + 32 n (nonzero multiple of 4)
957 // Locals are relative to ESP, as follows.
959 // esp + 0 outer loop dv
960 // esp + 4 outer dv limit
961 // esp + 8 blocks-of-4 dv limit
962 // esp + 12 expanded M (32 bytes)
963 // esp + 44 expanded Y (32 bytes)
964 // esp + 76 (top of locals)
974 // Establish the expanded operands and the blocks-of-4 dv limit.
975 mov edi, [ebp + 20] // -> Z = dv[0]
977 mov eax, [ebp + 24] // -> dv[n] = dv limit
978 sub eax, edi // length of dv in bytes
979 mov edx, [ebp + 36] // -> mi
980 movdqu xmm0, [edx] // mi
981 and eax, ~15 // mask off the tail end
982 expand xmm7, xmm0, xmm1
983 add eax, edi // find limit
984 movdqa [esp + 12], xmm0 // mi expanded low
985 movdqa [esp + 28], xmm1 // mi expanded high
988 // Set up the outer loop state and prepare for the first iteration.
989 mov ecx, [ebp + 32] // n
990 mov ebx, [ebp + 28] // -> X = nv[0]
991 lea edx, [edi + 4*ecx] // -> dv[n/4] = outer dv limit
992 lea ecx, [ebx + 4*ecx] // -> nv[n/4] = nv limit
995 lea esi, [esp + 12] // -> expanded M = mi
996 lea edx, [esp + 44] // -> space for Y
1000 cmp ebx, ecx // done already?
1004 // Complete the first inner loop.
1008 cmp ebx, ecx // done yet?
1011 // Still have carries left to propagate.
1013 mov esi, [esp + 8] // -> dv blocks limit
1014 mov edx, [ebp + 24] // dv limit
1025 // Continue carry propagation until the end of the buffer.
1027 mov eax, 0 // preserves flags
1036 // Deal with the tail end.
1038 mov eax, 0 // preserves flags
1044 // All done for this iteration. Start the next. (This must have at
1045 // least one follow-on iteration, or we'd not have started this outer
1047 8: mov edi, [esp + 0] // -> dv[i - 1]
1048 mov ebx, [ebp + 28] // -> X = nv[0]
1049 lea edx, [esp + 44] // -> space for Y
1050 lea esi, [esp + 12] // -> expanded M = mi
1051 add edi, 16 // -> Z = dv[i]
1052 cmp edi, [esp + 4] // all done yet?
1070 ///--------------------------------------------------------------------------
1071 /// Testing and performance measurement.
1081 .macro cystore c, v, n
// NOTE(review): most of this test-harness macro is elided (presumably
// a cycle-counter read whose EDX:EAX result is stored below, plus the
// count update and `.endm') -- confirm against the complete source.
1089 mov [ebx + ecx*8], eax
1090 mov [ebx + ecx*8 + 4], edx
1093 .macro testprologue n
// NOTE(review): the frame setup of this test-harness macro is elided;
// only the store of the iteration count and the local-layout comment
// survive in this view -- confirm against the complete source.
1103 mov [esp + 104], eax
1105 // esp + 0 = v expanded
1106 // esp + 32 = y expanded
1107 // esp + 64 = ? expanded
1108 // esp + 96 = cycles
1109 // esp + 104 = count
1121 .macro testldcarry c
// Load the three expanded carry registers from the buffer given by the
// C argument (presumably moved into ECX by an elided line -- confirm).
1123 movdqu xmm4, [ecx + 0] // (c'_0; c''_0)
1124 movdqu xmm5, [ecx + 16] // (c'_1; c''_1)
1125 movdqu xmm6, [ecx + 32] // (c'_2; c''_2)
1128 .macro testexpand v=nil, y=nil
// NOTE(review): the loads of the packed V and Y operands and the
// `.ifnes' nil-guards around each half are elided from this view --
// confirm against the complete source.
1133 expand xmm7, xmm0, xmm1
1134 movdqa [esp + 0], xmm0 // V expanded -> locals
1135 movdqa [esp + 16], xmm1
1140 expand xmm7, xmm2, xmm3
1141 movdqa [esp + 32], xmm2 // Y expanded -> locals
1142 movdqa [esp + 48], xmm3
1146 .macro testtop u=nil, x=nil, mode=nil
// NOTE(review): the timing-loop body is elided: the `.ifeqs' branch
// contents, the loop label, and the definition of `\cyv' used below
// are not visible in this view -- confirm against the complete source.
1153 .ifeqs "\mode", "mont"
1160 .ifeqs "\mode", "mont"
1168 cystore esp + 96, \cyv, esp + 104
1172 .macro testcarryout c
// Store the three carry registers back to the buffer given by the C
// argument (presumably moved into ECX by an elided line -- confirm).
1174 movdqu [ecx + 0], xmm4
1175 movdqu [ecx + 16], xmm5
1176 movdqu [ecx + 32], xmm6
// NOTE(review): the FUNC labels for these test drivers (presumably
// test_dmul4, test_dmla4, test_mul4, test_mla4, test_mmul4,
// test_mmla4, test_mont4 or similar), the `call' into each primitive,
// the result stores, and the epilogues are elided from this view --
// only the harness-macro invocations survive.  Confirm against the
// complete source before changing.
1180 testprologue [ebp + 44]
1181 testldcarry [ebp + 24]
1182 testexpand [ebp + 36], [ebp + 40]
1184 testtop [ebp + 28], [ebp + 32]
1187 testcarryout [ebp + 24]
1192 testprologue [ebp + 44]
1193 testldcarry [ebp + 24]
1194 testexpand [ebp + 36], [ebp + 40]
1196 testtop [ebp + 28], [ebp + 32]
1199 testcarryout [ebp + 24]
1204 testprologue [ebp + 36]
1205 testldcarry [ebp + 24]
1206 testexpand nil, [ebp + 32]
1208 testtop nil, [ebp + 28]
1211 testcarryout [ebp + 24]
1216 testprologue [ebp + 36]
1217 testldcarry [ebp + 24]
1218 testexpand nil, [ebp + 32]
1220 testtop nil, [ebp + 28]
1223 testcarryout [ebp + 24]
1228 testprologue [ebp + 48]
1229 testexpand [ebp + 40], [ebp + 44]
1231 testtop [ebp + 32], [ebp + 36], mont
1235 movdqa xmm0, [esp + 64] // expanded Y, low half
1236 movdqa xmm1, [esp + 80] // ... and high half
1238 movdqu [edi + 16], xmm1
1239 testcarryout [ebp + 24]
1244 testprologue [ebp + 48]
1245 testexpand [ebp + 40], [ebp + 44]
1247 testtop [ebp + 32], [ebp + 36], mont
1251 movdqa xmm0, [esp + 64] // expanded Y, low half
1252 movdqa xmm1, [esp + 80] // ... and high half
1254 movdqu [edi + 16], xmm1
1255 testcarryout [ebp + 24]
1260 testprologue [ebp + 40]
1261 testexpand nil, [ebp + 36]
1263 testtop nil, [ebp + 32], mont
1267 movdqa xmm0, [esp + 64] // expanded Y, low half
1268 movdqa xmm1, [esp + 80] // ... and high half
1270 movdqu [edi + 16], xmm1
1271 testcarryout [ebp + 24]
1277 ///----- That's all, folks --------------------------------------------------