1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
3 /// Large SIMD-based multiplications
5 /// (c) 2016 Straylight/Edgeware
7 ///----- Licensing notice ---------------------------------------------------
9 /// This file is part of Catacomb.
11 /// Catacomb is free software; you can redistribute it and/or modify
12 /// it under the terms of the GNU Library General Public License as
13 /// published by the Free Software Foundation; either version 2 of the
14 /// License, or (at your option) any later version.
16 /// Catacomb is distributed in the hope that it will be useful,
17 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
18 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 /// GNU Library General Public License for more details.
21 /// You should have received a copy of the GNU Library General Public
22 /// License along with Catacomb; if not, write to the Free
23 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
24 /// MA 02111-1307, USA.
26 ///--------------------------------------------------------------------------
30 #include "asm-common.h"
36 ///--------------------------------------------------------------------------
39 /// We define a number of primitive fixed-size multipliers from which we can
40 /// construct more general variable-length multipliers.
42 /// The basic trick is the same throughout. In an operand-scanning
43 /// multiplication, the inner multiplication loop multiplies a
44 /// multiple-precision operand by a single precision factor, and adds the
45 /// result, appropriately shifted, to the result. A `finely integrated
46 /// operand scanning' implementation of Montgomery multiplication also adds
47 /// the product of a single-precision `Montgomery factor' and the modulus,
48 /// calculated in the same pass. The more common `coarsely integrated
49 /// operand scanning' alternates main multiplication and Montgomery passes,
50 /// which requires additional carry propagation.
52 /// Throughout both plain-multiplication and Montgomery stages, then, one of
53 /// the factors remains constant throughout the operation, so we can afford
54 /// to take a little time to preprocess it. The transformation we perform is
55 /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a
56 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into
57 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit
58 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
59 /// operands, as follows.
62 /// 0 v'_0 v'_1 v''_0 v''_1
63 /// 16 v'_2 v'_3 v''_2 v''_3
65 /// A `pmuludq' instruction ignores the odd positions in its operands; thus,
66 /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
67 /// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can
68 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
69 /// results in 64-bit fields. The sixteen bits of headroom allows us to add
70 /// many products together before we must deal with carrying; it also allows
71 /// for some calculations to be performed on the above expanded form.
73 /// On 32-bit x86, we are register starved: the expanded operands are kept in
74 /// memory, typically in warm L1 cache.
76 /// We maintain four `carry' registers accumulating intermediate results.
77 /// The registers' precise roles rotate during the computation; we name them
78 /// `c0', `c1', `c2', and `c3'. Each carry register holds two 64-bit halves:
79 /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
80 /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
81 /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3. The
82 /// `pmuludq' instruction acting on a scalar operand (broadcast across all
83 /// lanes of its vector) and an operand in the expanded form above produces a
84 /// result which can be added directly to the appropriate carry register.
85 /// Following a pass of four multiplications, we perform some limited carry
86 /// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
87 /// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
88 /// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
89 /// zeroed and becomes c3.
91 ///--------------------------------------------------------------------------
92 /// Macro definitions.
94 .macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil
95 // Load a word r_i from R, multiply by the expanded operand [S], and
96 // leave the pieces of the product in registers D0, D1, D2, D3.
// NOTE(review): the `.ifnes'/`.else'/`.endif' conditionals selecting the
// instruction sequence for the `nil' variants of D1--D3, and the closing
// `.endm', are elided from this view.  In particular, the two writes to
// \d2 below belong to different branches, not to one straight-line path.
97 movd \d0, \r // (r_i, 0; 0, 0)
99 movdqa \d1, [\s] // (s'_0, s'_1; s''_0, s''_1)
102 movdqa \d3, [\s + 16] // (s'_2, s'_3; s''_2, s''_3)
104 pshufd \d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?)
106 psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0)
110 movdqa \d2, \d3 // another copy of (s'_2, s'_3; ...)
112 movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?)
116 psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0)
119 pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1)
122 pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3)
126 pmuludq \d2, \d0 // (r_i s'_2; r_i s''_2)
128 pmuludq \d2, [\s + 16] // (r_i s'_2; r_i s''_2)
131 pmuludq \d0, [\s] // (r_i s'_0; r_i s''_0)
134 .macro accum c0, c1=nil, c2=nil, c3=nil
135 // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
136 // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip
137 // updating that register.
// NOTE(review): the macro body (presumably `paddq' additions guarded by
// `.ifnes' tests on C1--C3) and the closing `.endm' are elided from this
// view.
150 .macro mulacc r, s, c0, c1, c2, c3, z3p=nil
151 // Load a word r_i from R, multiply by the expanded operand [S],
152 // and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t'
153 // then C3 notionally contains zero, but needs clearing; in practice,
154 // we store the product directly rather than attempting to add. On
155 // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
// is not `t'.
// NOTE(review): the `.ifeqs "\z3p", "t"'/`.else'/`.endif' structure and
// the closing `.endm' are elided from this view.  The first `mulcore'
// below is the Z3P = `t' branch (its matching `accum \c0, \c1, \c2' is
// also elided); the second `mulcore'/`accum' pair is the other branch.
158 mulcore \r, \s, xmm0, xmm1, xmm2, \c3
161 mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
162 accum \c0, \c1, \c2, \c3
166 .macro propout d, c, cc=nil
167 // Calculate an output word from C, and store it in D; propagate
168 // carries out from C to CC in preparation for a rotation of the
169 // carry registers. On completion, XMM3 is clobbered. If CC is
170 // `nil', then the contribution which would have been added to it is
// simply discarded.
172 pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
173 psrldq xmm3, 12 // (t, 0; 0, 0) = (t, 0)
174 pslldq xmm3, 2 // (t b; 0)
175 paddq \c, xmm3 // (c' + t b; c'')
// NOTE(review): the store of the output word to \d (presumably a `movd')
// is elided from this view between the two lines here.
177 psrlq \c, 32 // floor(c/B)
// NOTE(review): in the full source this addition is presumably guarded by
// an `.ifnes "\cc", "nil"' (elided), per the CC description above; the
// closing `.endm' is also elided.
179 paddq \cc, \c // propagate up
183 .macro endprop d, c, t
184 // On entry, C contains a carry register. On exit, the low 32 bits
185 // of the value represented in C are written to D, and the remaining
186 // bits are left at the bottom of T.
// NOTE(review): an initialization of T from C (presumably `movdqa \t, \c')
// is elided from this view before the shift below.
188 psllq \t, 16 // (?; c'' b)
189 pslldq \c, 8 // (0; c')
190 paddq \t, \c // (?; c' + c'' b)
191 psrldq \t, 8 // (c' + c'' b; 0) = (c; 0)
// NOTE(review): the store of the low word to \d (presumably a `movd') and
// the closing `.endm' are elided from this view.
193 psrldq \t, 4 // (floor(c/B); 0)
196 .macro expand z, a, b, c=nil, d=nil
197 // On entry, A and C hold packed 128-bit values, and Z is zero. On
198 // exit, A:B and C:D together hold the same values in expanded
199 // form. If C is `nil', then only expand A to A:B.
// Split each 32-bit lane into 16-bit halves (interleaving with the zero
// register Z), then shuffle so the primed halves land in the low 64 bits
// and the double-primed halves in the high 64 bits -- the layout the
// `pmuludq'-based multipliers above expect.
// NOTE(review): the `.ifnes "\c", "nil"' guards around the C/D lines and
// the closing `.endm' are elided from this view.
200 movdqa \b, \a // (a_0, a_1; a_2, a_3)
202 movdqa \d, \c // (c_0, c_1; c_2, c_3)
204 punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1)
205 punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3)
207 punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
208 punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
210 pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
211 pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
213 pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
214 pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
218 .macro squash c0, c1, c2, c3, t, u, lo, hi=nil
219 // On entry, C0, C1, C2, C3 are carry registers representing a value
220 // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
221 // C3, T, and U are clobbered; and the high bits of Y are stored in
222 // HI, if this is not `nil'.
224 // The first step is to eliminate the `double-prime' pieces -- i.e.,
225 // the ones offset by 16 bits from a 32-bit boundary -- by carrying
226 // them into the 32-bit-aligned pieces above and below. But before
227 // we can do that, we must gather them together.
// NOTE(review): copies of C0 and C1 into T and U (presumably `movdqa')
// are elided from this view; the `punpcklqdq's below read T and U as the
// low-half sources.
230 punpcklqdq \t, \c2 // (y'_0; y'_2)
231 punpckhqdq \c0, \c2 // (y''_0; y''_2)
232 punpcklqdq \u, \c3 // (y'_1; y'_3)
233 punpckhqdq \c1, \c3 // (y''_1; y''_3)
235 // Now split the double-prime pieces. The high (up to) 48 bits will
236 // go up; the low 16 bits go down.
// NOTE(review): the setup of C2/C3 as copies of the double-prime vectors
// (and a preliminary shift isolating their low 16 bits) is elided from
// this view.
241 psrlq \c0, 16 // high parts of (y''_0; y''_2)
242 psrlq \c1, 16 // high parts of (y''_1; y''_3)
243 psrlq \c2, 32 // low parts of (y''_0; y''_2)
244 psrlq \c3, 32 // low parts of (y''_1; y''_3)
// NOTE(review): further lane-alignment shifts and `paddq's are elided
// around the lines below; only part of the propagation is visible.
248 pslldq \c1, 8 // high part of (0; y''_1)
250 paddq \t, \c2 // propagate down
252 paddq \t, \c1 // and up: (y_0; y_2)
253 paddq \u, \c0 // (y_1; y_3)
// NOTE(review): presumably guarded by `.ifnes "\hi", "nil"' (elided), and
// HI is initialized from the top carry before this shift.
255 psrldq \hi, 8 // high part of (y''_3; 0)
258 // Finally extract the answer. This complicated dance is better than
259 // storing to memory and loading, because the piecemeal stores
260 // inhibit store forwarding.
261 movdqa \c3, \t // (y_0; ?)
262 movdqa \lo, \t // (y^*_0, ?; ?, ?)
263 psrldq \t, 8 // (y_2; 0)
264 psrlq \c3, 32 // (floor(y_0/B); ?)
265 paddq \c3, \u // (y_1 + floor(y_0/B); ?)
266 movdqa \c1, \c3 // (y^*_1, ?; ?, ?)
267 psrldq \u, 8 // (y_3; 0)
268 psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2); ?)
269 paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2); ?)
270 punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?)
271 psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
272 paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
// NOTE(review): intermediate lines (including the final carry extraction
// feeding T and HI) are elided from this view; the closing `.endm' is
// also elided.
277 punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?)
279 psrlq \t, 32 // very high bits of y
281 punpcklqdq \hi, \u // carry up
283 punpckldq \lo, \c1 // y mod B^4
// NOTE(review): the `.macro' line introducing this block (a carry-add
// helper, per the description below) is elided from this view.
287 // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
288 // hold the incoming carry registers c0, c1, and c2 representing a
// carry-in C.
291 // On exit, the carry registers, including XMM7, are updated to hold
292 // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
293 // registers are preserved.
294 movd xmm0, [edi + 0] // (a_0; 0)
295 movd xmm1, [edi + 4] // (a_1; 0)
296 movd xmm2, [edi + 8] // (a_2; 0)
297 movd xmm7, [edi + 12] // (a_3; 0)
299 paddq xmm4, xmm0 // (c'_0 + a_0; c''_0)
300 paddq xmm5, xmm1 // (c'_1 + a_1; c''_1)
// NOTE(review): the shift merging a_3 (in XMM7) into the high half as
// a_3 b -- implied by the comment below -- and the closing `.endm' are
// elided from this view.
301 paddq xmm6, xmm2 // (c'_2 + a_2; c''_2 + a_3 b)
304 ///--------------------------------------------------------------------------
305 /// Primitive multipliers and related utilities.
// NOTE(review): the internal-function label for this carry-propagation
// helper (and its prologue/`ret') are elided from this view.
308 // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
309 // form. Store the low 128 bits of the represented carry to [EDI] as
310 // a packed 128-bit value, and leave the remaining 16 bits in the low
311 // 32 bits of XMM4. On exit, XMM3, XMM5 and XMM6 are clobbered.
// Four `propout's chain the carry up through the words; `endprop' leaves
// the residual high bits in XMM4 as promised above.
314 propout [edi + 0], xmm4, xmm5
315 propout [edi + 4], xmm5, xmm6
316 propout [edi + 8], xmm6, nil
317 endprop [edi + 12], xmm6, xmm4
// NOTE(review): the internal-function label for this dual
// multiply-accumulate step (and its prologue/`ret') are elided from this
// view.
322 // On entry, EDI points to the destination buffer; EAX and EBX point
323 // to the packed operands U and X; ECX and EDX point to the expanded
324 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
325 // registers c0, c1, and c2; c3 is assumed to be zero.
327 // On exit, we write the low 128 bits of the sum C + U V + X Y to
328 // [EDI], and update the carry registers with the carry out. The
329 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
330 // general-purpose registers are preserved.
// Each word: accumulate u_i V and x_i Y into the carries, emit one output
// word, and rotate the carry-register roles (note the cycling register
// order across the four rounds).
333 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t
334 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
335 propout [edi + 0], xmm4, xmm5
337 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
338 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
339 propout [edi + 4], xmm5, xmm6
341 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
342 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
343 propout [edi + 8], xmm6, xmm7
345 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
346 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
347 propout [edi + 12], xmm7, xmm4
// NOTE(review): the internal-function label for this dual
// multiply-accumulate-with-addend step (and its prologue/`ret') are
// elided from this view.
353 // On entry, EDI points to the destination buffer, which also
354 // contains an addend A to accumulate; EAX and EBX point to the
355 // packed operands U and X; ECX and EDX point to the expanded
356 // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
357 // registers c0, c1, and c2 representing a carry-in C; c3 is assumed
// to be zero.
360 // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
361 // [EDI], and update the carry registers with the carry out. The
362 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
363 // general-purpose registers are preserved.
// NOTE(review): the step loading the addend A into the carries (so that
// the first `mulacc' can accumulate into XMM7 rather than overwrite it
// with `t') is elided from this view.
368 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
369 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
370 propout [edi + 0], xmm4, xmm5
372 mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
373 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4
374 propout [edi + 4], xmm5, xmm6
376 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
377 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5
378 propout [edi + 8], xmm6, xmm7
380 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
381 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
382 propout [edi + 12], xmm7, xmm4
// NOTE(review): the internal-function label for this single-operand
// multiply with zero carry-in (and its prologue/`ret') are elided from
// this view.
388 // On entry, EDI points to the destination buffer; EBX points to a
389 // packed operand X; and EDX points to an expanded operand Y.
391 // On exit, we write the low 128 bits of the product X Y to [EDI],
392 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
393 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
394 // general-purpose registers are preserved.
// First word uses `mulcore' to establish fresh carries; subsequent words
// accumulate with `t' marking the notionally-zero top carry.
397 mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
398 propout [edi + 0], xmm4, xmm5
400 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
401 propout [edi + 4], xmm5, xmm6
403 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
404 propout [edi + 8], xmm6, xmm7
406 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
407 propout [edi + 12], xmm7, xmm4
// NOTE(review): the internal-function label for this single-operand
// multiply with carry-in (and its prologue/`ret') are elided from this
// view.
413 // On entry, EDI points to the destination buffer; EBX points to a
414 // packed operand X; EDX points to an expanded operand Y; and XMM4,
415 // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
416 // representing a carry-in C; c3 is assumed to be zero.
418 // On exit, we write the low 128 bits of the sum C + X Y to [EDI],
419 // and update the carry registers with the carry out. The registers
420 // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
421 // general-purpose registers are preserved.
// All four rounds use `mulacc' with `t', since c3 (the rotating top
// carry) is zero on entry to each round.
424 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t
425 propout [edi + 0], xmm4, xmm5
427 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
428 propout [edi + 4], xmm5, xmm6
430 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
431 propout [edi + 8], xmm6, xmm7
433 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
434 propout [edi + 12], xmm7, xmm4
// NOTE(review): the internal-function label for this
// multiply-accumulate-addend with zero carry-in (and its
// prologue/`ret') are elided from this view.
440 // On entry, EDI points to the destination buffer, which also
441 // contains an addend A to accumulate; EBX points to a packed operand
442 // X; and EDX points to an expanded operand Y.
444 // On exit, we write the low 128 bits of the sum A + X Y to [EDI],
445 // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
446 // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
447 // general-purpose registers are preserved.
// NOTE(review): loads of A's lower words into XMM4--XMM6 (companions to
// the XMM7 load below) are elided from this view.
453 movd xmm7, [edi + 12]
455 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
456 propout [edi + 0], xmm4, xmm5
458 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
459 propout [edi + 4], xmm5, xmm6
461 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
462 propout [edi + 8], xmm6, xmm7
464 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
465 propout [edi + 12], xmm7, xmm4
// NOTE(review): the internal-function label for this
// multiply-accumulate-addend with carry-in (and its prologue/`ret') are
// elided from this view.
471 // On entry, EDI points to the destination buffer, which also
472 // contains an addend A to accumulate; EBX points to a packed operand
473 // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
474 // the incoming carry registers c0, c1, and c2, representing a
475 // carry-in C; c3 is assumed to be zero.
477 // On exit, we write the low 128 bits of the sum A + C + X Y to
478 // [EDI], and update the carry registers with the carry out. The
479 // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
480 // general-purpose registers are preserved.
// NOTE(review): the step folding the addend A into the carries (which is
// why the first `mulacc' accumulates into XMM7 rather than using `t') is
// elided from this view.
485 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
486 propout [edi + 0], xmm4, xmm5
488 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
489 propout [edi + 4], xmm5, xmm6
491 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
492 propout [edi + 8], xmm6, xmm7
494 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
495 propout [edi + 12], xmm7, xmm4
// NOTE(review): the internal-function label for this Montgomery
// multiplication step is elided from this view, as is most of its body
// (it presumably joins the shared tail of the following block after the
// first round).
501 // On entry, EDI points to the destination buffer; EAX and EBX point
502 // to the packed operands U and N; ECX and ESI point to the expanded
503 // operands V and M; and EDX points to a place to store an expanded
504 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
505 // must be 12 modulo 16, as is usual for modern x86 ABIs.
507 // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
508 // of the sum U V + N Y to [EDI], leaving the remaining carry in
509 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
510 // XMM7 are clobbered; the general-purpose registers are preserved.
511 stalloc 48 + 12 // space for the carries
514 // Calculate W = U V, and leave it in the destination. Stash the
515 // carry pieces for later.
516 mulcore [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
517 propout [edi + 0], xmm4, xmm5
// NOTE(review): the internal-function label for this Montgomery
// multiply-accumulate step (and its prologue/`ret' and stack free) are
// elided from this view.
522 // On entry, EDI points to the destination buffer, which also
523 // contains an addend A to accumulate; EAX and EBX point to the
524 // packed operands U and N; ECX and ESI point to the expanded
525 // operands V and M; and EDX points to a place to store an expanded
526 // result Y (32 bytes, at a 16-byte boundary). The stack pointer
527 // must be 12 modulo 16, as is usual for modern x86 ABIs.
529 // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
530 // bits of the sum A + U V + N Y to [EDI], leaving the remaining
531 // carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2,
532 // XMM3, and XMM7 are clobbered; the general-purpose registers are
// preserved.
534 stalloc 48 + 12 // space for the carries
// NOTE(review): loads of A's lower words into XMM4--XMM6 (companions to
// the XMM7 load below) are elided from this view.
540 movd xmm7, [edi + 12]
542 // Calculate W = U V, and leave it in the destination. Stash the
543 // carry pieces for later.
544 mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7
545 propout [edi + 0], xmm4, xmm5
// Shared tail: the `5:' label marks code common to this step and the
// non-accumulating variant above.
547 5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t
548 propout [edi + 4], xmm5, xmm6
550 mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t
551 propout [edi + 8], xmm6, xmm7
553 mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
554 propout [edi + 12], xmm7, xmm4
// Stash the carries from the W = U V pass; they are restored after the
// Montgomery pass below.
556 movdqa [SP + 0], xmm4
557 movdqa [SP + 16], xmm5
558 movdqa [SP + 32], xmm6
560 // Calculate Y = W M.
561 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
563 mulcore [edi + 4], esi, xmm0, xmm1, xmm2
564 accum xmm5, xmm6, xmm7
// NOTE(review): the `accum' calls pairing with the next two `mulcore's
// are elided from this view.
566 mulcore [edi + 8], esi, xmm0, xmm1
569 mulcore [edi + 12], esi, xmm0
572 // That's lots of pieces. Now we have to assemble the answer.
573 squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4
// NOTE(review): zeroing of XMM2 (the Z argument of `expand') is
// presumably elided from this view.
577 expand xmm2, xmm4, xmm1
578 movdqa [edx + 0], xmm4
579 movdqa [edx + 16], xmm1
581 // Initialize the carry from the value for W we calculated earlier.
// NOTE(review): loads of W's lower words into XMM4--XMM6 are elided.
585 movd xmm7, [edi + 12]
587 // Finish the calculation by adding the Montgomery product.
588 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
589 propout [edi + 0], xmm4, xmm5
591 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
592 propout [edi + 4], xmm5, xmm6
594 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
595 propout [edi + 8], xmm6, xmm7
597 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
598 propout [edi + 12], xmm7, xmm4
600 // Add on the carry we calculated earlier.
// NOTE(review): the matching `paddq xmm4, [SP + 0]' is elided from this
// view.
602 paddq xmm5, [SP + 16]
603 paddq xmm6, [SP + 32]
605 // And, with that, we're done.
// NOTE(review): the internal-function label for this Montgomery
// reduction step (and its prologue/`ret') are elided from this view.
611 // On entry, EDI points to the destination buffer holding a packed
612 // value W; EBX points to a packed operand N; ESI points to an
613 // expanded operand M; and EDX points to a place to store an expanded
614 // result Y (32 bytes, at a 16-byte boundary).
616 // On exit, we write Y = W M mod B to [EDX], and the low 128 bits
617 // of the sum W + N Y to [EDI], leaving the remaining carry in
618 // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and
619 // XMM7 are clobbered; the general-purpose registers are preserved.
622 // Calculate Y = W M.
623 mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7
625 mulcore [edi + 4], esi, xmm0, xmm1, xmm2
626 accum xmm5, xmm6, xmm7
// NOTE(review): the `accum' calls pairing with the next two `mulcore's
// are elided from this view.
628 mulcore [edi + 8], esi, xmm0, xmm1
631 mulcore [edi + 12], esi, xmm0
634 // That's lots of pieces. Now we have to assemble the answer.
635 squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4
// NOTE(review): zeroing of XMM2 (the Z argument of `expand') is
// presumably elided from this view.
639 expand xmm2, xmm4, xmm1
640 movdqa [edx + 0], xmm4
641 movdqa [edx + 16], xmm1
643 // Initialize the carry from W.
// NOTE(review): loads of W's lower words into XMM4--XMM6 are elided.
647 movd xmm7, [edi + 12]
649 // Finish the calculation by adding the Montgomery product.
650 mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7
651 propout [edi + 0], xmm4, xmm5
653 mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t
654 propout [edi + 4], xmm5, xmm6
656 mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t
657 propout [edi + 8], xmm6, xmm7
659 mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
660 propout [edi + 12], xmm7, xmm4
662 // And, with that, we're done.
666 ///--------------------------------------------------------------------------
667 /// Bulk multipliers.
// NOTE(review): large portions of these functions -- the AVX entry's
// body, the stack-frame setup, the inner-loop calls to the primitive
// multipliers, the branches, and the epilogues -- are elided from this
// view.  Only the loop-setup scaffolding is visible.
669 FUNC(mpx_umul4_x86_avx)
673 // and drop through...
677 FUNC(mpx_umul4_x86_sse2)
678 // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
679 // const mpw *bv, const mpw *bvl);
681 // Build a stack frame. Arguments will be relative to BP, as
690 // Locals are relative to SP, as follows.
692 // SP + 0 expanded Y (32 bytes)
693 // SP + 32 (top of locals)
703 // Prepare for the first iteration.
704 mov esi, [BP + 32] // -> bv[0]
706 movdqu xmm0, [esi] // bv[0]
707 mov edi, [BP + 20] // -> dv[0]
708 mov ecx, edi // outer loop dv cursor
709 expand xmm7, xmm0, xmm1
710 mov ebx, [BP + 24] // -> av[0]
711 mov eax, [BP + 28] // -> av[m] = av limit
712 mov edx, SP // -> expanded Y = bv[0]
713 movdqa [SP + 0], xmm0 // bv[0] expanded low
714 movdqa [SP + 16], xmm1 // bv[0] expanded high
720 cmp ebx, eax // all done?
724 // Continue with the first iteration.
728 cmp ebx, eax // all done?
731 // Write out the leftover carry. There can be no tail here.
733 cmp esi, [BP + 36] // more passes to do?
737 // Set up for the next pass.
738 1: movdqu xmm0, [esi] // bv[i]
739 mov edi, ecx // -> dv[i]
741 expand xmm7, xmm0, xmm1
742 mov ebx, [BP + 24] // -> av[0]
743 movdqa [SP + 0], xmm0 // bv[i] expanded low
744 movdqa [SP + 16], xmm1 // bv[i] expanded high
750 cmp ebx, eax // done yet?
761 // Finish off this pass. There was no tail on the previous pass, and
762 // there can be none on this pass.
// NOTE(review): substantial portions of these functions -- the AVX
// entry's body, frame setup, argument-offset table, inner-loop calls,
// conditional branches, and epilogues -- are elided from this view.
776 FUNC(mpxmont_mul4_x86_avx)
780 // and drop through...
784 FUNC(mpxmont_mul4_x86_sse2)
785 // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
786 // const mpw *nv, size_t n, const mpw *mi);
788 // Build a stack frame. Arguments will be relative to BP, as
795 // BP + 36 n (nonzero multiple of 4)
798 // Locals are relative to SP, which is 16-byte aligned, as follows.
800 // SP + 0 expanded V (32 bytes)
801 // SP + 32 expanded M (32 bytes)
802 // SP + 64 expanded Y (32 bytes)
803 // SP + 96 outer loop dv
804 // SP + 100 outer loop bv
805 // SP + 104 av limit (mostly in ESI)
807 // SP + 112 (top of locals)
817 // Establish the expanded operands.
819 mov ecx, [BP + 28] // -> bv
820 mov edx, [BP + 40] // -> mi
821 movdqu xmm0, [ecx] // bv[0]
822 movdqu xmm2, [edx] // mi
823 expand xmm7, xmm0, xmm1, xmm2, xmm3
824 movdqa [SP + 0], xmm0 // bv[0] expanded low
825 movdqa [SP + 16], xmm1 // bv[0] expanded high
826 movdqa [SP + 32], xmm2 // mi expanded low
827 movdqa [SP + 48], xmm3 // mi expanded high
829 // Set up the outer loop state and prepare for the first iteration.
830 mov edx, [BP + 36] // n
831 mov eax, [BP + 24] // -> U = av[0]
832 mov ebx, [BP + 32] // -> X = nv[0]
833 mov edi, [BP + 20] // -> Z = dv[0]
835 lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit
836 lea edx, [eax + 4*edx] // -> av[n/4] = av limit
840 lea ecx, [SP + 0] // -> expanded V = bv[0]
841 lea esi, [SP + 32] // -> expanded M = mi
842 lea edx, [SP + 64] // -> space for Y
844 mov esi, [SP + 104] // recover av limit
848 cmp eax, esi // done already?
853 // Complete the first inner loop.
858 cmp eax, esi // done yet?
861 // Still have carries left to propagate.
863 movd [edi + 16], xmm4
866 // Embark on the next iteration. (There must be one. If n = 1, then
867 // we would have bailed above, to label 8. Similarly, the subsequent
868 // iterations can fall into the inner loop immediately.)
869 1: mov eax, [SP + 100] // -> bv[i - 1]
870 mov edi, [SP + 96] // -> Z = dv[i]
871 add eax, 16 // -> bv[i]
874 cmp eax, [SP + 108] // done yet?
876 movdqu xmm0, [eax] // bv[i]
877 mov ebx, [BP + 32] // -> X = nv[0]
878 lea esi, [SP + 32] // -> expanded M = mi
879 mov eax, [BP + 24] // -> U = av[0]
880 expand xmm7, xmm0, xmm1
881 movdqa [SP + 0], xmm0 // bv[i] expanded low
882 movdqa [SP + 16], xmm1 // bv[i] expanded high
884 mov esi, [SP + 104] // recover av limit
891 // Complete the next inner loop.
899 // Still have carries left to propagate, and they overlap the
900 // previous iteration's final tail, so read that in and add it.
904 movd [edi + 16], xmm4
909 // First iteration was short. Write out the carries and we're done.
910 // (This could be folded into the main loop structure, but that would
911 // penalize small numbers more.)
913 movd [edi + 16], xmm4
// NOTE(review): substantial portions of these functions -- the AVX
// entry's body, frame setup, inner-loop calls, carry-propagation loops,
// branches, and epilogues -- are elided from this view.
924 FUNC(mpxmont_redc4_x86_avx)
928 // and drop through...
932 FUNC(mpxmont_redc4_x86_sse2)
933 // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
934 // size_t n, const mpw *mi);
936 // Build a stack frame. Arguments will be relative to BP, as
942 // BP + 32 n (nonzero multiple of 4)
945 // Locals are relative to SP, as follows.
947 // SP + 0 outer loop dv
948 // SP + 4 outer dv limit
949 // SP + 8 blocks-of-4 dv limit
950 // SP + 12 expanded M (32 bytes)
951 // SP + 44 expanded Y (32 bytes)
952 // SP + 76 (top of locals)
962 // Establish the expanded operands and the blocks-of-4 dv limit.
963 mov edi, [BP + 20] // -> Z = dv[0]
965 mov eax, [BP + 24] // -> dv[n] = dv limit
966 sub eax, edi // length of dv in bytes
967 mov edx, [BP + 36] // -> mi
968 movdqu xmm0, [edx] // mi
969 and eax, ~15 // mask off the tail end
970 expand xmm7, xmm0, xmm1
971 add eax, edi // find limit
972 movdqa [SP + 12], xmm0 // mi expanded low
973 movdqa [SP + 28], xmm1 // mi expanded high
976 // Set up the outer loop state and prepare for the first iteration.
977 mov ecx, [BP + 32] // n
978 mov ebx, [BP + 28] // -> X = nv[0]
979 lea edx, [edi + 4*ecx] // -> dv[n/4] = outer dv limit
980 lea ecx, [ebx + 4*ecx] // -> nv[n/4] = nv limit
983 lea esi, [SP + 12] // -> expanded M = mi
984 lea edx, [SP + 44] // -> space for Y
988 cmp ebx, ecx // done already?
992 // Complete the first inner loop.
996 cmp ebx, ecx // done yet?
999 // Still have carries left to propagate.
1001 mov esi, [SP + 8] // -> dv blocks limit
1002 mov edx, [BP + 24] // dv limit
1013 // Continue carry propagation until the end of the buffer.
1015 mov eax, 0 // preserves flags
1024 // Deal with the tail end.
1026 mov eax, 0 // preserves flags
1032 // All done for this iteration. Start the next. (This must have at
1033 // least one follow-on iteration, or we'd not have started this outer
1035 8: mov edi, [SP + 0] // -> dv[i - 1]
1036 mov ebx, [BP + 28] // -> X = nv[0]
1037 lea edx, [SP + 44] // -> space for Y
1038 lea esi, [SP + 12] // -> expanded M = mi
1039 add edi, 16 // -> Z = dv[i]
1040 cmp edi, [SP + 4] // all done yet?
1057 ///--------------------------------------------------------------------------
1058 /// Testing and performance measurement.
// NOTE(review): these test-harness macros are heavily elided: macro
// bodies, `.endm' lines, and the conditional structure inside `testtop'
// are mostly missing from this view.  Only fragments are visible.
1068 .macro cystore c, v, n
// Store a 64-bit cycle count (EDX:EAX) into the buffer at EBX, indexed
// by ECX.
1076 mov [ebx + ecx*8], eax
1077 mov [ebx + ecx*8 + 4], edx
1080 .macro testprologue n
// Local layout used by the test functions (per the comments below).
1092 // SP + 0 = v expanded
1093 // SP + 32 = y expanded
1094 // SP + 64 = ? expanded
1108 .macro testldcarry c
// Load the three incoming carry registers from the (unaligned) buffer C.
1110 movdqu xmm4, [ecx + 0] // (c'_0; c''_0)
1111 movdqu xmm5, [ecx + 16] // (c'_1; c''_1)
1112 movdqu xmm6, [ecx + 32] // (c'_2; c''_2)
1115 .macro testexpand v=nil, y=nil
// Expand the packed V and/or Y operands into the stack locals above.
// NOTE(review): the `.ifnes' guards and the loads feeding `expand' are
// elided from this view.
1120 expand xmm7, xmm0, xmm1
1121 movdqa [SP + 0], xmm0
1122 movdqa [SP + 16], xmm1
1127 expand xmm7, xmm2, xmm3
1128 movdqa [SP + 32], xmm2
1129 movdqa [SP + 48], xmm3
1133 .macro testtop u=nil, x=nil, mode=nil
// NOTE(review): `\cyv' below is presumably a parameter of an enclosing
// or elided macro argument list -- confirm against the full source.
1140 .ifeqs "\mode", "mont"
1147 .ifeqs "\mode", "mont"
1155 cystore SP + 96, \cyv, SP + 104
1159 .macro testcarryout c
// Store the outgoing carry registers back to the (unaligned) buffer C.
1161 movdqu [ecx + 0], xmm4
1162 movdqu [ecx + 16], xmm5
1163 movdqu [ecx + 32], xmm6
// NOTE(review): the FUNC labels naming each of these test drivers (one
// per primitive multiplier above), together with the calls under test,
// the epilogues, and several intervening lines, are elided from this
// view.  Each driver follows the same shape: set up locals and operands
// from the stacked arguments (relative to BP), run the timed operation,
// then write back the carries (and, for the Montgomery variants, the
// expanded Y result).
1167 testprologue [BP + 44]
1168 testldcarry [BP + 24]
1169 testexpand [BP + 36], [BP + 40]
1171 testtop [BP + 28], [BP + 32]
1174 testcarryout [BP + 24]
1179 testprologue [BP + 44]
1180 testldcarry [BP + 24]
1181 testexpand [BP + 36], [BP + 40]
1183 testtop [BP + 28], [BP + 32]
1186 testcarryout [BP + 24]
1191 testprologue [BP + 36]
1192 testldcarry [BP + 24]
1193 testexpand nil, [BP + 32]
1195 testtop nil, [BP + 28]
1198 testcarryout [BP + 24]
1203 testprologue [BP + 36]
1204 testldcarry [BP + 24]
1205 testexpand nil, [BP + 32]
1207 testtop nil, [BP + 28]
1210 testcarryout [BP + 24]
1215 testprologue [BP + 36]
1216 testldcarry [BP + 24]
1217 testexpand nil, [BP + 32]
1219 testtop nil, [BP + 28]
1222 testcarryout [BP + 24]
1227 testprologue [BP + 36]
1228 testldcarry [BP + 24]
1229 testexpand nil, [BP + 32]
1231 testtop nil, [BP + 28]
1234 testcarryout [BP + 24]
// Montgomery-mode drivers: also write out the expanded Y result from the
// stack locals.
1239 testprologue [BP + 48]
1240 testexpand [BP + 40], [BP + 44]
1242 testtop [BP + 32], [BP + 36], mont
1246 movdqa xmm0, [SP + 64]
1247 movdqa xmm1, [SP + 80]
1249 movdqu [edi + 16], xmm1
1250 testcarryout [BP + 24]
1255 testprologue [BP + 48]
1256 testexpand [BP + 40], [BP + 44]
1258 testtop [BP + 32], [BP + 36], mont
1262 movdqa xmm0, [SP + 64]
1263 movdqa xmm1, [SP + 80]
1265 movdqu [edi + 16], xmm1
1266 testcarryout [BP + 24]
1271 testprologue [BP + 40]
1272 testexpand nil, [BP + 36]
1274 testtop nil, [BP + 32], mont
1278 movdqa xmm0, [SP + 64]
1279 movdqa xmm1, [SP + 80]
1281 movdqu [edi + 16], xmm1
1282 testcarryout [BP + 24]
1288 ///----- That's all, folks --------------------------------------------------