mdw@git.distorted.org.uk Git - catacomb/blob - math/mpx-mul4-x86-sse2.S

   1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
   2 ///
   3 /// Large SIMD-based multiplications
   4 ///
   5 /// (c) 2016 Straylight/Edgeware
   6
   7 ///----- Licensing notice ---------------------------------------------------
   8 ///
   9 /// This file is part of Catacomb.
  10 ///
  11 /// Catacomb is free software; you can redistribute it and/or modify
  12 /// it under the terms of the GNU Library General Public License as
  13 /// published by the Free Software Foundation; either version 2 of the
  14 /// License, or (at your option) any later version.
  15 ///
  16 /// Catacomb is distributed in the hope that it will be useful,
  17 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 /// GNU Library General Public License for more details.
  20 ///
  21 /// You should have received a copy of the GNU Library General Public
  22 /// License along with Catacomb; if not, write to the Free
  23 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  24 /// MA 02111-1307, USA.
  25
  26 ///--------------------------------------------------------------------------
  27 /// External definitions.
  28
  29 #include "config.h"
  30 #include "asm-common.h"
  31
  32 ///--------------------------------------------------------------------------
  33 /// Prologue.
  34
  35         .arch   pentium4
  36         .text
  37
  38 ///--------------------------------------------------------------------------
  39 /// Theory.
  40 ///
  41 /// We define a number of primitive fixed-size multipliers from which we can
  42 /// construct more general variable-length multipliers.
  43 ///
  44 /// The basic trick is the same throughout.  In an operand-scanning
  45 /// multiplication, the inner multiplication loop multiplies a
  46 /// multiple-precision operand by a single precision factor, and adds the
  47 /// result, appropriately shifted, to the result.  A `finely integrated
  48 /// operand scanning' implementation of Montgomery multiplication also adds
  49 /// the product of a single-precision `Montgomery factor' and the modulus,
  50 /// calculated in the same pass.  The more common `coarsely integrated
  51 /// operand scanning' alternates main multiplication and Montgomery passes,
  52 /// which requires additional carry propagation.
  53 ///
  54 /// Throughout both plain-multiplication and Montgomery stages, then, one of
  55 /// the factors remains constant throughout the operation, so we can afford
  56 /// to take a little time to preprocess it.  The transformation we perform is
  57 /// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
  58 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
  59 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
  60 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
  61 /// operands, as follows.
  62 ///
  63 ///     Offset     0       4        8      12
  64 ///        0    v'_0    v'_1    v''_0   v''_1
  65 ///       16    v'_2    v'_3    v''_2   v''_3
  66 ///
   67 /// A `pmuludq' instruction ignores the odd positions in its operands; thus,
  68 /// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
  69 /// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
  70 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
  71 /// results in 64-bit fields.  The sixteen bits of headroom allows us to add
  72 /// many products together before we must deal with carrying; it also allows
  73 /// for some calculations to be performed on the above expanded form.
  74 ///
  75 /// On 32-bit x86, we are register starved: the expanded operands are kept in
  76 /// memory, typically in warm L1 cache.
  77 ///
  78 /// We maintain four `carry' registers accumulating intermediate results.
  79 /// The registers' precise roles rotate during the computation; we name them
  80 /// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
  81 /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
  82 /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
  83 /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
   84 /// `pmuludq' instruction acting on a scalar operand (broadcast across all
  85 /// lanes of its vector) and an operand in the expanded form above produces a
  86 /// result which can be added directly to the appropriate carry register.
  87 /// Following a pass of four multiplications, we perform some limited carry
  88 /// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
  89 /// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
   90 /// registers around, so that c1 becomes c0, and the old c0, (implicitly)
   91 /// zeroed, becomes c3.
  92
  93 ///--------------------------------------------------------------------------
  94 /// Macro definitions.
  95
  96 .macro  mulcore r, s, d0, d1, d2, d3
  97         // Load a word r_i from R, multiply by the expanded operand [S], and
  98         // leave the pieces of the product in registers D0, D1, D2, D3.
  99         movd    \d0, \r                 // (r_i, 0, 0, 0)
 100   .ifnes "\d1", "nil"
 101         movdqa  \d1, [\s]               // (s'_0, s'_1, s''_0, s''_1)
 102   .endif
 103   .ifnes "\d3", "nil"
 104         movdqa  \d3, [\s + 16]          // (s'_2, s'_3, s''_2, s''_3)
 105   .endif
 106         pshufd  \d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?, r_i, ?)
 107   .ifnes "\d1", "nil"
 108         psrldq  \d1, 4                  // (s'_1, s''_0, s''_1, 0)
 109   .endif
 110   .ifnes "\d2", "nil"
 111     .ifnes "\d3", "nil"
 112         movdqa  \d2, \d3                // another copy of (s'_2, s'_3, ...)
 113     .else
 114         movdqa  \d2, \d0                // another copy of (r_i, ?, r_i, ?)
 115     .endif
 116   .endif
 117   .ifnes "\d3", "nil"
 118         psrldq  \d3, 4                  // (s'_3, s''_2, s''_3, 0)
 119   .endif
 120   .ifnes "\d1", "nil"
 121         pmuludqd \d1, \d0               // (r_i s'_1, r_i s''_1)
 122   .endif
 123   .ifnes "\d3", "nil"
 124         pmuludqd \d3, \d0               // (r_i s'_3, r_i s''_3)
 125   .endif
 126   .ifnes "\d2", "nil"
 127     .ifnes "\d3", "nil"
 128         pmuludqd \d2, \d0               // (r_i s'_2, r_i s''_2)
 129     .else
 130         pmuludqd \d2, [\s + 16]
 131     .endif
 132   .endif
 133         pmuludqd \d0, [\s]              // (r_i s'_0, r_i s''_0)
 134 .endm
 135
 136 .macro  accum   c0, c1, c2, c3
 137         paddq   \c0, xmm0
 138   .ifnes "\c1", "nil"
 139         paddq   \c1, xmm1
 140   .endif
 141   .ifnes "\c2", "nil"
 142         paddq   \c2, xmm2
 143   .endif
 144   .ifnes "\c3", "nil"
 145         paddq   \c3, xmm3
 146   .endif
 147 .endm
 148
 149 .macro  mulacc  r, s, c0, c1, c2, c3, z3p
 150         // Load a word r_i from R, multiply by the expanded operand [S],
 151         // and accumulate in carry registers C0, C1, C2, C3.  If Z3P is `t'
 152         // then C3 notionally contains zero, but needs clearing; in practice,
 153         // we store the product directly rather than attempting to add.  On
 154         // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
 155         // is not `t'.
 156   .ifeqs "\z3p", "t"
 157         mulcore \r, \s, xmm0, xmm1, xmm2, \c3
 158         accum           \c0,  \c1,  \c2,  nil
 159   .else
 160         mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
 161         accum           \c0,  \c1,  \c2,  \c3
 162   .endif
 163 .endm
 164
 165 .macro  propout d, c, cc
 166         // Calculate an output word from C, and store it in D; propagate
 167         // carries out from C to CC in preparation for a rotation of the
 168         // carry registers.  On completion, XMM3 is clobbered.  If CC is
 169         // `nil', then the contribution which would have been added to it is
 170         // left in C.
 171         pshufd  xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
 172         psrldq  xmm3, 12                // (t, 0, 0, 0) = (t, 0)
 173         pslldq  xmm3, 2                 // (t b, 0)
 174         paddq   \c, xmm3                // (c' + t b, c'')
 175         movd    \d, \c
 176         psrlq   \c, 32                  // floor(c/B)
 177   .ifnes "\cc", "nil"
 178         paddq   \cc, \c                 // propagate up
 179   .endif
 180 .endm
 181
 182 .macro  endprop d, c, t
 183         // On entry, C contains a carry register.  On exit, the low 32 bits
 184         // of the value represented in C are written to D, and the remaining
 185         // bits are left at the bottom of T.
 186         movdqa  \t, \c
 187         psllq   \t, 16                  // (?, c'' b)
 188         pslldq  \c, 8                   // (0, c')
 189         paddq   \t, \c                  // (?, c' + c'' b)
 190         psrldq  \t, 8                   // c' + c'' b
 191         movd    \d, \t
 192         psrldq  \t, 4                   // floor((c' + c'' b)/B)
 193 .endm
 194
 195 .macro  expand  a, b, c, d, z
 196         // On entry, A and C hold packed 128-bit values, and Z is zero.  On
 197         // exit, A:B and C:D together hold the same values in expanded
 198         // form.  If C is `nil', then only expand A to A:B.
 199         movdqa  \b, \a                  // (a_0, a_1, a_2, a_3)
 200   .ifnes "\c", "nil"
 201         movdqa  \d, \c                  // (c_0, c_1, c_2, c_3)
 202   .endif
 203         punpcklwd \a, \z                // (a'_0, a''_0, a'_1, a''_1)
 204         punpckhwd \b, \z                // (a'_2, a''_2, a'_3, a''_3)
 205   .ifnes "\c", "nil"
 206         punpcklwd \c, \z                // (c'_0, c''_0, c'_1, c''_1)
 207         punpckhwd \d, \z                // (c'_2, c''_2, c'_3, c''_3)
 208   .endif
 209         pshufd  \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
 210         pshufd  \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
 211   .ifnes "\c", "nil"
 212         pshufd  \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
 213         pshufd  \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
 214   .endif
 215 .endm
 216
 217 .macro  squash  c0, c1, c2, c3, h, t, u
 218         // On entry, C0, C1, C2, C3 are carry registers representing a value
 219         // Y.  On exit, C0 holds the low 128 bits of the carry value; C1, C2,
 220         // C3, T, and U are clobbered; and the high bits of Y are stored in
 221         // H, if this is not `nil'.
 222
 223         // The first step is to eliminate the `double-prime' pieces -- i.e.,
 224         // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
 225         // them into the 32-bit-aligned pieces above and below.  But before
 226         // we can do that, we must gather them together.
 227         movdqa  \t, \c0
 228         movdqa  \u, \c1
 229         punpcklqdq \t, \c2              // (y'_0, y'_2)
 230         punpckhqdq \c0, \c2             // (y''_0, y''_2)
 231         punpcklqdq \u, \c3              // (y'_1, y'_3)
 232         punpckhqdq \c1, \c3             // (y''_1, y''_3)
 233
 234         // Now split the double-prime pieces.  The high (up to) 48 bits will
 235         // go up; the low 16 bits go down.
 236         movdqa  \c2, \c0
 237         movdqa  \c3, \c1
 238         psllq   \c2, 48
 239         psllq   \c3, 48
 240         psrlq   \c0, 16                 // high parts of (y''_0, y''_2)
 241         psrlq   \c1, 16                 // high parts of (y''_1, y''_3)
 242         psrlq   \c2, 32                 // low parts of (y''_0, y''_2)
 243         psrlq   \c3, 32                 // low parts of (y''_1, y''_3)
 244   .ifnes "\h", "nil"
 245         movdqa  \h, \c1
 246   .endif
 247         pslldq  \c1, 8                  // high part of (0, y''_1)
 248
 249         paddq   \t, \c2                 // propagate down
 250         paddq   \u, \c3
 251         paddq   \t, \c1                 // and up: (y_0, y_2)
 252         paddq   \u, \c0                 // (y_1, y_3)
 253   .ifnes "\h", "nil"
 254         psrldq  \h, 8                   // high part of (y''_3, 0)
 255   .endif
 256
 257         // Finally extract the answer.  This complicated dance is better than
 258         // storing to memory and loading, because the piecemeal stores
 259         // inhibit store forwarding.
 260         movdqa  \c3, \t                 // (y_0, y_1)
 261         movdqa  \c0, \t                 // (y^*_0, ?, ?, ?)
 262         psrldq  \t, 8                   // (y_2, 0)
 263         psrlq   \c3, 32                 // (floor(y_0/B), ?)
 264         paddq   \c3, \u                 // (y_1 + floor(y_0/B), ?)
 265         pslldq  \c0, 12                 // (0, 0, 0, y^*_0)
 266         movdqa  \c1, \c3                // (y^*_1, ?, ?, ?)
 267         psrldq  \u, 8                   // (y_3, 0)
 268         psrlq   \c3, 32                 // (floor((y_1 B + y_0)/B^2, ?)
 269         paddq   \c3, \t                 // (y_2 + floor((y_1 B + y_0)/B^2, ?)
 270         pslldq  \c1, 12                 // (0, 0, 0, y^*_1)
 271         psrldq  \c0, 12                 // (y^*_0, 0, 0, 0)
 272         movdqa  \c2, \c3                // (y^*_2, ?, ?, ?)
 273         psrlq   \c3, 32             // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
 274         paddq   \c3, \u       // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
 275         pslldq  \c2, 12                 // (0, 0, 0, y^*_2)
 276         psrldq  \c1, 8                  // (0, y^*_1, 0, 0)
 277         psrldq  \c2, 4                  // (0, 0, y^*_2, 0)
 278   .ifnes "\h", "nil"
 279         movdqu  \t, \c3
 280         pxor    \u, \u
 281   .endif
 282         pslldq  \c3, 12                 // (0, 0, 0, y^*_3)
 283         por     \c0, \c1                // (y^*_0, y^*_1, 0, 0)
 284         por     \c2, \c3                // (0, 0, y^*_2, y^*_3)
 285         por     \c0, \c2                // y mod B^4
 286   .ifnes "\h", "nil"
 287         psrlq   \t, 32                  // very high bits of y
 288         paddq   \h, \t
 289         punpcklqdq \h, \u               // carry up
 290   .endif
 291 .endm
 292
 293 .macro  carryadd
 294         // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
 295         // hold the incoming carry registers c0, c1, and c2 representing a
 296         // carry-in C.
 297         //
 298         // On exit, the carry registers, including XMM7, are updated to hold
 299         // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered.  The other
 300         // registers are preserved.
 301         movd    xmm0, [edi +  0]        // (a_0, 0)
 302         movd    xmm1, [edi +  4]        // (a_1, 0)
 303         movd    xmm2, [edi +  8]        // (a_2, 0)
 304         movd    xmm7, [edi + 12]        // (a_3, 0)
 305         paddq   xmm4, xmm0              // (c'_0 + a_0, c''_0)
 306         paddq   xmm5, xmm1              // (c'_1 + a_1, c''_1)
 307         paddq   xmm6, xmm2              // (c'_2 + a_2, c''_2 + a_3 b)
 308 .endm
 309
 310 ///--------------------------------------------------------------------------
 311 /// Primitive multipliers and related utilities.
 312
 313 INTFUNC(carryprop)
 314         // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
 315         // form.  Store the low 128 bits of the represented carry to [EDI] as
 316         // a packed 128-bit value, and leave the remaining 16 bits in the low
 317         // 32 bits of XMM4.  On exit, XMM3, XMM5 and XMM6 are clobbered.
 318   endprologue
 319
 320         propout [edi +  0], xmm4, xmm5
 321         propout [edi +  4], xmm5, xmm6
 322         propout [edi +  8], xmm6, nil
 323         endprop [edi + 12], xmm6, xmm4
 324         ret
 325
 326 ENDFUNC
 327
 328 INTFUNC(dmul4)
 329         // On entry, EDI points to the destination buffer; EAX and EBX point
 330         // to the packed operands U and X; ECX and EDX point to the expanded
 331         // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
 332         // registers c0, c1, and c2; c3 is assumed to be zero.
 333         //
 334         // On exit, we write the low 128 bits of the sum C + U V + X Y to
 335         // [EDI], and update the carry registers with the carry out.  The
 336         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 337         // general-purpose registers are preserved.
 338   endprologue
 339
 340         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, t
 341         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
 342         propout [edi +  0],      xmm4, xmm5
 343
 344         mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
 345         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, nil
 346         propout [edi +  4],      xmm5, xmm6
 347
 348         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
 349         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, nil
 350         propout [edi +  8],      xmm6, xmm7
 351
 352         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
 353         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
 354         propout [edi + 12],      xmm7, xmm4
 355
 356         ret
 357
 358 ENDFUNC
 359
 360 INTFUNC(dmla4)
 361         // On entry, EDI points to the destination buffer, which also
 362         // contains an addend A to accumulate; EAX and EBX point to the
 363         // packed operands U and X; ECX and EDX point to the expanded
 364         // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
 365         // registers c0, c1, and c2 representing a carry-in C; c3 is assumed
 366         // to be zero.
 367         //
 368         // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
 369         // [EDI], and update the carry registers with the carry out.  The
 370         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 371         // general-purpose registers are preserved.
 372   endprologue
 373
 374         carryadd
 375
 376         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, nil
 377         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
 378         propout [edi +  0],      xmm4, xmm5
 379
 380         mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
 381         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, nil
 382         propout [edi +  4],      xmm5, xmm6
 383
 384         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
 385         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, nil
 386         propout [edi +  8],      xmm6, xmm7
 387
 388         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
 389         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
 390         propout [edi + 12],      xmm7, xmm4
 391
 392         ret
 393
 394 ENDFUNC
 395
 396 INTFUNC(mul4zc)
 397         // On entry, EDI points to the destination buffer; EBX points to a
 398         // packed operand X; and EDX points to an expanded operand Y.
 399         //
 400         // On exit, we write the low 128 bits of the product X Y to [EDI],
 401         // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
 402         // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 403         // general-purpose registers are preserved.
 404   endprologue
 405
 406         mulcore [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
 407         propout [edi +  0],      xmm4, xmm5
 408
 409         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 410         propout [edi +  4],      xmm5, xmm6
 411
 412         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 413         propout [edi +  8],      xmm6, xmm7
 414
 415         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 416         propout [edi + 12],      xmm7, xmm4
 417
 418         ret
 419
 420 ENDFUNC
 421
 422 INTFUNC(mul4)
 423         // On entry, EDI points to the destination buffer; EBX points to a
 424         // packed operand X; EDX points to an expanded operand Y; and XMM4,
 425         // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
 426         // representing a carry-in C; c3 is assumed to be zero.
 427         //
 428         // On exit, we write the low 128 bits of the sum C + X Y to [EDI],
 429         // and update the carry registers with the carry out.  The registers
 430         // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 431         // general-purpose registers are preserved.
 432   endprologue
 433
 434         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, t
 435         propout [edi +  0],      xmm4, xmm5
 436
 437         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 438         propout [edi +  4],      xmm5, xmm6
 439
 440         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 441         propout [edi +  8],      xmm6, xmm7
 442
 443         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 444         propout [edi + 12],      xmm7, xmm4
 445
 446         ret
 447
 448 ENDFUNC
 449
 450 INTFUNC(mla4zc)
 451         // On entry, EDI points to the destination buffer, which also
 452         // contains an addend A to accumulate; EBX points to a packed operand
 453         // X; and EDX points to an expanded operand Y.
 454         //
 455         // On exit, we write the low 128 bits of the sum A + X Y to [EDI],
 456         // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
 457         // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 458         // general-purpose registers are preserved.
 459   endprologue
 460
 461         movd    xmm4, [edi +  0]
 462         movd    xmm5, [edi +  4]
 463         movd    xmm6, [edi +  8]
 464         movd    xmm7, [edi + 12]
 465
 466         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
 467         propout [edi +  0],      xmm4, xmm5
 468
 469         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 470         propout [edi +  4],      xmm5, xmm6
 471
 472         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 473         propout [edi +  8],      xmm6, xmm7
 474
 475         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 476         propout [edi + 12],      xmm7, xmm4
 477
 478         ret
 479
 480 ENDFUNC
 481
 482 INTFUNC(mla4)
 483         // On entry, EDI points to the destination buffer, which also
 484         // contains an addend A to accumulate; EBX points to a packed operand
 485         // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
 486         // the incoming carry registers c0, c1, and c2, representing a
 487         // carry-in C; c3 is assumed to be zero.
 488         //
 489         // On exit, we write the low 128 bits of the sum A + C + X Y to
 490         // [EDI], and update the carry registers with the carry out.  The
 491         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
 492         // general-purpose registers are preserved.
 493   endprologue
 494
 495         carryadd
 496
 497         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
 498         propout [edi +  0],      xmm4, xmm5
 499
 500         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 501         propout [edi +  4],      xmm5, xmm6
 502
 503         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 504         propout [edi +  8],      xmm6, xmm7
 505
 506         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 507         propout [edi + 12],      xmm7, xmm4
 508
 509         ret
 510
 511 ENDFUNC
 512
 513 INTFUNC(mmul4)
 514         // On entry, EDI points to the destination buffer; EAX and EBX point
 515         // to the packed operands U and N; ECX and ESI point to the expanded
 516         // operands V and M; and EDX points to a place to store an expanded
 517         // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
 518         // must be 16-byte aligned.  (This is not the usual convention, which
 519         // requires alignment before the call.)
 520         //
 521         // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
 522         // of the sum U V + N Y to [EDI], leaving the remaining carry in
 523         // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
 524         // XMM7 are clobbered; the general-purpose registers are preserved.
 525         stalloc 48                      // space for the carries
 526   endprologue
 527
 528         // Calculate W = U V, and leave it in the destination.  Stash the
 529         // carry pieces for later.
 530         mulcore [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
 531         propout [edi +  0],      xmm4, xmm5
 532         jmp     5f
 533
 534 ENDFUNC
 535
 536 INTFUNC(mmla4)
 537         // On entry, EDI points to the destination buffer, which also
 538         // contains an addend A to accumulate; EAX and EBX point
 539         // to the packed operands U and N; ECX and ESI point to the expanded
 540         // operands V and M; and EDX points to a place to store an expanded
 541         // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
 542         // must be 16-byte aligned.  (This is not the usual convention, which
 543         // requires alignment before the call.)
 544         //
 545         // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
 546         // bits of the sum A + U V + N Y to [EDI], leaving the remaining
 547         // carry in XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2,
 548         // XMM3, and XMM7 are clobbered; the general-purpose registers are
 549         // preserved.
 550         stalloc 48                      // space for the carries
 551   endprologue
 552
 553         movd    xmm4, [edi +  0]
 554         movd    xmm5, [edi +  4]
 555         movd    xmm6, [edi +  8]
 556         movd    xmm7, [edi + 12]
 557         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, nil
 558         propout [edi +  0],      xmm4, xmm5
 559
 560 5:      mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
 561         propout [edi +  4],      xmm5, xmm6
 562
 563         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
 564         propout [edi +  8],      xmm6, xmm7
 565
 566         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
 567         propout [edi + 12],      xmm7, xmm4
 568
 569         movdqa  [esp +  0], xmm4
 570         movdqa  [esp + 16], xmm5
 571         movdqa  [esp + 32], xmm6
 572
 573         // Calculate Y = W M.
 574         mulcore [edi +  0], esi, xmm4, xmm5, xmm6, xmm7
 575
 576         mulcore [edi +  4], esi, xmm0, xmm1, xmm2, nil
 577         accum                    xmm5, xmm6, xmm7, nil
 578
 579         mulcore [edi +  8], esi, xmm0, xmm1, nil,  nil
 580         accum                    xmm6, xmm7, nil,  nil
 581
 582         mulcore [edi + 12], esi, xmm0, nil,  nil,  nil
 583         accum                    xmm7, nil,  nil,  nil
 584
 585         // That's lots of pieces.  Now we have to assemble the answer.
 586         squash  xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
 587
 588         // Expand it.
 589         pxor    xmm2, xmm2
 590         expand  xmm4, xmm1, nil, nil, xmm2
 591         movdqa  [edx +  0], xmm4
 592         movdqa  [edx + 16], xmm1
 593
 594         // Initialize the carry from the value for W we calculated earlier.
 595         movd    xmm4, [edi +  0]
 596         movd    xmm5, [edi +  4]
 597         movd    xmm6, [edi +  8]
 598         movd    xmm7, [edi + 12]
 599
 600         // Finish the calculation by adding the Montgomery product.
 601         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
 602         propout [edi +  0],      xmm4, xmm5
 603
 604         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 605         propout [edi +  4],      xmm5, xmm6
 606
 607         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 608         propout [edi +  8],      xmm6, xmm7
 609
 610         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 611         propout [edi + 12],      xmm7, xmm4
 612
 613         // Add add on the carry we calculated earlier.
 614         paddq   xmm4, [esp +  0]
 615         paddq   xmm5, [esp + 16]
 616         paddq   xmm6, [esp + 32]
 617
 618         // And, with that, we're done.
 619         stfree  48
 620         ret
 621
 622 ENDFUNC
 623
 624 INTFUNC(mont4)
 625         // On entry, EDI points to the destination buffer holding a packed
 626         // value W; EBX points to a packed operand N; ESI points to an
 627         // expanded operand M; and EDX points to a place to store an expanded
 628         // result Y (32 bytes, at a 16-byte boundary).
 629         //
 630         // On exit, we write Y = W M mod B to [EDX], and the low 128 bits
 631         // of the sum W + N Y to [EDI], leaving the remaining carry in
 632         // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
 633         // XMM7 are clobbered; the general-purpose registers are preserved.
 634   endprologue
 635
 636         // Calculate Y = W M.
 637         mulcore [edi +  0], esi, xmm4, xmm5, xmm6, xmm7
 638
 639         mulcore [edi +  4], esi, xmm0, xmm1, xmm2, nil
 640         accum                    xmm5, xmm6, xmm7, nil
 641
 642         mulcore [edi +  8], esi, xmm0, xmm1, nil,  nil
 643         accum                    xmm6, xmm7, nil,  nil
 644
 645         mulcore [edi + 12], esi, xmm0, nil,  nil,  nil
 646         accum                    xmm7, nil,  nil,  nil
 647
 648         // That's lots of pieces.  Now we have to assemble the answer.
 649         squash  xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
 650
 651         // Expand it.
 652         pxor    xmm2, xmm2
 653         expand  xmm4, xmm1, nil, nil, xmm2
 654         movdqa  [edx +  0], xmm4
 655         movdqa  [edx + 16], xmm1
 656
 657         // Initialize the carry from W.
 658         movd    xmm4, [edi +  0]
 659         movd    xmm5, [edi +  4]
 660         movd    xmm6, [edi +  8]
 661         movd    xmm7, [edi + 12]
 662
 663         // Finish the calculation by adding the Montgomery product.
 664         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
 665         propout [edi +  0],      xmm4, xmm5
 666
 667         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
 668         propout [edi +  4],      xmm5, xmm6
 669
 670         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
 671         propout [edi +  8],      xmm6, xmm7
 672
 673         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
 674         propout [edi + 12],      xmm7, xmm4
 675
 676         // And, with that, we're done.
 677         ret
 678
 679 ENDFUNC
 680
 681 ///--------------------------------------------------------------------------
 682 /// Bulk multipliers.
 683
 684 FUNC(mpx_umul4_x86_sse2)
 685         // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
 686         //                         const mpw *bv, const mpw *bvl);
 687
 688         // Build a stack frame.  Arguments will be relative to EBP, as
 689         // follows.
 690         //
 691         //      ebp + 20        dv
 692         //      ebp + 24        av
 693         //      ebp + 28        avl
 694         //      ebp + 32        bv
 695         //      ebp + 36        bvl
 696         //
 697         // Locals are relative to ESP, as follows.
 698         //
 699         //      esp +  0        expanded Y (32 bytes)
 700         //      esp + 32        (top of locals)
 701         pushreg ebp
 702         pushreg ebx
 703         pushreg esi
 704         pushreg edi
 705         setfp   ebp
 706         and     esp, ~15
 707         sub     esp, 32
 708   endprologue
 709
 710         // Prepare for the first iteration.
 711         mov     esi, [ebp + 32]         // -> bv[0]
 712         pxor    xmm7, xmm7
 713         movdqu  xmm0, [esi]             // bv[0]
 714         mov     edi, [ebp + 20]         // -> dv[0]
 715         mov     ecx, edi                // outer loop dv cursor
 716         expand  xmm0, xmm1, nil, nil, xmm7
 717         mov     ebx, [ebp + 24]         // -> av[0]
 718         mov     eax, [ebp + 28]         // -> av[m] = av limit
 719         mov     edx, esp                // -> expanded Y = bv[0]
 720         movdqa  [esp + 0], xmm0         // bv[0] expanded low
 721         movdqa  [esp + 16], xmm1        // bv[0] expanded high
 722         call    mul4zc
 723         add     ebx, 16
 724         add     edi, 16
 725         add     ecx, 16
 726         add     esi, 16
 727         cmp     ebx, eax                // all done?
 728         jae     8f
 729
 730         .p2align 4
 731         // Continue with the first iteration.
 732 0:      call    mul4
 733         add     ebx, 16
 734         add     edi, 16
 735         cmp     ebx, eax                // all done?
 736         jb      0b
 737
 738         // Write out the leftover carry.  There can be no tail here.
 739 8:      call    carryprop
 740         cmp     esi, [ebp + 36]         // more passes to do?
 741         jae     9f
 742
 743         .p2align 4
 744         // Set up for the next pass.
 745 1:      movdqu  xmm0, [esi]             // bv[i]
 746         mov     edi, ecx                // -> dv[i]
 747         pxor    xmm7, xmm7
 748         expand  xmm0, xmm1, nil, nil, xmm7
 749         mov     ebx, [ebp + 24]         // -> av[0]
 750         movdqa  [esp + 0], xmm0         // bv[i] expanded low
 751         movdqa  [esp + 16], xmm1        // bv[i] expanded high
 752         call    mla4zc
 753         add     edi, 16
 754         add     ebx, 16
 755         add     ecx, 16
 756         add     esi, 16
 757         cmp     ebx, eax                // done yet?
 758         jae     8f
 759
 760         .p2align 4
 761         // Continue...
 762 0:      call    mla4
 763         add     ebx, 16
 764         add     edi, 16
 765         cmp     ebx, eax
 766         jb      0b
 767
 768         // Finish off this pass.  There was no tail on the previous pass, and
 769         // there can be none on this pass.
 770 8:      call    carryprop
 771         cmp     esi, [ebp + 36]
 772         jb      1b
 773
 774         // All over.
 775 9:      dropfp
 776         pop     edi
 777         pop     esi
 778         pop     ebx
 779         pop     ebp
 780         ret
 781
 782 ENDFUNC
 783
 784 FUNC(mpxmont_mul4_x86_sse2)
 785         // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
 786         //                           const mpw *nv, size_t n, const mpw *mi);
 787
 788         // Build a stack frame.  Arguments will be relative to EBP, as
 789         // follows.
 790         //
 791         //      ebp + 20        dv
 792         //      ebp + 24        av
 793         //      ebp + 28        bv
 794         //      ebp + 32        nv
 795         //      ebp + 36        n (nonzero multiple of 4)
 796         //      ebp + 40        mi
 797         //
 798         // Locals are relative to ESP, which is 4 mod 16, as follows.
 799         //
 800         //      esp +   0       outer loop dv
 801         //      esp +   4       outer loop bv
 802         //      esp +   8       av limit (mostly in ESI)
 803         //      esp +  12       expanded V (32 bytes)
 804         //      esp +  44       expanded M (32 bytes)
 805         //      esp +  76       expanded Y (32 bytes)
 806         //      esp + 108       bv limit
 807         //      esp + 112       (gap)
 808         //      esp + 124       (top of locals)
 809         pushreg ebp
 810         pushreg ebx
 811         pushreg esi
 812         pushreg edi
 813         setfp   ebp
 814         and     esp, ~15
 815         sub     esp, 124
 816   endprologue
 817
 818         // Establish the expanded operands.
 819         pxor    xmm7, xmm7
 820         mov     ecx, [ebp + 28]         // -> bv
 821         mov     edx, [ebp + 40]         // -> mi
 822         movdqu  xmm0, [ecx]             // bv[0]
 823         movdqu  xmm2, [edx]             // mi
 824         expand  xmm0, xmm1, xmm2, xmm3, xmm7
 825         movdqa  [esp + 12], xmm0        // bv[0] expanded low
 826         movdqa  [esp + 28], xmm1        // bv[0] expanded high
 827         movdqa  [esp + 44], xmm2        // mi expanded low
 828         movdqa  [esp + 60], xmm3        // mi expanded high
 829
 830         // Set up the outer loop state and prepare for the first iteration.
 831         mov     edx, [ebp + 36]         // n
 832         mov     eax, [ebp + 24]         // -> U = av[0]
 833         mov     ebx, [ebp + 32]         // -> X = nv[0]
 834         mov     edi, [ebp + 20]         // -> Z = dv[0]
 835         mov     [esp + 4], ecx
 836         lea     ecx, [ecx + 4*edx]      // -> bv[n/4] = bv limit
 837         lea     edx, [eax + 4*edx]      // -> av[n/4] = av limit
 838         mov     [esp + 0], edi
 839         mov     [esp + 108], ecx
 840         mov     [esp + 8], edx
 841         lea     ecx, [esp + 12]         // -> expanded V = bv[0]
 842         lea     esi, [esp + 44]         // -> expanded M = mi
 843         lea     edx, [esp + 76]         // -> space for Y
 844         call    mmul4
 845         mov     esi, [esp + 8]          // recover av limit
 846         add     edi, 16
 847         add     eax, 16
 848         add     ebx, 16
 849         cmp     eax, esi                // done already?
 850         jae     8f
 851         mov     [esp + 0], edi
 852
 853         .p2align 4
 854         // Complete the first inner loop.
 855 0:      call    dmul4
 856         add     edi, 16
 857         add     eax, 16
 858         add     ebx, 16
 859         cmp     eax, esi                // done yet?
 860         jb      0b
 861
 862         // Still have carries left to propagate.
 863         call    carryprop
 864         movd    [edi + 16], xmm4
 865
 866         .p2align 4
 867         // Embark on the next iteration.  (There must be one.  If n = 1, then
 868         // we would have bailed above, to label 8.  Similarly, the subsequent
 869         // iterations can fall into the inner loop immediately.)
 870 1:      mov     eax, [esp + 4]          // -> bv[i - 1]
 871         mov     edi, [esp + 0]          // -> Z = dv[i]
 872         add     eax, 16                 // -> bv[i]
 873         pxor    xmm7, xmm7
 874         movdqu  xmm0, [eax]             // bv[i]
 875         mov     [esp + 4], eax
 876         cmp     eax, [esp + 108]        // done yet?
 877         jae     9f
 878         mov     ebx, [ebp + 32]         // -> X = nv[0]
 879         lea     esi, [esp + 44]         // -> expanded M = mi
 880         mov     eax, [ebp + 24]         // -> U = av[0]
 881         expand  xmm0, xmm1, nil, nil, xmm7
 882         movdqa  [esp + 12], xmm0        // bv[i] expanded low
 883         movdqa  [esp + 28], xmm1        // bv[i] expanded high
 884         call    mmla4
 885         mov     esi, [esp + 8]          // recover av limit
 886         add     edi, 16
 887         add     eax, 16
 888         add     ebx, 16
 889         mov     [esp + 0], edi
 890
 891         .p2align 4
 892         // Complete the next inner loop.
 893 0:      call    dmla4
 894         add     edi, 16
 895         add     eax, 16
 896         add     ebx, 16
 897         cmp     eax, esi
 898         jb      0b
 899
 900         // Still have carries left to propagate, and they overlap the
 901         // previous iteration's final tail, so read that in and add it.
 902         movd    xmm0, [edi]
 903         paddq   xmm4, xmm0
 904         call    carryprop
 905         movd    [edi + 16], xmm4
 906
 907         // Back again.
 908         jmp     1b
 909
 910         // First iteration was short.  Write out the carries and we're done.
 911         // (This could be folded into the main loop structure, but that would
 912         // penalize small numbers more.)
 913 8:      call    carryprop
 914         movd    [edi + 16], xmm4
 915
 916         // All done.
 917 9:      dropfp
 918         popreg  edi
 919         popreg  esi
 920         popreg  ebx
 921         popreg  ebp
 922         ret
 923
 924 ENDFUNC
 925
 926 FUNC(mpxmont_redc4_x86_sse2)
 927         // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
 928         //                             size_t n, const mpw *mi);
 929
 930         // Build a stack frame.  Arguments will be relative to EBP, as
 931         // follows.
 932         //
 933         //      ebp + 20        dv
 934         //      ebp + 24        dvl
 935         //      ebp + 28        nv
 936         //      ebp + 32        n (nonzero multiple of 4)
 937         //      ebp + 36        mi
 938         //
 939         // Locals are relative to ESP, as follows.
 940         //
 941         //      esp +  0        outer loop dv
 942         //      esp +  4        outer dv limit
 943         //      esp +  8        blocks-of-4 dv limit
 944         //      esp + 12        expanded M (32 bytes)
 945         //      esp + 44        expanded Y (32 bytes)
 946         //      esp + 76        (top of locals)
 947         pushreg ebp
 948         pushreg ebx
 949         pushreg esi
 950         pushreg edi
 951         setfp   ebp
 952         and     esp, ~15
 953         sub     esp, 76
 954   endprologue
 955
 956         // Establish the expanded operands and the blocks-of-4 dv limit.
 957         mov     edi, [ebp + 20]         // -> Z = dv[0]
 958         pxor    xmm7, xmm7
 959         mov     eax, [ebp + 24]         // -> dv[n] = dv limit
 960         sub     eax, edi                // length of dv in bytes
 961         mov     edx, [ebp + 36]         // -> mi
 962         movdqu  xmm0, [edx]             // mi
 963         and     eax, ~15                // mask off the tail end
 964         expand  xmm0, xmm1, nil, nil, xmm7
 965         add     eax, edi                // find limit
 966         movdqa  [esp + 12], xmm0        // mi expanded low
 967         movdqa  [esp + 28], xmm1        // mi expanded high
 968         mov     [esp + 8], eax
 969
 970         // Set up the outer loop state and prepare for the first iteration.
 971         mov     ecx, [ebp + 32]         // n
 972         mov     ebx, [ebp + 28]         // -> X = nv[0]
 973         lea     edx, [edi + 4*ecx]      // -> dv[n/4] = outer dv limit
 974         lea     ecx, [ebx + 4*ecx]      // -> nv[n/4] = nv limit
 975         mov     [esp + 0], edi
 976         mov     [esp + 4], edx
 977         lea     esi, [esp + 12]         // -> expanded M = mi
 978         lea     edx, [esp + 44]         // -> space for Y
 979         call    mont4
 980         add     edi, 16
 981         add     ebx, 16
 982         cmp     ebx, ecx                // done already?
 983         jae     8f
 984
 985         .p2align 4
 986         // Complete the first inner loop.
 987 5:      call    mla4
 988         add     ebx, 16
 989         add     edi, 16
 990         cmp     ebx, ecx                // done yet?
 991         jb      5b
 992
 993         // Still have carries left to propagate.
 994 8:      carryadd
 995         mov     esi, [esp + 8]          // -> dv blocks limit
 996         mov     edx, [ebp + 24]         // dv limit
 997         psllq   xmm7, 16
 998         pslldq  xmm7, 8
 999         paddq   xmm6, xmm7
1000         call    carryprop
1001         movd    eax, xmm4
1002         add     edi, 16
1003         cmp     edi, esi
1004         jae     7f
1005
1006         .p2align 4
1007         // Continue carry propagation until the end of the buffer.
1008 0:      add     [edi], eax
1009         mov     eax, 0                  // preserves flags
1010         adcd    [edi + 4], 0
1011         adcd    [edi + 8], 0
1012         adcd    [edi + 12], 0
1013         adc     eax, 0
1014         add     edi, 16
1015         cmp     edi, esi
1016         jb      0b
1017
1018         // Deal with the tail end.
1019 7:      add     [edi], eax
1020         mov     eax, 0                  // preserves flags
1021         add     edi, 4
1022         adc     eax, 0
1023         cmp     edi, edx
1024         jb      7b
1025
1026         // All done for this iteration.  Start the next.  (This must have at
1027         // least one follow-on iteration, or we'd not have started this outer
1028         // loop.)
1029 8:      mov     edi, [esp + 0]          // -> dv[i - 1]
1030         mov     ebx, [ebp + 28]         // -> X = nv[0]
1031         lea     edx, [esp + 44]         // -> space for Y
1032         lea     esi, [esp + 12]         // -> expanded M = mi
1033         add     edi, 16                 // -> Z = dv[i]
1034         cmp     edi, [esp + 4]          // all done yet?
1035         jae     9f
1036         mov     [esp + 0], edi
1037         call    mont4
1038         add     edi, 16
1039         add     ebx, 16
1040         jmp     5b
1041
1042         // All over.
1043 9:      dropfp
1044         popreg  edi
1045         popreg  esi
1046         popreg  ebx
1047         popreg  ebp
1048         ret
1049
1050 ENDFUNC
1051
1052 ///--------------------------------------------------------------------------
1053 /// Testing and performance measurement.
1054
1055 #ifdef TEST_MUL4
1056
.macro  cysetup c
        // Sample the time-stamp counter and park the 64-bit reading in the
        // two words at address C.  Clobbers EAX and EDX.
        rdtsc
        mov     [\c + 4], edx           // high half
        mov     [\c], eax               // low half
.endm
1062
.macro  cystore c, v, n
        // Sample the time-stamp counter again, subtract the reading saved
        // at C by `cysetup', and file the 64-bit difference in the vector
        // whose address is held at V.  The counter at N is decremented
        // first and then used as the (eight-byte) slot index.
        //
        // Clobbers EAX, EBX, ECX and EDX.  The flags on exit are those
        // left by the `dec': the `mov's after it don't disturb them, so
        // the zero flag says whether the counter has run out.
        rdtsc
        sub     eax, [\c]
        sbb     edx, [\c + 4]           // edx:eax = elapsed count
        mov     ecx, [\n]
        mov     ebx, [\v]
        dec     ecx                     // take the next slot down
        mov     [\n], ecx
        mov     [ebx + ecx*8 + 4], edx
        mov     [ebx + ecx*8], eax
.endm
1074
.macro  testprologue
        // Common entry sequence for the test functions: save the
        // registers which the tests use, establish EBP as the frame
        // pointer, and carve out an aligned block of locals.
        //
        // ESP is left at 4 mod 16, so that the 32-byte buffers below sit
        // on 16-byte boundaries and can be accessed with `movdqa'.  The
        // offsets are those used by `testexpand', `testtop' and the test
        // functions themselves.
        //
        //      esp +  0        cycle counter scratch
        //      esp + 12        first expanded operand (32 bytes)
        //      esp + 44        second expanded operand (32 bytes)
        //      esp + 76        third expanded operand (32 bytes)
        pushreg ebp
        pushreg ebx
        pushreg esi
        pushreg edi
        setfp   ebp
        and     esp, -16
        sub     esp, 12 + 3*32
  endprologue
.endm
1090
.macro  testepilogue
        // Common exit sequence for the test functions: drop the frame set
        // up by `testprologue', restore the saved registers in the reverse
        // of the order in which they were pushed, and return.
        dropfp
        popreg  edi
        popreg  esi
        popreg  ebx
        popreg  ebp
        ret
.endm
1099
.macro  testldcarry c
        // Fetch the pointer given by the operand C and load the 48 bytes
        // it addresses into the carry registers XMM4, XMM5 and XMM6.
        // Unaligned loads are used; ECX is left pointing at the buffer.
        mov     ecx, \c                 // -> c
        movdqu  xmm6, [ecx + 32]        // (c'_2, c''_2)
        movdqu  xmm5, [ecx + 16]        // (c'_1, c''_1)
        movdqu  xmm4, [ecx +  0]        // (c'_0, c''_0)
.endm
1106
.macro  testexpand v, y
        // Load and expand up to two 128-bit operands into the local
        // buffers.  V and Y are operands yielding pointers to the packed
        // values; either may be `nil' to skip it.  V's expansion goes to
        // [esp + 12] and Y's to [esp + 44].
        //
        // XMM7 is zeroed for the benefit of `expand'.  Clobbers ECX and
        // XMM0, XMM1 (if V is given), and EDX and XMM2, XMM3 (if Y is).
        pxor    xmm7, xmm7

  .ifnes "\v", "nil"
        mov     ecx, \v
        movdqu  xmm0, [ecx]
        expand  xmm0, xmm1, nil, nil, xmm7
        movdqa  [esp + 28], xmm1        // high half
        movdqa  [esp + 12], xmm0        // low half
  .endif

  .ifnes "\y", "nil"
        mov     edx, \y
        movdqu  xmm2, [edx]
        expand  xmm2, xmm3, nil, nil, xmm7
        movdqa  [esp + 60], xmm3        // high half
        movdqa  [esp + 44], xmm2        // low half
  .endif
.endm
1124
.macro  testtop u, x, mode
        // Head of the timing loop.  Defines the local label `0', which
        // `testtail' branches back to, loads the pointer registers for the
        // routine under test, and starts the cycle count.
        //
        //      ebx             always loaded from X
        //      ecx             -> [esp + 12], unless U is `nil'
        //      eax             loaded from U, unless U is `nil'
        //      esi             -> [esp + 44], only if MODE is `mont'
        //      edx             -> [esp + 76] if MODE is `mont',
        //                        otherwise -> [esp + 44]
        //
        // EAX and EDX are loaded after `cysetup' because it clobbers
        // them.
        .p2align 4
0:
        mov     ebx, \x
  .ifnes "\u", "nil"
        lea     ecx, [esp + 12]
  .endif
  .ifeqs "\mode", "mont"
        lea     esi, [esp + 44]
  .endif
        cysetup esp + 0
  .ifnes "\u", "nil"
        mov     eax, \u
  .endif
  .ifeqs "\mode", "mont"
        lea     edx, [esp + 76]
  .else
        lea     edx, [esp + 44]
  .endif
.endm
1145
.macro  testtail cyv, n
        // Foot of the timing loop.  Record the cycles taken since the
        // matching `testtop' in the vector named by CYV, decrementing the
        // counter N, and go round again (to label `0') until the counter
        // reaches zero.  The branch tests the zero flag left by
        // `cystore'.
        cystore esp + 0, \cyv, \n
        jnz     0b
.endm
1150
.macro  testcarryout c
        // Fetch the pointer given by the operand C and store the carry
        // registers XMM4, XMM5 and XMM6 into the 48 bytes it addresses.
        // Unaligned stores are used; ECX is left pointing at the buffer.
        mov     ecx, \c
        movdqu  [ecx + 32], xmm6
        movdqu  [ecx + 16], xmm5
        movdqu  [ecx +  0], xmm4
.endm
1157
FUNC(test_dmul4)
        // Timing harness for `dmul4'.  Stack arguments, by the role each
        // plays in the macros below:
        //
        //      ebp + 20        destination, passed in EDI
        //      ebp + 24        -> carry block, loaded before and stored
        //                        after the loop
        //      ebp + 28        first operand, passed in EAX
        //      ebp + 32        second operand, passed in EBX
        //      ebp + 36        -> value expanded to [esp + 12]
        //      ebp + 40        -> value expanded to [esp + 44]
        //      ebp + 44        iteration counter
        //      ebp + 48        -> vector of cycle counts
        testprologue
        testldcarry [ebp + 24]
        testexpand [ebp + 36], [ebp + 40]
        mov     edi, [ebp + 20]

        // Timed loop.
        testtop [ebp + 28], [ebp + 32]
        call    dmul4
        testtail [ebp + 48], [ebp + 44]

        // Hand back the carries and leave.
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC
1169
FUNC(test_dmla4)
        // Timing harness for `dmla4'.  Stack arguments, by the role each
        // plays in the macros below:
        //
        //      ebp + 20        destination, passed in EDI
        //      ebp + 24        -> carry block, loaded before and stored
        //                        after the loop
        //      ebp + 28        first operand, passed in EAX
        //      ebp + 32        second operand, passed in EBX
        //      ebp + 36        -> value expanded to [esp + 12]
        //      ebp + 40        -> value expanded to [esp + 44]
        //      ebp + 44        iteration counter
        //      ebp + 48        -> vector of cycle counts
        testprologue
        testldcarry [ebp + 24]
        testexpand [ebp + 36], [ebp + 40]
        mov     edi, [ebp + 20]

        // Timed loop.
        testtop [ebp + 28], [ebp + 32]
        call    dmla4
        testtail [ebp + 48], [ebp + 44]

        // Hand back the carries and leave.
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC
1181
FUNC(test_mul4)
        // Timing harness for `mul4'.  Stack arguments, by the role each
        // plays in the macros below:
        //
        //      ebp + 20        destination, passed in EDI
        //      ebp + 24        -> carry block, loaded before and stored
        //                        after the loop
        //      ebp + 28        operand, passed in EBX
        //      ebp + 32        -> value expanded to [esp + 44]
        //      ebp + 36        iteration counter
        //      ebp + 40        -> vector of cycle counts
        testprologue
        testldcarry [ebp + 24]
        testexpand nil, [ebp + 32]
        mov     edi, [ebp + 20]

        // Timed loop.
        testtop nil, [ebp + 28]
        call    mul4
        testtail [ebp + 40], [ebp + 36]

        // Hand back the carries and leave.
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC
1193
FUNC(test_mla4)
        // Timing harness for `mla4'.  Stack arguments, by the role each
        // plays in the macros below:
        //
        //      ebp + 20        destination, passed in EDI
        //      ebp + 24        -> carry block, loaded before and stored
        //                        after the loop
        //      ebp + 28        operand, passed in EBX
        //      ebp + 32        -> value expanded to [esp + 44]
        //      ebp + 36        iteration counter
        //      ebp + 40        -> vector of cycle counts
        testprologue
        testldcarry [ebp + 24]
        testexpand nil, [ebp + 32]
        mov     edi, [ebp + 20]

        // Timed loop.
        testtop nil, [ebp + 28]
        call    mla4
        testtail [ebp + 40], [ebp + 36]

        // Hand back the carries and leave.
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC
1205
FUNC(test_mmul4)
        // Timing harness for `mmul4'.  Stack arguments, by the role each
        // plays in the code below:
        //
        //      ebp + 20        destination, passed in EDI
        //      ebp + 24        -> carry block, stored after the loop
        //      ebp + 28        -> 32-byte buffer to receive the contents
        //                        of [esp + 76]
        //      ebp + 32        first operand, passed in EAX
        //      ebp + 36        second operand, passed in EBX
        //      ebp + 40        -> value expanded to [esp + 12]
        //      ebp + 44        -> value expanded to [esp + 44]
        //      ebp + 48        iteration counter
        //      ebp + 52        -> vector of cycle counts
        testprologue
        testexpand [ebp + 40], [ebp + 44]
        mov     edi, [ebp + 20]

        // Timed loop.
        testtop [ebp + 32], [ebp + 36], mont
        call    mmul4
        testtail [ebp + 52], [ebp + 48]

        // Copy out the 32 bytes at [esp + 76], which is where EDX pointed
        // during the call.
        movdqa  xmm0, [esp + 76]
        movdqa  xmm1, [esp + 92]
        mov     edi, [ebp + 28]
        movdqu  [edi], xmm0
        movdqu  [edi + 16], xmm1

        // Hand back the carries and leave.
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC
1221
FUNC(test_mmla4)
        // Timing harness for `mmla4'.  Stack arguments, by the role each
        // plays in the code below:
        //
        //      ebp + 20        destination, passed in EDI
        //      ebp + 24        -> carry block, stored after the loop
        //      ebp + 28        -> 32-byte buffer to receive the contents
        //                        of [esp + 76]
        //      ebp + 32        first operand, passed in EAX
        //      ebp + 36        second operand, passed in EBX
        //      ebp + 40        -> value expanded to [esp + 12]
        //      ebp + 44        -> value expanded to [esp + 44]
        //      ebp + 48        iteration counter
        //      ebp + 52        -> vector of cycle counts
        testprologue
        testexpand [ebp + 40], [ebp + 44]
        mov     edi, [ebp + 20]

        // Timed loop.
        testtop [ebp + 32], [ebp + 36], mont
        call    mmla4
        testtail [ebp + 52], [ebp + 48]

        // Copy out the 32 bytes at [esp + 76], which is where EDX pointed
        // during the call.
        movdqa  xmm0, [esp + 76]
        movdqa  xmm1, [esp + 92]
        mov     edi, [ebp + 28]
        movdqu  [edi], xmm0
        movdqu  [edi + 16], xmm1

        // Hand back the carries and leave.
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC
1237
FUNC(test_mont4)
        // Timing harness for `mont4'.  Stack arguments, by the role each
        // plays in the code below:
        //
        //      ebp + 20        destination, passed in EDI
        //      ebp + 24        -> carry block, stored after the loop
        //      ebp + 28        -> 32-byte buffer to receive the contents
        //                        of [esp + 76]
        //      ebp + 32        operand, passed in EBX
        //      ebp + 36        -> value expanded to [esp + 44]
        //      ebp + 40        iteration counter
        //      ebp + 44        -> vector of cycle counts
        testprologue
        testexpand nil, [ebp + 36]
        mov     edi, [ebp + 20]

        // Timed loop.
        testtop nil, [ebp + 32], mont
        call    mont4
        testtail [ebp + 44], [ebp + 40]

        // Copy out the 32 bytes at [esp + 76], which is where EDX pointed
        // during the call.
        movdqa  xmm0, [esp + 76]
        movdqa  xmm1, [esp + 92]
        mov     edi, [ebp + 28]
        movdqu  [edi], xmm0
        movdqu  [edi + 16], xmm1

        // Hand back the carries and leave.
        testcarryout [ebp + 24]
        testepilogue
ENDFUNC
1253
1254 #endif
1255
1256 ///----- That's all, folks --------------------------------------------------