mdw@git.distorted.org.uk Git - catacomb/blob - math/mpx-mul4-x86-sse2.S

   1 /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*-
   2 ///
   3 /// Large SIMD-based multiplications
   4 ///
   5 /// (c) 2016 Straylight/Edgeware
   6
   7 ///----- Licensing notice ---------------------------------------------------
   8 ///
   9 /// This file is part of Catacomb.
  10 ///
  11 /// Catacomb is free software; you can redistribute it and/or modify
  12 /// it under the terms of the GNU Library General Public License as
  13 /// published by the Free Software Foundation; either version 2 of the
  14 /// License, or (at your option) any later version.
  15 ///
  16 /// Catacomb is distributed in the hope that it will be useful,
  17 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 /// GNU Library General Public License for more details.
  20 ///
  21 /// You should have received a copy of the GNU Library General Public
  22 /// License along with Catacomb; if not, write to the Free
  23 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  24 /// MA 02111-1307, USA.
  25
  26 ///--------------------------------------------------------------------------
  27 /// External definitions.
  28
  29 #include "config.h"
  30 #include "asm-common.h"
  31
  32 ///--------------------------------------------------------------------------
  33 /// Prologue.
  34
  35         .arch   pentium4
  36         .text
  37
  38 ///--------------------------------------------------------------------------
  39 /// Theory.
  40 ///
  41 /// We define a number of primitive fixed-size multipliers from which we can
  42 /// construct more general variable-length multipliers.
  43 ///
  44 /// The basic trick is the same throughout.  In an operand-scanning
  45 /// multiplication, the inner multiplication loop multiplies a
  46 /// multiple-precision operand by a single precision factor, and adds the
  47 /// result, appropriately shifted, to the result.  A `finely integrated
  48 /// operand scanning' implementation of Montgomery multiplication also adds
  49 /// the product of a single-precision `Montgomery factor' and the modulus,
  50 /// calculated in the same pass.  The more common `coarsely integrated
  51 /// operand scanning' alternates main multiplication and Montgomery passes,
  52 /// which requires additional carry propagation.
  53 ///
  54 /// Throughout both plain-multiplication and Montgomery stages, then, one of
  55 /// the factors remains constant throughout the operation, so we can afford
  56 /// to take a little time to preprocess it.  The transformation we perform is
  57 /// as follows.  Let b = 2^16, and B = b^2 = 2^32.  Suppose we're given a
  58 /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3.  Split each v_i into
  59 /// two sixteen-bit pieces, so v_i = v'_i + v''_i b.  These eight 16-bit
  60 /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
  61 /// operands, as follows.
  62 ///
  63 ///     Offset     0       4        8      12
  64 ///        0    v'_0    v'_1    v''_0   v''_1
  65 ///       16    v'_2    v'_3    v''_2   v''_3
  66 ///
   67 /// A `pmuludq' instruction ignores the odd positions in its operands; thus,
  68 /// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
  69 /// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
  70 /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
  71 /// results in 64-bit fields.  The sixteen bits of headroom allows us to add
  72 /// many products together before we must deal with carrying; it also allows
  73 /// for some calculations to be performed on the above expanded form.
  74 ///
  75 /// On 32-bit x86, we are register starved: the expanded operands are kept in
  76 /// memory, typically in warm L1 cache.
  77 ///
  78 /// We maintain four `carry' registers accumulating intermediate results.
  79 /// The registers' precise roles rotate during the computation; we name them
  80 /// `c0', `c1', `c2', and `c3'.  Each carry register holds two 64-bit halves:
  81 /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
  82 /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
  83 /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
   84 /// `pmuludq' instruction acting on a scalar operand (broadcast across all
  85 /// lanes of its vector) and an operand in the expanded form above produces a
  86 /// result which can be added directly to the appropriate carry register.
  87 /// Following a pass of four multiplications, we perform some limited carry
  88 /// propagation: let t = c''_0 mod B, and let d = c'_0 + t b; then we output
  89 /// z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and cycle the carry
   90 /// registers around, so that c1 becomes c0, and the old c0 is (implicitly)
   91 /// zeroed and becomes c3.
  92
  93 ///--------------------------------------------------------------------------
  94 /// Macro definitions.
  95
  96 .macro  mulcore r, s, d0, d1, d2, d3
  97         // Load a word r_i from R, multiply by the expanded operand [S], and
  98         // leave the pieces of the product in registers D0, D1, D2, D3.
  99         movd    \d0, \r                 // (r_i, 0, 0, 0)
 100   .ifnes "\d1", "nil"
 101         movdqa  \d1, [\s]               // (s'_0, s'_1, s''_0, s''_1)
 102   .endif
 103   .ifnes "\d3", "nil"
 104         movdqa  \d3, [\s + 16]          // (s'_2, s'_3, s''_2, s''_3)
 105   .endif
 106         pshufd  \d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?, r_i, ?)
 107   .ifnes "\d1", "nil"
 108         psrldq  \d1, 4                  // (s'_1, s''_0, s''_1, 0)
 109   .endif
 110   .ifnes "\d2", "nil"
 111     .ifnes "\d3", "nil"
 112         movdqa  \d2, \d3                // another copy of (s'_2, s'_3, ...)
 113     .else
 114         movdqa  \d2, \d0                // another copy of (r_i, ?, r_i, ?)
 115     .endif
 116   .endif
 117   .ifnes "\d3", "nil"
 118         psrldq  \d3, 4                  // (s'_3, s''_2, s''_3, 0)
 119   .endif
 120   .ifnes "\d1", "nil"
 121         pmuludqd \d1, \d0               // (r_i s'_1, r_i s''_1)
 122   .endif
 123   .ifnes "\d3", "nil"
 124         pmuludqd \d3, \d0               // (r_i s'_3, r_i s''_3)
 125   .endif
 126   .ifnes "\d2", "nil"
 127     .ifnes "\d3", "nil"
 128         pmuludqd \d2, \d0               // (r_i s'_2, r_i s''_2)
 129     .else
 130         pmuludqd \d2, [\s + 16]
 131     .endif
 132   .endif
 133         pmuludqd \d0, [\s]              // (r_i s'_0, r_i s''_0)
 134 .endm
 135
  136 .macro  accum   c0, c1, c2, c3
             // Add the product pieces in XMM0, XMM1, XMM2, and XMM3 into the
             // carry registers C0, C1, C2, and C3 respectively, as pairs of
             // 64-bit lanes.  C0 is always updated; any of C1, C2, C3 may be
             // `nil', in which case the corresponding addition is omitted.
  137         paddq   \c0, xmm0
  138   .ifnes "\c1", "nil"
  139         paddq   \c1, xmm1
  140   .endif
  141   .ifnes "\c2", "nil"
  142         paddq   \c2, xmm2
  143   .endif
  144   .ifnes "\c3", "nil"
  145         paddq   \c3, xmm3
  146   .endif
  147 .endm
 148
  149 .macro  mulacc  r, s, c0, c1, c2, c3, z3p
  150         // Load a word r_i from R, multiply by the expanded operand [S],
  151         // and accumulate in carry registers C0, C1, C2, C3.  If Z3P is `t'
  152         // then C3 notionally contains zero, but needs clearing; in practice,
  153         // we store the product directly rather than attempting to add.  On
  154         // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P
  155         // is not `t'.
  156   .ifeqs "\z3p", "t"
             // C3 is treated as zero: have `mulcore' write the top piece straight
             // into C3, and accumulate only the lower three pieces.
  157         mulcore \r, \s, xmm0, xmm1, xmm2, \c3
  158         accum           \c0,  \c1,  \c2,  nil
  159   .else
             // C3 is live: compute all four pieces in scratch registers and add
             // each of them into its carry register.
  160         mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
  161         accum           \c0,  \c1,  \c2,  \c3
  162   .endif
  163 .endm
 164
  165 .macro  propout d, c, cc
  166         // Calculate an output word from C, and store it in D; propagate
  167         // carries out from C to CC in preparation for a rotation of the
  168         // carry registers.  On completion, XMM3 is clobbered.  If CC is
  169         // `nil', then the contribution which would have been added to it is
  170         // left in C.
             //
             // D is a 32-bit destination operand.  C itself is modified: both of
             // its 64-bit lanes are shifted right by 32 bits after the output
             // word has been written.
  171         pshufd  xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
  172         psrldq  xmm3, 12                // (t, 0, 0, 0) = (t, 0)
  173         pslldq  xmm3, 2                 // (t b, 0)
  174         paddq   \c, xmm3                // (c' + t b, c'')
  175         movd    \d, \c                  // output low 32 bits of c' + t b
  176         psrlq   \c, 32                  // floor(c/B)
  177   .ifnes "\cc", "nil"
  178         paddq   \cc, \c                 // propagate up
  179   .endif
  180 .endm
 181
  182 .macro  endprop d, c, t
  183         // On entry, C contains a carry register.  On exit, the low 32 bits
  184         // of the value represented in C are written to D, and the remaining
  185         // bits are left at the bottom of T.
             //
             // D is a 32-bit destination operand.  C is clobbered in the process
             // (it is shifted in place), and T must be a different register
             // from C.
  186         movdqa  \t, \c                  // working copy: (c', c'')
  187         psllq   \t, 16                  // (?, c'' b)
  188         pslldq  \c, 8                   // (0, c')
  189         paddq   \t, \c                  // (?, c' + c'' b)
  190         psrldq  \t, 8                   // c' + c'' b
  191         movd    \d, \t                  // output the low 32 bits
  192         psrldq  \t, 4                   // floor((c' + c'' b)/B)
  193 .endm
 194
  195 .macro  expand  a, b, c, d, z
  196         // On entry, A and C hold packed 128-bit values, and Z is zero.  On
  197         // exit, A:B and C:D together hold the same values in expanded
  198         // form.  If C is `nil', then only expand A to A:B.
             //
             // In expanded form, A holds the 16-bit pieces of the low two words
             // and B those of the high two words, each piece zero-extended into
             // a 32-bit cell (and similarly for C and D).  Z is only read: it
             // supplies the zero halves interleaved by the unpack instructions.
             // B (and D, unless C is `nil') are overwritten.
  199         movdqa  \b, \a                  // (a_0, a_1, a_2, a_3)
  200   .ifnes "\c", "nil"
  201         movdqa  \d, \c                  // (c_0, c_1, c_2, c_3)
  202   .endif
  203         punpcklwd \a, \z                // (a'_0, a''_0, a'_1, a''_1)
  204         punpckhwd \b, \z                // (a'_2, a''_2, a'_3, a''_3)
  205   .ifnes "\c", "nil"
  206         punpcklwd \c, \z                // (c'_0, c''_0, c'_1, c''_1)
  207         punpckhwd \d, \z                // (c'_2, c''_2, c'_3, c''_3)
  208   .endif
  209         pshufd  \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
  210         pshufd  \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
  211   .ifnes "\c", "nil"
  212         pshufd  \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
  213         pshufd  \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
  214   .endif
  215 .endm
 216
  217 .macro  squash  c0, c1, c2, c3, h, t, u
  218         // On entry, C0, C1, C2, C3 are carry registers representing a value
  219         // Y.  On exit, C0 holds the low 128 bits of the carry value; C1, C2,
  220         // C3, T, and U are clobbered; and the high bits of Y are stored in
  221         // H, if this is not `nil'.
             //
             // T and U are scratch registers.  If H is `nil', the bits of Y above
             // the low 128 are simply discarded.
  222
  223         // The first step is to eliminate the `double-prime' pieces -- i.e.,
  224         // the ones offset by 16 bits from a 32-bit boundary -- by carrying
  225         // them into the 32-bit-aligned pieces above and below.  But before
  226         // we can do that, we must gather them together.
  227         movdqa  \t, \c0
  228         movdqa  \u, \c1
  229         punpcklqdq \t, \c2              // (y'_0, y'_2)
  230         punpckhqdq \c0, \c2             // (y''_0, y''_2)
  231         punpcklqdq \u, \c3              // (y'_1, y'_3)
  232         punpckhqdq \c1, \c3             // (y''_1, y''_3)
  233
  234         // Now split the double-prime pieces.  The high (up to) 48 bits will
  235         // go up; the low 16 bits go down.
  236         movdqa  \c2, \c0
  237         movdqa  \c3, \c1
  238         psllq   \c2, 48                 // keep only the low 16 bits...
  239         psllq   \c3, 48
  240         psrlq   \c0, 16                 // high parts of (y''_0, y''_2)
  241         psrlq   \c1, 16                 // high parts of (y''_1, y''_3)
  242         psrlq   \c2, 32                 // low parts of (y''_0, y''_2)
  243         psrlq   \c3, 32                 // low parts of (y''_1, y''_3)
  244   .ifnes "\h", "nil"
  245         movdqa  \h, \c1                 // save high part of y''_3 for H
  246   .endif
  247         pslldq  \c1, 8                  // high part of (0, y''_1)
  248
  249         paddq   \t, \c2                 // propagate down
  250         paddq   \u, \c3
  251         paddq   \t, \c1                 // and up: (y_0, y_2)
  252         paddq   \u, \c0                 // (y_1, y_3)
  253   .ifnes "\h", "nil"
  254         psrldq  \h, 8                   // high part of (y''_3, 0)
  255   .endif
  256
  257         // Finally extract the answer.  This complicated dance is better than
  258         // storing to memory and loading, because the piecemeal stores
  259         // inhibit store forwarding.
  260         movdqa  \c3, \t                 // (y_0, y_2)
  261         movdqa  \c0, \t                 // (y^*_0, ?, ?, ?)
  262         psrldq  \t, 8                   // (y_2, 0)
  263         psrlq   \c3, 32                 // (floor(y_0/B), ?)
  264         paddq   \c3, \u                 // (y_1 + floor(y_0/B), ?)
  265         pslldq  \c0, 12                 // (0, 0, 0, y^*_0)
  266         movdqa  \c1, \c3                // (y^*_1, ?, ?, ?)
  267         psrldq  \u, 8                   // (y_3, 0)
  268         psrlq   \c3, 32                 // (floor((y_1 B + y_0)/B^2), ?)
  269         paddq   \c3, \t                 // (y_2 + floor((y_1 B + y_0)/B^2), ?)
  270         pslldq  \c1, 12                 // (0, 0, 0, y^*_1)
  271         psrldq  \c0, 12                 // (y^*_0, 0, 0, 0)
  272         movdqa  \c2, \c3                // (y^*_2, ?, ?, ?)
  273         psrlq   \c3, 32             // (floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
  274         paddq   \c3, \u       // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
  275         pslldq  \c2, 12                 // (0, 0, 0, y^*_2)
  276         psrldq  \c1, 8                  // (0, y^*_1, 0, 0)
  277         psrldq  \c2, 4                  // (0, 0, y^*_2, 0)
  278   .ifnes "\h", "nil"
  279         movdqu  \t, \c3                 // keep the bits above y^*_3 for H
  280         pxor    \u, \u                  // zero, to clear H's high lane
  281   .endif
  282         pslldq  \c3, 12                 // (0, 0, 0, y^*_3)
  283         por     \c0, \c1                // (y^*_0, y^*_1, 0, 0)
  284         por     \c2, \c3                // (0, 0, y^*_2, y^*_3)
  285         por     \c0, \c2                // y mod B^4
  286   .ifnes "\h", "nil"
  287         psrlq   \t, 32                  // very high bits of y
  288         paddq   \h, \t
  289         punpcklqdq \h, \u               // carry up
  290   .endif
  291 .endm
 292
  293 .macro  carryadd
  294         // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6
  295         // hold the incoming carry registers c0, c1, and c2 representing a
  296         // carry-in C.
  297         //
  298         // On exit, the carry registers, including XMM7, are updated to hold
  299         // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered.  The other
  300         // registers are preserved.
             //
             // XMM7 (c3) is overwritten rather than added to, so any previous
             // contents are lost; it ends up holding just a_3.  The code below
             // does not itself write XMM3.
  301         movd    xmm0, [edi +  0]        // (a_0, 0)
  302         movd    xmm1, [edi +  4]        // (a_1, 0)
  303         movd    xmm2, [edi +  8]        // (a_2, 0)
  304         movd    xmm7, [edi + 12]        // (a_3, 0): becomes c3
  305         paddq   xmm4, xmm0              // (c'_0 + a_0, c''_0)
  306         paddq   xmm5, xmm1              // (c'_1 + a_1, c''_1)
  307         paddq   xmm6, xmm2              // (c'_2 + a_2, c''_2)
  308 .endm
 309
 310 ///--------------------------------------------------------------------------
 311 /// Primitive multipliers and related utilities.
 312
  313 INTFUNC(carryprop)
  314         // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
  315         // form.  Store the low 128 bits of the represented carry to [EDI] as
  316         // a packed 128-bit value, and leave the remaining 16 bits in the low
  317         // 32 bits of XMM4.  On exit, XMM3, XMM5 and XMM6 are clobbered.
             //
             // The first two words ripple their carries upwards as usual.  The
             // third has no higher carry register to feed, so its carry is left
             // in XMM6, and `endprop' folds it into the final output word and
             // the leftover bits.
  318         propout [edi +  0], xmm4, xmm5
  319         propout [edi +  4], xmm5, xmm6
  320         propout [edi +  8], xmm6, nil
  321         endprop [edi + 12], xmm6, xmm4
  322         ret
  323
  324 ENDFUNC
 325
  326 INTFUNC(dmul4)
  327         // On entry, EDI points to the destination buffer; EAX and EBX point
  328         // to the packed operands U and X; ECX and EDX point to the expanded
  329         // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
  330         // registers c0, c1, and c2; c3 is assumed to be zero.
  331         //
  332         // On exit, we write the low 128 bits of the sum C + U V + X Y to
  333         // [EDI], and update the carry registers with the carry out.  The
  334         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
  335         // general-purpose registers are preserved.
             //
             // Each group below produces one output word.  In each group the
             // first `mulacc' (u_i V) overwrites the current c3 register, which
             // is notionally zero; the second (x_i Y) adds to it.  The carry
             // registers' roles then rotate by one place for the next group.

             // Word 0: c0, c1, c2, c3 = XMM4, XMM5, XMM6, XMM7.
  336         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, t
  337         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
  338         propout [edi +  0],      xmm4, xmm5
  339
             // Word 1: c0, c1, c2, c3 = XMM5, XMM6, XMM7, XMM4.
  340         mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
  341         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, nil
  342         propout [edi +  4],      xmm5, xmm6
  343
             // Word 2: c0, c1, c2, c3 = XMM6, XMM7, XMM4, XMM5.
  344         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
  345         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, nil
  346         propout [edi +  8],      xmm6, xmm7
  347
             // Word 3: c0, c1, c2, c3 = XMM7, XMM4, XMM5, XMM6.  The carry out
             // is therefore left in XMM4, XMM5, XMM6 again.
  348         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
  349         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
  350         propout [edi + 12],      xmm7, xmm4
  351
  352         ret
  353
  354 ENDFUNC
 355
  356 INTFUNC(dmla4)
  357         // On entry, EDI points to the destination buffer, which also
  358         // contains an addend A to accumulate; EAX and EBX point to the
  359         // packed operands U and X; ECX and EDX point to the expanded
  360         // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
  361         // registers c0, c1, and c2 representing a carry-in C; c3 is assumed
  362         // to be zero.
  363         //
  364         // On exit, we write the low 128 bits of the sum A + C + U V + X Y to
  365         // [EDI], and update the carry registers with the carry out.  The
  366         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
  367         // general-purpose registers are preserved.

             // Fold the addend into the carry.  This leaves a_3 in XMM7, so c3
             // is live for the first word and both products must be added to it.
  368         carryadd
  369
             // Word 0: c0, c1, c2, c3 = XMM4, XMM5, XMM6, XMM7.
  370         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, nil
  371         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
  372         propout [edi +  0],      xmm4, xmm5
  373
             // Words 1 to 3: the roles rotate by one register each time, and
             // the new c3 is notionally zero, so the first product of each pair
             // overwrites it.
  374         mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
  375         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, nil
  376         propout [edi +  4],      xmm5, xmm6
  377
  378         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
  379         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, nil
  380         propout [edi +  8],      xmm6, xmm7
  381
  382         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
  383         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
  384         propout [edi + 12],      xmm7, xmm4
  385
  386         ret
  387
  388 ENDFUNC
 389
  390 INTFUNC(mul4zc)
  391         // On entry, EDI points to the destination buffer; EBX points to a
  392         // packed operand X; and EDX points to an expanded operand Y.
  393         //
  394         // On exit, we write the low 128 bits of the product X Y to [EDI],
  395         // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
  396         // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
  397         // general-purpose registers are preserved.

             // Word 0: there is no carry-in, so the product pieces are written
             // directly into the carry registers XMM4, XMM5, XMM6, XMM7.
  398         mulcore [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
  399         propout [edi +  0],      xmm4, xmm5
  400
             // Words 1 to 3: accumulate, rotating the carry registers' roles by
             // one place each time; the new c3 is overwritten, not added to.
  401         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
  402         propout [edi +  4],      xmm5, xmm6
  403
  404         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
  405         propout [edi +  8],      xmm6, xmm7
  406
  407         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
  408         propout [edi + 12],      xmm7, xmm4
  409
  410         ret
  411
  412 ENDFUNC
 413
  414 INTFUNC(mul4)
  415         // On entry, EDI points to the destination buffer; EBX points to a
  416         // packed operand X; EDX points to an expanded operand Y; and XMM4,
  417         // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
  418         // representing a carry-in C; c3 is assumed to be zero.
  419         //
  420         // On exit, we write the low 128 bits of the sum C + X Y to [EDI],
  421         // and update the carry registers with the carry out.  The registers
  422         // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
  423         // general-purpose registers are preserved.

             // One group per output word.  The carry registers' roles rotate by
             // one place per word; in every group the register acting as c3 is
             // notionally zero, so the top product piece is written straight
             // into it.
  424         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, t
  425         propout [edi +  0],      xmm4, xmm5
  426
  427         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
  428         propout [edi +  4],      xmm5, xmm6
  429
  430         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
  431         propout [edi +  8],      xmm6, xmm7
  432
  433         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
  434         propout [edi + 12],      xmm7, xmm4
  435
  436         ret
  437
  438 ENDFUNC
 439
  440 INTFUNC(mla4zc)
  441         // On entry, EDI points to the destination buffer, which also
  442         // contains an addend A to accumulate; EBX points to a packed operand
  443         // X; and EDX points to an expanded operand Y.
  444         //
  445         // On exit, we write the low 128 bits of the sum A + X Y to [EDI],
  446         // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
  447         // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
  448         // general-purpose registers are preserved.

             // There is no carry-in, so start the carry registers off with the
             // words of the addend: (a_0, 0), (a_1, 0), (a_2, 0), (a_3, 0).
  449         movd    xmm4, [edi +  0]
  450         movd    xmm5, [edi +  4]
  451         movd    xmm6, [edi +  8]
  452         movd    xmm7, [edi + 12]
  453
             // Word 0: all four carry registers are live, so add to all of them.
  454         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
  455         propout [edi +  0],      xmm4, xmm5
  456
             // Words 1 to 3: rotate the roles; the new c3 is overwritten.
  457         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
  458         propout [edi +  4],      xmm5, xmm6
  459
  460         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
  461         propout [edi +  8],      xmm6, xmm7
  462
  463         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
  464         propout [edi + 12],      xmm7, xmm4
  465
  466         ret
  467
  468 ENDFUNC
 469
  470 INTFUNC(mla4)
  471         // On entry, EDI points to the destination buffer, which also
  472         // contains an addend A to accumulate; EBX points to a packed operand
  473         // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
  474         // the incoming carry registers c0, c1, and c2, representing a
  475         // carry-in C; c3 is assumed to be zero.
  476         //
  477         // On exit, we write the low 128 bits of the sum A + C + X Y to
  478         // [EDI], and update the carry registers with the carry out.  The
  479         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
  480         // general-purpose registers are preserved.

             // Fold the addend into the carry.  This leaves a_3 in XMM7, so c3
             // is live for the first word.
  481         carryadd
  482
             // Word 0: add to all four carry registers.
  483         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
  484         propout [edi +  0],      xmm4, xmm5
  485
             // Words 1 to 3: rotate the roles; the new c3 is overwritten.
  486         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
  487         propout [edi +  4],      xmm5, xmm6
  488
  489         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
  490         propout [edi +  8],      xmm6, xmm7
  491
  492         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
  493         propout [edi + 12],      xmm7, xmm4
  494
  495         ret
  496
  497 ENDFUNC
 498
  499 INTFUNC(mmul4)
  500         // On entry, EDI points to the destination buffer; EAX and EBX point
  501         // to the packed operands U and N; ECX and ESI point to the expanded
  502         // operands V and M; and EDX points to a place to store an expanded
  503         // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
  504         // must be 16-byte aligned.  (This is not the usual convention, which
  505         // requires alignment before the call.)
  506         //
  507         // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
  508         // of the sum U V + N Y to [EDI], leaving the remaining carry in
  509         // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
  510         // XMM7 are clobbered; the general-purpose registers are preserved.
             //
             // This entry point only handles the first word, and then jumps to
             // the local label `5', which lies in the code following this
             // function.  The stack space allocated here is released, and the
             // return performed, by that shared code.
  511         sub     esp, 64                 // space for the carries
  512
  513         // Calculate W = U V, and leave it in the destination.  Stash the
  514         // carry pieces for later.
  515         mulcore [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
  516         propout [edi +  0],      xmm4, xmm5
  517         jmp     5f                      // continue with words 1 to 3
  518
  519 ENDFUNC
 520
  521 INTFUNC(mmla4)
  522         // On entry, EDI points to the destination buffer, which also
  523         // contains an addend A to accumulate; EAX and EBX point
  524         // to the packed operands U and N; ECX and ESI point to the expanded
  525         // operands V and M; and EDX points to a place to store an expanded
  526         // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
  527         // must be 16-byte aligned.  (This is not the usual convention, which
  528         // requires alignment before the call.)
  529         //
  530         // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
  531         // bits of the sum A + U V + N Y to [EDI], leaving the remaining
  532         // carry in XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2,
  533         // XMM3, and XMM7 are clobbered; the general-purpose registers are
  534         // preserved.
             //
             // The local label `5' below is also a jump target for code
             // preceding this function, which arrives with the same 64 bytes of
             // stack already allocated and the first word already done.
  535         sub     esp, 64                 // space for the carries

             // Calculate W = A + U V, and leave it in the destination.  Start
             // the carry registers off with the words of the addend.
  536         movd    xmm4, [edi +  0]
  537         movd    xmm5, [edi +  4]
  538         movd    xmm6, [edi +  8]
  539         movd    xmm7, [edi + 12]
  540         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, nil
  541         propout [edi +  0],      xmm4, xmm5
  542
  543 5:      mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
  544         propout [edi +  4],      xmm5, xmm6
  545
  546         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
  547         propout [edi +  8],      xmm6, xmm7
  548
  549         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
  550         propout [edi + 12],      xmm7, xmm4
  551
             // Stash the carry out of W; it is added back in at the end.  These
             // are aligned stores, hence the stack alignment requirement.
  552         movdqa  [esp +  0], xmm4
  553         movdqa  [esp + 16], xmm5
  554         movdqa  [esp + 32], xmm6
  555
  556         // Calculate Y = W M.
             // Only the low 128 bits are wanted, so each successive word of W
             // multiplies one fewer piece of M: the pieces which would land
             // above the fourth carry register are skipped.
  557         mulcore [edi +  0], esi, xmm4, xmm5, xmm6, xmm7
  558
  559         mulcore [edi +  4], esi, xmm0, xmm1, xmm2, nil
  560         accum                    xmm5, xmm6, xmm7, nil
  561
  562         mulcore [edi +  8], esi, xmm0, xmm1, nil,  nil
  563         accum                    xmm6, xmm7, nil,  nil
  564
  565         mulcore [edi + 12], esi, xmm0, nil,  nil,  nil
  566         accum                    xmm7, nil,  nil,  nil
  567
  568         // That's lots of pieces.  Now we have to assemble the answer.
  569         squash  xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
  570
  571         // Expand it.
  572         pxor    xmm2, xmm2
  573         expand  xmm4, xmm1, nil, nil, xmm2
  574         movdqa  [edx +  0], xmm4
  575         movdqa  [edx + 16], xmm1
  576
  577         // Initialize the carry from the value for W we calculated earlier.
  578         movd    xmm4, [edi +  0]
  579         movd    xmm5, [edi +  4]
  580         movd    xmm6, [edi +  8]
  581         movd    xmm7, [edi + 12]
  582
  583         // Finish the calculation by adding the Montgomery product.
  584         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
  585         propout [edi +  0],      xmm4, xmm5
  586
  587         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
  588         propout [edi +  4],      xmm5, xmm6
  589
  590         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
  591         propout [edi +  8],      xmm6, xmm7
  592
  593         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
  594         propout [edi + 12],      xmm7, xmm4
  595
  596         // Add on the carry we calculated earlier.
  597         paddq   xmm4, [esp +  0]
  598         paddq   xmm5, [esp + 16]
  599         paddq   xmm6, [esp + 32]
  600
  601         // And, with that, we're done.
  602         add     esp, 64
  603         ret
  604
  605 ENDFUNC
 606
  607 INTFUNC(mont4)
  608         // On entry, EDI points to the destination buffer holding a packed
  609         // value W; EBX points to a packed operand N; ESI points to an
  610         // expanded operand M; and EDX points to a place to store an expanded
  611         // result Y (32 bytes, at a 16-byte boundary).
  612         //
  613         // On exit, we write Y = W M mod B to [EDX], and the low 128 bits
  614         // of the sum W + N Y to [EDI], leaving the remaining carry in
  615         // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
  616         // XMM7 are clobbered; the general-purpose registers are preserved.
  617
  618         // Calculate Y = W M.
             // Only the low 128 bits are wanted, so each successive word of W
             // multiplies one fewer piece of M: the pieces which would land
             // above the fourth carry register are skipped.
  619         mulcore [edi +  0], esi, xmm4, xmm5, xmm6, xmm7
  620
  621         mulcore [edi +  4], esi, xmm0, xmm1, xmm2, nil
  622         accum                    xmm5, xmm6, xmm7, nil
  623
  624         mulcore [edi +  8], esi, xmm0, xmm1, nil,  nil
  625         accum                    xmm6, xmm7, nil,  nil
  626
  627         mulcore [edi + 12], esi, xmm0, nil,  nil,  nil
  628         accum                    xmm7, nil,  nil,  nil
  629
  630         // That's lots of pieces.  Now we have to assemble the answer.
  631         squash  xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
  632
  633         // Expand it.
  634         pxor    xmm2, xmm2
  635         expand  xmm4, xmm1, nil, nil, xmm2
  636         movdqa  [edx +  0], xmm4
  637         movdqa  [edx + 16], xmm1
  638
  639         // Initialize the carry from W.
  640         movd    xmm4, [edi +  0]
  641         movd    xmm5, [edi +  4]
  642         movd    xmm6, [edi +  8]
  643         movd    xmm7, [edi + 12]
  644
  645         // Finish the calculation by adding the Montgomery product.
  646         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
  647         propout [edi +  0],      xmm4, xmm5
  648
  649         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
  650         propout [edi +  4],      xmm5, xmm6
  651
  652         mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, t
  653         propout [edi +  8],      xmm6, xmm7
  654
  655         mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t
  656         propout [edi + 12],      xmm7, xmm4
  657
  658         // And, with that, we're done.
  659         ret
  660
  661 ENDFUNC
 662
 663 ///--------------------------------------------------------------------------
 664 /// Bulk multipliers.
 665
 666 FUNC(mpx_umul4_x86_sse2)
 667         // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
 668         //                         const mpw *bv, const mpw *bvl);
             //
             // Bulk multiplier: combine the vector [av, avl) with [bv, bvl),
             // writing to dv.  The work is done in 16-byte blocks by helpers
             // defined elsewhere in this file: the first block of bv is
             // handled with `mul4zc' and `mul4', later blocks with `mla4zc'
             // and `mla4', and every pass over av ends with `carryprop'.
             // (Judging by the names, the first pass writes dv and later
             // passes accumulate into it; see the helpers' own descriptions
             // for their exact contracts.)
             //
             // Both cursors advance 16 bytes at a time, and a block is always
             // processed before the limit is tested, so this code expects av
             // and bv each to be a nonzero whole number of 16-byte blocks.
 669
 670         // Build a stack frame.  Arguments will be relative to EBP, as
 671         // follows.
 672         //
 673         //      ebp + 20        dv
 674         //      ebp + 24        av
 675         //      ebp + 28        avl
 676         //      ebp + 32        bv
 677         //      ebp + 36        bvl
 678         //
 679         // Locals are relative to ESP, as follows.
 680         //
 681         //      esp +  0        expanded Y (32 bytes)
 682         //      esp + 32        (top of locals)
             //
             // (Four saved registers plus the return address put the first
             // argument at EBP + 20.  ESP is rounded down to a 16-byte
             // boundary so that the `movdqa' stores below are aligned; the
             // epilogue restores it from EBP.)
 683         push    ebp
 684         push    ebx
 685         push    esi
 686         push    edi
 687         mov     ebp, esp
 688         and     esp, ~15
 689         sub     esp, 32
 690
 691         // Prepare for the first iteration.
             //
             // Register use from here on: ESI = bv cursor, EDI = inner dv
             // cursor, ECX = outer dv cursor, EBX = av cursor, EAX = av
             // limit, EDX -> expanded Y.  EAX, ECX, EDX and ESI are used
             // across the helper calls without being reloaded, so the helpers
             // are being relied on to leave them alone.
 692         mov     esi, [ebp + 32]         // -> bv[0]
 693         pxor    xmm7, xmm7              // zero, handed to `expand'
 694         movdqu  xmm0, [esi]             // bv[0]
 695         mov     edi, [ebp + 20]         // -> dv[0]
 696         mov     ecx, edi                // outer loop dv cursor
 697         expand  xmm0, xmm1, nil, nil, xmm7
 698         mov     ebx, [ebp + 24]         // -> av[0]
 699         mov     eax, [ebp + 28]         // -> av[m] = av limit
 700         mov     edx, esp                // -> expanded Y = bv[0]
 701         movdqa  [esp + 0], xmm0         // bv[0] expanded low
 702         movdqa  [esp + 16], xmm1        // bv[0] expanded high
 703         call    mul4zc
 704         add     ebx, 16
 705         add     edi, 16
 706         add     ecx, 16                 // next pass starts one block up
 707         add     esi, 16
 708         cmp     ebx, eax                // all done?
 709         jae     8f                      // av was a single block
 710
 711         .p2align 4
 712         // Continue with the first iteration.
 713 0:      call    mul4
 714         add     ebx, 16
 715         add     edi, 16
 716         cmp     ebx, eax                // all done?
 717         jb      0b
 718
 719         // Write out the leftover carry.  There can be no tail here.
 720 8:      call    carryprop
 721         cmp     esi, [ebp + 36]         // more passes to do?
 722         jae     9f
 723
 724         .p2align 4
 725         // Set up for the next pass.
 726 1:      movdqu  xmm0, [esi]             // bv[i]
 727         mov     edi, ecx                // -> dv[i]
 728         pxor    xmm7, xmm7              // zero, handed to `expand'
 729         expand  xmm0, xmm1, nil, nil, xmm7
 730         mov     ebx, [ebp + 24]         // -> av[0]
 731         movdqa  [esp + 0], xmm0         // bv[i] expanded low
 732         movdqa  [esp + 16], xmm1        // bv[i] expanded high
 733         call    mla4zc
 734         add     edi, 16
 735         add     ebx, 16
 736         add     ecx, 16
 737         add     esi, 16
 738         cmp     ebx, eax                // done yet?
 739         jae     8f                      // av was a single block
 740
 741         .p2align 4
 742         // Continue...
 743 0:      call    mla4
 744         add     ebx, 16
 745         add     edi, 16
 746         cmp     ebx, eax                // done yet?
 747         jb      0b
 748
 749         // Finish off this pass.  There was no tail on the previous pass, and
 750         // there can be none on this pass.
 751 8:      call    carryprop
 752         cmp     esi, [ebp + 36]         // more passes to do?
 753         jb      1b
 754
 755         // All over.
 756 9:      mov     esp, ebp                // drop the aligned locals
 757         pop     edi
 758         pop     esi
 759         pop     ebx
 760         pop     ebp
 761         ret
 762
 763 ENDFUNC
 764
 765 FUNC(mpxmont_mul4_x86_sse2)
 766         // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
 767         //                           const mpw *nv, size_t n, const mpw *mi);
             //
             // Driver for the combined multiply-and-reduce helpers defined
             // elsewhere in this file.  For the first 16-byte block of bv it
             // calls `mmul4' on the first block of av/nv and `dmul4' on the
             // rest; for each later block of bv it calls `mmla4' and then
             // `dmla4'.  Each pass ends with `carryprop' and a store of the
             // low word of XMM4 just above the 16 bytes at EDI.
             //
             // In the helpers' terms (per the comments below): U = av, V =
             // the current block of bv, X = nv, M = mi, Y = a 32-byte scratch
             // area, and Z = dv.
 768
 769         // Build a stack frame.  Arguments will be relative to EBP, as
 770         // follows.
 771         //
 772         //      ebp + 20        dv
 773         //      ebp + 24        av
 774         //      ebp + 28        bv
 775         //      ebp + 32        nv
 776         //      ebp + 36        n (nonzero multiple of 4)
 777         //      ebp + 40        mi
 778         //
 779         // Locals are relative to ESP, which is 4 mod 16, as follows.
 780         //
 781         //      esp +   0       outer loop dv
 782         //      esp +   4       outer loop bv
 783         //      esp +   8       av limit (mostly in ESI)
 784         //      esp +  12       expanded V (32 bytes)
 785         //      esp +  44       expanded M (32 bytes)
 786         //      esp +  76       expanded Y (32 bytes)
 787         //      esp + 108       bv limit
 788         //      esp + 112       (gap)
 789         //      esp + 124       (top of locals)
             //
             // (Subtracting 124 from a 16-byte boundary leaves ESP at 4 mod
             // 16, which puts the three 32-byte areas at ESP + 12, + 44 and
             // + 76 on 16-byte boundaries for `movdqa'.)
 790         push    ebp
 791         push    ebx
 792         push    esi
 793         push    edi
 794         mov     ebp, esp
 795         and     esp, ~15
 796         sub     esp, 124
 797
 798         // Establish the expanded operands.
 799         pxor    xmm7, xmm7              // zero, handed to `expand'
 800         mov     ecx, [ebp + 28]         // -> bv
 801         mov     edx, [ebp + 40]         // -> mi
 802         movdqu  xmm0, [ecx]             // bv[0]
 803         movdqu  xmm2, [edx]             // mi
 804         expand  xmm0, xmm1, xmm2, xmm3, xmm7
 805         movdqa  [esp + 12], xmm0        // bv[0] expanded low
 806         movdqa  [esp + 28], xmm1        // bv[0] expanded high
 807         movdqa  [esp + 44], xmm2        // mi expanded low
 808         movdqa  [esp + 60], xmm3        // mi expanded high
 809
 810         // Set up the outer loop state and prepare for the first iteration.
 811         mov     edx, [ebp + 36]         // n
 812         mov     eax, [ebp + 24]         // -> U = av[0]
 813         mov     ebx, [ebp + 32]         // -> X = nv[0]
 814         mov     edi, [ebp + 20]         // -> Z = dv[0]
 815         mov     [esp + 4], ecx          // outer loop bv = bv[0]
 816         lea     ecx, [ecx + 4*edx]      // -> bv[n] = bv limit
 817         lea     edx, [eax + 4*edx]      // -> av[n] = av limit
 818         mov     [esp + 0], edi
 819         mov     [esp + 108], ecx
 820         mov     [esp + 8], edx
 821         lea     ecx, [esp + 12]         // -> expanded V = bv[0]
 822         lea     esi, [esp + 44]         // -> expanded M = mi
 823         lea     edx, [esp + 76]         // -> space for Y
 824         call    mmul4
 825         mov     esi, [esp + 8]          // recover av limit
 826         add     edi, 16
 827         add     eax, 16
 828         add     ebx, 16
 829         cmp     eax, esi                // done already?
 830         jae     8f
 831         mov     [esp + 0], edi          // next pass starts one block up
 832
 833         .p2align 4
 834         // Complete the first inner loop.
 835 0:      call    dmul4
 836         add     edi, 16
 837         add     eax, 16
 838         add     ebx, 16
 839         cmp     eax, esi                // done yet?
 840         jb      0b
 841
 842         // Still have carries left to propagate.
 843         call    carryprop
 844         movd    [edi + 16], xmm4
 845
 846         .p2align 4
 847         // Embark on the next iteration.  (There must be one.  If n = 1, then
 848         // we would have bailed above, to label 8.  Similarly, the subsequent
 849         // iterations can fall into the inner loop immediately.)
             //
             // The limit is checked before bv[i] is loaded: on the final trip
             // EAX equals the bv limit, and a 16-byte load from there would
             // read past the end of the caller's buffer.
 850 1:      mov     eax, [esp + 4]          // -> bv[i - 1]
 851         mov     edi, [esp + 0]          // -> Z = dv[i]
 852         add     eax, 16                 // -> bv[i]
 853         pxor    xmm7, xmm7              // zero, handed to `expand'
 854         mov     [esp + 4], eax
 855         cmp     eax, [esp + 108]        // done yet?
 856         jae     9f
 857         movdqu  xmm0, [eax]             // bv[i], now known to be in range
 858         mov     ebx, [ebp + 32]         // -> X = nv[0]
 859         lea     esi, [esp + 44]         // -> expanded M = mi
 860         mov     eax, [ebp + 24]         // -> U = av[0]
 861         expand  xmm0, xmm1, nil, nil, xmm7
 862         movdqa  [esp + 12], xmm0        // bv[i] expanded low
 863         movdqa  [esp + 28], xmm1        // bv[i] expanded high
 864         call    mmla4
 865         mov     esi, [esp + 8]          // recover av limit
 866         add     edi, 16
 867         add     eax, 16
 868         add     ebx, 16
 869         mov     [esp + 0], edi          // next pass starts one block up
 870
 871         .p2align 4
 872         // Complete the next inner loop.
 873 0:      call    dmla4
 874         add     edi, 16
 875         add     eax, 16
 876         add     ebx, 16
 877         cmp     eax, esi                // done yet?
 878         jb      0b
 879
 880         // Still have carries left to propagate, and they overlap the
 881         // previous iteration's final tail, so read that in and add it.
 882         movd    xmm0, [edi]
 883         paddq   xmm4, xmm0
 884         call    carryprop
 885         movd    [edi + 16], xmm4
 886
 887         // Back again.
 888         jmp     1b
 889
 890         // First iteration was short.  Write out the carries and we're done.
 891         // (This could be folded into the main loop structure, but that would
 892         // penalize small numbers more.)
 893 8:      call    carryprop
 894         movd    [edi + 16], xmm4
 895
 896         // All done.
 897 9:      mov     esp, ebp                // drop the aligned locals
 898         pop     edi
 899         pop     esi
 900         pop     ebx
 901         pop     ebp
 902         ret
 903
 904 ENDFUNC
 905
 906 FUNC(mpxmont_redc4_x86_sse2)
 907         // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
 908         //                             size_t n, const mpw *mi);
             //
             // Driver for in-place reduction of [dv, dvl) by nv, using the
             // helpers `mont4', `mla4' and `carryprop' and the macro
             // `carryadd', all defined elsewhere in this file.  For each
             // 16-byte block of the low n words of dv it calls `mont4' on the
             // first block of nv and `mla4' on the rest, then pushes the
             // outstanding carry up through the remainder of dv with ordinary
             // add-with-carry arithmetic.
 909
 910         // Build a stack frame.  Arguments will be relative to EBP, as
 911         // follows.
 912         //
 913         //      ebp + 20        dv
 914         //      ebp + 24        dvl
 915         //      ebp + 28        nv
 916         //      ebp + 32        n (nonzero multiple of 4)
 917         //      ebp + 36        mi
 918         //
 919         // Locals are relative to ESP, as follows.
 920         //
 921         //      esp +  0        outer loop dv
 922         //      esp +  4        outer dv limit
 923         //      esp +  8        blocks-of-4 dv limit
 924         //      esp + 12        expanded M (32 bytes)
 925         //      esp + 44        expanded Y (32 bytes)
 926         //      esp + 76        (top of locals)
             //
             // (Subtracting 76 from a 16-byte boundary leaves ESP at 4 mod 16,
             // which puts the two 32-byte areas at ESP + 12 and ESP + 44 on
             // 16-byte boundaries for `movdqa'.)
 927         push    ebp
 928         push    ebx
 929         push    esi
 930         push    edi
 931         mov     ebp, esp
 932         and     esp, ~15
 933         sub     esp, 76
 934
 935         // Establish the expanded operands and the blocks-of-4 dv limit.
 936         mov     edi, [ebp + 20]         // -> Z = dv[0]
 937         pxor    xmm7, xmm7              // zero, handed to `expand'
 938         mov     eax, [ebp + 24]         // -> dv[n] = dv limit
 939         sub     eax, edi                // length of dv in bytes
 940         mov     edx, [ebp + 36]         // -> mi
 941         movdqu  xmm0, [edx]             // mi
 942         and     eax, ~15                // mask off the tail end
 943         expand  xmm0, xmm1, nil, nil, xmm7
 944         add     eax, edi                // find limit
 945         movdqa  [esp + 12], xmm0        // mi expanded low
 946         movdqa  [esp + 28], xmm1        // mi expanded high
 947         mov     [esp + 8], eax
 948
 949         // Set up the outer loop state and prepare for the first iteration.
 950         mov     ecx, [ebp + 32]         // n
 951         mov     ebx, [ebp + 28]         // -> X = nv[0]
 952         lea     edx, [edi + 4*ecx]      // -> dv[n] = outer dv limit
 953         lea     ecx, [ebx + 4*ecx]      // -> nv[n] = nv limit
 954         mov     [esp + 0], edi
 955         mov     [esp + 4], edx
 956         lea     esi, [esp + 12]         // -> expanded M = mi
 957         lea     edx, [esp + 44]         // -> space for Y
 958         call    mont4
 959         add     edi, 16
 960         add     ebx, 16
 961         cmp     ebx, ecx                // done already?
 962         jae     8f
 963
 964         .p2align 4
 965         // Complete the first inner loop.
 966 5:      call    mla4
 967         add     ebx, 16
 968         add     edi, 16
 969         cmp     ebx, ecx                // done yet?
 970         jb      5b
 971
 972         // Still have carries left to propagate.
 973 8:      carryadd
 974         mov     esi, [esp + 8]          // -> dv blocks limit
 975         mov     edx, [ebp + 24]         // dv limit
 976         psllq   xmm7, 16
 977         pslldq  xmm7, 8
 978         paddq   xmm6, xmm7
 979         call    carryprop
 980         movd    eax, xmm4               // carry out, as an integer
 981         add     edi, 16
 982         cmp     edi, esi
 983         jae     7f
 984
 985         .p2align 4
 986         // Continue carry propagation until the end of the buffer.
             // EAX carries at most one bit from each block to the next.
 987 0:      add     [edi], eax
 988         mov     eax, 0                  // preserves flags
 989         adcd    [edi + 4], 0
 990         adcd    [edi + 8], 0
 991         adcd    [edi + 12], 0
 992         adc     eax, 0                  // carry out of this block
 993         add     edi, 16
 994         cmp     edi, esi
 995         jb      0b
 996
 997         // Deal with the tail end.
             //
             // The carry out of each word must be collected into EAX before
             // the pointer is advanced: `add' rewrites CF, so incrementing
             // EDI first would always feed zero to the next word.
             //
             // NOTE(review): the first store to [EDI] happens before EDI is
             // compared with the dv limit, so this relies on dv having at
             // least one word beyond the last whole 16-byte block when we get
             // here -- confirm against the callers.
 998 7:      add     [edi], eax
 999         mov     eax, 0                  // preserves flags
1000         adc     eax, 0                  // carry out of this word
1001         add     edi, 4
1002         cmp     edi, edx
1003         jb      7b
1004
1005         // All done for this iteration.  Start the next.  (This must have at
1006         // least one follow-on iteration, or we'd not have started this outer
1007         // loop.)
1008 8:      mov     edi, [esp + 0]          // -> dv[i - 1]
1009         mov     ebx, [ebp + 28]         // -> X = nv[0]
1010         lea     edx, [esp + 44]         // -> space for Y
1011         lea     esi, [esp + 12]         // -> expanded M = mi
1012         add     edi, 16                 // -> Z = dv[i]
1013         cmp     edi, [esp + 4]          // all done yet?
1014         jae     9f
1015         mov     [esp + 0], edi
1016         call    mont4
1017         add     edi, 16
1018         add     ebx, 16
1019         jmp     5b
1020
1021         // All over.
1022 9:      mov     esp, ebp                // drop the aligned locals
1023         pop     edi
1024         pop     esi
1025         pop     ebx
1026         pop     ebp
1027         ret
1028
1029 ENDFUNC
1030
1031 ///--------------------------------------------------------------------------
1032 /// Testing and performance measurement.
1033
1034 #ifdef TEST_MUL4
1035
1036 .macro  cysetup c
             // Start a cycle measurement: read the time-stamp counter and
             // store its 64-bit value (low word first) at the address
             // expression given as `c'.  Clobbers EAX and EDX.
1037         rdtsc
1038         mov     [\c], eax
1039         mov     [\c + 4], edx
1040 .endm
1041
1042 .macro  cystore c, v, n
             // Finish a cycle measurement begun by `cysetup'.  Subtracts the
             // 64-bit start value at `c' from the current time-stamp counter,
             // decrements the counter held at `n' and writes it back, and
             // stores the elapsed count in the 8-byte slot of the array at
             // `v' indexed by the new counter value.
             //
             // Clobbers EAX, EBX, ECX and EDX.  Only `mov' follows the `dec',
             // so the flags on exit are those of the decrement; `testtail'
             // branches on them.
1043         rdtsc
1044         sub     eax, [\c]
1045         sbb     edx, [\c + 4]
1046         mov     ebx, [\v]               // -> results array
1047         mov     ecx, [\n]               // remaining iterations
1048         dec     ecx
1049         mov     [\n], ecx
1050         mov     [ebx + ecx*8], eax
1051         mov     [ebx + ecx*8 + 4], edx
1052 .endm
1053
1054 .macro  testprologue
             // Common entry sequence for the test entry points: save the
             // callee-saved registers, leave EBP pointing at them (so the
             // first argument is at EBP + 20), and carve out locals below a
             // 16-byte boundary.  3*32 + 12 = 108 leaves ESP at 4 mod 16, so
             // the three 32-byte areas are 16-byte aligned.
1055         push    ebp
1056         push    ebx
1057         push    esi
1058         push    edi
1059         mov     ebp, esp
1060         and     esp, ~15
1061         sub     esp, 3*32 + 12
1062         // vars:
1063         //      esp +  0 = cycles
1064         //      esp + 12 = v expanded
1065         //      esp + 44 = y expanded
1066         //      esp + 76 = Y output area (used in `mont' mode)
1067 .endm
1068
1069 .macro  testepilogue
             // Undo `testprologue': discard the locals by restoring ESP from
             // EBP, pop the saved registers in reverse order, and return.
1070         mov     esp, ebp
1071         pop     edi
1072         pop     esi
1073         pop     ebx
1074         pop     ebp
1075         ret
1076 .endm
1077
1078 .macro  testldcarry c
             // Load the carry registers XMM4, XMM5 and XMM6 from the 48 bytes
             // at the pointer given by operand `c'.  Clobbers ECX.
1079         mov     ecx, \c                 // -> c
1080         movdqu  xmm4, [ecx +  0]        // (c'_0, c''_0)
1081         movdqu  xmm5, [ecx + 16]        // (c'_1, c''_1)
1082         movdqu  xmm6, [ecx + 32]        // (c'_2, c''_2)
1083 .endm
1084
1085 .macro  testexpand v, y
             // Expand up to two 16-byte operands into the test frame.  Each
             // argument is an operand holding a pointer, or `nil' to skip it.
             // `v' is expanded into ESP + 12 and ESP + 28; `y' into ESP + 44
             // and ESP + 60.  The `expand' macro is defined elsewhere in this
             // file.
             //
             // Clobbers XMM7 (zeroed), and ECX, XMM0, XMM1 and/or EDX, XMM2,
             // XMM3 according to which operands are given.
1086         pxor    xmm7, xmm7
1087   .ifnes "\v", "nil"
1088         mov     ecx, \v
1089         movdqu  xmm0, [ecx]
1090         expand  xmm0, xmm1, nil, nil, xmm7
1091         movdqa  [esp + 12], xmm0
1092         movdqa  [esp + 28], xmm1
1093   .endif
1094   .ifnes "\y", "nil"
1095         mov     edx, \y
1096         movdqu  xmm2, [edx]
1097         expand  xmm2, xmm3, nil, nil, xmm7
1098         movdqa  [esp + 44], xmm2
1099         movdqa  [esp + 60], xmm3
1100   .endif
1101 .endm
1102
1103 .macro  testtop u, x, mode
             // Head of the timing loop: defines local label 0, which
             // `testtail' branches back to, and loads the registers that the
             // primitive under test is given.
             //
             //      ECX -> ESP + 12         (only if `u' is not nil)
             //      EBX = operand `x'
             //      ESI -> ESP + 44         (only if mode is `mont')
             //      EAX = operand `u'       (only if `u' is not nil)
             //      EDX -> ESP + 76 in `mont' mode, else ESP + 44
             //
             // EAX and EDX are loaded after `cysetup' because `rdtsc'
             // overwrites them.
1104         .p2align 4
1105 0:
1106   .ifnes "\u", "nil"
1107         lea     ecx, [esp + 12]
1108   .endif
1109         mov     ebx, \x
1110   .ifeqs "\mode", "mont"
1111         lea     esi, [esp + 44]
1112   .endif
1113         cysetup esp + 0
1114   .ifnes "\u", "nil"
1115         mov     eax, \u
1116   .endif
1117   .ifeqs "\mode", "mont"
1118         lea     edx, [esp + 76]
1119   .else
1120         lea     edx, [esp + 44]
1121   .endif
1122 .endm
1123
1124 .macro  testtail cyv, n
             // Foot of the timing loop: record the elapsed cycles for this
             // iteration, and go round again (to label 0 from `testtop')
             // until the counter decremented by `cystore' reaches zero.
1125         cystore esp + 0, \cyv, \n
1126         jnz     0b
1127 .endm
1128
1129 .macro  testcarryout c
             // Store the carry registers XMM4, XMM5 and XMM6 to the 48 bytes
             // at the pointer given by operand `c'.  Clobbers ECX.
1130         mov     ecx, \c
1131         movdqu  [ecx +  0], xmm4
1132         movdqu  [ecx + 16], xmm5
1133         movdqu  [ecx + 32], xmm6
1134 .endm
1135
1136         .globl  test_dmul4
1137 test_dmul4:
             // Timing harness for `dmul4'.  Stack arguments, as used here:
             //
             //      ebp + 20        loaded into EDI before the loop
             //      ebp + 24        -> 48 bytes of carry state (read, then
             //                      written back on exit)
             //      ebp + 28        loaded into EAX for each call
             //      ebp + 32        loaded into EBX for each call
             //      ebp + 36        -> 16 bytes, expanded to ESP + 12 (ECX)
             //      ebp + 40        -> 16 bytes, expanded to ESP + 44 (EDX)
             //      ebp + 44        iteration count, decremented in place
             //      ebp + 48        -> array of 64-bit cycle counts
1138         testprologue
1139         testldcarry [ebp + 24]
1140         testexpand [ebp + 36], [ebp + 40]
1141         mov     edi, [ebp + 20]
1142         testtop [ebp + 28], [ebp + 32]
1143         call    dmul4
1144         testtail [ebp + 48], [ebp + 44]
1145         testcarryout [ebp + 24]
1146         testepilogue
1147
1148         .globl  test_dmla4
1149 test_dmla4:
             // Timing harness for `dmla4'.  Stack arguments, as used here:
             //
             //      ebp + 20        loaded into EDI before the loop
             //      ebp + 24        -> 48 bytes of carry state (read, then
             //                      written back on exit)
             //      ebp + 28        loaded into EAX for each call
             //      ebp + 32        loaded into EBX for each call
             //      ebp + 36        -> 16 bytes, expanded to ESP + 12 (ECX)
             //      ebp + 40        -> 16 bytes, expanded to ESP + 44 (EDX)
             //      ebp + 44        iteration count, decremented in place
             //      ebp + 48        -> array of 64-bit cycle counts
1150         testprologue
1151         testldcarry [ebp + 24]
1152         testexpand [ebp + 36], [ebp + 40]
1153         mov     edi, [ebp + 20]
1154         testtop [ebp + 28], [ebp + 32]
1155         call    dmla4
1156         testtail [ebp + 48], [ebp + 44]
1157         testcarryout [ebp + 24]
1158         testepilogue
1159
1160         .globl  test_mul4
1161 test_mul4:
             // Timing harness for `mul4'.  Stack arguments, as used here:
             //
             //      ebp + 20        loaded into EDI before the loop
             //      ebp + 24        -> 48 bytes of carry state (read, then
             //                      written back on exit)
             //      ebp + 28        loaded into EBX for each call
             //      ebp + 32        -> 16 bytes, expanded to ESP + 44 (EDX)
             //      ebp + 36        iteration count, decremented in place
             //      ebp + 40        -> array of 64-bit cycle counts
1162         testprologue
1163         testldcarry [ebp + 24]
1164         testexpand nil, [ebp + 32]
1165         mov     edi, [ebp + 20]
1166         testtop nil, [ebp + 28]
1167         call    mul4
1168         testtail [ebp + 40], [ebp + 36]
1169         testcarryout [ebp + 24]
1170         testepilogue
1171
1172         .globl  test_mla4
1173 test_mla4:
             // Timing harness for `mla4'.  Stack arguments, as used here:
             //
             //      ebp + 20        loaded into EDI before the loop
             //      ebp + 24        -> 48 bytes of carry state (read, then
             //                      written back on exit)
             //      ebp + 28        loaded into EBX for each call
             //      ebp + 32        -> 16 bytes, expanded to ESP + 44 (EDX)
             //      ebp + 36        iteration count, decremented in place
             //      ebp + 40        -> array of 64-bit cycle counts
1174         testprologue
1175         testldcarry [ebp + 24]
1176         testexpand nil, [ebp + 32]
1177         mov     edi, [ebp + 20]
1178         testtop nil, [ebp + 28]
1179         call    mla4
1180         testtail [ebp + 40], [ebp + 36]
1181         testcarryout [ebp + 24]
1182         testepilogue
1183
1184         .globl  test_mmul4
1185 test_mmul4:
             // Timing harness for `mmul4'.  No carry state is loaded on
             // entry; it is only written out.  Stack arguments, as used here:
             //
             //      ebp + 20        loaded into EDI before the loop
             //      ebp + 24        -> 48 bytes to receive the carry state
             //      ebp + 28        -> 32 bytes to receive the Y area
             //      ebp + 32        loaded into EAX for each call
             //      ebp + 36        loaded into EBX for each call
             //      ebp + 40        -> 16 bytes, expanded to ESP + 12 (ECX)
             //      ebp + 44        -> 16 bytes, expanded to ESP + 44 (ESI)
             //      ebp + 48        iteration count, decremented in place
             //      ebp + 52        -> array of 64-bit cycle counts
1186         testprologue
1187         testexpand [ebp + 40], [ebp + 44]
1188         mov     edi, [ebp + 20]
1189         testtop [ebp + 32], [ebp + 36], mont
1190         call    mmul4
1191         testtail [ebp + 52], [ebp + 48]
1192         mov     edi, [ebp + 28]         // -> caller's buffer for Y
1193         movdqa  xmm0, [esp + 76]        // Y area, low half
1194         movdqa  xmm1, [esp + 92]        // Y area, high half
1195         movdqu  [edi], xmm0
1196         movdqu  [edi + 16], xmm1
1197         testcarryout [ebp + 24]
1198         testepilogue
1199
1200         .globl  test_mmla4
1201 test_mmla4:
             // Timing harness for `mmla4'.  No carry state is loaded on
             // entry; it is only written out.  Stack arguments, as used here:
             //
             //      ebp + 20        loaded into EDI before the loop
             //      ebp + 24        -> 48 bytes to receive the carry state
             //      ebp + 28        -> 32 bytes to receive the Y area
             //      ebp + 32        loaded into EAX for each call
             //      ebp + 36        loaded into EBX for each call
             //      ebp + 40        -> 16 bytes, expanded to ESP + 12 (ECX)
             //      ebp + 44        -> 16 bytes, expanded to ESP + 44 (ESI)
             //      ebp + 48        iteration count, decremented in place
             //      ebp + 52        -> array of 64-bit cycle counts
1202         testprologue
1203         testexpand [ebp + 40], [ebp + 44]
1204         mov     edi, [ebp + 20]
1205         testtop [ebp + 32], [ebp + 36], mont
1206         call    mmla4
1207         testtail [ebp + 52], [ebp + 48]
1208         mov     edi, [ebp + 28]         // -> caller's buffer for Y
1209         movdqa  xmm0, [esp + 76]        // Y area, low half
1210         movdqa  xmm1, [esp + 92]        // Y area, high half
1211         movdqu  [edi], xmm0
1212         movdqu  [edi + 16], xmm1
1213         testcarryout [ebp + 24]
1214         testepilogue
1215
1216         .globl  test_mont4
1217 test_mont4:
             // Timing harness for `mont4'.  No carry state is loaded on
             // entry; it is only written out.  Stack arguments, as used here:
             //
             //      ebp + 20        loaded into EDI before the loop
             //      ebp + 24        -> 48 bytes to receive the carry state
             //      ebp + 28        -> 32 bytes to receive the Y area
             //      ebp + 32        loaded into EBX for each call
             //      ebp + 36        -> 16 bytes, expanded to ESP + 44 (ESI)
             //      ebp + 40        iteration count, decremented in place
             //      ebp + 44        -> array of 64-bit cycle counts
1218         testprologue
1219         testexpand nil, [ebp + 36]
1220         mov     edi, [ebp + 20]
1221         testtop nil, [ebp + 32], mont
1222         call    mont4
1223         testtail [ebp + 44], [ebp + 40]
1224         mov     edi, [ebp + 28]         // -> caller's buffer for Y
1225         movdqa  xmm0, [esp + 76]        // Y area, low half
1226         movdqa  xmm1, [esp + 92]        // Y area, high half
1227         movdqu  [edi], xmm0
1228         movdqu  [edi + 16], xmm1
1229         testcarryout [ebp + 24]
1230         testepilogue
1231
1232 #endif
1233
1234 ///----- That's all, folks --------------------------------------------------