math/mpx-mul4-*: Test the `...zc' variants too.

[catacomb] / math / mpx-mul4-x86-sse2.S
diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S

index 8f69a55..baf7cc5 100644 (file)
--- a/math/mpx-mul4-x86-sse2.S
+++ b/math/mpx-mul4-x86-sse2.S
@@ -64,7 +64,7 @@
  ///       0    v'_0    v'_1    v''_0   v''_1
  ///      16    v'_2    v'_3    v''_2   v''_3
  ///
  ///       0    v'_0    v'_1    v''_0   v''_1
  ///      16    v'_2    v'_3    v''_2   v''_3
  ///
-/// A `pmuludqd' instruction ignores the odd positions in its operands; thus,
+/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
  /// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
  /// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
  /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
  /// it will act on (say) v'_0 and v''_0 in a single instruction.  Shifting
  /// this vector right by 4 bytes brings v'_1 and v''_1 into position.  We can
  /// multiply such a vector by a full 32-bit scalar to produce two 48-bit
@@ -81,7 +81,7 @@
  /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
  /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
  /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
  /// the register c0, for example, holds c'_0 (low half) and c''_0 (high
  /// half), and represents the value c_0 = c'_0 + c''_0 b; the carry registers
  /// collectively represent the value c_0 + c_1 B + c_2 B^2 + c_3 B^3.  The
-/// `pmuluqdq' instruction acting on a scalar operand (broadcast across all
+/// `pmuludq' instruction acting on a scalar operand (broadcast across all
  /// lanes of its vector) and an operand in the expanded form above produces a
  /// result which can be added directly to the appropriate carry register.
  /// Following a pass of four multiplications, we perform some limited carry
  /// lanes of its vector) and an operand in the expanded form above produces a
  /// result which can be added directly to the appropriate carry register.
  /// Following a pass of four multiplications, we perform some limited carry
@@ -93,7 +93,7 @@
  ///--------------------------------------------------------------------------
  /// Macro definitions.
  
  ///--------------------------------------------------------------------------
  /// Macro definitions.
  
-.macro mulcore r, s, d0, d1, d2, d3
+.macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil
         // Load a word r_i from R, multiply by the expanded operand [S], and
         // leave the pieces of the product in registers D0, D1, D2, D3.
         movd    \d0, \r                 // (r_i, 0, 0, 0)
         // Load a word r_i from R, multiply by the expanded operand [S], and
         // leave the pieces of the product in registers D0, D1, D2, D3.
         movd    \d0, \r                 // (r_i, 0, 0, 0)
@@ -118,22 +118,25 @@
         psrldq  \d3, 4                  // (s'_3, s''_2, s''_3, 0)
    .endif
    .ifnes "\d1", "nil"
         psrldq  \d3, 4                  // (s'_3, s''_2, s''_3, 0)
    .endif
    .ifnes "\d1", "nil"
-       pmuludqd \d1, \d0               // (r_i s'_1, r_i s''_1)
+       pmuludq \d1, \d0                // (r_i s'_1, r_i s''_1)
    .endif
    .ifnes "\d3", "nil"
    .endif
    .ifnes "\d3", "nil"
-       pmuludqd \d3, \d0               // (r_i s'_3, r_i s''_3)
+       pmuludq \d3, \d0                // (r_i s'_3, r_i s''_3)
    .endif
    .ifnes "\d2", "nil"
      .ifnes "\d3", "nil"
    .endif
    .ifnes "\d2", "nil"
      .ifnes "\d3", "nil"
-       pmuludqd \d2, \d0               // (r_i s'_2, r_i s''_2)
+       pmuludq \d2, \d0                // (r_i s'_2, r_i s''_2)
      .else
      .else
-       pmuludqd \d2, [\s + 16]
+       pmuludq \d2, [\s + 16]
      .endif
    .endif
      .endif
    .endif
-       pmuludqd \d0, [\s]              // (r_i s'_0, r_i s''_0)
+       pmuludq \d0, [\s]               // (r_i s'_0, r_i s''_0)
  .endm
  
  .endm
  
-.macro accum   c0, c1, c2, c3
+.macro accum   c0, c1=nil, c2=nil, c3=nil
+       // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding
+       // carry registers C0--C3.  Any or all of C1--C3 may be `nil' to skip
+       // updating that register.
         paddq   \c0, xmm0
    .ifnes "\c1", "nil"
         paddq   \c1, xmm1
         paddq   \c0, xmm0
    .ifnes "\c1", "nil"
         paddq   \c1, xmm1
@@ -146,7 +149,7 @@
    .endif
  .endm
  
    .endif
  .endm
  
-.macro mulacc  r, s, c0, c1, c2, c3, z3p
+.macro mulacc  r, s, c0, c1, c2, c3, z3p=nil
         // Load a word r_i from R, multiply by the expanded operand [S],
         // and accumulate in carry registers C0, C1, C2, C3.  If Z3P is `t'
         // then C3 notionally contains zero, but needs clearing; in practice,
         // Load a word r_i from R, multiply by the expanded operand [S],
         // and accumulate in carry registers C0, C1, C2, C3.  If Z3P is `t'
         // then C3 notionally contains zero, but needs clearing; in practice,
@@ -155,14 +158,14 @@
         // is not `t'.
    .ifeqs "\z3p", "t"
         mulcore \r, \s, xmm0, xmm1, xmm2, \c3
         // is not `t'.
    .ifeqs "\z3p", "t"
         mulcore \r, \s, xmm0, xmm1, xmm2, \c3
-       accum           \c0,  \c1,  \c2,  nil
+       accum           \c0,  \c1,  \c2
    .else
         mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
         accum           \c0,  \c1,  \c2,  \c3
    .endif
  .endm
  
    .else
         mulcore \r, \s, xmm0, xmm1, xmm2, xmm3
         accum           \c0,  \c1,  \c2,  \c3
    .endif
  .endm
  
-.macro propout d, c, cc
+.macro propout d, c, cc=nil
         // Calculate an output word from C, and store it in D; propagate
         // carries out from C to CC in preparation for a rotation of the
         // carry registers.  On completion, XMM3 is clobbered.  If CC is
         // Calculate an output word from C, and store it in D; propagate
         // carries out from C to CC in preparation for a rotation of the
         // carry registers.  On completion, XMM3 is clobbered.  If CC is
@@ -192,7 +195,7 @@
         psrldq  \t, 4                   // floor((c' + c'' b)/B)
  .endm
  
         psrldq  \t, 4                   // floor((c' + c'' b)/B)
  .endm
  
-.macro expand  a, b, c, d, z
+.macro expand  z, a, b, c=nil, d=nil
         // On entry, A and C hold packed 128-bit values, and Z is zero.  On
         // exit, A:B and C:D together hold the same values in expanded
         // form.  If C is `nil', then only expand A to A:B.
         // On entry, A and C hold packed 128-bit values, and Z is zero.  On
         // exit, A:B and C:D together hold the same values in expanded
         // form.  If C is `nil', then only expand A to A:B.
@@ -214,11 +217,11 @@
    .endif
  .endm
  
    .endif
  .endm
  
-.macro squash  c0, c1, c2, c3, h, t, u
+.macro squash  c0, c1, c2, c3, t, u, lo, hi=nil
         // On entry, C0, C1, C2, C3 are carry registers representing a value
         // On entry, C0, C1, C2, C3 are carry registers representing a value
-       // Y.  On exit, C0 holds the low 128 bits of the carry value; C1, C2,
+       // Y.  On exit, LO holds the low 128 bits of the carry value; C1, C2,
         // C3, T, and U are clobbered; and the high bits of Y are stored in
         // C3, T, and U are clobbered; and the high bits of Y are stored in
-       // H, if this is not `nil'.
+       // HI, if this is not `nil'.
  
         // The first step is to eliminate the `double-prime' pieces -- i.e.,
         // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
  
         // The first step is to eliminate the `double-prime' pieces -- i.e.,
         // the ones offset by 16 bytes from a 32-bit boundary -- by carrying
@@ -241,8 +244,8 @@
         psrlq   \c1, 16                 // high parts of (y''_1, y''_3)
         psrlq   \c2, 32                 // low parts of (y''_0, y''_2)
         psrlq   \c3, 32                 // low parts of (y''_1, y''_3)
         psrlq   \c1, 16                 // high parts of (y''_1, y''_3)
         psrlq   \c2, 32                 // low parts of (y''_0, y''_2)
         psrlq   \c3, 32                 // low parts of (y''_1, y''_3)
-  .ifnes "\h", "nil"
-       movdqa  \h, \c1
+  .ifnes "\hi", "nil"
+       movdqa  \hi, \c1
    .endif
         pslldq  \c1, 8                  // high part of (0, y''_1)
  
    .endif
         pslldq  \c1, 8                  // high part of (0, y''_1)
  
@@ -250,44 +253,36 @@
         paddq   \u, \c3
         paddq   \t, \c1                 // and up: (y_0, y_2)
         paddq   \u, \c0                 // (y_1, y_3)
         paddq   \u, \c3
         paddq   \t, \c1                 // and up: (y_0, y_2)
         paddq   \u, \c0                 // (y_1, y_3)
-  .ifnes "\h", "nil"
-       psrldq  \h, 8                   // high part of (y''_3, 0)
+  .ifnes "\hi", "nil"
+       psrldq  \hi, 8                  // high part of (y''_3, 0)
    .endif
  
         // Finally extract the answer.  This complicated dance is better than
         // storing to memory and loading, because the piecemeal stores
         // inhibit store forwarding.
         movdqa  \c3, \t                 // (y_0, y_1)
    .endif
  
         // Finally extract the answer.  This complicated dance is better than
         // storing to memory and loading, because the piecemeal stores
         // inhibit store forwarding.
         movdqa  \c3, \t                 // (y_0, y_1)
-       movdqa  \c0, \t                 // (y^*_0, ?, ?, ?)
+       movdqa  \lo, \t                 // (y^*_0, ?, ?, ?)
         psrldq  \t, 8                   // (y_2, 0)
         psrlq   \c3, 32                 // (floor(y_0/B), ?)
         paddq   \c3, \u                 // (y_1 + floor(y_0/B), ?)
         psrldq  \t, 8                   // (y_2, 0)
         psrlq   \c3, 32                 // (floor(y_0/B), ?)
         paddq   \c3, \u                 // (y_1 + floor(y_0/B), ?)
-       pslldq  \c0, 12                 // (0, 0, 0, y^*_0)
         movdqa  \c1, \c3                // (y^*_1, ?, ?, ?)
         psrldq  \u, 8                   // (y_3, 0)
         psrlq   \c3, 32                 // (floor((y_1 B + y_0)/B^2, ?)
         paddq   \c3, \t                 // (y_2 + floor((y_1 B + y_0)/B^2, ?)
         movdqa  \c1, \c3                // (y^*_1, ?, ?, ?)
         psrldq  \u, 8                   // (y_3, 0)
         psrlq   \c3, 32                 // (floor((y_1 B + y_0)/B^2, ?)
         paddq   \c3, \t                 // (y_2 + floor((y_1 B + y_0)/B^2, ?)
-       pslldq  \c1, 12                 // (0, 0, 0, y^*_1)
-       psrldq  \c0, 12                 // (y^*_0, 0, 0, 0)
-       movdqa  \c2, \c3                // (y^*_2, ?, ?, ?)
+       punpckldq \lo, \c3              // (y^*_0, y^*_2, ?, ?)
         psrlq   \c3, 32             // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
         paddq   \c3, \u       // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
         psrlq   \c3, 32             // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
         paddq   \c3, \u       // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
-       pslldq  \c2, 12                 // (0, 0, 0, y^*_2)
-       psrldq  \c1, 8                  // (0, y^*_1, 0, 0)
-       psrldq  \c2, 4                  // (0, 0, y^*_2, 0)
-  .ifnes "\h", "nil"
-       movdqu  \t, \c3
+  .ifnes "\hi", "nil"
+       movdqa  \t, \c3
         pxor    \u, \u
    .endif
         pxor    \u, \u
    .endif
-       pslldq  \c3, 12                 // (0, 0, 0, y^*_3)
-       por     \c0, \c1                // (y^*_0, y^*_1, 0, 0)
-       por     \c2, \c3                // (0, 0, y^*_2, y^*_3)
-       por     \c0, \c2                // y mod B^4
-  .ifnes "\h", "nil"
+       punpckldq \c1, \c3              // (y^*_1, y^*_3, ?, ?)
+  .ifnes "\hi", "nil"
         psrlq   \t, 32                  // very high bits of y
         psrlq   \t, 32                  // very high bits of y
-       paddq   \h, \t
-       punpcklqdq \h, \u               // carry up
+       paddq   \hi, \t
+       punpcklqdq \hi, \u              // carry up
    .endif
    .endif
+       punpckldq \lo, \c1              // y mod B^4
  .endm
  
  .macro carryadd
  .endm
  
  .macro carryadd
@@ -302,6 +297,7 @@
         movd    xmm1, [edi +  4]        // (a_1, 0)
         movd    xmm2, [edi +  8]        // (a_2, 0)
         movd    xmm7, [edi + 12]        // (a_3, 0)
         movd    xmm1, [edi +  4]        // (a_1, 0)
         movd    xmm2, [edi +  8]        // (a_2, 0)
         movd    xmm7, [edi + 12]        // (a_3, 0)
+
         paddq   xmm4, xmm0              // (c'_0 + a_0, c''_0)
         paddq   xmm5, xmm1              // (c'_1 + a_1, c''_1)
         paddq   xmm6, xmm2              // (c'_2 + a_2, c''_2 + a_3 b)
         paddq   xmm4, xmm0              // (c'_0 + a_0, c''_0)
         paddq   xmm5, xmm1              // (c'_1 + a_1, c''_1)
         paddq   xmm6, xmm2              // (c'_2 + a_2, c''_2 + a_3 b)
@@ -315,6 +311,8 @@ INTFUNC(carryprop)
         // form.  Store the low 128 bits of the represented carry to [EDI] as
         // a packed 128-bit value, and leave the remaining 16 bits in the low
         // 32 bits of XMM4.  On exit, XMM3, XMM5 and XMM6 are clobbered.
         // form.  Store the low 128 bits of the represented carry to [EDI] as
         // a packed 128-bit value, and leave the remaining 16 bits in the low
         // 32 bits of XMM4.  On exit, XMM3, XMM5 and XMM6 are clobbered.
+  endprologue
+
         propout [edi +  0], xmm4, xmm5
         propout [edi +  4], xmm5, xmm6
         propout [edi +  8], xmm6, nil
         propout [edi +  0], xmm4, xmm5
         propout [edi +  4], xmm5, xmm6
         propout [edi +  8], xmm6, nil
@@ -333,20 +331,22 @@ INTFUNC(dmul4)
         // [EDI], and update the carry registers with the carry out.  The
         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
         // general-purpose registers are preserved.
         // [EDI], and update the carry registers with the carry out.  The
         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
         // general-purpose registers are preserved.
+  endprologue
+
         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, t
         mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, t
-       mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
+       mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
         propout [edi +  0],      xmm4, xmm5
  
         mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
         propout [edi +  0],      xmm4, xmm5
  
         mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
-       mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, nil
+       mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4
         propout [edi +  4],      xmm5, xmm6
  
         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
         propout [edi +  4],      xmm5, xmm6
  
         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
-       mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, nil
+       mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5
         propout [edi +  8],      xmm6, xmm7
  
         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
         propout [edi +  8],      xmm6, xmm7
  
         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
-       mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
+       mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
         propout [edi + 12],      xmm7, xmm4
  
         ret
         propout [edi + 12],      xmm7, xmm4
  
         ret
@@ -365,22 +365,24 @@ INTFUNC(dmla4)
         // [EDI], and update the carry registers with the carry out.  The
         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
         // general-purpose registers are preserved.
         // [EDI], and update the carry registers with the carry out.  The
         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
         // general-purpose registers are preserved.
+  endprologue
+
         carryadd
  
         carryadd
  
-       mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, nil
-       mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
+       mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
+       mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
         propout [edi +  0],      xmm4, xmm5
  
         mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
         propout [edi +  0],      xmm4, xmm5
  
         mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
-       mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, nil
+       mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4
         propout [edi +  4],      xmm5, xmm6
  
         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
         propout [edi +  4],      xmm5, xmm6
  
         mulacc  [eax +  8], ecx, xmm6, xmm7, xmm4, xmm5, t
-       mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5, nil
+       mulacc  [ebx +  8], edx, xmm6, xmm7, xmm4, xmm5
         propout [edi +  8],      xmm6, xmm7
  
         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
         propout [edi +  8],      xmm6, xmm7
  
         mulacc  [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t
-       mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, nil
+       mulacc  [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6
         propout [edi + 12],      xmm7, xmm4
  
         ret
         propout [edi + 12],      xmm7, xmm4
  
         ret
@@ -395,6 +397,8 @@ INTFUNC(mul4zc)
         // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
         // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
         // general-purpose registers are preserved.
         // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
         // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
         // general-purpose registers are preserved.
+  endprologue
+
         mulcore [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
         propout [edi +  0],      xmm4, xmm5
  
         mulcore [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
         propout [edi +  0],      xmm4, xmm5
  
@@ -421,6 +425,8 @@ INTFUNC(mul4)
         // and update the carry registers with the carry out.  The registers
         // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
         // general-purpose registers are preserved.
         // and update the carry registers with the carry out.  The registers
         // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
         // general-purpose registers are preserved.
+  endprologue
+
         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, t
         propout [edi +  0],      xmm4, xmm5
  
         mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, t
         propout [edi +  0],      xmm4, xmm5
  
@@ -446,12 +452,14 @@ INTFUNC(mla4zc)
         // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
         // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
         // general-purpose registers are preserved.
         // and set the carry registers XMM4, XMM5, XMM6 to the carry out.
         // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
         // general-purpose registers are preserved.
+  endprologue
+
         movd    xmm4, [edi +  0]
         movd    xmm5, [edi +  4]
         movd    xmm6, [edi +  8]
         movd    xmm7, [edi + 12]
  
         movd    xmm4, [edi +  0]
         movd    xmm5, [edi +  4]
         movd    xmm6, [edi +  8]
         movd    xmm7, [edi + 12]
  
-       mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
+       mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
         propout [edi +  0],      xmm4, xmm5
  
         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
         propout [edi +  0],      xmm4, xmm5
  
         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
@@ -478,9 +486,11 @@ INTFUNC(mla4)
         // [EDI], and update the carry registers with the carry out.  The
         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
         // general-purpose registers are preserved.
         // [EDI], and update the carry registers with the carry out.  The
         // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the
         // general-purpose registers are preserved.
+  endprologue
+
         carryadd
  
         carryadd
  
-       mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
+       mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
         propout [edi +  0],      xmm4, xmm5
  
         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
         propout [edi +  0],      xmm4, xmm5
  
         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
@@ -501,14 +511,14 @@ INTFUNC(mmul4)
         // to the packed operands U and N; ECX and ESI point to the expanded
         // operands V and M; and EDX points to a place to store an expanded
         // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
         // to the packed operands U and N; ECX and ESI point to the expanded
         // operands V and M; and EDX points to a place to store an expanded
         // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
-       // must be 16-byte aligned.  (This is not the usual convention, which
-       // requires alignment before the call.)
+       // must be 12 modulo 16, as is usual for modern x86 ABIs.
         //
         // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
         // of the sum U V + N Y to [EDI], leaving the remaining carry in
         // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
         // XMM7 are clobbered; the general-purpose registers are preserved.
         //
         // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits
         // of the sum U V + N Y to [EDI], leaving the remaining carry in
         // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
         // XMM7 are clobbered; the general-purpose registers are preserved.
-       sub     esp, 64                 // space for the carries
+       stalloc 48 + 12                 // space for the carries
+  endprologue
  
         // Calculate W = U V, and leave it in the destination.  Stash the
         // carry pieces for later.
  
         // Calculate W = U V, and leave it in the destination.  Stash the
         // carry pieces for later.
@@ -520,24 +530,28 @@ ENDFUNC
  
  INTFUNC(mmla4)
         // On entry, EDI points to the destination buffer, which also
  
  INTFUNC(mmla4)
         // On entry, EDI points to the destination buffer, which also
-       // contains an addend A to accumulate; EAX and EBX point
-       // to the packed operands U and N; ECX and ESI point to the expanded
+       // contains an addend A to accumulate; EAX and EBX point to the
+       // packed operands U and N; ECX and ESI point to the expanded
         // operands V and M; and EDX points to a place to store an expanded
         // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
         // operands V and M; and EDX points to a place to store an expanded
         // result Y (32 bytes, at a 16-byte boundary).  The stack pointer
-       // must be 16-byte aligned.  (This is not the usual convention, which
-       // requires alignment before the call.)
+       // must be 12 modulo 16, as is usual for modern x86 ABIs.
         //
         // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
         // bits of the sum A + U V + N Y to [EDI], leaving the remaining
         // carry in XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2,
         // XMM3, and XMM7 are clobbered; the general-purpose registers are
         // preserved.
         //
         // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128
         // bits of the sum A + U V + N Y to [EDI], leaving the remaining
         // carry in XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2,
         // XMM3, and XMM7 are clobbered; the general-purpose registers are
         // preserved.
-       sub     esp, 64                 // space for the carries
+       stalloc 48 + 12                 // space for the carries
+  endprologue
+
         movd    xmm4, [edi +  0]
         movd    xmm5, [edi +  4]
         movd    xmm6, [edi +  8]
         movd    xmm7, [edi + 12]
         movd    xmm4, [edi +  0]
         movd    xmm5, [edi +  4]
         movd    xmm6, [edi +  8]
         movd    xmm7, [edi + 12]
-       mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7, nil
+
+       // Calculate W = U V, and leave it in the destination.  Stash the
+       // carry pieces for later.
+       mulacc  [eax +  0], ecx, xmm4, xmm5, xmm6, xmm7
         propout [edi +  0],      xmm4, xmm5
  
  5:     mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
         propout [edi +  0],      xmm4, xmm5
  
  5:     mulacc  [eax +  4], ecx, xmm5, xmm6, xmm7, xmm4, t
@@ -556,21 +570,21 @@ INTFUNC(mmla4)
         // Calculate Y = W M.
         mulcore [edi +  0], esi, xmm4, xmm5, xmm6, xmm7
  
         // Calculate Y = W M.
         mulcore [edi +  0], esi, xmm4, xmm5, xmm6, xmm7
  
-       mulcore [edi +  4], esi, xmm0, xmm1, xmm2, nil
-       accum                    xmm5, xmm6, xmm7, nil
+       mulcore [edi +  4], esi, xmm0, xmm1, xmm2
+       accum                    xmm5, xmm6, xmm7
  
  
-       mulcore [edi +  8], esi, xmm0, xmm1, nil,  nil
-       accum                    xmm6, xmm7, nil,  nil
+       mulcore [edi +  8], esi, xmm0, xmm1
+       accum                    xmm6, xmm7
  
  
-       mulcore [edi + 12], esi, xmm0, nil,  nil,  nil
-       accum                    xmm7, nil,  nil,  nil
+       mulcore [edi + 12], esi, xmm0
+       accum                    xmm7
  
         // That's lots of pieces.  Now we have to assemble the answer.
  
         // That's lots of pieces.  Now we have to assemble the answer.
-       squash  xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
+       squash  xmm4, xmm5, xmm6, xmm7,  xmm0, xmm1,  xmm4
  
         // Expand it.
         pxor    xmm2, xmm2
  
         // Expand it.
         pxor    xmm2, xmm2
-       expand  xmm4, xmm1, nil, nil, xmm2
+       expand  xmm2, xmm4, xmm1
         movdqa  [edx +  0], xmm4
         movdqa  [edx + 16], xmm1
  
         movdqa  [edx +  0], xmm4
         movdqa  [edx + 16], xmm1
  
@@ -581,7 +595,7 @@ INTFUNC(mmla4)
         movd    xmm7, [edi + 12]
  
         // Finish the calculation by adding the Montgomery product.
         movd    xmm7, [edi + 12]
  
         // Finish the calculation by adding the Montgomery product.
-       mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
+       mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
         propout [edi +  0],      xmm4, xmm5
  
         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
         propout [edi +  0],      xmm4, xmm5
  
         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
@@ -599,14 +613,14 @@ INTFUNC(mmla4)
         paddq   xmm6, [esp + 32]
  
         // And, with that, we're done.
         paddq   xmm6, [esp + 32]
  
         // And, with that, we're done.
-       add     esp, 64
+       stfree  48 + 12
         ret
  
  ENDFUNC
  
  INTFUNC(mont4)
         // On entry, EDI points to the destination buffer holding a packed
         ret
  
  ENDFUNC
  
  INTFUNC(mont4)
         // On entry, EDI points to the destination buffer holding a packed
-       // value A; EBX points to a packed operand N; ESI points to an
+       // value W; EBX points to a packed operand N; ESI points to an
         // expanded operand M; and EDX points to a place to store an expanded
         // result Y (32 bytes, at a 16-byte boundary).
         //
         // expanded operand M; and EDX points to a place to store an expanded
         // result Y (32 bytes, at a 16-byte boundary).
         //
@@ -614,25 +628,26 @@ INTFUNC(mont4)
         // of the sum W + N Y to [EDI], leaving the remaining carry in
         // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
         // XMM7 are clobbered; the general-purpose registers are preserved.
         // of the sum W + N Y to [EDI], leaving the remaining carry in
         // XMM4, XMM5, and XMM6.  The registers XMM0, XMM1, XMM2, XMM3, and
         // XMM7 are clobbered; the general-purpose registers are preserved.
+  endprologue
  
         // Calculate Y = W M.
         mulcore [edi +  0], esi, xmm4, xmm5, xmm6, xmm7
  
  
         // Calculate Y = W M.
         mulcore [edi +  0], esi, xmm4, xmm5, xmm6, xmm7
  
-       mulcore [edi +  4], esi, xmm0, xmm1, xmm2, nil
-       accum                    xmm5, xmm6, xmm7, nil
+       mulcore [edi +  4], esi, xmm0, xmm1, xmm2
+       accum                    xmm5, xmm6, xmm7
  
  
-       mulcore [edi +  8], esi, xmm0, xmm1, nil,  nil
-       accum                    xmm6, xmm7, nil,  nil
+       mulcore [edi +  8], esi, xmm0, xmm1
+       accum                    xmm6, xmm7
  
  
-       mulcore [edi + 12], esi, xmm0, nil,  nil,  nil
-       accum                    xmm7, nil,  nil,  nil
+       mulcore [edi + 12], esi, xmm0
+       accum                    xmm7
  
         // That's lots of pieces.  Now we have to assemble the answer.
  
         // That's lots of pieces.  Now we have to assemble the answer.
-       squash  xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
+       squash  xmm4, xmm5, xmm6, xmm7,  xmm0, xmm1,  xmm4
  
         // Expand it.
         pxor    xmm2, xmm2
  
         // Expand it.
         pxor    xmm2, xmm2
-       expand  xmm4, xmm1, nil, nil, xmm2
+       expand  xmm2, xmm4, xmm1
         movdqa  [edx +  0], xmm4
         movdqa  [edx + 16], xmm1
  
         movdqa  [edx +  0], xmm4
         movdqa  [edx + 16], xmm1
  
@@ -643,7 +658,7 @@ INTFUNC(mont4)
         movd    xmm7, [edi + 12]
  
         // Finish the calculation by adding the Montgomery product.
         movd    xmm7, [edi + 12]
  
         // Finish the calculation by adding the Montgomery product.
-       mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7, nil
+       mulacc  [ebx +  0], edx, xmm4, xmm5, xmm6, xmm7
         propout [edi +  0],      xmm4, xmm5
  
         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
         propout [edi +  0],      xmm4, xmm5
  
         mulacc  [ebx +  4], edx, xmm5, xmm6, xmm7, xmm4, t
@@ -680,13 +695,14 @@ FUNC(mpx_umul4_x86_sse2)
         //
         //      esp +  0        expanded Y (32 bytes)
         //      esp + 32        (top of locals)
         //
         //      esp +  0        expanded Y (32 bytes)
         //      esp + 32        (top of locals)
-       push    ebp
-       push    ebx
-       push    esi
-       push    edi
-       mov     ebp, esp
+       pushreg ebp
+       pushreg ebx
+       pushreg esi
+       pushreg edi
+       setfp   ebp
         and     esp, ~15
         sub     esp, 32
         and     esp, ~15
         sub     esp, 32
+  endprologue
  
         // Prepare for the first iteration.
         mov     esi, [ebp + 32]         // -> bv[0]
  
         // Prepare for the first iteration.
         mov     esi, [ebp + 32]         // -> bv[0]
@@ -694,7 +710,7 @@ FUNC(mpx_umul4_x86_sse2)
         movdqu  xmm0, [esi]             // bv[0]
         mov     edi, [ebp + 20]         // -> dv[0]
         mov     ecx, edi                // outer loop dv cursor
         movdqu  xmm0, [esi]             // bv[0]
         mov     edi, [ebp + 20]         // -> dv[0]
         mov     ecx, edi                // outer loop dv cursor
-       expand  xmm0, xmm1, nil, nil, xmm7
+       expand  xmm7, xmm0, xmm1
         mov     ebx, [ebp + 24]         // -> av[0]
         mov     eax, [ebp + 28]         // -> av[m] = av limit
         mov     edx, esp                // -> expanded Y = bv[0]
         mov     ebx, [ebp + 24]         // -> av[0]
         mov     eax, [ebp + 28]         // -> av[m] = av limit
         mov     edx, esp                // -> expanded Y = bv[0]
@@ -726,7 +742,7 @@ FUNC(mpx_umul4_x86_sse2)
  1:     movdqu  xmm0, [esi]             // bv[i]
         mov     edi, ecx                // -> dv[i]
         pxor    xmm7, xmm7
  1:     movdqu  xmm0, [esi]             // bv[i]
         mov     edi, ecx                // -> dv[i]
         pxor    xmm7, xmm7
-       expand  xmm0, xmm1, nil, nil, xmm7
+       expand  xmm7, xmm0, xmm1
         mov     ebx, [ebp + 24]         // -> av[0]
         movdqa  [esp + 0], xmm0         // bv[i] expanded low
         movdqa  [esp + 16], xmm1        // bv[i] expanded high
         mov     ebx, [ebp + 24]         // -> av[0]
         movdqa  [esp + 0], xmm0         // bv[i] expanded low
         movdqa  [esp + 16], xmm1        // bv[i] expanded high
@@ -753,7 +769,7 @@ FUNC(mpx_umul4_x86_sse2)
         jb      1b
  
         // All over.
         jb      1b
  
         // All over.
-9:     mov     esp, ebp
+9:     dropfp
         pop     edi
         pop     esi
         pop     ebx
         pop     edi
         pop     esi
         pop     ebx
@@ -776,24 +792,24 @@ FUNC(mpxmont_mul4_x86_sse2)
         //      ebp + 36        n (nonzero multiple of 4)
         //      ebp + 40        mi
         //
         //      ebp + 36        n (nonzero multiple of 4)
         //      ebp + 40        mi
         //
-       // Locals are relative to ESP, which is 4 mod 16, as follows.
+       // Locals are relative to ESP, which is 16-byte aligned, as follows.
         //
         //
-       //      esp +   0       outer loop dv
-       //      esp +   4       outer loop bv
-       //      esp +   8       av limit (mostly in ESI)
-       //      esp +  12       expanded V (32 bytes)
-       //      esp +  44       expanded M (32 bytes)
-       //      esp +  76       expanded Y (32 bytes)
+       //      esp +   0       expanded V (32 bytes)
+       //      esp +  32       expanded M (32 bytes)
+       //      esp +  64       expanded Y (32 bytes)
+       //      esp +  96       outer loop dv
+       //      esp + 100       outer loop bv
+       //      esp + 104       av limit (mostly in ESI)
         //      esp + 108       bv limit
         //      esp + 108       bv limit
-       //      esp + 112       (gap)
-       //      esp + 124       (top of locals)
-       push    ebp
-       push    ebx
-       push    esi
-       push    edi
-       mov     ebp, esp
+       //      esp + 112       (top of locals)
+       pushreg ebp
+       pushreg ebx
+       pushreg esi
+       pushreg edi
+       setfp   ebp
         and     esp, ~15
         and     esp, ~15
-       sub     esp, 124
+       sub     esp, 112
+  endprologue
  
         // Establish the expanded operands.
         pxor    xmm7, xmm7
  
         // Establish the expanded operands.
         pxor    xmm7, xmm7
@@ -801,34 +817,34 @@ FUNC(mpxmont_mul4_x86_sse2)
         mov     edx, [ebp + 40]         // -> mi
         movdqu  xmm0, [ecx]             // bv[0]
         movdqu  xmm2, [edx]             // mi
         mov     edx, [ebp + 40]         // -> mi
         movdqu  xmm0, [ecx]             // bv[0]
         movdqu  xmm2, [edx]             // mi
-       expand  xmm0, xmm1, xmm2, xmm3, xmm7
-       movdqa  [esp + 12], xmm0        // bv[0] expanded low
-       movdqa  [esp + 28], xmm1        // bv[0] expanded high
-       movdqa  [esp + 44], xmm2        // mi expanded low
-       movdqa  [esp + 60], xmm3        // mi expanded high
+       expand  xmm7, xmm0, xmm1, xmm2, xmm3
+       movdqa  [esp +  0], xmm0        // bv[0] expanded low
+       movdqa  [esp + 16], xmm1        // bv[0] expanded high
+       movdqa  [esp + 32], xmm2        // mi expanded low
+       movdqa  [esp + 48], xmm3        // mi expanded high
  
         // Set up the outer loop state and prepare for the first iteration.
         mov     edx, [ebp + 36]         // n
         mov     eax, [ebp + 24]         // -> U = av[0]
         mov     ebx, [ebp + 32]         // -> X = nv[0]
         mov     edi, [ebp + 20]         // -> Z = dv[0]
  
         // Set up the outer loop state and prepare for the first iteration.
         mov     edx, [ebp + 36]         // n
         mov     eax, [ebp + 24]         // -> U = av[0]
         mov     ebx, [ebp + 32]         // -> X = nv[0]
         mov     edi, [ebp + 20]         // -> Z = dv[0]
-       mov     [esp + 4], ecx
+       mov     [esp + 100], ecx
         lea     ecx, [ecx + 4*edx]      // -> bv[n/4] = bv limit
         lea     edx, [eax + 4*edx]      // -> av[n/4] = av limit
         lea     ecx, [ecx + 4*edx]      // -> bv[n/4] = bv limit
         lea     edx, [eax + 4*edx]      // -> av[n/4] = av limit
-       mov     [esp + 0], edi
+       mov     [esp + 96], edi
+       mov     [esp + 104], edx
         mov     [esp + 108], ecx
         mov     [esp + 108], ecx
-       mov     [esp + 8], edx
-       lea     ecx, [esp + 12]         // -> expanded V = bv[0]
-       lea     esi, [esp + 44]         // -> expanded M = mi
-       lea     edx, [esp + 76]         // -> space for Y
+       lea     ecx, [esp + 0]          // -> expanded V = bv[0]
+       lea     esi, [esp + 32]         // -> expanded M = mi
+       lea     edx, [esp + 64]         // -> space for Y
         call    mmul4
         call    mmul4
-       mov     esi, [esp + 8]          // recover av limit
+       mov     esi, [esp + 104]        // recover av limit
         add     edi, 16
         add     eax, 16
         add     ebx, 16
         cmp     eax, esi                // done already?
         jae     8f
         add     edi, 16
         add     eax, 16
         add     ebx, 16
         cmp     eax, esi                // done already?
         jae     8f
-       mov     [esp + 0], edi
+       mov     [esp + 96], edi
  
         .p2align 4
         // Complete the first inner loop.
  
         .p2align 4
         // Complete the first inner loop.
@@ -847,26 +863,26 @@ FUNC(mpxmont_mul4_x86_sse2)
         // Embark on the next iteration.  (There must be one.  If n = 1, then
         // we would have bailed above, to label 8.  Similarly, the subsequent
         // iterations can fall into the inner loop immediately.)
         // Embark on the next iteration.  (There must be one.  If n = 1, then
         // we would have bailed above, to label 8.  Similarly, the subsequent
         // iterations can fall into the inner loop immediately.)
-1:     mov     eax, [esp + 4]          // -> bv[i - 1]
-       mov     edi, [esp + 0]          // -> Z = dv[i]
+1:     mov     eax, [esp + 100]        // -> bv[i - 1]
+       mov     edi, [esp + 96]         // -> Z = dv[i]
         add     eax, 16                 // -> bv[i]
         pxor    xmm7, xmm7
         add     eax, 16                 // -> bv[i]
         pxor    xmm7, xmm7
-       movdqu  xmm0, [eax]             // bv[i]
-       mov     [esp + 4], eax
+       mov     [esp + 100], eax
         cmp     eax, [esp + 108]        // done yet?
         jae     9f
         cmp     eax, [esp + 108]        // done yet?
         jae     9f
+       movdqu  xmm0, [eax]             // bv[i]
         mov     ebx, [ebp + 32]         // -> X = nv[0]
         mov     ebx, [ebp + 32]         // -> X = nv[0]
-       lea     esi, [esp + 44]         // -> expanded M = mi
+       lea     esi, [esp + 32]         // -> expanded M = mi
         mov     eax, [ebp + 24]         // -> U = av[0]
         mov     eax, [ebp + 24]         // -> U = av[0]
-       expand  xmm0, xmm1, nil, nil, xmm7
-       movdqa  [esp + 12], xmm0        // bv[i] expanded low
-       movdqa  [esp + 28], xmm1        // bv[i] expanded high
+       expand  xmm7, xmm0, xmm1
+       movdqa  [esp + 0], xmm0         // bv[i] expanded low
+       movdqa  [esp + 16], xmm1        // bv[i] expanded high
         call    mmla4
         call    mmla4
-       mov     esi, [esp + 8]          // recover av limit
+       mov     esi, [esp + 104]        // recover av limit
         add     edi, 16
         add     eax, 16
         add     ebx, 16
         add     edi, 16
         add     eax, 16
         add     ebx, 16
-       mov     [esp + 0], edi
+       mov     [esp + 96], edi
  
         .p2align 4
         // Complete the next inner loop.
  
         .p2align 4
         // Complete the next inner loop.
@@ -894,11 +910,11 @@ FUNC(mpxmont_mul4_x86_sse2)
         movd    [edi + 16], xmm4
  
         // All done.
         movd    [edi + 16], xmm4
  
         // All done.
-9:     mov     esp, ebp
-       pop     edi
-       pop     esi
-       pop     ebx
-       pop     ebp
+9:     dropfp
+       popreg  edi
+       popreg  esi
+       popreg  ebx
+       popreg  ebp
         ret
  
  ENDFUNC
         ret
  
  ENDFUNC
@@ -924,13 +940,14 @@ FUNC(mpxmont_redc4_x86_sse2)
         //      esp + 12        expanded M (32 bytes)
         //      esp + 44        expanded Y (32 bytes)
         //      esp + 76        (top of locals)
         //      esp + 12        expanded M (32 bytes)
         //      esp + 44        expanded Y (32 bytes)
         //      esp + 76        (top of locals)
-       push    ebp
-       push    ebx
-       push    esi
-       push    edi
-       mov     ebp, esp
+       pushreg ebp
+       pushreg ebx
+       pushreg esi
+       pushreg edi
+       setfp   ebp
         and     esp, ~15
         sub     esp, 76
         and     esp, ~15
         sub     esp, 76
+  endprologue
  
         // Establish the expanded operands and the blocks-of-4 dv limit.
         mov     edi, [ebp + 20]         // -> Z = dv[0]
  
         // Establish the expanded operands and the blocks-of-4 dv limit.
         mov     edi, [ebp + 20]         // -> Z = dv[0]
@@ -940,7 +957,7 @@ FUNC(mpxmont_redc4_x86_sse2)
         mov     edx, [ebp + 36]         // -> mi
         movdqu  xmm0, [edx]             // mi
         and     eax, ~15                // mask off the tail end
         mov     edx, [ebp + 36]         // -> mi
         movdqu  xmm0, [edx]             // mi
         and     eax, ~15                // mask off the tail end
-       expand  xmm0, xmm1, nil, nil, xmm7
+       expand  xmm7, xmm0, xmm1
         add     eax, edi                // find limit
         movdqa  [esp + 12], xmm0        // mi expanded low
         movdqa  [esp + 28], xmm1        // mi expanded high
         add     eax, edi                // find limit
         movdqa  [esp + 12], xmm0        // mi expanded low
         movdqa  [esp + 28], xmm1        // mi expanded high
@@ -956,8 +973,8 @@ FUNC(mpxmont_redc4_x86_sse2)
         lea     esi, [esp + 12]         // -> expanded M = mi
         lea     edx, [esp + 44]         // -> space for Y
         call    mont4
         lea     esi, [esp + 12]         // -> expanded M = mi
         lea     edx, [esp + 44]         // -> space for Y
         call    mont4
-       add     edi, 16
         add     ebx, 16
         add     ebx, 16
+       add     edi, 16
         cmp     ebx, ecx                // done already?
         jae     8f
  
         cmp     ebx, ecx                // done already?
         jae     8f
  
@@ -1019,11 +1036,11 @@ FUNC(mpxmont_redc4_x86_sse2)
         jmp     5b
  
         // All over.
         jmp     5b
  
         // All over.
-9:     mov     esp, ebp
-       pop     edi
-       pop     esi
-       pop     ebx
-       pop     ebp
+9:     dropfp
+       popreg  edi
+       popreg  esi
+       popreg  ebx
+       popreg  ebp
         ret
  
  ENDFUNC
         ret
  
  ENDFUNC
@@ -1051,27 +1068,31 @@ ENDFUNC
         mov     [ebx + ecx*8 + 4], edx
  .endm
  
         mov     [ebx + ecx*8 + 4], edx
  .endm
  
-.macro testprologue
-       push    ebp
-       push    ebx
-       push    esi
-       push    edi
-       mov     ebp, esp
+.macro testprologue n
+       pushreg ebp
+       pushreg ebx
+       pushreg esi
+       pushreg edi
+       setfp   ebp
         and     esp, ~15
         and     esp, ~15
-       sub     esp, 3*32 + 12
+       sub     esp, 3*32 + 4*4
+  endprologue
+       mov     eax, \n
+       mov     [esp + 104], eax
         // vars:
         // vars:
-       //      esp +  0 = cycles
-       //      esp + 12 = v expanded
-       //      esp + 44 = y expanded
-       //      esp + 72 = ? expanded
+       //      esp +   0 = v expanded
+       //      esp +  32 = y expanded
+       //      esp +  64 = ? expanded
+       //      esp +  96 = cycles
+       //      esp + 104 = count
  .endm
  
  .macro testepilogue
  .endm
  
  .macro testepilogue
-       mov     esp, ebp
-       pop     edi
-       pop     esi
-       pop     ebx
-       pop     ebp
+       dropfp
+       popreg  edi
+       popreg  esi
+       popreg  ebx
+       popreg  ebp
         ret
  .endm
  
         ret
  .endm
  
@@ -1082,47 +1103,47 @@ ENDFUNC
         movdqu  xmm6, [ecx + 32]        // (c'_2, c''_2)
  .endm
  
         movdqu  xmm6, [ecx + 32]        // (c'_2, c''_2)
  .endm
  
-.macro testexpand v, y
+.macro testexpand v=nil, y=nil
         pxor    xmm7, xmm7
    .ifnes "\v", "nil"
         mov     ecx, \v
         movdqu  xmm0, [ecx]
         pxor    xmm7, xmm7
    .ifnes "\v", "nil"
         mov     ecx, \v
         movdqu  xmm0, [ecx]
-       expand  xmm0, xmm1, nil, nil, xmm7
-       movdqa  [esp + 12], xmm0
-       movdqa  [esp + 28], xmm1
+       expand  xmm7, xmm0, xmm1
+       movdqa  [esp +  0], xmm0
+       movdqa  [esp + 16], xmm1
    .endif
    .ifnes "\y", "nil"
         mov     edx, \y
         movdqu  xmm2, [edx]
    .endif
    .ifnes "\y", "nil"
         mov     edx, \y
         movdqu  xmm2, [edx]
-       expand  xmm2, xmm3, nil, nil, xmm7
-       movdqa  [esp + 44], xmm2
-       movdqa  [esp + 60], xmm3
+       expand  xmm7, xmm2, xmm3
+       movdqa  [esp + 32], xmm2
+       movdqa  [esp + 48], xmm3
    .endif
  .endm
  
    .endif
  .endm
  
-.macro testtop u, x, mode
+.macro testtop u=nil, x=nil, mode=nil
         .p2align 4
  0:
    .ifnes "\u", "nil"
         .p2align 4
  0:
    .ifnes "\u", "nil"
-       lea     ecx, [esp + 12]
+       lea     ecx, [esp + 0]
    .endif
         mov     ebx, \x
    .ifeqs "\mode", "mont"
    .endif
         mov     ebx, \x
    .ifeqs "\mode", "mont"
-       lea     esi, [esp + 44]
+       lea     esi, [esp + 32]
    .endif
    .endif
-       cysetup esp + 0
+       cysetup esp + 96
    .ifnes "\u", "nil"
         mov     eax, \u
    .endif
    .ifeqs "\mode", "mont"
    .ifnes "\u", "nil"
         mov     eax, \u
    .endif
    .ifeqs "\mode", "mont"
-       lea     edx, [esp + 76]
+       lea     edx, [esp + 64]
    .else
    .else
-       lea     edx, [esp + 44]
+       lea     edx, [esp + 32]
    .endif
  .endm
  
    .endif
  .endm
  
-.macro testtail cyv, n
-       cystore esp + 0, \cyv, \n
+.macro testtail cyv
+       cystore esp + 96, \cyv, esp + 104
         jnz     0b
  .endm
  
         jnz     0b
  .endm
  
@@ -1133,101 +1154,125 @@ ENDFUNC
         movdqu  [ecx + 32], xmm6
  .endm
  
         movdqu  [ecx + 32], xmm6
  .endm
  
-       .globl  test_dmul4
-test_dmul4:
-       testprologue
+FUNC(test_dmul4)
+       testprologue [ebp + 44]
         testldcarry [ebp + 24]
         testexpand [ebp + 36], [ebp + 40]
         mov     edi, [ebp + 20]
         testtop [ebp + 28], [ebp + 32]
         call    dmul4
         testldcarry [ebp + 24]
         testexpand [ebp + 36], [ebp + 40]
         mov     edi, [ebp + 20]
         testtop [ebp + 28], [ebp + 32]
         call    dmul4
-       testtail [ebp + 48], [ebp + 44]
+       testtail [ebp + 48]
         testcarryout [ebp + 24]
         testepilogue
         testcarryout [ebp + 24]
         testepilogue
+ENDFUNC
  
  
-       .globl  test_dmla4
-test_dmla4:
-       testprologue
+FUNC(test_dmla4)
+       testprologue [ebp + 44]
         testldcarry [ebp + 24]
         testexpand [ebp + 36], [ebp + 40]
         mov     edi, [ebp + 20]
         testtop [ebp + 28], [ebp + 32]
         call    dmla4
         testldcarry [ebp + 24]
         testexpand [ebp + 36], [ebp + 40]
         mov     edi, [ebp + 20]
         testtop [ebp + 28], [ebp + 32]
         call    dmla4
-       testtail [ebp + 48], [ebp + 44]
+       testtail [ebp + 48]
         testcarryout [ebp + 24]
         testepilogue
         testcarryout [ebp + 24]
         testepilogue
+ENDFUNC
  
  
-       .globl  test_mul4
-test_mul4:
-       testprologue
+FUNC(test_mul4)
+       testprologue [ebp + 36]
         testldcarry [ebp + 24]
         testexpand nil, [ebp + 32]
         mov     edi, [ebp + 20]
         testtop nil, [ebp + 28]
         call    mul4
         testldcarry [ebp + 24]
         testexpand nil, [ebp + 32]
         mov     edi, [ebp + 20]
         testtop nil, [ebp + 28]
         call    mul4
-       testtail [ebp + 40], [ebp + 36]
+       testtail [ebp + 40]
+       testcarryout [ebp + 24]
+       testepilogue
+ENDFUNC
+
+FUNC(test_mul4zc)
+       testprologue [ebp + 36]
+       testldcarry [ebp + 24]
+       testexpand nil, [ebp + 32]
+       mov     edi, [ebp + 20]
+       testtop nil, [ebp + 28]
+       call    mul4zc
+       testtail [ebp + 40]
         testcarryout [ebp + 24]
         testepilogue
         testcarryout [ebp + 24]
         testepilogue
+ENDFUNC
  
  
-       .globl  test_mla4
-test_mla4:
-       testprologue
+FUNC(test_mla4)
+       testprologue [ebp + 36]
         testldcarry [ebp + 24]
         testexpand nil, [ebp + 32]
         mov     edi, [ebp + 20]
         testtop nil, [ebp + 28]
         call    mla4
         testldcarry [ebp + 24]
         testexpand nil, [ebp + 32]
         mov     edi, [ebp + 20]
         testtop nil, [ebp + 28]
         call    mla4
-       testtail [ebp + 40], [ebp + 36]
+       testtail [ebp + 40]
         testcarryout [ebp + 24]
         testepilogue
         testcarryout [ebp + 24]
         testepilogue
+ENDFUNC
  
  
-       .globl  test_mmul4
-test_mmul4:
-       testprologue
+FUNC(test_mla4zc)
+       testprologue [ebp + 36]
+       testldcarry [ebp + 24]
+       testexpand nil, [ebp + 32]
+       mov     edi, [ebp + 20]
+       testtop nil, [ebp + 28]
+       call    mla4zc
+       testtail [ebp + 40]
+       testcarryout [ebp + 24]
+       testepilogue
+ENDFUNC
+
+FUNC(test_mmul4)
+       testprologue [ebp + 48]
         testexpand [ebp + 40], [ebp + 44]
         mov     edi, [ebp + 20]
         testtop [ebp + 32], [ebp + 36], mont
         call    mmul4
         testexpand [ebp + 40], [ebp + 44]
         mov     edi, [ebp + 20]
         testtop [ebp + 32], [ebp + 36], mont
         call    mmul4
-       testtail [ebp + 52], [ebp + 48]
+       testtail [ebp + 52]
         mov     edi, [ebp + 28]
         mov     edi, [ebp + 28]
-       movdqa  xmm0, [esp + 76]
-       movdqa  xmm1, [esp + 92]
+       movdqa  xmm0, [esp + 64]
+       movdqa  xmm1, [esp + 80]
         movdqu  [edi], xmm0
         movdqu  [edi + 16], xmm1
         testcarryout [ebp + 24]
         testepilogue
         movdqu  [edi], xmm0
         movdqu  [edi + 16], xmm1
         testcarryout [ebp + 24]
         testepilogue
+ENDFUNC
  
  
-       .globl  test_mmla4
-test_mmla4:
-       testprologue
+FUNC(test_mmla4)
+       testprologue [ebp + 48]
         testexpand [ebp + 40], [ebp + 44]
         mov     edi, [ebp + 20]
         testtop [ebp + 32], [ebp + 36], mont
         call    mmla4
         testexpand [ebp + 40], [ebp + 44]
         mov     edi, [ebp + 20]
         testtop [ebp + 32], [ebp + 36], mont
         call    mmla4
-       testtail [ebp + 52], [ebp + 48]
+       testtail [ebp + 52]
         mov     edi, [ebp + 28]
         mov     edi, [ebp + 28]
-       movdqa  xmm0, [esp + 76]
-       movdqa  xmm1, [esp + 92]
+       movdqa  xmm0, [esp + 64]
+       movdqa  xmm1, [esp + 80]
         movdqu  [edi], xmm0
         movdqu  [edi + 16], xmm1
         testcarryout [ebp + 24]
         testepilogue
         movdqu  [edi], xmm0
         movdqu  [edi + 16], xmm1
         testcarryout [ebp + 24]
         testepilogue
+ENDFUNC
  
  
-       .globl  test_mont4
-test_mont4:
-       testprologue
+FUNC(test_mont4)
+       testprologue [ebp + 40]
         testexpand nil, [ebp + 36]
         mov     edi, [ebp + 20]
         testtop nil, [ebp + 32], mont
         call    mont4
         testexpand nil, [ebp + 36]
         mov     edi, [ebp + 20]
         testtop nil, [ebp + 32], mont
         call    mont4
-       testtail [ebp + 44], [ebp + 40]
+       testtail [ebp + 44]
         mov     edi, [ebp + 28]
         mov     edi, [ebp + 28]
-       movdqa  xmm0, [esp + 76]
-       movdqa  xmm1, [esp + 92]
+       movdqa  xmm0, [esp + 64]
+       movdqa  xmm1, [esp + 80]
         movdqu  [edi], xmm0
         movdqu  [edi + 16], xmm1
         testcarryout [ebp + 24]
         testepilogue
         movdqu  [edi], xmm0
         movdqu  [edi + 16], xmm1
         testcarryout [ebp + 24]
         testepilogue
+ENDFUNC
  
  #endif
  
  
  #endif