math/: Delete some unnecessary blank lines.

[catacomb] / math / mpx-mul4-amd64-sse2.S
diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S

index 94befa4..bd8ff2f 100644 (file)
--- a/math/mpx-mul4-amd64-sse2.S
+++ b/math/mpx-mul4-amd64-sse2.S
@@ -25,15 +25,13 @@
  /// MA 02111-1307, USA.
  
  ///--------------------------------------------------------------------------
  /// MA 02111-1307, USA.
  
  ///--------------------------------------------------------------------------
-/// External definitions.
+/// Preliminaries.
  
  #include "config.h"
  #include "asm-common.h"
  
  
  #include "config.h"
  #include "asm-common.h"
  
-///--------------------------------------------------------------------------
-/// Prologue.
-
         .arch   pentium4
         .arch   pentium4
+
         .text
  
  ///--------------------------------------------------------------------------
         .text
  
  ///--------------------------------------------------------------------------
@@ -96,32 +94,32 @@
  .macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
         // Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
         // of the product in registers D0, D1, D2, D3.
  .macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
         // Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
         // of the product in registers D0, D1, D2, D3.
-       pshufd  \d0, \r, SHUF(3, \i, 3, \i) // (r_i, ?, r_i, ?)
+       pshufd  \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?)
    .ifnes "\d1", "nil"
    .ifnes "\d1", "nil"
-       movdqa  \d1, \slo               // (s'_0, s'_1, s''_0, s''_1)
+       movdqa  \d1, \slo               // (s'_0, s'_1; s''_0, s''_1)
    .endif
    .ifnes "\d3", "nil"
    .endif
    .ifnes "\d3", "nil"
-       movdqa  \d3, \shi               // (s'_2, s'_3, s''_2, s''_3)
+       movdqa  \d3, \shi               // (s'_2, s'_3; s''_2, s''_3)
    .endif
    .ifnes "\d1", "nil"
    .endif
    .ifnes "\d1", "nil"
-       psrldq  \d1, 4                  // (s'_1, s''_0, s''_1, 0)
+       psrldq  \d1, 4                  // (s'_1, s''_0; s''_1, 0)
    .endif
    .ifnes "\d2", "nil"
    .endif
    .ifnes "\d2", "nil"
-       movdqa  \d2, \d0                // another copy of (r_i, ?, r_i, ?)
+       movdqa  \d2, \d0                // another copy of (r_i, ?; r_i, ?)
    .endif
    .ifnes "\d3", "nil"
    .endif
    .ifnes "\d3", "nil"
-       psrldq  \d3, 4                  // (s'_3, s''_2, s''_3, 0)
+       psrldq  \d3, 4                  // (s'_3, s''_2; s''_3, 0)
    .endif
    .ifnes "\d1", "nil"
    .endif
    .ifnes "\d1", "nil"
-       pmuludq \d1, \d0                // (r_i s'_1, r_i s''_1)
+       pmuludq \d1, \d0                // (r_i s'_1; r_i s''_1)
    .endif
    .ifnes "\d3", "nil"
    .endif
    .ifnes "\d3", "nil"
-       pmuludq \d3, \d0                // (r_i s'_3, r_i s''_3)
+       pmuludq \d3, \d0                // (r_i s'_3; r_i s''_3)
    .endif
    .ifnes "\d2", "nil"
    .endif
    .ifnes "\d2", "nil"
-       pmuludq \d2, \shi               // (r_i s'_2, r_i s''_2)
+       pmuludq \d2, \shi               // (r_i s'_2; r_i s''_2)
    .endif
    .endif
-       pmuludq \d0, \slo               // (r_i s'_0, r_i s''_0)
+       pmuludq \d0, \slo               // (r_i s'_0; r_i s''_0)
  .endm
  
  .macro accum   c0, c1=nil, c2=nil, c3=nil
  .endm
  
  .macro accum   c0, c1=nil, c2=nil, c3=nil
@@ -163,10 +161,10 @@
         // lane 0 or 1 of D; the high two lanes of D are clobbered.  On
         // completion, XMM3 is clobbered.  If CC is `nil', then the
         // contribution which would have been added to it is left in C.
         // lane 0 or 1 of D; the high two lanes of D are clobbered.  On
         // completion, XMM3 is clobbered.  If CC is `nil', then the
         // contribution which would have been added to it is left in C.
-       pshufd  xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
-       psrldq  xmm3, 12                // (t, 0, 0, 0) = (t, 0)
-       pslldq  xmm3, 2                 // (t b, 0)
-       paddq   \c, xmm3                // (c' + t b, c'')
+       pshufd  xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
+       psrldq  xmm3, 12                // (t, 0; 0, 0) = (t; 0)
+       pslldq  xmm3, 2                 // (t b; 0)
+       paddq   \c, xmm3                // (c' + t b; c'')
    .ifeqs "\pos", "lo"
         movdqa  \d, \c
    .else
    .ifeqs "\pos", "lo"
         movdqa  \d, \c
    .else
@@ -183,37 +181,37 @@
         // of the value represented in C are written at POS in D, and the
         // remaining bits are left at the bottom of T.
         movdqa  \t, \c
         // of the value represented in C are written at POS in D, and the
         // remaining bits are left at the bottom of T.
         movdqa  \t, \c
-       psllq   \t, 16                  // (?, c'' b)
-       pslldq  \c, 8                   // (0, c')
-       paddq   \t, \c                  // (?, c' + c'' b)
-       psrldq  \t, 8                   // c' + c'' b
+       psllq   \t, 16                  // (?; c'' b)
+       pslldq  \c, 8                   // (0; c')
+       paddq   \t, \c                  // (?; c' + c'' b)
+       psrldq  \t, 8                   // (c' + c'' b; 0) = (c; 0)
    .ifeqs "\pos", "lo"
         movdqa  \d, \t
    .else
         punpckldq \d, \t
    .endif
    .ifeqs "\pos", "lo"
         movdqa  \d, \t
    .else
         punpckldq \d, \t
    .endif
-       psrldq  \t, 4                   // floor((c' + c'' b)/B)
+       psrldq  \t, 4                   // (floor(c/B); 0)
  .endm
  
  .macro expand  z, a, b, c=nil, d=nil
         // On entry, A and C hold packed 128-bit values, and Z is zero.  On
         // exit, A:B and C:D together hold the same values in expanded
         // form.  If C is `nil', then only expand A to A:B.
  .endm
  
  .macro expand  z, a, b, c=nil, d=nil
         // On entry, A and C hold packed 128-bit values, and Z is zero.  On
         // exit, A:B and C:D together hold the same values in expanded
         // form.  If C is `nil', then only expand A to A:B.
-       movdqa  \b, \a                  // (a_0, a_1, a_2, a_3)
+       movdqa  \b, \a                  // (a_0, a_1; a_2, a_3)
    .ifnes "\c", "nil"
    .ifnes "\c", "nil"
-       movdqa  \d, \c                  // (c_0, c_1, c_2, c_3)
+       movdqa  \d, \c                  // (c_0, c_1; c_2, c_3)
    .endif
    .endif
-       punpcklwd \a, \z                // (a'_0, a''_0, a'_1, a''_1)
-       punpckhwd \b, \z                // (a'_2, a''_2, a'_3, a''_3)
+       punpcklwd \a, \z                // (a'_0, a''_0; a'_1, a''_1)
+       punpckhwd \b, \z                // (a'_2, a''_2; a'_3, a''_3)
    .ifnes "\c", "nil"
    .ifnes "\c", "nil"
-       punpcklwd \c, \z                // (c'_0, c''_0, c'_1, c''_1)
-       punpckhwd \d, \z                // (c'_2, c''_2, c'_3, c''_3)
+       punpcklwd \c, \z                // (c'_0, c''_0; c'_1, c''_1)
+       punpckhwd \d, \z                // (c'_2, c''_2; c'_3, c''_3)
    .endif
    .endif
-       pshufd  \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
-       pshufd  \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
+       pshufd  \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
+       pshufd  \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
    .ifnes "\c", "nil"
    .ifnes "\c", "nil"
-       pshufd  \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
-       pshufd  \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
+       pshufd  \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
+       pshufd  \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
    .endif
  .endm
  
    .endif
  .endm
  
@@ -229,10 +227,10 @@
         // we can do that, we must gather them together.
         movdqa  \t, \c0
         movdqa  \u, \c1
         // we can do that, we must gather them together.
         movdqa  \t, \c0
         movdqa  \u, \c1
-       punpcklqdq \t, \c2              // (y'_0, y'_2)
-       punpckhqdq \c0, \c2             // (y''_0, y''_2)
-       punpcklqdq \u, \c3              // (y'_1, y'_3)
-       punpckhqdq \c1, \c3             // (y''_1, y''_3)
+       punpcklqdq \t, \c2              // (y'_0; y'_2)
+       punpckhqdq \c0, \c2             // (y''_0; y''_2)
+       punpcklqdq \u, \c3              // (y'_1; y'_3)
+       punpckhqdq \c1, \c3             // (y''_1; y''_3)
  
         // Now split the double-prime pieces.  The high (up to) 48 bits will
         // go up; the low 16 bits go down.
  
         // Now split the double-prime pieces.  The high (up to) 48 bits will
         // go up; the low 16 bits go down.
@@ -240,43 +238,43 @@
         movdqa  \c3, \c1
         psllq   \c2, 48
         psllq   \c3, 48
         movdqa  \c3, \c1
         psllq   \c2, 48
         psllq   \c3, 48
-       psrlq   \c0, 16                 // high parts of (y''_0, y''_2)
-       psrlq   \c1, 16                 // high parts of (y''_1, y''_3)
-       psrlq   \c2, 32                 // low parts of (y''_0, y''_2)
-       psrlq   \c3, 32                 // low parts of (y''_1, y''_3)
+       psrlq   \c0, 16                 // high parts of (y''_0; y''_2)
+       psrlq   \c1, 16                 // high parts of (y''_1; y''_3)
+       psrlq   \c2, 32                 // low parts of (y''_0; y''_2)
+       psrlq   \c3, 32                 // low parts of (y''_1; y''_3)
    .ifnes "\hi", "nil"
         movdqa  \hi, \c1
    .endif
    .ifnes "\hi", "nil"
         movdqa  \hi, \c1
    .endif
-       pslldq  \c1, 8                  // high part of (0, y''_1)
+       pslldq  \c1, 8                  // high part of (0; y''_1)
  
         paddq   \t, \c2                 // propagate down
         paddq   \u, \c3
  
         paddq   \t, \c2                 // propagate down
         paddq   \u, \c3
-       paddq   \t, \c1                 // and up: (y_0, y_2)
-       paddq   \u, \c0                 // (y_1, y_3)
+       paddq   \t, \c1                 // and up: (y_0; y_2)
+       paddq   \u, \c0                 // (y_1; y_3)
    .ifnes "\hi", "nil"
    .ifnes "\hi", "nil"
-       psrldq  \hi, 8                  // high part of (y''_3, 0)
+       psrldq  \hi, 8                  // high part of (y''_3; 0)
    .endif
  
         // Finally extract the answer.  This complicated dance is better than
         // storing to memory and loading, because the piecemeal stores
         // inhibit store forwarding.
    .endif
  
         // Finally extract the answer.  This complicated dance is better than
         // storing to memory and loading, because the piecemeal stores
         // inhibit store forwarding.
-       movdqa  \c3, \t                 // (y_0, y_1)
-       movdqa  \lo, \t                 // (y^*_0, ?, ?, ?)
-       psrldq  \t, 8                   // (y_2, 0)
-       psrlq   \c3, 32                 // (floor(y_0/B), ?)
-       paddq   \c3, \u                 // (y_1 + floor(y_0/B), ?)
-       movdqa  \c1, \c3                // (y^*_1, ?, ?, ?)
-       psrldq  \u, 8                   // (y_3, 0)
-       psrlq   \c3, 32                 // (floor((y_1 B + y_0)/B^2, ?)
-       paddq   \c3, \t                 // (y_2 + floor((y_1 B + y_0)/B^2, ?)
-       punpckldq \lo, \c3              // (y^*_0, y^*_2, ?, ?)
-       psrlq   \c3, 32             // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
-       paddq   \c3, \u       // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
+       movdqa  \c3, \t                 // (y_0; ?)
+       movdqa  \lo, \t                 // (y^*_0, ?; ?, ?)
+       psrldq  \t, 8                   // (y_2; 0)
+       psrlq   \c3, 32                 // (floor(y_0/B); ?)
+       paddq   \c3, \u                 // (y_1 + floor(y_0/B); ?)
+       movdqa  \c1, \c3                // (y^*_1, ?; ?, ?)
+       psrldq  \u, 8                   // (y_3; 0)
+       psrlq   \c3, 32                 // (floor((y_1 B + y_0)/B^2; ?)
+       paddq   \c3, \t                 // (y_2 + floor((y_1 B + y_0)/B^2; ?)
+       punpckldq \lo, \c3              // (y^*_0, y^*_2; ?, ?)
+       psrlq   \c3, 32             // (floor((y_2 B^2 + y_1 B + y_0)/B^3; ?)
+       paddq   \c3, \u       // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3; ?)
    .ifnes "\hi", "nil"
         movdqa  \t, \c3
         pxor    \u, \u
    .endif
    .ifnes "\hi", "nil"
         movdqa  \t, \c3
         pxor    \u, \u
    .endif
-       punpckldq \c1, \c3              // (y^*_1, y^*_3, ?, ?)
+       punpckldq \c1, \c3              // (y^*_1, y^*_3; ?, ?)
    .ifnes "\hi", "nil"
         psrlq   \t, 32                  // very high bits of y
         paddq   \hi, \t
    .ifnes "\hi", "nil"
         psrlq   \t, 32                  // very high bits of y
         paddq   \hi, \t
@@ -293,13 +291,13 @@
         // On exit, the carry registers, including XMM15, are updated to hold
         // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered.  The other
         // registers are preserved.
         // On exit, the carry registers, including XMM15, are updated to hold
         // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered.  The other
         // registers are preserved.
-       movd    xmm0, [rdi +  0]        // (a_0, 0)
-       movd    xmm1, [rdi +  4]        // (a_1, 0)
-       movd    xmm2, [rdi +  8]        // (a_2, 0)
-       movd    xmm15, [rdi + 12]       // (a_3, 0)
-       paddq   xmm12, xmm0             // (c'_0 + a_0, c''_0)
-       paddq   xmm13, xmm1             // (c'_1 + a_1, c''_1)
-       paddq   xmm14, xmm2             // (c'_2 + a_2, c''_2 + a_3 b)
+       movd    xmm0, [rdi +  0]        // (a_0; 0)
+       movd    xmm1, [rdi +  4]        // (a_1; 0)
+       movd    xmm2, [rdi +  8]        // (a_2; 0)
+       movd    xmm15, [rdi + 12]       // (a_3; 0)
+       paddq   xmm12, xmm0             // (c'_0 + a_0; c''_0)
+       paddq   xmm13, xmm1             // (c'_1 + a_1; c''_1)
+       paddq   xmm14, xmm2             // (c'_2 + a_2; c''_2 + a_3 b)
  .endm
  
  ///--------------------------------------------------------------------------
  .endm
  
  ///--------------------------------------------------------------------------
@@ -321,7 +319,6 @@ INTFUNC(carryprop)
         movdqu  [rdi], xmm0
  
         ret
         movdqu  [rdi], xmm0
  
         ret
-
  ENDFUNC
  
  INTFUNC(dmul4)
  ENDFUNC
  
  INTFUNC(dmul4)
@@ -359,7 +356,6 @@ INTFUNC(dmul4)
         movdqu  [rdi], xmm6
  
         ret
         movdqu  [rdi], xmm6
  
         ret
-
  ENDFUNC
  
  INTFUNC(dmla4)
  ENDFUNC
  
  INTFUNC(dmla4)
@@ -400,7 +396,6 @@ INTFUNC(dmla4)
         movdqu  [rdi], xmm6
  
         ret
         movdqu  [rdi], xmm6
  
         ret
-
  ENDFUNC
  
  INTFUNC(mul4zc)
  ENDFUNC
  
  INTFUNC(mul4zc)
@@ -431,7 +426,6 @@ INTFUNC(mul4zc)
         movdqu  [rdi], xmm6
  
         ret
         movdqu  [rdi], xmm6
  
         ret
-
  ENDFUNC
  
  INTFUNC(mul4)
  ENDFUNC
  
  INTFUNC(mul4)
@@ -464,7 +458,6 @@ INTFUNC(mul4)
         movdqu  [rdi], xmm6
  
         ret
         movdqu  [rdi], xmm6
  
         ret
-
  ENDFUNC
  
  INTFUNC(mla4zc)
  ENDFUNC
  
  INTFUNC(mla4zc)
@@ -500,7 +493,6 @@ INTFUNC(mla4zc)
         movdqu  [rdi], xmm6
  
         ret
         movdqu  [rdi], xmm6
  
         ret
-
  ENDFUNC
  
  INTFUNC(mla4)
  ENDFUNC
  
  INTFUNC(mla4)
@@ -535,7 +527,6 @@ INTFUNC(mla4)
         movdqu  [rdi], xmm6
  
         ret
         movdqu  [rdi], xmm6
  
         ret
-
  ENDFUNC
  
  INTFUNC(mmul4)
  ENDFUNC
  
  INTFUNC(mmul4)
@@ -559,7 +550,6 @@ INTFUNC(mmul4)
         mulcore xmm4, 0,   xmm8,  xmm9,  xmm12, xmm13, xmm14, xmm15
         propout xmm7, lo,                xmm12, xmm13
         jmp     5f
         mulcore xmm4, 0,   xmm8,  xmm9,  xmm12, xmm13, xmm14, xmm15
         propout xmm7, lo,                xmm12, xmm13
         jmp     5f
-
  ENDFUNC
  
  INTFUNC(mmla4)
  ENDFUNC
  
  INTFUNC(mmla4)
@@ -577,10 +567,10 @@ INTFUNC(mmla4)
         movdqu  xmm4, [rax]
  #if ABI_WIN
         stalloc 48 + 8                  // space for the carries
         movdqu  xmm4, [rax]
  #if ABI_WIN
         stalloc 48 + 8                  // space for the carries
-#  define STKTMP(i) [rsp + i]
+#  define STKTMP(i) [SP + i]
  #endif
  #if ABI_SYSV
  #endif
  #if ABI_SYSV
-#  define STKTMP(i) [rsp + i - 48 - 8] // use red zone
+#  define STKTMP(i) [SP + i - 48 - 8]  // use red zone
  #endif
    endprologue
  
  #endif
    endprologue
  
@@ -621,8 +611,8 @@ INTFUNC(mmla4)
         mulcore xmm7, 1,   xmm10, xmm11, xmm0,  xmm1,  xmm2
         accum                            xmm4,  xmm5,  xmm6
  
         mulcore xmm7, 1,   xmm10, xmm11, xmm0,  xmm1,  xmm2
         accum                            xmm4,  xmm5,  xmm6
  
-       punpckldq xmm12, xmm15          // (w_0, 0, w_1, 0)
-       punpckhdq xmm14, xmm15          // (w_2, 0, w_3, 0)
+       punpckldq xmm12, xmm15          // (w_0, 0; w_1, 0)
+       punpckhdq xmm14, xmm15          // (w_2, 0; w_3, 0)
  
         mulcore xmm7, 2,   xmm10, xmm11, xmm0,  xmm1
         accum                            xmm5,  xmm6
  
         mulcore xmm7, 2,   xmm10, xmm11, xmm0,  xmm1
         accum                            xmm5,  xmm6
@@ -634,10 +624,10 @@ INTFUNC(mmla4)
         mulcore xmm7, 3,   xmm10, xmm11, xmm0
         accum                            xmm6
  
         mulcore xmm7, 3,   xmm10, xmm11, xmm0
         accum                            xmm6
  
-       punpckldq xmm12, xmm2           // (w_0, 0, 0, 0)
-       punpckldq xmm14, xmm2           // (w_2, 0, 0, 0)
-       punpckhdq xmm13, xmm2           // (w_1, 0, 0, 0)
-       punpckhdq xmm15, xmm2           // (w_3, 0, 0, 0)
+       punpckldq xmm12, xmm2           // (w_0, 0; 0, 0)
+       punpckldq xmm14, xmm2           // (w_2, 0; 0, 0)
+       punpckhdq xmm13, xmm2           // (w_1, 0; 0, 0)
+       punpckhdq xmm15, xmm2           // (w_3, 0; 0, 0)
  
         // That's lots of pieces.  Now we have to assemble the answer.
         squash  xmm3, xmm4, xmm5, xmm6,  xmm0, xmm1,  xmm10
  
         // That's lots of pieces.  Now we have to assemble the answer.
         squash  xmm3, xmm4, xmm5, xmm6,  xmm0, xmm1,  xmm10
@@ -703,8 +693,8 @@ INTFUNC(mont4)
         mulcore xmm7, 1,   xmm8,  xmm9,  xmm0,  xmm1,  xmm2
         accum                            xmm4,  xmm5,  xmm6
  
         mulcore xmm7, 1,   xmm8,  xmm9,  xmm0,  xmm1,  xmm2
         accum                            xmm4,  xmm5,  xmm6
  
-       punpckldq xmm12, xmm15          // (w_0, 0, w_1, 0)
-       punpckhdq xmm14, xmm15          // (w_2, 0, w_3, 0)
+       punpckldq xmm12, xmm15          // (w_0, 0; w_1, 0)
+       punpckhdq xmm14, xmm15          // (w_2, 0; w_3, 0)
  
         mulcore xmm7, 2,   xmm8,  xmm9,  xmm0,  xmm1
         accum                            xmm5,  xmm6
  
         mulcore xmm7, 2,   xmm8,  xmm9,  xmm0,  xmm1
         accum                            xmm5,  xmm6
@@ -716,10 +706,10 @@ INTFUNC(mont4)
         mulcore xmm7, 3,   xmm8,  xmm9,  xmm0
         accum                            xmm6
  
         mulcore xmm7, 3,   xmm8,  xmm9,  xmm0
         accum                            xmm6
  
-       punpckldq xmm12, xmm2           // (w_0, 0, 0, 0)
-       punpckldq xmm14, xmm2           // (w_2, 0, 0, 0)
-       punpckhdq xmm13, xmm2           // (w_1, 0, 0, 0)
-       punpckhdq xmm15, xmm2           // (w_3, 0, 0, 0)
+       punpckldq xmm12, xmm2           // (w_0, 0; 0, 0)
+       punpckldq xmm14, xmm2           // (w_2, 0; 0, 0)
+       punpckhdq xmm13, xmm2           // (w_1, 0; 0, 0)
+       punpckhdq xmm15, xmm2           // (w_3, 0; 0, 0)
  
         // That's lots of pieces.  Now we have to assemble the answer.
         squash  xmm3, xmm4, xmm5, xmm6,  xmm0, xmm1,  xmm10
  
         // That's lots of pieces.  Now we have to assemble the answer.
         squash  xmm3, xmm4, xmm5, xmm6,  xmm0, xmm1,  xmm10
@@ -746,12 +736,18 @@ INTFUNC(mont4)
         // And, with that, we're done.
         movdqu  [rdi], xmm6
         ret
         // And, with that, we're done.
         movdqu  [rdi], xmm6
         ret
-
  ENDFUNC
  
  ///--------------------------------------------------------------------------
  /// Bulk multipliers.
  
  ENDFUNC
  
  ///--------------------------------------------------------------------------
  /// Bulk multipliers.
  
+FUNC(mpx_umul4_amd64_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       .arch   pentium4
+ENDFUNC
+
  FUNC(mpx_umul4_amd64_sse2)
         // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
         //                         const mpw *bv, const mpw *bvl);
  FUNC(mpx_umul4_amd64_sse2)
         // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
         //                         const mpw *bv, const mpw *bvl);
@@ -778,7 +774,6 @@ FUNC(mpx_umul4_amd64_sse2)
    endprologue
  
         mov     DV, rdi
    endprologue
  
         mov     DV, rdi
-
  #endif
  
  #if ABI_WIN
  #endif
  
  #if ABI_WIN
@@ -806,8 +801,7 @@ FUNC(mpx_umul4_amd64_sse2)
    endprologue
  
         mov     rdi, DV
    endprologue
  
         mov     rdi, DV
-       mov     BVL, [rsp + 224]
-
+       mov     BVL, [SP + 224]
  #endif
  
         // Prepare for the first iteration.
  #endif
  
         // Prepare for the first iteration.
@@ -873,7 +867,6 @@ FUNC(mpx_umul4_amd64_sse2)
  #endif
  
  #if ABI_WIN
  #endif
  
  #if ABI_WIN
-
         rstrxmm xmm6,    0
         rstrxmm xmm7,   16
         rstrxmm xmm8,   32
         rstrxmm xmm6,    0
         rstrxmm xmm7,   16
         rstrxmm xmm8,   32
@@ -888,7 +881,6 @@ FUNC(mpx_umul4_amd64_sse2)
         stfree  160 + 8
         popreg  rdi
         popreg  rbx
         stfree  160 + 8
         popreg  rdi
         popreg  rbx
-
  #endif
  
         ret
  #endif
  
         ret
@@ -901,6 +893,13 @@ FUNC(mpx_umul4_amd64_sse2)
  
  ENDFUNC
  
  
  ENDFUNC
  
+FUNC(mpxmont_mul4_amd64_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       .arch   pentium4
+ENDFUNC
+
  FUNC(mpxmont_mul4_amd64_sse2)
         // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
         //                           const mpw *nv, size_t n, const mpw *mi);
  FUNC(mpxmont_mul4_amd64_sse2)
         // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
         //                           const mpw *nv, size_t n, const mpw *mi);
@@ -934,7 +933,6 @@ FUNC(mpxmont_mul4_amd64_sse2)
    endprologue
  
         mov     DV, rdi
    endprologue
  
         mov     DV, rdi
-
  #endif
  
  #if ABI_WIN
  #endif
  
  #if ABI_WIN
@@ -966,9 +964,8 @@ FUNC(mpxmont_mul4_amd64_sse2)
    endprologue
  
         mov     rdi, DV
    endprologue
  
         mov     rdi, DV
-       mov     N, [rsp + 224]
-       mov     MI, [rsp + 232]
-
+       mov     N, [SP + 224]
+       mov     MI, [SP + 232]
  #endif
  
         // Establish the expanded operands.
  #endif
  
         // Establish the expanded operands.
@@ -1050,7 +1047,6 @@ FUNC(mpxmont_mul4_amd64_sse2)
  #endif
  
  #if ABI_WIN
  #endif
  
  #if ABI_WIN
-
         rstrxmm xmm6,    0
         rstrxmm xmm7,   16
         rstrxmm xmm8,   32
         rstrxmm xmm6,    0
         rstrxmm xmm7,   16
         rstrxmm xmm8,   32
@@ -1066,7 +1062,6 @@ FUNC(mpxmont_mul4_amd64_sse2)
         popreg  r12
         popreg  rdi
         popreg  rbx
         popreg  r12
         popreg  rdi
         popreg  rbx
-
  #endif
  
         ret
  #endif
  
         ret
@@ -1095,6 +1090,13 @@ FUNC(mpxmont_mul4_amd64_sse2)
  
  ENDFUNC
  
  
  ENDFUNC
  
+FUNC(mpxmont_redc4_amd64_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       .arch   pentium4
+ENDFUNC
+
  FUNC(mpxmont_redc4_amd64_sse2)
         // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
         //                             size_t n, const mpw *mi);
  FUNC(mpxmont_redc4_amd64_sse2)
         // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
         //                             size_t n, const mpw *mi);
@@ -1115,7 +1117,6 @@ FUNC(mpxmont_redc4_amd64_sse2)
         // c                    rcx     r9
  
  #if ABI_SYSV
         // c                    rcx     r9
  
  #if ABI_SYSV
-
  #  define DVL rax
  #  define DVL4 rsi
  #  define MI r8
  #  define DVL rax
  #  define DVL4 rsi
  #  define MI r8
@@ -1130,11 +1131,9 @@ FUNC(mpxmont_redc4_amd64_sse2)
    endprologue
  
         mov     DV, rdi
    endprologue
  
         mov     DV, rdi
-
  #endif
  
  #if ABI_WIN
  #endif
  
  #if ABI_WIN
-
  #  define DVL rax
  #  define DVL4 rdx
  #  define MI r10
  #  define DVL rax
  #  define DVL4 rdx
  #  define MI r10
@@ -1164,8 +1163,7 @@ FUNC(mpxmont_redc4_amd64_sse2)
    endprologue
  
         mov     rdi, DV
    endprologue
  
         mov     rdi, DV
-       mov     MI, [rsp + 224]
-
+       mov     MI, [SP + 224]
  #endif
  
         // Establish the expanded operands and the blocks-of-4 dv limit.
  #endif
  
         // Establish the expanded operands and the blocks-of-4 dv limit.
@@ -1248,7 +1246,6 @@ FUNC(mpxmont_redc4_amd64_sse2)
  #endif
  
  #if ABI_WIN
  #endif
  
  #if ABI_WIN
-
         rstrxmm xmm6,    0
         rstrxmm xmm7,   16
         rstrxmm xmm8,   32
         rstrxmm xmm6,    0
         rstrxmm xmm7,   16
         rstrxmm xmm8,   32
@@ -1264,7 +1261,6 @@ FUNC(mpxmont_redc4_amd64_sse2)
         popreg  r12
         popreg  rdi
         popreg  rbx
         popreg  r12
         popreg  rdi
         popreg  rbx
-
  #endif
  
         ret
  #endif
  
         ret
@@ -1310,7 +1306,7 @@ ENDFUNC
  #  define ARG8 STKARG(4)
  #  define STKARG_OFFSET 224
  #endif
  #  define ARG8 STKARG(4)
  #  define STKARG_OFFSET 224
  #endif
-#define STKARG(i) [rsp + STKARG_OFFSET + 8*(i)]
+#define STKARG(i) [SP + STKARG_OFFSET + 8*(i)]
  
  //               sysv                          win
  //               dmul  smul  mmul  mont        dmul  smul  mmul  mont
  
  //               sysv                          win
  //               dmul  smul  mmul  mont        dmul  smul  mmul  mont
@@ -1474,9 +1470,9 @@ ENDFUNC
  .endm
  
  .macro testldcarry
  .endm
  
  .macro testldcarry
-       movdqu  xmm12, [rcx +  0]       // (c'_0, c''_0)
-       movdqu  xmm13, [rcx + 16]       // (c'_1, c''_1)
-       movdqu  xmm14, [rcx + 32]       // (c'_2, c''_2)
+       movdqu  xmm12, [rcx +  0]       // (c'_0; c''_0)
+       movdqu  xmm13, [rcx + 16]       // (c'_1; c''_1)
+       movdqu  xmm14, [rcx + 32]       // (c'_2; c''_2)
  .endm
  
  .macro testtop u=nil
  .endm
  
  .macro testtop u=nil