math/mpx-mul4-*-sse2.S (squash): We don't care about the top half of c3 here.

[catacomb] / math / mpx-mul4-x86-sse2.S
diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S

index f6c8167..ee741d2 100644 (file)
--- a/math/mpx-mul4-x86-sse2.S
+++ b/math/mpx-mul4-x86-sse2.S
@@ -260,7 +260,7 @@
         // Finally extract the answer.  This complicated dance is better than
         // storing to memory and loading, because the piecemeal stores
         // inhibit store forwarding.
-       movdqa  \c3, \t                 // (y_0, y_1)
+       movdqa  \c3, \t                 // (y_0, ?)
         movdqa  \lo, \t                 // (y^*_0, ?, ?, ?)
         psrldq  \t, 8                   // (y_2, 0)
         psrlq   \c3, 32                 // (floor(y_0/B), ?)
@@ -678,6 +678,14 @@ ENDFUNC
  ///--------------------------------------------------------------------------
  /// Bulk multipliers.
  
+FUNC(mpx_umul4_x86_avx)                // AVX entry point: issue vzeroupper, then run the SSE2 multiplier
+       .arch   .avx                    // allow the assembler to accept the AVX instruction below
+       vzeroupper                      // zero the upper halves of the YMM registers (presumably to avoid AVX/SSE transition stalls -- confirm)
+  endprologue
+       // and drop through into mpx_umul4_x86_sse2, which follows directly
+       .arch   pentium4                // back to the pentium4 instruction set for the code that follows
+ENDFUNC
+
  FUNC(mpx_umul4_x86_sse2)
         // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
         //                         const mpw *bv, const mpw *bvl);
@@ -778,6 +786,14 @@ FUNC(mpx_umul4_x86_sse2)
  
  ENDFUNC
  
+FUNC(mpxmont_mul4_x86_avx)             // AVX entry point: issue vzeroupper, then run the SSE2 Montgomery multiplier
+       .arch   .avx                    // allow the assembler to accept the AVX instruction below
+       vzeroupper                      // zero the upper halves of the YMM registers (presumably to avoid AVX/SSE transition stalls -- confirm)
+  endprologue
+       // and drop through into mpxmont_mul4_x86_sse2, which follows directly
+       .arch   pentium4                // back to the pentium4 instruction set for the code that follows
+ENDFUNC
+
  FUNC(mpxmont_mul4_x86_sse2)
         // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
         //                           const mpw *nv, size_t n, const mpw *mi);
@@ -919,6 +935,14 @@ FUNC(mpxmont_mul4_x86_sse2)
  
  ENDFUNC
  
+FUNC(mpxmont_redc4_x86_avx)            // AVX entry point: issue vzeroupper, then run the SSE2 Montgomery reduction
+       .arch   .avx                    // allow the assembler to accept the AVX instruction below
+       vzeroupper                      // zero the upper halves of the YMM registers (presumably to avoid AVX/SSE transition stalls -- confirm)
+  endprologue
+       // and drop through into mpxmont_redc4_x86_sse2, which follows directly
+       .arch   pentium4                // back to the pentium4 instruction set for the code that follows
+ENDFUNC
+
  FUNC(mpxmont_redc4_x86_sse2)
         // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
         //                             size_t n, const mpw *mi);