X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/0c9ebe471cfa8343f2ac5d8bd206870f82e87837..e830bb692041c75eb29b8c511db21af81b3aae2d:/math/mpx.c

diff --git a/math/mpx.c b/math/mpx.c
index 2745fe0f..e759c5f2 100644
--- a/math/mpx.c
+++ b/math/mpx.c
@@ -27,6 +27,8 @@
 
 /*----- Header files ------------------------------------------------------*/
 
+#include "config.h"
+
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -35,6 +37,7 @@
 #include <mLib/bits.h>
 #include <mLib/macros.h>
 
+#include "dispatch.h"
 #include "mptypes.h"
 #include "mpx.h"
 #include "bitops.h"
@@ -845,8 +848,13 @@ void mpx_usubnlsl(mpw *dv, mpw *dvl, mpw a, unsigned o)
  *		vectors in any way.
  */
 
-void mpx_umul(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
-	      const mpw *bv, const mpw *bvl)
+CPU_DISPATCH(EMPTY, (void), void, mpx_umul, /* presumably defines mpx_umul */
+	     (mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
+	      const mpw *bv, const mpw *bvl),	/* parameter list */
+	     (dv, dvl, av, avl, bv, bvl), pick_umul, simple_umul);
+
+static void simple_umul(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
+			const mpw *bv, const mpw *bvl)
 {
   /* --- This is probably worthwhile on a multiply --- */
 
@@ -885,6 +893,44 @@ void mpx_umul(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
   }
 }
 
+#define MAYBE_UMUL4(impl) /* define maybe_umul4_<impl> wrapper */	\
+  extern void mpx_umul4_##impl(mpw */*dv*/,				\
+			       const mpw */*av*/, const mpw */*avl*/,	\
+			       const mpw */*bv*/, const mpw */*bvl*/);	\
+  static void maybe_umul4_##impl(mpw *dv, mpw *dvl,			\
+				 const mpw *av, const mpw *avl,		\
+				 const mpw *bv, const mpw *bvl)		\
+  { /* fast path iff an, bn nonzero multiples of 4 and dn >= an + bn */	\
+    size_t an = avl - av, bn = bvl - bv, dn = dvl - dv;			\
+    if (!an || an%4 != 0 || !bn || bn%4 != 0 || dn < an + bn)		\
+      simple_umul(dv, dvl, av, avl, bv, bvl); /* generic fallback */	\
+    else {								\
+      mpx_umul4_##impl(dv, av, avl, bv, bvl); /* external kernel */	\
+      MPX_ZERO(dv + an + bn, dvl); /* presumably zeroes dv's tail */	\
+    }									\
+  }
+
+#if CPUFAM_X86
+  MAYBE_UMUL4(x86_sse2)		/* defines maybe_umul4_x86_sse2 */
+#endif /* CPUFAM_X86 */
+
+#if CPUFAM_AMD64
+  MAYBE_UMUL4(amd64_sse2)	/* defines maybe_umul4_amd64_sse2 */
+#endif /* CPUFAM_AMD64 */
+
+static mpx_umul__functype *pick_umul(void)
+{ /* chooser passed to CPU_DISPATCH; DISPATCH_PICK_* presumably return */
+#if CPUFAM_X86
+  DISPATCH_PICK_COND(mpx_umul, maybe_umul4_x86_sse2,
+		     cpu_feature_p(CPUFEAT_X86_SSE2)); /* condition: SSE2 */
+#endif /* CPUFAM_X86 */
+#if CPUFAM_AMD64
+  DISPATCH_PICK_COND(mpx_umul, maybe_umul4_amd64_sse2,
+		     cpu_feature_p(CPUFEAT_X86_SSE2)); /* same X86 flag */
+#endif /* CPUFAM_AMD64 */
+  DISPATCH_PICK_FALLBACK(mpx_umul, simple_umul); /* unconditional default */
+}
+
 /* --- @mpx_umuln@ --- *
  *
  * Arguments:	@mpw *dv, *dvl@ = destination vector base and limit
@@ -1220,6 +1266,7 @@ mpw mpx_udivn(mpw *qv, mpw *qvl, const mpw *rv, const mpw *rvl, mpw d)
   size_t _sz = (sz);							\
   mpw *_vv = xmalloc(MPWS(_sz));					\
   mpw *_vvl = _vv + _sz;						\
+  memset(_vv, 0xa5, MPWS(_sz));						\
   (v) = _vv;								\
   (vl) = _vvl;								\
 } while (0)