X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/0c9ebe471cfa8343f2ac5d8bd206870f82e87837..e830bb692041c75eb29b8c511db21af81b3aae2d:/math/mpx.c diff --git a/math/mpx.c b/math/mpx.c index 2745fe0f..e759c5f2 100644 --- a/math/mpx.c +++ b/math/mpx.c @@ -27,6 +27,8 @@ /*----- Header files ------------------------------------------------------*/ +#include "config.h" + #include #include #include @@ -35,6 +37,7 @@ #include #include +#include "dispatch.h" #include "mptypes.h" #include "mpx.h" #include "bitops.h" @@ -845,8 +848,13 @@ void mpx_usubnlsl(mpw *dv, mpw *dvl, mpw a, unsigned o) * vectors in any way. */ -void mpx_umul(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl, - const mpw *bv, const mpw *bvl) +CPU_DISPATCH(EMPTY, (void), void, mpx_umul, + (mpw *dv, mpw *dvl, const mpw *av, const mpw *avl, + const mpw *bv, const mpw *bvl), + (dv, dvl, av, avl, bv, bvl), pick_umul, simple_umul); + +static void simple_umul(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl, + const mpw *bv, const mpw *bvl) { /* --- This is probably worthwhile on a multiply --- */ @@ -885,6 +893,44 @@ void mpx_umul(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl, } } +#define MAYBE_UMUL4(impl) \ + extern void mpx_umul4_##impl(mpw */*dv*/, \ + const mpw */*av*/, const mpw */*avl*/, \ + const mpw */*bv*/, const mpw */*bvl*/); \ + static void maybe_umul4_##impl(mpw *dv, mpw *dvl, \ + const mpw *av, const mpw *avl, \ + const mpw *bv, const mpw *bvl) \ + { \ + size_t an = avl - av, bn = bvl - bv, dn = dvl - dv; \ + if (!an || an%4 != 0 || !bn || bn%4 != 0 || dn < an + bn) \ + simple_umul(dv, dvl, av, avl, bv, bvl); \ + else { \ + mpx_umul4_##impl(dv, av, avl, bv, bvl); \ + MPX_ZERO(dv + an + bn, dvl); \ + } \ + } + +#if CPUFAM_X86 + MAYBE_UMUL4(x86_sse2) +#endif + +#if CPUFAM_AMD64 + MAYBE_UMUL4(amd64_sse2) +#endif + +static mpx_umul__functype *pick_umul(void) +{ +#if CPUFAM_X86 + DISPATCH_PICK_COND(mpx_umul, maybe_umul4_x86_sse2, + cpu_feature_p(CPUFEAT_X86_SSE2)); +#endif +#if CPUFAM_AMD64 + DISPATCH_PICK_COND(mpx_umul, maybe_umul4_amd64_sse2, + cpu_feature_p(CPUFEAT_X86_SSE2)); +#endif + DISPATCH_PICK_FALLBACK(mpx_umul, simple_umul); +} + /* --- @mpx_umuln@ --- * * * Arguments: @mpw *dv, *dvl@ = destination vector base and limit @@ -1220,6 +1266,7 @@ mpw mpx_udivn(mpw *qv, mpw *qvl, const mpw *rv, const mpw *rvl, mpw d) size_t _sz = (sz); \ mpw *_vv = xmalloc(MPWS(_sz)); \ mpw *_vvl = _vv + _sz; \ + memset(_vv, 0xa5, MPWS(_sz)); \ (v) = _vv; \ (vl) = _vvl; \ } while (0)