// Finally extract the answer. This complicated dance is better than
// storing to memory and loading, because the piecemeal stores
// inhibit store forwarding.
- movdqa \c3, \t // (y_0, y_1)
+ movdqa \c3, \t // (y_0, ?)
movdqa \lo, \t // (y^*_0, ?, ?, ?)
psrldq \t, 8 // (y_2, 0)
psrlq \c3, 32 // (floor(y_0/B), ?)
///--------------------------------------------------------------------------
/// Bulk multipliers.
+FUNC(mpx_umul4_x86_avx)
+ .arch .avx
+ vzeroupper
+ endprologue
+ // and drop through...
+ .arch pentium4
+ENDFUNC
+
FUNC(mpx_umul4_x86_sse2)
// void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
// const mpw *bv, const mpw *bvl);
ENDFUNC
+FUNC(mpxmont_mul4_x86_avx)
+ .arch .avx
+ vzeroupper
+ endprologue
+ // and drop through...
+ .arch pentium4
+ENDFUNC
+
FUNC(mpxmont_mul4_x86_sse2)
// void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
// const mpw *nv, size_t n, const mpw *mi);
ENDFUNC
+FUNC(mpxmont_redc4_x86_avx)
+ .arch .avx
+ vzeroupper
+ endprologue
+ // and drop through...
+ .arch pentium4
+ENDFUNC
+
FUNC(mpxmont_redc4_x86_sse2)
// void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
// size_t n, const mpw *mi);