/* CPUID leaf-1 feature bits: `1D' names live in EDX, `1C' names in ECX. */
# define CPUID1D_SSE2 (1u << 26) /* EDX bit 26: SSE2 instructions */
# define CPUID1D_FXSR (1u << 24) /* EDX bit 24: FXSAVE/FXRSTOR */
# define CPUID1C_AESNI (1u << 25) /* ECX bit 25: AES-NI instructions */
+# define CPUID1C_AVX (1u << 28) /* ECX bit 28: AVX (256-bit YMM regs) */
# define CPUID1C_RDRAND (1u << 30) /* ECX bit 30: RDRAND entropy source */
/* Raw CPUID output: the EAX, EBX, ECX, EDX result registers. */
struct cpuid { unsigned a, b, c, d; };
cpuid_features_p(CPUID1D_SSE2, CPUID1C_AESNI));
CASE_CPUFEAT(X86_RDRAND, "x86:rdrand",
cpuid_features_p(0, CPUID1C_RDRAND));
+ CASE_CPUFEAT(X86_AVX, "x86:avx",
+ xmm_registers_available_p() &&
+ cpuid_features_p(0, CPUID1C_AVX));
#endif
#ifdef CAPMAP
# define FEATP__CASE(feat, tok) \
CPUFEAT_ARM_V4, /* VFPv4 and/or SIMD v2 */
CPUFEAT_ARM_D32, /* 32 double registers, not 16 */
CPUFEAT_X86_RDRAND, /* Built-in entropy source */
- CPUFEAT_ARM_AES /* AES instructions */
+ CPUFEAT_ARM_AES, /* AES instructions */
+ CPUFEAT_X86_AVX /* AVX 1 (i.e., 256-bit YMM regs) */
};
extern int cpu_feature_p(int /*feat*/);
#if CPUFAM_X86
MAYBE_REDC4(x86_sse2)
+ MAYBE_REDC4(x86_avx)
#endif
#if CPUFAM_AMD64
MAYBE_REDC4(amd64_sse2)
+ MAYBE_REDC4(amd64_avx)
#endif
static redccore__functype *pick_redccore(void)
{
#if CPUFAM_X86
+ DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_x86_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX));
DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_x86_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif
#if CPUFAM_AMD64
+ DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_amd64_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX));
DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_amd64_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif
#if CPUFAM_X86
MAYBE_MUL4(x86_sse2)
+ MAYBE_MUL4(x86_avx)
#endif
#if CPUFAM_AMD64
MAYBE_MUL4(amd64_sse2)
+ MAYBE_MUL4(amd64_avx)
#endif
static mulcore__functype *pick_mulcore(void)
{
#if CPUFAM_X86
+ DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_x86_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX));
DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_x86_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif
#if CPUFAM_AMD64
+ DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_amd64_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX));
DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_amd64_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif
///--------------------------------------------------------------------------
/// Bulk multipliers.
+// AVX entry point for the bulk multiplier: issue `vzeroupper' to clear
+// the upper YMM lanes (avoiding SSE/AVX transition penalties) and fall
+// straight through into the SSE2 implementation which follows.
+FUNC(mpx_umul4_amd64_avx)
+	.arch .avx
+	vzeroupper
+	endprologue
+	// and drop through...
+	.arch pentium4
+ENDFUNC
+
FUNC(mpx_umul4_amd64_sse2)
// void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
// const mpw *bv, const mpw *bvl);
ENDFUNC
+// AVX entry point for the Montgomery multiplier: `vzeroupper' dodges
+// SSE/AVX transition stalls, then control falls through into the SSE2
+// implementation which follows.
+FUNC(mpxmont_mul4_amd64_avx)
+	.arch .avx
+	vzeroupper
+	endprologue
+	// and drop through...
+	.arch pentium4
+ENDFUNC
+
FUNC(mpxmont_mul4_amd64_sse2)
// void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
// const mpw *nv, size_t n, const mpw *mi);
ENDFUNC
+// AVX entry point for Montgomery reduction: `vzeroupper' dodges
+// SSE/AVX transition stalls, then control falls through into the SSE2
+// implementation which follows.
+FUNC(mpxmont_redc4_amd64_avx)
+	.arch .avx
+	vzeroupper
+	endprologue
+	// and drop through...
+	.arch pentium4
+ENDFUNC
+
FUNC(mpxmont_redc4_amd64_sse2)
// void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
// size_t n, const mpw *mi);
///--------------------------------------------------------------------------
/// Bulk multipliers.
+// AVX entry point for the bulk multiplier (32-bit): clear the upper
+// YMM lanes with `vzeroupper' to avoid SSE/AVX transition penalties,
+// then drop through into the SSE2 code below.
+FUNC(mpx_umul4_x86_avx)
+	.arch .avx
+	vzeroupper
+	endprologue
+	// and drop through...
+	.arch pentium4
+ENDFUNC
+
FUNC(mpx_umul4_x86_sse2)
// void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
// const mpw *bv, const mpw *bvl);
ENDFUNC
+// AVX entry point for the Montgomery multiplier (32-bit): `vzeroupper'
+// avoids SSE/AVX transition penalties; then drop through into the SSE2
+// code below.
+FUNC(mpxmont_mul4_x86_avx)
+	.arch .avx
+	vzeroupper
+	endprologue
+	// and drop through...
+	.arch pentium4
+ENDFUNC
+
FUNC(mpxmont_mul4_x86_sse2)
// void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
// const mpw *nv, size_t n, const mpw *mi);
ENDFUNC
+// AVX entry point for Montgomery reduction (32-bit): `vzeroupper'
+// avoids SSE/AVX transition penalties; then drop through into the SSE2
+// code below.
+FUNC(mpxmont_redc4_x86_avx)
+	.arch .avx
+	vzeroupper
+	endprologue
+	// and drop through...
+	.arch pentium4
+ENDFUNC
+
FUNC(mpxmont_redc4_x86_sse2)
// void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
// size_t n, const mpw *mi);
#if CPUFAM_X86
MAYBE_UMUL4(x86_sse2)
+ MAYBE_UMUL4(x86_avx)
#endif
#if CPUFAM_AMD64
MAYBE_UMUL4(amd64_sse2)
+ MAYBE_UMUL4(amd64_avx)
#endif
static mpx_umul__functype *pick_umul(void)
{
#if CPUFAM_X86
+ DISPATCH_PICK_COND(mpx_umul, maybe_umul4_x86_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX));
DISPATCH_PICK_COND(mpx_umul, maybe_umul4_x86_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif
#if CPUFAM_AMD64
+ DISPATCH_PICK_COND(mpx_umul, maybe_umul4_amd64_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX));
DISPATCH_PICK_COND(mpx_umul, maybe_umul4_amd64_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif
///--------------------------------------------------------------------------
/// Main code.
- .arch pentium4
.text
+// AVX entry point for the ChaCha core: `vzeroupper' clears the upper
+// YMM lanes to avoid SSE/AVX transition penalties, then control drops
+// through into `chacha_core_x86ish_sse2' below.  (The surrounding file
+// restores `.arch pentium4' after this stub.)
+FUNC(chacha_core_x86ish_avx)
+	.arch .avx
+	vzeroupper
+	endprologue
+	// drop through...
+ENDFUNC
+
+ .arch pentium4
+
FUNC(chacha_core_x86ish_sse2)
// Initial setup.
#if CPUFAM_X86 || CPUFAM_AMD64
extern core__functype chacha_core_x86ish_sse2;
+extern core__functype chacha_core_x86ish_avx;
#endif
#if CPUFAM_ARMEL
static core__functype *pick_core(void)
{
#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(chacha_core, chacha_core_x86ish_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX));
DISPATCH_PICK_COND(chacha_core, chacha_core_x86ish_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif
#if CPUFAM_X86 || CPUFAM_AMD64
extern setup__functype rijndael_setup_x86ish_aesni;
+extern setup__functype rijndael_setup_x86ish_aesni_avx;
#endif
#if CPUFAM_ARMEL && HAVE_AS_ARMV8_CRYPTO
extern setup__functype rijndael_setup_arm_crypto;
static setup__functype *pick_setup(void)
{
#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_x86ish_aesni_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX) &&
+ cpu_feature_p(CPUFEAT_X86_AESNI));
DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_x86ish_aesni,
cpu_feature_p(CPUFEAT_X86_AESNI));
#endif
///--------------------------------------------------------------------------
/// Key setup.
+// AVX entry point for key setup: `vzeroupper' avoids penalties on the
+// `legacy' XMM accesses in the AES-NI code we drop through into.
+// NOTE(review): unlike the mpx stubs there is no `.arch .avx'/`.arch
+// pentium4' juggling here -- presumably the ambient arch already
+// accepts `vzeroupper'; confirm against the file's arch directives.
+FUNC(rijndael_setup_x86ish_aesni_avx)
+	vzeroupper // avoid penalty on `legacy' XMM access
+	endprologue
+	// and drop through...
+ENDFUNC
+
FUNC(rijndael_setup_x86ish_aesni)
#define SI WHOLE(si)
/// Encrypting and decrypting blocks.
.macro encdec op, aes, koff
+	// AVX entry point for block \op: `vzeroupper' avoids SSE/AVX
+	// transition penalties, then drop through into the AES-NI code.
+	FUNC(rijndael_\op\()_x86ish_aesni_avx)
+	vzeroupper // avoid XMM penalties
+	endprologue
+	// and drop through...
+	ENDFUNC
+
FUNC(rijndael_\op\()_x86ish_aesni)
#if CPUFAM_X86
#if CPUFAM_X86 || CPUFAM_AMD64
extern rijndael_eblk__functype rijndael_eblk_x86ish_aesni;
extern rijndael_dblk__functype rijndael_dblk_x86ish_aesni;
+extern rijndael_eblk__functype rijndael_eblk_x86ish_aesni_avx;
+extern rijndael_dblk__functype rijndael_dblk_x86ish_aesni_avx;
#endif
#if CPUFAM_ARMEL && HAVE_AS_ARMV8_CRYPTO
extern rijndael_eblk__functype rijndael_eblk_arm_crypto;
static rijndael_eblk__functype *pick_eblk(void)
{
#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_x86ish_aesni_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX) &&
+ cpu_feature_p(CPUFEAT_X86_AESNI));
DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_x86ish_aesni,
cpu_feature_p(CPUFEAT_X86_AESNI));
#endif
static rijndael_dblk__functype *pick_dblk(void)
{
#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_x86ish_aesni_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX) &&
+ cpu_feature_p(CPUFEAT_X86_AESNI));
DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_x86ish_aesni,
cpu_feature_p(CPUFEAT_X86_AESNI));
#endif
///--------------------------------------------------------------------------
/// Main code.
- .arch pentium4
.text
+// AVX entry point for the Salsa20 core: `vzeroupper' clears the upper
+// YMM lanes to avoid SSE/AVX transition penalties, then control drops
+// through into `salsa20_core_x86ish_sse2' below.  (The surrounding
+// file restores `.arch pentium4' after this stub.)
+FUNC(salsa20_core_x86ish_avx)
+	.arch .avx
+	vzeroupper
+	endprologue
+	// drop through...
+ENDFUNC
+
+ .arch pentium4
+
FUNC(salsa20_core_x86ish_sse2)
// Initial setup.
#if CPUFAM_X86 || CPUFAM_AMD64
extern core__functype salsa20_core_x86ish_sse2;
+extern core__functype salsa20_core_x86ish_avx;
#endif
#if CPUFAM_ARMEL
static core__functype *pick_core(void)
{
#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86ish_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX));
DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86ish_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif