There's a performance penalty to trying to preserve the upper parts of
the SSE/AVX vector registers, and it's pointless because we don't need
to preserve them. (Earlier AVX-capable processors would carefully snip
off the upper parts of the registers and put them in a box, and then
glue them back on when they were wanted, which isn't so bad. Later
processors instead just track the upper part of the register as an
additional operand to each SSE instruction, which introduces false
dependencies and unnecessary latency.)

Add AVX-specific entry points to the necessary routines, and call them
when AVX is detected.

This would all be easier if Intel had chosen an encoding for
`vzeroupper' from an existing `nop' encoding space, so that it could
have been executed unconditionally even on older CPUs.
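
Deciding that AVX is really usable takes more than the CPUID feature
bit, because the OS must also have enabled saving of the YMM state.
As a sketch, the usual dance looks like this (hypothetical helper
names, using GCC/Clang's <cpuid.h>; the probing in the diff below is
instead factored through `cpuid_features_p' and the pre-existing
`xmm_registers_available_p' check):

    #include <cpuid.h>

    /* Read extended control register 0 (XCR0).  Only safe once CPUID
     * has reported OSXSAVE support. */
    static unsigned long long xgetbv0(void)
    {
      unsigned lo, hi;
      __asm__("xgetbv" : "=a"(lo), "=d"(hi) : "c"(0));
      return ((unsigned long long)hi << 32) | lo;
    }

    /* Nonzero if AVX instructions exist and the OS saves YMM state. */
    static int avx_usable_p(void)
    {
      unsigned a, b, c, d;
      if (!__get_cpuid(1, &a, &b, &c, &d)) return 0;
      if (!(c & (1u << 27))) return 0;  /* OSXSAVE */
      if (!(c & (1u << 28))) return 0;  /* AVX */
      return (xgetbv0() & 6) == 6;      /* XMM and YMM enabled in XCR0 */
    }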
13 files changed:
# define CPUID1D_SSE2 (1u << 26)
# define CPUID1D_FXSR (1u << 24)
# define CPUID1C_AESNI (1u << 25)
+# define CPUID1C_AVX (1u << 28)
# define CPUID1C_RDRAND (1u << 30)
struct cpuid { unsigned a, b, c, d; };
cpuid_features_p(CPUID1D_SSE2, CPUID1C_AESNI));
CASE_CPUFEAT(X86_RDRAND, "x86:rdrand",
cpuid_features_p(0, CPUID1C_RDRAND));
+ CASE_CPUFEAT(X86_AVX, "x86:avx",
+ xmm_registers_available_p() &&
+ cpuid_features_p(0, CPUID1C_AVX));
#endif
#ifdef CAPMAP
# define FEATP__CASE(feat, tok) \
CPUFEAT_ARM_V4, /* VFPv4 and/or SIMD v2 */
CPUFEAT_ARM_D32, /* 32 double registers, not 16 */
CPUFEAT_X86_RDRAND, /* Built-in entropy source */
- CPUFEAT_ARM_AES /* AES instructions */
+ CPUFEAT_ARM_AES, /* AES instructions */
+ CPUFEAT_X86_AVX /* AVX 1 (i.e., 256-bit YMM regs) */
};
extern int cpu_feature_p(int /*feat*/);
#if CPUFAM_X86
MAYBE_REDC4(x86_sse2)
+MAYBE_REDC4(x86_avx)
#endif
#if CPUFAM_AMD64
MAYBE_REDC4(amd64_sse2)
+MAYBE_REDC4(amd64_avx)
#endif
static redccore__functype *pick_redccore(void)
{
#if CPUFAM_X86
+ DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_x86_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX));
DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_x86_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif
#if CPUFAM_AMD64
+ DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_amd64_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX));
DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_amd64_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif
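
A note on ordering: each pick function tries its candidates from most
to least preferred, and the first DISPATCH_PICK_COND whose predicate
holds wins, so the new AVX entries must come before the SSE2 ones.
Schematically (a sketch of the pattern only, not the real
DISPATCH_PICK_COND machinery; `simple_redccore' stands in for the
portable fallback):

    static redccore__functype *pick_redccore_sketch(void)
    {
      if (cpu_feature_p(CPUFEAT_X86_AVX)) return maybe_redc4_x86_avx;
      if (cpu_feature_p(CPUFEAT_X86_SSE2)) return maybe_redc4_x86_sse2;
      return simple_redccore;   /* portable fallback */
    }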
#if CPUFAM_X86
MAYBE_MUL4(x86_sse2)
+MAYBE_MUL4(x86_avx)
#endif
#if CPUFAM_AMD64
MAYBE_MUL4(amd64_sse2)
+MAYBE_MUL4(amd64_avx)
#endif
static mulcore__functype *pick_mulcore(void)
{
#if CPUFAM_X86
+ DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_x86_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX));
DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_x86_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif
#if CPUFAM_AMD64
+ DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_amd64_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX));
DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_amd64_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif
///--------------------------------------------------------------------------
/// Bulk multipliers.
+FUNC(mpx_umul4_amd64_avx)
+ .arch .avx
+ vzeroupper
+ endprologue
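+ // and drop through...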
+ .arch pentium4
+ENDFUNC
+
FUNC(mpx_umul4_amd64_sse2)
// void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
// const mpw *bv, const mpw *bvl);
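
In C terms, each of these AVX entry points amounts to the sketch
below, where a tail call stands in for what is literal fall-through
in the assembler, and `mpw' is the bignum word type:

    #include <immintrin.h>

    void mpx_umul4_amd64_avx(mpw *dv, const mpw *av, const mpw *avl,
                             const mpw *bv, const mpw *bvl)
    {
      _mm256_zeroupper();  /* `vzeroupper': discard dirty upper YMM state */
      mpx_umul4_amd64_sse2(dv, av, avl, bv, bvl);  /* unchanged SSE2 code */
    }

The only real work is the `vzeroupper', which takes the CPU out of
the state where SSE instructions pay for the preserved upper halves.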
+FUNC(mpxmont_mul4_amd64_avx)
+ .arch .avx
+ vzeroupper
+ endprologue
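+ // and drop through...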
+ .arch pentium4
+ENDFUNC
+
FUNC(mpxmont_mul4_amd64_sse2)
// void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
// const mpw *nv, size_t n, const mpw *mi);
+FUNC(mpxmont_redc4_amd64_avx)
+ .arch .avx
+ vzeroupper
+ endprologue
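+ // and drop through...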
+ .arch pentium4
+ENDFUNC
+
FUNC(mpxmont_redc4_amd64_sse2)
// void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
// size_t n, const mpw *mi);
///--------------------------------------------------------------------------
/// Bulk multipliers.
+FUNC(mpx_umul4_x86_avx)
+ .arch .avx
+ vzeroupper
+ endprologue
+ // and drop through...
+ .arch pentium4
+ENDFUNC
+
FUNC(mpx_umul4_x86_sse2)
// void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
// const mpw *bv, const mpw *bvl);
+FUNC(mpxmont_mul4_x86_avx)
+ .arch .avx
+ vzeroupper
+ endprologue
+ // and drop through...
+ .arch pentium4
+ENDFUNC
+
FUNC(mpxmont_mul4_x86_sse2)
// void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
// const mpw *nv, size_t n, const mpw *mi);
+FUNC(mpxmont_redc4_x86_avx)
+ .arch .avx
+ vzeroupper
+ endprologue
+ // and drop through...
+ .arch pentium4
+ENDFUNC
+
FUNC(mpxmont_redc4_x86_sse2)
// void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
// size_t n, const mpw *mi);
#if CPUFAM_X86
MAYBE_UMUL4(x86_sse2)
+MAYBE_UMUL4(x86_avx)
#endif
#if CPUFAM_AMD64
MAYBE_UMUL4(amd64_sse2)
+MAYBE_UMUL4(amd64_avx)
#endif
static mpx_umul__functype *pick_umul(void)
{
#if CPUFAM_X86
+ DISPATCH_PICK_COND(mpx_umul, maybe_umul4_x86_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX));
DISPATCH_PICK_COND(mpx_umul, maybe_umul4_x86_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif
#if CPUFAM_AMD64
+ DISPATCH_PICK_COND(mpx_umul, maybe_umul4_amd64_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX));
DISPATCH_PICK_COND(mpx_umul, maybe_umul4_amd64_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif
///--------------------------------------------------------------------------
/// Main code.
+FUNC(chacha_core_x86ish_avx)
+ .arch .avx
+ vzeroupper
+ endprologue
+ // drop through...
+ENDFUNC
+
+ .arch pentium4
+
FUNC(chacha_core_x86ish_sse2)
// Initial setup.
#if CPUFAM_X86 || CPUFAM_AMD64
extern core__functype chacha_core_x86ish_sse2;
+extern core__functype chacha_core_x86ish_avx;
static core__functype *pick_core(void)
{
#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(chacha_core, chacha_core_x86ish_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX));
DISPATCH_PICK_COND(chacha_core, chacha_core_x86ish_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif
#if CPUFAM_X86 || CPUFAM_AMD64
extern setup__functype rijndael_setup_x86ish_aesni;
+extern setup__functype rijndael_setup_x86ish_aesni_avx;
#endif
#if CPUFAM_ARMEL && HAVE_AS_ARMV8_CRYPTO
extern setup__functype rijndael_setup_arm_crypto;
static setup__functype *pick_setup(void)
{
#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_x86ish_aesni_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX) &&
+ cpu_feature_p(CPUFEAT_X86_AESNI));
DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_x86ish_aesni,
cpu_feature_p(CPUFEAT_X86_AESNI));
#endif
///--------------------------------------------------------------------------
/// Key setup.
+FUNC(rijndael_setup_x86ish_aesni_avx)
+ vzeroupper // avoid penalty on `legacy' XMM access
+ endprologue
+ // and drop through...
+ENDFUNC
+
FUNC(rijndael_setup_x86ish_aesni)
#define SI WHOLE(si)
/// Encrypting and decrypting blocks.
.macro encdec op, aes, koff
+ FUNC(rijndael_\op\()_x86ish_aesni_avx)
+ vzeroupper // avoid XMM penalties
+ endprologue
+ // and drop through...
+ ENDFUNC
+
FUNC(rijndael_\op\()_x86ish_aesni)
#if CPUFAM_X86
#if CPUFAM_X86 || CPUFAM_AMD64
extern rijndael_eblk__functype rijndael_eblk_x86ish_aesni;
extern rijndael_dblk__functype rijndael_dblk_x86ish_aesni;
+extern rijndael_eblk__functype rijndael_eblk_x86ish_aesni_avx;
+extern rijndael_dblk__functype rijndael_dblk_x86ish_aesni_avx;
#endif
#if CPUFAM_ARMEL && HAVE_AS_ARMV8_CRYPTO
extern rijndael_eblk__functype rijndael_eblk_arm_crypto;
static rijndael_eblk__functype *pick_eblk(void)
{
#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_x86ish_aesni_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX) &&
+ cpu_feature_p(CPUFEAT_X86_AESNI));
DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_x86ish_aesni,
cpu_feature_p(CPUFEAT_X86_AESNI));
#endif
static rijndael_dblk__functype *pick_dblk(void)
{
#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_x86ish_aesni_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX) &&
+ cpu_feature_p(CPUFEAT_X86_AESNI));
DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_x86ish_aesni,
cpu_feature_p(CPUFEAT_X86_AESNI));
#endif
///--------------------------------------------------------------------------
/// Main code.
+FUNC(salsa20_core_x86ish_avx)
+ .arch .avx
+ vzeroupper
+ endprologue
+ // drop through...
+ENDFUNC
+
+ .arch pentium4
+
FUNC(salsa20_core_x86ish_sse2)
// Initial setup.
#if CPUFAM_X86 || CPUFAM_AMD64
extern core__functype salsa20_core_x86ish_sse2;
+extern core__functype salsa20_core_x86ish_avx;
static core__functype *pick_core(void)
{
#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86ish_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX));
DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86ish_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif