From: Mark Wooding
Date: Thu, 23 Aug 2018 04:13:55 +0000 (+0100)
Subject: (x86 asm): Zero the high parts of the ?MM registers if available.
X-Git-Tag: 2.5.0~35
X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/commitdiff_plain/b9b279b4105524d5d4e5dcd389141645d904aa0c

(x86 asm): Zero the high parts of the ?MM registers if available.

There's a performance penalty to trying to preserve the upper parts of
the SSE/AVX vector registers, and it's pointless because we don't need
to preserve them.  (Earlier AVX-capable processors would carefully snip
off the upper parts of the registers and put them in a box, and then
glue them back on when they were wanted, which isn't so bad.  Later
processors instead just track the upper part of the register as an
additional operand, which leads to unnecessary latency.)

Add AVX-specific entry points to the necessary routines, and call them
when AVX is detected.

This would all be easier if Intel had chosen `vzeroupper' from an
existing `nop' encoding space.
---
diff --git a/base/dispatch.c b/base/dispatch.c
index 908a4e31..9ba6a7cd 100644
--- a/base/dispatch.c
+++ b/base/dispatch.c
@@ -47,6 +47,7 @@
 # define CPUID1D_SSE2 (1u << 26)
 # define CPUID1D_FXSR (1u << 24)
 # define CPUID1C_AESNI (1u << 25)
+# define CPUID1C_AVX (1u << 28)
 # define CPUID1C_RDRAND (1u << 30)
 
 struct cpuid { unsigned a, b, c, d; };
@@ -545,6 +546,9 @@ int cpu_feature_p(int feat)
 		 cpuid_features_p(CPUID1D_SSE2, CPUID1C_AESNI));
     CASE_CPUFEAT(X86_RDRAND, "x86:rdrand",
 		 cpuid_features_p(0, CPUID1C_RDRAND));
+    CASE_CPUFEAT(X86_AVX, "x86:avx",
+		 xmm_registers_available_p() &&
+		 cpuid_features_p(0, CPUID1C_AVX));
 #endif
 #ifdef CAPMAP
 # define FEATP__CASE(feat, tok) \
diff --git a/base/dispatch.h b/base/dispatch.h
index f778068c..dae6a689 100644
--- a/base/dispatch.h
+++ b/base/dispatch.h
@@ -181,7 +181,8 @@ enum {
   CPUFEAT_ARM_V4,			/* VFPv4 and/or SIMD v2 */
   CPUFEAT_ARM_D32,			/* 32 double registers, not 16 */
   CPUFEAT_X86_RDRAND,			/* Built-in entropy source */
-  CPUFEAT_ARM_AES			/* AES instructions */
+  CPUFEAT_ARM_AES,			/* AES instructions */
+  CPUFEAT_X86_AVX			/* AVX 1 (i.e., 256-bit YMM regs) */
 };
 
 extern int cpu_feature_p(int /*feat*/);
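For context, the textbook AVX usability test reads CPUID leaf 1 for the
OSXSAVE and AVX bits and then XGETBV to confirm that the operating
system actually saves XMM and YMM state across context switches.  The
following is a minimal standalone sketch of that test, independent of
Catacomb's cpuid_features_p() and xmm_registers_available_p() helpers;
the function name avx_usable_p is illustrative, not part of the patch:

	#include <stdint.h>

	static int avx_usable_p(void)	/* illustrative name */
	{
	  uint32_t a, b, c, d, lo, hi;	/* b, d, hi are unused here */

	  /* CPUID leaf 1: ECX bit 27 = OSXSAVE, bit 28 = AVX.  Checking
	   * OSXSAVE first also makes the XGETBV below safe to execute. */
	  __asm__ ("cpuid"
		   : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
		   : "a"(1), "c"(0));
	  if (!(c & (1u << 27)) || !(c & (1u << 28))) return (0);

	  /* XGETBV with ECX = 0 reads XCR0; bits 1 and 2 mean the OS
	   * preserves XMM and YMM state respectively. */
	  __asm__ ("xgetbv" : "=a"(lo), "=d"(hi) : "c"(0));
	  return ((lo & 6) == 6);
	}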
diff --git a/math/mpmont.c b/math/mpmont.c
index f8a26119..094ac401 100644
--- a/math/mpmont.c
+++ b/math/mpmont.c
@@ -90,19 +90,25 @@ static void simple_redccore(mpw *dv, mpw *dvl, const mpw *mv,
 
 #if CPUFAM_X86
   MAYBE_REDC4(x86_sse2)
+  MAYBE_REDC4(x86_avx)
 #endif
 #if CPUFAM_AMD64
   MAYBE_REDC4(amd64_sse2)
+  MAYBE_REDC4(amd64_avx)
 #endif
 
 static redccore__functype *pick_redccore(void)
 {
 #if CPUFAM_X86
+  DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_x86_avx,
+		     cpu_feature_p(CPUFEAT_X86_AVX));
   DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_x86_sse2,
 		     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
 #if CPUFAM_AMD64
+  DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_amd64_avx,
+		     cpu_feature_p(CPUFEAT_X86_AVX));
   DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_amd64_sse2,
 		     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
@@ -190,19 +196,25 @@ static void simple_mulcore(mpw *dv, mpw *dvl,
 
 #if CPUFAM_X86
   MAYBE_MUL4(x86_sse2)
+  MAYBE_MUL4(x86_avx)
 #endif
 #if CPUFAM_AMD64
   MAYBE_MUL4(amd64_sse2)
+  MAYBE_MUL4(amd64_avx)
 #endif
 
 static mulcore__functype *pick_mulcore(void)
 {
 #if CPUFAM_X86
+  DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_x86_avx,
+		     cpu_feature_p(CPUFEAT_X86_AVX));
   DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_x86_sse2,
 		     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
 #if CPUFAM_AMD64
+  DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_amd64_avx,
+		     cpu_feature_p(CPUFEAT_X86_AVX));
   DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_amd64_sse2,
 		     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S
index 2d78a992..d8f54e1f 100644
--- a/math/mpx-mul4-amd64-sse2.S
+++ b/math/mpx-mul4-amd64-sse2.S
@@ -752,6 +752,13 @@ ENDFUNC
 ///--------------------------------------------------------------------------
 /// Bulk multipliers.
 
+FUNC(mpx_umul4_amd64_avx)
+	.arch	.avx
+	vzeroupper
+	endprologue
+	.arch	pentium4
+ENDFUNC
+
 FUNC(mpx_umul4_amd64_sse2)
 	// void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
 	//			     const mpw *bv, const mpw *bvl);
@@ -901,6 +908,13 @@ FUNC(mpx_umul4_amd64_sse2)
 
 ENDFUNC
 
+FUNC(mpxmont_mul4_amd64_avx)
+	.arch	.avx
+	vzeroupper
+	endprologue
+	.arch	pentium4
+ENDFUNC
+
 FUNC(mpxmont_mul4_amd64_sse2)
 	// void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
 	//			       const mpw *nv, size_t n, const mpw *mi);
@@ -1095,6 +1109,13 @@ FUNC(mpxmont_mul4_amd64_sse2)
 
 ENDFUNC
 
+FUNC(mpxmont_redc4_amd64_avx)
+	.arch	.avx
+	vzeroupper
+	endprologue
+	.arch	pentium4
+ENDFUNC
+
 FUNC(mpxmont_redc4_amd64_sse2)
 	// void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
 	//			       size_t n, const mpw *mi);
diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S
index f6c81673..cdc35967 100644
--- a/math/mpx-mul4-x86-sse2.S
+++ b/math/mpx-mul4-x86-sse2.S
@@ -678,6 +678,14 @@ ENDFUNC
 ///--------------------------------------------------------------------------
 /// Bulk multipliers.
 
+FUNC(mpx_umul4_x86_avx)
+	.arch	.avx
+	vzeroupper
+	endprologue
+	// and drop through...
+	.arch	pentium4
+ENDFUNC
+
 FUNC(mpx_umul4_x86_sse2)
 	// void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
 	//			   const mpw *bv, const mpw *bvl);
@@ -778,6 +786,14 @@ FUNC(mpx_umul4_x86_sse2)
 
 ENDFUNC
 
+FUNC(mpxmont_mul4_x86_avx)
+	.arch	.avx
+	vzeroupper
+	endprologue
+	// and drop through...
+	.arch	pentium4
+ENDFUNC
+
 FUNC(mpxmont_mul4_x86_sse2)
 	// void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
 	//			     const mpw *nv, size_t n, const mpw *mi);
@@ -919,6 +935,14 @@ FUNC(mpxmont_mul4_x86_sse2)
 
 ENDFUNC
 
+FUNC(mpxmont_redc4_x86_avx)
+	.arch	.avx
+	vzeroupper
+	endprologue
+	// and drop through...
+	.arch	pentium4
+ENDFUNC
+
 FUNC(mpxmont_redc4_x86_sse2)
 	// void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
 	//			      size_t n, const mpw *mi);
diff --git a/math/mpx.c b/math/mpx.c
index 3983e7ca..42948457 100644
--- a/math/mpx.c
+++ b/math/mpx.c
@@ -923,19 +923,25 @@ static void simple_umul(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
 
 #if CPUFAM_X86
   MAYBE_UMUL4(x86_sse2)
+  MAYBE_UMUL4(x86_avx)
 #endif
 #if CPUFAM_AMD64
   MAYBE_UMUL4(amd64_sse2)
+  MAYBE_UMUL4(amd64_avx)
 #endif
 
 static mpx_umul__functype *pick_umul(void)
 {
 #if CPUFAM_X86
+  DISPATCH_PICK_COND(mpx_umul, maybe_umul4_x86_avx,
+		     cpu_feature_p(CPUFEAT_X86_AVX));
   DISPATCH_PICK_COND(mpx_umul, maybe_umul4_x86_sse2,
 		     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
 #if CPUFAM_AMD64
+  DISPATCH_PICK_COND(mpx_umul, maybe_umul4_amd64_avx,
+		     cpu_feature_p(CPUFEAT_X86_AVX));
   DISPATCH_PICK_COND(mpx_umul, maybe_umul4_amd64_sse2,
 		     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
diff --git a/symm/chacha-x86ish-sse2.S b/symm/chacha-x86ish-sse2.S
index 2dab283b..b8f72d53 100644
--- a/symm/chacha-x86ish-sse2.S
+++ b/symm/chacha-x86ish-sse2.S
@@ -33,9 +33,17 @@
 ///--------------------------------------------------------------------------
 /// Main code.
 
-	.arch	pentium4
 	.text
 
+FUNC(chacha_core_x86ish_avx)
+	.arch	.avx
+	vzeroupper
+	endprologue
+	// drop through...
+ENDFUNC
+
+	.arch	pentium4
+
 FUNC(chacha_core_x86ish_sse2)
 
 	// Initial setup.
diff --git a/symm/chacha.c b/symm/chacha.c
index 34198618..9b83eea5 100644
--- a/symm/chacha.c
+++ b/symm/chacha.c
@@ -72,6 +72,7 @@ static void simple_core(unsigned r, const chacha_matrix src,
 
 #if CPUFAM_X86 || CPUFAM_AMD64
 extern core__functype chacha_core_x86ish_sse2;
+extern core__functype chacha_core_x86ish_avx;
 #endif
 
 #if CPUFAM_ARMEL
@@ -85,6 +86,8 @@ extern core__functype chacha_core_arm64;
 static core__functype *pick_core(void)
 {
 #if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(chacha_core, chacha_core_x86ish_avx,
+		     cpu_feature_p(CPUFEAT_X86_AVX));
   DISPATCH_PICK_COND(chacha_core, chacha_core_x86ish_sse2,
 		     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
diff --git a/symm/rijndael-base.c b/symm/rijndael-base.c
index 83a49e92..2f651918 100644
--- a/symm/rijndael-base.c
+++ b/symm/rijndael-base.c
@@ -118,6 +118,7 @@ CPU_DISPATCH(static, EMPTY, void, setup,
 
 #if CPUFAM_X86 || CPUFAM_AMD64
 extern setup__functype rijndael_setup_x86ish_aesni;
+extern setup__functype rijndael_setup_x86ish_aesni_avx;
 #endif
 #if CPUFAM_ARMEL && HAVE_AS_ARMV8_CRYPTO
 extern setup__functype rijndael_setup_arm_crypto;
@@ -129,6 +130,9 @@ extern setup__functype rijndael_setup_arm64_crypto;
 static setup__functype *pick_setup(void)
 {
 #if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_x86ish_aesni_avx,
+		     cpu_feature_p(CPUFEAT_X86_AVX) &&
+		     cpu_feature_p(CPUFEAT_X86_AESNI));
   DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_x86ish_aesni,
 		     cpu_feature_p(CPUFEAT_X86_AESNI));
 #endif
diff --git a/symm/rijndael-x86ish-aesni.S b/symm/rijndael-x86ish-aesni.S
index e556aa53..a7a1ece3 100644
--- a/symm/rijndael-x86ish-aesni.S
+++ b/symm/rijndael-x86ish-aesni.S
@@ -61,6 +61,12 @@
 ///--------------------------------------------------------------------------
 /// Key setup.
 
+FUNC(rijndael_setup_x86ish_aesni_avx)
+	vzeroupper		// avoid penalty on `legacy' XMM access
+	endprologue
+	// and drop through...
+ENDFUNC
+
 FUNC(rijndael_setup_x86ish_aesni)
 
 #define SI WHOLE(si)
@@ -365,6 +371,12 @@ ENDFUNC
 /// Encrypting and decrypting blocks.
 
 .macro	encdec	op, aes, koff
+  FUNC(rijndael_\op\()_x86ish_aesni_avx)
+	vzeroupper		// avoid XMM penalties
+	endprologue
+	// and drop through...
+  ENDFUNC
+
   FUNC(rijndael_\op\()_x86ish_aesni)
 
 #if CPUFAM_X86
diff --git a/symm/rijndael.c b/symm/rijndael.c
index 02cfb76b..7db9e012 100644
--- a/symm/rijndael.c
+++ b/symm/rijndael.c
@@ -83,6 +83,8 @@ CPU_DISPATCH(EMPTY, EMPTY, void, rijndael_dblk,
 #if CPUFAM_X86 || CPUFAM_AMD64
 extern rijndael_eblk__functype rijndael_eblk_x86ish_aesni;
 extern rijndael_dblk__functype rijndael_dblk_x86ish_aesni;
+extern rijndael_eblk__functype rijndael_eblk_x86ish_aesni_avx;
+extern rijndael_dblk__functype rijndael_dblk_x86ish_aesni_avx;
 #endif
 #if CPUFAM_ARMEL && HAVE_AS_ARMV8_CRYPTO
 extern rijndael_eblk__functype rijndael_eblk_arm_crypto;
@@ -96,6 +98,9 @@ extern rijndael_dblk__functype rijndael_dblk_arm64_crypto;
 static rijndael_eblk__functype *pick_eblk(void)
 {
 #if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_x86ish_aesni_avx,
+		     cpu_feature_p(CPUFEAT_X86_AVX) &&
+		     cpu_feature_p(CPUFEAT_X86_AESNI));
   DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_x86ish_aesni,
 		     cpu_feature_p(CPUFEAT_X86_AESNI));
 #endif
@@ -113,6 +118,9 @@ static rijndael_eblk__functype *pick_eblk(void)
 static rijndael_dblk__functype *pick_dblk(void)
 {
 #if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_x86ish_aesni_avx,
+		     cpu_feature_p(CPUFEAT_X86_AVX) &&
+		     cpu_feature_p(CPUFEAT_X86_AESNI));
   DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_x86ish_aesni,
 		     cpu_feature_p(CPUFEAT_X86_AESNI));
 #endif
diff --git a/symm/salsa20-x86ish-sse2.S b/symm/salsa20-x86ish-sse2.S
index 9cbaeff4..76ac0ed9 100644
--- a/symm/salsa20-x86ish-sse2.S
+++ b/symm/salsa20-x86ish-sse2.S
@@ -33,9 +33,17 @@
 ///--------------------------------------------------------------------------
 /// Main code.
 
-	.arch	pentium4
 	.text
 
+FUNC(salsa20_core_x86ish_avx)
+	.arch	.avx
+	vzeroupper
+	endprologue
+	// drop through...
+ENDFUNC
+
+	.arch	pentium4
+
 FUNC(salsa20_core_x86ish_sse2)
 
 	// Initial setup.
diff --git a/symm/salsa20.c b/symm/salsa20.c
index 03fcf469..e78baf05 100644
--- a/symm/salsa20.c
+++ b/symm/salsa20.c
@@ -72,6 +72,7 @@ static void simple_core(unsigned r, const salsa20_matrix src,
 
 #if CPUFAM_X86 || CPUFAM_AMD64
 extern core__functype salsa20_core_x86ish_sse2;
+extern core__functype salsa20_core_x86ish_avx;
 #endif
 
 #if CPUFAM_ARMEL
@@ -85,6 +86,8 @@ extern core__functype salsa20_core_arm64;
 static core__functype *pick_core(void)
 {
 #if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86ish_avx,
+		     cpu_feature_p(CPUFEAT_X86_AVX));
   DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86ish_sse2,
 		     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
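The pick functions changed above all share one shape: probe for the
fanciest usable unit first, and fall back in order, so the AVX entry
point (which runs vzeroupper and drops through into the SSE2 code)
wins whenever it is safe.  A simplified sketch of that ordering for the
Salsa20 core follows; the core_fn signature is hypothetical, and the
real code routes this through the CPU_DISPATCH and DISPATCH_PICK_COND
machinery rather than a bare function like this:

	#include <stdint.h>
	#include "dispatch.h"	/* cpu_feature_p, CPUFEAT_X86_* */

	/* Hypothetical signature; the real type is core__functype. */
	typedef void core_fn(unsigned r, const uint32_t *src, uint32_t *dst);

	extern core_fn salsa20_core_x86ish_avx;	/* vzeroupper, drop through */
	extern core_fn salsa20_core_x86ish_sse2;
	extern core_fn salsa20_core_simple;	/* stands in for the
						 * file-local simple_core */

	static core_fn *pick_core(void)
	{
	  /* Try AVX first, then SSE2, then portable C. */
	  if (cpu_feature_p(CPUFEAT_X86_AVX)) return (salsa20_core_x86ish_avx);
	  if (cpu_feature_p(CPUFEAT_X86_SSE2)) return (salsa20_core_x86ish_sse2);
	  return (salsa20_core_simple);
	}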