(x86 asm): Zero the high parts of the ?MM registers if available.
author     Mark Wooding <mdw@distorted.org.uk>
           Thu, 23 Aug 2018 04:13:55 +0000 (05:13 +0100)
committer  Mark Wooding <mdw@distorted.org.uk>
           Thu, 23 Aug 2018 06:23:31 +0000 (07:23 +0100)
There's a performance penalty to trying to preserve the upper parts of
the SSE/AVX vector registers, and it's pointless because we don't need
to preserve them.  (Earlier AVX-capable processors would carefully snip
off the upper parts of the registers and put them in a box, and then
glue them back on when they were wanted, which isn't so bad.  Later
processors instead just track the upper part of the register as an
additional operand, which leads to unnecessary latency.)
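To make the hazard concrete, here's a hypothetical sequence (illustrative
only, not code from this patch; the registers and instructions are chosen
at random, written in Intel operand order):

        vaddps  ymm0, ymm0, ymm1        // 256-bit AVX: upper halves now dirty
        addps   xmm2, xmm3              // legacy SSE: the CPU must somehow
                                        // preserve bits 255:128 of the YMMs
        vzeroupper                      // zero the upper halves of all YMMs
        addps   xmm2, xmm3              // now legacy SSE runs at full speed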

Add AVX-specific entry points to the necessary routines, and call them
when AVX is detected.  This would all be easier if Intel had chosen an
encoding for `vzeroupper' from an existing `nop' encoding space: then it
could simply be executed unconditionally, since pre-AVX processors would
treat it as a harmless `nop'.
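Concretely, each affected routine gains a trivial AVX entry point which
clears the upper halves and then drops through into the existing SSE2
code, and the dispatcher prefers that entry point when AVX is available.
Schematically (`some_op' is a placeholder, not a name from the patch):

        FUNC(some_op_x86ish_avx)
                .arch   .avx
                vzeroupper              // zero the YMM upper halves
          endprologue
                // ... and drop through into the SSE2 version below
                .arch   pentium4
        ENDFUNC

        FUNC(some_op_x86ish_sse2)
                // ... existing SSE2 implementation ...

with the matching selection on the C side:

        DISPATCH_PICK_COND(some_op, some_op_x86ish_avx,
                           cpu_feature_p(CPUFEAT_X86_AVX));
        DISPATCH_PICK_COND(some_op, some_op_x86ish_sse2,
                           cpu_feature_p(CPUFEAT_X86_SSE2));

The `.arch .avx'/`.arch pentium4' pair temporarily widens the assembler's
accepted instruction set for the stub and then narrows it back to the
baseline, so stray AVX instructions can't sneak into the SSE2 code
unnoticed.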

13 files changed:
base/dispatch.c
base/dispatch.h
math/mpmont.c
math/mpx-mul4-amd64-sse2.S
math/mpx-mul4-x86-sse2.S
math/mpx.c
symm/chacha-x86ish-sse2.S
symm/chacha.c
symm/rijndael-base.c
symm/rijndael-x86ish-aesni.S
symm/rijndael.c
symm/salsa20-x86ish-sse2.S
symm/salsa20.c

diff --git a/base/dispatch.c b/base/dispatch.c
index 908a4e3..9ba6a7c 100644
--- a/base/dispatch.c
+++ b/base/dispatch.c
@@ -47,6 +47,7 @@
 #  define CPUID1D_SSE2 (1u << 26)
 #  define CPUID1D_FXSR (1u << 24)
 #  define CPUID1C_AESNI (1u << 25)
+#  define CPUID1C_AVX (1u << 28)
 #  define CPUID1C_RDRAND (1u << 30)
 
 struct cpuid { unsigned a, b, c, d; };
@@ -545,6 +546,9 @@ int cpu_feature_p(int feat)
                 cpuid_features_p(CPUID1D_SSE2, CPUID1C_AESNI));
     CASE_CPUFEAT(X86_RDRAND, "x86:rdrand",
                 cpuid_features_p(0, CPUID1C_RDRAND));
+    CASE_CPUFEAT(X86_AVX, "x86:avx",
+                xmm_registers_available_p() &&
+                cpuid_features_p(0, CPUID1C_AVX));
 #endif
 #ifdef CAPMAP
 #  define FEATP__CASE(feat, tok)                                       \
diff --git a/base/dispatch.h b/base/dispatch.h
index f778068..dae6a68 100644
--- a/base/dispatch.h
+++ b/base/dispatch.h
@@ -181,7 +181,8 @@ enum {
   CPUFEAT_ARM_V4,                      /* VFPv4 and/or SIMD v2 */
   CPUFEAT_ARM_D32,                     /* 32 double registers, not 16 */
   CPUFEAT_X86_RDRAND,                  /* Built-in entropy source */
-  CPUFEAT_ARM_AES                      /* AES instructions */
+  CPUFEAT_ARM_AES,                     /* AES instructions */
+  CPUFEAT_X86_AVX                      /* AVX 1 (i.e., 256-bit YMM regs) */
 };
 
 extern int cpu_feature_p(int /*feat*/);
diff --git a/math/mpmont.c b/math/mpmont.c
index f8a2611..094ac40 100644
--- a/math/mpmont.c
+++ b/math/mpmont.c
@@ -90,19 +90,25 @@ static void simple_redccore(mpw *dv, mpw *dvl, const mpw *mv,
 
 #if CPUFAM_X86
   MAYBE_REDC4(x86_sse2)
+  MAYBE_REDC4(x86_avx)
 #endif
 
 #if CPUFAM_AMD64
   MAYBE_REDC4(amd64_sse2)
+  MAYBE_REDC4(amd64_avx)
 #endif
 
 static redccore__functype *pick_redccore(void)
 {
 #if CPUFAM_X86
+  DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_x86_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX));
   DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_x86_sse2,
                     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
 #if CPUFAM_AMD64
+  DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_amd64_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX));
   DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_amd64_sse2,
                     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
@@ -190,19 +196,25 @@ static void simple_mulcore(mpw *dv, mpw *dvl,
 
 #if CPUFAM_X86
   MAYBE_MUL4(x86_sse2)
+  MAYBE_MUL4(x86_avx)
 #endif
 
 #if CPUFAM_AMD64
   MAYBE_MUL4(amd64_sse2)
+  MAYBE_MUL4(amd64_avx)
 #endif
 
 static mulcore__functype *pick_mulcore(void)
 {
 #if CPUFAM_X86
+  DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_x86_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX));
   DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_x86_sse2,
                     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
 #if CPUFAM_AMD64
+  DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_amd64_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX));
   DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_amd64_sse2,
                     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S
index 2d78a99..d8f54e1 100644
--- a/math/mpx-mul4-amd64-sse2.S
+++ b/math/mpx-mul4-amd64-sse2.S
@@ -752,6 +752,13 @@ ENDFUNC
 ///--------------------------------------------------------------------------
 /// Bulk multipliers.
 
+FUNC(mpx_umul4_amd64_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       .arch   pentium4
+ENDFUNC
+
 FUNC(mpx_umul4_amd64_sse2)
        // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
        //                         const mpw *bv, const mpw *bvl);
@@ -901,6 +908,13 @@ FUNC(mpx_umul4_amd64_sse2)
 
 ENDFUNC
 
+FUNC(mpxmont_mul4_amd64_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       .arch   pentium4
+ENDFUNC
+
 FUNC(mpxmont_mul4_amd64_sse2)
        // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
        //                           const mpw *nv, size_t n, const mpw *mi);
@@ -1095,6 +1109,13 @@ FUNC(mpxmont_mul4_amd64_sse2)
 
 ENDFUNC
 
+FUNC(mpxmont_redc4_amd64_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       .arch   pentium4
+ENDFUNC
+
 FUNC(mpxmont_redc4_amd64_sse2)
        // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
        //                             size_t n, const mpw *mi);
diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S
index f6c8167..cdc3596 100644
--- a/math/mpx-mul4-x86-sse2.S
+++ b/math/mpx-mul4-x86-sse2.S
@@ -678,6 +678,14 @@ ENDFUNC
 ///--------------------------------------------------------------------------
 /// Bulk multipliers.
 
+FUNC(mpx_umul4_x86_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       // and drop through...
+       .arch   pentium4
+ENDFUNC
+
 FUNC(mpx_umul4_x86_sse2)
        // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
        //                         const mpw *bv, const mpw *bvl);
@@ -778,6 +786,14 @@ FUNC(mpx_umul4_x86_sse2)
 
 ENDFUNC
 
+FUNC(mpxmont_mul4_x86_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       // and drop through...
+       .arch   pentium4
+ENDFUNC
+
 FUNC(mpxmont_mul4_x86_sse2)
        // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
        //                           const mpw *nv, size_t n, const mpw *mi);
@@ -919,6 +935,14 @@ FUNC(mpxmont_mul4_x86_sse2)
 
 ENDFUNC
 
+FUNC(mpxmont_redc4_x86_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       // and drop through...
+       .arch   pentium4
+ENDFUNC
+
 FUNC(mpxmont_redc4_x86_sse2)
        // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
        //                             size_t n, const mpw *mi);
diff --git a/math/mpx.c b/math/mpx.c
index 3983e7c..4294845 100644
--- a/math/mpx.c
+++ b/math/mpx.c
@@ -923,19 +923,25 @@ static void simple_umul(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
 
 #if CPUFAM_X86
   MAYBE_UMUL4(x86_sse2)
+  MAYBE_UMUL4(x86_avx)
 #endif
 
 #if CPUFAM_AMD64
   MAYBE_UMUL4(amd64_sse2)
+  MAYBE_UMUL4(amd64_avx)
 #endif
 
 static mpx_umul__functype *pick_umul(void)
 {
 #if CPUFAM_X86
+  DISPATCH_PICK_COND(mpx_umul, maybe_umul4_x86_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX));
   DISPATCH_PICK_COND(mpx_umul, maybe_umul4_x86_sse2,
                     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
 #if CPUFAM_AMD64
+  DISPATCH_PICK_COND(mpx_umul, maybe_umul4_amd64_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX));
   DISPATCH_PICK_COND(mpx_umul, maybe_umul4_amd64_sse2,
                     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
diff --git a/symm/chacha-x86ish-sse2.S b/symm/chacha-x86ish-sse2.S
index 2dab283..b8f72d5 100644
--- a/symm/chacha-x86ish-sse2.S
+++ b/symm/chacha-x86ish-sse2.S
 ///--------------------------------------------------------------------------
 /// Main code.
 
-       .arch pentium4
        .text
 
+FUNC(chacha_core_x86ish_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       // drop through...
+ENDFUNC
+
+       .arch   pentium4
+
 FUNC(chacha_core_x86ish_sse2)
 
        // Initial setup.
diff --git a/symm/chacha.c b/symm/chacha.c
index 3419861..9b83eea 100644
--- a/symm/chacha.c
+++ b/symm/chacha.c
@@ -72,6 +72,7 @@ static void simple_core(unsigned r, const chacha_matrix src,
 
 #if CPUFAM_X86 || CPUFAM_AMD64
 extern core__functype chacha_core_x86ish_sse2;
+extern core__functype chacha_core_x86ish_avx;
 #endif
 
 #if CPUFAM_ARMEL
@@ -85,6 +86,8 @@ extern core__functype chacha_core_arm64;
 static core__functype *pick_core(void)
 {
 #if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(chacha_core, chacha_core_x86ish_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX));
   DISPATCH_PICK_COND(chacha_core, chacha_core_x86ish_sse2,
                     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
diff --git a/symm/rijndael-base.c b/symm/rijndael-base.c
index 83a49e9..2f65191 100644
--- a/symm/rijndael-base.c
+++ b/symm/rijndael-base.c
@@ -118,6 +118,7 @@ CPU_DISPATCH(static, EMPTY, void, setup,
 
 #if CPUFAM_X86 || CPUFAM_AMD64
 extern setup__functype rijndael_setup_x86ish_aesni;
+extern setup__functype rijndael_setup_x86ish_aesni_avx;
 #endif
 #if CPUFAM_ARMEL && HAVE_AS_ARMV8_CRYPTO
 extern setup__functype rijndael_setup_arm_crypto;
@@ -129,6 +130,9 @@ extern setup__functype rijndael_setup_arm64_crypto;
 static setup__functype *pick_setup(void)
 {
 #if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_x86ish_aesni_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX) &&
+                    cpu_feature_p(CPUFEAT_X86_AESNI));
   DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_x86ish_aesni,
                     cpu_feature_p(CPUFEAT_X86_AESNI));
 #endif
diff --git a/symm/rijndael-x86ish-aesni.S b/symm/rijndael-x86ish-aesni.S
index e556aa5..a7a1ece 100644
--- a/symm/rijndael-x86ish-aesni.S
+++ b/symm/rijndael-x86ish-aesni.S
 ///--------------------------------------------------------------------------
 /// Key setup.
 
+FUNC(rijndael_setup_x86ish_aesni_avx)
+       vzeroupper                    // avoid penalty on `legacy' XMM access
+  endprologue
+       // and drop through...
+ENDFUNC
+
 FUNC(rijndael_setup_x86ish_aesni)
 
 #define SI WHOLE(si)
@@ -365,6 +371,12 @@ ENDFUNC
 /// Encrypting and decrypting blocks.
 
 .macro encdec  op, aes, koff
+  FUNC(rijndael_\op\()_x86ish_aesni_avx)
+       vzeroupper                      // avoid XMM penalties
+  endprologue
+       // and drop through...
+  ENDFUNC
+
   FUNC(rijndael_\op\()_x86ish_aesni)
 
 #if CPUFAM_X86
diff --git a/symm/rijndael.c b/symm/rijndael.c
index 02cfb76..7db9e01 100644
--- a/symm/rijndael.c
+++ b/symm/rijndael.c
@@ -83,6 +83,8 @@ CPU_DISPATCH(EMPTY, EMPTY, void, rijndael_dblk,
 #if CPUFAM_X86 || CPUFAM_AMD64
 extern rijndael_eblk__functype rijndael_eblk_x86ish_aesni;
 extern rijndael_dblk__functype rijndael_dblk_x86ish_aesni;
+extern rijndael_eblk__functype rijndael_eblk_x86ish_aesni_avx;
+extern rijndael_dblk__functype rijndael_dblk_x86ish_aesni_avx;
 #endif
 #if CPUFAM_ARMEL && HAVE_AS_ARMV8_CRYPTO
 extern rijndael_eblk__functype rijndael_eblk_arm_crypto;
@@ -96,6 +98,9 @@ extern rijndael_dblk__functype rijndael_dblk_arm64_crypto;
 static rijndael_eblk__functype *pick_eblk(void)
 {
 #if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_x86ish_aesni_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX) &&
+                    cpu_feature_p(CPUFEAT_X86_AESNI));
   DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_x86ish_aesni,
                     cpu_feature_p(CPUFEAT_X86_AESNI));
 #endif
@@ -113,6 +118,9 @@ static rijndael_eblk__functype *pick_eblk(void)
 static rijndael_dblk__functype *pick_dblk(void)
 {
 #if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_x86ish_aesni_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX) &&
+                    cpu_feature_p(CPUFEAT_X86_AESNI));
   DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_x86ish_aesni,
                     cpu_feature_p(CPUFEAT_X86_AESNI));
 #endif
diff --git a/symm/salsa20-x86ish-sse2.S b/symm/salsa20-x86ish-sse2.S
index 9cbaeff..76ac0ed 100644
--- a/symm/salsa20-x86ish-sse2.S
+++ b/symm/salsa20-x86ish-sse2.S
 ///--------------------------------------------------------------------------
 /// Main code.
 
-       .arch pentium4
        .text
 
+FUNC(salsa20_core_x86ish_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       // drop through...
+ENDFUNC
+
+       .arch   pentium4
+
 FUNC(salsa20_core_x86ish_sse2)
 
        // Initial setup.
diff --git a/symm/salsa20.c b/symm/salsa20.c
index 03fcf46..e78baf0 100644
--- a/symm/salsa20.c
+++ b/symm/salsa20.c
@@ -72,6 +72,7 @@ static void simple_core(unsigned r, const salsa20_matrix src,
 
 #if CPUFAM_X86 || CPUFAM_AMD64
 extern core__functype salsa20_core_x86ish_sse2;
+extern core__functype salsa20_core_x86ish_avx;
 #endif
 
 #if CPUFAM_ARMEL
@@ -85,6 +86,8 @@ extern core__functype salsa20_core_arm64;
 static core__functype *pick_core(void)
 {
 #if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86ish_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX));
   DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86ish_sse2,
                     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif