Merge branch '2.4.x'

author Mark Wooding <mdw@distorted.org.uk>

Sat, 24 Nov 2018 21:53:58 +0000 (21:53 +0000)

committer Mark Wooding <mdw@distorted.org.uk>

Sat, 24 Nov 2018 21:53:58 +0000 (21:53 +0000)
author Mark Wooding <mdw@distorted.org.uk>
Sat, 24 Nov 2018 21:53:58 +0000 (21:53 +0000)
committer Mark Wooding <mdw@distorted.org.uk>
Sat, 24 Nov 2018 21:53:58 +0000 (21:53 +0000)
diff --git a/.gitignore b/.gitignore

index 1d7acb5..9c2b37d 100644 (file)
--- a/.gitignore
+++ b/.gitignore
@@ -1,61 +1,10 @@
  Makefile.in
-aclocal.m4
-configure
-COPYING.LIB
-autom4te.cache
-config
-precomp
-progs/getdate.h
-progs/getdate.y
-symm/modes.am
-symm/stubs.am
+/aclocal.m4
+/configure
+/COPYING.LIB
+/autom4te.cache/
+/config/
+/precomp/
  *.sage.py
  *.t
  *.to
-/symm/safersk.h
-/symm/salsa2012.h
-/symm/salsa208.h
-/symm/salsa20-ietf.h
-/symm/salsa2012-ietf.h
-/symm/salsa208-ietf.h
-/symm/sha224.h
-/symm/sha384.h
-/symm/whirlpool256.h
-/symm/xsalsa20.h
-/symm/xsalsa2012.h
-/symm/xsalsa208.h
-/symm/stubs.gen-stamp
-/symm/t/salsa20
-/symm/xchacha12.h
-/symm/xchacha20.h
-/symm/xchacha8.h
-/symm/chacha12.h
-/symm/chacha20.h
-/symm/chacha8.h
-/symm/chacha12-ietf.h
-/symm/chacha20-ietf.h
-/symm/chacha8-ietf.h
-/symm/xchacha.h
-/symm/kmac128.h
-/symm/kmac256.h
-/symm/safersk.c
-/symm/sha224.c
-/symm/sha3-224.c
-/symm/sha3-224.h
-/symm/sha3-256.c
-/symm/sha3-256.h
-/symm/sha3-384.c
-/symm/sha3-384.h
-/symm/sha3-512.c
-/symm/sha3-512.h
-/symm/sha384.c
-/symm/sha512-224.c
-/symm/sha512-224.h
-/symm/sha512-256.c
-/symm/sha512-256.h
-/symm/shake128.h
-/symm/shake256.h
-/symm/t/sha3
-/symm/whirlpool256.c
-/symm/shake128-xof.h
-/symm/shake256-xof.h
diff --git a/base/asm-common.h b/base/asm-common.h

index 8e51ea3..d6a8b01 100644 (file)
--- a/base/asm-common.h
+++ b/base/asm-common.h
@@ -217,11 +217,11 @@ name:
  #  define INTADDR__1(addr, got) addr
  #endif
  
-// Permutations for SIMD instructions.  SHUF(D, C, B, A) is an immediate,
-// suitable for use in `pshufd' or `shufpd', which copies element D
-// (0 <= D < 4) of the source to element 3 of the destination, element C to
-// element 2, element B to element 1, and element A to element 0.
-#define SHUF(d, c, b, a) (64*(d) + 16*(c) + 4*(b) + (a))
+// Permutations for SIMD instructions.  SHUF(A, B, C, D) is an immediate,
+// suitable for use in `pshufd' or `shufps', which copies element A
+// (0 <= A < 4) of the source to element 0 of the destination, element B to
+// element 1, element C to element 2, and element D to element 3.
+#define SHUF(a, b, c, d) ((a) + 4*(b) + 16*(c) + 64*(d))
  
  // Map register names to their individual pieces.
  
diff --git a/base/dispatch.c b/base/dispatch.c

index 908a4e3..9ba6a7c 100644 (file)
--- a/base/dispatch.c
+++ b/base/dispatch.c
@@ -47,6 +47,7 @@
  #  define CPUID1D_SSE2 (1u << 26)
  #  define CPUID1D_FXSR (1u << 24)
  #  define CPUID1C_AESNI (1u << 25)
+#  define CPUID1C_AVX (1u << 28)
  #  define CPUID1C_RDRAND (1u << 30)
  
  struct cpuid { unsigned a, b, c, d; };
@@ -545,6 +546,9 @@ int cpu_feature_p(int feat)
                  cpuid_features_p(CPUID1D_SSE2, CPUID1C_AESNI));
      CASE_CPUFEAT(X86_RDRAND, "x86:rdrand",
                  cpuid_features_p(0, CPUID1C_RDRAND));
+    CASE_CPUFEAT(X86_AVX, "x86:avx",
+                xmm_registers_available_p() &&
+                cpuid_features_p(0, CPUID1C_AVX));
  #endif
  #ifdef CAPMAP
  #  define FEATP__CASE(feat, tok)                                       \
diff --git a/base/dispatch.h b/base/dispatch.h

index f778068..dae6a68 100644 (file)
--- a/base/dispatch.h
+++ b/base/dispatch.h
@@ -181,7 +181,8 @@ enum {
    CPUFEAT_ARM_V4,                      /* VFPv4 and/or SIMD v2 */
    CPUFEAT_ARM_D32,                     /* 32 double registers, not 16 */
    CPUFEAT_X86_RDRAND,                  /* Built-in entropy source */
-  CPUFEAT_ARM_AES                      /* AES instructions */
+  CPUFEAT_ARM_AES,                     /* AES instructions */
+  CPUFEAT_X86_AVX                      /* AVX 1 (i.e., 256-bit YMM regs) */
  };
  
  extern int cpu_feature_p(int /*feat*/);
diff --git a/configure.ac b/configure.ac

index f8ad8b7..a2af5bf 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -278,6 +278,9 @@ AC_CHECK_HEADERS([sys/auxv.h])
  AC_CHECK_HEADERS([linux/auxvec.h])
  AC_CHECK_FUNCS([getauxval])
  
+dnl Some equipment for measuring CPU performance.
+AC_CHECK_HEADERS([linux/perf_event.h])
+
  dnl Find the bit lengths of the obvious integer types.  This will be useful
  dnl when deciding on a representation for multiprecision integers.
  type_bits="" type_bits_sep=""
diff --git a/math/mpmont.c b/math/mpmont.c

index f8a2611..094ac40 100644 (file)
--- a/math/mpmont.c
+++ b/math/mpmont.c
@@ -90,19 +90,25 @@ static void simple_redccore(mpw *dv, mpw *dvl, const mpw *mv,
  
  #if CPUFAM_X86
    MAYBE_REDC4(x86_sse2)
+  MAYBE_REDC4(x86_avx)
  #endif
  
  #if CPUFAM_AMD64
    MAYBE_REDC4(amd64_sse2)
+  MAYBE_REDC4(amd64_avx)
  #endif
  
  static redccore__functype *pick_redccore(void)
  {
  #if CPUFAM_X86
+  DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_x86_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX));
    DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_x86_sse2,
                      cpu_feature_p(CPUFEAT_X86_SSE2));
  #endif
  #if CPUFAM_AMD64
+  DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_amd64_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX));
    DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_amd64_sse2,
                      cpu_feature_p(CPUFEAT_X86_SSE2));
  #endif
@@ -190,19 +196,25 @@ static void simple_mulcore(mpw *dv, mpw *dvl,
  
  #if CPUFAM_X86
    MAYBE_MUL4(x86_sse2)
+  MAYBE_MUL4(x86_avx)
  #endif
  
  #if CPUFAM_AMD64
    MAYBE_MUL4(amd64_sse2)
+  MAYBE_MUL4(amd64_avx)
  #endif
  
  static mulcore__functype *pick_mulcore(void)
  {
  #if CPUFAM_X86
+  DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_x86_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX));
    DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_x86_sse2,
                      cpu_feature_p(CPUFEAT_X86_SSE2));
  #endif
  #if CPUFAM_AMD64
+  DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_amd64_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX));
    DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_amd64_sse2,
                      cpu_feature_p(CPUFEAT_X86_SSE2));
  #endif
diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S

index 9146a63..29939c1 100644 (file)
--- a/math/mpx-mul4-amd64-sse2.S
+++ b/math/mpx-mul4-amd64-sse2.S
@@ -96,32 +96,32 @@
  .macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
         // Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
         // of the product in registers D0, D1, D2, D3.
-       pshufd  \d0, \r, SHUF(3, \i, 3, \i) // (r_i, ?, r_i, ?)
+       pshufd  \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?)
    .ifnes "\d1", "nil"
-       movdqa  \d1, \slo               // (s'_0, s'_1, s''_0, s''_1)
+       movdqa  \d1, \slo               // (s'_0, s'_1; s''_0, s''_1)
    .endif
    .ifnes "\d3", "nil"
-       movdqa  \d3, \shi               // (s'_2, s'_3, s''_2, s''_3)
+       movdqa  \d3, \shi               // (s'_2, s'_3; s''_2, s''_3)
    .endif
    .ifnes "\d1", "nil"
-       psrldq  \d1, 4                  // (s'_1, s''_0, s''_1, 0)
+       psrldq  \d1, 4                  // (s'_1, s''_0; s''_1, 0)
    .endif
    .ifnes "\d2", "nil"
-       movdqa  \d2, \d0                // another copy of (r_i, ?, r_i, ?)
+       movdqa  \d2, \d0                // another copy of (r_i, ?; r_i, ?)
    .endif
    .ifnes "\d3", "nil"
-       psrldq  \d3, 4                  // (s'_3, s''_2, s''_3, 0)
+       psrldq  \d3, 4                  // (s'_3, s''_2; s''_3, 0)
    .endif
    .ifnes "\d1", "nil"
-       pmuludq \d1, \d0                // (r_i s'_1, r_i s''_1)
+       pmuludq \d1, \d0                // (r_i s'_1; r_i s''_1)
    .endif
    .ifnes "\d3", "nil"
-       pmuludq \d3, \d0                // (r_i s'_3, r_i s''_3)
+       pmuludq \d3, \d0                // (r_i s'_3; r_i s''_3)
    .endif
    .ifnes "\d2", "nil"
-       pmuludq \d2, \shi               // (r_i s'_2, r_i s''_2)
+       pmuludq \d2, \shi               // (r_i s'_2; r_i s''_2)
    .endif
-       pmuludq \d0, \slo               // (r_i s'_0, r_i s''_0)
+       pmuludq \d0, \slo               // (r_i s'_0; r_i s''_0)
  .endm
  
  .macro accum   c0, c1=nil, c2=nil, c3=nil
@@ -163,10 +163,10 @@
         // lane 0 or 1 of D; the high two lanes of D are clobbered.  On
         // completion, XMM3 is clobbered.  If CC is `nil', then the
         // contribution which would have been added to it is left in C.
-       pshufd  xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
-       psrldq  xmm3, 12                // (t, 0, 0, 0) = (t, 0)
-       pslldq  xmm3, 2                 // (t b, 0)
-       paddq   \c, xmm3                // (c' + t b, c'')
+       pshufd  xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
+       psrldq  xmm3, 12                // (t, 0; 0, 0) = (t; 0)
+       pslldq  xmm3, 2                 // (t b; 0)
+       paddq   \c, xmm3                // (c' + t b; c'')
    .ifeqs "\pos", "lo"
         movdqa  \d, \c
    .else
@@ -183,37 +183,37 @@
         // of the value represented in C are written at POS in D, and the
         // remaining bits are left at the bottom of T.
         movdqa  \t, \c
-       psllq   \t, 16                  // (?, c'' b)
-       pslldq  \c, 8                   // (0, c')
-       paddq   \t, \c                  // (?, c' + c'' b)
-       psrldq  \t, 8                   // c' + c'' b
+       psllq   \t, 16                  // (?; c'' b)
+       pslldq  \c, 8                   // (0; c')
+       paddq   \t, \c                  // (?; c' + c'' b)
+       psrldq  \t, 8                   // (c' + c'' b; 0) = (c; 0)
    .ifeqs "\pos", "lo"
         movdqa  \d, \t
    .else
         punpckldq \d, \t
    .endif
-       psrldq  \t, 4                   // floor((c' + c'' b)/B)
+       psrldq  \t, 4                   // (floor(c/B); 0)
  .endm
  
  .macro expand  z, a, b, c=nil, d=nil
         // On entry, A and C hold packed 128-bit values, and Z is zero.  On
         // exit, A:B and C:D together hold the same values in expanded
         // form.  If C is `nil', then only expand A to A:B.
-       movdqa  \b, \a                  // (a_0, a_1, a_2, a_3)
+       movdqa  \b, \a                  // (a_0, a_1; a_2, a_3)
    .ifnes "\c", "nil"
-       movdqa  \d, \c                  // (c_0, c_1, c_2, c_3)
+       movdqa  \d, \c                  // (c_0, c_1; c_2, c_3)
    .endif
-       punpcklwd \a, \z                // (a'_0, a''_0, a'_1, a''_1)
-       punpckhwd \b, \z                // (a'_2, a''_2, a'_3, a''_3)
+       punpcklwd \a, \z                // (a'_0, a''_0; a'_1, a''_1)
+       punpckhwd \b, \z                // (a'_2, a''_2; a'_3, a''_3)
    .ifnes "\c", "nil"
-       punpcklwd \c, \z                // (c'_0, c''_0, c'_1, c''_1)
-       punpckhwd \d, \z                // (c'_2, c''_2, c'_3, c''_3)
+       punpcklwd \c, \z                // (c'_0, c''_0; c'_1, c''_1)
+       punpckhwd \d, \z                // (c'_2, c''_2; c'_3, c''_3)
    .endif
-       pshufd  \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
-       pshufd  \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
+       pshufd  \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
+       pshufd  \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
    .ifnes "\c", "nil"
-       pshufd  \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
-       pshufd  \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
+       pshufd  \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
+       pshufd  \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
    .endif
  .endm
  
@@ -229,10 +229,10 @@
         // we can do that, we must gather them together.
         movdqa  \t, \c0
         movdqa  \u, \c1
-       punpcklqdq \t, \c2              // (y'_0, y'_2)
-       punpckhqdq \c0, \c2             // (y''_0, y''_2)
-       punpcklqdq \u, \c3              // (y'_1, y'_3)
-       punpckhqdq \c1, \c3             // (y''_1, y''_3)
+       punpcklqdq \t, \c2              // (y'_0; y'_2)
+       punpckhqdq \c0, \c2             // (y''_0; y''_2)
+       punpcklqdq \u, \c3              // (y'_1; y'_3)
+       punpckhqdq \c1, \c3             // (y''_1; y''_3)
  
         // Now split the double-prime pieces.  The high (up to) 48 bits will
         // go up; the low 16 bits go down.
@@ -240,43 +240,43 @@
         movdqa  \c3, \c1
         psllq   \c2, 48
         psllq   \c3, 48
-       psrlq   \c0, 16                 // high parts of (y''_0, y''_2)
-       psrlq   \c1, 16                 // high parts of (y''_1, y''_3)
-       psrlq   \c2, 32                 // low parts of (y''_0, y''_2)
-       psrlq   \c3, 32                 // low parts of (y''_1, y''_3)
+       psrlq   \c0, 16                 // high parts of (y''_0; y''_2)
+       psrlq   \c1, 16                 // high parts of (y''_1; y''_3)
+       psrlq   \c2, 32                 // low parts of (y''_0; y''_2)
+       psrlq   \c3, 32                 // low parts of (y''_1; y''_3)
    .ifnes "\hi", "nil"
         movdqa  \hi, \c1
    .endif
-       pslldq  \c1, 8                  // high part of (0, y''_1)
+       pslldq  \c1, 8                  // high part of (0; y''_1)
  
         paddq   \t, \c2                 // propagate down
         paddq   \u, \c3
-       paddq   \t, \c1                 // and up: (y_0, y_2)
-       paddq   \u, \c0                 // (y_1, y_3)
+       paddq   \t, \c1                 // and up: (y_0; y_2)
+       paddq   \u, \c0                 // (y_1; y_3)
    .ifnes "\hi", "nil"
-       psrldq  \hi, 8                  // high part of (y''_3, 0)
+       psrldq  \hi, 8                  // high part of (y''_3; 0)
    .endif
  
         // Finally extract the answer.  This complicated dance is better than
         // storing to memory and loading, because the piecemeal stores
         // inhibit store forwarding.
-       movdqa  \c3, \t                 // (y_0, y_1)
-       movdqa  \lo, \t                 // (y^*_0, ?, ?, ?)
-       psrldq  \t, 8                   // (y_2, 0)
-       psrlq   \c3, 32                 // (floor(y_0/B), ?)
-       paddq   \c3, \u                 // (y_1 + floor(y_0/B), ?)
-       movdqa  \c1, \c3                // (y^*_1, ?, ?, ?)
-       psrldq  \u, 8                   // (y_3, 0)
-       psrlq   \c3, 32                 // (floor((y_1 B + y_0)/B^2, ?)
-       paddq   \c3, \t                 // (y_2 + floor((y_1 B + y_0)/B^2, ?)
-       punpckldq \lo, \c3              // (y^*_0, y^*_2, ?, ?)
-       psrlq   \c3, 32             // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
-       paddq   \c3, \u       // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
+       movdqa  \c3, \t                 // (y_0; ?)
+       movdqa  \lo, \t                 // (y^*_0, ?; ?, ?)
+       psrldq  \t, 8                   // (y_2; 0)
+       psrlq   \c3, 32                 // (floor(y_0/B); ?)
+       paddq   \c3, \u                 // (y_1 + floor(y_0/B); ?)
+       movdqa  \c1, \c3                // (y^*_1, ?; ?, ?)
+       psrldq  \u, 8                   // (y_3; 0)
+       psrlq   \c3, 32                 // (floor((y_1 B + y_0)/B^2); ?)
+       paddq   \c3, \t                 // (y_2 + floor((y_1 B + y_0)/B^2); ?)
+       punpckldq \lo, \c3              // (y^*_0, y^*_2; ?, ?)
+       psrlq   \c3, 32             // (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
+       paddq   \c3, \u       // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
    .ifnes "\hi", "nil"
         movdqa  \t, \c3
         pxor    \u, \u
    .endif
-       punpckldq \c1, \c3              // (y^*_1, y^*_3, ?, ?)
+       punpckldq \c1, \c3              // (y^*_1, y^*_3; ?, ?)
    .ifnes "\hi", "nil"
         psrlq   \t, 32                  // very high bits of y
         paddq   \hi, \t
@@ -293,13 +293,13 @@
         // On exit, the carry registers, including XMM15, are updated to hold
         // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered.  The other
         // registers are preserved.
-       movd    xmm0, [rdi +  0]        // (a_0, 0)
-       movd    xmm1, [rdi +  4]        // (a_1, 0)
-       movd    xmm2, [rdi +  8]        // (a_2, 0)
-       movd    xmm15, [rdi + 12]       // (a_3, 0)
-       paddq   xmm12, xmm0             // (c'_0 + a_0, c''_0)
-       paddq   xmm13, xmm1             // (c'_1 + a_1, c''_1)
-       paddq   xmm14, xmm2             // (c'_2 + a_2, c''_2 + a_3 b)
+       movd    xmm0, [rdi +  0]        // (a_0; 0)
+       movd    xmm1, [rdi +  4]        // (a_1; 0)
+       movd    xmm2, [rdi +  8]        // (a_2; 0)
+       movd    xmm15, [rdi + 12]       // (a_3; 0)
+       paddq   xmm12, xmm0             // (c'_0 + a_0; c''_0)
+       paddq   xmm13, xmm1             // (c'_1 + a_1; c''_1)
+       paddq   xmm14, xmm2             // (c'_2 + a_2; c''_2 + a_3 b)
  .endm
  
  ///--------------------------------------------------------------------------
@@ -621,8 +621,8 @@ INTFUNC(mmla4)
         mulcore xmm7, 1,   xmm10, xmm11, xmm0,  xmm1,  xmm2
         accum                            xmm4,  xmm5,  xmm6
  
-       punpckldq xmm12, xmm15          // (w_0, 0, w_1, 0)
-       punpckhdq xmm14, xmm15          // (w_2, 0, w_3, 0)
+       punpckldq xmm12, xmm15          // (w_0, 0; w_1, 0)
+       punpckhdq xmm14, xmm15          // (w_2, 0; w_3, 0)
  
         mulcore xmm7, 2,   xmm10, xmm11, xmm0,  xmm1
         accum                            xmm5,  xmm6
@@ -634,10 +634,10 @@ INTFUNC(mmla4)
         mulcore xmm7, 3,   xmm10, xmm11, xmm0
         accum                            xmm6
  
-       punpckldq xmm12, xmm2           // (w_0, 0, 0, 0)
-       punpckldq xmm14, xmm2           // (w_2, 0, 0, 0)
-       punpckhdq xmm13, xmm2           // (w_1, 0, 0, 0)
-       punpckhdq xmm15, xmm2           // (w_3, 0, 0, 0)
+       punpckldq xmm12, xmm2           // (w_0, 0; 0, 0)
+       punpckldq xmm14, xmm2           // (w_2, 0; 0, 0)
+       punpckhdq xmm13, xmm2           // (w_1, 0; 0, 0)
+       punpckhdq xmm15, xmm2           // (w_3, 0; 0, 0)
  
         // That's lots of pieces.  Now we have to assemble the answer.
         squash  xmm3, xmm4, xmm5, xmm6,  xmm0, xmm1,  xmm10
@@ -703,8 +703,8 @@ INTFUNC(mont4)
         mulcore xmm7, 1,   xmm8,  xmm9,  xmm0,  xmm1,  xmm2
         accum                            xmm4,  xmm5,  xmm6
  
-       punpckldq xmm12, xmm15          // (w_0, 0, w_1, 0)
-       punpckhdq xmm14, xmm15          // (w_2, 0, w_3, 0)
+       punpckldq xmm12, xmm15          // (w_0, 0; w_1, 0)
+       punpckhdq xmm14, xmm15          // (w_2, 0; w_3, 0)
  
         mulcore xmm7, 2,   xmm8,  xmm9,  xmm0,  xmm1
         accum                            xmm5,  xmm6
@@ -716,10 +716,10 @@ INTFUNC(mont4)
         mulcore xmm7, 3,   xmm8,  xmm9,  xmm0
         accum                            xmm6
  
-       punpckldq xmm12, xmm2           // (w_0, 0, 0, 0)
-       punpckldq xmm14, xmm2           // (w_2, 0, 0, 0)
-       punpckhdq xmm13, xmm2           // (w_1, 0, 0, 0)
-       punpckhdq xmm15, xmm2           // (w_3, 0, 0, 0)
+       punpckldq xmm12, xmm2           // (w_0, 0; 0, 0)
+       punpckldq xmm14, xmm2           // (w_2, 0; 0, 0)
+       punpckhdq xmm13, xmm2           // (w_1, 0; 0, 0)
+       punpckhdq xmm15, xmm2           // (w_3, 0; 0, 0)
  
         // That's lots of pieces.  Now we have to assemble the answer.
         squash  xmm3, xmm4, xmm5, xmm6,  xmm0, xmm1,  xmm10
@@ -752,6 +752,13 @@ ENDFUNC
  ///--------------------------------------------------------------------------
  /// Bulk multipliers.
  
+FUNC(mpx_umul4_amd64_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       .arch   pentium4
+ENDFUNC
+
  FUNC(mpx_umul4_amd64_sse2)
         // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
         //                         const mpw *bv, const mpw *bvl);
@@ -901,6 +908,13 @@ FUNC(mpx_umul4_amd64_sse2)
  
  ENDFUNC
  
+FUNC(mpxmont_mul4_amd64_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       .arch   pentium4
+ENDFUNC
+
  FUNC(mpxmont_mul4_amd64_sse2)
         // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
         //                           const mpw *nv, size_t n, const mpw *mi);
@@ -1095,6 +1109,13 @@ FUNC(mpxmont_mul4_amd64_sse2)
  
  ENDFUNC
  
+FUNC(mpxmont_redc4_amd64_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       .arch   pentium4
+ENDFUNC
+
  FUNC(mpxmont_redc4_amd64_sse2)
         // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
         //                             size_t n, const mpw *mi);
@@ -1474,9 +1495,9 @@ ENDFUNC
  .endm
  
  .macro testldcarry
-       movdqu  xmm12, [rcx +  0]       // (c'_0, c''_0)
-       movdqu  xmm13, [rcx + 16]       // (c'_1, c''_1)
-       movdqu  xmm14, [rcx + 32]       // (c'_2, c''_2)
+       movdqu  xmm12, [rcx +  0]       // (c'_0; c''_0)
+       movdqu  xmm13, [rcx + 16]       // (c'_1; c''_1)
+       movdqu  xmm14, [rcx + 32]       // (c'_2; c''_2)
  .endm
  
  .macro testtop u=nil
diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S

index f6c8167..11aadc9 100644 (file)
--- a/math/mpx-mul4-x86-sse2.S
+++ b/math/mpx-mul4-x86-sse2.S
@@ -96,41 +96,41 @@
  .macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil
         // Load a word r_i from R, multiply by the expanded operand [S], and
         // leave the pieces of the product in registers D0, D1, D2, D3.
-       movd    \d0, \r                 // (r_i, 0, 0, 0)
+       movd    \d0, \r                 // (r_i, 0; 0, 0)
    .ifnes "\d1", "nil"
-       movdqa  \d1, [\s]               // (s'_0, s'_1, s''_0, s''_1)
+       movdqa  \d1, [\s]               // (s'_0, s'_1; s''_0, s''_1)
    .endif
    .ifnes "\d3", "nil"
-       movdqa  \d3, [\s + 16]          // (s'_2, s'_3, s''_2, s''_3)
+       movdqa  \d3, [\s + 16]          // (s'_2, s'_3; s''_2, s''_3)
    .endif
-       pshufd  \d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?, r_i, ?)
+       pshufd  \d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?)
    .ifnes "\d1", "nil"
-       psrldq  \d1, 4                  // (s'_1, s''_0, s''_1, 0)
+       psrldq  \d1, 4                  // (s'_1, s''_0; s''_1, 0)
    .endif
    .ifnes "\d2", "nil"
      .ifnes "\d3", "nil"
-       movdqa  \d2, \d3                // another copy of (s'_2, s'_3, ...)
+       movdqa  \d2, \d3                // another copy of (s'_2, s'_3; ...)
      .else
-       movdqa  \d2, \d0                // another copy of (r_i, ?, r_i, ?)
+       movdqa  \d2, \d0                // another copy of (r_i, ?; r_i, ?)
      .endif
    .endif
    .ifnes "\d3", "nil"
-       psrldq  \d3, 4                  // (s'_3, s''_2, s''_3, 0)
+       psrldq  \d3, 4                  // (s'_3, s''_2; s''_3, 0)
    .endif
    .ifnes "\d1", "nil"
-       pmuludq \d1, \d0                // (r_i s'_1, r_i s''_1)
+       pmuludq \d1, \d0                // (r_i s'_1; r_i s''_1)
    .endif
    .ifnes "\d3", "nil"
-       pmuludq \d3, \d0                // (r_i s'_3, r_i s''_3)
+       pmuludq \d3, \d0                // (r_i s'_3; r_i s''_3)
    .endif
    .ifnes "\d2", "nil"
      .ifnes "\d3", "nil"
-       pmuludq \d2, \d0                // (r_i s'_2, r_i s''_2)
+       pmuludq \d2, \d0                // (r_i s'_2; r_i s''_2)
      .else
         pmuludq \d2, [\s + 16]
      .endif
    .endif
-       pmuludq \d0, [\s]               // (r_i s'_0, r_i s''_0)
+       pmuludq \d0, [\s]               // (r_i s'_0; r_i s''_0)
  .endm
  
  .macro accum   c0, c1=nil, c2=nil, c3=nil
@@ -171,10 +171,10 @@
         // carry registers.  On completion, XMM3 is clobbered.  If CC is
         // `nil', then the contribution which would have been added to it is
         // left in C.
-       pshufd  xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
-       psrldq  xmm3, 12                // (t, 0, 0, 0) = (t, 0)
-       pslldq  xmm3, 2                 // (t b, 0)
-       paddq   \c, xmm3                // (c' + t b, c'')
+       pshufd  xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
+       psrldq  xmm3, 12                // (t, 0; 0, 0) = (t; 0)
+       pslldq  xmm3, 2                 // (t b; 0)
+       paddq   \c, xmm3                // (c' + t b; c'')
         movd    \d, \c
         psrlq   \c, 32                  // floor(c/B)
    .ifnes "\cc", "nil"
@@ -187,33 +187,33 @@
         // of the value represented in C are written to D, and the remaining
         // bits are left at the bottom of T.
         movdqa  \t, \c
-       psllq   \t, 16                  // (?, c'' b)
-       pslldq  \c, 8                   // (0, c')
-       paddq   \t, \c                  // (?, c' + c'' b)
-       psrldq  \t, 8                   // c' + c'' b
+       psllq   \t, 16                  // (?; c'' b)
+       pslldq  \c, 8                   // (0; c')
+       paddq   \t, \c                  // (?; c' + c'' b)
+       psrldq  \t, 8                   // (c' + c'' b; 0) = (c; 0)
         movd    \d, \t
-       psrldq  \t, 4                   // floor((c' + c'' b)/B)
+       psrldq  \t, 4                   // (floor(c/B); 0)
  .endm
  
  .macro expand  z, a, b, c=nil, d=nil
         // On entry, A and C hold packed 128-bit values, and Z is zero.  On
         // exit, A:B and C:D together hold the same values in expanded
         // form.  If C is `nil', then only expand A to A:B.
-       movdqa  \b, \a                  // (a_0, a_1, a_2, a_3)
+       movdqa  \b, \a                  // (a_0, a_1; a_2, a_3)
    .ifnes "\c", "nil"
-       movdqa  \d, \c                  // (c_0, c_1, c_2, c_3)
+       movdqa  \d, \c                  // (c_0, c_1; c_2, c_3)
    .endif
-       punpcklwd \a, \z                // (a'_0, a''_0, a'_1, a''_1)
-       punpckhwd \b, \z                // (a'_2, a''_2, a'_3, a''_3)
+       punpcklwd \a, \z                // (a'_0, a''_0; a'_1, a''_1)
+       punpckhwd \b, \z                // (a'_2, a''_2; a'_3, a''_3)
    .ifnes "\c", "nil"
-       punpcklwd \c, \z                // (c'_0, c''_0, c'_1, c''_1)
-       punpckhwd \d, \z                // (c'_2, c''_2, c'_3, c''_3)
+       punpcklwd \c, \z                // (c'_0, c''_0; c'_1, c''_1)
+       punpckhwd \d, \z                // (c'_2, c''_2; c'_3, c''_3)
    .endif
-       pshufd  \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
-       pshufd  \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
+       pshufd  \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
+       pshufd  \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
    .ifnes "\c", "nil"
-       pshufd  \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
-       pshufd  \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
+       pshufd  \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
+       pshufd  \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
    .endif
  .endm
  
@@ -229,10 +229,10 @@
         // we can do that, we must gather them together.
         movdqa  \t, \c0
         movdqa  \u, \c1
-       punpcklqdq \t, \c2              // (y'_0, y'_2)
-       punpckhqdq \c0, \c2             // (y''_0, y''_2)
-       punpcklqdq \u, \c3              // (y'_1, y'_3)
-       punpckhqdq \c1, \c3             // (y''_1, y''_3)
+       punpcklqdq \t, \c2              // (y'_0; y'_2)
+       punpckhqdq \c0, \c2             // (y''_0; y''_2)
+       punpcklqdq \u, \c3              // (y'_1; y'_3)
+       punpckhqdq \c1, \c3             // (y''_1; y''_3)
  
         // Now split the double-prime pieces.  The high (up to) 48 bits will
         // go up; the low 16 bits go down.
@@ -240,43 +240,43 @@
         movdqa  \c3, \c1
         psllq   \c2, 48
         psllq   \c3, 48
-       psrlq   \c0, 16                 // high parts of (y''_0, y''_2)
-       psrlq   \c1, 16                 // high parts of (y''_1, y''_3)
-       psrlq   \c2, 32                 // low parts of (y''_0, y''_2)
-       psrlq   \c3, 32                 // low parts of (y''_1, y''_3)
+       psrlq   \c0, 16                 // high parts of (y''_0; y''_2)
+       psrlq   \c1, 16                 // high parts of (y''_1; y''_3)
+       psrlq   \c2, 32                 // low parts of (y''_0; y''_2)
+       psrlq   \c3, 32                 // low parts of (y''_1; y''_3)
    .ifnes "\hi", "nil"
         movdqa  \hi, \c1
    .endif
-       pslldq  \c1, 8                  // high part of (0, y''_1)
+       pslldq  \c1, 8                  // high part of (0; y''_1)
  
         paddq   \t, \c2                 // propagate down
         paddq   \u, \c3
-       paddq   \t, \c1                 // and up: (y_0, y_2)
-       paddq   \u, \c0                 // (y_1, y_3)
+       paddq   \t, \c1                 // and up: (y_0; y_2)
+       paddq   \u, \c0                 // (y_1; y_3)
    .ifnes "\hi", "nil"
-       psrldq  \hi, 8                  // high part of (y''_3, 0)
+       psrldq  \hi, 8                  // high part of (y''_3; 0)
    .endif
  
         // Finally extract the answer.  This complicated dance is better than
         // storing to memory and loading, because the piecemeal stores
         // inhibit store forwarding.
-       movdqa  \c3, \t                 // (y_0, y_1)
-       movdqa  \lo, \t                 // (y^*_0, ?, ?, ?)
-       psrldq  \t, 8                   // (y_2, 0)
-       psrlq   \c3, 32                 // (floor(y_0/B), ?)
-       paddq   \c3, \u                 // (y_1 + floor(y_0/B), ?)
-       movdqa  \c1, \c3                // (y^*_1, ?, ?, ?)
-       psrldq  \u, 8                   // (y_3, 0)
-       psrlq   \c3, 32                 // (floor((y_1 B + y_0)/B^2, ?)
-       paddq   \c3, \t                 // (y_2 + floor((y_1 B + y_0)/B^2, ?)
-       punpckldq \lo, \c3              // (y^*_0, y^*_2, ?, ?)
-       psrlq   \c3, 32             // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
-       paddq   \c3, \u       // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
+       movdqa  \c3, \t                 // (y_0; ?)
+       movdqa  \lo, \t                 // (y^*_0, ?; ?, ?)
+       psrldq  \t, 8                   // (y_2; 0)
+       psrlq   \c3, 32                 // (floor(y_0/B); ?)
+       paddq   \c3, \u                 // (y_1 + floor(y_0/B); ?)
+       movdqa  \c1, \c3                // (y^*_1, ?; ?, ?)
+       psrldq  \u, 8                   // (y_3; 0)
+       psrlq   \c3, 32                 // (floor((y_1 B + y_0)/B^2); ?)
+       paddq   \c3, \t                 // (y_2 + floor((y_1 B + y_0)/B^2); ?)
+       punpckldq \lo, \c3              // (y^*_0, y^*_2; ?, ?)
+       psrlq   \c3, 32             // (floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
+       paddq   \c3, \u       // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3); ?)
    .ifnes "\hi", "nil"
         movdqa  \t, \c3
         pxor    \u, \u
    .endif
-       punpckldq \c1, \c3              // (y^*_1, y^*_3, ?, ?)
+       punpckldq \c1, \c3              // (y^*_1, y^*_3; ?, ?)
    .ifnes "\hi", "nil"
         psrlq   \t, 32                  // very high bits of y
         paddq   \hi, \t
@@ -293,14 +293,14 @@
         // On exit, the carry registers, including XMM7, are updated to hold
         // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered.  The other
         // registers are preserved.
-       movd    xmm0, [edi +  0]        // (a_0, 0)
-       movd    xmm1, [edi +  4]        // (a_1, 0)
-       movd    xmm2, [edi +  8]        // (a_2, 0)
-       movd    xmm7, [edi + 12]        // (a_3, 0)
-
-       paddq   xmm4, xmm0              // (c'_0 + a_0, c''_0)
-       paddq   xmm5, xmm1              // (c'_1 + a_1, c''_1)
-       paddq   xmm6, xmm2              // (c'_2 + a_2, c''_2 + a_3 b)
+       movd    xmm0, [edi +  0]        // (a_0; 0)
+       movd    xmm1, [edi +  4]        // (a_1; 0)
+       movd    xmm2, [edi +  8]        // (a_2; 0)
+       movd    xmm7, [edi + 12]        // (a_3; 0)
+
+       paddq   xmm4, xmm0              // (c'_0 + a_0; c''_0)
+       paddq   xmm5, xmm1              // (c'_1 + a_1; c''_1)
+       paddq   xmm6, xmm2              // (c'_2 + a_2; c''_2 + a_3 b)
  .endm
  
  ///--------------------------------------------------------------------------
@@ -678,6 +678,14 @@ ENDFUNC
  ///--------------------------------------------------------------------------
  /// Bulk multipliers.
  
+FUNC(mpx_umul4_x86_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       // and drop through...
+       .arch   pentium4
+ENDFUNC
+
  FUNC(mpx_umul4_x86_sse2)
         // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
         //                         const mpw *bv, const mpw *bvl);
@@ -778,6 +786,14 @@ FUNC(mpx_umul4_x86_sse2)
  
  ENDFUNC
  
+FUNC(mpxmont_mul4_x86_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       // and drop through...
+       .arch   pentium4
+ENDFUNC
+
  FUNC(mpxmont_mul4_x86_sse2)
         // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
         //                           const mpw *nv, size_t n, const mpw *mi);
@@ -919,6 +935,14 @@ FUNC(mpxmont_mul4_x86_sse2)
  
  ENDFUNC
  
+FUNC(mpxmont_redc4_x86_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       // and drop through...
+       .arch   pentium4
+ENDFUNC
+
  FUNC(mpxmont_redc4_x86_sse2)
         // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
         //                             size_t n, const mpw *mi);
@@ -1098,9 +1122,9 @@ ENDFUNC
  
  .macro testldcarry c
         mov     ecx, \c                 // -> c
-       movdqu  xmm4, [ecx +  0]        // (c'_0, c''_0)
-       movdqu  xmm5, [ecx + 16]        // (c'_1, c''_1)
-       movdqu  xmm6, [ecx + 32]        // (c'_2, c''_2)
+       movdqu  xmm4, [ecx +  0]        // (c'_0; c''_0)
+       movdqu  xmm5, [ecx + 16]        // (c'_1; c''_1)
+       movdqu  xmm6, [ecx + 32]        // (c'_2; c''_2)
  .endm
  
  .macro testexpand v=nil, y=nil
diff --git a/math/mpx.c b/math/mpx.c

index 3983e7c..4294845 100644 (file)
--- a/math/mpx.c
+++ b/math/mpx.c
@@ -923,19 +923,25 @@ static void simple_umul(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
  
  #if CPUFAM_X86
    MAYBE_UMUL4(x86_sse2)
+  MAYBE_UMUL4(x86_avx)
  #endif
  
  #if CPUFAM_AMD64
    MAYBE_UMUL4(amd64_sse2)
+  MAYBE_UMUL4(amd64_avx)
  #endif
  
  static mpx_umul__functype *pick_umul(void)
  {
  #if CPUFAM_X86
+  DISPATCH_PICK_COND(mpx_umul, maybe_umul4_x86_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX));
    DISPATCH_PICK_COND(mpx_umul, maybe_umul4_x86_sse2,
                      cpu_feature_p(CPUFEAT_X86_SSE2));
  #endif
  #if CPUFAM_AMD64
+  DISPATCH_PICK_COND(mpx_umul, maybe_umul4_amd64_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX));
    DISPATCH_PICK_COND(mpx_umul, maybe_umul4_amd64_sse2,
                      cpu_feature_p(CPUFEAT_X86_SSE2));
  #endif
diff --git a/progs/.gitignore b/progs/.gitignore

new file mode 100644 (file)

index 0000000..b46a9e8
--- /dev/null
+++ b/progs/.gitignore
@@ -0,0 +1,2 @@
+/getdate.h
+/getdate.y
diff --git a/progs/catcrypt.1 b/progs/catcrypt.1

index 6c55440..d55b7bf 100644 (file)
--- a/progs/catcrypt.1
+++ b/progs/catcrypt.1
@@ -248,7 +248,12 @@ Makes use of
  .B cipher
  and
  .B mac
-attributes.
+attributes.  Run
+.B catcrypt show cipher
+for a list of supported symmetric encryption algorithms; the default
+.I cipher
+is
+.BR blowfish-cbc .
  This is the default transform.
  .TP
  .B naclbox
@@ -286,11 +291,8 @@ attribute then the
  .I bulk
  in the
  .I kemalgspec
-is used; if that it absent, then the default of
-.B blowfish-cbc
-is used.  Run
-.B catcrypt show cipher
-for a list of supported symmetric encryption algorithms.
+is used; if that is absent, then the default depends on the bulk
+transform.
  .TP
  .B hash
  This is the hash function used to distil entropy from the shared secret
@@ -559,24 +561,26 @@ key-encapsulation key's
  attribute.
  .TP
  .B cipher
-The symmetric encryption algorithms which can be used in a
+The symmetric encryption algorithms which can be named in a
  key-encapsulation key's
  .B cipher
-attribute.
+attribute when using the
+.B gencomp
+bulk transform.
  .TP
  .B mac
-The message authentication algorithms which can be used in a
+The message authentication algorithms which can be named in a
  key-encapsulation key's
  .B mac
  attribute.
  .TP
  .B sig
-The signature algorithms which can be used in a signing key's
+The signature algorithms which can be named in a signing key's
  .B sig
  attribute.
  .TP
  .B hash
-The hash functions which can be used in a key's
+The hash functions which can be named in a key's
  .B hash
  attribute.
  .TP
diff --git a/progs/perftest.c b/progs/perftest.c

index f064c2a..267712b 100644 (file)
--- a/progs/perftest.c
+++ b/progs/perftest.c
@@ -43,7 +43,13 @@
  #include <sys/time.h>
  #include <unistd.h>
  
+#ifdef HAVE_LINUX_PERF_EVENT_H
+#  include <linux/perf_event.h>
+#  include <asm/unistd.h>
+#endif
+
  #include <mLib/alloc.h>
+#include <mLib/bits.h>
  #include <mLib/dstr.h>
  #include <mLib/mdwopt.h>
  #include <mLib/quis.h>
@@ -82,10 +88,13 @@
  
  typedef struct opts {
    const char *name;                    /* Pre-configured named thing */
+  const char *opwhat;                  /* What to call operations */
    unsigned fbits;                      /* Field size bits */
    unsigned gbits;                      /* Group size bits */
    unsigned n;                          /* Number of factors */
    unsigned i;                          /* Number of intervals (or zero) */
+  unsigned k;                          /* Main loop batch size */
+  unsigned long sc;                    /* Scale factor */
    double t;                            /* Time for each interval (secs) */
    mp *e;                               /* Public exponent */
    unsigned f;                          /* Flags */
@@ -495,7 +504,9 @@ static void *ksched_init(opts *o)
      die(1, "must specify encryption scheme name");
    if ((c->c = gcipher_byname(o->name)) == 0)
      die(1, "encryption scheme `%s' not known", o->name);
-  c->ksz = keysz(o->gbits/8, c->c->keysz);
+  c->ksz = keysz(o->fbits/8, c->c->keysz);
+  if (o->fbits%8 || (o->fbits && c->ksz != o->fbits/8))
+    die(1, "bad key size %u for %s", o->fbits, o->name);
    c->k = xmalloc(c->ksz);
    rand_get(RAND_GLOBAL, c->k, c->ksz);
    return (c);
@@ -525,13 +536,16 @@ static void *enc_init(opts *o)
      die(1, "must specify encryption scheme name");
    if ((cc = gcipher_byname(o->name)) == 0)
      die(1, "encryption scheme `%s' not known", o->name);
-  ksz = keysz(0, cc->keysz);
+  ksz = keysz(o->fbits/8, cc->keysz);
+  if (o->fbits%8 || (o->fbits && ksz != o->fbits/8))
+    die(1, "bad key size %u for %s", o->fbits, o->name);
    k = xmalloc(ksz);
    rand_get(RAND_GLOBAL, k, ksz);
    c->c = GC_INIT(cc, k, ksz);
    xfree(k);
    c->sz = o->gbits ? o->gbits : 65536;
    c->n = o->n ? o->n : 16;
+  o->opwhat = "byte"; o->sc = c->n*c->sz;
    c->m = xmalloc(c->sz);
    return (c);
  }
@@ -562,6 +576,7 @@ static void *hash_init(opts *o)
      die(1, "hash function `%s' not known", o->name);
    c->sz = o->gbits ? o->gbits : 65536;
    c->n = o->n ? o->n : 16;
+  o->opwhat = "byte"; o->sc = c->n*c->sz;
    c->m = xmalloc(c->sz);
    return (c);
  }
@@ -596,6 +611,7 @@ static void *poly1305_jobinit(opts *o)
    rand_get(RAND_GLOBAL, c->s, sizeof(c->s));
    c->sz = o->gbits ? o->gbits : 65536;
    c->n = o->n ? o->n : 16;
+  o->opwhat = "byte"; o->sc = c->n*c->sz;
    c->m = xmalloc(c->sz);
    return (c);
  }
@@ -644,6 +660,73 @@ static const jobops jobtab[] = {
    { 0,                         0,                      0 }
  };
  
+/*----- Cycle counting ----------------------------------------------------*/
+
+typedef kludge64 cycles;
+static int cyclecount_active_p = 0;
+
+#if defined(__GNUC__) && (CPUFAM_X86 || CPUFAM_AMD64)
+
+static void init_cyclecount(void) { cyclecount_active_p = 1; }
+
+static cycles cyclecount(void)         /* read the CPU's timestamp counter */
+{
+  uint32 lo, hi;
+  kludge64 cy;
+  /* `volatile': each call must issue its own `rdtsc', never a cached one. */
+  __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
+  SET64(cy, hi, lo);                   /* EDX is the high half, EAX the low */
+  return cy;
+}
+
+#elif defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)
+
+static int perf_fd = -1;
+
+static void init_cyclecount(void)
+{
+  struct perf_event_attr attr = { 0 };
+
+  attr.type = PERF_TYPE_HARDWARE;
+  attr.size = sizeof(attr);
+  attr.config = PERF_COUNT_HW_CPU_CYCLES;
+  attr.disabled = 0;
+  attr.exclude_kernel = 1;
+  attr.exclude_hv = 1;
+
+  if ((perf_fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0)) < 0)
+    moan("failed to open perf event: %s", strerror(errno));
+  else
+    cyclecount_active_p = 1;
+}
+
+static cycles cyclecount(void)
+{
+  kludge64 cy;
+  ssize_t n;
+
+  if (!cyclecount_active_p)
+    goto fail;
+  else if ((n = read(perf_fd, &cy.i, sizeof(cy.i))) != sizeof(cy.i)) {
+    if (n < 0) moan("error reading perf event: %s", strerror(errno));
+    else moan("unexpected short read from perf event");
+    cyclecount_active_p = 0; close(perf_fd); perf_fd = -1;
+    goto fail;
+  }
+end:
+  return (cy);
+fail:
+  SET64(cy, 0, 0);
+  goto end;
+}
+
+#else
+
+static void init_cyclecount(void) { cyclecount_active_p = 0; }
+static cycles cyclecount(void) { kludge64 cy; SET64(cy, 0, 0); return (cy); }
+
+#endif
+
  /*----- Main code ---------------------------------------------------------*/
  
  void version(FILE *fp)
@@ -672,12 +755,14 @@ Options:\n\
  -l, --list [ITEM...]   List all the various names of things.\n\
  \n\
  -C, --name=NAME                Select curve/DH-group/enc/hash name.\n\
--b, --field-bits       Field size for g-prime and rsa.\n\
+-b, --field-bits       Field size for g-prime and rsa;\n\
+                         key bits for ksched and enc.\n\
  -q, --no-check         Don't check field/group for validity.\n\
--B, --group-bits       Group size for g-prime; key size for ksched;\n\
-                         data size for enc and hash.\n\
--n, --factors=COUNT    Number of factors for {exp,mul}-sim.\n\
+-B, --group-bits       Group size for g-prime; data size for enc and hash.\n\
+-n, --factors=COUNT    Number of factors for {exp,mul}-sim;\n\
+                         inner iterations for enc and hash.\n\
  -i, --intervals=COUNT  Number of intervals to run for.  [0; forever]\n\
+-k, --batch=COUNT      Number of operations to batch between timer checks.\n\
  -t, --time=TIME                Length of an interval in seconds.  [1]\n\
  ");
  }
@@ -734,15 +819,16 @@ int main(int argc, char *argv[])
    opts o = { 0 };
    const jobops *j;
    struct timeval tv_next, tv_now;
-  double t, ttot;
-  unsigned n;
+  double t, ttot, cy, cytot;
+  unsigned n, k;
    unsigned long ii;
-  clock_t c_start, c_stop;
+  clock_t c0, c1;
+  kludge64 cy0, cy1, cydiff;
    double itot;
    void *p;
  
    ego(argv[0]);
-  o.t = 1;
+  o.t = 1; o.k = 1; o.sc = 1; o.opwhat = "op";
    for (;;) {
      static const struct option opts[] = {
        { "help",                0,              0,      'h' },
@@ -754,13 +840,14 @@ int main(int argc, char *argv[])
        { "group-bits",  OPTF_ARGREQ,    0,      'B' },
        { "factors",     OPTF_ARGREQ,    0,      'n' },
        { "intervals",   OPTF_ARGREQ,    0,      'i' },
+      { "batch",       OPTF_ARGREQ,    0,      'k' },
        { "public-exponent", OPTF_ARGREQ, 0,     'e' },
        { "time",                OPTF_ARGREQ,    0,      't' },
        { "no-check",    0,              0,      'q' },
        { 0,             0,              0,      0 }
      };
  
-    i = mdwopt(argc, argv, "hvulC:b:B:n:i:e:t:q", opts, 0, 0, 0);
+    i = mdwopt(argc, argv, "hvulC:b:B:n:i:k:e:t:q", opts, 0, 0, 0);
      if (i < 0) break;
      switch (i) {
        case 'h': help(stdout); exit(0);
@@ -778,6 +865,7 @@ int main(int argc, char *argv[])
         break;
        case 'i': o.i = uarg("interval count", optarg); break;
        case 't': o.t = farg("interval length", optarg); break;
+      case 'k': o.k = uarg("batch size", optarg); break;
        case 'q': o.f |= OF_NOCHECK; break;
        default: usage(stderr); exit(1);
      }
@@ -790,23 +878,29 @@ int main(int argc, char *argv[])
    p = j->init(&o);
  
    n = 0;
-  ttot = itot =         0;
+  ttot = itot = 0; cytot = 0; init_cyclecount();
    gettimeofday(&tv_now, 0);
    do {
      tv_addl(&tv_next, &tv_now, o.t, fmod(o.t * MILLION, MILLION));
      ii = 0;
-    c_start = clock();
+    c0 = clock(); cy0 = cyclecount();
      do {
-      j->run(p);
-      ii++;
+      for (k = 0; k < o.k; k++) { j->run(p); }
+      ii += k;
        gettimeofday(&tv_now, 0);
      } while (TV_CMP(&tv_now, <, &tv_next));
-    c_stop = clock();
-    t = (double)(c_stop - c_start)/CLOCKS_PER_SEC;
-    itot += ii;
-    ttot += t;
-    printf("%5u: did = %5lu; /sec = %5f; avg /sec = %5f\n",
+    cy1 = cyclecount(); c1 = clock();
+    t = (double)(c1 - c0)/CLOCKS_PER_SEC;
+    itot += ii; ttot += t;
+    printf("%5u: did = %5lu; /sec = %5f; avg /sec = %5f",
            n, ii, ii/t, itot/ttot);
+    if (cyclecount_active_p) {
+      SUB64(cydiff, cy1, cy0); cy = LO64(cydiff) + ldexp(HI64(cydiff), 32);
+      cytot += cy;
+      printf(" (cy/%s = %3f; avg cy/%s = %3f)",
+            o.opwhat, cy/ii/o.sc, o.opwhat, cytot/itot/o.sc);
+    }
+    putchar('\n');
      fflush(stdout);
      n++;
    } while (!o.i || n < o.i);
diff --git a/symm/.gitignore b/symm/.gitignore

index 55e2c17..fa75057 100644 (file)
--- a/symm/.gitignore
+++ b/symm/.gitignore
@@ -1 +1,57 @@
-modes/
+/modes/
+/modes.am
+/stubs.am
+/stubs.gen-stamp
+
+/t/salsa20
+/t/sha3
+
+/sha224.c
+/sha224.h
+/sha384.h
+/sha384.c
+/sha512-224.c
+/sha512-224.h
+/sha512-256.c
+/sha512-256.h
+
+/safersk.c
+/safersk.h
+
+/whirlpool256.c
+/whirlpool256.h
+
+/sha3-224.c
+/sha3-224.h
+/sha3-256.c
+/sha3-256.h
+/sha3-384.c
+/sha3-384.h
+/sha3-512.c
+/sha3-512.h
+/kmac128.h
+/kmac256.h
+/shake128.h
+/shake256.h
+/shake128-xof.h
+/shake256-xof.h
+
+/chacha20.h
+/chacha12.h
+/chacha8.h
+/chacha12-ietf.h
+/chacha20-ietf.h
+/chacha8-ietf.h
+/xchacha.h
+/xchacha20.h
+/xchacha12.h
+/xchacha8.h
+
+/salsa2012.h
+/salsa208.h
+/salsa20-ietf.h
+/salsa2012-ietf.h
+/salsa208-ietf.h
+/xsalsa20.h
+/xsalsa2012.h
+/xsalsa208.h
diff --git a/symm/blkc.h b/symm/blkc.h

index ff631f0..e083752 100644 (file)
--- a/symm/blkc.h
+++ b/symm/blkc.h
@@ -109,7 +109,7 @@
  
  #define BLKC_SHOW(PRE, tag, w) do {                                    \
    fputs(tag ": ", stdout);                                             \
-  BLKC_SKEL_X(PRE, BLKC_W(w);, printf("%08x ", *_w++););               \
+  BLKC_SKEL_X(PRE, const BLKC_W(w);, printf("%08x ", *_w++););         \
    fputc('\n', stdout);                                                 \
  } while (0)
  
diff --git a/symm/chacha-x86ish-sse2.S b/symm/chacha-x86ish-sse2.S

index 2dab283..77047eb 100644 (file)
--- a/symm/chacha-x86ish-sse2.S
+++ b/symm/chacha-x86ish-sse2.S
@@ -33,9 +33,17 @@
  ///--------------------------------------------------------------------------
  /// Main code.
  
-       .arch pentium4
         .text
  
+FUNC(chacha_core_x86ish_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       // drop through...
+ENDFUNC
+
+       .arch   pentium4
+
  FUNC(chacha_core_x86ish_sse2)
  
         // Initial setup.
@@ -156,9 +164,9 @@ FUNC(chacha_core_x86ish_sse2)
  
         // c += d; b ^= c; b <<<=  7
         paddd   xmm2, xmm3
-        pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
+        pshufd xmm3, xmm3, SHUF(3, 0, 1, 2)
         pxor    xmm1, xmm2
-        pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
+        pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
         movdqa  xmm4, xmm1
         pslld   xmm1, 7
         psrld   xmm4, 25
@@ -176,7 +184,7 @@ FUNC(chacha_core_x86ish_sse2)
         //
         // The shuffles have quite high latency, so they've mostly been
         // pushed upwards.  The remaining one can't be moved, though.
-       pshufd  xmm1, xmm1, SHUF(0, 3, 2, 1)
+       pshufd  xmm1, xmm1, SHUF(1, 2, 3, 0)
  
         // Apply the diagonal quarterround to each of the columns
         // simultaneously.
@@ -207,9 +215,9 @@ FUNC(chacha_core_x86ish_sse2)
  
         // c += d; b ^= c; b <<<=  7
         paddd   xmm2, xmm3
-        pshufd xmm3, xmm3, SHUF(0, 3, 2, 1)
+        pshufd xmm3, xmm3, SHUF(1, 2, 3, 0)
         pxor    xmm1, xmm2
-        pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
+        pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
         movdqa  xmm4, xmm1
         pslld   xmm1, 7
         psrld   xmm4, 25
@@ -218,7 +226,7 @@ FUNC(chacha_core_x86ish_sse2)
         // Finally, finish off undoing the transpose, and we're done for this
         // doubleround.  Again, most of this was done above so we don't have
         // to wait for the shuffles.
-       pshufd  xmm1, xmm1, SHUF(2, 1, 0, 3)
+       pshufd  xmm1, xmm1, SHUF(3, 0, 1, 2)
  
         // Decrement the loop counter and see if we should go round again.
         sub     NR, 2
diff --git a/symm/chacha.c b/symm/chacha.c

index 3419861..9b83eea 100644 (file)
--- a/symm/chacha.c
+++ b/symm/chacha.c
@@ -72,6 +72,7 @@ static void simple_core(unsigned r, const chacha_matrix src,
  
  #if CPUFAM_X86 || CPUFAM_AMD64
  extern core__functype chacha_core_x86ish_sse2;
+extern core__functype chacha_core_x86ish_avx;
  #endif
  
  #if CPUFAM_ARMEL
@@ -85,6 +86,8 @@ extern core__functype chacha_core_arm64;
  static core__functype *pick_core(void)
  {
  #if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(chacha_core, chacha_core_x86ish_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX));
    DISPATCH_PICK_COND(chacha_core, chacha_core_x86ish_sse2,
                      cpu_feature_p(CPUFEAT_X86_SSE2));
  #endif
diff --git a/symm/rijndael-base.c b/symm/rijndael-base.c

index 83a49e9..2f65191 100644 (file)
--- a/symm/rijndael-base.c
+++ b/symm/rijndael-base.c
@@ -118,6 +118,7 @@ CPU_DISPATCH(static, EMPTY, void, setup,
  
  #if CPUFAM_X86 || CPUFAM_AMD64
  extern setup__functype rijndael_setup_x86ish_aesni;
+extern setup__functype rijndael_setup_x86ish_aesni_avx;
  #endif
  #if CPUFAM_ARMEL && HAVE_AS_ARMV8_CRYPTO
  extern setup__functype rijndael_setup_arm_crypto;
@@ -129,6 +130,9 @@ extern setup__functype rijndael_setup_arm64_crypto;
  static setup__functype *pick_setup(void)
  {
  #if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_x86ish_aesni_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX) &&
+                    cpu_feature_p(CPUFEAT_X86_AESNI));
    DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_x86ish_aesni,
                      cpu_feature_p(CPUFEAT_X86_AESNI));
  #endif
diff --git a/symm/rijndael-x86ish-aesni.S b/symm/rijndael-x86ish-aesni.S

index e556aa5..dc80f4d 100644 (file)
--- a/symm/rijndael-x86ish-aesni.S
+++ b/symm/rijndael-x86ish-aesni.S
@@ -61,6 +61,12 @@
  ///--------------------------------------------------------------------------
  /// Key setup.
  
+FUNC(rijndael_setup_x86ish_aesni_avx)
+       vzeroupper                    // avoid penalty on `legacy' XMM access
+  endprologue
+       // and drop through...
+ENDFUNC
+
  FUNC(rijndael_setup_x86ish_aesni)
  
  #define SI WHOLE(si)
@@ -205,16 +211,16 @@ FUNC(rijndael_setup_x86ish_aesni)
         // Fourth word of the cycle, and seven or eight words of key.  Do a
         // byte substitution.
         movd    xmm0, eax
-       pshufd  xmm0, xmm0, SHUF(2, 1, 0, 3)
+       pshufd  xmm0, xmm0, SHUF(3, 0, 1, 2)
         aeskeygenassist xmm1, xmm0, 0
         movd    eax, xmm1
         jmp     2f
  
         // First word of the cycle.  This is the complicated piece.
  1:     movd    xmm0, eax
-       pshufd  xmm0, xmm0, SHUF(0, 3, 2, 1)
+       pshufd  xmm0, xmm0, SHUF(1, 2, 3, 0)
         aeskeygenassist xmm1, xmm0, 0
-       pshufd  xmm1, xmm1, SHUF(2, 1, 0, 3)
+       pshufd  xmm1, xmm1, SHUF(3, 0, 1, 2)
         movd    eax, xmm1
         xor     al, [RCON]
         inc     RCON
@@ -365,6 +371,12 @@ ENDFUNC
  /// Encrypting and decrypting blocks.
  
  .macro encdec  op, aes, koff
+  FUNC(rijndael_\op\()_x86ish_aesni_avx)
+       vzeroupper                      // avoid XMM penalties
+  endprologue
+       // and drop through...
+  ENDFUNC
+
    FUNC(rijndael_\op\()_x86ish_aesni)
  
  #if CPUFAM_X86
diff --git a/symm/rijndael.c b/symm/rijndael.c

index 02cfb76..7db9e01 100644 (file)
--- a/symm/rijndael.c
+++ b/symm/rijndael.c
@@ -83,6 +83,8 @@ CPU_DISPATCH(EMPTY, EMPTY, void, rijndael_dblk,
  #if CPUFAM_X86 || CPUFAM_AMD64
  extern rijndael_eblk__functype rijndael_eblk_x86ish_aesni;
  extern rijndael_dblk__functype rijndael_dblk_x86ish_aesni;
+extern rijndael_eblk__functype rijndael_eblk_x86ish_aesni_avx;
+extern rijndael_dblk__functype rijndael_dblk_x86ish_aesni_avx;
  #endif
  #if CPUFAM_ARMEL && HAVE_AS_ARMV8_CRYPTO
  extern rijndael_eblk__functype rijndael_eblk_arm_crypto;
@@ -96,6 +98,9 @@ extern rijndael_dblk__functype rijndael_dblk_arm64_crypto;
  static rijndael_eblk__functype *pick_eblk(void)
  {
  #if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_x86ish_aesni_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX) &&
+                    cpu_feature_p(CPUFEAT_X86_AESNI));
    DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_x86ish_aesni,
                      cpu_feature_p(CPUFEAT_X86_AESNI));
  #endif
@@ -113,6 +118,9 @@ static rijndael_eblk__functype *pick_eblk(void)
  static rijndael_dblk__functype *pick_dblk(void)
  {
  #if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_x86ish_aesni_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX) &&
+                    cpu_feature_p(CPUFEAT_X86_AESNI));
    DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_x86ish_aesni,
                      cpu_feature_p(CPUFEAT_X86_AESNI));
  #endif
diff --git a/symm/salsa20-x86ish-sse2.S b/symm/salsa20-x86ish-sse2.S

index 7d8e2e3..06ba3d2 100644 (file)
--- a/symm/salsa20-x86ish-sse2.S
+++ b/symm/salsa20-x86ish-sse2.S
@@ -33,9 +33,17 @@
  ///--------------------------------------------------------------------------
  /// Main code.
  
-       .arch pentium4
         .text
  
+FUNC(salsa20_core_x86ish_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       // drop through...
+ENDFUNC
+
+       .arch   pentium4
+
  FUNC(salsa20_core_x86ish_sse2)
  
         // Initial setup.
@@ -172,7 +180,7 @@ FUNC(salsa20_core_x86ish_sse2)
         // d ^= (c + b) <<< 13
         movdqa  xmm4, xmm2
         paddd   xmm4, xmm1
-        pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
+        pshufd xmm1, xmm1, SHUF(3, 0, 1, 2)
         movdqa  xmm5, xmm4
         pslld   xmm4, 13
         psrld   xmm5, 19
@@ -181,9 +189,9 @@ FUNC(salsa20_core_x86ish_sse2)
  
         // a ^= (d + c) <<< 18
         movdqa  xmm4, xmm3
-        pshufd xmm3, xmm3, SHUF(0, 3, 2, 1)
+        pshufd xmm3, xmm3, SHUF(1, 2, 3, 0)
         paddd   xmm4, xmm2
-        pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
+        pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
         movdqa  xmm5, xmm4
         pslld   xmm4, 18
         psrld   xmm5, 14
@@ -227,7 +235,7 @@ FUNC(salsa20_core_x86ish_sse2)
         // d ^= (c + b) <<< 13
         movdqa  xmm4, xmm2
         paddd   xmm4, xmm3
-        pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
+        pshufd xmm3, xmm3, SHUF(3, 0, 1, 2)
         movdqa  xmm5, xmm4
         pslld   xmm4, 13
         psrld   xmm5, 19
@@ -236,9 +244,9 @@ FUNC(salsa20_core_x86ish_sse2)
  
         // a ^= (d + c) <<< 18
         movdqa  xmm4, xmm1
-        pshufd xmm1, xmm1, SHUF(0, 3, 2, 1)
+        pshufd xmm1, xmm1, SHUF(1, 2, 3, 0)
         paddd   xmm4, xmm2
-        pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
+        pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
         movdqa  xmm5, xmm4
         pslld   xmm4, 18
         psrld   xmm5, 14
@@ -262,9 +270,9 @@ FUNC(salsa20_core_x86ish_sse2)
         // input.  This can be done by juggling values in registers, with the
         // following fancy footwork: some row rotations, a transpose, and
         // some more rotations.
-       pshufd  xmm1, xmm1, SHUF(2, 1, 0, 3)    //  3,  4,  9, 14
-       pshufd  xmm2, xmm2, SHUF(1, 0, 3, 2)    //  2,  7,  8, 13
-       pshufd  xmm3, xmm3, SHUF(0, 3, 2, 1)    //  1,  6, 11, 12
+       pshufd  xmm1, xmm1, SHUF(3, 0, 1, 2)    //  3,  4,  9, 14
+       pshufd  xmm2, xmm2, SHUF(2, 3, 0, 1)    //  2,  7,  8, 13
+       pshufd  xmm3, xmm3, SHUF(1, 2, 3, 0)    //  1,  6, 11, 12
  
         movdqa  xmm4, xmm0
         movdqa  xmm5, xmm3
@@ -280,9 +288,9 @@ FUNC(salsa20_core_x86ish_sse2)
         punpckhdq xmm1, xmm3                    //  5,  6,  7,  4
         punpckhdq xmm2, xmm5                    // 15, 12, 13, 14
  
-       pshufd  xmm1, xmm1, SHUF(2, 1, 0, 3)    //  4,  5,  6,  7
-       pshufd  xmm4, xmm4, SHUF(1, 0, 3, 2)    //  8,  9, 10, 11
-       pshufd  xmm2, xmm2, SHUF(0, 3, 2, 1)    // 12, 13, 14, 15
+       pshufd  xmm1, xmm1, SHUF(3, 0, 1, 2)    //  4,  5,  6,  7
+       pshufd  xmm4, xmm4, SHUF(2, 3, 0, 1)    //  8,  9, 10, 11
+       pshufd  xmm2, xmm2, SHUF(1, 2, 3, 0)    // 12, 13, 14, 15
  
         // Finally we have to write out the result.
         movdqu  [OUT +  0], xmm0
diff --git a/symm/salsa20.c b/symm/salsa20.c

index 03fcf46..e78baf0 100644 (file)
--- a/symm/salsa20.c
+++ b/symm/salsa20.c
@@ -72,6 +72,7 @@ static void simple_core(unsigned r, const salsa20_matrix src,
  
  #if CPUFAM_X86 || CPUFAM_AMD64
  extern core__functype salsa20_core_x86ish_sse2;
+extern core__functype salsa20_core_x86ish_avx;
  #endif
  
  #if CPUFAM_ARMEL
@@ -85,6 +86,8 @@ extern core__functype salsa20_core_arm64;
  static core__functype *pick_core(void)
  {
  #if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86ish_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX));
    DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86ish_sse2,
                      cpu_feature_p(CPUFEAT_X86_SSE2));
  #endif
diff --git a/symm/stub.h.in b/symm/stub.h.in

index c29de5b..686b574 100644 (file)
--- a/symm/stub.h.in
+++ b/symm/stub.h.in
@@ -10,7 +10,7 @@
  #ifndef CATACOMB_@{name:u:c}_H
  #define CATACOMB_@{name:u:c}_H
  
-#ifndef CATACOMB_@{base:u}_H
+#ifndef CATACOMB_@{base:u:c}_H
  #  include "@base.h"
  #endif
author	Mark Wooding <mdw@distorted.org.uk>
	Sat, 24 Nov 2018 21:53:58 +0000 (21:53 +0000)
committer	Mark Wooding <mdw@distorted.org.uk>
	Sat, 24 Nov 2018 21:53:58 +0000 (21:53 +0000)
.gitignore		patch \| blob \| blame \| history
base/asm-common.h		patch \| blob \| blame \| history
base/dispatch.c		patch \| blob \| blame \| history
base/dispatch.h		patch \| blob \| blame \| history
configure.ac		patch \| blob \| blame \| history
math/mpmont.c		patch \| blob \| blame \| history
math/mpx-mul4-amd64-sse2.S		patch \| blob \| blame \| history
math/mpx-mul4-x86-sse2.S		patch \| blob \| blame \| history
math/mpx.c		patch \| blob \| blame \| history
progs/.gitignore	[new file with mode: 0644]	patch \| blob
progs/catcrypt.1		patch \| blob \| blame \| history
progs/perftest.c		patch \| blob \| blame \| history
symm/.gitignore		patch \| blob \| blame \| history
symm/blkc.h		patch \| blob \| blame \| history
symm/chacha-x86ish-sse2.S		patch \| blob \| blame \| history
symm/chacha.c		patch \| blob \| blame \| history
symm/rijndael-base.c		patch \| blob \| blame \| history
symm/rijndael-x86ish-aesni.S		patch \| blob \| blame \| history
symm/rijndael.c		patch \| blob \| blame \| history
symm/salsa20-x86ish-sse2.S		patch \| blob \| blame \| history
symm/salsa20.c		patch \| blob \| blame \| history
symm/stub.h.in		patch \| blob \| blame \| history