Makefile.in
-aclocal.m4
-configure
-COPYING.LIB
-autom4te.cache
-config
-precomp
-progs/getdate.h
-progs/getdate.y
-symm/modes.am
-symm/stubs.am
+/aclocal.m4
+/configure
+/COPYING.LIB
+/autom4te.cache/
+/config/
+/precomp/
*.sage.py
*.t
*.to
-/symm/safersk.h
-/symm/salsa2012.h
-/symm/salsa208.h
-/symm/salsa20-ietf.h
-/symm/salsa2012-ietf.h
-/symm/salsa208-ietf.h
-/symm/sha224.h
-/symm/sha384.h
-/symm/whirlpool256.h
-/symm/xsalsa20.h
-/symm/xsalsa2012.h
-/symm/xsalsa208.h
-/symm/stubs.gen-stamp
-/symm/t/salsa20
-/symm/xchacha12.h
-/symm/xchacha20.h
-/symm/xchacha8.h
-/symm/chacha12.h
-/symm/chacha20.h
-/symm/chacha8.h
-/symm/chacha12-ietf.h
-/symm/chacha20-ietf.h
-/symm/chacha8-ietf.h
-/symm/xchacha.h
-/symm/kmac128.h
-/symm/kmac256.h
-/symm/safersk.c
-/symm/sha224.c
-/symm/sha3-224.c
-/symm/sha3-224.h
-/symm/sha3-256.c
-/symm/sha3-256.h
-/symm/sha3-384.c
-/symm/sha3-384.h
-/symm/sha3-512.c
-/symm/sha3-512.h
-/symm/sha384.c
-/symm/sha512-224.c
-/symm/sha512-224.h
-/symm/sha512-256.c
-/symm/sha512-256.h
-/symm/shake128.h
-/symm/shake256.h
-/symm/t/sha3
-/symm/whirlpool256.c
-/symm/shake128-xof.h
-/symm/shake256-xof.h
# define INTADDR__1(addr, got) addr
#endif
-// Permutations for SIMD instructions. SHUF(D, C, B, A) is an immediate,
-// suitable for use in `pshufd' or `shufpd', which copies element D
-// (0 <= D < 4) of the source to element 3 of the destination, element C to
-// element 2, element B to element 1, and element A to element 0.
-#define SHUF(d, c, b, a) (64*(d) + 16*(c) + 4*(b) + (a))
+// Permutations for SIMD instructions. SHUF(A, B, C, D) is an immediate,
+// suitable for use in `pshufd' or `shufpd', which copies element A
+// (0 <= A < 4) of the source to element 0 of the destination, element B to
+// element 1, element C to element 2, and element D to element 3.
+#define SHUF(a, b, c, d) ((a) + 4*(b) + 16*(c) + 64*(d))
// Map register names to their individual pieces.
# define CPUID1D_SSE2 (1u << 26)
# define CPUID1D_FXSR (1u << 24)
# define CPUID1C_AESNI (1u << 25)
+# define CPUID1C_AVX (1u << 28)
# define CPUID1C_RDRAND (1u << 30)
struct cpuid { unsigned a, b, c, d; };
cpuid_features_p(CPUID1D_SSE2, CPUID1C_AESNI));
CASE_CPUFEAT(X86_RDRAND, "x86:rdrand",
cpuid_features_p(0, CPUID1C_RDRAND));
+ CASE_CPUFEAT(X86_AVX, "x86:avx",
+ xmm_registers_available_p() &&
+ cpuid_features_p(0, CPUID1C_AVX));
#endif
#ifdef CAPMAP
# define FEATP__CASE(feat, tok) \
CPUFEAT_ARM_V4, /* VFPv4 and/or SIMD v2 */
CPUFEAT_ARM_D32, /* 32 double registers, not 16 */
CPUFEAT_X86_RDRAND, /* Built-in entropy source */
- CPUFEAT_ARM_AES /* AES instructions */
+ CPUFEAT_ARM_AES, /* AES instructions */
+ CPUFEAT_X86_AVX /* AVX 1 (i.e., 256-bit YMM regs) */
};
extern int cpu_feature_p(int /*feat*/);
AC_CHECK_HEADERS([linux/auxvec.h])
AC_CHECK_FUNCS([getauxval])
+dnl Some equipment for measuring CPU performance.
+AC_CHECK_HEADERS([linux/perf_event.h])
+
dnl Find the bit lengths of the obvious integer types. This will be useful
dnl when deciding on a representation for multiprecision integers.
type_bits="" type_bits_sep=""
#if CPUFAM_X86
MAYBE_REDC4(x86_sse2)
+ MAYBE_REDC4(x86_avx)
#endif
#if CPUFAM_AMD64
MAYBE_REDC4(amd64_sse2)
+ MAYBE_REDC4(amd64_avx)
#endif
static redccore__functype *pick_redccore(void)
{
#if CPUFAM_X86
+ DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_x86_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX));
DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_x86_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif
#if CPUFAM_AMD64
+ DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_amd64_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX));
DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_amd64_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif
#if CPUFAM_X86
MAYBE_MUL4(x86_sse2)
+ MAYBE_MUL4(x86_avx)
#endif
#if CPUFAM_AMD64
MAYBE_MUL4(amd64_sse2)
+ MAYBE_MUL4(amd64_avx)
#endif
static mulcore__functype *pick_mulcore(void)
{
#if CPUFAM_X86
+ DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_x86_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX));
DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_x86_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif
#if CPUFAM_AMD64
+ DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_amd64_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX));
DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_amd64_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif
.macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
// Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
// of the product in registers D0, D1, D2, D3.
- pshufd \d0, \r, SHUF(3, \i, 3, \i) // (r_i, ?, r_i, ?)
+ pshufd \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?)
.ifnes "\d1", "nil"
- movdqa \d1, \slo // (s'_0, s'_1, s''_0, s''_1)
+ movdqa \d1, \slo // (s'_0, s'_1; s''_0, s''_1)
.endif
.ifnes "\d3", "nil"
- movdqa \d3, \shi // (s'_2, s'_3, s''_2, s''_3)
+ movdqa \d3, \shi // (s'_2, s'_3; s''_2, s''_3)
.endif
.ifnes "\d1", "nil"
- psrldq \d1, 4 // (s'_1, s''_0, s''_1, 0)
+ psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0)
.endif
.ifnes "\d2", "nil"
- movdqa \d2, \d0 // another copy of (r_i, ?, r_i, ?)
+ movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?)
.endif
.ifnes "\d3", "nil"
- psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0)
+ psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0)
.endif
.ifnes "\d1", "nil"
- pmuludq \d1, \d0 // (r_i s'_1, r_i s''_1)
+ pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1)
.endif
.ifnes "\d3", "nil"
- pmuludq \d3, \d0 // (r_i s'_3, r_i s''_3)
+ pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3)
.endif
.ifnes "\d2", "nil"
- pmuludq \d2, \shi // (r_i s'_2, r_i s''_2)
+ pmuludq \d2, \shi // (r_i s'_2; r_i s''_2)
.endif
- pmuludq \d0, \slo // (r_i s'_0, r_i s''_0)
+ pmuludq \d0, \slo // (r_i s'_0; r_i s''_0)
.endm
.macro accum c0, c1=nil, c2=nil, c3=nil
// lane 0 or 1 of D; the high two lanes of D are clobbered. On
// completion, XMM3 is clobbered. If CC is `nil', then the
// contribution which would have been added to it is left in C.
- pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
- psrldq xmm3, 12 // (t, 0, 0, 0) = (t, 0)
- pslldq xmm3, 2 // (t b, 0)
- paddq \c, xmm3 // (c' + t b, c'')
+ pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
+ psrldq xmm3, 12 // (t, 0; 0, 0) = (t; 0)
+ pslldq xmm3, 2 // (t b; 0)
+ paddq \c, xmm3 // (c' + t b; c'')
.ifeqs "\pos", "lo"
movdqa \d, \c
.else
// of the value represented in C are written at POS in D, and the
// remaining bits are left at the bottom of T.
movdqa \t, \c
- psllq \t, 16 // (?, c'' b)
- pslldq \c, 8 // (0, c')
- paddq \t, \c // (?, c' + c'' b)
- psrldq \t, 8 // c' + c'' b
+ psllq \t, 16 // (?; c'' b)
+ pslldq \c, 8 // (0; c')
+ paddq \t, \c // (?; c' + c'' b)
+ psrldq \t, 8 // (c' + c'' b; 0) = (c; 0)
.ifeqs "\pos", "lo"
movdqa \d, \t
.else
punpckldq \d, \t
.endif
- psrldq \t, 4 // floor((c' + c'' b)/B)
+ psrldq \t, 4 // (floor(c/B); 0)
.endm
.macro expand z, a, b, c=nil, d=nil
// On entry, A and C hold packed 128-bit values, and Z is zero. On
// exit, A:B and C:D together hold the same values in expanded
// form. If C is `nil', then only expand A to A:B.
- movdqa \b, \a // (a_0, a_1, a_2, a_3)
+ movdqa \b, \a // (a_0, a_1; a_2, a_3)
.ifnes "\c", "nil"
- movdqa \d, \c // (c_0, c_1, c_2, c_3)
+ movdqa \d, \c // (c_0, c_1; c_2, c_3)
.endif
- punpcklwd \a, \z // (a'_0, a''_0, a'_1, a''_1)
- punpckhwd \b, \z // (a'_2, a''_2, a'_3, a''_3)
+ punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1)
+ punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3)
.ifnes "\c", "nil"
- punpcklwd \c, \z // (c'_0, c''_0, c'_1, c''_1)
- punpckhwd \d, \z // (c'_2, c''_2, c'_3, c''_3)
+ punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
+ punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
.endif
- pshufd \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
- pshufd \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
+ pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
+ pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
.ifnes "\c", "nil"
- pshufd \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
- pshufd \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
+ pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
+ pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
.endif
.endm
// we can do that, we must gather them together.
movdqa \t, \c0
movdqa \u, \c1
- punpcklqdq \t, \c2 // (y'_0, y'_2)
- punpckhqdq \c0, \c2 // (y''_0, y''_2)
- punpcklqdq \u, \c3 // (y'_1, y'_3)
- punpckhqdq \c1, \c3 // (y''_1, y''_3)
+ punpcklqdq \t, \c2 // (y'_0; y'_2)
+ punpckhqdq \c0, \c2 // (y''_0; y''_2)
+ punpcklqdq \u, \c3 // (y'_1; y'_3)
+ punpckhqdq \c1, \c3 // (y''_1; y''_3)
// Now split the double-prime pieces. The high (up to) 48 bits will
// go up; the low 16 bits go down.
movdqa \c3, \c1
psllq \c2, 48
psllq \c3, 48
- psrlq \c0, 16 // high parts of (y''_0, y''_2)
- psrlq \c1, 16 // high parts of (y''_1, y''_3)
- psrlq \c2, 32 // low parts of (y''_0, y''_2)
- psrlq \c3, 32 // low parts of (y''_1, y''_3)
+ psrlq \c0, 16 // high parts of (y''_0; y''_2)
+ psrlq \c1, 16 // high parts of (y''_1; y''_3)
+ psrlq \c2, 32 // low parts of (y''_0; y''_2)
+ psrlq \c3, 32 // low parts of (y''_1; y''_3)
.ifnes "\hi", "nil"
movdqa \hi, \c1
.endif
- pslldq \c1, 8 // high part of (0, y''_1)
+ pslldq \c1, 8 // high part of (0; y''_1)
paddq \t, \c2 // propagate down
paddq \u, \c3
- paddq \t, \c1 // and up: (y_0, y_2)
- paddq \u, \c0 // (y_1, y_3)
+ paddq \t, \c1 // and up: (y_0; y_2)
+ paddq \u, \c0 // (y_1; y_3)
.ifnes "\hi", "nil"
- psrldq \hi, 8 // high part of (y''_3, 0)
+ psrldq \hi, 8 // high part of (y''_3; 0)
.endif
// Finally extract the answer. This complicated dance is better than
// storing to memory and loading, because the piecemeal stores
// inhibit store forwarding.
- movdqa \c3, \t // (y_0, y_1)
- movdqa \lo, \t // (y^*_0, ?, ?, ?)
- psrldq \t, 8 // (y_2, 0)
- psrlq \c3, 32 // (floor(y_0/B), ?)
- paddq \c3, \u // (y_1 + floor(y_0/B), ?)
- movdqa \c1, \c3 // (y^*_1, ?, ?, ?)
- psrldq \u, 8 // (y_3, 0)
- psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2, ?)
- paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2, ?)
- punpckldq \lo, \c3 // (y^*_0, y^*_2, ?, ?)
- psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
- paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
+ movdqa \c3, \t // (y_0; ?)
+ movdqa \lo, \t // (y^*_0, ?; ?, ?)
+ psrldq \t, 8 // (y_2; 0)
+ psrlq \c3, 32 // (floor(y_0/B); ?)
+ paddq \c3, \u // (y_1 + floor(y_0/B); ?)
+ movdqa \c1, \c3 // (y^*_1, ?; ?, ?)
+ psrldq \u, 8 // (y_3; 0)
+ psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2; ?)
+ paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2; ?)
+ punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?)
+ psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3; ?)
+ paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3; ?)
.ifnes "\hi", "nil"
movdqa \t, \c3
pxor \u, \u
.endif
- punpckldq \c1, \c3 // (y^*_1, y^*_3, ?, ?)
+ punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?)
.ifnes "\hi", "nil"
psrlq \t, 32 // very high bits of y
paddq \hi, \t
// On exit, the carry registers, including XMM15, are updated to hold
// C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
// registers are preserved.
- movd xmm0, [rdi + 0] // (a_0, 0)
- movd xmm1, [rdi + 4] // (a_1, 0)
- movd xmm2, [rdi + 8] // (a_2, 0)
- movd xmm15, [rdi + 12] // (a_3, 0)
- paddq xmm12, xmm0 // (c'_0 + a_0, c''_0)
- paddq xmm13, xmm1 // (c'_1 + a_1, c''_1)
- paddq xmm14, xmm2 // (c'_2 + a_2, c''_2 + a_3 b)
+ movd xmm0, [rdi + 0] // (a_0; 0)
+ movd xmm1, [rdi + 4] // (a_1; 0)
+ movd xmm2, [rdi + 8] // (a_2; 0)
+ movd xmm15, [rdi + 12] // (a_3; 0)
+ paddq xmm12, xmm0 // (c'_0 + a_0; c''_0)
+ paddq xmm13, xmm1 // (c'_1 + a_1; c''_1)
+ paddq xmm14, xmm2 // (c'_2 + a_2; c''_2 + a_3 b)
.endm
///--------------------------------------------------------------------------
mulcore xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2
accum xmm4, xmm5, xmm6
- punpckldq xmm12, xmm15 // (w_0, 0, w_1, 0)
- punpckhdq xmm14, xmm15 // (w_2, 0, w_3, 0)
+ punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0)
+ punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0)
mulcore xmm7, 2, xmm10, xmm11, xmm0, xmm1
accum xmm5, xmm6
mulcore xmm7, 3, xmm10, xmm11, xmm0
accum xmm6
- punpckldq xmm12, xmm2 // (w_0, 0, 0, 0)
- punpckldq xmm14, xmm2 // (w_2, 0, 0, 0)
- punpckhdq xmm13, xmm2 // (w_1, 0, 0, 0)
- punpckhdq xmm15, xmm2 // (w_3, 0, 0, 0)
+ punpckldq xmm12, xmm2 // (w_0, 0; 0, 0)
+ punpckldq xmm14, xmm2 // (w_2, 0; 0, 0)
+ punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0)
+ punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0)
// That's lots of pieces. Now we have to assemble the answer.
squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
mulcore xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2
accum xmm4, xmm5, xmm6
- punpckldq xmm12, xmm15 // (w_0, 0, w_1, 0)
- punpckhdq xmm14, xmm15 // (w_2, 0, w_3, 0)
+ punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0)
+ punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0)
mulcore xmm7, 2, xmm8, xmm9, xmm0, xmm1
accum xmm5, xmm6
mulcore xmm7, 3, xmm8, xmm9, xmm0
accum xmm6
- punpckldq xmm12, xmm2 // (w_0, 0, 0, 0)
- punpckldq xmm14, xmm2 // (w_2, 0, 0, 0)
- punpckhdq xmm13, xmm2 // (w_1, 0, 0, 0)
- punpckhdq xmm15, xmm2 // (w_3, 0, 0, 0)
+ punpckldq xmm12, xmm2 // (w_0, 0; 0, 0)
+ punpckldq xmm14, xmm2 // (w_2, 0; 0, 0)
+ punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0)
+ punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0)
// That's lots of pieces. Now we have to assemble the answer.
squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
///--------------------------------------------------------------------------
/// Bulk multipliers.
+FUNC(mpx_umul4_amd64_avx)
+ .arch .avx
+ vzeroupper
+ endprologue
+ .arch pentium4
+ENDFUNC
+
FUNC(mpx_umul4_amd64_sse2)
// void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
// const mpw *bv, const mpw *bvl);
ENDFUNC
+FUNC(mpxmont_mul4_amd64_avx)
+ .arch .avx
+ vzeroupper
+ endprologue
+ .arch pentium4
+ENDFUNC
+
FUNC(mpxmont_mul4_amd64_sse2)
// void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
// const mpw *nv, size_t n, const mpw *mi);
ENDFUNC
+FUNC(mpxmont_redc4_amd64_avx)
+ .arch .avx
+ vzeroupper
+ endprologue
+ .arch pentium4
+ENDFUNC
+
FUNC(mpxmont_redc4_amd64_sse2)
// void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
// size_t n, const mpw *mi);
.endm
.macro testldcarry
- movdqu xmm12, [rcx + 0] // (c'_0, c''_0)
- movdqu xmm13, [rcx + 16] // (c'_1, c''_1)
- movdqu xmm14, [rcx + 32] // (c'_2, c''_2)
+ movdqu xmm12, [rcx + 0] // (c'_0; c''_0)
+ movdqu xmm13, [rcx + 16] // (c'_1; c''_1)
+ movdqu xmm14, [rcx + 32] // (c'_2; c''_2)
.endm
.macro testtop u=nil
.macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil
// Load a word r_i from R, multiply by the expanded operand [S], and
// leave the pieces of the product in registers D0, D1, D2, D3.
- movd \d0, \r // (r_i, 0, 0, 0)
+ movd \d0, \r // (r_i, 0; 0, 0)
.ifnes "\d1", "nil"
- movdqa \d1, [\s] // (s'_0, s'_1, s''_0, s''_1)
+ movdqa \d1, [\s] // (s'_0, s'_1; s''_0, s''_1)
.endif
.ifnes "\d3", "nil"
- movdqa \d3, [\s + 16] // (s'_2, s'_3, s''_2, s''_3)
+ movdqa \d3, [\s + 16] // (s'_2, s'_3; s''_2, s''_3)
.endif
- pshufd \d0, \d0, SHUF(3, 0, 3, 0) // (r_i, ?, r_i, ?)
+ pshufd \d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?)
.ifnes "\d1", "nil"
- psrldq \d1, 4 // (s'_1, s''_0, s''_1, 0)
+ psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0)
.endif
.ifnes "\d2", "nil"
.ifnes "\d3", "nil"
- movdqa \d2, \d3 // another copy of (s'_2, s'_3, ...)
+ movdqa \d2, \d3 // another copy of (s'_2, s'_3; ...)
.else
- movdqa \d2, \d0 // another copy of (r_i, ?, r_i, ?)
+ movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?)
.endif
.endif
.ifnes "\d3", "nil"
- psrldq \d3, 4 // (s'_3, s''_2, s''_3, 0)
+ psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0)
.endif
.ifnes "\d1", "nil"
- pmuludq \d1, \d0 // (r_i s'_1, r_i s''_1)
+ pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1)
.endif
.ifnes "\d3", "nil"
- pmuludq \d3, \d0 // (r_i s'_3, r_i s''_3)
+ pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3)
.endif
.ifnes "\d2", "nil"
.ifnes "\d3", "nil"
- pmuludq \d2, \d0 // (r_i s'_2, r_i s''_2)
+ pmuludq \d2, \d0 // (r_i s'_2; r_i s''_2)
.else
pmuludq \d2, [\s + 16]
.endif
.endif
- pmuludq \d0, [\s] // (r_i s'_0, r_i s''_0)
+ pmuludq \d0, [\s] // (r_i s'_0; r_i s''_0)
.endm
.macro accum c0, c1=nil, c2=nil, c3=nil
// carry registers. On completion, XMM3 is clobbered. If CC is
// `nil', then the contribution which would have been added to it is
// left in C.
- pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?, ?, t = c'' mod B)
- psrldq xmm3, 12 // (t, 0, 0, 0) = (t, 0)
- pslldq xmm3, 2 // (t b, 0)
- paddq \c, xmm3 // (c' + t b, c'')
+ pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
+ psrldq xmm3, 12 // (t, 0; 0, 0) = (t, 0)
+ pslldq xmm3, 2 // (t b; 0)
+ paddq \c, xmm3 // (c' + t b; c'')
movd \d, \c
psrlq \c, 32 // floor(c/B)
.ifnes "\cc", "nil"
// of the value represented in C are written to D, and the remaining
// bits are left at the bottom of T.
movdqa \t, \c
- psllq \t, 16 // (?, c'' b)
- pslldq \c, 8 // (0, c')
- paddq \t, \c // (?, c' + c'' b)
- psrldq \t, 8 // c' + c'' b
+ psllq \t, 16 // (?; c'' b)
+ pslldq \c, 8 // (0; c')
+ paddq \t, \c // (?; c' + c'' b)
+ psrldq \t, 8 // (c' + c'' b; 0) = (c; 0)
movd \d, \t
- psrldq \t, 4 // floor((c' + c'' b)/B)
+ psrldq \t, 4 // (floor(c/B); 0)
.endm
.macro expand z, a, b, c=nil, d=nil
// On entry, A and C hold packed 128-bit values, and Z is zero. On
// exit, A:B and C:D together hold the same values in expanded
// form. If C is `nil', then only expand A to A:B.
- movdqa \b, \a // (a_0, a_1, a_2, a_3)
+ movdqa \b, \a // (a_0, a_1; a_2, a_3)
.ifnes "\c", "nil"
- movdqa \d, \c // (c_0, c_1, c_2, c_3)
+ movdqa \d, \c // (c_0, c_1; c_2, c_3)
.endif
- punpcklwd \a, \z // (a'_0, a''_0, a'_1, a''_1)
- punpckhwd \b, \z // (a'_2, a''_2, a'_3, a''_3)
+ punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1)
+ punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3)
.ifnes "\c", "nil"
- punpcklwd \c, \z // (c'_0, c''_0, c'_1, c''_1)
- punpckhwd \d, \z // (c'_2, c''_2, c'_3, c''_3)
+ punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
+ punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
.endif
- pshufd \a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1, a''_0, a''_1)
- pshufd \b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3, a''_2, a''_3)
+ pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
+ pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
.ifnes "\c", "nil"
- pshufd \c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1, c''_0, c''_1)
- pshufd \d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3, c''_2, c''_3)
+ pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
+ pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
.endif
.endm
// we can do that, we must gather them together.
movdqa \t, \c0
movdqa \u, \c1
- punpcklqdq \t, \c2 // (y'_0, y'_2)
- punpckhqdq \c0, \c2 // (y''_0, y''_2)
- punpcklqdq \u, \c3 // (y'_1, y'_3)
- punpckhqdq \c1, \c3 // (y''_1, y''_3)
+ punpcklqdq \t, \c2 // (y'_0; y'_2)
+ punpckhqdq \c0, \c2 // (y''_0; y''_2)
+ punpcklqdq \u, \c3 // (y'_1; y'_3)
+ punpckhqdq \c1, \c3 // (y''_1; y''_3)
// Now split the double-prime pieces. The high (up to) 48 bits will
// go up; the low 16 bits go down.
movdqa \c3, \c1
psllq \c2, 48
psllq \c3, 48
- psrlq \c0, 16 // high parts of (y''_0, y''_2)
- psrlq \c1, 16 // high parts of (y''_1, y''_3)
- psrlq \c2, 32 // low parts of (y''_0, y''_2)
- psrlq \c3, 32 // low parts of (y''_1, y''_3)
+ psrlq \c0, 16 // high parts of (y''_0; y''_2)
+ psrlq \c1, 16 // high parts of (y''_1; y''_3)
+ psrlq \c2, 32 // low parts of (y''_0; y''_2)
+ psrlq \c3, 32 // low parts of (y''_1; y''_3)
.ifnes "\hi", "nil"
movdqa \hi, \c1
.endif
- pslldq \c1, 8 // high part of (0, y''_1)
+ pslldq \c1, 8 // high part of (0; y''_1)
paddq \t, \c2 // propagate down
paddq \u, \c3
- paddq \t, \c1 // and up: (y_0, y_2)
- paddq \u, \c0 // (y_1, y_3)
+ paddq \t, \c1 // and up: (y_0; y_2)
+ paddq \u, \c0 // (y_1; y_3)
.ifnes "\hi", "nil"
- psrldq \hi, 8 // high part of (y''_3, 0)
+ psrldq \hi, 8 // high part of (y''_3; 0)
.endif
// Finally extract the answer. This complicated dance is better than
// storing to memory and loading, because the piecemeal stores
// inhibit store forwarding.
- movdqa \c3, \t // (y_0, y_1)
- movdqa \lo, \t // (y^*_0, ?, ?, ?)
- psrldq \t, 8 // (y_2, 0)
- psrlq \c3, 32 // (floor(y_0/B), ?)
- paddq \c3, \u // (y_1 + floor(y_0/B), ?)
- movdqa \c1, \c3 // (y^*_1, ?, ?, ?)
- psrldq \u, 8 // (y_3, 0)
- psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2, ?)
- paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2, ?)
- punpckldq \lo, \c3 // (y^*_0, y^*_2, ?, ?)
- psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
- paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3, ?)
+ movdqa \c3, \t // (y_0; ?)
+ movdqa \lo, \t // (y^*_0, ?; ?, ?)
+ psrldq \t, 8 // (y_2; 0)
+ psrlq \c3, 32 // (floor(y_0/B); ?)
+ paddq \c3, \u // (y_1 + floor(y_0/B); ?)
+ movdqa \c1, \c3 // (y^*_1, ?; ?, ?)
+ psrldq \u, 8 // (y_3; 0)
+ psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2; ?)
+ paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2; ?)
+ punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?)
+ psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3; ?)
+ paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3; ?)
.ifnes "\hi", "nil"
movdqa \t, \c3
pxor \u, \u
.endif
- punpckldq \c1, \c3 // (y^*_1, y^*_3, ?, ?)
+ punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?)
.ifnes "\hi", "nil"
psrlq \t, 32 // very high bits of y
paddq \hi, \t
// On exit, the carry registers, including XMM7, are updated to hold
// C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
// registers are preserved.
- movd xmm0, [edi + 0] // (a_0, 0)
- movd xmm1, [edi + 4] // (a_1, 0)
- movd xmm2, [edi + 8] // (a_2, 0)
- movd xmm7, [edi + 12] // (a_3, 0)
-
- paddq xmm4, xmm0 // (c'_0 + a_0, c''_0)
- paddq xmm5, xmm1 // (c'_1 + a_1, c''_1)
- paddq xmm6, xmm2 // (c'_2 + a_2, c''_2 + a_3 b)
+ movd xmm0, [edi + 0] // (a_0; 0)
+ movd xmm1, [edi + 4] // (a_1; 0)
+ movd xmm2, [edi + 8] // (a_2; 0)
+ movd xmm7, [edi + 12] // (a_3; 0)
+
+ paddq xmm4, xmm0 // (c'_0 + a_0; c''_0)
+ paddq xmm5, xmm1 // (c'_1 + a_1; c''_1)
+ paddq xmm6, xmm2 // (c'_2 + a_2; c''_2 + a_3 b)
.endm
///--------------------------------------------------------------------------
///--------------------------------------------------------------------------
/// Bulk multipliers.
+FUNC(mpx_umul4_x86_avx)
+ .arch .avx
+ vzeroupper
+ endprologue
+ // and drop through...
+ .arch pentium4
+ENDFUNC
+
FUNC(mpx_umul4_x86_sse2)
// void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
// const mpw *bv, const mpw *bvl);
ENDFUNC
+FUNC(mpxmont_mul4_x86_avx)
+ .arch .avx
+ vzeroupper
+ endprologue
+ // and drop through...
+ .arch pentium4
+ENDFUNC
+
FUNC(mpxmont_mul4_x86_sse2)
// void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
// const mpw *nv, size_t n, const mpw *mi);
ENDFUNC
+FUNC(mpxmont_redc4_x86_avx)
+ .arch .avx
+ vzeroupper
+ endprologue
+ // and drop through...
+ .arch pentium4
+ENDFUNC
+
FUNC(mpxmont_redc4_x86_sse2)
// void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
// size_t n, const mpw *mi);
.macro testldcarry c
mov ecx, \c // -> c
- movdqu xmm4, [ecx + 0] // (c'_0, c''_0)
- movdqu xmm5, [ecx + 16] // (c'_1, c''_1)
- movdqu xmm6, [ecx + 32] // (c'_2, c''_2)
+ movdqu xmm4, [ecx + 0] // (c'_0; c''_0)
+ movdqu xmm5, [ecx + 16] // (c'_1; c''_1)
+ movdqu xmm6, [ecx + 32] // (c'_2; c''_2)
.endm
.macro testexpand v=nil, y=nil
#if CPUFAM_X86
MAYBE_UMUL4(x86_sse2)
+ MAYBE_UMUL4(x86_avx)
#endif
#if CPUFAM_AMD64
MAYBE_UMUL4(amd64_sse2)
+ MAYBE_UMUL4(amd64_avx)
#endif
static mpx_umul__functype *pick_umul(void)
{
#if CPUFAM_X86
+ DISPATCH_PICK_COND(mpx_umul, maybe_umul4_x86_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX));
DISPATCH_PICK_COND(mpx_umul, maybe_umul4_x86_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif
#if CPUFAM_AMD64
+ DISPATCH_PICK_COND(mpx_umul, maybe_umul4_amd64_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX));
DISPATCH_PICK_COND(mpx_umul, maybe_umul4_amd64_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif
--- /dev/null
+/getdate.h
+/getdate.y
.B cipher
and
.B mac
-attributes.
+attributes. Run
+.B catcrypt show cipher
+for a list of supported symmetric encryption algorithms; the default
+.I cipher
+is
+.BR blowfish-cbc .
This is the default transform.
.TP
.B naclbox
.I bulk
in the
.I kemalgspec
-is used; if that it absent, then the default of
-.B blowfish-cbc
-is used. Run
-.B catcrypt show cipher
-for a list of supported symmetric encryption algorithms.
+is used; if that is absent, then the default depends on the bulk
+transform.
.TP
.B hash
This is the hash function used to distil entropy from the shared secret
attribute.
.TP
.B cipher
-The symmetric encryption algorithms which can be used in a
+The symmetric encryption algorithms which can be named in a
key-encapsulation key's
.B cipher
-attribute.
+attribute when using the
+.B gencomp
+bulk transform.
.TP
.B mac
-The message authentication algorithms which can be used in a
+The message authentication algorithms which can be named in a
key-encapsulation key's
.B mac
attribute.
.TP
.B sig
-The signature algorithms which can be used in a signing key's
+The signature algorithms which can be named in a signing key's
.B sig
attribute.
.TP
.B hash
-The hash functions which can be used in a key's
+The hash functions which can be named in a key's
.B hash
attribute.
.TP
#include <sys/time.h>
#include <unistd.h>
+#ifdef HAVE_LINUX_PERF_EVENT_H
+# include <linux/perf_event.h>
+# include <asm/unistd.h>
+#endif
+
#include <mLib/alloc.h>
+#include <mLib/bits.h>
#include <mLib/dstr.h>
#include <mLib/mdwopt.h>
#include <mLib/quis.h>
typedef struct opts {
const char *name; /* Pre-configured named thing */
+ const char *opwhat; /* What to call operations */
unsigned fbits; /* Field size bits */
unsigned gbits; /* Group size bits */
unsigned n; /* Number of factors */
unsigned i; /* Number of intervals (or zero) */
+ unsigned k; /* Main loop batch size */
+ unsigned long sc; /* Scale factor */
double t; /* Time for each interval (secs) */
mp *e; /* Public exponent */
unsigned f; /* Flags */
die(1, "must specify encryption scheme name");
if ((c->c = gcipher_byname(o->name)) == 0)
die(1, "encryption scheme `%s' not known", o->name);
- c->ksz = keysz(o->gbits/8, c->c->keysz);
+ c->ksz = keysz(o->fbits/8, c->c->keysz);
+ if (o->fbits%8 || (o->fbits && c->ksz != o->fbits/8))
+ die(1, "bad key size %u for %s", o->fbits, o->name);
c->k = xmalloc(c->ksz);
rand_get(RAND_GLOBAL, c->k, c->ksz);
return (c);
die(1, "must specify encryption scheme name");
if ((cc = gcipher_byname(o->name)) == 0)
die(1, "encryption scheme `%s' not known", o->name);
- ksz = keysz(0, cc->keysz);
+ ksz = keysz(o->fbits/8, cc->keysz);
+ if (o->fbits%8 || (o->fbits && ksz != o->fbits/8))
+ die(1, "bad key size %u for %s", o->fbits, o->name);
k = xmalloc(ksz);
rand_get(RAND_GLOBAL, k, ksz);
c->c = GC_INIT(cc, k, ksz);
xfree(k);
c->sz = o->gbits ? o->gbits : 65536;
c->n = o->n ? o->n : 16;
+ o->opwhat = "byte"; o->sc = c->n*c->sz;
c->m = xmalloc(c->sz);
return (c);
}
die(1, "hash function `%s' not known", o->name);
c->sz = o->gbits ? o->gbits : 65536;
c->n = o->n ? o->n : 16;
+ o->opwhat = "byte"; o->sc = c->n*c->sz;
c->m = xmalloc(c->sz);
return (c);
}
rand_get(RAND_GLOBAL, c->s, sizeof(c->s));
c->sz = o->gbits ? o->gbits : 65536;
c->n = o->n ? o->n : 16;
+ o->opwhat = "byte"; o->sc = c->n*c->sz;
c->m = xmalloc(c->sz);
return (c);
}
{ 0, 0, 0 }
};
+/*----- Cycle counting ----------------------------------------------------*/
+
+typedef kludge64 cycles;
+static int cyclecount_active_p = 0;
+
+#if defined(__GNUC__) && (CPUFAM_X86 || CPUFAM_AMD64)
+
+static void init_cyclecount(void) { cyclecount_active_p = 1; }
+
+static cycles cyclecount(void)
+{
+ uint32 lo, hi;
+ kludge64 cy;
+
+ __asm__("rdtsc" : "=a"(lo), "=d"(hi));
+ SET64(cy, hi, lo);
+ return cy;
+}
+
+#elif defined(HAVE_LINUX_PERF_EVENT_H) && defined(HAVE_UINT64)
+
+static int perf_fd = -1;
+
+static void init_cyclecount(void)
+{
+ struct perf_event_attr attr = { 0 };
+
+ attr.type = PERF_TYPE_HARDWARE;
+ attr.size = sizeof(attr);
+ attr.config = PERF_COUNT_HW_CPU_CYCLES;
+ attr.disabled = 0;
+ attr.exclude_kernel = 1;
+ attr.exclude_hv = 1;
+
+ if ((perf_fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0)) < 0)
+ moan("failed to open perf event: %s", strerror(errno));
+ else
+ cyclecount_active_p = 1;
+}
+
+static cycles cyclecount(void)
+{
+ kludge64 cy;
+ ssize_t n;
+
+ if (!cyclecount_active_p)
+ goto fail;
+ else if ((n = read(perf_fd, &cy.i, sizeof(cy.i))) != sizeof(cy.i)) {
+ if (n < 0) moan("error reading perf event: %s", strerror(errno));
+ else moan("unexpected short read from perf event");
+ cyclecount_active_p = 0; close(perf_fd); perf_fd = -1;
+ goto fail;
+ }
+end:
+ return (cy);
+fail:
+ SET64(cy, 0, 0);
+ goto end;
+}
+
+#else
+
+static void init_cyclecount(void) { cyclecount_active_p = 0; }
+static cycles cyclecount(void) { kludge64 cy; SET64(cy, 0, 0); return (cy); }
+
+#endif
+
/*----- Main code ---------------------------------------------------------*/
void version(FILE *fp)
-l, --list [ITEM...] List all the various names of things.\n\
\n\
-C, --name=NAME Select curve/DH-group/enc/hash name.\n\
--b, --field-bits Field size for g-prime and rsa.\n\
+-b, --field-bits Field size for g-prime and rsa;\n\
+ key bits for ksched and enc.\n\
-q, --no-check Don't check field/group for validity.\n\
--B, --group-bits Group size for g-prime; key size for ksched;\n\
- data size for enc and hash.\n\
--n, --factors=COUNT Number of factors for {exp,mul}-sim.\n\
+-B, --group-bits Group size for g-prime; data size for enc and hash.\n\
+-n, --factors=COUNT Number of factors for {exp,mul}-sim;\n\
+ inner iterations for enc and hash.\n\
-i, --intervals=COUNT Number of intervals to run for. [0; forever]\n\
+-k, --batch=COUNT Number of operations to batch between timer checks.\n\
-t, --time=TIME Length of an interval in seconds. [1]\n\
");
}
opts o = { 0 };
const jobops *j;
struct timeval tv_next, tv_now;
- double t, ttot;
- unsigned n;
+ double t, ttot, cy, cytot;
+ unsigned n, k;
unsigned long ii;
- clock_t c_start, c_stop;
+ clock_t c0, c1;
+ kludge64 cy0, cy1, cydiff;
double itot;
void *p;
ego(argv[0]);
- o.t = 1;
+ o.t = 1; o.k = 1; o.sc = 1; o.opwhat = "op";
for (;;) {
static const struct option opts[] = {
{ "help", 0, 0, 'h' },
{ "group-bits", OPTF_ARGREQ, 0, 'B' },
{ "factors", OPTF_ARGREQ, 0, 'n' },
{ "intervals", OPTF_ARGREQ, 0, 'i' },
+ { "batch", OPTF_ARGREQ, 0, 'k' },
{ "public-exponent", OPTF_ARGREQ, 0, 'e' },
{ "time", OPTF_ARGREQ, 0, 't' },
{ "no-check", 0, 0, 'q' },
{ 0, 0, 0, 0 }
};
- i = mdwopt(argc, argv, "hvulC:b:B:n:i:e:t:q", opts, 0, 0, 0);
+ i = mdwopt(argc, argv, "hvulC:b:B:n:i:k:e:t:q", opts, 0, 0, 0);
if (i < 0) break;
switch (i) {
case 'h': help(stdout); exit(0);
break;
case 'i': o.i = uarg("interval count", optarg); break;
case 't': o.t = farg("interval length", optarg); break;
+ case 'k': o.k = uarg("batch size", optarg); break;
case 'q': o.f |= OF_NOCHECK; break;
default: usage(stderr); exit(1);
}
p = j->init(&o);
n = 0;
- ttot = itot = 0;
+ ttot = itot = 0; cytot = 0; init_cyclecount();
gettimeofday(&tv_now, 0);
do {
tv_addl(&tv_next, &tv_now, o.t, fmod(o.t * MILLION, MILLION));
ii = 0;
- c_start = clock();
+ c0 = clock(); cy0 = cyclecount();
do {
- j->run(p);
- ii++;
+ for (k = 0; k < o.k; k++) { j->run(p); }
+ ii += k;
gettimeofday(&tv_now, 0);
} while (TV_CMP(&tv_now, <, &tv_next));
- c_stop = clock();
- t = (double)(c_stop - c_start)/CLOCKS_PER_SEC;
- itot += ii;
- ttot += t;
- printf("%5u: did = %5lu; /sec = %5f; avg /sec = %5f\n",
+ cy1 = cyclecount(); c1 = clock();
+ t = (double)(c1 - c0)/CLOCKS_PER_SEC;
+ itot += ii; ttot += t;
+ printf("%5u: did = %5lu; /sec = %5f; avg /sec = %5f",
n, ii, ii/t, itot/ttot);
+ if (cyclecount_active_p) {
+ SUB64(cydiff, cy1, cy0); cy = LO64(cydiff) + ldexp(HI64(cydiff), 32);
+ cytot += cy;
+ printf(" (cy/%s = %3f; avg cy/%s = %3f)",
+ o.opwhat, cy/ii/o.sc, o.opwhat, cytot/itot/o.sc);
+ }
+ putchar('\n');
fflush(stdout);
n++;
} while (!o.i || n < o.i);
-modes/
+/modes/
+/modes.am
+/stubs.am
+/stubs.gen-stamp
+
+/t/salsa20
+/t/sha3
+
+/sha224.c
+/sha224.h
+/sha384.h
+/sha384.c
+/sha512-224.c
+/sha512-224.h
+/sha512-256.c
+/sha512-256.h
+
+/safersk.c
+/safersk.h
+
+/whirlpool256.c
+/whirlpool256.h
+
+/sha3-224.c
+/sha3-224.h
+/sha3-256.c
+/sha3-256.h
+/sha3-384.c
+/sha3-384.h
+/sha3-512.c
+/sha3-512.h
+/kmac128.h
+/kmac256.h
+/shake128.h
+/shake256.h
+/shake128-xof.h
+/shake256-xof.h
+
+/chacha20.h
+/chacha12.h
+/chacha8.h
+/chacha12-ietf.h
+/chacha20-ietf.h
+/chacha8-ietf.h
+/xchacha.h
+/xchacha20.h
+/xchacha12.h
+/xchacha8.h
+
+/salsa2012.h
+/salsa208.h
+/salsa20-ietf.h
+/salsa2012-ietf.h
+/salsa208-ietf.h
+/xsalsa20.h
+/xsalsa2012.h
+/xsalsa208.h
#define BLKC_SHOW(PRE, tag, w) do { \
fputs(tag ": ", stdout); \
- BLKC_SKEL_X(PRE, BLKC_W(w);, printf("%08x ", *_w++);); \
+ BLKC_SKEL_X(PRE, const BLKC_W(w);, printf("%08x ", *_w++);); \
fputc('\n', stdout); \
} while (0)
///--------------------------------------------------------------------------
/// Main code.
- .arch pentium4
.text
+FUNC(chacha_core_x86ish_avx)
+ .arch .avx
+ vzeroupper
+ endprologue
+ // drop through...
+ENDFUNC
+
+ .arch pentium4
+
FUNC(chacha_core_x86ish_sse2)
// Initial setup.
// c += d; b ^= c; b <<<= 7
paddd xmm2, xmm3
- pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
+ pshufd xmm3, xmm3, SHUF(3, 0, 1, 2)
pxor xmm1, xmm2
- pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
+ pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
movdqa xmm4, xmm1
pslld xmm1, 7
psrld xmm4, 25
//
// The shuffles have quite high latency, so they've mostly been
// pushed upwards. The remaining one can't be moved, though.
- pshufd xmm1, xmm1, SHUF(0, 3, 2, 1)
+ pshufd xmm1, xmm1, SHUF(1, 2, 3, 0)
// Apply the diagonal quarterround to each of the columns
// simultaneously.
// c += d; b ^= c; b <<<= 7
paddd xmm2, xmm3
- pshufd xmm3, xmm3, SHUF(0, 3, 2, 1)
+ pshufd xmm3, xmm3, SHUF(1, 2, 3, 0)
pxor xmm1, xmm2
- pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
+ pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
movdqa xmm4, xmm1
pslld xmm1, 7
psrld xmm4, 25
// Finally, finish off undoing the transpose, and we're done for this
// doubleround. Again, most of this was done above so we don't have
// to wait for the shuffles.
- pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
+ pshufd xmm1, xmm1, SHUF(3, 0, 1, 2)
// Decrement the loop counter and see if we should go round again.
sub NR, 2
#if CPUFAM_X86 || CPUFAM_AMD64
extern core__functype chacha_core_x86ish_sse2;
+extern core__functype chacha_core_x86ish_avx;
#endif
#if CPUFAM_ARMEL
static core__functype *pick_core(void)
{
#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(chacha_core, chacha_core_x86ish_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX));
DISPATCH_PICK_COND(chacha_core, chacha_core_x86ish_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif
#if CPUFAM_X86 || CPUFAM_AMD64
extern setup__functype rijndael_setup_x86ish_aesni;
+extern setup__functype rijndael_setup_x86ish_aesni_avx;
#endif
#if CPUFAM_ARMEL && HAVE_AS_ARMV8_CRYPTO
extern setup__functype rijndael_setup_arm_crypto;
static setup__functype *pick_setup(void)
{
#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_x86ish_aesni_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX) &&
+ cpu_feature_p(CPUFEAT_X86_AESNI));
DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_x86ish_aesni,
cpu_feature_p(CPUFEAT_X86_AESNI));
#endif
///--------------------------------------------------------------------------
/// Key setup.
+FUNC(rijndael_setup_x86ish_aesni_avx)
+ vzeroupper // avoid penalty on `legacy' XMM access
+ endprologue
+ // and drop through...
+ENDFUNC
+
FUNC(rijndael_setup_x86ish_aesni)
#define SI WHOLE(si)
// Fourth word of the cycle, and seven or eight words of key. Do a
// byte substitution.
movd xmm0, eax
- pshufd xmm0, xmm0, SHUF(2, 1, 0, 3)
+ pshufd xmm0, xmm0, SHUF(3, 0, 1, 2)
aeskeygenassist xmm1, xmm0, 0
movd eax, xmm1
jmp 2f
// First word of the cycle. This is the complicated piece.
1: movd xmm0, eax
- pshufd xmm0, xmm0, SHUF(0, 3, 2, 1)
+ pshufd xmm0, xmm0, SHUF(1, 2, 3, 0)
aeskeygenassist xmm1, xmm0, 0
- pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
+ pshufd xmm1, xmm1, SHUF(3, 0, 1, 2)
movd eax, xmm1
xor al, [RCON]
inc RCON
/// Encrypting and decrypting blocks.
.macro encdec op, aes, koff
+ FUNC(rijndael_\op\()_x86ish_aesni_avx)
+ vzeroupper // avoid XMM penalties
+ endprologue
+ // and drop through...
+ ENDFUNC
+
FUNC(rijndael_\op\()_x86ish_aesni)
#if CPUFAM_X86
#if CPUFAM_X86 || CPUFAM_AMD64
extern rijndael_eblk__functype rijndael_eblk_x86ish_aesni;
extern rijndael_dblk__functype rijndael_dblk_x86ish_aesni;
+extern rijndael_eblk__functype rijndael_eblk_x86ish_aesni_avx;
+extern rijndael_dblk__functype rijndael_dblk_x86ish_aesni_avx;
#endif
#if CPUFAM_ARMEL && HAVE_AS_ARMV8_CRYPTO
extern rijndael_eblk__functype rijndael_eblk_arm_crypto;
static rijndael_eblk__functype *pick_eblk(void)
{
#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_x86ish_aesni_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX) &&
+ cpu_feature_p(CPUFEAT_X86_AESNI));
DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_x86ish_aesni,
cpu_feature_p(CPUFEAT_X86_AESNI));
#endif
static rijndael_dblk__functype *pick_dblk(void)
{
#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_x86ish_aesni_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX) &&
+ cpu_feature_p(CPUFEAT_X86_AESNI));
DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_x86ish_aesni,
cpu_feature_p(CPUFEAT_X86_AESNI));
#endif
///--------------------------------------------------------------------------
/// Main code.
- .arch pentium4
.text
+FUNC(salsa20_core_x86ish_avx)
+ .arch .avx
+ vzeroupper
+ endprologue
+ // drop through...
+ENDFUNC
+
+ .arch pentium4
+
FUNC(salsa20_core_x86ish_sse2)
// Initial setup.
// d ^= (c + b) <<< 13
movdqa xmm4, xmm2
paddd xmm4, xmm1
- pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
+ pshufd xmm1, xmm1, SHUF(3, 0, 1, 2)
movdqa xmm5, xmm4
pslld xmm4, 13
psrld xmm5, 19
// a ^= (d + c) <<< 18
movdqa xmm4, xmm3
- pshufd xmm3, xmm3, SHUF(0, 3, 2, 1)
+ pshufd xmm3, xmm3, SHUF(1, 2, 3, 0)
paddd xmm4, xmm2
- pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
+ pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
movdqa xmm5, xmm4
pslld xmm4, 18
psrld xmm5, 14
// d ^= (c + b) <<< 13
movdqa xmm4, xmm2
paddd xmm4, xmm3
- pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
+ pshufd xmm3, xmm3, SHUF(3, 0, 1, 2)
movdqa xmm5, xmm4
pslld xmm4, 13
psrld xmm5, 19
// a ^= (d + c) <<< 18
movdqa xmm4, xmm1
- pshufd xmm1, xmm1, SHUF(0, 3, 2, 1)
+ pshufd xmm1, xmm1, SHUF(1, 2, 3, 0)
paddd xmm4, xmm2
- pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
+ pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
movdqa xmm5, xmm4
pslld xmm4, 18
psrld xmm5, 14
// input. This can be done by juggling values in registers, with the
// following fancy footwork: some row rotations, a transpose, and
// some more rotations.
- pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // 3, 4, 9, 14
- pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) // 2, 7, 8, 13
- pshufd xmm3, xmm3, SHUF(0, 3, 2, 1) // 1, 6, 11, 12
+ pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) // 3, 4, 9, 14
+ pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) // 2, 7, 8, 13
+ pshufd xmm3, xmm3, SHUF(1, 2, 3, 0) // 1, 6, 11, 12
movdqa xmm4, xmm0
movdqa xmm5, xmm3
punpckhdq xmm1, xmm3 // 5, 6, 7, 4
punpckhdq xmm2, xmm5 // 15, 12, 13, 14
- pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // 4, 5, 6, 7
- pshufd xmm4, xmm4, SHUF(1, 0, 3, 2) // 8, 9, 10, 11
- pshufd xmm2, xmm2, SHUF(0, 3, 2, 1) // 12, 13, 14, 15
+ pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) // 4, 5, 6, 7
+ pshufd xmm4, xmm4, SHUF(2, 3, 0, 1) // 8, 9, 10, 11
+ pshufd xmm2, xmm2, SHUF(1, 2, 3, 0) // 12, 13, 14, 15
// Finally we have to write out the result.
movdqu [OUT + 0], xmm0
#if CPUFAM_X86 || CPUFAM_AMD64
extern core__functype salsa20_core_x86ish_sse2;
+extern core__functype salsa20_core_x86ish_avx;
#endif
#if CPUFAM_ARMEL
static core__functype *pick_core(void)
{
#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86ish_avx,
+ cpu_feature_p(CPUFEAT_X86_AVX));
DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86ish_sse2,
cpu_feature_p(CPUFEAT_X86_SSE2));
#endif
#ifndef CATACOMB_@{name:u:c}_H
#define CATACOMB_@{name:u:c}_H
-#ifndef CATACOMB_@{base:u}_H
+#ifndef CATACOMB_@{base:u:c}_H
# include "@base.h"
#endif