From: Mark Wooding Date: Wed, 18 May 2016 09:29:03 +0000 (+0100) Subject: Merge branch 'mdw/cpu-dispatch' X-Git-Tag: 2.2.3~1^2~22 X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/commitdiff_plain/1da1ed6a5815deef6c33d74f1eb3c856793df3e5?hp=-c Merge branch 'mdw/cpu-dispatch' * mdw/cpu-dispatch: Add support machinery for ARM hosts. base/dispatch.c: Add (unused) machinery for probing ELF auxiliary vector. Add support for AMD64 processors and Microsoft Windows. symm/rijndael-x86-aesni.S: Unify encryption and decryption with a macro. symm/rijndael-x86-aesni.S: Use xmm5 instead of xmm7. symm/*.S: Symbolic names for shuffles. symm/chacha-x86-sse2.S: Fix the register allocation comment. Preprocess the assembler files. configure.ac: Improve the host CPU family detection. base/dispatch.c: Indent some preprocessor definitions properly. Add a pile of debug output around the CPU dispatching machinery. base/dispatch.c: Add documentation for some internal functions. base/dispatch.c: Add in more useful section markers. Support Intel's AES Native Instructions where available on x86 hardware. symm/: New SSE2 implementations of Salsa20 and ChaCha. symm/salsa20.c, symm/salsa20-core.h: Permute input matrix for SIMD. debian/rules: Run tests twice, once without any detected CPU features. base/dispatch.c: Check operating system support for XMM registers. configure.ac, base/dispatch.[ch]: CPU-specific implementations. configure.ac: Arrange to have an assembler available. 
Conflicts: configure.ac symm/Makefile.am --- 1da1ed6a5815deef6c33d74f1eb3c856793df3e5 diff --combined configure.ac index c3e55ca4,e3e5c21a..715a5c28 --- a/configure.ac +++ b/configure.ac @@@ -31,7 -31,8 +31,8 @@@ mdw_AUTO_VERSIO AC_INIT([catacomb], AUTO_VERSION, [mdw@distorted.org.uk]) AC_CONFIG_SRCDIR([catacomb.pc.in]) AC_CONFIG_AUX_DIR([config]) -AM_INIT_AUTOMAKE([foreign parallel-tests]) +AM_INIT_AUTOMAKE([foreign parallel-tests color-tests subdir-objects]) + AC_CANONICAL_HOST mdw_SILENT_RULES AC_PROG_CC @@@ -39,11 -40,102 +40,102 @@@ AX_CFLAGS_WARN_AL AM_PROG_LIBTOOL mdw_LIBTOOL_VERSION_INFO + AM_PROG_AS + AC_PROG_YACC AC_SUBST(AM_CFLAGS) dnl-------------------------------------------------------------------------- + dnl Host-specific configuration. + + AC_MSG_CHECKING([CPU family and ABI]) + + dnl The table of CPU families and ABIs which we might support. Support is + dnl not uniform: each dispatched function might or might not have an + dnl implementation for any particular CPU/ABI combination. + AC_DEFUN([catacomb_CPU_FAMILIES], + [$1([i[[3-6]]86,cygwin], [x86], [win]) + $1([i[[3-6]]86,*], [x86], [sysv]) + $1([x86_64,cygwin], [amd64], [win]) + $1([x86_64,*], [amd64], [sysv]) + $1([armv*,*-gnueabi | armv*,*-gnueabihf], [armel], [gnueabi])]) + + dnl A utility to clear the `seen' flags, used so as to process each CPU or + dnl ABI once. + m4_define([catacomb_CLEAR_FLAGS], + [m4_ifdef([catacomb_seen_cpu/$2], + [m4_undefine([catacomb_seen_cpu/$2])])dnl + m4_ifdef([catacomb_seen_abi/$3], + [m4_undefine([catacomb_seen_abi/$3])])]) + + dnl Identify the current host. + case $host_cpu,$host_os in + m4_define([catacomb_CPU_CASE], + [$1) CPUFAM=$2 ABI=$3 ;; + ]) + catacomb_CPU_FAMILIES([catacomb_CPU_CASE]) + *) CPUFAM=nil ABI=nil ;; + esac + + dnl Figure out the current CPU. 
+ catacomb_CPU_FAMILIES([catacomb_CLEAR_FLAGS]) + case $CPUFAM in + m4_define([catacomb_DEFINE_CPU], + [m4_ifdef([catacomb_seen_cpu/$2], [], + [$2) + AC_DEFINE([CPUFAM_]m4_translit([$2], [a-z], [A-Z]), [1], + [Define if host CPU family is \`$2\'.]) + ;;m4_define([catacomb_seen_cpu/$2], [t])])]) + catacomb_CPU_FAMILIES([catacomb_DEFINE_CPU]) + nil) ;; + *) AC_MSG_ERROR([BUG: unexpected cpufam \`$CPUFAM']) ;; + esac + AC_SUBST([CPUFAM]) + + dnl Figure out the current ABI. + catacomb_CPU_FAMILIES([catacomb_CLEAR_FLAGS]) + case $ABI in + m4_define([catacomb_DEFINE_ABI], + [m4_ifdef([catacomb_seen_abi/$3], [], + [$3) + AC_DEFINE([ABI_]m4_translit([$3], [a-z], [A-Z]), [1], + [Define if host ABI variant is \`$3\'.]) + ;;m4_define([catacomb_seen_abi/$3], [t])])]) + catacomb_CPU_FAMILIES([catacomb_DEFINE_ABI]) + nil) ;; + *) AC_MSG_ERROR([BUG: unexpected ABI \`$ABI']) ;; + esac + AC_SUBST([ABI]) + + dnl Establish Automake conditions for things. + catacomb_CPU_FAMILIES([catacomb_CLEAR_FLAGS]) + m4_define([catacomb_COND_CPU], + [m4_define([_CPU], m4_translit([$2], [a-z], [A-Z])) + m4_define([_ABI], m4_translit([$3], [a-z], [A-Z])) + AM_CONDITIONAL([CPUABI_]_CPU[_]_ABI, [test x$CPUFAM/$ABI = x$2/$3]) + m4_ifdef([catacomb_seen_cpu/$2], [], + [AM_CONDITIONAL([CPUFAM_]_CPU, [test x$CPUFAM = x$2])dnl + m4_define([catacomb_seen_cpu/$2], [t])]) + m4_ifdef([catacomb_seen_abi/$3], [], + [AM_CONDITIONAL([ABI_]_ABI, [test x$ABI = x$3])dnl + m4_define([catacomb_seen_abi/$3], [t])])]) + catacomb_CPU_FAMILIES([catacomb_COND_CPU]) + AM_CONDITIONAL([KNOWN_CPUFAM], [test x$CPUFAM != xnil]) + + dnl Report on what we found. + case $CPUFAM in + nil) AC_MSG_RESULT([not supported]) ;; + *) AC_MSG_RESULT([$CPUFAM/$ABI]) ;; + esac + + dnl Some equipment wanted for checking CPU features at runtime. 
+ AC_CHECK_HEADERS([asm/hwcap.h]) + AC_CHECK_HEADERS([sys/auxv.h]) + AC_CHECK_HEADERS([linux/auxvec.h]) + AC_CHECK_FUNCS([getauxval]) + + dnl-------------------------------------------------------------------------- dnl C programming environment. dnl Find out if we're cross-compiling. @@@ -156,7 -248,7 +248,7 @@@ dnl Memory locking support AC_CHECK_FUNCS([mlock]) dnl Necessary support libraries. -PKG_CHECK_MODULES([mLib], [mLib >= 2.2.1]) +PKG_CHECK_MODULES([mLib], [mLib >= 2.2.2.1]) AM_CFLAGS="$AM_CFLAGS $mLib_CFLAGS" dnl-------------------------------------------------------------------------- diff --combined symm/Makefile.am index 69c1013f,e78277b7..1d3374f5 --- a/symm/Makefile.am +++ b/symm/Makefile.am @@@ -102,23 -102,22 +102,23 @@@ pkginclude_HEADERS += $(BLKC_H ## Schneier's `Blowfish' block cipher. BLKCS += blowfish -libsymm_la_SOURCES += $(precomp)/blowfish-tab.c -PRECOMPS += $(precomp)/blowfish-tab.c +nodist_libsymm_la_SOURCES += ../precomp/symm/blowfish-tab.c +PRECOMPS += $(precomp)/symm/blowfish-tab.c PRECOMP_PROGS += blowfish-mktab blowfish_mktab_CPPFLAGS = $(AM_CPPFLAGS) -DQUIET if !CROSS_COMPILING -$(precomp)/blowfish-tab.c: - $(AM_V_at)$(MKDIR_P) $(precomp) - $(AM_V_at)$(MAKE) blowfish-mktab$e - $(AM_V_GEN)./blowfish-mktab >$(precomp)/blowfish-tab.c.new && \ - mv $(precomp)/blowfish-tab.c.new $(precomp)/blowfish-tab.c +$(precomp)/symm/blowfish-tab.c: + $(AM_V_at)$(MKDIR_P) $(precomp)/symm + $(AM_V_at)$(MAKE) blowfish-mktab$(EXEEXT) + $(AM_V_GEN)./blowfish-mktab >$(precomp)/symm/blowfish-tab.c.new && \ + mv $(precomp)/symm/blowfish-tab.c.new \ + $(precomp)/symm/blowfish-tab.c endif ## Adams and Tavares' `CAST' block ciphers. BLKCS += cast128 cast256 libsymm_la_SOURCES += cast-s.c cast-sk.c cast-base.h -cast256.$t: t/cast256 +cast256.t$(EXEEXT): t/cast256 EXTRA_DIST += t/cast256.aes MAINTAINERCLEANFILES += $(srcdir)/t/cast256 t/cast256: t/cast256.aes @@@ -129,16 -128,15 +129,16 @@@ ## IBM's `DES' block cipher, by Feistel, Coppersmith, and others. 
BLKCS += des des3 -libsymm_la_SOURCES += des-base.h $(precomp)/des-tab.c -PRECOMPS += $(precomp)/des-tab.c +libsymm_la_SOURCES += des-base.h +nodist_libsymm_la_SOURCES += ../precomp/symm/des-tab.c +PRECOMPS += $(precomp)/symm/des-tab.c PRECOMP_PROGS += des-mktab if !CROSS_COMPILING -$(precomp)/des-tab.c: - $(AM_V_at)$(MKDIR_P) $(precomp) - $(AM_V_at)$(MAKE) des-mktab$e - $(AM_V_GEN)./des-mktab >$(precomp)/des-tab.c.new && \ - mv $(precomp)/des-tab.c.new $(precomp)/des-tab.c +$(precomp)/symm/des-tab.c: + $(AM_V_at)$(MKDIR_P) $(precomp)/symm + $(AM_V_at)$(MAKE) des-mktab$(EXEEXT) + $(AM_V_GEN)./des-mktab >$(precomp)/symm/des-tab.c.new && \ + mv $(precomp)/symm/des-tab.c.new $(precomp)/symm/des-tab.c endif ## Rivest's `DESX' variant, with pre- and post-whitening. @@@ -150,17 -148,17 +150,17 @@@ BLKCS += ide ## IBM's `MARS' block cipher. BLKCS += mars -libsymm_la_SOURCES += $(precomp)/mars-tab.c -PRECOMPS += $(precomp)/mars-tab.c +nodist_libsymm_la_SOURCES += ../precomp/symm/mars-tab.c +PRECOMPS += $(precomp)/symm/mars-tab.c PRECOMP_PROGS += mars-mktab if !CROSS_COMPILING -$(precomp)/mars-tab.c: - $(AM_V_at)$(MKDIR_P) $(precomp) - $(AM_V_at)$(MAKE) mars-mktab$e - $(AM_V_GEN)./mars-mktab >$(precomp)/mars-tab.c.new && \ - mv $(precomp)/mars-tab.c.new $(precomp)/mars-tab.c +$(precomp)/symm/mars-tab.c: + $(AM_V_at)$(MKDIR_P) $(precomp)/symm + $(AM_V_at)$(MAKE) mars-mktab$(EXEEXT) + $(AM_V_GEN)./mars-mktab >$(precomp)/symm/mars-tab.c.new && \ + mv $(precomp)/symm/mars-tab.c.new $(precomp)/symm/mars-tab.c endif -mars.$t: t/mars +mars.t$(EXEEXT): t/mars EXTRA_DIST += t/mars.aes MAINTAINERCLEANFILES += $(srcdir)/t/mars t/mars: t/mars.aes @@@ -182,18 -180,23 +182,24 @@@ BLKCS += rc ## Daemen and Rijmen's `Rijndael' block cipher, selected as AES. 
BLKCS += rijndael rijndael192 rijndael256 libsymm_la_SOURCES += rijndael-base.h rijndael-base.c + if CPUFAM_X86 + libsymm_la_SOURCES += rijndael-x86ish-aesni.S + endif + if CPUFAM_AMD64 + libsymm_la_SOURCES += rijndael-x86ish-aesni.S + endif -libsymm_la_SOURCES += $(precomp)/rijndael-tab.c -PRECOMPS += $(precomp)/rijndael-tab.c +nodist_libsymm_la_SOURCES += ../precomp/symm/rijndael-tab.c +PRECOMPS += $(precomp)/symm/rijndael-tab.c PRECOMP_PROGS += rijndael-mktab if !CROSS_COMPILING -$(precomp)/rijndael-tab.c: - $(AM_V_at)$(MKDIR_P) $(precomp) - $(AM_V_at)$(MAKE) rijndael-mktab$e - $(AM_V_GEN)./rijndael-mktab >$(precomp)/rijndael-tab.c.new && \ - mv $(precomp)/rijndael-tab.c.new $(precomp)/rijndael-tab.c +$(precomp)/symm/rijndael-tab.c: + $(AM_V_at)$(MKDIR_P) $(precomp)/symm + $(AM_V_at)$(MAKE) rijndael-mktab$(EXEEXT) + $(AM_V_GEN)./rijndael-mktab >$(precomp)/symm/rijndael-tab.c.new && \ + mv $(precomp)/symm/rijndael-tab.c.new \ + $(precomp)/symm/rijndael-tab.c endif -rijndael.$t: t/rijndael +rijndael.t$(EXEEXT): t/rijndael EXTRA_DIST += t/rijndael.aes MAINTAINERCLEANFILES += $(srcdir)/t/rijndael t/rijndael: t/rijndael.aes @@@ -204,17 -207,16 +210,17 @@@ ## Massey's `SAFER' block ciphers. 
BLKCS += safer safersk -libsymm_la_SOURCES += $(precomp)/safer-tab.c -PRECOMPS += $(precomp)/safer-tab.c +nodist_libsymm_la_SOURCES += ../precomp/symm/safer-tab.c +PRECOMPS += $(precomp)/symm/safer-tab.c PRECOMP_PROGS += safer-mktab STUBS_HDR += SAFER-SK,safersk,safer if !CROSS_COMPILING -$(precomp)/safer-tab.c: - $(AM_V_at)$(MKDIR_P) $(precomp) - $(AM_V_at)$(MAKE) safer-mktab$e - $(AM_V_GEN)./safer-mktab >$(precomp)/safer-tab.c.new && \ - mv $(precomp)/safer-tab.c.new $(precomp)/safer-tab.c +$(precomp)/symm/safer-tab.c: + $(AM_V_at)$(MKDIR_P) $(precomp)/symm + $(AM_V_at)$(MAKE) safer-mktab$(EXEEXT) + $(AM_V_GEN)./safer-mktab >$(precomp)/symm/safer-tab.c.new && \ + mv $(precomp)/symm/safer-tab.c.new \ + $(precomp)/symm/safer-tab.c endif ## Anderson, Biham and Knudsen's `Serpent' block cipher. @@@ -222,7 -224,7 +228,7 @@@ BLKCS += serpen libsymm_la_SOURCES += serpent-sbox.h check_PROGRAMS += serpent-check TESTS += serpent-check -serpent.$t: t/serpent +serpent.t$(EXEEXT): t/serpent EXTRA_DIST += t/serpent.aes MAINTAINERCLEANFILES += $(srcdir)/t/serpent t/serpent: t/serpent.aes @@@ -238,16 -240,15 +244,16 @@@ libsymm_la_SOURCES += skipjack-tab. ## Daemen and Rijmen's `Square' block cipher. BLKCS += square -libsymm_la_SOURCES += $(precomp)/square-tab.c -PRECOMPS += $(precomp)/square-tab.c +nodist_libsymm_la_SOURCES += ../precomp/symm/square-tab.c +PRECOMPS += $(precomp)/symm/square-tab.c PRECOMP_PROGS += square-mktab if !CROSS_COMPILING -$(precomp)/square-tab.c: - $(AM_V_at)$(MKDIR_P) $(precomp) - $(AM_V_at)$(MAKE) square-mktab$e - $(AM_V_GEN)./square-mktab >$(precomp)/square-tab.c.new && \ - mv $(precomp)/square-tab.c.new $(precomp)/square-tab.c +$(precomp)/symm/square-tab.c: + $(AM_V_at)$(MKDIR_P) $(precomp)/symm + $(AM_V_at)$(MAKE) square-mktab$(EXEEXT) + $(AM_V_GEN)./square-mktab >$(precomp)/symm/square-tab.c.new && \ + mv $(precomp)/symm/square-tab.c.new \ + $(precomp)/symm/square-tab.c endif ## Wheeler and Needham's `TEA' and `XTEA' block ciphers. 
@@@ -256,18 -257,17 +262,18 @@@ BLKCS += tea xte ## Schneier, Kelsey, Whiting, Wagner, Hall and Ferguson's `Twofish' block ## cipher. BLKCS += twofish -libsymm_la_SOURCES += $(precomp)/twofish-tab.c -PRECOMPS += $(precomp)/twofish-tab.c +nodist_libsymm_la_SOURCES += ../precomp/symm/twofish-tab.c +PRECOMPS += $(precomp)/symm/twofish-tab.c PRECOMP_PROGS += twofish-mktab if !CROSS_COMPILING -$(precomp)/twofish-tab.c: - $(AM_V_at)$(MKDIR_P) $(precomp) - $(AM_V_at)$(MAKE) twofish-mktab$e - $(AM_V_GEN)./twofish-mktab >$(precomp)/twofish-tab.c.new && \ - mv $(precomp)/twofish-tab.c.new $(precomp)/twofish-tab.c +$(precomp)/symm/twofish-tab.c: + $(AM_V_at)$(MKDIR_P) $(precomp)/symm + $(AM_V_at)$(MAKE) twofish-mktab$(EXEEXT) + $(AM_V_GEN)./twofish-mktab >$(precomp)/symm/twofish-tab.c.new && \ + mv $(precomp)/symm/twofish-tab.c.new \ + $(precomp)/symm/twofish-tab.c endif -twofish.$t: t/twofish +twofish.t$(EXEEXT): t/twofish EXTRA_DIST += t/twofish.aes MAINTAINERCLEANFILES += $(srcdir)/t/twofish t/twofish: t/twofish.aes @@@ -322,33 -322,29 +328,33 @@@ STUBS_HDR += SHA-384,sha384,sha51 ## Anderson and Biham's `Tiger' hash function. HASHES += tiger -libsymm_la_SOURCES += tiger-base.h $(precomp)/tiger-tab.c -PRECOMPS += $(precomp)/tiger-tab.c +libsymm_la_SOURCES += tiger-base.h +nodist_libsymm_la_SOURCES += ../precomp/symm/tiger-tab.c +PRECOMPS += $(precomp)/symm/tiger-tab.c PRECOMP_PROGS += tiger-mktab if !CROSS_COMPILING -$(precomp)/tiger-tab.c: - $(AM_V_at)$(MKDIR_P) $(precomp) - $(AM_V_at)$(MAKE) tiger-mktab$e - $(AM_V_GEN)./tiger-mktab >$(precomp)/tiger-tab.c.new && \ - mv $(precomp)/tiger-tab.c.new $(precomp)/tiger-tab.c +$(precomp)/symm/tiger-tab.c: + $(AM_V_at)$(MKDIR_P) $(precomp)/symm + $(AM_V_at)$(MAKE) tiger-mktab$(EXEEXT) + $(AM_V_GEN)./tiger-mktab >$(precomp)/symm/tiger-tab.c.new && \ + mv $(precomp)/symm/tiger-tab.c.new \ + $(precomp)/symm/tiger-tab.c endif ## Barreto and Rijmen's `Whirlpool' hash function. 
HASHES += whirlpool whirlpool256 -libsymm_la_SOURCES += $(precomp)/whirlpool-tab.c -PRECOMPS += $(precomp)/whirlpool-tab.c +nodist_libsymm_la_SOURCES += ../precomp/symm/whirlpool-tab.c +PRECOMPS += $(precomp)/symm/whirlpool-tab.c PRECOMP_PROGS += whirlpool-mktab STUBS_HDR += Whirlpool-256,whirlpool256,whirlpool if !CROSS_COMPILING -$(precomp)/whirlpool-tab.c: - $(AM_V_at)$(MKDIR_P) $(precomp) - $(AM_V_at)$(MAKE) whirlpool-mktab$e - $(AM_V_GEN)./whirlpool-mktab >$(precomp)/whirlpool-tab.c.new && \ - mv $(precomp)/whirlpool-tab.c.new $(precomp)/whirlpool-tab.c +$(precomp)/symm/whirlpool-tab.c: + $(AM_V_at)$(MKDIR_P) $(precomp)/symm + $(AM_V_at)$(MAKE) whirlpool-mktab$(EXEEXT) + $(AM_V_GEN)./whirlpool-mktab \ + >$(precomp)/symm/whirlpool-tab.c.new && \ + mv $(precomp)/symm/whirlpool-tab.c.new \ + $(precomp)/symm/whirlpool-tab.c endif ## Bellare, Canetti and Krawczyk's `HMAC' mode for message authentication. @@@ -368,14 -364,14 +374,14 @@@ ALL_HASHES += crc32=gcrc3 ## Rivest's `RC4' stream cipher. pkginclude_HEADERS += rc4.h libsymm_la_SOURCES += rc4.c -TESTS += rc4.$t +TESTS += rc4.t$(EXEEXT) EXTRA_DIST += t/rc4 ALL_CIPHERS += rc4 ## Coppersmith and Rogaway's `SEAL' pseudorandom function. 
pkginclude_HEADERS += seal.h libsymm_la_SOURCES += seal.c -TESTS += seal.$t +TESTS += seal.t$(EXEEXT) EXTRA_DIST += t/seal ALL_CIPHERS += seal @@@ -388,7 -384,13 +394,13 @@@ EXTRA_DIST += salsa20-tvconv pkginclude_HEADERS += salsa20.h salsa20-core.h libsymm_la_SOURCES += salsa20.c + if CPUFAM_X86 + libsymm_la_SOURCES += salsa20-x86ish-sse2.S + endif + if CPUFAM_AMD64 + libsymm_la_SOURCES += salsa20-x86ish-sse2.S + endif -TESTS += salsa20.$t +TESTS += salsa20.t$(EXEEXT) ALL_CIPHERS += salsa20 salsa2012 salsa208 ALL_CIPHERS += xsalsa20 xsalsa2012 xsalsa208 STUBS_HDR += Salsa20/12,salsa2012,salsa20 @@@ -398,7 -400,7 +410,7 @@@ STUBS_HDR += XSalsa20/12,xsalsa2012,sa STUBS_HDR += XSalsa20/8,xsalsa208,salsa20 EXTRA_DIST += t/salsa20 MAINTAINERCLEANFILES += t/salsa20 -salsa20.$t: t/salsa20 +salsa20.t$(EXEEXT): t/salsa20 SALSA20_ESTREAM_TV = t/salsa20.estream SALSA20_ESTREAM_TV += t/salsa2012.estream SALSA20_ESTREAM_TV += t/salsa208.estream @@@ -414,7 -416,13 +426,13 @@@ t/salsa20: salsa20-tvconv t/salsa20.loc ## Bernstein's `ChaCha' stream cipher. pkginclude_HEADERS += chacha.h chacha-core.h libsymm_la_SOURCES += chacha.c + if CPUFAM_X86 + libsymm_la_SOURCES += chacha-x86ish-sse2.S + endif + if CPUFAM_AMD64 + libsymm_la_SOURCES += chacha-x86ish-sse2.S + endif -TESTS += chacha.$t +TESTS += chacha.t$(EXEEXT) EXTRA_DIST += t/chacha ALL_CIPHERS += chacha20 chacha12 chacha8 ALL_CIPHERS += xchacha20 xchacha12 xchacha8 @@@ -525,6 -533,6 +543,6 @@@ EXTRA_DIST += $(SYMM_TEST_FILES EXTRA_DIST += daftstory.h ## Clean the debris from the `modes' subdirectory. 
-CLEANFILES += modes/*.to modes/*.$t +CLEANFILES += modes/*.to modes/*.t$(EXEEXT) ###----- That's all, folks -------------------------------------------------- diff --combined symm/chacha.c index e694ad22,80a84c17..0c8aa003 --- a/symm/chacha.c +++ b/symm/chacha.c @@@ -27,6 -27,8 +27,8 @@@ /*----- Header files ------------------------------------------------------*/ + #include "config.h" + #include #include @@@ -34,6 -36,7 +36,7 @@@ #include "arena.h" #include "chacha.h" #include "chacha-core.h" + #include "dispatch.h" #include "gcipher.h" #include "grand.h" #include "keysz.h" @@@ -59,9 -62,29 +62,29 @@@ const octet chacha_keysz[] = { KSZ_SET * the feedforward step. */ - static void core(unsigned r, const chacha_matrix src, chacha_matrix dest) + CPU_DISPATCH(static, (void), + void, core, (unsigned r, const chacha_matrix src, + chacha_matrix dest), + (r, src, dest), + pick_core, simple_core); + + static void simple_core(unsigned r, const chacha_matrix src, + chacha_matrix dest) { CHACHA_nR(dest, src, r); CHACHA_FFWD(dest, src); } + #if CPUFAM_X86 || CPUFAM_AMD64 + extern core__functype chacha_core_x86ish_sse2; + #endif + + static core__functype *pick_core(void) + { + #if CPUFAM_X86 || CPUFAM_AMD64 + DISPATCH_PICK_COND(chacha_core, chacha_core_x86ish_sse2, + cpu_feature_p(CPUFEAT_X86_SSE2)); + #endif + DISPATCH_PICK_FALLBACK(chacha_core, simple_core); + } + /* --- @populate@ --- * * * Arguments: @chacha_matrix a@ = a matrix to fill in @@@ -672,7 -695,7 +695,7 @@@ static void grdestroy(grand *r static const grand_ops grops_rand_##rr = { \ "chacha" #rr, GRAND_CRYPTO, 0, \ grmisc, grdestroy, grword, \ - grbyte, grword, grand_range, grfill \ + grbyte, grword, grand_defaultrange, grfill \ }; \ \ grand *chacha##rr##_rand(const void *k, size_t ksz, const void *n) \ @@@ -714,7 -737,7 +737,7 @@@ CHACHA_VARS(DEFGRAND static const grand_ops grxops_rand_##rr = { \ "xchacha" #rr, GRAND_CRYPTO, 0, \ grmisc, grxdestroy_##rr, grword, \ - grbyte, grword, grand_range, grfill \ + 
grbyte, grword, grand_defaultrange, grfill \ }; \ \ grand *xchacha##rr##_rand(const void *k, size_t ksz, const void *n) \ diff --combined symm/salsa20.c index 4b35cbd7,eb4e67ad..40f28fc0 --- a/symm/salsa20.c +++ b/symm/salsa20.c @@@ -7,11 -7,14 +7,14 @@@ /*----- Header files ------------------------------------------------------*/ + #include "config.h" + #include #include #include "arena.h" + #include "dispatch.h" #include "gcipher.h" #include "grand.h" #include "keysz.h" @@@ -39,9 -42,29 +42,29 @@@ const octet salsa20_keysz[] = { KSZ_SET * the feedforward step. */ - static void core(unsigned r, const salsa20_matrix src, salsa20_matrix dest) + CPU_DISPATCH(static, (void), + void, core, (unsigned r, const salsa20_matrix src, + salsa20_matrix dest), + (r, src, dest), + pick_core, simple_core); + + static void simple_core(unsigned r, const salsa20_matrix src, + salsa20_matrix dest) { SALSA20_nR(dest, src, r); SALSA20_FFWD(dest, src); } + #if CPUFAM_X86 || CPUFAM_AMD64 + extern core__functype salsa20_core_x86ish_sse2; + #endif + + static core__functype *pick_core(void) + { + #if CPUFAM_X86 || CPUFAM_AMD64 + DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86ish_sse2, + cpu_feature_p(CPUFEAT_X86_SSE2)); + #endif + DISPATCH_PICK_FALLBACK(salsa20_core, simple_core); + } + /* --- @populate@ --- * * * Arguments: @salsa20_matrix a@ = a matrix to fill in @@@ -61,33 -84,42 +84,42 @@@ static void populate(salsa20_matrix a, KSZ_ASSERT(salsa20, ksz); - a[ 1] = LOAD32_L(k + 0); - a[ 2] = LOAD32_L(k + 4); + /* Here's the pattern of key, constant, nonce, and counter pieces in the + * matrix, before and after our permutation. 
+ * + * [ C0 K0 K1 K2 ] [ C0 C1 C2 C3 ] + * [ K3 C1 N0 N1 ] --> [ K3 T1 K7 K2 ] + * [ T0 T1 C2 K4 ] [ T0 K6 K1 N1 ] + * [ K5 K6 K7 C3 ] [ K5 K0 N0 K4 ] + */ + + a[13] = LOAD32_L(k + 0); + a[10] = LOAD32_L(k + 4); if (ksz == 10) { - a[ 3] = LOAD16_L(k + 8); + a[ 7] = LOAD16_L(k + 8); a[ 4] = 0; } else { - a[ 3] = LOAD32_L(k + 8); + a[ 7] = LOAD32_L(k + 8); a[ 4] = LOAD32_L(k + 12); } if (ksz <= 16) { - a[11] = a[ 1]; - a[12] = a[ 2]; - a[13] = a[ 3]; - a[14] = a[ 4]; + a[15] = a[13]; + a[12] = a[10]; + a[ 9] = a[ 7]; + a[ 6] = a[ 4]; a[ 0] = SALSA20_A128; - a[ 5] = SALSA20_B128; - a[10] = ksz == 10 ? SALSA20_C80 : SALSA20_C128; - a[15] = SALSA20_D128; + a[ 1] = SALSA20_B128; + a[ 2] = ksz == 10 ? SALSA20_C80 : SALSA20_C128; + a[ 3] = SALSA20_D128; } else { - a[11] = LOAD32_L(k + 16); + a[15] = LOAD32_L(k + 16); a[12] = LOAD32_L(k + 20); - a[13] = LOAD32_L(k + 24); - a[14] = LOAD32_L(k + 28); + a[ 9] = LOAD32_L(k + 24); + a[ 6] = LOAD32_L(k + 28); a[ 0] = SALSA20_A256; - a[ 5] = SALSA20_B256; - a[10] = SALSA20_C256; - a[15] = SALSA20_D256; + a[ 1] = SALSA20_B256; + a[ 2] = SALSA20_C256; + a[ 3] = SALSA20_D256; } } @@@ -130,8 -162,8 +162,8 @@@ void salsa20_setnonce(salsa20_ctx *ctx { const octet *n = nonce; - ctx->a[6] = LOAD32_L(n + 0); - ctx->a[7] = LOAD32_L(n + 4); + ctx->a[14] = LOAD32_L(n + 0); + ctx->a[11] = LOAD32_L(n + 4); salsa20_seek(ctx, 0); } @@@ -153,7 -185,7 +185,7 @@@ void salsa20_seek(salsa20_ctx *ctx, uns void salsa20_seeku64(salsa20_ctx *ctx, kludge64 i) { - ctx->a[8] = LO64(i); ctx->a[9] = HI64(i); + ctx->a[8] = LO64(i); ctx->a[5] = HI64(i); ctx->bufi = SALSA20_OUTSZ; } @@@ -169,7 -201,7 +201,7 @@@ unsigned long salsa20_tell(salsa20_ctx { kludge64 i = salsa20_tellu64(ctx); return (GET64(unsigned long, i)); } kludge64 salsa20_tellu64(salsa20_ctx *ctx) - { kludge64 i; SET64(i, ctx->a[9], ctx->a[8]); return (i); } + { kludge64 i; SET64(i, ctx->a[5], ctx->a[8]); return (i); } /* --- @salsa20{,12,8}_encrypt@ --- * * @@@ -272,10 -304,10 +304,10 @@@ 
SALSA20_VARS(DEFENCRYPT * speed critical, so we do it the harder way. \ */ \ \ - for (i = 0; i < 4; i++) k[i + 6] = src[i]; \ + for (i = 0; i < 4; i++) k[14 - 3*i] = src[i]; \ core(r, k, a); \ - for (i = 0; i < 4; i++) dest[i] = a[5*i] - k[5*i]; \ - for (i = 4; i < 8; i++) dest[i] = a[i + 2] - k[i + 2]; \ + for (i = 0; i < 4; i++) dest[i] = a[5*i] - k[i]; \ + for (i = 4; i < 8; i++) dest[i] = a[i + 2] - k[26 - 3*i]; \ } \ \ void HSALSA20_PRF(r, salsa20_ctx *ctx, const void *src, void *dest) \ @@@ -340,9 -372,9 +372,9 @@@ SALSA20_VARS(DEFHSALSA20 \ populate(ctx->k, key, ksz); \ ctx->s.a[ 0] = SALSA20_A256; \ - ctx->s.a[ 5] = SALSA20_B256; \ - ctx->s.a[10] = SALSA20_C256; \ - ctx->s.a[15] = SALSA20_D256; \ + ctx->s.a[ 1] = SALSA20_B256; \ + ctx->s.a[ 2] = SALSA20_C256; \ + ctx->s.a[ 3] = SALSA20_D256; \ XSALSA20_SETNONCE(r, ctx, nonce ? nonce : zerononce); \ } SALSA20_VARS(DEFXINIT) @@@ -371,8 -403,8 +403,8 @@@ \ for (i = 0; i < 4; i++) in[i] = LOAD32_L(n + 4*i); \ HSALSA20_RAW(r, ctx->k, in, out); \ - for (i = 0; i < 4; i++) ctx->s.a[i + 1] = out[i]; \ - for (i = 4; i < 8; i++) ctx->s.a[i + 7] = out[i]; \ + for (i = 0; i < 4; i++) ctx->s.a[13 - 3*i] = out[i]; \ + for (i = 4; i < 8; i++) ctx->s.a[27 - 3*i] = out[i]; \ salsa20_setnonce(&ctx->s, n + 16); \ } SALSA20_VARS(DEFXNONCE) @@@ -663,7 -695,7 +695,7 @@@ static void grdestroy(grand *r static const grand_ops grops_rand_##rr = { \ SALSA20_NAME_##rr, GRAND_CRYPTO, 0, \ grmisc, grdestroy, grword, \ - grbyte, grword, grand_range, grfill \ + grbyte, grword, grand_defaultrange, grfill \ }; \ \ grand *SALSA20_DECOR(salsa20, rr, _rand) \ @@@ -706,7 -738,7 +738,7 @@@ SALSA20_VARS(DEFGRAND static const grand_ops grxops_rand_##rr = { \ "x" SALSA20_NAME_##rr, GRAND_CRYPTO, 0, \ grmisc, grxdestroy_##rr, grword, \ - grbyte, grword, grand_range, grfill \ + grbyte, grword, grand_defaultrange, grfill \ }; \ \ grand *SALSA20_DECOR(xsalsa20, rr, _rand) \ @@@ -730,23 -762,31 +762,31 @@@ SALSA20_VARS(DEFXGRAND #include #include + 
static const int perm[] = { + 0, 13, 10, 7, + 4, 1, 14, 11, + 8, 5, 2, 15, + 12, 9, 6, 3 + }; + #define DEFVCORE(r) \ static int v_core_##r(dstr *v) \ { \ salsa20_matrix a, b; \ dstr d = DSTR_INIT; \ - int i, n; \ + int i, j, n; \ int ok = 1; \ \ DENSURE(&d, SALSA20_OUTSZ); d.len = SALSA20_OUTSZ; \ n = *(int *)v[0].buf; \ for (i = 0; i < SALSA20_OUTSZ/4; i++) \ - a[i] = LOAD32_L(v[1].buf + 4*i); \ + b[i] = LOAD32_L(v[1].buf + 4*i); \ for (i = 0; i < n; i++) { \ + for (j = 0; j < 16; j++) a[perm[j]] = b[j]; \ core(r, a, b); \ memcpy(a, b, sizeof(a)); \ } \ - for (i = 0; i < SALSA20_OUTSZ/4; i++) STORE32_L(d.buf + 4*i, a[i]); \ + for (i = 0; i < SALSA20_OUTSZ/4; i++) STORE32_L(d.buf + 4*i, b[i]); \ \ if (d.len != v[2].len || memcmp(d.buf, v[2].buf, v[2].len) != 0) { \ ok = 0; \