Merge branch 'mdw/cpu-dispatch'
author Mark Wooding <mdw@distorted.org.uk>
Wed, 18 May 2016 09:29:03 +0000 (10:29 +0100)
committer Mark Wooding <mdw@distorted.org.uk>
Sat, 21 May 2016 16:56:04 +0000 (17:56 +0100)
* mdw/cpu-dispatch:
  Add support machinery for ARM hosts.
  base/dispatch.c: Add (unused) machinery for probing ELF auxiliary vector.
  Add support for AMD64 processors and Microsoft Windows.
  symm/rijndael-x86-aesni.S: Unify encryption and decryption with a macro.
  symm/rijndael-x86-aesni.S: Use xmm5 instead of xmm7.
  symm/*.S: Symbolic names for shuffles.
  symm/chacha-x86-sse2.S: Fix the register allocation comment.
  Preprocess the assembler files.
  configure.ac: Improve the host CPU family detection.
  base/dispatch.c: Indent some preprocessor definitions properly.
  Add a pile of debug output around the CPU dispatching machinery.
  base/dispatch.c: Add documentation for some internal functions.
  base/dispatch.c: Add in more useful section markers.
  Support Intel's AES Native Instructions where available on x86 hardware.
  symm/: New SSE2 implementations of Salsa20 and ChaCha.
  symm/salsa20.c, symm/salsa20-core.h: Permute input matrix for SIMD.
  debian/rules: Run tests twice, once without any detected CPU features.
  base/dispatch.c: Check operating system support for XMM registers.
  configure.ac, base/dispatch.[ch]: CPU-specific implementations.
  configure.ac: Arrange to have an assembler available.

Conflicts:
configure.ac
symm/Makefile.am

1  2 
configure.ac
symm/Makefile.am
symm/chacha.c
symm/salsa20.c

diff --combined configure.ac
@@@ -31,7 -31,8 +31,8 @@@ mdw_AUTO_VERSIO
  AC_INIT([catacomb], AUTO_VERSION, [mdw@distorted.org.uk])
  AC_CONFIG_SRCDIR([catacomb.pc.in])
  AC_CONFIG_AUX_DIR([config])
 -AM_INIT_AUTOMAKE([foreign parallel-tests])
 +AM_INIT_AUTOMAKE([foreign parallel-tests color-tests subdir-objects])
+ AC_CANONICAL_HOST
  mdw_SILENT_RULES
  
  AC_PROG_CC
@@@ -39,11 -40,102 +40,102 @@@ AX_CFLAGS_WARN_AL
  AM_PROG_LIBTOOL
  mdw_LIBTOOL_VERSION_INFO
  
+ AM_PROG_AS
  AC_PROG_YACC
  
  AC_SUBST(AM_CFLAGS)
  
  dnl--------------------------------------------------------------------------
+ dnl Host-specific configuration.
+ AC_MSG_CHECKING([CPU family and ABI])
+ dnl The table of CPU families and ABIs which we might support.  Support is
+ dnl not uniform: each dispatched function might or might not have an
+ dnl implementation for any particular CPU/ABI combination.
+ AC_DEFUN([catacomb_CPU_FAMILIES],
+   [$1([i[[3-6]]86,cygwin], [x86], [win])
+    $1([i[[3-6]]86,*], [x86], [sysv])
+    $1([x86_64,cygwin], [amd64], [win])
+    $1([x86_64,*], [amd64], [sysv])
+    $1([armv*,*-gnueabi | armv*,*-gnueabihf], [armel], [gnueabi])])
+ dnl A utility to clear the `seen' flags, used so as to process each CPU or
+ dnl ABI once.
+ m4_define([catacomb_CLEAR_FLAGS],
+ [m4_ifdef([catacomb_seen_cpu/$2],
+         [m4_undefine([catacomb_seen_cpu/$2])])dnl
+ m4_ifdef([catacomb_seen_abi/$3],
+         [m4_undefine([catacomb_seen_abi/$3])])])
+ dnl Identify the current host.
+ case $host_cpu,$host_os in
+   m4_define([catacomb_CPU_CASE],
+     [$1) CPUFAM=$2 ABI=$3 ;;
+ ])
+   catacomb_CPU_FAMILIES([catacomb_CPU_CASE])
+   *) CPUFAM=nil ABI=nil ;;
+ esac
+ dnl Figure out the current CPU.
+ catacomb_CPU_FAMILIES([catacomb_CLEAR_FLAGS])
+ case $CPUFAM in
+   m4_define([catacomb_DEFINE_CPU],
+     [m4_ifdef([catacomb_seen_cpu/$2], [],
+       [$2)
+       AC_DEFINE([CPUFAM_]m4_translit([$2], [a-z], [A-Z]), [1],
+                 [Define if host CPU family is \`$2\'.])
+       ;;m4_define([catacomb_seen_cpu/$2], [t])])])
+   catacomb_CPU_FAMILIES([catacomb_DEFINE_CPU])
+   nil) ;;
+   *) AC_MSG_ERROR([BUG: unexpected cpufam \`$CPUFAM']) ;;
+ esac
+ AC_SUBST([CPUFAM])
+ dnl Figure out the current ABI.
+ catacomb_CPU_FAMILIES([catacomb_CLEAR_FLAGS])
+ case $ABI in
+   m4_define([catacomb_DEFINE_ABI],
+     [m4_ifdef([catacomb_seen_abi/$3], [],
+       [$3)
+       AC_DEFINE([ABI_]m4_translit([$3], [a-z], [A-Z]), [1],
+                 [Define if host ABI variant is \`$3\'.])
+       ;;m4_define([catacomb_seen_abi/$3], [t])])])
+   catacomb_CPU_FAMILIES([catacomb_DEFINE_ABI])
+   nil) ;;
+   *) AC_MSG_ERROR([BUG: unexpected ABI \`$ABI']) ;;
+ esac
+ AC_SUBST([ABI])
+ dnl Establish Automake conditions for things.
+ catacomb_CPU_FAMILIES([catacomb_CLEAR_FLAGS])
+ m4_define([catacomb_COND_CPU],
+ [m4_define([_CPU], m4_translit([$2], [a-z], [A-Z]))
+ m4_define([_ABI], m4_translit([$3], [a-z], [A-Z]))
+ AM_CONDITIONAL([CPUABI_]_CPU[_]_ABI, [test x$CPUFAM/$ABI = x$2/$3])
+ m4_ifdef([catacomb_seen_cpu/$2], [],
+ [AM_CONDITIONAL([CPUFAM_]_CPU, [test x$CPUFAM = x$2])dnl
+ m4_define([catacomb_seen_cpu/$2], [t])])
+ m4_ifdef([catacomb_seen_abi/$3], [],
+ [AM_CONDITIONAL([ABI_]_ABI, [test x$ABI = x$3])dnl
+ m4_define([catacomb_seen_abi/$3], [t])])])
+ catacomb_CPU_FAMILIES([catacomb_COND_CPU])
+ AM_CONDITIONAL([KNOWN_CPUFAM], [test x$CPUFAM != xnil])
+ dnl Report on what we found.
+ case $CPUFAM in
+   nil) AC_MSG_RESULT([not supported]) ;;
+   *) AC_MSG_RESULT([$CPUFAM/$ABI]) ;;
+ esac
+ dnl Some equipment wanted for checking CPU features at runtime.
+ AC_CHECK_HEADERS([asm/hwcap.h])
+ AC_CHECK_HEADERS([sys/auxv.h])
+ AC_CHECK_HEADERS([linux/auxvec.h])
+ AC_CHECK_FUNCS([getauxval])
+ dnl--------------------------------------------------------------------------
  dnl C programming environment.
  
  dnl Find out if we're cross-compiling.
@@@ -156,7 -248,7 +248,7 @@@ dnl Memory locking support
  AC_CHECK_FUNCS([mlock])
  
  dnl Necessary support libraries.
 -PKG_CHECK_MODULES([mLib], [mLib >= 2.2.1])
 +PKG_CHECK_MODULES([mLib], [mLib >= 2.2.2.1])
  AM_CFLAGS="$AM_CFLAGS $mLib_CFLAGS"
  
  dnl--------------------------------------------------------------------------
diff --combined symm/Makefile.am
@@@ -102,23 -102,22 +102,23 @@@ pkginclude_HEADERS      += $(BLKC_H
  
  ## Schneier's `Blowfish' block cipher.
  BLKCS                 += blowfish
 -libsymm_la_SOURCES    += $(precomp)/blowfish-tab.c
 -PRECOMPS              += $(precomp)/blowfish-tab.c
 +nodist_libsymm_la_SOURCES += ../precomp/symm/blowfish-tab.c
 +PRECOMPS              += $(precomp)/symm/blowfish-tab.c
  PRECOMP_PROGS         += blowfish-mktab
  blowfish_mktab_CPPFLAGS        = $(AM_CPPFLAGS) -DQUIET
  if !CROSS_COMPILING
 -$(precomp)/blowfish-tab.c:
 -      $(AM_V_at)$(MKDIR_P) $(precomp)
 -      $(AM_V_at)$(MAKE) blowfish-mktab$e
 -      $(AM_V_GEN)./blowfish-mktab >$(precomp)/blowfish-tab.c.new && \
 -              mv $(precomp)/blowfish-tab.c.new $(precomp)/blowfish-tab.c
 +$(precomp)/symm/blowfish-tab.c:
 +      $(AM_V_at)$(MKDIR_P) $(precomp)/symm
 +      $(AM_V_at)$(MAKE) blowfish-mktab$(EXEEXT)
 +      $(AM_V_GEN)./blowfish-mktab >$(precomp)/symm/blowfish-tab.c.new && \
 +              mv $(precomp)/symm/blowfish-tab.c.new \
 +                      $(precomp)/symm/blowfish-tab.c
  endif
  
  ## Adams and Tavares' `CAST' block ciphers.
  BLKCS                 += cast128 cast256
  libsymm_la_SOURCES    += cast-s.c cast-sk.c cast-base.h
 -cast256.$t: t/cast256
 +cast256.t$(EXEEXT): t/cast256
  EXTRA_DIST            += t/cast256.aes
  MAINTAINERCLEANFILES  += $(srcdir)/t/cast256
  t/cast256: t/cast256.aes
  
  ## IBM's `DES' block cipher, by Feistel, Coppersmith, and others.
  BLKCS                 += des des3
 -libsymm_la_SOURCES    += des-base.h $(precomp)/des-tab.c
 -PRECOMPS              += $(precomp)/des-tab.c
 +libsymm_la_SOURCES    += des-base.h
 +nodist_libsymm_la_SOURCES += ../precomp/symm/des-tab.c
 +PRECOMPS              += $(precomp)/symm/des-tab.c
  PRECOMP_PROGS         += des-mktab
  if !CROSS_COMPILING
 -$(precomp)/des-tab.c:
 -      $(AM_V_at)$(MKDIR_P) $(precomp)
 -      $(AM_V_at)$(MAKE) des-mktab$e
 -      $(AM_V_GEN)./des-mktab >$(precomp)/des-tab.c.new && \
 -              mv $(precomp)/des-tab.c.new $(precomp)/des-tab.c
 +$(precomp)/symm/des-tab.c:
 +      $(AM_V_at)$(MKDIR_P) $(precomp)/symm
 +      $(AM_V_at)$(MAKE) des-mktab$(EXEEXT)
 +      $(AM_V_GEN)./des-mktab >$(precomp)/symm/des-tab.c.new && \
 +              mv $(precomp)/symm/des-tab.c.new $(precomp)/symm/des-tab.c
  endif
  
  ## Rivest's `DESX' variant, with pre- and post-whitening.
@@@ -150,17 -148,17 +150,17 @@@ BLKCS                   += ide
  
  ## IBM's `MARS' block cipher.
  BLKCS                 += mars
 -libsymm_la_SOURCES    += $(precomp)/mars-tab.c
 -PRECOMPS              += $(precomp)/mars-tab.c
 +nodist_libsymm_la_SOURCES += ../precomp/symm/mars-tab.c
 +PRECOMPS              += $(precomp)/symm/mars-tab.c
  PRECOMP_PROGS         += mars-mktab
  if !CROSS_COMPILING
 -$(precomp)/mars-tab.c:
 -      $(AM_V_at)$(MKDIR_P) $(precomp)
 -      $(AM_V_at)$(MAKE) mars-mktab$e
 -      $(AM_V_GEN)./mars-mktab >$(precomp)/mars-tab.c.new && \
 -              mv $(precomp)/mars-tab.c.new $(precomp)/mars-tab.c
 +$(precomp)/symm/mars-tab.c:
 +      $(AM_V_at)$(MKDIR_P) $(precomp)/symm
 +      $(AM_V_at)$(MAKE) mars-mktab$(EXEEXT)
 +      $(AM_V_GEN)./mars-mktab >$(precomp)/symm/mars-tab.c.new && \
 +              mv $(precomp)/symm/mars-tab.c.new $(precomp)/symm/mars-tab.c
  endif
 -mars.$t: t/mars
 +mars.t$(EXEEXT): t/mars
  EXTRA_DIST            += t/mars.aes
  MAINTAINERCLEANFILES  += $(srcdir)/t/mars
  t/mars: t/mars.aes
@@@ -182,18 -180,23 +182,24 @@@ BLKCS                   += rc
  ## Daemen and Rijmen's `Rijndael' block cipher, selected as AES.
  BLKCS                 += rijndael rijndael192 rijndael256
  libsymm_la_SOURCES    += rijndael-base.h rijndael-base.c
 -libsymm_la_SOURCES    += $(precomp)/rijndael-tab.c
 -PRECOMPS              += $(precomp)/rijndael-tab.c
+ if CPUFAM_X86
+ libsymm_la_SOURCES    += rijndael-x86ish-aesni.S
+ endif
+ if CPUFAM_AMD64
+ libsymm_la_SOURCES    += rijndael-x86ish-aesni.S
+ endif
 +nodist_libsymm_la_SOURCES += ../precomp/symm/rijndael-tab.c
 +PRECOMPS              += $(precomp)/symm/rijndael-tab.c
  PRECOMP_PROGS         += rijndael-mktab
  if !CROSS_COMPILING
 -$(precomp)/rijndael-tab.c:
 -      $(AM_V_at)$(MKDIR_P) $(precomp)
 -      $(AM_V_at)$(MAKE) rijndael-mktab$e
 -      $(AM_V_GEN)./rijndael-mktab >$(precomp)/rijndael-tab.c.new && \
 -              mv $(precomp)/rijndael-tab.c.new $(precomp)/rijndael-tab.c
 +$(precomp)/symm/rijndael-tab.c:
 +      $(AM_V_at)$(MKDIR_P) $(precomp)/symm
 +      $(AM_V_at)$(MAKE) rijndael-mktab$(EXEEXT)
 +      $(AM_V_GEN)./rijndael-mktab >$(precomp)/symm/rijndael-tab.c.new && \
 +              mv $(precomp)/symm/rijndael-tab.c.new \
 +                      $(precomp)/symm/rijndael-tab.c
  endif
 -rijndael.$t: t/rijndael
 +rijndael.t$(EXEEXT): t/rijndael
  EXTRA_DIST            += t/rijndael.aes
  MAINTAINERCLEANFILES  += $(srcdir)/t/rijndael
  t/rijndael: t/rijndael.aes
  
  ## Massey's `SAFER' block ciphers.
  BLKCS                 += safer safersk
 -libsymm_la_SOURCES    += $(precomp)/safer-tab.c
 -PRECOMPS              += $(precomp)/safer-tab.c
 +nodist_libsymm_la_SOURCES += ../precomp/symm/safer-tab.c
 +PRECOMPS              += $(precomp)/symm/safer-tab.c
  PRECOMP_PROGS         += safer-mktab
  STUBS_HDR             += SAFER-SK,safersk,safer
  if !CROSS_COMPILING
 -$(precomp)/safer-tab.c:
 -      $(AM_V_at)$(MKDIR_P) $(precomp)
 -      $(AM_V_at)$(MAKE) safer-mktab$e
 -      $(AM_V_GEN)./safer-mktab >$(precomp)/safer-tab.c.new && \
 -              mv $(precomp)/safer-tab.c.new $(precomp)/safer-tab.c
 +$(precomp)/symm/safer-tab.c:
 +      $(AM_V_at)$(MKDIR_P) $(precomp)/symm
 +      $(AM_V_at)$(MAKE) safer-mktab$(EXEEXT)
 +      $(AM_V_GEN)./safer-mktab >$(precomp)/symm/safer-tab.c.new && \
 +              mv $(precomp)/symm/safer-tab.c.new \
 +                      $(precomp)/symm/safer-tab.c
  endif
  
  ## Anderson, Biham and Knudsen's `Serpent' block cipher.
@@@ -222,7 -224,7 +228,7 @@@ BLKCS                      += serpen
  libsymm_la_SOURCES    += serpent-sbox.h
  check_PROGRAMS                += serpent-check
  TESTS                 += serpent-check
 -serpent.$t: t/serpent
 +serpent.t$(EXEEXT): t/serpent
  EXTRA_DIST            += t/serpent.aes
  MAINTAINERCLEANFILES  += $(srcdir)/t/serpent
  t/serpent: t/serpent.aes
@@@ -238,16 -240,15 +244,16 @@@ libsymm_la_SOURCES      += skipjack-tab.
  
  ## Daemen and Rijmen's `Square' block cipher.
  BLKCS                 += square
 -libsymm_la_SOURCES    += $(precomp)/square-tab.c
 -PRECOMPS              += $(precomp)/square-tab.c
 +nodist_libsymm_la_SOURCES += ../precomp/symm/square-tab.c
 +PRECOMPS              += $(precomp)/symm/square-tab.c
  PRECOMP_PROGS         += square-mktab
  if !CROSS_COMPILING
 -$(precomp)/square-tab.c:
 -      $(AM_V_at)$(MKDIR_P) $(precomp)
 -      $(AM_V_at)$(MAKE) square-mktab$e
 -      $(AM_V_GEN)./square-mktab >$(precomp)/square-tab.c.new && \
 -              mv $(precomp)/square-tab.c.new $(precomp)/square-tab.c
 +$(precomp)/symm/square-tab.c:
 +      $(AM_V_at)$(MKDIR_P) $(precomp)/symm
 +      $(AM_V_at)$(MAKE) square-mktab$(EXEEXT)
 +      $(AM_V_GEN)./square-mktab >$(precomp)/symm/square-tab.c.new && \
 +              mv $(precomp)/symm/square-tab.c.new \
 +                      $(precomp)/symm/square-tab.c
  endif
  
  ## Wheeler and Needham's `TEA' and `XTEA' block ciphers.
@@@ -256,18 -257,17 +262,18 @@@ BLKCS                   += tea xte
  ## Schneier, Kelsey, Whiting, Wagner, Hall and Ferguson's `Twofish' block
  ## cipher.
  BLKCS                 += twofish
 -libsymm_la_SOURCES    += $(precomp)/twofish-tab.c
 -PRECOMPS              += $(precomp)/twofish-tab.c
 +nodist_libsymm_la_SOURCES += ../precomp/symm/twofish-tab.c
 +PRECOMPS              += $(precomp)/symm/twofish-tab.c
  PRECOMP_PROGS         += twofish-mktab
  if !CROSS_COMPILING
 -$(precomp)/twofish-tab.c:
 -      $(AM_V_at)$(MKDIR_P) $(precomp)
 -      $(AM_V_at)$(MAKE) twofish-mktab$e
 -      $(AM_V_GEN)./twofish-mktab >$(precomp)/twofish-tab.c.new && \
 -              mv $(precomp)/twofish-tab.c.new $(precomp)/twofish-tab.c
 +$(precomp)/symm/twofish-tab.c:
 +      $(AM_V_at)$(MKDIR_P) $(precomp)/symm
 +      $(AM_V_at)$(MAKE) twofish-mktab$(EXEEXT)
 +      $(AM_V_GEN)./twofish-mktab >$(precomp)/symm/twofish-tab.c.new && \
 +              mv $(precomp)/symm/twofish-tab.c.new \
 +                      $(precomp)/symm/twofish-tab.c
  endif
 -twofish.$t: t/twofish
 +twofish.t$(EXEEXT): t/twofish
  EXTRA_DIST            += t/twofish.aes
  MAINTAINERCLEANFILES  += $(srcdir)/t/twofish
  t/twofish: t/twofish.aes
@@@ -322,33 -322,29 +328,33 @@@ STUBS_HDR               += SHA-384,sha384,sha51
  
  ## Anderson and Biham's `Tiger' hash function.
  HASHES                        += tiger
 -libsymm_la_SOURCES    += tiger-base.h $(precomp)/tiger-tab.c
 -PRECOMPS              += $(precomp)/tiger-tab.c
 +libsymm_la_SOURCES    += tiger-base.h
 +nodist_libsymm_la_SOURCES += ../precomp/symm/tiger-tab.c
 +PRECOMPS              += $(precomp)/symm/tiger-tab.c
  PRECOMP_PROGS         += tiger-mktab
  if !CROSS_COMPILING
 -$(precomp)/tiger-tab.c:
 -      $(AM_V_at)$(MKDIR_P) $(precomp)
 -      $(AM_V_at)$(MAKE) tiger-mktab$e
 -      $(AM_V_GEN)./tiger-mktab >$(precomp)/tiger-tab.c.new && \
 -              mv $(precomp)/tiger-tab.c.new $(precomp)/tiger-tab.c
 +$(precomp)/symm/tiger-tab.c:
 +      $(AM_V_at)$(MKDIR_P) $(precomp)/symm
 +      $(AM_V_at)$(MAKE) tiger-mktab$(EXEEXT)
 +      $(AM_V_GEN)./tiger-mktab >$(precomp)/symm/tiger-tab.c.new && \
 +              mv $(precomp)/symm/tiger-tab.c.new \
 +                      $(precomp)/symm/tiger-tab.c
  endif
  
  ## Barreto and Rijmen's `Whirlpool' hash function.
  HASHES                        += whirlpool whirlpool256
 -libsymm_la_SOURCES    += $(precomp)/whirlpool-tab.c
 -PRECOMPS              += $(precomp)/whirlpool-tab.c
 +nodist_libsymm_la_SOURCES += ../precomp/symm/whirlpool-tab.c
 +PRECOMPS              += $(precomp)/symm/whirlpool-tab.c
  PRECOMP_PROGS         += whirlpool-mktab
  STUBS_HDR             += Whirlpool-256,whirlpool256,whirlpool
  if !CROSS_COMPILING
 -$(precomp)/whirlpool-tab.c:
 -      $(AM_V_at)$(MKDIR_P) $(precomp)
 -      $(AM_V_at)$(MAKE) whirlpool-mktab$e
 -      $(AM_V_GEN)./whirlpool-mktab >$(precomp)/whirlpool-tab.c.new && \
 -              mv $(precomp)/whirlpool-tab.c.new $(precomp)/whirlpool-tab.c
 +$(precomp)/symm/whirlpool-tab.c:
 +      $(AM_V_at)$(MKDIR_P) $(precomp)/symm
 +      $(AM_V_at)$(MAKE) whirlpool-mktab$(EXEEXT)
 +      $(AM_V_GEN)./whirlpool-mktab \
 +                      >$(precomp)/symm/whirlpool-tab.c.new && \
 +              mv $(precomp)/symm/whirlpool-tab.c.new \
 +                      $(precomp)/symm/whirlpool-tab.c
  endif
  
  ## Bellare, Canetti and Krawczyk's `HMAC' mode for message authentication.
@@@ -368,14 -364,14 +374,14 @@@ ALL_HASHES              += crc32=gcrc3
  ## Rivest's `RC4' stream cipher.
  pkginclude_HEADERS    += rc4.h
  libsymm_la_SOURCES    += rc4.c
 -TESTS                 += rc4.$t
 +TESTS                 += rc4.t$(EXEEXT)
  EXTRA_DIST            += t/rc4
  ALL_CIPHERS           += rc4
  
  ## Coppersmith and Rogaway's `SEAL' pseudorandom function.
  pkginclude_HEADERS    += seal.h
  libsymm_la_SOURCES    += seal.c
 -TESTS                 += seal.$t
 +TESTS                 += seal.t$(EXEEXT)
  EXTRA_DIST            += t/seal
  ALL_CIPHERS           += seal
  
  EXTRA_DIST            += salsa20-tvconv
  pkginclude_HEADERS    += salsa20.h salsa20-core.h
  libsymm_la_SOURCES    += salsa20.c
 -TESTS                 += salsa20.$t
+ if CPUFAM_X86
+ libsymm_la_SOURCES    += salsa20-x86ish-sse2.S
+ endif
+ if CPUFAM_AMD64
+ libsymm_la_SOURCES    += salsa20-x86ish-sse2.S
+ endif
 +TESTS                 += salsa20.t$(EXEEXT)
  ALL_CIPHERS           += salsa20 salsa2012 salsa208
  ALL_CIPHERS           += xsalsa20 xsalsa2012 xsalsa208
  STUBS_HDR             += Salsa20/12,salsa2012,salsa20
@@@ -398,7 -400,7 +410,7 @@@ STUBS_HDR          += XSalsa20/12,xsalsa2012,sa
  STUBS_HDR             += XSalsa20/8,xsalsa208,salsa20
  EXTRA_DIST            += t/salsa20
  MAINTAINERCLEANFILES  += t/salsa20
 -salsa20.$t: t/salsa20
 +salsa20.t$(EXEEXT): t/salsa20
  SALSA20_ESTREAM_TV     = t/salsa20.estream
  SALSA20_ESTREAM_TV    += t/salsa2012.estream
  SALSA20_ESTREAM_TV    += t/salsa208.estream
@@@ -414,7 -416,13 +426,13 @@@ t/salsa20: salsa20-tvconv t/salsa20.loc
  ## Bernstein's `ChaCha' stream cipher.
  pkginclude_HEADERS    += chacha.h chacha-core.h
  libsymm_la_SOURCES    += chacha.c
 -TESTS                 += chacha.$t
+ if CPUFAM_X86
+ libsymm_la_SOURCES    += chacha-x86ish-sse2.S
+ endif
+ if CPUFAM_AMD64
+ libsymm_la_SOURCES    += chacha-x86ish-sse2.S
+ endif
 +TESTS                 += chacha.t$(EXEEXT)
  EXTRA_DIST            += t/chacha
  ALL_CIPHERS           += chacha20 chacha12 chacha8
  ALL_CIPHERS           += xchacha20 xchacha12 xchacha8
@@@ -525,6 -533,6 +543,6 @@@ EXTRA_DIST         += $(SYMM_TEST_FILES
  EXTRA_DIST            += daftstory.h
  
  ## Clean the debris from the `modes' subdirectory.
 -CLEANFILES            += modes/*.to modes/*.$t
 +CLEANFILES            += modes/*.to modes/*.t$(EXEEXT)
  
  ###----- That's all, folks --------------------------------------------------
diff --combined symm/chacha.c
@@@ -27,6 -27,8 +27,8 @@@
  
  /*----- Header files ------------------------------------------------------*/
  
+ #include "config.h"
  #include <stdarg.h>
  
  #include <mLib/bits.h>
@@@ -34,6 -36,7 +36,7 @@@
  #include "arena.h"
  #include "chacha.h"
  #include "chacha-core.h"
+ #include "dispatch.h"
  #include "gcipher.h"
  #include "grand.h"
  #include "keysz.h"
@@@ -59,9 -62,29 +62,29 @@@ const octet chacha_keysz[] = { KSZ_SET
   *            the feedforward step.
   */
  
- static void core(unsigned r, const chacha_matrix src, chacha_matrix dest)
+ CPU_DISPATCH(static, (void),
+            void, core, (unsigned r, const chacha_matrix src,
+                         chacha_matrix dest),
+            (r, src, dest),
+            pick_core, simple_core);
+ static void simple_core(unsigned r, const chacha_matrix src,
+                       chacha_matrix dest)
    { CHACHA_nR(dest, src, r); CHACHA_FFWD(dest, src); }
  
+ #if CPUFAM_X86 || CPUFAM_AMD64
+ extern core__functype chacha_core_x86ish_sse2;
+ #endif
+ static core__functype *pick_core(void)
+ {
+ #if CPUFAM_X86 || CPUFAM_AMD64
+   DISPATCH_PICK_COND(chacha_core, chacha_core_x86ish_sse2,
+                    cpu_feature_p(CPUFEAT_X86_SSE2));
+ #endif
+   DISPATCH_PICK_FALLBACK(chacha_core, simple_core);
+ }
  /* --- @populate@ --- *
   *
   * Arguments: @chacha_matrix a@ = a matrix to fill in
@@@ -672,7 -695,7 +695,7 @@@ static void grdestroy(grand *r
    static const grand_ops grops_rand_##rr = {                          \
      "chacha" #rr, GRAND_CRYPTO, 0,                                    \
      grmisc, grdestroy, grword,                                                \
 -    grbyte, grword, grand_range, grfill                                       \
 +    grbyte, grword, grand_defaultrange, grfill                                \
    };                                                                  \
                                                                        \
    grand *chacha##rr##_rand(const void *k, size_t ksz, const void *n)  \
@@@ -714,7 -737,7 +737,7 @@@ CHACHA_VARS(DEFGRAND
    static const grand_ops grxops_rand_##rr = {                         \
      "xchacha" #rr, GRAND_CRYPTO, 0,                                   \
      grmisc, grxdestroy_##rr, grword,                                  \
 -    grbyte, grword, grand_range, grfill                                       \
 +    grbyte, grword, grand_defaultrange, grfill                                \
    };                                                                  \
                                                                        \
    grand *xchacha##rr##_rand(const void *k, size_t ksz, const void *n) \
diff --combined symm/salsa20.c
@@@ -7,11 -7,14 +7,14 @@@
  
  /*----- Header files ------------------------------------------------------*/
  
+ #include "config.h"
  #include <stdarg.h>
  
  #include <mLib/bits.h>
  
  #include "arena.h"
+ #include "dispatch.h"
  #include "gcipher.h"
  #include "grand.h"
  #include "keysz.h"
@@@ -39,9 -42,29 +42,29 @@@ const octet salsa20_keysz[] = { KSZ_SET
   *            the feedforward step.
   */
  
- static void core(unsigned r, const salsa20_matrix src, salsa20_matrix dest)
+ CPU_DISPATCH(static, (void),
+            void, core, (unsigned r, const salsa20_matrix src,
+                         salsa20_matrix dest),
+            (r, src, dest),
+            pick_core, simple_core);
+ static void simple_core(unsigned r, const salsa20_matrix src,
+                       salsa20_matrix dest)
    { SALSA20_nR(dest, src, r); SALSA20_FFWD(dest, src); }
  
+ #if CPUFAM_X86 || CPUFAM_AMD64
+ extern core__functype salsa20_core_x86ish_sse2;
+ #endif
+ static core__functype *pick_core(void)
+ {
+ #if CPUFAM_X86 || CPUFAM_AMD64
+   DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86ish_sse2,
+                    cpu_feature_p(CPUFEAT_X86_SSE2));
+ #endif
+   DISPATCH_PICK_FALLBACK(salsa20_core, simple_core);
+ }
  /* --- @populate@ --- *
   *
   * Arguments: @salsa20_matrix a@ = a matrix to fill in
@@@ -61,33 -84,42 +84,42 @@@ static void populate(salsa20_matrix a, 
  
    KSZ_ASSERT(salsa20, ksz);
  
-   a[ 1] = LOAD32_L(k +  0);
-   a[ 2] = LOAD32_L(k +  4);
+   /* Here's the pattern of key, constant, nonce, and counter pieces in the
+    * matrix, before and after our permutation.
+    *
+    * [ C0 K0 K1 K2 ]     [ C0 C1 C2 C3 ]
+    * [ K3 C1 N0 N1 ]  -->  [ K3 T1 K7 K2 ]
+    * [ T0 T1 C2 K4 ]     [ T0 K6 K1 N1 ]
+    * [ K5 K6 K7 C3 ]     [ K5 K0 N0 K4 ]
+    */
+   a[13] = LOAD32_L(k +  0);
+   a[10] = LOAD32_L(k +  4);
    if (ksz == 10) {
-     a[ 3] = LOAD16_L(k +  8);
+     a[ 7] = LOAD16_L(k +  8);
      a[ 4] = 0;
    } else {
-     a[ 3] = LOAD32_L(k +  8);
+     a[ 7] = LOAD32_L(k +  8);
      a[ 4] = LOAD32_L(k + 12);
    }
    if (ksz <= 16) {
-     a[11] = a[ 1];
-     a[12] = a[ 2];
-     a[13] = a[ 3];
-     a[14] = a[ 4];
+     a[15] = a[13];
+     a[12] = a[10];
+     a[ 9] = a[ 7];
+     a[ 6] = a[ 4];
      a[ 0] = SALSA20_A128;
-     a[ 5] = SALSA20_B128;
-     a[10] = ksz == 10 ? SALSA20_C80 : SALSA20_C128;
-     a[15] = SALSA20_D128;
+     a[ 1] = SALSA20_B128;
+     a[ 2] = ksz == 10 ? SALSA20_C80 : SALSA20_C128;
+     a[ 3] = SALSA20_D128;
    } else {
-     a[11] = LOAD32_L(k + 16);
+     a[15] = LOAD32_L(k + 16);
      a[12] = LOAD32_L(k + 20);
-     a[13] = LOAD32_L(k + 24);
-     a[14] = LOAD32_L(k + 28);
+     a[ 9] = LOAD32_L(k + 24);
+     a[ 6] = LOAD32_L(k + 28);
      a[ 0] = SALSA20_A256;
-     a[ 5] = SALSA20_B256;
-     a[10] = SALSA20_C256;
-     a[15] = SALSA20_D256;
+     a[ 1] = SALSA20_B256;
+     a[ 2] = SALSA20_C256;
+     a[ 3] = SALSA20_D256;
    }
  }
  
@@@ -130,8 -162,8 +162,8 @@@ void salsa20_setnonce(salsa20_ctx *ctx
  {
    const octet *n = nonce;
  
-   ctx->a[6] = LOAD32_L(n + 0);
-   ctx->a[7] = LOAD32_L(n + 4);
+   ctx->a[14] = LOAD32_L(n + 0);
+   ctx->a[11] = LOAD32_L(n + 4);
    salsa20_seek(ctx, 0);
  }
  
@@@ -153,7 -185,7 +185,7 @@@ void salsa20_seek(salsa20_ctx *ctx, uns
  
  void salsa20_seeku64(salsa20_ctx *ctx, kludge64 i)
  {
-   ctx->a[8] = LO64(i); ctx->a[9] = HI64(i);
+   ctx->a[8] = LO64(i); ctx->a[5] = HI64(i);
    ctx->bufi = SALSA20_OUTSZ;
  }
  
@@@ -169,7 -201,7 +201,7 @@@ unsigned long salsa20_tell(salsa20_ctx 
    { kludge64 i = salsa20_tellu64(ctx); return (GET64(unsigned long, i)); }
  
  kludge64 salsa20_tellu64(salsa20_ctx *ctx)
-   { kludge64 i; SET64(i, ctx->a[9], ctx->a[8]); return (i); }
+   { kludge64 i; SET64(i, ctx->a[5], ctx->a[8]); return (i); }
  
  /* --- @salsa20{,12,8}_encrypt@ --- *
   *
@@@ -272,10 -304,10 +304,10 @@@ SALSA20_VARS(DEFENCRYPT
       * speed critical, so we do it the harder way.                    \
       */                                                                       \
                                                                        \
-     for (i = 0; i < 4; i++) k[i + 6] = src[i];                                \
+     for (i = 0; i < 4; i++) k[14 - 3*i] = src[i];                     \
      core(r, k, a);                                                    \
-     for (i = 0; i < 4; i++) dest[i] = a[5*i] - k[5*i];                        \
-     for (i = 4; i < 8; i++) dest[i] = a[i + 2] - k[i + 2];            \
+     for (i = 0; i < 4; i++) dest[i] = a[5*i] - k[i];                  \
+     for (i = 4; i < 8; i++) dest[i] = a[i + 2] - k[26 - 3*i];         \
    }                                                                   \
                                                                        \
    void HSALSA20_PRF(r, salsa20_ctx *ctx, const void *src, void *dest) \
@@@ -340,9 -372,9 +372,9 @@@ SALSA20_VARS(DEFHSALSA20
                                                                        \
      populate(ctx->k, key, ksz);                                               \
      ctx->s.a[ 0] = SALSA20_A256;                                      \
-     ctx->s.a[ 5] = SALSA20_B256;                                      \
-     ctx->s.a[10] = SALSA20_C256;                                      \
-     ctx->s.a[15] = SALSA20_D256;                                      \
+     ctx->s.a[ 1] = SALSA20_B256;                                      \
+     ctx->s.a[ 2] = SALSA20_C256;                                      \
+     ctx->s.a[ 3] = SALSA20_D256;                                      \
      XSALSA20_SETNONCE(r, ctx, nonce ? nonce : zerononce);             \
    }
  SALSA20_VARS(DEFXINIT)
                                                                        \
      for (i = 0; i < 4; i++) in[i] = LOAD32_L(n + 4*i);                        \
      HSALSA20_RAW(r, ctx->k, in, out);                                 \
-     for (i = 0; i < 4; i++) ctx->s.a[i + 1] = out[i];                 \
-     for (i = 4; i < 8; i++) ctx->s.a[i + 7] = out[i];                 \
+     for (i = 0; i < 4; i++) ctx->s.a[13 - 3*i] = out[i];              \
+     for (i = 4; i < 8; i++) ctx->s.a[27 - 3*i] = out[i];              \
      salsa20_setnonce(&ctx->s, n + 16);                                        \
    }
  SALSA20_VARS(DEFXNONCE)
@@@ -663,7 -695,7 +695,7 @@@ static void grdestroy(grand *r
    static const grand_ops grops_rand_##rr = {                          \
      SALSA20_NAME_##rr, GRAND_CRYPTO, 0,                                       \
      grmisc, grdestroy, grword,                                                \
 -    grbyte, grword, grand_range, grfill                                       \
 +    grbyte, grword, grand_defaultrange, grfill                                \
    };                                                                  \
                                                                        \
    grand *SALSA20_DECOR(salsa20, rr, _rand)                            \
@@@ -706,7 -738,7 +738,7 @@@ SALSA20_VARS(DEFGRAND
    static const grand_ops grxops_rand_##rr = {                         \
      "x" SALSA20_NAME_##rr, GRAND_CRYPTO, 0,                           \
      grmisc, grxdestroy_##rr, grword,                                  \
 -    grbyte, grword, grand_range, grfill                                       \
 +    grbyte, grword, grand_defaultrange, grfill                                \
    };                                                                  \
                                                                        \
    grand *SALSA20_DECOR(xsalsa20, rr, _rand)                           \
@@@ -730,23 -762,31 +762,31 @@@ SALSA20_VARS(DEFXGRAND
  #include <mLib/quis.h>
  #include <mLib/testrig.h>
  
+ static const int perm[] = {
+    0, 13, 10,  7,
+    4,  1, 14, 11,
+    8,  5,  2, 15,
+   12,  9,  6,  3
+ };
  #define DEFVCORE(r)                                                   \
    static int v_core_##r(dstr *v)                                      \
    {                                                                   \
      salsa20_matrix a, b;                                              \
      dstr d = DSTR_INIT;                                                       \
-     int i, n;                                                         \
+     int i, j, n;                                                      \
      int ok = 1;                                                               \
                                                                        \
      DENSURE(&d, SALSA20_OUTSZ); d.len = SALSA20_OUTSZ;                        \
      n = *(int *)v[0].buf;                                             \
      for (i = 0; i < SALSA20_OUTSZ/4; i++)                             \
-       a[i] = LOAD32_L(v[1].buf + 4*i);                                        \
+       b[i] = LOAD32_L(v[1].buf + 4*i);                                        \
      for (i = 0; i < n; i++) {                                         \
+       for (j = 0; j < 16; j++) a[perm[j]] = b[j];                     \
        core(r, a, b);                                                  \
        memcpy(a, b, sizeof(a));                                                \
      }                                                                 \
-     for (i = 0; i < SALSA20_OUTSZ/4; i++) STORE32_L(d.buf + 4*i, a[i]);       \
+     for (i = 0; i < SALSA20_OUTSZ/4; i++) STORE32_L(d.buf + 4*i, b[i]);       \
                                                                        \
      if (d.len != v[2].len || memcmp(d.buf, v[2].buf, v[2].len) != 0) {        \
        ok = 0;                                                         \