From d3f33b9a37f45bc5809af314b1b436b712d59a80 Mon Sep 17 00:00:00 2001
From: Mark Wooding <mdw@distorted.org.uk>
Date: Thu, 1 Feb 2024 12:33:33 +0000
Subject: [PATCH] base/permute.h, utils/permute.lisp, symm/...: Formalize bit
 permutations.

Add a bunch of Lisp code for messing with permutations.  Add a C header
file with macros for implementing interesting primitive permutations.
Apply these to DES and Keccak-1600 as a demonstration.
---
 base/Makefile.am   |   3 +
 base/permute.h     | 285 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 symm/des-base.h    |  89 +++++++++++++----
 symm/des.c         |  11 +++
 symm/des3.c        |  11 +++
 symm/desx.c        |  11 +++
 symm/keccak1600.c  |  47 ++++-----
 utils/permute.lisp | 280 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 695 insertions(+), 42 deletions(-)
 create mode 100644 base/permute.h
 create mode 100644 utils/permute.lisp

diff --git a/base/Makefile.am b/base/Makefile.am
index 8b7c0fcd..f4c99732 100644
--- a/base/Makefile.am
+++ b/base/Makefile.am
@@ -42,6 +42,9 @@ libbase_la_SOURCES	+= arena.c
 pkginclude_HEADERS	+= ct.h
 libbase_la_SOURCES	+= ct.c ct-test.c
 
+## Bit permutations.
+pkginclude_HEADERS	+= permute.h
+
 ## CPU-specific dispatch.
 pkginclude_HEADERS	+= dispatch.h
 libbase_la_SOURCES	+= dispatch.c
diff --git a/base/permute.h b/base/permute.h
new file mode 100644
index 00000000..ef8eb169
--- /dev/null
+++ b/base/permute.h
@@ -0,0 +1,285 @@
+/* -*-c-*-
+ *
+ * Bit permutations
+ *
+ * (c) 2024 Straylight/Edgeware
+ */
+
+/*----- Licensing notice --------------------------------------------------*
+ *
+ * This file is part of Catacomb.
+ *
+ * Catacomb is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU Library General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Catacomb is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with Catacomb.  If not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+ * USA.
+ */
+
+#ifndef CATACOMB_PERMUTE_H
+#define CATACOMB_PERMUTE_H
+
+#ifdef __cplusplus
+  extern "C" {
+#endif
+
+/*----- Header files ------------------------------------------------------*/
+
+#include <mLib/macros.h>
+
+/*----- Macros provided ---------------------------------------------------*/
+
+/* --- Theory lesson --- *
+ *
+ * It's often useful to rearrange the bits in a word, or a value split across
+ * two (or more) words, so it's worth taking a moment to consider how this
+ * might be done efficiently.  Throughout this discussion, we use the
+ * standard bit numbering, where the least significant bit in a word is bit
+ * zero, with numbering increasing with significance.  Equivalently, bit
+ * %$k$% has the numerical value %$2^k$%.
+ *
+ * An essential primitive is the `swizzle', which exchanges two similarly
+ * arranged but disjoint groups of bits within a word which are separated by
+ * a constant offset.  The groups of bits don't have to be contiguous, but
+ * they must be identified by shifts of the same mask.
+ *
+ * An especially important class of swizzle permutations considers the
+ * individual bits of bit indices.  Permutations of the bits in a word can be
+ * interpreted as operations on the bits of the indices.  For %$i \ge 0$%,
+ * let %$\mu_i$% be the mask such that bit %$k$% of %$\mu_i$% is set if and
+ * only if bit %$i$% is clear in %$k$%.  Hence
+ *
+ *	%$\mu_0 = (\ldots 01010101010101010101010101010101)_2 = -1/3$% ,
+ *	%$\mu_1 = (\ldots 00110011001100110011001100110011)_2 = -1/5$% ,
+ *	%$\mu_2 = (\ldots 00001111000011110000111100001111)_2 = -1/17$% ,
+ *	%$\mu_3 = (\ldots 00000000111111110000000011111111)_2 = -1/257$% ,
+ *	etc.
+ *
+ * In general, the low %$2^i$% bits of %$\mu_i$% are set, the next least
+ * significant %$2^i$% bits are clear, the next %$2^i$% bits are set, and so
+ * on.  Hence, %$\mu_i \lsl 2^i = \bar{\mu}_i$%, or, in the %$2$%-adic
+ * numbers %$\Z_2$%, %$2^{2^i} \mu_k = -1 - \mu_i$%, whence, in general,
+ * %$\mu_i = -1/(2^{2^i} + 1)$%.  Let %$x$% be some binary value; now we can
+ * describe some important swizzles.
+ *
+ *   * Let %$y=(x\bitand\mu_i)\lsl 2^i \bitor (x\bitand\bar{\mu}_i)\lsr 2^i%,
+ *     or %$y = (x\bitand\mu_i) \lsl 2^k \bitor (x \lsr 2^k)\bitand\mu_i$%.
+ *     This exchanges the two sub-blocks of %$2^i$% bits in each %$2^{i+1}$%
+ *     block in %$x$%.  In terms of indices, now the bits at indices in which
+ *     bit %$i$% is set precede those in which bit %$i$% is clear.
+ *     %%\emph{We have inverted index bit %$i$%.}%%
+ *
+ *   * Suppose that %$i < j$%, and let %$m = \bar{\mu}_i \bitand \mu_j$% and
+ *     %$s = 2^j - 2^i$%; let %$y = (x \bitand m) \lsl s \bitor {}$%
+ *     %$(x \lsr s)\bitand m \bitor (x\bitand \overline{m \bitor m \lsl s$%.
+ *     Now, %$m$% has its bit %$k$% set if and only if bit %$i$% of %$k$% is
+ *     set and bit %$j$% of %$k$% is clear.  The related mask %$m \lsl s$%
+ *     has bit %$k + s$% set if %$k% has the same property; but, %$k$%
+ *     will have bit %$i$% set and bit %$j$% clear if and only if bit %$i$%
+ *     is clear and bit %$j$% is set in %$k + s$%.  Combined, the mask
+ *     %$m \bitor (m \lsl s)$% selects bits at indices in which bits %$i$%
+ *     and %$j$% differ, so %$\overline{m \bitor (m \lsl s)}$% selects the
+ *     bits at indices where bits %$i$% and %$j$% are equal.
+ *
+ *     This swizzle therefore exchanges the bits of %$x$% at indices where
+ *     bit %$i$% is set and bit %$j$% is clear with those at indices where
+ *     bit %$j$% is set and %$i$% is clear, leaving alone those bits at
+ *     indices where bits %$i$% and %$j$% are either both clear or both set.
+ *     %%\emph{We have exchanged index bits %$i$% and %$j$%.}%%
+ *
+ *   * Rounding off this little collection, suppose again that %$i < j$%, and
+ *     let %$m = \mu_i \bitand \mu_j$% and %$s = 2^i + 2^j$%; and again, let
+ *     %$y = (x \bitand m) \lsl s \bitor (x \lsr s) \bitand m \bitor {}$%
+ *     %$(x \bitand \overline{m \bitor m \lsl s$%.  Now, %$m$% has its bit
+ *     %$k$% set if and only if bits %$i$% and %$j$% of %$k$% are both clear.
+ *     This swizzle therefore exchanges the bits of %$x$% at indices where
+ *     bits %$i$% and %$j$% are both clear with those at indices where
+ *     bits %$i$% and %$j$% are both set, leaving alone those bits at indices
+ *     where bits %$i$% and %$j$% differ.  It takes a little work to (left as
+ *     an exercise) to see that the effect combines the previous two.
+ *     %%\emph{We have exchanged and inverted index bits %$i$% and %$j$%.}%%
+ *
+ * Related is the `twizzle', which exchanges similarly arranged groups of
+ * bits within two different words.  This can be seen as a multiprecision
+ * variant of the swizzle.
+ *
+ * The machinery here expects some parameters to have been defined:
+ *
+ *   * @regty@ should be an unsigned integer type, and
+ *
+ *   * @REGWD@ should be a power of two such that @regty@ can store at least
+ *     @REGWD@ bits.
+ */
+
+/* We begin with some internal utilities.  @CATACOMB__REPLICATE_n_(x)@
+ * produces a hexadecimal constant consisting of %$n$% copies of the digits
+ * @x@.
+ */
+#define CATACOMB__REPLICATE_16_(x) CATACOMB__REPLICATE_8_(GLUE(x, x))
+#define CATACOMB__REPLICATE_8_(x) CATACOMB__REPLICATE_4_(GLUE(x, x))
+#define CATACOMB__REPLICATE_4_(x) CATACOMB__REPLICATE_2_(GLUE(x, x))
+#define CATACOMB__REPLICATE_2_(x) CATACOMB__REPLICATE_1_(GLUE(x, x))
+#define CATACOMB__REPLICATE_1_(x) GLUE(0x, x)
+
+/* More internal utilities.  @CATACOMB__REPLi_Un(x)@ returns an %$n$%-bit
+ * hexadecimal constant formed by replicating the %$i$%-bit constant (which
+ * must have leading zeros) %$n/i$% times.
+ */
+#define CATACOMB__REPL8_U8 CATACOMB__REPLICATE_1_
+#define CATACOMB__REPL8_U16 CATACOMB__REPLICATE_2_
+#define CATACOMB__REPL8_U32 CATACOMB__REPLICATE_4_
+#define CATACOMB__REPL8_U64 CATACOMB__REPLICATE_8_
+#define CATACOMB__REPL8_U128 CATACOMB__REPLICATE_16_
+
+#define CATACOMB__REPL16_U16 CATACOMB__REPLICATE_1_
+#define CATACOMB__REPL16_U32 CATACOMB__REPLICATE_2_
+#define CATACOMB__REPL16_U64 CATACOMB__REPLICATE_4_
+#define CATACOMB__REPL16_U128 CATACOMB__REPLICATE_8_
+
+#define CATACOMB__REPL32_U32 CATACOMB__REPLICATE_1_
+#define CATACOMB__REPL32_U64 CATACOMB__REPLICATE_2_
+#define CATACOMB__REPL32_U128 CATACOMB__REPLICATE_4_
+
+#define CATACOMB__REPL64_U64 CATACOMB__REPLICATE_1_
+#define CATACOMB__REPL64_U128 CATACOMB__REPLICATE_2_
+
+#define CATACOMB__REPL128_U128 CATACOMB__REPLICATE_1_
+
+/* Finally, @CATACOMB__REPLi(x)@ returns a hexadecimal constant formed by
+ * replicating the %$i$%-bit constant (including leading zeros) sufficiently
+ * many times as to fill a @REGWD@-bit wide register.
+ */
+#define CATACOMB__REPL8(x) GLUE(CATACOMB__REPL8_U, REGWD)(x)
+#define CATACOMB__REPL16(x) GLUE(CATACOMB__REPL16_U, REGWD)(x)
+#define CATACOMB__REPL32(x) GLUE(CATACOMB__REPL32_U, REGWD)(x)
+#define CATACOMB__REPL64(x) GLUE(CATACOMB__REPL64_U, REGWD)(x)
+#define CATACOMB__REPL128(x) GLUE(CATACOMB__REPL128_U, REGWD)(x)
+
+/* The macro @CATACOMB__IXMASK_Bi(_)@ evaluates to the low @REGWD@ bits of
+ * the constant %$\mu_i$% defined above.  The argument is ignored; it's
+ * necessary to prevent technical problems with macro expansion
+ * (specifically, to allow the blue paint on @GLUE@ to be washed off before
+ * invoking @CATACOMB__REPLi@).
+ */
+#define CATACOMB__IXMASK_B0(_) CATACOMB__REPL8(55)
+#define CATACOMB__IXMASK_B1(_) CATACOMB__REPL8(33)
+#define CATACOMB__IXMASK_B2(_) CATACOMB__REPL8(0f)
+#define CATACOMB__IXMASK_B3(_) CATACOMB__REPL16(00ff)
+#define CATACOMB__IXMASK_B4(_) CATACOMB__REPL32(0000ffff)
+#define CATACOMB__IXMASK_B5(_) CATACOMB__REPL64(00000000ffffffff)
+#define CATACOMB__IXMASK_B6(_)						\
+	CATACOMB__REPL128(0000000000000000ffffffffffffffff)
+
+/* @IXMASK(i)@ returns the low @REGWD@ bits of %$\mu_i$%.  The argument @i@
+ * must be a decimal integer constant, without leading zeros.
+ */
+#define IXMASK(i) GLUE(CATACOMB__IXMASK_B, i)(hunoz)
+
+/* @IXMASK_xy(i, j)@ returns a @REGWD@-bit mask in which bit %$k$% is set if
+ * bit %$i$% of %$k$% is equal to %$x$% and bit %$j$% of %$k$% is equal to
+ * %$y$%.  The arguments @i@ and @j@ must be decimal integer constants,
+ * without leading zeros.
+ */
+#define IXMASK_00(i, j) (IXMASK(i)&IXMASK(j))
+#define IXMASK_01(i, j) (IXMASK(i)&~IXMASK(j))
+#define IXMASK_10(i, j) (~IXMASK(i)&IXMASK(j))
+#define IXMASK_11(i, j) (~IXMASK(i)&~IXMASK(j))
+
+/* The general swizzle operation.  Exchange the bits in @x@ selected by
+ * @mask@ with those selected by @mask << shift@.
+ */
+#define SWIZZLE(x, shift, mask) do {					\
+  regty _t = ((x) ^ ((x) >> (shift)))&(mask);				\
+  (x) ^= _t | (_t << (shift));						\
+} while (0)
+
+/* A swizzle on two words @x@ and @y@, using the same shift, but different
+ * masks @mask0@ and @mask1@.  This is just a simple abbreviation.
+ */
+#define SWIZZLE_2(x, y, shift, mask0, mask1) do {			\
+  SWIZZLE(x, shift, mask0); SWIZZLE(y, shift, mask1);			\
+} while (0)
+
+/* A `twizzle', or a swizzle across two words.
+ *
+ * The @TWIZZLE_0@ macro exchanges the bits of @x@ and @y selected by
+ * @mask@.  The @TWIZZLE_L@ and @TWIZZLE_R@ macros exchange the bits selected
+ * by @mask@ in @y@ with the bits in @x@ selected by @mask << shift@ or
+ * @mask >> shift@ respectively.  (The names are from the direction in which
+ * @x@ is shifted, not the direction the mask is shifted.)
+ *
+ * These are used to synthesize swizzles within multiprecision words: if the
+ * intended shift is %$a w + b$%, where %$w$% is the word width, then %$a$%
+ * gives the difference between word indices of the words to be processed,
+ * and %$\abs{b$}% gives the @shift@ argument; use @TWIZZLE_R@ if
+ * %$b \ge 0$%, @TWIZZLE_L@ if %$b \le 0$% is nonpositive, or @TWIZZLE_0@ if
+ * %$b = 0$%.  (We can easily distinguish which of %$a w \pm b$% or
+ * %$(a \pm 1) w \mp (w - b)$%, since one kind of shift will keep @mask@
+ * within the same word, and the other will shift it out completely.)
+ */
+#define TWIZZLE_0(x, y, mask) do {					\
+  regty _t = ((y) ^ ((x)))&(mask);					\
+  (x) ^= _t; (y) ^= _t;							\
+} while (0)
+#define TWIZZLE_L(x, y, shift, mask) do {				\
+  regty _t = ((y) ^ ((x) << (shift)))&(mask);				\
+  (x) ^= _t >> (shift); (y) ^= _t;					\
+} while (0)
+#define TWIZZLE_R(x, y, shift, mask) do {				\
+  regty _t = ((y) ^ ((x) >> (shift)))&(mask);				\
+  (x) ^= _t << (shift); (y) ^= _t;					\
+} while (0)
+
+/* @SWIZZLE_CPL@ applies a swizzle to @x@ which complements index bit @i@;
+ * @SWIZZLE_EXCH@ applies a swizzle to exchange index bits @i@ and @j@; and
+ * @SWIZZLE_XCPL@ applies a swizzle to exchange and invert index bits @i@ and
+ * @j@.  The arguments @i@ and @j@ must be decimal integer constants without
+ * leading zeros, with %$i \le j$%.  (The macros do nothing if %$i = j$%.)
+ *
+ * The variants with @2@ in their names act identically on @x@ and @y@, and
+ * are intended as a simple convenience.
+ */
+#define SWIZZLE_CPL(x, i) SWIZZLE(x, (1 << (i)), IXMASK(i))
+#define SWIZZLE_EXCH(x, i, j)						\
+	SWIZZLE(x, (1 << (j)) - (1 << (i)), IXMASK_10(i, j))
+#define SWIZZLE_XCPL(x, i, j)						\
+	SWIZZLE(x, (1 << (j)) + (1 << (i)), IXMASK_00(i, j))
+
+#define SWIZZLE_CPL2(x, y, i)						\
+	SWIZZLE_2(x, y, (1 << (i)), IXMASK(i), IXMASK(i))
+#define SWIZZLE_EXCH2(x, y, i, j)					\
+	SWIZZLE_2(x, y, (1 << (j)) - (1 << (i)),			\
+		  IXMASK_10(i, j), IXMASK_10(i, j))
+#define SWIZZLE_XCPL2(x, y, i, j)					\
+	SWIZZLE_2(x, y, (1 << (j)) + (1 << (i)),			\
+		  IXMASK_00(i, j), IXMASK_00(i, j))
+
+/* The @TWIZZLE_EXCH@ and @TWIZZLE_XCPL@ macros act like the similarly named
+ * @SWIZZLE_...@ macros above, except that (a) they act on two words from a
+ * multiprecision value, and the @j@ index is implicit in the selection of
+ * the operands @x@ and @y@.  If the word width is %$w$%, and %$2^j = n w$%,
+ * then %$x$% should be chosen to be %$n$% slots more significant than
+ * %$y$%.
+ *
+ * Note that there is no @TWIZZLE_CPL@, since this would simply involve
+ * exchanging two entries in an array.
+ */
+#define TWIZZLE_EXCH(x, y, i) TWIZZLE_L(x, y, (1 << (i)), ~IXMASK(i))
+#define TWIZZLE_XCPL(x, y, i) TWIZZLE_R(x, y, (1 << (i)), IXMASK(i))
+
+/*----- That's all, folks -------------------------------------------------*/
+
+#ifdef __cplusplus
+  }
+#endif
+
+#endif
diff --git a/symm/des-base.h b/symm/des-base.h
index 49bbed7e..7e55e044 100644
--- a/symm/des-base.h
+++ b/symm/des-base.h
@@ -36,6 +36,10 @@
 
 #include <mLib/bits.h>
 
+#ifndef CATACOMB_PERMUTE_H
+#  include "permute.h"
+#endif
+
 /*----- External data -----------------------------------------------------*/
 
 extern const uint32 des_sp[8][64];
@@ -68,32 +72,77 @@ extern const uint32 des_sp[8][64];
  * The cryptographically useless initial and final permutations.  The initial
  * permutation also rotates the two block halves left by one place.  This is
  * undone by the inverse permutation at the end.
+ *
+ * The initial permutation is traditionally given by the table
+ *
+ *	58 50 42 34 26 18 10  2
+ *	60 52 44 36 28 20 12  4
+ *	62 54 46 38 30 22 14  6
+ *	64 56 48 40 32 24 16  8
+ *	57 49 41 33 25 17  9  1
+ *	59 51 43 35 27 19 11  3
+ *	61 53 45 37 29 21 13  5
+ *	63 55 47 39 31 23 15  7
+ *
+ * The bit numbering is terrible.  If the two halves are X and Y, then the
+ * numbering starts with the most significant bit of X, which is bit 1,
+ * working down towards the least significant bit, and then continuing with
+ * the bits of Y, again in order of decreasing significance.  The table
+ * entries are in this same order, indicating that `bit 1' of the output is
+ * `bit 58' of the input and so on.
+ *
+ * I opt instead to number the bits starting from the least significant bit
+ * of Y, which is bit 0, up to the most significant bit of X, which is
+ * bit 63.  This means that we need to reverse the table (because we're going
+ * to read it in the other direction) and subtract each entry from 64 (to
+ * correct the bit numbering).  The resulting table is
+ *
+ *	57 49 41 33 25 17  9  1
+ *	59 51 43 35 27 19 11  3
+ *	61 53 45 37 29 21 13  5
+ *	63 55 47 39 31 23 15  7
+ *	56 48 40 32 24 16  8  0
+ *	58 50 42 34 26 18 10  2
+ *	60 52 44 36 28 20 12  4
+ *	62 54 46 38 30 22 14  6
+ *
+ * which, interestingly, is /also/ what you get if you just subtract one from
+ * each of the original table's entries.
+ *
+ * If we look at this table in binary, the patterns are much clearer.
+ *
+ *	111001 110001 101001 100001 011001 010001 001001 000001
+ *	111011 110011 101011 100011 011011 010011 001011 000011
+ *	111101 110101 101101 100101 011101 010101 001101 000101
+ *	111111 110111 101111 100111 011111 010111 001111 000111
+ *	111000 110000 101000 100000 011000 010000 001000 000000
+ *	111010 110010 101010 100010 011010 010010 001010 000010
+ *	111100 110100 101100 100100 011100 010100 001100 000100
+ *	111110 110110 101110 100110 011110 010110 001110 000110
+ *
+ * We can implement this efficiently using our permutation machinery.
+ * Writing ~k for index bit @k@ inverted, this permutation reflects the index
+ * transformation given by ~0, 2, 1, ~5, ~4, ~3.  The following sequence of
+ * operations is traditional.  Essentially this is an antitranspose -- a
+ * reflection about the antidiagonal -- followed by a couple of fixup stages.
  */
 
-#define DES_IP(x, y) do {						\
-  uint32 _t;								\
-  _t = (y ^ (x >> 4)) & 0x0f0f0f0f; y ^= _t; x ^= _t << 4;		\
-  _t = (x ^ (x >> 18)) & 0x00003333; x ^= _t; x ^= _t << 18;		\
-  _t = (y ^ (y >> 18)) & 0x00003333; y ^= _t; y ^= _t << 18;		\
-  _t = (x ^ (x >> 9)) & 0x00550055; x ^= _t; x ^= _t << 9;		\
-  _t = (y ^ (y >> 9)) & 0x00550055; y ^= _t; y ^= _t << 9;		\
-  _t = (x ^ (x >> 24)) & 0x000000ff; x ^= _t; x ^= _t << 24;		\
-  _t = (y ^ (y >> 24)) & 0x000000ff; y ^= _t; y ^= _t << 24;		\
-  _t = (y ^ (x >> 16)) & 0x0000ffff; y ^= _t; x ^= _t << 16;		\
+#define DES_IP(x, y) do {		/*  5,  4,  3,  2,  1,  0 */	\
+  TWIZZLE_XCPL (x, y, 2);		/* ~2,  4,  3, ~5,  1,  0 */	\
+  SWIZZLE_XCPL2(x, y, 1, 4);		/* ~2, ~1,  3, ~5, ~4,  0 */	\
+  SWIZZLE_XCPL2(x, y, 0, 3);		/* ~2, ~1, ~0, ~5, ~4, ~3 */	\
+  SWIZZLE_XCPL2(x, y, 3, 4);		/* ~2,  0,  1, ~5, ~4, ~3 */	\
+  TWIZZLE_XCPL (x, y, 4);		/* ~0,  2,  1, ~5, ~4, ~3 */	\
   x = ROL32(x, 1); y = ROL32(y, 1);					\
 } while (0)
 
 #define DES_IPINV(x, y) do {						\
-  uint32 _t;								\
-  x = ROR32(x, 1); y = ROR32(y, 1);					\
-  _t = (y ^ (x >> 16)) & 0x0000ffff; y ^= _t; x ^= _t << 16;		\
-  _t = (x ^ (x >> 24)) & 0x000000ff; x ^= _t; x ^= _t << 24;		\
-  _t = (y ^ (y >> 24)) & 0x000000ff; y ^= _t; y ^= _t << 24;		\
-  _t = (y ^ (x >> 4)) & 0x0f0f0f0f; y ^= _t; x ^= _t << 4;		\
-  _t = (x ^ (x >> 18)) & 0x00003333; x ^= _t; x ^= _t << 18;		\
-  _t = (y ^ (y >> 18)) & 0x00003333; y ^= _t; y ^= _t << 18;		\
-  _t = (x ^ (x >> 9)) & 0x00550055; x ^= _t; x ^= _t << 9;		\
-  _t = (y ^ (y >> 9)) & 0x00550055; y ^= _t; y ^= _t << 9;		\
+  x = ROR32(x, 1); y = ROR32(y, 1);	/* ~0,  2,  1, ~5, ~4, ~3 */	\
+  TWIZZLE_XCPL (x, y, 4);		/* ~2,  0,  1, ~5, ~4, ~3 */	\
+  SWIZZLE_XCPL2(x, y, 3, 4);		/* ~2, ~1, ~0, ~5, ~4, ~3 */	\
+  SWIZZLE_XCPL2(x, y, 0, 3);		/* ~2, ~1,  3, ~5, ~4,  0 */	\
+  SWIZZLE_XCPL2(x, y, 1, 4);		/* ~2,  4,  3, ~5,  1,  0 */	\
+  TWIZZLE_XCPL (x, y, 2);		/*  5,  4,  3,  2,  1,  0 */	\
 } while (0)
 
 /* --- @DES_EBLK@, @DES_DBLK@ --- *
diff --git a/symm/des.c b/symm/des.c
index 06400a15..e751cf0e 100644
--- a/symm/des.c
+++ b/symm/des.c
@@ -37,6 +37,7 @@
 #include "blkc.h"
 #include "des-base.h"
 #include "des.h"
+#include "permute.h"
 #include "gcipher.h"
 
 /*----- Global variables --------------------------------------------------*/
@@ -258,20 +259,30 @@ void des_init(des_ctx *k, const void *buf, size_t sz)
 
 void des_eblk(const des_ctx *k, const uint32 *s, uint32 *d)
 {
+#define REGWD 32
+  typedef uint32 regty;
+
   uint32 x = s[0], y = s[1];
   DES_IP(x, y);
   DES_EBLK(k->k, x, y, x, y);
   DES_IPINV(x, y);
   d[0] = x, d[1] = y;
+
+#undef REGWD
 }
 
 void des_dblk(const des_ctx *k, const uint32 *s, uint32 *d)
 {
+#define REGWD 32
+  typedef uint32 regty;
+
   uint32 x = s[0], y = s[1];
   DES_IP(x, y);
   DES_DBLK(k->k, x, y, x, y);
   DES_IPINV(x, y);
   d[0] = x, d[1] = y;
+
+#undef REGWD
 }
 
 BLKC_TEST(DES, des)
diff --git a/symm/des3.c b/symm/des3.c
index dedd9f28..92a6f2af 100644
--- a/symm/des3.c
+++ b/symm/des3.c
@@ -38,6 +38,7 @@
 #include "des-base.h"
 #include "des.h"
 #include "des3.h"
+#include "permute.h"
 #include "gcipher.h"
 
 /*----- Global variables --------------------------------------------------*/
@@ -92,6 +93,9 @@ void des3_init(des3_ctx *k, const void *buf, size_t sz)
 
 void des3_eblk(const des3_ctx *k, const uint32 *s, uint32 *d)
 {
+#define REGWD 32
+  typedef uint32 regty;
+
   uint32 x = s[0], y = s[1];
   DES_IP(x, y);
   DES_EBLK(k->a.k, x, y, x, y);
@@ -99,10 +103,15 @@ void des3_eblk(const des3_ctx *k, const uint32 *s, uint32 *d)
   DES_EBLK(k->c.k, x, y, x, y);
   DES_IPINV(x, y);
   d[0] = x, d[1] = y;
+
+#undef REGWD
 }
 
 void des3_dblk(const des3_ctx *k, const uint32 *s, uint32 *d)
 {
+#define REGWD 32
+  typedef uint32 regty;
+
   uint32 x = s[0], y = s[1];
   DES_IP(x, y);
   DES_DBLK(k->c.k, x, y, x, y);
@@ -110,6 +119,8 @@ void des3_dblk(const des3_ctx *k, const uint32 *s, uint32 *d)
   DES_DBLK(k->a.k, x, y, x, y);
   DES_IPINV(x, y);
   d[0] = x, d[1] = y;
+
+#undef REGWD
 }
 
 BLKC_TEST(DES3, des3)
diff --git a/symm/desx.c b/symm/desx.c
index 14115a57..e3737ccb 100644
--- a/symm/desx.c
+++ b/symm/desx.c
@@ -39,6 +39,7 @@
 #include "des.h"
 #include "desx.h"
 #include "gcipher.h"
+#include "permute.h"
 
 /*----- Tables ------------------------------------------------------------*/
 
@@ -123,6 +124,9 @@ void desx_init(desx_ctx *k, const void *buf, size_t sz)
 
 void desx_eblk(const desx_ctx *k, const uint32 *s, uint32 *d)
 {
+#define REGWD 32
+  typedef uint32 regty;
+
   uint32 x = s[0], y = s[1];
   x ^= k->prea; y ^= k->preb;
   DES_IP(x, y);
@@ -130,10 +134,15 @@ void desx_eblk(const desx_ctx *k, const uint32 *s, uint32 *d)
   DES_IPINV(x, y);
   x ^= k->posta; y ^= k->postb;
   d[0] = x, d[1] = y;
+
+#undef REGWD
 }
 
 void desx_dblk(const desx_ctx *k, const uint32 *s, uint32 *d)
 {
+#define REGWD 32
+  typedef uint32 regty;
+
   uint32 x = s[0], y = s[1];
   x ^= k->posta; y ^= k->postb;
   DES_IP(x, y);
@@ -141,6 +150,8 @@ void desx_dblk(const desx_ctx *k, const uint32 *s, uint32 *d)
   DES_IPINV(x, y);
   x ^= k->prea; y ^= k->preb;
   d[0] = x, d[1] = y;
+
+#undef REGWD
 }
 
 BLKC_TEST(DESX, desx)
diff --git a/symm/keccak1600.c b/symm/keccak1600.c
index cfbfdefe..a4a6564f 100644
--- a/symm/keccak1600.c
+++ b/symm/keccak1600.c
@@ -33,6 +33,7 @@
 #include <mLib/bits.h>
 
 #include "keccak1600.h"
+#include "permute.h"
 
 /* #define KECCAK_DEBUG */
 
@@ -66,25 +67,22 @@ static lane interlace(kludge64 x)
 {
   /* Given a 64-bit string X, return a lane Z containing the even- and
    * odd-numbered bits of X.
-   *
-   * This becomes more manageable if we look at what happens to the bit
-   * indices: bit i of X becomes bit ROR_6(i, 1) of Z.  We can effectively
-   * swap two bits of the indices by swapping the object bits where those
-   * index bits differ.  Fortunately, this is fairly easy.
-   *
-   * We arrange to swap bits between the two halves of X, rather than within
-   * a half.
    */
 
-  uint32 x0 = LO64(x), x1 = HI64(x), t;
+typedef uint32 regty;
+#define REGWD 32
+
+  uint32 x0 = LO64(x), x1 = HI64(x);
   lane z;
-							    /* 543210 */
-  t = ((x0 >> 16) ^ x1)&0x0000ffff; x0 ^= t << 16; x1 ^= t; /* 453210 */
-  t = ((x0 >>  8) ^ x1)&0x00ff00ff; x0 ^= t <<  8; x1 ^= t; /* 354210 */
-  t = ((x0 >>  4) ^ x1)&0x0f0f0f0f; x0 ^= t <<  4; x1 ^= t; /* 254310 */
-  t = ((x0 >>  2) ^ x1)&0x33333333; x0 ^= t <<  2; x1 ^= t; /* 154320 */
-  t = ((x0 >>  1) ^ x1)&0x55555555; x0 ^= t <<  1; x1 ^= t; /* 054321 */
+					/* 5, 4, 3, 2, 1, 0 */
+  TWIZZLE_EXCH(x1, x0, 4);		/* 4, 5, 3, 2, 1, 0 */
+  TWIZZLE_EXCH(x1, x0, 3);		/* 3, 5, 4, 2, 1, 0 */
+  TWIZZLE_EXCH(x1, x0, 2);		/* 2, 5, 4, 3, 1, 0 */
+  TWIZZLE_EXCH(x1, x0, 1);		/* 1, 5, 4, 3, 2, 0 */
+  TWIZZLE_EXCH(x1, x0, 0);		/* 0, 5, 4, 3, 2, 1 */
   z.even = x0; z.odd = x1; return (z);
+
+#undef REGWD
 }
 
 static kludge64 deinterlace(lane x)
@@ -93,15 +91,20 @@ static kludge64 deinterlace(lane x)
    * to `interlace' above, and the principle is the same
    */
 
-  uint32 x0 = x.even, x1 = x.odd, t;
+typedef uint32 regty;
+#define REGWD 32
+
+  uint32 x0 = x.even, x1 = x.odd;
   kludge64 z;
-							    /* 054321 */
-  t = ((x0 >>  1) ^ x1)&0x55555555; x0 ^= t <<  1; x1 ^= t; /* 154320 */
-  t = ((x0 >>  2) ^ x1)&0x33333333; x0 ^= t <<  2; x1 ^= t; /* 254310 */
-  t = ((x0 >>  4) ^ x1)&0x0f0f0f0f; x0 ^= t <<  4; x1 ^= t; /* 354210 */
-  t = ((x0 >>  8) ^ x1)&0x00ff00ff; x0 ^= t <<  8; x1 ^= t; /* 453210 */
-  t = ((x0 >> 16) ^ x1)&0x0000ffff; x0 ^= t << 16; x1 ^= t; /* 543210 */
+					/* 0, 5, 4, 3, 2, 1 */
+  TWIZZLE_EXCH(x1, x0, 0);		/* 1, 5, 4, 3, 2, 0 */
+  TWIZZLE_EXCH(x1, x0, 1);		/* 2, 5, 4, 3, 1, 0 */
+  TWIZZLE_EXCH(x1, x0, 2);		/* 3, 5, 4, 2, 1, 0 */
+  TWIZZLE_EXCH(x1, x0, 3);		/* 4, 5, 3, 2, 1, 0 */
+  TWIZZLE_EXCH(x1, x0, 4);		/* 5, 4, 3, 2, 1, 0 */
   SET64(z, x1, x0); return (z);
+
+#undef REGWD
 }
 
 #define TO_LANE(x) (interlace(x))
diff --git a/utils/permute.lisp b/utils/permute.lisp
new file mode 100644
index 00000000..372f5cfc
--- /dev/null
+++ b/utils/permute.lisp
@@ -0,0 +1,280 @@
+;;; -*-lisp-*-
+
+;;; This file isn't a program as such: rather, it's a collection of handy
+;;; functions which can be used in an interactive session.
+
+;;;--------------------------------------------------------------------------
+;;; General permutation utilities.
+
+(defun shuffle (v)
+  "Randomly permute the elements of the vector V.  Return V."
+  (let ((n (length v)))
+    (do ((k n (1- k)))
+	((<= k 1) v)
+      (let ((i (random k)))
+	(unless (= i (1- k))
+	  (rotatef (aref v i) (aref v (1- k))))))))
+
+(defun identity-permutation (n)
+  "Return the do-nothing permutation on N elements."
+  (let ((v (make-array n :element-type 'fixnum)))
+    (dotimes (i n v) (setf (aref v i) i))))
+
+(defun invert-permutation (p)
+  "Given a permutation P, return its inverse."
+  (let* ((n (length p)) (p-inv (make-array n :element-type 'fixnum)))
+    (dotimes (i n) (setf (aref p-inv (aref p i)) i))
+    p-inv))
+
+(defun next-permutation (v)
+  "Adjust V so that it reflects the next permutation in ascending order.
+
+   V should be a vector of real numbers.  Returns V if successful, or nil if
+   there are no more permutations."
+
+  ;; The tail of the vector consists of a sequence ... A, Z, Y, X, ..., where
+  ;; Z > Y > X ... is in reverse order, and A < Z.  The next permutation is
+  ;; then the smallest out of Z, Y, X, ... which is larger than A, followed
+  ;; by the remaining elements in ascending order.
+  ;;
+  ;; Equivalently, reverse the tail Z, Y, X, ... so we have A, ... X, Y, Z,
+  ;; and swap A with the next larger element.
+
+  (let ((n (length v)))
+    (cond ((< n 2) nil)
+	  (t (let* ((k (1- n))
+		    (x (aref v k)))
+	       (loop (when (zerop k) (return-from next-permutation nil))
+		     (decf k)
+		     (let ((y (aref v k)))
+		       (when (prog1 (< y x)
+			       (setf x y))
+			 (return))))
+	       (do ((i (1+ k) (1+ i))
+		    (j (1- n) (1- j)))
+		   ((> i j))
+		 (rotatef (aref v i) (aref v j)))
+	       (do ((i (- n 2) (1- i)))
+		   ((or (<= i k) (< (aref v i) x))
+		    (rotatef (aref v k) (aref v (1+ i)))))
+	       v)))))
+
+(defun make-index-mask (w mask-expr)
+  "Construct a bitmask based on bitwise properties of the bit indices.
+
+   The function returns a W-bit mask in which each bit is set if MASK-EXPR
+   of true of the bit's index.  MASK-EXPR may be one of the following:
+
+     * I -- an integer I is true if bit I of the bit index is set;
+     * (not EXPR) -- is true if EXPR is false;
+     * (and EXPR EXPR ...) -- is true if all of the EXPRs are true; and
+     * (or EXPR EXPR ...) -- is true if any of the EXPRs is true."
+
+  (let ((max-bit (1- (integer-length (1- w))))
+	(mask 0))
+    (dotimes (i w mask)
+      (labels ((interpret (expr)
+		 (cond ((and (integerp expr) (<= 0 expr max-bit))
+			(logbitp expr i))
+		       ((and (consp expr) (eq (car expr) 'not)
+			     (null (cddr expr)))
+			(not (interpret (cadr expr))))
+		       ((and (consp expr) (eq (car expr) 'and))
+			(every #'interpret (cdr expr)))
+		       ((and (consp expr) (eq (car expr) 'or))
+			(some #'interpret (cdr expr)))
+		       (t
+			(error "unknown mask expression ~S" expr)))))
+	(when (interpret mask-expr)
+	  (setf (ldb (byte 1 i) mask) 1))))))
+
+(defun make-permutation-network (w steps)
+  "Construct a permutation network.
+
+   The integer W gives the number of bits to be acted upon.  The STEPS are a
+   list of instructions of the following forms:
+
+     * (SHIFT . MASK) -- a pair of integers is treated literally;
+
+     * (SHIFT MASK-EXPR) -- the SHIFT is literal, but the MASK-EXPR is
+       processed by `make-index-mask' to calculate the mask;
+
+     * (:invert I) -- make an instruction which inverts the sense of the
+       index bit I;
+
+     * (:exchange I J) -- make an instruction which exchanges index bits I
+       and J; or
+
+     * (:exchange-invert I J) -- make an instruction which exchanges and
+       inverts index bits I and J.
+
+   The output is a list of primitive (SHIFT . MASK) steps, indicating that
+   the bits of the input selected by MASK are to be swapped with the bits
+   selected by (ash MASK SHIFT)."
+
+  (let ((max-mask (1- (ash 1 w)))
+	(max-shift (1- w))
+	(max-bit (1- (integer-length (1- w))))
+	(list nil))
+    (dolist (step steps)
+      (cond ((and (consp step)
+		  (integerp (car step)) (<= 0 (car step) max-shift)
+		  (integerp (cdr step)) (<= 0 (cdr step) max-mask))
+	     (push step list))
+	    ((and (consp step)
+		  (integerp (car step)) (<= 0 (car step) max-shift)
+		  (null (cddr step)))
+	     (push (cons (car step) (make-index-mask w (cadr step))) list))
+	    ((and (consp step)
+		  (eq (car step) :invert)
+		  (integerp (cadr step)) (<= 0 (cadr step) max-bit)
+		  (null (cddr step)))
+	     (let ((i (cadr step)))
+	       (push (cons (ash 1 i) (make-index-mask w `(not ,i))) list)))
+	    ((and (consp step)
+		  (eq (car step) :exchange)
+		  (integerp (cadr step)) (integerp (caddr step))
+		  (<= 0 (cadr step) (caddr step) max-bit)
+		  (null (cdddr step)))
+	     (let ((i (cadr step)) (j (caddr step)))
+	       (push (cons (- (ash 1 j) (ash 1 i))
+			   (make-index-mask w `(and ,i (not ,j))))
+		     list)))
+	    ((and (consp step)
+		  (eq (car step) :exchange-invert)
+		  (integerp (cadr step)) (integerp (caddr step))
+		  (<= 0 (cadr step) (caddr step) max-bit)
+		  (null (cdddr step)))
+	     (let ((i (cadr step)) (j (caddr step)))
+	       (push (cons (+ (ash 1 i) (ash 1 j))
+			   (make-index-mask w `(and (not ,i) (not ,j))))
+		     list)))
+	    (t
+	     (error "unknown permutation step ~S" step))))
+    (nreverse list)))
+
+;;;--------------------------------------------------------------------------
+;;; Permutation network diagnostics.
+
+(defun print-permutation-network (steps &optional (stream *standard-output*))
+  "Print a description of the permutation network STEPS to STREAM.
+
+   A permutation network consists of a list of pairs
+
+	(SHIFT . MASK)
+
+   indicating that the bits selected by MASK, and those SHIFT bits to the
+   left, should be exchanged.
+
+   The output is intended to be human-readable and is subject to change."
+
+  (let ((shiftwd 1) (maskwd 2))
+
+    ;; Determine suitable print widths for shifts and masks.
+    (dolist (step steps)
+      (let ((shift (car step)) (mask (cdr step)))
+	(let ((swd (1+ (floor (log shift 10))))
+	      (mwd (ash 1 (- (integer-length (1- (integer-length mask)))
+			     2))))
+	  (when (> swd shiftwd) (setf shiftwd swd))
+	  (when (> mwd maskwd) (setf maskwd mwd)))))
+
+    ;; Print the display.
+    (pprint-logical-block (stream steps :prefix "(" :suffix ")")
+      (let ((first t))
+	(dolist (step steps)
+	  (let ((shift (car step)) (mask (cdr step)))
+
+	    ;; Separate entries with newlines.
+	    (cond (first (setf first nil))
+		  (t (pprint-newline :mandatory stream)))
+
+	    (let ((swaps nil))
+
+	      ;; Determine the list of exchanges implied by the mask.
+	      (dotimes (i (integer-length mask))
+		(when (logbitp i mask)
+		  (push (cons i (+ i shift)) swaps)))
+	      (setf swaps (nreverse swaps))
+
+	      ;; Print the entry.
+	      (format stream "~@<(~;~vD #x~(~v,'0X~) ~8I~:@_~W~;)~:>"
+		      shiftwd shift maskwd mask swaps))))))
+
+    ;; Print a final newline following the close parenthesis.
+    (terpri stream)))
+
+(defun demonstrate-permutation-network
+    (n steps &optional reference (stream *standard-output*))
+  "Print, on STREAM, a demonstration of the permutation STEPS.
+
+   Begin, on the left, with the integers from 0 up to N - 1.  For each
+   (SHIFT . MASK) element in STEPS, print an additional column showing the
+   effect of that step on the vector.  If REFERENCE is not nil, then it
+   should be a vector of length at least N: on the right, print the REFERENCE
+   vector, showing where the result of the permutation STEPS differs from the
+   REFERENCE.  Return non-nil if the output matches the reference; return nil
+   if the output doesn't match, or no reference was supplied."
+
+  (let ((v (make-array n)))
+
+    ;; Initialize a vector of lists which will record, for each step in the
+    ;; permutation network, which value is in that position.  The lists are
+    ;; reversed, so the `current' value is at the front.
+    (dotimes (i n) (setf (aref v i) (cons i nil)))
+
+    ;; Work through the permutation steps updating the vector.
+    (dolist (step steps)
+      (let ((shift (car step)) (mask (cdr step)))
+
+	(dotimes (i n) (push (car (aref v i)) (aref v i)))
+
+	(dotimes (i n)
+	  (when (logbitp i mask)
+	    (rotatef (car (aref v i))
+		     (car (aref v (+ i shift))))))))
+
+    ;; Print the result.
+    (let ((ok (not (null reference))))
+      (dotimes (i n)
+	(let* ((entry (aref v i))
+	       (final (car entry)))
+	  (format stream "~{ ~7D~}" (reverse entry))
+	  (when reference
+	    (let* ((want (aref reference i))
+		   (match (eql final want)))
+	      (format stream " ~:[/=~;==~] ~7D" match want)
+	      (unless match (setf ok nil))))
+	  (terpri stream)))
+      (when reference
+	(format stream "~:[FAIL~;pass~]~%" ok))
+      ok)))
+
+;;;--------------------------------------------------------------------------
+;;; Examples and useful runes.
+
+#+example
+(let* ((ip #(58 50 42 34 26 18 10  2
+	     60 52 44 36 28 20 12  4
+	     62 54 46 38 30 22 14  6
+	     64 56 48 40 32 24 16  8
+	     57 49 41 33 25 17  9  1
+	     59 51 43 35 27 19 11  3
+	     61 53 45 37 29 21 13  5
+	     63 55 47 39 31 23 15  7))
+       (fixed-ip (map '(vector fixnum)
+		      (lambda (x) (- 64 x))
+		      (reverse ip)))
+       (trad-network
+	(make-permutation-network
+	 64				;  5  4  3  2  1  0
+	 '((:exchange-invert 2 5)	; ~2  4  3 ~5  1  0
+	   (:exchange-invert 1 4)	; ~2 ~1  3 ~5 ~4  0
+	   (:exchange-invert 0 3)	; ~2 ~1 ~0 ~5 ~4 ~3
+	   (:exchange-invert 3 4)	; ~2  0  1 ~5 ~4 ~3
+	   (:exchange-invert 4 5)))))	; ~0  2  1 ~5 ~4 ~3
+
+  (fresh-line)
+
+  (print-permutation-network trad-network)
+  (demonstrate-permutation-network 64 trad-network fixed-ip))
-- 
2.11.0