--- /dev/null
+/* -*-c-*-
+ *
+ * The Keccak-p[1600, n] permutation
+ *
+ * (c) 2017 Straylight/Edgeware
+ */
+
+/*----- Licensing notice --------------------------------------------------*
+ *
+ * This file is part of secnet.
+ * See README for full list of copyright holders.
+ *
+ * secnet is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * secnet is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 3 along with secnet; if not, see
+ * https://www.gnu.org/licenses/gpl.html.
+ *
+ * This file was originally part of Catacomb, but has been automatically
+ * modified for incorporation into secnet: see `import-catacomb-crypto'
+ * for details.
+ *
+ * Catacomb is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Library General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * Catacomb is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with Catacomb; if not, write to the Free
+ * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ * MA 02111-1307, USA.
+ */
+
+/*----- Header files ------------------------------------------------------*/
+
+#include <limits.h>
+#include <string.h>
+
+#include "fake-mLib-bits.h"
+
+#include "keccak1600.h"
+
+/* #define KECCAK_DEBUG */
+
+/*----- Miscellaneous utilities -------------------------------------------*/
+
+/* Map lane coordinates (x, y) to an index into a flat array of lanes: x
+ * steps through consecutive entries, and each increment of y skips five.
+ */
+#define I(x, y) ((x) + 5*(y)) /* Column-major indexing */
+
+/*----- Interlacing or not ------------------------------------------------*/
+
+/* We should prefer the interlaced representation if the target is really
+ * 32-bit and only providing synthetic 64-bit integers. Alas, the Windows
+ * 64-bit ABI specifies that `long' is only 32-bits (i.e., it is IL32/LLP64),
+ * so detect x86 specifically.
+ *
+ * Concretely: define `KECCAK_I32' when `unsigned long' has fewer than 64
+ * value bits, unless one of the amd64 predefined macros tested below is
+ * present.
+ */
+#if (ULONG_MAX >> 31) <= 0xffffffff && \
+ !defined(__amd64__) && !defined(_M_AMD64)
+# define KECCAK_I32
+#endif
+
+#ifdef KECCAK_I32
+/* A 32-bit target with at best weak support for 64-bit shifts. Maintain a
+ * lane as two 32-bit pieces representing the even and odd bits of the lane.
+ * There are slightly fiddly transformations to apply on the way in and out
+ * of the main permutation.
+ */
+
+/* The working lane type for this representation, and the name of the state
+ * member which holds the lanes: code below writes `(z)->S[...]'.
+ */
+typedef keccak1600_lane_i32 lane;
+#define S si32
+
+static lane interlace(kludge64 x)
+{
+  /* Convert the 64-bit string X into a lane whose `even' half collects the
+   * even-numbered bits of X and whose `odd' half collects the odd-numbered
+   * ones.
+   *
+   * Label every bit with its six-bit index.  What we want is to rotate
+   * that index right by one place, so that the old bottom index bit ends
+   * up choosing between the two halves.  Exchanging two index bits is the
+   * same as exchanging the data bits whose indices differ in exactly those
+   * positions, which is a masked swap.
+   *
+   * Every swap here trades bits between the low and high halves of X,
+   * never within one half.  The table lists the swaps in order; the
+   * comment on each entry gives the index permutation after that swap,
+   * starting from 543210.
+   */
+
+  static const struct { unsigned sh; uint32 m; } swaps[] = {
+    { 16, 0x0000ffff },                 /* 453210 */
+    {  8, 0x00ff00ff },                 /* 354210 */
+    {  4, 0x0f0f0f0f },                 /* 254310 */
+    {  2, 0x33333333 },                 /* 154320 */
+    {  1, 0x55555555 }                  /* 054321 */
+  };
+  uint32 lo = LO64(x), hi = HI64(x), diff;
+  lane z;
+  unsigned k;
+
+  for (k = 0; k < sizeof(swaps)/sizeof(swaps[0]); k++) {
+    /* Find where the two groups of bits differ, and flip both sides. */
+    diff = ((lo >> swaps[k].sh) ^ hi)&swaps[k].m;
+    lo ^= diff << swaps[k].sh;
+    hi ^= diff;
+  }
+
+  z.even = lo;
+  z.odd = hi;
+  return (z);
+}
+
+static kludge64 deinterlace(lane x)
+{
+  /* Reassemble the 64-bit value held, in interlaced form, by the lane X.
+   * This undoes `interlace': it applies the same masked swaps between the
+   * two halves, but in the opposite order.  The comment on each table
+   * entry gives the index permutation after that swap, starting from
+   * 054321.
+   */
+
+  static const struct { unsigned sh; uint32 m; } swaps[] = {
+    {  1, 0x55555555 },                 /* 154320 */
+    {  2, 0x33333333 },                 /* 254310 */
+    {  4, 0x0f0f0f0f },                 /* 354210 */
+    {  8, 0x00ff00ff },                 /* 453210 */
+    { 16, 0x0000ffff }                  /* 543210 */
+  };
+  uint32 lo = x.even, hi = x.odd, diff;
+  kludge64 z;
+  unsigned k;
+
+  for (k = 0; k < sizeof(swaps)/sizeof(swaps[0]); k++) {
+    /* Find where the two groups of bits differ, and flip both sides. */
+    diff = ((lo >> swaps[k].sh) ^ hi)&swaps[k].m;
+    lo ^= diff << swaps[k].sh;
+    hi ^= diff;
+  }
+
+  SET64(z, hi, lo);
+  return (z);
+}
+
+/* Conversions between external 64-bit words and interlaced lanes. */
+#define TO_LANE(x) (interlace(x))
+#define FROM_LANE(x) (deinterlace(x))
+
+/* A `printf' format fragment for one lane, and the matching argument list:
+ * the even half is printed first, then a colon, then the odd half.
+ */
+#define PRINTFMT_LANE "%08lx:%08lx"
+#define PRINTARGS_LANE(x) (unsigned long)(x).even, (unsigned long)(x).odd
+
+/* Bitwise operations on lanes, applied independently to the even and odd
+ * halves. The destination Z is assigned to; X and Y are each mentioned
+ * twice in the expansion, so they should be free of side effects.
+ */
+#define BINOP_LANE(z, op, x, y) \
+ ((z).even = (x).even op (y).even, (z).odd = (x).odd op (y).odd)
+#define XOR_LANE(z, x, y) BINOP_LANE(z, ^, x, y)
+#define AND_LANE(z, x, y) BINOP_LANE(z, &, x, y)
+#define OR_LANE(z, x, y) BINOP_LANE(z, |, x, y)
+#define NOT_LANE(z, x) ((z).even = ~(x).even, (z).odd = ~(x).odd)
+
+/* Rotate the lane X left by N bit positions and store the result in Z. X is
+ * copied first, so Z may be the same object as X. For even N, each half is
+ * rotated by N/2 and stays where it is; for odd N, the halves also change
+ * places, with the old odd half rotated by (N + 1)/2 becoming the new even
+ * half and the old even half rotated by (N - 1)/2 becoming the new odd half.
+ *
+ * NOTE(review): N = 0 and N = 1 both lead to a call of `ROL32' with a shift
+ * count of zero; `ROL32' is defined elsewhere -- confirm that it handles a
+ * zero count correctly.
+ */
+#define ROTL_LANE(z, x, n) do { \
+ lane _t = (x); \
+ (z).even = (n)%2 ? ROL32(_t.odd, ((n) + 1)/2) \
+ : ROL32(_t.even, (n)/2); \
+ (z).odd = (n)%2 ? ROL32(_t.even, ((n) - 1)/2) \
+ : ROL32(_t.odd, (n)/2); \
+} while (0)
+
+/* Initializers for an all-zero-bits lane and an all-one-bits lane. */
+#define LANE_ZERO { 0, 0 }
+#define LANE_CMPL { 0xffffffff, 0xffffffff }
+
+/* The round constants, one per round, indexed by round number and XORed
+ * into lane (0, 0) by `keccak1600_round'. These are stored as pairs of
+ * 32-bit halves; presumably the order is { even, odd }, matching the member
+ * order of the lane structure -- that structure is declared elsewhere, so
+ * verify against the header.
+ */
+static const lane rcon[24] = {
+ { 0x00000001, 0x00000000 }, { 0x00000000, 0x00000089 },
+ { 0x00000000, 0x8000008b }, { 0x00000000, 0x80008080 },
+ { 0x00000001, 0x0000008b }, { 0x00000001, 0x00008000 },
+ { 0x00000001, 0x80008088 }, { 0x00000001, 0x80000082 },
+ { 0x00000000, 0x0000000b }, { 0x00000000, 0x0000000a },
+ { 0x00000001, 0x00008082 }, { 0x00000000, 0x00008003 },
+ { 0x00000001, 0x0000808b }, { 0x00000001, 0x8000000b },
+ { 0x00000001, 0x8000008a }, { 0x00000001, 0x80000081 },
+ { 0x00000000, 0x80000081 }, { 0x00000000, 0x80000008 },
+ { 0x00000000, 0x00000083 }, { 0x00000000, 0x80008003 },
+ { 0x00000001, 0x80008088 }, { 0x00000000, 0x80000088 },
+ { 0x00000001, 0x00008000 }, { 0x00000000, 0x80008082 }
+};
+
+#else
+/* A target with good support for 64-bit shifts. We store lanes as 64-bit
+ * quantities and deal with them in the obvious, natural way.
+ */
+
+/* The working lane type for this representation, and the name of the state
+ * member which holds the lanes: code below writes `(z)->S[...]'.
+ */
+typedef keccak1600_lane_64 lane;
+#define S s64
+
+/* External 64-bit words and lanes are the same thing here, so the
+ * conversions do nothing.
+ */
+#define TO_LANE(x) (x)
+#define FROM_LANE(x) (x)
+
+/* A `printf' format fragment for one lane, and the matching argument list:
+ * the high 32 bits are printed first, then the low 32 bits.
+ */
+#define PRINTFMT_LANE "%08lx%08lx"
+#define PRINTARGS_LANE(x) (unsigned long)HI64(x), (unsigned long)LO64(x)
+
+/* Lane operations, each forwarded to the correspondingly named 64-bit
+ * helper macro, which is defined elsewhere. In each case Z names the
+ * destination.
+ */
+#define XOR_LANE(z, x, y) XOR64((z), (x), (y))
+#define AND_LANE(z, x, y) AND64((z), (x), (y))
+#define OR_LANE(z, x, y) OR64((z), (x), (y))
+#define NOT_LANE(z, x) CPL64((z), (x))
+#define ROTL_LANE(z, x, n) ROL64_((z), (x), (n))
+
+/* Initializers for an all-zero-bits lane and an all-one-bits lane. */
+#define LANE_ZERO X64( 0, 0)
+#define LANE_CMPL X64(ffffffff, ffffffff)
+
+/* The round constants, one per round, indexed by round number and XORed
+ * into lane (0, 0) by `keccak1600_round'. Each is written as two groups of
+ * eight hex digits; presumably `X64' takes the high half first, as the
+ * argument order of `PRINTARGS_LANE' suggests -- the macro is defined
+ * elsewhere, so verify there.
+ */
+static const lane rcon[24] = {
+ X64(00000000, 00000001), X64(00000000, 00008082),
+ X64(80000000, 0000808a), X64(80000000, 80008000),
+ X64(00000000, 0000808b), X64(00000000, 80000001),
+ X64(80000000, 80008081), X64(80000000, 00008009),
+ X64(00000000, 0000008a), X64(00000000, 00000088),
+ X64(00000000, 80008009), X64(00000000, 8000000a),
+ X64(00000000, 8000808b), X64(80000000, 0000008b),
+ X64(80000000, 00008089), X64(80000000, 00008003),
+ X64(80000000, 00008002), X64(80000000, 00000080),
+ X64(00000000, 0000800a), X64(80000000, 8000000a),
+ X64(80000000, 80008081), X64(80000000, 00008080),
+ X64(00000000, 80000001), X64(80000000, 80008008)
+};
+
+#endif
+
+/*----- Complementing or not ----------------------------------------------*/
+
+/* We should use the complemented representation if the target doesn't have a
+ * fused and-not operation. There doesn't appear to be a principled way to
+ * do this, so we'll just have to make do with a big list. Worse, in my
+ * brief survey of the architecture reference manuals I have lying about,
+ * they've split close to 50/50 on this question, so I don't have an
+ * especially good way to pick a default. The `no-fused-op' architectures
+ * seem generally a bit more modern than the `fused-op' architectures, so I
+ * guess I'll make the complemented representation the default.
+ *
+ * and-not No and-not
+ * ------- ----------
+ * ARM (`bic') x86/amd64
+ * Sparc (`andn') z/Architecture
+ * MMIX (`andn') MIPS
+ * IA64 (`andcm') 68k
+ * VAX (`bic') RISC-V
+ * PDP-10 (`andc')
+ */
+/* Define `KECCAK_COMPL' -- selecting the complemented representation --
+ * unless one of the predefined macros tested below identifies the target as
+ * ARM, IA64, MMIX, Sparc, VAX, or PDP-10. Any target not recognized here
+ * therefore gets the complemented representation.
+ */
+#if !(defined(__arm__) || defined(__thumb__) || defined(__aarch64__) || \
+ defined(_M_ARM) || defined(_M_THUMB)) && \
+ !(defined(__ia64__) || defined(__ia64) || defined(__itanium__) || \
+ defined(_M_IA64)) && \
+ !defined(__mmix__) && \
+ !(defined(__sparc__) || defined(__sparc)) && \
+ !defined(__vax__) && \
+ !defined(__pdp10__)
+# define KECCAK_COMPL
+#endif
+
+#ifdef KECCAK_COMPL
+/* A target without fused and/not (`bic', `andc2'). We complement some of
+ * the lanes in the initial state and undo this on output. (Absorbing XORs
+ * input into the state, so this is unaffected.) See the handling of chi in
+ * `keccak1600_round' below for the details.
+ */
+
+/* Overwrite the six lanes which are held in complemented form with all-ones.
+ * This only produces the complemented form of a zero state, so the caller
+ * must have cleared the state Z first (as `keccak1600_init' does).
+ */
+#define STATE_INIT(z) do { \
+ lane cmpl = LANE_CMPL; \
+ (z)->S[I(1, 0)] = cmpl; (z)->S[I(2, 0)] = cmpl; \
+ (z)->S[I(3, 1)] = cmpl; (z)->S[I(2, 2)] = cmpl; \
+ (z)->S[I(2, 3)] = cmpl; (z)->S[I(0, 4)] = cmpl; \
+} while (0)
+
+/* Complement, in place, the same six lanes of the state Z. Applying this
+ * twice restores the original contents.
+ */
+#define STATE_OUT(z) do { \
+ NOT_LANE((z)->S[I(1, 0)], (z)->S[I(1, 0)]); \
+ NOT_LANE((z)->S[I(2, 0)], (z)->S[I(2, 0)]); \
+ NOT_LANE((z)->S[I(3, 1)], (z)->S[I(3, 1)]); \
+ NOT_LANE((z)->S[I(2, 2)], (z)->S[I(2, 2)]); \
+ NOT_LANE((z)->S[I(2, 3)], (z)->S[I(2, 3)]); \
+ NOT_LANE((z)->S[I(0, 4)], (z)->S[I(0, 4)]); \
+} while (0)
+
+#else
+/* A target with fused and/not (`bic', `andc2'). Everything is simple. */
+
+/* No lanes are held complemented, so both hooks expand to nothing. */
+#define STATE_INIT(z) do {} while (0)
+#define STATE_OUT(z) do {} while (0)
+
+#endif
+
+/*----- Other magic constants ---------------------------------------------*/
+
+/* The rotation constants. These are systematically named -- see `THETA_RHO'
+ * below.
+ */
+/* `ROT_x_y' is the left-rotation count which `THETA_RHO' applies to the
+ * input lane at coordinates (x, y). The names are assembled by token
+ * pasting, so each must expand to a plain integer constant.
+ */
+#define ROT_0_0 0
+#define ROT_1_0 1
+#define ROT_2_0 62
+#define ROT_3_0 28
+#define ROT_4_0 27
+
+#define ROT_0_1 36
+#define ROT_1_1 44
+#define ROT_2_1 6
+#define ROT_3_1 55
+#define ROT_4_1 20
+
+#define ROT_0_2 3
+#define ROT_1_2 10
+#define ROT_2_2 43
+#define ROT_3_2 25
+#define ROT_4_2 39
+
+#define ROT_0_3 41
+#define ROT_1_3 45
+#define ROT_2_3 15
+#define ROT_3_3 21
+#define ROT_4_3 8
+
+#define ROT_0_4 18
+#define ROT_1_4 2
+#define ROT_2_4 61
+#define ROT_3_4 56
+#define ROT_4_4 14
+
+/*----- Debugging ---------------------------------------------------------*/
+
+#ifdef KECCAK_DEBUG
+
+#include <stdio.h>
+
+static void dump_state(const char *what, unsigned ir,
+                       const keccak1600_state *x)
+{
+  /* Debugging aid: print the state X on standard output, under a heading
+   * made from the label WHAT and the round number IR.
+   *
+   * The lanes are always printed exactly as stored.  If the build keeps
+   * some lanes complemented, the state is printed again with that undone;
+   * if the build uses interlaced lanes, it is printed a third time as
+   * ordinary 64-bit values.  Each dump is five lines of five lanes, with
+   * the x coordinate running along each line.
+   */
+
+  unsigned i, j;
+  int sep;
+#if defined(KECCAK_COMPL) || defined(KECCAK_I32)
+  /* The working copy is only wanted by the optional dumps below. */
+  keccak1600_state y;
+#endif
+
+  printf(";; %s [round %u]\n", what, ir);
+  printf(";; raw state...\n");
+  for (j = 0; j < 5; j++) {
+    printf(";;");
+    for (i = 0, sep = '\t'; i < 5; i++, sep = ' ')
+      printf("%c" PRINTFMT_LANE, sep, PRINTARGS_LANE(x->S[I(i, j)]));
+    fputc('\n', stdout);
+  }
+
+#if defined(KECCAK_COMPL) || defined(KECCAK_I32)
+  /* Undo any lane complementation on a copy, leaving X itself alone. */
+  y = *x; STATE_OUT(&y);
+#endif
+
+#ifdef KECCAK_COMPL
+  printf(";; uncomplemented state...\n");
+  for (j = 0; j < 5; j++) {
+    printf(";;");
+    for (i = 0, sep = '\t'; i < 5; i++, sep = ' ')
+      printf("%c" PRINTFMT_LANE, sep, PRINTARGS_LANE(y.S[I(i, j)]));
+    fputc('\n', stdout);
+  }
+#endif
+
+#ifdef KECCAK_I32
+  printf(";; deinterlaced state...\n");
+  for (j = 0; j < 5; j++) {
+    printf(";;");
+    for (i = 0, sep = '\t'; i < 5; i++, sep = ' ') {
+      kludge64 a = FROM_LANE(y.S[I(i, j)]);
+      printf("%c%08lx%08lx", sep,
+             (unsigned long)HI64(a), (unsigned long)LO64(a));
+    }
+    fputc('\n', stdout);
+  }
+#endif
+
+  fputc('\n', stdout);
+}
+
+#endif
+
+/*----- The Keccak-p[1600, n] permutation ---------------------------------*/
+
+static void keccak1600_round(keccak1600_state *z,
+ const keccak1600_state *x, unsigned i)
+{
+ /* Perform a round of Keccak-p[1600, n]. Process the state X and write the
+ * result to Z.
+ *
+ * The round index I is used only to select the round constant `rcon[I]'
+ * for the iota step, so it must be less than 24. The planes of Z are
+ * written one at a time while X is still being read for the later planes,
+ * so Z and X must be distinct objects.
+ */
+
+ /* Working storage: `d' first holds the five column parities and is then
+ * reused for the five lanes of the plane being processed; `c' holds the
+ * per-column theta adjustment; `t' holds the one explicitly complemented
+ * lane needed by each plane's chi step.
+ */
+ lane c[5], d[5], t;
+
+ /* Theta, first step: calculate the column parities. */
+#define COLPARITY(j) do { \
+ d[j] = x->S[I(j, 0)]; \
+ XOR_LANE(d[j], d[j], x->S[I(j, 1)]); \
+ XOR_LANE(d[j], d[j], x->S[I(j, 2)]); \
+ XOR_LANE(d[j], d[j], x->S[I(j, 3)]); \
+ XOR_LANE(d[j], d[j], x->S[I(j, 4)]); \
+} while (0)
+ COLPARITY(0); COLPARITY(1); COLPARITY(2); COLPARITY(3); COLPARITY(4);
+#undef COLPARITY
+
+ /* Theta, second step: calculate the combined effect. */
+ ROTL_LANE(c[0], d[1], 1); XOR_LANE(c[0], c[0], d[4]);
+ ROTL_LANE(c[1], d[2], 1); XOR_LANE(c[1], c[1], d[0]);
+ ROTL_LANE(c[2], d[3], 1); XOR_LANE(c[2], c[2], d[1]);
+ ROTL_LANE(c[3], d[4], 1); XOR_LANE(c[3], c[3], d[2]);
+ ROTL_LANE(c[4], d[0], 1); XOR_LANE(c[4], c[4], d[3]);
+
+ /* Now we work plane by plane through the output. To do this, we must undo
+ * the pi transposition. Pi maps (x', y') = (y, 2 x + 3 y), so y = x', and
+ * x = (y' - 3 y)/2 = 3 (y' - 3 x') = x' + 3 y'.
+ *
+ * The arguments I0, ..., I4 of `THETA_RHO' are the x coordinates of the
+ * input lanes, taken from rows 0, ..., 4 respectively, which end up in
+ * d[0], ..., d[4].
+ */
+#define THETA_RHO(i0, i1, i2, i3, i4) do { \
+ \
+ /* First, theta. */ \
+ XOR_LANE(d[0], x->S[I(i0, 0)], c[i0]); \
+ XOR_LANE(d[1], x->S[I(i1, 1)], c[i1]); \
+ XOR_LANE(d[2], x->S[I(i2, 2)], c[i2]); \
+ XOR_LANE(d[3], x->S[I(i3, 3)], c[i3]); \
+ XOR_LANE(d[4], x->S[I(i4, 4)], c[i4]); \
+ \
+ /* Then rho. */ \
+ ROTL_LANE(d[0], d[0], ROT_##i0##_0); \
+ ROTL_LANE(d[1], d[1], ROT_##i1##_1); \
+ ROTL_LANE(d[2], d[2], ROT_##i2##_2); \
+ ROTL_LANE(d[3], d[3], ROT_##i3##_3); \
+ ROTL_LANE(d[4], d[4], ROT_##i4##_4); \
+} while (0)
+
+ /* The basic chi operation is: z = w ^ (~a&b), but this involves an
+ * inversion which we can mostly avoid by being clever: observe that
+ *
+ * w ^ (~a&~~b) = w ^ ~(a | ~b) = ~w ^ (a | ~b)
+ *
+ * by De Morgan's law. Furthermore, complementing w or z is basically
+ * equivalent. Bertoni, Daemen, Peeters, Van Assche, and Van Keer, `Keccak
+ * implementation overview', describe a pattern of lane complementation
+ * which propagates through theta and pi in exactly the right way to be
+ * restored easily by chi, here, with exactly one inversion per plane.
+ *
+ * Here's the pattern.
+ *
+ * [ * . * * . ] [ . * * . . ]
+ * [ * . * . . ] [ . . . * . ]
+ * [ * . * . . ] -> [ . . * . . ]
+ * [ . * . * * ] [ . . * . . ]
+ * [ * . . * . ] [ * . . . . ]
+ *
+ * where a `.' means that the lane is unchanged, and a `*' means that it
+ * has been complemented.
+ *
+ * The macros `CHI_wxy_z' calculate z in terms of w, x, y assuming that the
+ * inputs w, x, y marked with a `1' are complemented on input, and arrange
+ * for z to be complemented on output if z is so marked.
+ *
+ * The diagrams to the right show the fragment of the complementation
+ * pattern being handled by the corresponding line of code. A symbol in
+ * brackets indicates a deviation from the input pattern forced by explicit
+ * complementation: there will be exactly one of these for each plane.
+ *
+ * The symbols in each diagram appear in column order, d[0] leftmost, and
+ * not in the order of the macro's arguments.
+ */
+#ifdef KECCAK_COMPL
+# define CHI_COMPL(z, x) NOT_LANE((z), (x))
+# define CHI_001_1(z, w, x, y) \
+ (OR_LANE((z), (x), (y)), XOR_LANE((z), (z), (w)))
+# define CHI_010_0(z, w, x, y) \
+ (AND_LANE((z), (x), (y)), XOR_LANE((z), (z), (w)))
+# define CHI_101_0 CHI_001_1
+# define CHI_110_1 CHI_010_0
+#else
+ /* Without complementation, every variant is the plain chi formula, and
+ * the `explicitly complemented' lane is just a copy.
+ */
+# define CHI(z, w, x, y) \
+ (NOT_LANE((z), (x)), \
+ AND_LANE((z), (z), (y)), \
+ XOR_LANE((z), (z), (w)))
+# define CHI_COMPL(z, x) ((z) = (x))
+# define CHI_001_1 CHI
+# define CHI_010_0 CHI
+# define CHI_101_0 CHI
+# define CHI_110_1 CHI
+#endif
+
+ /* Let's do the y' = 0 plane first. Theta and rho are easy with our macro,
+ * and we've done pi with the coordinate hacking. That leaves chi next.
+ * This is hairy because we must worry about complementation.
+ */
+ THETA_RHO(0, 1, 2, 3, 4);
+ CHI_COMPL(t, d[2]); /* [.] */
+ CHI_101_0(z->S[I(0, 0)], d[0], d[1], d[2]); /* * . * -> . */
+ CHI_001_1(z->S[I(1, 0)], d[1], t, d[3]); /* . [.] * -> * */
+ CHI_110_1(z->S[I(2, 0)], d[2], d[3], d[4]); /* * * . -> * */
+ CHI_101_0(z->S[I(3, 0)], d[3], d[4], d[0]); /* * * . -> . */
+ CHI_010_0(z->S[I(4, 0)], d[4], d[0], d[1]); /* * . . -> . */
+
+ /* We'd better do iota before we forget. */
+ XOR_LANE(z->S[I(0, 0)], z->S[I(0, 0)], rcon[i]);
+
+ /* That was fun. Maybe y' = 1 will be as good. */
+ THETA_RHO(3, 4, 0, 1, 2);
+ CHI_COMPL(t, d[4]); /* [*] */
+ CHI_101_0(z->S[I(0, 1)], d[0], d[1], d[2]); /* * . * -> . */
+ CHI_010_0(z->S[I(1, 1)], d[1], d[2], d[3]); /* . * . -> . */
+ CHI_101_0(z->S[I(2, 1)], d[2], d[3], t); /* * . [*] -> . */
+ CHI_001_1(z->S[I(3, 1)], d[3], d[4], d[0]); /* * . . -> * */
+ CHI_010_0(z->S[I(4, 1)], d[4], d[0], d[1]); /* * . . -> . */
+
+ /* We're getting the hang of this. The y' = 2 plane shouldn't be any
+ * trouble.
+ */
+ THETA_RHO(1, 2, 3, 4, 0);
+ CHI_COMPL(t, d[3]); /* [*] */
+ CHI_101_0(z->S[I(0, 2)], d[0], d[1], d[2]); /* * . * -> . */
+ CHI_010_0(z->S[I(1, 2)], d[1], d[2], d[3]); /* . * . -> . */
+ CHI_110_1(z->S[I(2, 2)], d[2], t, d[4]); /* * [*] . -> * */
+ CHI_101_0(z->S[I(3, 2)], t, d[4], d[0]); /* * [*] . -> . */
+ CHI_010_0(z->S[I(4, 2)], d[4], d[0], d[1]); /* * . . -> . */
+
+ /* This isn't as interesting any more. Let's do y' = 3 before boredom sets
+ * in.
+ */
+ THETA_RHO(4, 0, 1, 2, 3);
+ CHI_COMPL(t, d[3]); /* [.] */
+ CHI_010_0(z->S[I(0, 3)], d[0], d[1], d[2]); /* . * . -> . */
+ CHI_101_0(z->S[I(1, 3)], d[1], d[2], d[3]); /* * . * -> . */
+ CHI_001_1(z->S[I(2, 3)], d[2], t, d[4]); /* . [.] * -> * */
+ CHI_010_0(z->S[I(3, 3)], t, d[4], d[0]); /* . [.] * -> . */
+ CHI_101_0(z->S[I(4, 3)], d[4], d[0], d[1]); /* . * * -> . */
+
+ /* Last plane. Just y' = 4 to go. */
+ THETA_RHO(2, 3, 4, 0, 1);
+ CHI_COMPL(t, d[1]); /* [*] */
+ CHI_110_1(z->S[I(0, 4)], d[0], t, d[2]); /* * [*] . -> * */
+ CHI_101_0(z->S[I(1, 4)], t, d[2], d[3]); /* [*] . * -> . */
+ CHI_010_0(z->S[I(2, 4)], d[2], d[3], d[4]); /* . * . -> . */
+ CHI_101_0(z->S[I(3, 4)], d[3], d[4], d[0]); /* * * . -> . */
+ CHI_010_0(z->S[I(4, 4)], d[4], d[0], d[1]); /* * . . -> . */
+
+ /* And we're done. */
+#undef THETA_RHO
+#undef CHI_COMPL
+#undef CHI_001_1
+#undef CHI_010_0
+#undef CHI_101_0
+#undef CHI_110_1
+#undef CHI
+}
+
+/* --- @keccak1600_p@ --- *
+ *
+ * Arguments:   @keccak1600_state *z@ = where to write the output state
+ *              @const keccak1600_state *x@ = input state
+ *              @unsigned n@ = number of rounds to perform
+ *
+ * Returns:     ---
+ *
+ * Use:         Implements the Keccak-p[1600, n] permutation at the core
+ *              of Keccak and the SHA-3 standard.  The input is only read
+ *              by the first round, whose output goes to a temporary, so
+ *              @z@ and @x@ may be the same object.  If @n@ is zero then
+ *              the input state is simply copied to the output.  There are
+ *              only 24 round constants, so it's the caller's
+ *              responsibility to pass @n@ no larger than 24.
+ *
+ *              NOTE(review): rounds are numbered from zero, so for
+ *              @n < 24@ this applies the first @n@ round constants, whereas
+ *              FIPS 202 defines the reduced-round permutation as the final
+ *              @n@ rounds.  The two agree for @n = 24@; confirm that no
+ *              caller relies on anything else.
+ */
+
+void keccak1600_p(keccak1600_state *z, const keccak1600_state *x, unsigned n)
+{
+  keccak1600_state u, v;
+  unsigned i = 0;
+
+#ifdef KECCAK_DEBUG
+  dump_state("init", 0, x);
+#endif
+
+  if (!n) {
+    /* No rounds at all.  Decrementing @n@ below would wrap around, so deal
+     * with this here: the permutation is the identity.
+     */
+    *z = *x;
+  } else {
+
+    /* Do the first round separately, to get the state out of the caller's
+     * input and into our own storage.
+     */
+    keccak1600_round(&u, x, i++); n--;
+
+    /* Bounce the state between the two temporaries, eight rounds at a
+     * time, while more than eight remain; the current state is always
+     * left in @u@.
+     */
+    while (n > 8) {
+      keccak1600_round(&v, &u, i++);
+      keccak1600_round(&u, &v, i++);
+      keccak1600_round(&v, &u, i++);
+      keccak1600_round(&u, &v, i++);
+      keccak1600_round(&v, &u, i++);
+      keccak1600_round(&u, &v, i++);
+      keccak1600_round(&v, &u, i++);
+      keccak1600_round(&u, &v, i++);
+      n -= 8;
+    }
+
+    /* Between zero and eight rounds are left.  Jump into the middle of the
+     * appropriate chain, arranged so that the very last round writes
+     * directly to the caller's output.  Odd and even counts need separate
+     * chains because they finish from different temporaries.
+     */
+    switch (n) {
+      case 0: *z = u;                   /* only happens for a single round */
+              break;
+      case 7: keccak1600_round(&v, &u, i++);
+              keccak1600_round(&u, &v, i++);
+              /* fall through */
+      case 5: keccak1600_round(&v, &u, i++);
+              keccak1600_round(&u, &v, i++);
+              /* fall through */
+      case 3: keccak1600_round(&v, &u, i++);
+              keccak1600_round(&u, &v, i++);
+              /* fall through */
+      case 1: keccak1600_round( z, &u, i++);
+              break;
+      case 8: keccak1600_round(&v, &u, i++);
+              keccak1600_round(&u, &v, i++);
+              /* fall through */
+      case 6: keccak1600_round(&v, &u, i++);
+              keccak1600_round(&u, &v, i++);
+              /* fall through */
+      case 4: keccak1600_round(&v, &u, i++);
+              keccak1600_round(&u, &v, i++);
+              /* fall through */
+      case 2: keccak1600_round(&v, &u, i++);
+              keccak1600_round( z, &v, i++);
+              break;
+    }
+  }
+
+#ifdef KECCAK_DEBUG
+  dump_state("final", 0, z);
+#endif
+}
+
+/* --- @keccak1600_init@ --- *
+ *
+ * Arguments:   @keccak1600_state *s@ = a state to initialize
+ *
+ * Returns:     ---
+ *
+ * Use:         Sets @s@ to the root state: all of the lanes are cleared
+ *              to zero bytes, and then whatever adjustment the internal
+ *              representation calls for is applied on top.
+ */
+
+void keccak1600_init(keccak1600_state *s)
+{
+  /* Clear everything first; the representation hook assumes this. */
+  memset(s->S, 0, sizeof(s->S));
+  STATE_INIT(s);
+}
+
+/* --- @keccak1600_mix@ --- *
+ *
+ * Arguments:   @keccak1600_state *s@ = a state to update
+ *              @const kludge64 *p@ = pointer to 64-bit words to mix in
+ *              @size_t n@ = size of the input, in 64-bit words
+ *
+ * Returns:     ---
+ *
+ * Use:         Mixes data into a %$\Keccak[r, 1600 - r]$% state.  Each
+ *              input word is converted to the internal lane form and
+ *              XORed into the lane with the same index, starting from
+ *              lane zero.  Note that it's the caller's responsibility to
+ *              pass in no more than %$r$% bits of data; nothing here
+ *              checks @n@ against the size of the state.
+ */
+
+void keccak1600_mix(keccak1600_state *s, const kludge64 *p, size_t n)
+{
+  size_t i;                             /* same type as the bound @n@ */
+  lane a;
+
+  for (i = 0; i < n; i++) {
+    a = TO_LANE(p[i]);
+    XOR_LANE(s->S[i], s->S[i], a);
+  }
+}
+
+/* --- @keccak1600_extract@ --- *
+ *
+ * Arguments:   @const keccak1600_state *s@ = a state to extract output from
+ *              @kludge64 *p@ = pointer to 64-bit words to write
+ *              @size_t n@ = size of the output, in 64-bit words
+ *
+ * Returns:     ---
+ *
+ * Use:         Reads output from a %$\Keccak[r, 1600 - r]$% state.  The
+ *              state itself is left untouched: the internal representation
+ *              is undone on a private copy, whose first @n@ lanes are then
+ *              converted to 64-bit words.  Note that it's the caller's
+ *              responsibility to extract no more than %$r$% bits of data;
+ *              nothing here checks @n@ against the size of the state.
+ */
+
+void keccak1600_extract(const keccak1600_state *s, kludge64 *p, size_t n)
+{
+  size_t i;                             /* same type as the bound @n@ */
+  keccak1600_state t;
+
+  /* Work on a copy, so that the caller's state stays as it was. */
+  t = *s; STATE_OUT(&t);
+
+  for (i = 0; i < n; i++)
+    p[i] = FROM_LANE(t.S[i]);
+}
+
+/*----- That's all, folks -------------------------------------------------*/