From a905c0d66f0cb9d58a897a27233be1881c246a0d Mon Sep 17 00:00:00 2001 From: Mark Wooding Date: Mon, 1 May 2017 01:38:30 +0100 Subject: [PATCH] symm/keccak1600.[ch]: Add the Keccak-p[1600, n] permutation. Currently just a special snowflake. Fancier things forthcoming. --- symm/Makefile.am | 7 + symm/keccak1600.c | 666 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ symm/keccak1600.h | 137 +++++++++++ symm/t/keccak1600 | 12 + utils/keccak-toy | 213 +++++++++++++++++ 5 files changed, 1035 insertions(+) create mode 100644 symm/keccak1600.c create mode 100644 symm/keccak1600.h create mode 100644 symm/t/keccak1600 create mode 100755 utils/keccak-toy diff --git a/symm/Makefile.am b/symm/Makefile.am index 9d4e3f42..504779d7 100644 --- a/symm/Makefile.am +++ b/symm/Makefile.am @@ -370,6 +370,13 @@ $(precomp)/symm/whirlpool-tab.c: $(precomp)/symm/whirlpool-tab.c endif +## Bertoni, Daemen, Peeters, and Van Assche's `Keccak', selected as the basis +## for SHA-3. +pkginclude_HEADERS += keccak1600.h +libsymm_la_SOURCES += keccak1600.c +TESTS += keccak1600.t$(EXEEXT) +EXTRA_DIST += t/keccak1600 + ## Bellare, Canetti and Krawczyk's `HMAC' mode for message authentication. HASHMACMODES += hmac diff --git a/symm/keccak1600.c b/symm/keccak1600.c new file mode 100644 index 00000000..e57aaa81 --- /dev/null +++ b/symm/keccak1600.c @@ -0,0 +1,666 @@ +/* -*-c-*- + * + * The Keccak-p[1600, n] permutation + * + * (c) 2017 Straylight/Edgeware + */ + +/*----- Licensing notice --------------------------------------------------* + * + * This file is part of Catacomb. + * + * Catacomb is free software; you can redistribute it and/or modify + * it under the terms of the GNU Library General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * Catacomb is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with Catacomb; if not, write to the Free + * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + * MA 02111-1307, USA. + */ + +/*----- Header files ------------------------------------------------------*/ + +#include +#include + +#include + +#include "keccak1600.h" + +/* #define KECCAK_DEBUG */ + +/*----- Miscellaneous utilities -------------------------------------------*/ + +#define I(x, y) ((x) + 5*(y)) /* Column-major indexing */ + +/*----- Interlacing or not ------------------------------------------------*/ + +/* We should prefer the interlaced representation if the target is really + * 32-bit and only providing synthetic 64-bit integers. Alas, the Windows + * 64-bit ABI specifies that `long' is only 32-bits (i.e., it is IL32/LLP64), + * so detect x86 specifically. + */ +#if (ULONG_MAX >> 31) <= 0xffffffff && \ + !defined(__amd64__) && !defined(_M_AMD64) +# define KECCAK_I32 +#endif + +#ifdef KECCAK_I32 +/* A 32-bit target with at best weak support for 64-bit shifts. Maintain a + * lane as two 32-bit pieces representing the even and odd bits of the lane. + * There are slightly fiddly transformations to apply on the way in and out + * of the main permutation. + */ + +typedef keccak1600_lane_i32 lane; +#define S si32 + +static lane interlace(kludge64 x) +{ + /* Given a 64-bit string X, return a lane Z containing the even- and + * odd-numbered bits of X. + * + * This becomes more manageable if we look at what happens to the bit + * indices: bit i of X becomes bit ROR_6(i, 1) of Z. We can effectively + * swap two bits of the indices by swapping the object bits where those + * index bits differ. Fortunately, this is fairly easy. + * + * We arrange to swap bits between the two halves of X, rather than within + * a half. + */ + + uint32 x0 = LO64(x), x1 = HI64(x), t; + lane z; + /* 543210 */ + t = ((x0 >> 16) ^ x1)&0x0000ffff; x0 ^= t << 16; x1 ^= t; /* 453210 */ + t = ((x0 >> 8) ^ x1)&0x00ff00ff; x0 ^= t << 8; x1 ^= t; /* 354210 */ + t = ((x0 >> 4) ^ x1)&0x0f0f0f0f; x0 ^= t << 4; x1 ^= t; /* 254310 */ + t = ((x0 >> 2) ^ x1)&0x33333333; x0 ^= t << 2; x1 ^= t; /* 154320 */ + t = ((x0 >> 1) ^ x1)&0x55555555; x0 ^= t << 1; x1 ^= t; /* 054321 */ + z.even = x0; z.odd = x1; return (z); +} + +static kludge64 deinterlace(lane x) +{ + /* Given a lane X, return the combined 64-bit value. This is the inverse + * to `interlace' above, and the principle is the same + */ + + uint32 x0 = x.even, x1 = x.odd, t; + kludge64 z; + /* 054321 */ + t = ((x0 >> 1) ^ x1)&0x55555555; x0 ^= t << 1; x1 ^= t; /* 154320 */ + t = ((x0 >> 2) ^ x1)&0x33333333; x0 ^= t << 2; x1 ^= t; /* 254310 */ + t = ((x0 >> 4) ^ x1)&0x0f0f0f0f; x0 ^= t << 4; x1 ^= t; /* 354210 */ + t = ((x0 >> 8) ^ x1)&0x00ff00ff; x0 ^= t << 8; x1 ^= t; /* 453210 */ + t = ((x0 >> 16) ^ x1)&0x0000ffff; x0 ^= t << 16; x1 ^= t; /* 543210 */ + SET64(z, x1, x0); return (z); +} + +#define TO_LANE(x) (interlace(x)) +#define FROM_LANE(x) (deinterlace(x)) + +#define PRINTFMT_LANE "%08lx:%08lx" +#define PRINTARGS_LANE(x) (unsigned long)(x).even, (unsigned long)(x).odd + +#define BINOP_LANE(z, op, x, y) \ + ((z).even = (x).even op (y).even, (z).odd = (x).odd op (y).odd) +#define XOR_LANE(z, x, y) BINOP_LANE(z, ^, x, y) +#define AND_LANE(z, x, y) BINOP_LANE(z, &, x, y) +#define OR_LANE(z, x, y) BINOP_LANE(z, |, x, y) +#define NOT_LANE(z, x) ((z).even = ~(x).even, (z).odd = ~(x).odd) + +#define ROTL_LANE(z, x, n) do { \ + lane _t = (x); \ + (z).even = (n)%2 ? ROL32(_t.odd, ((n) + 1)/2) \ + : ROL32(_t.even, (n)/2); \ + (z).odd = (n)%2 ? ROL32(_t.even, ((n) - 1)/2) \ + : ROL32(_t.odd, (n)/2); \ +} while (0) + +#define LANE_ZERO { 0, 0 } +#define LANE_CMPL { 0xffffffff, 0xffffffff } + +static const lane rcon[24] = { + { 0x00000001, 0x00000000 }, { 0x00000000, 0x00000089 }, + { 0x00000000, 0x8000008b }, { 0x00000000, 0x80008080 }, + { 0x00000001, 0x0000008b }, { 0x00000001, 0x00008000 }, + { 0x00000001, 0x80008088 }, { 0x00000001, 0x80000082 }, + { 0x00000000, 0x0000000b }, { 0x00000000, 0x0000000a }, + { 0x00000001, 0x00008082 }, { 0x00000000, 0x00008003 }, + { 0x00000001, 0x0000808b }, { 0x00000001, 0x8000000b }, + { 0x00000001, 0x8000008a }, { 0x00000001, 0x80000081 }, + { 0x00000000, 0x80000081 }, { 0x00000000, 0x80000008 }, + { 0x00000000, 0x00000083 }, { 0x00000000, 0x80008003 }, + { 0x00000001, 0x80008088 }, { 0x00000000, 0x80000088 }, + { 0x00000001, 0x00008000 }, { 0x00000000, 0x80008082 } +}; + +#else +/* A target with good support for 64-bit shifts. We store lanes as 64-bit + * quantities and deal with them in the obvious, natural way. + */ + +typedef keccak1600_lane_64 lane; +#define S s64 + +#define TO_LANE(x) (x) +#define FROM_LANE(x) (x) + +#define PRINTFMT_LANE "%08lx%08lx" +#define PRINTARGS_LANE(x) (unsigned long)HI64(x), (unsigned long)LO64(x) + +#define XOR_LANE(z, x, y) XOR64((z), (x), (y)) +#define AND_LANE(z, x, y) AND64((z), (x), (y)) +#define OR_LANE(z, x, y) OR64((z), (x), (y)) +#define NOT_LANE(z, x) CPL64((z), (x)) +#define ROTL_LANE(z, x, n) ROL64_((z), (x), (n)) + +#define LANE_ZERO X64( 0, 0) +#define LANE_CMPL X64(ffffffff, ffffffff) + +static const lane rcon[24] = { + X64(00000000, 00000001), X64(00000000, 00008082), + X64(80000000, 0000808a), X64(80000000, 80008000), + X64(00000000, 0000808b), X64(00000000, 80000001), + X64(80000000, 80008081), X64(80000000, 00008009), + X64(00000000, 0000008a), X64(00000000, 00000088), + X64(00000000, 80008009), X64(00000000, 8000000a), + X64(00000000, 8000808b), X64(80000000, 0000008b), + X64(80000000, 00008089), X64(80000000, 00008003), + X64(80000000, 00008002), X64(80000000, 00000080), + X64(00000000, 0000800a), X64(80000000, 8000000a), + X64(80000000, 80008081), X64(80000000, 00008080), + X64(00000000, 80000001), X64(80000000, 80008008) +}; + +#endif + +/*----- Complementing or not ----------------------------------------------*/ + +/* We should use the complemented representation if the target doesn't have a + * fused and-not operation. There doesn't appear to be a principled way to + * do this, so we'll just have to make do with a big list. Worse, in my + * brief survey of the architecture reference manuals I have lying about, + * they've split close to 50/50 on this question, so I don't have an + * especially good way to pick a default. The `no-fused-op' architectures + * seem generally a bit more modern than the `fused-op' architectures, so I + * guess I'll make the complemented representation the default. + * + * and-not No and-not + * ------- ---------- + * ARM (`bic') x86/amd64 + * Sparc (`andn') z/Architecture + * MMIX (`andn') MIPS + * IA64 (`andcm') 68k + * VAX (`bic') RISC-V + * PDP-10 (`andc') + */ +#if !(defined(__arm__) || defined(__thumb__) || defined(__aarch64__) || \ + defined(_M_ARM) || defined(_M_THUMB)) && \ + !(defined(__ia64__) || defined(__ia64) || defined(__itanium__) || \ + defined(_M_IA64)) && \ + !defined(__mmix__) && \ + !(defined(__sparc__) || defined(__sparc)) && \ + !defined(__vax__) && \ + !defined(__pdp10__) +# define KECCAK_COMPL +#endif + +#ifdef KECCAK_COMPL +/* A target without fused and/not (`bic', `andc2'). We complement some of + * the lanes in the initial state and undo this on output. (Absorbing XORs + * input into the state, so this is unaffected.) See the handling of chi in + * `keccak1600_round' below for the details. + */ + +#define STATE_INIT(z) do { \ + lane cmpl = LANE_CMPL; \ + (z)->S[I(1, 0)] = cmpl; (z)->S[I(2, 0)] = cmpl; \ + (z)->S[I(3, 1)] = cmpl; (z)->S[I(2, 2)] = cmpl; \ + (z)->S[I(2, 3)] = cmpl; (z)->S[I(0, 4)] = cmpl; \ +} while (0) + +#define STATE_OUT(z) do { \ + NOT_LANE((z)->S[I(1, 0)], (z)->S[I(1, 0)]); \ + NOT_LANE((z)->S[I(2, 0)], (z)->S[I(2, 0)]); \ + NOT_LANE((z)->S[I(3, 1)], (z)->S[I(3, 1)]); \ + NOT_LANE((z)->S[I(2, 2)], (z)->S[I(2, 2)]); \ + NOT_LANE((z)->S[I(2, 3)], (z)->S[I(2, 3)]); \ + NOT_LANE((z)->S[I(0, 4)], (z)->S[I(0, 4)]); \ +} while (0) + +#else +/* A target with fused and/not (`bic', `andc2'). Everything is simple. */ + +#define STATE_INIT(z) do ; while (0) +#define STATE_OUT(z) do ; while (0) + +#endif + +/*----- Other magic constants ---------------------------------------------*/ + +/* The rotation constants. These are systematically named -- see `THETA_RHO' + * below. + */ +#define ROT_0_0 0 +#define ROT_1_0 1 +#define ROT_2_0 62 +#define ROT_3_0 28 +#define ROT_4_0 27 + +#define ROT_0_1 36 +#define ROT_1_1 44 +#define ROT_2_1 6 +#define ROT_3_1 55 +#define ROT_4_1 20 + +#define ROT_0_2 3 +#define ROT_1_2 10 +#define ROT_2_2 43 +#define ROT_3_2 25 +#define ROT_4_2 39 + +#define ROT_0_3 41 +#define ROT_1_3 45 +#define ROT_2_3 15 +#define ROT_3_3 21 +#define ROT_4_3 8 + +#define ROT_0_4 18 +#define ROT_1_4 2 +#define ROT_2_4 61 +#define ROT_3_4 56 +#define ROT_4_4 14 + +/*----- Debugging ---------------------------------------------------------*/ + +#ifdef KECCAK_DEBUG + +#include + +static void dump_state(const char *what, unsigned ir, + const keccak1600_state *x) +{ + unsigned i, j; + keccak1600_state y; + kludge64 a; + int sep; + + printf(";; %s [round %u]\n", what, ir); + printf(";; raw state...\n"); + for (j = 0; j < 5; j++) { + printf(";;"); + for (i = 0, sep = '\t'; i < 5; i++, sep = ' ') + printf("%c" PRINTFMT_LANE, sep, PRINTARGS_LANE(x->S[I(i, j)])); + fputc('\n', stdout); + } + y = *x; STATE_OUT(&y); +#ifdef KECCAK_COMPL + printf(";; uncomplemented state...\n"); + for (j = 0; j < 5; j++) { + printf(";;"); + for (i = 0, sep = '\t'; i < 5; i++, sep = ' ') + printf("%c" PRINTFMT_LANE, sep, PRINTARGS_LANE(y.S[I(i, j)])); + fputc('\n', stdout); + } +#endif +#ifdef KECCAK_I32 + printf(";; deinterlaced state...\n"); + for (j = 0; j < 5; j++) { + printf(";;"); + for (i = 0, sep = '\t'; i < 5; i++, sep = ' ') { + a = FROM_LANE(y.S[I(i, j)]); + printf("%c%08lx%08lx", sep, + (unsigned long)HI64(a), (unsigned long)LO64(a)); + } + fputc('\n', stdout); + } +#endif + fputc('\n', stdout); +} + +#endif + +/*----- The Keccak-p[1600, n] permutation ---------------------------------*/ + +static void keccak1600_round(keccak1600_state *z, + const keccak1600_state *x, unsigned i) +{ + /* Perform a round of Keccak-p[1600, n]. Process the state X and write the + * result to Z. + */ + + lane b[5], c[5], d[5], t; + + /* Theta, first step: calculate the column parities. */ +#define COLPARITY(j) do { \ + c[j] = x->S[I(j, 0)]; \ + XOR_LANE(c[j], c[j], x->S[I(j, 1)]); \ + XOR_LANE(c[j], c[j], x->S[I(j, 2)]); \ + XOR_LANE(c[j], c[j], x->S[I(j, 3)]); \ + XOR_LANE(c[j], c[j], x->S[I(j, 4)]); \ +} while (0) + COLPARITY(0); COLPARITY(1); COLPARITY(2); COLPARITY(3); COLPARITY(4); +#undef COLPARITY + + /* Theta, second step: calculate the combined effect. */ + ROTL_LANE(d[0], c[1], 1); XOR_LANE(d[0], d[0], c[4]); + ROTL_LANE(d[1], c[2], 1); XOR_LANE(d[1], d[1], c[0]); + ROTL_LANE(d[2], c[3], 1); XOR_LANE(d[2], d[2], c[1]); + ROTL_LANE(d[3], c[4], 1); XOR_LANE(d[3], d[3], c[2]); + ROTL_LANE(d[4], c[0], 1); XOR_LANE(d[4], d[4], c[3]); + + /* Now we work plane by plane through the output. To do this, we must undo + * the pi transposition. Pi maps (x', y') = (y, 2 x + 3 y), so y = x', and + * x = (y' - 3 y)/2 = 3 (y' - 3 x') = x' + 3 y'. + */ +#define THETA_RHO(i0, i1, i2, i3, i4) do { \ + \ + /* First, theta. */ \ + XOR_LANE(b[0], x->S[I(i0, 0)], d[i0]); \ + XOR_LANE(b[1], x->S[I(i1, 1)], d[i1]); \ + XOR_LANE(b[2], x->S[I(i2, 2)], d[i2]); \ + XOR_LANE(b[3], x->S[I(i3, 3)], d[i3]); \ + XOR_LANE(b[4], x->S[I(i4, 4)], d[i4]); \ + \ + /* Then rho. */ \ + ROTL_LANE(b[0], b[0], ROT_##i0##_0); \ + ROTL_LANE(b[1], b[1], ROT_##i1##_1); \ + ROTL_LANE(b[2], b[2], ROT_##i2##_2); \ + ROTL_LANE(b[3], b[3], ROT_##i3##_3); \ + ROTL_LANE(b[4], b[4], ROT_##i4##_4); \ +} while (0) + + /* The basic chi operation is: z = w ^ (~a&b), but this involves an + * inversion which we can mostly avoid by being clever: observe that + * + * w ^ (~a&~~b) = w ^ ~(a | ~b) = ~w ^ (a | ~b) + * + * by De Morgan's law. Furthermore, complementing w or z is basically + * equivalent. Bertoni, Daemen, Peeters, Van Assche, and Van Keer, `Keccak + * implementation overview', describe a pattern of lane complementation + * which propagates through theta and pi in exactly the right way to be + * restored easily by chi, here, with exactly one inversion per plane. + * + * Here's the pattern. + * + * [ * . * * . ] [ . * * . . ] + * [ * . * . . ] [ . . . * . ] + * [ * . * . . ] -> [ . . * . . ] + * [ . * . * * ] [ . . * . . ] + * [ * . . * . ] [ * . . . . ] + * + * where a `.' means that the lane is unchanged, and a `*' means that it + * has been complemented. + * + * The macros `CHI_wxy_z' calculate z in terms of w, x, y assuming that the + * inputs w, x, y marked with a `1' are complemented on input, and arrange + * for z to be complemented on output if z is so marked. + * + * The diagrams to the right show the fragment of the complementation + * pattern being handled by the corresponding line of code. A symbol in + * brackets indicates a deviation from the input pattern forced by explicit + * complementation: there will be exactly one of these for each plane. + */ +#ifdef KECCAK_COMPL +# define CHI_COMPL(z, x) NOT_LANE((z), (x)) +# define CHI_001_1(z, w, x, y) \ + (OR_LANE((z), (x), (y)), XOR_LANE((z), (z), (w))) +# define CHI_010_0(z, w, x, y) \ + (AND_LANE((z), (x), (y)), XOR_LANE((z), (z), (w))) +# define CHI_101_0 CHI_001_1 +# define CHI_110_1 CHI_010_0 +#else +# define CHI(z, w, x, y) \ + (NOT_LANE((z), (x)), \ + AND_LANE((z), (z), (y)), \ + XOR_LANE((z), (z), (w))) +# define CHI_COMPL(z, x) ((z) = (x)) +# define CHI_001_1 CHI +# define CHI_010_0 CHI +# define CHI_101_0 CHI +# define CHI_110_1 CHI +#endif + + /* Let's do the y' = 0 plane first. Theta and rho are easy with our macro, + * and we've done pi with the coordinate hacking. That leaves chi next. + * This is hairy because we must worry about complementation. + */ + THETA_RHO(0, 1, 2, 3, 4); + CHI_COMPL(t, b[2]); /* [.] */ + CHI_101_0(z->S[I(0, 0)], b[0], b[1], b[2]); /* * . * -> . */ + CHI_001_1(z->S[I(1, 0)], b[1], t, b[3]); /* . [.] * -> * */ + CHI_110_1(z->S[I(2, 0)], b[2], b[3], b[4]); /* * * . -> * */ + CHI_101_0(z->S[I(3, 0)], b[3], b[4], b[0]); /* * * . -> . */ + CHI_010_0(z->S[I(4, 0)], b[4], b[0], b[1]); /* * . . -> . */ + + /* We'd better do iota before we forget. */ + XOR_LANE(z->S[I(0, 0)], z->S[I(0, 0)], rcon[i]); + + /* That was fun. Maybe y' = 1 will be as good. */ + THETA_RHO(3, 4, 0, 1, 2); + CHI_COMPL(t, b[4]); /* [*] */ + CHI_101_0(z->S[I(0, 1)], b[0], b[1], b[2]); /* * . * -> . */ + CHI_010_0(z->S[I(1, 1)], b[1], b[2], b[3]); /* . * . -> . */ + CHI_101_0(z->S[I(2, 1)], b[2], b[3], t); /* * . [*] -> . */ + CHI_001_1(z->S[I(3, 1)], b[3], b[4], b[0]); /* * . . -> * */ + CHI_010_0(z->S[I(4, 1)], b[4], b[0], b[1]); /* * . . -> . */ + + /* We're getting the hang of this. The y' = 2 plane shouldn't be any + * trouble. + */ + THETA_RHO(1, 2, 3, 4, 0); + CHI_COMPL(t, b[3]); /* [*] */ + CHI_101_0(z->S[I(0, 2)], b[0], b[1], b[2]); /* * . * -> . */ + CHI_010_0(z->S[I(1, 2)], b[1], b[2], b[3]); /* . * . -> . */ + CHI_110_1(z->S[I(2, 2)], b[2], t, b[4]); /* * [*] . -> * */ + CHI_101_0(z->S[I(3, 2)], t, b[4], b[0]); /* * [*] . -> . */ + CHI_010_0(z->S[I(4, 2)], b[4], b[0], b[1]); /* * . . -> . */ + + /* This isn't as interesting any more. Let's do y' = 3 before boredom sets + * in. + */ + THETA_RHO(4, 0, 1, 2, 3); + CHI_COMPL(t, b[3]); /* [.] */ + CHI_010_0(z->S[I(0, 3)], b[0], b[1], b[2]); /* . * . -> . */ + CHI_101_0(z->S[I(1, 3)], b[1], b[2], b[3]); /* * . * -> . */ + CHI_001_1(z->S[I(2, 3)], b[2], t, b[4]); /* . [.] * -> * */ + CHI_010_0(z->S[I(3, 3)], t, b[4], b[0]); /* . [.] * -> . */ + CHI_101_0(z->S[I(4, 3)], b[4], b[0], b[1]); /* . * * -> . */ + + /* Last plane. Just y' = 4 to go. */ + THETA_RHO(2, 3, 4, 0, 1); + CHI_COMPL(t, b[1]); /* [*] */ + CHI_110_1(z->S[I(0, 4)], b[0], t, b[2]); /* * [*] . -> * */ + CHI_101_0(z->S[I(1, 4)], t, b[2], b[3]); /* [*] . * -> . */ + CHI_010_0(z->S[I(2, 4)], b[2], b[3], b[4]); /* . * . -> . */ + CHI_101_0(z->S[I(3, 4)], b[3], b[4], b[0]); /* * * . -> . */ + CHI_010_0(z->S[I(4, 4)], b[4], b[0], b[1]); /* * . . -> . */ + + /* And we're done. */ +#undef THETA_RHO +#undef CHI_COMPL +#undef CHI_001_1 +#undef CHI_010_0 +#undef CHI_101_0 +#undef CHI_110_1 +#undef CHI +} + +/* --- @keccak1600_p@ --- * + * + * Arguments: @keccak1600_state *z@ = where to write the output state + * @conts keccak1600_state *x@ = input state + * @unsigned n@ = number of rounds to perform + * + * Returns: --- + * + * Use: Implements the %$\Keccak[1600, n]$% permutation at the core + * of Keccak and the SHA-3 standard. + */ + +void keccak1600_p(keccak1600_state *z, const keccak1600_state *x, unsigned n) +{ + keccak1600_state u, v; + unsigned i = 0; + +#ifdef KECCAK_DEBUG + dump_state("init", 0, x); +#endif + keccak1600_round(&u, x, i++); n--; + while (n > 8) { + keccak1600_round(&v, &u, i++); + keccak1600_round(&u, &v, i++); + keccak1600_round(&v, &u, i++); + keccak1600_round(&u, &v, i++); + keccak1600_round(&v, &u, i++); + keccak1600_round(&u, &v, i++); + keccak1600_round(&v, &u, i++); + keccak1600_round(&u, &v, i++); + n -= 8; + } + switch (n) { + case 7: keccak1600_round(&v, &u, i++); + keccak1600_round(&u, &v, i++); + case 5: keccak1600_round(&v, &u, i++); + keccak1600_round(&u, &v, i++); + case 3: keccak1600_round(&v, &u, i++); + keccak1600_round(&u, &v, i++); + case 1: keccak1600_round(z, &u, i++); + break; + case 8: keccak1600_round(&v, &u, i++); + keccak1600_round(&u, &v, i++); + case 6: keccak1600_round(&v, &u, i++); + keccak1600_round(&u, &v, i++); + case 4: keccak1600_round(&v, &u, i++); + keccak1600_round(&u, &v, i++); + case 2: keccak1600_round(&v, &u, i++); + keccak1600_round(z, &v, i++); + break; + } +#ifdef KECCAK_DEBUG + dump_state("final", 0, z); +#endif +} + +/* --- @keccack1600_init@ --- * + * + * Arguments: @keccak1600_state *s@ = a state to initialize + * + * Returns: --- + * + * Use: Initialize @s@ to the root state. + */ + +void keccak1600_init(keccak1600_state *s) + { memset(s->S, 0, sizeof(s->S)); STATE_INIT(s); } + +/* --- @keccak1600_mix@ --- * + * + * Arguments: @keccak1600_state *s@ = a state to update + * @const kludge64 *p@ = pointer to 64-bit words to mix in + * @size_t n@ = size of the input, in 64-bit words + * + * Returns: --- + * + * Use: Mixes data into a %$\Keccak[r, 1600 - r]$% state. Note that + * it's the caller's responsibility to pass in no more than + * %$r$% bits of data. + */ + +void keccak1600_mix(keccak1600_state *s, const kludge64 *p, size_t n) +{ + unsigned i; + lane a; + + for (i = 0; i < n; i++) + { a = TO_LANE(p[i]); XOR_LANE(s->S[i], s->S[i], a); } +} + +/* --- @keccak1600_extract@ --- * + * + * Arguments: @const keccak1600_state *s@ = a state to extract output from + * @kludge64 *p@ = pointer to 64-bit words to write + * @size_t n@ = size of the output, in 64-bit words + * + * Returns: --- + * + * Use: Reads output from a %$\Keccak[r, 1600 - r]$% state. Note + * that it's the caller's responsibility to extract no more than + * %$r$% bits of data. + */ + +void keccak1600_extract(const keccak1600_state *s, kludge64 *p, size_t n) +{ + unsigned i; + keccak1600_state t; + + t = *s; STATE_OUT(&t); + for (i = 0; i < n; i++) p[i] = FROM_LANE(t.S[i]); +} + +/*----- Test rig ----------------------------------------------------------*/ + +#ifdef TEST_RIG + +#include + +#include +#include +#include + +static int vrf_p(dstr v[]) +{ + keccak1600_state u; + kludge64 t[25]; + dstr d = DSTR_INIT; + int n; + unsigned i; + int ok = 1; + + if (v[0].len != 200) die(1, "bad input size"); + if (v[2].len != 200) die(1, "bad output size"); + n = *(int *)v[1].buf; + dstr_ensure(&d, 200); d.len = 200; + + keccak1600_init(&u); + for (i = 0; i < 25; i++) LOAD64_L_(t[i], v[0].buf + 8*i); + keccak1600_mix(&u, t, 25); + keccak1600_p(&u, &u, n); + keccak1600_extract(&u, t, 25); + for (i = 0; i < 25; i++) STORE64_L_(d.buf + 8*i, t[i]); + if (memcmp(d.buf, v[2].buf, 200) != 0) { + ok = 0; + fprintf(stderr, "failed!"); + fprintf(stderr, "\n\t input = "); type_hex.dump(&v[0], stderr); + fprintf(stderr, "\n\t rounds = %d", n); + fprintf(stderr, "\n\t expected = "); type_hex.dump(&v[2], stderr); + fprintf(stderr, "\n\t calclated = "); type_hex.dump(&d, stderr); + } + + dstr_destroy(&d); + return (ok); +} + +static test_chunk defs[] = { + { "p", vrf_p, { &type_hex, &type_int, &type_hex } }, + { 0, 0, { 0 } } +}; + +int main(int argc, char *argv[]) +{ + test_run(argc, argv, defs, SRCDIR"/t/keccak1600"); + return (0); +} + +#endif + +/*----- That's all, folks -------------------------------------------------*/ diff --git a/symm/keccak1600.h b/symm/keccak1600.h new file mode 100644 index 00000000..2867be96 --- /dev/null +++ b/symm/keccak1600.h @@ -0,0 +1,137 @@ +/* -*-c-*- + * + * The Keccak-p[1600, n] permutation + * + * (c) 2017 Straylight/Edgeware + */ + +/*----- Licensing notice --------------------------------------------------* + * + * This file is part of Catacomb. + * + * Catacomb is free software; you can redistribute it and/or modify + * it under the terms of the GNU Library General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * Catacomb is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with Catacomb; if not, write to the Free + * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + * MA 02111-1307, USA. + */ + +#ifndef CATACOMB_KECCAK1600_H +#define CATACOMB_KECCAK1600_H + +#ifdef __cplusplus + extern "C" { +#endif + +/*----- Notes on the Keccak-p[1600, n] permutation ------------------------* + * + * Keccak, by Guido Bertoni, Joan Daemen, Michaël Peeters, and Gilles Van + * Assche, was the winning submission in NIST's competition to design a new + * hash function, and is now the basis of the SHA3 standard. + * + * The cryptographic primitive, Keccak-p[1600, n], is an unkeyed permutation + * on strings of 1600 bits. It's used in the authors' `sponge construction', + * which splits a 1600-bit state into an `inner part', whose size is known as + * the `capacity', and an `outer part', whose size is known as the `rate'. + * The security of the construction is determined by the capacity, and its + * performance is determined by the rate. + * + * This file just implements the permutation and some accessors for the state + * (which may be represented internally in a strange way, for performance + * reasons). Many useful things are defined along with SHA3. + */ + +/*----- Header files ------------------------------------------------------*/ + +#include + +/*----- Data structures ---------------------------------------------------*/ + +/* There are two possible internal representations for a lane in the state. + * Which one is in use isn't specified here. The context simply allows space + * for either. + */ +typedef kludge64 keccak1600_lane_64; +typedef struct { uint32 even, odd; } keccak1600_lane_i32; + +typedef union keccak1600_state { + keccak1600_lane_64 s64[25]; + keccak1600_lane_i32 si32[25]; +} keccak1600_state; + +/*----- Functions provided ------------------------------------------------*/ + +/* --- @keccak1600_p@ --- * + * + * Arguments: @keccak1600_state *z@ = where to write the output state + * @conts keccak1600_state *x@ = input state + * @unsigned n@ = number of rounds to perform + * + * Returns: --- + * + * Use: Implements the %$\Keccak[1600, n]$% permutation at the core + * of Keccak and the SHA-3 standard. + */ + +extern void keccak1600_p(keccak1600_state */*z*/, + const keccak1600_state */*x*/, unsigned /*n*/); + +/* --- @keccack1600_init@ --- * + * + * Arguments: @keccak1600_state *s@ = a state to initialize + * + * Returns: --- + * + * Use: Initialize @s@ to the root state. + */ + +extern void keccak1600_init(keccak1600_state */*s*/); + +/* --- @keccak1600_mix@ --- * + * + * Arguments: @keccak1600_state *s@ = a state to update + * @const kludge64 *p@ = pointer to 64-bit words to mix in + * @size_t n@ = size of the input, in 64-bit words + * + * Returns: --- + * + * Use: Mixes data into a %$\Keccak[r, 1600 - r]$% state. Note that + * it's the caller's responsibility to pass in no more than + * %$r$% bits of data. + */ + +extern void keccak1600_mix(keccak1600_state */*s*/, + const kludge64 */*p*/, size_t /*n*/); + +/* --- @keccak1600_extract@ --- * + * + * Arguments: @const keccak1600_state *s@ = a state to extract output from + * @kludge64 *p@ = pointer to 64-bit words to write + * @size_t n@ = size of the output, in 64-bit words + * + * Returns: --- + * + * Use: Reads output from a %$\Keccak[r, 1600 - r]$% state. Note + * that it's the caller's responsibility to extract no more than + * %$r$% bits of data. + */ + +extern void keccak1600_extract(const keccak1600_state */*s*/, + kludge64 */*p*/, size_t /*n*/); + +/*----- That's all, folks -------------------------------------------------*/ + +#ifdef __cplusplus + } +#endif + +#endif diff --git a/symm/t/keccak1600 b/symm/t/keccak1600 new file mode 100644 index 00000000..2b580243 --- /dev/null +++ b/symm/t/keccak1600 @@ -0,0 +1,12 @@ +# Test vectors for Keccak[1600, n]. + +p { + ## From the KeccakCodePackage, + ## TestVectors/KeccakF-1600-IntermediateValues.txt. + 0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + 24 + e7dde140798f25f18a47c033f9ccd584eea95aa61e2698d54d49806f304715bd57d05362054e288bd46f8e7f2da497ffc44746a4a0e5fe90762e19d60cda5b8c9c05191bf7a630ad64fc8fd0b75a933035d617233fa95aeb0321710d26e6a6a95f55cfdb167ca58126c84703cd31b8439f56a5111a2ff20161aed9215a63e505f270c98cf2febe641166c47b95703661cb0ed04f555a7cb8c832cf1c8ae83e8c14263aae22790c94e409c5a224f94118c26504e72635f5163ba1307fe944f67549a2ec5c7bfff1ea; + e7dde140798f25f18a47c033f9ccd584eea95aa61e2698d54d49806f304715bd57d05362054e288bd46f8e7f2da497ffc44746a4a0e5fe90762e19d60cda5b8c9c05191bf7a630ad64fc8fd0b75a933035d617233fa95aeb0321710d26e6a6a95f55cfdb167ca58126c84703cd31b8439f56a5111a2ff20161aed9215a63e505f270c98cf2febe641166c47b95703661cb0ed04f555a7cb8c832cf1c8ae83e8c14263aae22790c94e409c5a224f94118c26504e72635f5163ba1307fe944f67549a2ec5c7bfff1ea + 24 + 3ccb6ef94d955c2d6db55770d02c336a6c6bd770128d3d0994d06955b2d9208a56f1e7e5994f9c4f38fb65daa2b957f90daf7512ae3d7785f710d8c347f2f4fa59879af7e69e1b1f25b498ee0fccfee4a168ceb9b661ce684f978fbac466eadef5b1af6e833dc433d9db1927045406e065128309f0a9f87c434717bfa64954fd404b99d833addd9774e70b5dfcd5ea483cb0b755eec8b8e3e9429e646e22a0917bddbae729310e90e8cca3fac59e2a20b63d1c4e4602345b59104ca4624e9f605cbf8f6ad26cd020; +} diff --git a/utils/keccak-toy b/utils/keccak-toy new file mode 100755 index 00000000..f77f875b --- /dev/null +++ b/utils/keccak-toy @@ -0,0 +1,213 @@ +#! /usr/bin/python + +import itertools as I + +INTERLACE = 1 +FAKE_INTERLACE = 2 +COMPLEMENT = 4 +COMPLHACK = 8 +OPTIONS = 0 + +def lfsr(): + poly = 0x171 + a = 1 + while True: + yield a&1 + a <<= 1 + if a&0x100: a ^= poly + +M32 = (1 << 32) - 1 +M64 = (1 << 64) - 1 +BEBIGOKIMISA = [(1, 0), (2, 0), (3, 1), (2, 2), (2, 3), (0, 4)] +def rotl(x, n): return ((x << n) | (x >> 64 - n))&M64 +def rotl_32(x, n): return ((x << n) | (x >> 32 - n))&M32 + +def interlace(x): + x0, x1 = x&M32, (x >> 32)&M32 # 543210 + t = ((x0 >> 16) ^ x1)&0x0000ffff; x0 ^= t << 16; x1 ^= t # 453210 + t = ((x0 >> 8) ^ x1)&0x00ff00ff; x0 ^= t << 8; x1 ^= t # 354210 + t = ((x0 >> 4) ^ x1)&0x0f0f0f0f; x0 ^= t << 4; x1 ^= t # 254310 + t = ((x0 >> 2) ^ x1)&0x33333333; x0 ^= t << 2; x1 ^= t # 154320 + t = ((x0 >> 1) ^ x1)&0x55555555; x0 ^= t << 1; x1 ^= t # 054321 + return x0, x1 + +def deinterlace((x0, x1)): + t = ((x0 >> 1) ^ x1)&0x55555555; x0 ^= t << 1; x1 ^= t # 154320 + t = ((x0 >> 2) ^ x1)&0x33333333; x0 ^= t << 2; x1 ^= t # 254310 + t = ((x0 >> 4) ^ x1)&0x0f0f0f0f; x0 ^= t << 4; x1 ^= t # 354210 + t = ((x0 >> 8) ^ x1)&0x00ff00ff; x0 ^= t << 8; x1 ^= t # 453210 + t = ((x0 >> 16) ^ x1)&0x0000ffff; x0 ^= t << 16; x1 ^= t # 543210 + return x0 | (x1 << 32) + +def identity(x): return x + +RC = 24*[0] +bits = lfsr() +for i in xrange(24): + for j in xrange(7): + RC[i] |= bits.next() << (1 << j) - 1 +print 'Round constants...' +for i, rc in enumerate(RC): + rc0, rc1 = interlace(rc) + print '%2d: 0x%016x = 0x%08x, 0x%08x' % (i, rc, rc0, rc1) + +ROT = [5*[0] for i in xrange(5)] +x, y = 1, 0 +for t in xrange(24): + ROT[x][y] = ((t + 1)*(t + 2)/2)%64 + x, y = y, (2*x + 3*y)%5 +print '\nRotations...' +for y in xrange(2, -3, -1): + print '%2d: %s' % (y, ', '.join('%3d' % ROT[x%5][y%5] + for x in xrange(-2, 3))) + +def print_state(A): + if OPTIONS & (INTERLACE | FAKE_INTERLACE): + fn = (OPTIONS & FAKE_INTERLACE) and interlace or identity + for y in xrange(5): + print '%2d: %s' % (y, ' '.join('%08x:%08x' % fn(A[x%5][y%5]) + for x in xrange(5))) + else: + for y in xrange(5): + print '%2d: %s' % (y, ' '.join('%016x' % (A[x%5][y%5] ^ + ROOT[x%5][y%5]) + for x in xrange(5))) + +def p(what, A): + print '\n%s...' % what + print_state(A) + return A + +def statemap(fn, A): + return [[fn(A[x][y]) for y in xrange(5)] for x in xrange(5)] + +if OPTIONS & INTERLACE: + + def to_interlace(A): return statemap(interlace, A) + def from_interlace(A): return statemap(deinterlace, A) + + def theta(A): + C = [reduce(lambda a, b: (a[0] ^ b[0], a[1] ^ b[1]), + (A[x][y] for y in xrange(5))) + for x in xrange(5)] + D = [(C[(x - 1)%5][0] ^ rotl_32(C[(x + 1)%5][1], 1), + C[(x - 1)%5][1] ^ C[(x + 1)%5][0]) + for x in xrange(5)] + return p('theta', [[(A[x][y][0] ^ D[x][0], A[x][y][1] ^ D[x][1]) + for y in xrange(5)] for x in xrange(5)]) + + def rho(A): + def f((a0, a1), n): + if n%2 == 0: return rotl_32(a0, n/2), rotl_32(a1, n/2) + else: return rotl_32(a1, (n + 1)/2), rotl_32(a0, (n - 1)/2) + return p('rho', [[f(A[x][y], ROT[x][y]) + for y in xrange(5)] for x in xrange(5)]) + + def pi(A): + ## x' = y, y' = 2 x + 3 y + ## y = x', x = (y' - 3 x')/2 = 3 y' - 4 x' = x' + 3 y' + return p('pi', [[(A[(x + 3*y)%5][x][0], A[(x + 3*y)%5][x][1]) + for y in xrange(5)] for x in xrange(5)]) + + def chi(A): + return p('chi', [[(A[x][y][0] ^ (~A[(x + 1)%5][y][0]&A[(x + 2)%5][y][0]), + A[x][y][1] ^ (~A[(x + 1)%5][y][1]&A[(x + 2)%5][y][1])) + for y in xrange(5)] for x in xrange(5)]) + + def iota(A, i): + rc = interlace(RC[i]) + return p('iota[%d]' % i, [[(A[x][y][0] ^ (x == y == 0 and rc[0] or 0), + A[x][y][1] ^ (x == y == 0 and rc[1] or 0)) + for y in xrange(5)] for x in xrange(5)]) + + def round(A, i): + return iota(chi(pi(rho(theta(A)))), i) + + ROOT = [5*[(0, 0)] for i in xrange(5)] + +else: + + def theta(A): + C = [reduce(lambda a, b: a ^ b, (A[x][y] for y in xrange(5))) + for x in xrange(5)] + D = [C[(x - 1)%5] ^ rotl(C[(x + 1)%5], 1) for x in xrange(5)] + return p('theta', [[A[x][y] ^ D[x] + for y in xrange(5)] for x in xrange(5)]) + + def rho(A): + return p('rho', [[rotl(A[x][y], ROT[x][y]) + for y in xrange(5)] for x in xrange(5)]) + + def pi(A): + ## x' = y, y' = 2 x + 3 y + ## y = x', x = (y' - 3 x')/2 = 3 y' - 4 x' = x' + 3 y' + return p('pi', [[A[(x + 3*y)%5][x] + for y in xrange(5)] for x in xrange(5)]) + + if OPTIONS & COMPLEMENT: + def chi(A): + Z = [5*[None] for i in xrange(5)] + + ## a ^ ( b | ~c) = ~z | . . * -> * + ## a ^ (~b & c) = z | . * . -> . + ## ~a ^ ( b | ~c) = z | * . * -> . + ## ~a ^ (~b & c) = ~z | * * . -> * + + Z[0][0] = A[0][0] ^ ( A[1][0] | A[2][0]) # * . * -> . + Z[1][0] = A[1][0] ^ (~A[2][0] | A[3][0]) # . [.] * -> * + Z[2][0] = A[2][0] ^ ( A[3][0] & A[4][0]) # * * . -> * + Z[3][0] = A[3][0] ^ ( A[4][0] | A[0][0]) # * * . -> . + Z[4][0] = A[4][0] ^ ( A[0][0] & A[1][0]) # * . . -> . + + Z[0][1] = A[0][1] ^ ( A[1][1] | A[2][1]) # * . * -> . + Z[1][1] = A[1][1] ^ ( A[2][1] & A[3][1]) # . * . -> . + Z[2][1] = A[2][1] ^ ( A[3][1] | ~A[4][1]) # * . [*] -> . + Z[3][1] = A[3][1] ^ ( A[4][1] | A[0][1]) # * . . -> * + Z[4][1] = A[4][1] ^ ( A[0][1] & A[1][1]) # * . . -> . + + t = ~A[3][2] # [*] + Z[0][2] = A[0][2] ^ ( A[1][2] | A[2][2]) # * . * -> . + Z[1][2] = A[1][2] ^ ( A[2][2] & A[3][2]) # . * . -> . + Z[2][2] = A[2][2] ^ ( t & A[4][2]) # * [*] . -> * + Z[3][2] = t ^ ( A[4][2] | A[0][2]) # * [*] . -> . + Z[4][2] = A[4][2] ^ ( A[0][2] & A[1][2]) # * . . -> . + + t = ~A[3][3] # [.] + Z[0][3] = A[0][3] ^ ( A[1][3] & A[2][3]) # . * . -> . + Z[1][3] = A[1][3] ^ ( A[2][3] | A[3][3]) # * . * -> . + Z[2][3] = A[2][3] ^ ( t | A[4][3]) # . [.] * -> * + Z[3][3] = t ^ ( A[4][3] & A[0][3]) # . [.] * -> . + Z[4][3] = A[4][3] ^ ( A[0][3] | A[1][3]) # . * * -> . + + t = ~A[1][4] # [*] + Z[0][4] = A[0][4] ^ ( t & A[2][4]) # * [*] . -> * + Z[1][4] = t ^ ( A[2][4] | A[3][4]) # [*] . * -> . + Z[2][4] = A[2][4] ^ ( A[3][4] & A[4][4]) # . * . -> . + Z[3][4] = A[3][4] ^ ( A[4][4] | A[0][4]) # * * . -> . + Z[4][4] = A[4][4] ^ ( A[0][4] & A[1][4]) # * . . -> . + + return p('chi', statemap(lambda i: i&M64, Z)) + else: + def chi(A): + return p('chi', [[A[x][y] ^ (~A[(x + 1)%5][y]&A[(x + 2)%5][y]) + for y in xrange(5)] for x in xrange(5)]) + + def iota(A, i): + return p('iota[%d]' % i, [[A[x][y] ^ (x == y == 0 and RC[i] or 0) + for y in xrange(5)] for x in xrange(5)]) + + def round(A, i): + return iota(chi(pi(rho(theta(A)))), i) + + if OPTIONS & COMPLEMENT: + ROOT = [[(x, y) in BEBIGOKIMISA and M64 or 0 + for y in xrange(5)] for x in xrange(5)] + else: + ROOT = [5*[0] for i in xrange(5)] + +def keccak1600_p(A, n): + for i in xrange(n): A = round(A, i) + return p('done', A) + +p('init', ROOT) +keccak1600_p(ROOT, 24) -- 2.11.0