mdw@git.distorted.org.uk Git - catacomb/blob - symm/keccak1600.c

   1 /* -*-c-*-
   2  *
   3  * The Keccak-p[1600, n] permutation
   4  *
   5  * (c) 2017 Straylight/Edgeware
   6  */
   7
   8 /*----- Licensing notice --------------------------------------------------*
   9  *
  10  * This file is part of Catacomb.
  11  *
  12  * Catacomb is free software; you can redistribute it and/or modify
  13  * it under the terms of the GNU Library General Public License as
  14  * published by the Free Software Foundation; either version 2 of the
  15  * License, or (at your option) any later version.
  16  *
  17  * Catacomb is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20  * GNU Library General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Library General Public
  23  * License along with Catacomb; if not, write to the Free
  24  * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25  * MA 02111-1307, USA.
  26  */
  27
  28 /*----- Header files ------------------------------------------------------*/
  29
  30 #include <limits.h>
  31 #include <string.h>
  32
  33 #include <mLib/bits.h>
  34
  35 #include "keccak1600.h"
  36 #include "permute.h"
  37
  38 /* #define KECCAK_DEBUG */
  39
  40 /*----- Miscellaneous utilities -------------------------------------------*/
  41
  42 #define I(x, y) ((x) + 5*(y))           /* Column-major indexing */
  43
  44 /*----- Interlacing or not ------------------------------------------------*/
  45
  46 /* We should prefer the interlaced representation if the target is really
  47  * 32-bit and only providing synthetic 64-bit integers.  Alas, the Windows
  48  * 64-bit ABI specifies that `long' is only 32-bits (i.e., it is IL32/LLP64),
  49  * so detect x86 specifically.
  50  */
  51 #if (ULONG_MAX >> 31) <= 0xffffffff && \
  52   !defined(__amd64__) && !defined(_M_AMD64)
  53 #  define KECCAK_I32
  54 #endif
  55
  56 #ifdef KECCAK_I32
  57 /* A 32-bit target with at best weak support for 64-bit shifts.  Maintain a
  58  * lane as two 32-bit pieces representing the even and odd bits of the lane.
  59  * There are slightly fiddly transformations to apply on the way in and out
  60  * of the main permutation.
  61  */
  62
  63 typedef keccak1600_lane_i32 lane;
  64 #define S si32
  65
  66 static lane interlace(kludge64 x)
  67 {
  68   /* Given a 64-bit string X, return a lane Z containing the even- and
  69    * odd-numbered bits of X.
  70    */
  71
  72 typedef uint32 regty;
  73 #define REGWD 32
  74
  75   uint32 x0 = LO64(x), x1 = HI64(x);
  76   lane z;
  77                                         /* 5, 4, 3, 2, 1, 0 */
  78   TWIZZLE_EXCH(x1, x0, 4);              /* 4, 5, 3, 2, 1, 0 */
  79   TWIZZLE_EXCH(x1, x0, 3);              /* 3, 5, 4, 2, 1, 0 */
  80   TWIZZLE_EXCH(x1, x0, 2);              /* 2, 5, 4, 3, 1, 0 */
  81   TWIZZLE_EXCH(x1, x0, 1);              /* 1, 5, 4, 3, 2, 0 */
  82   TWIZZLE_EXCH(x1, x0, 0);              /* 0, 5, 4, 3, 2, 1 */
  83   z.even = x0; z.odd = x1; return (z);
  84
  85 #undef REGWD
  86 }
  87
  88 static kludge64 deinterlace(lane x)
  89 {
  90   /* Given a lane X, return the combined 64-bit value.  This is the inverse
  91    * to `interlace' above, and the principle is the same
  92    */
  93
  94 typedef uint32 regty;
  95 #define REGWD 32
  96
  97   uint32 x0 = x.even, x1 = x.odd;
  98   kludge64 z;
  99                                         /* 0, 5, 4, 3, 2, 1 */
 100   TWIZZLE_EXCH(x1, x0, 0);              /* 1, 5, 4, 3, 2, 0 */
 101   TWIZZLE_EXCH(x1, x0, 1);              /* 2, 5, 4, 3, 1, 0 */
 102   TWIZZLE_EXCH(x1, x0, 2);              /* 3, 5, 4, 2, 1, 0 */
 103   TWIZZLE_EXCH(x1, x0, 3);              /* 4, 5, 3, 2, 1, 0 */
 104   TWIZZLE_EXCH(x1, x0, 4);              /* 5, 4, 3, 2, 1, 0 */
 105   SET64(z, x1, x0); return (z);
 106
 107 #undef REGWD
 108 }
 109
 110 #define TO_LANE(x) (interlace(x))
 111 #define FROM_LANE(x) (deinterlace(x))
 112
 113 #define PRINTFMT_LANE "%08lx:%08lx"
 114 #define PRINTARGS_LANE(x) (unsigned long)(x).even, (unsigned long)(x).odd
 115
 116 #define BINOP_LANE(z, op, x, y)                                         \
 117   ((z).even = (x).even op (y).even, (z).odd = (x).odd op (y).odd)
 118 #define XOR_LANE(z, x, y) BINOP_LANE(z, ^, x, y)
 119 #define AND_LANE(z, x, y) BINOP_LANE(z, &, x, y)
 120 #define OR_LANE(z, x, y) BINOP_LANE(z, |, x, y)
 121 #define NOT_LANE(z, x) ((z).even = ~(x).even, (z).odd = ~(x).odd)
 122
 123 #define ROTL_LANE(z, x, n) do {                                         \
 124   lane _t = (x);                                                        \
 125   (z).even = (n)%2 ? ROL32(_t.odd,  ((n) + 1)/2)                        \
 126                    : ROL32(_t.even,  (n)/2);                            \
 127   (z).odd  = (n)%2 ? ROL32(_t.even, ((n) - 1)/2)                        \
 128                    : ROL32(_t.odd,   (n)/2);                            \
 129 } while (0)
 130
 131 #define LANE_ZERO {          0,          0 }
 132 #define LANE_CMPL { 0xffffffff, 0xffffffff }
 133
 134 static const lane rcon[24] = {
 135   { 0x00000001, 0x00000000 }, { 0x00000000, 0x00000089 },
 136   { 0x00000000, 0x8000008b }, { 0x00000000, 0x80008080 },
 137   { 0x00000001, 0x0000008b }, { 0x00000001, 0x00008000 },
 138   { 0x00000001, 0x80008088 }, { 0x00000001, 0x80000082 },
 139   { 0x00000000, 0x0000000b }, { 0x00000000, 0x0000000a },
 140   { 0x00000001, 0x00008082 }, { 0x00000000, 0x00008003 },
 141   { 0x00000001, 0x0000808b }, { 0x00000001, 0x8000000b },
 142   { 0x00000001, 0x8000008a }, { 0x00000001, 0x80000081 },
 143   { 0x00000000, 0x80000081 }, { 0x00000000, 0x80000008 },
 144   { 0x00000000, 0x00000083 }, { 0x00000000, 0x80008003 },
 145   { 0x00000001, 0x80008088 }, { 0x00000000, 0x80000088 },
 146   { 0x00000001, 0x00008000 }, { 0x00000000, 0x80008082 }
 147 };
 148
 149 #else
 150 /* A target with good support for 64-bit shifts.  We store lanes as 64-bit
 151  * quantities and deal with them in the obvious, natural way.
 152  */
 153
 154 typedef keccak1600_lane_64 lane;
 155 #define S s64
 156
 157 #define TO_LANE(x) (x)
 158 #define FROM_LANE(x) (x)
 159
 160 #define PRINTFMT_LANE "%08lx%08lx"
 161 #define PRINTARGS_LANE(x) (unsigned long)HI64(x), (unsigned long)LO64(x)
 162
 163 #define XOR_LANE(z, x, y) XOR64((z), (x), (y))
 164 #define AND_LANE(z, x, y) AND64((z), (x), (y))
 165 #define OR_LANE(z, x, y) OR64((z), (x), (y))
 166 #define NOT_LANE(z, x) CPL64((z), (x))
 167 #define ROTL_LANE(z, x, n) ROL64_((z), (x), (n))
 168
 169 #define LANE_ZERO X64(       0,        0)
 170 #define LANE_CMPL X64(ffffffff, ffffffff)
 171
 172 static const lane rcon[24] = {
 173   X64(00000000, 00000001), X64(00000000, 00008082),
 174   X64(80000000, 0000808a), X64(80000000, 80008000),
 175   X64(00000000, 0000808b), X64(00000000, 80000001),
 176   X64(80000000, 80008081), X64(80000000, 00008009),
 177   X64(00000000, 0000008a), X64(00000000, 00000088),
 178   X64(00000000, 80008009), X64(00000000, 8000000a),
 179   X64(00000000, 8000808b), X64(80000000, 0000008b),
 180   X64(80000000, 00008089), X64(80000000, 00008003),
 181   X64(80000000, 00008002), X64(80000000, 00000080),
 182   X64(00000000, 0000800a), X64(80000000, 8000000a),
 183   X64(80000000, 80008081), X64(80000000, 00008080),
 184   X64(00000000, 80000001), X64(80000000, 80008008)
 185 };
 186
 187 #endif
 188
 189 /*----- Complementing or not ----------------------------------------------*/
 190
 191 /* We should use the complemented representation if the target doesn't have a
 192  * fused and-not operation.  There doesn't appear to be a principled way to
 193  * do this, so we'll just have to make do with a big list.  Worse, in my
 194  * brief survey of the architecture reference manuals I have lying about,
 195  * they've split close to 50/50 on this question, so I don't have an
 196  * especially good way to pick a default.  The `no-fused-op' architectures
 197  * seem generally a bit more modern than the `fused-op' architectures, so I
 198  * guess I'll make the complemented representation the default.
 199  *
 200  *              and-not         No and-not
 201  *              -------         ----------
 202  *              ARM (`bic')     x86/amd64
 203  *              Sparc (`andn')  z/Architecture
 204  *              MMIX (`andn')   MIPS
 205  *              IA64 (`andcm')  68k
 206  *              VAX (`bic')     RISC-V
 207  *              PDP-10 (`andc')
 208  */
 209 #if !(defined(__arm__) || defined(__thumb__) || defined(__aarch64__) || \
 210       defined(_M_ARM) || defined(_M_THUMB)) &&                          \
 211     !(defined(__ia64__) || defined(__ia64) || defined(__itanium__) ||   \
 212       defined(_M_IA64)) &&                                              \
 213     !defined(__mmix__) &&                                               \
 214     !(defined(__sparc__) || defined(__sparc)) &&                        \
 215     !defined(__vax__) &&                                                \
 216     !defined(__pdp10__)
 217 #  define KECCAK_COMPL
 218 #endif
 219
 220 #ifdef KECCAK_COMPL
 221 /* A target without fused and/not (`bic', `andc2').  We complement some of
 222  * the lanes in the initial state and undo this on output.  (Absorbing XORs
 223  * input into the state, so this is unaffected.)  See the handling of chi in
 224  * `keccak1600_round' below for the details.
 225  */
 226
 227 #define COMPL_MASK 0x00121106u
 228
 229 #define STATE_INIT(z) do {                                              \
 230   lane cmpl = LANE_CMPL;                                                \
 231   (z)->S[I(1, 0)] = cmpl; (z)->S[I(2, 0)] = cmpl;                       \
 232   (z)->S[I(3, 1)] = cmpl; (z)->S[I(2, 2)] = cmpl;                       \
 233   (z)->S[I(2, 3)] = cmpl; (z)->S[I(0, 4)] = cmpl;                       \
 234 } while (0)
 235
 236 #define STATE_OUT(z) do {                                               \
 237   NOT_LANE((z)->S[I(1, 0)], (z)->S[I(1, 0)]);                           \
 238   NOT_LANE((z)->S[I(2, 0)], (z)->S[I(2, 0)]);                           \
 239   NOT_LANE((z)->S[I(3, 1)], (z)->S[I(3, 1)]);                           \
 240   NOT_LANE((z)->S[I(2, 2)], (z)->S[I(2, 2)]);                           \
 241   NOT_LANE((z)->S[I(2, 3)], (z)->S[I(2, 3)]);                           \
 242   NOT_LANE((z)->S[I(0, 4)], (z)->S[I(0, 4)]);                           \
 243 } while (0)
 244
 245 #else
 246 /* A target with fused and/not (`bic', `andc2').  Everything is simple. */
 247
 248 #define COMPL_MASK 0u
 249
 250 #define STATE_INIT(z) do ; while (0)
 251 #define STATE_OUT(z) do ; while (0)
 252
 253 #endif
 254
 255 /*----- Other magic constants ---------------------------------------------*/
 256
 257 /* The rotation constants.  These are systematically named -- see `THETA_RHO'
 258  * below.
 259  */
 260 #define ROT_0_0  0
 261 #define ROT_1_0  1
 262 #define ROT_2_0 62
 263 #define ROT_3_0 28
 264 #define ROT_4_0 27
 265
 266 #define ROT_0_1 36
 267 #define ROT_1_1 44
 268 #define ROT_2_1  6
 269 #define ROT_3_1 55
 270 #define ROT_4_1 20
 271
 272 #define ROT_0_2  3
 273 #define ROT_1_2 10
 274 #define ROT_2_2 43
 275 #define ROT_3_2 25
 276 #define ROT_4_2 39
 277
 278 #define ROT_0_3 41
 279 #define ROT_1_3 45
 280 #define ROT_2_3 15
 281 #define ROT_3_3 21
 282 #define ROT_4_3  8
 283
 284 #define ROT_0_4 18
 285 #define ROT_1_4  2
 286 #define ROT_2_4 61
 287 #define ROT_3_4 56
 288 #define ROT_4_4 14
 289
 290 /*----- Debugging ---------------------------------------------------------*/
 291
 292 #ifdef KECCAK_DEBUG
 293
 294 #include <stdio.h>
 295
 296 static void dump_state(const char *what, unsigned ir,
 297                        const keccak1600_state *x)
 298 {
 299   unsigned i, j;
 300   keccak1600_state y;
 301   kludge64 a;
 302   int sep;
 303
 304   printf(";; %s [round %u]\n", what, ir);
 305   printf(";; raw state...\n");
 306   for (j = 0; j < 5; j++) {
 307     printf(";;");
 308     for (i = 0, sep = '\t'; i < 5; i++, sep = ' ')
 309       printf("%c" PRINTFMT_LANE, sep, PRINTARGS_LANE(x->S[I(i, j)]));
 310     fputc('\n', stdout);
 311   }
 312   y = *x; STATE_OUT(&y);
 313 #ifdef KECCAK_COMPL
 314   printf(";; uncomplemented state...\n");
 315   for (j = 0; j < 5; j++) {
 316     printf(";;");
 317     for (i = 0, sep = '\t'; i < 5; i++, sep = ' ')
 318       printf("%c" PRINTFMT_LANE, sep, PRINTARGS_LANE(y.S[I(i, j)]));
 319     fputc('\n', stdout);
 320   }
 321 #endif
 322 #ifdef KECCAK_I32
 323   printf(";; deinterlaced state...\n");
 324   for (j = 0; j < 5; j++) {
 325     printf(";;");
 326     for (i = 0, sep = '\t'; i < 5; i++, sep = ' ') {
 327       a = FROM_LANE(y.S[I(i, j)]);
 328       printf("%c%08lx%08lx", sep,
 329              (unsigned long)HI64(a), (unsigned long)LO64(a));
 330     }
 331     fputc('\n', stdout);
 332   }
 333 #endif
 334   fputc('\n', stdout);
 335 }
 336
 337 #endif
 338
 339 /*----- The Keccak-p[1600, n] permutation ---------------------------------*/
 340
 341 static void keccak1600_round(keccak1600_state *z,
 342                              const keccak1600_state *x, unsigned i)
 343 {
 344   /* Perform a round of Keccak-p[1600, n].  Process the state X and write the
 345    * result to Z.
 346    */
 347
 348   lane c[5], d[5], t;
 349
 350   /* Theta, first step: calculate the column parities. */
 351 #define COLPARITY(j) do {                                               \
 352            d[j] =      x->S[I(j, 0)];                                   \
 353   XOR_LANE(d[j], d[j], x->S[I(j, 1)]);                                  \
 354   XOR_LANE(d[j], d[j], x->S[I(j, 2)]);                                  \
 355   XOR_LANE(d[j], d[j], x->S[I(j, 3)]);                                  \
 356   XOR_LANE(d[j], d[j], x->S[I(j, 4)]);                                  \
 357 } while (0)
 358   COLPARITY(0); COLPARITY(1); COLPARITY(2); COLPARITY(3); COLPARITY(4);
 359 #undef COLPARITY
 360
 361   /* Theta, second step: calculate the combined effect. */
 362   ROTL_LANE(c[0], d[1], 1); XOR_LANE(c[0], c[0], d[4]);
 363   ROTL_LANE(c[1], d[2], 1); XOR_LANE(c[1], c[1], d[0]);
 364   ROTL_LANE(c[2], d[3], 1); XOR_LANE(c[2], c[2], d[1]);
 365   ROTL_LANE(c[3], d[4], 1); XOR_LANE(c[3], c[3], d[2]);
 366   ROTL_LANE(c[4], d[0], 1); XOR_LANE(c[4], c[4], d[3]);
 367
 368   /* Now we work plane by plane through the output.  To do this, we must undo
 369    * the pi transposition.  Pi maps (x', y') = (y, 2 x + 3 y), so y = x', and
 370    * x = (y' - 3 y)/2 = 3 (y' - 3 x') = x' + 3 y'.
 371    */
 372 #define THETA_RHO(i0, i1, i2, i3, i4) do {                              \
 373                                                                         \
 374   /* First, theta. */                                                   \
 375   XOR_LANE(d[0], x->S[I(i0, 0)], c[i0]);                                \
 376   XOR_LANE(d[1], x->S[I(i1, 1)], c[i1]);                                \
 377   XOR_LANE(d[2], x->S[I(i2, 2)], c[i2]);                                \
 378   XOR_LANE(d[3], x->S[I(i3, 3)], c[i3]);                                \
 379   XOR_LANE(d[4], x->S[I(i4, 4)], c[i4]);                                \
 380                                                                         \
 381   /* Then rho. */                                                       \
 382   ROTL_LANE(d[0], d[0], ROT_##i0##_0);                                  \
 383   ROTL_LANE(d[1], d[1], ROT_##i1##_1);                                  \
 384   ROTL_LANE(d[2], d[2], ROT_##i2##_2);                                  \
 385   ROTL_LANE(d[3], d[3], ROT_##i3##_3);                                  \
 386   ROTL_LANE(d[4], d[4], ROT_##i4##_4);                                  \
 387 } while (0)
 388
 389   /* The basic chi operation is: z = w ^ (~a&b), but this involves an
 390    * inversion which we can mostly avoid by being clever: observe that
 391    *
 392    *            w ^ (~a&~~b) = w ^ ~(a | ~b) = ~w ^ (a | ~b)
 393    *
 394    * by De Morgan's law.  Furthermore, complementing w or z is basically
 395    * equivalent.  Bertoni, Daemen, Peeters, Van Assche, and Van Keer, `Keccak
 396    * implementation overview', describe a pattern of lane complementation
 397    * which propagates through theta and pi in exactly the right way to be
 398    * restored easily by chi, here, with exactly one inversion per plane.
 399    *
 400    * Here's the pattern.
 401    *
 402    *                    [ * . * * . ]        [ . * * . . ]
 403    *                    [ * . * . . ]        [ . . . * . ]
 404    *                    [ * . * . . ]   ->   [ . . * . . ]
 405    *                    [ . * . * * ]        [ . . * . . ]
 406    *                    [ * . . * . ]        [ * . . . . ]
 407    *
 408    * where a `.' means that the lane is unchanged, and a `*' means that it
 409    * has been complemented.
 410    *
 411    * The macros `CHI_wxy_z' calculate z in terms of w, x, y assuming that the
 412    * inputs w, x, y marked with a `1' are complemented on input, and arrange
 413    * for z to be complemented on output if z is so marked.
 414    *
 415    * The diagrams to the right show the fragment of the complementation
 416    * pattern being handled by the corresponding line of code.  A symbol in
 417    * brackets indicates a deviation from the input pattern forced by explicit
 418    * complementation: there will be exactly one of these for each plane.
 419    */
 420 #ifdef KECCAK_COMPL
 421 #  define CHI_COMPL(z, x) NOT_LANE((z), (x))
 422 #  define CHI_001_1(z, w, x, y)                                         \
 423         (OR_LANE((z), (x), (y)), XOR_LANE((z), (z), (w)))
 424 #  define CHI_010_0(z, w, x, y)                                         \
 425         (AND_LANE((z), (x), (y)), XOR_LANE((z), (z), (w)))
 426 #  define CHI_101_0 CHI_001_1
 427 #  define CHI_110_1 CHI_010_0
 428 #else
 429 #  define CHI(z, w, x, y)                                               \
 430         (NOT_LANE((z), (x)),                                            \
 431          AND_LANE((z), (z), (y)),                                       \
 432          XOR_LANE((z), (z), (w)))
 433 #  define CHI_COMPL(z, x) ((z) = (x))
 434 #  define CHI_001_1 CHI
 435 #  define CHI_010_0 CHI
 436 #  define CHI_101_0 CHI
 437 #  define CHI_110_1 CHI
 438 #endif
 439
 440   /* Let's do the y' = 0 plane first.  Theta and rho are easy with our macro,
 441    * and we've done pi with the coordinate hacking.  That leaves chi next.
 442    * This is hairy because we must worry about complementation.
 443    */
 444   THETA_RHO(0, 1, 2, 3, 4);
 445   CHI_COMPL(t, d[2]);                         /*         [.]               */
 446   CHI_101_0(z->S[I(0, 0)], d[0], d[1], d[2]); /*  *   .   *          ->  . */
 447   CHI_001_1(z->S[I(1, 0)], d[1], t,    d[3]); /*      .  [.]  *      ->  * */
 448   CHI_110_1(z->S[I(2, 0)], d[2], d[3], d[4]); /*          *   *   .  ->  * */
 449   CHI_101_0(z->S[I(3, 0)], d[3], d[4], d[0]); /*  *           *   .  ->  . */
 450   CHI_010_0(z->S[I(4, 0)], d[4], d[0], d[1]); /*  *   .           .  ->  . */
 451
 452   /* We'd better do iota before we forget. */
 453   XOR_LANE(z->S[I(0, 0)], z->S[I(0, 0)], rcon[i]);
 454
 455   /* That was fun.  Maybe y' = 1 will be as good. */
 456   THETA_RHO(3, 4, 0, 1, 2);
 457   CHI_COMPL(t, d[4]);                         /*                 [*]       */
 458   CHI_101_0(z->S[I(0, 1)], d[0], d[1], d[2]); /*  *   .   *          ->  . */
 459   CHI_010_0(z->S[I(1, 1)], d[1], d[2], d[3]); /*      .   *   .      ->  . */
 460   CHI_101_0(z->S[I(2, 1)], d[2], d[3], t);    /*          *   .  [*] ->  . */
 461   CHI_001_1(z->S[I(3, 1)], d[3], d[4], d[0]); /*  *           .   .  ->  * */
 462   CHI_010_0(z->S[I(4, 1)], d[4], d[0], d[1]); /*  *   .           .  ->  . */
 463
 464   /* We're getting the hang of this.  The y' = 2 plane shouldn't be any
 465    * trouble.
 466    */
 467   THETA_RHO(1, 2, 3, 4, 0);
 468   CHI_COMPL(t, d[3]);                         /*             [*]           */
 469   CHI_101_0(z->S[I(0, 2)], d[0], d[1], d[2]); /*  *   .   *          ->  . */
 470   CHI_010_0(z->S[I(1, 2)], d[1], d[2], d[3]); /*      .   *   .      ->  . */
 471   CHI_110_1(z->S[I(2, 2)], d[2], t,    d[4]); /*          *  [*]  .  ->  * */
 472   CHI_101_0(z->S[I(3, 2)], t,    d[4], d[0]); /*  *          [*]  .  ->  . */
 473   CHI_010_0(z->S[I(4, 2)], d[4], d[0], d[1]); /*  *   .           .  ->  . */
 474
 475   /* This isn't as interesting any more.  Let's do y' = 3 before boredom sets
 476    * in.
 477    */
 478   THETA_RHO(4, 0, 1, 2, 3);
 479   CHI_COMPL(t, d[3]);                         /*             [.]           */
 480   CHI_010_0(z->S[I(0, 3)], d[0], d[1], d[2]); /*  .   *   .          ->  . */
 481   CHI_101_0(z->S[I(1, 3)], d[1], d[2], d[3]); /*      *   .   *      ->  . */
 482   CHI_001_1(z->S[I(2, 3)], d[2], t,    d[4]); /*          .  [.]  *  ->  * */
 483   CHI_010_0(z->S[I(3, 3)], t,    d[4], d[0]); /*  .          [.]  *  ->  . */
 484   CHI_101_0(z->S[I(4, 3)], d[4], d[0], d[1]); /*  .   *           *  ->  . */
 485
 486   /* Last plane.  Just y' = 4 to go. */
 487   THETA_RHO(2, 3, 4, 0, 1);
 488   CHI_COMPL(t, d[1]);                         /*     [*]                   */
 489   CHI_110_1(z->S[I(0, 4)], d[0], t,    d[2]); /*  *  [*]  .          ->  * */
 490   CHI_101_0(z->S[I(1, 4)], t,    d[2], d[3]); /*     [*]  .   *      ->  . */
 491   CHI_010_0(z->S[I(2, 4)], d[2], d[3], d[4]); /*          .   *   .  ->  . */
 492   CHI_101_0(z->S[I(3, 4)], d[3], d[4], d[0]); /*  *           *   .  ->  . */
 493   CHI_010_0(z->S[I(4, 4)], d[4], d[0], d[1]); /*  *   .           .  ->  . */
 494
 495   /* And we're done. */
 496 #undef THETA_RHO
 497 #undef CHI_COMPL
 498 #undef CHI_001_1
 499 #undef CHI_010_0
 500 #undef CHI_101_0
 501 #undef CHI_110_1
 502 #undef CHI
 503 }
 504
 505 /* --- @keccak1600_p@ --- *
 506  *
 507  * Arguments:   @keccak1600_state *z@ = where to write the output state
 508  *              @conts keccak1600_state *x@ = input state
 509  *              @unsigned n@ = number of rounds to perform
 510  *
 511  * Returns:     ---
 512  *
 513  * Use:         Implements the %$\Keccak[1600, n]$% permutation at the core
 514  *              of Keccak and the SHA-3 standard.
 515  */
 516
 517 void keccak1600_p(keccak1600_state *z, const keccak1600_state *x, unsigned n)
 518 {
 519   keccak1600_state u, v;
 520   unsigned i = 0;
 521
 522 #ifdef KECCAK_DEBUG
 523   dump_state("init", 0, x);
 524 #endif
 525   keccak1600_round(&u, x, i++); n--;
 526   while (n > 8) {
 527     keccak1600_round(&v, &u, i++);
 528     keccak1600_round(&u, &v, i++);
 529     keccak1600_round(&v, &u, i++);
 530     keccak1600_round(&u, &v, i++);
 531     keccak1600_round(&v, &u, i++);
 532     keccak1600_round(&u, &v, i++);
 533     keccak1600_round(&v, &u, i++);
 534     keccak1600_round(&u, &v, i++);
 535     n -= 8;
 536   }
 537   switch (n) {
 538     case 7: keccak1600_round(&v, &u, i++);
 539             keccak1600_round(&u, &v, i++);
 540     case 5: keccak1600_round(&v, &u, i++);
 541             keccak1600_round(&u, &v, i++);
 542     case 3: keccak1600_round(&v, &u, i++);
 543             keccak1600_round(&u, &v, i++);
 544     case 1: keccak1600_round( z, &u, i++);
 545             break;
 546     case 8: keccak1600_round(&v, &u, i++);
 547             keccak1600_round(&u, &v, i++);
 548     case 6: keccak1600_round(&v, &u, i++);
 549             keccak1600_round(&u, &v, i++);
 550     case 4: keccak1600_round(&v, &u, i++);
 551             keccak1600_round(&u, &v, i++);
 552     case 2: keccak1600_round(&v, &u, i++);
 553             keccak1600_round( z, &v, i++);
 554             break;
 555   }
 556 #ifdef KECCAK_DEBUG
 557   dump_state("final", 0, z);
 558 #endif
 559 }
 560
 561 /* --- @keccack1600_init@ --- *
 562  *
 563  * Arguments:   @keccak1600_state *s@ = a state to initialize
 564  *
 565  * Returns:     ---
 566  *
 567  * Use:         Initialize @s@ to the root state.
 568  */
 569
 570 void keccak1600_init(keccak1600_state *s)
 571   { memset(s->S, 0, sizeof(s->S)); STATE_INIT(s); }
 572
 573 /* --- @keccak1600_mix@ --- *
 574  *
 575  * Arguments:   @keccak1600_state *s@ = a state to update
 576  *              @const kludge64 *p@ = pointer to 64-bit words to mix in
 577  *              @size_t n@ = size of the input, in 64-bit words
 578  *
 579  * Returns:     ---
 580  *
 581  * Use:         Mixes data into a %$\Keccak[r, 1600 - r]$% state.  Note that
 582  *              it's the caller's responsibility to pass in no more than
 583  *              %$r$% bits of data.
 584  */
 585
 586 void keccak1600_mix(keccak1600_state *s, const kludge64 *p, size_t n)
 587 {
 588   unsigned i;
 589   lane a;
 590
 591   for (i = 0; i < n; i++)
 592     { a = TO_LANE(p[i]); XOR_LANE(s->S[i], s->S[i], a); }
 593 }
 594
 595 /* --- @keccak1600_set@ --- *
 596  *
 597  * Arguments:   @keccak1600_state *s@ = a state to update
 598  *              @const kludge64 *p@ = pointer to 64-bit words to mix in
 599  *              @size_t n@ = size of the input, in 64-bit words
 600  *
 601  * Returns:     ---
 602  *
 603  * Use:         Stores data into a %$\Keccak[r, 1600 - r]$% state.  Note that
 604  *              it's the caller's responsibility to pass in no more than
 605  *              %$r$% bits of data.
 606  *
 607  *              This is not the operation you wanted for ordinary hashing.
 608  *              It's provided for the use of higher-level protocols which use
 609  *              duplexing and other fancy sponge features.
 610  */
 611
 612 void keccak1600_set(keccak1600_state *s, const kludge64 *p, size_t n)
 613 {
 614   uint32 m = COMPL_MASK;
 615   unsigned i;
 616   lane a;
 617
 618   for (i = 0; i < n; i++) {
 619     a = TO_LANE(p[i]); if (m&1) NOT_LANE(a, a);
 620     s->S[i] = a; m >>= 1;
 621   }
 622 }
 623
 624 /* --- @keccak1600_extract@ --- *
 625  *
 626  * Arguments:   @const keccak1600_state *s@ = a state to extract output from
 627  *              @kludge64 *p@ = pointer to 64-bit words to write
 628  *              @size_t n@ = size of the output, in 64-bit words
 629  *
 630  * Returns:     ---
 631  *
 632  * Use:         Reads output from a %$\Keccak[r, 1600 - r]$% state.  Note
 633  *              that it's the caller's responsibility to extract no more than
 634  *              %$r$% bits of data.
 635  */
 636
 637 void keccak1600_extract(const keccak1600_state *s, kludge64 *p, size_t n)
 638 {
 639   uint32 m = COMPL_MASK;
 640   unsigned i;
 641   lane t;
 642
 643   for (i = 0; i < n; i++) {
 644     t = s->S[i]; if (m&1) NOT_LANE(t, t);
 645     *p++ = FROM_LANE(t); m >>= 1;
 646   }
 647 }
 648
 649 /*----- Test rig ----------------------------------------------------------*/
 650
 651 #ifdef TEST_RIG
 652
 653 #include <stdio.h>
 654
 655 #include <mLib/macros.h>
 656 #include <mLib/quis.h>
 657 #include <mLib/report.h>
 658 #include <mLib/testrig.h>
 659
 660 static int vrf_p(dstr v[])
 661 {
 662   keccak1600_state u;
 663   kludge64 t[25];
 664   dstr d = DSTR_INIT;
 665   int n;
 666   unsigned i;
 667   int ok = 1;
 668
 669   if (v[0].len != 200) die(1, "bad input size");
 670   if (v[2].len != 200) die(1, "bad output size");
 671   n = *(int *)v[1].buf;
 672   dstr_ensure(&d, 200); d.len = 200;
 673
 674   keccak1600_init(&u);
 675   for (i = 0; i < 25; i++) LOAD64_L_(t[i], v[0].buf + 8*i);
 676   keccak1600_mix(&u, t, 25);
 677   keccak1600_p(&u, &u, n);
 678   keccak1600_extract(&u, t, 25);
 679   for (i = 0; i < 25; i++) STORE64_L_(d.buf + 8*i, t[i]);
 680   if (MEMCMP(d.buf, !=, v[2].buf, 200)) {
 681     ok = 0;
 682     fprintf(stderr, "failed!");
 683     fprintf(stderr, "\n\t     input = "); type_hex.dump(&v[0], stderr);
 684     fprintf(stderr, "\n\t    rounds = %d", n);
 685     fprintf(stderr, "\n\t  expected = "); type_hex.dump(&v[2], stderr);
 686     fprintf(stderr, "\n\t calclated = "); type_hex.dump(&d, stderr);
 687   }
 688
 689   dstr_destroy(&d);
 690   return (ok);
 691 }
 692
 693 static test_chunk defs[] = {
 694   { "p", vrf_p, { &type_hex, &type_int, &type_hex } },
 695   { 0, 0, { 0 } }
 696 };
 697
 698 int main(int argc, char *argv[])
 699 {
 700   test_run(argc, argv, defs, SRCDIR"/t/keccak1600");
 701   return (0);
 702 }
 703
 704 #endif
 705
 706 /*----- That's all, folks -------------------------------------------------*/