mdw@git.distorted.org.uk Git - secnet/blob - keccak1600.c

   1 /* -*-c-*-
   2  *
   3  * The Keccak-p[1600, n] permutation
   4  *
   5  * (c) 2017 Straylight/Edgeware
   6  */
   7
   8 /*----- Licensing notice --------------------------------------------------*
   9  *
  10  * This file is part of secnet.
  11  * See README for full list of copyright holders.
  12  *
  13  * secnet is free software; you can redistribute it and/or modify it
  14  * under the terms of the GNU General Public License as published by
  15  * the Free Software Foundation; either version d of the License, or
  16  * (at your option) any later version.
  17  *
  18  * secnet is distributed in the hope that it will be useful, but
  19  * WITHOUT ANY WARRANTY; without even the implied warranty of
  20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  21  * General Public License for more details.
  22  *
  23  * You should have received a copy of the GNU General Public License
  24  * version 3 along with secnet; if not, see
  25  * https://www.gnu.org/licenses/gpl.html.
  26  *
  27  * This file was originally part of Catacomb, but has been automatically
  28  * modified for incorporation into secnet: see `import-catacomb-crypto'
  29  * for details.
  30  *
  31  * Catacomb is free software; you can redistribute it and/or modify
  32  * it under the terms of the GNU Library General Public License as
  33  * published by the Free Software Foundation; either version 2 of the
  34  * License, or (at your option) any later version.
  35  *
  36  * Catacomb is distributed in the hope that it will be useful,
  37  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  38  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  39  * GNU Library General Public License for more details.
  40  *
  41  * You should have received a copy of the GNU Library General Public
  42  * License along with Catacomb; if not, write to the Free
  43  * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  44  * MA 02111-1307, USA.
  45  */
  46
  47 /*----- Header files ------------------------------------------------------*/
  48
  49 #include <limits.h>
  50 #include <string.h>
  51
  52 #include "fake-mLib-bits.h"
  53
  54 #include "keccak1600.h"
  55
  56 /* #define KECCAK_DEBUG */
  57
  58 /*----- Miscellaneous utilities -------------------------------------------*/
  59
  60 #define I(x, y) ((x) + 5*(y))           /* Column-major indexing */
  61
  62 /*----- Interlacing or not ------------------------------------------------*/
  63
  64 /* We should prefer the interlaced representation if the target is really
  65  * 32-bit and only providing synthetic 64-bit integers.  Alas, the Windows
  66  * 64-bit ABI specifies that `long' is only 32-bits (i.e., it is IL32/LLP64),
  67  * so detect x86 specifically.
  68  */
  69 #if (ULONG_MAX >> 31) <= 0xffffffff && \
  70   !defined(__amd64__) && !defined(_M_AMD64)
  71 #  define KECCAK_I32
  72 #endif
  73
  74 #ifdef KECCAK_I32
  75 /* A 32-bit target with at best weak support for 64-bit shifts.  Maintain a
  76  * lane as two 32-bit pieces representing the even and odd bits of the lane.
  77  * There are slightly fiddly transformations to apply on the way in and out
  78  * of the main permutation.
  79  */
  80
  81 typedef keccak1600_lane_i32 lane;
  82 #define S si32
  83
  84 static lane interlace(kludge64 x)
  85 {
  86   /* Given a 64-bit string X, return a lane Z containing the even- and
  87    * odd-numbered bits of X.
  88    *
  89    * This becomes more manageable if we look at what happens to the bit
  90    * indices: bit i of X becomes bit ROR_6(i, 1) of Z.  We can effectively
  91    * swap two bits of the indices by swapping the object bits where those
  92    * index bits differ.  Fortunately, this is fairly easy.
  93    *
  94    * We arrange to swap bits between the two halves of X, rather than within
  95    * a half.
  96    */
  97
  98   uint32 x0 = LO64(x), x1 = HI64(x), t;
  99   lane z;
 100                                                             /* 543210 */
 101   t = ((x0 >> 16) ^ x1)&0x0000ffff; x0 ^= t << 16; x1 ^= t; /* 453210 */
 102   t = ((x0 >>  8) ^ x1)&0x00ff00ff; x0 ^= t <<  8; x1 ^= t; /* 354210 */
 103   t = ((x0 >>  4) ^ x1)&0x0f0f0f0f; x0 ^= t <<  4; x1 ^= t; /* 254310 */
 104   t = ((x0 >>  2) ^ x1)&0x33333333; x0 ^= t <<  2; x1 ^= t; /* 154320 */
 105   t = ((x0 >>  1) ^ x1)&0x55555555; x0 ^= t <<  1; x1 ^= t; /* 054321 */
 106   z.even = x0; z.odd = x1; return (z);
 107 }
 108
 109 static kludge64 deinterlace(lane x)
 110 {
 111   /* Given a lane X, return the combined 64-bit value.  This is the inverse
 112    * to `interlace' above, and the principle is the same
 113    */
 114
 115   uint32 x0 = x.even, x1 = x.odd, t;
 116   kludge64 z;
 117                                                             /* 054321 */
 118   t = ((x0 >>  1) ^ x1)&0x55555555; x0 ^= t <<  1; x1 ^= t; /* 154320 */
 119   t = ((x0 >>  2) ^ x1)&0x33333333; x0 ^= t <<  2; x1 ^= t; /* 254310 */
 120   t = ((x0 >>  4) ^ x1)&0x0f0f0f0f; x0 ^= t <<  4; x1 ^= t; /* 354210 */
 121   t = ((x0 >>  8) ^ x1)&0x00ff00ff; x0 ^= t <<  8; x1 ^= t; /* 453210 */
 122   t = ((x0 >> 16) ^ x1)&0x0000ffff; x0 ^= t << 16; x1 ^= t; /* 543210 */
 123   SET64(z, x1, x0); return (z);
 124 }
 125
 126 #define TO_LANE(x) (interlace(x))
 127 #define FROM_LANE(x) (deinterlace(x))
 128
 129 #define PRINTFMT_LANE "%08lx:%08lx"
 130 #define PRINTARGS_LANE(x) (unsigned long)(x).even, (unsigned long)(x).odd
 131
 132 #define BINOP_LANE(z, op, x, y)                                         \
 133   ((z).even = (x).even op (y).even, (z).odd = (x).odd op (y).odd)
 134 #define XOR_LANE(z, x, y) BINOP_LANE(z, ^, x, y)
 135 #define AND_LANE(z, x, y) BINOP_LANE(z, &, x, y)
 136 #define OR_LANE(z, x, y) BINOP_LANE(z, |, x, y)
 137 #define NOT_LANE(z, x) ((z).even = ~(x).even, (z).odd = ~(x).odd)
 138
 139 #define ROTL_LANE(z, x, n) do {                                         \
 140   lane _t = (x);                                                        \
 141   (z).even = (n)%2 ? ROL32(_t.odd,  ((n) + 1)/2)                        \
 142                    : ROL32(_t.even,  (n)/2);                            \
 143   (z).odd  = (n)%2 ? ROL32(_t.even, ((n) - 1)/2)                        \
 144                    : ROL32(_t.odd,   (n)/2);                            \
 145 } while (0)
 146
 147 #define LANE_ZERO {          0,          0 }
 148 #define LANE_CMPL { 0xffffffff, 0xffffffff }
 149
 150 static const lane rcon[24] = {
 151   { 0x00000001, 0x00000000 }, { 0x00000000, 0x00000089 },
 152   { 0x00000000, 0x8000008b }, { 0x00000000, 0x80008080 },
 153   { 0x00000001, 0x0000008b }, { 0x00000001, 0x00008000 },
 154   { 0x00000001, 0x80008088 }, { 0x00000001, 0x80000082 },
 155   { 0x00000000, 0x0000000b }, { 0x00000000, 0x0000000a },
 156   { 0x00000001, 0x00008082 }, { 0x00000000, 0x00008003 },
 157   { 0x00000001, 0x0000808b }, { 0x00000001, 0x8000000b },
 158   { 0x00000001, 0x8000008a }, { 0x00000001, 0x80000081 },
 159   { 0x00000000, 0x80000081 }, { 0x00000000, 0x80000008 },
 160   { 0x00000000, 0x00000083 }, { 0x00000000, 0x80008003 },
 161   { 0x00000001, 0x80008088 }, { 0x00000000, 0x80000088 },
 162   { 0x00000001, 0x00008000 }, { 0x00000000, 0x80008082 }
 163 };
 164
 165 #else
 166 /* A target with good support for 64-bit shifts.  We store lanes as 64-bit
 167  * quantities and deal with them in the obvious, natural way.
 168  */
 169
 170 typedef keccak1600_lane_64 lane;
 171 #define S s64
 172
 173 #define TO_LANE(x) (x)
 174 #define FROM_LANE(x) (x)
 175
 176 #define PRINTFMT_LANE "%08lx%08lx"
 177 #define PRINTARGS_LANE(x) (unsigned long)HI64(x), (unsigned long)LO64(x)
 178
 179 #define XOR_LANE(z, x, y) XOR64((z), (x), (y))
 180 #define AND_LANE(z, x, y) AND64((z), (x), (y))
 181 #define OR_LANE(z, x, y) OR64((z), (x), (y))
 182 #define NOT_LANE(z, x) CPL64((z), (x))
 183 #define ROTL_LANE(z, x, n) ROL64_((z), (x), (n))
 184
 185 #define LANE_ZERO X64(       0,        0)
 186 #define LANE_CMPL X64(ffffffff, ffffffff)
 187
 188 static const lane rcon[24] = {
 189   X64(00000000, 00000001), X64(00000000, 00008082),
 190   X64(80000000, 0000808a), X64(80000000, 80008000),
 191   X64(00000000, 0000808b), X64(00000000, 80000001),
 192   X64(80000000, 80008081), X64(80000000, 00008009),
 193   X64(00000000, 0000008a), X64(00000000, 00000088),
 194   X64(00000000, 80008009), X64(00000000, 8000000a),
 195   X64(00000000, 8000808b), X64(80000000, 0000008b),
 196   X64(80000000, 00008089), X64(80000000, 00008003),
 197   X64(80000000, 00008002), X64(80000000, 00000080),
 198   X64(00000000, 0000800a), X64(80000000, 8000000a),
 199   X64(80000000, 80008081), X64(80000000, 00008080),
 200   X64(00000000, 80000001), X64(80000000, 80008008)
 201 };
 202
 203 #endif
 204
 205 /*----- Complementing or not ----------------------------------------------*/
 206
 207 /* We should use the complemented representation if the target doesn't have a
 208  * fused and-not operation.  There doesn't appear to be a principled way to
 209  * do this, so we'll just have to make do with a big list.  Worse, in my
 210  * brief survey of the architecture reference manuals I have lying about,
 211  * they've split close to 50/50 on this question, so I don't have an
 212  * especially good way to pick a default.  The `no-fused-op' architectures
 213  * seem generally a bit more modern than the `fused-op' architectures, so I
 214  * guess I'll make the complemented representation the default.
 215  *
 216  *              and-not         No and-not
 217  *              -------         ----------
 218  *              ARM (`bic')     x86/amd64
 219  *              Sparc (`andn')  z/Architecture
 220  *              MMIX (`andn')   MIPS
 221  *              IA64 (`andcm')  68k
 222  *              VAX (`bic')     RISC-V
 223  *              PDP-10 (`andc')
 224  */
 225 #if !(defined(__arm__) || defined(__thumb__) || defined(__aarch64__) || \
 226       defined(_M_ARM) || defined(_M_THUMB)) &&                          \
 227     !(defined(__ia64__) || defined(__ia64) || defined(__itanium__) ||   \
 228       defined(_M_IA64)) &&                                              \
 229     !defined(__mmix__) &&                                               \
 230     !(defined(__sparc__) || defined(__sparc)) &&                        \
 231     !defined(__vax__) &&                                                \
 232     !defined(__pdp10__)
 233 #  define KECCAK_COMPL
 234 #endif
 235
 236 #ifdef KECCAK_COMPL
 237 /* A target without fused and/not (`bic', `andc2').  We complement some of
 238  * the lanes in the initial state and undo this on output.  (Absorbing XORs
 239  * input into the state, so this is unaffected.)  See the handling of chi in
 240  * `keccak1600_round' below for the details.
 241  */
 242
 243 #define STATE_INIT(z) do {                                              \
 244   lane cmpl = LANE_CMPL;                                                \
 245   (z)->S[I(1, 0)] = cmpl; (z)->S[I(2, 0)] = cmpl;                       \
 246   (z)->S[I(3, 1)] = cmpl; (z)->S[I(2, 2)] = cmpl;                       \
 247   (z)->S[I(2, 3)] = cmpl; (z)->S[I(0, 4)] = cmpl;                       \
 248 } while (0)
 249
 250 #define STATE_OUT(z) do {                                               \
 251   NOT_LANE((z)->S[I(1, 0)], (z)->S[I(1, 0)]);                           \
 252   NOT_LANE((z)->S[I(2, 0)], (z)->S[I(2, 0)]);                           \
 253   NOT_LANE((z)->S[I(3, 1)], (z)->S[I(3, 1)]);                           \
 254   NOT_LANE((z)->S[I(2, 2)], (z)->S[I(2, 2)]);                           \
 255   NOT_LANE((z)->S[I(2, 3)], (z)->S[I(2, 3)]);                           \
 256   NOT_LANE((z)->S[I(0, 4)], (z)->S[I(0, 4)]);                           \
 257 } while (0)
 258
 259 #else
 260 /* A target with fused and/not (`bic', `andc2').  Everything is simple. */
 261
 262 #define STATE_INIT(z) do ; while (0)
 263 #define STATE_OUT(z) do ; while (0)
 264
 265 #endif
 266
 267 /*----- Other magic constants ---------------------------------------------*/
 268
 269 /* The rotation constants.  These are systematically named -- see `THETA_RHO'
 270  * below.
 271  */
 272 #define ROT_0_0  0
 273 #define ROT_1_0  1
 274 #define ROT_2_0 62
 275 #define ROT_3_0 28
 276 #define ROT_4_0 27
 277
 278 #define ROT_0_1 36
 279 #define ROT_1_1 44
 280 #define ROT_2_1  6
 281 #define ROT_3_1 55
 282 #define ROT_4_1 20
 283
 284 #define ROT_0_2  3
 285 #define ROT_1_2 10
 286 #define ROT_2_2 43
 287 #define ROT_3_2 25
 288 #define ROT_4_2 39
 289
 290 #define ROT_0_3 41
 291 #define ROT_1_3 45
 292 #define ROT_2_3 15
 293 #define ROT_3_3 21
 294 #define ROT_4_3  8
 295
 296 #define ROT_0_4 18
 297 #define ROT_1_4  2
 298 #define ROT_2_4 61
 299 #define ROT_3_4 56
 300 #define ROT_4_4 14
 301
 302 /*----- Debugging ---------------------------------------------------------*/
 303
 304 #ifdef KECCAK_DEBUG
 305
 306 #include <stdio.h>
 307
 308 static void dump_state(const char *what, unsigned ir,
 309                        const keccak1600_state *x)
 310 {
 311   unsigned i, j;
 312   keccak1600_state y;
 313   kludge64 a;
 314   int sep;
 315
 316   printf(";; %s [round %u]\n", what, ir);
 317   printf(";; raw state...\n");
 318   for (j = 0; j < 5; j++) {
 319     printf(";;");
 320     for (i = 0, sep = '\t'; i < 5; i++, sep = ' ')
 321       printf("%c" PRINTFMT_LANE, sep, PRINTARGS_LANE(x->S[I(i, j)]));
 322     fputc('\n', stdout);
 323   }
 324   y = *x; STATE_OUT(&y);
 325 #ifdef KECCAK_COMPL
 326   printf(";; uncomplemented state...\n");
 327   for (j = 0; j < 5; j++) {
 328     printf(";;");
 329     for (i = 0, sep = '\t'; i < 5; i++, sep = ' ')
 330       printf("%c" PRINTFMT_LANE, sep, PRINTARGS_LANE(y.S[I(i, j)]));
 331     fputc('\n', stdout);
 332   }
 333 #endif
 334 #ifdef KECCAK_I32
 335   printf(";; deinterlaced state...\n");
 336   for (j = 0; j < 5; j++) {
 337     printf(";;");
 338     for (i = 0, sep = '\t'; i < 5; i++, sep = ' ') {
 339       a = FROM_LANE(y.S[I(i, j)]);
 340       printf("%c%08lx%08lx", sep,
 341              (unsigned long)HI64(a), (unsigned long)LO64(a));
 342     }
 343     fputc('\n', stdout);
 344   }
 345 #endif
 346   fputc('\n', stdout);
 347 }
 348
 349 #endif
 350
 351 /*----- The Keccak-p[1600, n] permutation ---------------------------------*/
 352
 353 static void keccak1600_round(keccak1600_state *z,
 354                              const keccak1600_state *x, unsigned i)
 355 {
 356   /* Perform a round of Keccak-p[1600, n].  Process the state X and write the
 357    * result to Z.
 358    */
 359
 360   lane c[5], d[5], t;
 361
 362   /* Theta, first step: calculate the column parities. */
 363 #define COLPARITY(j) do {                                               \
 364            d[j] =      x->S[I(j, 0)];                                   \
 365   XOR_LANE(d[j], d[j], x->S[I(j, 1)]);                                  \
 366   XOR_LANE(d[j], d[j], x->S[I(j, 2)]);                                  \
 367   XOR_LANE(d[j], d[j], x->S[I(j, 3)]);                                  \
 368   XOR_LANE(d[j], d[j], x->S[I(j, 4)]);                                  \
 369 } while (0)
 370   COLPARITY(0); COLPARITY(1); COLPARITY(2); COLPARITY(3); COLPARITY(4);
 371 #undef COLPARITY
 372
 373   /* Theta, second step: calculate the combined effect. */
 374   ROTL_LANE(c[0], d[1], 1); XOR_LANE(c[0], c[0], d[4]);
 375   ROTL_LANE(c[1], d[2], 1); XOR_LANE(c[1], c[1], d[0]);
 376   ROTL_LANE(c[2], d[3], 1); XOR_LANE(c[2], c[2], d[1]);
 377   ROTL_LANE(c[3], d[4], 1); XOR_LANE(c[3], c[3], d[2]);
 378   ROTL_LANE(c[4], d[0], 1); XOR_LANE(c[4], c[4], d[3]);
 379
 380   /* Now we work plane by plane through the output.  To do this, we must undo
 381    * the pi transposition.  Pi maps (x', y') = (y, 2 x + 3 y), so y = x', and
 382    * x = (y' - 3 y)/2 = 3 (y' - 3 x') = x' + 3 y'.
 383    */
 384 #define THETA_RHO(i0, i1, i2, i3, i4) do {                              \
 385                                                                         \
 386   /* First, theta. */                                                   \
 387   XOR_LANE(d[0], x->S[I(i0, 0)], c[i0]);                                \
 388   XOR_LANE(d[1], x->S[I(i1, 1)], c[i1]);                                \
 389   XOR_LANE(d[2], x->S[I(i2, 2)], c[i2]);                                \
 390   XOR_LANE(d[3], x->S[I(i3, 3)], c[i3]);                                \
 391   XOR_LANE(d[4], x->S[I(i4, 4)], c[i4]);                                \
 392                                                                         \
 393   /* Then rho. */                                                       \
 394   ROTL_LANE(d[0], d[0], ROT_##i0##_0);                                  \
 395   ROTL_LANE(d[1], d[1], ROT_##i1##_1);                                  \
 396   ROTL_LANE(d[2], d[2], ROT_##i2##_2);                                  \
 397   ROTL_LANE(d[3], d[3], ROT_##i3##_3);                                  \
 398   ROTL_LANE(d[4], d[4], ROT_##i4##_4);                                  \
 399 } while (0)
 400
 401   /* The basic chi operation is: z = w ^ (~a&b), but this involves an
 402    * inversion which we can mostly avoid by being clever: observe that
 403    *
 404    *            w ^ (~a&~~b) = w ^ ~(a | ~b) = ~w ^ (a | ~b)
 405    *
 406    * by De Morgan's law.  Furthermore, complementing w or z is basically
 407    * equivalent.  Bertoni, Daemen, Peeters, Van Assche, and Van Keer, `Keccak
 408    * implementation overview', describe a pattern of lane complementation
 409    * which propagates through theta and pi in exactly the right way to be
 410    * restored easily by chi, here, with exactly one inversion per plane.
 411    *
 412    * Here's the pattern.
 413    *
 414    *                    [ * . * * . ]        [ . * * . . ]
 415    *                    [ * . * . . ]        [ . . . * . ]
 416    *                    [ * . * . . ]   ->   [ . . * . . ]
 417    *                    [ . * . * * ]        [ . . * . . ]
 418    *                    [ * . . * . ]        [ * . . . . ]
 419    *
 420    * where a `.' means that the lane is unchanged, and a `*' means that it
 421    * has been complemented.
 422    *
 423    * The macros `CHI_wxy_z' calculate z in terms of w, x, y assuming that the
 424    * inputs w, x, y marked with a `1' are complemented on input, and arrange
 425    * for z to be complemented on output if z is so marked.
 426    *
 427    * The diagrams to the right show the fragment of the complementation
 428    * pattern being handled by the corresponding line of code.  A symbol in
 429    * brackets indicates a deviation from the input pattern forced by explicit
 430    * complementation: there will be exactly one of these for each plane.
 431    */
 432 #ifdef KECCAK_COMPL
 433 #  define CHI_COMPL(z, x) NOT_LANE((z), (x))
 434 #  define CHI_001_1(z, w, x, y)                                         \
 435         (OR_LANE((z), (x), (y)), XOR_LANE((z), (z), (w)))
 436 #  define CHI_010_0(z, w, x, y)                                         \
 437         (AND_LANE((z), (x), (y)), XOR_LANE((z), (z), (w)))
 438 #  define CHI_101_0 CHI_001_1
 439 #  define CHI_110_1 CHI_010_0
 440 #else
 441 #  define CHI(z, w, x, y)                                               \
 442         (NOT_LANE((z), (x)),                                            \
 443          AND_LANE((z), (z), (y)),                                       \
 444          XOR_LANE((z), (z), (w)))
 445 #  define CHI_COMPL(z, x) ((z) = (x))
 446 #  define CHI_001_1 CHI
 447 #  define CHI_010_0 CHI
 448 #  define CHI_101_0 CHI
 449 #  define CHI_110_1 CHI
 450 #endif
 451
 452   /* Let's do the y' = 0 plane first.  Theta and rho are easy with our macro,
 453    * and we've done pi with the coordinate hacking.  That leaves chi next.
 454    * This is hairy because we must worry about complementation.
 455    */
 456   THETA_RHO(0, 1, 2, 3, 4);
 457   CHI_COMPL(t, d[2]);                         /*         [.]               */
 458   CHI_101_0(z->S[I(0, 0)], d[0], d[1], d[2]); /*  *   .   *          ->  . */
 459   CHI_001_1(z->S[I(1, 0)], d[1], t,    d[3]); /*      .  [.]  *      ->  * */
 460   CHI_110_1(z->S[I(2, 0)], d[2], d[3], d[4]); /*          *   *   .  ->  * */
 461   CHI_101_0(z->S[I(3, 0)], d[3], d[4], d[0]); /*  *           *   .  ->  . */
 462   CHI_010_0(z->S[I(4, 0)], d[4], d[0], d[1]); /*  *   .           .  ->  . */
 463
 464   /* We'd better do iota before we forget. */
 465   XOR_LANE(z->S[I(0, 0)], z->S[I(0, 0)], rcon[i]);
 466
 467   /* That was fun.  Maybe y' = 1 will be as good. */
 468   THETA_RHO(3, 4, 0, 1, 2);
 469   CHI_COMPL(t, d[4]);                         /*                 [*]       */
 470   CHI_101_0(z->S[I(0, 1)], d[0], d[1], d[2]); /*  *   .   *          ->  . */
 471   CHI_010_0(z->S[I(1, 1)], d[1], d[2], d[3]); /*      .   *   .      ->  . */
 472   CHI_101_0(z->S[I(2, 1)], d[2], d[3], t);    /*          *   .  [*] ->  . */
 473   CHI_001_1(z->S[I(3, 1)], d[3], d[4], d[0]); /*  *           .   .  ->  * */
 474   CHI_010_0(z->S[I(4, 1)], d[4], d[0], d[1]); /*  *   .           .  ->  . */
 475
 476   /* We're getting the hang of this.  The y' = 2 plane shouldn't be any
 477    * trouble.
 478    */
 479   THETA_RHO(1, 2, 3, 4, 0);
 480   CHI_COMPL(t, d[3]);                         /*             [*]           */
 481   CHI_101_0(z->S[I(0, 2)], d[0], d[1], d[2]); /*  *   .   *          ->  . */
 482   CHI_010_0(z->S[I(1, 2)], d[1], d[2], d[3]); /*      .   *   .      ->  . */
 483   CHI_110_1(z->S[I(2, 2)], d[2], t,    d[4]); /*          *  [*]  .  ->  * */
 484   CHI_101_0(z->S[I(3, 2)], t,    d[4], d[0]); /*  *          [*]  .  ->  . */
 485   CHI_010_0(z->S[I(4, 2)], d[4], d[0], d[1]); /*  *   .           .  ->  . */
 486
 487   /* This isn't as interesting any more.  Let's do y' = 3 before boredom sets
 488    * in.
 489    */
 490   THETA_RHO(4, 0, 1, 2, 3);
 491   CHI_COMPL(t, d[3]);                         /*             [.]           */
 492   CHI_010_0(z->S[I(0, 3)], d[0], d[1], d[2]); /*  .   *   .          ->  . */
 493   CHI_101_0(z->S[I(1, 3)], d[1], d[2], d[3]); /*      *   .   *      ->  . */
 494   CHI_001_1(z->S[I(2, 3)], d[2], t,    d[4]); /*          .  [.]  *  ->  * */
 495   CHI_010_0(z->S[I(3, 3)], t,    d[4], d[0]); /*  .          [.]  *  ->  . */
 496   CHI_101_0(z->S[I(4, 3)], d[4], d[0], d[1]); /*  .   *           *  ->  . */
 497
 498   /* Last plane.  Just y' = 4 to go. */
 499   THETA_RHO(2, 3, 4, 0, 1);
 500   CHI_COMPL(t, d[1]);                         /*     [*]                   */
 501   CHI_110_1(z->S[I(0, 4)], d[0], t,    d[2]); /*  *  [*]  .          ->  * */
 502   CHI_101_0(z->S[I(1, 4)], t,    d[2], d[3]); /*     [*]  .   *      ->  . */
 503   CHI_010_0(z->S[I(2, 4)], d[2], d[3], d[4]); /*          .   *   .  ->  . */
 504   CHI_101_0(z->S[I(3, 4)], d[3], d[4], d[0]); /*  *           *   .  ->  . */
 505   CHI_010_0(z->S[I(4, 4)], d[4], d[0], d[1]); /*  *   .           .  ->  . */
 506
 507   /* And we're done. */
 508 #undef THETA_RHO
 509 #undef CHI_COMPL
 510 #undef CHI_001_1
 511 #undef CHI_010_0
 512 #undef CHI_101_0
 513 #undef CHI_110_1
 514 #undef CHI
 515 }
 516
 517 /* --- @keccak1600_p@ --- *
 518  *
 519  * Arguments:   @keccak1600_state *z@ = where to write the output state
 520  *              @conts keccak1600_state *x@ = input state
 521  *              @unsigned n@ = number of rounds to perform
 522  *
 523  * Returns:     ---
 524  *
 525  * Use:         Implements the %$\Keccak[1600, n]$% permutation at the core
 526  *              of Keccak and the SHA-3 standard.
 527  */
 528
 529 void keccak1600_p(keccak1600_state *z, const keccak1600_state *x, unsigned n)
 530 {
 531   keccak1600_state u, v;
 532   unsigned i = 0;
 533
 534 #ifdef KECCAK_DEBUG
 535   dump_state("init", 0, x);
 536 #endif
 537   keccak1600_round(&u, x, i++); n--;
 538   while (n > 8) {
 539     keccak1600_round(&v, &u, i++);
 540     keccak1600_round(&u, &v, i++);
 541     keccak1600_round(&v, &u, i++);
 542     keccak1600_round(&u, &v, i++);
 543     keccak1600_round(&v, &u, i++);
 544     keccak1600_round(&u, &v, i++);
 545     keccak1600_round(&v, &u, i++);
 546     keccak1600_round(&u, &v, i++);
 547     n -= 8;
 548   }
 549   switch (n) {
 550     case 7: keccak1600_round(&v, &u, i++);
 551             keccak1600_round(&u, &v, i++);
 552     case 5: keccak1600_round(&v, &u, i++);
 553             keccak1600_round(&u, &v, i++);
 554     case 3: keccak1600_round(&v, &u, i++);
 555             keccak1600_round(&u, &v, i++);
 556     case 1: keccak1600_round( z, &u, i++);
 557             break;
 558     case 8: keccak1600_round(&v, &u, i++);
 559             keccak1600_round(&u, &v, i++);
 560     case 6: keccak1600_round(&v, &u, i++);
 561             keccak1600_round(&u, &v, i++);
 562     case 4: keccak1600_round(&v, &u, i++);
 563             keccak1600_round(&u, &v, i++);
 564     case 2: keccak1600_round(&v, &u, i++);
 565             keccak1600_round( z, &v, i++);
 566             break;
 567   }
 568 #ifdef KECCAK_DEBUG
 569   dump_state("final", 0, z);
 570 #endif
 571 }
 572
 573 /* --- @keccack1600_init@ --- *
 574  *
 575  * Arguments:   @keccak1600_state *s@ = a state to initialize
 576  *
 577  * Returns:     ---
 578  *
 579  * Use:         Initialize @s@ to the root state.
 580  */
 581
 582 void keccak1600_init(keccak1600_state *s)
 583   { memset(s->S, 0, sizeof(s->S)); STATE_INIT(s); }
 584
 585 /* --- @keccak1600_mix@ --- *
 586  *
 587  * Arguments:   @keccak1600_state *s@ = a state to update
 588  *              @const kludge64 *p@ = pointer to 64-bit words to mix in
 589  *              @size_t n@ = size of the input, in 64-bit words
 590  *
 591  * Returns:     ---
 592  *
 593  * Use:         Mixes data into a %$\Keccak[r, 1600 - r]$% state.  Note that
 594  *              it's the caller's responsibility to pass in no more than
 595  *              %$r$% bits of data.
 596  */
 597
 598 void keccak1600_mix(keccak1600_state *s, const kludge64 *p, size_t n)
 599 {
 600   unsigned i;
 601   lane a;
 602
 603   for (i = 0; i < n; i++)
 604     { a = TO_LANE(p[i]); XOR_LANE(s->S[i], s->S[i], a); }
 605 }
 606
 607 /* --- @keccak1600_extract@ --- *
 608  *
 609  * Arguments:   @const keccak1600_state *s@ = a state to extract output from
 610  *              @kludge64 *p@ = pointer to 64-bit words to write
 611  *              @size_t n@ = size of the output, in 64-bit words
 612  *
 613  * Returns:     ---
 614  *
 615  * Use:         Reads output from a %$\Keccak[r, 1600 - r]$% state.  Note
 616  *              that it's the caller's responsibility to extract no more than
 617  *              %$r$% bits of data.
 618  */
 619
 620 void keccak1600_extract(const keccak1600_state *s, kludge64 *p, size_t n)
 621 {
 622   unsigned i;
 623   keccak1600_state t;
 624
 625   t = *s; STATE_OUT(&t);
 626   for (i = 0; i < n; i++) p[i] = FROM_LANE(t.S[i]);
 627 }
 628
 629 /*----- That's all, folks -------------------------------------------------*/