| 1 | /* -*-c-*- |
| 2 | * |
| 3 | * The Keccak-p[1600, n] permutation |
| 4 | * |
| 5 | * (c) 2017 Straylight/Edgeware |
| 6 | */ |
| 7 | |
| 8 | /*----- Licensing notice --------------------------------------------------* |
| 9 | * |
| 10 | * This file is part of Catacomb. |
| 11 | * |
| 12 | * Catacomb is free software; you can redistribute it and/or modify |
| 13 | * it under the terms of the GNU Library General Public License as |
| 14 | * published by the Free Software Foundation; either version 2 of the |
| 15 | * License, or (at your option) any later version. |
| 16 | * |
| 17 | * Catacomb is distributed in the hope that it will be useful, |
| 18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 20 | * GNU Library General Public License for more details. |
| 21 | * |
| 22 | * You should have received a copy of the GNU Library General Public |
| 23 | * License along with Catacomb; if not, write to the Free |
| 24 | * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, |
| 25 | * MA 02111-1307, USA. |
| 26 | */ |
| 27 | |
| 28 | /*----- Header files ------------------------------------------------------*/ |
| 29 | |
| 30 | #include <limits.h> |
| 31 | #include <string.h> |
| 32 | |
| 33 | #include <mLib/bits.h> |
| 34 | |
| 35 | #include "keccak1600.h" |
| 36 | #include "permute.h" |
| 37 | |
| 38 | /* #define KECCAK_DEBUG */ |
| 39 | |
| 40 | /*----- Miscellaneous utilities -------------------------------------------*/ |
| 41 | |
| 42 | #define I(x, y) ((x) + 5*(y)) /* Column-major indexing */ |
| 43 | |
| 44 | /*----- Interlacing or not ------------------------------------------------*/ |
| 45 | |
| 46 | /* We should prefer the interlaced representation if the target is really |
| 47 | * 32-bit and only providing synthetic 64-bit integers. Alas, the Windows |
| 48 | * 64-bit ABI specifies that `long' is only 32-bits (i.e., it is IL32/LLP64), |
| 49 | * so detect x86 specifically. |
| 50 | */ |
| 51 | #if (ULONG_MAX >> 31) <= 0xffffffff && \ |
| 52 | !defined(__amd64__) && !defined(_M_AMD64) |
| 53 | # define KECCAK_I32 |
| 54 | #endif |
| 55 | |
| 56 | #ifdef KECCAK_I32 |
| 57 | /* A 32-bit target with at best weak support for 64-bit shifts. Maintain a |
| 58 | * lane as two 32-bit pieces representing the even and odd bits of the lane. |
| 59 | * There are slightly fiddly transformations to apply on the way in and out |
| 60 | * of the main permutation. |
| 61 | */ |
| 62 | |
| 63 | typedef keccak1600_lane_i32 lane; |
| 64 | #define S si32 |
| 65 | |
| 66 | static lane interlace(kludge64 x) |
| 67 | { |
| 68 | /* Given a 64-bit string X, return a lane Z containing the even- and |
| 69 | * odd-numbered bits of X. |
| 70 | */ |
| 71 | |
| 72 | typedef uint32 regty; |
| 73 | #define REGWD 32 |
| 74 | |
| 75 | uint32 x0 = LO64(x), x1 = HI64(x); |
| 76 | lane z; |
| 77 | /* 5, 4, 3, 2, 1, 0 */ |
| 78 | TWIZZLE_EXCH(x1, x0, 4); /* 4, 5, 3, 2, 1, 0 */ |
| 79 | TWIZZLE_EXCH(x1, x0, 3); /* 3, 5, 4, 2, 1, 0 */ |
| 80 | TWIZZLE_EXCH(x1, x0, 2); /* 2, 5, 4, 3, 1, 0 */ |
| 81 | TWIZZLE_EXCH(x1, x0, 1); /* 1, 5, 4, 3, 2, 0 */ |
| 82 | TWIZZLE_EXCH(x1, x0, 0); /* 0, 5, 4, 3, 2, 1 */ |
| 83 | z.even = x0; z.odd = x1; return (z); |
| 84 | |
| 85 | #undef REGWD |
| 86 | } |
| 87 | |
| 88 | static kludge64 deinterlace(lane x) |
| 89 | { |
| 90 | /* Given a lane X, return the combined 64-bit value. This is the inverse |
| 91 | * to `interlace' above, and the principle is the same |
| 92 | */ |
| 93 | |
| 94 | typedef uint32 regty; |
| 95 | #define REGWD 32 |
| 96 | |
| 97 | uint32 x0 = x.even, x1 = x.odd; |
| 98 | kludge64 z; |
| 99 | /* 0, 5, 4, 3, 2, 1 */ |
| 100 | TWIZZLE_EXCH(x1, x0, 0); /* 1, 5, 4, 3, 2, 0 */ |
| 101 | TWIZZLE_EXCH(x1, x0, 1); /* 2, 5, 4, 3, 1, 0 */ |
| 102 | TWIZZLE_EXCH(x1, x0, 2); /* 3, 5, 4, 2, 1, 0 */ |
| 103 | TWIZZLE_EXCH(x1, x0, 3); /* 4, 5, 3, 2, 1, 0 */ |
| 104 | TWIZZLE_EXCH(x1, x0, 4); /* 5, 4, 3, 2, 1, 0 */ |
| 105 | SET64(z, x1, x0); return (z); |
| 106 | |
| 107 | #undef REGWD |
| 108 | } |
| 109 | |
| 110 | #define TO_LANE(x) (interlace(x)) |
| 111 | #define FROM_LANE(x) (deinterlace(x)) |
| 112 | |
| 113 | #define PRINTFMT_LANE "%08lx:%08lx" |
| 114 | #define PRINTARGS_LANE(x) (unsigned long)(x).even, (unsigned long)(x).odd |
| 115 | |
| 116 | #define BINOP_LANE(z, op, x, y) \ |
| 117 | ((z).even = (x).even op (y).even, (z).odd = (x).odd op (y).odd) |
| 118 | #define XOR_LANE(z, x, y) BINOP_LANE(z, ^, x, y) |
| 119 | #define AND_LANE(z, x, y) BINOP_LANE(z, &, x, y) |
| 120 | #define OR_LANE(z, x, y) BINOP_LANE(z, |, x, y) |
| 121 | #define NOT_LANE(z, x) ((z).even = ~(x).even, (z).odd = ~(x).odd) |
| 122 | |
| 123 | #define ROTL_LANE(z, x, n) do { \ |
| 124 | lane _t = (x); \ |
| 125 | (z).even = (n)%2 ? ROL32(_t.odd, ((n) + 1)/2) \ |
| 126 | : ROL32(_t.even, (n)/2); \ |
| 127 | (z).odd = (n)%2 ? ROL32(_t.even, ((n) - 1)/2) \ |
| 128 | : ROL32(_t.odd, (n)/2); \ |
| 129 | } while (0) |
| 130 | |
| 131 | #define LANE_ZERO { 0, 0 } |
| 132 | #define LANE_CMPL { 0xffffffff, 0xffffffff } |
| 133 | |
| 134 | static const lane rcon[24] = { |
| 135 | { 0x00000001, 0x00000000 }, { 0x00000000, 0x00000089 }, |
| 136 | { 0x00000000, 0x8000008b }, { 0x00000000, 0x80008080 }, |
| 137 | { 0x00000001, 0x0000008b }, { 0x00000001, 0x00008000 }, |
| 138 | { 0x00000001, 0x80008088 }, { 0x00000001, 0x80000082 }, |
| 139 | { 0x00000000, 0x0000000b }, { 0x00000000, 0x0000000a }, |
| 140 | { 0x00000001, 0x00008082 }, { 0x00000000, 0x00008003 }, |
| 141 | { 0x00000001, 0x0000808b }, { 0x00000001, 0x8000000b }, |
| 142 | { 0x00000001, 0x8000008a }, { 0x00000001, 0x80000081 }, |
| 143 | { 0x00000000, 0x80000081 }, { 0x00000000, 0x80000008 }, |
| 144 | { 0x00000000, 0x00000083 }, { 0x00000000, 0x80008003 }, |
| 145 | { 0x00000001, 0x80008088 }, { 0x00000000, 0x80000088 }, |
| 146 | { 0x00000001, 0x00008000 }, { 0x00000000, 0x80008082 } |
| 147 | }; |
| 148 | |
| 149 | #else |
| 150 | /* A target with good support for 64-bit shifts. We store lanes as 64-bit |
| 151 | * quantities and deal with them in the obvious, natural way. |
| 152 | */ |
| 153 | |
| 154 | typedef keccak1600_lane_64 lane; |
| 155 | #define S s64 |
| 156 | |
| 157 | #define TO_LANE(x) (x) |
| 158 | #define FROM_LANE(x) (x) |
| 159 | |
| 160 | #define PRINTFMT_LANE "%08lx%08lx" |
| 161 | #define PRINTARGS_LANE(x) (unsigned long)HI64(x), (unsigned long)LO64(x) |
| 162 | |
| 163 | #define XOR_LANE(z, x, y) XOR64((z), (x), (y)) |
| 164 | #define AND_LANE(z, x, y) AND64((z), (x), (y)) |
| 165 | #define OR_LANE(z, x, y) OR64((z), (x), (y)) |
| 166 | #define NOT_LANE(z, x) CPL64((z), (x)) |
| 167 | #define ROTL_LANE(z, x, n) ROL64_((z), (x), (n)) |
| 168 | |
| 169 | #define LANE_ZERO X64( 0, 0) |
| 170 | #define LANE_CMPL X64(ffffffff, ffffffff) |
| 171 | |
| 172 | static const lane rcon[24] = { |
| 173 | X64(00000000, 00000001), X64(00000000, 00008082), |
| 174 | X64(80000000, 0000808a), X64(80000000, 80008000), |
| 175 | X64(00000000, 0000808b), X64(00000000, 80000001), |
| 176 | X64(80000000, 80008081), X64(80000000, 00008009), |
| 177 | X64(00000000, 0000008a), X64(00000000, 00000088), |
| 178 | X64(00000000, 80008009), X64(00000000, 8000000a), |
| 179 | X64(00000000, 8000808b), X64(80000000, 0000008b), |
| 180 | X64(80000000, 00008089), X64(80000000, 00008003), |
| 181 | X64(80000000, 00008002), X64(80000000, 00000080), |
| 182 | X64(00000000, 0000800a), X64(80000000, 8000000a), |
| 183 | X64(80000000, 80008081), X64(80000000, 00008080), |
| 184 | X64(00000000, 80000001), X64(80000000, 80008008) |
| 185 | }; |
| 186 | |
| 187 | #endif |
| 188 | |
| 189 | /*----- Complementing or not ----------------------------------------------*/ |
| 190 | |
| 191 | /* We should use the complemented representation if the target doesn't have a |
| 192 | * fused and-not operation. There doesn't appear to be a principled way to |
| 193 | * do this, so we'll just have to make do with a big list. Worse, in my |
| 194 | * brief survey of the architecture reference manuals I have lying about, |
| 195 | * they've split close to 50/50 on this question, so I don't have an |
| 196 | * especially good way to pick a default. The `no-fused-op' architectures |
| 197 | * seem generally a bit more modern than the `fused-op' architectures, so I |
| 198 | * guess I'll make the complemented representation the default. |
| 199 | * |
| 200 | * and-not No and-not |
| 201 | * ------- ---------- |
| 202 | * ARM (`bic') x86/amd64 |
| 203 | * Sparc (`andn') z/Architecture |
| 204 | * MMIX (`andn') MIPS |
| 205 | * IA64 (`andcm') 68k |
| 206 | * VAX (`bic') RISC-V |
| 207 | * PDP-10 (`andc') |
| 208 | */ |
| 209 | #if !(defined(__arm__) || defined(__thumb__) || defined(__aarch64__) || \ |
| 210 | defined(_M_ARM) || defined(_M_THUMB)) && \ |
| 211 | !(defined(__ia64__) || defined(__ia64) || defined(__itanium__) || \ |
| 212 | defined(_M_IA64)) && \ |
| 213 | !defined(__mmix__) && \ |
| 214 | !(defined(__sparc__) || defined(__sparc)) && \ |
| 215 | !defined(__vax__) && \ |
| 216 | !defined(__pdp10__) |
| 217 | # define KECCAK_COMPL |
| 218 | #endif |
| 219 | |
| 220 | #ifdef KECCAK_COMPL |
| 221 | /* A target without fused and/not (`bic', `andc2'). We complement some of |
| 222 | * the lanes in the initial state and undo this on output. (Absorbing XORs |
| 223 | * input into the state, so this is unaffected.) See the handling of chi in |
| 224 | * `keccak1600_round' below for the details. |
| 225 | */ |
| 226 | |
| 227 | #define COMPL_MASK 0x00121106u |
| 228 | |
| 229 | #define STATE_INIT(z) do { \ |
| 230 | lane cmpl = LANE_CMPL; \ |
| 231 | (z)->S[I(1, 0)] = cmpl; (z)->S[I(2, 0)] = cmpl; \ |
| 232 | (z)->S[I(3, 1)] = cmpl; (z)->S[I(2, 2)] = cmpl; \ |
| 233 | (z)->S[I(2, 3)] = cmpl; (z)->S[I(0, 4)] = cmpl; \ |
| 234 | } while (0) |
| 235 | |
| 236 | #define STATE_OUT(z) do { \ |
| 237 | NOT_LANE((z)->S[I(1, 0)], (z)->S[I(1, 0)]); \ |
| 238 | NOT_LANE((z)->S[I(2, 0)], (z)->S[I(2, 0)]); \ |
| 239 | NOT_LANE((z)->S[I(3, 1)], (z)->S[I(3, 1)]); \ |
| 240 | NOT_LANE((z)->S[I(2, 2)], (z)->S[I(2, 2)]); \ |
| 241 | NOT_LANE((z)->S[I(2, 3)], (z)->S[I(2, 3)]); \ |
| 242 | NOT_LANE((z)->S[I(0, 4)], (z)->S[I(0, 4)]); \ |
| 243 | } while (0) |
| 244 | |
| 245 | #else |
| 246 | /* A target with fused and/not (`bic', `andc2'). Everything is simple. */ |
| 247 | |
| 248 | #define COMPL_MASK 0u |
| 249 | |
| 250 | #define STATE_INIT(z) do ; while (0) |
| 251 | #define STATE_OUT(z) do ; while (0) |
| 252 | |
| 253 | #endif |
| 254 | |
| 255 | /*----- Other magic constants ---------------------------------------------*/ |
| 256 | |
| 257 | /* The rotation constants. These are systematically named -- see `THETA_RHO' |
| 258 | * below. |
| 259 | */ |
| 260 | #define ROT_0_0 0 |
| 261 | #define ROT_1_0 1 |
| 262 | #define ROT_2_0 62 |
| 263 | #define ROT_3_0 28 |
| 264 | #define ROT_4_0 27 |
| 265 | |
| 266 | #define ROT_0_1 36 |
| 267 | #define ROT_1_1 44 |
| 268 | #define ROT_2_1 6 |
| 269 | #define ROT_3_1 55 |
| 270 | #define ROT_4_1 20 |
| 271 | |
| 272 | #define ROT_0_2 3 |
| 273 | #define ROT_1_2 10 |
| 274 | #define ROT_2_2 43 |
| 275 | #define ROT_3_2 25 |
| 276 | #define ROT_4_2 39 |
| 277 | |
| 278 | #define ROT_0_3 41 |
| 279 | #define ROT_1_3 45 |
| 280 | #define ROT_2_3 15 |
| 281 | #define ROT_3_3 21 |
| 282 | #define ROT_4_3 8 |
| 283 | |
| 284 | #define ROT_0_4 18 |
| 285 | #define ROT_1_4 2 |
| 286 | #define ROT_2_4 61 |
| 287 | #define ROT_3_4 56 |
| 288 | #define ROT_4_4 14 |
| 289 | |
| 290 | /*----- Debugging ---------------------------------------------------------*/ |
| 291 | |
| 292 | #ifdef KECCAK_DEBUG |
| 293 | |
| 294 | #include <stdio.h> |
| 295 | |
| 296 | static void dump_state(const char *what, unsigned ir, |
| 297 | const keccak1600_state *x) |
| 298 | { |
| 299 | unsigned i, j; |
| 300 | keccak1600_state y; |
| 301 | kludge64 a; |
| 302 | int sep; |
| 303 | |
| 304 | printf(";; %s [round %u]\n", what, ir); |
| 305 | printf(";; raw state...\n"); |
| 306 | for (j = 0; j < 5; j++) { |
| 307 | printf(";;"); |
| 308 | for (i = 0, sep = '\t'; i < 5; i++, sep = ' ') |
| 309 | printf("%c" PRINTFMT_LANE, sep, PRINTARGS_LANE(x->S[I(i, j)])); |
| 310 | fputc('\n', stdout); |
| 311 | } |
| 312 | y = *x; STATE_OUT(&y); |
| 313 | #ifdef KECCAK_COMPL |
| 314 | printf(";; uncomplemented state...\n"); |
| 315 | for (j = 0; j < 5; j++) { |
| 316 | printf(";;"); |
| 317 | for (i = 0, sep = '\t'; i < 5; i++, sep = ' ') |
| 318 | printf("%c" PRINTFMT_LANE, sep, PRINTARGS_LANE(y.S[I(i, j)])); |
| 319 | fputc('\n', stdout); |
| 320 | } |
| 321 | #endif |
| 322 | #ifdef KECCAK_I32 |
| 323 | printf(";; deinterlaced state...\n"); |
| 324 | for (j = 0; j < 5; j++) { |
| 325 | printf(";;"); |
| 326 | for (i = 0, sep = '\t'; i < 5; i++, sep = ' ') { |
| 327 | a = FROM_LANE(y.S[I(i, j)]); |
| 328 | printf("%c%08lx%08lx", sep, |
| 329 | (unsigned long)HI64(a), (unsigned long)LO64(a)); |
| 330 | } |
| 331 | fputc('\n', stdout); |
| 332 | } |
| 333 | #endif |
| 334 | fputc('\n', stdout); |
| 335 | } |
| 336 | |
| 337 | #endif |
| 338 | |
| 339 | /*----- The Keccak-p[1600, n] permutation ---------------------------------*/ |
| 340 | |
| 341 | static void keccak1600_round(keccak1600_state *z, |
| 342 | const keccak1600_state *x, unsigned i) |
| 343 | { |
| 344 | /* Perform a round of Keccak-p[1600, n]. Process the state X and write the |
| 345 | * result to Z. |
| 346 | */ |
| 347 | |
| 348 | lane c[5], d[5], t; |
| 349 | |
| 350 | /* Theta, first step: calculate the column parities. */ |
| 351 | #define COLPARITY(j) do { \ |
| 352 | d[j] = x->S[I(j, 0)]; \ |
| 353 | XOR_LANE(d[j], d[j], x->S[I(j, 1)]); \ |
| 354 | XOR_LANE(d[j], d[j], x->S[I(j, 2)]); \ |
| 355 | XOR_LANE(d[j], d[j], x->S[I(j, 3)]); \ |
| 356 | XOR_LANE(d[j], d[j], x->S[I(j, 4)]); \ |
| 357 | } while (0) |
| 358 | COLPARITY(0); COLPARITY(1); COLPARITY(2); COLPARITY(3); COLPARITY(4); |
| 359 | #undef COLPARITY |
| 360 | |
| 361 | /* Theta, second step: calculate the combined effect. */ |
| 362 | ROTL_LANE(c[0], d[1], 1); XOR_LANE(c[0], c[0], d[4]); |
| 363 | ROTL_LANE(c[1], d[2], 1); XOR_LANE(c[1], c[1], d[0]); |
| 364 | ROTL_LANE(c[2], d[3], 1); XOR_LANE(c[2], c[2], d[1]); |
| 365 | ROTL_LANE(c[3], d[4], 1); XOR_LANE(c[3], c[3], d[2]); |
| 366 | ROTL_LANE(c[4], d[0], 1); XOR_LANE(c[4], c[4], d[3]); |
| 367 | |
| 368 | /* Now we work plane by plane through the output. To do this, we must undo |
| 369 | * the pi transposition. Pi maps (x', y') = (y, 2 x + 3 y), so y = x', and |
| 370 | * x = (y' - 3 y)/2 = 3 (y' - 3 x') = x' + 3 y'. |
| 371 | */ |
| 372 | #define THETA_RHO(i0, i1, i2, i3, i4) do { \ |
| 373 | \ |
| 374 | /* First, theta. */ \ |
| 375 | XOR_LANE(d[0], x->S[I(i0, 0)], c[i0]); \ |
| 376 | XOR_LANE(d[1], x->S[I(i1, 1)], c[i1]); \ |
| 377 | XOR_LANE(d[2], x->S[I(i2, 2)], c[i2]); \ |
| 378 | XOR_LANE(d[3], x->S[I(i3, 3)], c[i3]); \ |
| 379 | XOR_LANE(d[4], x->S[I(i4, 4)], c[i4]); \ |
| 380 | \ |
| 381 | /* Then rho. */ \ |
| 382 | ROTL_LANE(d[0], d[0], ROT_##i0##_0); \ |
| 383 | ROTL_LANE(d[1], d[1], ROT_##i1##_1); \ |
| 384 | ROTL_LANE(d[2], d[2], ROT_##i2##_2); \ |
| 385 | ROTL_LANE(d[3], d[3], ROT_##i3##_3); \ |
| 386 | ROTL_LANE(d[4], d[4], ROT_##i4##_4); \ |
| 387 | } while (0) |
| 388 | |
| 389 | /* The basic chi operation is: z = w ^ (~a&b), but this involves an |
| 390 | * inversion which we can mostly avoid by being clever: observe that |
| 391 | * |
| 392 | * w ^ (~a&~~b) = w ^ ~(a | ~b) = ~w ^ (a | ~b) |
| 393 | * |
| 394 | * by De Morgan's law. Furthermore, complementing w or z is basically |
| 395 | * equivalent. Bertoni, Daemen, Peeters, Van Assche, and Van Keer, `Keccak |
| 396 | * implementation overview', describe a pattern of lane complementation |
| 397 | * which propagates through theta and pi in exactly the right way to be |
| 398 | * restored easily by chi, here, with exactly one inversion per plane. |
| 399 | * |
| 400 | * Here's the pattern. |
| 401 | * |
| 402 | * [ * . * * . ] [ . * * . . ] |
| 403 | * [ * . * . . ] [ . . . * . ] |
| 404 | * [ * . * . . ] -> [ . . * . . ] |
| 405 | * [ . * . * * ] [ . . * . . ] |
| 406 | * [ * . . * . ] [ * . . . . ] |
| 407 | * |
| 408 | * where a `.' means that the lane is unchanged, and a `*' means that it |
| 409 | * has been complemented. |
| 410 | * |
| 411 | * The macros `CHI_wxy_z' calculate z in terms of w, x, y assuming that the |
| 412 | * inputs w, x, y marked with a `1' are complemented on input, and arrange |
| 413 | * for z to be complemented on output if z is so marked. |
| 414 | * |
| 415 | * The diagrams to the right show the fragment of the complementation |
| 416 | * pattern being handled by the corresponding line of code. A symbol in |
| 417 | * brackets indicates a deviation from the input pattern forced by explicit |
| 418 | * complementation: there will be exactly one of these for each plane. |
| 419 | */ |
| 420 | #ifdef KECCAK_COMPL |
| 421 | # define CHI_COMPL(z, x) NOT_LANE((z), (x)) |
| 422 | # define CHI_001_1(z, w, x, y) \ |
| 423 | (OR_LANE((z), (x), (y)), XOR_LANE((z), (z), (w))) |
| 424 | # define CHI_010_0(z, w, x, y) \ |
| 425 | (AND_LANE((z), (x), (y)), XOR_LANE((z), (z), (w))) |
| 426 | # define CHI_101_0 CHI_001_1 |
| 427 | # define CHI_110_1 CHI_010_0 |
| 428 | #else |
| 429 | # define CHI(z, w, x, y) \ |
| 430 | (NOT_LANE((z), (x)), \ |
| 431 | AND_LANE((z), (z), (y)), \ |
| 432 | XOR_LANE((z), (z), (w))) |
| 433 | # define CHI_COMPL(z, x) ((z) = (x)) |
| 434 | # define CHI_001_1 CHI |
| 435 | # define CHI_010_0 CHI |
| 436 | # define CHI_101_0 CHI |
| 437 | # define CHI_110_1 CHI |
| 438 | #endif |
| 439 | |
| 440 | /* Let's do the y' = 0 plane first. Theta and rho are easy with our macro, |
| 441 | * and we've done pi with the coordinate hacking. That leaves chi next. |
| 442 | * This is hairy because we must worry about complementation. |
| 443 | */ |
| 444 | THETA_RHO(0, 1, 2, 3, 4); |
| 445 | CHI_COMPL(t, d[2]); /* [.] */ |
| 446 | CHI_101_0(z->S[I(0, 0)], d[0], d[1], d[2]); /* * . * -> . */ |
| 447 | CHI_001_1(z->S[I(1, 0)], d[1], t, d[3]); /* . [.] * -> * */ |
| 448 | CHI_110_1(z->S[I(2, 0)], d[2], d[3], d[4]); /* * * . -> * */ |
| 449 | CHI_101_0(z->S[I(3, 0)], d[3], d[4], d[0]); /* * * . -> . */ |
| 450 | CHI_010_0(z->S[I(4, 0)], d[4], d[0], d[1]); /* * . . -> . */ |
| 451 | |
| 452 | /* We'd better do iota before we forget. */ |
| 453 | XOR_LANE(z->S[I(0, 0)], z->S[I(0, 0)], rcon[i]); |
| 454 | |
| 455 | /* That was fun. Maybe y' = 1 will be as good. */ |
| 456 | THETA_RHO(3, 4, 0, 1, 2); |
| 457 | CHI_COMPL(t, d[4]); /* [*] */ |
| 458 | CHI_101_0(z->S[I(0, 1)], d[0], d[1], d[2]); /* * . * -> . */ |
| 459 | CHI_010_0(z->S[I(1, 1)], d[1], d[2], d[3]); /* . * . -> . */ |
| 460 | CHI_101_0(z->S[I(2, 1)], d[2], d[3], t); /* * . [*] -> . */ |
| 461 | CHI_001_1(z->S[I(3, 1)], d[3], d[4], d[0]); /* * . . -> * */ |
| 462 | CHI_010_0(z->S[I(4, 1)], d[4], d[0], d[1]); /* * . . -> . */ |
| 463 | |
| 464 | /* We're getting the hang of this. The y' = 2 plane shouldn't be any |
| 465 | * trouble. |
| 466 | */ |
| 467 | THETA_RHO(1, 2, 3, 4, 0); |
| 468 | CHI_COMPL(t, d[3]); /* [*] */ |
| 469 | CHI_101_0(z->S[I(0, 2)], d[0], d[1], d[2]); /* * . * -> . */ |
| 470 | CHI_010_0(z->S[I(1, 2)], d[1], d[2], d[3]); /* . * . -> . */ |
| 471 | CHI_110_1(z->S[I(2, 2)], d[2], t, d[4]); /* * [*] . -> * */ |
| 472 | CHI_101_0(z->S[I(3, 2)], t, d[4], d[0]); /* * [*] . -> . */ |
| 473 | CHI_010_0(z->S[I(4, 2)], d[4], d[0], d[1]); /* * . . -> . */ |
| 474 | |
| 475 | /* This isn't as interesting any more. Let's do y' = 3 before boredom sets |
| 476 | * in. |
| 477 | */ |
| 478 | THETA_RHO(4, 0, 1, 2, 3); |
| 479 | CHI_COMPL(t, d[3]); /* [.] */ |
| 480 | CHI_010_0(z->S[I(0, 3)], d[0], d[1], d[2]); /* . * . -> . */ |
| 481 | CHI_101_0(z->S[I(1, 3)], d[1], d[2], d[3]); /* * . * -> . */ |
| 482 | CHI_001_1(z->S[I(2, 3)], d[2], t, d[4]); /* . [.] * -> * */ |
| 483 | CHI_010_0(z->S[I(3, 3)], t, d[4], d[0]); /* . [.] * -> . */ |
| 484 | CHI_101_0(z->S[I(4, 3)], d[4], d[0], d[1]); /* . * * -> . */ |
| 485 | |
| 486 | /* Last plane. Just y' = 4 to go. */ |
| 487 | THETA_RHO(2, 3, 4, 0, 1); |
| 488 | CHI_COMPL(t, d[1]); /* [*] */ |
| 489 | CHI_110_1(z->S[I(0, 4)], d[0], t, d[2]); /* * [*] . -> * */ |
| 490 | CHI_101_0(z->S[I(1, 4)], t, d[2], d[3]); /* [*] . * -> . */ |
| 491 | CHI_010_0(z->S[I(2, 4)], d[2], d[3], d[4]); /* . * . -> . */ |
| 492 | CHI_101_0(z->S[I(3, 4)], d[3], d[4], d[0]); /* * * . -> . */ |
| 493 | CHI_010_0(z->S[I(4, 4)], d[4], d[0], d[1]); /* * . . -> . */ |
| 494 | |
| 495 | /* And we're done. */ |
| 496 | #undef THETA_RHO |
| 497 | #undef CHI_COMPL |
| 498 | #undef CHI_001_1 |
| 499 | #undef CHI_010_0 |
| 500 | #undef CHI_101_0 |
| 501 | #undef CHI_110_1 |
| 502 | #undef CHI |
| 503 | } |
| 504 | |
| 505 | /* --- @keccak1600_p@ --- * |
| 506 | * |
| 507 | * Arguments: @keccak1600_state *z@ = where to write the output state |
| 508 | * @conts keccak1600_state *x@ = input state |
| 509 | * @unsigned n@ = number of rounds to perform |
| 510 | * |
| 511 | * Returns: --- |
| 512 | * |
| 513 | * Use: Implements the %$\Keccak[1600, n]$% permutation at the core |
| 514 | * of Keccak and the SHA-3 standard. |
| 515 | */ |
| 516 | |
| 517 | void keccak1600_p(keccak1600_state *z, const keccak1600_state *x, unsigned n) |
| 518 | { |
| 519 | keccak1600_state u, v; |
| 520 | unsigned i = 0; |
| 521 | |
| 522 | #ifdef KECCAK_DEBUG |
| 523 | dump_state("init", 0, x); |
| 524 | #endif |
| 525 | keccak1600_round(&u, x, i++); n--; |
| 526 | while (n > 8) { |
| 527 | keccak1600_round(&v, &u, i++); |
| 528 | keccak1600_round(&u, &v, i++); |
| 529 | keccak1600_round(&v, &u, i++); |
| 530 | keccak1600_round(&u, &v, i++); |
| 531 | keccak1600_round(&v, &u, i++); |
| 532 | keccak1600_round(&u, &v, i++); |
| 533 | keccak1600_round(&v, &u, i++); |
| 534 | keccak1600_round(&u, &v, i++); |
| 535 | n -= 8; |
| 536 | } |
| 537 | switch (n) { |
| 538 | case 7: keccak1600_round(&v, &u, i++); |
| 539 | keccak1600_round(&u, &v, i++); |
| 540 | case 5: keccak1600_round(&v, &u, i++); |
| 541 | keccak1600_round(&u, &v, i++); |
| 542 | case 3: keccak1600_round(&v, &u, i++); |
| 543 | keccak1600_round(&u, &v, i++); |
| 544 | case 1: keccak1600_round( z, &u, i++); |
| 545 | break; |
| 546 | case 8: keccak1600_round(&v, &u, i++); |
| 547 | keccak1600_round(&u, &v, i++); |
| 548 | case 6: keccak1600_round(&v, &u, i++); |
| 549 | keccak1600_round(&u, &v, i++); |
| 550 | case 4: keccak1600_round(&v, &u, i++); |
| 551 | keccak1600_round(&u, &v, i++); |
| 552 | case 2: keccak1600_round(&v, &u, i++); |
| 553 | keccak1600_round( z, &v, i++); |
| 554 | break; |
| 555 | } |
| 556 | #ifdef KECCAK_DEBUG |
| 557 | dump_state("final", 0, z); |
| 558 | #endif |
| 559 | } |
| 560 | |
| 561 | /* --- @keccack1600_init@ --- * |
| 562 | * |
| 563 | * Arguments: @keccak1600_state *s@ = a state to initialize |
| 564 | * |
| 565 | * Returns: --- |
| 566 | * |
| 567 | * Use: Initialize @s@ to the root state. |
| 568 | */ |
| 569 | |
| 570 | void keccak1600_init(keccak1600_state *s) |
| 571 | { memset(s->S, 0, sizeof(s->S)); STATE_INIT(s); } |
| 572 | |
| 573 | /* --- @keccak1600_mix@ --- * |
| 574 | * |
| 575 | * Arguments: @keccak1600_state *s@ = a state to update |
| 576 | * @const kludge64 *p@ = pointer to 64-bit words to mix in |
| 577 | * @size_t n@ = size of the input, in 64-bit words |
| 578 | * |
| 579 | * Returns: --- |
| 580 | * |
| 581 | * Use: Mixes data into a %$\Keccak[r, 1600 - r]$% state. Note that |
| 582 | * it's the caller's responsibility to pass in no more than |
| 583 | * %$r$% bits of data. |
| 584 | */ |
| 585 | |
| 586 | void keccak1600_mix(keccak1600_state *s, const kludge64 *p, size_t n) |
| 587 | { |
| 588 | unsigned i; |
| 589 | lane a; |
| 590 | |
| 591 | for (i = 0; i < n; i++) |
| 592 | { a = TO_LANE(p[i]); XOR_LANE(s->S[i], s->S[i], a); } |
| 593 | } |
| 594 | |
| 595 | /* --- @keccak1600_set@ --- * |
| 596 | * |
| 597 | * Arguments: @keccak1600_state *s@ = a state to update |
| 598 | * @const kludge64 *p@ = pointer to 64-bit words to mix in |
| 599 | * @size_t n@ = size of the input, in 64-bit words |
| 600 | * |
| 601 | * Returns: --- |
| 602 | * |
| 603 | * Use: Stores data into a %$\Keccak[r, 1600 - r]$% state. Note that |
| 604 | * it's the caller's responsibility to pass in no more than |
| 605 | * %$r$% bits of data. |
| 606 | * |
| 607 | * This is not the operation you wanted for ordinary hashing. |
| 608 | * It's provided for the use of higher-level protocols which use |
| 609 | * duplexing and other fancy sponge features. |
| 610 | */ |
| 611 | |
| 612 | void keccak1600_set(keccak1600_state *s, const kludge64 *p, size_t n) |
| 613 | { |
| 614 | uint32 m = COMPL_MASK; |
| 615 | unsigned i; |
| 616 | lane a; |
| 617 | |
| 618 | for (i = 0; i < n; i++) { |
| 619 | a = TO_LANE(p[i]); if (m&1) NOT_LANE(a, a); |
| 620 | s->S[i] = a; m >>= 1; |
| 621 | } |
| 622 | } |
| 623 | |
| 624 | /* --- @keccak1600_extract@ --- * |
| 625 | * |
| 626 | * Arguments: @const keccak1600_state *s@ = a state to extract output from |
| 627 | * @kludge64 *p@ = pointer to 64-bit words to write |
| 628 | * @size_t n@ = size of the output, in 64-bit words |
| 629 | * |
| 630 | * Returns: --- |
| 631 | * |
| 632 | * Use: Reads output from a %$\Keccak[r, 1600 - r]$% state. Note |
| 633 | * that it's the caller's responsibility to extract no more than |
| 634 | * %$r$% bits of data. |
| 635 | */ |
| 636 | |
| 637 | void keccak1600_extract(const keccak1600_state *s, kludge64 *p, size_t n) |
| 638 | { |
| 639 | uint32 m = COMPL_MASK; |
| 640 | unsigned i; |
| 641 | lane t; |
| 642 | |
| 643 | for (i = 0; i < n; i++) { |
| 644 | t = s->S[i]; if (m&1) NOT_LANE(t, t); |
| 645 | *p++ = FROM_LANE(t); m >>= 1; |
| 646 | } |
| 647 | } |
| 648 | |
| 649 | /*----- Test rig ----------------------------------------------------------*/ |
| 650 | |
| 651 | #ifdef TEST_RIG |
| 652 | |
| 653 | #include <stdio.h> |
| 654 | |
| 655 | #include <mLib/macros.h> |
| 656 | #include <mLib/quis.h> |
| 657 | #include <mLib/report.h> |
| 658 | #include <mLib/testrig.h> |
| 659 | |
| 660 | static int vrf_p(dstr v[]) |
| 661 | { |
| 662 | keccak1600_state u; |
| 663 | kludge64 t[25]; |
| 664 | dstr d = DSTR_INIT; |
| 665 | int n; |
| 666 | unsigned i; |
| 667 | int ok = 1; |
| 668 | |
| 669 | if (v[0].len != 200) die(1, "bad input size"); |
| 670 | if (v[2].len != 200) die(1, "bad output size"); |
| 671 | n = *(int *)v[1].buf; |
| 672 | dstr_ensure(&d, 200); d.len = 200; |
| 673 | |
| 674 | keccak1600_init(&u); |
| 675 | for (i = 0; i < 25; i++) LOAD64_L_(t[i], v[0].buf + 8*i); |
| 676 | keccak1600_mix(&u, t, 25); |
| 677 | keccak1600_p(&u, &u, n); |
| 678 | keccak1600_extract(&u, t, 25); |
| 679 | for (i = 0; i < 25; i++) STORE64_L_(d.buf + 8*i, t[i]); |
| 680 | if (MEMCMP(d.buf, !=, v[2].buf, 200)) { |
| 681 | ok = 0; |
| 682 | fprintf(stderr, "failed!"); |
| 683 | fprintf(stderr, "\n\t input = "); type_hex.dump(&v[0], stderr); |
| 684 | fprintf(stderr, "\n\t rounds = %d", n); |
| 685 | fprintf(stderr, "\n\t expected = "); type_hex.dump(&v[2], stderr); |
| 686 | fprintf(stderr, "\n\t calclated = "); type_hex.dump(&d, stderr); |
| 687 | } |
| 688 | |
| 689 | dstr_destroy(&d); |
| 690 | return (ok); |
| 691 | } |
| 692 | |
| 693 | static test_chunk defs[] = { |
| 694 | { "p", vrf_p, { &type_hex, &type_int, &type_hex } }, |
| 695 | { 0, 0, { 0 } } |
| 696 | }; |
| 697 | |
| 698 | int main(int argc, char *argv[]) |
| 699 | { |
| 700 | test_run(argc, argv, defs, SRCDIR"/t/keccak1600"); |
| 701 | return (0); |
| 702 | } |
| 703 | |
| 704 | #endif |
| 705 | |
| 706 | /*----- That's all, folks -------------------------------------------------*/ |