mdw@git.distorted.org.uk Git - catacomb/blame_incremental

... / ...

Commit	Line	Data
	1	/* --c--
	2	*
	3	* The Keccak-p[1600, n] permutation
	4	*
	5	* (c) 2017 Straylight/Edgeware
	6	*/
	7
	8	/----- Licensing notice --------------------------------------------------
	9	*
	10	* This file is part of Catacomb.
	11	*
	12	* Catacomb is free software; you can redistribute it and/or modify
	13	* it under the terms of the GNU Library General Public License as
	14	* published by the Free Software Foundation; either version 2 of the
	15	* License, or (at your option) any later version.
	16	*
	17	* Catacomb is distributed in the hope that it will be useful,
	18	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	* GNU Library General Public License for more details.
	21	*
	22	* You should have received a copy of the GNU Library General Public
	23	* License along with Catacomb; if not, write to the Free
	24	* Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	* MA 02111-1307, USA.
	26	*/
	27
	28	/----- Header files ------------------------------------------------------/
	29
	30	#include <limits.h>
	31	#include <string.h>
	32
	33	#include <mLib/bits.h>
	34
	35	#include "keccak1600.h"
	36	#include "permute.h"
	37
	38	/* #define KECCAK_DEBUG */
	39
	40	/----- Miscellaneous utilities -------------------------------------------/
	41
	42	#define I(x, y) ((x) + 5(y)) / Column-major indexing */
	43
	44	/----- Interlacing or not ------------------------------------------------/
	45
	46	/* We should prefer the interlaced representation if the target is really
	47	* 32-bit and only providing synthetic 64-bit integers. Alas, the Windows
	48	* 64-bit ABI specifies that `long' is only 32-bits (i.e., it is IL32/LLP64),
	49	* so detect x86 specifically.
	50	*/
	51	#if (ULONG_MAX >> 31) <= 0xffffffff && \
	52	!defined(__amd64__) && !defined(_M_AMD64)
	53	# define KECCAK_I32
	54	#endif
	55
	56	#ifdef KECCAK_I32
	57	/* A 32-bit target with at best weak support for 64-bit shifts. Maintain a
	58	* lane as two 32-bit pieces representing the even and odd bits of the lane.
	59	* There are slightly fiddly transformations to apply on the way in and out
	60	* of the main permutation.
	61	*/
	62
	63	typedef keccak1600_lane_i32 lane;
	64	#define S si32
	65
	66	static lane interlace(kludge64 x)
	67	{
	68	/* Given a 64-bit string X, return a lane Z containing the even- and
	69	* odd-numbered bits of X.
	70	*/
	71
	72	typedef uint32 regty;
	73	#define REGWD 32
	74
	75	uint32 x0 = LO64(x), x1 = HI64(x);
	76	lane z;
	77	/* 5, 4, 3, 2, 1, 0 */
	78	TWIZZLE_EXCH(x1, x0, 4); /* 4, 5, 3, 2, 1, 0 */
	79	TWIZZLE_EXCH(x1, x0, 3); /* 3, 5, 4, 2, 1, 0 */
	80	TWIZZLE_EXCH(x1, x0, 2); /* 2, 5, 4, 3, 1, 0 */
	81	TWIZZLE_EXCH(x1, x0, 1); /* 1, 5, 4, 3, 2, 0 */
	82	TWIZZLE_EXCH(x1, x0, 0); /* 0, 5, 4, 3, 2, 1 */
	83	z.even = x0; z.odd = x1; return (z);
	84
	85	#undef REGWD
	86	}
	87
	88	static kludge64 deinterlace(lane x)
	89	{
	90	/* Given a lane X, return the combined 64-bit value. This is the inverse
	91	* to `interlace' above, and the principle is the same
	92	*/
	93
	94	typedef uint32 regty;
	95	#define REGWD 32
	96
	97	uint32 x0 = x.even, x1 = x.odd;
	98	kludge64 z;
	99	/* 0, 5, 4, 3, 2, 1 */
	100	TWIZZLE_EXCH(x1, x0, 0); /* 1, 5, 4, 3, 2, 0 */
	101	TWIZZLE_EXCH(x1, x0, 1); /* 2, 5, 4, 3, 1, 0 */
	102	TWIZZLE_EXCH(x1, x0, 2); /* 3, 5, 4, 2, 1, 0 */
	103	TWIZZLE_EXCH(x1, x0, 3); /* 4, 5, 3, 2, 1, 0 */
	104	TWIZZLE_EXCH(x1, x0, 4); /* 5, 4, 3, 2, 1, 0 */
	105	SET64(z, x1, x0); return (z);
	106
	107	#undef REGWD
	108	}
	109
	110	#define TO_LANE(x) (interlace(x))
	111	#define FROM_LANE(x) (deinterlace(x))
	112
	113	#define PRINTFMT_LANE "%08lx:%08lx"
	114	#define PRINTARGS_LANE(x) (unsigned long)(x).even, (unsigned long)(x).odd
	115
	116	#define BINOP_LANE(z, op, x, y) \
	117	((z).even = (x).even op (y).even, (z).odd = (x).odd op (y).odd)
	118	#define XOR_LANE(z, x, y) BINOP_LANE(z, ^, x, y)
	119	#define AND_LANE(z, x, y) BINOP_LANE(z, &, x, y)
	120	#define OR_LANE(z, x, y) BINOP_LANE(z, \|, x, y)
	121	#define NOT_LANE(z, x) ((z).even = ~(x).even, (z).odd = ~(x).odd)
	122
	123	#define ROTL_LANE(z, x, n) do { \
	124	lane _t = (x); \
	125	(z).even = (n)%2 ? ROL32(_t.odd, ((n) + 1)/2) \
	126	: ROL32(_t.even, (n)/2); \
	127	(z).odd = (n)%2 ? ROL32(_t.even, ((n) - 1)/2) \
	128	: ROL32(_t.odd, (n)/2); \
	129	} while (0)
	130
	131	#define LANE_ZERO { 0, 0 }
	132	#define LANE_CMPL { 0xffffffff, 0xffffffff }
	133
	134	static const lane rcon[24] = {
	135	{ 0x00000001, 0x00000000 }, { 0x00000000, 0x00000089 },
	136	{ 0x00000000, 0x8000008b }, { 0x00000000, 0x80008080 },
	137	{ 0x00000001, 0x0000008b }, { 0x00000001, 0x00008000 },
	138	{ 0x00000001, 0x80008088 }, { 0x00000001, 0x80000082 },
	139	{ 0x00000000, 0x0000000b }, { 0x00000000, 0x0000000a },
	140	{ 0x00000001, 0x00008082 }, { 0x00000000, 0x00008003 },
	141	{ 0x00000001, 0x0000808b }, { 0x00000001, 0x8000000b },
	142	{ 0x00000001, 0x8000008a }, { 0x00000001, 0x80000081 },
	143	{ 0x00000000, 0x80000081 }, { 0x00000000, 0x80000008 },
	144	{ 0x00000000, 0x00000083 }, { 0x00000000, 0x80008003 },
	145	{ 0x00000001, 0x80008088 }, { 0x00000000, 0x80000088 },
	146	{ 0x00000001, 0x00008000 }, { 0x00000000, 0x80008082 }
	147	};
	148
	149	#else
	150	/* A target with good support for 64-bit shifts. We store lanes as 64-bit
	151	* quantities and deal with them in the obvious, natural way.
	152	*/
	153
	154	typedef keccak1600_lane_64 lane;
	155	#define S s64
	156
	157	#define TO_LANE(x) (x)
	158	#define FROM_LANE(x) (x)
	159
	160	#define PRINTFMT_LANE "%08lx%08lx"
	161	#define PRINTARGS_LANE(x) (unsigned long)HI64(x), (unsigned long)LO64(x)
	162
	163	#define XOR_LANE(z, x, y) XOR64((z), (x), (y))
	164	#define AND_LANE(z, x, y) AND64((z), (x), (y))
	165	#define OR_LANE(z, x, y) OR64((z), (x), (y))
	166	#define NOT_LANE(z, x) CPL64((z), (x))
	167	#define ROTL_LANE(z, x, n) ROL64_((z), (x), (n))
	168
	169	#define LANE_ZERO X64( 0, 0)
	170	#define LANE_CMPL X64(ffffffff, ffffffff)
	171
	172	static const lane rcon[24] = {
	173	X64(00000000, 00000001), X64(00000000, 00008082),
	174	X64(80000000, 0000808a), X64(80000000, 80008000),
	175	X64(00000000, 0000808b), X64(00000000, 80000001),
	176	X64(80000000, 80008081), X64(80000000, 00008009),
	177	X64(00000000, 0000008a), X64(00000000, 00000088),
	178	X64(00000000, 80008009), X64(00000000, 8000000a),
	179	X64(00000000, 8000808b), X64(80000000, 0000008b),
	180	X64(80000000, 00008089), X64(80000000, 00008003),
	181	X64(80000000, 00008002), X64(80000000, 00000080),
	182	X64(00000000, 0000800a), X64(80000000, 8000000a),
	183	X64(80000000, 80008081), X64(80000000, 00008080),
	184	X64(00000000, 80000001), X64(80000000, 80008008)
	185	};
	186
	187	#endif
	188
	189	/----- Complementing or not ----------------------------------------------/
	190
	191	/* We should use the complemented representation if the target doesn't have a
	192	* fused and-not operation. There doesn't appear to be a principled way to
	193	* do this, so we'll just have to make do with a big list. Worse, in my
	194	* brief survey of the architecture reference manuals I have lying about,
	195	* they've split close to 50/50 on this question, so I don't have an
	196	* especially good way to pick a default. The `no-fused-op' architectures
	197	* seem generally a bit more modern than the `fused-op' architectures, so I
	198	* guess I'll make the complemented representation the default.
	199	*
	200	* and-not No and-not
	201	* ------- ----------
	202	* ARM (`bic') x86/amd64
	203	* Sparc (`andn') z/Architecture
	204	* MMIX (`andn') MIPS
	205	* IA64 (`andcm') 68k
	206	* VAX (`bic') RISC-V
	207	* PDP-10 (`andc')
	208	*/
	209	#if !(defined(__arm__) \|\| defined(__thumb__) \|\| defined(__aarch64__) \|\| \
	210	defined(_M_ARM) \|\| defined(_M_THUMB)) && \
	211	!(defined(__ia64__) \|\| defined(__ia64) \|\| defined(__itanium__) \|\| \
	212	defined(_M_IA64)) && \
	213	!defined(__mmix__) && \
	214	!(defined(__sparc__) \|\| defined(__sparc)) && \
	215	!defined(__vax__) && \
	216	!defined(__pdp10__)
	217	# define KECCAK_COMPL
	218	#endif
	219
	220	#ifdef KECCAK_COMPL
	221	/* A target without fused and/not (`bic', `andc2'). We complement some of
	222	* the lanes in the initial state and undo this on output. (Absorbing XORs
	223	* input into the state, so this is unaffected.) See the handling of chi in
	224	* `keccak1600_round' below for the details.
	225	*/
	226
	227	#define COMPL_MASK 0x00121106u
	228
	229	#define STATE_INIT(z) do { \
	230	lane cmpl = LANE_CMPL; \
	231	(z)->S[I(1, 0)] = cmpl; (z)->S[I(2, 0)] = cmpl; \
	232	(z)->S[I(3, 1)] = cmpl; (z)->S[I(2, 2)] = cmpl; \
	233	(z)->S[I(2, 3)] = cmpl; (z)->S[I(0, 4)] = cmpl; \
	234	} while (0)
	235
	236	#define STATE_OUT(z) do { \
	237	NOT_LANE((z)->S[I(1, 0)], (z)->S[I(1, 0)]); \
	238	NOT_LANE((z)->S[I(2, 0)], (z)->S[I(2, 0)]); \
	239	NOT_LANE((z)->S[I(3, 1)], (z)->S[I(3, 1)]); \
	240	NOT_LANE((z)->S[I(2, 2)], (z)->S[I(2, 2)]); \
	241	NOT_LANE((z)->S[I(2, 3)], (z)->S[I(2, 3)]); \
	242	NOT_LANE((z)->S[I(0, 4)], (z)->S[I(0, 4)]); \
	243	} while (0)
	244
	245	#else
	246	/* A target with fused and/not (`bic', `andc2'). Everything is simple. */
	247
	248	#define COMPL_MASK 0u
	249
	250	#define STATE_INIT(z) do ; while (0)
	251	#define STATE_OUT(z) do ; while (0)
	252
	253	#endif
	254
	255	/----- Other magic constants ---------------------------------------------/
	256
	257	/* The rotation constants. These are systematically named -- see `THETA_RHO'
	258	* below.
	259	*/
	260	#define ROT_0_0 0
	261	#define ROT_1_0 1
	262	#define ROT_2_0 62
	263	#define ROT_3_0 28
	264	#define ROT_4_0 27
	265
	266	#define ROT_0_1 36
	267	#define ROT_1_1 44
	268	#define ROT_2_1 6
	269	#define ROT_3_1 55
	270	#define ROT_4_1 20
	271
	272	#define ROT_0_2 3
	273	#define ROT_1_2 10
	274	#define ROT_2_2 43
	275	#define ROT_3_2 25
	276	#define ROT_4_2 39
	277
	278	#define ROT_0_3 41
	279	#define ROT_1_3 45
	280	#define ROT_2_3 15
	281	#define ROT_3_3 21
	282	#define ROT_4_3 8
	283
	284	#define ROT_0_4 18
	285	#define ROT_1_4 2
	286	#define ROT_2_4 61
	287	#define ROT_3_4 56
	288	#define ROT_4_4 14
	289
	290	/----- Debugging ---------------------------------------------------------/
	291
	292	#ifdef KECCAK_DEBUG
	293
	294	#include <stdio.h>
	295
	296	static void dump_state(const char *what, unsigned ir,
	297	const keccak1600_state *x)
	298	{
	299	unsigned i, j;
	300	keccak1600_state y;
	301	kludge64 a;
	302	int sep;
	303
	304	printf(";; %s [round %u]\n", what, ir);
	305	printf(";; raw state...\n");
	306	for (j = 0; j < 5; j++) {
	307	printf(";;");
	308	for (i = 0, sep = '\t'; i < 5; i++, sep = ' ')
	309	printf("%c" PRINTFMT_LANE, sep, PRINTARGS_LANE(x->S[I(i, j)]));
	310	fputc('\n', stdout);
	311	}
	312	y = *x; STATE_OUT(&y);
	313	#ifdef KECCAK_COMPL
	314	printf(";; uncomplemented state...\n");
	315	for (j = 0; j < 5; j++) {
	316	printf(";;");
	317	for (i = 0, sep = '\t'; i < 5; i++, sep = ' ')
	318	printf("%c" PRINTFMT_LANE, sep, PRINTARGS_LANE(y.S[I(i, j)]));
	319	fputc('\n', stdout);
	320	}
	321	#endif
	322	#ifdef KECCAK_I32
	323	printf(";; deinterlaced state...\n");
	324	for (j = 0; j < 5; j++) {
	325	printf(";;");
	326	for (i = 0, sep = '\t'; i < 5; i++, sep = ' ') {
	327	a = FROM_LANE(y.S[I(i, j)]);
	328	printf("%c%08lx%08lx", sep,
	329	(unsigned long)HI64(a), (unsigned long)LO64(a));
	330	}
	331	fputc('\n', stdout);
	332	}
	333	#endif
	334	fputc('\n', stdout);
	335	}
	336
	337	#endif
	338
	339	/----- The Keccak-p[1600, n] permutation ---------------------------------/
	340
	341	static void keccak1600_round(keccak1600_state *z,
	342	const keccak1600_state *x, unsigned i)
	343	{
	344	/* Perform a round of Keccak-p[1600, n]. Process the state X and write the
	345	* result to Z.
	346	*/
	347
	348	lane c[5], d[5], t;
	349
	350	/* Theta, first step: calculate the column parities. */
	351	#define COLPARITY(j) do { \
	352	d[j] = x->S[I(j, 0)]; \
	353	XOR_LANE(d[j], d[j], x->S[I(j, 1)]); \
	354	XOR_LANE(d[j], d[j], x->S[I(j, 2)]); \
	355	XOR_LANE(d[j], d[j], x->S[I(j, 3)]); \
	356	XOR_LANE(d[j], d[j], x->S[I(j, 4)]); \
	357	} while (0)
	358	COLPARITY(0); COLPARITY(1); COLPARITY(2); COLPARITY(3); COLPARITY(4);
	359	#undef COLPARITY
	360
	361	/* Theta, second step: calculate the combined effect. */
	362	ROTL_LANE(c[0], d[1], 1); XOR_LANE(c[0], c[0], d[4]);
	363	ROTL_LANE(c[1], d[2], 1); XOR_LANE(c[1], c[1], d[0]);
	364	ROTL_LANE(c[2], d[3], 1); XOR_LANE(c[2], c[2], d[1]);
	365	ROTL_LANE(c[3], d[4], 1); XOR_LANE(c[3], c[3], d[2]);
	366	ROTL_LANE(c[4], d[0], 1); XOR_LANE(c[4], c[4], d[3]);
	367
	368	/* Now we work plane by plane through the output. To do this, we must undo
	369	* the pi transposition. Pi maps (x', y') = (y, 2 x + 3 y), so y = x', and
	370	* x = (y' - 3 y)/2 = 3 (y' - 3 x') = x' + 3 y'.
	371	*/
	372	#define THETA_RHO(i0, i1, i2, i3, i4) do { \
	373	\
	374	/* First, theta. */ \
	375	XOR_LANE(d[0], x->S[I(i0, 0)], c[i0]); \
	376	XOR_LANE(d[1], x->S[I(i1, 1)], c[i1]); \
	377	XOR_LANE(d[2], x->S[I(i2, 2)], c[i2]); \
	378	XOR_LANE(d[3], x->S[I(i3, 3)], c[i3]); \
	379	XOR_LANE(d[4], x->S[I(i4, 4)], c[i4]); \
	380	\
	381	/* Then rho. */ \
	382	ROTL_LANE(d[0], d[0], ROT_##i0##_0); \
	383	ROTL_LANE(d[1], d[1], ROT_##i1##_1); \
	384	ROTL_LANE(d[2], d[2], ROT_##i2##_2); \
	385	ROTL_LANE(d[3], d[3], ROT_##i3##_3); \
	386	ROTL_LANE(d[4], d[4], ROT_##i4##_4); \
	387	} while (0)
	388
	389	/* The basic chi operation is: z = w ^ (~a&b), but this involves an
	390	* inversion which we can mostly avoid by being clever: observe that
	391	*
	392	* w ^ (~a&~~b) = w ^ ~(a \| ~b) = ~w ^ (a \| ~b)
	393	*
	394	* by De Morgan's law. Furthermore, complementing w or z is basically
	395	* equivalent. Bertoni, Daemen, Peeters, Van Assche, and Van Keer, `Keccak
	396	* implementation overview', describe a pattern of lane complementation
	397	* which propagates through theta and pi in exactly the right way to be
	398	* restored easily by chi, here, with exactly one inversion per plane.
	399	*
	400	* Here's the pattern.
	401	*
	402	* [ * . * * . ] [ . * * . . ]
	403	* [ * . * . . ] [ . . . * . ]
	404	* [ * . * . . ] -> [ . . * . . ]
	405	* [ . * . * * ] [ . . * . . ]
	406	* [ * . . * . ] [ * . . . . ]
	407	*
	408	* where a `.' means that the lane is unchanged, and a `*' means that it
	409	* has been complemented.
	410	*
	411	* The macros `CHI_wxy_z' calculate z in terms of w, x, y assuming that the
	412	* inputs w, x, y marked with a `1' are complemented on input, and arrange
	413	* for z to be complemented on output if z is so marked.
	414	*
	415	* The diagrams to the right show the fragment of the complementation
	416	* pattern being handled by the corresponding line of code. A symbol in
	417	* brackets indicates a deviation from the input pattern forced by explicit
	418	* complementation: there will be exactly one of these for each plane.
	419	*/
	420	#ifdef KECCAK_COMPL
	421	# define CHI_COMPL(z, x) NOT_LANE((z), (x))
	422	# define CHI_001_1(z, w, x, y) \
	423	(OR_LANE((z), (x), (y)), XOR_LANE((z), (z), (w)))
	424	# define CHI_010_0(z, w, x, y) \
	425	(AND_LANE((z), (x), (y)), XOR_LANE((z), (z), (w)))
	426	# define CHI_101_0 CHI_001_1
	427	# define CHI_110_1 CHI_010_0
	428	#else
	429	# define CHI(z, w, x, y) \
	430	(NOT_LANE((z), (x)), \
	431	AND_LANE((z), (z), (y)), \
	432	XOR_LANE((z), (z), (w)))
	433	# define CHI_COMPL(z, x) ((z) = (x))
	434	# define CHI_001_1 CHI
	435	# define CHI_010_0 CHI
	436	# define CHI_101_0 CHI
	437	# define CHI_110_1 CHI
	438	#endif
	439
	440	/* Let's do the y' = 0 plane first. Theta and rho are easy with our macro,
	441	* and we've done pi with the coordinate hacking. That leaves chi next.
	442	* This is hairy because we must worry about complementation.
	443	*/
	444	THETA_RHO(0, 1, 2, 3, 4);
	445	CHI_COMPL(t, d[2]); /* [.] */
	446	CHI_101_0(z->S[I(0, 0)], d[0], d[1], d[2]); /* * . * -> . */
	447	CHI_001_1(z->S[I(1, 0)], d[1], t, d[3]); /* . [.] * -> * */
	448	CHI_110_1(z->S[I(2, 0)], d[2], d[3], d[4]); /* * * . -> * */
	449	CHI_101_0(z->S[I(3, 0)], d[3], d[4], d[0]); /* * * . -> . */
	450	CHI_010_0(z->S[I(4, 0)], d[4], d[0], d[1]); /* * . . -> . */
	451
	452	/* We'd better do iota before we forget. */
	453	XOR_LANE(z->S[I(0, 0)], z->S[I(0, 0)], rcon[i]);
	454
	455	/* That was fun. Maybe y' = 1 will be as good. */
	456	THETA_RHO(3, 4, 0, 1, 2);
	457	CHI_COMPL(t, d[4]); /* [] /
	458	CHI_101_0(z->S[I(0, 1)], d[0], d[1], d[2]); /* * . * -> . */
	459	CHI_010_0(z->S[I(1, 1)], d[1], d[2], d[3]); /* . * . -> . */
	460	CHI_101_0(z->S[I(2, 1)], d[2], d[3], t); /* * . [] -> . /
	461	CHI_001_1(z->S[I(3, 1)], d[3], d[4], d[0]); /* * . . -> * */
	462	CHI_010_0(z->S[I(4, 1)], d[4], d[0], d[1]); /* * . . -> . */
	463
	464	/* We're getting the hang of this. The y' = 2 plane shouldn't be any
	465	* trouble.
	466	*/
	467	THETA_RHO(1, 2, 3, 4, 0);
	468	CHI_COMPL(t, d[3]); /* [] /
	469	CHI_101_0(z->S[I(0, 2)], d[0], d[1], d[2]); /* * . * -> . */
	470	CHI_010_0(z->S[I(1, 2)], d[1], d[2], d[3]); /* . * . -> . */
	471	CHI_110_1(z->S[I(2, 2)], d[2], t, d[4]); /* * [] . -> */
	472	CHI_101_0(z->S[I(3, 2)], t, d[4], d[0]); /* * [] . -> . /
	473	CHI_010_0(z->S[I(4, 2)], d[4], d[0], d[1]); /* * . . -> . */
	474
	475	/* This isn't as interesting any more. Let's do y' = 3 before boredom sets
	476	* in.
	477	*/
	478	THETA_RHO(4, 0, 1, 2, 3);
	479	CHI_COMPL(t, d[3]); /* [.] */
	480	CHI_010_0(z->S[I(0, 3)], d[0], d[1], d[2]); /* . * . -> . */
	481	CHI_101_0(z->S[I(1, 3)], d[1], d[2], d[3]); /* * . * -> . */
	482	CHI_001_1(z->S[I(2, 3)], d[2], t, d[4]); /* . [.] * -> * */
	483	CHI_010_0(z->S[I(3, 3)], t, d[4], d[0]); /* . [.] * -> . */
	484	CHI_101_0(z->S[I(4, 3)], d[4], d[0], d[1]); /* . * * -> . */
	485
	486	/* Last plane. Just y' = 4 to go. */
	487	THETA_RHO(2, 3, 4, 0, 1);
	488	CHI_COMPL(t, d[1]); /* [] /
	489	CHI_110_1(z->S[I(0, 4)], d[0], t, d[2]); /* * [] . -> */
	490	CHI_101_0(z->S[I(1, 4)], t, d[2], d[3]); /* [] . -> . */
	491	CHI_010_0(z->S[I(2, 4)], d[2], d[3], d[4]); /* . * . -> . */
	492	CHI_101_0(z->S[I(3, 4)], d[3], d[4], d[0]); /* * * . -> . */
	493	CHI_010_0(z->S[I(4, 4)], d[4], d[0], d[1]); /* * . . -> . */
	494
	495	/* And we're done. */
	496	#undef THETA_RHO
	497	#undef CHI_COMPL
	498	#undef CHI_001_1
	499	#undef CHI_010_0
	500	#undef CHI_101_0
	501	#undef CHI_110_1
	502	#undef CHI
	503	}
	504
	505	/* --- @keccak1600_p@ --- *
	506	*
	507	* Arguments: @keccak1600_state *z@ = where to write the output state
	508	* @conts keccak1600_state *x@ = input state
	509	* @unsigned n@ = number of rounds to perform
	510	*
	511	* Returns: ---
	512	*
	513	* Use: Implements the %$\Keccak[1600, n]$% permutation at the core
	514	* of Keccak and the SHA-3 standard.
	515	*/
	516
	517	void keccak1600_p(keccak1600_state z, const keccak1600_state x, unsigned n)
	518	{
	519	keccak1600_state u, v;
	520	unsigned i = 0;
	521
	522	#ifdef KECCAK_DEBUG
	523	dump_state("init", 0, x);
	524	#endif
	525	keccak1600_round(&u, x, i++); n--;
	526	while (n > 8) {
	527	keccak1600_round(&v, &u, i++);
	528	keccak1600_round(&u, &v, i++);
	529	keccak1600_round(&v, &u, i++);
	530	keccak1600_round(&u, &v, i++);
	531	keccak1600_round(&v, &u, i++);
	532	keccak1600_round(&u, &v, i++);
	533	keccak1600_round(&v, &u, i++);
	534	keccak1600_round(&u, &v, i++);
	535	n -= 8;
	536	}
	537	switch (n) {
	538	case 7: keccak1600_round(&v, &u, i++);
	539	keccak1600_round(&u, &v, i++);
	540	case 5: keccak1600_round(&v, &u, i++);
	541	keccak1600_round(&u, &v, i++);
	542	case 3: keccak1600_round(&v, &u, i++);
	543	keccak1600_round(&u, &v, i++);
	544	case 1: keccak1600_round( z, &u, i++);
	545	break;
	546	case 8: keccak1600_round(&v, &u, i++);
	547	keccak1600_round(&u, &v, i++);
	548	case 6: keccak1600_round(&v, &u, i++);
	549	keccak1600_round(&u, &v, i++);
	550	case 4: keccak1600_round(&v, &u, i++);
	551	keccak1600_round(&u, &v, i++);
	552	case 2: keccak1600_round(&v, &u, i++);
	553	keccak1600_round( z, &v, i++);
	554	break;
	555	}
	556	#ifdef KECCAK_DEBUG
	557	dump_state("final", 0, z);
	558	#endif
	559	}
	560
	561	/* --- @keccack1600_init@ --- *
	562	*
	563	* Arguments: @keccak1600_state *s@ = a state to initialize
	564	*
	565	* Returns: ---
	566	*
	567	* Use: Initialize @s@ to the root state.
	568	*/
	569
	570	void keccak1600_init(keccak1600_state *s)
	571	{ memset(s->S, 0, sizeof(s->S)); STATE_INIT(s); }
	572
	573	/* --- @keccak1600_mix@ --- *
	574	*
	575	* Arguments: @keccak1600_state *s@ = a state to update
	576	* @const kludge64 *p@ = pointer to 64-bit words to mix in
	577	* @size_t n@ = size of the input, in 64-bit words
	578	*
	579	* Returns: ---
	580	*
	581	* Use: Mixes data into a %$\Keccak[r, 1600 - r]$% state. Note that
	582	* it's the caller's responsibility to pass in no more than
	583	* %$r$% bits of data.
	584	*/
	585
	586	void keccak1600_mix(keccak1600_state s, const kludge64 p, size_t n)
	587	{
	588	unsigned i;
	589	lane a;
	590
	591	for (i = 0; i < n; i++)
	592	{ a = TO_LANE(p[i]); XOR_LANE(s->S[i], s->S[i], a); }
	593	}
	594
	595	/* --- @keccak1600_set@ --- *
	596	*
	597	* Arguments: @keccak1600_state *s@ = a state to update
	598	* @const kludge64 *p@ = pointer to 64-bit words to mix in
	599	* @size_t n@ = size of the input, in 64-bit words
	600	*
	601	* Returns: ---
	602	*
	603	* Use: Stores data into a %$\Keccak[r, 1600 - r]$% state. Note that
	604	* it's the caller's responsibility to pass in no more than
	605	* %$r$% bits of data.
	606	*
	607	* This is not the operation you wanted for ordinary hashing.
	608	* It's provided for the use of higher-level protocols which use
	609	* duplexing and other fancy sponge features.
	610	*/
	611
	612	void keccak1600_set(keccak1600_state s, const kludge64 p, size_t n)
	613	{
	614	uint32 m = COMPL_MASK;
	615	unsigned i;
	616	lane a;
	617
	618	for (i = 0; i < n; i++) {
	619	a = TO_LANE(p[i]); if (m&1) NOT_LANE(a, a);
	620	s->S[i] = a; m >>= 1;
	621	}
	622	}
	623
	624	/* --- @keccak1600_extract@ --- *
	625	*
	626	* Arguments: @const keccak1600_state *s@ = a state to extract output from
	627	* @kludge64 *p@ = pointer to 64-bit words to write
	628	* @size_t n@ = size of the output, in 64-bit words
	629	*
	630	* Returns: ---
	631	*
	632	* Use: Reads output from a %$\Keccak[r, 1600 - r]$% state. Note
	633	* that it's the caller's responsibility to extract no more than
	634	* %$r$% bits of data.
	635	*/
	636
	637	void keccak1600_extract(const keccak1600_state s, kludge64 p, size_t n)
	638	{
	639	uint32 m = COMPL_MASK;
	640	unsigned i;
	641	lane t;
	642
	643	for (i = 0; i < n; i++) {
	644	t = s->S[i]; if (m&1) NOT_LANE(t, t);
	645	*p++ = FROM_LANE(t); m >>= 1;
	646	}
	647	}
	648
	649	/----- Test rig ----------------------------------------------------------/
	650
	651	#ifdef TEST_RIG
	652
	653	#include <stdio.h>
	654
	655	#include <mLib/macros.h>
	656	#include <mLib/quis.h>
	657	#include <mLib/report.h>
	658	#include <mLib/testrig.h>
	659
	660	static int vrf_p(dstr v[])
	661	{
	662	keccak1600_state u;
	663	kludge64 t[25];
	664	dstr d = DSTR_INIT;
	665	int n;
	666	unsigned i;
	667	int ok = 1;
	668
	669	if (v[0].len != 200) die(1, "bad input size");
	670	if (v[2].len != 200) die(1, "bad output size");
	671	n = (int )v[1].buf;
	672	dstr_ensure(&d, 200); d.len = 200;
	673
	674	keccak1600_init(&u);
	675	for (i = 0; i < 25; i++) LOAD64_L_(t[i], v[0].buf + 8*i);
	676	keccak1600_mix(&u, t, 25);
	677	keccak1600_p(&u, &u, n);
	678	keccak1600_extract(&u, t, 25);
	679	for (i = 0; i < 25; i++) STORE64_L_(d.buf + 8*i, t[i]);
	680	if (MEMCMP(d.buf, !=, v[2].buf, 200)) {
	681	ok = 0;
	682	fprintf(stderr, "failed!");
	683	fprintf(stderr, "\n\t input = "); type_hex.dump(&v[0], stderr);
	684	fprintf(stderr, "\n\t rounds = %d", n);
	685	fprintf(stderr, "\n\t expected = "); type_hex.dump(&v[2], stderr);
	686	fprintf(stderr, "\n\t calclated = "); type_hex.dump(&d, stderr);
	687	}
	688
	689	dstr_destroy(&d);
	690	return (ok);
	691	}
	692
	693	static test_chunk defs[] = {
	694	{ "p", vrf_p, { &type_hex, &type_int, &type_hex } },
	695	{ 0, 0, { 0 } }
	696	};
	697
	698	int main(int argc, char *argv[])
	699	{
	700	test_run(argc, argv, defs, SRCDIR"/t/keccak1600");
	701	return (0);
	702	}
	703
	704	#endif
	705
	706	/----- That's all, folks -------------------------------------------------/