mdw@git.distorted.org.uk Git - secnet/blame_incremental

... / ...

Commit	Line	Data
/* -*-c-*-
	2	*
	3	* The Keccak-p[1600, n] permutation
	4	*
	5	* (c) 2017 Straylight/Edgeware
	6	*/
	7
/*----- Licensing notice --------------------------------------------------*
	9	*
	10	* This file is part of secnet.
	11	* See README for full list of copyright holders.
	12	*
	13	* secnet is free software; you can redistribute it and/or modify it
	14	* under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
	16	* (at your option) any later version.
	17	*
	18	* secnet is distributed in the hope that it will be useful, but
	19	* WITHOUT ANY WARRANTY; without even the implied warranty of
	20	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	21	* General Public License for more details.
	22	*
	23	* You should have received a copy of the GNU General Public License
	24	* version 3 along with secnet; if not, see
	25	* https://www.gnu.org/licenses/gpl.html.
	26	*
	27	* This file was originally part of Catacomb, but has been automatically
	28	* modified for incorporation into secnet: see `import-catacomb-crypto'
	29	* for details.
	30	*
	31	* Catacomb is free software; you can redistribute it and/or modify
	32	* it under the terms of the GNU Library General Public License as
	33	* published by the Free Software Foundation; either version 2 of the
	34	* License, or (at your option) any later version.
	35	*
	36	* Catacomb is distributed in the hope that it will be useful,
	37	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	38	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	39	* GNU Library General Public License for more details.
	40	*
	41	* You should have received a copy of the GNU Library General Public
	42	* License along with Catacomb; if not, write to the Free
	43	* Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	44	* MA 02111-1307, USA.
	45	*/
	46
/*----- Header files ------------------------------------------------------*/
	48
	49	#include <limits.h>
	50	#include <string.h>
	51
	52	#include "fake-mLib-bits.h"
	53
	54	#include "keccak1600.h"
	55
	56	/* #define KECCAK_DEBUG */
	57
/*----- Miscellaneous utilities -------------------------------------------*/
	59
	60	#define I(x, y) ((x) + 5(y)) / Column-major indexing */
	61
/*----- Interlacing or not ------------------------------------------------*/
	63
	64	/* We should prefer the interlaced representation if the target is really
	65	* 32-bit and only providing synthetic 64-bit integers. Alas, the Windows
	66	* 64-bit ABI specifies that `long' is only 32-bits (i.e., it is IL32/LLP64),
	67	* so detect x86 specifically.
	68	*/
	69	#if (ULONG_MAX >> 31) <= 0xffffffff && \
	70	!defined(__amd64__) && !defined(_M_AMD64)
	71	# define KECCAK_I32
	72	#endif
	73
	74	#ifdef KECCAK_I32
	75	/* A 32-bit target with at best weak support for 64-bit shifts. Maintain a
	76	* lane as two 32-bit pieces representing the even and odd bits of the lane.
	77	* There are slightly fiddly transformations to apply on the way in and out
	78	* of the main permutation.
	79	*/
	80
	81	typedef keccak1600_lane_i32 lane;
	82	#define S si32
	83
	84	static lane interlace(kludge64 x)
	85	{
	86	/* Given a 64-bit string X, return a lane Z containing the even- and
	87	* odd-numbered bits of X.
	88	*
	89	* This becomes more manageable if we look at what happens to the bit
	90	* indices: bit i of X becomes bit ROR_6(i, 1) of Z. We can effectively
	91	* swap two bits of the indices by swapping the object bits where those
	92	* index bits differ. Fortunately, this is fairly easy.
	93	*
	94	* We arrange to swap bits between the two halves of X, rather than within
	95	* a half.
	96	*/
	97
	98	uint32 x0 = LO64(x), x1 = HI64(x), t;
	99	lane z;
	100	/* 543210 */
	101	t = ((x0 >> 16) ^ x1)&0x0000ffff; x0 ^= t << 16; x1 ^= t; /* 453210 */
	102	t = ((x0 >> 8) ^ x1)&0x00ff00ff; x0 ^= t << 8; x1 ^= t; /* 354210 */
	103	t = ((x0 >> 4) ^ x1)&0x0f0f0f0f; x0 ^= t << 4; x1 ^= t; /* 254310 */
	104	t = ((x0 >> 2) ^ x1)&0x33333333; x0 ^= t << 2; x1 ^= t; /* 154320 */
	105	t = ((x0 >> 1) ^ x1)&0x55555555; x0 ^= t << 1; x1 ^= t; /* 054321 */
	106	z.even = x0; z.odd = x1; return (z);
	107	}
	108
	109	static kludge64 deinterlace(lane x)
	110	{
	111	/* Given a lane X, return the combined 64-bit value. This is the inverse
	112	* to `interlace' above, and the principle is the same
	113	*/
	114
	115	uint32 x0 = x.even, x1 = x.odd, t;
	116	kludge64 z;
	117	/* 054321 */
	118	t = ((x0 >> 1) ^ x1)&0x55555555; x0 ^= t << 1; x1 ^= t; /* 154320 */
	119	t = ((x0 >> 2) ^ x1)&0x33333333; x0 ^= t << 2; x1 ^= t; /* 254310 */
	120	t = ((x0 >> 4) ^ x1)&0x0f0f0f0f; x0 ^= t << 4; x1 ^= t; /* 354210 */
	121	t = ((x0 >> 8) ^ x1)&0x00ff00ff; x0 ^= t << 8; x1 ^= t; /* 453210 */
	122	t = ((x0 >> 16) ^ x1)&0x0000ffff; x0 ^= t << 16; x1 ^= t; /* 543210 */
	123	SET64(z, x1, x0); return (z);
	124	}
	125
	126	#define TO_LANE(x) (interlace(x))
	127	#define FROM_LANE(x) (deinterlace(x))
	128
	129	#define PRINTFMT_LANE "%08lx:%08lx"
	130	#define PRINTARGS_LANE(x) (unsigned long)(x).even, (unsigned long)(x).odd
	131
	132	#define BINOP_LANE(z, op, x, y) \
	133	((z).even = (x).even op (y).even, (z).odd = (x).odd op (y).odd)
	134	#define XOR_LANE(z, x, y) BINOP_LANE(z, ^, x, y)
	135	#define AND_LANE(z, x, y) BINOP_LANE(z, &, x, y)
	136	#define OR_LANE(z, x, y) BINOP_LANE(z, \|, x, y)
	137	#define NOT_LANE(z, x) ((z).even = ~(x).even, (z).odd = ~(x).odd)
	138
	139	#define ROTL_LANE(z, x, n) do { \
	140	lane _t = (x); \
	141	(z).even = (n)%2 ? ROL32(_t.odd, ((n) + 1)/2) \
	142	: ROL32(_t.even, (n)/2); \
	143	(z).odd = (n)%2 ? ROL32(_t.even, ((n) - 1)/2) \
	144	: ROL32(_t.odd, (n)/2); \
	145	} while (0)
	146
	147	#define LANE_ZERO { 0, 0 }
	148	#define LANE_CMPL { 0xffffffff, 0xffffffff }
	149
	150	static const lane rcon[24] = {
	151	{ 0x00000001, 0x00000000 }, { 0x00000000, 0x00000089 },
	152	{ 0x00000000, 0x8000008b }, { 0x00000000, 0x80008080 },
	153	{ 0x00000001, 0x0000008b }, { 0x00000001, 0x00008000 },
	154	{ 0x00000001, 0x80008088 }, { 0x00000001, 0x80000082 },
	155	{ 0x00000000, 0x0000000b }, { 0x00000000, 0x0000000a },
	156	{ 0x00000001, 0x00008082 }, { 0x00000000, 0x00008003 },
	157	{ 0x00000001, 0x0000808b }, { 0x00000001, 0x8000000b },
	158	{ 0x00000001, 0x8000008a }, { 0x00000001, 0x80000081 },
	159	{ 0x00000000, 0x80000081 }, { 0x00000000, 0x80000008 },
	160	{ 0x00000000, 0x00000083 }, { 0x00000000, 0x80008003 },
	161	{ 0x00000001, 0x80008088 }, { 0x00000000, 0x80000088 },
	162	{ 0x00000001, 0x00008000 }, { 0x00000000, 0x80008082 }
	163	};
	164
	165	#else
	166	/* A target with good support for 64-bit shifts. We store lanes as 64-bit
	167	* quantities and deal with them in the obvious, natural way.
	168	*/
	169
	170	typedef keccak1600_lane_64 lane;
	171	#define S s64
	172
	173	#define TO_LANE(x) (x)
	174	#define FROM_LANE(x) (x)
	175
	176	#define PRINTFMT_LANE "%08lx%08lx"
	177	#define PRINTARGS_LANE(x) (unsigned long)HI64(x), (unsigned long)LO64(x)
	178
	179	#define XOR_LANE(z, x, y) XOR64((z), (x), (y))
	180	#define AND_LANE(z, x, y) AND64((z), (x), (y))
	181	#define OR_LANE(z, x, y) OR64((z), (x), (y))
	182	#define NOT_LANE(z, x) CPL64((z), (x))
	183	#define ROTL_LANE(z, x, n) ROL64_((z), (x), (n))
	184
	185	#define LANE_ZERO X64( 0, 0)
	186	#define LANE_CMPL X64(ffffffff, ffffffff)
	187
	188	static const lane rcon[24] = {
	189	X64(00000000, 00000001), X64(00000000, 00008082),
	190	X64(80000000, 0000808a), X64(80000000, 80008000),
	191	X64(00000000, 0000808b), X64(00000000, 80000001),
	192	X64(80000000, 80008081), X64(80000000, 00008009),
	193	X64(00000000, 0000008a), X64(00000000, 00000088),
	194	X64(00000000, 80008009), X64(00000000, 8000000a),
	195	X64(00000000, 8000808b), X64(80000000, 0000008b),
	196	X64(80000000, 00008089), X64(80000000, 00008003),
	197	X64(80000000, 00008002), X64(80000000, 00000080),
	198	X64(00000000, 0000800a), X64(80000000, 8000000a),
	199	X64(80000000, 80008081), X64(80000000, 00008080),
	200	X64(00000000, 80000001), X64(80000000, 80008008)
	201	};
	202
	203	#endif
	204
/*----- Complementing or not ----------------------------------------------*/
	206
	207	/* We should use the complemented representation if the target doesn't have a
	208	* fused and-not operation. There doesn't appear to be a principled way to
	209	* do this, so we'll just have to make do with a big list. Worse, in my
	210	* brief survey of the architecture reference manuals I have lying about,
	211	* they've split close to 50/50 on this question, so I don't have an
	212	* especially good way to pick a default. The `no-fused-op' architectures
	213	* seem generally a bit more modern than the `fused-op' architectures, so I
	214	* guess I'll make the complemented representation the default.
	215	*
	216	* and-not No and-not
	217	* ------- ----------
	218	* ARM (`bic') x86/amd64
	219	* Sparc (`andn') z/Architecture
	220	* MMIX (`andn') MIPS
	221	* IA64 (`andcm') 68k
	222	* VAX (`bic') RISC-V
	223	* PDP-10 (`andc')
	224	*/
	225	#if !(defined(__arm__) \|\| defined(__thumb__) \|\| defined(__aarch64__) \|\| \
	226	defined(_M_ARM) \|\| defined(_M_THUMB)) && \
	227	!(defined(__ia64__) \|\| defined(__ia64) \|\| defined(__itanium__) \|\| \
	228	defined(_M_IA64)) && \
	229	!defined(__mmix__) && \
	230	!(defined(__sparc__) \|\| defined(__sparc)) && \
	231	!defined(__vax__) && \
	232	!defined(__pdp10__)
	233	# define KECCAK_COMPL
	234	#endif
	235
	236	#ifdef KECCAK_COMPL
	237	/* A target without fused and/not (`bic', `andc2'). We complement some of
	238	* the lanes in the initial state and undo this on output. (Absorbing XORs
	239	* input into the state, so this is unaffected.) See the handling of chi in
	240	* `keccak1600_round' below for the details.
	241	*/
	242
	243	#define STATE_INIT(z) do { \
	244	lane cmpl = LANE_CMPL; \
	245	(z)->S[I(1, 0)] = cmpl; (z)->S[I(2, 0)] = cmpl; \
	246	(z)->S[I(3, 1)] = cmpl; (z)->S[I(2, 2)] = cmpl; \
	247	(z)->S[I(2, 3)] = cmpl; (z)->S[I(0, 4)] = cmpl; \
	248	} while (0)
	249
	250	#define STATE_OUT(z) do { \
	251	NOT_LANE((z)->S[I(1, 0)], (z)->S[I(1, 0)]); \
	252	NOT_LANE((z)->S[I(2, 0)], (z)->S[I(2, 0)]); \
	253	NOT_LANE((z)->S[I(3, 1)], (z)->S[I(3, 1)]); \
	254	NOT_LANE((z)->S[I(2, 2)], (z)->S[I(2, 2)]); \
	255	NOT_LANE((z)->S[I(2, 3)], (z)->S[I(2, 3)]); \
	256	NOT_LANE((z)->S[I(0, 4)], (z)->S[I(0, 4)]); \
	257	} while (0)
	258
	259	#else
	260	/* A target with fused and/not (`bic', `andc2'). Everything is simple. */
	261
	262	#define STATE_INIT(z) do {} while (0)
	263	#define STATE_OUT(z) do {} while (0)
	264
	265	#endif
	266
/*----- Other magic constants ---------------------------------------------*/
	268
	269	/* The rotation constants. These are systematically named -- see `THETA_RHO'
	270	* below.
	271	*/
	272	#define ROT_0_0 0
	273	#define ROT_1_0 1
	274	#define ROT_2_0 62
	275	#define ROT_3_0 28
	276	#define ROT_4_0 27
	277
	278	#define ROT_0_1 36
	279	#define ROT_1_1 44
	280	#define ROT_2_1 6
	281	#define ROT_3_1 55
	282	#define ROT_4_1 20
	283
	284	#define ROT_0_2 3
	285	#define ROT_1_2 10
	286	#define ROT_2_2 43
	287	#define ROT_3_2 25
	288	#define ROT_4_2 39
	289
	290	#define ROT_0_3 41
	291	#define ROT_1_3 45
	292	#define ROT_2_3 15
	293	#define ROT_3_3 21
	294	#define ROT_4_3 8
	295
	296	#define ROT_0_4 18
	297	#define ROT_1_4 2
	298	#define ROT_2_4 61
	299	#define ROT_3_4 56
	300	#define ROT_4_4 14
	301
/*----- Debugging ---------------------------------------------------------*/
	303
	304	#ifdef KECCAK_DEBUG
	305
	306	#include <stdio.h>
	307
	308	static void dump_state(const char *what, unsigned ir,
	309	const keccak1600_state *x)
	310	{
	311	unsigned i, j;
	312	keccak1600_state y;
	313	kludge64 a;
	314	int sep;
	315
	316	printf(";; %s [round %u]\n", what, ir);
	317	printf(";; raw state...\n");
	318	for (j = 0; j < 5; j++) {
	319	printf(";;");
	320	for (i = 0, sep = '\t'; i < 5; i++, sep = ' ')
	321	printf("%c" PRINTFMT_LANE, sep, PRINTARGS_LANE(x->S[I(i, j)]));
	322	fputc('\n', stdout);
	323	}
	324	y = *x; STATE_OUT(&y);
	325	#ifdef KECCAK_COMPL
	326	printf(";; uncomplemented state...\n");
	327	for (j = 0; j < 5; j++) {
	328	printf(";;");
	329	for (i = 0, sep = '\t'; i < 5; i++, sep = ' ')
	330	printf("%c" PRINTFMT_LANE, sep, PRINTARGS_LANE(y.S[I(i, j)]));
	331	fputc('\n', stdout);
	332	}
	333	#endif
	334	#ifdef KECCAK_I32
	335	printf(";; deinterlaced state...\n");
	336	for (j = 0; j < 5; j++) {
	337	printf(";;");
	338	for (i = 0, sep = '\t'; i < 5; i++, sep = ' ') {
	339	a = FROM_LANE(y.S[I(i, j)]);
	340	printf("%c%08lx%08lx", sep,
	341	(unsigned long)HI64(a), (unsigned long)LO64(a));
	342	}
	343	fputc('\n', stdout);
	344	}
	345	#endif
	346	fputc('\n', stdout);
	347	}
	348
	349	#endif
	350
/*----- The Keccak-p[1600, n] permutation ---------------------------------*/
	352
	353	static void keccak1600_round(keccak1600_state *z,
	354	const keccak1600_state *x, unsigned i)
	355	{
	356	/* Perform a round of Keccak-p[1600, n]. Process the state X and write the
	357	* result to Z.
	358	*/
	359
	360	lane c[5], d[5], t;
	361
	362	/* Theta, first step: calculate the column parities. */
	363	#define COLPARITY(j) do { \
	364	d[j] = x->S[I(j, 0)]; \
	365	XOR_LANE(d[j], d[j], x->S[I(j, 1)]); \
	366	XOR_LANE(d[j], d[j], x->S[I(j, 2)]); \
	367	XOR_LANE(d[j], d[j], x->S[I(j, 3)]); \
	368	XOR_LANE(d[j], d[j], x->S[I(j, 4)]); \
	369	} while (0)
	370	COLPARITY(0); COLPARITY(1); COLPARITY(2); COLPARITY(3); COLPARITY(4);
	371	#undef COLPARITY
	372
	373	/* Theta, second step: calculate the combined effect. */
	374	ROTL_LANE(c[0], d[1], 1); XOR_LANE(c[0], c[0], d[4]);
	375	ROTL_LANE(c[1], d[2], 1); XOR_LANE(c[1], c[1], d[0]);
	376	ROTL_LANE(c[2], d[3], 1); XOR_LANE(c[2], c[2], d[1]);
	377	ROTL_LANE(c[3], d[4], 1); XOR_LANE(c[3], c[3], d[2]);
	378	ROTL_LANE(c[4], d[0], 1); XOR_LANE(c[4], c[4], d[3]);
	379
	380	/* Now we work plane by plane through the output. To do this, we must undo
	381	* the pi transposition. Pi maps (x', y') = (y, 2 x + 3 y), so y = x', and
	382	* x = (y' - 3 y)/2 = 3 (y' - 3 x') = x' + 3 y'.
	383	*/
	384	#define THETA_RHO(i0, i1, i2, i3, i4) do { \
	385	\
	386	/* First, theta. */ \
	387	XOR_LANE(d[0], x->S[I(i0, 0)], c[i0]); \
	388	XOR_LANE(d[1], x->S[I(i1, 1)], c[i1]); \
	389	XOR_LANE(d[2], x->S[I(i2, 2)], c[i2]); \
	390	XOR_LANE(d[3], x->S[I(i3, 3)], c[i3]); \
	391	XOR_LANE(d[4], x->S[I(i4, 4)], c[i4]); \
	392	\
	393	/* Then rho. */ \
	394	ROTL_LANE(d[0], d[0], ROT_##i0##_0); \
	395	ROTL_LANE(d[1], d[1], ROT_##i1##_1); \
	396	ROTL_LANE(d[2], d[2], ROT_##i2##_2); \
	397	ROTL_LANE(d[3], d[3], ROT_##i3##_3); \
	398	ROTL_LANE(d[4], d[4], ROT_##i4##_4); \
	399	} while (0)
	400
	401	/* The basic chi operation is: z = w ^ (~a&b), but this involves an
	402	* inversion which we can mostly avoid by being clever: observe that
	403	*
	404	* w ^ (~a&~~b) = w ^ ~(a \| ~b) = ~w ^ (a \| ~b)
	405	*
	406	* by De Morgan's law. Furthermore, complementing w or z is basically
	407	* equivalent. Bertoni, Daemen, Peeters, Van Assche, and Van Keer, `Keccak
	408	* implementation overview', describe a pattern of lane complementation
	409	* which propagates through theta and pi in exactly the right way to be
	410	* restored easily by chi, here, with exactly one inversion per plane.
	411	*
	412	* Here's the pattern.
	413	*
	414	* [ * . * * . ] [ . * * . . ]
	415	* [ * . * . . ] [ . . . * . ]
	416	* [ * . * . . ] -> [ . . * . . ]
	417	* [ . * . * * ] [ . . * . . ]
	418	* [ * . . * . ] [ * . . . . ]
	419	*
	420	* where a `.' means that the lane is unchanged, and a `*' means that it
	421	* has been complemented.
	422	*
	423	* The macros `CHI_wxy_z' calculate z in terms of w, x, y assuming that the
	424	* inputs w, x, y marked with a `1' are complemented on input, and arrange
	425	* for z to be complemented on output if z is so marked.
	426	*
	427	* The diagrams to the right show the fragment of the complementation
	428	* pattern being handled by the corresponding line of code. A symbol in
	429	* brackets indicates a deviation from the input pattern forced by explicit
	430	* complementation: there will be exactly one of these for each plane.
	431	*/
	432	#ifdef KECCAK_COMPL
	433	# define CHI_COMPL(z, x) NOT_LANE((z), (x))
	434	# define CHI_001_1(z, w, x, y) \
	435	(OR_LANE((z), (x), (y)), XOR_LANE((z), (z), (w)))
	436	# define CHI_010_0(z, w, x, y) \
	437	(AND_LANE((z), (x), (y)), XOR_LANE((z), (z), (w)))
	438	# define CHI_101_0 CHI_001_1
	439	# define CHI_110_1 CHI_010_0
	440	#else
	441	# define CHI(z, w, x, y) \
	442	(NOT_LANE((z), (x)), \
	443	AND_LANE((z), (z), (y)), \
	444	XOR_LANE((z), (z), (w)))
	445	# define CHI_COMPL(z, x) ((z) = (x))
	446	# define CHI_001_1 CHI
	447	# define CHI_010_0 CHI
	448	# define CHI_101_0 CHI
	449	# define CHI_110_1 CHI
	450	#endif
	451
	452	/* Let's do the y' = 0 plane first. Theta and rho are easy with our macro,
	453	* and we've done pi with the coordinate hacking. That leaves chi next.
	454	* This is hairy because we must worry about complementation.
	455	*/
	456	THETA_RHO(0, 1, 2, 3, 4);
	457	CHI_COMPL(t, d[2]); /* [.] */
	458	CHI_101_0(z->S[I(0, 0)], d[0], d[1], d[2]); /* * . * -> . */
	459	CHI_001_1(z->S[I(1, 0)], d[1], t, d[3]); /* . [.] * -> * */
	460	CHI_110_1(z->S[I(2, 0)], d[2], d[3], d[4]); /* * * . -> * */
	461	CHI_101_0(z->S[I(3, 0)], d[3], d[4], d[0]); /* * * . -> . */
	462	CHI_010_0(z->S[I(4, 0)], d[4], d[0], d[1]); /* * . . -> . */
	463
	464	/* We'd better do iota before we forget. */
	465	XOR_LANE(z->S[I(0, 0)], z->S[I(0, 0)], rcon[i]);
	466
	467	/* That was fun. Maybe y' = 1 will be as good. */
	468	THETA_RHO(3, 4, 0, 1, 2);
	469	CHI_COMPL(t, d[4]); /* [] /
	470	CHI_101_0(z->S[I(0, 1)], d[0], d[1], d[2]); /* * . * -> . */
	471	CHI_010_0(z->S[I(1, 1)], d[1], d[2], d[3]); /* . * . -> . */
	472	CHI_101_0(z->S[I(2, 1)], d[2], d[3], t); /* * . [] -> . /
	473	CHI_001_1(z->S[I(3, 1)], d[3], d[4], d[0]); /* * . . -> * */
	474	CHI_010_0(z->S[I(4, 1)], d[4], d[0], d[1]); /* * . . -> . */
	475
	476	/* We're getting the hang of this. The y' = 2 plane shouldn't be any
	477	* trouble.
	478	*/
	479	THETA_RHO(1, 2, 3, 4, 0);
	480	CHI_COMPL(t, d[3]); /* [] /
	481	CHI_101_0(z->S[I(0, 2)], d[0], d[1], d[2]); /* * . * -> . */
	482	CHI_010_0(z->S[I(1, 2)], d[1], d[2], d[3]); /* . * . -> . */
	483	CHI_110_1(z->S[I(2, 2)], d[2], t, d[4]); /* * [] . -> */
	484	CHI_101_0(z->S[I(3, 2)], t, d[4], d[0]); /* * [] . -> . /
	485	CHI_010_0(z->S[I(4, 2)], d[4], d[0], d[1]); /* * . . -> . */
	486
	487	/* This isn't as interesting any more. Let's do y' = 3 before boredom sets
	488	* in.
	489	*/
	490	THETA_RHO(4, 0, 1, 2, 3);
	491	CHI_COMPL(t, d[3]); /* [.] */
	492	CHI_010_0(z->S[I(0, 3)], d[0], d[1], d[2]); /* . * . -> . */
	493	CHI_101_0(z->S[I(1, 3)], d[1], d[2], d[3]); /* * . * -> . */
	494	CHI_001_1(z->S[I(2, 3)], d[2], t, d[4]); /* . [.] * -> * */
	495	CHI_010_0(z->S[I(3, 3)], t, d[4], d[0]); /* . [.] * -> . */
	496	CHI_101_0(z->S[I(4, 3)], d[4], d[0], d[1]); /* . * * -> . */
	497
	498	/* Last plane. Just y' = 4 to go. */
	499	THETA_RHO(2, 3, 4, 0, 1);
	500	CHI_COMPL(t, d[1]); /* [] /
	501	CHI_110_1(z->S[I(0, 4)], d[0], t, d[2]); /* * [] . -> */
	502	CHI_101_0(z->S[I(1, 4)], t, d[2], d[3]); /* [] . -> . */
	503	CHI_010_0(z->S[I(2, 4)], d[2], d[3], d[4]); /* . * . -> . */
	504	CHI_101_0(z->S[I(3, 4)], d[3], d[4], d[0]); /* * * . -> . */
	505	CHI_010_0(z->S[I(4, 4)], d[4], d[0], d[1]); /* * . . -> . */
	506
	507	/* And we're done. */
	508	#undef THETA_RHO
	509	#undef CHI_COMPL
	510	#undef CHI_001_1
	511	#undef CHI_010_0
	512	#undef CHI_101_0
	513	#undef CHI_110_1
	514	#undef CHI
	515	}
	516
	517	/* --- @keccak1600_p@ --- *
	518	*
	519	* Arguments: @keccak1600_state *z@ = where to write the output state
	520	* @conts keccak1600_state *x@ = input state
	521	* @unsigned n@ = number of rounds to perform
	522	*
	523	* Returns: ---
	524	*
	525	* Use: Implements the %$\Keccak[1600, n]$% permutation at the core
	526	* of Keccak and the SHA-3 standard.
	527	*/
	528
	529	void keccak1600_p(keccak1600_state z, const keccak1600_state x, unsigned n)
	530	{
	531	keccak1600_state u, v;
	532	unsigned i = 0;
	533
	534	#ifdef KECCAK_DEBUG
	535	dump_state("init", 0, x);
	536	#endif
	537	keccak1600_round(&u, x, i++); n--;
	538	while (n > 8) {
	539	keccak1600_round(&v, &u, i++);
	540	keccak1600_round(&u, &v, i++);
	541	keccak1600_round(&v, &u, i++);
	542	keccak1600_round(&u, &v, i++);
	543	keccak1600_round(&v, &u, i++);
	544	keccak1600_round(&u, &v, i++);
	545	keccak1600_round(&v, &u, i++);
	546	keccak1600_round(&u, &v, i++);
	547	n -= 8;
	548	}
	549	switch (n) {
	550	case 7: keccak1600_round(&v, &u, i++);
	551	keccak1600_round(&u, &v, i++);
	552	case 5: keccak1600_round(&v, &u, i++);
	553	keccak1600_round(&u, &v, i++);
	554	case 3: keccak1600_round(&v, &u, i++);
	555	keccak1600_round(&u, &v, i++);
	556	case 1: keccak1600_round( z, &u, i++);
	557	break;
	558	case 8: keccak1600_round(&v, &u, i++);
	559	keccak1600_round(&u, &v, i++);
	560	case 6: keccak1600_round(&v, &u, i++);
	561	keccak1600_round(&u, &v, i++);
	562	case 4: keccak1600_round(&v, &u, i++);
	563	keccak1600_round(&u, &v, i++);
	564	case 2: keccak1600_round(&v, &u, i++);
	565	keccak1600_round( z, &v, i++);
	566	break;
	567	}
	568	#ifdef KECCAK_DEBUG
	569	dump_state("final", 0, z);
	570	#endif
	571	}
	572
	573	/* --- @keccack1600_init@ --- *
	574	*
	575	* Arguments: @keccak1600_state *s@ = a state to initialize
	576	*
	577	* Returns: ---
	578	*
	579	* Use: Initialize @s@ to the root state.
	580	*/
	581
	582	void keccak1600_init(keccak1600_state *s)
	583	{ memset(s->S, 0, sizeof(s->S)); STATE_INIT(s); }
	584
	585	/* --- @keccak1600_mix@ --- *
	586	*
	587	* Arguments: @keccak1600_state *s@ = a state to update
	588	* @const kludge64 *p@ = pointer to 64-bit words to mix in
	589	* @size_t n@ = size of the input, in 64-bit words
	590	*
	591	* Returns: ---
	592	*
	593	* Use: Mixes data into a %$\Keccak[r, 1600 - r]$% state. Note that
	594	* it's the caller's responsibility to pass in no more than
	595	* %$r$% bits of data.
	596	*/
	597
	598	void keccak1600_mix(keccak1600_state s, const kludge64 p, size_t n)
	599	{
	600	unsigned i;
	601	lane a;
	602
	603	for (i = 0; i < n; i++)
	604	{ a = TO_LANE(p[i]); XOR_LANE(s->S[i], s->S[i], a); }
	605	}
	606
	607	/* --- @keccak1600_extract@ --- *
	608	*
	609	* Arguments: @const keccak1600_state *s@ = a state to extract output from
	610	* @kludge64 *p@ = pointer to 64-bit words to write
	611	* @size_t n@ = size of the output, in 64-bit words
	612	*
	613	* Returns: ---
	614	*
	615	* Use: Reads output from a %$\Keccak[r, 1600 - r]$% state. Note
	616	* that it's the caller's responsibility to extract no more than
	617	* %$r$% bits of data.
	618	*/
	619
	620	void keccak1600_extract(const keccak1600_state s, kludge64 p, size_t n)
	621	{
	622	unsigned i;
	623	keccak1600_state t;
	624
	625	t = *s; STATE_OUT(&t);
	626	for (i = 0; i < n; i++) p[i] = FROM_LANE(t.S[i]);
	627	}
	628
/*----- That's all, folks -------------------------------------------------*/