[catacomb] / symm / salsa20-core.h

/* -*-c-*-
 *
 * Salsa20 core definitions
 *
 * (c) 2015 Straylight/Edgeware
 */

#ifndef CATACOMB_SALSA20_CORE_H
#define CATACOMB_SALSA20_CORE_H

#ifdef __cplusplus
  extern "C" {
#endif

/*----- Header files ------------------------------------------------------*/

#include <mLib/bits.h>
#include <mLib/macros.h>

#ifndef CATACOMB_SALSA20_H
#  include "salsa20.h"
#endif

/*----- Magic constants ---------------------------------------------------*/

/* The magic Salsa20 constants, for 256-bit keys.  Each word packs four ASCII
 * characters, least significant byte first (as the per-line comments show);
 * read in the order A, B, C, D they spell `expand 32-byte k'.
 */
#define SALSA20_A256 0x61707865		/* e x p a */
#define SALSA20_B256 0x3320646e		/* n d   3 */
#define SALSA20_C256 0x79622d32		/* 2 - b y */
#define SALSA20_D256 0x6b206574		/* t e   k */

/* The magic Salsa20 constants for 128-bit keys, spelling `expand 16-byte k'
 * (four ASCII characters per word, least significant byte first).  Only the
 * B and C words differ from the 256-bit set, so A and D are plain aliases.
 */
#define SALSA20_A128 SALSA20_A256	/* e x p a */
#define SALSA20_B128 0x3120646e		/* n d   1 */
#define SALSA20_C128 0x79622d36		/* 6 - b y */
#define SALSA20_D128 SALSA20_D256	/* t e   k */

/* The magic Salsa20 constants for 80-bit keys, for completeness's sake,
 * spelling `expand 10-byte k' (four ASCII characters per word, least
 * significant byte first).  Only the C word differs from the 128-bit set;
 * the others are aliases.
 */
#define SALSA20_A80 SALSA20_A128	/* e x p a */
#define SALSA20_B80 SALSA20_B128	/* n d   1 */
#define SALSA20_C80 0x79622d30		/* 0 - b y */
#define SALSA20_D80 SALSA20_D128	/* t e   k */

/*----- The Salsa20 core function -----------------------------------------*/

/* It makes life somewhat easier if we don't actually store and maintain the
 * input matrix in the textbook order.  Instead, we rotate the columns other
 * than the leftmost one upwards, so that the constants which were originally
 * along the diagonal end up on the top row.  We'll need to undo this
 * permutation on output, but that's not too terrible an imposition.
 *
 * The permutation we're applying to the matrix elements is this:
 *
 * [  0  1  2  3 ]	 [  0  5 10 15 ]
 * [  4  5  6  7 ]  -->  [  4  9 14  3 ]
 * [  8  9 10 11 ]	 [  8 13  2  7 ]
 * [ 12 13 14 15 ]	 [ 12  1  6 11 ]
 *
 * and as a result, we need to apply this inverse permutation to figure out
 * which indices to use in the row-round half of the double-round and
 * elsewhere.
 *
 * [  0 13 10  7 ]
 * [  4  1 14 11 ]
 * [  8  5  2 15 ]
 * [ 12  9  6  3 ]
 */

/* The Salsa20 quarter-round.  Read from the matrix @y@ at indices @a@, @b@,
 * @c@, and @d@; and write to the corresponding elements of @z@.
 *
 * Taking the indices as a cycle @a@, @b@, @c@, @d@ (wrapping round), each
 * step XORs into one word the sum of the two words before it in the cycle,
 * rotated left by 7, 9, 13 and 18 bits in turn.  Every step after the first
 * uses words just written to @z@, so the statements must stay in this order.
 *
 * @z@ and @y@ may be the same matrix: each element of @y@ is read for the
 * last time no later than the statement which writes the matching element
 * of @z@.  The four indices are expected to be distinct.
 *
 * The arguments are expanded several times each, so they must be free of
 * side effects.  @ROL32@ is not defined in this header; presumably it comes
 * from <mLib/bits.h>.
 */
#define SALSA20_QR(z, y, a, b, c, d) do {				\
  (z)[b] = (y)[b] ^ ROL32((y)[a] + (y)[d],  7);				\
  (z)[c] = (y)[c] ^ ROL32((z)[b] + (y)[a],  9);				\
  (z)[d] = (y)[d] ^ ROL32((z)[c] + (z)[b], 13);				\
  (z)[a] = (y)[a] ^ ROL32((z)[d] + (z)[c], 18);				\
} while (0)

/* The Salsa20 double-round.  Read from matrix @y@, writing the result to
 * @z@.
 *
 * The first four quarter-rounds read @y@ and each work on one column of the
 * stored 4x4 matrix (indices %$i$%, %$i + 4$%, %$i + 8$%, %$i + 12$%);
 * between them they write every element of @z@.  The last four then update
 * @z@ in place.  Their index sets look scrambled because the matrix is kept
 * with its columns rotated, as described at the top of this section.
 *
 * @z@ and @y@ may be the same matrix.
 */
#define SALSA20_DR(z, y) do {						\
  SALSA20_QR(z, y,  0,  4,  8, 12);					\
  SALSA20_QR(z, y,  1,  5,  9, 13);					\
  SALSA20_QR(z, y,  2,  6, 10, 14);					\
  SALSA20_QR(z, y,  3,  7, 11, 15);					\
  SALSA20_QR(z, z,  0, 13, 10,  7);					\
  SALSA20_QR(z, z,  1, 14, 11,  4);					\
  SALSA20_QR(z, z,  2, 15,  8,  5);					\
  SALSA20_QR(z, z,  3, 12,  9,  6);					\
} while (0)

/* The Salsa20 feedforward step, used at the end of the core function.  Here,
 * @y@ contains the original input matrix; @z@ contains the final one, and is
 * updated.  The output is rendered in canonical order, ready for output.
 *
 * Every output word is the sum of a word of @z@ and the word of @y@ at the
 * same index.  The leftmost column (indices 0, 4, 8, 12) is stored in place.
 * The other columns are rotated back into textbook order as the sums are
 * formed: indices 1, 5, 9, 13 and 3, 15, 11, 7 each form a four-element
 * cycle, while 2/10 and 6/14 are simple swaps.  In each cycle, @_t@ holds
 * the one sum whose destination has not been read yet.
 *
 * @_t@ must have the same unsigned type as the matrix elements: a plain
 * @int@ cannot portably hold a sum greater than @INT_MAX@.
 *
 * @z@ and @y@ are each evaluated once.
 */
#define SALSA20_FFWD(z, y) do {						\
  const uint32 *_y = (y);						\
  uint32 *_z = (z);							\
  uint32 _t;								\
  _z[ 0] = _z[ 0] + _y[ 0]; _z[ 4] = _z[ 4] + _y[ 4];			\
  _z[ 8] = _z[ 8] + _y[ 8]; _z[12] = _z[12] + _y[12];			\
      _t = _z[ 1] + _y[ 1]; _z[ 1] = _z[13] + _y[13];			\
  _z[13] = _z[ 9] + _y[ 9]; _z[ 9] = _z[ 5] + _y[ 5]; _z[ 5] = _t;	\
      _t = _z[ 2] + _y[ 2]; _z[ 2] = _z[10] + _y[10]; _z[10] = _t;	\
      _t = _z[ 6] + _y[ 6]; _z[ 6] = _z[14] + _y[14]; _z[14] = _t;	\
      _t = _z[ 3] + _y[ 3]; _z[ 3] = _z[ 7] + _y[ 7];			\
  _z[ 7] = _z[11] + _y[11]; _z[11] = _z[15] + _y[15]; _z[15] = _t;	\
} while (0)

/* Fixed numbers of rounds, fully unrolled.  Each macro reads the input
 * matrix @y@ only in its first double-round and leaves the result in @z@;
 * every later double-round updates @z@ in place.  A double-round counts as
 * two rounds, so @SALSA20_4R@ is two of them, and the larger counts are
 * built by chaining the smaller ones.
 */
#define SALSA20_4R(z, y) do {						\
  SALSA20_DR(z, y);							\
  SALSA20_DR(z, z);							\
} while (0)
#define SALSA20_8R(z, y) do {						\
  SALSA20_4R(z, y);							\
  SALSA20_4R(z, z);							\
} while (0)
#define SALSA20_12R(z, y) do {						\
  SALSA20_4R(z, y);							\
  SALSA20_8R(z, z);							\
} while (0)
#define SALSA20_20R(z, y) do {						\
  SALSA20_8R(z, y);							\
  SALSA20_12R(z, z);							\
} while (0)

/* Apply @n@ rounds using a loop of double-rounds rather than unrolled code.
 * @n@ must be even.  The first double-round reads @y@ and writes @z@; the
 * remaining %$n/2 - 1$% update @z@ in place.  One double-round is always
 * performed, however small @n@ is.
 *
 * (The rolled form seemed to be faster than the unrolled macros, probably
 * because it fits in cache better.)
 */
#define SALSA20_nR(z, y, n) do {					\
  int _i = 0;								\
									\
  SALSA20_DR(z, y);							\
  while (_i < (n)/2 - 1) { SALSA20_DR(z, z); _i++; }			\
} while (0)

/* Step the counter in the Salsa20 state matrix @a@.  The low word is
 * @a[8]@, which is incremented and passed through @U32@ (not defined here;
 * presumably it truncates to 32 bits).  If the low word has become zero,
 * the carry is added to the high word, @a[5]@.
 */
#define SALSA20_STEP(a) do {						\
  (a)[8] = U32((a)[8] + 1);						\
  if (!(a)[8]) (a)[5]++;						\
} while (0)

/*----- Buffering and output ----------------------------------------------*
 *
 * These macros are also used by ChaCha.
 */

/* Write all sixteen words of the Salsa20 matrix @a@, in index order, to the
 * output buffer at @d@ using @STORE32_L@, four bytes per word.  @d@ must be
 * a modifiable pointer: it is left pointing just past the 64 bytes written.
 */
#define SALSA20_GENFULL(a, d) do {					\
  int _i;								\
									\
  for (_i = 0; _i < 16; _i++, (d) += 4) STORE32_L((d), (a)[_i]);	\
} while (0)

/* XOR a full block of input at @s@ with the Salsa20 matrix @a@, writing the
 * result to @d@.  The input is read a word at a time with @LOAD32_L@,
 * combined with the matching word of @a@, and written with @STORE32_L@.
 * Both @s@ and @d@ must be modifiable pointers: each is advanced past the 64
 * bytes processed.  Each input word is fetched before the matching output
 * word is stored.
 */
#define SALSA20_MIXFULL(a, d, s) do {					\
  int _i = 0;								\
  uint32 _x;								\
									\
  while (_i < 16) {							\
    _x = LOAD32_L(s) ^ (a)[_i]; (s) += 4;				\
    STORE32_L((d), _x); (d) += 4;					\
    _i++;								\
  }									\
} while (0)

/* Fill the buffer @b@ of the context @ctx@ from the matrix @a@, ready for
 * output to be handed out in pieces smaller than a block.  Word @i@ of the
 * matrix is written with @STORE32_L@ at byte offset %$4 i$%, and the
 * context's read offset @off@ is reset to zero.
 */
#define SALSA20_PREPBUF(ctx, a) do {					\
  int _i;								\
									\
  for (_i = 0; _i != 16; ++_i) {					\
    STORE32_L((ctx)->b + 4*_i, (a)[_i]);				\
  }									\
  (ctx)->off = 0;							\
} while (0)

/* Consume @n@ bytes of buffered output from the context @ctx@, starting at
 * its current offset; the offset always advances by @n@.
 *
 *   * If @d@ is null, nothing is written and @s@ is left untouched.
 *   * If @d@ is not null but @s@ is, the buffered bytes are copied to @d@.
 *   * Otherwise each input byte from @s@ is XORed with a buffered byte and
 *     the result is written to @d@.
 *
 * @d@ and @s@ must be modifiable pointers; whichever of them was used is
 * advanced past the bytes processed.  @n@ is evaluated once and is not
 * modified.  No check is made here that @n@ bytes remain in the buffer:
 * that is left to the caller.
 */
#define SALSA20_OUTBUF(ctx, d, s, n) do {				\
  const octet *_p = (ctx)->b + (ctx)->off;				\
  size_t _n = (n);							\
									\
  (ctx)->off += _n;							\
  if (d) {								\
    if (s) { for (; _n; _n--) *(d)++ = *(s)++ ^ *_p++; }		\
    else { memcpy((d), _p, _n); (d) += _n; }				\
  }									\
} while (0)

/*----- Variants and naming -----------------------------------------------*/

/* Common numbers of rounds, for which we generate definitions.  The
 * argument @_@ names a macro taking a round count; it is invoked once each
 * for 8, 12 and 20 rounds, in that order, with nothing between the
 * invocations.
 */
#define SALSA20_VARS(_) _(8) _(12) _(20)

/* Constructing externally-facing names.  @SALSA20_DECOR(base, r, suff)@
 * selects a helper macro by pasting the round count @r@ onto
 * @SALSA20__DECOR_@, so @r@ must be written literally as 8, 12 or 20.  For
 * 20 rounds the result is @GLUE(base, suff)@; for 12 or 8 rounds the round
 * count is pasted onto the end of @base@ first.
 *
 * @GLUE@ is not defined in this header; presumably it comes from
 * <mLib/macros.h> and joins its two arguments into a single identifier.
 */
#define SALSA20_DECOR(base, r, suff) SALSA20__DECOR_##r(base, suff)
#define SALSA20__DECOR_20(base, suff) GLUE(base, suff)
#define SALSA20__DECOR_12(base, suff) GLUE(base##12, suff)
#define SALSA20__DECOR_8(base, suff) GLUE(base##8, suff)

/* Preprocessor-time table of the standard names: @SALSA20_NAME_@ followed by
 * a round count is a string literal naming that variant.  The 20-round
 * variant is plain `salsa20'; the 12- and 8-round variants add a slash and
 * the round count.
 */
#define SALSA20_NAME_20 "salsa20"
#define SALSA20_NAME_12 "salsa20/12"
#define SALSA20_NAME_8 "salsa20/8"

/*----- That's all, folks -------------------------------------------------*/

#ifdef __cplusplus
  }
#endif

#endif
Commit	Line	Data
194e93f2 MW	1	/* -*-c-*-
	2	*
	3	* Salsa20 core definitions
	4	*
	5	* (c) 2015 Straylight/Edgeware
	6	*/
	7
	8	#ifndef CATACOMB_SALSA20_CORE_H
	9	#define CATACOMB_SALSA20_CORE_H
	10
	11	#ifdef __cplusplus
	12	extern "C" {
	13	#endif
	14
	15	/*----- Header files ------------------------------------------------------*/
	16
	17	#include <mLib/bits.h>
	18	#include <mLib/macros.h>
	19
	20	#ifndef CATACOMB_SALSA20_H
	21	# include "salsa20.h"
	22	#endif
	23
	24	/*----- Magic constants ---------------------------------------------------*/
	25
	26	/* The magic Salsa20 constants, for 256-bit keys... */
	27	#define SALSA20_A256 0x61707865 /* e x p a */
	28	#define SALSA20_B256 0x3320646e /* n d 3 */
	29	#define SALSA20_C256 0x79622d32 /* 2 - b y */
	30	#define SALSA20_D256 0x6b206574 /* t e k */
	31
	32	/* ... and for 128-bit keys ... */
	33	#define SALSA20_A128 SALSA20_A256 /* e x p a */
	34	#define SALSA20_B128 0x3120646e /* n d 1 */
	35	#define SALSA20_C128 0x79622d36 /* 6 - b y */
	36	#define SALSA20_D128 SALSA20_D256 /* t e k */
	37
	38	/* ... and for 80-bit keys, for completeness's sake. */
	39	#define SALSA20_A80 SALSA20_A128 /* e x p a */
	40	#define SALSA20_B80 SALSA20_B128 /* n d 1 */
	41	#define SALSA20_C80 0x79622d30 /* 0 - b y */
	42	#define SALSA20_D80 SALSA20_D128 /* t e k */
	43
	44	/*----- The Salsa20 core function -----------------------------------------*/
	45
a4c2e267 MW	46	/* It makes life somewhat easier if we don't actually store and maintain the
	47	* input matrix in the textbook order. Instead, we rotate the columns other
	48	* than the leftmost one upwards, so that the constants which were originally
	49	* along the diagonal end up on the top row. We'll need to undo this
	50	* permutation on output, but that's not too terrible an imposition.
	51	*
	52	* The permutation we're applying to the matrix elements is this:
	53	*
	54	* [ 0 1 2 3 ] [ 0 5 10 15 ]
	55	* [ 4 5 6 7 ] --> [ 4 9 14 3 ]
	56	* [ 8 9 10 11 ] [ 8 13 2 7 ]
	57	* [ 12 13 14 15 ] [ 12 1 6 11 ]
	58	*
	59	* and as a result, we need to apply this inverse permutation to figure out
	60	* which indices to use in the doublerow function and elsewhere.
	61	*
	62	* [ 0 13 10 7 ]
	63	* [ 4 1 14 11 ]
	64	* [ 8 5 2 15 ]
	65	* [ 12 9 6 3 ]
	66	*/
	67
194e93f2 MW	68	/* The Salsa20 quarter-round. Read from the matrix @y@ at indices @a@, @b@,
	69	* @c@, and @d@; and write to the corresponding elements of @z@.
	70	*/
	71	#define SALSA20_QR(z, y, a, b, c, d) do { \
	72	(z)[b] = (y)[b] ^ ROL32((y)[a] + (y)[d], 7); \
	73	(z)[c] = (y)[c] ^ ROL32((z)[b] + (y)[a], 9); \
	74	(z)[d] = (y)[d] ^ ROL32((z)[c] + (z)[b], 13); \
	75	(z)[a] = (y)[a] ^ ROL32((z)[d] + (z)[c], 18); \
	76	} while (0)
	77
	78	/* The Salsa20 double-round. Read from matrix @y@, writing the result to
	79	* @z@.
	80	*/
	81	#define SALSA20_DR(z, y) do { \
	82	SALSA20_QR(z, y, 0, 4, 8, 12); \
a4c2e267 MW	83	SALSA20_QR(z, y, 1, 5, 9, 13); \
	84	SALSA20_QR(z, y, 2, 6, 10, 14); \
	85	SALSA20_QR(z, y, 3, 7, 11, 15); \
	86	SALSA20_QR(z, z, 0, 13, 10, 7); \
	87	SALSA20_QR(z, z, 1, 14, 11, 4); \
	88	SALSA20_QR(z, z, 2, 15, 8, 5); \
	89	SALSA20_QR(z, z, 3, 12, 9, 6); \
194e93f2 MW	90	} while (0)
	91
	92	/* The Salsa20 feedforward step, used at the end of the core function. Here,
	93	* @y@ contains the original input matrix; @z@ contains the final one, and is
a4c2e267	94	* updated. The output is rendered in canonical order, ready for output.
194e93f2 MW	95	*/
194e93f2 MW	96	#define SALSA20_FFWD(z, y) do { \
a4c2e267 MW	97	const uint32 *_y = (y); \
	98	uint32 *_z = (z); \
	99	int _t; \
	100	_z[ 0] = _z[ 0] + _y[ 0]; _z[ 4] = _z[ 4] + _y[ 4]; \
	101	_z[ 8] = _z[ 8] + _y[ 8]; _z[12] = _z[12] + _y[12]; \
	102	_t = _z[ 1] + _y[ 1]; _z[ 1] = _z[13] + _y[13]; \
	103	_z[13] = _z[ 9] + _y[ 9]; _z[ 9] = _z[ 5] + _y[ 5]; _z[ 5] = _t; \
	104	_t = _z[ 2] + _y[ 2]; _z[ 2] = _z[10] + _y[10]; _z[10] = _t; \
	105	_t = _z[ 6] + _y[ 6]; _z[ 6] = _z[14] + _y[14]; _z[14] = _t; \
	106	_t = _z[ 3] + _y[ 3]; _z[ 3] = _z[ 7] + _y[ 7]; \
	107	_z[ 7] = _z[11] + _y[11]; _z[11] = _z[15] + _y[15]; _z[15] = _t; \
194e93f2 MW	108	} while (0)
	109
	110	/* Various numbers of rounds, unrolled. Read from @y@, and write to @z@. */
	111	#define SALSA20_4R(z, y) \
	112	do { SALSA20_DR(z, y); SALSA20_DR(z, z); } while (0)
	113	#define SALSA20_8R(z, y) \
	114	do { SALSA20_4R(z, y); SALSA20_4R(z, z); } while (0)
	115	#define SALSA20_12R(z, y) \
	116	do { SALSA20_8R(z, y); SALSA20_4R(z, z); } while (0)
	117	#define SALSA20_20R(z, y) \
	118	do { SALSA20_12R(z, y); SALSA20_8R(z, z); } while (0)
	119
	120	/* Apply @n@ (must be even) rounds, rolled. (This seems to be faster,
	121	* probably because it fits in cache better). Read from @y@, and write to
	122	* @z@.
	123	*/
	124	#define SALSA20_nR(z, y, n) do { \
	125	int _i; \
	126	SALSA20_DR(z, y); \
	127	for (_i = 0; _i < (n)/2 - 1; _i++) SALSA20_DR(z, z); \
	128	} while (0)
	129
	130	/* Step the counter in the Salsa20 state matrix @a@. */
	131	#define SALSA20_STEP(a) \
a4c2e267	132	do { (a)[8] = U32((a)[8] + 1); (a)[5] += !(a)[8]; } while (0)
194e93f2 MW	133
	134	/*----- Buffering and output ----------------------------------------------*
	135	*
	136	* These macros are also used by ChaCha.
	137	*/
	138
	139	/* Copy the Salsa20 matrix @a@ to the output buffer at @d@, advancing @d@
	140	* past the new material.
	141	*/
	142	#define SALSA20_GENFULL(a, d) do { \
	143	int _i; \
	144	\
	145	for (_i = 0; _i < 16; _i++) { STORE32_L((d), (a)[_i]); (d) += 4; } \
	146	} while (0)
	147
	148	/* XOR the contents the input buffer at @s@ with the Salsa20 matrix @a@,
	149	* writing the result to @d@ and advance @s@ and @d@.
	150	*/
	151	#define SALSA20_MIXFULL(a, d, s) do { \
	152	uint32 _x; \
	153	int _i; \
	154	\
	155	for (_i = 0; _i < 16; _i++) { \
	156	_x = LOAD32_L(s); (s) += 4; \
	157	_x ^= (a)[_i]; \
	158	STORE32_L((d), _x); (d) += 4; \
	159	} \
	160	} while (0)
	161
	162	/* Fill the context @ctx@'s buffer from the matrix @a@ in preparation for
	163	* emitting partial blocks of output.
	164	*/
	165	#define SALSA20_PREPBUF(ctx, a) do { \
	166	int _i; \
0fee61eb MW	167	for (_i = 0; _i < 16; _i++) STORE32_L((ctx)->b + 4*_i, (a)[_i]); \
0fee61eb MW	168	(ctx)->off = 0; \
194e93f2 MW	169	} while (0)
	170
	171	/* Write at most @n@ bytes of buffered output from the context @ctx@ to the
	172	* output buffer @d@ (if it's not null), XORing it with the input buffer @s@
	173	* (if that's not null). Both @s@ and @d@ are advanced if they aren't null;
	174	* @n@ is decreased appropriately.
	175	*/
	176	#define SALSA20_OUTBUF(ctx, d, s, n) do { \
6a0eb244 MW	177	const octet *_p = (ctx)->b + (ctx)->off; \
	178	size_t _n = (n); \
	179	\
	180	(ctx)->off += _n; \
	181	if (!(d)) /* nothing to do */; \
	182	else if (!(s)) { memcpy((d), _p, _n); (d) += _n; } \
	183	else while (_n--) *(d)++ = *(s)++ ^ *_p++; \
194e93f2 MW	184	} while (0)
	185
	186	/*----- Variants and naming -----------------------------------------------*/
	187
	188	/* Common numbers of rounds, for which we generate definitions. */
	189	#define SALSA20_VARS(_) _(8) _(12) _(20)
	190
	191	/* Constructing externally-facing names. */
	192	#define SALSA20_DECOR(base, r, suff) SALSA20__DECOR_##r(base, suff)
	193	#define SALSA20__DECOR_20(base, suff) GLUE(base, suff)
	194	#define SALSA20__DECOR_12(base, suff) GLUE(base##12, suff)
	195	#define SALSA20__DECOR_8(base, suff) GLUE(base##8, suff)
	196
	197	/* Preprocessor-time table of the standard names. */
	198	#define SALSA20_NAME_20 "salsa20"
	199	#define SALSA20_NAME_12 "salsa20/12"
	200	#define SALSA20_NAME_8 "salsa20/8"
	201
	202	/*----- That's all, folks -------------------------------------------------*/
	203
	204	#ifdef __cplusplus
	205	}
	206	#endif
	207
	208	#endif