[catacomb] / symm / salsa20-core.h

/* -*-c-*-
 *
 * Salsa20 core definitions
 *
 * (c) 2015 Straylight/Edgeware
 */

#ifndef CATACOMB_SALSA20_CORE_H
#define CATACOMB_SALSA20_CORE_H

#ifdef __cplusplus
  extern "C" {
#endif

/*----- Header files ------------------------------------------------------*/

#include <mLib/bits.h>
#include <mLib/macros.h>

#ifndef CATACOMB_SALSA20_H
#  include "salsa20.h"
#endif

/*----- Magic constants ---------------------------------------------------*/

/* The magic Salsa20 constants.  Each group of four words, written out as
 * little-endian bytes, spells an ASCII string.  For 256-bit keys the string
 * is `expand 32-byte k'...
 */
#define SALSA20_A256 0x61707865		/* `expa' */
#define SALSA20_B256 0x3320646e		/* `nd 3' */
#define SALSA20_C256 0x79622d32		/* `2-by' */
#define SALSA20_D256 0x6b206574		/* `te k' */

/* ... for 128-bit keys it is `expand 16-byte k' (the first and last words
 * are the same as for 256-bit keys) ...
 */
#define SALSA20_A128 0x61707865		/* `expa', as SALSA20_A256 */
#define SALSA20_B128 0x3120646e		/* `nd 1' */
#define SALSA20_C128 0x79622d36		/* `6-by' */
#define SALSA20_D128 0x6b206574		/* `te k', as SALSA20_D256 */

/* ... and for 80-bit keys, for completeness's sake, `expand 10-byte k'
 * (only the third word differs from the 128-bit set).
 */
#define SALSA20_A80 0x61707865		/* `expa', as SALSA20_A128 */
#define SALSA20_B80 0x3120646e		/* `nd 1', as SALSA20_B128 */
#define SALSA20_C80 0x79622d30		/* `0-by' */
#define SALSA20_D80 0x6b206574		/* `te k', as SALSA20_D128 */

/*----- The Salsa20 core function -----------------------------------------*/

/* The Salsa20 quarter-round.  Read from the matrix @y@ at indices @a@, @b@,
 * @c@, and @d@; and write to the corresponding elements of @z@.
 *
 * The four elements are updated in the order @b@, @c@, @d@, @a@; each new
 * value is the old one XORed with a rotated sum of two other elements,
 * using the already-updated @z@ values where they exist.  Every element of
 * @y@ is read for the last time no later than the statement which writes
 * the matching element of @z@, so @z@ and @y@ may be the same matrix (and
 * the double-round relies on this).  The statement order must therefore not
 * be changed.
 *
 * All of the arguments are evaluated several times, so they must be free of
 * side-effects.
 */
#define SALSA20_QR(z, y, a, b, c, d) do {				\
  (z)[b] = (y)[b] ^ ROL32((y)[a] + (y)[d],  7);				\
  (z)[c] = (y)[c] ^ ROL32((z)[b] + (y)[a],  9);				\
  (z)[d] = (y)[d] ^ ROL32((z)[c] + (z)[b], 13);				\
  (z)[a] = (y)[a] ^ ROL32((z)[d] + (z)[c], 18);				\
} while (0)

/* The Salsa20 double-round.  Read from matrix @y@, writing the result to
 * @z@.
 *
 * This is eight quarter-rounds.  The first four read @y@ and write @z@;
 * between them they touch each of the sixteen indices exactly once, in
 * groups whose indices are congruent modulo 4 (i.e., the columns, if the
 * sixteen words are viewed as a 4x4 matrix stored row by row).  The last
 * four work on @z@ in place, in groups of four consecutive indices (the
 * rows, in the same view).  Within each group the starting index is
 * rotated one place further along than in the previous group.
 *
 * Because the first half writes every element of @z@, @z@ need not be
 * initialized beforehand; it may also be the same matrix as @y@.
 */
#define SALSA20_DR(z, y) do {						\
  SALSA20_QR(z, y,  0,  4,  8, 12);					\
  SALSA20_QR(z, y,  5,  9, 13,  1);					\
  SALSA20_QR(z, y, 10, 14,  2,  6);					\
  SALSA20_QR(z, y, 15,  3,  7, 11);					\
  SALSA20_QR(z, z,  0,  1,  2,  3);					\
  SALSA20_QR(z, z,  5,  6,  7,  4);					\
  SALSA20_QR(z, z, 10, 11,  8,  9);					\
  SALSA20_QR(z, z, 15, 12, 13, 14);					\
} while (0)

/* The Salsa20 feedforward step, which finishes off the core function.  On
 * entry, @z@ holds the matrix produced by the rounds and @y@ the matrix the
 * rounds started from; each of the sixteen words of @y@ is added into the
 * matching word of @z@.  Only @z@ is modified.
 */
#define SALSA20_FFWD(z, y) do {						\
  int _w = 0;								\
  while (_w < 16) { (z)[_w] += (y)[_w]; _w++; }				\
} while (0)

/* Fixed numbers of rounds, fully unrolled.  Each macro reads from @y@ and
 * writes to @z@: the first double-round moves the data across from @y@ to
 * @z@, and all of the later ones update @z@ in place.  An %$n$%-round macro
 * expands to %$n/2$% double-rounds.
 */
#define SALSA20_4R(z, y) do {						\
  SALSA20_DR(z, y); SALSA20_DR(z, z);					\
} while (0)
#define SALSA20_8R(z, y) do {						\
  SALSA20_4R(z, y); SALSA20_4R(z, z);					\
} while (0)
#define SALSA20_12R(z, y) do {						\
  SALSA20_4R(z, y); SALSA20_8R(z, z);					\
} while (0)
#define SALSA20_20R(z, y) do {						\
  SALSA20_8R(z, y); SALSA20_12R(z, z);					\
} while (0)

/* Apply @n@ (must be even) rounds, rolled.  (This seems to be faster,
 * probably because it fits in cache better).  Read from @y@, and write to
 * @z@.
 *
 * The first double-round is always performed, because it is the step which
 * carries the data across from @y@ to @z@; the remaining %$n/2 - 1$% are
 * done on @z@ in place.  The loop counts from 1 up to %$n/2$% rather than
 * from 0 up to %$n/2 - 1$%: the two are the same for any sensible @n@, but
 * the latter bound wraps round to an enormous value if @n@ has an unsigned
 * type and is less than 2.
 *
 * The argument @n@ is evaluated on every trip round the loop, so it should
 * be free of side-effects.
 */
#define SALSA20_nR(z, y, n) do {					\
  int _i;								\
  SALSA20_DR(z, y);							\
  for (_i = 1; _i < (n)/2; _i++) SALSA20_DR(z, z);			\
} while (0)

/* Step the counter in the Salsa20 state matrix @a@.
 *
 * The counter occupies words 8 (low) and 9 (high) of the matrix.  The low
 * word is incremented and reduced with @U32@; if it has wrapped round to
 * zero then a carry is added into the high word.  The high word is reduced
 * with @U32@ too, so that both halves are kept in range in the same way
 * even if the matrix's word type is wider than 32 bits.
 */
#define SALSA20_STEP(a) do {						\
  (a)[8] = U32((a)[8] + 1);						\
  (a)[9] = U32((a)[9] + !(a)[8]);					\
} while (0)

/*----- Buffering and output ----------------------------------------------*
 *
 * These macros are also used by ChaCha.
 */

/* Write out the whole of the Salsa20 matrix @a@ to the output buffer at
 * @d@: each of the sixteen words in turn is stored using @STORE32_L@, and
 * @d@ is moved on by four bytes after each, so that it finishes up 64 bytes
 * beyond where it started.
 */
#define SALSA20_GENFULL(a, d) do {					\
  int _k;								\
									\
  for (_k = 0; _k < 16; _k++, (d) += 4) STORE32_L((d), (a)[_k]);	\
} while (0)

/* XOR the contents of the input buffer at @s@ with the Salsa20 matrix @a@,
 * writing the result to @d@.  The data is handled a word at a time: each
 * input word is fetched with @LOAD32_L@ into a temporary, combined with the
 * matching matrix word, and only then stored with @STORE32_L@.  Both @s@
 * and @d@ are moved on by four bytes per word, so each ends up 64 bytes
 * further on.
 */
#define SALSA20_MIXFULL(a, d, s) do {					\
  int _k;								\
  uint32 _t;								\
									\
  for (_k = 0; _k < 16; _k++) {						\
    _t = LOAD32_L(s) ^ (a)[_k]; (s) += 4;				\
    STORE32_L((d), _t); (d) += 4;					\
  }									\
} while (0)

/* Load the buffer of the context @ctx@ from the matrix @a@, ready for
 * output to be handed out in pieces smaller than a whole block.  The
 * sixteen words of @a@ are stored with @STORE32_L@ at successive four-byte
 * offsets in @ctx->buf@, and the buffer index @ctx->bufi@ is reset to zero.
 */
#define SALSA20_PREPBUF(ctx, a) do {					\
  int _k, _off;								\
  for (_k = 0, _off = 0; _k < 16; _k++, _off += 4)			\
    STORE32_L((ctx)->buf + _off, (a)[_k]);				\
  (ctx)->bufi = 0;							\
} while (0)

/* Hand out at most @n@ bytes of buffered output from the context @ctx@.
 * The number of bytes actually consumed is the smaller of @n@ and the
 * amount left in the buffer (@SALSA20_OUTSZ@ less the buffer index); @n@ is
 * decreased by that number.
 *
 *   * If @d@ is null, the bytes are simply skipped: the buffer index is
 *     advanced and nothing else happens (in particular, @s@ is left alone).
 *
 *   * Otherwise, if @s@ is not null, each buffered byte is XORed with the
 *     next byte from @s@ and written to @d@; both pointers are advanced.
 *
 *   * Otherwise the buffered bytes are copied to @d@ as they are, and @d@
 *     is advanced.
 */
#define SALSA20_OUTBUF(ctx, d, s, n) do {				\
  size_t _take = (n);							\
  size_t _avail = SALSA20_OUTSZ - (ctx)->bufi;				\
  if (_take > _avail) _take = _avail;					\
  (n) -= _take;								\
  if (!(d)) {								\
    (ctx)->bufi += _take;						\
  } else if (s) {							\
    for (; _take; _take--)						\
      *(d)++ = (ctx)->buf[(ctx)->bufi++] ^ *(s)++;			\
  } else {								\
    for (; _take; _take--)						\
      *(d)++ = (ctx)->buf[(ctx)->bufi++];				\
  }									\
} while (0)

/*----- Variants and naming -----------------------------------------------*/

/* The usual numbers of rounds, for which definitions are generated.  The
 * argument names a one-parameter macro, which is invoked once for each
 * round count, in the order 8, 12, 20.
 */
#define SALSA20_VARS(X) X(8) X(12) X(20)

/* Building externally-facing names.  @SALSA20_DECOR(stem, r, tail)@ makes
 * an identifier out of @stem@ and @tail@ for the @r@-round variant: for 20
 * rounds the two are simply joined, while for 12 and 8 rounds the round
 * count is inserted between them.  The round count is pasted onto a helper
 * macro's name, so it must be written as one of the literal tokens 8, 12,
 * or 20.
 */
#define SALSA20_DECOR(stem, rounds, tail)				\
  SALSA20__DECOR_##rounds(stem, tail)
#define SALSA20__DECOR_20(stem, tail) GLUE(stem, tail)
#define SALSA20__DECOR_12(stem, tail) GLUE(stem##12, tail)
#define SALSA20__DECOR_8(stem, tail) GLUE(stem##8, tail)

/* The standard names of the variants, as string literals, one macro per
 * round count so that a name can be selected at preprocessing time by
 * pasting the count onto @SALSA20_NAME_@.
 */
#define SALSA20_NAME_8 "salsa20/8"
#define SALSA20_NAME_12 "salsa20/12"
#define SALSA20_NAME_20 "salsa20"

/*----- That's all, folks -------------------------------------------------*/

#ifdef __cplusplus
  }
#endif

#endif
Commit	Line	Data
194e93f2 MW	1	/* -*-c-*-
	2	*
	3	* Salsa20 core definitions
	4	*
	5	* (c) 2015 Straylight/Edgeware
	6	*/
	7
	8	#ifndef CATACOMB_SALSA20_CORE_H
	9	#define CATACOMB_SALSA20_CORE_H
	10
	11	#ifdef __cplusplus
	12	extern "C" {
	13	#endif
	14
	15	/*----- Header files ------------------------------------------------------*/
	16
	17	#include <mLib/bits.h>
	18	#include <mLib/macros.h>
	19
	20	#ifndef CATACOMB_SALSA20_H
	21	# include "salsa20.h"
	22	#endif
	23
	24	/*----- Magic constants ---------------------------------------------------*/
	25
	26	/* The magic Salsa20 constants, for 256-bit keys... */
	27	#define SALSA20_A256 0x61707865 /* e x p a */
	28	#define SALSA20_B256 0x3320646e /* n d 3 */
	29	#define SALSA20_C256 0x79622d32 /* 2 - b y */
	30	#define SALSA20_D256 0x6b206574 /* t e k */
	31
	32	/* ... and for 128-bit keys ... */
	33	#define SALSA20_A128 SALSA20_A256 /* e x p a */
	34	#define SALSA20_B128 0x3120646e /* n d 1 */
	35	#define SALSA20_C128 0x79622d36 /* 6 - b y */
	36	#define SALSA20_D128 SALSA20_D256 /* t e k */
	37
	38	/* ... and for 80-bit keys, for completeness's sake. */
	39	#define SALSA20_A80 SALSA20_A128 /* e x p a */
	40	#define SALSA20_B80 SALSA20_B128 /* n d 1 */
	41	#define SALSA20_C80 0x79622d30 /* 0 - b y */
	42	#define SALSA20_D80 SALSA20_D128 /* t e k */
	43
	44	/*----- The Salsa20 core function -----------------------------------------*/
	45
	46	/* The Salsa20 quarter-round. Read from the matrix @y@ at indices @a@, @b@,
	47	* @c@, and @d@; and write to the corresponding elements of @z@.
	48	*/
	49	#define SALSA20_QR(z, y, a, b, c, d) do { \
	50	(z)[b] = (y)[b] ^ ROL32((y)[a] + (y)[d], 7); \
	51	(z)[c] = (y)[c] ^ ROL32((z)[b] + (y)[a], 9); \
	52	(z)[d] = (y)[d] ^ ROL32((z)[c] + (z)[b], 13); \
	53	(z)[a] = (y)[a] ^ ROL32((z)[d] + (z)[c], 18); \
	54	} while (0)
	55
	56	/* The Salsa20 double-round. Read from matrix @y@, writing the result to
	57	* @z@.
	58	*/
	59	#define SALSA20_DR(z, y) do { \
	60	SALSA20_QR(z, y, 0, 4, 8, 12); \
	61	SALSA20_QR(z, y, 5, 9, 13, 1); \
	62	SALSA20_QR(z, y, 10, 14, 2, 6); \
	63	SALSA20_QR(z, y, 15, 3, 7, 11); \
	64	SALSA20_QR(z, z, 0, 1, 2, 3); \
65	SALSA20_QR(z, z, 5, 6, 7, 4); \
66	SALSA20_QR(z, z, 10, 11, 8, 9); \
67	SALSA20_QR(z, z, 15, 12, 13, 14); \
68	} while (0)
69
70	/* The Salsa20 feedforward step, used at the end of the core function. Here,
71	* @y@ contains the original input matrix; @z@ contains the final one, and is
72	* updated.
73	*/
74	#define SALSA20_FFWD(z, y) do { \
75	int _i; \
76	for (_i = 0; _i < 16; _i++) (z)[_i] += (y)[_i]; \
77	} while (0)
78
79	/* Various numbers of rounds, unrolled. Read from @y@, and write to @z@. */
80	#define SALSA20_4R(z, y) \
81	do { SALSA20_DR(z, y); SALSA20_DR(z, z); } while (0)
82	#define SALSA20_8R(z, y) \
83	do { SALSA20_4R(z, y); SALSA20_4R(z, z); } while (0)
84	#define SALSA20_12R(z, y) \
85	do { SALSA20_8R(z, y); SALSA20_4R(z, z); } while (0)
86	#define SALSA20_20R(z, y) \
87	do { SALSA20_12R(z, y); SALSA20_8R(z, z); } while (0)
88
89	/* Apply @n@ (must be even) rounds, rolled. (This seems to be faster,
90	* probably because it fits in cache better). Read from @y@, and write to
91	* @z@.
92	*/
93	#define SALSA20_nR(z, y, n) do { \
94	int _i; \
95	SALSA20_DR(z, y); \
96	for (_i = 0; _i < (n)/2 - 1; _i++) SALSA20_DR(z, z); \
97	} while (0)
98
99	/* Step the counter in the Salsa20 state matrix @a@. */
100	#define SALSA20_STEP(a) \
101	do { (a)[8] = U32((a)[8] + 1); (a)[9] += !(a)[8]; } while (0)
102
	103	/*----- Buffering and output ----------------------------------------------*
104	*
105	* These macros are also used by ChaCha.
106	*/
107
108	/* Copy the Salsa20 matrix @a@ to the output buffer at @d@, advancing @d@
109	* past the new material.
110	*/
111	#define SALSA20_GENFULL(a, d) do { \
112	int _i; \
113	\
114	for (_i = 0; _i < 16; _i++) { STORE32_L((d), (a)[_i]); (d) += 4; } \
115	} while (0)
116
117	/* XOR the contents the input buffer at @s@ with the Salsa20 matrix @a@,
118	* writing the result to @d@ and advance @s@ and @d@.
119	*/
120	#define SALSA20_MIXFULL(a, d, s) do { \
121	uint32 _x; \
122	int _i; \
123	\
124	for (_i = 0; _i < 16; _i++) { \
125	_x = LOAD32_L(s); (s) += 4; \
126	_x ^= (a)[_i]; \
127	STORE32_L((d), _x); (d) += 4; \
128	} \
129	} while (0)
130
131	/* Fill the context @ctx@'s buffer from the matrix @a@ in preparation for
132	* emitting partial blocks of output.
133	*/
134	#define SALSA20_PREPBUF(ctx, a) do { \
135	int _i; \
136	for (_i = 0; _i < 16; _i++) STORE32_L((ctx)->buf + 4*_i, (a)[_i]); \
137	(ctx)->bufi = 0; \
138	} while (0)
139
140	/* Write at most @n@ bytes of buffered output from the context @ctx@ to the
141	* output buffer @d@ (if it's not null), XORing it with the input buffer @s@
142	* (if that's not null). Both @s@ and @d@ are advanced if they aren't null;
143	* @n@ is decreased appropriately.
144	*/
145	#define SALSA20_OUTBUF(ctx, d, s, n) do { \
146	size_t _n = (n), _left = SALSA20_OUTSZ - (ctx)->bufi; \
147	if (_n > _left) _n = _left; \
148	(n) -= _n; \
149	if (!(d)) (ctx)->bufi += _n; \
	150	else if (s) while (_n--) *(d)++ = (ctx)->buf[(ctx)->bufi++] ^ *(s)++; \
151	else while (_n--) *(d)++ = (ctx)->buf[(ctx)->bufi++]; \
152	} while (0)
153
	154	/*----- Variants and naming -----------------------------------------------*/
155
156	/* Common numbers of rounds, for which we generate definitions. */
157	#define SALSA20_VARS(_) _(8) _(12) _(20)
158
159	/* Constructing externally-facing names. */
160	#define SALSA20_DECOR(base, r, suff) SALSA20__DECOR_##r(base, suff)
161	#define SALSA20__DECOR_20(base, suff) GLUE(base, suff)
162	#define SALSA20__DECOR_12(base, suff) GLUE(base##12, suff)
163	#define SALSA20__DECOR_8(base, suff) GLUE(base##8, suff)
164
165	/* Preprocessor-time table of the standard names. */
166	#define SALSA20_NAME_20 "salsa20"
167	#define SALSA20_NAME_12 "salsa20/12"
168	#define SALSA20_NAME_8 "salsa20/8"
169
	170	/*----- That's all, folks -------------------------------------------------*/
171
172	#ifdef __cplusplus
173	}
174	#endif
175
176	#endif