symm: Implement Bernstein's Salsa20 stream cipher and its variants.

[catacomb] / symm / salsa20-core.h
diff --git a/symm/salsa20-core.h b/symm/salsa20-core.h

new file mode 100644 (file)

index 0000000..98efa72
--- /dev/null
+++ b/symm/salsa20-core.h
@@ -0,0 +1,176 @@
+/* -*-c-*-
+ *
+ * Salsa20 core definitions
+ *
+ * (c) 2015 Straylight/Edgeware
+ */
+
+#ifndef CATACOMB_SALSA20_CORE_H
+#define CATACOMB_SALSA20_CORE_H
+
+#ifdef __cplusplus
+  extern "C" {
+#endif
+
+/*----- Header files ------------------------------------------------------*/
+
+#include <mLib/bits.h>
+#include <mLib/macros.h>
+
+#ifndef CATACOMB_SALSA20_H
+#  include "salsa20.h"
+#endif
+
+/*----- Magic constants ---------------------------------------------------*/
+
+/* The magic Salsa20 constants, for 256-bit keys.  Each word packs four
+ * ASCII characters, lowest-order byte first, as spelled out alongside it;
+ * read in order, the four words say "expand 32-byte k".
+ */
+#define SALSA20_A256 0x61707865                /* e x p a */
+#define SALSA20_B256 0x3320646e                /* n d   3 */
+#define SALSA20_C256 0x79622d32                /* 2 - b y */
+#define SALSA20_D256 0x6b206574                /* t e   k */
+
+/* ... and for 128-bit keys ("expand 16-byte k"): only the two middle words
+ * differ from the 256-bit set.
+ */
+#define SALSA20_A128 SALSA20_A256      /* e x p a */
+#define SALSA20_B128 0x3120646e                /* n d   1 */
+#define SALSA20_C128 0x79622d36                /* 6 - b y */
+#define SALSA20_D128 SALSA20_D256      /* t e   k */
+
+/* ... and for 80-bit keys ("expand 10-byte k"), for completeness's sake:
+ * only the third word differs from the 128-bit set.
+ */
+#define SALSA20_A80 SALSA20_A128       /* e x p a */
+#define SALSA20_B80 SALSA20_B128       /* n d   1 */
+#define SALSA20_C80 0x79622d30         /* 0 - b y */
+#define SALSA20_D80 SALSA20_D128       /* t e   k */
+
+/*----- The Salsa20 core function -----------------------------------------*/
+
+/* The Salsa20 quarter-round.  Read from the matrix @y@ at indices @a@, @b@,
+ * @c@, and @d@; and write to the corresponding elements of @z@.
+ *
+ * The words are produced in the order @b@, @c@, @d@, @a@.  Each one is the
+ * old word from @y@ XORed with the result of applying @ROL32@ to the sum of
+ * the two words which precede it in that cyclic order, taking the new value
+ * from @z@ where one has already been computed and the old value from @y@
+ * otherwise.  The counts handed to @ROL32@ are 7, 9, 13 and 18
+ * respectively.
+ *
+ * Every element of @y@ is read before the matching element of @z@ is
+ * stored, so @z@ and @y@ may be the same matrix.  The arguments are
+ * expanded several times each, so they should be free of side effects.
+ */
+#define SALSA20_QR(z, y, a, b, c, d) do {                              \
+  (z)[b] = (y)[b] ^ ROL32((y)[a] + (y)[d],  7);                                \
+  (z)[c] = (y)[c] ^ ROL32((z)[b] + (y)[a],  9);                                \
+  (z)[d] = (y)[d] ^ ROL32((z)[c] + (z)[b], 13);                                \
+  (z)[a] = (y)[a] ^ ROL32((z)[d] + (z)[c], 18);                                \
+} while (0)
+
+/* The Salsa20 double-round.  Read from matrix @y@, writing the result to
+ * @z@.
+ *
+ * Think of the 16 words as a 4x4 matrix stored a row at a time.  The first
+ * four quarter-rounds read @y@ and work down the columns (0, 4, 8, 12; 1,
+ * 5, 9, 13; and so on), between them writing every element of @z@ exactly
+ * once.  The second four work along the rows (0--3, 4--7, 8--11, 12--15),
+ * updating @z@ in place, so @y@ is not read again.  Within each pass, the
+ * index lists start on the diagonal elements 0, 5, 10 and 15 and wrap
+ * round the column or row from there.
+ *
+ * Since the column pass touches disjoint sets of indices and the
+ * quarter-round tolerates its input and output being the same, @z@ and @y@
+ * may be the same matrix.
+ */
+#define SALSA20_DR(z, y) do {                                          \
+  SALSA20_QR(z, y,  0,  4,  8, 12);                                    \
+  SALSA20_QR(z, y,  5,  9, 13,  1);                                    \
+  SALSA20_QR(z, y, 10, 14,  2,  6);                                    \
+  SALSA20_QR(z, y, 15,  3,  7, 11);                                    \
+  SALSA20_QR(z, z,  0,  1,  2,  3);                                    \
+  SALSA20_QR(z, z,  5,  6,  7,  4);                                    \
+  SALSA20_QR(z, z, 10, 11,  8,  9);                                    \
+  SALSA20_QR(z, z, 15, 12, 13, 14);                                    \
+} while (0)
+
+/* The Salsa20 feedforward step, used at the end of the core function.  Here,
+ * @y@ contains the original input matrix; @z@ contains the final one, and is
+ * updated: each of its 16 words has the corresponding word of @y@ added to
+ * it.  No reduction is applied to the sums here.
+ *
+ * The loop counter is a block-local @int@ called @_i@, so neither argument
+ * may mention a variable of that name.
+ */
+#define SALSA20_FFWD(z, y) do {                                                \
+  int _i;                                                              \
+  for (_i = 0; _i < 16; _i++) (z)[_i] += (y)[_i];                      \
+} while (0)
+
+/* Various numbers of rounds, unrolled.  Read from @y@, and write to @z@.
+ *
+ * One @SALSA20_DR@ counts as two rounds, so @SALSA20_4R@ is two of them;
+ * the larger counts are composed as 8 = 4 + 4, 12 = 8 + 4 and 20 = 12 + 8.
+ * Only the first step of each reads @y@; the rest work on @z@ in place.
+ */
+#define SALSA20_4R(z, y)                                               \
+  do { SALSA20_DR(z, y); SALSA20_DR(z, z); } while (0)
+#define SALSA20_8R(z, y)                                               \
+  do { SALSA20_4R(z, y); SALSA20_4R(z, z); } while (0)
+#define SALSA20_12R(z, y)                                              \
+  do { SALSA20_8R(z, y); SALSA20_4R(z, z); } while (0)
+#define SALSA20_20R(z, y)                                              \
+  do { SALSA20_12R(z, y); SALSA20_8R(z, z); } while (0)
+
+/* Apply @n@ (must be even) rounds, rolled.  (This seems to be faster,
+ * probably because it fits in cache better).  Read from @y@, and write to
+ * @z@.
+ *
+ * One double-round from @y@ to @z@ is always done; then a further
+ * @(n)/2 - 1@ double-rounds are applied to @z@ in place.  Consequently an
+ * integer @n@ below 2 still gets two rounds, and an odd @n@ behaves like
+ * the even number just below it.  The loop counter is a block-local @int@
+ * called @_i@, so the arguments may not mention a variable of that name.
+ */
+#define SALSA20_nR(z, y, n) do {                                       \
+  int _i;                                                              \
+  SALSA20_DR(z, y);                                                    \
+  for (_i = 0; _i < (n)/2 - 1; _i++) SALSA20_DR(z, z);                 \
+} while (0)
+
+/* Step the counter in the Salsa20 state matrix @a@.
+ *
+ * The counter occupies words 8 and 9.  Word 8 is incremented and passed
+ * through @U32@ (not defined here; presumably mLib's reduction to 32
+ * bits).  Word 9 is then bumped by one exactly when word 8 has come out as
+ * zero, i.e., it receives the carry.  Word 9 itself is not passed through
+ * @U32@.
+ */
+#define SALSA20_STEP(a)                                                        \
+  do { (a)[8] = U32((a)[8] + 1); (a)[9] += !(a)[8]; } while (0)
+
+/*----- Buffering and output ----------------------------------------------*
+ *
+ * These macros are also used by ChaCha.
+ */
+
+/* Copy the Salsa20 matrix @a@ to the output buffer at @d@, advancing @d@
+ * past the new material.
+ *
+ * All 16 words are written in index order using @STORE32_L@, and @d@ is
+ * moved on by 4 after each one, making 64 in total.  Because @d@ is
+ * assigned to, it must be a modifiable pointer rather than an array.
+ */
+#define SALSA20_GENFULL(a, d) do {                                     \
+  int _i;                                                              \
+                                                                       \
+  for (_i = 0; _i < 16; _i++) { STORE32_L((d), (a)[_i]); (d) += 4; }   \
+} while (0)
+
+/* XOR the contents of the input buffer at @s@ with the Salsa20 matrix @a@,
+ * writing the result to @d@, and advance @s@ and @d@.
+ *
+ * This works a word at a time: 16 times over, a word is fetched from @s@
+ * with @LOAD32_L@, XORed with the next word of @a@, and stored at @d@ with
+ * @STORE32_L@; both pointers move on by 4 each time, making 64 in total.
+ * Each input word is fetched before the matching output word is stored.
+ */
+#define SALSA20_MIXFULL(a, d, s) do {                                  \
+  uint32 _x;                                                           \
+  int _i;                                                              \
+                                                                       \
+  for (_i = 0; _i < 16; _i++) {                                                \
+    _x = LOAD32_L(s); (s) += 4;                                                \
+    _x ^= (a)[_i];                                                     \
+    STORE32_L((d), _x); (d) += 4;                                      \
+  }                                                                    \
+} while (0)
+
+/* Fill the context @ctx@'s buffer from the matrix @a@ in preparation for
+ * emitting partial blocks of output.
+ *
+ * Each of the 16 words of @a@ is stored with @STORE32_L@ in @ctx->buf@, at
+ * a byte offset of four times its index, and the read position @ctx->bufi@
+ * is reset to zero.
+ */
+#define SALSA20_PREPBUF(ctx, a) do {                                   \
+  int _i;                                                              \
+  for (_i = 0; _i < 16; _i++) STORE32_L((ctx)->buf + 4*_i, (a)[_i]);   \
+  (ctx)->bufi = 0;                                                     \
+} while (0)
+
+/* Write at most @n@ bytes of buffered output from the context @ctx@ to the
+ * output buffer @d@ (if it's not null), XORing it with the input buffer @s@
+ * (if that's not null).  @n@ is decreased by the number of bytes consumed.
+ *
+ * The number of bytes consumed is the smaller of @n@ and what's left in
+ * the buffer, i.e., @SALSA20_OUTSZ - ctx->bufi@; @ctx->bufi@ moves on by
+ * that much in every case.  If @d@ is null, the bytes are simply skipped,
+ * and @s@ is neither read nor advanced, even if it isn't null.  Otherwise
+ * @d@ is advanced past the bytes written, and so is @s@ if it isn't null.
+ * Nothing here refills the buffer, so @n@ may still be nonzero afterwards.
+ */
+#define SALSA20_OUTBUF(ctx, d, s, n) do {                              \
+  size_t _n = (n), _left = SALSA20_OUTSZ - (ctx)->bufi;                        \
+  if (_n > _left) _n = _left;                                          \
+  (n) -= _n;                                                           \
+  if (!(d)) (ctx)->bufi += _n;                                         \
+  else if (s) while (_n--) *(d)++ = (ctx)->buf[(ctx)->bufi++] ^ *(s)++;        \
+  else while (_n--) *(d)++ = (ctx)->buf[(ctx)->bufi++];                        \
+} while (0)
+
+/*----- Variants and naming -----------------------------------------------*/
+
+/* Common numbers of rounds, for which we generate definitions.  Pass the
+ * name of a one-argument macro as @_@: it is invoked once for each of 8, 12
+ * and 20, in that order, with nothing but whitespace in between.
+ */
+#define SALSA20_VARS(_) _(8) _(12) _(20)
+
+/* Constructing externally-facing names.  @SALSA20_DECOR(base, r, suff)@
+ * dispatches on the round count @r@, which must be written literally as 8,
+ * 12 or 20 because it is pasted directly onto @SALSA20__DECOR_@.  For 20
+ * rounds the result is @GLUE(base, suff)@; for 12 and 8 rounds the count is
+ * pasted onto @base@ first.
+ */
+#define SALSA20_DECOR(base, r, suff) SALSA20__DECOR_##r(base, suff)
+#define SALSA20__DECOR_20(base, suff) GLUE(base, suff)
+#define SALSA20__DECOR_12(base, suff) GLUE(base##12, suff)
+#define SALSA20__DECOR_8(base, suff) GLUE(base##8, suff)
+
+/* Preprocessor-time table of the standard names: @SALSA20_NAME_@ followed
+ * by the round count expands to the cipher's name as a string literal.
+ */
+#define SALSA20_NAME_20 "salsa20"
+#define SALSA20_NAME_12 "salsa20/12"
+#define SALSA20_NAME_8 "salsa20/8"
+
+/*----- That's all, folks -------------------------------------------------*/
+
+#ifdef __cplusplus
+  }
+#endif
+
+#endif