X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/290ddb617fe530512a3496de61318a98ae623fe7..92edc356a312bc64abca0c30bc03d4b6676f3d39:/symm/salsa20-core.h

diff --git a/symm/salsa20-core.h b/symm/salsa20-core.h
index 98efa72e..e64e7331 100644
--- a/symm/salsa20-core.h
+++ b/symm/salsa20-core.h
@@ -43,6 +43,28 @@
 
 /*----- The Salsa20 core function -----------------------------------------*/
 
+/* It makes life somewhat easier if we don't actually store and maintain the
+ * input matrix in the textbook order.  Instead, we rotate the columns other
+ * than the leftmost one upwards, so that the constants which were originally
+ * along the diagonal end up on the top row.  We'll need to undo this
+ * permutation on output, but that's not too terrible an imposition.
+ *
+ * The permutation we're applying to the matrix elements is this:
+ *
+ * [  0  1  2  3 ]	 [  0  5 10 15 ]
+ * [  4  5  6  7 ]  -->  [  4  9 14  3 ]
+ * [  8  9 10 11 ]	 [  8 13  2  7 ]
+ * [ 12 13 14 15 ]	 [ 12  1  6 11 ]
+ *
+ * and as a result, we need to apply this inverse permutation to figure out
+ * which indices to use in the doubleround function and elsewhere.
+ *
+ * [  0 13 10  7 ]
+ * [  4  1 14 11 ]
+ * [  8  5  2 15 ]
+ * [ 12  9  6  3 ]
+ */
+
 /* The Salsa20 quarter-round.  Read from the matrix @y@ at indices @a@, @b@,
  * @c@, and @d@; and write to the corresponding elements of @z@.
  */
@@ -58,22 +80,31 @@
  */
 #define SALSA20_DR(z, y) do {						\
   SALSA20_QR(z, y,  0,  4,  8, 12);					\
-  SALSA20_QR(z, y,  5,  9, 13,  1);					\
-  SALSA20_QR(z, y, 10, 14,  2,  6);					\
-  SALSA20_QR(z, y, 15,  3,  7, 11);					\
-  SALSA20_QR(z, z,  0,  1,  2,  3);					\
-  SALSA20_QR(z, z,  5,  6,  7,  4);					\
-  SALSA20_QR(z, z, 10, 11,  8,  9);					\
-  SALSA20_QR(z, z, 15, 12, 13, 14);					\
+  SALSA20_QR(z, y,  1,  5,  9, 13);					\
+  SALSA20_QR(z, y,  2,  6, 10, 14);					\
+  SALSA20_QR(z, y,  3,  7, 11, 15);					\
+  SALSA20_QR(z, z,  0, 13, 10,  7);					\
+  SALSA20_QR(z, z,  1, 14, 11,  4);					\
+  SALSA20_QR(z, z,  2, 15,  8,  5);					\
+  SALSA20_QR(z, z,  3, 12,  9,  6);					\
 } while (0)
 
 /* The Salsa20 feedforward step, used at the end of the core function.  Here,
  * @y@ contains the original input matrix; @z@ contains the final one, and is
- * updated.
+ * updated.  The sums are stored back in canonical order, ready for output.
  */
 #define SALSA20_FFWD(z, y) do {						\
-  int _i;								\
-  for (_i = 0; _i < 16; _i++) (z)[_i] += (y)[_i];			\
+  const uint32 *_y = (y);						\
+  uint32 *_z = (z);							\
+  uint32 _t;		/* holds one sum while a cycle is rotated */	\
+  _z[ 0] = _z[ 0] + _y[ 0]; _z[ 4] = _z[ 4] + _y[ 4];			\
+  _z[ 8] = _z[ 8] + _y[ 8]; _z[12] = _z[12] + _y[12];			\
+      _t = _z[ 1] + _y[ 1]; _z[ 1] = _z[13] + _y[13];			\
+  _z[13] = _z[ 9] + _y[ 9]; _z[ 9] = _z[ 5] + _y[ 5]; _z[ 5] = _t;	\
+      _t = _z[ 2] + _y[ 2]; _z[ 2] = _z[10] + _y[10]; _z[10] = _t;	\
+      _t = _z[ 6] + _y[ 6]; _z[ 6] = _z[14] + _y[14]; _z[14] = _t;	\
+      _t = _z[ 3] + _y[ 3]; _z[ 3] = _z[ 7] + _y[ 7];			\
+  _z[ 7] = _z[11] + _y[11]; _z[11] = _z[15] + _y[15]; _z[15] = _t;	\
 } while (0)
 
 /* Various numbers of rounds, unrolled.  Read from @y@, and write to @z@. */
@@ -98,7 +129,7 @@
 
 /* Step the counter in the Salsa20 state matrix @a@. */
 #define SALSA20_STEP(a)							\
-  do { (a)[8] = U32((a)[8] + 1); (a)[9] += !(a)[8]; } while (0)
+  do { (a)[8] = U32((a)[8] + 1); (a)[5] += !(a)[8]; } while (0)
 
 /*----- Buffering and output ----------------------------------------------*
  *
@@ -133,8 +164,8 @@
  */
 #define SALSA20_PREPBUF(ctx, a) do {					\
   int _i;								\
-  for (_i = 0; _i < 16; _i++) STORE32_L((ctx)->buf + 4*_i, (a)[_i]);	\
-  (ctx)->bufi = 0;							\
+  for (_i = 0; _i < 16; _i++) STORE32_L((ctx)->b + 4*_i, (a)[_i]);	\
+  (ctx)->off = 0;							\
 } while (0)
 
 /* Write at most @n@ bytes of buffered output from the context @ctx@ to the
@@ -143,12 +174,13 @@
  * @n@ is decreased appropriately.
  */
 #define SALSA20_OUTBUF(ctx, d, s, n) do {				\
-  size_t _n = (n), _left = SALSA20_OUTSZ - (ctx)->bufi;			\
-  if (_n > _left) _n = _left;						\
-  (n) -= _n;								\
-  if (!(d)) (ctx)->bufi += _n;						\
-  else if (s) while (_n--) *(d)++ = (ctx)->buf[(ctx)->bufi++] ^ *(s)++;	\
-  else while (_n--) *(d)++ = (ctx)->buf[(ctx)->bufi++];			\
+  const octet *_p = (ctx)->b + (ctx)->off;				\
+  size_t _n = (n);							\
+									\
+  (ctx)->off += _n;							\
+  if (!(d)) /* nothing to do */;					\
+  else if (!(s)) { memcpy((d), _p, _n); (d) += _n; }			\
+  else while (_n--) *(d)++ = *(s)++ ^ *_p++;				\
 } while (0)
 
 /*----- Variants and naming -----------------------------------------------*/