/*----- The Salsa20 core function -----------------------------------------*/
+/* It makes life somewhat easier if we don't actually store and maintain the
+ * input matrix in the textbook order. Instead, we rotate the columns other
+ * than the leftmost one upwards, so that the constants which were originally
+ * along the diagonal end up on the top row. We'll need to undo this
+ * permutation on output, but that's not too terrible an imposition.
+ *
+ * The permutation we're applying to the matrix elements is this:
+ *
+ * [ 0 1 2 3 ] [ 0 5 10 15 ]
+ * [ 4 5 6 7 ] --> [ 4 9 14 3 ]
+ * [ 8 9 10 11 ] [ 8 13 2 7 ]
+ * [ 12 13 14 15 ] [ 12 1 6 11 ]
+ *
+ * and as a result, we need to apply this inverse permutation to figure out
+ * which indices to use in the doublerow function and elsewhere.
+ *
+ * [ 0 13 10 7 ]
+ * [ 4 1 14 11 ]
+ * [ 8 5 2 15 ]
+ * [ 12 9 6 3 ]
+ */
+
/* The Salsa20 quarter-round. Read from the matrix @y@ at indices @a@, @b@,
* @c@, and @d@; and write to the corresponding elements of @z@.
*/
*/
/* Apply one double-round: four quarter-rounds which read @y@ and write @z@,
 * followed by four more which update @z@ in place.  The removed lines use
 * the textbook indices; the added lines use the indices for the permuted
 * storage layout.  In the added lines, the first group walks the columns of
 * the stored matrix, and each index set in the second group is a row of the
 * inverse-permutation table given at the top of this section, rotated to
 * start with its top-row element.
 */
#define SALSA20_DR(z, y) do { \
SALSA20_QR(z, y, 0, 4, 8, 12); \
- SALSA20_QR(z, y, 5, 9, 13, 1); \
- SALSA20_QR(z, y, 10, 14, 2, 6); \
- SALSA20_QR(z, y, 15, 3, 7, 11); \
- SALSA20_QR(z, z, 0, 1, 2, 3); \
- SALSA20_QR(z, z, 5, 6, 7, 4); \
- SALSA20_QR(z, z, 10, 11, 8, 9); \
- SALSA20_QR(z, z, 15, 12, 13, 14); \
+ SALSA20_QR(z, y, 1, 5, 9, 13); \
+ SALSA20_QR(z, y, 2, 6, 10, 14); \
+ SALSA20_QR(z, y, 3, 7, 11, 15); \
+ SALSA20_QR(z, z, 0, 13, 10, 7); \
+ SALSA20_QR(z, z, 1, 14, 11, 4); \
+ SALSA20_QR(z, z, 2, 15, 8, 5); \
+ SALSA20_QR(z, z, 3, 12, 9, 6); \
} while (0)
/* The Salsa20 feedforward step, used at the end of the core function. Here,
* @y@ contains the original input matrix; @z@ contains the final one, and is
- * updated.
+ * updated. The output is rendered in canonical order, ready for output.
*/
#define SALSA20_FFWD(z, y) do { \
- int _i; \
- for (_i = 0; _i < 16; _i++) (z)[_i] += (y)[_i]; \
+ const uint32 *_y = (y); \
+ uint32 *_z = (z); \
+ uint32 _t; /* holds a whole word sum, so it must be unsigned, not `int' */ \
+ \
+ /* Leftmost column (0, 4, 8, 12) is never moved: add in place. */ \
+ _z[ 0] = _z[ 0] + _y[ 0]; _z[ 4] = _z[ 4] + _y[ 4]; \
+ _z[ 8] = _z[ 8] + _y[ 8]; _z[12] = _z[12] + _y[12]; \
+ \
+ /* Second column: a four-cycle, 13 -> 1, 9 -> 13, 5 -> 9, 1 -> 5. */ \
+ _t = _z[ 1] + _y[ 1]; _z[ 1] = _z[13] + _y[13]; \
+ _z[13] = _z[ 9] + _y[ 9]; _z[ 9] = _z[ 5] + _y[ 5]; _z[ 5] = _t; \
+ \
+ /* Third column: two swaps, 2 <-> 10 and 6 <-> 14. */ \
+ _t = _z[ 2] + _y[ 2]; _z[ 2] = _z[10] + _y[10]; _z[10] = _t; \
+ _t = _z[ 6] + _y[ 6]; _z[ 6] = _z[14] + _y[14]; _z[14] = _t; \
+ \
+ /* Fourth column: a four-cycle, 7 -> 3, 11 -> 7, 15 -> 11, 3 -> 15. */ \
+ _t = _z[ 3] + _y[ 3]; _z[ 3] = _z[ 7] + _y[ 7]; \
+ _z[ 7] = _z[11] + _y[11]; _z[11] = _z[15] + _y[15]; _z[15] = _t; \
} while (0)
/* Various numbers of rounds, unrolled. Read from @y@, and write to @z@. */
/* Step the counter in the Salsa20 state matrix @a@.  The word at index 8 is
 * incremented and passed through @U32@; a second word is then bumped by one
 * exactly when the word at index 8 has just become zero.  The removed line
 * addresses that second word as index 9; the added line uses index 5, which
 * is where the permutation shown at the top of this section stores textbook
 * element 9.
 */
#define SALSA20_STEP(a) \
- do { (a)[8] = U32((a)[8] + 1); (a)[9] += !(a)[8]; } while (0)
+ do { (a)[8] = U32((a)[8] + 1); (a)[5] += !(a)[8]; } while (0)
/*----- Buffering and output ----------------------------------------------*
*
*/
/* Render the sixteen words of @a@ into the byte buffer of the context @ctx@,
 * four bytes per word via @STORE32_L@, and reset the context's read position
 * to zero.  The removed lines name the buffer and position @buf@ and @bufi@;
 * the added lines name them @b@ and @off@.
 */
#define SALSA20_PREPBUF(ctx, a) do { \
int _i; \
- for (_i = 0; _i < 16; _i++) STORE32_L((ctx)->buf + 4*_i, (a)[_i]); \
- (ctx)->bufi = 0; \
+ for (_i = 0; _i < 16; _i++) STORE32_L((ctx)->b + 4*_i, (a)[_i]); \
+ (ctx)->off = 0; \
} while (0)
/* Write at most @n@ bytes of buffered output from the context @ctx@ to the
 * destination @d@, starting at the context's current buffer position.  If
 * @d@ is null, the bytes are skipped; otherwise, if @s@ is null they are
 * copied to @d@, and if not they are XORed with the bytes at @s@.  The
 * pointers which are used are advanced, and the context's buffer position
 * moves on; @n@ is decreased appropriately.
 *
 * NOTE(review): the removed lines clamp the count to the space left in the
 * buffer (@SALSA20_OUTSZ@ less the position) and subtract it from @n@.  The
 * added lines do neither: they consume exactly @n@ bytes and leave @n@
 * alone.  Confirm that callers now bound @n@ and do their own accounting.
 */
#define SALSA20_OUTBUF(ctx, d, s, n) do { \
- size_t _n = (n), _left = SALSA20_OUTSZ - (ctx)->bufi; \
- if (_n > _left) _n = _left; \
- (n) -= _n; \
- if (!(d)) (ctx)->bufi += _n; \
- else if (s) while (_n--) *(d)++ = (ctx)->buf[(ctx)->bufi++] ^ *(s)++; \
- else while (_n--) *(d)++ = (ctx)->buf[(ctx)->bufi++]; \
+ const octet *_p = (ctx)->b + (ctx)->off; \
+ size_t _n = (n); \
+ \
+ (ctx)->off += _n; \
+ if (!(d)) /* nothing to do */; \
+ else if (!(s)) { memcpy((d), _p, _n); (d) += _n; } \
+ else while (_n--) *(d)++ = *(s)++ ^ *_p++; \
} while (0)
/*----- Variants and naming -----------------------------------------------*/