X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/290ddb617fe530512a3496de61318a98ae623fe7..92edc356a312bc64abca0c30bc03d4b6676f3d39:/symm/salsa20-core.h

diff --git a/symm/salsa20-core.h b/symm/salsa20-core.h
index 98efa72e..e64e7331 100644
--- a/symm/salsa20-core.h
+++ b/symm/salsa20-core.h
@@ -43,6 +43,28 @@
 
 /*----- The Salsa20 core function -----------------------------------------*/
 
+/* It makes life somewhat easier if we don't actually store and maintain the
+ * input matrix in the textbook order.  Instead, we rotate column @j@ upwards
+ * by @j@ places (leaving the leftmost column alone), so that the constants
+ * originally along the diagonal end up on the top row.  We'll need to undo
+ * this permutation on output, but that's not too terrible an imposition.
+ *
+ * The permutation we're applying to the matrix elements is this:
+ *
+ *     [  0  1  2  3 ]         [  0  5 10 15 ]
+ *     [  4  5  6  7 ]   -->   [  4  9 14  3 ]
+ *     [  8  9 10 11 ]         [  8 13  2  7 ]
+ *     [ 12 13 14 15 ]         [ 12  1  6 11 ]
+ *
+ * and as a result, we need this inverse permutation, giving the stored index
+ * for each textbook position, in the doublerow function and elsewhere.
+ *
+ *     [  0 13 10  7 ]
+ *     [  4  1 14 11 ]
+ *     [  8  5  2 15 ]
+ *     [ 12  9  6  3 ]
+ */
+
 /* The Salsa20 quarter-round.  Read from the matrix @y@ at indices @a@, @b@,
  * @c@, and @d@; and write to the corresponding elements of @z@.
*/ @@ -58,22 +80,31 @@ */ #define SALSA20_DR(z, y) do { \ SALSA20_QR(z, y, 0, 4, 8, 12); \ - SALSA20_QR(z, y, 5, 9, 13, 1); \ - SALSA20_QR(z, y, 10, 14, 2, 6); \ - SALSA20_QR(z, y, 15, 3, 7, 11); \ - SALSA20_QR(z, z, 0, 1, 2, 3); \ - SALSA20_QR(z, z, 5, 6, 7, 4); \ - SALSA20_QR(z, z, 10, 11, 8, 9); \ - SALSA20_QR(z, z, 15, 12, 13, 14); \ + SALSA20_QR(z, y, 1, 5, 9, 13); \ + SALSA20_QR(z, y, 2, 6, 10, 14); \ + SALSA20_QR(z, y, 3, 7, 11, 15); \ + SALSA20_QR(z, z, 0, 13, 10, 7); \ + SALSA20_QR(z, z, 1, 14, 11, 4); \ + SALSA20_QR(z, z, 2, 15, 8, 5); \ + SALSA20_QR(z, z, 3, 12, 9, 6); \ } while (0) /* The Salsa20 feedforward step, used at the end of the core function. Here, * @y@ contains the original input matrix; @z@ contains the final one, and is - * updated. + * updated. The output is rendered in canonical order, ready for output. */ #define SALSA20_FFWD(z, y) do { \ - int _i; \ - for (_i = 0; _i < 16; _i++) (z)[_i] += (y)[_i]; \ + const uint32 *_y = (y); \ + uint32 *_z = (z); \ + int _t; \ + _z[ 0] = _z[ 0] + _y[ 0]; _z[ 4] = _z[ 4] + _y[ 4]; \ + _z[ 8] = _z[ 8] + _y[ 8]; _z[12] = _z[12] + _y[12]; \ + _t = _z[ 1] + _y[ 1]; _z[ 1] = _z[13] + _y[13]; \ + _z[13] = _z[ 9] + _y[ 9]; _z[ 9] = _z[ 5] + _y[ 5]; _z[ 5] = _t; \ + _t = _z[ 2] + _y[ 2]; _z[ 2] = _z[10] + _y[10]; _z[10] = _t; \ + _t = _z[ 6] + _y[ 6]; _z[ 6] = _z[14] + _y[14]; _z[14] = _t; \ + _t = _z[ 3] + _y[ 3]; _z[ 3] = _z[ 7] + _y[ 7]; \ + _z[ 7] = _z[11] + _y[11]; _z[11] = _z[15] + _y[15]; _z[15] = _t; \ } while (0) /* Various numbers of rounds, unrolled. Read from @y@, and write to @z@. */ @@ -98,7 +129,7 @@ /* Step the counter in the Salsa20 state matrix @a@. 
*/ #define SALSA20_STEP(a) \ - do { (a)[8] = U32((a)[8] + 1); (a)[9] += !(a)[8]; } while (0) + do { (a)[8] = U32((a)[8] + 1); (a)[5] += !(a)[8]; } while (0) /*----- Buffering and output ----------------------------------------------* * @@ -133,8 +164,8 @@ */ #define SALSA20_PREPBUF(ctx, a) do { \ int _i; \ - for (_i = 0; _i < 16; _i++) STORE32_L((ctx)->buf + 4*_i, (a)[_i]); \ - (ctx)->bufi = 0; \ + for (_i = 0; _i < 16; _i++) STORE32_L((ctx)->b + 4*_i, (a)[_i]); \ + (ctx)->off = 0; \ } while (0) /* Write at most @n@ bytes of buffered output from the context @ctx@ to the @@ -143,12 +174,13 @@ * @n@ is decreased appropriately. */ #define SALSA20_OUTBUF(ctx, d, s, n) do { \ - size_t _n = (n), _left = SALSA20_OUTSZ - (ctx)->bufi; \ - if (_n > _left) _n = _left; \ - (n) -= _n; \ - if (!(d)) (ctx)->bufi += _n; \ - else if (s) while (_n--) *(d)++ = (ctx)->buf[(ctx)->bufi++] ^ *(s)++; \ - else while (_n--) *(d)++ = (ctx)->buf[(ctx)->bufi++]; \ + const octet *_p = (ctx)->b + (ctx)->off; \ + size_t _n = (n); \ + \ + (ctx)->off += _n; \ + if (!(d)) /* nothing to do */; \ + else if (!(s)) { memcpy((d), _p, _n); (d) += _n; } \ + else while (_n--) *(d)++ = *(s)++ ^ *_p++; \ } while (0) /*----- Variants and naming -----------------------------------------------*/