From a4c2e267f29d601a9d68ecfedc66852379e429ae Mon Sep 17 00:00:00 2001 From: Mark Wooding Date: Sat, 2 May 2015 17:05:20 +0100 Subject: [PATCH] symm/salsa20.c, symm/salsa20-core.h: Permute input matrix for SIMD. Maintain the input matrix in the Salsa20 context structure in a permuted form which makes SIMD implementations of the core function rather more efficient. --- symm/chacha-core.h | 7 +++-- symm/salsa20-core.h | 53 +++++++++++++++++++++++++++-------- symm/salsa20.c | 81 ++++++++++++++++++++++++++++++++--------------------- 3 files changed, 96 insertions(+), 45 deletions(-) diff --git a/symm/chacha-core.h b/symm/chacha-core.h index ad6b05f5..1c0efcd9 100644 --- a/symm/chacha-core.h +++ b/symm/chacha-core.h @@ -95,9 +95,12 @@ /* The ChaCha feedforward step, used at the end of the core function. Here, * @y@ contains the original input matrix; @z@ contains the final one, and is - * updated. This is the same as Salsa20. + * updated. This is the same as Salsa20, only without the final permutation. */ -#define CHACHA_FFWD(z, y) SALSA20_FFWD(z, y) +#define CHACHA_FFWD(z, y) do { \ + int _i; \ + for (_i = 0; _i < 16; _i++) (z)[_i] += (y)[_i]; \ +} while (0) /* Various numbers of rounds, unrolled. Read from @y@, and write to @z@. */ #define CHACHA_4R(z, y) \ diff --git a/symm/salsa20-core.h b/symm/salsa20-core.h index 98efa72e..b27f2220 100644 --- a/symm/salsa20-core.h +++ b/symm/salsa20-core.h @@ -43,6 +43,28 @@ /*----- The Salsa20 core function -----------------------------------------*/ +/* It makes life somewhat easier if we don't actually store and maintain the + * input matrix in the textbook order. Instead, we rotate the columns other + * than the leftmost one upwards, so that the constants which were originally + * along the diagonal end up on the top row. We'll need to undo this + * permutation on output, but that's not too terrible an imposition. + * + * The permutation we're applying to the matrix elements is this: + * + * [ 0 1 2 3 ] [ 0 5 10 15 ] + * [ 4 5 6 7 ] --> [ 4 9 14 3 ] + * [ 8 9 10 11 ] [ 8 13 2 7 ] + * [ 12 13 14 15 ] [ 12 1 6 11 ] + * + * and as a result, we need to apply this inverse permutation to figure out + * which indices to use in the doublerow function and elsewhere. + * + * [ 0 13 10 7 ] + * [ 4 1 14 11 ] + * [ 8 5 2 15 ] + * [ 12 9 6 3 ] + */ + /* The Salsa20 quarter-round. Read from the matrix @y@ at indices @a@, @b@, * @c@, and @d@; and write to the corresponding elements of @z@. */ @@ -58,22 +80,31 @@ */ #define SALSA20_DR(z, y) do { \ SALSA20_QR(z, y, 0, 4, 8, 12); \ - SALSA20_QR(z, y, 5, 9, 13, 1); \ - SALSA20_QR(z, y, 10, 14, 2, 6); \ - SALSA20_QR(z, y, 15, 3, 7, 11); \ - SALSA20_QR(z, z, 0, 1, 2, 3); \ - SALSA20_QR(z, z, 5, 6, 7, 4); \ - SALSA20_QR(z, z, 10, 11, 8, 9); \ - SALSA20_QR(z, z, 15, 12, 13, 14); \ + SALSA20_QR(z, y, 1, 5, 9, 13); \ + SALSA20_QR(z, y, 2, 6, 10, 14); \ + SALSA20_QR(z, y, 3, 7, 11, 15); \ + SALSA20_QR(z, z, 0, 13, 10, 7); \ + SALSA20_QR(z, z, 1, 14, 11, 4); \ + SALSA20_QR(z, z, 2, 15, 8, 5); \ + SALSA20_QR(z, z, 3, 12, 9, 6); \ } while (0) /* The Salsa20 feedforward step, used at the end of the core function. Here, * @y@ contains the original input matrix; @z@ contains the final one, and is - * updated. + * updated. The output is rendered in canonical order, ready for output. */ #define SALSA20_FFWD(z, y) do { \ - int _i; \ - for (_i = 0; _i < 16; _i++) (z)[_i] += (y)[_i]; \ + const uint32 *_y = (y); \ + uint32 *_z = (z); \ + int _t; \ + _z[ 0] = _z[ 0] + _y[ 0]; _z[ 4] = _z[ 4] + _y[ 4]; \ + _z[ 8] = _z[ 8] + _y[ 8]; _z[12] = _z[12] + _y[12]; \ + _t = _z[ 1] + _y[ 1]; _z[ 1] = _z[13] + _y[13]; \ + _z[13] = _z[ 9] + _y[ 9]; _z[ 9] = _z[ 5] + _y[ 5]; _z[ 5] = _t; \ + _t = _z[ 2] + _y[ 2]; _z[ 2] = _z[10] + _y[10]; _z[10] = _t; \ + _t = _z[ 6] + _y[ 6]; _z[ 6] = _z[14] + _y[14]; _z[14] = _t; \ + _t = _z[ 3] + _y[ 3]; _z[ 3] = _z[ 7] + _y[ 7]; \ + _z[ 7] = _z[11] + _y[11]; _z[11] = _z[15] + _y[15]; _z[15] = _t; \ } while (0) /* Various numbers of rounds, unrolled. Read from @y@, and write to @z@. */ @@ -98,7 +129,7 @@ /* Step the counter in the Salsa20 state matrix @a@. */ #define SALSA20_STEP(a) \ - do { (a)[8] = U32((a)[8] + 1); (a)[9] += !(a)[8]; } while (0) + do { (a)[8] = U32((a)[8] + 1); (a)[5] += !(a)[8]; } while (0) /*----- Buffering and output ----------------------------------------------* * diff --git a/symm/salsa20.c b/symm/salsa20.c index c12945ef..0104e5e5 100644 --- a/symm/salsa20.c +++ b/symm/salsa20.c @@ -61,33 +61,42 @@ static void populate(salsa20_matrix a, const void *key, size_t ksz) KSZ_ASSERT(salsa20, ksz); - a[ 1] = LOAD32_L(k + 0); - a[ 2] = LOAD32_L(k + 4); + /* Here's the pattern of key, constant, nonce, and counter pieces in the + * matrix, before and after our permutation. + * + * [ C0 K0 K1 K2 ] [ C0 C1 C2 C3 ] + * [ K3 C1 N0 N1 ] --> [ K3 T1 K7 K2 ] + * [ T0 T1 C2 K4 ] [ T0 K6 K1 N1 ] + * [ K5 K6 K7 C3 ] [ K5 K0 N0 K4 ] + */ + + a[13] = LOAD32_L(k + 0); + a[10] = LOAD32_L(k + 4); if (ksz == 10) { - a[ 3] = LOAD16_L(k + 8); + a[ 7] = LOAD16_L(k + 8); a[ 4] = 0; } else { - a[ 3] = LOAD32_L(k + 8); + a[ 7] = LOAD32_L(k + 8); a[ 4] = LOAD32_L(k + 12); } if (ksz <= 16) { - a[11] = a[ 1]; - a[12] = a[ 2]; - a[13] = a[ 3]; - a[14] = a[ 4]; + a[15] = a[13]; + a[12] = a[10]; + a[ 9] = a[ 7]; + a[ 6] = a[ 4]; a[ 0] = SALSA20_A128; - a[ 5] = SALSA20_B128; - a[10] = ksz == 10 ? SALSA20_C80 : SALSA20_C128; - a[15] = SALSA20_D128; + a[ 1] = SALSA20_B128; + a[ 2] = ksz == 10 ? SALSA20_C80 : SALSA20_C128; + a[ 3] = SALSA20_D128; } else { - a[11] = LOAD32_L(k + 16); + a[15] = LOAD32_L(k + 16); a[12] = LOAD32_L(k + 20); - a[13] = LOAD32_L(k + 24); - a[14] = LOAD32_L(k + 28); + a[ 9] = LOAD32_L(k + 24); + a[ 6] = LOAD32_L(k + 28); a[ 0] = SALSA20_A256; - a[ 5] = SALSA20_B256; - a[10] = SALSA20_C256; - a[15] = SALSA20_D256; + a[ 1] = SALSA20_B256; + a[ 2] = SALSA20_C256; + a[ 3] = SALSA20_D256; } } @@ -130,8 +139,8 @@ void salsa20_setnonce(salsa20_ctx *ctx, const void *nonce) { const octet *n = nonce; - ctx->a[6] = LOAD32_L(n + 0); - ctx->a[7] = LOAD32_L(n + 4); + ctx->a[14] = LOAD32_L(n + 0); + ctx->a[11] = LOAD32_L(n + 4); salsa20_seek(ctx, 0); } @@ -153,7 +162,7 @@ void salsa20_seek(salsa20_ctx *ctx, unsigned long i) void salsa20_seeku64(salsa20_ctx *ctx, kludge64 i) { - ctx->a[8] = LO64(i); ctx->a[9] = HI64(i); + ctx->a[8] = LO64(i); ctx->a[5] = HI64(i); ctx->bufi = SALSA20_OUTSZ; } @@ -169,7 +178,7 @@ unsigned long salsa20_tell(salsa20_ctx *ctx) { kludge64 i = salsa20_tellu64(ctx); return (GET64(unsigned long, i)); } kludge64 salsa20_tellu64(salsa20_ctx *ctx) - { kludge64 i; SET64(i, ctx->a[9], ctx->a[8]); return (i); } + { kludge64 i; SET64(i, ctx->a[5], ctx->a[8]); return (i); } /* --- @salsa20{,12,8}_encrypt@ --- * * @@ -272,10 +281,10 @@ SALSA20_VARS(DEFENCRYPT) * speed critical, so we do it the harder way. \ */ \ \ - for (i = 0; i < 4; i++) k[i + 6] = src[i]; \ + for (i = 0; i < 4; i++) k[14 - 3*i] = src[i]; \ core(r, k, a); \ - for (i = 0; i < 4; i++) dest[i] = a[5*i] - k[5*i]; \ - for (i = 4; i < 8; i++) dest[i] = a[i + 2] - k[i + 2]; \ + for (i = 0; i < 4; i++) dest[i] = a[5*i] - k[i]; \ + for (i = 4; i < 8; i++) dest[i] = a[i + 2] - k[26 - 3*i]; \ } \ \ void HSALSA20_PRF(r, salsa20_ctx *ctx, const void *src, void *dest) \ @@ -340,9 +349,9 @@ SALSA20_VARS(DEFHSALSA20) \ populate(ctx->k, key, ksz); \ ctx->s.a[ 0] = SALSA20_A256; \ - ctx->s.a[ 5] = SALSA20_B256; \ - ctx->s.a[10] = SALSA20_C256; \ - ctx->s.a[15] = SALSA20_D256; \ + ctx->s.a[ 1] = SALSA20_B256; \ + ctx->s.a[ 2] = SALSA20_C256; \ + ctx->s.a[ 3] = SALSA20_D256; \ XSALSA20_SETNONCE(r, ctx, nonce ? nonce : zerononce); \ } SALSA20_VARS(DEFXINIT) @@ -371,8 +380,8 @@ SALSA20_VARS(DEFXINIT) \ for (i = 0; i < 4; i++) in[i] = LOAD32_L(n + 4*i); \ HSALSA20_RAW(r, ctx->k, in, out); \ - for (i = 0; i < 4; i++) ctx->s.a[i + 1] = out[i]; \ - for (i = 4; i < 8; i++) ctx->s.a[i + 7] = out[i]; \ + for (i = 0; i < 4; i++) ctx->s.a[13 - 3*i] = out[i]; \ + for (i = 4; i < 8; i++) ctx->s.a[27 - 3*i] = out[i]; \ salsa20_setnonce(&ctx->s, n + 16); \ } SALSA20_VARS(DEFXNONCE) @@ -730,23 +739,31 @@ SALSA20_VARS(DEFXGRAND) #include #include +static const int perm[] = { + 0, 13, 10, 7, + 4, 1, 14, 11, + 8, 5, 2, 15, + 12, 9, 6, 3 +}; + #define DEFVCORE(r) \ static int v_core_##r(dstr *v) \ { \ salsa20_matrix a, b; \ dstr d = DSTR_INIT; \ - int i, n; \ + int i, j, n; \ int ok = 1; \ \ DENSURE(&d, SALSA20_OUTSZ); d.len = SALSA20_OUTSZ; \ n = *(int *)v[0].buf; \ for (i = 0; i < SALSA20_OUTSZ/4; i++) \ - a[i] = LOAD32_L(v[1].buf + 4*i); \ + b[i] = LOAD32_L(v[1].buf + 4*i); \ for (i = 0; i < n; i++) { \ + for (j = 0; j < 16; j++) a[perm[j]] = b[j]; \ core(r, a, b); \ memcpy(a, b, sizeof(a)); \ } \ - for (i = 0; i < SALSA20_OUTSZ/4; i++) STORE32_L(d.buf + 4*i, a[i]); \ + for (i = 0; i < SALSA20_OUTSZ/4; i++) STORE32_L(d.buf + 4*i, b[i]); \ \ if (d.len != v[2].len || memcmp(d.buf, v[2].buf, v[2].len) != 0) { \ ok = 0; \ -- 2.11.0