| 1 | /* salsa208.c --- The Salsa20/8 stream cipher |
| 2 | * Copyright (C) Mark Wooding |
| 3 | * |
| 4 | * This file is free software; you can redistribute it and/or modify |
| 5 | * it under the terms of the GNU General Public License as published |
| 6 | * by the Free Software Foundation; either version 2, or (at your |
| 7 | * option) any later version. |
| 8 | * |
| 9 | * This file is distributed in the hope that it will be useful, but |
| 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 12 | * General Public License for more details. |
| 13 | * |
| 14 | * You should have received a copy of the GNU General Public License |
| 15 | * along with this file; if not, write to the Free Software |
| 16 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA |
| 17 | * 02110-1301, USA. |
| 18 | * |
| 19 | */ |
| 20 | /** @file lib/salsa208.c |
| 21 | * @brief Salsa20/8 stream cipher implementation |
| 22 | * |
| 23 | * For a description of the algorithm, see: |
| 24 | * |
| 25 | * Daniel J. Bernstein, `The Salsa20 family of stream ciphers', in Matthew |
| 26 | * Robshaw and Olivier Billet (eds.), `New Stream Cipher Designs', |
| 27 | * Springer--Verlag 2008, pp. 84--97; |
| 28 | * http://cr.yp.to/snuffle/salsafamily-20071225.pdf |
| 29 | * |
| 30 | * As far as I know, the best attack against all 8 rounds of Salsa20/8 is by |
| 31 | * Aumasson, Fischer, Khazaei, Meier, and Rechberger, which takes 2^251 |
| 32 | * operations to recover a 256-bit key, which is hopelessly impractical. |
| 33 | * Much more effective attacks are known against Salsa20/7, so we would have |
| 34 | * a tiny security margin if we were trying for security -- but we aren't. |
| 35 | * Instead, we want high-quality randomness for queue ids and for selecting |
| 36 | * random tracks. (The cookie machinery, which does want cryptographic |
| 37 | * security, makes its own arrangements.) Specifically, the intention is to |
| 38 | * replace RC4, which (a) is slow because it has a long dependency chain |
| 39 | * which plays badly with the deep pipelines in modern CPUs, and (b) has |
| 40 | * well-known and rather embarassing biases. On the other hand, Salsa20/8 |
| 41 | * has no known biases, and admits considerable instruction-level |
| 42 | * parallelism. In practice, Salsa20/8 is about 30% faster than RC4 even |
| 43 | * without a fancy SIMD implementation (which is good, because this isn't one |
| 44 | * of those); a vectorized implementation acting on multiple blocks at a time |
| 45 | * would be even faster. |
| 46 | * |
| 47 | * Salsa20/8 has a number of other attractive features, such as being |
| 48 | * trivially seekable, but we don't need those here and the necessary |
| 49 | * machinery is not implemented. |
| 50 | */ |
| 51 | |
| 52 | #include <assert.h> |
| 53 | #include <string.h> |
| 54 | |
| 55 | #include "salsa208.h" |
| 56 | |
| 57 | static inline uint32_t ld16(const void *p) { |
| 58 | const unsigned char *q = p; |
| 59 | return ((uint32_t)q[0] << 0) | ((uint32_t)q[1] << 8); |
| 60 | } |
| 61 | |
| 62 | static inline uint32_t ld32(const void *p) { |
| 63 | const unsigned char *q = p; |
| 64 | return ((uint32_t)q[0] << 0) | ((uint32_t)q[1] << 8) | |
| 65 | ((uint32_t)q[2] << 16) | ((uint32_t)q[3] << 24); |
| 66 | } |
| 67 | |
| 68 | static inline void st32(void *p, uint32_t x) { |
| 69 | unsigned char *q = p; |
| 70 | q[0] = (x >> 0)&0xff; q[1] = (x >> 8)&0xff; |
| 71 | q[2] = (x >> 16)&0xff; q[3] = (x >> 24)&0xff; |
| 72 | } |
| 73 | |
| 74 | static inline uint32_t rol32(uint32_t x, unsigned n) |
| 75 | { return (x << n) | (x >> (32 - n)); } |
| 76 | |
| 77 | static inline void quarterround(uint32_t m[16], int a, int b, int c, int d) { |
| 78 | m[b] ^= rol32(m[a] + m[d], 7); m[c] ^= rol32(m[b] + m[a], 9); |
| 79 | m[d] ^= rol32(m[c] + m[b], 13); m[a] ^= rol32(m[d] + m[c], 18); |
| 80 | } |
| 81 | |
| 82 | static void core(salsa208_context *context) { |
| 83 | unsigned i; |
| 84 | uint32_t t[16]; |
| 85 | |
| 86 | /* Copy the state. */ |
| 87 | for(i = 0; i < 16; i++) t[i] = context->m[i]; |
| 88 | |
| 89 | /* Hack on the state. */ |
| 90 | for(i = 0; i < 4; i++) { |
| 91 | |
| 92 | /* Vertical quarter-rounds. */ |
| 93 | quarterround(t, 0, 4, 8, 12); |
| 94 | quarterround(t, 5, 9, 13, 1); |
| 95 | quarterround(t, 10, 14, 2, 6); |
| 96 | quarterround(t, 15, 3, 7, 11); |
| 97 | |
| 98 | /* Horizontal quarter-rounds. */ |
| 99 | quarterround(t, 0, 1, 2, 3); |
| 100 | quarterround(t, 5, 6, 7, 4); |
| 101 | quarterround(t, 10, 11, 8, 9); |
| 102 | quarterround(t, 15, 12, 13, 14); |
| 103 | } |
| 104 | |
| 105 | /* Final feedforward. */ |
| 106 | for(i = 0; i < 16; i++) t[i] += context->m[i]; |
| 107 | |
| 108 | /* Output. */ |
| 109 | for(i = 0; i < 16; i++) st32(context->buf + 4*i, t[i]); |
| 110 | } |
| 111 | |
| 112 | static inline void xorbuf(void *z, const void *x, const void *y, size_t sz) { |
| 113 | unsigned char *zz = z; |
| 114 | const unsigned char *xx = x, *yy = y; |
| 115 | |
| 116 | if(!xx) memcpy(zz, yy, sz); |
| 117 | else while(sz--) *zz++ = *xx++ ^ *yy++; |
| 118 | } |
| 119 | |
| 120 | static inline void step(salsa208_context *context) |
| 121 | { if(!++context->m[8]) context->m[9]++; } |
| 122 | |
| 123 | /** @brief Encrypt or decrypt data using Salsa20/8. |
| 124 | * |
| 125 | * @param context The Salsa20/8 context, initialized using salsa208_setkey(). |
| 126 | * @param inbuf Pointer to input buffer |
| 127 | * @param outbuf Pointer to output buffer |
| 128 | * @param length Common size of both buffers |
| 129 | * |
| 130 | * Encrypt or decrypt (the operations are the same) @p length bytes of input |
| 131 | * data, writing the result, of the same length, to @p outbuf. The input and |
| 132 | * output pointers may be equal; the two buffers may not otherwise overlap. |
| 133 | * |
| 134 | * If @p inbuf is null, then simply write the next @p length bytes of |
| 135 | * Salsa20/8 output to @p outbuf. |
| 136 | */ |
| 137 | void salsa208_stream(salsa208_context *context, |
| 138 | const void *inbuf, void *outbuf, size_t length) { |
| 139 | size_t left = 64 - context->i; |
| 140 | unsigned char *z = outbuf; |
| 141 | const unsigned char *x = inbuf; |
| 142 | |
| 143 | /* If we can satisfy the request from our buffer then we should do that. */ |
| 144 | if(length <= left) { |
| 145 | xorbuf(z, x, context->buf + context->i, length); |
| 146 | context->i += length; |
| 147 | return; |
| 148 | } |
| 149 | |
| 150 | /* Drain the buffer of what we currently have and cycle the state. */ |
| 151 | xorbuf(z, x, context->buf + context->i, left); |
| 152 | length -= left; z += left; if(x) x += left; |
| 153 | core(context); step(context); |
| 154 | |
| 155 | /* Take multiple complete blocks directly. */ |
| 156 | while(length > 64) { |
| 157 | xorbuf(z, x, context->buf, 64); |
| 158 | length -= 64; z += 64; if(x) x += 64; |
| 159 | core(context); step(context); |
| 160 | } |
| 161 | |
| 162 | /* And the final tail end. */ |
| 163 | xorbuf(z, x, context->buf, length); |
| 164 | context->i = length; |
| 165 | } |
| 166 | |
| 167 | /** @brief Initialize a Salsa20/8 context. |
| 168 | * |
| 169 | * @param context The Salsa20/8 context to initialize |
| 170 | * @param key A pointer to the key material |
| 171 | * @param keylen The length of the key data, in bytes (must be 10, 16, or 32) |
| 172 | * |
| 173 | * The context is implicitly initialized with a zero nonce, which is fine if |
| 174 | * the key will be used only for a single message. Otherwise, a fresh nonce |
| 175 | * should be chosen somehow and set using salsa208_setnonce(). |
| 176 | */ |
| 177 | void salsa208_setkey(salsa208_context *context, |
| 178 | const void *key, size_t keylen) { |
| 179 | const unsigned char *k = key; |
| 180 | switch(keylen) { |
| 181 | case 32: |
| 182 | context->m[ 0] = 0x61707865; |
| 183 | context->m[ 1] = ld32(k + 0); |
| 184 | context->m[ 2] = ld32(k + 4); |
| 185 | context->m[ 3] = ld32(k + 8); |
| 186 | context->m[ 4] = ld32(k + 12); |
| 187 | context->m[ 5] = 0x3320646e; |
| 188 | context->m[ 6] = 0; |
| 189 | context->m[ 7] = 0; |
| 190 | context->m[ 8] = 0; |
| 191 | context->m[ 9] = 0; |
| 192 | context->m[10] = 0x79622d32; |
| 193 | context->m[11] = ld32(k + 16); |
| 194 | context->m[12] = ld32(k + 20); |
| 195 | context->m[13] = ld32(k + 24); |
| 196 | context->m[14] = ld32(k + 28); |
| 197 | context->m[15] = 0x6b206574; |
| 198 | break; |
| 199 | case 16: |
| 200 | context->m[ 0] = 0x61707865; |
| 201 | context->m[ 1] = ld32(k + 0); |
| 202 | context->m[ 2] = ld32(k + 4); |
| 203 | context->m[ 3] = ld32(k + 8); |
| 204 | context->m[ 4] = ld32(k + 12); |
| 205 | context->m[ 5] = 0x3120646e; |
| 206 | context->m[ 6] = 0; |
| 207 | context->m[ 7] = 0; |
| 208 | context->m[ 8] = 0; |
| 209 | context->m[ 9] = 0; |
| 210 | context->m[10] = 0x79622d36; |
| 211 | context->m[11] = context->m[1]; |
| 212 | context->m[12] = context->m[2]; |
| 213 | context->m[13] = context->m[3]; |
| 214 | context->m[14] = context->m[4]; |
| 215 | context->m[15] = 0x6b206574; |
| 216 | break; |
| 217 | case 10: |
| 218 | context->m[ 0] = 0x61707865; |
| 219 | context->m[ 1] = ld32(k + 0); |
| 220 | context->m[ 2] = ld32(k + 4); |
| 221 | context->m[ 3] = ld16(k + 8); |
| 222 | context->m[ 4] = 0; |
| 223 | context->m[ 5] = 0x3120646e; |
| 224 | context->m[ 6] = 0; |
| 225 | context->m[ 7] = 0; |
| 226 | context->m[ 8] = 0; |
| 227 | context->m[ 9] = 0; |
| 228 | context->m[10] = 0x79622d30; |
| 229 | context->m[11] = context->m[1]; |
| 230 | context->m[12] = context->m[2]; |
| 231 | context->m[13] = context->m[3]; |
| 232 | context->m[14] = 0; |
| 233 | context->m[15] = 0x6b206574; |
| 234 | break; |
| 235 | default: |
| 236 | assert(!"bad Salsa20 key length"); |
| 237 | } |
| 238 | context->i = 64; |
| 239 | } |
| 240 | |
| 241 | /** @brief Set the Salsa20/8 nonce. |
| 242 | * |
| 243 | * @param context The Salsa20/8 context |
| 244 | * @param nonce A pointer to the nonce |
| 245 | * @param noncelen The length of the nonce data, in bytes (must be exactly 8) |
| 246 | * |
| 247 | * The context is automatically rewound to the start of the stream |
| 248 | * corresponding to this nonce. |
| 249 | */ |
| 250 | void salsa208_setnonce(salsa208_context *context, |
| 251 | const void *nonce, size_t noncelen) { |
| 252 | const unsigned char *n = nonce; |
| 253 | assert(noncelen == 8); |
| 254 | context->m[6] = ld32(n + 0); |
| 255 | context->m[7] = ld32(n + 4); |
| 256 | context->m[8] = context->m[9] = 0; |
| 257 | context->i = 64; |
| 258 | } |