Commit | Line | Data |
---|---|---|
ed8e4373 MW |
1 | /* salsa208.c --- The Salsa20/8 stream cipher |
2 | * Copyright (C) Mark Wooding | |
3 | * | |
4 | * This file is free software; you can redistribute it and/or modify | |
5 | * it under the terms of the GNU General Public License as published | |
6 | * by the Free Software Foundation; either version 2, or (at your | |
7 | * option) any later version. | |
8 | * | |
9 | * This file is distributed in the hope that it will be useful, but | |
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
12 | * General Public License for more details. | |
13 | * | |
14 | * You should have received a copy of the GNU General Public License | |
15 | * along with this file; if not, write to the Free Software | |
16 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA | |
17 | * 02110-1301, USA. | |
18 | * | |
19 | */ | |
20 | /** @file lib/salsa208.c | |
21 | * @brief Salsa20/8 stream cipher implementation | |
22 | * | |
23 | * For a description of the algorithm, see: | |
24 | * | |
25 | * Daniel J. Bernstein, `The Salsa20 family of stream ciphers', in Matthew | |
26 | * Robshaw and Olivier Billet (eds.), `New Stream Cipher Designs', | |
27 | * Springer--Verlag 2008, pp. 84--97; | |
28 | * http://cr.yp.to/snuffle/salsafamily-20071225.pdf | |
29 | * | |
30 | * As far as I know, the best attack against all 8 rounds of Salsa20/8 is by | |
31 | * Aumasson, Fischer, Khazaei, Meier, and Rechberger, which takes 2^251 | |
32 | * operations to recover a 256-bit key, which is hopelessly impractical. | |
33 | * Much more effective attacks are known against Salsa20/7, so we would have | |
34 | * a tiny security margin if we were trying for security -- but we aren't. | |
35 | * Instead, we want high-quality randomness for queue ids and for selecting | |
36 | * random tracks. (The cookie machinery, which does want cryptographic | |
37 | * security, makes its own arrangements.) Specifically, the intention is to | |
38 | * replace RC4, which (a) is slow because it has a long dependency chain | |
39 | * which plays badly with the deep pipelines in modern CPUs, and (b) has | |
40 | * well-known and rather embarassing biases. On the other hand, Salsa20/8 | |
41 | * has no known biases, and admits considerable instruction-level | |
42 | * parallelism. In practice, Salsa20/8 is about 30% faster than RC4 even | |
43 | * without a fancy SIMD implementation (which is good, because this isn't one | |
44 | * of those); a vectorized implementation acting on multiple blocks at a time | |
45 | * would be even faster. | |
46 | * | |
47 | * Salsa20/8 has a number of other attractive features, such as being | |
48 | * trivially seekable, but we don't need those here and the necessary | |
49 | * machinery is not implemented. | |
50 | */ | |
51 | ||
52 | #include <assert.h> | |
53 | #include <string.h> | |
54 | ||
55 | #include "salsa208.h" | |
56 | ||
57 | static inline uint32_t ld16(const void *p) { | |
58 | const unsigned char *q = p; | |
59 | return ((uint32_t)q[0] << 0) | ((uint32_t)q[1] << 8); | |
60 | } | |
61 | ||
62 | static inline uint32_t ld32(const void *p) { | |
63 | const unsigned char *q = p; | |
64 | return ((uint32_t)q[0] << 0) | ((uint32_t)q[1] << 8) | | |
65 | ((uint32_t)q[2] << 16) | ((uint32_t)q[3] << 24); | |
66 | } | |
67 | ||
68 | static inline void st32(void *p, uint32_t x) { | |
69 | unsigned char *q = p; | |
70 | q[0] = (x >> 0)&0xff; q[1] = (x >> 8)&0xff; | |
71 | q[2] = (x >> 16)&0xff; q[3] = (x >> 24)&0xff; | |
72 | } | |
73 | ||
74 | static inline uint32_t rol32(uint32_t x, unsigned n) | |
75 | { return (x << n) | (x >> (32 - n)); } | |
76 | ||
77 | static inline void quarterround(uint32_t m[16], int a, int b, int c, int d) { | |
78 | m[b] ^= rol32(m[a] + m[d], 7); m[c] ^= rol32(m[b] + m[a], 9); | |
79 | m[d] ^= rol32(m[c] + m[b], 13); m[a] ^= rol32(m[d] + m[c], 18); | |
80 | } | |
81 | ||
82 | static void core(salsa208_context *context) { | |
83 | unsigned i; | |
84 | uint32_t t[16]; | |
85 | ||
86 | /* Copy the state. */ | |
87 | for(i = 0; i < 16; i++) t[i] = context->m[i]; | |
88 | ||
89 | /* Hack on the state. */ | |
90 | for(i = 0; i < 4; i++) { | |
91 | ||
92 | /* Vertical quarter-rounds. */ | |
93 | quarterround(t, 0, 4, 8, 12); | |
94 | quarterround(t, 5, 9, 13, 1); | |
95 | quarterround(t, 10, 14, 2, 6); | |
96 | quarterround(t, 15, 3, 7, 11); | |
97 | ||
98 | /* Horizontal quarter-rounds. */ | |
99 | quarterround(t, 0, 1, 2, 3); | |
100 | quarterround(t, 5, 6, 7, 4); | |
101 | quarterround(t, 10, 11, 8, 9); | |
102 | quarterround(t, 15, 12, 13, 14); | |
103 | } | |
104 | ||
105 | /* Final feedforward. */ | |
106 | for(i = 0; i < 16; i++) t[i] += context->m[i]; | |
107 | ||
108 | /* Output. */ | |
109 | for(i = 0; i < 16; i++) st32(context->buf + 4*i, t[i]); | |
110 | } | |
111 | ||
112 | static inline void xorbuf(void *z, const void *x, const void *y, size_t sz) { | |
113 | unsigned char *zz = z; | |
114 | const unsigned char *xx = x, *yy = y; | |
115 | ||
116 | if(!xx) memcpy(zz, yy, sz); | |
117 | else while(sz--) *zz++ = *xx++ ^ *yy++; | |
118 | } | |
119 | ||
120 | static inline void step(salsa208_context *context) | |
121 | { if(!++context->m[8]) context->m[9]++; } | |
122 | ||
123 | /** @brief Encrypt or decrypt data using Salsa20/8. | |
124 | * | |
125 | * @param context The Salsa20/8 context, initialized using salsa208_setkey(). | |
126 | * @param inbuf Pointer to input buffer | |
127 | * @param outbuf Pointer to output buffer | |
128 | * @param length Common size of both buffers | |
129 | * | |
130 | * Encrypt or decrypt (the operations are the same) @p length bytes of input | |
131 | * data, writing the result, of the same length, to @p outbuf. The input and | |
132 | * output pointers may be equal; the two buffers may not otherwise overlap. | |
133 | * | |
134 | * If @p inbuf is null, then simply write the next @p length bytes of | |
135 | * Salsa20/8 output to @p outbuf. | |
136 | */ | |
137 | void salsa208_stream(salsa208_context *context, | |
138 | const void *inbuf, void *outbuf, size_t length) { | |
139 | size_t left = 64 - context->i; | |
140 | unsigned char *z = outbuf; | |
141 | const unsigned char *x = inbuf; | |
142 | ||
143 | /* If we can satisfy the request from our buffer then we should do that. */ | |
144 | if(length <= left) { | |
145 | xorbuf(z, x, context->buf + context->i, length); | |
146 | context->i += length; | |
147 | return; | |
148 | } | |
149 | ||
150 | /* Drain the buffer of what we currently have and cycle the state. */ | |
151 | xorbuf(z, x, context->buf + context->i, left); | |
152 | length -= left; z += left; if(x) x += left; | |
153 | core(context); step(context); | |
154 | ||
155 | /* Take multiple complete blocks directly. */ | |
156 | while(length > 64) { | |
157 | xorbuf(z, x, context->buf, 64); | |
158 | length -= 64; z += 64; if(x) x += 64; | |
159 | core(context); step(context); | |
160 | } | |
161 | ||
162 | /* And the final tail end. */ | |
163 | xorbuf(z, x, context->buf, length); | |
164 | context->i = length; | |
165 | } | |
166 | ||
167 | /** @brief Initialize a Salsa20/8 context. | |
168 | * | |
169 | * @param context The Salsa20/8 context to initialize | |
170 | * @param key A pointer to the key material | |
171 | * @param keylen The length of the key data, in bytes (must be 10, 16, or 32) | |
172 | * | |
173 | * The context is implicitly initialized with a zero nonce, which is fine if | |
174 | * the key will be used only for a single message. Otherwise, a fresh nonce | |
175 | * should be chosen somehow and set using salsa208_setnonce(). | |
176 | */ | |
177 | void salsa208_setkey(salsa208_context *context, | |
178 | const void *key, size_t keylen) { | |
179 | const unsigned char *k = key; | |
180 | switch(keylen) { | |
181 | case 32: | |
182 | context->m[ 0] = 0x61707865; | |
183 | context->m[ 1] = ld32(k + 0); | |
184 | context->m[ 2] = ld32(k + 4); | |
185 | context->m[ 3] = ld32(k + 8); | |
186 | context->m[ 4] = ld32(k + 12); | |
187 | context->m[ 5] = 0x3320646e; | |
188 | context->m[ 6] = 0; | |
189 | context->m[ 7] = 0; | |
190 | context->m[ 8] = 0; | |
191 | context->m[ 9] = 0; | |
192 | context->m[10] = 0x79622d32; | |
193 | context->m[11] = ld32(k + 16); | |
194 | context->m[12] = ld32(k + 20); | |
195 | context->m[13] = ld32(k + 24); | |
196 | context->m[14] = ld32(k + 28); | |
197 | context->m[15] = 0x6b206574; | |
198 | break; | |
199 | case 16: | |
200 | context->m[ 0] = 0x61707865; | |
201 | context->m[ 1] = ld32(k + 0); | |
202 | context->m[ 2] = ld32(k + 4); | |
203 | context->m[ 3] = ld32(k + 8); | |
204 | context->m[ 4] = ld32(k + 12); | |
205 | context->m[ 5] = 0x3120646e; | |
206 | context->m[ 6] = 0; | |
207 | context->m[ 7] = 0; | |
208 | context->m[ 8] = 0; | |
209 | context->m[ 9] = 0; | |
210 | context->m[10] = 0x79622d36; | |
211 | context->m[11] = context->m[1]; | |
212 | context->m[12] = context->m[2]; | |
213 | context->m[13] = context->m[3]; | |
214 | context->m[14] = context->m[4]; | |
215 | context->m[15] = 0x6b206574; | |
216 | break; | |
217 | case 10: | |
218 | context->m[ 0] = 0x61707865; | |
219 | context->m[ 1] = ld32(k + 0); | |
220 | context->m[ 2] = ld32(k + 4); | |
221 | context->m[ 3] = ld16(k + 8); | |
222 | context->m[ 4] = 0; | |
223 | context->m[ 5] = 0x3120646e; | |
224 | context->m[ 6] = 0; | |
225 | context->m[ 7] = 0; | |
226 | context->m[ 8] = 0; | |
227 | context->m[ 9] = 0; | |
228 | context->m[10] = 0x79622d30; | |
229 | context->m[11] = context->m[1]; | |
230 | context->m[12] = context->m[2]; | |
231 | context->m[13] = context->m[3]; | |
232 | context->m[14] = 0; | |
233 | context->m[15] = 0x6b206574; | |
234 | break; | |
235 | default: | |
236 | assert(!"bad Salsa20 key length"); | |
237 | } | |
238 | context->i = 64; | |
239 | } | |
240 | ||
241 | /** @brief Set the Salsa20/8 nonce. | |
242 | * | |
243 | * @param context The Salsa20/8 context | |
244 | * @param nonce A pointer to the nonce | |
245 | * @param noncelen The length of the nonce data, in bytes (must be exactly 8) | |
246 | * | |
247 | * The context is automatically rewound to the start of the stream | |
248 | * corresponding to this nonce. | |
249 | */ | |
250 | void salsa208_setnonce(salsa208_context *context, | |
251 | const void *nonce, size_t noncelen) { | |
252 | const unsigned char *n = nonce; | |
253 | assert(noncelen == 8); | |
254 | context->m[6] = ld32(n + 0); | |
255 | context->m[7] = ld32(n + 4); | |
256 | context->m[8] = context->m[9] = 0; | |
257 | context->i = 64; | |
258 | } |