Commit | Line | Data |
---|---|---|
50df5733 MW |
1 | /* -*-c-*- |
2 | * | |
3 | * The GCM authenticated encryption mode | |
4 | * | |
5 | * (c) 2017 Straylight/Edgeware | |
6 | */ | |
7 | ||
8 | /*----- Licensing notice --------------------------------------------------* | |
9 | * | |
10 | * This file is part of Catacomb. | |
11 | * | |
12 | * Catacomb is free software: you can redistribute it and/or modify it | |
13 | * under the terms of the GNU Library General Public License as published | |
14 | * by the Free Software Foundation; either version 2 of the License, or | |
15 | * (at your option) any later version. | |
16 | * | |
17 | * Catacomb is distributed in the hope that it will be useful, but | |
18 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
20 | * Library General Public License for more details. | |
21 | * | |
22 | * You should have received a copy of the GNU Library General Public | |
23 | * License along with Catacomb. If not, write to the Free Software | |
24 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, | |
25 | * USA. | |
26 | */ | |
27 | ||
28 | /*----- Header files ------------------------------------------------------*/ | |
29 | ||
30 | #include "config.h" | |
31 | ||
32 | #include <stdio.h> | |
33 | ||
34 | #include <mLib/bits.h> | |
35 | ||
1d30a9b9 | 36 | #include "dispatch.h" |
50df5733 MW |
37 | #include "gcm.h" |
38 | #include "gcm-def.h" | |
39 | ||
40 | /*----- Overall strategy --------------------------------------------------* | |
41 | * | |
42 | * GCM is pretty awful to implement in software. (This presentation is going | |
43 | * to be somewhat different to that in the specification, but I think it | |
44 | * makes more sense like this.) | |
45 | * | |
46 | * We're given a %$w$%-bit blockcipher %$E$% with a key %$K$%. | |
47 | * | |
48 | * The main part is arithmetic in the finite field %$k = \gf{2^w}$%, which we | |
49 | * represent as the quotient ring %$\gf{2}[t]/(p_w(t))$% for some irreducible | |
50 | * degree-%$w$% polynomial %$p(t)$%, whose precise value isn't very important | |
51 | * right now. We choose a secret point %$x = E_K(0^w)$%. | |
52 | * | |
53 | * We choose a length size %$z$% as follows: if %$w < 96%$ then %$z = w$%; | |
54 | * otherwise %$z = w/2$%. Format a message pair as follows: | |
55 | * | |
56 | * %$F(a, b) = P_w(a) \cat P_w(b) \cat [\ell(a)]_z \cat [\ell(b)]_z$% | |
57 | * | |
58 | * where %$P_w(x) = x \cat 0^n$% where $%0 \le n < w$% such that | |
59 | * %$\ell(x) + n \equiv 0 \pmod{w}$%. | |
60 | * | |
61 | * Hash a (block-aligned) message %$u$% as follows. First, split %$u$% into | |
62 | * %$w$%-bit blocks %$u_0$%, %$u_1$%, %%\ldots%%, %$u_{n-1}$%. Interpret | |
63 | * these as elements of %$k$%. Then | |
64 | * | |
65 | * %$G_x(u) = u_0 t^n + u_1 t^{n-1} + \cdots + u_{n-1} t$% | |
66 | * | |
67 | * converted back to a %$w$%-bit string. | |
68 | * | |
69 | * We're ready to go now. Suppose we're to encrypt a message %$M$% with | |
70 | * header %$H$% and nonce %$N$%. If %$\ell(N) + 32 = w$% then let | |
71 | * %$N' = N$% and let %$i_0 = 1$%; otherwise, let %$U = G_t(F(\epsilon, N))$% | |
72 | * and split this into %$N' = U[0 \bitsto w - 32]$% and | |
73 | * %$[i_0]_{32} = U[w - 32 \bitsto w]$%. | |
74 | * | |
75 | * Let %$n = \lceil \ell(M)/w \rceil$%. Compute | |
76 | * | |
77 | * %$y_j = E_K(N' \cat [i_0 + j]_{32})$% | |
78 | * | |
79 | * for %$0 \le j \le n$%. Let | |
80 | * | |
81 | * %$s = (y_1 \cat y_2 \cat \cdots \cat y_n)[0 \bitsto \ell(M)$% | |
82 | * | |
83 | * Let %$C = M \xor s$% and let %$T = G_x(F(H, C)) \xor y_0$%. These are the | |
84 | * ciphertext and tag respectively. | |
85 | * | |
86 | * So why is this awful? | |
87 | * | |
88 | * For one thing, the bits are in a completely terrible order. The bytes are | |
89 | * arranged in little-endian order, so the unit coefficient is in the first | |
90 | * byte, and the degree-127 coefficient is in the last byte. But within each | |
91 | * byte, the lowest-degree coefficient is in the most significant bit. It's | |
92 | * therefore better to think of GCM as using a big-endian byte-ordering | |
93 | * convention, but with the bits backwards. | |
94 | * | |
95 | * But messing about with byte ordering is expensive, so let's not do that in | |
96 | * the inner loop. But multiplication in %$k$% is not easy either. Some | |
97 | * kind of precomputed table would be nice, but that will leak secrets | |
98 | * through the cache. | |
99 | * | |
100 | * I choose a particularly simple table: given %$x$%, let %$X[i'] = x t^i$%. | |
101 | * Then $%$x y = \sum_{0\le i<w} y_i X[i']$% which is just a bunch of | |
102 | * bitmasking. But the natural order for examining bits of %$y$% is not | |
103 | * necessarily the obvious one. We'll have already loaded %$y$% into | |
104 | * internal form, as 32-bit words. The good order to process these is left | |
105 | * to right, from high to low bits. But now the order of degrees depends on | |
106 | * the endianness of our conversion of bytes to words. Oh, well. | |
107 | * | |
108 | * If we've adopted a big-endian convention, then we'll see the degrees in | |
109 | * order, 0, 1, ..., all the way up to %$w - 1$% and everything is fine. If | |
110 | * we've adopted a little-endian convention, though, we'll see an ordering | |
111 | * like this: | |
112 | * | |
113 | * 24, 25, ..., 31, 16, 17, ..., 23, 8, 9, ..., 15, 0, 1, ..., 7, | |
114 | * 56, 57, ..., 63, 48, 49, ..., 55, 40, 41, ..., 47, 32, 33, ..., 39, | |
115 | * etc. | |
116 | * | |
117 | * which is the ordinary order with 0x18 = 24 XORed into the index. That is, | |
118 | * %$i' = i$% if we've adopted a big-endian convention, and | |
119 | * %$i' = i \xor 24$% if we've adopted a little-endian convention. | |
120 | */ | |
121 | ||
122 | /*----- Low-level utilities -----------------------------------------------*/ | |
123 | ||
ac4a43c1 | 124 | /* --- @mult@, @divt@ --- * |
50df5733 MW |
125 | * |
126 | * Arguments: @const gcm_params *p@ = pointer to the parameters | |
127 | * @uint32 *z@ = where to write the result | |
128 | * @const uint32 *x@ = input field element | |
129 | * | |
130 | * Returns: --- | |
131 | * | |
ac4a43c1 MW |
132 | * Use: Multiply or divide the input field element by %$t$%, and |
133 | * write the product or quotient to @z@. It's safe for @x@ and | |
134 | * @z@ to be equal, but they should not otherwise overlap. Both | |
135 | * input and output are in big-endian form, i.e., with the | |
136 | * lowest-degree coefficients in the most significant bits. | |
50df5733 MW |
137 | */ |
138 | ||
139 | static void mult(const gcm_params *p, uint32 *z, const uint32 *x) | |
140 | { | |
141 | uint32 m, c, t; | |
142 | unsigned i; | |
143 | ||
144 | t = x[p->n - 1]; m = -(t&1u); c = m&p->poly; | |
145 | for (i = 0; i < p->n; i++) { t = x[i]; z[i] = (t >> 1) ^ c; c = t << 31; } | |
146 | } | |
147 | ||
ac4a43c1 MW |
148 | #if CPUFAM_X86 || CPUFAM_AMD64 || CPUFAM_ARMEL |
149 | static void divt(const gcm_params *p, uint32 *z, const uint32 *x) | |
150 | { | |
151 | uint32 m, c, t; | |
152 | unsigned i; | |
153 | ||
154 | t = x[0]; m = -((t >> 31)&1u); c = m&1u; | |
155 | for (i = p->n - 1; i; i--) { t = x[i]; z[i] = (t << 1) | c; c = t >> 31; } | |
156 | t = x[0]; z[0] = ((t ^ (m&p->poly)) << 1) | c; | |
157 | } | |
158 | #endif | |
159 | ||
50df5733 MW |
160 | /* --- @mul@ --- * |
161 | * | |
162 | * Arguments: @const gcm_params *p@ = pointer to the parameters | |
163 | * @uint32 *z@ = where to write the result | |
164 | * @const uint32 *x, *y@ = input field elements | |
165 | * | |
166 | * Returns: --- | |
167 | * | |
168 | * Use: Multiply the input field elements together, and write the | |
169 | * product to @z@. It's safe for the operands to overlap. Both | |
170 | * inputs and the output are in big-endian form, i.e., with the | |
171 | * lowest-degree coefficients in the most significant bits. | |
172 | */ | |
173 | ||
174 | static void mul(const gcm_params *p, uint32 *z, | |
175 | const uint32 *x, const uint32 *y) | |
176 | { | |
177 | uint32 m, t, u[GCM_NMAX], v[GCM_NMAX]; | |
178 | unsigned i, j, k; | |
179 | ||
180 | /* We can't do this in-place at all, so use temporary space. Make a copy | |
181 | * of @x@ in @u@, where we can clobber it, and build the product in @v@. | |
182 | */ | |
183 | for (i = 0; i < p->n; i++) { u[i] = x[i]; v[i] = 0; } | |
184 | ||
185 | /* Repeatedly multiply @x@ (in @u@) by %$t$%, and add together those | |
186 | * %$x t^i$% selected by the bits of @y@. This is basically what you get | |
187 | * by streaming the result of @gcm_mktable@ into @gcm_mulk_...@. | |
188 | */ | |
189 | for (i = 0; i < p->n; i++) { | |
190 | t = y[i]; | |
191 | for (j = 0; j < 32; j++) { | |
192 | m = -((t >> 31)&1u); | |
193 | for (k = 0; k < p->n; k++) v[k] ^= u[k]&m; | |
194 | mult(p, u, u); t <<= 1; | |
195 | } | |
196 | } | |
197 | ||
198 | /* Write out the result now that it's ready. */ | |
199 | for (i = 0; i < p->n; i++) z[i] = v[i]; | |
200 | } | |
201 | ||
202 | /*----- Table-based multiplication ----------------------------------------*/ | |
203 | ||
204 | /* --- @gcm_mktable@ --- * | |
205 | * | |
206 | * Arguments: @const gcm_params *p@ = pointer to the parameters | |
207 | * @uint32 *ktab@ = where to write the table; there must be | |
208 | * space for %$32 n$% $%n$%-word entries, i.e., | |
209 | * %$32 n^2$% 32-bit words in total, where %$n$% is | |
210 | * @p->n@, the block size in words | |
211 | * @const uint32 *k@ = input field element | |
212 | * | |
213 | * Returns: --- | |
214 | * | |
215 | * Use: Construct a table for use by @gcm_mulk_...@ below, to | |
216 | * multiply (vaguely) efficiently by @k@. | |
217 | */ | |
218 | ||
1d30a9b9 MW |
219 | static void simple_mktable(const gcm_params *p, |
220 | uint32 *ktab, const uint32 *k) | |
50df5733 MW |
221 | { |
222 | unsigned m = (p->f&GCMF_SWAP ? 0x18 : 0); | |
223 | unsigned i, j, o = m*p->n; | |
224 | ||
225 | /* As described above, the table stores entries %$K[i \xor m] = k t^i$%, | |
226 | * where %$m = 0$% (big-endian cipher) or %$m = 24$% (little-endian). | |
227 | * The first job is to store %$K[m] = k$%. | |
228 | * | |
229 | * We initially build the table with the entries in big-endian order, and | |
230 | * then swap them if necessary. This makes the arithmetic functions more | |
231 | * amenable for use by @gcm_concat@ below. | |
232 | */ | |
233 | if (!(p->f&GCMF_SWAP)) for (i = 0; i < p->n; i++) ktab[o + i] = k[i]; | |
234 | else for (i = 0; i < p->n; i++) ktab[o + i] = ENDSWAP32(k[i]); | |
235 | ||
236 | /* Fill in the rest of the table by repeatedly multiplying the previous | |
237 | * entry by %$t$%. | |
238 | */ | |
239 | for (i = 1; i < 32*p->n; i++) | |
240 | { j = (i ^ m)*p->n; mult(p, ktab + j, ktab + o); o = j; } | |
241 | ||
242 | /* Finally, if the cipher uses a little-endian convention, then swap all of | |
243 | * the individual words. | |
244 | */ | |
245 | if (p->f&GCMF_SWAP) | |
246 | for (i = 0; i < 32*p->n*p->n; i++) ktab[i] = ENDSWAP32(ktab[i]); | |
247 | } | |
248 | ||
9e6a4409 MW |
249 | #if CPUFAM_X86 || CPUFAM_AMD64 |
250 | static void pclmul_mktable(const gcm_params *p, | |
251 | uint32 *ktab, const uint32 *k) | |
252 | { | |
ac4a43c1 | 253 | unsigned i, n = p->n; |
9e6a4409 | 254 | unsigned nz; |
ac4a43c1 | 255 | uint32 k_over_t[GCM_NMAX], *t; |
9e6a4409 | 256 | |
ac4a43c1 MW |
257 | /* We need to divide the value by t (to compensate for the one-bit shift |
258 | * resulting from GCM's backwards bit ordering) and store the value in a | |
259 | * way which is convenient for the assembler code to read back. That | |
260 | * involves reordering the words, and, in the case of 96-bit blocks, | |
261 | * padding with zeroes to fill out a 128-bit chunk. | |
9e6a4409 MW |
262 | */ |
263 | ||
ac4a43c1 MW |
264 | if (!(p->f&GCMF_SWAP)) divt(p, k_over_t, k); |
265 | else { | |
266 | for (i = 0; i < n; i++) k_over_t[i] = ENDSWAP32(k[i]); | |
267 | divt(p, k_over_t, k_over_t); | |
268 | } | |
269 | ||
9e6a4409 MW |
270 | if (n == 3) nz = 1; |
271 | else nz = 0; | |
ac4a43c1 | 272 | k = k_over_t; t = ktab + n + nz; while (n--) *--t = *k++; |
9e6a4409 MW |
273 | while (nz--) *--t = 0; |
274 | } | |
275 | #endif | |
276 | ||
277 | #if CPUFAM_ARMEL | |
278 | static void arm_crypto_mktable(const gcm_params *p, | |
279 | uint32 *ktab, const uint32 *k) | |
280 | { | |
ac4a43c1 MW |
281 | unsigned i, n = p->n; |
282 | uint32 k_over_t[GCM_NMAX], *t; | |
9e6a4409 | 283 | |
ac4a43c1 MW |
284 | /* We need to divide the value by t (to compensate for the one-bit shift |
285 | * resulting from GCM's backwards bit ordering) and store the value in a | |
286 | * way which is convenient for the assembler code to read back. That | |
287 | * involves swapping the bytes in each 64-bit lane. | |
9e6a4409 MW |
288 | */ |
289 | ||
ac4a43c1 MW |
290 | if (!(p->f&GCMF_SWAP)) divt(p, k_over_t, k); |
291 | else { | |
292 | for (i = 0; i < n; i++) k_over_t[i] = ENDSWAP32(k[i]); | |
293 | divt(p, k_over_t, k_over_t); | |
294 | } | |
295 | ||
296 | t = ktab; k = k_over_t; | |
297 | while (n >= 2) { | |
298 | t[1] = k[0]; t[0] = k[1]; | |
299 | t += 2; k += 2; n -= 2; | |
9e6a4409 | 300 | } |
ac4a43c1 | 301 | if (n) { t[1] = k[0]; t[0] = 0; } |
9e6a4409 MW |
302 | } |
303 | #endif | |
304 | ||
305 | #if CPUFAM_ARM64 | |
306 | static uint32 rbit32(uint32 x) | |
307 | { | |
308 | uint32 z, t; | |
309 | ||
310 | #if GCC_VERSION_P(4, 3) | |
311 | /* Two tricks here. Firstly, two separate steps, rather than a single | |
312 | * block of assembler, to allow finer-grained instruction scheduling. | |
313 | * Secondly, use `ENDSWAP32' so that the compiler can cancel it if the | |
314 | * caller actually wants the bytes reordered. | |
315 | */ | |
316 | __asm__("rbit %w0, %w1" : "=r"(t) : "r"(x)); | |
317 | z = ENDSWAP32(t); | |
318 | #else | |
319 | /* A generic but slightly clever implementation. */ | |
320 | # define SWIZZLE(x, m, s) ((((x)&(m)) << (s)) | (((x)&~(m)) >> (s))) | |
321 | /* 76543210 */ | |
322 | t = SWIZZLE(x, 0x0f0f0f0f, 4); /* 32107654 -- swap nibbles */ | |
323 | t = SWIZZLE(t, 0x33333333, 2); /* 10325476 -- swap bit pairs */ | |
324 | z = SWIZZLE(t, 0x55555555, 1); /* 01234567 -- swap adjacent bits */ | |
325 | # undef SWIZZLE | |
326 | #endif | |
327 | return (z); | |
328 | } | |
329 | ||
330 | static void arm64_pmull_mktable(const gcm_params *p, | |
331 | uint32 *ktab, const uint32 *k) | |
332 | { | |
333 | unsigned n = p->n; | |
334 | uint32 *t; | |
335 | ||
336 | /* We just need to store the value in a way which is convenient for the | |
337 | * assembler code to read back. That involves two transformations: | |
338 | * | |
339 | * * firstly, reversing the order of the bits in each byte; and, | |
340 | * | |
341 | * * secondly, storing two copies of each 64-bit chunk. | |
342 | * | |
343 | * Note that, in this case, we /want/ the little-endian byte order of GCM, | |
344 | * so endianness-swapping happens in the big-endian case. | |
345 | */ | |
346 | ||
347 | t = ktab; | |
348 | if (p->f&GCMF_SWAP) { | |
349 | while (n >= 2) { | |
350 | t[0] = t[2] = rbit32(k[0]); | |
351 | t[1] = t[3] = rbit32(k[1]); | |
352 | t += 4; k += 2; n -= 2; | |
353 | } | |
354 | if (n) { t[0] = t[2] = rbit32(k[0]); t[1] = t[3] = 0; } | |
355 | } else { | |
356 | while (n >= 2) { | |
357 | t[0] = t[2] = ENDSWAP32(rbit32(k[0])); | |
358 | t[1] = t[3] = ENDSWAP32(rbit32(k[1])); | |
359 | t += 4; k += 2; n -= 2; | |
360 | } | |
361 | if (n) { t[0] = t[2] = ENDSWAP32(rbit32(k[0])); t[1] = t[3] = 0; } | |
362 | } | |
363 | } | |
364 | #endif | |
365 | ||
1d30a9b9 MW |
366 | CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable, |
367 | (const gcm_params *p, uint32 *ktab, const uint32 *k), | |
368 | (p, ktab, k), | |
369 | pick_mktable, simple_mktable) | |
370 | ||
371 | static gcm_mktable__functype *pick_mktable(void) | |
372 | { | |
9e6a4409 MW |
373 | #if CPUFAM_X86 || CPUFAM_AMD64 |
374 | DISPATCH_PICK_COND(gcm_mktable, pclmul_mktable, | |
375 | cpu_feature_p(CPUFEAT_X86_SSSE3) && | |
376 | cpu_feature_p(CPUFEAT_X86_PCLMUL)); | |
377 | #endif | |
378 | #if CPUFAM_ARMEL | |
379 | DISPATCH_PICK_COND(gcm_mktable, arm_crypto_mktable, | |
380 | cpu_feature_p(CPUFEAT_ARM_PMULL)); | |
381 | #endif | |
382 | #if CPUFAM_ARM64 | |
383 | DISPATCH_PICK_COND(gcm_mktable, arm64_pmull_mktable, | |
384 | cpu_feature_p(CPUFEAT_ARM_PMULL)); | |
385 | #endif | |
1d30a9b9 MW |
386 | DISPATCH_PICK_FALLBACK(gcm_mktable, simple_mktable); |
387 | } | |
388 | ||
389 | /* --- @recover_k@ --- * | |
390 | * | |
391 | * Arguments: @const gcm_params *p@ = pointer to the parameters | |
392 | * @uint32 *k@ = block-sized vector in which to store %$k$% | |
393 | * @const uint32 *ktab@ = the table encoding %$k$% | |
394 | * | |
395 | * Returns: --- | |
396 | * | |
397 | * Use: Recovers %$k$%, the secret from which @ktab@ was by | |
398 | * @gcm_mktable@, from the table, and stores it in internal | |
399 | * (big-endian) form in @k@. | |
400 | */ | |
401 | ||
402 | static void simple_recover_k(const gcm_params *p, | |
403 | uint32 *k, const uint32 *ktab) | |
404 | { | |
405 | unsigned i; | |
406 | ||
407 | /* If the blockcipher is big-endian, then the key is simply in the first | |
408 | * table element, in the right format. If the blockcipher is little-endian | |
409 | * then it's in element 24, and the bytes need swapping. | |
410 | */ | |
411 | ||
412 | if (!(p->f&GCMF_SWAP)) for (i = 0; i < p->n; i++) k[i] = ktab[i]; | |
413 | else for (i = 0; i < p->n; i++) k[i] = ENDSWAP32(ktab[24*p->n + i]); | |
414 | } | |
415 | ||
9e6a4409 MW |
416 | #if CPUFAM_X86 || CPUFAM_AMD64 |
417 | static void pclmul_recover_k(const gcm_params *p, | |
418 | uint32 *k, const uint32 *ktab) | |
419 | { | |
420 | unsigned n = p->n; | |
421 | unsigned nz; | |
422 | const uint32 *t; | |
423 | ||
424 | /* The representation is already independent of the blockcipher endianness. | |
ac4a43c1 MW |
425 | * We need to compensate for padding, reorder the words, and multiply by t |
426 | * to compensate for the factor of t we divided out earlier. | |
9e6a4409 MW |
427 | */ |
428 | ||
429 | if (n == 3) nz = 1; else nz = 0; | |
430 | t = ktab + n + nz; | |
431 | while (n--) *k++ = *--t; | |
ac4a43c1 | 432 | mult(p, k - p->n, k - p->n); |
9e6a4409 MW |
433 | } |
434 | #endif | |
435 | ||
436 | #if CPUFAM_ARMEL | |
437 | static void arm_crypto_recover_k(const gcm_params *p, | |
438 | uint32 *k, const uint32 *ktab) | |
439 | { | |
440 | unsigned n = p->n; | |
441 | const uint32 *t; | |
442 | ||
443 | /* The representation is already independent of the blockcipher endianness. | |
ac4a43c1 MW |
444 | * We only need to reorder the words, and multiply by t to compensate for |
445 | * the factor of t we divided out earlier. | |
9e6a4409 MW |
446 | */ |
447 | ||
448 | t = ktab; | |
449 | while (n >= 2) { k[1] = t[0]; k[0] = t[1]; t += 2; k += 2; n -= 2; } | |
ac4a43c1 MW |
450 | if (n) { k[0] = t[1]; k++; n--; } |
451 | mult(p, k - p->n, k - p->n); | |
9e6a4409 MW |
452 | } |
453 | #endif | |
454 | ||
455 | #if CPUFAM_ARM64 | |
456 | static void arm64_pmull_recover_k(const gcm_params *p, | |
457 | uint32 *k, const uint32 *ktab) | |
458 | { | |
459 | unsigned n = p->n; | |
460 | const uint32 *t; | |
461 | ||
462 | /* The representation is already independent of the blockcipher endianness. | |
463 | * We need to skip the duplicate pieces, and unscramble the bytes. | |
464 | */ | |
465 | ||
466 | t = ktab; | |
467 | while (n >= 2) { | |
468 | k[0] = ENDSWAP32(rbit32(t[0])); | |
469 | k[1] = ENDSWAP32(rbit32(t[1])); | |
470 | t += 4; k += 2; n -= 2; | |
471 | } | |
472 | if (n) k[0] = ENDSWAP32(rbit32(t[0])); | |
473 | } | |
474 | #endif | |
475 | ||
1d30a9b9 MW |
476 | CPU_DISPATCH(static, EMPTY, void, recover_k, |
477 | (const gcm_params *p, uint32 *k, const uint32 *ktab), | |
478 | (p, k, ktab), | |
479 | pick_recover_k, simple_recover_k) | |
480 | ||
481 | static recover_k__functype *pick_recover_k(void) | |
9e6a4409 MW |
482 | { |
483 | #if CPUFAM_X86 || CPUFAM_AMD64 | |
484 | DISPATCH_PICK_COND(recover_k, pclmul_recover_k, | |
485 | cpu_feature_p(CPUFEAT_X86_SSSE3) && | |
486 | cpu_feature_p(CPUFEAT_X86_PCLMUL)); | |
487 | #endif | |
488 | #if CPUFAM_ARMEL | |
489 | DISPATCH_PICK_COND(recover_k, arm_crypto_recover_k, | |
490 | cpu_feature_p(CPUFEAT_ARM_PMULL)); | |
491 | #endif | |
492 | #if CPUFAM_ARM64 | |
493 | DISPATCH_PICK_COND(recover_k, arm64_pmull_recover_k, | |
494 | cpu_feature_p(CPUFEAT_ARM_PMULL)); | |
495 | #endif | |
496 | DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k); | |
497 | } | |
1d30a9b9 MW |
498 | |
499 | /* --- @gcm_mulk_N{b,l}@ --- * | |
50df5733 MW |
500 | * |
501 | * Arguments: @uint32 *a@ = accumulator to multiply | |
502 | * @const uint32 *ktab@ = table constructed by @gcm_mktable@ | |
503 | * | |
504 | * Returns: --- | |
505 | * | |
506 | * Use: Multiply @a@ by @k@ (implicitly represented in @ktab@), | |
507 | * updating @a@ in-place. There are separate functions for each | |
1d30a9b9 MW |
508 | * supported block size and endianness because this is the |
509 | * function whose performance actually matters. | |
50df5733 MW |
510 | */ |
511 | ||
9e6a4409 MW |
512 | #if CPUFAM_X86 || CPUFAM_AMD64 |
513 | # define DECL_MULK_X86ISH(var) extern gcm_mulk_##var##__functype \ | |
514 | gcm_mulk_##var##_x86ish_pclmul_avx, \ | |
515 | gcm_mulk_##var##_x86ish_pclmul; | |
516 | # define PICK_MULK_X86ISH(var) do { \ | |
517 | DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul_avx, \ | |
518 | cpu_feature_p(CPUFEAT_X86_AVX) && \ | |
519 | cpu_feature_p(CPUFEAT_X86_PCLMUL) && \ | |
520 | cpu_feature_p(CPUFEAT_X86_SSSE3)); \ | |
521 | DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul, \ | |
522 | cpu_feature_p(CPUFEAT_X86_PCLMUL) && \ | |
523 | cpu_feature_p(CPUFEAT_X86_SSSE3)); \ | |
524 | } while (0) | |
525 | #else | |
526 | # define DECL_MULK_X86ISH(var) | |
527 | # define PICK_MULK_X86ISH(var) do ; while (0) | |
528 | #endif | |
529 | ||
530 | #if CPUFAM_ARMEL | |
531 | # define DECL_MULK_ARM(var) \ | |
532 | extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm_crypto; | |
533 | # define PICK_MULK_ARM(var) do { \ | |
534 | DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm_crypto, \ | |
535 | cpu_feature_p(CPUFEAT_ARM_PMULL)); \ | |
536 | } while (0) | |
537 | #else | |
538 | # define DECL_MULK_ARM(var) | |
539 | # define PICK_MULK_ARM(var) do ; while (0) | |
540 | #endif | |
541 | ||
542 | #if CPUFAM_ARM64 | |
543 | # define DECL_MULK_ARM64(var) \ | |
544 | extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm64_pmull; | |
545 | # define PICK_MULK_ARM64(var) do { \ | |
546 | DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm64_pmull, \ | |
547 | cpu_feature_p(CPUFEAT_ARM_PMULL)); \ | |
548 | } while (0) | |
549 | #else | |
550 | # define DECL_MULK_ARM64(var) | |
551 | # define PICK_MULK_ARM64(var) do ; while (0) | |
552 | #endif | |
553 | ||
50df5733 | 554 | #define DEF_MULK(nbits) \ |
1d30a9b9 MW |
555 | \ |
556 | CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##b, \ | |
557 | (uint32 *a, const uint32 *ktab), (a, ktab), \ | |
558 | pick_mulk_##nbits##b, simple_mulk_##nbits) \ | |
559 | CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##l, \ | |
560 | (uint32 *a, const uint32 *ktab), (a, ktab), \ | |
561 | pick_mulk_##nbits##l, simple_mulk_##nbits) \ | |
562 | \ | |
563 | static void simple_mulk_##nbits(uint32 *a, const uint32 *ktab) \ | |
50df5733 MW |
564 | { \ |
565 | uint32 m, t; \ | |
566 | uint32 z[nbits/32]; \ | |
567 | unsigned i, j, k; \ | |
568 | \ | |
569 | for (i = 0; i < nbits/32; i++) z[i] = 0; \ | |
570 | \ | |
571 | for (i = 0; i < nbits/32; i++) { \ | |
572 | t = a[i]; \ | |
573 | for (j = 0; j < 32; j++) { \ | |
574 | m = -((t >> 31)&1u); \ | |
575 | for (k = 0; k < nbits/32; k++) z[k] ^= *ktab++&m; \ | |
576 | t <<= 1; \ | |
577 | } \ | |
578 | } \ | |
579 | \ | |
580 | for (i = 0; i < nbits/32; i++) a[i] = z[i]; \ | |
1d30a9b9 MW |
581 | } \ |
582 | \ | |
9e6a4409 MW |
583 | DECL_MULK_X86ISH(nbits##b) \ |
584 | DECL_MULK_ARM(nbits##b) \ | |
585 | DECL_MULK_ARM64(nbits##b) \ | |
1d30a9b9 | 586 | static gcm_mulk_##nbits##b##__functype *pick_mulk_##nbits##b(void) \ |
9e6a4409 MW |
587 | { \ |
588 | PICK_MULK_X86ISH(nbits##b); \ | |
589 | PICK_MULK_ARM(nbits##b); \ | |
590 | PICK_MULK_ARM64(nbits##b); \ | |
591 | DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits); \ | |
592 | } \ | |
593 | \ | |
594 | DECL_MULK_X86ISH(nbits##l) \ | |
595 | DECL_MULK_ARM(nbits##l) \ | |
596 | DECL_MULK_ARM64(nbits##l) \ | |
1d30a9b9 | 597 | static gcm_mulk_##nbits##l##__functype *pick_mulk_##nbits##l(void) \ |
9e6a4409 MW |
598 | { \ |
599 | PICK_MULK_X86ISH(nbits##l); \ | |
600 | PICK_MULK_ARM(nbits##l); \ | |
601 | PICK_MULK_ARM64(nbits##l); \ | |
602 | DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits); \ | |
603 | } | |
1d30a9b9 | 604 | |
50df5733 MW |
605 | GCM_WIDTHS(DEF_MULK) |
606 | ||
8f6a5276 | 607 | #define GCM_MULK_CASE(nbits) \ |
1d30a9b9 MW |
608 | case nbits/32: \ |
609 | if (_f&GCMF_SWAP) gcm_mulk_##nbits##l(_a, _ktab); \ | |
610 | else gcm_mulk_##nbits##b(_a, _ktab); \ | |
611 | break; | |
612 | #define MULK(n, f, a, ktab) do { \ | |
8f6a5276 | 613 | uint32 *_a = (a); const uint32 *_ktab = (ktab); \ |
1d30a9b9 | 614 | unsigned _f = (f); \ |
8f6a5276 MW |
615 | switch (n) { \ |
616 | GCM_WIDTHS(GCM_MULK_CASE) \ | |
617 | default: abort(); \ | |
618 | } \ | |
619 | } while (0) | |
620 | ||
50df5733 MW |
621 | /*----- Other utilities ---------------------------------------------------*/ |
622 | ||
623 | /* --- @putlen@ --- * | |
624 | * | |
625 | * Arguments: @octet *p@ = pointer to output buffer | |
626 | * @unsigned w@ = size of output buffer | |
627 | * @unsigned blksz@ = block size (assumed fairly small) | |
628 | * @unsigned long nblocks@ = number of blocks | |
629 | * @unsigned nbytes@ = tail size in bytes (assumed small) | |
630 | * | |
631 | * Returns: --- | |
632 | * | |
633 | * Use: Store the overall length in %$\emph{bits}$% (i.e., | |
634 | * @3*(nblocks*blksz + nbytes)@ in big-endian form in the | |
635 | * buffer @p@. | |
636 | */ | |
637 | ||
638 | static void putlen(octet *p, unsigned w, unsigned blksz, | |
639 | unsigned long nblocks, unsigned nbytes) | |
640 | { | |
641 | unsigned long nblo = nblocks&((1ul << (ULONG_BITS/2)) - 1), | |
642 | nbhi = nblocks >> ULONG_BITS/2; | |
643 | unsigned long nlo = nblo*blksz + nbytes, nhi = nbhi*blksz; | |
644 | ||
645 | /* This is fiddly. Split @nblocks@, which is the big number, into high and | |
646 | * low halves, multiply those separately by @blksz@, propagate carries, and | |
647 | * then multiply by eight. | |
648 | */ | |
649 | nhi += nlo >> ULONG_BITS/2; | |
650 | nlo &= (1ul << (ULONG_BITS/2)) - 1; | |
651 | nlo <<= 3; | |
652 | ||
653 | /* Now write out the size, feeding bits in from @nhi@ as necessary. */ | |
654 | p += w; | |
655 | while (w--) { | |
656 | *--p = U8(nlo); | |
657 | nlo = (nlo >> 8) | ((nhi&0xff) << (ULONG_BITS/2 - 5)); | |
658 | nhi >>= 8; | |
659 | } | |
660 | } | |
661 | ||
662 | /* --- @mix@ --- * | |
663 | * | |
664 | * Arguments: @const gcm_params *p@ = pointer to the parameters | |
665 | * @uint32 *a@ = GHASH accumulator | |
666 | * @const octet *q@ = pointer to an input block | |
667 | * @const uint32 *ktab@ = multiplication table, built by | |
668 | * @gcm_mktable@ | |
669 | * | |
670 | * Returns: --- | |
671 | * | |
672 | * Use: Fold the block @q@ into the GHASH accumulator. The | |
673 | * calculation is %$a' = k (a + q)$%. | |
674 | */ | |
675 | ||
676 | static void mix(const gcm_params *p, uint32 *a, | |
677 | const octet *q, const uint32 *ktab) | |
678 | { | |
679 | unsigned i; | |
680 | ||
50df5733 MW |
681 | if (p->f&GCMF_SWAP) |
682 | for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_L(q); q += 4; } | |
683 | else | |
684 | for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_B(q); q += 4; } | |
1d30a9b9 | 685 | MULK(p->n, p->f, a, ktab); |
50df5733 MW |
686 | } |
687 | ||
688 | /* --- @gcm_ghashdone@ --- * | |
689 | * | |
690 | * Arguments: @const gcm_params *p@ = pointer to the parameters | |
691 | * @uint32 *a@ = GHASH accumulator | |
692 | * @const uint32 *ktab@ = multiplication table, built by | |
693 | * @gcm_mktable@ | |
694 | * @unsigned long xblocks, yblocks@ = number of whole blocks in | |
695 | * the two inputs | |
696 | * @unsigned xbytes, ybytes@ = number of trailing bytes in the | |
697 | * two inputs | |
698 | * | |
699 | * Returns: --- | |
700 | * | |
701 | * Use: Finishes a GHASH operation by appending the appropriately | |
702 | * encoded lengths of the two constituent messages. | |
703 | */ | |
704 | ||
705 | void gcm_ghashdone(const gcm_params *p, uint32 *a, const uint32 *ktab, | |
706 | unsigned long xblocks, unsigned xbytes, | |
707 | unsigned long yblocks, unsigned ybytes) | |
708 | { | |
709 | octet b[4*GCM_NMAX]; | |
710 | unsigned w = p->n < 3 ? 4*p->n : 2*p->n; | |
711 | ||
712 | /* Construct the encoded lengths. Note that smaller-block versions of GCM | |
713 | * encode the lengths in separate blocks. GCM is only officially defined | |
714 | * for 64- and 128-bit blocks; I've placed the cutoff somewhat arbitrarily | |
715 | * at 96 bits. | |
716 | */ | |
717 | putlen(b, w, 4*p->n, xblocks, xbytes); | |
718 | putlen(b + w, w, 4*p->n, yblocks, ybytes); | |
719 | ||
720 | /* Feed the lengths into the accumulator. */ | |
721 | mix(p, a, b, ktab); | |
722 | if (p->n < 3) mix(p, a, b + w, ktab); | |
723 | } | |
724 | ||
725 | /* --- @gcm_concat@ --- * | |
726 | * | |
727 | * Arguments: @const gcm_params *p@ = pointer to the parameters | |
728 | * @uint32 *z@ = GHASH accumulator for suffix, updated | |
729 | * @const uint32 *x@ = GHASH accumulator for prefix | |
730 | * @const uint32 *ktab@ = multiplication table, built by | |
731 | * @gcm_mktable@ | |
732 | * @unsigned long n@ = length of suffix in whole blocks | |
733 | * | |
734 | * Returns: --- | |
735 | * | |
736 | * Use: On entry, @x@ and @z@ are the results of hashing two strings | |
737 | * %$a$% and %$b$%, each a whole number of blocks long; in | |
738 | * particular, %$b$% is @n@ blocks long. On exit, @z@ is | |
739 | * updated to be the hash of %$a \cat b$%. | |
740 | */ | |
741 | ||
742 | void gcm_concat(const gcm_params *p, uint32 *z, const uint32 *x, | |
743 | const uint32 *ktab, unsigned long n) | |
744 | { | |
745 | uint32 t[GCM_NMAX], u[GCM_NMAX]; | |
746 | unsigned i, j; | |
747 | ||
748 | if (!n) { | |
749 | /* If @n@ is zero, then there's not much to do. The mathematics | |
750 | * (explained below) still works, but the code takes a shortcut which | |
751 | * doesn't handle this case: so set %$z' = z + x k^n = z + x$%. | |
752 | */ | |
753 | ||
754 | for (j = 0; j < p->n; j++) z[j] ^= x[j]; | |
755 | } else { | |
756 | /* We have %$x = a_0 t^m + \cdots + a_{m-2} t^2 + a_{m-1} t$% and | |
757 | * %$z = b_0 t^n + \cdots + b_{n-2} t^2 + b_{n-1} t$%. What we'd like is | |
758 | * the hash of %$a \cat b$%, which is %$z + x k^n$%. | |
759 | * | |
760 | * The first job, then, is to calculate %$k^n$%, and for this we use a | |
761 | * simple left-to-right square-and-multiply algorithm. There's no need | |
762 | * to keep %$n$% secret here. | |
763 | */ | |
764 | ||
765 | /* Start by retrieving %$k$% from the table, and convert it to big-endian | |
766 | * form. | |
767 | */ | |
1d30a9b9 | 768 | recover_k(p, u, ktab); |
50df5733 MW |
769 | |
770 | /* Now calculate %$k^n$%. */ | |
771 | i = ULONG_BITS; | |
772 | #define BIT (1ul << (ULONG_BITS - 1)) | |
773 | while (!(n&BIT)) { n <<= 1; i--; } | |
774 | n <<= 1; i--; for (j = 0; j < p->n; j++) t[j] = u[j]; | |
775 | while (i--) { mul(p, t, t, t); if (n&BIT) mul(p, t, t, u); n <<= 1; } | |
776 | #undef BIT | |
777 | ||
778 | /* Next, calculate %$x k^n$%. If we're using a little-endian convention | |
779 | * then we must convert %$x$%; otherwise we can just use it in place. | |
780 | */ | |
781 | if (!(p->f&GCMF_SWAP)) | |
782 | mul(p, t, t, x); | |
783 | else { | |
784 | for (j = 0; j < p->n; j++) u[j] = ENDSWAP32(x[j]); | |
785 | mul(p, t, t, u); | |
786 | } | |
787 | ||
788 | /* Finally, add %$x k^n$% onto %$z$%, converting back to little-endian if | |
789 | * necessary. | |
790 | */ | |
791 | if (!(p->f&GCMF_SWAP)) for (j = 0; j < p->n; j++) z[j] ^= t[j]; | |
792 | else for (j = 0; j < p->n; j++) z[j] ^= ENDSWAP32(t[j]); | |
793 | } | |
794 | } | |
795 | ||
8f6a5276 MW |
796 | /*----- Test rig ----------------------------------------------------------*/ |
797 | ||
798 | #ifdef TEST_RIG | |
799 | ||
141c1284 | 800 | #include <mLib/macros.h> |
8f6a5276 MW |
801 | #include <mLib/quis.h> |
802 | #include <mLib/testrig.h> | |
803 | ||
416b8869 MW |
804 | #ifdef ENABLE_ASM_DEBUG |
805 | # include "regdump.h" | |
806 | #endif | |
807 | ||
8f6a5276 | 808 | static void report_failure(const char *test, unsigned nbits, |
1d30a9b9 | 809 | const char *ref, dstr v[], dstr *d) |
8f6a5276 MW |
810 | { |
811 | printf("test %s failed (nbits = %u)", test, nbits); | |
812 | printf("\n\tx = "); type_hex.dump(&v[0], stdout); | |
813 | printf("\n\ty = "); type_hex.dump(&v[1], stdout); | |
814 | printf("\n\tz = "); type_hex.dump(&v[2], stdout); | |
1d30a9b9 | 815 | printf("\n\t%s' = ", ref); type_hex.dump(d, stdout); |
8f6a5276 MW |
816 | putchar('\n'); |
817 | } | |
818 | ||
1d30a9b9 MW |
819 | static void mulk(unsigned nbits, unsigned f, uint32 *x, const uint32 *ktab) |
820 | { MULK(nbits/32, f, x, ktab); } | |
8f6a5276 MW |
821 | |
822 | static int test_mul(uint32 poly, dstr v[]) | |
823 | { | |
824 | uint32 x[GCM_NMAX], y[GCM_NMAX], z[GCM_NMAX], ktab[32*GCM_NMAX*GCM_NMAX]; | |
825 | gcm_params p; | |
826 | dstr d = DSTR_INIT; | |
827 | unsigned i, nbits; | |
828 | int ok = 1; | |
1d30a9b9 | 829 | enum { I_x, I_y, I_z }; |
8f6a5276 MW |
830 | |
831 | nbits = 8*v[0].len; p.f = 0; p.n = nbits/32; p.poly = poly; | |
832 | dstr_ensure(&d, nbits/8); d.len = nbits/8; | |
833 | ||
1d30a9b9 MW |
834 | #define LOADXY(E) do { \ |
835 | for (i = 0; i < nbits/32; i++) { \ | |
836 | x[i] = LOAD32_##E(v[I_x].buf + 4*i); \ | |
837 | y[i] = LOAD32_##E(v[I_y].buf + 4*i); \ | |
838 | } \ | |
839 | } while (0) | |
840 | ||
841 | #define INITZ(x) do { \ | |
842 | for (i = 0; i < nbits/32; i++) z[i] = (x)[i]; \ | |
843 | } while (0) | |
844 | ||
845 | #define CHECK(E, what, ref) do { \ | |
846 | for (i = 0; i < nbits/32; i++) STORE32_##E(d.buf + 4*i, z[i]); \ | |
141c1284 | 847 | if (MEMCMP(d.buf, !=, v[I_##ref].buf, nbits/8)) \ |
1d30a9b9 MW |
848 | { ok = 0; report_failure(what, nbits, #ref, v, &d); } \ |
849 | } while (0) | |
850 | ||
851 | #define TEST_PREP_1(E, x, y, what) do { \ | |
852 | gcm_mktable(&p, ktab, y); \ | |
853 | recover_k(&p, z, ktab); CHECK(B, "mktable/recover_k (" #y ")", y); \ | |
854 | INITZ(x); mulk(nbits, p.f, z, ktab); CHECK(E, what " (k = " #y ")", z); \ | |
855 | } while (0) | |
856 | ||
857 | #define TEST_PREP(E, what) do { \ | |
858 | TEST_PREP_1(E, x, y, what); \ | |
859 | TEST_PREP_1(E, y, x, what); \ | |
860 | } while (0) | |
861 | ||
8f6a5276 | 862 | /* First, test plain multiply. */ |
1d30a9b9 | 863 | LOADXY(B); mul(&p, z, x, y); CHECK(B, "gcm_mul", z); |
8f6a5276 MW |
864 | |
865 | /* Next, test big-endian prepared key. */ | |
1d30a9b9 | 866 | LOADXY(B); TEST_PREP(B, "gcm_kmul_b"); |
8f6a5276 MW |
867 | |
868 | /* Finally, test little-endian prepared key. */ | |
1d30a9b9 MW |
869 | p.f = GCMF_SWAP; LOADXY(L); |
870 | TEST_PREP(L, "gcm_kmul_l"); | |
871 | ||
872 | #undef LOADXY | |
873 | #undef INITZ | |
874 | #undef CHECK | |
875 | #undef TEST_PREP_1 | |
876 | #undef TEST_PREP | |
8f6a5276 MW |
877 | |
878 | /* All done. */ | |
879 | return (ok); | |
880 | } | |
881 | ||
882 | #define TEST(nbits) \ | |
883 | static int test_mul_##nbits(dstr v[]) \ | |
884 | { return (test_mul(GCM_POLY_##nbits, v)); } | |
885 | GCM_WIDTHS(TEST) | |
886 | #undef TEST | |
887 | ||
888 | static test_chunk defs[] = { | |
889 | #define TEST(nbits) \ | |
890 | { "gcm-mul" #nbits, test_mul_##nbits, \ | |
891 | { &type_hex, &type_hex, &type_hex, 0 } }, | |
892 | GCM_WIDTHS(TEST) | |
893 | #undef TEST | |
894 | { 0, 0, { 0 } } | |
895 | }; | |
896 | ||
897 | int main(int argc, char *argv[]) | |
898 | { | |
899 | ego(argv[0]); | |
416b8869 MW |
900 | #ifdef ENABLE_ASM_DEBUG |
901 | regdump_init(); | |
902 | #endif | |
8f6a5276 MW |
903 | test_run(argc, argv, defs, SRCDIR"/t/gcm"); |
904 | return (0); | |
905 | } | |
906 | ||
907 | #endif | |
908 | ||
50df5733 | 909 | /*----- That's all, folks -------------------------------------------------*/ |