[catacomb] / symm / gcm.c
1/* -*-c-*-
2 *
3 * The GCM authenticated encryption mode
4 *
5 * (c) 2017 Straylight/Edgeware
6 */
7
8/*----- Licensing notice --------------------------------------------------*
9 *
10 * This file is part of Catacomb.
11 *
12 * Catacomb is free software: you can redistribute it and/or modify it
13 * under the terms of the GNU Library General Public License as published
14 * by the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * Catacomb is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Library General Public License for more details.
21 *
22 * You should have received a copy of the GNU Library General Public
23 * License along with Catacomb. If not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
25 * USA.
26 */
27
28/*----- Header files ------------------------------------------------------*/
29
30#include "config.h"
31
32#include <stdio.h>
33
34#include <mLib/bits.h>
35
36#include "dispatch.h"
37#include "gcm.h"
38#include "gcm-def.h"
39
40/*----- Overall strategy --------------------------------------------------*
41 *
42 * GCM is pretty awful to implement in software. (This presentation is going
43 * to be somewhat different to that in the specification, but I think it
44 * makes more sense like this.)
45 *
46 * We're given a %$w$%-bit blockcipher %$E$% with a key %$K$%.
47 *
48 * The main part is arithmetic in the finite field %$k = \gf{2^w}$%, which we
49 * represent as the quotient ring %$\gf{2}[t]/(p_w(t))$% for some irreducible
50 * degree-%$w$% polynomial %$p_w(t)$%, whose precise value isn't very important
51 * right now. We choose a secret point %$x = E_K(0^w)$%.
52 *
53 * We choose a length size %$z$% as follows: if %$w < 96$% then %$z = w$%;
54 * otherwise %$z = w/2$%. Format a message pair as follows:
55 *
56 * %$F(a, b) = P_w(a) \cat P_w(b) \cat [\ell(a)]_z \cat [\ell(b)]_z$%
57 *
58 * where %$P_w(x) = x \cat 0^n$%, with %$0 \le n < w$% such that
59 * %$\ell(x) + n \equiv 0 \pmod{w}$%.
60 *
61 * Hash a (block-aligned) message %$u$% as follows. First, split %$u$% into
62 * %$w$%-bit blocks %$u_0$%, %$u_1$%, %$\ldots$%, %$u_{n-1}$%. Interpret
63 * these as elements of %$k$%. Then
64 *
65 * %$G_x(u) = u_0 t^n + u_1 t^{n-1} + \cdots + u_{n-1} t$%
66 *
67 * converted back to a %$w$%-bit string.
68 *
69 * We're ready to go now. Suppose we're to encrypt a message %$M$% with
70 * header %$H$% and nonce %$N$%. If %$\ell(N) + 32 = w$% then let
71 * %$N' = N$% and let %$i_0 = 1$%; otherwise, let %$U = G_x(F(\epsilon, N))$%
72 * and split this into %$N' = U[0 \bitsto w - 32]$% and
73 * %$[i_0]_{32} = U[w - 32 \bitsto w]$%.
74 *
75 * Let %$n = \lceil \ell(M)/w \rceil$%. Compute
76 *
77 * %$y_j = E_K(N' \cat [i_0 + j]_{32})$%
78 *
79 * for %$0 \le j \le n$%. Let
80 *
81 * %$s = (y_1 \cat y_2 \cat \cdots \cat y_n)[0 \bitsto \ell(M)]$%.
82 *
83 * Let %$C = M \xor s$% and let %$T = G_x(F(H, C)) \xor y_0$%. These are the
84 * ciphertext and tag respectively.
85 *
86 * So why is this awful?
87 *
88 * For one thing, the bits are in a completely terrible order. The bytes are
89 * arranged in little-endian order, so the unit coefficient is in the first
90 * byte, and the degree-127 coefficient is in the last byte. But within each
91 * byte, the lowest-degree coefficient is in the most significant bit. It's
92 * therefore better to think of GCM as using a big-endian byte-ordering
93 * convention, but with the bits backwards.
94 *
95 * But messing about with byte ordering is expensive, so let's not do that in
96 * the inner loop. But multiplication in %$k$% is not easy either. Some
97 * kind of precomputed table would be nice, but that will leak secrets
98 * through the cache.
99 *
100 * I choose a particularly simple table: given %$x$%, let %$X[i'] = x t^i$%.
101 * Then %$x y = \sum_{0\le i<w} y_i X[i']$%, which is just a bunch of
102 * bitmasking. But the natural order for examining bits of %$y$% is not
103 * necessarily the obvious one. We'll have already loaded %$y$% into
104 * internal form, as 32-bit words. The good order to process these is left
105 * to right, from high to low bits. But now the order of degrees depends on
106 * the endianness of our conversion of bytes to words. Oh, well.
107 *
108 * If we've adopted a big-endian convention, then we'll see the degrees in
109 * order, 0, 1, ..., all the way up to %$w - 1$% and everything is fine. If
110 * we've adopted a little-endian convention, though, we'll see an ordering
111 * like this:
112 *
113 * 24, 25, ..., 31, 16, 17, ..., 23, 8, 9, ..., 15, 0, 1, ..., 7,
114 * 56, 57, ..., 63, 48, 49, ..., 55, 40, 41, ..., 47, 32, 33, ..., 39,
115 * etc.
116 *
117 * which is the ordinary order with 0x18 = 24 XORed into the index. That is,
118 * %$i' = i$% if we've adopted a big-endian convention, and
119 * %$i' = i \xor 24$% if we've adopted a little-endian convention.
120 */
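
/* As a sanity check on the index mapping just described, here's an
 * illustrative helper (not used by the library; the name is invented for
 * this sketch).  If a word is loaded with a little-endian convention and
 * scanned from its most significant bit downwards, as the multiply loops
 * below do, then step %$j$% of the scan sees the coefficient of degree
 * %$j \xor 24$%.
 */

static unsigned le_degree_at_step(unsigned j)
{
  unsigned byte = 3 - j/8;	/* byte 0 ends up at the bottom of the word */
  unsigned bit = j%8;		/* within a byte, degree increases towards the low end */

  return (8*byte + bit);	/* equal to j ^ 24 for 0 <= j < 32 */
}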
121
122/*----- Low-level utilities -----------------------------------------------*/
123
124/* --- @mult@, @divt@ --- *
125 *
126 * Arguments: @const gcm_params *p@ = pointer to the parameters
127 * @uint32 *z@ = where to write the result
128 * @const uint32 *x@ = input field element
129 *
130 * Returns: ---
131 *
132 * Use: Multiply or divide the input field element by %$t$%, and
133 * write the product or quotient to @z@. It's safe for @x@ and
134 * @z@ to be equal, but they should not otherwise overlap. Both
135 * input and output are in big-endian form, i.e., with the
136 * lowest-degree coefficients in the most significant bits.
137 */
138
139static void mult(const gcm_params *p, uint32 *z, const uint32 *x)
140{
141 uint32 m, c, t;
142 unsigned i;
143
144 t = x[p->n - 1]; m = -(t&1u); c = m&p->poly;
145 for (i = 0; i < p->n; i++) { t = x[i]; z[i] = (t >> 1) ^ c; c = t << 31; }
146}
147
148#if CPUFAM_X86 || CPUFAM_AMD64 || CPUFAM_ARMEL
149static void divt(const gcm_params *p, uint32 *z, const uint32 *x)
150{
151 uint32 m, c, t;
152 unsigned i;
153
154 t = x[0]; m = -((t >> 31)&1u); c = m&1u;
155 for (i = p->n - 1; i; i--) { t = x[i]; z[i] = (t << 1) | c; c = t >> 31; }
156 t = x[0]; z[0] = ((t ^ (m&p->poly)) << 1) | c;
157}
158#endif
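
/* A quick sanity-check sketch, not part of the library (the function name
 * is invented for the example): @divt@ is the inverse of @mult@, so
 * multiplying an element by %$t$% and then dividing by %$t$% again should
 * return the original value.  Only meaningful on targets where @divt@ is
 * compiled in.
 */

#if CPUFAM_X86 || CPUFAM_AMD64 || CPUFAM_ARMEL
static int mult_divt_roundtrip_p(const gcm_params *p, const uint32 *x)
{
  uint32 t[GCM_NMAX], u[GCM_NMAX];
  unsigned i;

  mult(p, t, x); divt(p, u, t);
  for (i = 0; i < p->n; i++) if (u[i] != x[i]) return (0);
  return (1);
}
#endif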
159
160/* --- @mul@ --- *
161 *
162 * Arguments: @const gcm_params *p@ = pointer to the parameters
163 * @uint32 *z@ = where to write the result
164 * @const uint32 *x, *y@ = input field elements
165 *
166 * Returns: ---
167 *
168 * Use: Multiply the input field elements together, and write the
169 * product to @z@. It's safe for the operands to overlap. Both
170 * inputs and the output are in big-endian form, i.e., with the
171 * lowest-degree coefficients in the most significant bits.
172 */
173
174static void mul(const gcm_params *p, uint32 *z,
175 const uint32 *x, const uint32 *y)
176{
177 uint32 m, t, u[GCM_NMAX], v[GCM_NMAX];
178 unsigned i, j, k;
179
180 /* We can't do this in-place at all, so use temporary space. Make a copy
181 * of @x@ in @u@, where we can clobber it, and build the product in @v@.
182 */
183 for (i = 0; i < p->n; i++) { u[i] = x[i]; v[i] = 0; }
184
185 /* Repeatedly multiply @x@ (in @u@) by %$t$%, and add together those
186 * %$x t^i$% selected by the bits of @y@. This is basically what you get
187 * by streaming the result of @gcm_mktable@ into @gcm_mulk_...@.
188 */
189 for (i = 0; i < p->n; i++) {
190 t = y[i];
191 for (j = 0; j < 32; j++) {
192 m = -((t >> 31)&1u);
193 for (k = 0; k < p->n; k++) v[k] ^= u[k]&m;
194 mult(p, u, u); t <<= 1;
195 }
196 }
197
198 /* Write out the result now that it's ready. */
199 for (i = 0; i < p->n; i++) z[i] = v[i];
200}
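
/* An illustrative sketch, not used by the library (the function name is
 * invented for the example): multiplication in %$k$% distributes over
 * addition, which is just XOR, so %$x (y + z) = x y + x z$%; a mismatch
 * here would point to a bug in @mul@ or @mult@.
 */

static int mul_distributes_p(const gcm_params *p, const uint32 *x,
			     const uint32 *y, const uint32 *z)
{
  uint32 s[GCM_NMAX], u[GCM_NMAX], v[GCM_NMAX], w[GCM_NMAX];
  unsigned i;

  for (i = 0; i < p->n; i++) s[i] = y[i] ^ z[i];
  mul(p, u, x, s);			/* %$x (y + z)$% */
  mul(p, v, x, y); mul(p, w, x, z);	/* %$x y$% and %$x z$% */
  for (i = 0; i < p->n; i++) if (u[i] != (v[i] ^ w[i])) return (0);
  return (1);
}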
201
202/*----- Table-based multiplication ----------------------------------------*/
203
204/* --- @gcm_mktable@ --- *
205 *
206 * Arguments: @const gcm_params *p@ = pointer to the parameters
207 * @uint32 *ktab@ = where to write the table; there must be
208 * space for %$32 n$% %$n$%-word entries, i.e.,
209 * %$32 n^2$% 32-bit words in total, where %$n$% is
210 * @p->n@, the block size in words
211 * @const uint32 *k@ = input field element
212 *
213 * Returns: ---
214 *
215 * Use: Construct a table for use by @gcm_mulk_...@ below, to
216 * multiply (vaguely) efficiently by @k@.
217 */
218
219static void simple_mktable(const gcm_params *p,
220 uint32 *ktab, const uint32 *k)
221{
222 unsigned m = (p->f&GCMF_SWAP ? 0x18 : 0);
223 unsigned i, j, o = m*p->n;
224
225 /* As described above, the table stores entries %$K[i \xor m] = k t^i$%,
226 * where %$m = 0$% (big-endian cipher) or %$m = 24$% (little-endian).
227 * The first job is to store %$K[m] = k$%.
228 *
229 * We initially build the table with the entries in big-endian order, and
230 * then swap them if necessary. This makes the arithmetic functions more
231 * amenable for use by @gcm_concat@ below.
232 */
233 if (!(p->f&GCMF_SWAP)) for (i = 0; i < p->n; i++) ktab[o + i] = k[i];
234 else for (i = 0; i < p->n; i++) ktab[o + i] = ENDSWAP32(k[i]);
235
236 /* Fill in the rest of the table by repeatedly multiplying the previous
237 * entry by %$t$%.
238 */
239 for (i = 1; i < 32*p->n; i++)
240 { j = (i ^ m)*p->n; mult(p, ktab + j, ktab + o); o = j; }
241
242 /* Finally, if the cipher uses a little-endian convention, then swap all of
243 * the individual words.
244 */
245 if (p->f&GCMF_SWAP)
246 for (i = 0; i < 32*p->n*p->n; i++) ktab[i] = ENDSWAP32(ktab[i]);
247}
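
/* An illustrative check, not part of the library (the function name is
 * invented for the example): with a big-endian parameter set (no
 * @GCMF_SWAP@), entry %$i$%, for %$0 \le i < 32 n$%, starts at
 * @ktab[i*p->n]@ and holds %$k t^i$%, which we can recompute with repeated
 * @mult@.
 */

static int mktable_entry_ok_p(const gcm_params *p, const uint32 *ktab,
			      const uint32 *k, unsigned i)
{
  uint32 t[GCM_NMAX];
  const uint32 *e = ktab + i*p->n;
  unsigned j;

  for (j = 0; j < p->n; j++) t[j] = k[j];
  while (i--) mult(p, t, t);
  for (j = 0; j < p->n; j++) if (t[j] != e[j]) return (0);
  return (1);
}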
248
249#if CPUFAM_X86 || CPUFAM_AMD64
250static void pclmul_mktable(const gcm_params *p,
251 uint32 *ktab, const uint32 *k)
252{
253 unsigned i, n = p->n;
254 unsigned nz;
255 uint32 k_over_t[GCM_NMAX], *t;
256
257 /* We need to divide the value by t (to compensate for the one-bit shift
258 * resulting from GCM's backwards bit ordering) and store the value in a
259 * way which is convenient for the assembler code to read back. That
260 * involves reordering the words, and, in the case of 96-bit blocks,
261 * padding with zeroes to fill out a 128-bit chunk.
262 */
263
264 if (!(p->f&GCMF_SWAP)) divt(p, k_over_t, k);
265 else {
266 for (i = 0; i < n; i++) k_over_t[i] = ENDSWAP32(k[i]);
267 divt(p, k_over_t, k_over_t);
268 }
269
270 if (n == 3) nz = 1;
271 else nz = 0;
272 k = k_over_t; t = ktab + n + nz; while (n--) *--t = *k++;
273 while (nz--) *--t = 0;
274}
275#endif
276
277#if CPUFAM_ARMEL
278static void arm_crypto_mktable(const gcm_params *p,
279 uint32 *ktab, const uint32 *k)
280{
281 unsigned i, n = p->n;
282 uint32 k_over_t[GCM_NMAX], *t;
283
284 /* We need to divide the value by t (to compensate for the one-bit shift
285 * resulting from GCM's backwards bit ordering) and store the value in a
286 * way which is convenient for the assembler code to read back. That
287 * involves swapping the bytes in each 64-bit lane.
288 */
289
290 if (!(p->f&GCMF_SWAP)) divt(p, k_over_t, k);
291 else {
292 for (i = 0; i < n; i++) k_over_t[i] = ENDSWAP32(k[i]);
293 divt(p, k_over_t, k_over_t);
294 }
295
296 t = ktab; k = k_over_t;
297 while (n >= 2) {
298 t[1] = k[0]; t[0] = k[1];
299 t += 2; k += 2; n -= 2;
300 }
301 if (n) { t[1] = k[0]; t[0] = 0; }
302}
303#endif
304
305#if CPUFAM_ARM64
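/* Reverse the order of the bits within each byte of @x@: for example,
 * @rbit32(0x80402010)@ is @0x01020408@.
 */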
306static uint32 rbit32(uint32 x)
307{
308 uint32 z, t;
309
310#if GCC_VERSION_P(4, 3)
311 /* Two tricks here. Firstly, two separate steps, rather than a single
312 * block of assembler, to allow finer-grained instruction scheduling.
313 * Secondly, use `ENDSWAP32' so that the compiler can cancel it if the
314 * caller actually wants the bytes reordered.
315 */
316 __asm__("rbit %w0, %w1" : "=r"(t) : "r"(x));
317 z = ENDSWAP32(t);
318#else
319 /* A generic but slightly clever implementation. */
320# define SWIZZLE(x, m, s) ((((x)&(m)) << (s)) | (((x)&~(m)) >> (s)))
321 /* 76543210 */
322 t = SWIZZLE(x, 0x0f0f0f0f, 4); /* 32107654 -- swap nibbles */
323 t = SWIZZLE(t, 0x33333333, 2); /* 10325476 -- swap bit pairs */
324 z = SWIZZLE(t, 0x55555555, 1); /* 01234567 -- swap adjacent bits */
325# undef SWIZZLE
326#endif
327 return (z);
328}
329
330static void arm64_pmull_mktable(const gcm_params *p,
331 uint32 *ktab, const uint32 *k)
332{
333 unsigned n = p->n;
334 uint32 *t;
335
336 /* We just need to store the value in a way which is convenient for the
337 * assembler code to read back. That involves two transformations:
338 *
339 * * firstly, reversing the order of the bits in each byte; and,
340 *
341 * * secondly, storing two copies of each 64-bit chunk.
342 *
343 * Note that, in this case, we /want/ the little-endian byte order of GCM,
344 * so endianness-swapping happens in the big-endian case.
345 */
346
347 t = ktab;
348 if (p->f&GCMF_SWAP) {
349 while (n >= 2) {
350 t[0] = t[2] = rbit32(k[0]);
351 t[1] = t[3] = rbit32(k[1]);
352 t += 4; k += 2; n -= 2;
353 }
354 if (n) { t[0] = t[2] = rbit32(k[0]); t[1] = t[3] = 0; }
355 } else {
356 while (n >= 2) {
357 t[0] = t[2] = ENDSWAP32(rbit32(k[0]));
358 t[1] = t[3] = ENDSWAP32(rbit32(k[1]));
359 t += 4; k += 2; n -= 2;
360 }
361 if (n) { t[0] = t[2] = ENDSWAP32(rbit32(k[0])); t[1] = t[3] = 0; }
362 }
363}
364#endif
365
366CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable,
367 (const gcm_params *p, uint32 *ktab, const uint32 *k),
368 (p, ktab, k),
369 pick_mktable, simple_mktable)
370
371static gcm_mktable__functype *pick_mktable(void)
372{
373#if CPUFAM_X86 || CPUFAM_AMD64
374 DISPATCH_PICK_COND(gcm_mktable, pclmul_mktable,
375 cpu_feature_p(CPUFEAT_X86_SSSE3) &&
376 cpu_feature_p(CPUFEAT_X86_PCLMUL));
377#endif
378#if CPUFAM_ARMEL
379 DISPATCH_PICK_COND(gcm_mktable, arm_crypto_mktable,
380 cpu_feature_p(CPUFEAT_ARM_PMULL));
381#endif
382#if CPUFAM_ARM64
383 DISPATCH_PICK_COND(gcm_mktable, arm64_pmull_mktable,
384 cpu_feature_p(CPUFEAT_ARM_PMULL));
385#endif
386 DISPATCH_PICK_FALLBACK(gcm_mktable, simple_mktable);
387}
388
389/* --- @recover_k@ --- *
390 *
391 * Arguments: @const gcm_params *p@ = pointer to the parameters
392 * @uint32 *k@ = block-sized vector in which to store %$k$%
393 * @const uint32 *ktab@ = the table encoding %$k$%
394 *
395 * Returns: ---
396 *
397 * Use: Recovers %$k$%, the secret from which @ktab@ was built by
398 * @gcm_mktable@, from the table, and stores it in internal
399 * (big-endian) form in @k@.
400 */
401
402static void simple_recover_k(const gcm_params *p,
403 uint32 *k, const uint32 *ktab)
404{
405 unsigned i;
406
407 /* If the blockcipher is big-endian, then the key is simply in the first
408 * table element, in the right format. If the blockcipher is little-endian
409 * then it's in element 24, and the bytes need swapping.
410 */
411
412 if (!(p->f&GCMF_SWAP)) for (i = 0; i < p->n; i++) k[i] = ktab[i];
413 else for (i = 0; i < p->n; i++) k[i] = ENDSWAP32(ktab[24*p->n + i]);
414}
415
416#if CPUFAM_X86 || CPUFAM_AMD64
417static void pclmul_recover_k(const gcm_params *p,
418 uint32 *k, const uint32 *ktab)
419{
420 unsigned n = p->n;
421 unsigned nz;
422 const uint32 *t;
423
424 /* The representation is already independent of the blockcipher endianness.
425 * We need to compensate for padding, reorder the words, and multiply by t
426 * to compensate for the factor of t we divided out earlier.
427 */
428
429 if (n == 3) nz = 1; else nz = 0;
430 t = ktab + n + nz;
431 while (n--) *k++ = *--t;
432 mult(p, k - p->n, k - p->n);
433}
434#endif
435
436#if CPUFAM_ARMEL
437static void arm_crypto_recover_k(const gcm_params *p,
438 uint32 *k, const uint32 *ktab)
439{
440 unsigned n = p->n;
441 const uint32 *t;
442
443 /* The representation is already independent of the blockcipher endianness.
444 * We only need to reorder the words, and multiply by t to compensate for
445 * the factor of t we divided out earlier.
446 */
447
448 t = ktab;
449 while (n >= 2) { k[1] = t[0]; k[0] = t[1]; t += 2; k += 2; n -= 2; }
450 if (n) { k[0] = t[1]; k++; n--; }
451 mult(p, k - p->n, k - p->n);
452}
453#endif
454
455#if CPUFAM_ARM64
456static void arm64_pmull_recover_k(const gcm_params *p,
457 uint32 *k, const uint32 *ktab)
458{
459 unsigned n = p->n;
460 const uint32 *t;
461
462 /* The representation is already independent of the blockcipher endianness.
463 * We need to skip the duplicate pieces, and unscramble the bytes.
464 */
465
466 t = ktab;
467 while (n >= 2) {
468 k[0] = ENDSWAP32(rbit32(t[0]));
469 k[1] = ENDSWAP32(rbit32(t[1]));
470 t += 4; k += 2; n -= 2;
471 }
472 if (n) k[0] = ENDSWAP32(rbit32(t[0]));
473}
474#endif
475
476CPU_DISPATCH(static, EMPTY, void, recover_k,
477 (const gcm_params *p, uint32 *k, const uint32 *ktab),
478 (p, k, ktab),
479 pick_recover_k, simple_recover_k)
480
481static recover_k__functype *pick_recover_k(void)
482{
483#if CPUFAM_X86 || CPUFAM_AMD64
484 DISPATCH_PICK_COND(recover_k, pclmul_recover_k,
485 cpu_feature_p(CPUFEAT_X86_SSSE3) &&
486 cpu_feature_p(CPUFEAT_X86_PCLMUL));
487#endif
488#if CPUFAM_ARMEL
489 DISPATCH_PICK_COND(recover_k, arm_crypto_recover_k,
490 cpu_feature_p(CPUFEAT_ARM_PMULL));
491#endif
492#if CPUFAM_ARM64
493 DISPATCH_PICK_COND(recover_k, arm64_pmull_recover_k,
494 cpu_feature_p(CPUFEAT_ARM_PMULL));
495#endif
496 DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k);
497}
498
499/* --- @gcm_mulk_N{b,l}@ --- *
500 *
501 * Arguments: @uint32 *a@ = accumulator to multiply
502 * @const uint32 *ktab@ = table constructed by @gcm_mktable@
503 *
504 * Returns: ---
505 *
506 * Use: Multiply @a@ by @k@ (implicitly represented in @ktab@),
507 * updating @a@ in-place. There are separate functions for each
508 * supported block size and endianness because this is the
509 * function whose performance actually matters.
510 */
511
512#if CPUFAM_X86 || CPUFAM_AMD64
513# define DECL_MULK_X86ISH(var) extern gcm_mulk_##var##__functype \
514 gcm_mulk_##var##_x86ish_pclmul_avx, \
515 gcm_mulk_##var##_x86ish_pclmul;
516# define PICK_MULK_X86ISH(var) do { \
517 DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul_avx, \
518 cpu_feature_p(CPUFEAT_X86_AVX) && \
519 cpu_feature_p(CPUFEAT_X86_PCLMUL) && \
520 cpu_feature_p(CPUFEAT_X86_SSSE3)); \
521 DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul, \
522 cpu_feature_p(CPUFEAT_X86_PCLMUL) && \
523 cpu_feature_p(CPUFEAT_X86_SSSE3)); \
524} while (0)
525#else
526# define DECL_MULK_X86ISH(var)
527# define PICK_MULK_X86ISH(var) do ; while (0)
528#endif
529
530#if CPUFAM_ARMEL
531# define DECL_MULK_ARM(var) \
532 extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm_crypto;
533# define PICK_MULK_ARM(var) do { \
534 DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm_crypto, \
535 cpu_feature_p(CPUFEAT_ARM_PMULL)); \
536} while (0)
537#else
538# define DECL_MULK_ARM(var)
539# define PICK_MULK_ARM(var) do ; while (0)
540#endif
541
542#if CPUFAM_ARM64
543# define DECL_MULK_ARM64(var) \
544 extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm64_pmull;
545# define PICK_MULK_ARM64(var) do { \
546 DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm64_pmull, \
547 cpu_feature_p(CPUFEAT_ARM_PMULL)); \
548} while (0)
549#else
550# define DECL_MULK_ARM64(var)
551# define PICK_MULK_ARM64(var) do ; while (0)
552#endif
553
554#define DEF_MULK(nbits) \
555 \
556CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##b, \
557 (uint32 *a, const uint32 *ktab), (a, ktab), \
558 pick_mulk_##nbits##b, simple_mulk_##nbits) \
559CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##l, \
560 (uint32 *a, const uint32 *ktab), (a, ktab), \
561 pick_mulk_##nbits##l, simple_mulk_##nbits) \
562 \
563static void simple_mulk_##nbits(uint32 *a, const uint32 *ktab) \
564{ \
565 uint32 m, t; \
566 uint32 z[nbits/32]; \
567 unsigned i, j, k; \
568 \
569 for (i = 0; i < nbits/32; i++) z[i] = 0; \
570 \
571 for (i = 0; i < nbits/32; i++) { \
572 t = a[i]; \
573 for (j = 0; j < 32; j++) { \
574 m = -((t >> 31)&1u); \
575 for (k = 0; k < nbits/32; k++) z[k] ^= *ktab++&m; \
576 t <<= 1; \
577 } \
578 } \
579 \
580 for (i = 0; i < nbits/32; i++) a[i] = z[i]; \
581} \
582 \
583DECL_MULK_X86ISH(nbits##b) \
584DECL_MULK_ARM(nbits##b) \
585DECL_MULK_ARM64(nbits##b) \
586static gcm_mulk_##nbits##b##__functype *pick_mulk_##nbits##b(void) \
587{ \
588 PICK_MULK_X86ISH(nbits##b); \
589 PICK_MULK_ARM(nbits##b); \
590 PICK_MULK_ARM64(nbits##b); \
591 DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits); \
592} \
593 \
594DECL_MULK_X86ISH(nbits##l) \
595DECL_MULK_ARM(nbits##l) \
596DECL_MULK_ARM64(nbits##l) \
597static gcm_mulk_##nbits##l##__functype *pick_mulk_##nbits##l(void) \
598{ \
599 PICK_MULK_X86ISH(nbits##l); \
600 PICK_MULK_ARM(nbits##l); \
601 PICK_MULK_ARM64(nbits##l); \
602 DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits); \
603}
604
605GCM_WIDTHS(DEF_MULK)
606
607#define GCM_MULK_CASE(nbits) \
608 case nbits/32: \
609 if (_f&GCMF_SWAP) gcm_mulk_##nbits##l(_a, _ktab); \
610 else gcm_mulk_##nbits##b(_a, _ktab); \
611 break;
612#define MULK(n, f, a, ktab) do { \
613 uint32 *_a = (a); const uint32 *_ktab = (ktab); \
614 unsigned _f = (f); \
615 switch (n) { \
616 GCM_WIDTHS(GCM_MULK_CASE) \
617 default: abort(); \
618 } \
619} while (0)
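
/* An illustrative consistency sketch, not used by the library (the function
 * name is invented for the example): for a big-endian parameter set,
 * multiplying an accumulator by %$k$% through a table built by
 * @gcm_mktable@ should agree with the slow @mul@ function above.
 */

static int mulk_matches_mul_p(const gcm_params *p,
			      const uint32 *a, const uint32 *k)
{
  uint32 ktab[32*GCM_NMAX*GCM_NMAX], u[GCM_NMAX], v[GCM_NMAX];
  unsigned i;

  gcm_mktable(p, ktab, k);
  for (i = 0; i < p->n; i++) u[i] = a[i];
  MULK(p->n, p->f, u, ktab);		/* table-driven: now %$u = a k$% */
  mul(p, v, a, k);			/* slow reference multiplication */
  for (i = 0; i < p->n; i++) if (u[i] != v[i]) return (0);
  return (1);
}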
620
621/*----- Other utilities ---------------------------------------------------*/
622
623/* --- @putlen@ --- *
624 *
625 * Arguments: @octet *p@ = pointer to output buffer
626 * @unsigned w@ = size of output buffer
627 * @unsigned blksz@ = block size (assumed fairly small)
628 * @unsigned long nblocks@ = number of blocks
629 * @unsigned nbytes@ = tail size in bytes (assumed small)
630 *
631 * Returns: ---
632 *
633 * Use: Store the overall length in %$\emph{bits}$% (i.e.,
634 * @3*(nblocks*blksz + nbytes)@ in big-endian form in the
635 * buffer @p@.
636 */
637
638static void putlen(octet *p, unsigned w, unsigned blksz,
639 unsigned long nblocks, unsigned nbytes)
640{
641 unsigned long nblo = nblocks&((1ul << (ULONG_BITS/2)) - 1),
642 nbhi = nblocks >> ULONG_BITS/2;
643 unsigned long nlo = nblo*blksz + nbytes, nhi = nbhi*blksz;
644
645 /* This is fiddly. Split @nblocks@, which is the big number, into high and
646 * low halves, multiply those separately by @blksz@, propagate carries, and
647 * then multiply by eight.
648 */
649 nhi += nlo >> ULONG_BITS/2;
650 nlo &= (1ul << (ULONG_BITS/2)) - 1;
651 nlo <<= 3;
652
653 /* Now write out the size, feeding bits in from @nhi@ as necessary. */
654 p += w;
655 while (w--) {
656 *--p = U8(nlo);
657 nlo = (nlo >> 8) | ((nhi&0xff) << (ULONG_BITS/2 - 5));
658 nhi >>= 8;
659 }
660}
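
/* A worked example of the encoding above: with @blksz = 16@, @nblocks = 5@
 * and @nbytes = 3@, the total is 5*16 + 3 = 83 bytes, i.e., 664 = 0x298
 * bits, so with @w = 8@ the buffer ends up holding the bytes
 * 00 00 00 00 00 00 02 98.
 */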
661
662/* --- @mix@ --- *
663 *
664 * Arguments: @const gcm_params *p@ = pointer to the parameters
665 * @uint32 *a@ = GHASH accumulator
666 * @const octet *q@ = pointer to an input block
667 * @const uint32 *ktab@ = multiplication table, built by
668 * @gcm_mktable@
669 *
670 * Returns: ---
671 *
672 * Use: Fold the block @q@ into the GHASH accumulator. The
673 * calculation is %$a' = k (a + q)$%.
674 */
675
676static void mix(const gcm_params *p, uint32 *a,
677 const octet *q, const uint32 *ktab)
678{
679 unsigned i;
680
681 if (p->f&GCMF_SWAP)
682 for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_L(q); q += 4; }
683 else
684 for (i = 0; i < p->n; i++) { a[i] ^= LOAD32_B(q); q += 4; }
685 MULK(p->n, p->f, a, ktab);
686}
687
688/* --- @gcm_ghashdone@ --- *
689 *
690 * Arguments: @const gcm_params *p@ = pointer to the parameters
691 * @uint32 *a@ = GHASH accumulator
692 * @const uint32 *ktab@ = multiplication table, built by
693 * @gcm_mktable@
694 * @unsigned long xblocks, yblocks@ = number of whole blocks in
695 * the two inputs
696 * @unsigned xbytes, ybytes@ = number of trailing bytes in the
697 * two inputs
698 *
699 * Returns: ---
700 *
701 * Use: Finishes a GHASH operation by appending the appropriately
702 * encoded lengths of the two constituent messages.
703 */
704
705void gcm_ghashdone(const gcm_params *p, uint32 *a, const uint32 *ktab,
706 unsigned long xblocks, unsigned xbytes,
707 unsigned long yblocks, unsigned ybytes)
708{
709 octet b[4*GCM_NMAX];
710 unsigned w = p->n < 3 ? 4*p->n : 2*p->n;
711
712 /* Construct the encoded lengths. Note that smaller-block versions of GCM
713 * encode the lengths in separate blocks. GCM is only officially defined
714 * for 64- and 128-bit blocks; I've placed the cutoff somewhat arbitrarily
715 * at 96 bits.
716 */
717 putlen(b, w, 4*p->n, xblocks, xbytes);
718 putlen(b + w, w, 4*p->n, yblocks, ybytes);
719
720 /* Feed the lengths into the accumulator. */
721 mix(p, a, b, ktab);
722 if (p->n < 3) mix(p, a, b + w, ktab);
723}
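
/* A usage sketch, not part of the library (the function name and arguments
 * are invented for the example): GHASH a header @h@ of @hblocks@ whole
 * blocks and a ciphertext @c@ of @cblocks@ whole blocks, assuming for
 * simplicity that neither has a partial trailing block.
 */

static void example_ghash(const gcm_params *p, uint32 *a, const uint32 *ktab,
			  const octet *h, unsigned long hblocks,
			  const octet *c, unsigned long cblocks)
{
  unsigned i;
  unsigned long j;

  for (i = 0; i < p->n; i++) a[i] = 0;
  for (j = 0; j < hblocks; j++) { mix(p, a, h, ktab); h += 4*p->n; }
  for (j = 0; j < cblocks; j++) { mix(p, a, c, ktab); c += 4*p->n; }
  gcm_ghashdone(p, a, ktab, hblocks, 0, cblocks, 0);
}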
724
725/* --- @gcm_concat@ --- *
726 *
727 * Arguments: @const gcm_params *p@ = pointer to the parameters
728 * @uint32 *z@ = GHASH accumulator for suffix, updated
729 * @const uint32 *x@ = GHASH accumulator for prefix
730 * @const uint32 *ktab@ = multiplication table, built by
731 * @gcm_mktable@
732 * @unsigned long n@ = length of suffix in whole blocks
733 *
734 * Returns: ---
735 *
736 * Use: On entry, @x@ and @z@ are the results of hashing two strings
737 * %$a$% and %$b$%, each a whole number of blocks long; in
738 * particular, %$b$% is @n@ blocks long. On exit, @z@ is
739 * updated to be the hash of %$a \cat b$%.
740 */
741
742void gcm_concat(const gcm_params *p, uint32 *z, const uint32 *x,
743 const uint32 *ktab, unsigned long n)
744{
745 uint32 t[GCM_NMAX], u[GCM_NMAX];
746 unsigned i, j;
747
748 if (!n) {
749 /* If @n@ is zero, then there's not much to do. The mathematics
750 * (explained below) still works, but the code takes a shortcut which
751 * doesn't handle this case: so set %$z' = z + x k^n = z + x$%.
752 */
753
754 for (j = 0; j < p->n; j++) z[j] ^= x[j];
755 } else {
756 /* We have %$x = a_0 t^m + \cdots + a_{m-2} t^2 + a_{m-1} t$% and
757 * %$z = b_0 t^n + \cdots + b_{n-2} t^2 + b_{n-1} t$%. What we'd like is
758 * the hash of %$a \cat b$%, which is %$z + x k^n$%.
759 *
760 * The first job, then, is to calculate %$k^n$%, and for this we use a
761 * simple left-to-right square-and-multiply algorithm. There's no need
762 * to keep %$n$% secret here.
763 */
764
765 /* Start by retrieving %$k$% from the table, and convert it to big-endian
766 * form.
767 */
768 recover_k(p, u, ktab);
769
770 /* Now calculate %$k^n$%. */
771 i = ULONG_BITS;
772#define BIT (1ul << (ULONG_BITS - 1))
773 while (!(n&BIT)) { n <<= 1; i--; }
774 n <<= 1; i--; for (j = 0; j < p->n; j++) t[j] = u[j];
775 while (i--) { mul(p, t, t, t); if (n&BIT) mul(p, t, t, u); n <<= 1; }
776#undef BIT
777
778 /* Next, calculate %$x k^n$%. If we're using a little-endian convention
779 * then we must convert %$x$%; otherwise we can just use it in place.
780 */
781 if (!(p->f&GCMF_SWAP))
782 mul(p, t, t, x);
783 else {
784 for (j = 0; j < p->n; j++) u[j] = ENDSWAP32(x[j]);
785 mul(p, t, t, u);
786 }
787
788 /* Finally, add %$x k^n$% onto %$z$%, converting back to little-endian if
789 * necessary.
790 */
791 if (!(p->f&GCMF_SWAP)) for (j = 0; j < p->n; j++) z[j] ^= t[j];
792 else for (j = 0; j < p->n; j++) z[j] ^= ENDSWAP32(t[j]);
793 }
794}
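
/* An illustrative consistency sketch, not used by the library (the function
 * name is invented for the example): the result of @gcm_concat@ should
 * match hashing the combined string directly.  Here @x@ is the hash of the
 * prefix, @z@ is the hash of the suffix alone, and @b@ points to the
 * suffix's @n@ whole blocks.
 */

static int concat_matches_direct_p(const gcm_params *p, const uint32 *x,
				   const uint32 *z, const octet *b,
				   unsigned long n, const uint32 *ktab)
{
  uint32 u[GCM_NMAX], v[GCM_NMAX];
  unsigned i;
  unsigned long j;

  for (i = 0; i < p->n; i++) { u[i] = x[i]; v[i] = z[i]; }
  for (j = 0; j < n; j++) { mix(p, u, b, ktab); b += 4*p->n; }
  gcm_concat(p, v, x, ktab, n);
  for (i = 0; i < p->n; i++) if (u[i] != v[i]) return (0);
  return (1);
}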
795
796/*----- Test rig ----------------------------------------------------------*/
797
798#ifdef TEST_RIG
799
800#include <mLib/macros.h>
801#include <mLib/quis.h>
802#include <mLib/testrig.h>
803
804#ifdef ENABLE_ASM_DEBUG
805# include "regdump.h"
806#endif
807
808static void report_failure(const char *test, unsigned nbits,
809 const char *ref, dstr v[], dstr *d)
810{
811 printf("test %s failed (nbits = %u)", test, nbits);
812 printf("\n\tx = "); type_hex.dump(&v[0], stdout);
813 printf("\n\ty = "); type_hex.dump(&v[1], stdout);
814 printf("\n\tz = "); type_hex.dump(&v[2], stdout);
815 printf("\n\t%s' = ", ref); type_hex.dump(d, stdout);
816 putchar('\n');
817}
818
819static void mulk(unsigned nbits, unsigned f, uint32 *x, const uint32 *ktab)
820 { MULK(nbits/32, f, x, ktab); }
821
822static int test_mul(uint32 poly, dstr v[])
823{
824 uint32 x[GCM_NMAX], y[GCM_NMAX], z[GCM_NMAX], ktab[32*GCM_NMAX*GCM_NMAX];
825 gcm_params p;
826 dstr d = DSTR_INIT;
827 unsigned i, nbits;
828 int ok = 1;
829 enum { I_x, I_y, I_z };
830
831 nbits = 8*v[0].len; p.f = 0; p.n = nbits/32; p.poly = poly;
832 dstr_ensure(&d, nbits/8); d.len = nbits/8;
833
834#define LOADXY(E) do { \
835 for (i = 0; i < nbits/32; i++) { \
836 x[i] = LOAD32_##E(v[I_x].buf + 4*i); \
837 y[i] = LOAD32_##E(v[I_y].buf + 4*i); \
838 } \
839} while (0)
840
841#define INITZ(x) do { \
842 for (i = 0; i < nbits/32; i++) z[i] = (x)[i]; \
843} while (0)
844
845#define CHECK(E, what, ref) do { \
846 for (i = 0; i < nbits/32; i++) STORE32_##E(d.buf + 4*i, z[i]); \
847 if (MEMCMP(d.buf, !=, v[I_##ref].buf, nbits/8)) \
848 { ok = 0; report_failure(what, nbits, #ref, v, &d); } \
849} while (0)
850
851#define TEST_PREP_1(E, x, y, what) do { \
852 gcm_mktable(&p, ktab, y); \
853 recover_k(&p, z, ktab); CHECK(B, "mktable/recover_k (" #y ")", y); \
854 INITZ(x); mulk(nbits, p.f, z, ktab); CHECK(E, what " (k = " #y ")", z); \
855} while (0)
856
857#define TEST_PREP(E, what) do { \
858 TEST_PREP_1(E, x, y, what); \
859 TEST_PREP_1(E, y, x, what); \
860} while (0)
861
862 /* First, test plain multiply. */
863 LOADXY(B); mul(&p, z, x, y); CHECK(B, "gcm_mul", z);
864
865 /* Next, test big-endian prepared key. */
866 LOADXY(B); TEST_PREP(B, "gcm_kmul_b");
867
868 /* Finally, test little-endian prepared key. */
869 p.f = GCMF_SWAP; LOADXY(L);
870 TEST_PREP(L, "gcm_kmul_l");
871
872#undef LOADXY
873#undef INITZ
874#undef CHECK
875#undef TEST_PREP_1
876#undef TEST_PREP
877
878 /* All done. */
879 return (ok);
880}
881
882#define TEST(nbits) \
883static int test_mul_##nbits(dstr v[]) \
884 { return (test_mul(GCM_POLY_##nbits, v)); }
885GCM_WIDTHS(TEST)
886#undef TEST
887
888static test_chunk defs[] = {
889#define TEST(nbits) \
890 { "gcm-mul" #nbits, test_mul_##nbits, \
891 { &type_hex, &type_hex, &type_hex, 0 } },
892GCM_WIDTHS(TEST)
893#undef TEST
894 { 0, 0, { 0 } }
895};
896
897int main(int argc, char *argv[])
898{
899 ego(argv[0]);
900#ifdef ENABLE_ASM_DEBUG
901 regdump_init();
902#endif
903 test_run(argc, argv, defs, SRCDIR"/t/gcm");
904 return (0);
905}
906
907#endif
908
909/*----- That's all, folks -------------------------------------------------*/