3 * The GCM authenticated encryption mode
5 * (c) 2017 Straylight/Edgeware
8 /*----- Licensing notice --------------------------------------------------*
10 * This file is part of Catacomb.
12 * Catacomb is free software: you can redistribute it and/or modify it
13 * under the terms of the GNU Library General Public License as published
14 * by the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
17 * Catacomb is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Library General Public License for more details.
22 * You should have received a copy of the GNU Library General Public
23 * License along with Catacomb. If not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
28 /*----- Header files ------------------------------------------------------*/
34 #include <mLib/bits.h>
40 /*----- Overall strategy --------------------------------------------------*
42 * GCM is pretty awful to implement in software. (This presentation is going
43 * to be somewhat different to that in the specification, but I think it
44 * makes more sense like this.)
46 * We're given a %$w$%-bit blockcipher %$E$% with a key %$K$%.
48 * The main part is arithmetic in the finite field %$k = \gf{2^w}$%, which we
49 * represent as the quotient ring %$\gf{2}[t]/(p_w(t))$% for some irreducible
50 * degree-%$w$% polynomial %$p(t)$%, whose precise value isn't very important
51 * right now. We choose a secret point %$x = E_K(0^w)$%.
 * We choose a length size %$z$% as follows: if %$w < 96$% then %$z = w$%;
54 * otherwise %$z = w/2$%. Format a message pair as follows:
56 * %$F(a, b) = P_w(a) \cat P_w(b) \cat [\ell(a)]_z \cat [\ell(b)]_z$%
 * where %$P_w(x) = x \cat 0^n$% where %$0 \le n < w$% such that
59 * %$\ell(x) + n \equiv 0 \pmod{w}$%.
61 * Hash a (block-aligned) message %$u$% as follows. First, split %$u$% into
62 * %$w$%-bit blocks %$u_0$%, %$u_1$%, %%\ldots%%, %$u_{n-1}$%. Interpret
63 * these as elements of %$k$%. Then
65 * %$G_x(u) = u_0 t^n + u_1 t^{n-1} + \cdots + u_{n-1} t$%
67 * converted back to a %$w$%-bit string.
69 * We're ready to go now. Suppose we're to encrypt a message %$M$% with
70 * header %$H$% and nonce %$N$%. If %$\ell(N) + 32 = w$% then let
71 * %$N' = N$% and let %$i_0 = 1$%; otherwise, let %$U = G_t(F(\epsilon, N))$%
72 * and split this into %$N' = U[0 \bitsto w - 32]$% and
73 * %$[i_0]_{32} = U[w - 32 \bitsto w]$%.
75 * Let %$n = \lceil \ell(M)/w \rceil$%. Compute
77 * %$y_j = E_K(N' \cat [i_0 + j]_{32})$%
79 * for %$0 \le j \le n$%. Let
 * %$s = (y_1 \cat y_2 \cat \cdots \cat y_n)[0 \bitsto \ell(M)]$%
83 * Let %$C = M \xor s$% and let %$T = G_x(F(H, C)) \xor y_0$%. These are the
84 * ciphertext and tag respectively.
86 * So why is this awful?
88 * For one thing, the bits are in a completely terrible order. The bytes are
89 * arranged in little-endian order, so the unit coefficient is in the first
90 * byte, and the degree-127 coefficient is in the last byte. But within each
91 * byte, the lowest-degree coefficient is in the most significant bit. It's
92 * therefore better to think of GCM as using a big-endian byte-ordering
93 * convention, but with the bits backwards.
95 * But messing about with byte ordering is expensive, so let's not do that in
96 * the inner loop. But multiplication in %$k$% is not easy either. Some
97 * kind of precomputed table would be nice, but that will leak secrets
100 * I choose a particularly simple table: given %$x$%, let %$X[i'] = x t^i$%.
 * Then %$x y = \sum_{0\le i<w} y_i X[i']$%, which is just a bunch of
102 * bitmasking. But the natural order for examining bits of %$y$% is not
103 * necessarily the obvious one. We'll have already loaded %$y$% into
104 * internal form, as 32-bit words. The good order to process these is left
105 * to right, from high to low bits. But now the order of degrees depends on
106 * the endianness of our conversion of bytes to words. Oh, well.
108 * If we've adopted a big-endian convention, then we'll see the degrees in
109 * order, 0, 1, ..., all the way up to %$w - 1$% and everything is fine. If
110 * we've adopted a little-endian convention, though, we'll see an ordering
113 * 24, 25, ..., 31, 16, 17, ..., 23, 8, 9, ..., 15, 0, 1, ..., 7,
114 * 56, 57, ..., 63, 48, 49, ..., 55, 40, 41, ..., 47, 32, 33, ..., 39,
117 * which is the ordinary order with 0x18 = 24 XORed into the index. That is,
118 * %$i' = i$% if we've adopted a big-endian convention, and
119 * %$i' = i \xor 24$% if we've adopted a little-endian convention.
122 /*----- Low-level utilities -----------------------------------------------*/
126 * Arguments: @const gcm_params *p@ = pointer to the parameters
127 * @uint32 *z@ = where to write the result
128 * @const uint32 *x@ = input field element
132 * Use: Multiply the input field element by %$t$%, and write the
133 * product to @z@. It's safe for @x@ and @z@ to be equal, but
134 * they should not otherwise overlap. Both input and output are
135 * in big-endian form, i.e., with the lowest-degree coefficients
136 * in the most significant bits.
139 static void mult(const gcm_params
*p
, uint32
*z
, const uint32
*x
)
144 t
= x
[p
->n
- 1]; m
= -(t
&1u); c
= m
&p
->poly
;
145 for (i
= 0; i
< p
->n
; i
++) { t
= x
[i
]; z
[i
] = (t
>> 1) ^ c
; c
= t
<< 31; }
150 * Arguments: @const gcm_params *p@ = pointer to the parameters
151 * @uint32 *z@ = where to write the result
152 * @const uint32 *x, *y@ = input field elements
156 * Use: Multiply the input field elements together, and write the
157 * product to @z@. It's safe for the operands to overlap. Both
158 * inputs and the output are in big-endian form, i.e., with the
159 * lowest-degree coefficients in the most significant bits.
162 static void mul(const gcm_params
*p
, uint32
*z
,
163 const uint32
*x
, const uint32
*y
)
165 uint32 m
, t
, u
[GCM_NMAX
], v
[GCM_NMAX
];
168 /* We can't do this in-place at all, so use temporary space. Make a copy
169 * of @x@ in @u@, where we can clobber it, and build the product in @v@.
171 for (i
= 0; i
< p
->n
; i
++) { u
[i
] = x
[i
]; v
[i
] = 0; }
173 /* Repeatedly multiply @x@ (in @u@) by %$t$%, and add together those
174 * %$x t^i$% selected by the bits of @y@. This is basically what you get
175 * by streaming the result of @gcm_mktable@ into @gcm_mulk_...@.
177 for (i
= 0; i
< p
->n
; i
++) {
179 for (j
= 0; j
< 32; j
++) {
181 for (k
= 0; k
< p
->n
; k
++) v
[k
] ^= u
[k
]&m
;
182 mult(p
, u
, u
); t
<<= 1;
186 /* Write out the result now that it's ready. */
187 for (i
= 0; i
< p
->n
; i
++) z
[i
] = v
[i
];
190 /*----- Table-based multiplication ----------------------------------------*/
192 /* --- @gcm_mktable@ --- *
194 * Arguments: @const gcm_params *p@ = pointer to the parameters
195 * @uint32 *ktab@ = where to write the table; there must be
196 * space for %$32 n$% $%n$%-word entries, i.e.,
197 * %$32 n^2$% 32-bit words in total, where %$n$% is
198 * @p->n@, the block size in words
199 * @const uint32 *k@ = input field element
203 * Use: Construct a table for use by @gcm_mulk_...@ below, to
204 * multiply (vaguely) efficiently by @k@.
207 static void simple_mktable(const gcm_params
*p
,
208 uint32
*ktab
, const uint32
*k
)
210 unsigned m
= (p
->f
&GCMF_SWAP ?
0x18 : 0);
211 unsigned i
, j
, o
= m
*p
->n
;
213 /* As described above, the table stores entries %$K[i \xor m] = k t^i$%,
214 * where %$m = 0$% (big-endian cipher) or %$m = 24$% (little-endian).
215 * The first job is to store %$K[m] = k$%.
217 * We initially build the table with the entries in big-endian order, and
218 * then swap them if necessary. This makes the arithmetic functions more
219 * amenable for use by @gcm_concat@ below.
221 if (!(p
->f
&GCMF_SWAP
)) for (i
= 0; i
< p
->n
; i
++) ktab
[o
+ i
] = k
[i
];
222 else for (i
= 0; i
< p
->n
; i
++) ktab
[o
+ i
] = ENDSWAP32(k
[i
]);
224 /* Fill in the rest of the table by repeatedly multiplying the previous
227 for (i
= 1; i
< 32*p
->n
; i
++)
228 { j
= (i
^ m
)*p
->n
; mult(p
, ktab
+ j
, ktab
+ o
); o
= j
; }
230 /* Finally, if the cipher uses a little-endian convention, then swap all of
231 * the individual words.
234 for (i
= 0; i
< 32*p
->n
*p
->n
; i
++) ktab
[i
] = ENDSWAP32(ktab
[i
]);
#if CPUFAM_X86 || CPUFAM_AMD64
/* Build the key table used by the x86 PCLMUL assembler implementation.
 *
 * NOTE(review): the declarations and initialization of @n@, @nz@, and @t@
 * appear to have been lost from this extract; from the loops below, @t@
 * evidently starts just past the output region, @n@ counts key words, and
 * @nz@ counts zero-padding words -- confirm against the upstream source.
 */
static void pclmul_mktable(const gcm_params *p,
                           uint32 *ktab, const uint32 *k)

  /* We just need to store the value in a way which is convenient for the
   * assembler code to read back.  That involves reordering the words, and,
   * in the case of 96-bit blocks, padding with zeroes to fill out a
   * 128-bit chunk.
   */

  /* Copy the key a word at a time, reversing the word order; byte-swap
   * each word too if the blockcipher convention is little-endian.
   */
  if (p->f&GCMF_SWAP) while (n--) { *--t = ENDSWAP32(*k); k++; }
  else while (n--) *--t = *k++;
  while (nz--) *--t = 0;	/* zero padding (96-bit blocks) */
/* Build the key table used by the 32-bit ARM crypto assembler code.
 *
 * NOTE(review): the declarations of @t@ and @n@, the loop headers around
 * the paired copies, and the close of the @if@/@else@ branches are missing
 * from this extract; the second group of statements below presumably
 * belongs to the @else@ (big-endian) branch -- confirm against the
 * upstream source.
 */
static void arm_crypto_mktable(const gcm_params *p,
                               uint32 *ktab, const uint32 *k)

  /* We just need to store the value in a way which is convenient for the
   * assembler code to read back.  That involves swapping the bytes in each
   * word, and swapping the words within each pair.
   */
  if (p->f&GCMF_SWAP) {
    /* Little-endian cipher: swap bytes within words, and words within
     * pairs.  */
    t[1] = ENDSWAP32(k[0]); t[0] = ENDSWAP32(k[1]);
    t += 2; k += 2; n -= 2;
    /* Odd trailing word: store it high, zero its partner. */
    if (n) { t[1] = ENDSWAP32(k[0]); t[0] = 0; }
    /* (Presumed @else@ branch.)  Big-endian cipher: just swap the words
     * within each pair.  */
    t[1] = k[0]; t[0] = k[1];
    t += 2; k += 2; n -= 2;
    if (n) { t[1] = k[0]; t[0] = 0; }
/* --- @rbit32@ --- *
 *
 * Reverse the order of the bits in a 32-bit word.
 *
 * NOTE(review): the `#else'/`#endif' lines, the `return', and the
 * declarations of @t@ and @z@ are missing from this extract.
 */
static uint32 rbit32(uint32 x)

#if GCC_VERSION_P(4, 3)
  /* Two tricks here.  Firstly, two separate steps, rather than a single
   * block of assembler, to allow finer-grained instruction scheduling.
   * Secondly, use `ENDSWAP32' so that the compiler can cancel it if the
   * caller actually wants the bytes reordered.
   */
  __asm__("rbit %w0, %w1" : "=r"(t) : "r"(x));

  /* A generic but slightly clever implementation: swap progressively
   * smaller groups of bits.  */
# define SWIZZLE(x, m, s) ((((x)&(m)) << (s)) | (((x)&~(m)) >> (s)))
  t = SWIZZLE(x, 0x0f0f0f0f, 4);	/* 32107654 -- swap nibbles */
  t = SWIZZLE(t, 0x33333333, 2);	/* 10325476 -- swap bit pairs */
  z = SWIZZLE(t, 0x55555555, 1);	/* 01234567 -- swap adjacent bits */
/* Build the key table used by the AArch64 PMULL assembler code.
 *
 * NOTE(review): the declarations of @t@ and @n@, the loop headers, and the
 * @if@/@else@ bracketing around the two copies below are missing from this
 * extract; the second group of statements presumably belongs to the @else@
 * (big-endian) branch -- confirm against the upstream source.
 */
static void arm64_pmull_mktable(const gcm_params *p,
                                uint32 *ktab, const uint32 *k)

  /* We just need to store the value in a way which is convenient for the
   * assembler code to read back.  That involves two transformations:
   *
   *   * firstly, reversing the order of the bits in each byte; and,
   *
   *   * secondly, storing two copies of each 64-bit chunk.
   *
   * Note that, in this case, we /want/ the little-endian byte order of
   * GCM, so endianness-swapping happens in the big-endian case.
   */
  if (p->f&GCMF_SWAP) {
    /* Little-endian cipher: bit-reverse each byte; duplicate each 64-bit
     * chunk.  */
    t[0] = t[2] = rbit32(k[0]);
    t[1] = t[3] = rbit32(k[1]);
    t += 4; k += 2; n -= 2;
    /* Odd trailing word: duplicate it, zero its partner. */
    if (n) { t[0] = t[2] = rbit32(k[0]); t[1] = t[3] = 0; }
    /* (Presumed @else@ branch.)  Big-endian cipher: additionally swap the
     * bytes in each word.  */
    t[0] = t[2] = ENDSWAP32(rbit32(k[0]));
    t[1] = t[3] = ENDSWAP32(rbit32(k[1]));
    t += 4; k += 2; n -= 2;
    if (n) { t[0] = t[2] = ENDSWAP32(rbit32(k[0])); t[1] = t[3] = 0; }
/* Dispatch @gcm_mktable@ to the best implementation available at runtime,
 * falling back to the portable @simple_mktable@.
 *
 * NOTE(review): the CPU_DISPATCH argument-forwarding line, the function
 * braces, and the `#elif CPUFAM_ARM' / `#elif CPUFAM_ARM64' / `#endif'
 * preprocessor lines separating the picker's sections are missing from
 * this extract.
 */
CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable,
             (const gcm_params *p, uint32 *ktab, const uint32 *k),
             pick_mktable, simple_mktable)

static gcm_mktable__functype *pick_mktable(void)
#if CPUFAM_X86 || CPUFAM_AMD64
  /* Prefer the PCLMUL-based implementation when the CPU supports it. */
  DISPATCH_PICK_COND(gcm_mktable, pclmul_mktable,
                     cpu_feature_p(CPUFEAT_X86_SSSE3) &&
                     cpu_feature_p(CPUFEAT_X86_PCLMUL));
  DISPATCH_PICK_COND(gcm_mktable, arm_crypto_mktable,
                     cpu_feature_p(CPUFEAT_ARM_PMULL));
  DISPATCH_PICK_COND(gcm_mktable, arm64_pmull_mktable,
                     cpu_feature_p(CPUFEAT_ARM_PMULL));
  DISPATCH_PICK_FALLBACK(gcm_mktable, simple_mktable);
374 /* --- @recover_k@ --- *
376 * Arguments: @const gcm_params *p@ = pointer to the parameters
377 * @uint32 *k@ = block-sized vector in which to store %$k$%
378 * @const uint32 *ktab@ = the table encoding %$k$%
382 * Use: Recovers %$k$%, the secret from which @ktab@ was by
383 * @gcm_mktable@, from the table, and stores it in internal
384 * (big-endian) form in @k@.
387 static void simple_recover_k(const gcm_params
*p
,
388 uint32
*k
, const uint32
*ktab
)
392 /* If the blockcipher is big-endian, then the key is simply in the first
393 * table element, in the right format. If the blockcipher is little-endian
394 * then it's in element 24, and the bytes need swapping.
397 if (!(p
->f
&GCMF_SWAP
)) for (i
= 0; i
< p
->n
; i
++) k
[i
] = ktab
[i
];
398 else for (i
= 0; i
< p
->n
; i
++) k
[i
] = ENDSWAP32(ktab
[24*p
->n
+ i
]);
#if CPUFAM_X86 || CPUFAM_AMD64
/* Recover @k@ from the x86 PCLMUL key table.
 *
 * NOTE(review): the declarations of @n@, @nz@, and @t@, and the line
 * pointing @t@ just past the stored key, are missing from this extract --
 * confirm against the upstream source.
 */
static void pclmul_recover_k(const gcm_params *p,
                             uint32 *k, const uint32 *ktab)

  /* The representation is already independent of the blockcipher
   * endianness.  We need to compensate for padding, and reorder the words.
   */
  if (n == 3) nz = 1; else nz = 0;	/* one padding word for 96-bit blocks */
  while (n--) *k++ = *--t;		/* copy back, reversing word order */
/* Recover @k@ from the 32-bit ARM crypto key table.
 *
 * NOTE(review): the declarations of @t@ and @n@, and any handling of an
 * odd trailing word after the loop, are missing from this extract.
 */
static void arm_crypto_recover_k(const gcm_params *p,
                                 uint32 *k, const uint32 *ktab)

  /* The representation is already independent of the blockcipher
   * endianness.  We only need to reorder the words (pairs were swapped by
   * @arm_crypto_mktable@).
   */
  while (n >= 2) { k[1] = t[0]; k[0] = t[1]; t += 2; k += 2; n -= 2; }
/* Recover @k@ from the AArch64 PMULL key table.
 *
 * NOTE(review): the declarations of @t@ and @n@ are missing from this
 * extract -- confirm against the upstream source.
 */
static void arm64_pmull_recover_k(const gcm_params *p,
                                  uint32 *k, const uint32 *ktab)

  /* The representation is already independent of the blockcipher
   * endianness.  We need to skip the duplicate pieces, and unscramble the
   * bytes (undoing the per-byte bit reversal and the byte swap).
   */
  k[0] = ENDSWAP32(rbit32(t[0]));
  k[1] = ENDSWAP32(rbit32(t[1]));
  t += 4; k += 2; n -= 2;		/* step over the duplicated copy */
  if (n) k[0] = ENDSWAP32(rbit32(t[0]));	/* odd trailing word */
/* Dispatch @recover_k@ to the implementation matching the key-table layout
 * chosen by @gcm_mktable@, falling back to @simple_recover_k@.
 *
 * NOTE(review): the CPU_DISPATCH argument-forwarding line, the function
 * braces, and the `#elif CPUFAM_ARM' / `#elif CPUFAM_ARM64' / `#endif'
 * preprocessor lines are missing from this extract.
 */
CPU_DISPATCH(static, EMPTY, void, recover_k,
             (const gcm_params *p, uint32 *k, const uint32 *ktab),
             pick_recover_k, simple_recover_k)

static recover_k__functype *pick_recover_k(void)
#if CPUFAM_X86 || CPUFAM_AMD64
  /* The picked variant must agree with @pick_mktable@, since the table
   * layouts differ between implementations.  */
  DISPATCH_PICK_COND(recover_k, pclmul_recover_k,
                     cpu_feature_p(CPUFEAT_X86_SSSE3) &&
                     cpu_feature_p(CPUFEAT_X86_PCLMUL));
  DISPATCH_PICK_COND(recover_k, arm_crypto_recover_k,
                     cpu_feature_p(CPUFEAT_ARM_PMULL));
  DISPATCH_PICK_COND(recover_k, arm64_pmull_recover_k,
                     cpu_feature_p(CPUFEAT_ARM_PMULL));
  DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k);
480 /* --- @gcm_mulk_N{b,l}@ --- *
482 * Arguments: @uint32 *a@ = accumulator to multiply
483 * @const uint32 *ktab@ = table constructed by @gcm_mktable@
487 * Use: Multiply @a@ by @k@ (implicitly represented in @ktab@),
488 * updating @a@ in-place. There are separate functions for each
489 * supported block size and endianness because this is the
490 * function whose performance actually matters.
/* Declaration and dispatch-picking macro pairs for the width-specialized
 * assembler @gcm_mulk_...@ implementations, one pair per CPU family; the
 * empty variants are used when the family's assembler isn't available.
 *
 * NOTE(review): the `#else'/`#endif' lines, the ARM and ARM64 `#if's, and
 * the `} while (0)' terminator of PICK_MULK_X86ISH are missing from this
 * extract.
 */
#if CPUFAM_X86 || CPUFAM_AMD64
# define DECL_MULK_X86ISH(var) extern gcm_mulk_##var##__functype \
    gcm_mulk_##var##_x86ish_pclmul_avx, \
    gcm_mulk_##var##_x86ish_pclmul;
# define PICK_MULK_X86ISH(var) do { \
    DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul_avx, \
                       cpu_feature_p(CPUFEAT_X86_AVX) && \
                       cpu_feature_p(CPUFEAT_X86_PCLMUL) && \
                       cpu_feature_p(CPUFEAT_X86_SSSE3)); \
    DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul, \
                       cpu_feature_p(CPUFEAT_X86_PCLMUL) && \
                       cpu_feature_p(CPUFEAT_X86_SSSE3)); \
# define DECL_MULK_X86ISH(var)
# define PICK_MULK_X86ISH(var) do ; while (0)
# define DECL_MULK_ARM(var) \
    extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm_crypto;
# define PICK_MULK_ARM(var) do { \
    DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm_crypto, \
                       cpu_feature_p(CPUFEAT_ARM_PMULL)); \
# define DECL_MULK_ARM(var)
# define PICK_MULK_ARM(var) do ; while (0)
# define DECL_MULK_ARM64(var) \
    extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm64_pmull;
# define PICK_MULK_ARM64(var) do { \
    DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm64_pmull, \
                       cpu_feature_p(CPUFEAT_ARM_PMULL)); \
# define DECL_MULK_ARM64(var)
# define PICK_MULK_ARM64(var) do ; while (0)
/* Define, for width @nbits@, the portable bitwise multiply-by-@k@
 * implementation and the CPU dispatchers for both the big-endian (`b') and
 * little-endian (`l') entry points.  The inner loop mirrors @mul@ above,
 * but reads the precomputed %$k t^i$% entries from @ktab@.
 *
 * NOTE(review): several continuation lines are missing from this extract --
 * the simple implementation's declarations (apparently @m@, @t@, and loop
 * counters), the load of @t@ from @a[i]@, and assorted braces.
 */
#define DEF_MULK(nbits) \
CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##b, \
             (uint32 *a, const uint32 *ktab), (a, ktab), \
             pick_mulk_##nbits##b, simple_mulk_##nbits) \
CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##l, \
             (uint32 *a, const uint32 *ktab), (a, ktab), \
             pick_mulk_##nbits##l, simple_mulk_##nbits) \
static void simple_mulk_##nbits(uint32 *a, const uint32 *ktab) \
  uint32 z[nbits/32]; \
  for (i = 0; i < nbits/32; i++) z[i] = 0; \
  for (i = 0; i < nbits/32; i++) { \
    for (j = 0; j < 32; j++) { \
      m = -((t >> 31)&1u); \
      for (k = 0; k < nbits/32; k++) z[k] ^= *ktab++&m; \
  for (i = 0; i < nbits/32; i++) a[i] = z[i]; \
DECL_MULK_X86ISH(nbits##b) \
DECL_MULK_ARM(nbits##b) \
DECL_MULK_ARM64(nbits##b) \
static gcm_mulk_##nbits##b##__functype *pick_mulk_##nbits##b(void) \
  PICK_MULK_X86ISH(nbits##b); \
  PICK_MULK_ARM(nbits##b); \
  PICK_MULK_ARM64(nbits##b); \
  DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits); \
DECL_MULK_X86ISH(nbits##l) \
DECL_MULK_ARM(nbits##l) \
DECL_MULK_ARM64(nbits##l) \
static gcm_mulk_##nbits##l##__functype *pick_mulk_##nbits##l(void) \
  PICK_MULK_X86ISH(nbits##l); \
  PICK_MULK_ARM(nbits##l); \
  PICK_MULK_ARM64(nbits##l); \
  DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits); \
/* Dispatch a multiply-by-@k@ to the entry point for width @n@ (in words)
 * and endianness flag @f@, expanding one case per supported width via
 * @GCM_WIDTHS@.
 *
 * NOTE(review): the `switch' header and `case' label inside
 * GCM_MULK_CASE, the `default' case, and the closing `} } while (0)' of
 * MULK are missing from this extract.
 */
#define GCM_MULK_CASE(nbits) \
    if (_f&GCMF_SWAP) gcm_mulk_##nbits##l(_a, _ktab); \
    else gcm_mulk_##nbits##b(_a, _ktab); \
#define MULK(n, f, a, ktab) do { \
  uint32 *_a = (a); const uint32 *_ktab = (ktab); \
    GCM_WIDTHS(GCM_MULK_CASE) \
602 /*----- Other utilities ---------------------------------------------------*/
604 /* --- @putlen@ --- *
606 * Arguments: @octet *p@ = pointer to output buffer
607 * @unsigned w@ = size of output buffer
608 * @unsigned blksz@ = block size (assumed fairly small)
609 * @unsigned long nblocks@ = number of blocks
610 * @unsigned nbytes@ = tail size in bytes (assumed small)
614 * Use: Store the overall length in %$\emph{bits}$% (i.e.,
 * @8*(nblocks*blksz + nbytes)@ in big-endian form in the
/* Encode the overall message length, in bits, into @w@ octets at @p@,
 * big-endian.
 *
 * NOTE(review): this extract appears to be missing the multiply-by-eight
 * step and the loop which actually stores the octets; the surviving
 * shift/merge line looks like the per-octet step from inside that loop.
 * Confirm against the upstream source.
 */
static void putlen(octet *p, unsigned w, unsigned blksz,
                   unsigned long nblocks, unsigned nbytes)
  /* Split @nblocks@ into half-word pieces so the multiplications by
   * @blksz@ can't overflow an @unsigned long@.
   */
  unsigned long nblo = nblocks&((1ul << (ULONG_BITS/2)) - 1),
    nbhi = nblocks >> ULONG_BITS/2;
  unsigned long nlo = nblo*blksz + nbytes, nhi = nbhi*blksz;

  /* This is fiddly.  Split @nblocks@, which is the big number, into high
   * and low halves, multiply those separately by @blksz@, propagate
   * carries, and then multiply by eight.
   */
  nhi += nlo >> ULONG_BITS/2;
  nlo &= (1ul << (ULONG_BITS/2)) - 1;

  /* Now write out the size, feeding bits in from @nhi@ as necessary. */
  nlo = (nlo >> 8) | ((nhi&0xff) << (ULONG_BITS/2 - 5));
645 * Arguments: @const gcm_params *p@ = pointer to the parameters
646 * @uint32 *a@ = GHASH accumulator
647 * @const octet *q@ = pointer to an input block
648 * @const uint32 *ktab@ = multiplication table, built by
653 * Use: Fold the block @q@ into the GHASH accumulator. The
654 * calculation is %$a' = k (a + q)$%.
657 static void mix(const gcm_params
*p
, uint32
*a
,
658 const octet
*q
, const uint32
*ktab
)
663 for (i
= 0; i
< p
->n
; i
++) { a
[i
] ^= LOAD32_L(q
); q
+= 4; }
665 for (i
= 0; i
< p
->n
; i
++) { a
[i
] ^= LOAD32_B(q
); q
+= 4; }
666 MULK(p
->n
, p
->f
, a
, ktab
);
669 /* --- @gcm_ghashdone@ --- *
671 * Arguments: @const gcm_params *p@ = pointer to the parameters
672 * @uint32 *a@ = GHASH accumulator
673 * @const uint32 *ktab@ = multiplication table, built by
675 * @unsigned long xblocks, yblocks@ = number of whole blocks in
677 * @unsigned xbytes, ybytes@ = number of trailing bytes in the
682 * Use: Finishes a GHASH operation by appending the appropriately
683 * encoded lengths of the two constituent messages.
686 void gcm_ghashdone(const gcm_params
*p
, uint32
*a
, const uint32
*ktab
,
687 unsigned long xblocks
, unsigned xbytes
,
688 unsigned long yblocks
, unsigned ybytes
)
691 unsigned w
= p
->n
< 3 ?
4*p
->n
: 2*p
->n
;
693 /* Construct the encoded lengths. Note that smaller-block versions of GCM
694 * encode the lengths in separate blocks. GCM is only officially defined
695 * for 64- and 128-bit blocks; I've placed the cutoff somewhat arbitrarily
698 putlen(b
, w
, 4*p
->n
, xblocks
, xbytes
);
699 putlen(b
+ w
, w
, 4*p
->n
, yblocks
, ybytes
);
701 /* Feed the lengths into the accumulator. */
703 if (p
->n
< 3) mix(p
, a
, b
+ w
, ktab
);
706 /* --- @gcm_concat@ --- *
708 * Arguments: @const gcm_params *p@ = pointer to the parameters
709 * @uint32 *z@ = GHASH accumulator for suffix, updated
710 * @const uint32 *x@ = GHASH accumulator for prefix
711 * @const uint32 *ktab@ = multiplication table, built by
713 * @unsigned long n@ = length of suffix in whole blocks
717 * Use: On entry, @x@ and @z@ are the results of hashing two strings
718 * %$a$% and %$b$%, each a whole number of blocks long; in
719 * particular, %$b$% is @n@ blocks long. On exit, @z@ is
720 * updated to be the hash of %$a \cat b$%.
723 void gcm_concat(const gcm_params
*p
, uint32
*z
, const uint32
*x
,
724 const uint32
*ktab
, unsigned long n
)
726 uint32 t
[GCM_NMAX
], u
[GCM_NMAX
];
730 /* If @n@ is zero, then there's not much to do. The mathematics
731 * (explained below) still works, but the code takes a shortcut which
732 * doesn't handle this case: so set %$z' = z + x k^n = z + x$%.
735 for (j
= 0; j
< p
->n
; j
++) z
[j
] ^= x
[j
];
737 /* We have %$x = a_0 t^m + \cdots + a_{m-2} t^2 + a_{m-1} t$% and
738 * %$z = b_0 t^n + \cdots + b_{n-2} t^2 + b_{n-1} t$%. What we'd like is
739 * the hash of %$a \cat b$%, which is %$z + x k^n$%.
741 * The first job, then, is to calculate %$k^n$%, and for this we use a
742 * simple left-to-right square-and-multiply algorithm. There's no need
743 * to keep %$n$% secret here.
746 /* Start by retrieving %$k$% from the table, and convert it to big-endian
749 recover_k(p
, u
, ktab
);
751 /* Now calculate %$k^n$%. */
753 #define BIT (1ul << (ULONG_BITS - 1))
754 while (!(n
&BIT
)) { n
<<= 1; i
--; }
755 n
<<= 1; i
--; for (j
= 0; j
< p
->n
; j
++) t
[j
] = u
[j
];
756 while (i
--) { mul(p
, t
, t
, t
); if (n
&BIT
) mul(p
, t
, t
, u
); n
<<= 1; }
759 /* Next, calculate %$x k^n$%. If we're using a little-endian convention
760 * then we must convert %$x$%; otherwise we can just use it in place.
762 if (!(p
->f
&GCMF_SWAP
))
765 for (j
= 0; j
< p
->n
; j
++) u
[j
] = ENDSWAP32(x
[j
]);
769 /* Finally, add %$x k^n$% onto %$z$%, converting back to little-endian if
772 if (!(p
->f
&GCMF_SWAP
)) for (j
= 0; j
< p
->n
; j
++) z
[j
] ^= t
[j
];
773 else for (j
= 0; j
< p
->n
; j
++) z
[j
] ^= ENDSWAP32(t
[j
]);
777 /*----- Test rig ----------------------------------------------------------*/
781 #include <mLib/macros.h>
782 #include <mLib/quis.h>
783 #include <mLib/testrig.h>
/* Print a failure report for @test@ at width @nbits@: dump the operand
 * vectors from @v@ and the value @d@ actually computed for reference
 * @ref@.
 *
 * NOTE(review): the function braces, and presumably a final newline
 * print, are missing from this extract.
 */
static void report_failure(const char *test, unsigned nbits,
                           const char *ref, dstr v[], dstr *d)
  printf("test %s failed (nbits = %u)", test, nbits);
  printf("\n\tx = "); type_hex.dump(&v[0], stdout);
  printf("\n\ty = "); type_hex.dump(&v[1], stdout);
  printf("\n\tz = "); type_hex.dump(&v[2], stdout);
  printf("\n\t%s' = ", ref); type_hex.dump(d, stdout);
796 static void mulk(unsigned nbits
, unsigned f
, uint32
*x
, const uint32
*ktab
)
797 { MULK(nbits
/32, f
, x
, ktab
); }
/* Exercise the multiplication machinery over the reduction polynomial
 * @poly@ against the test vector @v@ (operands x and y, expected product
 * z); reports failures via @report_failure@.
 *
 * NOTE(review): this extract appears to be missing the declarations of
 * @p@, @nbits@, @i@, @d@, and @ok@, the `} while (0)' terminators of the
 * local macros (and their `#undef's), and the cleanup/return code --
 * confirm against the upstream source.
 */
static int test_mul(uint32 poly, dstr v[])
  uint32 x[GCM_NMAX], y[GCM_NMAX], z[GCM_NMAX], ktab[32*GCM_NMAX*GCM_NMAX];
  enum { I_x, I_y, I_z };		/* test-vector argument indices */

  /* Derive the parameters from the vector length: initially big-endian. */
  nbits = 8*v[0].len; p.f = 0; p.n = nbits/32; p.poly = poly;
  dstr_ensure(&d, nbits/8); d.len = nbits/8;

  /* Load the operands x and y from the vector with endianness E. */
#define LOADXY(E) do { \
  for (i = 0; i < nbits/32; i++) { \
    x[i] = LOAD32_##E(v[I_x].buf + 4*i); \
    y[i] = LOAD32_##E(v[I_y].buf + 4*i); \
  /* Initialize the working accumulator z from operand x. */
#define INITZ(x) do { \
  for (i = 0; i < nbits/32; i++) z[i] = (x)[i]; \
  /* Store z with endianness E and compare against reference `ref'. */
#define CHECK(E, what, ref) do { \
  for (i = 0; i < nbits/32; i++) STORE32_##E(d.buf + 4*i, z[i]); \
  if (MEMCMP(d.buf, !=, v[I_##ref].buf, nbits/8)) \
    { ok = 0; report_failure(what, nbits, #ref, v, &d); } \
  /* Build a key table from y, check it round-trips through recover_k,
   * then check that multiplying x by the table gives the product. */
#define TEST_PREP_1(E, x, y, what) do { \
  gcm_mktable(&p, ktab, y); \
  recover_k(&p, z, ktab); CHECK(B, "mktable/recover_k (" #y ")", y); \
  INITZ(x); mulk(nbits, p.f, z, ktab); CHECK(E, what " (k = " #y ")", z); \
  /* Run the prepared-key test in both operand orders. */
#define TEST_PREP(E, what) do { \
  TEST_PREP_1(E, x, y, what); \
  TEST_PREP_1(E, y, x, what); \

  /* First, test plain multiply. */
  LOADXY(B); mul(&p, z, x, y); CHECK(B, "gcm_mul", z);

  /* Next, test big-endian prepared key. */
  LOADXY(B); TEST_PREP(B, "gcm_kmul_b");

  /* Finally, test little-endian prepared key. */
  p.f = GCMF_SWAP; LOADXY(L);
  TEST_PREP(L, "gcm_kmul_l");
/* Define a per-width test driver calling @test_mul@ with the matching
 * reduction polynomial.
 * NOTE(review): the `GCM_WIDTHS(TEST)' expansion and matching `#undef
 * TEST' appear to be missing from this extract.
 */
#define TEST(nbits) \
  static int test_mul_##nbits(dstr v[]) \
  { return (test_mul(GCM_POLY_##nbits, v)); }
/* Table of test chunks for the mLib test rig: one `gcm-mulNNN' entry per
 * supported block width, each taking three hex arguments (x, y, z).
 * NOTE(review): the `GCM_WIDTHS(TEST)' expansion and the terminating
 * sentinel entry appear to be missing from this extract.
 */
static test_chunk defs[] = {
#define TEST(nbits) \
  { "gcm-mul" #nbits, test_mul_##nbits, \
    { &type_hex, &type_hex, &type_hex, 0 } },
874 int main(int argc
, char *argv
[])
877 test_run(argc
, argv
, defs
, SRCDIR
"/t/gcm");
883 /*----- That's all, folks -------------------------------------------------*/