From ac4a43c1da1d1554247212289cf483934b9e6d1c Mon Sep 17 00:00:00 2001
From: Mark Wooding
Date: Tue, 16 Jan 2024 14:03:11 +0000
Subject: [PATCH] symm/gcm.c, symm/gcm-*.S, utils/gcm-ref: Replace one-bit
 shift with algebra.

The ARM32 and x86 instruction sets lack an instruction to reverse the
order of bits in each byte of a vector register.  Therefore, they
resolve the GCM bit-ordering nightmare by reordering the bytes and
working with reversed polynomials.  But when you multiply reversed
polynomials, the result ends up being shifted by one bit relative to
the answer you actually wanted -- and SIMD instruction sets are bad at
multiprecision bit shifts, so this involves a lot of work.

Instead, use algebra.  If the result is shifted by one bit position,
then it's been multiplied by the generator t.  We can therefore fix
this by dividing through by t.  Of course, this might not even be
possible working in the general ring of polynomials over GF(2), but
we're actually working in the GCM quotient field, and t definitely has
an inverse there.  Also, dividing by t will take time and effort -- but
in fact one of the operands is something we know ahead of time and get
to prepare in whatever way we like.  So, in particular, we could divide
it by t before we even start.

The result is that we get to delete a bunch of rather fiddly assembler
code, in favour of some fairly simple C setup (and extra compensation
work in `recover_k').

I stole this trick from PuTTY.
---
 symm/gcm-arm-crypto.S    | 179 +++++---------------
 symm/gcm-x86ish-pclmul.S | 416 ++++++++++++++---------------------------------
 symm/gcm.c               |  93 ++++++-----
 utils/gcm-ref            |  37 +++--
 4 files changed, 253 insertions(+), 472 deletions(-)

diff --git a/symm/gcm-arm-crypto.S b/symm/gcm-arm-crypto.S
index 8861e601..d5a58f89 100644
--- a/symm/gcm-arm-crypto.S
+++ b/symm/gcm-arm-crypto.S
@@ -67,7 +67,14 @@
 // = SUM_{0<=i,jn; i++) { t = x[i]; z[i] = (t >> 1) ^ c; c = t << 31; } }
+#if CPUFAM_X86 || CPUFAM_AMD64 || CPUFAM_ARMEL
+static void divt(const gcm_params *p, uint32 *z, const uint32 *x)
+{
+  uint32 m, c, t;
+  unsigned i;
+
+  t = x[0]; m = -((t >> 31)&1u); c = m&1u;
+  for (i = p->n - 1; i; i--) { t = x[i]; z[i] = (t << 1) | c; c = t >> 31; }
+  t = x[0]; z[0] = ((t ^ (m&p->poly)) << 1) | c;
+}
+#endif
+
 /* --- @mul@ --- *
  *
  * Arguments:   @const gcm_params *p@ = pointer to the parameters
@@ -238,22 +250,26 @@ static void simple_mktable(const gcm_params *p,
 static void pclmul_mktable(const gcm_params *p,
                            uint32 *ktab, const uint32 *k)
 {
-  unsigned n = p->n;
+  unsigned i, n = p->n;
   unsigned nz;
-  uint32 *t;
+  uint32 k_over_t[GCM_NMAX], *t;
 
-  /* We just need to store the value in a way which is convenient for the
-   * assembler code to read back.  That involves reordering the words, and,
-   * in the case of 96-bit blocks, padding with zeroes to fill out a 128-bit
-   * chunk.
+  /* We need to divide the value by t (to compensate for the one-bit shift
+   * resulting from GCM's backwards bit ordering) and store the value in a
+   * way which is convenient for the assembler code to read back.  That
+   * involves reordering the words, and, in the case of 96-bit blocks,
+   * padding with zeroes to fill out a 128-bit chunk.
    */
+  if (!(p->f&GCMF_SWAP)) divt(p, k_over_t, k);
+  else {
+    for (i = 0; i < n; i++) k_over_t[i] = ENDSWAP32(k[i]);
+    divt(p, k_over_t, k_over_t);
+  }
+
   if (n == 3) nz = 1;
   else nz = 0;
-  t = ktab + n + nz;
-
-  if (p->f&GCMF_SWAP) while (n--) { *--t = ENDSWAP32(*k); k++; }
-  else while (n--) *--t = *k++;
+  k = k_over_t; t = ktab + n + nz;
+  while (n--) *--t = *k++;
   while (nz--) *--t = 0;
 }
 #endif
@@ -262,28 +278,27 @@ static void pclmul_mktable(const gcm_params *p,
 static void arm_crypto_mktable(const gcm_params *p,
                                uint32 *ktab, const uint32 *k)
 {
-  unsigned n = p->n;
-  uint32 *t;
+  unsigned i, n = p->n;
+  uint32 k_over_t[GCM_NMAX], *t;
 
-  /* We just need to store the value in a way which is convenient for the
-   * assembler code to read back.  That involves swapping the bytes in each
-   * 64-bit lane.
+  /* We need to divide the value by t (to compensate for the one-bit shift
+   * resulting from GCM's backwards bit ordering) and store the value in a
+   * way which is convenient for the assembler code to read back.  That
+   * involves swapping the bytes in each 64-bit lane.
    */
-  t = ktab;
-  if (p->f&GCMF_SWAP) {
-    while (n >= 2) {
-      t[1] = ENDSWAP32(k[0]); t[0] = ENDSWAP32(k[1]);
-      t += 2; k += 2; n -= 2;
-    }
-    if (n) { t[1] = ENDSWAP32(k[0]); t[0] = 0; }
-  } else {
-    while (n >= 2) {
-      t[1] = k[0]; t[0] = k[1];
-      t += 2; k += 2; n -= 2;
-    }
-    if (n) { t[1] = k[0]; t[0] = 0; }
+  if (!(p->f&GCMF_SWAP)) divt(p, k_over_t, k);
+  else {
+    for (i = 0; i < n; i++) k_over_t[i] = ENDSWAP32(k[i]);
+    divt(p, k_over_t, k_over_t);
+  }
+
+  t = ktab; k = k_over_t;
+  while (n >= 2) {
+    t[1] = k[0]; t[0] = k[1];
+    t += 2; k += 2; n -= 2;
   }
+  if (n) { t[1] = k[0]; t[0] = 0; }
 }
 #endif
@@ -407,12 +422,14 @@ static void pclmul_recover_k(const gcm_params *p,
   const uint32 *t;
 
   /* The representation is already independent of the blockcipher endianness.
-   * We need to compensate for padding, and reorder the words.
+   * We need to compensate for padding, reorder the words, and multiply by t
+   * to compensate for the factor of t we divided out earlier.
    */
   if (n == 3) nz = 1; else nz = 0;
   t = ktab + n + nz;
   while (n--) *k++ = *--t;
+  mult(p, k - p->n, k - p->n);
 }
 #endif
@@ -424,12 +441,14 @@ static void arm_crypto_recover_k(const gcm_params *p,
   const uint32 *t;
 
   /* The representation is already independent of the blockcipher endianness.
-   * We only need to reorder the words.
+   * We only need to reorder the words, and multiply by t to compensate for
+   * the factor of t we divided out earlier.
    */
   t = ktab;
   while (n >= 2) { k[1] = t[0]; k[0] = t[1]; t += 2; k += 2; n -= 2; }
-  if (n) k[0] = t[1];
+  if (n) { k[0] = t[1]; k++; n--; }
+  mult(p, k - p->n, k - p->n);
 }
 #endif
diff --git a/utils/gcm-ref b/utils/gcm-ref
index 174e79e4..406c6595 100755
--- a/utils/gcm-ref
+++ b/utils/gcm-ref
@@ -123,6 +123,12 @@ def shift_left(x):
   p = poly(8*w)
   return gcm_mangle(C.GF.storel((C.GF.loadl(gcm_mangle(x)) << 1)%p))
 
+def shift_right(x):
+  """Given a field element X (in external format), return X/t."""
+  w = len(x)
+  p = poly(8*w)
+  return gcm_mangle(C.GF.storel((C.GF.loadl(gcm_mangle(x))*p.modinv(2))%p))
+
 def table_common(u, v, flip, getword, ixmask):
   """
   Multiply U by V using table lookup; common for `table-b' and `table-l'.
@@ -180,12 +186,12 @@ def demo_table_l(u, v):
 _i = iota()
 TAG_INPUT_U = _i()
 TAG_INPUT_V = _i()
+TAG_SHIFTED_V = _i()
 TAG_KPIECE_U = _i()
 TAG_KPIECE_V = _i()
 TAG_PRODPIECE = _i()
 TAG_PRODSUM = _i()
 TAG_PRODUCT = _i()
-TAG_SHIFTED = _i()
 TAG_REDCBITS = _i()
 TAG_REDCFULL = _i()
 TAG_REDCMIX = _i()
@@ -255,9 +261,9 @@ def present_gf_vmullp64(tag, wd, x, w, n, what):
   present_gf(y, (w + 63)&~63, n, what)
 
 def present_gf_pmull(tag, wd, x, w, n, what):
-  if tag == TAG_PRODPIECE or tag == TAG_REDCFULL or tag == TAG_SHIFTED:
+  if tag == TAG_PRODPIECE or tag == TAG_REDCFULL:
     return
-  elif tag == TAG_INPUT_V or tag == TAG_KPIECE_V:
+  elif tag == TAG_INPUT_V or tag == TAG_SHIFTED_V or tag == TAG_KPIECE_V:
     w = (w + 63)&~63
     bx = C.ReadBuffer(x.storeb(w/8))
     by = C.WriteBuffer()
@@ -431,8 +437,19 @@ def poly64_redc(y, presfn, dispwd, redcwd):
   ## And we're done.
   return z.storeb(w/8)
 
-def poly64_common(u, v, presfn, dispwd = 32, mulwd = 64,
-                  redcwd = 32, klimit = 256):
+def poly64_shiftcommon(u, v, presfn, dispwd = 32, mulwd = 64,
+                       redcwd = 32, klimit = 256):
+  w = 8*len(u)
+  presfn(TAG_INPUT_U, w, C.GF.loadb(u), w, dispwd, 'u')
+  presfn(TAG_INPUT_V, w, C.GF.loadb(v), w, dispwd, 'v')
+  vv = shift_right(v)
+  presfn(TAG_SHIFTED_V, w, C.GF.loadb(vv), w, dispwd, "v'")
+  y = poly64_mul(u, vv, presfn, dispwd, mulwd, klimit, "u", "v'")
+  z = poly64_redc(y, presfn, dispwd, redcwd)
+  return z
+
+def poly64_directcommon(u, v, presfn, dispwd = 32, mulwd = 64,
+                        redcwd = 32, klimit = 256):
   w = 8*len(u)
   presfn(TAG_INPUT_U, w, C.GF.loadb(u), w, dispwd, 'u')
   presfn(TAG_INPUT_V, w, C.GF.loadb(v), w, dispwd, 'v')
@@ -443,19 +460,19 @@ def poly64_common(u, v, presfn, dispwd = 32, mulwd = 64,
 
 @demo
 def demo_pclmul(u, v):
-  return poly64_common(u, v, presfn = present_gf_pclmul)
+  return poly64_shiftcommon(u, v, presfn = present_gf_pclmul)
 
 @demo
 def demo_vmullp64(u, v):
   w = 8*len(u)
-  return poly64_common(u, v, presfn = present_gf_vmullp64,
-                       redcwd = w%64 == 32 and 32 or 64)
+  return poly64_shiftcommon(u, v, presfn = present_gf_vmullp64,
+                            redcwd = w%64 == 32 and 32 or 64)
 
 @demo
 def demo_pmull(u, v):
   w = 8*len(u)
-  return poly64_common(u, v, presfn = present_gf_pmull,
-                       redcwd = w%64 == 32 and 32 or 64)
+  return poly64_directcommon(u, v, presfn = present_gf_pmull,
+                             redcwd = w%64 == 32 and 32 or 64)
 
 ###--------------------------------------------------------------------------
 ### @@@ Random debris to be deleted. @@@
-- 
2.11.0
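
As a rough illustration of the algebra the commit message leans on (not part
of the patch itself), here is a standalone Python sketch of the GCM field
GF(2^128) with p(t) = t^128 + t^7 + t^2 + t + 1, using plain ints with bit i
as the coefficient of t^i.  The names `gf_mul' and `shifted_mul' are
hypothetical helpers; `shifted_mul' merely models the reversed-polynomial
SIMD multiply that, as the commit message describes, picks up a stray factor
of t.  The sketch checks that t^{-1} = t^127 + t^6 + t + 1, and that feeding
the precomputed k/t to the shifted multiply gives back a*k, mirroring what
the new `*_mktable' functions arrange by calling `divt'.

P = (1 << 128) | (1 << 7) | (1 << 2) | (1 << 1) | 1   ## p(t)

def gf_mul(a, b):
  """Multiply A by B in GF(2)[t]/(p(t)) by schoolbook shift-and-reduce."""
  r = 0
  while b:
    if b & 1: r ^= a
    b >>= 1
    a <<= 1
    if a >> 128: a ^= P
  return r

## t is invertible because p(0) = 1: from p(t) = 0 we get
## t*(t^127 + t^6 + t + 1) = p(t) + 1 = 1.
T = 1 << 1
T_INV = (1 << 127) | (1 << 6) | (1 << 1) | 1
assert gf_mul(T, T_INV) == 1

## Model of the reversed-polynomial multiply: it returns a*b*t, not a*b.
def shifted_mul(a, b): return gf_mul(gf_mul(a, b), T)

a = 0x0123456789abcdef0123456789abcdef
k = 0xfedcba9876543210fedcba9876543210
k_over_t = gf_mul(k, T_INV)        ## precomputed once, as in `*_mktable'
assert shifted_mul(a, k_over_t) == gf_mul(a, k)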
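
Dividing by t does not need a general modular inverse: because p(t) has a
nonzero constant term, it reduces to a one-bit shift plus a conditional
reduction, which is essentially what the C `divt' above does word by word
(the reference `shift_right' gets the same effect by multiplying by
`p.modinv(2)').  The following self-contained sketch uses the same plain-int
convention as the previous one, with bit i as the coefficient of t^i, so the
shift directions are mirrored relative to the byte- and word-reversed layout
the real code handles; `mul_t' and `div_t' are illustrative names, not
Catacomb functions.

P = (1 << 128) | (1 << 7) | (1 << 2) | (1 << 1) | 1   ## p(t)

def mul_t(x):
  """Multiply X by t, reducing if the product spills past t^127."""
  x <<= 1
  if x >> 128: x ^= P
  return x

def div_t(x):
  """Divide X by t: add p(t) if needed to clear the t^0 bit, then shift."""
  if x & 1: x ^= P
  return x >> 1

x = 0x0123456789abcdef0123456789abcdef
assert div_t(mul_t(x)) == x and mul_t(div_t(x)) == x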