From 3709fba58c13ab6b9242ea1c63f7f559901dac30 Mon Sep 17 00:00:00 2001
From: Mark Wooding
Date: Mon, 15 Jan 2024 18:41:30 +0000
Subject: [PATCH] @@@ mess

---
 base/asm-common.h        |  16 ++
 symm/gcm-arm-crypto.S    | 185 ++++++----------------
 symm/gcm-x86ish-pclmul.S | 422 +++++++++++++++--------------------------------
 symm/gcm.c               |  93 ++++++-----
 utils/gcm-ref            |  99 ++++++-----
 5 files changed, 310 insertions(+), 505 deletions(-)

diff --git a/base/asm-common.h b/base/asm-common.h
index b4d4a909..0c209580 100644
--- a/base/asm-common.h
+++ b/base/asm-common.h
@@ -666,6 +666,22 @@ name:
 	vext.8	\vd, \vn, \vz, #\nbit >> 3
 .endm
 
+.macro vrol128 vd, vn, nbit
+	// Set VD to VN rotated left by NBIT.  NBIT must be a multiple of 8.
+  .if \nbit&7 != 0
+	.error "shift quantity must be whole number of bytes"
+  .endif
+	vext.8	\vd, \vn, \vn, #16 - (\nbit >> 3)
+.endm
+
+.macro vror128 vd, vn, nbit
+	// Set VD to VN rotated right by NBIT.  NBIT must be a multiple of 8.
+  .if \nbit&7 != 0
+	.error "shift quantity must be whole number of bytes"
+  .endif
+	vext.8	\vd, \vn, \vn, #\nbit >> 3
+.endm
+
 // Apply decoration decor to register name reg.
 #define _REGFORM(reg, decor) _GLUE(_REGFORM_, reg)(decor)
 
diff --git a/symm/gcm-arm-crypto.S b/symm/gcm-arm-crypto.S
index ddc714b3..12d6aa42 100644
--- a/symm/gcm-arm-crypto.S
+++ b/symm/gcm-arm-crypto.S
@@ -60,14 +60,21 @@
 	// u v = SUM_{0<=i,j<n} u_i v_j t^{i+j}

diff --git a/symm/gcm.c b/symm/gcm.c
--- a/symm/gcm.c
+++ b/symm/gcm.c
   for (i = 0; i < p->n; i++) { t = x[i]; z[i] = (t >> 1) ^ c; c = t << 31; }
 }
 
+#if CPUFAM_X86 || CPUFAM_AMD64 || CPUFAM_ARMEL
+static void divt(const gcm_params *p, uint32 *z, const uint32 *x)
+{
+  uint32 m, c, t;
+  unsigned i;
+
+  t = x[0]; m = -((t >> 31)&1u); c = m&1u;
+  for (i = p->n - 1; i; i--) { t = x[i]; z[i] = (t << 1) | c; c = t >> 31; }
+  t = x[0]; z[0] = ((t ^ (m&p->poly)) << 1) | c;
+}
+#endif
+
 /* --- @mul@ --- *
  *
  * Arguments:	@const gcm_params *p@ = pointer to the parameters
 
@@ -238,22 +250,26 @@ static void simple_mktable(const gcm_params *p,
 static void pclmul_mktable(const gcm_params *p,
 			   uint32 *ktab, const uint32 *k)
 {
-  unsigned n = p->n;
+  unsigned i, n = p->n;
   unsigned nz;
-  uint32 *t;
+  uint32 k_over_t[GCM_NMAX], *t;
 
-  /* We just need to store the value in a way which is convenient for the
-   * assembler code to read back.  That involves reordering the words, and,
-   * in the case of 96-bit blocks, padding with zeroes to fill out a 128-bit
-   * chunk.
+  /* We need to divide the value by t (to compensate for the one-bit shift
+   * resulting from GCM's backwards bit ordering) and store the value in a
+   * way which is convenient for the assembler code to read back.  That
+   * involves reordering the words, and, in the case of 96-bit blocks,
+   * padding with zeroes to fill out a 128-bit chunk.
    */
+  if (!(p->f&GCMF_SWAP)) divt(p, k_over_t, k);
+  else {
+    for (i = 0; i < n; i++) k_over_t[i] = ENDSWAP32(k[i]);
+    divt(p, k_over_t, k_over_t);
+  }
+
   if (n == 3) nz = 1;
   else nz = 0;
-  t = ktab + n + nz;
-
-  if (p->f&GCMF_SWAP) while (n--) { *--t = ENDSWAP32(*k); k++; }
-  else while (n--) *--t = *k++;
+  k = k_over_t; t = ktab + n + nz;
+  while (n--) *--t = *k++;
   while (nz--) *--t = 0;
 }
 #endif
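As a sanity check on the arithmetic, here is a rough Python rendition of the
two primitives, separate from the patch itself: divt transcribes the new C
function above, while mult is a reconstruction of the multiply-by-t routine
in gcm.c, of which only the final loop is visible as context here, so treat
it as an assumption; the sample values are illustrative.  Elements are lists
of 32-bit words, word 0 carrying the low-degree coefficients at its most
significant end (GCM's reflected convention), and poly is the top word of
the reduction polynomial, 0xe1000000 for the 128-bit field.

    M32 = 0xffffffff

    def mult(poly, x):
      ## Multiply the field element X by t: shift the reflected bit string
      ## one place rightwards, folding the outgoing bit back in through POLY.
      m = -(x[-1]&1)&M32; c = m&poly; z = []
      for t in x: z.append(((t >> 1) ^ c)&M32); c = (t << 31)&M32
      return z

    def divt(poly, x):
      ## Divide the field element X by t, undoing mult() step for step:
      ## clear POLY out of the top word if a set bit is about to fall off
      ## the end, then shift one place leftwards.
      m = -((x[0] >> 31)&1)&M32; c = m&1; z = list(x)
      for i in range(len(x) - 1, 0, -1):
        t = x[i]; z[i] = ((t << 1) | c)&M32; c = t >> 31
      t = x[0]; z[0] = (((t ^ (m&poly)) << 1) | c)&M32
      return z

    X = [0xdeadbeef, 0x01234567, 0x89abcdef, 0xfedcba98]
    assert mult(0xe1000000, divt(0xe1000000, X)) == X

The round trip is the point: the tables hold K/t precisely so that a later
multiplication by t lands back on K.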
@@ -262,28 +278,27 @@ static void arm_crypto_mktable(const gcm_params *p,
 			      uint32 *ktab, const uint32 *k)
 {
-  unsigned n = p->n;
-  uint32 *t;
+  unsigned i, n = p->n;
+  uint32 k_over_t[GCM_NMAX], *t;
 
-  /* We just need to store the value in a way which is convenient for the
-   * assembler code to read back.  That involves swapping the bytes in each
-   * 64-bit lane.
+  /* We need to divide the value by t (to compensate for the one-bit shift
+   * resulting from GCM's backwards bit ordering) and store the value in a
+   * way which is convenient for the assembler code to read back.  That
+   * involves swapping the bytes in each 64-bit lane.
    */
-  t = ktab;
-  if (p->f&GCMF_SWAP) {
-    while (n >= 2) {
-      t[1] = ENDSWAP32(k[0]); t[0] = ENDSWAP32(k[1]);
-      t += 2; k += 2; n -= 2;
-    }
-    if (n) { t[1] = ENDSWAP32(k[0]); t[0] = 0; }
-  } else {
-    while (n >= 2) {
-      t[1] = k[0]; t[0] = k[1];
-      t += 2; k += 2; n -= 2;
-    }
-    if (n) { t[1] = k[0]; t[0] = 0; }
+  if (!(p->f&GCMF_SWAP)) divt(p, k_over_t, k);
+  else {
+    for (i = 0; i < n; i++) k_over_t[i] = ENDSWAP32(k[i]);
+    divt(p, k_over_t, k_over_t);
+  }
+
+  t = ktab; k = k_over_t;
+  while (n >= 2) {
+    t[1] = k[0]; t[0] = k[1];
+    t += 2; k += 2; n -= 2;
   }
+  if (n) { t[1] = k[0]; t[0] = 0; }
 }
 #endif
 
@@ -407,12 +422,14 @@ static void pclmul_recover_k(const gcm_params *p,
   const uint32 *t;
 
   /* The representation is already independent of the blockcipher endianness.
-   * We need to compensate for padding, and reorder the words.
+   * We need to compensate for padding, reorder the words, and multiply by t
+   * to compensate for the factor of t we divided out earlier.
    */
 
   if (n == 3) nz = 1;
   else nz = 0;
   t = ktab + n + nz;
   while (n--) *k++ = *--t;
+  mult(p, k - p->n, k - p->n);
 }
 #endif
 
@@ -424,12 +441,14 @@ static void arm_crypto_recover_k(const gcm_params *p,
   const uint32 *t;
 
   /* The representation is already independent of the blockcipher endianness.
-   * We only need to reorder the words.
+   * We only need to reorder the words, and multiply by t to compensate for
+   * the factor of t we divided out earlier.
    */
 
   t = ktab;
   while (n >= 2) { k[1] = t[0]; k[0] = t[1]; t += 2; k += 2; n -= 2; }
-  if (n) k[0] = t[1];
+  if (n) { k[0] = t[1]; k++; n--; }
+  mult(p, k - p->n, k - p->n);
 }
 #endif
 
diff --git a/utils/gcm-ref b/utils/gcm-ref
index 4a53737b..406c6595 100755
--- a/utils/gcm-ref
+++ b/utils/gcm-ref
@@ -123,11 +123,17 @@ def shift_left(x):
   p = poly(8*w)
   return gcm_mangle(C.GF.storel((C.GF.loadl(gcm_mangle(x)) << 1)%p))
 
+def shift_right(x):
+  """Given a field element X (in external format), return X/t."""
+  w = len(x)
+  p = poly(8*w)
+  return gcm_mangle(C.GF.storel((C.GF.loadl(gcm_mangle(x))*p.modinv(2))%p))
+
 def table_common(u, v, flip, getword, ixmask):
   """
   Multiply U by V using table lookup; common for `table-b' and `table-l'.
 
-  This matches the `simple_mulk_...' implementation in `gcm.c'.  One-entry
+  This matches the `simple_mulk_...' implementation in `gcm.c'.  One entry
   per bit is the best we can manage if we want a constant-time
   implementation: processing n bits at a time means we need to scan (2^n -
   1)/n times as much memory.
@@ -140,7 +146,7 @@ def table_common(u, v, flip, getword, ixmask):
   are processed most-significant first.
 
   * IXMASK is a mask XORed into table indices to permute the table so that
-    it's order matches that induced by GETWORD.
+    its order matches that induced by GETWORD.
 
   The table is built such that tab[i XOR IXMASK] = U t^i.
 
@@ -172,7 +178,7 @@ def demo_table_b(u, v):
 @demo
 def demo_table_l(u, v):
   """Little-endian table lookup."""
-  return table_common(u, v, endswap_words, lambda b: b.getu32l(), 0x18)
+  return table_common(u, v, endswap_words_32, lambda b: b.getu32l(), 0x18)
 
 ###--------------------------------------------------------------------------
 ### Implementation using 64×64->128-bit binary polynomial multiplication.
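The identity that lets the tables absorb the shift: carry-less multiplication
of bit-reflected operands produces the bit-reflected product one position
short of where GCM wants it.  For n-bit a and b, rev_n(a).rev_n(b) =
rev_{2n-1}(a.b), so a pclmul- or vmull.p64-style product of reflected
operands is a factor of t too small in the reflected view.  That is what the
old `y = x << 1' fix-up, removed further down, compensated for after every
multiplication, and what dividing K by t now cancels once, up front.  A
self-contained sketch, with illustrative values:

    def clmul(a, b):
      ## Carry-less (GF(2) polynomial) product of the integers A and B.
      x = 0
      while b:
        if b&1: x ^= a
        a <<= 1; b >>= 1
      return x

    def rev(x, n):
      ## Reverse the low N bits of X.
      return int(format(x, '0%db' % n)[::-1], 2)

    n, a, b = 8, 0x53, 0xd1
    assert clmul(rev(a, n), rev(b, n)) == rev(clmul(a, b), 2*n - 1)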
@@ -180,12 +186,12 @@ def demo_table_l(u, v):
 _i = iota()
 TAG_INPUT_U = _i()
 TAG_INPUT_V = _i()
+TAG_SHIFTED_V = _i()
 TAG_KPIECE_U = _i()
 TAG_KPIECE_V = _i()
 TAG_PRODPIECE = _i()
 TAG_PRODSUM = _i()
 TAG_PRODUCT = _i()
-TAG_SHIFTED = _i()
 TAG_REDCBITS = _i()
 TAG_REDCFULL = _i()
 TAG_REDCMIX = _i()
@@ -237,7 +243,7 @@ def rev8(x):
   x = ((x&m_55) << 1) | ((x >> 1)&m_55)
   return x
 
-def present_gf_mullp64(tag, wd, x, w, n, what):
+def present_gf_vmullp64(tag, wd, x, w, n, what):
   if tag == TAG_PRODPIECE or tag == TAG_REDCFULL:
     return
   elif (wd == 128 or wd == 64) and TAG_PRODSUM <= tag <= TAG_PRODUCT:
@@ -255,9 +261,10 @@ def present_gf_mullp64(tag, wd, x, w, n, what):
   present_gf(y, (w + 63)&~63, n, what)
 
 def present_gf_pmull(tag, wd, x, w, n, what):
-  if tag == TAG_PRODPIECE or tag == TAG_REDCFULL or tag == TAG_SHIFTED:
+  if tag == TAG_PRODPIECE or tag == TAG_REDCFULL:
     return
-  elif tag == TAG_INPUT_V or tag == TAG_KPIECE_V:
+  elif tag == TAG_INPUT_V or tag == TAG_SHIFTED_V or tag == TAG_KPIECE_V:
+    w = (w + 63)&~63
     bx = C.ReadBuffer(x.storeb(w/8))
     by = C.WriteBuffer()
     while bx.left: chunk = bx.get(8); by.put(chunk).put(chunk)
@@ -280,10 +287,9 @@ def poly64_mul_simple(u, v, presfn, wd, dispwd, mulwd, uwhat, vwhat):
   ## We start by carving the operands into 64-bit pieces.  This is
   ## straightforward except for the 96-bit case, where we end up with two
   ## short pieces which we pad at the beginning.
-  if uw%mulwd: pad = (-uw)%mulwd; u += C.ByteString.zero(pad); uw += pad
-  if vw%mulwd: pad = (-uw)%mulwd; v += C.ByteString.zero(pad); vw += pad
-  uu = split_gf(u, mulwd)
-  vv = split_gf(v, mulwd)
+  upad = (-uw)%mulwd; u += C.ByteString.zero(upad); uw += upad
+  vpad = (-vw)%mulwd; v += C.ByteString.zero(vpad); vw += vpad
+  uu = split_gf(u, mulwd); vv = split_gf(v, mulwd)
 
   ## Report and accumulate the individual product pieces.
   x = C.GF(0)
@@ -300,7 +306,7 @@ def poly64_mul_simple(u, v, presfn, wd, dispwd, mulwd, uwhat, vwhat):
     x += t << (mulwd*i)
   presfn(TAG_PRODUCT, wd, x, uw + vw, dispwd, '%s %s' % (uwhat, vwhat))
 
-  return x
+  return x >> (upad + vpad)
 
 def poly64_mul_karatsuba(u, v, klimit, presfn, wd,
                          dispwd, mulwd, uwhat, vwhat):
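For comparison with the poly64_mul_karatsuba hunk below, here is one level of
the Karatsuba split over GF(2) in the same plain-integer style, reusing clmul
from the sketch above; the width and operands are illustrative.  Because
addition is XOR, the cross term u0 v1 + u1 v0 equals
(u0 + u1)(v0 + v1) + u0 v0 + u1 v1, so three half-width products do the work
of four:

    def karatsuba(u, v, w):
      ## Carry-less product of U and V, each at most 2W bits wide, using
      ## three W-bit multiplications instead of four.
      m = (1 << w) - 1
      u0, u1 = u&m, u >> w
      v0, v1 = v&m, v >> w
      low, high = clmul(u0, v0), clmul(u1, v1)
      mid = clmul(u0 ^ u1, v0 ^ v1) ^ low ^ high
      return low ^ (mid << w) ^ (high << (2*w))

    assert karatsuba(0x53, 0xd1, 4) == clmul(0x53, 0xd1)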
@@ -343,8 +349,7 @@ def poly64_mul_karatsuba(u, v, klimit, presfn, wd,
   presfn(TAG_PRODUCT, wd, x, 2*w, dispwd, '%s %s' % (uwhat, vwhat))
   return x
 
-def poly64_common(u, v, presfn, dispwd = 32, mulwd = 64, redcwd = 32,
-                  klimit = 256):
+def poly64_mul(u, v, presfn, dispwd, mulwd, klimit, uwhat, vwhat):
   """
   Multiply U by V using a primitive 64-bit binary polynomial multiplier.
 
@@ -353,28 +358,27 @@ def poly64_common(u, v, presfn, dispwd = 32, mulwd = 64, redcwd = 32,
 
   Operands arrive in a `register format', which is a byte-swapped variant
   of the external format.  Implementations differ on the precise details,
-  though.
+  though.  Returns the double-precision product.
   """
 
-  ## We work in two main phases: first, calculate the full double-width
-  ## product; and, second, reduce it modulo the field polynomial.
-
   w = 8*len(u); assert(w == 8*len(v))
-  p = poly(w)
-  presfn(TAG_INPUT_U, w, C.GF.loadb(u), w, dispwd, 'u')
-  presfn(TAG_INPUT_V, w, C.GF.loadb(v), w, dispwd, 'v')
+  x = poly64_mul_karatsuba(u, v, klimit, presfn,
+                           w, dispwd, mulwd, uwhat, vwhat)
 
-  ## So, on to the first part: the multiplication.
-  x = poly64_mul_karatsuba(u, v, klimit, presfn, w, dispwd, mulwd, 'u', 'v')
+  return x.storeb(w/4)
 
-  ## Now we have to shift everything up one bit to account for GCM's crazy
-  ## bit ordering.
-  y = x << 1
-  if w == 96: y >>= 64
-  presfn(TAG_SHIFTED, w, y, 2*w, dispwd, 'y')
+def poly64_redc(y, presfn, dispwd, redcwd):
+  """
+  Reduce a double-precision product Y modulo the appropriate polynomial.
+
+  The operand arrives in a `register format', which is a byte-swapped variant
+  of the external format.  Implementations differ on the precise details,
+  though.  Returns the single-precision reduced value.
+  """
+
+  w = 4*len(y)
+  p = poly(w)
 
-  ## Now for the reduction.
-  ##
-  ## Our polynomial has the form p = t^d + r where r = SUM_{0<=i<d} r_i t^i.
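The comment being moved here introduces the standard reduction fold: with
p = t^d + r, a double-precision product y = y1 t^d + y0 is congruent to
y1 r + y0 (mod p), and because r has small degree a couple of folds finish
the job.  A plain-integer sketch of the unreflected version, again reusing
clmul from earlier; the reflected register format changes the shifts in the
real code, not the algebra:

    def redc(y, d, r):
      ## Reduce the double-precision product Y modulo p = t^d + r by
      ## repeatedly substituting t^d = r.
      m = (1 << d) - 1
      while y >> d:
        y = clmul(y >> d, r) ^ (y&m)
      return y

    d, r = 128, 0x87                    ## p = t^128 + t^7 + t^2 + t + 1
    assert redc((1 << d) ^ 1, d, r) == r ^ 1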