From 9e6a4409d58d1ed9dfe2de3c6ffaee822e051c9f Mon Sep 17 00:00:00 2001 From: Mark Wooding Date: Tue, 13 Nov 2018 11:28:53 +0000 Subject: [PATCH] symm/gcm-*.S: GCM acceleration using hardware polynomial multiplication. Add assembler implementations of the low-level GCM arithmetic which make use of polynomial multiplication instructions on x86 (the delightfully named `pclmul{l,h}q{l,h}dq' instructions) and ARM processors (the ARM32 `vmull.p64' and ARM64 `pmull{,2}' instructions). Of course, this involves adding the necessary CPU feature detection. GCM's bit and byte order is remarkably confusing. I've tried quite hard to write the code so as to help the reader keep track of which bits are where, but it's very difficult. There's also a Python implementation which has proven invaluable while debugging these things. --- base/dispatch.c | 18 +- base/dispatch.h | 5 +- symm/Makefile.am | 12 + symm/gcm-arm-crypto.S | 718 +++++++++++++++++++++++++++++++ symm/gcm-arm64-pmull.S | 661 ++++++++++++++++++++++++++++ symm/gcm-x86ish-pclmul.S | 1073 ++++++++++++++++++++++++++++++++++++++++++++++ symm/gcm.c | 263 +++++++++++- utils/gcm-ref | 502 ++++++++++++++++++++++ 8 files changed, 3246 insertions(+), 6 deletions(-) create mode 100644 symm/gcm-arm-crypto.S create mode 100644 symm/gcm-arm64-pmull.S create mode 100644 symm/gcm-x86ish-pclmul.S create mode 100755 utils/gcm-ref diff --git a/base/dispatch.c b/base/dispatch.c index 9ba6a7cd..9f2ac71b 100644 --- a/base/dispatch.c +++ b/base/dispatch.c @@ -46,6 +46,8 @@ # define EFLAGS_ID (1u << 21) # define CPUID1D_SSE2 (1u << 26) # define CPUID1D_FXSR (1u << 24) +# define CPUID1C_PCLMUL (1u << 1) +# define CPUID1C_SSSE3 (1u << 9) # define CPUID1C_AESNI (1u << 25) # define CPUID1C_AVX (1u << 28) # define CPUID1C_RDRAND (1u << 30) @@ -282,13 +284,15 @@ static unsigned hwcaps = 0; _(ARM_NEON, "arm:neon") \ _(ARM_V4, "arm:v4") \ _(ARM_D32, "arm:d32") \ - _(ARM_AES, "arm:aes") + _(ARM_AES, "arm:aes") \ + _(ARM_PMULL, "arm:pmull") #endif #if CPUFAM_ARM64 # define WANTAUX(_) \ WANT_AT_HWCAP(_) # define CAPMAP(_) \ - _(ARM_AES, "arm:aes") + _(ARM_AES, "arm:aes") \ + _(ARM_PMULL, "arm:pmull") #endif /* Build the bitmask for `hwcaps' from the `CAPMAP' list. */ @@ -402,9 +406,13 @@ static void probe_hwcaps(void) # ifdef HWCAP2_AES if (probed.hwcap2 & HWCAP2_AES) hw |= HF_ARM_AES; # endif +# ifdef HWCAP2_PMULL + if (probed.hwcap2 & HWCAP2_PMULL) hw |= HF_ARM_PMULL; +# endif #endif #if CPUFAM_ARM64 if (probed.hwcap & HWCAP_AES) hw |= HF_ARM_AES; + if (probed.hwcap & HWCAP_PMULL) hw |= HF_ARM_PMULL; #endif /* Store the bitmask of features we probed for everyone to see. 
*/ @@ -549,6 +557,12 @@ int cpu_feature_p(int feat) CASE_CPUFEAT(X86_AVX, "x86:avx", xmm_registers_available_p() && cpuid_features_p(0, CPUID1C_AVX)); + CASE_CPUFEAT(X86_SSSE3, "x86:ssse3", + xmm_registers_available_p() && + cpuid_features_p(0, CPUID1C_SSSE3)); + CASE_CPUFEAT(X86_PCLMUL, "x86:pclmul", + xmm_registers_available_p() && + cpuid_features_p(0, CPUID1C_PCLMUL)); #endif #ifdef CAPMAP # define FEATP__CASE(feat, tok) \ diff --git a/base/dispatch.h b/base/dispatch.h index dae6a689..7c083821 100644 --- a/base/dispatch.h +++ b/base/dispatch.h @@ -182,7 +182,10 @@ enum { CPUFEAT_ARM_D32, /* 32 double registers, not 16 */ CPUFEAT_X86_RDRAND, /* Built-in entropy source */ CPUFEAT_ARM_AES, /* AES instructions */ - CPUFEAT_X86_AVX /* AVX 1 (i.e., 256-bit YMM regs) */ + CPUFEAT_X86_AVX, /* AVX 1 (i.e., 256-bit YMM regs) */ + CPUFEAT_X86_SSSE3, /* Supplementary SSE 3 */ + CPUFEAT_X86_PCLMUL, /* Carry-less multiplication */ + CPUFEAT_ARM_PMULL /* Polynomial multiplication */ }; extern int cpu_feature_p(int /*feat*/); diff --git a/symm/Makefile.am b/symm/Makefile.am index cb20f3a9..a4f45e9d 100644 --- a/symm/Makefile.am +++ b/symm/Makefile.am @@ -323,6 +323,18 @@ BLKCMACMODES += cmac pmac1 pkginclude_HEADERS += ocb.h BLKCAEADMODES += ccm eax gcm ocb1 ocb3 libsymm_la_SOURCES += ccm.c gcm.c ocb.c +if CPUFAM_X86 +libsymm_la_SOURCES += gcm-x86ish-pclmul.S +endif +if CPUFAM_AMD64 +libsymm_la_SOURCES += gcm-x86ish-pclmul.S +endif +if CPUFAM_ARMEL +libsymm_la_SOURCES += gcm-arm-crypto.S +endif +if CPUFAM_ARM64 +libsymm_la_SOURCES += gcm-arm64-pmull.S +endif TESTS += gcm.t$(EXEEXT) EXTRA_DIST += t/gcm diff --git a/symm/gcm-arm-crypto.S b/symm/gcm-arm-crypto.S new file mode 100644 index 00000000..ddc714b3 --- /dev/null +++ b/symm/gcm-arm-crypto.S @@ -0,0 +1,718 @@ +/// -*- mode: asm; asm-comment-char: ?/ -*- +/// +/// GCM acceleration for ARM processors +/// +/// (c) 2019 Straylight/Edgeware +/// + +///----- Licensing notice --------------------------------------------------- +/// +/// This file is part of Catacomb. +/// +/// Catacomb is free software: you can redistribute it and/or modify it +/// under the terms of the GNU Library General Public License as published +/// by the Free Software Foundation; either version 2 of the License, or +/// (at your option) any later version. +/// +/// Catacomb is distributed in the hope that it will be useful, but +/// WITHOUT ANY WARRANTY; without even the implied warranty of +/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +/// Library General Public License for more details. +/// +/// You should have received a copy of the GNU Library General Public +/// License along with Catacomb. If not, write to the Free Software +/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, +/// USA. + +///-------------------------------------------------------------------------- +/// Preliminaries. + +#include "config.h" +#include "asm-common.h" + + .arch armv8-a + .fpu crypto-neon-fp-armv8 + + .text + +///-------------------------------------------------------------------------- +/// Multiplication macros. + + // The good news is that we have a fancy instruction to do the + // multiplications. The bad news is that it's not particularly well- + // suited to the job. + // + // For one thing, it only does a 64-bit multiplication, so in general + // we'll need to synthesize the full-width multiply by hand. For + // another thing, it doesn't help with the reduction, so we have to + // do that by hand too. 
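As a concrete picture of those first two chores, here is a standalone sketch in plain Python (separate from the patch; the helper names are invented for the illustration): it stitches a 128-bit carry-less product together from 64-bit pieces and then reduces it by folding, working in ordinary degree order and ignoring GCM's reflected bit convention, which is the subject of the next point.

def clmul(a, b):
  # Carry-less product of two integers viewed as GF(2) polynomials:
  # shift-and-XOR, with no carries anywhere.
  z = 0
  while b:
    if b & 1: z ^= a
    a <<= 1; b >>= 1
  return z

def clmul128(u, v):
  # A 128x128 -> 256-bit product stitched together from four 64-bit
  # multiplications, roughly the shape of the work the pclmulqdq or
  # vmull.p64 code has to do in registers.
  M64 = (1 << 64) - 1
  u1, u0 = u >> 64, u & M64
  v1, v0 = v >> 64, v & M64
  lo, hi = clmul(u0, v0), clmul(u1, v1)
  mid = clmul(u0, v1) ^ clmul(u1, v0)
  return (hi << 128) ^ (mid << 64) ^ lo

def reduce128(x):
  # Reduce a 256-bit product modulo t^128 + t^7 + t^2 + t + 1 (the usual
  # 128-bit GCM polynomial) by folding the high half back down; two folds
  # suffice because the remainder polynomial 0x87 has such low degree.
  M128 = (1 << 128) - 1
  while x >> 128: x = (x & M128) ^ clmul(x >> 128, 0x87)
  return x

# The pieced-together product agrees with a direct carry-less multiply.
u = 0x0123456789abcdef02468ace13579bdf
v = 0x0fedcba987654321fdb97531eca86420
assert clmul128(u, v) == clmul(u, v)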
And, finally, GCM has crazy bit ordering, + // and the instruction does nothing useful for that at all. + // + // Focusing on that last problem first: the bits aren't in monotonic + // significance order unless we permute them. If we reverse the byte + // order, then we'll have the bits in monotonic order, but backwards, + // so the degree-0 coefficient will be in the most-significant bit. + // + // This is less of a difficulty than it seems at first, because + // algebra. Suppose we are given u = SUM_{0<=in*p->n; i++) ktab[i] = ENDSWAP32(ktab[i]); } +#if CPUFAM_X86 || CPUFAM_AMD64 +static void pclmul_mktable(const gcm_params *p, + uint32 *ktab, const uint32 *k) +{ + unsigned n = p->n; + unsigned nz; + uint32 *t; + + /* We just need to store the value in a way which is convenient for the + * assembler code to read back. That involves reordering the words, and, + * in the case of 96-bit blocks, padding with zeroes to fill out a 128-bit + * chunk. + */ + + if (n == 3) nz = 1; + else nz = 0; + t = ktab + n + nz; + + if (p->f&GCMF_SWAP) while (n--) { *--t = ENDSWAP32(*k); k++; } + else while (n--) *--t = *k++; + while (nz--) *--t = 0; +} +#endif + +#if CPUFAM_ARMEL +static void arm_crypto_mktable(const gcm_params *p, + uint32 *ktab, const uint32 *k) +{ + unsigned n = p->n; + uint32 *t; + + /* We just need to store the value in a way which is convenient for the + * assembler code to read back. That involves swapping the bytes in each + * 64-bit lane. + */ + + t = ktab; + if (p->f&GCMF_SWAP) { + while (n >= 2) { + t[1] = ENDSWAP32(k[0]); t[0] = ENDSWAP32(k[1]); + t += 2; k += 2; n -= 2; + } + if (n) { t[1] = ENDSWAP32(k[0]); t[0] = 0; } + } else { + while (n >= 2) { + t[1] = k[0]; t[0] = k[1]; + t += 2; k += 2; n -= 2; + } + if (n) { t[1] = k[0]; t[0] = 0; } + } +} +#endif + +#if CPUFAM_ARM64 +static uint32 rbit32(uint32 x) +{ + uint32 z, t; + +#if GCC_VERSION_P(4, 3) + /* Two tricks here. Firstly, two separate steps, rather than a single + * block of assembler, to allow finer-grained instruction scheduling. + * Secondly, use `ENDSWAP32' so that the compiler can cancel it if the + * caller actually wants the bytes reordered. + */ + __asm__("rbit %w0, %w1" : "=r"(t) : "r"(x)); + z = ENDSWAP32(t); +#else + /* A generic but slightly clever implementation. */ +# define SWIZZLE(x, m, s) ((((x)&(m)) << (s)) | (((x)&~(m)) >> (s))) + /* 76543210 */ + t = SWIZZLE(x, 0x0f0f0f0f, 4); /* 32107654 -- swap nibbles */ + t = SWIZZLE(t, 0x33333333, 2); /* 10325476 -- swap bit pairs */ + z = SWIZZLE(t, 0x55555555, 1); /* 01234567 -- swap adjacent bits */ +# undef SWIZZLE +#endif + return (z); +} + +static void arm64_pmull_mktable(const gcm_params *p, + uint32 *ktab, const uint32 *k) +{ + unsigned n = p->n; + uint32 *t; + + /* We just need to store the value in a way which is convenient for the + * assembler code to read back. That involves two transformations: + * + * * firstly, reversing the order of the bits in each byte; and, + * + * * secondly, storing two copies of each 64-bit chunk. + * + * Note that, in this case, we /want/ the little-endian byte order of GCM, + * so endianness-swapping happens in the big-endian case. 
+ */ + + t = ktab; + if (p->f&GCMF_SWAP) { + while (n >= 2) { + t[0] = t[2] = rbit32(k[0]); + t[1] = t[3] = rbit32(k[1]); + t += 4; k += 2; n -= 2; + } + if (n) { t[0] = t[2] = rbit32(k[0]); t[1] = t[3] = 0; } + } else { + while (n >= 2) { + t[0] = t[2] = ENDSWAP32(rbit32(k[0])); + t[1] = t[3] = ENDSWAP32(rbit32(k[1])); + t += 4; k += 2; n -= 2; + } + if (n) { t[0] = t[2] = ENDSWAP32(rbit32(k[0])); t[1] = t[3] = 0; } + } +} +#endif + CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable, (const gcm_params *p, uint32 *ktab, const uint32 *k), (p, ktab, k), @@ -241,6 +355,19 @@ CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mktable, static gcm_mktable__functype *pick_mktable(void) { +#if CPUFAM_X86 || CPUFAM_AMD64 + DISPATCH_PICK_COND(gcm_mktable, pclmul_mktable, + cpu_feature_p(CPUFEAT_X86_SSSE3) && + cpu_feature_p(CPUFEAT_X86_PCLMUL)); +#endif +#if CPUFAM_ARMEL + DISPATCH_PICK_COND(gcm_mktable, arm_crypto_mktable, + cpu_feature_p(CPUFEAT_ARM_PMULL)); +#endif +#if CPUFAM_ARM64 + DISPATCH_PICK_COND(gcm_mktable, arm64_pmull_mktable, + cpu_feature_p(CPUFEAT_ARM_PMULL)); +#endif DISPATCH_PICK_FALLBACK(gcm_mktable, simple_mktable); } @@ -271,13 +398,84 @@ static void simple_recover_k(const gcm_params *p, else for (i = 0; i < p->n; i++) k[i] = ENDSWAP32(ktab[24*p->n + i]); } +#if CPUFAM_X86 || CPUFAM_AMD64 +static void pclmul_recover_k(const gcm_params *p, + uint32 *k, const uint32 *ktab) +{ + unsigned n = p->n; + unsigned nz; + const uint32 *t; + + /* The representation is already independent of the blockcipher endianness. + * We need to compensate for padding, and reorder the words. + */ + + if (n == 3) nz = 1; else nz = 0; + t = ktab + n + nz; + while (n--) *k++ = *--t; +} +#endif + +#if CPUFAM_ARMEL +static void arm_crypto_recover_k(const gcm_params *p, + uint32 *k, const uint32 *ktab) +{ + unsigned n = p->n; + const uint32 *t; + + /* The representation is already independent of the blockcipher endianness. + * We only need to reorder the words. + */ + + t = ktab; + while (n >= 2) { k[1] = t[0]; k[0] = t[1]; t += 2; k += 2; n -= 2; } + if (n) k[0] = t[1]; +} +#endif + +#if CPUFAM_ARM64 +static void arm64_pmull_recover_k(const gcm_params *p, + uint32 *k, const uint32 *ktab) +{ + unsigned n = p->n; + const uint32 *t; + + /* The representation is already independent of the blockcipher endianness. + * We need to skip the duplicate pieces, and unscramble the bytes. + */ + + t = ktab; + while (n >= 2) { + k[0] = ENDSWAP32(rbit32(t[0])); + k[1] = ENDSWAP32(rbit32(t[1])); + t += 4; k += 2; n -= 2; + } + if (n) k[0] = ENDSWAP32(rbit32(t[0])); +} +#endif + CPU_DISPATCH(static, EMPTY, void, recover_k, (const gcm_params *p, uint32 *k, const uint32 *ktab), (p, k, ktab), pick_recover_k, simple_recover_k) static recover_k__functype *pick_recover_k(void) - { DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k); } +{ +#if CPUFAM_X86 || CPUFAM_AMD64 + DISPATCH_PICK_COND(recover_k, pclmul_recover_k, + cpu_feature_p(CPUFEAT_X86_SSSE3) && + cpu_feature_p(CPUFEAT_X86_PCLMUL)); +#endif +#if CPUFAM_ARMEL + DISPATCH_PICK_COND(recover_k, arm_crypto_recover_k, + cpu_feature_p(CPUFEAT_ARM_PMULL)); +#endif +#if CPUFAM_ARM64 + DISPATCH_PICK_COND(recover_k, arm64_pmull_recover_k, + cpu_feature_p(CPUFEAT_ARM_PMULL)); +#endif + DISPATCH_PICK_FALLBACK(recover_k, simple_recover_k); +} /* --- @gcm_mulk_N{b,l}@ --- * * @@ -292,6 +490,48 @@ static recover_k__functype *pick_recover_k(void) * function whose performance actually matters. 
*/ +#if CPUFAM_X86 || CPUFAM_AMD64 +# define DECL_MULK_X86ISH(var) extern gcm_mulk_##var##__functype \ + gcm_mulk_##var##_x86ish_pclmul_avx, \ + gcm_mulk_##var##_x86ish_pclmul; +# define PICK_MULK_X86ISH(var) do { \ + DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul_avx, \ + cpu_feature_p(CPUFEAT_X86_AVX) && \ + cpu_feature_p(CPUFEAT_X86_PCLMUL) && \ + cpu_feature_p(CPUFEAT_X86_SSSE3)); \ + DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_x86ish_pclmul, \ + cpu_feature_p(CPUFEAT_X86_PCLMUL) && \ + cpu_feature_p(CPUFEAT_X86_SSSE3)); \ +} while (0) +#else +# define DECL_MULK_X86ISH(var) +# define PICK_MULK_X86ISH(var) do ; while (0) +#endif + +#if CPUFAM_ARMEL +# define DECL_MULK_ARM(var) \ + extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm_crypto; +# define PICK_MULK_ARM(var) do { \ + DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm_crypto, \ + cpu_feature_p(CPUFEAT_ARM_PMULL)); \ +} while (0) +#else +# define DECL_MULK_ARM(var) +# define PICK_MULK_ARM(var) do ; while (0) +#endif + +#if CPUFAM_ARM64 +# define DECL_MULK_ARM64(var) \ + extern gcm_mulk_##var##__functype gcm_mulk_##var##_arm64_pmull; +# define PICK_MULK_ARM64(var) do { \ + DISPATCH_PICK_COND(gcm_mulk_##var, gcm_mulk_##var##_arm64_pmull, \ + cpu_feature_p(CPUFEAT_ARM_PMULL)); \ +} while (0) +#else +# define DECL_MULK_ARM64(var) +# define PICK_MULK_ARM64(var) do ; while (0) +#endif + #define DEF_MULK(nbits) \ \ CPU_DISPATCH(EMPTY, EMPTY, void, gcm_mulk_##nbits##b, \ @@ -321,10 +561,27 @@ static void simple_mulk_##nbits(uint32 *a, const uint32 *ktab) \ for (i = 0; i < nbits/32; i++) a[i] = z[i]; \ } \ \ +DECL_MULK_X86ISH(nbits##b) \ +DECL_MULK_ARM(nbits##b) \ +DECL_MULK_ARM64(nbits##b) \ static gcm_mulk_##nbits##b##__functype *pick_mulk_##nbits##b(void) \ - { DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits); } \ +{ \ + PICK_MULK_X86ISH(nbits##b); \ + PICK_MULK_ARM(nbits##b); \ + PICK_MULK_ARM64(nbits##b); \ + DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##b, simple_mulk_##nbits); \ +} \ + \ +DECL_MULK_X86ISH(nbits##l) \ +DECL_MULK_ARM(nbits##l) \ +DECL_MULK_ARM64(nbits##l) \ static gcm_mulk_##nbits##l##__functype *pick_mulk_##nbits##l(void) \ - { DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits); } +{ \ + PICK_MULK_X86ISH(nbits##l); \ + PICK_MULK_ARM(nbits##l); \ + PICK_MULK_ARM64(nbits##l); \ + DISPATCH_PICK_FALLBACK(gcm_mulk_##nbits##l, simple_mulk_##nbits); \ +} GCM_WIDTHS(DEF_MULK) diff --git a/utils/gcm-ref b/utils/gcm-ref new file mode 100755 index 00000000..ccbf4321 --- /dev/null +++ b/utils/gcm-ref @@ -0,0 +1,502 @@ +#! /usr/bin/python +### -*- coding: utf-8 -*- + +from sys import argv, exit + +import catacomb as C + +###-------------------------------------------------------------------------- +### Random utilities. 
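Before the individual helpers, a condensed standalone sketch of the one-entry-per-bit strategy that the portable `simple_mulk_N' code in `gcm.c' relies on and that `table_common' below mirrors. In the sketch, the multiply scans the whole table in order and masks each entry by the corresponding operand bit, so the access pattern does not depend on the data; it is plain Python with bits in ordinary degree order, and `P128' and the function names are choices made for the illustration only.

P128 = (1 << 128) | 0x87                # t^128 + t^7 + t^2 + t + 1

def mul_by_t(x, p = P128, nbits = 128):
  # Multiply the field element X by t, reducing once if it overflows.
  x <<= 1
  if (x >> nbits) & 1: x ^= p
  return x

def make_table(k, nbits = 128):
  # Precompute k t^i for 0 <= i < nbits: one table entry per operand bit.
  tab = []
  for i in range(nbits):
    tab.append(k)
    k = mul_by_t(k)
  return tab

def mulk(a, tab, nbits = 128):
  # a k = SUM_i a_i (k t^i): accumulate the precomputed multiples for the
  # set bits of A, reading every entry and masking, rather than branching
  # on the data.
  z = 0
  for i in range(nbits):
    z ^= tab[i] * ((a >> i) & 1)
  return z

def gf_mul(u, v, p = P128, nbits = 128):
  # Straightforward shift-and-add multiply, used only to check mulk.
  z = 0
  for i in reversed(range(nbits)):
    z = mul_by_t(z) ^ (u * ((v >> i) & 1))
  return z

k = 0x2b7e151628aed2a6abf7158809cf4f3c
a = 0x3243f6a8885a308d313198a2e0370734
assert mulk(a, make_table(k)) == gf_mul(a, k)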
+ +def words(s): + """Split S into 32-bit pieces and report their values as hex.""" + return ' '.join('%08x' % C.MP.loadb(s[i:i + 4]) + for i in xrange(0, len(s), 4)) + +def words_64(s): + """Split S into 64-bit pieces and report their values as hex.""" + return ' '.join('%016x' % C.MP.loadb(s[i:i + 8]) + for i in xrange(0, len(s), 8)) + +def repmask(val, wd, n): + """Return a mask consisting of N copies of the WD-bit value VAL.""" + v = C.GF(val) + a = C.GF(0) + for i in xrange(n): a = (a << wd) | v + return a + +def combs(things, k): + """Iterate over all possible combinations of K of the THINGS.""" + ii = range(k) + n = len(things) + while True: + yield [things[i] for i in ii] + for j in xrange(k): + if j == k - 1: lim = n + else: lim = ii[j + 1] + i = ii[j] + 1 + if i < lim: + ii[j] = i + break + ii[j] = j + else: + return + +POLYMAP = {} + +def poly(nbits): + """ + Return the lexically first irreducible polynomial of degree NBITS of lowest + weight. + """ + try: return POLYMAP[nbits] + except KeyError: pass + base = C.GF(0).setbit(nbits).setbit(0) + for k in xrange(1, nbits, 2): + for cc in combs(range(1, nbits), k): + p = base + sum(C.GF(0).setbit(c) for c in cc) + if p.irreduciblep(): POLYMAP[nbits] = p; return p + raise ValueError, nbits + +def gcm_mangle(x): + """Flip the bits within each byte according to GCM's insane convention.""" + y = C.WriteBuffer() + for b in x: + b = ord(b) + bb = 0 + for i in xrange(8): + bb <<= 1 + if b&1: bb |= 1 + b >>= 1 + y.putu8(bb) + return y.contents + +def endswap_words_32(x): + """End-swap each 32-bit word of X.""" + x = C.ReadBuffer(x) + y = C.WriteBuffer() + while x.left: y.putu32l(x.getu32b()) + return y.contents + +def endswap_words_64(x): + """End-swap each 64-bit word of X.""" + x = C.ReadBuffer(x) + y = C.WriteBuffer() + while x.left: y.putu64l(x.getu64b()) + return y.contents + +def endswap_bytes(x): + """End-swap X by bytes.""" + y = C.WriteBuffer() + for ch in reversed(x): y.put(ch) + return y.contents + +def gfmask(n): + return C.GF(C.MP(0).setbit(n) - 1) + +def gcm_mul(x, y): + """Multiply X and Y according to the GCM rules.""" + w = len(x) + p = poly(8*w) + u, v = C.GF.loadl(gcm_mangle(x)), C.GF.loadl(gcm_mangle(y)) + z = (u*v)%p + return gcm_mangle(z.storel(w)) + +DEMOMAP = {} +def demo(func): + name = func.func_name + assert(name.startswith('demo_')) + DEMOMAP[name[5:].replace('_', '-')] = func + return func + +def iota(i = 0): + vi = [i] + def next(): vi[0] += 1; return vi[0] - 1 + return next + +###-------------------------------------------------------------------------- +### Portable table-driven implementation. + +def shift_left(x): + """Given a field element X (in external format), return X t.""" + w = len(x) + p = poly(8*w) + return gcm_mangle(C.GF.storel((C.GF.loadl(gcm_mangle(x)) << 1)%p)) + +def table_common(u, v, flip, getword, ixmask): + """ + Multiply U by V using table lookup; common for `table-b' and `table-l'. + + This matches the `simple_mulk_...' implementation in `gcm.c'. One-entry + per bit is the best we can manage if we want a constant-time + implementation: processing n bits at a time means we need to scan + (2^n - 1)/n times as much memory. + + * FLIP is a function (assumed to be an involution) on one argument X to + convert X from external format to table-entry format or back again. + + * GETWORD is a function on one argument B to retrieve the next 32-bit + chunk of a field element held in a `ReadBuffer'. Bits within a word + are processed most-significant first. 
+ + * IXMASK is a mask XORed into table indices to permute the table so that + its order matches that induced by GETWORD. + + The table is built such that tab[i XOR IXMASK] = U t^i. + """ + w = len(u); assert(w == len(v)) + a = C.ByteString.zero(w) + tab = [None]*(8*w) + for i in xrange(8*w): + print ';; %9s = %7s = %s' % ('utab[%d]' % i, 'u t^%d' % i, words(u)) + tab[i ^ ixmask] = flip(u) + u = shift_left(u) + v = C.ReadBuffer(v) + i = 0 + while v.left: + t = getword(v) + for j in xrange(32): + bit = (t >> 31)&1 + if bit: a ^= tab[i] + print ';; %6s = %d: a <- %s [%9s = %s]' % \ + ('v[%d]' % (i ^ ixmask), bit, words(a), + 'utab[%d]' % (i ^ ixmask), words(tab[i])) + i += 1; t <<= 1 + return flip(a) + +@demo +def demo_table_b(u, v): + """Big-endian table lookup.""" + return table_common(u, v, lambda x: x, lambda b: b.getu32b(), 0) + +@demo +def demo_table_l(u, v): + """Little-endian table lookup.""" + return table_common(u, v, endswap_words_32, lambda b: b.getu32l(), 0x18) + +###-------------------------------------------------------------------------- +### Implementation using 64×64->128-bit binary polynomial multiplication. + +_i = iota() +TAG_INPUT_U = _i() +TAG_INPUT_V = _i() +TAG_KPIECE_U = _i() +TAG_KPIECE_V = _i() +TAG_PRODPIECE = _i() +TAG_PRODSUM = _i() +TAG_PRODUCT = _i() +TAG_SHIFTED = _i() +TAG_REDCBITS = _i() +TAG_REDCFULL = _i() +TAG_REDCMIX = _i() +TAG_OUTPUT = _i() + +def split_gf(x, n): + n /= 8 + return [C.GF.loadb(x[i:i + n]) for i in xrange(0, len(x), n)] + +def join_gf(xx, n): + x = C.GF(0) + for i in xrange(len(xx)): x = (x << n) | xx[i] + return x + +def present_gf(x, w, n, what): + firstp = True + m = gfmask(n) + for i in xrange(0, w, 128): + print ';; %12s%c =%s' % \ + (firstp and what or '', + firstp and ':' or ' ', + ''.join([j < w + and ' 0x%s' % hex(((x >> j)&m).storeb(n/8)) + or '' + for j in xrange(i, i + 128, n)])) + firstp = False + +def present_gf_pclmul(tag, wd, x, w, n, what): + if tag != TAG_PRODPIECE: present_gf(x, w, n, what) + +def reverse(x, w): + return C.GF.loadl(x.storeb(w/8)) + +def rev32(x): + w = x.noctets + m_ffff = repmask(0xffff, 32, w/4) + m_ff = repmask(0xff, 16, w/2) + x = ((x&m_ffff) << 16) | ((x >> 16)&m_ffff) + x = ((x&m_ff) << 8) | ((x >> 8)&m_ff) + return x + +def rev8(x): + w = x.noctets + m_0f = repmask(0x0f, 8, w) + m_33 = repmask(0x33, 8, w) + m_55 = repmask(0x55, 8, w) + x = ((x&m_0f) << 4) | ((x >> 4)&m_0f) + x = ((x&m_33) << 2) | ((x >> 2)&m_33) + x = ((x&m_55) << 1) | ((x >> 1)&m_55) + return x + +def present_gf_mullp64(tag, wd, x, w, n, what): + if tag == TAG_PRODPIECE or tag == TAG_REDCFULL: + return + elif (wd == 128 or wd == 64) and TAG_PRODSUM <= tag <= TAG_PRODUCT: + y = x + elif (wd == 96 or wd == 192 or wd == 256) and \ + TAG_PRODSUM <= tag < TAG_OUTPUT: + y = x + else: + xx = x.storeb(w/8) + extra = len(xx)%8 + if extra: xx += C.ByteString.zero(8 - extra) + yb = C.WriteBuffer() + for i in xrange(len(xx), 0, -8): yb.put(xx[i - 8:i]) + y = C.GF.loadb(yb.contents) + present_gf(y, (w + 63)&~63, n, what) + +def present_gf_pmull(tag, wd, x, w, n, what): + if tag == TAG_PRODPIECE or tag == TAG_REDCFULL or tag == TAG_SHIFTED: + return + elif tag == TAG_INPUT_V or tag == TAG_KPIECE_V: + bx = C.ReadBuffer(x.storeb(w/8)) + by = C.WriteBuffer() + while bx.left: chunk = bx.get(8); by.put(chunk).put(chunk) + x = C.GF.loadb(by.contents) + w *= 2 + elif TAG_PRODSUM <= tag <= TAG_PRODUCT: + x <<= 1 + y = reverse(rev8(x), w) + present_gf(y, w, n, what) + +def poly64_mul_simple(u, v, presfn, wd, dispwd, mulwd, uwhat, vwhat): + """ +
Multiply U by V, returning the product. + + This is the fallback long multiplication. + """ + + uw, vw = 8*len(u), 8*len(v) + + ## We start by carving the operands into 64-bit pieces. This is + ## straightforward except for the 96-bit case, where we end up with two + ## short pieces which we pad at the beginning. + if uw%mulwd: pad = (-uw)%mulwd; u += C.ByteString.zero(pad); uw += pad + if vw%mulwd: pad = (-vw)%mulwd; v += C.ByteString.zero(pad); vw += pad + uu = split_gf(u, mulwd) + vv = split_gf(v, mulwd) + + ## Report and accumulate the individual product pieces. + x = C.GF(0) + ulim, vlim = uw/mulwd, vw/mulwd + for i in xrange(ulim + vlim - 2, -1, -1): + t = C.GF(0) + for j in xrange(max(0, i - vlim + 1), min(vlim, i + 1)): + s = uu[ulim - 1 - i + j]*vv[vlim - 1 - j] + presfn(TAG_PRODPIECE, wd, s, 2*mulwd, dispwd, + '%s_%d %s_%d' % (uwhat, i - j, vwhat, j)) + t += s + presfn(TAG_PRODSUM, wd, t, 2*mulwd, dispwd, + '(%s %s)_%d' % (uwhat, vwhat, ulim + vlim - 2 - i)) + x += t << (mulwd*i) + presfn(TAG_PRODUCT, wd, x, uw + vw, dispwd, '%s %s' % (uwhat, vwhat)) + + return x + +def poly64_mul_karatsuba(u, v, klimit, presfn, wd, + dispwd, mulwd, uwhat, vwhat): + """ + Multiply U by V, returning the product. + + If the length of U and V is at least KLIMIT, and the operands are otherwise + suitable, then do Karatsuba--Ofman multiplication; otherwise, delegate to + `poly64_mul_simple'. + """ + w = 8*len(u) + + if w < klimit or w != 8*len(v) or w%(2*mulwd) != 0: + return poly64_mul_simple(u, v, presfn, wd, dispwd, mulwd, uwhat, vwhat) + + hw = w/2 + u0, u1 = u[:hw/8], u[hw/8:] + v0, v1 = v[:hw/8], v[hw/8:] + uu, vv = u0 ^ u1, v0 ^ v1 + + presfn(TAG_KPIECE_U, wd, C.GF.loadb(uu), hw, dispwd, '%s*' % uwhat) + presfn(TAG_KPIECE_V, wd, C.GF.loadb(vv), hw, dispwd, '%s*' % vwhat) + uuvv = poly64_mul_karatsuba(uu, vv, klimit, presfn, wd, dispwd, mulwd, + '%s*' % uwhat, '%s*' % vwhat) + + presfn(TAG_KPIECE_U, wd, C.GF.loadb(u0), hw, dispwd, '%s0' % uwhat) + presfn(TAG_KPIECE_V, wd, C.GF.loadb(v0), hw, dispwd, '%s0' % vwhat) + u0v0 = poly64_mul_karatsuba(u0, v0, klimit, presfn, wd, dispwd, mulwd, + '%s0' % uwhat, '%s0' % vwhat) + + presfn(TAG_KPIECE_U, wd, C.GF.loadb(u1), hw, dispwd, '%s1' % uwhat) + presfn(TAG_KPIECE_V, wd, C.GF.loadb(v1), hw, dispwd, '%s1' % vwhat) + u1v1 = poly64_mul_karatsuba(u1, v1, klimit, presfn, wd, dispwd, mulwd, + '%s1' % uwhat, '%s1' % vwhat) + + uvuv = uuvv + u0v0 + u1v1 + presfn(TAG_PRODSUM, wd, uvuv, w, dispwd, '%s!%s' % (uwhat, vwhat)) + + x = u1v1 + (uvuv << hw) + (u0v0 << w) + presfn(TAG_PRODUCT, wd, x, 2*w, dispwd, '%s %s' % (uwhat, vwhat)) + return x + +def poly64_common(u, v, presfn, dispwd = 32, mulwd = 64, redcwd = 32, + klimit = 256): + """ + Multiply U by V using a primitive 64-bit binary polynomial multiplier. + + Such a multiplier exists as the appallingly-named `pclmul[lh]q[lh]qdq' on + x86, and as `vmull.p64'/`pmull' on ARM. + + Operands arrive in a `register format', which is a byte-swapped variant of + the external format. Implementations differ on the precise details, + though. + """ + + ## We work in two main phases: first, calculate the full double-width + ## product; and, second, reduce it modulo the field polynomial. + + w = 8*len(u); assert(w == 8*len(v)) + p = poly(w) + presfn(TAG_INPUT_U, w, C.GF.loadb(u), w, dispwd, 'u') + presfn(TAG_INPUT_V, w, C.GF.loadb(v), w, dispwd, 'v') + + ## So, on to the first part: the multiplication.
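  ## An aside on the Karatsuba step invoked just below: over GF(2) the
  ## classical identity needs no subtractions, because addition is XOR.
  ## With u0 and v0 as the high halves, as in `poly64_mul_karatsuba' above,
  ##
  ##    (u0 t^h + u1) (v0 t^h + v1)
  ##       = u0 v0 t^{2h} + ((u0 + u1)(v0 + v1) + u0 v0 + u1 v1) t^h + u1 v1
  ##
  ## so the three half-width products u0 v0, u1 v1 and (u0 + u1)(v0 + v1)
  ## are all that is needed; they are exactly the u0v0, u1v1 and uuvv that
  ## the function reports and recombines.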
+ x = poly64_mul_karatsuba(u, v, klimit, presfn, w, dispwd, mulwd, 'u', 'v') + + ## Now we have to shift everything up one bit to account for GCM's crazy + ## bit ordering. + y = x << 1 + if w == 96: y >>= 64 + presfn(TAG_SHIFTED, w, y, 2*w, dispwd, 'y') + + ## Now for the reduction. + ## + ## Our polynomial has the form p = t^d + r where r = SUM_{0<=i<d} r_i t^i + ## has only low-degree nonzero coefficients. Carve the product into n-bit + ## chunks, y = SUM_{0<=i<2m} y_i t^{ni}, with d = m n; the top half, i.e., + ## the chunks y_i with i >= m, needs reduction; but + ## y_i t^{ni} = y_i r t^{n(i-m)}, so we just multiply the top half by r and + ## add it to the bottom half. This all depends on r_i = 0 for all i >= + ## n/2. We process each nonzero coefficient of r separately, in two + ## passes. + ## + ## Multiplying a chunk y_i by some t^j is the same as shifting it left by j + ## bits (or would be if GCM weren't backwards, but let's not worry about + ## that right now). The high j bits will spill over into the next chunk, + ## while the low n - j bits will stay where they are. It's these high bits + ## which cause trouble -- particularly the high bits of the top chunk, + ## since we'll add them on to y_m, which will need further reduction. But + ## only the topmost j bits will do this. + ## + ## The trick is that we do all of the bits which spill over first -- all of + ## the top j bits in each chunk, for each j -- in one pass, and then a + ## second pass of all the bits which don't. Because j, j' < n/2 for any + ## two nonzero coefficient degrees j and j', we have j + j' < n whence j < + ## n - j' -- so all of the bits contributed to y_m will be handled in the + ## second pass when we handle the bits that don't spill over. + rr = [i for i in xrange(1, w) if p.testbit(i)] + m = gfmask(redcwd) + + ## Handle the spilling bits. + yy = split_gf(y.storeb(w/4), redcwd) + b = C.GF(0) + for rj in rr: + br = [(yi << (redcwd - rj))&m for yi in yy[w/redcwd:]] + presfn(TAG_REDCBITS, w, join_gf(br, redcwd), w, dispwd, 'b(%d)' % rj) + b += join_gf(br, redcwd) << (w - redcwd) + presfn(TAG_REDCFULL, w, b, 2*w, dispwd, 'b') + s = y + b + presfn(TAG_REDCMIX, w, s, 2*w, dispwd, 's') + + ## Handle the nonspilling bits. + ss = split_gf(s.storeb(w/4), redcwd) + a = C.GF(0) + for rj in rr: + ar = [si >> rj for si in ss[w/redcwd:]] + presfn(TAG_REDCBITS, w, join_gf(ar, redcwd), w, dispwd, 'a(%d)' % rj) + a += join_gf(ar, redcwd) + presfn(TAG_REDCFULL, w, a, w, dispwd, 'a') + + ## Mix everything together. + m = gfmask(w) + z = (s&m) + (s >> w) + a + presfn(TAG_OUTPUT, w, z, w, dispwd, 'z') + + ## And we're done. + return z.storeb(w/8) + +@demo +def demo_pclmul(u, v): + return poly64_common(u, v, presfn = present_gf_pclmul) + +@demo +def demo_vmullp64(u, v): + w = 8*len(u) + return poly64_common(u, v, presfn = present_gf_mullp64, + redcwd = w%64 == 32 and 32 or 64) + +@demo +def demo_pmull(u, v): + w = 8*len(u) + return poly64_common(u, v, presfn = present_gf_pmull, + redcwd = w%64 == 32 and 32 or 64) + +###-------------------------------------------------------------------------- +### @@@ Random debris to be deleted.
@@@ + +def cutting_room_floor(): + + x = C.bytes('cde4bef260d7bcda163547d348b7551195e77022907dd1df') + y = C.bytes('f7dac5c9941d26d0c6eb14ad568f86edd1dc9268eeee5332') + + u, v = C.GF.loadb(x), C.GF.loadb(y) + + g = u*v << 1 + print 'y = %s' % words(g.storeb(48)) + b1 = (g&repmask(0x01, 32, 6)) << 191 + b2 = (g&repmask(0x03, 32, 6)) << 190 + b7 = (g&repmask(0x7f, 32, 6)) << 185 + b = b1 + b2 + b7 + print 'b = %s' % words(b.storeb(48)[0:28]) + h = g + b + print 'w = %s' % words(h.storeb(48)) + + a0 = (h&repmask(0xffffffff, 32, 6)) << 192 + a1 = (h&repmask(0xfffffffe, 32, 6)) << 191 + a2 = (h&repmask(0xfffffffc, 32, 6)) << 190 + a7 = (h&repmask(0xffffff80, 32, 6)) << 185 + a = a0 + a1 + a2 + a7 + + print ' a_1 = %s' % words(a1.storeb(48)[0:24]) + print ' a_2 = %s' % words(a2.storeb(48)[0:24]) + print ' a_7 = %s' % words(a7.storeb(48)[0:24]) + + print 'low+unit = %s' % words((h + a0).storeb(48)[0:24]) + print ' low+0,2 = %s' % words((h + a0 + a2).storeb(48)[0:24]) + print ' 1,7 = %s' % words((a1 + a7).storeb(48)[0:24]) + + print 'a = %s' % words(a.storeb(48)[0:24]) + z = h + a + print 'z = %s' % words(z.storeb(48)) + + z = gcm_mul(x, y) + print 'u v mod p = %s' % words(z) + +###-------------------------------------------------------------------------- +### Main program. + +style = argv[1] +u = C.bytes(argv[2]) +v = C.bytes(argv[3]) +zz = DEMOMAP[style](u, v) +assert zz == gcm_mul(u, v) + +###----- That's all, folks -------------------------------------------------- -- 2.11.0
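A closing note on the `shift everything up one bit' step in `poly64_common': it rests on one identity about bit reversal. Writing rev_n for reversal of the low n bits, operands of degree below n satisfy rev_n(u) rev_n(v) = rev_{2n-1}(u v), so the double-width product of the reflected operands is the true product reflected into 2n - 1 bits, which sits one position low in a 2n-bit register; hence y = x << 1 before reducing. The standalone check below is plain Python, separate from the script (`clmul' and `rev' are names made up here); the script itself can be run as `gcm-ref STYLE U-HEX V-HEX' to trace the same computation step by step.

def clmul(a, b):
  # Carry-less (GF(2) polynomial) product of two integers.
  z = 0
  while b:
    if b & 1: z ^= a
    a <<= 1; b >>= 1
  return z

def rev(x, n):
  # Reverse the low N bits of X.
  z = 0
  for i in range(n):
    z = (z << 1) | ((x >> i) & 1)
  return z

n = 128
u = 0x0123456789abcdef02468ace13579bdf
v = 0x0fedcba987654321fdb97531eca86420
# Reflected operands multiply to the reflection of the product ...
assert clmul(rev(u, n), rev(v, n)) == rev(clmul(u, v), 2*n - 1)
# ... which in the full 2n-bit frame lands one bit low, hence the shift.
assert rev(clmul(u, v), 2*n) == clmul(rev(u, n), rev(v, n)) << 1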