X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/601ec68eda2cd99ae1b5dc1cfbc7749c92912802..e29fe90ce0520132d62df571f814a8492982d684:/utils/gcm-ref

diff --git a/utils/gcm-ref b/utils/gcm-ref
index 4a53737b..6a9c4c22 100755
--- a/utils/gcm-ref
+++ b/utils/gcm-ref
@@ -127,7 +127,7 @@ def table_common(u, v, flip, getword, ixmask):
   """
   Multiply U by V using table lookup; common for `table-b' and `table-l'.

-  This matches the `simple_mulk_...' implementation in `gcm.c'.  One-entry
+  This matches the `simple_mulk_...' implementation in `gcm.c'.  One entry
   per bit is the best we can manage if we want a constant-time
   implementation: processing n bits at a time means we need to scan
   (2^n - 1)/n times as much memory.
@@ -140,7 +140,7 @@ def table_common(u, v, flip, getword, ixmask):
     are processed most-significant first.

   * IXMASK is a mask XORed into table indices to permute the table so that
-    it's order matches that induced by GETWORD.
+    its order matches that induced by GETWORD.

   The table is built such that tab[i XOR IXMASK] = U t^i.
   """
@@ -172,7 +172,7 @@ def demo_table_b(u, v):
 @demo
 def demo_table_l(u, v):
   """Little-endian table lookup."""
-  return table_common(u, v, endswap_words, lambda b: b.getu32l(), 0x18)
+  return table_common(u, v, endswap_words_32, lambda b: b.getu32l(), 0x18)

 ###--------------------------------------------------------------------------
 ### Implementation using 64×64->128-bit binary polynomial multiplication.
@@ -237,7 +237,7 @@ def rev8(x):
   x = ((x&m_55) << 1) | ((x >> 1)&m_55)
   return x

-def present_gf_mullp64(tag, wd, x, w, n, what):
+def present_gf_vmullp64(tag, wd, x, w, n, what):
   if tag == TAG_PRODPIECE or tag == TAG_REDCFULL:
     return
   elif (wd == 128 or wd == 64) and TAG_PRODSUM <= tag <= TAG_PRODUCT:
@@ -258,6 +258,7 @@ def present_gf_pmull(tag, wd, x, w, n, what):
   if tag == TAG_PRODPIECE or tag == TAG_REDCFULL or tag == TAG_SHIFTED:
     return
   elif tag == TAG_INPUT_V or tag == TAG_KPIECE_V:
+    w = (w + 63)&~63
     bx = C.ReadBuffer(x.storeb(w/8))
     by = C.WriteBuffer()
     while bx.left: chunk = bx.get(8); by.put(chunk).put(chunk)
@@ -280,10 +281,9 @@ def poly64_mul_simple(u, v, presfn, wd, dispwd, mulwd, uwhat, vwhat):
   ## We start by carving the operands into 64-bit pieces.  This is
   ## straightforward except for the 96-bit case, where we end up with two
   ## short pieces which we pad at the beginning.
-  if uw%mulwd: pad = (-uw)%mulwd; u += C.ByteString.zero(pad); uw += pad
-  if vw%mulwd: pad = (-uw)%mulwd; v += C.ByteString.zero(pad); vw += pad
-  uu = split_gf(u, mulwd)
-  vv = split_gf(v, mulwd)
+  upad = (-uw)%mulwd; u += C.ByteString.zero(upad); uw += upad
+  vpad = (-vw)%mulwd; v += C.ByteString.zero(vpad); vw += vpad
+  uu = split_gf(u, mulwd); vv = split_gf(v, mulwd)

   ## Report and accumulate the individual product pieces.
   x = C.GF(0)
@@ -300,7 +300,7 @@ def poly64_mul_simple(u, v, presfn, wd, dispwd, mulwd, uwhat, vwhat):
     x += t << (mulwd*i)
   presfn(TAG_PRODUCT, wd, x, uw + vw, dispwd, '%s %s' % (uwhat, vwhat))

-  return x
+  return x >> (upad + vpad)

 def poly64_mul_karatsuba(u, v, klimit, presfn, wd, dispwd, mulwd,
                          uwhat, vwhat):
@@ -370,7 +370,6 @@ def poly64_common(u, v, presfn, dispwd = 32, mulwd = 64, redcwd = 32,
   ## Now we have to shift everything up one bit to account for GCM's crazy
   ## bit ordering.
   y = x << 1
-  if w == 96: y >>= 64
   presfn(TAG_SHIFTED, w, y, 2*w, dispwd, 'y')

   ## Now for the reduction.
@@ -440,7 +439,7 @@ def demo_pclmul(u, v):
 @demo
 def demo_vmullp64(u, v):
   w = 8*len(u)
-  return poly64_common(u, v, presfn = present_gf_mullp64,
+  return poly64_common(u, v, presfn = present_gf_vmullp64,
                        redcwd = w%64 == 32 and 32 or 64)

 @demo
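A minimal sketch (not part of `gcm-ref'; the names `MULWD', `clmul' and `mul_split' are invented here) of the padding arithmetic behind the `poly64_mul_simple' hunks above, using plain Python ints as GF(2) polynomials, with bit i holding the coefficient of t^i.  Padding an operand out to a whole number of 64-bit pieces multiplies it by t^pad, so the piecewise product carries a spurious factor of t^(upad + vpad): the corrected `return x >> (upad + vpad)' strips that factor at source, where the old code instead compensated downstream with the now-deleted `if w == 96: y >>= 64' in `poly64_common'.

MULWD = 64                              # width of one multiplier piece

def clmul(a, b):
  """Carry-less (GF(2)[t]) product of A and B, by shift-and-XOR."""
  x = 0
  while b:
    if b&1: x ^= a
    a <<= 1; b >>= 1
  return x

def mul_split(u, uw, v, vw):
  """Multiply the UW-bit polynomial U by the VW-bit polynomial V in
  MULWD-bit pieces, modelling `poly64_mul_simple' above."""

  ## Pad each operand out to a whole number of pieces, i.e., multiply it by
  ## t^pad; note that V's padding is computed from VW, not UW.
  upad = (-uw)%MULWD; u <<= upad; uw += upad
  vpad = (-vw)%MULWD; v <<= vpad; vw += vpad

  ## Carve into MULWD-bit pieces and accumulate the partial products.
  mask = (1 << MULWD) - 1
  uu = [(u >> (MULWD*i))&mask for i in range(uw//MULWD)]
  vv = [(v >> (MULWD*j))&mask for j in range(vw//MULWD)]
  x = 0
  for i in range(len(uu)):
    for j in range(len(vv)):
      x ^= clmul(uu[i], vv[j]) << (MULWD*(i + j))

  ## Undo the spurious t^(upad + vpad) factor introduced by the padding.
  return x >> (upad + vpad)

## Cross-check against a flat multiply, e.g., in the awkward 96-bit case.
u, v = 0x123456789abcdef012345678, 0xfedcba9876543210fedcba98
assert mul_split(u, 96, v, 96) == clmul(u, v)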