utils/gcm-ref (present_gf_vmullp64): Add `v' prefix to match front end.

[catacomb] / utils / gcm-ref
diff --git a/utils/gcm-ref b/utils/gcm-ref

index 4a53737..6a9c4c2 100755 (executable)
--- a/utils/gcm-ref
+++ b/utils/gcm-ref
@@ -127,7 +127,7 @@ def table_common(u, v, flip, getword, ixmask):
    """
    Multiply U by V using table lookup; common for `table-b' and `table-l'.
  
-  This matches the `simple_mulk_...' implementation in `gcm.c'.  One-entry
+  This matches the `simple_mulk_...' implementation in `gcm.c'.  One entry
    per bit is the best we can manage if we want a constant-time
    implementation: processing n bits at a time means we need to scan
    (2^n - 1)/n times as much memory.
@@ -140,7 +140,7 @@ def table_common(u, v, flip, getword, ixmask):
        are processed most-significant first.
  
      * IXMASK is a mask XORed into table indices to permute the table so that
-      it's order matches that induced by GETWORD.
+      its order matches that induced by GETWORD.
  
    The table is built such that tab[i XOR IXMASK] = U t^i.
    """
@@ -172,7 +172,7 @@ def demo_table_b(u, v):
  @demo
  def demo_table_l(u, v):
    """Little-endian table lookup."""
-  return table_common(u, v, endswap_words, lambda b: b.getu32l(), 0x18)
+  return table_common(u, v, endswap_words_32, lambda b: b.getu32l(), 0x18)
  
  ###--------------------------------------------------------------------------
  ### Implementation using 64×64->128-bit binary polynomial multiplication.
@@ -237,7 +237,7 @@ def rev8(x):
    x = ((x&m_55) << 1) | ((x >> 1)&m_55)
    return x
  
-def present_gf_mullp64(tag, wd, x, w, n, what):
+def present_gf_vmullp64(tag, wd, x, w, n, what):
    if tag == TAG_PRODPIECE or tag == TAG_REDCFULL:
      return
    elif (wd == 128 or wd == 64) and TAG_PRODSUM <= tag <= TAG_PRODUCT:
@@ -258,6 +258,7 @@ def present_gf_pmull(tag, wd, x, w, n, what):
    if tag == TAG_PRODPIECE or tag == TAG_REDCFULL or tag == TAG_SHIFTED:
      return
    elif tag == TAG_INPUT_V or tag == TAG_KPIECE_V:
+    w = (w + 63)&~63
      bx = C.ReadBuffer(x.storeb(w/8))
      by = C.WriteBuffer()
      while bx.left: chunk = bx.get(8); by.put(chunk).put(chunk)
@@ -280,10 +281,9 @@ def poly64_mul_simple(u, v, presfn, wd, dispwd, mulwd, uwhat, vwhat):
    ## We start by carving the operands into 64-bit pieces.  This is
    ## straightforward except for the 96-bit case, where we end up with two
    ## short pieces which we pad at the beginning.
-  if uw%mulwd: pad = (-uw)%mulwd; u += C.ByteString.zero(pad); uw += pad
-  if vw%mulwd: pad = (-uw)%mulwd; v += C.ByteString.zero(pad); vw += pad
-  uu = split_gf(u, mulwd)
-  vv = split_gf(v, mulwd)
+  upad = (-uw)%mulwd; u += C.ByteString.zero(upad); uw += upad
+  vpad = (-vw)%mulwd; v += C.ByteString.zero(vpad); vw += vpad
+  uu = split_gf(u, mulwd); vv = split_gf(v, mulwd)
  
    ## Report and accumulate the individual product pieces.
    x = C.GF(0)
@@ -300,7 +300,7 @@ def poly64_mul_simple(u, v, presfn, wd, dispwd, mulwd, uwhat, vwhat):
      x += t << (mulwd*i)
    presfn(TAG_PRODUCT, wd, x, uw + vw, dispwd, '%s %s' % (uwhat, vwhat))
  
-  return x
+  return x >> (upad + vpad)
  
  def poly64_mul_karatsuba(u, v, klimit, presfn, wd,
                           dispwd, mulwd, uwhat, vwhat):
@@ -370,7 +370,6 @@ def poly64_common(u, v, presfn, dispwd = 32, mulwd = 64, redcwd = 32,
    ## Now we have to shift everything up one bit to account for GCM's crazy
    ## bit ordering.
    y = x << 1
-  if w == 96: y >>= 64
    presfn(TAG_SHIFTED, w, y, 2*w, dispwd, 'y')
  
    ## Now for the reduction.
@@ -440,7 +439,7 @@ def demo_pclmul(u, v):
  @demo
  def demo_vmullp64(u, v):
    w = 8*len(u)
-  return poly64_common(u, v, presfn = present_gf_mullp64,
+  return poly64_common(u, v, presfn = present_gf_vmullp64,
                         redcwd = w%64 == 32 and 32 or 64)
  
  @demo