[catacomb] / utils / gcm-ref

#! /usr/bin/python
### -*- coding: utf-8 -*-

from sys import argv, exit

import catacomb as C

###--------------------------------------------------------------------------
### Random utilities.

def words(s):
  """Split S into 32-bit pieces and report their values as hex."""
  return ' '.join('%08x' % C.MP.loadb(s[i:i + 4])
                  for i in xrange(0, len(s), 4))

def words_64(s):
  """Split S into 64-bit pieces and report their values as hex."""
  return ' '.join('%016x' % C.MP.loadb(s[i:i + 8])
                  for i in xrange(0, len(s), 8))

def repmask(val, wd, n):
  """Return a mask consisting of N copies of the WD-bit value VAL."""
  v = C.GF(val)
  a = C.GF(0)
  for i in xrange(n): a = (a << wd) | v
  return a

def combs(things, k):
  """Iterate over all possible combinations of K of the THINGS."""
  ii = range(k)
  n = len(things)
  while True:
    yield [things[i] for i in ii]
    for j in xrange(k):
      if j == k - 1: lim = n
      else: lim = ii[j + 1]
      i = ii[j] + 1
      if i < lim:
        ii[j] = i
        break
      ii[j] = j
    else:
      return

POLYMAP = {}

def poly(nbits):
  """
  Return the lexically first irreducible polynomial of degree NBITS of lowest
  weight.
  """
  try: return POLYMAP[nbits]
  except KeyError: pass
  base = C.GF(0).setbit(nbits).setbit(0)
  for k in xrange(1, nbits, 2):
    for cc in combs(range(1, nbits), k):
      p = base + sum((C.GF(0).setbit(c) for c in cc), C.GF(0))
      if p.irreduciblep(): POLYMAP[nbits] = p; return p
  raise ValueError, nbits

def gcm_mangle(x):
  """Flip the bits within each byte according to GCM's insane convention."""
  y = C.WriteBuffer()
  for b in x:
    b = ord(b)
    bb = 0
    for i in xrange(8):
      bb <<= 1
      if b&1: bb |= 1
      b >>= 1
    y.putu8(bb)
  return y.contents

def endswap_words_32(x):
  """End-swap each 32-bit word of X."""
  x = C.ReadBuffer(x)
  y = C.WriteBuffer()
  while x.left: y.putu32l(x.getu32b())
  return y.contents

def endswap_words_64(x):
  """End-swap each 64-bit word of X."""
  x = C.ReadBuffer(x)
  y = C.WriteBuffer()
  while x.left: y.putu64l(x.getu64b())
  return y.contents

def endswap_bytes(x):
  """End-swap X by bytes."""
  y = C.WriteBuffer()
  for ch in reversed(x): y.put(ch)
  return y.contents

def gfmask(n):
  return C.GF(C.MP(0).setbit(n) - 1)

def gcm_mul(x, y):
  """Multiply X and Y according to the GCM rules."""
  w = len(x)
  p = poly(8*w)
  u, v = C.GF.loadl(gcm_mangle(x)), C.GF.loadl(gcm_mangle(y))
  z = (u*v)%p
  return gcm_mangle(z.storel(w))

DEMOMAP = {}
def demo(func):
  name = func.func_name
  assert(name.startswith('demo_'))
  DEMOMAP[name[5:].replace('_', '-')] = func
  return func

def iota(i = 0):
  vi = [i]
  def next(): vi[0] += 1; return vi[0] - 1
  return next

###--------------------------------------------------------------------------
### Portable table-driven implementation.

def shift_left(x):
  """Given a field element X (in external format), return X t."""
  w = len(x)
  p = poly(8*w)
  return gcm_mangle(C.GF.storel((C.GF.loadl(gcm_mangle(x)) << 1)%p))

def table_common(u, v, flip, getword, ixmask):
  """
  Multiply U by V using table lookup; common for `table-b' and `table-l'.

  This matches the `simple_mulk_...' implementation in `gcm.c'.  One-entry
  per bit is the best we can manage if we want a constant-time
  implementation: processing n bits at a time means we need to scan
  (2^n - 1)/n times as much memory.

    * FLIP is a function (assumed to be an involution) on one argument X to
      convert X from external format to table-entry format or back again.

    * GETWORD is a function on one argument B to retrieve the next 32-bit
      chunk of a field element held in a `ReadBuffer'.  Bits within a word
      are processed most-significant first.

    * IXMASK is a mask XORed into table indices to permute the table so that
      it's order matches that induced by GETWORD.

  The table is built such that tab[i XOR IXMASK] = U t^i.
  """
  w = len(u); assert(w == len(v))
  a = C.ByteString.zero(w)
  tab = [None]*(8*w)
  for i in xrange(8*w):
    print ';; %9s = %7s = %s' % ('utab[%d]' % i, 'u t^%d' % i, words(u))
    tab[i ^ ixmask] = flip(u)
    u = shift_left(u)
  v = C.ReadBuffer(v)
  i = 0
  while v.left:
    t = getword(v)
    for j in xrange(32):
      bit = (t >> 31)&1
      if bit: a ^= tab[i]
      print ';; %6s = %d: a <- %s [%9s = %s]' % \
        ('v[%d]' % (i ^ ixmask), bit, words(a),
         'utab[%d]' % (i ^ ixmask), words(tab[i]))
      i += 1; t <<= 1
  return flip(a)

@demo
def demo_table_b(u, v):
  """Big-endian table lookup."""
  return table_common(u, v, lambda x: x, lambda b: b.getu32b(), 0)

@demo
def demo_table_l(u, v):
  """Little-endian table lookup."""
  return table_common(u, v, endswap_words, lambda b: b.getu32l(), 0x18)

###--------------------------------------------------------------------------
### Implementation using 64×64->128-bit binary polynomial multiplication.

_i = iota()
TAG_INPUT_U = _i()
TAG_INPUT_V = _i()
TAG_KPIECE_U = _i()
TAG_KPIECE_V = _i()
TAG_PRODPIECE = _i()
TAG_PRODSUM = _i()
TAG_PRODUCT = _i()
TAG_SHIFTED = _i()
TAG_REDCBITS = _i()
TAG_REDCFULL = _i()
TAG_REDCMIX = _i()
TAG_OUTPUT = _i()

def split_gf(x, n):
  n /= 8
  return [C.GF.loadb(x[i:i + n]) for i in xrange(0, len(x), n)]

def join_gf(xx, n):
  x = C.GF(0)
  for i in xrange(len(xx)): x = (x << n) | xx[i]
  return x

def present_gf(x, w, n, what):
  firstp = True
  m = gfmask(n)
  for i in xrange(0, w, 128):
    print ';; %12s%c         =%s' % \
      (firstp and what or '',
       firstp and ':' or ' ',
       ''.join([j < w
                and '          0x%s' % hex(((x >> j)&m).storeb(n/8))
                or ''
                for j in xrange(i, i + 128, n)]))
    firstp = False

def present_gf_pclmul(tag, wd, x, w, n, what):
  if tag != TAG_PRODPIECE: present_gf(x, w, n, what)

def reverse(x, w):
  return C.GF.loadl(x.storeb(w/8))

def rev32(x):
  w = x.noctets
  m_ffff = repmask(0xffff, 32, w/4)
  m_ff = repmask(0xff, 16, w/2)
  x = ((x&m_ffff) << 16) | ((x >> 16)&m_ffff)
  x = ((x&m_ff) << 8) | ((x >> 8)&m_ff)
  return x

def rev8(x):
  w = x.noctets
  m_0f = repmask(0x0f, 8, w)
  m_33 = repmask(0x33, 8, w)
  m_55 = repmask(0x55, 8, w)
  x = ((x&m_0f) << 4) | ((x >> 4)&m_0f)
  x = ((x&m_33) << 2) | ((x >> 2)&m_33)
  x = ((x&m_55) << 1) | ((x >> 1)&m_55)
  return x

def present_gf_mullp64(tag, wd, x, w, n, what):
  if tag == TAG_PRODPIECE or tag == TAG_REDCFULL:
    return
  elif (wd == 128 or wd == 64) and TAG_PRODSUM <= tag <= TAG_PRODUCT:
    y = x
  elif (wd == 96 or wd == 192 or wd == 256) and \
       TAG_PRODSUM <= tag < TAG_OUTPUT:
    y = x
  else:
    xx = x.storeb(w/8)
    extra = len(xx)%8
    if extra: xx += C.ByteString.zero(8 - extra)
    yb = C.WriteBuffer()
    for i in xrange(len(xx), 0, -8): yb.put(xx[i - 8:i])
    y = C.GF.loadb(yb.contents)
  present_gf(y, (w + 63)&~63, n, what)

def present_gf_pmull(tag, wd, x, w, n, what):
  if tag == TAG_PRODPIECE or tag == TAG_REDCFULL or tag == TAG_SHIFTED:
    return
  elif tag == TAG_INPUT_V or tag == TAG_KPIECE_V:
    bx = C.ReadBuffer(x.storeb(w/8))
    by = C.WriteBuffer()
    while bx.left: chunk = bx.get(8); by.put(chunk).put(chunk)
    x = C.GF.loadb(by.contents)
    w *= 2
  elif TAG_PRODSUM <= tag <= TAG_PRODUCT:
    x <<= 1
  y = reverse(rev8(x), w)
  present_gf(y, w, n, what)

def poly64_mul_simple(u, v, presfn, wd, dispwd, mulwd, uwhat, vwhat):
  """
  Multiply U by V, returning the product.

  This is the fallback long multiplication.
  """

  uw, vw = 8*len(u), 8*len(v)

  ## We start by carving the operands into 64-bit pieces.  This is
  ## straightforward except for the 96-bit case, where we end up with two
  ## short pieces which we pad at the beginning.
  if uw%mulwd: pad = (-uw)%mulwd; u += C.ByteString.zero(pad); uw += pad
  if vw%mulwd: pad = (-uw)%mulwd; v += C.ByteString.zero(pad); vw += pad
  uu = split_gf(u, mulwd)
  vv = split_gf(v, mulwd)

  ## Report and accumulate the individual product pieces.
  x = C.GF(0)
  ulim, vlim = uw/mulwd, vw/mulwd
  for i in xrange(ulim + vlim - 2, -1, -1):
    t = C.GF(0)
    for j in xrange(max(0, i - vlim + 1), min(vlim, i + 1)):
      s = uu[ulim - 1 - i + j]*vv[vlim - 1 - j]
      presfn(TAG_PRODPIECE, wd, s, 2*mulwd, dispwd,
             '%s_%d %s_%d' % (uwhat, i - j, vwhat, j))
      t += s
    presfn(TAG_PRODSUM, wd, t, 2*mulwd, dispwd,
           '(%s %s)_%d' % (uwhat, vwhat, ulim + vlim - 2 - i))
    x += t << (mulwd*i)
  presfn(TAG_PRODUCT, wd, x, uw + vw, dispwd, '%s %s' % (uwhat, vwhat))

  return x

def poly64_mul_karatsuba(u, v, klimit, presfn, wd,
                         dispwd, mulwd, uwhat, vwhat):
  """
  Multiply U by V, returning the product.

  If the length of U and V is at least KLIMIT, and the operands are otherwise
  suitable, then do Karatsuba--Ofman multiplication; otherwise, delegate to
  `poly64_mul_simple'.
  """
  w = 8*len(u)

  if w < klimit or w != 8*len(v) or w%(2*mulwd) != 0:
    return poly64_mul_simple(u, v, presfn, wd, dispwd, mulwd, uwhat, vwhat)

  hw = w/2
  u0, u1 = u[:hw/8], u[hw/8:]
  v0, v1 = v[:hw/8], v[hw/8:]
  uu, vv = u0 ^ u1, v0 ^ v1

  presfn(TAG_KPIECE_U, wd, C.GF.loadb(uu), hw, dispwd, '%s*' % uwhat)
  presfn(TAG_KPIECE_V, wd, C.GF.loadb(vv), hw, dispwd, '%s*' % vwhat)
  uuvv = poly64_mul_karatsuba(uu, vv, klimit, presfn, wd, dispwd, mulwd,
                              '%s*' % uwhat, '%s*' % vwhat)

  presfn(TAG_KPIECE_U, wd, C.GF.loadb(u0), hw, dispwd, '%s0' % uwhat)
  presfn(TAG_KPIECE_V, wd, C.GF.loadb(v0), hw, dispwd, '%s0' % vwhat)
  u0v0 = poly64_mul_karatsuba(u0, v0, klimit, presfn, wd, dispwd, mulwd,
                              '%s0' % uwhat, '%s0' % vwhat)

  presfn(TAG_KPIECE_U, wd, C.GF.loadb(u1), hw, dispwd, '%s1' % uwhat)
  presfn(TAG_KPIECE_V, wd, C.GF.loadb(v1), hw, dispwd, '%s1' % vwhat)
  u1v1 = poly64_mul_karatsuba(u1, v1, klimit, presfn, wd, dispwd, mulwd,
                              '%s1' % uwhat, '%s1' % vwhat)

  uvuv = uuvv + u0v0 + u1v1
  presfn(TAG_PRODSUM, wd, uvuv, w, dispwd, '%s!%s' % (uwhat, vwhat))

  x = u1v1 + (uvuv << hw) + (u0v0 << w)
  presfn(TAG_PRODUCT, wd, x, 2*w, dispwd, '%s %s' % (uwhat, vwhat))
  return x

def poly64_common(u, v, presfn, dispwd = 32, mulwd = 64, redcwd = 32,
                  klimit = 256):
  """
  Multiply U by V using a primitive 64-bit binary polynomial mutliplier.

  Such a multiplier exists as the appallingly-named `pclmul[lh]q[lh]qdq' on
  x86, and as `vmull.p64'/`pmull' on ARM.

  Operands arrive in a `register format', which is a byte-swapped variant of
  the external format.  Implementations differ on the precise details,
  though.
  """

  ## We work in two main phases: first, calculate the full double-width
  ## product; and, second, reduce it modulo the field polynomial.

  w = 8*len(u); assert(w == 8*len(v))
  p = poly(w)
  presfn(TAG_INPUT_U, w, C.GF.loadb(u), w, dispwd, 'u')
  presfn(TAG_INPUT_V, w, C.GF.loadb(v), w, dispwd, 'v')

  ## So, on to the first part: the multiplication.
  x = poly64_mul_karatsuba(u, v, klimit, presfn, w, dispwd, mulwd, 'u', 'v')

  ## Now we have to shift everything up one bit to account for GCM's crazy
  ## bit ordering.
  y = x << 1
  if w == 96: y >>= 64
  presfn(TAG_SHIFTED, w, y, 2*w, dispwd, 'y')

  ## Now for the reduction.
  ##
  ## Our polynomial has the form p = t^d + r where r = SUM_{0<=i<d} r_i t^i,
  ## with each r_i either 0 or 1.  Because we choose the lexically earliest
  ## irreducible polynomial with the necessary degree, r_i = 1 happens only
  ## for a small number of tiny i.  In our field, we have t^d = r.
  ##
  ## We carve the product into convenient n-bit pieces, for some n dividing d
  ## -- typically n = 32 or 64.  Let d = m n, and write y = SUM_{0<=i<2m} y_i
  ## t^{ni}.  The upper portion, the y_i with i >= m, needs reduction; but
  ## y_i t^{ni} = y_i r t^{n(i-m)}, so we just multiply the top half by r and
  ## add it to the bottom half.  This all depends on r_i = 0 for all i >=
  ## n/2.  We process each nonzero coefficient of r separately, in two
  ## passes.
  ##
  ## Multiplying a chunk y_i by some t^j is the same as shifting it left by j
  ## bits (or would be if GCM weren't backwards, but let's not worry about
  ## that right now).  The high j bits will spill over into the next chunk,
  ## while the low n - j bits will stay where they are.  It's these high bits
  ## which cause trouble -- particularly the high bits of the top chunk,
  ## since we'll add them on to y_m, which will need further reduction.  But
  ## only the topmost j bits will do this.
  ##
  ## The trick is that we do all of the bits which spill over first -- all of
  ## the top j bits in each chunk, for each j -- in one pass, and then a
  ## second pass of all the bits which don't.  Because j, j' < n/2 for any
  ## two nonzero coefficient degrees j and j', we have j + j' < n whence j <
  ## n - j' -- so all of the bits contributed to y_m will be handled in the
  ## second pass when we handle the bits that don't spill over.
  rr = [i for i in xrange(1, w) if p.testbit(i)]
  m = gfmask(redcwd)

  ## Handle the spilling bits.
  yy = split_gf(y.storeb(w/4), redcwd)
  b = C.GF(0)
  for rj in rr:
    br = [(yi << (redcwd - rj))&m for yi in yy[w/redcwd:]]
    presfn(TAG_REDCBITS, w, join_gf(br, redcwd), w, dispwd, 'b(%d)' % rj)
    b += join_gf(br, redcwd) << (w - redcwd)
  presfn(TAG_REDCFULL, w, b, 2*w, dispwd, 'b')
  s = y + b
  presfn(TAG_REDCMIX, w, s, 2*w, dispwd, 's')

  ## Handle the nonspilling bits.
  ss = split_gf(s.storeb(w/4), redcwd)
  a = C.GF(0)
  for rj in rr:
    ar = [si >> rj for si in ss[w/redcwd:]]
    presfn(TAG_REDCBITS, w, join_gf(ar, redcwd), w, dispwd, 'a(%d)' % rj)
    a += join_gf(ar, redcwd)
  presfn(TAG_REDCFULL, w, a, w, dispwd, 'a')

  ## Mix everything together.
  m = gfmask(w)
  z = (s&m) + (s >> w) + a
  presfn(TAG_OUTPUT, w, z, w, dispwd, 'z')

  ## And we're done.
  return z.storeb(w/8)

@demo
def demo_pclmul(u, v):
  return poly64_common(u, v, presfn = present_gf_pclmul)

@demo
def demo_vmullp64(u, v):
  w = 8*len(u)
  return poly64_common(u, v, presfn = present_gf_mullp64,
                       redcwd = w%64 == 32 and 32 or 64)

@demo
def demo_pmull(u, v):
  w = 8*len(u)
  return poly64_common(u, v, presfn = present_gf_pmull,
                       redcwd = w%64 == 32 and 32 or 64)

###--------------------------------------------------------------------------
### @@@ Random debris to be deleted. @@@

def cutting_room_floor():

  x = C.bytes('cde4bef260d7bcda163547d348b7551195e77022907dd1df')
  y = C.bytes('f7dac5c9941d26d0c6eb14ad568f86edd1dc9268eeee5332')

  u, v = C.GF.loadb(x), C.GF.loadb(y)

  g = u*v << 1
  print 'y = %s' % words(g.storeb(48))
  b1 = (g&repmask(0x01, 32, 6)) << 191
  b2 = (g&repmask(0x03, 32, 6)) << 190
  b7 = (g&repmask(0x7f, 32, 6)) << 185
  b = b1 + b2 + b7
  print 'b = %s' % words(b.storeb(48)[0:28])
  h = g + b
  print 'w = %s' % words(h.storeb(48))

  a0 = (h&repmask(0xffffffff, 32, 6)) << 192
  a1 = (h&repmask(0xfffffffe, 32, 6)) << 191
  a2 = (h&repmask(0xfffffffc, 32, 6)) << 190
  a7 = (h&repmask(0xffffff80, 32, 6)) << 185
  a = a0 + a1 + a2 + a7

  print '     a_1 = %s' % words(a1.storeb(48)[0:24])
  print '     a_2 = %s' % words(a2.storeb(48)[0:24])
  print '     a_7 = %s' % words(a7.storeb(48)[0:24])

  print 'low+unit = %s' % words((h + a0).storeb(48)[0:24])
  print ' low+0,2 = %s' % words((h + a0 + a2).storeb(48)[0:24])
  print '     1,7 = %s' % words((a1 + a7).storeb(48)[0:24])

  print 'a = %s' % words(a.storeb(48)[0:24])
  z = h + a
  print 'z = %s' % words(z.storeb(48))

  z = gcm_mul(x, y)
  print 'u v mod p = %s' % words(z)

###--------------------------------------------------------------------------
### Main program.

style = argv[1]
u = C.bytes(argv[2])
v = C.bytes(argv[3])
zz = DEMOMAP[style](u, v)
assert zz == gcm_mul(u, v)

###----- That's all, folks --------------------------------------------------
Commit	Line	Data
9e6a4409 MW	1	#! /usr/bin/python
	2	### -- coding: utf-8 --
	3
	4	from sys import argv, exit
	5
	6	import catacomb as C
	7
	8	###--------------------------------------------------------------------------
	9	### Random utilities.
	10
	11	def words(s):
	12	"""Split S into 32-bit pieces and report their values as hex."""
	13	return ' '.join('%08x' % C.MP.loadb(s[i:i + 4])
	14	for i in xrange(0, len(s), 4))
	15
	16	def words_64(s):
	17	"""Split S into 64-bit pieces and report their values as hex."""
	18	return ' '.join('%016x' % C.MP.loadb(s[i:i + 8])
	19	for i in xrange(0, len(s), 8))
	20
	21	def repmask(val, wd, n):
	22	"""Return a mask consisting of N copies of the WD-bit value VAL."""
	23	v = C.GF(val)
	24	a = C.GF(0)
	25	for i in xrange(n): a = (a << wd) \| v
	26	return a
	27
	28	def combs(things, k):
	29	"""Iterate over all possible combinations of K of the THINGS."""
	30	ii = range(k)
	31	n = len(things)
	32	while True:
	33	yield [things[i] for i in ii]
	34	for j in xrange(k):
	35	if j == k - 1: lim = n
	36	else: lim = ii[j + 1]
	37	i = ii[j] + 1
	38	if i < lim:
	39	ii[j] = i
	40	break
	41	ii[j] = j
	42	else:
	43	return
	44
	45	POLYMAP = {}
	46
	47	def poly(nbits):
	48	"""
	49	Return the lexically first irreducible polynomial of degree NBITS of lowest
	50	weight.
	51	"""
	52	try: return POLYMAP[nbits]
	53	except KeyError: pass
	54	base = C.GF(0).setbit(nbits).setbit(0)
	55	for k in xrange(1, nbits, 2):
	56	for cc in combs(range(1, nbits), k):
601ec68e	57	p = base + sum((C.GF(0).setbit(c) for c in cc), C.GF(0))
9e6a4409 MW	58	if p.irreduciblep(): POLYMAP[nbits] = p; return p
	59	raise ValueError, nbits
	60
	61	def gcm_mangle(x):
	62	"""Flip the bits within each byte according to GCM's insane convention."""
	63	y = C.WriteBuffer()
	64	for b in x:
	65	b = ord(b)
	66	bb = 0
	67	for i in xrange(8):
	68	bb <<= 1
	69	if b&1: bb \|= 1
	70	b >>= 1
	71	y.putu8(bb)
	72	return y.contents
	73
	74	def endswap_words_32(x):
	75	"""End-swap each 32-bit word of X."""
	76	x = C.ReadBuffer(x)
	77	y = C.WriteBuffer()
	78	while x.left: y.putu32l(x.getu32b())
	79	return y.contents
	80
	81	def endswap_words_64(x):
	82	"""End-swap each 64-bit word of X."""
	83	x = C.ReadBuffer(x)
	84	y = C.WriteBuffer()
	85	while x.left: y.putu64l(x.getu64b())
	86	return y.contents
	87
	88	def endswap_bytes(x):
	89	"""End-swap X by bytes."""
	90	y = C.WriteBuffer()
	91	for ch in reversed(x): y.put(ch)
	92	return y.contents
	93
	94	def gfmask(n):
	95	return C.GF(C.MP(0).setbit(n) - 1)
	96
	97	def gcm_mul(x, y):
	98	"""Multiply X and Y according to the GCM rules."""
	99	w = len(x)
	100	p = poly(8*w)
	101	u, v = C.GF.loadl(gcm_mangle(x)), C.GF.loadl(gcm_mangle(y))
	102	z = (u*v)%p
	103	return gcm_mangle(z.storel(w))
	104
	105	DEMOMAP = {}
	106	def demo(func):
	107	name = func.func_name
	108	assert(name.startswith('demo_'))
	109	DEMOMAP[name[5:].replace('_', '-')] = func
	110	return func
	111
	112	def iota(i = 0):
	113	vi = [i]
	114	def next(): vi[0] += 1; return vi[0] - 1
	115	return next
	116
	117	###--------------------------------------------------------------------------
	118	### Portable table-driven implementation.
	119
	120	def shift_left(x):
	121	"""Given a field element X (in external format), return X t."""
122	w = len(x)
123	p = poly(8*w)
124	return gcm_mangle(C.GF.storel((C.GF.loadl(gcm_mangle(x)) << 1)%p))
125
126	def table_common(u, v, flip, getword, ixmask):
127	"""
128	Multiply U by V using table lookup; common for `table-b' and `table-l'.
129
130	This matches the `simple_mulk_...' implementation in `gcm.c'. One-entry
131	per bit is the best we can manage if we want a constant-time
132	implementation: processing n bits at a time means we need to scan
133	(2^n - 1)/n times as much memory.
134
135	* FLIP is a function (assumed to be an involution) on one argument X to
136	convert X from external format to table-entry format or back again.
137
138	* GETWORD is a function on one argument B to retrieve the next 32-bit
139	chunk of a field element held in a `ReadBuffer'. Bits within a word
140	are processed most-significant first.
141
142	* IXMASK is a mask XORed into table indices to permute the table so that
143	it's order matches that induced by GETWORD.
144
145	The table is built such that tab[i XOR IXMASK] = U t^i.
146	"""
147	w = len(u); assert(w == len(v))
148	a = C.ByteString.zero(w)
149	tab = [None](8w)
150	for i in xrange(8*w):
151	print ';; %9s = %7s = %s' % ('utab[%d]' % i, 'u t^%d' % i, words(u))
152	tab[i ^ ixmask] = flip(u)
153	u = shift_left(u)
154	v = C.ReadBuffer(v)
155	i = 0
156	while v.left:
157	t = getword(v)
158	for j in xrange(32):
159	bit = (t >> 31)&1
160	if bit: a ^= tab[i]
161	print ';; %6s = %d: a <- %s [%9s = %s]' % \
162	('v[%d]' % (i ^ ixmask), bit, words(a),
163	'utab[%d]' % (i ^ ixmask), words(tab[i]))
164	i += 1; t <<= 1
165	return flip(a)
166
167	@demo
168	def demo_table_b(u, v):
169	"""Big-endian table lookup."""
170	return table_common(u, v, lambda x: x, lambda b: b.getu32b(), 0)
171
172	@demo
173	def demo_table_l(u, v):
174	"""Little-endian table lookup."""
175	return table_common(u, v, endswap_words, lambda b: b.getu32l(), 0x18)
176
177	###--------------------------------------------------------------------------
178	### Implementation using 64×64->128-bit binary polynomial multiplication.
179
180	_i = iota()
181	TAG_INPUT_U = _i()
182	TAG_INPUT_V = _i()
183	TAG_KPIECE_U = _i()
184	TAG_KPIECE_V = _i()
185	TAG_PRODPIECE = _i()
186	TAG_PRODSUM = _i()
187	TAG_PRODUCT = _i()
188	TAG_SHIFTED = _i()
189	TAG_REDCBITS = _i()
190	TAG_REDCFULL = _i()
191	TAG_REDCMIX = _i()
192	TAG_OUTPUT = _i()
193
194	def split_gf(x, n):
195	n /= 8
196	return [C.GF.loadb(x[i:i + n]) for i in xrange(0, len(x), n)]
197
198	def join_gf(xx, n):
199	x = C.GF(0)
200	for i in xrange(len(xx)): x = (x << n) \| xx[i]
201	return x
202
203	def present_gf(x, w, n, what):
204	firstp = True
205	m = gfmask(n)
206	for i in xrange(0, w, 128):
207	print ';; %12s%c =%s' % \
208	(firstp and what or '',
209	firstp and ':' or ' ',
210	''.join([j < w
211	and ' 0x%s' % hex(((x >> j)&m).storeb(n/8))
212	or ''
213	for j in xrange(i, i + 128, n)]))
214	firstp = False
215
216	def present_gf_pclmul(tag, wd, x, w, n, what):
217	if tag != TAG_PRODPIECE: present_gf(x, w, n, what)
218
219	def reverse(x, w):
220	return C.GF.loadl(x.storeb(w/8))
221
222	def rev32(x):
223	w = x.noctets
224	m_ffff = repmask(0xffff, 32, w/4)
225	m_ff = repmask(0xff, 16, w/2)
226	x = ((x&m_ffff) << 16) \| ((x >> 16)&m_ffff)
227	x = ((x&m_ff) << 8) \| ((x >> 8)&m_ff)
228	return x
229
230	def rev8(x):
231	w = x.noctets
232	m_0f = repmask(0x0f, 8, w)
233	m_33 = repmask(0x33, 8, w)
234	m_55 = repmask(0x55, 8, w)
235	x = ((x&m_0f) << 4) \| ((x >> 4)&m_0f)
236	x = ((x&m_33) << 2) \| ((x >> 2)&m_33)
237	x = ((x&m_55) << 1) \| ((x >> 1)&m_55)
238	return x
239
240	def present_gf_mullp64(tag, wd, x, w, n, what):
241	if tag == TAG_PRODPIECE or tag == TAG_REDCFULL:
242	return
243	elif (wd == 128 or wd == 64) and TAG_PRODSUM <= tag <= TAG_PRODUCT:
244	y = x
245	elif (wd == 96 or wd == 192 or wd == 256) and \
246	TAG_PRODSUM <= tag < TAG_OUTPUT:
247	y = x
248	else:
249	xx = x.storeb(w/8)
250	extra = len(xx)%8
251	if extra: xx += C.ByteString.zero(8 - extra)
252	yb = C.WriteBuffer()
253	for i in xrange(len(xx), 0, -8): yb.put(xx[i - 8:i])
254	y = C.GF.loadb(yb.contents)
255	present_gf(y, (w + 63)&~63, n, what)
256
257	def present_gf_pmull(tag, wd, x, w, n, what):
258	if tag == TAG_PRODPIECE or tag == TAG_REDCFULL or tag == TAG_SHIFTED:
259	return
260	elif tag == TAG_INPUT_V or tag == TAG_KPIECE_V:
261	bx = C.ReadBuffer(x.storeb(w/8))
262	by = C.WriteBuffer()
263	while bx.left: chunk = bx.get(8); by.put(chunk).put(chunk)
264	x = C.GF.loadb(by.contents)
265	w *= 2
266	elif TAG_PRODSUM <= tag <= TAG_PRODUCT:
267	x <<= 1
268	y = reverse(rev8(x), w)
269	present_gf(y, w, n, what)
270
271	def poly64_mul_simple(u, v, presfn, wd, dispwd, mulwd, uwhat, vwhat):
272	"""
273	Multiply U by V, returning the product.
274
275	This is the fallback long multiplication.
276	"""
277
278	uw, vw = 8len(u), 8len(v)
279
280	## We start by carving the operands into 64-bit pieces. This is
281	## straightforward except for the 96-bit case, where we end up with two
282	## short pieces which we pad at the beginning.
283	if uw%mulwd: pad = (-uw)%mulwd; u += C.ByteString.zero(pad); uw += pad
284	if vw%mulwd: pad = (-uw)%mulwd; v += C.ByteString.zero(pad); vw += pad
285	uu = split_gf(u, mulwd)
286	vv = split_gf(v, mulwd)
287
288	## Report and accumulate the individual product pieces.
289	x = C.GF(0)
290	ulim, vlim = uw/mulwd, vw/mulwd
291	for i in xrange(ulim + vlim - 2, -1, -1):
292	t = C.GF(0)
293	for j in xrange(max(0, i - vlim + 1), min(vlim, i + 1)):
294	s = uu[ulim - 1 - i + j]*vv[vlim - 1 - j]
295	presfn(TAG_PRODPIECE, wd, s, 2*mulwd, dispwd,
296	'%s_%d %s_%d' % (uwhat, i - j, vwhat, j))
297	t += s
298	presfn(TAG_PRODSUM, wd, t, 2*mulwd, dispwd,
299	'(%s %s)_%d' % (uwhat, vwhat, ulim + vlim - 2 - i))
300	x += t << (mulwd*i)
301	presfn(TAG_PRODUCT, wd, x, uw + vw, dispwd, '%s %s' % (uwhat, vwhat))
302
303	return x
304
305	def poly64_mul_karatsuba(u, v, klimit, presfn, wd,
306	dispwd, mulwd, uwhat, vwhat):
307	"""
308	Multiply U by V, returning the product.
309
310	If the length of U and V is at least KLIMIT, and the operands are otherwise
311	suitable, then do Karatsuba--Ofman multiplication; otherwise, delegate to
312	`poly64_mul_simple'.
313	"""
314	w = 8*len(u)
315
316	if w < klimit or w != 8len(v) or w%(2mulwd) != 0:
317	return poly64_mul_simple(u, v, presfn, wd, dispwd, mulwd, uwhat, vwhat)
318
319	hw = w/2
320	u0, u1 = u[:hw/8], u[hw/8:]
321	v0, v1 = v[:hw/8], v[hw/8:]
322	uu, vv = u0 ^ u1, v0 ^ v1
323
324	presfn(TAG_KPIECE_U, wd, C.GF.loadb(uu), hw, dispwd, '%s*' % uwhat)
325	presfn(TAG_KPIECE_V, wd, C.GF.loadb(vv), hw, dispwd, '%s*' % vwhat)
326	uuvv = poly64_mul_karatsuba(uu, vv, klimit, presfn, wd, dispwd, mulwd,
327	'%s' % uwhat, '%s' % vwhat)
328
329	presfn(TAG_KPIECE_U, wd, C.GF.loadb(u0), hw, dispwd, '%s0' % uwhat)
330	presfn(TAG_KPIECE_V, wd, C.GF.loadb(v0), hw, dispwd, '%s0' % vwhat)
331	u0v0 = poly64_mul_karatsuba(u0, v0, klimit, presfn, wd, dispwd, mulwd,
332	'%s0' % uwhat, '%s0' % vwhat)
333
334	presfn(TAG_KPIECE_U, wd, C.GF.loadb(u1), hw, dispwd, '%s1' % uwhat)
335	presfn(TAG_KPIECE_V, wd, C.GF.loadb(v1), hw, dispwd, '%s1' % vwhat)
336	u1v1 = poly64_mul_karatsuba(u1, v1, klimit, presfn, wd, dispwd, mulwd,
337	'%s1' % uwhat, '%s1' % vwhat)
338
339	uvuv = uuvv + u0v0 + u1v1
340	presfn(TAG_PRODSUM, wd, uvuv, w, dispwd, '%s!%s' % (uwhat, vwhat))
341
342	x = u1v1 + (uvuv << hw) + (u0v0 << w)
343	presfn(TAG_PRODUCT, wd, x, 2*w, dispwd, '%s %s' % (uwhat, vwhat))
344	return x
345
346	def poly64_common(u, v, presfn, dispwd = 32, mulwd = 64, redcwd = 32,
347	klimit = 256):
348	"""
349	Multiply U by V using a primitive 64-bit binary polynomial mutliplier.
350
351	Such a multiplier exists as the appallingly-named `pclmul[lh]q[lh]qdq' on
352	x86, and as `vmull.p64'/`pmull' on ARM.
353
354	Operands arrive in a `register format', which is a byte-swapped variant of
355	the external format. Implementations differ on the precise details,
356	though.
357	"""
358
359	## We work in two main phases: first, calculate the full double-width
360	## product; and, second, reduce it modulo the field polynomial.
361
362	w = 8len(u); assert(w == 8len(v))
363	p = poly(w)
364	presfn(TAG_INPUT_U, w, C.GF.loadb(u), w, dispwd, 'u')
365	presfn(TAG_INPUT_V, w, C.GF.loadb(v), w, dispwd, 'v')
366
367	## So, on to the first part: the multiplication.
368	x = poly64_mul_karatsuba(u, v, klimit, presfn, w, dispwd, mulwd, 'u', 'v')
369
370	## Now we have to shift everything up one bit to account for GCM's crazy
371	## bit ordering.
372	y = x << 1
373	if w == 96: y >>= 64
374	presfn(TAG_SHIFTED, w, y, 2*w, dispwd, 'y')
375
376	## Now for the reduction.
377	##
378	## Our polynomial has the form p = t^d + r where r = SUM_{0<=i<d} r_i t^i,
379	## with each r_i either 0 or 1. Because we choose the lexically earliest
380	## irreducible polynomial with the necessary degree, r_i = 1 happens only
381	## for a small number of tiny i. In our field, we have t^d = r.
382	##
383	## We carve the product into convenient n-bit pieces, for some n dividing d
384	## -- typically n = 32 or 64. Let d = m n, and write y = SUM_{0<=i<2m} y_i
385	## t^{ni}. The upper portion, the y_i with i >= m, needs reduction; but
386	## y_i t^{ni} = y_i r t^{n(i-m)}, so we just multiply the top half by r and
387	## add it to the bottom half. This all depends on r_i = 0 for all i >=
388	## n/2. We process each nonzero coefficient of r separately, in two
389	## passes.
390	##
391	## Multiplying a chunk y_i by some t^j is the same as shifting it left by j
392	## bits (or would be if GCM weren't backwards, but let's not worry about
393	## that right now). The high j bits will spill over into the next chunk,
394	## while the low n - j bits will stay where they are. It's these high bits
395	## which cause trouble -- particularly the high bits of the top chunk,
396	## since we'll add them on to y_m, which will need further reduction. But
397	## only the topmost j bits will do this.
398	##
399	## The trick is that we do all of the bits which spill over first -- all of
400	## the top j bits in each chunk, for each j -- in one pass, and then a
401	## second pass of all the bits which don't. Because j, j' < n/2 for any
402	## two nonzero coefficient degrees j and j', we have j + j' < n whence j <
403	## n - j' -- so all of the bits contributed to y_m will be handled in the
404	## second pass when we handle the bits that don't spill over.
405	rr = [i for i in xrange(1, w) if p.testbit(i)]
406	m = gfmask(redcwd)
407
408	## Handle the spilling bits.
409	yy = split_gf(y.storeb(w/4), redcwd)
410	b = C.GF(0)
411	for rj in rr:
412	br = [(yi << (redcwd - rj))&m for yi in yy[w/redcwd:]]
413	presfn(TAG_REDCBITS, w, join_gf(br, redcwd), w, dispwd, 'b(%d)' % rj)
414	b += join_gf(br, redcwd) << (w - redcwd)
415	presfn(TAG_REDCFULL, w, b, 2*w, dispwd, 'b')
416	s = y + b
417	presfn(TAG_REDCMIX, w, s, 2*w, dispwd, 's')
418
419	## Handle the nonspilling bits.
420	ss = split_gf(s.storeb(w/4), redcwd)
421	a = C.GF(0)
422	for rj in rr:
423	ar = [si >> rj for si in ss[w/redcwd:]]
424	presfn(TAG_REDCBITS, w, join_gf(ar, redcwd), w, dispwd, 'a(%d)' % rj)
425	a += join_gf(ar, redcwd)
426	presfn(TAG_REDCFULL, w, a, w, dispwd, 'a')
427
428	## Mix everything together.
429	m = gfmask(w)
430	z = (s&m) + (s >> w) + a
431	presfn(TAG_OUTPUT, w, z, w, dispwd, 'z')
432
433	## And we're done.
434	return z.storeb(w/8)
435
436	@demo
437	def demo_pclmul(u, v):
438	return poly64_common(u, v, presfn = present_gf_pclmul)
439
440	@demo
441	def demo_vmullp64(u, v):
442	w = 8*len(u)
443	return poly64_common(u, v, presfn = present_gf_mullp64,
444	redcwd = w%64 == 32 and 32 or 64)
445
446	@demo
447	def demo_pmull(u, v):
448	w = 8*len(u)
449	return poly64_common(u, v, presfn = present_gf_pmull,
450	redcwd = w%64 == 32 and 32 or 64)
451
452	###--------------------------------------------------------------------------
453	### @@@ Random debris to be deleted. @@@
454
455	def cutting_room_floor():
456
457	x = C.bytes('cde4bef260d7bcda163547d348b7551195e77022907dd1df')
458	y = C.bytes('f7dac5c9941d26d0c6eb14ad568f86edd1dc9268eeee5332')
459
460	u, v = C.GF.loadb(x), C.GF.loadb(y)
461
462	g = u*v << 1
463	print 'y = %s' % words(g.storeb(48))
464	b1 = (g&repmask(0x01, 32, 6)) << 191
465	b2 = (g&repmask(0x03, 32, 6)) << 190
466	b7 = (g&repmask(0x7f, 32, 6)) << 185
467	b = b1 + b2 + b7
468	print 'b = %s' % words(b.storeb(48)[0:28])
469	h = g + b
470	print 'w = %s' % words(h.storeb(48))
471
472	a0 = (h&repmask(0xffffffff, 32, 6)) << 192
473	a1 = (h&repmask(0xfffffffe, 32, 6)) << 191
474	a2 = (h&repmask(0xfffffffc, 32, 6)) << 190
475	a7 = (h&repmask(0xffffff80, 32, 6)) << 185
476	a = a0 + a1 + a2 + a7
477
478	print ' a_1 = %s' % words(a1.storeb(48)[0:24])
479	print ' a_2 = %s' % words(a2.storeb(48)[0:24])
480	print ' a_7 = %s' % words(a7.storeb(48)[0:24])
481
482	print 'low+unit = %s' % words((h + a0).storeb(48)[0:24])
483	print ' low+0,2 = %s' % words((h + a0 + a2).storeb(48)[0:24])
484	print ' 1,7 = %s' % words((a1 + a7).storeb(48)[0:24])
485
486	print 'a = %s' % words(a.storeb(48)[0:24])
487	z = h + a
488	print 'z = %s' % words(z.storeb(48))
489
490	z = gcm_mul(x, y)
491	print 'u v mod p = %s' % words(z)
492
493	###--------------------------------------------------------------------------
494	### Main program.
495
496	style = argv[1]
497	u = C.bytes(argv[2])
498	v = C.bytes(argv[3])
499	zz = DEMOMAP[style](u, v)
500	assert zz == gcm_mul(u, v)
501
502	###----- That's all, folks --------------------------------------------------