p = poly(8*w)
return gcm_mangle(C.GF.storel((C.GF.loadl(gcm_mangle(x)) << 1)%p))
+def shift_right(x):
+ """Given a field element X (in external format), return X/t."""
+ w = len(x)
+ p = poly(8*w)
+ return gcm_mangle(C.GF.storel((C.GF.loadl(gcm_mangle(x))*p.modinv(2))%p))
+
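+## Sanity-check sketch (assuming the multiply-by-t function above is
+## `shift_left', and that `C.bytes' converts a hex string to a ByteString):
+## the two shifts should be mutual inverses on any element in external
+## format.
+##
+##      x = C.bytes('000102030405060708090a0b0c0d0e0f')
+##      assert shift_right(shift_left(x)) == x
+##      assert shift_left(shift_right(x)) == x
+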
def table_common(u, v, flip, getword, ixmask):
"""
Multiply U by V using table lookup; common for `table-b' and `table-l'.
_i = iota()
TAG_INPUT_U = _i()
TAG_INPUT_V = _i()
+TAG_SHIFTED_V = _i()
TAG_KPIECE_U = _i()
TAG_KPIECE_V = _i()
TAG_PRODPIECE = _i()
TAG_PRODSUM = _i()
TAG_PRODUCT = _i()
-TAG_SHIFTED = _i()
TAG_REDCBITS = _i()
TAG_REDCFULL = _i()
TAG_REDCMIX = _i()
x = ((x&m_55) << 1) | ((x >> 1)&m_55)
return x
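+## The 0x55... mask step above swaps adjacent bits; presuming it follows the
+## usual 0x0f nibble swap and 0x33 bit-pair swap, the function as a whole
+## reverses the bits within each byte, e.g. 0b10110010 -> 0b01001101.
+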
-def present_gf_mullp64(tag, wd, x, w, n, what):
+def present_gf_vmullp64(tag, wd, x, w, n, what):
if tag == TAG_PRODPIECE or tag == TAG_REDCFULL:
return
elif (wd == 128 or wd == 64) and TAG_PRODSUM <= tag <= TAG_PRODUCT:
present_gf(y, (w + 63)&~63, n, what)
def present_gf_pmull(tag, wd, x, w, n, what):
- if tag == TAG_PRODPIECE or tag == TAG_REDCFULL or tag == TAG_SHIFTED:
+ if tag == TAG_PRODPIECE or tag == TAG_REDCFULL:
return
- elif tag == TAG_INPUT_V or tag == TAG_KPIECE_V:
+ elif tag == TAG_INPUT_V or tag == TAG_SHIFTED_V or tag == TAG_KPIECE_V:
w = (w + 63)&~63
bx = C.ReadBuffer(x.storeb(w/8))
by = C.WriteBuffer()
## We start by carving the operands into 64-bit pieces. This is
## straightforward except for the 96-bit case, where we end up with two
## short pieces which we pad at the beginning.
- if uw%mulwd: pad = (-uw)%mulwd; u += C.ByteString.zero(pad); uw += pad
- if vw%mulwd: pad = (-vw)%mulwd; v += C.ByteString.zero(pad); vw += pad
- uu = split_gf(u, mulwd)
- vv = split_gf(v, mulwd)
+ upad = (-uw)%mulwd; u += C.ByteString.zero(upad/8); uw += upad
+ vpad = (-vw)%mulwd; v += C.ByteString.zero(vpad/8); vw += vpad
+ uu = split_gf(u, mulwd); vv = split_gf(v, mulwd)
## Report and accumulate the individual product pieces.
x = C.GF(0)
x += t << (mulwd*i)
presfn(TAG_PRODUCT, wd, x, uw + vw, dispwd, '%s %s' % (uwhat, vwhat))
- return x
+ return x >> (upad + vpad)
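+## Worked instance: in the 96-bit case, uw = vw = 96 and mulwd = 64, so
+## upad = vpad = 32; each operand gains 32 zero bits and splits into two
+## 64-bit pieces, and the final shift strips the spurious factor
+## t^{upad+vpad} = t^64 from the product.
+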
def poly64_mul_karatsuba(u, v, klimit, presfn, wd,
dispwd, mulwd, uwhat, vwhat):
presfn(TAG_PRODUCT, wd, x, 2*w, dispwd, '%s %s' % (uwhat, vwhat))
return x
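+## Over GF(2), Karatsuba needs no carries or subtractions: writing
+## u = u1 t^k + u0 and v = v1 t^k + v0, we have
+##
+##      u v = u1 v1 t^{2k}
+##              + ((u0 + u1) (v0 + v1) + u0 v0 + u1 v1) t^k
+##              + u0 v0,
+##
+## so three half-width multiplications suffice; KLIMIT presumably sets the
+## width below which the straightforward method takes over.
+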
-def poly64_common(u, v, presfn, dispwd = 32, mulwd = 64, redcwd = 32,
- klimit = 256):
+def poly64_mul(u, v, presfn, dispwd, mulwd, klimit, uwhat, vwhat):
"""
Multiply U by V using a primitive 64-bit binary polynomial multiplier.
Operands arrive in a `register format', which is a byte-swapped variant of
the external format. Implementations differ on the precise details,
- though.
+ though. Returns the double-precision product.
"""
- ## We work in two main phases: first, calculate the full double-width
- ## product; and, second, reduce it modulo the field polynomial.
-
w = 8*len(u); assert(w == 8*len(v))
- p = poly(w)
- presfn(TAG_INPUT_U, w, C.GF.loadb(u), w, dispwd, 'u')
- presfn(TAG_INPUT_V, w, C.GF.loadb(v), w, dispwd, 'v')
+ x = poly64_mul_karatsuba(u, v, klimit, presfn,
+ w, dispwd, mulwd, uwhat, vwhat)
- ## So, on to the first part: the multiplication.
- x = poly64_mul_karatsuba(u, v, klimit, presfn, w, dispwd, mulwd, 'u', 'v')
+ return x.storeb(w/4)
- ## Now we have to shift everything up one bit to account for GCM's crazy
- ## bit ordering.
- y = x << 1
- if w == 96: y >>= 64
- presfn(TAG_SHIFTED, w, y, 2*w, dispwd, 'y')
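+## Taken together, `poly64_mul' above and `poly64_redc' below replace the
+## old `poly64_common': the multiply and reduce phases are split out so that
+## the shift- and direct-style drivers further down can share them.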
+def poly64_redc(y, presfn, dispwd, redcwd):
+ """
+ Reduce a double-precision product Y modulo the appropriate polynomial.
+
+ The operand arrives in a `register format', which is a byte-swapped variant
+ of the external format. Implementations differ on the precise details,
+ though. Returns the single-precision reduced value.
+ """
+
+ w = 4*len(y)
+ p = poly(w)
- ## Now for the reduction.
- ##
## Our polynomial has the form p = t^d + r where r = SUM_{0<=i<d} r_i t^i,
## with each r_i either 0 or 1. Because we choose the lexically earliest
## irreducible polynomial with the necessary degree, r_i = 1 happens only
## for small i.
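+ ## For example, with w = 128 the polynomial is
+ ## p = t^128 + t^7 + t^2 + t + 1, so r = t^7 + t^2 + t + 1 and the nonzero
+ ## r_i are at i = 0, 1, 2, 7, all well inside a single REDCWD-bit word.
+ ## Writing the product as y = y1 t^w + y0, reduction uses
+ ## y == y1 r + y0 (mod p).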
m = gfmask(redcwd)
## Handle the spilling bits.
- yy = split_gf(y.storeb(w/4), redcwd)
+ yy = split_gf(y, redcwd)
b = C.GF(0)
for rj in rr:
br = [(yi << (redcwd - rj))&m for yi in yy[w/redcwd:]]
presfn(TAG_REDCBITS, w, join_gf(br, redcwd), w, dispwd, 'b(%d)' % rj)
b += join_gf(br, redcwd) << (w - redcwd)
presfn(TAG_REDCFULL, w, b, 2*w, dispwd, 'b')
- s = y + b
+ s = C.GF.loadb(y) + b
presfn(TAG_REDCMIX, w, s, 2*w, dispwd, 's')
## Handle the nonspilling bits.
## And we're done.
return z.storeb(w/8)
+def poly64_shiftcommon(u, v, presfn, dispwd = 32, mulwd = 64,
+ redcwd = 32, klimit = 256):
+ w = 8*len(u)
+ presfn(TAG_INPUT_U, w, C.GF.loadb(u), w, dispwd, 'u')
+ presfn(TAG_INPUT_V, w, C.GF.loadb(v), w, dispwd, 'v')
+ vv = shift_right(v)
+ presfn(TAG_SHIFTED_V, w, C.GF.loadb(vv), w, dispwd, "v'")
+ y = poly64_mul(u, vv, presfn, dispwd, mulwd, klimit, "u", "v'")
+ z = poly64_redc(y, presfn, dispwd, redcwd)
+ return z
+
+def poly64_directcommon(u, v, presfn, dispwd = 32, mulwd = 64,
+ redcwd = 32, klimit = 256):
+ w = 8*len(u)
+ presfn(TAG_INPUT_U, w, C.GF.loadb(u), w, dispwd, 'u')
+ presfn(TAG_INPUT_V, w, C.GF.loadb(v), w, dispwd, 'v')
+ y = poly64_mul(u, v, presfn, dispwd, mulwd, klimit, "u", "v")
+ y = (C.GF.loadb(y) << 1).storeb(w/4)
+ z = poly64_redc(y, presfn, dispwd, redcwd)
+ return z
+
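+## The two drivers should agree: dividing V by t in the bit-reflected
+## external format plays the same role as the one-bit left shift of the
+## double-width product.  A sketch of the check:
+##
+##      assert poly64_shiftcommon(u, v, presfn = present_gf_pclmul) == \
+##              poly64_directcommon(u, v, presfn = present_gf_pclmul)
+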
@demo
def demo_pclmul(u, v):
- return poly64_common(u, v, presfn = present_gf_pclmul)
+ return poly64_shiftcommon(u, v, presfn = present_gf_pclmul)
@demo
def demo_vmullp64(u, v):
w = 8*len(u)
- return poly64_common(u, v, presfn = present_gf_mullp64,
- redcwd = w%64 == 32 and 32 or 64)
+ return poly64_shiftcommon(u, v, presfn = present_gf_vmullp64,
+ redcwd = w%64 == 32 and 32 or 64)
@demo
def demo_pmull(u, v):
w = 8*len(u)
- return poly64_common(u, v, presfn = present_gf_pmull,
- redcwd = w%64 == 32 and 32 or 64)
+ return poly64_directcommon(u, v, presfn = present_gf_pmull,
+ redcwd = w%64 == 32 and 32 or 64)
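+## The `redcwd' expression above is a Python 2 and/or conditional: it
+## selects 32-bit reduction words for the 96-bit field, where w%64 == 32,
+## and 64-bit words for the 64- and 128-bit fields.
+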
###--------------------------------------------------------------------------
### @@@ Random debris to be deleted. @@@