2 ### -*- coding: utf-8 -*-
4 from sys
import argv
, exit
8 ###--------------------------------------------------------------------------
12 """Split S into 32-bit pieces and report their values as hex."""
13 return ' '.join('%08x' % C
.MP
.loadb(s
[i
:i
+ 4])
14 for i
in xrange(0, len(s
), 4))
17 """Split S into 64-bit pieces and report their values as hex."""
18 return ' '.join('%016x' % C
.MP
.loadb(s
[i
:i
+ 8])
19 for i
in xrange(0, len(s
), 8))
21 def repmask(val
, wd
, n
):
22 """Return a mask consisting of N copies of the WD-bit value VAL."""
25 for i
in xrange(n
): a
= (a
<< wd
) | v
29 """Iterate over all possible combinations of K of the THINGS."""
33 yield [things
[i
] for i
in ii
]
35 if j
== k
- 1: lim
= n
49 Return the lexically first irreducible polynomial of degree NBITS of lowest
52 try: return POLYMAP
[nbits
]
54 base
= C
.GF(0).setbit(nbits
).setbit(0)
55 for k
in xrange(1, nbits
, 2):
56 for cc
in combs(range(1, nbits
), k
):
57 p
= base
+ sum(C
.GF(0).setbit(c
) for c
in cc
)
58 if p
.irreduciblep(): POLYMAP
[nbits
] = p
; return p
59 raise ValueError, nbits
62 """Flip the bits within each byte according to GCM's insane convention."""
74 def endswap_words_32(x
):
75 """End-swap each 32-bit word of X."""
78 while x
.left
: y
.putu32l(x
.getu32b())
81 def endswap_words_64(x
):
82 """End-swap each 64-bit word of X."""
85 while x
.left
: y
.putu64l(x
.getu64b())
89 """End-swap X by bytes."""
91 for ch
in reversed(x
): y
.put(ch
)
95 return C
.GF(C
.MP(0).setbit(n
) - 1)
98 """Multiply X and Y according to the GCM rules."""
101 u
, v
= C
.GF
.loadl(gcm_mangle(x
)), C
.GF
.loadl(gcm_mangle(y
))
103 return gcm_mangle(z
.storel(w
))
107 name
= func
.func_name
108 assert(name
.startswith('demo_'))
109 DEMOMAP
[name
[5:].replace('_', '-')] = func
114 def next(): vi
[0] += 1; return vi
[0] - 1
117 ###--------------------------------------------------------------------------
118 ### Portable table-driven implementation.
121 """Given a field element X (in external format), return X t."""
124 return gcm_mangle(C
.GF
.storel((C
.GF
.loadl(gcm_mangle(x
)) << 1)%p
))
126 def table_common(u
, v
, flip
, getword
, ixmask
):
128 Multiply U by V using table lookup; common for `table-b' and `table-l'.
130 This matches the `simple_mulk_...' implementation in `gcm.c'. One entry
131 per bit is the best we can manage if we want a constant-time
132 implementation: processing n bits at a time means we need to scan
133 (2^n - 1)/n times as much memory.
135 * FLIP is a function (assumed to be an involution) on one argument X to
136 convert X from external format to table-entry format or back again.
138 * GETWORD is a function on one argument B to retrieve the next 32-bit
139 chunk of a field element held in a `ReadBuffer'. Bits within a word
140 are processed most-significant first.
142 * IXMASK is a mask XORed into table indices to permute the table so that
143 its order matches that induced by GETWORD.
145 The table is built such that tab[i XOR IXMASK] = U t^i.
147 w
= len(u
); assert(w
== len(v
))
148 a
= C
.ByteString
.zero(w
)
150 for i
in xrange(8*w
):
151 print ';; %9s = %7s = %s' %
('utab[%d]' % i
, 'u t^%d' % i
, words(u
))
152 tab
[i ^ ixmask
] = flip(u
)
161 print ';; %6s = %d: a <- %s [%9s = %s]' % \
162 ('v[%d]' %
(i ^ ixmask
), bit
, words(a
),
163 'utab[%d]' %
(i ^ ixmask
), words(tab
[i
]))
def demo_table_b(u, v):
  """
  Multiply U by V by table lookup, reading words big-endian.

  This simply delegates to `table_common', supplying:

    * an identity FLIP, so table entries are stored exactly as given;

    * a GETWORD which fetches each 32-bit chunk using the buffer's `getu32b'
      method; and

    * an IXMASK of zero, so table indices are not permuted.

  Returns whatever `table_common' returns.
  """
  def keep(x): return x
  def next_word(b): return b.getu32b()
  return table_common(u, v, keep, next_word, 0)
def demo_table_l(u, v):
  """
  Multiply U by V by table lookup, reading words little-endian.

  This simply delegates to `table_common', supplying `endswap_words' as FLIP,
  a GETWORD which fetches each 32-bit chunk using the buffer's `getu32l'
  method, and an IXMASK of 0x18.

  Returns whatever `table_common' returns.
  """
  ## XORing a bit index with 0x18 (binary 11000) toggles bits 3 and 4, which
  ## reverses the byte position within each run of 32 consecutive indices.
  ##
  ## NOTE(review): confirm that `endswap_words' is actually defined at module
  ## level; it is looked up only when this function is called.
  def next_word(b): return b.getu32l()
  byte_swizzle = 0x18
  return table_common(u, v, endswap_words, next_word, byte_swizzle)
177 ###--------------------------------------------------------------------------
178 ### Implementation using 64×64->128-bit binary polynomial multiplication.
196 return [C
.GF
.loadb(x
[i
:i
+ n
]) for i
in xrange(0, len(x
), n
)]
200 for i
in xrange(len(xx
)): x
= (x
<< n
) | xx
[i
]
203 def present_gf(x
, w
, n
, what
):
206 for i
in xrange(0, w
, 128):
207 print ';; %12s%c =%s' % \
208 (firstp
and what
or '',
209 firstp
and ':' or ' ',
211 and ' 0x%s' %
hex(((x
>> j
)&m
).storeb(n
/8))
213 for j
in xrange(i
, i
+ 128, n
)]))
def present_gf_pclmul(tag, wd, x, w, n, what):
  """
  Presentation function used by the `pclmul' demonstration.

  Values tagged `TAG_PRODPIECE' are suppressed; any other value X is handed
  to `present_gf' together with W, N and WHAT.  The WD argument is accepted
  but not used here.  Returns nothing.
  """
  wanted = tag != TAG_PRODPIECE
  if wanted:
    present_gf(x, w, n, what)
220 return C
.GF
.loadl(x
.storeb(w
/8))
224 m_ffff
= repmask(0xffff, 32, w
/4)
225 m_ff
= repmask(0xff, 16, w
/2)
226 x
= ((x
&m_ffff
) << 16) |
((x
>> 16)&m_ffff
)
227 x
= ((x
&m_ff
) << 8) |
((x
>> 8)&m_ff
)
232 m_0f
= repmask(0x0f, 8, w
)
233 m_33
= repmask(0x33, 8, w
)
234 m_55
= repmask(0x55, 8, w
)
235 x
= ((x
&m_0f
) << 4) |
((x
>> 4)&m_0f
)
236 x
= ((x
&m_33
) << 2) |
((x
>> 2)&m_33
)
237 x
= ((x
&m_55
) << 1) |
((x
>> 1)&m_55
)
240 def present_gf_mullp64(tag
, wd
, x
, w
, n
, what
):
241 if tag
== TAG_PRODPIECE
or tag
== TAG_REDCFULL
:
243 elif (wd
== 128 or wd
== 64) and TAG_PRODSUM
<= tag
<= TAG_PRODUCT
:
245 elif (wd
== 96 or wd
== 192 or wd
== 256) and \
246 TAG_PRODSUM
<= tag
< TAG_OUTPUT
:
251 if extra
: xx
+= C
.ByteString
.zero(8 - extra
)
253 for i
in xrange(len(xx
), 0, -8): yb
.put(xx
[i
- 8:i
])
254 y
= C
.GF
.loadb(yb
.contents
)
255 present_gf(y
, (w
+ 63)&~
63, n
, what
)
257 def present_gf_pmull(tag
, wd
, x
, w
, n
, what
):
258 if tag
== TAG_PRODPIECE
or tag
== TAG_REDCFULL
or tag
== TAG_SHIFTED
:
260 elif tag
== TAG_INPUT_V
or tag
== TAG_KPIECE_V
:
261 bx
= C
.ReadBuffer(x
.storeb(w
/8))
263 while bx
.left
: chunk
= bx
.get(8); by
.put(chunk
).put(chunk
)
264 x
= C
.GF
.loadb(by
.contents
)
266 elif TAG_PRODSUM
<= tag
<= TAG_PRODUCT
:
268 y
= reverse(rev8(x
), w
)
269 present_gf(y
, w
, n
, what
)
271 def poly64_mul_simple(u
, v
, presfn
, wd
, dispwd
, mulwd
, uwhat
, vwhat
):
273 Multiply U by V, returning the product.
275 This is the fallback long multiplication.
278 uw
, vw
= 8*len(u
), 8*len(v
)
280 ## We start by carving the operands into 64-bit pieces. This is
281 ## straightforward except for the 96-bit case, where we end up with two
282 ## short pieces which we pad at the beginning.
283 if uw
%mulwd
: pad
= (-uw
)%mulwd
; u
+= C
.ByteString
.zero(pad
); uw
+= pad
284 if vw
%mulwd
: pad
= (-uw
)%mulwd
; v
+= C
.ByteString
.zero(pad
); vw
+= pad
285 uu
= split_gf(u
, mulwd
)
286 vv
= split_gf(v
, mulwd
)
288 ## Report and accumulate the individual product pieces.
290 ulim
, vlim
= uw
/mulwd
, vw
/mulwd
291 for i
in xrange(ulim
+ vlim
- 2, -1, -1):
293 for j
in xrange(max(0, i
- vlim
+ 1), min(vlim
, i
+ 1)):
294 s
= uu
[ulim
- 1 - i
+ j
]*vv
[vlim
- 1 - j
]
295 presfn(TAG_PRODPIECE
, wd
, s
, 2*mulwd
, dispwd
,
296 '%s_%d %s_%d' %
(uwhat
, i
- j
, vwhat
, j
))
298 presfn(TAG_PRODSUM
, wd
, t
, 2*mulwd
, dispwd
,
299 '(%s %s)_%d' %
(uwhat
, vwhat
, ulim
+ vlim
- 2 - i
))
301 presfn(TAG_PRODUCT
, wd
, x
, uw
+ vw
, dispwd
, '%s %s' %
(uwhat
, vwhat
))
305 def poly64_mul_karatsuba(u
, v
, klimit
, presfn
, wd
,
306 dispwd
, mulwd
, uwhat
, vwhat
):
308 Multiply U by V, returning the product.
310 If the length of U and V is at least KLIMIT, and the operands are otherwise
311 suitable, then do Karatsuba--Ofman multiplication; otherwise, delegate to
316 if w
< klimit
or w
!= 8*len(v
) or w
%(2*mulwd
) != 0:
317 return poly64_mul_simple(u
, v
, presfn
, wd
, dispwd
, mulwd
, uwhat
, vwhat
)
320 u0
, u1
= u
[:hw
/8], u
[hw
/8:]
321 v0
, v1
= v
[:hw
/8], v
[hw
/8:]
322 uu
, vv
= u0 ^ u1
, v0 ^ v1
324 presfn(TAG_KPIECE_U
, wd
, C
.GF
.loadb(uu
), hw
, dispwd
, '%s*' % uwhat
)
325 presfn(TAG_KPIECE_V
, wd
, C
.GF
.loadb(vv
), hw
, dispwd
, '%s*' % vwhat
)
326 uuvv
= poly64_mul_karatsuba(uu
, vv
, klimit
, presfn
, wd
, dispwd
, mulwd
,
327 '%s*' % uwhat
, '%s*' % vwhat
)
329 presfn(TAG_KPIECE_U
, wd
, C
.GF
.loadb(u0
), hw
, dispwd
, '%s0' % uwhat
)
330 presfn(TAG_KPIECE_V
, wd
, C
.GF
.loadb(v0
), hw
, dispwd
, '%s0' % vwhat
)
331 u0v0
= poly64_mul_karatsuba(u0
, v0
, klimit
, presfn
, wd
, dispwd
, mulwd
,
332 '%s0' % uwhat
, '%s0' % vwhat
)
334 presfn(TAG_KPIECE_U
, wd
, C
.GF
.loadb(u1
), hw
, dispwd
, '%s1' % uwhat
)
335 presfn(TAG_KPIECE_V
, wd
, C
.GF
.loadb(v1
), hw
, dispwd
, '%s1' % vwhat
)
336 u1v1
= poly64_mul_karatsuba(u1
, v1
, klimit
, presfn
, wd
, dispwd
, mulwd
,
337 '%s1' % uwhat
, '%s1' % vwhat
)
339 uvuv
= uuvv
+ u0v0
+ u1v1
340 presfn(TAG_PRODSUM
, wd
, uvuv
, w
, dispwd
, '%s!%s' %
(uwhat
, vwhat
))
342 x
= u1v1
+ (uvuv
<< hw
) + (u0v0
<< w
)
343 presfn(TAG_PRODUCT
, wd
, x
, 2*w
, dispwd
, '%s %s' %
(uwhat
, vwhat
))
346 def poly64_common(u
, v
, presfn
, dispwd
= 32, mulwd
= 64, redcwd
= 32,
349 Multiply U by V using a primitive 64-bit binary polynomial multiplier.
351 Such a multiplier exists as the appallingly-named `pclmul[lh]q[lh]qdq' on
352 x86, and as `vmull.p64'/`pmull' on ARM.
354 Operands arrive in a `register format', which is a byte-swapped variant of
355 the external format. Implementations differ on the precise details,
359 ## We work in two main phases: first, calculate the full double-width
360 ## product; and, second, reduce it modulo the field polynomial.
362 w
= 8*len(u
); assert(w
== 8*len(v
))
364 presfn(TAG_INPUT_U
, w
, C
.GF
.loadb(u
), w
, dispwd
, 'u')
365 presfn(TAG_INPUT_V
, w
, C
.GF
.loadb(v
), w
, dispwd
, 'v')
367 ## So, on to the first part: the multiplication.
368 x
= poly64_mul_karatsuba(u
, v
, klimit
, presfn
, w
, dispwd
, mulwd
, 'u', 'v')
370 ## Now we have to shift everything up one bit to account for GCM's crazy
374 presfn(TAG_SHIFTED
, w
, y
, 2*w
, dispwd
, 'y')
376 ## Now for the reduction.
378 ## Our polynomial has the form p = t^d + r where r = SUM_{0<=i<d} r_i t^i,
379 ## with each r_i either 0 or 1. Because we choose the lexically earliest
380 ## irreducible polynomial with the necessary degree, r_i = 1 happens only
381 ## for a small number of tiny i. In our field, we have t^d = r.
383 ## We carve the product into convenient n-bit pieces, for some n dividing d
384 ## -- typically n = 32 or 64. Let d = m n, and write y = SUM_{0<=i<2m} y_i
385 ## t^{ni}. The upper portion, the y_i with i >= m, needs reduction; but
386 ## y_i t^{ni} = y_i r t^{n(i-m)}, so we just multiply the top half by r and
387 ## add it to the bottom half. This all depends on r_i = 0 for all i >=
388 ## n/2. We process each nonzero coefficient of r separately, in two
391 ## Multiplying a chunk y_i by some t^j is the same as shifting it left by j
392 ## bits (or would be if GCM weren't backwards, but let's not worry about
393 ## that right now). The high j bits will spill over into the next chunk,
394 ## while the low n - j bits will stay where they are. It's these high bits
395 ## which cause trouble -- particularly the high bits of the top chunk,
396 ## since we'll add them on to y_m, which will need further reduction. But
397 ## only the topmost j bits will do this.
399 ## The trick is that we do all of the bits which spill over first -- all of
400 ## the top j bits in each chunk, for each j -- in one pass, and then a
401 ## second pass of all the bits which don't. Because j, j' < n/2 for any
402 ## two nonzero coefficient degrees j and j', we have j + j' < n whence j <
403 ## n - j' -- so all of the bits contributed to y_m will be handled in the
404 ## second pass when we handle the bits that don't spill over.
405 rr
= [i
for i
in xrange(1, w
) if p
.testbit(i
)]
408 ## Handle the spilling bits.
409 yy
= split_gf(y
.storeb(w
/4), redcwd
)
412 br
= [(yi
<< (redcwd
- rj
))&m
for yi
in yy
[w
/redcwd
:]]
413 presfn(TAG_REDCBITS
, w
, join_gf(br
, redcwd
), w
, dispwd
, 'b(%d)' % rj
)
414 b
+= join_gf(br
, redcwd
) << (w
- redcwd
)
415 presfn(TAG_REDCFULL
, w
, b
, 2*w
, dispwd
, 'b')
417 presfn(TAG_REDCMIX
, w
, s
, 2*w
, dispwd
, 's')
419 ## Handle the nonspilling bits.
420 ss
= split_gf(s
.storeb(w
/4), redcwd
)
423 ar
= [si
>> rj
for si
in ss
[w
/redcwd
:]]
424 presfn(TAG_REDCBITS
, w
, join_gf(ar
, redcwd
), w
, dispwd
, 'a(%d)' % rj
)
425 a
+= join_gf(ar
, redcwd
)
426 presfn(TAG_REDCFULL
, w
, a
, w
, dispwd
, 'a')
428 ## Mix everything together.
430 z
= (s
&m
) + (s
>> w
) + a
431 presfn(TAG_OUTPUT
, w
, z
, w
, dispwd
, 'z')
def demo_pclmul(u, v):
  """
  Multiply U by V using `poly64_common', presenting via `present_gf_pclmul'.

  Every other `poly64_common' parameter is left at its default.  Returns
  whatever `poly64_common' returns.
  """
  presenter = present_gf_pclmul
  return poly64_common(u, v, presfn = presenter)
441 def demo_vmullp64(u
, v
):
443 return poly64_common(u
, v
, presfn
= present_gf_mullp64
,
444 redcwd
= w
%64 == 32 and 32 or 64)
447 def demo_pmull(u
, v
):
449 return poly64_common(u
, v
, presfn
= present_gf_pmull
,
450 redcwd
= w
%64 == 32 and 32 or 64)
452 ###--------------------------------------------------------------------------
453 ### @@@ Random debris to be deleted. @@@
455 def cutting_room_floor():
457 x
= C
.bytes('cde4bef260d7bcda163547d348b7551195e77022907dd1df')
458 y
= C
.bytes('f7dac5c9941d26d0c6eb14ad568f86edd1dc9268eeee5332')
460 u
, v
= C
.GF
.loadb(x
), C
.GF
.loadb(y
)
463 print 'y = %s' %
words(g
.storeb(48))
464 b1
= (g
&repmask(0x01, 32, 6)) << 191
465 b2
= (g
&repmask(0x03, 32, 6)) << 190
466 b7
= (g
&repmask(0x7f, 32, 6)) << 185
468 print 'b = %s' %
words(b
.storeb(48)[0:28])
470 print 'w = %s' %
words(h
.storeb(48))
472 a0
= (h
&repmask(0xffffffff, 32, 6)) << 192
473 a1
= (h
&repmask(0xfffffffe, 32, 6)) << 191
474 a2
= (h
&repmask(0xfffffffc, 32, 6)) << 190
475 a7
= (h
&repmask(0xffffff80, 32, 6)) << 185
476 a
= a0
+ a1
+ a2
+ a7
478 print ' a_1 = %s' %
words(a1
.storeb(48)[0:24])
479 print ' a_2 = %s' %
words(a2
.storeb(48)[0:24])
480 print ' a_7 = %s' %
words(a7
.storeb(48)[0:24])
482 print 'low+unit = %s' %
words((h
+ a0
).storeb(48)[0:24])
483 print ' low+0,2 = %s' %
words((h
+ a0
+ a2
).storeb(48)[0:24])
484 print ' 1,7 = %s' %
words((a1
+ a7
).storeb(48)[0:24])
486 print 'a = %s' %
words(a
.storeb(48)[0:24])
488 print 'z = %s' %
words(z
.storeb(48))
491 print 'u v mod p = %s' %
words(z
)
493 ###--------------------------------------------------------------------------
499 zz
= DEMOMAP
[style
](u
, v
)
500 assert zz
== gcm_mul(u
, v
)
502 ###----- That's all, folks --------------------------------------------------