bec00d7532c9abe45921051c78073a35ae4c736b
2 ### -*- coding: utf-8 -*-
4 from sys
import argv
, exit
8 ###--------------------------------------------------------------------------
12 """Split S into 32-bit pieces and report their values as hex."""
13 return ' '.join('%08x' % C
.MP
.loadb(s
[i
:i
+ 4])
14 for i
in xrange(0, len(s
), 4))
17 """Split S into 64-bit pieces and report their values as hex."""
18 return ' '.join('%016x' % C
.MP
.loadb(s
[i
:i
+ 8])
19 for i
in xrange(0, len(s
), 8))
def repmask(val, wd, n):
  """Return a mask consisting of N copies of the WD-bit value VAL."""
  ## NOTE(review): the initializations of `v' and `a' were missing from the
  ## damaged source.  C.GF matches how callers combine the result with GF
  ## values using `&' -- verify against upstream.
  v = C.GF(val)
  a = C.GF(0)
  for i in xrange(n): a = (a << wd) | v
  return a
29 """Iterate over all possible combinations of K of the THINGS."""
33 yield [things
[i
] for i
in ii
]
35 if j
== k
- 1: lim
= n
def poly(nbits):
  """
  Return the lexically first irreducible polynomial of degree NBITS of lowest
  weight.

  Results are cached in POLYMAP, so repeated queries are cheap.
  """
  ## Check the cache first.
  try: return POLYMAP[nbits]
  except KeyError: pass

  ## Start from t^NBITS + 1 and add middle terms, fewest first.  Only odd
  ## numbers of middle terms are tried: with an even count the total number
  ## of terms is even, so t + 1 divides the polynomial and it can't be
  ## irreducible.  The first irreducible hit is the lexically first one.
  base = C.GF(0).setbit(nbits).setbit(0)
  for k in xrange(1, nbits, 2):
    for cc in combs(range(1, nbits), k):
      p = base + sum((C.GF(0).setbit(c) for c in cc), C.GF(0))
      if p.irreduciblep(): POLYMAP[nbits] = p; return p
  ## No polynomial found (shouldn't happen for sensible NBITS).
  ## (Fixed: was the Python-2-only form `raise ValueError, nbits'.)
  raise ValueError(nbits)
62 """Flip the bits within each byte according to GCM's insane convention."""
def endswap_words_32(x):
  """End-swap each 32-bit word of X."""
  ## NOTE(review): the buffer set-up and the `return' were missing from the
  ## damaged source; reconstructed from the surviving loop, which reads
  ## 32-bit words big-endian and writes them back little-endian.
  x = C.ReadBuffer(x)
  y = C.WriteBuffer()
  while x.left: y.putu32l(x.getu32b())
  return y.contents
def endswap_words_64(x):
  """End-swap each 64-bit word of X."""
  ## NOTE(review): the buffer set-up and the `return' were missing from the
  ## damaged source; reconstructed from the surviving loop, which reads
  ## 64-bit words big-endian and writes them back little-endian.
  x = C.ReadBuffer(x)
  y = C.WriteBuffer()
  while x.left: y.putu64l(x.getu64b())
  return y.contents
89 """End-swap X by bytes."""
91 for ch
in reversed(x
): y
.put(ch
)
def mask(n):
  """Return a GF element with its low N bits all set."""
  ## NOTE(review): only the `return' survived in the damaged source; the
  ## name `mask' and signature are reconstructions -- verify upstream.
  return C.GF(C.MP(0).setbit(n) - 1)
98 """Multiply X and Y according to the GCM rules."""
101 u
, v
= C
.GF
.loadl(gcm_mangle(x
)), C
.GF
.loadl(gcm_mangle(y
))
103 return gcm_mangle(z
.storel(w
))
def demo(func):
  """
  Register FUNC in DEMOMAP, under its name less the `demo_' prefix and with
  underscores turned into hyphens; return FUNC, for use as a decorator.
  """
  ## NOTE(review): the `def' line and `return' were missing from the damaged
  ## source and have been reconstructed.
  ## (Fixed: `func.func_name' is Python-2-only; `__name__' is equivalent
  ## there and also portable.)
  name = func.__name__
  assert(name.startswith('demo_'))
  DEMOMAP[name[5:].replace('_', '-')] = func
  return func
114 def next(): vi
[0] += 1; return vi
[0] - 1
117 ###--------------------------------------------------------------------------
118 ### Portable table-driven implementation.
121 """Given a field element X (in external format), return X t."""
124 return gcm_mangle(C
.GF
.storel((C
.GF
.loadl(gcm_mangle(x
)) << 1)%p
))
126 def table_common(u
, v
, flip
, getword
, ixmask
):
128 Multiply U by V using table lookup; common for `table-b' and `table-l'.
130 This matches the `simple_mulk_...' implementation in `gcm.c'. One-entry
131 per bit is the best we can manage if we want a constant-time
132 implementation: processing n bits at a time means we need to scan
133 (2^n - 1)/n times as much memory.
135 * FLIP is a function (assumed to be an involution) on one argument X to
136 convert X from external format to table-entry format or back again.
138 * GETWORD is a function on one argument B to retrieve the next 32-bit
139 chunk of a field element held in a `ReadBuffer'. Bits within a word
140 are processed most-significant first.
142 * IXMASK is a mask XORed into table indices to permute the table so that
143 it's order matches that induced by GETWORD.
145 The table is built such that tab[i XOR IXMASK] = U t^i.
147 w
= len(u
); assert(w
== len(v
))
148 a
= C
.ByteString
.zero(w
)
150 for i
in xrange(8*w
):
151 print ';; %9s = %7s = %s' %
('utab[%d]' % i
, 'u t^%d' % i
, words(u
))
152 tab
[i ^ ixmask
] = flip(u
)
161 print ';; %6s = %d: a <- %s [%9s = %s]' % \
162 ('v[%d]' %
(i ^ ixmask
), bit
, words(a
),
163 'utab[%d]' %
(i ^ ixmask
), words(tab
[i
]))
def demo_table_b(u, v):
  """Big-endian table lookup."""
  ## Entries stay in external format (identity flip); words are fetched
  ## big-endian, so no index permutation is needed.
  ident = lambda x: x
  getb = lambda b: b.getu32b()
  return table_common(u, v, ident, getb, 0)
def demo_table_l(u, v):
  """Little-endian table lookup."""
  ## Entries are end-swapped 32-bit words; fetch words little-endian, and
  ## permute the table indices with 0x18 to compensate.
  getl = lambda b: b.getu32l()
  return table_common(u, v, endswap_words_32, getl, 0x18)
177 ###--------------------------------------------------------------------------
178 ### Implementation using 64×64->128-bit binary polynomial multiplication.
def split_gf(x, n):
  """Split the byte string X into N-bit big-endian pieces, most significant
  first."""
  ## NOTE(review): the `def' line was missing from the damaged source.  The
  ## name is confirmed by visible call sites, which pass N in bits, while
  ## the surviving comprehension slices bytes -- hence the conversion here.
  n /= 8
  return [C.GF.loadb(x[i:i + n]) for i in xrange(0, len(x), n)]
def join_gf(xx, n):
  """Join the N-bit pieces XX, most significant first, into one GF element."""
  ## NOTE(review): the `def' line, accumulator initialization and `return'
  ## were missing from the damaged source; the name is confirmed by visible
  ## call sites.
  x = C.GF(0)
  for i in xrange(len(xx)): x = (x << n) | xx[i]
  return x
203 def present_gf(x
, w
, n
, what
):
206 for i
in xrange(0, w
, 128):
207 print ';; %12s%c =%s' % \
208 (firstp
and what
or '',
209 firstp
and ':' or ' ',
211 and ' 0x%s' %
hex(((x
>> j
)&m
).storeb(n
/8))
213 for j
in xrange(i
, i
+ 128, n
)]))
def present_gf_pclmul(tag, wd, x, w, n, what):
  """Presentation hook for the `pclmul' demo: show every intermediate value
  except the individual product pieces."""
  if tag == TAG_PRODPIECE: return
  present_gf(x, w, n, what)
def reverse(x, w):
  """Return X, a W-bit element, with its bytes in the opposite order."""
  ## NOTE(review): the `def' line was missing from the damaged source; the
  ## name is confirmed by a visible call site.  Storing big-endian and
  ## re-loading little-endian reverses the byte order.
  return C.GF.loadl(x.storeb(w/8))
def rev32(x, w):
  """Reverse the byte order within each 32-bit word of X, W bytes wide."""
  ## NOTE(review): the `def' line and `return' were missing from the damaged
  ## source; the name `rev32' is a reconstruction -- verify upstream.  The
  ## repmask widths show W is measured in bytes here.
  m_ffff = repmask(0xffff, 32, w/4)
  m_ff = repmask(0xff, 16, w/2)
  ## Swap the 16-bit halves of each word, then the bytes of each half.
  x = ((x&m_ffff) << 16) | ((x >> 16)&m_ffff)
  x = ((x&m_ff) << 8) | ((x >> 8)&m_ff)
  return x
def rev8(x, w):
  """Reverse the bit order within each byte of X, W bytes wide."""
  ## NOTE(review): the `def' line and `return' were missing from the damaged
  ## source; the name `rev8' matches a visible call site, though that call
  ## appears to pass only one argument -- verify the signature upstream.
  m_0f = repmask(0x0f, 8, w)
  m_33 = repmask(0x33, 8, w)
  m_55 = repmask(0x55, 8, w)
  ## Swap nibbles, then bit-pairs, then adjacent bits.
  x = ((x&m_0f) << 4) | ((x >> 4)&m_0f)
  x = ((x&m_33) << 2) | ((x >> 2)&m_33)
  x = ((x&m_55) << 1) | ((x >> 1)&m_55)
  return x
240 def present_gf_mullp64(tag
, wd
, x
, w
, n
, what
):
241 if tag
== TAG_PRODPIECE
or tag
== TAG_REDCFULL
:
243 elif (wd
== 128 or wd
== 64) and TAG_PRODSUM
<= tag
<= TAG_PRODUCT
:
245 elif (wd
== 96 or wd
== 192 or wd
== 256) and \
246 TAG_PRODSUM
<= tag
< TAG_OUTPUT
:
251 if extra
: xx
+= C
.ByteString
.zero(8 - extra
)
253 for i
in xrange(len(xx
), 0, -8): yb
.put(xx
[i
- 8:i
])
254 y
= C
.GF
.loadb(yb
.contents
)
255 present_gf(y
, (w
+ 63)&~
63, n
, what
)
257 def present_gf_pmull(tag
, wd
, x
, w
, n
, what
):
258 if tag
== TAG_PRODPIECE
or tag
== TAG_REDCFULL
or tag
== TAG_SHIFTED
:
260 elif tag
== TAG_INPUT_V
or tag
== TAG_KPIECE_V
:
262 bx
= C
.ReadBuffer(x
.storeb(w
/8))
264 while bx
.left
: chunk
= bx
.get(8); by
.put(chunk
).put(chunk
)
265 x
= C
.GF
.loadb(by
.contents
)
267 elif TAG_PRODSUM
<= tag
<= TAG_PRODUCT
:
269 y
= reverse(rev8(x
), w
)
270 present_gf(y
, w
, n
, what
)
def poly64_mul_simple(u, v, presfn, wd, dispwd, mulwd, uwhat, vwhat):
  """
  Multiply U by V, returning the product.

  This is the fallback long multiplication, working in MULWD-bit pieces and
  reporting progress through PRESFN.
  """
  ## NOTE(review): the accumulator initializations, the running totals and
  ## the `return' were missing from the damaged source; they have been
  ## reconstructed from the surviving presentation calls -- verify upstream.
  uw, vw = 8*len(u), 8*len(v)

  ## We start by carving the operands into 64-bit pieces.  This is
  ## straightforward except for the 96-bit case, where we end up with two
  ## short pieces which we pad at the beginning.
  ## NOTE(review): `pad' is computed in bits while `ByteString.zero' counts
  ## bytes, and the append contradicts `at the beginning' above -- these
  ## lines survive verbatim from the damaged source; check against upstream.
  if uw%mulwd: pad = (-uw)%mulwd; u += C.ByteString.zero(pad); uw += pad
  if vw%mulwd: pad = (-vw)%mulwd; v += C.ByteString.zero(pad); vw += pad
  uu = split_gf(u, mulwd)
  vv = split_gf(v, mulwd)

  ## Report and accumulate the individual product pieces.
  x = C.GF(0)
  ulim, vlim = uw/mulwd, vw/mulwd
  for i in xrange(ulim + vlim - 2, -1, -1):
    t = C.GF(0)
    for j in xrange(max(0, i - vlim + 1), min(vlim, i + 1)):
      s = uu[ulim - 1 - i + j]*vv[vlim - 1 - j]
      presfn(TAG_PRODPIECE, wd, s, 2*mulwd, dispwd,
             '%s_%d %s_%d' % (uwhat, i - j, vwhat, j))
      t += s
    presfn(TAG_PRODSUM, wd, t, 2*mulwd, dispwd,
           '(%s %s)_%d' % (uwhat, vwhat, ulim + vlim - 2 - i))
    ## Shift the accumulator up and mix in this column's total.
    x = (x << mulwd) + t

  presfn(TAG_PRODUCT, wd, x, uw + vw, dispwd, '%s %s' % (uwhat, vwhat))
  return x
def poly64_mul_karatsuba(u, v, klimit, presfn, wd,
                         dispwd, mulwd, uwhat, vwhat):
  """
  Multiply U by V, returning the product.

  If the length of U and V is at least KLIMIT, and the operands are otherwise
  suitable, then do Karatsuba--Ofman multiplication; otherwise, delegate to
  the simple long multiplication.
  """
  ## NOTE(review): the width/half-width assignments and the final `return'
  ## were missing from the damaged source; they are forced by the surviving
  ## uses of `w' and `hw' -- verify against upstream.
  w = 8*len(u)

  ## Operands must be the same length, long enough to be worthwhile, and
  ## evenly splittable into MULWD-bit halves; otherwise fall back.
  if w < klimit or w != 8*len(v) or w%(2*mulwd) != 0:
    return poly64_mul_simple(u, v, presfn, wd, dispwd, mulwd, uwhat, vwhat)

  ## Split each operand in half and form u* = u0 + u1, v* = v0 + v1.
  hw = w/2
  u0, u1 = u[:hw/8], u[hw/8:]
  v0, v1 = v[:hw/8], v[hw/8:]
  uu, vv = u0 ^ u1, v0 ^ v1

  ## Recursively form the three partial products u* v*, u0 v0, and u1 v1.
  presfn(TAG_KPIECE_U, wd, C.GF.loadb(uu), hw, dispwd, '%s*' % uwhat)
  presfn(TAG_KPIECE_V, wd, C.GF.loadb(vv), hw, dispwd, '%s*' % vwhat)
  uuvv = poly64_mul_karatsuba(uu, vv, klimit, presfn, wd, dispwd, mulwd,
                              '%s*' % uwhat, '%s*' % vwhat)

  presfn(TAG_KPIECE_U, wd, C.GF.loadb(u0), hw, dispwd, '%s0' % uwhat)
  presfn(TAG_KPIECE_V, wd, C.GF.loadb(v0), hw, dispwd, '%s0' % vwhat)
  u0v0 = poly64_mul_karatsuba(u0, v0, klimit, presfn, wd, dispwd, mulwd,
                              '%s0' % uwhat, '%s0' % vwhat)

  presfn(TAG_KPIECE_U, wd, C.GF.loadb(u1), hw, dispwd, '%s1' % uwhat)
  presfn(TAG_KPIECE_V, wd, C.GF.loadb(v1), hw, dispwd, '%s1' % vwhat)
  u1v1 = poly64_mul_karatsuba(u1, v1, klimit, presfn, wd, dispwd, mulwd,
                              '%s1' % uwhat, '%s1' % vwhat)

  ## Over GF(2), u* v* + u0 v0 + u1 v1 = u0 v1 + u1 v0: the middle piece.
  uvuv = uuvv + u0v0 + u1v1
  presfn(TAG_PRODSUM, wd, uvuv, w, dispwd, '%s!%s' % (uwhat, vwhat))

  ## Stitch the pieces together.
  x = u1v1 + (uvuv << hw) + (u0v0 << w)
  presfn(TAG_PRODUCT, wd, x, 2*w, dispwd, '%s %s' % (uwhat, vwhat))
  return x
347 def poly64_common(u
, v
, presfn
, dispwd
= 32, mulwd
= 64, redcwd
= 32,
350 Multiply U by V using a primitive 64-bit binary polynomial mutliplier.
352 Such a multiplier exists as the appallingly-named `pclmul[lh]q[lh]qdq' on
353 x86, and as `vmull.p64'/`pmull' on ARM.
355 Operands arrive in a `register format', which is a byte-swapped variant of
356 the external format. Implementations differ on the precise details,
360 ## We work in two main phases: first, calculate the full double-width
361 ## product; and, second, reduce it modulo the field polynomial.
363 w
= 8*len(u
); assert(w
== 8*len(v
))
365 presfn(TAG_INPUT_U
, w
, C
.GF
.loadb(u
), w
, dispwd
, 'u')
366 presfn(TAG_INPUT_V
, w
, C
.GF
.loadb(v
), w
, dispwd
, 'v')
368 ## So, on to the first part: the multiplication.
369 x
= poly64_mul_karatsuba(u
, v
, klimit
, presfn
, w
, dispwd
, mulwd
, 'u', 'v')
371 ## Now we have to shift everything up one bit to account for GCM's crazy
375 presfn(TAG_SHIFTED
, w
, y
, 2*w
, dispwd
, 'y')
377 ## Now for the reduction.
379 ## Our polynomial has the form p = t^d + r where r = SUM_{0<=i<d} r_i t^i,
380 ## with each r_i either 0 or 1. Because we choose the lexically earliest
381 ## irreducible polynomial with the necessary degree, r_i = 1 happens only
382 ## for a small number of tiny i. In our field, we have t^d = r.
384 ## We carve the product into convenient n-bit pieces, for some n dividing d
385 ## -- typically n = 32 or 64. Let d = m n, and write y = SUM_{0<=i<2m} y_i
386 ## t^{ni}. The upper portion, the y_i with i >= m, needs reduction; but
387 ## y_i t^{ni} = y_i r t^{n(i-m)}, so we just multiply the top half by r and
388 ## add it to the bottom half. This all depends on r_i = 0 for all i >=
389 ## n/2. We process each nonzero coefficient of r separately, in two
392 ## Multiplying a chunk y_i by some t^j is the same as shifting it left by j
393 ## bits (or would be if GCM weren't backwards, but let's not worry about
394 ## that right now). The high j bits will spill over into the next chunk,
395 ## while the low n - j bits will stay where they are. It's these high bits
396 ## which cause trouble -- particularly the high bits of the top chunk,
397 ## since we'll add them on to y_m, which will need further reduction. But
398 ## only the topmost j bits will do this.
400 ## The trick is that we do all of the bits which spill over first -- all of
401 ## the top j bits in each chunk, for each j -- in one pass, and then a
402 ## second pass of all the bits which don't. Because j, j' < n/2 for any
403 ## two nonzero coefficient degrees j and j', we have j + j' < n whence j <
404 ## n - j' -- so all of the bits contributed to y_m will be handled in the
405 ## second pass when we handle the bits that don't spill over.
406 rr
= [i
for i
in xrange(1, w
) if p
.testbit(i
)]
409 ## Handle the spilling bits.
410 yy
= split_gf(y
.storeb(w
/4), redcwd
)
413 br
= [(yi
<< (redcwd
- rj
))&m
for yi
in yy
[w
/redcwd
:]]
414 presfn(TAG_REDCBITS
, w
, join_gf(br
, redcwd
), w
, dispwd
, 'b(%d)' % rj
)
415 b
+= join_gf(br
, redcwd
) << (w
- redcwd
)
416 presfn(TAG_REDCFULL
, w
, b
, 2*w
, dispwd
, 'b')
418 presfn(TAG_REDCMIX
, w
, s
, 2*w
, dispwd
, 's')
420 ## Handle the nonspilling bits.
421 ss
= split_gf(s
.storeb(w
/4), redcwd
)
424 ar
= [si
>> rj
for si
in ss
[w
/redcwd
:]]
425 presfn(TAG_REDCBITS
, w
, join_gf(ar
, redcwd
), w
, dispwd
, 'a(%d)' % rj
)
426 a
+= join_gf(ar
, redcwd
)
427 presfn(TAG_REDCFULL
, w
, a
, w
, dispwd
, 'a')
429 ## Mix everything together.
431 z
= (s
&m
) + (s
>> w
) + a
432 presfn(TAG_OUTPUT
, w
, z
, w
, dispwd
, 'z')
def demo_pclmul(u, v):
  """Demo using a 64-bit carryless multiply, x86 `pclmulqdq' flavour."""
  return poly64_common(u, v, presfn = present_gf_pclmul)
def demo_vmullp64(u, v):
  """Demo using the ARM `vmull.p64' polynomial multiply."""
  ## NOTE(review): the computation of `w' was missing from the damaged
  ## source; it must be the operand width in bits for the `redcwd' choice
  ## below to make sense.
  w = 8*len(u)
  ## Use 32-bit reduction words for odd widths (e.g. 96, 160), 64 otherwise.
  return poly64_common(u, v, presfn = present_gf_mullp64,
                       redcwd = w%64 == 32 and 32 or 64)
def demo_pmull(u, v):
  """Demo using the ARM64 `pmull' polynomial multiply."""
  ## NOTE(review): the computation of `w' was missing from the damaged
  ## source; it must be the operand width in bits for the `redcwd' choice
  ## below to make sense.
  w = 8*len(u)
  ## Use 32-bit reduction words for odd widths (e.g. 96, 160), 64 otherwise.
  return poly64_common(u, v, presfn = present_gf_pmull,
                       redcwd = w%64 == 32 and 32 or 64)
453 ###--------------------------------------------------------------------------
454 ### @@@ Random debris to be deleted. @@@
456 def cutting_room_floor():
458 x
= C
.bytes('cde4bef260d7bcda163547d348b7551195e77022907dd1df')
459 y
= C
.bytes('f7dac5c9941d26d0c6eb14ad568f86edd1dc9268eeee5332')
461 u
, v
= C
.GF
.loadb(x
), C
.GF
.loadb(y
)
464 print 'y = %s' %
words(g
.storeb(48))
465 b1
= (g
&repmask(0x01, 32, 6)) << 191
466 b2
= (g
&repmask(0x03, 32, 6)) << 190
467 b7
= (g
&repmask(0x7f, 32, 6)) << 185
469 print 'b = %s' %
words(b
.storeb(48)[0:28])
471 print 'w = %s' %
words(h
.storeb(48))
473 a0
= (h
&repmask(0xffffffff, 32, 6)) << 192
474 a1
= (h
&repmask(0xfffffffe, 32, 6)) << 191
475 a2
= (h
&repmask(0xfffffffc, 32, 6)) << 190
476 a7
= (h
&repmask(0xffffff80, 32, 6)) << 185
477 a
= a0
+ a1
+ a2
+ a7
479 print ' a_1 = %s' %
words(a1
.storeb(48)[0:24])
480 print ' a_2 = %s' %
words(a2
.storeb(48)[0:24])
481 print ' a_7 = %s' %
words(a7
.storeb(48)[0:24])
483 print 'low+unit = %s' %
words((h
+ a0
).storeb(48)[0:24])
484 print ' low+0,2 = %s' %
words((h
+ a0
+ a2
).storeb(48)[0:24])
485 print ' 1,7 = %s' %
words((a1
+ a7
).storeb(48)[0:24])
487 print 'a = %s' %
words(a
.storeb(48)[0:24])
489 print 'z = %s' %
words(z
.storeb(48))
492 print 'u v mod p = %s' %
words(z
)
494 ###--------------------------------------------------------------------------
500 zz
= DEMOMAP
[style
](u
, v
)
501 assert zz
== gcm_mul(u
, v
)
503 ###----- That's all, folks --------------------------------------------------