10 def __init__(self
, file, datasofar
):
12 assert len(datasofar
) < 30
13 self
.header
= datasofar
16 self
.decompress
= zlib
.decompressobj()
17 # Zlib header bytes, expected by decompress obj but not
19 self
.decompress
.decompress("\x78\x9c")
22 if self
.dataleft
== None:
23 while len(self
.header
) < 30:
24 s
= self
.file.read(30 - len(self
.header
))
26 self
.header
= self
.header
+ s
27 # Name length and extra length.
28 namelen
= 256 * ord(self
.header
[27]) + ord(self
.header
[26])
29 extralen
= 256 * ord(self
.header
[29]) + ord(self
.header
[28])
30 while len(self
.header
) < 30 + namelen
+ extralen
:
31 s
= self
.file.read(30 + namelen
+ extralen
- len(self
.header
))
33 self
.header
= self
.header
+ s
35 256 * (256 * (256 * ord(self
.header
[21]) + ord(self
.header
[20])) \
36 + ord(self
.header
[19])) + ord(self
.header
[18])
37 k
= string
.find(self
.data
, "\n")
40 if rlen
> 4096: rlen
= 4096
42 d
= self
.file.read(rlen
)
44 self
.dataleft
= self
.dataleft
- rlen
45 self
.data
= self
.data
+ self
.decompress
.decompress(d
)
46 k
= string
.find(self
.data
, "\n")
53 self
.data
= self
.data
[k
+1:]
58 if s
[-1:] == "L" or s
[-1:] == "l":
60 if s
[:2] == "0x" or s
[:2] == "0X":
67 while len(key
) < 4: key
= "0" + key
68 key
= string
.upper(key
)
72 return "<han> " + value
77 return string
.split(value
, ";")[1]
79 return "<no name available>"
83 def output(char
, bytes
, errors
):
91 for i
in range(6-len(bytes
)):
103 if char
== -1 or errors
!= "":
104 # problem chars become U+FFFD REPLACEMENT CHARACTER
105 sys
.stdout
.write("\xEF\xBF\xBD")
108 sys
.stdout
.write(chr(i
))
110 def process_ucs(x
, bytes
=[], errors
=""):
124 assert x
< 0x80000000L
126 realbytes
= tmp
[1] + 1
127 utf8
= [tmp
[0] + (x
>> (6*tmp
[1]))]
128 for i
in range(tmp
[1]-1, -1, -1):
129 utf8
.append(0x80 + (0x3F & (x
>> (i
*6))))
131 if bytes
!= [] and len(bytes
) > realbytes
:
132 errors
= errors
+ " (overlong form of"
134 errors
= errors
+ " %02X" % i
135 errors
= errors
+ ")"
137 if x
>= 0xD800 and x
<= 0xDFFF:
138 errors
= errors
+ " (surrogate)"
139 if x
>= 0xFFFE and x
<= 0xFFFF:
140 errors
= errors
+ " (invalid char)"
142 output(x
, utf8
, errors
)
144 def process_utf8(next
):
150 process_ucs(i
) # single-byte char
152 elif i
== 0xfe or i
== 0xff:
153 output(-1, char
, " (invalid UTF-8 byte)")
155 elif i
>= 0x80 and i
<= 0xbf:
156 output(-1, char
, " (unexpected continuation byte)")
159 if i
>= 0xC0 and i
<= 0xDF:
162 elif i
>= 0xE0 and i
<= 0xEF:
165 elif i
>= 0xF0 and i
<= 0xF7:
168 elif i
>= 0xF8 and i
<= 0xFB:
171 elif i
>= 0xFC and i
<= 0xFD:
177 if c
== None or c
< 0x80 or c
> 0xBF:
181 acc
= (acc
<< 6) + (c
& 0x3F)
186 output(-1, char
, " (incomplete sequence)")
188 process_ucs(acc
, char
)
191 # Class to turn a list into a callable object that returns one
194 def __init__(self
, list):
198 if self
.index
>= len(self
.list):
200 ret
= self
.list[self
.index
]
201 self
.index
= self
.index
+ 1
206 if string
.upper(arg
[0]) == "U":
208 process_utf8(liststepper(list))
210 assert arg
[1] == "+" or arg
[1] == "-"
211 process_ucs(string
.atoi(arg
[2:], 16))
213 list.append(string
.atoi(arg
, 16))
216 process_utf8(liststepper(list))
222 if args
== [] or args
== ["--help"] or args
== ["--help-admin"]:
223 print "Usage: cvt-utf8 [flags] <hex UTF-8 bytes and/or U+codepoints>"
224 print " e.g. cvt-utf8 e2 82 ac"
225 print " or cvt-utf8 U+20ac"
226 print " or cvt-utf8 U-10ffff"
228 print "Flags: -o or --output just output well-formed UTF-8 instead of"
229 print " an analysis of the input data"
230 print " -h or --han also give Han definitions from unihan db"
232 print "Also: cvt-utf8 --test run Markus Kuhn's decoder stress tests" #'
233 print " cvt-utf8 --input (or -i)"
234 print " read, analyse and decode UTF-8 from stdin"
235 if args
== ["--help-admin"]:
236 print " cvt-utf8 --help display user help text"
237 print " cvt-utf8 --help-admin display admin help text (this one)"
238 print " cvt-utf8 --build <infile> <outfile>"
239 print " convert UnicodeData.txt to unicode db"
240 print " cvt-utf8 --build-unihan <infile> <outfile>"
241 print " convert Unihan.txt to unihan db"
242 print " cvt-utf8 --fetch-build <outfile>"
244 "build unicode db by download from unicode.org"
245 print " cvt-utf8 --fetch-build-unihan <outfile>"
247 "build Unihan db by download from unicode.org"
249 print " cvt-utf8 --help display this help text"
250 print " cvt-utf8 --help-admin display admin help text"
253 if args
[0] == "-o" or args
[0] == "--output":
257 if args
[0] == "-h" or args
[0] == "--han":
261 if args
[0] == "--build" or args
[0] == "--fetch-build":
262 if args
[0] == "--build":
264 print "cvt-utf8: --build expects two filename arguments"
266 infile
= open(args
[1], "r")
270 print "cvt-utf8: --fetch-build expects one filename argument"
273 infile
= urllib
.urlopen("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
275 # Now build the database.
276 if outfile
[-3:] == ".db":
277 print "cvt-utf8: warning: you should not append .db to db name"
279 db
= anydbm
.open(outfile
, "n")
281 s
= infile
.readline()
283 ss
= string
.split(s
, ";")[0]
288 if args
[0] == "--build-unihan" or args
[0] == "--fetch-build-unihan":
289 if args
[0] == "--build-unihan":
291 print "cvt-utf8: --build expects two filename arguments"
293 infile
= open(args
[1], "r")
295 # Unihan.txt starts with a hash. If this file starts with a
296 # P, we assume it's a zip file ("PK").
298 infile
= zip_untangler(infile
, s
)
303 print "cvt-utf8: --fetch-build-unihan expects one filename argument"
306 infile
= urllib
.urlopen("ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip")
307 # We know this one is zipped.
308 infile
= zip_untangler(infile
, "")
311 # Now build the database.
312 if outfile
[-3:] == ".db":
313 print "cvt-utf8: warning: you should not append .db to db name"
315 db
= anydbm
.open(outfile
, "n")
317 s
= s
+ infile
.readline()
319 while s
[-1:] == "\r" or s
[-1:] == "\n":
321 sa
= string
.split(s
, "\t")
322 if len(sa
) == 3 and sa
[1] == "kDefinition" and sa
[0][:2] == "U+":
323 db
[sa
[0][2:]] = sa
[2]
329 locations
.append("/usr/share/unicode/unicode")
330 locations
.append("/usr/lib/unicode/unicode")
331 locations
.append("/usr/local/share/unicode/unicode")
332 locations
.append("/usr/local/lib/unicode/unicode")
333 locations
.append(os
.environ
["HOME"] + "/share/unicode/unicode")
334 locations
.append(os
.environ
["HOME"] + "/lib/unicode/unicode")
336 for loc
in locations
:
338 db
= anydbm
.open(loc
, "r")
346 i
= string
.rfind(loc
, "/")
348 hanloc
= loc
[:i
+1] + "unihan"
349 handb
= anydbm
.open(hanloc
, "r")
350 # this has been explicitly required, so we don't squelch exceptions
352 if args
[0] == "--test":
353 do(["CE","BA","E1","BD","B9","CF","83","CE","BC","CE","B5"])
357 do(["F0","90","80","80"])
358 do(["F8","88","80","80","80"])
359 do(["FC","84","80","80","80","80"])
363 do(["F7","BF","BF","BF"])
364 do(["FB","BF","BF","BF","BF"])
365 do(["FD","BF","BF","BF","BF","BF"])
369 do(["F4","8F","BF","BF"])
370 do(["F4","90","80","80"])
375 do(["80","BF","80","BF"])
376 do(["80","BF","80","BF","80"])
377 do(["80","BF","80","BF","80","BF"])
378 do(["80","BF","80","BF","80","BF","80"])
379 do(["80","81","82","83","84","85","86","87",
380 "88","89","8A","8B","8C","8D","8E","8F",
381 "90","91","92","93","94","95","96","97",
382 "98","99","9A","9B","9C","9D","9E","9F",
383 "A0","A1","A2","A3","A4","A5","A6","A7",
384 "A8","A9","AA","AB","AC","AD","AE","AF",
385 "B0","B1","B2","B3","B4","B5","B6","B7",
386 "B8","B9","BA","BB","BC","BD","BE","BF"])
387 do(["C0","20","C1","20","C2","20","C3","20",
388 "C4","20","C5","20","C6","20","C7","20",
389 "C8","20","C9","20","CA","20","CB","20",
390 "CC","20","CD","20","CE","20","CF","20",
391 "D0","20","D1","20","D2","20","D3","20",
392 "D4","20","D5","20","D6","20","D7","20",
393 "D8","20","D9","20","DA","20","DB","20",
394 "DC","20","DD","20","DE","20","DF","20"])
395 do(["E0","20","E1","20","E2","20","E3","20",
396 "E4","20","E5","20","E6","20","E7","20",
397 "E8","20","E9","20","EA","20","EB","20",
398 "EC","20","ED","20","EE","20","EF","20"])
399 do(["F0","20","F1","20","F2","20","F3","20",
400 "F4","20","F5","20","F6","20","F7","20"])
401 do(["F8","20","F9","20","FA","20","FB","20"])
402 do(["FC","20","FD","20"])
406 do(["F8","80","80","80"])
407 do(["FC","80","80","80","80"])
411 do(["FB","BF","BF","BF"])
412 do(["FD","BF","BF","BF","BF"])
413 do(["C0","E0","80","F0","80","80","F8","80",
414 "80","80","FC","80","80","80","80",
415 "DF","EF","BF","F7","BF","BF","FB",
416 "BF","BF","BF","FD","BF","BF","BF","BF"])
419 do(["FE","FE","FF","FF"])
422 do(["F0","80","80","AF"])
423 do(["F8","80","80","80","AF"])
424 do(["FC","80","80","80","80","AF"])
427 do(["F0","8F","BF","BF"])
428 do(["F8","87","BF","BF","BF"])
429 do(["FC","83","BF","BF","BF","BF"])
432 do(["F0","80","80","80"])
433 do(["F8","80","80","80","80"])
434 do(["FC","80","80","80","80","80"])
442 do(["ED","A0","80","ED","B0","80"])
443 do(["ED","A0","80","ED","BF","BF"])
444 do(["ED","AD","BF","ED","B0","80"])
445 do(["ED","AD","BF","ED","BF","BF"])
446 do(["ED","AE","80","ED","B0","80"])
447 do(["ED","AE","80","ED","BF","BF"])
448 do(["ED","AF","BF","ED","B0","80"])
449 do(["ED","AF","BF","ED","BF","8F"])
452 elif args
[0] == "--input" or args
[0] == "-i":
454 s
= sys
.stdin
.read(1)
457 return ord(s
) & 0xFF # ensure it isn't negative
458 process_utf8(getchar
)