10 def __init__(self
, file, datasofar
):
12 assert len(datasofar
) < 30
13 self
.header
= datasofar
16 self
.decompress
= zlib
.decompressobj()
17 # Zlib header bytes, expected by decompress obj but not
19 self
.decompress
.decompress("\x78\x9c")
22 if self
.dataleft
== None:
23 while len(self
.header
) < 30:
24 s
= self
.file.read(30 - len(self
.header
))
26 self
.header
= self
.header
+ s
27 # Name length and extra length.
28 namelen
= 256 * ord(self
.header
[27]) + ord(self
.header
[26])
29 extralen
= 256 * ord(self
.header
[29]) + ord(self
.header
[28])
30 while len(self
.header
) < 30 + namelen
+ extralen
:
31 s
= self
.file.read(30 + namelen
+ extralen
- len(self
.header
))
33 self
.header
= self
.header
+ s
35 256 * (256 * (256 * ord(self
.header
[21]) + ord(self
.header
[20])) \
36 + ord(self
.header
[19])) + ord(self
.header
[18])
37 k
= string
.find(self
.data
, "\n")
40 if rlen
> 4096: rlen
= 4096
42 d
= self
.file.read(rlen
)
44 self
.dataleft
= self
.dataleft
- rlen
45 self
.data
= self
.data
+ self
.decompress
.decompress(d
)
46 k
= string
.find(self
.data
, "\n")
53 self
.data
= self
.data
[k
+1:]
58 if s
[-1:] == "L" or s
[-1:] == "l":
60 if s
[:2] == "0x" or s
[:2] == "0X":
67 while len(key
) < 4: key
= "0" + key
68 key
= string
.upper(key
)
72 return "<han> " + value
77 return string
.split(value
, ";")[1]
79 return "<no name available>"
83 def output(char
, bytes
, errors
):
91 for i
in range(6-len(bytes
)):
103 if char
== -1 or errors
!= "":
104 # problem chars become U+FFFD REPLACEMENT CHARACTER
105 sys
.stdout
.write("\xEF\xBF\xBD")
108 sys
.stdout
.write(chr(i
))
110 def process_ucs(x
, bytes
=[], errors
=""):
124 assert x
< 0x80000000L
126 realbytes
= tmp
[1] + 1
127 utf8
= [tmp
[0] + (x
>> (6*tmp
[1]))]
128 for i
in range(tmp
[1]-1, -1, -1):
129 utf8
.append(0x80 + (0x3F & (x
>> (i
*6))))
131 if bytes
!= [] and len(bytes
) > realbytes
:
132 errors
= errors
+ " (overlong form of"
134 errors
= errors
+ " %02X" % i
135 errors
= errors
+ ")"
137 if x
>= 0xD800 and x
<= 0xDFFF:
138 errors
= errors
+ " (surrogate)"
139 if x
>= 0xFFFE and x
<= 0xFFFF:
140 errors
= errors
+ " (invalid char)"
142 output(x
, utf8
, errors
)
144 def process_utf8(next
):
150 process_ucs(i
) # single-byte char
152 elif i
== 0xfe or i
== 0xff:
153 output(-1, char
, " (invalid UTF-8 byte)")
155 elif i
>= 0x80 and i
<= 0xbf:
156 output(-1, char
, " (unexpected continuation byte)")
159 if i
>= 0xC0 and i
<= 0xDF:
162 elif i
>= 0xE0 and i
<= 0xEF:
165 elif i
>= 0xF0 and i
<= 0xF7:
168 elif i
>= 0xF8 and i
<= 0xFB:
171 elif i
>= 0xFC and i
<= 0xFD:
177 if c
== None or c
< 0x80 or c
> 0xBF:
181 acc
= (acc
<< 6) + (c
& 0x3F)
186 output(-1, char
, " (incomplete sequence)")
188 process_ucs(acc
, char
)
191 # Class to turn a list into a callable object that returns one
194 def __init__(self
, list):
198 if self
.index
>= len(self
.list):
200 ret
= self
.list[self
.index
]
201 self
.index
= self
.index
+ 1
206 if string
.upper(arg
[0]) == "U":
208 process_utf8(liststepper(list))
210 assert arg
[1] == "+" or arg
[1] == "-"
211 process_ucs(string
.atoi(arg
[2:], 16))
213 list.append(string
.atoi(arg
, 16))
216 process_utf8(liststepper(list))
219 print "Usage: cvt-utf8 [flags] <hex UTF-8 bytes and/or U+codepoints>"
220 print " e.g. cvt-utf8 e2 82 ac"
221 print " or cvt-utf8 U+20ac"
222 print " or cvt-utf8 U-10ffff"
224 print "Flags: -o or --output just output well-formed UTF-8 instead of"
225 print " an analysis of the input data"
226 print " -h or --han also give Han definitions from unihan db"
228 print "Also: cvt-utf8 --test run Markus Kuhn's decoder stress tests" #'
229 print " cvt-utf8 --input (or -i)"
230 print " read, analyse and decode UTF-8 from stdin"
231 if arg
== "--help-admin":
232 print " cvt-utf8 --help display user help text"
233 print " cvt-utf8 --help-admin display admin help text (this one)"
234 print " cvt-utf8 --build <infile> <outfile>"
235 print " convert UnicodeData.txt to unicode db"
236 print " cvt-utf8 --build-unihan <infile> <outfile>"
237 print " convert Unihan.txt to unihan db"
238 print " cvt-utf8 --fetch-build <outfile>"
240 "build unicode db by download from unicode.org"
241 print " cvt-utf8 --fetch-build-unihan <outfile>"
243 "build Unihan db by download from unicode.org"
245 print " cvt-utf8 --help display this help text"
246 print " cvt-utf8 --help-admin display admin help text"
247 print " cvt-utf8 --version report version number"
248 print " cvt-utf8 --licence display (MIT) licence text"
251 print "cvt-utf8 is copyright 2002-2004 Simon Tatham."
253 print "Permission is hereby granted, free of charge, to any person"
254 print "obtaining a copy of this software and associated documentation files"
255 print "(the \"Software\"), to deal in the Software without restriction,"
256 print "including without limitation the rights to use, copy, modify, merge,"
257 print "publish, distribute, sublicense, and/or sell copies of the Software,"
258 print "and to permit persons to whom the Software is furnished to do so,"
259 print "subject to the following conditions:"
261 print "The above copyright notice and this permission notice shall be"
262 print "included in all copies or substantial portions of the Software."
264 print "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND,"
265 print "EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF"
266 print "MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND"
267 print "NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS"
268 print "BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN"
269 print "ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN"
270 print "CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE"
275 rev
= string
.replace(rev
, " ", "")
276 rev
= string
.replace(rev
, "$", "")
277 revs
= string
.split(rev
, ":")
279 print "cvt-utf8 revision %s" % revs
[1]
281 print "cvt-utf8: unknown version"
292 while len(args
) > 0 and args
[0][:1] == "-":
293 if args
[0] == "--help" or args
[0] == "--help-admin":
297 elif args
[0] == "--licence" or args
[0] == "--license":
301 elif args
[0] == "--version":
305 elif args
[0] == "-o" or args
[0] == "--output":
309 elif args
[0] == "-h" or args
[0] == "--han":
313 elif args
[0] == "--build" or args
[0] == "--fetch-build":
314 if args
[0] == "--build":
316 print "cvt-utf8: --build expects two filename arguments"
318 infile
= open(args
[1], "r")
322 print "cvt-utf8: --fetch-build expects one filename argument"
325 infile
= urllib
.urlopen("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
327 # Now build the database.
328 if outfile
[-3:] == ".db":
329 print "cvt-utf8: warning: you should not append .db to db name"
331 db
= anydbm
.open(outfile
, "n")
333 s
= infile
.readline()
335 ss
= string
.split(s
, ";")[0]
340 elif args
[0] == "--build-unihan" or args
[0] == "--fetch-build-unihan":
341 if args
[0] == "--build-unihan":
343 print "cvt-utf8: --build expects two filename arguments"
345 infile
= open(args
[1], "r")
347 # Unihan.txt starts with a hash. If this file starts with a
348 # P, we assume it's a zip file ("PK").
350 infile
= zip_untangler(infile
, s
)
355 print "cvt-utf8: --fetch-build-unihan expects one filename argument"
358 infile
= urllib
.urlopen("ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip")
359 # We know this one is zipped.
360 infile
= zip_untangler(infile
, "")
363 # Now build the database.
364 if outfile
[-3:] == ".db":
365 print "cvt-utf8: warning: you should not append .db to db name"
367 db
= anydbm
.open(outfile
, "n")
369 s
= s
+ infile
.readline()
371 while s
[-1:] == "\r" or s
[-1:] == "\n":
373 sa
= string
.split(s
, "\t")
374 if len(sa
) == 3 and sa
[1] == "kDefinition" and sa
[0][:2] == "U+":
375 db
[sa
[0][2:]] = sa
[2]
380 elif args
[0] == "--test":
384 elif args
[0] == "--input" or args
[0] == "-i":
389 sys
.stderr
.write("cvt-utf8: unknown argument '%s'" % args
[0])
393 locations
.append("/usr/share/unicode/unicode")
394 locations
.append("/usr/lib/unicode/unicode")
395 locations
.append("/usr/local/share/unicode/unicode")
396 locations
.append("/usr/local/lib/unicode/unicode")
397 locations
.append(os
.environ
["HOME"] + "/share/unicode/unicode")
398 locations
.append(os
.environ
["HOME"] + "/lib/unicode/unicode")
400 for loc
in locations
:
402 db
= anydbm
.open(loc
, "r")
410 i
= string
.rfind(loc
, "/")
412 hanloc
= loc
[:i
+1] + "unihan"
413 handb
= anydbm
.open(hanloc
, "r")
414 # this has been explicitly required, so we don't squelch exceptions
417 do(["CE","BA","E1","BD","B9","CF","83","CE","BC","CE","B5"])
421 do(["F0","90","80","80"])
422 do(["F8","88","80","80","80"])
423 do(["FC","84","80","80","80","80"])
427 do(["F7","BF","BF","BF"])
428 do(["FB","BF","BF","BF","BF"])
429 do(["FD","BF","BF","BF","BF","BF"])
433 do(["F4","8F","BF","BF"])
434 do(["F4","90","80","80"])
439 do(["80","BF","80","BF"])
440 do(["80","BF","80","BF","80"])
441 do(["80","BF","80","BF","80","BF"])
442 do(["80","BF","80","BF","80","BF","80"])
443 do(["80","81","82","83","84","85","86","87",
444 "88","89","8A","8B","8C","8D","8E","8F",
445 "90","91","92","93","94","95","96","97",
446 "98","99","9A","9B","9C","9D","9E","9F",
447 "A0","A1","A2","A3","A4","A5","A6","A7",
448 "A8","A9","AA","AB","AC","AD","AE","AF",
449 "B0","B1","B2","B3","B4","B5","B6","B7",
450 "B8","B9","BA","BB","BC","BD","BE","BF"])
451 do(["C0","20","C1","20","C2","20","C3","20",
452 "C4","20","C5","20","C6","20","C7","20",
453 "C8","20","C9","20","CA","20","CB","20",
454 "CC","20","CD","20","CE","20","CF","20",
455 "D0","20","D1","20","D2","20","D3","20",
456 "D4","20","D5","20","D6","20","D7","20",
457 "D8","20","D9","20","DA","20","DB","20",
458 "DC","20","DD","20","DE","20","DF","20"])
459 do(["E0","20","E1","20","E2","20","E3","20",
460 "E4","20","E5","20","E6","20","E7","20",
461 "E8","20","E9","20","EA","20","EB","20",
462 "EC","20","ED","20","EE","20","EF","20"])
463 do(["F0","20","F1","20","F2","20","F3","20",
464 "F4","20","F5","20","F6","20","F7","20"])
465 do(["F8","20","F9","20","FA","20","FB","20"])
466 do(["FC","20","FD","20"])
470 do(["F8","80","80","80"])
471 do(["FC","80","80","80","80"])
475 do(["FB","BF","BF","BF"])
476 do(["FD","BF","BF","BF","BF"])
477 do(["C0","E0","80","F0","80","80","F8","80",
478 "80","80","FC","80","80","80","80",
479 "DF","EF","BF","F7","BF","BF","FB",
480 "BF","BF","BF","FD","BF","BF","BF","BF"])
483 do(["FE","FE","FF","FF"])
486 do(["F0","80","80","AF"])
487 do(["F8","80","80","80","AF"])
488 do(["FC","80","80","80","80","AF"])
491 do(["F0","8F","BF","BF"])
492 do(["F8","87","BF","BF","BF"])
493 do(["FC","83","BF","BF","BF","BF"])
496 do(["F0","80","80","80"])
497 do(["F8","80","80","80","80"])
498 do(["FC","80","80","80","80","80"])
506 do(["ED","A0","80","ED","B0","80"])
507 do(["ED","A0","80","ED","BF","BF"])
508 do(["ED","AD","BF","ED","B0","80"])
509 do(["ED","AD","BF","ED","BF","BF"])
510 do(["ED","AE","80","ED","B0","80"])
511 do(["ED","AE","80","ED","BF","BF"])
512 do(["ED","AF","BF","ED","B0","80"])
513 do(["ED","AF","BF","ED","BF","8F"])
516 elif mode
== "input":
518 s
= sys
.stdin
.read(1)
521 return ord(s
) & 0xFF # ensure it isn't negative
522 process_utf8(getchar
)