10 def __init__(self
, file, datasofar
):
12 assert len(datasofar
) < 30
13 self
.header
= datasofar
16 self
.decompress
= zlib
.decompressobj()
17 # Zlib header bytes, expected by decompress obj but not
19 self
.decompress
.decompress("\x78\x9c")
22 if self
.dataleft
== None:
23 while len(self
.header
) < 30:
24 s
= self
.file.read(30 - len(self
.header
))
26 self
.header
= self
.header
+ s
27 # Name length and extra length.
28 namelen
= 256 * ord(self
.header
[27]) + ord(self
.header
[26])
29 extralen
= 256 * ord(self
.header
[29]) + ord(self
.header
[28])
30 while len(self
.header
) < 30 + namelen
+ extralen
:
31 s
= self
.file.read(30 + namelen
+ extralen
- len(self
.header
))
33 self
.header
= self
.header
+ s
35 256 * (256 * (256 * ord(self
.header
[21]) + ord(self
.header
[20])) \
36 + ord(self
.header
[19])) + ord(self
.header
[18])
37 k
= string
.find(self
.data
, "\n")
40 if rlen
> 4096: rlen
= 4096
42 d
= self
.file.read(rlen
)
44 self
.dataleft
= self
.dataleft
- rlen
45 self
.data
= self
.data
+ self
.decompress
.decompress(d
)
46 k
= string
.find(self
.data
, "\n")
53 self
.data
= self
.data
[k
+1:]
58 if s
[-1:] == "L" or s
[-1:] == "l":
60 if s
[:2] == "0x" or s
[:2] == "0X":
67 while len(key
) < 4: key
= "0" + key
68 key
= string
.upper(key
)
72 return "<han> " + value
77 return string
.split(value
, ";")[1]
79 return "<no name available>"
83 def output(char
, bytes
, errors
):
91 for i
in range(6-len(bytes
)):
103 if char
== -1 or errors
!= "":
104 # problem chars become U+FFFD REPLACEMENT CHARACTER
105 sys
.stdout
.write("\xEF\xBF\xBD")
108 sys
.stdout
.write(chr(i
))
110 def process_ucs(x
, bytes
=[], errors
=""):
124 assert x
< 0x80000000L
126 realbytes
= tmp
[1] + 1
127 utf8
= [tmp
[0] + (x
>> (6*tmp
[1]))]
128 for i
in range(tmp
[1]-1, -1, -1):
129 utf8
.append(0x80 + (0x3F & (x
>> (i
*6))))
131 if bytes
!= [] and len(bytes
) > realbytes
:
132 errors
= errors
+ " (overlong form of"
134 errors
= errors
+ " %02X" % i
135 errors
= errors
+ ")"
137 if x
>= 0xD800 and x
<= 0xDFFF:
138 errors
= errors
+ " (surrogate)"
139 if x
>= 0xFFFE and x
<= 0xFFFF:
140 errors
= errors
+ " (invalid char)"
142 output(x
, utf8
, errors
)
144 def process_utf8(next
):
150 process_ucs(i
) # single-byte char
152 elif i
== 0xfe or i
== 0xff:
153 output(-1, char
, " (invalid UTF-8 byte)")
155 elif i
>= 0x80 and i
<= 0xbf:
156 output(-1, char
, " (unexpected continuation byte)")
159 if i
>= 0xC0 and i
<= 0xDF:
162 elif i
>= 0xE0 and i
<= 0xEF:
165 elif i
>= 0xF0 and i
<= 0xF7:
168 elif i
>= 0xF8 and i
<= 0xFB:
171 elif i
>= 0xFC and i
<= 0xFD:
177 if c
== None or c
< 0x80 or c
> 0xBF:
181 acc
= (acc
<< 6) + (c
& 0x3F)
184 output(-1, char
, " (incomplete sequence)")
186 process_ucs(acc
, char
)
191 # Class to turn a list into a callable object that returns one
194 def __init__(self
, list):
198 if self
.index
>= len(self
.list):
200 ret
= self
.list[self
.index
]
201 self
.index
= self
.index
+ 1
207 if string
.upper(arg
[0]) == "U":
208 assert arg
[1] == "+" or arg
[1] == "-"
209 got
= ('ucs', string
.atoi(arg
[2:], 16))
210 elif arg
[:2] == "&#":
211 # SGML character entity. Either &# followed by a
212 # number, or &#x followed by a hex number.
214 if s
[-1:] == ";": s
= s
[:-1]
215 if string
.upper(s
[:3]) == "&#X":
216 got
= ('ucs', string
.atoi(s
[3:], 16))
218 got
= ('ucs', string
.atoi(s
[2:], 10))
220 got
= ('utf8', string
.atoi(arg
, 16))
224 elif got
[0] == 'ucs':
226 process_utf8(liststepper(list))
231 process_utf8(liststepper(list))
234 print "usage: cvt-utf8 [flags] <hex UTF-8 bytes, U+codepoints, SGML entities>"
235 print " e.g. cvt-utf8 e2 82 ac"
236 print " or cvt-utf8 U+20ac"
237 print " or cvt-utf8 U-10ffff"
238 print " or cvt-utf8 '–'"
240 print "where: -o or --output just output well-formed UTF-8 instead of"
241 print " an analysis of the input data"
242 print " -h or --han also give Han definitions from unihan db"
244 print " also: cvt-utf8 --test run Markus Kuhn's decoder stress tests" #'
245 print " cvt-utf8 --input (or -i)"
246 print " read, analyse and decode UTF-8 from stdin"
247 if arg
== "--help-admin":
248 print " cvt-utf8 --help display user help text"
249 print " cvt-utf8 --help-admin display admin help text (this one)"
250 print " cvt-utf8 --build <infile> <outfile>"
251 print " convert UnicodeData.txt to unicode db"
252 print " cvt-utf8 --build-unihan <infile> <outfile>"
253 print " convert Unihan.txt to unihan db"
254 print " cvt-utf8 --fetch-build <outfile>"
256 "build unicode db by download from unicode.org"
257 print " cvt-utf8 --fetch-build-unihan <outfile>"
259 "build Unihan db by download from unicode.org"
261 print " cvt-utf8 --help display this help text"
262 print " cvt-utf8 --help-admin display admin help text"
263 print " cvt-utf8 --version report version number"
264 print " cvt-utf8 --licence display (MIT) licence text"
267 print "cvt-utf8 is copyright 2002-2004 Simon Tatham."
269 print "Permission is hereby granted, free of charge, to any person"
270 print "obtaining a copy of this software and associated documentation files"
271 print "(the \"Software\"), to deal in the Software without restriction,"
272 print "including without limitation the rights to use, copy, modify, merge,"
273 print "publish, distribute, sublicense, and/or sell copies of the Software,"
274 print "and to permit persons to whom the Software is furnished to do so,"
275 print "subject to the following conditions:"
277 print "The above copyright notice and this permission notice shall be"
278 print "included in all copies or substantial portions of the Software."
280 print "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND,"
281 print "EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF"
282 print "MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND"
283 print "NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS"
284 print "BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN"
285 print "ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN"
286 print "CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE"
291 rev
= string
.replace(rev
, " ", "")
292 rev
= string
.replace(rev
, "$", "")
293 revs
= string
.split(rev
, ":")
295 print "cvt-utf8 revision %s" % revs
[1]
297 print "cvt-utf8: unknown version"
308 while len(args
) > 0 and args
[0][:1] == "-":
309 if args
[0] == "--help" or args
[0] == "--help-admin":
313 elif args
[0] == "--licence" or args
[0] == "--license":
317 elif args
[0] == "--version":
321 elif args
[0] == "-o" or args
[0] == "--output":
325 elif args
[0] == "-h" or args
[0] == "--han":
329 elif args
[0] == "--build" or args
[0] == "--fetch-build":
330 if args
[0] == "--build":
332 print "cvt-utf8: --build expects two filename arguments"
334 infile
= open(args
[1], "r")
338 print "cvt-utf8: --fetch-build expects one filename argument"
341 infile
= urllib
.urlopen("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
343 # Now build the database.
344 if outfile
[-3:] == ".db":
345 print "cvt-utf8: warning: you should not append .db to db name"
347 db
= anydbm
.open(outfile
, "n")
349 s
= infile
.readline()
351 ss
= string
.split(s
, ";")[0]
356 elif args
[0] == "--build-unihan" or args
[0] == "--fetch-build-unihan":
357 if args
[0] == "--build-unihan":
359 print "cvt-utf8: --build expects two filename arguments"
361 infile
= open(args
[1], "r")
363 # Unihan.txt starts with a hash. If this file starts with a
364 # P, we assume it's a zip file ("PK").
366 infile
= zip_untangler(infile
, s
)
371 print "cvt-utf8: --fetch-build-unihan expects one filename argument"
374 infile
= urllib
.urlopen("ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip")
375 # We know this one is zipped.
376 infile
= zip_untangler(infile
, "")
379 # Now build the database.
380 if outfile
[-3:] == ".db":
381 print "cvt-utf8: warning: you should not append .db to db name"
383 db
= anydbm
.open(outfile
, "n")
385 s
= s
+ infile
.readline()
387 while s
[-1:] == "\r" or s
[-1:] == "\n":
389 sa
= string
.split(s
, "\t")
390 if len(sa
) == 3 and sa
[1] == "kDefinition" and sa
[0][:2] == "U+":
391 db
[sa
[0][2:]] = sa
[2]
396 elif args
[0] == "--test":
400 elif args
[0] == "--input" or args
[0] == "-i":
405 sys
.stderr
.write("cvt-utf8: unknown argument '%s'" % args
[0])
409 locations
.append("/usr/share/unicode/unicode")
410 locations
.append("/usr/lib/unicode/unicode")
411 locations
.append("/usr/local/share/unicode/unicode")
412 locations
.append("/usr/local/lib/unicode/unicode")
413 locations
.append(os
.environ
["HOME"] + "/share/unicode/unicode")
414 locations
.append(os
.environ
["HOME"] + "/lib/unicode/unicode")
416 for loc
in locations
:
418 db
= anydbm
.open(loc
, "r")
426 i
= string
.rfind(loc
, "/")
428 hanloc
= loc
[:i
+1] + "unihan"
429 handb
= anydbm
.open(hanloc
, "r")
430 # this has been explicitly required, so we don't squelch exceptions
433 do(["CE","BA","E1","BD","B9","CF","83","CE","BC","CE","B5"])
437 do(["F0","90","80","80"])
438 do(["F8","88","80","80","80"])
439 do(["FC","84","80","80","80","80"])
443 do(["F7","BF","BF","BF"])
444 do(["FB","BF","BF","BF","BF"])
445 do(["FD","BF","BF","BF","BF","BF"])
449 do(["F4","8F","BF","BF"])
450 do(["F4","90","80","80"])
455 do(["80","BF","80","BF"])
456 do(["80","BF","80","BF","80"])
457 do(["80","BF","80","BF","80","BF"])
458 do(["80","BF","80","BF","80","BF","80"])
459 do(["80","81","82","83","84","85","86","87",
460 "88","89","8A","8B","8C","8D","8E","8F",
461 "90","91","92","93","94","95","96","97",
462 "98","99","9A","9B","9C","9D","9E","9F",
463 "A0","A1","A2","A3","A4","A5","A6","A7",
464 "A8","A9","AA","AB","AC","AD","AE","AF",
465 "B0","B1","B2","B3","B4","B5","B6","B7",
466 "B8","B9","BA","BB","BC","BD","BE","BF"])
467 do(["C0","20","C1","20","C2","20","C3","20",
468 "C4","20","C5","20","C6","20","C7","20",
469 "C8","20","C9","20","CA","20","CB","20",
470 "CC","20","CD","20","CE","20","CF","20",
471 "D0","20","D1","20","D2","20","D3","20",
472 "D4","20","D5","20","D6","20","D7","20",
473 "D8","20","D9","20","DA","20","DB","20",
474 "DC","20","DD","20","DE","20","DF","20"])
475 do(["E0","20","E1","20","E2","20","E3","20",
476 "E4","20","E5","20","E6","20","E7","20",
477 "E8","20","E9","20","EA","20","EB","20",
478 "EC","20","ED","20","EE","20","EF","20"])
479 do(["F0","20","F1","20","F2","20","F3","20",
480 "F4","20","F5","20","F6","20","F7","20"])
481 do(["F8","20","F9","20","FA","20","FB","20"])
482 do(["FC","20","FD","20"])
486 do(["F8","80","80","80"])
487 do(["FC","80","80","80","80"])
491 do(["FB","BF","BF","BF"])
492 do(["FD","BF","BF","BF","BF"])
493 do(["C0","E0","80","F0","80","80","F8","80",
494 "80","80","FC","80","80","80","80",
495 "DF","EF","BF","F7","BF","BF","FB",
496 "BF","BF","BF","FD","BF","BF","BF","BF"])
499 do(["FE","FE","FF","FF"])
502 do(["F0","80","80","AF"])
503 do(["F8","80","80","80","AF"])
504 do(["FC","80","80","80","80","AF"])
507 do(["F0","8F","BF","BF"])
508 do(["F8","87","BF","BF","BF"])
509 do(["FC","83","BF","BF","BF","BF"])
512 do(["F0","80","80","80"])
513 do(["F8","80","80","80","80"])
514 do(["FC","80","80","80","80","80"])
522 do(["ED","A0","80","ED","B0","80"])
523 do(["ED","A0","80","ED","BF","BF"])
524 do(["ED","AD","BF","ED","B0","80"])
525 do(["ED","AD","BF","ED","BF","BF"])
526 do(["ED","AE","80","ED","B0","80"])
527 do(["ED","AE","80","ED","BF","BF"])
528 do(["ED","AF","BF","ED","B0","80"])
529 do(["ED","AF","BF","ED","BF","8F"])
532 elif mode
== "input":
534 s
= sys
.stdin
.read(1)
537 return ord(s
) & 0xFF # ensure it isn't negative
538 process_utf8(getchar
)