7a854c000fa9f8ae278049f1fdbda4d83defe6b4
[sgt/utils] / cvt-utf8 / cvt-utf8
1 #!/usr/bin/env python
2
3 import sys
4 import string
5 import os
6 import anydbm
7 import zlib
8
class zip_untangler:
    """Line-oriented reader for the first member of a zip stream.

    Wraps a file-like object positioned at (or just after the first few
    bytes of) a zip local file header and provides ``readline()``, so the
    compressed member can be consumed like an ordinary file.

    Only the fixed 30-byte local header is interpreted: the compressed
    size (bytes 18-21), the name length (bytes 26-27) and the extra-field
    length (bytes 28-29), all little-endian.  The signature and the
    compression method are not checked; the payload is always fed to a
    raw-deflate decompressor.

    NOTE(review): a header whose compressed-size field is zero yields no
    data at all, because reading stops once that many bytes are consumed.
    """

    _HEADER_LEN = 30    # fixed-size part of the local file header
    _CHUNK = 4096       # most compressed bytes requested per read

    def __init__(self, file, datasofar):
        """Wrap *file*.

        file      -- object with a ``read(n)`` method returning byte strings.
        datasofar -- header bytes the caller has already consumed from
                     *file* (may be empty); must be shorter than the
                     fixed header.

        Raises ValueError if *datasofar* is 30 bytes or longer.
        """
        if len(datasofar) >= self._HEADER_LEN:
            raise ValueError("datasofar must be shorter than the zip header")
        self.file = file
        # Normalise an empty prefix so concatenation with read() data works
        # whichever string type the caller used for "nothing read yet".
        self.header = datasofar or b""
        self.data = b""         # decompressed bytes not yet handed out
        self.dataleft = None    # compressed bytes still to read; None = header unparsed
        # Negative wbits selects a raw deflate stream, which is what a zip
        # member contains (no zlib header or trailer).
        self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)

    def _fill_header(self, wanted):
        """Read from the file until self.header holds *wanted* bytes."""
        while len(self.header) < wanted:
            s = self.file.read(wanted - len(self.header))
            if not s:
                raise IOError("unexpected end of file in zip local header")
            self.header = self.header + s

    def _parse_header(self):
        """Consume the local header and set self.dataleft."""
        self._fill_header(self._HEADER_LEN)
        # bytearray indexing yields integers on both Python 2 and 3.
        hdr = bytearray(self.header)
        namelen = hdr[26] | (hdr[27] << 8)
        extralen = hdr[28] | (hdr[29] << 8)
        # Skip the variable-length file name and extra field.
        self._fill_header(self._HEADER_LEN + namelen + extralen)
        self.dataleft = (hdr[18] | (hdr[19] << 8) |
                         (hdr[20] << 16) | (hdr[21] << 24))

    def readline(self):
        """Return the next decompressed line, including its newline.

        Returns whatever is buffered (possibly an empty string) once the
        compressed data is exhausted or the underlying file hits EOF.
        Raises IOError if the file ends inside the local header.
        """
        if self.dataleft is None:
            self._parse_header()
        k = self.data.find(b"\n")
        while k < 0 and self.dataleft > 0:
            d = self.file.read(min(self.dataleft, self._CHUNK))
            if not d:
                break
            # Count what was actually delivered: read() may return fewer
            # bytes than requested (pipes, sockets, urllib streams).
            self.dataleft = self.dataleft - len(d)
            self.data = self.data + self.decompress.decompress(d)
            k = self.data.find(b"\n")
        if k < 0:
            ret = self.data
            self.data = b""
        else:
            ret = self.data[:k+1]
            self.data = self.data[k+1:]
        return ret
55
def hexstr(x):
    """Return hex(x) without a leading "0x"/"0X" or a trailing "L"/"l".

    The digits keep whatever case hex() produced.  A negative value keeps
    its "-0x" prefix because the text then does not start with "0x".
    """
    digits = hex(x)
    # Old interpreters append "L" to long integers.
    if digits[-1:] in ("L", "l"):
        digits = digits[:-1]
    if digits[:2] in ("0x", "0X"):
        digits = digits[2:]
    return digits
63
def charname(x):
    """Look up a display name for code point x in the module-level databases.

    Returns "" when no Unicode database is open (``db`` is false).
    Otherwise the key is x as upper-case hex, zero-padded to at least four
    digits.  When ``han_translations`` is set and ``handb`` has the key,
    the result is "<han> " followed by the stored definition.  Failing
    that, the second ";"-separated field of the ``db`` record is returned,
    or "<no name available>" if the key is missing.
    """
    if not db:
        return ""

    key = hexstr(x).rjust(4, "0").upper()

    if han_translations:
        try:
            return "<han> " + handb[key]
        except KeyError:
            # No Han definition; fall back to the ordinary name.
            pass

    try:
        record = db[key]
    except KeyError:
        return "<no name available>"
    return record.split(";")[1]
82
def output(char, bytes, errors):
    """Emit one decoded item according to the global ``output_analysis`` flag.

    char   -- the code point, or -1 when the bytes did not decode to one.
    bytes  -- sequence of integer byte values to show or emit.
    errors -- diagnostic text ("" when the item is fine).

    In analysis mode one line is printed: the code point as ``U-%08X``,
    the bytes in hex, the character name from charname() if any, then
    *errors*.  Otherwise the bytes are written to stdout verbatim, or
    U+FFFD (as UTF-8) in their place when the item is undecodable or has
    errors.
    """
    if not output_analysis:
        if char == -1 or errors != "":
            # problem chars become U+FFFD REPLACEMENT CHARACTER
            sys.stdout.write("\xEF\xBF\xBD")
        else:
            for i in bytes:
                sys.stdout.write(chr(i))
        return

    if char == -1:
        # Blank placeholder exactly as wide as a printed code point, so
        # the byte column lines up with that of decodable items.
        s = " " * len("U-%08X " % 0)
    else:
        s = "U-%08X " % char
    s = s + "".join([" %02X" % i for i in bytes])
    # Pad to six byte columns; each absent byte takes the same width as a
    # printed " %02X" so that names start in a fixed column.
    s = s + " " * (len(" %02X" % 0) * (6 - len(bytes)))

    if char == -1:
        name = ""
    else:
        name = charname(char)
    if name != "":
        s = s + " " + name
    s = s + errors
    print(s)
109
def process_ucs(x, bytes=None, errors=""):
    """Analyse code point x and hand the result to output().

    x      -- integer code point; must be below 0x80000000.
    bytes  -- the byte values x was decoded from, or None/empty when x was
              given directly.  If longer than the shortest encoding of x,
              the item is flagged as overlong and these bytes are shown in
              place of the canonical ones.
    errors -- diagnostic text already accumulated for this item.

    The canonical encoding uses the original (up to six byte) UTF-8
    scheme.  Surrogates (D800-DFFF) and FFFE/FFFF are flagged as errors.

    Raises ValueError if x does not fit in 31 bits.
    """
    if x >= 0x80000000:
        raise ValueError("code point U-%X does not fit in 31 bits" % x)

    if x < 0x80:
        utf8 = [x]
    else:
        # (lead-byte marker, number of continuation bytes, exclusive limit)
        for lead, trailing, limit in ((0xC0, 1, 0x800),
                                      (0xE0, 2, 0x10000),
                                      (0xF0, 3, 0x200000),
                                      (0xF8, 4, 0x4000000),
                                      (0xFC, 5, 0x80000000)):
            if x < limit:
                break
        utf8 = [lead + (x >> (6 * trailing))]
        for i in range(trailing - 1, -1, -1):
            utf8.append(0x80 + (0x3F & (x >> (i * 6))))

    if bytes and len(bytes) > len(utf8):
        errors = errors + " (overlong form of"
        for i in utf8:
            errors = errors + " %02X" % i
        errors = errors + ")"
        # Show what was actually supplied, not the canonical form.
        utf8 = bytes
    if x >= 0xD800 and x <= 0xDFFF:
        errors = errors + " (surrogate)"
    if x >= 0xFFFE and x <= 0xFFFF:
        errors = errors + " (invalid char)"

    output(x, utf8, errors)
143
def process_utf8(next):
    """Decode a stream of byte values as UTF-8 and report every item.

    next -- callable taking no arguments that returns the next byte value
            as an integer, or None at end of input.

    Well-formed sequences are passed to process_ucs() together with their
    bytes.  Stray continuation bytes, the bytes FE/FF, values that are not
    bytes at all (negative or above FF) and truncated sequences are
    reported through output() with a code point of -1.
    """
    c = next()
    while c is not None:
        char = [c]
        if c < 0 or c >= 0xFE:
            # FE and FF never occur in UTF-8.  Values outside 0..FF can
            # arrive from command-line arguments; without this check they
            # matched no lead-byte range below and left acc/cbytes unbound.
            output(-1, char, " (invalid UTF-8 byte)")
            c = next()
        elif c < 0x80:
            process_ucs(c) # single-byte char
            c = next()
        elif c <= 0xBF:
            output(-1, char, " (unexpected continuation byte)")
            c = next()
        else:
            # Lead byte: keep its payload bits and note how many
            # continuation bytes must follow.
            if c <= 0xDF:
                acc, cbytes = c & 0x1F, 1
            elif c <= 0xEF:
                acc, cbytes = c & 0x0F, 2
            elif c <= 0xF7:
                acc, cbytes = c & 0x07, 3
            elif c <= 0xFB:
                acc, cbytes = c & 0x03, 4
            else:
                acc, cbytes = c & 0x01, 5
            while cbytes > 0:
                c = next()
                if c is None or c < 0x80 or c > 0xBF:
                    break
                char.append(c)
                acc = (acc << 6) + (c & 0x3F)
                cbytes = cbytes - 1
            if cbytes > 0:
                # c is the item that cut the sequence short; it is left in
                # place so the outer loop examines it in its own right.
                output(-1, char, " (incomplete sequence)")
            else:
                c = next()
                process_ucs(acc, char)
189
def do(args):
    """Process a list of command-line style arguments.

    args -- strings, each either a hex byte value ("e2") or a code point
            written as "U+XXXX" / "U-XXXXXXXX" (the "U" may be lower case).

    Consecutive byte arguments are collected and decoded together by
    process_utf8(); each code point argument first flushes any collected
    bytes and is then passed to process_ucs().

    Raises ValueError for an argument that is neither valid hex nor a
    well-formed U+/U- code point.
    """
    def stepper(items):
        # Turn a list into a callable returning one element per call and
        # None once the list is exhausted.
        iterator = iter(items)
        return lambda: next(iterator, None)

    pending = []
    for arg in args:
        if arg[:1].upper() == "U":
            if pending:
                process_utf8(stepper(pending))
                pending = []
            if arg[1:2] not in ("+", "-"):
                raise ValueError(
                    "expected U+XXXX or U-XXXXXXXX, got '%s'" % arg)
            process_ucs(int(arg[2:], 16))
        else:
            pending.append(int(arg, 16))

    if pending:
        process_utf8(stepper(pending))
217
def usage(arg):
    """Print the help text to stdout.

    arg -- the option that asked for help; "--help-admin" selects the
           longer text that also lists the database-building options.
    """
    common = (
        "usage: cvt-utf8 [flags] <hex UTF-8 bytes and/or U+codepoints>",
        " e.g. cvt-utf8 e2 82 ac",
        " or cvt-utf8 U+20ac",
        " or cvt-utf8 U-10ffff",
        "",
        "where: -o or --output just output well-formed UTF-8 instead of",
        " an analysis of the input data",
        " -h or --han also give Han definitions from unihan db",
        "",
        " also: cvt-utf8 --test run Markus Kuhn's decoder stress tests",
        " cvt-utf8 --input (or -i)",
        " read, analyse and decode UTF-8 from stdin",
    )
    admin = (
        " cvt-utf8 --help display user help text",
        " cvt-utf8 --help-admin display admin help text (this one)",
        " cvt-utf8 --build <infile> <outfile>",
        " convert UnicodeData.txt to unicode db",
        " cvt-utf8 --build-unihan <infile> <outfile>",
        " convert Unihan.txt to unihan db",
        " cvt-utf8 --fetch-build <outfile>",
        " build unicode db by download from unicode.org",
        " cvt-utf8 --fetch-build-unihan <outfile>",
        " build Unihan db by download from unicode.org",
    )
    user = (
        " cvt-utf8 --help display this help text",
        " cvt-utf8 --help-admin display admin help text",
    )
    closing = (
        " cvt-utf8 --version report version number",
        " cvt-utf8 --licence display (MIT) licence text",
    )

    if arg == "--help-admin":
        middle = admin
    else:
        middle = user
    for line in common + middle + closing:
        print(line)
249
def licence():
    """Print the copyright notice and licence text to stdout."""
    text = (
        "cvt-utf8 is copyright 2002-2004 Simon Tatham.",
        "",
        "Permission is hereby granted, free of charge, to any person",
        "obtaining a copy of this software and associated documentation files",
        '(the "Software"), to deal in the Software without restriction,',
        "including without limitation the rights to use, copy, modify, merge,",
        "publish, distribute, sublicense, and/or sell copies of the Software,",
        "and to permit persons to whom the Software is furnished to do so,",
        "subject to the following conditions:",
        "",
        "The above copyright notice and this permission notice shall be",
        "included in all copies or substantial portions of the Software.",
        "",
        'THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,',
        "EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF",
        "MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND",
        "NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS",
        "BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN",
        "ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN",
        "CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE",
        "SOFTWARE.",
    )
    for line in text:
        print(line)
272
def version():
    """Print the revision number taken from the version-control keyword.

    The keyword string is stripped of spaces and "$" and split on ":".
    If that leaves a second field it is reported as the revision;
    otherwise "unknown version" is printed.
    """
    keyword = "$Revision$"
    fields = keyword.replace(" ", "").replace("$", "").split(":")
    if len(fields) <= 1:
        print("cvt-utf8: unknown version")
    else:
        print("cvt-utf8 revision %s" % fields[1])
282
# ----------------------------------------------------------------------
# Command-line option processing.
#
# The settings below are module globals read by output() and charname().
# The database-building options do all their work here and then exit.

args = sys.argv[1:]
output_analysis = 1     # 1: print an analysis line per item; 0: emit UTF-8
han_translations = 0    # 1: also consult the unihan database
mode = "cmdline"        # "cmdline", "test" or "input"

if args == []:
    usage("")
    sys.exit(0)

while len(args) > 0 and args[0][:1] == "-":
    opt = args[0]

    if opt == "--help" or opt == "--help-admin":
        usage(opt)
        sys.exit(0)

    elif opt == "--licence" or opt == "--license":
        licence()
        sys.exit(0)

    elif opt == "--version":
        version()
        sys.exit(0)

    elif opt == "-o" or opt == "--output":
        output_analysis = 0
        args = args[1:]

    elif opt == "-h" or opt == "--han":
        han_translations = 1
        args = args[1:]

    elif opt == "--build" or opt == "--fetch-build":
        if opt == "--build":
            if len(args) != 3:
                print("cvt-utf8: --build expects two filename arguments")
                sys.exit(1)
            infile = open(args[1], "r")
            outfile = args[2]
        else:
            if len(args) != 2:
                print("cvt-utf8: --fetch-build expects one filename argument")
                sys.exit(1)
            import urllib
            infile = urllib.urlopen("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
            outfile = args[1]
        # Now build the database.
        if outfile[-3:] == ".db":
            print("cvt-utf8: warning: you should not append .db to db name")

        db = anydbm.open(outfile, "n")
        while 1:
            s = infile.readline()
            if s == "": break
            # Key each record by its first ";"-separated field; the whole
            # line is stored as the value.
            ss = s.split(";")[0]
            db[ss] = s
        db.close()
        sys.exit(0)

    elif opt == "--build-unihan" or opt == "--fetch-build-unihan":
        if opt == "--build-unihan":
            if len(args) != 3:
                print("cvt-utf8: --build-unihan expects two filename arguments")
                sys.exit(1)
            # Binary mode: the file may turn out to be a zip archive, and
            # line endings are stripped explicitly below in any case.
            infile = open(args[1], "rb")
            s = infile.read(1)
            # Unihan.txt starts with a hash. If this file starts with a
            # P, we assume it's a zip file ("PK").
            if s == "P":
                infile = zip_untangler(infile, s)
                s = ""
            outfile = args[2]
        else:
            if len(args) != 2:
                print("cvt-utf8: --fetch-build-unihan expects one filename argument")
                sys.exit(1)
            import urllib
            infile = urllib.urlopen("ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip")
            # We know this one is zipped.
            infile = zip_untangler(infile, "")
            outfile = args[1]
            s = ""
        # Now build the database.
        if outfile[-3:] == ".db":
            print("cvt-utf8: warning: you should not append .db to db name")

        db = anydbm.open(outfile, "n")
        while 1:
            # On the first pass s still holds the byte consumed while
            # sniffing the file type, so it is prepended to the line.
            s = s + infile.readline()
            if s == "": break
            while s[-1:] == "\r" or s[-1:] == "\n":
                s = s[:-1]
            sa = s.split("\t")
            # Keep only "U+XXXX<TAB>kDefinition<TAB>text" records, keyed
            # by the hex digits after "U+".
            if len(sa) == 3 and sa[1] == "kDefinition" and sa[0][:2] == "U+":
                db[sa[0][2:]] = sa[2]
            s = ""
        db.close()
        sys.exit(0)

    elif opt == "--test":
        mode = "test"
        args = args[1:]

    elif opt == "--input" or opt == "-i":
        mode = "input"
        args = args[1:]

    else:
        sys.stderr.write("cvt-utf8: unknown argument '%s'\n" % opt)
        sys.exit(1)
391
# ----------------------------------------------------------------------
# Locate and open the Unicode name database.
#
# The candidate paths are tried in order and the first that opens wins.
# If none opens, db stays None and charname() returns no names.

# expanduser() falls back to other sources when HOME is not set, where
# os.environ["HOME"] would raise KeyError.
home = os.path.expanduser("~")
locations = [
    "/usr/share/unicode/unicode",
    "/usr/lib/unicode/unicode",
    "/usr/local/share/unicode/unicode",
    "/usr/local/lib/unicode/unicode",
    home + "/share/unicode/unicode",
    home + "/lib/unicode/unicode",
]

db = None
for loc in locations:
    try:
        db = anydbm.open(loc, "r")
    except IOError:
        db = None
    except anydbm.error:
        db = None
    if db is not None:
        break

if han_translations:
    # The unihan database is looked for next to the location tried last:
    # the one that opened, or the final candidate if none did.
    hanloc = os.path.join(os.path.dirname(loc), "unihan")
    handb = anydbm.open(hanloc, "r")
    # this has been explicitly required, so we don't squelch exceptions
415
# ----------------------------------------------------------------------
# Main dispatch on the selected mode.

if mode == "test":
    # Decoder stress tests (see usage(): "Markus Kuhn's decoder stress
    # tests").  Each entry is one call to do(), written as
    # space-separated hex bytes.

    def _hex_range(first, last, suffix=None):
        # Hex strings for first..last inclusive, each optionally followed
        # by a fixed extra byte.
        items = []
        for value in range(first, last + 1):
            items.append("%02X" % value)
            if suffix is not None:
                items.append(suffix)
        return " ".join(items)

    _test_vectors = [
        # A correctly encoded sample text.
        "CE BA E1 BD B9 CF 83 CE BC CE B5",
        # First code point of each sequence length, 1 to 6 bytes.
        "00",
        "C2 80",
        "E0 A0 80",
        "F0 90 80 80",
        "F8 88 80 80 80",
        "FC 84 80 80 80 80",
        # Last code point of each sequence length.
        "7F",
        "DF BF",
        "EF BF BF",
        "F7 BF BF BF",
        "FB BF BF BF BF",
        "FD BF BF BF BF BF",
        # Other boundary code points.
        "ED 9F BF",
        "EE 80 80",
        "EF BF BD",
        "F4 8F BF BF",
        "F4 90 80 80",
        # Unexpected continuation bytes, alone and in runs.
        "80",
        "BF",
        "80 BF",
        "80 BF 80",
        "80 BF 80 BF",
        "80 BF 80 BF 80",
        "80 BF 80 BF 80 BF",
        "80 BF 80 BF 80 BF 80",
        # Every continuation byte, 80 through BF.
        _hex_range(0x80, 0xBF),
        # Every lead byte of each length, each followed by a space.
        _hex_range(0xC0, 0xDF, "20"),
        _hex_range(0xE0, 0xEF, "20"),
        _hex_range(0xF0, 0xF7, "20"),
        _hex_range(0xF8, 0xFB, "20"),
        _hex_range(0xFC, 0xFD, "20"),
        # Sequences with the last continuation byte missing.
        "C0",
        "E0 80",
        "F0 80 80",
        "F8 80 80 80",
        "FC 80 80 80 80",
        "DF",
        "EF BF",
        "F7 BF BF",
        "FB BF BF BF",
        "FD BF BF BF BF",
        # The ten incomplete sequences above, concatenated.
        "C0 E0 80 F0 80 80 F8 80 80 80 FC 80 80 80 80 "
        "DF EF BF F7 BF BF FB BF BF BF FD BF BF BF BF",
        # Bytes that never appear in UTF-8.
        "FE",
        "FF",
        "FE FE FF FF",
        # Overlong encodings of "/" (2F).
        "C0 AF",
        "E0 80 AF",
        "F0 80 80 AF",
        "F8 80 80 80 AF",
        "FC 80 80 80 80 AF",
        # Largest code point still overlong at each length.
        "C1 BF",
        "E0 9F BF",
        "F0 8F BF BF",
        "F8 87 BF BF BF",
        "FC 83 BF BF BF BF",
        # Overlong encodings of NUL.
        "C0 80",
        "E0 80 80",
        "F0 80 80 80",
        "F8 80 80 80 80",
        "FC 80 80 80 80 80",
        # Single UTF-16 surrogates.
        "ED A0 80",
        "ED AD BF",
        "ED AE 80",
        "ED AF BF",
        "ED B0 80",
        "ED BE 80",
        "ED BF BF",
        # Paired UTF-16 surrogates.
        "ED A0 80 ED B0 80",
        "ED A0 80 ED BF BF",
        "ED AD BF ED B0 80",
        "ED AD BF ED BF BF",
        "ED AE 80 ED B0 80",
        "ED AE 80 ED BF BF",
        "ED AF BF ED B0 80",
        # Previously ended in 8F, breaking the pattern of the pairs above
        # (each high surrogate is paired with ED B0 80 and ED BF BF).
        "ED AF BF ED BF BF",
        # Other illegal code positions.
        "EF BF BE",
        "EF BF BF",
    ]
    for _vector in _test_vectors:
        do(_vector.split())
elif mode == "input":
    def getchar():
        # Return the next byte of stdin as an integer, or None at EOF.
        s = sys.stdin.read(1)
        if s == "":
            return None
        return ord(s) & 0xFF # ensure it isn't negative
    process_utf8(getchar)
else:
    do(args)