Output a just-read character _before_ reading the next byte. Causes
[sgt/utils] / cvt-utf8 / cvt-utf8
CommitLineData
27c26167 1#!/usr/bin/env python
9acadc2b 2
3import sys
4import string
5import os
6import anydbm
7import zlib
8
class zip_untangler:
    """Read the first member of a zip archive from a stream, line by line.

    The object parses a single zip local file header from ``file`` and
    then inflates the member data that follows it on demand, handing it
    back through ``readline()``.  Only the first member is read.  The
    compression method field is not inspected; the data is assumed to be
    deflate.  The compressed length is taken from the local header.

    file       -- object with a ``read(n)`` method returning byte strings;
                  it may return fewer than ``n`` bytes (sockets do).
    datasofar  -- header bytes the caller has already consumed from
                  ``file`` (may be empty, must be shorter than 30 bytes).
    """

    def __init__(self, file, datasofar):
        self.file = file
        assert len(datasofar) < 30
        self.header = datasofar
        # Inflated bytes not yet returned to the caller.
        self.data = b""
        # Compressed bytes still to be read; None until the header is parsed.
        self.dataleft = None
        # A zip member holds a raw deflate stream (no zlib header or
        # trailer); negative wbits asks zlib for exactly that.
        self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)

    def _read_header_upto(self, size):
        """Grow self.header with bytes from the file until it is `size` long.

        Raises EOFError if the file ends first.
        """
        while len(self.header) < size:
            chunk = self.file.read(size - len(self.header))
            if not chunk:
                raise EOFError("zip local file header is truncated")
            self.header = self.header + chunk

    def readline(self):
        """Return the next line of inflated data, including its newline.

        The final line is returned without a newline if the data does not
        end with one; an empty string is returned once nothing is left.
        """
        if self.dataleft is None:
            # Fixed 30-byte part of the local header first, then the
            # variable-length name and extra field it announces.
            self._read_header_upto(30)
            fixed = bytearray(self.header[:30])
            # Name length and extra length (little-endian 16-bit).
            namelen = fixed[26] | (fixed[27] << 8)
            extralen = fixed[28] | (fixed[29] << 8)
            self._read_header_upto(30 + namelen + extralen)
            # Little-endian 32-bit count of bytes to read after the header.
            self.dataleft = (fixed[18] | (fixed[19] << 8) |
                             (fixed[20] << 16) | (fixed[21] << 24))

        eol = self.data.find(b"\n")
        while eol < 0 and self.dataleft > 0:
            chunk = self.file.read(min(self.dataleft, 4096))
            if not chunk:
                break
            # Count what actually arrived: a short read must not be
            # mistaken for the whole request, or the tail of the member
            # would never be fetched.
            self.dataleft -= len(chunk)
            self.data += self.decompress.decompress(chunk)
            eol = self.data.find(b"\n")

        if eol < 0:
            line, self.data = self.data, b""
        else:
            line, self.data = self.data[:eol + 1], self.data[eol + 1:]
        return line
55
def hexstr(x):
    """Return hex(x) without its decoration.

    Drops one trailing "L"/"l" (the suffix Python 2 puts on long
    integers) and a leading "0x"/"0X".  Only a prefix at the very start
    is removed, so a negative value keeps its "-0x".
    """
    digits = hex(x)
    if digits[-1:] in ("L", "l"):
        digits = digits[:-1]
    if digits[:2] in ("0x", "0X"):
        digits = digits[2:]
    return digits
63
64def charname(x):
68c596fb 65 if db is not None:
30862ac8 66 key = hexstr(x)
67 while len(key) < 4: key = "0" + key
68 key = string.upper(key)
69 if han_translations:
70 try:
71 value = handb[key]
72 return "<han> " + value
73 except KeyError:
74 pass
75 try:
76 value = db[key]
77 return string.split(value, ";")[1]
78 except KeyError:
79 return "<no name available>"
9acadc2b 80 else:
30862ac8 81 return ""
9acadc2b 82
def output(char, bytes, errors):
    """Emit one decoded item, either as an analysis row or as raw UTF-8.

    char    -- the code point, or -1 when the bytes did not decode to one.
    bytes   -- sequence of integer byte values that were consumed.
    errors  -- text describing any problems, "" if there were none.

    In analysis mode (global ``output_analysis`` true) one row is
    printed: the code point, the bytes in hex padded out to six byte
    cells, the character name if one is known, then ``errors``.
    Otherwise the bytes themselves are written to stdout, with anything
    undecodable or erroneous replaced by U+FFFD.
    """
    if output_analysis:
        code_cell = "U-%08X "
        byte_cell = " %02X"

        if char == -1:
            # Blank stand-in exactly as wide as a real code point field,
            # so the byte columns line up with those of valid rows.
            row = " " * len(code_cell % 0)
        else:
            row = code_cell % char
        row += "".join(byte_cell % b for b in bytes)
        # Pad with blank byte cells up to the six-byte maximum so that
        # the name column starts at the same place on every row.
        row += (" " * len(byte_cell % 0)) * (6 - len(bytes))

        if char == -1:
            name = ""
        else:
            name = charname(char)
        if name != "":
            row += " " + name
        row += errors
        sys.stdout.write(row + "\n")
    else:
        if char == -1 or errors != "":
            # problem chars become U+FFFD REPLACEMENT CHARACTER
            sys.stdout.write("\xEF\xBF\xBD")
        else:
            sys.stdout.write("".join(chr(b) for b in bytes))
9acadc2b 109
def process_ucs(x, bytes=None, errors=""):
    """Analyse one code point and pass it to output().

    x       -- the code point, 0 .. 0x7FFFFFFF (the range that the
               original one-to-six-byte form of UTF-8 can encode).
    bytes   -- the byte values it was actually decoded from, if any.
               When that sequence is longer than the shortest encoding
               it is reported as overlong and shown instead of the
               shortest form.
    errors  -- error text accumulated so far by the caller.

    Surrogates (U+D800..U+DFFF) and U+FFFE/U+FFFF are flagged as errors.
    Raises ValueError if x lies outside the encodable range.
    """
    if x < 0 or x > 0x7FFFFFFF:
        raise ValueError("code point %r is outside the range "
                         "UTF-8 can encode" % (x,))

    # Work out the shortest encoding of x.
    if x < 0x80:
        utf8 = [x]
    else:
        # (exclusive upper limit, lead byte marker, continuation bytes)
        for limit, lead, trail in ((0x800, 0xC0, 1),
                                   (0x10000, 0xE0, 2),
                                   (0x200000, 0xF0, 3),
                                   (0x4000000, 0xF8, 4),
                                   (0x80000000, 0xFC, 5)):
            if x < limit:
                break
        utf8 = [lead + (x >> (6 * trail))]
        for k in range(trail - 1, -1, -1):
            utf8.append(0x80 + (0x3F & (x >> (6 * k))))

    if bytes and len(bytes) > len(utf8):
        shortest = "".join(" %02X" % b for b in utf8)
        errors = errors + " (overlong form of" + shortest + ")"
        # Show what was really read, not what it should have been.
        utf8 = bytes
    if 0xD800 <= x <= 0xDFFF:
        errors = errors + " (surrogate)"
    if 0xFFFE <= x <= 0xFFFF:
        errors = errors + " (invalid char)"

    output(x, utf8, errors)
143
def process_utf8(next):
    """Decode a stream of byte values as UTF-8, reporting as it goes.

    next -- callable returning the next byte value as an integer, or
            None at end of input.

    Each complete sequence is handed to process_ucs(); malformed input
    (stray continuation bytes, truncated sequences, the bytes FE/FF, or
    a value that is not a byte at all) is reported through output() with
    a code point of -1.  An item is always reported before the byte
    after it is requested from ``next``.
    """
    # (lowest lead byte, highest lead byte, continuation bytes expected).
    # The lowest lead byte doubles as the mask of marker bits to remove.
    lead_ranges = ((0xC0, 0xDF, 1),
                   (0xE0, 0xEF, 2),
                   (0xF0, 0xF7, 3),
                   (0xF8, 0xFB, 4),
                   (0xFC, 0xFD, 5))

    c = next()
    while c is not None:
        seq = [c]
        if c < 0 or c >= 0xFE:
            # FE and FF never occur in UTF-8; anything outside 00..FF is
            # not a byte value at all (e.g. "100" given on the command
            # line) and is reported the same way.
            output(-1, seq, " (invalid UTF-8 byte)")
            c = next()
        elif c < 0x80:
            process_ucs(c)  # single-byte char
            c = next()
        elif c <= 0xBF:
            output(-1, seq, " (unexpected continuation byte)")
            c = next()
        else:
            # Lead byte, C0..FD: exactly one table row matches.
            for first, last, needed in lead_ranges:
                if first <= c <= last:
                    acc = c & ~first
                    break

            # Set when we have read a byte that does not belong to this
            # sequence; it must then be processed as the start of the
            # next item instead of being thrown away.
            lookahead = False
            while needed > 0:
                c = next()
                if c is None or c < 0x80 or c > 0xBF:
                    lookahead = True
                    break
                seq.append(c)
                acc = (acc << 6) + (c & 0x3F)
                needed = needed - 1

            if needed > 0:
                output(-1, seq, " (incomplete sequence)")
            else:
                process_ucs(acc, seq)
            if not lookahead:
                c = next()
9acadc2b 189
def do(args):
    """Process a list of command-line style arguments.

    Each argument is one of:
      U+hhhh / U-hhhh     a code point in hex, passed to process_ucs();
      &#ddd; or &#xhhh;   an SGML character entity (the trailing ";" is
                          optional), also passed to process_ucs();
      anything else       a hex byte value.
    Consecutive byte values are collected and decoded together by
    process_utf8() when the next code point argument, or the end of the
    list, is reached.
    """
    def stepper(values):
        # Turn a list into a callable that returns one element per call
        # and None once the elements have run out.
        remaining = iter(values)
        return lambda: next(remaining, None)

    pending = []
    for arg in args:
        if arg[0].upper() == "U":
            assert arg[1] == "+" or arg[1] == "-"
            is_ucs, value = True, int(arg[2:], 16)
        elif arg[:2] == "&#":
            # SGML character entity. Either &# followed by a
            # number, or &#x followed by a hex number.
            entity = arg[:-1] if arg[-1:] == ";" else arg
            if entity[:3].upper() == "&#X":
                is_ucs, value = True, int(entity[3:], 16)
            else:
                is_ucs, value = True, int(entity[2:], 10)
        else:
            is_ucs, value = False, int(arg, 16)

        if not is_ucs:
            pending.append(value)
            continue

        # A code point ends the current run of bytes: decode it first
        # so that output stays in argument order.
        if len(pending) > 0:
            process_utf8(stepper(pending))
            pending = []
        process_ucs(value)

    if len(pending) > 0:
        process_utf8(stepper(pending))
9acadc2b 232
def usage(arg):
    """Print the command-line help text to stdout.

    arg -- the option that asked for help.  "--help-admin" selects the
           administrator variant, which also lists the database-building
           commands; any other value gives the ordinary user variant.
    """
    common = [
        "usage: cvt-utf8 [flags] <hex UTF-8 bytes, U+codepoints, SGML entities>",
        " e.g. cvt-utf8 e2 82 ac",
        " or cvt-utf8 U+20ac",
        " or cvt-utf8 U-10ffff",
        " or cvt-utf8 '&#8211;'",
        "",
        "where: -o or --output just output well-formed UTF-8 instead of",
        " an analysis of the input data",
        " -h or --han also give Han definitions from unihan db",
        "",
        " also: cvt-utf8 --test run Markus Kuhn's decoder stress tests",
        " cvt-utf8 --input (or -i)",
        " read, analyse and decode UTF-8 from stdin",
    ]

    if arg == "--help-admin":
        variant = [
            " cvt-utf8 --help display user help text",
            " cvt-utf8 --help-admin display admin help text (this one)",
            " cvt-utf8 --build <infile> <outfile>",
            " convert UnicodeData.txt to unicode db",
            " cvt-utf8 --build-unihan <infile> <outfile>",
            " convert Unihan.txt to unihan db",
            " cvt-utf8 --fetch-build <outfile>",
            " build unicode db by download from unicode.org",
            " cvt-utf8 --fetch-build-unihan <outfile>",
            " build Unihan db by download from unicode.org",
        ]
    else:
        variant = [
            " cvt-utf8 --help display this help text",
            " cvt-utf8 --help-admin display admin help text",
        ]

    # NOTE(review): these two lines are taken to apply to both variants
    # (i.e. to follow the if/else rather than sit inside the else) --
    # confirm against a copy of the file with its indentation intact.
    closing = [
        " cvt-utf8 --version report version number",
        " cvt-utf8 --licence display (MIT) licence text",
    ]

    sys.stdout.write("\n".join(common + variant + closing) + "\n")
265
def licence():
    """Print the copyright notice and MIT licence text to stdout."""
    text = (
        'cvt-utf8 is copyright 2002-2004 Simon Tatham.',
        '',
        'Permission is hereby granted, free of charge, to any person',
        'obtaining a copy of this software and associated documentation files',
        '(the "Software"), to deal in the Software without restriction,',
        'including without limitation the rights to use, copy, modify, merge,',
        'publish, distribute, sublicense, and/or sell copies of the Software,',
        'and to permit persons to whom the Software is furnished to do so,',
        'subject to the following conditions:',
        '',
        'The above copyright notice and this permission notice shall be',
        'included in all copies or substantial portions of the Software.',
        '',
        'THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,',
        'EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF',
        'MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND',
        'NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS',
        'BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN',
        'ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN',
        'CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE',
        'SOFTWARE.',
    )
    for line in text:
        sys.stdout.write(line + "\n")
9acadc2b 288
def version():
    """Print the program's revision to stdout.

    The revision is the text after the colon in the revision keyword
    string below, once spaces and dollar signs are removed.  If the
    string contains no colon, "unknown version" is reported instead.
    """
    keyword = "$Revision$"
    fields = keyword.replace(" ", "").replace("$", "").split(":")
    if len(fields) > 1:
        message = "cvt-utf8 revision %s" % fields[1]
    else:
        message = "cvt-utf8: unknown version"
    sys.stdout.write(message + "\n")
da0f8522 298
# ----------------------------------------------------------------------
# Command-line option processing.
#
# Leading arguments that start with "-" are options.  Some only set a
# module-level flag that the functions above consult (output_analysis,
# han_translations) or select the run mode; the others (help, licence,
# version and the database builders) do their work and exit at once.
# Whatever is left in `args` afterwards is the data to analyse.

def _complain(message):
    """Write a cvt-utf8 diagnostic line to stderr."""
    sys.stderr.write("cvt-utf8: " + message + "\n")

def _warn_about_db_suffix(outfile):
    """Warn if the requested database name ends in ".db"."""
    if outfile[-3:] == ".db":
        _complain("warning: you should not append .db to db name")

def _build_name_db(infile, outfile):
    """Create the character name database `outfile` from `infile`.

    Every line read from infile is stored whole, keyed by the text
    before its first ";".
    """
    _warn_about_db_suffix(outfile)
    db = anydbm.open(outfile, "n")
    try:
        while 1:
            record = infile.readline()
            if record == "":
                break
            db[record.split(";")[0]] = record
    finally:
        db.close()

def _build_unihan_db(infile, outfile, pending=""):
    """Create the Han definition database `outfile` from `infile`.

    infile   -- object with readline() yielding tab-separated lines.
    pending  -- text already read from the start of the first line.

    Only lines of exactly three fields whose first field starts with
    "U+" and whose second is "kDefinition" are kept; the third field is
    stored under the first field minus its "U+".
    """
    _warn_about_db_suffix(outfile)
    db = anydbm.open(outfile, "n")
    try:
        while 1:
            line = pending + infile.readline()
            pending = ""
            if line == "":
                break
            fields = line.rstrip("\r\n").split("\t")
            if (len(fields) == 3 and fields[1] == "kDefinition"
                    and fields[0][:2] == "U+"):
                db[fields[0][2:]] = fields[2]
    finally:
        db.close()

args = sys.argv[1:]
output_analysis = 1
han_translations = 0
mode = "cmdline"

if args == []:
    usage("")
    sys.exit(0)

while len(args) > 0 and args[0][:1] == "-":
    opt = args[0]

    if opt == "--help" or opt == "--help-admin":
        usage(opt)
        sys.exit(0)

    elif opt == "--licence" or opt == "--license":
        licence()
        sys.exit(0)

    elif opt == "--version":
        version()
        sys.exit(0)

    elif opt == "-o" or opt == "--output":
        output_analysis = 0
        args = args[1:]

    elif opt == "-h" or opt == "--han":
        han_translations = 1
        args = args[1:]

    elif opt == "--build":
        if len(args) != 3:
            _complain("--build expects two filename arguments")
            sys.exit(1)
        infile = open(args[1], "r")
        try:
            _build_name_db(infile, args[2])
        finally:
            infile.close()
        sys.exit(0)

    elif opt == "--fetch-build":
        if len(args) != 2:
            _complain("--fetch-build expects one filename argument")
            sys.exit(1)
        import urllib
        infile = urllib.urlopen("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
        try:
            _build_name_db(infile, args[1])
        finally:
            infile.close()
        sys.exit(0)

    elif opt == "--build-unihan":
        if len(args) != 3:
            _complain("--build-unihan expects two filename arguments")
            sys.exit(1)
        # Binary mode: the file may be a zip archive, and the line
        # parser strips "\r" and "\n" itself.
        infile = open(args[1], "rb")
        try:
            first = infile.read(1)
            # Unihan.txt starts with a hash. If this file starts with a
            # P, we assume it's a zip file ("PK").
            if first == "P":
                _build_unihan_db(zip_untangler(infile, first), args[2])
            else:
                _build_unihan_db(infile, args[2], first)
        finally:
            infile.close()
        sys.exit(0)

    elif opt == "--fetch-build-unihan":
        if len(args) != 2:
            _complain("--fetch-build-unihan expects one filename argument")
            sys.exit(1)
        import urllib
        infile = urllib.urlopen("ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip")
        try:
            # We know this one is zipped.
            _build_unihan_db(zip_untangler(infile, ""), args[1])
        finally:
            infile.close()
        sys.exit(0)

    elif opt == "--test":
        mode = "test"
        args = args[1:]

    elif opt == "--input" or opt == "-i":
        mode = "input"
        args = args[1:]

    else:
        _complain("unknown argument '%s'" % opt)
        sys.exit(1)
9acadc2b 407
# ----------------------------------------------------------------------
# Find and open the character name database.
#
# The first location that opens successfully wins.  Having no database
# at all is not an error: db is left as None and charname() then
# returns empty names.

locations = [
    "/usr/share/unicode/unicode",
    "/usr/lib/unicode/unicode",
    "/usr/local/share/unicode/unicode",
    "/usr/local/lib/unicode/unicode",
]
# The per-user locations only exist if we know where home is; HOME can
# be unset (cron jobs, some non-Unix systems) and that must not stop a
# program whose database is optional anyway.
_home = os.environ.get("HOME")
if _home is not None:
    locations.append(_home + "/share/unicode/unicode")
    locations.append(_home + "/lib/unicode/unicode")

db = None
for loc in locations:
    try:
        db = anydbm.open(loc, "r")
    except IOError:
        db = None
    except anydbm.error:
        db = None
    if db is not None:
        break

if han_translations:
    # The unihan db is expected in the same directory as the name db
    # (or, if none was found, as the last location tried).
    i = loc.rfind("/")
    assert i >= 0
    hanloc = loc[:i+1] + "unihan"
    handb = anydbm.open(hanloc, "r")
    # this has been explicitly required, so we don't squelch exceptions
430 # this has been explicitly required, so we don't squelch exceptions
431
# ----------------------------------------------------------------------
# Run in the selected mode.

if mode == "test":
    # Decoder stress test vectors, one string of hex bytes per do() call.
    stress_vectors = [
        # Well-formed multi-byte text.
        "CE BA E1 BD B9 CF 83 CE BC CE B5",

        # First code point of each sequence length, one to six bytes.
        "00",
        "C2 80",
        "E0 A0 80",
        "F0 90 80 80",
        "F8 88 80 80 80",
        "FC 84 80 80 80 80",

        # Last code point of each sequence length.
        "7F",
        "DF BF",
        "EF BF BF",
        "F7 BF BF BF",
        "FB BF BF BF BF",
        "FD BF BF BF BF BF",

        # Other boundary values: either side of the surrogate block,
        # U+FFFD, and either side of U+10FFFF.
        "ED 9F BF",
        "EE 80 80",
        "EF BF BD",
        "F4 8F BF BF",
        "F4 90 80 80",

        # Continuation bytes with no lead byte, in runs of one to seven.
        "80",
        "BF",
        "80 BF",
        "80 BF 80",
        "80 BF 80 BF",
        "80 BF 80 BF 80",
        "80 BF 80 BF 80 BF",
        "80 BF 80 BF 80 BF 80",

        # Every continuation byte, 80 to BF, in one run.
        " ".join(["%02X" % b for b in range(0x80, 0xC0)]),

        # Every lead byte of each sequence length, each followed by a
        # space instead of its continuation bytes.
        " ".join(["%02X 20" % b for b in range(0xC0, 0xE0)]),
        " ".join(["%02X 20" % b for b in range(0xE0, 0xF0)]),
        " ".join(["%02X 20" % b for b in range(0xF0, 0xF8)]),
        " ".join(["%02X 20" % b for b in range(0xF8, 0xFC)]),
        " ".join(["%02X 20" % b for b in range(0xFC, 0xFE)]),

        # Sequences with the last byte missing: lowest values...
        "C0",
        "E0 80",
        "F0 80 80",
        "F8 80 80 80",
        "FC 80 80 80 80",
        # ...and highest values...
        "DF",
        "EF BF",
        "F7 BF BF",
        "FB BF BF BF",
        "FD BF BF BF BF",
        # ...and all ten of those back to back.
        "C0 E0 80 F0 80 80 F8 80 80 80 FC 80 80 80 80 "
        "DF EF BF F7 BF BF FB BF BF BF FD BF BF BF BF",

        # Bytes that can never appear in UTF-8.
        "FE",
        "FF",
        "FE FE FF FF",

        # Overlong encodings of U+002F.
        "C0 AF",
        "E0 80 AF",
        "F0 80 80 AF",
        "F8 80 80 80 AF",
        "FC 80 80 80 80 AF",

        # Highest code point that each length can only encode overlong.
        "C1 BF",
        "E0 9F BF",
        "F0 8F BF BF",
        "F8 87 BF BF BF",
        "FC 83 BF BF BF BF",

        # Overlong encodings of U+0000.
        "C0 80",
        "E0 80 80",
        "F0 80 80 80",
        "F8 80 80 80 80",
        "FC 80 80 80 80 80",

        # Single surrogates.
        "ED A0 80",
        "ED AD BF",
        "ED AE 80",
        "ED AF BF",
        "ED B0 80",
        "ED BE 80",
        "ED BF BF",

        # Surrogate pairs: each of four high surrogates followed by
        # each of two low surrogates.
        "ED A0 80 ED B0 80",
        "ED A0 80 ED BF BF",
        "ED AD BF ED B0 80",
        "ED AD BF ED BF BF",
        "ED AE 80 ED B0 80",
        "ED AE 80 ED BF BF",
        "ED AF BF ED B0 80",
        "ED AF BF ED BF BF",

        # U+FFFE and U+FFFF.
        "EF BF BE",
        "EF BF BF",
    ]
    for vector in stress_vectors:
        do(vector.split())

elif mode == "input":
    def getchar():
        """Return the next byte from stdin as an integer, None at EOF."""
        s = sys.stdin.read(1)
        if s == "":
            return None
        return ord(s) & 0xFF  # ensure it isn't negative
    process_utf8(getchar)

else:
    do(args)