Move some of my more useful utilities out from my all-purpose
[sgt/utils] / cvt-utf8 / cvt-utf8
CommitLineData
9acadc2b 1#!/usr/bin/env python
2
3import sys
4import string
5import os
6import anydbm
7import zlib
8
class zip_untangler:
    """Expose the first member of a zip archive as a readline()-able object.

    The object parses one zip "local file header" (30 fixed bytes followed
    by the member name and the extra field), then inflates at most
    "compressed size" bytes of the data that follows and hands the result
    back one line at a time.

    file      -- object with a read(n) method returning byte strings.
    datasofar -- bytes of the header the caller has already consumed from
                 `file` (fewer than 30), e.g. the "P" used to sniff "PK".

    NOTE(review): the member is assumed to be deflate-compressed and the
    compressed size is taken from the local header; archives that store the
    size only in a trailing data descriptor would yield no data -- confirm
    this is acceptable for the inputs in use.
    """

    def __init__(self, file, datasofar):
        assert len(datasofar) < 30
        self.file = file
        self.header = datasofar
        self.data = b""          # inflated bytes not yet returned
        self.dataleft = None     # compressed bytes still unread; None
                                 # until the header has been parsed
        # Zip members are raw deflate streams with no zlib wrapper;
        # negative wbits asks zlib for exactly that.
        self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)

    def _fill_header(self, size):
        """Read from the file until self.header holds `size` bytes."""
        while len(self.header) < size:
            s = self.file.read(size - len(self.header))
            if not s:
                # A plain assert would vanish under -O and loop forever.
                raise EOFError("zip local file header is truncated")
            self.header = self.header + s

    def _field(self, offset, width):
        """Decode a little-endian unsigned header field."""
        value = 0
        # bytearray yields ints on both Python 2 and Python 3.
        for b in reversed(bytearray(self.header[offset:offset + width])):
            value = (value << 8) | b
        return value

    def readline(self):
        """Return the next line including its newline, or b"" at the end.

        The final line is returned without a newline if the data does not
        end with one. Raises EOFError if the file ends inside the header.
        """
        if self.dataleft is None:
            self._fill_header(30)
            # Name length and extra length.
            namelen = self._field(26, 2)
            extralen = self._field(28, 2)
            self._fill_header(30 + namelen + extralen)
            # Compressed size.
            self.dataleft = self._field(18, 4)
        k = self.data.find(b"\n")
        while k < 0:
            rlen = min(self.dataleft, 4096)
            if rlen == 0:
                break
            d = self.file.read(rlen)
            if not d:
                break
            # Count what was actually delivered: read() may return fewer
            # bytes than requested.
            self.dataleft = self.dataleft - len(d)
            scanned = len(self.data)
            self.data = self.data + self.decompress.decompress(d)
            # Only the newly inflated tail can contain a newline.
            k = self.data.find(b"\n", scanned)
        if k < 0:
            ret = self.data
            self.data = b""
        else:
            ret = self.data[:k+1]
            self.data = self.data[k+1:]
        return ret
55
def hexstr(x):
    """Return the lower-case hex digits of integer x.

    The result has no "0x" prefix and no long-integer "L" suffix; a
    negative value keeps only its leading "-".
    """
    # %x formatting never emits the prefix or suffix, so nothing has to be
    # stripped afterwards (stripping hex() output missed "-0x...").
    return "%x" % x
63
def charname(x):
    """Look up a display name for code point x.

    Returns "" when no unicode database is loaded (module-level `db` is
    false). Otherwise the key is x as upper-case hex, zero-padded to at
    least four digits. With `han_translations` set, a hit in `handb` is
    returned as "<han> " plus the stored value; failing that, the second
    ";"-separated field of the `db` record is returned, or
    "<no name available>" when the key is absent.
    """
    if not db:
        return ""

    key = hexstr(x).upper().rjust(4, "0")

    if han_translations:
        try:
            return "<han> " + handb[key]
        except KeyError:
            # No Han definition; fall back to the general database.
            pass

    try:
        record = db[key]
    except KeyError:
        return "<no name available>"
    return record.split(";")[1]
82
def output(char, bytes, errors):
    """Emit one decoded item, in the form selected by `output_analysis`.

    char   -- code point, or -1 when the bytes did not decode to one.
    bytes  -- list of integer byte values making up the item.
    errors -- diagnostic text to append ("" when there is nothing wrong).

    Analysis mode prints one line: the code point column, the bytes in
    hex padded out to six byte slots, the character name (if any) and the
    diagnostics. Otherwise the bytes themselves are written to stdout, or
    U+FFFD REPLACEMENT CHARACTER if the item is invalid or has errors.
    """
    if output_analysis:
        if char == -1:
            # Blank column exactly as wide as a printed code point, so
            # the byte columns line up on every row.
            s = " " * len("U-%08X " % 0)
            name = ""
        else:
            s = "U-%08X " % char
            name = charname(char)
        s = s + "".join([" %02X" % b for b in bytes])
        # Each missing byte slot is as wide as one printed byte.
        s = s + " " * (len(" %02X" % 0) * (6 - len(bytes)))
        if name != "":
            s = s + " " + name
        print(s + errors)
    else:
        # NOTE(review): this branch writes raw bytes through str, which
        # relies on Python 2 byte-string semantics for sys.stdout.
        if char == -1 or errors != "":
            # problem chars become U+FFFD REPLACEMENT CHARACTER
            sys.stdout.write("\xEF\xBF\xBD")
        else:
            sys.stdout.write("".join([chr(b) for b in bytes]))
109
def process_ucs(x, bytes=None, errors=""):
    """Encode code point x as UTF-8, diagnose it, and pass it to output().

    x      -- code point, 0 <= x < 0x80000000 (the 31-bit range of the
              original one-to-six-byte UTF-8 scheme).
    bytes  -- the byte values x was actually decoded from, if any. When
              there are more of them than the shortest encoding needs,
              they are reported as an overlong form and shown in place of
              the shortest encoding.
    errors -- diagnostics gathered so far; more may be appended.

    Raises ValueError if x is outside the encodable range.
    """
    # Explicit check rather than assert: x comes straight from the
    # command line and the check must survive python -O.
    if x < 0 or x >= 0x80000000:
        raise ValueError("code point out of range for UTF-8: %r" % (x,))

    if x < 0x80:
        utf8 = [x]
        realbytes = 1
    else:
        # (lead-byte marker, number of continuation bytes)
        if x < 0x800:
            lead, trail = 0xC0, 1
        elif x < 0x10000:
            lead, trail = 0xE0, 2
        elif x < 0x200000:
            lead, trail = 0xF0, 3
        elif x < 0x4000000:
            lead, trail = 0xF8, 4
        else:
            lead, trail = 0xFC, 5
        realbytes = trail + 1
        utf8 = [lead + (x >> (6 * trail))]
        for i in range(trail - 1, -1, -1):
            utf8.append(0x80 + (0x3F & (x >> (i * 6))))

    if bytes and len(bytes) > realbytes:
        errors = errors + " (overlong form of"
        errors = errors + "".join([" %02X" % b for b in utf8])
        errors = errors + ")"
        utf8 = bytes
    if x >= 0xD800 and x <= 0xDFFF:
        errors = errors + " (surrogate)"
    if x >= 0xFFFE and x <= 0xFFFF:
        errors = errors + " (invalid char)"

    output(x, utf8, errors)
143
def process_utf8(next):
    """Decode a stream of byte values as UTF-8 and report every item.

    next -- callable returning the next byte value as an integer, or None
            once the input is exhausted.

    Well-formed sequences go to process_ucs(); stray continuation bytes,
    bytes that can never appear in UTF-8 and sequences cut short are
    reported through output() with a code point of -1. A byte that cuts a
    sequence short is not swallowed: it is decoded again as the start of
    the next item.
    """
    # (first lead byte, last lead byte, continuation bytes that follow)
    lead_ranges = (
        (0xC0, 0xDF, 1),
        (0xE0, 0xEF, 2),
        (0xF0, 0xF7, 3),
        (0xF8, 0xFB, 4),
        (0xFC, 0xFD, 5),
    )

    c = next()
    while c is not None:
        char = [c]

        if 0 <= c < 0x80:
            process_ucs(c)  # single-byte char
            c = next()
            continue
        if c < 0 or c >= 0xFE:
            # 0xFE and 0xFF never occur in UTF-8. Values that are not
            # bytes at all (possible with hand-typed input) land here too
            # instead of reaching the lead-byte code with nothing set up.
            output(-1, char, " (invalid UTF-8 byte)")
            c = next()
            continue
        if c <= 0xBF:
            output(-1, char, " (unexpected continuation byte)")
            c = next()
            continue

        # c is a lead byte in 0xC0..0xFD, so exactly one range matches.
        for low, high, cbytes in lead_ranges:
            if low <= c <= high:
                acc = c & ~low
                break

        while cbytes > 0:
            c = next()
            if c is None or c < 0x80 or c > 0xBF:
                # Leave c alone: the outer loop decodes it next.
                break
            char.append(c)
            acc = (acc << 6) + (c & 0x3F)
            cbytes = cbytes - 1

        if cbytes > 0:
            output(-1, char, " (incomplete sequence)")
        else:
            c = next()
            process_ucs(acc, char)
189
def do(args):
    """Process a list of command-line style arguments.

    Each argument is either a byte value in hex ("e2") or a code point
    written "U+<hex>" / "U-<hex>" (the "U" may be lower case). Runs of
    byte values are decoded together with process_utf8(); each code point
    is handed to process_ucs().

    Raises ValueError for an argument that is neither form.
    """
    # Class to turn a list into a callable object that returns one
    # element at a time, then None for ever after.
    class liststepper:
        def __init__(self, items):
            self.items = items
            self.index = 0
        def __call__(self):
            if self.index >= len(self.items):
                return None
            ret = self.items[self.index]
            self.index = self.index + 1
            return ret

    pending = []  # byte values collected since the last code point
    for arg in args:
        # Slicing keeps an empty argument from raising IndexError; it
        # falls through to int(), which rejects it with ValueError.
        if arg[:1].upper() == "U":
            if pending:
                process_utf8(liststepper(pending))
                pending = []
            if arg[1:2] not in ("+", "-"):
                raise ValueError(
                    "expected U+<hex> or U-<hex>, got %r" % (arg,))
            process_ucs(int(arg[2:], 16))
        else:
            pending.append(int(arg, 16))

    if pending:
        process_utf8(liststepper(pending))
217
# Command-line arguments and the defaults the flags may override.
args = sys.argv[1:]
output_analysis = 1
han_translations = 0

# With no arguments, or a lone --help / --help-admin, print usage and stop.
if args in ([], ["--help"], ["--help-admin"]):
    usage = [
        "Usage: cvt-utf8 [flags] <hex UTF-8 bytes and/or U+codepoints>",
        " e.g. cvt-utf8 e2 82 ac",
        " or cvt-utf8 U+20ac",
        " or cvt-utf8 U-10ffff",
        "",
        "Flags: -o or --output just output well-formed UTF-8 instead of",
        " an analysis of the input data",
        " -h or --han also give Han definitions from unihan db",
        "",
        "Also: cvt-utf8 --test run Markus Kuhn's decoder stress tests",
        " cvt-utf8 --input (or -i)",
        " read, analyse and decode UTF-8 from stdin",
    ]
    if args == ["--help-admin"]:
        # The admin text additionally lists the database-building modes.
        usage.extend([
            " cvt-utf8 --help display user help text",
            " cvt-utf8 --help-admin display admin help text (this one)",
            " cvt-utf8 --build <infile> <outfile>",
            " convert UnicodeData.txt to unicode db",
            " cvt-utf8 --build-unihan <infile> <outfile>",
            " convert Unihan.txt to unihan db",
            " cvt-utf8 --fetch-build <outfile>",
            " build unicode db by download from unicode.org",
            " cvt-utf8 --fetch-build-unihan <outfile>",
            " build Unihan db by download from unicode.org",
        ])
    else:
        usage.extend([
            " cvt-utf8 --help display this help text",
            " cvt-utf8 --help-admin display admin help text",
        ])
    for line in usage:
        print(line)
    sys.exit(0)
252
# Consume leading flags. They may come in either order; whatever is left
# afterwards is a mode option or the data to analyse.
while args and args[0] in ("-o", "--output", "-h", "--han"):
    if args[0] in ("-o", "--output"):
        output_analysis = 0
    else:
        han_translations = 1
    args = args[1:]

# The code below indexes args[0]; fail with a message rather than an
# IndexError when the flags were all there was.
if not args:
    print("cvt-utf8: expected hex bytes, U+codepoints or a mode option after the flags")
    sys.exit(1)
260
# --build / --fetch-build: create the unicode database, keyed by the first
# ";"-separated field of each input line, with the whole line (newline
# included) as the value. Input is a local file or a download.
if args[0] == "--build" or args[0] == "--fetch-build":
    if args[0] == "--build":
        if len(args) != 3:
            print("cvt-utf8: --build expects two filename arguments")
            sys.exit(1)
        infile = open(args[1], "r")
        outfile = args[2]
    else:
        if len(args) != 2:
            print("cvt-utf8: --fetch-build expects one filename argument")
            sys.exit(1)
        import urllib
        infile = urllib.urlopen("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
        outfile = args[1]

    # Close both handles even if building the database fails part-way.
    try:
        # Now build the database.
        if outfile[-3:] == ".db":
            print("cvt-utf8: warning: you should not append .db to db name")

        db = anydbm.open(outfile, "n")
        try:
            # readline() returns "" only at end of input.
            for s in iter(infile.readline, ""):
                db[s.split(";")[0]] = s
        finally:
            db.close()
    finally:
        infile.close()
    sys.exit(0)
287
# --build-unihan / --fetch-build-unihan: create the Han definition
# database from Unihan data. Every line of the form
# "U+<hex> <TAB> kDefinition <TAB> <text>" is stored as db[<hex>] = <text>.
if args[0] == "--build-unihan" or args[0] == "--fetch-build-unihan":
    if args[0] == "--build-unihan":
        if len(args) != 3:
            print("cvt-utf8: --build-unihan expects two filename arguments")
            sys.exit(1)
        # Binary mode: the file may be a zip archive, which text mode
        # would corrupt on platforms that translate line endings.
        rawfile = open(args[1], "rb")
        infile = rawfile
        s = infile.read(1)
        # Unihan.txt starts with a hash. If this file starts with a
        # P, we assume it's a zip file ("PK").
        if s == "P":
            infile = zip_untangler(infile, s)
            s = ""
        outfile = args[2]
    else:
        if len(args) != 2:
            print("cvt-utf8: --fetch-build-unihan expects one filename argument")
            sys.exit(1)
        import urllib
        rawfile = urllib.urlopen("ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip")
        # We know this one is zipped.
        infile = zip_untangler(rawfile, "")
        outfile = args[1]
        s = ""

    # Close both handles even if building the database fails part-way.
    # zip_untangler has no close(), so the underlying file is kept apart.
    try:
        # Now build the database.
        if outfile[-3:] == ".db":
            print("cvt-utf8: warning: you should not append .db to db name")

        db = anydbm.open(outfile, "n")
        try:
            while 1:
                # On the first pass s may still hold the byte that was
                # read to sniff the file type.
                s = s + infile.readline()
                if s == "":
                    break
                s = s.rstrip("\r\n")
                sa = s.split("\t")
                if len(sa) == 3 and sa[1] == "kDefinition" and sa[0][:2] == "U+":
                    db[sa[0][2:]] = sa[2]
                s = ""
        finally:
            db.close()
    finally:
        rawfile.close()
    sys.exit(0)
327
# Candidate locations for the unicode database, in search order.
locations = [
    "/usr/share/unicode/unicode",
    "/usr/lib/unicode/unicode",
    "/usr/local/share/unicode/unicode",
    "/usr/local/lib/unicode/unicode",
]
# The per-user locations only exist if HOME is set; do not crash with a
# KeyError when it is not.
home = os.environ.get("HOME")
if home is not None:
    locations.append(home + "/share/unicode/unicode")
    locations.append(home + "/lib/unicode/unicode")

# Use the first database that opens; db stays None if none does, in which
# case character names are simply left out.
db = None
for loc in locations:
    try:
        db = anydbm.open(loc, "r")
    except IOError:
        db = None
    except anydbm.error:
        db = None
    if db is not None:
        break

if han_translations:
    # The unihan database sits next to the unicode one. If no unicode
    # database was found, loc is the last location that was tried.
    i = loc.rfind("/")
    assert i >= 0
    hanloc = loc[:i+1] + "unihan"
    handb = anydbm.open(hanloc, "r")
    # this has been explicitly required, so we don't squelch exceptions
351
# Final dispatch: run the built-in decoder stress vectors, decode stdin,
# or treat the remaining arguments as data.
if args[0] == "--test":
    # Each vector is handed to do() on its own, in this order.
    vectors = [v.split() for v in (
        "CE BA E1 BD B9 CF 83 CE BC CE B5",
        "00",
        "C2 80",
        "E0 A0 80",
        "F0 90 80 80",
        "F8 88 80 80 80",
        "FC 84 80 80 80 80",
        "7F",
        "DF BF",
        "EF BF BF",
        "F7 BF BF BF",
        "FB BF BF BF BF",
        "FD BF BF BF BF BF",
        "ED 9F BF",
        "EE 80 80",
        "EF BF BD",
        "F4 8F BF BF",
        "F4 90 80 80",
        "80",
        "BF",
        "80 BF",
        "80 BF 80",
        "80 BF 80 BF",
        "80 BF 80 BF 80",
        "80 BF 80 BF 80 BF",
        "80 BF 80 BF 80 BF 80",
    )]

    # Every continuation byte, 80 through BF, in a single run.
    vectors.append(["%02X" % b for b in range(0x80, 0xC0)])

    # Every lead byte of each sequence length, each followed by a space
    # (20) where a continuation byte should have been.
    for first, last in ((0xC0, 0xDF), (0xE0, 0xEF), (0xF0, 0xF7),
                        (0xF8, 0xFB), (0xFC, 0xFD)):
        run = []
        for b in range(first, last + 1):
            run.append("%02X" % b)
            run.append("20")
        vectors.append(run)

    vectors.extend([v.split() for v in (
        "C0",
        "E0 80",
        "F0 80 80",
        "F8 80 80 80",
        "FC 80 80 80 80",
        "DF",
        "EF BF",
        "F7 BF BF",
        "FB BF BF BF",
        "FD BF BF BF BF",
        "C0 E0 80 F0 80 80 F8 80 80 80 FC 80 80 80 80"
        " DF EF BF F7 BF BF FB BF BF BF FD BF BF BF BF",
        "FE",
        "FF",
        "FE FE FF FF",
        "C0 AF",
        "E0 80 AF",
        "F0 80 80 AF",
        "F8 80 80 80 AF",
        "FC 80 80 80 80 AF",
        "C1 BF",
        "E0 9F BF",
        "F0 8F BF BF",
        "F8 87 BF BF BF",
        "FC 83 BF BF BF BF",
        "C0 80",
        "E0 80 80",
        "F0 80 80 80",
        "F8 80 80 80 80",
        "FC 80 80 80 80 80",
        "ED A0 80",
        "ED AD BF",
        "ED AE 80",
        "ED AF BF",
        "ED B0 80",
        "ED BE 80",
        "ED BF BF",
        "ED A0 80 ED B0 80",
        "ED A0 80 ED BF BF",
        "ED AD BF ED B0 80",
        "ED AD BF ED BF BF",
        "ED AE 80 ED B0 80",
        "ED AE 80 ED BF BF",
        "ED AF BF ED B0 80",
        "ED AF BF ED BF 8F",
        "EF BF BE",
        "EF BF BF",
    )])

    for vector in vectors:
        do(vector)
elif args[0] in ("--input", "-i"):
    def getchar():
        """Return the next byte of stdin as an integer, or None at EOF."""
        ch = sys.stdin.read(1)
        if ch:
            return ord(ch) & 0xFF  # ensure it isn't negative
        return None
    process_utf8(getchar)
else:
    do(args)
458 process_utf8(getchar)
459else:
460 do(args)