mdw@git.distorted.org.uk Git - sgt/utils/blob - cvt-utf8/cvt-utf8

   1 #!/usr/bin/env python
   2
   3 import sys
   4 import string
   5 import os
   6 import anydbm
   7 import zlib
   8
   9 class zip_untangler:
  10     def __init__(self, file, datasofar):
  11         self.file = file
  12         assert len(datasofar) < 30
  13         self.header = datasofar
  14         self.data = ""
  15         self.dataleft = None
  16         self.decompress = zlib.decompressobj()
  17         # Zlib header bytes, expected by decompress obj but not
  18         # present in zip file
  19         self.decompress.decompress("\x78\x9c")
  20
  21     def readline(self):
  22         if self.dataleft == None:
  23             while len(self.header) < 30:
  24                 s = self.file.read(30 - len(self.header))
  25                 assert s != ""
  26                 self.header = self.header + s
  27             # Name length and extra length.
  28             namelen = 256 * ord(self.header[27]) + ord(self.header[26])
  29             extralen = 256 * ord(self.header[29]) + ord(self.header[28])
  30             while len(self.header) < 30 + namelen + extralen:
  31                 s = self.file.read(30 + namelen + extralen - len(self.header))
  32                 assert s != ""
  33                 self.header = self.header + s
  34             self.dataleft = \
  35             256 * (256 * (256 * ord(self.header[21]) + ord(self.header[20])) \
  36             + ord(self.header[19])) + ord(self.header[18])
  37         k = string.find(self.data, "\n")
  38         while k < 0:
  39             rlen = self.dataleft
  40             if rlen > 4096: rlen = 4096
  41             if rlen == 0: break
  42             d = self.file.read(rlen)
  43             if d == "": break
  44             self.dataleft = self.dataleft - rlen
  45             self.data = self.data + self.decompress.decompress(d)
  46             k = string.find(self.data, "\n")
  47         if k < 0:
  48             ret = self.data
  49             self.data = ""
  50             return ret
  51         else:
  52             ret = self.data[:k+1]
  53             self.data = self.data[k+1:]
  54             return ret
  55
  56 def hexstr(x):
  57     s = hex(x)
  58     if s[-1:] == "L" or s[-1:] == "l":
  59         s = s[:-1]
  60     if s[:2] == "0x" or s[:2] == "0X":
  61         s = s[2:]
  62     return s
  63
  64 def charname(x):
  65     if db:
  66         key = hexstr(x)
  67         while len(key) < 4: key = "0" + key
  68         key = string.upper(key)
  69         if han_translations:
  70             try:
  71                 value = handb[key]
  72                 return "<han> " + value
  73             except KeyError:
  74                 pass
  75         try:
  76             value = db[key]
  77             return string.split(value, ";")[1]
  78         except KeyError:
  79             return "<no name available>"
  80     else:
  81         return ""
  82
  83 def output(char, bytes, errors):
  84     if output_analysis:
  85         if char == -1:
  86             s = "           "
  87         else:
  88             s = "U-%08X " % char
  89         for i in bytes:
  90             s = s + " %02X" % i
  91         for i in range(6-len(bytes)):
  92             s = s + "   "
  93
  94         if char == -1:
  95             name = ""
  96         else:
  97             name = charname(char)
  98         if name != "":
  99             s = s + " " + name
 100         s = s + errors
 101         print s
 102     else:
 103         if char == -1 or errors != "":
 104             # problem chars become U+FFFD REPLACEMENT CHARACTER
 105             sys.stdout.write("\xEF\xBF\xBD")
 106         else:
 107             for i in bytes:
 108                 sys.stdout.write(chr(i))
 109
 110 def process_ucs(x, bytes=[], errors=""):
 111     if x < 0x80:
 112         utf8 = [x]
 113         realbytes = 1
 114     else:
 115         if x < 0x800:
 116             tmp = (0xC0, 1)
 117         elif x < 0x10000:
 118             tmp = (0xE0, 2)
 119         elif x < 0x200000:
 120             tmp = (0xF0, 3)
 121         elif x < 0x4000000:
 122             tmp = (0xF8, 4)
 123         else:
 124             assert x < 0x80000000L
 125             tmp = (0xFC, 5)
 126         realbytes = tmp[1] + 1
 127         utf8 = [tmp[0] + (x >> (6*tmp[1]))]
 128         for i in range(tmp[1]-1, -1, -1):
 129             utf8.append(0x80 + (0x3F & (x >> (i*6))))
 130
 131     if bytes != [] and len(bytes) > realbytes:
 132         errors = errors + " (overlong form of"
 133         for i in utf8:
 134             errors = errors + " %02X" % i
 135         errors = errors + ")"
 136         utf8 = bytes
 137     if x >= 0xD800 and x <= 0xDFFF:
 138         errors = errors + " (surrogate)"
 139     if x >= 0xFFFE and x <= 0xFFFF:
 140         errors = errors + " (invalid char)"
 141
 142     output(x, utf8, errors)
 143
 144 def process_utf8(next):
 145     c = next()
 146     while c != None:
 147         char = [c]
 148         i = c
 149         if i < 0x80:
 150             process_ucs(i) # single-byte char
 151             c = next()
 152         elif i == 0xfe or i == 0xff:
 153             output(-1, char, " (invalid UTF-8 byte)")
 154             c = next()
 155         elif i >= 0x80 and i <= 0xbf:
 156             output(-1, char, " (unexpected continuation byte)")
 157             c = next()
 158         else:
 159             if i >= 0xC0 and i <= 0xDF:
 160                 acc = i &~ 0xC0
 161                 cbytes = 1
 162             elif i >= 0xE0 and i <= 0xEF:
 163                 acc = i &~ 0xE0
 164                 cbytes = 2
 165             elif i >= 0xF0 and i <= 0xF7:
 166                 acc = i &~ 0xF0
 167                 cbytes = 3
 168             elif i >= 0xF8 and i <= 0xFB:
 169                 acc = i &~ 0xF8
 170                 cbytes = 4
 171             elif i >= 0xFC and i <= 0xFD:
 172                 acc = i &~ 0xFC
 173                 cbytes = 5
 174             gotone = 0
 175             while cbytes > 0:
 176                 c = next()
 177                 if c == None or c < 0x80 or c > 0xBF:
 178                     gotone = 1
 179                     break
 180                 char.append(c)
 181                 acc = (acc << 6) + (c & 0x3F)
 182                 cbytes = cbytes - 1
 183             if not gotone:
 184                 c = next()
 185             if cbytes > 0:
 186                 output(-1, char, " (incomplete sequence)")
 187             else:
 188                 process_ucs(acc, char)
 189
 190 def do(args):
 191     # Class to turn a list into a callable object that returns one
 192     # element at a time.
 193     class liststepper:
 194         def __init__(self, list):
 195             self.list = list
 196             self.index = 0
 197         def __call__(self):
 198             if self.index >= len(self.list):
 199                 return None
 200             ret = self.list[self.index]
 201             self.index = self.index + 1
 202             return ret
 203
 204     list = []
 205     for arg in args:
 206         if string.upper(arg[0]) == "U":
 207             if len(list) > 0:
 208                 process_utf8(liststepper(list))
 209                 list = []
 210             assert arg[1] == "+" or arg[1] == "-"
 211             process_ucs(string.atoi(arg[2:], 16))
 212         else:
 213             list.append(string.atoi(arg, 16))
 214
 215     if len(list) > 0:
 216         process_utf8(liststepper(list))
 217
 218 args = sys.argv[1:]
 219 output_analysis = 1
 220 han_translations = 0
 221
 222 if args == [] or args == ["--help"] or args == ["--help-admin"]:
 223     print "Usage: cvt-utf8 [flags] <hex UTF-8 bytes and/or U+codepoints>"
 224     print "  e.g. cvt-utf8 e2 82 ac"
 225     print "    or cvt-utf8 U+20ac"
 226     print "    or cvt-utf8 U-10ffff"
 227     print ""
 228     print "Flags: -o or --output        just output well-formed UTF-8 instead of"
 229     print "                             an analysis of the input data"
 230     print "       -h or --han           also give Han definitions from unihan db"
 231     print ""
 232     print "Also:  cvt-utf8 --test       run Markus Kuhn's decoder stress tests" #'
 233     print "       cvt-utf8 --input (or -i)"
 234     print "                             read, analyse and decode UTF-8 from stdin"
 235     if args == ["--help-admin"]:
 236         print "       cvt-utf8 --help       display user help text"
 237         print "       cvt-utf8 --help-admin display admin help text (this one)"
 238         print "       cvt-utf8 --build <infile> <outfile>"
 239         print "                             convert UnicodeData.txt to unicode db"
 240         print "       cvt-utf8 --build-unihan <infile> <outfile>"
 241         print "                             convert Unihan.txt to unihan db"
 242         print "       cvt-utf8 --fetch-build <outfile>"
 243         print "                             "+\
 244         "build unicode db by download from unicode.org"
 245         print "       cvt-utf8 --fetch-build-unihan <outfile>"
 246         print "                             "+\
 247         "build Unihan db by download from unicode.org"
 248     else:
 249         print "       cvt-utf8 --help       display this help text"
 250         print "       cvt-utf8 --help-admin display admin help text"
 251     sys.exit(0)
 252
 253 if args[0] == "-o" or args[0] == "--output":
 254     output_analysis = 0
 255     args = args[1:]
 256
 257 if args[0] == "-h" or args[0] == "--han":
 258     han_translations = 1
 259     args = args[1:]
 260
 261 if args[0] == "--build" or args[0] == "--fetch-build":
 262     if args[0] == "--build":
 263         if len(args) != 3:
 264             print "cvt-utf8: --build expects two filename arguments"
 265             sys.exit(1)
 266         infile = open(args[1], "r")
 267         outfile = args[2]
 268     else:
 269         if len(args) != 2:
 270             print "cvt-utf8: --fetch-build expects one filename argument"
 271             sys.exit(1)
 272         import urllib
 273         infile = urllib.urlopen("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
 274         outfile = args[1]
 275     # Now build the database.
 276     if outfile[-3:] == ".db":
 277         print "cvt-utf8: warning: you should not append .db to db name"
 278
 279     db = anydbm.open(outfile, "n")
 280     while 1:
 281         s = infile.readline()
 282         if s == "": break
 283         ss = string.split(s, ";")[0]
 284         db[ss] = s
 285     db.close()
 286     sys.exit(0)
 287
 288 if args[0] == "--build-unihan" or args[0] == "--fetch-build-unihan":
 289     if args[0] == "--build-unihan":
 290         if len(args) != 3:
 291             print "cvt-utf8: --build expects two filename arguments"
 292             sys.exit(1)
 293         infile = open(args[1], "r")
 294         s = infile.read(1)
 295         # Unihan.txt starts with a hash. If this file starts with a
 296         # P, we assume it's a zip file ("PK").
 297         if s == "P":
 298             infile = zip_untangler(infile, s)
 299             s = ""
 300         outfile = args[2]
 301     else:
 302         if len(args) != 2:
 303             print "cvt-utf8: --fetch-build-unihan expects one filename argument"
 304             sys.exit(1)
 305         import urllib
 306         infile = urllib.urlopen("ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip")
 307         # We know this one is zipped.
 308         infile = zip_untangler(infile, "")
 309         outfile = args[1]
 310         s = ""
 311     # Now build the database.
 312     if outfile[-3:] == ".db":
 313         print "cvt-utf8: warning: you should not append .db to db name"
 314
 315     db = anydbm.open(outfile, "n")
 316     while 1:
 317         s = s + infile.readline()
 318         if s == "": break
 319         while s[-1:] == "\r" or s[-1:] == "\n":
 320             s = s[:-1]
 321         sa = string.split(s, "\t")
 322         if len(sa) == 3 and sa[1] == "kDefinition" and sa[0][:2] == "U+":
 323             db[sa[0][2:]] = sa[2]
 324         s = ""
 325     db.close()
 326     sys.exit(0)
 327
 328 locations = []
 329 locations.append("/usr/share/unicode/unicode")
 330 locations.append("/usr/lib/unicode/unicode")
 331 locations.append("/usr/local/share/unicode/unicode")
 332 locations.append("/usr/local/lib/unicode/unicode")
 333 locations.append(os.environ["HOME"] + "/share/unicode/unicode")
 334 locations.append(os.environ["HOME"] + "/lib/unicode/unicode")
 335
 336 for loc in locations:
 337     try:
 338         db = anydbm.open(loc, "r")
 339     except IOError:
 340         db = None
 341     except anydbm.error:
 342         db = None
 343     if db != None:
 344         break
 345 if han_translations:
 346     i = string.rfind(loc, "/")
 347     assert i >= 0
 348     hanloc = loc[:i+1] + "unihan"
 349     handb = anydbm.open(hanloc, "r")
 350     # this has been explicitly required, so we don't squelch exceptions
 351
 352 if args[0] == "--test":
 353     do(["CE","BA","E1","BD","B9","CF","83","CE","BC","CE","B5"])
 354     do(["00"])
 355     do(["C2","80"])
 356     do(["E0","A0","80"])
 357     do(["F0","90","80","80"])
 358     do(["F8","88","80","80","80"])
 359     do(["FC","84","80","80","80","80"])
 360     do(["7F"])
 361     do(["DF","BF"])
 362     do(["EF","BF","BF"])
 363     do(["F7","BF","BF","BF"])
 364     do(["FB","BF","BF","BF","BF"])
 365     do(["FD","BF","BF","BF","BF","BF"])
 366     do(["ED","9F","BF"])
 367     do(["EE","80","80"])
 368     do(["EF","BF","BD"])
 369     do(["F4","8F","BF","BF"])
 370     do(["F4","90","80","80"])
 371     do(["80"])
 372     do(["BF"])
 373     do(["80","BF"])
 374     do(["80","BF","80"])
 375     do(["80","BF","80","BF"])
 376     do(["80","BF","80","BF","80"])
 377     do(["80","BF","80","BF","80","BF"])
 378     do(["80","BF","80","BF","80","BF","80"])
 379     do(["80","81","82","83","84","85","86","87",
 380     "88","89","8A","8B","8C","8D","8E","8F",
 381     "90","91","92","93","94","95","96","97",
 382     "98","99","9A","9B","9C","9D","9E","9F",
 383     "A0","A1","A2","A3","A4","A5","A6","A7",
 384     "A8","A9","AA","AB","AC","AD","AE","AF",
 385     "B0","B1","B2","B3","B4","B5","B6","B7",
 386     "B8","B9","BA","BB","BC","BD","BE","BF"])
 387     do(["C0","20","C1","20","C2","20","C3","20",
 388     "C4","20","C5","20","C6","20","C7","20",
 389     "C8","20","C9","20","CA","20","CB","20",
 390     "CC","20","CD","20","CE","20","CF","20",
 391     "D0","20","D1","20","D2","20","D3","20",
 392     "D4","20","D5","20","D6","20","D7","20",
 393     "D8","20","D9","20","DA","20","DB","20",
 394     "DC","20","DD","20","DE","20","DF","20"])
 395     do(["E0","20","E1","20","E2","20","E3","20",
 396     "E4","20","E5","20","E6","20","E7","20",
 397     "E8","20","E9","20","EA","20","EB","20",
 398     "EC","20","ED","20","EE","20","EF","20"])
 399     do(["F0","20","F1","20","F2","20","F3","20",
 400     "F4","20","F5","20","F6","20","F7","20"])
 401     do(["F8","20","F9","20","FA","20","FB","20"])
 402     do(["FC","20","FD","20"])
 403     do(["C0"])
 404     do(["E0","80"])
 405     do(["F0","80","80"])
 406     do(["F8","80","80","80"])
 407     do(["FC","80","80","80","80"])
 408     do(["DF"])
 409     do(["EF","BF"])
 410     do(["F7","BF","BF"])
 411     do(["FB","BF","BF","BF"])
 412     do(["FD","BF","BF","BF","BF"])
 413     do(["C0","E0","80","F0","80","80","F8","80",
 414     "80","80","FC","80","80","80","80",
 415     "DF","EF","BF","F7","BF","BF","FB",
 416     "BF","BF","BF","FD","BF","BF","BF","BF"])
 417     do(["FE"])
 418     do(["FF"])
 419     do(["FE","FE","FF","FF"])
 420     do(["C0","AF"])
 421     do(["E0","80","AF"])
 422     do(["F0","80","80","AF"])
 423     do(["F8","80","80","80","AF"])
 424     do(["FC","80","80","80","80","AF"])
 425     do(["C1","BF"])
 426     do(["E0","9F","BF"])
 427     do(["F0","8F","BF","BF"])
 428     do(["F8","87","BF","BF","BF"])
 429     do(["FC","83","BF","BF","BF","BF"])
 430     do(["C0","80"])
 431     do(["E0","80","80"])
 432     do(["F0","80","80","80"])
 433     do(["F8","80","80","80","80"])
 434     do(["FC","80","80","80","80","80"])
 435     do(["ED","A0","80"])
 436     do(["ED","AD","BF"])
 437     do(["ED","AE","80"])
 438     do(["ED","AF","BF"])
 439     do(["ED","B0","80"])
 440     do(["ED","BE","80"])
 441     do(["ED","BF","BF"])
 442     do(["ED","A0","80","ED","B0","80"])
 443     do(["ED","A0","80","ED","BF","BF"])
 444     do(["ED","AD","BF","ED","B0","80"])
 445     do(["ED","AD","BF","ED","BF","BF"])
 446     do(["ED","AE","80","ED","B0","80"])
 447     do(["ED","AE","80","ED","BF","BF"])
 448     do(["ED","AF","BF","ED","B0","80"])
 449     do(["ED","AF","BF","ED","BF","8F"])
 450     do(["EF","BF","BE"])
 451     do(["EF","BF","BF"])
 452 elif args[0] == "--input" or args[0] == "-i":
 453     def getchar():
 454         s = sys.stdin.read(1)
 455         if s == "":
 456             return None
 457         return ord(s) & 0xFF   # ensure it isn't negative
 458     process_utf8(getchar)
 459 else:
 460     do(args)