mdw@git.distorted.org.uk Git - sgt/utils/blob - cvt-utf8/cvt-utf8

   1 #!/usr/bin/env python
   2
   3 import sys
   4 import string
   5 import os
   6 import anydbm
   7 import zlib
   8
   9 class zip_untangler:
  10     def __init__(self, file, datasofar):
  11         self.file = file
  12         assert len(datasofar) < 30
  13         self.header = datasofar
  14         self.data = ""
  15         self.dataleft = None
  16         self.decompress = zlib.decompressobj()
  17         # Zlib header bytes, expected by decompress obj but not
  18         # present in zip file
  19         self.decompress.decompress("\x78\x9c")
  20
  21     def readline(self):
  22         if self.dataleft == None:
  23             while len(self.header) < 30:
  24                 s = self.file.read(30 - len(self.header))
  25                 assert s != ""
  26                 self.header = self.header + s
  27             # Name length and extra length.
  28             namelen = 256 * ord(self.header[27]) + ord(self.header[26])
  29             extralen = 256 * ord(self.header[29]) + ord(self.header[28])
  30             while len(self.header) < 30 + namelen + extralen:
  31                 s = self.file.read(30 + namelen + extralen - len(self.header))
  32                 assert s != ""
  33                 self.header = self.header + s
  34             self.dataleft = \
  35             256 * (256 * (256 * ord(self.header[21]) + ord(self.header[20])) \
  36             + ord(self.header[19])) + ord(self.header[18])
  37         k = string.find(self.data, "\n")
  38         while k < 0:
  39             rlen = self.dataleft
  40             if rlen > 4096: rlen = 4096
  41             if rlen == 0: break
  42             d = self.file.read(rlen)
  43             if d == "": break
  44             self.dataleft = self.dataleft - rlen
  45             self.data = self.data + self.decompress.decompress(d)
  46             k = string.find(self.data, "\n")
  47         if k < 0:
  48             ret = self.data
  49             self.data = ""
  50             return ret
  51         else:
  52             ret = self.data[:k+1]
  53             self.data = self.data[k+1:]
  54             return ret
  55
  56 def hexstr(x):
  57     s = hex(x)
  58     if s[-1:] == "L" or s[-1:] == "l":
  59         s = s[:-1]
  60     if s[:2] == "0x" or s[:2] == "0X":
  61         s = s[2:]
  62     return s
  63
  64 def charname(x):
  65     if db:
  66         key = hexstr(x)
  67         while len(key) < 4: key = "0" + key
  68         key = string.upper(key)
  69         if han_translations:
  70             try:
  71                 value = handb[key]
  72                 return "<han> " + value
  73             except KeyError:
  74                 pass
  75         try:
  76             value = db[key]
  77             return string.split(value, ";")[1]
  78         except KeyError:
  79             return "<no name available>"
  80     else:
  81         return ""
  82
  83 def output(char, bytes, errors):
  84     if output_analysis:
  85         if char == -1:
  86             s = "           "
  87         else:
  88             s = "U-%08X " % char
  89         for i in bytes:
  90             s = s + " %02X" % i
  91         for i in range(6-len(bytes)):
  92             s = s + "   "
  93
  94         if char == -1:
  95             name = ""
  96         else:
  97             name = charname(char)
  98         if name != "":
  99             s = s + " " + name
 100         s = s + errors
 101         print s
 102     else:
 103         if char == -1 or errors != "":
 104             # problem chars become U+FFFD REPLACEMENT CHARACTER
 105             sys.stdout.write("\xEF\xBF\xBD")
 106         else:
 107             for i in bytes:
 108                 sys.stdout.write(chr(i))
 109
 110 def process_ucs(x, bytes=[], errors=""):
 111     if x < 0x80:
 112         utf8 = [x]
 113         realbytes = 1
 114     else:
 115         if x < 0x800:
 116             tmp = (0xC0, 1)
 117         elif x < 0x10000:
 118             tmp = (0xE0, 2)
 119         elif x < 0x200000:
 120             tmp = (0xF0, 3)
 121         elif x < 0x4000000:
 122             tmp = (0xF8, 4)
 123         else:
 124             assert x < 0x80000000L
 125             tmp = (0xFC, 5)
 126         realbytes = tmp[1] + 1
 127         utf8 = [tmp[0] + (x >> (6*tmp[1]))]
 128         for i in range(tmp[1]-1, -1, -1):
 129             utf8.append(0x80 + (0x3F & (x >> (i*6))))
 130
 131     if bytes != [] and len(bytes) > realbytes:
 132         errors = errors + " (overlong form of"
 133         for i in utf8:
 134             errors = errors + " %02X" % i
 135         errors = errors + ")"
 136         utf8 = bytes
 137     if x >= 0xD800 and x <= 0xDFFF:
 138         errors = errors + " (surrogate)"
 139     if x >= 0xFFFE and x <= 0xFFFF:
 140         errors = errors + " (invalid char)"
 141
 142     output(x, utf8, errors)
 143
 144 def process_utf8(next):
 145     c = next()
 146     while c != None:
 147         char = [c]
 148         i = c
 149         if i < 0x80:
 150             process_ucs(i) # single-byte char
 151             c = next()
 152         elif i == 0xfe or i == 0xff:
 153             output(-1, char, " (invalid UTF-8 byte)")
 154             c = next()
 155         elif i >= 0x80 and i <= 0xbf:
 156             output(-1, char, " (unexpected continuation byte)")
 157             c = next()
 158         else:
 159             if i >= 0xC0 and i <= 0xDF:
 160                 acc = i &~ 0xC0
 161                 cbytes = 1
 162             elif i >= 0xE0 and i <= 0xEF:
 163                 acc = i &~ 0xE0
 164                 cbytes = 2
 165             elif i >= 0xF0 and i <= 0xF7:
 166                 acc = i &~ 0xF0
 167                 cbytes = 3
 168             elif i >= 0xF8 and i <= 0xFB:
 169                 acc = i &~ 0xF8
 170                 cbytes = 4
 171             elif i >= 0xFC and i <= 0xFD:
 172                 acc = i &~ 0xFC
 173                 cbytes = 5
 174             gotone = 0
 175             while cbytes > 0:
 176                 c = next()
 177                 if c == None or c < 0x80 or c > 0xBF:
 178                     gotone = 1
 179                     break
 180                 char.append(c)
 181                 acc = (acc << 6) + (c & 0x3F)
 182                 cbytes = cbytes - 1
 183             if not gotone:
 184                 c = next()
 185             if cbytes > 0:
 186                 output(-1, char, " (incomplete sequence)")
 187             else:
 188                 process_ucs(acc, char)
 189
 190 def do(args):
 191     # Class to turn a list into a callable object that returns one
 192     # element at a time.
 193     class liststepper:
 194         def __init__(self, list):
 195             self.list = list
 196             self.index = 0
 197         def __call__(self):
 198             if self.index >= len(self.list):
 199                 return None
 200             ret = self.list[self.index]
 201             self.index = self.index + 1
 202             return ret
 203
 204     list = []
 205     for arg in args:
 206         if string.upper(arg[0]) == "U":
 207             if len(list) > 0:
 208                 process_utf8(liststepper(list))
 209                 list = []
 210             assert arg[1] == "+" or arg[1] == "-"
 211             process_ucs(string.atoi(arg[2:], 16))
 212         else:
 213             list.append(string.atoi(arg, 16))
 214
 215     if len(list) > 0:
 216         process_utf8(liststepper(list))
 217
 218 def usage(arg):
 219     print "Usage: cvt-utf8 [flags] <hex UTF-8 bytes and/or U+codepoints>"
 220     print "  e.g. cvt-utf8 e2 82 ac"
 221     print "    or cvt-utf8 U+20ac"
 222     print "    or cvt-utf8 U-10ffff"
 223     print ""
 224     print "Flags: -o or --output        just output well-formed UTF-8 instead of"
 225     print "                             an analysis of the input data"
 226     print "       -h or --han           also give Han definitions from unihan db"
 227     print ""
 228     print "Also:  cvt-utf8 --test       run Markus Kuhn's decoder stress tests" #'
 229     print "       cvt-utf8 --input (or -i)"
 230     print "                             read, analyse and decode UTF-8 from stdin"
 231     if arg == "--help-admin":
 232         print "       cvt-utf8 --help       display user help text"
 233         print "       cvt-utf8 --help-admin display admin help text (this one)"
 234         print "       cvt-utf8 --build <infile> <outfile>"
 235         print "                             convert UnicodeData.txt to unicode db"
 236         print "       cvt-utf8 --build-unihan <infile> <outfile>"
 237         print "                             convert Unihan.txt to unihan db"
 238         print "       cvt-utf8 --fetch-build <outfile>"
 239         print "                             "+\
 240         "build unicode db by download from unicode.org"
 241         print "       cvt-utf8 --fetch-build-unihan <outfile>"
 242         print "                             "+\
 243         "build Unihan db by download from unicode.org"
 244     else:
 245         print "       cvt-utf8 --help       display this help text"
 246         print "       cvt-utf8 --help-admin display admin help text"
 247     print "       cvt-utf8 --version    report version number"
 248     print "       cvt-utf8 --licence    display (MIT) licence text"
 249
 250 def licence():
 251     print "cvt-utf8 is copyright 2002-2004 Simon Tatham."
 252     print ""
 253     print "Permission is hereby granted, free of charge, to any person"
 254     print "obtaining a copy of this software and associated documentation files"
 255     print "(the \"Software\"), to deal in the Software without restriction,"
 256     print "including without limitation the rights to use, copy, modify, merge,"
 257     print "publish, distribute, sublicense, and/or sell copies of the Software,"
 258     print "and to permit persons to whom the Software is furnished to do so,"
 259     print "subject to the following conditions:"
 260     print ""
 261     print "The above copyright notice and this permission notice shall be"
 262     print "included in all copies or substantial portions of the Software."
 263     print ""
 264     print "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND,"
 265     print "EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF"
 266     print "MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND"
 267     print "NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS"
 268     print "BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN"
 269     print "ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN"
 270     print "CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE"
 271     print "SOFTWARE."
 272
 273 def version():
 274     rev = "$Revision$"
 275     rev = string.replace(rev, " ", "")
 276     rev = string.replace(rev, "$", "")
 277     revs = string.split(rev, ":")
 278     if len(revs) > 1:
 279         print "cvt-utf8 revision %s" % revs[1]
 280     else:
 281         print "cvt-utf8: unknown version"
 282
 283 args = sys.argv[1:]
 284 output_analysis = 1
 285 han_translations = 0
 286 mode = "cmdline"
 287
 288 if args == []:
 289     usage("")
 290     sys.exit(0)
 291
 292 while len(args) > 0 and args[0][:1] == "-":
 293     if args[0] == "--help" or args[0] == "--help-admin":
 294         usage(args[0])
 295         sys.exit(0)
 296
 297     elif args[0] == "--licence" or args[0] == "--license":
 298         licence()
 299         sys.exit(0)
 300
 301     elif args[0] == "--version":
 302         version()
 303         sys.exit(0)
 304
 305     elif args[0] == "-o" or args[0] == "--output":
 306         output_analysis = 0
 307         args = args[1:]
 308
 309     elif args[0] == "-h" or args[0] == "--han":
 310         han_translations = 1
 311         args = args[1:]
 312
 313     elif args[0] == "--build" or args[0] == "--fetch-build":
 314         if args[0] == "--build":
 315             if len(args) != 3:
 316                 print "cvt-utf8: --build expects two filename arguments"
 317                 sys.exit(1)
 318             infile = open(args[1], "r")
 319             outfile = args[2]
 320         else:
 321             if len(args) != 2:
 322                 print "cvt-utf8: --fetch-build expects one filename argument"
 323                 sys.exit(1)
 324             import urllib
 325             infile = urllib.urlopen("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
 326             outfile = args[1]
 327         # Now build the database.
 328         if outfile[-3:] == ".db":
 329             print "cvt-utf8: warning: you should not append .db to db name"
 330
 331         db = anydbm.open(outfile, "n")
 332         while 1:
 333             s = infile.readline()
 334             if s == "": break
 335             ss = string.split(s, ";")[0]
 336             db[ss] = s
 337         db.close()
 338         sys.exit(0)
 339
 340     elif args[0] == "--build-unihan" or args[0] == "--fetch-build-unihan":
 341         if args[0] == "--build-unihan":
 342             if len(args) != 3:
 343                 print "cvt-utf8: --build expects two filename arguments"
 344                 sys.exit(1)
 345             infile = open(args[1], "r")
 346             s = infile.read(1)
 347             # Unihan.txt starts with a hash. If this file starts with a
 348             # P, we assume it's a zip file ("PK").
 349             if s == "P":
 350                 infile = zip_untangler(infile, s)
 351                 s = ""
 352             outfile = args[2]
 353         else:
 354             if len(args) != 2:
 355                 print "cvt-utf8: --fetch-build-unihan expects one filename argument"
 356                 sys.exit(1)
 357             import urllib
 358             infile = urllib.urlopen("ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip")
 359             # We know this one is zipped.
 360             infile = zip_untangler(infile, "")
 361             outfile = args[1]
 362             s = ""
 363         # Now build the database.
 364         if outfile[-3:] == ".db":
 365             print "cvt-utf8: warning: you should not append .db to db name"
 366
 367         db = anydbm.open(outfile, "n")
 368         while 1:
 369             s = s + infile.readline()
 370             if s == "": break
 371             while s[-1:] == "\r" or s[-1:] == "\n":
 372                 s = s[:-1]
 373             sa = string.split(s, "\t")
 374             if len(sa) == 3 and sa[1] == "kDefinition" and sa[0][:2] == "U+":
 375                 db[sa[0][2:]] = sa[2]
 376             s = ""
 377         db.close()
 378         sys.exit(0)
 379
 380     elif args[0] == "--test":
 381         mode = "test"
 382         args = args[1:]
 383
 384     elif args[0] == "--input" or args[0] == "-i":
 385         mode = "input"
 386         args = args[1:]
 387
 388     else:
 389         sys.stderr.write("cvt-utf8: unknown argument '%s'" % args[0])
 390         sys.exit(1)
 391
 392 locations = []
 393 locations.append("/usr/share/unicode/unicode")
 394 locations.append("/usr/lib/unicode/unicode")
 395 locations.append("/usr/local/share/unicode/unicode")
 396 locations.append("/usr/local/lib/unicode/unicode")
 397 locations.append(os.environ["HOME"] + "/share/unicode/unicode")
 398 locations.append(os.environ["HOME"] + "/lib/unicode/unicode")
 399
 400 for loc in locations:
 401     try:
 402         db = anydbm.open(loc, "r")
 403     except IOError:
 404         db = None
 405     except anydbm.error:
 406         db = None
 407     if db != None:
 408         break
 409 if han_translations:
 410     i = string.rfind(loc, "/")
 411     assert i >= 0
 412     hanloc = loc[:i+1] + "unihan"
 413     handb = anydbm.open(hanloc, "r")
 414     # this has been explicitly required, so we don't squelch exceptions
 415
 416 if mode == "test":
 417     do(["CE","BA","E1","BD","B9","CF","83","CE","BC","CE","B5"])
 418     do(["00"])
 419     do(["C2","80"])
 420     do(["E0","A0","80"])
 421     do(["F0","90","80","80"])
 422     do(["F8","88","80","80","80"])
 423     do(["FC","84","80","80","80","80"])
 424     do(["7F"])
 425     do(["DF","BF"])
 426     do(["EF","BF","BF"])
 427     do(["F7","BF","BF","BF"])
 428     do(["FB","BF","BF","BF","BF"])
 429     do(["FD","BF","BF","BF","BF","BF"])
 430     do(["ED","9F","BF"])
 431     do(["EE","80","80"])
 432     do(["EF","BF","BD"])
 433     do(["F4","8F","BF","BF"])
 434     do(["F4","90","80","80"])
 435     do(["80"])
 436     do(["BF"])
 437     do(["80","BF"])
 438     do(["80","BF","80"])
 439     do(["80","BF","80","BF"])
 440     do(["80","BF","80","BF","80"])
 441     do(["80","BF","80","BF","80","BF"])
 442     do(["80","BF","80","BF","80","BF","80"])
 443     do(["80","81","82","83","84","85","86","87",
 444     "88","89","8A","8B","8C","8D","8E","8F",
 445     "90","91","92","93","94","95","96","97",
 446     "98","99","9A","9B","9C","9D","9E","9F",
 447     "A0","A1","A2","A3","A4","A5","A6","A7",
 448     "A8","A9","AA","AB","AC","AD","AE","AF",
 449     "B0","B1","B2","B3","B4","B5","B6","B7",
 450     "B8","B9","BA","BB","BC","BD","BE","BF"])
 451     do(["C0","20","C1","20","C2","20","C3","20",
 452     "C4","20","C5","20","C6","20","C7","20",
 453     "C8","20","C9","20","CA","20","CB","20",
 454     "CC","20","CD","20","CE","20","CF","20",
 455     "D0","20","D1","20","D2","20","D3","20",
 456     "D4","20","D5","20","D6","20","D7","20",
 457     "D8","20","D9","20","DA","20","DB","20",
 458     "DC","20","DD","20","DE","20","DF","20"])
 459     do(["E0","20","E1","20","E2","20","E3","20",
 460     "E4","20","E5","20","E6","20","E7","20",
 461     "E8","20","E9","20","EA","20","EB","20",
 462     "EC","20","ED","20","EE","20","EF","20"])
 463     do(["F0","20","F1","20","F2","20","F3","20",
 464     "F4","20","F5","20","F6","20","F7","20"])
 465     do(["F8","20","F9","20","FA","20","FB","20"])
 466     do(["FC","20","FD","20"])
 467     do(["C0"])
 468     do(["E0","80"])
 469     do(["F0","80","80"])
 470     do(["F8","80","80","80"])
 471     do(["FC","80","80","80","80"])
 472     do(["DF"])
 473     do(["EF","BF"])
 474     do(["F7","BF","BF"])
 475     do(["FB","BF","BF","BF"])
 476     do(["FD","BF","BF","BF","BF"])
 477     do(["C0","E0","80","F0","80","80","F8","80",
 478     "80","80","FC","80","80","80","80",
 479     "DF","EF","BF","F7","BF","BF","FB",
 480     "BF","BF","BF","FD","BF","BF","BF","BF"])
 481     do(["FE"])
 482     do(["FF"])
 483     do(["FE","FE","FF","FF"])
 484     do(["C0","AF"])
 485     do(["E0","80","AF"])
 486     do(["F0","80","80","AF"])
 487     do(["F8","80","80","80","AF"])
 488     do(["FC","80","80","80","80","AF"])
 489     do(["C1","BF"])
 490     do(["E0","9F","BF"])
 491     do(["F0","8F","BF","BF"])
 492     do(["F8","87","BF","BF","BF"])
 493     do(["FC","83","BF","BF","BF","BF"])
 494     do(["C0","80"])
 495     do(["E0","80","80"])
 496     do(["F0","80","80","80"])
 497     do(["F8","80","80","80","80"])
 498     do(["FC","80","80","80","80","80"])
 499     do(["ED","A0","80"])
 500     do(["ED","AD","BF"])
 501     do(["ED","AE","80"])
 502     do(["ED","AF","BF"])
 503     do(["ED","B0","80"])
 504     do(["ED","BE","80"])
 505     do(["ED","BF","BF"])
 506     do(["ED","A0","80","ED","B0","80"])
 507     do(["ED","A0","80","ED","BF","BF"])
 508     do(["ED","AD","BF","ED","B0","80"])
 509     do(["ED","AD","BF","ED","BF","BF"])
 510     do(["ED","AE","80","ED","B0","80"])
 511     do(["ED","AE","80","ED","BF","BF"])
 512     do(["ED","AF","BF","ED","B0","80"])
 513     do(["ED","AF","BF","ED","BF","8F"])
 514     do(["EF","BF","BE"])
 515     do(["EF","BF","BF"])
 516 elif mode == "input":
 517     def getchar():
 518         s = sys.stdin.read(1)
 519         if s == "":
 520             return None
 521         return ord(s) & 0xFF   # ensure it isn't negative
 522     process_utf8(getchar)
 523 else:
 524     do(args)