mdw@git.distorted.org.uk Git - sgt/utils/blob - cvt-utf8/cvt-utf8

   1 #!/usr/bin/env python
   2
   3 import sys
   4 import string
   5 import os
   6 import anydbm
   7 import zlib
   8
   9 class zip_untangler:
  10     def __init__(self, file, datasofar):
  11         self.file = file
  12         assert len(datasofar) < 30
  13         self.header = datasofar
  14         self.data = ""
  15         self.dataleft = None
  16         self.decompress = zlib.decompressobj()
  17         # Zlib header bytes, expected by decompress obj but not
  18         # present in zip file
  19         self.decompress.decompress("\x78\x9c")
  20
  21     def readline(self):
  22         if self.dataleft == None:
  23             while len(self.header) < 30:
  24                 s = self.file.read(30 - len(self.header))
  25                 assert s != ""
  26                 self.header = self.header + s
  27             # Name length and extra length.
  28             namelen = 256 * ord(self.header[27]) + ord(self.header[26])
  29             extralen = 256 * ord(self.header[29]) + ord(self.header[28])
  30             while len(self.header) < 30 + namelen + extralen:
  31                 s = self.file.read(30 + namelen + extralen - len(self.header))
  32                 assert s != ""
  33                 self.header = self.header + s
  34             self.dataleft = \
  35             256 * (256 * (256 * ord(self.header[21]) + ord(self.header[20])) \
  36             + ord(self.header[19])) + ord(self.header[18])
  37         k = string.find(self.data, "\n")
  38         while k < 0:
  39             rlen = self.dataleft
  40             if rlen > 4096: rlen = 4096
  41             if rlen == 0: break
  42             d = self.file.read(rlen)
  43             if d == "": break
  44             self.dataleft = self.dataleft - rlen
  45             self.data = self.data + self.decompress.decompress(d)
  46             k = string.find(self.data, "\n")
  47         if k < 0:
  48             ret = self.data
  49             self.data = ""
  50             return ret
  51         else:
  52             ret = self.data[:k+1]
  53             self.data = self.data[k+1:]
  54             return ret
  55
  56 def hexstr(x):
  57     s = hex(x)
  58     if s[-1:] == "L" or s[-1:] == "l":
  59         s = s[:-1]
  60     if s[:2] == "0x" or s[:2] == "0X":
  61         s = s[2:]
  62     return s
  63
  64 def charname(x):
  65     if db is not None:
  66         key = hexstr(x)
  67         while len(key) < 4: key = "0" + key
  68         key = string.upper(key)
  69         if han_translations:
  70             try:
  71                 value = handb[key]
  72                 return "<han> " + value
  73             except KeyError:
  74                 pass
  75         try:
  76             value = db[key]
  77             return string.split(value, ";")[1]
  78         except KeyError:
  79             return "<no name available>"
  80     else:
  81         return ""
  82
  83 def output(char, bytes, errors):
  84     if output_analysis:
  85         if char == -1:
  86             s = "           "
  87         else:
  88             s = "U-%08X " % char
  89         for i in bytes:
  90             s = s + " %02X" % i
  91         for i in range(6-len(bytes)):
  92             s = s + "   "
  93
  94         if char == -1:
  95             name = ""
  96         else:
  97             name = charname(char)
  98         if name != "":
  99             s = s + " " + name
 100         s = s + errors
 101         print s
 102     else:
 103         if char == -1 or errors != "":
 104             # problem chars become U+FFFD REPLACEMENT CHARACTER
 105             sys.stdout.write("\xEF\xBF\xBD")
 106         else:
 107             for i in bytes:
 108                 sys.stdout.write(chr(i))
 109
 110 def process_ucs(x, bytes=[], errors=""):
 111     if x < 0x80:
 112         utf8 = [x]
 113         realbytes = 1
 114     else:
 115         if x < 0x800:
 116             tmp = (0xC0, 1)
 117         elif x < 0x10000:
 118             tmp = (0xE0, 2)
 119         elif x < 0x200000:
 120             tmp = (0xF0, 3)
 121         elif x < 0x4000000:
 122             tmp = (0xF8, 4)
 123         else:
 124             assert x < 0x80000000L
 125             tmp = (0xFC, 5)
 126         realbytes = tmp[1] + 1
 127         utf8 = [tmp[0] + (x >> (6*tmp[1]))]
 128         for i in range(tmp[1]-1, -1, -1):
 129             utf8.append(0x80 + (0x3F & (x >> (i*6))))
 130
 131     if bytes != [] and len(bytes) > realbytes:
 132         errors = errors + " (overlong form of"
 133         for i in utf8:
 134             errors = errors + " %02X" % i
 135         errors = errors + ")"
 136         utf8 = bytes
 137     if x >= 0xD800 and x <= 0xDFFF:
 138         errors = errors + " (surrogate)"
 139     if x >= 0xFFFE and x <= 0xFFFF:
 140         errors = errors + " (invalid char)"
 141
 142     output(x, utf8, errors)
 143
 144 def process_utf8(next):
 145     c = next()
 146     while c != None:
 147         char = [c]
 148         i = c
 149         if i < 0x80:
 150             process_ucs(i) # single-byte char
 151             c = next()
 152         elif i == 0xfe or i == 0xff:
 153             output(-1, char, " (invalid UTF-8 byte)")
 154             c = next()
 155         elif i >= 0x80 and i <= 0xbf:
 156             output(-1, char, " (unexpected continuation byte)")
 157             c = next()
 158         else:
 159             if i >= 0xC0 and i <= 0xDF:
 160                 acc = i &~ 0xC0
 161                 cbytes = 1
 162             elif i >= 0xE0 and i <= 0xEF:
 163                 acc = i &~ 0xE0
 164                 cbytes = 2
 165             elif i >= 0xF0 and i <= 0xF7:
 166                 acc = i &~ 0xF0
 167                 cbytes = 3
 168             elif i >= 0xF8 and i <= 0xFB:
 169                 acc = i &~ 0xF8
 170                 cbytes = 4
 171             elif i >= 0xFC and i <= 0xFD:
 172                 acc = i &~ 0xFC
 173                 cbytes = 5
 174             gotone = 0
 175             while cbytes > 0:
 176                 c = next()
 177                 if c == None or c < 0x80 or c > 0xBF:
 178                     gotone = 1
 179                     break
 180                 char.append(c)
 181                 acc = (acc << 6) + (c & 0x3F)
 182                 cbytes = cbytes - 1
 183             if cbytes > 0:
 184                 output(-1, char, " (incomplete sequence)")
 185             else:
 186                 process_ucs(acc, char)
 187             if not gotone:
 188                 c = next()
 189
 190 def do(args):
 191     # Class to turn a list into a callable object that returns one
 192     # element at a time.
 193     class liststepper:
 194         def __init__(self, list):
 195             self.list = list
 196             self.index = 0
 197         def __call__(self):
 198             if self.index >= len(self.list):
 199                 return None
 200             ret = self.list[self.index]
 201             self.index = self.index + 1
 202             return ret
 203
 204     list = []
 205     for arg in args:
 206         got = ('none')
 207         if string.upper(arg[0]) == "U":
 208             assert arg[1] == "+" or arg[1] == "-"
 209             got = ('ucs', string.atoi(arg[2:], 16))
 210         elif arg[:2] == "&#":
 211             # SGML character entity. Either &# followed by a
 212             # number, or &#x followed by a hex number.
 213             s = arg
 214             if s[-1:] == ";": s = s[:-1]
 215             if string.upper(s[:3]) == "&#X":
 216                 got = ('ucs', string.atoi(s[3:], 16))
 217             else:
 218                 got = ('ucs', string.atoi(s[2:], 10))
 219         else:
 220             got = ('utf8', string.atoi(arg, 16))
 221
 222         if got[0] == 'utf8':
 223             list.append(got[1])
 224         elif got[0] == 'ucs':
 225             if len(list) > 0:
 226                 process_utf8(liststepper(list))
 227                 list = []
 228             process_ucs(got[1])
 229
 230     if len(list) > 0:
 231         process_utf8(liststepper(list))
 232
 233 def usage(arg):
 234     print "usage: cvt-utf8 [flags] <hex UTF-8 bytes, U+codepoints, SGML entities>"
 235     print "  e.g. cvt-utf8 e2 82 ac"
 236     print "    or cvt-utf8 U+20ac"
 237     print "    or cvt-utf8 U-10ffff"
 238     print "    or cvt-utf8 '&#8211;'"
 239     print ""
 240     print "where: -o or --output        just output well-formed UTF-8 instead of"
 241     print "                             an analysis of the input data"
 242     print "       -h or --han           also give Han definitions from unihan db"
 243     print ""
 244     print " also: cvt-utf8 --test       run Markus Kuhn's decoder stress tests" #'
 245     print "       cvt-utf8 --input (or -i)"
 246     print "                             read, analyse and decode UTF-8 from stdin"
 247     if arg == "--help-admin":
 248         print "       cvt-utf8 --help       display user help text"
 249         print "       cvt-utf8 --help-admin display admin help text (this one)"
 250         print "       cvt-utf8 --build <infile> <outfile>"
 251         print "                             convert UnicodeData.txt to unicode db"
 252         print "       cvt-utf8 --build-unihan <infile> <outfile>"
 253         print "                             convert Unihan.txt to unihan db"
 254         print "       cvt-utf8 --fetch-build <outfile>"
 255         print "                             "+\
 256         "build unicode db by download from unicode.org"
 257         print "       cvt-utf8 --fetch-build-unihan <outfile>"
 258         print "                             "+\
 259         "build Unihan db by download from unicode.org"
 260     else:
 261         print "       cvt-utf8 --help       display this help text"
 262         print "       cvt-utf8 --help-admin display admin help text"
 263     print "       cvt-utf8 --version    report version number"
 264     print "       cvt-utf8 --licence    display (MIT) licence text"
 265
 266 def licence():
 267     print "cvt-utf8 is copyright 2002-2004 Simon Tatham."
 268     print ""
 269     print "Permission is hereby granted, free of charge, to any person"
 270     print "obtaining a copy of this software and associated documentation files"
 271     print "(the \"Software\"), to deal in the Software without restriction,"
 272     print "including without limitation the rights to use, copy, modify, merge,"
 273     print "publish, distribute, sublicense, and/or sell copies of the Software,"
 274     print "and to permit persons to whom the Software is furnished to do so,"
 275     print "subject to the following conditions:"
 276     print ""
 277     print "The above copyright notice and this permission notice shall be"
 278     print "included in all copies or substantial portions of the Software."
 279     print ""
 280     print "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND,"
 281     print "EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF"
 282     print "MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND"
 283     print "NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS"
 284     print "BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN"
 285     print "ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN"
 286     print "CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE"
 287     print "SOFTWARE."
 288
 289 def version():
 290     rev = "$Revision$"
 291     rev = string.replace(rev, " ", "")
 292     rev = string.replace(rev, "$", "")
 293     revs = string.split(rev, ":")
 294     if len(revs) > 1:
 295         print "cvt-utf8 revision %s" % revs[1]
 296     else:
 297         print "cvt-utf8: unknown version"
 298
 299 args = sys.argv[1:]
 300 output_analysis = 1
 301 han_translations = 0
 302 mode = "cmdline"
 303
 304 if args == []:
 305     usage("")
 306     sys.exit(0)
 307
 308 while len(args) > 0 and args[0][:1] == "-":
 309     if args[0] == "--help" or args[0] == "--help-admin":
 310         usage(args[0])
 311         sys.exit(0)
 312
 313     elif args[0] == "--licence" or args[0] == "--license":
 314         licence()
 315         sys.exit(0)
 316
 317     elif args[0] == "--version":
 318         version()
 319         sys.exit(0)
 320
 321     elif args[0] == "-o" or args[0] == "--output":
 322         output_analysis = 0
 323         args = args[1:]
 324
 325     elif args[0] == "-h" or args[0] == "--han":
 326         han_translations = 1
 327         args = args[1:]
 328
 329     elif args[0] == "--build" or args[0] == "--fetch-build":
 330         if args[0] == "--build":
 331             if len(args) != 3:
 332                 print "cvt-utf8: --build expects two filename arguments"
 333                 sys.exit(1)
 334             infile = open(args[1], "r")
 335             outfile = args[2]
 336         else:
 337             if len(args) != 2:
 338                 print "cvt-utf8: --fetch-build expects one filename argument"
 339                 sys.exit(1)
 340             import urllib
 341             infile = urllib.urlopen("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
 342             outfile = args[1]
 343         # Now build the database.
 344         if outfile[-3:] == ".db":
 345             print "cvt-utf8: warning: you should not append .db to db name"
 346
 347         db = anydbm.open(outfile, "n")
 348         while 1:
 349             s = infile.readline()
 350             if s == "": break
 351             ss = string.split(s, ";")[0]
 352             db[ss] = s
 353         db.close()
 354         sys.exit(0)
 355
 356     elif args[0] == "--build-unihan" or args[0] == "--fetch-build-unihan":
 357         if args[0] == "--build-unihan":
 358             if len(args) != 3:
 359                 print "cvt-utf8: --build expects two filename arguments"
 360                 sys.exit(1)
 361             infile = open(args[1], "r")
 362             s = infile.read(1)
 363             # Unihan.txt starts with a hash. If this file starts with a
 364             # P, we assume it's a zip file ("PK").
 365             if s == "P":
 366                 infile = zip_untangler(infile, s)
 367                 s = ""
 368             outfile = args[2]
 369         else:
 370             if len(args) != 2:
 371                 print "cvt-utf8: --fetch-build-unihan expects one filename argument"
 372                 sys.exit(1)
 373             import urllib
 374             infile = urllib.urlopen("ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip")
 375             # We know this one is zipped.
 376             infile = zip_untangler(infile, "")
 377             outfile = args[1]
 378             s = ""
 379         # Now build the database.
 380         if outfile[-3:] == ".db":
 381             print "cvt-utf8: warning: you should not append .db to db name"
 382
 383         db = anydbm.open(outfile, "n")
 384         while 1:
 385             s = s + infile.readline()
 386             if s == "": break
 387             while s[-1:] == "\r" or s[-1:] == "\n":
 388                 s = s[:-1]
 389             sa = string.split(s, "\t")
 390             if len(sa) == 3 and sa[1] == "kDefinition" and sa[0][:2] == "U+":
 391                 db[sa[0][2:]] = sa[2]
 392             s = ""
 393         db.close()
 394         sys.exit(0)
 395
 396     elif args[0] == "--test":
 397         mode = "test"
 398         args = args[1:]
 399
 400     elif args[0] == "--input" or args[0] == "-i":
 401         mode = "input"
 402         args = args[1:]
 403
 404     else:
 405         sys.stderr.write("cvt-utf8: unknown argument '%s'" % args[0])
 406         sys.exit(1)
 407
 408 locations = []
 409 locations.append("/usr/share/unicode/unicode")
 410 locations.append("/usr/lib/unicode/unicode")
 411 locations.append("/usr/local/share/unicode/unicode")
 412 locations.append("/usr/local/lib/unicode/unicode")
 413 locations.append(os.environ["HOME"] + "/share/unicode/unicode")
 414 locations.append(os.environ["HOME"] + "/lib/unicode/unicode")
 415
 416 for loc in locations:
 417     try:
 418         db = anydbm.open(loc, "r")
 419     except IOError:
 420         db = None
 421     except anydbm.error:
 422         db = None
 423     if db != None:
 424         break
 425 if han_translations:
 426     i = string.rfind(loc, "/")
 427     assert i >= 0
 428     hanloc = loc[:i+1] + "unihan"
 429     handb = anydbm.open(hanloc, "r")
 430     # this has been explicitly required, so we don't squelch exceptions
 431
 432 if mode == "test":
 433     do(["CE","BA","E1","BD","B9","CF","83","CE","BC","CE","B5"])
 434     do(["00"])
 435     do(["C2","80"])
 436     do(["E0","A0","80"])
 437     do(["F0","90","80","80"])
 438     do(["F8","88","80","80","80"])
 439     do(["FC","84","80","80","80","80"])
 440     do(["7F"])
 441     do(["DF","BF"])
 442     do(["EF","BF","BF"])
 443     do(["F7","BF","BF","BF"])
 444     do(["FB","BF","BF","BF","BF"])
 445     do(["FD","BF","BF","BF","BF","BF"])
 446     do(["ED","9F","BF"])
 447     do(["EE","80","80"])
 448     do(["EF","BF","BD"])
 449     do(["F4","8F","BF","BF"])
 450     do(["F4","90","80","80"])
 451     do(["80"])
 452     do(["BF"])
 453     do(["80","BF"])
 454     do(["80","BF","80"])
 455     do(["80","BF","80","BF"])
 456     do(["80","BF","80","BF","80"])
 457     do(["80","BF","80","BF","80","BF"])
 458     do(["80","BF","80","BF","80","BF","80"])
 459     do(["80","81","82","83","84","85","86","87",
 460     "88","89","8A","8B","8C","8D","8E","8F",
 461     "90","91","92","93","94","95","96","97",
 462     "98","99","9A","9B","9C","9D","9E","9F",
 463     "A0","A1","A2","A3","A4","A5","A6","A7",
 464     "A8","A9","AA","AB","AC","AD","AE","AF",
 465     "B0","B1","B2","B3","B4","B5","B6","B7",
 466     "B8","B9","BA","BB","BC","BD","BE","BF"])
 467     do(["C0","20","C1","20","C2","20","C3","20",
 468     "C4","20","C5","20","C6","20","C7","20",
 469     "C8","20","C9","20","CA","20","CB","20",
 470     "CC","20","CD","20","CE","20","CF","20",
 471     "D0","20","D1","20","D2","20","D3","20",
 472     "D4","20","D5","20","D6","20","D7","20",
 473     "D8","20","D9","20","DA","20","DB","20",
 474     "DC","20","DD","20","DE","20","DF","20"])
 475     do(["E0","20","E1","20","E2","20","E3","20",
 476     "E4","20","E5","20","E6","20","E7","20",
 477     "E8","20","E9","20","EA","20","EB","20",
 478     "EC","20","ED","20","EE","20","EF","20"])
 479     do(["F0","20","F1","20","F2","20","F3","20",
 480     "F4","20","F5","20","F6","20","F7","20"])
 481     do(["F8","20","F9","20","FA","20","FB","20"])
 482     do(["FC","20","FD","20"])
 483     do(["C0"])
 484     do(["E0","80"])
 485     do(["F0","80","80"])
 486     do(["F8","80","80","80"])
 487     do(["FC","80","80","80","80"])
 488     do(["DF"])
 489     do(["EF","BF"])
 490     do(["F7","BF","BF"])
 491     do(["FB","BF","BF","BF"])
 492     do(["FD","BF","BF","BF","BF"])
 493     do(["C0","E0","80","F0","80","80","F8","80",
 494     "80","80","FC","80","80","80","80",
 495     "DF","EF","BF","F7","BF","BF","FB",
 496     "BF","BF","BF","FD","BF","BF","BF","BF"])
 497     do(["FE"])
 498     do(["FF"])
 499     do(["FE","FE","FF","FF"])
 500     do(["C0","AF"])
 501     do(["E0","80","AF"])
 502     do(["F0","80","80","AF"])
 503     do(["F8","80","80","80","AF"])
 504     do(["FC","80","80","80","80","AF"])
 505     do(["C1","BF"])
 506     do(["E0","9F","BF"])
 507     do(["F0","8F","BF","BF"])
 508     do(["F8","87","BF","BF","BF"])
 509     do(["FC","83","BF","BF","BF","BF"])
 510     do(["C0","80"])
 511     do(["E0","80","80"])
 512     do(["F0","80","80","80"])
 513     do(["F8","80","80","80","80"])
 514     do(["FC","80","80","80","80","80"])
 515     do(["ED","A0","80"])
 516     do(["ED","AD","BF"])
 517     do(["ED","AE","80"])
 518     do(["ED","AF","BF"])
 519     do(["ED","B0","80"])
 520     do(["ED","BE","80"])
 521     do(["ED","BF","BF"])
 522     do(["ED","A0","80","ED","B0","80"])
 523     do(["ED","A0","80","ED","BF","BF"])
 524     do(["ED","AD","BF","ED","B0","80"])
 525     do(["ED","AD","BF","ED","BF","BF"])
 526     do(["ED","AE","80","ED","B0","80"])
 527     do(["ED","AE","80","ED","BF","BF"])
 528     do(["ED","AF","BF","ED","B0","80"])
 529     do(["ED","AF","BF","ED","BF","8F"])
 530     do(["EF","BF","BE"])
 531     do(["EF","BF","BF"])
 532 elif mode == "input":
 533     def getchar():
 534         s = sys.stdin.read(1)
 535         if s == "":
 536             return None
 537         return ord(s) & 0xFF   # ensure it isn't negative
 538     process_utf8(getchar)
 539 else:
 540     do(args)