-#!/usr/bin/env python
+#!/usr/bin/env python
import sys
import string
def hexstr(x):
    """Return the hexadecimal digits of the integer x as a bare string.

    The "0x"/"0X" prefix produced by hex() is removed, as is the
    trailing "L"/"l" that Python 2 appends when x is a long, so that
    for example hexstr(0x20AC) == "20ac".
    """
    s = hex(x)
    # Python 2 renders long integers as e.g. "0x10ffffL".
    if s[-1:] in ("L", "l"):
        s = s[:-1]
    if s[:2] in ("0x", "0X"):
        s = s[2:]
    return s
def charname(x):
    """Return a descriptive name for the code point x.

    Uses module-level state set up by the main script:
      db               -- name database keyed by upper-case hex code
                          point (at least four digits), or None;
      han_translations -- true if Han definitions were requested;
      handb            -- the Han definition database (only consulted
                          when han_translations is true).

    Returns "<han> " plus the definition when a Han entry exists,
    otherwise the second ";"-separated field of the db record,
    "<no name available>" if db has no such key, and "" when there is
    no database at all.
    """
    if db is None:
        return ""

    # Keys are upper-case hex, zero-padded on the left to four digits.
    key = hexstr(x)
    while len(key) < 4:
        key = "0" + key
    key = key.upper()

    if han_translations:
        try:
            return "<han> " + handb[key]
        except KeyError:
            pass  # no Han definition; fall back to the ordinary name

    try:
        value = db[key]
    except KeyError:
        return "<no name available>"
    return value.split(";")[1]
def output(char, bytes, errors):
    """Report one decoded character, or one run of undecodable bytes.

    char   -- the code point, or -1 if the bytes did not yield one.
    bytes  -- list of integer byte values making up the sequence.
    errors -- diagnostic text to append ("" when nothing is wrong).

    When the module-level flag output_analysis is set, a single line
    is printed: the code point, the bytes in hex, the character name
    from charname(), and the diagnostics.  Otherwise the bytes are
    written to stdout unchanged, except that anything without a code
    point or with diagnostics becomes U+FFFD REPLACEMENT CHARACTER.
    """
    if output_analysis:
        if char == -1:
            # No code point: blank the 11 columns "U-%08X " would fill.
            s = " " * 11
        else:
            s = "U-%08X " % char
        for i in bytes:
            s = s + " %02X" % i
        # Every byte occupies three columns (" %02X"); pad shorter
        # sequences out to six bytes so the name column lines up.
        s = s + "   " * (6 - len(bytes))

        if char == -1:
            name = ""
        else:
            name = charname(char)
        if name != "":
            s = s + " " + name
        s = s + errors
        # A single parenthesised argument prints the same text under
        # the Python 2 print statement.
        print(s)
    elif char == -1 or errors != "":
        # problem chars become U+FFFD REPLACEMENT CHARACTER
        sys.stdout.write("\xEF\xBF\xBD")
    else:
        for i in bytes:
            sys.stdout.write(chr(i))
def process_ucs(x, bytes=[], errors=""):
    """Analyse the code point x and hand the result to output().

    x      -- the code point; must be below 0x80000000.
    bytes  -- the byte values x was decoded from, if it came from
              UTF-8 input.  The shared default list is only read,
              never modified.
    errors -- diagnostics already collected for this character.

    The shortest UTF-8 encoding of x (up to six bytes) is computed.
    If `bytes` is longer than that, the input is flagged as an
    overlong form and the original bytes are reported instead of the
    canonical ones.  Surrogates (U+D800..U+DFFF) and U+FFFE/U+FFFF are
    flagged as well.
    """
    if x < 0x80:
        utf8 = [x]
    else:
        # (marker bits of the lead byte, number of continuation bytes)
        if x < 0x800:
            lead, ntrail = 0xC0, 1
        elif x < 0x10000:
            lead, ntrail = 0xE0, 2
        elif x < 0x200000:
            lead, ntrail = 0xF0, 3
        elif x < 0x4000000:
            lead, ntrail = 0xF8, 4
        else:
            assert x < 0x80000000
            lead, ntrail = 0xFC, 5
        utf8 = [lead + (x >> (6 * ntrail))]
        # Remaining bits go out six at a time, most significant first.
        for i in range(ntrail - 1, -1, -1):
            utf8.append(0x80 + (0x3F & (x >> (i * 6))))
    realbytes = len(utf8)

    # An empty `bytes` can never exceed realbytes (which is >= 1), so
    # no separate emptiness test is needed.
    if len(bytes) > realbytes:
        errors = errors + " (overlong form of"
        for i in utf8:
            errors = errors + " %02X" % i
        errors = errors + ")"
        utf8 = bytes
    if 0xD800 <= x <= 0xDFFF:
        errors = errors + " (surrogate)"
    if 0xFFFE <= x <= 0xFFFF:
        errors = errors + " (invalid char)"
    output(x, utf8, errors)
def process_utf8(next):
    """Decode a stream of UTF-8 bytes, reporting each character found.

    next -- a callable taking no arguments that returns the next byte
            as an integer, or None once the input is exhausted.

    Well-formed sequences go to process_ucs() together with the bytes
    they were read from; stray continuation bytes, the bytes FE/FF and
    truncated sequences are reported through output() with a code
    point of -1.
    """
    c = next()
    while c is not None:
        char = [c]
        i = c
        if i < 0x80:
            process_ucs(i)  # single-byte char
            c = next()
        elif i == 0xfe or i == 0xff:
            output(-1, char, " (invalid UTF-8 byte)")
            c = next()
        elif i >= 0x80 and i <= 0xbf:
            output(-1, char, " (unexpected continuation byte)")
            c = next()
        else:
            # Lead byte: strip the marker bits and note how many
            # continuation bytes must follow.
            if i >= 0xC0 and i <= 0xDF:
                acc = i & ~0xC0
                cbytes = 1
            elif i >= 0xE0 and i <= 0xEF:
                acc = i & ~0xE0
                cbytes = 2
            elif i >= 0xF0 and i <= 0xF7:
                acc = i & ~0xF0
                cbytes = 3
            elif i >= 0xF8 and i <= 0xFB:
                acc = i & ~0xF8
                cbytes = 4
            elif i >= 0xFC and i <= 0xFD:
                acc = i & ~0xFC
                cbytes = 5
            # gotone is set when we have already read a byte that is
            # not part of this sequence; it is then left in c for the
            # next trip round the outer loop.
            gotone = 0
            while cbytes > 0:
                c = next()
                if c is None or c < 0x80 or c > 0xBF:
                    gotone = 1
                    break
                char.append(c)
                acc = (acc << 6) + (c & 0x3F)
                cbytes = cbytes - 1
            # Report this character before fetching any more input.
            if cbytes > 0:
                output(-1, char, " (incomplete sequence)")
            else:
                process_ucs(acc, char)
            if not gotone:
                c = next()
def do(args):
    """Process a list of command-line style arguments.

    Each argument is one of:
      "U+xxxx" or "U-xxxxxxxx"  -- a code point in hex;
      "&#nnnn;" or "&#xhhhh;"   -- an SGML numeric character entity,
                                   decimal or hex (the trailing ";"
                                   is optional);
      anything else             -- one UTF-8 byte in hex.

    Consecutive bytes are collected and decoded together by
    process_utf8().  A code point first flushes any bytes collected so
    far and is then passed to process_ucs().
    """
    # Class to turn a list into a callable object that returns one
    # element at a time.
    class liststepper:
        def __init__(self, items):
            self.list = items
            self.index = 0
        def __call__(self):
            if self.index >= len(self.list):
                return None
            ret = self.list[self.index]
            self.index = self.index + 1
            return ret

    pending = []  # UTF-8 bytes seen since the last code point
    for arg in args:
        if arg[0].upper() == "U":
            assert arg[1] == "+" or arg[1] == "-"
            got = ('ucs', int(arg[2:], 16))
        elif arg[:2] == "&#":
            # SGML character entity. Either &# followed by a
            # number, or &#x followed by a hex number.
            s = arg
            if s[-1:] == ";":
                s = s[:-1]
            if s[:3].upper() == "&#X":
                got = ('ucs', int(s[3:], 16))
            else:
                got = ('ucs', int(s[2:], 10))
        else:
            got = ('utf8', int(arg, 16))

        if got[0] == 'utf8':
            pending.append(got[1])
        elif got[0] == 'ucs':
            if len(pending) > 0:
                process_utf8(liststepper(pending))
                pending = []
            process_ucs(got[1])

    if len(pending) > 0:
        process_utf8(liststepper(pending))
-if args == [] or args == ["--help"] or args == ["--help-admin"]:
- print "Usage: cvt-utf8 [flags] <hex UTF-8 bytes and/or U+codepoints>"
+def usage(arg):
+ print "usage: cvt-utf8 [flags] <hex UTF-8 bytes, U+codepoints, SGML entities>"
print " e.g. cvt-utf8 e2 82 ac"
print " or cvt-utf8 U+20ac"
print " or cvt-utf8 U-10ffff"
+ print " or cvt-utf8 '&#8211;'"
print ""
- print "Flags: -o or --output just output well-formed UTF-8 instead of"
+ print "where: -o or --output just output well-formed UTF-8 instead of"
print " an analysis of the input data"
print " -h or --han also give Han definitions from unihan db"
print ""
- print "Also: cvt-utf8 --test run Markus Kuhn's decoder stress tests" #'
+ print " also: cvt-utf8 --test run Markus Kuhn's decoder stress tests" #'
print " cvt-utf8 --input (or -i)"
print " read, analyse and decode UTF-8 from stdin"
- if args == ["--help-admin"]:
+ if arg == "--help-admin":
print " cvt-utf8 --help display user help text"
print " cvt-utf8 --help-admin display admin help text (this one)"
print " cvt-utf8 --build <infile> <outfile>"
else:
print " cvt-utf8 --help display this help text"
print " cvt-utf8 --help-admin display admin help text"
- sys.exit(0)
+ print " cvt-utf8 --version report version number"
+ print " cvt-utf8 --licence display (MIT) licence text"
-if args[0] == "-o" or args[0] == "--output":
- output_analysis = 0
- args = args[1:]
-
-if args[0] == "-h" or args[0] == "--han":
- han_translations = 1
- args = args[1:]
-
-if args[0] == "--build" or args[0] == "--fetch-build":
- if args[0] == "--build":
- if len(args) != 3:
- print "cvt-utf8: --build expects two filename arguments"
- sys.exit(1)
- infile = open(args[1], "r")
- outfile = args[2]
def licence():
    """Print the copyright notice and MIT licence text to stdout."""
    text = (
        "cvt-utf8 is copyright 2002-2004 Simon Tatham.",
        "",
        "Permission is hereby granted, free of charge, to any person",
        "obtaining a copy of this software and associated documentation files",
        "(the \"Software\"), to deal in the Software without restriction,",
        "including without limitation the rights to use, copy, modify, merge,",
        "publish, distribute, sublicense, and/or sell copies of the Software,",
        "and to permit persons to whom the Software is furnished to do so,",
        "subject to the following conditions:",
        "",
        "The above copyright notice and this permission notice shall be",
        "included in all copies or substantial portions of the Software.",
        "",
        "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND,",
        "EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF",
        "MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND",
        "NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS",
        "BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN",
        "ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN",
        "CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE",
        "SOFTWARE.",
    )
    for line in text:
        # One parenthesised argument prints the same text under the
        # Python 2 print statement.
        print(line)
+
def version():
    """Print the program's revision number, if one is embedded.

    The revision is taken from the text after the colon in the
    "$Revision$" marker below (presumably a keyword the version
    control system expands to "$Revision: N $" -- confirm against the
    repository setup).  With no colon present the version is reported
    as unknown.
    """
    rev = "$Revision$"
    # Strip the decoration to leave "Revision" or "Revision:N".
    rev = rev.replace(" ", "").replace("$", "")
    revs = rev.split(":")
    if len(revs) > 1:
        print("cvt-utf8 revision %s" % revs[1])
    else:
        print("cvt-utf8: unknown version")
+
# ----------------------------------------------------------------------
# Command-line option handling.
#
# Sets the module-level flags read by the functions above
# (output_analysis, han_translations) and selects the operating mode
# ("cmdline", "test" or "input").  Informational and database-building
# options do their work here and exit without returning to the main
# flow.  On leaving the loop, args holds the remaining non-option
# arguments.
#
# NOTE(review): anydbm and zip_untangler are used below but are not
# defined in this part of the file; presumably they are imported or
# defined earlier -- confirm.
args = sys.argv[1:]
output_analysis = 1
han_translations = 0
mode = "cmdline"

if args == []:
    usage("")
    sys.exit(0)

while len(args) > 0 and args[0][:1] == "-":
    if args[0] == "--help" or args[0] == "--help-admin":
        usage(args[0])
        sys.exit(0)

    elif args[0] == "--licence" or args[0] == "--license":
        licence()
        sys.exit(0)

    elif args[0] == "--version":
        version()
        sys.exit(0)

    elif args[0] == "-o" or args[0] == "--output":
        output_analysis = 0
        args = args[1:]

    elif args[0] == "-h" or args[0] == "--han":
        han_translations = 1
        args = args[1:]

    elif args[0] == "--build" or args[0] == "--fetch-build":
        if args[0] == "--build":
            if len(args) != 3:
                print("cvt-utf8: --build expects two filename arguments")
                sys.exit(1)
            infile = open(args[1], "r")
            outfile = args[2]
        else:
            if len(args) != 2:
                print("cvt-utf8: --fetch-build expects one filename argument")
                sys.exit(1)
            import urllib
            infile = urllib.urlopen("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
            outfile = args[1]
        # Now build the database.
        if outfile[-3:] == ".db":
            print("cvt-utf8: warning: you should not append .db to db name")

        # Each record is stored whole, keyed by its first ";"-separated
        # field.
        db = anydbm.open(outfile, "n")
        while 1:
            s = infile.readline()
            if s == "":
                break
            ss = s.split(";")[0]
            db[ss] = s
        db.close()
        sys.exit(0)

    elif args[0] == "--build-unihan" or args[0] == "--fetch-build-unihan":
        if args[0] == "--build-unihan":
            if len(args) != 3:
                # This message previously named --build by mistake.
                print("cvt-utf8: --build-unihan expects two filename arguments")
                sys.exit(1)
            infile = open(args[1], "r")
            s = infile.read(1)
            # Unihan.txt starts with a hash. If this file starts with a
            # P, we assume it's a zip file ("PK").
            if s == "P":
                infile = zip_untangler(infile, s)
                s = ""
            outfile = args[2]
        else:
            if len(args) != 2:
                print("cvt-utf8: --fetch-build-unihan expects one filename argument")
                sys.exit(1)
            import urllib
            infile = urllib.urlopen("ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip")
            # We know this one is zipped.
            infile = zip_untangler(infile, "")
            outfile = args[1]
            s = ""
        # Now build the database.
        if outfile[-3:] == ".db":
            print("cvt-utf8: warning: you should not append .db to db name")

        db = anydbm.open(outfile, "n")
        while 1:
            # On the first pass s may still hold the character that was
            # read above to sniff the file type.
            s = s + infile.readline()
            if s == "":
                break
            while s[-1:] == "\r" or s[-1:] == "\n":
                s = s[:-1]
            sa = s.split("\t")
            # Keep only "U+xxxx<TAB>kDefinition<TAB>text" records,
            # keyed by the hex digits after "U+".
            if len(sa) == 3 and sa[1] == "kDefinition" and sa[0][:2] == "U+":
                db[sa[0][2:]] = sa[2]
            s = ""
        db.close()
        sys.exit(0)

    elif args[0] == "--test":
        mode = "test"
        args = args[1:]

    elif args[0] == "--input" or args[0] == "-i":
        mode = "input"
        args = args[1:]

    else:
        # End the message with a newline so the shell prompt does not
        # run on after it.
        sys.stderr.write("cvt-utf8: unknown argument '%s'\n" % args[0])
        sys.exit(1)
locations = []
locations.append("/usr/share/unicode/unicode")
for loc in locations:
try:
- db = anydbm.open(loc, "r")
+ db = anydbm.open(loc, "r")
except IOError:
- db = None
+ db = None
except anydbm.error:
- db = None
+ db = None
if db != None:
- break
+ break
if han_translations:
i = string.rfind(loc, "/")
assert i >= 0
handb = anydbm.open(hanloc, "r")
# this has been explicitly required, so we don't squelch exceptions
-if args[0] == "--test":
+if mode == "test":
do(["CE","BA","E1","BD","B9","CF","83","CE","BC","CE","B5"])
do(["00"])
do(["C2","80"])
do(["ED","AF","BF","ED","BF","8F"])
do(["EF","BF","BE"])
do(["EF","BF","BF"])
-elif args[0] == "--input" or args[0] == "-i":
+elif mode == "input":
def getchar():
- s = sys.stdin.read(1)
- if s == "":
- return None
- return ord(s) & 0xFF # ensure it isn't negative
+ s = sys.stdin.read(1)
+ if s == "":
+ return None
+ return ord(s) & 0xFF # ensure it isn't negative
process_utf8(getchar)
else:
do(args)