[sgt/utils] / cvt-utf8 / cvt-utf8

#!/usr/bin/env python

import sys
import string
import os
import anydbm
import zlib

class zip_untangler:
    """Minimal line reader for the first member of a zip archive.

    Wraps a file-like object positioned at (or shortly after) the start
    of a zip local file header and exposes a single readline() method
    that returns successive lines of the inflated member data.  No
    check is made of the compression method; the data is always fed to
    a zlib decompressor.
    """

    def __init__(self, file, datasofar):
        """Prepare to read from `file`.

        file      -- object with a read(n) method returning a string.
        datasofar -- bytes of the header the caller has already
                     consumed from `file` (must be shorter than the
                     30-byte fixed header).
        """
        self.file = file
        assert len(datasofar) < 30
        self.header = datasofar
        self.data = ""          # inflated data not yet returned
        self.dataleft = None    # file bytes still to read; None = header unparsed
        self.decompress = zlib.decompressobj()
        # Zlib header bytes, expected by decompress obj but not
        # present in zip file
        self.decompress.decompress("\x78\x9c")

    def _fill_header(self, wanted):
        """Read from the file until self.header holds `wanted` bytes."""
        while len(self.header) < wanted:
            s = self.file.read(wanted - len(self.header))
            if s == "":
                # Raise rather than assert: an assert vanishes under -O
                # and this loop would then spin forever at end of file.
                raise IOError("unexpected end of file in zip header")
            self.header = self.header + s

    def readline(self):
        """Return the next line of inflated data, including its "\\n".

        The final line is returned without a newline if the data does
        not end in one; an empty string means no data is left.
        """
        if self.dataleft is None:
            self._fill_header(30)
            # Name length and extra length.
            namelen = 256 * ord(self.header[27]) + ord(self.header[26])
            extralen = 256 * ord(self.header[29]) + ord(self.header[28])
            self._fill_header(30 + namelen + extralen)
            # Little-endian 32-bit count at offset 18: the number of
            # bytes of member data that follow the header.
            self.dataleft = \
            256 * (256 * (256 * ord(self.header[21]) + ord(self.header[20])) \
            + ord(self.header[19])) + ord(self.header[18])
        k = self.data.find("\n")
        while k < 0:
            rlen = min(self.dataleft, 4096)
            if rlen == 0: break
            d = self.file.read(rlen)
            if d == "": break
            # Count what was actually delivered: read() may return fewer
            # bytes than requested, and subtracting rlen here would make
            # us stop before the end of the member data.
            self.dataleft = self.dataleft - len(d)
            self.data = self.data + self.decompress.decompress(d)
            k = self.data.find("\n")
        if k < 0:
            ret = self.data
            self.data = ""
            return ret
        else:
            ret = self.data[:k+1]
            self.data = self.data[k+1:]
            return ret

def hexstr(x):
    """Return hex(x) without a leading "0x"/"0X" or a trailing "L"/"l"."""
    digits = hex(x)
    # Long integers are rendered with an "L" suffix; drop it first.
    if digits[-1:] in ("L", "l"):
        digits = digits[:-1]
    if digits[:2] in ("0x", "0X"):
        digits = digits[2:]
    return digits

def charname(x):
    """Look up a descriptive name for code point x.

    Returns "" when no unicode database is open.  Otherwise the key is
    the upper-case hex code point zero-padded to at least four digits.
    With Han translations enabled, an entry in the unihan database wins
    and is returned prefixed with "<han> ".  Failing that, the second
    ";"-separated field of the unicode database record is returned, or
    "<no name available>" if the key is absent.
    """
    if db is None:
        return ""

    key = hexstr(x)
    key = ("0" * (4 - len(key)) + key).upper()

    if han_translations:
        try:
            return "<han> " + handb[key]
        except KeyError:
            # No Han definition; fall back to the main database.
            pass

    try:
        record = db[key]
    except KeyError:
        return "<no name available>"
    return record.split(";")[1]

def output(char, bytes, errors):
    if output_analysis:
        if char == -1:
            s = "           "
        else:
            s = "U-%08X " % char
        for i in bytes:
            s = s + " %02X" % i
        for i in range(6-len(bytes)):
            s = s + "   "

        if char == -1:
            name = ""
        else:
            name = charname(char)
        if name != "":
            s = s + " " + name
        s = s + errors
        print s
    else:
        if char == -1 or errors != "":
            # problem chars become U+FFFD REPLACEMENT CHARACTER
            sys.stdout.write("\xEF\xBF\xBD")
        else:
            for i in bytes:
                sys.stdout.write(chr(i))

def process_ucs(x, bytes=[], errors=""):
    if x < 0x80:
        utf8 = [x]
        realbytes = 1
    else:
        if x < 0x800:
            tmp = (0xC0, 1)
        elif x < 0x10000:
            tmp = (0xE0, 2)
        elif x < 0x200000:
            tmp = (0xF0, 3)
        elif x < 0x4000000:
            tmp = (0xF8, 4)
        else:
            assert x < 0x80000000L
            tmp = (0xFC, 5)
        realbytes = tmp[1] + 1
        utf8 = [tmp[0] + (x >> (6*tmp[1]))]
        for i in range(tmp[1]-1, -1, -1):
            utf8.append(0x80 + (0x3F & (x >> (i*6))))

    if bytes != [] and len(bytes) > realbytes:
        errors = errors + " (overlong form of"
        for i in utf8:
            errors = errors + " %02X" % i
        errors = errors + ")"
        utf8 = bytes
    if x >= 0xD800 and x <= 0xDFFF:
        errors = errors + " (surrogate)"
    if x >= 0xFFFE and x <= 0xFFFF:
        errors = errors + " (invalid char)"

    output(x, utf8, errors)

def process_utf8(next):
    """Decode a stream of byte values as UTF-8 and report each item.

    next -- callable taking no arguments; returns the next byte value
            as an integer, or None when the input is exhausted.

    Complete sequences are passed to process_ucs() together with the
    bytes they were decoded from.  Anything else is reported through
    output() with a code point of -1 and a diagnostic.
    """
    c = next()
    while c is not None:
        char = [c]
        i = c
        if i < 0 or i >= 0xFE:
            # 0xFE and 0xFF never occur in UTF-8.  Values outside
            # 0..0xFF are not bytes at all (e.g. "e282ac" given as one
            # argument); previously they matched no lead-byte case
            # below and left acc/cbytes unset or stale.
            output(-1, char, " (invalid UTF-8 byte)")
            c = next()
        elif i < 0x80:
            process_ucs(i) # single-byte char
            c = next()
        elif i <= 0xBF:
            output(-1, char, " (unexpected continuation byte)")
            c = next()
        else:
            # Lead byte: strip the length marker bits and note how many
            # continuation bytes must follow.
            if i <= 0xDF:
                acc = i &~ 0xC0
                cbytes = 1
            elif i <= 0xEF:
                acc = i &~ 0xE0
                cbytes = 2
            elif i <= 0xF7:
                acc = i &~ 0xF0
                cbytes = 3
            elif i <= 0xFB:
                acc = i &~ 0xF8
                cbytes = 4
            else:
                # 0xFC or 0xFD
                acc = i &~ 0xFC
                cbytes = 5
            # gotone is set when we have already fetched the byte that
            # starts the next item (or hit end of input).
            gotone = 0
            while cbytes > 0:
                c = next()
                if c is None or c < 0x80 or c > 0xBF:
                    gotone = 1
                    break
                char.append(c)
                acc = (acc << 6) + (c & 0x3F)
                cbytes = cbytes - 1
            if not gotone:
                c = next()
            if cbytes > 0:
                output(-1, char, " (incomplete sequence)")
            else:
                process_ucs(acc, char)

def do(args):
    """Process a list of command-line style arguments.

    Each argument is one of:
      U+xxxx / U-xxxx   a code point in hex
      &#nnn; / &#xhhh;  an SGML character entity (";" optional)
      anything else     a single UTF-8 byte in hex

    Runs of bytes are collected and decoded together with
    process_utf8(); code points go straight to process_ucs().
    """
    # Callable wrapper that hands out the elements of a sequence one
    # per call, then None once they are used up.
    class liststepper:
        def __init__(self, items):
            self.list = items
            self.index = 0
        def __call__(self):
            if self.index < len(self.list):
                item = self.list[self.index]
                self.index = self.index + 1
                return item
            return None

    pending = []
    for arg in args:
        if arg[0].upper() == "U":
            assert arg[1] == "+" or arg[1] == "-"
            codepoint = int(arg[2:], 16)
        elif arg[:2] == "&#":
            # SGML character entity. Either &# followed by a
            # number, or &#x followed by a hex number.
            entity = arg
            if entity[-1:] == ";":
                entity = entity[:-1]
            if entity[:3].upper() == "&#X":
                codepoint = int(entity[3:], 16)
            else:
                codepoint = int(entity[2:], 10)
        else:
            # A raw byte: queue it until the run of bytes ends.
            pending.append(int(arg, 16))
            continue

        # A code point ends any run of bytes collected so far.
        if len(pending) > 0:
            process_utf8(liststepper(pending))
            pending = []
        process_ucs(codepoint)

    if len(pending) > 0:
        process_utf8(liststepper(pending))

def usage(arg):
    print "usage: cvt-utf8 [flags] <hex UTF-8 bytes, U+codepoints, SGML entities>"
    print "  e.g. cvt-utf8 e2 82 ac"
    print "    or cvt-utf8 U+20ac"
    print "    or cvt-utf8 U-10ffff"
    print "    or cvt-utf8 '&#8211;'"
    print ""
    print "where: -o or --output        just output well-formed UTF-8 instead of"
    print "                             an analysis of the input data"
    print "       -h or --han           also give Han definitions from unihan db"
    print ""
    print " also: cvt-utf8 --test       run Markus Kuhn's decoder stress tests" #'
    print "       cvt-utf8 --input (or -i)"
    print "                             read, analyse and decode UTF-8 from stdin"
    if arg == "--help-admin":
        print "       cvt-utf8 --help       display user help text"
        print "       cvt-utf8 --help-admin display admin help text (this one)"
        print "       cvt-utf8 --build <infile> <outfile>"
        print "                             convert UnicodeData.txt to unicode db"
        print "       cvt-utf8 --build-unihan <infile> <outfile>"
        print "                             convert Unihan.txt to unihan db"
        print "       cvt-utf8 --fetch-build <outfile>"
        print "                             "+\
        "build unicode db by download from unicode.org"
        print "       cvt-utf8 --fetch-build-unihan <outfile>"
        print "                             "+\
        "build Unihan db by download from unicode.org"
    else:
        print "       cvt-utf8 --help       display this help text"
        print "       cvt-utf8 --help-admin display admin help text"
    print "       cvt-utf8 --version    report version number"
    print "       cvt-utf8 --licence    display (MIT) licence text"

def licence():
    print "cvt-utf8 is copyright 2002-2004 Simon Tatham."
    print ""
    print "Permission is hereby granted, free of charge, to any person"
    print "obtaining a copy of this software and associated documentation files"
    print "(the \"Software\"), to deal in the Software without restriction,"
    print "including without limitation the rights to use, copy, modify, merge,"
    print "publish, distribute, sublicense, and/or sell copies of the Software,"
    print "and to permit persons to whom the Software is furnished to do so,"
    print "subject to the following conditions:"
    print ""
    print "The above copyright notice and this permission notice shall be"
    print "included in all copies or substantial portions of the Software."
    print ""
    print "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND,"
    print "EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF"
    print "MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND"
    print "NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS"
    print "BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN"
    print "ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN"
    print "CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE"
    print "SOFTWARE."

def version():
    rev = "$Revision$"
    rev = string.replace(rev, " ", "")
    rev = string.replace(rev, "$", "")
    revs = string.split(rev, ":")
    if len(revs) > 1:
        print "cvt-utf8 revision %s" % revs[1]
    else:
        print "cvt-utf8: unknown version"

# Default settings, adjusted by the option parsing that follows.
mode = "cmdline"
han_translations = 0
output_analysis = 1
args = sys.argv[1:]

# With no arguments at all, just show the user help text.
if not args:
    usage("")
    sys.exit(0)

while len(args) > 0 and args[0][:1] == "-":
    if args[0] == "--help" or args[0] == "--help-admin":
        usage(args[0])
        sys.exit(0)

    elif args[0] == "--licence" or args[0] == "--license":
        licence()
        sys.exit(0)

    elif args[0] == "--version":
        version()
        sys.exit(0)

    elif args[0] == "-o" or args[0] == "--output":
        output_analysis = 0
        args = args[1:]

    elif args[0] == "-h" or args[0] == "--han":
        han_translations = 1
        args = args[1:]

    elif args[0] == "--build" or args[0] == "--fetch-build":
        if args[0] == "--build":
            if len(args) != 3:
                print "cvt-utf8: --build expects two filename arguments"
                sys.exit(1)
            infile = open(args[1], "r")
            outfile = args[2]
        else:
            if len(args) != 2:
                print "cvt-utf8: --fetch-build expects one filename argument"
                sys.exit(1)
            import urllib
            infile = urllib.urlopen("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
            outfile = args[1]
        # Now build the database.
        if outfile[-3:] == ".db":
            print "cvt-utf8: warning: you should not append .db to db name"

        db = anydbm.open(outfile, "n")
        while 1:
            s = infile.readline()
            if s == "": break
            ss = string.split(s, ";")[0]
            db[ss] = s
        db.close()
        sys.exit(0)

    elif args[0] == "--build-unihan" or args[0] == "--fetch-build-unihan":
        if args[0] == "--build-unihan":
            if len(args) != 3:
                print "cvt-utf8: --build expects two filename arguments"
                sys.exit(1)
            infile = open(args[1], "r")
            s = infile.read(1)
            # Unihan.txt starts with a hash. If this file starts with a
            # P, we assume it's a zip file ("PK").
            if s == "P":
                infile = zip_untangler(infile, s)
                s = ""
            outfile = args[2]
        else:
            if len(args) != 2:
                print "cvt-utf8: --fetch-build-unihan expects one filename argument"
                sys.exit(1)
            import urllib
            infile = urllib.urlopen("ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip")
            # We know this one is zipped.
            infile = zip_untangler(infile, "")
            outfile = args[1]
            s = ""
        # Now build the database.
        if outfile[-3:] == ".db":
            print "cvt-utf8: warning: you should not append .db to db name"

        db = anydbm.open(outfile, "n")
        while 1:
            s = s + infile.readline()
            if s == "": break
            while s[-1:] == "\r" or s[-1:] == "\n":
                s = s[:-1]
            sa = string.split(s, "\t")
            if len(sa) == 3 and sa[1] == "kDefinition" and sa[0][:2] == "U+":
                db[sa[0][2:]] = sa[2]
            s = ""
        db.close()
        sys.exit(0)

    elif args[0] == "--test":
        mode = "test"
        args = args[1:]

    elif args[0] == "--input" or args[0] == "-i":
        mode = "input"
        args = args[1:]

    else:
        sys.stderr.write("cvt-utf8: unknown argument '%s'" % args[0])
        sys.exit(1)

# Places to look for the unicode database, in order of preference.
locations = []
locations.append("/usr/share/unicode/unicode")
locations.append("/usr/lib/unicode/unicode")
locations.append("/usr/local/share/unicode/unicode")
locations.append("/usr/local/lib/unicode/unicode")
# HOME may be unset; in that case skip the per-user locations instead
# of failing with KeyError before any work is done.
home = os.environ.get("HOME")
if home is not None:
    locations.append(home + "/share/unicode/unicode")
    locations.append(home + "/lib/unicode/unicode")

# Use the first database that opens; db is None if none of them does.
for loc in locations:
    try:
        db = anydbm.open(loc, "r")
    except IOError:
        db = None
    except anydbm.error:
        db = None
    if db is not None:
        break
if han_translations:
    # The unihan database is expected alongside the unicode one.  If no
    # unicode database was found, loc is the last location tried.
    i = string.rfind(loc, "/")
    assert i >= 0
    hanloc = loc[:i+1] + "unihan"
    handb = anydbm.open(hanloc, "r")
    # this has been explicitly required, so we don't squelch exceptions

# Run the selected mode: the built-in decoder stress tests, decoding of
# standard input, or processing of the remaining command-line arguments.
if mode == "test":
    # A short run of well-formed multi-byte characters.
    do(["CE","BA","E1","BD","B9","CF","83","CE","BC","CE","B5"])
    # First code point of each encoded length.
    do(["00"])
    do(["C2","80"])
    do(["E0","A0","80"])
    do(["F0","90","80","80"])
    do(["F8","88","80","80","80"])
    do(["FC","84","80","80","80","80"])
    # Last code point of each encoded length.
    do(["7F"])
    do(["DF","BF"])
    do(["EF","BF","BF"])
    do(["F7","BF","BF","BF"])
    do(["FB","BF","BF","BF","BF"])
    do(["FD","BF","BF","BF","BF","BF"])
    # Other boundary code points.
    do(["ED","9F","BF"])
    do(["EE","80","80"])
    do(["EF","BF","BD"])
    do(["F4","8F","BF","BF"])
    do(["F4","90","80","80"])
    # Continuation bytes with no lead byte.
    do(["80"])
    do(["BF"])
    do(["80","BF"])
    do(["80","BF","80"])
    do(["80","BF","80","BF"])
    do(["80","BF","80","BF","80"])
    do(["80","BF","80","BF","80","BF"])
    do(["80","BF","80","BF","80","BF","80"])
    do(["80","81","82","83","84","85","86","87",
    "88","89","8A","8B","8C","8D","8E","8F",
    "90","91","92","93","94","95","96","97",
    "98","99","9A","9B","9C","9D","9E","9F",
    "A0","A1","A2","A3","A4","A5","A6","A7",
    "A8","A9","AA","AB","AC","AD","AE","AF",
    "B0","B1","B2","B3","B4","B5","B6","B7",
    "B8","B9","BA","BB","BC","BD","BE","BF"])
    # Every lead byte, each followed by a space instead of a
    # continuation byte.
    do(["C0","20","C1","20","C2","20","C3","20",
    "C4","20","C5","20","C6","20","C7","20",
    "C8","20","C9","20","CA","20","CB","20",
    "CC","20","CD","20","CE","20","CF","20",
    "D0","20","D1","20","D2","20","D3","20",
    "D4","20","D5","20","D6","20","D7","20",
    "D8","20","D9","20","DA","20","DB","20",
    "DC","20","DD","20","DE","20","DF","20"])
    do(["E0","20","E1","20","E2","20","E3","20",
    "E4","20","E5","20","E6","20","E7","20",
    "E8","20","E9","20","EA","20","EB","20",
    "EC","20","ED","20","EE","20","EF","20"])
    do(["F0","20","F1","20","F2","20","F3","20",
    "F4","20","F5","20","F6","20","F7","20"])
    do(["F8","20","F9","20","FA","20","FB","20"])
    do(["FC","20","FD","20"])
    # Sequences with the last continuation byte missing.
    do(["C0"])
    do(["E0","80"])
    do(["F0","80","80"])
    do(["F8","80","80","80"])
    do(["FC","80","80","80","80"])
    do(["DF"])
    do(["EF","BF"])
    do(["F7","BF","BF"])
    do(["FB","BF","BF","BF"])
    do(["FD","BF","BF","BF","BF"])
    # The ten truncated sequences above, concatenated.
    do(["C0","E0","80","F0","80","80","F8","80",
    "80","80","FC","80","80","80","80",
    "DF","EF","BF","F7","BF","BF","FB",
    "BF","BF","BF","FD","BF","BF","BF","BF"])
    # Bytes that are never valid.
    do(["FE"])
    do(["FF"])
    do(["FE","FE","FF","FF"])
    # Overlong encodings.
    do(["C0","AF"])
    do(["E0","80","AF"])
    do(["F0","80","80","AF"])
    do(["F8","80","80","80","AF"])
    do(["FC","80","80","80","80","AF"])
    do(["C1","BF"])
    do(["E0","9F","BF"])
    do(["F0","8F","BF","BF"])
    do(["F8","87","BF","BF","BF"])
    do(["FC","83","BF","BF","BF","BF"])
    do(["C0","80"])
    do(["E0","80","80"])
    do(["F0","80","80","80"])
    do(["F8","80","80","80","80"])
    do(["FC","80","80","80","80","80"])
    # Single surrogates.
    do(["ED","A0","80"])
    do(["ED","AD","BF"])
    do(["ED","AE","80"])
    do(["ED","AF","BF"])
    do(["ED","B0","80"])
    do(["ED","BE","80"])
    do(["ED","BF","BF"])
    # Paired surrogates: each high surrogate above with ED B0 80 and
    # with ED BF BF.
    do(["ED","A0","80","ED","B0","80"])
    do(["ED","A0","80","ED","BF","BF"])
    do(["ED","AD","BF","ED","B0","80"])
    do(["ED","AD","BF","ED","BF","BF"])
    do(["ED","AE","80","ED","B0","80"])
    do(["ED","AE","80","ED","BF","BF"])
    do(["ED","AF","BF","ED","B0","80"])
    # Final byte was "8F", breaking the pattern of the pairs above;
    # NOTE(review): confirm against the upstream stress-test file.
    do(["ED","AF","BF","ED","BF","BF"])
    # U+FFFE and U+FFFF.
    do(["EF","BF","BE"])
    do(["EF","BF","BF"])
elif mode == "input":
    # Feed standard input to the decoder one byte at a time; None
    # signals end of input.
    def getchar():
        s = sys.stdin.read(1)
        if s == "":
            return None
        return ord(s) & 0xFF   # ensure it isn't negative
    process_utf8(getchar)
else:
    do(args)
Commit	Line	Data
27c26167	1	#!/usr/bin/env python
9acadc2b	2
	3	import sys
	4	import string
	5	import os
	6	import anydbm
	7	import zlib
	8
	9	class zip_untangler:
	10	def __init__(self, file, datasofar):
	11	self.file = file
	12	assert len(datasofar) < 30
	13	self.header = datasofar
	14	self.data = ""
	15	self.dataleft = None
	16	self.decompress = zlib.decompressobj()
	17	# Zlib header bytes, expected by decompress obj but not
	18	# present in zip file
	19	self.decompress.decompress("\x78\x9c")
	20
	21	def readline(self):
	22	if self.dataleft == None:
	23	while len(self.header) < 30:
	24	s = self.file.read(30 - len(self.header))
	25	assert s != ""
	26	self.header = self.header + s
	27	# Name length and extra length.
	28	namelen = 256 * ord(self.header[27]) + ord(self.header[26])
	29	extralen = 256 * ord(self.header[29]) + ord(self.header[28])
	30	while len(self.header) < 30 + namelen + extralen:
	31	s = self.file.read(30 + namelen + extralen - len(self.header))
	32	assert s != ""
	33	self.header = self.header + s
	34	self.dataleft = \
	35	256 * (256 * (256 * ord(self.header[21]) + ord(self.header[20])) \
	36	+ ord(self.header[19])) + ord(self.header[18])
	37	k = string.find(self.data, "\n")
	38	while k < 0:
	39	rlen = self.dataleft
	40	if rlen > 4096: rlen = 4096
	41	if rlen == 0: break
	42	d = self.file.read(rlen)
	43	if d == "": break
	44	self.dataleft = self.dataleft - rlen
	45	self.data = self.data + self.decompress.decompress(d)
	46	k = string.find(self.data, "\n")
	47	if k < 0:
	48	ret = self.data
	49	self.data = ""
	50	return ret
	51	else:
	52	ret = self.data[:k+1]
	53	self.data = self.data[k+1:]
	54	return ret
	55
	56	def hexstr(x):
	57	s = hex(x)
	58	if s[-1:] == "L" or s[-1:] == "l":
30862ac8	59	s = s[:-1]
9acadc2b	60	if s[:2] == "0x" or s[:2] == "0X":
30862ac8	61	s = s[2:]
9acadc2b	62	return s
	63
	64	def charname(x):
68c596fb	65	if db is not None:
30862ac8	66	key = hexstr(x)
	67	while len(key) < 4: key = "0" + key
	68	key = string.upper(key)
	69	if han_translations:
	70	try:
	71	value = handb[key]
	72	return "<han> " + value
	73	except KeyError:
	74	pass
	75	try:
	76	value = db[key]
	77	return string.split(value, ";")[1]
	78	except KeyError:
	79	return "<no name available>"
9acadc2b	80	else:
30862ac8	81	return ""
9acadc2b	82
	83	def output(char, bytes, errors):
	84	if output_analysis:
30862ac8	85	if char == -1:
	86	s = " "
	87	else:
	88	s = "U-%08X " % char
	89	for i in bytes:
	90	s = s + " %02X" % i
	91	for i in range(6-len(bytes)):
	92	s = s + " "
	93
	94	if char == -1:
	95	name = ""
	96	else:
	97	name = charname(char)
	98	if name != "":
	99	s = s + " " + name
	100	s = s + errors
	101	print s
9acadc2b	102	else:
30862ac8	103	if char == -1 or errors != "":
	104	# problem chars become U+FFFD REPLACEMENT CHARACTER
	105	sys.stdout.write("\xEF\xBF\xBD")
	106	else:
	107	for i in bytes:
	108	sys.stdout.write(chr(i))
9acadc2b	109
	110	def process_ucs(x, bytes=[], errors=""):
	111	if x < 0x80:
30862ac8	112	utf8 = [x]
30862ac8	113	realbytes = 1
9acadc2b	114	else:
30862ac8	115	if x < 0x800:
	116	tmp = (0xC0, 1)
	117	elif x < 0x10000:
	118	tmp = (0xE0, 2)
	119	elif x < 0x200000:
	120	tmp = (0xF0, 3)
	121	elif x < 0x4000000:
	122	tmp = (0xF8, 4)
	123	else:
	124	assert x < 0x80000000L
	125	tmp = (0xFC, 5)
	126	realbytes = tmp[1] + 1
	127	utf8 = [tmp[0] + (x >> (6*tmp[1]))]
	128	for i in range(tmp[1]-1, -1, -1):
	129	utf8.append(0x80 + (0x3F & (x >> (i*6))))
9acadc2b	130
9acadc2b	131	if bytes != [] and len(bytes) > realbytes:
30862ac8	132	errors = errors + " (overlong form of"
	133	for i in utf8:
	134	errors = errors + " %02X" % i
	135	errors = errors + ")"
	136	utf8 = bytes
9acadc2b	137	if x >= 0xD800 and x <= 0xDFFF:
30862ac8	138	errors = errors + " (surrogate)"
9acadc2b	139	if x >= 0xFFFE and x <= 0xFFFF:
30862ac8	140	errors = errors + " (invalid char)"
9acadc2b	141
	142	output(x, utf8, errors)
	143
	144	def process_utf8(next):
	145	c = next()
	146	while c != None:
30862ac8	147	char = [c]
	148	i = c
	149	if i < 0x80:
	150	process_ucs(i) # single-byte char
	151	c = next()
	152	elif i == 0xfe or i == 0xff:
	153	output(-1, char, " (invalid UTF-8 byte)")
	154	c = next()
	155	elif i >= 0x80 and i <= 0xbf:
	156	output(-1, char, " (unexpected continuation byte)")
	157	c = next()
	158	else:
	159	if i >= 0xC0 and i <= 0xDF:
	160	acc = i &~ 0xC0
	161	cbytes = 1
	162	elif i >= 0xE0 and i <= 0xEF:
	163	acc = i &~ 0xE0
	164	cbytes = 2
	165	elif i >= 0xF0 and i <= 0xF7:
	166	acc = i &~ 0xF0
	167	cbytes = 3
	168	elif i >= 0xF8 and i <= 0xFB:
	169	acc = i &~ 0xF8
	170	cbytes = 4
	171	elif i >= 0xFC and i <= 0xFD:
	172	acc = i &~ 0xFC
	173	cbytes = 5
	174	gotone = 0
	175	while cbytes > 0:
	176	c = next()
	177	if c == None or c < 0x80 or c > 0xBF:
	178	gotone = 1
	179	break
	180	char.append(c)
	181	acc = (acc << 6) + (c & 0x3F)
	182	cbytes = cbytes - 1
	183	if not gotone:
	184	c = next()
	185	if cbytes > 0:
	186	output(-1, char, " (incomplete sequence)")
	187	else:
	188	process_ucs(acc, char)
9acadc2b	189
	190	def do(args):
	191	# Class to turn a list into a callable object that returns one
	192	# element at a time.
	193	class liststepper:
30862ac8	194	def __init__(self, list):
	195	self.list = list
	196	self.index = 0
	197	def __call__(self):
	198	if self.index >= len(self.list):
	199	return None
	200	ret = self.list[self.index]
	201	self.index = self.index + 1
	202	return ret
9acadc2b	203
	204	list = []
	205	for arg in args:
30862ac8	206	got = ('none')
	207	if string.upper(arg[0]) == "U":
	208	assert arg[1] == "+" or arg[1] == "-"
	209	got = ('ucs', string.atoi(arg[2:], 16))
	210	elif arg[:2] == "&#":
	211	# SGML character entity. Either &# followed by a
	212	# number, or &#x followed by a hex number.
	213	s = arg
	214	if s[-1:] == ";": s = s[:-1]
	215	if string.upper(s[:3]) == "&#X":
	216	got = ('ucs', string.atoi(s[3:], 16))
	217	else:
	218	got = ('ucs', string.atoi(s[2:], 10))
	219	else:
	220	got = ('utf8', string.atoi(arg, 16))
	221
	222	if got[0] == 'utf8':
	223	list.append(got[1])
	224	elif got[0] == 'ucs':
	225	if len(list) > 0:
	226	process_utf8(liststepper(list))
	227	list = []
	228	process_ucs(got[1])
9acadc2b	229
9acadc2b	230	if len(list) > 0:
30862ac8	231	process_utf8(liststepper(list))
9acadc2b	232
da0f8522	233	def usage(arg):
337e121d	234	print "usage: cvt-utf8 [flags] <hex UTF-8 bytes, U+codepoints, SGML entities>"
9acadc2b	235	print " e.g. cvt-utf8 e2 82 ac"
	236	print " or cvt-utf8 U+20ac"
	237	print " or cvt-utf8 U-10ffff"
337e121d	238	print " or cvt-utf8 '–'"
9acadc2b	239	print ""
c52f9fb9	240	print "where: -o or --output just output well-formed UTF-8 instead of"
9acadc2b	241	print " an analysis of the input data"
	242	print " -h or --han also give Han definitions from unihan db"
	243	print ""
c52f9fb9	244	print " also: cvt-utf8 --test run Markus Kuhn's decoder stress tests" #'
9acadc2b	245	print " cvt-utf8 --input (or -i)"
9acadc2b	246	print " read, analyse and decode UTF-8 from stdin"
da0f8522	247	if arg == "--help-admin":
9acadc2b	248	print " cvt-utf8 --help display user help text"
	249	print " cvt-utf8 --help-admin display admin help text (this one)"
	250	print " cvt-utf8 --build <infile> <outfile>"
	251	print " convert UnicodeData.txt to unicode db"
	252	print " cvt-utf8 --build-unihan <infile> <outfile>"
	253	print " convert Unihan.txt to unihan db"
	254	print " cvt-utf8 --fetch-build <outfile>"
	255	print " "+\
	256	"build unicode db by download from unicode.org"
	257	print " cvt-utf8 --fetch-build-unihan <outfile>"
	258	print " "+\
	259	"build Unihan db by download from unicode.org"
	260	else:
	261	print " cvt-utf8 --help display this help text"
	262	print " cvt-utf8 --help-admin display admin help text"
da0f8522	263	print " cvt-utf8 --version report version number"
	264	print " cvt-utf8 --licence display (MIT) licence text"
	265
	266	def licence():
	267	print "cvt-utf8 is copyright 2002-2004 Simon Tatham."
	268	print ""
	269	print "Permission is hereby granted, free of charge, to any person"
	270	print "obtaining a copy of this software and associated documentation files"
	271	print "(the \"Software\"), to deal in the Software without restriction,"
	272	print "including without limitation the rights to use, copy, modify, merge,"
	273	print "publish, distribute, sublicense, and/or sell copies of the Software,"
	274	print "and to permit persons to whom the Software is furnished to do so,"
	275	print "subject to the following conditions:"
	276	print ""
	277	print "The above copyright notice and this permission notice shall be"
	278	print "included in all copies or substantial portions of the Software."
	279	print ""
	280	print "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND,"
	281	print "EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF"
	282	print "MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND"
	283	print "NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS"
	284	print "BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN"
	285	print "ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN"
	286	print "CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE"
	287	print "SOFTWARE."
9acadc2b	288
da0f8522	289	def version():
	290	rev = "$Revision$"
	291	rev = string.replace(rev, " ", "")
	292	rev = string.replace(rev, "$", "")
	293	revs = string.split(rev, ":")
	294	if len(revs) > 1:
30862ac8	295	print "cvt-utf8 revision %s" % revs[1]
9acadc2b	296	else:
30862ac8	297	print "cvt-utf8: unknown version"
da0f8522	298
	299	args = sys.argv[1:]
	300	output_analysis = 1
	301	han_translations = 0
	302	mode = "cmdline"
	303
	304	if args == []:
	305	usage("")
9acadc2b	306	sys.exit(0)
9acadc2b	307
da0f8522	308	while len(args) > 0 and args[0][:1] == "-":
da0f8522	309	if args[0] == "--help" or args[0] == "--help-admin":
30862ac8	310	usage(args[0])
30862ac8	311	sys.exit(0)
da0f8522	312
da0f8522	313	elif args[0] == "--licence" or args[0] == "--license":
30862ac8	314	licence()
30862ac8	315	sys.exit(0)
da0f8522	316
da0f8522	317	elif args[0] == "--version":
30862ac8	318	version()
30862ac8	319	sys.exit(0)
da0f8522	320
da0f8522	321	elif args[0] == "-o" or args[0] == "--output":
30862ac8	322	output_analysis = 0
30862ac8	323	args = args[1:]
da0f8522	324
da0f8522	325	elif args[0] == "-h" or args[0] == "--han":
30862ac8	326	han_translations = 1
30862ac8	327	args = args[1:]
da0f8522	328
da0f8522	329	elif args[0] == "--build" or args[0] == "--fetch-build":
30862ac8	330	if args[0] == "--build":
	331	if len(args) != 3:
	332	print "cvt-utf8: --build expects two filename arguments"
	333	sys.exit(1)
	334	infile = open(args[1], "r")
	335	outfile = args[2]
	336	else:
	337	if len(args) != 2:
	338	print "cvt-utf8: --fetch-build expects one filename argument"
	339	sys.exit(1)
	340	import urllib
	341	infile = urllib.urlopen("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
	342	outfile = args[1]
	343	# Now build the database.
	344	if outfile[-3:] == ".db":
	345	print "cvt-utf8: warning: you should not append .db to db name"
	346
	347	db = anydbm.open(outfile, "n")
	348	while 1:
	349	s = infile.readline()
	350	if s == "": break
	351	ss = string.split(s, ";")[0]
	352	db[ss] = s
	353	db.close()
	354	sys.exit(0)
da0f8522	355
da0f8522	356	elif args[0] == "--build-unihan" or args[0] == "--fetch-build-unihan":
30862ac8	357	if args[0] == "--build-unihan":
	358	if len(args) != 3:
	359	print "cvt-utf8: --build expects two filename arguments"
	360	sys.exit(1)
	361	infile = open(args[1], "r")
	362	s = infile.read(1)
	363	# Unihan.txt starts with a hash. If this file starts with a
	364	# P, we assume it's a zip file ("PK").
	365	if s == "P":
	366	infile = zip_untangler(infile, s)
	367	s = ""
	368	outfile = args[2]
	369	else:
	370	if len(args) != 2:
	371	print "cvt-utf8: --fetch-build-unihan expects one filename argument"
	372	sys.exit(1)
	373	import urllib
	374	infile = urllib.urlopen("ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip")
	375	# We know this one is zipped.
	376	infile = zip_untangler(infile, "")
	377	outfile = args[1]
	378	s = ""
	379	# Now build the database.
	380	if outfile[-3:] == ".db":
	381	print "cvt-utf8: warning: you should not append .db to db name"
	382
	383	db = anydbm.open(outfile, "n")
	384	while 1:
	385	s = s + infile.readline()
	386	if s == "": break
	387	while s[-1:] == "\r" or s[-1:] == "\n":
	388	s = s[:-1]
	389	sa = string.split(s, "\t")
	390	if len(sa) == 3 and sa[1] == "kDefinition" and sa[0][:2] == "U+":
	391	db[sa[0][2:]] = sa[2]
	392	s = ""
	393	db.close()
	394	sys.exit(0)
da0f8522	395
da0f8522	396	elif args[0] == "--test":
30862ac8	397	mode = "test"
30862ac8	398	args = args[1:]
da0f8522	399
da0f8522	400	elif args[0] == "--input" or args[0] == "-i":
30862ac8	401	mode = "input"
30862ac8	402	args = args[1:]
da0f8522	403
9acadc2b	404	else:
30862ac8	405	sys.stderr.write("cvt-utf8: unknown argument '%s'" % args[0])
30862ac8	406	sys.exit(1)
9acadc2b	407
	408	locations = []
	409	locations.append("/usr/share/unicode/unicode")
	410	locations.append("/usr/lib/unicode/unicode")
	411	locations.append("/usr/local/share/unicode/unicode")
	412	locations.append("/usr/local/lib/unicode/unicode")
	413	locations.append(os.environ["HOME"] + "/share/unicode/unicode")
	414	locations.append(os.environ["HOME"] + "/lib/unicode/unicode")
	415
	416	for loc in locations:
	417	try:
30862ac8	418	db = anydbm.open(loc, "r")
9acadc2b	419	except IOError:
30862ac8	420	db = None
9acadc2b	421	except anydbm.error:
30862ac8	422	db = None
9acadc2b	423	if db != None:
30862ac8	424	break
9acadc2b	425	if han_translations:
	426	i = string.rfind(loc, "/")
	427	assert i >= 0
	428	hanloc = loc[:i+1] + "unihan"
	429	handb = anydbm.open(hanloc, "r")
	430	# this has been explicitly required, so we don't squelch exceptions
	431
da0f8522	432	if mode == "test":
9acadc2b	433	do(["CE","BA","E1","BD","B9","CF","83","CE","BC","CE","B5"])
	434	do(["00"])
	435	do(["C2","80"])
	436	do(["E0","A0","80"])
	437	do(["F0","90","80","80"])
	438	do(["F8","88","80","80","80"])
	439	do(["FC","84","80","80","80","80"])
	440	do(["7F"])
	441	do(["DF","BF"])
	442	do(["EF","BF","BF"])
	443	do(["F7","BF","BF","BF"])
	444	do(["FB","BF","BF","BF","BF"])
	445	do(["FD","BF","BF","BF","BF","BF"])
	446	do(["ED","9F","BF"])
	447	do(["EE","80","80"])
	448	do(["EF","BF","BD"])
	449	do(["F4","8F","BF","BF"])
	450	do(["F4","90","80","80"])
	451	do(["80"])
	452	do(["BF"])
	453	do(["80","BF"])
	454	do(["80","BF","80"])
	455	do(["80","BF","80","BF"])
	456	do(["80","BF","80","BF","80"])
	457	do(["80","BF","80","BF","80","BF"])
	458	do(["80","BF","80","BF","80","BF","80"])
	459	do(["80","81","82","83","84","85","86","87",
	460	"88","89","8A","8B","8C","8D","8E","8F",
	461	"90","91","92","93","94","95","96","97",
	462	"98","99","9A","9B","9C","9D","9E","9F",
	463	"A0","A1","A2","A3","A4","A5","A6","A7",
	464	"A8","A9","AA","AB","AC","AD","AE","AF",
	465	"B0","B1","B2","B3","B4","B5","B6","B7",
	466	"B8","B9","BA","BB","BC","BD","BE","BF"])
	467	do(["C0","20","C1","20","C2","20","C3","20",
	468	"C4","20","C5","20","C6","20","C7","20",
	469	"C8","20","C9","20","CA","20","CB","20",
	470	"CC","20","CD","20","CE","20","CF","20",
	471	"D0","20","D1","20","D2","20","D3","20",
	472	"D4","20","D5","20","D6","20","D7","20",
	473	"D8","20","D9","20","DA","20","DB","20",
	474	"DC","20","DD","20","DE","20","DF","20"])
	475	do(["E0","20","E1","20","E2","20","E3","20",
	476	"E4","20","E5","20","E6","20","E7","20",
	477	"E8","20","E9","20","EA","20","EB","20",
	478	"EC","20","ED","20","EE","20","EF","20"])
	479	do(["F0","20","F1","20","F2","20","F3","20",
	480	"F4","20","F5","20","F6","20","F7","20"])
	481	do(["F8","20","F9","20","FA","20","FB","20"])
	482	do(["FC","20","FD","20"])
	483	do(["C0"])
	484	do(["E0","80"])
	485	do(["F0","80","80"])
	486	do(["F8","80","80","80"])
	487	do(["FC","80","80","80","80"])
	488	do(["DF"])
	489	do(["EF","BF"])
	490	do(["F7","BF","BF"])
	491	do(["FB","BF","BF","BF"])
	492	do(["FD","BF","BF","BF","BF"])
	493	do(["C0","E0","80","F0","80","80","F8","80",
	494	"80","80","FC","80","80","80","80",
	495	"DF","EF","BF","F7","BF","BF","FB",
	496	"BF","BF","BF","FD","BF","BF","BF","BF"])
497	do(["FE"])
498	do(["FF"])
499	do(["FE","FE","FF","FF"])
500	do(["C0","AF"])
501	do(["E0","80","AF"])
502	do(["F0","80","80","AF"])
503	do(["F8","80","80","80","AF"])
504	do(["FC","80","80","80","80","AF"])
505	do(["C1","BF"])
506	do(["E0","9F","BF"])
507	do(["F0","8F","BF","BF"])
508	do(["F8","87","BF","BF","BF"])
509	do(["FC","83","BF","BF","BF","BF"])
510	do(["C0","80"])
511	do(["E0","80","80"])
512	do(["F0","80","80","80"])
513	do(["F8","80","80","80","80"])
514	do(["FC","80","80","80","80","80"])
515	do(["ED","A0","80"])
516	do(["ED","AD","BF"])
517	do(["ED","AE","80"])
518	do(["ED","AF","BF"])
519	do(["ED","B0","80"])
520	do(["ED","BE","80"])
521	do(["ED","BF","BF"])
522	do(["ED","A0","80","ED","B0","80"])
523	do(["ED","A0","80","ED","BF","BF"])
524	do(["ED","AD","BF","ED","B0","80"])
525	do(["ED","AD","BF","ED","BF","BF"])
526	do(["ED","AE","80","ED","B0","80"])
527	do(["ED","AE","80","ED","BF","BF"])
528	do(["ED","AF","BF","ED","B0","80"])
529	do(["ED","AF","BF","ED","BF","8F"])
530	do(["EF","BF","BE"])
531	do(["EF","BF","BF"])
da0f8522	532	elif mode == "input":
9acadc2b	533	def getchar():
30862ac8	534	s = sys.stdin.read(1)
	535	if s == "":
	536	return None
	537	return ord(s) & 0xFF # ensure it isn't negative
9acadc2b	538	process_utf8(getchar)
	539	else:
	540	do(args)