[sgt/utils] / cvt-utf8 / cvt-utf8

#!/usr/bin/env python 

import sys
import string
import os
import anydbm
import zlib

class zip_untangler:
    """Line-at-a-time reader for the first member of a zip archive.

    Wraps a file-like object whose contents start with a zip local file
    header and exposes the member's decompressed data through a
    file-style readline().  The header is parsed lazily, on the first
    readline() call.  The member data is always fed through a raw
    deflate decompressor; the header's compression-method field is not
    inspected, and nothing after the first member is read.

    Requires Python 2.6 or later (bytes literals, bytearray).
    """

    def __init__(self, file, datasofar):
        """Set up a reader over `file`.

        file      -- object with a read(n) method returning byte strings;
                     read() may return fewer than n bytes.
        datasofar -- bytes the caller has already consumed from the start
                     of `file` (must be shorter than the 30-byte fixed
                     header), e.g. a first byte read to sniff the type.
        """
        self.file = file
        assert len(datasofar) < 30
        self.header = datasofar
        self.data = b""          # decompressed bytes not yet returned
        self.dataleft = None     # compressed bytes still unread; None = header not parsed
        # Zip members hold a bare deflate stream with no zlib header or
        # trailer; a negative wbits value asks zlib for exactly that.
        self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)

    def _fill_header(self, size):
        """Read from the file until self.header holds at least `size` bytes.

        Raises IOError if the file ends first.
        """
        while len(self.header) < size:
            s = self.file.read(size - len(self.header))
            if not s:
                raise IOError("zip_untangler: unexpected end of file "
                              "inside zip header")
            self.header = self.header + s

    def _read_header(self):
        """Consume the local file header and initialise self.dataleft."""
        self._fill_header(30)
        fixed = bytearray(self.header[:30])
        # Name length and extra length (little-endian 16-bit fields).
        namelen = fixed[26] | (fixed[27] << 8)
        extralen = fixed[28] | (fixed[29] << 8)
        # The member data starts after the variable-length name and
        # extra fields, so swallow those too.
        self._fill_header(30 + namelen + extralen)
        # Compressed size (little-endian 32-bit field at offset 18).
        self.dataleft = (fixed[18] | (fixed[19] << 8) |
                         (fixed[20] << 16) | (fixed[21] << 24))

    def readline(self):
        """Return the next decompressed line, including its newline.

        The final line is returned without a newline if the data does
        not end with one; an empty byte string signals end of data.
        Raises IOError if the zip header is truncated.
        """
        if self.dataleft is None:
            self._read_header()
        k = self.data.find(b"\n")
        while k < 0 and self.dataleft > 0:
            d = self.file.read(min(self.dataleft, 4096))
            if not d:
                break
            # Account for what was actually delivered: read() may return
            # less than requested (notably on network streams).
            self.dataleft = self.dataleft - len(d)
            self.data = self.data + self.decompress.decompress(d)
            k = self.data.find(b"\n")
        if k < 0:
            ret, self.data = self.data, b""
        else:
            ret, self.data = self.data[:k+1], self.data[k+1:]
        return ret

def hexstr(x):
    """Return integer x as bare lower-case hexadecimal digits.

    The result carries no "0x" prefix and no long-integer "L" suffix.
    A negative value is rendered as a minus sign followed by the digits
    of its magnitude.
    """
    # The %x conversion never emits the decorations that hex() adds, so
    # nothing has to be stripped afterwards.
    return "%x" % x

def charname(x):
    """Return a descriptive name for code point x, or "" if no db is open.

    Reads the module-level `db` (main character database),
    `han_translations` (flag) and, when that flag is set, `handb`.
    Keys are upper-case hex, zero-padded to at least four digits.
    A Han definition, when requested and present, takes precedence and
    is returned prefixed with "<han> "; otherwise the second
    semicolon-separated field of the main database record is returned,
    or "<no name available>" when the key is missing.
    """
    if not db:
        return ""

    digits = hexstr(x)
    lookup = ("0" * (4 - len(digits)) + digits).upper()

    if han_translations:
        try:
            return "<han> " + handb[lookup]
        except KeyError:
            pass  # no Han definition; fall back to the main database

    try:
        record = db[lookup]
    except KeyError:
        return "<no name available>"
    return record.split(";")[1]

def output(char, bytes, errors):
    """Emit one decoded item, in the format selected by `output_analysis`.

    char   -- the code point, or -1 when the bytes did not decode to one.
    bytes  -- list of integer byte values making up the item.
    errors -- diagnostic text to append ("" when the item is clean).

    In analysis mode a line is printed holding the code point, the bytes
    in hex (padded to six columns), the character name if one is
    available, and the diagnostics.  Otherwise the bytes themselves are
    written to stdout, with any problem item replaced by the UTF-8
    encoding of U+FFFD.
    """
    if not output_analysis:
        if char == -1 or errors != "":
            # problem chars become U+FFFD REPLACEMENT CHARACTER
            sys.stdout.write("\xEF\xBF\xBD")
        else:
            for value in bytes:
                sys.stdout.write(chr(value))
        return

    if char == -1:
        line = "           "
        name = ""
    else:
        line = "U-%08X " % char
        name = charname(char)

    line = line + "".join([" %02X" % value for value in bytes])
    # Pad the byte column out to six entries so the names line up.
    line = line + "   " * (6 - len(bytes))

    if name != "":
        line = line + " " + name
    print(line + errors)

def process_ucs(x, bytes=[], errors=""):
    """Analyse code point x and hand the result to output().

    x      -- the code point (asserted to be below 0x80000000).
    bytes  -- the byte values x was actually decoded from, if any; the
              default list is only read, never modified.
    errors -- diagnostics accumulated so far.

    The shortest UTF-8 encoding of x is computed.  If `bytes` is longer
    than that, the item is flagged as overlong and the original bytes
    are the ones reported.  Surrogates (U+D800..U+DFFF) and U+FFFE /
    U+FFFF are flagged as well.
    """
    if x < 0x80:
        utf8 = [x]
        realbytes = 1
    else:
        # (exclusive upper bound, lead-byte marker, continuation bytes)
        forms = (
            (0x800, 0xC0, 1),
            (0x10000, 0xE0, 2),
            (0x200000, 0xF0, 3),
            (0x4000000, 0xF8, 4),
            (0x80000000, 0xFC, 5),
        )
        for limit, lead, trail in forms:
            if x < limit:
                break
        # If no form matched, the loop variables still describe the
        # six-byte form and this assertion reports the out-of-range x.
        assert x < limit
        realbytes = trail + 1
        utf8 = [lead + (x >> (6 * trail))]
        shift = 6 * (trail - 1)
        while shift >= 0:
            utf8.append(0x80 + (0x3F & (x >> shift)))
            shift = shift - 6

    if bytes != [] and len(bytes) > realbytes:
        shortest = "".join([" %02X" % value for value in utf8])
        errors = errors + " (overlong form of" + shortest + ")"
        utf8 = bytes
    if 0xD800 <= x <= 0xDFFF:
        errors = errors + " (surrogate)"
    if 0xFFFE <= x <= 0xFFFF:
        errors = errors + " (invalid char)"

    output(x, utf8, errors)

def process_utf8(next):
    """Decode a stream of byte values as UTF-8 and report each item.

    next -- callable taking no arguments that returns the next byte as
            an integer, or None once the input is exhausted.

    Well-formed sequences (including the obsolete five- and six-byte
    forms) are passed to process_ucs() together with their bytes.
    Malformed input is reported through output() with a code point of
    -1: lone continuation bytes, lead bytes whose sequence is cut short,
    and values that can never appear in UTF-8.  The last category covers
    0xFE and 0xFF and also anything outside 0..0xFF, which can arrive
    here when the values come from command-line arguments.

    A byte that cuts a sequence short is not consumed by that sequence;
    it is processed as the start of the next item.
    """
    # (first lead byte, last lead byte, continuation bytes that follow)
    lead_ranges = (
        (0xC0, 0xDF, 1),
        (0xE0, 0xEF, 2),
        (0xF0, 0xF7, 3),
        (0xF8, 0xFB, 4),
        (0xFC, 0xFD, 5),
    )

    c = next()
    while c is not None:
        char = [c]
        if c < 0 or c >= 0xFE:
            output(-1, char, " (invalid UTF-8 byte)")
            c = next()
        elif c < 0x80:
            process_ucs(c)  # single-byte char
            c = next()
        elif c <= 0xBF:
            output(-1, char, " (unexpected continuation byte)")
            c = next()
        else:
            # c is now known to lie in 0xC0..0xFD, so exactly one range
            # matches and acc/cbytes are always freshly assigned.
            for first, last, cbytes in lead_ranges:
                if first <= c <= last:
                    acc = c & ~first
                    break
            interrupted = 0
            while cbytes > 0:
                c = next()
                if c is None or c < 0x80 or c > 0xBF:
                    # Not a continuation byte: leave it in c so the
                    # outer loop treats it as the next item.
                    interrupted = 1
                    break
                char.append(c)
                acc = (acc << 6) + (c & 0x3F)
                cbytes = cbytes - 1
            if not interrupted:
                c = next()
            if cbytes > 0:
                output(-1, char, " (incomplete sequence)")
            else:
                process_ucs(acc, char)

def do(args):
    """Process a sequence of command-line style arguments.

    Each argument is either a hex byte value (e.g. "e2") or a code point
    written as "U+XXXX" / "U-XXXXXXXX" (the leading U may be lower
    case).  Consecutive byte arguments are collected and decoded
    together by process_utf8(); a code-point argument first flushes any
    collected bytes and is then passed to process_ucs().

    Raises ValueError for an argument that is empty, is not valid
    hexadecimal, or starts with U but is not followed by "+" or "-".
    """
    # Class to turn a list into a callable object that returns one
    # element at a time, then None once the list is used up.
    class liststepper:
        def __init__(self, list):
            self.list = list
            self.index = 0
        def __call__(self):
            if self.index >= len(self.list):
                return None
            ret = self.list[self.index]
            self.index = self.index + 1
            return ret

    pending = []
    for arg in args:
        if arg[:1].upper() == "U":
            if pending:
                process_utf8(liststepper(pending))
                pending = []
            # Explicit check rather than assert: this is user input and
            # must still be validated when running under -O.
            if arg[1:2] not in ("+", "-"):
                raise ValueError("malformed code point argument '%s' "
                                 "(expected U+XXXX or U-XXXXXXXX)" % arg)
            process_ucs(int(arg[2:], 16))
        else:
            pending.append(int(arg, 16))

    if pending:
        process_utf8(liststepper(pending))

def usage(arg):
    """Print the help text to stdout.

    arg -- the option that triggered the help; when it is
           "--help-admin" the database-building options are listed too,
           otherwise only the user-level text is shown.
    """
    common = (
        "Usage: cvt-utf8 [flags] <hex UTF-8 bytes and/or U+codepoints>",
        "  e.g. cvt-utf8 e2 82 ac",
        "    or cvt-utf8 U+20ac",
        "    or cvt-utf8 U-10ffff",
        "",
        "Flags: -o or --output        just output well-formed UTF-8 instead of",
        "                             an analysis of the input data",
        "       -h or --han           also give Han definitions from unihan db",
        "",
        "Also:  cvt-utf8 --test       run Markus Kuhn's decoder stress tests",
        "       cvt-utf8 --input (or -i)",
        "                             read, analyse and decode UTF-8 from stdin",
    )
    admin_help = (
        "       cvt-utf8 --help       display user help text",
        "       cvt-utf8 --help-admin display admin help text (this one)",
        "       cvt-utf8 --build <infile> <outfile>",
        "                             convert UnicodeData.txt to unicode db",
        "       cvt-utf8 --build-unihan <infile> <outfile>",
        "                             convert Unihan.txt to unihan db",
        "       cvt-utf8 --fetch-build <outfile>",
        "                             "
        "build unicode db by download from unicode.org",
        "       cvt-utf8 --fetch-build-unihan <outfile>",
        "                             "
        "build Unihan db by download from unicode.org",
    )
    user_help = (
        "       cvt-utf8 --help       display this help text",
        "       cvt-utf8 --help-admin display admin help text",
    )
    closing = (
        "       cvt-utf8 --version    report version number",
        "       cvt-utf8 --licence    display (MIT) licence text",
    )

    if arg == "--help-admin":
        middle = admin_help
    else:
        middle = user_help
    for line in common + middle + closing:
        print(line)

def licence():
    """Print the copyright notice and licence text to stdout."""
    text = (
        "cvt-utf8 is copyright 2002-2004 Simon Tatham.",
        "",
        "Permission is hereby granted, free of charge, to any person",
        "obtaining a copy of this software and associated documentation files",
        '(the "Software"), to deal in the Software without restriction,',
        "including without limitation the rights to use, copy, modify, merge,",
        "publish, distribute, sublicense, and/or sell copies of the Software,",
        "and to permit persons to whom the Software is furnished to do so,",
        "subject to the following conditions:",
        "",
        "The above copyright notice and this permission notice shall be",
        "included in all copies or substantial portions of the Software.",
        "",
        'THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,',
        "EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF",
        "MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND",
        "NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS",
        "BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN",
        "ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN",
        "CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE",
        "SOFTWARE.",
    )
    for line in text:
        print(line)

def version():
    """Print the revision number taken from the Revision keyword string.

    The keyword string has its spaces and dollar signs removed and is
    then split on ":"; when that yields a second field it is printed as
    the revision, otherwise "unknown version" is reported.
    """
    keyword = "$Revision$"
    fields = keyword.replace(" ", "").replace("$", "").split(":")
    if len(fields) > 1:
        print("cvt-utf8 revision %s" % fields[1])
    else:
        print("cvt-utf8: unknown version")

# Global settings, adjusted by the option parsing that follows.
mode = "cmdline"        # "cmdline", "test" or "input"
han_translations = 0    # non-zero: also look names up in the unihan db
output_analysis = 1     # zero: write UTF-8 instead of an analysis
args = sys.argv[1:]

# With nothing to do, show the user-level help and stop.
if not args:
    usage("")
    sys.exit(0)

# Consume leading option arguments.  Informational options and the
# database-building options do their work and exit immediately; the
# remaining ones only adjust the global settings and are stripped from
# `args`, so that what is left afterwards is the data to analyse.
while len(args) > 0 and args[0][:1] == "-":
    opt = args[0]

    if opt in ("--help", "--help-admin"):
        usage(opt)
        sys.exit(0)

    elif opt in ("--licence", "--license"):
        licence()
        sys.exit(0)

    elif opt == "--version":
        version()
        sys.exit(0)

    elif opt in ("-o", "--output"):
        output_analysis = 0
        args = args[1:]

    elif opt in ("-h", "--han"):
        han_translations = 1
        args = args[1:]

    elif opt in ("--build", "--fetch-build"):
        if opt == "--build":
            if len(args) != 3:
                print("cvt-utf8: --build expects two filename arguments")
                sys.exit(1)
            infile = open(args[1], "r")
            outfile = args[2]
        else:
            if len(args) != 2:
                print("cvt-utf8: --fetch-build expects one filename argument")
                sys.exit(1)
            import urllib
            infile = urllib.urlopen("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
            outfile = args[1]
        # Now build the database.
        if outfile[-3:] == ".db":
            print("cvt-utf8: warning: you should not append .db to db name")

        db = anydbm.open(outfile, "n")
        while 1:
            s = infile.readline()
            if s == "":
                break
            # Key each record by its first semicolon-separated field and
            # store the whole line as the value.
            db[s.split(";")[0]] = s
        db.close()
        sys.exit(0)

    elif opt in ("--build-unihan", "--fetch-build-unihan"):
        if opt == "--build-unihan":
            if len(args) != 3:
                print("cvt-utf8: --build-unihan expects two filename arguments")
                sys.exit(1)
            # Binary mode: the input may be a zip archive, which text
            # mode would corrupt on platforms that translate newlines.
            # Stray "\r" characters are stripped from each line below.
            infile = open(args[1], "rb")
            s = infile.read(1)
            # Unihan.txt starts with a hash. If this file starts with a
            # P, we assume it's a zip file ("PK").
            if s == "P":
                infile = zip_untangler(infile, s)
                s = ""
            outfile = args[2]
        else:
            if len(args) != 2:
                print("cvt-utf8: --fetch-build-unihan expects one filename argument")
                sys.exit(1)
            import urllib
            infile = urllib.urlopen("ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip")
            # We know this one is zipped.
            infile = zip_untangler(infile, "")
            outfile = args[1]
            s = ""
        # Now build the database.
        if outfile[-3:] == ".db":
            print("cvt-utf8: warning: you should not append .db to db name")

        db = anydbm.open(outfile, "n")
        while 1:
            # On the first pass `s` may already hold the byte that was
            # read to sniff the file type; afterwards it is empty.
            s = s + infile.readline()
            if s == "":
                break
            while s[-1:] == "\r" or s[-1:] == "\n":
                s = s[:-1]
            sa = s.split("\t")
            # Keep only "U+XXXX<tab>kDefinition<tab>text" lines, keyed
            # by the hex digits after "U+".
            if len(sa) == 3 and sa[1] == "kDefinition" and sa[0][:2] == "U+":
                db[sa[0][2:]] = sa[2]
            s = ""
        db.close()
        sys.exit(0)

    elif opt == "--test":
        mode = "test"
        args = args[1:]

    elif opt in ("--input", "-i"):
        mode = "input"
        args = args[1:]

    else:
        sys.stderr.write("cvt-utf8: unknown argument '%s'\n" % opt)
        sys.exit(1)

# Candidate base names for the character-name database, tried in order.
locations = [
    "/usr/share/unicode/unicode",
    "/usr/lib/unicode/unicode",
    "/usr/local/share/unicode/unicode",
    "/usr/local/lib/unicode/unicode",
]
# Per-user locations are only meaningful when HOME is set; without this
# guard an unset HOME would abort the program with a KeyError.
home = os.environ.get("HOME")
if home is not None:
    locations.append(home + "/share/unicode/unicode")
    locations.append(home + "/lib/unicode/unicode")

# Open the first database that can be opened; db is left as None when
# none can, in which case no character names are shown.
db = None
for loc in locations:
    try:
        db = anydbm.open(loc, "r")
    except IOError:
        db = None
    except anydbm.error:
        db = None
    if db is not None:
        break
if han_translations:
    # The unihan db is looked for next to the location tried last.
    i = loc.rfind("/")
    assert i >= 0
    hanloc = loc[:i+1] + "unihan"
    handb = anydbm.open(hanloc, "r")
    # this has been explicitly required, so we don't squelch exceptions

if mode == "test":
    # Decoder stress tests (advertised in the help text as Markus Kuhn's).
    # Each entry is one do() invocation, written as space-separated hex
    # bytes.

    def lead_bytes_then_space(first, last):
        # "XX 20" for every lead byte XX in first..last inclusive.
        return " ".join(["%02X 20" % b for b in range(first, last + 1)])

    stress_tests = [
        # A short run of valid two- and three-byte sequences.
        "CE BA E1 BD B9 CF 83 CE BC CE B5",

        # Smallest code point of each encoded length.
        "00",
        "C2 80",
        "E0 A0 80",
        "F0 90 80 80",
        "F8 88 80 80 80",
        "FC 84 80 80 80 80",

        # Largest code point of each encoded length.
        "7F",
        "DF BF",
        "EF BF BF",
        "F7 BF BF BF",
        "FB BF BF BF BF",
        "FD BF BF BF BF BF",

        # Other boundary values.
        "ED 9F BF",
        "EE 80 80",
        "EF BF BD",
        "F4 8F BF BF",
        "F4 90 80 80",

        # Continuation bytes with no lead byte.
        "80",
        "BF",
        "80 BF",
        "80 BF 80",
        "80 BF 80 BF",
        "80 BF 80 BF 80",
        "80 BF 80 BF 80 BF",
        "80 BF 80 BF 80 BF 80",
        # Every possible continuation byte, 80..BF.
        " ".join(["%02X" % b for b in range(0x80, 0xC0)]),

        # Every lead byte of each length, each followed by a space.
        lead_bytes_then_space(0xC0, 0xDF),
        lead_bytes_then_space(0xE0, 0xEF),
        lead_bytes_then_space(0xF0, 0xF7),
        lead_bytes_then_space(0xF8, 0xFB),
        lead_bytes_then_space(0xFC, 0xFD),

        # Sequences missing their final continuation byte.
        "C0",
        "E0 80",
        "F0 80 80",
        "F8 80 80 80",
        "FC 80 80 80 80",
        "DF",
        "EF BF",
        "F7 BF BF",
        "FB BF BF BF",
        "FD BF BF BF BF",
        # The ten incomplete sequences above, run together.
        "C0 E0 80 F0 80 80 F8 80 80 80 FC 80 80 80 80 "
        "DF EF BF F7 BF BF FB BF BF BF FD BF BF BF BF",

        # Bytes that can never appear in UTF-8.
        "FE",
        "FF",
        "FE FE FF FF",

        # Overlong encodings of "/" (2F).
        "C0 AF",
        "E0 80 AF",
        "F0 80 80 AF",
        "F8 80 80 80 AF",
        "FC 80 80 80 80 AF",

        # Largest value that is still overlong at each length.
        "C1 BF",
        "E0 9F BF",
        "F0 8F BF BF",
        "F8 87 BF BF BF",
        "FC 83 BF BF BF BF",

        # Overlong encodings of NUL.
        "C0 80",
        "E0 80 80",
        "F0 80 80 80",
        "F8 80 80 80 80",
        "FC 80 80 80 80 80",

        # Single surrogates.
        "ED A0 80",
        "ED AD BF",
        "ED AE 80",
        "ED AF BF",
        "ED B0 80",
        "ED BE 80",
        "ED BF BF",

        # Surrogate pairs: each of D800, DB7F, DB80, DBFF followed by
        # DC00 and then by DFFF.
        "ED A0 80 ED B0 80",
        "ED A0 80 ED BF BF",
        "ED AD BF ED B0 80",
        "ED AD BF ED BF BF",
        "ED AE 80 ED B0 80",
        "ED AE 80 ED BF BF",
        "ED AF BF ED B0 80",
        # NOTE(review): this vector previously ended in 8F, which broke
        # the pattern of the seven pairs above; corrected to BF (DFFF).
        "ED AF BF ED BF BF",

        # U+FFFE and U+FFFF.
        "EF BF BE",
        "EF BF BF",
    ]
    for vector in stress_tests:
        do(vector.split())
elif mode == "input":
    # Decode standard input, fetching one byte at a time so that each
    # item can be reported as soon as it is complete.
    def getchar():
        s = sys.stdin.read(1)
        if s == "":
            return None
        return ord(s) & 0xFF   # ensure it isn't negative
    process_utf8(getchar)
else:
    # Ordinary use: whatever is left on the command line is the data.
    do(args)
Commit	Line	Data
9acadc2b	1	#!/usr/bin/env python
	2
	3	import sys
	4	import string
	5	import os
	6	import anydbm
	7	import zlib
	8
	9	class zip_untangler:
	10	def __init__(self, file, datasofar):
	11	self.file = file
	12	assert len(datasofar) < 30
	13	self.header = datasofar
	14	self.data = ""
	15	self.dataleft = None
	16	self.decompress = zlib.decompressobj()
	17	# Zlib header bytes, expected by decompress obj but not
	18	# present in zip file
	19	self.decompress.decompress("\x78\x9c")
	20
	21	def readline(self):
	22	if self.dataleft == None:
	23	while len(self.header) < 30:
	24	s = self.file.read(30 - len(self.header))
	25	assert s != ""
	26	self.header = self.header + s
	27	# Name length and extra length.
	28	namelen = 256 * ord(self.header[27]) + ord(self.header[26])
	29	extralen = 256 * ord(self.header[29]) + ord(self.header[28])
	30	while len(self.header) < 30 + namelen + extralen:
	31	s = self.file.read(30 + namelen + extralen - len(self.header))
	32	assert s != ""
	33	self.header = self.header + s
	34	self.dataleft = \
	35	256 * (256 * (256 * ord(self.header[21]) + ord(self.header[20])) \
	36	+ ord(self.header[19])) + ord(self.header[18])
	37	k = string.find(self.data, "\n")
	38	while k < 0:
	39	rlen = self.dataleft
	40	if rlen > 4096: rlen = 4096
	41	if rlen == 0: break
	42	d = self.file.read(rlen)
	43	if d == "": break
	44	self.dataleft = self.dataleft - rlen
	45	self.data = self.data + self.decompress.decompress(d)
	46	k = string.find(self.data, "\n")
	47	if k < 0:
	48	ret = self.data
	49	self.data = ""
	50	return ret
	51	else:
	52	ret = self.data[:k+1]
	53	self.data = self.data[k+1:]
	54	return ret
	55
	56	def hexstr(x):
	57	s = hex(x)
	58	if s[-1:] == "L" or s[-1:] == "l":
	59	s = s[:-1]
	60	if s[:2] == "0x" or s[:2] == "0X":
	61	s = s[2:]
	62	return s
	63
	64	def charname(x):
65	if db:
66	key = hexstr(x)
67	while len(key) < 4: key = "0" + key
68	key = string.upper(key)
69	if han_translations:
70	try:
71	value = handb[key]
72	return "<han> " + value
73	except KeyError:
74	pass
75	try:
76	value = db[key]
77	return string.split(value, ";")[1]
78	except KeyError:
79	return "<no name available>"
80	else:
81	return ""
82
83	def output(char, bytes, errors):
84	if output_analysis:
85	if char == -1:
86	s = " "
87	else:
88	s = "U-%08X " % char
89	for i in bytes:
90	s = s + " %02X" % i
91	for i in range(6-len(bytes)):
92	s = s + " "
93
94	if char == -1:
95	name = ""
96	else:
97	name = charname(char)
98	if name != "":
99	s = s + " " + name
100	s = s + errors
101	print s
102	else:
103	if char == -1 or errors != "":
104	# problem chars become U+FFFD REPLACEMENT CHARACTER
105	sys.stdout.write("\xEF\xBF\xBD")
106	else:
107	for i in bytes:
108	sys.stdout.write(chr(i))
109
110	def process_ucs(x, bytes=[], errors=""):
111	if x < 0x80:
112	utf8 = [x]
113	realbytes = 1
114	else:
115	if x < 0x800:
116	tmp = (0xC0, 1)
117	elif x < 0x10000:
118	tmp = (0xE0, 2)
119	elif x < 0x200000:
120	tmp = (0xF0, 3)
121	elif x < 0x4000000:
122	tmp = (0xF8, 4)
123	else:
124	assert x < 0x80000000L
125	tmp = (0xFC, 5)
126	realbytes = tmp[1] + 1
127	utf8 = [tmp[0] + (x >> (6*tmp[1]))]
128	for i in range(tmp[1]-1, -1, -1):
129	utf8.append(0x80 + (0x3F & (x >> (i*6))))
130
131	if bytes != [] and len(bytes) > realbytes:
132	errors = errors + " (overlong form of"
133	for i in utf8:
134	errors = errors + " %02X" % i
135	errors = errors + ")"
136	utf8 = bytes
137	if x >= 0xD800 and x <= 0xDFFF:
138	errors = errors + " (surrogate)"
139	if x >= 0xFFFE and x <= 0xFFFF:
140	errors = errors + " (invalid char)"
141
142	output(x, utf8, errors)
143
144	def process_utf8(next):
145	c = next()
146	while c != None:
147	char = [c]
148	i = c
149	if i < 0x80:
150	process_ucs(i) # single-byte char
151	c = next()
152	elif i == 0xfe or i == 0xff:
153	output(-1, char, " (invalid UTF-8 byte)")
154	c = next()
155	elif i >= 0x80 and i <= 0xbf:
156	output(-1, char, " (unexpected continuation byte)")
157	c = next()
158	else:
159	if i >= 0xC0 and i <= 0xDF:
160	acc = i &~ 0xC0
161	cbytes = 1
162	elif i >= 0xE0 and i <= 0xEF:
163	acc = i &~ 0xE0
164	cbytes = 2
165	elif i >= 0xF0 and i <= 0xF7:
166	acc = i &~ 0xF0
167	cbytes = 3
168	elif i >= 0xF8 and i <= 0xFB:
169	acc = i &~ 0xF8
170	cbytes = 4
171	elif i >= 0xFC and i <= 0xFD:
172	acc = i &~ 0xFC
173	cbytes = 5
174	gotone = 0
175	while cbytes > 0:
176	c = next()
177	if c == None or c < 0x80 or c > 0xBF:
178	gotone = 1
179	break
180	char.append(c)
181	acc = (acc << 6) + (c & 0x3F)
182	cbytes = cbytes - 1
183	if not gotone:
184	c = next()
185	if cbytes > 0:
186	output(-1, char, " (incomplete sequence)")
187	else:
188	process_ucs(acc, char)
189
190	def do(args):
191	# Class to turn a list into a callable object that returns one
192	# element at a time.
193	class liststepper:
194	def __init__(self, list):
195	self.list = list
196	self.index = 0
197	def __call__(self):
198	if self.index >= len(self.list):
199	return None
200	ret = self.list[self.index]
201	self.index = self.index + 1
202	return ret
203
204	list = []
205	for arg in args:
206	if string.upper(arg[0]) == "U":
207	if len(list) > 0:
208	process_utf8(liststepper(list))
209	list = []
210	assert arg[1] == "+" or arg[1] == "-"
211	process_ucs(string.atoi(arg[2:], 16))
212	else:
213	list.append(string.atoi(arg, 16))
214
215	if len(list) > 0:
216	process_utf8(liststepper(list))
217
da0f8522	218	def usage(arg):
9acadc2b	219	print "Usage: cvt-utf8 [flags] <hex UTF-8 bytes and/or U+codepoints>"
	220	print " e.g. cvt-utf8 e2 82 ac"
	221	print " or cvt-utf8 U+20ac"
	222	print " or cvt-utf8 U-10ffff"
	223	print ""
	224	print "Flags: -o or --output just output well-formed UTF-8 instead of"
	225	print " an analysis of the input data"
	226	print " -h or --han also give Han definitions from unihan db"
	227	print ""
	228	print "Also: cvt-utf8 --test run Markus Kuhn's decoder stress tests" #'
	229	print " cvt-utf8 --input (or -i)"
	230	print " read, analyse and decode UTF-8 from stdin"
da0f8522	231	if arg == "--help-admin":
9acadc2b	232	print " cvt-utf8 --help display user help text"
	233	print " cvt-utf8 --help-admin display admin help text (this one)"
	234	print " cvt-utf8 --build <infile> <outfile>"
	235	print " convert UnicodeData.txt to unicode db"
	236	print " cvt-utf8 --build-unihan <infile> <outfile>"
	237	print " convert Unihan.txt to unihan db"
	238	print " cvt-utf8 --fetch-build <outfile>"
	239	print " "+\
	240	"build unicode db by download from unicode.org"
	241	print " cvt-utf8 --fetch-build-unihan <outfile>"
	242	print " "+\
	243	"build Unihan db by download from unicode.org"
	244	else:
	245	print " cvt-utf8 --help display this help text"
	246	print " cvt-utf8 --help-admin display admin help text"
da0f8522	247	print " cvt-utf8 --version report version number"
	248	print " cvt-utf8 --licence display (MIT) licence text"
	249
	250	def licence():
	251	print "cvt-utf8 is copyright 2002-2004 Simon Tatham."
	252	print ""
	253	print "Permission is hereby granted, free of charge, to any person"
	254	print "obtaining a copy of this software and associated documentation files"
	255	print "(the \"Software\"), to deal in the Software without restriction,"
	256	print "including without limitation the rights to use, copy, modify, merge,"
	257	print "publish, distribute, sublicense, and/or sell copies of the Software,"
	258	print "and to permit persons to whom the Software is furnished to do so,"
	259	print "subject to the following conditions:"
	260	print ""
	261	print "The above copyright notice and this permission notice shall be"
	262	print "included in all copies or substantial portions of the Software."
	263	print ""
	264	print "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND,"
	265	print "EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF"
	266	print "MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND"
	267	print "NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS"
	268	print "BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN"
	269	print "ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN"
	270	print "CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE"
	271	print "SOFTWARE."
9acadc2b	272
da0f8522	273	def version():
	274	rev = "$Revision$"
	275	rev = string.replace(rev, " ", "")
	276	rev = string.replace(rev, "$", "")
	277	revs = string.split(rev, ":")
	278	if len(revs) > 1:
	279	print "cvt-utf8 revision %s" % revs[1]
9acadc2b	280	else:
da0f8522	281	print "cvt-utf8: unknown version"
	282
	283	args = sys.argv[1:]
	284	output_analysis = 1
	285	han_translations = 0
	286	mode = "cmdline"
	287
	288	if args == []:
	289	usage("")
9acadc2b	290	sys.exit(0)
9acadc2b	291
da0f8522	292	while len(args) > 0 and args[0][:1] == "-":
	293	if args[0] == "--help" or args[0] == "--help-admin":
	294	usage(args[0])
	295	sys.exit(0)
	296
	297	elif args[0] == "--licence" or args[0] == "--license":
	298	licence()
	299	sys.exit(0)
	300
	301	elif args[0] == "--version":
	302	version()
	303	sys.exit(0)
	304
	305	elif args[0] == "-o" or args[0] == "--output":
	306	output_analysis = 0
	307	args = args[1:]
	308
	309	elif args[0] == "-h" or args[0] == "--han":
	310	han_translations = 1
	311	args = args[1:]
	312
	313	elif args[0] == "--build" or args[0] == "--fetch-build":
	314	if args[0] == "--build":
	315	if len(args) != 3:
	316	print "cvt-utf8: --build expects two filename arguments"
	317	sys.exit(1)
	318	infile = open(args[1], "r")
	319	outfile = args[2]
	320	else:
	321	if len(args) != 2:
	322	print "cvt-utf8: --fetch-build expects one filename argument"
	323	sys.exit(1)
	324	import urllib
	325	infile = urllib.urlopen("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
	326	outfile = args[1]
	327	# Now build the database.
	328	if outfile[-3:] == ".db":
	329	print "cvt-utf8: warning: you should not append .db to db name"
	330
	331	db = anydbm.open(outfile, "n")
	332	while 1:
	333	s = infile.readline()
	334	if s == "": break
	335	ss = string.split(s, ";")[0]
	336	db[ss] = s
	337	db.close()
	338	sys.exit(0)
	339
	340	elif args[0] == "--build-unihan" or args[0] == "--fetch-build-unihan":
	341	if args[0] == "--build-unihan":
	342	if len(args) != 3:
	343	print "cvt-utf8: --build expects two filename arguments"
	344	sys.exit(1)
	345	infile = open(args[1], "r")
	346	s = infile.read(1)
	347	# Unihan.txt starts with a hash. If this file starts with a
	348	# P, we assume it's a zip file ("PK").
	349	if s == "P":
	350	infile = zip_untangler(infile, s)
	351	s = ""
	352	outfile = args[2]
	353	else:
	354	if len(args) != 2:
	355	print "cvt-utf8: --fetch-build-unihan expects one filename argument"
356	sys.exit(1)
357	import urllib
358	infile = urllib.urlopen("ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip")
359	# We know this one is zipped.
360	infile = zip_untangler(infile, "")
361	outfile = args[1]
362	s = ""
363	# Now build the database.
364	if outfile[-3:] == ".db":
365	print "cvt-utf8: warning: you should not append .db to db name"
366
367	db = anydbm.open(outfile, "n")
368	while 1:
369	s = s + infile.readline()
370	if s == "": break
371	while s[-1:] == "\r" or s[-1:] == "\n":
372	s = s[:-1]
373	sa = string.split(s, "\t")
374	if len(sa) == 3 and sa[1] == "kDefinition" and sa[0][:2] == "U+":
375	db[sa[0][2:]] = sa[2]
376	s = ""
377	db.close()
378	sys.exit(0)
379
380	elif args[0] == "--test":
381	mode = "test"
382	args = args[1:]
383
384	elif args[0] == "--input" or args[0] == "-i":
385	mode = "input"
386	args = args[1:]
387
9acadc2b	388	else:
da0f8522	389	sys.stderr.write("cvt-utf8: unknown argument '%s'" % args[0])
da0f8522	390	sys.exit(1)
9acadc2b	391
	392	locations = []
	393	locations.append("/usr/share/unicode/unicode")
	394	locations.append("/usr/lib/unicode/unicode")
	395	locations.append("/usr/local/share/unicode/unicode")
	396	locations.append("/usr/local/lib/unicode/unicode")
	397	locations.append(os.environ["HOME"] + "/share/unicode/unicode")
	398	locations.append(os.environ["HOME"] + "/lib/unicode/unicode")
	399
	400	for loc in locations:
	401	try:
	402	db = anydbm.open(loc, "r")
	403	except IOError:
	404	db = None
	405	except anydbm.error:
	406	db = None
	407	if db != None:
	408	break
	409	if han_translations:
	410	i = string.rfind(loc, "/")
	411	assert i >= 0
	412	hanloc = loc[:i+1] + "unihan"
	413	handb = anydbm.open(hanloc, "r")
	414	# this has been explicitly required, so we don't squelch exceptions
	415
da0f8522	416	if mode == "test":
9acadc2b	417	do(["CE","BA","E1","BD","B9","CF","83","CE","BC","CE","B5"])
	418	do(["00"])
	419	do(["C2","80"])
	420	do(["E0","A0","80"])
	421	do(["F0","90","80","80"])
	422	do(["F8","88","80","80","80"])
	423	do(["FC","84","80","80","80","80"])
	424	do(["7F"])
	425	do(["DF","BF"])
	426	do(["EF","BF","BF"])
	427	do(["F7","BF","BF","BF"])
	428	do(["FB","BF","BF","BF","BF"])
	429	do(["FD","BF","BF","BF","BF","BF"])
	430	do(["ED","9F","BF"])
	431	do(["EE","80","80"])
	432	do(["EF","BF","BD"])
	433	do(["F4","8F","BF","BF"])
	434	do(["F4","90","80","80"])
	435	do(["80"])
	436	do(["BF"])
	437	do(["80","BF"])
	438	do(["80","BF","80"])
	439	do(["80","BF","80","BF"])
	440	do(["80","BF","80","BF","80"])
	441	do(["80","BF","80","BF","80","BF"])
	442	do(["80","BF","80","BF","80","BF","80"])
	443	do(["80","81","82","83","84","85","86","87",
	444	"88","89","8A","8B","8C","8D","8E","8F",
	445	"90","91","92","93","94","95","96","97",
	446	"98","99","9A","9B","9C","9D","9E","9F",
	447	"A0","A1","A2","A3","A4","A5","A6","A7",
	448	"A8","A9","AA","AB","AC","AD","AE","AF",
	449	"B0","B1","B2","B3","B4","B5","B6","B7",
	450	"B8","B9","BA","BB","BC","BD","BE","BF"])
	451	do(["C0","20","C1","20","C2","20","C3","20",
	452	"C4","20","C5","20","C6","20","C7","20",
	453	"C8","20","C9","20","CA","20","CB","20",
	454	"CC","20","CD","20","CE","20","CF","20",
	455	"D0","20","D1","20","D2","20","D3","20",
	456	"D4","20","D5","20","D6","20","D7","20",
	457	"D8","20","D9","20","DA","20","DB","20",
	458	"DC","20","DD","20","DE","20","DF","20"])
	459	do(["E0","20","E1","20","E2","20","E3","20",
	460	"E4","20","E5","20","E6","20","E7","20",
	461	"E8","20","E9","20","EA","20","EB","20",
	462	"EC","20","ED","20","EE","20","EF","20"])
	463	do(["F0","20","F1","20","F2","20","F3","20",
	464	"F4","20","F5","20","F6","20","F7","20"])
	465	do(["F8","20","F9","20","FA","20","FB","20"])
	466	do(["FC","20","FD","20"])
	467	do(["C0"])
	468	do(["E0","80"])
	469	do(["F0","80","80"])
	470	do(["F8","80","80","80"])
	471	do(["FC","80","80","80","80"])
	472	do(["DF"])
	473	do(["EF","BF"])
	474	do(["F7","BF","BF"])
	475	do(["FB","BF","BF","BF"])
	476	do(["FD","BF","BF","BF","BF"])
	477	do(["C0","E0","80","F0","80","80","F8","80",
	478	"80","80","FC","80","80","80","80",
	479	"DF","EF","BF","F7","BF","BF","FB",
	480	"BF","BF","BF","FD","BF","BF","BF","BF"])
481	do(["FE"])
482	do(["FF"])
483	do(["FE","FE","FF","FF"])
484	do(["C0","AF"])
485	do(["E0","80","AF"])
486	do(["F0","80","80","AF"])
487	do(["F8","80","80","80","AF"])
488	do(["FC","80","80","80","80","AF"])
489	do(["C1","BF"])
490	do(["E0","9F","BF"])
491	do(["F0","8F","BF","BF"])
492	do(["F8","87","BF","BF","BF"])
493	do(["FC","83","BF","BF","BF","BF"])
494	do(["C0","80"])
495	do(["E0","80","80"])
496	do(["F0","80","80","80"])
497	do(["F8","80","80","80","80"])
498	do(["FC","80","80","80","80","80"])
499	do(["ED","A0","80"])
500	do(["ED","AD","BF"])
501	do(["ED","AE","80"])
502	do(["ED","AF","BF"])
503	do(["ED","B0","80"])
504	do(["ED","BE","80"])
505	do(["ED","BF","BF"])
506	do(["ED","A0","80","ED","B0","80"])
507	do(["ED","A0","80","ED","BF","BF"])
508	do(["ED","AD","BF","ED","B0","80"])
509	do(["ED","AD","BF","ED","BF","BF"])
510	do(["ED","AE","80","ED","B0","80"])
511	do(["ED","AE","80","ED","BF","BF"])
512	do(["ED","AF","BF","ED","B0","80"])
513	do(["ED","AF","BF","ED","BF","8F"])
514	do(["EF","BF","BE"])
515	do(["EF","BF","BF"])
da0f8522	516	elif mode == "input":
9acadc2b	517	def getchar():
	518	s = sys.stdin.read(1)
	519	if s == "":
	520	return None
	521	return ord(s) & 0xFF # ensure it isn't negative
	522	process_utf8(getchar)
	523	else:
	524	do(args)