[sgt/utils] / cvt-utf8 / cvt-utf8

#!/usr/bin/env python 

import anydbm
import os
import string
import struct
import sys
import zlib

class zip_untangler:
    """Line-oriented reader for the first member of a zip archive.

    Parses the zip local file header found at the start of the stream,
    then inflates the member's data on demand and hands it back one line
    at a time through readline().  Every data byte is fed to a zlib
    decompressor, so the member is assumed to be deflate-compressed, and
    the amount of data to read is the compressed size recorded in the
    local header.
    """

    def __init__(self, file, datasofar):
        """Wrap `file`, an object whose read(n) returns raw zip bytes.

        `datasofar` holds whatever the caller has already consumed from
        the start of the stream (it must be shorter than the 30-byte
        fixed part of the local header); it is treated as the beginning
        of the header.
        """
        self.file = file
        assert len(datasofar) < 30
        self.header = datasofar
        # Inflated bytes that readline() has not handed out yet.
        self.data = b""
        # Compressed bytes still to be read; None until the header has
        # been parsed.
        self.dataleft = None
        self.decompress = zlib.decompressobj()
        # Zlib header bytes, expected by decompress obj but not
        # present in zip file
        self.decompress.decompress(b"\x78\x9c")

    def _read_header(self, wanted):
        """Extend self.header until it holds at least `wanted` bytes.

        Raises IOError if the stream ends first.
        """
        while len(self.header) < wanted:
            s = self.file.read(wanted - len(self.header))
            if not s:
                raise IOError("unexpected end of file in zip local header")
            self.header = self.header + s

    def readline(self):
        """Return the next line of inflated data, newline included.

        If the data does not end with a newline the final partial line
        is returned as it is; once everything has been handed out an
        empty string is returned.
        """
        if self.dataleft is None:
            self._read_header(30)
            # Name length and extra length (little-endian 16-bit fields).
            namelen, extralen = struct.unpack("<HH", self.header[26:30])
            self._read_header(30 + namelen + extralen)
            # Compressed size (little-endian 32-bit field).
            self.dataleft = struct.unpack("<L", self.header[18:22])[0]
        k = self.data.find(b"\n")
        while k < 0:
            rlen = min(self.dataleft, 4096)
            if rlen == 0:
                break
            d = self.file.read(rlen)
            if not d:
                break
            # Count what was actually delivered: read() may return fewer
            # bytes than requested, and those must not be skipped.
            self.dataleft = self.dataleft - len(d)
            searched = len(self.data)
            self.data = self.data + self.decompress.decompress(d)
            # Everything before `searched` is known to hold no newline.
            k = self.data.find(b"\n", searched)
        if k < 0:
            ret = self.data
            self.data = b""
            return ret
        ret = self.data[:k+1]
        self.data = self.data[k+1:]
        return ret

def hexstr(x):
    """Return the hexadecimal digits of integer x as hex() spells them.

    The leading "0x" and the trailing "L" that hex() puts on Python 2
    long integers are removed, e.g. hexstr(0x20AC) == "20ac".  A negative
    number starts with "-0x", which is not a bare "0x" prefix, so its
    text is returned with that prefix intact.
    """
    digits = hex(x)
    if digits[-1:] in ("L", "l"):
        digits = digits[:-1]
    if digits[:2] in ("0x", "0X"):
        digits = digits[2:]
    return digits

def charname(x):
    """Return a descriptive name for code point x.

    Reads the module-level `db` (unicode database), `han_translations`
    flag and `handb` (unihan database).  The lookup key is the code point
    in upper-case hex, zero-padded to at least four digits.

    Returns "" when no unicode database is open.  Otherwise, if Han
    translations are enabled and the unihan database has an entry, returns
    "<han> " followed by that entry; failing that, returns the second
    ";"-separated field of the unicode database record, or
    "<no name available>" when the code point has no record.
    """
    if not db:
        return ""
    key = hexstr(x).zfill(4).upper()
    if han_translations:
        try:
            return "<han> " + handb[key]
        except KeyError:
            # No Han definition; fall back to the ordinary name.
            pass
    try:
        return db[key].split(";")[1]
    except KeyError:
        return "<no name available>"

def output(char, bytes, errors):
    """Emit one decoded item on standard output.

    char   -- the code point, or -1 when the bytes did not yield one
    bytes  -- list of integer byte values making up the item
    errors -- "" or a string of " (...)" problem annotations

    In analysis mode (module-level `output_analysis` true) one line is
    printed: the code point as U-XXXXXXXX (blank for -1), the bytes in
    hex padded out to six columns, the character name if charname()
    supplies one, and the error annotations.  Otherwise the bytes
    themselves are written, with any problematic item replaced by the
    UTF-8 encoding of U+FFFD.
    """
    if output_analysis:
        if char == -1:
            s = "           "
        else:
            s = "U-%08X " % char
        s = s + "".join([" %02X" % b for b in bytes])
        # Pad short sequences so the names line up; sequences of six or
        # more bytes get no padding.
        s = s + "   " * (6 - len(bytes))

        name = ""
        if char != -1:
            name = charname(char)
        if name != "":
            s = s + " " + name
        # Single-argument call form: behaves the same whether print is a
        # statement (Python 2) or a function.
        print(s + errors)
    elif char == -1 or errors != "":
        # problem chars become U+FFFD REPLACEMENT CHARACTER
        sys.stdout.write("\xEF\xBF\xBD")
    else:
        for b in bytes:
            sys.stdout.write(chr(b))

def process_ucs(x, bytes=[], errors=""):
    """Analyse one code point and hand the result to output().

    x      -- the code point, below 0x80000000
    bytes  -- the byte values x was decoded from, if it came from a byte
              stream (this list is only read, never modified)
    errors -- problem annotations already collected by the caller

    The canonical UTF-8 encoding of x is computed (using the original
    definition with sequences of up to six bytes).  If `bytes` is longer
    than that encoding the item is annotated as an overlong form and the
    original bytes are reported instead.  Surrogates (D800-DFFF) and
    FFFE/FFFF are annotated as well.

    Raises ValueError if x needs more than 31 bits.
    """
    if x < 0x80:
        utf8 = [x]
    else:
        # Pick the lead-byte marker and the number of continuation bytes.
        if x < 0x800:
            lead, trailing = 0xC0, 1
        elif x < 0x10000:
            lead, trailing = 0xE0, 2
        elif x < 0x200000:
            lead, trailing = 0xF0, 3
        elif x < 0x4000000:
            lead, trailing = 0xF8, 4
        elif x < 0x80000000:
            lead, trailing = 0xFC, 5
        else:
            raise ValueError("code point 0x%X does not fit in 31 bits" % x)
        utf8 = [lead + (x >> (6 * trailing))]
        # Remaining bits go out six at a time, most significant first.
        for shift in range(6 * (trailing - 1), -1, -6):
            utf8.append(0x80 + (0x3F & (x >> shift)))

    if len(bytes) > len(utf8):
        shortest = "".join([" %02X" % b for b in utf8])
        errors = errors + " (overlong form of" + shortest + ")"
        utf8 = bytes
    if 0xD800 <= x <= 0xDFFF:
        errors = errors + " (surrogate)"
    if 0xFFFE <= x <= 0xFFFF:
        errors = errors + " (invalid char)"

    output(x, utf8, errors)

def process_utf8(next):
    """Decode a stream of byte values and report every item found.

    next -- callable returning the next byte as an integer, or None once
            the input is exhausted

    Well-formed sequences are passed to process_ucs() together with their
    bytes; stray continuation bytes, impossible bytes and incomplete
    sequences are reported through output() with a code point of -1.
    Values that are not bytes at all (negative or above 0xFF) are reported
    as invalid bytes instead of being decoded.
    """
    c = next()
    while c is not None:
        char = [c]
        if c < 0 or c >= 0xFE:
            # FE and FF never occur in UTF-8.  Out-of-range values are
            # caught here too, so they can never reach the lead-byte
            # decoding below without matching one of its cases.
            output(-1, char, " (invalid UTF-8 byte)")
            c = next()
        elif c < 0x80:
            process_ucs(c) # single-byte char
            c = next()
        elif c <= 0xBF:
            output(-1, char, " (unexpected continuation byte)")
            c = next()
        else:
            # Lead byte in C0..FD: its high bits say how many continuation
            # bytes follow, the rest are the top bits of the code point.
            if c <= 0xDF:
                cbytes = 1
            elif c <= 0xEF:
                cbytes = 2
            elif c <= 0xF7:
                cbytes = 3
            elif c <= 0xFB:
                cbytes = 4
            else:
                cbytes = 5
            acc = c & (0x7F >> (cbytes + 1))
            while cbytes > 0:
                c = next()
                if c is None or c < 0x80 or c > 0xBF:
                    # Not a continuation byte.  Leave it in c so the
                    # outer loop treats it as the start of the next item.
                    break
                char.append(c)
                acc = (acc << 6) + (c & 0x3F)
                cbytes = cbytes - 1
            if cbytes > 0:
                output(-1, char, " (incomplete sequence)")
            else:
                # Fetch the following byte before reporting, as the
                # sequence has been consumed completely.
                c = next()
                process_ucs(acc, char)

def do(args):
    """Process a list of data arguments.

    Each argument is either a byte value in hex ("e2") or a code point
    written as "U+20ac" or "U-10ffff" (the "U" may be lower case).  Runs
    of consecutive byte values are decoded together as one UTF-8 stream
    by process_utf8(); code points go straight to process_ucs().

    Raises ValueError for an argument that starts with "U" but is not
    followed by "+" or "-", or whose digits are not valid hex.
    """
    # Class to turn a list into a callable object that returns one
    # element at a time.
    class liststepper:
        def __init__(self, items):
            self.items = items
            self.index = 0
        def __call__(self):
            if self.index >= len(self.items):
                return None
            ret = self.items[self.index]
            self.index = self.index + 1
            return ret

    pending = []    # byte values collected since the last code point
    for arg in args:
        if arg[:1].upper() == "U":
            # Flush the bytes seen so far to keep the output in order.
            if pending:
                process_utf8(liststepper(pending))
                pending = []
            if arg[1:2] not in ("+", "-"):
                raise ValueError("expected U+<hex> or U-<hex>, got %r" % arg)
            process_ucs(int(arg[2:], 16))
        else:
            pending.append(int(arg, 16))

    if pending:
        process_utf8(liststepper(pending))

# Command-line arguments, plus the two mode switches consulted by
# output() and charname().
args = sys.argv[1:]
output_analysis = 1
han_translations = 0

# With no arguments, or with a help option as the only argument, show the
# usage text and stop.  --help-admin adds the database-building commands.
if args in ([], ["--help"], ["--help-admin"]):
    usage = [
        "Usage: cvt-utf8 [flags] <hex UTF-8 bytes and/or U+codepoints>",
        "  e.g. cvt-utf8 e2 82 ac",
        "    or cvt-utf8 U+20ac",
        "    or cvt-utf8 U-10ffff",
        "",
        "Flags: -o or --output        just output well-formed UTF-8 instead of",
        "                             an analysis of the input data",
        "       -h or --han           also give Han definitions from unihan db",
        "",
        "Also:  cvt-utf8 --test       run Markus Kuhn's decoder stress tests",
        "       cvt-utf8 --input (or -i)",
        "                             read, analyse and decode UTF-8 from stdin",
    ]
    if args == ["--help-admin"]:
        usage.extend([
            "       cvt-utf8 --help       display user help text",
            "       cvt-utf8 --help-admin display admin help text (this one)",
            "       cvt-utf8 --build <infile> <outfile>",
            "                             convert UnicodeData.txt to unicode db",
            "       cvt-utf8 --build-unihan <infile> <outfile>",
            "                             convert Unihan.txt to unihan db",
            "       cvt-utf8 --fetch-build <outfile>",
            "                             build unicode db by download from unicode.org",
            "       cvt-utf8 --fetch-build-unihan <outfile>",
            "                             build Unihan db by download from unicode.org",
        ])
    else:
        usage.extend([
            "       cvt-utf8 --help       display this help text",
            "       cvt-utf8 --help-admin display admin help text",
        ])
    for line in usage:
        print(line)
    sys.exit(0)

# Strip the leading flags.  They are accepted in either order, and the
# emptiness check keeps a command line consisting only of flags from
# running into an IndexError on args[0] further down.
while args and args[0] in ("-o", "--output", "-h", "--han"):
    if args[0] in ("-o", "--output"):
        output_analysis = 0
    else:
        han_translations = 1
    args = args[1:]

if not args:
    print("cvt-utf8: nothing to process after the flags; try --help")
    sys.exit(1)

# --build / --fetch-build: create the unicode database from
# UnicodeData.txt, read from a local file or fetched from unicode.org.
if args[0] == "--build" or args[0] == "--fetch-build":
    if args[0] == "--build":
        if len(args) != 3:
            print("cvt-utf8: --build expects two filename arguments")
            sys.exit(1)
        infile = open(args[1], "r")
        outfile = args[2]
    else:
        if len(args) != 2:
            print("cvt-utf8: --fetch-build expects one filename argument")
            sys.exit(1)
        import urllib
        infile = urllib.urlopen("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
        outfile = args[1]
    # Now build the database.  The try/finally pairs make sure the input
    # and the database are closed even if something fails half way.
    try:
        if outfile[-3:] == ".db":
            print("cvt-utf8: warning: you should not append .db to db name")

        db = anydbm.open(outfile, "n")
        try:
            # Each record is stored whole, keyed by its first
            # ";"-separated field.
            for s in iter(infile.readline, ""):
                db[s.split(";")[0]] = s
        finally:
            db.close()
    finally:
        infile.close()
    sys.exit(0)

# --build-unihan / --fetch-build-unihan: create the unihan database from
# the kDefinition entries of Unihan.txt, read from a local file (plain or
# zipped) or fetched from unicode.org.
if args[0] == "--build-unihan" or args[0] == "--fetch-build-unihan":
    if args[0] == "--build-unihan":
        if len(args) != 3:
            print("cvt-utf8: --build-unihan expects two filename arguments")
            sys.exit(1)
        # Binary mode, because the file may turn out to be a zip archive;
        # line endings are stripped explicitly below in either case.
        rawfile = open(args[1], "rb")
        s = rawfile.read(1)
        # Unihan.txt starts with a hash. If this file starts with a
        # P, we assume it's a zip file ("PK").
        if s == "P":
            infile = zip_untangler(rawfile, s)
            s = ""
        else:
            # Plain text: s keeps the first character, which is put back
            # in front of the first line read below.
            infile = rawfile
        outfile = args[2]
    else:
        if len(args) != 2:
            print("cvt-utf8: --fetch-build-unihan expects one filename argument")
            sys.exit(1)
        import urllib
        rawfile = urllib.urlopen("ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip")
        # We know this one is zipped.
        infile = zip_untangler(rawfile, "")
        outfile = args[1]
        s = ""
    # Now build the database.  The try/finally pairs make sure the input
    # and the database are closed even if something fails half way.
    try:
        if outfile[-3:] == ".db":
            print("cvt-utf8: warning: you should not append .db to db name")

        db = anydbm.open(outfile, "n")
        try:
            while 1:
                s = s + infile.readline()
                if s == "":
                    break
                s = s.rstrip("\r\n")
                sa = s.split("\t")
                # Keep "U+XXXX<tab>kDefinition<tab>text" lines, keyed by
                # the hex digits after "U+".
                if len(sa) == 3 and sa[1] == "kDefinition" and sa[0][:2] == "U+":
                    db[sa[0][2:]] = sa[2]
                s = ""
        finally:
            db.close()
    finally:
        rawfile.close()
    sys.exit(0)

# Places to look for the unicode database, in order of preference.
locations = [
    "/usr/share/unicode/unicode",
    "/usr/lib/unicode/unicode",
    "/usr/local/share/unicode/unicode",
    "/usr/local/lib/unicode/unicode",
]
# The per-user locations can only be tried when HOME is set; without it
# the system-wide ones above are still searched.
home = os.environ.get("HOME")
if home is not None:
    locations.append(home + "/share/unicode/unicode")
    locations.append(home + "/lib/unicode/unicode")

# Open the first database that can be opened.  db stays None if there is
# none, in which case charname() supplies no names.
db = None
for loc in locations:
    try:
        db = anydbm.open(loc, "r")
    except IOError:
        db = None
    except anydbm.error:
        db = None
    if db is not None:
        break
if han_translations:
    # The unihan database is expected next to the unicode one (or, when
    # none was found, next to the last location tried).
    i = loc.rfind("/")
    assert i >= 0
    hanloc = loc[:i+1] + "unihan"
    handb = anydbm.open(hanloc, "r")
    # this has been explicitly required, so we don't squelch exceptions
    # this has been explicitly required, so we don't squelch exceptions

# Main dispatch: built-in stress tests, decoding of standard input, or
# decoding of the data given on the command line.
if args[0] == "--test":
    # One entry per do() call; each is a space-separated run of hex
    # bytes decoded as a single stream.
    stress_tests = (
        # A correctly encoded sample text.
        "CE BA E1 BD B9 CF 83 CE BC CE B5",
        # First possible sequence of each length, one to six bytes.
        "00",
        "C2 80",
        "E0 A0 80",
        "F0 90 80 80",
        "F8 88 80 80 80",
        "FC 84 80 80 80 80",
        # Last possible sequence of each length.
        "7F",
        "DF BF",
        "EF BF BF",
        "F7 BF BF BF",
        "FB BF BF BF BF",
        "FD BF BF BF BF BF",
        # Other boundary conditions.
        "ED 9F BF",
        "EE 80 80",
        "EF BF BD",
        "F4 8F BF BF",
        "F4 90 80 80",
        # Unexpected continuation bytes, alone and in runs.
        "80",
        "BF",
        "80 BF",
        "80 BF 80",
        "80 BF 80 BF",
        "80 BF 80 BF 80",
        "80 BF 80 BF 80 BF",
        "80 BF 80 BF 80 BF 80",
        # Every possible continuation byte.
        "80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F "
        "90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F "
        "A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF "
        "B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF",
        # Every lead byte, each followed by a space, grouped by the
        # sequence length the lead byte announces.
        "C0 20 C1 20 C2 20 C3 20 C4 20 C5 20 C6 20 C7 20 "
        "C8 20 C9 20 CA 20 CB 20 CC 20 CD 20 CE 20 CF 20 "
        "D0 20 D1 20 D2 20 D3 20 D4 20 D5 20 D6 20 D7 20 "
        "D8 20 D9 20 DA 20 DB 20 DC 20 DD 20 DE 20 DF 20",
        "E0 20 E1 20 E2 20 E3 20 E4 20 E5 20 E6 20 E7 20 "
        "E8 20 E9 20 EA 20 EB 20 EC 20 ED 20 EE 20 EF 20",
        "F0 20 F1 20 F2 20 F3 20 F4 20 F5 20 F6 20 F7 20",
        "F8 20 F9 20 FA 20 FB 20",
        "FC 20 FD 20",
        # Sequences with the last continuation byte missing.
        "C0",
        "E0 80",
        "F0 80 80",
        "F8 80 80 80",
        "FC 80 80 80 80",
        "DF",
        "EF BF",
        "F7 BF BF",
        "FB BF BF BF",
        "FD BF BF BF BF",
        # The same incomplete sequences, concatenated.
        "C0 E0 80 F0 80 80 F8 80 80 80 FC 80 80 80 80 "
        "DF EF BF F7 BF BF FB BF BF BF FD BF BF BF BF",
        # Bytes that cannot occur in UTF-8.
        "FE",
        "FF",
        "FE FE FF FF",
        # Overlong encodings of "/".
        "C0 AF",
        "E0 80 AF",
        "F0 80 80 AF",
        "F8 80 80 80 AF",
        "FC 80 80 80 80 AF",
        # Largest value that is still overlong at each length.
        "C1 BF",
        "E0 9F BF",
        "F0 8F BF BF",
        "F8 87 BF BF BF",
        "FC 83 BF BF BF BF",
        # Overlong encodings of NUL.
        "C0 80",
        "E0 80 80",
        "F0 80 80 80",
        "F8 80 80 80 80",
        "FC 80 80 80 80 80",
        # Single surrogates.
        "ED A0 80",
        "ED AD BF",
        "ED AE 80",
        "ED AF BF",
        "ED B0 80",
        "ED BE 80",
        "ED BF BF",
        # Surrogate pairs: each high surrogate above combined with the
        # lowest (ED B0 80) and the highest (ED BF BF) low surrogate.
        "ED A0 80 ED B0 80",
        "ED A0 80 ED BF BF",
        "ED AD BF ED B0 80",
        "ED AD BF ED BF BF",
        "ED AE 80 ED B0 80",
        "ED AE 80 ED BF BF",
        "ED AF BF ED B0 80",
        "ED AF BF ED BF BF",
        # FFFE and FFFF.
        "EF BF BE",
        "EF BF BF",
    )
    for vector in stress_tests:
        do(vector.split())
elif args[0] == "--input" or args[0] == "-i":
    def getchar():
        """Return the next byte of standard input as an integer, or
        None at end of input."""
        s = sys.stdin.read(1)
        if s == "":
            return None
        return ord(s) & 0xFF   # ensure it isn't negative
    process_utf8(getchar)
else:
    do(args)
Commit	Line	Data
9acadc2b	1	#!/usr/bin/env python
	2
	3	import sys
	4	import string
	5	import os
	6	import anydbm
	7	import zlib
	8
	9	class zip_untangler:
	10	def __init__(self, file, datasofar):
	11	self.file = file
	12	assert len(datasofar) < 30
	13	self.header = datasofar
	14	self.data = ""
	15	self.dataleft = None
	16	self.decompress = zlib.decompressobj()
	17	# Zlib header bytes, expected by decompress obj but not
	18	# present in zip file
	19	self.decompress.decompress("\x78\x9c")
	20
	21	def readline(self):
	22	if self.dataleft == None:
	23	while len(self.header) < 30:
	24	s = self.file.read(30 - len(self.header))
	25	assert s != ""
	26	self.header = self.header + s
	27	# Name length and extra length.
	28	namelen = 256 * ord(self.header[27]) + ord(self.header[26])
	29	extralen = 256 * ord(self.header[29]) + ord(self.header[28])
	30	while len(self.header) < 30 + namelen + extralen:
	31	s = self.file.read(30 + namelen + extralen - len(self.header))
	32	assert s != ""
	33	self.header = self.header + s
	34	self.dataleft = \
	35	256 * (256 * (256 * ord(self.header[21]) + ord(self.header[20])) \
	36	+ ord(self.header[19])) + ord(self.header[18])
	37	k = string.find(self.data, "\n")
	38	while k < 0:
	39	rlen = self.dataleft
	40	if rlen > 4096: rlen = 4096
	41	if rlen == 0: break
	42	d = self.file.read(rlen)
	43	if d == "": break
	44	self.dataleft = self.dataleft - rlen
	45	self.data = self.data + self.decompress.decompress(d)
	46	k = string.find(self.data, "\n")
	47	if k < 0:
	48	ret = self.data
	49	self.data = ""
	50	return ret
	51	else:
	52	ret = self.data[:k+1]
	53	self.data = self.data[k+1:]
	54	return ret
	55
	56	def hexstr(x):
	57	s = hex(x)
	58	if s[-1:] == "L" or s[-1:] == "l":
	59	s = s[:-1]
	60	if s[:2] == "0x" or s[:2] == "0X":
	61	s = s[2:]
	62	return s
	63
	64	def charname(x):
65	if db:
66	key = hexstr(x)
67	while len(key) < 4: key = "0" + key
68	key = string.upper(key)
69	if han_translations:
70	try:
71	value = handb[key]
72	return "<han> " + value
73	except KeyError:
74	pass
75	try:
76	value = db[key]
77	return string.split(value, ";")[1]
78	except KeyError:
79	return "<no name available>"
80	else:
81	return ""
82
83	def output(char, bytes, errors):
84	if output_analysis:
85	if char == -1:
86	s = " "
87	else:
88	s = "U-%08X " % char
89	for i in bytes:
90	s = s + " %02X" % i
91	for i in range(6-len(bytes)):
92	s = s + " "
93
94	if char == -1:
95	name = ""
96	else:
97	name = charname(char)
98	if name != "":
99	s = s + " " + name
100	s = s + errors
101	print s
102	else:
103	if char == -1 or errors != "":
104	# problem chars become U+FFFD REPLACEMENT CHARACTER
105	sys.stdout.write("\xEF\xBF\xBD")
106	else:
107	for i in bytes:
108	sys.stdout.write(chr(i))
109
110	def process_ucs(x, bytes=[], errors=""):
111	if x < 0x80:
112	utf8 = [x]
113	realbytes = 1
114	else:
115	if x < 0x800:
116	tmp = (0xC0, 1)
117	elif x < 0x10000:
118	tmp = (0xE0, 2)
119	elif x < 0x200000:
120	tmp = (0xF0, 3)
121	elif x < 0x4000000:
122	tmp = (0xF8, 4)
123	else:
124	assert x < 0x80000000L
125	tmp = (0xFC, 5)
126	realbytes = tmp[1] + 1
127	utf8 = [tmp[0] + (x >> (6*tmp[1]))]
128	for i in range(tmp[1]-1, -1, -1):
129	utf8.append(0x80 + (0x3F & (x >> (i*6))))
130
131	if bytes != [] and len(bytes) > realbytes:
132	errors = errors + " (overlong form of"
133	for i in utf8:
134	errors = errors + " %02X" % i
135	errors = errors + ")"
136	utf8 = bytes
137	if x >= 0xD800 and x <= 0xDFFF:
138	errors = errors + " (surrogate)"
139	if x >= 0xFFFE and x <= 0xFFFF:
140	errors = errors + " (invalid char)"
141
142	output(x, utf8, errors)
143
144	def process_utf8(next):
145	c = next()
146	while c != None:
147	char = [c]
148	i = c
149	if i < 0x80:
150	process_ucs(i) # single-byte char
151	c = next()
152	elif i == 0xfe or i == 0xff:
153	output(-1, char, " (invalid UTF-8 byte)")
154	c = next()
155	elif i >= 0x80 and i <= 0xbf:
156	output(-1, char, " (unexpected continuation byte)")
157	c = next()
158	else:
159	if i >= 0xC0 and i <= 0xDF:
160	acc = i &~ 0xC0
161	cbytes = 1
162	elif i >= 0xE0 and i <= 0xEF:
163	acc = i &~ 0xE0
164	cbytes = 2
165	elif i >= 0xF0 and i <= 0xF7:
166	acc = i &~ 0xF0
167	cbytes = 3
168	elif i >= 0xF8 and i <= 0xFB:
169	acc = i &~ 0xF8
170	cbytes = 4
171	elif i >= 0xFC and i <= 0xFD:
172	acc = i &~ 0xFC
173	cbytes = 5
174	gotone = 0
175	while cbytes > 0:
176	c = next()
177	if c == None or c < 0x80 or c > 0xBF:
178	gotone = 1
179	break
180	char.append(c)
181	acc = (acc << 6) + (c & 0x3F)
182	cbytes = cbytes - 1
183	if not gotone:
184	c = next()
185	if cbytes > 0:
186	output(-1, char, " (incomplete sequence)")
187	else:
188	process_ucs(acc, char)
189
190	def do(args):
191	# Class to turn a list into a callable object that returns one
192	# element at a time.
193	class liststepper:
194	def __init__(self, list):
195	self.list = list
196	self.index = 0
197	def __call__(self):
198	if self.index >= len(self.list):
199	return None
200	ret = self.list[self.index]
201	self.index = self.index + 1
202	return ret
203
204	list = []
205	for arg in args:
206	if string.upper(arg[0]) == "U":
207	if len(list) > 0:
208	process_utf8(liststepper(list))
209	list = []
210	assert arg[1] == "+" or arg[1] == "-"
211	process_ucs(string.atoi(arg[2:], 16))
212	else:
213	list.append(string.atoi(arg, 16))
214
215	if len(list) > 0:
216	process_utf8(liststepper(list))
217
218	args = sys.argv[1:]
219	output_analysis = 1
220	han_translations = 0
221
222	if args == [] or args == ["--help"] or args == ["--help-admin"]:
223	print "Usage: cvt-utf8 [flags] <hex UTF-8 bytes and/or U+codepoints>"
224	print " e.g. cvt-utf8 e2 82 ac"
225	print " or cvt-utf8 U+20ac"
226	print " or cvt-utf8 U-10ffff"
227	print ""
228	print "Flags: -o or --output just output well-formed UTF-8 instead of"
229	print " an analysis of the input data"
230	print " -h or --han also give Han definitions from unihan db"
231	print ""
232	print "Also: cvt-utf8 --test run Markus Kuhn's decoder stress tests" #'
233	print " cvt-utf8 --input (or -i)"
234	print " read, analyse and decode UTF-8 from stdin"
235	if args == ["--help-admin"]:
236	print " cvt-utf8 --help display user help text"
237	print " cvt-utf8 --help-admin display admin help text (this one)"
238	print " cvt-utf8 --build <infile> <outfile>"
239	print " convert UnicodeData.txt to unicode db"
240	print " cvt-utf8 --build-unihan <infile> <outfile>"
241	print " convert Unihan.txt to unihan db"
242	print " cvt-utf8 --fetch-build <outfile>"
243	print " "+\
244	"build unicode db by download from unicode.org"
245	print " cvt-utf8 --fetch-build-unihan <outfile>"
246	print " "+\
247	"build Unihan db by download from unicode.org"
248	else:
249	print " cvt-utf8 --help display this help text"
250	print " cvt-utf8 --help-admin display admin help text"
251	sys.exit(0)
252
253	if args[0] == "-o" or args[0] == "--output":
254	output_analysis = 0
255	args = args[1:]
256
257	if args[0] == "-h" or args[0] == "--han":
258	han_translations = 1
259	args = args[1:]
260
261	if args[0] == "--build" or args[0] == "--fetch-build":
262	if args[0] == "--build":
263	if len(args) != 3:
264	print "cvt-utf8: --build expects two filename arguments"
265	sys.exit(1)
266	infile = open(args[1], "r")
267	outfile = args[2]
268	else:
269	if len(args) != 2:
270	print "cvt-utf8: --fetch-build expects one filename argument"
271	sys.exit(1)
272	import urllib
273	infile = urllib.urlopen("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
274	outfile = args[1]
275	# Now build the database.
276	if outfile[-3:] == ".db":
277	print "cvt-utf8: warning: you should not append .db to db name"
278
279	db = anydbm.open(outfile, "n")
280	while 1:
281	s = infile.readline()
282	if s == "": break
283	ss = string.split(s, ";")[0]
284	db[ss] = s
285	db.close()
286	sys.exit(0)
287
288	if args[0] == "--build-unihan" or args[0] == "--fetch-build-unihan":
289	if args[0] == "--build-unihan":
290	if len(args) != 3:
291	print "cvt-utf8: --build expects two filename arguments"
292	sys.exit(1)
293	infile = open(args[1], "r")
294	s = infile.read(1)
295	# Unihan.txt starts with a hash. If this file starts with a
296	# P, we assume it's a zip file ("PK").
297	if s == "P":
298	infile = zip_untangler(infile, s)
299	s = ""
300	outfile = args[2]
301	else:
302	if len(args) != 2:
303	print "cvt-utf8: --fetch-build-unihan expects one filename argument"
304	sys.exit(1)
305	import urllib
306	infile = urllib.urlopen("ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip")
307	# We know this one is zipped.
308	infile = zip_untangler(infile, "")
309	outfile = args[1]
310	s = ""
311	# Now build the database.
312	if outfile[-3:] == ".db":
313	print "cvt-utf8: warning: you should not append .db to db name"
314
315	db = anydbm.open(outfile, "n")
316	while 1:
317	s = s + infile.readline()
318	if s == "": break
319	while s[-1:] == "\r" or s[-1:] == "\n":
320	s = s[:-1]
321	sa = string.split(s, "\t")
322	if len(sa) == 3 and sa[1] == "kDefinition" and sa[0][:2] == "U+":
323	db[sa[0][2:]] = sa[2]
324	s = ""
325	db.close()
326	sys.exit(0)
327
328	locations = []
329	locations.append("/usr/share/unicode/unicode")
330	locations.append("/usr/lib/unicode/unicode")
331	locations.append("/usr/local/share/unicode/unicode")
332	locations.append("/usr/local/lib/unicode/unicode")
333	locations.append(os.environ["HOME"] + "/share/unicode/unicode")
334	locations.append(os.environ["HOME"] + "/lib/unicode/unicode")
335
336	for loc in locations:
337	try:
338	db = anydbm.open(loc, "r")
339	except IOError:
340	db = None
341	except anydbm.error:
342	db = None
343	if db != None:
344	break
345	if han_translations:
346	i = string.rfind(loc, "/")
347	assert i >= 0
348	hanloc = loc[:i+1] + "unihan"
349	handb = anydbm.open(hanloc, "r")
350	# this has been explicitly required, so we don't squelch exceptions
351
352	if args[0] == "--test":
353	do(["CE","BA","E1","BD","B9","CF","83","CE","BC","CE","B5"])
354	do(["00"])
355	do(["C2","80"])
356	do(["E0","A0","80"])
357	do(["F0","90","80","80"])
358	do(["F8","88","80","80","80"])
359	do(["FC","84","80","80","80","80"])
360	do(["7F"])
361	do(["DF","BF"])
362	do(["EF","BF","BF"])
363	do(["F7","BF","BF","BF"])
364	do(["FB","BF","BF","BF","BF"])
365	do(["FD","BF","BF","BF","BF","BF"])
366	do(["ED","9F","BF"])
367	do(["EE","80","80"])
368	do(["EF","BF","BD"])
369	do(["F4","8F","BF","BF"])
370	do(["F4","90","80","80"])
371	do(["80"])
372	do(["BF"])
373	do(["80","BF"])
374	do(["80","BF","80"])
375	do(["80","BF","80","BF"])
376	do(["80","BF","80","BF","80"])
377	do(["80","BF","80","BF","80","BF"])
378	do(["80","BF","80","BF","80","BF","80"])
379	do(["80","81","82","83","84","85","86","87",
380	"88","89","8A","8B","8C","8D","8E","8F",
381	"90","91","92","93","94","95","96","97",
382	"98","99","9A","9B","9C","9D","9E","9F",
383	"A0","A1","A2","A3","A4","A5","A6","A7",
384	"A8","A9","AA","AB","AC","AD","AE","AF",
385	"B0","B1","B2","B3","B4","B5","B6","B7",
386	"B8","B9","BA","BB","BC","BD","BE","BF"])
387	do(["C0","20","C1","20","C2","20","C3","20",
388	"C4","20","C5","20","C6","20","C7","20",
389	"C8","20","C9","20","CA","20","CB","20",
390	"CC","20","CD","20","CE","20","CF","20",
391	"D0","20","D1","20","D2","20","D3","20",
392	"D4","20","D5","20","D6","20","D7","20",
393	"D8","20","D9","20","DA","20","DB","20",
394	"DC","20","DD","20","DE","20","DF","20"])
395	do(["E0","20","E1","20","E2","20","E3","20",
396	"E4","20","E5","20","E6","20","E7","20",
397	"E8","20","E9","20","EA","20","EB","20",
398	"EC","20","ED","20","EE","20","EF","20"])
399	do(["F0","20","F1","20","F2","20","F3","20",
400	"F4","20","F5","20","F6","20","F7","20"])
401	do(["F8","20","F9","20","FA","20","FB","20"])
402	do(["FC","20","FD","20"])
403	do(["C0"])
404	do(["E0","80"])
405	do(["F0","80","80"])
406	do(["F8","80","80","80"])
407	do(["FC","80","80","80","80"])
408	do(["DF"])
409	do(["EF","BF"])
410	do(["F7","BF","BF"])
411	do(["FB","BF","BF","BF"])
412	do(["FD","BF","BF","BF","BF"])
413	do(["C0","E0","80","F0","80","80","F8","80",
414	"80","80","FC","80","80","80","80",
415	"DF","EF","BF","F7","BF","BF","FB",
416	"BF","BF","BF","FD","BF","BF","BF","BF"])
417	do(["FE"])
418	do(["FF"])
419	do(["FE","FE","FF","FF"])
420	do(["C0","AF"])
421	do(["E0","80","AF"])
422	do(["F0","80","80","AF"])
423	do(["F8","80","80","80","AF"])
424	do(["FC","80","80","80","80","AF"])
425	do(["C1","BF"])
426	do(["E0","9F","BF"])
427	do(["F0","8F","BF","BF"])
428	do(["F8","87","BF","BF","BF"])
429	do(["FC","83","BF","BF","BF","BF"])
430	do(["C0","80"])
431	do(["E0","80","80"])
432	do(["F0","80","80","80"])
433	do(["F8","80","80","80","80"])
434	do(["FC","80","80","80","80","80"])
435	do(["ED","A0","80"])
436	do(["ED","AD","BF"])
437	do(["ED","AE","80"])
438	do(["ED","AF","BF"])
439	do(["ED","B0","80"])
440	do(["ED","BE","80"])
441	do(["ED","BF","BF"])
442	do(["ED","A0","80","ED","B0","80"])
443	do(["ED","A0","80","ED","BF","BF"])
444	do(["ED","AD","BF","ED","B0","80"])
445	do(["ED","AD","BF","ED","BF","BF"])
446	do(["ED","AE","80","ED","B0","80"])
447	do(["ED","AE","80","ED","BF","BF"])
448	do(["ED","AF","BF","ED","B0","80"])
449	do(["ED","AF","BF","ED","BF","8F"])
450	do(["EF","BF","BE"])
451	do(["EF","BF","BF"])
452	elif args[0] == "--input" or args[0] == "-i":
453	def getchar():
454	s = sys.stdin.read(1)
455	if s == "":
456	return None
457	return ord(s) & 0xFF # ensure it isn't negative
458	process_utf8(getchar)
459	else:
460	do(args)