mdw@git.distorted.org.uk Git - sgt/utils/blame_incremental

... / ...

Commit	Line	Data
	1	#!/usr/bin/env python
	2
	3	import sys
	4	import string
	5	import os
	6	import anydbm
	7	import zlib
	8
	9	class zip_untangler:
	10	def __init__(self, file, datasofar):
	11	self.file = file
	12	assert len(datasofar) < 30
	13	self.header = datasofar
	14	self.data = ""
	15	self.dataleft = None
	16	self.decompress = zlib.decompressobj()
	17	# Zlib header bytes, expected by decompress obj but not
	18	# present in zip file
	19	self.decompress.decompress("\x78\x9c")
	20
	21	def readline(self):
	22	if self.dataleft == None:
	23	while len(self.header) < 30:
	24	s = self.file.read(30 - len(self.header))
	25	assert s != ""
	26	self.header = self.header + s
	27	# Name length and extra length.
	28	namelen = 256 * ord(self.header[27]) + ord(self.header[26])
	29	extralen = 256 * ord(self.header[29]) + ord(self.header[28])
	30	while len(self.header) < 30 + namelen + extralen:
	31	s = self.file.read(30 + namelen + extralen - len(self.header))
	32	assert s != ""
	33	self.header = self.header + s
	34	self.dataleft = \
	35	256 * (256 * (256 * ord(self.header[21]) + ord(self.header[20])) \
	36	+ ord(self.header[19])) + ord(self.header[18])
	37	k = string.find(self.data, "\n")
	38	while k < 0:
	39	rlen = self.dataleft
	40	if rlen > 4096: rlen = 4096
	41	if rlen == 0: break
	42	d = self.file.read(rlen)
	43	if d == "": break
	44	self.dataleft = self.dataleft - rlen
	45	self.data = self.data + self.decompress.decompress(d)
	46	k = string.find(self.data, "\n")
	47	if k < 0:
	48	ret = self.data
	49	self.data = ""
	50	return ret
	51	else:
	52	ret = self.data[:k+1]
	53	self.data = self.data[k+1:]
	54	return ret
	55
	56	def hexstr(x):
	57	s = hex(x)
	58	if s[-1:] == "L" or s[-1:] == "l":
	59	s = s[:-1]
	60	if s[:2] == "0x" or s[:2] == "0X":
	61	s = s[2:]
	62	return s
	63
	64	def charname(x):
	65	if db is not None:
	66	key = hexstr(x)
	67	while len(key) < 4: key = "0" + key
	68	key = string.upper(key)
	69	if han_translations:
	70	try:
	71	value = handb[key]
	72	return "<han> " + value
	73	except KeyError:
	74	pass
	75	try:
	76	value = db[key]
	77	return string.split(value, ";")[1]
	78	except KeyError:
	79	return "<no name available>"
	80	else:
	81	return ""
	82
	83	def output(char, bytes, errors):
	84	if output_analysis:
	85	if char == -1:
	86	s = " "
	87	else:
	88	s = "U-%08X " % char
	89	for i in bytes:
	90	s = s + " %02X" % i
	91	for i in range(6-len(bytes)):
	92	s = s + " "
	93
	94	if char == -1:
	95	name = ""
	96	else:
	97	name = charname(char)
	98	if name != "":
	99	s = s + " " + name
	100	s = s + errors
	101	print s
	102	else:
	103	if char == -1 or errors != "":
	104	# problem chars become U+FFFD REPLACEMENT CHARACTER
	105	sys.stdout.write("\xEF\xBF\xBD")
	106	else:
	107	for i in bytes:
	108	sys.stdout.write(chr(i))
	109
	110	def process_ucs(x, bytes=[], errors=""):
	111	if x < 0x80:
	112	utf8 = [x]
	113	realbytes = 1
	114	else:
	115	if x < 0x800:
	116	tmp = (0xC0, 1)
	117	elif x < 0x10000:
	118	tmp = (0xE0, 2)
	119	elif x < 0x200000:
	120	tmp = (0xF0, 3)
	121	elif x < 0x4000000:
	122	tmp = (0xF8, 4)
	123	else:
	124	assert x < 0x80000000L
	125	tmp = (0xFC, 5)
	126	realbytes = tmp[1] + 1
	127	utf8 = [tmp[0] + (x >> (6*tmp[1]))]
	128	for i in range(tmp[1]-1, -1, -1):
	129	utf8.append(0x80 + (0x3F & (x >> (i*6))))
	130
	131	if bytes != [] and len(bytes) > realbytes:
	132	errors = errors + " (overlong form of"
	133	for i in utf8:
	134	errors = errors + " %02X" % i
	135	errors = errors + ")"
	136	utf8 = bytes
	137	if x >= 0xD800 and x <= 0xDFFF:
	138	errors = errors + " (surrogate)"
	139	if x >= 0xFFFE and x <= 0xFFFF:
	140	errors = errors + " (invalid char)"
	141
	142	output(x, utf8, errors)
	143
	144	def process_utf8(next):
	145	c = next()
	146	while c != None:
	147	char = [c]
	148	i = c
	149	if i < 0x80:
	150	process_ucs(i) # single-byte char
	151	c = next()
	152	elif i == 0xfe or i == 0xff:
	153	output(-1, char, " (invalid UTF-8 byte)")
	154	c = next()
	155	elif i >= 0x80 and i <= 0xbf:
	156	output(-1, char, " (unexpected continuation byte)")
	157	c = next()
	158	else:
	159	if i >= 0xC0 and i <= 0xDF:
	160	acc = i &~ 0xC0
	161	cbytes = 1
	162	elif i >= 0xE0 and i <= 0xEF:
	163	acc = i &~ 0xE0
	164	cbytes = 2
	165	elif i >= 0xF0 and i <= 0xF7:
	166	acc = i &~ 0xF0
	167	cbytes = 3
	168	elif i >= 0xF8 and i <= 0xFB:
	169	acc = i &~ 0xF8
	170	cbytes = 4
	171	elif i >= 0xFC and i <= 0xFD:
	172	acc = i &~ 0xFC
	173	cbytes = 5
	174	gotone = 0
	175	while cbytes > 0:
	176	c = next()
	177	if c == None or c < 0x80 or c > 0xBF:
	178	gotone = 1
	179	break
	180	char.append(c)
	181	acc = (acc << 6) + (c & 0x3F)
	182	cbytes = cbytes - 1
	183	if cbytes > 0:
	184	output(-1, char, " (incomplete sequence)")
	185	else:
	186	process_ucs(acc, char)
	187	if not gotone:
	188	c = next()
	189
	190	def do(args):
	191	# Class to turn a list into a callable object that returns one
	192	# element at a time.
	193	class liststepper:
	194	def __init__(self, list):
	195	self.list = list
	196	self.index = 0
	197	def __call__(self):
	198	if self.index >= len(self.list):
	199	return None
	200	ret = self.list[self.index]
	201	self.index = self.index + 1
	202	return ret
	203
	204	list = []
	205	for arg in args:
	206	got = ('none')
	207	if string.upper(arg[0]) == "U":
	208	assert arg[1] == "+" or arg[1] == "-"
	209	got = ('ucs', string.atoi(arg[2:], 16))
	210	elif arg[:2] == "&#":
	211	# SGML character entity. Either &# followed by a
	212	# number, or &#x followed by a hex number.
	213	s = arg
	214	if s[-1:] == ";": s = s[:-1]
	215	if string.upper(s[:3]) == "&#X":
	216	got = ('ucs', string.atoi(s[3:], 16))
	217	else:
	218	got = ('ucs', string.atoi(s[2:], 10))
	219	else:
	220	got = ('utf8', string.atoi(arg, 16))
	221
	222	if got[0] == 'utf8':
	223	list.append(got[1])
	224	elif got[0] == 'ucs':
	225	if len(list) > 0:
	226	process_utf8(liststepper(list))
	227	list = []
	228	process_ucs(got[1])
	229
	230	if len(list) > 0:
	231	process_utf8(liststepper(list))
	232
	233	def usage(arg):
	234	print "usage: cvt-utf8 [flags] <hex UTF-8 bytes, U+codepoints, SGML entities>"
	235	print " e.g. cvt-utf8 e2 82 ac"
	236	print " or cvt-utf8 U+20ac"
	237	print " or cvt-utf8 U-10ffff"
	238	print " or cvt-utf8 '–'"
	239	print ""
	240	print "where: -o or --output just output well-formed UTF-8 instead of"
	241	print " an analysis of the input data"
	242	print " -h or --han also give Han definitions from unihan db"
	243	print ""
	244	print " also: cvt-utf8 --test run Markus Kuhn's decoder stress tests" #'
	245	print " cvt-utf8 --input (or -i)"
	246	print " read, analyse and decode UTF-8 from stdin"
	247	if arg == "--help-admin":
	248	print " cvt-utf8 --help display user help text"
	249	print " cvt-utf8 --help-admin display admin help text (this one)"
	250	print " cvt-utf8 --build <infile> <outfile>"
	251	print " convert UnicodeData.txt to unicode db"
	252	print " cvt-utf8 --build-unihan <infile> <outfile>"
	253	print " convert Unihan.txt to unihan db"
	254	print " cvt-utf8 --fetch-build <outfile>"
	255	print " "+\
	256	"build unicode db by download from unicode.org"
	257	print " cvt-utf8 --fetch-build-unihan <outfile>"
	258	print " "+\
	259	"build Unihan db by download from unicode.org"
	260	else:
	261	print " cvt-utf8 --help display this help text"
	262	print " cvt-utf8 --help-admin display admin help text"
	263	print " cvt-utf8 --version report version number"
	264	print " cvt-utf8 --licence display (MIT) licence text"
	265
	266	def licence():
	267	print "cvt-utf8 is copyright 2002-2004 Simon Tatham."
	268	print ""
	269	print "Permission is hereby granted, free of charge, to any person"
	270	print "obtaining a copy of this software and associated documentation files"
	271	print "(the \"Software\"), to deal in the Software without restriction,"
	272	print "including without limitation the rights to use, copy, modify, merge,"
	273	print "publish, distribute, sublicense, and/or sell copies of the Software,"
	274	print "and to permit persons to whom the Software is furnished to do so,"
	275	print "subject to the following conditions:"
	276	print ""
	277	print "The above copyright notice and this permission notice shall be"
	278	print "included in all copies or substantial portions of the Software."
	279	print ""
	280	print "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND,"
	281	print "EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF"
	282	print "MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND"
	283	print "NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS"
	284	print "BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN"
	285	print "ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN"
	286	print "CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE"
	287	print "SOFTWARE."
	288
	289	def version():
	290	rev = "$Revision$"
	291	rev = string.replace(rev, " ", "")
	292	rev = string.replace(rev, "$", "")
	293	revs = string.split(rev, ":")
	294	if len(revs) > 1:
	295	print "cvt-utf8 revision %s" % revs[1]
	296	else:
	297	print "cvt-utf8: unknown version"
	298
	299	args = sys.argv[1:]
	300	output_analysis = 1
	301	han_translations = 0
	302	mode = "cmdline"
	303
	304	if args == []:
	305	usage("")
	306	sys.exit(0)
	307
	308	while len(args) > 0 and args[0][:1] == "-":
	309	if args[0] == "--help" or args[0] == "--help-admin":
	310	usage(args[0])
	311	sys.exit(0)
	312
	313	elif args[0] == "--licence" or args[0] == "--license":
	314	licence()
	315	sys.exit(0)
	316
	317	elif args[0] == "--version":
	318	version()
	319	sys.exit(0)
	320
	321	elif args[0] == "-o" or args[0] == "--output":
	322	output_analysis = 0
	323	args = args[1:]
	324
	325	elif args[0] == "-h" or args[0] == "--han":
	326	han_translations = 1
	327	args = args[1:]
	328
	329	elif args[0] == "--build" or args[0] == "--fetch-build":
	330	if args[0] == "--build":
	331	if len(args) != 3:
	332	print "cvt-utf8: --build expects two filename arguments"
	333	sys.exit(1)
	334	infile = open(args[1], "r")
	335	outfile = args[2]
	336	else:
	337	if len(args) != 2:
	338	print "cvt-utf8: --fetch-build expects one filename argument"
	339	sys.exit(1)
	340	import urllib
	341	infile = urllib.urlopen("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
	342	outfile = args[1]
	343	# Now build the database.
	344	if outfile[-3:] == ".db":
	345	print "cvt-utf8: warning: you should not append .db to db name"
	346
	347	db = anydbm.open(outfile, "n")
	348	while 1:
	349	s = infile.readline()
	350	if s == "": break
	351	ss = string.split(s, ";")[0]
	352	db[ss] = s
	353	db.close()
	354	sys.exit(0)
	355
	356	elif args[0] == "--build-unihan" or args[0] == "--fetch-build-unihan":
	357	if args[0] == "--build-unihan":
	358	if len(args) != 3:
	359	print "cvt-utf8: --build expects two filename arguments"
	360	sys.exit(1)
	361	infile = open(args[1], "r")
	362	s = infile.read(1)
	363	# Unihan.txt starts with a hash. If this file starts with a
	364	# P, we assume it's a zip file ("PK").
	365	if s == "P":
	366	infile = zip_untangler(infile, s)
	367	s = ""
	368	outfile = args[2]
	369	else:
	370	if len(args) != 2:
	371	print "cvt-utf8: --fetch-build-unihan expects one filename argument"
	372	sys.exit(1)
	373	import urllib
	374	infile = urllib.urlopen("ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip")
	375	# We know this one is zipped.
	376	infile = zip_untangler(infile, "")
	377	outfile = args[1]
	378	s = ""
	379	# Now build the database.
	380	if outfile[-3:] == ".db":
	381	print "cvt-utf8: warning: you should not append .db to db name"
	382
	383	db = anydbm.open(outfile, "n")
	384	while 1:
	385	s = s + infile.readline()
	386	if s == "": break
	387	while s[-1:] == "\r" or s[-1:] == "\n":
	388	s = s[:-1]
	389	sa = string.split(s, "\t")
	390	if len(sa) == 3 and sa[1] == "kDefinition" and sa[0][:2] == "U+":
	391	db[sa[0][2:]] = sa[2]
	392	s = ""
	393	db.close()
	394	sys.exit(0)
	395
	396	elif args[0] == "--test":
	397	mode = "test"
	398	args = args[1:]
	399
	400	elif args[0] == "--input" or args[0] == "-i":
	401	mode = "input"
	402	args = args[1:]
	403
	404	else:
	405	sys.stderr.write("cvt-utf8: unknown argument '%s'" % args[0])
	406	sys.exit(1)
	407
	408	locations = []
	409	locations.append("/usr/share/unicode/unicode")
	410	locations.append("/usr/lib/unicode/unicode")
	411	locations.append("/usr/local/share/unicode/unicode")
	412	locations.append("/usr/local/lib/unicode/unicode")
	413	locations.append(os.environ["HOME"] + "/share/unicode/unicode")
	414	locations.append(os.environ["HOME"] + "/lib/unicode/unicode")
	415
	416	for loc in locations:
	417	try:
	418	db = anydbm.open(loc, "r")
	419	except IOError:
	420	db = None
	421	except anydbm.error:
	422	db = None
	423	if db != None:
	424	break
	425	if han_translations:
	426	i = string.rfind(loc, "/")
	427	assert i >= 0
	428	hanloc = loc[:i+1] + "unihan"
	429	handb = anydbm.open(hanloc, "r")
	430	# this has been explicitly required, so we don't squelch exceptions
	431
	432	if mode == "test":
	433	do(["CE","BA","E1","BD","B9","CF","83","CE","BC","CE","B5"])
	434	do(["00"])
	435	do(["C2","80"])
	436	do(["E0","A0","80"])
	437	do(["F0","90","80","80"])
	438	do(["F8","88","80","80","80"])
	439	do(["FC","84","80","80","80","80"])
	440	do(["7F"])
	441	do(["DF","BF"])
	442	do(["EF","BF","BF"])
	443	do(["F7","BF","BF","BF"])
	444	do(["FB","BF","BF","BF","BF"])
	445	do(["FD","BF","BF","BF","BF","BF"])
	446	do(["ED","9F","BF"])
	447	do(["EE","80","80"])
	448	do(["EF","BF","BD"])
	449	do(["F4","8F","BF","BF"])
	450	do(["F4","90","80","80"])
	451	do(["80"])
	452	do(["BF"])
	453	do(["80","BF"])
	454	do(["80","BF","80"])
	455	do(["80","BF","80","BF"])
	456	do(["80","BF","80","BF","80"])
	457	do(["80","BF","80","BF","80","BF"])
	458	do(["80","BF","80","BF","80","BF","80"])
	459	do(["80","81","82","83","84","85","86","87",
	460	"88","89","8A","8B","8C","8D","8E","8F",
	461	"90","91","92","93","94","95","96","97",
	462	"98","99","9A","9B","9C","9D","9E","9F",
	463	"A0","A1","A2","A3","A4","A5","A6","A7",
	464	"A8","A9","AA","AB","AC","AD","AE","AF",
	465	"B0","B1","B2","B3","B4","B5","B6","B7",
	466	"B8","B9","BA","BB","BC","BD","BE","BF"])
	467	do(["C0","20","C1","20","C2","20","C3","20",
	468	"C4","20","C5","20","C6","20","C7","20",
	469	"C8","20","C9","20","CA","20","CB","20",
	470	"CC","20","CD","20","CE","20","CF","20",
	471	"D0","20","D1","20","D2","20","D3","20",
	472	"D4","20","D5","20","D6","20","D7","20",
	473	"D8","20","D9","20","DA","20","DB","20",
	474	"DC","20","DD","20","DE","20","DF","20"])
	475	do(["E0","20","E1","20","E2","20","E3","20",
	476	"E4","20","E5","20","E6","20","E7","20",
	477	"E8","20","E9","20","EA","20","EB","20",
	478	"EC","20","ED","20","EE","20","EF","20"])
	479	do(["F0","20","F1","20","F2","20","F3","20",
	480	"F4","20","F5","20","F6","20","F7","20"])
	481	do(["F8","20","F9","20","FA","20","FB","20"])
	482	do(["FC","20","FD","20"])
	483	do(["C0"])
	484	do(["E0","80"])
	485	do(["F0","80","80"])
	486	do(["F8","80","80","80"])
	487	do(["FC","80","80","80","80"])
	488	do(["DF"])
	489	do(["EF","BF"])
	490	do(["F7","BF","BF"])
	491	do(["FB","BF","BF","BF"])
	492	do(["FD","BF","BF","BF","BF"])
	493	do(["C0","E0","80","F0","80","80","F8","80",
	494	"80","80","FC","80","80","80","80",
	495	"DF","EF","BF","F7","BF","BF","FB",
	496	"BF","BF","BF","FD","BF","BF","BF","BF"])
	497	do(["FE"])
	498	do(["FF"])
	499	do(["FE","FE","FF","FF"])
	500	do(["C0","AF"])
	501	do(["E0","80","AF"])
	502	do(["F0","80","80","AF"])
	503	do(["F8","80","80","80","AF"])
	504	do(["FC","80","80","80","80","AF"])
	505	do(["C1","BF"])
	506	do(["E0","9F","BF"])
	507	do(["F0","8F","BF","BF"])
	508	do(["F8","87","BF","BF","BF"])
	509	do(["FC","83","BF","BF","BF","BF"])
	510	do(["C0","80"])
	511	do(["E0","80","80"])
	512	do(["F0","80","80","80"])
	513	do(["F8","80","80","80","80"])
	514	do(["FC","80","80","80","80","80"])
	515	do(["ED","A0","80"])
	516	do(["ED","AD","BF"])
	517	do(["ED","AE","80"])
	518	do(["ED","AF","BF"])
	519	do(["ED","B0","80"])
	520	do(["ED","BE","80"])
	521	do(["ED","BF","BF"])
	522	do(["ED","A0","80","ED","B0","80"])
	523	do(["ED","A0","80","ED","BF","BF"])
	524	do(["ED","AD","BF","ED","B0","80"])
	525	do(["ED","AD","BF","ED","BF","BF"])
	526	do(["ED","AE","80","ED","B0","80"])
	527	do(["ED","AE","80","ED","BF","BF"])
	528	do(["ED","AF","BF","ED","B0","80"])
	529	do(["ED","AF","BF","ED","BF","8F"])
	530	do(["EF","BF","BE"])
	531	do(["EF","BF","BF"])
	532	elif mode == "input":
	533	def getchar():
	534	s = sys.stdin.read(1)
	535	if s == "":
	536	return None
	537	return ord(s) & 0xFF # ensure it isn't negative
	538	process_utf8(getchar)
	539	else:
	540	do(args)