list = []
for arg in args:
+ got = ('none')
if string.upper(arg[0]) == "U":
+ assert arg[1] == "+" or arg[1] == "-"
+ got = ('ucs', string.atoi(arg[2:], 16))
+ elif arg[:2] == "&#":
+ # SGML character entity. Either &# followed by a
+ # number, or &#x followed by a hex number.
+ s = arg
+ if s[-1:] == ";": s = s[:-1]
+ if string.upper(s[:3]) == "&#X":
+ got = ('ucs', string.atoi(s[3:], 16))
+ else:
+ got = ('ucs', string.atoi(s[2:], 10))
+ else:
+ got = ('utf8', string.atoi(arg, 16))
+
+ if got[0] == 'utf8':
+ list.append(got[1])
+ elif got[0] == 'ucs':
if len(list) > 0:
process_utf8(liststepper(list))
list = []
- assert arg[1] == "+" or arg[1] == "-"
- process_ucs(string.atoi(arg[2:], 16))
- else:
- list.append(string.atoi(arg, 16))
+ process_ucs(got[1])
if len(list) > 0:
process_utf8(liststepper(list))
def usage(arg):
- print "usage: cvt-utf8 [flags] <hex UTF-8 bytes and/or U+codepoints>"
+ print "usage: cvt-utf8 [flags] <hex UTF-8 bytes, U+codepoints, SGML entities>"
print " e.g. cvt-utf8 e2 82 ac"
print " or cvt-utf8 U+20ac"
print " or cvt-utf8 U-10ffff"
+ print " or cvt-utf8 '&#8211;'"
print ""
print "where: -o or --output just output well-formed UTF-8 instead of"
print " an analysis of the input data"
\U SYNOPSIS
-\c cvt-utf8 [flags] [hex UTF-8 bytes and/or U+codepoints]
-\e bbbbbbbb iiiii iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii
+\c cvt-utf8 [flags] [hex UTF-8 bytes, U+codepoints, SGML entities]
+\e bbbbbbbb iiiii iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii
\U DESCRIPTION
\b Look up Unified Han characters in the \q{Unihan} database and
provide their translation text.
-By default, \cw{cvt-utf8} expects to receive hex numbers (either
-UTF-8 bytes or Unicode code points) on the command line, and it will
-print out a verbose analysis of the input data. If you need it to
-read UTF-8 from standard input or to write pure UTF-8 to standard
-output, you can do so using command-line options.
+By default, \cw{cvt-utf8} expects to receive character data on the
+command line (as a mixture of UTF-8 bytes, Unicode code points and
+SGML numeric character entities), and it will print out a verbose
+analysis of the input data. If you need it to read UTF-8 from
+standard input or to write pure UTF-8 to standard output, you can do
+so using command-line options.
\U OPTIONS
... and \cw{cvt-utf8} gives you the UTF-8 encodings plus the
character definitions.
+If it's more convenient, you can specify those characters as SGML
+numeric entity references (for example if you're cutting and pasting
+out of a web page):
+
+\c $ cvt-utf8 '&#8364;' '&#x2013;'
+\e bbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+\c U-000020AC E2 82 AC EURO SIGN
+\c U-00002013 E2 80 93 EN DASH
+
Alternatively, you can supply a list of UTF-8 bytes...
\c $ cvt-utf8 D0 A0 D1 83 D1 81 D1 81 D0 BA D0 B8 D0 B9