Add the ability to cope with SGML entity syntax on cvt-utf8's command line.

author simon <simon@cda61777-01e9-0310-a592-d414129be87e>

Wed, 15 Feb 2006 18:40:44 +0000 (18:40 +0000)

committer simon <simon@cda61777-01e9-0310-a592-d414129be87e>

Wed, 15 Feb 2006 18:40:44 +0000 (18:40 +0000)
author simon <simon@cda61777-01e9-0310-a592-d414129be87e>
Wed, 15 Feb 2006 18:40:44 +0000 (18:40 +0000)
committer simon <simon@cda61777-01e9-0310-a592-d414129be87e>
Wed, 15 Feb 2006 18:40:44 +0000 (18:40 +0000)
diff --git a/cvt-utf8/cvt-utf8 b/cvt-utf8/cvt-utf8

index 8df91ca..c7c4d20 100755 (executable)
--- a/cvt-utf8/cvt-utf8
+++ b/cvt-utf8/cvt-utf8
@@ -203,23 +203,39 @@ def do(args):
  
      list = []
      for arg in args:
+       got = ('none')
         if string.upper(arg[0]) == "U":
+           assert arg[1] == "+" or arg[1] == "-"
+           got = ('ucs', string.atoi(arg[2:], 16))
+       elif arg[:2] == "&#":
+           # SGML character entity. Either &# followed by a
+           # number, or &#x followed by a hex number.
+           s = arg
+           if s[-1:] == ";": s = s[:-1]
+           if string.upper(s[:3]) == "&#X":
+               got = ('ucs', string.atoi(s[3:], 16))
+           else:
+               got = ('ucs', string.atoi(s[2:], 10))
+       else:
+           got = ('utf8', string.atoi(arg, 16))
+
+       if got[0] == 'utf8':
+           list.append(got[1])
+       elif got[0] == 'ucs':
             if len(list) > 0:
                 process_utf8(liststepper(list))
                 list = []
-           assert arg[1] == "+" or arg[1] == "-"
-           process_ucs(string.atoi(arg[2:], 16))
-       else:
-           list.append(string.atoi(arg, 16))
+           process_ucs(got[1])
  
      if len(list) > 0:
         process_utf8(liststepper(list))
  
  def usage(arg):
-    print "usage: cvt-utf8 [flags] <hex UTF-8 bytes and/or U+codepoints>"
+    print "usage: cvt-utf8 [flags] <hex UTF-8 bytes, U+codepoints, SGML entities>"
      print "  e.g. cvt-utf8 e2 82 ac"
      print "    or cvt-utf8 U+20ac"
      print "    or cvt-utf8 U-10ffff"
+    print "    or cvt-utf8 '&#8211;'"
      print ""
      print "where: -o or --output        just output well-formed UTF-8 instead of"
      print "                             an analysis of the input data"
diff --git a/cvt-utf8/cvt-utf8.but b/cvt-utf8/cvt-utf8.but

index bf8294a..98848d7 100644 (file)
--- a/cvt-utf8/cvt-utf8.but
+++ b/cvt-utf8/cvt-utf8.but
@@ -8,8 +8,8 @@
  
  \U SYNOPSIS
  
-\c cvt-utf8 [flags] [hex UTF-8 bytes and/or U+codepoints]
-\e bbbbbbbb  iiiii   iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii
+\c cvt-utf8 [flags] [hex UTF-8 bytes, U+codepoints, SGML entities]
+\e bbbbbbbb  iiiii   iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii
  
  \U DESCRIPTION
  
@@ -28,11 +28,12 @@ code point in the Unicode character database and identify it.
  \b Look up Unified Han characters in the \q{Unihan} database and
  provide their translation text.
  
-By default, \cw{cvt-utf8} expects to receive hex numbers (either
-UTF-8 bytes or Unicode code points) on the command line, and it will
-print out a verbose analysis of the input data. If you need it to
-read UTF-8 from standard input or to write pure UTF-8 to standard
-output, you can do so using command-line options.
+By default, \cw{cvt-utf8} expects to receive character data on the
+command line (as a mixture of UTF-8 bytes, Unicode code points and
+SGML numeric character entities), and it will print out a verbose
+analysis of the input data. If you need it to read UTF-8 from
+standard input or to write pure UTF-8 to standard output, you can do
+so using command-line options.
  
  \U OPTIONS
  
@@ -66,6 +67,15 @@ points...
  ... and \cw{cvt-utf8} gives you the UTF-8 encodings plus the
  character definitions.
  
+If it's more convenient, you can specify those characters as SGML
+numeric entity references (for example if you're cutting and pasting
+out of a web page):
+
+\c $ cvt-utf8 '&#8364;' '&#x2013;'
+\e   bbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+\c U-000020AC  E2 82 AC          EURO SIGN
+\c U-00002013  E2 80 93          EN DASH
+
  Alternatively, you can supply a list of UTF-8 bytes...
  
  \c $ cvt-utf8 D0 A0 D1 83 D1 81 D1 81 D0 BA D0 B8 D0 B9
author	simon <simon@cda61777-01e9-0310-a592-d414129be87e>
	Wed, 15 Feb 2006 18:40:44 +0000 (18:40 +0000)
committer	simon <simon@cda61777-01e9-0310-a592-d414129be87e>
	Wed, 15 Feb 2006 18:40:44 +0000 (18:40 +0000)
cvt-utf8/cvt-utf8		patch \| blob \| blame \| history
cvt-utf8/cvt-utf8.but		patch \| blob \| blame \| history