--- /dev/null
+all: base64.1 base64
+
+base64: base64.c
+ $(CC) $(CFLAGS) -o $@ $<
+
+%.1: %.but
+ halibut --man=$@ $<
+
+clean:
+ rm -f *.1 base64
--- /dev/null
+\cfg{man-identity}{base64}{1}{2004-08-02}{Simon Tatham}{Simon Tatham}
+\cfg{man-mindepth}{1}
+
+\C{base64-manpage} Man page for \cw{base64}
+
+\H{base64-manpage-name} NAME
+
+\cw{base64} - stand-alone encoder and decoder for base64
+
+\H{base64-manpage-synopsis} SYNOPSIS
+
+\c base64 [ -d ] [ filename ]
+\e bbbbbb   bb     iiiiiiii
+\c base64 -e [ -cwidth ] [ filename ]
+\e bbbbbb bb   bbiiiii     iiiiiiii
+
+\H{base64-manpage-description} DESCRIPTION
+
+\cw{base64} is a command-line utility for encoding and decoding the
+\q{base64} encoding.
+
+This encoding, defined in RFC 2045, is primarily used to encode
+binary attachments in MIME e-mail, but is widely used in many other
+applications as well. For example, the \q{Content-MD5} mail header
+contains a small piece of base64; SSH private keys are generally
+stored as base64-encoded blobs; and so on.
+
+Other utilities, such as \cw{munpack}, exist which will take an
+entire MIME-encoded message, identify the base64-encoded subparts,
+and decode them. However, these utilities will not help you if you
+need to inspect a Content-MD5 header or an SSH private key.
+
+\cw{base64} is a very simple stand-alone encoder and decoder for the
+base64 format \e{alone}. It does not try to understand MIME headers
+or anything other than raw data.
+
+\H{base64-manpage-options} OPTIONS
+
+By default (if neither \cw{-d} nor \cw{-e} is supplied), \cw{base64}
+operates in decode mode.
+
+\dt \cw{-d}
+
+\dd Places \cw{base64} into decode mode. In this mode, it will read
+from standard input or the supplied file name, ignore all characters
+that are not part of the base64 alphabet, decode the ones that are,
+and output the decoded data on standard output.
+
+\dt \cw{-e}
+
+\dd Places \cw{base64} into encode mode. In this mode, it will read
+binary data from standard input or the supplied file name, encode it
+as base64, and output the encoded data on standard output.
+
+\dt \cw{-c}\e{width}
+
+\dd If \cw{base64} is operating in encode mode, this controls the
+number of base64 characters output per line of the encoded file.
+Normally base64-reading applications do not care about this, so the
+default of 64 characters per line is perfectly adequate.
+
+\lcont{
+
+The special value 0 will prevent \cw{base64} from ever writing a
+line break in the middle of the data at all.
+
+The base64 encoding converts between a group of three plaintext
+bytes and a group of four encoded bytes. \cw{base64} does not
+support breaking an encoded group across a line. Therefore, the
+\e{width} parameter passed to \cw{-c} must be a multiple of 4.
+
+}
--- /dev/null
+#include <ctype.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * True if c is a member of the base64 alphabet, counting the '='
+ * padding character as a member. This is a macro and evaluates its
+ * argument several times, so do not pass an expression that has side
+ * effects.
+ */
+#define isbase64(c) ( ((c) >= 'A' && (c) <= 'Z') || \
+ ((c) >= 'a' && (c) <= 'z') || \
+ ((c) >= '0' && (c) <= '9') || \
+ (c) == '+' || (c) == '/' || (c) == '=' \
+ )
+
+/*
+ * Decode one four-character base64 group ("atom").
+ *
+ * atom points at exactly four characters; out receives up to three
+ * decoded bytes. Returns the number of bytes stored in out (1, 2 or
+ * 3), or 0 if the atom is malformed: it contains a character outside
+ * the base64 alphabet, has '=' in either of its first two positions,
+ * or has '=' in the third position followed by a data character in
+ * the fourth. Nothing is written to out when 0 is returned.
+ */
+int base64_decode_atom(char *atom, unsigned char *out) {
+    int sextet[4];
+    int pos, nbytes;
+    unsigned bits;
+
+    /* Map each character to its 6-bit value, or -1 for '=' padding. */
+    for (pos = 0; pos < 4; pos++) {
+        char ch = atom[pos];
+
+        if (ch >= 'A' && ch <= 'Z') {
+            sextet[pos] = ch - 'A';
+        } else if (ch >= 'a' && ch <= 'z') {
+            sextet[pos] = ch - 'a' + 26;
+        } else if (ch >= '0' && ch <= '9') {
+            sextet[pos] = ch - '0' + 52;
+        } else if (ch == '+') {
+            sextet[pos] = 62;
+        } else if (ch == '/') {
+            sextet[pos] = 63;
+        } else if (ch == '=') {
+            sextet[pos] = -1;
+        } else {
+            return 0;                  /* not a base64 character */
+        }
+    }
+
+    /* Padding is only legal as the last one or two characters. */
+    if (sextet[0] < 0 || sextet[1] < 0)
+        return 0;
+    if (sextet[2] < 0 && sextet[3] >= 0)
+        return 0;
+
+    if (sextet[3] >= 0)
+        nbytes = 3;
+    else if (sextet[2] >= 0)
+        nbytes = 2;
+    else
+        nbytes = 1;
+
+    /*
+     * Assemble the 24-bit group. A padding sextet is masked to six
+     * bits; the bits it contributes land only in output bytes that
+     * are not emitted for this atom.
+     */
+    bits = ((sextet[0] << 18) |
+            (sextet[1] << 12) |
+            ((sextet[2] & 0x3F) << 6) |
+            (sextet[3] & 0x3F));
+
+    out[0] = (bits >> 16) & 0xFF;
+    if (nbytes >= 2)
+        out[1] = (bits >> 8) & 0xFF;
+    if (nbytes >= 3)
+        out[2] = bits & 0xFF;
+    return nbytes;
+}
+
+/*
+ * Encode up to three input bytes as one four-character base64 group.
+ *
+ * data points at the input bytes and n says how many of them are
+ * valid (only data[0] is read when n is 1, data[0..1] when n is 2,
+ * data[0..2] otherwise). Exactly four characters are stored in out,
+ * with '=' padding in the last one or two positions when n is less
+ * than 3. out is not NUL-terminated.
+ */
+void base64_encode_atom(unsigned char *data, int n, char *out) {
+    static const char alphabet[] =
+        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+    unsigned bits = data[0] << 16;
+
+    /* Pack the available bytes into the top of a 24-bit group. */
+    if (n > 1)
+        bits |= data[1] << 8;
+    if (n > 2)
+        bits |= data[2];
+
+    out[0] = alphabet[(bits >> 18) & 0x3F];
+    out[1] = alphabet[(bits >> 12) & 0x3F];
+    out[2] = (n > 1) ? alphabet[(bits >> 6) & 0x3F] : '=';
+    out[3] = (n > 2) ? alphabet[bits & 0x3F] : '=';
+}
+
+/* Help text printed by usage(); one line per invocation form or option. */
+const char usagemsg[] =
+    "usage: base64 [-d] [filename] decode from a file or from stdin\n"
+    " or: base64 -e [-cNNN] [filename] encode from a file or from stdin\n"
+    " also: base64 --version report version number\n"
+    " and: base64 --help display this help text\n"
+    "where: -d decode mode (default)\n"
+    " -e encode mode\n"
+    " -cNNN set number of chars per line for encoded output\n"
+    ;
+
+/* Write the help text to standard output. */
+void usage(void) {
+    /* sizeof counts the terminating NUL, which is not printed */
+    fwrite(usagemsg, 1, sizeof(usagemsg) - 1, stdout);
+}
+
+/*
+ * Print the program's revision to standard output.
+ *
+ * The revision is taken from the "$Revision$" keyword string. When
+ * the keyword has been expanded to the form "$Revision: N $", the text
+ * between the colon and the closing '$' is printed with surrounding
+ * white space removed. When it has not been expanded (there is no
+ * colon in it), "unknown version" is reported instead.
+ */
+void version(void) {
+#define SVN_REV "$Revision$"
+    char rev[sizeof(SVN_REV)];
+    char *p, *q;
+
+    /* Work on a writable copy so the string can be cut in place. */
+    strcpy(rev, SVN_REV);
+
+    for (p = rev; *p && *p != ':'; p++);
+    if (*p) {
+        p++;
+        /* ctype functions need an unsigned char value, not plain char */
+        while (*p && isspace((unsigned char)*p)) p++;
+        for (q = p; *q && *q != '$'; q++);
+        /* drop the white space that precedes the closing '$' */
+        while (q > p && isspace((unsigned char)q[-1])) q--;
+        *q = '\0';
+        printf("base64 revision %s\n", p);
+    } else {
+        printf("base64: unknown version\n");
+    }
+}
+
+/*
+ * Program entry point: parse the command line, then encode or decode
+ * the named file (or standard input) to standard output.
+ *
+ * Options: -d decode (the default), -e encode, -cN or "-c N" set the
+ * encoded line width (a non-negative multiple of 4; 0 disables line
+ * breaks), -v/-V/--version, -h/-H/--help. At most one filename may be
+ * given. Exits with status 1 on a usage error, an input file that
+ * cannot be opened, or an I/O error; returns 0 otherwise.
+ */
+int main(int ac, char **av) {
+    int encoding = 0;
+    int cpl = 64;
+    int status = 0;
+    long width;
+    FILE *fp;
+    char *fname;
+    char *eptr;
+
+    fname = NULL;
+
+    while (--ac) {
+        char *v, *p = *++av;
+        if (*p == '-') {
+            /* Walk a cluster of single-letter options such as "-ec72". */
+            while (*p) {
+                char c = *++p;
+                switch (c) {
+                  case '\0':
+                    /* end of this argument (a lone "-" is ignored) */
+                    break;
+                  case '-':
+                    /* p points at the second '-'; the name follows it */
+                    if (!strcmp(p+1, "version")) {
+                        version();
+                        exit(0);
+                    }
+                    if (!strcmp(p+1, "help")) {
+                        usage();
+                        exit(0);
+                    }
+                    fprintf(stderr, "base64: unrecognised option '-%s'\n", p);
+                    exit(1);
+                    break;
+                  case 'v':
+                  case 'V':
+                    version();
+                    exit(0);
+                    break;
+                  case 'h':
+                  case 'H':
+                    usage();
+                    exit(0);
+                    break;
+                  case 'd':
+                    encoding = 0;
+                    break;
+                  case 'e':
+                    encoding = 1;
+                    break;
+                  case 'c':
+                    /*
+                     * Option requiring a value: either the rest of
+                     * this argument or the whole of the next one.
+                     */
+                    v = p+1;
+                    if (!*v && ac > 1) {
+                        --ac;
+                        v = *++av;
+                    }
+                    if (!*v) {
+                        fprintf(stderr, "base64: option '-%c' expects"
+                                " an argument\n", c);
+                        exit(1);
+                    }
+                    errno = 0;
+                    width = strtol(v, &eptr, 10);
+                    if (*eptr) {
+                        fprintf(stderr, "base64: option -c expects"
+                                " a numeric argument\n");
+                        exit(1);
+                    }
+                    if (errno == ERANGE || width < 0 || width > INT_MAX) {
+                        fprintf(stderr, "base64: chars per line is"
+                                " out of range\n");
+                        exit(1);
+                    }
+                    if (width % 4) {
+                        fprintf(stderr, "base64: chars per line should be"
+                                " divisible by 4\n");
+                        exit(1);
+                    }
+                    cpl = (int)width;
+                    p = "";            /* the value used up the argument */
+                    break;
+                  default:
+                    fprintf(stderr, "base64: unrecognised option '-%c'\n", c);
+                    exit(1);
+                    break;
+                }
+            }
+        } else {
+            if (!fname)
+                fname = p;
+            else {
+                fprintf(stderr, "base64: expected only one filename\n");
+                exit(1);
+            }
+        }
+    }
+
+    if (fname) {
+        /* raw bytes when encoding; text is good enough when decoding */
+        fp = fopen(fname, encoding ? "rb" : "r");
+        if (!fp) {
+            fprintf(stderr, "base64: unable to open '%s': %s\n", fname,
+                    strerror(errno));
+            exit(1);
+        }
+    } else
+        fp = stdin;
+
+    if (encoding) {
+        unsigned char in[3];
+        char out[4];
+        int column;
+        int n;
+
+        column = 0;
+        while ((n = fread(in, 1, 3, fp)) > 0) {
+            /*
+             * Break the line only when more data follows, so that
+             * input which exactly fills the last line is not followed
+             * by an empty line.
+             */
+            if (cpl && column >= cpl) {
+                putchar('\n');
+                column = 0;
+            }
+            base64_encode_atom(in, n, out);
+            fwrite(out, 1, 4, stdout);
+            column += 4;
+        }
+
+        putchar('\n');
+    } else {
+        char in[4];
+        unsigned char out[3];
+        int c, i, n, eof;
+
+        eof = 0;
+        do {
+            /* Collect four base64 characters, skipping anything else. */
+            for (i = 0; i < 4; i++) {
+                do {
+                    c = fgetc(fp);
+                } while (c != EOF && !isbase64(c));
+                if (c == EOF) {
+                    eof = 1;
+                    break;
+                }
+                in[i] = c;
+            }
+            if (i > 0) {
+                if (i < 4) {
+                    fprintf(stderr, "base64: warning: number of base64"
+                            " characters was not a multiple of 4\n");
+                    /* pad the short final group so it can be decoded */
+                    while (i < 4) in[i++] = '=';
+                }
+                /* a malformed group decodes to zero bytes */
+                n = base64_decode_atom(in, out);
+                fwrite(out, 1, n, stdout);
+            }
+        } while (!eof);
+    }
+
+    if (ferror(fp)) {
+        fprintf(stderr, "base64: error reading input\n");
+        status = 1;
+    }
+
+    if (fname)
+        fclose(fp);
+
+    if (fflush(stdout) != 0 || ferror(stdout)) {
+        fprintf(stderr, "base64: error writing output\n");
+        status = 1;
+    }
+
+    return status;
+}
--- /dev/null
+all: cvt-utf8.1
+
+%.1: %.but
+ halibut --man=$@ $<
+
+clean:
+ rm -f *.1
--- /dev/null
+#!/usr/bin/env python
+
+import sys
+import string
+import os
+import anydbm
+import zlib
+
+# File-like wrapper which presents the deflate-compressed data that
+# follows a zip local file header as a stream of text lines. Only
+# readline() is provided. 'file' is any object with a read(n) method;
+# 'datasofar' is whatever the caller has already consumed from the
+# start of the file (fewer than 30 bytes).
+class zip_untangler:
+    def __init__(self, file, datasofar):
+        self.file = file
+        assert len(datasofar) < 30
+        self.header = datasofar
+        self.data = ""
+        # Compressed bytes still to be read; None until the header
+        # has been parsed.
+        self.dataleft = None
+        self.decompress = zlib.decompressobj()
+        # Zlib header bytes, expected by decompress obj but not
+        # present in zip file
+        self.decompress.decompress("\x78\x9c")
+
+    # Keep reading until self.header holds at least 'total' bytes.
+    def _fill_header(self, total):
+        while len(self.header) < total:
+            s = self.file.read(total - len(self.header))
+            assert s != ""
+            self.header = self.header + s
+
+    # Read the 30-byte fixed header plus the variable-length name and
+    # extra fields, and pick out the compressed data length.
+    def _read_header(self):
+        self._fill_header(30)
+        # Name length and extra length (little-endian 16-bit).
+        namelen = 256 * ord(self.header[27]) + ord(self.header[26])
+        extralen = 256 * ord(self.header[29]) + ord(self.header[28])
+        self._fill_header(30 + namelen + extralen)
+        # Compressed size (little-endian 32-bit at offset 18).
+        size = 0
+        for offset in (21, 20, 19, 18):
+            size = 256 * size + ord(self.header[offset])
+        self.dataleft = size
+
+    # Return the next line of decompressed text including its newline,
+    # a final partial line if the data does not end with a newline, or
+    # "" once the compressed data is exhausted.
+    def readline(self):
+        if self.dataleft is None:
+            self._read_header()
+        k = self.data.find("\n")
+        while k < 0:
+            rlen = min(self.dataleft, 4096)
+            if rlen == 0: break
+            d = self.file.read(rlen)
+            if d == "": break
+            # read() may return fewer bytes than requested (sockets
+            # do), so count what actually arrived.
+            self.dataleft = self.dataleft - len(d)
+            self.data = self.data + self.decompress.decompress(d)
+            k = self.data.find("\n")
+        if k < 0:
+            ret = self.data
+            self.data = ""
+        else:
+            ret = self.data[:k+1]
+            self.data = self.data[k+1:]
+        return ret
+
+# Return the hex digits of x as hex() prints them, without the "0x"
+# prefix and without the "L" suffix that hex() puts on long integers.
+def hexstr(x):
+    digits = hex(x)
+    if digits[-1:] in ("L", "l"):
+        digits = digits[:-1]
+    if digits[:2] in ("0x", "0X"):
+        digits = digits[2:]
+    return digits
+
+# Look up the name of code point x. Returns "" when no character
+# database is open. Otherwise the key is x as at least four upper-case
+# hex digits: with Han translations enabled a hit in the Unihan
+# database is returned prefixed by "<han> "; failing that, the second
+# ';'-separated field of the main database record is returned, or
+# "<no name available>" if the code point is not in the database.
+def charname(x):
+    if not db:
+        return ""
+    key = hexstr(x).zfill(4).upper()
+    if han_translations:
+        try:
+            return "<han> " + handb[key]
+        except KeyError:
+            pass
+    try:
+        record = db[key]
+    except KeyError:
+        return "<no name available>"
+    return record.split(";")[1]
+
+# Emit one decoded item. 'char' is the code point, or -1 if the bytes
+# did not decode to one; 'bytes' is the list of byte values involved;
+# 'errors' is a (possibly empty) string of diagnostics.
+#
+# In analysis mode a line is printed holding the code point, the bytes
+# in hex, the character name and the diagnostics. Otherwise the bytes
+# themselves are written to stdout, with anything faulty replaced by
+# the UTF-8 encoding of U+FFFD.
+def output(char, bytes, errors):
+    if not output_analysis:
+        if char == -1 or errors != "":
+            # problem chars become U+FFFD REPLACEMENT CHARACTER
+            sys.stdout.write("\xEF\xBF\xBD")
+        else:
+            for b in bytes:
+                sys.stdout.write(chr(b))
+        return
+
+    if char == -1:
+        line = " "
+    else:
+        line = "U-%08X " % char
+    line = line + "".join([" %02X" % b for b in bytes])
+    # pad the hex column out to the width of six bytes
+    line = line + " " * (6 - len(bytes))
+
+    if char == -1:
+        name = ""
+    else:
+        name = charname(char)
+    if name != "":
+        line = line + " " + name
+    print line + errors
+
+# Report on code point x. The UTF-8 encoding of x is computed (up to
+# six bytes, covering values below 0x80000000). If 'bytes' - the
+# sequence x was actually decoded from - is longer than that encoding,
+# an "overlong form" diagnostic is added and the original bytes are
+# shown instead. Surrogates and U+FFFE/U+FFFF are flagged too. The
+# result is handed to output().
+def process_ucs(x, bytes=[], errors=""):
+    if x < 0x80:
+        realbytes = 1
+        utf8 = [x]
+    else:
+        # (exclusive upper limit, lead byte marker, continuation bytes)
+        for limit, lead, trail in ((0x800, 0xC0, 1),
+                                   (0x10000, 0xE0, 2),
+                                   (0x200000, 0xF0, 3),
+                                   (0x4000000, 0xF8, 4)):
+            if x < limit:
+                break
+        else:
+            assert x < 0x80000000L
+            lead, trail = 0xFC, 5
+        realbytes = trail + 1
+        utf8 = [lead + (x >> (6 * trail))]
+        shift = 6 * (trail - 1)
+        while shift >= 0:
+            utf8.append(0x80 + (0x3F & (x >> shift)))
+            shift = shift - 6
+
+    if bytes != [] and len(bytes) > realbytes:
+        shortest = "".join([" %02X" % b for b in utf8])
+        errors = errors + " (overlong form of" + shortest + ")"
+        utf8 = bytes
+    if 0xD800 <= x <= 0xDFFF:
+        errors = errors + " (surrogate)"
+    if 0xFFFE <= x <= 0xFFFF:
+        errors = errors + " (invalid char)"
+
+    output(x, utf8, errors)
+
+# Decode a stream of UTF-8 bytes. 'next' is a callable returning the
+# next byte value as an integer, or None at end of input. Each
+# complete sequence is passed to process_ucs(); stray continuation
+# bytes, the bytes FE and FF, truncated sequences and values that are
+# not byte values at all are reported through output() with code
+# point -1. A byte that cuts a sequence short is itself re-examined
+# as the possible start of the next sequence.
+def process_utf8(next):
+    # (highest lead byte, marker bits to strip, continuation bytes)
+    leads = ((0xDF, 0xC0, 1), (0xEF, 0xE0, 2), (0xF7, 0xF0, 3),
+             (0xFB, 0xF8, 4), (0xFD, 0xFC, 5))
+    c = next()
+    while c is not None:
+        char = [c]
+        if c < 0 or c > 0xFF:
+            # Hex arguments such as "100" are not bytes; without this
+            # check they matched no lead-byte range at all.
+            output(-1, char, " (value out of byte range)")
+            c = next()
+        elif c < 0x80:
+            process_ucs(c) # single-byte char
+            c = next()
+        elif c == 0xfe or c == 0xff:
+            output(-1, char, " (invalid UTF-8 byte)")
+            c = next()
+        elif c <= 0xbf:
+            output(-1, char, " (unexpected continuation byte)")
+            c = next()
+        else:
+            for top, marker, cbytes in leads:
+                if c <= top:
+                    break
+            acc = c & ~marker
+            cutshort = 0
+            while cbytes > 0:
+                c = next()
+                if c is None or c < 0x80 or c > 0xBF:
+                    cutshort = 1
+                    break
+                char.append(c)
+                acc = (acc << 6) + (c & 0x3F)
+                cbytes = cbytes - 1
+            if not cutshort:
+                c = next()
+            if cbytes > 0:
+                output(-1, char, " (incomplete sequence)")
+            else:
+                process_ucs(acc, char)
+
+# Process a list of command-line words. A word starting with "U" or
+# "u" (as in U+20AC or U-10FFFF) is a code point in hex and goes
+# straight to process_ucs(). Any other word is a hex byte value; runs
+# of consecutive bytes are collected and decoded together by
+# process_utf8().
+def do(args):
+    # Wrap a list in a callable that hands back one element per call
+    # and then None for ever after.
+    def stepper(values):
+        remaining = values[:]
+        remaining.reverse()
+        def step():
+            if remaining:
+                return remaining.pop()
+            return None
+        return step
+
+    pending = []
+    for arg in args:
+        if arg[0].upper() == "U":
+            # flush the bytes collected so far to keep output in order
+            if len(pending) > 0:
+                process_utf8(stepper(pending))
+                pending = []
+            assert arg[1] == "+" or arg[1] == "-"
+            process_ucs(string.atoi(arg[2:], 16))
+        else:
+            pending.append(string.atoi(arg, 16))
+
+    if len(pending) > 0:
+        process_utf8(stepper(pending))
+
+args = sys.argv[1:]
+output_analysis = 1
+han_translations = 0
+
+# With no arguments, or with --help or --help-admin as the only
+# argument, print the help text and stop. The admin variant also lists
+# the database-building commands.
+if args in ([], ["--help"], ["--help-admin"]):
+    helptext = [
+        "Usage: cvt-utf8 [flags] <hex UTF-8 bytes and/or U+codepoints>",
+        " e.g. cvt-utf8 e2 82 ac",
+        " or cvt-utf8 U+20ac",
+        " or cvt-utf8 U-10ffff",
+        "",
+        "Flags: -o or --output just output well-formed UTF-8 instead of",
+        " an analysis of the input data",
+        " -h or --han also give Han definitions from unihan db",
+        "",
+        "Also: cvt-utf8 --test run Markus Kuhn's decoder stress tests",
+        " cvt-utf8 --input (or -i)",
+        " read, analyse and decode UTF-8 from stdin",
+    ]
+    if args == ["--help-admin"]:
+        helptext = helptext + [
+            " cvt-utf8 --help display user help text",
+            " cvt-utf8 --help-admin display admin help text (this one)",
+            " cvt-utf8 --build <infile> <outfile>",
+            " convert UnicodeData.txt to unicode db",
+            " cvt-utf8 --build-unihan <infile> <outfile>",
+            " convert Unihan.txt to unihan db",
+            " cvt-utf8 --fetch-build <outfile>",
+            " " + "build unicode db by download from unicode.org",
+            " cvt-utf8 --fetch-build-unihan <outfile>",
+            " " + "build Unihan db by download from unicode.org",
+        ]
+    else:
+        helptext = helptext + [
+            " cvt-utf8 --help display this help text",
+            " cvt-utf8 --help-admin display admin help text",
+        ]
+    for line in helptext:
+        print line
+    sys.exit(0)
+
+# Strip the leading flags. They are accepted in either order, and a
+# command line consisting of nothing but flags is reported as an error
+# rather than failing on an empty argument list.
+while args and args[0] in ("-o", "--output", "-h", "--han"):
+    if args[0] in ("-o", "--output"):
+        output_analysis = 0
+    else:
+        han_translations = 1
+    args = args[1:]
+
+if args == []:
+    sys.stderr.write("cvt-utf8: expected some input after the flags;"
+                     " try --help\n")
+    sys.exit(1)
+
+# --build <infile> <outfile> and --fetch-build <outfile>: create the
+# character database from UnicodeData.txt, read either from a local
+# file or from unicode.org. Every input line is stored whole, keyed
+# by its first ';'-separated field.
+if args[0] in ("--build", "--fetch-build"):
+    if args[0] == "--build":
+        if len(args) != 3:
+            print "cvt-utf8: --build expects two filename arguments"
+            sys.exit(1)
+        infile = open(args[1], "r")
+        outfile = args[2]
+    else:
+        if len(args) != 2:
+            print "cvt-utf8: --fetch-build expects one filename argument"
+            sys.exit(1)
+        import urllib
+        infile = urllib.urlopen("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
+        outfile = args[1]
+    # Now build the database.
+    if outfile[-3:] == ".db":
+        print "cvt-utf8: warning: you should not append .db to db name"
+
+    db = anydbm.open(outfile, "n")
+    for record in iter(infile.readline, ""):
+        db[record.split(";")[0]] = record
+    db.close()
+    sys.exit(0)
+
+# --build-unihan <infile> <outfile> and --fetch-build-unihan <outfile>:
+# create the Han definitions database from Unihan.txt. The input may
+# be the plain text file or a zip file containing it. Each kDefinition
+# line "U+XXXX<tab>kDefinition<tab>text" is stored as db["XXXX"] = text.
+if args[0] == "--build-unihan" or args[0] == "--fetch-build-unihan":
+    if args[0] == "--build-unihan":
+        if len(args) != 3:
+            print "cvt-utf8: --build-unihan expects two filename arguments"
+            sys.exit(1)
+        # Binary mode, because the file may turn out to be a zip file;
+        # line endings are stripped by hand below.
+        infile = open(args[1], "rb")
+        s = infile.read(1)
+        # Unihan.txt starts with a hash. If this file starts with a
+        # P, we assume it's a zip file ("PK").
+        if s == "P":
+            infile = zip_untangler(infile, s)
+            s = ""
+        outfile = args[2]
+    else:
+        if len(args) != 2:
+            print "cvt-utf8: --fetch-build-unihan expects one filename argument"
+            sys.exit(1)
+        import urllib
+        infile = urllib.urlopen("ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip")
+        # We know this one is zipped.
+        infile = zip_untangler(infile, "")
+        outfile = args[1]
+        s = ""
+    # Now build the database.
+    if outfile[-3:] == ".db":
+        print "cvt-utf8: warning: you should not append .db to db name"
+
+    db = anydbm.open(outfile, "n")
+    while 1:
+        # On the first pass s may still hold the byte that was read
+        # to sniff the file type.
+        s = s + infile.readline()
+        if s == "": break
+        while s[-1:] == "\r" or s[-1:] == "\n":
+            s = s[:-1]
+        sa = s.split("\t")
+        if len(sa) == 3 and sa[1] == "kDefinition" and sa[0][:2] == "U+":
+            db[sa[0][2:]] = sa[2]
+        s = ""
+    db.close()
+    sys.exit(0)
+
+# Places to look for the character database, in order of preference.
+locations = [
+    "/usr/share/unicode/unicode",
+    "/usr/lib/unicode/unicode",
+    "/usr/local/share/unicode/unicode",
+    "/usr/local/lib/unicode/unicode",
+]
+# The per-user locations only exist if HOME is set.
+home = os.environ.get("HOME")
+if home is not None:
+    locations.append(home + "/share/unicode/unicode")
+    locations.append(home + "/lib/unicode/unicode")
+
+# Open the first database that can be opened; db stays None if none
+# can, in which case no character names are shown.
+db = None
+for loc in locations:
+    try:
+        db = anydbm.open(loc, "r")
+    except IOError:
+        db = None
+    except anydbm.error:
+        db = None
+    if db != None:
+        break
+if han_translations:
+    # The Unihan database lives beside the location tried last.
+    i = loc.rfind("/")
+    assert i >= 0
+    hanloc = loc[:i+1] + "unihan"
+    handb = anydbm.open(hanloc, "r")
+    # this has been explicitly required, so we don't squelch exceptions
+
+# Choose what to analyse: the built-in decoder stress tests (--test),
+# bytes read from standard input (--input or -i), or the hex words
+# given on the command line.
+if args[0] == "--test":
+    # Each entry is one call to do(): a space-separated list of hex
+    # byte values.
+    stress_tests = [
+        "CE BA E1 BD B9 CF 83 CE BC CE B5",
+        "00",
+        "C2 80",
+        "E0 A0 80",
+        "F0 90 80 80",
+        "F8 88 80 80 80",
+        "FC 84 80 80 80 80",
+        "7F",
+        "DF BF",
+        "EF BF BF",
+        "F7 BF BF BF",
+        "FB BF BF BF BF",
+        "FD BF BF BF BF BF",
+        "ED 9F BF",
+        "EE 80 80",
+        "EF BF BD",
+        "F4 8F BF BF",
+        "F4 90 80 80",
+        "80",
+        "BF",
+        "80 BF",
+        "80 BF 80",
+        "80 BF 80 BF",
+        "80 BF 80 BF 80",
+        "80 BF 80 BF 80 BF",
+        "80 BF 80 BF 80 BF 80",
+        "80 81 82 83 84 85 86 87 "
+        "88 89 8A 8B 8C 8D 8E 8F "
+        "90 91 92 93 94 95 96 97 "
+        "98 99 9A 9B 9C 9D 9E 9F "
+        "A0 A1 A2 A3 A4 A5 A6 A7 "
+        "A8 A9 AA AB AC AD AE AF "
+        "B0 B1 B2 B3 B4 B5 B6 B7 "
+        "B8 B9 BA BB BC BD BE BF",
+        "C0 20 C1 20 C2 20 C3 20 "
+        "C4 20 C5 20 C6 20 C7 20 "
+        "C8 20 C9 20 CA 20 CB 20 "
+        "CC 20 CD 20 CE 20 CF 20 "
+        "D0 20 D1 20 D2 20 D3 20 "
+        "D4 20 D5 20 D6 20 D7 20 "
+        "D8 20 D9 20 DA 20 DB 20 "
+        "DC 20 DD 20 DE 20 DF 20",
+        "E0 20 E1 20 E2 20 E3 20 "
+        "E4 20 E5 20 E6 20 E7 20 "
+        "E8 20 E9 20 EA 20 EB 20 "
+        "EC 20 ED 20 EE 20 EF 20",
+        "F0 20 F1 20 F2 20 F3 20 "
+        "F4 20 F5 20 F6 20 F7 20",
+        "F8 20 F9 20 FA 20 FB 20",
+        "FC 20 FD 20",
+        "C0",
+        "E0 80",
+        "F0 80 80",
+        "F8 80 80 80",
+        "FC 80 80 80 80",
+        "DF",
+        "EF BF",
+        "F7 BF BF",
+        "FB BF BF BF",
+        "FD BF BF BF BF",
+        "C0 E0 80 F0 80 80 F8 80 "
+        "80 80 FC 80 80 80 80 "
+        "DF EF BF F7 BF BF FB "
+        "BF BF BF FD BF BF BF BF",
+        "FE",
+        "FF",
+        "FE FE FF FF",
+        "C0 AF",
+        "E0 80 AF",
+        "F0 80 80 AF",
+        "F8 80 80 80 AF",
+        "FC 80 80 80 80 AF",
+        "C1 BF",
+        "E0 9F BF",
+        "F0 8F BF BF",
+        "F8 87 BF BF BF",
+        "FC 83 BF BF BF BF",
+        "C0 80",
+        "E0 80 80",
+        "F0 80 80 80",
+        "F8 80 80 80 80",
+        "FC 80 80 80 80 80",
+        "ED A0 80",
+        "ED AD BF",
+        "ED AE 80",
+        "ED AF BF",
+        "ED B0 80",
+        "ED BE 80",
+        "ED BF BF",
+        "ED A0 80 ED B0 80",
+        "ED A0 80 ED BF BF",
+        "ED AD BF ED B0 80",
+        "ED AD BF ED BF BF",
+        "ED AE 80 ED B0 80",
+        "ED AE 80 ED BF BF",
+        "ED AF BF ED B0 80",
+        "ED AF BF ED BF 8F",
+        "EF BF BE",
+        "EF BF BF",
+    ]
+    for vector in stress_tests:
+        do(vector.split())
+elif args[0] == "--input" or args[0] == "-i":
+    # Feed standard input to the decoder one byte at a time.
+    def getchar():
+        s = sys.stdin.read(1)
+        if s == "":
+            return None
+        return ord(s) & 0xFF # ensure it isn't negative
+    process_utf8(getchar)
+else:
+    do(args)
--- /dev/null
+\cfg{man-identity}{cvt-utf8}{1}{2004-03-24}{Simon Tatham}{Simon Tatham}
+\cfg{man-mindepth}{1}
+
+\C{cvt-utf8-manpage} Man page for \cw{cvt-utf8}
+
+\H{cvt-utf8-manpage-name} NAME
+
+\cw{cvt-utf8} - convert between UTF-8 and Unicode, and analyse Unicode
+
+\H{cvt-utf8-manpage-synopsis} SYNOPSIS
+
+\c cvt-utf8 [flags] [hex UTF-8 bytes and/or U+codepoints]
+\e bbbbbbbb  iiiii   iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii
+
+\H{cvt-utf8-manpage-description} DESCRIPTION
+
+\cw{cvt-utf8} is a tool for manipulating and analysing UTF-8 and
+Unicode data. Its functions include:
+
+\b Given a sequence of Unicode code points, convert them to the
+corresponding sequence of bytes in the UTF-8 encoding.
+
+\b Given a sequence of UTF-8 bytes, convert them back into Unicode
+code points.
+
+\b Given any combination of the above inputs, look up each Unicode
+code point in the Unicode character database and identify it.
+
+\b Look up Unified Han characters in the \q{Unihan} database and
+provide their translation text.
+
+By default, \cw{cvt-utf8} expects to receive hex numbers (either
+UTF-8 bytes or Unicode code points) on the command line, and it will
+print out a verbose analysis of the input data. If you need it to
+read UTF-8 from standard input or to write pure UTF-8 to standard
+output, you can do so using command-line options.
+
+\H{cvt-utf8-manpage-options} OPTIONS
+
+\dt \cw{-i}
+
+\dd Read UTF-8 data from standard input and analyse that, instead of
+expecting hex numbers on the command line.
+
+\dt \cw{-o}
+
+\dd Write well-formed UTF-8 to standard output, instead of writing a
+long analysis of the input data.
+
+\dt \cw{-h}
+
+\dd Look up each code point in the Unihan database as well as the
+main Unicode character database.
+
+\H{cvt-utf8-manpage-examples} EXAMPLES
+
+In \cw{cvt-utf8}'s native mode, it simply analyses input Unicode or
+UTF-8 data. For example, you can give a list of Unicode code
+points...
+
+\c $ cvt-utf8 U+20ac U+31 U+30
+\e   bbbbbbbbbbbbbbbbbbbbbbbbb
+\c U-000020AC E2 82 AC EURO SIGN
+\c U-00000031 31 DIGIT ONE
+\c U-00000030 30 DIGIT ZERO
+
+... and \cw{cvt-utf8} gives you the UTF-8 encodings plus the
+character definitions.
+
+Alternatively, you can supply a list of UTF-8 bytes...
+
+\c $ cvt-utf8 D0 A0 D1 83 D1 81 D1 81 D0 BA D0 B8 D0 B9
+\e bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+\c U-00000420 D0 A0 CYRILLIC CAPITAL LETTER ER
+\c U-00000443 D1 83 CYRILLIC SMALL LETTER U
+\c U-00000441 D1 81 CYRILLIC SMALL LETTER ES
+\c U-00000441 D1 81 CYRILLIC SMALL LETTER ES
+\c U-0000043A D0 BA CYRILLIC SMALL LETTER KA
+\c U-00000438 D0 B8 CYRILLIC SMALL LETTER I
+\c U-00000439 D0 B9 CYRILLIC SMALL LETTER SHORT I
+
+... and you get back the same output format, including the Unicode
+code points.
+
+If you supply malformed data, \cw{cvt-utf8} will break it down for
+you and identify the malformed pieces and any correctly formed
+characters:
+
+\c $ cvt-utf8 A9 FE 45 C2 80 90 0A
+\e   bbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+\c A9 (unexpected continuation byte)
+\c FE (invalid UTF-8 byte)
+\c U-00000045 45 LATIN CAPITAL LETTER E
+\c U-00000080 C2 80 <control>
+\c 90 (unexpected continuation byte)
+\c U-0000000A 0A <control>
+
+If you need the UTF-8 encoding of a particular character, you can
+use the \cw{-o} option to cause the UTF-8 to be written to standard
+output:
+
+\c $ cvt-utf8 -o U+20AC >> my-utf8-file.txt
+\e bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+
+If you have UTF-8 data in a file or output from another program, you
+can use the \cw{-i} option to have \cw{cvt-utf8} analyse it. This
+works particularly well if you also have my \cw{xcopy} program,
+which can be told to extract UTF-8 data from the X selection and
+write it to its standard output. With these two programs working
+together, if you ever have trouble identifying some text in a
+UTF-8-supporting web browser such as Mozilla, you can simply select
+the text in question, switch to a terminal window, and type
+
+\c $ xcopy -u -r | cvt-utf8 -i
+\e   bbbbbbbbbbbbbbbbbbbbbbbbb
+
+If the text is in Chinese, you can get at least a general idea of
+its meaning by using the \cw{-h} option to print the meaning of each
+ideograph from the Unihan database. For example, if you pass in the
+Chinese text meaning \q{Traditional Chinese}:
+
+\c $ cvt-utf8 -h U+7E41 U+9AD4 U+4E2D U+6587
+\e bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+\c U-00007E41 E7 B9 81 <han> complicated, complex, difficult
+\c U-00009AD4 E9 AB 94 <han> body; group, class, body, unit
+\c U-00004E2D E4 B8 AD <han> central; center, middle; in the
+\c midst of; hit (target); attain
+\c U-00006587 E6 96 87 <han> literature, culture, writing
+
+\H{cvt-utf8-manpage-bugs} BUGS
+
+Command-line option processing is very basic. In particular, \cw{-h}
+must come before \cw{-i} or it will not be recognised.
--- /dev/null
+X11LIB=-L/usr/X11R6/lib -lX11
+
+all: xcopy.1 xcopy
+
+xcopy: xcopy.c
+ $(CC) $(CFLAGS) -o $@ $< $(X11LIB)
+
+%.1: %.but
+ halibut --man=$@ $<
+
+clean:
+ rm -f *.1 xcopy
--- /dev/null
+\cfg{man-identity}{xcopy}{1}{2004-08-02}{Simon Tatham}{Simon Tatham}
+\cfg{man-mindepth}{1}
+
+\C{xcopy-manpage} Man page for \cw{xcopy}
+
+\H{xcopy-manpage-name} NAME
+
+\cw{xcopy} - read and write text to/from an X selection from the
+command line
+
+\H{xcopy-manpage-synopsis} SYNOPSIS
+
+\c xcopy [ -r ] [ -u | -c ] [ -C ]
+\e bbbbb   bb     bb   bb     bb
+
+\H{xcopy-manpage-description} DESCRIPTION
+
+\cw{xcopy} is a command-line utility for manipulating the X selection.
+
+It has two basic modes. In read mode (\cw{xcopy -r}), it connects to
+your X server, retrieves the contents of the selection as plain
+text, and writes it on standard output. You would then typically
+redirect its output into a file, or pipe it into some other program.
+
+In write mode (just \cw{xcopy}, if \cw{-r} is not specified), it
+will read data from standard input, then connect to your X server
+and place that data in the selection as plain text. So you can pipe
+data into \cw{xcopy}, move to another application, and press Paste.
+
+The X selection model requires the selection-owning client to remain
+connected to the server and hand out its data on request. Therefore,
+\cw{xcopy} in write mode forks off a background process which does
+this. The background process terminates once it is no longer the
+selection owner (i.e. as soon as you select data in another
+application), or if your X session finishes. Normally you can ignore
+its presence, although it might become important to be aware of it
+if (for example) the \cw{xcopy} background process were to be the
+last X client still connected through an SSH tunnel.
+
+\cw{xcopy} currently only handles text data. However, it is capable
+of handling it in the form of plain text, UTF-8, or compound
+(multiple-character-set) text. Use the \cw{-u}, \cw{-c} and \cw{-C}
+options to control this aspect of its behaviour.
+
+\H{xcopy-manpage-options} OPTIONS
+
+By default (if \cw{-r} is not supplied), \cw{xcopy} operates in
+write mode.
+
+\dt \cw{-r}
+
+\dd Places \cw{xcopy} into read mode.
+
+By default (if neither \cw{-c} nor \cw{-u} is supplied), \cw{xcopy}
+reads and writes the selection using the type \cw{STRING}, which
+means that the input or output data is expected to be encoded in
+ISO-8859-1.
+
+\dt \cw{-u}
+
+\dd In read mode, causes \cw{xcopy} to request the selection using
+the type \cw{UTF8_STRING}, which typically means that the returned
+data will be encoded as UTF-8. In write mode, causes \cw{xcopy} to
+\e{give out} the selection as type \cw{UTF8_STRING}, meaning that
+the data piped in to it is expected to be encoded as UTF-8.
+
+\dt \cw{-c}
+
+\dd Similar to \cw{-u}, but uses the type \cw{COMPOUND_TEXT} rather
+than \cw{UTF8_STRING}. \cw{COMPOUND_TEXT} is a complex
+multi-character-set encoding similar to ISO 2022, and is unlikely to
+be a very useful form in which to pass data to or from non-X
+programs. However, it might occasionally be useful to retrieve a
+compound text selection using \cw{xcopy -r -c}, and later on return
+it to the X selection using \cw{xcopy -c} so it can be pasted back
+into a different application.
+
+In write mode, if \cw{xcopy} is operating in \cw{STRING} mode and a
+pasting application requests the selection as \cw{COMPOUND_TEXT},
+\cw{xcopy} will convert the data automatically. This is normally
+what you want.
+
+\dt \cw{-C}
+
+\dd Suppresses conversion to compound text in write mode. This is
+occasionally useful if you are pasting control characters, since the
+compound text specification forbids any control characters and the
+Xlib conversion functions honour this. If you are (for example)
+trying to paste a sequence of editor commands into a terminal
+window, you might well want to paste a string full of escape
+sequences and control characters, in which case you may need to use
+\cw{-C} to disable conversion to compound text.
+
+\H{xcopy-manpage-bugs} BUGS
+
+Occasionally \cw{xcopy -r} completely fails to notice selection data
+owned by another process. I have not yet reproduced this reliably;
+if anyone can, some work with \cw{xmon}(1) would be much
+appreciated...
+
+Automatic conversion between compound text and UTF-8 is not
+currently supported. There are Xlib functions to do it, although
+they don't appear to work very well (missing out many characters
+which they could have converted).
--- /dev/null
+/*
+ * xcopy: quickly pipe text data into, or out of, the primary X
+ * selection
+ */
+
+#include <malloc.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <math.h>
+#include <errno.h>
+#include <assert.h>
+
+#include <X11/X.h>
+#include <X11/Xlib.h>
+#include <X11/Xutil.h>
+#include <X11/Xatom.h>
+
+int init_X(void);
+void run_X(void);
+void done_X(void);
+void full_redraw(void);
+void do_paste(Window window, Atom property, int Delete);
+
+char *pname; /* program name */
+
+void error (char *fmt, ...);
+
+/* set from command-line parameters */
+char *display = NULL;
+enum { STRING, CTEXT, UTF8 } mode = STRING;
+
+/* selection data */
+char *seltext;
+int sellen, selsize;
+#define SELDELTA 16384
+
+/* functional parameters */
+int reading; /* read instead of writing? */
+int convert_to_ctext = True; /* Xmb convert to compound text? */
+
+/*
+ * Program entry point.
+ *
+ * Parses the command line (-display/-disp NAME, -r, -u, -c, -C). In
+ * write mode (no -r) all of standard input is read into seltext
+ * before the X connection is set up, and the process then forks: the
+ * parent exits at once and the child, detached from the standard
+ * streams, goes on to run the event loop. In read mode the event
+ * loop is run only if init_X() asks for it.
+ */
+int main(int ac, char **av) {
+    int n;
+    int eventloop;
+    int argi;
+
+    pname = av[0];
+
+    /* parse the command line arguments */
+    for (argi = 1; argi < ac; argi++) {
+        char *opt = av[argi];
+
+        if (strcmp(opt, "-display") == 0 || strcmp(opt, "-disp") == 0) {
+            /* the vector is NULL-terminated, so this spots a missing value */
+            if (av[argi + 1] == NULL)
+                error ("option `%s' expects a parameter", opt);
+            display = av[++argi];
+        } else if (strcmp(opt, "-r") == 0) {
+            reading = True;
+        } else if (strcmp(opt, "-u") == 0) {
+            mode = UTF8;
+        } else if (strcmp(opt, "-c") == 0) {
+            mode = CTEXT;
+        } else if (strcmp(opt, "-C") == 0) {
+            convert_to_ctext = False;
+        } else if (opt[0] == '-') {
+            error ("unrecognised option `%s'", opt);
+        } else {
+            error ("no parameters required");
+        }
+    }
+
+    if (!reading) {
+        /* Slurp standard input, growing the buffer SELDELTA at a time. */
+        sellen = 0;
+        selsize = SELDELTA;
+        seltext = malloc(selsize);
+        if (!seltext)
+            error ("out of memory");
+        for (;;) {
+            n = fread(seltext + sellen, 1, selsize - sellen, stdin);
+            sellen += n;
+            if (sellen >= selsize) {
+                selsize += SELDELTA;
+                seltext = realloc(seltext, selsize);
+                if (!seltext)
+                    error ("out of memory");
+            }
+            if (n <= 0)
+                break;
+        }
+        /* make sure there is room for the terminating NUL */
+        if (sellen == selsize) {
+            selsize += SELDELTA;
+            seltext = realloc(seltext, selsize);
+            if (!seltext)
+                error ("out of memory");
+        }
+        seltext[sellen] = '\0';
+    }
+
+    eventloop = init_X();
+    if (!reading) {
+        /*
+         * If we are writing the selection, we must go into the
+         * background now.
+         */
+        int pid = fork();
+
+        if (pid < 0)
+            error("unable to fork: %s", strerror(errno));
+        if (pid > 0)
+            return 0;                  /* we are the parent; just exit */
+
+        /* we are the child */
+        close(0);
+        close(1);
+        close(2);
+        chdir("/");
+    }
+    if (eventloop)
+        run_X();
+    done_X();
+    return 0;
+}
+
+/* handle errors */
+
+/*
+ * Report a fatal error and exit with status 1.
+ *
+ * done_X() is called first, then "<program name>: <message>" is
+ * written to stderr, where the message is fmt expanded printf-style
+ * with the remaining arguments. The message is formatted straight
+ * onto the stream, so its length is not limited by any buffer (it
+ * can include arbitrary command-line text).
+ */
+void error (char *fmt, ...) {
+    va_list ap;
+
+    done_X();
+    fprintf (stderr, "%s: ", pname);
+    va_start (ap, fmt);
+    vfprintf (stderr, fmt, ap);
+    va_end (ap);
+    fputc ('\n', stderr);
+    exit (1);
+}
+
+/* begin the X interface */
+
+/* Resource name and class installed on our window by init_X(). */
+char *lcasename = "xcopy";
+char *ucasename = "XCopy";
+
+/* Display connection and window; both are set up in init_X(). */
+Display *disp = NULL;
+Window ourwin = None;
+/* Atoms interned in init_X() and compared against in run_X(). */
+Atom compound_text_atom, targets_atom;
+int screen, wwidth, wheight;
+
+/*
+ * Atom naming the type of the selection data. Starts as STRING;
+ * init_X() replaces it with UTF8_STRING or COMPOUND_TEXT according
+ * to `mode'.
+ */
+Atom strtype = XA_STRING;
+
+/*
+ * Open the X display, intern the atoms we need, create our window
+ * (it is not mapped here) and start the selection operation.
+ *
+ * Any failure is reported through error(), which does not return.
+ *
+ * Returns TRUE if we need to enter an event loop, FALSE otherwise.
+ */
+int init_X(void) {
+    /* The eight cut buffers that must exist before XRotateBuffers. */
+    static const Atom cut_buffers[] = {
+        XA_CUT_BUFFER0, XA_CUT_BUFFER1, XA_CUT_BUFFER2, XA_CUT_BUFFER3,
+        XA_CUT_BUFFER4, XA_CUT_BUFFER5, XA_CUT_BUFFER6, XA_CUT_BUFFER7
+    };
+    Window root;
+    XClassHint class_hints;
+    size_t i;
+
+    /* open the X display */
+    disp = XOpenDisplay (display);
+    if (!disp)
+        error ("unable to open display");
+
+    /* pick the atom matching the data type requested on the command line */
+    if (mode == UTF8) {
+        strtype = XInternAtom(disp, "UTF8_STRING", False);
+        if (!strtype)
+            error ("unable to get UTF8_STRING property");
+    } else if (mode == CTEXT) {
+        strtype = XInternAtom(disp, "COMPOUND_TEXT", False);
+        if (!strtype)
+            error ("unable to get COMPOUND_TEXT property");
+    }
+    targets_atom = XInternAtom(disp, "TARGETS", False);
+    if (!targets_atom)
+        error ("unable to get TARGETS property");
+
+    /* get the screen and root-window */
+    screen = DefaultScreen (disp);
+    root = RootWindow (disp, screen);
+
+    /* actually create the window; position and size don't really matter */
+    ourwin = XCreateSimpleWindow (disp, root, 0, 0, 10, 10, 0,
+                                  BlackPixel(disp, screen),
+                                  WhitePixel(disp, screen));
+
+    /* resource class name */
+    class_hints.res_name = lcasename;
+    class_hints.res_class = ucasename;
+    XSetClassHint (disp, ourwin, &class_hints);
+
+    /* do selection fiddling */
+    if (reading) {
+        /*
+         * We are reading the selection. If nobody owns PRIMARY we
+         * fall back to CUT_BUFFER0 on the root window, print it
+         * immediately and need no event loop. Otherwise we ask the
+         * owner to convert the selection into a property on our
+         * window; the answer arrives as a SelectionNotify event.
+         */
+        if (XGetSelectionOwner(disp, XA_PRIMARY) == None) {
+            /* No primary selection, so use the cut buffer. */
+            do_paste(DefaultRootWindow(disp), XA_CUT_BUFFER0, False);
+            return False;
+        } else {
+            Atom sel_property = XInternAtom(disp, "VT_SELECTION", False);
+            XConvertSelection(disp, XA_PRIMARY, strtype,
+                              sel_property, ourwin, CurrentTime);
+            return True;
+        }
+    } else {
+        /*
+         * We are writing to the selection, so we establish
+         * ourselves as selection owner. Also place the data in
+         * CUT_BUFFER0, if it isn't of an exotic type (cut buffers
+         * can only take ordinary string data, it turns out).
+         */
+        XSetSelectionOwner (disp, XA_PRIMARY, ourwin, CurrentTime);
+        if (XGetSelectionOwner (disp, XA_PRIMARY) != ourwin)
+            error ("unable to obtain primary X selection");
+        compound_text_atom = XInternAtom(disp, "COMPOUND_TEXT", False);
+        if (strtype == XA_STRING) {
+            /*
+             * ICCCM-required cut buffer initialisation: appending
+             * zero bytes creates each buffer if it does not exist
+             * yet, without disturbing existing contents.
+             */
+            for (i = 0; i < sizeof cut_buffers / sizeof cut_buffers[0]; i++)
+                XChangeProperty(disp, root, cut_buffers[i],
+                                XA_STRING, 8, PropModeAppend,
+                                (unsigned char *)"", 0);
+            /*
+             * Rotate the cut buffers and add our text in CUT_BUFFER0.
+             */
+            XRotateBuffers(disp, 1);
+            XStoreBytes(disp, seltext, sellen);
+        }
+        return True;
+    }
+}
+
+/*
+ * Main event loop.
+ *
+ * Read mode: wait for the SelectionNotify that answers the
+ * XConvertSelection request, print the delivered property (if the
+ * owner did not refuse) and return.
+ *
+ * Write mode: answer SelectionRequest events until another client
+ * takes the selection away (SelectionClear), then return. Supported
+ * targets are our own data type (strtype), COMPOUND_TEXT (unless
+ * disabled with -C) and TARGETS; anything else is refused by
+ * replying with property None.
+ */
+void run_X(void) {
+    XEvent ev, e2;
+
+    while (1) {
+        XNextEvent (disp, &ev);
+        if (reading) {
+            switch (ev.type) {
+              case SelectionNotify:
+                if (ev.xselection.property != None)
+                    do_paste(ev.xselection.requestor,
+                             ev.xselection.property, True);
+                return;
+            }
+        } else {
+            switch (ev.type) {
+              case SelectionClear:
+                /* Selection has been cleared by another app. */
+                return;
+              case SelectionRequest:
+                e2.xselection.type = SelectionNotify;
+                e2.xselection.requestor = ev.xselectionrequest.requestor;
+                e2.xselection.selection = ev.xselectionrequest.selection;
+                e2.xselection.target = ev.xselectionrequest.target;
+                e2.xselection.time = ev.xselectionrequest.time;
+                /*
+                 * Assume refusal; every branch that actually stores
+                 * data overrides this with the requested property.
+                 */
+                e2.xselection.property = None;
+                if (ev.xselectionrequest.target == strtype) {
+                    XChangeProperty (disp, ev.xselectionrequest.requestor,
+                                     ev.xselectionrequest.property, strtype,
+                                     8, PropModeReplace,
+                                     (unsigned char *)seltext, sellen);
+                    e2.xselection.property = ev.xselectionrequest.property;
+                } else if (ev.xselectionrequest.target == compound_text_atom &&
+                           convert_to_ctext) {
+                    XTextProperty tp;
+                    /*
+                     * A negative status means the conversion failed
+                     * and tp was not filled in; refuse in that case.
+                     */
+                    if (XmbTextListToTextProperty (disp, &seltext, 1,
+                                                   XCompoundTextStyle,
+                                                   &tp) >= 0) {
+                        XChangeProperty (disp, ev.xselectionrequest.requestor,
+                                         ev.xselectionrequest.property,
+                                         ev.xselectionrequest.target,
+                                         tp.format, PropModeReplace,
+                                         tp.value, tp.nitems);
+                        /* tp.value was allocated by Xlib for us. */
+                        XFree (tp.value);
+                        e2.xselection.property =
+                            ev.xselectionrequest.property;
+                    }
+                } else if (ev.xselectionrequest.target == targets_atom) {
+                    Atom targets[2];
+                    int len = 0;
+                    targets[len++] = strtype;
+                    if (strtype != compound_text_atom && convert_to_ctext)
+                        targets[len++] = compound_text_atom;
+                    /* ICCCM: a TARGETS reply is a list of type ATOM. */
+                    XChangeProperty (disp, ev.xselectionrequest.requestor,
+                                     ev.xselectionrequest.property,
+                                     XA_ATOM, 32, PropModeReplace,
+                                     (unsigned char *)targets, len);
+                    e2.xselection.property = ev.xselectionrequest.property;
+                }
+                XSendEvent (disp, ev.xselectionrequest.requestor, False, 0, &e2);
+                break;
+            }
+        }
+    }
+}
+
+/*
+ * Release the X resources acquired by init_X(): destroy our window
+ * if one was created and close the display connection if one was
+ * opened. The handles are reset afterwards, so calling this before
+ * init_X() or more than once is harmless.
+ */
+void done_X(void) {
+    if (disp) {
+        if (ourwin != None) {
+            XDestroyWindow (disp, ourwin);
+            ourwin = None;
+        }
+        XCloseDisplay (disp);
+        disp = NULL;
+    }
+}
+
+/*
+ * Copy the contents of a window property to standard output.
+ *
+ *   window    window holding the property
+ *   property  atom naming the property to read
+ *   Delete    passed through to XGetWindowProperty as its `delete'
+ *             flag
+ *
+ * The property is fetched in chunks of at most SELDELTA 32-bit
+ * units. Data is only written if the property's type equals
+ * strtype; a property of any other type (or a missing one) produces
+ * no output.
+ */
+void do_paste(Window window, Atom property, int Delete) {
+    Atom actual_type;
+    int actual_format;
+    unsigned long nitems, bytes_after;
+    long nread;
+    unsigned char *data;
+
+    nread = 0;
+    while (1) {
+        data = NULL;
+        if (XGetWindowProperty(disp, window, property, nread / 4, SELDELTA,
+                               Delete, AnyPropertyType, &actual_type,
+                               &actual_format, &nitems, &bytes_after,
+                               &data) != Success)
+            break;
+
+        /*
+         * We expect all returned chunks of data to be multiples of
+         * 4 bytes (because we can only request the subsequent
+         * starting offset in 4-byte increments). Of course you can
+         * store an odd number of bytes in a selection, so this
+         * can't be the case every time XGetWindowProperty returns;
+         * but it should be the case every time it returns _and
+         * there is more data to come_.
+         *
+         * Hence, whenever XGetWindowProperty returns, we verify
+         * that the size of the data returned _last_ time was
+         * divisible by 4.
+         */
+        if (nitems > 0)
+            assert((nread & 3) == 0);
+
+        if (actual_type == strtype && nitems > 0) {
+            assert(actual_format == 8);
+            fwrite(data, 1, nitems, stdout);
+            nread += nitems;
+        }
+        if (data)
+            XFree(data);
+
+        /*
+         * Stop as soon as the server says nothing is left. Without
+         * the bytes_after test, a property that is not deleted on
+         * read and whose length is not a multiple of 4 would have
+         * its tail fetched a second time from the rounded-down
+         * offset.
+         */
+        if (actual_type != strtype || nitems == 0 || bytes_after == 0)
+            break;
+    }
+}