From 9acadc2b1377453e1c10614920bd390c52227e8a Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 20 Nov 2004 08:44:10 +0000 Subject: [PATCH] Move some of my more useful utilities out from my all-purpose `local' and `misc' directories into a `utils' area, where they might end up releasable. git-svn-id: svn://svn.tartarus.org/sgt/utils@4837 cda61777-01e9-0310-a592-d414129be87e --- base64/Makefile | 10 ++ base64/base64.but | 72 ++++++++ base64/base64.c | 263 +++++++++++++++++++++++++++++ cvt-utf8/Makefile | 7 + cvt-utf8/cvt-utf8 | 460 ++++++++++++++++++++++++++++++++++++++++++++++++++ cvt-utf8/cvt-utf8.but | 133 +++++++++++++++ xcopy/Makefile | 12 ++ xcopy/xcopy.but | 104 ++++++++++++ xcopy/xcopy.c | 359 +++++++++++++++++++++++++++++++++++++++ 9 files changed, 1420 insertions(+) create mode 100644 base64/Makefile create mode 100644 base64/base64.but create mode 100644 base64/base64.c create mode 100644 cvt-utf8/Makefile create mode 100755 cvt-utf8/cvt-utf8 create mode 100644 cvt-utf8/cvt-utf8.but create mode 100644 xcopy/Makefile create mode 100644 xcopy/xcopy.but create mode 100644 xcopy/xcopy.c diff --git a/base64/Makefile b/base64/Makefile new file mode 100644 index 0000000..44ec017 --- /dev/null +++ b/base64/Makefile @@ -0,0 +1,10 @@ +all: base64.1 base64 + +base64: base64.c + $(CC) $(CFLAGS) -o $@ $< + +%.1: %.but + halibut --man=$@ $< + +clean: + rm -f *.1 base64 diff --git a/base64/base64.but b/base64/base64.but new file mode 100644 index 0000000..0634d05 --- /dev/null +++ b/base64/base64.but @@ -0,0 +1,72 @@ +\cfg{man-identity}{base64}{1}{2004-08-02}{Simon Tatham}{Simon Tatham} +\cfg{man-mindepth}{1} + +\C{base64-manpage} Man page for \cw{base64} + +\H{base64-manpage-name} NAME + +\cw{base64} - stand-alone encoder and decoder for base64 + +\H{base64-manpage-synopsis} SYNOPSIS + +\c base64 [ -d ] [ filename ] +\e bbbbbb bb iiiiiiii +\c base64 -e [ -cwidth ] [ filename ] +\e bbbbbb bb bbiiiii iiiiiiii + +\H{base64-manpage-description} DESCRIPTION + +\cw{base64} is 
a command-line utility for encoding and decoding the +\q{base64} encoding. + +This encoding, defined in RFC 2045, is primarily used to encode +binary attachments in MIME e-mail, but is widely used in many other +applications as well. For example, the \q{Content-MD5} mail header +contains a small piece of base64; SSH private keys are generally +stored as base64-encoded blobs; and so on. + +Other utilities, such as \cw{munpack}, exist which will take an +entire MIME-encoded message, identify the base64-encoded subparts, +and decode them. However, these utilities will not help you if you +need to inspect a Content-MD5 header or an SSH private key. + +\cw{base64} is a very simple stand-alone encoder and decoder for the +base64 format \e{alone}. It does not try to understand MIME headers +or anything other than raw data. + +\H{base64-manpage-options} OPTIONS + +By default (if neither \cw{-d} nor \cw{-e} is supplied), \cw{base64} +operates in decode mode. + +\dt \cw{-d} + +\dd Places \cw{base64} into decode mode. In this mode, it will read +from standard input or the supplied file name, ignore all characters +that are not part of the base64 alphabet, decode the ones that are, +and output the decoded data on standard output. + +\dt \cw{-e} + +\dd Places \cw{base64} into encode mode. In this mode, it will read +binary data from standard input or the supplied file name, encode it +as base64, and output the encoded data on standard output. + +\dt \cw{-c}\e{width} + +\dd If \cw{base64} is operating in encode mode, this controls the +number of base64 characters output per line of the encoded file. +Normally base64-reading applications do not care about this, so the +default of 64 characters per line is perfectly adequate. + +\lcont{ + +The special value 0 will prevent \cw{base64} from ever writing a +line break in the middle of the data at all. + +The base64 encoding converts between a group of three plaintext +bytes and a group of four encoded bytes.
\cw{base64} does not +support breaking an encoded group across a line. Therefore, the +\e{width} parameter passed to \cw{-c} must be a multiple of 4. + +} diff --git a/base64/base64.c b/base64/base64.c new file mode 100644 index 0000000..178c51b --- /dev/null +++ b/base64/base64.c @@ -0,0 +1,263 @@ +#include +#include +#include + +#define isbase64(c) ( ((c) >= 'A' && (c) <= 'Z') || \ + ((c) >= 'a' && (c) <= 'z') || \ + ((c) >= '0' && (c) <= '9') || \ + (c) == '+' || (c) == '/' || (c) == '=' \ + ) + +int base64_decode_atom(char *atom, unsigned char *out) { /* Decode one 4-character atom into out[]; returns the number of bytes stored (1 to 3), or 0 if the atom is invalid. */ + int vals[4]; + int i, v, len; + unsigned word; + char c; + + for (i = 0; i < 4; i++) { + c = atom[i]; + if (c >= 'A' && c <= 'Z') + v = c - 'A'; + else if (c >= 'a' && c <= 'z') + v = c - 'a' + 26; + else if (c >= '0' && c <= '9') + v = c - '0' + 52; + else if (c == '+') + v = 62; + else if (c == '/') + v = 63; + else if (c == '=') + v = -1; /* padding is recorded as -1 */ + else + return 0; /* invalid atom */ + vals[i] = v; + } + + if (vals[0] == -1 || vals[1] == -1) /* padding is only accepted in the last two positions */ + return 0; + if (vals[2] == -1 && vals[3] != -1) + return 0; + + if (vals[3] != -1) + len = 3; + else if (vals[2] != -1) + len = 2; + else + len = 1; + + word = ((vals[0] << 18) | + (vals[1] << 12) | + ((vals[2] & 0x3F) << 6) | /* the mask keeps a -1 padding marker inside its own 6-bit field; those bits are never copied to out[] */ + (vals[3] & 0x3F)); + out[0] = (word >> 16) & 0xFF; + if (len > 1) + out[1] = (word >> 8) & 0xFF; + if (len > 2) + out[2] = word & 0xFF; + return len; +} + +void base64_encode_atom(unsigned char *data, int n, char *out) { /* Encode up to three bytes of data[] (n says how many are valid; data[0] is always read) as exactly four characters in out[], writing '=' where there is no input byte; no NUL is appended. */ + static const char base64_chars[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + + unsigned word; + + word = data[0] << 16; + if (n > 1) + word |= data[1] << 8; + if (n > 2) + word |= data[2]; + out[0] = base64_chars[(word >> 18) & 0x3F]; + out[1] = base64_chars[(word >> 12) & 0x3F]; + if (n > 1) + out[2] = base64_chars[(word >> 6) & 0x3F]; + else + out[2] = '='; + if (n > 2) + out[3] = base64_chars[word & 0x3F]; + else + out[3] = '='; +} + +const char usagemsg[] = + "usage: base64 [-d] [filename] decode from
a file or from stdin\n" + " or: base64 -e [-cNNN] [filename] encode from a file or from stdin\n" + " also: base64 --version report version number\n" + " and: base64 --help display this help text\n" + "where: -d decode mode (default)\n" + " -e encode mode\n" + " -cNNN set number of chars per line for encoded output\n" + ; + +void usage(void) { /* print the usage summary on standard output */ + fputs(usagemsg, stdout); +} + +void version(void) { /* print the revision number found after the ':' in SVN_REV, or "unknown version" when SVN_REV contains no ':' */ +#define SVN_REV "$Revision$" + char rev[sizeof(SVN_REV)]; + char *p, *q; + + strcpy(rev, SVN_REV); + + for (p = rev; *p && *p != ':'; p++); + if (*p) { + p++; + while (*p && isspace((unsigned char)*p)) p++; /* ctype functions require an unsigned char value */ + for (q = p; *q && *q != '$'; q++); + if (*q) *q = '\0'; + printf("base64 revision %s\n", p); + } else { + printf("base64: unknown version\n"); + } +} + +int main(int ac, char **av) { /* parse the options, then encode or decode the named file (or stdin) to stdout; exits 1 on usage or open errors */ + int encoding = 0; + int cpl = 64; + FILE *fp; + char *fname; + char *eptr; + + fname = NULL; + + while (--ac) { + char *v, *p = *++av; + if (*p == '-') { + while (*p) { + char c = *++p; + switch (c) { + case '-': /* long option: p points at the second '-', so the name starts at p+1 */ + if (!strcmp(p+1, "version")) { + version(); + exit(0); + } + if (!strcmp(p+1, "help")) { + usage(); + exit(0); + } + break; + case 'v': + case 'V': + version(); + exit(0); + break; + case 'h': + case 'H': + usage(); + exit(0); + break; + case 'd': + encoding = 0; + break; + case 'e': + encoding = 1; + break; + case 'c': + /* + * Options requiring values.
+ */ + v = p+1; + if (!*v && ac > 1) { + --ac; + v = *++av; + } + if (!*v) { + fprintf(stderr, "base64: option '-%c' expects" + " an argument\n", c); + exit(1); + } + switch (c) { + case 'c': + cpl = strtol(v, &eptr, 10); + if (eptr && *eptr) { + fprintf(stderr, "base64: option -c expects" + " a numeric argument\n"); + exit(1); + } + if (cpl % 4) { + fprintf(stderr, "base64: chars per line should be" + " divisible by 4\n"); + exit(1); + } + break; + } + p = ""; /* the rest of this word was the option's value; stop scanning it */ + break; + } + } + } else { + if (!fname) + fname = p; + else { + fprintf(stderr, "base64: expected only one filename\n"); + exit(1); /* a usage error must not report success */ + } + } + } + + if (fname) { + fp = fopen(fname, encoding ? "rb" : "r"); + if (!fp) { + fprintf(stderr, "base64: unable to open '%s': %s\n", fname, + strerror(errno)); + exit(1); + } + } else + fp = stdin; + + if (encoding) { + unsigned char in[3]; + char out[4]; + int column; + int n; + + column = 0; + while (1) { + if (cpl && column >= cpl) { + putchar('\n'); + column = 0; + } + n = fread(in, 1, 3, fp); + if (n == 0) break; + base64_encode_atom(in, n, out); + fwrite(out, 1, 4, stdout); + column += 4; + } + + putchar('\n'); + } else { + char in[4]; + unsigned char out[3]; + int c, i, n, eof; + + eof = 0; + do { + for (i = 0; i < 4; i++) { + do { + c = fgetc(fp); + } while (c != EOF && !isbase64(c)); /* skip everything outside the base64 alphabet */ + if (c == EOF) { + eof = 1; + break; + } + in[i] = c; + } + if (i > 0) { + if (i < 4) { + fprintf(stderr, "base64: warning: number of base64" + " characters was not a multiple of 4\n"); + while (i < 4) in[i++] = '='; + } + n = base64_decode_atom(in, out); + fwrite(out, 1, n, stdout); + } + } while (!eof); + } + + if (fname) + fclose(fp); + + return 0; +} diff --git a/cvt-utf8/Makefile b/cvt-utf8/Makefile new file mode 100644 index 0000000..e766e09 --- /dev/null +++ b/cvt-utf8/Makefile @@ -0,0 +1,7 @@ +all: cvt-utf8.1 + +%.1: %.but + halibut --man=$@ $< + +clean: + rm -f *.1 diff --git a/cvt-utf8/cvt-utf8 b/cvt-utf8/cvt-utf8 new file mode 100755 index 0000000..06a17fd ---
/dev/null +++ b/cvt-utf8/cvt-utf8 @@ -0,0 +1,460 @@ +#!/usr/bin/env python + +import sys +import string +import os +import anydbm +import zlib + +class zip_untangler: + def __init__(self, file, datasofar): + self.file = file + assert len(datasofar) < 30 + self.header = datasofar + self.data = "" + self.dataleft = None + self.decompress = zlib.decompressobj() + # Zlib header bytes, expected by decompress obj but not + # present in zip file + self.decompress.decompress("\x78\x9c") + + def readline(self): + if self.dataleft == None: + while len(self.header) < 30: + s = self.file.read(30 - len(self.header)) + assert s != "" + self.header = self.header + s + # Name length and extra length. + namelen = 256 * ord(self.header[27]) + ord(self.header[26]) + extralen = 256 * ord(self.header[29]) + ord(self.header[28]) + while len(self.header) < 30 + namelen + extralen: + s = self.file.read(30 + namelen + extralen - len(self.header)) + assert s != "" + self.header = self.header + s + self.dataleft = \ + 256 * (256 * (256 * ord(self.header[21]) + ord(self.header[20])) \ + + ord(self.header[19])) + ord(self.header[18]) + k = string.find(self.data, "\n") + while k < 0: + rlen = self.dataleft + if rlen > 4096: rlen = 4096 + if rlen == 0: break + d = self.file.read(rlen) + if d == "": break + self.dataleft = self.dataleft - rlen + self.data = self.data + self.decompress.decompress(d) + k = string.find(self.data, "\n") + if k < 0: + ret = self.data + self.data = "" + return ret + else: + ret = self.data[:k+1] + self.data = self.data[k+1:] + return ret + +def hexstr(x): + s = hex(x) + if s[-1:] == "L" or s[-1:] == "l": + s = s[:-1] + if s[:2] == "0x" or s[:2] == "0X": + s = s[2:] + return s + +def charname(x): + if db: + key = hexstr(x) + while len(key) < 4: key = "0" + key + key = string.upper(key) + if han_translations: + try: + value = handb[key] + return " " + value + except KeyError: + pass + try: + value = db[key] + return string.split(value, ";")[1] + except KeyError: + return 
"" + else: + return "" + +def output(char, bytes, errors): + if output_analysis: + if char == -1: + s = " " + else: + s = "U-%08X " % char + for i in bytes: + s = s + " %02X" % i + for i in range(6-len(bytes)): + s = s + " " + + if char == -1: + name = "" + else: + name = charname(char) + if name != "": + s = s + " " + name + s = s + errors + print s + else: + if char == -1 or errors != "": + # problem chars become U+FFFD REPLACEMENT CHARACTER + sys.stdout.write("\xEF\xBF\xBD") + else: + for i in bytes: + sys.stdout.write(chr(i)) + +def process_ucs(x, bytes=[], errors=""): + if x < 0x80: + utf8 = [x] + realbytes = 1 + else: + if x < 0x800: + tmp = (0xC0, 1) + elif x < 0x10000: + tmp = (0xE0, 2) + elif x < 0x200000: + tmp = (0xF0, 3) + elif x < 0x4000000: + tmp = (0xF8, 4) + else: + assert x < 0x80000000L + tmp = (0xFC, 5) + realbytes = tmp[1] + 1 + utf8 = [tmp[0] + (x >> (6*tmp[1]))] + for i in range(tmp[1]-1, -1, -1): + utf8.append(0x80 + (0x3F & (x >> (i*6)))) + + if bytes != [] and len(bytes) > realbytes: + errors = errors + " (overlong form of" + for i in utf8: + errors = errors + " %02X" % i + errors = errors + ")" + utf8 = bytes + if x >= 0xD800 and x <= 0xDFFF: + errors = errors + " (surrogate)" + if x >= 0xFFFE and x <= 0xFFFF: + errors = errors + " (invalid char)" + + output(x, utf8, errors) + +def process_utf8(next): + c = next() + while c != None: + char = [c] + i = c + if i < 0x80: + process_ucs(i) # single-byte char + c = next() + elif i == 0xfe or i == 0xff: + output(-1, char, " (invalid UTF-8 byte)") + c = next() + elif i >= 0x80 and i <= 0xbf: + output(-1, char, " (unexpected continuation byte)") + c = next() + else: + if i >= 0xC0 and i <= 0xDF: + acc = i &~ 0xC0 + cbytes = 1 + elif i >= 0xE0 and i <= 0xEF: + acc = i &~ 0xE0 + cbytes = 2 + elif i >= 0xF0 and i <= 0xF7: + acc = i &~ 0xF0 + cbytes = 3 + elif i >= 0xF8 and i <= 0xFB: + acc = i &~ 0xF8 + cbytes = 4 + elif i >= 0xFC and i <= 0xFD: + acc = i &~ 0xFC + cbytes = 5 + gotone = 0 + while 
cbytes > 0: + c = next() + if c == None or c < 0x80 or c > 0xBF: + gotone = 1 + break + char.append(c) + acc = (acc << 6) + (c & 0x3F) + cbytes = cbytes - 1 + if not gotone: + c = next() + if cbytes > 0: + output(-1, char, " (incomplete sequence)") + else: + process_ucs(acc, char) + +def do(args): + # Class to turn a list into a callable object that returns one + # element at a time. + class liststepper: + def __init__(self, list): + self.list = list + self.index = 0 + def __call__(self): + if self.index >= len(self.list): + return None + ret = self.list[self.index] + self.index = self.index + 1 + return ret + + list = [] + for arg in args: + if string.upper(arg[0]) == "U": + if len(list) > 0: + process_utf8(liststepper(list)) + list = [] + assert arg[1] == "+" or arg[1] == "-" + process_ucs(string.atoi(arg[2:], 16)) + else: + list.append(string.atoi(arg, 16)) + + if len(list) > 0: + process_utf8(liststepper(list)) + +args = sys.argv[1:] +output_analysis = 1 +han_translations = 0 + +if args == [] or args == ["--help"] or args == ["--help-admin"]: + print "Usage: cvt-utf8 [flags] " + print " e.g. 
cvt-utf8 e2 82 ac" + print " or cvt-utf8 U+20ac" + print " or cvt-utf8 U-10ffff" + print "" + print "Flags: -o or --output just output well-formed UTF-8 instead of" + print " an analysis of the input data" + print " -h or --han also give Han definitions from unihan db" + print "" + print "Also: cvt-utf8 --test run Markus Kuhn's decoder stress tests" #' + print " cvt-utf8 --input (or -i)" + print " read, analyse and decode UTF-8 from stdin" + if args == ["--help-admin"]: + print " cvt-utf8 --help display user help text" + print " cvt-utf8 --help-admin display admin help text (this one)" + print " cvt-utf8 --build " + print " convert UnicodeData.txt to unicode db" + print " cvt-utf8 --build-unihan " + print " convert Unihan.txt to unihan db" + print " cvt-utf8 --fetch-build " + print " "+\ + "build unicode db by download from unicode.org" + print " cvt-utf8 --fetch-build-unihan " + print " "+\ + "build Unihan db by download from unicode.org" + else: + print " cvt-utf8 --help display this help text" + print " cvt-utf8 --help-admin display admin help text" + sys.exit(0) + +if args[0] == "-o" or args[0] == "--output": + output_analysis = 0 + args = args[1:] + +if args[0] == "-h" or args[0] == "--han": + han_translations = 1 + args = args[1:] + +if args[0] == "--build" or args[0] == "--fetch-build": + if args[0] == "--build": + if len(args) != 3: + print "cvt-utf8: --build expects two filename arguments" + sys.exit(1) + infile = open(args[1], "r") + outfile = args[2] + else: + if len(args) != 2: + print "cvt-utf8: --fetch-build expects one filename argument" + sys.exit(1) + import urllib + infile = urllib.urlopen("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt") + outfile = args[1] + # Now build the database. 
+ if outfile[-3:] == ".db": + print "cvt-utf8: warning: you should not append .db to db name" + + db = anydbm.open(outfile, "n") + while 1: + s = infile.readline() + if s == "": break + ss = string.split(s, ";")[0] + db[ss] = s + db.close() + sys.exit(0) + +if args[0] == "--build-unihan" or args[0] == "--fetch-build-unihan": + if args[0] == "--build-unihan": + if len(args) != 3: + print "cvt-utf8: --build expects two filename arguments" + sys.exit(1) + infile = open(args[1], "r") + s = infile.read(1) + # Unihan.txt starts with a hash. If this file starts with a + # P, we assume it's a zip file ("PK"). + if s == "P": + infile = zip_untangler(infile, s) + s = "" + outfile = args[2] + else: + if len(args) != 2: + print "cvt-utf8: --fetch-build-unihan expects one filename argument" + sys.exit(1) + import urllib + infile = urllib.urlopen("ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip") + # We know this one is zipped. + infile = zip_untangler(infile, "") + outfile = args[1] + s = "" + # Now build the database. 
+ if outfile[-3:] == ".db": + print "cvt-utf8: warning: you should not append .db to db name" + + db = anydbm.open(outfile, "n") + while 1: + s = s + infile.readline() + if s == "": break + while s[-1:] == "\r" or s[-1:] == "\n": + s = s[:-1] + sa = string.split(s, "\t") + if len(sa) == 3 and sa[1] == "kDefinition" and sa[0][:2] == "U+": + db[sa[0][2:]] = sa[2] + s = "" + db.close() + sys.exit(0) + +locations = [] +locations.append("/usr/share/unicode/unicode") +locations.append("/usr/lib/unicode/unicode") +locations.append("/usr/local/share/unicode/unicode") +locations.append("/usr/local/lib/unicode/unicode") +locations.append(os.environ["HOME"] + "/share/unicode/unicode") +locations.append(os.environ["HOME"] + "/lib/unicode/unicode") + +for loc in locations: + try: + db = anydbm.open(loc, "r") + except IOError: + db = None + except anydbm.error: + db = None + if db != None: + break +if han_translations: + i = string.rfind(loc, "/") + assert i >= 0 + hanloc = loc[:i+1] + "unihan" + handb = anydbm.open(hanloc, "r") + # this has been explicitly required, so we don't squelch exceptions + +if args[0] == "--test": + do(["CE","BA","E1","BD","B9","CF","83","CE","BC","CE","B5"]) + do(["00"]) + do(["C2","80"]) + do(["E0","A0","80"]) + do(["F0","90","80","80"]) + do(["F8","88","80","80","80"]) + do(["FC","84","80","80","80","80"]) + do(["7F"]) + do(["DF","BF"]) + do(["EF","BF","BF"]) + do(["F7","BF","BF","BF"]) + do(["FB","BF","BF","BF","BF"]) + do(["FD","BF","BF","BF","BF","BF"]) + do(["ED","9F","BF"]) + do(["EE","80","80"]) + do(["EF","BF","BD"]) + do(["F4","8F","BF","BF"]) + do(["F4","90","80","80"]) + do(["80"]) + do(["BF"]) + do(["80","BF"]) + do(["80","BF","80"]) + do(["80","BF","80","BF"]) + do(["80","BF","80","BF","80"]) + do(["80","BF","80","BF","80","BF"]) + do(["80","BF","80","BF","80","BF","80"]) + do(["80","81","82","83","84","85","86","87", + "88","89","8A","8B","8C","8D","8E","8F", + "90","91","92","93","94","95","96","97", + 
"98","99","9A","9B","9C","9D","9E","9F", + "A0","A1","A2","A3","A4","A5","A6","A7", + "A8","A9","AA","AB","AC","AD","AE","AF", + "B0","B1","B2","B3","B4","B5","B6","B7", + "B8","B9","BA","BB","BC","BD","BE","BF"]) + do(["C0","20","C1","20","C2","20","C3","20", + "C4","20","C5","20","C6","20","C7","20", + "C8","20","C9","20","CA","20","CB","20", + "CC","20","CD","20","CE","20","CF","20", + "D0","20","D1","20","D2","20","D3","20", + "D4","20","D5","20","D6","20","D7","20", + "D8","20","D9","20","DA","20","DB","20", + "DC","20","DD","20","DE","20","DF","20"]) + do(["E0","20","E1","20","E2","20","E3","20", + "E4","20","E5","20","E6","20","E7","20", + "E8","20","E9","20","EA","20","EB","20", + "EC","20","ED","20","EE","20","EF","20"]) + do(["F0","20","F1","20","F2","20","F3","20", + "F4","20","F5","20","F6","20","F7","20"]) + do(["F8","20","F9","20","FA","20","FB","20"]) + do(["FC","20","FD","20"]) + do(["C0"]) + do(["E0","80"]) + do(["F0","80","80"]) + do(["F8","80","80","80"]) + do(["FC","80","80","80","80"]) + do(["DF"]) + do(["EF","BF"]) + do(["F7","BF","BF"]) + do(["FB","BF","BF","BF"]) + do(["FD","BF","BF","BF","BF"]) + do(["C0","E0","80","F0","80","80","F8","80", + "80","80","FC","80","80","80","80", + "DF","EF","BF","F7","BF","BF","FB", + "BF","BF","BF","FD","BF","BF","BF","BF"]) + do(["FE"]) + do(["FF"]) + do(["FE","FE","FF","FF"]) + do(["C0","AF"]) + do(["E0","80","AF"]) + do(["F0","80","80","AF"]) + do(["F8","80","80","80","AF"]) + do(["FC","80","80","80","80","AF"]) + do(["C1","BF"]) + do(["E0","9F","BF"]) + do(["F0","8F","BF","BF"]) + do(["F8","87","BF","BF","BF"]) + do(["FC","83","BF","BF","BF","BF"]) + do(["C0","80"]) + do(["E0","80","80"]) + do(["F0","80","80","80"]) + do(["F8","80","80","80","80"]) + do(["FC","80","80","80","80","80"]) + do(["ED","A0","80"]) + do(["ED","AD","BF"]) + do(["ED","AE","80"]) + do(["ED","AF","BF"]) + do(["ED","B0","80"]) + do(["ED","BE","80"]) + do(["ED","BF","BF"]) + do(["ED","A0","80","ED","B0","80"]) + 
do(["ED","A0","80","ED","BF","BF"]) + do(["ED","AD","BF","ED","B0","80"]) + do(["ED","AD","BF","ED","BF","BF"]) + do(["ED","AE","80","ED","B0","80"]) + do(["ED","AE","80","ED","BF","BF"]) + do(["ED","AF","BF","ED","B0","80"]) + do(["ED","AF","BF","ED","BF","8F"]) + do(["EF","BF","BE"]) + do(["EF","BF","BF"]) +elif args[0] == "--input" or args[0] == "-i": + def getchar(): + s = sys.stdin.read(1) + if s == "": + return None + return ord(s) & 0xFF # ensure it isn't negative + process_utf8(getchar) +else: + do(args) diff --git a/cvt-utf8/cvt-utf8.but b/cvt-utf8/cvt-utf8.but new file mode 100644 index 0000000..427c097 --- /dev/null +++ b/cvt-utf8/cvt-utf8.but @@ -0,0 +1,133 @@ +\cfg{man-identity}{cvt-utf8}{1}{2004-03-24}{Simon Tatham}{Simon Tatham} +\cfg{man-mindepth}{1} + +\C{cvt-utf8-manpage} Man page for \cw{cvt-utf8} + +\H{cvt-utf8-manpage-name} NAME + +\cw{cvt-utf8} - convert between UTF-8 and Unicode, and analyse Unicode + +\H{cvt-utf8-manpage-synopsis} SYNOPSIS + +\c cvt-utf8 [flags] [hex UTF-8 bytes and/or U+codepoints] +\e bbbbbbbb iiiii iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii + +\H{cvt-utf8-manpage-description} DESCRIPTION + +\cw{cvt-utf8} is a tool for manipulating and analysing UTF-8 and +Unicode data. Its functions include: + +\b Given a sequence of Unicode code points, convert them to the +corresponding sequence of bytes in the UTF-8 encoding. + +\b Given a sequence of UTF-8 bytes, convert them back into Unicode +code points. + +\b Given any combination of the above inputs, look up each Unicode +code point in the Unicode character database and identify it. + +\b Look up Unified Han characters in the \q{Unihan} database and +provide their translation text. + +By default, \cw{cvt-utf8} expects to receive hex numbers (either +UTF-8 bytes or Unicode code points) on the command line, and it will +print out a verbose analysis of the input data. 
If you need it to +read UTF-8 from standard input or to write pure UTF-8 to standard +output, you can do so using command-line options. + +\H{cvt-utf8-manpage-options} OPTIONS + +\dt \cw{-i} + +\dd Read UTF-8 data from standard input and analyse that, instead of +expecting hex numbers on the command line. + +\dt \cw{-o} + +\dd Write well-formed UTF-8 to standard output, instead of writing a +long analysis of the input data. + +\dt \cw{-h} + +\dd Look up each code point in the Unihan database as well as the +main Unicode character database. + +\H{cvt-utf8-manpage-examples} EXAMPLES + +In \cw{cvt-utf8}'s native mode, it simply analyses input Unicode or +UTF-8 data. For example, you can give a list of Unicode code +points... + +\c $ cvt-utf8 U+20ac U+31 U+30 +\e bbbbbbbbbbbbbbbbbbbbbbbbb +\c U-000020AC E2 82 AC EURO SIGN +\c U-00000031 31 DIGIT ONE +\c U-00000030 30 DIGIT ZERO + +... and \cw{cvt-utf8} gives you the UTF-8 encodings plus the +character definitions. + +Alternatively, you can supply a list of UTF-8 bytes... + +\c $ cvt-utf8 D0 A0 D1 83 D1 81 D1 81 D0 BA D0 B8 D0 B9 +\e bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb +\c U-00000420 D0 A0 CYRILLIC CAPITAL LETTER ER +\c U-00000443 D1 83 CYRILLIC SMALL LETTER U +\c U-00000441 D1 81 CYRILLIC SMALL LETTER ES +\c U-00000441 D1 81 CYRILLIC SMALL LETTER ES +\c U-0000043A D0 BA CYRILLIC SMALL LETTER KA +\c U-00000438 D0 B8 CYRILLIC SMALL LETTER I +\c U-00000439 D0 B9 CYRILLIC SMALL LETTER SHORT I + +... and you get back the same output format, including the Unicode +code points.
+ +If you supply malformed data, \cw{cvt-utf8} will break it down for +you and identify the malformed pieces and any correctly formed +characters: + +\c $ cvt-utf8 A9 FE 45 C2 80 90 0A +\e bbbbbbbbbbbbbbbbbbbbbbbbbbbbb +\c A9 (unexpected continuation byte) +\c FE (invalid UTF-8 byte) +\c U-00000045 45 LATIN CAPITAL LETTER E +\c U-00000080 C2 80 +\c 90 (unexpected continuation byte) +\c U-0000000A 0A + +If you need the UTF-8 encoding of a particular character, you can +use the \cw{-o} option to cause the UTF-8 to be written to standard +output: + +\c $ cvt-utf8 -o U+20AC >> my-utf8-file.txt +\e bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb + +If you have UTF-8 data in a file or output from another program, you +can use the \cw{-i} option to have \cw{cvt-utf8} analyse it. This +works particularly well if you also have my \cw{xcopy} program, +which can be told to extract UTF-8 data from the X selection and +write it to its standard output. With these two programs working +together, if you ever have trouble identifying some text in a +UTF-8-supporting web browser such as Mozilla, you can simply select +the text in question, switch to a terminal window, and type + +\c $ xcopy -u -r | cvt-utf8 -i +\e bbbbbbbbbbbbbbbbbbbbbbbbb + +If the text is in Chinese, you can get at least a general idea of +its meaning by using the \cw{-h} option to print the meaning of each +ideograph from the Unihan database. For example, if you pass in the +Chinese text meaning \q{Traditional Chinese}: + +\c $ cvt-utf8 -h U+7E41 U+9AD4 U+4E2D U+6587 +\e bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb +\c U-00007E41 E7 B9 81 complicated, complex, difficult +\c U-00009AD4 E9 AB 94 body; group, class, body, unit +\c U-00004E2D E4 B8 AD central; center, middle; in the +\c midst of; hit (target); attain +\c U-00006587 E6 96 87 literature, culture, writing + +\H{cvt-utf8-manpage-bugs} BUGS + +Command-line option processing is very basic. In particular, \cw{-h} +must come before \cw{-i} or it will not be recognised. 
diff --git a/xcopy/Makefile b/xcopy/Makefile new file mode 100644 index 0000000..6f193bc --- /dev/null +++ b/xcopy/Makefile @@ -0,0 +1,12 @@ +X11LIB=-L/usr/X11R6/lib -lX11 + +all: xcopy.1 xcopy + +xcopy: xcopy.c + $(CC) $(CFLAGS) -o $@ $< $(X11LIB) + +%.1: %.but + halibut --man=$@ $< + +clean: + rm -f *.1 xcopy diff --git a/xcopy/xcopy.but b/xcopy/xcopy.but new file mode 100644 index 0000000..ba5bcac --- /dev/null +++ b/xcopy/xcopy.but @@ -0,0 +1,104 @@ +\cfg{man-identity}{xcopy}{1}{2004-08-02}{Simon Tatham}{Simon Tatham} +\cfg{man-mindepth}{1} + +\C{xcopy-manpage} Man page for \cw{xcopy} + +\H{xcopy-manpage-name} NAME + +\cw{xcopy} - read and write text to/from an X selection from the +command line + +\H{xcopy-manpage-synopsis} SYNOPSIS + +\c xcopy [ -r ] [ -u | -c ] [ -C ] +\e bbbbb bb bb bb bb + +\H{xcopy-manpage-description} DESCRIPTION + +\cw{xcopy} is a command-line utility for manipulating the X selection. + +It has two basic modes. In read mode (\cw{xcopy -r}), it connects to +your X server, retrieves the contents of the selection as plain +text, and writes it on standard output. You would then typically +redirect its output into a file, or pipe it into some other program. + +In write mode (just \cw{xcopy}, if \cw{-r} is not specified), it +will read data from standard input, then connect to your X server +and place that data in the selection as plain text. So you can pipe +data into \cw{xcopy}, move to another application, and press Paste. + +The X selection model requires the selection-owning client to remain +connected to the server and hand out its data on request. Therefore, +\cw{xcopy} in write mode forks off a background process which does +this. The background process terminates once it is no longer the +selection owner (i.e. as soon as you select data in another +application), or if your X session finishes. 
Normally you can ignore +its presence, although it might become important to be aware of it +if (for example) the \cw{xcopy} background process were to be the +last X client still connected through an SSH tunnel. + +\cw{xcopy} currently only handles text data. However, it is capable +of handling it in the form of plain text, UTF-8, or compound +(multiple-character-set) text. Use the \cw{-u}, \cw{-c} and \cw{-C} +options to control this aspect of its behaviour. + +\H{xcopy-manpage-options} OPTIONS + +By default (if \cw{-r} is not supplied), \cw{xcopy} operates in +write mode. + +\dt \cw{-r} + +\dd Places \cw{xcopy} into read mode. + +By default (if neither \cw{-c} nor \cw{-u} is supplied), \cw{xcopy} +reads and writes the selection using the type \cw{STRING}, which +means that the input or output data is expected to be encoded in +ISO-8859-1. + +\dt \cw{-u} + +\dd In read mode, causes \cw{xcopy} to request the selection using +the type \cw{UTF8_STRING}, which typically means that the returned +data will be encoded as UTF-8. In write mode, causes \cw{xcopy} to +\e{give out} the selection as type \cw{UTF8_STRING}, meaning that +the data piped in to it is expected to be encoded as UTF-8. + +\dt \cw{-c} + +\dd Similar to \cw{-u}, but uses the type \cw{COMPOUND_TEXT} rather +than \cw{UTF8_STRING}. \cw{COMPOUND_TEXT} is a complex +multi-character-set encoding similar to ISO 2022, and is unlikely to +be a very useful form in which to pass data to or from non-X +programs. However, it might occasionally be useful to retrieve a +compound text selection using \cw{xcopy -r -c}, and later on return +it to the X selection using \cw{xcopy -c} so it can be pasted back +into a different application. + +In write mode, if \cw{xcopy} is operating in \cw{STRING} mode and a +pasting application requests the selection as \cw{COMPOUND_TEXT}, +\cw{xcopy} will convert the data automatically. This is normally +what you want. 
+ +\dt \cw{-C} + +\dd Suppresses conversion to compound text in write mode. This is +occasionally useful if you are pasting control characters, since the +compound text specification forbids any control characters and the +Xlib conversion functions honour this. If you are (for example) +trying to paste a sequence of editor commands into a terminal +window, you might well want to paste a string full of escape +sequences and control characters, in which case you may need to use +\cw{-C} to disable conversion to compound text. + +\H{xcopy-manpage-bugs} BUGS + +Occasionally \cw{xcopy -r} completely fails to notice selection data +owned by another process. I have not yet reproduced this reliably; +if anyone can, some work with \cw{xmon}(1) would be much +appreciated... + +Automatic conversion between compound text and UTF-8 is not +currently supported. There are Xlib functions to do it, although +they don't appear to work very well (missing out many characters +which they could have converted). diff --git a/xcopy/xcopy.c b/xcopy/xcopy.c new file mode 100644 index 0000000..c936cd7 --- /dev/null +++ b/xcopy/xcopy.c @@ -0,0 +1,359 @@ +/* + * xcopy: quickly pipe text data into, or out of, the primary X + * selection + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +int init_X(void); +void run_X(void); +void done_X(void); +void full_redraw(void); +void do_paste(Window window, Atom property, int Delete); + +char *pname; /* program name */ + +void error (char *fmt, ...); + +/* set from command-line parameters */ +char *display = NULL; +enum { STRING, CTEXT, UTF8 } mode = STRING; + +/* selection data */ +char *seltext; +int sellen, selsize; +#define SELDELTA 16384 + +/* functional parameters */ +int reading; /* read instead of writing? */ +int convert_to_ctext = True; /* Xmb convert to compound text? 
*/ + +int main(int ac, char **av) { + int n; + int eventloop; + + pname = *av; + + /* parse the command line arguments */ + while (--ac) { + char *p = *++av; + + if (!strcmp(p, "-display") || !strcmp(p, "-disp")) { + if (!av[1]) + error ("option `%s' expects a parameter", p); + display = *++av, --ac; + } else if (!strcmp(p, "-r")) { + reading = True; + } else if (!strcmp(p, "-u")) { + mode = UTF8; + } else if (!strcmp(p, "-c")) { + mode = CTEXT; + } else if (!strcmp(p, "-C")) { + convert_to_ctext = False; + } else if (*p=='-') { + error ("unrecognised option `%s'", p); + } else { + error ("no parameters required"); + } + } + + if (!reading) { + seltext = malloc(SELDELTA); + if (!seltext) + error ("out of memory"); + selsize = SELDELTA; + sellen = 0; + do { + n = fread(seltext+sellen, 1, selsize-sellen, stdin); + sellen += n; + if (sellen >= selsize) { + seltext = realloc(seltext, selsize += SELDELTA); + if (!seltext) + error ("out of memory"); + } + } while (n > 0); + if (sellen == selsize) { + seltext = realloc(seltext, selsize += SELDELTA); + if (!seltext) + error ("out of memory"); + } + seltext[sellen] = '\0'; + } + + eventloop = init_X(); + if (!reading) { + /* + * If we are writing the selection, we must go into the + * background now. + */ + int pid = fork(); + if (pid < 0) { + error("unable to fork: %s", strerror(errno)); + } else if (pid > 0) { + /* + * we are the parent; just exit + */ + return 0; + } + /* + * we are the child + */ + close(0); + close(1); + close(2); + chdir("/"); + } + if (eventloop) + run_X(); + done_X(); + return 0; +} + +/* handle errors */ + +void error (char *fmt, ...) 
{ + va_list ap; + char errbuf[200]; + + done_X(); + va_start (ap, fmt); + vsprintf (errbuf, fmt, ap); + va_end (ap); + fprintf (stderr, "%s: %s\n", pname, errbuf); + exit (1); +} + +/* begin the X interface */ + +char *lcasename = "xcopy"; +char *ucasename = "XCopy"; + +Display *disp = NULL; +Window ourwin = None; +Atom compound_text_atom, targets_atom; +int screen, wwidth, wheight; + +Atom strtype = XA_STRING; + +/* + * Returns TRUE if we need to enter an event loop, FALSE otherwise. + */ +int init_X(void) { + Window root; + int x = 0, y = 0, width = 512, height = 128; + int i, got = 0; + XWMHints wm_hints; + XSizeHints size_hints; + XClassHint class_hints; + XTextProperty textprop; + XGCValues gcv; + + /* open the X display */ + disp = XOpenDisplay (display); + if (!disp) + error ("unable to open display"); + + if (mode == UTF8) { + strtype = XInternAtom(disp, "UTF8_STRING", False); + if (!strtype) + error ("unable to get UTF8_STRING property"); + } else if (mode == CTEXT) { + strtype = XInternAtom(disp, "COMPOUND_TEXT", False); + if (!strtype) + error ("unable to get COMPOUND_TEXT property"); + } + targets_atom = XInternAtom(disp, "TARGETS", False); + if (!targets_atom) + error ("unable to get TARGETS property"); + + /* get the screen and root-window */ + screen = DefaultScreen (disp); + root = RootWindow (disp, screen); + + x = y = 0; + width = height = 10; /* doesn't really matter */ + + /* actually create the window */ + ourwin = XCreateSimpleWindow (disp, root, x, y, width, height,0, + BlackPixel(disp, screen), + WhitePixel(disp, screen)); + + /* resource class name */ + class_hints.res_name = lcasename; + class_hints.res_class = ucasename; + XSetClassHint (disp, ourwin, &class_hints); + + /* do selection fiddling */ + if (reading) { + /* + * We are reading the selection, so we must FIXME. + */ + if (XGetSelectionOwner(disp, XA_PRIMARY) == None) { + /* No primary selection, so use the cut buffer. 
*/ + do_paste(DefaultRootWindow(disp), XA_CUT_BUFFER0, False); + return False; + } else { + Atom sel_property = XInternAtom(disp, "VT_SELECTION", False); + XConvertSelection(disp, XA_PRIMARY, strtype, + sel_property, ourwin, CurrentTime); + return True; + } + } else { + /* + * We are writing to the selection, so we establish + * ourselves as selection owner. Also place the data in + * CUT_BUFFER0, if it isn't of an exotic type (cut buffers + * can only take ordinary string data, it turns out). + */ + XSetSelectionOwner (disp, XA_PRIMARY, ourwin, CurrentTime); + if (XGetSelectionOwner (disp, XA_PRIMARY) != ourwin) + error ("unable to obtain primary X selection\n"); + compound_text_atom = XInternAtom(disp, "COMPOUND_TEXT", False); + if (strtype == XA_STRING) { + /* + * ICCCM-required cut buffer initialisation. + */ + XChangeProperty(disp, root, XA_CUT_BUFFER0, + XA_STRING, 8, PropModeAppend, "", 0); + XChangeProperty(disp, root, XA_CUT_BUFFER1, + XA_STRING, 8, PropModeAppend, "", 0); + XChangeProperty(disp, root, XA_CUT_BUFFER2, + XA_STRING, 8, PropModeAppend, "", 0); + XChangeProperty(disp, root, XA_CUT_BUFFER3, + XA_STRING, 8, PropModeAppend, "", 0); + XChangeProperty(disp, root, XA_CUT_BUFFER4, + XA_STRING, 8, PropModeAppend, "", 0); + XChangeProperty(disp, root, XA_CUT_BUFFER5, + XA_STRING, 8, PropModeAppend, "", 0); + XChangeProperty(disp, root, XA_CUT_BUFFER6, + XA_STRING, 8, PropModeAppend, "", 0); + XChangeProperty(disp, root, XA_CUT_BUFFER7, + XA_STRING, 8, PropModeAppend, "", 0); + /* + * Rotate the cut buffers and add our text in CUT_BUFFER0. 
+ */ + XRotateBuffers(disp, 1); + XStoreBytes(disp, seltext, sellen); + } + return True; + } +} + +void run_X(void) { + XEvent ev, e2; + + while (1) { + XNextEvent (disp, &ev); + if (reading) { + switch (ev.type) { + case SelectionNotify: + if (ev.xselection.property != None) + do_paste(ev.xselection.requestor, + ev.xselection.property, True); + return; + } + } else { + switch (ev.type) { + case SelectionClear: + /* Selection has been cleared by another app. */ + return; + case SelectionRequest: + e2.xselection.type = SelectionNotify; + e2.xselection.requestor = ev.xselectionrequest.requestor; + e2.xselection.selection = ev.xselectionrequest.selection; + e2.xselection.target = ev.xselectionrequest.target; + e2.xselection.time = ev.xselectionrequest.time; + if (ev.xselectionrequest.target == strtype) { + XChangeProperty (disp, ev.xselectionrequest.requestor, + ev.xselectionrequest.property, strtype, + 8, PropModeReplace, seltext, sellen); + e2.xselection.property = ev.xselectionrequest.property; + } else if (ev.xselectionrequest.target == compound_text_atom && + convert_to_ctext) { + XTextProperty tp; + XmbTextListToTextProperty (disp, &seltext, 1, + XCompoundTextStyle, &tp); + XChangeProperty (disp, ev.xselectionrequest.requestor, + ev.xselectionrequest.property, + ev.xselectionrequest.target, + tp.format, PropModeReplace, + tp.value, tp.nitems); + e2.xselection.property = ev.xselectionrequest.property; + } else if (ev.xselectionrequest.target == targets_atom) { + Atom targets[2]; + int len = 0; + targets[len++] = strtype; + if (strtype != compound_text_atom && convert_to_ctext) + targets[len++] = compound_text_atom; + XChangeProperty (disp, ev.xselectionrequest.requestor, + ev.xselectionrequest.property, + ev.xselectionrequest.target, + 32, PropModeReplace, + (unsigned char *)targets, len); + } else { + e2.xselection.property = None; + } + XSendEvent (disp, ev.xselectionrequest.requestor, False, 0, &e2); + } + } + } +} + +void done_X(void) { + int i; + + if 
(ourwin != None) + XDestroyWindow (disp, ourwin); + if (disp) + XCloseDisplay (disp); +} + +void do_paste(Window window, Atom property, int Delete) { + Atom actual_type; + int actual_format, i; + long nitems, bytes_after, nread; + unsigned char *data; + + nread = 0; + while (XGetWindowProperty(disp, window, property, nread / 4, SELDELTA, + Delete, AnyPropertyType, &actual_type, + &actual_format, &nitems, &bytes_after, + (unsigned char **) &data) == Success) { + /* + * We expect all returned chunks of data to be multiples of + * 4 bytes (because we can only request the subsequent + * starting offset in 4-byte increments). Of course you can + * store an odd number of bytes in a selection, so this + * can't be the case every time XGetWindowProperty returns; + * but it should be the case every time it returns _and + * there is more data to come_. + * + * Hence, whenever XGetWindowProperty returns, we verify + * that the size of the data returned _last_ time was + * divisible by 4. + */ + if (nitems > 0) + assert((nread & 3) == 0); + + if (actual_type == strtype && nitems > 0) { + assert(actual_format == 8); + fwrite(data, 1, nitems, stdout); + nread += nitems; + } + XFree(data); + if (actual_type != strtype || nitems == 0) + break; + } +} -- 2.11.0