WINMISC = misc version winstore settings tree234 winnet proxy cmdline
UXMISC = misc version uxstore settings tree234 uxnet proxy cmdline
+# Character set library, for use in pterm.
+CHARSET = sbcsdat slookup sbcs utf8 toucs fromucs xenc mimeenc
+
# Standard libraries, and the same with WinSocks 1 and 2.
LIBS = advapi32.lib user32.lib gdi32.lib comctl32.lib comdlg32.lib
+ shell32.lib winmm.lib imm32.lib winspool.lib
+ sshpubk sshaes sshsh512 import winutils puttygen.res LIBS
pterm : [X] pterm terminal wcwidth uxucs uxmisc tree234 misc ldisc ldiscucs
- + logging uxprint settings pty be_none uxstore signal
+ + logging uxprint settings pty be_none uxstore signal CHARSET
plink : [U] uxplink uxcons NONSSH UXSSH be_all logging UXMISC
--- /dev/null
+This subdirectory contains a general character-set conversion
+library, used in the Unix port of PuTTY, and available for use in
+other ports if it should happen to be useful.
+
+I intend to use this same library in other programs at some future
+date. It is therefore a _strong_ design goal that this library
+should remain perfectly general, and not tied to particulars of
+PuTTY. It must not reference any code outside its own subdirectory;
+it should not have PuTTY-specific helper routines added to it unless
+they can be documented in a general manner which might make them
+useful in other circumstances as well.
--- /dev/null
+/*
+ * charset.h - header file for general character set conversion
+ * routines.
+ */
+
+#ifndef charset_charset_h
+#define charset_charset_h
+
+#include <stddef.h>
+
+/*
+ * Enumeration that lists all the multibyte or single-byte
+ * character sets known to this library.
+ *
+ * The enumerators take consecutive values starting from zero
+ * (CS_NONE), so inserting or reordering entries changes the numeric
+ * identifier of every charset that follows.
+ */
+typedef enum {
+ CS_NONE, /* used for reporting errors, etc */
+ CS_ISO8859_1,
+ CS_ISO8859_1_X11, /* X font encoding with VT100 glyphs */
+ CS_ISO8859_2,
+ CS_ISO8859_3,
+ CS_ISO8859_4,
+ CS_ISO8859_5,
+ CS_ISO8859_6,
+ CS_ISO8859_7,
+ CS_ISO8859_8,
+ CS_ISO8859_9,
+ CS_ISO8859_10,
+ CS_ISO8859_11,
+ /* there is no CS_ISO8859_12 in this list */
+ CS_ISO8859_13,
+ CS_ISO8859_14,
+ CS_ISO8859_15,
+ CS_ISO8859_16,
+ CS_CP437,
+ CS_CP850,
+ CS_CP1250,
+ CS_CP1251,
+ CS_CP1252,
+ CS_CP1253,
+ CS_CP1254,
+ CS_CP1255,
+ CS_CP1256,
+ CS_CP1257,
+ CS_CP1258,
+ CS_KOI8_R,
+ CS_KOI8_U,
+ CS_MAC_ROMAN,
+ CS_VISCII,
+ CS_HP_ROMAN8,
+ CS_DEC_MCS,
+ CS_UTF8
+} charset_t;
+
+/*
+ * Conversion state that a caller can carry from one call of a
+ * conversion routine to the next. charset_from_unicode() treats a
+ * NULL state pointer as a fresh state with s0 == 0.
+ */
+typedef struct {
+ unsigned long s0; /* passed to the charset's read/write function; not interpreted by callers */
+} charset_state;
+
+/*
+ * Routine to convert a MB/SB character set to Unicode.
+ *
+ * This routine accepts some number of bytes, updates a state
+ * variable, and outputs some number of Unicode characters. There
+ * are no guarantees. You can't even guarantee that at most one
+ * Unicode character will be output per byte you feed in; for
+ * example, suppose you're reading UTF-8, you've seen E1 80, and
+ * then you suddenly see FE. Now you need to output _two_ error
+ * characters - one for the incomplete sequence E1 80, and one for
+ * the completely invalid UTF-8 byte FE.
+ *
+ * Returns the number of wide characters output; will never output
+ * more than the size of the buffer (as specified on input).
+ * Advances the `input' pointer and decrements `inlen', to indicate
+ * how far along the input string it got.
+ *
+ * The sequence of `errlen' wide characters pointed to by `errstr'
+ * will be used to indicate a conversion error. If `errstr' is
+ * NULL, `errlen' will be ignored, and the library will choose
+ * something sensible to do on its own. For Unicode, this will be
+ * U+FFFD (REPLACEMENT CHARACTER).
+ */
+
+int charset_to_unicode(char **input, int *inlen, wchar_t *output, int outlen,
+ int charset, charset_state *state,
+ const wchar_t *errstr, int errlen);
+
+/*
+ * Routine to convert Unicode to an MB/SB character set.
+ *
+ * This routine accepts some number of Unicode characters, updates
+ * a state variable, and outputs some number of bytes.
+ *
+ * Returns the number of bytes output; will never output
+ * more than the size of the buffer (as specified on input), and
+ * will never output a partial MB character. Advances the `input'
+ * pointer and decrements `inlen', to indicate how far along the
+ * input string it got.
+ *
+ * The sequence of `errlen' characters pointed to by `errstr' will
+ * be used to indicate a conversion error. If `errstr' is NULL,
+ * `errlen' will be ignored, and the library will choose something
+ * sensible to do on its own (which will vary depending on the
+ * output charset).
+ */
+
+int charset_from_unicode(wchar_t **input, int *inlen, char *output, int outlen,
+ int charset, charset_state *state,
+ const char *errstr, int errlen);
+
+/*
+ * Convert X11 encoding names to and from our charset identifiers.
+ */
+const char *charset_to_xenc(int charset);
+int charset_from_xenc(const char *name);
+
+/*
+ * Convert MIME encoding names to and from our charset identifiers.
+ *
+ * charset_to_mimeenc() returns the first name listed for the given
+ * charset in the table in mimeenc.c, or NULL if the charset has no
+ * entry there. charset_from_mimeenc() compares names without regard
+ * to case and returns CS_NONE if the name is not in the table.
+ */
+const char *charset_to_mimeenc(int charset);
+int charset_from_mimeenc(const char *name);
+
+#endif /* charset_charset_h */
--- /dev/null
+/*
+ * enum.c - enumerate all charsets defined by the library.
+ *
+ * This file maintains a list of every other source file which
+ * contains ENUM_CHARSET definitions. It #includes each one with
+ * ENUM_CHARSETS defined, which causes those source files to do
+ * nothing at all except call the ENUM_CHARSET macro on each
+ * charset they define.
+ *
+ * This file in turn is included from various other places, with
+ * the ENUM_CHARSET macro defined to various different things. This
+ * allows us to have multiple implementations of the master charset
+ * lookup table (a static one and a dynamic one).
+ */
+
+#define ENUM_CHARSETS
+#include "sbcsdat.c"
+#include "utf8.c"
+#undef ENUM_CHARSETS
--- /dev/null
+/*
+ * fromucs.c - convert Unicode to other character sets.
+ */
+
+#include "charset.h"
+#include "internal.h"
+
+/*
+ * Context handed to charset_emit() through its `ctx' pointer while
+ * charset_from_unicode() is converting.
+ */
+struct charset_emit_param {
+ char *output; /* next free position in the output buffer */
+ int outlen; /* bytes of space remaining at `output' */
+ const char *errstr; /* bytes written in place of an ERROR value */
+ int errlen; /* number of bytes in `errstr' */
+ int stopped; /* set to 1 when an emit did not fit in the buffer */
+};
+
+/*
+ * Emit callback used by charset_from_unicode(). `ctx' points at a
+ * struct charset_emit_param. An ordinary value is stored as a single
+ * byte; ERROR is replaced by the configured error string. If the
+ * bytes do not all fit in the remaining space, nothing is written
+ * and the `stopped' flag is raised instead.
+ */
+static void charset_emit(void *ctx, long int output)
+{
+    struct charset_emit_param *ep = (struct charset_emit_param *)ctx;
+    char single;
+    char const *src;
+    int count;
+    int i;
+
+    if (output != ERROR) {
+        /* Normal case: exactly one byte, taken from the value itself. */
+        single = output;
+        src = &single;
+        count = 1;
+    } else {
+        src = ep->errstr;
+        count = ep->errlen;
+    }
+
+    /* All or nothing: never write part of a sequence. */
+    if (ep->outlen < count) {
+        ep->stopped = 1;
+        return;
+    }
+
+    for (i = 0; i < count; i++) {
+        *ep->output++ = src[i];
+        ep->outlen--;
+    }
+}
+
+/*
+ * Convert Unicode characters into the byte encoding `charset'.
+ *
+ * input, inlen: pointer to the remaining input and its length in
+ *     wide characters; both are advanced past every character that
+ *     was fully converted.
+ * output, outlen: destination buffer and its size in bytes.
+ * state: optional conversion state. If non-NULL it seeds the
+ *     conversion and is updated after each input character that was
+ *     converted in full; if NULL a zeroed state is used.
+ * errstr, errlen: bytes to emit for a character that cannot be
+ *     represented. If errstr is NULL, errlen is ignored and "." is
+ *     used.
+ *
+ * Returns the number of bytes written to `output'. A character whose
+ * output does not fit in the buffer is not consumed and does not
+ * count towards the return value.
+ *
+ * NOTE(review): charset_find_spec() is not visible here; if it can
+ * return NULL for an unknown charset, the call through `spec' below
+ * needs a guard - confirm and decide what to return in that case.
+ */
+int charset_from_unicode(wchar_t **input, int *inlen, char *output, int outlen,
+                         int charset, charset_state *state,
+                         const char *errstr, int errlen)
+{
+    charset_spec const *spec = charset_find_spec(charset);
+    charset_state localstate;
+    struct charset_emit_param param;
+
+    param.output = output;
+    param.outlen = outlen;
+    param.stopped = 0;
+
+    /*
+     * charset_emit will expect a valid errstr.
+     */
+    if (errstr) {
+        param.errstr = errstr;
+        param.errlen = errlen;
+    } else {
+        /* *shrug* this is good enough, and consistent across all SBCS... */
+        param.errstr = ".";
+        param.errlen = 1;
+    }
+
+    /*
+     * Work on a private copy of the state, so that a character which
+     * turns out not to fit leaves the caller's state untouched. The
+     * caller's pointer is kept so the copy can be written back.
+     */
+    if (state)
+        localstate = *state;           /* structure copy */
+    else
+        localstate.s0 = 0;
+
+    while (*inlen > 0) {
+        int lenbefore = param.output - output;
+        spec->write(spec, **input, &localstate, charset_emit, &param);
+        if (param.stopped) {
+            /*
+             * The emit function has _tried_ to output some
+             * characters, but ran up against the end of the
+             * buffer. Leave immediately, and return what happened
+             * _before_ attempting to process this character.
+             */
+            return lenbefore;
+        }
+        if (state)
+            *state = localstate;       /* structure copy */
+        (*input)++;
+        (*inlen)--;
+    }
+    return param.output - output;
+}
--- /dev/null
+/*
+ * internal.h - internal header stuff for the charset library.
+ */
+
+#ifndef charset_internal_h
+#define charset_internal_h
+
+/*
+ * Number of elements in the array x. This invariably comes in handy.
+ * It divides the size of x by the size of its first element, so x
+ * must be a real array and not a pointer.
+ */
+#define lenof(x) ( sizeof((x)) / sizeof(*(x)) )
+
+/*
+ * This is an invalid Unicode value used to indicate an error. It is
+ * the value passed to `emit' callbacks for an encoding error or a
+ * non-representable character, and the value stored at undefined
+ * positions of an SBCS translation table.
+ */
+#define ERROR 0xFFFFL /* Unicode value representing error */
+
+typedef struct charset_spec charset_spec;
+typedef struct sbcs_data sbcs_data;
+
+/*
+ * Description of one character set: its identifier, the functions
+ * that convert to and from Unicode, and a pointer to whatever data
+ * those functions need.
+ */
+struct charset_spec {
+ int charset; /* numeric identifier */
+
+ /*
+ * A function to read the character set and output Unicode
+ * characters. The `emit' function expects to get Unicode chars
+ * passed to it; it should be sent ERROR for any encoding error
+ * on the input.
+ */
+ void (*read)(charset_spec const *charset, long int input_chr,
+ charset_state *state,
+ void (*emit)(void *ctx, long int output), void *emitctx);
+ /*
+ * A function to read Unicode characters and output in this
+ * character set. The `emit' function expects to get byte
+ * values passed to it; it should be sent ERROR for any
+ * non-representable characters on the input.
+ */
+ void (*write)(charset_spec const *charset, long int input_chr,
+ charset_state *state,
+ void (*emit)(void *ctx, long int output), void *emitctx);
+ /*
+ * Data for the read and write functions above; its format is
+ * up to them. For the SBCS functions it is described by
+ * struct sbcs_data.
+ */
+ void const *data;
+};
+
+/*
+ * This is the format of `data' used by the SBCS read and write
+ * functions; so it's the format used in all SBCS definitions.
+ */
+struct sbcs_data {
+ /*
+ * This is a simple mapping table converting each SBCS position
+ * to a Unicode code point. Some positions may contain ERROR,
+ * indicating that that byte value is not defined in the SBCS
+ * in question and its occurrence in input is an error.
+ */
+ unsigned long sbcs2ucs[256];
+
+ /*
+ * This lookup table is used to convert Unicode back to the
+ * SBCS. It consists of the valid byte values in the SBCS,
+ * sorted in order of their Unicode translation. So given a
+ * Unicode value U, you can do a binary search on this table
+ * using the above table as a lookup: when testing the Xth
+ * position in this table, you branch according to whether
+ * sbcs2ucs[ucs2sbcs[X]] is less than, greater than, or equal
+ * to U.
+ *
+ * Note that since there may be fewer than 256 valid byte
+ * values in a particular SBCS, we must supply the length of
+ * this table as well as the contents.
+ */
+ unsigned char ucs2sbcs[256];
+ int nvalid; /* number of entries of ucs2sbcs[] that are in use */
+};
+
+/*
+ * Prototypes for internal library functions.
+ */
+charset_spec const *charset_find_spec(int charset);
+void read_sbcs(charset_spec const *charset, long int input_chr,
+ charset_state *state,
+ void (*emit)(void *ctx, long int output), void *emitctx);
+void write_sbcs(charset_spec const *charset, long int input_chr,
+ charset_state *state,
+ void (*emit)(void *ctx, long int output), void *emitctx);
+
+/*
+ * Placate compiler warning about unused parameters, of which we
+ * expect to have some in this library.
+ *
+ * The macro expands to a self-assignment, so its argument must be a
+ * modifiable lvalue.
+ */
+#define UNUSEDARG(x) ( (x) = (x) )
+
+#endif /* charset_internal_h */
--- /dev/null
+/*
+ * mimeenc.c - translate our internal character set codes to and
+ * from MIME standard character-set names.
+ *
+ */
+
+#include <ctype.h>
+#include "charset.h"
+#include "internal.h"
+
+/*
+ * Table pairing MIME character-set names with our charset
+ * identifiers. It is searched linearly from the top by both
+ * charset_to_mimeenc() and charset_from_mimeenc(), so the order of
+ * entries for one charset matters: the first one is the name
+ * reported for that charset.
+ *
+ * CS_ISO8859_1_X11 and CS_ISO8859_11 have no entry here, so
+ * charset_to_mimeenc() returns NULL for them.
+ */
+static const struct {
+ const char *name; /* MIME name or alias; matched case-insensitively */
+ int charset; /* the CS_* identifier it denotes */
+} mimeencs[] = {
+ /*
+ * These names are taken from
+ *
+ * http://www.iana.org/assignments/character-sets
+ *
+ * Where multiple encoding names map to the same encoding id
+ * (such as the variety of aliases for ISO-8859-1), the first
+ * is considered canonical and will be returned when
+ * translating the id to a string.
+ */
+ { "ISO-8859-1", CS_ISO8859_1 },
+ { "iso-ir-100", CS_ISO8859_1 },
+ { "ISO_8859-1", CS_ISO8859_1 },
+ { "ISO_8859-1:1987", CS_ISO8859_1 },
+ { "latin1", CS_ISO8859_1 },
+ { "l1", CS_ISO8859_1 },
+ { "IBM819", CS_ISO8859_1 },
+ { "CP819", CS_ISO8859_1 },
+ { "csISOLatin1", CS_ISO8859_1 },
+
+ { "ISO-8859-2", CS_ISO8859_2 },
+ { "ISO_8859-2:1987", CS_ISO8859_2 },
+ { "iso-ir-101", CS_ISO8859_2 },
+ { "ISO_8859-2", CS_ISO8859_2 },
+ { "latin2", CS_ISO8859_2 },
+ { "l2", CS_ISO8859_2 },
+ { "csISOLatin2", CS_ISO8859_2 },
+
+ { "ISO-8859-3", CS_ISO8859_3 },
+ { "ISO_8859-3:1988", CS_ISO8859_3 },
+ { "iso-ir-109", CS_ISO8859_3 },
+ { "ISO_8859-3", CS_ISO8859_3 },
+ { "latin3", CS_ISO8859_3 },
+ { "l3", CS_ISO8859_3 },
+ { "csISOLatin3", CS_ISO8859_3 },
+
+ { "ISO-8859-4", CS_ISO8859_4 },
+ { "ISO_8859-4:1988", CS_ISO8859_4 },
+ { "iso-ir-110", CS_ISO8859_4 },
+ { "ISO_8859-4", CS_ISO8859_4 },
+ { "latin4", CS_ISO8859_4 },
+ { "l4", CS_ISO8859_4 },
+ { "csISOLatin4", CS_ISO8859_4 },
+
+ { "ISO-8859-5", CS_ISO8859_5 },
+ { "ISO_8859-5:1988", CS_ISO8859_5 },
+ { "iso-ir-144", CS_ISO8859_5 },
+ { "ISO_8859-5", CS_ISO8859_5 },
+ { "cyrillic", CS_ISO8859_5 },
+ { "csISOLatinCyrillic", CS_ISO8859_5 },
+
+ { "ISO-8859-6", CS_ISO8859_6 },
+ { "ISO_8859-6:1987", CS_ISO8859_6 },
+ { "iso-ir-127", CS_ISO8859_6 },
+ { "ISO_8859-6", CS_ISO8859_6 },
+ { "ECMA-114", CS_ISO8859_6 },
+ { "ASMO-708", CS_ISO8859_6 },
+ { "arabic", CS_ISO8859_6 },
+ { "csISOLatinArabic", CS_ISO8859_6 },
+
+ { "ISO-8859-7", CS_ISO8859_7 },
+ { "ISO_8859-7:1987", CS_ISO8859_7 },
+ { "iso-ir-126", CS_ISO8859_7 },
+ { "ISO_8859-7", CS_ISO8859_7 },
+ { "ELOT_928", CS_ISO8859_7 },
+ { "ECMA-118", CS_ISO8859_7 },
+ { "greek", CS_ISO8859_7 },
+ { "greek8", CS_ISO8859_7 },
+ { "csISOLatinGreek", CS_ISO8859_7 },
+
+ { "ISO-8859-8", CS_ISO8859_8 },
+ { "ISO_8859-8:1988", CS_ISO8859_8 },
+ { "iso-ir-138", CS_ISO8859_8 },
+ { "ISO_8859-8", CS_ISO8859_8 },
+ { "hebrew", CS_ISO8859_8 },
+ { "csISOLatinHebrew", CS_ISO8859_8 },
+
+ { "ISO-8859-9", CS_ISO8859_9 },
+ { "ISO_8859-9:1989", CS_ISO8859_9 },
+ { "iso-ir-148", CS_ISO8859_9 },
+ { "ISO_8859-9", CS_ISO8859_9 },
+ { "latin5", CS_ISO8859_9 },
+ { "l5", CS_ISO8859_9 },
+ { "csISOLatin5", CS_ISO8859_9 },
+
+ { "ISO-8859-10", CS_ISO8859_10 },
+ { "iso-ir-157", CS_ISO8859_10 },
+ { "l6", CS_ISO8859_10 },
+ { "ISO_8859-10:1992", CS_ISO8859_10 },
+ { "csISOLatin6", CS_ISO8859_10 },
+ { "latin6", CS_ISO8859_10 },
+
+ { "ISO-8859-13", CS_ISO8859_13 },
+
+ { "ISO-8859-14", CS_ISO8859_14 },
+ { "iso-ir-199", CS_ISO8859_14 },
+ { "ISO_8859-14:1998", CS_ISO8859_14 },
+ { "ISO_8859-14", CS_ISO8859_14 },
+ { "latin8", CS_ISO8859_14 },
+ { "iso-celtic", CS_ISO8859_14 },
+ { "l8", CS_ISO8859_14 },
+
+ { "ISO-8859-15", CS_ISO8859_15 },
+ { "ISO_8859-15", CS_ISO8859_15 },
+ { "Latin-9", CS_ISO8859_15 },
+
+ { "ISO-8859-16", CS_ISO8859_16 },
+ { "iso-ir-226", CS_ISO8859_16 },
+ { "ISO_8859-16", CS_ISO8859_16 },
+ { "ISO_8859-16:2001", CS_ISO8859_16 },
+ { "latin10", CS_ISO8859_16 },
+ { "l10", CS_ISO8859_16 },
+
+ { "IBM437", CS_CP437 },
+ { "cp437", CS_CP437 },
+ { "437", CS_CP437 },
+ { "csPC8CodePage437", CS_CP437 },
+
+ { "IBM850", CS_CP850 },
+ { "cp850", CS_CP850 },
+ { "850", CS_CP850 },
+ { "csPC850Multilingual", CS_CP850 },
+
+ { "windows-1250", CS_CP1250 },
+
+ { "windows-1251", CS_CP1251 },
+
+ { "windows-1252", CS_CP1252 },
+
+ { "windows-1253", CS_CP1253 },
+
+ { "windows-1254", CS_CP1254 },
+
+ { "windows-1255", CS_CP1255 },
+
+ { "windows-1256", CS_CP1256 },
+
+ { "windows-1257", CS_CP1257 },
+
+ { "windows-1258", CS_CP1258 },
+
+ { "KOI8-R", CS_KOI8_R },
+ { "csKOI8R", CS_KOI8_R },
+
+ { "KOI8-U", CS_KOI8_U },
+
+ { "macintosh", CS_MAC_ROMAN },
+ { "mac", CS_MAC_ROMAN },
+ { "csMacintosh", CS_MAC_ROMAN },
+
+ { "VISCII", CS_VISCII },
+ { "csVISCII", CS_VISCII },
+
+ { "hp-roman8", CS_HP_ROMAN8 },
+ { "roman8", CS_HP_ROMAN8 },
+ { "r8", CS_HP_ROMAN8 },
+ { "csHPRoman8", CS_HP_ROMAN8 },
+
+ { "DEC-MCS", CS_DEC_MCS },
+ { "dec", CS_DEC_MCS },
+ { "csDECMCS", CS_DEC_MCS },
+
+ { "UTF-8", CS_UTF8 },
+};
+
+/*
+ * Look up the MIME name for a charset identifier. The table is
+ * walked from the top, so the first name listed for the charset is
+ * the one returned. Returns NULL if the charset has no entry.
+ */
+const char *charset_to_mimeenc(int charset)
+{
+    const int count = (int)lenof(mimeencs);
+    int idx = 0;
+
+    while (idx < count) {
+        if (mimeencs[idx].charset == charset)
+            return mimeencs[idx].name;
+        idx++;
+    }
+
+    /* Fell off the end of the table: no MIME name for this charset. */
+    return NULL;
+}
+
+/*
+ * Look up a charset identifier from a MIME name or alias.
+ *
+ * The comparison ignores case and requires the whole name to match.
+ * Returns the identifier of the first matching table entry, or
+ * CS_NONE if `name' is NULL or does not match any entry.
+ */
+int charset_from_mimeenc(const char *name)
+{
+    int i;
+
+    if (!name)
+        return CS_NONE;                /* nothing to look up */
+
+    for (i = 0; i < (int)lenof(mimeencs); i++) {
+        const char *p = name;
+        const char *q = mimeencs[i].name;
+
+        /*
+         * tolower() is only defined for values representable as
+         * unsigned char (or EOF), and plain char may be signed, so
+         * convert before calling it.
+         */
+        while (*p && *q &&
+               tolower((unsigned char)*p) == tolower((unsigned char)*q)) {
+            p++;
+            q++;
+        }
+
+        /* A match only if both strings ended at the same point. */
+        if (!*p && !*q)
+            return mimeencs[i].charset;
+    }
+
+    return CS_NONE;                    /* not found */
+}
--- /dev/null
+/*
+ * sbcs.c - routines to handle single-byte character sets.
+ */
+
+#include "charset.h"
+#include "internal.h"
+
+/*
+ * The charset_spec for any single-byte character set should
+ * provide read_sbcs() as its read function, and its `data' field
+ * should point to a struct sbcs_data (see internal.h), whose
+ * sbcs2ucs[] member holds the 256 entries of the translation
+ * table.
+ */
+
+/*
+ * Translate one byte of a single-byte character set to Unicode.
+ *
+ * charset: the spec whose `data' holds the translation table.
+ * input_chr: the byte value to translate.
+ * state: unused; single-byte charsets keep no state.
+ * emit, emitctx: callback (and its context) that receives the
+ *     Unicode value found in the table. It receives ERROR instead
+ *     if input_chr is not a valid index into the table; the table
+ *     itself may also hold ERROR for undefined positions.
+ */
+void read_sbcs(charset_spec const *charset, long int input_chr,
+               charset_state *state,
+               void (*emit)(void *ctx, long int output), void *emitctx)
+{
+    sbcs_data const *sd = (sbcs_data const *)charset->data;
+
+    UNUSEDARG(state);
+
+    /* Never index outside the table, whatever the caller passes. */
+    if (input_chr < 0 || input_chr >= (long int)lenof(sd->sbcs2ucs)) {
+        emit(emitctx, ERROR);
+        return;
+    }
+
+    emit(emitctx, sd->sbcs2ucs[input_chr]);
+}
+
+/*
+ * Translate one Unicode character to a single-byte character set.
+ *
+ * charset: the spec whose `data' points to a struct sbcs_data.
+ * input_chr: the Unicode value to translate.
+ * state: unused; single-byte charsets keep no state.
+ * emit, emitctx: callback (and its context) that receives the byte
+ *     value whose table entry equals input_chr, or ERROR if there
+ *     is no such byte.
+ */
+void write_sbcs(charset_spec const *charset, long int input_chr,
+                charset_state *state,
+                void (*emit)(void *ctx, long int output), void *emitctx)
+{
+    sbcs_data const *sd = (sbcs_data const *)charset->data;
+    int i;
+
+    UNUSEDARG(state);
+
+    /*
+     * ERROR is what the table stores at undefined positions, so an
+     * input equal to ERROR must not be allowed to match one of them.
+     * Negative inputs cannot match any table entry either.
+     */
+    if (input_chr >= 0 && input_chr != ERROR) {
+        /*
+         * FIXME: this should work, but it's ludicrously inefficient.
+         * We should be using the ucs2sbcs table.
+         */
+        for (i = 0; i < (int)lenof(sd->sbcs2ucs); i++) {
+            if (sd->sbcs2ucs[i] == (unsigned long)input_chr) {
+                emit(emitctx, i);
+                return;
+            }
+        }
+    }
+
+    emit(emitctx, ERROR);
+}
--- /dev/null
+ Data file defining single-byte character sets.
+
+ All lines which begin with whitespace are considered comments.
+
+ To generate an SBCS table from a unicode.org mapping table:
+
+ gensbcs() {
+ wget -q -O - "$1" | tr '\r' '\n' | \
+ perl -ne '/^(0x.*)\s+(0x.*)\s+/ and $a[hex $1]=sprintf "%04x", hex $2;' \
+ -e 'BEGIN{for($i=0;$i<256;$i++){$a[$i]="XXXX";' \
+ -e ' if ($i < 32 or $i == 127) {$a[$i]=sprintf "%04x", $i}}}' \
+ -e 'END{for($i=0;$i<256;$i++){printf"%s%s",$a[$i],$i%16==15?"\n":" "}}'
+ }
+
+ (A couple of noteworthy ickinesses here. For a start, any
+ undefined characters in the control-code regions (00-1F and 7F)
+ are assumed to be the Unicode code point corresponding to their
+ index, since the Mac Roman mapping table declines to define them
+ but realistically you don't want to be messing with that sort of
+ thing. Secondly, the Mac mapping tables are shipped with Mac line
+ endings, so note the `tr' to turn them into something legible to
+ Perl...)
+
+ Here are the ISO-8859-x tables, generated by this piece of Bourne
+ shell:
+
+ for i in 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16; do
+ echo charset CS_ISO8859_$i
+ gensbcs http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-$i.TXT
+ echo
+ done
+
+charset CS_ISO8859_1
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
+0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
+00a0 00a1 00a2 00a3 00a4 00a5 00a6 00a7 00a8 00a9 00aa 00ab 00ac 00ad 00ae 00af
+00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 00ba 00bb 00bc 00bd 00be 00bf
+00c0 00c1 00c2 00c3 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf
+00d0 00d1 00d2 00d3 00d4 00d5 00d6 00d7 00d8 00d9 00da 00db 00dc 00dd 00de 00df
+00e0 00e1 00e2 00e3 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef
+00f0 00f1 00f2 00f3 00f4 00f5 00f6 00f7 00f8 00f9 00fa 00fb 00fc 00fd 00fe 00ff
+
+charset CS_ISO8859_2
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
+0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
+00a0 0104 02d8 0141 00a4 013d 015a 00a7 00a8 0160 015e 0164 0179 00ad 017d 017b
+00b0 0105 02db 0142 00b4 013e 015b 02c7 00b8 0161 015f 0165 017a 02dd 017e 017c
+0154 00c1 00c2 0102 00c4 0139 0106 00c7 010c 00c9 0118 00cb 011a 00cd 00ce 010e
+0110 0143 0147 00d3 00d4 0150 00d6 00d7 0158 016e 00da 0170 00dc 00dd 0162 00df
+0155 00e1 00e2 0103 00e4 013a 0107 00e7 010d 00e9 0119 00eb 011b 00ed 00ee 010f
+0111 0144 0148 00f3 00f4 0151 00f6 00f7 0159 016f 00fa 0171 00fc 00fd 0163 02d9
+
+charset CS_ISO8859_3
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
+0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
+00a0 0126 02d8 00a3 00a4 XXXX 0124 00a7 00a8 0130 015e 011e 0134 00ad XXXX 017b
+00b0 0127 00b2 00b3 00b4 00b5 0125 00b7 00b8 0131 015f 011f 0135 00bd XXXX 017c
+00c0 00c1 00c2 XXXX 00c4 010a 0108 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf
+XXXX 00d1 00d2 00d3 00d4 0120 00d6 00d7 011c 00d9 00da 00db 00dc 016c 015c 00df
+00e0 00e1 00e2 XXXX 00e4 010b 0109 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef
+XXXX 00f1 00f2 00f3 00f4 0121 00f6 00f7 011d 00f9 00fa 00fb 00fc 016d 015d 02d9
+
+charset CS_ISO8859_4
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
+0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
+00a0 0104 0138 0156 00a4 0128 013b 00a7 00a8 0160 0112 0122 0166 00ad 017d 00af
+00b0 0105 02db 0157 00b4 0129 013c 02c7 00b8 0161 0113 0123 0167 014a 017e 014b
+0100 00c1 00c2 00c3 00c4 00c5 00c6 012e 010c 00c9 0118 00cb 0116 00cd 00ce 012a
+0110 0145 014c 0136 00d4 00d5 00d6 00d7 00d8 0172 00da 00db 00dc 0168 016a 00df
+0101 00e1 00e2 00e3 00e4 00e5 00e6 012f 010d 00e9 0119 00eb 0117 00ed 00ee 012b
+0111 0146 014d 0137 00f4 00f5 00f6 00f7 00f8 0173 00fa 00fb 00fc 0169 016b 02d9
+
+charset CS_ISO8859_5
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
+0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
+00a0 0401 0402 0403 0404 0405 0406 0407 0408 0409 040a 040b 040c 00ad 040e 040f
+0410 0411 0412 0413 0414 0415 0416 0417 0418 0419 041a 041b 041c 041d 041e 041f
+0420 0421 0422 0423 0424 0425 0426 0427 0428 0429 042a 042b 042c 042d 042e 042f
+0430 0431 0432 0433 0434 0435 0436 0437 0438 0439 043a 043b 043c 043d 043e 043f
+0440 0441 0442 0443 0444 0445 0446 0447 0448 0449 044a 044b 044c 044d 044e 044f
+2116 0451 0452 0453 0454 0455 0456 0457 0458 0459 045a 045b 045c 00a7 045e 045f
+
+charset CS_ISO8859_6
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
+0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
+00a0 XXXX XXXX XXXX 00a4 XXXX XXXX XXXX XXXX XXXX XXXX XXXX 060c 00ad XXXX XXXX
+XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX 061b XXXX XXXX XXXX 061f
+XXXX 0621 0622 0623 0624 0625 0626 0627 0628 0629 062a 062b 062c 062d 062e 062f
+0630 0631 0632 0633 0634 0635 0636 0637 0638 0639 063a XXXX XXXX XXXX XXXX XXXX
+0640 0641 0642 0643 0644 0645 0646 0647 0648 0649 064a 064b 064c 064d 064e 064f
+0650 0651 0652 XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX
+
+charset CS_ISO8859_7
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
+0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
+00a0 2018 2019 00a3 XXXX XXXX 00a6 00a7 00a8 00a9 XXXX 00ab 00ac 00ad XXXX 2015
+00b0 00b1 00b2 00b3 0384 0385 0386 00b7 0388 0389 038a 00bb 038c 00bd 038e 038f
+0390 0391 0392 0393 0394 0395 0396 0397 0398 0399 039a 039b 039c 039d 039e 039f
+03a0 03a1 XXXX 03a3 03a4 03a5 03a6 03a7 03a8 03a9 03aa 03ab 03ac 03ad 03ae 03af
+03b0 03b1 03b2 03b3 03b4 03b5 03b6 03b7 03b8 03b9 03ba 03bb 03bc 03bd 03be 03bf
+03c0 03c1 03c2 03c3 03c4 03c5 03c6 03c7 03c8 03c9 03ca 03cb 03cc 03cd 03ce XXXX
+
+charset CS_ISO8859_8
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
+0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
+00a0 XXXX 00a2 00a3 00a4 00a5 00a6 00a7 00a8 00a9 00d7 00ab 00ac 00ad 00ae 00af
+00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 00f7 00bb 00bc 00bd 00be XXXX
+XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX
+XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX 2017
+05d0 05d1 05d2 05d3 05d4 05d5 05d6 05d7 05d8 05d9 05da 05db 05dc 05dd 05de 05df
+05e0 05e1 05e2 05e3 05e4 05e5 05e6 05e7 05e8 05e9 05ea XXXX XXXX 200e 200f XXXX
+
+charset CS_ISO8859_9
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
+0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
+00a0 00a1 00a2 00a3 00a4 00a5 00a6 00a7 00a8 00a9 00aa 00ab 00ac 00ad 00ae 00af
+00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 00ba 00bb 00bc 00bd 00be 00bf
+00c0 00c1 00c2 00c3 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf
+011e 00d1 00d2 00d3 00d4 00d5 00d6 00d7 00d8 00d9 00da 00db 00dc 0130 015e 00df
+00e0 00e1 00e2 00e3 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef
+011f 00f1 00f2 00f3 00f4 00f5 00f6 00f7 00f8 00f9 00fa 00fb 00fc 0131 015f 00ff
+
+charset CS_ISO8859_10
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
+0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
+00a0 0104 0112 0122 012a 0128 0136 00a7 013b 0110 0160 0166 017d 00ad 016a 014a
+00b0 0105 0113 0123 012b 0129 0137 00b7 013c 0111 0161 0167 017e 2015 016b 014b
+0100 00c1 00c2 00c3 00c4 00c5 00c6 012e 010c 00c9 0118 00cb 0116 00cd 00ce 00cf
+00d0 0145 014c 00d3 00d4 00d5 00d6 0168 00d8 0172 00da 00db 00dc 00dd 00de 00df
+0101 00e1 00e2 00e3 00e4 00e5 00e6 012f 010d 00e9 0119 00eb 0117 00ed 00ee 00ef
+00f0 0146 014d 00f3 00f4 00f5 00f6 0169 00f8 0173 00fa 00fb 00fc 00fd 00fe 0138
+
+charset CS_ISO8859_11
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
+0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
+00a0 0e01 0e02 0e03 0e04 0e05 0e06 0e07 0e08 0e09 0e0a 0e0b 0e0c 0e0d 0e0e 0e0f
+0e10 0e11 0e12 0e13 0e14 0e15 0e16 0e17 0e18 0e19 0e1a 0e1b 0e1c 0e1d 0e1e 0e1f
+0e20 0e21 0e22 0e23 0e24 0e25 0e26 0e27 0e28 0e29 0e2a 0e2b 0e2c 0e2d 0e2e 0e2f
+0e30 0e31 0e32 0e33 0e34 0e35 0e36 0e37 0e38 0e39 0e3a XXXX XXXX XXXX XXXX 0e3f
+0e40 0e41 0e42 0e43 0e44 0e45 0e46 0e47 0e48 0e49 0e4a 0e4b 0e4c 0e4d 0e4e 0e4f
+0e50 0e51 0e52 0e53 0e54 0e55 0e56 0e57 0e58 0e59 0e5a 0e5b XXXX XXXX XXXX XXXX
+
+charset CS_ISO8859_13
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
+0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
+00a0 201d 00a2 00a3 00a4 201e 00a6 00a7 00d8 00a9 0156 00ab 00ac 00ad 00ae 00c6
+00b0 00b1 00b2 00b3 201c 00b5 00b6 00b7 00f8 00b9 0157 00bb 00bc 00bd 00be 00e6
+0104 012e 0100 0106 00c4 00c5 0118 0112 010c 00c9 0179 0116 0122 0136 012a 013b
+0160 0143 0145 00d3 014c 00d5 00d6 00d7 0172 0141 015a 016a 00dc 017b 017d 00df
+0105 012f 0101 0107 00e4 00e5 0119 0113 010d 00e9 017a 0117 0123 0137 012b 013c
+0161 0144 0146 00f3 014d 00f5 00f6 00f7 0173 0142 015b 016b 00fc 017c 017e 2019
+
+charset CS_ISO8859_14
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
+0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
+00a0 1e02 1e03 00a3 010a 010b 1e0a 00a7 1e80 00a9 1e82 1e0b 1ef2 00ad 00ae 0178
+1e1e 1e1f 0120 0121 1e40 1e41 00b6 1e56 1e81 1e57 1e83 1e60 1ef3 1e84 1e85 1e61
+00c0 00c1 00c2 00c3 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf
+0174 00d1 00d2 00d3 00d4 00d5 00d6 1e6a 00d8 00d9 00da 00db 00dc 00dd 0176 00df
+00e0 00e1 00e2 00e3 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef
+0175 00f1 00f2 00f3 00f4 00f5 00f6 1e6b 00f8 00f9 00fa 00fb 00fc 00fd 0177 00ff
+
+charset CS_ISO8859_15
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
+0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
+00a0 00a1 00a2 00a3 20ac 00a5 0160 00a7 0161 00a9 00aa 00ab 00ac 00ad 00ae 00af
+00b0 00b1 00b2 00b3 017d 00b5 00b6 00b7 017e 00b9 00ba 00bb 0152 0153 0178 00bf
+00c0 00c1 00c2 00c3 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf
+00d0 00d1 00d2 00d3 00d4 00d5 00d6 00d7 00d8 00d9 00da 00db 00dc 00dd 00de 00df
+00e0 00e1 00e2 00e3 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef
+00f0 00f1 00f2 00f3 00f4 00f5 00f6 00f7 00f8 00f9 00fa 00fb 00fc 00fd 00fe 00ff
+
+charset CS_ISO8859_16
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
+0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
+00a0 0104 0105 0141 20ac 201e 0160 00a7 0161 00a9 0218 00ab 0179 00ad 017a 017b
+00b0 00b1 010c 0142 017d 201d 00b6 00b7 017e 010d 0219 00bb 0152 0153 0178 017c
+00c0 00c1 00c2 0102 00c4 0106 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf
+0110 0143 00d2 00d3 00d4 0150 00d6 015a 0170 00d9 00da 00db 00dc 0118 021a 00df
+00e0 00e1 00e2 0103 00e4 0107 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef
+0111 0144 00f2 00f3 00f4 0151 00f6 015b 0171 00f9 00fa 00fb 00fc 0119 021b 00ff
+
+ Some X fonts are encoded in a variant form of ISO8859-1:
+ everything from 0x20 (space) upwards is as normal, but the first
+ 32 characters (0x00 to 0x1F) contain the VT100 line drawing glyphs
+ as they would appear from positions 0x5F to 0x7E inclusive. Here
+ is the modified ISO8859-1 code table.
+
+charset CS_ISO8859_1_X11
+0020 2666 2592 2409 240c 240d 240a 00b0 00b1 2424 240b 2518 2510 250c 2514 253c
+23ba 23bb 2500 23bc 23bd 251c 2524 2534 252c 2502 2264 2265 03c0 2260 00a3 00b7
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
+0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
+00a0 00a1 00a2 00a3 00a4 00a5 00a6 00a7 00a8 00a9 00aa 00ab 00ac 00ad 00ae 00af
+00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 00ba 00bb 00bc 00bd 00be 00bf
+00c0 00c1 00c2 00c3 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf
+00d0 00d1 00d2 00d3 00d4 00d5 00d6 00d7 00d8 00d9 00da 00db 00dc 00dd 00de 00df
+00e0 00e1 00e2 00e3 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef
+00f0 00f1 00f2 00f3 00f4 00f5 00f6 00f7 00f8 00f9 00fa 00fb 00fc 00fd 00fe 00ff
+
+ Here are some PC (old DOS) code pages, generated by this piece of
+ Bourne shell:
+
+ for i in 437 850; do
+ echo charset CS_CP$i
+ gensbcs http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP$i.TXT
+ echo
+ done
+
+charset CS_CP437
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+00c7 00fc 00e9 00e2 00e4 00e0 00e5 00e7 00ea 00eb 00e8 00ef 00ee 00ec 00c4 00c5
+00c9 00e6 00c6 00f4 00f6 00f2 00fb 00f9 00ff 00d6 00dc 00a2 00a3 00a5 20a7 0192
+00e1 00ed 00f3 00fa 00f1 00d1 00aa 00ba 00bf 2310 00ac 00bd 00bc 00a1 00ab 00bb
+2591 2592 2593 2502 2524 2561 2562 2556 2555 2563 2551 2557 255d 255c 255b 2510
+2514 2534 252c 251c 2500 253c 255e 255f 255a 2554 2569 2566 2560 2550 256c 2567
+2568 2564 2565 2559 2558 2552 2553 256b 256a 2518 250c 2588 2584 258c 2590 2580
+03b1 00df 0393 03c0 03a3 03c3 00b5 03c4 03a6 0398 03a9 03b4 221e 03c6 03b5 2229
+2261 00b1 2265 2264 2320 2321 00f7 2248 00b0 2219 00b7 221a 207f 00b2 25a0 00a0
+
+charset CS_CP850
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+00c7 00fc 00e9 00e2 00e4 00e0 00e5 00e7 00ea 00eb 00e8 00ef 00ee 00ec 00c4 00c5
+00c9 00e6 00c6 00f4 00f6 00f2 00fb 00f9 00ff 00d6 00dc 00f8 00a3 00d8 00d7 0192
+00e1 00ed 00f3 00fa 00f1 00d1 00aa 00ba 00bf 00ae 00ac 00bd 00bc 00a1 00ab 00bb
+2591 2592 2593 2502 2524 00c1 00c2 00c0 00a9 2563 2551 2557 255d 00a2 00a5 2510
+2514 2534 252c 251c 2500 253c 00e3 00c3 255a 2554 2569 2566 2560 2550 256c 00a4
+00f0 00d0 00ca 00cb 00c8 0131 00cd 00ce 00cf 2518 250c 2588 2584 00a6 00cc 2580
+00d3 00df 00d4 00d2 00f5 00d5 00b5 00fe 00de 00da 00db 00d9 00fd 00dd 00af 00b4
+00ad 00b1 2017 00be 00b6 00a7 00f7 00b8 00b0 00a8 00b7 00b9 00b3 00b2 25a0 00a0
+
+ Here are some Windows code pages, generated by this piece of
+ Bourne shell:
+
+ for i in 1250 1251 1252 1253 1254 1255 1256 1257 1258; do
+ echo charset CS_CP$i
+ gensbcs http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP$i.TXT
+ echo
+ done
+
+charset CS_CP1250
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+20ac XXXX 201a XXXX 201e 2026 2020 2021 XXXX 2030 0160 2039 015a 0164 017d 0179
+XXXX 2018 2019 201c 201d 2022 2013 2014 XXXX 2122 0161 203a 015b 0165 017e 017a
+00a0 02c7 02d8 0141 00a4 0104 00a6 00a7 00a8 00a9 015e 00ab 00ac 00ad 00ae 017b
+00b0 00b1 02db 0142 00b4 00b5 00b6 00b7 00b8 0105 015f 00bb 013d 02dd 013e 017c
+0154 00c1 00c2 0102 00c4 0139 0106 00c7 010c 00c9 0118 00cb 011a 00cd 00ce 010e
+0110 0143 0147 00d3 00d4 0150 00d6 00d7 0158 016e 00da 0170 00dc 00dd 0162 00df
+0155 00e1 00e2 0103 00e4 013a 0107 00e7 010d 00e9 0119 00eb 011b 00ed 00ee 010f
+0111 0144 0148 00f3 00f4 0151 00f6 00f7 0159 016f 00fa 0171 00fc 00fd 0163 02d9
+
+charset CS_CP1251
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+0402 0403 201a 0453 201e 2026 2020 2021 20ac 2030 0409 2039 040a 040c 040b 040f
+0452 2018 2019 201c 201d 2022 2013 2014 XXXX 2122 0459 203a 045a 045c 045b 045f
+00a0 040e 045e 0408 00a4 0490 00a6 00a7 0401 00a9 0404 00ab 00ac 00ad 00ae 0407
+00b0 00b1 0406 0456 0491 00b5 00b6 00b7 0451 2116 0454 00bb 0458 0405 0455 0457
+0410 0411 0412 0413 0414 0415 0416 0417 0418 0419 041a 041b 041c 041d 041e 041f
+0420 0421 0422 0423 0424 0425 0426 0427 0428 0429 042a 042b 042c 042d 042e 042f
+0430 0431 0432 0433 0434 0435 0436 0437 0438 0439 043a 043b 043c 043d 043e 043f
+0440 0441 0442 0443 0444 0445 0446 0447 0448 0449 044a 044b 044c 044d 044e 044f
+
+charset CS_CP1252
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+20ac XXXX 201a 0192 201e 2026 2020 2021 02c6 2030 0160 2039 0152 XXXX 017d XXXX
+XXXX 2018 2019 201c 201d 2022 2013 2014 02dc 2122 0161 203a 0153 XXXX 017e 0178
+00a0 00a1 00a2 00a3 00a4 00a5 00a6 00a7 00a8 00a9 00aa 00ab 00ac 00ad 00ae 00af
+00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 00ba 00bb 00bc 00bd 00be 00bf
+00c0 00c1 00c2 00c3 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf
+00d0 00d1 00d2 00d3 00d4 00d5 00d6 00d7 00d8 00d9 00da 00db 00dc 00dd 00de 00df
+00e0 00e1 00e2 00e3 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef
+00f0 00f1 00f2 00f3 00f4 00f5 00f6 00f7 00f8 00f9 00fa 00fb 00fc 00fd 00fe 00ff
+
+charset CS_CP1253
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+20ac XXXX 201a 0192 201e 2026 2020 2021 XXXX 2030 XXXX 2039 XXXX XXXX XXXX XXXX
+XXXX 2018 2019 201c 201d 2022 2013 2014 XXXX 2122 XXXX 203a XXXX XXXX XXXX XXXX
+00a0 0385 0386 00a3 00a4 00a5 00a6 00a7 00a8 00a9 XXXX 00ab 00ac 00ad 00ae 2015
+00b0 00b1 00b2 00b3 0384 00b5 00b6 00b7 0388 0389 038a 00bb 038c 00bd 038e 038f
+0390 0391 0392 0393 0394 0395 0396 0397 0398 0399 039a 039b 039c 039d 039e 039f
+03a0 03a1 XXXX 03a3 03a4 03a5 03a6 03a7 03a8 03a9 03aa 03ab 03ac 03ad 03ae 03af
+03b0 03b1 03b2 03b3 03b4 03b5 03b6 03b7 03b8 03b9 03ba 03bb 03bc 03bd 03be 03bf
+03c0 03c1 03c2 03c3 03c4 03c5 03c6 03c7 03c8 03c9 03ca 03cb 03cc 03cd 03ce XXXX
+
+charset CS_CP1254
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+20ac XXXX 201a 0192 201e 2026 2020 2021 02c6 2030 0160 2039 0152 XXXX XXXX XXXX
+XXXX 2018 2019 201c 201d 2022 2013 2014 02dc 2122 0161 203a 0153 XXXX XXXX 0178
+00a0 00a1 00a2 00a3 00a4 00a5 00a6 00a7 00a8 00a9 00aa 00ab 00ac 00ad 00ae 00af
+00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 00ba 00bb 00bc 00bd 00be 00bf
+00c0 00c1 00c2 00c3 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf
+011e 00d1 00d2 00d3 00d4 00d5 00d6 00d7 00d8 00d9 00da 00db 00dc 0130 015e 00df
+00e0 00e1 00e2 00e3 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef
+011f 00f1 00f2 00f3 00f4 00f5 00f6 00f7 00f8 00f9 00fa 00fb 00fc 0131 015f 00ff
+
+charset CS_CP1255
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+20ac XXXX 201a 0192 201e 2026 2020 2021 02c6 2030 XXXX 2039 XXXX XXXX XXXX XXXX
+XXXX 2018 2019 201c 201d 2022 2013 2014 02dc 2122 XXXX 203a XXXX XXXX XXXX XXXX
+00a0 00a1 00a2 00a3 20aa 00a5 00a6 00a7 00a8 00a9 00d7 00ab 00ac 00ad 00ae 00af
+00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 00f7 00bb 00bc 00bd 00be 00bf
+05b0 05b1 05b2 05b3 05b4 05b5 05b6 05b7 05b8 05b9 XXXX 05bb 05bc 05bd 05be 05bf
+05c0 05c1 05c2 05c3 05f0 05f1 05f2 05f3 05f4 XXXX XXXX XXXX XXXX XXXX XXXX XXXX
+05d0 05d1 05d2 05d3 05d4 05d5 05d6 05d7 05d8 05d9 05da 05db 05dc 05dd 05de 05df
+05e0 05e1 05e2 05e3 05e4 05e5 05e6 05e7 05e8 05e9 05ea XXXX XXXX 200e 200f XXXX
+
+charset CS_CP1256
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+20ac 067e 201a 0192 201e 2026 2020 2021 02c6 2030 0679 2039 0152 0686 0698 0688
+06af 2018 2019 201c 201d 2022 2013 2014 06a9 2122 0691 203a 0153 200c 200d 06ba
+00a0 060c 00a2 00a3 00a4 00a5 00a6 00a7 00a8 00a9 06be 00ab 00ac 00ad 00ae 00af
+00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 061b 00bb 00bc 00bd 00be 061f
+06c1 0621 0622 0623 0624 0625 0626 0627 0628 0629 062a 062b 062c 062d 062e 062f
+0630 0631 0632 0633 0634 0635 0636 00d7 0637 0638 0639 063a 0640 0641 0642 0643
+00e0 0644 00e2 0645 0646 0647 0648 00e7 00e8 00e9 00ea 00eb 0649 064a 00ee 00ef
+064b 064c 064d 064e 00f4 064f 0650 00f7 0651 00f9 0652 00fb 00fc 200e 200f 06d2
+
+charset CS_CP1257
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+20ac XXXX 201a XXXX 201e 2026 2020 2021 XXXX 2030 XXXX 2039 XXXX 00a8 02c7 00b8
+XXXX 2018 2019 201c 201d 2022 2013 2014 XXXX 2122 XXXX 203a XXXX 00af 02db XXXX
+00a0 XXXX 00a2 00a3 00a4 XXXX 00a6 00a7 00d8 00a9 0156 00ab 00ac 00ad 00ae 00c6
+00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00f8 00b9 0157 00bb 00bc 00bd 00be 00e6
+0104 012e 0100 0106 00c4 00c5 0118 0112 010c 00c9 0179 0116 0122 0136 012a 013b
+0160 0143 0145 00d3 014c 00d5 00d6 00d7 0172 0141 015a 016a 00dc 017b 017d 00df
+0105 012f 0101 0107 00e4 00e5 0119 0113 010d 00e9 017a 0117 0123 0137 012b 013c
+0161 0144 0146 00f3 014d 00f5 00f6 00f7 0173 0142 015b 016b 00fc 017c 017e 02d9
+
+charset CS_CP1258
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+20ac XXXX 201a 0192 201e 2026 2020 2021 02c6 2030 XXXX 2039 0152 XXXX XXXX XXXX
+XXXX 2018 2019 201c 201d 2022 2013 2014 02dc 2122 XXXX 203a 0153 XXXX XXXX 0178
+00a0 00a1 00a2 00a3 00a4 00a5 00a6 00a7 00a8 00a9 00aa 00ab 00ac 00ad 00ae 00af
+00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 00ba 00bb 00bc 00bd 00be 00bf
+00c0 00c1 00c2 0102 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 0300 00cd 00ce 00cf
+0110 00d1 0309 00d3 00d4 01a0 00d6 00d7 00d8 00d9 00da 00db 00dc 01af 0303 00df
+00e0 00e1 00e2 0103 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 0301 00ed 00ee 00ef
+0111 00f1 0323 00f3 00f4 01a1 00f6 00f7 00f8 00f9 00fa 00fb 00fc 01b0 20ab 00ff
+
+ KOI8-R, generated by this code:
+
+ { echo charset CS_KOI8_R;
+ gensbcs http://www.unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-R.TXT; }
+
+charset CS_KOI8_R
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+2500 2502 250c 2510 2514 2518 251c 2524 252c 2534 253c 2580 2584 2588 258c 2590
+2591 2592 2593 2320 25a0 2219 221a 2248 2264 2265 00a0 2321 00b0 00b2 00b7 00f7
+2550 2551 2552 0451 2553 2554 2555 2556 2557 2558 2559 255a 255b 255c 255d 255e
+255f 2560 2561 0401 2562 2563 2564 2565 2566 2567 2568 2569 256a 256b 256c 00a9
+044e 0430 0431 0446 0434 0435 0444 0433 0445 0438 0439 043a 043b 043c 043d 043e
+043f 044f 0440 0441 0442 0443 0436 0432 044c 044b 0437 0448 044d 0449 0447 044a
+042e 0410 0411 0426 0414 0415 0424 0413 0425 0418 0419 041a 041b 041c 041d 041e
+041f 042f 0420 0421 0422 0423 0416 0412 042c 042b 0417 0428 042d 0429 0427 042a
+
+ KOI8-U: I can't find an easily machine-processable mapping table
+ for this one, so I've created it by hand-editing the KOI8-R
+ mapping table in accordance with the list of differences specified
+ in RFC2319. Note that RFC2319 has an apparent error: position B4
+ is listed as U+0404 in the main character set list, but as U+0403
+ in Appendix A (differences from KOI8-R). Both agree that it should
+ be CYRILLIC CAPITAL LETTER UKRAINIAN IE, however, and the Unicode
+ character database says that therefore U+0404 is the correct value.
+
+charset CS_KOI8_U
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+2500 2502 250c 2510 2514 2518 251c 2524 252c 2534 253c 2580 2584 2588 258c 2590
+2591 2592 2593 2320 25a0 2219 221a 2248 2264 2265 00a0 2321 00b0 00b2 00b7 00f7
+2550 2551 2552 0451 0454 2554 0456 0457 2557 2558 2559 255a 255b 0491 255d 255e
+255f 2560 2561 0401 0404 2563 0406 0407 2566 2567 2568 2569 256a 0490 256c 00a9
+044e 0430 0431 0446 0434 0435 0444 0433 0445 0438 0439 043a 043b 043c 043d 043e
+043f 044f 0440 0441 0442 0443 0436 0432 044c 044b 0437 0448 044d 0449 0447 044a
+042e 0410 0411 0426 0414 0415 0424 0413 0425 0418 0419 041a 041b 041c 041d 041e
+041f 042f 0420 0421 0422 0423 0416 0412 042c 042b 0417 0428 042d 0429 0427 042a
+
+ Mac Roman, generated by this code:
+
+ { echo charset CS_MAC_ROMAN;
+ gensbcs http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMAN.TXT; }
+
+ The code point F8FF at position F0 is an interesting one. In
+ Unicode, it's the last of the Private Use section. The mapping
+ table states that it should be an Apple logo. I suppose we should
+ just leave it as it is; there's bound to be some software out
+ there that understands U+F8FF to be an Apple logo!
+
+charset CS_MAC_ROMAN
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+00c4 00c5 00c7 00c9 00d1 00d6 00dc 00e1 00e0 00e2 00e4 00e3 00e5 00e7 00e9 00e8
+00ea 00eb 00ed 00ec 00ee 00ef 00f1 00f3 00f2 00f4 00f6 00f5 00fa 00f9 00fb 00fc
+2020 00b0 00a2 00a3 00a7 2022 00b6 00df 00ae 00a9 2122 00b4 00a8 2260 00c6 00d8
+221e 00b1 2264 2265 00a5 00b5 2202 2211 220f 03c0 222b 00aa 00ba 03a9 00e6 00f8
+00bf 00a1 00ac 221a 0192 2248 2206 00ab 00bb 2026 00a0 00c0 00c3 00d5 0152 0153
+2013 2014 201c 201d 2018 2019 00f7 25ca 00ff 0178 2044 20ac 2039 203a fb01 fb02
+2021 00b7 201a 201e 2030 00c2 00ca 00c1 00cb 00c8 00cd 00ce 00cf 00cc 00d3 00d4
+f8ff 00d2 00da 00db 00d9 0131 02c6 02dc 00af 02d8 02d9 02da 00b8 02dd 02db 02c7
+
+ Roman Czyborra's web site (http://czyborra.com/) has a variety of
+ other useful mapping tables, in a slightly different format (and
+ gzipped). Here's a shell/Perl function to generate an SBCS table
+ from a Czyborra mapping table:
+
+ gensbcs_c() {
+ wget -q -O - "$1" | gzip -d | \
+ perl -ne '/^=(.*)\s+U\+(.*)\s+/ and $a[hex $1]=sprintf "%04x", hex $2;' \
+ -e 'BEGIN{for($i=0;$i<256;$i++){$a[$i]="XXXX";' \
+ -e 'if ($i < 32 or ($i >=127 and $i < 160)) {$a[$i]=sprintf "%04x", $i}}}' \
+ -e 'END{for($i=0;$i<256;$i++){printf"%s%s",$a[$i],$i%16==15?"\n":" "}}'
+ }
+
+ So here we have some character sets generated from Czyborra
+ mapping tables: VISCII, HP-Roman8, and the DEC Multinational
+ Character Set.
+
+ { echo charset CS_VISCII;
+ gensbcs_c http://czyborra.com/charsets/viscii.txt.gz; echo;
+ echo charset CS_HP_ROMAN8;
+ gensbcs_c http://czyborra.com/charsets/hp-roman8.txt.gz; echo;
+ echo charset CS_DEC_MCS;
+ gensbcs_c http://czyborra.com/charsets/dec-mcs.txt.gz; echo; }
+
+charset CS_VISCII
+0000 0001 1eb2 0003 0004 1eb4 1eaa 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 1ef6 0015 0016 0017 0018 1ef8 001a 001b 001c 001d 1ef4 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+1ea0 1eae 1eb0 1eb6 1ea4 1ea6 1ea8 1eac 1ebc 1eb8 1ebe 1ec0 1ec2 1ec4 1ec6 1ed0
+1ed2 1ed4 1ed6 1ed8 1ee2 1eda 1edc 1ede 1eca 1ece 1ecc 1ec8 1ee6 0168 1ee4 1ef2
+00d5 1eaf 1eb1 1eb7 1ea5 1ea7 1ea9 1ead 1ebd 1eb9 1ebf 1ec1 1ec3 1ec5 1ec7 1ed1
+1ed3 1ed5 1ed7 1ee0 01a0 1ed9 1edd 1edf 1ecb 1ef0 1ee8 1eea 1eec 01a1 1edb 01af
+00c0 00c1 00c2 00c3 1ea2 0102 1eb3 1eb5 00c8 00c9 00ca 1eba 00cc 00cd 0128 1ef3
+0110 1ee9 00d2 00d3 00d4 1ea1 1ef7 1eeb 1eed 00d9 00da 1ef9 1ef5 00dd 1ee1 01b0
+00e0 00e1 00e2 00e3 1ea3 0103 1eef 1eab 00e8 00e9 00ea 1ebb 00ec 00ed 0129 1ec9
+0111 1ef1 00f2 00f3 00f4 00f5 1ecf 1ecd 1ee5 00f9 00fa 0169 1ee7 00fd 1ee3 1eee
+
+charset CS_HP_ROMAN8
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
+0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
+00a0 00c0 00c2 00c8 00ca 00cb 00ce 00cf 00b4 02cb 02c6 00a8 02dc 00d9 00db 20a4
+00af 00dd 00fd 00b0 00c7 00e7 00d1 00f1 00a1 00bf 00a4 00a3 00a5 00a7 0192 00a2
+00e2 00ea 00f4 00fb 00e1 00e9 00f3 00fa 00e0 00e8 00f2 00f9 00e4 00eb 00f6 00fc
+00c5 00ee 00d8 00c6 00e5 00ed 00f8 00e6 00c4 00ec 00d6 00dc 00c9 00ef 00df 00d4
+00c1 00c3 00e3 00d0 00f0 00cd 00cc 00d3 00d2 00d5 00f5 0160 0161 00da 0178 00ff
+00de 00fe 00b7 00b5 00b6 00be 2014 00bc 00bd 00aa 00ba 00ab 25a0 00bb 00b1 XXXX
+
+charset CS_DEC_MCS
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f
+0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f
+0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f
+XXXX 00a1 00a2 00a3 XXXX 00a5 XXXX 00a7 00a4 00a9 00aa 00ab XXXX XXXX XXXX XXXX
+00b0 00b1 00b2 00b3 XXXX 00b5 00b6 00b7 XXXX 00b9 00ba 00bb 00bc 00bd XXXX 00bf
+00c0 00c1 00c2 00c3 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf
+XXXX 00d1 00d2 00d3 00d4 00d5 00d6 0152 00d8 00d9 00da 00db 00dc 0178 XXXX 00df
+00e0 00e1 00e2 00e3 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef
+XXXX 00f1 00f2 00f3 00f4 00f5 00f6 0153 00f8 00f9 00fa 00fb 00fc 00ff XXXX XXXX
--- /dev/null
+#!/usr/bin/env perl
+
+# This script generates sbcsdat.c (the data for all the SBCSes) from its
+# source form sbcs.dat.
+#
+# Input format, as parsed below: a line "charset NAME" starts a new
+# table; every following line that begins with a hex digit or 'X' holds
+# space-separated hex code points, with "XXXX" meaning "no mapping".
+# As soon as exactly 256 values have accumulated the table is written
+# out. Any other line is ignored.
+
+# Warnings are enabled here rather than with "-w" on the #! line: many
+# kernels pass everything after the interpreter path to env as a single
+# argument, so "env perl -w" would look for a program named "perl -w".
+use warnings;
+
+$infile = "sbcs.dat";
+$outfile = "sbcsdat.c";
+
+# Fail loudly instead of silently producing an empty or missing output.
+open(FOO, "<", $infile) or die "$infile: unable to open for reading: $!\n";
+open(BAR, ">", $outfile) or die "$outfile: unable to open for writing: $!\n";
+select BAR;
+
+print "/*\n";
+print " * sbcsdat.c - data definitions for single-byte character sets.\n";
+print " *\n";
+print " * Generated by sbcsgen.pl from sbcs.dat.\n";
+print " * You should edit those files rather than editing this one.\n";
+print " */\n";
+print "\n";
+print "#ifndef ENUM_CHARSETS\n";
+print "\n";
+print "#include \"charset.h\"\n";
+print "#include \"internal.h\"\n";
+print "\n";
+
+my $charsetname = undef;
+my @vals = ();
+
+# Names of all charsets emitted so far, in input order; used for the
+# ENUM_CHARSET list at the end of the output.
+my @charsetnames = ();
+
+while (<FOO>) {
+    chomp;
+    if (/^charset (.*)$/) {
+        # Start of a new table: remember its name, discard stale values.
+        $charsetname = $1;
+        @vals = ();
+    } elsif (/^[0-9a-fA-FX]/) {
+        # A row of table data; XXXX becomes -1 (undefined position).
+        push @vals, map { $_ eq "XXXX" ? -1 : hex $_ } split / +/, $_;
+        if (scalar @vals > 256) {
+            die "$infile:$.: charset $charsetname has more than 256 values\n";
+        } elsif (scalar @vals == 256) {
+            &outcharset($charsetname, @vals);
+            push @charsetnames, $charsetname;
+            $charsetname = undef;
+            @vals = ();
+        }
+    }
+}
+close FOO;
+
+print "#else /* ENUM_CHARSETS */\n";
+print "\n";
+
+foreach $i (@charsetnames) {
+    print "ENUM_CHARSET($i)\n";
+}
+
+print "\n";
+print "#endif /* ENUM_CHARSETS */\n";
+
+# A failed close is how buffered write errors (e.g. disk full) show up.
+close(BAR) or die "$outfile: error while writing: $!\n";
+
+sub outcharset($@) {
+    # Write the C definitions for one single-byte charset to the
+    # currently selected filehandle: the 256-entry byte -> Unicode table,
+    # the list of mapped bytes ordered by the code point they map to,
+    # the length of that list, and the charset_spec that ties them to
+    # read_sbcs/write_sbcs. $name is the charset identifier; @vals holds
+    # 256 code points, with negative values marking unmapped positions.
+    my ($name, @vals) = @_;
+    my @mapped = ();    # [byte, code point] for every mapped position
+    my $sep;
+
+    print "static const sbcs_data data_$name = {\n";
+    print " {\n";
+    $sep = " ";
+    foreach my $byte (0 .. 255) {
+        my $ucs = $vals[$byte];
+        if ($ucs >= 0) {
+            printf "%s0x%04x", $sep, $ucs;
+            push @mapped, [$byte, $ucs];
+        } else {
+            printf "%sERROR ", $sep;
+        }
+        # Eight entries per output line.
+        $sep = ($byte % 8 == 7) ? ",\n " : ", ";
+    }
+    print "\n },\n {\n";
+
+    # Reverse lookup: byte values ordered by ascending code point.
+    my @bytes = map { $_->[0] } sort { $a->[1] <=> $b->[1] } @mapped;
+    $sep = " ";
+    foreach my $n (0 .. $#bytes) {
+        printf "%s0x%02x", $sep, $bytes[$n];
+        $sep = ($n % 8 == 7) ? ",\n " : ", ";
+    }
+    printf "\n },\n %d\n", scalar @bytes;
+    print "};\n";
+    print "const charset_spec charset_$name = {\n" .
+        " $name, read_sbcs, write_sbcs, &data_$name\n};\n\n";
+}
--- /dev/null
+/*
+ * slookup.c - static lookup of character sets.
+ */
+
+#include "charset.h"
+#include "internal.h"
+
+#define ENUM_CHARSET(x) extern charset_spec const charset_##x; /* pass 1: each ENUM_CHARSET(NAME) line that enum.c expands declares charset_NAME */
+#include "enum.c"
+#undef ENUM_CHARSET
+/*
+ * Table of pointers to every charset spec declared above; it is
+ * searched linearly by charset_find_spec().
+ */
+static charset_spec const *const cs_table[] = {
+/* pass 2: the same enum.c list, this time expanding each entry to one
+ * `&charset_NAME,' initialiser */
+#define ENUM_CHARSET(x) &charset_##x,
+#include "enum.c"
+#undef ENUM_CHARSET
+
+};
+
+/*
+ * Look up the spec whose `charset' id equals the argument. Returns the
+ * first matching entry of cs_table, or NULL when no entry matches.
+ */
+charset_spec const *charset_find_spec(int charset)
+{
+    charset_spec const *const *entry = cs_table;
+    charset_spec const *const *const end = cs_table + lenof(cs_table);
+
+    while (entry != end) {
+        if ((*entry)->charset == charset)
+            return *entry;
+        entry++;
+    }
+
+    return NULL;
+}
--- /dev/null
+/*
+ * toucs.c - convert charsets to Unicode.
+ */
+
+#include "charset.h"
+#include "internal.h"
+
+struct unicode_emit_param { /* context handed to unicode_emit() through its void *ctx argument */
+ wchar_t *output; /* next free slot in the caller's output buffer */
+ int outlen; /* number of free slots left at `output' */
+ const wchar_t *errstr; /* text substituted for ERROR, or NULL to use U+FFFD */
+ int errlen; /* number of wchar_t in errstr */
+ int stopped; /* set to 1 when an emit did not fit in the remaining space */
+};
+
+/*
+ * Emit callback used by charset_to_unicode(). `ctx' points at a struct
+ * unicode_emit_param. An ordinary value is stored as one wchar_t; ERROR
+ * is replaced by the caller's error string if there is one, otherwise
+ * by U+FFFD. If the whole replacement does not fit in the remaining
+ * output space, nothing is written and `stopped' is set instead.
+ */
+static void unicode_emit(void *ctx, long int output)
+{
+    struct unicode_emit_param *param = (struct unicode_emit_param *)ctx;
+    wchar_t single;
+    wchar_t const *src = &single;
+    int count = 1;
+    int i;
+
+    if (output != ERROR) {
+        single = output;
+    } else if (param->errstr) {
+        src = param->errstr;
+        count = param->errlen;
+    } else {
+        single = 0xFFFD;               /* U+FFFD REPLACEMENT CHARACTER */
+    }
+
+    if (param->outlen < count) {
+        param->stopped = 1;
+        return;
+    }
+
+    for (i = 0; i < count; i++) {
+        *param->output++ = src[i];
+        param->outlen--;
+    }
+}
+
+/*
+ * Convert text in the given charset to Unicode.
+ *
+ *  - *input / *inlen describe the bytes still to be converted. Both are
+ *    advanced past every byte that was fully processed.
+ *  - output / outlen describe the destination buffer, in wchar_t units.
+ *  - state, if non-NULL, carries multibyte decoder state between calls
+ *    and is updated after each byte that was fully processed. If NULL,
+ *    decoding starts from s0 == 0 and the final state is discarded.
+ *  - errstr / errlen give the text written in place of undecodable
+ *    input; with errstr == NULL, U+FFFD is written instead.
+ *
+ * Returns the number of wchar_t written. Conversion stops early when the
+ * output produced by the next input byte would not fit; that byte is
+ * then left unconsumed, the state is as it was before it, and the
+ * returned count excludes anything it may have partially written.
+ *
+ * If `charset' is not known to charset_find_spec(), every input byte is
+ * reported as an error rather than dereferencing a NULL spec.
+ */
+int charset_to_unicode(char **input, int *inlen, wchar_t *output, int outlen,
+                       int charset, charset_state *state,
+                       const wchar_t *errstr, int errlen)
+{
+    charset_spec const *spec = charset_find_spec(charset);
+    charset_state localstate;
+    struct unicode_emit_param param;
+
+    param.output = output;
+    param.outlen = outlen;
+    param.errstr = errstr;
+    param.errlen = errlen;
+    param.stopped = 0;
+
+    if (!state) {
+        localstate.s0 = 0;
+    } else {
+        localstate = *state;           /* structure copy */
+    }
+
+    while (*inlen > 0) {
+        int lenbefore = param.output - output;
+
+        if (spec) {
+            spec->read(spec, (unsigned char)**input, &localstate,
+                       unicode_emit, &param);
+        } else {
+            /* Unknown charset: nothing can be decoded. */
+            unicode_emit(&param, ERROR);
+        }
+
+        if (param.stopped) {
+            /*
+             * The emit function has _tried_ to output some
+             * characters, but ran up against the end of the
+             * buffer. Leave immediately, and return what happened
+             * _before_ attempting to process this character.
+             */
+            return lenbefore;
+        }
+        if (state)
+            *state = localstate;       /* structure copy */
+        (*input)++;
+        (*inlen)--;
+    }
+
+    return param.output - output;
+}
--- /dev/null
+/*
+ * utf8.c - routines to handle UTF-8.
+ */
+
+#ifndef ENUM_CHARSETS
+
+#include "charset.h"
+#include "internal.h"
+
+/*
+ * UTF-8 has no associated data, so `charset' may be ignored.
+ */
+
+void read_utf8(charset_spec const *charset, long int input_chr,
+ charset_state *state,
+ void (*emit)(void *ctx, long int output), void *emitctx)
+{
+ UNUSEDARG(charset);
+
+ /*
+ * Decode one byte of UTF-8. `input_chr' is the next input byte;
+ * `state' carries a partly assembled character from one call to
+ * the next; `emit' is called with `emitctx' once for each
+ * completed code point, and with ERROR for each malformed or
+ * abandoned sequence. One call may invoke `emit' zero times
+ * (still accumulating), once, or twice (an abandoned sequence
+ * followed by the byte that interrupted it).
+ *
+ * For reading UTF-8, the `state' word contains:
+ *
+ * - in bits 29-31, the number of bytes expected to be in the
+ * current multibyte character (which we can tell instantly
+ * from the first byte, of course).
+ *
+ * - in bits 26-28, the number of bytes _seen so far_ in the
+ * current multibyte character.
+ *
+ * - in the remainder of the word, the current value of the
+ * character, which is shifted upwards by 6 bits to
+ * accommodate each new byte.
+ *
+ * As required, the state is zero when we are not in the middle
+ * of a multibyte character at all.
+ *
+ * For example, when reading E9 8D 8B, starting at state=0:
+ *
+ * - after E9, the state is 0x64000009
+ * - after 8D, the state is 0x6800024d
+ * - after 8B, the state conceptually becomes 0x6c00934b, at
+ * which point we notice we've got as many bytes as we
+ * were expecting, output U+934B, and reset the state to
+ * zero.
+ *
+ * Note that the maximum number of bits we might need to store
+ * in the character value field is 25 (U+7FFFFFFF contains 31
+ * bits, but we will never actually store its full value
+ * because when we receive the last 6 bits in the final
+ * continuation byte we will output it and revert the state to
+ * zero). Hence the character value field never collides with
+ * the byte counts.
+ */
+
+ if (input_chr < 0x80) {
+ /*
+ * Single-byte character. If the state is nonzero before
+ * coming here, output an error for an incomplete sequence.
+ * Then output the character.
+ */
+ if (state->s0 != 0) {
+ emit(emitctx, ERROR);
+ state->s0 = 0;
+ }
+ emit(emitctx, input_chr);
+ } else if (input_chr == 0xFE || input_chr == 0xFF) {
+ /*
+ * FE and FF bytes should _never_ occur in UTF-8. They are
+ * automatic errors; if the state was nonzero to start
+ * with, output a further error for an incomplete sequence.
+ */
+ if (state->s0 != 0) {
+ emit(emitctx, ERROR);
+ state->s0 = 0;
+ }
+ emit(emitctx, ERROR);
+ } else if (input_chr >= 0x80 && input_chr < 0xC0) {
+ /*
+ * Continuation byte. Output an error for an unexpected
+ * continuation byte, if the state is zero.
+ */
+ if (state->s0 == 0) {
+ emit(emitctx, ERROR);
+ } else {
+ unsigned long charval;
+ unsigned long topstuff;
+ int bytes;
+
+ /*
+ * Otherwise, accumulate more of the character value.
+ */
+ charval = state->s0 & 0x03ffffffL;
+ charval = (charval << 6) | (input_chr & 0x3F);
+
+ /*
+ * Check the byte counts; if we have not reached the
+ * end of the character, update the state and return.
+ * (Shifting left by 3 lines the seen-so-far count up
+ * with the expected count, so the XOR is nonzero in
+ * the top three bits exactly when the two differ.)
+ */
+ topstuff = state->s0 & 0xfc000000L;
+ topstuff += 0x04000000L; /* add one to the byte count */
+ if (((topstuff << 3) ^ topstuff) & 0xe0000000L) {
+ state->s0 = topstuff | charval;
+ return;
+ }
+
+ /*
+ * Now we know we've reached the end of the character.
+ * `charval' is the Unicode value. We should check for
+ * various invalid things, and then either output
+ * charval or an error. In all cases we reset the state
+ * to zero.
+ */
+ bytes = topstuff >> 29;
+ state->s0 = 0;
+
+ if (charval >= 0xD800 && charval < 0xE000) {
+ /*
+ * Surrogates (0xD800-0xDFFF) may never be encoded
+ * in UTF-8. A surrogate pair in Unicode should
+ * have been encoded as a single UTF-8 character
+ * occupying more than three bytes.
+ */
+ emit(emitctx, ERROR);
+ } else if (charval == 0xFFFE || charval == 0xFFFF) {
+ /*
+ * U+FFFE and U+FFFF are invalid Unicode characters
+ * and may never be encoded in UTF-8. (This is one
+ * reason why U+FFFF is our way of signalling an
+ * error to our `emit' function :-)
+ */
+ emit(emitctx, ERROR);
+ } else if ((charval <= 0x7FL /* && bytes > 1 */) ||
+ (charval <= 0x7FFL && bytes > 2) ||
+ (charval <= 0xFFFFL && bytes > 3) ||
+ (charval <= 0x1FFFFFL && bytes > 4) ||
+ (charval <= 0x3FFFFFFL && bytes > 5)) {
+ /*
+ * Overlong sequences are not to be tolerated,
+ * under any circumstances. (The first test needs
+ * no byte count: this branch is only reached after
+ * a lead byte plus at least one continuation byte.)
+ */
+ emit(emitctx, ERROR);
+ } else {
+ /*
+ * Oh, all right. We'll let this one off.
+ */
+ emit(emitctx, charval);
+ }
+ }
+
+ } else {
+ /*
+ * Lead byte. First output an error for an incomplete
+ * sequence, if the state is nonzero.
+ */
+ if (state->s0 != 0)
+ emit(emitctx, ERROR);
+
+ /*
+ * Now deal with the lead byte: work out the number of
+ * bytes we expect to see in this character, and extract
+ * the initial bits of it too. Each constant below packs
+ * the expected length into bits 29-31 and a seen-so-far
+ * count of 1 into bits 26-28.
+ */
+ if (input_chr >= 0xC0 && input_chr < 0xE0) {
+ state->s0 = 0x44000000L | (input_chr & 0x1F);
+ } else if (input_chr >= 0xE0 && input_chr < 0xF0) {
+ state->s0 = 0x64000000L | (input_chr & 0x0F);
+ } else if (input_chr >= 0xF0 && input_chr < 0xF8) {
+ state->s0 = 0x84000000L | (input_chr & 0x07);
+ } else if (input_chr >= 0xF8 && input_chr < 0xFC) {
+ state->s0 = 0xa4000000L | (input_chr & 0x03);
+ } else if (input_chr >= 0xFC && input_chr < 0xFE) {
+ state->s0 = 0xc4000000L | (input_chr & 0x01);
+ }
+ }
+}
+
+/*
+ * UTF-8 is a stateless multi-byte encoding (in the sense that just
+ * after any character has been completed, the state is always the
+ * same); hence when writing it, there is no need to use the
+ * charset_state.
+ */
+
+/*
+ * Encode one code point as UTF-8, passing each output byte to `emit'
+ * (with `emitctx') in order. `charset' and `state' are unused.
+ *
+ * ERROR is emitted instead of any bytes for values that cannot be
+ * represented: surrogates (0xD800-0xDFFF), U+FFFE and U+FFFF, negative
+ * values, and values above 0x7FFFFFFF (the largest value the six-byte
+ * form can hold; reachable where long is wider than 32 bits).
+ */
+void write_utf8(charset_spec const *charset, long int input_chr,
+                charset_state *state,
+                void (*emit)(void *ctx, long int output), void *emitctx)
+{
+    long int lead;             /* fixed high bits of the lead byte */
+    long int mask;             /* payload bits carried by the lead byte */
+    int ncont;                 /* continuation bytes still to be emitted */
+
+    UNUSEDARG(charset);
+    UNUSEDARG(state);
+
+    /*
+     * Refuse to output any illegal code points.
+     */
+    if (input_chr < 0 || input_chr > 0x7FFFFFFFL ||
+        input_chr == 0xFFFE || input_chr == 0xFFFF ||
+        (input_chr >= 0xD800 && input_chr < 0xE000)) {
+        emit(emitctx, ERROR);
+        return;
+    }
+
+    if (input_chr < 0x80) {            /* one-byte character */
+        emit(emitctx, input_chr);
+        return;
+    }
+
+    if (input_chr < 0x800) {           /* two-byte character */
+        lead = 0xC0; mask = 0x1F; ncont = 1;
+    } else if (input_chr < 0x10000) {  /* three-byte character */
+        lead = 0xE0; mask = 0x0F; ncont = 2;
+    } else if (input_chr < 0x200000) { /* four-byte character */
+        lead = 0xF0; mask = 0x07; ncont = 3;
+    } else if (input_chr < 0x4000000) {/* five-byte character */
+        lead = 0xF8; mask = 0x03; ncont = 4;
+    } else {                           /* six-byte character */
+        lead = 0xFC; mask = 0x01; ncont = 5;
+    }
+
+    /*
+     * The lead byte carries the topmost bits; each continuation byte
+     * then carries the next six bits down, ending with bits 0-5.
+     */
+    emit(emitctx, lead | (mask & (input_chr >> (6 * ncont))));
+    while (ncont-- > 0)
+        emit(emitctx, 0x80 | (0x3F & (input_chr >> (6 * ncont))));
+}
+
+#ifdef TESTMODE
+
+#include <stdio.h>
+#include <stdarg.h>
+
+int total_errs = 0;
+
+/*
+ * Test-harness emit callback: `ctx' is the address of a wchar_t write
+ * cursor. Stores `output' at the cursor and advances it by one.
+ */
+void utf8_emit(void *ctx, long output)
+{
+    wchar_t **cursor = (wchar_t **)ctx;
+    wchar_t *slot = *cursor;
+
+    *slot = output;
+    *cursor = slot + 1;
+}
+
+/*
+ * Shared checker for the two test drivers below. Compares the `len'
+ * values in `str' against the expected values fetched from `ap', which
+ * are read as long int and end with -1. Each mismatch, and any
+ * difference in length, is reported on stdout tagged with `line' and
+ * counted in total_errs.
+ */
+static void utf8_check_output(int line, const wchar_t *str, int len,
+                              va_list ap)
+{
+    long want = 0;
+    int i;
+
+    for (i = 0; i < len; i++) {
+        want = va_arg(ap, long int);
+        if (want == -1) {
+            printf("%d: correct string shorter than output\n", line);
+            total_errs++;
+            break;
+        }
+        if (want != (long)str[i]) {
+            /* Cast both values so they match the %lx conversions. */
+            printf("%d: char %d came out as %08lx, should be %08lx\n",
+                   line, i, (unsigned long)str[i], (unsigned long)want);
+            total_errs++;
+        }
+    }
+
+    /* Unless the terminator was already consumed, it must come next. */
+    if (want != -1) {
+        want = va_arg(ap, long int);
+        if (want != -1) {
+            printf("%d: correct string longer than output\n", line);
+            total_errs++;
+        }
+    }
+}
+
+/*
+ * Feed the `inlen' bytes at `input' through read_utf8(), starting from
+ * a zero state, and check the emitted values against the variadic list
+ * of expected values. That list is read as long int and must end with
+ * -1. `line' identifies the test in any failure message.
+ */
+void utf8_read_test(int line, char *input, int inlen, ...)
+{
+    va_list ap;
+    wchar_t *p, str[512];
+    int i;
+    charset_state state;
+
+    state.s0 = 0;
+    p = str;
+
+    for (i = 0; i < inlen; i++)
+        read_utf8(NULL, input[i] & 0xFF, &state, utf8_emit, &p);
+
+    va_start(ap, inlen);
+    utf8_check_output(line, str, (int)(p - str), ap);
+    va_end(ap);
+}
+
+/*
+ * Feed the `inlen' code points at `input' through write_utf8() and
+ * check the emitted bytes against the variadic list of expected values.
+ * That list is read as long int and must end with -1. `line' identifies
+ * the test in any failure message.
+ */
+void utf8_write_test(int line, const long *input, int inlen, ...)
+{
+    va_list ap;
+    wchar_t *p, str[512];
+    int i;
+    charset_state state;
+
+    state.s0 = 0;
+    p = str;
+
+    for (i = 0; i < inlen; i++)
+        write_utf8(NULL, input[i], &state, utf8_emit, &p);
+
+    va_start(ap, inlen);
+    utf8_check_output(line, str, (int)(p - str), ap);
+    va_end(ap);
+}
+
+/* Macro to concoct the first three parameters of utf8_read_test and
+ * utf8_write_test: the calling source line, the data itself, and
+ * lenof(x) as its length. NOTE(review): lenof presumably counts array
+ * elements, so a string literal's terminating NUL is included, which
+ * would be why every expected-output list ends with an extra 0;
+ * confirm against internal.h. */
+#define TESTSTR(x) __LINE__, x, lenof(x)
+
+/*
+ * Self-test driver (TESTMODE only). Runs the UTF-8 decoder and encoder
+ * over a fixed set of inputs, prints a line per discrepancy, and
+ * returns nonzero if any test failed.
+ *
+ * utf8_read_test and utf8_write_test fetch their expected values with
+ * va_arg(ap, long int), so every variadic argument here is given type
+ * long explicitly (an L suffix, or a cast for ERROR). Passing plain
+ * int would be undefined behaviour, and where long is wider than int
+ * the -1 terminator would not be recognised.
+ */
+int main(void)
+{
+    printf("read tests beginning\n");
+    utf8_read_test(TESTSTR("\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5"),
+        0x000003BAL, /* GREEK SMALL LETTER KAPPA */
+        0x00001F79L, /* GREEK SMALL LETTER OMICRON WITH OXIA */
+        0x000003C3L, /* GREEK SMALL LETTER SIGMA */
+        0x000003BCL, /* GREEK SMALL LETTER MU */
+        0x000003B5L, /* GREEK SMALL LETTER EPSILON */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\x00"),
+        0x00000000L, /* <control> */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xC2\x80"),
+        0x00000080L, /* <control> */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xE0\xA0\x80"),
+        0x00000800L, /* <no name available> */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xF0\x90\x80\x80"),
+        0x00010000L, /* <no name available> */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xF8\x88\x80\x80\x80"),
+        0x00200000L, /* <no name available> */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xFC\x84\x80\x80\x80\x80"),
+        0x04000000L, /* <no name available> */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\x7F"),
+        0x0000007FL, /* <control> */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xDF\xBF"),
+        0x000007FFL, /* <no name available> */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
+        0x0000FFFDL, /* REPLACEMENT CHARACTER */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
+        (long)ERROR, /* <no name available> (invalid char) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xF7\xBF\xBF\xBF"),
+        0x001FFFFFL, /* <no name available> */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF\xBF"),
+        0x03FFFFFFL, /* <no name available> */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF\xBF"),
+        0x7FFFFFFFL, /* <no name available> */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xED\x9F\xBF"),
+        0x0000D7FFL, /* <no name available> */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xEE\x80\x80"),
+        0x0000E000L, /* <Private Use, First> */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
+        0x0000FFFDL, /* REPLACEMENT CHARACTER */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xF4\x8F\xBF\xBF"),
+        0x0010FFFFL, /* <no name available> */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xF4\x90\x80\x80"),
+        0x00110000L, /* <no name available> */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\x80"),
+        (long)ERROR, /* (unexpected continuation byte) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xBF"),
+        (long)ERROR, /* (unexpected continuation byte) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\x80\xBF"),
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\x80\xBF\x80"),
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\x80\xBF\x80\xBF"),
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80"),
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF"),
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF\x80"),
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"),
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        (long)ERROR, /* (unexpected continuation byte) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xC0\x20\xC1\x20\xC2\x20\xC3\x20\xC4\x20\xC5\x20\xC6\x20\xC7\x20"),
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xE0\x20\xE1\x20\xE2\x20\xE3\x20\xE4\x20\xE5\x20\xE6\x20\xE7\x20\xE8\x20\xE9\x20\xEA\x20\xEB\x20\xEC\x20\xED\x20\xEE\x20\xEF\x20"),
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xF0\x20\xF1\x20\xF2\x20\xF3\x20\xF4\x20\xF5\x20\xF6\x20\xF7\x20"),
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xF8\x20\xF9\x20\xFA\x20\xFB\x20"),
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xFC\x20\xFD\x20"),
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        (long)ERROR, /* (incomplete sequence) */
+        0x00000020L, /* SPACE */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xC0"),
+        (long)ERROR, /* (incomplete sequence) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xE0\x80"),
+        (long)ERROR, /* (incomplete sequence) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xF0\x80\x80"),
+        (long)ERROR, /* (incomplete sequence) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xF8\x80\x80\x80"),
+        (long)ERROR, /* (incomplete sequence) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80"),
+        (long)ERROR, /* (incomplete sequence) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xDF"),
+        (long)ERROR, /* (incomplete sequence) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xEF\xBF"),
+        (long)ERROR, /* (incomplete sequence) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xF7\xBF\xBF"),
+        (long)ERROR, /* (incomplete sequence) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF"),
+        (long)ERROR, /* (incomplete sequence) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF"),
+        (long)ERROR, /* (incomplete sequence) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xC0\xE0\x80\xF0\x80\x80\xF8\x80\x80\x80\xFC\x80\x80\x80\x80\xDF\xEF\xBF\xF7\xBF\xBF\xFB\xBF\xBF\xBF\xFD\xBF\xBF\xBF\xBF"),
+        (long)ERROR, /* (incomplete sequence) */
+        (long)ERROR, /* (incomplete sequence) */
+        (long)ERROR, /* (incomplete sequence) */
+        (long)ERROR, /* (incomplete sequence) */
+        (long)ERROR, /* (incomplete sequence) */
+        (long)ERROR, /* (incomplete sequence) */
+        (long)ERROR, /* (incomplete sequence) */
+        (long)ERROR, /* (incomplete sequence) */
+        (long)ERROR, /* (incomplete sequence) */
+        (long)ERROR, /* (incomplete sequence) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xFE"),
+        (long)ERROR, /* (invalid UTF-8 byte) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xFF"),
+        (long)ERROR, /* (invalid UTF-8 byte) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xFE\xFE\xFF\xFF"),
+        (long)ERROR, /* (invalid UTF-8 byte) */
+        (long)ERROR, /* (invalid UTF-8 byte) */
+        (long)ERROR, /* (invalid UTF-8 byte) */
+        (long)ERROR, /* (invalid UTF-8 byte) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xC0\xAF"),
+        (long)ERROR, /* SOLIDUS (overlong form of 2F) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xE0\x80\xAF"),
+        (long)ERROR, /* SOLIDUS (overlong form of 2F) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xF0\x80\x80\xAF"),
+        (long)ERROR, /* SOLIDUS (overlong form of 2F) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xF8\x80\x80\x80\xAF"),
+        (long)ERROR, /* SOLIDUS (overlong form of 2F) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\xAF"),
+        (long)ERROR, /* SOLIDUS (overlong form of 2F) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xC1\xBF"),
+        (long)ERROR, /* <control> (overlong form of 7F) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xE0\x9F\xBF"),
+        (long)ERROR, /* <no name available> (overlong form of DF BF) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xF0\x8F\xBF\xBF"),
+        (long)ERROR, /* <no name available> (overlong form of EF BF BF) (invalid char) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xF8\x87\xBF\xBF\xBF"),
+        (long)ERROR, /* <no name available> (overlong form of F7 BF BF BF) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xFC\x83\xBF\xBF\xBF\xBF"),
+        (long)ERROR, /* <no name available> (overlong form of FB BF BF BF BF) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xC0\x80"),
+        (long)ERROR, /* <control> (overlong form of 00) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xE0\x80\x80"),
+        (long)ERROR, /* <control> (overlong form of 00) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xF0\x80\x80\x80"),
+        (long)ERROR, /* <control> (overlong form of 00) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xF8\x80\x80\x80\x80"),
+        (long)ERROR, /* <control> (overlong form of 00) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\x80"),
+        (long)ERROR, /* <control> (overlong form of 00) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xED\xA0\x80"),
+        (long)ERROR, /* <Non Private Use High Surrogate, First> (surrogate) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xED\xAD\xBF"),
+        (long)ERROR, /* <Non Private Use High Surrogate, Last> (surrogate) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xED\xAE\x80"),
+        (long)ERROR, /* <Private Use High Surrogate, First> (surrogate) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xED\xAF\xBF"),
+        (long)ERROR, /* <Private Use High Surrogate, Last> (surrogate) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xED\xB0\x80"),
+        (long)ERROR, /* <Low Surrogate, First> (surrogate) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xED\xBE\x80"),
+        (long)ERROR, /* <no name available> (surrogate) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xED\xBF\xBF"),
+        (long)ERROR, /* <Low Surrogate, Last> (surrogate) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xB0\x80"),
+        (long)ERROR, /* <Non Private Use High Surrogate, First> (surrogate) */
+        (long)ERROR, /* <Low Surrogate, First> (surrogate) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xBF\xBF"),
+        (long)ERROR, /* <Non Private Use High Surrogate, First> (surrogate) */
+        (long)ERROR, /* <Low Surrogate, Last> (surrogate) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xB0\x80"),
+        (long)ERROR, /* <Non Private Use High Surrogate, Last> (surrogate) */
+        (long)ERROR, /* <Low Surrogate, First> (surrogate) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xBF\xBF"),
+        (long)ERROR, /* <Non Private Use High Surrogate, Last> (surrogate) */
+        (long)ERROR, /* <Low Surrogate, Last> (surrogate) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xB0\x80"),
+        (long)ERROR, /* <Private Use High Surrogate, First> (surrogate) */
+        (long)ERROR, /* <Low Surrogate, First> (surrogate) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xBF\xBF"),
+        (long)ERROR, /* <Private Use High Surrogate, First> (surrogate) */
+        (long)ERROR, /* <Low Surrogate, Last> (surrogate) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xB0\x80"),
+        (long)ERROR, /* <Private Use High Surrogate, Last> (surrogate) */
+        (long)ERROR, /* <Low Surrogate, First> (surrogate) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xBF\xBF"),
+        (long)ERROR, /* <Private Use High Surrogate, Last> (surrogate) */
+        (long)ERROR, /* <Low Surrogate, Last> (surrogate) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xEF\xBF\xBE"),
+        (long)ERROR, /* <no name available> (invalid char) */
+        0L, -1L);
+    utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
+        (long)ERROR, /* <no name available> (invalid char) */
+        0L, -1L);
+    printf("read tests completed\n");
+    printf("write tests beginning\n");
+    {
+        static const long str[] =
+            {0x03BAL, 0x1F79L, 0x03C3L, 0x03BCL, 0x03B5L, 0};
+        utf8_write_test(TESTSTR(str),
+            0xCEL, 0xBAL,
+            0xE1L, 0xBDL, 0xB9L,
+            0xCFL, 0x83L,
+            0xCEL, 0xBCL,
+            0xCEL, 0xB5L,
+            0L, -1L);
+    }
+    {
+        static const long str[] = {0x0000L, 0};
+        utf8_write_test(TESTSTR(str),
+            0x00L,
+            0L, -1L);
+    }
+    {
+        static const long str[] = {0x0080L, 0};
+        utf8_write_test(TESTSTR(str),
+            0xC2L, 0x80L,
+            0L, -1L);
+    }
+    {
+        static const long str[] = {0x0800L, 0};
+        utf8_write_test(TESTSTR(str),
+            0xE0L, 0xA0L, 0x80L,
+            0L, -1L);
+    }
+    {
+        static const long str[] = {0x00010000L, 0};
+        utf8_write_test(TESTSTR(str),
+            0xF0L, 0x90L, 0x80L, 0x80L,
+            0L, -1L);
+    }
+    {
+        static const long str[] = {0x00200000L, 0};
+        utf8_write_test(TESTSTR(str),
+            0xF8L, 0x88L, 0x80L, 0x80L, 0x80L,
+            0L, -1L);
+    }
+    {
+        static const long str[] = {0x04000000L, 0};
+        utf8_write_test(TESTSTR(str),
+            0xFCL, 0x84L, 0x80L, 0x80L, 0x80L, 0x80L,
+            0L, -1L);
+    }
+    {
+        static const long str[] = {0x007FL, 0};
+        utf8_write_test(TESTSTR(str),
+            0x7FL,
+            0L, -1L);
+    }
+    {
+        static const long str[] = {0x07FFL, 0};
+        utf8_write_test(TESTSTR(str),
+            0xDFL, 0xBFL,
+            0L, -1L);
+    }
+    {
+        static const long str[] = {0xFFFDL, 0};
+        utf8_write_test(TESTSTR(str),
+            0xEFL, 0xBFL, 0xBDL,
+            0L, -1L);
+    }
+    {
+        static const long str[] = {0xFFFFL, 0};
+        utf8_write_test(TESTSTR(str),
+            (long)ERROR,
+            0L, -1L);
+    }
+    {
+        static const long str[] = {0x001FFFFFL, 0};
+        utf8_write_test(TESTSTR(str),
+            0xF7L, 0xBFL, 0xBFL, 0xBFL,
+            0L, -1L);
+    }
+    {
+        static const long str[] = {0x03FFFFFFL, 0};
+        utf8_write_test(TESTSTR(str),
+            0xFBL, 0xBFL, 0xBFL, 0xBFL, 0xBFL,
+            0L, -1L);
+    }
+    {
+        static const long str[] = {0x7FFFFFFFL, 0};
+        utf8_write_test(TESTSTR(str),
+            0xFDL, 0xBFL, 0xBFL, 0xBFL, 0xBFL, 0xBFL,
+            0L, -1L);
+    }
+    {
+        static const long str[] = {0xD7FFL, 0};
+        utf8_write_test(TESTSTR(str),
+            0xEDL, 0x9FL, 0xBFL,
+            0L, -1L);
+    }
+    {
+        static const long str[] = {0xD800L, 0};
+        utf8_write_test(TESTSTR(str),
+            (long)ERROR,
+            0L, -1L);
+    }
+    {
+        static const long str[] = {0xD800L, 0xDC00L, 0};
+        utf8_write_test(TESTSTR(str),
+            (long)ERROR,
+            (long)ERROR,
+            0L, -1L);
+    }
+    {
+        static const long str[] = {0xDFFFL, 0};
+        utf8_write_test(TESTSTR(str),
+            (long)ERROR,
+            0L, -1L);
+    }
+    {
+        static const long str[] = {0xE000L, 0};
+        utf8_write_test(TESTSTR(str),
+            0xEEL, 0x80L, 0x80L,
+            0L, -1L);
+    }
+    printf("write tests completed\n");
+
+    printf("total: %d errors\n", total_errs);
+    return (total_errs != 0);
+}
+#endif /* TESTMODE */
+
+/*
+ * Charset descriptor for UTF-8: pairs the CS_UTF8 identifier with
+ * the read_utf8 and write_utf8 entries. The final member is left
+ * NULL for this charset.
+ */
+const charset_spec charset_CS_UTF8 = {
+ CS_UTF8, read_utf8, write_utf8, NULL
+};
+
+#else /* ENUM_CHARSETS */
+
+ENUM_CHARSET(CS_UTF8)
+
+#endif /* ENUM_CHARSETS */
--- /dev/null
+/*
+ * xenc.c - translate our internal character set codes to and from
+ * X11 character encoding names.
+ *
+ */
+
+#include <ctype.h>
+#include "charset.h"
+#include "internal.h"
+
+/*
+ * Table pairing X11 font encoding names with charset identifiers.
+ * charset_to_xenc() and charset_from_xenc() both scan it from the
+ * top and return on the first match, so the order of entries
+ * matters wherever one charset id is listed under several names.
+ */
+static const struct {
+ const char *name; /* X11 encoding name */
+ int charset; /* matching CS_* identifier */
+} xencs[] = {
+ /*
+ * Officially registered encoding names. This list is derived
+ * from the font encodings section of
+ *
+ * http://ftp.x.org/pub/DOCS/registry
+ *
+ * Where multiple encoding names map to the same encoding id
+ * (such as iso8859-15 and fcd8859-15), the first is considered
+ * canonical and will be returned when translating the id to a
+ * string.
+ */
+ { "iso8859-1", CS_ISO8859_1 },
+ { "iso8859-2", CS_ISO8859_2 },
+ { "iso8859-3", CS_ISO8859_3 },
+ { "iso8859-4", CS_ISO8859_4 },
+ { "iso8859-5", CS_ISO8859_5 },
+ { "iso8859-6", CS_ISO8859_6 },
+ { "iso8859-7", CS_ISO8859_7 },
+ { "iso8859-8", CS_ISO8859_8 },
+ { "iso8859-9", CS_ISO8859_9 },
+ { "iso8859-10", CS_ISO8859_10 },
+ { "iso8859-13", CS_ISO8859_13 },
+ { "iso8859-14", CS_ISO8859_14 },
+ { "iso8859-15", CS_ISO8859_15 },
+ { "fcd8859-15", CS_ISO8859_15 },
+ { "hp-roman8", CS_HP_ROMAN8 },
+ { "koi8-r", CS_KOI8_R },
+ /*
+ * Unofficial encoding names found in the wild.
+ */
+ { "iso8859-16", CS_ISO8859_16 },
+ { "koi8-u", CS_KOI8_U },
+ { "ibm-cp437", CS_CP437 },
+ { "ibm-cp850", CS_CP850 },
+ { "microsoft-cp1250", CS_CP1250 },
+ { "microsoft-cp1251", CS_CP1251 },
+ { "microsoft-cp1252", CS_CP1252 },
+ { "microsoft-cp1253", CS_CP1253 },
+ { "microsoft-cp1254", CS_CP1254 },
+ { "microsoft-cp1255", CS_CP1255 },
+ { "microsoft-cp1256", CS_CP1256 },
+ { "microsoft-cp1257", CS_CP1257 },
+ { "microsoft-cp1258", CS_CP1258 },
+ { "mac-roman", CS_MAC_ROMAN },
+ { "viscii1.1-1", CS_VISCII },
+ { "viscii1-1", CS_VISCII },
+};
+
+/*
+ * Translate a charset identifier into an X11 encoding name.
+ *
+ * Returns the name of the first xencs[] entry whose charset field
+ * equals `charset', or NULL when no entry carries that id.
+ */
+const char *charset_to_xenc(int charset)
+{
+    const int count = (int)lenof(xencs);
+    int idx = 0;
+
+    while (idx < count) {
+        if (xencs[idx].charset == charset)
+            return xencs[idx].name;
+        idx++;
+    }
+
+    /* Ran off the end of the table without a match. */
+    return NULL;
+}
+
+/*
+ * Translate an X11 encoding name into a charset identifier.
+ *
+ * `name' is compared case-insensitively against each xencs[] entry,
+ * in table order. Returns the charset of the first entry whose name
+ * matches in full, or CS_NONE if none does.
+ */
+int charset_from_xenc(const char *name)
+{
+    int i;
+
+    for (i = 0; i < (int)lenof(xencs); i++) {
+        const char *p = name;
+        const char *q = xencs[i].name;
+
+        /*
+         * Step through both strings together. The casts to
+         * unsigned char are required: passing tolower() a negative
+         * plain-char value (any byte >= 0x80 where char is signed)
+         * is undefined behaviour.
+         */
+        while (*p || *q) {
+            if (tolower((unsigned char)*p) != tolower((unsigned char)*q))
+                break;
+            p++;
+            q++;
+        }
+
+        /* Both strings ended at the same point: a complete match. */
+        if (!*p && !*q)
+            return xencs[i].charset;
+    }
+
+    return CS_NONE;                    /* not found */
+}
open IN, "Recipe" or die "unable to open Recipe file\n";
-@incdirs = ("", "unix/", "mac/");
+# HACK: One of the source files in `charset' is auto-generated by
+# sbcsgen.pl. We need to generate that _now_, before attempting
+# dependency analysis.
+eval 'chdir "charset"; require "sbcsgen.pl"; chdir ".."';
+
+@incdirs = ("", "charset/", "unix/", "mac/");
$help = ""; # list of newline-free lines of help text
%programs = (); # maps prog name + type letter to listref of objects/resources
"# TOOLPATH = /opt/gcc/bin\n".
"CC = \$(TOOLPATH)cc\n".
"\n".
-&splitline("CFLAGS = -Wall -g -I. -I.. `gtk-config --cflags`")."\n".
+&splitline("CFLAGS = -Wall -g -I. -I.. -I../charset `gtk-config --cflags`")."\n".
"XLDFLAGS = `gtk-config --libs`\n".
"ULDFLAGS =#\n".
"INSTALL=install\n",
.IP "\fB\-log\fP \fIfilename\fP"
This option makes \fIpterm\fP log all the terminal output to a file
as well as displaying it in the terminal.
+.IP "\fB\-cs\fP \fIcharset\fP"
+This option specifies the character set in which \fIpterm\fP should
+assume the session is operating. This character set will be used to
+interpret all the data received from the session, and all input you
+type or paste into \fIpterm\fP will be converted into this character
+set before being sent to the session.
+
+Any character set name which is valid in a MIME header (and
+supported by \fIpterm\fP) should be valid here (examples are
+"ISO-8859-1", "windows-1252" or "UTF-8"). Also, any character
+encoding which is valid in an X logical font description should be
+valid ("ibm-cp437", for example).
+
+Character set names are case-insensitive.
.IP "\fB\-nethack\fP"
Tells \fIpterm\fP to enable NetHack keypad mode, in which the
numeric keypad generates the NetHack "hjklyubn" direction keys. This
This option should be set to either 0 or 1; the default is 1. When
set to 1, any activity in the display causes the position of the
scrollback to be reset to the very bottom.
+.IP "\fBpterm.LineCodePage\fP"
+This option specifies the character set to be used for the session.
+This is the same as the \fI\-cs\fP command-line option.
+.IP "\fBpterm.NoRemoteCharset\fP"
+This option disables the terminal's ability to change its character
+set when it receives escape sequences telling it to. You might need
+to do this to interoperate with programs which incorrectly change
+the character set to something they think is sensible.
.IP "\fBpterm.BCE\fP"
This option should be set to either 0 or 1; the default is 1. When
set to 1, the various control sequences that erase parts of the
#include <X11/Xutil.h>
#define PUTTY_DO_GLOBALS /* actually _define_ globals */
+
#include "putty.h"
#include "terminal.h"
GtkAdjustment *sbar_adjust;
GdkPixmap *pixmap;
GdkFont *fonts[2]; /* normal and bold (for now!) */
+ struct {
+ int charset;
+ int is_wide;
+ } fontinfo[2];
GdkCursor *rawcursor, *textcursor, *blankcursor, *currcursor;
GdkColor cols[NCOLOURS];
GdkColormap *colmap;
wchar_t *pastein_data;
int pastein_data_len;
- char *pasteout_data;
- int pasteout_data_len;
+ char *pasteout_data, *pasteout_data_utf8;
+ int pasteout_data_len, pasteout_data_utf8_len;
int font_width, font_height;
int ignore_sbar;
int mouseptr_visible;
guint term_paste_idle_id;
- GdkAtom compound_text_atom;
+ GdkAtom compound_text_atom, utf8_string_atom;
int alt_keycode;
int alt_digits;
char wintitle[sizeof(((Config *)0)->wintitle)];
printf("\n");
#endif
- ldisc_send(inst->ldisc, output+start, end-start, 1);
+ /*
+ * The stuff we've just generated is assumed to be
+ * ISO-8859-1! This sounds insane, but `man XLookupString'
+ * agrees: strings of this type returned from the X server
+ * are hardcoded to 8859-1. Strictly speaking we should be
+ * doing this using some sort of GtkIMContext, which (if
+ * we're lucky) would give us our data directly in Unicode;
+ * but that's not supported in GTK 1.2 as far as I can
+ * tell, and it's poorly documented even in 2.0, so it'll
+ * have to wait.
+ */
+ lpage_send(inst->ldisc, CS_ISO8859_1, output+start, end-start, 1);
+
show_mouseptr(inst, 0);
term_seen_key_event(inst->term);
term_out(inst->term);
struct gui_data *inst = (struct gui_data *)frontend;
if (inst->pasteout_data)
sfree(inst->pasteout_data);
+ if (inst->pasteout_data_utf8)
+ sfree(inst->pasteout_data_utf8);
+
+ inst->pasteout_data_utf8 = smalloc(len*6);
+ inst->pasteout_data_utf8_len = len*6;
+ {
+ wchar_t *tmp = data;
+ int tmplen = len;
+ inst->pasteout_data_utf8_len =
+ charset_from_unicode(&tmp, &tmplen, inst->pasteout_data_utf8,
+ inst->pasteout_data_utf8_len,
+ CS_UTF8, NULL, NULL, 0);
+ inst->pasteout_data_utf8 =
+ srealloc(inst->pasteout_data_utf8, inst->pasteout_data_utf8_len);
+ }
+
inst->pasteout_data = smalloc(len);
inst->pasteout_data_len = len;
- wc_to_mb(0, 0, data, len, inst->pasteout_data, inst->pasteout_data_len,
+ wc_to_mb(line_codepage, 0, data, len,
+ inst->pasteout_data, inst->pasteout_data_len,
NULL, NULL);
if (gtk_selection_owner_set(inst->area, GDK_SELECTION_PRIMARY,
GDK_SELECTION_TYPE_STRING, 1);
gtk_selection_add_target(inst->area, GDK_SELECTION_PRIMARY,
inst->compound_text_atom, 1);
+ gtk_selection_add_target(inst->area, GDK_SELECTION_PRIMARY,
+ inst->utf8_string_atom, 1);
}
}
guint info, guint time_stamp, gpointer data)
{
struct gui_data *inst = (struct gui_data *)data;
- gtk_selection_data_set(seldata, GDK_SELECTION_TYPE_STRING, 8,
- inst->pasteout_data, inst->pasteout_data_len);
+ if (seldata->target == inst->utf8_string_atom)
+ gtk_selection_data_set(seldata, seldata->target, 8,
+ inst->pasteout_data_utf8,
+ inst->pasteout_data_utf8_len);
+ else
+ gtk_selection_data_set(seldata, seldata->target, 8,
+ inst->pasteout_data, inst->pasteout_data_len);
}
gint selection_clear(GtkWidget *widget, GdkEventSelection *seldata,
term_deselect(inst->term);
if (inst->pasteout_data)
sfree(inst->pasteout_data);
+ if (inst->pasteout_data_utf8)
+ sfree(inst->pasteout_data_utf8);
inst->pasteout_data = NULL;
inst->pasteout_data_len = 0;
+ inst->pasteout_data_utf8 = NULL;
+ inst->pasteout_data_utf8_len = 0;
return TRUE;
}
* moment is to call gtk_selection_convert(), and when the data
* comes back _then_ we can call term_do_paste().
*/
+
+ /*
+ * First we attempt to retrieve the selection as a UTF-8 string
+ * (which we will convert to the correct code page before
+ * sending to the session, of course). If that fails,
+ * selection_received() will be informed and will fall back to
+ * an ordinary string.
+ */
gtk_selection_convert(inst->area, GDK_SELECTION_PRIMARY,
- GDK_SELECTION_TYPE_STRING, GDK_CURRENT_TIME);
+ inst->utf8_string_atom, GDK_CURRENT_TIME);
}
gint idle_paste_func(gpointer data); /* forward ref */
{
struct gui_data *inst = (struct gui_data *)data;
+ if (seldata->target == inst->utf8_string_atom && seldata->length <= 0) {
+ /*
+ * Failed to get a UTF-8 selection string. Try an ordinary
+ * string.
+ */
+ gtk_selection_convert(inst->area, GDK_SELECTION_PRIMARY,
+ GDK_SELECTION_TYPE_STRING, GDK_CURRENT_TIME);
+ return;
+ }
+
+ /*
+ * Any other failure should just go foom.
+ */
if (seldata->length <= 0 ||
- seldata->type != GDK_SELECTION_TYPE_STRING)
+ (seldata->type != GDK_SELECTION_TYPE_STRING &&
+ seldata->type != inst->utf8_string_atom))
return; /* Nothing happens. */
if (inst->pastein_data)
inst->pastein_data = smalloc(seldata->length * sizeof(wchar_t));
inst->pastein_data_len = seldata->length;
- mb_to_wc(0, 0, seldata->data, seldata->length,
- inst->pastein_data, inst->pastein_data_len);
+ inst->pastein_data_len =
+ mb_to_wc((seldata->type == inst->utf8_string_atom ?
+ CS_UTF8 : line_codepage),
+ 0, seldata->data, seldata->length,
+ inst->pastein_data, inst->pastein_data_len);
term_do_paste(inst->term);
rlen*inst->font_width, inst->font_height);
gdk_gc_set_foreground(gc, &inst->cols[nfg]);
- gdk_draw_text(inst->pixmap, inst->fonts[fontid], gc,
- x*inst->font_width+cfg.window_border,
- y*inst->font_height+cfg.window_border+inst->fonts[0]->ascent,
- text, len);
+ {
+ GdkWChar *gwcs;
+ gchar *gcs;
+ wchar_t *wcs;
+ int i;
+
+ wcs = smalloc(sizeof(wchar_t) * (len+1));
+ for (i = 0; i < len; i++) {
+ wcs[i] = (wchar_t) ((attr & CSET_MASK) + (text[i] & CHAR_MASK));
+ }
+
+ if (inst->fontinfo[fontid].is_wide) {
+ gwcs = smalloc(sizeof(GdkWChar) * (len+1));
+ /*
+ * FIXME: when we have a wide-char equivalent of
+ * from_unicode, use it instead of this.
+ */
+ for (i = 0; i <= len; i++)
+ gwcs[i] = wcs[i];
+ gdk_draw_text_wc(inst->pixmap, inst->fonts[fontid], gc,
+ x*inst->font_width+cfg.window_border,
+ y*inst->font_height+cfg.window_border+inst->fonts[0]->ascent,
+ gwcs, len*2);
+ sfree(gwcs);
+ } else {
+ wchar_t *wcstmp = wcs;
+ int lentmp = len;
+ gcs = smalloc(sizeof(GdkWChar) * (len+1));
+ charset_from_unicode(&wcstmp, &lentmp, gcs, len,
+ inst->fontinfo[fontid].charset,
+ NULL, ".", 1);
+ gdk_draw_text(inst->pixmap, inst->fonts[fontid], gc,
+ x*inst->font_width+cfg.window_border,
+ y*inst->font_height+cfg.window_border+inst->fonts[0]->ascent,
+ gcs, len);
+ sfree(gcs);
+ }
+ sfree(wcs);
+ }
if (shadow) {
gdk_draw_text(inst->pixmap, inst->fonts[fontid], gc,
strncpy(cfg.boldfont, val, sizeof(cfg.boldfont));
cfg.boldfont[sizeof(cfg.boldfont)-1] = '\0';
+ } else if (!strcmp(p, "-cs")) {
+ EXPECTS_ARG;
+ SECOND_PASS_ONLY;
+ strncpy(cfg.line_codepage, val, sizeof(cfg.line_codepage));
+ cfg.line_codepage[sizeof(cfg.line_codepage)-1] = '\0';
+
} else if (!strcmp(p, "-geometry")) {
int flags, x, y, w, h;
EXPECTS_ARG;
}
}
+/*
+ * Work out the character encoding of inst->fonts[fontid] and record
+ * it in inst->fontinfo[fontid].
+ *
+ * The encoding is taken from the font's CHARSET_REGISTRY and
+ * CHARSET_ENCODING properties, joined as "registry-encoding" and
+ * looked up with charset_from_xenc(). If either property or either
+ * atom name cannot be obtained, fontinfo is left at CS_NONE and
+ * is_wide == 0.
+ */
+static void set_font_info(struct gui_data *inst, int fontid)
+{
+    GdkFont *font = inst->fonts[fontid];
+    XFontStruct *xfs = GDK_FONT_XFONT(font);
+    Display *disp = GDK_FONT_XDISPLAY(font);
+    Atom charset_registry, charset_encoding;
+    unsigned long registry_ret, encoding_ret;
+    charset_registry = XInternAtom(disp, "CHARSET_REGISTRY", False);
+    charset_encoding = XInternAtom(disp, "CHARSET_ENCODING", False);
+    inst->fontinfo[fontid].charset = CS_NONE;
+    inst->fontinfo[fontid].is_wide = 0;
+    if (XGetFontProperty(xfs, charset_registry, &registry_ret) &&
+        XGetFontProperty(xfs, charset_encoding, &encoding_ret)) {
+        char *reg, *enc;
+        reg = XGetAtomName(disp, (Atom)registry_ret);
+        enc = XGetAtomName(disp, (Atom)encoding_ret);
+        if (reg && enc) {
+            char *encoding = dupcat(reg, "-", enc, NULL);
+            inst->fontinfo[fontid].charset = charset_from_xenc(encoding);
+            /* FIXME: when libcharset supports wide encodings fix this. */
+            if (!strcasecmp(encoding, "iso10646-1"))
+                inst->fontinfo[fontid].is_wide = 1;
+
+            /*
+             * Hack for X line-drawing characters: if the primary
+             * font is encoded as ISO-8859-anything, and has valid
+             * glyphs in the first 32 char positions, it is assumed
+             * that those glyphs are the VT100 line-drawing
+             * character set.
+             *
+             * Actually, we'll hack even harder by only checking a
+             * single position, 0x12. Then we can check it easily by
+             * seeing if the ascent and descent differ.
+             *
+             * NOTE(review): this comment used to name position 0x19
+             * (vertical line, VT100 linedrawing `x'), but the code
+             * probes 0x12. Confirm which position was intended.
+             */
+            if (inst->fontinfo[fontid].charset == CS_ISO8859_1) {
+                int lb, rb, wid, asc, desc;
+                gchar text[2];
+
+                text[1] = '\0';
+                text[0] = '\x12';
+                gdk_string_extents(inst->fonts[fontid], text,
+                                   &lb, &rb, &wid, &asc, &desc);
+                if (asc != desc)
+                    inst->fontinfo[fontid].charset = CS_ISO8859_1_X11;
+            }
+
+            /*
+             * FIXME: this is a hack. Currently fonts with
+             * incomprehensible encodings are dealt with by
+             * pretending they're 8859-1. It's ugly, but it's good
+             * enough to stop things crashing. Should do something
+             * better here.
+             */
+            if (inst->fontinfo[fontid].charset == CS_NONE)
+                inst->fontinfo[fontid].charset = CS_ISO8859_1;
+
+            sfree(encoding);
+        }
+
+        /*
+         * Strings returned by XGetAtomName() are owned by the
+         * caller and must be released with XFree(). Either one may
+         * be NULL if its lookup failed.
+         */
+        if (reg)
+            XFree(reg);
+        if (enc)
+            XFree(enc);
+    }
+}
+
int main(int argc, char **argv)
{
extern int pty_master_fd; /* declared in pty.c */
fprintf(stderr, "pterm: unable to load font \"%s\"\n", cfg.font);
exit(1);
}
+ set_font_info(inst, 0);
if (cfg.boldfont[0]) {
inst->fonts[1] = gdk_font_load(cfg.boldfont);
if (!inst->fonts[1]) {
cfg.boldfont);
exit(1);
}
+ set_font_info(inst, 1);
} else
inst->fonts[1] = NULL;
inst->font_height = inst->fonts[0]->ascent + inst->fonts[0]->descent;
inst->compound_text_atom = gdk_atom_intern("COMPOUND_TEXT", FALSE);
+ inst->utf8_string_atom = gdk_atom_intern("UTF8_STRING", FALSE);
init_ucs();
#ifndef PUTTY_UNIX_H
#define PUTTY_UNIX_H
+#include "charset.h"
+
typedef void *Context; /* FIXME: probably needs changing */
extern Backend pty_backend;
int first_socket(int *state, int *rwx);
int next_socket(int *state, int *rwx);
-#define DEFAULT_CODEPAGE 0 /* FIXME: no idea how to do this */
+/*
+ * In the Unix Unicode layer, DEFAULT_CODEPAGE is a special value
+ * which causes mb_to_wc and wc_to_mb to call _libc_ rather than
+ * libcharset. That way, we can interface the various charsets
+ * supported by libcharset with the one supported by mbstowcs and
+ * wcstombs (which will be the character set in which stuff read
+ * from the command line or config files is assumed to be encoded).
+ */
+#define DEFAULT_CODEPAGE 0xFFFF
+#define CP_UTF8 CS_UTF8 /* from libcharset */
#define strnicmp strncasecmp
#define stricmp strcasecmp
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
+#include <locale.h>
+#include <limits.h>
+#include <wchar.h>
#include <time.h>
+
#include "putty.h"
#include "terminal.h"
#include "misc.h"
/*
* Unix Unicode-handling routines.
- *
- * FIXME: currently trivial stub versions assuming all codepages
- * are ISO8859-1.
*/
int is_dbcs_leadbyte(int codepage, char byte)
+/*
+ * Convert a multibyte string to wide characters.
+ *
+ * When codepage is DEFAULT_CODEPAGE the C library does the work:
+ * LC_CTYPE is switched to the environment's locale for the duration
+ * of the conversion and set to "C" afterwards. At most wclen wide
+ * characters are written, and conversion stops early at the first
+ * invalid or incomplete sequence. Returns the number of wide
+ * characters stored.
+ *
+ * For any other codepage the call is passed to charset_to_unicode()
+ * and its result is returned. `flags' is not used.
+ */
int mb_to_wc(int codepage, int flags, char *mbstr, int mblen,
wchar_t *wcstr, int wclen)
{
- int ret = 0;
- while (mblen > 0 && wclen > 0) {
- *wcstr++ = (unsigned char) *mbstr++;
- mblen--, wclen--, ret++;
- }
- return ret; /* FIXME: check error codes! */
+    if (codepage == DEFAULT_CODEPAGE) {
+        int n = 0;
+        mbstate_t state = { 0 };
+
+        setlocale(LC_CTYPE, "");
+
+        /* Stop when input runs out or the output buffer is full. */
+        while (mblen > 0 && n < wclen) {
+            size_t i = mbrtowc(wcstr+n, mbstr, (size_t)mblen, &state);
+            if (i == (size_t)-1 || i == (size_t)-2)
+                break;
+            /*
+             * mbrtowc() returns 0 when it has just decoded a null
+             * wide character. Count that as one input byte used;
+             * otherwise an embedded NUL would never advance mbstr
+             * and the loop would not terminate.
+             */
+            if (i == 0)
+                i = 1;
+            n++;
+            mbstr += i;
+            mblen -= (int)i;
+        }
+
+        setlocale(LC_CTYPE, "C");
+
+        return n;
+    } else
+        return charset_to_unicode(&mbstr, &mblen, wcstr, wclen, codepage,
+                                  NULL, NULL, 0);
}
+/*
+ * Convert wide characters to a multibyte string.
+ *
+ * When codepage is DEFAULT_CODEPAGE the C library does the work:
+ * LC_CTYPE is switched to the environment's locale for the duration
+ * of the conversion and set to "C" afterwards. Conversion stops at
+ * the first character that cannot be converted or that would not
+ * fit in the mblen bytes available at mbstr. Returns the number of
+ * bytes stored.
+ *
+ * For any other codepage the call is passed to
+ * charset_from_unicode() and its result is returned.
+ *
+ * `flags' and `defchr' are not used; *defused, if supplied, is
+ * always set to 0.
+ */
int wc_to_mb(int codepage, int flags, wchar_t *wcstr, int wclen,
char *mbstr, int mblen, char *defchr, int *defused)
{
- int ret = 0;
+    /* FIXME: we should remove the defused param completely... */
if (defused)
*defused = 0;
- while (mblen > 0 && wclen > 0) {
- if (*wcstr >= 0x100) {
- if (defchr)
- *mbstr++ = *defchr;
- else
- *mbstr++ = '.';
- if (defused)
- *defused = 1;
- } else
- *mbstr++ = (unsigned char) *wcstr;
- wcstr++;
- mblen--, wclen--, ret++;
- }
- return ret; /* FIXME: check error codes! */
+
+    if (codepage == DEFAULT_CODEPAGE) {
+        char output[MB_LEN_MAX];
+        mbstate_t state = { 0 };
+        int n = 0;
+
+        setlocale(LC_CTYPE, "");
+
+        while (wclen > 0) {
+            size_t i = wcrtomb(output, wcstr[0], &state);
+            /*
+             * Stop on an unconvertible character, or when the bytes
+             * just produced exceed the space still free in mbstr,
+             * which is mblen - n. (i is at most MB_LEN_MAX, so the
+             * cast to int is safe.)
+             */
+            if (i == (size_t)-1 || (int)i > mblen - n)
+                break;
+            memcpy(mbstr+n, output, i);
+            n += (int)i;
+            wcstr++;
+            wclen--;
+        }
+
+        setlocale(LC_CTYPE, "C");
+
+        return n;
+    } else
+        return charset_from_unicode(&wcstr, &wclen, mbstr, mblen, codepage,
+                                    NULL, NULL, 0);
}
+/*
+ * Set up the Unicode translation state: assigns font_codepage and
+ * line_codepage, then fills the 256-entry tables unitab_line,
+ * unitab_xterm, unitab_scoacs and unitab_ctrl.
+ */
void init_ucs(void)
{
int i;
- /* Find the line control characters. FIXME: this is not right. */
- for (i = 0; i < 256; i++)
- if (i < ' ' || (i >= 0x7F && i < 0xA0))
- unitab_ctrl[i] = i;
+
+ /*
+ * In the platform-independent parts of the code, font_codepage
+ * is used only for system DBCS support - which we don't
+ * support at all. So we set this to something which will never
+ * be used.
+ */
+ font_codepage = -1;
+
+ /*
+ * line_codepage should be decoded from the specification in
+ * cfg. The name is tried first as a MIME charset name and then,
+ * if that is not recognised, as an X encoding name.
+ */
+ line_codepage = charset_from_mimeenc(cfg.line_codepage);
+ if (line_codepage == CS_NONE)
+ line_codepage = charset_from_xenc(cfg.line_codepage);
+ /*
+ * If it's still CS_NONE, neither lookup recognised the name.
+ * Arguably that ought to mean direct-to-font, but see below.
+ */
+
+ /* FIXME: this is a hack. Currently an unrecognised line
+ * character set is dealt with by pretending it's 8859-1. It's
+ * ugly, but it's good enough to stop things crashing. Should do
+ * something better here. */
+ if (line_codepage == CS_NONE)
+ line_codepage = CS_ISO8859_1;
+
+ /*
+ * Set up unitab_line, by translating each individual character
+ * in the line codepage into Unicode. Bytes that do not convert
+ * to exactly one character are mapped to 0xFFFD.
+ */
+ for (i = 0; i < 256; i++) {
+ char c[1], *p;
+ wchar_t wc[1];
+ int len;
+ c[0] = i;
+ p = c;
+ len = 1;
+ if (1 == charset_to_unicode(&p,&len,wc,1,line_codepage,NULL,L"",0))
+ unitab_line[i] = wc[0];
else
- unitab_ctrl[i] = 0xFF;
+ unitab_line[i] = 0xFFFD;
+ }
+ /*
+ * Set up unitab_xterm. This is the same as unitab_line except
+ * in the line-drawing regions, where it follows the Unicode
+ * encoding.
+ *
+ * (Note that the strange X encoding of line-drawing characters
+ * in the bottom 32 glyphs of ISO8859-1 fonts is taken care of
+ * by the font encoding, which will spot such a font and act as
+ * if it were in a variant encoding of ISO8859-1.)
+ */
for (i = 0; i < 256; i++) {
- unitab_line[i] = unitab_scoacs[i] = i;
- unitab_xterm[i] = (i >= 0x5F && i < 0x7F) ? ((i+1) & 0x1F) : i;
+ static const wchar_t unitab_xterm_std[32] = {
+ 0x2666, 0x2592, 0x2409, 0x240c, 0x240d, 0x240a, 0x00b0, 0x00b1,
+ 0x2424, 0x240b, 0x2518, 0x2510, 0x250c, 0x2514, 0x253c, 0x23ba,
+ 0x23bb, 0x2500, 0x23bc, 0x23bd, 0x251c, 0x2524, 0x2534, 0x252c,
+ 0x2502, 0x2264, 0x2265, 0x03c0, 0x2260, 0x00a3, 0x00b7, 0x0020
+ };
+ /* Codes 0x5F..0x7E index the table by their low five bits. */
+ if (i >= 0x5F && i < 0x7F)
+ unitab_xterm[i] = unitab_xterm_std[i & 0x1F];
+ else
+ unitab_xterm[i] = unitab_line[i];
}
+
+ /*
+ * Set up unitab_scoacs. The SCO Alternate Character Set is
+ * simply CP437.
+ */
+ for (i = 0; i < 256; i++) {
+ char c[1], *p;
+ wchar_t wc[1];
+ int len;
+ c[0] = i;
+ p = c;
+ len = 1;
+ if (1 == charset_to_unicode(&p,&len,wc,1,CS_CP437,NULL,L"",0))
+ unitab_scoacs[i] = wc[0];
+ else
+ unitab_scoacs[i] = 0xFFFD;
+ }
+
+ /*
+ * Find the line control characters: a byte is marked as a
+ * control character (unitab_ctrl[i] = i) when its Unicode value
+ * in unitab_line is below space or in the range 0x7F..0x9F;
+ * every other byte gets 0xFF.
+ */
+ for (i = 0; i < 256; i++)
+ if (unitab_line[i] < ' '
+ || (unitab_line[i] >= 0x7F && unitab_line[i] < 0xA0))
+ unitab_ctrl[i] = i;
+ else
+ unitab_ctrl[i] = 0xFF;
}