From 2dc6356a02ebe2e5c0428cefc18e64882d85b4a6 Mon Sep 17 00:00:00 2001 From: simon Date: Tue, 31 Dec 2002 12:20:34 +0000 Subject: [PATCH] First draft of Unicode support in pterm. It's pretty complete: it does UTF-8 copy and paste (falling back to normal strings if necessary), it understands X font encodings and translates things accordingly so that if you have a Unicode font you can ask for virtually any single-byte encoding and get it (Mac-Roman pterm, anyone?), and so on. There's work left to be done (wide fonts for CJK spring to mind), but I reckon this is a pretty good start. git-svn-id: svn://svn.tartarus.org/sgt/putty@2395 cda61777-01e9-0310-a592-d414129be87e --- Recipe | 5 +- charset/.cvsignore | 1 + charset/README | 11 + charset/charset.h | 120 ++++++++ charset/enum.c | 19 ++ charset/fromucs.c | 91 ++++++ charset/internal.h | 89 ++++++ charset/mimeenc.c | 209 +++++++++++++ charset/sbcs.c | 45 +++ charset/sbcs.dat | 698 ++++++++++++++++++++++++++++++++++++++++++ charset/sbcsgen.pl | 95 ++++++ charset/slookup.c | 29 ++ charset/toucs.c | 89 ++++++ charset/utf8.c | 877 +++++++++++++++++++++++++++++++++++++++++++++++++++++ charset/xenc.c | 92 ++++++ mkfiles.pl | 9 +- unix/pterm.1 | 22 ++ unix/pterm.c | 206 ++++++++++++- unix/unix.h | 13 +- unix/uxucs.c | 166 ++++++++-- 20 files changed, 2836 insertions(+), 50 deletions(-) create mode 100644 charset/.cvsignore create mode 100644 charset/README create mode 100644 charset/charset.h create mode 100644 charset/enum.c create mode 100644 charset/fromucs.c create mode 100644 charset/internal.h create mode 100644 charset/mimeenc.c create mode 100644 charset/sbcs.c create mode 100644 charset/sbcs.dat create mode 100644 charset/sbcsgen.pl create mode 100644 charset/slookup.c create mode 100644 charset/toucs.c create mode 100644 charset/utf8.c create mode 100644 charset/xenc.c diff --git a/Recipe b/Recipe index aa3e0f91..2ec14c7b 100644 --- a/Recipe +++ b/Recipe @@ -110,6 +110,9 @@ SFTP = sftp int64 logging WINMISC = misc version winstore settings tree234 winnet proxy cmdline UXMISC = misc version uxstore settings tree234 uxnet proxy cmdline +# Character set library, for use in pterm. +CHARSET = sbcsdat slookup sbcs utf8 toucs fromucs xenc mimeenc + # Standard libraries, and the same with WinSocks 1 and 2. LIBS = advapi32.lib user32.lib gdi32.lib comctl32.lib comdlg32.lib + shell32.lib winmm.lib imm32.lib winspool.lib @@ -137,7 +140,7 @@ puttygen : [G] puttygen sshrsag sshdssg sshprime sshdes sshbn sshmd5 version + sshpubk sshaes sshsh512 import winutils puttygen.res LIBS pterm : [X] pterm terminal wcwidth uxucs uxmisc tree234 misc ldisc ldiscucs - + logging uxprint settings pty be_none uxstore signal + + logging uxprint settings pty be_none uxstore signal CHARSET plink : [U] uxplink uxcons NONSSH UXSSH be_all logging UXMISC diff --git a/charset/.cvsignore b/charset/.cvsignore new file mode 100644 index 00000000..fc5615a7 --- /dev/null +++ b/charset/.cvsignore @@ -0,0 +1 @@ +sbcsdat.c diff --git a/charset/README b/charset/README new file mode 100644 index 00000000..248a8b2a --- /dev/null +++ b/charset/README @@ -0,0 +1,11 @@ +This subdirectory contains a general character-set conversion +library, used in the Unix port of PuTTY, and available for use in +other ports if it should happen to be useful. + +I intend to use this same library in other programs at some future +date. It is therefore a _strong_ design goal that this library +should remain perfectly general, and not tied to particulars of +PuTTY. It must not reference any code outside its own subdirectory; +it should not have PuTTY-specific helper routines added to it unless +they can be documented in a general manner which might make them +useful in other circumstances as well. diff --git a/charset/charset.h b/charset/charset.h new file mode 100644 index 00000000..e3067f82 --- /dev/null +++ b/charset/charset.h @@ -0,0 +1,120 @@ +/* + * charset.h - header file for general character set conversion + * routines. + */ + +#ifndef charset_charset_h +#define charset_charset_h + +#include + +/* + * Enumeration that lists all the multibyte or single-byte + * character sets known to this library. + */ +typedef enum { + CS_NONE, /* used for reporting errors, etc */ + CS_ISO8859_1, + CS_ISO8859_1_X11, /* X font encoding with VT100 glyphs */ + CS_ISO8859_2, + CS_ISO8859_3, + CS_ISO8859_4, + CS_ISO8859_5, + CS_ISO8859_6, + CS_ISO8859_7, + CS_ISO8859_8, + CS_ISO8859_9, + CS_ISO8859_10, + CS_ISO8859_11, + CS_ISO8859_13, + CS_ISO8859_14, + CS_ISO8859_15, + CS_ISO8859_16, + CS_CP437, + CS_CP850, + CS_CP1250, + CS_CP1251, + CS_CP1252, + CS_CP1253, + CS_CP1254, + CS_CP1255, + CS_CP1256, + CS_CP1257, + CS_CP1258, + CS_KOI8_R, + CS_KOI8_U, + CS_MAC_ROMAN, + CS_VISCII, + CS_HP_ROMAN8, + CS_DEC_MCS, + CS_UTF8 +} charset_t; + +typedef struct { + unsigned long s0; +} charset_state; + +/* + * Routine to convert a MB/SB character set to Unicode. + * + * This routine accepts some number of bytes, updates a state + * variable, and outputs some number of Unicode characters. There + * are no guarantees. You can't even guarantee that at most one + * Unicode character will be output per byte you feed in; for + * example, suppose you're reading UTF-8, you've seen E1 80, and + * then you suddenly see FE. Now you need to output _two_ error + * characters - one for the incomplete sequence E1 80, and one for + * the completely invalid UTF-8 byte FE. + * + * Returns the number of wide characters output; will never output + * more than the size of the buffer (as specified on input). + * Advances the `input' pointer and decrements `inlen', to indicate + * how far along the input string it got. + * + * The sequence of `errlen' wide characters pointed to by `errstr' + * will be used to indicate a conversion error. If `errstr' is + * NULL, `errlen' will be ignored, and the library will choose + * something sensible to do on its own. For Unicode, this will be + * U+FFFD (REPLACEMENT CHARACTER). + */ + +int charset_to_unicode(char **input, int *inlen, wchar_t *output, int outlen, + int charset, charset_state *state, + const wchar_t *errstr, int errlen); + +/* + * Routine to convert Unicode to an MB/SB character set. + * + * This routine accepts some number of Unicode characters, updates + * a state variable, and outputs some number of bytes. + * + * Returns the number of bytes characters output; will never output + * more than the size of the buffer (as specified on input), and + * will never output a partial MB character. Advances the `input' + * pointer and decrements `inlen', to indicate how far along the + * input string it got. + * + * The sequence of `errlen' characters pointed to by `errstr' will + * be used to indicate a conversion error. If `errstr' is NULL, + * `errlen' will be ignored, and the library will choose something + * sensible to do on its own (which will vary depending on the + * output charset). + */ + +int charset_from_unicode(wchar_t **input, int *inlen, char *output, int outlen, + int charset, charset_state *state, + const char *errstr, int errlen); + +/* + * Convert X11 encoding names to and from our charset identifiers. + */ +const char *charset_to_xenc(int charset); +int charset_from_xenc(const char *name); + +/* + * Convert MIME encoding names to and from our charset identifiers. + */ +const char *charset_to_mimeenc(int charset); +int charset_from_mimeenc(const char *name); + +#endif /* charset_charset_h */ diff --git a/charset/enum.c b/charset/enum.c new file mode 100644 index 00000000..347647d1 --- /dev/null +++ b/charset/enum.c @@ -0,0 +1,19 @@ +/* + * enum.c - enumerate all charsets defined by the library. + * + * This file maintains a list of every other source file which + * contains ENUM_CHARSET definitions. It #includes each one with + * ENUM_CHARSETS defined, which causes those source files to do + * nothing at all except call the ENUM_CHARSET macro on each + * charset they define. + * + * This file in turn is included from various other places, with + * the ENUM_CHARSET macro defined to various different things. This + * allows us to have multiple implementations of the master charset + * lookup table (a static one and a dynamic one). + */ + +#define ENUM_CHARSETS +#include "sbcsdat.c" +#include "utf8.c" +#undef ENUM_CHARSETS diff --git a/charset/fromucs.c b/charset/fromucs.c new file mode 100644 index 00000000..f08375f0 --- /dev/null +++ b/charset/fromucs.c @@ -0,0 +1,91 @@ +/* + * fromucs.c - convert Unicode to other character sets. + */ + +#include "charset.h" +#include "internal.h" + +struct charset_emit_param { + char *output; + int outlen; + const char *errstr; + int errlen; + int stopped; +}; + +static void charset_emit(void *ctx, long int output) +{ + struct charset_emit_param *param = (struct charset_emit_param *)ctx; + char outval; + char const *p; + int outlen; + + if (output == ERROR) { + p = param->errstr; + outlen = param->errlen; + } else { + outval = output; + p = &outval; + outlen = 1; + } + + if (param->outlen >= outlen) { + while (outlen > 0) { + *param->output++ = *p++; + param->outlen--; + outlen--; + } + } else { + param->stopped = 1; + } +} + +int charset_from_unicode(wchar_t **input, int *inlen, char *output, int outlen, + int charset, charset_state *state, + const char *errstr, int errlen) +{ + charset_spec const *spec = charset_find_spec(charset); + charset_state localstate; + struct charset_emit_param param; + + param.output = output; + param.outlen = outlen; + param.stopped = 0; + + /* + * charset_emit will expect a valid errstr. + */ + if (!errstr) { + /* *shrug* this is good enough, and consistent across all SBCS... */ + param.errstr = "."; + param.errlen = 1; + } + param.errstr = errstr; + param.errlen = errlen; + + if (!state) { + localstate.s0 = 0; + } else { + localstate = *state; /* structure copy */ + } + state = &localstate; + + while (*inlen > 0) { + int lenbefore = param.output - output; + spec->write(spec, **input, &localstate, charset_emit, ¶m); + if (param.stopped) { + /* + * The emit function has _tried_ to output some + * characters, but ran up against the end of the + * buffer. Leave immediately, and return what happened + * _before_ attempting to process this character. + */ + return lenbefore; + } + if (state) + *state = localstate; /* structure copy */ + (*input)++; + (*inlen)--; + } + return param.output - output; +} diff --git a/charset/internal.h b/charset/internal.h new file mode 100644 index 00000000..38df2eaa --- /dev/null +++ b/charset/internal.h @@ -0,0 +1,89 @@ +/* + * internal.h - internal header stuff for the charset library. + */ + +#ifndef charset_internal_h +#define charset_internal_h + +/* This invariably comes in handy */ +#define lenof(x) ( sizeof((x)) / sizeof(*(x)) ) + +/* This is an invalid Unicode value used to indicate an error. */ +#define ERROR 0xFFFFL /* Unicode value representing error */ + +typedef struct charset_spec charset_spec; +typedef struct sbcs_data sbcs_data; + +struct charset_spec { + int charset; /* numeric identifier */ + + /* + * A function to read the character set and output Unicode + * characters. The `emit' function expects to get Unicode chars + * passed to it; it should be sent ERROR for any encoding error + * on the input. + */ + void (*read)(charset_spec const *charset, long int input_chr, + charset_state *state, + void (*emit)(void *ctx, long int output), void *emitctx); + /* + * A function to read Unicode characters and output in this + * character set. The `emit' function expects to get byte + * values passed to it; it should be sent ERROR for any + * non-representable characters on the input. + */ + void (*write)(charset_spec const *charset, long int input_chr, + charset_state *state, + void (*emit)(void *ctx, long int output), void *emitctx); + void const *data; +}; + +/* + * This is the format of `data' used by the SBCS read and write + * functions; so it's the format used in all SBCS definitions. + */ +struct sbcs_data { + /* + * This is a simple mapping table converting each SBCS position + * to a Unicode code point. Some positions may contain ERROR, + * indicating that that byte value is not defined in the SBCS + * in question and its occurrence in input is an error. + */ + unsigned long sbcs2ucs[256]; + + /* + * This lookup table is used to convert Unicode back to the + * SBCS. It consists of the valid byte values in the SBCS, + * sorted in order of their Unicode translation. So given a + * Unicode value U, you can do a binary search on this table + * using the above table as a lookup: when testing the Xth + * position in this table, you branch according to whether + * sbcs2ucs[ucs2sbcs[X]] is less than, greater than, or equal + * to U. + * + * Note that since there may be fewer than 256 valid byte + * values in a particular SBCS, we must supply the length of + * this table as well as the contents. + */ + unsigned char ucs2sbcs[256]; + int nvalid; +}; + +/* + * Prototypes for internal library functions. + */ +charset_spec const *charset_find_spec(int charset); +void read_sbcs(charset_spec const *charset, long int input_chr, + charset_state *state, + void (*emit)(void *ctx, long int output), void *emitctx); +void write_sbcs(charset_spec const *charset, long int input_chr, + charset_state *state, + void (*emit)(void *ctx, long int output), void *emitctx); + +/* + * Placate compiler warning about unused parameters, of which we + * expect to have some in this library. + */ +#define UNUSEDARG(x) ( (x) = (x) ) + +#endif /* charset_internal_h */ diff --git a/charset/mimeenc.c b/charset/mimeenc.c new file mode 100644 index 00000000..b5b1eb99 --- /dev/null +++ b/charset/mimeenc.c @@ -0,0 +1,209 @@ +/* + * mimeenc.c - translate our internal character set codes to and + * from MIME standard character-set names. + * + */ + +#include +#include "charset.h" +#include "internal.h" + +static const struct { + const char *name; + int charset; +} mimeencs[] = { + /* + * These names are taken from + * + * http://www.iana.org/assignments/character-sets + * + * Where multiple encoding names map to the same encoding id + * (such as the variety of aliases for ISO-8859-1), the first + * is considered canonical and will be returned when + * translating the id to a string. + */ + { "ISO-8859-1", CS_ISO8859_1 }, + { "iso-ir-100", CS_ISO8859_1 }, + { "ISO_8859-1", CS_ISO8859_1 }, + { "ISO_8859-1:1987", CS_ISO8859_1 }, + { "latin1", CS_ISO8859_1 }, + { "l1", CS_ISO8859_1 }, + { "IBM819", CS_ISO8859_1 }, + { "CP819", CS_ISO8859_1 }, + { "csISOLatin1", CS_ISO8859_1 }, + + { "ISO-8859-2", CS_ISO8859_2 }, + { "ISO_8859-2:1987", CS_ISO8859_2 }, + { "iso-ir-101", CS_ISO8859_2 }, + { "ISO_8859-2", CS_ISO8859_2 }, + { "latin2", CS_ISO8859_2 }, + { "l2", CS_ISO8859_2 }, + { "csISOLatin2", CS_ISO8859_2 }, + + { "ISO-8859-3", CS_ISO8859_3 }, + { "ISO_8859-3:1988", CS_ISO8859_3 }, + { "iso-ir-109", CS_ISO8859_3 }, + { "ISO_8859-3", CS_ISO8859_3 }, + { "latin3", CS_ISO8859_3 }, + { "l3", CS_ISO8859_3 }, + { "csISOLatin3", CS_ISO8859_3 }, + + { "ISO-8859-4", CS_ISO8859_4 }, + { "ISO_8859-4:1988", CS_ISO8859_4 }, + { "iso-ir-110", CS_ISO8859_4 }, + { "ISO_8859-4", CS_ISO8859_4 }, + { "latin4", CS_ISO8859_4 }, + { "l4", CS_ISO8859_4 }, + { "csISOLatin4", CS_ISO8859_4 }, + + { "ISO-8859-5", CS_ISO8859_5 }, + { "ISO_8859-5:1988", CS_ISO8859_5 }, + { "iso-ir-144", CS_ISO8859_5 }, + { "ISO_8859-5", CS_ISO8859_5 }, + { "cyrillic", CS_ISO8859_5 }, + { "csISOLatinCyrillic", CS_ISO8859_5 }, + + { "ISO-8859-6", CS_ISO8859_6 }, + { "ISO_8859-6:1987", CS_ISO8859_6 }, + { "iso-ir-127", CS_ISO8859_6 }, + { "ISO_8859-6", CS_ISO8859_6 }, + { "ECMA-114", CS_ISO8859_6 }, + { "ASMO-708", CS_ISO8859_6 }, + { "arabic", CS_ISO8859_6 }, + { "csISOLatinArabic", CS_ISO8859_6 }, + + { "ISO-8859-7", CS_ISO8859_7 }, + { "ISO_8859-7:1987", CS_ISO8859_7 }, + { "iso-ir-126", CS_ISO8859_7 }, + { "ISO_8859-7", CS_ISO8859_7 }, + { "ELOT_928", CS_ISO8859_7 }, + { "ECMA-118", CS_ISO8859_7 }, + { "greek", CS_ISO8859_7 }, + { "greek8", CS_ISO8859_7 }, + { "csISOLatinGreek", CS_ISO8859_7 }, + + { "ISO-8859-8", CS_ISO8859_8 }, + { "ISO_8859-8:1988", CS_ISO8859_8 }, + { "iso-ir-138", CS_ISO8859_8 }, + { "ISO_8859-8", CS_ISO8859_8 }, + { "hebrew", CS_ISO8859_8 }, + { "csISOLatinHebrew", CS_ISO8859_8 }, + + { "ISO-8859-9", CS_ISO8859_9 }, + { "ISO_8859-9:1989", CS_ISO8859_9 }, + { "iso-ir-148", CS_ISO8859_9 }, + { "ISO_8859-9", CS_ISO8859_9 }, + { "latin5", CS_ISO8859_9 }, + { "l5", CS_ISO8859_9 }, + { "csISOLatin5", CS_ISO8859_9 }, + + { "ISO-8859-10", CS_ISO8859_10 }, + { "iso-ir-157", CS_ISO8859_10 }, + { "l6", CS_ISO8859_10 }, + { "ISO_8859-10:1992", CS_ISO8859_10 }, + { "csISOLatin6", CS_ISO8859_10 }, + { "latin6", CS_ISO8859_10 }, + + { "ISO-8859-13", CS_ISO8859_13 }, + + { "ISO-8859-14", CS_ISO8859_14 }, + { "iso-ir-199", CS_ISO8859_14 }, + { "ISO_8859-14:1998", CS_ISO8859_14 }, + { "ISO_8859-14", CS_ISO8859_14 }, + { "latin8", CS_ISO8859_14 }, + { "iso-celtic", CS_ISO8859_14 }, + { "l8", CS_ISO8859_14 }, + + { "ISO-8859-15", CS_ISO8859_15 }, + { "ISO_8859-15", CS_ISO8859_15 }, + { "Latin-9", CS_ISO8859_15 }, + + { "ISO-8859-16", CS_ISO8859_16 }, + { "iso-ir-226", CS_ISO8859_16 }, + { "ISO_8859-16", CS_ISO8859_16 }, + { "ISO_8859-16:2001", CS_ISO8859_16 }, + { "latin10", CS_ISO8859_16 }, + { "l10", CS_ISO8859_16 }, + + { "IBM437", CS_CP437 }, + { "cp437", CS_CP437 }, + { "437", CS_CP437 }, + { "csPC8CodePage437", CS_CP437 }, + + { "IBM850", CS_CP850 }, + { "cp850", CS_CP850 }, + { "850", CS_CP850 }, + { "csPC850Multilingual", CS_CP850 }, + + { "windows-1250", CS_CP1250 }, + + { "windows-1251", CS_CP1251 }, + + { "windows-1252", CS_CP1252 }, + + { "windows-1253", CS_CP1253 }, + + { "windows-1254", CS_CP1254 }, + + { "windows-1255", CS_CP1255 }, + + { "windows-1256", CS_CP1256 }, + + { "windows-1257", CS_CP1257 }, + + { "windows-1258", CS_CP1258 }, + + { "KOI8-R", CS_KOI8_R }, + { "csKOI8R", CS_KOI8_R }, + + { "KOI8-U", CS_KOI8_U }, + + { "macintosh", CS_MAC_ROMAN }, + { "mac", CS_MAC_ROMAN }, + { "csMacintosh", CS_MAC_ROMAN }, + + { "VISCII", CS_VISCII }, + { "csVISCII", CS_VISCII }, + + { "hp-roman8", CS_HP_ROMAN8 }, + { "roman8", CS_HP_ROMAN8 }, + { "r8", CS_HP_ROMAN8 }, + { "csHPRoman8", CS_HP_ROMAN8 }, + + { "DEC-MCS", CS_DEC_MCS }, + { "dec", CS_DEC_MCS }, + { "csDECMCS", CS_DEC_MCS }, + + { "UTF-8", CS_UTF8 }, +}; + +const char *charset_to_mimeenc(int charset) +{ + int i; + + for (i = 0; i < (int)lenof(mimeencs); i++) + if (charset == mimeencs[i].charset) + return mimeencs[i].name; + + return NULL; /* not found */ +} + +int charset_from_mimeenc(const char *name) +{ + int i; + + for (i = 0; i < (int)lenof(mimeencs); i++) { + const char *p, *q; + p = name; + q = mimeencs[i].name; + while (*p || *q) { + if (tolower(*p) != tolower(*q)) + break; + p++; q++; + } + if (!*p && !*q) + return mimeencs[i].charset; + } + + return CS_NONE; /* not found */ +} diff --git a/charset/sbcs.c b/charset/sbcs.c new file mode 100644 index 00000000..a71e5d84 --- /dev/null +++ b/charset/sbcs.c @@ -0,0 +1,45 @@ +/* + * sbcs.c - routines to handle single-byte character sets. + */ + +#include "charset.h" +#include "internal.h" + +/* + * The charset_spec for any single-byte character set should + * provide read_sbcs() as its read function, and its `data' field + * should be a wchar_t string constant containing the 256 entries + * of the translation table. + */ + +void read_sbcs(charset_spec const *charset, long int input_chr, + charset_state *state, + void (*emit)(void *ctx, long int output), void *emitctx) +{ + wchar_t const *table = (wchar_t const *)charset->data; + + UNUSEDARG(state); + + emit(emitctx, table[input_chr]); +} + +void write_sbcs(charset_spec const *charset, long int input_chr, + charset_state *state, + void (*emit)(void *ctx, long int output), void *emitctx) +{ + wchar_t const *table = (wchar_t const *)charset->data; + int i; + + UNUSEDARG(state); + + /* + * FIXME: this should work, but it's ludicrously inefficient. + * We should be using the ucs2sbcs table. + */ + for (i = 0; i < 256; i++) + if (table[i] == input_chr) { + emit(emitctx, i); + return; + } + emit(emitctx, ERROR); +} diff --git a/charset/sbcs.dat b/charset/sbcs.dat new file mode 100644 index 00000000..2071717f --- /dev/null +++ b/charset/sbcs.dat @@ -0,0 +1,698 @@ + Data file defining single-byte character sets. + + All lines which begin with whitespace are considered comments. + + To generate an SBCS table from a unicode.org mapping table: + + gensbcs() { + wget -q -O - "$1" | tr '\r' '\n' | \ + perl -ne '/^(0x.*)\s+(0x.*)\s+/ and $a[hex $1]=sprintf "%04x", hex $2;' \ + -e 'BEGIN{for($i=0;$i<256;$i++){$a[$i]="XXXX";' \ + -e ' if ($i < 32 or $i == 127) {$a[$i]=sprintf "%04x", $i}}}' \ + -e 'END{for($i=0;$i<256;$i++){printf"%s%s",$a[$i],$i%16==15?"\n":" "}}' + } + + (A couple of noteworthy ickinesses here. For a start, any + undefined characters in the control-code regions (00-1F and 7F) + are assumed to be the Unicode code point corresponding to their + index, since the Mac Roman mapping table declines to define them + but realistically you don't want to be messing with that sort of + thing. Secondly, the Mac mapping tables are shipped with Mac line + endings, so note the `tr' to turn them into something legible to + Perl...) + + Here are the ISO-8859-x tables, generated by this piece of Bourne + shell: + + for i in 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16; do + echo charset CS_ISO8859_$i + gensbcs http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-$i.TXT + echo + done + +charset CS_ISO8859_1 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f +0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f +00a0 00a1 00a2 00a3 00a4 00a5 00a6 00a7 00a8 00a9 00aa 00ab 00ac 00ad 00ae 00af +00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 00ba 00bb 00bc 00bd 00be 00bf +00c0 00c1 00c2 00c3 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf +00d0 00d1 00d2 00d3 00d4 00d5 00d6 00d7 00d8 00d9 00da 00db 00dc 00dd 00de 00df +00e0 00e1 00e2 00e3 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef +00f0 00f1 00f2 00f3 00f4 00f5 00f6 00f7 00f8 00f9 00fa 00fb 00fc 00fd 00fe 00ff + +charset CS_ISO8859_2 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f +0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f +00a0 0104 02d8 0141 00a4 013d 015a 00a7 00a8 0160 015e 0164 0179 00ad 017d 017b +00b0 0105 02db 0142 00b4 013e 015b 02c7 00b8 0161 015f 0165 017a 02dd 017e 017c +0154 00c1 00c2 0102 00c4 0139 0106 00c7 010c 00c9 0118 00cb 011a 00cd 00ce 010e +0110 0143 0147 00d3 00d4 0150 00d6 00d7 0158 016e 00da 0170 00dc 00dd 0162 00df +0155 00e1 00e2 0103 00e4 013a 0107 00e7 010d 00e9 0119 00eb 011b 00ed 00ee 010f +0111 0144 0148 00f3 00f4 0151 00f6 00f7 0159 016f 00fa 0171 00fc 00fd 0163 02d9 + +charset CS_ISO8859_3 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f +0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f +00a0 0126 02d8 00a3 00a4 XXXX 0124 00a7 00a8 0130 015e 011e 0134 00ad XXXX 017b +00b0 0127 00b2 00b3 00b4 00b5 0125 00b7 00b8 0131 015f 011f 0135 00bd XXXX 017c +00c0 00c1 00c2 XXXX 00c4 010a 0108 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf +XXXX 00d1 00d2 00d3 00d4 0120 00d6 00d7 011c 00d9 00da 00db 00dc 016c 015c 00df +00e0 00e1 00e2 XXXX 00e4 010b 0109 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef +XXXX 00f1 00f2 00f3 00f4 0121 00f6 00f7 011d 00f9 00fa 00fb 00fc 016d 015d 02d9 + +charset CS_ISO8859_4 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f +0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f +00a0 0104 0138 0156 00a4 0128 013b 00a7 00a8 0160 0112 0122 0166 00ad 017d 00af +00b0 0105 02db 0157 00b4 0129 013c 02c7 00b8 0161 0113 0123 0167 014a 017e 014b +0100 00c1 00c2 00c3 00c4 00c5 00c6 012e 010c 00c9 0118 00cb 0116 00cd 00ce 012a +0110 0145 014c 0136 00d4 00d5 00d6 00d7 00d8 0172 00da 00db 00dc 0168 016a 00df +0101 00e1 00e2 00e3 00e4 00e5 00e6 012f 010d 00e9 0119 00eb 0117 00ed 00ee 012b +0111 0146 014d 0137 00f4 00f5 00f6 00f7 00f8 0173 00fa 00fb 00fc 0169 016b 02d9 + +charset CS_ISO8859_5 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f +0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f +00a0 0401 0402 0403 0404 0405 0406 0407 0408 0409 040a 040b 040c 00ad 040e 040f +0410 0411 0412 0413 0414 0415 0416 0417 0418 0419 041a 041b 041c 041d 041e 041f +0420 0421 0422 0423 0424 0425 0426 0427 0428 0429 042a 042b 042c 042d 042e 042f +0430 0431 0432 0433 0434 0435 0436 0437 0438 0439 043a 043b 043c 043d 043e 043f +0440 0441 0442 0443 0444 0445 0446 0447 0448 0449 044a 044b 044c 044d 044e 044f +2116 0451 0452 0453 0454 0455 0456 0457 0458 0459 045a 045b 045c 00a7 045e 045f + +charset CS_ISO8859_6 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f +0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f +00a0 XXXX XXXX XXXX 00a4 XXXX XXXX XXXX XXXX XXXX XXXX XXXX 060c 00ad XXXX XXXX +XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX 061b XXXX XXXX XXXX 061f +XXXX 0621 0622 0623 0624 0625 0626 0627 0628 0629 062a 062b 062c 062d 062e 062f +0630 0631 0632 0633 0634 0635 0636 0637 0638 0639 063a XXXX XXXX XXXX XXXX XXXX +0640 0641 0642 0643 0644 0645 0646 0647 0648 0649 064a 064b 064c 064d 064e 064f +0650 0651 0652 XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX + +charset CS_ISO8859_7 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f +0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f +00a0 2018 2019 00a3 XXXX XXXX 00a6 00a7 00a8 00a9 XXXX 00ab 00ac 00ad XXXX 2015 +00b0 00b1 00b2 00b3 0384 0385 0386 00b7 0388 0389 038a 00bb 038c 00bd 038e 038f +0390 0391 0392 0393 0394 0395 0396 0397 0398 0399 039a 039b 039c 039d 039e 039f +03a0 03a1 XXXX 03a3 03a4 03a5 03a6 03a7 03a8 03a9 03aa 03ab 03ac 03ad 03ae 03af +03b0 03b1 03b2 03b3 03b4 03b5 03b6 03b7 03b8 03b9 03ba 03bb 03bc 03bd 03be 03bf +03c0 03c1 03c2 03c3 03c4 03c5 03c6 03c7 03c8 03c9 03ca 03cb 03cc 03cd 03ce XXXX + +charset CS_ISO8859_8 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f +0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f +00a0 XXXX 00a2 00a3 00a4 00a5 00a6 00a7 00a8 00a9 00d7 00ab 00ac 00ad 00ae 00af +00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 00f7 00bb 00bc 00bd 00be XXXX +XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX +XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX 2017 +05d0 05d1 05d2 05d3 05d4 05d5 05d6 05d7 05d8 05d9 05da 05db 05dc 05dd 05de 05df +05e0 05e1 05e2 05e3 05e4 05e5 05e6 05e7 05e8 05e9 05ea XXXX XXXX 200e 200f XXXX + +charset CS_ISO8859_9 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f +0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f +00a0 00a1 00a2 00a3 00a4 00a5 00a6 00a7 00a8 00a9 00aa 00ab 00ac 00ad 00ae 00af +00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 00ba 00bb 00bc 00bd 00be 00bf +00c0 00c1 00c2 00c3 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf +011e 00d1 00d2 00d3 00d4 00d5 00d6 00d7 00d8 00d9 00da 00db 00dc 0130 015e 00df +00e0 00e1 00e2 00e3 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef +011f 00f1 00f2 00f3 00f4 00f5 00f6 00f7 00f8 00f9 00fa 00fb 00fc 0131 015f 00ff + +charset CS_ISO8859_10 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f +0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f +00a0 0104 0112 0122 012a 0128 0136 00a7 013b 0110 0160 0166 017d 00ad 016a 014a +00b0 0105 0113 0123 012b 0129 0137 00b7 013c 0111 0161 0167 017e 2015 016b 014b +0100 00c1 00c2 00c3 00c4 00c5 00c6 012e 010c 00c9 0118 00cb 0116 00cd 00ce 00cf +00d0 0145 014c 00d3 00d4 00d5 00d6 0168 00d8 0172 00da 00db 00dc 00dd 00de 00df +0101 00e1 00e2 00e3 00e4 00e5 00e6 012f 010d 00e9 0119 00eb 0117 00ed 00ee 00ef +00f0 0146 014d 00f3 00f4 00f5 00f6 0169 00f8 0173 00fa 00fb 00fc 00fd 00fe 0138 + +charset CS_ISO8859_11 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f +0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f +00a0 0e01 0e02 0e03 0e04 0e05 0e06 0e07 0e08 0e09 0e0a 0e0b 0e0c 0e0d 0e0e 0e0f +0e10 0e11 0e12 0e13 0e14 0e15 0e16 0e17 0e18 0e19 0e1a 0e1b 0e1c 0e1d 0e1e 0e1f +0e20 0e21 0e22 0e23 0e24 0e25 0e26 0e27 0e28 0e29 0e2a 0e2b 0e2c 0e2d 0e2e 0e2f +0e30 0e31 0e32 0e33 0e34 0e35 0e36 0e37 0e38 0e39 0e3a XXXX XXXX XXXX XXXX 0e3f +0e40 0e41 0e42 0e43 0e44 0e45 0e46 0e47 0e48 0e49 0e4a 0e4b 0e4c 0e4d 0e4e 0e4f +0e50 0e51 0e52 0e53 0e54 0e55 0e56 0e57 0e58 0e59 0e5a 0e5b XXXX XXXX XXXX XXXX + +charset CS_ISO8859_13 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f +0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f +00a0 201d 00a2 00a3 00a4 201e 00a6 00a7 00d8 00a9 0156 00ab 00ac 00ad 00ae 00c6 +00b0 00b1 00b2 00b3 201c 00b5 00b6 00b7 00f8 00b9 0157 00bb 00bc 00bd 00be 00e6 +0104 012e 0100 0106 00c4 00c5 0118 0112 010c 00c9 0179 0116 0122 0136 012a 013b +0160 0143 0145 00d3 014c 00d5 00d6 00d7 0172 0141 015a 016a 00dc 017b 017d 00df +0105 012f 0101 0107 00e4 00e5 0119 0113 010d 00e9 017a 0117 0123 0137 012b 013c +0161 0144 0146 00f3 014d 00f5 00f6 00f7 0173 0142 015b 016b 00fc 017c 017e 2019 + +charset CS_ISO8859_14 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f +0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f +00a0 1e02 1e03 00a3 010a 010b 1e0a 00a7 1e80 00a9 1e82 1e0b 1ef2 00ad 00ae 0178 +1e1e 1e1f 0120 0121 1e40 1e41 00b6 1e56 1e81 1e57 1e83 1e60 1ef3 1e84 1e85 1e61 +00c0 00c1 00c2 00c3 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf +0174 00d1 00d2 00d3 00d4 00d5 00d6 1e6a 00d8 00d9 00da 00db 00dc 00dd 0176 00df +00e0 00e1 00e2 00e3 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef +0175 00f1 00f2 00f3 00f4 00f5 00f6 1e6b 00f8 00f9 00fa 00fb 00fc 00fd 0177 00ff + +charset CS_ISO8859_15 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f +0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f +00a0 00a1 00a2 00a3 20ac 00a5 0160 00a7 0161 00a9 00aa 00ab 00ac 00ad 00ae 00af +00b0 00b1 00b2 00b3 017d 00b5 00b6 00b7 017e 00b9 00ba 00bb 0152 0153 0178 00bf +00c0 00c1 00c2 00c3 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf +00d0 00d1 00d2 00d3 00d4 00d5 00d6 00d7 00d8 00d9 00da 00db 00dc 00dd 00de 00df +00e0 00e1 00e2 00e3 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef +00f0 00f1 00f2 00f3 00f4 00f5 00f6 00f7 00f8 00f9 00fa 00fb 00fc 00fd 00fe 00ff + +charset CS_ISO8859_16 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f +0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f +00a0 0104 0105 0141 20ac 201e 0160 00a7 0161 00a9 0218 00ab 0179 00ad 017a 017b +00b0 00b1 010c 0142 017d 201d 00b6 00b7 017e 010d 0219 00bb 0152 0153 0178 017c +00c0 00c1 00c2 0102 00c4 0106 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf +0110 0143 00d2 00d3 00d4 0150 00d6 015a 0170 00d9 00da 00db 00dc 0118 021a 00df +00e0 00e1 00e2 0103 00e4 0107 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef +0111 0144 00f2 00f3 00f4 0151 00f6 015b 0171 00f9 00fa 00fb 00fc 0119 021b 00ff + + Some X fonts are encoded in a variant form of ISO8859-1: + everything above 0x20 (space) is as normal, but the first 32 + characters contain the VT100 line drawing glyphs as they would + appear from positions 0x5F to 0x7E inclusive. Here is the modified + ISO8859-1 code table. + +charset CS_ISO8859_1_X11 +0020 2666 2592 2409 240c 240d 240a 00b0 00b1 2424 240b 2518 2510 250c 2514 253c +23ba 23bb 2500 23bc 23bd 251c 2524 2534 252c 2502 2264 2265 03c0 2260 00a3 00b7 +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f +0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f +00a0 00a1 00a2 00a3 00a4 00a5 00a6 00a7 00a8 00a9 00aa 00ab 00ac 00ad 00ae 00af +00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 00ba 00bb 00bc 00bd 00be 00bf +00c0 00c1 00c2 00c3 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf +00d0 00d1 00d2 00d3 00d4 00d5 00d6 00d7 00d8 00d9 00da 00db 00dc 00dd 00de 00df +00e0 00e1 00e2 00e3 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef +00f0 00f1 00f2 00f3 00f4 00f5 00f6 00f7 00f8 00f9 00fa 00fb 00fc 00fd 00fe 00ff + + Here are some PC (old DOS) code pages, generated by this piece of + Bourne shell: + + for i in 437 850; do + echo charset CS_CP$i + gensbcs http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP$i.TXT + echo + done + +charset CS_CP437 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +00c7 00fc 00e9 00e2 00e4 00e0 00e5 00e7 00ea 00eb 00e8 00ef 00ee 00ec 00c4 00c5 +00c9 00e6 00c6 00f4 00f6 00f2 00fb 00f9 00ff 00d6 00dc 00a2 00a3 00a5 20a7 0192 +00e1 00ed 00f3 00fa 00f1 00d1 00aa 00ba 00bf 2310 00ac 00bd 00bc 00a1 00ab 00bb +2591 2592 2593 2502 2524 2561 2562 2556 2555 2563 2551 2557 255d 255c 255b 2510 +2514 2534 252c 251c 2500 253c 255e 255f 255a 2554 2569 2566 2560 2550 256c 2567 +2568 2564 2565 2559 2558 2552 2553 256b 256a 2518 250c 2588 2584 258c 2590 2580 +03b1 00df 0393 03c0 03a3 03c3 00b5 03c4 03a6 0398 03a9 03b4 221e 03c6 03b5 2229 +2261 00b1 2265 2264 2320 2321 00f7 2248 00b0 2219 00b7 221a 207f 00b2 25a0 00a0 + +charset CS_CP850 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +00c7 00fc 00e9 00e2 00e4 00e0 00e5 00e7 00ea 00eb 00e8 00ef 00ee 00ec 00c4 00c5 +00c9 00e6 00c6 00f4 00f6 00f2 00fb 00f9 00ff 00d6 00dc 00f8 00a3 00d8 00d7 0192 +00e1 00ed 00f3 00fa 00f1 00d1 00aa 00ba 00bf 00ae 00ac 00bd 00bc 00a1 00ab 00bb +2591 2592 2593 2502 2524 00c1 00c2 00c0 00a9 2563 2551 2557 255d 00a2 00a5 2510 +2514 2534 252c 251c 2500 253c 00e3 00c3 255a 2554 2569 2566 2560 2550 256c 00a4 +00f0 00d0 00ca 00cb 00c8 0131 00cd 00ce 00cf 2518 250c 2588 2584 00a6 00cc 2580 +00d3 00df 00d4 00d2 00f5 00d5 00b5 00fe 00de 00da 00db 00d9 00fd 00dd 00af 00b4 +00ad 00b1 2017 00be 00b6 00a7 00f7 00b8 00b0 00a8 00b7 00b9 00b3 00b2 25a0 00a0 + + Here are some Windows code pages, generated by this piece of + Bourne shell: + + for i in 1250 1251 1252 1253 1254 1255 1256 1257 1258; do + echo charset CS_CP$i + gensbcs http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP$i.TXT + echo + done + +charset CS_CP1250 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +20ac XXXX 201a XXXX 201e 2026 2020 2021 XXXX 2030 0160 2039 015a 0164 017d 0179 +XXXX 2018 2019 201c 201d 2022 2013 2014 XXXX 2122 0161 203a 015b 0165 017e 017a +00a0 02c7 02d8 0141 00a4 0104 00a6 00a7 00a8 00a9 015e 00ab 00ac 00ad 00ae 017b +00b0 00b1 02db 0142 00b4 00b5 00b6 00b7 00b8 0105 015f 00bb 013d 02dd 013e 017c +0154 00c1 00c2 0102 00c4 0139 0106 00c7 010c 00c9 0118 00cb 011a 00cd 00ce 010e +0110 0143 0147 00d3 00d4 0150 00d6 00d7 0158 016e 00da 0170 00dc 00dd 0162 00df +0155 00e1 00e2 0103 00e4 013a 0107 00e7 010d 00e9 0119 00eb 011b 00ed 00ee 010f +0111 0144 0148 00f3 00f4 0151 00f6 00f7 0159 016f 00fa 0171 00fc 00fd 0163 02d9 + +charset CS_CP1251 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +0402 0403 201a 0453 201e 2026 2020 2021 20ac 2030 0409 2039 040a 040c 040b 040f +0452 2018 2019 201c 201d 2022 2013 2014 XXXX 2122 0459 203a 045a 045c 045b 045f +00a0 040e 045e 0408 00a4 0490 00a6 00a7 0401 00a9 0404 00ab 00ac 00ad 00ae 0407 +00b0 00b1 0406 0456 0491 00b5 00b6 00b7 0451 2116 0454 00bb 0458 0405 0455 0457 +0410 0411 0412 0413 0414 0415 0416 0417 0418 0419 041a 041b 041c 041d 041e 041f +0420 0421 0422 0423 0424 0425 0426 0427 0428 0429 042a 042b 042c 042d 042e 042f +0430 0431 0432 0433 0434 0435 0436 0437 0438 0439 043a 043b 043c 043d 043e 043f +0440 0441 0442 0443 0444 0445 0446 0447 0448 0449 044a 044b 044c 044d 044e 044f + +charset CS_CP1252 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +20ac XXXX 201a 0192 201e 2026 2020 2021 02c6 2030 0160 2039 0152 XXXX 017d XXXX +XXXX 2018 2019 201c 201d 2022 2013 2014 02dc 2122 0161 203a 0153 XXXX 017e 0178 +00a0 00a1 00a2 00a3 00a4 00a5 00a6 00a7 00a8 00a9 00aa 00ab 00ac 00ad 00ae 00af +00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 00ba 00bb 00bc 00bd 00be 00bf +00c0 00c1 00c2 00c3 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf +00d0 00d1 00d2 00d3 00d4 00d5 00d6 00d7 00d8 00d9 00da 00db 00dc 00dd 00de 00df +00e0 00e1 00e2 00e3 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef +00f0 00f1 00f2 00f3 00f4 00f5 00f6 00f7 00f8 00f9 00fa 00fb 00fc 00fd 00fe 00ff + +charset CS_CP1253 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +20ac XXXX 201a 0192 201e 2026 2020 2021 XXXX 2030 XXXX 2039 XXXX XXXX XXXX XXXX +XXXX 2018 2019 201c 201d 2022 2013 2014 XXXX 2122 XXXX 203a XXXX XXXX XXXX XXXX +00a0 0385 0386 00a3 00a4 00a5 00a6 00a7 00a8 00a9 XXXX 00ab 00ac 00ad 00ae 2015 +00b0 00b1 00b2 00b3 0384 00b5 00b6 00b7 0388 0389 038a 00bb 038c 00bd 038e 038f +0390 0391 0392 0393 0394 0395 0396 0397 0398 0399 039a 039b 039c 039d 039e 039f +03a0 03a1 XXXX 03a3 03a4 03a5 03a6 03a7 03a8 03a9 03aa 03ab 03ac 03ad 03ae 03af +03b0 03b1 03b2 03b3 03b4 03b5 03b6 03b7 03b8 03b9 03ba 03bb 03bc 03bd 03be 03bf +03c0 03c1 03c2 03c3 03c4 03c5 03c6 03c7 03c8 03c9 03ca 03cb 03cc 03cd 03ce XXXX + +charset CS_CP1254 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +20ac XXXX 201a 0192 201e 2026 2020 2021 02c6 2030 0160 2039 0152 XXXX XXXX XXXX +XXXX 2018 2019 201c 201d 2022 2013 2014 02dc 2122 0161 203a 0153 XXXX XXXX 0178 +00a0 00a1 00a2 00a3 00a4 00a5 00a6 00a7 00a8 00a9 00aa 00ab 00ac 00ad 00ae 00af +00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 00ba 00bb 00bc 00bd 00be 00bf +00c0 00c1 00c2 00c3 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf +011e 00d1 00d2 00d3 00d4 00d5 00d6 00d7 00d8 00d9 00da 00db 00dc 0130 015e 00df +00e0 00e1 00e2 00e3 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef +011f 00f1 00f2 00f3 00f4 00f5 00f6 00f7 00f8 00f9 00fa 00fb 00fc 0131 015f 00ff + +charset CS_CP1255 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +20ac XXXX 201a 0192 201e 2026 2020 2021 02c6 2030 XXXX 2039 XXXX XXXX XXXX XXXX +XXXX 2018 2019 201c 201d 2022 2013 2014 02dc 2122 XXXX 203a XXXX XXXX XXXX XXXX +00a0 00a1 00a2 00a3 20aa 00a5 00a6 00a7 00a8 00a9 00d7 00ab 00ac 00ad 00ae 00af +00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 00f7 00bb 00bc 00bd 00be 00bf +05b0 05b1 05b2 05b3 05b4 05b5 05b6 05b7 05b8 05b9 XXXX 05bb 05bc 05bd 05be 05bf +05c0 05c1 05c2 05c3 05f0 05f1 05f2 05f3 05f4 XXXX XXXX XXXX XXXX XXXX XXXX XXXX +05d0 05d1 05d2 05d3 05d4 05d5 05d6 05d7 05d8 05d9 05da 05db 05dc 05dd 05de 05df +05e0 05e1 05e2 05e3 05e4 05e5 05e6 05e7 05e8 05e9 05ea XXXX XXXX 200e 200f XXXX + +charset CS_CP1256 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +20ac 067e 201a 0192 201e 2026 2020 2021 02c6 2030 0679 2039 0152 0686 0698 0688 +06af 2018 2019 201c 201d 2022 2013 2014 06a9 2122 0691 203a 0153 200c 200d 06ba +00a0 060c 00a2 00a3 00a4 00a5 00a6 00a7 00a8 00a9 06be 00ab 00ac 00ad 00ae 00af +00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 061b 00bb 00bc 00bd 00be 061f +06c1 0621 0622 0623 0624 0625 0626 0627 0628 0629 062a 062b 062c 062d 062e 062f +0630 0631 0632 0633 0634 0635 0636 00d7 0637 0638 0639 063a 0640 0641 0642 0643 +00e0 0644 00e2 0645 0646 0647 0648 00e7 00e8 00e9 00ea 00eb 0649 064a 00ee 00ef +064b 064c 064d 064e 00f4 064f 0650 00f7 0651 00f9 0652 00fb 00fc 200e 200f 06d2 + +charset CS_CP1257 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +20ac XXXX 201a XXXX 201e 2026 2020 2021 XXXX 2030 XXXX 2039 XXXX 00a8 02c7 00b8 +XXXX 2018 2019 201c 201d 2022 2013 2014 XXXX 2122 XXXX 203a XXXX 00af 02db XXXX +00a0 XXXX 00a2 00a3 00a4 XXXX 00a6 00a7 00d8 00a9 0156 00ab 00ac 00ad 00ae 00c6 +00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00f8 00b9 0157 00bb 00bc 00bd 00be 00e6 +0104 012e 0100 0106 00c4 00c5 0118 0112 010c 00c9 0179 0116 0122 0136 012a 013b +0160 0143 0145 00d3 014c 00d5 00d6 00d7 0172 0141 015a 016a 00dc 017b 017d 00df +0105 012f 0101 0107 00e4 00e5 0119 0113 010d 00e9 017a 0117 0123 0137 012b 013c +0161 0144 0146 00f3 014d 00f5 00f6 00f7 0173 0142 015b 016b 00fc 017c 017e 02d9 + +charset CS_CP1258 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +20ac XXXX 201a 0192 201e 2026 2020 2021 02c6 2030 XXXX 2039 0152 XXXX XXXX XXXX +XXXX 2018 2019 201c 201d 2022 2013 2014 02dc 2122 XXXX 203a 0153 XXXX XXXX 0178 +00a0 00a1 00a2 00a3 00a4 00a5 00a6 00a7 00a8 00a9 00aa 00ab 00ac 00ad 00ae 00af +00b0 00b1 00b2 00b3 00b4 00b5 00b6 00b7 00b8 00b9 00ba 00bb 00bc 00bd 00be 00bf +00c0 00c1 00c2 0102 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 0300 00cd 00ce 00cf +0110 00d1 0309 00d3 00d4 01a0 00d6 00d7 00d8 00d9 00da 00db 00dc 01af 0303 00df +00e0 00e1 00e2 0103 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 0301 00ed 00ee 00ef +0111 00f1 0323 00f3 00f4 01a1 00f6 00f7 00f8 00f9 00fa 00fb 00fc 01b0 20ab 00ff + + KOI8-R, generated by this code: + + { echo charset CS_KOI8_R; + gensbcs http://www.unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-R.TXT; } + +charset CS_KOI8_R +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +2500 2502 250c 2510 2514 2518 251c 2524 252c 2534 253c 2580 2584 2588 258c 2590 +2591 2592 2593 2320 25a0 2219 221a 2248 2264 2265 00a0 2321 00b0 00b2 00b7 00f7 +2550 2551 2552 0451 2553 2554 2555 2556 2557 2558 2559 255a 255b 255c 255d 255e +255f 2560 2561 0401 2562 2563 2564 2565 2566 2567 2568 2569 256a 256b 256c 00a9 +044e 0430 0431 0446 0434 0435 0444 0433 0445 0438 0439 043a 043b 043c 043d 043e +043f 044f 0440 0441 0442 0443 0436 0432 044c 044b 0437 0448 044d 0449 0447 044a +042e 0410 0411 0426 0414 0415 0424 0413 0425 0418 0419 041a 041b 041c 041d 041e +041f 042f 0420 0421 0422 0423 0416 0412 042c 042b 0417 0428 042d 0429 0427 042a + + KOI8-U: I can't find an easily machine-processable mapping table + for this one, so I've created it by hand-editing the KOI8-R + mapping table in accordance with the list of differences specified + in RFC2319. Note that RFC2319 has an apparent error: position B4 + is listed as U+0404 in the main character set list, but as U+0403 + in Appendix A (differences from KOI8-R). Both agree that it should + be CYRILLIC CAPITAL LETTER UKRAINIAN IE, however, and the Unicode + character database says that therefore U+0404 is the correct value. + +charset CS_KOI8_U +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +2500 2502 250c 2510 2514 2518 251c 2524 252c 2534 253c 2580 2584 2588 258c 2590 +2591 2592 2593 2320 25a0 2219 221a 2248 2264 2265 00a0 2321 00b0 00b2 00b7 00f7 +2550 2551 2552 0451 0454 2554 0456 0457 2557 2558 2559 255a 255b 0491 255d 255e +255f 2560 2561 0401 0404 2563 0406 0407 2566 2567 2568 2569 256a 0490 256c 00a9 +044e 0430 0431 0446 0434 0435 0444 0433 0445 0438 0439 043a 043b 043c 043d 043e +043f 044f 0440 0441 0442 0443 0436 0432 044c 044b 0437 0448 044d 0449 0447 044a +042e 0410 0411 0426 0414 0415 0424 0413 0425 0418 0419 041a 041b 041c 041d 041e +041f 042f 0420 0421 0422 0423 0416 0412 042c 042b 0417 0428 042d 0429 0427 042a + + Mac Roman, generated by this code: + + { echo charset CS_MAC_ROMAN; + gensbcs http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMAN.TXT; } + + The code point F8FF at position F0 is an interesting one. In + Unicode, it's the last of the Private Use section. The mapping + table states that it should be an Apple logo. I suppose we should + just leave it as it is; there's bound to be some software out + there that understands U+F8FF to be an Apple logo! + +charset CS_MAC_ROMAN +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +00c4 00c5 00c7 00c9 00d1 00d6 00dc 00e1 00e0 00e2 00e4 00e3 00e5 00e7 00e9 00e8 +00ea 00eb 00ed 00ec 00ee 00ef 00f1 00f3 00f2 00f4 00f6 00f5 00fa 00f9 00fb 00fc +2020 00b0 00a2 00a3 00a7 2022 00b6 00df 00ae 00a9 2122 00b4 00a8 2260 00c6 00d8 +221e 00b1 2264 2265 00a5 00b5 2202 2211 220f 03c0 222b 00aa 00ba 03a9 00e6 00f8 +00bf 00a1 00ac 221a 0192 2248 2206 00ab 00bb 2026 00a0 00c0 00c3 00d5 0152 0153 +2013 2014 201c 201d 2018 2019 00f7 25ca 00ff 0178 2044 20ac 2039 203a fb01 fb02 +2021 00b7 201a 201e 2030 00c2 00ca 00c1 00cb 00c8 00cd 00ce 00cf 00cc 00d3 00d4 +f8ff 00d2 00da 00db 00d9 0131 02c6 02dc 00af 02d8 02d9 02da 00b8 02dd 02db 02c7 + + Roman Czyborra's web site (http://czyborra.com/) has a variety of + other useful mapping tables, in a slightly different format (and + gzipped). Here's a shell/Perl function to generate an SBCS table + from a Czyborra mapping table: + + gensbcs_c() { + wget -q -O - "$1" | gzip -d | \ + perl -ne '/^=(.*)\s+U\+(.*)\s+/ and $a[hex $1]=sprintf "%04x", hex $2;' \ + -e 'BEGIN{for($i=0;$i<256;$i++){$a[$i]="XXXX";' \ + -e 'if ($i < 32 or ($i >=127 and $i < 160)) {$a[$i]=sprintf "%04x", $i}}}' \ + -e 'END{for($i=0;$i<256;$i++){printf"%s%s",$a[$i],$i%16==15?"\n":" "}}' + } + + So here we have some character sets generated from Czyborra + mapping tables: VISCII, HP-Roman8, and the DEC Multinational + Character Set. + + { echo charset CS_VISCII; + gensbcs_c http://czyborra.com/charsets/viscii.txt.gz; echo; + echo charset CS_HP_ROMAN8; + gensbcs_c http://czyborra.com/charsets/hp-roman8.txt.gz; echo; + echo charset CS_DEC_MCS; + gensbcs_c http://czyborra.com/charsets/dec-mcs.txt.gz; echo; } + +charset CS_VISCII +0000 0001 1eb2 0003 0004 1eb4 1eaa 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 1ef6 0015 0016 0017 0018 1ef8 001a 001b 001c 001d 1ef4 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +1ea0 1eae 1eb0 1eb6 1ea4 1ea6 1ea8 1eac 1ebc 1eb8 1ebe 1ec0 1ec2 1ec4 1ec6 1ed0 +1ed2 1ed4 1ed6 1ed8 1ee2 1eda 1edc 1ede 1eca 1ece 1ecc 1ec8 1ee6 0168 1ee4 1ef2 +00d5 1eaf 1eb1 1eb7 1ea5 1ea7 1ea8 1ead 1ebd 1eb9 1ebf 1ec1 1ec3 1ec5 1ec7 1ed1 +1ed3 1ed5 1ed7 1ee0 01a0 1ed9 1edd 1edf 1ecb 1ef0 1ee8 1eea 1eec 01a1 1edb 01af +00c0 00c1 00c2 00c3 1ea2 0102 1eb3 1eb5 00c8 00c9 00ca 1eba 00cc 00cd 0128 1ef3 +0110 1ee9 00d2 00d3 00d4 1ea1 1ef7 1eeb 1eed 00d9 00da 1ef9 1ef5 00dd 1ee1 01b0 +00e0 00e1 00e2 00e3 1ea3 0103 1eef 1eab 00e8 00e9 00ea 1ebb 00ec 00ed 0129 1ec9 +0111 1ef1 00f2 00f3 00f4 00f5 1ecf 1ecd 1ee5 00f9 00fa 0169 1ee7 00fd 1ee3 1eee + +charset CS_HP_ROMAN8 +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f +0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f +00a0 00c0 00c2 00c8 00ca 00cb 00ce 00cf 00b4 02cb 02c6 00a8 02dc 00d9 00db 20a4 +00af 00dd 00fd 00b0 00c7 00e7 00d1 00f1 00a1 00bf 00a4 00a3 00a5 00a7 0192 00a2 +00e2 00ea 00f4 00fb 00e1 00e9 00f3 00fa 00e0 00e8 00f2 00f9 00e4 00eb 00f6 00fc +00c5 00ee 00d8 00c6 00e5 00ed 00f8 00e6 00c4 00ec 00d6 00dc 00c9 00ef 00df 00d4 +00c1 00c3 00e3 00d0 00f0 00cd 00cc 00d3 00d2 00d5 00f5 0160 0161 00da 0178 00ff +00de 00fe 00b7 00b5 00b6 00be 2014 00bc 00bd 00aa 00ba 00ab 25a0 00bb 00b1 XXXX + +charset CS_DEC_MCS +0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f +0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f +0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f +0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f +0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f +0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 005c 005d 005e 005f +0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f +0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 007e 007f +0080 0081 0082 0083 0084 0085 0086 0087 0088 0089 008a 008b 008c 008d 008e 008f +0090 0091 0092 0093 0094 0095 0096 0097 0098 0099 009a 009b 009c 009d 009e 009f +XXXX 00a1 00a2 00a3 XXXX 00a5 XXXX 00a7 00a4 00a9 00aa 00ab XXXX XXXX XXXX XXXX +00b0 00b1 00b2 00b3 XXXX 00b5 00b6 00b7 XXXX 00b9 00ba 00bb 00bc 00bd XXXX 00bf +00c0 00c1 00c2 00c3 00c4 00c5 00c6 00c7 00c8 00c9 00ca 00cb 00cc 00cd 00ce 00cf +XXXX 00d1 00d2 00d3 00d4 00d5 00d6 0152 00d8 00d9 00da 00db 00dc 0178 XXXX 00df +00e0 00e1 00e2 00e3 00e4 00e5 00e6 00e7 00e8 00e9 00ea 00eb 00ec 00ed 00ee 00ef +XXXX 00f1 00f2 00f3 00f4 00f5 00f6 0153 00f8 00f9 00fa 00fb 00fc 00ff XXXX XXXX diff --git a/charset/sbcsgen.pl b/charset/sbcsgen.pl new file mode 100644 index 00000000..0d6ebc9c --- /dev/null +++ b/charset/sbcsgen.pl @@ -0,0 +1,95 @@ +#!/usr/bin/env perl -w + +# This script generates sbcsdat.c (the data for all the SBCSes) from its +# source form sbcs.dat. + +$infile = "sbcs.dat"; +$outfile = "sbcsdat.c"; + +open FOO, $infile; +open BAR, ">$outfile"; +select BAR; + +print "/*\n"; +print " * sbcsdat.c - data definitions for single-byte character sets.\n"; +print " *\n"; +print " * Generated by sbcsgen.pl from sbcs.dat.\n"; +print " * You should edit those files rather than editing this one.\n"; +print " */\n"; +print "\n"; +print "#ifndef ENUM_CHARSETS\n"; +print "\n"; +print "#include \"charset.h\"\n"; +print "#include \"internal.h\"\n"; +print "\n"; + +my $charsetname = undef; +my @vals = (); + +my @charsetnames = (); + +while () { + chomp; + if (/^charset (.*)$/) { + $charsetname = $1; + @vals = (); + } elsif (/^[0-9a-fA-FX]/) { + push @vals, map { $_ eq "XXXX" ? -1 : hex $_ } split / +/, $_; + if (scalar @vals > 256) { + die "$infile:$.: charset $charsetname has more than 256 values\n"; + } elsif (scalar @vals == 256) { + &outcharset($charsetname, @vals); + push @charsetnames, $charsetname; + $charsetname = undef; + @vals = (); + } + } +} + +print "#else /* ENUM_CHARSETS */\n"; +print "\n"; + +foreach $i (@charsetnames) { + print "ENUM_CHARSET($i)\n"; +} + +print "\n"; +print "#endif /* ENUM_CHARSETS */\n"; + +sub outcharset($@) { + my ($name, @vals) = @_; + my ($prefix, $i, @sorted); + + print "static const sbcs_data data_$name = {\n"; + print " {\n"; + $prefix = " "; + @sorted = (); + for ($i = 0; $i < 256; $i++) { + if ($vals[$i] < 0) { + printf "%sERROR ", $prefix; + } else { + printf "%s0x%04x", $prefix, $vals[$i]; + push @sorted, [$i, $vals[$i]]; + } + if ($i % 8 == 7) { + $prefix = ",\n "; + } else { + $prefix = ", "; + } + } + print "\n },\n {\n"; + @sorted = sort { $a->[1] <=> $b->[1] } @sorted; + $prefix = " "; + for ($i = 0; $i < scalar @sorted; $i++) { + printf "%s0x%02x", $prefix, $sorted[$i]->[0]; + if ($i % 8 == 7) { + $prefix = ",\n "; + } else { + $prefix = ", "; + } + } + printf "\n },\n %d\n", scalar @sorted; + print "};\n"; + print "const charset_spec charset_$name = {\n" . + " $name, read_sbcs, write_sbcs, &data_$name\n};\n\n"; +} diff --git a/charset/slookup.c b/charset/slookup.c new file mode 100644 index 00000000..7a4d7c28 --- /dev/null +++ b/charset/slookup.c @@ -0,0 +1,29 @@ +/* + * slookup.c - static lookup of character sets. + */ + +#include "charset.h" +#include "internal.h" + +#define ENUM_CHARSET(x) extern charset_spec const charset_##x; +#include "enum.c" +#undef ENUM_CHARSET + +static charset_spec const *const cs_table[] = { + +#define ENUM_CHARSET(x) &charset_##x, +#include "enum.c" +#undef ENUM_CHARSET + +}; + +charset_spec const *charset_find_spec(int charset) +{ + int i; + + for (i = 0; i < (int)lenof(cs_table); i++) + if (cs_table[i]->charset == charset) + return cs_table[i]; + + return NULL; +} diff --git a/charset/toucs.c b/charset/toucs.c new file mode 100644 index 00000000..4a6c12f6 --- /dev/null +++ b/charset/toucs.c @@ -0,0 +1,89 @@ +/* + * toucs.c - convert charsets to Unicode. + */ + +#include "charset.h" +#include "internal.h" + +struct unicode_emit_param { + wchar_t *output; + int outlen; + const wchar_t *errstr; + int errlen; + int stopped; +}; + +static void unicode_emit(void *ctx, long int output) +{ + struct unicode_emit_param *param = (struct unicode_emit_param *)ctx; + wchar_t outval; + wchar_t const *p; + int outlen; + + if (output == ERROR) { + if (param->errstr) { + p = param->errstr; + outlen = param->errlen; + } else { + outval = 0xFFFD; /* U+FFFD REPLACEMENT CHARACTER */ + p = &outval; + outlen = 1; + } + } else { + outval = output; + p = &outval; + outlen = 1; + } + + if (param->outlen >= outlen) { + while (outlen > 0) { + *param->output++ = *p++; + param->outlen--; + outlen--; + } + } else { + param->stopped = 1; + } +} + +int charset_to_unicode(char **input, int *inlen, wchar_t *output, int outlen, + int charset, charset_state *state, + const wchar_t *errstr, int errlen) +{ + charset_spec const *spec = charset_find_spec(charset); + charset_state localstate; + struct unicode_emit_param param; + + param.output = output; + param.outlen = outlen; + param.errstr = errstr; + param.errlen = errlen; + param.stopped = 0; + + if (!state) { + localstate.s0 = 0; + } else { + localstate = *state; /* structure copy */ + } + + while (*inlen > 0) { + int lenbefore = param.output - output; + spec->read(spec, (unsigned char)**input, &localstate, + unicode_emit, ¶m); + if (param.stopped) { + /* + * The emit function has _tried_ to output some + * characters, but ran up against the end of the + * buffer. Leave immediately, and return what happened + * _before_ attempting to process this character. + */ + return lenbefore; + } + if (state) + *state = localstate; /* structure copy */ + (*input)++; + (*inlen)--; + } + + return param.output - output; +} diff --git a/charset/utf8.c b/charset/utf8.c new file mode 100644 index 00000000..c82a2159 --- /dev/null +++ b/charset/utf8.c @@ -0,0 +1,877 @@ +/* + * utf8.c - routines to handle UTF-8. + */ + +#ifndef ENUM_CHARSETS + +#include "charset.h" +#include "internal.h" + +/* + * UTF-8 has no associated data, so `charset' may be ignored. + */ + +void read_utf8(charset_spec const *charset, long int input_chr, + charset_state *state, + void (*emit)(void *ctx, long int output), void *emitctx) +{ + UNUSEDARG(charset); + + /* + * For reading UTF-8, the `state' word contains: + * + * - in bits 29-31, the number of bytes expected to be in the + * current multibyte character (which we can tell instantly + * from the first byte, of course). + * + * - in bits 26-28, the number of bytes _seen so far_ in the + * current multibyte character. + * + * - in the remainder of the word, the current value of the + * character, which is shifted upwards by 6 bits to + * accommodate each new byte. + * + * As required, the state is zero when we are not in the middle + * of a multibyte character at all. + * + * For example, when reading E9 8D 8B, starting at state=0: + * + * - after E9, the state is 0x64000009 + * - after 8D, the state is 0x6800024d + * - after 8B, the state conceptually becomes 0x6c00934b, at + * which point we notice we've got as many characters as we + * were expecting, output U+934B, and reset the state to + * zero. + * + * Note that the maximum number of bits we might need to store + * in the character value field is 25 (U+7FFFFFFF contains 31 + * bits, but we will never actually store its full value + * because when we receive the last 6 bits in the final + * continuation byte we will output it and revert the state to + * zero). Hence the character value field never collides with + * the byte counts. + */ + + if (input_chr < 0x80) { + /* + * Single-byte character. If the state is nonzero before + * coming here, output an error for an incomplete sequence. + * Then output the character. + */ + if (state->s0 != 0) { + emit(emitctx, ERROR); + state->s0 = 0; + } + emit(emitctx, input_chr); + } else if (input_chr == 0xFE || input_chr == 0xFF) { + /* + * FE and FF bytes should _never_ occur in UTF-8. They are + * automatic errors; if the state was nonzero to start + * with, output a further error for an incomplete sequence. + */ + if (state->s0 != 0) { + emit(emitctx, ERROR); + state->s0 = 0; + } + emit(emitctx, ERROR); + } else if (input_chr >= 0x80 && input_chr < 0xC0) { + /* + * Continuation byte. Output an error for an unexpected + * continuation byte, if the state is zero. + */ + if (state->s0 == 0) { + emit(emitctx, ERROR); + } else { + unsigned long charval; + unsigned long topstuff; + int bytes; + + /* + * Otherwise, accumulate more of the character value. + */ + charval = state->s0 & 0x03ffffffL; + charval = (charval << 6) | (input_chr & 0x3F); + + /* + * Check the byte counts; if we have not reached the + * end of the character, update the state and return. + */ + topstuff = state->s0 & 0xfc000000L; + topstuff += 0x04000000L; /* add one to the byte count */ + if (((topstuff << 3) ^ topstuff) & 0xe0000000L) { + state->s0 = topstuff | charval; + return; + } + + /* + * Now we know we've reached the end of the character. + * `charval' is the Unicode value. We should check for + * various invalid things, and then either output + * charval or an error. In all cases we reset the state + * to zero. + */ + bytes = topstuff >> 29; + state->s0 = 0; + + if (charval >= 0xD800 && charval < 0xE000) { + /* + * Surrogates (0xD800-0xDFFF) may never be encoded + * in UTF-8. A surrogate pair in Unicode should + * have been encoded as a single UTF-8 character + * occupying more than three bytes. + */ + emit(emitctx, ERROR); + } else if (charval == 0xFFFE || charval == 0xFFFF) { + /* + * U+FFFE and U+FFFF are invalid Unicode characters + * and may never be encoded in UTF-8. (This is one + * reason why U+FFFF is our way of signalling an + * error to our `emit' function :-) + */ + emit(emitctx, ERROR); + } else if ((charval <= 0x7FL /* && bytes > 1 */) || + (charval <= 0x7FFL && bytes > 2) || + (charval <= 0xFFFFL && bytes > 3) || + (charval <= 0x1FFFFFL && bytes > 4) || + (charval <= 0x3FFFFFFL && bytes > 5)) { + /* + * Overlong sequences are not to be tolerated, + * under any circumstances. + */ + emit(emitctx, ERROR); + } else { + /* + * Oh, all right. We'll let this one off. + */ + emit(emitctx, charval); + } + } + + } else { + /* + * Lead byte. First output an error for an incomplete + * sequence, if the state is nonzero. + */ + if (state->s0 != 0) + emit(emitctx, ERROR); + + /* + * Now deal with the lead byte: work out the number of + * bytes we expect to see in this character, and extract + * the initial bits of it too. + */ + if (input_chr >= 0xC0 && input_chr < 0xE0) { + state->s0 = 0x44000000L | (input_chr & 0x1F); + } else if (input_chr >= 0xE0 && input_chr < 0xF0) { + state->s0 = 0x64000000L | (input_chr & 0x0F); + } else if (input_chr >= 0xF0 && input_chr < 0xF8) { + state->s0 = 0x84000000L | (input_chr & 0x07); + } else if (input_chr >= 0xF8 && input_chr < 0xFC) { + state->s0 = 0xa4000000L | (input_chr & 0x03); + } else if (input_chr >= 0xFC && input_chr < 0xFE) { + state->s0 = 0xc4000000L | (input_chr & 0x01); + } + } +} + +/* + * UTF-8 is a stateless multi-byte encoding (in the sense that just + * after any character has been completed, the state is always the + * same); hence when writing it, there is no need to use the + * charset_state. + */ + +void write_utf8(charset_spec const *charset, long int input_chr, + charset_state *state, + void (*emit)(void *ctx, long int output), void *emitctx) +{ + UNUSEDARG(charset); + UNUSEDARG(state); + + /* + * Refuse to output any illegal code points. + */ + if (input_chr == 0xFFFE || input_chr == 0xFFFF || + (input_chr >= 0xD800 && input_chr < 0xE000)) { + emit(emitctx, ERROR); + } else if (input_chr < 0x80) { /* one-byte character */ + emit(emitctx, input_chr); + } else if (input_chr < 0x800) { /* two-byte character */ + emit(emitctx, 0xC0 | (0x1F & (input_chr >> 6))); + emit(emitctx, 0x80 | (0x3F & (input_chr ))); + } else if (input_chr < 0x10000) { /* three-byte character */ + emit(emitctx, 0xE0 | (0x0F & (input_chr >> 12))); + emit(emitctx, 0x80 | (0x3F & (input_chr >> 6))); + emit(emitctx, 0x80 | (0x3F & (input_chr ))); + } else if (input_chr < 0x200000) { /* four-byte character */ + emit(emitctx, 0xF0 | (0x07 & (input_chr >> 18))); + emit(emitctx, 0x80 | (0x3F & (input_chr >> 12))); + emit(emitctx, 0x80 | (0x3F & (input_chr >> 6))); + emit(emitctx, 0x80 | (0x3F & (input_chr ))); + } else if (input_chr < 0x4000000) {/* five-byte character */ + emit(emitctx, 0xF8 | (0x03 & (input_chr >> 24))); + emit(emitctx, 0x80 | (0x3F & (input_chr >> 18))); + emit(emitctx, 0x80 | (0x3F & (input_chr >> 12))); + emit(emitctx, 0x80 | (0x3F & (input_chr >> 6))); + emit(emitctx, 0x80 | (0x3F & (input_chr ))); + } else { /* six-byte character */ + emit(emitctx, 0xFC | (0x01 & (input_chr >> 30))); + emit(emitctx, 0x80 | (0x3F & (input_chr >> 24))); + emit(emitctx, 0x80 | (0x3F & (input_chr >> 18))); + emit(emitctx, 0x80 | (0x3F & (input_chr >> 12))); + emit(emitctx, 0x80 | (0x3F & (input_chr >> 6))); + emit(emitctx, 0x80 | (0x3F & (input_chr ))); + } +} + +#ifdef TESTMODE + +#include +#include + +int total_errs = 0; + +void utf8_emit(void *ctx, long output) +{ + wchar_t **p = (wchar_t **)ctx; + *(*p)++ = output; +} + +void utf8_read_test(int line, char *input, int inlen, ...) +{ + va_list ap; + wchar_t *p, str[512]; + int i; + charset_state state; + unsigned long l; + + state.s0 = 0; + p = str; + + for (i = 0; i < inlen; i++) + read_utf8(NULL, input[i] & 0xFF, &state, utf8_emit, &p); + + va_start(ap, inlen); + l = 0; + for (i = 0; i < p - str; i++) { + l = va_arg(ap, long int); + if (l == -1) { + printf("%d: correct string shorter than output\n", line); + total_errs++; + break; + } + if (l != str[i]) { + printf("%d: char %d came out as %08x, should be %08x\n", + line, i, str[i], l); + total_errs++; + } + } + if (l != -1) { + l = va_arg(ap, long int); + if (l != -1) { + printf("%d: correct string longer than output\n", line); + total_errs++; + } + } + va_end(ap); +} + +void utf8_write_test(int line, const long *input, int inlen, ...) +{ + va_list ap; + wchar_t *p, str[512]; + int i; + charset_state state; + unsigned long l; + + state.s0 = 0; + p = str; + + for (i = 0; i < inlen; i++) + write_utf8(NULL, input[i], &state, utf8_emit, &p); + + va_start(ap, inlen); + l = 0; + for (i = 0; i < p - str; i++) { + l = va_arg(ap, long int); + if (l == -1) { + printf("%d: correct string shorter than output\n", line); + total_errs++; + break; + } + if (l != str[i]) { + printf("%d: char %d came out as %08x, should be %08x\n", + line, i, str[i], l); + total_errs++; + } + } + if (l != -1) { + l = va_arg(ap, long int); + if (l != -1) { + printf("%d: correct string longer than output\n", line); + total_errs++; + } + } + va_end(ap); +} + +/* Macro to concoct the first three parameters of utf8_read_test. */ +#define TESTSTR(x) __LINE__, x, lenof(x) + +int main(void) +{ + printf("read tests beginning\n"); + utf8_read_test(TESTSTR("\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5"), + 0x000003BA, /* GREEK SMALL LETTER KAPPA */ + 0x00001F79, /* GREEK SMALL LETTER OMICRON WITH OXIA */ + 0x000003C3, /* GREEK SMALL LETTER SIGMA */ + 0x000003BC, /* GREEK SMALL LETTER MU */ + 0x000003B5, /* GREEK SMALL LETTER EPSILON */ + 0, -1); + utf8_read_test(TESTSTR("\x00"), + 0x00000000, /* */ + 0, -1); + utf8_read_test(TESTSTR("\xC2\x80"), + 0x00000080, /* */ + 0, -1); + utf8_read_test(TESTSTR("\xE0\xA0\x80"), + 0x00000800, /* */ + 0, -1); + utf8_read_test(TESTSTR("\xF0\x90\x80\x80"), + 0x00010000, /* */ + 0, -1); + utf8_read_test(TESTSTR("\xF8\x88\x80\x80\x80"), + 0x00200000, /* */ + 0, -1); + utf8_read_test(TESTSTR("\xFC\x84\x80\x80\x80\x80"), + 0x04000000, /* */ + 0, -1); + utf8_read_test(TESTSTR("\x7F"), + 0x0000007F, /* */ + 0, -1); + utf8_read_test(TESTSTR("\xDF\xBF"), + 0x000007FF, /* */ + 0, -1); + utf8_read_test(TESTSTR("\xEF\xBF\xBD"), + 0x0000FFFD, /* REPLACEMENT CHARACTER */ + 0, -1); + utf8_read_test(TESTSTR("\xEF\xBF\xBF"), + ERROR, /* (invalid char) */ + 0, -1); + utf8_read_test(TESTSTR("\xF7\xBF\xBF\xBF"), + 0x001FFFFF, /* */ + 0, -1); + utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF\xBF"), + 0x03FFFFFF, /* */ + 0, -1); + utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF\xBF"), + 0x7FFFFFFF, /* */ + 0, -1); + utf8_read_test(TESTSTR("\xED\x9F\xBF"), + 0x0000D7FF, /* */ + 0, -1); + utf8_read_test(TESTSTR("\xEE\x80\x80"), + 0x0000E000, /* */ + 0, -1); + utf8_read_test(TESTSTR("\xEF\xBF\xBD"), + 0x0000FFFD, /* REPLACEMENT CHARACTER */ + 0, -1); + utf8_read_test(TESTSTR("\xF4\x8F\xBF\xBF"), + 0x0010FFFF, /* */ + 0, -1); + utf8_read_test(TESTSTR("\xF4\x90\x80\x80"), + 0x00110000, /* */ + 0, -1); + utf8_read_test(TESTSTR("\x80"), + ERROR, /* (unexpected continuation byte) */ + 0, -1); + utf8_read_test(TESTSTR("\xBF"), + ERROR, /* (unexpected continuation byte) */ + 0, -1); + utf8_read_test(TESTSTR("\x80\xBF"), + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + 0, -1); + utf8_read_test(TESTSTR("\x80\xBF\x80"), + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + 0, -1); + utf8_read_test(TESTSTR("\x80\xBF\x80\xBF"), + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + 0, -1); + utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80"), + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + 0, -1); + utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF"), + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + 0, -1); + utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF\x80"), + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + 0, -1); + utf8_read_test(TESTSTR("\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"), + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + ERROR, /* (unexpected continuation byte) */ + 0, -1); + utf8_read_test(TESTSTR("\xC0\x20\xC1\x20\xC2\x20\xC3\x20\xC4\x20\xC5\x20\xC6\x20\xC7\x20"), + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + 0, -1); + utf8_read_test(TESTSTR("\xE0\x20\xE1\x20\xE2\x20\xE3\x20\xE4\x20\xE5\x20\xE6\x20\xE7\x20\xE8\x20\xE9\x20\xEA\x20\xEB\x20\xEC\x20\xED\x20\xEE\x20\xEF\x20"), + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + 0, -1); + utf8_read_test(TESTSTR("\xF0\x20\xF1\x20\xF2\x20\xF3\x20\xF4\x20\xF5\x20\xF6\x20\xF7\x20"), + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + 0, -1); + utf8_read_test(TESTSTR("\xF8\x20\xF9\x20\xFA\x20\xFB\x20"), + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + 0, -1); + utf8_read_test(TESTSTR("\xFC\x20\xFD\x20"), + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + ERROR, /* (incomplete sequence) */ + 0x00000020, /* SPACE */ + 0, -1); + utf8_read_test(TESTSTR("\xC0"), + ERROR, /* (incomplete sequence) */ + 0, -1); + utf8_read_test(TESTSTR("\xE0\x80"), + ERROR, /* (incomplete sequence) */ + 0, -1); + utf8_read_test(TESTSTR("\xF0\x80\x80"), + ERROR, /* (incomplete sequence) */ + 0, -1); + utf8_read_test(TESTSTR("\xF8\x80\x80\x80"), + ERROR, /* (incomplete sequence) */ + 0, -1); + utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80"), + ERROR, /* (incomplete sequence) */ + 0, -1); + utf8_read_test(TESTSTR("\xDF"), + ERROR, /* (incomplete sequence) */ + 0, -1); + utf8_read_test(TESTSTR("\xEF\xBF"), + ERROR, /* (incomplete sequence) */ + 0, -1); + utf8_read_test(TESTSTR("\xF7\xBF\xBF"), + ERROR, /* (incomplete sequence) */ + 0, -1); + utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF"), + ERROR, /* (incomplete sequence) */ + 0, -1); + utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF"), + ERROR, /* (incomplete sequence) */ + 0, -1); + utf8_read_test(TESTSTR("\xC0\xE0\x80\xF0\x80\x80\xF8\x80\x80\x80\xFC\x80\x80\x80\x80\xDF\xEF\xBF\xF7\xBF\xBF\xFB\xBF\xBF\xBF\xFD\xBF\xBF\xBF\xBF"), + ERROR, /* (incomplete sequence) */ + ERROR, /* (incomplete sequence) */ + ERROR, /* (incomplete sequence) */ + ERROR, /* (incomplete sequence) */ + ERROR, /* (incomplete sequence) */ + ERROR, /* (incomplete sequence) */ + ERROR, /* (incomplete sequence) */ + ERROR, /* (incomplete sequence) */ + ERROR, /* (incomplete sequence) */ + ERROR, /* (incomplete sequence) */ + 0, -1); + utf8_read_test(TESTSTR("\xFE"), + ERROR, /* (invalid UTF-8 byte) */ + 0, -1); + utf8_read_test(TESTSTR("\xFF"), + ERROR, /* (invalid UTF-8 byte) */ + 0, -1); + utf8_read_test(TESTSTR("\xFE\xFE\xFF\xFF"), + ERROR, /* (invalid UTF-8 byte) */ + ERROR, /* (invalid UTF-8 byte) */ + ERROR, /* (invalid UTF-8 byte) */ + ERROR, /* (invalid UTF-8 byte) */ + 0, -1); + utf8_read_test(TESTSTR("\xC0\xAF"), + ERROR, /* SOLIDUS (overlong form of 2F) */ + 0, -1); + utf8_read_test(TESTSTR("\xE0\x80\xAF"), + ERROR, /* SOLIDUS (overlong form of 2F) */ + 0, -1); + utf8_read_test(TESTSTR("\xF0\x80\x80\xAF"), + ERROR, /* SOLIDUS (overlong form of 2F) */ + 0, -1); + utf8_read_test(TESTSTR("\xF8\x80\x80\x80\xAF"), + ERROR, /* SOLIDUS (overlong form of 2F) */ + 0, -1); + utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\xAF"), + ERROR, /* SOLIDUS (overlong form of 2F) */ + 0, -1); + utf8_read_test(TESTSTR("\xC1\xBF"), + ERROR, /* (overlong form of 7F) */ + 0, -1); + utf8_read_test(TESTSTR("\xE0\x9F\xBF"), + ERROR, /* (overlong form of DF BF) */ + 0, -1); + utf8_read_test(TESTSTR("\xF0\x8F\xBF\xBF"), + ERROR, /* (overlong form of EF BF BF) (invalid char) */ + 0, -1); + utf8_read_test(TESTSTR("\xF8\x87\xBF\xBF\xBF"), + ERROR, /* (overlong form of F7 BF BF BF) */ + 0, -1); + utf8_read_test(TESTSTR("\xFC\x83\xBF\xBF\xBF\xBF"), + ERROR, /* (overlong form of FB BF BF BF BF) */ + 0, -1); + utf8_read_test(TESTSTR("\xC0\x80"), + ERROR, /* (overlong form of 00) */ + 0, -1); + utf8_read_test(TESTSTR("\xE0\x80\x80"), + ERROR, /* (overlong form of 00) */ + 0, -1); + utf8_read_test(TESTSTR("\xF0\x80\x80\x80"), + ERROR, /* (overlong form of 00) */ + 0, -1); + utf8_read_test(TESTSTR("\xF8\x80\x80\x80\x80"), + ERROR, /* (overlong form of 00) */ + 0, -1); + utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\x80"), + ERROR, /* (overlong form of 00) */ + 0, -1); + utf8_read_test(TESTSTR("\xED\xA0\x80"), + ERROR, /* (surrogate) */ + 0, -1); + utf8_read_test(TESTSTR("\xED\xAD\xBF"), + ERROR, /* (surrogate) */ + 0, -1); + utf8_read_test(TESTSTR("\xED\xAE\x80"), + ERROR, /* (surrogate) */ + 0, -1); + utf8_read_test(TESTSTR("\xED\xAF\xBF"), + ERROR, /* (surrogate) */ + 0, -1); + utf8_read_test(TESTSTR("\xED\xB0\x80"), + ERROR, /* (surrogate) */ + 0, -1); + utf8_read_test(TESTSTR("\xED\xBE\x80"), + ERROR, /* (surrogate) */ + 0, -1); + utf8_read_test(TESTSTR("\xED\xBF\xBF"), + ERROR, /* (surrogate) */ + 0, -1); + utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xB0\x80"), + ERROR, /* (surrogate) */ + ERROR, /* (surrogate) */ + 0, -1); + utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xBF\xBF"), + ERROR, /* (surrogate) */ + ERROR, /* (surrogate) */ + 0, -1); + utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xB0\x80"), + ERROR, /* (surrogate) */ + ERROR, /* (surrogate) */ + 0, -1); + utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xBF\xBF"), + ERROR, /* (surrogate) */ + ERROR, /* (surrogate) */ + 0, -1); + utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xB0\x80"), + ERROR, /* (surrogate) */ + ERROR, /* (surrogate) */ + 0, -1); + utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xBF\xBF"), + ERROR, /* (surrogate) */ + ERROR, /* (surrogate) */ + 0, -1); + utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xB0\x80"), + ERROR, /* (surrogate) */ + ERROR, /* (surrogate) */ + 0, -1); + utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xBF\xBF"), + ERROR, /* (surrogate) */ + ERROR, /* (surrogate) */ + 0, -1); + utf8_read_test(TESTSTR("\xEF\xBF\xBE"), + ERROR, /* (invalid char) */ + 0, -1); + utf8_read_test(TESTSTR("\xEF\xBF\xBF"), + ERROR, /* (invalid char) */ + 0, -1); + printf("read tests completed\n"); + printf("write tests beginning\n"); + { + const static long str[] = + {0x03BAL, 0x1F79L, 0x03C3L, 0x03BCL, 0x03B5L, 0}; + utf8_write_test(TESTSTR(str), + 0xCE, 0xBA, + 0xE1, 0xBD, 0xB9, + 0xCF, 0x83, + 0xCE, 0xBC, + 0xCE, 0xB5, + 0, -1); + } + { + const static long str[] = {0x0000L, 0}; + utf8_write_test(TESTSTR(str), + 0x00, + 0, -1); + } + { + const static long str[] = {0x0080L, 0}; + utf8_write_test(TESTSTR(str), + 0xC2, 0x80, + 0, -1); + } + { + const static long str[] = {0x0800L, 0}; + utf8_write_test(TESTSTR(str), + 0xE0, 0xA0, 0x80, + 0, -1); + } + { + const static long str[] = {0x00010000L, 0}; + utf8_write_test(TESTSTR(str), + 0xF0, 0x90, 0x80, 0x80, + 0, -1); + } + { + const static long str[] = {0x00200000L, 0}; + utf8_write_test(TESTSTR(str), + 0xF8, 0x88, 0x80, 0x80, 0x80, + 0, -1); + } + { + const static long str[] = {0x04000000L, 0}; + utf8_write_test(TESTSTR(str), + 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80, + 0, -1); + } + { + const static long str[] = {0x007FL, 0}; + utf8_write_test(TESTSTR(str), + 0x7F, + 0, -1); + } + { + const static long str[] = {0x07FFL, 0}; + utf8_write_test(TESTSTR(str), + 0xDF, 0xBF, + 0, -1); + } + { + const static long str[] = {0xFFFDL, 0}; + utf8_write_test(TESTSTR(str), + 0xEF, 0xBF, 0xBD, + 0, -1); + } + { + const static long str[] = {0xFFFFL, 0}; + utf8_write_test(TESTSTR(str), + ERROR, + 0, -1); + } + { + const static long str[] = {0x001FFFFFL, 0}; + utf8_write_test(TESTSTR(str), + 0xF7, 0xBF, 0xBF, 0xBF, + 0, -1); + } + { + const static long str[] = {0x03FFFFFFL, 0}; + utf8_write_test(TESTSTR(str), + 0xFB, 0xBF, 0xBF, 0xBF, 0xBF, + 0, -1); + } + { + const static long str[] = {0x7FFFFFFFL, 0}; + utf8_write_test(TESTSTR(str), + 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF, + 0, -1); + } + { + const static long str[] = {0xD7FFL, 0}; + utf8_write_test(TESTSTR(str), + 0xED, 0x9F, 0xBF, + 0, -1); + } + { + const static long str[] = {0xD800L, 0}; + utf8_write_test(TESTSTR(str), + ERROR, + 0, -1); + } + { + const static long str[] = {0xD800L, 0xDC00L, 0}; + utf8_write_test(TESTSTR(str), + ERROR, + ERROR, + 0, -1); + } + { + const static long str[] = {0xDFFFL, 0}; + utf8_write_test(TESTSTR(str), + ERROR, + 0, -1); + } + { + const static long str[] = {0xE000L, 0}; + utf8_write_test(TESTSTR(str), + 0xEE, 0x80, 0x80, + 0, -1); + } + printf("write tests completed\n"); + + printf("total: %d errors\n", total_errs); + return (total_errs != 0); +} +#endif /* TESTMODE */ + +const charset_spec charset_CS_UTF8 = { + CS_UTF8, read_utf8, write_utf8, NULL +}; + +#else /* ENUM_CHARSETS */ + +ENUM_CHARSET(CS_UTF8) + +#endif /* ENUM_CHARSETS */ diff --git a/charset/xenc.c b/charset/xenc.c new file mode 100644 index 00000000..5359b16d --- /dev/null +++ b/charset/xenc.c @@ -0,0 +1,92 @@ +/* + * xenc.c - translate our internal character set codes to and from + * X11 character encoding names. + * + */ + +#include +#include "charset.h" +#include "internal.h" + +static const struct { + const char *name; + int charset; +} xencs[] = { + /* + * Officially registered encoding names. This list is derived + * from the font encodings section of + * + * http://ftp.x.org/pub/DOCS/registry + * + * Where multiple encoding names map to the same encoding id + * (such as iso8859-15 and fcd8859-15), the first is considered + * canonical and will be returned when translating the id to a + * string. + */ + { "iso8859-1", CS_ISO8859_1 }, + { "iso8859-2", CS_ISO8859_2 }, + { "iso8859-3", CS_ISO8859_3 }, + { "iso8859-4", CS_ISO8859_4 }, + { "iso8859-5", CS_ISO8859_5 }, + { "iso8859-6", CS_ISO8859_6 }, + { "iso8859-7", CS_ISO8859_7 }, + { "iso8859-8", CS_ISO8859_8 }, + { "iso8859-9", CS_ISO8859_9 }, + { "iso8859-10", CS_ISO8859_10 }, + { "iso8859-13", CS_ISO8859_13 }, + { "iso8859-14", CS_ISO8859_14 }, + { "iso8859-15", CS_ISO8859_15 }, + { "fcd8859-15", CS_ISO8859_15 }, + { "hp-roman8", CS_HP_ROMAN8 }, + { "koi8-r", CS_KOI8_R }, + /* + * Unofficial encoding names found in the wild. + */ + { "iso8859-16", CS_ISO8859_16 }, + { "koi8-u", CS_KOI8_U }, + { "ibm-cp437", CS_CP437 }, + { "ibm-cp850", CS_CP850 }, + { "microsoft-cp1250", CS_CP1250 }, + { "microsoft-cp1251", CS_CP1251 }, + { "microsoft-cp1252", CS_CP1252 }, + { "microsoft-cp1253", CS_CP1253 }, + { "microsoft-cp1254", CS_CP1254 }, + { "microsoft-cp1255", CS_CP1255 }, + { "microsoft-cp1256", CS_CP1256 }, + { "microsoft-cp1257", CS_CP1257 }, + { "microsoft-cp1258", CS_CP1258 }, + { "mac-roman", CS_MAC_ROMAN }, + { "viscii1.1-1", CS_VISCII }, + { "viscii1-1", CS_VISCII }, +}; + +const char *charset_to_xenc(int charset) +{ + int i; + + for (i = 0; i < (int)lenof(xencs); i++) + if (charset == xencs[i].charset) + return xencs[i].name; + + return NULL; /* not found */ +} + +int charset_from_xenc(const char *name) +{ + int i; + + for (i = 0; i < (int)lenof(xencs); i++) { + const char *p, *q; + p = name; + q = xencs[i].name; + while (*p || *q) { + if (tolower(*p) != tolower(*q)) + break; + p++; q++; + } + if (!*p && !*q) + return xencs[i].charset; + } + + return CS_NONE; /* not found */ +} diff --git a/mkfiles.pl b/mkfiles.pl index 30417b28..384a88e6 100755 --- a/mkfiles.pl +++ b/mkfiles.pl @@ -11,7 +11,12 @@ use FileHandle; open IN, "Recipe" or die "unable to open Recipe file\n"; -@incdirs = ("", "unix/", "mac/"); +# HACK: One of the source files in `charset' is auto-generated by +# sbcsgen.pl. We need to generate that _now_, before attempting +# dependency analysis. +eval 'chdir "charset"; require "sbcsgen.pl"; chdir ".."'; + +@incdirs = ("", "charset/", "unix/", "mac/"); $help = ""; # list of newline-free lines of help text %programs = (); # maps prog name + type letter to listref of objects/resources @@ -534,7 +539,7 @@ print "# TOOLPATH = /opt/gcc/bin\n". "CC = \$(TOOLPATH)cc\n". "\n". -&splitline("CFLAGS = -Wall -g -I. -I.. `gtk-config --cflags`")."\n". +&splitline("CFLAGS = -Wall -g -I. -I.. -I../charset `gtk-config --cflags`")."\n". "XLDFLAGS = `gtk-config --libs`\n". "ULDFLAGS =#\n". "INSTALL=install\n", diff --git a/unix/pterm.1 b/unix/pterm.1 index 569bd9d9..0cad7252 100644 --- a/unix/pterm.1 +++ b/unix/pterm.1 @@ -90,6 +90,20 @@ to specify it explicitly if you have changed the default using the .IP "\fB\-log\fP \fIfilename\fP" This option makes \fIpterm\fP log all the terminal output to a file as well as displaying it in the terminal. +.IP "\fB\-cs\fP \fIcharset\fP" +This option specifies the character set in which \fIpterm\fP should +assume the session is operating. This character set will be used to +interpret all the data received from the session, and all input you +type or paste into \fIpterm\fP will be converted into this character +set before being sent to the session. + +Any character set name which is valid in a MIME header (and +supported by \fIpterm\fP) should be valid here (examples are +"ISO-8859-1", "windows-1252" or "UTF-8"). Also, any character +encoding which is valid in an X logical font description should be +valid ("ibm-cp437", for example). + +Character set names are case-insensitive. .IP "\fB\-nethack\fP" Tells \fIpterm\fP to enable NetHack keypad mode, in which the numeric keypad generates the NetHack "hjklyubn" direction keys. This @@ -385,6 +399,14 @@ reset to the very bottom. This option should be set to either 0 or 1; the default is 1. When set to 1, any activity in the display causes the position of the scrollback to be reset to the very bottom. +.IP "\fBpterm.LineCodePage\fP" +This option specifies the character set to be used for the session. +This is the same as the \fI\-cs\fP command-line option. +.IP "\fBpterm.NoRemoteCharset\fP" +This option disables the terminal's ability to change its character +set when it receives escape sequences telling it to. You might need +to do this to interoperate with programs which incorrectly change +the character set to something they think is sensible. .IP "\fBpterm.BCE\fP" This option should be set to either 0 or 1; the default is 1. When set to 1, the various control sequences that erase parts of the diff --git a/unix/pterm.c b/unix/pterm.c index 714bef1a..9c86b326 100644 --- a/unix/pterm.c +++ b/unix/pterm.c @@ -24,6 +24,7 @@ #include #define PUTTY_DO_GLOBALS /* actually _define_ globals */ + #include "putty.h" #include "terminal.h" @@ -39,18 +40,22 @@ struct gui_data { GtkAdjustment *sbar_adjust; GdkPixmap *pixmap; GdkFont *fonts[2]; /* normal and bold (for now!) */ + struct { + int charset; + int is_wide; + } fontinfo[2]; GdkCursor *rawcursor, *textcursor, *blankcursor, *currcursor; GdkColor cols[NCOLOURS]; GdkColormap *colmap; wchar_t *pastein_data; int pastein_data_len; - char *pasteout_data; - int pasteout_data_len; + char *pasteout_data, *pasteout_data_utf8; + int pasteout_data_len, pasteout_data_utf8_len; int font_width, font_height; int ignore_sbar; int mouseptr_visible; guint term_paste_idle_id; - GdkAtom compound_text_atom; + GdkAtom compound_text_atom, utf8_string_atom; int alt_keycode; int alt_digits; char wintitle[sizeof(((Config *)0)->wintitle)]; @@ -831,7 +836,19 @@ gint key_event(GtkWidget *widget, GdkEventKey *event, gpointer data) printf("\n"); #endif - ldisc_send(inst->ldisc, output+start, end-start, 1); + /* + * The stuff we've just generated is assumed to be + * ISO-8859-1! This sounds insane, but `man XLookupString' + * agrees: strings of this type returned from the X server + * are hardcoded to 8859-1. Strictly speaking we should be + * doing this using some sort of GtkIMContext, which (if + * we're lucky) would give us our data directly in Unicode; + * but that's not supported in GTK 1.2 as far as I can + * tell, and it's poorly documented even in 2.0, so it'll + * have to wait. + */ + lpage_send(inst->ldisc, CS_ISO8859_1, output+start, end-start, 1); + show_mouseptr(inst, 0); term_seen_key_event(inst->term); term_out(inst->term); @@ -1198,9 +1215,26 @@ void write_clip(void *frontend, wchar_t * data, int len, int must_deselect) struct gui_data *inst = (struct gui_data *)frontend; if (inst->pasteout_data) sfree(inst->pasteout_data); + if (inst->pasteout_data_utf8) + sfree(inst->pasteout_data_utf8); + + inst->pasteout_data_utf8 = smalloc(len*6); + inst->pasteout_data_utf8_len = len*6; + { + wchar_t *tmp = data; + int tmplen = len; + inst->pasteout_data_utf8_len = + charset_from_unicode(&tmp, &tmplen, inst->pasteout_data_utf8, + inst->pasteout_data_utf8_len, + CS_UTF8, NULL, NULL, 0); + inst->pasteout_data_utf8 = + srealloc(inst->pasteout_data_utf8, inst->pasteout_data_utf8_len); + } + inst->pasteout_data = smalloc(len); inst->pasteout_data_len = len; - wc_to_mb(0, 0, data, len, inst->pasteout_data, inst->pasteout_data_len, + wc_to_mb(line_codepage, 0, data, len, + inst->pasteout_data, inst->pasteout_data_len, NULL, NULL); if (gtk_selection_owner_set(inst->area, GDK_SELECTION_PRIMARY, @@ -1209,6 +1243,8 @@ void write_clip(void *frontend, wchar_t * data, int len, int must_deselect) GDK_SELECTION_TYPE_STRING, 1); gtk_selection_add_target(inst->area, GDK_SELECTION_PRIMARY, inst->compound_text_atom, 1); + gtk_selection_add_target(inst->area, GDK_SELECTION_PRIMARY, + inst->utf8_string_atom, 1); } } @@ -1216,8 +1252,13 @@ void selection_get(GtkWidget *widget, GtkSelectionData *seldata, guint info, guint time_stamp, gpointer data) { struct gui_data *inst = (struct gui_data *)data; - gtk_selection_data_set(seldata, GDK_SELECTION_TYPE_STRING, 8, - inst->pasteout_data, inst->pasteout_data_len); + if (seldata->target == inst->utf8_string_atom) + gtk_selection_data_set(seldata, seldata->target, 8, + inst->pasteout_data_utf8, + inst->pasteout_data_utf8_len); + else + gtk_selection_data_set(seldata, seldata->target, 8, + inst->pasteout_data, inst->pasteout_data_len); } gint selection_clear(GtkWidget *widget, GdkEventSelection *seldata, @@ -1227,8 +1268,12 @@ gint selection_clear(GtkWidget *widget, GdkEventSelection *seldata, term_deselect(inst->term); if (inst->pasteout_data) sfree(inst->pasteout_data); + if (inst->pasteout_data_utf8) + sfree(inst->pasteout_data_utf8); inst->pasteout_data = NULL; inst->pasteout_data_len = 0; + inst->pasteout_data_utf8 = NULL; + inst->pasteout_data_utf8_len = 0; return TRUE; } @@ -1240,8 +1285,16 @@ void request_paste(void *frontend) * moment is to call gtk_selection_convert(), and when the data * comes back _then_ we can call term_do_paste(). */ + + /* + * First we attempt to retrieve the selection as a UTF-8 string + * (which we will convert to the correct code page before + * sending to the session, of course). If that fails, + * selection_received() will be informed and will fall back to + * an ordinary string. + */ gtk_selection_convert(inst->area, GDK_SELECTION_PRIMARY, - GDK_SELECTION_TYPE_STRING, GDK_CURRENT_TIME); + inst->utf8_string_atom, GDK_CURRENT_TIME); } gint idle_paste_func(gpointer data); /* forward ref */ @@ -1251,8 +1304,22 @@ void selection_received(GtkWidget *widget, GtkSelectionData *seldata, { struct gui_data *inst = (struct gui_data *)data; + if (seldata->target == inst->utf8_string_atom && seldata->length <= 0) { + /* + * Failed to get a UTF-8 selection string. Try an ordinary + * string. + */ + gtk_selection_convert(inst->area, GDK_SELECTION_PRIMARY, + GDK_SELECTION_TYPE_STRING, GDK_CURRENT_TIME); + return; + } + + /* + * Any other failure should just go foom. + */ if (seldata->length <= 0 || - seldata->type != GDK_SELECTION_TYPE_STRING) + (seldata->type != GDK_SELECTION_TYPE_STRING && + seldata->type != inst->utf8_string_atom)) return; /* Nothing happens. */ if (inst->pastein_data) @@ -1260,8 +1327,11 @@ void selection_received(GtkWidget *widget, GtkSelectionData *seldata, inst->pastein_data = smalloc(seldata->length * sizeof(wchar_t)); inst->pastein_data_len = seldata->length; - mb_to_wc(0, 0, seldata->data, seldata->length, - inst->pastein_data, inst->pastein_data_len); + inst->pastein_data_len = + mb_to_wc((seldata->type == inst->utf8_string_atom ? + CS_UTF8 : line_codepage), + 0, seldata->data, seldata->length, + inst->pastein_data, inst->pastein_data_len); term_do_paste(inst->term); @@ -1457,10 +1527,45 @@ void do_text_internal(Context ctx, int x, int y, char *text, int len, rlen*inst->font_width, inst->font_height); gdk_gc_set_foreground(gc, &inst->cols[nfg]); - gdk_draw_text(inst->pixmap, inst->fonts[fontid], gc, - x*inst->font_width+cfg.window_border, - y*inst->font_height+cfg.window_border+inst->fonts[0]->ascent, - text, len); + { + GdkWChar *gwcs; + gchar *gcs; + wchar_t *wcs; + int i; + + wcs = smalloc(sizeof(wchar_t) * (len+1)); + for (i = 0; i < len; i++) { + wcs[i] = (wchar_t) ((attr & CSET_MASK) + (text[i] & CHAR_MASK)); + } + + if (inst->fontinfo[fontid].is_wide) { + gwcs = smalloc(sizeof(GdkWChar) * (len+1)); + /* + * FIXME: when we have a wide-char equivalent of + * from_unicode, use it instead of this. + */ + for (i = 0; i <= len; i++) + gwcs[i] = wcs[i]; + gdk_draw_text_wc(inst->pixmap, inst->fonts[fontid], gc, + x*inst->font_width+cfg.window_border, + y*inst->font_height+cfg.window_border+inst->fonts[0]->ascent, + gwcs, len*2); + sfree(gwcs); + } else { + wchar_t *wcstmp = wcs; + int lentmp = len; + gcs = smalloc(sizeof(GdkWChar) * (len+1)); + charset_from_unicode(&wcstmp, &lentmp, gcs, len, + inst->fontinfo[fontid].charset, + NULL, ".", 1); + gdk_draw_text(inst->pixmap, inst->fonts[fontid], gc, + x*inst->font_width+cfg.window_border, + y*inst->font_height+cfg.window_border+inst->fonts[0]->ascent, + gcs, len); + sfree(gcs); + } + sfree(wcs); + } if (shadow) { gdk_draw_text(inst->pixmap, inst->fonts[fontid], gc, @@ -1818,6 +1923,12 @@ int do_cmdline(int argc, char **argv, int do_everything) strncpy(cfg.boldfont, val, sizeof(cfg.boldfont)); cfg.boldfont[sizeof(cfg.boldfont)-1] = '\0'; + } else if (!strcmp(p, "-cs")) { + EXPECTS_ARG; + SECOND_PASS_ONLY; + strncpy(cfg.line_codepage, val, sizeof(cfg.line_codepage)); + cfg.line_codepage[sizeof(cfg.line_codepage)-1] = '\0'; + } else if (!strcmp(p, "-geometry")) { int flags, x, y, w, h; EXPECTS_ARG; @@ -1955,6 +2066,68 @@ static void block_signal(int sig, int block_it) { } } +static void set_font_info(struct gui_data *inst, int fontid) +{ + GdkFont *font = inst->fonts[fontid]; + XFontStruct *xfs = GDK_FONT_XFONT(font); + Display *disp = GDK_FONT_XDISPLAY(font); + Atom charset_registry, charset_encoding; + unsigned long registry_ret, encoding_ret; + charset_registry = XInternAtom(disp, "CHARSET_REGISTRY", False); + charset_encoding = XInternAtom(disp, "CHARSET_ENCODING", False); + inst->fontinfo[fontid].charset = CS_NONE; + inst->fontinfo[fontid].is_wide = 0; + if (XGetFontProperty(xfs, charset_registry, ®istry_ret) && + XGetFontProperty(xfs, charset_encoding, &encoding_ret)) { + char *reg, *enc; + reg = XGetAtomName(disp, (Atom)registry_ret); + enc = XGetAtomName(disp, (Atom)encoding_ret); + if (reg && enc) { + char *encoding = dupcat(reg, "-", enc, NULL); + inst->fontinfo[fontid].charset = charset_from_xenc(encoding); + /* FIXME: when libcharset supports wide encodings fix this. */ + if (!strcasecmp(encoding, "iso10646-1")) + inst->fontinfo[fontid].is_wide = 1; + + /* + * Hack for X line-drawing characters: if the primary + * font is encoded as ISO-8859-anything, and has valid + * glyphs in the first 32 char positions, it is assumed + * that those glyphs are the VT100 line-drawing + * character set. + * + * Actually, we'll hack even harder by only checking + * position 0x19 (vertical line, VT100 linedrawing + * `x'). Then we can check it easily by seeing if the + * ascent and descent differ. + */ + if (inst->fontinfo[fontid].charset == CS_ISO8859_1) { + int lb, rb, wid, asc, desc; + gchar text[2]; + + text[1] = '\0'; + text[0] = '\x12'; + gdk_string_extents(inst->fonts[fontid], text, + &lb, &rb, &wid, &asc, &desc); + if (asc != desc) + inst->fontinfo[fontid].charset = CS_ISO8859_1_X11; + } + + /* + * FIXME: this is a hack. Currently fonts with + * incomprehensible encodings are dealt with by + * pretending they're 8859-1. It's ugly, but it's good + * enough to stop things crashing. Should do something + * better here. + */ + if (inst->fontinfo[fontid].charset == CS_NONE) + inst->fontinfo[fontid].charset = CS_ISO8859_1; + + sfree(encoding); + } + } +} + int main(int argc, char **argv) { extern int pty_master_fd; /* declared in pty.c */ @@ -1987,6 +2160,7 @@ int main(int argc, char **argv) fprintf(stderr, "pterm: unable to load font \"%s\"\n", cfg.font); exit(1); } + set_font_info(inst, 0); if (cfg.boldfont[0]) { inst->fonts[1] = gdk_font_load(cfg.boldfont); if (!inst->fonts[1]) { @@ -1994,6 +2168,7 @@ int main(int argc, char **argv) cfg.boldfont); exit(1); } + set_font_info(inst, 1); } else inst->fonts[1] = NULL; @@ -2001,6 +2176,7 @@ int main(int argc, char **argv) inst->font_height = inst->fonts[0]->ascent + inst->fonts[0]->descent; inst->compound_text_atom = gdk_atom_intern("COMPOUND_TEXT", FALSE); + inst->utf8_string_atom = gdk_atom_intern("UTF8_STRING", FALSE); init_ucs(); diff --git a/unix/unix.h b/unix/unix.h index fb312df2..9aa044d1 100644 --- a/unix/unix.h +++ b/unix/unix.h @@ -1,6 +1,8 @@ #ifndef PUTTY_UNIX_H #define PUTTY_UNIX_H +#include "charset.h" + typedef void *Context; /* FIXME: probably needs changing */ extern Backend pty_backend; @@ -47,7 +49,16 @@ int select_result(int fd, int event); int first_socket(int *state, int *rwx); int next_socket(int *state, int *rwx); -#define DEFAULT_CODEPAGE 0 /* FIXME: no idea how to do this */ +/* + * In the Unix Unicode layer, DEFAULT_CODEPAGE is a special value + * which causes mb_to_wc and wc_to_mb to call _libc_ rather than + * libcharset. That way, we can interface the various charsets + * supported by libcharset with the one supported by mbstowcs and + * wcstombs (which will be the character set in which stuff read + * from the command line or config files is assumed to be encoded). + */ +#define DEFAULT_CODEPAGE 0xFFFF +#define CP_UTF8 CS_UTF8 /* from libcharset */ #define strnicmp strncasecmp #define stricmp strcasecmp diff --git a/unix/uxucs.c b/unix/uxucs.c index fb0b36ab..2bf65a8a 100644 --- a/unix/uxucs.c +++ b/unix/uxucs.c @@ -1,17 +1,18 @@ #include #include #include +#include +#include +#include #include + #include "putty.h" #include "terminal.h" #include "misc.h" /* * Unix Unicode-handling routines. - * - * FIXME: currently trivial stub versions assuming all codepages - * are ISO8859-1. */ int is_dbcs_leadbyte(int codepage, char byte) @@ -22,48 +23,151 @@ int is_dbcs_leadbyte(int codepage, char byte) int mb_to_wc(int codepage, int flags, char *mbstr, int mblen, wchar_t *wcstr, int wclen) { - int ret = 0; - while (mblen > 0 && wclen > 0) { - *wcstr++ = (unsigned char) *mbstr++; - mblen--, wclen--, ret++; - } - return ret; /* FIXME: check error codes! */ + if (codepage == DEFAULT_CODEPAGE) { + int n = 0; + mbstate_t state = { 0 }; + + setlocale(LC_CTYPE, ""); + + while (mblen > 0) { + size_t i = mbrtowc(wcstr+n, mbstr, (size_t)mblen, &state); + if (i == (size_t)-1 || i == (size_t)-2) + break; + n++; + mbstr += i; + mblen -= i; + } + + setlocale(LC_CTYPE, "C"); + + return n; + } else + return charset_to_unicode(&mbstr, &mblen, wcstr, wclen, codepage, + NULL, NULL, 0); } int wc_to_mb(int codepage, int flags, wchar_t *wcstr, int wclen, char *mbstr, int mblen, char *defchr, int *defused) { - int ret = 0; + /* FIXME: we should remove the defused param completely... */ if (defused) *defused = 0; - while (mblen > 0 && wclen > 0) { - if (*wcstr >= 0x100) { - if (defchr) - *mbstr++ = *defchr; - else - *mbstr++ = '.'; - if (defused) - *defused = 1; - } else - *mbstr++ = (unsigned char) *wcstr; - wcstr++; - mblen--, wclen--, ret++; - } - return ret; /* FIXME: check error codes! */ + + if (codepage == DEFAULT_CODEPAGE) { + char output[MB_LEN_MAX]; + mbstate_t state = { 0 }; + int n = 0; + + setlocale(LC_CTYPE, ""); + + while (wclen > 0) { + int i = wcrtomb(output, wcstr[0], &state); + if (i == (size_t)-1 || i > n - mblen) + break; + memcpy(mbstr+n, output, i); + n += i; + wcstr++; + wclen--; + } + + setlocale(LC_CTYPE, "C"); + + return n; + } else + return charset_from_unicode(&wcstr, &wclen, mbstr, mblen, codepage, + NULL, NULL, 0); } void init_ucs(void) { int i; - /* Find the line control characters. FIXME: this is not right. */ - for (i = 0; i < 256; i++) - if (i < ' ' || (i >= 0x7F && i < 0xA0)) - unitab_ctrl[i] = i; + + /* + * In the platform-independent parts of the code, font_codepage + * is used only for system DBCS support - which we don't + * support at all. So we set this to something which will never + * be used. + */ + font_codepage = -1; + + /* + * line_codepage should be decoded from the specification in + * cfg. + */ + line_codepage = charset_from_mimeenc(cfg.line_codepage); + if (line_codepage == CS_NONE) + line_codepage = charset_from_xenc(cfg.line_codepage); + /* If it's still CS_NONE, we should assume direct-to-font. */ + + /* FIXME: this is a hack. Currently fonts with incomprehensible + * encodings are dealt with by pretending they're 8859-1. It's + * ugly, but it's good enough to stop things crashing. Should do + * something better here. */ + if (line_codepage == CS_NONE) + line_codepage = CS_ISO8859_1; + + /* + * Set up unitab_line, by translating each individual character + * in the line codepage into Unicode. + */ + for (i = 0; i < 256; i++) { + char c[1], *p; + wchar_t wc[1]; + int len; + c[0] = i; + p = c; + len = 1; + if (1 == charset_to_unicode(&p,&len,wc,1,line_codepage,NULL,L"",0)) + unitab_line[i] = wc[0]; else - unitab_ctrl[i] = 0xFF; + unitab_line[i] = 0xFFFD; + } + /* + * Set up unitab_xterm. This is the same as unitab_line except + * in the line-drawing regions, where it follows the Unicode + * encoding. + * + * (Note that the strange X encoding of line-drawing characters + * in the bottom 32 glyphs of ISO8859-1 fonts is taken care of + * by the font encoding, which will spot such a font and act as + * if it were in a variant encoding of ISO8859-1.) + */ for (i = 0; i < 256; i++) { - unitab_line[i] = unitab_scoacs[i] = i; - unitab_xterm[i] = (i >= 0x5F && i < 0x7F) ? ((i+1) & 0x1F) : i; + static const wchar_t unitab_xterm_std[32] = { + 0x2666, 0x2592, 0x2409, 0x240c, 0x240d, 0x240a, 0x00b0, 0x00b1, + 0x2424, 0x240b, 0x2518, 0x2510, 0x250c, 0x2514, 0x253c, 0x23ba, + 0x23bb, 0x2500, 0x23bc, 0x23bd, 0x251c, 0x2524, 0x2534, 0x252c, + 0x2502, 0x2264, 0x2265, 0x03c0, 0x2260, 0x00a3, 0x00b7, 0x0020 + }; + if (i >= 0x5F && i < 0x7F) + unitab_xterm[i] = unitab_xterm_std[i & 0x1F]; + else + unitab_xterm[i] = unitab_line[i]; } + + /* + * Set up unitab_scoacs. The SCO Alternate Character Set is + * simply CP437. + */ + for (i = 0; i < 256; i++) { + char c[1], *p; + wchar_t wc[1]; + int len; + c[0] = i; + p = c; + len = 1; + if (1 == charset_to_unicode(&p,&len,wc,1,CS_CP437,NULL,L"",0)) + unitab_scoacs[i] = wc[0]; + else + unitab_scoacs[i] = 0xFFFD; + } + + /* Find the line control characters. */ + for (i = 0; i < 256; i++) + if (unitab_line[i] < ' ' + || (unitab_line[i] >= 0x7F && unitab_line[i] < 0xA0)) + unitab_ctrl[i] = i; + else + unitab_ctrl[i] = 0xFF; } -- 2.11.0