X-Git-Url: https://git.distorted.org.uk/~mdw/sgt/charset/blobdiff_plain/ee45694b123040008ffadc8cb5182ce7041e84b2..b063a840d1dded0455a70fc3e71ef8f92e8644ab:/iso2022s.c diff --git a/iso2022s.c b/iso2022s.c index 087fd96..e48c885 100644 --- a/iso2022s.c +++ b/iso2022s.c @@ -1,9 +1,5 @@ /* * iso2022s.c - support for ISO-2022 subset encodings. - * - * (The `s' suffix on the filename is there to leave `iso2022.c' - * free for the unlikely event that I ever attempt to implement - * _full_ ISO-2022 in this library!) */ #ifndef ENUM_CHARSETS @@ -42,7 +38,7 @@ struct iso2022 { * in ASCII order, so that we can narrow down the list as * necessary. */ - struct iso2022_escape *escapes; /* must be sorted in ASCII order! */ + const struct iso2022_escape *escapes;/* must be sorted in ASCII order! */ int nescapes; /* @@ -450,12 +446,13 @@ static int write_iso2022s(charset_spec const *charset, long int input_chr, static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes) { switch (subcharset) { + case 1: /* JIS X 0201 bottom half */ + if (bytes == 0x5C) + return 0xA5; + else if (bytes == 0x7E) + return 0x203E; + /* else fall through to ASCII */ case 0: return bytes; /* one-byte ASCII */ - case 1: /* JIS X 0201 half-width katakana */ - if (bytes >= 0x21 && bytes <= 0x5F) - return bytes + (0xFF61 - 0x21); - else - return ERROR; /* (no break needed since all control paths have returned) */ case 2: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21, ((bytes ) & 0xFF) - 0x21); @@ -470,9 +467,9 @@ static int iso2022jp_from_ucs(long int ucs, int *subcharset, *subcharset = 0; *bytes = ucs; return 1; - } else if (ucs >= 0xFF61 && ucs <= 0xFF9F) { + } else if (ucs == 0xA5 || ucs == 0x203E) { *subcharset = 1; - *bytes = ucs - (0xFF61 - 0x21); + *bytes = (ucs == 0xA5 ? 
0x5C : 0x7E); return 1; } else if (unicode_to_jisx0208(ucs, &r, &c)) { *subcharset = 2; @@ -482,13 +479,13 @@ static int iso2022jp_from_ucs(long int ucs, int *subcharset, return 0; } } -static struct iso2022_escape iso2022jp_escapes[] = { +static const struct iso2022_escape iso2022jp_escapes[] = { {"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1}, /* we ignore this one */ {"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2}, {"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0}, {"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1}, }; -static struct iso2022 iso2022jp = { +static const struct iso2022 iso2022jp = { iso2022jp_escapes, lenof(iso2022jp_escapes), "\1\1\2", "\3", 0x80000000, NULL, FALSE, iso2022jp_to_ucs, iso2022jp_from_ucs @@ -525,12 +522,12 @@ static int iso2022kr_from_ucs(long int ucs, int *subcharset, return 0; } } -static struct iso2022_escape iso2022kr_escapes[] = { +static const struct iso2022_escape iso2022kr_escapes[] = { {"\016", 0x8FFFFFFF, 0x10000000, -1, -1}, {"\017", 0x8FFFFFFF, 0x00000000, 0, 0}, {"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1}, /* bits[11:6] <- 1 */ }; -static struct iso2022 iso2022kr = { +static const struct iso2022 iso2022kr = { iso2022kr_escapes, lenof(iso2022kr_escapes), "\1\2", "\2", 0x80000040, "\033$)C", FALSE, iso2022kr_to_ucs, iso2022kr_from_ucs @@ -562,7 +559,8 @@ enum { CTEXT_ISO8859_9, CTEXT_GB2312, CTEXT_KSC5601, - CTEXT_JISX0208 + CTEXT_JISX0208, + CTEXT_JISX0212 }; static long int ctext_to_ucs(int subcharset, unsigned long bytes) { @@ -599,6 +597,9 @@ static long int ctext_to_ucs(int subcharset, unsigned long bytes) case CTEXT_JISX0208: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21, ((bytes ) & 0xFF) - 0x21); + case CTEXT_JISX0212: + return jisx0212_to_unicode(((bytes >> 8) & 0xFF) - 0x21, + ((bytes ) & 0xFF) - 0x21); default: return ERROR; } } @@ -666,12 +667,16 @@ static int ctext_from_ucs(long int ucs, int *subcharset, unsigned long *bytes) *subcharset = CTEXT_JISX0208; *bytes = ((r+0x21) << 8) | (c+0x21); return 1; + } else if 
(unicode_to_jisx0212(ucs, &r, &c)) { + *subcharset = CTEXT_JISX0212; + *bytes = ((r+0x21) << 8) | (c+0x21); + return 1; } else { return 0; } } #define SEQ(str,cont,cs) \ - {str,~(63<<(6*((cont&~RO)))),(cs)<<(6*((cont&~RO))),(cont),(cs)} + {str,~(63<<(6*(((cont)&~RO)))),(cs)<<(6*(((cont)&~RO))),(cont),(cs)} /* * Compound text defines restrictions on which container can take * which character sets. Things labelled `left half of' can only go @@ -684,13 +689,15 @@ static int ctext_from_ucs(long int ucs, int *subcharset, unsigned long *bytes) * in either; we prefer GR where possible since this leads to a * more compact EUC-like encoding. */ -static struct iso2022_escape ctext_escapes[] = { +static const struct iso2022_escape ctext_escapes[] = { SEQ("\033$(A", 0|RO, CTEXT_GB2312), SEQ("\033$(B", 0|RO, CTEXT_JISX0208), SEQ("\033$(C", 0|RO, CTEXT_KSC5601), + SEQ("\033$(D", 0|RO, CTEXT_JISX0212), SEQ("\033$)A", 1, CTEXT_GB2312), SEQ("\033$)B", 1, CTEXT_JISX0208), SEQ("\033$)C", 1, CTEXT_KSC5601), + SEQ("\033$)D", 1, CTEXT_JISX0212), SEQ("\033(B", 0, CTEXT_ASCII), SEQ("\033(J", 0, CTEXT_JISX0201_LEFT), SEQ("\033)I", 1, CTEXT_JISX0201_RIGHT), @@ -703,10 +710,37 @@ static struct iso2022_escape ctext_escapes[] = { SEQ("\033-H", 1, CTEXT_ISO8859_8), SEQ("\033-L", 1, CTEXT_ISO8859_5), SEQ("\033-M", 1, CTEXT_ISO8859_9), + + /* + * Cross-testing against Xutf8TextListToTextProperty() turns up + * some additional character sets and ISO 2022 features + * supported by that and not by us: + * + * - Single-byte right-hand-half character sets `ESC - f', + * `ESC - T' and `ESC - Y'. + * + * - A really horrifying mechanism used to escape completely + * from the ISO 2022 framework: ESC % / <length> + * <charset-name> <text>. Xutf8* uses this to encode + * "iso8859-14", "iso8859-15" and "big5-0". + * * This mechanism is particularly nasty because we can't + * efficiently encode it on the fly! 
It requires that the + * length of the text encoded in the foreign charset is + * given _before_ the text in question, so if we're + * receiving one character at a time we simply can't look + * ahead and so we would have to encode each individual + * character in a separate one of these sequences. + * + * - ESC % G and ESC % @ to shift to and from UTF-8 mode, as a + * last resort for anything we still don't support. + * * Interestingly, ctext.ps actually _disallows_ this: it + * says that the above extension mechanism is the only + * one permitted. Ho hum. + */ }; -static struct iso2022 ctext = { +static const struct iso2022 ctext = { ctext_escapes, lenof(ctext_escapes), - "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\2", /* must match the enum above */ + "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\2\2", /* must match the enum above */ "", 0x80000000 | (CTEXT_ASCII<<0) | (CTEXT_ISO8859_1<<6), "", TRUE, ctext_to_ucs, ctext_from_ucs };