X-Git-Url: https://git.distorted.org.uk/~mdw/sgt/charset/blobdiff_plain/b97e542708b7c9ebd09991420fbbf6785d5ccd87..HEAD:/iso2022s.c diff --git a/iso2022s.c b/iso2022s.c index 3cf8fa4..a1eceb8 100644 --- a/iso2022s.c +++ b/iso2022s.c @@ -446,12 +446,13 @@ static int write_iso2022s(charset_spec const *charset, long int input_chr, static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes) { switch (subcharset) { + case 1: /* JIS X 0201 bottom half */ + if (bytes == 0x5C) + return 0xA5; + else if (bytes == 0x7E) + return 0x203E; + /* else fall through to ASCII */ case 0: return bytes; /* one-byte ASCII */ - case 1: /* JIS X 0201 half-width katakana */ - if (bytes >= 0x21 && bytes <= 0x5F) - return bytes + (0xFF61 - 0x21); - else - return ERROR; /* (no break needed since all control paths have returned) */ case 2: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21, ((bytes ) & 0xFF) - 0x21); @@ -466,9 +467,9 @@ static int iso2022jp_from_ucs(long int ucs, int *subcharset, *subcharset = 0; *bytes = ucs; return 1; - } else if (ucs >= 0xFF61 && ucs <= 0xFF9F) { + } else if (ucs == 0xA5 || ucs == 0x203E) { *subcharset = 1; - *bytes = ucs - (0xFF61 - 0x21); + *bytes = (ucs == 0xA5 ? 0x5C : 0x7E); return 1; } else if (unicode_to_jisx0208(ucs, &r, &c)) { *subcharset = 2; @@ -535,222 +536,9 @@ const charset_spec charset_CS_ISO2022_KR = { CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr }; -/* - * The COMPOUND_TEXT encoding used in X selections. Defined by the - * X consortium. - * - * This encoding has quite a few sub-charsets. The order I assign - * to them here is given in an enum. - */ -enum { - /* This must match the bytes-per-character string given below. */ - CTEXT_ASCII, - CTEXT_JISX0201_LEFT, - CTEXT_JISX0201_RIGHT, - CTEXT_ISO8859_1, - CTEXT_ISO8859_2, - CTEXT_ISO8859_3, - CTEXT_ISO8859_4, - CTEXT_ISO8859_5, - CTEXT_ISO8859_6, - CTEXT_ISO8859_7, - CTEXT_ISO8859_8, - CTEXT_ISO8859_9, - CTEXT_GB2312, - CTEXT_KSC5601, - CTEXT_JISX0208, - CTEXT_JISX0212 -}; -static long int ctext_to_ucs(int subcharset, unsigned long bytes) -{ - switch (subcharset) { - case CTEXT_ASCII: return bytes; /* one-byte ASCII */ - case CTEXT_JISX0201_LEFT: /* ASCII with yen and overline */ - return sbcs_to_unicode(&sbcsdata_CS_JISX0201, bytes & 0x7F); - case CTEXT_JISX0201_RIGHT: /* JIS X 0201 half-width katakana */ - return sbcs_to_unicode(&sbcsdata_CS_JISX0201, (bytes & 0x7F) | 0x80); - case CTEXT_ISO8859_1: - return sbcs_to_unicode(&sbcsdata_CS_ISO8859_1, (bytes & 0x7F) | 0x80); - case CTEXT_ISO8859_2: - return sbcs_to_unicode(&sbcsdata_CS_ISO8859_2, (bytes & 0x7F) | 0x80); - case CTEXT_ISO8859_3: - return sbcs_to_unicode(&sbcsdata_CS_ISO8859_3, (bytes & 0x7F) | 0x80); - case CTEXT_ISO8859_4: - return sbcs_to_unicode(&sbcsdata_CS_ISO8859_4, (bytes & 0x7F) | 0x80); - case CTEXT_ISO8859_5: - return sbcs_to_unicode(&sbcsdata_CS_ISO8859_5, (bytes & 0x7F) | 0x80); - case CTEXT_ISO8859_6: - return sbcs_to_unicode(&sbcsdata_CS_ISO8859_6, (bytes & 0x7F) | 0x80); - case CTEXT_ISO8859_7: - return sbcs_to_unicode(&sbcsdata_CS_ISO8859_7, (bytes & 0x7F) | 0x80); - case CTEXT_ISO8859_8: - return sbcs_to_unicode(&sbcsdata_CS_ISO8859_8, (bytes & 0x7F) | 0x80); - case CTEXT_ISO8859_9: - return sbcs_to_unicode(&sbcsdata_CS_ISO8859_9, (bytes & 0x7F) | 0x80); - case CTEXT_GB2312: - return gb2312_to_unicode(((bytes >> 8) & 0xFF) - 0x21, - ((bytes ) & 0xFF) - 0x21); - case CTEXT_KSC5601: - return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21, - ((bytes ) & 0xFF) - 0x21); - case CTEXT_JISX0208: - return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21, - ((bytes ) & 0xFF) - 0x21); - case CTEXT_JISX0212: - return jisx0212_to_unicode(((bytes >> 8) & 0xFF) - 0x21, - ((bytes ) & 0xFF) - 0x21); - default: return ERROR; - } -} -static int ctext_from_ucs(long int ucs, int *subcharset, unsigned long *bytes) -{ - int r, c; - if (ucs < 0x80) { - *subcharset = CTEXT_ASCII; - *bytes = ucs; - return 1; - } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_1, ucs)) != ERROR) { - *subcharset = CTEXT_ISO8859_1; - *bytes = c - 0x80; - return 1; - } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_2, ucs)) != ERROR) { - *subcharset = CTEXT_ISO8859_2; - *bytes = c - 0x80; - return 1; - } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_3, ucs)) != ERROR) { - *subcharset = CTEXT_ISO8859_3; - *bytes = c - 0x80; - return 1; - } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_4, ucs)) != ERROR) { - *subcharset = CTEXT_ISO8859_4; - *bytes = c - 0x80; - return 1; - } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_5, ucs)) != ERROR) { - *subcharset = CTEXT_ISO8859_5; - *bytes = c - 0x80; - return 1; - } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_6, ucs)) != ERROR) { - *subcharset = CTEXT_ISO8859_6; - *bytes = c - 0x80; - return 1; - } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_7, ucs)) != ERROR) { - *subcharset = CTEXT_ISO8859_7; - *bytes = c - 0x80; - return 1; - } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_8, ucs)) != ERROR) { - *subcharset = CTEXT_ISO8859_8; - *bytes = c - 0x80; - return 1; - } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_9, ucs)) != ERROR) { - *subcharset = CTEXT_ISO8859_9; - *bytes = c - 0x80; - return 1; - } else if ((c = sbcs_from_unicode(&sbcsdata_CS_JISX0201, ucs)) != ERROR) { - if (c < 0x80) { - *subcharset = CTEXT_JISX0201_LEFT; - } else { - *subcharset = CTEXT_JISX0201_RIGHT; - c -= 0x80; - } - *bytes = c; - return 1; - } else if (unicode_to_gb2312(ucs, &r, &c)) { - *subcharset = CTEXT_GB2312; - *bytes = ((r+0x21) << 8) | (c+0x21); - return 1; - } else if (unicode_to_ksx1001(ucs, &r, &c)) { - *subcharset = CTEXT_KSC5601; - *bytes = ((r+0x21) << 8) | (c+0x21); - return 1; - } else if (unicode_to_jisx0208(ucs, &r, &c)) { - *subcharset = CTEXT_JISX0208; - *bytes = ((r+0x21) << 8) | (c+0x21); - return 1; - } else if (unicode_to_jisx0212(ucs, &r, &c)) { - *subcharset = CTEXT_JISX0212; - *bytes = ((r+0x21) << 8) | (c+0x21); - return 1; - } else { - return 0; - } -} -#define SEQ(str,cont,cs) \ - {str,~(63<<(6*(((cont)&~RO)))),(cs)<<(6*(((cont)&~RO))),(cont),(cs)} -/* - * Compound text defines restrictions on which container can take - * which character sets. Things labelled `left half of' can only go - * in GL; things labelled `right half of' can only go in GR; and 96 - * or 96^n character sets only _fit_ in GR. Thus: - * - ASCII can only go in GL since it is the left half of 8859-*. - * - All the 8859 sets can only go in GR. - * - JISX0201 left is GL only; JISX0201 right is GR only. - * - The three multibyte sets (GB2312, JISX0208, KSC5601) can go - * in either; we prefer GR where possible since this leads to a - * more compact EUC-like encoding. - */ -static const struct iso2022_escape ctext_escapes[] = { - SEQ("\033$(A", 0|RO, CTEXT_GB2312), - SEQ("\033$(B", 0|RO, CTEXT_JISX0208), - SEQ("\033$(C", 0|RO, CTEXT_KSC5601), - SEQ("\033$(D", 0|RO, CTEXT_JISX0212), - SEQ("\033$)A", 1, CTEXT_GB2312), - SEQ("\033$)B", 1, CTEXT_JISX0208), - SEQ("\033$)C", 1, CTEXT_KSC5601), - SEQ("\033$)D", 1, CTEXT_JISX0212), - SEQ("\033(B", 0, CTEXT_ASCII), - SEQ("\033(J", 0, CTEXT_JISX0201_LEFT), - SEQ("\033)I", 1, CTEXT_JISX0201_RIGHT), - SEQ("\033-A", 1, CTEXT_ISO8859_1), - SEQ("\033-B", 1, CTEXT_ISO8859_2), - SEQ("\033-C", 1, CTEXT_ISO8859_3), - SEQ("\033-D", 1, CTEXT_ISO8859_4), - SEQ("\033-F", 1, CTEXT_ISO8859_7), - SEQ("\033-G", 1, CTEXT_ISO8859_6), - SEQ("\033-H", 1, CTEXT_ISO8859_8), - SEQ("\033-L", 1, CTEXT_ISO8859_5), - SEQ("\033-M", 1, CTEXT_ISO8859_9), - - /* - * Cross-testing against Xutf8TextListToTextProperty() turns up - * some additional character sets and ISO 2022 features - * supported by that and not by us: - * - * - Single-byte right-hand-half character sets `ESC - f', - * `ESC - T' and `ESC - Y'. - * - * - A really horrifying mechanism used to escape completely - * from the ISO 2022 framework: ESC % / - * . Xutf8* uses this to encode - * "iso8859-14", "iso8859-15" and "big5-0". - * * This mechanism is particularly nasty because we can't - * efficiently encode it on the fly! It requires that the - * length of the text encoded in the foreign charset is - * given _before_ the text in question, so if we're - * receiving one character at a time we simply can't look - * ahead and so we would have to encode each individual - * character in a separate one of these sequences. - * - * - ESC % G and ESC % @ to shift to and from UTF-8 mode, as a - * last resort for anything we still don't support. - * * Interestingly, ctext.ps actually _disallows_ this: it - * says that the above extension mechanism is the only - * one permitted. Ho hum. - */ -}; -static const struct iso2022 ctext = { - ctext_escapes, lenof(ctext_escapes), - "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\2\2", /* must match the enum above */ - "", 0x80000000 | (CTEXT_ASCII<<0) | (CTEXT_ISO8859_1<<6), "", TRUE, - ctext_to_ucs, ctext_from_ucs -}; -const charset_spec charset_CS_CTEXT = { - CS_CTEXT, read_iso2022s, write_iso2022s, &ctext -}; - #else /* ENUM_CHARSETS */ ENUM_CHARSET(CS_ISO2022_JP) ENUM_CHARSET(CS_ISO2022_KR) -ENUM_CHARSET(CS_CTEXT) #endif /* ENUM_CHARSETS */