X-Git-Url: https://git.distorted.org.uk/~mdw/sgt/charset/blobdiff_plain/01081d4e1448c952c0967e71448b504e26360923..HEAD:/iso2022s.c diff --git a/iso2022s.c b/iso2022s.c index fd75ab4..a1eceb8 100644 --- a/iso2022s.c +++ b/iso2022s.c @@ -1,9 +1,5 @@ /* * iso2022s.c - support for ISO-2022 subset encodings. - * - * (The `s' suffix on the filename is there to leave `iso2022.c' - * free for the unlikely event that I ever attempt to implement - * _full_ ISO-2022 in this library!) */ #ifndef ENUM_CHARSETS @@ -42,7 +38,7 @@ struct iso2022 { * in ASCII order, so that we can narrow down the list as * necessary. */ - struct iso2022_escape *escapes; /* must be sorted in ASCII order! */ + const struct iso2022_escape *escapes;/* must be sorted in ASCII order! */ int nescapes; /* @@ -450,12 +446,13 @@ static int write_iso2022s(charset_spec const *charset, long int input_chr, static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes) { switch (subcharset) { + case 1: /* JIS X 0201 bottom half */ + if (bytes == 0x5C) + return 0xA5; + else if (bytes == 0x7E) + return 0x203E; + /* else fall through to ASCII */ case 0: return bytes; /* one-byte ASCII */ - case 1: /* JIS X 0201 half-width katakana */ - if (bytes >= 0x21 && bytes <= 0x5F) - return bytes + (0xFF61 - 0x21); - else - return ERROR; /* (no break needed since all control paths have returned) */ case 2: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21, ((bytes ) & 0xFF) - 0x21); @@ -470,9 +467,9 @@ static int iso2022jp_from_ucs(long int ucs, int *subcharset, *subcharset = 0; *bytes = ucs; return 1; - } else if (ucs >= 0xFF61 && ucs <= 0xFF9F) { + } else if (ucs == 0xA5 || ucs == 0x203E) { *subcharset = 1; - *bytes = ucs - (0xFF61 - 0x21); + *bytes = (ucs == 0xA5 ? 0x5C : 0x7E); return 1; } else if (unicode_to_jisx0208(ucs, &r, &c)) { *subcharset = 2; @@ -482,13 +479,13 @@ static int iso2022jp_from_ucs(long int ucs, int *subcharset, return 0; } } -static struct iso2022_escape iso2022jp_escapes[] = { +static const struct iso2022_escape iso2022jp_escapes[] = { {"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1}, /* we ignore this one */ {"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2}, {"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0}, {"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1}, }; -static struct iso2022 iso2022jp = { +static const struct iso2022 iso2022jp = { iso2022jp_escapes, lenof(iso2022jp_escapes), "\1\1\2", "\3", 0x80000000, NULL, FALSE, iso2022jp_to_ucs, iso2022jp_from_ucs @@ -525,12 +522,12 @@ static int iso2022kr_from_ucs(long int ucs, int *subcharset, return 0; } } -static struct iso2022_escape iso2022kr_escapes[] = { +static const struct iso2022_escape iso2022kr_escapes[] = { {"\016", 0x8FFFFFFF, 0x10000000, -1, -1}, {"\017", 0x8FFFFFFF, 0x00000000, 0, 0}, {"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1}, /* bits[11:6] <- 1 */ }; -static struct iso2022 iso2022kr = { +static const struct iso2022 iso2022kr = { iso2022kr_escapes, lenof(iso2022kr_escapes), "\1\2", "\2", 0x80000040, "\033$)C", FALSE, iso2022kr_to_ucs, iso2022kr_from_ucs @@ -539,185 +536,9 @@ const charset_spec charset_CS_ISO2022_KR = { CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr }; -/* - * The COMPOUND_TEXT encoding used in X selections. Defined by the - * X consortium. - * - * This encoding has quite a few sub-charsets. The order I assign - * to them here is given in an enum. - */ -enum { - /* This must match the bytes-per-character string given below. */ - CTEXT_ASCII, - CTEXT_JISX0201_LEFT, - CTEXT_JISX0201_RIGHT, - CTEXT_ISO8859_1, - CTEXT_ISO8859_2, - CTEXT_ISO8859_3, - CTEXT_ISO8859_4, - CTEXT_ISO8859_5, - CTEXT_ISO8859_6, - CTEXT_ISO8859_7, - CTEXT_ISO8859_8, - CTEXT_ISO8859_9, - CTEXT_GB2312, - CTEXT_KSC5601, - CTEXT_JISX0208 -}; -static long int ctext_to_ucs(int subcharset, unsigned long bytes) -{ - switch (subcharset) { - case CTEXT_ASCII: return bytes; /* one-byte ASCII */ - case CTEXT_JISX0201_LEFT: /* ASCII with yen and overline */ - return sbcs_to_unicode(&sbcsdata_CS_JISX0201, bytes & 0x7F); - case CTEXT_JISX0201_RIGHT: /* JIS X 0201 half-width katakana */ - return sbcs_to_unicode(&sbcsdata_CS_JISX0201, (bytes & 0x7F) | 0x80); - case CTEXT_ISO8859_1: - return sbcs_to_unicode(&sbcsdata_CS_ISO8859_1, (bytes & 0x7F) | 0x80); - case CTEXT_ISO8859_2: - return sbcs_to_unicode(&sbcsdata_CS_ISO8859_2, (bytes & 0x7F) | 0x80); - case CTEXT_ISO8859_3: - return sbcs_to_unicode(&sbcsdata_CS_ISO8859_3, (bytes & 0x7F) | 0x80); - case CTEXT_ISO8859_4: - return sbcs_to_unicode(&sbcsdata_CS_ISO8859_4, (bytes & 0x7F) | 0x80); - case CTEXT_ISO8859_5: - return sbcs_to_unicode(&sbcsdata_CS_ISO8859_5, (bytes & 0x7F) | 0x80); - case CTEXT_ISO8859_6: - return sbcs_to_unicode(&sbcsdata_CS_ISO8859_6, (bytes & 0x7F) | 0x80); - case CTEXT_ISO8859_7: - return sbcs_to_unicode(&sbcsdata_CS_ISO8859_7, (bytes & 0x7F) | 0x80); - case CTEXT_ISO8859_8: - return sbcs_to_unicode(&sbcsdata_CS_ISO8859_8, (bytes & 0x7F) | 0x80); - case CTEXT_ISO8859_9: - return sbcs_to_unicode(&sbcsdata_CS_ISO8859_9, (bytes & 0x7F) | 0x80); - case CTEXT_GB2312: - return gb2312_to_unicode(((bytes >> 8) & 0xFF) - 0x21, - ((bytes ) & 0xFF) - 0x21); - case CTEXT_KSC5601: - return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21, - ((bytes ) & 0xFF) - 0x21); - case CTEXT_JISX0208: - return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21, - ((bytes ) & 0xFF) - 0x21); - default: return ERROR; - } -} -static int ctext_from_ucs(long int ucs, int *subcharset, unsigned long *bytes) -{ - int r, c; - if (ucs < 0x80) { - *subcharset = CTEXT_ASCII; - *bytes = ucs; - return 1; - } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_1, ucs)) != ERROR) { - *subcharset = CTEXT_ISO8859_1; - *bytes = c - 0x80; - return 1; - } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_2, ucs)) != ERROR) { - *subcharset = CTEXT_ISO8859_2; - *bytes = c - 0x80; - return 1; - } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_3, ucs)) != ERROR) { - *subcharset = CTEXT_ISO8859_3; - *bytes = c - 0x80; - return 1; - } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_4, ucs)) != ERROR) { - *subcharset = CTEXT_ISO8859_4; - *bytes = c - 0x80; - return 1; - } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_5, ucs)) != ERROR) { - *subcharset = CTEXT_ISO8859_5; - *bytes = c - 0x80; - return 1; - } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_6, ucs)) != ERROR) { - *subcharset = CTEXT_ISO8859_6; - *bytes = c - 0x80; - return 1; - } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_7, ucs)) != ERROR) { - *subcharset = CTEXT_ISO8859_7; - *bytes = c - 0x80; - return 1; - } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_8, ucs)) != ERROR) { - *subcharset = CTEXT_ISO8859_8; - *bytes = c - 0x80; - return 1; - } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_9, ucs)) != ERROR) { - *subcharset = CTEXT_ISO8859_9; - *bytes = c - 0x80; - return 1; - } else if ((c = sbcs_from_unicode(&sbcsdata_CS_JISX0201, ucs)) != ERROR) { - if (c < 0x80) { - *subcharset = CTEXT_JISX0201_LEFT; - } else { - *subcharset = CTEXT_JISX0201_RIGHT; - c -= 0x80; - } - *bytes = c; - return 1; - } else if (unicode_to_gb2312(ucs, &r, &c)) { - *subcharset = CTEXT_GB2312; - *bytes = ((r+0x21) << 8) | (c+0x21); - return 1; - } else if (unicode_to_ksx1001(ucs, &r, &c)) { - *subcharset = CTEXT_KSC5601; - *bytes = ((r+0x21) << 8) | (c+0x21); - return 1; - } else if (unicode_to_jisx0208(ucs, &r, &c)) { - *subcharset = CTEXT_JISX0208; - *bytes = ((r+0x21) << 8) | (c+0x21); - return 1; - } else { - return 0; - } -} -#define SEQ(str,cont,cs) \ - {str,~(63<<(6*((cont&~RO)))),(cs)<<(6*((cont&~RO))),(cont),(cs)} -/* - * Compound text defines restrictions on which container can take - * which character sets. Things labelled `left half of' can only go - * in GL; things labelled `right half of' can only go in GR; and 96 - * or 96^n character sets only _fit_ in GR. Thus: - * - ASCII can only go in GL since it is the left half of 8859-*. - * - All the 8859 sets can only go in GR. - * - JISX0201 left is GL only; JISX0201 right is GR only. - * - The three multibyte sets (GB2312, JISX0208, KSC5601) can go - * in either; we prefer GR where possible since this leads to a - * more compact EUC-like encoding. - */ -static struct iso2022_escape ctext_escapes[] = { - SEQ("\033$(A", 0|RO, CTEXT_GB2312), - SEQ("\033$(B", 0|RO, CTEXT_JISX0208), - SEQ("\033$(C", 0|RO, CTEXT_KSC5601), - SEQ("\033$)A", 1, CTEXT_GB2312), - SEQ("\033$)B", 1, CTEXT_JISX0208), - SEQ("\033$)C", 1, CTEXT_KSC5601), - SEQ("\033(B", 0, CTEXT_ASCII), - SEQ("\033(J", 0, CTEXT_JISX0201_LEFT), - SEQ("\033-A", 1, CTEXT_ISO8859_1), - SEQ("\033-B", 1, CTEXT_ISO8859_2), - SEQ("\033-C", 1, CTEXT_ISO8859_3), - SEQ("\033-D", 1, CTEXT_ISO8859_4), - SEQ("\033-F", 1, CTEXT_ISO8859_7), - SEQ("\033-G", 1, CTEXT_ISO8859_6), - SEQ("\033-H", 1, CTEXT_ISO8859_8), - SEQ("\033)I", 1, CTEXT_JISX0201_RIGHT), - SEQ("\033-L", 1, CTEXT_ISO8859_5), - SEQ("\033-M", 1, CTEXT_ISO8859_9), -}; -static struct iso2022 ctext = { - ctext_escapes, lenof(ctext_escapes), - "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\2", /* must match the enum above */ - "", 0x80000000 | (CTEXT_ASCII<<0) | (CTEXT_ASCII<<6), "", TRUE, - ctext_to_ucs, ctext_from_ucs -}; -const charset_spec charset_CS_CTEXT = { - CS_CTEXT, read_iso2022s, write_iso2022s, &ctext -}; - #else /* ENUM_CHARSETS */ ENUM_CHARSET(CS_ISO2022_JP) ENUM_CHARSET(CS_ISO2022_KR) -ENUM_CHARSET(CS_CTEXT) #endif /* ENUM_CHARSETS */