X-Git-Url: https://git.distorted.org.uk/~mdw/sgt/charset/blobdiff_plain/ee45694b123040008ffadc8cb5182ce7041e84b2..7804475c3f05dfeb234d7a3e7578a2db2c5562c8:/iso2022s.c diff --git a/iso2022s.c b/iso2022s.c index 087fd96..000c57e 100644 --- a/iso2022s.c +++ b/iso2022s.c @@ -42,7 +42,7 @@ struct iso2022 { * in ASCII order, so that we can narrow down the list as * necessary. */ - struct iso2022_escape *escapes; /* must be sorted in ASCII order! */ + const struct iso2022_escape *escapes;/* must be sorted in ASCII order! */ int nescapes; /* @@ -482,13 +482,13 @@ static int iso2022jp_from_ucs(long int ucs, int *subcharset, return 0; } } -static struct iso2022_escape iso2022jp_escapes[] = { +static const struct iso2022_escape iso2022jp_escapes[] = { {"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1}, /* we ignore this one */ {"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2}, {"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0}, {"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1}, }; -static struct iso2022 iso2022jp = { +static const struct iso2022 iso2022jp = { iso2022jp_escapes, lenof(iso2022jp_escapes), "\1\1\2", "\3", 0x80000000, NULL, FALSE, iso2022jp_to_ucs, iso2022jp_from_ucs @@ -525,12 +525,12 @@ static int iso2022kr_from_ucs(long int ucs, int *subcharset, return 0; } } -static struct iso2022_escape iso2022kr_escapes[] = { +static const struct iso2022_escape iso2022kr_escapes[] = { {"\016", 0x8FFFFFFF, 0x10000000, -1, -1}, {"\017", 0x8FFFFFFF, 0x00000000, 0, 0}, {"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1}, /* bits[11:6] <- 1 */ }; -static struct iso2022 iso2022kr = { +static const struct iso2022 iso2022kr = { iso2022kr_escapes, lenof(iso2022kr_escapes), "\1\2", "\2", 0x80000040, "\033$)C", FALSE, iso2022kr_to_ucs, iso2022kr_from_ucs @@ -562,7 +562,8 @@ enum { CTEXT_ISO8859_9, CTEXT_GB2312, CTEXT_KSC5601, - CTEXT_JISX0208 + CTEXT_JISX0208, + CTEXT_JISX0212 }; static long int ctext_to_ucs(int subcharset, unsigned long bytes) { @@ -599,6 +600,9 @@ static long int ctext_to_ucs(int subcharset, unsigned long bytes) case CTEXT_JISX0208: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21, ((bytes ) & 0xFF) - 0x21); + case CTEXT_JISX0212: + return jisx0212_to_unicode(((bytes >> 8) & 0xFF) - 0x21, + ((bytes ) & 0xFF) - 0x21); default: return ERROR; } } @@ -666,12 +670,16 @@ static int ctext_from_ucs(long int ucs, int *subcharset, unsigned long *bytes) *subcharset = CTEXT_JISX0208; *bytes = ((r+0x21) << 8) | (c+0x21); return 1; + } else if (unicode_to_jisx0212(ucs, &r, &c)) { + *subcharset = CTEXT_JISX0212; + *bytes = ((r+0x21) << 8) | (c+0x21); + return 1; } else { return 0; } } #define SEQ(str,cont,cs) \ - {str,~(63<<(6*((cont&~RO)))),(cs)<<(6*((cont&~RO))),(cont),(cs)} + {str,~(63<<(6*(((cont)&~RO)))),(cs)<<(6*(((cont)&~RO))),(cont),(cs)} /* * Compound text defines restrictions on which container can take * which character sets. Things labelled `left half of' can only go @@ -684,13 +692,15 @@ static int ctext_from_ucs(long int ucs, int *subcharset, unsigned long *bytes) * in either; we prefer GR where possible since this leads to a * more compact EUC-like encoding. */ -static struct iso2022_escape ctext_escapes[] = { +static const struct iso2022_escape ctext_escapes[] = { SEQ("\033$(A", 0|RO, CTEXT_GB2312), SEQ("\033$(B", 0|RO, CTEXT_JISX0208), SEQ("\033$(C", 0|RO, CTEXT_KSC5601), + SEQ("\033$(D", 0|RO, CTEXT_JISX0212), SEQ("\033$)A", 1, CTEXT_GB2312), SEQ("\033$)B", 1, CTEXT_JISX0208), SEQ("\033$)C", 1, CTEXT_KSC5601), + SEQ("\033$)D", 1, CTEXT_JISX0212), SEQ("\033(B", 0, CTEXT_ASCII), SEQ("\033(J", 0, CTEXT_JISX0201_LEFT), SEQ("\033)I", 1, CTEXT_JISX0201_RIGHT), @@ -703,10 +713,37 @@ static struct iso2022_escape ctext_escapes[] = { SEQ("\033-H", 1, CTEXT_ISO8859_8), SEQ("\033-L", 1, CTEXT_ISO8859_5), SEQ("\033-M", 1, CTEXT_ISO8859_9), + + /* + * Cross-testing against Xutf8TextListToTextProperty() turns up + * some additional character sets and ISO 2022 features + * supported by that and not by us: + * + * - Single-byte right-hand-half character sets `ESC - f', + * `ESC - T' and `ESC - Y'. + * + * - A really horrifying mechanism used to escape completely + * from the ISO 2022 framework: ESC % / + * . Xutf8* uses this to encode + * "iso8859-14", "iso8859-15" and "big5-0". + * * This mechanism is particularly nasty because we can't + * efficiently encode it on the fly! It requires that the + * length of the text encoded in the foreign charset is + * given _before_ the text in question, so if we're + * receiving one character at a time we simply can't look + * ahead and so we would have to encode each individual + * character in a separate one of these sequences. + * + * - ESC % G and ESC % @ to shift to and from UTF-8 mode, as a + * last resort for anything we still don't support. + * * Interestingly, ctext.ps actually _disallows_ this: it + * says that the above extension mechanism is the only + * one permitted. Ho hum. + */ }; -static struct iso2022 ctext = { +static const struct iso2022 ctext = { ctext_escapes, lenof(ctext_escapes), - "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\2", /* must match the enum above */ + "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\2\2", /* must match the enum above */ "", 0x80000000 | (CTEXT_ASCII<<0) | (CTEXT_ISO8859_1<<6), "", TRUE, ctext_to_ucs, ctext_from_ucs };