X-Git-Url: https://git.distorted.org.uk/~mdw/sgt/charset/blobdiff_plain/c601368c81dc3ea407d218ea6133b48ea8a3b278..a89fe3cf498bb23b385b0fc1a7b229035c7eb8b3:/iso2022s.c diff --git a/iso2022s.c b/iso2022s.c index 53b5d44..3cf8fa4 100644 --- a/iso2022s.c +++ b/iso2022s.c @@ -1,9 +1,5 @@ /* * iso2022s.c - support for ISO-2022 subset encodings. - * - * (The `s' suffix on the filename is there to leave `iso2022.c' - * free for the unlikely event that I ever attempt to implement - * _full_ ISO-2022 in this library!) */ #ifndef ENUM_CHARSETS @@ -42,7 +38,7 @@ struct iso2022 { * in ASCII order, so that we can narrow down the list as * necessary. */ - struct iso2022_escape *escapes; /* must be sorted in ASCII order! */ + const struct iso2022_escape *escapes;/* must be sorted in ASCII order! */ int nescapes; /* @@ -482,13 +478,13 @@ static int iso2022jp_from_ucs(long int ucs, int *subcharset, return 0; } } -static struct iso2022_escape iso2022jp_escapes[] = { +static const struct iso2022_escape iso2022jp_escapes[] = { {"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1}, /* we ignore this one */ {"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2}, {"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0}, {"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1}, }; -static struct iso2022 iso2022jp = { +static const struct iso2022 iso2022jp = { iso2022jp_escapes, lenof(iso2022jp_escapes), "\1\1\2", "\3", 0x80000000, NULL, FALSE, iso2022jp_to_ucs, iso2022jp_from_ucs @@ -525,12 +521,12 @@ static int iso2022kr_from_ucs(long int ucs, int *subcharset, return 0; } } -static struct iso2022_escape iso2022kr_escapes[] = { +static const struct iso2022_escape iso2022kr_escapes[] = { {"\016", 0x8FFFFFFF, 0x10000000, -1, -1}, {"\017", 0x8FFFFFFF, 0x00000000, 0, 0}, {"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1}, /* bits[11:6] <- 1 */ }; -static struct iso2022 iso2022kr = { +static const struct iso2022 iso2022kr = { iso2022kr_escapes, lenof(iso2022kr_escapes), "\1\2", "\2", 0x80000040, "\033$)C", FALSE, iso2022kr_to_ucs, iso2022kr_from_ucs @@ -692,7 
+688,7 @@ static int ctext_from_ucs(long int ucs, int *subcharset, unsigned long *bytes) * in either; we prefer GR where possible since this leads to a * more compact EUC-like encoding. */ -static struct iso2022_escape ctext_escapes[] = { +static const struct iso2022_escape ctext_escapes[] = { SEQ("\033$(A", 0|RO, CTEXT_GB2312), SEQ("\033$(B", 0|RO, CTEXT_JISX0208), SEQ("\033$(C", 0|RO, CTEXT_KSC5601), @@ -713,8 +709,35 @@ static struct iso2022_escape ctext_escapes[] = { SEQ("\033-H", 1, CTEXT_ISO8859_8), SEQ("\033-L", 1, CTEXT_ISO8859_5), SEQ("\033-M", 1, CTEXT_ISO8859_9), + + /* + * Cross-testing against Xutf8TextListToTextProperty() turns up + * some additional character sets and ISO 2022 features + * supported by that and not by us: + * + * - Single-byte right-hand-half character sets `ESC - f', + * `ESC - T' and `ESC - Y'. + * + * - A really horrifying mechanism used to escape completely + * from the ISO 2022 framework: ESC % / <F> <M> <L> + * <encoding-name> STX <text>. Xutf8* uses this to encode + * "iso8859-14", "iso8859-15" and "big5-0". + * * This mechanism is particularly nasty because we can't + * efficiently encode it on the fly! It requires that the + * length of the text encoded in the foreign charset is + * given _before_ the text in question, so if we're + * receiving one character at a time we simply can't look + * ahead and so we would have to encode each individual + * character in a separate one of these sequences. + * + * - ESC % G and ESC % @ to shift to and from UTF-8 mode, as a + * last resort for anything we still don't support. + * * Interestingly, ctext.ps actually _disallows_ this: it + * says that the above extension mechanism is the only + * one permitted. Ho hum. + */ }; -static struct iso2022 ctext = { +static const struct iso2022 ctext = { ctext_escapes, lenof(ctext_escapes), "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\2\2", /* must match the enum above */ "", 0x80000000 | (CTEXT_ASCII<<0) | (CTEXT_ISO8859_1<<6), "", TRUE,