X-Git-Url: https://git.distorted.org.uk/~mdw/sgt/charset/blobdiff_plain/a89fe3cf498bb23b385b0fc1a7b229035c7eb8b3..113375ca1cf0896e3a20a207eac1c2c404cbd4ae:/iso2022.c diff --git a/iso2022.c b/iso2022.c index e88f9e7..a570860 100644 --- a/iso2022.c +++ b/iso2022.c @@ -15,7 +15,8 @@ * sets are passed through, so a post-processor could fix them up if * necessary. * - * DOCS is not currently supported. It will be one day. + * DOCS to UTF-8 works. Other DOCS sequences are ignored, which will + * produce surprising results. */ #ifndef ENUM_CHARSETS @@ -34,6 +35,15 @@ enum {S4, S6, M4, M6}; +static long int emacs_big5_1_to_unicode(int, int); +static long int emacs_big5_2_to_unicode(int, int); +static long int cns11643_1_to_unicode(int, int); +static long int cns11643_2_to_unicode(int, int); +static long int cns11643_3_to_unicode(int, int); +static long int cns11643_4_to_unicode(int, int); +static long int cns11643_5_to_unicode(int, int); +static long int cns11643_6_to_unicode(int, int); +static long int cns11643_7_to_unicode(int, int); static long int null_dbcs_to_unicode(int, int); const struct iso2022_subcharset { @@ -42,9 +52,10 @@ const struct iso2022_subcharset { const sbcs_data *sbcs_base; long int (*dbcs_fn)(int, int); } iso2022_subcharsets[] = { - { S4, 0, 'B', 0x00, &sbcsdata_CS_ASCII }, - + { S4, 0, '0', 0x00, &sbcsdata_CS_DEC_GRAPHICS }, { S4, 0, '<', 0x80, &sbcsdata_CS_DEC_MCS }, + { S4, 0, 'A', 0x00, &sbcsdata_CS_BS4730 }, + { S4, 0, 'B', 0x00, &sbcsdata_CS_ASCII }, { S4, 0, 'I', 0x80, &sbcsdata_CS_JISX0201 }, { S4, 0, 'J', 0x00, &sbcsdata_CS_JISX0201 }, { S4, 0, '~' }, @@ -67,10 +78,19 @@ const struct iso2022_subcharset { #if 0 { M4, 0, '@' }, /* JIS C 6226-1978 */ #endif + { M4, 0, '0', -0x21, 0, &emacs_big5_1_to_unicode }, + { M4, 0, '1', -0x21, 0, &emacs_big5_2_to_unicode }, { M4, 0, 'A', -0x21, 0, &gb2312_to_unicode }, { M4, 0, 'B', -0x21, 0, &jisx0208_to_unicode }, { M4, 0, 'C', -0x21, 0, &ksx1001_to_unicode }, { M4, 0, 'D', -0x21, 0, 
&jisx0212_to_unicode }, + { M4, 0, 'G', -0x21, 0, &cns11643_1_to_unicode }, + { M4, 0, 'H', -0x21, 0, &cns11643_2_to_unicode }, + { M4, 0, 'I', -0x21, 0, &cns11643_3_to_unicode }, + { M4, 0, 'J', -0x21, 0, &cns11643_4_to_unicode }, + { M4, 0, 'K', -0x21, 0, &cns11643_5_to_unicode }, + { M4, 0, 'L', -0x21, 0, &cns11643_6_to_unicode }, + { M4, 0, 'M', -0x21, 0, &cns11643_7_to_unicode }, { M4, 0, '~', 0, 0, &null_dbcs_to_unicode }, /* empty 94^n-set */ { M6, 0, '~', 0, 0, &null_dbcs_to_unicode }, /* empty 96^n-set */ }; @@ -80,6 +100,63 @@ static long int null_dbcs_to_unicode(int r, int c) return ERROR; } +/* + * Emacs encodes Big5 in COMPOUND_TEXT as two 94x94 character sets. + * We treat Big5 as a 94x191 character set with a bunch of undefined + * columns in the middle, so we have to mess around a bit to make + * things fit. + */ + +static long int emacs_big5_1_to_unicode(int r, int c) +{ + unsigned long s; + s = r * 94 + c; + r = s / 157; + c = s % 157; + if (c >= 64) c += 34; /* Skip over the gap */ + return big5_to_unicode(r, c); +} + +static long int emacs_big5_2_to_unicode(int r, int c) +{ + unsigned long s; + s = r * 94 + c; + r = s / 157 + 40; + c = s % 157; + if (c >= 64) c += 34; /* Skip over the gap */ + return big5_to_unicode(r, c); +} + +/* Wrappers for cns11643_to_unicode() */ +static long int cns11643_1_to_unicode(int r, int c) +{ + return cns11643_to_unicode(0, r, c); +} +static long int cns11643_2_to_unicode(int r, int c) +{ + return cns11643_to_unicode(1, r, c); +} +static long int cns11643_3_to_unicode(int r, int c) +{ + return cns11643_to_unicode(2, r, c); +} +static long int cns11643_4_to_unicode(int r, int c) +{ + return cns11643_to_unicode(3, r, c); +} +static long int cns11643_5_to_unicode(int r, int c) +{ + return cns11643_to_unicode(4, r, c); +} +static long int cns11643_6_to_unicode(int r, int c) +{ + return cns11643_to_unicode(5, r, c); +} +static long int cns11643_7_to_unicode(int r, int c) +{ + return cns11643_to_unicode(6, r, c); +} + 
/* States, or "what we're currently accumulating". */ enum { IDLE, /* None of the below */ @@ -88,7 +165,8 @@ enum { ESCSEQ, /* Accumulating an escape sequence */ ESCDROP, /* Discarding an escape sequence */ ESCPASS, /* Passing through an escape sequence */ - DOCSUTF8 /* DOCSed into UTF-8 */ + DOCSUTF8, /* DOCSed into UTF-8 */ + DOCSCTEXT /* DOCSed into a COMPOUND_TEXT extended segment */ }; #if 1 @@ -143,8 +221,7 @@ static void do_utf8(long int input_chr, ustate.s1 = 0; ustate.s0 = state->s0 & 0x03ffffffL; - utf8 = charset_find_spec(CS_UTF8); - utf8->read(utf8, input_chr, &ustate, emit, emitctx); + read_utf8(NULL, input_chr, &ustate, emit, emitctx); state->s0 = (state->s0 & ~0x03ffffffL) | (ustate.s0 & 0x03ffffffL); } @@ -181,6 +258,110 @@ static void docs_utf8(long int input_chr, state->s0 = (state->s0 & ~0x0c000000L) | (retstate << 26); } +struct ctext_encoding { + char const *name; + charset_spec const *subcs; +}; + +/* + * In theory, this list is in the X.Org registry, + * but XLib appears to have its own ideas, and encodes these three + * (as of X11R6.8.2) + */ + +extern charset_spec const charset_CS_ISO8859_14; +extern charset_spec const charset_CS_ISO8859_15; +extern charset_spec const charset_CS_BIG5; + +static struct ctext_encoding const ctext_encodings[] = { + { "big5-0\2", &charset_CS_BIG5 }, + { "iso8859-14\2", &charset_CS_ISO8859_14 }, + { "iso8859-15\2", &charset_CS_ISO8859_15 } +}; + +static void docs_ctext(long int input_chr, + charset_state *state, + void (*emit)(void *ctx, long int output), + void *emitctx) +{ + /* + * s0[27:26] = first entry in ctext_encodings that matches + * s0[25:22] = number of characters successfully matched, 0xf if all + * s0[21:8] count the number of octets left in the segment + * s0[7:0] are for sub-charset use + */ + int n = (state->s0 >> 22) & 0xf, i = (state->s0 >> 26) & 3, oi = i, j; + int length = (state->s0 >> 8) & 0x3fff; + + if (!length) { + /* Haven't read length yet */ + if ((state->s0 & 0xff) == 0) + /* ... 
or even the first byte */ + state->s0 |= input_chr; + else { + length = (state->s0 & 0x7f) * 0x80 + (input_chr & 0x7f); + if (length == 0) + state->s0 = 0; + else + state->s0 = (state->s0 & 0xf0000000) | (length << 8); + } + return; + } + + j = i; + if (n == 0xe) { + /* Skipping unknown encoding. Look out for STX. */ + if (input_chr == 2) + state->s0 = (state->s0 & 0xf0000000) | (i << 26) | (0xf << 22); + } else if (n != 0xf) { + while (j < lenof(ctext_encodings) && + !memcmp(ctext_encodings[j].name, + ctext_encodings[oi].name, n)) { + if (ctext_encodings[j].name[n] < input_chr) + i = ++j; + else + break; + } + if (i >= lenof(ctext_encodings) || + memcmp(ctext_encodings[i].name, + ctext_encodings[oi].name, n) || + ctext_encodings[i].name[n] != input_chr) { + /* Doom! We haven't heard of this encoding */ + i = lenof(ctext_encodings); + n = 0xe; + } else { + /* + * Otherwise, we have found an additional character in our + * encoding name. See if we have reached the _end_ of our + * name. + */ + n++; + if (!ctext_encodings[i].name[n]) + n = 0xf; + } + /* + * Failing _that_, we simply update our encoding-name- + * tracking state. + */ + assert(i < 4 && n < 16); + state->s0 = (state->s0 & 0xf0000000) | (i << 26) | (n << 22); + } else { + if (i >= lenof(ctext_encodings)) + emit(emitctx, ERROR); + else { + charset_state substate; + charset_spec const *subcs = ctext_encodings[i].subcs; + substate.s1 = 0; + substate.s0 = state->s0 & 0xff; + subcs->read(subcs, input_chr, &substate, emit, emitctx); + state->s0 = (state->s0 & ~0xff) | (substate.s0 & 0xff); + } + } + if (!--length) + state->s0 = 0; + else + state->s0 = (state->s0 &~0x003fff00) | (length << 8); +} static void read_iso2022(charset_spec const *charset, long int input_chr, charset_state *state, @@ -190,14 +371,16 @@ static void read_iso2022(charset_spec const *charset, long int input_chr, /* dump_state(state); */ /* - * We've got 64 bits of state to play with. 
- * - * Locking-shift state: 2 bits each GL/GR - * Single-shift state: 2 bits - * Charset designation state: n bits each G0/G1/G2/G3 - * MBCS/esc seq accumulation: 14 bits (assume max 4-byte sets) - * MBCS state: 2 bits (off, ESC, GL, GR) - * For no good reason, put long-term state in s1, short term in s0. + * We have to make fairly efficient use of the 64 bits of state + * available to us. Long-term state goes in s1, and consists of + * the identities of the character sets designated as G0/G1/G2/G3 + * and the locking-shift states for GL and GR. Short-term state + * goes in s0: The bottom half of s0 accumulates characters for an + * escape sequence or a multi-byte character, while the top three + * bits indicate what they're being accumulated for. After DOCS, + * the bottom 29 bits of state are available for the DOCS function + * to use -- the UTF-8 one uses the bottom 26 for UTF-8 decoding + * and the top two to recognise ESC % @. * * s0[31:29] = state enum * s0[24:0] = accumulated bytes @@ -238,6 +421,10 @@ static void read_iso2022(charset_spec const *charset, long int input_chr, docs_utf8(input_chr, state, emit, emitctx); return; } + if (MODE == DOCSCTEXT) { + docs_ctext(input_chr, state, emit, emitctx); + return; + } if ((input_chr & 0x60) == 0x00) { /* C0 or C1 control */ @@ -453,6 +640,13 @@ static void read_iso2022(charset_spec const *charset, long int input_chr, break; } break; + case '/': + switch (input_chr) { + case '1': case '2': + ENTER_MODE(DOCSCTEXT); + break; + } + break; } break; default: @@ -466,8 +660,16 @@ static void read_iso2022(charset_spec const *charset, long int input_chr, } } +static int write_iso2022(charset_spec const *charset, long int input_chr, + charset_state *state, + void (*emit)(void *ctx, long int output), + void *emitctx) +{ + return FALSE; +} + const charset_spec charset_CS_ISO2022 = { - CS_ISO2022, read_iso2022, NULL, NULL + CS_ISO2022, read_iso2022, write_iso2022, NULL }; #ifdef TESTMODE @@ -585,7 +787,7 @@ int main(void) 
iso2022_read_test(TESTSTR("\x1b$-~\x1b~\xa0\xff"), ERROR, 0, -1); /* Designate control sets */ iso2022_read_test(TESTSTR("\x1b!@"), 0x1b, '!', '@', 0, -1); - /* Designate other coding system */ + /* Designate other coding system (UTF-8) */ iso2022_read_test(TESTSTR("\x1b%G" "\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5"), 0x03BA, 0x1F79, 0x03C3, 0x03BC, 0x03B5, 0, -1); @@ -594,6 +796,22 @@ int main(void) iso2022_read_test(TESTSTR("\x1b%G\xCE\x1b%@"), ERROR, 0, -1); iso2022_read_test(TESTSTR("\x1b%G\xCE\xBA\x1b%\x1b%@"), 0x03BA, 0x1B, '%', 0, -1); + /* DOCS (COMPOUND_TEXT extended segment) */ + iso2022_read_test(TESTSTR("\x1b%/1\x80\x80"), 0, -1); + iso2022_read_test(TESTSTR("\x1b%/1\x80\x8fiso-8859-15\2xyz\x1b(B"), + ERROR, ERROR, ERROR, 0, -1); + iso2022_read_test(TESTSTR("\x1b%/1\x80\x8eiso8859-15\2xyz\x1b(B"), + 'x', 'y', 'z', 0, -1); + iso2022_read_test(TESTSTR("\x1b-A\x1b%/2\x80\x89" + "big5-0\2\xa1\x40\xa1\x40"), + 0x3000, 0xa1, 0x40, 0, -1); + /* Emacs Big5-in-ISO-2022 mapping */ + iso2022_read_test(TESTSTR("\x1b$(0&x86\x1b(B \x1b$(0DeBv"), + 0x5143, 0x6c23, ' ', ' ', 0x958b, 0x767c, 0, -1); + /* Test from RFC 1922 (ISO-2022-CN) */ + iso2022_read_test(TESTSTR("\x1b$)A\x0e=;;;\x1b$)GG(_P\x0f"), + 0x4EA4, 0x6362, 0x4EA4, 0x63db, 0, -1); + printf("read tests completed\n"); printf("total: %d errors\n", total_errs); return (total_errs != 0);