enum {S4, S6, M4, M6};
+static long int emacs_big5_1_to_unicode(int, int); /* forward declarations: these wrappers are defined later in the file and referenced from iso2022_subcharsets[] */
+static long int emacs_big5_2_to_unicode(int, int);
+static long int cns11643_1_to_unicode(int, int);
+static long int cns11643_2_to_unicode(int, int);
+static long int cns11643_3_to_unicode(int, int);
+static long int cns11643_4_to_unicode(int, int);
+static long int cns11643_5_to_unicode(int, int);
+static long int cns11643_6_to_unicode(int, int);
+static long int cns11643_7_to_unicode(int, int);
static long int null_dbcs_to_unicode(int, int);
const struct iso2022_subcharset {
const sbcs_data *sbcs_base;
long int (*dbcs_fn)(int, int);
} iso2022_subcharsets[] = {
- { S4, 0, 'B', 0x00, &sbcsdata_CS_ASCII },
-
+ { S4, 0, '0', 0x00, &sbcsdata_CS_DEC_GRAPHICS },
{ S4, 0, '<', 0x80, &sbcsdata_CS_DEC_MCS },
+ { S4, 0, 'A', 0x00, &sbcsdata_CS_BS4730 },
+ { S4, 0, 'B', 0x00, &sbcsdata_CS_ASCII },
{ S4, 0, 'I', 0x80, &sbcsdata_CS_JISX0201 },
{ S4, 0, 'J', 0x00, &sbcsdata_CS_JISX0201 },
{ S4, 0, '~' },
#if 0
{ M4, 0, '@' }, /* JIS C 6226-1978 */
#endif
+ { M4, 0, '0', -0x21, 0, &emacs_big5_1_to_unicode },
+ { M4, 0, '1', -0x21, 0, &emacs_big5_2_to_unicode },
{ M4, 0, 'A', -0x21, 0, &gb2312_to_unicode },
{ M4, 0, 'B', -0x21, 0, &jisx0208_to_unicode },
{ M4, 0, 'C', -0x21, 0, &ksx1001_to_unicode },
{ M4, 0, 'D', -0x21, 0, &jisx0212_to_unicode },
+ { M4, 0, 'G', -0x21, 0, &cns11643_1_to_unicode },
+ { M4, 0, 'H', -0x21, 0, &cns11643_2_to_unicode },
+ { M4, 0, 'I', -0x21, 0, &cns11643_3_to_unicode },
+ { M4, 0, 'J', -0x21, 0, &cns11643_4_to_unicode },
+ { M4, 0, 'K', -0x21, 0, &cns11643_5_to_unicode },
+ { M4, 0, 'L', -0x21, 0, &cns11643_6_to_unicode },
+ { M4, 0, 'M', -0x21, 0, &cns11643_7_to_unicode },
{ M4, 0, '~', 0, 0, &null_dbcs_to_unicode }, /* empty 94^n-set */
{ M6, 0, '~', 0, 0, &null_dbcs_to_unicode }, /* empty 96^n-set */
};
return ERROR;
}
+/*
+ * Emacs encodes Big5 in COMPOUND_TEXT as two 94x94 character sets.
+ * We treat Big5 as a 94x191 character set with a bunch of undefined
+ * columns in the middle, so we have to mess around a bit to make
+ * things fit.
+ */
+
+/*
+ * Look up a cell of the first Emacs 94x94 Big5 set.
+ *
+ * r and c index the cell within the 94x94 set.  The cell's linear
+ * index is re-split into rows of 157 used columns, columns beyond
+ * the gap are moved up by 34, and the resulting position in the
+ * 94x191 layout is handed to big5_to_unicode(), whose result is
+ * returned unchanged.
+ */
+static long int emacs_big5_1_to_unicode(int r, int c)
+{
+    unsigned long s;
+    s = r * 94 + c;
+    r = s / 157;
+    c = s % 157;
+    /*
+     * Each row has 63 columns (0..62, Big5 trail bytes 0x40-0x7E)
+     * before the gap, so packed column 63 is the first one that
+     * must be shifted past it.
+     */
+    if (c >= 63) c += 34;              /* Skip over the gap */
+    return big5_to_unicode(r, c);
+}
+
+/*
+ * Look up a cell of the second Emacs 94x94 Big5 set.
+ *
+ * r and c index the cell within the 94x94 set.  The cell's linear
+ * index is re-split into rows of 157 used columns, offset by 40
+ * rows; columns beyond the gap are moved up by 34, and the
+ * resulting position in the 94x191 layout is handed to
+ * big5_to_unicode(), whose result is returned unchanged.
+ */
+static long int emacs_big5_2_to_unicode(int r, int c)
+{
+    unsigned long s;
+    s = r * 94 + c;
+    r = s / 157 + 40;
+    c = s % 157;
+    /*
+     * Each row has 63 columns (0..62, Big5 trail bytes 0x40-0x7E)
+     * before the gap, so packed column 63 is the first one that
+     * must be shifted past it.
+     */
+    if (c >= 63) c += 34;              /* Skip over the gap */
+    return big5_to_unicode(r, c);
+}
+
+/* Wrappers for cns11643_to_unicode() */
+/* Adapter with the two-argument DBCS signature: passes 0 as the first argument (presumably the plane index) of cns11643_to_unicode(). */
+static long int cns11643_1_to_unicode(int row, int col)
+{
+    const int plane = 0;
+
+    return cns11643_to_unicode(plane, row, col);
+}
+/* Adapter with the two-argument DBCS signature: passes 1 as the first argument (presumably the plane index) of cns11643_to_unicode(). */
+static long int cns11643_2_to_unicode(int row, int col)
+{
+    const int plane = 1;
+
+    return cns11643_to_unicode(plane, row, col);
+}
+/* Adapter with the two-argument DBCS signature: passes 2 as the first argument (presumably the plane index) of cns11643_to_unicode(). */
+static long int cns11643_3_to_unicode(int row, int col)
+{
+    const int plane = 2;
+
+    return cns11643_to_unicode(plane, row, col);
+}
+/* Adapter with the two-argument DBCS signature: passes 3 as the first argument (presumably the plane index) of cns11643_to_unicode(). */
+static long int cns11643_4_to_unicode(int row, int col)
+{
+    const int plane = 3;
+
+    return cns11643_to_unicode(plane, row, col);
+}
+/* Adapter with the two-argument DBCS signature: passes 4 as the first argument (presumably the plane index) of cns11643_to_unicode(). */
+static long int cns11643_5_to_unicode(int row, int col)
+{
+    const int plane = 4;
+
+    return cns11643_to_unicode(plane, row, col);
+}
+/* Adapter with the two-argument DBCS signature: passes 5 as the first argument (presumably the plane index) of cns11643_to_unicode(). */
+static long int cns11643_6_to_unicode(int row, int col)
+{
+    const int plane = 5;
+
+    return cns11643_to_unicode(plane, row, col);
+}
+/* Adapter with the two-argument DBCS signature: passes 6 as the first argument (presumably the plane index) of cns11643_to_unicode(). */
+static long int cns11643_7_to_unicode(int row, int col)
+{
+    const int plane = 6;
+
+    return cns11643_to_unicode(plane, row, col);
+}
+
/* States, or "what we're currently accumulating". */
enum {
IDLE, /* None of the below */
ESCSEQ, /* Accumulating an escape sequence */
ESCDROP, /* Discarding an escape sequence */
ESCPASS, /* Passing through an escape sequence */
- DOCSUTF8 /* DOCSed into UTF-8 */
+ DOCSUTF8, /* DOCSed into UTF-8 */
+ DOCSCTEXT /* DOCSed into a COMPOUND_TEXT extended segment (entered on ESC % / 1 or ESC % / 2) */
};
-#if 1
+#if 0
#include <stdio.h>
static void dump_state(charset_state *s)
{
void *emitctx)
{
charset_state ustate;
- charset_spec const *utf8;
ustate.s1 = 0;
ustate.s0 = state->s0 & 0x03ffffffL;
state->s0 = (state->s0 & ~0x0c000000L) | (retstate << 26);
}
+struct ctext_encoding { /* one extended-segment encoding that docs_ctext() can decode */
+ char const *name; /* encoding name as matched octet by octet, including its terminating STX ('\2') */
+ charset_spec const *subcs; /* charset whose read() is called for each data octet of the segment */
+};
+
+/*
+ * In theory, the list of encoding names is in
+ * <http://ftp.x.org/pub/docs/registry>, but Xlib appears to have its
+ * own ideas, and uses these three encoding names (as of X11R6.8.2).
+ */
+
+extern charset_spec const charset_CS_ISO8859_14; /* charsets declared here so that ctext_encodings[] can point at them */
+extern charset_spec const charset_CS_ISO8859_15;
+extern charset_spec const charset_CS_BIG5;
+
+static struct ctext_encoding const ctext_encodings[] = { /*
+ * Table of extended-segment encodings searched by docs_ctext().
+ * docs_ctext() steps forward through the table while an entry's
+ * next character is less than the input octet, so it relies on the
+ * entries being in ascending order of name.  It keeps the entry
+ * index in two bits of state (it asserts i < 4) and the matched
+ * length in four bits with 0xe and 0xf reserved, so the table must
+ * stay at three entries or fewer and names must stay short (at
+ * most 14 octets including the STX, by my reading -- TODO confirm).
+ */
+ { "big5-0\2", &charset_CS_BIG5 },
+ { "iso8859-14\2", &charset_CS_ISO8859_14 },
+ { "iso8859-15\2", &charset_CS_ISO8859_15 }
+};
+
+/*
+ * Process one octet of a COMPOUND_TEXT extended segment.
+ *
+ * Octets are consumed in three phases: the two length octets, the
+ * encoding name (matched incrementally against ctext_encodings[],
+ * up to and including its terminating STX), and then the segment
+ * data, each octet of which is passed to the read function of the
+ * matching entry's sub-charset.  Data belonging to a segment whose
+ * encoding name was not recognised is reported as one ERROR per
+ * octet.  Once the octet count taken from the length field runs
+ * out, state->s0 is reset to zero.
+ *
+ * input_chr: the octet to process.
+ * state:     decoder state; only s0 is read and written here.
+ * emit:      callback receiving each decoded value (or ERROR).
+ * emitctx:   opaque context passed through to emit.
+ */
+static void docs_ctext(long int input_chr,
+                       charset_state *state,
+                       void (*emit)(void *ctx, long int output),
+                       void *emitctx)
+{
+    /*
+     * s0[27:26] = first entry in ctext_encodings that matches
+     * s0[25:22] = number of characters successfully matched, 0xf if all
+     *             (0xe means: unknown name, skipping up to its STX)
+     * s0[21:8] count the number of octets left in the segment
+     * s0[7:0] are for sub-charset use
+     *           (before the length is known, they hold the first
+     *           length octet)
+     */
+    int n = (state->s0 >> 22) & 0xf, i = (state->s0 >> 26) & 3, oi = i, j;
+    int length = (state->s0 >> 8) & 0x3fff;
+
+    if (!length) {
+        /* Haven't read length yet */
+        if ((state->s0 & 0xff) == 0)
+            /* ... or even the first byte */
+            state->s0 |= input_chr;
+        else {
+            /* Two length octets, seven significant bits each. */
+            length = (state->s0 & 0x7f) * 0x80 + (input_chr & 0x7f);
+            if (length == 0)
+                state->s0 = 0;
+            else
+                state->s0 = (state->s0 & 0xf0000000) | (length << 8);
+        }
+        return;
+    }
+
+    j = i;
+    if (n == 0xe) {
+        /* Skipping unknown encoding.  Look out for STX. */
+        if (input_chr == 2)
+            state->s0 = (state->s0 & 0xf0000000) | (i << 26) | (0xf << 22);
+    } else if (n != 0xf) {
+        /*
+         * Step forward through the entries that share the first n
+         * characters with the current candidate, for as long as
+         * their next character is less than the input octet.
+         */
+        while (j < lenof(ctext_encodings) &&
+               !memcmp(ctext_encodings[j].name,
+                       ctext_encodings[oi].name, n)) {
+            if (ctext_encodings[j].name[n] < input_chr)
+                i = ++j;
+            else
+                break;
+        }
+        if (i >= lenof(ctext_encodings) ||
+            memcmp(ctext_encodings[i].name,
+                   ctext_encodings[oi].name, n) ||
+            ctext_encodings[i].name[n] != input_chr) {
+            /* Doom!  We haven't heard of this encoding */
+            i = lenof(ctext_encodings);
+            /*
+             * If the octet that failed to match is itself the STX
+             * ending the name, there is no later STX to wait for:
+             * go straight to the data phase, so that the data is
+             * reported as ERRORs rather than silently swallowed.
+             */
+            n = (input_chr == 2) ? 0xf : 0xe;
+        } else {
+            /*
+             * Otherwise, we have found an additional character in our
+             * encoding name.  See if we have reached the _end_ of our
+             * name.
+             */
+            n++;
+            if (!ctext_encodings[i].name[n])
+                n = 0xf;
+        }
+        /*
+         * Failing _that_, we simply update our encoding-name-
+         * tracking state.
+         */
+        assert(i < 4 && n < 16);
+        state->s0 = (state->s0 & 0xf0000000) | (i << 26) | (n << 22);
+    } else {
+        if (i >= lenof(ctext_encodings))
+            emit(emitctx, ERROR);
+        else {
+            /*
+             * Run the sub-charset on a private state whose s0 is
+             * our low eight bits, then store those bits back.
+             */
+            charset_state substate;
+            charset_spec const *subcs = ctext_encodings[i].subcs;
+            substate.s1 = 0;
+            substate.s0 = state->s0 & 0xff;
+            subcs->read(subcs, input_chr, &substate, emit, emitctx);
+            state->s0 = (state->s0 & ~0xff) | (substate.s0 & 0xff);
+        }
+    }
+    /* One more octet of the segment consumed; leave when none remain. */
+    if (!--length)
+        state->s0 = 0;
+    else
+        state->s0 = (state->s0 &~0x003fff00) | (length << 8);
+}
static void read_iso2022(charset_spec const *charset, long int input_chr,
charset_state *state,
docs_utf8(input_chr, state, emit, emitctx);
return;
}
+ if (MODE == DOCSCTEXT) {
+ docs_ctext(input_chr, state, emit, emitctx);
+ return;
+ }
if ((input_chr & 0x60) == 0x00) {
/* C0 or C1 control */
break;
}
break;
+ case '/':
+ switch (input_chr) {
+ case '1': case '2':
+ ENTER_MODE(DOCSCTEXT);
+ break;
+ }
+ break;
}
break;
default:
iso2022_read_test(TESTSTR("\x1b$-~\x1b~\xa0\xff"), ERROR, 0, -1);
/* Designate control sets */
iso2022_read_test(TESTSTR("\x1b!@"), 0x1b, '!', '@', 0, -1);
- /* Designate other coding system */
+ /* Designate other coding system (UTF-8) */
iso2022_read_test(TESTSTR("\x1b%G"
"\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5"),
0x03BA, 0x1F79, 0x03C3, 0x03BC, 0x03B5, 0, -1);
iso2022_read_test(TESTSTR("\x1b%G\xCE\x1b%@"), ERROR, 0, -1);
iso2022_read_test(TESTSTR("\x1b%G\xCE\xBA\x1b%\x1b%@"),
0x03BA, 0x1B, '%', 0, -1);
+ /* DOCS (COMPOUND_TEXT extended segment) */
+ iso2022_read_test(TESTSTR("\x1b%/1\x80\x80"), 0, -1);
+ iso2022_read_test(TESTSTR("\x1b%/1\x80\x8fiso-8859-15\2xyz\x1b(B"),
+ ERROR, ERROR, ERROR, 0, -1);
+ iso2022_read_test(TESTSTR("\x1b%/1\x80\x8eiso8859-15\2xyz\x1b(B"),
+ 'x', 'y', 'z', 0, -1);
+ iso2022_read_test(TESTSTR("\x1b-A\x1b%/2\x80\x89"
+ "big5-0\2\xa1\x40\xa1\x40"),
+ 0x3000, 0xa1, 0x40, 0, -1);
+ /* Emacs Big5-in-ISO-2022 mapping */
+ iso2022_read_test(TESTSTR("\x1b$(0&x86\x1b(B \x1b$(0DeBv"),
+ 0x5143, 0x6c23, ' ', ' ', 0x958b, 0x767c, 0, -1);
+ /* Test from RFC 1922 (ISO-2022-CN) */
+ iso2022_read_test(TESTSTR("\x1b$)A\x0e=;;;\x1b$)GG(_P\x0f"),
+ 0x4EA4, 0x6362, 0x4EA4, 0x63db, 0, -1);
+
printf("read tests completed\n");
printf("total: %d errors\n", total_errs);
return (total_errs != 0);