X-Git-Url: https://git.distorted.org.uk/~mdw/sgt/charset/blobdiff_plain/a89fe3cf498bb23b385b0fc1a7b229035c7eb8b3..113375ca1cf0896e3a20a207eac1c2c404cbd4ae:/iso2022.c

diff --git a/iso2022.c b/iso2022.c
index e88f9e7..a570860 100644
--- a/iso2022.c
+++ b/iso2022.c
@@ -15,7 +15,8 @@
  * sets are passed through, so a post-processor could fix them up if
  * necessary.
  *
- * DOCS is not currently supported.  It will be one day.
+ * DOCS to UTF-8 and to COMPOUND_TEXT extended segments works.  Other
+ * DOCS sequences are ignored, which will produce surprising results.
  */
 
 #ifndef ENUM_CHARSETS
@@ -34,6 +35,15 @@
 
 enum {S4, S6, M4, M6};
 
+static long int emacs_big5_1_to_unicode(int, int);
+static long int emacs_big5_2_to_unicode(int, int);
+static long int cns11643_1_to_unicode(int, int);
+static long int cns11643_2_to_unicode(int, int);
+static long int cns11643_3_to_unicode(int, int);
+static long int cns11643_4_to_unicode(int, int);
+static long int cns11643_5_to_unicode(int, int);
+static long int cns11643_6_to_unicode(int, int);
+static long int cns11643_7_to_unicode(int, int);
 static long int null_dbcs_to_unicode(int, int);
 
 const struct iso2022_subcharset {
@@ -42,9 +52,10 @@ const struct iso2022_subcharset {
     const sbcs_data *sbcs_base;
     long int (*dbcs_fn)(int, int);
 } iso2022_subcharsets[] = {
-    { S4, 0, 'B', 0x00, &sbcsdata_CS_ASCII },
-
+    { S4, 0, '0', 0x00, &sbcsdata_CS_DEC_GRAPHICS },
     { S4, 0, '<', 0x80, &sbcsdata_CS_DEC_MCS },
+    { S4, 0, 'A', 0x00, &sbcsdata_CS_BS4730 },
+    { S4, 0, 'B', 0x00, &sbcsdata_CS_ASCII },
     { S4, 0, 'I', 0x80, &sbcsdata_CS_JISX0201 },
     { S4, 0, 'J', 0x00, &sbcsdata_CS_JISX0201 },
     { S4, 0, '~' },
@@ -67,10 +78,19 @@ const struct iso2022_subcharset {
 #if 0
     { M4, 0, '@' }, /* JIS C 6226-1978 */
 #endif
+    { M4, 0, '0', -0x21, 0, &emacs_big5_1_to_unicode },
+    { M4, 0, '1', -0x21, 0, &emacs_big5_2_to_unicode },
     { M4, 0, 'A', -0x21, 0, &gb2312_to_unicode },
     { M4, 0, 'B', -0x21, 0, &jisx0208_to_unicode },
     { M4, 0, 'C', -0x21, 0, &ksx1001_to_unicode },
     { M4, 0, 'D', -0x21, 0, &jisx0212_to_unicode },
+    { M4, 0, 'G', -0x21, 0, &cns11643_1_to_unicode },
+    { M4, 0, 'H', -0x21, 0, &cns11643_2_to_unicode },
+    { M4, 0, 'I', -0x21, 0, &cns11643_3_to_unicode },
+    { M4, 0, 'J', -0x21, 0, &cns11643_4_to_unicode },
+    { M4, 0, 'K', -0x21, 0, &cns11643_5_to_unicode },
+    { M4, 0, 'L', -0x21, 0, &cns11643_6_to_unicode },
+    { M4, 0, 'M', -0x21, 0, &cns11643_7_to_unicode },
     { M4, 0, '~', 0, 0, &null_dbcs_to_unicode }, /* empty 94^n-set */
     { M6, 0, '~', 0, 0, &null_dbcs_to_unicode }, /* empty 96^n-set */
 };
@@ -80,6 +100,63 @@ static long int null_dbcs_to_unicode(int r, int c)
     return ERROR;
 }
 
+/*
+ * Emacs encodes Big5 in COMPOUND_TEXT as two 94x94 character sets.
+ * We treat Big5 as a 94x191 character set with a bunch of undefined
+ * columns in the middle, so we have to mess around a bit to make
+ * things fit.
+ */
+
+static long int emacs_big5_1_to_unicode(int r, int c)
+{
+    unsigned long s;
+    s = r * 94 + c;		/* linear position within the 94x94 set */
+    r = s / 157;		/* 157 = 63 + 94 defined columns per Big5 row */
+    c = s % 157;
+    if (c >= 63) c += 34; /* Skip over the gap; only columns 0-62 precede it */
+    return big5_to_unicode(r, c);
+}
+
+static long int emacs_big5_2_to_unicode(int r, int c)
+{
+    unsigned long s;
+    s = r * 94 + c;		/* linear position within the 94x94 set */
+    r = s / 157 + 40;		/* this set continues 40 Big5 rows further on */
+    c = s % 157;
+    if (c >= 63) c += 34; /* Skip over the gap; only columns 0-62 precede it */
+    return big5_to_unicode(r, c);
+}
+
+/* Two-argument wrappers for cns11643_to_unicode(): the _N wrapper passes N-1 first */
+static long int cns11643_1_to_unicode(int r, int c)
+{
+    return cns11643_to_unicode(0, r, c);
+}
+static long int cns11643_2_to_unicode(int r, int c)
+{
+    return cns11643_to_unicode(1, r, c);
+}
+static long int cns11643_3_to_unicode(int r, int c)
+{
+    return cns11643_to_unicode(2, r, c);
+}
+static long int cns11643_4_to_unicode(int r, int c)
+{
+    return cns11643_to_unicode(3, r, c);
+}
+static long int cns11643_5_to_unicode(int r, int c)
+{
+    return cns11643_to_unicode(4, r, c);
+}
+static long int cns11643_6_to_unicode(int r, int c)
+{
+    return cns11643_to_unicode(5, r, c);
+}
+static long int cns11643_7_to_unicode(int r, int c)
+{
+    return cns11643_to_unicode(6, r, c);
+}
+
 /* States, or "what we're currently accumulating". */
 enum {
     IDLE,	/* None of the below */
@@ -88,7 +165,8 @@ enum {
     ESCSEQ,	/* Accumulating an escape sequence */
     ESCDROP,	/* Discarding an escape sequence */
     ESCPASS,	/* Passing through an escape sequence */
-    DOCSUTF8	/* DOCSed into UTF-8 */
+    DOCSUTF8,	/* DOCSed into UTF-8 */
+    DOCSCTEXT	/* DOCSed into a COMPOUND_TEXT extended segment */
 };
 
 #if 1
@@ -143,8 +221,7 @@ static void do_utf8(long int input_chr,
 
     ustate.s1 = 0;
     ustate.s0 = state->s0 & 0x03ffffffL;
-    utf8 = charset_find_spec(CS_UTF8);
-    utf8->read(utf8, input_chr, &ustate, emit, emitctx);
+    read_utf8(NULL, input_chr, &ustate, emit, emitctx);
     state->s0 = (state->s0 & ~0x03ffffffL) | (ustate.s0 & 0x03ffffffL);
 }
 
@@ -181,6 +258,110 @@ static void docs_utf8(long int input_chr,
     state->s0 = (state->s0 & ~0x0c000000L) | (retstate << 26);
 }
 
+struct ctext_encoding {
+    char const *name;		/* encoding name to match, STX terminator included */
+    charset_spec const *subcs;	/* charset whose read function decodes the data */
+};
+
+/*
+ * In theory, this list is in <http://ftp.x.org/pub/docs/registry>,
+ * but Xlib appears to have its own ideas, and encodes these three
+ * (as of X11R6.8.2).  Keep sorted by name: docs_ctext() only scans forwards.
+ */
+
+extern charset_spec const charset_CS_ISO8859_14;
+extern charset_spec const charset_CS_ISO8859_15;
+extern charset_spec const charset_CS_BIG5;
+
+static struct ctext_encoding const ctext_encodings[] = { /* each name ends in STX (\2) */
+    { "big5-0\2", &charset_CS_BIG5 },
+    { "iso8859-14\2", &charset_CS_ISO8859_14 },
+    { "iso8859-15\2", &charset_CS_ISO8859_15 }
+};
+
+static void docs_ctext(long int input_chr,
+		       charset_state *state,
+		       void (*emit)(void *ctx, long int output),
+		       void *emitctx)
+{
+    /* Handle one octet of a COMPOUND_TEXT extended segment: two length
+     * octets, an encoding name ending in STX, then data for that encoding.
+     * s0[27:26] = first entry in ctext_encodings that matches
+     * s0[25:22] = name chars matched; 0xe = unknown name (skip to STX), 0xf = all
+     * s0[21:8] = octets left in the segment; s0[7:0] = sub-charset state
+     */
+    int n = (state->s0 >> 22) & 0xf, i = (state->s0 >> 26) & 3, oi = i, j;
+    int length = (state->s0 >> 8) & 0x3fff;
+
+    if (!length) {
+	/* Haven't read length yet */
+	if ((state->s0 & 0xff) == 0)
+	    /* ... or even the first byte */
+	    state->s0 |= input_chr;
+	else {
+	    length = (state->s0 & 0x7f) * 0x80 + (input_chr & 0x7f);
+	    if (length == 0)
+		state->s0 = 0; /* empty segment: leave DOCS mode at once */
+	    else
+		state->s0 = (state->s0 & 0xf0000000) | (length << 8);
+	}
+	return;
+    }
+
+    j = i;
+    if (n == 0xe) {
+	/* Skipping unknown encoding.  Look out for STX. */
+	if (input_chr == 2)
+	    state->s0 = (state->s0 & 0xf0000000) | (i << 26) | (0xf << 22);
+    } else if (n != 0xf) {
+	while (j < lenof(ctext_encodings) &&
+	       !memcmp(ctext_encodings[j].name,
+		       ctext_encodings[oi].name, n)) {
+	    if (ctext_encodings[j].name[n] < input_chr)
+		i = ++j;
+	    else
+		break;
+	}
+	if (i >= lenof(ctext_encodings) ||
+	    memcmp(ctext_encodings[i].name,
+		   ctext_encodings[oi].name, n) ||
+	    ctext_encodings[i].name[n] != input_chr) {
+	    /* Doom!  We haven't heard of this encoding */
+	    i = lenof(ctext_encodings);
+	    n = (input_chr == 2) ? 0xf : 0xe; /* a mismatching STX already ends the name */
+	} else {
+	    /*
+	     * Otherwise, we have found an additional character in our
+	     * encoding name. See if we have reached the _end_ of our
+	     * name.
+	     */
+	    n++;
+	    if (!ctext_encodings[i].name[n])
+		n = 0xf;
+	}
+	/*
+	 * Failing _that_, we simply update our encoding-name-
+	 * tracking state.
+	 */
+	assert(i < 4 && n < 16);
+	state->s0 = (state->s0 & 0xf0000000) | (i << 26) | (n << 22);
+    } else {
+	if (i >= lenof(ctext_encodings))
+	    emit(emitctx, ERROR); /* data octet of an unknown encoding */
+	else {
+	    charset_state substate;
+	    charset_spec const *subcs = ctext_encodings[i].subcs;
+	    substate.s1 = 0;
+	    substate.s0 = state->s0 & 0xff;
+	    subcs->read(subcs, input_chr, &substate, emit, emitctx);
+	    state->s0 = (state->s0 & ~0xff) | (substate.s0 & 0xff);
+	}
+    }
+    if (!--length)
+	state->s0 = 0; /* segment finished: clear all of s0, mode bits included */
+    else
+	state->s0 = (state->s0 &~0x003fff00) | (length << 8);
+}
 
 static void read_iso2022(charset_spec const *charset, long int input_chr,
 			  charset_state *state,
@@ -190,14 +371,16 @@ static void read_iso2022(charset_spec const *charset, long int input_chr,
 
     /* dump_state(state); */
     /*
-     * We've got 64 bits of state to play with.
-     *
-     * Locking-shift state: 2 bits each GL/GR
-     * Single-shift state: 2 bits
-     * Charset designation state: n bits each G0/G1/G2/G3
-     * MBCS/esc seq accumulation: 14 bits (assume max 4-byte sets)
-     * MBCS state: 2 bits (off, ESC, GL, GR)
-     * For no good reason, put long-term state in s1, short term in s0.
+     * We have to make fairly efficient use of the 64 bits of state
+     * available to us.  Long-term state goes in s1, and consists of
+     * the identities of the character sets designated as G0/G1/G2/G3
+     * and the locking-shift states for GL and GR.  Short-term state
+     * goes in s0: The bottom half of s0 accumulates characters for an
+     * escape sequence or a multi-byte character, while the top three
+     * bits indicate what they're being accumulated for.  After DOCS,
+     * the bottom 29 bits of state are available for the DOCS function
+     * to use -- the UTF-8 one uses the bottom 26 for UTF-8 decoding
+     * and the top two to recognise ESC % @.
      *
      * s0[31:29] = state enum
      * s0[24:0] = accumulated bytes
@@ -238,6 +421,10 @@ static void read_iso2022(charset_spec const *charset, long int input_chr,
 	docs_utf8(input_chr, state, emit, emitctx);
 	return;
     }
+    if (MODE == DOCSCTEXT) {
+	docs_ctext(input_chr, state, emit, emitctx);
+	return;
+    }
 
     if ((input_chr & 0x60) == 0x00) {
 	/* C0 or C1 control */
@@ -453,6 +640,13 @@ static void read_iso2022(charset_spec const *charset, long int input_chr,
 		    break;
 		}
 		break;
+	      case '/':
+		switch (input_chr) {
+		  case '1': case '2':
+		    ENTER_MODE(DOCSCTEXT);
+		    break;
+		}
+		break;
 	    }
 	    break;
 	  default:
@@ -466,8 +660,16 @@ static void read_iso2022(charset_spec const *charset, long int input_chr,
     }
 }
 
+static int write_iso2022(charset_spec const *charset, long int input_chr,
+			 charset_state *state,
+			 void (*emit)(void *ctx, long int output),
+			 void *emitctx)
+{
+    return FALSE; /* stub: emits nothing; NOTE(review): presumably FALSE means "cannot encode" - confirm */
+}
+
 const charset_spec charset_CS_ISO2022 = {
-    CS_ISO2022, read_iso2022, NULL, NULL
+    CS_ISO2022, read_iso2022, write_iso2022, NULL
 };
 
 #ifdef TESTMODE
@@ -585,7 +787,7 @@ int main(void)
     iso2022_read_test(TESTSTR("\x1b$-~\x1b~\xa0\xff"), ERROR, 0, -1);
     /* Designate control sets */
     iso2022_read_test(TESTSTR("\x1b!@"), 0x1b, '!', '@', 0, -1);
-    /* Designate other coding system */
+    /* Designate other coding system (UTF-8) */
     iso2022_read_test(TESTSTR("\x1b%G"
 			      "\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5"),
 		      0x03BA, 0x1F79, 0x03C3, 0x03BC, 0x03B5, 0, -1);
@@ -594,6 +796,22 @@ int main(void)
     iso2022_read_test(TESTSTR("\x1b%G\xCE\x1b%@"), ERROR, 0, -1);
     iso2022_read_test(TESTSTR("\x1b%G\xCE\xBA\x1b%\x1b%@"),
 		      0x03BA, 0x1B, '%', 0, -1);
+    /* DOCS (COMPOUND_TEXT extended segment) */
+    iso2022_read_test(TESTSTR("\x1b%/1\x80\x80"), 0, -1);
+    iso2022_read_test(TESTSTR("\x1b%/1\x80\x8fiso-8859-15\2xyz\x1b(B"),
+		      ERROR, ERROR, ERROR, 0, -1);
+    iso2022_read_test(TESTSTR("\x1b%/1\x80\x8eiso8859-15\2xyz\x1b(B"),
+		      'x', 'y', 'z', 0, -1);
+    iso2022_read_test(TESTSTR("\x1b-A\x1b%/2\x80\x89"
+			      "big5-0\2\xa1\x40\xa1\x40"),
+		      0x3000, 0xa1, 0x40, 0, -1);
+    /* Emacs Big5-in-ISO-2022 mapping */
+    iso2022_read_test(TESTSTR("\x1b$(0&x86\x1b(B  \x1b$(0DeBv"),
+		      0x5143, 0x6c23, ' ', ' ', 0x958b, 0x767c, 0, -1);
+    /* Test from RFC 1922 (ISO-2022-CN) */
+    iso2022_read_test(TESTSTR("\x1b$)A\x0e=;;;\x1b$)GG(_P\x0f"),
+		      0x4EA4, 0x6362, 0x4EA4, 0x63db, 0, -1);
+    
     printf("read tests completed\n");
     printf("total: %d errors\n", total_errs);
     return (total_errs != 0);