X-Git-Url: https://git.distorted.org.uk/~mdw/sgt/charset/blobdiff_plain/3cca0edf4d9c3afe0c51c4b5bb59f99c743f7067..b063a840d1dded0455a70fc3e71ef8f92e8644ab:/iso2022s.c

diff --git a/iso2022s.c b/iso2022s.c
index e202207..e48c885 100644
--- a/iso2022s.c
+++ b/iso2022s.c
@@ -1,9 +1,5 @@
 /*
  * iso2022s.c - support for ISO-2022 subset encodings.
- * 
- * (The `s' suffix on the filename is there to leave `iso2022.c'
- * free for the unlikely event that I ever attempt to implement
- * _full_ ISO-2022 in this library!)
  */
 
 #ifndef ENUM_CHARSETS
@@ -14,6 +10,7 @@
 
 #include "charset.h"
 #include "internal.h"
+#include "sbcsdat.h"
 
 #define SO (0x0E)
 #define SI (0x0F)
@@ -26,9 +23,14 @@ struct iso2022_escape {
     /*
      * For output, these variables help us figure out which escape
      * sequences we need to get where we want to be.
+     * 
+     * `container' should be in the range 0-3, but can also be ORed
+     * with the bit flag RO to indicate that this is not a
+     * preferred container to use for this charset during output.
      */
     int container, subcharset;
 };
+#define RO 0x80
 
 struct iso2022 {
     /*
@@ -36,7 +38,7 @@ struct iso2022 {
      * in ASCII order, so that we can narrow down the list as
      * necessary.
      */
-    struct iso2022_escape *escapes;    /* must be sorted in ASCII order! */
+    const struct iso2022_escape *escapes;/* must be sorted in ASCII order! */
     int nescapes;
 
     /*
@@ -74,6 +76,11 @@ struct iso2022 {
     char const *initial_sequence;
 
     /*
+     * Is this an 8-bit ISO 2022 subset?
+     */
+    int eightbit;
+
+    /*
      * Function calls to do the actual translation.
      */
     long int (*to_ucs)(int subcharset, unsigned long bytes);
@@ -115,6 +122,11 @@ static void read_iso2022s(charset_spec const *charset, long int input_chr,
      * 	     either 2 or 3.
      * 	   + Hence: 0 is SI, 1 is SO, 4 is SS2-from-SI, 5 is
      * 	     SS2-from-SO, 6 is SS3-from-SI, 7 is SS3-from-SO.
+     * 	   + For added fun: in an _8-bit_ ISO 2022 subset, we have
+     * 	     the further special value 2, which means that we're
+     * 	     theoretically in SI but the current character being
+     * 	     accumulated is composed of 8-bit characters and will
+     * 	     therefore be interpreted as if in SO.
      * 
      * 	- The next nibble of s1 (27:24) indicates how many bytes
      * 	  have been accumulated in the current character.
@@ -214,9 +226,12 @@ static void read_iso2022s(charset_spec const *charset, long int input_chr,
     /*
      * If this isn't an escape sequence, it must be part of a
      * character. One possibility is that it's a control character
-     * (outside the space 21-7E), in which case we output it verbatim.
+     * (00-20 or 7F-9F; also in non-8-bit ISO 2022 subsets I'm
+     * going to treat all top-half characters as controls), in
+     * which case we output it verbatim.
      */
-    if (input_chr < 0x21 || input_chr > 0x7E) {
+    if (input_chr < 0x21 ||
+	(input_chr > 0x7E && (!iso->eightbit || input_chr < 0xA0))) {
 	/*
 	 * We might be in mid-multibyte-character. If so, clear the
 	 * character state and emit an error token for the
@@ -245,8 +260,45 @@ static void read_iso2022s(charset_spec const *charset, long int input_chr,
 	unsigned long chr;
 	int chrlen, cont, subcharset, bytes;
 
+	/*
+	 * Verify that we've seen the right kind of character for
+	 * what we're currently doing. This only matters in 8-bit
+	 * subsets.
+	 */
+	if (iso->eightbit) {
+	    cont = (state->s1 >> 28) & 7;
+	    /*
+	     * If cont==0, we're entitled to see either GL or GR
+	     * characters. If cont==2, we expect only GR; otherwise
+	     * we expect only GL.
+	     * 
+	     * If we see a GR character while cont==0, we set
+	     * cont=2 immediately.
+	     */
+	    if ((cont == 2 && !(input_chr & 0x80)) ||
+		(cont != 0 && cont != 2 && (input_chr & 0x80))) {
+		/*
+		 * Clear the previous character; it was prematurely
+		 * terminated by this error.
+		 */
+		state->s1 &= ~0x0F000000;
+		state->s0 &= 0xFF000000;
+		emit(emitctx, ERROR);
+		/*
+		 * If we were in the SS2 or SS3 container, we
+		 * automatically exit it.
+		 */
+		if (state->s1 & 0x60000000)
+		    state->s1 &= 0x9FFFFFFF;
+	    }
+
+	    if (cont == 0 && (input_chr & 0x80)) {
+		state->s1 |= 0x20000000;
+	    }
+	}
+
 	/* The current character and its length. */
-	chr = ((state->s0 & 0x00FFFFFF) << 8) | input_chr;
+	chr = ((state->s0 & 0x00FFFFFF) << 8) | (input_chr & 0x7F);
 	chrlen = ((state->s1 >> 24) & 0xF) + 1;
 	/* The current sub-charset. */
 	cont = (state->s1 >> 28) & 7;
@@ -280,7 +332,7 @@ static int write_iso2022s(charset_spec const *charset, long int input_chr,
 			  void *emitctx)
 {
     struct iso2022 const *iso = (struct iso2022 *)charset->data;
-    int subcharset, len, i, j, cont;
+    int subcharset, len, i, j, cont, topbit = 0;
     unsigned long bytes;
 
     /*
@@ -333,7 +385,8 @@ static int write_iso2022s(charset_spec const *charset, long int input_chr,
      * necessary, and then output the given bytes.
      */
     for (i = 0; i < iso->nescapes; i++)
-	if (iso->escapes[i].subcharset == subcharset)
+	if (iso->escapes[i].subcharset == subcharset &&
+	    !(iso->escapes[i].container & RO))
 	    break;
     assert(i < iso->nescapes);
 
@@ -343,7 +396,7 @@ static int write_iso2022s(charset_spec const *charset, long int input_chr,
      * already _be_ selected in that container! Check before we go
      * to the effort of emitting the sequence.
      */
-    cont = iso->escapes[i].container;
+    cont = iso->escapes[i].container &~ RO;
     if (((state->s1 >> (6*cont)) & 0x3F) != (unsigned)subcharset) {
 	for (j = 0; iso->escapes[i].sequence[j]; j++)
 	    emit(emitctx, iso->escapes[i].sequence[j]);
@@ -360,9 +413,17 @@ static int write_iso2022s(charset_spec const *charset, long int input_chr,
 	emit(emitctx, ESC);
 	emit(emitctx, 'L' + cont);     /* comes out to 'N' or 'O' */
     } else {
-	/* Emit SI or SO, but only if the current container isn't already
-	 * the right one. */
-	if (((state->s1 >> 28) & 7) != (unsigned)cont) {
+	/*
+	 * Emit SI or SO, but only if the current container isn't already
+	 * the right one.
+	 * 
+	 * Also, in an 8-bit subset, we need not do this; we'll
+	 * just use 8-bit characters to output SO-container
+	 * characters.
+	 */
+	if (iso->eightbit && cont == 1 && ((state->s1 >> 28) & 7) == 0) {
+	    topbit = 0x80;
+	} else if (((state->s1 >> 28) & 7) != (unsigned)cont) {
 	    emit(emitctx, cont ? SO : SI);
 	    state->s1 = (state->s1 & 0x8FFFFFFF) | (cont << 28);
 	}
@@ -374,7 +435,7 @@ static int write_iso2022s(charset_spec const *charset, long int input_chr,
      */
     len = iso->nbytes[subcharset];
     while (len--)
-	emit(emitctx, (bytes >> (8*len)) & 0xFF);
+	emit(emitctx, ((bytes >> (8*len)) & 0xFF) | topbit);
 
     return TRUE;
 }
@@ -385,12 +446,13 @@ static int write_iso2022s(charset_spec const *charset, long int input_chr,
 static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes)
 {
     switch (subcharset) {
+      case 1:			       /* JIS X 0201 bottom half */
+	if (bytes == 0x5C)
+	    return 0xA5;
+	else if (bytes == 0x7E)
+	    return 0x203E;
+	/* else fall through to ASCII */
       case 0: return bytes;	       /* one-byte ASCII */
-      case 1:			       /* JIS X 0201 half-width katakana */
-	if (bytes >= 0x21 && bytes <= 0x5F)
-	    return bytes + (0xFF61 - 0x21);
-	else
-	    return ERROR;
 	/* (no break needed since all control paths have returned) */
       case 2: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
 					 ((bytes     ) & 0xFF) - 0x21);
@@ -405,9 +467,9 @@ static int iso2022jp_from_ucs(long int ucs, int *subcharset,
 	*subcharset = 0;
 	*bytes = ucs;
 	return 1;
-    } else if (ucs >= 0xFF61 && ucs <= 0xFF9F) {
+    } else if (ucs == 0xA5 || ucs == 0x203E) {
 	*subcharset = 1;
-	*bytes = ucs - (0xFF61 - 0x21);
+	*bytes = (ucs == 0xA5 ? 0x5C : 0x7E);
 	return 1;
     } else if (unicode_to_jisx0208(ucs, &r, &c)) {
 	*subcharset = 2;
@@ -417,15 +479,16 @@ static int iso2022jp_from_ucs(long int ucs, int *subcharset,
 	return 0;
     }
 }
-static struct iso2022_escape iso2022jp_escapes[] = {
+static const struct iso2022_escape iso2022jp_escapes[] = {
     {"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1},   /* we ignore this one */
     {"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2},
     {"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0},
     {"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1},
 };
-static struct iso2022 iso2022jp = {
+static const struct iso2022 iso2022jp = {
     iso2022jp_escapes, lenof(iso2022jp_escapes),
-    "\1\1\2", "\3", 0x80000000, NULL, iso2022jp_to_ucs, iso2022jp_from_ucs
+    "\1\1\2", "\3", 0x80000000, NULL, FALSE,
+    iso2022jp_to_ucs, iso2022jp_from_ucs
 };
 const charset_spec charset_CS_ISO2022_JP = {
     CS_ISO2022_JP, read_iso2022s, write_iso2022s, &iso2022jp
@@ -459,22 +522,236 @@ static int iso2022kr_from_ucs(long int ucs, int *subcharset,
 	return 0;
     }
 }
-static struct iso2022_escape iso2022kr_escapes[] = {
+static const struct iso2022_escape iso2022kr_escapes[] = {
     {"\016", 0x8FFFFFFF, 0x10000000, -1, -1},
     {"\017", 0x8FFFFFFF, 0x00000000, 0, 0},
     {"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1},   /* bits[11:6] <- 1 */
 };
-static struct iso2022 iso2022kr = {
+static const struct iso2022 iso2022kr = {
     iso2022kr_escapes, lenof(iso2022kr_escapes),
-    "\1\2", "\2", 0x80000040, "\033$)C", iso2022kr_to_ucs, iso2022kr_from_ucs
+    "\1\2", "\2", 0x80000040, "\033$)C", FALSE,
+    iso2022kr_to_ucs, iso2022kr_from_ucs
 };
 const charset_spec charset_CS_ISO2022_KR = {
     CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr
 };
 
+/*
+ * The COMPOUND_TEXT encoding used in X selections. Defined by the
+ * X Consortium.
+ * 
+ * This encoding has quite a few sub-charsets. The index assigned
+ * to each of them here is given by the enum below.
+ */
+enum {
+    /* Order must match the bytes-per-character string in `ctext' below. */
+    CTEXT_ASCII,
+    CTEXT_JISX0201_LEFT,
+    CTEXT_JISX0201_RIGHT,
+    CTEXT_ISO8859_1,		       /* 8859-N: right-hand halves; see ctext_to_ucs */
+    CTEXT_ISO8859_2,
+    CTEXT_ISO8859_3,
+    CTEXT_ISO8859_4,
+    CTEXT_ISO8859_5,
+    CTEXT_ISO8859_6,
+    CTEXT_ISO8859_7,
+    CTEXT_ISO8859_8,
+    CTEXT_ISO8859_9,
+    CTEXT_GB2312,		       /* from here on: two bytes per character */
+    CTEXT_KSC5601,
+    CTEXT_JISX0208,
+    CTEXT_JISX0212
+};
+static long int ctext_to_ucs(int subcharset, unsigned long bytes)
+{   /* Map one character (`bytes') of sub-charset CTEXT_* to Unicode. */
+    switch (subcharset) {
+      case CTEXT_ASCII: return bytes;	       /* one-byte ASCII */
+      case CTEXT_JISX0201_LEFT:        /* ASCII with yen and overline */
+	return sbcs_to_unicode(&sbcsdata_CS_JISX0201, bytes & 0x7F);
+      case CTEXT_JISX0201_RIGHT:       /* JIS X 0201 half-width katakana */
+	return sbcs_to_unicode(&sbcsdata_CS_JISX0201, (bytes & 0x7F) | 0x80);
+      case CTEXT_ISO8859_1:	       /* 8859-N: look up in the top half, so force bit 7 on */
+	return sbcs_to_unicode(&sbcsdata_CS_ISO8859_1, (bytes & 0x7F) | 0x80);
+      case CTEXT_ISO8859_2:
+	return sbcs_to_unicode(&sbcsdata_CS_ISO8859_2, (bytes & 0x7F) | 0x80);
+      case CTEXT_ISO8859_3:
+	return sbcs_to_unicode(&sbcsdata_CS_ISO8859_3, (bytes & 0x7F) | 0x80);
+      case CTEXT_ISO8859_4:
+	return sbcs_to_unicode(&sbcsdata_CS_ISO8859_4, (bytes & 0x7F) | 0x80);
+      case CTEXT_ISO8859_5:
+	return sbcs_to_unicode(&sbcsdata_CS_ISO8859_5, (bytes & 0x7F) | 0x80);
+      case CTEXT_ISO8859_6:
+	return sbcs_to_unicode(&sbcsdata_CS_ISO8859_6, (bytes & 0x7F) | 0x80);
+      case CTEXT_ISO8859_7:
+	return sbcs_to_unicode(&sbcsdata_CS_ISO8859_7, (bytes & 0x7F) | 0x80);
+      case CTEXT_ISO8859_8:
+	return sbcs_to_unicode(&sbcsdata_CS_ISO8859_8, (bytes & 0x7F) | 0x80);
+      case CTEXT_ISO8859_9:
+	return sbcs_to_unicode(&sbcsdata_CS_ISO8859_9, (bytes & 0x7F) | 0x80);
+      case CTEXT_GB2312:	       /* two-byte sets: rebase each byte from 0x21 to 0 */
+	return gb2312_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
+				 ((bytes     ) & 0xFF) - 0x21);
+      case CTEXT_KSC5601:
+	return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
+				  ((bytes     ) & 0xFF) - 0x21);
+      case CTEXT_JISX0208:
+	return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
+				   ((bytes     ) & 0xFF) - 0x21);
+      case CTEXT_JISX0212:
+	return jisx0212_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
+				   ((bytes     ) & 0xFF) - 0x21);
+      default: return ERROR;	       /* not a sub-charset index we know */
+    }
+}
+static int ctext_from_ucs(long int ucs, int *subcharset, unsigned long *bytes)
+{   /* Choose a CTEXT_* sub-charset for `ucs': sets both outputs and returns 1, or returns 0. */
+    int r, c;
+    if (ucs < 0x80) {		       /* candidates are tried in order; first match wins */
+	*subcharset = CTEXT_ASCII;
+	*bytes = ucs;
+	return 1;
+    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_1, ucs)) != ERROR) {
+	*subcharset = CTEXT_ISO8859_1;
+	*bytes = c - 0x80;	       /* ctext_to_ucs ORs the 0x80 back in */
+	return 1;
+    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_2, ucs)) != ERROR) {
+	*subcharset = CTEXT_ISO8859_2;
+	*bytes = c - 0x80;
+	return 1;
+    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_3, ucs)) != ERROR) {
+	*subcharset = CTEXT_ISO8859_3;
+	*bytes = c - 0x80;
+	return 1;
+    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_4, ucs)) != ERROR) {
+	*subcharset = CTEXT_ISO8859_4;
+	*bytes = c - 0x80;
+	return 1;
+    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_5, ucs)) != ERROR) {
+	*subcharset = CTEXT_ISO8859_5;
+	*bytes = c - 0x80;
+	return 1;
+    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_6, ucs)) != ERROR) {
+	*subcharset = CTEXT_ISO8859_6;
+	*bytes = c - 0x80;
+	return 1;
+    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_7, ucs)) != ERROR) {
+	*subcharset = CTEXT_ISO8859_7;
+	*bytes = c - 0x80;
+	return 1;
+    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_8, ucs)) != ERROR) {
+	*subcharset = CTEXT_ISO8859_8;
+	*bytes = c - 0x80;
+	return 1;
+    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_9, ucs)) != ERROR) {
+	*subcharset = CTEXT_ISO8859_9;
+	*bytes = c - 0x80;
+	return 1;
+    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_JISX0201, ucs)) != ERROR) {
+	if (c < 0x80) {		       /* the table's low half and high half are separate sub-charsets */
+	    *subcharset = CTEXT_JISX0201_LEFT;
+	} else {
+	    *subcharset = CTEXT_JISX0201_RIGHT;
+	    c -= 0x80;
+	}
+	*bytes = c;
+	return 1;
+    } else if (unicode_to_gb2312(ucs, &r, &c)) {   /* two-byte sets: pack r and c, each offset by 0x21 */
+	*subcharset = CTEXT_GB2312;
+	*bytes = ((r+0x21) << 8) | (c+0x21);
+	return 1;
+    } else if (unicode_to_ksx1001(ucs, &r, &c)) {
+	*subcharset = CTEXT_KSC5601;
+	*bytes = ((r+0x21) << 8) | (c+0x21);
+	return 1;
+    } else if (unicode_to_jisx0208(ucs, &r, &c)) {
+	*subcharset = CTEXT_JISX0208;
+	*bytes = ((r+0x21) << 8) | (c+0x21);
+	return 1;
+    } else if (unicode_to_jisx0212(ucs, &r, &c)) {
+	*subcharset = CTEXT_JISX0212;
+	*bytes = ((r+0x21) << 8) | (c+0x21);
+	return 1;
+    } else {
+	return 0;		       /* no sub-charset matched; outputs left untouched */
+    }
+}
+#define SEQ(str,cont,cs) /* fields: sequence, AND-mask, OR-bits, container, sub-charset */ \
+    {str,~(63<<(6*(((cont)&~RO)))),(cs)<<(6*(((cont)&~RO))),(cont),(cs)} /* mask/bits address the 6-bit slot of `cont' with RO stripped */
+/*
+ * Compound text defines restrictions on which container can take
+ * which character sets. Things labelled `left half of' can only go
+ * in GL; things labelled `right half of' can only go in GR; and 96
+ * or 96^n character sets only _fit_ in GR. Thus:
+ *  - ASCII can only go in GL since it is the left half of 8859-*.
+ *  - All the 8859 sets can only go in GR.
+ *  - JISX0201 left is GL only; JISX0201 right is GR only.
+ *  - The four multibyte sets (GB2312, JISX0208, KSC5601,
+ *    JISX0212) can go in either; we prefer GR where possible
+ *    since this leads to a more compact EUC-like encoding.
+ */
+static const struct iso2022_escape ctext_escapes[] = {
+    SEQ("\033$(A", 0|RO, CTEXT_GB2312),    /* RO: write_iso2022s skips these four GL entries */
+    SEQ("\033$(B", 0|RO, CTEXT_JISX0208),
+    SEQ("\033$(C", 0|RO, CTEXT_KSC5601),
+    SEQ("\033$(D", 0|RO, CTEXT_JISX0212),
+    SEQ("\033$)A", 1, CTEXT_GB2312),
+    SEQ("\033$)B", 1, CTEXT_JISX0208),
+    SEQ("\033$)C", 1, CTEXT_KSC5601),
+    SEQ("\033$)D", 1, CTEXT_JISX0212),
+    SEQ("\033(B", 0, CTEXT_ASCII),
+    SEQ("\033(J", 0, CTEXT_JISX0201_LEFT),
+    SEQ("\033)I", 1, CTEXT_JISX0201_RIGHT),
+    SEQ("\033-A", 1, CTEXT_ISO8859_1),
+    SEQ("\033-B", 1, CTEXT_ISO8859_2),
+    SEQ("\033-C", 1, CTEXT_ISO8859_3),
+    SEQ("\033-D", 1, CTEXT_ISO8859_4),
+    SEQ("\033-F", 1, CTEXT_ISO8859_7),     /* entries are sorted by sequence, not by 8859 part number */
+    SEQ("\033-G", 1, CTEXT_ISO8859_6),
+    SEQ("\033-H", 1, CTEXT_ISO8859_8),
+    SEQ("\033-L", 1, CTEXT_ISO8859_5),
+    SEQ("\033-M", 1, CTEXT_ISO8859_9),
+
+    /*
+     * Cross-testing against Xutf8TextListToTextProperty() turns up
+     * some additional character sets and ISO 2022 features
+     * supported by that and not by us:
+     * 
+     * 	- Single-byte right-hand-half character sets `ESC - f',
+     * 	  `ESC - T' and `ESC - Y'.
+     * 
+     * 	- A really horrifying mechanism used to escape completely
+     * 	  from the ISO 2022 framework: ESC % / <length>
+     * 	  <charset-name> <text>. Xutf8* uses this to encode
+     * 	  "iso8859-14", "iso8859-15" and "big5-0".
+     * 	   * This mechanism is particularly nasty because we can't
+     * 	     efficiently encode it on the fly! It requires that the
+     * 	     length of the text encoded in the foreign charset is
+     * 	     given _before_ the text in question, so if we're
+     * 	     receiving one character at a time we simply can't look
+     * 	     ahead and so we would have to encode each individual
+     * 	     character in a separate one of these sequences.
+     * 
+     * 	- ESC % G and ESC % @ to shift to and from UTF-8 mode, as a
+     * 	  last resort for anything we still don't support.
+     * 	   * Interestingly, ctext.ps actually _disallows_ this: it
+     * 	     says that the above extension mechanism is the only
+     * 	     one permitted. Ho hum.
+     */
+};
+static const struct iso2022 ctext = {
+    ctext_escapes, lenof(ctext_escapes),
+    "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\2\2",  /* bytes per char, one per CTEXT_*; must match the enum above */
+    "", 0x80000000 | (CTEXT_ASCII<<0) | (CTEXT_ISO8859_1<<6), "", TRUE,
+    ctext_to_ucs, ctext_from_ucs   /* line above: presumably initial state (slot 0 = ASCII, slot 1 = 8859-1); TRUE = eightbit */
+};
+const charset_spec charset_CS_CTEXT = {
+    CS_CTEXT, read_iso2022s, write_iso2022s, &ctext   /* COMPOUND_TEXT: shared ISO 2022 subset reader/writer, configured by `ctext' */
+};
+
 #else /* ENUM_CHARSETS */
 
 ENUM_CHARSET(CS_ISO2022_JP)
 ENUM_CHARSET(CS_ISO2022_KR)
+ENUM_CHARSET(CS_CTEXT)
 
 #endif /* ENUM_CHARSETS */