Silly of me to overlook it: another obvious way you might like to

[sgt/charset] / iso2022s.c
diff --git a/iso2022s.c b/iso2022s.c

index e202207..a1eceb8 100644 (file)
--- a/iso2022s.c
+++ b/iso2022s.c
@@ -1,9 +1,5 @@
  /*
   * iso2022s.c - support for ISO-2022 subset encodings.
- * 
- * (The `s' suffix on the filename is there to leave `iso2022.c'
- * free for the unlikely event that I ever attempt to implement
- * _full_ ISO-2022 in this library!)
   */
  
  #ifndef ENUM_CHARSETS
@@ -14,6 +10,7 @@
  
  #include "charset.h"
  #include "internal.h"
+#include "sbcsdat.h"
  
  #define SO (0x0E)
  #define SI (0x0F)
@@ -26,9 +23,14 @@ struct iso2022_escape {
      /*
       * For output, these variables help us figure out which escape
       * sequences we need to get where we want to be.
+     * 
+     * `container' should be in the range 0-3, but can also be ORed
+     * with the bit flag RO to indicate that this is not a
+     * preferred container to use for this charset during output.
       */
      int container, subcharset;
  };
+#define RO 0x80
  
  struct iso2022 {
      /*
@@ -36,7 +38,7 @@ struct iso2022 {
       * in ASCII order, so that we can narrow down the list as
       * necessary.
       */
-    struct iso2022_escape *escapes;    /* must be sorted in ASCII order! */
+    const struct iso2022_escape *escapes;/* must be sorted in ASCII order! */
      int nescapes;
  
      /*
@@ -74,6 +76,11 @@ struct iso2022 {
      char const *initial_sequence;
  
      /*
+     * Is this an 8-bit ISO 2022 subset?
+     */
+    int eightbit;
+
+    /*
       * Function calls to do the actual translation.
       */
      long int (*to_ucs)(int subcharset, unsigned long bytes);
@@ -115,6 +122,11 @@ static void read_iso2022s(charset_spec const *charset, long int input_chr,
       *              either 2 or 3.
       *            + Hence: 0 is SI, 1 is SO, 4 is SS2-from-SI, 5 is
       *              SS2-from-SO, 6 is SS3-from-SI, 7 is SS3-from-SO.
+     *            + For added fun: in an _8-bit_ ISO 2022 subset, we have
+     *              the further special value 2, which means that we're
+     *              theoretically in SI but the current character being
+     *              accumulated is composed of 8-bit characters and will
+     *              therefore be interpreted as if in SO.
       * 
       *         - The next nibble of s1 (27:24) indicates how many bytes
       *           have been accumulated in the current character.
@@ -214,9 +226,12 @@ static void read_iso2022s(charset_spec const *charset, long int input_chr,
      /*
       * If this isn't an escape sequence, it must be part of a
       * character. One possibility is that it's a control character
-     * (outside the space 21-7E), in which case we output it verbatim.
+     * (00-20 or 7F-9F; in non-8-bit ISO 2022 subsets, all
+     * top-half characters are also treated as controls), in
+     * which case we output it verbatim.
       */
-    if (input_chr < 0x21 || input_chr > 0x7E) {
+    if (input_chr < 0x21 ||
+       (input_chr > 0x7E && (!iso->eightbit || input_chr < 0xA0))) {
         /*
          * We might be in mid-multibyte-character. If so, clear the
          * character state and emit an error token for the
@@ -245,8 +260,45 @@ static void read_iso2022s(charset_spec const *charset, long int input_chr,
         unsigned long chr;
         int chrlen, cont, subcharset, bytes;
  
+       /*
+        * Verify that we've seen the right kind of character for
+        * what we're currently doing. This only matters in 8-bit
+        * subsets.
+        */
+       if (iso->eightbit) {
+           cont = (state->s1 >> 28) & 7;
+           /*
+            * If cont==0, we're entitled to see either GL or GR
+            * characters. If cont==2, we expect only GR; otherwise
+            * we expect only GL.
+            * 
+            * If we see a GR character while cont==0, we set
+            * cont=2 immediately.
+            */
+           if ((cont == 2 && !(input_chr & 0x80)) ||
+               (cont != 0 && cont != 2 && (input_chr & 0x80))) {
+               /*
+                * Clear the previous character; it was prematurely
+                * terminated by this error.
+                */
+               state->s1 &= ~0x0F000000;
+               state->s0 &= 0xFF000000;
+               emit(emitctx, ERROR);
+               /*
+                * If we were in the SS2 or SS3 container, we
+                * automatically exit it.
+                */
+               if (state->s1 & 0x60000000)
+                   state->s1 &= 0x9FFFFFFF;
+           }
+
+           if (cont == 0 && (input_chr & 0x80)) {
+               state->s1 |= 0x20000000;
+           }
+       }
+
         /* The current character and its length. */
-       chr = ((state->s0 & 0x00FFFFFF) << 8) | input_chr;
+       chr = ((state->s0 & 0x00FFFFFF) << 8) | (input_chr & 0x7F);
         chrlen = ((state->s1 >> 24) & 0xF) + 1;
         /* The current sub-charset. */
         cont = (state->s1 >> 28) & 7;
@@ -280,7 +332,7 @@ static int write_iso2022s(charset_spec const *charset, long int input_chr,
                           void *emitctx)
  {
      struct iso2022 const *iso = (struct iso2022 *)charset->data;
-    int subcharset, len, i, j, cont;
+    int subcharset, len, i, j, cont, topbit = 0;
      unsigned long bytes;
  
      /*
@@ -333,7 +385,8 @@ static int write_iso2022s(charset_spec const *charset, long int input_chr,
       * necessary, and then output the given bytes.
       */
      for (i = 0; i < iso->nescapes; i++)
-       if (iso->escapes[i].subcharset == subcharset)
+       if (iso->escapes[i].subcharset == subcharset &&
+           !(iso->escapes[i].container & RO))
             break;
      assert(i < iso->nescapes);
  
@@ -343,7 +396,7 @@ static int write_iso2022s(charset_spec const *charset, long int input_chr,
       * already _be_ selected in that container! Check before we go
       * to the effort of emitting the sequence.
       */
-    cont = iso->escapes[i].container;
+    cont = iso->escapes[i].container &~ RO;
      if (((state->s1 >> (6*cont)) & 0x3F) != (unsigned)subcharset) {
         for (j = 0; iso->escapes[i].sequence[j]; j++)
             emit(emitctx, iso->escapes[i].sequence[j]);
@@ -360,9 +413,17 @@ static int write_iso2022s(charset_spec const *charset, long int input_chr,
         emit(emitctx, ESC);
         emit(emitctx, 'L' + cont);     /* comes out to 'N' or 'O' */
      } else {
-       /* Emit SI or SO, but only if the current container isn't already
-        * the right one. */
-       if (((state->s1 >> 28) & 7) != (unsigned)cont) {
+       /*
+        * Emit SI or SO, but only if the current container isn't already
+        * the right one.
+        * 
+        * Exception: in an 8-bit subset, an SO-container character
+        * needed while in SI is output with the top bit set on
+        * each byte instead, so no shift character is emitted.
+        */
+       if (iso->eightbit && cont == 1 && ((state->s1 >> 28) & 7) == 0) {
+           topbit = 0x80;
+       } else if (((state->s1 >> 28) & 7) != (unsigned)cont) {
             emit(emitctx, cont ? SO : SI);
             state->s1 = (state->s1 & 0x8FFFFFFF) | (cont << 28);
         }
@@ -374,7 +435,7 @@ static int write_iso2022s(charset_spec const *charset, long int input_chr,
       */
      len = iso->nbytes[subcharset];
      while (len--)
-       emit(emitctx, (bytes >> (8*len)) & 0xFF);
+       emit(emitctx, ((bytes >> (8*len)) & 0xFF) | topbit);
  
      return TRUE;
  }
@@ -385,12 +446,13 @@ static int write_iso2022s(charset_spec const *charset, long int input_chr,
  static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes)
  {
      switch (subcharset) {
+      case 1:                         /* JIS X 0201 bottom half */
+       if (bytes == 0x5C)
+           return 0xA5;
+       else if (bytes == 0x7E)
+           return 0x203E;
+       /* else fall through to ASCII */
        case 0: return bytes;           /* one-byte ASCII */
-      case 1:                         /* JIS X 0201 half-width katakana */
-       if (bytes >= 0x21 && bytes <= 0x5F)
-           return bytes + (0xFF61 - 0x21);
-       else
-           return ERROR;
         /* (no break needed since all control paths have returned) */
        case 2: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
                                          ((bytes     ) & 0xFF) - 0x21);
@@ -405,9 +467,9 @@ static int iso2022jp_from_ucs(long int ucs, int *subcharset,
         *subcharset = 0;
         *bytes = ucs;
         return 1;
-    } else if (ucs >= 0xFF61 && ucs <= 0xFF9F) {
+    } else if (ucs == 0xA5 || ucs == 0x203E) {
         *subcharset = 1;
-       *bytes = ucs - (0xFF61 - 0x21);
+       *bytes = (ucs == 0xA5 ? 0x5C : 0x7E);
         return 1;
      } else if (unicode_to_jisx0208(ucs, &r, &c)) {
         *subcharset = 2;
@@ -417,15 +479,16 @@ static int iso2022jp_from_ucs(long int ucs, int *subcharset,
         return 0;
      }
  }
-static struct iso2022_escape iso2022jp_escapes[] = {
+static const struct iso2022_escape iso2022jp_escapes[] = {
      {"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1},   /* we ignore this one */
      {"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2},
      {"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0},
      {"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1},
  };
-static struct iso2022 iso2022jp = {
+static const struct iso2022 iso2022jp = {
      iso2022jp_escapes, lenof(iso2022jp_escapes),
-    "\1\1\2", "\3", 0x80000000, NULL, iso2022jp_to_ucs, iso2022jp_from_ucs
+    "\1\1\2", "\3", 0x80000000, NULL, FALSE,
+    iso2022jp_to_ucs, iso2022jp_from_ucs
  };
  const charset_spec charset_CS_ISO2022_JP = {
      CS_ISO2022_JP, read_iso2022s, write_iso2022s, &iso2022jp
@@ -459,14 +522,15 @@ static int iso2022kr_from_ucs(long int ucs, int *subcharset,
         return 0;
      }
  }
-static struct iso2022_escape iso2022kr_escapes[] = {
+static const struct iso2022_escape iso2022kr_escapes[] = {
      {"\016", 0x8FFFFFFF, 0x10000000, -1, -1},
      {"\017", 0x8FFFFFFF, 0x00000000, 0, 0},
      {"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1},   /* bits[11:6] <- 1 */
  };
-static struct iso2022 iso2022kr = {
+static const struct iso2022 iso2022kr = {
      iso2022kr_escapes, lenof(iso2022kr_escapes),
-    "\1\2", "\2", 0x80000040, "\033$)C", iso2022kr_to_ucs, iso2022kr_from_ucs
+    "\1\2", "\2", 0x80000040, "\033$)C", FALSE,
+    iso2022kr_to_ucs, iso2022kr_from_ucs
  };
  const charset_spec charset_CS_ISO2022_KR = {
      CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr