TIS-620 is equivalent to ISO 8859-11, so map the MIME name for the former to the latter.

[sgt/charset] / iso2022s.c
diff --git a/iso2022s.c b/iso2022s.c

index 087fd96..000c57e 100644 (file)
--- a/iso2022s.c
+++ b/iso2022s.c
@@ -42,7 +42,7 @@ struct iso2022 {
       * in ASCII order, so that we can narrow down the list as
       * necessary.
       */
-    struct iso2022_escape *escapes;    /* must be sorted in ASCII order! */
+    const struct iso2022_escape *escapes;/* must be sorted in ASCII order! */
      int nescapes;
  
      /*
@@ -482,13 +482,13 @@ static int iso2022jp_from_ucs(long int ucs, int *subcharset,
         return 0;
      }
  }
-static struct iso2022_escape iso2022jp_escapes[] = {
+static const struct iso2022_escape iso2022jp_escapes[] = {
      {"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1},   /* we ignore this one */
      {"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2},
      {"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0},
      {"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1},
  };
-static struct iso2022 iso2022jp = {
+static const struct iso2022 iso2022jp = {
      iso2022jp_escapes, lenof(iso2022jp_escapes),
      "\1\1\2", "\3", 0x80000000, NULL, FALSE,
      iso2022jp_to_ucs, iso2022jp_from_ucs
@@ -525,12 +525,12 @@ static int iso2022kr_from_ucs(long int ucs, int *subcharset,
         return 0;
      }
  }
-static struct iso2022_escape iso2022kr_escapes[] = {
+static const struct iso2022_escape iso2022kr_escapes[] = {
      {"\016", 0x8FFFFFFF, 0x10000000, -1, -1},
      {"\017", 0x8FFFFFFF, 0x00000000, 0, 0},
      {"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1},   /* bits[11:6] <- 1 */
  };
-static struct iso2022 iso2022kr = {
+static const struct iso2022 iso2022kr = {
      iso2022kr_escapes, lenof(iso2022kr_escapes),
      "\1\2", "\2", 0x80000040, "\033$)C", FALSE,
      iso2022kr_to_ucs, iso2022kr_from_ucs
@@ -562,7 +562,8 @@ enum {
      CTEXT_ISO8859_9,
      CTEXT_GB2312,
      CTEXT_KSC5601,
-    CTEXT_JISX0208
+    CTEXT_JISX0208,
+    CTEXT_JISX0212
  };
  static long int ctext_to_ucs(int subcharset, unsigned long bytes)
  {
@@ -599,6 +600,9 @@ static long int ctext_to_ucs(int subcharset, unsigned long bytes)
        case CTEXT_JISX0208:
         return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
                                    ((bytes     ) & 0xFF) - 0x21);
+      case CTEXT_JISX0212:
+       return jisx0212_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
+                                  ((bytes     ) & 0xFF) - 0x21);
        default: return ERROR;
      }
  }
@@ -666,12 +670,16 @@ static int ctext_from_ucs(long int ucs, int *subcharset, unsigned long *bytes)
         *subcharset = CTEXT_JISX0208;
         *bytes = ((r+0x21) << 8) | (c+0x21);
         return 1;
+    } else if (unicode_to_jisx0212(ucs, &r, &c)) {
+       *subcharset = CTEXT_JISX0212;
+       *bytes = ((r+0x21) << 8) | (c+0x21);
+       return 1;
      } else {
         return 0;
      }
  }
  #define SEQ(str,cont,cs) \
-    {str,~(63<<(6*((cont&~RO)))),(cs)<<(6*((cont&~RO))),(cont),(cs)}
+    {str,~(63<<(6*(((cont)&~RO)))),(cs)<<(6*(((cont)&~RO))),(cont),(cs)}
  /*
   * Compound text defines restrictions on which container can take
   * which character sets. Things labelled `left half of' can only go
@@ -684,13 +692,15 @@ static int ctext_from_ucs(long int ucs, int *subcharset, unsigned long *bytes)
   *    in either; we prefer GR where possible since this leads to a
   *    more compact EUC-like encoding.
   */
-static struct iso2022_escape ctext_escapes[] = {
+static const struct iso2022_escape ctext_escapes[] = {
      SEQ("\033$(A", 0|RO, CTEXT_GB2312),
      SEQ("\033$(B", 0|RO, CTEXT_JISX0208),
      SEQ("\033$(C", 0|RO, CTEXT_KSC5601),
+    SEQ("\033$(D", 0|RO, CTEXT_JISX0212),
      SEQ("\033$)A", 1, CTEXT_GB2312),
      SEQ("\033$)B", 1, CTEXT_JISX0208),
      SEQ("\033$)C", 1, CTEXT_KSC5601),
+    SEQ("\033$)D", 1, CTEXT_JISX0212),
      SEQ("\033(B", 0, CTEXT_ASCII),
      SEQ("\033(J", 0, CTEXT_JISX0201_LEFT),
      SEQ("\033)I", 1, CTEXT_JISX0201_RIGHT),
@@ -703,10 +713,37 @@ static struct iso2022_escape ctext_escapes[] = {
      SEQ("\033-H", 1, CTEXT_ISO8859_8),
      SEQ("\033-L", 1, CTEXT_ISO8859_5),
      SEQ("\033-M", 1, CTEXT_ISO8859_9),
+
+    /*
+     * Cross-testing against Xutf8TextListToTextProperty() turns up
+     * some additional character sets and ISO 2022 features
+     * supported by that and not by us:
+     * 
+     *         - Single-byte right-hand-half character sets `ESC - f',
+     *           `ESC - T' and `ESC - Y'.
+     * 
+     *         - A really horrifying mechanism used to escape completely
+     *           from the ISO 2022 framework: ESC % / <length>
+     *           <charset-name> <text>. Xutf8* uses this to encode
+     *           "iso8859-14", "iso8859-15" and "big5-0".
+     *            * This mechanism is particularly nasty because we can't
+     *              efficiently encode it on the fly! It requires that the
+     *              length of the text encoded in the foreign charset is
+     *              given _before_ the text in question, so if we're
+     *              receiving one character at a time we simply can't look
+     *              ahead and so we would have to encode each individual
+     *              character in a separate one of these sequences.
+     * 
+     *         - ESC % G and ESC % @ to shift to and from UTF-8 mode, as a
+     *           last resort for anything we still don't support.
+     *            * Interestingly, ctext.ps actually _disallows_ this: it
+     *              says that the above extension mechanism is the only
+     *              one permitted. Ho hum.
+     */
  };
-static struct iso2022 ctext = {
+static const struct iso2022 ctext = {
      ctext_escapes, lenof(ctext_escapes),
-    "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\2",  /* must match the enum above */
+    "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\2\2",  /* must match the enum above */
      "", 0x80000000 | (CTEXT_ASCII<<0) | (CTEXT_ISO8859_1<<6), "", TRUE,
      ctext_to_ucs, ctext_from_ucs
  };