Silly of me to overlook it: another obvious way you might like to [...]

[sgt/charset] / iso2022s.c
diff --git a/iso2022s.c b/iso2022s.c

index 3cf8fa4..a1eceb8 100644 (file)
--- a/iso2022s.c
+++ b/iso2022s.c
@@ -446,12 +446,13 @@ static int write_iso2022s(charset_spec const *charset, long int input_chr,
  static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes)
  {
      switch (subcharset) {
+      case 1:                         /* JIS X 0201 bottom half */
+       if (bytes == 0x5C)
+           return 0xA5;
+       else if (bytes == 0x7E)
+           return 0x203E;
+       /* else fall through to ASCII */
        case 0: return bytes;           /* one-byte ASCII */
-      case 1:                         /* JIS X 0201 half-width katakana */
-       if (bytes >= 0x21 && bytes <= 0x5F)
-           return bytes + (0xFF61 - 0x21);
-       else
-           return ERROR;
         /* (no break needed since all control paths have returned) */
        case 2: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
                                          ((bytes     ) & 0xFF) - 0x21);
@@ -466,9 +467,9 @@ static int iso2022jp_from_ucs(long int ucs, int *subcharset,
         *subcharset = 0;
         *bytes = ucs;
         return 1;
-    } else if (ucs >= 0xFF61 && ucs <= 0xFF9F) {
+    } else if (ucs == 0xA5 || ucs == 0x203E) {
         *subcharset = 1;
-       *bytes = ucs - (0xFF61 - 0x21);
+       *bytes = (ucs == 0xA5 ? 0x5C : 0x7E);
         return 1;
      } else if (unicode_to_jisx0208(ucs, &r, &c)) {
         *subcharset = 2;
@@ -535,222 +536,9 @@ const charset_spec charset_CS_ISO2022_KR = {
      CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr
  };
  
-/*
- * The COMPOUND_TEXT encoding used in X selections. Defined by the
- * X consortium.
- * 
- * This encoding has quite a few sub-charsets. The order I assign
- * to them here is given in an enum.
- */
-enum {
-    /* This must match the bytes-per-character string given below. */
-    CTEXT_ASCII,
-    CTEXT_JISX0201_LEFT,
-    CTEXT_JISX0201_RIGHT,
-    CTEXT_ISO8859_1,
-    CTEXT_ISO8859_2,
-    CTEXT_ISO8859_3,
-    CTEXT_ISO8859_4,
-    CTEXT_ISO8859_5,
-    CTEXT_ISO8859_6,
-    CTEXT_ISO8859_7,
-    CTEXT_ISO8859_8,
-    CTEXT_ISO8859_9,
-    CTEXT_GB2312,
-    CTEXT_KSC5601,
-    CTEXT_JISX0208,
-    CTEXT_JISX0212
-};
-static long int ctext_to_ucs(int subcharset, unsigned long bytes)
-{
-    switch (subcharset) {
-      case CTEXT_ASCII: return bytes;         /* one-byte ASCII */
-      case CTEXT_JISX0201_LEFT:        /* ASCII with yen and overline */
-       return sbcs_to_unicode(&sbcsdata_CS_JISX0201, bytes & 0x7F);
-      case CTEXT_JISX0201_RIGHT:       /* JIS X 0201 half-width katakana */
-       return sbcs_to_unicode(&sbcsdata_CS_JISX0201, (bytes & 0x7F) | 0x80);
-      case CTEXT_ISO8859_1:
-       return sbcs_to_unicode(&sbcsdata_CS_ISO8859_1, (bytes & 0x7F) | 0x80);
-      case CTEXT_ISO8859_2:
-       return sbcs_to_unicode(&sbcsdata_CS_ISO8859_2, (bytes & 0x7F) | 0x80);
-      case CTEXT_ISO8859_3:
-       return sbcs_to_unicode(&sbcsdata_CS_ISO8859_3, (bytes & 0x7F) | 0x80);
-      case CTEXT_ISO8859_4:
-       return sbcs_to_unicode(&sbcsdata_CS_ISO8859_4, (bytes & 0x7F) | 0x80);
-      case CTEXT_ISO8859_5:
-       return sbcs_to_unicode(&sbcsdata_CS_ISO8859_5, (bytes & 0x7F) | 0x80);
-      case CTEXT_ISO8859_6:
-       return sbcs_to_unicode(&sbcsdata_CS_ISO8859_6, (bytes & 0x7F) | 0x80);
-      case CTEXT_ISO8859_7:
-       return sbcs_to_unicode(&sbcsdata_CS_ISO8859_7, (bytes & 0x7F) | 0x80);
-      case CTEXT_ISO8859_8:
-       return sbcs_to_unicode(&sbcsdata_CS_ISO8859_8, (bytes & 0x7F) | 0x80);
-      case CTEXT_ISO8859_9:
-       return sbcs_to_unicode(&sbcsdata_CS_ISO8859_9, (bytes & 0x7F) | 0x80);
-      case CTEXT_GB2312:
-       return gb2312_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
-                                ((bytes     ) & 0xFF) - 0x21);
-      case CTEXT_KSC5601:
-       return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
-                                 ((bytes     ) & 0xFF) - 0x21);
-      case CTEXT_JISX0208:
-       return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
-                                  ((bytes     ) & 0xFF) - 0x21);
-      case CTEXT_JISX0212:
-       return jisx0212_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
-                                  ((bytes     ) & 0xFF) - 0x21);
-      default: return ERROR;
-    }
-}
-static int ctext_from_ucs(long int ucs, int *subcharset, unsigned long *bytes)
-{
-    int r, c;
-    if (ucs < 0x80) {
-       *subcharset = CTEXT_ASCII;
-       *bytes = ucs;
-       return 1;
-    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_1, ucs)) != ERROR) {
-       *subcharset = CTEXT_ISO8859_1;
-       *bytes = c - 0x80;
-       return 1;
-    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_2, ucs)) != ERROR) {
-       *subcharset = CTEXT_ISO8859_2;
-       *bytes = c - 0x80;
-       return 1;
-    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_3, ucs)) != ERROR) {
-       *subcharset = CTEXT_ISO8859_3;
-       *bytes = c - 0x80;
-       return 1;
-    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_4, ucs)) != ERROR) {
-       *subcharset = CTEXT_ISO8859_4;
-       *bytes = c - 0x80;
-       return 1;
-    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_5, ucs)) != ERROR) {
-       *subcharset = CTEXT_ISO8859_5;
-       *bytes = c - 0x80;
-       return 1;
-    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_6, ucs)) != ERROR) {
-       *subcharset = CTEXT_ISO8859_6;
-       *bytes = c - 0x80;
-       return 1;
-    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_7, ucs)) != ERROR) {
-       *subcharset = CTEXT_ISO8859_7;
-       *bytes = c - 0x80;
-       return 1;
-    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_8, ucs)) != ERROR) {
-       *subcharset = CTEXT_ISO8859_8;
-       *bytes = c - 0x80;
-       return 1;
-    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_9, ucs)) != ERROR) {
-       *subcharset = CTEXT_ISO8859_9;
-       *bytes = c - 0x80;
-       return 1;
-    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_JISX0201, ucs)) != ERROR) {
-       if (c < 0x80) {
-           *subcharset = CTEXT_JISX0201_LEFT;
-       } else {
-           *subcharset = CTEXT_JISX0201_RIGHT;
-           c -= 0x80;
-       }
-       *bytes = c;
-       return 1;
-    } else if (unicode_to_gb2312(ucs, &r, &c)) {
-       *subcharset = CTEXT_GB2312;
-       *bytes = ((r+0x21) << 8) | (c+0x21);
-       return 1;
-    } else if (unicode_to_ksx1001(ucs, &r, &c)) {
-       *subcharset = CTEXT_KSC5601;
-       *bytes = ((r+0x21) << 8) | (c+0x21);
-       return 1;
-    } else if (unicode_to_jisx0208(ucs, &r, &c)) {
-       *subcharset = CTEXT_JISX0208;
-       *bytes = ((r+0x21) << 8) | (c+0x21);
-       return 1;
-    } else if (unicode_to_jisx0212(ucs, &r, &c)) {
-       *subcharset = CTEXT_JISX0212;
-       *bytes = ((r+0x21) << 8) | (c+0x21);
-       return 1;
-    } else {
-       return 0;
-    }
-}
-#define SEQ(str,cont,cs) \
-    {str,~(63<<(6*(((cont)&~RO)))),(cs)<<(6*(((cont)&~RO))),(cont),(cs)}
-/*
- * Compound text defines restrictions on which container can take
- * which character sets. Things labelled `left half of' can only go
- * in GL; things labelled `right half of' can only go in GR; and 96
- * or 96^n character sets only _fit_ in GR. Thus:
- *  - ASCII can only go in GL since it is the left half of 8859-*.
- *  - All the 8859 sets can only go in GR.
- *  - JISX0201 left is GL only; JISX0201 right is GR only.
- *  - The three multibyte sets (GB2312, JISX0208, KSC5601) can go
- *    in either; we prefer GR where possible since this leads to a
- *    more compact EUC-like encoding.
- */
-static const struct iso2022_escape ctext_escapes[] = {
-    SEQ("\033$(A", 0|RO, CTEXT_GB2312),
-    SEQ("\033$(B", 0|RO, CTEXT_JISX0208),
-    SEQ("\033$(C", 0|RO, CTEXT_KSC5601),
-    SEQ("\033$(D", 0|RO, CTEXT_JISX0212),
-    SEQ("\033$)A", 1, CTEXT_GB2312),
-    SEQ("\033$)B", 1, CTEXT_JISX0208),
-    SEQ("\033$)C", 1, CTEXT_KSC5601),
-    SEQ("\033$)D", 1, CTEXT_JISX0212),
-    SEQ("\033(B", 0, CTEXT_ASCII),
-    SEQ("\033(J", 0, CTEXT_JISX0201_LEFT),
-    SEQ("\033)I", 1, CTEXT_JISX0201_RIGHT),
-    SEQ("\033-A", 1, CTEXT_ISO8859_1),
-    SEQ("\033-B", 1, CTEXT_ISO8859_2),
-    SEQ("\033-C", 1, CTEXT_ISO8859_3),
-    SEQ("\033-D", 1, CTEXT_ISO8859_4),
-    SEQ("\033-F", 1, CTEXT_ISO8859_7),
-    SEQ("\033-G", 1, CTEXT_ISO8859_6),
-    SEQ("\033-H", 1, CTEXT_ISO8859_8),
-    SEQ("\033-L", 1, CTEXT_ISO8859_5),
-    SEQ("\033-M", 1, CTEXT_ISO8859_9),
-
-    /*
-     * Cross-testing against Xutf8TextListToTextProperty() turns up
-     * some additional character sets and ISO 2022 features
-     * supported by that and not by us:
-     * 
-     *         - Single-byte right-hand-half character sets `ESC - f',
-     *           `ESC - T' and `ESC - Y'.
-     * 
-     *         - A really horrifying mechanism used to escape completely
-     *           from the ISO 2022 framework: ESC % / <length>
-     *           <charset-name> <text>. Xutf8* uses this to encode
-     *           "iso8859-14", "iso8859-15" and "big5-0".
-     *            * This mechanism is particularly nasty because we can't
-     *              efficiently encode it on the fly! It requires that the
-     *              length of the text encoded in the foreign charset is
-     *              given _before_ the text in question, so if we're
-     *              receiving one character at a time we simply can't look
-     *              ahead and so we would have to encode each individual
-     *              character in a separate one of these sequences.
-     * 
-     *         - ESC % G and ESC % @ to shift to and from UTF-8 mode, as a
-     *           last resort for anything we still don't support.
-     *            * Interestingly, ctext.ps actually _disallows_ this: it
-     *              says that the above extension mechanism is the only
-     *              one permitted. Ho hum.
-     */
-};
-static const struct iso2022 ctext = {
-    ctext_escapes, lenof(ctext_escapes),
-    "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\2\2",  /* must match the enum above */
-    "", 0x80000000 | (CTEXT_ASCII<<0) | (CTEXT_ISO8859_1<<6), "", TRUE,
-    ctext_to_ucs, ctext_from_ucs
-};
-const charset_spec charset_CS_CTEXT = {
-    CS_CTEXT, read_iso2022s, write_iso2022s, &ctext
-};
-
  #else /* ENUM_CHARSETS */
  
  ENUM_CHARSET(CS_ISO2022_JP)
  ENUM_CHARSET(CS_ISO2022_KR)
-ENUM_CHARSET(CS_CTEXT)
  
  #endif /* ENUM_CHARSETS */