The COMPOUND_TEXT encoding used by some X applications to transfer

author simon <simon@cda61777-01e9-0310-a592-d414129be87e>

Sat, 25 Sep 2004 13:24:27 +0000 (13:24 +0000)

committer simon <simon@cda61777-01e9-0310-a592-d414129be87e>

Sat, 25 Sep 2004 13:24:27 +0000 (13:24 +0000)
author simon <simon@cda61777-01e9-0310-a592-d414129be87e>
Sat, 25 Sep 2004 13:24:27 +0000 (13:24 +0000)
committer simon <simon@cda61777-01e9-0310-a592-d414129be87e>
Sat, 25 Sep 2004 13:24:27 +0000 (13:24 +0000)
diff --git a/.cvsignore b/.cvsignore

index dbd5675..e63d048 100644 (file)
--- a/.cvsignore
+++ b/.cvsignore
@@ -1 +1 @@
-sbcsdat.c convcs
+sbcsdat.c sbcsdat.h convcs cstable
diff --git a/Makefile b/Makefile

index bf937c4..400d39c 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -37,13 +37,21 @@
  
  $(LIBCHARSET_GENPFX)all: \
         $(LIBCHARSET_OBJDIR)libcharset.a \
  
  $(LIBCHARSET_GENPFX)all: \
         $(LIBCHARSET_OBJDIR)libcharset.a \
-       $(LIBCHARSET_OBJDIR)convcs
+       $(LIBCHARSET_OBJDIR)convcs \
+       $(LIBCHARSET_OBJDIR)cstable
  
  
-$(LIBCHARSET_OBJDIR)convcs: $(LIBCHARSET_OBJDIR)libcharset.a
+$(LIBCHARSET_OBJDIR)convcs: $(LIBCHARSET_SRCDIR)test.c \
+       $(LIBCHARSET_OBJDIR)libcharset.a
         $(CC) $(CFLAGS) -o $(LIBCHARSET_OBJDIR)convcs \
                 $(LIBCHARSET_SRCDIR)test.c \
                 $(LIBCHARSET_OBJDIR)libcharset.a
  
         $(CC) $(CFLAGS) -o $(LIBCHARSET_OBJDIR)convcs \
                 $(LIBCHARSET_SRCDIR)test.c \
                 $(LIBCHARSET_OBJDIR)libcharset.a
  
+$(LIBCHARSET_OBJDIR)cstable: $(LIBCHARSET_SRCDIR)cstable.c \
+       $(LIBCHARSET_OBJDIR)libcharset.a
+       $(CC) $(CFLAGS) -o $(LIBCHARSET_OBJDIR)cstable \
+               $(LIBCHARSET_SRCDIR)cstable.c \
+               $(LIBCHARSET_OBJDIR)libcharset.a
+
  LIBCHARSET_OBJS = \
         $(LIBCHARSET_OBJDIR)$(LIBCHARSET_OBJPFX)big5enc.o \
         $(LIBCHARSET_OBJDIR)$(LIBCHARSET_OBJPFX)big5set.o \
  LIBCHARSET_OBJS = \
         $(LIBCHARSET_OBJDIR)$(LIBCHARSET_OBJPFX)big5enc.o \
         $(LIBCHARSET_OBJDIR)$(LIBCHARSET_OBJPFX)big5set.o \
@@ -105,7 +113,8 @@ $(LIBCHARSET_OBJDIR)$(LIBCHARSET_OBJPFX)hz.o: \
         $(CC) $(CFLAGS) $(MD) -c -o $@ $<
  
  $(LIBCHARSET_OBJDIR)$(LIBCHARSET_OBJPFX)iso2022s.o: \
         $(CC) $(CFLAGS) $(MD) -c -o $@ $<
  
  $(LIBCHARSET_OBJDIR)$(LIBCHARSET_OBJPFX)iso2022s.o: \
-       $(LIBCHARSET_SRCDIR)iso2022s.c
+       $(LIBCHARSET_SRCDIR)iso2022s.c \
+       $(LIBCHARSET_OBJDIR)sbcsdat.h
         $(CC) $(CFLAGS) $(MD) -c -o $@ $<
  
  $(LIBCHARSET_OBJDIR)$(LIBCHARSET_OBJPFX)istate.o: \
         $(CC) $(CFLAGS) $(MD) -c -o $@ $<
  
  $(LIBCHARSET_OBJDIR)$(LIBCHARSET_OBJPFX)istate.o: \
@@ -183,14 +192,17 @@ $(LIBCHARSET_OBJDIR)$(LIBCHARSET_OBJPFX)sbcsdat.o: \
         $(LIBCHARSET_OBJDIR)sbcsdat.c
         $(CC) $(CFLAGS) $(MD) -c -o $@ $<
  
         $(LIBCHARSET_OBJDIR)sbcsdat.c
         $(CC) $(CFLAGS) $(MD) -c -o $@ $<
  
-$(LIBCHARSET_OBJDIR)sbcsdat.c: \
+$(LIBCHARSET_OBJDIR)sbcsdat.c $(LIBCHARSET_OBJDIR)sbcsdat.h: \
         $(LIBCHARSET_SRCDIR)sbcs.dat \
         $(LIBCHARSET_SRCDIR)sbcsgen.pl
         perl $(LIBCHARSET_SRCDIR)sbcsgen.pl \
         $(LIBCHARSET_SRCDIR)sbcs.dat \
         $(LIBCHARSET_SRCDIR)sbcsgen.pl
         perl $(LIBCHARSET_SRCDIR)sbcsgen.pl \
-               $(LIBCHARSET_SRCDIR)sbcs.dat $@
+               $(LIBCHARSET_SRCDIR)sbcs.dat \
+               $(LIBCHARSET_OBJDIR)sbcsdat.c \
+               $(LIBCHARSET_OBJDIR)sbcsdat.h
  
  $(LIBCHARSET_GENPFX)clean:
         rm -f $(LIBCHARSET_OBJDIR)$(LIBCHARSET_OBJPFX)*.o \
                 $(LIBCHARSET_OBJDIR)libcharset.a \
                 $(LIBCHARSET_OBJDIR)sbcsdat.c \
  
  $(LIBCHARSET_GENPFX)clean:
         rm -f $(LIBCHARSET_OBJDIR)$(LIBCHARSET_OBJPFX)*.o \
                 $(LIBCHARSET_OBJDIR)libcharset.a \
                 $(LIBCHARSET_OBJDIR)sbcsdat.c \
+               $(LIBCHARSET_OBJDIR)sbcsdat.h \
                 $(LIBCHARSET_OBJDIR)convcs
                 $(LIBCHARSET_OBJDIR)convcs
diff --git a/charset.h b/charset.h

index 883dcfb..ebaafc9 100644 (file)
--- a/charset.h
+++ b/charset.h
@@ -45,6 +45,7 @@ typedef enum {
      CS_KOI8_R,
      CS_KOI8_U,
      CS_KOI8_RU,
      CS_KOI8_R,
      CS_KOI8_U,
      CS_KOI8_RU,
+    CS_JISX0201,
      CS_MAC_ROMAN,
      CS_MAC_TURKISH,
      CS_MAC_CROATIAN,
      CS_MAC_ROMAN,
      CS_MAC_TURKISH,
      CS_MAC_CROATIAN,
@@ -84,6 +85,7 @@ typedef enum {
      CS_HZ,
      CS_CP949,
      CS_PDF,
      CS_HZ,
      CS_CP949,
      CS_PDF,
+    CS_CTEXT,
  } charset_t;
  
  typedef struct {
  } charset_t;
  
  typedef struct {
diff --git a/cstable.c b/cstable.c

new file mode 100644 (file)

index 0000000..1336aac
--- /dev/null
+++ b/cstable.c
@@ -0,0 +1,78 @@
+/*
+ * cstable.c - libcharset supporting utility which draws up a map
+ * of the whole Unicode BMP and annotates it with details of which
+ * other character sets each character appears in.
+ * 
+ * Note this is not a libcharset _client_; it is part of the
+ * libcharset _package_, using libcharset internals.
+ */
+
+#include "charset.h"
+#include "internal.h"
+#include "sbcsdat.h"
+
+#define ENUM_CHARSET(x) extern charset_spec const charset_##x;
+#include "enum.c"
+#undef ENUM_CHARSET
+static charset_spec const *const cs_table[] = {
+#define ENUM_CHARSET(x) &charset_##x,
+#include "enum.c"
+#undef ENUM_CHARSET
+};
+
+/*
+ * Print one line per Unicode BMP code point (U+0000 to U+FFFF).
+ * Each line names every charset checked here that contains the
+ * code point: all single-byte charsets in cs_table, then Big5,
+ * GB2312, JIS X 0208, KS X 1001 and CP949. A code point found in
+ * none of them is labelled `unicode-only'. Always returns 0.
+ */
+int main(void)
+{
+    long int c;
+
+    for (c = 0; c < 0x10000; c++) {
+       int i, row, col;
+       char const *sep = "";          /* becomes ";" after the first hit */
+
+       /* c is a long int, so the conversion needs the `l' modifier. */
+       printf("U+%04lx:", c);
+
+       /*
+        * Look up in SBCSes. Only table entries whose reader is
+        * read_sbcs carry sbcs_data, so only those are probed.
+        */
+       for (i = 0; i < lenof(cs_table); i++)
+           if (cs_table[i]->read == read_sbcs &&
+               sbcs_from_unicode(cs_table[i]->data, c) != ERROR) {
+               printf("%s %s", sep,
+                      charset_to_localenc(cs_table[i]->charset));
+               sep = ";";
+           }
+
+       /*
+        * Look up individually in MBCS base charsets. The row and
+        * column results are not used; only success matters.
+        */
+       if (unicode_to_big5(c, &row, &col)) {
+           printf("%s Big5", sep);
+           sep = ";";
+       }
+       if (unicode_to_gb2312(c, &row, &col)) {
+           printf("%s GB2312", sep);
+           sep = ";";
+       }
+
+       if (unicode_to_jisx0208(c, &row, &col)) {
+           printf("%s JIS X 0208", sep);
+           sep = ";";
+       }
+
+       if (unicode_to_ksx1001(c, &row, &col)) {
+           printf("%s KS X 1001", sep);
+           sep = ";";
+       }
+
+       if (unicode_to_cp949(c, &row, &col)) {
+           printf("%s CP949", sep);
+           sep = ";";
+       }
+
+       /* sep is still empty only if nothing above matched. */
+       if (!*sep)
+           printf(" unicode-only");
+
+       printf("\n");
+    }
+
+    return 0;
+}
diff --git a/internal.h b/internal.h

index ce9f146..df5996f 100644 (file)
--- a/internal.h
+++ b/internal.h
@@ -91,6 +91,8 @@ void read_sbcs(charset_spec const *charset, long int input_chr,
  int write_sbcs(charset_spec const *charset, long int input_chr,
                charset_state *state,
                void (*emit)(void *ctx, long int output), void *emitctx);
  int write_sbcs(charset_spec const *charset, long int input_chr,
                charset_state *state,
                void (*emit)(void *ctx, long int output), void *emitctx);
+long int sbcs_to_unicode(const struct sbcs_data *sd, long int input_chr);
+long int sbcs_from_unicode(const struct sbcs_data *sd, long int input_chr);
  
  long int big5_to_unicode(int r, int c);
  int unicode_to_big5(long int unicode, int *r, int *c);
  
  long int big5_to_unicode(int r, int c);
  int unicode_to_big5(long int unicode, int *r, int *c);
diff --git a/iso2022s.c b/iso2022s.c

index e202207..fd75ab4 100644 (file)
--- a/iso2022s.c
+++ b/iso2022s.c
@@ -14,6 +14,7 @@
  
  #include "charset.h"
  #include "internal.h"
  
  #include "charset.h"
  #include "internal.h"
+#include "sbcsdat.h"
  
  #define SO (0x0E)
  #define SI (0x0F)
  
  #define SO (0x0E)
  #define SI (0x0F)
@@ -26,9 +27,14 @@ struct iso2022_escape {
      /*
       * For output, these variables help us figure out which escape
       * sequences we need to get where we want to be.
      /*
       * For output, these variables help us figure out which escape
       * sequences we need to get where we want to be.
+     * 
+     * `container' should be in the range 0-3, but can also be ORed
+     * with the bit flag RO to indicate that this is not a
+     * preferred container to use for this charset during output.
       */
      int container, subcharset;
  };
       */
      int container, subcharset;
  };
+#define RO 0x80
  
  struct iso2022 {
      /*
  
  struct iso2022 {
      /*
@@ -74,6 +80,11 @@ struct iso2022 {
      char const *initial_sequence;
  
      /*
      char const *initial_sequence;
  
      /*
+     * Is this an 8-bit ISO 2022 subset?
+     */
+    int eightbit;
+
+    /*
       * Function calls to do the actual translation.
       */
      long int (*to_ucs)(int subcharset, unsigned long bytes);
       * Function calls to do the actual translation.
       */
      long int (*to_ucs)(int subcharset, unsigned long bytes);
@@ -115,6 +126,11 @@ static void read_iso2022s(charset_spec const *charset, long int input_chr,
       *              either 2 or 3.
       *            + Hence: 0 is SI, 1 is SO, 4 is SS2-from-SI, 5 is
       *              SS2-from-SO, 6 is SS3-from-SI, 7 is SS3-from-SO.
       *              either 2 or 3.
       *            + Hence: 0 is SI, 1 is SO, 4 is SS2-from-SI, 5 is
       *              SS2-from-SO, 6 is SS3-from-SI, 7 is SS3-from-SO.
+     *            + For added fun: in an _8-bit_ ISO 2022 subset, we have
+     *              the further special value 2, which means that we're
+     *              theoretically in SI but the current character being
+     *              accumulated is composed of 8-bit characters and will
+     *              therefore be interpreted as if in SO.
       * 
       *         - The next nibble of s1 (27:24) indicates how many bytes
       *           have been accumulated in the current character.
       * 
       *         - The next nibble of s1 (27:24) indicates how many bytes
       *           have been accumulated in the current character.
@@ -214,9 +230,12 @@ static void read_iso2022s(charset_spec const *charset, long int input_chr,
      /*
       * If this isn't an escape sequence, it must be part of a
       * character. One possibility is that it's a control character
      /*
       * If this isn't an escape sequence, it must be part of a
       * character. One possibility is that it's a control character
-     * (outside the space 21-7E), in which case we output it verbatim.
+     * (00-20 or 7F-9F; also in non-8-bit ISO 2022 subsets I'm
+     * going to treat all top-half characters as controls), in
+     * which case we output it verbatim.
       */
       */
-    if (input_chr < 0x21 || input_chr > 0x7E) {
+    if (input_chr < 0x21 ||
+       (input_chr > 0x7E && (!iso->eightbit || input_chr < 0xA0))) {
         /*
          * We might be in mid-multibyte-character. If so, clear the
          * character state and emit an error token for the
         /*
          * We might be in mid-multibyte-character. If so, clear the
          * character state and emit an error token for the
@@ -245,8 +264,45 @@ static void read_iso2022s(charset_spec const *charset, long int input_chr,
         unsigned long chr;
         int chrlen, cont, subcharset, bytes;
  
         unsigned long chr;
         int chrlen, cont, subcharset, bytes;
  
+       /*
+        * Verify that we've seen the right kind of character for
+        * what we're currently doing. This only matters in 8-bit
+        * subsets.
+        */
+       if (iso->eightbit) {
+           cont = (state->s1 >> 28) & 7;
+           /*
+            * If cont==0, we're entitled to see either GL or GR
+            * characters. If cont==2, we expect only GR; otherwise
+            * we expect only GL.
+            * 
+            * If we see a GR character while cont==0, we set
+            * cont=2 immediately.
+            */
+           if ((cont == 2 && !(input_chr & 0x80)) ||
+               (cont != 0 && cont != 2 && (input_chr & 0x80))) {
+               /*
+                * Clear the previous character; it was prematurely
+                * terminated by this error.
+                */
+               state->s1 &= ~0x0F000000;
+               state->s0 &= 0xFF000000;
+               emit(emitctx, ERROR);
+               /*
+                * If we were in the SS2 or SS3 container, we
+                * automatically exit it.
+                */
+               if (state->s1 & 0x60000000)
+                   state->s1 &= 0x9FFFFFFF;
+           }
+
+           if (cont == 0 && (input_chr & 0x80)) {
+               state->s1 |= 0x20000000;
+           }
+       }
+
         /* The current character and its length. */
         /* The current character and its length. */
-       chr = ((state->s0 & 0x00FFFFFF) << 8) | input_chr;
+       chr = ((state->s0 & 0x00FFFFFF) << 8) | (input_chr & 0x7F);
         chrlen = ((state->s1 >> 24) & 0xF) + 1;
         /* The current sub-charset. */
         cont = (state->s1 >> 28) & 7;
         chrlen = ((state->s1 >> 24) & 0xF) + 1;
         /* The current sub-charset. */
         cont = (state->s1 >> 28) & 7;
@@ -280,7 +336,7 @@ static int write_iso2022s(charset_spec const *charset, long int input_chr,
                           void *emitctx)
  {
      struct iso2022 const *iso = (struct iso2022 *)charset->data;
                           void *emitctx)
  {
      struct iso2022 const *iso = (struct iso2022 *)charset->data;
-    int subcharset, len, i, j, cont;
+    int subcharset, len, i, j, cont, topbit = 0;
      unsigned long bytes;
  
      /*
      unsigned long bytes;
  
      /*
@@ -333,7 +389,8 @@ static int write_iso2022s(charset_spec const *charset, long int input_chr,
       * necessary, and then output the given bytes.
       */
      for (i = 0; i < iso->nescapes; i++)
       * necessary, and then output the given bytes.
       */
      for (i = 0; i < iso->nescapes; i++)
-       if (iso->escapes[i].subcharset == subcharset)
+       if (iso->escapes[i].subcharset == subcharset &&
+           !(iso->escapes[i].container & RO))
             break;
      assert(i < iso->nescapes);
  
             break;
      assert(i < iso->nescapes);
  
@@ -343,7 +400,7 @@ static int write_iso2022s(charset_spec const *charset, long int input_chr,
       * already _be_ selected in that container! Check before we go
       * to the effort of emitting the sequence.
       */
       * already _be_ selected in that container! Check before we go
       * to the effort of emitting the sequence.
       */
-    cont = iso->escapes[i].container;
+    cont = iso->escapes[i].container &~ RO;
      if (((state->s1 >> (6*cont)) & 0x3F) != (unsigned)subcharset) {
         for (j = 0; iso->escapes[i].sequence[j]; j++)
             emit(emitctx, iso->escapes[i].sequence[j]);
      if (((state->s1 >> (6*cont)) & 0x3F) != (unsigned)subcharset) {
         for (j = 0; iso->escapes[i].sequence[j]; j++)
             emit(emitctx, iso->escapes[i].sequence[j]);
@@ -360,9 +417,17 @@ static int write_iso2022s(charset_spec const *charset, long int input_chr,
         emit(emitctx, ESC);
         emit(emitctx, 'L' + cont);     /* comes out to 'N' or 'O' */
      } else {
         emit(emitctx, ESC);
         emit(emitctx, 'L' + cont);     /* comes out to 'N' or 'O' */
      } else {
-       /* Emit SI or SO, but only if the current container isn't already
-        * the right one. */
-       if (((state->s1 >> 28) & 7) != (unsigned)cont) {
+       /*
+        * Emit SI or SO, but only if the current container isn't already
+        * the right one.
+        * 
+        * Also, in an 8-bit subset, we need not do this; we'll
+        * just use 8-bit characters to output SO-container
+        * characters.
+        */
+       if (iso->eightbit && cont == 1 && ((state->s1 >> 28) & 7) == 0) {
+           topbit = 0x80;
+       } else if (((state->s1 >> 28) & 7) != (unsigned)cont) {
             emit(emitctx, cont ? SO : SI);
             state->s1 = (state->s1 & 0x8FFFFFFF) | (cont << 28);
         }
             emit(emitctx, cont ? SO : SI);
             state->s1 = (state->s1 & 0x8FFFFFFF) | (cont << 28);
         }
@@ -374,7 +439,7 @@ static int write_iso2022s(charset_spec const *charset, long int input_chr,
       */
      len = iso->nbytes[subcharset];
      while (len--)
       */
      len = iso->nbytes[subcharset];
      while (len--)
-       emit(emitctx, (bytes >> (8*len)) & 0xFF);
+       emit(emitctx, ((bytes >> (8*len)) & 0xFF) | topbit);
  
      return TRUE;
  }
  
      return TRUE;
  }
@@ -425,7 +490,8 @@ static struct iso2022_escape iso2022jp_escapes[] = {
  };
  static struct iso2022 iso2022jp = {
      iso2022jp_escapes, lenof(iso2022jp_escapes),
  };
  static struct iso2022 iso2022jp = {
      iso2022jp_escapes, lenof(iso2022jp_escapes),
-    "\1\1\2", "\3", 0x80000000, NULL, iso2022jp_to_ucs, iso2022jp_from_ucs
+    "\1\1\2", "\3", 0x80000000, NULL, FALSE,
+    iso2022jp_to_ucs, iso2022jp_from_ucs
  };
  const charset_spec charset_CS_ISO2022_JP = {
      CS_ISO2022_JP, read_iso2022s, write_iso2022s, &iso2022jp
  };
  const charset_spec charset_CS_ISO2022_JP = {
      CS_ISO2022_JP, read_iso2022s, write_iso2022s, &iso2022jp
@@ -466,15 +532,192 @@ static struct iso2022_escape iso2022kr_escapes[] = {
  };
  static struct iso2022 iso2022kr = {
      iso2022kr_escapes, lenof(iso2022kr_escapes),
  };
  static struct iso2022 iso2022kr = {
      iso2022kr_escapes, lenof(iso2022kr_escapes),
-    "\1\2", "\2", 0x80000040, "\033$)C", iso2022kr_to_ucs, iso2022kr_from_ucs
+    "\1\2", "\2", 0x80000040, "\033$)C", FALSE,
+    iso2022kr_to_ucs, iso2022kr_from_ucs
  };
  const charset_spec charset_CS_ISO2022_KR = {
      CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr
  };
  
  };
  const charset_spec charset_CS_ISO2022_KR = {
      CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr
  };
  
+/*
+ * The COMPOUND_TEXT encoding used in X selections. Defined by the
+ * X consortium.
+ * 
+ * This encoding has quite a few sub-charsets. The order I assign
+ * to them here is given in an enum.
+ */
+enum {
+    /* This must match the bytes-per-character string given below. */
+    CTEXT_ASCII,
+    CTEXT_JISX0201_LEFT,
+    CTEXT_JISX0201_RIGHT,
+    CTEXT_ISO8859_1,
+    CTEXT_ISO8859_2,
+    CTEXT_ISO8859_3,
+    CTEXT_ISO8859_4,
+    CTEXT_ISO8859_5,
+    CTEXT_ISO8859_6,
+    CTEXT_ISO8859_7,
+    CTEXT_ISO8859_8,
+    CTEXT_ISO8859_9,
+    CTEXT_GB2312,
+    CTEXT_KSC5601,
+    CTEXT_JISX0208
+};
+/*
+ * Translate one character of a COMPOUND_TEXT sub-charset to Unicode.
+ *
+ * `subcharset' is one of the CTEXT_* enum values and `bytes' holds
+ * the accumulated character bytes. ASCII is returned unchanged.
+ * The other single-byte sets go through sbcs_to_unicode with the
+ * top bit forced clear (JIS X 0201 left half) or forced set (JIS X
+ * 0201 right half and the ISO 8859 sets). The two-byte sets
+ * subtract 0x21 from each byte to form the row and column passed
+ * to the matching *_to_unicode function. An unrecognised
+ * sub-charset yields ERROR.
+ */
+static long int ctext_to_ucs(int subcharset, unsigned long bytes)
+{
+    switch (subcharset) {
+      case CTEXT_ASCII: return bytes;         /* one-byte ASCII */
+      case CTEXT_JISX0201_LEFT:        /* ASCII with yen and overline */
+       return sbcs_to_unicode(&sbcsdata_CS_JISX0201, bytes & 0x7F);
+      case CTEXT_JISX0201_RIGHT:       /* JIS X 0201 half-width katakana */
+       return sbcs_to_unicode(&sbcsdata_CS_JISX0201, (bytes & 0x7F) | 0x80);
+      case CTEXT_ISO8859_1:
+       return sbcs_to_unicode(&sbcsdata_CS_ISO8859_1, (bytes & 0x7F) | 0x80);
+      case CTEXT_ISO8859_2:
+       return sbcs_to_unicode(&sbcsdata_CS_ISO8859_2, (bytes & 0x7F) | 0x80);
+      case CTEXT_ISO8859_3:
+       return sbcs_to_unicode(&sbcsdata_CS_ISO8859_3, (bytes & 0x7F) | 0x80);
+      case CTEXT_ISO8859_4:
+       return sbcs_to_unicode(&sbcsdata_CS_ISO8859_4, (bytes & 0x7F) | 0x80);
+      case CTEXT_ISO8859_5:
+       return sbcs_to_unicode(&sbcsdata_CS_ISO8859_5, (bytes & 0x7F) | 0x80);
+      case CTEXT_ISO8859_6:
+       return sbcs_to_unicode(&sbcsdata_CS_ISO8859_6, (bytes & 0x7F) | 0x80);
+      case CTEXT_ISO8859_7:
+       return sbcs_to_unicode(&sbcsdata_CS_ISO8859_7, (bytes & 0x7F) | 0x80);
+      case CTEXT_ISO8859_8:
+       return sbcs_to_unicode(&sbcsdata_CS_ISO8859_8, (bytes & 0x7F) | 0x80);
+      case CTEXT_ISO8859_9:
+       return sbcs_to_unicode(&sbcsdata_CS_ISO8859_9, (bytes & 0x7F) | 0x80);
+      case CTEXT_GB2312:
+       return gb2312_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
+                                ((bytes     ) & 0xFF) - 0x21);
+      case CTEXT_KSC5601:
+       return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
+                                 ((bytes     ) & 0xFF) - 0x21);
+      case CTEXT_JISX0208:
+       return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
+                                  ((bytes     ) & 0xFF) - 0x21);
+      default: return ERROR;
+    }
+}
+/*
+ * Find a COMPOUND_TEXT sub-charset able to represent the Unicode
+ * value `ucs'.
+ *
+ * Candidates are tried in a fixed order and the first match wins:
+ * ASCII (anything below 0x80), ISO 8859-1 to 8859-9 in numerical
+ * order, JIS X 0201, GB2312, KS C 5601, then JIS X 0208. On
+ * success, *subcharset receives the CTEXT_* value, *bytes receives
+ * the character bytes, and 1 is returned. If nothing matches, 0 is
+ * returned and neither output is written.
+ *
+ * Bytes are stored in 7-bit form: right-half single-byte codes
+ * have 0x80 subtracted, and multibyte row/column values are offset
+ * by 0x21.
+ *
+ * NOTE(review): sbcs_from_unicode returns long int but the result
+ * is stored in the int `c'; this assumes ERROR survives that
+ * conversion - confirm against the definition of ERROR.
+ */
+static int ctext_from_ucs(long int ucs, int *subcharset, unsigned long *bytes)
+{
+    int r, c;
+    if (ucs < 0x80) {
+       *subcharset = CTEXT_ASCII;
+       *bytes = ucs;
+       return 1;
+    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_1, ucs)) != ERROR) {
+       *subcharset = CTEXT_ISO8859_1;
+       *bytes = c - 0x80;
+       return 1;
+    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_2, ucs)) != ERROR) {
+       *subcharset = CTEXT_ISO8859_2;
+       *bytes = c - 0x80;
+       return 1;
+    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_3, ucs)) != ERROR) {
+       *subcharset = CTEXT_ISO8859_3;
+       *bytes = c - 0x80;
+       return 1;
+    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_4, ucs)) != ERROR) {
+       *subcharset = CTEXT_ISO8859_4;
+       *bytes = c - 0x80;
+       return 1;
+    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_5, ucs)) != ERROR) {
+       *subcharset = CTEXT_ISO8859_5;
+       *bytes = c - 0x80;
+       return 1;
+    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_6, ucs)) != ERROR) {
+       *subcharset = CTEXT_ISO8859_6;
+       *bytes = c - 0x80;
+       return 1;
+    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_7, ucs)) != ERROR) {
+       *subcharset = CTEXT_ISO8859_7;
+       *bytes = c - 0x80;
+       return 1;
+    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_8, ucs)) != ERROR) {
+       *subcharset = CTEXT_ISO8859_8;
+       *bytes = c - 0x80;
+       return 1;
+    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_9, ucs)) != ERROR) {
+       *subcharset = CTEXT_ISO8859_9;
+       *bytes = c - 0x80;
+       return 1;
+    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_JISX0201, ucs)) != ERROR) {
+       /* One table serves both halves; the code's top bit picks which. */
+       if (c < 0x80) {
+           *subcharset = CTEXT_JISX0201_LEFT;
+       } else {
+           *subcharset = CTEXT_JISX0201_RIGHT;
+           c -= 0x80;
+       }
+       *bytes = c;
+       return 1;
+    } else if (unicode_to_gb2312(ucs, &r, &c)) {
+       *subcharset = CTEXT_GB2312;
+       *bytes = ((r+0x21) << 8) | (c+0x21);
+       return 1;
+    } else if (unicode_to_ksx1001(ucs, &r, &c)) {
+       *subcharset = CTEXT_KSC5601;
+       *bytes = ((r+0x21) << 8) | (c+0x21);
+       return 1;
+    } else if (unicode_to_jisx0208(ucs, &r, &c)) {
+       *subcharset = CTEXT_JISX0208;
+       *bytes = ((r+0x21) << 8) | (c+0x21);
+       return 1;
+    } else {
+       return 0;
+    }
+}
+#define SEQ(str,cont,cs) \
+    {str,~(63<<(6*((cont&~RO)))),(cs)<<(6*((cont&~RO))),(cont),(cs)}
+/*
+ * Compound text defines restrictions on which container can take
+ * which character sets. Things labelled `left half of' can only go
+ * in GL; things labelled `right half of' can only go in GR; and 96
+ * or 96^n character sets only _fit_ in GR. Thus:
+ *  - ASCII can only go in GL since it is the left half of 8859-*.
+ *  - All the 8859 sets can only go in GR.
+ *  - JISX0201 left is GL only; JISX0201 right is GR only.
+ *  - The three multibyte sets (GB2312, JISX0208, KSC5601) can go
+ *    in either; we prefer GR where possible since this leads to a
+ *    more compact EUC-like encoding.
+ */
+static struct iso2022_escape ctext_escapes[] = {
+    SEQ("\033$(A", 0|RO, CTEXT_GB2312),
+    SEQ("\033$(B", 0|RO, CTEXT_JISX0208),
+    SEQ("\033$(C", 0|RO, CTEXT_KSC5601),
+    SEQ("\033$)A", 1, CTEXT_GB2312),
+    SEQ("\033$)B", 1, CTEXT_JISX0208),
+    SEQ("\033$)C", 1, CTEXT_KSC5601),
+    SEQ("\033(B", 0, CTEXT_ASCII),
+    SEQ("\033(J", 0, CTEXT_JISX0201_LEFT),
+    SEQ("\033-A", 1, CTEXT_ISO8859_1),
+    SEQ("\033-B", 1, CTEXT_ISO8859_2),
+    SEQ("\033-C", 1, CTEXT_ISO8859_3),
+    SEQ("\033-D", 1, CTEXT_ISO8859_4),
+    SEQ("\033-F", 1, CTEXT_ISO8859_7),
+    SEQ("\033-G", 1, CTEXT_ISO8859_6),
+    SEQ("\033-H", 1, CTEXT_ISO8859_8),
+    SEQ("\033)I", 1, CTEXT_JISX0201_RIGHT),
+    SEQ("\033-L", 1, CTEXT_ISO8859_5),
+    SEQ("\033-M", 1, CTEXT_ISO8859_9),
+};
+static struct iso2022 ctext = {
+    ctext_escapes, lenof(ctext_escapes),
+    "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\2",  /* must match the enum above */
+    "", 0x80000000 | (CTEXT_ASCII<<0) | (CTEXT_ASCII<<6), "", TRUE,
+    ctext_to_ucs, ctext_from_ucs
+};
+const charset_spec charset_CS_CTEXT = {
+    CS_CTEXT, read_iso2022s, write_iso2022s, &ctext
+};
+
  #else /* ENUM_CHARSETS */
  
  ENUM_CHARSET(CS_ISO2022_JP)
  ENUM_CHARSET(CS_ISO2022_KR)
  #else /* ENUM_CHARSETS */
  
  ENUM_CHARSET(CS_ISO2022_JP)
  ENUM_CHARSET(CS_ISO2022_KR)
+ENUM_CHARSET(CS_CTEXT)
  
  #endif /* ENUM_CHARSETS */
  
  #endif /* ENUM_CHARSETS */
diff --git a/localenc.c b/localenc.c

index 1382224..b620c66 100644 (file)
--- a/localenc.c
+++ b/localenc.c
@@ -67,6 +67,10 @@ static const struct {
      { "KOI8-R", CS_KOI8_R, 1 },
      { "KOI8-U", CS_KOI8_U, 1 },
      { "KOI8-RU", CS_KOI8_RU, 1 },
      { "KOI8-R", CS_KOI8_R, 1 },
      { "KOI8-U", CS_KOI8_U, 1 },
      { "KOI8-RU", CS_KOI8_RU, 1 },
+    { "JIS X 0201", CS_JISX0201, 1 },
+    { "JIS-X-0201", CS_JISX0201, 1 },
+    { "JIS_X_0201", CS_JISX0201, 1 },
+    { "JISX0201", CS_JISX0201, 1 },
      { "Mac Roman", CS_MAC_ROMAN, 1 },
      { "Mac Turkish", CS_MAC_TURKISH, 1 },
      { "Mac Croatian", CS_MAC_CROATIAN, 1 },
      { "Mac Roman", CS_MAC_ROMAN, 1 },
      { "Mac Turkish", CS_MAC_TURKISH, 1 },
      { "Mac Croatian", CS_MAC_CROATIAN, 1 },
@@ -106,6 +110,11 @@ static const struct {
      { "UTF-16", CS_UTF16, 1 },
      { "CP949", CS_CP949, 1 },
      { "PDFDocEncoding", CS_PDF, 1 },
      { "UTF-16", CS_UTF16, 1 },
      { "CP949", CS_CP949, 1 },
      { "PDFDocEncoding", CS_PDF, 1 },
+    { "COMPOUND_TEXT", CS_CTEXT, 1 },
+    { "COMPOUND-TEXT", CS_CTEXT, 1 },
+    { "COMPOUND TEXT", CS_CTEXT, 1 },
+    { "COMPOUNDTEXT", CS_CTEXT, 1 },
+    { "CTEXT", CS_CTEXT, 1 },
  };
  
  const char *charset_to_localenc(int charset)
  };
  
  const char *charset_to_localenc(int charset)
diff --git a/mimeenc.c b/mimeenc.c

index fd7e975..4751923 100644 (file)
--- a/mimeenc.c
+++ b/mimeenc.c
@@ -201,6 +201,10 @@ static const struct {
  
      { "KOI8-RU", CS_KOI8_RU },        /* WILD */
  
  
      { "KOI8-RU", CS_KOI8_RU },        /* WILD */
  
+    { "JIS_X0201", CS_JISX0201 },
+    { "X0201", CS_JISX0201 },
+    { "csHalfWidthKatakana", CS_JISX0201 },
+
      { "macintosh", CS_MAC_ROMAN_OLD },
      { "mac", CS_MAC_ROMAN_OLD },
      { "csMacintosh", CS_MAC_ROMAN_OLD },
      { "macintosh", CS_MAC_ROMAN_OLD },
      { "mac", CS_MAC_ROMAN_OLD },
      { "csMacintosh", CS_MAC_ROMAN_OLD },
diff --git a/sbcs.c b/sbcs.c

index bf4e4b1..ab7b997 100644 (file)
--- a/sbcs.c
+++ b/sbcs.c
@@ -12,6 +12,11 @@
   * of the translation table.
   */
  
   * of the translation table.
   */
  
+/*
+ * Translate one character of a single-byte charset to Unicode by
+ * indexing the charset's sbcs2ucs table directly with input_chr.
+ * No range check is made here; presumably callers only pass values
+ * that are valid indices into sbcs2ucs - TODO confirm table size.
+ */
+long int sbcs_to_unicode(const struct sbcs_data *sd, long int input_chr)
+{
+    return sd->sbcs2ucs[input_chr];
+}
+
  void read_sbcs(charset_spec const *charset, long int input_chr,
                charset_state *state,
                void (*emit)(void *ctx, long int output), void *emitctx)
  void read_sbcs(charset_spec const *charset, long int input_chr,
                charset_state *state,
                void (*emit)(void *ctx, long int output), void *emitctx)
@@ -20,21 +25,13 @@ void read_sbcs(charset_spec const *charset, long int input_chr,
  
      UNUSEDARG(state);
  
  
      UNUSEDARG(state);
  
-    emit(emitctx, sd->sbcs2ucs[input_chr]);
+    emit(emitctx, sbcs_to_unicode(sd, input_chr));
  }
  
  }
  
-int write_sbcs(charset_spec const *charset, long int input_chr,
-              charset_state *state,
-              void (*emit)(void *ctx, long int output), void *emitctx)
+long int sbcs_from_unicode(const struct sbcs_data *sd, long int input_chr)
  {
  {
-    const struct sbcs_data *sd = charset->data;
      int i, j, k, c;
  
      int i, j, k, c;
  
-    UNUSEDARG(state);
-
-    if (input_chr == -1)
-       return TRUE;                   /* stateless; no cleanup required */
-
      /*
       * Binary-search in the ucs2sbcs table.
       */
      /*
       * Binary-search in the ucs2sbcs table.
       */
@@ -48,9 +45,22 @@ int write_sbcs(charset_spec const *charset, long int input_chr,
         else if (input_chr > (long int)sd->sbcs2ucs[c])
             i = k;
         else {
         else if (input_chr > (long int)sd->sbcs2ucs[c])
             i = k;
         else {
-           emit(emitctx, c);
-           return TRUE;
+           return c;
         }
      }
         }
      }
-    return FALSE;
+    return ERROR;
+}
+
+/*
+ * Write one Unicode character in a single-byte charset.
+ *
+ * input_chr is the Unicode value to encode, or -1 to request
+ * end-of-output cleanup (a no-op, since SBCSes are stateless).
+ * The encoded byte is passed to emit(emitctx, ...).
+ *
+ * Returns TRUE on success or cleanup, and FALSE without emitting
+ * anything if the charset cannot represent input_chr.
+ */
+int write_sbcs(charset_spec const *charset, long int input_chr,
+              charset_state *state,
+              void (*emit)(void *ctx, long int output), void *emitctx)
+{
+    const struct sbcs_data *sd = charset->data;
+    long int ret;
+
+    UNUSEDARG(state);
+
+    if (input_chr == -1)
+       return TRUE;                   /* stateless; no cleanup required */
+
+    /*
+     * sbcs_from_unicode reports an unmappable character as ERROR.
+     * That must become a FALSE return, not an emitted output value.
+     */
+    ret = sbcs_from_unicode(sd, input_chr);
+    if (ret == ERROR)
+       return FALSE;
+
+    emit(emitctx, ret);
+    return TRUE;
+}
  }
diff --git a/sbcs.dat b/sbcs.dat

index da77cc9..75080f3 100644 (file)
--- a/sbcs.dat
+++ b/sbcs.dat
@@ -653,6 +653,32 @@ charset CS_KOI8_RU
  042e 0410 0411 0426 0414 0415 0424 0413 0425 0418 0419 041a 041b 041c 041d 041e
  041f 042f 0420 0421 0422 0423 0416 0412 042c 042b 0417 0428 042d 0429 0427 042a
  
  042e 0410 0411 0426 0414 0415 0424 0413 0425 0418 0419 041a 041b 041c 041d 041e
  041f 042f 0420 0421 0422 0423 0416 0412 042c 042b 0417 0428 042d 0429 0427 042a
  
+  JIS X 0201, also known as JIS-Roman. Bottom half is basically
+  ASCII, but with yen in place of backslash and overline in place of
+  tilde. Top half contains half-width katakana. Generated by the
+  following Bourne shell commands:
+
+    echo charset CS_JISX0201
+    gensbcs http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0201.TXT
+
+charset CS_JISX0201
+0000 0001 0002 0003 0004 0005 0006 0007 0008 0009 000a 000b 000c 000d 000e 000f
+0010 0011 0012 0013 0014 0015 0016 0017 0018 0019 001a 001b 001c 001d 001e 001f
+0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
+0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
+0040 0041 0042 0043 0044 0045 0046 0047 0048 0049 004a 004b 004c 004d 004e 004f
+0050 0051 0052 0053 0054 0055 0056 0057 0058 0059 005a 005b 00a5 005d 005e 005f
+0060 0061 0062 0063 0064 0065 0066 0067 0068 0069 006a 006b 006c 006d 006e 006f
+0070 0071 0072 0073 0074 0075 0076 0077 0078 0079 007a 007b 007c 007d 203e 007f
+XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX
+XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX
+XXXX ff61 ff62 ff63 ff64 ff65 ff66 ff67 ff68 ff69 ff6a ff6b ff6c ff6d ff6e ff6f
+ff70 ff71 ff72 ff73 ff74 ff75 ff76 ff77 ff78 ff79 ff7a ff7b ff7c ff7d ff7e ff7f
+ff80 ff81 ff82 ff83 ff84 ff85 ff86 ff87 ff88 ff89 ff8a ff8b ff8c ff8d ff8e ff8f
+ff90 ff91 ff92 ff93 ff94 ff95 ff96 ff97 ff98 ff99 ff9a ff9b ff9c ff9d ff9e ff9f
+XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX
+XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX
+
    Various Mac character sets, generated by:
  
    for i in ROMAN TURKISH CROATIAN ICELAND ROMANIAN GREEK CYRILLIC THAI \
    Various Mac character sets, generated by:
  
    for i in ROMAN TURKISH CROATIAN ICELAND ROMANIAN GREEK CYRILLIC THAI \
diff --git a/sbcsgen.pl b/sbcsgen.pl

index 02aaa0f..d5b83a8 100644 (file)
--- a/sbcsgen.pl
+++ b/sbcsgen.pl
@@ -7,6 +7,8 @@ $infile = "sbcs.dat";
  $infile = shift @ARGV if defined $ARGV[0];
  $outfile = "sbcsdat.c";
  $outfile = shift @ARGV if defined $ARGV[0];
  $infile = shift @ARGV if defined $ARGV[0];
  $outfile = "sbcsdat.c";
  $outfile = shift @ARGV if defined $ARGV[0];
+$outheader = "sbcsdat.h";
+$outheader = shift @ARGV if defined $ARGV[0];
  
  open FOO, $infile;
  open BAR, ">$outfile";
  
  open FOO, $infile;
  open BAR, ">$outfile";
@@ -65,11 +67,37 @@ foreach $i (@charsetnames) {
  print "\n";
  print "#endif /* ENUM_CHARSETS */\n";
  
  print "\n";
  print "#endif /* ENUM_CHARSETS */\n";
  
+# Buffered write errors on the .c output only surface at close time.
+close BAR or die "$outfile: close: $!\n";
+
+# Second output file: a C header with an extern declaration for
+# every sbcsdata_<name> structure written to the .c file.
+open BAR, '>', $outheader or die "$outheader: open: $!\n";
+select BAR;
+
+print "/*\n";
+print " * sbcsdat.h - header file for SBCS data structures.\n";
+print " *\n";
+print " * Generated by sbcsgen.pl from sbcs.dat.\n";
+print " * You should edit those files rather than editing this one.\n";
+print " */\n";
+print "\n";
+print "#ifndef charset_sbcsdat_h\n";
+print "#define charset_sbcsdat_h\n";
+print "\n";
+print "#include \"charset.h\"\n";
+print "#include \"internal.h\"\n";
+print "\n";
+foreach $i (@charsetnames) {
+    print "extern const sbcs_data sbcsdata_$i;\n";
+}
+print "\n";
+print "#endif /* charset_sbcsdat_h */\n";
+
+close BAR or die "$outheader: close: $!\n";
+
  sub outcharset($$$) {
      my ($name, $vals, $sortpriority) = @_;
      my ($prefix, $i, @sorted);
  
  sub outcharset($$$) {
      my ($name, $vals, $sortpriority) = @_;
      my ($prefix, $i, @sorted);
  
-    print "static const sbcs_data data_$name = {\n";
+    print "const sbcs_data sbcsdata_$name = {\n";
      print "    {\n";
      $prefix = "    ";
      @sorted = ();
      print "    {\n";
      $prefix = "    ";
      @sorted = ();
@@ -107,5 +135,5 @@ sub outcharset($$$) {
      printf "\n    },\n    %d\n", $j;
      print "};\n";
      print "const charset_spec charset_$name = {\n" .
      printf "\n    },\n    %d\n", $j;
      print "};\n";
      print "const charset_spec charset_$name = {\n" .
-          "    $name, read_sbcs, write_sbcs, &data_$name\n};\n\n";
+          "    $name, read_sbcs, write_sbcs, &sbcsdata_$name\n};\n\n";
  }
  }
author	simon <simon@cda61777-01e9-0310-a592-d414129be87e>
	Sat, 25 Sep 2004 13:24:27 +0000 (13:24 +0000)
committer	simon <simon@cda61777-01e9-0310-a592-d414129be87e>
	Sat, 25 Sep 2004 13:24:27 +0000 (13:24 +0000)
.cvsignore		patch \| blob \| blame \| history
Makefile		patch \| blob \| blame \| history
charset.h		patch \| blob \| blame \| history
cstable.c	[new file with mode: 0644]	patch \| blob
internal.h		patch \| blob \| blame \| history
iso2022s.c		patch \| blob \| blame \| history
localenc.c		patch \| blob \| blame \| history
mimeenc.c		patch \| blob \| blame \| history
sbcs.c		patch \| blob \| blame \| history
sbcs.dat		patch \| blob \| blame \| history
sbcsgen.pl		patch \| blob \| blame \| history