EUC-TW implementation, plus an explanation of why ISO-2022-CN is difficult.

author ben <ben@cda61777-01e9-0310-a592-d414129be87e>

Sat, 24 Sep 2005 17:50:36 +0000 (17:50 +0000)

committer ben <ben@cda61777-01e9-0310-a592-d414129be87e>

Sat, 24 Sep 2005 17:50:36 +0000 (17:50 +0000)
author ben <ben@cda61777-01e9-0310-a592-d414129be87e>
Sat, 24 Sep 2005 17:50:36 +0000 (17:50 +0000)
committer ben <ben@cda61777-01e9-0310-a592-d414129be87e>
Sat, 24 Sep 2005 17:50:36 +0000 (17:50 +0000)
diff --git a/README b/README

index 456dfd8..8eb7c25 100644 (file)
--- a/README
+++ b/README
@@ -25,8 +25,10 @@ not currently support. Those that I know of are:
     machinery in place, and support all the underlying character
     sets.
  
- - ISO-2022-CN and ISO-2022-CN-EXT (RFC 1922), and EUC-TW. These
-   encodings depend on the CNS 11643-1992 character set.
+ - ISO-2022-CN and ISO-2022-CN-EXT (RFC 1922). These are a little tricky
+   as they allow use of both GB2312 (simplified Chinese) and CNS 11643
+   (traditional Chinese), so when encoding a character found in both
+   we may need some way to specify which character set to prefer.
  
   - The Hong Kong (HKSCS) extension to Big5. Again, mapping tables
     are available in the Unihan database.
diff --git a/charset.h b/charset.h

index cc7b324..5d64877 100644 (file)
--- a/charset.h
+++ b/charset.h
@@ -88,7 +88,8 @@ typedef enum {
      CS_CTEXT,
      CS_ISO2022,
      CS_BS4730,
-    CS_DEC_GRAPHICS
+    CS_DEC_GRAPHICS,
+    CS_EUC_TW
  } charset_t;
  
  typedef struct {
diff --git a/euc.c b/euc.c

index 5d33a6f..b3d43ff 100644 (file)
--- a/euc.c
+++ b/euc.c
@@ -226,10 +226,47 @@ const charset_spec charset_CS_EUC_JP = {
      CS_EUC_JP, read_euc, write_euc, &euc_jp
  };
  
+/*
+ * EUC-TW encodes CNS 11643 (all planes).
+ *
+ * euc_tw_to_ucs: convert an accumulated EUC-TW state word into a
+ * Unicode code point.
+ *
+ * Layout of `state', matching what euc_tw_from_ucs() builds:
+ *   bits 28..31  1 => two-byte form, always plane index 0
+ *                2 => form carrying an explicit plane byte
+ *   bits 16..23  plane byte, biased by 0xA1 (tag 2 only)
+ *   bits  8..15  row byte, biased by 0xA1
+ *   bits  0..7   column byte, biased by 0xA1
+ *
+ * Returns whatever cns11643_to_unicode() yields for the decoded
+ * (plane, row, column), or ERROR for an unknown tag or a plane index
+ * outside 0..6.
+ */
+static long int euc_tw_to_ucs(unsigned long state)
+{
+    int plane;
+    int row = (int)((state >> 8) & 0xFF) - 0xA1;
+    int col = (int)(state & 0xFF) - 0xA1;
+
+    switch (state >> 28) {
+      case 1:
+        return cns11643_to_unicode(0, row, col);
+      case 2:
+        /*
+         * The plane byte sits above the row byte, at bits 16..23.
+         * Reading it from bits 8..15 would just repeat the row.
+         */
+        plane = (int)((state >> 16) & 0xFF) - 0xA1;
+        if (plane < 0 || plane >= 7)
+            return ERROR;
+        return cns11643_to_unicode(plane, row, col);
+      default:
+        return ERROR;
+    }
+}
+/*
+ * euc_tw_from_ucs: convert a Unicode code point into an EUC-TW state
+ * word.
+ *
+ * Asks unicode_to_cns11643() for the (plane, row, column) of `ucs'.
+ * Plane index 0 yields tag 1 in the top nibble with row and column
+ * (each biased by 0xA1) in the low two bytes; any other plane yields
+ * tag 2 with the biased plane byte added at bits 16..23.
+ *
+ * Returns 0 when unicode_to_cns11643() reports failure.
+ */
+static unsigned long euc_tw_from_ucs(long int ucs)
+{
+    int plane, row, col;
+    int low;
+
+    if (!unicode_to_cns11643(ucs, &plane, &row, &col))
+        return 0;
+
+    /* Row and column bytes are common to both encoded forms. */
+    low = ((row + 0xA1) << 8) | (col + 0xA1);
+
+    if (plane == 0)
+        return 0x10000000 | low;
+
+    return 0x20000000 | ((plane + 0xA1) << 16) | low;
+}
+/*
+ * EUC-TW parameter block: the two conversion functions above plus a
+ * three-element count array. charset_CS_EUC_TW points at this
+ * alongside read_euc/write_euc.
+ * NOTE(review): {2,3,0} are presumably the byte counts for each EUC
+ * code set -- confirm against the definition of struct euc.
+ */
+static const struct euc euc_tw = {
+    {2,3,0}, euc_tw_to_ucs, euc_tw_from_ucs
+};
+/*
+ * Charset descriptor for CS_EUC_TW: pairs the read_euc/write_euc
+ * routines with the EUC-TW parameter block &euc_tw.
+ */
+const charset_spec charset_CS_EUC_TW = {
+    CS_EUC_TW, read_euc, write_euc, &euc_tw
+};
+
  #else /* ENUM_CHARSETS */
  
  ENUM_CHARSET(CS_EUC_CN)
  ENUM_CHARSET(CS_EUC_KR)
  ENUM_CHARSET(CS_EUC_JP)
+ENUM_CHARSET(CS_EUC_TW)
  
  #endif /* ENUM_CHARSETS */
diff --git a/localenc.c b/localenc.c

index 1df0fd5..869579c 100644 (file)
--- a/localenc.c
+++ b/localenc.c
@@ -104,6 +104,7 @@ static const struct {
      { "EUC-CN", CS_EUC_CN, 1 },
      { "EUC-KR", CS_EUC_KR, 1 },
      { "EUC-JP", CS_EUC_JP, 1 },
+    { "EUC-TW", CS_EUC_TW, 1 },
      { "ISO-2022-JP", CS_ISO2022_JP, 1 },
      { "ISO-2022-KR", CS_ISO2022_KR, 1 },
      { "Big5", CS_BIG5, 1 },
author	ben <ben@cda61777-01e9-0310-a592-d414129be87e>
	Sat, 24 Sep 2005 17:50:36 +0000 (17:50 +0000)
committer	ben <ben@cda61777-01e9-0310-a592-d414129be87e>
	Sat, 24 Sep 2005 17:50:36 +0000 (17:50 +0000)
README		patch \| blob \| blame \| history
charset.h		patch \| blob \| blame \| history
euc.c		patch \| blob \| blame \| history
localenc.c		patch \| blob \| blame \| history