From b063a840d1dded0455a70fc3e71ef8f92e8644ab Mon Sep 17 00:00:00 2001 From: ben Date: Sat, 24 Sep 2005 17:50:36 +0000 Subject: [PATCH] EUC-TW implementation, plus an explanation of why ISO-2022-CN is difficult. git-svn-id: svn://svn.tartarus.org/sgt/charset@6353 cda61777-01e9-0310-a592-d414129be87e --- README | 6 ++++-- charset.h | 3 ++- euc.c | 37 +++++++++++++++++++++++++++++++++++++ localenc.c | 1 + 4 files changed, 44 insertions(+), 3 deletions(-) diff --git a/README b/README index 456dfd8..8eb7c25 100644 --- a/README +++ b/README @@ -25,8 +25,10 @@ not currently support. Those that I know of are: machinery in place, and support all the underlying character sets. - - ISO-2022-CN and ISO-2022-CN-EXT (RFC 1922), and EUC-TW. These - encodings depend on the CNS 11643-1992 character set. + - ISO-2022-CN and ISO-2022-CN-EXT (RFC 1922). These are a little tricky + as they allow use of both GB2312 (simplified Chinese) and CNS 11643 + (traditional Chinese), so we may need some way to specify which to + prefer. - The Hong Kong (HKSCS) extension to Big5. Again, mapping tables are available in the Unihan database. diff --git a/charset.h b/charset.h index cc7b324..5d64877 100644 --- a/charset.h +++ b/charset.h @@ -88,7 +88,8 @@ typedef enum { CS_CTEXT, CS_ISO2022, CS_BS4730, - CS_DEC_GRAPHICS + CS_DEC_GRAPHICS, + CS_EUC_TW } charset_t; typedef struct { diff --git a/euc.c b/euc.c index 5d33a6f..b3d43ff 100644 --- a/euc.c +++ b/euc.c @@ -226,10 +226,47 @@ const charset_spec charset_CS_EUC_JP = { CS_EUC_JP, read_euc, write_euc, &euc_jp }; +/* + * EUC-TW encodes CNS 11643 (all planes). 
+ */
+static long int euc_tw_to_ucs(unsigned long state)
+{
+    int plane;
+    switch (state >> 28) {
+      case 1: return cns11643_to_unicode(0, ((state >> 8) & 0xFF) - 0xA1,
+                                         ((state     ) & 0xFF) - 0xA1);
+      case 2: /* plane byte is in bits 16-23, as packed by euc_tw_from_ucs */
+        plane = ((state >> 16) & 0xFF) - 0xA1;
+        if (plane >= 7) return ERROR;
+        return cns11643_to_unicode(plane, ((state >> 8) & 0xFF) - 0xA1,
+                                   ((state     ) & 0xFF) - 0xA1);
+      default: return ERROR;
+    }
+}
+static unsigned long euc_tw_from_ucs(long int ucs)
+{
+    int p, r, c;
+    if (unicode_to_cns11643(ucs, &p, &r, &c)) {
+        if (p == 0)
+            return 0x10000000 | ((r+0xA1) << 8) | (c+0xA1);
+        else
+            return 0x20000000 |
+                ((p + 0xA1) << 16) | ((r+0xA1) << 8) | (c+0xA1);
+    } else
+        return 0;
+}
+static const struct euc euc_tw = {
+    {2,3,0}, euc_tw_to_ucs, euc_tw_from_ucs
+};
+const charset_spec charset_CS_EUC_TW = {
+    CS_EUC_TW, read_euc, write_euc, &euc_tw
+};
+
 #else /* ENUM_CHARSETS */
 
 ENUM_CHARSET(CS_EUC_CN)
 ENUM_CHARSET(CS_EUC_KR)
 ENUM_CHARSET(CS_EUC_JP)
+ENUM_CHARSET(CS_EUC_TW)
 
 #endif /* ENUM_CHARSETS */
diff --git a/localenc.c b/localenc.c
index 1df0fd5..869579c 100644
--- a/localenc.c
+++ b/localenc.c
@@ -104,6 +104,7 @@ static const struct {
     { "EUC-CN", CS_EUC_CN, 1 },
     { "EUC-KR", CS_EUC_KR, 1 },
     { "EUC-JP", CS_EUC_JP, 1 },
+    { "EUC-TW", CS_EUC_TW, 1 },
     { "ISO-2022-JP", CS_ISO2022_JP, 1 },
     { "ISO-2022-KR", CS_ISO2022_KR, 1 },
     { "Big5", CS_BIG5, 1 },
-- 
2.11.0