From 32361bda545f94503610f2cdfc85374a07c56635 Mon Sep 17 00:00:00 2001
From: ben
Date: Mon, 9 Apr 2007 11:29:22 +0000
Subject: [PATCH] Add a mechanism for translating to and from the coding system
 symbols used by GNU Emacs. This is likely to be useful for generating or
 interpreting "coding:" entries in file local variables.

git-svn-id: svn://svn.tartarus.org/sgt/charset@7455 cda61777-01e9-0310-a592-d414129be87e
---
 Makefile   |   1 +
 charset.h  |   7 ++++
 emacsenc.c | 147 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 localenc.c |   2 +
 4 files changed, 157 insertions(+)
 create mode 100644 emacsenc.c

diff --git a/Makefile b/Makefile
index fc956df..3426bbf 100644
--- a/Makefile
+++ b/Makefile
@@ -57,6 +57,7 @@ LIBCHARSET_OBJS = \
 	$(LIBCHARSET_OBJDIR)$(LIBCHARSET_OBJPFX)big5set.o \
 	$(LIBCHARSET_OBJDIR)$(LIBCHARSET_OBJPFX)cns11643.o \
 	$(LIBCHARSET_OBJDIR)$(LIBCHARSET_OBJPFX)cp949.o \
+	$(LIBCHARSET_OBJDIR)$(LIBCHARSET_OBJPFX)emacsenc.o \
 	$(LIBCHARSET_OBJDIR)$(LIBCHARSET_OBJPFX)euc.o \
 	$(LIBCHARSET_OBJDIR)$(LIBCHARSET_OBJPFX)fromucs.o \
 	$(LIBCHARSET_OBJDIR)$(LIBCHARSET_OBJPFX)gb2312.o \
diff --git a/charset.h b/charset.h
index c8e9f16..545784f 100644
--- a/charset.h
+++ b/charset.h
@@ -198,6 +198,13 @@ int charset_from_macenc(int script, int region, int sysvers,
 			const char *fontname);
 
 /*
+ * Convert GNU Emacs coding system symbol to and from our charset
+ * identifiers.
+ */
+const char *charset_to_emacsenc(int charset);
+int charset_from_emacsenc(const char *name);
+
+/*
  * Upgrade a charset identifier to a superset charset which is
  * often confused with it. For example, people whose MUAs report
  * their mail as ASCII or ISO8859-1 often in practice turn out to
diff --git a/emacsenc.c b/emacsenc.c
new file mode 100644
index 0000000..f1046bb
--- /dev/null
+++ b/emacsenc.c
@@ -0,0 +1,147 @@
+/*
+ * emacsenc.c - translate our internal character set codes to and from
+ * GNU Emacs coding system symbols. Derived from running M-x
+ * list-coding-systems in Emacs 21.3.
+ *
+ */
+
+#include <ctype.h>
+#include "charset.h"
+#include "internal.h"
+
+static const struct {
+    const char *name;
+    int charset;
+} emacsencs[] = {
+    /*
+     * Where multiple encoding names map to the same encoding id
+     * (such as iso-latin-1 and iso-8859-1), the first is considered
+     * canonical and will be returned when translating the id to a
+     * string.
+     */
+    { "us-ascii", CS_ASCII },
+    { "iso-latin-9", CS_ISO8859_15 },
+    { "iso-8859-15", CS_ISO8859_15 },
+    { "latin-9", CS_ISO8859_15 },
+    { "latin-0", CS_ISO8859_15 },
+    { "iso-latin-1", CS_ISO8859_1 },
+    { "iso-8859-1", CS_ISO8859_1 },
+    { "latin-1", CS_ISO8859_1 },
+    { "iso-latin-2", CS_ISO8859_2 },
+    { "iso-8859-2", CS_ISO8859_2 },
+    { "latin-2", CS_ISO8859_2 },
+    { "iso-latin-3", CS_ISO8859_3 },
+    { "iso-8859-3", CS_ISO8859_3 },
+    { "latin-3", CS_ISO8859_3 },
+    { "iso-latin-4", CS_ISO8859_4 },
+    { "iso-8859-4", CS_ISO8859_4 },
+    { "latin-4", CS_ISO8859_4 },
+    { "cyrillic-iso-8bit", CS_ISO8859_5 },
+    { "iso-8859-5", CS_ISO8859_5 },
+    { "greek-iso-8bit", CS_ISO8859_7 },
+    { "iso-8859-7", CS_ISO8859_7 },
+    { "hebrew-iso-8bit", CS_ISO8859_8 },
+    { "iso-8859-8", CS_ISO8859_8 },
+    { "iso-8859-8-e", CS_ISO8859_8 },
+    { "iso-8859-8-i", CS_ISO8859_8 },
+    { "iso-latin-5", CS_ISO8859_9 },
+    { "iso-8859-9", CS_ISO8859_9 },
+    { "latin-5", CS_ISO8859_9 },
+    { "chinese-big5", CS_BIG5 },
+    { "big5", CS_BIG5 },
+    { "cn-big5", CS_BIG5 },
+    { "cp437", CS_CP437 },
+    { "cp850", CS_CP850 },
+    { "cp866", CS_CP866 },
+    { "cp1250", CS_CP1250 },
+    { "cp1251", CS_CP1251 },
+    { "cp1253", CS_CP1253 },
+    { "cp1257", CS_CP1257 },
+    { "japanese-iso-8bit", CS_EUC_JP },
+    { "euc-japan-1990", CS_EUC_JP },
+    { "euc-japan", CS_EUC_JP },
+    { "euc-jp", CS_EUC_JP },
+    { "iso-2022-jp", CS_ISO2022_JP },
+    { "junet", CS_ISO2022_JP },
+    { "korean-iso-8bit", CS_EUC_KR },
+    { "euc-kr", CS_EUC_KR },
+    { "euc-korea", CS_EUC_KR },
+    { "iso-2022-kr", CS_ISO2022_KR },
+    { "korean-iso-7bit-lock", CS_ISO2022_KR },
+    { "mac-roman", CS_MAC_ROMAN },
+    { "cyrillic-koi8", CS_KOI8_R },
+    { "koi8-r", CS_KOI8_R },
+    { "koi8", CS_KOI8_R },
+    { "japanese-shift-jis", CS_SHIFT_JIS },
+    { "shift_jis", CS_SHIFT_JIS },
+    { "sjis", CS_SHIFT_JIS },
+    { "thai-tis620", CS_ISO8859_11 },
+    { "th-tis620", CS_ISO8859_11 },
+    { "tis620", CS_ISO8859_11 },
+    { "tis-620", CS_ISO8859_11 },
+    { "mule-utf-16-be", CS_UTF16BE },
+    { "utf-16-be", CS_UTF16BE },
+    { "mule-utf-16-le", CS_UTF16LE },
+    { "utf-16-le", CS_UTF16LE },
+    { "mule-utf-8", CS_UTF8 },
+    { "utf-8", CS_UTF8 },
+    { "vietnamese-viscii", CS_VISCII },
+    { "viscii", CS_VISCII },
+    { "iso-latin-8", CS_ISO8859_14 },
+    { "iso-8859-14", CS_ISO8859_14 },
+    { "latin-8", CS_ISO8859_14 },
+    { "compound-text", CS_CTEXT },
+    { "x-ctext", CS_CTEXT },
+    { "ctext", CS_CTEXT },
+    { "chinese-hz", CS_HZ },
+    { "hz-gb-2312", CS_HZ },
+    { "hz", CS_HZ },
+};
+
+/*
+ * Return the canonical GNU Emacs coding system symbol for a charset
+ * identifier: the first table entry whose id matches. Returns NULL
+ * if the charset has no entry in the table.
+ */
+const char *charset_to_emacsenc(int charset)
+{
+    int i;
+
+    for (i = 0; i < (int)lenof(emacsencs); i++)
+	if (charset == emacsencs[i].charset)
+	    return emacsencs[i].name;
+
+    return NULL;		       /* not found */
+}
+
+/*
+ * Look up a GNU Emacs coding system symbol and return the matching
+ * charset identifier, or CS_NONE if the name is NULL or not in the
+ * table. The comparison ignores case.
+ */
+int charset_from_emacsenc(const char *name)
+{
+    int i;
+
+    if (!name)
+	return CS_NONE;
+
+    for (i = 0; i < (int)lenof(emacsencs); i++) {
+	const char *p, *q;
+	p = name;
+	q = emacsencs[i].name;
+	while (*p || *q) {
+	    /*
+	     * <ctype.h> functions need an unsigned char value; a
+	     * plain char with the top bit set would be undefined.
+	     */
+	    if (tolower((unsigned char)*p) != tolower((unsigned char)*q))
+		break;
+	    p++; q++;
+	}
+	if (!*p && !*q)
+	    return emacsencs[i].charset;
+    }
+
+    return CS_NONE;		       /* not found */
+}
diff --git a/localenc.c b/localenc.c
index 115ea7c..5c761a8 100644
--- a/localenc.c
+++ b/localenc.c
@@ -145,6 +145,8 @@ int charset_from_localenc(const char *name)
 	return i;
     if ( (i = charset_from_xenc(name)) != CS_NONE)
 	return i;
+    if ( (i = charset_from_emacsenc(name)) != CS_NONE)
+	return i;
 
     for (i = 0; i < (int)lenof(localencs); i++) {
 	const char *p, *q;
-- 
2.11.0