Silly of me to overlook it: another obvious way you might like to
[sgt/charset] / mimeenc.c
1 /*
2 * mimeenc.c - translate our internal character set codes to and
3 * from MIME standard character-set names.
4 *
5 */
6
7 #include <ctype.h>
8 #include "charset.h"
9 #include "internal.h"
10
/*
 * mimeencs[]: table pairing MIME character-set names with internal
 * charset identifiers (the CS_* constants).
 *
 * Both charset_to_mimeenc() and charset_from_mimeenc() scan this
 * table linearly from the top, so the order of entries matters: the
 * first name listed for a given identifier is the one
 * charset_to_mimeenc() returns. Lookups by name in
 * charset_from_mimeenc() ignore letter case, so the capitalisation
 * used here only affects the strings handed back to callers.
 */
11 static const struct {
12 const char *name; /* MIME charset name or alias */
13 int charset; /* internal CS_* identifier the name maps to */
14 } mimeencs[] = {
15 /*
16 * Most of these names are taken from
17 *
18 * http://www.iana.org/assignments/character-sets
19 *
20 * Where multiple encoding names map to the same encoding id
21 * (such as the variety of aliases for ISO-8859-1), the first
22 * is considered canonical and will be returned when
23 * translating the id to a string.
24 *
25 * I also list here a few names which aren't in the above web
26 * page, but which I've seen in the wild in real mail. These
27 * are marked with a comment saying WILD.
28 */
29
30 { "US-ASCII", CS_ASCII },
31 { "ANSI_X3.4-1968", CS_ASCII },
32 { "iso-ir-6", CS_ASCII },
33 { "ANSI_X3.4-1986", CS_ASCII },
34 { "ISO_646.irv:1991", CS_ASCII },
35 { "ASCII", CS_ASCII },
36 { "ISO646-US", CS_ASCII },
37 { "us", CS_ASCII },
38 { "IBM367", CS_ASCII },
39 { "cp367", CS_ASCII },
40 { "csASCII", CS_ASCII },
41 { "646", CS_ASCII }, /* WILD */
42
43 { "BS_4730", CS_BS4730 },
44 { "iso-ir-4", CS_BS4730 },
45 { "ISO646-GB", CS_BS4730 },
46 { "gb", CS_BS4730 },
47 { "uk", CS_BS4730 },
48 { "csISO4UnitedKingdom", CS_BS4730 },
49
50 { "ISO-8859-1", CS_ISO8859_1 },
51 { "ISO8859-1", CS_ISO8859_1 }, /* WILD */
52 { "iso-ir-100", CS_ISO8859_1 },
53 { "ISO_8859-1", CS_ISO8859_1 },
54 { "ISO_8859-1:1987", CS_ISO8859_1 },
55 { "latin1", CS_ISO8859_1 },
56 { "l1", CS_ISO8859_1 },
57 { "IBM819", CS_ISO8859_1 },
58 { "CP819", CS_ISO8859_1 },
59 { "csISOLatin1", CS_ISO8859_1 },
60
61 { "ISO-8859-2", CS_ISO8859_2 },
62 { "ISO8859-2", CS_ISO8859_2 }, /* WILD */
63 { "ISO_8859-2:1987", CS_ISO8859_2 },
64 { "iso-ir-101", CS_ISO8859_2 },
65 { "ISO_8859-2", CS_ISO8859_2 },
66 { "latin2", CS_ISO8859_2 },
67 { "l2", CS_ISO8859_2 },
68 { "csISOLatin2", CS_ISO8859_2 },
69
70 { "ISO-8859-3", CS_ISO8859_3 },
71 { "ISO8859-3", CS_ISO8859_3 }, /* WILD */
72 { "ISO_8859-3:1988", CS_ISO8859_3 },
73 { "iso-ir-109", CS_ISO8859_3 },
74 { "ISO_8859-3", CS_ISO8859_3 },
75 { "latin3", CS_ISO8859_3 },
76 { "l3", CS_ISO8859_3 },
77 { "csISOLatin3", CS_ISO8859_3 },
78
79 { "ISO-8859-4", CS_ISO8859_4 },
80 { "ISO8859-4", CS_ISO8859_4 }, /* WILD */
81 { "ISO_8859-4:1988", CS_ISO8859_4 },
82 { "iso-ir-110", CS_ISO8859_4 },
83 { "ISO_8859-4", CS_ISO8859_4 },
84 { "latin4", CS_ISO8859_4 },
85 { "l4", CS_ISO8859_4 },
86 { "csISOLatin4", CS_ISO8859_4 },
87
88 { "ISO-8859-5", CS_ISO8859_5 },
89 { "ISO8859-5", CS_ISO8859_5 }, /* WILD */
90 { "ISO_8859-5:1988", CS_ISO8859_5 },
91 { "iso-ir-144", CS_ISO8859_5 },
92 { "ISO_8859-5", CS_ISO8859_5 },
93 { "cyrillic", CS_ISO8859_5 },
94 { "csISOLatinCyrillic", CS_ISO8859_5 },
95
96 { "ISO-8859-6", CS_ISO8859_6 },
97 { "ISO8859-6", CS_ISO8859_6 }, /* WILD */
98 { "ISO_8859-6:1987", CS_ISO8859_6 },
99 { "iso-ir-127", CS_ISO8859_6 },
100 { "ISO_8859-6", CS_ISO8859_6 },
101 { "ECMA-114", CS_ISO8859_6 },
102 { "ASMO-708", CS_ISO8859_6 },
103 { "arabic", CS_ISO8859_6 },
104 { "csISOLatinArabic", CS_ISO8859_6 },
105
106 { "ISO-8859-7", CS_ISO8859_7 },
107 { "ISO8859-7", CS_ISO8859_7 }, /* WILD */
108 { "ISO_8859-7:1987", CS_ISO8859_7 },
109 { "iso-ir-126", CS_ISO8859_7 },
110 { "ISO_8859-7", CS_ISO8859_7 },
111 { "ELOT_928", CS_ISO8859_7 },
112 { "ECMA-118", CS_ISO8859_7 },
113 { "greek", CS_ISO8859_7 },
114 { "greek8", CS_ISO8859_7 },
115 { "csISOLatinGreek", CS_ISO8859_7 },
116
117 { "ISO-8859-8", CS_ISO8859_8 },
118 { "ISO8859-8", CS_ISO8859_8 }, /* WILD */
119 { "ISO_8859-8:1988", CS_ISO8859_8 },
120 { "iso-ir-138", CS_ISO8859_8 },
121 { "ISO_8859-8", CS_ISO8859_8 },
122 { "hebrew", CS_ISO8859_8 },
123 { "csISOLatinHebrew", CS_ISO8859_8 },
124
125 { "ISO-8859-9", CS_ISO8859_9 },
126 { "ISO8859-9", CS_ISO8859_9 }, /* WILD */
127 { "ISO_8859-9:1989", CS_ISO8859_9 },
128 { "iso-ir-148", CS_ISO8859_9 },
129 { "ISO_8859-9", CS_ISO8859_9 },
130 { "latin5", CS_ISO8859_9 },
131 { "l5", CS_ISO8859_9 },
132 { "csISOLatin5", CS_ISO8859_9 },
133
134 { "ISO-8859-10", CS_ISO8859_10 },
135 { "ISO8859-10", CS_ISO8859_10 }, /* WILD */
136 { "iso-ir-157", CS_ISO8859_10 },
137 { "l6", CS_ISO8859_10 },
138 { "ISO_8859-10:1992", CS_ISO8859_10 },
139 { "csISOLatin6", CS_ISO8859_10 },
140 { "latin6", CS_ISO8859_10 },
141
/*
 * NOTE(review): TIS-620 is the only name listed for CS_ISO8859_11,
 * so it is also the name charset_to_mimeenc() returns for that id;
 * no "ISO-8859-11" spelling is accepted here.
 */
142 { "TIS-620", CS_ISO8859_11 },
143
144 { "ISO-8859-13", CS_ISO8859_13 },
145 { "ISO8859-13", CS_ISO8859_13 }, /* WILD */
146
147 { "ISO-8859-14", CS_ISO8859_14 },
148 { "ISO8859-14", CS_ISO8859_14 }, /* WILD */
149 { "iso-ir-199", CS_ISO8859_14 },
150 { "ISO_8859-14:1998", CS_ISO8859_14 },
151 { "ISO_8859-14", CS_ISO8859_14 },
152 { "latin8", CS_ISO8859_14 },
153 { "iso-celtic", CS_ISO8859_14 },
154 { "l8", CS_ISO8859_14 },
155
156 { "ISO-8859-15", CS_ISO8859_15 },
157 { "ISO8859-15", CS_ISO8859_15 }, /* WILD */
158 { "ISO_8859-15", CS_ISO8859_15 },
159 { "Latin-9", CS_ISO8859_15 },
160
161 { "ISO-8859-16", CS_ISO8859_16 },
162 { "ISO8859-16", CS_ISO8859_16 }, /* WILD */
163 { "iso-ir-226", CS_ISO8859_16 },
164 { "ISO_8859-16", CS_ISO8859_16 },
165 { "ISO_8859-16:2001", CS_ISO8859_16 },
166 { "latin10", CS_ISO8859_16 },
167 { "l10", CS_ISO8859_16 },
168
169 { "IBM437", CS_CP437 },
170 { "cp437", CS_CP437 },
171 { "437", CS_CP437 },
172 { "csPC8CodePage437", CS_CP437 },
173
174 { "IBM850", CS_CP850 },
175 { "cp850", CS_CP850 },
176 { "850", CS_CP850 },
177 { "csPC850Multilingual", CS_CP850 },
178
179 { "IBM852", CS_CP852 },
180 { "cp852", CS_CP852 },
181 { "852", CS_CP852 },
182 { "csIBM852", CS_CP852 },
183
184 { "IBM866", CS_CP866 },
185 { "cp866", CS_CP866 },
186 { "866", CS_CP866 },
187 { "csIBM866", CS_CP866 },
188
189 { "windows-874", CS_CP874 }, /* WILD */
190
191 { "windows-1250", CS_CP1250 },
192 { "win-1250", CS_CP1250 }, /* WILD */
193
194 { "windows-1251", CS_CP1251 },
195 { "win-1251", CS_CP1251 }, /* WILD */
196
197 { "windows-1252", CS_CP1252 },
198 { "win-1252", CS_CP1252 }, /* WILD */
199
200 { "windows-1253", CS_CP1253 },
201 { "win-1253", CS_CP1253 }, /* WILD */
202
203 { "windows-1254", CS_CP1254 },
204 { "win-1254", CS_CP1254 }, /* WILD */
205
206 { "windows-1255", CS_CP1255 },
207 { "win-1255", CS_CP1255 }, /* WILD */
208
209 { "windows-1256", CS_CP1256 },
210 { "win-1256", CS_CP1256 }, /* WILD */
211
212 { "windows-1257", CS_CP1257 },
213 { "win-1257", CS_CP1257 }, /* WILD */
214
215 { "windows-1258", CS_CP1258 },
216 { "win-1258", CS_CP1258 }, /* WILD */
217
218 { "KOI8-R", CS_KOI8_R },
219 { "csKOI8R", CS_KOI8_R },
220
221 { "KOI8-U", CS_KOI8_U },
222
223 { "KOI8-RU", CS_KOI8_RU }, /* WILD */
224
225 { "JIS_X0201", CS_JISX0201 },
226 { "X0201", CS_JISX0201 },
227 { "csHalfWidthKatakana", CS_JISX0201 },
228
229 { "macintosh", CS_MAC_ROMAN_OLD },
230 { "mac", CS_MAC_ROMAN_OLD },
231 { "csMacintosh", CS_MAC_ROMAN_OLD },
232
233 { "VISCII", CS_VISCII },
234 { "csVISCII", CS_VISCII },
235
236 { "hp-roman8", CS_HP_ROMAN8 },
237 { "roman8", CS_HP_ROMAN8 },
238 { "r8", CS_HP_ROMAN8 },
239 { "csHPRoman8", CS_HP_ROMAN8 },
240
241 { "DEC-MCS", CS_DEC_MCS },
242 { "dec", CS_DEC_MCS },
243 { "csDECMCS", CS_DEC_MCS },
244
245 { "UTF-8", CS_UTF8 },
246
247 { "UTF-7", CS_UTF7 },
248 { "UNICODE-1-1-UTF-7", CS_UTF7 },
249 { "csUnicode11UTF7", CS_UTF7 },
250
251 /*
252 * Quite why the EUC-CN encoding is known to MIME by the name
253 * of its underlying character set, I'm not entirely sure, but
254 * it is. Shrug.
255 */
256 { "GB2312", CS_EUC_CN },
257 { "csGB2312", CS_EUC_CN },
258
259 { "EUC-KR", CS_EUC_KR },
260 { "csEUCKR", CS_EUC_KR },
261
262 { "EUC-JP", CS_EUC_JP },
263 { "csEUCPkdFmtJapanese", CS_EUC_JP },
264 { "Extended_UNIX_Code_Packed_Format_for_Japanese", CS_EUC_JP },
265
266 { "ISO-2022-JP", CS_ISO2022_JP },
267 { "csISO2022JP", CS_ISO2022_JP },
268
269 { "ISO-2022-KR", CS_ISO2022_KR },
270 { "csISO2022KR", CS_ISO2022_KR },
271
272 { "Big5", CS_BIG5 },
273 { "csBig5", CS_BIG5 },
274 { "Big-5", CS_BIG5 }, /* WILD */
275 { "ChineseBig5", CS_BIG5 }, /* WILD */
276
277 { "Shift_JIS", CS_SHIFT_JIS },
278 { "MS_Kanji", CS_SHIFT_JIS },
279 { "csShiftJIS", CS_SHIFT_JIS },
280 { "x-sjis", CS_SHIFT_JIS }, /* WILD */
281
282 { "HZ-GB-2312", CS_HZ },
283
284 { "UTF-16BE", CS_UTF16BE },
285
286 { "UTF-16LE", CS_UTF16LE },
287
288 { "UTF-16", CS_UTF16 },
289
290 /*
291 * This bit is fiddly and possibly technically incorrect; but
292 * rumour has it that the KSC 5601 encoding is a subset of
293 * Microsoft CP949, and that MS products tend to announce CP949
294 * as KSC 5601 in much the same way they seem willing to
295 * announce CP1252 as its subset ISO 8859-1. So I cheat
296 * shamelessly here by letting KSC 5601 map to CP949.
297 */
298 { "KS_C_5601-1987", CS_CP949 },
299 { "iso-ir-149", CS_CP949 },
300 { "KS_C_5601-1989", CS_CP949 },
301 { "KSC_5601", CS_CP949 },
302 { "korean", CS_CP949 },
303 { "csKSC56011987", CS_CP949 },
304 { "KSC5601", CS_CP949 }, /* WILD */
305
/*
 * The two entries below are compiled out by the #if 0, so neither
 * lookup function ever sees them.
 * NOTE(review): presumably CS_ISO2022_JP_2 is not available yet --
 * confirm before enabling.
 */
306 #if 0
307 { "ISO-2022-JP-2", CS_ISO2022_JP_2 },
308 { "csISO2022JP2", CS_ISO2022_JP_2 },
309 #endif
310 };
311
312 const char *charset_to_mimeenc(int charset)
313 {
314 int i;
315
316 for (i = 0; i < (int)lenof(mimeencs); i++)
317 if (charset == mimeencs[i].charset)
318 return mimeencs[i].name;
319
320 return NULL; /* not found */
321 }
322
323 int charset_from_mimeenc(const char *name)
324 {
325 int i;
326
327 for (i = 0; i < (int)lenof(mimeencs); i++) {
328 const char *p, *q;
329 p = name;
330 q = mimeencs[i].name;
331 while (*p || *q) {
332 if (tolower((unsigned char)*p) != tolower((unsigned char)*q))
333 break;
334 p++; q++;
335 }
336 if (!*p && !*q)
337 return mimeencs[i].charset;
338 }
339
340 return CS_NONE; /* not found */
341 }