c6d25d8d |
1 | /* |
2 | * charset.h - header file for general character set conversion |
3 | * routines. |
4 | */ |
5 | |
6 | #ifndef charset_charset_h |
7 | #define charset_charset_h |
8 | |
9 | #include <stddef.h> |
10 | |
11 | /* |
12 | * Enumeration that lists all the multibyte or single-byte |
13 | * character sets known to this library. No enumerator has an explicit value, so they are numbered consecutively from CS_NONE (0) up to CS_LIMIT. |
14 | */ |
15 | typedef enum { |
16 | CS_NONE, /* used for reporting errors, etc; not a usable charset (see charset_exists) */ |
17 | CS_ASCII, /* ordinary US-ASCII is worth having! */ |
18 | CS_ISO8859_1, |
19 | CS_ISO8859_1_X11, /* X font encoding with VT100 glyphs */ |
20 | CS_ISO8859_2, |
21 | CS_ISO8859_3, |
22 | CS_ISO8859_4, |
23 | CS_ISO8859_5, |
24 | CS_ISO8859_6, |
25 | CS_ISO8859_7, |
26 | CS_ISO8859_8, |
27 | CS_ISO8859_9, |
28 | CS_ISO8859_10, |
29 | CS_ISO8859_11, |
30 | CS_ISO8859_13, |
31 | CS_ISO8859_14, |
32 | CS_ISO8859_15, |
33 | CS_ISO8859_16, |
34 | CS_CP437, |
35 | CS_CP850, |
5930e9ef |
36 | CS_CP852, |
9b7e7a92 |
37 | CS_CP866, |
36eb7564 |
38 | CS_CP874, |
c6d25d8d |
39 | CS_CP1250, |
40 | CS_CP1251, |
41 | CS_CP1252, |
42 | CS_CP1253, |
43 | CS_CP1254, |
44 | CS_CP1255, |
45 | CS_CP1256, |
46 | CS_CP1257, |
47 | CS_CP1258, |
48 | CS_KOI8_R, |
49 | CS_KOI8_U, |
50 | CS_KOI8_RU, |
01081d4e |
51 | CS_JISX0201, |
c6d25d8d |
52 | CS_MAC_ROMAN, |
53 | CS_MAC_TURKISH, |
54 | CS_MAC_CROATIAN, |
55 | CS_MAC_ICELAND, |
56 | CS_MAC_ROMANIAN, |
57 | CS_MAC_GREEK, |
58 | CS_MAC_CYRILLIC, |
59 | CS_MAC_THAI, |
60 | CS_MAC_CENTEURO, |
61 | CS_MAC_SYMBOL, |
62 | CS_MAC_DINGBATS, |
63 | CS_MAC_ROMAN_OLD, |
64 | CS_MAC_CROATIAN_OLD, |
65 | CS_MAC_ICELAND_OLD, |
66 | CS_MAC_ROMANIAN_OLD, |
67 | CS_MAC_GREEK_OLD, |
68 | CS_MAC_CYRILLIC_OLD, |
69 | CS_MAC_UKRAINE, |
70 | CS_MAC_VT100, |
71 | CS_MAC_VT100_OLD, |
72 | CS_VISCII, |
73 | CS_HP_ROMAN8, |
74 | CS_DEC_MCS, |
75 | CS_UTF8, |
76 | CS_UTF7, |
77 | CS_UTF7_CONSERVATIVE, |
78 | CS_UTF16, |
79 | CS_UTF16BE, |
80 | CS_UTF16LE, |
81 | CS_EUC_JP, |
82 | CS_EUC_CN, |
83 | CS_EUC_KR, |
84 | CS_ISO2022_JP, |
85 | CS_ISO2022_KR, |
86 | CS_BIG5, |
87 | CS_SHIFT_JIS, |
88 | CS_HZ, |
89 | CS_CP949, |
cdb08fdc |
90 | CS_PDF, |
032fbecf |
91 | CS_PSSTD, |
01081d4e |
92 | CS_CTEXT, |
294941fa |
93 | CS_ISO2022, |
94 | CS_BS4730, |
b063a840 |
95 | CS_DEC_GRAPHICS, |
3e5305f1 |
96 | CS_EUC_TW, |
97 | CS_LIMIT /* dummy value indicating extent of enum; exclusive upper bound when looping over charsets */ |
c6d25d8d |
98 | } charset_t; |
99 | |
100 | typedef struct { /* conversion state passed by pointer to charset_to_unicode() and charset_from_unicode(), which update it */ |
101 | unsigned long s0, s1; /* the meaning of these two words is not described in this header */ |
102 | } charset_state; |
103 | |
104 | /* |
105 | * This macro is used to initialise a charset_state structure: |
106 | * |
107 | * charset_state mystate = CHARSET_INIT_STATE; |
108 | */ |
109 | #define CHARSET_INIT_STATE { 0L, 0L } /* brace initialiser (s0 = 0, s1 = 0); only valid in a declaration, not in an assignment */ |
110 | |
111 | /* |
112 | * This external variable contains the same data as the |
113 | * CHARSET_INIT_STATE macro, but is provided for easy structure-copy assignment: |
114 | * |
115 | * mystate = charset_init_state; |
116 | */ |
117 | extern const charset_state charset_init_state; |
118 | |
119 | /* |
120 | * Routine to convert a MB/SB (multibyte or single-byte) character set to Unicode. |
121 | * |
122 | * This routine accepts some number of bytes in the encoding `charset' |
123 | * (a CS_* identifier), updates the state variable `*state', and outputs |
124 | * some number of Unicode characters. There are no guarantees. You can't |
125 | * even guarantee that at most one Unicode character will be output per |
126 | * byte you feed in; for example, suppose you're reading UTF-8, you've |
127 | * seen E1 80, and then you suddenly see FE. Now you need to output _two_ |
128 | * error characters - one for the incomplete sequence E1 80, and one for |
129 | * the completely invalid UTF-8 byte FE. |
130 | * |
131 | * Returns the number of wide characters output; will never output |
132 | * more than the size of the buffer (as specified on input). |
133 | * Advances the `input' pointer and decrements `inlen', to indicate |
134 | * how far along the input string it got. |
135 | * |
136 | * The sequence of `errlen' wide characters pointed to by `errstr' |
137 | * will be used to indicate a conversion error. If `errstr' is |
138 | * NULL, `errlen' will be ignored, and the library will choose |
139 | * something sensible to do on its own. For Unicode, this will be |
140 | * U+FFFD (REPLACEMENT CHARACTER). |
49152469 |
141 | * |
142 | * `output' may be NULL, in which case the entire translation will |
143 | * be performed in theory (e.g. a dry run to work out how much |
144 | * space needs to be allocated for the real thing). `outlen' may |
145 | * also be negative, indicating an unlimited buffer length |
146 | * (although this is almost certainly unwise if `output' is _not_ |
147 | * NULL). |
c6d25d8d |
148 | */ |
149 | |
150 | int charset_to_unicode(const char **input, int *inlen, |
151 | wchar_t *output, int outlen, |
152 | int charset, charset_state *state, |
153 | const wchar_t *errstr, int errlen); |
154 | |
155 | /* |
156 | * Routine to convert Unicode to an MB/SB (multibyte or single-byte) character set. |
157 | * |
158 | * This routine accepts some number of Unicode characters, updates the state |
159 | * variable `*state', and outputs some number of bytes in the encoding `charset' (a CS_* identifier). |
160 | * |
161 | * Returns the number of bytes output; will never output more than |
162 | * the size of the buffer (as specified on input), and will never |
163 | * output a partial MB character. Advances the `input' pointer and |
164 | * decrements `inlen', to indicate how far along the input string |
165 | * it got. |
166 | * |
167 | * If `error' is non-NULL and a character is found which cannot be |
168 | * expressed in the output charset, conversion will terminate at |
169 | * that character (so `input' points to the offending character) |
170 | * and `*error' will be set to TRUE; if `error' is non-NULL and no |
171 | * difficult characters are encountered, `*error' will be set to |
172 | * FALSE. If `error' is NULL, difficult characters will simply be |
173 | * ignored. |
174 | * |
175 | * If `input' is NULL, this routine will output the necessary bytes |
176 | * to reset the encoding state in any way which might be required |
177 | * at the end of an output piece of text. |
49152469 |
178 | * |
179 | * `output' may be NULL, in which case the entire translation will |
180 | * be performed in theory (e.g. a dry run to work out how much |
181 | * space needs to be allocated for the real thing). `outlen' may |
182 | * also be negative, indicating an unlimited buffer length |
183 | * (although this is almost certainly unwise if `output' is _not_ |
184 | * NULL). |
c6d25d8d |
185 | */ |
186 | |
187 | int charset_from_unicode(const wchar_t **input, int *inlen, |
188 | char *output, int outlen, |
189 | int charset, charset_state *state, int *error); |
190 | |
191 | /* |
192 | * Convert X11 encoding names to and from our charset identifiers. NOTE(review): the result for an unrecognised name or charset is not documented here - confirm. |
193 | */ |
194 | const char *charset_to_xenc(int charset); /* charset identifier -> X11 encoding name */ |
195 | int charset_from_xenc(const char *name); /* X11 encoding name -> charset identifier */ |
196 | |
197 | /* |
198 | * Convert MIME encoding names to and from our charset identifiers. NOTE(review): the result for an unrecognised name or charset is not documented here - confirm. |
199 | */ |
200 | const char *charset_to_mimeenc(int charset); /* charset identifier -> MIME encoding name */ |
201 | int charset_from_mimeenc(const char *name); /* MIME encoding name -> charset identifier */ |
202 | |
203 | /* |
204 | * Convert our own encoding names to and from our charset |
205 | * identifiers. NOTE(review): the result for an unrecognised name or charset is not documented here - confirm. |
206 | */ |
207 | const char *charset_to_localenc(int charset); /* charset identifier -> our own encoding name */ |
208 | int charset_from_localenc(const char *name); /* our own encoding name -> charset identifier */ |
209 | int charset_localenc_nth(int n); /* NOTE(review): undocumented here; presumably the n-th charset that has a name of our own, for enumeration - confirm */ |
210 | |
211 | /* |
212 | * Convert Mac OS script/region/font to our charset identifiers. NOTE(review): the role of `sysvers' is not documented here (presumably the Mac OS system version) - confirm. |
213 | */ |
214 | int charset_from_macenc(int script, int region, int sysvers, |
215 | const char *fontname); |
216 | |
217 | /* |
32361bda |
218 | * Convert GNU Emacs coding system symbols to and from our charset |
219 | * identifiers. NOTE(review): the result for an unrecognised name or charset is not documented here - confirm. |
220 | */ |
221 | const char *charset_to_emacsenc(int charset); /* charset identifier -> Emacs coding system name */ |
222 | int charset_from_emacsenc(const char *name); /* Emacs coding system name -> charset identifier */ |
223 | |
224 | /* |
c6d25d8d |
225 | * Upgrade a charset identifier to a superset charset which is |
226 | * often confused with it. For example, people whose MUAs report |
227 | * their mail as ASCII or ISO8859-1 often in practice turn out to |
228 | * be using CP1252 quote characters, so when parsing incoming mail |
229 | * it is prudent to treat ASCII and ISO8859-1 as aliases for CP1252 |
230 | * - and since it's a superset of both, this will cause no |
231 | * genuinely correct mail to be parsed wrongly. |
232 | */ |
233 | int charset_upgrade(int charset); /* returns the identifier of the superset charset; NOTE(review): presumably `charset' itself when no upgrade applies - confirm */ |
234 | |
235 | /* |
236 | * This function returns TRUE if the input charset is a vaguely |
237 | * sensible superset of ASCII. That is, it returns FALSE for 7-bit |
238 | * encoding formats such as HZ (CS_HZ) and UTF-7 (CS_UTF7). |
239 | */ |
240 | int charset_contains_ascii(int charset); |
241 | |
8a731dfa |
242 | /* |
243 | * This function tries to deduce the CS_* identifier of the charset |
244 | * used in the current C locale. It falls back to CS_ASCII if it |
245 | * can't figure it out at all, so it will always return a valid |
246 | * charset. |
247 | * |
248 | * (Note that you should have already called setlocale(LC_CTYPE, |
249 | * "") to guarantee that this function will do the right thing.) |
250 | */ |
251 | int charset_from_locale(void); /* takes no arguments; the fallback result is CS_ASCII */ |
252 | |
3e5305f1 |
253 | /* |
254 | * This function simply reports whether a charset identifier |
255 | * corresponds to an actually usable charset. Not everything in the |
256 | * above enum does: CS_NONE, for a start, and occasionally other slots |
257 | * in the enum are reserved before they actually go into service. |
258 | * |
259 | * This function permits clients to iterate over _all_ supported |
260 | * charsets by means of a loop such as the following (which starts at 0, i.e. CS_NONE, and relies on charset_exists to skip it): |
261 | * |
262 | * for (cs = 0; cs < CS_LIMIT; cs++) |
263 | * if (charset_exists(cs)) |
264 | * do_stuff_with(cs); |
265 | */ |
266 | int charset_exists(int charset); /* nonzero when `charset' is usable, as tested by the `if' in the loop above */ |
267 | |
c6d25d8d |
268 | #endif /* charset_charset_h */ |