c6d25d8d |
1 | /* |
2 | * charset.h - header file for general character set conversion |
3 | * routines. |
4 | */ |
5 | |
6 | #ifndef charset_charset_h |
7 | #define charset_charset_h |
8 | |
9 | #include <stddef.h> |
10 | |
11 | /* |
12 | * Enumeration that lists all the multibyte or single-byte |
13 | * character sets known to this library. |
14 | */ |
15 | typedef enum { /* these values are passed to the functions below as plain int */ |
16 | CS_NONE, /* used for reporting errors, etc */ |
17 | CS_ASCII, /* ordinary US-ASCII is worth having! (also the documented fallback of charset_from_locale()) */ |
18 | CS_ISO8859_1, |
19 | CS_ISO8859_1_X11, /* X font encoding with VT100 glyphs */ |
20 | CS_ISO8859_2, |
21 | CS_ISO8859_3, |
22 | CS_ISO8859_4, |
23 | CS_ISO8859_5, |
24 | CS_ISO8859_6, |
25 | CS_ISO8859_7, |
26 | CS_ISO8859_8, |
27 | CS_ISO8859_9, |
28 | CS_ISO8859_10, |
29 | CS_ISO8859_11, |
30 | CS_ISO8859_13, |
31 | CS_ISO8859_14, |
32 | CS_ISO8859_15, |
33 | CS_ISO8859_16, |
34 | CS_CP437, |
35 | CS_CP850, |
9b7e7a92 |
36 | CS_CP866, |
c6d25d8d |
37 | CS_CP1250, |
38 | CS_CP1251, |
39 | CS_CP1252, /* cited in the charset_upgrade() comment as the superset of ASCII and ISO8859-1 */ |
40 | CS_CP1253, |
41 | CS_CP1254, |
42 | CS_CP1255, |
43 | CS_CP1256, |
44 | CS_CP1257, |
45 | CS_CP1258, |
46 | CS_KOI8_R, |
47 | CS_KOI8_U, |
48 | CS_KOI8_RU, |
01081d4e |
49 | CS_JISX0201, |
c6d25d8d |
50 | CS_MAC_ROMAN, |
51 | CS_MAC_TURKISH, |
52 | CS_MAC_CROATIAN, |
53 | CS_MAC_ICELAND, |
54 | CS_MAC_ROMANIAN, |
55 | CS_MAC_GREEK, |
56 | CS_MAC_CYRILLIC, |
57 | CS_MAC_THAI, |
58 | CS_MAC_CENTEURO, |
59 | CS_MAC_SYMBOL, |
60 | CS_MAC_DINGBATS, |
61 | CS_MAC_ROMAN_OLD, |
62 | CS_MAC_CROATIAN_OLD, |
63 | CS_MAC_ICELAND_OLD, |
64 | CS_MAC_ROMANIAN_OLD, |
65 | CS_MAC_GREEK_OLD, |
66 | CS_MAC_CYRILLIC_OLD, |
67 | CS_MAC_UKRAINE, |
68 | CS_MAC_VT100, |
69 | CS_MAC_VT100_OLD, |
70 | CS_VISCII, |
71 | CS_HP_ROMAN8, |
72 | CS_DEC_MCS, |
73 | CS_UTF8, |
74 | CS_UTF7, /* 7-bit format: charset_contains_ascii() is documented to return FALSE for it */ |
75 | CS_UTF7_CONSERVATIVE, |
76 | CS_UTF16, |
77 | CS_UTF16BE, |
78 | CS_UTF16LE, |
79 | CS_EUC_JP, |
80 | CS_EUC_CN, |
81 | CS_EUC_KR, |
82 | CS_ISO2022_JP, |
83 | CS_ISO2022_KR, |
84 | CS_BIG5, |
85 | CS_SHIFT_JIS, |
86 | CS_HZ, /* 7-bit format: charset_contains_ascii() is documented to return FALSE for it */ |
87 | CS_CP949, |
cdb08fdc |
88 | CS_PDF, |
032fbecf |
89 | CS_PSSTD, |
01081d4e |
90 | CS_CTEXT, |
294941fa |
91 | CS_ISO2022, |
92 | CS_BS4730, |
b063a840 |
93 | CS_DEC_GRAPHICS, |
94 | CS_EUC_TW |
c6d25d8d |
95 | } charset_t; |
96 | |
97 | typedef struct { /* conversion state passed by pointer to charset_to_unicode() and charset_from_unicode() */ |
98 | unsigned long s0, s1; /* NOTE(review): the meaning of s0/s1 is not visible in this header; treat as opaque */ |
99 | } charset_state; /* start from CHARSET_INIT_STATE or charset_init_state */ |
100 | |
101 | /* |
102 | * This macro is used to initialise a charset_state structure: |
103 | * |
104 | * charset_state mystate = CHARSET_INIT_STATE; |
105 | */ |
106 | #define CHARSET_INIT_STATE { 0L, 0L } /* a suitable initialiser */ |
107 | |
108 | /* |
109 | * This external variable contains the same data, but is provided |
110 | * for easy structure-copy assignment: |
111 | * |
112 | * mystate = charset_init_state; |
113 | */ |
114 | extern const charset_state charset_init_state; |
115 | |
116 | /* |
117 | * Routine to convert a MB/SB character set to Unicode. |
118 | * |
119 | * This routine accepts some number of bytes, updates a state |
120 | * variable, and outputs some number of Unicode characters. There |
121 | * are no guarantees. You can't even guarantee that at most one |
122 | * Unicode character will be output per byte you feed in; for |
123 | * example, suppose you're reading UTF-8, you've seen E1 80, and |
124 | * then you suddenly see FE. Now you need to output _two_ error |
125 | * characters - one for the incomplete sequence E1 80, and one for |
126 | * the completely invalid UTF-8 byte FE. |
127 | * |
128 | * Returns the number of wide characters output; will never output |
129 | * more than the size of the buffer (as specified on input). |
130 | * Advances the `input' pointer and decrements `inlen', to indicate |
131 | * how far along the input string it got. |
132 | * |
133 | * The sequence of `errlen' wide characters pointed to by `errstr' |
134 | * will be used to indicate a conversion error. If `errstr' is |
135 | * NULL, `errlen' will be ignored, and the library will choose |
136 | * something sensible to do on its own. For Unicode, this will be |
137 | * U+FFFD (REPLACEMENT CHARACTER). |
49152469 |
138 | * |
139 | * `output' may be NULL, in which case the entire translation will |
140 | * be performed in theory (e.g. a dry run to work out how much |
141 | * space needs to be allocated for the real thing). `outlen' may |
142 | * also be negative, indicating an unlimited buffer length |
143 | * (although this is almost certainly unwise if `output' is _not_ |
144 | * NULL). |
c6d25d8d |
145 | */ |
146 | |
147 | int charset_to_unicode(const char **input, int *inlen, |
148 | wchar_t *output, int outlen, |
149 | int charset, charset_state *state, |
150 | const wchar_t *errstr, int errlen); |
151 | |
152 | /* |
153 | * Routine to convert Unicode to an MB/SB character set. |
154 | * |
155 | * This routine accepts some number of Unicode characters, updates |
156 | * a state variable, and outputs some number of bytes. |
157 | * |
158 | * Returns the number of bytes output; will never output more than |
159 | * the size of the buffer (as specified on input), and will never |
160 | * output a partial MB character. Advances the `input' pointer and |
161 | * decrements `inlen', to indicate how far along the input string |
162 | * it got. |
163 | * |
164 | * If `error' is non-NULL and a character is found which cannot be |
165 | * expressed in the output charset, conversion will terminate at |
166 | * that character (so `input' points to the offending character) |
167 | * and `*error' will be set to TRUE; if `error' is non-NULL and no |
168 | * difficult characters are encountered, `*error' will be set to |
169 | * FALSE. If `error' is NULL, difficult characters will simply be |
170 | * ignored. |
171 | * |
172 | * If `input' is NULL, this routine will output the necessary bytes |
173 | * to reset the encoding state in any way which might be required |
174 | * at the end of a piece of output text. |
49152469 |
175 | * |
176 | * `output' may be NULL, in which case the entire translation will |
177 | * be performed in theory (e.g. a dry run to work out how much |
178 | * space needs to be allocated for the real thing). `outlen' may |
179 | * also be negative, indicating an unlimited buffer length |
180 | * (although this is almost certainly unwise if `output' is _not_ |
181 | * NULL). |
c6d25d8d |
182 | */ |
183 | |
184 | int charset_from_unicode(const wchar_t **input, int *inlen, |
185 | char *output, int outlen, |
186 | int charset, charset_state *state, int *error); |
187 | |
188 | /* |
189 | * Convert X11 encoding names to and from our charset identifiers. |
190 | */ |
191 | const char *charset_to_xenc(int charset); |
192 | int charset_from_xenc(const char *name); |
193 | |
194 | /* |
195 | * Convert MIME encoding names to and from our charset identifiers. |
196 | */ |
197 | const char *charset_to_mimeenc(int charset); |
198 | int charset_from_mimeenc(const char *name); |
199 | |
200 | /* |
201 | * Convert our own encoding names to and from our charset |
202 | * identifiers. |
203 | */ |
204 | const char *charset_to_localenc(int charset); |
205 | int charset_from_localenc(const char *name); |
206 | int charset_localenc_nth(int n); /* NOTE(review): presumably returns the n-th charset that has a local name, for enumeration; the end-of-list return value is not stated here - confirm in the implementation */ |
207 | |
208 | /* |
209 | * Convert Mac OS script/region/font to our charset identifiers. |
210 | */ |
211 | int charset_from_macenc(int script, int region, int sysvers, |
212 | const char *fontname); |
213 | |
214 | /* |
32361bda |
215 | * Convert GNU Emacs coding system symbols to and from our charset |
216 | * identifiers. |
217 | */ |
218 | const char *charset_to_emacsenc(int charset); |
219 | int charset_from_emacsenc(const char *name); |
220 | |
221 | /* |
c6d25d8d |
222 | * Upgrade a charset identifier to a superset charset which is |
223 | * often confused with it. For example, people whose MUAs report |
224 | * their mail as ASCII or ISO8859-1 often in practice turn out to |
225 | * be using CP1252 quote characters, so when parsing incoming mail |
226 | * it is prudent to treat ASCII and ISO8859-1 as aliases for CP1252 |
227 | * - and since it's a superset of both, this will cause no |
228 | * genuinely correct mail to be parsed wrongly. |
229 | */ |
230 | int charset_upgrade(int charset); |
231 | |
232 | /* |
233 | * This function returns TRUE if the input charset is a vaguely |
234 | * sensible superset of ASCII. That is, it returns FALSE for 7-bit |
235 | * encoding formats such as HZ and UTF-7. |
236 | */ |
237 | int charset_contains_ascii(int charset); |
238 | |
8a731dfa |
239 | /* |
240 | * This function tries to deduce the CS_* identifier of the charset |
241 | * used in the current C locale. It falls back to CS_ASCII if it |
242 | * can't figure it out at all, so it will always return a valid |
243 | * charset. |
244 | * |
245 | * (Note that you should have already called setlocale(LC_CTYPE, |
246 | * "") to guarantee that this function will do the right thing.) |
247 | */ |
248 | int charset_from_locale(void); |
249 | |
c6d25d8d |
250 | #endif /* charset_charset_h */ |