2dc6356a |
1 | /* |
2 | * charset.h - header file for general character set conversion |
3 | * routines. |
4 | */ |
5 | |
6 | #ifndef charset_charset_h |
7 | #define charset_charset_h |
8 | |
9 | #include <stddef.h> |
10 | |
/*
 * Identifiers for every character set, multibyte or single-byte,
 * that this library knows about.
 */
typedef enum {
    CS_NONE = 0,            /* used for reporting errors, etc */

    /* ISO 8859 series (note: no entry for part 12) */
    CS_ISO8859_1,
    CS_ISO8859_1_X11,       /* X font encoding with VT100 glyphs */
    CS_ISO8859_2,
    CS_ISO8859_3,
    CS_ISO8859_4,
    CS_ISO8859_5,
    CS_ISO8859_6,
    CS_ISO8859_7,
    CS_ISO8859_8,
    CS_ISO8859_9,
    CS_ISO8859_10,
    CS_ISO8859_11,
    CS_ISO8859_13,
    CS_ISO8859_14,
    CS_ISO8859_15,
    CS_ISO8859_16,

    /* CPxxx code pages */
    CS_CP437,
    CS_CP850,
    CS_CP1250,
    CS_CP1251,
    CS_CP1252,
    CS_CP1253,
    CS_CP1254,
    CS_CP1255,
    CS_CP1256,
    CS_CP1257,
    CS_CP1258,

    /* KOI8 variants */
    CS_KOI8_R,
    CS_KOI8_U,

    /* Miscellaneous */
    CS_MAC_ROMAN,
    CS_VISCII,
    CS_HP_ROMAN8,
    CS_DEC_MCS,

    /* Unicode transformation format */
    CS_UTF8
} charset_t;
52 | |
/*
 * Conversion state handed to charset_to_unicode() and
 * charset_from_unicode() through their `state' parameter; those
 * routines update it as they work through the input. How `s0' is
 * interpreted is not specified in this header.
 */
typedef struct {
    unsigned long s0;
} charset_state;
56 | |
57 | /* |
58 | * Routine to convert a MB/SB character set to Unicode. |
59 | * |
60 | * This routine accepts some number of bytes, updates a state |
61 | * variable, and outputs some number of Unicode characters. There |
62 | * are no guarantees. You can't even guarantee that at most one |
63 | * Unicode character will be output per byte you feed in; for |
64 | * example, suppose you're reading UTF-8, you've seen E1 80, and |
65 | * then you suddenly see FE. Now you need to output _two_ error |
66 | * characters - one for the incomplete sequence E1 80, and one for |
67 | * the completely invalid UTF-8 byte FE. |
68 | * |
69 | * Returns the number of wide characters output; will never output |
70 | * more than the size of the buffer (as specified on input). |
71 | * Advances the `input' pointer and decrements `inlen', to indicate |
72 | * how far along the input string it got. |
73 | * |
74 | * The sequence of `errlen' wide characters pointed to by `errstr' |
75 | * will be used to indicate a conversion error. If `errstr' is |
76 | * NULL, `errlen' will be ignored, and the library will choose |
77 | * something sensible to do on its own. For Unicode, this will be |
78 | * U+FFFD (REPLACEMENT CHARACTER). |
79 | */ |
80 | |
81 | int charset_to_unicode(char **input, int *inlen, wchar_t *output, int outlen, |
82 | int charset, charset_state *state, |
83 | const wchar_t *errstr, int errlen); |
84 | |
85 | /* |
86 | * Routine to convert Unicode to an MB/SB character set. |
87 | * |
88 | * This routine accepts some number of Unicode characters, updates |
89 | * a state variable, and outputs some number of bytes. |
90 | * |
91 | * Returns the number of bytes characters output; will never output |
92 | * more than the size of the buffer (as specified on input), and |
93 | * will never output a partial MB character. Advances the `input' |
94 | * pointer and decrements `inlen', to indicate how far along the |
95 | * input string it got. |
96 | * |
97 | * The sequence of `errlen' characters pointed to by `errstr' will |
98 | * be used to indicate a conversion error. If `errstr' is NULL, |
99 | * `errlen' will be ignored, and the library will choose something |
100 | * sensible to do on its own (which will vary depending on the |
101 | * output charset). |
102 | */ |
103 | |
104 | int charset_from_unicode(wchar_t **input, int *inlen, char *output, int outlen, |
105 | int charset, charset_state *state, |
106 | const char *errstr, int errlen); |
107 | |
/*
 * Mapping between X11 encoding names and our charset identifiers.
 *
 * NOTE(review): what is returned for an unrecognised name or
 * identifier is not documented in this header - confirm against
 * the implementation.
 */

/* X11 encoding name -> charset identifier. */
int charset_from_xenc(const char *name);

/* Charset identifier -> X11 encoding name. */
const char *charset_to_xenc(int charset);
113 | |
/*
 * Mapping between MIME encoding names and our charset identifiers.
 *
 * NOTE(review): what is returned for an unrecognised name or
 * identifier is not documented in this header - confirm against
 * the implementation.
 */

/* MIME encoding name -> charset identifier. */
int charset_from_mimeenc(const char *name);

/* Charset identifier -> MIME encoding name. */
const char *charset_to_mimeenc(int charset);
119 | |
120 | #endif /* charset_charset_h */ |