2dc6356a |
1 | /* |
2 | * charset.h - header file for general character set conversion |
3 | * routines. |
4 | */ |
5 | |
6 | #ifndef charset_charset_h |
7 | #define charset_charset_h |
8 | |
9 | #include <stddef.h> |
10 | |
11 | /* |
12 | * Enumeration listing every multibyte or single-byte character |
13 | * set known to this library. Functions below pass them as `int'. |
14 | */ |
15 | typedef enum { |
16 | CS_NONE, /* used for reporting errors, etc; listed first, so equals 0 */ |
17 | CS_ISO8859_1, |
18 | CS_ISO8859_1_X11, /* X font encoding with VT100 glyphs */ |
19 | CS_ISO8859_2, |
20 | CS_ISO8859_3, |
21 | CS_ISO8859_4, |
22 | CS_ISO8859_5, |
23 | CS_ISO8859_6, |
24 | CS_ISO8859_7, |
25 | CS_ISO8859_8, |
26 | CS_ISO8859_9, |
27 | CS_ISO8859_10, |
28 | CS_ISO8859_11, |
29 | CS_ISO8859_13, |
30 | CS_ISO8859_14, |
31 | CS_ISO8859_15, |
32 | CS_ISO8859_16, |
33 | CS_CP437, |
34 | CS_CP850, |
6d177721 |
35 | CS_CP866, |
2dc6356a |
36 | CS_CP1250, |
37 | CS_CP1251, |
38 | CS_CP1252, |
39 | CS_CP1253, |
40 | CS_CP1254, |
41 | CS_CP1255, |
42 | CS_CP1256, |
43 | CS_CP1257, |
44 | CS_CP1258, |
45 | CS_KOI8_R, |
46 | CS_KOI8_U, |
47 | CS_MAC_ROMAN, |
9a4486bd |
48 | CS_MAC_TURKISH, |
49 | CS_MAC_CROATIAN, |
50 | CS_MAC_ICELAND, |
51 | CS_MAC_ROMANIAN, |
52 | CS_MAC_GREEK, |
53 | CS_MAC_CYRILLIC, |
54 | CS_MAC_THAI, |
55 | CS_MAC_CENTEURO, |
56 | CS_MAC_SYMBOL, |
57 | CS_MAC_DINGBATS, |
58 | CS_MAC_ROMAN_OLD, |
59 | CS_MAC_CROATIAN_OLD, |
60 | CS_MAC_ICELAND_OLD, |
61 | CS_MAC_ROMANIAN_OLD, |
62 | CS_MAC_GREEK_OLD, |
63 | CS_MAC_CYRILLIC_OLD, |
64 | CS_MAC_UKRAINE, |
65 | CS_MAC_VT100, |
66 | CS_MAC_VT100_OLD, |
2dc6356a |
67 | CS_VISCII, |
68 | CS_HP_ROMAN8, |
69 | CS_DEC_MCS, |
70 | CS_UTF8 |
71 | } charset_t; |
72 | |
73 | typedef struct { |
74 | unsigned long s0; /* state word that the conversion routines below update between calls */ |
75 | } charset_state; |
76 | |
77 | /* |
78 | * Routine to convert a MB/SB character set to Unicode. |
79 | * |
80 | * This routine accepts some number of bytes, updates a state |
81 | * variable, and outputs some number of Unicode characters. The |
82 | * ratio is not guaranteed: you can't even assume that at most one |
83 | * Unicode character will be output per byte you feed in; for |
84 | * example, suppose you're reading UTF-8, you've seen E1 80, and |
85 | * then you suddenly see FE. Now you need to output _two_ error |
86 | * characters - one for the incomplete sequence E1 80, and one for |
87 | * the completely invalid UTF-8 byte FE. |
88 | * |
89 | * Returns the number of wide characters output; will never output |
90 | * more than the buffer size given on input (presumably `outlen'). |
91 | * Advances the `input' pointer and decrements `inlen', to indicate |
92 | * how far along the input string it got. |
93 | * |
94 | * The sequence of `errlen' wide characters pointed to by `errstr' |
95 | * will be used to indicate a conversion error. If `errstr' is |
96 | * NULL, `errlen' will be ignored, and the library will choose |
97 | * something sensible to do on its own. For Unicode, this will be |
98 | * U+FFFD (REPLACEMENT CHARACTER). |
99 | */ |
100 | |
57191fa4 |
101 | int charset_to_unicode(const char **input, int *inlen, |
102 | wchar_t *output, int outlen, |
2dc6356a |
103 | int charset, charset_state *state, |
104 | const wchar_t *errstr, int errlen); |
105 | |
106 | /* |
107 | * Routine to convert Unicode to an MB/SB character set. |
108 | * |
109 | * This routine accepts some number of Unicode characters, updates |
110 | * a state variable, and outputs some number of bytes. |
111 | * |
112 | * Returns the number of bytes output; will never output more than |
113 | * the buffer size given on input (presumably `outlen'), and |
114 | * will never output a partial MB character. Advances the `input' |
115 | * pointer and decrements `inlen', to indicate how far along the |
116 | * input string it got. |
117 | * |
118 | * The sequence of `errlen' characters pointed to by `errstr' will |
119 | * be used to indicate a conversion error. If `errstr' is NULL, |
120 | * `errlen' will be ignored, and the library will choose something |
121 | * sensible to do on its own (which will vary depending on the |
122 | * output charset). |
123 | */ |
124 | |
57191fa4 |
125 | int charset_from_unicode(const wchar_t **input, int *inlen, |
126 | char *output, int outlen, |
2dc6356a |
127 | int charset, charset_state *state, |
128 | const char *errstr, int errlen); |
129 | |
130 | /* |
131 | * Convert X11 encoding names to and from our charset identifiers. |
132 | */ |
133 | const char *charset_to_xenc(int charset); /* charset identifier -> X11 encoding name */ |
134 | int charset_from_xenc(const char *name); /* X11 encoding name -> charset identifier */ |
135 | |
136 | /* |
137 | * Convert MIME encoding names to and from our charset identifiers. |
138 | */ |
139 | const char *charset_to_mimeenc(int charset); /* charset identifier -> MIME encoding name */ |
140 | int charset_from_mimeenc(const char *name); /* MIME encoding name -> charset identifier */ |
141 | |
8ef2b196 |
142 | /* |
d4413bd2 |
143 | * Convert our own encoding names to and from our charset |
144 | * identifiers. |
145 | */ |
146 | const char *charset_to_localenc(int charset); /* charset identifier -> our own encoding name */ |
147 | int charset_from_localenc(const char *name); /* our own encoding name -> charset identifier */ |
148 | int charset_localenc_nth(int n); /* NOTE(review): presumably yields the n-th known charset identifier, for enumeration - confirm */ |
149 | |
150 | /* |
8ef2b196 |
151 | * Convert Mac OS script/region/font (plus `sysvers') to one of our charset identifiers. |
152 | */ |
153 | int charset_from_macenc(int script, int region, int sysvers, |
154 | const char *fontname); |
155 | |
2dc6356a |
156 | #endif /* charset_charset_h */ |