2dc6356a |
1 | /* |
2 | * charset.h - header file for general character set conversion |
3 | * routines. |
4 | */ |
5 | |
6 | #ifndef charset_charset_h |
7 | #define charset_charset_h |
8 | |
9 | #include <stddef.h> |
10 | |
11 | /* |
12 | * Enumeration that lists all the multibyte or single-byte |
13 | * character sets known to this library. |
14 | */ |
15 | typedef enum { /* no explicit values: numbering starts at CS_NONE == 0 and follows declaration order */ |
16 | CS_NONE, /* used for reporting errors, etc */ |
17 | CS_ISO8859_1, |
18 | CS_ISO8859_1_X11, /* X font encoding with VT100 glyphs */ |
19 | CS_ISO8859_2, |
20 | CS_ISO8859_3, |
21 | CS_ISO8859_4, |
22 | CS_ISO8859_5, |
23 | CS_ISO8859_6, |
24 | CS_ISO8859_7, |
25 | CS_ISO8859_8, |
26 | CS_ISO8859_9, |
27 | CS_ISO8859_10, |
28 | CS_ISO8859_11, |
29 | CS_ISO8859_13, /* note: this list defines no CS_ISO8859_12 */ |
30 | CS_ISO8859_14, |
31 | CS_ISO8859_15, |
32 | CS_ISO8859_16, |
33 | CS_CP437, |
34 | CS_CP850, |
6d177721 |
35 | CS_CP866, |
2dc6356a |
36 | CS_CP1250, |
37 | CS_CP1251, |
38 | CS_CP1252, |
39 | CS_CP1253, |
40 | CS_CP1254, |
41 | CS_CP1255, |
42 | CS_CP1256, |
43 | CS_CP1257, |
44 | CS_CP1258, |
45 | CS_KOI8_R, |
46 | CS_KOI8_U, |
47 | CS_MAC_ROMAN, |
9a4486bd |
48 | CS_MAC_TURKISH, |
49 | CS_MAC_CROATIAN, |
50 | CS_MAC_ICELAND, |
51 | CS_MAC_ROMANIAN, |
52 | CS_MAC_GREEK, |
53 | CS_MAC_CYRILLIC, |
54 | CS_MAC_THAI, |
55 | CS_MAC_CENTEURO, |
56 | CS_MAC_SYMBOL, |
57 | CS_MAC_DINGBATS, |
58 | CS_MAC_ROMAN_OLD, /* NOTE(review): the _OLD variants presumably match earlier Mac OS versions of these tables (cf. sysvers in charset_from_macenc) - confirm */ |
59 | CS_MAC_CROATIAN_OLD, |
60 | CS_MAC_ICELAND_OLD, |
61 | CS_MAC_ROMANIAN_OLD, |
62 | CS_MAC_GREEK_OLD, |
63 | CS_MAC_CYRILLIC_OLD, |
64 | CS_MAC_UKRAINE, |
65 | CS_MAC_VT100, |
66 | CS_MAC_VT100_OLD, |
2dc6356a |
67 | CS_VISCII, |
68 | CS_HP_ROMAN8, |
69 | CS_DEC_MCS, |
70 | CS_UTF8 |
71 | } charset_t; /* the functions declared in this header take and return these identifiers as plain int */ |
72 | |
73 | typedef struct { /* conversion state, passed by pointer to charset_to_unicode and charset_from_unicode */ |
74 | unsigned long s0; /* NOTE(review): meaning and required initial value are not stated in this header - presumably zero at the start of a stream; confirm */ |
75 | } charset_state; |
76 | |
77 | /* |
78 | * Routine to convert a MB/SB character set to Unicode. |
79 | * |
80 | * This routine accepts some number of bytes, updates a state |
81 | * variable, and outputs some number of Unicode characters. There |
82 | * are no guarantees. You can't even guarantee that at most one |
83 | * Unicode character will be output per byte you feed in; for |
84 | * example, suppose you're reading UTF-8, you've seen E1 80, and |
85 | * then you suddenly see FE. Now you need to output _two_ error |
86 | * characters - one for the incomplete sequence E1 80, and one for |
87 | * the completely invalid UTF-8 byte FE. |
88 | * |
89 | * Returns the number of wide characters output; will never output |
90 | * more than the size of the buffer (as specified on input). |
91 | * Advances the `input' pointer and decrements `inlen', to indicate |
92 | * how far along the input string it got. |
93 | * |
94 | * The sequence of `errlen' wide characters pointed to by `errstr' |
95 | * will be used to indicate a conversion error. If `errstr' is |
96 | * NULL, `errlen' will be ignored, and the library will choose |
97 | * something sensible to do on its own. For Unicode, this will be |
98 | * U+FFFD (REPLACEMENT CHARACTER). |
99 | */ |
100 | |
101 | int charset_to_unicode(char **input, int *inlen, wchar_t *output, int outlen, /* *input is advanced and *inlen decremented; outlen is the output buffer size in wide characters */ |
102 | int charset, charset_state *state, /* NOTE(review): charset is presumably one of the charset_t values - confirm; *state is updated by the call */ |
103 | const wchar_t *errstr, int errlen); /* errstr may be NULL, in which case errlen is ignored; returns the number of wide characters written */ |
104 | |
105 | /* |
106 | * Routine to convert Unicode to an MB/SB character set. |
107 | * |
108 | * This routine accepts some number of Unicode characters, updates |
109 | * a state variable, and outputs some number of bytes. |
110 | * |
111 | * Returns the number of bytes output; will never output |
112 | * more than the size of the buffer (as specified on input), and |
113 | * will never output a partial MB character. Advances the `input' |
114 | * pointer and decrements `inlen', to indicate how far along the |
115 | * input string it got. |
116 | * |
117 | * The sequence of `errlen' characters pointed to by `errstr' will |
118 | * be used to indicate a conversion error. If `errstr' is NULL, |
119 | * `errlen' will be ignored, and the library will choose something |
120 | * sensible to do on its own (which will vary depending on the |
121 | * output charset). |
122 | */ |
123 | |
124 | int charset_from_unicode(wchar_t **input, int *inlen, char *output, int outlen, /* *input is advanced and *inlen decremented; outlen is the output buffer size in bytes */ |
125 | int charset, charset_state *state, /* NOTE(review): charset is presumably one of the charset_t values - confirm; *state is updated by the call */ |
126 | const char *errstr, int errlen); /* errstr may be NULL, in which case errlen is ignored; returns the number of bytes written */ |
127 | |
128 | /* |
129 | * Convert X11 encoding names to and from our charset identifiers. |
130 | */ |
131 | const char *charset_to_xenc(int charset); /* charset identifier -> X11 encoding name; NOTE(review): result for an identifier with no X11 name is not stated here (possibly NULL) - confirm */ |
132 | int charset_from_xenc(const char *name); /* X11 encoding name -> charset identifier; NOTE(review): presumably CS_NONE when the name is not recognised - confirm */ |
133 | |
134 | /* |
135 | * Convert MIME encoding names to and from our charset identifiers. |
136 | */ |
137 | const char *charset_to_mimeenc(int charset); /* charset identifier -> MIME encoding name; NOTE(review): result for an identifier with no MIME name is not stated here (possibly NULL) - confirm */ |
138 | int charset_from_mimeenc(const char *name); /* MIME encoding name -> charset identifier; NOTE(review): presumably CS_NONE when the name is not recognised - confirm */ |
139 | |
8ef2b196 |
140 | /* |
d4413bd2 |
141 | * Convert our own encoding names to and from our charset |
142 | * identifiers. |
143 | */ |
144 | const char *charset_to_localenc(int charset); /* charset identifier -> this library's own encoding name; NOTE(review): result for an unknown identifier is not stated here - confirm */ |
145 | int charset_from_localenc(const char *name); /* this library's own encoding name -> charset identifier; NOTE(review): presumably CS_NONE when the name is not recognised - confirm */ |
146 | int charset_localenc_nth(int n); /* NOTE(review): not described in this header; presumably returns the n-th charset identifier that has a local name, for enumeration - confirm indexing base and end-of-list value */ |
147 | |
148 | /* |
8ef2b196 |
149 | * Convert Mac OS script/region/font to our charset identifiers. |
150 | */ |
151 | int charset_from_macenc(int script, int region, int sysvers, /* returns a charset identifier; NOTE(review): sysvers presumably selects between the current and _OLD Mac charsets - confirm */ |
152 | const char *fontname); /* NOTE(review): whether fontname may be NULL, and which font names alter the result, is not stated here - confirm */ |
153 | |
2dc6356a |
154 | #endif /* charset_charset_h */ |