2dc6356a |
1 | /* |
2 | * charset.h - header file for general character set conversion |
3 | * routines. |
4 | */ |
5 | |
6 | #ifndef charset_charset_h |
7 | #define charset_charset_h |
8 | |
9 | #include <stddef.h> |
10 | |
/*
 * Enumeration that lists all the multibyte or single-byte
 * character sets known to this library.
 *
 * No enumerator is given an explicit value, so they are numbered
 * consecutively from CS_NONE == 0 in declaration order. Inserting
 * or reordering entries therefore changes the numeric value of
 * every enumerator that follows.
 */
typedef enum {
    CS_NONE,                           /* used for reporting errors, etc */
    CS_ISO8859_1,
    CS_ISO8859_1_X11,                  /* X font encoding with VT100 glyphs */
    CS_ISO8859_2,
    CS_ISO8859_3,
    CS_ISO8859_4,
    CS_ISO8859_5,
    CS_ISO8859_6,
    CS_ISO8859_7,
    CS_ISO8859_8,
    CS_ISO8859_9,
    CS_ISO8859_10,
    CS_ISO8859_11,
    CS_ISO8859_13,
    CS_ISO8859_14,
    CS_ISO8859_15,
    CS_ISO8859_16,
    CS_CP437,
    CS_CP850,
    CS_CP1250,
    CS_CP1251,
    CS_CP1252,
    CS_CP1253,
    CS_CP1254,
    CS_CP1255,
    CS_CP1256,
    CS_CP1257,
    CS_CP1258,
    CS_KOI8_R,
    CS_KOI8_U,
    CS_MAC_ROMAN,
    CS_MAC_TURKISH,
    CS_MAC_CROATIAN,
    CS_MAC_ICELAND,
    CS_MAC_ROMANIAN,
    CS_MAC_GREEK,
    CS_MAC_CYRILLIC,
    CS_MAC_THAI,
    CS_MAC_CENTEURO,
    CS_MAC_SYMBOL,
    CS_MAC_DINGBATS,
    CS_MAC_ROMAN_OLD,
    CS_MAC_CROATIAN_OLD,
    CS_MAC_ICELAND_OLD,
    CS_MAC_ROMANIAN_OLD,
    CS_MAC_GREEK_OLD,
    CS_MAC_CYRILLIC_OLD,
    CS_MAC_UKRAINE,
    CS_MAC_VT100,
    CS_MAC_VT100_OLD,
    CS_VISCII,
    CS_HP_ROMAN8,
    CS_DEC_MCS,
    CS_UTF8
} charset_t;
71 | |
/*
 * Conversion state handed by pointer to charset_to_unicode() and
 * charset_from_unicode(), which update it as they process input.
 *
 * This header does not define how `s0' is used; treat it as opaque.
 * NOTE(review): presumably it should be zeroed before the first
 * call on a new stream - confirm against the implementation.
 */
typedef struct {
    unsigned long s0;
} charset_state;
75 | |
76 | /* |
77 | * Routine to convert a MB/SB character set to Unicode. |
78 | * |
79 | * This routine accepts some number of bytes, updates a state |
80 | * variable, and outputs some number of Unicode characters. There |
81 | * are no guarantees. You can't even guarantee that at most one |
82 | * Unicode character will be output per byte you feed in; for |
83 | * example, suppose you're reading UTF-8, you've seen E1 80, and |
84 | * then you suddenly see FE. Now you need to output _two_ error |
85 | * characters - one for the incomplete sequence E1 80, and one for |
86 | * the completely invalid UTF-8 byte FE. |
87 | * |
88 | * Returns the number of wide characters output; will never output |
89 | * more than the size of the buffer (as specified on input). |
90 | * Advances the `input' pointer and decrements `inlen', to indicate |
91 | * how far along the input string it got. |
92 | * |
93 | * The sequence of `errlen' wide characters pointed to by `errstr' |
94 | * will be used to indicate a conversion error. If `errstr' is |
95 | * NULL, `errlen' will be ignored, and the library will choose |
96 | * something sensible to do on its own. For Unicode, this will be |
97 | * U+FFFD (REPLACEMENT CHARACTER). |
98 | */ |
99 | |
100 | int charset_to_unicode(char **input, int *inlen, wchar_t *output, int outlen, |
101 | int charset, charset_state *state, |
102 | const wchar_t *errstr, int errlen); |
103 | |
104 | /* |
105 | * Routine to convert Unicode to an MB/SB character set. |
106 | * |
107 | * This routine accepts some number of Unicode characters, updates |
108 | * a state variable, and outputs some number of bytes. |
109 | * |
110 | * Returns the number of bytes characters output; will never output |
111 | * more than the size of the buffer (as specified on input), and |
112 | * will never output a partial MB character. Advances the `input' |
113 | * pointer and decrements `inlen', to indicate how far along the |
114 | * input string it got. |
115 | * |
116 | * The sequence of `errlen' characters pointed to by `errstr' will |
117 | * be used to indicate a conversion error. If `errstr' is NULL, |
118 | * `errlen' will be ignored, and the library will choose something |
119 | * sensible to do on its own (which will vary depending on the |
120 | * output charset). |
121 | */ |
122 | |
123 | int charset_from_unicode(wchar_t **input, int *inlen, char *output, int outlen, |
124 | int charset, charset_state *state, |
125 | const char *errstr, int errlen); |
126 | |
/*
 * Convert X11 encoding names to and from our charset identifiers.
 *
 * charset_to_xenc() takes a charset identifier and yields a name
 * string; charset_from_xenc() takes a name string and yields a
 * charset identifier.
 *
 * NOTE(review): the result for an unrecognised charset or name is
 * not specified here - presumably NULL and CS_NONE respectively;
 * confirm against the implementation.
 */
const char *charset_to_xenc(int charset);
int charset_from_xenc(const char *name);
132 | |
/*
 * Convert MIME encoding names to and from our charset identifiers.
 *
 * charset_to_mimeenc() takes a charset identifier and yields a name
 * string; charset_from_mimeenc() takes a name string and yields a
 * charset identifier.
 *
 * NOTE(review): the result for an unrecognised charset or name is
 * not specified here - presumably NULL and CS_NONE respectively;
 * confirm against the implementation.
 */
const char *charset_to_mimeenc(int charset);
int charset_from_mimeenc(const char *name);
138 | |
8ef2b196 |
/*
 * Convert Mac OS script/region/font to our charset identifiers.
 *
 * Takes a script code, a region code, a system version and a font
 * name, and yields a charset identifier.
 *
 * NOTE(review): the encodings of `script', `region' and `sysvers',
 * whether `fontname' may be NULL, and the result when nothing
 * matches are not specified here - confirm against the
 * implementation.
 */
int charset_from_macenc(int script, int region, int sysvers,
                        const char *fontname);
144 | |
2dc6356a |
145 | #endif /* charset_charset_h */ |