| 1 | /* |
| 2 | * charset.h - header file for general character set conversion |
| 3 | * routines. |
| 4 | */ |
| 5 | |
| 6 | #ifndef charset_charset_h |
| 7 | #define charset_charset_h |
| 8 | |
| 9 | #include <stddef.h> |
| 10 | |
/*
 * Identifiers for every character set this library knows about,
 * single-byte and multibyte alike.
 *
 * No enumerator is given an explicit value, so each identifier's
 * number is simply its position in the list: CS_NONE is 0 and
 * CS_LIMIT is one more than the last real entry. Inserting or
 * reordering entries therefore renumbers everything that follows.
 *
 * Note that the list goes straight from CS_ISO8859_11 to
 * CS_ISO8859_13; there is no entry numbered 12.
 */
typedef enum {
    CS_NONE,                    /* used for reporting errors, etc */
    CS_ASCII,                   /* ordinary US-ASCII is worth having! */

    /* ISO 8859 parts */
    CS_ISO8859_1,
    CS_ISO8859_1_X11,           /* X font encoding with VT100 glyphs */
    CS_ISO8859_2, CS_ISO8859_3, CS_ISO8859_4, CS_ISO8859_5,
    CS_ISO8859_6, CS_ISO8859_7, CS_ISO8859_8, CS_ISO8859_9,
    CS_ISO8859_10, CS_ISO8859_11, CS_ISO8859_13, CS_ISO8859_14,
    CS_ISO8859_15, CS_ISO8859_16,

    /* CPnnn code pages */
    CS_CP437, CS_CP850, CS_CP852, CS_CP866, CS_CP874,
    CS_CP1250, CS_CP1251, CS_CP1252, CS_CP1253, CS_CP1254,
    CS_CP1255, CS_CP1256, CS_CP1257, CS_CP1258,

    /* KOI8 variants */
    CS_KOI8_R, CS_KOI8_U, CS_KOI8_RU,

    CS_JISX0201,

    /* Mac OS encodings, including the *_OLD variants */
    CS_MAC_ROMAN, CS_MAC_TURKISH, CS_MAC_CROATIAN, CS_MAC_ICELAND,
    CS_MAC_ROMANIAN, CS_MAC_GREEK, CS_MAC_CYRILLIC, CS_MAC_THAI,
    CS_MAC_CENTEURO, CS_MAC_SYMBOL, CS_MAC_DINGBATS,
    CS_MAC_ROMAN_OLD, CS_MAC_CROATIAN_OLD, CS_MAC_ICELAND_OLD,
    CS_MAC_ROMANIAN_OLD, CS_MAC_GREEK_OLD, CS_MAC_CYRILLIC_OLD,
    CS_MAC_UKRAINE, CS_MAC_VT100, CS_MAC_VT100_OLD,

    CS_VISCII, CS_HP_ROMAN8, CS_DEC_MCS,

    /* UTF-8, UTF-7 and UTF-16 flavours */
    CS_UTF8, CS_UTF7, CS_UTF7_CONSERVATIVE,
    CS_UTF16, CS_UTF16BE, CS_UTF16LE,

    /* EUC and ISO 2022 based encodings, and other multibyte sets */
    CS_EUC_JP, CS_EUC_CN, CS_EUC_KR,
    CS_ISO2022_JP, CS_ISO2022_KR,
    CS_BIG5, CS_SHIFT_JIS, CS_HZ, CS_CP949,

    CS_PDF, CS_PSSTD, CS_CTEXT, CS_ISO2022,
    CS_BS4730, CS_DEC_GRAPHICS, CS_EUC_TW,

    CS_LIMIT                    /* dummy value indicating extent of enum */
} charset_t;
| 99 | |
/*
 * Conversion state handed to the translation routines through their
 * `state' parameter. It consists of two unsigned long words; this
 * header does not say what either word means.
 */
typedef struct {
    unsigned long s0;
    unsigned long s1;
} charset_state;

/*
 * Brace initialiser giving a charset_state its starting value, for
 * use at the point of declaration:
 *
 *     charset_state mystate = CHARSET_INIT_STATE;
 */
#define CHARSET_INIT_STATE { 0L, 0L }

/*
 * The same starting value as a ready-made object, for the cases
 * where an initialiser is no use and a plain structure assignment
 * is wanted instead:
 *
 *     mystate = charset_init_state;
 */
extern const charset_state charset_init_state;
| 118 | |
| 119 | /* |
| 120 | * Routine to convert a MB/SB character set to Unicode. |
| 121 | * |
| 122 | * This routine accepts some number of bytes, updates a state |
| 123 | * variable, and outputs some number of Unicode characters. There |
| 124 | * are no guarantees. You can't even guarantee that at most one |
| 125 | * Unicode character will be output per byte you feed in; for |
| 126 | * example, suppose you're reading UTF-8, you've seen E1 80, and |
| 127 | * then you suddenly see FE. Now you need to output _two_ error |
| 128 | * characters - one for the incomplete sequence E1 80, and one for |
| 129 | * the completely invalid UTF-8 byte FE. |
| 130 | * |
| 131 | * Returns the number of wide characters output; will never output |
| 132 | * more than the size of the buffer (as specified on input). |
| 133 | * Advances the `input' pointer and decrements `inlen', to indicate |
| 134 | * how far along the input string it got. |
| 135 | * |
| 136 | * The sequence of `errlen' wide characters pointed to by `errstr' |
| 137 | * will be used to indicate a conversion error. If `errstr' is |
| 138 | * NULL, `errlen' will be ignored, and the library will choose |
| 139 | * something sensible to do on its own. For Unicode, this will be |
| 140 | * U+FFFD (REPLACEMENT CHARACTER). |
| 141 | * |
| 142 | * `output' may be NULL, in which case the entire translation will |
| 143 | * be performed in theory (e.g. a dry run to work out how much |
| 144 | * space needs to be allocated for the real thing). `outlen' may |
| 145 | * also be negative, indicating an unlimited buffer length |
| 146 | * (although this is almost certainly unwise if `output' is _not_ |
| 147 | * NULL). |
| 148 | */ |
| 149 | |
| 150 | int charset_to_unicode(const char **input, int *inlen, |
| 151 | wchar_t *output, int outlen, |
| 152 | int charset, charset_state *state, |
| 153 | const wchar_t *errstr, int errlen); |
| 154 | |
| 155 | /* |
| 156 | * Routine to convert Unicode to an MB/SB character set. |
| 157 | * |
| 158 | * This routine accepts some number of Unicode characters, updates |
| 159 | * a state variable, and outputs some number of bytes. |
| 160 | * |
| 161 | * Returns the number of bytes output; will never output more than |
| 162 | * the size of the buffer (as specified on input), and will never |
| 163 | * output a partial MB character. Advances the `input' pointer and |
| 164 | * decrements `inlen', to indicate how far along the input string |
| 165 | * it got. |
| 166 | * |
| 167 | * If `error' is non-NULL and a character is found which cannot be |
| 168 | * expressed in the output charset, conversion will terminate at |
| 169 | * that character (so `input' points to the offending character) |
| 170 | * and `*error' will be set to TRUE; if `error' is non-NULL and no |
| 171 | * difficult characters are encountered, `*error' will be set to |
| 172 | * FALSE. If `error' is NULL, difficult characters will simply be |
| 173 | * ignored. |
| 174 | * |
| 175 | * If `input' is NULL, this routine will output the necessary bytes |
| 176 | * to reset the encoding state in any way which might be required |
| 177 | * at the end of an output piece of text. |
| 178 | * |
| 179 | * `output' may be NULL, in which case the entire translation will |
| 180 | * be performed in theory (e.g. a dry run to work out how much |
| 181 | * space needs to be allocated for the real thing). `outlen' may |
| 182 | * also be negative, indicating an unlimited buffer length |
| 183 | * (although this is almost certainly unwise if `output' is _not_ |
| 184 | * NULL). |
| 185 | */ |
| 186 | |
| 187 | int charset_from_unicode(const wchar_t **input, int *inlen, |
| 188 | char *output, int outlen, |
| 189 | int charset, charset_state *state, int *error); |
| 190 | |
/*
 * Map between X11 encoding names and our charset identifiers, in
 * both directions.
 *
 * NOTE(review): this header does not state what either function
 * returns when there is no counterpart (a NULL name or CS_NONE
 * would be the obvious guess) - confirm against the implementation.
 */
const char *charset_to_xenc(int charset);
int charset_from_xenc(const char *name);
| 196 | |
/*
 * Map between MIME encoding names and our charset identifiers, in
 * both directions.
 *
 * NOTE(review): this header does not state what either function
 * returns when there is no counterpart (a NULL name or CS_NONE
 * would be the obvious guess) - confirm against the implementation.
 */
const char *charset_to_mimeenc(int charset);
int charset_from_mimeenc(const char *name);
| 202 | |
/*
 * Map between this library's own encoding names and our charset
 * identifiers, in both directions.
 *
 * NOTE(review): charset_localenc_nth() is not described anywhere
 * in this header. Presumably it yields the charset identifier at
 * position `n' in the list of locally named charsets, letting a
 * caller enumerate them - confirm against the implementation,
 * including what it returns once `n' runs off the end.
 */
const char *charset_to_localenc(int charset);
int charset_from_localenc(const char *name);
int charset_localenc_nth(int n);
| 210 | |
/*
 * Work out one of our charset identifiers from a Mac OS script,
 * region and font.
 *
 * NOTE(review): the role of `sysvers' is not documented here.
 * Presumably it is a Mac OS system version used to choose between
 * variants of an encoding - confirm against the implementation.
 */
int charset_from_macenc(int script,
                        int region,
                        int sysvers,
                        const char *fontname);
| 216 | |
/*
 * Map between GNU Emacs coding system symbols and our charset
 * identifiers, in both directions.
 *
 * NOTE(review): this header does not state what either function
 * returns when there is no counterpart (a NULL name or CS_NONE
 * would be the obvious guess) - confirm against the implementation.
 */
const char *charset_to_emacsenc(int charset);
int charset_from_emacsenc(const char *name);
| 223 | |
/*
 * Replace a charset identifier with that of a superset charset
 * which it is commonly mistaken for.
 *
 * The motivating case is mail: MUAs that label their output as
 * ASCII or ISO8859-1 often turn out in practice to be sending
 * CP1252 quote characters. When parsing incoming mail it is
 * therefore prudent to treat both labels as aliases for CP1252.
 * Because CP1252 is a superset of both, mail that really was
 * labelled correctly is still parsed correctly.
 */
int charset_upgrade(int charset);
| 234 | |
/*
 * Report whether `charset' is a vaguely sensible superset of
 * ASCII: TRUE if it is, FALSE if it is not. 7-bit encoding formats
 * such as HZ and UTF-7 are the kind of thing that gets FALSE.
 */
int charset_contains_ascii(int charset);
| 241 | |
/*
 * Try to deduce which CS_* identifier matches the charset of the
 * current C locale.
 *
 * The result is always a valid charset: if nothing at all can be
 * worked out, CS_ASCII is returned as a fallback.
 *
 * (For this to do the right thing, the caller should already have
 * called setlocale(LC_CTYPE, "").)
 */
int charset_from_locale(void);
| 252 | |
/*
 * Report whether a charset identifier names a charset that can
 * actually be used.
 *
 * Not every value in the charset enumeration does. CS_NONE is the
 * obvious exception, and from time to time other slots are
 * reserved in the enum before they go into service.
 *
 * Clients can use this to visit _all_ supported charsets:
 *
 *     for (cs = 0; cs < CS_LIMIT; cs++)
 *         if (charset_exists(cs))
 *             do_stuff_with(cs);
 */
int charset_exists(int charset);
| 267 | |
| 268 | #endif /* charset_charset_h */ |