| 1 | /* |
| 2 | * charset.h - header file for general character set conversion |
| 3 | * routines. |
| 4 | */ |
| 5 | |
| 6 | #ifndef charset_charset_h |
| 7 | #define charset_charset_h |
| 8 | |
| 9 | #include <stddef.h> |
| 10 | |
/*
 * Identifiers for every single-byte and multibyte character set
 * known to this library.
 *
 * Enumerator values are assigned implicitly, so the order of the
 * list fixes the numeric value of each identifier; inserting an
 * entry in the middle renumbers everything after it.
 *
 * Note that the routines declared in this header take the
 * identifier as a plain `int' rather than as a `charset_t'.
 */
typedef enum {
    CS_NONE = 0,               /* used for reporting errors, etc */

    /* ISO 8859 family */
    CS_ISO8859_1,
    CS_ISO8859_1_X11,          /* X font encoding with VT100 glyphs */
    CS_ISO8859_2,
    CS_ISO8859_3,
    CS_ISO8859_4,
    CS_ISO8859_5,
    CS_ISO8859_6,
    CS_ISO8859_7,
    CS_ISO8859_8,
    CS_ISO8859_9,
    CS_ISO8859_10,
    CS_ISO8859_11,
    CS_ISO8859_13,
    CS_ISO8859_14,
    CS_ISO8859_15,
    CS_ISO8859_16,

    /* Numbered code pages */
    CS_CP437,
    CS_CP850,
    CS_CP852,
    CS_CP866,
    CS_CP1250,
    CS_CP1251,
    CS_CP1252,
    CS_CP1253,
    CS_CP1254,
    CS_CP1255,
    CS_CP1256,
    CS_CP1257,
    CS_CP1258,

    /* KOI8 variants */
    CS_KOI8_R,
    CS_KOI8_U,

    /* Mac OS encodings */
    CS_MAC_ROMAN,
    CS_MAC_TURKISH,
    CS_MAC_CROATIAN,
    CS_MAC_ICELAND,
    CS_MAC_ROMANIAN,
    CS_MAC_GREEK,
    CS_MAC_CYRILLIC,
    CS_MAC_THAI,
    CS_MAC_CENTEURO,
    CS_MAC_SYMBOL,
    CS_MAC_DINGBATS,
    CS_MAC_ROMAN_OLD,
    CS_MAC_CROATIAN_OLD,
    CS_MAC_ICELAND_OLD,
    CS_MAC_ROMANIAN_OLD,
    CS_MAC_GREEK_OLD,
    CS_MAC_CYRILLIC_OLD,
    CS_MAC_UKRAINE,
    CS_MAC_VT100,
    CS_MAC_VT100_OLD,

    /* Everything else */
    CS_VISCII,
    CS_HP_ROMAN8,
    CS_DEC_MCS,
    CS_UTF8
} charset_t;
| 73 | |
/*
 * Conversion state carried across successive calls to
 * charset_to_unicode() and charset_from_unicode(); both take a
 * pointer to one of these and update it.
 *
 * NOTE(review): the meaning of `s0' and the value it must hold
 * before the first call are not visible from this header -
 * presumably zero; confirm against the implementation.
 */
typedef struct {
    unsigned long s0;
} charset_state;
| 77 | |
| 78 | /* |
| 79 | * Routine to convert a MB/SB character set to Unicode. |
| 80 | * |
| 81 | * This routine accepts some number of bytes, updates a state |
| 82 | * variable, and outputs some number of Unicode characters. There |
| 83 | * are no guarantees. You can't even guarantee that at most one |
| 84 | * Unicode character will be output per byte you feed in; for |
| 85 | * example, suppose you're reading UTF-8, you've seen E1 80, and |
| 86 | * then you suddenly see FE. Now you need to output _two_ error |
| 87 | * characters - one for the incomplete sequence E1 80, and one for |
| 88 | * the completely invalid UTF-8 byte FE. |
| 89 | * |
| 90 | * Returns the number of wide characters output; will never output |
| 91 | * more than the size of the buffer (as specified on input). |
| 92 | * Advances the `input' pointer and decrements `inlen', to indicate |
| 93 | * how far along the input string it got. |
| 94 | * |
| 95 | * The sequence of `errlen' wide characters pointed to by `errstr' |
| 96 | * will be used to indicate a conversion error. If `errstr' is |
| 97 | * NULL, `errlen' will be ignored, and the library will choose |
| 98 | * something sensible to do on its own. For Unicode, this will be |
| 99 | * U+FFFD (REPLACEMENT CHARACTER). |
| 100 | */ |
| 101 | |
| 102 | int charset_to_unicode(const char **input, int *inlen, |
| 103 | wchar_t *output, int outlen, |
| 104 | int charset, charset_state *state, |
| 105 | const wchar_t *errstr, int errlen); |
| 106 | |
| 107 | /* |
| 108 | * Routine to convert Unicode to an MB/SB character set. |
| 109 | * |
| 110 | * This routine accepts some number of Unicode characters, updates |
| 111 | * a state variable, and outputs some number of bytes. |
| 112 | * |
| 113 | * Returns the number of bytes characters output; will never output |
| 114 | * more than the size of the buffer (as specified on input), and |
| 115 | * will never output a partial MB character. Advances the `input' |
| 116 | * pointer and decrements `inlen', to indicate how far along the |
| 117 | * input string it got. |
| 118 | * |
| 119 | * The sequence of `errlen' characters pointed to by `errstr' will |
| 120 | * be used to indicate a conversion error. If `errstr' is NULL, |
| 121 | * `errlen' will be ignored, and the library will choose something |
| 122 | * sensible to do on its own (which will vary depending on the |
| 123 | * output charset). |
| 124 | */ |
| 125 | |
| 126 | int charset_from_unicode(const wchar_t **input, int *inlen, |
| 127 | char *output, int outlen, |
| 128 | int charset, charset_state *state, |
| 129 | const char *errstr, int errlen); |
| 130 | |
/*
 * X11 encoding names <-> our charset identifiers.
 *
 * NOTE(review): what is returned for an unrecognised identifier
 * or name is not visible from this header - presumably NULL and
 * CS_NONE respectively; confirm against the implementation.
 */

/* Charset identifier -> X11 encoding name. */
const char *charset_to_xenc(int charset);

/* X11 encoding name -> charset identifier. */
int charset_from_xenc(const char *name);
| 136 | |
/*
 * MIME encoding names <-> our charset identifiers.
 *
 * NOTE(review): what is returned for an unrecognised identifier
 * or name is not visible from this header - presumably NULL and
 * CS_NONE respectively; confirm against the implementation.
 */

/* Charset identifier -> MIME encoding name. */
const char *charset_to_mimeenc(int charset);

/* MIME encoding name -> charset identifier. */
int charset_from_mimeenc(const char *name);
| 142 | |
/*
 * Our own encoding names <-> our charset identifiers.
 *
 * NOTE(review): what is returned for an unrecognised identifier
 * or name is not visible from this header - presumably NULL and
 * CS_NONE respectively; confirm against the implementation.
 */

/* Charset identifier -> our own encoding name. */
const char *charset_to_localenc(int charset);

/* Our own encoding name -> charset identifier. */
int charset_from_localenc(const char *name);

/*
 * NOTE(review): presumably yields the n-th charset identifier
 * that has a name of our own, so callers can enumerate them; the
 * indexing base and end-of-list value are not visible here -
 * confirm against the implementation.
 */
int charset_localenc_nth(int n);
| 150 | |
/*
 * Map a Mac OS script/region/font combination to one of our
 * charset identifiers.
 *
 * NOTE(review): the exact meaning of each argument is not visible
 * from this header - `sysvers' is presumably the Mac OS system
 * version, and whether `fontname' may be NULL is unknown; confirm
 * against the implementation.
 */
int charset_from_macenc(
    int script,
    int region,
    int sysvers,
    const char *fontname);
| 156 | |
| 157 | #endif /* charset_charset_h */ |