| 1 | /* |
| 2 | * euc.c - routines to handle the various EUC multibyte encodings. |
| 3 | */ |
| 4 | |
| 5 | #ifndef ENUM_CHARSETS |
| 6 | |
| 7 | #include "charset.h" |
| 8 | #include "internal.h" |
| 9 | |
/*
 * Description of one particular EUC encoding, consumed by the
 * generic read_euc() and write_euc() routines.
 */
struct euc {
    /*
     * How many GR bytes make up a character in each multibyte code
     * set, indexed as: [0] plain GR, [1] SS2+GR, [2] SS3+GR. The
     * tables in this file use 0 for a code set the encoding lacks.
     */
    int nchars[3];

    /*
     * Convert a fully accumulated state word (code set number in
     * the top nibble, character bytes in the low 24 bits) into the
     * value to be emitted.
     */
    long int (*to_ucs)(unsigned long state);

    /*
     * Convert a character code into a word with the code set number
     * (1-3) in the top nibble and the bytes to output in the low 24
     * bits. write_euc() treats a return of 0 as "not encodable".
     */
    unsigned long (*from_ucs)(long int ucs);
};
| 15 | |
| 16 | static void read_euc(charset_spec const *charset, long int input_chr, |
| 17 | charset_state *state, |
| 18 | void (*emit)(void *ctx, long int output), void *emitctx) |
| 19 | { |
| 20 | struct euc const *euc = (struct euc *)charset->data; |
| 21 | |
| 22 | /* |
| 23 | * For EUC input, our state variable divides into three parts: |
| 24 | * |
| 25 | * - Topmost nibble (bits 31:28) is nonzero if we're |
| 26 | * accumulating a multibyte character, and it indicates |
| 27 | * which section we're in: 1 for GR chars, 2 for things |
| 28 | * beginning with SS2, 3 for things beginning with SS3. |
| 29 | * |
| 30 | * - Next nibble (bits 27:24) indicates how many bytes of the |
| 31 | * character we've accumulated so far. |
| 32 | * |
| 33 | * - The rest (bits 23:0) are those bytes in full, accumulated |
| 34 | * as a large integer (so that seeing A1 A2 A3, in a |
| 35 | * hypothetical EUC whose GR encoding is three-byte, runs |
| 36 | * our state variable from 0 -> 0x110000A1 -> 0x1200A1A2 -> |
| 37 | * 0x13A1A2A3, at which point it gets translated and output |
| 38 | * and resets to zero). |
| 39 | */ |
| 40 | |
| 41 | if (state->s0 != 0) { |
| 42 | |
| 43 | /* |
| 44 | * At this point, no matter whether we had an SS2 or SS3 |
| 45 | * introducer or not, we _always_ expect a GR character. |
| 46 | * Anything else causes us to emit ERROR for an incomplete |
| 47 | * character, and then reset to state 0 to process the |
| 48 | * character in its own way. |
| 49 | */ |
| 50 | if (input_chr < 0xA1 || input_chr == 0xFF) { |
| 51 | emit(emitctx, ERROR); |
| 52 | state->s0 = 0; |
| 53 | } else |
| 54 | state->s0 = (((state->s0 & 0xFF000000) + 0x01000000) | |
| 55 | ((state->s0 & 0x0000FFFF) << 8) | input_chr); |
| 56 | |
| 57 | } |
| 58 | |
| 59 | if (state->s0 == 0) { |
| 60 | /* |
| 61 | * The input character determines which of the four |
| 62 | * possible charsets we're going to be in. |
| 63 | */ |
| 64 | if (input_chr < 0x80) { /* this is always ASCII */ |
| 65 | emit(emitctx, input_chr); |
| 66 | } else if (input_chr == 0x8E) {/* SS2 means charset 2 */ |
| 67 | state->s0 = 0x20000000; |
| 68 | } else if (input_chr == 0x8F) {/* SS3 means charset 3 */ |
| 69 | state->s0 = 0x30000000; |
| 70 | } else if (input_chr < 0xA1 || input_chr == 0xFF) { /* errors */ |
| 71 | emit(emitctx, ERROR); |
| 72 | } else { /* A1-FE means charset 1 */ |
| 73 | state->s0 = 0x11000000 | input_chr; |
| 74 | } |
| 75 | } |
| 76 | |
| 77 | /* |
| 78 | * Finally, if we have accumulated a complete character, output |
| 79 | * it. |
| 80 | */ |
| 81 | if (state->s0 != 0 && |
| 82 | ((state->s0 & 0x0F000000) >> 24) >= |
| 83 | (unsigned)euc->nchars[(state->s0 >> 28)-1]) { |
| 84 | emit(emitctx, euc->to_ucs(state->s0)); |
| 85 | state->s0 = 0; |
| 86 | } |
| 87 | } |
| 88 | |
| 89 | /* |
| 90 | * All EUCs are stateless multi-byte encodings (in the sense that |
| 91 | * just after any character has been completed, the state is always |
| 92 | * the same); hence when writing them, there is no need to use the |
| 93 | * charset_state. |
| 94 | */ |
| 95 | |
| 96 | static int write_euc(charset_spec const *charset, long int input_chr, |
| 97 | charset_state *state, |
| 98 | void (*emit)(void *ctx, long int output), void *emitctx) |
| 99 | { |
| 100 | struct euc const *euc = (struct euc *)charset->data; |
| 101 | unsigned long c; |
| 102 | int cset, len; |
| 103 | |
| 104 | UNUSEDARG(state); |
| 105 | |
| 106 | if (input_chr == -1) |
| 107 | return TRUE; /* stateless; no cleanup required */ |
| 108 | |
| 109 | /* ASCII is the easy bit, and is always the same. */ |
| 110 | if (input_chr < 0x80) { |
| 111 | emit(emitctx, input_chr); |
| 112 | return TRUE; |
| 113 | } |
| 114 | |
| 115 | c = euc->from_ucs(input_chr); |
| 116 | if (!c) { |
| 117 | return FALSE; |
| 118 | } |
| 119 | |
| 120 | cset = c >> 28; |
| 121 | len = euc->nchars[cset-1]; |
| 122 | c &= 0xFFFFFF; |
| 123 | |
| 124 | if (cset > 1) |
| 125 | emit(emitctx, 0x8C + cset); /* SS2/SS3 */ |
| 126 | |
| 127 | while (len--) |
| 128 | emit(emitctx, (c >> (8*len)) & 0xFF); |
| 129 | return TRUE; |
| 130 | } |
| 131 | |
| 132 | /* |
| 133 | * EUC-CN encodes GB2312 only. |
| 134 | */ |
| 135 | static long int euc_cn_to_ucs(unsigned long state) |
| 136 | { |
| 137 | switch (state >> 28) { |
| 138 | case 1: return gb2312_to_unicode(((state >> 8) & 0xFF) - 0xA1, |
| 139 | ((state ) & 0xFF) - 0xA1); |
| 140 | default: return ERROR; |
| 141 | } |
| 142 | } |
/*
 * Encode a character for EUC-CN. On success the result has code set
 * 1 in the top nibble and the two output bytes (the values from
 * unicode_to_gb2312(), each offset by A1) below it; returns 0 if
 * unicode_to_gb2312() reports no mapping.
 */
static unsigned long euc_cn_from_ucs(long int ucs)
{
    int row, col;

    if (!unicode_to_gb2312(ucs, &row, &col))
        return 0;

    return 0x10000000 | ((row+0xA1) << 8) | (col+0xA1);
}
| 151 | static const struct euc euc_cn = { |
| 152 | {2,0,0}, euc_cn_to_ucs, euc_cn_from_ucs |
| 153 | }; |
| 154 | const charset_spec charset_CS_EUC_CN = { |
| 155 | CS_EUC_CN, read_euc, write_euc, &euc_cn |
| 156 | }; |
| 157 | |
| 158 | /* |
| 159 | * EUC-KR encodes KS X 1001 only. |
| 160 | */ |
| 161 | static long int euc_kr_to_ucs(unsigned long state) |
| 162 | { |
| 163 | switch (state >> 28) { |
| 164 | case 1: return ksx1001_to_unicode(((state >> 8) & 0xFF) - 0xA1, |
| 165 | ((state ) & 0xFF) - 0xA1); |
| 166 | default: return ERROR; |
| 167 | } |
| 168 | } |
/*
 * Encode a character for EUC-KR. On success the result has code set
 * 1 in the top nibble and the two output bytes (the values from
 * unicode_to_ksx1001(), each offset by A1) below it; returns 0 if
 * unicode_to_ksx1001() reports no mapping.
 */
static unsigned long euc_kr_from_ucs(long int ucs)
{
    int row, col;

    if (!unicode_to_ksx1001(ucs, &row, &col))
        return 0;

    return 0x10000000 | ((row+0xA1) << 8) | (col+0xA1);
}
| 177 | static const struct euc euc_kr = { |
| 178 | {2,0,0}, euc_kr_to_ucs, euc_kr_from_ucs |
| 179 | }; |
| 180 | const charset_spec charset_CS_EUC_KR = { |
| 181 | CS_EUC_KR, read_euc, write_euc, &euc_kr |
| 182 | }; |
| 183 | |
| 184 | /* |
| 185 | * EUC-JP encodes several character sets. |
| 186 | */ |
| 187 | static long int euc_jp_to_ucs(unsigned long state) |
| 188 | { |
| 189 | switch (state >> 28) { |
| 190 | case 1: return jisx0208_to_unicode(((state >> 8) & 0xFF) - 0xA1, |
| 191 | ((state ) & 0xFF) - 0xA1); |
| 192 | case 2: |
| 193 | /* |
| 194 | * This is the top half of JIS X 0201. That means A1-DF map |
| 195 | * to FF61-FF9F, and nothing else is valid. |
| 196 | */ |
| 197 | { |
| 198 | int c = state & 0xFF; |
| 199 | if (c >= 0xA1 && c <= 0xDF) |
| 200 | return c + (0xFF61 - 0xA1); |
| 201 | else |
| 202 | return ERROR; |
| 203 | } |
| 204 | /* (no break needed since all control paths have returned) */ |
| 205 | case 3: return jisx0212_to_unicode(((state >> 8) & 0xFF) - 0xA1, |
| 206 | ((state ) & 0xFF) - 0xA1); |
| 207 | default: return ERROR; /* placate optimisers */ |
| 208 | } |
| 209 | } |
/*
 * Encode a character for EUC-JP. The candidates are tried in this
 * order: FF61-FF9F as a single byte in code set 2, then
 * unicode_to_jisx0208() for code set 1, then unicode_to_jisx0212()
 * for code set 3. The result carries the code set in its top nibble
 * and the output bytes below it; 0 means no mapping was found.
 */
static unsigned long euc_jp_from_ucs(long int ucs)
{
    int row, col;

    if (ucs >= 0xFF61 && ucs <= 0xFF9F)
        return 0x20000000 | (ucs - (0xFF61 - 0xA1));

    if (unicode_to_jisx0208(ucs, &row, &col))
        return 0x10000000 | ((row+0xA1) << 8) | (col+0xA1);

    if (unicode_to_jisx0212(ucs, &row, &col))
        return 0x30000000 | ((row+0xA1) << 8) | (col+0xA1);

    return 0;
}
| 222 | static const struct euc euc_jp = { |
| 223 | {2,1,2}, euc_jp_to_ucs, euc_jp_from_ucs |
| 224 | }; |
| 225 | const charset_spec charset_CS_EUC_JP = { |
| 226 | CS_EUC_JP, read_euc, write_euc, &euc_jp |
| 227 | }; |
| 228 | |
| 229 | /* |
| 230 | * EUC-TW encodes CNS 11643 (all planes). |
| 231 | */ |
| 232 | static long int euc_tw_to_ucs(unsigned long state) |
| 233 | { |
| 234 | int plane; |
| 235 | switch (state >> 28) { |
| 236 | case 1: return cns11643_to_unicode(0, ((state >> 8) & 0xFF) - 0xA1, |
| 237 | ((state ) & 0xFF) - 0xA1); |
| 238 | case 2: |
| 239 | plane = ((state >> 8) & 0xFF) - 0xA1; |
| 240 | if (plane >= 7) return ERROR; |
| 241 | return cns11643_to_unicode(plane, ((state >> 8) & 0xFF) - 0xA1, |
| 242 | ((state ) & 0xFF) - 0xA1); |
| 243 | default: return ERROR; |
| 244 | } |
| 245 | } |
/*
 * Encode a character for EUC-TW using unicode_to_cns11643(). Plane 0
 * is written as a two-byte code set 1 character; any other plane is
 * written in code set 2 as three bytes (plane, row, column), with
 * the plane byte in bits 23:16. Every byte is offset by A1. Returns
 * 0 if no mapping is found.
 */
static unsigned long euc_tw_from_ucs(long int ucs)
{
    int plane, row, col;

    if (!unicode_to_cns11643(ucs, &plane, &row, &col))
        return 0;

    if (plane != 0)
        return 0x20000000 |
            ((plane + 0xA1) << 16) | ((row+0xA1) << 8) | (col+0xA1);

    return 0x10000000 | ((row+0xA1) << 8) | (col+0xA1);
}
| 258 | static const struct euc euc_tw = { |
| 259 | {2,3,0}, euc_tw_to_ucs, euc_tw_from_ucs |
| 260 | }; |
| 261 | const charset_spec charset_CS_EUC_TW = { |
| 262 | CS_EUC_TW, read_euc, write_euc, &euc_tw |
| 263 | }; |
| 264 | |
| 265 | #else /* ENUM_CHARSETS */ |
| 266 | |
/*
 * This branch is compiled only when ENUM_CHARSETS is defined, in
 * which case none of the code above is seen and the file reduces to
 * one ENUM_CHARSET() invocation per charset it implements.
 * NOTE(review): ENUM_CHARSET is defined by whoever includes this
 * file; presumably it is used to build a list of charsets - confirm
 * against the including file.
 */
ENUM_CHARSET(CS_EUC_CN)
ENUM_CHARSET(CS_EUC_KR)
ENUM_CHARSET(CS_EUC_JP)
ENUM_CHARSET(CS_EUC_TW)
| 271 | |
| 272 | #endif /* ENUM_CHARSETS */ |