c6d25d8d |
1 | /* |
2 | * euc.c - routines to handle the various EUC multibyte encodings. |
3 | */ |
4 | |
5 | #ifndef ENUM_CHARSETS |
6 | |
7 | #include "charset.h" |
8 | #include "internal.h" |
9 | |
/*
 * Parameters describing one EUC variant. The generic reader and
 * writer in this file are driven entirely by one of these.
 */
struct euc {
    /*
     * Number of bytes making up one character in each of the three
     * non-ASCII code sets, not counting any SS2/SS3 introducer:
     *   [0] plain GR bytes, [1] after SS2, [2] after SS3.
     */
    int nchars[3];

    /*
     * Translate a completed reader state word (code set number in
     * the top nibble, accumulated bytes in the low 24 bits) into
     * the value the reader emits for it.
     */
    long int (*to_ucs)(unsigned long state);

    /*
     * Inverse mapping used by the writer: the result has the code
     * set number in bits 31:28 and the bytes to output in the low
     * 24 bits. A result of 0 means the character has no encoding.
     */
    unsigned long (*from_ucs)(long int ucs);
};
15 | |
16 | static void read_euc(charset_spec const *charset, long int input_chr, |
17 | charset_state *state, |
18 | void (*emit)(void *ctx, long int output), void *emitctx) |
19 | { |
20 | struct euc const *euc = (struct euc *)charset->data; |
21 | |
22 | /* |
23 | * For EUC input, our state variable divides into three parts: |
24 | * |
25 | * - Topmost nibble (bits 31:28) is nonzero if we're |
26 | * accumulating a multibyte character, and it indicates |
27 | * which section we're in: 1 for GR chars, 2 for things |
28 | * beginning with SS2, 3 for things beginning with SS3. |
29 | * |
30 | * - Next nibble (bits 27:24) indicates how many bytes of the |
31 | * character we've accumulated so far. |
32 | * |
33 | * - The rest (bits 23:0) are those bytes in full, accumulated |
34 | * as a large integer (so that seeing A1 A2 A3, in a |
35 | * hypothetical EUC whose GR encoding is three-byte, runs |
36 | * our state variable from 0 -> 0x110000A1 -> 0x1200A1A2 -> |
37 | * 0x13A1A2A3, at which point it gets translated and output |
38 | * and resets to zero). |
39 | */ |
40 | |
41 | if (state->s0 != 0) { |
42 | |
43 | /* |
44 | * At this point, no matter whether we had an SS2 or SS3 |
45 | * introducer or not, we _always_ expect a GR character. |
46 | * Anything else causes us to emit ERROR for an incomplete |
47 | * character, and then reset to state 0 to process the |
48 | * character in its own way. |
49 | */ |
50 | if (input_chr < 0xA1 || input_chr == 0xFF) { |
51 | emit(emitctx, ERROR); |
52 | state->s0 = 0; |
53 | } else |
54 | state->s0 = (((state->s0 & 0xFF000000) + 0x01000000) | |
55 | ((state->s0 & 0x0000FFFF) << 8) | input_chr); |
56 | |
57 | } |
58 | |
59 | if (state->s0 == 0) { |
60 | /* |
61 | * The input character determines which of the four |
62 | * possible charsets we're going to be in. |
63 | */ |
64 | if (input_chr < 0x80) { /* this is always ASCII */ |
65 | emit(emitctx, input_chr); |
66 | } else if (input_chr == 0x8E) {/* SS2 means charset 2 */ |
67 | state->s0 = 0x20000000; |
68 | } else if (input_chr == 0x8F) {/* SS3 means charset 3 */ |
69 | state->s0 = 0x30000000; |
70 | } else if (input_chr < 0xA1 || input_chr == 0xFF) { /* errors */ |
71 | emit(emitctx, ERROR); |
72 | } else { /* A1-FE means charset 1 */ |
73 | state->s0 = 0x11000000 | input_chr; |
74 | } |
75 | } |
76 | |
77 | /* |
78 | * Finally, if we have accumulated a complete character, output |
79 | * it. |
80 | */ |
81 | if (state->s0 != 0 && |
3cca0edf |
82 | ((state->s0 & 0x0F000000) >> 24) >= |
83 | (unsigned)euc->nchars[(state->s0 >> 28)-1]) { |
c6d25d8d |
84 | emit(emitctx, euc->to_ucs(state->s0)); |
85 | state->s0 = 0; |
86 | } |
87 | } |
88 | |
89 | /* |
90 | * All EUCs are stateless multi-byte encodings (in the sense that |
91 | * just after any character has been completed, the state is always |
92 | * the same); hence when writing them, there is no need to use the |
93 | * charset_state. |
94 | */ |
95 | |
96 | static int write_euc(charset_spec const *charset, long int input_chr, |
97 | charset_state *state, |
98 | void (*emit)(void *ctx, long int output), void *emitctx) |
99 | { |
100 | struct euc const *euc = (struct euc *)charset->data; |
101 | unsigned long c; |
102 | int cset, len; |
103 | |
104 | UNUSEDARG(state); |
105 | |
106 | if (input_chr == -1) |
107 | return TRUE; /* stateless; no cleanup required */ |
108 | |
109 | /* ASCII is the easy bit, and is always the same. */ |
110 | if (input_chr < 0x80) { |
111 | emit(emitctx, input_chr); |
112 | return TRUE; |
113 | } |
114 | |
115 | c = euc->from_ucs(input_chr); |
116 | if (!c) { |
117 | return FALSE; |
118 | } |
119 | |
120 | cset = c >> 28; |
121 | len = euc->nchars[cset-1]; |
122 | c &= 0xFFFFFF; |
123 | |
124 | if (cset > 1) |
125 | emit(emitctx, 0x8C + cset); /* SS2/SS3 */ |
126 | |
127 | while (len--) |
128 | emit(emitctx, (c >> (8*len)) & 0xFF); |
129 | return TRUE; |
130 | } |
131 | |
132 | /* |
133 | * EUC-CN encodes GB2312 only. |
134 | */ |
135 | static long int euc_cn_to_ucs(unsigned long state) |
136 | { |
137 | switch (state >> 28) { |
138 | case 1: return gb2312_to_unicode(((state >> 8) & 0xFF) - 0xA1, |
139 | ((state ) & 0xFF) - 0xA1); |
140 | default: return ERROR; |
141 | } |
142 | } |
/*
 * Find the EUC-CN encoding of ucs. On success the result is code
 * set 1 (0x10000000) combined with the two GR bytes obtained by
 * adding 0xA1 to the values unicode_to_gb2312 stores. Returns 0 if
 * unicode_to_gb2312 reports failure.
 */
static unsigned long euc_cn_from_ucs(long int ucs)
{
    int row, col;

    if (!unicode_to_gb2312(ucs, &row, &col))
        return 0;

    return 0x10000000 | ((row + 0xA1) << 8) | (col + 0xA1);
}
151 | static const struct euc euc_cn = { |
152 | {2,0,0}, euc_cn_to_ucs, euc_cn_from_ucs |
153 | }; |
154 | const charset_spec charset_CS_EUC_CN = { |
155 | CS_EUC_CN, read_euc, write_euc, &euc_cn |
156 | }; |
157 | |
158 | /* |
159 | * EUC-KR encodes KS X 1001 only. |
160 | */ |
161 | static long int euc_kr_to_ucs(unsigned long state) |
162 | { |
163 | switch (state >> 28) { |
164 | case 1: return ksx1001_to_unicode(((state >> 8) & 0xFF) - 0xA1, |
165 | ((state ) & 0xFF) - 0xA1); |
166 | default: return ERROR; |
167 | } |
168 | } |
/*
 * Find the EUC-KR encoding of ucs. On success the result is code
 * set 1 (0x10000000) combined with the two GR bytes obtained by
 * adding 0xA1 to the values unicode_to_ksx1001 stores. Returns 0 if
 * unicode_to_ksx1001 reports failure.
 */
static unsigned long euc_kr_from_ucs(long int ucs)
{
    int row, col;

    if (!unicode_to_ksx1001(ucs, &row, &col))
        return 0;

    return 0x10000000 | ((row + 0xA1) << 8) | (col + 0xA1);
}
177 | static const struct euc euc_kr = { |
178 | {2,0,0}, euc_kr_to_ucs, euc_kr_from_ucs |
179 | }; |
180 | const charset_spec charset_CS_EUC_KR = { |
181 | CS_EUC_KR, read_euc, write_euc, &euc_kr |
182 | }; |
183 | |
184 | /* |
185 | * EUC-JP encodes several character sets. |
186 | */ |
187 | static long int euc_jp_to_ucs(unsigned long state) |
188 | { |
189 | switch (state >> 28) { |
190 | case 1: return jisx0208_to_unicode(((state >> 8) & 0xFF) - 0xA1, |
191 | ((state ) & 0xFF) - 0xA1); |
192 | case 2: |
193 | /* |
194 | * This is the top half of JIS X 0201. That means A1-DF map |
195 | * to FF61-FF9F, and nothing else is valid. |
196 | */ |
197 | { |
198 | int c = state & 0xFF; |
199 | if (c >= 0xA1 && c <= 0xDF) |
200 | return c + (0xFF61 - 0xA1); |
201 | else |
202 | return ERROR; |
203 | } |
204 | /* (no break needed since all control paths have returned) */ |
205 | case 3: return jisx0212_to_unicode(((state >> 8) & 0xFF) - 0xA1, |
206 | ((state ) & 0xFF) - 0xA1); |
207 | default: return ERROR; /* placate optimisers */ |
208 | } |
209 | } |
/*
 * Find the EUC-JP encoding of ucs. The candidates are tried in
 * this order:
 *
 *  - FF61-FF9F: code set 2 (0x20000000) with a single byte in
 *    A1-DF;
 *  - unicode_to_jisx0208 succeeds: code set 1 (0x10000000) with
 *    two bytes;
 *  - unicode_to_jisx0212 succeeds: code set 3 (0x30000000) with
 *    two bytes.
 *
 * The two-byte forms add 0xA1 to each value the lookup stores.
 * Returns 0 if none of the above applies.
 */
static unsigned long euc_jp_from_ucs(long int ucs)
{
    int row, col;

    if (ucs >= 0xFF61 && ucs <= 0xFF9F)
        return 0x20000000 | (ucs - (0xFF61 - 0xA1));

    if (unicode_to_jisx0208(ucs, &row, &col))
        return 0x10000000 | ((row + 0xA1) << 8) | (col + 0xA1);

    if (unicode_to_jisx0212(ucs, &row, &col))
        return 0x30000000 | ((row + 0xA1) << 8) | (col + 0xA1);

    return 0;
}
222 | static const struct euc euc_jp = { |
223 | {2,1,2}, euc_jp_to_ucs, euc_jp_from_ucs |
224 | }; |
225 | const charset_spec charset_CS_EUC_JP = { |
226 | CS_EUC_JP, read_euc, write_euc, &euc_jp |
227 | }; |
228 | |
229 | #else /* ENUM_CHARSETS */ |
230 | |
/*
 * With ENUM_CHARSETS defined, none of the code above is compiled
 * and this file contributes only the list below: one ENUM_CHARSET()
 * invocation per charset implemented here. ENUM_CHARSET itself is
 * not defined in this file; presumably whoever includes the file in
 * this mode supplies it.
 */
ENUM_CHARSET(CS_EUC_CN)
ENUM_CHARSET(CS_EUC_KR)
ENUM_CHARSET(CS_EUC_JP)
234 | |
235 | #endif /* ENUM_CHARSETS */ |