Silly of me to overlook it: another obvious way you might like to
[sgt/charset] / euc.c
1 /*
2 * euc.c - routines to handle the various EUC multibyte encodings.
3 */
4
5 #ifndef ENUM_CHARSETS
6
7 #include "charset.h"
8 #include "internal.h"
9
/*
 * Description of one EUC variant, shared by the generic reader and
 * writer below.
 */
struct euc {
    /*
     * Number of GR bytes (A1-FE) that make up one character in each
     * code set: [0] for plain GR sequences, [1] for sequences
     * introduced by SS2, [2] for sequences introduced by SS3. The
     * SS2/SS3 introducer byte itself is not counted.
     */
    int nchars[3];

    /*
     * Translate a completed decoder state word (code set number in
     * bits 31:28, accumulated bytes in bits 23:0) to a Unicode value.
     */
    long (*to_ucs)(unsigned long state);

    /*
     * Translate a Unicode value to a packed word with the code set
     * number in bits 31:28 and the bytes to emit in bits 23:0, or
     * return 0 if the character cannot be represented.
     */
    unsigned long (*from_ucs)(long ucs);
};
15
16 static void read_euc(charset_spec const *charset, long int input_chr,
17 charset_state *state,
18 void (*emit)(void *ctx, long int output), void *emitctx)
19 {
20 struct euc const *euc = (struct euc *)charset->data;
21
22 /*
23 * For EUC input, our state variable divides into three parts:
24 *
25 * - Topmost nibble (bits 31:28) is nonzero if we're
26 * accumulating a multibyte character, and it indicates
27 * which section we're in: 1 for GR chars, 2 for things
28 * beginning with SS2, 3 for things beginning with SS3.
29 *
30 * - Next nibble (bits 27:24) indicates how many bytes of the
31 * character we've accumulated so far.
32 *
33 * - The rest (bits 23:0) are those bytes in full, accumulated
34 * as a large integer (so that seeing A1 A2 A3, in a
35 * hypothetical EUC whose GR encoding is three-byte, runs
36 * our state variable from 0 -> 0x110000A1 -> 0x1200A1A2 ->
37 * 0x13A1A2A3, at which point it gets translated and output
38 * and resets to zero).
39 */
40
41 if (state->s0 != 0) {
42
43 /*
44 * At this point, no matter whether we had an SS2 or SS3
45 * introducer or not, we _always_ expect a GR character.
46 * Anything else causes us to emit ERROR for an incomplete
47 * character, and then reset to state 0 to process the
48 * character in its own way.
49 */
50 if (input_chr < 0xA1 || input_chr == 0xFF) {
51 emit(emitctx, ERROR);
52 state->s0 = 0;
53 } else
54 state->s0 = (((state->s0 & 0xFF000000) + 0x01000000) |
55 ((state->s0 & 0x0000FFFF) << 8) | input_chr);
56
57 }
58
59 if (state->s0 == 0) {
60 /*
61 * The input character determines which of the four
62 * possible charsets we're going to be in.
63 */
64 if (input_chr < 0x80) { /* this is always ASCII */
65 emit(emitctx, input_chr);
66 } else if (input_chr == 0x8E) {/* SS2 means charset 2 */
67 state->s0 = 0x20000000;
68 } else if (input_chr == 0x8F) {/* SS3 means charset 3 */
69 state->s0 = 0x30000000;
70 } else if (input_chr < 0xA1 || input_chr == 0xFF) { /* errors */
71 emit(emitctx, ERROR);
72 } else { /* A1-FE means charset 1 */
73 state->s0 = 0x11000000 | input_chr;
74 }
75 }
76
77 /*
78 * Finally, if we have accumulated a complete character, output
79 * it.
80 */
81 if (state->s0 != 0 &&
82 ((state->s0 & 0x0F000000) >> 24) >=
83 (unsigned)euc->nchars[(state->s0 >> 28)-1]) {
84 emit(emitctx, euc->to_ucs(state->s0));
85 state->s0 = 0;
86 }
87 }
88
89 /*
90 * All EUCs are stateless multi-byte encodings (in the sense that
91 * just after any character has been completed, the state is always
92 * the same); hence when writing them, there is no need to use the
93 * charset_state.
94 */
95
96 static int write_euc(charset_spec const *charset, long int input_chr,
97 charset_state *state,
98 void (*emit)(void *ctx, long int output), void *emitctx)
99 {
100 struct euc const *euc = (struct euc *)charset->data;
101 unsigned long c;
102 int cset, len;
103
104 UNUSEDARG(state);
105
106 if (input_chr == -1)
107 return TRUE; /* stateless; no cleanup required */
108
109 /* ASCII is the easy bit, and is always the same. */
110 if (input_chr < 0x80) {
111 emit(emitctx, input_chr);
112 return TRUE;
113 }
114
115 c = euc->from_ucs(input_chr);
116 if (!c) {
117 return FALSE;
118 }
119
120 cset = c >> 28;
121 len = euc->nchars[cset-1];
122 c &= 0xFFFFFF;
123
124 if (cset > 1)
125 emit(emitctx, 0x8C + cset); /* SS2/SS3 */
126
127 while (len--)
128 emit(emitctx, (c >> (8*len)) & 0xFF);
129 return TRUE;
130 }
131
132 /*
133 * EUC-CN encodes GB2312 only.
134 */
135 static long int euc_cn_to_ucs(unsigned long state)
136 {
137 switch (state >> 28) {
138 case 1: return gb2312_to_unicode(((state >> 8) & 0xFF) - 0xA1,
139 ((state ) & 0xFF) - 0xA1);
140 default: return ERROR;
141 }
142 }
/*
 * Translate a Unicode value to a packed EUC-CN word: code set 1 in
 * the top nibble, row byte in bits 15:8, column byte in bits 7:0.
 * Returns 0 if unicode_to_gb2312() reports no mapping.
 */
static unsigned long euc_cn_from_ucs(long int ucs)
{
    int row, col;

    if (!unicode_to_gb2312(ucs, &row, &col))
        return 0;

    return 0x10000000 | ((row + 0xA1) << 8) | (col + 0xA1);
}
151 static const struct euc euc_cn = {
152 {2,0,0}, euc_cn_to_ucs, euc_cn_from_ucs
153 };
154 const charset_spec charset_CS_EUC_CN = {
155 CS_EUC_CN, read_euc, write_euc, &euc_cn
156 };
157
158 /*
159 * EUC-KR encodes KS X 1001 only.
160 */
161 static long int euc_kr_to_ucs(unsigned long state)
162 {
163 switch (state >> 28) {
164 case 1: return ksx1001_to_unicode(((state >> 8) & 0xFF) - 0xA1,
165 ((state ) & 0xFF) - 0xA1);
166 default: return ERROR;
167 }
168 }
/*
 * Translate a Unicode value to a packed EUC-KR word: code set 1 in
 * the top nibble, row byte in bits 15:8, column byte in bits 7:0.
 * Returns 0 if unicode_to_ksx1001() reports no mapping.
 */
static unsigned long euc_kr_from_ucs(long int ucs)
{
    int row, col;

    if (!unicode_to_ksx1001(ucs, &row, &col))
        return 0;

    return 0x10000000 | ((row + 0xA1) << 8) | (col + 0xA1);
}
177 static const struct euc euc_kr = {
178 {2,0,0}, euc_kr_to_ucs, euc_kr_from_ucs
179 };
180 const charset_spec charset_CS_EUC_KR = {
181 CS_EUC_KR, read_euc, write_euc, &euc_kr
182 };
183
184 /*
185 * EUC-JP encodes several character sets.
186 */
187 static long int euc_jp_to_ucs(unsigned long state)
188 {
189 switch (state >> 28) {
190 case 1: return jisx0208_to_unicode(((state >> 8) & 0xFF) - 0xA1,
191 ((state ) & 0xFF) - 0xA1);
192 case 2:
193 /*
194 * This is the top half of JIS X 0201. That means A1-DF map
195 * to FF61-FF9F, and nothing else is valid.
196 */
197 {
198 int c = state & 0xFF;
199 if (c >= 0xA1 && c <= 0xDF)
200 return c + (0xFF61 - 0xA1);
201 else
202 return ERROR;
203 }
204 /* (no break needed since all control paths have returned) */
205 case 3: return jisx0212_to_unicode(((state >> 8) & 0xFF) - 0xA1,
206 ((state ) & 0xFF) - 0xA1);
207 default: return ERROR; /* placate optimisers */
208 }
209 }
/*
 * Translate a Unicode value to a packed EUC-JP word, or return 0 if
 * it has no representation. U+FF61..U+FF9F are tried first and go to
 * code set 2 as a single byte; otherwise JIS X 0208 (code set 1) is
 * tried before JIS X 0212 (code set 3).
 */
static unsigned long euc_jp_from_ucs(long int ucs)
{
    int row, col;

    if (ucs >= 0xFF61 && ucs <= 0xFF9F)
        return 0x20000000 | (ucs - (0xFF61 - 0xA1));

    if (unicode_to_jisx0208(ucs, &row, &col))
        return 0x10000000 | ((row + 0xA1) << 8) | (col + 0xA1);

    if (unicode_to_jisx0212(ucs, &row, &col))
        return 0x30000000 | ((row + 0xA1) << 8) | (col + 0xA1);

    return 0;
}
222 static const struct euc euc_jp = {
223 {2,1,2}, euc_jp_to_ucs, euc_jp_from_ucs
224 };
225 const charset_spec charset_CS_EUC_JP = {
226 CS_EUC_JP, read_euc, write_euc, &euc_jp
227 };
228
229 /*
230 * EUC-TW encodes CNS 11643 (all planes).
231 */
232 static long int euc_tw_to_ucs(unsigned long state)
233 {
234 int plane;
235 switch (state >> 28) {
236 case 1: return cns11643_to_unicode(0, ((state >> 8) & 0xFF) - 0xA1,
237 ((state ) & 0xFF) - 0xA1);
238 case 2:
239 plane = ((state >> 8) & 0xFF) - 0xA1;
240 if (plane >= 7) return ERROR;
241 return cns11643_to_unicode(plane, ((state >> 8) & 0xFF) - 0xA1,
242 ((state ) & 0xFF) - 0xA1);
243 default: return ERROR;
244 }
245 }
/*
 * Translate a Unicode value to a packed EUC-TW word, or return 0 if
 * unicode_to_cns11643() reports no mapping. Plane index 0 is encoded
 * in code set 1 as two bytes (row, column); any other plane goes to
 * code set 2 as three bytes (plane, row, column), each offset by A1.
 */
static unsigned long euc_tw_from_ucs(long int ucs)
{
    int plane, row, col;

    if (!unicode_to_cns11643(ucs, &plane, &row, &col))
        return 0;

    if (plane == 0)
        return 0x10000000 | ((row + 0xA1) << 8) | (col + 0xA1);

    return 0x20000000 |
        ((plane + 0xA1) << 16) | ((row + 0xA1) << 8) | (col + 0xA1);
}
258 static const struct euc euc_tw = {
259 {2,3,0}, euc_tw_to_ucs, euc_tw_from_ucs
260 };
261 const charset_spec charset_CS_EUC_TW = {
262 CS_EUC_TW, read_euc, write_euc, &euc_tw
263 };
264
265 #else /* ENUM_CHARSETS */
266
/*
 * Enumeration mode: when this file is processed with ENUM_CHARSETS
 * defined, none of the code above is compiled and only this list of
 * the charset identifiers implemented here is produced.
 * NOTE(review): ENUM_CHARSET is defined by whoever includes this file;
 * its expansion is not visible here, so keep the entries in this order
 * unless the includer is known not to depend on it.
 */
ENUM_CHARSET(CS_EUC_CN)
ENUM_CHARSET(CS_EUC_KR)
ENUM_CHARSET(CS_EUC_JP)
ENUM_CHARSET(CS_EUC_TW)
271
272 #endif /* ENUM_CHARSETS */