2 * euc.c - routines to handle the various EUC multibyte encodings.
11 int nchars
[3]; /* GR, SS2+GR, SS3+GR */
12 long int (*to_ucs
)(unsigned long state
);
13 unsigned long (*from_ucs
)(long int ucs
);
16 static void read_euc(charset_spec
const *charset
, long int input_chr
,
18 void (*emit
)(void *ctx
, long int output
), void *emitctx
)
20 struct euc
const *euc
= (struct euc
*)charset
->data
;
23 * For EUC input, our state variable divides into three parts:
25 * - Topmost nibble (bits 31:28) is nonzero if we're
26 * accumulating a multibyte character, and it indicates
27 * which section we're in: 1 for GR chars, 2 for things
28 * beginning with SS2, 3 for things beginning with SS3.
30 * - Next nibble (bits 27:24) indicates how many bytes of the
31 * character we've accumulated so far.
33 * - The rest (bits 23:0) are those bytes in full, accumulated
34 * as a large integer (so that seeing A1 A2 A3, in a
35 * hypothetical EUC whose GR encoding is three-byte, runs
36 * our state variable from 0 -> 0x110000A1 -> 0x1200A1A2 ->
37 * 0x13A1A2A3, at which point it gets translated and output
38 * and resets to zero).
44 * At this point, no matter whether we had an SS2 or SS3
45 * introducer or not, we _always_ expect a GR character.
46 * Anything else causes us to emit ERROR for an incomplete
47 * character, and then reset to state 0 to process the
48 * character in its own way.
50 if (input_chr
< 0xA1 || input_chr
== 0xFF) {
54 state
->s0
= (((state
->s0
& 0xFF000000) + 0x01000000) |
55 ((state
->s0
& 0x0000FFFF) << 8) | input_chr
);
61 * The input character determines which of the four
62 * possible charsets we're going to be in.
64 if (input_chr
< 0x80) { /* this is always ASCII */
65 emit(emitctx
, input_chr
);
66 } else if (input_chr
== 0x8E) {/* SS2 means charset 2 */
67 state
->s0
= 0x20000000;
68 } else if (input_chr
== 0x8F) {/* SS3 means charset 3 */
69 state
->s0
= 0x30000000;
70 } else if (input_chr
< 0xA1 || input_chr
== 0xFF) { /* errors */
72 } else { /* A1-FE means charset 1 */
73 state
->s0
= 0x11000000 | input_chr
;
78 * Finally, if we have accumulated a complete character, output
82 ((state
->s0
& 0x0F000000) >> 24) >=
83 (unsigned)euc
->nchars
[(state
->s0
>> 28)-1]) {
84 emit(emitctx
, euc
->to_ucs(state
->s0
));
90 * All EUCs are stateless multi-byte encodings (in the sense that
91 * just after any character has been completed, the state is always
92 * the same); hence when writing them, there is no need to use the
96 static int write_euc(charset_spec
const *charset
, long int input_chr
,
98 void (*emit
)(void *ctx
, long int output
), void *emitctx
)
100 struct euc
const *euc
= (struct euc
*)charset
->data
;
107 return TRUE
; /* stateless; no cleanup required */
109 /* ASCII is the easy bit, and is always the same. */
110 if (input_chr
< 0x80) {
111 emit(emitctx
, input_chr
);
115 c
= euc
->from_ucs(input_chr
);
121 len
= euc
->nchars
[cset
-1];
125 emit(emitctx
, 0x8C + cset
); /* SS2/SS3 */
128 emit(emitctx
, (c
>> (8*len
)) & 0xFF);
133 * EUC-CN encodes GB2312 only.
135 static long int euc_cn_to_ucs(unsigned long state
)
137 switch (state
>> 28) {
138 case 1: return gb2312_to_unicode(((state
>> 8) & 0xFF) - 0xA1,
139 ((state
) & 0xFF) - 0xA1);
140 default: return ERROR
;
143 static unsigned long euc_cn_from_ucs(long int ucs
)
146 if (unicode_to_gb2312(ucs
, &r
, &c
))
147 return 0x10000000 | ((r
+0xA1) << 8) | (c
+0xA1);
151 static const struct euc euc_cn
= {
152 {2,0,0}, euc_cn_to_ucs
, euc_cn_from_ucs
154 const charset_spec charset_CS_EUC_CN
= {
155 CS_EUC_CN
, read_euc
, write_euc
, &euc_cn
159 * EUC-KR encodes KS X 1001 only.
161 static long int euc_kr_to_ucs(unsigned long state
)
163 switch (state
>> 28) {
164 case 1: return ksx1001_to_unicode(((state
>> 8) & 0xFF) - 0xA1,
165 ((state
) & 0xFF) - 0xA1);
166 default: return ERROR
;
169 static unsigned long euc_kr_from_ucs(long int ucs
)
172 if (unicode_to_ksx1001(ucs
, &r
, &c
))
173 return 0x10000000 | ((r
+0xA1) << 8) | (c
+0xA1);
177 static const struct euc euc_kr
= {
178 {2,0,0}, euc_kr_to_ucs
, euc_kr_from_ucs
180 const charset_spec charset_CS_EUC_KR
= {
181 CS_EUC_KR
, read_euc
, write_euc
, &euc_kr
185 * EUC-JP encodes several character sets.
187 static long int euc_jp_to_ucs(unsigned long state
)
189 switch (state
>> 28) {
190 case 1: return jisx0208_to_unicode(((state
>> 8) & 0xFF) - 0xA1,
191 ((state
) & 0xFF) - 0xA1);
194 * This is the top half of JIS X 0201. That means A1-DF map
195 * to FF61-FF9F, and nothing else is valid.
198 int c
= state
& 0xFF;
199 if (c
>= 0xA1 && c
<= 0xDF)
200 return c
+ (0xFF61 - 0xA1);
204 /* (no break needed since all control paths have returned) */
205 case 3: return jisx0212_to_unicode(((state
>> 8) & 0xFF) - 0xA1,
206 ((state
) & 0xFF) - 0xA1);
207 default: return ERROR
; /* placate optimisers */
210 static unsigned long euc_jp_from_ucs(long int ucs
)
213 if (ucs
>= 0xFF61 && ucs
<= 0xFF9F)
214 return 0x20000000 | (ucs
- (0xFF61 - 0xA1));
215 else if (unicode_to_jisx0208(ucs
, &r
, &c
))
216 return 0x10000000 | ((r
+0xA1) << 8) | (c
+0xA1);
217 else if (unicode_to_jisx0212(ucs
, &r
, &c
))
218 return 0x30000000 | ((r
+0xA1) << 8) | (c
+0xA1);
222 static const struct euc euc_jp
= {
223 {2,1,2}, euc_jp_to_ucs
, euc_jp_from_ucs
225 const charset_spec charset_CS_EUC_JP
= {
226 CS_EUC_JP
, read_euc
, write_euc
, &euc_jp
229 #else /* ENUM_CHARSETS */
231 ENUM_CHARSET(CS_EUC_CN
)
232 ENUM_CHARSET(CS_EUC_KR
)
233 ENUM_CHARSET(CS_EUC_JP
)
235 #endif /* ENUM_CHARSETS */