c6d25d8d |
1 | /* |
2 | * iso2022s.c - support for ISO-2022 subset encodings. |
3 | * |
4 | * (The `s' suffix on the filename is there to leave `iso2022.c' |
5 | * free for the unlikely event that I ever attempt to implement |
6 | * _full_ ISO-2022 in this library!) |
7 | */ |
8 | |
9 | #ifndef ENUM_CHARSETS |
10 | |
11 | #include <stdio.h> |
12 | #include <string.h> |
13 | #include <assert.h> |
14 | |
15 | #include "charset.h" |
16 | #include "internal.h" |
01081d4e |
17 | #include "sbcsdat.h" |
c6d25d8d |
18 | |
/*
 * ASCII control codes which read_iso2022s() treats as possible
 * introducers of a shift or escape sequence.
 */
#define SO (0x0E)                      /* Shift Out */
#define SI (0x0F)                      /* Shift In */
#define ESC (0x1B)                     /* Escape */
22 | |
/*
 * One escape (or shift) sequence understood by an ISO 2022 subset,
 * together with the effect it has on the codec state.
 */
struct iso2022_escape {
    /* The literal bytes of the sequence, NUL-terminated. */
    const char *sequence;

    /*
     * Effect of the sequence on the s1 state word: s1 is ANDed with
     * `andbits' and the result is then XORed with `xorbits'.
     */
    unsigned long andbits;
    unsigned long xorbits;

    /*
     * Used on output to work out which sequences are needed to reach
     * the state we want.
     *
     * `container' is normally in the range 0-3. It may additionally
     * have the RO flag ORed in, marking the container as one that
     * should not be preferred for this charset when writing.
     *
     * `subcharset' is the index of the sub-charset this sequence
     * selects into that container.
     */
    int container;
    int subcharset;
};

/* Flag for iso2022_escape.container: not a preferred output container. */
#define RO 0x80
c6d25d8d |
38 | |
/*
 * Complete description of one ISO 2022 subset encoding: its escape
 * sequences, its sub-charsets, its initial/reset behaviour and the
 * functions that translate individual characters.
 */
struct iso2022 {
    /*
     * The escape sequences this subset supports, and how many there
     * are. The array MUST be sorted in ASCII order of `sequence',
     * because the reader narrows down candidates by walking forward
     * through it one input byte at a time.
     */
    const struct iso2022_escape *escapes;
    int nescapes;

    /*
     * Sub-charsets are numbered from 0 upwards. nbytes[i] is the
     * number of bytes per character in sub-charset i. It is declared
     * as a string only because that is easier to write as a C
     * initialiser than an int array.
     */
    const char *nbytes;

    /*
     * Each character of this string is (index + 1) of an entry in
     * `escapes'; the +1 lets NUL terminate the list. To reset the
     * encoding state the writer applies these escapes in order,
     * emitting only those which actually change the state.
     */
    const char *reset;

    /*
     * Initial value of the s1 state word, for subsets whose default
     * container contents are not simply sub-charset 0 everywhere.
     * The top bit of this value must be set.
     */
    unsigned long s1;

    /*
     * Some subsets _require_ a particular sequence at the start of
     * the output; if so, this is it (otherwise NULL or empty). It is
     * not demanded on input, but it should also appear in `escapes'
     * so that the reader swallows it when it is present.
     */
    const char *initial_sequence;

    /*
     * Nonzero if this is an 8-bit ISO 2022 subset.
     */
    int eightbit;

    /*
     * Per-subset translation functions: bytes of a given sub-charset
     * to a UCS code point, and a UCS code point to a sub-charset
     * plus bytes (returning nonzero on success).
     */
    long int (*to_ucs)(int subcharset, unsigned long bytes);
    int (*from_ucs)(long int ucs, int *subcharset, unsigned long *bytes);
};
93 | |
94 | static void read_iso2022s(charset_spec const *charset, long int input_chr, |
95 | charset_state *state, |
96 | void (*emit)(void *ctx, long int output), |
97 | void *emitctx) |
98 | { |
99 | struct iso2022 const *iso = (struct iso2022 *)charset->data; |
100 | |
101 | /* |
102 | * For reading ISO-2022 subsets, we divide up our state |
103 | * variables as follows: |
104 | * |
105 | * - The top byte of s0 (bits 31:24) indicates, if nonzero, |
106 | * that we are part-way through a recognised ISO-2022 escape |
107 | * sequence. Five of those bits (31:27) give the index of |
108 | * the first member of the escapes list matching what we |
109 | * have so far; the remaining three (26:24) give the number |
110 | * of characters we have seen so far. |
111 | * |
112 | * - The top bit of s1 (bit 31) is non-zero at all times, to |
113 | * indicate that we have performed any necessary |
114 | * initialisation. When we start, we detect a zero s1 and |
115 | * respond to it by initialising the default container |
116 | * contents. |
117 | * |
118 | * - The next three bits of s1 (bits 30:28) indicate which |
119 | * _container_ is currently selected. This isn't quite as |
120 | * simple as it sounds, since we have to preserve memory of |
121 | * which of the SI/SO containers we came from when we're |
122 | * temporarily in SS2/SS3. Hence, what happens is: |
123 | * + bit 28 indicates SI/SO. |
124 | * + if we're in an SS2/SS3 container, that's indicated by |
125 | * the two bits above that being nonzero and holding |
126 | * either 2 or 3. |
127 | * + Hence: 0 is SI, 1 is SO, 4 is SS2-from-SI, 5 is |
128 | * SS2-from-SO, 6 is SS3-from-SI, 7 is SS3-from-SO. |
01081d4e |
129 | * + For added fun: in an _8-bit_ ISO 2022 subset, we have |
130 | * the further special value 2, which means that we're |
131 | * theoretically in SI but the current character being |
132 | * accumulated is composed of 8-bit characters and will |
133 | * therefore be interpreted as if in SO. |
c6d25d8d |
134 | * |
135 | * - The next nibble of s1 (27:24) indicates how many bytes |
136 | * have been accumulated in the current character. |
137 | * |
138 | * - The remaining three bytes of s1 are divided into four |
139 | * six-bit sections, and each section gives the current |
140 | * sub-charset selected in one of the possible containers. |
141 | * (Those containers are SI, SO, SS2 and SS3, respectively |
142 | * and in order from the bottom of s0 to the top.) |
143 | * |
144 | * - The bottom 24 bits of s0 give the accumulated character |
145 | * data so far. |
146 | * |
147 | * (Note that this means s1 contains all the parts of the state |
148 | * which might need to be operated on by escape sequences. |
149 | * Cunning, eh?) |
150 | */ |
151 | |
152 | if (!(state->s1 & 0x80000000)) { |
153 | state->s1 = iso->s1; |
154 | } |
155 | |
156 | /* |
157 | * So. Firstly, we process escape sequences, if we're in the |
158 | * middle of one or if we see a possible introducer (SI, SO, |
159 | * ESC). |
160 | */ |
161 | if ((state->s0 >> 24) || |
162 | (input_chr == SO || input_chr == SI || input_chr == ESC)) { |
163 | int n = (state->s0 >> 24) & 7, i = (state->s0 >> 27), oi = i, j; |
164 | |
165 | /* |
166 | * If this is the start of an escape sequence, we might be |
167 | * in mid-character. If so, clear the character state and |
168 | * emit an error token for the incomplete character. |
169 | */ |
170 | if (state->s1 & 0x0F000000) { |
171 | state->s1 &= ~0x0F000000; |
172 | state->s0 &= 0xFF000000; |
173 | /* |
174 | * If we were in the SS2 or SS3 container, we |
175 | * automatically exit it. |
176 | */ |
177 | if (state->s1 & 0x60000000) |
178 | state->s1 &= 0x9FFFFFFF; |
179 | emit(emitctx, ERROR); |
180 | } |
181 | |
182 | j = i; |
183 | while (j < iso->nescapes && |
184 | !memcmp(iso->escapes[j].sequence, |
185 | iso->escapes[oi].sequence, n)) { |
186 | if (iso->escapes[j].sequence[n] < input_chr) |
187 | i = ++j; |
188 | else |
189 | break; |
190 | } |
191 | if (i >= iso->nescapes || |
192 | memcmp(iso->escapes[i].sequence, |
193 | iso->escapes[oi].sequence, n) || |
194 | iso->escapes[i].sequence[n] != input_chr) { |
195 | /* |
196 | * This character does not appear in any valid escape |
197 | * sequence. Therefore, we must emit all the characters |
198 | * we had previously swallowed, plus this one, and |
199 | * return to non-escape-sequence state. |
200 | */ |
201 | for (j = 0; j < n; j++) |
202 | emit(emitctx, iso->escapes[oi].sequence[j]); |
203 | emit(emitctx, input_chr); |
204 | state->s0 = 0; |
205 | return; |
206 | } |
207 | |
208 | /* |
209 | * Otherwise, we have found an additional character in our |
210 | * escape sequence. See if we have reached the _end_ of our |
211 | * sequence (and therefore must process the sequence). |
212 | */ |
213 | n++; |
214 | if (!iso->escapes[i].sequence[n]) { |
215 | state->s0 = 0; |
216 | state->s1 &= iso->escapes[i].andbits; |
217 | state->s1 ^= iso->escapes[i].xorbits; |
218 | return; |
219 | } |
220 | |
221 | /* |
222 | * Failing _that_, we simply update our escape-sequence- |
223 | * tracking state. |
224 | */ |
225 | assert(i < 32 && n < 8); |
226 | state->s0 = (i << 27) | (n << 24); |
227 | return; |
228 | } |
229 | |
230 | /* |
231 | * If this isn't an escape sequence, it must be part of a |
232 | * character. One possibility is that it's a control character |
01081d4e |
233 | * (00-20 or 7F-9F; also in non-8-bit ISO 2022 subsets I'm |
234 | * going to treat all top-half characters as controls), in |
235 | * which case we output it verbatim. |
c6d25d8d |
236 | */ |
01081d4e |
237 | if (input_chr < 0x21 || |
238 | (input_chr > 0x7E && (!iso->eightbit || input_chr < 0xA0))) { |
c6d25d8d |
239 | /* |
240 | * We might be in mid-multibyte-character. If so, clear the |
241 | * character state and emit an error token for the |
242 | * incomplete character. |
243 | */ |
244 | if (state->s1 & 0x0F000000) { |
245 | state->s1 &= ~0x0F000000; |
246 | state->s0 &= 0xFF000000; |
247 | emit(emitctx, ERROR); |
248 | /* |
249 | * If we were in the SS2 or SS3 container, we |
250 | * automatically exit it. |
251 | */ |
252 | if (state->s1 & 0x60000000) |
253 | state->s1 &= 0x9FFFFFFF; |
254 | } |
255 | |
256 | emit(emitctx, input_chr); |
257 | return; |
258 | } |
259 | |
260 | /* |
261 | * Otherwise, accumulate character data. |
262 | */ |
263 | { |
264 | unsigned long chr; |
265 | int chrlen, cont, subcharset, bytes; |
266 | |
01081d4e |
267 | /* |
268 | * Verify that we've seen the right kind of character for |
269 | * what we're currently doing. This only matters in 8-bit |
270 | * subsets. |
271 | */ |
272 | if (iso->eightbit) { |
273 | cont = (state->s1 >> 28) & 7; |
274 | /* |
275 | * If cont==0, we're entitled to see either GL or GR |
276 | * characters. If cont==2, we expect only GR; otherwise |
277 | * we expect only GL. |
278 | * |
279 | * If we see a GR character while cont==0, we set |
280 | * cont=2 immediately. |
281 | */ |
282 | if ((cont == 2 && !(input_chr & 0x80)) || |
283 | (cont != 0 && cont != 2 && (input_chr & 0x80))) { |
284 | /* |
285 | * Clear the previous character; it was prematurely |
286 | * terminated by this error. |
287 | */ |
288 | state->s1 &= ~0x0F000000; |
289 | state->s0 &= 0xFF000000; |
290 | emit(emitctx, ERROR); |
291 | /* |
292 | * If we were in the SS2 or SS3 container, we |
293 | * automatically exit it. |
294 | */ |
295 | if (state->s1 & 0x60000000) |
296 | state->s1 &= 0x9FFFFFFF; |
297 | } |
298 | |
299 | if (cont == 0 && (input_chr & 0x80)) { |
300 | state->s1 |= 0x20000000; |
301 | } |
302 | } |
303 | |
c6d25d8d |
304 | /* The current character and its length. */ |
01081d4e |
305 | chr = ((state->s0 & 0x00FFFFFF) << 8) | (input_chr & 0x7F); |
c6d25d8d |
306 | chrlen = ((state->s1 >> 24) & 0xF) + 1; |
307 | /* The current sub-charset. */ |
308 | cont = (state->s1 >> 28) & 7; |
309 | if (cont > 1) cont >>= 1; |
310 | subcharset = (state->s1 >> (6*cont)) & 0x3F; |
311 | /* The number of bytes-per-character in that sub-charset. */ |
312 | bytes = iso->nbytes[subcharset]; |
313 | |
314 | /* |
315 | * If this character is now complete, we convert and emit |
316 | * it. Otherwise, we simply update the state and return. |
317 | */ |
318 | if (chrlen >= bytes) { |
319 | emit(emitctx, iso->to_ucs(subcharset, chr)); |
320 | chr = chrlen = 0; |
321 | /* |
322 | * If we were in the SS2 or SS3 container, we |
323 | * automatically exit it. |
324 | */ |
325 | if (state->s1 & 0x60000000) |
326 | state->s1 &= 0x9FFFFFFF; |
327 | } |
328 | state->s0 = (state->s0 & 0xFF000000) | chr; |
329 | state->s1 = (state->s1 & 0xF0FFFFFF) | (chrlen << 24); |
330 | } |
331 | } |
332 | |
333 | static int write_iso2022s(charset_spec const *charset, long int input_chr, |
334 | charset_state *state, |
335 | void (*emit)(void *ctx, long int output), |
336 | void *emitctx) |
337 | { |
338 | struct iso2022 const *iso = (struct iso2022 *)charset->data; |
01081d4e |
339 | int subcharset, len, i, j, cont, topbit = 0; |
c6d25d8d |
340 | unsigned long bytes; |
341 | |
342 | /* |
343 | * For output, our s1 state variable contains most of the same |
344 | * stuff as it did for input - initial-state indicator bit, |
345 | * current container, and current subcharset selected in each |
346 | * container. |
347 | */ |
348 | |
349 | /* |
350 | * Analyse the character and find out what subcharset it needs |
351 | * to go in. |
352 | */ |
353 | if (input_chr >= 0 && !iso->from_ucs(input_chr, &subcharset, &bytes)) |
354 | return FALSE; |
355 | |
356 | if (!(state->s1 & 0x80000000)) { |
357 | state->s1 = iso->s1; |
358 | if (iso->initial_sequence) |
359 | for (i = 0; iso->initial_sequence[i]; i++) |
360 | emit(emitctx, iso->initial_sequence[i]); |
361 | } |
362 | |
363 | if (input_chr == -1) { |
364 | unsigned long oldstate; |
365 | int k; |
366 | |
367 | /* |
368 | * Special case: reset encoding state. |
369 | */ |
370 | for (i = 0; iso->reset[i]; i++) { |
371 | j = iso->reset[i] - 1; |
372 | oldstate = state->s1; |
373 | state->s1 &= iso->escapes[j].andbits; |
374 | state->s1 ^= iso->escapes[j].xorbits; |
375 | if (state->s1 != oldstate) { |
376 | /* We must actually emit this sequence. */ |
377 | for (k = 0; iso->escapes[j].sequence[k]; k++) |
378 | emit(emitctx, iso->escapes[j].sequence[k]); |
379 | } |
380 | } |
381 | |
382 | return TRUE; |
383 | } |
384 | |
385 | /* |
386 | * Now begins the fun. We now know what subcharset we want. So |
387 | * we must find out which container we should select it into, |
388 | * select it into it if necessary, select that _container_ if |
389 | * necessary, and then output the given bytes. |
390 | */ |
391 | for (i = 0; i < iso->nescapes; i++) |
01081d4e |
392 | if (iso->escapes[i].subcharset == subcharset && |
393 | !(iso->escapes[i].container & RO)) |
c6d25d8d |
394 | break; |
395 | assert(i < iso->nescapes); |
396 | |
397 | /* |
398 | * We've found the escape sequence which would select this |
399 | * subcharset into a container. However, that subcharset might |
400 | * already _be_ selected in that container! Check before we go |
401 | * to the effort of emitting the sequence. |
402 | */ |
01081d4e |
403 | cont = iso->escapes[i].container &~ RO; |
3cca0edf |
404 | if (((state->s1 >> (6*cont)) & 0x3F) != (unsigned)subcharset) { |
c6d25d8d |
405 | for (j = 0; iso->escapes[i].sequence[j]; j++) |
406 | emit(emitctx, iso->escapes[i].sequence[j]); |
407 | state->s1 &= iso->escapes[i].andbits; |
408 | state->s1 ^= iso->escapes[i].xorbits; |
409 | } |
410 | |
411 | /* |
412 | * Now we know what container our subcharset is in, so we want |
413 | * to select that container. |
414 | */ |
415 | if (cont > 1) { |
416 | /* SS2 or SS3; just output the sequence and be done. */ |
417 | emit(emitctx, ESC); |
418 | emit(emitctx, 'L' + cont); /* comes out to 'N' or 'O' */ |
419 | } else { |
01081d4e |
420 | /* |
421 | * Emit SI or SO, but only if the current container isn't already |
422 | * the right one. |
423 | * |
424 | * Also, in an 8-bit subset, we need not do this; we'll |
425 | * just use 8-bit characters to output SO-container |
426 | * characters. |
427 | */ |
428 | if (iso->eightbit && cont == 1 && ((state->s1 >> 28) & 7) == 0) { |
429 | topbit = 0x80; |
430 | } else if (((state->s1 >> 28) & 7) != (unsigned)cont) { |
c6d25d8d |
431 | emit(emitctx, cont ? SO : SI); |
432 | state->s1 = (state->s1 & 0x8FFFFFFF) | (cont << 28); |
433 | } |
434 | } |
435 | |
436 | /* |
437 | * We're done. Subcharset is selected in container, container |
438 | * is selected. All we need now is to write out the bytes. |
439 | */ |
440 | len = iso->nbytes[subcharset]; |
441 | while (len--) |
01081d4e |
442 | emit(emitctx, ((bytes >> (8*len)) & 0xFF) | topbit); |
c6d25d8d |
443 | |
444 | return TRUE; |
445 | } |
446 | |
447 | /* |
448 | * ISO-2022-JP, defined in RFC 1468. |
449 | */ |
450 | static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes) |
451 | { |
452 | switch (subcharset) { |
453 | case 0: return bytes; /* one-byte ASCII */ |
454 | case 1: /* JIS X 0201 half-width katakana */ |
455 | if (bytes >= 0x21 && bytes <= 0x5F) |
456 | return bytes + (0xFF61 - 0x21); |
457 | else |
458 | return ERROR; |
459 | /* (no break needed since all control paths have returned) */ |
460 | case 2: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21, |
461 | ((bytes ) & 0xFF) - 0x21); |
462 | default: return ERROR; |
463 | } |
464 | } |
465 | static int iso2022jp_from_ucs(long int ucs, int *subcharset, |
466 | unsigned long *bytes) |
467 | { |
468 | int r, c; |
469 | if (ucs < 0x80) { |
470 | *subcharset = 0; |
471 | *bytes = ucs; |
472 | return 1; |
473 | } else if (ucs >= 0xFF61 && ucs <= 0xFF9F) { |
474 | *subcharset = 1; |
475 | *bytes = ucs - (0xFF61 - 0x21); |
476 | return 1; |
477 | } else if (unicode_to_jisx0208(ucs, &r, &c)) { |
478 | *subcharset = 2; |
479 | *bytes = ((r+0x21) << 8) | (c+0x21); |
480 | return 1; |
481 | } else { |
482 | return 0; |
483 | } |
484 | } |
8bade113 |
/*
 * Escape sequences of ISO-2022-JP, in ASCII order. Every one of them
 * rewrites bits 5:0 of s1 (andbits clears them, xorbits supplies the
 * new sub-charset number).
 */
static const struct iso2022_escape iso2022jp_escapes[] = {
    /*
     * Accepted on input with the same state effect as ESC $ B, but
     * container/subcharset of -1 mean the writer never selects it.
     */
    {"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1}, /* we ignore this one */
    {"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2},   /* sub-charset 2 */
    {"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0},   /* sub-charset 0 */
    {"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1},   /* sub-charset 1 */
};
/*
 * Subset description: sub-charsets 0 and 1 are one byte per
 * character and sub-charset 2 is two; reset applies escape number 3
 * (ESC ( B); initial s1 has only the initialised bit set; there is
 * no mandatory initial sequence; and this is not an 8-bit subset.
 */
static const struct iso2022 iso2022jp = {
    iso2022jp_escapes, lenof(iso2022jp_escapes),
    "\1\1\2", "\3", 0x80000000, NULL, FALSE,
    iso2022jp_to_ucs, iso2022jp_from_ucs
};
/* Charset entry tying CS_ISO2022_JP to the generic reader and writer. */
const charset_spec charset_CS_ISO2022_JP = {
    CS_ISO2022_JP, read_iso2022s, write_iso2022s, &iso2022jp
};
499 | |
500 | /* |
501 | * ISO-2022-KR, defined in RFC 1557. |
502 | */ |
503 | static long int iso2022kr_to_ucs(int subcharset, unsigned long bytes) |
504 | { |
505 | switch (subcharset) { |
506 | case 0: return bytes; /* one-byte ASCII */ |
507 | case 1: return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21, |
508 | ((bytes ) & 0xFF) - 0x21); |
509 | default: return ERROR; |
510 | } |
511 | } |
/*
 * Find the ISO-2022-KR sub-charset and byte encoding of a UCS code
 * point. Stores them through `subcharset' and `bytes' and returns 1
 * on success; returns 0 if the code point cannot be represented.
 */
static int iso2022kr_from_ucs(long int ucs, int *subcharset,
                              unsigned long *bytes)
{
    int row, col;

    /* Anything below 0x80 is written as itself in sub-charset 0. */
    if (ucs < 0x80) {
        *subcharset = 0;
        *bytes = ucs;
        return 1;
    }

    /* Everything else must be found in KS X 1001. */
    if (!unicode_to_ksx1001(ucs, &row, &col))
        return 0;

    *subcharset = 1;
    *bytes = ((row+0x21) << 8) | (col+0x21);
    return 1;
}
8bade113 |
/*
 * Shift and escape sequences of ISO-2022-KR, in ASCII order.
 */
static const struct iso2022_escape iso2022kr_escapes[] = {
    /* SO: set the container field (bits 30:28 of s1) to 1. */
    {"\016", 0x8FFFFFFF, 0x10000000, -1, -1},
    /*
     * SI: set the container field to 0. Also listed as container 0 /
     * sub-charset 0, so the writer's search finds it for ASCII.
     */
    {"\017", 0x8FFFFFFF, 0x00000000, 0, 0},
    {"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1}, /* bits[11:6] <- 1 */
};
/*
 * Subset description: sub-charset 0 is one byte per character and
 * sub-charset 1 is two; reset applies escape number 2 (SI); the
 * initial s1 already has sub-charset 1 in the SO container; the
 * writer emits ESC $ ) C at the start of output; not an 8-bit subset.
 */
static const struct iso2022 iso2022kr = {
    iso2022kr_escapes, lenof(iso2022kr_escapes),
    "\1\2", "\2", 0x80000040, "\033$)C", FALSE,
    iso2022kr_to_ucs, iso2022kr_from_ucs
};
/* Charset entry tying CS_ISO2022_KR to the generic reader and writer. */
const charset_spec charset_CS_ISO2022_KR = {
    CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr
};
541 | |
01081d4e |
542 | /* |
543 | * The COMPOUND_TEXT encoding used in X selections. Defined by the |
544 | * X consortium. |
545 | * |
546 | * This encoding has quite a few sub-charsets. The order I assign |
547 | * to them here is given in an enum. |
548 | */ |
/*
 * Sub-charset indices for compound text. The order here must stay in
 * step with the bytes-per-character string in the `ctext' subset
 * description; the trailing comments give each entry's byte count.
 */
enum {
    CTEXT_ASCII = 0,                   /* 1 byte */
    CTEXT_JISX0201_LEFT,               /* 1 byte */
    CTEXT_JISX0201_RIGHT,              /* 1 byte */
    CTEXT_ISO8859_1,                   /* 1 byte */
    CTEXT_ISO8859_2,                   /* 1 byte */
    CTEXT_ISO8859_3,                   /* 1 byte */
    CTEXT_ISO8859_4,                   /* 1 byte */
    CTEXT_ISO8859_5,                   /* 1 byte */
    CTEXT_ISO8859_6,                   /* 1 byte */
    CTEXT_ISO8859_7,                   /* 1 byte */
    CTEXT_ISO8859_8,                   /* 1 byte */
    CTEXT_ISO8859_9,                   /* 1 byte */
    CTEXT_GB2312,                      /* 2 bytes */
    CTEXT_KSC5601,                     /* 2 bytes */
    CTEXT_JISX0208,                    /* 2 bytes */
    CTEXT_JISX0212                     /* 2 bytes */
};
568 | static long int ctext_to_ucs(int subcharset, unsigned long bytes) |
569 | { |
570 | switch (subcharset) { |
571 | case CTEXT_ASCII: return bytes; /* one-byte ASCII */ |
572 | case CTEXT_JISX0201_LEFT: /* ASCII with yen and overline */ |
573 | return sbcs_to_unicode(&sbcsdata_CS_JISX0201, bytes & 0x7F); |
574 | case CTEXT_JISX0201_RIGHT: /* JIS X 0201 half-width katakana */ |
575 | return sbcs_to_unicode(&sbcsdata_CS_JISX0201, (bytes & 0x7F) | 0x80); |
576 | case CTEXT_ISO8859_1: |
577 | return sbcs_to_unicode(&sbcsdata_CS_ISO8859_1, (bytes & 0x7F) | 0x80); |
578 | case CTEXT_ISO8859_2: |
579 | return sbcs_to_unicode(&sbcsdata_CS_ISO8859_2, (bytes & 0x7F) | 0x80); |
580 | case CTEXT_ISO8859_3: |
581 | return sbcs_to_unicode(&sbcsdata_CS_ISO8859_3, (bytes & 0x7F) | 0x80); |
582 | case CTEXT_ISO8859_4: |
583 | return sbcs_to_unicode(&sbcsdata_CS_ISO8859_4, (bytes & 0x7F) | 0x80); |
584 | case CTEXT_ISO8859_5: |
585 | return sbcs_to_unicode(&sbcsdata_CS_ISO8859_5, (bytes & 0x7F) | 0x80); |
586 | case CTEXT_ISO8859_6: |
587 | return sbcs_to_unicode(&sbcsdata_CS_ISO8859_6, (bytes & 0x7F) | 0x80); |
588 | case CTEXT_ISO8859_7: |
589 | return sbcs_to_unicode(&sbcsdata_CS_ISO8859_7, (bytes & 0x7F) | 0x80); |
590 | case CTEXT_ISO8859_8: |
591 | return sbcs_to_unicode(&sbcsdata_CS_ISO8859_8, (bytes & 0x7F) | 0x80); |
592 | case CTEXT_ISO8859_9: |
593 | return sbcs_to_unicode(&sbcsdata_CS_ISO8859_9, (bytes & 0x7F) | 0x80); |
594 | case CTEXT_GB2312: |
595 | return gb2312_to_unicode(((bytes >> 8) & 0xFF) - 0x21, |
596 | ((bytes ) & 0xFF) - 0x21); |
597 | case CTEXT_KSC5601: |
598 | return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21, |
599 | ((bytes ) & 0xFF) - 0x21); |
600 | case CTEXT_JISX0208: |
601 | return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21, |
602 | ((bytes ) & 0xFF) - 0x21); |
dd872b07 |
603 | case CTEXT_JISX0212: |
604 | return jisx0212_to_unicode(((bytes >> 8) & 0xFF) - 0x21, |
605 | ((bytes ) & 0xFF) - 0x21); |
01081d4e |
606 | default: return ERROR; |
607 | } |
608 | } |
609 | static int ctext_from_ucs(long int ucs, int *subcharset, unsigned long *bytes) |
610 | { |
611 | int r, c; |
612 | if (ucs < 0x80) { |
613 | *subcharset = CTEXT_ASCII; |
614 | *bytes = ucs; |
615 | return 1; |
616 | } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_1, ucs)) != ERROR) { |
617 | *subcharset = CTEXT_ISO8859_1; |
618 | *bytes = c - 0x80; |
619 | return 1; |
620 | } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_2, ucs)) != ERROR) { |
621 | *subcharset = CTEXT_ISO8859_2; |
622 | *bytes = c - 0x80; |
623 | return 1; |
624 | } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_3, ucs)) != ERROR) { |
625 | *subcharset = CTEXT_ISO8859_3; |
626 | *bytes = c - 0x80; |
627 | return 1; |
628 | } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_4, ucs)) != ERROR) { |
629 | *subcharset = CTEXT_ISO8859_4; |
630 | *bytes = c - 0x80; |
631 | return 1; |
632 | } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_5, ucs)) != ERROR) { |
633 | *subcharset = CTEXT_ISO8859_5; |
634 | *bytes = c - 0x80; |
635 | return 1; |
636 | } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_6, ucs)) != ERROR) { |
637 | *subcharset = CTEXT_ISO8859_6; |
638 | *bytes = c - 0x80; |
639 | return 1; |
640 | } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_7, ucs)) != ERROR) { |
641 | *subcharset = CTEXT_ISO8859_7; |
642 | *bytes = c - 0x80; |
643 | return 1; |
644 | } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_8, ucs)) != ERROR) { |
645 | *subcharset = CTEXT_ISO8859_8; |
646 | *bytes = c - 0x80; |
647 | return 1; |
648 | } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_9, ucs)) != ERROR) { |
649 | *subcharset = CTEXT_ISO8859_9; |
650 | *bytes = c - 0x80; |
651 | return 1; |
652 | } else if ((c = sbcs_from_unicode(&sbcsdata_CS_JISX0201, ucs)) != ERROR) { |
653 | if (c < 0x80) { |
654 | *subcharset = CTEXT_JISX0201_LEFT; |
655 | } else { |
656 | *subcharset = CTEXT_JISX0201_RIGHT; |
657 | c -= 0x80; |
658 | } |
659 | *bytes = c; |
660 | return 1; |
661 | } else if (unicode_to_gb2312(ucs, &r, &c)) { |
662 | *subcharset = CTEXT_GB2312; |
663 | *bytes = ((r+0x21) << 8) | (c+0x21); |
664 | return 1; |
665 | } else if (unicode_to_ksx1001(ucs, &r, &c)) { |
666 | *subcharset = CTEXT_KSC5601; |
667 | *bytes = ((r+0x21) << 8) | (c+0x21); |
668 | return 1; |
669 | } else if (unicode_to_jisx0208(ucs, &r, &c)) { |
670 | *subcharset = CTEXT_JISX0208; |
671 | *bytes = ((r+0x21) << 8) | (c+0x21); |
672 | return 1; |
dd872b07 |
673 | } else if (unicode_to_jisx0212(ucs, &r, &c)) { |
674 | *subcharset = CTEXT_JISX0212; |
675 | *bytes = ((r+0x21) << 8) | (c+0x21); |
676 | return 1; |
01081d4e |
677 | } else { |
678 | return 0; |
679 | } |
680 | } |
/*
 * Build one iso2022_escape initialiser for a sequence `str' which
 * designates sub-charset `cs' into container `cont'.
 *
 * The container's six-bit field of s1 sits at bit 6*container, so
 * andbits clears that field and xorbits writes `cs' into it. Any RO
 * flag in `cont' is stripped when computing the field position but
 * kept in the stored container value.
 */
#define SEQ(str, cont, cs) {                                    \
        str,                     /* sequence */                 \
        ~(63<<(6*(((cont)&~RO)))),    /* andbits */             \
        (cs)<<(6*(((cont)&~RO))),     /* xorbits */             \
        (cont),                  /* container, RO included */   \
        (cs)                     /* subcharset */               \
    }
01081d4e |
/*
 * Compound text defines restrictions on which container can take
 * which character sets. Things labelled `left half of' can only go
 * in GL; things labelled `right half of' can only go in GR; and 96
 * or 96^n character sets only _fit_ in GR. Thus:
 *  - ASCII can only go in GL since it is the left half of 8859-*.
 *  - All the 8859 sets can only go in GR.
 *  - JISX0201 left is GL only; JISX0201 right is GR only.
 *  - The four multibyte sets (GB2312, JISX0208, KSC5601, JISX0212)
 *    can go in either; we prefer GR where possible since this leads
 *    to a more compact EUC-like encoding.
 *
 * Container 0 is GL and container 1 is GR. The GL designations of
 * the multibyte sets are marked RO: they are understood on input but
 * the writer never picks them. The table is kept in ASCII order of
 * the sequences.
 */
static const struct iso2022_escape ctext_escapes[] = {
    /* Multibyte sets designated into GL (input only). */
    SEQ("\033$(A", 0|RO, CTEXT_GB2312),
    SEQ("\033$(B", 0|RO, CTEXT_JISX0208),
    SEQ("\033$(C", 0|RO, CTEXT_KSC5601),
    SEQ("\033$(D", 0|RO, CTEXT_JISX0212),
    /* Multibyte sets designated into GR (preferred on output). */
    SEQ("\033$)A", 1, CTEXT_GB2312),
    SEQ("\033$)B", 1, CTEXT_JISX0208),
    SEQ("\033$)C", 1, CTEXT_KSC5601),
    SEQ("\033$)D", 1, CTEXT_JISX0212),
    /* Single-byte sets for GL. */
    SEQ("\033(B", 0, CTEXT_ASCII),
    SEQ("\033(J", 0, CTEXT_JISX0201_LEFT),
    /* Single-byte sets for GR. */
    SEQ("\033)I", 1, CTEXT_JISX0201_RIGHT),
    SEQ("\033-A", 1, CTEXT_ISO8859_1),
    SEQ("\033-B", 1, CTEXT_ISO8859_2),
    SEQ("\033-C", 1, CTEXT_ISO8859_3),
    SEQ("\033-D", 1, CTEXT_ISO8859_4),
    SEQ("\033-F", 1, CTEXT_ISO8859_7),
    SEQ("\033-G", 1, CTEXT_ISO8859_6),
    SEQ("\033-H", 1, CTEXT_ISO8859_8),
    SEQ("\033-L", 1, CTEXT_ISO8859_5),
    SEQ("\033-M", 1, CTEXT_ISO8859_9),
};
8bade113 |
/*
 * Subset description for compound text: an 8-bit subset (eightbit is
 * TRUE) with an empty reset list and an empty initial sequence. The
 * initial s1 puts ASCII in container 0 and ISO 8859-1 in container 1.
 */
static const struct iso2022 ctext = {
    ctext_escapes, lenof(ctext_escapes),
    "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\2\2",   /* must match the enum above */
    "", 0x80000000 | (CTEXT_ASCII<<0) | (CTEXT_ISO8859_1<<6), "", TRUE,
    ctext_to_ucs, ctext_from_ucs
};
/* Charset entry tying CS_CTEXT to the generic reader and writer. */
const charset_spec charset_CS_CTEXT = {
    CS_CTEXT, read_iso2022s, write_iso2022s, &ctext
};
726 | |
c6d25d8d |
#else /* ENUM_CHARSETS */

/*
 * When ENUM_CHARSETS is defined, everything above is skipped and this
 * file expands to just one ENUM_CHARSET() invocation per charset it
 * implements. ENUM_CHARSET itself is not defined in this file;
 * presumably the including context supplies it.
 */
ENUM_CHARSET(CS_ISO2022_JP)
ENUM_CHARSET(CS_ISO2022_KR)
ENUM_CHARSET(CS_CTEXT)

#endif /* ENUM_CHARSETS */