Ben points out that ESC ( J in ISO-2022-JP should encode the
[sgt/charset] / iso2022s.c
1 /*
2 * iso2022s.c - support for ISO-2022 subset encodings.
3 */
4
5 #ifndef ENUM_CHARSETS
6
7 #include <stdio.h>
8 #include <string.h>
9 #include <assert.h>
10
11 #include "charset.h"
12 #include "internal.h"
13 #include "sbcsdat.h"
14
/*
 * Control characters which read_iso2022s treats as possible
 * introducers of a shift/escape sequence, and which write_iso2022s
 * emits when switching container.
 */
#define SO (0x0E)                      /* Shift Out */
#define SI (0x0F)                      /* Shift In */
#define ESC (0x1B)                     /* Escape */
18
/*
 * Functional description of a single ISO 2022 escape sequence.
 *
 * When the sequence is recognised on input, or emitted on output,
 * the s1 state word is updated as s1 = (s1 & andbits) ^ xorbits.
 */
struct iso2022_escape {
    char const *sequence;              /* NUL-terminated bytes of the sequence */
    unsigned long andbits, xorbits;    /* state update masks, see above */
    /*
     * For output, these variables help us figure out which escape
     * sequences we need to get where we want to be.
     *
     * `container' should be in the range 0-3, but can also be ORed
     * with the bit flag RO to indicate that this is not a
     * preferred container to use for this charset during output.
     *
     * (The tables in this file also use -1 in both fields for
     * sequences that are recognised on input only; since -1 has the
     * RO bit set, the output search in write_iso2022s skips them.)
     */
    int container, subcharset;
};
#define RO 0x80                        /* `container' flag: not preferred for output */
34
/*
 * Description of one ISO 2022 subset encoding: its escape sequences,
 * its sub-charsets, its initial and reset state, and the functions
 * that translate between sub-charset bytes and Unicode. The
 * initialisers in this file are positional, so field order matters.
 */
struct iso2022 {
    /*
     * List of escape sequences supported in this subset. Must be
     * in ASCII order, so that we can narrow down the list as
     * necessary.
     */
    const struct iso2022_escape *escapes;/* must be sorted in ASCII order! */
    int nescapes;                      /* number of entries in `escapes' */

    /*
     * We assign indices from 0 upwards to the sub-charsets of a
     * given ISO 2022 subset. nbytes[i] tells us how many bytes per
     * character are required by sub-charset i. (It's a string
     * mainly because that makes it easier to declare in C syntax
     * than an int array.)
     */
    char const *nbytes;

    /*
     * The characters in this string are indices-plus-one (so that
     * NUL can still terminate) of escape sequences in `escapes'.
     * These escapes are output in the given sequence to reset the
     * encoding state, unless it turns out that a given escape
     * would not change the state at all.
     */
    char const *reset;

    /*
     * Initial value of s1, in case the default container contents
     * needs to be something other than charset 0 in all cases.
     * (Note that this must have the top bit set!)
     */
    unsigned long s1;

    /*
     * For output, some ISO 2022 subsets _mandate_ an initial shift
     * sequence. If so, here it is so we can output it. (For the
     * sake of basic sanity we won't bother to _require_ it on
     * input, although it should of course be listed under
     * `escapes' above so that we ignore it when present.)
     *
     * May be NULL; write_iso2022s checks for that before use.
     */
    char const *initial_sequence;

    /*
     * Is this an 8-bit ISO 2022 subset?
     */
    int eightbit;

    /*
     * Function calls to do the actual translation.
     *
     * to_ucs takes a sub-charset index and the accumulated 7-bit
     * bytes of one character (most significant byte first) and
     * returns the value to emit.
     *
     * from_ucs returns nonzero and fills in *subcharset and *bytes
     * if the character can be encoded, or returns zero if it cannot.
     */
    long int (*to_ucs)(int subcharset, unsigned long bytes);
    int (*from_ucs)(long int ucs, int *subcharset, unsigned long *bytes);
};
89
90 static void read_iso2022s(charset_spec const *charset, long int input_chr,
91 charset_state *state,
92 void (*emit)(void *ctx, long int output),
93 void *emitctx)
94 {
95 struct iso2022 const *iso = (struct iso2022 *)charset->data;
96
97 /*
98 * For reading ISO-2022 subsets, we divide up our state
99 * variables as follows:
100 *
101 * - The top byte of s0 (bits 31:24) indicates, if nonzero,
102 * that we are part-way through a recognised ISO-2022 escape
103 * sequence. Five of those bits (31:27) give the index of
104 * the first member of the escapes list matching what we
105 * have so far; the remaining three (26:24) give the number
106 * of characters we have seen so far.
107 *
108 * - The top bit of s1 (bit 31) is non-zero at all times, to
109 * indicate that we have performed any necessary
110 * initialisation. When we start, we detect a zero s1 and
111 * respond to it by initialising the default container
112 * contents.
113 *
114 * - The next three bits of s1 (bits 30:28) indicate which
115 * _container_ is currently selected. This isn't quite as
116 * simple as it sounds, since we have to preserve memory of
117 * which of the SI/SO containers we came from when we're
118 * temporarily in SS2/SS3. Hence, what happens is:
119 * + bit 28 indicates SI/SO.
120 * + if we're in an SS2/SS3 container, that's indicated by
121 * the two bits above that being nonzero and holding
122 * either 2 or 3.
123 * + Hence: 0 is SI, 1 is SO, 4 is SS2-from-SI, 5 is
124 * SS2-from-SO, 6 is SS3-from-SI, 7 is SS3-from-SO.
125 * + For added fun: in an _8-bit_ ISO 2022 subset, we have
126 * the further special value 2, which means that we're
127 * theoretically in SI but the current character being
128 * accumulated is composed of 8-bit characters and will
129 * therefore be interpreted as if in SO.
130 *
131 * - The next nibble of s1 (27:24) indicates how many bytes
132 * have been accumulated in the current character.
133 *
134 * - The remaining three bytes of s1 are divided into four
135 * six-bit sections, and each section gives the current
136 * sub-charset selected in one of the possible containers.
137 * (Those containers are SI, SO, SS2 and SS3, respectively
138 * and in order from the bottom of s0 to the top.)
139 *
140 * - The bottom 24 bits of s0 give the accumulated character
141 * data so far.
142 *
143 * (Note that this means s1 contains all the parts of the state
144 * which might need to be operated on by escape sequences.
145 * Cunning, eh?)
146 */
147
148 if (!(state->s1 & 0x80000000)) {
149 state->s1 = iso->s1;
150 }
151
152 /*
153 * So. Firstly, we process escape sequences, if we're in the
154 * middle of one or if we see a possible introducer (SI, SO,
155 * ESC).
156 */
157 if ((state->s0 >> 24) ||
158 (input_chr == SO || input_chr == SI || input_chr == ESC)) {
159 int n = (state->s0 >> 24) & 7, i = (state->s0 >> 27), oi = i, j;
160
161 /*
162 * If this is the start of an escape sequence, we might be
163 * in mid-character. If so, clear the character state and
164 * emit an error token for the incomplete character.
165 */
166 if (state->s1 & 0x0F000000) {
167 state->s1 &= ~0x0F000000;
168 state->s0 &= 0xFF000000;
169 /*
170 * If we were in the SS2 or SS3 container, we
171 * automatically exit it.
172 */
173 if (state->s1 & 0x60000000)
174 state->s1 &= 0x9FFFFFFF;
175 emit(emitctx, ERROR);
176 }
177
178 j = i;
179 while (j < iso->nescapes &&
180 !memcmp(iso->escapes[j].sequence,
181 iso->escapes[oi].sequence, n)) {
182 if (iso->escapes[j].sequence[n] < input_chr)
183 i = ++j;
184 else
185 break;
186 }
187 if (i >= iso->nescapes ||
188 memcmp(iso->escapes[i].sequence,
189 iso->escapes[oi].sequence, n) ||
190 iso->escapes[i].sequence[n] != input_chr) {
191 /*
192 * This character does not appear in any valid escape
193 * sequence. Therefore, we must emit all the characters
194 * we had previously swallowed, plus this one, and
195 * return to non-escape-sequence state.
196 */
197 for (j = 0; j < n; j++)
198 emit(emitctx, iso->escapes[oi].sequence[j]);
199 emit(emitctx, input_chr);
200 state->s0 = 0;
201 return;
202 }
203
204 /*
205 * Otherwise, we have found an additional character in our
206 * escape sequence. See if we have reached the _end_ of our
207 * sequence (and therefore must process the sequence).
208 */
209 n++;
210 if (!iso->escapes[i].sequence[n]) {
211 state->s0 = 0;
212 state->s1 &= iso->escapes[i].andbits;
213 state->s1 ^= iso->escapes[i].xorbits;
214 return;
215 }
216
217 /*
218 * Failing _that_, we simply update our escape-sequence-
219 * tracking state.
220 */
221 assert(i < 32 && n < 8);
222 state->s0 = (i << 27) | (n << 24);
223 return;
224 }
225
226 /*
227 * If this isn't an escape sequence, it must be part of a
228 * character. One possibility is that it's a control character
229 * (00-20 or 7F-9F; also in non-8-bit ISO 2022 subsets I'm
230 * going to treat all top-half characters as controls), in
231 * which case we output it verbatim.
232 */
233 if (input_chr < 0x21 ||
234 (input_chr > 0x7E && (!iso->eightbit || input_chr < 0xA0))) {
235 /*
236 * We might be in mid-multibyte-character. If so, clear the
237 * character state and emit an error token for the
238 * incomplete character.
239 */
240 if (state->s1 & 0x0F000000) {
241 state->s1 &= ~0x0F000000;
242 state->s0 &= 0xFF000000;
243 emit(emitctx, ERROR);
244 /*
245 * If we were in the SS2 or SS3 container, we
246 * automatically exit it.
247 */
248 if (state->s1 & 0x60000000)
249 state->s1 &= 0x9FFFFFFF;
250 }
251
252 emit(emitctx, input_chr);
253 return;
254 }
255
256 /*
257 * Otherwise, accumulate character data.
258 */
259 {
260 unsigned long chr;
261 int chrlen, cont, subcharset, bytes;
262
263 /*
264 * Verify that we've seen the right kind of character for
265 * what we're currently doing. This only matters in 8-bit
266 * subsets.
267 */
268 if (iso->eightbit) {
269 cont = (state->s1 >> 28) & 7;
270 /*
271 * If cont==0, we're entitled to see either GL or GR
272 * characters. If cont==2, we expect only GR; otherwise
273 * we expect only GL.
274 *
275 * If we see a GR character while cont==0, we set
276 * cont=2 immediately.
277 */
278 if ((cont == 2 && !(input_chr & 0x80)) ||
279 (cont != 0 && cont != 2 && (input_chr & 0x80))) {
280 /*
281 * Clear the previous character; it was prematurely
282 * terminated by this error.
283 */
284 state->s1 &= ~0x0F000000;
285 state->s0 &= 0xFF000000;
286 emit(emitctx, ERROR);
287 /*
288 * If we were in the SS2 or SS3 container, we
289 * automatically exit it.
290 */
291 if (state->s1 & 0x60000000)
292 state->s1 &= 0x9FFFFFFF;
293 }
294
295 if (cont == 0 && (input_chr & 0x80)) {
296 state->s1 |= 0x20000000;
297 }
298 }
299
300 /* The current character and its length. */
301 chr = ((state->s0 & 0x00FFFFFF) << 8) | (input_chr & 0x7F);
302 chrlen = ((state->s1 >> 24) & 0xF) + 1;
303 /* The current sub-charset. */
304 cont = (state->s1 >> 28) & 7;
305 if (cont > 1) cont >>= 1;
306 subcharset = (state->s1 >> (6*cont)) & 0x3F;
307 /* The number of bytes-per-character in that sub-charset. */
308 bytes = iso->nbytes[subcharset];
309
310 /*
311 * If this character is now complete, we convert and emit
312 * it. Otherwise, we simply update the state and return.
313 */
314 if (chrlen >= bytes) {
315 emit(emitctx, iso->to_ucs(subcharset, chr));
316 chr = chrlen = 0;
317 /*
318 * If we were in the SS2 or SS3 container, we
319 * automatically exit it.
320 */
321 if (state->s1 & 0x60000000)
322 state->s1 &= 0x9FFFFFFF;
323 }
324 state->s0 = (state->s0 & 0xFF000000) | chr;
325 state->s1 = (state->s1 & 0xF0FFFFFF) | (chrlen << 24);
326 }
327 }
328
329 static int write_iso2022s(charset_spec const *charset, long int input_chr,
330 charset_state *state,
331 void (*emit)(void *ctx, long int output),
332 void *emitctx)
333 {
334 struct iso2022 const *iso = (struct iso2022 *)charset->data;
335 int subcharset, len, i, j, cont, topbit = 0;
336 unsigned long bytes;
337
338 /*
339 * For output, our s1 state variable contains most of the same
340 * stuff as it did for input - initial-state indicator bit,
341 * current container, and current subcharset selected in each
342 * container.
343 */
344
345 /*
346 * Analyse the character and find out what subcharset it needs
347 * to go in.
348 */
349 if (input_chr >= 0 && !iso->from_ucs(input_chr, &subcharset, &bytes))
350 return FALSE;
351
352 if (!(state->s1 & 0x80000000)) {
353 state->s1 = iso->s1;
354 if (iso->initial_sequence)
355 for (i = 0; iso->initial_sequence[i]; i++)
356 emit(emitctx, iso->initial_sequence[i]);
357 }
358
359 if (input_chr == -1) {
360 unsigned long oldstate;
361 int k;
362
363 /*
364 * Special case: reset encoding state.
365 */
366 for (i = 0; iso->reset[i]; i++) {
367 j = iso->reset[i] - 1;
368 oldstate = state->s1;
369 state->s1 &= iso->escapes[j].andbits;
370 state->s1 ^= iso->escapes[j].xorbits;
371 if (state->s1 != oldstate) {
372 /* We must actually emit this sequence. */
373 for (k = 0; iso->escapes[j].sequence[k]; k++)
374 emit(emitctx, iso->escapes[j].sequence[k]);
375 }
376 }
377
378 return TRUE;
379 }
380
381 /*
382 * Now begins the fun. We now know what subcharset we want. So
383 * we must find out which container we should select it into,
384 * select it into it if necessary, select that _container_ if
385 * necessary, and then output the given bytes.
386 */
387 for (i = 0; i < iso->nescapes; i++)
388 if (iso->escapes[i].subcharset == subcharset &&
389 !(iso->escapes[i].container & RO))
390 break;
391 assert(i < iso->nescapes);
392
393 /*
394 * We've found the escape sequence which would select this
395 * subcharset into a container. However, that subcharset might
396 * already _be_ selected in that container! Check before we go
397 * to the effort of emitting the sequence.
398 */
399 cont = iso->escapes[i].container &~ RO;
400 if (((state->s1 >> (6*cont)) & 0x3F) != (unsigned)subcharset) {
401 for (j = 0; iso->escapes[i].sequence[j]; j++)
402 emit(emitctx, iso->escapes[i].sequence[j]);
403 state->s1 &= iso->escapes[i].andbits;
404 state->s1 ^= iso->escapes[i].xorbits;
405 }
406
407 /*
408 * Now we know what container our subcharset is in, so we want
409 * to select that container.
410 */
411 if (cont > 1) {
412 /* SS2 or SS3; just output the sequence and be done. */
413 emit(emitctx, ESC);
414 emit(emitctx, 'L' + cont); /* comes out to 'N' or 'O' */
415 } else {
416 /*
417 * Emit SI or SO, but only if the current container isn't already
418 * the right one.
419 *
420 * Also, in an 8-bit subset, we need not do this; we'll
421 * just use 8-bit characters to output SO-container
422 * characters.
423 */
424 if (iso->eightbit && cont == 1 && ((state->s1 >> 28) & 7) == 0) {
425 topbit = 0x80;
426 } else if (((state->s1 >> 28) & 7) != (unsigned)cont) {
427 emit(emitctx, cont ? SO : SI);
428 state->s1 = (state->s1 & 0x8FFFFFFF) | (cont << 28);
429 }
430 }
431
432 /*
433 * We're done. Subcharset is selected in container, container
434 * is selected. All we need now is to write out the bytes.
435 */
436 len = iso->nbytes[subcharset];
437 while (len--)
438 emit(emitctx, ((bytes >> (8*len)) & 0xFF) | topbit);
439
440 return TRUE;
441 }
442
443 /*
444 * ISO-2022-JP, defined in RFC 1468.
445 */
446 static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes)
447 {
448 switch (subcharset) {
449 case 1: /* JIS X 0201 bottom half */
450 if (bytes == 0x5C)
451 return 0xA5;
452 else if (bytes == 0x7E)
453 return 0x203E;
454 /* else fall through to ASCII */
455 case 0: return bytes; /* one-byte ASCII */
456 /* (no break needed since all control paths have returned) */
457 case 2: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
458 ((bytes ) & 0xFF) - 0x21);
459 default: return ERROR;
460 }
461 }
/*
 * Find the ISO-2022-JP sub-charset and byte value for a Unicode
 * character. Returns 1 and fills in *subcharset and *bytes on
 * success, or 0 (leaving both untouched) if there is no encoding.
 */
static int iso2022jp_from_ucs(long int ucs, int *subcharset,
                              unsigned long *bytes)
{
    int r, c;

    /* Plain ASCII goes in sub-charset 0. */
    if (ucs < 0x80) {
        *subcharset = 0;
        *bytes = ucs;
        return 1;
    }

    /* Yen and overline live in the JIS X 0201 bottom half. */
    switch (ucs) {
      case 0xA5:
        *subcharset = 1;
        *bytes = 0x5C;
        return 1;
      case 0x203E:
        *subcharset = 1;
        *bytes = 0x7E;
        return 1;
      default:
        break;
    }

    /* Everything else must be found in JIS X 0208. */
    if (!unicode_to_jisx0208(ucs, &r, &c))
        return 0;

    *subcharset = 2;
    *bytes = ((r+0x21) << 8) | (c+0x21);
    return 1;
}
/*
 * Escape sequences of ISO-2022-JP, in ASCII order. Every entry
 * rewrites the six-bit sub-charset field of container 0 (andbits
 * clears bits 5:0, xorbits supplies the new sub-charset).
 */
static const struct iso2022_escape iso2022jp_escapes[] = {
    /*
     * Accepted on input and treated exactly like ESC $ B, but never
     * generated on output (container and subcharset are -1).
     */
    {"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1}, /* we ignore this one */
    {"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2},   /* sub-charset 2 */
    {"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0},   /* sub-charset 0 */
    {"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1},   /* sub-charset 1 */
};
/*
 * Subset description for ISO-2022-JP (positional initialiser for
 * struct iso2022).
 */
static const struct iso2022 iso2022jp = {
    iso2022jp_escapes, lenof(iso2022jp_escapes),
    /*
     * nbytes: sub-charsets 0 and 1 are one byte, 2 is two bytes.
     * reset: escape number 3 (one-based), i.e. ESC ( B.
     * s1: just the initialised flag, so sub-charset 0 everywhere.
     * No initial sequence; not an 8-bit subset.
     */
    "\1\1\2", "\3", 0x80000000, NULL, FALSE,
    iso2022jp_to_ucs, iso2022jp_from_ucs
};
/*
 * Exported charset_spec for CS_ISO2022_JP: the generic ISO 2022
 * subset reader and writer, parameterised by &iso2022jp.
 */
const charset_spec charset_CS_ISO2022_JP = {
    CS_ISO2022_JP, read_iso2022s, write_iso2022s, &iso2022jp
};
496
497 /*
498 * ISO-2022-KR, defined in RFC 1557.
499 */
500 static long int iso2022kr_to_ucs(int subcharset, unsigned long bytes)
501 {
502 switch (subcharset) {
503 case 0: return bytes; /* one-byte ASCII */
504 case 1: return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
505 ((bytes ) & 0xFF) - 0x21);
506 default: return ERROR;
507 }
508 }
/*
 * Find the ISO-2022-KR sub-charset and byte value for a Unicode
 * character. Returns 1 and fills in *subcharset and *bytes on
 * success, or 0 (leaving both untouched) if there is no encoding.
 */
static int iso2022kr_from_ucs(long int ucs, int *subcharset,
                              unsigned long *bytes)
{
    int r, c;

    /* Plain ASCII goes in sub-charset 0. */
    if (ucs < 0x80) {
        *subcharset = 0;
        *bytes = ucs;
        return 1;
    }

    /* Everything else must be found in KS X 1001. */
    if (!unicode_to_ksx1001(ucs, &r, &c))
        return 0;

    *subcharset = 1;
    *bytes = ((r+0x21) << 8) | (c+0x21);
    return 1;
}
/*
 * Shift and escape sequences of ISO-2022-KR, in ASCII order.
 */
static const struct iso2022_escape iso2022kr_escapes[] = {
    /* SO: set the current-container field (bits 30:28) to 1. Input only. */
    {"\016", 0x8FFFFFFF, 0x10000000, -1, -1},
    /*
     * SI: set the current-container field to 0. Listed as container 0,
     * sub-charset 0 so that the output search finds an entry for ASCII.
     */
    {"\017", 0x8FFFFFFF, 0x00000000, 0, 0},
    {"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1}, /* bits[11:6] <- 1 */
};
/*
 * Subset description for ISO-2022-KR (positional initialiser for
 * struct iso2022).
 */
static const struct iso2022 iso2022kr = {
    iso2022kr_escapes, lenof(iso2022kr_escapes),
    /*
     * nbytes: sub-charset 0 is one byte, 1 is two bytes.
     * reset: escape number 2 (one-based), i.e. SI.
     * s1: initialised flag, plus sub-charset 1 already in container 1.
     * initial sequence: ESC $ ) C, emitted before the first output.
     * Not an 8-bit subset.
     */
    "\1\2", "\2", 0x80000040, "\033$)C", FALSE,
    iso2022kr_to_ucs, iso2022kr_from_ucs
};
/*
 * Exported charset_spec for CS_ISO2022_KR: the generic ISO 2022
 * subset reader and writer, parameterised by &iso2022kr.
 */
const charset_spec charset_CS_ISO2022_KR = {
    CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr
};
538
539 /*
540 * The COMPOUND_TEXT encoding used in X selections. Defined by the
541 * X consortium.
542 *
543 * This encoding has quite a few sub-charsets. The order I assign
544 * to them here is given in an enum.
545 */
/*
 * Sub-charset indices for COMPOUND_TEXT. These values are used as
 * the `subcharset' argument of ctext_to_ucs / result of
 * ctext_from_ucs, as indices into the nbytes string, and as the
 * values stored in the six-bit per-container fields of s1.
 */
enum {
    /* This must match the bytes-per-character string given below. */
    CTEXT_ASCII,
    CTEXT_JISX0201_LEFT,
    CTEXT_JISX0201_RIGHT,
    CTEXT_ISO8859_1,
    CTEXT_ISO8859_2,
    CTEXT_ISO8859_3,
    CTEXT_ISO8859_4,
    CTEXT_ISO8859_5,
    CTEXT_ISO8859_6,
    CTEXT_ISO8859_7,
    CTEXT_ISO8859_8,
    CTEXT_ISO8859_9,
    CTEXT_GB2312,
    CTEXT_KSC5601,
    CTEXT_JISX0208,
    CTEXT_JISX0212
};
565 static long int ctext_to_ucs(int subcharset, unsigned long bytes)
566 {
567 switch (subcharset) {
568 case CTEXT_ASCII: return bytes; /* one-byte ASCII */
569 case CTEXT_JISX0201_LEFT: /* ASCII with yen and overline */
570 return sbcs_to_unicode(&sbcsdata_CS_JISX0201, bytes & 0x7F);
571 case CTEXT_JISX0201_RIGHT: /* JIS X 0201 half-width katakana */
572 return sbcs_to_unicode(&sbcsdata_CS_JISX0201, (bytes & 0x7F) | 0x80);
573 case CTEXT_ISO8859_1:
574 return sbcs_to_unicode(&sbcsdata_CS_ISO8859_1, (bytes & 0x7F) | 0x80);
575 case CTEXT_ISO8859_2:
576 return sbcs_to_unicode(&sbcsdata_CS_ISO8859_2, (bytes & 0x7F) | 0x80);
577 case CTEXT_ISO8859_3:
578 return sbcs_to_unicode(&sbcsdata_CS_ISO8859_3, (bytes & 0x7F) | 0x80);
579 case CTEXT_ISO8859_4:
580 return sbcs_to_unicode(&sbcsdata_CS_ISO8859_4, (bytes & 0x7F) | 0x80);
581 case CTEXT_ISO8859_5:
582 return sbcs_to_unicode(&sbcsdata_CS_ISO8859_5, (bytes & 0x7F) | 0x80);
583 case CTEXT_ISO8859_6:
584 return sbcs_to_unicode(&sbcsdata_CS_ISO8859_6, (bytes & 0x7F) | 0x80);
585 case CTEXT_ISO8859_7:
586 return sbcs_to_unicode(&sbcsdata_CS_ISO8859_7, (bytes & 0x7F) | 0x80);
587 case CTEXT_ISO8859_8:
588 return sbcs_to_unicode(&sbcsdata_CS_ISO8859_8, (bytes & 0x7F) | 0x80);
589 case CTEXT_ISO8859_9:
590 return sbcs_to_unicode(&sbcsdata_CS_ISO8859_9, (bytes & 0x7F) | 0x80);
591 case CTEXT_GB2312:
592 return gb2312_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
593 ((bytes ) & 0xFF) - 0x21);
594 case CTEXT_KSC5601:
595 return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
596 ((bytes ) & 0xFF) - 0x21);
597 case CTEXT_JISX0208:
598 return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
599 ((bytes ) & 0xFF) - 0x21);
600 case CTEXT_JISX0212:
601 return jisx0212_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
602 ((bytes ) & 0xFF) - 0x21);
603 default: return ERROR;
604 }
605 }
606 static int ctext_from_ucs(long int ucs, int *subcharset, unsigned long *bytes)
607 {
608 int r, c;
609 if (ucs < 0x80) {
610 *subcharset = CTEXT_ASCII;
611 *bytes = ucs;
612 return 1;
613 } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_1, ucs)) != ERROR) {
614 *subcharset = CTEXT_ISO8859_1;
615 *bytes = c - 0x80;
616 return 1;
617 } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_2, ucs)) != ERROR) {
618 *subcharset = CTEXT_ISO8859_2;
619 *bytes = c - 0x80;
620 return 1;
621 } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_3, ucs)) != ERROR) {
622 *subcharset = CTEXT_ISO8859_3;
623 *bytes = c - 0x80;
624 return 1;
625 } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_4, ucs)) != ERROR) {
626 *subcharset = CTEXT_ISO8859_4;
627 *bytes = c - 0x80;
628 return 1;
629 } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_5, ucs)) != ERROR) {
630 *subcharset = CTEXT_ISO8859_5;
631 *bytes = c - 0x80;
632 return 1;
633 } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_6, ucs)) != ERROR) {
634 *subcharset = CTEXT_ISO8859_6;
635 *bytes = c - 0x80;
636 return 1;
637 } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_7, ucs)) != ERROR) {
638 *subcharset = CTEXT_ISO8859_7;
639 *bytes = c - 0x80;
640 return 1;
641 } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_8, ucs)) != ERROR) {
642 *subcharset = CTEXT_ISO8859_8;
643 *bytes = c - 0x80;
644 return 1;
645 } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_9, ucs)) != ERROR) {
646 *subcharset = CTEXT_ISO8859_9;
647 *bytes = c - 0x80;
648 return 1;
649 } else if ((c = sbcs_from_unicode(&sbcsdata_CS_JISX0201, ucs)) != ERROR) {
650 if (c < 0x80) {
651 *subcharset = CTEXT_JISX0201_LEFT;
652 } else {
653 *subcharset = CTEXT_JISX0201_RIGHT;
654 c -= 0x80;
655 }
656 *bytes = c;
657 return 1;
658 } else if (unicode_to_gb2312(ucs, &r, &c)) {
659 *subcharset = CTEXT_GB2312;
660 *bytes = ((r+0x21) << 8) | (c+0x21);
661 return 1;
662 } else if (unicode_to_ksx1001(ucs, &r, &c)) {
663 *subcharset = CTEXT_KSC5601;
664 *bytes = ((r+0x21) << 8) | (c+0x21);
665 return 1;
666 } else if (unicode_to_jisx0208(ucs, &r, &c)) {
667 *subcharset = CTEXT_JISX0208;
668 *bytes = ((r+0x21) << 8) | (c+0x21);
669 return 1;
670 } else if (unicode_to_jisx0212(ucs, &r, &c)) {
671 *subcharset = CTEXT_JISX0212;
672 *bytes = ((r+0x21) << 8) | (c+0x21);
673 return 1;
674 } else {
675 return 0;
676 }
677 }
/*
 * SEQ(str, cont, cs): initialiser for a struct iso2022_escape whose
 * sequence `str' designates sub-charset `cs' into container `cont'.
 * `cont' may be ORed with RO; the flag is stripped when computing the
 * masks but kept in the container field. andbits clears that
 * container's six-bit field in s1, and xorbits sets it to `cs'.
 */
#define SEQ(str,cont,cs) \
    {str,~(63<<(6*(((cont)&~RO)))),(cs)<<(6*(((cont)&~RO))),(cont),(cs)}
/*
 * Compound text defines restrictions on which container can take
 * which character sets. Things labelled `left half of' can only go
 * in GL; things labelled `right half of' can only go in GR; and 96
 * or 96^n character sets only _fit_ in GR. Thus:
 *  - ASCII can only go in GL since it is the left half of 8859-*.
 *  - All the 8859 sets can only go in GR.
 *  - JISX0201 left is GL only; JISX0201 right is GR only.
 *  - The multibyte sets (GB2312, JISX0208, KSC5601, JISX0212) can
 *    go in either; we prefer GR where possible since this leads to
 *    a more compact EUC-like encoding. (Hence the GL entries below
 *    are marked RO: accepted on input, not chosen for output.)
 *
 * The table must stay in ASCII order of the sequence strings.
 */
static const struct iso2022_escape ctext_escapes[] = {
    SEQ("\033$(A", 0|RO, CTEXT_GB2312),
    SEQ("\033$(B", 0|RO, CTEXT_JISX0208),
    SEQ("\033$(C", 0|RO, CTEXT_KSC5601),
    SEQ("\033$(D", 0|RO, CTEXT_JISX0212),
    SEQ("\033$)A", 1, CTEXT_GB2312),
    SEQ("\033$)B", 1, CTEXT_JISX0208),
    SEQ("\033$)C", 1, CTEXT_KSC5601),
    SEQ("\033$)D", 1, CTEXT_JISX0212),
    SEQ("\033(B", 0, CTEXT_ASCII),
    SEQ("\033(J", 0, CTEXT_JISX0201_LEFT),
    SEQ("\033)I", 1, CTEXT_JISX0201_RIGHT),
    SEQ("\033-A", 1, CTEXT_ISO8859_1),
    SEQ("\033-B", 1, CTEXT_ISO8859_2),
    SEQ("\033-C", 1, CTEXT_ISO8859_3),
    SEQ("\033-D", 1, CTEXT_ISO8859_4),
    SEQ("\033-F", 1, CTEXT_ISO8859_7),
    SEQ("\033-G", 1, CTEXT_ISO8859_6),
    SEQ("\033-H", 1, CTEXT_ISO8859_8),
    SEQ("\033-L", 1, CTEXT_ISO8859_5),
    SEQ("\033-M", 1, CTEXT_ISO8859_9),

    /*
     * Cross-testing against Xutf8TextListToTextProperty() turns up
     * some additional character sets and ISO 2022 features
     * supported by that and not by us:
     *
     *  - Single-byte right-hand-half character sets `ESC - f',
     *    `ESC - T' and `ESC - Y'.
     *
     *  - A really horrifying mechanism used to escape completely
     *    from the ISO 2022 framework: ESC % / <length>
     *    <charset-name> <text>. Xutf8* uses this to encode
     *    "iso8859-14", "iso8859-15" and "big5-0".
     *     * This mechanism is particularly nasty because we can't
     *       efficiently encode it on the fly! It requires that the
     *       length of the text encoded in the foreign charset is
     *       given _before_ the text in question, so if we're
     *       receiving one character at a time we simply can't look
     *       ahead and so we would have to encode each individual
     *       character in a separate one of these sequences.
     *
     *  - ESC % G and ESC % @ to shift to and from UTF-8 mode, as a
     *    last resort for anything we still don't support.
     *     * Interestingly, ctext.ps actually _disallows_ this: it
     *       says that the above extension mechanism is the only
     *       one permitted. Ho hum.
     */
};
/*
 * Subset description for COMPOUND_TEXT (positional initialiser for
 * struct iso2022).
 */
static const struct iso2022 ctext = {
    ctext_escapes, lenof(ctext_escapes),
    "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\2\2",   /* must match the enum above */
    /*
     * reset: empty, so a reset request emits nothing.
     * s1: initialised flag, ASCII in container 0 and ISO 8859-1 in
     * container 1.
     * initial sequence: empty, so nothing is emitted at start.
     * This is an 8-bit subset.
     */
    "", 0x80000000 | (CTEXT_ASCII<<0) | (CTEXT_ISO8859_1<<6), "", TRUE,
    ctext_to_ucs, ctext_from_ucs
};
/*
 * Exported charset_spec for CS_CTEXT: the generic ISO 2022 subset
 * reader and writer, parameterised by &ctext.
 */
const charset_spec charset_CS_CTEXT = {
    CS_CTEXT, read_iso2022s, write_iso2022s, &ctext
};
750
751 #else /* ENUM_CHARSETS */
752
/*
 * When ENUM_CHARSETS is defined, this file expands to nothing but the
 * list of charset identifiers it implements, one ENUM_CHARSET()
 * invocation each. (ENUM_CHARSET itself is presumably defined by
 * whoever includes the file in this mode -- not visible here.)
 */
ENUM_CHARSET(CS_ISO2022_JP)
ENUM_CHARSET(CS_ISO2022_KR)
ENUM_CHARSET(CS_CTEXT)
756
757 #endif /* ENUM_CHARSETS */