Explicitly constify a bunch of static data declarations which were
[sgt/charset] / iso2022s.c
1 /*
2 * iso2022s.c - support for ISO-2022 subset encodings.
3 *
4 * (The `s' suffix on the filename is there to leave `iso2022.c'
5 * free for the unlikely event that I ever attempt to implement
6 * _full_ ISO-2022 in this library!)
7 */
8
9 #ifndef ENUM_CHARSETS
10
11 #include <stdio.h>
12 #include <string.h>
13 #include <assert.h>
14
15 #include "charset.h"
16 #include "internal.h"
17 #include "sbcsdat.h"
18
/* Control bytes that can introduce a shift or escape sequence. */
enum {
    SO  = 0x0E,                        /* Shift Out */
    SI  = 0x0F,                        /* Shift In */
    ESC = 0x1B                         /* Escape */
};
22
/*
 * Description of one escape (or shift) sequence recognised by an
 * ISO 2022 subset, and of what it does to the s1 state word.
 */
struct iso2022_escape {
    /* The bytes of the sequence itself, NUL-terminated. */
    char const *sequence;

    /*
     * Effect of the sequence on the state: s1 is first ANDed with
     * `andbits' and then XORed with `xorbits'.
     */
    unsigned long andbits;
    unsigned long xorbits;

    /*
     * Output-side hints, used to work out which sequences must be
     * emitted to reach a wanted state.
     *
     * `container' is nominally 0-3. It may additionally have the RO
     * flag ORed in, meaning "do not pick this container for this
     * charset when writing". `subcharset' is the index of the
     * sub-charset the sequence selects.
     */
    int container;
    int subcharset;
};

/* Flag bit for `container': not a preferred container on output. */
#define RO 0x80
38
/*
 * Complete description of one ISO 2022 subset encoding. Instances
 * are initialised positionally, so the member order is significant.
 */
struct iso2022 {
    /*
     * The escape sequences this subset supports, and how many there
     * are. The array MUST be sorted in ASCII order: the reader
     * narrows down candidate sequences by walking it forwards.
     */
    const struct iso2022_escape *escapes;
    int nescapes;

    /*
     * Sub-charsets of a subset are numbered from 0 upwards.
     * nbytes[i] is the number of bytes per character in sub-charset
     * i. It is a string rather than an int array purely because
     * that is more convenient to write as a C initialiser.
     */
    char const *nbytes;

    /*
     * Recipe for resetting the encoding state on output. Each
     * character is (index into `escapes') + 1, so that NUL can
     * still terminate the string. The listed escapes are emitted in
     * order, skipping any that would leave the state unchanged.
     */
    char const *reset;

    /*
     * Starting value of s1, for subsets whose default container
     * contents are not simply sub-charset 0 everywhere. The top bit
     * must be set.
     */
    unsigned long s1;

    /*
     * Some subsets _require_ a fixed shift sequence at the start of
     * output; if so it is given here. It is not demanded on input,
     * but it should also appear in `escapes' so that the reader
     * swallows it when it is present.
     */
    char const *initial_sequence;

    /* Nonzero if this is an 8-bit ISO 2022 subset. */
    int eightbit;

    /*
     * Per-subset translation hooks between (sub-charset, byte
     * value) pairs and character codes.
     */
    long int (*to_ucs)(int subcharset, unsigned long bytes);
    int (*from_ucs)(long int ucs, int *subcharset, unsigned long *bytes);
};
93
94 static void read_iso2022s(charset_spec const *charset, long int input_chr,
95 charset_state *state,
96 void (*emit)(void *ctx, long int output),
97 void *emitctx)
98 {
99 struct iso2022 const *iso = (struct iso2022 *)charset->data;
100
101 /*
102 * For reading ISO-2022 subsets, we divide up our state
103 * variables as follows:
104 *
105 * - The top byte of s0 (bits 31:24) indicates, if nonzero,
106 * that we are part-way through a recognised ISO-2022 escape
107 * sequence. Five of those bits (31:27) give the index of
108 * the first member of the escapes list matching what we
109 * have so far; the remaining three (26:24) give the number
110 * of characters we have seen so far.
111 *
112 * - The top bit of s1 (bit 31) is non-zero at all times, to
113 * indicate that we have performed any necessary
114 * initialisation. When we start, we detect a zero s1 and
115 * respond to it by initialising the default container
116 * contents.
117 *
118 * - The next three bits of s1 (bits 30:28) indicate which
119 * _container_ is currently selected. This isn't quite as
120 * simple as it sounds, since we have to preserve memory of
121 * which of the SI/SO containers we came from when we're
122 * temporarily in SS2/SS3. Hence, what happens is:
123 * + bit 28 indicates SI/SO.
124 * + if we're in an SS2/SS3 container, that's indicated by
125 * the two bits above that being nonzero and holding
126 * either 2 or 3.
127 * + Hence: 0 is SI, 1 is SO, 4 is SS2-from-SI, 5 is
128 * SS2-from-SO, 6 is SS3-from-SI, 7 is SS3-from-SO.
129 * + For added fun: in an _8-bit_ ISO 2022 subset, we have
130 * the further special value 2, which means that we're
131 * theoretically in SI but the current character being
132 * accumulated is composed of 8-bit characters and will
133 * therefore be interpreted as if in SO.
134 *
135 * - The next nibble of s1 (27:24) indicates how many bytes
136 * have been accumulated in the current character.
137 *
138 * - The remaining three bytes of s1 are divided into four
139 * six-bit sections, and each section gives the current
140 * sub-charset selected in one of the possible containers.
141 * (Those containers are SI, SO, SS2 and SS3, respectively
142 * and in order from the bottom of s0 to the top.)
143 *
144 * - The bottom 24 bits of s0 give the accumulated character
145 * data so far.
146 *
147 * (Note that this means s1 contains all the parts of the state
148 * which might need to be operated on by escape sequences.
149 * Cunning, eh?)
150 */
151
152 if (!(state->s1 & 0x80000000)) {
153 state->s1 = iso->s1;
154 }
155
156 /*
157 * So. Firstly, we process escape sequences, if we're in the
158 * middle of one or if we see a possible introducer (SI, SO,
159 * ESC).
160 */
161 if ((state->s0 >> 24) ||
162 (input_chr == SO || input_chr == SI || input_chr == ESC)) {
163 int n = (state->s0 >> 24) & 7, i = (state->s0 >> 27), oi = i, j;
164
165 /*
166 * If this is the start of an escape sequence, we might be
167 * in mid-character. If so, clear the character state and
168 * emit an error token for the incomplete character.
169 */
170 if (state->s1 & 0x0F000000) {
171 state->s1 &= ~0x0F000000;
172 state->s0 &= 0xFF000000;
173 /*
174 * If we were in the SS2 or SS3 container, we
175 * automatically exit it.
176 */
177 if (state->s1 & 0x60000000)
178 state->s1 &= 0x9FFFFFFF;
179 emit(emitctx, ERROR);
180 }
181
182 j = i;
183 while (j < iso->nescapes &&
184 !memcmp(iso->escapes[j].sequence,
185 iso->escapes[oi].sequence, n)) {
186 if (iso->escapes[j].sequence[n] < input_chr)
187 i = ++j;
188 else
189 break;
190 }
191 if (i >= iso->nescapes ||
192 memcmp(iso->escapes[i].sequence,
193 iso->escapes[oi].sequence, n) ||
194 iso->escapes[i].sequence[n] != input_chr) {
195 /*
196 * This character does not appear in any valid escape
197 * sequence. Therefore, we must emit all the characters
198 * we had previously swallowed, plus this one, and
199 * return to non-escape-sequence state.
200 */
201 for (j = 0; j < n; j++)
202 emit(emitctx, iso->escapes[oi].sequence[j]);
203 emit(emitctx, input_chr);
204 state->s0 = 0;
205 return;
206 }
207
208 /*
209 * Otherwise, we have found an additional character in our
210 * escape sequence. See if we have reached the _end_ of our
211 * sequence (and therefore must process the sequence).
212 */
213 n++;
214 if (!iso->escapes[i].sequence[n]) {
215 state->s0 = 0;
216 state->s1 &= iso->escapes[i].andbits;
217 state->s1 ^= iso->escapes[i].xorbits;
218 return;
219 }
220
221 /*
222 * Failing _that_, we simply update our escape-sequence-
223 * tracking state.
224 */
225 assert(i < 32 && n < 8);
226 state->s0 = (i << 27) | (n << 24);
227 return;
228 }
229
230 /*
231 * If this isn't an escape sequence, it must be part of a
232 * character. One possibility is that it's a control character
233 * (00-20 or 7F-9F; also in non-8-bit ISO 2022 subsets I'm
234 * going to treat all top-half characters as controls), in
235 * which case we output it verbatim.
236 */
237 if (input_chr < 0x21 ||
238 (input_chr > 0x7E && (!iso->eightbit || input_chr < 0xA0))) {
239 /*
240 * We might be in mid-multibyte-character. If so, clear the
241 * character state and emit an error token for the
242 * incomplete character.
243 */
244 if (state->s1 & 0x0F000000) {
245 state->s1 &= ~0x0F000000;
246 state->s0 &= 0xFF000000;
247 emit(emitctx, ERROR);
248 /*
249 * If we were in the SS2 or SS3 container, we
250 * automatically exit it.
251 */
252 if (state->s1 & 0x60000000)
253 state->s1 &= 0x9FFFFFFF;
254 }
255
256 emit(emitctx, input_chr);
257 return;
258 }
259
260 /*
261 * Otherwise, accumulate character data.
262 */
263 {
264 unsigned long chr;
265 int chrlen, cont, subcharset, bytes;
266
267 /*
268 * Verify that we've seen the right kind of character for
269 * what we're currently doing. This only matters in 8-bit
270 * subsets.
271 */
272 if (iso->eightbit) {
273 cont = (state->s1 >> 28) & 7;
274 /*
275 * If cont==0, we're entitled to see either GL or GR
276 * characters. If cont==2, we expect only GR; otherwise
277 * we expect only GL.
278 *
279 * If we see a GR character while cont==0, we set
280 * cont=2 immediately.
281 */
282 if ((cont == 2 && !(input_chr & 0x80)) ||
283 (cont != 0 && cont != 2 && (input_chr & 0x80))) {
284 /*
285 * Clear the previous character; it was prematurely
286 * terminated by this error.
287 */
288 state->s1 &= ~0x0F000000;
289 state->s0 &= 0xFF000000;
290 emit(emitctx, ERROR);
291 /*
292 * If we were in the SS2 or SS3 container, we
293 * automatically exit it.
294 */
295 if (state->s1 & 0x60000000)
296 state->s1 &= 0x9FFFFFFF;
297 }
298
299 if (cont == 0 && (input_chr & 0x80)) {
300 state->s1 |= 0x20000000;
301 }
302 }
303
304 /* The current character and its length. */
305 chr = ((state->s0 & 0x00FFFFFF) << 8) | (input_chr & 0x7F);
306 chrlen = ((state->s1 >> 24) & 0xF) + 1;
307 /* The current sub-charset. */
308 cont = (state->s1 >> 28) & 7;
309 if (cont > 1) cont >>= 1;
310 subcharset = (state->s1 >> (6*cont)) & 0x3F;
311 /* The number of bytes-per-character in that sub-charset. */
312 bytes = iso->nbytes[subcharset];
313
314 /*
315 * If this character is now complete, we convert and emit
316 * it. Otherwise, we simply update the state and return.
317 */
318 if (chrlen >= bytes) {
319 emit(emitctx, iso->to_ucs(subcharset, chr));
320 chr = chrlen = 0;
321 /*
322 * If we were in the SS2 or SS3 container, we
323 * automatically exit it.
324 */
325 if (state->s1 & 0x60000000)
326 state->s1 &= 0x9FFFFFFF;
327 }
328 state->s0 = (state->s0 & 0xFF000000) | chr;
329 state->s1 = (state->s1 & 0xF0FFFFFF) | (chrlen << 24);
330 }
331 }
332
333 static int write_iso2022s(charset_spec const *charset, long int input_chr,
334 charset_state *state,
335 void (*emit)(void *ctx, long int output),
336 void *emitctx)
337 {
338 struct iso2022 const *iso = (struct iso2022 *)charset->data;
339 int subcharset, len, i, j, cont, topbit = 0;
340 unsigned long bytes;
341
342 /*
343 * For output, our s1 state variable contains most of the same
344 * stuff as it did for input - initial-state indicator bit,
345 * current container, and current subcharset selected in each
346 * container.
347 */
348
349 /*
350 * Analyse the character and find out what subcharset it needs
351 * to go in.
352 */
353 if (input_chr >= 0 && !iso->from_ucs(input_chr, &subcharset, &bytes))
354 return FALSE;
355
356 if (!(state->s1 & 0x80000000)) {
357 state->s1 = iso->s1;
358 if (iso->initial_sequence)
359 for (i = 0; iso->initial_sequence[i]; i++)
360 emit(emitctx, iso->initial_sequence[i]);
361 }
362
363 if (input_chr == -1) {
364 unsigned long oldstate;
365 int k;
366
367 /*
368 * Special case: reset encoding state.
369 */
370 for (i = 0; iso->reset[i]; i++) {
371 j = iso->reset[i] - 1;
372 oldstate = state->s1;
373 state->s1 &= iso->escapes[j].andbits;
374 state->s1 ^= iso->escapes[j].xorbits;
375 if (state->s1 != oldstate) {
376 /* We must actually emit this sequence. */
377 for (k = 0; iso->escapes[j].sequence[k]; k++)
378 emit(emitctx, iso->escapes[j].sequence[k]);
379 }
380 }
381
382 return TRUE;
383 }
384
385 /*
386 * Now begins the fun. We now know what subcharset we want. So
387 * we must find out which container we should select it into,
388 * select it into it if necessary, select that _container_ if
389 * necessary, and then output the given bytes.
390 */
391 for (i = 0; i < iso->nescapes; i++)
392 if (iso->escapes[i].subcharset == subcharset &&
393 !(iso->escapes[i].container & RO))
394 break;
395 assert(i < iso->nescapes);
396
397 /*
398 * We've found the escape sequence which would select this
399 * subcharset into a container. However, that subcharset might
400 * already _be_ selected in that container! Check before we go
401 * to the effort of emitting the sequence.
402 */
403 cont = iso->escapes[i].container &~ RO;
404 if (((state->s1 >> (6*cont)) & 0x3F) != (unsigned)subcharset) {
405 for (j = 0; iso->escapes[i].sequence[j]; j++)
406 emit(emitctx, iso->escapes[i].sequence[j]);
407 state->s1 &= iso->escapes[i].andbits;
408 state->s1 ^= iso->escapes[i].xorbits;
409 }
410
411 /*
412 * Now we know what container our subcharset is in, so we want
413 * to select that container.
414 */
415 if (cont > 1) {
416 /* SS2 or SS3; just output the sequence and be done. */
417 emit(emitctx, ESC);
418 emit(emitctx, 'L' + cont); /* comes out to 'N' or 'O' */
419 } else {
420 /*
421 * Emit SI or SO, but only if the current container isn't already
422 * the right one.
423 *
424 * Also, in an 8-bit subset, we need not do this; we'll
425 * just use 8-bit characters to output SO-container
426 * characters.
427 */
428 if (iso->eightbit && cont == 1 && ((state->s1 >> 28) & 7) == 0) {
429 topbit = 0x80;
430 } else if (((state->s1 >> 28) & 7) != (unsigned)cont) {
431 emit(emitctx, cont ? SO : SI);
432 state->s1 = (state->s1 & 0x8FFFFFFF) | (cont << 28);
433 }
434 }
435
436 /*
437 * We're done. Subcharset is selected in container, container
438 * is selected. All we need now is to write out the bytes.
439 */
440 len = iso->nbytes[subcharset];
441 while (len--)
442 emit(emitctx, ((bytes >> (8*len)) & 0xFF) | topbit);
443
444 return TRUE;
445 }
446
447 /*
448 * ISO-2022-JP, defined in RFC 1468.
449 */
450 static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes)
451 {
452 switch (subcharset) {
453 case 0: return bytes; /* one-byte ASCII */
454 case 1: /* JIS X 0201 half-width katakana */
455 if (bytes >= 0x21 && bytes <= 0x5F)
456 return bytes + (0xFF61 - 0x21);
457 else
458 return ERROR;
459 /* (no break needed since all control paths have returned) */
460 case 2: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
461 ((bytes ) & 0xFF) - 0x21);
462 default: return ERROR;
463 }
464 }
465 static int iso2022jp_from_ucs(long int ucs, int *subcharset,
466 unsigned long *bytes)
467 {
468 int r, c;
469 if (ucs < 0x80) {
470 *subcharset = 0;
471 *bytes = ucs;
472 return 1;
473 } else if (ucs >= 0xFF61 && ucs <= 0xFF9F) {
474 *subcharset = 1;
475 *bytes = ucs - (0xFF61 - 0x21);
476 return 1;
477 } else if (unicode_to_jisx0208(ucs, &r, &c)) {
478 *subcharset = 2;
479 *bytes = ((r+0x21) << 8) | (c+0x21);
480 return 1;
481 } else {
482 return 0;
483 }
484 }
485 static const struct iso2022_escape iso2022jp_escapes[] = {
486 {"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1}, /* we ignore this one */
487 {"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2},
488 {"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0},
489 {"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1},
490 };
491 static const struct iso2022 iso2022jp = {
492 iso2022jp_escapes, lenof(iso2022jp_escapes),
493 "\1\1\2", "\3", 0x80000000, NULL, FALSE,
494 iso2022jp_to_ucs, iso2022jp_from_ucs
495 };
496 const charset_spec charset_CS_ISO2022_JP = {
497 CS_ISO2022_JP, read_iso2022s, write_iso2022s, &iso2022jp
498 };
499
500 /*
501 * ISO-2022-KR, defined in RFC 1557.
502 */
503 static long int iso2022kr_to_ucs(int subcharset, unsigned long bytes)
504 {
505 switch (subcharset) {
506 case 0: return bytes; /* one-byte ASCII */
507 case 1: return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
508 ((bytes ) & 0xFF) - 0x21);
509 default: return ERROR;
510 }
511 }
/*
 * Find an ISO-2022-KR representation of `ucs'. On success the
 * sub-charset index and byte value(s) are stored through the
 * pointers and 1 is returned; otherwise 0 is returned.
 */
static int iso2022kr_from_ucs(long int ucs, int *subcharset,
                              unsigned long *bytes)
{
    int row, col;

    /* Anything below 0x80 goes out as ASCII. */
    if (ucs < 0x80) {
        *subcharset = 0;
        *bytes = ucs;
        return 1;
    }

    /* Otherwise it has to be in KS X 1001. */
    if (unicode_to_ksx1001(ucs, &row, &col)) {
        *subcharset = 1;
        *bytes = ((row + 0x21) << 8) | (col + 0x21);
        return 1;
    }

    return 0;
}
528 static const struct iso2022_escape iso2022kr_escapes[] = {
529 {"\016", 0x8FFFFFFF, 0x10000000, -1, -1},
530 {"\017", 0x8FFFFFFF, 0x00000000, 0, 0},
531 {"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1}, /* bits[11:6] <- 1 */
532 };
533 static const struct iso2022 iso2022kr = {
534 iso2022kr_escapes, lenof(iso2022kr_escapes),
535 "\1\2", "\2", 0x80000040, "\033$)C", FALSE,
536 iso2022kr_to_ucs, iso2022kr_from_ucs
537 };
538 const charset_spec charset_CS_ISO2022_KR = {
539 CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr
540 };
541
542 /*
543 * The COMPOUND_TEXT encoding used in X selections. Defined by the
544 * X consortium.
545 *
546 * This encoding has quite a few sub-charsets. The order I assign
547 * to them here is given in an enum.
548 */
enum {
    /*
     * Sub-charset indices for COMPOUND_TEXT. The numbering must
     * stay in step with the bytes-per-character string in the
     * `ctext' description, which is indexed by these values.
     */
    CTEXT_ASCII          = 0,
    CTEXT_JISX0201_LEFT  = 1,
    CTEXT_JISX0201_RIGHT = 2,
    CTEXT_ISO8859_1      = 3,
    CTEXT_ISO8859_2      = 4,
    CTEXT_ISO8859_3      = 5,
    CTEXT_ISO8859_4      = 6,
    CTEXT_ISO8859_5      = 7,
    CTEXT_ISO8859_6      = 8,
    CTEXT_ISO8859_7      = 9,
    CTEXT_ISO8859_8      = 10,
    CTEXT_ISO8859_9      = 11,
    CTEXT_GB2312         = 12,
    CTEXT_KSC5601        = 13,
    CTEXT_JISX0208       = 14,
    CTEXT_JISX0212       = 15
};
568 static long int ctext_to_ucs(int subcharset, unsigned long bytes)
569 {
570 switch (subcharset) {
571 case CTEXT_ASCII: return bytes; /* one-byte ASCII */
572 case CTEXT_JISX0201_LEFT: /* ASCII with yen and overline */
573 return sbcs_to_unicode(&sbcsdata_CS_JISX0201, bytes & 0x7F);
574 case CTEXT_JISX0201_RIGHT: /* JIS X 0201 half-width katakana */
575 return sbcs_to_unicode(&sbcsdata_CS_JISX0201, (bytes & 0x7F) | 0x80);
576 case CTEXT_ISO8859_1:
577 return sbcs_to_unicode(&sbcsdata_CS_ISO8859_1, (bytes & 0x7F) | 0x80);
578 case CTEXT_ISO8859_2:
579 return sbcs_to_unicode(&sbcsdata_CS_ISO8859_2, (bytes & 0x7F) | 0x80);
580 case CTEXT_ISO8859_3:
581 return sbcs_to_unicode(&sbcsdata_CS_ISO8859_3, (bytes & 0x7F) | 0x80);
582 case CTEXT_ISO8859_4:
583 return sbcs_to_unicode(&sbcsdata_CS_ISO8859_4, (bytes & 0x7F) | 0x80);
584 case CTEXT_ISO8859_5:
585 return sbcs_to_unicode(&sbcsdata_CS_ISO8859_5, (bytes & 0x7F) | 0x80);
586 case CTEXT_ISO8859_6:
587 return sbcs_to_unicode(&sbcsdata_CS_ISO8859_6, (bytes & 0x7F) | 0x80);
588 case CTEXT_ISO8859_7:
589 return sbcs_to_unicode(&sbcsdata_CS_ISO8859_7, (bytes & 0x7F) | 0x80);
590 case CTEXT_ISO8859_8:
591 return sbcs_to_unicode(&sbcsdata_CS_ISO8859_8, (bytes & 0x7F) | 0x80);
592 case CTEXT_ISO8859_9:
593 return sbcs_to_unicode(&sbcsdata_CS_ISO8859_9, (bytes & 0x7F) | 0x80);
594 case CTEXT_GB2312:
595 return gb2312_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
596 ((bytes ) & 0xFF) - 0x21);
597 case CTEXT_KSC5601:
598 return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
599 ((bytes ) & 0xFF) - 0x21);
600 case CTEXT_JISX0208:
601 return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
602 ((bytes ) & 0xFF) - 0x21);
603 case CTEXT_JISX0212:
604 return jisx0212_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
605 ((bytes ) & 0xFF) - 0x21);
606 default: return ERROR;
607 }
608 }
609 static int ctext_from_ucs(long int ucs, int *subcharset, unsigned long *bytes)
610 {
611 int r, c;
612 if (ucs < 0x80) {
613 *subcharset = CTEXT_ASCII;
614 *bytes = ucs;
615 return 1;
616 } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_1, ucs)) != ERROR) {
617 *subcharset = CTEXT_ISO8859_1;
618 *bytes = c - 0x80;
619 return 1;
620 } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_2, ucs)) != ERROR) {
621 *subcharset = CTEXT_ISO8859_2;
622 *bytes = c - 0x80;
623 return 1;
624 } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_3, ucs)) != ERROR) {
625 *subcharset = CTEXT_ISO8859_3;
626 *bytes = c - 0x80;
627 return 1;
628 } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_4, ucs)) != ERROR) {
629 *subcharset = CTEXT_ISO8859_4;
630 *bytes = c - 0x80;
631 return 1;
632 } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_5, ucs)) != ERROR) {
633 *subcharset = CTEXT_ISO8859_5;
634 *bytes = c - 0x80;
635 return 1;
636 } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_6, ucs)) != ERROR) {
637 *subcharset = CTEXT_ISO8859_6;
638 *bytes = c - 0x80;
639 return 1;
640 } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_7, ucs)) != ERROR) {
641 *subcharset = CTEXT_ISO8859_7;
642 *bytes = c - 0x80;
643 return 1;
644 } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_8, ucs)) != ERROR) {
645 *subcharset = CTEXT_ISO8859_8;
646 *bytes = c - 0x80;
647 return 1;
648 } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_9, ucs)) != ERROR) {
649 *subcharset = CTEXT_ISO8859_9;
650 *bytes = c - 0x80;
651 return 1;
652 } else if ((c = sbcs_from_unicode(&sbcsdata_CS_JISX0201, ucs)) != ERROR) {
653 if (c < 0x80) {
654 *subcharset = CTEXT_JISX0201_LEFT;
655 } else {
656 *subcharset = CTEXT_JISX0201_RIGHT;
657 c -= 0x80;
658 }
659 *bytes = c;
660 return 1;
661 } else if (unicode_to_gb2312(ucs, &r, &c)) {
662 *subcharset = CTEXT_GB2312;
663 *bytes = ((r+0x21) << 8) | (c+0x21);
664 return 1;
665 } else if (unicode_to_ksx1001(ucs, &r, &c)) {
666 *subcharset = CTEXT_KSC5601;
667 *bytes = ((r+0x21) << 8) | (c+0x21);
668 return 1;
669 } else if (unicode_to_jisx0208(ucs, &r, &c)) {
670 *subcharset = CTEXT_JISX0208;
671 *bytes = ((r+0x21) << 8) | (c+0x21);
672 return 1;
673 } else if (unicode_to_jisx0212(ucs, &r, &c)) {
674 *subcharset = CTEXT_JISX0212;
675 *bytes = ((r+0x21) << 8) | (c+0x21);
676 return 1;
677 } else {
678 return 0;
679 }
680 }
/*
 * Build one `struct iso2022_escape' initialiser for an escape that
 * designates sub-charset `cs' into container `cont' (which may carry
 * the RO flag). The and/xor masks replace that container's six-bit
 * field in s1 with `cs'.
 */
#define SEQ_SHIFT(cont) (6*((cont)&~RO))
#define SEQ(str,cont,cs) \
    { str, ~(63<<SEQ_SHIFT(cont)), (cs)<<SEQ_SHIFT(cont), (cont), (cs) }
683 /*
684 * Compound text defines restrictions on which container can take
685 * which character sets. Things labelled `left half of' can only go
686 * in GL; things labelled `right half of' can only go in GR; and 96
687 * or 96^n character sets only _fit_ in GR. Thus:
688 * - ASCII can only go in GL since it is the left half of 8859-*.
689 * - All the 8859 sets can only go in GR.
690 * - JISX0201 left is GL only; JISX0201 right is GR only.
691 * - The four multibyte sets (GB2312, JISX0208, JISX0212, KSC5601)
692 * can go in either; we prefer GR where possible since this leads
693 * to a more compact EUC-like encoding.
694 */
695 static const struct iso2022_escape ctext_escapes[] = {
696 SEQ("\033$(A", 0|RO, CTEXT_GB2312),
697 SEQ("\033$(B", 0|RO, CTEXT_JISX0208),
698 SEQ("\033$(C", 0|RO, CTEXT_KSC5601),
699 SEQ("\033$(D", 0|RO, CTEXT_JISX0212),
700 SEQ("\033$)A", 1, CTEXT_GB2312),
701 SEQ("\033$)B", 1, CTEXT_JISX0208),
702 SEQ("\033$)C", 1, CTEXT_KSC5601),
703 SEQ("\033$)D", 1, CTEXT_JISX0212),
704 SEQ("\033(B", 0, CTEXT_ASCII),
705 SEQ("\033(J", 0, CTEXT_JISX0201_LEFT),
706 SEQ("\033)I", 1, CTEXT_JISX0201_RIGHT),
707 SEQ("\033-A", 1, CTEXT_ISO8859_1),
708 SEQ("\033-B", 1, CTEXT_ISO8859_2),
709 SEQ("\033-C", 1, CTEXT_ISO8859_3),
710 SEQ("\033-D", 1, CTEXT_ISO8859_4),
711 SEQ("\033-F", 1, CTEXT_ISO8859_7),
712 SEQ("\033-G", 1, CTEXT_ISO8859_6),
713 SEQ("\033-H", 1, CTEXT_ISO8859_8),
714 SEQ("\033-L", 1, CTEXT_ISO8859_5),
715 SEQ("\033-M", 1, CTEXT_ISO8859_9),
716 };
717 static const struct iso2022 ctext = {
718 ctext_escapes, lenof(ctext_escapes),
719 "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\2\2", /* must match the enum above */
720 "", 0x80000000 | (CTEXT_ASCII<<0) | (CTEXT_ISO8859_1<<6), "", TRUE,
721 ctext_to_ucs, ctext_from_ucs
722 };
723 const charset_spec charset_CS_CTEXT = {
724 CS_CTEXT, read_iso2022s, write_iso2022s, &ctext
725 };
726
727 #else /* ENUM_CHARSETS */
728
/*
 * When ENUM_CHARSETS is defined, none of the code above is compiled
 * and this file expands to just the following list: one entry per
 * charset implemented here. ENUM_CHARSET itself is not defined in
 * this file; presumably whoever includes it with ENUM_CHARSETS set
 * supplies the definition - TODO confirm against the includer.
 */
ENUM_CHARSET(CS_ISO2022_JP)
ENUM_CHARSET(CS_ISO2022_KR)
ENUM_CHARSET(CS_CTEXT)
732
733 #endif /* ENUM_CHARSETS */