mdw@git.distorted.org.uk Git - sgt/charset/blob - iso2022s.c

   1 /*
   2  * iso2022s.c - support for ISO-2022 subset encodings.
   3  *
   4  * (The `s' suffix on the filename is there to leave `iso2022.c'
   5  * free for the unlikely event that I ever attempt to implement
   6  * _full_ ISO-2022 in this library!)
   7  */
   8
   9 #ifndef ENUM_CHARSETS
  10
  11 #include <stdio.h>
  12 #include <string.h>
  13 #include <assert.h>
  14
  15 #include "charset.h"
  16 #include "internal.h"
  17 #include "sbcsdat.h"
  18
  19 #define SO (0x0E)
  20 #define SI (0x0F)
  21 #define ESC (0x1B)
  22
  23 /* Functional description of a single ISO 2022 escape sequence. */
  24 struct iso2022_escape {
  25     char const *sequence;
  26     unsigned long andbits, xorbits;
  27     /*
  28      * For output, these variables help us figure out which escape
  29      * sequences we need to get where we want to be.
  30      *
  31      * `container' should be in the range 0-3, but can also be ORed
  32      * with the bit flag RO to indicate that this is not a
  33      * preferred container to use for this charset during output.
  34      */
  35     int container, subcharset;
  36 };
  37 #define RO 0x80
  38
  39 struct iso2022 {
  40     /*
  41      * List of escape sequences supported in this subset. Must be
  42      * in ASCII order, so that we can narrow down the list as
  43      * necessary.
  44      */
  45     const struct iso2022_escape *escapes;/* must be sorted in ASCII order! */
  46     int nescapes;
  47
  48     /*
  49      * We assign indices from 0 upwards to the sub-charsets of a
  50      * given ISO 2022 subset. nbytes[i] tells us how many bytes per
  51      * character are required by sub-charset i. (It's a string
  52      * mainly because that makes it easier to declare in C syntax
  53      * than an int array.)
  54      */
  55     char const *nbytes;
  56
  57     /*
  58      * The characters in this string are indices-plus-one (so that
  59      * NUL can still terminate) of escape sequences in `escapes'.
  60      * These escapes are output in the given sequence to reset the
  61      * encoding state, unless it turns out that a given escape
  62      * would not change the state at all.
  63      */
  64     char const *reset;
  65
  66     /*
  67      * Initial value of s1, in case the default container contents
  68      * needs to be something other than charset 0 in all cases.
  69      * (Note that this must have the top bit set!)
  70      */
  71     unsigned long s1;
  72
  73     /*
  74      * For output, some ISO 2022 subsets _mandate_ an initial shift
  75      * sequence. If so, here it is so we can output it. (For the
  76      * sake of basic sanity we won't bother to _require_ it on
  77      * input, although it should of course be listed under
  78      * `escapes' above so that we ignore it when present.)
  79      */
  80     char const *initial_sequence;
  81
  82     /*
  83      * Is this an 8-bit ISO 2022 subset?
  84      */
  85     int eightbit;
  86
  87     /*
  88      * Function calls to do the actual translation.
  89      */
  90     long int (*to_ucs)(int subcharset, unsigned long bytes);
  91     int (*from_ucs)(long int ucs, int *subcharset, unsigned long *bytes);
  92 };
  93
  94 static void read_iso2022s(charset_spec const *charset, long int input_chr,
  95                           charset_state *state,
  96                           void (*emit)(void *ctx, long int output),
  97                           void *emitctx)
  98 {
  99     struct iso2022 const *iso = (struct iso2022 *)charset->data;
 100
 101     /*
 102      * For reading ISO-2022 subsets, we divide up our state
 103      * variables as follows:
 104      *
 105      *  - The top byte of s0 (bits 31:24) indicates, if nonzero,
 106      *    that we are part-way through a recognised ISO-2022 escape
 107      *    sequence. Five of those bits (31:27) give the index of
 108      *    the first member of the escapes list matching what we
 109      *    have so far; the remaining three (26:24) give the number
 110      *    of characters we have seen so far.
 111      *
 112      *  - The top bit of s1 (bit 31) is non-zero at all times, to
 113      *    indicate that we have performed any necessary
 114      *    initialisation. When we start, we detect a zero s1 and
 115      *    respond to it by initialising the default container
 116      *    contents.
 117      *
 118      *  - The next three bits of s1 (bits 30:28) indicate which
 119      *    _container_ is currently selected. This isn't quite as
 120      *    simple as it sounds, since we have to preserve memory of
 121      *    which of the SI/SO containers we came from when we're
 122      *    temporarily in SS2/SS3. Hence, what happens is:
 123      *     + bit 28 indicates SI/SO.
 124      *     + if we're in an SS2/SS3 container, that's indicated by
 125      *       the two bits above that being nonzero and holding
 126      *       either 2 or 3.
 127      *     + Hence: 0 is SI, 1 is SO, 4 is SS2-from-SI, 5 is
 128      *       SS2-from-SO, 6 is SS3-from-SI, 7 is SS3-from-SO.
 129      *     + For added fun: in an _8-bit_ ISO 2022 subset, we have
 130      *       the further special value 2, which means that we're
 131      *       theoretically in SI but the current character being
 132      *       accumulated is composed of 8-bit characters and will
 133      *       therefore be interpreted as if in SO.
 134      *
 135      *  - The next nibble of s1 (27:24) indicates how many bytes
 136      *    have been accumulated in the current character.
 137      *
 138      *  - The remaining three bytes of s1 are divided into four
 139      *    six-bit sections, and each section gives the current
 140      *    sub-charset selected in one of the possible containers.
 141      *    (Those containers are SI, SO, SS2 and SS3, respectively
 142      *    and in order from the bottom of s0 to the top.)
 143      *
 144      *  - The bottom 24 bits of s0 give the accumulated character
 145      *    data so far.
 146      *
 147      * (Note that this means s1 contains all the parts of the state
 148      * which might need to be operated on by escape sequences.
 149      * Cunning, eh?)
 150      */
 151
 152     if (!(state->s1 & 0x80000000)) {
 153         state->s1 = iso->s1;
 154     }
 155
 156     /*
 157      * So. Firstly, we process escape sequences, if we're in the
 158      * middle of one or if we see a possible introducer (SI, SO,
 159      * ESC).
 160      */
 161     if ((state->s0 >> 24) ||
 162         (input_chr == SO || input_chr == SI || input_chr == ESC)) {
 163         int n = (state->s0 >> 24) & 7, i = (state->s0 >> 27), oi = i, j;
 164
 165         /*
 166          * If this is the start of an escape sequence, we might be
 167          * in mid-character. If so, clear the character state and
 168          * emit an error token for the incomplete character.
 169          */
 170         if (state->s1 & 0x0F000000) {
 171             state->s1 &= ~0x0F000000;
 172             state->s0 &= 0xFF000000;
 173             /*
 174              * If we were in the SS2 or SS3 container, we
 175              * automatically exit it.
 176              */
 177             if (state->s1 & 0x60000000)
 178                 state->s1 &= 0x9FFFFFFF;
 179             emit(emitctx, ERROR);
 180         }
 181
 182         j = i;
 183         while (j < iso->nescapes &&
 184                !memcmp(iso->escapes[j].sequence,
 185                        iso->escapes[oi].sequence, n)) {
 186             if (iso->escapes[j].sequence[n] < input_chr)
 187                 i = ++j;
 188             else
 189                 break;
 190         }
 191         if (i >= iso->nescapes ||
 192             memcmp(iso->escapes[i].sequence,
 193                    iso->escapes[oi].sequence, n) ||
 194             iso->escapes[i].sequence[n] != input_chr) {
 195             /*
 196              * This character does not appear in any valid escape
 197              * sequence. Therefore, we must emit all the characters
 198              * we had previously swallowed, plus this one, and
 199              * return to non-escape-sequence state.
 200              */
 201             for (j = 0; j < n; j++)
 202                 emit(emitctx, iso->escapes[oi].sequence[j]);
 203             emit(emitctx, input_chr);
 204             state->s0 = 0;
 205             return;
 206         }
 207
 208         /*
 209          * Otherwise, we have found an additional character in our
 210          * escape sequence. See if we have reached the _end_ of our
 211          * sequence (and therefore must process the sequence).
 212          */
 213         n++;
 214         if (!iso->escapes[i].sequence[n]) {
 215             state->s0 = 0;
 216             state->s1 &= iso->escapes[i].andbits;
 217             state->s1 ^= iso->escapes[i].xorbits;
 218             return;
 219         }
 220
 221         /*
 222          * Failing _that_, we simply update our escape-sequence-
 223          * tracking state.
 224          */
 225         assert(i < 32 && n < 8);
 226         state->s0 = (i << 27) | (n << 24);
 227         return;
 228     }
 229
 230     /*
 231      * If this isn't an escape sequence, it must be part of a
 232      * character. One possibility is that it's a control character
 233      * (00-20 or 7F-9F; also in non-8-bit ISO 2022 subsets I'm
 234      * going to treat all top-half characters as controls), in
 235      * which case we output it verbatim.
 236      */
 237     if (input_chr < 0x21 ||
 238         (input_chr > 0x7E && (!iso->eightbit || input_chr < 0xA0))) {
 239         /*
 240          * We might be in mid-multibyte-character. If so, clear the
 241          * character state and emit an error token for the
 242          * incomplete character.
 243          */
 244         if (state->s1 & 0x0F000000) {
 245             state->s1 &= ~0x0F000000;
 246             state->s0 &= 0xFF000000;
 247             emit(emitctx, ERROR);
 248             /*
 249              * If we were in the SS2 or SS3 container, we
 250              * automatically exit it.
 251              */
 252             if (state->s1 & 0x60000000)
 253                 state->s1 &= 0x9FFFFFFF;
 254         }
 255
 256         emit(emitctx, input_chr);
 257         return;
 258     }
 259
 260     /*
 261      * Otherwise, accumulate character data.
 262      */
 263     {
 264         unsigned long chr;
 265         int chrlen, cont, subcharset, bytes;
 266
 267         /*
 268          * Verify that we've seen the right kind of character for
 269          * what we're currently doing. This only matters in 8-bit
 270          * subsets.
 271          */
 272         if (iso->eightbit) {
 273             cont = (state->s1 >> 28) & 7;
 274             /*
 275              * If cont==0, we're entitled to see either GL or GR
 276              * characters. If cont==2, we expect only GR; otherwise
 277              * we expect only GL.
 278              *
 279              * If we see a GR character while cont==0, we set
 280              * cont=2 immediately.
 281              */
 282             if ((cont == 2 && !(input_chr & 0x80)) ||
 283                 (cont != 0 && cont != 2 && (input_chr & 0x80))) {
 284                 /*
 285                  * Clear the previous character; it was prematurely
 286                  * terminated by this error.
 287                  */
 288                 state->s1 &= ~0x0F000000;
 289                 state->s0 &= 0xFF000000;
 290                 emit(emitctx, ERROR);
 291                 /*
 292                  * If we were in the SS2 or SS3 container, we
 293                  * automatically exit it.
 294                  */
 295                 if (state->s1 & 0x60000000)
 296                     state->s1 &= 0x9FFFFFFF;
 297             }
 298
 299             if (cont == 0 && (input_chr & 0x80)) {
 300                 state->s1 |= 0x20000000;
 301             }
 302         }
 303
 304         /* The current character and its length. */
 305         chr = ((state->s0 & 0x00FFFFFF) << 8) | (input_chr & 0x7F);
 306         chrlen = ((state->s1 >> 24) & 0xF) + 1;
 307         /* The current sub-charset. */
 308         cont = (state->s1 >> 28) & 7;
 309         if (cont > 1) cont >>= 1;
 310         subcharset = (state->s1 >> (6*cont)) & 0x3F;
 311         /* The number of bytes-per-character in that sub-charset. */
 312         bytes = iso->nbytes[subcharset];
 313
 314         /*
 315          * If this character is now complete, we convert and emit
 316          * it. Otherwise, we simply update the state and return.
 317          */
 318         if (chrlen >= bytes) {
 319             emit(emitctx, iso->to_ucs(subcharset, chr));
 320             chr = chrlen = 0;
 321             /*
 322              * If we were in the SS2 or SS3 container, we
 323              * automatically exit it.
 324              */
 325             if (state->s1 & 0x60000000)
 326                 state->s1 &= 0x9FFFFFFF;
 327         }
 328         state->s0 = (state->s0 & 0xFF000000) | chr;
 329         state->s1 = (state->s1 & 0xF0FFFFFF) | (chrlen << 24);
 330     }
 331 }
 332
 333 static int write_iso2022s(charset_spec const *charset, long int input_chr,
 334                           charset_state *state,
 335                           void (*emit)(void *ctx, long int output),
 336                           void *emitctx)
 337 {
 338     struct iso2022 const *iso = (struct iso2022 *)charset->data;
 339     int subcharset, len, i, j, cont, topbit = 0;
 340     unsigned long bytes;
 341
 342     /*
 343      * For output, our s1 state variable contains most of the same
 344      * stuff as it did for input - initial-state indicator bit,
 345      * current container, and current subcharset selected in each
 346      * container.
 347      */
 348
 349     /*
 350      * Analyse the character and find out what subcharset it needs
 351      * to go in.
 352      */
 353     if (input_chr >= 0 && !iso->from_ucs(input_chr, &subcharset, &bytes))
 354         return FALSE;
 355
 356     if (!(state->s1 & 0x80000000)) {
 357         state->s1 = iso->s1;
 358         if (iso->initial_sequence)
 359             for (i = 0; iso->initial_sequence[i]; i++)
 360                 emit(emitctx, iso->initial_sequence[i]);
 361     }
 362
 363     if (input_chr == -1) {
 364         unsigned long oldstate;
 365         int k;
 366
 367         /*
 368          * Special case: reset encoding state.
 369          */
 370         for (i = 0; iso->reset[i]; i++) {
 371             j = iso->reset[i] - 1;
 372             oldstate = state->s1;
 373             state->s1 &= iso->escapes[j].andbits;
 374             state->s1 ^= iso->escapes[j].xorbits;
 375             if (state->s1 != oldstate) {
 376                 /* We must actually emit this sequence. */
 377                 for (k = 0; iso->escapes[j].sequence[k]; k++)
 378                     emit(emitctx, iso->escapes[j].sequence[k]);
 379             }
 380         }
 381
 382         return TRUE;
 383     }
 384
 385     /*
 386      * Now begins the fun. We now know what subcharset we want. So
 387      * we must find out which container we should select it into,
 388      * select it into it if necessary, select that _container_ if
 389      * necessary, and then output the given bytes.
 390      */
 391     for (i = 0; i < iso->nescapes; i++)
 392         if (iso->escapes[i].subcharset == subcharset &&
 393             !(iso->escapes[i].container & RO))
 394             break;
 395     assert(i < iso->nescapes);
 396
 397     /*
 398      * We've found the escape sequence which would select this
 399      * subcharset into a container. However, that subcharset might
 400      * already _be_ selected in that container! Check before we go
 401      * to the effort of emitting the sequence.
 402      */
 403     cont = iso->escapes[i].container &~ RO;
 404     if (((state->s1 >> (6*cont)) & 0x3F) != (unsigned)subcharset) {
 405         for (j = 0; iso->escapes[i].sequence[j]; j++)
 406             emit(emitctx, iso->escapes[i].sequence[j]);
 407         state->s1 &= iso->escapes[i].andbits;
 408         state->s1 ^= iso->escapes[i].xorbits;
 409     }
 410
 411     /*
 412      * Now we know what container our subcharset is in, so we want
 413      * to select that container.
 414      */
 415     if (cont > 1) {
 416         /* SS2 or SS3; just output the sequence and be done. */
 417         emit(emitctx, ESC);
 418         emit(emitctx, 'L' + cont);     /* comes out to 'N' or 'O' */
 419     } else {
 420         /*
 421          * Emit SI or SO, but only if the current container isn't already
 422          * the right one.
 423          *
 424          * Also, in an 8-bit subset, we need not do this; we'll
 425          * just use 8-bit characters to output SO-container
 426          * characters.
 427          */
 428         if (iso->eightbit && cont == 1 && ((state->s1 >> 28) & 7) == 0) {
 429             topbit = 0x80;
 430         } else if (((state->s1 >> 28) & 7) != (unsigned)cont) {
 431             emit(emitctx, cont ? SO : SI);
 432             state->s1 = (state->s1 & 0x8FFFFFFF) | (cont << 28);
 433         }
 434     }
 435
 436     /*
 437      * We're done. Subcharset is selected in container, container
 438      * is selected. All we need now is to write out the bytes.
 439      */
 440     len = iso->nbytes[subcharset];
 441     while (len--)
 442         emit(emitctx, ((bytes >> (8*len)) & 0xFF) | topbit);
 443
 444     return TRUE;
 445 }
 446
 447 /*
 448  * ISO-2022-JP, defined in RFC 1468.
 449  */
 450 static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes)
 451 {
 452     switch (subcharset) {
 453       case 0: return bytes;            /* one-byte ASCII */
 454       case 1:                          /* JIS X 0201 half-width katakana */
 455         if (bytes >= 0x21 && bytes <= 0x5F)
 456             return bytes + (0xFF61 - 0x21);
 457         else
 458             return ERROR;
 459         /* (no break needed since all control paths have returned) */
 460       case 2: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
 461                                          ((bytes     ) & 0xFF) - 0x21);
 462       default: return ERROR;
 463     }
 464 }
 465 static int iso2022jp_from_ucs(long int ucs, int *subcharset,
 466                               unsigned long *bytes)
 467 {
 468     int r, c;
 469     if (ucs < 0x80) {
 470         *subcharset = 0;
 471         *bytes = ucs;
 472         return 1;
 473     } else if (ucs >= 0xFF61 && ucs <= 0xFF9F) {
 474         *subcharset = 1;
 475         *bytes = ucs - (0xFF61 - 0x21);
 476         return 1;
 477     } else if (unicode_to_jisx0208(ucs, &r, &c)) {
 478         *subcharset = 2;
 479         *bytes = ((r+0x21) << 8) | (c+0x21);
 480         return 1;
 481     } else {
 482         return 0;
 483     }
 484 }
 485 static const struct iso2022_escape iso2022jp_escapes[] = {
 486     {"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1},   /* we ignore this one */
 487     {"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2},
 488     {"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0},
 489     {"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1},
 490 };
 491 static const struct iso2022 iso2022jp = {
 492     iso2022jp_escapes, lenof(iso2022jp_escapes),
 493     "\1\1\2", "\3", 0x80000000, NULL, FALSE,
 494     iso2022jp_to_ucs, iso2022jp_from_ucs
 495 };
 496 const charset_spec charset_CS_ISO2022_JP = {
 497     CS_ISO2022_JP, read_iso2022s, write_iso2022s, &iso2022jp
 498 };
 499
 500 /*
 501  * ISO-2022-KR, defined in RFC 1557.
 502  */
 503 static long int iso2022kr_to_ucs(int subcharset, unsigned long bytes)
 504 {
 505     switch (subcharset) {
 506       case 0: return bytes;            /* one-byte ASCII */
 507       case 1: return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
 508                                         ((bytes     ) & 0xFF) - 0x21);
 509       default: return ERROR;
 510     }
 511 }
 512 static int iso2022kr_from_ucs(long int ucs, int *subcharset,
 513                               unsigned long *bytes)
 514 {
 515     int r, c;
 516     if (ucs < 0x80) {
 517         *subcharset = 0;
 518         *bytes = ucs;
 519         return 1;
 520     } else if (unicode_to_ksx1001(ucs, &r, &c)) {
 521         *subcharset = 1;
 522         *bytes = ((r+0x21) << 8) | (c+0x21);
 523         return 1;
 524     } else {
 525         return 0;
 526     }
 527 }
 528 static const struct iso2022_escape iso2022kr_escapes[] = {
 529     {"\016", 0x8FFFFFFF, 0x10000000, -1, -1},
 530     {"\017", 0x8FFFFFFF, 0x00000000, 0, 0},
 531     {"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1},   /* bits[11:6] <- 1 */
 532 };
 533 static const struct iso2022 iso2022kr = {
 534     iso2022kr_escapes, lenof(iso2022kr_escapes),
 535     "\1\2", "\2", 0x80000040, "\033$)C", FALSE,
 536     iso2022kr_to_ucs, iso2022kr_from_ucs
 537 };
 538 const charset_spec charset_CS_ISO2022_KR = {
 539     CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr
 540 };
 541
 542 /*
 543  * The COMPOUND_TEXT encoding used in X selections. Defined by the
 544  * X consortium.
 545  *
 546  * This encoding has quite a few sub-charsets. The order I assign
 547  * to them here is given in an enum.
 548  */
 549 enum {
 550     /* This must match the bytes-per-character string given below. */
 551     CTEXT_ASCII,
 552     CTEXT_JISX0201_LEFT,
 553     CTEXT_JISX0201_RIGHT,
 554     CTEXT_ISO8859_1,
 555     CTEXT_ISO8859_2,
 556     CTEXT_ISO8859_3,
 557     CTEXT_ISO8859_4,
 558     CTEXT_ISO8859_5,
 559     CTEXT_ISO8859_6,
 560     CTEXT_ISO8859_7,
 561     CTEXT_ISO8859_8,
 562     CTEXT_ISO8859_9,
 563     CTEXT_GB2312,
 564     CTEXT_KSC5601,
 565     CTEXT_JISX0208,
 566     CTEXT_JISX0212
 567 };
 568 static long int ctext_to_ucs(int subcharset, unsigned long bytes)
 569 {
 570     switch (subcharset) {
 571       case CTEXT_ASCII: return bytes;          /* one-byte ASCII */
 572       case CTEXT_JISX0201_LEFT:        /* ASCII with yen and overline */
 573         return sbcs_to_unicode(&sbcsdata_CS_JISX0201, bytes & 0x7F);
 574       case CTEXT_JISX0201_RIGHT:       /* JIS X 0201 half-width katakana */
 575         return sbcs_to_unicode(&sbcsdata_CS_JISX0201, (bytes & 0x7F) | 0x80);
 576       case CTEXT_ISO8859_1:
 577         return sbcs_to_unicode(&sbcsdata_CS_ISO8859_1, (bytes & 0x7F) | 0x80);
 578       case CTEXT_ISO8859_2:
 579         return sbcs_to_unicode(&sbcsdata_CS_ISO8859_2, (bytes & 0x7F) | 0x80);
 580       case CTEXT_ISO8859_3:
 581         return sbcs_to_unicode(&sbcsdata_CS_ISO8859_3, (bytes & 0x7F) | 0x80);
 582       case CTEXT_ISO8859_4:
 583         return sbcs_to_unicode(&sbcsdata_CS_ISO8859_4, (bytes & 0x7F) | 0x80);
 584       case CTEXT_ISO8859_5:
 585         return sbcs_to_unicode(&sbcsdata_CS_ISO8859_5, (bytes & 0x7F) | 0x80);
 586       case CTEXT_ISO8859_6:
 587         return sbcs_to_unicode(&sbcsdata_CS_ISO8859_6, (bytes & 0x7F) | 0x80);
 588       case CTEXT_ISO8859_7:
 589         return sbcs_to_unicode(&sbcsdata_CS_ISO8859_7, (bytes & 0x7F) | 0x80);
 590       case CTEXT_ISO8859_8:
 591         return sbcs_to_unicode(&sbcsdata_CS_ISO8859_8, (bytes & 0x7F) | 0x80);
 592       case CTEXT_ISO8859_9:
 593         return sbcs_to_unicode(&sbcsdata_CS_ISO8859_9, (bytes & 0x7F) | 0x80);
 594       case CTEXT_GB2312:
 595         return gb2312_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
 596                                  ((bytes     ) & 0xFF) - 0x21);
 597       case CTEXT_KSC5601:
 598         return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
 599                                   ((bytes     ) & 0xFF) - 0x21);
 600       case CTEXT_JISX0208:
 601         return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
 602                                    ((bytes     ) & 0xFF) - 0x21);
 603       case CTEXT_JISX0212:
 604         return jisx0212_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
 605                                    ((bytes     ) & 0xFF) - 0x21);
 606       default: return ERROR;
 607     }
 608 }
 609 static int ctext_from_ucs(long int ucs, int *subcharset, unsigned long *bytes)
 610 {
 611     int r, c;
 612     if (ucs < 0x80) {
 613         *subcharset = CTEXT_ASCII;
 614         *bytes = ucs;
 615         return 1;
 616     } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_1, ucs)) != ERROR) {
 617         *subcharset = CTEXT_ISO8859_1;
 618         *bytes = c - 0x80;
 619         return 1;
 620     } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_2, ucs)) != ERROR) {
 621         *subcharset = CTEXT_ISO8859_2;
 622         *bytes = c - 0x80;
 623         return 1;
 624     } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_3, ucs)) != ERROR) {
 625         *subcharset = CTEXT_ISO8859_3;
 626         *bytes = c - 0x80;
 627         return 1;
 628     } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_4, ucs)) != ERROR) {
 629         *subcharset = CTEXT_ISO8859_4;
 630         *bytes = c - 0x80;
 631         return 1;
 632     } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_5, ucs)) != ERROR) {
 633         *subcharset = CTEXT_ISO8859_5;
 634         *bytes = c - 0x80;
 635         return 1;
 636     } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_6, ucs)) != ERROR) {
 637         *subcharset = CTEXT_ISO8859_6;
 638         *bytes = c - 0x80;
 639         return 1;
 640     } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_7, ucs)) != ERROR) {
 641         *subcharset = CTEXT_ISO8859_7;
 642         *bytes = c - 0x80;
 643         return 1;
 644     } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_8, ucs)) != ERROR) {
 645         *subcharset = CTEXT_ISO8859_8;
 646         *bytes = c - 0x80;
 647         return 1;
 648     } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_9, ucs)) != ERROR) {
 649         *subcharset = CTEXT_ISO8859_9;
 650         *bytes = c - 0x80;
 651         return 1;
 652     } else if ((c = sbcs_from_unicode(&sbcsdata_CS_JISX0201, ucs)) != ERROR) {
 653         if (c < 0x80) {
 654             *subcharset = CTEXT_JISX0201_LEFT;
 655         } else {
 656             *subcharset = CTEXT_JISX0201_RIGHT;
 657             c -= 0x80;
 658         }
 659         *bytes = c;
 660         return 1;
 661     } else if (unicode_to_gb2312(ucs, &r, &c)) {
 662         *subcharset = CTEXT_GB2312;
 663         *bytes = ((r+0x21) << 8) | (c+0x21);
 664         return 1;
 665     } else if (unicode_to_ksx1001(ucs, &r, &c)) {
 666         *subcharset = CTEXT_KSC5601;
 667         *bytes = ((r+0x21) << 8) | (c+0x21);
 668         return 1;
 669     } else if (unicode_to_jisx0208(ucs, &r, &c)) {
 670         *subcharset = CTEXT_JISX0208;
 671         *bytes = ((r+0x21) << 8) | (c+0x21);
 672         return 1;
 673     } else if (unicode_to_jisx0212(ucs, &r, &c)) {
 674         *subcharset = CTEXT_JISX0212;
 675         *bytes = ((r+0x21) << 8) | (c+0x21);
 676         return 1;
 677     } else {
 678         return 0;
 679     }
 680 }
 681 #define SEQ(str,cont,cs) \
 682     {str,~(63<<(6*(((cont)&~RO)))),(cs)<<(6*(((cont)&~RO))),(cont),(cs)}
/*
 * Compound text defines restrictions on which container can take
 * which character sets. Things labelled `left half of' can only go
 * in GL; things labelled `right half of' can only go in GR; and 96
 * or 96^n character sets only _fit_ in GR. Thus:
 *  - ASCII can only go in GL since it is the left half of 8859-*.
 *  - All the 8859 sets can only go in GR.
 *  - JISX0201 left is GL only; JISX0201 right is GR only.
 *  - The four multibyte sets (GB2312, JISX0208, KSC5601, JISX0212)
 *    can go in either; we prefer GR where possible since this leads
 *    to a more compact EUC-like encoding.
 */
 695 static const struct iso2022_escape ctext_escapes[] = {
 696     SEQ("\033$(A", 0|RO, CTEXT_GB2312),
 697     SEQ("\033$(B", 0|RO, CTEXT_JISX0208),
 698     SEQ("\033$(C", 0|RO, CTEXT_KSC5601),
 699     SEQ("\033$(D", 0|RO, CTEXT_JISX0212),
 700     SEQ("\033$)A", 1, CTEXT_GB2312),
 701     SEQ("\033$)B", 1, CTEXT_JISX0208),
 702     SEQ("\033$)C", 1, CTEXT_KSC5601),
 703     SEQ("\033$)D", 1, CTEXT_JISX0212),
 704     SEQ("\033(B", 0, CTEXT_ASCII),
 705     SEQ("\033(J", 0, CTEXT_JISX0201_LEFT),
 706     SEQ("\033)I", 1, CTEXT_JISX0201_RIGHT),
 707     SEQ("\033-A", 1, CTEXT_ISO8859_1),
 708     SEQ("\033-B", 1, CTEXT_ISO8859_2),
 709     SEQ("\033-C", 1, CTEXT_ISO8859_3),
 710     SEQ("\033-D", 1, CTEXT_ISO8859_4),
 711     SEQ("\033-F", 1, CTEXT_ISO8859_7),
 712     SEQ("\033-G", 1, CTEXT_ISO8859_6),
 713     SEQ("\033-H", 1, CTEXT_ISO8859_8),
 714     SEQ("\033-L", 1, CTEXT_ISO8859_5),
 715     SEQ("\033-M", 1, CTEXT_ISO8859_9),
 716 };
 717 static const struct iso2022 ctext = {
 718     ctext_escapes, lenof(ctext_escapes),
 719     "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\2\2",  /* must match the enum above */
 720     "", 0x80000000 | (CTEXT_ASCII<<0) | (CTEXT_ISO8859_1<<6), "", TRUE,
 721     ctext_to_ucs, ctext_from_ucs
 722 };
 723 const charset_spec charset_CS_CTEXT = {
 724     CS_CTEXT, read_iso2022s, write_iso2022s, &ctext
 725 };
 726
 727 #else /* ENUM_CHARSETS */
 728
/*
 * When ENUM_CHARSETS is defined, this file contributes only the list
 * of charset identifiers it implements. ENUM_CHARSET is not defined
 * here; presumably the including file supplies it -- confirm there.
 */
ENUM_CHARSET(CS_ISO2022_JP)
ENUM_CHARSET(CS_ISO2022_KR)
ENUM_CHARSET(CS_CTEXT)
 732
 733 #endif /* ENUM_CHARSETS */