mdw@git.distorted.org.uk Git - sgt/charset/blob - iso2022s.c

   1 /*
   2  * iso2022s.c - support for ISO-2022 subset encodings.
   3  */
   4
   5 #ifndef ENUM_CHARSETS
   6
   7 #include <stdio.h>
   8 #include <string.h>
   9 #include <assert.h>
  10
  11 #include "charset.h"
  12 #include "internal.h"
  13 #include "sbcsdat.h"
  14
  15 #define SO (0x0E)
  16 #define SI (0x0F)
  17 #define ESC (0x1B)
  18
  19 /* Functional description of a single ISO 2022 escape sequence. */
  20 struct iso2022_escape {
  21     char const *sequence;
  22     unsigned long andbits, xorbits;
  23     /*
  24      * For output, these variables help us figure out which escape
  25      * sequences we need to get where we want to be.
  26      *
  27      * `container' should be in the range 0-3, but can also be ORed
  28      * with the bit flag RO to indicate that this is not a
  29      * preferred container to use for this charset during output.
  30      */
  31     int container, subcharset;
  32 };
  33 #define RO 0x80
  34
  35 struct iso2022 {
  36     /*
  37      * List of escape sequences supported in this subset. Must be
  38      * in ASCII order, so that we can narrow down the list as
  39      * necessary.
  40      */
  41     const struct iso2022_escape *escapes;/* must be sorted in ASCII order! */
  42     int nescapes;
  43
  44     /*
  45      * We assign indices from 0 upwards to the sub-charsets of a
  46      * given ISO 2022 subset. nbytes[i] tells us how many bytes per
  47      * character are required by sub-charset i. (It's a string
  48      * mainly because that makes it easier to declare in C syntax
  49      * than an int array.)
  50      */
  51     char const *nbytes;
  52
  53     /*
  54      * The characters in this string are indices-plus-one (so that
  55      * NUL can still terminate) of escape sequences in `escapes'.
  56      * These escapes are output in the given sequence to reset the
  57      * encoding state, unless it turns out that a given escape
  58      * would not change the state at all.
  59      */
  60     char const *reset;
  61
  62     /*
  63      * Initial value of s1, in case the default container contents
  64      * needs to be something other than charset 0 in all cases.
  65      * (Note that this must have the top bit set!)
  66      */
  67     unsigned long s1;
  68
  69     /*
  70      * For output, some ISO 2022 subsets _mandate_ an initial shift
  71      * sequence. If so, here it is so we can output it. (For the
  72      * sake of basic sanity we won't bother to _require_ it on
  73      * input, although it should of course be listed under
  74      * `escapes' above so that we ignore it when present.)
  75      */
  76     char const *initial_sequence;
  77
  78     /*
  79      * Is this an 8-bit ISO 2022 subset?
  80      */
  81     int eightbit;
  82
  83     /*
  84      * Function calls to do the actual translation.
  85      */
  86     long int (*to_ucs)(int subcharset, unsigned long bytes);
  87     int (*from_ucs)(long int ucs, int *subcharset, unsigned long *bytes);
  88 };
  89
  90 static void read_iso2022s(charset_spec const *charset, long int input_chr,
  91                           charset_state *state,
  92                           void (*emit)(void *ctx, long int output),
  93                           void *emitctx)
  94 {
  95     struct iso2022 const *iso = (struct iso2022 *)charset->data;
  96
  97     /*
  98      * For reading ISO-2022 subsets, we divide up our state
  99      * variables as follows:
 100      *
 101      *  - The top byte of s0 (bits 31:24) indicates, if nonzero,
 102      *    that we are part-way through a recognised ISO-2022 escape
 103      *    sequence. Five of those bits (31:27) give the index of
 104      *    the first member of the escapes list matching what we
 105      *    have so far; the remaining three (26:24) give the number
 106      *    of characters we have seen so far.
 107      *
 108      *  - The top bit of s1 (bit 31) is non-zero at all times, to
 109      *    indicate that we have performed any necessary
 110      *    initialisation. When we start, we detect a zero s1 and
 111      *    respond to it by initialising the default container
 112      *    contents.
 113      *
 114      *  - The next three bits of s1 (bits 30:28) indicate which
 115      *    _container_ is currently selected. This isn't quite as
 116      *    simple as it sounds, since we have to preserve memory of
 117      *    which of the SI/SO containers we came from when we're
 118      *    temporarily in SS2/SS3. Hence, what happens is:
 119      *     + bit 28 indicates SI/SO.
 120      *     + if we're in an SS2/SS3 container, that's indicated by
 121      *       the two bits above that being nonzero and holding
 122      *       either 2 or 3.
 123      *     + Hence: 0 is SI, 1 is SO, 4 is SS2-from-SI, 5 is
 124      *       SS2-from-SO, 6 is SS3-from-SI, 7 is SS3-from-SO.
 125      *     + For added fun: in an _8-bit_ ISO 2022 subset, we have
 126      *       the further special value 2, which means that we're
 127      *       theoretically in SI but the current character being
 128      *       accumulated is composed of 8-bit characters and will
 129      *       therefore be interpreted as if in SO.
 130      *
 131      *  - The next nibble of s1 (27:24) indicates how many bytes
 132      *    have been accumulated in the current character.
 133      *
 134      *  - The remaining three bytes of s1 are divided into four
 135      *    six-bit sections, and each section gives the current
 136      *    sub-charset selected in one of the possible containers.
 137      *    (Those containers are SI, SO, SS2 and SS3, respectively
 138      *    and in order from the bottom of s0 to the top.)
 139      *
 140      *  - The bottom 24 bits of s0 give the accumulated character
 141      *    data so far.
 142      *
 143      * (Note that this means s1 contains all the parts of the state
 144      * which might need to be operated on by escape sequences.
 145      * Cunning, eh?)
 146      */
 147
 148     if (!(state->s1 & 0x80000000)) {
 149         state->s1 = iso->s1;
 150     }
 151
 152     /*
 153      * So. Firstly, we process escape sequences, if we're in the
 154      * middle of one or if we see a possible introducer (SI, SO,
 155      * ESC).
 156      */
 157     if ((state->s0 >> 24) ||
 158         (input_chr == SO || input_chr == SI || input_chr == ESC)) {
 159         int n = (state->s0 >> 24) & 7, i = (state->s0 >> 27), oi = i, j;
 160
 161         /*
 162          * If this is the start of an escape sequence, we might be
 163          * in mid-character. If so, clear the character state and
 164          * emit an error token for the incomplete character.
 165          */
 166         if (state->s1 & 0x0F000000) {
 167             state->s1 &= ~0x0F000000;
 168             state->s0 &= 0xFF000000;
 169             /*
 170              * If we were in the SS2 or SS3 container, we
 171              * automatically exit it.
 172              */
 173             if (state->s1 & 0x60000000)
 174                 state->s1 &= 0x9FFFFFFF;
 175             emit(emitctx, ERROR);
 176         }
 177
 178         j = i;
 179         while (j < iso->nescapes &&
 180                !memcmp(iso->escapes[j].sequence,
 181                        iso->escapes[oi].sequence, n)) {
 182             if (iso->escapes[j].sequence[n] < input_chr)
 183                 i = ++j;
 184             else
 185                 break;
 186         }
 187         if (i >= iso->nescapes ||
 188             memcmp(iso->escapes[i].sequence,
 189                    iso->escapes[oi].sequence, n) ||
 190             iso->escapes[i].sequence[n] != input_chr) {
 191             /*
 192              * This character does not appear in any valid escape
 193              * sequence. Therefore, we must emit all the characters
 194              * we had previously swallowed, plus this one, and
 195              * return to non-escape-sequence state.
 196              */
 197             for (j = 0; j < n; j++)
 198                 emit(emitctx, iso->escapes[oi].sequence[j]);
 199             emit(emitctx, input_chr);
 200             state->s0 = 0;
 201             return;
 202         }
 203
 204         /*
 205          * Otherwise, we have found an additional character in our
 206          * escape sequence. See if we have reached the _end_ of our
 207          * sequence (and therefore must process the sequence).
 208          */
 209         n++;
 210         if (!iso->escapes[i].sequence[n]) {
 211             state->s0 = 0;
 212             state->s1 &= iso->escapes[i].andbits;
 213             state->s1 ^= iso->escapes[i].xorbits;
 214             return;
 215         }
 216
 217         /*
 218          * Failing _that_, we simply update our escape-sequence-
 219          * tracking state.
 220          */
 221         assert(i < 32 && n < 8);
 222         state->s0 = (i << 27) | (n << 24);
 223         return;
 224     }
 225
 226     /*
 227      * If this isn't an escape sequence, it must be part of a
 228      * character. One possibility is that it's a control character
 229      * (00-20 or 7F-9F; also in non-8-bit ISO 2022 subsets I'm
 230      * going to treat all top-half characters as controls), in
 231      * which case we output it verbatim.
 232      */
 233     if (input_chr < 0x21 ||
 234         (input_chr > 0x7E && (!iso->eightbit || input_chr < 0xA0))) {
 235         /*
 236          * We might be in mid-multibyte-character. If so, clear the
 237          * character state and emit an error token for the
 238          * incomplete character.
 239          */
 240         if (state->s1 & 0x0F000000) {
 241             state->s1 &= ~0x0F000000;
 242             state->s0 &= 0xFF000000;
 243             emit(emitctx, ERROR);
 244             /*
 245              * If we were in the SS2 or SS3 container, we
 246              * automatically exit it.
 247              */
 248             if (state->s1 & 0x60000000)
 249                 state->s1 &= 0x9FFFFFFF;
 250         }
 251
 252         emit(emitctx, input_chr);
 253         return;
 254     }
 255
 256     /*
 257      * Otherwise, accumulate character data.
 258      */
 259     {
 260         unsigned long chr;
 261         int chrlen, cont, subcharset, bytes;
 262
 263         /*
 264          * Verify that we've seen the right kind of character for
 265          * what we're currently doing. This only matters in 8-bit
 266          * subsets.
 267          */
 268         if (iso->eightbit) {
 269             cont = (state->s1 >> 28) & 7;
 270             /*
 271              * If cont==0, we're entitled to see either GL or GR
 272              * characters. If cont==2, we expect only GR; otherwise
 273              * we expect only GL.
 274              *
 275              * If we see a GR character while cont==0, we set
 276              * cont=2 immediately.
 277              */
 278             if ((cont == 2 && !(input_chr & 0x80)) ||
 279                 (cont != 0 && cont != 2 && (input_chr & 0x80))) {
 280                 /*
 281                  * Clear the previous character; it was prematurely
 282                  * terminated by this error.
 283                  */
 284                 state->s1 &= ~0x0F000000;
 285                 state->s0 &= 0xFF000000;
 286                 emit(emitctx, ERROR);
 287                 /*
 288                  * If we were in the SS2 or SS3 container, we
 289                  * automatically exit it.
 290                  */
 291                 if (state->s1 & 0x60000000)
 292                     state->s1 &= 0x9FFFFFFF;
 293             }
 294
 295             if (cont == 0 && (input_chr & 0x80)) {
 296                 state->s1 |= 0x20000000;
 297             }
 298         }
 299
 300         /* The current character and its length. */
 301         chr = ((state->s0 & 0x00FFFFFF) << 8) | (input_chr & 0x7F);
 302         chrlen = ((state->s1 >> 24) & 0xF) + 1;
 303         /* The current sub-charset. */
 304         cont = (state->s1 >> 28) & 7;
 305         if (cont > 1) cont >>= 1;
 306         subcharset = (state->s1 >> (6*cont)) & 0x3F;
 307         /* The number of bytes-per-character in that sub-charset. */
 308         bytes = iso->nbytes[subcharset];
 309
 310         /*
 311          * If this character is now complete, we convert and emit
 312          * it. Otherwise, we simply update the state and return.
 313          */
 314         if (chrlen >= bytes) {
 315             emit(emitctx, iso->to_ucs(subcharset, chr));
 316             chr = chrlen = 0;
 317             /*
 318              * If we were in the SS2 or SS3 container, we
 319              * automatically exit it.
 320              */
 321             if (state->s1 & 0x60000000)
 322                 state->s1 &= 0x9FFFFFFF;
 323         }
 324         state->s0 = (state->s0 & 0xFF000000) | chr;
 325         state->s1 = (state->s1 & 0xF0FFFFFF) | (chrlen << 24);
 326     }
 327 }
 328
 329 static int write_iso2022s(charset_spec const *charset, long int input_chr,
 330                           charset_state *state,
 331                           void (*emit)(void *ctx, long int output),
 332                           void *emitctx)
 333 {
 334     struct iso2022 const *iso = (struct iso2022 *)charset->data;
 335     int subcharset, len, i, j, cont, topbit = 0;
 336     unsigned long bytes;
 337
 338     /*
 339      * For output, our s1 state variable contains most of the same
 340      * stuff as it did for input - initial-state indicator bit,
 341      * current container, and current subcharset selected in each
 342      * container.
 343      */
 344
 345     /*
 346      * Analyse the character and find out what subcharset it needs
 347      * to go in.
 348      */
 349     if (input_chr >= 0 && !iso->from_ucs(input_chr, &subcharset, &bytes))
 350         return FALSE;
 351
 352     if (!(state->s1 & 0x80000000)) {
 353         state->s1 = iso->s1;
 354         if (iso->initial_sequence)
 355             for (i = 0; iso->initial_sequence[i]; i++)
 356                 emit(emitctx, iso->initial_sequence[i]);
 357     }
 358
 359     if (input_chr == -1) {
 360         unsigned long oldstate;
 361         int k;
 362
 363         /*
 364          * Special case: reset encoding state.
 365          */
 366         for (i = 0; iso->reset[i]; i++) {
 367             j = iso->reset[i] - 1;
 368             oldstate = state->s1;
 369             state->s1 &= iso->escapes[j].andbits;
 370             state->s1 ^= iso->escapes[j].xorbits;
 371             if (state->s1 != oldstate) {
 372                 /* We must actually emit this sequence. */
 373                 for (k = 0; iso->escapes[j].sequence[k]; k++)
 374                     emit(emitctx, iso->escapes[j].sequence[k]);
 375             }
 376         }
 377
 378         return TRUE;
 379     }
 380
 381     /*
 382      * Now begins the fun. We now know what subcharset we want. So
 383      * we must find out which container we should select it into,
 384      * select it into it if necessary, select that _container_ if
 385      * necessary, and then output the given bytes.
 386      */
 387     for (i = 0; i < iso->nescapes; i++)
 388         if (iso->escapes[i].subcharset == subcharset &&
 389             !(iso->escapes[i].container & RO))
 390             break;
 391     assert(i < iso->nescapes);
 392
 393     /*
 394      * We've found the escape sequence which would select this
 395      * subcharset into a container. However, that subcharset might
 396      * already _be_ selected in that container! Check before we go
 397      * to the effort of emitting the sequence.
 398      */
 399     cont = iso->escapes[i].container &~ RO;
 400     if (((state->s1 >> (6*cont)) & 0x3F) != (unsigned)subcharset) {
 401         for (j = 0; iso->escapes[i].sequence[j]; j++)
 402             emit(emitctx, iso->escapes[i].sequence[j]);
 403         state->s1 &= iso->escapes[i].andbits;
 404         state->s1 ^= iso->escapes[i].xorbits;
 405     }
 406
 407     /*
 408      * Now we know what container our subcharset is in, so we want
 409      * to select that container.
 410      */
 411     if (cont > 1) {
 412         /* SS2 or SS3; just output the sequence and be done. */
 413         emit(emitctx, ESC);
 414         emit(emitctx, 'L' + cont);     /* comes out to 'N' or 'O' */
 415     } else {
 416         /*
 417          * Emit SI or SO, but only if the current container isn't already
 418          * the right one.
 419          *
 420          * Also, in an 8-bit subset, we need not do this; we'll
 421          * just use 8-bit characters to output SO-container
 422          * characters.
 423          */
 424         if (iso->eightbit && cont == 1 && ((state->s1 >> 28) & 7) == 0) {
 425             topbit = 0x80;
 426         } else if (((state->s1 >> 28) & 7) != (unsigned)cont) {
 427             emit(emitctx, cont ? SO : SI);
 428             state->s1 = (state->s1 & 0x8FFFFFFF) | (cont << 28);
 429         }
 430     }
 431
 432     /*
 433      * We're done. Subcharset is selected in container, container
 434      * is selected. All we need now is to write out the bytes.
 435      */
 436     len = iso->nbytes[subcharset];
 437     while (len--)
 438         emit(emitctx, ((bytes >> (8*len)) & 0xFF) | topbit);
 439
 440     return TRUE;
 441 }
 442
 443 /*
 444  * ISO-2022-JP, defined in RFC 1468.
 445  */
 446 static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes)
 447 {
 448     switch (subcharset) {
 449       case 1:                          /* JIS X 0201 bottom half */
 450         if (bytes == 0x5C)
 451             return 0xA5;
 452         else if (bytes == 0x7E)
 453             return 0x203E;
 454         /* else fall through to ASCII */
 455       case 0: return bytes;            /* one-byte ASCII */
 456         /* (no break needed since all control paths have returned) */
 457       case 2: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
 458                                          ((bytes     ) & 0xFF) - 0x21);
 459       default: return ERROR;
 460     }
 461 }
 462 static int iso2022jp_from_ucs(long int ucs, int *subcharset,
 463                               unsigned long *bytes)
 464 {
 465     int r, c;
 466     if (ucs < 0x80) {
 467         *subcharset = 0;
 468         *bytes = ucs;
 469         return 1;
 470     } else if (ucs == 0xA5 || ucs == 0x203E) {
 471         *subcharset = 1;
 472         *bytes = (ucs == 0xA5 ? 0x5C : 0x7E);
 473         return 1;
 474     } else if (unicode_to_jisx0208(ucs, &r, &c)) {
 475         *subcharset = 2;
 476         *bytes = ((r+0x21) << 8) | (c+0x21);
 477         return 1;
 478     } else {
 479         return 0;
 480     }
 481 }
 482 static const struct iso2022_escape iso2022jp_escapes[] = {
 483     {"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1},   /* we ignore this one */
 484     {"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2},
 485     {"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0},
 486     {"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1},
 487 };
 488 static const struct iso2022 iso2022jp = {
 489     iso2022jp_escapes, lenof(iso2022jp_escapes),
 490     "\1\1\2", "\3", 0x80000000, NULL, FALSE,
 491     iso2022jp_to_ucs, iso2022jp_from_ucs
 492 };
 493 const charset_spec charset_CS_ISO2022_JP = {
 494     CS_ISO2022_JP, read_iso2022s, write_iso2022s, &iso2022jp
 495 };
 496
 497 /*
 498  * ISO-2022-KR, defined in RFC 1557.
 499  */
 500 static long int iso2022kr_to_ucs(int subcharset, unsigned long bytes)
 501 {
 502     switch (subcharset) {
 503       case 0: return bytes;            /* one-byte ASCII */
 504       case 1: return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
 505                                         ((bytes     ) & 0xFF) - 0x21);
 506       default: return ERROR;
 507     }
 508 }
 509 static int iso2022kr_from_ucs(long int ucs, int *subcharset,
 510                               unsigned long *bytes)
 511 {
 512     int r, c;
 513     if (ucs < 0x80) {
 514         *subcharset = 0;
 515         *bytes = ucs;
 516         return 1;
 517     } else if (unicode_to_ksx1001(ucs, &r, &c)) {
 518         *subcharset = 1;
 519         *bytes = ((r+0x21) << 8) | (c+0x21);
 520         return 1;
 521     } else {
 522         return 0;
 523     }
 524 }
 525 static const struct iso2022_escape iso2022kr_escapes[] = {
 526     {"\016", 0x8FFFFFFF, 0x10000000, -1, -1},
 527     {"\017", 0x8FFFFFFF, 0x00000000, 0, 0},
 528     {"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1},   /* bits[11:6] <- 1 */
 529 };
 530 static const struct iso2022 iso2022kr = {
 531     iso2022kr_escapes, lenof(iso2022kr_escapes),
 532     "\1\2", "\2", 0x80000040, "\033$)C", FALSE,
 533     iso2022kr_to_ucs, iso2022kr_from_ucs
 534 };
 535 const charset_spec charset_CS_ISO2022_KR = {
 536     CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr
 537 };
 538
 539 #else /* ENUM_CHARSETS */
 540
/*
 * When ENUM_CHARSETS is defined, everything else in this file is
 * skipped and only the list of charsets it implements is seen.
 * NOTE(review): ENUM_CHARSET is defined by whoever includes this
 * file in that mode; presumably it builds a table of the available
 * charsets - confirm against the includer.
 */
ENUM_CHARSET(CS_ISO2022_JP)
ENUM_CHARSET(CS_ISO2022_KR)
 543
 544 #endif /* ENUM_CHARSETS */