mdw@git.distorted.org.uk Git - sgt/charset/blob - iso2022s.c

   1 /*
   2  * iso2022s.c - support for ISO-2022 subset encodings.
   3  *
   4  * (The `s' suffix on the filename is there to leave `iso2022.c'
   5  * free for the unlikely event that I ever attempt to implement
   6  * _full_ ISO-2022 in this library!)
   7  */
   8
   9 #ifndef ENUM_CHARSETS
  10
  11 #include <stdio.h>
  12 #include <string.h>
  13 #include <assert.h>
  14
  15 #include "charset.h"
  16 #include "internal.h"
  17
  18 #define SO (0x0E)
  19 #define SI (0x0F)
  20 #define ESC (0x1B)
  21
  22 /* Functional description of a single ISO 2022 escape sequence. */
  23 struct iso2022_escape {
  24     char const *sequence;
  25     unsigned long andbits, xorbits;
  26     /*
  27      * For output, these variables help us figure out which escape
  28      * sequences we need to get where we want to be.
  29      */
  30     int container, subcharset;
  31 };
  32
  33 struct iso2022 {
  34     /*
  35      * List of escape sequences supported in this subset. Must be
  36      * in ASCII order, so that we can narrow down the list as
  37      * necessary.
  38      */
  39     struct iso2022_escape *escapes;    /* must be sorted in ASCII order! */
  40     int nescapes;
  41
  42     /*
  43      * We assign indices from 0 upwards to the sub-charsets of a
  44      * given ISO 2022 subset. nbytes[i] tells us how many bytes per
  45      * character are required by sub-charset i. (It's a string
  46      * mainly because that makes it easier to declare in C syntax
  47      * than an int array.)
  48      */
  49     char const *nbytes;
  50
  51     /*
  52      * The characters in this string are indices-plus-one (so that
  53      * NUL can still terminate) of escape sequences in `escapes'.
  54      * These escapes are output in the given sequence to reset the
  55      * encoding state, unless it turns out that a given escape
  56      * would not change the state at all.
  57      */
  58     char const *reset;
  59
  60     /*
  61      * Initial value of s1, in case the default container contents
  62      * needs to be something other than charset 0 in all cases.
  63      * (Note that this must have the top bit set!)
  64      */
  65     unsigned long s1;
  66
  67     /*
  68      * For output, some ISO 2022 subsets _mandate_ an initial shift
  69      * sequence. If so, here it is so we can output it. (For the
  70      * sake of basic sanity we won't bother to _require_ it on
  71      * input, although it should of course be listed under
  72      * `escapes' above so that we ignore it when present.)
  73      */
  74     char const *initial_sequence;
  75
  76     /*
  77      * Function calls to do the actual translation.
  78      */
  79     long int (*to_ucs)(int subcharset, unsigned long bytes);
  80     int (*from_ucs)(long int ucs, int *subcharset, unsigned long *bytes);
  81 };
  82
  83 static void read_iso2022s(charset_spec const *charset, long int input_chr,
  84                           charset_state *state,
  85                           void (*emit)(void *ctx, long int output),
  86                           void *emitctx)
  87 {
  88     struct iso2022 const *iso = (struct iso2022 *)charset->data;
  89
  90     /*
  91      * For reading ISO-2022 subsets, we divide up our state
  92      * variables as follows:
  93      *
  94      *  - The top byte of s0 (bits 31:24) indicates, if nonzero,
  95      *    that we are part-way through a recognised ISO-2022 escape
  96      *    sequence. Five of those bits (31:27) give the index of
  97      *    the first member of the escapes list matching what we
  98      *    have so far; the remaining three (26:24) give the number
  99      *    of characters we have seen so far.
 100      *
 101      *  - The top bit of s1 (bit 31) is non-zero at all times, to
 102      *    indicate that we have performed any necessary
 103      *    initialisation. When we start, we detect a zero s1 and
 104      *    respond to it by initialising the default container
 105      *    contents.
 106      *
 107      *  - The next three bits of s1 (bits 30:28) indicate which
 108      *    _container_ is currently selected. This isn't quite as
 109      *    simple as it sounds, since we have to preserve memory of
 110      *    which of the SI/SO containers we came from when we're
 111      *    temporarily in SS2/SS3. Hence, what happens is:
 112      *     + bit 28 indicates SI/SO.
 113      *     + if we're in an SS2/SS3 container, that's indicated by
 114      *       the two bits above that being nonzero and holding
 115      *       either 2 or 3.
 116      *     + Hence: 0 is SI, 1 is SO, 4 is SS2-from-SI, 5 is
 117      *       SS2-from-SO, 6 is SS3-from-SI, 7 is SS3-from-SO.
 118      *
 119      *  - The next nibble of s1 (27:24) indicates how many bytes
 120      *    have been accumulated in the current character.
 121      *
 122      *  - The remaining three bytes of s1 are divided into four
 123      *    six-bit sections, and each section gives the current
 124      *    sub-charset selected in one of the possible containers.
 125      *    (Those containers are SI, SO, SS2 and SS3, respectively
 126      *    and in order from the bottom of s0 to the top.)
 127      *
 128      *  - The bottom 24 bits of s0 give the accumulated character
 129      *    data so far.
 130      *
 131      * (Note that this means s1 contains all the parts of the state
 132      * which might need to be operated on by escape sequences.
 133      * Cunning, eh?)
 134      */
 135
 136     if (!(state->s1 & 0x80000000)) {
 137         state->s1 = iso->s1;
 138     }
 139
 140     /*
 141      * So. Firstly, we process escape sequences, if we're in the
 142      * middle of one or if we see a possible introducer (SI, SO,
 143      * ESC).
 144      */
 145     if ((state->s0 >> 24) ||
 146         (input_chr == SO || input_chr == SI || input_chr == ESC)) {
 147         int n = (state->s0 >> 24) & 7, i = (state->s0 >> 27), oi = i, j;
 148
 149         /*
 150          * If this is the start of an escape sequence, we might be
 151          * in mid-character. If so, clear the character state and
 152          * emit an error token for the incomplete character.
 153          */
 154         if (state->s1 & 0x0F000000) {
 155             state->s1 &= ~0x0F000000;
 156             state->s0 &= 0xFF000000;
 157             /*
 158              * If we were in the SS2 or SS3 container, we
 159              * automatically exit it.
 160              */
 161             if (state->s1 & 0x60000000)
 162                 state->s1 &= 0x9FFFFFFF;
 163             emit(emitctx, ERROR);
 164         }
 165
 166         j = i;
 167         while (j < iso->nescapes &&
 168                !memcmp(iso->escapes[j].sequence,
 169                        iso->escapes[oi].sequence, n)) {
 170             if (iso->escapes[j].sequence[n] < input_chr)
 171                 i = ++j;
 172             else
 173                 break;
 174         }
 175         if (i >= iso->nescapes ||
 176             memcmp(iso->escapes[i].sequence,
 177                    iso->escapes[oi].sequence, n) ||
 178             iso->escapes[i].sequence[n] != input_chr) {
 179             /*
 180              * This character does not appear in any valid escape
 181              * sequence. Therefore, we must emit all the characters
 182              * we had previously swallowed, plus this one, and
 183              * return to non-escape-sequence state.
 184              */
 185             for (j = 0; j < n; j++)
 186                 emit(emitctx, iso->escapes[oi].sequence[j]);
 187             emit(emitctx, input_chr);
 188             state->s0 = 0;
 189             return;
 190         }
 191
 192         /*
 193          * Otherwise, we have found an additional character in our
 194          * escape sequence. See if we have reached the _end_ of our
 195          * sequence (and therefore must process the sequence).
 196          */
 197         n++;
 198         if (!iso->escapes[i].sequence[n]) {
 199             state->s0 = 0;
 200             state->s1 &= iso->escapes[i].andbits;
 201             state->s1 ^= iso->escapes[i].xorbits;
 202             return;
 203         }
 204
 205         /*
 206          * Failing _that_, we simply update our escape-sequence-
 207          * tracking state.
 208          */
 209         assert(i < 32 && n < 8);
 210         state->s0 = (i << 27) | (n << 24);
 211         return;
 212     }
 213
 214     /*
 215      * If this isn't an escape sequence, it must be part of a
 216      * character. One possibility is that it's a control character
 217      * (outside the space 21-7E), in which case we output it verbatim.
 218      */
 219     if (input_chr < 0x21 || input_chr > 0x7E) {
 220         /*
 221          * We might be in mid-multibyte-character. If so, clear the
 222          * character state and emit an error token for the
 223          * incomplete character.
 224          */
 225         if (state->s1 & 0x0F000000) {
 226             state->s1 &= ~0x0F000000;
 227             state->s0 &= 0xFF000000;
 228             emit(emitctx, ERROR);
 229             /*
 230              * If we were in the SS2 or SS3 container, we
 231              * automatically exit it.
 232              */
 233             if (state->s1 & 0x60000000)
 234                 state->s1 &= 0x9FFFFFFF;
 235         }
 236
 237         emit(emitctx, input_chr);
 238         return;
 239     }
 240
 241     /*
 242      * Otherwise, accumulate character data.
 243      */
 244     {
 245         unsigned long chr;
 246         int chrlen, cont, subcharset, bytes;
 247
 248         /* The current character and its length. */
 249         chr = ((state->s0 & 0x00FFFFFF) << 8) | input_chr;
 250         chrlen = ((state->s1 >> 24) & 0xF) + 1;
 251         /* The current sub-charset. */
 252         cont = (state->s1 >> 28) & 7;
 253         if (cont > 1) cont >>= 1;
 254         subcharset = (state->s1 >> (6*cont)) & 0x3F;
 255         /* The number of bytes-per-character in that sub-charset. */
 256         bytes = iso->nbytes[subcharset];
 257
 258         /*
 259          * If this character is now complete, we convert and emit
 260          * it. Otherwise, we simply update the state and return.
 261          */
 262         if (chrlen >= bytes) {
 263             emit(emitctx, iso->to_ucs(subcharset, chr));
 264             chr = chrlen = 0;
 265             /*
 266              * If we were in the SS2 or SS3 container, we
 267              * automatically exit it.
 268              */
 269             if (state->s1 & 0x60000000)
 270                 state->s1 &= 0x9FFFFFFF;
 271         }
 272         state->s0 = (state->s0 & 0xFF000000) | chr;
 273         state->s1 = (state->s1 & 0xF0FFFFFF) | (chrlen << 24);
 274     }
 275 }
 276
 277 static int write_iso2022s(charset_spec const *charset, long int input_chr,
 278                           charset_state *state,
 279                           void (*emit)(void *ctx, long int output),
 280                           void *emitctx)
 281 {
 282     struct iso2022 const *iso = (struct iso2022 *)charset->data;
 283     int subcharset, len, i, j, cont;
 284     unsigned long bytes;
 285
 286     /*
 287      * For output, our s1 state variable contains most of the same
 288      * stuff as it did for input - initial-state indicator bit,
 289      * current container, and current subcharset selected in each
 290      * container.
 291      */
 292
 293     /*
 294      * Analyse the character and find out what subcharset it needs
 295      * to go in.
 296      */
 297     if (input_chr >= 0 && !iso->from_ucs(input_chr, &subcharset, &bytes))
 298         return FALSE;
 299
 300     if (!(state->s1 & 0x80000000)) {
 301         state->s1 = iso->s1;
 302         if (iso->initial_sequence)
 303             for (i = 0; iso->initial_sequence[i]; i++)
 304                 emit(emitctx, iso->initial_sequence[i]);
 305     }
 306
 307     if (input_chr == -1) {
 308         unsigned long oldstate;
 309         int k;
 310
 311         /*
 312          * Special case: reset encoding state.
 313          */
 314         for (i = 0; iso->reset[i]; i++) {
 315             j = iso->reset[i] - 1;
 316             oldstate = state->s1;
 317             state->s1 &= iso->escapes[j].andbits;
 318             state->s1 ^= iso->escapes[j].xorbits;
 319             if (state->s1 != oldstate) {
 320                 /* We must actually emit this sequence. */
 321                 for (k = 0; iso->escapes[j].sequence[k]; k++)
 322                     emit(emitctx, iso->escapes[j].sequence[k]);
 323             }
 324         }
 325
 326         return TRUE;
 327     }
 328
 329     /*
 330      * Now begins the fun. We now know what subcharset we want. So
 331      * we must find out which container we should select it into,
 332      * select it into it if necessary, select that _container_ if
 333      * necessary, and then output the given bytes.
 334      */
 335     for (i = 0; i < iso->nescapes; i++)
 336         if (iso->escapes[i].subcharset == subcharset)
 337             break;
 338     assert(i < iso->nescapes);
 339
 340     /*
 341      * We've found the escape sequence which would select this
 342      * subcharset into a container. However, that subcharset might
 343      * already _be_ selected in that container! Check before we go
 344      * to the effort of emitting the sequence.
 345      */
 346     cont = iso->escapes[i].container;
 347     if (((state->s1 >> (6*cont)) & 0x3F) != (unsigned)subcharset) {
 348         for (j = 0; iso->escapes[i].sequence[j]; j++)
 349             emit(emitctx, iso->escapes[i].sequence[j]);
 350         state->s1 &= iso->escapes[i].andbits;
 351         state->s1 ^= iso->escapes[i].xorbits;
 352     }
 353
 354     /*
 355      * Now we know what container our subcharset is in, so we want
 356      * to select that container.
 357      */
 358     if (cont > 1) {
 359         /* SS2 or SS3; just output the sequence and be done. */
 360         emit(emitctx, ESC);
 361         emit(emitctx, 'L' + cont);     /* comes out to 'N' or 'O' */
 362     } else {
 363         /* Emit SI or SO, but only if the current container isn't already
 364          * the right one. */
 365         if (((state->s1 >> 28) & 7) != (unsigned)cont) {
 366             emit(emitctx, cont ? SO : SI);
 367             state->s1 = (state->s1 & 0x8FFFFFFF) | (cont << 28);
 368         }
 369     }
 370
 371     /*
 372      * We're done. Subcharset is selected in container, container
 373      * is selected. All we need now is to write out the bytes.
 374      */
 375     len = iso->nbytes[subcharset];
 376     while (len--)
 377         emit(emitctx, (bytes >> (8*len)) & 0xFF);
 378
 379     return TRUE;
 380 }
 381
 382 /*
 383  * ISO-2022-JP, defined in RFC 1468.
 384  */
 385 static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes)
 386 {
 387     switch (subcharset) {
 388       case 0: return bytes;            /* one-byte ASCII */
 389       case 1:                          /* JIS X 0201 half-width katakana */
 390         if (bytes >= 0x21 && bytes <= 0x5F)
 391             return bytes + (0xFF61 - 0x21);
 392         else
 393             return ERROR;
 394         /* (no break needed since all control paths have returned) */
 395       case 2: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
 396                                          ((bytes     ) & 0xFF) - 0x21);
 397       default: return ERROR;
 398     }
 399 }
 400 static int iso2022jp_from_ucs(long int ucs, int *subcharset,
 401                               unsigned long *bytes)
 402 {
 403     int r, c;
 404     if (ucs < 0x80) {
 405         *subcharset = 0;
 406         *bytes = ucs;
 407         return 1;
 408     } else if (ucs >= 0xFF61 && ucs <= 0xFF9F) {
 409         *subcharset = 1;
 410         *bytes = ucs - (0xFF61 - 0x21);
 411         return 1;
 412     } else if (unicode_to_jisx0208(ucs, &r, &c)) {
 413         *subcharset = 2;
 414         *bytes = ((r+0x21) << 8) | (c+0x21);
 415         return 1;
 416     } else {
 417         return 0;
 418     }
 419 }
 420 static struct iso2022_escape iso2022jp_escapes[] = {
 421     {"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1},   /* we ignore this one */
 422     {"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2},
 423     {"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0},
 424     {"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1},
 425 };
 426 static struct iso2022 iso2022jp = {
 427     iso2022jp_escapes, lenof(iso2022jp_escapes),
 428     "\1\1\2", "\3", 0x80000000, NULL, iso2022jp_to_ucs, iso2022jp_from_ucs
 429 };
 430 const charset_spec charset_CS_ISO2022_JP = {
 431     CS_ISO2022_JP, read_iso2022s, write_iso2022s, &iso2022jp
 432 };
 433
 434 /*
 435  * ISO-2022-KR, defined in RFC 1557.
 436  */
 437 static long int iso2022kr_to_ucs(int subcharset, unsigned long bytes)
 438 {
 439     switch (subcharset) {
 440       case 0: return bytes;            /* one-byte ASCII */
 441       case 1: return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
 442                                         ((bytes     ) & 0xFF) - 0x21);
 443       default: return ERROR;
 444     }
 445 }
 446 static int iso2022kr_from_ucs(long int ucs, int *subcharset,
 447                               unsigned long *bytes)
 448 {
 449     int r, c;
 450     if (ucs < 0x80) {
 451         *subcharset = 0;
 452         *bytes = ucs;
 453         return 1;
 454     } else if (unicode_to_ksx1001(ucs, &r, &c)) {
 455         *subcharset = 1;
 456         *bytes = ((r+0x21) << 8) | (c+0x21);
 457         return 1;
 458     } else {
 459         return 0;
 460     }
 461 }
 462 static struct iso2022_escape iso2022kr_escapes[] = {
 463     {"\016", 0x8FFFFFFF, 0x10000000, -1, -1},
 464     {"\017", 0x8FFFFFFF, 0x00000000, 0, 0},
 465     {"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1},   /* bits[11:6] <- 1 */
 466 };
 467 static struct iso2022 iso2022kr = {
 468     iso2022kr_escapes, lenof(iso2022kr_escapes),
 469     "\1\2", "\2", 0x80000040, "\033$)C", iso2022kr_to_ucs, iso2022kr_from_ucs
 470 };
 471 const charset_spec charset_CS_ISO2022_KR = {
 472     CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr
 473 };
 474
 475 #else /* ENUM_CHARSETS */
 476
/*
 * When ENUM_CHARSETS is defined, this file contributes nothing but
 * the list of charsets it implements; ENUM_CHARSET is presumably
 * defined by whoever includes the file in that mode.
 */
ENUM_CHARSET(CS_ISO2022_JP)
ENUM_CHARSET(CS_ISO2022_KR)
 479
 480 #endif /* ENUM_CHARSETS */