mdw@git.distorted.org.uk Git - sgt/charset/blob - iso2022.c

   1 /*
   2  * iso2022.c - support for ISO/IEC 2022 (alias ECMA-35).
   3  *
   4  * This isn't a complete implementation of ISO/IEC 2022, but it's
   5  * close.  It can decode 8-bit and 7-bit versions, with support for
   6  * single-byte and multi-byte character sets, all four containers
   7  * (G0, G1, G2, and G3), using both single-shift and locking-shift
   8  * sequences.
   9  *
  10  * The general principle is that any valid ISO/IEC 2022 sequence
  11  * should either be correctly decoded or should emit an ERROR.  The
  12  * only exception to this is that the C0 and C1 sets are fixed as
  13  * those of ISO/IEC 6429.  Escape sequences for designating control
  14  * sets are passed through, so a post-processor could fix them up if
  15  * necessary.
  16  *
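 * DOCS to UTF-8 works, and COMPOUND_TEXT extended segments (ESC % / 1
 * and ESC % / 2) are decoded for the few encodings listed in
 * ctext_encodings.  Other DOCS sequences are ignored, which will
 * produce surprising results.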
  19  */
  20
  21 #ifndef ENUM_CHARSETS
  22
  23 #include <assert.h>
  24 #include <string.h>
  25
  26 #include "charset.h"
  27 #include "internal.h"
  28 #include "sbcsdat.h"
  29
/* Control characters that read_iso2022() interprets itself. */
#define LS1 (0x0E)   /* locking shift 1: invoke G1 into GL */
#define LS0 (0x0F)   /* locking shift 0: invoke G0 into GL */
#define ESC (0x1B)   /* introduces an escape sequence */
#define SS2 (0x8E)   /* single shift 2: next character comes from G2 */
#define SS3 (0x8F)   /* single shift 3: next character comes from G3 */

/*
 * Character-set shapes: single-byte 94-set, single-byte 96-set,
 * multi-byte 94^n-set and multi-byte 96^n-set respectively.
 */
enum {S4, S6, M4, M6};
  37
/*
 * Forward declarations of converters defined later in this file; the
 * iso2022_subcharsets table refers to them before their definitions.
 * The *_to_unicode functions take a (row, column) pair, and the
 * unicode_to_* functions write their results through the int pointers.
 */
static long int emacs_big5_1_to_unicode(int, int);
static long int emacs_big5_2_to_unicode(int, int);
static int unicode_to_emacs_big5(long int, int *, int *, int *);
static long int cns11643_1_to_unicode(int, int);
static long int cns11643_2_to_unicode(int, int);
static long int cns11643_3_to_unicode(int, int);
static long int cns11643_4_to_unicode(int, int);
static long int cns11643_5_to_unicode(int, int);
static long int cns11643_6_to_unicode(int, int);
static long int cns11643_7_to_unicode(int, int);
static long int null_dbcs_to_unicode(int, int);
static int unicode_to_null_dbcs(long int, int *, int *);

/*
 * Encoder from Unicode to (row, column).  The implementations in this
 * file return zero when the character cannot be converted.
 */
typedef int (*to_dbcs_t)(long int, int *, int *);
/* As to_dbcs_t, but with an extra leading output giving the plane. */
typedef int (*to_dbcs_planar_t)(long int, int *, int *, int *);
  53
  54 /*
  55  * These macros cast between to_dbcs_planar_t and to_dbcs_t, in
  56  * such a way as to cause a compile-time error if the input is not
  57  * of the appropriate type.
  58  *
  59  * Defining these portably is quite fiddly. My first effort was as
  60  * follows:
  61  *   #define DEPLANARISE(x) ( (x) == (to_dbcs_planar_t)NULL, (to_dbcs_t)(x) )
  62  *
  63  * so that the comparison on the left of the comma provokes the
  64  * type check error, and the cast on the right is the actual
  65  * desired result.
  66  *
  67  * gcc was entirely happy with this. However, when used in a static
  68  * initialiser, MSVC objected - justifiably - that the first half
  69  * of the comma expression wasn't constant and thus the expression
  70  * as a whole was not a constant expression. We can get round this
  71  * by enclosing the comparison in `sizeof', so that it isn't
  72  * actually evaluated.
  73  *
  74  * But then we run into a second problem, which is that C actually
  75  * disallows the use of the comma operator within a constant
  76  * expression for any purpose at all! Presumably this is on the
  77  * basis that its purpose is to have side effects and constant
  78  * expressions can't; unfortunately, this specific case is one in
  79  * which the desired side effect is a compile-time rather than a
  80  * run-time one.
  81  *
  82  * We are permitted to use ?:, however, and that works quite well
  83  * since the actual result of the sizeof expression _is_ evaluable
  84  * at compile time. So here's my final answer, with the unfortunate
  85  * remaining problem of evaluating its arguments multiple times:
  86  */
/*
 * TYPECHECK(x,y) yields (y).  (x) appears only inside sizeof, so it is
 * type-checked by the compiler but never evaluated.
 */
#define TYPECHECK(x,y) ( sizeof((x)) == sizeof((x)) ? (y) : (y) )
/* Cast a to_dbcs_planar_t to to_dbcs_t, e.g. for storing in a table. */
#define DEPLANARISE(x) TYPECHECK((x) == (to_dbcs_planar_t)NULL, (to_dbcs_t)(x))
/* Cast a to_dbcs_t back to to_dbcs_planar_t before calling it. */
#define REPLANARISE(x) TYPECHECK((x) == (to_dbcs_t)NULL, (to_dbcs_planar_t)(x))
  90
  91 /*
  92  * Values used in the `enable' field. Each of these identifies a
  93  * class of character sets; we then have a bitmask indicating which
  94  * classes are allowable in a given mode.
  95  *
  96  * These values are currently only checked on output: for input,
  97  * any ISO 2022 we can comprehend at all is considered acceptable.
  98  */
/*
 * NOTE(review): these look like bit numbers within enable_mask rather
 * than mask values (3 and 5 are not single bits) -- confirm against
 * the output code that tests them.
 */
#define CCS 1                          /* CTEXT standard */
#define COS 2                          /* other standard */
#define CPU 3                          /* private use */
#define CDC 4                          /* DOCS for CTEXT */
#define CDU 5                          /* DOCS for UTF-8 */
#define CNU 31                         /* never used */
 105
/*
 * Per-charset configuration; read_iso2022() obtains it from
 * charset->data.
 */
struct iso2022_mode {
    int enable_mask;                   /* which `enable' classes are allowed */
    /*
     * (type, intermediate byte, final byte) of the sets designated at
     * start-up: l* goes into G0 and r* into G1.
     */
    char ltype, li, lf, rtype, ri, rf;
};
 110
 111 const struct iso2022_subcharset {
 112     char type, i, f, enable;
 113     int offset;
 114     const sbcs_data *sbcs_base;
 115     long int (*from_dbcs)(int, int);
 116
 117     /*
 118      * If to_dbcs_plane < 0, then to_dbcs is used as expected.
 119      * However, if to_dbcs_plane >= 0, then to_dbcs is expected to
 120      * be cast to a to_dbcs_planar_t before use, and the returned
 121      * plane value (the first int *) must equal to_dbcs_plane.
 122      *
 123      * I'd have preferred to do this by means of a union, but you
 124      * can't initialise a selected field of a union at compile
 125      * time. Function pointer casts are guaranteed to work sensibly
 126      * in ISO C (that is, it's undefined what happens if you call a
 127      * function via the wrong type of pointer, but if you cast it
 128      * back to the right type before calling it then it must work),
 129      * so this is safe if ugly.
 130      */
 131     to_dbcs_t to_dbcs;
 132     int to_dbcs_plane;                 /* use to_dbcs_planar iff >= 0 */
 133 } iso2022_subcharsets[] = {
 134     /*
 135      * We list these subcharsets in preference order for output.
 136      * Since the best-defined use of ISO 2022 output is compound
 137      * text, we'll use a preference order which matches that. So we
 138      * begin with the charsets defined in the compound text spec.
 139      */
 140     { S4, 0, 'B', CCS, 0x00, &sbcsdata_CS_ASCII },
 141     { S6, 0, 'A', CCS, 0x80, &sbcsdata_CS_ISO8859_1 },
 142     { S6, 0, 'B', CCS, 0x80, &sbcsdata_CS_ISO8859_2 },
 143     { S6, 0, 'C', CCS, 0x80, &sbcsdata_CS_ISO8859_3 },
 144     { S6, 0, 'D', CCS, 0x80, &sbcsdata_CS_ISO8859_4 },
 145     { S6, 0, 'F', CCS, 0x80, &sbcsdata_CS_ISO8859_7 },
 146     { S6, 0, 'G', CCS, 0x80, &sbcsdata_CS_ISO8859_6 },
 147     { S6, 0, 'H', CCS, 0x80, &sbcsdata_CS_ISO8859_8 },
 148     { S6, 0, 'L', CCS, 0x80, &sbcsdata_CS_ISO8859_5 },
 149     { S6, 0, 'M', CCS, 0x80, &sbcsdata_CS_ISO8859_9 },
 150     { S4, 0, 'I', CCS, 0x80, &sbcsdata_CS_JISX0201 },
 151     { S4, 0, 'J', CCS, 0x00, &sbcsdata_CS_JISX0201 },
 152     { M4, 0, 'A', CCS, -0x21, 0, &gb2312_to_unicode, &unicode_to_gb2312, -1 },
 153     { M4, 0, 'B', CCS, -0x21, 0, &jisx0208_to_unicode, &unicode_to_jisx0208, -1 },
 154     { M4, 0, 'C', CCS, -0x21, 0, &ksx1001_to_unicode, &unicode_to_ksx1001, -1 },
 155     { M4, 0, 'D', CCS, -0x21, 0, &jisx0212_to_unicode, &unicode_to_jisx0212, -1 },
 156
 157     /*
 158      * Next, other reasonably standard things: the rest of the ISO
 159      * 8859 sets, UK-ASCII, and CNS 11643.
 160      */
 161     { S6, 0, 'T', COS, 0x80, &sbcsdata_CS_ISO8859_11 },
 162     { S6, 0, 'V', COS, 0x80, &sbcsdata_CS_ISO8859_10 },
 163     { S6, 0, 'Y', COS, 0x80, &sbcsdata_CS_ISO8859_13 },
 164     { S6, 0, '_', COS, 0x80, &sbcsdata_CS_ISO8859_14 },
 165     { S6, 0, 'b', COS, 0x80, &sbcsdata_CS_ISO8859_15 },
 166     { S6, 0, 'f', COS, 0x80, &sbcsdata_CS_ISO8859_16 },
 167     { S4, 0, 'A', COS, 0x00, &sbcsdata_CS_BS4730 },
 168     { M4, 0, 'G', COS, -0x21, 0, &cns11643_1_to_unicode, DEPLANARISE(&unicode_to_cns11643), 0 },
 169     { M4, 0, 'H', COS, -0x21, 0, &cns11643_2_to_unicode, DEPLANARISE(&unicode_to_cns11643), 1 },
 170     { M4, 0, 'I', COS, -0x21, 0, &cns11643_3_to_unicode, DEPLANARISE(&unicode_to_cns11643), 2 },
 171     { M4, 0, 'J', COS, -0x21, 0, &cns11643_4_to_unicode, DEPLANARISE(&unicode_to_cns11643), 3 },
 172     { M4, 0, 'K', COS, -0x21, 0, &cns11643_5_to_unicode, DEPLANARISE(&unicode_to_cns11643), 4 },
 173     { M4, 0, 'L', COS, -0x21, 0, &cns11643_6_to_unicode, DEPLANARISE(&unicode_to_cns11643), 5 },
 174     { M4, 0, 'M', COS, -0x21, 0, &cns11643_7_to_unicode, DEPLANARISE(&unicode_to_cns11643), 6 },
 175
 176     /*
 177      * Private-use designations: DEC private sets and Emacs's Big5
 178      * abomination.
 179      */
 180     { S4, 0, '0', CPU, 0x00, &sbcsdata_CS_DEC_GRAPHICS },
 181     { S4, 0, '<', CPU, 0x80, &sbcsdata_CS_DEC_MCS },
 182     { M4, 0, '0', CPU, -0x21, 0, &emacs_big5_1_to_unicode, DEPLANARISE(&unicode_to_emacs_big5), 1 },
 183     { M4, 0, '1', CPU, -0x21, 0, &emacs_big5_2_to_unicode, DEPLANARISE(&unicode_to_emacs_big5), 2 },
 184
 185     /*
 186      * Ben left this conditioned out without explanation,
 187      * presumably on the grounds that we don't have a translation
 188      * table for it.
 189      */
 190 #if 0
 191     { M4, 0, '@', CNU }, /* JIS C 6226-1978 */
 192 #endif
 193
 194     /*
 195      * Finally, fallback entries for null character sets.
 196      */
 197     { S4, 0, '~', CNU },
 198     { S6, 0, '~', CNU }, /* empty 96-set */
 199     { M4, 0, '~', CNU, 0, 0, &null_dbcs_to_unicode, &unicode_to_null_dbcs, -1 }, /* empty 94^n-set */
 200     { M6, 0, '~', CNU, 0, 0, &null_dbcs_to_unicode, &unicode_to_null_dbcs, -1 }, /* empty 96^n-set */
 201 };
 202
 203 static long int null_dbcs_to_unicode(int r, int c)
 204 {
 205     UNUSEDARG(r);
 206     UNUSEDARG(c);
 207     return ERROR;
 208 }
 209 static int unicode_to_null_dbcs(long int unicode, int *r, int *c)
 210 {
 211     UNUSEDARG(unicode);
 212     UNUSEDARG(r);
 213     UNUSEDARG(c);
 214     return 0;                          /* failed to convert anything */
 215 }
 216
 217 /*
 218  * Emacs encodes Big5 in COMPOUND_TEXT as two 94x94 character sets.
 219  * We treat Big5 as a 94x191 character set with a bunch of undefined
 220  * columns in the middle, so we have to mess around a bit to make
 221  * things fit.
 222  */
 223
 224 static long int emacs_big5_1_to_unicode(int r, int c)
 225 {
 226     unsigned long s;
 227     s = r * 94 + c;
 228     r = s / 157;
 229     c = s % 157;
 230     if (c >= 64) c += 34; /* Skip over the gap */
 231     return big5_to_unicode(r, c);
 232 }
 233
 234 static long int emacs_big5_2_to_unicode(int r, int c)
 235 {
 236     unsigned long s;
 237     s = r * 94 + c;
 238     r = s / 157 + 40;
 239     c = s % 157;
 240     if (c >= 64) c += 34; /* Skip over the gap */
 241     return big5_to_unicode(r, c);
 242 }
 243
 244 static int unicode_to_emacs_big5(long int unicode, int *p, int *r, int *c)
 245 {
 246     int rr, cc, s;
 247     if (!unicode_to_big5(unicode, &rr, &cc))
 248         return 0;
 249     if (cc >= 64) {
 250         cc -= 34;
 251         assert(cc >= 64);
 252     }
 253     s = rr * 157 + cc;
 254     if (s >= 40*157) {
 255         *p = 2;
 256         s -= 40*157;
 257     } else {
 258         *p = 1;
 259     }
 260     *r = s / 94;
 261     *c = s % 94;
 262     return 1;
 263 }
 264
 265 /* Wrappers for cns11643_to_unicode() */
 266 static long int cns11643_1_to_unicode(int r, int c)
 267 {
 268     return cns11643_to_unicode(0, r, c);
 269 }
 270 static long int cns11643_2_to_unicode(int r, int c)
 271 {
 272     return cns11643_to_unicode(1, r, c);
 273 }
 274 static long int cns11643_3_to_unicode(int r, int c)
 275 {
 276     return cns11643_to_unicode(2, r, c);
 277 }
 278 static long int cns11643_4_to_unicode(int r, int c)
 279 {
 280     return cns11643_to_unicode(3, r, c);
 281 }
 282 static long int cns11643_5_to_unicode(int r, int c)
 283 {
 284     return cns11643_to_unicode(4, r, c);
 285 }
 286 static long int cns11643_6_to_unicode(int r, int c)
 287 {
 288     return cns11643_to_unicode(5, r, c);
 289 }
 290 static long int cns11643_7_to_unicode(int r, int c)
 291 {
 292     return cns11643_to_unicode(6, r, c);
 293 }
 294
 295 /* States, or "what we're currently accumulating". */
 296 enum {
 297     IDLE,       /* None of the below */
 298     SS2CHAR,    /* Accumulating a character after SS2 */
 299     SS3CHAR,    /* Accumulating a character after SS3 */
 300     ESCSEQ,     /* Accumulating an escape sequence */
 301     ESCDROP,    /* Discarding an escape sequence */
 302     ESCPASS,    /* Passing through an escape sequence */
 303     DOCSUTF8,   /* DOCSed into UTF-8 */
 304     DOCSCTEXT   /* DOCSed into a COMPOUND_TEXT extended segment */
 305 };
 306
 307 #if 0
 308 #include <stdio.h>
 309 static void dump_state(charset_state *s)
 310 {
 311     unsigned s0 = s->s0, s1 = s->s1;
 312     char const * const modes[] = { "IDLE", "SS2CHAR", "SS3CHAR",
 313                                    "ESCSEQ", "ESCDROP", "ESCPASS",
 314                                    "DOCSUTF8" };
 315
 316     fprintf(stderr, "s0: %s", modes[s0 >> 29]);
 317     fprintf(stderr, " %02x %02x %02x   ", (s0 >> 16) & 0xff, (s0 >> 8) & 0xff,
 318             s0 & 0xff);
 319     fprintf(stderr, "s1: LS%d LS%dR", (s1 >> 30) & 3, (s1 >> 28) & 3);
 320     fprintf(stderr, " %d %d %d %d\n", s1 & 0x7f, (s1 >> 7) & 0x7f,
 321             (s1 >> 14) & 0x7f, (s1 >> 21) & 0x7f);
 322 }
 323 #endif
 324
 325 static void designate(charset_state *state, int container,
 326                       int type, int ibyte, int fbyte)
 327 {
 328     unsigned long i;
 329
 330     assert(container >= 0 && container <= 3);
 331     assert(type == S4 || type == S6 || type == M4 || type == M6);
 332
 333     for (i = 0; i < lenof(iso2022_subcharsets); i++) {
 334         if (iso2022_subcharsets[i].type == type &&
 335             iso2022_subcharsets[i].i == ibyte &&
 336             iso2022_subcharsets[i].f == fbyte) {
 337             state->s1 &= ~(0x7fL << (container * 7));
 338             state->s1 |= (i << (container * 7));
 339             return;
 340         }
 341     }
 342     /*
 343      * If we don't find the charset, invoke the empty one, so we
 344      * output ERROR rather than garbage.
 345      */
 346     designate(state, container, type, 0, '~');
 347 }
 348
 349 static void do_utf8(long int input_chr,
 350                     charset_state *state,
 351                     void (*emit)(void *ctx, long int output),
 352                     void *emitctx)
 353 {
 354     charset_state ustate;
 355
 356     ustate.s1 = 0;
 357     ustate.s0 = state->s0 & 0x03ffffffL;
 358     read_utf8(NULL, input_chr, &ustate, emit, emitctx);
 359     state->s0 = (state->s0 & ~0x03ffffffL) | (ustate.s0 & 0x03ffffffL);
 360 }
 361
 362 static void docs_utf8(long int input_chr,
 363                       charset_state *state,
 364                       void (*emit)(void *ctx, long int output),
 365                       void *emitctx)
 366 {
 367     int retstate;
 368
 369     /*
 370      * Bits [25:0] of s0 are reserved for read_utf8().
 371      * Bits [27:26] are a tiny state machine to recognise ESC % @.
 372      */
 373     retstate = (state->s0 & 0x0c000000L) >> 26;
 374     if (retstate == 1 && input_chr == '%')
 375         retstate = 2;
 376     else if (retstate == 2 && input_chr == '@') {
 377         /* If we've got a partial UTF-8 sequence, complain. */
 378         if (state->s0 & 0x03ffffffL)
 379             emit(emitctx, ERROR);
 380         state->s0 = 0;
 381         return;
 382     } else {
 383         if (retstate >= 1) do_utf8(ESC, state, emit, emitctx);
 384         if (retstate >= 2) do_utf8('%', state, emit, emitctx);
 385         retstate = 0;
 386         if (input_chr == ESC)
 387             retstate = 1;
 388         else {
 389             do_utf8(input_chr, state, emit, emitctx);
 390         }
 391     }
 392     state->s0 = (state->s0 & ~0x0c000000L) | (retstate << 26);
 393 }
 394
 395 struct ctext_encoding {
 396     char const *name;
 397     char octets_per_char, enable;
 398     charset_spec const *subcs;
 399 };
 400
 401 /*
 402  * In theory, this list is in <ftp://ftp.x.org/pub/DOCS/registry>,
 403  * but XLib appears to have its own ideas, and encodes these three
 404  * (as of X11R6.8.2)
 405  */
 406
 407 extern charset_spec const charset_CS_ISO8859_14;
 408 extern charset_spec const charset_CS_ISO8859_15;
 409 extern charset_spec const charset_CS_BIG5;
 410
 411 static struct ctext_encoding const ctext_encodings[] = {
 412     { "big5-0\2", 0 /* variable */, CDC, &charset_CS_BIG5 },
 413     { "iso8859-14\2", 1, CDC, &charset_CS_ISO8859_14 },
 414     { "iso8859-15\2", 1, CDC, &charset_CS_ISO8859_15 }
 415 };
 416
 417 static void docs_ctext(long int input_chr,
 418                        charset_state *state,
 419                        void (*emit)(void *ctx, long int output),
 420                        void *emitctx)
 421 {
 422     /*
 423      * s0[27:26] = first entry in ctext_encodings that matches
 424      * s0[25:22] = number of characters successfully matched, 0xf if all
 425      * s0[21:8] count the number of octets left in the segment
 426      * s0[7:0] are for sub-charset use
 427      */
 428     int n = (state->s0 >> 22) & 0xf, i = (state->s0 >> 26) & 3, oi = i, j;
 429     int length = (state->s0 >> 8) & 0x3fff;
 430
 431     /*
 432      * Note that we do not bother checking the octets-per-character
 433      * byte against the selected charset when reading. It's
 434      * extremely unlikely that this code will ever have to deal
 435      * with two charset identifiers with the same name and
 436      * different octets-per-character values! If it ever happens,
 437      * we'll have to edit this file anyway so we can modify the
 438      * code then...
 439      */
 440
 441     if (!length) {
 442         /* Haven't read length yet */
 443         if ((state->s0 & 0xff) == 0)
 444             /* ... or even the first byte */
 445             state->s0 |= input_chr;
 446         else {
 447             length = (state->s0 & 0x7f) * 0x80 + (input_chr & 0x7f);
 448             if (length == 0)
 449                 state->s0 = 0;
 450             else
 451                 state->s0 = (state->s0 & 0xf0000000) | (length << 8);
 452         }
 453         return;
 454     }
 455
 456     j = i;
 457     if (n == 0xe) {
 458         /* Skipping unknown encoding.  Look out for STX. */
 459         if (input_chr == 2)
 460             state->s0 = (state->s0 & 0xf0000000) | (i << 26) | (0xf << 22);
 461     } else if (n != 0xf) {
 462         while ((unsigned)j < lenof(ctext_encodings) &&
 463                !memcmp(ctext_encodings[j].name,
 464                        ctext_encodings[oi].name, n)) {
 465             if (ctext_encodings[j].name[n] < input_chr)
 466                 i = ++j;
 467             else
 468                 break;
 469         }
 470         if ((unsigned)i >= lenof(ctext_encodings) ||
 471             memcmp(ctext_encodings[i].name,
 472                    ctext_encodings[oi].name, n) ||
 473             ctext_encodings[i].name[n] != input_chr) {
 474             /* Doom!  We haven't heard of this encoding */
 475             i = lenof(ctext_encodings);
 476             n = 0xe;
 477         } else {
 478             /*
 479              * Otherwise, we have found an additional character in our
 480              * encoding name. See if we have reached the _end_ of our
 481              * name.
 482              */
 483             n++;
 484             if (!ctext_encodings[i].name[n])
 485                 n = 0xf;
 486         }
 487         /*
 488          * Failing _that_, we simply update our encoding-name-
 489          * tracking state.
 490          */
 491         assert(i < 4 && n < 16);
 492         state->s0 = (state->s0 & 0xf0000000) | (i << 26) | (n << 22);
 493     } else {
 494         if ((unsigned)i >= lenof(ctext_encodings))
 495             emit(emitctx, ERROR);
 496         else {
 497             charset_state substate;
 498             charset_spec const *subcs = ctext_encodings[i].subcs;
 499             substate.s1 = 0;
 500             substate.s0 = state->s0 & 0xff;
 501             subcs->read(subcs, input_chr, &substate, emit, emitctx);
 502             state->s0 = (state->s0 & ~0xff) | (substate.s0 & 0xff);
 503         }
 504     }
 505     if (!--length)
 506         state->s0 = 0;
 507     else
 508         state->s0 = (state->s0 &~0x003fff00) | (length << 8);
 509 }
 510
 511 static void read_iso2022(charset_spec const *charset, long int input_chr,
 512                          charset_state *state,
 513                          void (*emit)(void *ctx, long int output),
 514                          void *emitctx)
 515 {
 516     struct iso2022_mode const *mode = (struct iso2022_mode *)charset->data;
 517
 518     /* dump_state(state); */
 519     /*
 520      * We have to make fairly efficient use of the 64 bits of state
 521      * available to us.  Long-term state goes in s1, and consists of
 522      * the identities of the character sets designated as G0/G1/G2/G3
 523      * and the locking-shift states for GL and GR.  Short-term state
 524      * goes in s0: The bottom half of s0 accumulates characters for an
 525      * escape sequence or a multi-byte character, while the top three
 526      * bits indicate what they're being accumulated for.  After DOCS,
 527      * the bottom 29 bits of state are available for the DOCS function
 528      * to use -- the UTF-8 one uses the bottom 26 for UTF-8 decoding
 529      * and the top two to recognised ESC % @.
 530      *
 531      * s0[31:29] = state enum
 532      * s0[24:0] = accumulated bytes
 533      * s1[31:30] = GL locking-shift state
 534      * s1[29:28] = GR locking-shift state
 535      * s1[27:21] = G3 charset
 536      * s1[20:14] = G2 charset
 537      * s1[13:7] = G1 charset
 538      * s1[6:0] = G0 charset
 539      */
 540
 541 #define LEFT 30
 542 #define RIGHT 28
 543 #define LOCKING_SHIFT(n,side) \
 544         (state->s1 = (state->s1 & ~(3UL<<(side))) | ((n ## UL)<<(side)))
 545 #define MODE ((state->s0 & 0xe0000000UL) >> 29)
 546 #define ENTER_MODE(m) (state->s0 = (state->s0 & ~0xe0000000UL) | ((unsigned long)(m)<<29))
 547 #define SINGLE_SHIFT(n) ENTER_MODE(SS2CHAR - 2 + (n))
 548 #define ASSERT_IDLE do {                                                \
 549         if (state->s0 != 0) emit(emitctx, ERROR);                       \
 550         state->s0 = 0;                                                  \
 551 } while (0)
 552
 553     if (state->s1 == 0) {
 554         /*
 555          * Since there's no LS0R, this means we must just have started.
 556          * Set up a sane initial state (LS0, LS1R, ASCII in G0/G1/G2/G3).
 557          */
 558         LOCKING_SHIFT(0, LEFT);
 559         LOCKING_SHIFT(1, RIGHT);
 560         designate(state, 0, mode->ltype, mode->li, mode->lf);
 561         designate(state, 1, mode->rtype, mode->ri, mode->rf);
 562         designate(state, 2, S4, 0, 'B');
 563         designate(state, 3, S4, 0, 'B');
 564     }
 565
 566     if (MODE == DOCSUTF8) {
 567         docs_utf8(input_chr, state, emit, emitctx);
 568         return;
 569     }
 570     if (MODE == DOCSCTEXT) {
 571         docs_ctext(input_chr, state, emit, emitctx);
 572         return;
 573     }
 574
 575     if ((input_chr & 0x60) == 0x00) {
 576         /* C0 or C1 control */
 577         ASSERT_IDLE;
 578         switch (input_chr) {
 579           case ESC:
 580             ENTER_MODE(ESCSEQ);
 581             break;
 582           case LS0:
 583             LOCKING_SHIFT(0, LEFT);
 584             break;
 585           case LS1:
 586             LOCKING_SHIFT(1, LEFT);
 587             break;
 588           case SS2:
 589             SINGLE_SHIFT(2);
 590             break;
 591           case SS3:
 592             SINGLE_SHIFT(3);
 593             break;
 594           default:
 595             emit(emitctx, input_chr);
 596             break;
 597         }
 598     } else if ((input_chr & 0x80) || MODE < ESCSEQ) {
 599         int is_gl = 0;
 600         struct iso2022_subcharset const *subcs;
 601         unsigned container;
 602         long input_7bit;
 603         /*
 604          * Actual data.
 605          * Force idle state if we're in mid escape sequence, or in a
 606          * multi-byte character with a different top bit.
 607          */
 608         if (MODE >= ESCSEQ ||
 609             ((state->s0 & 0x00ff0000L) != 0 &&
 610              (((state->s0 >> 16) ^ input_chr) & 0x80)))
 611             ASSERT_IDLE;
 612         if (MODE == SS2CHAR || MODE == SS3CHAR) /* Single-shift */
 613             container = MODE - SS2CHAR + 2;
 614         else if (input_chr >= 0x80) /* GR */
 615             container = (state->s1 >> 28) & 3;
 616         else { /* GL */
 617             container = state->s1 >> 30;
 618             is_gl = 1;
 619         }
 620         input_7bit = input_chr & ~0x80;
 621         subcs = &iso2022_subcharsets[(state->s1 >> (container * 7)) & 0x7f];
 622         if ((subcs->type == S4 || subcs->type == M4) &&
 623             (input_7bit == 0x20 || input_7bit == 0x7f)) {
 624             /* characters not in 94-char set */
 625             if (is_gl) emit(emitctx, input_7bit);
 626             else emit(emitctx, ERROR);
 627         } else if (subcs->type == M4 || subcs->type == M6) {
 628             if ((state->s0 & 0x00ff0000L) == 0) {
 629                 state->s0 |= input_chr << 16;
 630                 return;
 631             } else {
 632                 emit(emitctx,
 633                      subcs->from_dbcs(((state->s0 >> 16) & 0x7f) +
 634                                       subcs->offset,
 635                                       input_7bit + subcs->offset));
 636             }
 637         } else {
 638             if ((state->s0 & 0x00ff0000L) != 0)
 639                 emit(emitctx, ERROR);
 640             emit(emitctx, subcs->sbcs_base ?
 641                  sbcs_to_unicode(subcs->sbcs_base, input_7bit + subcs->offset):
 642                  ERROR);
 643         }
 644         state->s0 = 0;
 645     } else {
 646         unsigned i1, i2;
 647         if (MODE == ESCPASS) {
 648             emit(emitctx, input_chr);
 649             if ((input_chr & 0xf0) != 0x20)
 650                 ENTER_MODE(IDLE);
 651             return;
 652         }
 653
 654         /*
 655          * Intermediate bytes shall be any of the 16 positions of
 656          * column 02 of the code table; they are denoted by the symbol
 657          * I.
 658          */
 659         if ((input_chr & 0xf0) == 0x20) {
 660             if (((state->s0 >> 16) & 0xff) == 0)
 661                 state->s0 |= input_chr << 16;
 662             else if (((state->s0 >> 8) & 0xff) == 0)
 663                 state->s0 |= input_chr << 8;
 664             else {
 665                 /* Long escape sequence.  Switch to ESCPASS or ESCDROP. */
 666                 i1 = (state->s0 >> 16) & 0xff;
 667                 i2 = (state->s0 >> 8) & 0xff;
 668                 switch (i1) {
 669                   case '(': case ')': case '*': case '+':
 670                   case '-': case '.': case '/':
 671                   case '$':
 672                     ENTER_MODE(ESCDROP);
 673                     break;
 674                   default:
 675                     emit(emitctx, ESC);
 676                     emit(emitctx, i1);
 677                     emit(emitctx, i2);
 678                     emit(emitctx, input_chr);
 679                     state->s0 = 0;
 680                     ENTER_MODE(ESCPASS);
 681                     break;
 682                 }
 683             }
 684             return;
 685         }
 686
 687         /*
 688          * Final bytes shall be any of the 79 positions of columns 03
 689          * to 07 of the code table excluding position 07/15; they are
 690          * denoted by the symbol F.
 691          */
 692         i1 = (state->s0 >> 16) & 0xff;
 693         i2 = (state->s0 >> 8) & 0xff;
 694         if (MODE == ESCDROP)
 695             input_chr = 0; /* Make sure it won't match. */
 696         state->s0 = 0;
 697         switch (i1) {
 698           case 0: /* No intermediate bytes */
 699             switch (input_chr) {
 700               case 'N': /* SS2 */
 701                 SINGLE_SHIFT(2);
 702                 break;
 703               case 'O': /* SS3 */
 704                 SINGLE_SHIFT(3);
 705                 break;
 706               case 'n': /* LS2 */
 707                 LOCKING_SHIFT(2, LEFT);
 708                 break;
 709               case 'o': /* LS3 */
 710                 LOCKING_SHIFT(3, LEFT);
 711                 break;
 712               case '|': /* LS3R */
 713                 LOCKING_SHIFT(3, RIGHT);
 714                 break;
 715               case '}': /* LS2R */
 716                 LOCKING_SHIFT(2, RIGHT);
 717                 break;
 718               case '~': /* LS1R */
 719                 LOCKING_SHIFT(1, RIGHT);
 720                 break;
 721               default:
 722                 /* Unsupported escape sequence.  Spit it back out. */
 723                 emit(emitctx, ESC);
 724                 emit(emitctx, input_chr);
 725             }
 726             break;
 727           case ' ': /* ACS */
 728             /*
 729              * Various coding structure facilities specify that designating
 730              * a code element also invokes it.  As far as I can see, invoking
 731              * it now will have the same practical effect, since those
 732              * facilities also ban the use of locking shifts.
 733              */
 734             switch (input_chr) {
 735               case 'A': /* G0 element used and invoked into GL */
 736                 LOCKING_SHIFT(0, LEFT);
 737                 break;
 738               case 'C': /* G0 in GL, G1 in GR */
 739               case 'D': /* Ditto, at least for 8-bit codes */
 740               case 'L': /* ISO 4873 (ECMA-43) level 1 */
 741               case 'M': /* ISO 4873 (ECMA-43) level 2 */
 742                 LOCKING_SHIFT(0, LEFT);
 743                 LOCKING_SHIFT(1, RIGHT);
 744                 break;
 745             }
 746             break;
 747           case '&': /* IRR */
 748             /*
 749              * IRR (Identify Revised Registration) is ignored here,
 750              * since any revised registration must be
 751              * upward-compatible with the old one, so either we'll
 752              * support the new one or we'll emit ERROR when we run
 753              * into a new character.  In either case, there's nothing
 754              * to be done here.
 755              */
 756             break;
 757           case '(': /* GZD4 */  case ')': /* G1D4 */
 758           case '*': /* G2D4 */  case '+': /* G3D4 */
 759             designate(state, i1 - '(', S4, i2, input_chr);
 760             break;
 761           case '-': /* G1D6 */  case '.': /* G2D6 */  case '/': /* G3D6 */
 762             designate(state, i1 - ',', S6, i2, input_chr);
 763             break;
 764           case '$': /* G?DM? */
 765             switch (i2) {
 766               case 0: /* Obsolete version of GZDM4 */
 767                 i2 = '(';
 768               case '(': /* GZDM4 */  case ')': /* G1DM4 */
 769               case '*': /* G2DM4 */  case '+': /* G3DM4 */
 770                 designate(state, i2 - '(', M4, 0, input_chr);
 771                 break;
 772               case '-': /* G1DM6 */
 773               case '.': /* G2DM6 */  case '/': /* G3DM6 */
 774                 designate(state, i2 - ',', M6, 0, input_chr);
 775                 break;
 776               default:
 777                 emit(emitctx, ERROR);
 778                 break;
 779             }
 780           case '%': /* DOCS */
 781             /* XXX What's a reasonable way to handle an unrecognised DOCS? */
 782             switch (i2) {
 783               case 0:
 784                 switch (input_chr) {
 785                   case 'G':
 786                     ENTER_MODE(DOCSUTF8);
 787                     break;
 788                 }
 789                 break;
 790               case '/':
 791                 switch (input_chr) {
 792                   case '1': case '2':
 793                     ENTER_MODE(DOCSCTEXT);
 794                     break;
 795                 }
 796                 break;
 797             }
 798             break;
 799           default:
 800             /* Unsupported nF escape sequence.  Re-emit it. */
 801             emit(emitctx, ESC);
 802             emit(emitctx, i1);
 803             if (i2) emit(emitctx, i2);
 804             emit(emitctx, input_chr);
 805             break;
 806         }
 807     }
 808 }
 809
 810 static void oselect(charset_state *state, int i, int right,
 811                     void (*emit)(void *ctx, long int output),
 812                     void *emitctx)
 813 {
 814     int shift = (right ? 31-7 : 31-7-7);
 815     struct iso2022_subcharset const *subcs = &iso2022_subcharsets[i];
 816
 817     if (((state->s1 >> shift) & 0x7F) != (unsigned)i) {
 818         state->s1 &= ~(0x7FL << shift);
 819         state->s1 |= (i << shift);
 820
 821         if (emit) {
 822             emit(emitctx, ESC);
 823             if (subcs->type == M4 || subcs->type == M6)
 824                 emit(emitctx, '$');
 825             if (subcs->type == S6 || subcs->type == M6) {
 826                 assert(right);
 827                 emit(emitctx, '-');
 828             } else if (right) {
 829                 emit(emitctx, ')');
 830             } else {
 831                 emit(emitctx, '(');
 832             }
 833             if (subcs->i)
 834                 emit(emitctx, subcs->i);
 835             emit(emitctx, subcs->f);
 836         }
 837     }
 838 }
 839
 840 static void docs_char(charset_state *state,
 841                       void (*emit)(void *ctx, long int output),
 842                       void *emitctx, int cset, char *data, int datalen)
 843 {
 844     int curr_cset, currlen, i;
 845
 846     /*
 847      * cset is the index into ctext_encodings[]. It can also be -1
 848      * to mean DOCS UTF-8, or -2 to mean no DOCS (ordinary 2022).
 849      * In the latter case, `chr' is ignored.
 850      */
 851
 852     /*
 853      * First, terminate a DOCS segment if necessary. We always have
 854      * to terminate a DOCS segment if one is active and we're about
 855      * to switch to a different one; we might also have to
 856      * terminate a length-encoded DOCS segment if we've run out of
 857      * storage space to accumulate characters in it.
 858      */
 859     curr_cset = ((state->s1 >> 14) & 7) - 2;
 860     currlen = ((state->s1 >> 11) & 7);
 861     if ((curr_cset != -2 && curr_cset != cset) ||
 862         (curr_cset >= 0 && currlen + datalen > 5)) {
 863         if (curr_cset == -1) {
 864             /*
 865              * Terminating DOCS UTF-8 is easy.
 866              */
 867             emit(emitctx, ESC);
 868             emit(emitctx, '%');
 869             emit(emitctx, '@');
 870         } else {
 871             int len;
 872
 873             /*
 874              * To terminate a length-encoded DOCS segment we must
 875              * actually output the whole thing.
 876              */
 877             emit(emitctx, ESC);
 878             emit(emitctx, '%');
 879             emit(emitctx, '/');
 880             emit(emitctx, '0' + ctext_encodings[curr_cset].octets_per_char);
 881             len = currlen + datalen +
 882                 strlen(ctext_encodings[curr_cset].name);
 883             assert(len < (1 << 14));
 884             emit(emitctx, 0x80 | ((len >> 7) & 0x7F));
 885             emit(emitctx, 0x80 | ((len     ) & 0x7F));
 886             /* The name stored in ctext_encodings[] includes the trailing \2 */
 887             for (i = 0; ctext_encodings[curr_cset].name[i]; i++)
 888                 emit(emitctx, ctext_encodings[curr_cset].name[i]);
 889             for (i = 0; i < currlen; i++)
 890                 emit(emitctx,
 891                      (i == 0 ? state->s1 : state->s0 >> (8*(4-i))) & 0xFF);
 892             for (i = 0; i < datalen; i++)
 893                 emit(emitctx, data[i]);
 894
 895             /*
 896              * We've now dealt with the input data, so clear it so
 897              * we don't try to do so again below.
 898              */
 899             datalen = 0;
 900         }
 901         curr_cset = -2;
 902     }
 903
 904     /*
 905      * Now, start a DOCS segment if necessary.
 906      */
 907     if (curr_cset != cset) {
 908         assert(cset != -2);
 909         if (cset == -1) {
 910             /*
 911              * Start DOCS UTF-8.
 912              */
 913             emit(emitctx, ESC);
 914             emit(emitctx, '%');
 915             emit(emitctx, 'G');
 916         } else {
 917             /*
 918              * Starting a length-encoded DOCS segment is simply a
 919              * matter of setting our stored length counter to zero.
 920              */
 921             currlen = 0;
 922             state->s1 &= ~(7 << 11);
 923             state->s1 &= ~0xFF;
 924             state->s0 = 0;
 925         }
 926     }
 927     state->s1 &= ~(7 << 14);
 928     assert((cset+2) >= 0 && (cset+2) < 8);
 929     state->s1 |= ((cset+2) << 14);
 930
 931     /*
 932      * Now we're in the right DOCS state. Actually deal with the
 933      * input data, if we haven't already done so above.
 934      */
 935     if (datalen > 0) {
 936         assert(cset != 2);
 937         if (cset == -1) {
 938             /*
 939              * In DOCS UTF-8, we output data as soon as we get it.
 940              */
 941             for (i = 0; i < datalen; i++)
 942                 emit(emitctx, data[i]);
 943         } else {
 944             /*
 945              * In length-encoded DOCS, we just store our data and
 946              * bide our time. It'll all be output when we fill up
 947              * or switch to another character set.
 948              */
 949             assert(currlen + datalen <= 5);   /* overflow handled already */
 950             for (i = 0; i < datalen; i++) {
 951                 if (currlen + i == 0)
 952                     state->s1 |= data[i] & 0xFF;
 953                 else
 954                     state->s0 |= (data[i] & 0xFF) << (8*(4-(currlen+i)));
 955             }
 956             currlen += datalen;
 957             assert(currlen >= 0 && currlen < 8);
 958             state->s1 &= ~(7 << 11);
 959             state->s1 |= (currlen << 11);
 960         }
 961     }
 962 }
 963
 964 static void write_to_pointer(void *ctx, long int output)
 965 {
 966     char **ptr = (char **)ctx;
 967     *(*ptr)++ = output;
 968 }
 969
 970 /*
 971  * Writing full ISO-2022 is not useful in very many circumstances.
 972  * One of the few situations in which it _is_ useful is generating
 973  * X11 COMPOUND_TEXT; therefore, this writing function will obey
 974  * the compound text restrictions and hence output the subset of
 975  * ISO-2022 that's usable in that context.
 976  *
 977  * The subset in question is roughly that we use GL/GR for G0/G1
 978  * always, and that the _only_ escape sequences we output (other
 979  * than the occasional DOCS) are those which designate different
 980  * subcharsets into G0 and G1. There are additional constraints
 981  * about which things go in which container; see below.
 982  *
 983  * FIXME: this wants some decent tests to be written, and also the
 984  * exact output policy for compound text wants thinking about more
 985  * carefully.
 986  */
 987 static int write_iso2022(charset_spec const *charset, long int input_chr,
 988                          charset_state *state,
 989                          void (*emit)(void *ctx, long int output),
 990                          void *emitctx)
 991 {
 992     int i;
 993     struct iso2022_subcharset const *subcs;
 994     struct iso2022_mode const *mode = (struct iso2022_mode *)charset->data;
 995     to_dbcs_planar_t last_planar_dbcs = NULL;
 996     int last_p, last_r, last_c;
 997     long int c1, c2;
 998
 999     /*
1000      * For output, I allocate the state variables as follows:
1001      *
1002      *  s1[31] == 1 if output state has been initialised
1003      *  s1[30:24] == G1 charset (always in GR)
1004      *  s1[23:17] == G0 charset (always in GL)
1005      *  s1[16:14] == DOCS index plus 2 (because -1 and -2 are special)
1006      *  s1[13:11] == number of DOCS accumulated characters (up to five)
1007      *  s1[7:0] + s0[31:0] == DOCS collected characters
1008      */
1009
1010     if (!state->s1) {
1011         state->s0 = 0x00000000UL;
1012         state->s1 = 0x80000000UL;
1013         /*
1014          * Start with US-ASCII in GL and also in GR.
1015          */
1016         for (i = 0; (unsigned)i < lenof(iso2022_subcharsets); i++) {
1017             subcs = &iso2022_subcharsets[i];
1018             if (subcs->type == mode->ltype &&
1019                 subcs->i == mode->li &&
1020                 subcs->f == mode->lf)
1021                 oselect(state, i, FALSE, NULL, NULL);
1022             if (subcs->type == mode->rtype &&
1023                 subcs->i == mode->ri &&
1024                 subcs->f == mode->rf)
1025                 oselect(state, i, TRUE, NULL, NULL);
1026         }
1027     }
1028
1029     if (input_chr == -1) {
1030         /*
1031          * Special case: reset encoding state.
1032          */
1033         docs_char(state, emit, emitctx, -2, NULL, 0);   /* leave DOCS */
1034
1035         for (i = 0; (unsigned)i < lenof(iso2022_subcharsets); i++) {
1036             subcs = &iso2022_subcharsets[i];
1037             if (subcs->type == mode->ltype &&
1038                 subcs->i == mode->li &&
1039                 subcs->f == mode->lf)
1040                 oselect(state, i, FALSE, emit, emitctx);
1041             if (subcs->type == mode->rtype &&
1042                 subcs->i == mode->ri &&
1043                 subcs->f == mode->rf)
1044                 oselect(state, i, TRUE, emit, emitctx);
1045         }
1046         return TRUE;
1047     }
1048
1049     /*
1050      * Special-case characters: Space, Delete, and anything in C0
1051      * or C1 are output unchanged.
1052      */
1053     if (input_chr <= 0x20 || (input_chr >= 0x7F && input_chr < 0xA0)) {
1054         emit(emitctx, input_chr);
1055         return TRUE;
1056     }
1057
1058     /*
1059      * Analyse the input character and work out which subcharset it
1060      * belongs to.
1061      */
1062     for (i = 0; (unsigned)i < lenof(iso2022_subcharsets); i++) {
1063         subcs = &iso2022_subcharsets[i];
1064         if (!(mode->enable_mask & (1 << subcs->enable)))
1065             continue;                  /* this charset is disabled */
1066         if (subcs->sbcs_base) {
1067             c1 = sbcs_from_unicode(subcs->sbcs_base, input_chr);
1068             c1 -= subcs->offset;
1069             if (c1 >= 0x20 && c1 <= 0x7f) {
1070                 c2 = 0;
1071                 break;
1072             }
1073         } else if (subcs->to_dbcs) {
1074             if (subcs->to_dbcs_plane >= 0) {
1075                 /*
1076                  * Since multiplanar DBCSes almost by definition
1077                  * involve several entries in iso2022_subcharsets
1078                  * with the same to_dbcs function and different
1079                  * plane values, we remember the last such function
1080                  * we called and what its result was, so that we
1081                  * don't (for example) have to call
1082                  * unicode_to_cns11643 seven times.
1083                  */
1084                 if (last_planar_dbcs != REPLANARISE(subcs->to_dbcs)) {
1085                     last_planar_dbcs = REPLANARISE(subcs->to_dbcs);
1086                     if (!last_planar_dbcs(input_chr,
1087                                           &last_p, &last_r, &last_c))
1088                         last_p = -1;
1089                 }
1090             } else {
1091                 last_p = subcs->to_dbcs_plane;
1092                 if (!subcs->to_dbcs(input_chr, &last_r, &last_c))
1093                     last_p = 0;        /* cannot match since to_dbcs_plane<0 */
1094             }
1095
1096             if (last_p == subcs->to_dbcs_plane) {
1097                 c1 = last_r - subcs->offset;
1098                 c2 = last_c - subcs->offset;
1099                 assert(c1 >= 0x20 && c1 <= 0x7f);
1100                 assert(c2 >= 0x20 && c2 <= 0x7f);
1101                 break;
1102             }
1103         }
1104     }
1105
1106     if ((unsigned)i < lenof(iso2022_subcharsets)) {
1107         int right;
1108
1109         /*
1110          * Our character is represented by c1 (and possibly also
1111          * c2) in subcharset `subcs'. So now we must decide whether
1112          * to designate that character set into G0/GL or G1/GR.
1113          *
1114          * Any S6 or M6 subcharset has to go in GR because it won't
1115          * fit in GL. In addition, the compound text rules state
1116          * that any single-byte subcharset defined as the
1117          * right-hand half of some SBCS must go in GR.
1118          *
1119          * M4 subcharsets can go in either half according to the
1120          * rules. I choose to put them in GR always because it's a
1121          * simple policy with reasonable behaviour (facilitates
1122          * switching between them and ASCII).
1123          */
1124         right = (subcs->type == S6 || subcs->type == M6 || subcs->type == M4 ||
1125                  (subcs->sbcs_base && subcs->offset == 0x80));
1126
1127         /*
1128          * If we're in a DOCS mode, leave it.
1129          */
1130         docs_char(state, emit, emitctx, -2, NULL, 0);
1131
1132         /*
1133          * If this subcharset is not already selected in that
1134          * container, select it.
1135          */
1136         oselect(state, i, right, emit, emitctx);
1137
1138         /*
1139          * Now emit the actual characters.
1140          */
1141         if (right) {
1142             assert(c1 >= 0x20 && c1 <= 0x7f);
1143             emit(emitctx, c1 | 0x80);
1144             if (c2) {
1145                 assert(c2 >= 0x20 && c2 <= 0x7f);
1146                 emit(emitctx, c2 | 0x80);
1147             }
1148         } else {
1149             assert(c1 > 0x20 && c1 < 0x7f);
1150             emit(emitctx, c1);
1151             if (c2) {
1152                 assert(c2 > 0x20 && c2 < 0x7f);
1153                 emit(emitctx, c2);
1154             }
1155         }
1156
1157         return TRUE;
1158     }
1159
1160     /*
1161      * Fall back to DOCS.
1162      */
1163     {
1164         char data[10];
1165         char *p = data;
1166         int i, cs;
1167
1168         cs = -2;                       /* means failure */
1169
1170         for (i = 0; (unsigned)i <= lenof(ctext_encodings); i++) {
1171             charset_state substate;
1172             charset_spec const *subcs = ctext_encodings[i].subcs;
1173
1174             /*
1175              * We assume that all character sets dealt with by DOCS
1176              * are stateless for output purposes.
1177              */
1178             substate.s1 = substate.s0 = 0;
1179             p = data;
1180
1181             if ((unsigned)i < lenof(ctext_encodings)) {
1182                 if ((mode->enable_mask & (1 << ctext_encodings[i].enable)) &&
1183                     subcs->write(subcs, input_chr, &substate,
1184                                  write_to_pointer, &p)) {
1185                     cs = i;
1186                     break;
1187                 }
1188             } else {
1189                 if ((mode->enable_mask & (1 << CDU)) &&
1190                     write_utf8(NULL, input_chr, NULL, write_to_pointer, &p)) {
1191                     cs = -1;
1192                     break;
1193                 }
1194             }
1195         }
1196
1197         if (cs != -2) {
1198             docs_char(state, emit, emitctx, cs, data, p - data);
1199             return TRUE;
1200         }
1201     }
1202
1203     return FALSE;
1204 }
1205
1206 /*
1207  * Full ISO 2022 output with all options on. Not entirely sure what
1208  * if anything this is useful for, but here it is anyway. All
1209  * output character sets and DOCS variants are permitted; all
1210  * containers start out with ASCII in them.
1211  */
1212 static const struct iso2022_mode iso2022_all = {
1213     (1<<CCS) | (1<<COS) | (1<<CPU) | (1<<CDC) | (1<<CDU),
1214     S4, 0, 'B', S4, 0, 'B',
1215 };
1216
1217 const charset_spec charset_CS_ISO2022 = {
1218     CS_ISO2022, read_iso2022, write_iso2022, &iso2022_all
1219 };
1220
1221 /*
1222  * X11 compound text. A subset of output charsets is permitted, and
1223  * G1/GR starts off in ISO8859-1.
1224  */
1225 static const struct iso2022_mode iso2022_ctext = {
1226     (1<<CCS) | (1<<CDC),
1227     S4, 0, 'B', S6, 0, 'A',
1228 };
1229
1230 const charset_spec charset_CS_CTEXT = {
1231     CS_CTEXT, read_iso2022, write_iso2022, &iso2022_ctext
1232 };
1233
1234 #ifdef TESTMODE
1235
1236 #include <stdio.h>
1237 #include <stdarg.h>
1238 #include <string.h>
1239
1240 int total_errs = 0;
1241
/*
 * Emit function for the read tests: `ctx' is the address of a
 * `wchar_t *' cursor. The output value is stored at the cursor,
 * which is then advanced by one element. No bounds checking is done.
 */
void iso2022_emit(void *ctx, long output)
{
    wchar_t **cursor = ctx;

    **cursor = output;
    ++*cursor;
}
1247
1248 void iso2022_read_test(int line, char *input, int inlen, ...)
1249 {
1250     va_list ap;
1251     wchar_t *p, str[512];
1252     int i;
1253     charset_state state;
1254     unsigned long l;
1255
1256     state.s0 = state.s1 = 0;
1257     p = str;
1258
1259     for (i = 0; i < inlen; i++)
1260         read_iso2022(NULL, input[i] & 0xFF, &state, iso2022_emit, &p);
1261
1262     va_start(ap, inlen);
1263     l = 0;
1264     for (i = 0; i < p - str; i++) {
1265         l = va_arg(ap, long int);
1266         if (l == -1) {
1267             printf("%d: correct string shorter than output\n", line);
1268             total_errs++;
1269             break;
1270         }
1271         if (l != str[i]) {
1272             printf("%d: char %d came out as %08x, should be %08lx\n",
1273                     line, i, str[i], l);
1274             total_errs++;
1275         }
1276     }
1277     if (l != -1) {
1278         l = va_arg(ap, long int);
1279         if (l != -1) {
1280             printf("%d: correct string longer than output\n", line);
1281             total_errs++;
1282         }
1283     }
1284     va_end(ap);
1285 }
1286
1287 /* Macro to concoct the first three parameters of iso2022_read_test. */
1288 #define TESTSTR(x) __LINE__, x, lenof(x)
1289
1290 int main(void)
1291 {
1292     printf("read tests beginning\n");
1293     /* Simple test (Emacs sample text for Japanese, in ISO-2022-JP) */
1294     iso2022_read_test(TESTSTR("Japanese (\x1b$BF|K\\8l\x1b(B)\t"
1295                               "\x1b$B$3$s$K$A$O\x1b(B, "
1296                               "\x1b$B%3%s%K%A%O\x1b(B\n"),
1297                       'J','a','p','a','n','e','s','e',' ','(',
1298                       0x65E5, 0x672C, 0x8A9E, ')', '\t',
1299                       0x3053, 0x3093, 0x306b, 0x3061, 0x306f, ',', ' ',
1300                       0x30b3, 0x30f3, 0x30cb, 0x30c1, 0x30cf, '\n', 0, -1);
1301     /* Same thing in EUC-JP (with designations, and half-width katakana) */
1302     iso2022_read_test(TESTSTR("\x1b$)B\x1b*I\x1b$+D"
1303                               "Japanese (\xc6\xfc\xcb\xdc\xb8\xec)\t"
1304                               "\xa4\xb3\xa4\xf3\xa4\xcb\xa4\xc1\xa4\xcf, "
1305                               "\x8e\xba\x8e\xdd\x8e\xc6\x8e\xc1\x8e\xca\n"),
1306                       'J','a','p','a','n','e','s','e',' ','(',
1307                       0x65E5, 0x672C, 0x8A9E, ')', '\t',
1308                       0x3053, 0x3093, 0x306b, 0x3061, 0x306f, ',', ' ',
1309                       0xff7a, 0xff9d, 0xff86, 0xff81, 0xff8a, '\n', 0, -1);
1310     /* Multibyte single-shift */
1311     iso2022_read_test(TESTSTR("\x1b$)B\x1b*I\x1b$+D\x8f\"/!"),
1312                       0x02D8, '!', 0, -1);
1313     /* Non-existent SBCS */
1314     iso2022_read_test(TESTSTR("\x1b(!Zfnord\n"),
1315                       ERROR, ERROR, ERROR, ERROR, ERROR, '\n', 0, -1);
1316     /* Pass-through of ordinary escape sequences, including a long one */
1317     iso2022_read_test(TESTSTR("\x1b""b\x1b#5\x1b#!!!5"),
1318                       0x1B, 'b', 0x1B, '#', '5',
1319                       0x1B, '#', '!', '!', '!', '5', 0, -1);
1320     /* Non-existent DBCS (also 5-byte escape sequence) */
1321     iso2022_read_test(TESTSTR("\x1b$(!Bfnord!"),
1322                       ERROR, ERROR, ERROR, 0, -1);
1323     /* Incomplete DB characters */
1324     iso2022_read_test(TESTSTR("\x1b$B(,(\x1b(BHi\x1b$B(,(\n"),
1325                       0x2501, ERROR, 'H', 'i', 0x2501, ERROR, '\n', 0, -1);
1326     iso2022_read_test(TESTSTR("\x1b$)B\x1b*I\x1b$+D\xa4""B"),
1327                       ERROR, 'B', 0, -1);
1328     iso2022_read_test(TESTSTR("\x1b$)B\x1b*I\x1b$+D\x0e\x1b|$\xa2\xaf"),
1329                       ERROR, 0x02D8, 0, -1);
1330     /* Incomplete escape sequence */
1331     iso2022_read_test(TESTSTR("\x1b\n"), ERROR, '\n', 0, -1);
1332     iso2022_read_test(TESTSTR("\x1b-A\x1b~\x1b\xa1"), ERROR, 0xa1, 0, -1);
1333     /* Incomplete single-shift */
1334     iso2022_read_test(TESTSTR("\x8e\n"), ERROR, '\n', 0, -1);
1335     iso2022_read_test(TESTSTR("\x1b$*B\x8e(\n"), ERROR, '\n', 0, -1);
1336     /* Corner cases (02/00 and 07/15) */
1337     iso2022_read_test(TESTSTR("\x1b(B\x20\x7f"), 0x20, 0x7f, 0, -1);
1338     iso2022_read_test(TESTSTR("\x1b(I\x20\x7f"), 0x20, 0x7f, 0, -1);
1339     iso2022_read_test(TESTSTR("\x1b$B\x20\x7f"), 0x20, 0x7f, 0, -1);
1340     iso2022_read_test(TESTSTR("\x1b-A\x0e\x20\x7f"), 0xa0, 0xff, 0, -1);
1341     iso2022_read_test(TESTSTR("\x1b$-~\x0e\x20\x7f"), ERROR, 0, -1);
1342     iso2022_read_test(TESTSTR("\x1b)B\xa0\xff"), ERROR, ERROR, 0, -1);
1343     iso2022_read_test(TESTSTR("\x1b)I\xa0\xff"), ERROR, ERROR, 0, -1);
1344     iso2022_read_test(TESTSTR("\x1b$)B\xa0\xff"), ERROR, ERROR, 0, -1);
1345     iso2022_read_test(TESTSTR("\x1b-A\x1b~\xa0\xff"), 0xa0, 0xff, 0, -1);
1346     iso2022_read_test(TESTSTR("\x1b$-~\x1b~\xa0\xff"), ERROR, 0, -1);
1347     /* Designate control sets */
1348     iso2022_read_test(TESTSTR("\x1b!@"), 0x1b, '!', '@', 0, -1);
1349     /* Designate other coding system (UTF-8) */
1350     iso2022_read_test(TESTSTR("\x1b%G"
1351                               "\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5"),
1352                       0x03BA, 0x1F79, 0x03C3, 0x03BC, 0x03B5, 0, -1);
1353     iso2022_read_test(TESTSTR("\x1b-A\x1b%G\xCE\xBA\x1b%@\xa0"),
1354                       0x03BA, 0xA0, 0, -1);
1355     iso2022_read_test(TESTSTR("\x1b%G\xCE\x1b%@"), ERROR, 0, -1);
1356     iso2022_read_test(TESTSTR("\x1b%G\xCE\xBA\x1b%\x1b%@"),
1357                       0x03BA, 0x1B, '%', 0, -1);
1358     /* DOCS (COMPOUND_TEXT extended segment) */
1359     iso2022_read_test(TESTSTR("\x1b%/1\x80\x80"), 0, -1);
1360     iso2022_read_test(TESTSTR("\x1b%/1\x80\x8fiso-8859-15\2xyz\x1b(B"),
1361                       ERROR, ERROR, ERROR, 0, -1);
1362     iso2022_read_test(TESTSTR("\x1b%/1\x80\x8eiso8859-15\2xyz\x1b(B"),
1363                       'x', 'y', 'z', 0, -1);
1364     iso2022_read_test(TESTSTR("\x1b-A\x1b%/2\x80\x89"
1365                               "big5-0\2\xa1\x40\xa1\x40"),
1366                       0x3000, 0xa1, 0x40, 0, -1);
1367     /* Emacs Big5-in-ISO-2022 mapping */
1368     iso2022_read_test(TESTSTR("\x1b$(0&x86\x1b(B  \x1b$(0DeBv"),
1369                       0x5143, 0x6c23, ' ', ' ', 0x958b, 0x767c, 0, -1);
1370     /* Test from RFC 1922 (ISO-2022-CN) */
1371     iso2022_read_test(TESTSTR("\x1b$)A\x0e=;;;\x1b$)GG(_P\x0f"),
1372                       0x4EA4, 0x6362, 0x4EA4, 0x63db, 0, -1);
1373
1374     printf("read tests completed\n");
1375     printf("total: %d errors\n", total_errs);
1376     return (total_errs != 0);
1377 }
1378
1379 #endif /* TESTMODE */
1380
1381 #else /* ENUM_CHARSETS */
1382
1383 ENUM_CHARSET(CS_ISO2022)
1384
1385 #endif