mdw@git.distorted.org.uk Git - sgt/charset/blob - iso2022.c

   1 /*
   2  * iso2022.c - support for ISO/IEC 2022 (alias ECMA-35).
   3  *
   4  * This isn't a complete implementation of ISO/IEC 2022, but it's
   5  * close.  It only handles decoding, because a fully general encoder
   6  * isn't really useful.  It can decode 8-bit and 7-bit versions, with
   7  * support for single-byte and multi-byte character sets, all four
   8  * containers (G0, G1, G2, and G3), using both single-shift and
   9  * locking-shift sequences.
  10  *
  11  * The general principle is that any valid ISO/IEC 2022 sequence
  12  * should either be correctly decoded or should emit an ERROR.  The
  13  * only exception to this is that the C0 and C1 sets are fixed as
  14  * those of ISO/IEC 6429.  Escape sequences for designating control
  15  * sets are passed through, so a post-processor could fix them up if
  16  * necessary.
  17  *
  18  * DOCS to UTF-8 works.  Other DOCS sequences are ignored, which will
  19  * produce surprising results.
  20  */
  21
  22 #ifndef ENUM_CHARSETS
  23
  24 #include <assert.h>
  25 #include <string.h>
  26
  27 #include "charset.h"
  28 #include "internal.h"
  29 #include "sbcsdat.h"
  30
  31 #define LS1 (0x0E)                  /* C0 locking shift 1: read_iso2022() invokes G1 into GL */
  32 #define LS0 (0x0F)                  /* C0 locking shift 0: read_iso2022() invokes G0 into GL */
  33 #define ESC (0x1B)                  /* starts an escape sequence */
  34 #define SS2 (0x8E)                  /* C1 single shift 2: the next character is taken from G2 */
  35 #define SS3 (0x8F)                  /* C1 single shift 3: the next character is taken from G3 */
  36
  /*
   * The four kinds of character set that can be designated into a
   * container.  The values are stored in the `type' field of the
   * subcharset table and passed to designate().
   */
  enum {
      S4 = 0,                            /* single-byte 94-character set */
      S6 = 1,                            /* single-byte 96-character set */
      M4 = 2,                            /* multi-byte 94^n-character set */
      M6 = 3                             /* multi-byte 96^n-character set */
  };
  38
  39 static long int emacs_big5_1_to_unicode(int, int);   /* Emacs Big5 helpers, defined further down */
  40 static long int emacs_big5_2_to_unicode(int, int);
  41 static int unicode_to_emacs_big5(long int, int *, int *, int *);
  42 static long int cns11643_1_to_unicode(int, int);     /* one wrapper per CNS 11643 table entry */
  43 static long int cns11643_2_to_unicode(int, int);
  44 static long int cns11643_3_to_unicode(int, int);
  45 static long int cns11643_4_to_unicode(int, int);
  46 static long int cns11643_5_to_unicode(int, int);
  47 static long int cns11643_6_to_unicode(int, int);
  48 static long int cns11643_7_to_unicode(int, int);
  49 static long int null_dbcs_to_unicode(int, int);      /* codec for the empty fallback sets */
  50 static int unicode_to_null_dbcs(long int, int *, int *);
  51
  52 typedef int (*to_dbcs_t)(long int, int *, int *);    /* encoder: (code point, &r, &c); the null encoder returns 0 for failure */
  53 typedef int (*to_dbcs_planar_t)(long int, int *, int *, int *);  /* encoder that also reports a plane: (code point, &plane, &r, &c) */
  54
  55 /*
  56  * These macros cast between to_dbcs_planar_t and to_dbcs_t, in
  57  * such a way as to cause a compile-time error if the input is not
  58  * of the appropriate type.
  59  *
  60  * Defining these portably is quite fiddly. My first effort was as
  61  * follows:
  62  *   #define DEPLANARISE(x) ( (x) == (to_dbcs_planar_t)NULL, (to_dbcs_t)(x) )
  63  *
  64  * so that the comparison on the left of the comma provokes the
  65  * type check error, and the cast on the right is the actual
  66  * desired result.
  67  *
  68  * gcc was entirely happy with this. However, when used in a static
  69  * initialiser, MSVC objected - justifiably - that the first half
  70  * of the comma expression wasn't constant and thus the expression
  71  * as a whole was not a constant expression. We can get round this
  72  * by enclosing the comparison in `sizeof', so that it isn't
  73  * actually evaluated.
  74  *
  75  * But then we run into a second problem, which is that C actually
  76  * disallows the use of the comma operator within a constant
  77  * expression for any purpose at all! Presumably this is on the
  78  * basis that its purpose is to have side effects and constant
  79  * expressions can't; unfortunately, this specific case is one in
  80  * which the desired side effect is a compile-time rather than a
  81  * run-time one.
  82  *
  83  * We are permitted to use ?:, however, and that works quite well
  84  * since the actual result of the sizeof expression _is_ evaluable
  85  * at compile time. So here's my final answer, with the unfortunate
  86  * remaining problem of evaluating its arguments multiple times:
  87  */
  88 #define TYPECHECK(x,y) ( sizeof((x)) == sizeof((x)) ? (y) : (y) )  /* value is (y); (x) only appears under sizeof, so it is type-checked but never evaluated */
  89 #define DEPLANARISE(x) TYPECHECK((x) == (to_dbcs_planar_t)NULL, (to_dbcs_t)(x))  /* to_dbcs_planar_t -> to_dbcs_t, for storing in the table */
  90 #define REPLANARISE(x) TYPECHECK((x) == (to_dbcs_t)NULL, (to_dbcs_planar_t)(x))  /* to_dbcs_t -> to_dbcs_planar_t, to undo DEPLANARISE before calling */
  91
  92 /*
  93  * Values used in the `enable' field. Each of these identifies a
  94  * class of character sets; we then have a bitmask indicating which
  95  * classes are allowable in a given mode.
  96  *
  97  * These values are currently only checked on output: for input,
  98  * any ISO 2022 we can comprehend at all is considered acceptable.
  99  */
 100 #define CCS 1                          /* sets defined by the compound text (CTEXT) spec */
 101 #define COS 2                          /* other reasonably standard sets */
 102 #define CPU 3                          /* private-use designations */
 103 #define CDC 4                          /* DOCS segments used within CTEXT */
 104 #define CDU 5                          /* DOCS for UTF-8 */
 105 #define CNU 31                         /* never used: given to the empty fallback sets */
 106
 /*
  * Configuration of one ISO 2022 variant, reached through
  * charset->data.
  *
  * enable_mask says which classes of subcharset (CCS, COS, ...) the
  * variant allows.  The two (kind, intermediate byte, final byte)
  * triples name the sets that read_iso2022() designates into G0 and
  * G1 respectively when it starts from a blank state.
  */
 struct iso2022_mode {
     int enable_mask;
     char ltype;                        /* initial G0 set: kind (S4, S6, M4, M6) */
     char li;                           /* initial G0 set: intermediate byte */
     char lf;                           /* initial G0 set: final byte */
     char rtype;                        /* initial G1 set: kind */
     char ri;                           /* initial G1 set: intermediate byte */
     char rf;                           /* initial G1 set: final byte */
 };
 111
 112 const struct iso2022_subcharset {
         /*
          * One entry per character set that can be designated into
          * G0..G3.  designate() looks entries up by (type, i, f) and
          * stores the entry's index in a 7-bit field of the state, so
          * the table must stay below 128 entries, and every kind
          * needs its '~' fallback entry.
          */
 113     char type, i, f, enable;           /* kind (S4/S6/M4/M6), intermediate byte, final byte, enable class */
 114     int offset;                        /* added to each 7-bit input byte before lookup */
 115     const sbcs_data *sbcs_base;        /* single-byte table; 0 in the multi-byte entries */
 116     long int (*from_dbcs)(int, int);   /* multi-byte decoder, given both bytes after `offset' is applied */
 117
 118     /*
 119      * If to_dbcs_plane < 0, then to_dbcs is used as expected.
 120      * However, if to_dbcs_plane >= 0, then to_dbcs is expected to
 121      * be cast to a to_dbcs_planar_t before use, and the returned
 122      * plane value (the first int *) must equal to_dbcs_plane.
 123      *
 124      * I'd have preferred to do this by means of a union, but you
 125      * can't initialise a selected field of a union at compile
 126      * time. Function pointer casts are guaranteed to work sensibly
 127      * in ISO C (that is, it's undefined what happens if you call a
 128      * function via the wrong type of pointer, but if you cast it
 129      * back to the right type before calling it then it must work),
 130      * so this is safe if ugly.
 131      */
 132     to_dbcs_t to_dbcs;
 133     int to_dbcs_plane;                 /* use to_dbcs_planar iff >= 0 */
 134 } iso2022_subcharsets[] = {
 135     /*
 136      * We list these subcharsets in preference order for output.
 137      * Since the best-defined use of ISO 2022 output is compound
 138      * text, we'll use a preference order which matches that. So we
 139      * begin with the charsets defined in the compound text spec.
 140      */
 141     { S4, 0, 'B', CCS, 0x00, &sbcsdata_CS_ASCII },
 142     { S6, 0, 'A', CCS, 0x80, &sbcsdata_CS_ISO8859_1 },
 143     { S6, 0, 'B', CCS, 0x80, &sbcsdata_CS_ISO8859_2 },
 144     { S6, 0, 'C', CCS, 0x80, &sbcsdata_CS_ISO8859_3 },
 145     { S6, 0, 'D', CCS, 0x80, &sbcsdata_CS_ISO8859_4 },
 146     { S6, 0, 'F', CCS, 0x80, &sbcsdata_CS_ISO8859_7 },
 147     { S6, 0, 'G', CCS, 0x80, &sbcsdata_CS_ISO8859_6 },
 148     { S6, 0, 'H', CCS, 0x80, &sbcsdata_CS_ISO8859_8 },
 149     { S6, 0, 'L', CCS, 0x80, &sbcsdata_CS_ISO8859_5 },
 150     { S6, 0, 'M', CCS, 0x80, &sbcsdata_CS_ISO8859_9 },
 151     { S4, 0, 'I', CCS, 0x80, &sbcsdata_CS_JISX0201 },
 152     { S4, 0, 'J', CCS, 0x00, &sbcsdata_CS_JISX0201 },
 153     { M4, 0, 'A', CCS, -0x21, 0, &gb2312_to_unicode, &unicode_to_gb2312, -1 },
 154     { M4, 0, 'B', CCS, -0x21, 0, &jisx0208_to_unicode, &unicode_to_jisx0208, -1 },
 155     { M4, 0, 'C', CCS, -0x21, 0, &ksx1001_to_unicode, &unicode_to_ksx1001, -1 },
 156     { M4, 0, 'D', CCS, -0x21, 0, &jisx0212_to_unicode, &unicode_to_jisx0212, -1 },
 157
 158     /*
 159      * Next, other reasonably standard things: the rest of the ISO
 160      * 8859 sets, UK-ASCII, and CNS 11643.
 161      */
 162     { S6, 0, 'T', COS, 0x80, &sbcsdata_CS_ISO8859_11 },
 163     { S6, 0, 'V', COS, 0x80, &sbcsdata_CS_ISO8859_10 },
 164     { S6, 0, 'Y', COS, 0x80, &sbcsdata_CS_ISO8859_13 },
 165     { S6, 0, '_', COS, 0x80, &sbcsdata_CS_ISO8859_14 },
 166     { S6, 0, 'b', COS, 0x80, &sbcsdata_CS_ISO8859_15 },
 167     { S6, 0, 'f', COS, 0x80, &sbcsdata_CS_ISO8859_16 },
 168     { S4, 0, 'A', COS, 0x00, &sbcsdata_CS_BS4730 },
 169     { M4, 0, 'G', COS, -0x21, 0, &cns11643_1_to_unicode, DEPLANARISE(&unicode_to_cns11643), 0 },
 170     { M4, 0, 'H', COS, -0x21, 0, &cns11643_2_to_unicode, DEPLANARISE(&unicode_to_cns11643), 1 },
 171     { M4, 0, 'I', COS, -0x21, 0, &cns11643_3_to_unicode, DEPLANARISE(&unicode_to_cns11643), 2 },
 172     { M4, 0, 'J', COS, -0x21, 0, &cns11643_4_to_unicode, DEPLANARISE(&unicode_to_cns11643), 3 },
 173     { M4, 0, 'K', COS, -0x21, 0, &cns11643_5_to_unicode, DEPLANARISE(&unicode_to_cns11643), 4 },
 174     { M4, 0, 'L', COS, -0x21, 0, &cns11643_6_to_unicode, DEPLANARISE(&unicode_to_cns11643), 5 },
 175     { M4, 0, 'M', COS, -0x21, 0, &cns11643_7_to_unicode, DEPLANARISE(&unicode_to_cns11643), 6 },
 176
 177     /*
 178      * Private-use designations: DEC private sets and Emacs's Big5
 179      * abomination.
 180      */
 181     { S4, 0, '0', CPU, 0x00, &sbcsdata_CS_DEC_GRAPHICS },
 182     { S4, 0, '<', CPU, 0x80, &sbcsdata_CS_DEC_MCS },
 183     { M4, 0, '0', CPU, -0x21, 0, &emacs_big5_1_to_unicode, DEPLANARISE(&unicode_to_emacs_big5), 1 },
 184     { M4, 0, '1', CPU, -0x21, 0, &emacs_big5_2_to_unicode, DEPLANARISE(&unicode_to_emacs_big5), 2 },
 185
 186     /*
 187      * Ben left this conditioned out without explanation,
 188      * presumably on the grounds that we don't have a translation
 189      * table for it.
 190      */
 191 #if 0
 192     { M4, 0, '@', CNU }, /* JIS C 6226-1978 */
 193 #endif
 194
 195     /*
 196      * Finally, fallback entries for null character sets.
 197      * designate() falls back to these (final byte '~') whenever the
 198      * requested set is not in the table.
 199      */
 198     { S4, 0, '~', CNU },
 199     { S6, 0, '~', CNU }, /* empty 96-set */
 200     { M4, 0, '~', CNU, 0, 0, &null_dbcs_to_unicode, &unicode_to_null_dbcs, -1 }, /* empty 94^n-set */
 201     { M6, 0, '~', CNU, 0, 0, &null_dbcs_to_unicode, &unicode_to_null_dbcs, -1 }, /* empty 96^n-set */
 202 };
 203
 204 static long int null_dbcs_to_unicode(int r, int c)
 205 {
 206     UNUSEDARG(r);
 207     UNUSEDARG(c);
 208     return ERROR;
 209 }
 /*
  * Encoder for the empty multi-byte fallback sets.  Nothing can be
  * represented, so *r and *c are left untouched and the result is
  * always 0 ("failed to convert anything").
  */
 static int unicode_to_null_dbcs(long int unicode, int *r, int *c)
 {
     int const converted = 0;

     UNUSEDARG(unicode);
     UNUSEDARG(r);
     UNUSEDARG(c);

     return converted;
 }
 217
 218 /*
 219  * Emacs encodes Big5 in COMPOUND_TEXT as two 94x94 character sets.
 220  * We treat Big5 as a 94x191 character set with a bunch of undefined
 221  * columns in the middle, so we have to mess around a bit to make
 222  * things fit.
 223  */
 224
 225 static long int emacs_big5_1_to_unicode(int r, int c)
 226 {
 227     unsigned long s;
 228     s = r * 94 + c;
 229     r = s / 157;
 230     c = s % 157;
 231     if (c >= 64) c += 34; /* Skip over the gap */
 232     return big5_to_unicode(r, c);
 233 }
 234
 235 static long int emacs_big5_2_to_unicode(int r, int c)
 236 {
 237     unsigned long s;
 238     s = r * 94 + c;
 239     r = s / 157 + 40;
 240     c = s % 157;
 241     if (c >= 64) c += 34; /* Skip over the gap */
 242     return big5_to_unicode(r, c);
 243 }
 244
 245 static int unicode_to_emacs_big5(long int unicode, int *p, int *r, int *c)
 246 {
 247     int rr, cc, s;
 248     if (!unicode_to_big5(unicode, &rr, &cc))
 249         return 0;
 250     if (cc >= 64) {
 251         cc -= 34;
 252         assert(cc >= 64);
 253     }
 254     s = rr * 157 + cc;
 255     if (s >= 40*157) {
 256         *p = 2;
 257         s -= 40*157;
 258     } else {
 259         *p = 1;
 260     }
 261     *r = s / 94;
 262     *c = s % 94;
 263     return 1;
 264 }
 265
 266 /* Wrappers for cns11643_to_unicode() */
 /* Two-argument decoder for the table entry with to_dbcs_plane 0. */
 static long int cns11643_1_to_unicode(int r, int c)
 {
     int const plane = 0;

     return cns11643_to_unicode(plane, r, c);
 }
 /* Two-argument decoder for the table entry with to_dbcs_plane 1. */
 static long int cns11643_2_to_unicode(int r, int c)
 {
     int const plane = 1;

     return cns11643_to_unicode(plane, r, c);
 }
 /* Two-argument decoder for the table entry with to_dbcs_plane 2. */
 static long int cns11643_3_to_unicode(int r, int c)
 {
     int const plane = 2;

     return cns11643_to_unicode(plane, r, c);
 }
 /* Two-argument decoder for the table entry with to_dbcs_plane 3. */
 static long int cns11643_4_to_unicode(int r, int c)
 {
     int const plane = 3;

     return cns11643_to_unicode(plane, r, c);
 }
 /* Two-argument decoder for the table entry with to_dbcs_plane 4. */
 static long int cns11643_5_to_unicode(int r, int c)
 {
     int const plane = 4;

     return cns11643_to_unicode(plane, r, c);
 }
 /* Two-argument decoder for the table entry with to_dbcs_plane 5. */
 static long int cns11643_6_to_unicode(int r, int c)
 {
     int const plane = 5;

     return cns11643_to_unicode(plane, r, c);
 }
 /* Two-argument decoder for the table entry with to_dbcs_plane 6. */
 static long int cns11643_7_to_unicode(int r, int c)
 {
     int const plane = 6;

     return cns11643_to_unicode(plane, r, c);
 }
 295
 /*
  * Decoder states, i.e. "what we're currently accumulating".  The
  * value lives in the top three bits of s0, so there is room for
  * exactly these eight.  The order matters: SS3CHAR must directly
  * follow SS2CHAR, and everything from ESCSEQ upwards is treated as
  * "not plain data" by read_iso2022().
  */
 enum {
     IDLE      = 0,  /* None of the below */
     SS2CHAR   = 1,  /* Accumulating a character after SS2 */
     SS3CHAR   = 2,  /* Accumulating a character after SS3 */
     ESCSEQ    = 3,  /* Accumulating an escape sequence */
     ESCDROP   = 4,  /* Discarding an escape sequence */
     ESCPASS   = 5,  /* Passing through an escape sequence */
     DOCSUTF8  = 6,  /* DOCSed into UTF-8 */
     DOCSCTEXT = 7   /* DOCSed into a COMPOUND_TEXT extended segment */
 };
 307
 #if 0
 #include <stdio.h>
 /*
  * Debugging aid, normally compiled out: print the decoder state to
  * stderr.  Shows the state name from the top three bits of s0, the
  * three accumulated bytes, both locking-shift states, and the table
  * indices designated into G0..G3.
  */
 static void dump_state(charset_state *s)
 {
     unsigned s0 = s->s0, s1 = s->s1;
     /* One name per state enum value, in enum order (all eight). */
     char const * const modes[] = { "IDLE", "SS2CHAR", "SS3CHAR",
                                    "ESCSEQ", "ESCDROP", "ESCPASS",
                                    "DOCSUTF8", "DOCSCTEXT" };

     fprintf(stderr, "s0: %s", modes[s0 >> 29]);
     fprintf(stderr, " %02x %02x %02x   ", (s0 >> 16) & 0xff, (s0 >> 8) & 0xff,
             s0 & 0xff);
     fprintf(stderr, "s1: LS%u LS%uR", (s1 >> 30) & 3, (s1 >> 28) & 3);
     fprintf(stderr, " %u %u %u %u\n", s1 & 0x7f, (s1 >> 7) & 0x7f,
             (s1 >> 14) & 0x7f, (s1 >> 21) & 0x7f);
 }
 #endif
 325
 326 static void designate(charset_state *state, int container,
 327                       int type, int ibyte, int fbyte)
 328 {
 329     unsigned long i;
 330
 331     assert(container >= 0 && container <= 3);
 332     assert(type == S4 || type == S6 || type == M4 || type == M6);
 333
 334     for (i = 0; i < lenof(iso2022_subcharsets); i++) {
 335         if (iso2022_subcharsets[i].type == type &&
 336             iso2022_subcharsets[i].i == ibyte &&
 337             iso2022_subcharsets[i].f == fbyte) {
 338             state->s1 &= ~(0x7fL << (container * 7));
 339             state->s1 |= (i << (container * 7));
 340             return;
 341         }
 342     }
 343     /*
 344      * If we don't find the charset, invoke the empty one, so we
 345      * output ERROR rather than garbage.
 346      */
 347     designate(state, container, type, 0, '~');
 348 }
 349
 350 static void do_utf8(long int input_chr,
 351                     charset_state *state,
 352                     void (*emit)(void *ctx, long int output),
 353                     void *emitctx)
 354 {
 355     charset_state ustate;
 356
 357     ustate.s1 = 0;
 358     ustate.s0 = state->s0 & 0x03ffffffL;
 359     read_utf8(NULL, input_chr, &ustate, emit, emitctx);
 360     state->s0 = (state->s0 & ~0x03ffffffL) | (ustate.s0 & 0x03ffffffL);
 361 }
 362
 363 static void docs_utf8(long int input_chr,
 364                       charset_state *state,
 365                       void (*emit)(void *ctx, long int output),
 366                       void *emitctx)
 367 {
 368     int retstate;
 369
 370     /*
 371      * Bits [25:0] of s0 are reserved for read_utf8().
 372      * Bits [27:26] are a tiny state machine to recognise ESC % @.
 373      */
 374     retstate = (state->s0 & 0x0c000000L) >> 26;
 375     if (retstate == 1 && input_chr == '%')
 376         retstate = 2;
 377     else if (retstate == 2 && input_chr == '@') {
 378         /* If we've got a partial UTF-8 sequence, complain. */
 379         if (state->s0 & 0x03ffffffL)
 380             emit(emitctx, ERROR);
 381         state->s0 = 0;
 382         return;
 383     } else {
 384         if (retstate >= 1) do_utf8(ESC, state, emit, emitctx);
 385         if (retstate >= 2) do_utf8('%', state, emit, emitctx);
 386         retstate = 0;
 387         if (input_chr == ESC)
 388             retstate = 1;
 389         else {
 390             do_utf8(input_chr, state, emit, emitctx);
 391         }
 392     }
 393     state->s0 = (state->s0 & ~0x0c000000L) | (retstate << 26);
 394 }
 395
 396 struct ctext_encoding {
 397     char const *name;
 398     char octets_per_char, enable;
 399     charset_spec const *subcs;
 400 };
 401
 402 /*
 403  * In theory, this list is in <ftp://ftp.x.org/pub/DOCS/registry>,
 404  * but XLib appears to have its own ideas, and encodes these three
 405  * (as of X11R6.8.2)
 406  */
 407
 408 extern charset_spec const charset_CS_ISO8859_14;
 409 extern charset_spec const charset_CS_ISO8859_15;
 410 extern charset_spec const charset_CS_BIG5;
 411
 412 static struct ctext_encoding const ctext_encodings[] = {
         /*
          * Encodings recognised in extended segments.  docs_ctext()
          * matches names one byte at a time and only ever moves
          * forward through this table, so entries must stay in
          * ascending name order.  It keeps the entry index in two
          * bits, with the value lenof(ctext_encodings) meaning
          * "unknown", so the table can hold at most three entries.
          */
 413     { "big5-0\2", 0 /* variable */, CDC, &charset_CS_BIG5 },
 414     { "iso8859-14\2", 1, CDC, &charset_CS_ISO8859_14 },
 415     { "iso8859-15\2", 1, CDC, &charset_CS_ISO8859_15 }
 416 };
 417
 418 static void docs_ctext(long int input_chr,
 419                        charset_state *state,
 420                        void (*emit)(void *ctx, long int output),
 421                        void *emitctx)
 422 {
 423     /*
 424      * s0[27:26] = first entry in ctext_encodings that matches
 425      * s0[25:22] = number of characters successfully matched, 0xf if all
 426      * s0[21:8] count the number of octets left in the segment
 427      * s0[7:0] are for sub-charset use
 428      */
 429     int n = (state->s0 >> 22) & 0xf, i = (state->s0 >> 26) & 3, oi = i, j;
 430     int length = (state->s0 >> 8) & 0x3fff;
 431
 432     /*
 433      * Note that we do not bother checking the octets-per-character
 434      * byte against the selected charset when reading. It's
 435      * extremely unlikely that this code will ever have to deal
 436      * with two charset identifiers with the same name and
 437      * different octets-per-character values! If it ever happens,
 438      * we'll have to edit this file anyway so we can modify the
 439      * code then...
 440      */
 441
 442     if (!length) {
 443         /* Haven't read length yet */
 444         if ((state->s0 & 0xff) == 0)
 445             /* ... or even the first byte */
 446             state->s0 |= input_chr;
 447         else {
 448             length = (state->s0 & 0x7f) * 0x80 + (input_chr & 0x7f);
 449             if (length == 0)
 450                 state->s0 = 0;
 451             else
 452                 state->s0 = (state->s0 & 0xf0000000) | (length << 8);
 453         }
 454         return;
 455     }
 456
 457     j = i;
 458     if (n == 0xe) {
 459         /* Skipping unknown encoding.  Look out for STX. */
 460         if (input_chr == 2)
 461             state->s0 = (state->s0 & 0xf0000000) | (i << 26) | (0xf << 22);
 462     } else if (n != 0xf) {
 463         while ((unsigned)j < lenof(ctext_encodings) &&
 464                !memcmp(ctext_encodings[j].name,
 465                        ctext_encodings[oi].name, n)) {
 466             if (ctext_encodings[j].name[n] < input_chr)
 467                 i = ++j;
 468             else
 469                 break;
 470         }
 471         if ((unsigned)i >= lenof(ctext_encodings) ||
 472             memcmp(ctext_encodings[i].name,
 473                    ctext_encodings[oi].name, n) ||
 474             ctext_encodings[i].name[n] != input_chr) {
 475             /* Doom!  We haven't heard of this encoding */
 476             i = lenof(ctext_encodings);
 477             n = 0xe;
 478         } else {
 479             /*
 480              * Otherwise, we have found an additional character in our
 481              * encoding name. See if we have reached the _end_ of our
 482              * name.
 483              */
 484             n++;
 485             if (!ctext_encodings[i].name[n])
 486                 n = 0xf;
 487         }
 488         /*
 489          * Failing _that_, we simply update our encoding-name-
 490          * tracking state.
 491          */
 492         assert(i < 4 && n < 16);
 493         state->s0 = (state->s0 & 0xf0000000) | (i << 26) | (n << 22);
 494     } else {
 495         if ((unsigned)i >= lenof(ctext_encodings))
 496             emit(emitctx, ERROR);
 497         else {
 498             charset_state substate;
 499             charset_spec const *subcs = ctext_encodings[i].subcs;
 500             substate.s1 = 0;
 501             substate.s0 = state->s0 & 0xff;
 502             subcs->read(subcs, input_chr, &substate, emit, emitctx);
 503             state->s0 = (state->s0 & ~0xff) | (substate.s0 & 0xff);
 504         }
 505     }
 506     if (!--length)
 507         state->s0 = 0;
 508     else
 509         state->s0 = (state->s0 &~0x003fff00) | (length << 8);
 510 }
 511
 512 static void read_iso2022(charset_spec const *charset, long int input_chr,
 513                          charset_state *state,
 514                          void (*emit)(void *ctx, long int output),
 515                          void *emitctx)
 516 {
 517     struct iso2022_mode const *mode = (struct iso2022_mode *)charset->data;
 518
 519     /* dump_state(state); */
 520     /*
 521      * We have to make fairly efficient use of the 64 bits of state
 522      * available to us.  Long-term state goes in s1, and consists of
 523      * the identities of the character sets designated as G0/G1/G2/G3
 524      * and the locking-shift states for GL and GR.  Short-term state
 525      * goes in s0: The bottom half of s0 accumulates characters for an
 526      * escape sequence or a multi-byte character, while the top three
 527      * bits indicate what they're being accumulated for.  After DOCS,
 528      * the bottom 29 bits of state are available for the DOCS function
 529      * to use -- the UTF-8 one uses the bottom 26 for UTF-8 decoding
 530      * and the top two to recognised ESC % @.
 531      *
 532      * s0[31:29] = state enum
 533      * s0[24:0] = accumulated bytes
 534      * s1[31:30] = GL locking-shift state
 535      * s1[29:28] = GR locking-shift state
 536      * s1[27:21] = G3 charset
 537      * s1[20:14] = G2 charset
 538      * s1[13:7] = G1 charset
 539      * s1[6:0] = G0 charset
 540      */
 541
 542 #define LEFT 30
 543 #define RIGHT 28
 544 #define LOCKING_SHIFT(n,side) \
 545         (state->s1 = (state->s1 & ~(3UL<<(side))) | ((n ## UL)<<(side)))
 546 #define MODE ((state->s0 & 0xe0000000UL) >> 29)
 547 #define ENTER_MODE(m) (state->s0 = (state->s0 & ~0xe0000000UL) | ((unsigned long)(m)<<29))
 548 #define SINGLE_SHIFT(n) ENTER_MODE(SS2CHAR - 2 + (n))
 549 #define ASSERT_IDLE do {                                                \
 550         if (state->s0 != 0) emit(emitctx, ERROR);                       \
 551         state->s0 = 0;                                                  \
 552 } while (0)
 553
 554     if (state->s1 == 0) {
 555         /*
 556          * Since there's no LS0R, this means we must just have started.
 557          * Set up a sane initial state (LS0, LS1R, ASCII in G0/G1/G2/G3).
 558          */
 559         LOCKING_SHIFT(0, LEFT);
 560         LOCKING_SHIFT(1, RIGHT);
 561         designate(state, 0, mode->ltype, mode->li, mode->lf);
 562         designate(state, 1, mode->rtype, mode->ri, mode->rf);
 563         designate(state, 2, S4, 0, 'B');
 564         designate(state, 3, S4, 0, 'B');
 565     }
 566
 567     if (MODE == DOCSUTF8) {
 568         docs_utf8(input_chr, state, emit, emitctx);
 569         return;
 570     }
 571     if (MODE == DOCSCTEXT) {
 572         docs_ctext(input_chr, state, emit, emitctx);
 573         return;
 574     }
 575
 576     if ((input_chr & 0x60) == 0x00) {
 577         /* C0 or C1 control */
 578         ASSERT_IDLE;
 579         switch (input_chr) {
 580           case ESC:
 581             ENTER_MODE(ESCSEQ);
 582             break;
 583           case LS0:
 584             LOCKING_SHIFT(0, LEFT);
 585             break;
 586           case LS1:
 587             LOCKING_SHIFT(1, LEFT);
 588             break;
 589           case SS2:
 590             SINGLE_SHIFT(2);
 591             break;
 592           case SS3:
 593             SINGLE_SHIFT(3);
 594             break;
 595           default:
 596             emit(emitctx, input_chr);
 597             break;
 598         }
 599     } else if ((input_chr & 0x80) || MODE < ESCSEQ) {
 600         int is_gl = 0;
 601         struct iso2022_subcharset const *subcs;
 602         unsigned container;
 603         long input_7bit;
 604         /*
 605          * Actual data.
 606          * Force idle state if we're in mid escape sequence, or in a
 607          * multi-byte character with a different top bit.
 608          */
 609         if (MODE >= ESCSEQ ||
 610             ((state->s0 & 0x00ff0000L) != 0 &&
 611              (((state->s0 >> 16) ^ input_chr) & 0x80)))
 612             ASSERT_IDLE;
 613         if (MODE == SS2CHAR || MODE == SS3CHAR) /* Single-shift */
 614             container = MODE - SS2CHAR + 2;
 615         else if (input_chr >= 0x80) /* GR */
 616             container = (state->s1 >> 28) & 3;
 617         else { /* GL */
 618             container = state->s1 >> 30;
 619             is_gl = 1;
 620         }
 621         input_7bit = input_chr & ~0x80;
 622         subcs = &iso2022_subcharsets[(state->s1 >> (container * 7)) & 0x7f];
 623         if ((subcs->type == S4 || subcs->type == M4) &&
 624             (input_7bit == 0x20 || input_7bit == 0x7f)) {
 625             /* characters not in 94-char set */
 626             if (is_gl) emit(emitctx, input_7bit);
 627             else emit(emitctx, ERROR);
 628         } else if (subcs->type == M4 || subcs->type == M6) {
 629             if ((state->s0 & 0x00ff0000L) == 0) {
 630                 state->s0 |= input_chr << 16;
 631                 return;
 632             } else {
 633                 emit(emitctx,
 634                      subcs->from_dbcs(((state->s0 >> 16) & 0x7f) +
 635                                       subcs->offset,
 636                                       input_7bit + subcs->offset));
 637             }
 638         } else {
 639             if ((state->s0 & 0x00ff0000L) != 0)
 640                 emit(emitctx, ERROR);
 641             emit(emitctx, subcs->sbcs_base ?
 642                  sbcs_to_unicode(subcs->sbcs_base, input_7bit + subcs->offset):
 643                  ERROR);
 644         }
 645         state->s0 = 0;
 646     } else {
 647         unsigned i1, i2;
 648         if (MODE == ESCPASS) {
 649             emit(emitctx, input_chr);
 650             if ((input_chr & 0xf0) != 0x20)
 651                 ENTER_MODE(IDLE);
 652             return;
 653         }
 654
 655         /*
 656          * Intermediate bytes shall be any of the 16 positions of
 657          * column 02 of the code table; they are denoted by the symbol
 658          * I.
 659          */
 660         if ((input_chr & 0xf0) == 0x20) {
 661             if (((state->s0 >> 16) & 0xff) == 0)
 662                 state->s0 |= input_chr << 16;
 663             else if (((state->s0 >> 8) & 0xff) == 0)
 664                 state->s0 |= input_chr << 8;
 665             else {
 666                 /* Long escape sequence.  Switch to ESCPASS or ESCDROP. */
 667                 i1 = (state->s0 >> 16) & 0xff;
 668                 i2 = (state->s0 >> 8) & 0xff;
 669                 switch (i1) {
 670                   case '(': case ')': case '*': case '+':
 671                   case '-': case '.': case '/':
 672                   case '$':
 673                     ENTER_MODE(ESCDROP);
 674                     break;
 675                   default:
 676                     emit(emitctx, ESC);
 677                     emit(emitctx, i1);
 678                     emit(emitctx, i2);
 679                     emit(emitctx, input_chr);
 680                     state->s0 = 0;
 681                     ENTER_MODE(ESCPASS);
 682                     break;
 683                 }
 684             }
 685             return;
 686         }
 687
 688         /*
 689          * Final bytes shall be any of the 79 positions of columns 03
 690          * to 07 of the code table excluding position 07/15; they are
 691          * denoted by the symbol F.
 692          */
 693         i1 = (state->s0 >> 16) & 0xff;
 694         i2 = (state->s0 >> 8) & 0xff;
 695         if (MODE == ESCDROP)
 696             input_chr = 0; /* Make sure it won't match. */
 697         state->s0 = 0;
 698         switch (i1) {
 699           case 0: /* No intermediate bytes */
 700             switch (input_chr) {
 701               case 'N': /* SS2 */
 702                 SINGLE_SHIFT(2);
 703                 break;
 704               case 'O': /* SS3 */
 705                 SINGLE_SHIFT(3);
 706                 break;
 707               case 'n': /* LS2 */
 708                 LOCKING_SHIFT(2, LEFT);
 709                 break;
 710               case 'o': /* LS3 */
 711                 LOCKING_SHIFT(3, LEFT);
 712                 break;
 713               case '|': /* LS3R */
 714                 LOCKING_SHIFT(3, RIGHT);
 715                 break;
 716               case '}': /* LS2R */
 717                 LOCKING_SHIFT(2, RIGHT);
 718                 break;
 719               case '~': /* LS1R */
 720                 LOCKING_SHIFT(1, RIGHT);
 721                 break;
 722               default:
 723                 /* Unsupported escape sequence.  Spit it back out. */
 724                 emit(emitctx, ESC);
 725                 emit(emitctx, input_chr);
 726             }
 727             break;
 728           case ' ': /* ACS */
 729             /*
 730              * Various coding structure facilities specify that designating
 731              * a code element also invokes it.  As far as I can see, invoking
 732              * it now will have the same practical effect, since those
 733              * facilities also ban the use of locking shifts.
 734              */
 735             switch (input_chr) {
 736               case 'A': /* G0 element used and invoked into GL */
 737                 LOCKING_SHIFT(0, LEFT);
 738                 break;
 739               case 'C': /* G0 in GL, G1 in GR */
 740               case 'D': /* Ditto, at least for 8-bit codes */
 741               case 'L': /* ISO 4873 (ECMA-43) level 1 */
 742               case 'M': /* ISO 4873 (ECMA-43) level 2 */
 743                 LOCKING_SHIFT(0, LEFT);
 744                 LOCKING_SHIFT(1, RIGHT);
 745                 break;
 746             }
 747             break;
 748           case '&': /* IRR */
 749             /*
 750              * IRR (Identify Revised Registration) is ignored here,
 751              * since any revised registration must be
 752              * upward-compatible with the old one, so either we'll
 753              * support the new one or we'll emit ERROR when we run
 754              * into a new character.  In either case, there's nothing
 755              * to be done here.
 756              */
 757             break;
 758           case '(': /* GZD4 */  case ')': /* G1D4 */
 759           case '*': /* G2D4 */  case '+': /* G3D4 */
 760             designate(state, i1 - '(', S4, i2, input_chr);
 761             break;
 762           case '-': /* G1D6 */  case '.': /* G2D6 */  case '/': /* G3D6 */
 763             designate(state, i1 - ',', S6, i2, input_chr);
 764             break;
 765           case '$': /* G?DM? */
 766             switch (i2) {
 767               case 0: /* Obsolete version of GZDM4 */
 768                 i2 = '(';
 769               case '(': /* GZDM4 */  case ')': /* G1DM4 */
 770               case '*': /* G2DM4 */  case '+': /* G3DM4 */
 771                 designate(state, i2 - '(', M4, 0, input_chr);
 772                 break;
 773               case '-': /* G1DM6 */
 774               case '.': /* G2DM6 */  case '/': /* G3DM6 */
 775                 designate(state, i2 - ',', M6, 0, input_chr);
 776                 break;
 777               default:
 778                 emit(emitctx, ERROR);
 779                 break;
 780             }
 781           case '%': /* DOCS */
 782             /* XXX What's a reasonable way to handle an unrecognised DOCS? */
 783             switch (i2) {
 784               case 0:
 785                 switch (input_chr) {
 786                   case 'G':
 787                     ENTER_MODE(DOCSUTF8);
 788                     break;
 789                 }
 790                 break;
 791               case '/':
 792                 switch (input_chr) {
 793                   case '1': case '2':
 794                     ENTER_MODE(DOCSCTEXT);
 795                     break;
 796                 }
 797                 break;
 798             }
 799             break;
 800           default:
 801             /* Unsupported nF escape sequence.  Re-emit it. */
 802             emit(emitctx, ESC);
 803             emit(emitctx, i1);
 804             if (i2) emit(emitctx, i2);
 805             emit(emitctx, input_chr);
 806             break;
 807         }
 808     }
 809 }
 810
 811 static void oselect(charset_state *state, int i, int right,
 812                     void (*emit)(void *ctx, long int output),
 813                     void *emitctx)
 814 {
 815     int shift = (right ? 31-7 : 31-7-7);
 816     struct iso2022_subcharset const *subcs = &iso2022_subcharsets[i];
 817
 818     if (((state->s1 >> shift) & 0x7F) != (unsigned)i) {
 819         state->s1 &= ~(0x7FL << shift);
 820         state->s1 |= (i << shift);
 821
 822         if (emit) {
 823             emit(emitctx, ESC);
 824             if (subcs->type == M4 || subcs->type == M6)
 825                 emit(emitctx, '$');
 826             if (subcs->type == S6 || subcs->type == M6) {
 827                 assert(right);
 828                 emit(emitctx, '-');
 829             } else if (right) {
 830                 emit(emitctx, ')');
 831             } else {
 832                 emit(emitctx, '(');
 833             }
 834             if (subcs->i)
 835                 emit(emitctx, subcs->i);
 836             emit(emitctx, subcs->f);
 837         }
 838     }
 839 }
 840
 841 static void docs_char(charset_state *state,
 842                       void (*emit)(void *ctx, long int output),
 843                       void *emitctx, int cset, char *data, int datalen)
 844 {
 845     int curr_cset, currlen, i;
 846
 847     /*
 848      * cset is the index into ctext_encodings[]. It can also be -1
 849      * to mean DOCS UTF-8, or -2 to mean no DOCS (ordinary 2022).
 850      * In the latter case, `chr' is ignored.
 851      */
 852
 853     /*
 854      * First, terminate a DOCS segment if necessary. We always have
 855      * to terminate a DOCS segment if one is active and we're about
 856      * to switch to a different one; we might also have to
 857      * terminate a length-encoded DOCS segment if we've run out of
 858      * storage space to accumulate characters in it.
 859      */
 860     curr_cset = ((state->s1 >> 14) & 7) - 2;
 861     currlen = ((state->s1 >> 11) & 7);
 862     if ((curr_cset != -2 && curr_cset != cset) ||
 863         (curr_cset >= 0 && currlen + datalen > 5)) {
 864         if (curr_cset == -1) {
 865             /*
 866              * Terminating DOCS UTF-8 is easy.
 867              */
 868             emit(emitctx, ESC);
 869             emit(emitctx, '%');
 870             emit(emitctx, '@');
 871         } else {
 872             int len;
 873
 874             /*
 875              * To terminate a length-encoded DOCS segment we must
 876              * actually output the whole thing.
 877              */
 878             emit(emitctx, ESC);
 879             emit(emitctx, '%');
 880             emit(emitctx, '/');
 881             emit(emitctx, '0' + ctext_encodings[curr_cset].octets_per_char);
 882             len = currlen + datalen +
 883                 strlen(ctext_encodings[curr_cset].name);
 884             assert(len < (1 << 14));
 885             emit(emitctx, 0x80 | ((len >> 7) & 0x7F));
 886             emit(emitctx, 0x80 | ((len     ) & 0x7F));
 887             /* The name stored in ctext_encodings[] includes the trailing \2 */
 888             for (i = 0; ctext_encodings[curr_cset].name[i]; i++)
 889                 emit(emitctx, ctext_encodings[curr_cset].name[i]);
 890             for (i = 0; i < currlen; i++)
 891                 emit(emitctx,
 892                      (i == 0 ? state->s1 : state->s0 >> (8*(4-i))) & 0xFF);
 893             for (i = 0; i < datalen; i++)
 894                 emit(emitctx, data[i]);
 895
 896             /*
 897              * We've now dealt with the input data, so clear it so
 898              * we don't try to do so again below.
 899              */
 900             datalen = 0;
 901         }
 902         curr_cset = -2;
 903     }
 904
 905     /*
 906      * Now, start a DOCS segment if necessary.
 907      */
 908     if (curr_cset != cset) {
 909         assert(cset != -2);
 910         if (cset == -1) {
 911             /*
 912              * Start DOCS UTF-8.
 913              */
 914             emit(emitctx, ESC);
 915             emit(emitctx, '%');
 916             emit(emitctx, 'G');
 917         } else {
 918             /*
 919              * Starting a length-encoded DOCS segment is simply a
 920              * matter of setting our stored length counter to zero.
 921              */
 922             currlen = 0;
 923             state->s1 &= ~(7 << 11);
 924             state->s1 &= ~0xFF;
 925             state->s0 = 0;
 926         }
 927     }
 928     state->s1 &= ~(7 << 14);
 929     assert((cset+2) >= 0 && (cset+2) < 8);
 930     state->s1 |= ((cset+2) << 14);
 931
 932     /*
 933      * Now we're in the right DOCS state. Actually deal with the
 934      * input data, if we haven't already done so above.
 935      */
 936     if (datalen > 0) {
 937         assert(cset != 2);
 938         if (cset == -1) {
 939             /*
 940              * In DOCS UTF-8, we output data as soon as we get it.
 941              */
 942             for (i = 0; i < datalen; i++)
 943                 emit(emitctx, data[i]);
 944         } else {
 945             /*
 946              * In length-encoded DOCS, we just store our data and
 947              * bide our time. It'll all be output when we fill up
 948              * or switch to another character set.
 949              */
 950             assert(currlen + datalen <= 5);   /* overflow handled already */
 951             for (i = 0; i < datalen; i++) {
 952                 if (currlen + i == 0)
 953                     state->s1 |= data[i] & 0xFF;
 954                 else
 955                     state->s0 |= (data[i] & 0xFF) << (8*(4-(currlen+i)));
 956             }
 957             currlen += datalen;
 958             assert(currlen >= 0 && currlen < 8);
 959             state->s1 &= ~(7 << 11);
 960             state->s1 |= (currlen << 11);
 961         }
 962     }
 963 }
 964
 965 static void write_to_pointer(void *ctx, long int output)
 966 {
 967     char **ptr = (char **)ctx;
 968     *(*ptr)++ = output;
 969 }
 970
 971 /*
 972  * Writing full ISO-2022 is not useful in very many circumstances.
 973  * One of the few situations in which it _is_ useful is generating
 974  * X11 COMPOUND_TEXT; therefore, this writing function will obey
 975  * the compound text restrictions and hence output the subset of
 976  * ISO-2022 that's usable in that context.
 977  *
 978  * The subset in question is roughly that we use GL/GR for G0/G1
 979  * always, and that the _only_ escape sequences we output (other
 980  * than the occasional DOCS) are those which designate different
 981  * subcharsets into G0 and G1. There are additional constraints
 982  * about which things go in which container; see below.
 983  *
 984  * FIXME: this wants some decent tests to be written, and also the
 985  * exact output policy for compound text wants thinking about more
 986  * carefully.
 987  */
 988 static int write_iso2022(charset_spec const *charset, long int input_chr,
 989                          charset_state *state,
 990                          void (*emit)(void *ctx, long int output),
 991                          void *emitctx)
 992 {
 993     int i;
 994     struct iso2022_subcharset const *subcs;
 995     struct iso2022_mode const *mode = (struct iso2022_mode *)charset->data;
 996     to_dbcs_planar_t last_planar_dbcs = NULL;
 997     int last_p, last_r, last_c;
 998     long int c1, c2;
 999
1000     /*
1001      * For output, I allocate the state variables as follows:
1002      *
1003      *  s1[31] == 1 if output state has been initialised
1004      *  s1[30:24] == G1 charset (always in GR)
1005      *  s1[23:17] == G0 charset (always in GL)
1006      *  s1[16:14] == DOCS index plus 2 (because -1 and -2 are special)
1007      *  s1[13:11] == number of DOCS accumulated characters (up to five)
1008      *  s1[7:0] + s0[31:0] == DOCS collected characters
1009      */
1010
1011     if (!state->s1) {
1012         state->s0 = 0x00000000UL;
1013         state->s1 = 0x80000000UL;
1014         /*
1015          * Start with US-ASCII in GL and also in GR.
1016          */
1017         for (i = 0; (unsigned)i < lenof(iso2022_subcharsets); i++) {
1018             subcs = &iso2022_subcharsets[i];
1019             if (subcs->type == mode->ltype &&
1020                 subcs->i == mode->li &&
1021                 subcs->f == mode->lf)
1022                 oselect(state, i, FALSE, NULL, NULL);
1023             if (subcs->type == mode->rtype &&
1024                 subcs->i == mode->ri &&
1025                 subcs->f == mode->rf)
1026                 oselect(state, i, TRUE, NULL, NULL);
1027         }
1028     }
1029
1030     if (input_chr == -1) {
1031         /*
1032          * Special case: reset encoding state.
1033          */
1034         docs_char(state, emit, emitctx, -2, NULL, 0);   /* leave DOCS */
1035
1036         for (i = 0; (unsigned)i < lenof(iso2022_subcharsets); i++) {
1037             subcs = &iso2022_subcharsets[i];
1038             if (subcs->type == mode->ltype &&
1039                 subcs->i == mode->li &&
1040                 subcs->f == mode->lf)
1041                 oselect(state, i, FALSE, emit, emitctx);
1042             if (subcs->type == mode->rtype &&
1043                 subcs->i == mode->ri &&
1044                 subcs->f == mode->rf)
1045                 oselect(state, i, TRUE, emit, emitctx);
1046         }
1047         return TRUE;
1048     }
1049
1050     /*
1051      * Special-case characters: Space, Delete, and anything in C0
1052      * or C1 are output unchanged.
1053      */
1054     if (input_chr <= 0x20 || (input_chr >= 0x7F && input_chr < 0xA0)) {
1055         emit(emitctx, input_chr);
1056         return TRUE;
1057     }
1058
1059     /*
1060      * Analyse the input character and work out which subcharset it
1061      * belongs to.
1062      */
1063     for (i = 0; (unsigned)i < lenof(iso2022_subcharsets); i++) {
1064         subcs = &iso2022_subcharsets[i];
1065         if (!(mode->enable_mask & (1 << subcs->enable)))
1066             continue;                  /* this charset is disabled */
1067         if (subcs->sbcs_base) {
1068             c1 = sbcs_from_unicode(subcs->sbcs_base, input_chr);
1069             c1 -= subcs->offset;
1070             if (c1 >= 0x20 && c1 <= 0x7f) {
1071                 c2 = 0;
1072                 break;
1073             }
1074         } else if (subcs->to_dbcs) {
1075             if (subcs->to_dbcs_plane >= 0) {
1076                 /*
1077                  * Since multiplanar DBCSes almost by definition
1078                  * involve several entries in iso2022_subcharsets
1079                  * with the same to_dbcs function and different
1080                  * plane values, we remember the last such function
1081                  * we called and what its result was, so that we
1082                  * don't (for example) have to call
1083                  * unicode_to_cns11643 seven times.
1084                  */
1085                 if (last_planar_dbcs != REPLANARISE(subcs->to_dbcs)) {
1086                     last_planar_dbcs = REPLANARISE(subcs->to_dbcs);
1087                     if (!last_planar_dbcs(input_chr,
1088                                           &last_p, &last_r, &last_c))
1089                         last_p = -1;
1090                 }
1091             } else {
1092                 last_p = subcs->to_dbcs_plane;
1093                 if (!subcs->to_dbcs(input_chr, &last_r, &last_c))
1094                     last_p = 0;        /* cannot match since to_dbcs_plane<0 */
1095             }
1096
1097             if (last_p == subcs->to_dbcs_plane) {
1098                 c1 = last_r - subcs->offset;
1099                 c2 = last_c - subcs->offset;
1100                 assert(c1 >= 0x20 && c1 <= 0x7f);
1101                 assert(c2 >= 0x20 && c2 <= 0x7f);
1102                 break;
1103             }
1104         }
1105     }
1106
1107     if ((unsigned)i < lenof(iso2022_subcharsets)) {
1108         int right;
1109
1110         /*
1111          * Our character is represented by c1 (and possibly also
1112          * c2) in subcharset `subcs'. So now we must decide whether
1113          * to designate that character set into G0/GL or G1/GR.
1114          *
1115          * Any S6 or M6 subcharset has to go in GR because it won't
1116          * fit in GL. In addition, the compound text rules state
1117          * that any single-byte subcharset defined as the
1118          * right-hand half of some SBCS must go in GR.
1119          *
1120          * M4 subcharsets can go in either half according to the
1121          * rules. I choose to put them in GR always because it's a
1122          * simple policy with reasonable behaviour (facilitates
1123          * switching between them and ASCII).
1124          */
1125         right = (subcs->type == S6 || subcs->type == M6 || subcs->type == M4 ||
1126                  (subcs->sbcs_base && subcs->offset == 0x80));
1127
1128         /*
1129          * If we're in a DOCS mode, leave it.
1130          */
1131         docs_char(state, emit, emitctx, -2, NULL, 0);
1132
1133         /*
1134          * If this subcharset is not already selected in that
1135          * container, select it.
1136          */
1137         oselect(state, i, right, emit, emitctx);
1138
1139         /*
1140          * Now emit the actual characters.
1141          */
1142         if (right) {
1143             assert(c1 >= 0x20 && c1 <= 0x7f);
1144             emit(emitctx, c1 | 0x80);
1145             if (c2) {
1146                 assert(c2 >= 0x20 && c2 <= 0x7f);
1147                 emit(emitctx, c2 | 0x80);
1148             }
1149         } else {
1150             assert(c1 > 0x20 && c1 < 0x7f);
1151             emit(emitctx, c1);
1152             if (c2) {
1153                 assert(c2 > 0x20 && c2 < 0x7f);
1154                 emit(emitctx, c2);
1155             }
1156         }
1157
1158         return TRUE;
1159     }
1160
1161     /*
1162      * Fall back to DOCS.
1163      */
1164     {
1165         char data[10];
1166         char *p = data;
1167         int i, cs;
1168
1169         cs = -2;                       /* means failure */
1170
1171         for (i = 0; (unsigned)i <= lenof(ctext_encodings); i++) {
1172             charset_state substate;
1173             charset_spec const *subcs = ctext_encodings[i].subcs;
1174
1175             /*
1176              * We assume that all character sets dealt with by DOCS
1177              * are stateless for output purposes.
1178              */
1179             substate.s1 = substate.s0 = 0;
1180             p = data;
1181
1182             if ((unsigned)i < lenof(ctext_encodings)) {
1183                 if ((mode->enable_mask & (1 << ctext_encodings[i].enable)) &&
1184                     subcs->write(subcs, input_chr, &substate,
1185                                  write_to_pointer, &p)) {
1186                     cs = i;
1187                     break;
1188                 }
1189             } else {
1190                 if ((mode->enable_mask & (1 << CDU)) &&
1191                     write_utf8(NULL, input_chr, NULL, write_to_pointer, &p)) {
1192                     cs = -1;
1193                     break;
1194                 }
1195             }
1196         }
1197
1198         if (cs != -2) {
1199             docs_char(state, emit, emitctx, cs, data, p - data);
1200             return TRUE;
1201         }
1202     }
1203
1204     return FALSE;
1205 }
1206
1207 /*
1208  * Full ISO 2022 output with all options on. Not entirely sure what
1209  * if anything this is useful for, but here it is anyway. All
1210  * output character sets and DOCS variants are permitted; all
1211  * containers start out with ASCII in them.
1212  */
1213 static const struct iso2022_mode iso2022_all = {
1214     (1<<CCS) | (1<<COS) | (1<<CPU) | (1<<CDC) | (1<<CDU),
1215     S4, 0, 'B', S4, 0, 'B',
1216 };
1217
1218 const charset_spec charset_CS_ISO2022 = {
1219     CS_ISO2022, read_iso2022, write_iso2022, &iso2022_all
1220 };
1221
1222 /*
1223  * X11 compound text. A subset of output charsets is permitted, and
1224  * G1/GR starts off in ISO8859-1.
1225  */
1226 static const struct iso2022_mode iso2022_ctext = {
1227     (1<<CCS) | (1<<CDC),
1228     S4, 0, 'B', S6, 0, 'A',
1229 };
1230
1231 const charset_spec charset_CS_CTEXT = {
1232     CS_CTEXT, read_iso2022, write_iso2022, &iso2022_ctext
1233 };
1234
1235 #ifdef TESTMODE
1236
1237 #include <stdio.h>
1238 #include <stdarg.h>
1239 #include <string.h>
1240
/* Running count of test failures; main() reports it and exits nonzero if set. */
int total_errs;
1242
/*
 * Emit callback for the read tests. `ctx' points at a `wchar_t *'
 * cursor: `output' is stored at the cursor, which is then advanced
 * by one element. No bounds checking is performed.
 */
void iso2022_emit(void *ctx, long output)
{
    wchar_t **cursor = ctx;

    **cursor = output;
    ++*cursor;
}
1248
1249 void iso2022_read_test(int line, char *input, int inlen, ...)
1250 {
1251     va_list ap;
1252     wchar_t *p, str[512];
1253     int i;
1254     charset_state state;
1255     unsigned long l;
1256
1257     state.s0 = state.s1 = 0;
1258     p = str;
1259
1260     for (i = 0; i < inlen; i++)
1261         read_iso2022(NULL, input[i] & 0xFF, &state, iso2022_emit, &p);
1262
1263     va_start(ap, inlen);
1264     l = 0;
1265     for (i = 0; i < p - str; i++) {
1266         l = va_arg(ap, long int);
1267         if (l == -1) {
1268             printf("%d: correct string shorter than output\n", line);
1269             total_errs++;
1270             break;
1271         }
1272         if (l != str[i]) {
1273             printf("%d: char %d came out as %08x, should be %08lx\n",
1274                     line, i, str[i], l);
1275             total_errs++;
1276         }
1277     }
1278     if (l != -1) {
1279         l = va_arg(ap, long int);
1280         if (l != -1) {
1281             printf("%d: correct string longer than output\n", line);
1282             total_errs++;
1283         }
1284     }
1285     va_end(ap);
1286 }
1287
/*
 * Macro to concoct the first three parameters of iso2022_read_test:
 * the current source line, the string itself, and its length.
 * lenof() is applied to the string literal itself, so (assuming
 * lenof counts array elements) the terminating NUL is included;
 * this is consistent with every expected list in main() ending with
 * 0 before the -1 sentinel.
 */
#define TESTSTR(x) __LINE__, x, lenof(x)
1290
/*
 * Test driver (built only when TESTMODE is defined). Runs a series
 * of decoding tests through iso2022_read_test(), each giving an
 * input byte string and the expected decoded characters terminated
 * by -1, then prints the total number of errors. Returns nonzero if
 * any test failed.
 */
int main(void)
{
    printf("read tests beginning\n");
    /* Simple test (Emacs sample text for Japanese, in ISO-2022-JP) */
    iso2022_read_test(TESTSTR("Japanese (\x1b$BF|K\\8l\x1b(B)\t"
                              "\x1b$B$3$s$K$A$O\x1b(B, "
                              "\x1b$B%3%s%K%A%O\x1b(B\n"),
                      'J','a','p','a','n','e','s','e',' ','(',
                      0x65E5, 0x672C, 0x8A9E, ')', '\t',
                      0x3053, 0x3093, 0x306b, 0x3061, 0x306f, ',', ' ',
                      0x30b3, 0x30f3, 0x30cb, 0x30c1, 0x30cf, '\n', 0, -1);
    /* Same thing in EUC-JP (with designations, and half-width katakana) */
    iso2022_read_test(TESTSTR("\x1b$)B\x1b*I\x1b$+D"
                              "Japanese (\xc6\xfc\xcb\xdc\xb8\xec)\t"
                              "\xa4\xb3\xa4\xf3\xa4\xcb\xa4\xc1\xa4\xcf, "
                              "\x8e\xba\x8e\xdd\x8e\xc6\x8e\xc1\x8e\xca\n"),
                      'J','a','p','a','n','e','s','e',' ','(',
                      0x65E5, 0x672C, 0x8A9E, ')', '\t',
                      0x3053, 0x3093, 0x306b, 0x3061, 0x306f, ',', ' ',
                      0xff7a, 0xff9d, 0xff86, 0xff81, 0xff8a, '\n', 0, -1);
    /* Multibyte single-shift */
    iso2022_read_test(TESTSTR("\x1b$)B\x1b*I\x1b$+D\x8f\"/!"),
                      0x02D8, '!', 0, -1);
    /* Non-existent SBCS */
    iso2022_read_test(TESTSTR("\x1b(!Zfnord\n"),
                      ERROR, ERROR, ERROR, ERROR, ERROR, '\n', 0, -1);
    /* Pass-through of ordinary escape sequences, including a long one */
    iso2022_read_test(TESTSTR("\x1b""b\x1b#5\x1b#!!!5"),
                      0x1B, 'b', 0x1B, '#', '5',
                      0x1B, '#', '!', '!', '!', '5', 0, -1);
    /* Non-existent DBCS (also 5-byte escape sequence) */
    iso2022_read_test(TESTSTR("\x1b$(!Bfnord!"),
                      ERROR, ERROR, ERROR, 0, -1);
    /* Incomplete DB characters */
    iso2022_read_test(TESTSTR("\x1b$B(,(\x1b(BHi\x1b$B(,(\n"),
                      0x2501, ERROR, 'H', 'i', 0x2501, ERROR, '\n', 0, -1);
    iso2022_read_test(TESTSTR("\x1b$)B\x1b*I\x1b$+D\xa4""B"),
                      ERROR, 'B', 0, -1);
    iso2022_read_test(TESTSTR("\x1b$)B\x1b*I\x1b$+D\x0e\x1b|$\xa2\xaf"),
                      ERROR, 0x02D8, 0, -1);
    /* Incomplete escape sequence */
    iso2022_read_test(TESTSTR("\x1b\n"), ERROR, '\n', 0, -1);
    iso2022_read_test(TESTSTR("\x1b-A\x1b~\x1b\xa1"), ERROR, 0xa1, 0, -1);
    /* Incomplete single-shift */
    iso2022_read_test(TESTSTR("\x8e\n"), ERROR, '\n', 0, -1);
    iso2022_read_test(TESTSTR("\x1b$*B\x8e(\n"), ERROR, '\n', 0, -1);
    /* Corner cases (02/00 and 07/15) */
    iso2022_read_test(TESTSTR("\x1b(B\x20\x7f"), 0x20, 0x7f, 0, -1);
    iso2022_read_test(TESTSTR("\x1b(I\x20\x7f"), 0x20, 0x7f, 0, -1);
    iso2022_read_test(TESTSTR("\x1b$B\x20\x7f"), 0x20, 0x7f, 0, -1);
    iso2022_read_test(TESTSTR("\x1b-A\x0e\x20\x7f"), 0xa0, 0xff, 0, -1);
    iso2022_read_test(TESTSTR("\x1b$-~\x0e\x20\x7f"), ERROR, 0, -1);
    iso2022_read_test(TESTSTR("\x1b)B\xa0\xff"), ERROR, ERROR, 0, -1);
    iso2022_read_test(TESTSTR("\x1b)I\xa0\xff"), ERROR, ERROR, 0, -1);
    iso2022_read_test(TESTSTR("\x1b$)B\xa0\xff"), ERROR, ERROR, 0, -1);
    iso2022_read_test(TESTSTR("\x1b-A\x1b~\xa0\xff"), 0xa0, 0xff, 0, -1);
    iso2022_read_test(TESTSTR("\x1b$-~\x1b~\xa0\xff"), ERROR, 0, -1);
    /* Designate control sets */
    iso2022_read_test(TESTSTR("\x1b!@"), 0x1b, '!', '@', 0, -1);
    /* Designate other coding system (UTF-8) */
    iso2022_read_test(TESTSTR("\x1b%G"
                              "\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5"),
                      0x03BA, 0x1F79, 0x03C3, 0x03BC, 0x03B5, 0, -1);
    iso2022_read_test(TESTSTR("\x1b-A\x1b%G\xCE\xBA\x1b%@\xa0"),
                      0x03BA, 0xA0, 0, -1);
    iso2022_read_test(TESTSTR("\x1b%G\xCE\x1b%@"), ERROR, 0, -1);
    iso2022_read_test(TESTSTR("\x1b%G\xCE\xBA\x1b%\x1b%@"),
                      0x03BA, 0x1B, '%', 0, -1);
    /* DOCS (COMPOUND_TEXT extended segment) */
    iso2022_read_test(TESTSTR("\x1b%/1\x80\x80"), 0, -1);
    iso2022_read_test(TESTSTR("\x1b%/1\x80\x8fiso-8859-15\2xyz\x1b(B"),
                      ERROR, ERROR, ERROR, 0, -1);
    iso2022_read_test(TESTSTR("\x1b%/1\x80\x8eiso8859-15\2xyz\x1b(B"),
                      'x', 'y', 'z', 0, -1);
    iso2022_read_test(TESTSTR("\x1b-A\x1b%/2\x80\x89"
                              "big5-0\2\xa1\x40\xa1\x40"),
                      0x3000, 0xa1, 0x40, 0, -1);
    /* Emacs Big5-in-ISO-2022 mapping */
    iso2022_read_test(TESTSTR("\x1b$(0&x86\x1b(B  \x1b$(0DeBv"),
                      0x5143, 0x6c23, ' ', ' ', 0x958b, 0x767c, 0, -1);
    /* Test from RFC 1922 (ISO-2022-CN) */
    iso2022_read_test(TESTSTR("\x1b$)A\x0e=;;;\x1b$)GG(_P\x0f"),
                      0x4EA4, 0x6362, 0x4EA4, 0x63db, 0, -1);

    printf("read tests completed\n");
    printf("total: %d errors\n", total_errs);
    return (total_errs != 0);
}
1379
1380 #endif /* TESTMODE */
1381
1382 #else /* ENUM_CHARSETS */
1383
1384 ENUM_CHARSET(CS_ISO2022)
1385
1386 #endif