mdw@git.distorted.org.uk Git - sgt/charset/blob - iso2022.c

   1 /*
   2  * iso2022.c - support for ISO/IEC 2022 (alias ECMA-35).
   3  *
   4  * This isn't a complete implementation of ISO/IEC 2022, but it's
   5  * close.  It only handles decoding, because a fully general encoder
   6  * isn't really useful.  It can decode 8-bit and 7-bit versions, with
   7  * support for single-byte and multi-byte character sets, all four
   8  * containers (G0, G1, G2, and G3), using both single-shift and
   9  * locking-shift sequences.
  10  *
  11  * The general principle is that any valid ISO/IEC 2022 sequence
  12  * should either be correctly decoded or should emit an ERROR.  The
  13  * only exception to this is that the C0 and C1 sets are fixed as
  14  * those of ISO/IEC 6429.  Escape sequences for designating control
  15  * sets are passed through, so a post-processor could fix them up if
  16  * necessary.
  17  *
  18  * DOCS to UTF-8 works.  Other DOCS sequences are ignored, which will
  19  * produce surprising results.
  20  */
  21
  22 #ifndef ENUM_CHARSETS
  23
  24 #include <assert.h>
  25 #include <string.h>
  26
  27 #include "charset.h"
  28 #include "internal.h"
  29 #include "sbcsdat.h"
  30
/*
 * Single-byte control codes that the decoder acts on directly
 * (see the C0/C1 switch in read_iso2022()).
 */
#define LS1 (0x0E)                     /* locking shift 1: G1 into GL */
#define LS0 (0x0F)                     /* locking shift 0: G0 into GL */
#define ESC (0x1B)                     /* introduces an escape sequence */
#define SS2 (0x8E)                     /* single shift 2 (8-bit form) */
#define SS3 (0x8F)                     /* single shift 3 (8-bit form) */
  36
  37 enum {S4, S6, M4, M6};
  38
  39 static long int emacs_big5_1_to_unicode(int, int);
  40 static long int emacs_big5_2_to_unicode(int, int);
  41 static int unicode_to_emacs_big5(long int, int *, int *, int *);
  42 static long int cns11643_1_to_unicode(int, int);
  43 static long int cns11643_2_to_unicode(int, int);
  44 static long int cns11643_3_to_unicode(int, int);
  45 static long int cns11643_4_to_unicode(int, int);
  46 static long int cns11643_5_to_unicode(int, int);
  47 static long int cns11643_6_to_unicode(int, int);
  48 static long int cns11643_7_to_unicode(int, int);
  49 static long int null_dbcs_to_unicode(int, int);
  50 static int unicode_to_null_dbcs(long int, int *, int *);
  51
  52 typedef int (*to_dbcs_t)(long int, int *, int *);
  53 typedef int (*to_dbcs_planar_t)(long int, int *, int *, int *);
  54
/*
 * Cast between to_dbcs_planar_t and to_dbcs_t.
 *
 * DEPLANARISE turns a to_dbcs_planar_t into a to_dbcs_t (so that it
 * can be stored in a table field of the latter type); REPLANARISE
 * turns it back into a to_dbcs_planar_t so that it can be called.
 *
 * I (SGT) originally defined these two macros as follows:

#define DEPLANARISE(x) ( (x) == (to_dbcs_planar_t)NULL, (to_dbcs_t)(x) )
#define REPLANARISE(x) ( (x) == (to_dbcs_t)NULL, (to_dbcs_planar_t)(x) )

 * When compiled with gcc, this had the effect of type-checking the
 * input, so that DEPLANARISE would cast a to_dbcs_planar_t to a
 * to_dbcs_t but cause a compile error if passed any other
 * input type, and vice versa. However, MSVC felt that this was a
 * non-constant expression and hence not legal to use in a static
 * initialiser, and probably rightly so: I haven't had a chance to
 * check with the C standard, but I'd be surprised if it _required_
 * compilers to keep an open mind long enough to discover that the
 * non-constant part of the expression has its result thrown away.
 *
 * I can't think of any other means of performing this type check
 * which doesn't have the same problem, so I'm taking the type
 * checks out, with regret.
 */
#define DEPLANARISE(x) ( (to_dbcs_t)(x) )
#define REPLANARISE(x) ( (to_dbcs_planar_t)(x) )
  79
/*
 * Values used in the `enable' field. Each of these identifies a
 * class of character sets; we then have a bitmask indicating which
 * classes are allowable in a given mode.
 *
 * These values are currently only checked on output: for input,
 * any ISO 2022 we can comprehend at all is considered acceptable.
 *
 * NOTE(review): the values look like bit numbers, i.e. a class is
 * presumably allowed when bit (1 << value) is set in a mode's
 * enable_mask.  The code that tests the mask is not in this part of
 * the file, so confirm before relying on that.
 */
#define CCS 1                          /* CTEXT standard */
#define COS 2                          /* other standard */
#define CPU 3                          /* private use */
#define CDC 4                          /* DOCS for CTEXT */
#define CDU 5                          /* DOCS for UTF-8 */
#define CNU 31                         /* never used */
  94
  95 struct iso2022_mode {
  96     int enable_mask;
  97     char ltype, li, lf, rtype, ri, rf;
  98 };
  99
/*
 * Table of every sub-charset that can be designated into G0..G3.
 *
 * The index of an entry in this array is what gets stored in the
 * 7-bit per-container fields of the decoder state, so the table must
 * never grow beyond 128 entries.
 */
const struct iso2022_subcharset {
    /*
     * type:   one of S4, S6, M4, M6.
     * i, f:   intermediate and final bytes of the designating escape
     *         sequence (i == 0 means no intermediate byte).
     * enable: class of this charset (CCS, COS, CPU, ... above).
     */
    char type, i, f, enable;
    /*
     * Added to each 7-bit input byte before it is handed to
     * sbcs_to_unicode() or from_dbcs.
     */
    int offset;
    /* Translation table for single-byte sets; null for multi-byte sets. */
    const sbcs_data *sbcs_base;
    /* Decoder for multi-byte sets, called with two offset-adjusted bytes. */
    long int (*from_dbcs)(int, int);

    /*
     * If to_dbcs_plane < 0, then to_dbcs is used as expected.
     * However, if to_dbcs_plane >= 0, then to_dbcs is expected to
     * be cast to a to_dbcs_planar_t before use, and the returned
     * plane value (the first int *) must equal to_dbcs_plane.
     *
     * I'd have preferred to do this by means of a union, but you
     * can't initialise a selected field of a union at compile
     * time. Function pointer casts are guaranteed to work sensibly
     * in ISO C (that is, it's undefined what happens if you call a
     * function via the wrong type of pointer, but if you cast it
     * back to the right type before calling it then it must work),
     * so this is safe if ugly.
     */
    to_dbcs_t to_dbcs;
    int to_dbcs_plane;                 /* use to_dbcs_planar iff >= 0 */
} iso2022_subcharsets[] = {
    /*
     * We list these subcharsets in preference order for output.
     * Since the best-defined use of ISO 2022 output is compound
     * text, we'll use a preference order which matches that. So we
     * begin with the charsets defined in the compound text spec.
     */
    { S4, 0, 'B', CCS, 0x00, &sbcsdata_CS_ASCII },
    { S6, 0, 'A', CCS, 0x80, &sbcsdata_CS_ISO8859_1 },
    { S6, 0, 'B', CCS, 0x80, &sbcsdata_CS_ISO8859_2 },
    { S6, 0, 'C', CCS, 0x80, &sbcsdata_CS_ISO8859_3 },
    { S6, 0, 'D', CCS, 0x80, &sbcsdata_CS_ISO8859_4 },
    { S6, 0, 'F', CCS, 0x80, &sbcsdata_CS_ISO8859_7 },
    { S6, 0, 'G', CCS, 0x80, &sbcsdata_CS_ISO8859_6 },
    { S6, 0, 'H', CCS, 0x80, &sbcsdata_CS_ISO8859_8 },
    { S6, 0, 'L', CCS, 0x80, &sbcsdata_CS_ISO8859_5 },
    { S6, 0, 'M', CCS, 0x80, &sbcsdata_CS_ISO8859_9 },
    { S4, 0, 'I', CCS, 0x80, &sbcsdata_CS_JISX0201 },
    { S4, 0, 'J', CCS, 0x00, &sbcsdata_CS_JISX0201 },
    { M4, 0, 'A', CCS, -0x21, 0, &gb2312_to_unicode, &unicode_to_gb2312, -1 },
    { M4, 0, 'B', CCS, -0x21, 0, &jisx0208_to_unicode, &unicode_to_jisx0208, -1 },
    { M4, 0, 'C', CCS, -0x21, 0, &ksx1001_to_unicode, &unicode_to_ksx1001, -1 },
    { M4, 0, 'D', CCS, -0x21, 0, &jisx0212_to_unicode, &unicode_to_jisx0212, -1 },

    /*
     * Next, other reasonably standard things: the rest of the ISO
     * 8859 sets, UK-ASCII, and CNS 11643.
     */
    { S6, 0, 'T', COS, 0x80, &sbcsdata_CS_ISO8859_11 },
    { S6, 0, 'V', COS, 0x80, &sbcsdata_CS_ISO8859_10 },
    { S6, 0, 'Y', COS, 0x80, &sbcsdata_CS_ISO8859_13 },
    { S6, 0, '_', COS, 0x80, &sbcsdata_CS_ISO8859_14 },
    { S6, 0, 'b', COS, 0x80, &sbcsdata_CS_ISO8859_15 },
    { S6, 0, 'f', COS, 0x80, &sbcsdata_CS_ISO8859_16 },
    { S4, 0, 'A', COS, 0x00, &sbcsdata_CS_BS4730 },
    { M4, 0, 'G', COS, -0x21, 0, &cns11643_1_to_unicode, DEPLANARISE(&unicode_to_cns11643), 0 },
    { M4, 0, 'H', COS, -0x21, 0, &cns11643_2_to_unicode, DEPLANARISE(&unicode_to_cns11643), 1 },
    { M4, 0, 'I', COS, -0x21, 0, &cns11643_3_to_unicode, DEPLANARISE(&unicode_to_cns11643), 2 },
    { M4, 0, 'J', COS, -0x21, 0, &cns11643_4_to_unicode, DEPLANARISE(&unicode_to_cns11643), 3 },
    { M4, 0, 'K', COS, -0x21, 0, &cns11643_5_to_unicode, DEPLANARISE(&unicode_to_cns11643), 4 },
    { M4, 0, 'L', COS, -0x21, 0, &cns11643_6_to_unicode, DEPLANARISE(&unicode_to_cns11643), 5 },
    { M4, 0, 'M', COS, -0x21, 0, &cns11643_7_to_unicode, DEPLANARISE(&unicode_to_cns11643), 6 },

    /*
     * Private-use designations: DEC private sets and Emacs's Big5
     * abomination.
     */
    { S4, 0, '0', CPU, 0x00, &sbcsdata_CS_DEC_GRAPHICS },
    { S4, 0, '<', CPU, 0x80, &sbcsdata_CS_DEC_MCS },
    { M4, 0, '0', CPU, -0x21, 0, &emacs_big5_1_to_unicode, DEPLANARISE(&unicode_to_emacs_big5), 1 },
    { M4, 0, '1', CPU, -0x21, 0, &emacs_big5_2_to_unicode, DEPLANARISE(&unicode_to_emacs_big5), 2 },

    /*
     * Ben left this conditioned out without explanation,
     * presumably on the grounds that we don't have a translation
     * table for it.
     */
#if 0
    { M4, 0, '@', CNU }, /* JIS C 6226-1978 */
#endif

    /*
     * Finally, fallback entries for null character sets.
     * designate() falls back to the '~' entry of the requested type
     * when it cannot find a match, so there must be one of these
     * for each of S4, S6, M4 and M6.
     */
    { S4, 0, '~', CNU },
    { S6, 0, '~', CNU }, /* empty 96-set */
    { M4, 0, '~', CNU, 0, 0, &null_dbcs_to_unicode, &unicode_to_null_dbcs, -1 }, /* empty 94^n-set */
    { M6, 0, '~', CNU, 0, 0, &null_dbcs_to_unicode, &unicode_to_null_dbcs, -1 }, /* empty 96^n-set */
};
 191
 192 static long int null_dbcs_to_unicode(int r, int c)
 193 {
 194     UNUSEDARG(r);
 195     UNUSEDARG(c);
 196     return ERROR;
 197 }
 198 static int unicode_to_null_dbcs(long int unicode, int *r, int *c)
 199 {
 200     UNUSEDARG(unicode);
 201     UNUSEDARG(r);
 202     UNUSEDARG(c);
 203     return 0;                          /* failed to convert anything */
 204 }
 205
 206 /*
 207  * Emacs encodes Big5 in COMPOUND_TEXT as two 94x94 character sets.
 208  * We treat Big5 as a 94x191 character set with a bunch of undefined
 209  * columns in the middle, so we have to mess around a bit to make
 210  * things fit.
 211  */
 212
 213 static long int emacs_big5_1_to_unicode(int r, int c)
 214 {
 215     unsigned long s;
 216     s = r * 94 + c;
 217     r = s / 157;
 218     c = s % 157;
 219     if (c >= 64) c += 34; /* Skip over the gap */
 220     return big5_to_unicode(r, c);
 221 }
 222
 223 static long int emacs_big5_2_to_unicode(int r, int c)
 224 {
 225     unsigned long s;
 226     s = r * 94 + c;
 227     r = s / 157 + 40;
 228     c = s % 157;
 229     if (c >= 64) c += 34; /* Skip over the gap */
 230     return big5_to_unicode(r, c);
 231 }
 232
 233 static int unicode_to_emacs_big5(long int unicode, int *p, int *r, int *c)
 234 {
 235     int rr, cc, s;
 236     if (!unicode_to_big5(unicode, &rr, &cc))
 237         return 0;
 238     if (cc >= 64) {
 239         cc -= 34;
 240         assert(cc >= 64);
 241     }
 242     s = rr * 157 + cc;
 243     if (s >= 40*157) {
 244         *p = 2;
 245         s -= 40*157;
 246     } else {
 247         *p = 1;
 248     }
 249     *r = s / 94;
 250     *c = s % 94;
 251     return 1;
 252 }
 253
 254 /* Wrappers for cns11643_to_unicode() */
 255 static long int cns11643_1_to_unicode(int r, int c)
 256 {
 257     return cns11643_to_unicode(0, r, c);
 258 }
 259 static long int cns11643_2_to_unicode(int r, int c)
 260 {
 261     return cns11643_to_unicode(1, r, c);
 262 }
 263 static long int cns11643_3_to_unicode(int r, int c)
 264 {
 265     return cns11643_to_unicode(2, r, c);
 266 }
 267 static long int cns11643_4_to_unicode(int r, int c)
 268 {
 269     return cns11643_to_unicode(3, r, c);
 270 }
 271 static long int cns11643_5_to_unicode(int r, int c)
 272 {
 273     return cns11643_to_unicode(4, r, c);
 274 }
 275 static long int cns11643_6_to_unicode(int r, int c)
 276 {
 277     return cns11643_to_unicode(5, r, c);
 278 }
 279 static long int cns11643_7_to_unicode(int r, int c)
 280 {
 281     return cns11643_to_unicode(6, r, c);
 282 }
 283
 284 /* States, or "what we're currently accumulating". */
 285 enum {
 286     IDLE,       /* None of the below */
 287     SS2CHAR,    /* Accumulating a character after SS2 */
 288     SS3CHAR,    /* Accumulating a character after SS3 */
 289     ESCSEQ,     /* Accumulating an escape sequence */
 290     ESCDROP,    /* Discarding an escape sequence */
 291     ESCPASS,    /* Passing through an escape sequence */
 292     DOCSUTF8,   /* DOCSed into UTF-8 */
 293     DOCSCTEXT   /* DOCSed into a COMPOUND_TEXT extended segment */
 294 };
 295
 296 #if 0
 297 #include <stdio.h>
 298 static void dump_state(charset_state *s)
 299 {
 300     unsigned s0 = s->s0, s1 = s->s1;
 301     char const * const modes[] = { "IDLE", "SS2CHAR", "SS3CHAR",
 302                                    "ESCSEQ", "ESCDROP", "ESCPASS",
 303                                    "DOCSUTF8" };
 304
 305     fprintf(stderr, "s0: %s", modes[s0 >> 29]);
 306     fprintf(stderr, " %02x %02x %02x   ", (s0 >> 16) & 0xff, (s0 >> 8) & 0xff,
 307             s0 & 0xff);
 308     fprintf(stderr, "s1: LS%d LS%dR", (s1 >> 30) & 3, (s1 >> 28) & 3);
 309     fprintf(stderr, " %d %d %d %d\n", s1 & 0x7f, (s1 >> 7) & 0x7f,
 310             (s1 >> 14) & 0x7f, (s1 >> 21) & 0x7f);
 311 }
 312 #endif
 313
 314 static void designate(charset_state *state, int container,
 315                       int type, int ibyte, int fbyte)
 316 {
 317     unsigned long i;
 318
 319     assert(container >= 0 && container <= 3);
 320     assert(type == S4 || type == S6 || type == M4 || type == M6);
 321
 322     for (i = 0; i < lenof(iso2022_subcharsets); i++) {
 323         if (iso2022_subcharsets[i].type == type &&
 324             iso2022_subcharsets[i].i == ibyte &&
 325             iso2022_subcharsets[i].f == fbyte) {
 326             state->s1 &= ~(0x7fL << (container * 7));
 327             state->s1 |= (i << (container * 7));
 328             return;
 329         }
 330     }
 331     /*
 332      * If we don't find the charset, invoke the empty one, so we
 333      * output ERROR rather than garbage.
 334      */
 335     designate(state, container, type, 0, '~');
 336 }
 337
 338 static void do_utf8(long int input_chr,
 339                     charset_state *state,
 340                     void (*emit)(void *ctx, long int output),
 341                     void *emitctx)
 342 {
 343     charset_state ustate;
 344
 345     ustate.s1 = 0;
 346     ustate.s0 = state->s0 & 0x03ffffffL;
 347     read_utf8(NULL, input_chr, &ustate, emit, emitctx);
 348     state->s0 = (state->s0 & ~0x03ffffffL) | (ustate.s0 & 0x03ffffffL);
 349 }
 350
 351 static void docs_utf8(long int input_chr,
 352                       charset_state *state,
 353                       void (*emit)(void *ctx, long int output),
 354                       void *emitctx)
 355 {
 356     int retstate;
 357
 358     /*
 359      * Bits [25:0] of s0 are reserved for read_utf8().
 360      * Bits [27:26] are a tiny state machine to recognise ESC % @.
 361      */
 362     retstate = (state->s0 & 0x0c000000L) >> 26;
 363     if (retstate == 1 && input_chr == '%')
 364         retstate = 2;
 365     else if (retstate == 2 && input_chr == '@') {
 366         /* If we've got a partial UTF-8 sequence, complain. */
 367         if (state->s0 & 0x03ffffffL)
 368             emit(emitctx, ERROR);
 369         state->s0 = 0;
 370         return;
 371     } else {
 372         if (retstate >= 1) do_utf8(ESC, state, emit, emitctx);
 373         if (retstate >= 2) do_utf8('%', state, emit, emitctx);
 374         retstate = 0;
 375         if (input_chr == ESC)
 376             retstate = 1;
 377         else {
 378             do_utf8(input_chr, state, emit, emitctx);
 379         }
 380     }
 381     state->s0 = (state->s0 & ~0x0c000000L) | (retstate << 26);
 382 }
 383
 384 struct ctext_encoding {
 385     char const *name;
 386     char octets_per_char, enable;
 387     charset_spec const *subcs;
 388 };
 389
 390 /*
 391  * In theory, this list is in <ftp://ftp.x.org/pub/DOCS/registry>,
 392  * but XLib appears to have its own ideas, and encodes these three
 393  * (as of X11R6.8.2)
 394  */
 395
/* Charsets that do the real work for the extended segments below. */
extern charset_spec const charset_CS_ISO8859_14;
extern charset_spec const charset_CS_ISO8859_15;
extern charset_spec const charset_CS_BIG5;

/*
 * Each name includes its terminating STX ("\2"), so that the matcher
 * in docs_ctext() recognises the end of the name by running off the
 * end of the string.
 *
 * NOTE(review): docs_ctext() walks this array forwards comparing one
 * byte at a time, and stores the index in a two-bit field, so the
 * entries appear to need to stay sorted by name and the array to stay
 * at three entries or fewer -- confirm before adding to it.
 */
static struct ctext_encoding const ctext_encodings[] = {
    { "big5-0\2", 0 /* variable */, CDC, &charset_CS_BIG5 },
    { "iso8859-14\2", 1, CDC, &charset_CS_ISO8859_14 },
    { "iso8859-15\2", 1, CDC, &charset_CS_ISO8859_15 }
};
 405
/*
 * Handle one input byte while inside a COMPOUND_TEXT extended
 * segment.
 *
 * The segment consists of two length bytes (seven significant bits
 * each, most significant first), then an encoding name terminated by
 * STX, then data in that encoding.  The length counts everything
 * after the two length bytes.  Data in a recognised encoding is
 * passed to that charset's read function; each data byte in an
 * unrecognised one produces ERROR.  When the length runs out, s0 is
 * cleared.
 */
static void docs_ctext(long int input_chr,
                       charset_state *state,
                       void (*emit)(void *ctx, long int output),
                       void *emitctx)
{
    /*
     * s0[27:26] = first entry in ctext_encodings that matches
     * s0[25:22] = number of characters successfully matched, 0xf if all
     * s0[21:8] count the number of octets left in the segment
     * s0[7:0] are for sub-charset use
     *
     * In the matched-characters field, 0xe means the name was not
     * recognised and we are skipping ahead to its STX.  Before the
     * length is known, s0[7:0] temporarily holds the first length
     * byte.
     */
    int n = (state->s0 >> 22) & 0xf, i = (state->s0 >> 26) & 3, oi = i, j;
    int length = (state->s0 >> 8) & 0x3fff;

    /*
     * Note that we do not bother checking the octets-per-character
     * byte against the selected charset when reading. It's
     * extremely unlikely that this code will ever have to deal
     * with two charset identifiers with the same name and
     * different octets-per-character values! If it ever happens,
     * we'll have to edit this file anyway so we can modify the
     * code then...
     */

    if (!length) {
        /* Haven't read length yet */
        if ((state->s0 & 0xff) == 0)
            /* ... or even the first byte */
            state->s0 |= input_chr;
        else {
            length = (state->s0 & 0x7f) * 0x80 + (input_chr & 0x7f);
            if (length == 0)
                /* Empty segment: nothing more to read. */
                state->s0 = 0;
            else
                /* Keep the top four bits; start name matching afresh. */
                state->s0 = (state->s0 & 0xf0000000) | (length << 8);
        }
        return;
    }

    j = i;
    if (n == 0xe) {
        /* Skipping unknown encoding.  Look out for STX. */
        if (input_chr == 2)
            state->s0 = (state->s0 & 0xf0000000) | (i << 26) | (0xf << 22);
    } else if (n != 0xf) {
        /*
         * Still reading the name.  Among the entries sharing the n
         * bytes matched so far, step forward past those whose next
         * byte is smaller than the one we have just been given.
         */
        while ((unsigned)j < lenof(ctext_encodings) &&
               !memcmp(ctext_encodings[j].name,
                       ctext_encodings[oi].name, n)) {
            if (ctext_encodings[j].name[n] < input_chr)
                i = ++j;
            else
                break;
        }
        if ((unsigned)i >= lenof(ctext_encodings) ||
            memcmp(ctext_encodings[i].name,
                   ctext_encodings[oi].name, n) ||
            ctext_encodings[i].name[n] != input_chr) {
            /* Doom!  We haven't heard of this encoding */
            i = lenof(ctext_encodings);
            n = 0xe;
        } else {
            /*
             * Otherwise, we have found an additional character in our
             * encoding name. See if we have reached the _end_ of our
             * name.
             */
            n++;
            if (!ctext_encodings[i].name[n])
                n = 0xf;
        }
        /*
         * Failing _that_, we simply update our encoding-name-
         * tracking state.
         */
        assert(i < 4 && n < 16);
        state->s0 = (state->s0 & 0xf0000000) | (i << 26) | (n << 22);
    } else {
        /* Name finished: this byte is segment data. */
        if ((unsigned)i >= lenof(ctext_encodings))
            emit(emitctx, ERROR);
        else {
            /*
             * Run the sub-charset's decoder on a scratch state,
             * carrying its s0 across calls in our bottom eight bits.
             */
            charset_state substate;
            charset_spec const *subcs = ctext_encodings[i].subcs;
            substate.s1 = 0;
            substate.s0 = state->s0 & 0xff;
            subcs->read(subcs, input_chr, &substate, emit, emitctx);
            state->s0 = (state->s0 & ~0xff) | (substate.s0 & 0xff);
        }
    }
    /* One more octet of the segment used up; leave when it's all gone. */
    if (!--length)
        state->s0 = 0;
    else
        state->s0 = (state->s0 &~0x003fff00) | (length << 8);
}
 499
 500 static void read_iso2022(charset_spec const *charset, long int input_chr,
 501                          charset_state *state,
 502                          void (*emit)(void *ctx, long int output),
 503                          void *emitctx)
 504 {
 505     struct iso2022_mode const *mode = (struct iso2022_mode *)charset->data;
 506
 507     /* dump_state(state); */
 508     /*
 509      * We have to make fairly efficient use of the 64 bits of state
 510      * available to us.  Long-term state goes in s1, and consists of
 511      * the identities of the character sets designated as G0/G1/G2/G3
 512      * and the locking-shift states for GL and GR.  Short-term state
 513      * goes in s0: The bottom half of s0 accumulates characters for an
 514      * escape sequence or a multi-byte character, while the top three
 515      * bits indicate what they're being accumulated for.  After DOCS,
 516      * the bottom 29 bits of state are available for the DOCS function
 517      * to use -- the UTF-8 one uses the bottom 26 for UTF-8 decoding
 518      * and the top two to recognised ESC % @.
 519      *
 520      * s0[31:29] = state enum
 521      * s0[24:0] = accumulated bytes
 522      * s1[31:30] = GL locking-shift state
 523      * s1[29:28] = GR locking-shift state
 524      * s1[27:21] = G3 charset
 525      * s1[20:14] = G2 charset
 526      * s1[13:7] = G1 charset
 527      * s1[6:0] = G0 charset
 528      */
 529
 530 #define LEFT 30
 531 #define RIGHT 28
 532 #define LOCKING_SHIFT(n,side) \
 533         (state->s1 = (state->s1 & ~(3L<<(side))) | ((n ## L)<<(side)))
 534 #define MODE ((state->s0 & 0xe0000000L) >> 29)
 535 #define ENTER_MODE(m) (state->s0 = (state->s0 & ~0xe0000000L) | ((m)<<29))
 536 #define SINGLE_SHIFT(n) ENTER_MODE(SS2CHAR - 2 + (n))
 537 #define ASSERT_IDLE do {                                                \
 538         if (state->s0 != 0) emit(emitctx, ERROR);                       \
 539         state->s0 = 0;                                                  \
 540 } while (0)
 541
 542     if (state->s1 == 0) {
 543         /*
 544          * Since there's no LS0R, this means we must just have started.
 545          * Set up a sane initial state (LS0, LS1R, ASCII in G0/G1/G2/G3).
 546          */
 547         LOCKING_SHIFT(0, LEFT);
 548         LOCKING_SHIFT(1, RIGHT);
 549         designate(state, 0, mode->ltype, mode->li, mode->lf);
 550         designate(state, 1, mode->rtype, mode->ri, mode->rf);
 551         designate(state, 2, S4, 0, 'B');
 552         designate(state, 3, S4, 0, 'B');
 553     }
 554
 555     if (MODE == DOCSUTF8) {
 556         docs_utf8(input_chr, state, emit, emitctx);
 557         return;
 558     }
 559     if (MODE == DOCSCTEXT) {
 560         docs_ctext(input_chr, state, emit, emitctx);
 561         return;
 562     }
 563
 564     if ((input_chr & 0x60) == 0x00) {
 565         /* C0 or C1 control */
 566         ASSERT_IDLE;
 567         switch (input_chr) {
 568           case ESC:
 569             ENTER_MODE(ESCSEQ);
 570             break;
 571           case LS0:
 572             LOCKING_SHIFT(0, LEFT);
 573             break;
 574           case LS1:
 575             LOCKING_SHIFT(1, LEFT);
 576             break;
 577           case SS2:
 578             SINGLE_SHIFT(2);
 579             break;
 580           case SS3:
 581             SINGLE_SHIFT(3);
 582             break;
 583           default:
 584             emit(emitctx, input_chr);
 585             break;
 586         }
 587     } else if ((input_chr & 0x80) || MODE < ESCSEQ) {
 588         int is_gl = 0;
 589         struct iso2022_subcharset const *subcs;
 590         unsigned container;
 591         long input_7bit;
 592         /*
 593          * Actual data.
 594          * Force idle state if we're in mid escape sequence, or in a
 595          * multi-byte character with a different top bit.
 596          */
 597         if (MODE >= ESCSEQ ||
 598             ((state->s0 & 0x00ff0000L) != 0 &&
 599              (((state->s0 >> 16) ^ input_chr) & 0x80)))
 600             ASSERT_IDLE;
 601         if (MODE == SS2CHAR || MODE == SS3CHAR) /* Single-shift */
 602             container = MODE - SS2CHAR + 2;
 603         else if (input_chr >= 0x80) /* GR */
 604             container = (state->s1 >> 28) & 3;
 605         else { /* GL */
 606             container = state->s1 >> 30;
 607             is_gl = 1;
 608         }
 609         input_7bit = input_chr & ~0x80;
 610         subcs = &iso2022_subcharsets[(state->s1 >> (container * 7)) & 0x7f];
 611         if ((subcs->type == S4 || subcs->type == M4) &&
 612             (input_7bit == 0x20 || input_7bit == 0x7f)) {
 613             /* characters not in 94-char set */
 614             if (is_gl) emit(emitctx, input_7bit);
 615             else emit(emitctx, ERROR);
 616         } else if (subcs->type == M4 || subcs->type == M6) {
 617             if ((state->s0 & 0x00ff0000L) == 0) {
 618                 state->s0 |= input_chr << 16;
 619                 return;
 620             } else {
 621                 emit(emitctx,
 622                      subcs->from_dbcs(((state->s0 >> 16) & 0x7f) +
 623                                       subcs->offset,
 624                                       input_7bit + subcs->offset));
 625             }
 626         } else {
 627             if ((state->s0 & 0x00ff0000L) != 0)
 628                 emit(emitctx, ERROR);
 629             emit(emitctx, subcs->sbcs_base ?
 630                  sbcs_to_unicode(subcs->sbcs_base, input_7bit + subcs->offset):
 631                  ERROR);
 632         }
 633         state->s0 = 0;
 634     } else {
 635         unsigned i1, i2;
 636         if (MODE == ESCPASS) {
 637             emit(emitctx, input_chr);
 638             if ((input_chr & 0xf0) != 0x20)
 639                 ENTER_MODE(IDLE);
 640             return;
 641         }
 642
 643         /*
 644          * Intermediate bytes shall be any of the 16 positions of
 645          * column 02 of the code table; they are denoted by the symbol
 646          * I.
 647          */
 648         if ((input_chr & 0xf0) == 0x20) {
 649             if (((state->s0 >> 16) & 0xff) == 0)
 650                 state->s0 |= input_chr << 16;
 651             else if (((state->s0 >> 8) & 0xff) == 0)
 652                 state->s0 |= input_chr << 8;
 653             else {
 654                 /* Long escape sequence.  Switch to ESCPASS or ESCDROP. */
 655                 i1 = (state->s0 >> 16) & 0xff;
 656                 i2 = (state->s0 >> 8) & 0xff;
 657                 switch (i1) {
 658                   case '(': case ')': case '*': case '+':
 659                   case '-': case '.': case '/':
 660                   case '$':
 661                     ENTER_MODE(ESCDROP);
 662                     break;
 663                   default:
 664                     emit(emitctx, ESC);
 665                     emit(emitctx, i1);
 666                     emit(emitctx, i2);
 667                     emit(emitctx, input_chr);
 668                     state->s0 = 0;
 669                     ENTER_MODE(ESCPASS);
 670                     break;
 671                 }
 672             }
 673             return;
 674         }
 675
 676         /*
 677          * Final bytes shall be any of the 79 positions of columns 03
 678          * to 07 of the code table excluding position 07/15; they are
 679          * denoted by the symbol F.
 680          */
 681         i1 = (state->s0 >> 16) & 0xff;
 682         i2 = (state->s0 >> 8) & 0xff;
 683         if (MODE == ESCDROP)
 684             input_chr = 0; /* Make sure it won't match. */
 685         state->s0 = 0;
 686         switch (i1) {
 687           case 0: /* No intermediate bytes */
 688             switch (input_chr) {
 689               case 'N': /* SS2 */
 690                 SINGLE_SHIFT(2);
 691                 break;
 692               case 'O': /* SS3 */
 693                 SINGLE_SHIFT(3);
 694                 break;
 695               case 'n': /* LS2 */
 696                 LOCKING_SHIFT(2, LEFT);
 697                 break;
 698               case 'o': /* LS3 */
 699                 LOCKING_SHIFT(3, LEFT);
 700                 break;
 701               case '|': /* LS3R */
 702                 LOCKING_SHIFT(3, RIGHT);
 703                 break;
 704               case '}': /* LS2R */
 705                 LOCKING_SHIFT(2, RIGHT);
 706                 break;
 707               case '~': /* LS1R */
 708                 LOCKING_SHIFT(1, RIGHT);
 709                 break;
 710               default:
 711                 /* Unsupported escape sequence.  Spit it back out. */
 712                 emit(emitctx, ESC);
 713                 emit(emitctx, input_chr);
 714             }
 715             break;
 716           case ' ': /* ACS */
 717             /*
 718              * Various coding structure facilities specify that designating
 719              * a code element also invokes it.  As far as I can see, invoking
 720              * it now will have the same practical effect, since those
 721              * facilities also ban the use of locking shifts.
 722              */
 723             switch (input_chr) {
 724               case 'A': /* G0 element used and invoked into GL */
 725                 LOCKING_SHIFT(0, LEFT);
 726                 break;
 727               case 'C': /* G0 in GL, G1 in GR */
 728               case 'D': /* Ditto, at least for 8-bit codes */
 729               case 'L': /* ISO 4873 (ECMA-43) level 1 */
 730               case 'M': /* ISO 4873 (ECMA-43) level 2 */
 731                 LOCKING_SHIFT(0, LEFT);
 732                 LOCKING_SHIFT(1, RIGHT);
 733                 break;
 734             }
 735             break;
 736           case '&': /* IRR */
 737             /*
 738              * IRR (Identify Revised Registration) is ignored here,
 739              * since any revised registration must be
 740              * upward-compatible with the old one, so either we'll
 741              * support the new one or we'll emit ERROR when we run
 742              * into a new character.  In either case, there's nothing
 743              * to be done here.
 744              */
 745             break;
 746           case '(': /* GZD4 */  case ')': /* G1D4 */
 747           case '*': /* G2D4 */  case '+': /* G3D4 */
 748             designate(state, i1 - '(', S4, i2, input_chr);
 749             break;
 750           case '-': /* G1D6 */  case '.': /* G2D6 */  case '/': /* G3D6 */
 751             designate(state, i1 - ',', S6, i2, input_chr);
 752             break;
 753           case '$': /* G?DM? */
 754             switch (i2) {
 755               case 0: /* Obsolete version of GZDM4 */
 756                 i2 = '(';
 757               case '(': /* GZDM4 */  case ')': /* G1DM4 */
 758               case '*': /* G2DM4 */  case '+': /* G3DM4 */
 759                 designate(state, i2 - '(', M4, 0, input_chr);
 760                 break;
 761               case '-': /* G1DM6 */
 762               case '.': /* G2DM6 */  case '/': /* G3DM6 */
 763                 designate(state, i2 - ',', M6, 0, input_chr);
 764                 break;
 765               default:
 766                 emit(emitctx, ERROR);
 767                 break;
 768             }
 769           case '%': /* DOCS */
 770             /* XXX What's a reasonable way to handle an unrecognised DOCS? */
 771             switch (i2) {
 772               case 0:
 773                 switch (input_chr) {
 774                   case 'G':
 775                     ENTER_MODE(DOCSUTF8);
 776                     break;
 777                 }
 778                 break;
 779               case '/':
 780                 switch (input_chr) {
 781                   case '1': case '2':
 782                     ENTER_MODE(DOCSCTEXT);
 783                     break;
 784                 }
 785                 break;
 786             }
 787             break;
 788           default:
 789             /* Unsupported nF escape sequence.  Re-emit it. */
 790             emit(emitctx, ESC);
 791             emit(emitctx, i1);
 792             if (i2) emit(emitctx, i2);
 793             emit(emitctx, input_chr);
 794             break;
 795         }
 796     }
 797 }
 798
 799 static void oselect(charset_state *state, int i, int right,
 800                     void (*emit)(void *ctx, long int output),
 801                     void *emitctx)
 802 {
 803     int shift = (right ? 31-7 : 31-7-7);
 804     struct iso2022_subcharset const *subcs = &iso2022_subcharsets[i];
 805
 806     if (((state->s1 >> shift) & 0x7F) != (unsigned)i) {
 807         state->s1 &= ~(0x7FL << shift);
 808         state->s1 |= (i << shift);
 809
 810         if (emit) {
 811             emit(emitctx, ESC);
 812             if (subcs->type == M4 || subcs->type == M6)
 813                 emit(emitctx, '$');
 814             if (subcs->type == S6 || subcs->type == M6) {
 815                 assert(right);
 816                 emit(emitctx, '-');
 817             } else if (right) {
 818                 emit(emitctx, ')');
 819             } else {
 820                 emit(emitctx, '(');
 821             }
 822             if (subcs->i)
 823                 emit(emitctx, subcs->i);
 824             emit(emitctx, subcs->f);
 825         }
 826     }
 827 }
 828
 829 static void docs_char(charset_state *state,
 830                       void (*emit)(void *ctx, long int output),
 831                       void *emitctx, int cset, char *data, int datalen)
 832 {
 833     int curr_cset, currlen, i;
 834
 835     /*
 836      * cset is the index into ctext_encodings[]. It can also be -1
 837      * to mean DOCS UTF-8, or -2 to mean no DOCS (ordinary 2022).
 838      * In the latter case, `chr' is ignored.
 839      */
 840
 841     /*
 842      * First, terminate a DOCS segment if necessary. We always have
 843      * to terminate a DOCS segment if one is active and we're about
 844      * to switch to a different one; we might also have to
 845      * terminate a length-encoded DOCS segment if we've run out of
 846      * storage space to accumulate characters in it.
 847      */
 848     curr_cset = ((state->s1 >> 14) & 7) - 2;
 849     currlen = ((state->s1 >> 11) & 7);
 850     if ((curr_cset != -2 && curr_cset != cset) ||
 851         (curr_cset >= 0 && currlen + datalen > 5)) {
 852         if (curr_cset == -1) {
 853             /*
 854              * Terminating DOCS UTF-8 is easy.
 855              */
 856             emit(emitctx, ESC);
 857             emit(emitctx, '%');
 858             emit(emitctx, '@');
 859         } else {
 860             int len;
 861
 862             /*
 863              * To terminate a length-encoded DOCS segment we must
 864              * actually output the whole thing.
 865              */
 866             emit(emitctx, ESC);
 867             emit(emitctx, '%');
 868             emit(emitctx, '/');
 869             emit(emitctx, '0' + ctext_encodings[curr_cset].octets_per_char);
 870             len = currlen + datalen +
 871                 strlen(ctext_encodings[curr_cset].name);
 872             assert(len < (1 << 14));
 873             emit(emitctx, 0x80 | ((len >> 7) & 0x7F));
 874             emit(emitctx, 0x80 | ((len     ) & 0x7F));
 875             /* The name stored in ctext_encodings[] includes the trailing \2 */
 876             for (i = 0; ctext_encodings[curr_cset].name[i]; i++)
 877                 emit(emitctx, ctext_encodings[curr_cset].name[i]);
 878             for (i = 0; i < currlen; i++)
 879                 emit(emitctx,
 880                      (i == 0 ? state->s1 : state->s0 >> (8*(4-i))) & 0xFF);
 881             for (i = 0; i < datalen; i++)
 882                 emit(emitctx, data[i]);
 883
 884             /*
 885              * We've now dealt with the input data, so clear it so
 886              * we don't try to do so again below.
 887              */
 888             datalen = 0;
 889         }
 890         curr_cset = -2;
 891     }
 892
 893     /*
 894      * Now, start a DOCS segment if necessary.
 895      */
 896     if (curr_cset != cset) {
 897         assert(cset != -2);
 898         if (cset == -1) {
 899             /*
 900              * Start DOCS UTF-8.
 901              */
 902             emit(emitctx, ESC);
 903             emit(emitctx, '%');
 904             emit(emitctx, 'G');
 905         } else {
 906             /*
 907              * Starting a length-encoded DOCS segment is simply a
 908              * matter of setting our stored length counter to zero.
 909              */
 910             currlen = 0;
 911             state->s1 &= ~(7 << 11);
 912             state->s1 &= ~0xFF;
 913             state->s0 = 0;
 914         }
 915     }
 916     state->s1 &= ~(7 << 14);
 917     assert((cset+2) >= 0 && (cset+2) < 8);
 918     state->s1 |= ((cset+2) << 14);
 919
 920     /*
 921      * Now we're in the right DOCS state. Actually deal with the
 922      * input data, if we haven't already done so above.
 923      */
 924     if (datalen > 0) {
 925         assert(cset != 2);
 926         if (cset == -1) {
 927             /*
 928              * In DOCS UTF-8, we output data as soon as we get it.
 929              */
 930             for (i = 0; i < datalen; i++)
 931                 emit(emitctx, data[i]);
 932         } else {
 933             /*
 934              * In length-encoded DOCS, we just store our data and
 935              * bide our time. It'll all be output when we fill up
 936              * or switch to another character set.
 937              */
 938             assert(currlen + datalen <= 5);   /* overflow handled already */
 939             for (i = 0; i < datalen; i++) {
 940                 if (currlen + i == 0)
 941                     state->s1 |= data[i] & 0xFF;
 942                 else
 943                     state->s0 |= (data[i] & 0xFF) << (8*(4-(currlen+i)));
 944             }
 945             currlen += datalen;
 946             assert(currlen >= 0 && currlen < 8);
 947             state->s1 &= ~(7 << 11);
 948             state->s1 |= (currlen << 11);
 949         }
 950     }
 951 }
 952
 953 static void write_to_pointer(void *ctx, long int output)
 954 {
 955     char **ptr = (char **)ctx;
 956     *(*ptr)++ = output;
 957 }
 958
 959 /*
 960  * Writing full ISO-2022 is not useful in very many circumstances.
 961  * One of the few situations in which it _is_ useful is generating
 962  * X11 COMPOUND_TEXT; therefore, this writing function will obey
 963  * the compound text restrictions and hence output the subset of
 964  * ISO-2022 that's usable in that context.
 965  *
 966  * The subset in question is roughly that we use GL/GR for G0/G1
 967  * always, and that the _only_ escape sequences we output (other
 968  * than the occasional DOCS) are those which designate different
 969  * subcharsets into G0 and G1. There are additional constraints
 970  * about which things go in which container; see below.
 971  *
 972  * FIXME: this wants some decent tests to be written, and also the
 973  * exact output policy for compound text wants thinking about more
 974  * carefully.
 975  */
 976 static int write_iso2022(charset_spec const *charset, long int input_chr,
 977                          charset_state *state,
 978                          void (*emit)(void *ctx, long int output),
 979                          void *emitctx)
 980 {
 981     int i;
 982     struct iso2022_subcharset const *subcs;
 983     struct iso2022_mode const *mode = (struct iso2022_mode *)charset->data;
 984     to_dbcs_planar_t last_planar_dbcs = NULL;
 985     int last_p, last_r, last_c;
 986     long int c1, c2;
 987
 988     /*
 989      * For output, I allocate the state variables as follows:
 990      *
 991      *  s1[31] == 1 if output state has been initialised
 992      *  s1[30:24] == G1 charset (always in GR)
 993      *  s1[23:17] == G0 charset (always in GL)
 994      *  s1[16:14] == DOCS index plus 2 (because -1 and -2 are special)
 995      *  s1[13:11] == number of DOCS accumulated characters (up to five)
 996      *  s1[7:0] + s0[31:0] == DOCS collected characters
 997      */
 998
 999     if (!state->s1) {
1000         state->s0 = 0x00000000UL;
1001         state->s1 = 0x80000000UL;
1002         /*
1003          * Start with US-ASCII in GL and also in GR.
1004          */
1005         for (i = 0; (unsigned)i < lenof(iso2022_subcharsets); i++) {
1006             subcs = &iso2022_subcharsets[i];
1007             if (subcs->type == mode->ltype &&
1008                 subcs->i == mode->li &&
1009                 subcs->f == mode->lf)
1010                 oselect(state, i, FALSE, NULL, NULL);
1011             if (subcs->type == mode->rtype &&
1012                 subcs->i == mode->ri &&
1013                 subcs->f == mode->rf)
1014                 oselect(state, i, TRUE, NULL, NULL);
1015         }
1016     }
1017
1018     if (input_chr == -1) {
1019         /*
1020          * Special case: reset encoding state.
1021          */
1022         docs_char(state, emit, emitctx, -2, NULL, 0);   /* leave DOCS */
1023
1024         for (i = 0; (unsigned)i < lenof(iso2022_subcharsets); i++) {
1025             subcs = &iso2022_subcharsets[i];
1026             if (subcs->type == mode->ltype &&
1027                 subcs->i == mode->li &&
1028                 subcs->f == mode->lf)
1029                 oselect(state, i, FALSE, emit, emitctx);
1030             if (subcs->type == mode->rtype &&
1031                 subcs->i == mode->ri &&
1032                 subcs->f == mode->rf)
1033                 oselect(state, i, TRUE, emit, emitctx);
1034         }
1035         return TRUE;
1036     }
1037
1038     /*
1039      * Special-case characters: Space, Delete, and anything in C0
1040      * or C1 are output unchanged.
1041      */
1042     if (input_chr <= 0x20 || (input_chr >= 0x7F && input_chr < 0xA0)) {
1043         emit(emitctx, input_chr);
1044         return TRUE;
1045     }
1046
1047     /*
1048      * Analyse the input character and work out which subcharset it
1049      * belongs to.
1050      */
1051     for (i = 0; (unsigned)i < lenof(iso2022_subcharsets); i++) {
1052         subcs = &iso2022_subcharsets[i];
1053         if (!(mode->enable_mask & (1 << subcs->enable)))
1054             continue;                  /* this charset is disabled */
1055         if (subcs->sbcs_base) {
1056             c1 = sbcs_from_unicode(subcs->sbcs_base, input_chr);
1057             c1 -= subcs->offset;
1058             if (c1 >= 0x20 && c1 <= 0x7f) {
1059                 c2 = 0;
1060                 break;
1061             }
1062         } else if (subcs->to_dbcs) {
1063             if (subcs->to_dbcs_plane >= 0) {
1064                 /*
1065                  * Since multiplanar DBCSes almost by definition
1066                  * involve several entries in iso2022_subcharsets
1067                  * with the same to_dbcs function and different
1068                  * plane values, we remember the last such function
1069                  * we called and what its result was, so that we
1070                  * don't (for example) have to call
1071                  * unicode_to_cns11643 seven times.
1072                  */
1073                 if (last_planar_dbcs != REPLANARISE(subcs->to_dbcs)) {
1074                     last_planar_dbcs = REPLANARISE(subcs->to_dbcs);
1075                     if (!last_planar_dbcs(input_chr,
1076                                           &last_p, &last_r, &last_c))
1077                         last_p = -1;
1078                 }
1079             } else {
1080                 last_p = subcs->to_dbcs_plane;
1081                 if (!subcs->to_dbcs(input_chr, &last_r, &last_c))
1082                     last_p = 0;        /* cannot match since to_dbcs_plane<0 */
1083             }
1084
1085             if (last_p == subcs->to_dbcs_plane) {
1086                 c1 = last_r - subcs->offset;
1087                 c2 = last_c - subcs->offset;
1088                 assert(c1 >= 0x20 && c1 <= 0x7f);
1089                 assert(c2 >= 0x20 && c2 <= 0x7f);
1090                 break;
1091             }
1092         }
1093     }
1094
1095     if ((unsigned)i < lenof(iso2022_subcharsets)) {
1096         int right;
1097
1098         /*
1099          * Our character is represented by c1 (and possibly also
1100          * c2) in subcharset `subcs'. So now we must decide whether
1101          * to designate that character set into G0/GL or G1/GR.
1102          *
1103          * Any S6 or M6 subcharset has to go in GR because it won't
1104          * fit in GL. In addition, the compound text rules state
1105          * that any single-byte subcharset defined as the
1106          * right-hand half of some SBCS must go in GR.
1107          *
1108          * M4 subcharsets can go in either half according to the
1109          * rules. I choose to put them in GR always because it's a
1110          * simple policy with reasonable behaviour (facilitates
1111          * switching between them and ASCII).
1112          */
1113         right = (subcs->type == S6 || subcs->type == M6 || subcs->type == M4 ||
1114                  (subcs->sbcs_base && subcs->offset == 0x80));
1115
1116         /*
1117          * If we're in a DOCS mode, leave it.
1118          */
1119         docs_char(state, emit, emitctx, -2, NULL, 0);
1120
1121         /*
1122          * If this subcharset is not already selected in that
1123          * container, select it.
1124          */
1125         oselect(state, i, right, emit, emitctx);
1126
1127         /*
1128          * Now emit the actual characters.
1129          */
1130         if (right) {
1131             assert(c1 >= 0x20 && c1 <= 0x7f);
1132             emit(emitctx, c1 | 0x80);
1133             if (c2) {
1134                 assert(c2 >= 0x20 && c2 <= 0x7f);
1135                 emit(emitctx, c2 | 0x80);
1136             }
1137         } else {
1138             assert(c1 > 0x20 && c1 < 0x7f);
1139             emit(emitctx, c1);
1140             if (c2) {
1141                 assert(c2 > 0x20 && c2 < 0x7f);
1142                 emit(emitctx, c2);
1143             }
1144         }
1145
1146         return TRUE;
1147     }
1148
1149     /*
1150      * Fall back to DOCS.
1151      */
1152     {
1153         char data[10];
1154         char *p = data;
1155         int i, cs;
1156
1157         cs = -2;                       /* means failure */
1158
1159         for (i = 0; (unsigned)i <= lenof(ctext_encodings); i++) {
1160             charset_state substate;
1161             charset_spec const *subcs = ctext_encodings[i].subcs;
1162
1163             /*
1164              * We assume that all character sets dealt with by DOCS
1165              * are stateless for output purposes.
1166              */
1167             substate.s1 = substate.s0 = 0;
1168             p = data;
1169
1170             if ((unsigned)i < lenof(ctext_encodings)) {
1171                 if ((mode->enable_mask & (1 << ctext_encodings[i].enable)) &&
1172                     subcs->write(subcs, input_chr, &substate,
1173                                  write_to_pointer, &p)) {
1174                     cs = i;
1175                     break;
1176                 }
1177             } else {
1178                 if ((mode->enable_mask & (1 << CDU)) &&
1179                     write_utf8(NULL, input_chr, NULL, write_to_pointer, &p)) {
1180                     cs = -1;
1181                     break;
1182                 }
1183             }
1184         }
1185
1186         if (cs != -2) {
1187             docs_char(state, emit, emitctx, cs, data, p - data);
1188             return TRUE;
1189         }
1190     }
1191
1192     return FALSE;
1193 }
1194
1195 /*
1196  * Full ISO 2022 output with all options on. Not entirely sure what
1197  * if anything this is useful for, but here it is anyway. All
1198  * output character sets and DOCS variants are permitted; all
1199  * containers start out with ASCII in them.
1200  */
1201 static const struct iso2022_mode iso2022_all = {
1202     (1<<CCS) | (1<<COS) | (1<<CPU) | (1<<CDC) | (1<<CDU),
1203     S4, 0, 'B', S4, 0, 'B',
1204 };
1205
1206 const charset_spec charset_CS_ISO2022 = {
1207     CS_ISO2022, read_iso2022, write_iso2022, &iso2022_all
1208 };
1209
1210 /*
1211  * X11 compound text. A subset of output charsets is permitted, and
1212  * G1/GR starts off in ISO8859-1.
1213  */
1214 static const struct iso2022_mode iso2022_ctext = {
1215     (1<<CCS) | (1<<CDC),
1216     S4, 0, 'B', S6, 0, 'A',
1217 };
1218
1219 const charset_spec charset_CS_CTEXT = {
1220     CS_CTEXT, read_iso2022, write_iso2022, &iso2022_ctext
1221 };
1222
1223 #ifdef TESTMODE
1224
1225 #include <stdio.h>
1226 #include <stdarg.h>
1227 #include <string.h>
1228
/* Running count of test failures, reported by main(). */
int total_errs = 0;

/*
 * Emit callback for the read tests: `ctx' points at a `wchar_t *'
 * cursor. Store the decoded value at the cursor and advance it by
 * one element. No bounds checking is done.
 */
void iso2022_emit(void *ctx, long output)
{
    wchar_t **cursor = ctx;
    wchar_t *slot = *cursor;

    *slot = output;
    *cursor = slot + 1;
}
1236
1237 void iso2022_read_test(int line, char *input, int inlen, ...)
1238 {
1239     va_list ap;
1240     wchar_t *p, str[512];
1241     int i;
1242     charset_state state;
1243     unsigned long l;
1244
1245     state.s0 = state.s1 = 0;
1246     p = str;
1247
1248     for (i = 0; i < inlen; i++)
1249         read_iso2022(NULL, input[i] & 0xFF, &state, iso2022_emit, &p);
1250
1251     va_start(ap, inlen);
1252     l = 0;
1253     for (i = 0; i < p - str; i++) {
1254         l = va_arg(ap, long int);
1255         if (l == -1) {
1256             printf("%d: correct string shorter than output\n", line);
1257             total_errs++;
1258             break;
1259         }
1260         if (l != str[i]) {
1261             printf("%d: char %d came out as %08x, should be %08lx\n",
1262                     line, i, str[i], l);
1263             total_errs++;
1264         }
1265     }
1266     if (l != -1) {
1267         l = va_arg(ap, long int);
1268         if (l != -1) {
1269             printf("%d: correct string longer than output\n", line);
1270             total_errs++;
1271         }
1272     }
1273     va_end(ap);
1274 }
1275
1276 /* Macro to concoct the first three parameters of iso2022_read_test. */
1277 #define TESTSTR(x) __LINE__, x, lenof(x)
1278
1279 int main(void)
1280 {
1281     printf("read tests beginning\n");
1282     /* Simple test (Emacs sample text for Japanese, in ISO-2022-JP) */
1283     iso2022_read_test(TESTSTR("Japanese (\x1b$BF|K\\8l\x1b(B)\t"
1284                               "\x1b$B$3$s$K$A$O\x1b(B, "
1285                               "\x1b$B%3%s%K%A%O\x1b(B\n"),
1286                       'J','a','p','a','n','e','s','e',' ','(',
1287                       0x65E5, 0x672C, 0x8A9E, ')', '\t',
1288                       0x3053, 0x3093, 0x306b, 0x3061, 0x306f, ',', ' ',
1289                       0x30b3, 0x30f3, 0x30cb, 0x30c1, 0x30cf, '\n', 0, -1);
1290     /* Same thing in EUC-JP (with designations, and half-width katakana) */
1291     iso2022_read_test(TESTSTR("\x1b$)B\x1b*I\x1b$+D"
1292                               "Japanese (\xc6\xfc\xcb\xdc\xb8\xec)\t"
1293                               "\xa4\xb3\xa4\xf3\xa4\xcb\xa4\xc1\xa4\xcf, "
1294                               "\x8e\xba\x8e\xdd\x8e\xc6\x8e\xc1\x8e\xca\n"),
1295                       'J','a','p','a','n','e','s','e',' ','(',
1296                       0x65E5, 0x672C, 0x8A9E, ')', '\t',
1297                       0x3053, 0x3093, 0x306b, 0x3061, 0x306f, ',', ' ',
1298                       0xff7a, 0xff9d, 0xff86, 0xff81, 0xff8a, '\n', 0, -1);
1299     /* Multibyte single-shift */
1300     iso2022_read_test(TESTSTR("\x1b$)B\x1b*I\x1b$+D\x8f\"/!"),
1301                       0x02D8, '!', 0, -1);
1302     /* Non-existent SBCS */
1303     iso2022_read_test(TESTSTR("\x1b(!Zfnord\n"),
1304                       ERROR, ERROR, ERROR, ERROR, ERROR, '\n', 0, -1);
1305     /* Pass-through of ordinary escape sequences, including a long one */
1306     iso2022_read_test(TESTSTR("\x1b""b\x1b#5\x1b#!!!5"),
1307                       0x1B, 'b', 0x1B, '#', '5',
1308                       0x1B, '#', '!', '!', '!', '5', 0, -1);
1309     /* Non-existent DBCS (also 5-byte escape sequence) */
1310     iso2022_read_test(TESTSTR("\x1b$(!Bfnord!"),
1311                       ERROR, ERROR, ERROR, 0, -1);
1312     /* Incomplete DB characters */
1313     iso2022_read_test(TESTSTR("\x1b$B(,(\x1b(BHi\x1b$B(,(\n"),
1314                       0x2501, ERROR, 'H', 'i', 0x2501, ERROR, '\n', 0, -1);
1315     iso2022_read_test(TESTSTR("\x1b$)B\x1b*I\x1b$+D\xa4""B"),
1316                       ERROR, 'B', 0, -1);
1317     iso2022_read_test(TESTSTR("\x1b$)B\x1b*I\x1b$+D\x0e\x1b|$\xa2\xaf"),
1318                       ERROR, 0x02D8, 0, -1);
1319     /* Incomplete escape sequence */
1320     iso2022_read_test(TESTSTR("\x1b\n"), ERROR, '\n', 0, -1);
1321     iso2022_read_test(TESTSTR("\x1b-A\x1b~\x1b\xa1"), ERROR, 0xa1, 0, -1);
1322     /* Incomplete single-shift */
1323     iso2022_read_test(TESTSTR("\x8e\n"), ERROR, '\n', 0, -1);
1324     iso2022_read_test(TESTSTR("\x1b$*B\x8e(\n"), ERROR, '\n', 0, -1);
1325     /* Corner cases (02/00 and 07/15) */
1326     iso2022_read_test(TESTSTR("\x1b(B\x20\x7f"), 0x20, 0x7f, 0, -1);
1327     iso2022_read_test(TESTSTR("\x1b(I\x20\x7f"), 0x20, 0x7f, 0, -1);
1328     iso2022_read_test(TESTSTR("\x1b$B\x20\x7f"), 0x20, 0x7f, 0, -1);
1329     iso2022_read_test(TESTSTR("\x1b-A\x0e\x20\x7f"), 0xa0, 0xff, 0, -1);
1330     iso2022_read_test(TESTSTR("\x1b$-~\x0e\x20\x7f"), ERROR, 0, -1);
1331     iso2022_read_test(TESTSTR("\x1b)B\xa0\xff"), ERROR, ERROR, 0, -1);
1332     iso2022_read_test(TESTSTR("\x1b)I\xa0\xff"), ERROR, ERROR, 0, -1);
1333     iso2022_read_test(TESTSTR("\x1b$)B\xa0\xff"), ERROR, ERROR, 0, -1);
1334     iso2022_read_test(TESTSTR("\x1b-A\x1b~\xa0\xff"), 0xa0, 0xff, 0, -1);
1335     iso2022_read_test(TESTSTR("\x1b$-~\x1b~\xa0\xff"), ERROR, 0, -1);
1336     /* Designate control sets */
1337     iso2022_read_test(TESTSTR("\x1b!@"), 0x1b, '!', '@', 0, -1);
1338     /* Designate other coding system (UTF-8) */
1339     iso2022_read_test(TESTSTR("\x1b%G"
1340                               "\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5"),
1341                       0x03BA, 0x1F79, 0x03C3, 0x03BC, 0x03B5, 0, -1);
1342     iso2022_read_test(TESTSTR("\x1b-A\x1b%G\xCE\xBA\x1b%@\xa0"),
1343                       0x03BA, 0xA0, 0, -1);
1344     iso2022_read_test(TESTSTR("\x1b%G\xCE\x1b%@"), ERROR, 0, -1);
1345     iso2022_read_test(TESTSTR("\x1b%G\xCE\xBA\x1b%\x1b%@"),
1346                       0x03BA, 0x1B, '%', 0, -1);
1347     /* DOCS (COMPOUND_TEXT extended segment) */
1348     iso2022_read_test(TESTSTR("\x1b%/1\x80\x80"), 0, -1);
1349     iso2022_read_test(TESTSTR("\x1b%/1\x80\x8fiso-8859-15\2xyz\x1b(B"),
1350                       ERROR, ERROR, ERROR, 0, -1);
1351     iso2022_read_test(TESTSTR("\x1b%/1\x80\x8eiso8859-15\2xyz\x1b(B"),
1352                       'x', 'y', 'z', 0, -1);
1353     iso2022_read_test(TESTSTR("\x1b-A\x1b%/2\x80\x89"
1354                               "big5-0\2\xa1\x40\xa1\x40"),
1355                       0x3000, 0xa1, 0x40, 0, -1);
1356     /* Emacs Big5-in-ISO-2022 mapping */
1357     iso2022_read_test(TESTSTR("\x1b$(0&x86\x1b(B  \x1b$(0DeBv"),
1358                       0x5143, 0x6c23, ' ', ' ', 0x958b, 0x767c, 0, -1);
1359     /* Test from RFC 1922 (ISO-2022-CN) */
1360     iso2022_read_test(TESTSTR("\x1b$)A\x0e=;;;\x1b$)GG(_P\x0f"),
1361                       0x4EA4, 0x6362, 0x4EA4, 0x63db, 0, -1);
1362
1363     printf("read tests completed\n");
1364     printf("total: %d errors\n", total_errs);
1365     return (total_errs != 0);
1366 }
1367
1368 #endif /* TESTMODE */
1369
1370 #else /* ENUM_CHARSETS */
1371
1372 ENUM_CHARSET(CS_ISO2022)
1373
1374 #endif