mdw@git.distorted.org.uk Git - sgt/charset/blob - iso2022.c

   1 /*
   2  * iso2022.c - support for ISO/IEC 2022 (alias ECMA-35).
   3  *
   4  * This isn't a complete implementation of ISO/IEC 2022, but it's
   5  * close.  It only handles decoding, because a fully general encoder
   6  * isn't really useful.  It can decode 8-bit and 7-bit versions, with
   7  * support for single-byte and multi-byte character sets, all four
   8  * containers (G0, G1, G2, and G3), using both single-shift and
   9  * locking-shift sequences.
  10  *
  11  * The general principle is that any valid ISO/IEC 2022 sequence
  12  * should either be correctly decoded or should emit an ERROR.  The
  13  * only exception to this is that the C0 and C1 sets are fixed as
  14  * those of ISO/IEC 6429.  Escape sequences for designating control
  15  * sets are passed through, so a post-processor could fix them up if
  16  * necessary.
  17  *
  18  * DOCS to UTF-8 works.  Other DOCS sequences are ignored, which will
  19  * produce surprising results.
  20  */
  21
  22 #ifndef ENUM_CHARSETS
  23
  24 #include <assert.h>
  25
  26 #include "charset.h"
  27 #include "internal.h"
  28 #include "sbcsdat.h"
  29
/*
 * Control characters that read_iso2022() interprets itself rather
 * than passing through.
 */
#define LS1 (0x0E)   /* locking shift 1: invoke G1 into GL */
#define LS0 (0x0F)   /* locking shift 0: invoke G0 into GL */
#define ESC (0x1B)   /* introduces an escape sequence */
#define SS2 (0x8E)   /* single shift 2: next character comes from G2 */
#define SS3 (0x8F)   /* single shift 3: next character comes from G3 */

/*
 * Subcharset types: S = single-byte, M = multi-byte; 4 = 94-character
 * set (0x20 and 0x7F are not part of the set), 6 = 96-character set.
 */
enum {S4, S6, M4, M6};
  37
/*
 * Forward declarations of the translation helpers defined later in
 * this file, so that the iso2022_subcharsets table can refer to them.
 */
static long int emacs_big5_1_to_unicode(int, int);
static long int emacs_big5_2_to_unicode(int, int);
static int unicode_to_emacs_big5(long int, int *, int *, int *);
static long int cns11643_1_to_unicode(int, int);
static long int cns11643_2_to_unicode(int, int);
static long int cns11643_3_to_unicode(int, int);
static long int cns11643_4_to_unicode(int, int);
static long int cns11643_5_to_unicode(int, int);
static long int cns11643_6_to_unicode(int, int);
static long int cns11643_7_to_unicode(int, int);
static long int null_dbcs_to_unicode(int, int);
static int unicode_to_null_dbcs(long int, int *, int *);

/* Unicode -> (row, column) encoder for a two-byte set */
typedef int (*to_dbcs_t)(long int, int *, int *);
/* Unicode -> (plane, row, column) encoder for a multi-plane two-byte set */
typedef int (*to_dbcs_planar_t)(long int, int *, int *, int *);

/*
 * Cast between to_dbcs_planar_t and to_dbcs_t, type-checking first.
 * The discarded left operand of the comma is a pointer comparison whose
 * only purpose is to make the compiler diagnose an argument of the
 * wrong function-pointer type.
 */
#define DEPLANARISE(x) ( (x) == (to_dbcs_planar_t)NULL, (to_dbcs_t)(x) )
#define REPLANARISE(x) ( (x) == (to_dbcs_t)NULL, (to_dbcs_planar_t)(x) )
  57
  58 /*
  59  * Values used in the `enable' field. Each of these identifies a
  60  * class of character sets; we then have a bitmask indicating which
  61  * classes are allowable in a given mode.
  62  *
  63  * These values are currently only checked on output: for input,
  64  * any ISO 2022 we can comprehend at all is considered acceptable.
  65  */
#define CCS 1                          /* CTEXT standard */
#define COS 2                          /* other standard */
#define CPU 3                          /* private use */
#define CDC 4                          /* DOCS for CTEXT */
#define CDU 5                          /* DOCS for UTF-8 */
#define CNU 31                         /* never used */

/*
 * Per-charset configuration, reached through charset_spec.data.
 */
struct iso2022_mode {
    /*
     * Which of the classes above may be used on output.
     * NOTE(review): presumably bit n corresponds to class value n --
     * confirm against the output code.
     */
    int enable_mask;
    /*
     * Initial designations made by read_iso2022() on its first call:
     * (ltype, li, lf) is the type / intermediate byte / final byte of
     * the set placed in G0, and (rtype, ri, rf) likewise for G1.
     */
    char ltype, li, lf, rtype, ri, rf;
};
  77
/*
 * Table of every character set this module can designate into
 * G0..G3. Decoder state stores indices into this table in 7-bit
 * fields, so it must never grow beyond 128 entries.
 */
const struct iso2022_subcharset {
    /*
     * type: one of S4/S6/M4/M6.
     * i, f: intermediate and final bytes that designate() matches
     *       against the designating escape sequence (i is 0 if none).
     * enable: the class (CCS, COS, ...) this set belongs to.
     */
    char type, i, f, enable;
    /* Added to each 7-bit input byte before it is looked up */
    int offset;
    /* Single-byte sets: table handed to sbcs_to_unicode(); else null */
    const sbcs_data *sbcs_base;
    /* Multi-byte sets: decoder called with the two offset-adjusted bytes */
    long int (*from_dbcs)(int, int);

    /*
     * If to_dbcs_plane < 0, then to_dbcs is used as expected.
     * However, if to_dbcs_plane >= 0, then to_dbcs is expected to
     * be cast to a to_dbcs_planar_t before use, and the returned
     * plane value (the first int *) must equal to_dbcs_plane.
     *
     * I'd have preferred to do this by means of a union, but you
     * can't initialise a selected field of a union at compile
     * time. Function pointer casts are guaranteed to work sensibly
     * in ISO C (that is, it's undefined what happens if you call a
     * function via the wrong type of pointer, but if you cast it
     * back to the right type before calling it then it must work),
     * so this is safe if ugly.
     */
    to_dbcs_t to_dbcs;
    int to_dbcs_plane;                 /* use to_dbcs_planar iff >= 0 */
} iso2022_subcharsets[] = {
    /*
     * We list these subcharsets in preference order for output.
     * Since the best-defined use of ISO 2022 output is compound
     * text, we'll use a preference order which matches that. So we
     * begin with the charsets defined in the compound text spec.
     */
    { S4, 0, 'B', CCS, 0x00, &sbcsdata_CS_ASCII },
    { S6, 0, 'A', CCS, 0x80, &sbcsdata_CS_ISO8859_1 },
    { S6, 0, 'B', CCS, 0x80, &sbcsdata_CS_ISO8859_2 },
    { S6, 0, 'C', CCS, 0x80, &sbcsdata_CS_ISO8859_3 },
    { S6, 0, 'D', CCS, 0x80, &sbcsdata_CS_ISO8859_4 },
    { S6, 0, 'F', CCS, 0x80, &sbcsdata_CS_ISO8859_7 },
    { S6, 0, 'G', CCS, 0x80, &sbcsdata_CS_ISO8859_6 },
    { S6, 0, 'H', CCS, 0x80, &sbcsdata_CS_ISO8859_8 },
    { S6, 0, 'L', CCS, 0x80, &sbcsdata_CS_ISO8859_5 },
    { S6, 0, 'M', CCS, 0x80, &sbcsdata_CS_ISO8859_9 },
    { S4, 0, 'I', CCS, 0x80, &sbcsdata_CS_JISX0201 },
    { S4, 0, 'J', CCS, 0x00, &sbcsdata_CS_JISX0201 },
    { M4, 0, 'A', CCS, -0x21, 0, &gb2312_to_unicode, &unicode_to_gb2312, -1 },
    { M4, 0, 'B', CCS, -0x21, 0, &jisx0208_to_unicode, &unicode_to_jisx0208, -1 },
    { M4, 0, 'C', CCS, -0x21, 0, &ksx1001_to_unicode, &unicode_to_ksx1001, -1 },
    { M4, 0, 'D', CCS, -0x21, 0, &jisx0212_to_unicode, &unicode_to_jisx0212, -1 },

    /*
     * Next, other reasonably standard things: the rest of the ISO
     * 8859 sets, UK-ASCII, and CNS 11643.
     */
    { S6, 0, 'T', COS, 0x80, &sbcsdata_CS_ISO8859_11 },
    { S6, 0, 'V', COS, 0x80, &sbcsdata_CS_ISO8859_10 },
    { S6, 0, 'Y', COS, 0x80, &sbcsdata_CS_ISO8859_13 },
    { S6, 0, '_', COS, 0x80, &sbcsdata_CS_ISO8859_14 },
    { S6, 0, 'b', COS, 0x80, &sbcsdata_CS_ISO8859_15 },
    { S6, 0, 'f', COS, 0x80, &sbcsdata_CS_ISO8859_16 },
    { S4, 0, 'A', COS, 0x00, &sbcsdata_CS_BS4730 },
    { M4, 0, 'G', COS, -0x21, 0, &cns11643_1_to_unicode, DEPLANARISE(&unicode_to_cns11643), 0 },
    { M4, 0, 'H', COS, -0x21, 0, &cns11643_2_to_unicode, DEPLANARISE(&unicode_to_cns11643), 1 },
    { M4, 0, 'I', COS, -0x21, 0, &cns11643_3_to_unicode, DEPLANARISE(&unicode_to_cns11643), 2 },
    { M4, 0, 'J', COS, -0x21, 0, &cns11643_4_to_unicode, DEPLANARISE(&unicode_to_cns11643), 3 },
    { M4, 0, 'K', COS, -0x21, 0, &cns11643_5_to_unicode, DEPLANARISE(&unicode_to_cns11643), 4 },
    { M4, 0, 'L', COS, -0x21, 0, &cns11643_6_to_unicode, DEPLANARISE(&unicode_to_cns11643), 5 },
    { M4, 0, 'M', COS, -0x21, 0, &cns11643_7_to_unicode, DEPLANARISE(&unicode_to_cns11643), 6 },

    /*
     * Private-use designations: DEC private sets and Emacs's Big5
     * abomination.
     */
    { S4, 0, '0', CPU, 0x00, &sbcsdata_CS_DEC_GRAPHICS },
    { S4, 0, '<', CPU, 0x80, &sbcsdata_CS_DEC_MCS },
    { M4, 0, '0', CPU, -0x21, 0, &emacs_big5_1_to_unicode, DEPLANARISE(&unicode_to_emacs_big5), 1 },
    { M4, 0, '1', CPU, -0x21, 0, &emacs_big5_2_to_unicode, DEPLANARISE(&unicode_to_emacs_big5), 2 },

    /*
     * Ben left this conditioned out without explanation,
     * presumably on the grounds that we don't have a translation
     * table for it.
     */
#if 0
    { M4, 0, '@', CNU }, /* JIS C 6226-1978 */
#endif

    /*
     * Finally, fallback entries for null character sets. designate()
     * relies on there being a '~' entry for each of the four types:
     * it falls back to them when asked for a set that is not listed.
     */
    { S4, 0, '~', CNU },
    { S6, 0, '~', CNU }, /* empty 96-set */
    { M4, 0, '~', CNU, 0, 0, &null_dbcs_to_unicode, &unicode_to_null_dbcs, -1 }, /* empty 94^n-set */
    { M6, 0, '~', CNU, 0, 0, &null_dbcs_to_unicode, &unicode_to_null_dbcs, -1 }, /* empty 96^n-set */
};
 169
 170 static long int null_dbcs_to_unicode(int r, int c)
 171 {
 172     return ERROR;
 173 }
 174 static int unicode_to_null_dbcs(long int unicode, int *r, int *c)
 175 {
 176     return 0;                          /* failed to convert anything */
 177 }
 178
 179 /*
 180  * Emacs encodes Big5 in COMPOUND_TEXT as two 94x94 character sets.
 181  * We treat Big5 as a 94x191 character set with a bunch of undefined
 182  * columns in the middle, so we have to mess around a bit to make
 183  * things fit.
 184  */
 185
 186 static long int emacs_big5_1_to_unicode(int r, int c)
 187 {
 188     unsigned long s;
 189     s = r * 94 + c;
 190     r = s / 157;
 191     c = s % 157;
 192     if (c >= 64) c += 34; /* Skip over the gap */
 193     return big5_to_unicode(r, c);
 194 }
 195
 196 static long int emacs_big5_2_to_unicode(int r, int c)
 197 {
 198     unsigned long s;
 199     s = r * 94 + c;
 200     r = s / 157 + 40;
 201     c = s % 157;
 202     if (c >= 64) c += 34; /* Skip over the gap */
 203     return big5_to_unicode(r, c);
 204 }
 205
 206 static int unicode_to_emacs_big5(long int unicode, int *p, int *r, int *c)
 207 {
 208     int rr, cc, s;
 209     if (!unicode_to_big5(unicode, &rr, &cc))
 210         return 0;
 211     if (cc >= 64) {
 212         cc -= 34;
 213         assert(cc >= 64);
 214     }
 215     s = rr * 157 + cc;
 216     if (s >= 40*157) {
 217         *p = 2;
 218         s -= 40*157;
 219     } else {
 220         *p = 1;
 221     }
 222     *r = s / 94;
 223     *c = s % 94;
 224     return 1;
 225 }
 226
 227 /* Wrappers for cns11643_to_unicode() */
 228 static long int cns11643_1_to_unicode(int r, int c)
 229 {
 230     return cns11643_to_unicode(0, r, c);
 231 }
 232 static long int cns11643_2_to_unicode(int r, int c)
 233 {
 234     return cns11643_to_unicode(1, r, c);
 235 }
 236 static long int cns11643_3_to_unicode(int r, int c)
 237 {
 238     return cns11643_to_unicode(2, r, c);
 239 }
 240 static long int cns11643_4_to_unicode(int r, int c)
 241 {
 242     return cns11643_to_unicode(3, r, c);
 243 }
 244 static long int cns11643_5_to_unicode(int r, int c)
 245 {
 246     return cns11643_to_unicode(4, r, c);
 247 }
 248 static long int cns11643_6_to_unicode(int r, int c)
 249 {
 250     return cns11643_to_unicode(5, r, c);
 251 }
 252 static long int cns11643_7_to_unicode(int r, int c)
 253 {
 254     return cns11643_to_unicode(6, r, c);
 255 }
 256
/*
 * States, or "what we're currently accumulating".
 *
 * The decoder keeps this value in the top three bits of s0, so there
 * is room for exactly the eight states below. The order matters:
 * SS3CHAR must directly follow SS2CHAR (the single-shift code derives
 * one from the other arithmetically), and the decoder tests
 * "state < ESCSEQ" and "state >= ESCSEQ" to tell character
 * accumulation from escape-sequence processing.
 */
enum {
    IDLE,       /* None of the below */
    SS2CHAR,    /* Accumulating a character after SS2 */
    SS3CHAR,    /* Accumulating a character after SS3 */
    ESCSEQ,     /* Accumulating an escape sequence */
    ESCDROP,    /* Discarding an escape sequence */
    ESCPASS,    /* Passing through an escape sequence */
    DOCSUTF8,   /* DOCSed into UTF-8 */
    DOCSCTEXT   /* DOCSed into a COMPOUND_TEXT extended segment */
};
 268
 269 #if 0
 270 #include <stdio.h>
 271 static void dump_state(charset_state *s)
 272 {
 273     unsigned s0 = s->s0, s1 = s->s1;
 274     char const * const modes[] = { "IDLE", "SS2CHAR", "SS3CHAR",
 275                                    "ESCSEQ", "ESCDROP", "ESCPASS",
 276                                    "DOCSUTF8" };
 277
 278     fprintf(stderr, "s0: %s", modes[s0 >> 29]);
 279     fprintf(stderr, " %02x %02x %02x   ", (s0 >> 16) & 0xff, (s0 >> 8) & 0xff,
 280             s0 & 0xff);
 281     fprintf(stderr, "s1: LS%d LS%dR", (s1 >> 30) & 3, (s1 >> 28) & 3);
 282     fprintf(stderr, " %d %d %d %d\n", s1 & 0x7f, (s1 >> 7) & 0x7f,
 283             (s1 >> 14) & 0x7f, (s1 >> 21) & 0x7f);
 284 }
 285 #endif
 286
 287 static void designate(charset_state *state, int container,
 288                       int type, int ibyte, int fbyte)
 289 {
 290     unsigned long i;
 291
 292     assert(container >= 0 && container <= 3);
 293     assert(type == S4 || type == S6 || type == M4 || type == M6);
 294
 295     for (i = 0; i < lenof(iso2022_subcharsets); i++) {
 296         if (iso2022_subcharsets[i].type == type &&
 297             iso2022_subcharsets[i].i == ibyte &&
 298             iso2022_subcharsets[i].f == fbyte) {
 299             state->s1 &= ~(0x7fL << (container * 7));
 300             state->s1 |= (i << (container * 7));
 301             return;
 302         }
 303     }
 304     /*
 305      * If we don't find the charset, invoke the empty one, so we
 306      * output ERROR rather than garbage.
 307      */
 308     designate(state, container, type, 0, '~');
 309 }
 310
 311 static void do_utf8(long int input_chr,
 312                     charset_state *state,
 313                     void (*emit)(void *ctx, long int output),
 314                     void *emitctx)
 315 {
 316     charset_state ustate;
 317
 318     ustate.s1 = 0;
 319     ustate.s0 = state->s0 & 0x03ffffffL;
 320     read_utf8(NULL, input_chr, &ustate, emit, emitctx);
 321     state->s0 = (state->s0 & ~0x03ffffffL) | (ustate.s0 & 0x03ffffffL);
 322 }
 323
 324 static void docs_utf8(long int input_chr,
 325                       charset_state *state,
 326                       void (*emit)(void *ctx, long int output),
 327                       void *emitctx)
 328 {
 329     int retstate;
 330
 331     /*
 332      * Bits [25:0] of s0 are reserved for read_utf8().
 333      * Bits [27:26] are a tiny state machine to recognise ESC % @.
 334      */
 335     retstate = (state->s0 & 0x0c000000L) >> 26;
 336     if (retstate == 1 && input_chr == '%')
 337         retstate = 2;
 338     else if (retstate == 2 && input_chr == '@') {
 339         /* If we've got a partial UTF-8 sequence, complain. */
 340         if (state->s0 & 0x03ffffffL)
 341             emit(emitctx, ERROR);
 342         state->s0 = 0;
 343         return;
 344     } else {
 345         if (retstate >= 1) do_utf8(ESC, state, emit, emitctx);
 346         if (retstate >= 2) do_utf8('%', state, emit, emitctx);
 347         retstate = 0;
 348         if (input_chr == ESC)
 349             retstate = 1;
 350         else {
 351             do_utf8(input_chr, state, emit, emitctx);
 352         }
 353     }
 354     state->s0 = (state->s0 & ~0x0c000000L) | (retstate << 26);
 355 }
 356
/*
 * Description of one encoding that can appear in a COMPOUND_TEXT
 * extended segment.
 */
struct ctext_encoding {
    /* Encoding name as it appears in the segment, including the
     * trailing STX (\2) that terminates it */
    char const *name;
    /*
     * octets_per_char: octets per character, sent as a digit in the
     * escape sequence that opens the segment; 0 means variable.
     * enable: the class (CDC, ...) this encoding belongs to.
     */
    char octets_per_char, enable;
    /* Charset whose read function decodes the segment's data */
    charset_spec const *subcs;
};

/*
 * In theory, this list is in <ftp://ftp.x.org/pub/DOCS/registry>,
 * but XLib appears to have its own ideas, and encodes these three
 * (as of X11R6.8.2)
 */

extern charset_spec const charset_CS_ISO8859_14;
extern charset_spec const charset_CS_ISO8859_15;
extern charset_spec const charset_CS_BIG5;

/*
 * Must stay sorted by name: docs_ctext() searches it incrementally
 * and depends on the ordering. It must also stay at three entries or
 * fewer, because the index (including the "not found" value equal to
 * the array length) is stored in a two-bit field.
 */
static struct ctext_encoding const ctext_encodings[] = {
    { "big5-0\2", 0 /* variable */, CDC, &charset_CS_BIG5 },
    { "iso8859-14\2", 1, CDC, &charset_CS_ISO8859_14 },
    { "iso8859-15\2", 1, CDC, &charset_CS_ISO8859_15 }
};
 378
/*
 * Handle one input byte while inside a COMPOUND_TEXT extended segment.
 *
 * The bytes of a segment are consumed in three phases:
 *  1. two length bytes, of which the low 7 bits each are used
 *     (first * 128 + second); a length of zero ends the segment at once;
 *  2. the encoding name, matched a byte at a time against
 *     ctext_encodings[] up to and including its terminating STX;
 *  3. the data, passed to the matched encoding's read function, or
 *     turned into one ERROR per byte if the name was not recognised.
 * The length counts every byte after the two length bytes, name
 * included. When it reaches zero s0 is cleared, which also takes the
 * decoder out of the DOCSCTEXT state.
 */
static void docs_ctext(long int input_chr,
                       charset_state *state,
                       void (*emit)(void *ctx, long int output),
                       void *emitctx)
{
    /*
     * s0[27:26] = first entry in ctext_encodings that matches
     * s0[25:22] = number of characters successfully matched, 0xf if all
     * s0[21:8] count the number of octets left in the segment
     * s0[7:0] are for sub-charset use
     *
     * A match count of 0xe is also special: it means the name has
     * already failed to match and we are skipping the rest of it.
     */
    int n = (state->s0 >> 22) & 0xf, i = (state->s0 >> 26) & 3, oi = i, j;
    int length = (state->s0 >> 8) & 0x3fff;

    /*
     * Note that we do not bother checking the octets-per-character
     * byte against the selected charset when reading. It's
     * extremely unlikely that this code will ever have to deal
     * with two charset identifiers with the same name and
     * different octets-per-character values! If it ever happens,
     * we'll have to edit this file anyway so we can modify the
     * code then...
     */

    if (!length) {
        /* Haven't read length yet */
        if ((state->s0 & 0xff) == 0)
            /* ... or even the first byte */
            state->s0 |= input_chr;    /* park it in the low byte for now */
        else {
            length = (state->s0 & 0x7f) * 0x80 + (input_chr & 0x7f);
            if (length == 0)
                state->s0 = 0;         /* empty segment: straight back out */
            else
                /* keep only the top four bits; start the name match afresh */
                state->s0 = (state->s0 & 0xf0000000) | (length << 8);
        }
        return;
    }

    j = i;
    if (n == 0xe) {
        /* Skipping unknown encoding.  Look out for STX. */
        if (input_chr == 2)
            state->s0 = (state->s0 & 0xf0000000) | (i << 26) | (0xf << 22);
    } else if (n != 0xf) {
        /*
         * Still matching the name. Entry oi agrees with the n bytes
         * seen so far; walk forward over the entries sharing that
         * prefix until one whose next byte is not less than the input.
         */
        while (j < lenof(ctext_encodings) &&
               !memcmp(ctext_encodings[j].name,
                       ctext_encodings[oi].name, n)) {
            if (ctext_encodings[j].name[n] < input_chr)
                i = ++j;
            else
                break;
        }
        if (i >= lenof(ctext_encodings) ||
            memcmp(ctext_encodings[i].name,
                   ctext_encodings[oi].name, n) ||
            ctext_encodings[i].name[n] != input_chr) {
            /* Doom!  We haven't heard of this encoding */
            i = lenof(ctext_encodings);
            n = 0xe;
        } else {
            /*
             * Otherwise, we have found an additional character in our
             * encoding name. See if we have reached the _end_ of our
             * name.
             */
            n++;
            if (!ctext_encodings[i].name[n])
                n = 0xf;
        }
        /*
         * Failing _that_, we simply update our encoding-name-
         * tracking state.
         */
        assert(i < 4 && n < 16);
        state->s0 = (state->s0 & 0xf0000000) | (i << 26) | (n << 22);
    } else {
        /* Name complete: this byte is segment data. */
        if (i >= lenof(ctext_encodings))
            emit(emitctx, ERROR);
        else {
            /* The sub-charset gets the low 8 bits of s0 as its own state */
            charset_state substate;
            charset_spec const *subcs = ctext_encodings[i].subcs;
            substate.s1 = 0;
            substate.s0 = state->s0 & 0xff;
            subcs->read(subcs, input_chr, &substate, emit, emitctx);
            state->s0 = (state->s0 & ~0xff) | (substate.s0 & 0xff);
        }
    }
    /*
     * Count this byte off. The branches above may have wiped the
     * length field of s0, so it is rewritten from the local copy.
     */
    if (!--length)
        state->s0 = 0;
    else
        state->s0 = (state->s0 &~0x003fff00) | (length << 8);
}
 472
 473 static void read_iso2022(charset_spec const *charset, long int input_chr,
 474                          charset_state *state,
 475                          void (*emit)(void *ctx, long int output),
 476                          void *emitctx)
 477 {
 478     struct iso2022_mode const *mode = (struct iso2022_mode *)charset->data;
 479
 480     /* dump_state(state); */
 481     /*
 482      * We have to make fairly efficient use of the 64 bits of state
 483      * available to us.  Long-term state goes in s1, and consists of
 484      * the identities of the character sets designated as G0/G1/G2/G3
 485      * and the locking-shift states for GL and GR.  Short-term state
 486      * goes in s0: The bottom half of s0 accumulates characters for an
 487      * escape sequence or a multi-byte character, while the top three
 488      * bits indicate what they're being accumulated for.  After DOCS,
 489      * the bottom 29 bits of state are available for the DOCS function
 490      * to use -- the UTF-8 one uses the bottom 26 for UTF-8 decoding
 491      * and the top two to recognised ESC % @.
 492      *
 493      * s0[31:29] = state enum
 494      * s0[24:0] = accumulated bytes
 495      * s1[31:30] = GL locking-shift state
 496      * s1[29:28] = GR locking-shift state
 497      * s1[27:21] = G3 charset
 498      * s1[20:14] = G2 charset
 499      * s1[13:7] = G1 charset
 500      * s1[6:0] = G0 charset
 501      */
 502
 503 #define LEFT 30
 504 #define RIGHT 28
 505 #define LOCKING_SHIFT(n,side) \
 506         (state->s1 = (state->s1 & ~(3L<<(side))) | ((n ## L)<<(side)))
 507 #define MODE ((state->s0 & 0xe0000000L) >> 29)
 508 #define ENTER_MODE(m) (state->s0 = (state->s0 & ~0xe0000000L) | ((m)<<29))
 509 #define SINGLE_SHIFT(n) ENTER_MODE(SS2CHAR - 2 + (n))
 510 #define ASSERT_IDLE do {                                                \
 511         if (state->s0 != 0) emit(emitctx, ERROR);                       \
 512         state->s0 = 0;                                                  \
 513 } while (0)
 514
 515     if (state->s1 == 0) {
 516         /*
 517          * Since there's no LS0R, this means we must just have started.
 518          * Set up a sane initial state (LS0, LS1R, ASCII in G0/G1/G2/G3).
 519          */
 520         LOCKING_SHIFT(0, LEFT);
 521         LOCKING_SHIFT(1, RIGHT);
 522         designate(state, 0, mode->ltype, mode->li, mode->lf);
 523         designate(state, 1, mode->rtype, mode->ri, mode->rf);
 524         designate(state, 2, S4, 0, 'B');
 525         designate(state, 3, S4, 0, 'B');
 526     }
 527
 528     if (MODE == DOCSUTF8) {
 529         docs_utf8(input_chr, state, emit, emitctx);
 530         return;
 531     }
 532     if (MODE == DOCSCTEXT) {
 533         docs_ctext(input_chr, state, emit, emitctx);
 534         return;
 535     }
 536
 537     if ((input_chr & 0x60) == 0x00) {
 538         /* C0 or C1 control */
 539         ASSERT_IDLE;
 540         switch (input_chr) {
 541           case ESC:
 542             ENTER_MODE(ESCSEQ);
 543             break;
 544           case LS0:
 545             LOCKING_SHIFT(0, LEFT);
 546             break;
 547           case LS1:
 548             LOCKING_SHIFT(1, LEFT);
 549             break;
 550           case SS2:
 551             SINGLE_SHIFT(2);
 552             break;
 553           case SS3:
 554             SINGLE_SHIFT(3);
 555             break;
 556           default:
 557             emit(emitctx, input_chr);
 558             break;
 559         }
 560     } else if ((input_chr & 0x80) || MODE < ESCSEQ) {
 561         int is_gl = 0;
 562         struct iso2022_subcharset const *subcs;
 563         unsigned container;
 564         long input_7bit;
 565         /*
 566          * Actual data.
 567          * Force idle state if we're in mid escape sequence, or in a
 568          * multi-byte character with a different top bit.
 569          */
 570         if (MODE >= ESCSEQ ||
 571             ((state->s0 & 0x00ff0000L) != 0 &&
 572              (((state->s0 >> 16) ^ input_chr) & 0x80)))
 573             ASSERT_IDLE;
 574         if (MODE == SS2CHAR || MODE == SS3CHAR) /* Single-shift */
 575             container = MODE - SS2CHAR + 2;
 576         else if (input_chr >= 0x80) /* GR */
 577             container = (state->s1 >> 28) & 3;
 578         else { /* GL */
 579             container = state->s1 >> 30;
 580             is_gl = 1;
 581         }
 582         input_7bit = input_chr & ~0x80;
 583         subcs = &iso2022_subcharsets[(state->s1 >> (container * 7)) & 0x7f];
 584         if ((subcs->type == S4 || subcs->type == M4) &&
 585             (input_7bit == 0x20 || input_7bit == 0x7f)) {
 586             /* characters not in 94-char set */
 587             if (is_gl) emit(emitctx, input_7bit);
 588             else emit(emitctx, ERROR);
 589         } else if (subcs->type == M4 || subcs->type == M6) {
 590             if ((state->s0 & 0x00ff0000L) == 0) {
 591                 state->s0 |= input_chr << 16;
 592                 return;
 593             } else {
 594                 emit(emitctx,
 595                      subcs->from_dbcs(((state->s0 >> 16) & 0x7f) +
 596                                       subcs->offset,
 597                                       input_7bit + subcs->offset));
 598             }
 599         } else {
 600             if ((state->s0 & 0x00ff0000L) != 0)
 601                 emit(emitctx, ERROR);
 602             emit(emitctx, subcs->sbcs_base ?
 603                  sbcs_to_unicode(subcs->sbcs_base, input_7bit + subcs->offset):
 604                  ERROR);
 605         }
 606         state->s0 = 0;
 607     } else {
 608         unsigned i1, i2;
 609         if (MODE == ESCPASS) {
 610             emit(emitctx, input_chr);
 611             if ((input_chr & 0xf0) != 0x20)
 612                 ENTER_MODE(IDLE);
 613             return;
 614         }
 615
 616         /*
 617          * Intermediate bytes shall be any of the 16 positions of
 618          * column 02 of the code table; they are denoted by the symbol
 619          * I.
 620          */
 621         if ((input_chr & 0xf0) == 0x20) {
 622             if (((state->s0 >> 16) & 0xff) == 0)
 623                 state->s0 |= input_chr << 16;
 624             else if (((state->s0 >> 8) & 0xff) == 0)
 625                 state->s0 |= input_chr << 8;
 626             else {
 627                 /* Long escape sequence.  Switch to ESCPASS or ESCDROP. */
 628                 i1 = (state->s0 >> 16) & 0xff;
 629                 i2 = (state->s0 >> 8) & 0xff;
 630                 switch (i1) {
 631                   case '(': case ')': case '*': case '+':
 632                   case '-': case '.': case '/':
 633                   case '$':
 634                     ENTER_MODE(ESCDROP);
 635                     break;
 636                   default:
 637                     emit(emitctx, ESC);
 638                     emit(emitctx, i1);
 639                     emit(emitctx, i2);
 640                     emit(emitctx, input_chr);
 641                     state->s0 = 0;
 642                     ENTER_MODE(ESCPASS);
 643                     break;
 644                 }
 645             }
 646             return;
 647         }
 648
 649         /*
 650          * Final bytes shall be any of the 79 positions of columns 03
 651          * to 07 of the code table excluding position 07/15; they are
 652          * denoted by the symbol F.
 653          */
 654         i1 = (state->s0 >> 16) & 0xff;
 655         i2 = (state->s0 >> 8) & 0xff;
 656         if (MODE == ESCDROP)
 657             input_chr = 0; /* Make sure it won't match. */
 658         state->s0 = 0;
 659         switch (i1) {
 660           case 0: /* No intermediate bytes */
 661             switch (input_chr) {
 662               case 'N': /* SS2 */
 663                 SINGLE_SHIFT(2);
 664                 break;
 665               case 'O': /* SS3 */
 666                 SINGLE_SHIFT(3);
 667                 break;
 668               case 'n': /* LS2 */
 669                 LOCKING_SHIFT(2, LEFT);
 670                 break;
 671               case 'o': /* LS3 */
 672                 LOCKING_SHIFT(3, LEFT);
 673                 break;
 674               case '|': /* LS3R */
 675                 LOCKING_SHIFT(3, RIGHT);
 676                 break;
 677               case '}': /* LS2R */
 678                 LOCKING_SHIFT(2, RIGHT);
 679                 break;
 680               case '~': /* LS1R */
 681                 LOCKING_SHIFT(1, RIGHT);
 682                 break;
 683               default:
 684                 /* Unsupported escape sequence.  Spit it back out. */
 685                 emit(emitctx, ESC);
 686                 emit(emitctx, input_chr);
 687             }
 688             break;
 689           case ' ': /* ACS */
 690             /*
 691              * Various coding structure facilities specify that designating
 692              * a code element also invokes it.  As far as I can see, invoking
 693              * it now will have the same practical effect, since those
 694              * facilities also ban the use of locking shifts.
 695              */
 696             switch (input_chr) {
 697               case 'A': /* G0 element used and invoked into GL */
 698                 LOCKING_SHIFT(0, LEFT);
 699                 break;
 700               case 'C': /* G0 in GL, G1 in GR */
 701               case 'D': /* Ditto, at least for 8-bit codes */
 702               case 'L': /* ISO 4873 (ECMA-43) level 1 */
 703               case 'M': /* ISO 4873 (ECMA-43) level 2 */
 704                 LOCKING_SHIFT(0, LEFT);
 705                 LOCKING_SHIFT(1, RIGHT);
 706                 break;
 707             }
 708             break;
 709           case '&': /* IRR */
 710             /*
 711              * IRR (Identify Revised Registration) is ignored here,
 712              * since any revised registration must be
 713              * upward-compatible with the old one, so either we'll
 714              * support the new one or we'll emit ERROR when we run
 715              * into a new character.  In either case, there's nothing
 716              * to be done here.
 717              */
 718             break;
 719           case '(': /* GZD4 */  case ')': /* G1D4 */
 720           case '*': /* G2D4 */  case '+': /* G3D4 */
 721             designate(state, i1 - '(', S4, i2, input_chr);
 722             break;
 723           case '-': /* G1D6 */  case '.': /* G2D6 */  case '/': /* G3D6 */
 724             designate(state, i1 - ',', S6, i2, input_chr);
 725             break;
 726           case '$': /* G?DM? */
 727             switch (i2) {
 728               case 0: /* Obsolete version of GZDM4 */
 729                 i2 = '(';
 730               case '(': /* GZDM4 */  case ')': /* G1DM4 */
 731               case '*': /* G2DM4 */  case '+': /* G3DM4 */
 732                 designate(state, i2 - '(', M4, 0, input_chr);
 733                 break;
 734               case '-': /* G1DM6 */
 735               case '.': /* G2DM6 */  case '/': /* G3DM6 */
 736                 designate(state, i2 - ',', M6, 0, input_chr);
 737                 break;
 738               default:
 739                 emit(emitctx, ERROR);
 740                 break;
 741             }
 742           case '%': /* DOCS */
 743             /* XXX What's a reasonable way to handle an unrecognised DOCS? */
 744             switch (i2) {
 745               case 0:
 746                 switch (input_chr) {
 747                   case 'G':
 748                     ENTER_MODE(DOCSUTF8);
 749                     break;
 750                 }
 751                 break;
 752               case '/':
 753                 switch (input_chr) {
 754                   case '1': case '2':
 755                     ENTER_MODE(DOCSCTEXT);
 756                     break;
 757                 }
 758                 break;
 759             }
 760             break;
 761           default:
 762             /* Unsupported nF escape sequence.  Re-emit it. */
 763             emit(emitctx, ESC);
 764             emit(emitctx, i1);
 765             if (i2) emit(emitctx, i2);
 766             emit(emitctx, input_chr);
 767             break;
 768         }
 769     }
 770 }
 771
 772 static void oselect(charset_state *state, int i, int right,
 773                     void (*emit)(void *ctx, long int output),
 774                     void *emitctx)
 775 {
 776     int shift = (right ? 31-7 : 31-7-7);
 777     struct iso2022_subcharset const *subcs = &iso2022_subcharsets[i];
 778
 779     if (((state->s1 >> shift) & 0x7F) != i) {
 780         state->s1 &= ~(0x7FL << shift);
 781         state->s1 |= (i << shift);
 782
 783         if (emit) {
 784             emit(emitctx, ESC);
 785             if (subcs->type == M4 || subcs->type == M6)
 786                 emit(emitctx, '$');
 787             if (subcs->type == S6 || subcs->type == M6) {
 788                 assert(right);
 789                 emit(emitctx, '-');
 790             } else if (right) {
 791                 emit(emitctx, ')');
 792             } else {
 793                 emit(emitctx, '(');
 794             }
 795             if (subcs->i)
 796                 emit(emitctx, subcs->i);
 797             emit(emitctx, subcs->f);
 798         }
 799     }
 800 }
 801
 802 static void docs_char(charset_state *state,
 803                       void (*emit)(void *ctx, long int output),
 804                       void *emitctx, int cset, char *data, int datalen)
 805 {
 806     int curr_cset, currlen, i;
 807
 808     /*
 809      * cset is the index into ctext_encodings[]. It can also be -1
 810      * to mean DOCS UTF-8, or -2 to mean no DOCS (ordinary 2022).
 811      * In the latter case, `chr' is ignored.
 812      */
 813
 814     /*
 815      * First, terminate a DOCS segment if necessary. We always have
 816      * to terminate a DOCS segment if one is active and we're about
 817      * to switch to a different one; we might also have to
 818      * terminate a length-encoded DOCS segment if we've run out of
 819      * storage space to accumulate characters in it.
 820      */
 821     curr_cset = ((state->s1 >> 14) & 7) - 2;
 822     currlen = ((state->s1 >> 11) & 7);
 823     if ((curr_cset != -2 && curr_cset != cset) ||
 824         (curr_cset >= 0 && currlen + datalen > 5)) {
 825         if (curr_cset == -1) {
 826             /*
 827              * Terminating DOCS UTF-8 is easy.
 828              */
 829             emit(emitctx, ESC);
 830             emit(emitctx, '%');
 831             emit(emitctx, '@');
 832         } else {
 833             int len;
 834
 835             /*
 836              * To terminate a length-encoded DOCS segment we must
 837              * actually output the whole thing.
 838              */
 839             emit(emitctx, ESC);
 840             emit(emitctx, '%');
 841             emit(emitctx, '/');
 842             emit(emitctx, '0' + ctext_encodings[curr_cset].octets_per_char);
 843             len = currlen + datalen +
 844                 strlen(ctext_encodings[curr_cset].name);
 845             assert(len < (1 << 14));
 846             emit(emitctx, 0x80 | ((len >> 7) & 0x7F));
 847             emit(emitctx, 0x80 | ((len     ) & 0x7F));
 848             /* The name stored in ctext_encodings[] includes the trailing \2 */
 849             for (i = 0; ctext_encodings[curr_cset].name[i]; i++)
 850                 emit(emitctx, ctext_encodings[curr_cset].name[i]);
 851             for (i = 0; i < currlen; i++)
 852                 emit(emitctx,
 853                      (i == 0 ? state->s1 : state->s0 >> (8*(4-i))) & 0xFF);
 854             for (i = 0; i < datalen; i++)
 855                 emit(emitctx, data[i]);
 856
 857             /*
 858              * We've now dealt with the input data, so clear it so
 859              * we don't try to do so again below.
 860              */
 861             datalen = 0;
 862         }
 863         curr_cset = -2;
 864     }
 865
 866     /*
 867      * Now, start a DOCS segment if necessary.
 868      */
 869     if (curr_cset != cset) {
 870         assert(cset != -2);
 871         if (cset == -1) {
 872             /*
 873              * Start DOCS UTF-8.
 874              */
 875             emit(emitctx, ESC);
 876             emit(emitctx, '%');
 877             emit(emitctx, 'G');
 878         } else {
 879             /*
 880              * Starting a length-encoded DOCS segment is simply a
 881              * matter of setting our stored length counter to zero.
 882              */
 883             currlen = 0;
 884             state->s1 &= ~(7 << 11);
 885             state->s1 &= ~0xFF;
 886             state->s0 = 0;
 887         }
 888     }
 889     state->s1 &= ~(7 << 14);
 890     assert((cset+2) >= 0 && (cset+2) < 8);
 891     state->s1 |= ((cset+2) << 14);
 892
 893     /*
 894      * Now we're in the right DOCS state. Actually deal with the
 895      * input data, if we haven't already done so above.
 896      */
 897     if (datalen > 0) {
 898         assert(cset != 2);
 899         if (cset == -1) {
 900             /*
 901              * In DOCS UTF-8, we output data as soon as we get it.
 902              */
 903             for (i = 0; i < datalen; i++)
 904                 emit(emitctx, data[i]);
 905         } else {
 906             /*
 907              * In length-encoded DOCS, we just store our data and
 908              * bide our time. It'll all be output when we fill up
 909              * or switch to another character set.
 910              */
 911             assert(currlen + datalen <= 5);   /* overflow handled already */
 912             for (i = 0; i < datalen; i++) {
 913                 if (currlen + i == 0)
 914                     state->s1 |= data[i] & 0xFF;
 915                 else
 916                     state->s0 |= (data[i] & 0xFF) << (8*(4-(currlen+i)));
 917             }
 918             currlen += datalen;
 919             assert(currlen >= 0 && currlen < 8);
 920             state->s1 &= ~(7 << 11);
 921             state->s1 |= (currlen << 11);
 922         }
 923     }
 924 }
 925
 926 static void write_to_pointer(void *ctx, long int output)
 927 {
 928     char **ptr = (char **)ctx;
 929     *(*ptr)++ = output;
 930 }
 931
 932 /*
 933  * Writing full ISO-2022 is not useful in very many circumstances.
 934  * One of the few situations in which it _is_ useful is generating
 935  * X11 COMPOUND_TEXT; therefore, this writing function will obey
 936  * the compound text restrictions and hence output the subset of
 937  * ISO-2022 that's usable in that context.
 938  *
 939  * The subset in question is roughly that we use GL/GR for G0/G1
 940  * always, and that the _only_ escape sequences we output (other
 941  * than the occasional DOCS) are those which designate different
 942  * subcharsets into G0 and G1. There are additional constraints
 943  * about which things go in which container; see below.
 944  *
 945  * FIXME: this wants some decent tests to be written, and also the
 946  * exact output policy for compound text wants thinking about more
 947  * carefully.
 948  */
 949 static int write_iso2022(charset_spec const *charset, long int input_chr,
 950                          charset_state *state,
 951                          void (*emit)(void *ctx, long int output),
 952                          void *emitctx)
 953 {
 954     int i;
 955     struct iso2022_subcharset const *subcs;
 956     struct iso2022_mode const *mode = (struct iso2022_mode *)charset->data;
 957     to_dbcs_planar_t last_planar_dbcs = NULL;
 958     int last_p, last_r, last_c;
 959     long int c1, c2;
 960
 961     /*
 962      * For output, I allocate the state variables as follows:
 963      *
 964      *  s1[31] == 1 if output state has been initialised
 965      *  s1[30:24] == G1 charset (always in GR)
 966      *  s1[23:17] == G0 charset (always in GL)
 967      *  s1[16:14] == DOCS index plus 2 (because -1 and -2 are special)
 968      *  s1[13:11] == number of DOCS accumulated characters (up to five)
 969      *  s1[7:0] + s0[31:0] == DOCS collected characters
 970      */
 971
 972     if (!state->s1) {
 973         state->s0 = 0x00000000UL;
 974         state->s1 = 0x80000000UL;
 975         /*
 976          * Start with US-ASCII in GL and also in GR.
 977          */
 978         for (i = 0; i < lenof(iso2022_subcharsets); i++) {
 979             subcs = &iso2022_subcharsets[i];
 980             if (subcs->type == mode->ltype &&
 981                 subcs->i == mode->li &&
 982                 subcs->f == mode->lf)
 983                 oselect(state, i, FALSE, NULL, NULL);
 984             if (subcs->type == mode->rtype &&
 985                 subcs->i == mode->ri &&
 986                 subcs->f == mode->rf)
 987                 oselect(state, i, TRUE, NULL, NULL);
 988         }
 989     }
 990
 991     if (input_chr == -1) {
 992         /*
 993          * Special case: reset encoding state.
 994          */
 995         docs_char(state, emit, emitctx, -2, NULL, 0);   /* leave DOCS */
 996
 997         for (i = 0; i < lenof(iso2022_subcharsets); i++) {
 998             subcs = &iso2022_subcharsets[i];
 999             if (subcs->type == mode->ltype &&
1000                 subcs->i == mode->li &&
1001                 subcs->f == mode->lf)
1002                 oselect(state, i, FALSE, emit, emitctx);
1003             if (subcs->type == mode->rtype &&
1004                 subcs->i == mode->ri &&
1005                 subcs->f == mode->rf)
1006                 oselect(state, i, TRUE, emit, emitctx);
1007         }
1008         return TRUE;
1009     }
1010
1011     /*
1012      * Special-case characters: Space, Delete, and anything in C0
1013      * or C1 are output unchanged.
1014      */
1015     if (input_chr <= 0x20 || (input_chr >= 0x7F && input_chr < 0xA0)) {
1016         emit(emitctx, input_chr);
1017         return TRUE;
1018     }
1019
1020     /*
1021      * Analyse the input character and work out which subcharset it
1022      * belongs to.
1023      */
1024     for (i = 0; i < lenof(iso2022_subcharsets); i++) {
1025         subcs = &iso2022_subcharsets[i];
1026         if (!(mode->enable_mask & (1 << subcs->enable)))
1027             continue;                  /* this charset is disabled */
1028         if (subcs->sbcs_base) {
1029             c1 = sbcs_from_unicode(subcs->sbcs_base, input_chr);
1030             c1 -= subcs->offset;
1031             if (c1 >= 0x20 && c1 <= 0x7f) {
1032                 c2 = 0;
1033                 break;
1034             }
1035         } else if (subcs->to_dbcs) {
1036             if (subcs->to_dbcs_plane >= 0) {
1037                 /*
1038                  * Since multiplanar DBCSes almost by definition
1039                  * involve several entries in iso2022_subcharsets
1040                  * with the same to_dbcs function and different
1041                  * plane values, we remember the last such function
1042                  * we called and what its result was, so that we
1043                  * don't (for example) have to call
1044                  * unicode_to_cns11643 seven times.
1045                  */
1046                 if (last_planar_dbcs != REPLANARISE(subcs->to_dbcs)) {
1047                     last_planar_dbcs = REPLANARISE(subcs->to_dbcs);
1048                     if (!last_planar_dbcs(input_chr,
1049                                           &last_p, &last_r, &last_c))
1050                         last_p = -1;
1051                 }
1052             } else {
1053                 last_p = subcs->to_dbcs_plane;
1054                 if (!subcs->to_dbcs(input_chr, &last_r, &last_c))
1055                     last_p = 0;        /* cannot match since to_dbcs_plane<0 */
1056             }
1057
1058             if (last_p == subcs->to_dbcs_plane) {
1059                 c1 = last_r - subcs->offset;
1060                 c2 = last_c - subcs->offset;
1061                 assert(c1 >= 0x20 && c1 <= 0x7f);
1062                 assert(c2 >= 0x20 && c2 <= 0x7f);
1063                 break;
1064             }
1065         }
1066     }
1067
1068     if (i < lenof(iso2022_subcharsets)) {
1069         int right;
1070
1071         /*
1072          * Our character is represented by c1 (and possibly also
1073          * c2) in subcharset `subcs'. So now we must decide whether
1074          * to designate that character set into G0/GL or G1/GR.
1075          *
1076          * Any S6 or M6 subcharset has to go in GR because it won't
1077          * fit in GL. In addition, the compound text rules state
1078          * that any single-byte subcharset defined as the
1079          * right-hand half of some SBCS must go in GR.
1080          *
1081          * M4 subcharsets can go in either half according to the
1082          * rules. I choose to put them in GR always because it's a
1083          * simple policy with reasonable behaviour (facilitates
1084          * switching between them and ASCII).
1085          */
1086         right = (subcs->type == S6 || subcs->type == M6 || subcs->type == M4 ||
1087                  (subcs->sbcs_base && subcs->offset == 0x80));
1088
1089         /*
1090          * If we're in a DOCS mode, leave it.
1091          */
1092         docs_char(state, emit, emitctx, -2, NULL, 0);
1093
1094         /*
1095          * If this subcharset is not already selected in that
1096          * container, select it.
1097          */
1098         oselect(state, i, right, emit, emitctx);
1099
1100         /*
1101          * Now emit the actual characters.
1102          */
1103         if (right) {
1104             assert(c1 >= 0x20 && c1 <= 0x7f);
1105             emit(emitctx, c1 | 0x80);
1106             if (c2) {
1107                 assert(c2 >= 0x20 && c2 <= 0x7f);
1108                 emit(emitctx, c2 | 0x80);
1109             }
1110         } else {
1111             assert(c1 > 0x20 && c1 < 0x7f);
1112             emit(emitctx, c1);
1113             if (c2) {
1114                 assert(c2 > 0x20 && c2 < 0x7f);
1115                 emit(emitctx, c2);
1116             }
1117         }
1118
1119         return TRUE;
1120     }
1121
1122     /*
1123      * Fall back to DOCS.
1124      */
1125     {
1126         char data[10];
1127         char *p = data;
1128         int i, cs;
1129
1130         cs = -2;                       /* means failure */
1131
1132         for (i = 0; i <= lenof(ctext_encodings); i++) {
1133             charset_state substate;
1134             charset_spec const *subcs = ctext_encodings[i].subcs;
1135
1136             /*
1137              * We assume that all character sets dealt with by DOCS
1138              * are stateless for output purposes.
1139              */
1140             substate.s1 = substate.s0 = 0;
1141             p = data;
1142
1143             if (i < lenof(ctext_encodings)) {
1144                 if ((mode->enable_mask & (1 << ctext_encodings[i].enable)) &&
1145                     subcs->write(subcs, input_chr, &substate,
1146                                  write_to_pointer, &p)) {
1147                     cs = i;
1148                     break;
1149                 }
1150             } else {
1151                 if ((mode->enable_mask & (1 << CDU)) &&
1152                     write_utf8(NULL, input_chr, NULL, write_to_pointer, &p)) {
1153                     cs = -1;
1154                     break;
1155                 }
1156             }
1157         }
1158
1159         if (cs != -2) {
1160             docs_char(state, emit, emitctx, cs, data, p - data);
1161             return TRUE;
1162         }
1163     }
1164
1165     return FALSE;
1166 }
1167
1168 /*
1169  * Full ISO 2022 output with all options on. Not entirely sure what
1170  * if anything this is useful for, but here it is anyway. All
1171  * output character sets and DOCS variants are permitted; all
1172  * containers start out with ASCII in them.
1173  */
1174 static const struct iso2022_mode iso2022_all = {
1175     (1<<CCS) | (1<<COS) | (1<<CPU) | (1<<CDC) | (1<<CDU),
1176     S4, 0, 'B', S4, 0, 'B',
1177 };
1178
1179 const charset_spec charset_CS_ISO2022 = {
1180     CS_ISO2022, read_iso2022, write_iso2022, &iso2022_all
1181 };
1182
1183 /*
1184  * X11 compound text. A subset of output charsets is permitted, and
1185  * G1/GR starts off in ISO8859-1.
1186  */
1187 static const struct iso2022_mode iso2022_ctext = {
1188     (1<<CCS) | (1<<CDC),
1189     S4, 0, 'B', S6, 0, 'A',
1190 };
1191
1192 const charset_spec charset_CS_CTEXT = {
1193     CS_CTEXT, read_iso2022, write_iso2022, &iso2022_ctext
1194 };
1195
1196 #ifdef TESTMODE
1197
1198 #include <stdio.h>
1199 #include <stdarg.h>
1200 #include <string.h>
1201
1202 int total_errs = 0;
1203
/*
 * Emit callback used by the read tests. `ctx' is the address of a
 * `wchar_t *' cursor: `output' is stored at the cursor, which is
 * then advanced by one element. No bounds checking is done.
 */
void iso2022_emit(void *ctx, long output)
{
    wchar_t **cursor = ctx;

    **cursor = output;
    ++*cursor;
}
1209
1210 void iso2022_read_test(int line, char *input, int inlen, ...)
1211 {
1212     va_list ap;
1213     wchar_t *p, str[512];
1214     int i;
1215     charset_state state;
1216     unsigned long l;
1217
1218     state.s0 = state.s1 = 0;
1219     p = str;
1220
1221     for (i = 0; i < inlen; i++)
1222         read_iso2022(NULL, input[i] & 0xFF, &state, iso2022_emit, &p);
1223
1224     va_start(ap, inlen);
1225     l = 0;
1226     for (i = 0; i < p - str; i++) {
1227         l = va_arg(ap, long int);
1228         if (l == -1) {
1229             printf("%d: correct string shorter than output\n", line);
1230             total_errs++;
1231             break;
1232         }
1233         if (l != str[i]) {
1234             printf("%d: char %d came out as %08x, should be %08lx\n",
1235                     line, i, str[i], l);
1236             total_errs++;
1237         }
1238     }
1239     if (l != -1) {
1240         l = va_arg(ap, long int);
1241         if (l != -1) {
1242             printf("%d: correct string longer than output\n", line);
1243             total_errs++;
1244         }
1245     }
1246     va_end(ap);
1247 }
1248
/*
 * Expands to the first three arguments of iso2022_read_test for the
 * string literal `x': the current source line, the string itself and
 * lenof(x).
 */
#define TESTSTR(x) __LINE__, (x), lenof(x)
1251
1252 int main(void)
1253 {
         /*
          * Test driver: runs a fixed list of decoder test vectors through
          * iso2022_read_test, prints the accumulated total_errs, and
          * returns nonzero if any test reported a mismatch.
          *
          * Each call lists the expected decoded values after the TESTSTR
          * arguments and ends the list with -1, the terminator that
          * iso2022_read_test looks for. The 0 just before it is presumably
          * the string literal's terminating NUL, assuming lenof(x) counts
          * it and it is decoded like any other byte -- confirm against
          * lenof's definition.
          *
          * NOTE(review): iso2022_read_test retrieves these values with
          * va_arg(ap, long int), but unsuffixed constants such as 'J',
          * 0x65E5 and -1 are passed as int. That only matches where int
          * and long have the same size; consider casting the arguments.
          */
1254     printf("read tests beginning\n");
1255     /* Simple test (Emacs sample text for Japanese, in ISO-2022-JP) */
1256     iso2022_read_test(TESTSTR("Japanese (\x1b$BF|K\\8l\x1b(B)\t"
1257                               "\x1b$B$3$s$K$A$O\x1b(B, "
1258                               "\x1b$B%3%s%K%A%O\x1b(B\n"),
1259                       'J','a','p','a','n','e','s','e',' ','(',
1260                       0x65E5, 0x672C, 0x8A9E, ')', '\t',
1261                       0x3053, 0x3093, 0x306b, 0x3061, 0x306f, ',', ' ',
1262                       0x30b3, 0x30f3, 0x30cb, 0x30c1, 0x30cf, '\n', 0, -1);
1263     /* Same thing in EUC-JP (with designations, and half-width katakana) */
1264     iso2022_read_test(TESTSTR("\x1b$)B\x1b*I\x1b$+D"
1265                               "Japanese (\xc6\xfc\xcb\xdc\xb8\xec)\t"
1266                               "\xa4\xb3\xa4\xf3\xa4\xcb\xa4\xc1\xa4\xcf, "
1267                               "\x8e\xba\x8e\xdd\x8e\xc6\x8e\xc1\x8e\xca\n"),
1268                       'J','a','p','a','n','e','s','e',' ','(',
1269                       0x65E5, 0x672C, 0x8A9E, ')', '\t',
1270                       0x3053, 0x3093, 0x306b, 0x3061, 0x306f, ',', ' ',
1271                       0xff7a, 0xff9d, 0xff86, 0xff81, 0xff8a, '\n', 0, -1);
1272     /* Multibyte single-shift */
1273     iso2022_read_test(TESTSTR("\x1b$)B\x1b*I\x1b$+D\x8f\"/!"),
1274                       0x02D8, '!', 0, -1);
1275     /* Non-existent SBCS */
1276     iso2022_read_test(TESTSTR("\x1b(!Zfnord\n"),
1277                       ERROR, ERROR, ERROR, ERROR, ERROR, '\n', 0, -1);
1278     /* Pass-through of ordinary escape sequences, including a long one */
1279     iso2022_read_test(TESTSTR("\x1b""b\x1b#5\x1b#!!!5"),
1280                       0x1B, 'b', 0x1B, '#', '5',
1281                       0x1B, '#', '!', '!', '!', '5', 0, -1);
1282     /* Non-existent DBCS (also 5-byte escape sequence) */
1283     iso2022_read_test(TESTSTR("\x1b$(!Bfnord!"),
1284                       ERROR, ERROR, ERROR, 0, -1);
1285     /* Incomplete DB characters */
1286     iso2022_read_test(TESTSTR("\x1b$B(,(\x1b(BHi\x1b$B(,(\n"),
1287                       0x2501, ERROR, 'H', 'i', 0x2501, ERROR, '\n', 0, -1);
1288     iso2022_read_test(TESTSTR("\x1b$)B\x1b*I\x1b$+D\xa4""B"),
1289                       ERROR, 'B', 0, -1);
1290     iso2022_read_test(TESTSTR("\x1b$)B\x1b*I\x1b$+D\x0e\x1b|$\xa2\xaf"),
1291                       ERROR, 0x02D8, 0, -1);
1292     /* Incomplete escape sequence */
1293     iso2022_read_test(TESTSTR("\x1b\n"), ERROR, '\n', 0, -1);
1294     iso2022_read_test(TESTSTR("\x1b-A\x1b~\x1b\xa1"), ERROR, 0xa1, 0, -1);
1295     /* Incomplete single-shift */
1296     iso2022_read_test(TESTSTR("\x8e\n"), ERROR, '\n', 0, -1);
1297     iso2022_read_test(TESTSTR("\x1b$*B\x8e(\n"), ERROR, '\n', 0, -1);
1298     /* Corner cases (02/00 and 07/15) */
1299     iso2022_read_test(TESTSTR("\x1b(B\x20\x7f"), 0x20, 0x7f, 0, -1);
1300     iso2022_read_test(TESTSTR("\x1b(I\x20\x7f"), 0x20, 0x7f, 0, -1);
1301     iso2022_read_test(TESTSTR("\x1b$B\x20\x7f"), 0x20, 0x7f, 0, -1);
1302     iso2022_read_test(TESTSTR("\x1b-A\x0e\x20\x7f"), 0xa0, 0xff, 0, -1);
1303     iso2022_read_test(TESTSTR("\x1b$-~\x0e\x20\x7f"), ERROR, 0, -1);
1304     iso2022_read_test(TESTSTR("\x1b)B\xa0\xff"), ERROR, ERROR, 0, -1);
1305     iso2022_read_test(TESTSTR("\x1b)I\xa0\xff"), ERROR, ERROR, 0, -1);
1306     iso2022_read_test(TESTSTR("\x1b$)B\xa0\xff"), ERROR, ERROR, 0, -1);
1307     iso2022_read_test(TESTSTR("\x1b-A\x1b~\xa0\xff"), 0xa0, 0xff, 0, -1);
1308     iso2022_read_test(TESTSTR("\x1b$-~\x1b~\xa0\xff"), ERROR, 0, -1);
1309     /* Designate control sets */
1310     iso2022_read_test(TESTSTR("\x1b!@"), 0x1b, '!', '@', 0, -1);
1311     /* Designate other coding system (UTF-8) */
1312     iso2022_read_test(TESTSTR("\x1b%G"
1313                               "\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5"),
1314                       0x03BA, 0x1F79, 0x03C3, 0x03BC, 0x03B5, 0, -1);
1315     iso2022_read_test(TESTSTR("\x1b-A\x1b%G\xCE\xBA\x1b%@\xa0"),
1316                       0x03BA, 0xA0, 0, -1);
1317     iso2022_read_test(TESTSTR("\x1b%G\xCE\x1b%@"), ERROR, 0, -1);
1318     iso2022_read_test(TESTSTR("\x1b%G\xCE\xBA\x1b%\x1b%@"),
1319                       0x03BA, 0x1B, '%', 0, -1);
1320     /* DOCS (COMPOUND_TEXT extended segment) */
1321     iso2022_read_test(TESTSTR("\x1b%/1\x80\x80"), 0, -1);
1322     iso2022_read_test(TESTSTR("\x1b%/1\x80\x8fiso-8859-15\2xyz\x1b(B"),
1323                       ERROR, ERROR, ERROR, 0, -1);
1324     iso2022_read_test(TESTSTR("\x1b%/1\x80\x8eiso8859-15\2xyz\x1b(B"),
1325                       'x', 'y', 'z', 0, -1);
1326     iso2022_read_test(TESTSTR("\x1b-A\x1b%/2\x80\x89"
1327                               "big5-0\2\xa1\x40\xa1\x40"),
1328                       0x3000, 0xa1, 0x40, 0, -1);
1329     /* Emacs Big5-in-ISO-2022 mapping */
1330     iso2022_read_test(TESTSTR("\x1b$(0&x86\x1b(B  \x1b$(0DeBv"),
1331                       0x5143, 0x6c23, ' ', ' ', 0x958b, 0x767c, 0, -1);
1332     /* Test from RFC 1922 (ISO-2022-CN) */
1333     iso2022_read_test(TESTSTR("\x1b$)A\x0e=;;;\x1b$)GG(_P\x0f"),
1334                       0x4EA4, 0x6362, 0x4EA4, 0x63db, 0, -1);
1335
1336     printf("read tests completed\n");
1337     printf("total: %d errors\n", total_errs);
         /* Exit status 1 if anything failed, 0 otherwise. */
1338     return (total_errs != 0);
1339 }
1340
1341 #endif /* TESTMODE */
1342
1343 #else /* ENUM_CHARSETS */
1344
1345 ENUM_CHARSET(CS_ISO2022)
1346
1347 #endif