mdw@git.distorted.org.uk Git - sgt/charset/blame_incremental

... / ...

Commit	Line	Data
	/*
	 * iso2022.c - support for ISO/IEC 2022 (alias ECMA-35).
	 *
	 * This isn't a complete implementation of ISO/IEC 2022, but it's
	 * close. It only handles decoding, because a fully general encoder
	 * isn't really useful. It can decode 8-bit and 7-bit versions, with
	 * support for single-byte and multi-byte character sets, all four
	 * containers (G0, G1, G2, and G3), using both single-shift and
	 * locking-shift sequences.
	 *
	 * The general principle is that any valid ISO/IEC 2022 sequence
	 * should either be correctly decoded or should emit an ERROR. The
	 * only exception to this is that the C0 and C1 sets are fixed as
	 * those of ISO/IEC 6429. Escape sequences for designating control
	 * sets are passed through, so a post-processor could fix them up if
	 * necessary.
	 *
	 * DOCS to UTF-8 works. Other DOCS sequences are ignored, which will
	 * produce surprising results.
	 */
	21
	22	#ifndef ENUM_CHARSETS
	23
	24	#include <assert.h>
	25
	26	#include "charset.h"
	27	#include "internal.h"
	28	#include "sbcsdat.h"
	29
	/*
	 * Control characters that the decoder interprets itself instead of
	 * passing through: the two 7-bit locking shifts, the escape
	 * introducer, and the two 8-bit single shifts.
	 */
	#define LS1 (0x0E)      /* locking shift: G1 into GL */
	#define LS0 (0x0F)      /* locking shift: G0 into GL */
	#define ESC (0x1B)      /* starts an escape sequence */
	#define SS2 (0x8E)      /* single shift: next character from G2 */
	#define SS3 (0x8F)      /* single shift: next character from G3 */

	/*
	 * Kinds of graphic character set, as stored in the `type' field of
	 * struct iso2022_subcharset.
	 */
	enum {
	    S4,                 /* single-byte 94-character set */
	    S6,                 /* single-byte 96-character set */
	    M4,                 /* multi-byte 94^n-character set */
	    M6                  /* multi-byte 96^n-character set */
	};
	37
	/*
	 * Forward declarations of the conversion helpers defined later in
	 * this file, so that the subcharset table can refer to them.
	 */
	static long int emacs_big5_1_to_unicode(int r, int c);
	static long int emacs_big5_2_to_unicode(int r, int c);
	static int unicode_to_emacs_big5(long int unicode, int *p, int *r, int *c);
	static long int cns11643_1_to_unicode(int r, int c);
	static long int cns11643_2_to_unicode(int r, int c);
	static long int cns11643_3_to_unicode(int r, int c);
	static long int cns11643_4_to_unicode(int r, int c);
	static long int cns11643_5_to_unicode(int r, int c);
	static long int cns11643_6_to_unicode(int r, int c);
	static long int cns11643_7_to_unicode(int r, int c);
	static long int null_dbcs_to_unicode(int r, int c);
	static int unicode_to_null_dbcs(long int unicode, int *r, int *c);

	/*
	 * Unicode -> double-byte converters. The plain form returns a
	 * (row, column) pair through its two pointers; the planar form
	 * additionally returns a plane number through its first pointer.
	 */
	typedef int (*to_dbcs_t)(long int, int *, int *);
	typedef int (*to_dbcs_planar_t)(long int, int *, int *, int *);

	/*
	 * Cast between to_dbcs_planar_t and to_dbcs_t, type-checking first:
	 * the discarded comparison makes the compiler complain if the
	 * argument is not already of the expected source type.
	 */
	#define DEPLANARISE(x) ( (x) == (to_dbcs_planar_t)NULL, (to_dbcs_t)(x) )
	#define REPLANARISE(x) ( (x) == (to_dbcs_t)NULL, (to_dbcs_planar_t)(x) )
	57
	/*
	 * Values used in the `enable' field. Each of these identifies a
	 * class of character sets; we then have a bitmask indicating which
	 * classes are allowable in a given mode. A class is allowed when
	 * bit (1 << class) is set in the mode's enable_mask.
	 *
	 * These values are currently only checked on output: for input,
	 * any ISO 2022 we can comprehend at all is considered acceptable.
	 */
	#define CCS  1          /* CTEXT standard */
	#define COS  2          /* other standard */
	#define CPU  3          /* private use */
	#define CDC  4          /* DOCS for CTEXT */
	#define CDU  5          /* DOCS for UTF-8 */
	#define CNU 31          /* never used */
	72
	/*
	 * Per-charset configuration, reached through charset_spec.data.
	 */
	struct iso2022_mode {
	    int enable_mask;    /* bit (1 << class) set for each permitted class */
	    char ltype;         /* kind, intermediate and final byte of the set */
	    char li;            /*   initially designated to G0 (left side)     */
	    char lf;
	    char rtype;         /* likewise for the set initially designated    */
	    char ri;            /*   to G1 (right side)                         */
	    char rf;
	};
	77
	78	const struct iso2022_subcharset {
	79	char type, i, f, enable;
	80	int offset;
	81	const sbcs_data *sbcs_base;
	82	long int (*from_dbcs)(int, int);
	83
	84	/*
	85	* If to_dbcs_plane < 0, then to_dbcs is used as expected.
	86	* However, if to_dbcs_plane >= 0, then to_dbcs is expected to
	87	* be cast to a to_dbcs_planar_t before use, and the returned
	88	* plane value (the first int *) must equal to_dbcs_plane.
	89	*
	90	* I'd have preferred to do this by means of a union, but you
	91	* can't initialise a selected field of a union at compile
	92	* time. Function pointer casts are guaranteed to work sensibly
	93	* in ISO C (that is, it's undefined what happens if you call a
	94	* function via the wrong type of pointer, but if you cast it
	95	* back to the right type before calling it then it must work),
	96	* so this is safe if ugly.
	97	*/
	98	to_dbcs_t to_dbcs;
	99	int to_dbcs_plane; /* use to_dbcs_planar iff >= 0 */
	100	} iso2022_subcharsets[] = {
	101	/*
	102	* We list these subcharsets in preference order for output.
	103	* Since the best-defined use of ISO 2022 output is compound
	104	* text, we'll use a preference order which matches that. So we
	105	* begin with the charsets defined in the compound text spec.
	106	*/
	107	{ S4, 0, 'B', CCS, 0x00, &sbcsdata_CS_ASCII },
	108	{ S6, 0, 'A', CCS, 0x80, &sbcsdata_CS_ISO8859_1 },
	109	{ S6, 0, 'B', CCS, 0x80, &sbcsdata_CS_ISO8859_2 },
	110	{ S6, 0, 'C', CCS, 0x80, &sbcsdata_CS_ISO8859_3 },
	111	{ S6, 0, 'D', CCS, 0x80, &sbcsdata_CS_ISO8859_4 },
	112	{ S6, 0, 'F', CCS, 0x80, &sbcsdata_CS_ISO8859_7 },
	113	{ S6, 0, 'G', CCS, 0x80, &sbcsdata_CS_ISO8859_6 },
	114	{ S6, 0, 'H', CCS, 0x80, &sbcsdata_CS_ISO8859_8 },
	115	{ S6, 0, 'L', CCS, 0x80, &sbcsdata_CS_ISO8859_5 },
	116	{ S6, 0, 'M', CCS, 0x80, &sbcsdata_CS_ISO8859_9 },
	117	{ S4, 0, 'I', CCS, 0x80, &sbcsdata_CS_JISX0201 },
	118	{ S4, 0, 'J', CCS, 0x00, &sbcsdata_CS_JISX0201 },
	119	{ M4, 0, 'A', CCS, -0x21, 0, &gb2312_to_unicode, &unicode_to_gb2312, -1 },
	120	{ M4, 0, 'B', CCS, -0x21, 0, &jisx0208_to_unicode, &unicode_to_jisx0208, -1 },
	121	{ M4, 0, 'C', CCS, -0x21, 0, &ksx1001_to_unicode, &unicode_to_ksx1001, -1 },
	122	{ M4, 0, 'D', CCS, -0x21, 0, &jisx0212_to_unicode, &unicode_to_jisx0212, -1 },
	123
	124	/*
	125	* Next, other reasonably standard things: the rest of the ISO
	126	* 8859 sets, UK-ASCII, and CNS 11643.
	127	*/
	128	{ S6, 0, 'T', COS, 0x80, &sbcsdata_CS_ISO8859_11 },
	129	{ S6, 0, 'V', COS, 0x80, &sbcsdata_CS_ISO8859_10 },
	130	{ S6, 0, 'Y', COS, 0x80, &sbcsdata_CS_ISO8859_13 },
	131	{ S6, 0, '_', COS, 0x80, &sbcsdata_CS_ISO8859_14 },
	132	{ S6, 0, 'b', COS, 0x80, &sbcsdata_CS_ISO8859_15 },
	133	{ S6, 0, 'f', COS, 0x80, &sbcsdata_CS_ISO8859_16 },
	134	{ S4, 0, 'A', COS, 0x00, &sbcsdata_CS_BS4730 },
	135	{ M4, 0, 'G', COS, -0x21, 0, &cns11643_1_to_unicode, DEPLANARISE(&unicode_to_cns11643), 0 },
	136	{ M4, 0, 'H', COS, -0x21, 0, &cns11643_2_to_unicode, DEPLANARISE(&unicode_to_cns11643), 1 },
	137	{ M4, 0, 'I', COS, -0x21, 0, &cns11643_3_to_unicode, DEPLANARISE(&unicode_to_cns11643), 2 },
	138	{ M4, 0, 'J', COS, -0x21, 0, &cns11643_4_to_unicode, DEPLANARISE(&unicode_to_cns11643), 3 },
	139	{ M4, 0, 'K', COS, -0x21, 0, &cns11643_5_to_unicode, DEPLANARISE(&unicode_to_cns11643), 4 },
	140	{ M4, 0, 'L', COS, -0x21, 0, &cns11643_6_to_unicode, DEPLANARISE(&unicode_to_cns11643), 5 },
	141	{ M4, 0, 'M', COS, -0x21, 0, &cns11643_7_to_unicode, DEPLANARISE(&unicode_to_cns11643), 6 },
	142
	143	/*
	144	* Private-use designations: DEC private sets and Emacs's Big5
	145	* abomination.
	146	*/
	147	{ S4, 0, '0', CPU, 0x00, &sbcsdata_CS_DEC_GRAPHICS },
	148	{ S4, 0, '<', CPU, 0x80, &sbcsdata_CS_DEC_MCS },
	149	{ M4, 0, '0', CPU, -0x21, 0, &emacs_big5_1_to_unicode, DEPLANARISE(&unicode_to_emacs_big5), 1 },
	150	{ M4, 0, '1', CPU, -0x21, 0, &emacs_big5_2_to_unicode, DEPLANARISE(&unicode_to_emacs_big5), 2 },
	151
	152	/*
	153	* Ben left this conditioned out without explanation,
	154	* presumably on the grounds that we don't have a translation
	155	* table for it.
	156	*/
	157	#if 0
	158	{ M4, 0, '@', CNU }, /* JIS C 6226-1978 */
	159	#endif
	160
	161	/*
	162	* Finally, fallback entries for null character sets.
	163	*/
	164	{ S4, 0, '~', CNU },
	165	{ S6, 0, '~', CNU }, /* empty 96-set */
	166	{ M4, 0, '~', CNU, 0, 0, &null_dbcs_to_unicode, &unicode_to_null_dbcs, -1 }, /* empty 94^n-set */
	167	{ M6, 0, '~', CNU, 0, 0, &null_dbcs_to_unicode, &unicode_to_null_dbcs, -1 }, /* empty 96^n-set */
	168	};
	169
	170	static long int null_dbcs_to_unicode(int r, int c)
	171	{
	172	return ERROR;
	173	}
	/*
	 * Encoder for the empty multi-byte sets: no character is
	 * representable, so this always reports failure (0) and leaves *r
	 * and *c untouched. The pointer parameters make the signature match
	 * to_dbcs_t, which is the slot this function is stored in.
	 */
	static int unicode_to_null_dbcs(long int unicode, int *r, int *c)
	{
	    (void)unicode;
	    (void)r;
	    (void)c;
	    return 0;                   /* failed to convert anything */
	}
	178
	/*
	 * Emacs encodes Big5 in COMPOUND_TEXT as two 94x94 character sets.
	 * We treat Big5 as a 94x191 character set with a bunch of undefined
	 * columns in the middle, so we have to mess around a bit to make
	 * things fit.
	 */

	/*
	 * Decode a (row, column) position in the first Emacs Big5 set.
	 *
	 * The position is linearised over 94 columns, re-split into rows of
	 * 157 used columns, and the column is then widened past the 34-column
	 * gap before being handed to big5_to_unicode().
	 */
	static long int emacs_big5_1_to_unicode(int r, int c)
	{
	    unsigned long linear = r * 94 + c;
	    int row = linear / 157;
	    int col = linear % 157;

	    if (col >= 64)
	        col += 34;              /* Skip over the gap */
	    return big5_to_unicode(row, col);
	}
	195
	/*
	 * Decode a (row, column) position in the second Emacs Big5 set.
	 *
	 * Identical to emacs_big5_1_to_unicode() except that the resulting
	 * Big5 row is offset by 40, i.e. the second set continues where the
	 * first 40 rows of 157 columns leave off.
	 */
	static long int emacs_big5_2_to_unicode(int r, int c)
	{
	    unsigned long linear = r * 94 + c;
	    int row = linear / 157 + 40;
	    int col = linear % 157;

	    if (col >= 64)
	        col += 34;              /* Skip over the gap */
	    return big5_to_unicode(row, col);
	}
	205
	/*
	 * Encode a Unicode character as a position in one of the two Emacs
	 * Big5 sets; the inverse of emacs_big5_{1,2}_to_unicode().
	 *
	 * Returns 0 if unicode_to_big5() cannot represent the character.
	 * Otherwise returns 1 and stores the set number (1 or 2) in *p and
	 * the 94x94 row and column in *r and *c. All three are outputs, so
	 * they must be pointers (matching to_dbcs_planar_t).
	 */
	static int unicode_to_emacs_big5(long int unicode, int *p, int *r, int *c)
	{
	    int rr, cc, s;

	    if (!unicode_to_big5(unicode, &rr, &cc))
	        return 0;

	    /* Close up the 34-column gap so each row has 157 used columns. */
	    if (cc >= 64) {
	        cc -= 34;
	        assert(cc >= 64);       /* a column inside the gap is invalid */
	    }

	    /* Linearise, then pick the set: rows 40 and up go in the second. */
	    s = rr * 157 + cc;
	    if (s >= 40*157) {
	        *p = 2;
	        s -= 40*157;
	    } else {
	        *p = 1;
	    }
	    *r = s / 94;
	    *c = s % 94;
	    return 1;
	}
	226
	/*
	 * Wrappers for cns11643_to_unicode(): one two-argument decoder per
	 * CNS 11643 plane, so that each plane fits the from_dbcs slot of the
	 * subcharset table. cns11643_N_to_unicode decodes plane index N-1.
	 */
	#define CNS11643_PLANE_READER(fn, plane) \
	    static long int fn(int r, int c) \
	    { \
	        return cns11643_to_unicode((plane), r, c); \
	    }

	CNS11643_PLANE_READER(cns11643_1_to_unicode, 0)
	CNS11643_PLANE_READER(cns11643_2_to_unicode, 1)
	CNS11643_PLANE_READER(cns11643_3_to_unicode, 2)
	CNS11643_PLANE_READER(cns11643_4_to_unicode, 3)
	CNS11643_PLANE_READER(cns11643_5_to_unicode, 4)
	CNS11643_PLANE_READER(cns11643_6_to_unicode, 5)
	CNS11643_PLANE_READER(cns11643_7_to_unicode, 6)

	#undef CNS11643_PLANE_READER
	256
	/*
	 * States, or "what we're currently accumulating". The value is kept
	 * in the top three bits of s0, so there is room for exactly eight.
	 * The order matters: the decoder tests `MODE < ESCSEQ' and
	 * `MODE >= ESCSEQ', and derives the single-shift states
	 * arithmetically from SS2CHAR.
	 */
	enum {
	    IDLE,               /* None of the below */
	    SS2CHAR,            /* Accumulating a character after SS2 */
	    SS3CHAR,            /* Accumulating a character after SS3 */
	    ESCSEQ,             /* Accumulating an escape sequence */
	    ESCDROP,            /* Discarding an escape sequence */
	    ESCPASS,            /* Passing through an escape sequence */
	    DOCSUTF8,           /* DOCSed into UTF-8 */
	    DOCSCTEXT           /* DOCSed into a COMPOUND_TEXT extended segment */
	};
	268
	#if 0
	#include <stdio.h>
	/*
	 * Debugging aid (compiled out): print the decoder state to stderr.
	 * Shows the state enum and the three accumulator bytes from s0, then
	 * the GL/GR locking shifts and the four container indices from s1.
	 */
	static void dump_state(charset_state *s)
	{
	    unsigned s0 = s->s0, s1 = s->s1;
	    /* One name per state enum value; s0 >> 29 can be anything in 0..7. */
	    char const * const modes[] = { "IDLE", "SS2CHAR", "SS3CHAR",
	                                   "ESCSEQ", "ESCDROP", "ESCPASS",
	                                   "DOCSUTF8", "DOCSCTEXT" };

	    fprintf(stderr, "s0: %s", modes[s0 >> 29]);
	    fprintf(stderr, " %02x %02x %02x ", (s0 >> 16) & 0xff, (s0 >> 8) & 0xff,
	            s0 & 0xff);
	    fprintf(stderr, "s1: LS%d LS%dR", (s1 >> 30) & 3, (s1 >> 28) & 3);
	    fprintf(stderr, " %d %d %d %d\n", s1 & 0x7f, (s1 >> 7) & 0x7f,
	            (s1 >> 14) & 0x7f, (s1 >> 21) & 0x7f);
	}
	#endif
	286
	287	static void designate(charset_state *state, int container,
	288	int type, int ibyte, int fbyte)
	289	{
	290	unsigned long i;
	291
	292	assert(container >= 0 && container <= 3);
	293	assert(type == S4 \|\| type == S6 \|\| type == M4 \|\| type == M6);
	294
	295	for (i = 0; i < lenof(iso2022_subcharsets); i++) {
	296	if (iso2022_subcharsets[i].type == type &&
	297	iso2022_subcharsets[i].i == ibyte &&
	298	iso2022_subcharsets[i].f == fbyte) {
	299	state->s1 &= ~(0x7fL << (container * 7));
	300	state->s1 \|= (i << (container * 7));
	301	return;
	302	}
	303	}
	304	/*
	305	* If we don't find the charset, invoke the empty one, so we
	306	* output ERROR rather than garbage.
	307	*/
	308	designate(state, container, type, 0, '~');
	309	}
	310
	311	static void do_utf8(long int input_chr,
	312	charset_state *state,
	313	void (emit)(void ctx, long int output),
	314	void *emitctx)
	315	{
	316	charset_state ustate;
	317
	318	ustate.s1 = 0;
	319	ustate.s0 = state->s0 & 0x03ffffffL;
	320	read_utf8(NULL, input_chr, &ustate, emit, emitctx);
	321	state->s0 = (state->s0 & ~0x03ffffffL) \| (ustate.s0 & 0x03ffffffL);
	322	}
	323
	324	static void docs_utf8(long int input_chr,
	325	charset_state *state,
	326	void (emit)(void ctx, long int output),
	327	void *emitctx)
	328	{
	329	int retstate;
	330
	331	/*
	332	* Bits [25:0] of s0 are reserved for read_utf8().
	333	* Bits [27:26] are a tiny state machine to recognise ESC % @.
	334	*/
	335	retstate = (state->s0 & 0x0c000000L) >> 26;
	336	if (retstate == 1 && input_chr == '%')
	337	retstate = 2;
	338	else if (retstate == 2 && input_chr == '@') {
	339	/* If we've got a partial UTF-8 sequence, complain. */
	340	if (state->s0 & 0x03ffffffL)
	341	emit(emitctx, ERROR);
	342	state->s0 = 0;
	343	return;
	344	} else {
	345	if (retstate >= 1) do_utf8(ESC, state, emit, emitctx);
	346	if (retstate >= 2) do_utf8('%', state, emit, emitctx);
	347	retstate = 0;
	348	if (input_chr == ESC)
	349	retstate = 1;
	350	else {
	351	do_utf8(input_chr, state, emit, emitctx);
	352	}
	353	}
	354	state->s0 = (state->s0 & ~0x0c000000L) \| (retstate << 26);
	355	}
	356
	357	struct ctext_encoding {
	358	char const *name;
	359	char octets_per_char, enable;
	360	charset_spec const *subcs;
	361	};
	362
	363	/*
	364	* In theory, this list is in <ftp://ftp.x.org/pub/DOCS/registry>,
	365	* but XLib appears to have its own ideas, and encodes these three
	366	* (as of X11R6.8.2)
	367	*/
	368
	369	extern charset_spec const charset_CS_ISO8859_14;
	370	extern charset_spec const charset_CS_ISO8859_15;
	371	extern charset_spec const charset_CS_BIG5;
	372
	373	static struct ctext_encoding const ctext_encodings[] = {
	374	{ "big5-0\2", 0 /* variable */, CDC, &charset_CS_BIG5 },
	375	{ "iso8859-14\2", 1, CDC, &charset_CS_ISO8859_14 },
	376	{ "iso8859-15\2", 1, CDC, &charset_CS_ISO8859_15 }
	377	};
	378
	379	static void docs_ctext(long int input_chr,
	380	charset_state *state,
	381	void (emit)(void ctx, long int output),
	382	void *emitctx)
	383	{
	384	/*
	385	* s0[27:26] = first entry in ctext_encodings that matches
	386	* s0[25:22] = number of characters successfully matched, 0xf if all
	387	* s0[21:8] count the number of octets left in the segment
	388	* s0[7:0] are for sub-charset use
	389	*/
	390	int n = (state->s0 >> 22) & 0xf, i = (state->s0 >> 26) & 3, oi = i, j;
	391	int length = (state->s0 >> 8) & 0x3fff;
	392
	393	/*
	394	* Note that we do not bother checking the octets-per-character
	395	* byte against the selected charset when reading. It's
	396	* extremely unlikely that this code will ever have to deal
	397	* with two charset identifiers with the same name and
	398	* different octets-per-character values! If it ever happens,
	399	* we'll have to edit this file anyway so we can modify the
	400	* code then...
	401	*/
	402
	403	if (!length) {
	404	/* Haven't read length yet */
	405	if ((state->s0 & 0xff) == 0)
	406	/* ... or even the first byte */
	407	state->s0 \|= input_chr;
	408	else {
	409	length = (state->s0 & 0x7f) * 0x80 + (input_chr & 0x7f);
	410	if (length == 0)
	411	state->s0 = 0;
	412	else
	413	state->s0 = (state->s0 & 0xf0000000) \| (length << 8);
	414	}
	415	return;
	416	}
	417
	418	j = i;
	419	if (n == 0xe) {
	420	/* Skipping unknown encoding. Look out for STX. */
	421	if (input_chr == 2)
	422	state->s0 = (state->s0 & 0xf0000000) \| (i << 26) \| (0xf << 22);
	423	} else if (n != 0xf) {
	424	while (j < lenof(ctext_encodings) &&
	425	!memcmp(ctext_encodings[j].name,
	426	ctext_encodings[oi].name, n)) {
	427	if (ctext_encodings[j].name[n] < input_chr)
	428	i = ++j;
	429	else
	430	break;
	431	}
	432	if (i >= lenof(ctext_encodings) \|\|
	433	memcmp(ctext_encodings[i].name,
	434	ctext_encodings[oi].name, n) \|\|
	435	ctext_encodings[i].name[n] != input_chr) {
	436	/* Doom! We haven't heard of this encoding */
	437	i = lenof(ctext_encodings);
	438	n = 0xe;
	439	} else {
	440	/*
	441	* Otherwise, we have found an additional character in our
	442	* encoding name. See if we have reached the _end_ of our
	443	* name.
	444	*/
	445	n++;
	446	if (!ctext_encodings[i].name[n])
	447	n = 0xf;
	448	}
	449	/*
	450	* Failing _that_, we simply update our encoding-name-
	451	* tracking state.
	452	*/
	453	assert(i < 4 && n < 16);
	454	state->s0 = (state->s0 & 0xf0000000) \| (i << 26) \| (n << 22);
	455	} else {
	456	if (i >= lenof(ctext_encodings))
	457	emit(emitctx, ERROR);
	458	else {
	459	charset_state substate;
	460	charset_spec const *subcs = ctext_encodings[i].subcs;
	461	substate.s1 = 0;
	462	substate.s0 = state->s0 & 0xff;
	463	subcs->read(subcs, input_chr, &substate, emit, emitctx);
	464	state->s0 = (state->s0 & ~0xff) \| (substate.s0 & 0xff);
	465	}
	466	}
	467	if (!--length)
	468	state->s0 = 0;
	469	else
	470	state->s0 = (state->s0 &~0x003fff00) \| (length << 8);
	471	}
	472
	473	static void read_iso2022(charset_spec const *charset, long int input_chr,
	474	charset_state *state,
	475	void (emit)(void ctx, long int output),
	476	void *emitctx)
	477	{
	478	struct iso2022_mode const mode = (struct iso2022_mode )charset->data;
	479
	480	/* dump_state(state); */
	481	/*
	482	* We have to make fairly efficient use of the 64 bits of state
	483	* available to us. Long-term state goes in s1, and consists of
	484	* the identities of the character sets designated as G0/G1/G2/G3
	485	* and the locking-shift states for GL and GR. Short-term state
	486	* goes in s0: The bottom half of s0 accumulates characters for an
	487	* escape sequence or a multi-byte character, while the top three
	488	* bits indicate what they're being accumulated for. After DOCS,
	489	* the bottom 29 bits of state are available for the DOCS function
	490	* to use -- the UTF-8 one uses the bottom 26 for UTF-8 decoding
	491	* and the top two to recognised ESC % @.
	492	*
	493	* s0[31:29] = state enum
	494	* s0[24:0] = accumulated bytes
	495	* s1[31:30] = GL locking-shift state
	496	* s1[29:28] = GR locking-shift state
	497	* s1[27:21] = G3 charset
	498	* s1[20:14] = G2 charset
	499	* s1[13:7] = G1 charset
	500	* s1[6:0] = G0 charset
	501	*/
	502
	503	#define LEFT 30
	504	#define RIGHT 28
	505	#define LOCKING_SHIFT(n,side) \
	506	(state->s1 = (state->s1 & ~(3L<<(side))) \| ((n ## L)<<(side)))
	507	#define MODE ((state->s0 & 0xe0000000L) >> 29)
	508	#define ENTER_MODE(m) (state->s0 = (state->s0 & ~0xe0000000L) \| ((m)<<29))
	509	#define SINGLE_SHIFT(n) ENTER_MODE(SS2CHAR - 2 + (n))
	510	#define ASSERT_IDLE do { \
	511	if (state->s0 != 0) emit(emitctx, ERROR); \
	512	state->s0 = 0; \
	513	} while (0)
	514
	515	if (state->s1 == 0) {
	516	/*
	517	* Since there's no LS0R, this means we must just have started.
	518	* Set up a sane initial state (LS0, LS1R, ASCII in G0/G1/G2/G3).
	519	*/
	520	LOCKING_SHIFT(0, LEFT);
	521	LOCKING_SHIFT(1, RIGHT);
	522	designate(state, 0, mode->ltype, mode->li, mode->lf);
	523	designate(state, 1, mode->rtype, mode->ri, mode->rf);
	524	designate(state, 2, S4, 0, 'B');
	525	designate(state, 3, S4, 0, 'B');
	526	}
	527
	528	if (MODE == DOCSUTF8) {
	529	docs_utf8(input_chr, state, emit, emitctx);
	530	return;
	531	}
	532	if (MODE == DOCSCTEXT) {
	533	docs_ctext(input_chr, state, emit, emitctx);
	534	return;
	535	}
	536
	537	if ((input_chr & 0x60) == 0x00) {
	538	/* C0 or C1 control */
	539	ASSERT_IDLE;
	540	switch (input_chr) {
	541	case ESC:
	542	ENTER_MODE(ESCSEQ);
	543	break;
	544	case LS0:
	545	LOCKING_SHIFT(0, LEFT);
	546	break;
	547	case LS1:
	548	LOCKING_SHIFT(1, LEFT);
	549	break;
	550	case SS2:
	551	SINGLE_SHIFT(2);
	552	break;
	553	case SS3:
	554	SINGLE_SHIFT(3);
	555	break;
	556	default:
	557	emit(emitctx, input_chr);
	558	break;
	559	}
	560	} else if ((input_chr & 0x80) \|\| MODE < ESCSEQ) {
	561	int is_gl = 0;
	562	struct iso2022_subcharset const *subcs;
	563	unsigned container;
	564	long input_7bit;
	565	/*
	566	* Actual data.
	567	* Force idle state if we're in mid escape sequence, or in a
	568	* multi-byte character with a different top bit.
	569	*/
	570	if (MODE >= ESCSEQ \|\|
	571	((state->s0 & 0x00ff0000L) != 0 &&
	572	(((state->s0 >> 16) ^ input_chr) & 0x80)))
	573	ASSERT_IDLE;
	574	if (MODE == SS2CHAR \|\| MODE == SS3CHAR) /* Single-shift */
	575	container = MODE - SS2CHAR + 2;
	576	else if (input_chr >= 0x80) /* GR */
	577	container = (state->s1 >> 28) & 3;
	578	else { /* GL */
	579	container = state->s1 >> 30;
	580	is_gl = 1;
	581	}
	582	input_7bit = input_chr & ~0x80;
	583	subcs = &iso2022_subcharsets[(state->s1 >> (container * 7)) & 0x7f];
	584	if ((subcs->type == S4 \|\| subcs->type == M4) &&
	585	(input_7bit == 0x20 \|\| input_7bit == 0x7f)) {
	586	/* characters not in 94-char set */
	587	if (is_gl) emit(emitctx, input_7bit);
	588	else emit(emitctx, ERROR);
	589	} else if (subcs->type == M4 \|\| subcs->type == M6) {
	590	if ((state->s0 & 0x00ff0000L) == 0) {
	591	state->s0 \|= input_chr << 16;
	592	return;
	593	} else {
	594	emit(emitctx,
	595	subcs->from_dbcs(((state->s0 >> 16) & 0x7f) +
	596	subcs->offset,
	597	input_7bit + subcs->offset));
	598	}
	599	} else {
	600	if ((state->s0 & 0x00ff0000L) != 0)
	601	emit(emitctx, ERROR);
	602	emit(emitctx, subcs->sbcs_base ?
	603	sbcs_to_unicode(subcs->sbcs_base, input_7bit + subcs->offset):
	604	ERROR);
	605	}
	606	state->s0 = 0;
	607	} else {
	608	unsigned i1, i2;
	609	if (MODE == ESCPASS) {
	610	emit(emitctx, input_chr);
	611	if ((input_chr & 0xf0) != 0x20)
	612	ENTER_MODE(IDLE);
	613	return;
	614	}
	615
	616	/*
	617	* Intermediate bytes shall be any of the 16 positions of
	618	* column 02 of the code table; they are denoted by the symbol
	619	* I.
	620	*/
	621	if ((input_chr & 0xf0) == 0x20) {
	622	if (((state->s0 >> 16) & 0xff) == 0)
	623	state->s0 \|= input_chr << 16;
	624	else if (((state->s0 >> 8) & 0xff) == 0)
	625	state->s0 \|= input_chr << 8;
	626	else {
	627	/* Long escape sequence. Switch to ESCPASS or ESCDROP. */
	628	i1 = (state->s0 >> 16) & 0xff;
	629	i2 = (state->s0 >> 8) & 0xff;
	630	switch (i1) {
	631	case '(': case ')': case '*': case '+':
	632	case '-': case '.': case '/':
	633	case '$':
	634	ENTER_MODE(ESCDROP);
	635	break;
	636	default:
	637	emit(emitctx, ESC);
	638	emit(emitctx, i1);
	639	emit(emitctx, i2);
	640	emit(emitctx, input_chr);
	641	state->s0 = 0;
	642	ENTER_MODE(ESCPASS);
	643	break;
	644	}
	645	}
	646	return;
	647	}
	648
	649	/*
	650	* Final bytes shall be any of the 79 positions of columns 03
	651	* to 07 of the code table excluding position 07/15; they are
	652	* denoted by the symbol F.
	653	*/
	654	i1 = (state->s0 >> 16) & 0xff;
	655	i2 = (state->s0 >> 8) & 0xff;
	656	if (MODE == ESCDROP)
	657	input_chr = 0; /* Make sure it won't match. */
	658	state->s0 = 0;
	659	switch (i1) {
	660	case 0: /* No intermediate bytes */
	661	switch (input_chr) {
	662	case 'N': /* SS2 */
	663	SINGLE_SHIFT(2);
	664	break;
	665	case 'O': /* SS3 */
	666	SINGLE_SHIFT(3);
	667	break;
	668	case 'n': /* LS2 */
	669	LOCKING_SHIFT(2, LEFT);
	670	break;
	671	case 'o': /* LS3 */
	672	LOCKING_SHIFT(3, LEFT);
	673	break;
	674	case '\|': /* LS3R */
	675	LOCKING_SHIFT(3, RIGHT);
	676	break;
	677	case '}': /* LS2R */
	678	LOCKING_SHIFT(2, RIGHT);
	679	break;
	680	case '~': /* LS1R */
	681	LOCKING_SHIFT(1, RIGHT);
	682	break;
	683	default:
	684	/* Unsupported escape sequence. Spit it back out. */
	685	emit(emitctx, ESC);
	686	emit(emitctx, input_chr);
	687	}
	688	break;
	689	case ' ': /* ACS */
	690	/*
	691	* Various coding structure facilities specify that designating
	692	* a code element also invokes it. As far as I can see, invoking
	693	* it now will have the same practical effect, since those
	694	* facilities also ban the use of locking shifts.
	695	*/
	696	switch (input_chr) {
	697	case 'A': /* G0 element used and invoked into GL */
	698	LOCKING_SHIFT(0, LEFT);
	699	break;
	700	case 'C': /* G0 in GL, G1 in GR */
	701	case 'D': /* Ditto, at least for 8-bit codes */
	702	case 'L': /* ISO 4873 (ECMA-43) level 1 */
	703	case 'M': /* ISO 4873 (ECMA-43) level 2 */
	704	LOCKING_SHIFT(0, LEFT);
	705	LOCKING_SHIFT(1, RIGHT);
	706	break;
	707	}
	708	break;
	709	case '&': /* IRR */
	710	/*
	711	* IRR (Identify Revised Registration) is ignored here,
	712	* since any revised registration must be
	713	* upward-compatible with the old one, so either we'll
	714	* support the new one or we'll emit ERROR when we run
	715	* into a new character. In either case, there's nothing
	716	* to be done here.
	717	*/
	718	break;
	719	case '(': /* GZD4 / case ')': / G1D4 */
	720	case '': / G2D4 / case '+': / G3D4 */
	721	designate(state, i1 - '(', S4, i2, input_chr);
	722	break;
	723	case '-': /* G1D6 / case '.': / G2D6 / case '/': / G3D6 */
	724	designate(state, i1 - ',', S6, i2, input_chr);
	725	break;
	726	case '$': /* G?DM? */
	727	switch (i2) {
	728	case 0: /* Obsolete version of GZDM4 */
	729	i2 = '(';
	730	case '(': /* GZDM4 / case ')': / G1DM4 */
	731	case '': / G2DM4 / case '+': / G3DM4 */
	732	designate(state, i2 - '(', M4, 0, input_chr);
	733	break;
	734	case '-': /* G1DM6 */
	735	case '.': /* G2DM6 / case '/': / G3DM6 */
	736	designate(state, i2 - ',', M6, 0, input_chr);
	737	break;
	738	default:
	739	emit(emitctx, ERROR);
	740	break;
	741	}
	742	case '%': /* DOCS */
	743	/* XXX What's a reasonable way to handle an unrecognised DOCS? */
	744	switch (i2) {
	745	case 0:
	746	switch (input_chr) {
	747	case 'G':
	748	ENTER_MODE(DOCSUTF8);
	749	break;
	750	}
	751	break;
	752	case '/':
	753	switch (input_chr) {
	754	case '1': case '2':
	755	ENTER_MODE(DOCSCTEXT);
	756	break;
	757	}
	758	break;
	759	}
	760	break;
	761	default:
	762	/* Unsupported nF escape sequence. Re-emit it. */
	763	emit(emitctx, ESC);
	764	emit(emitctx, i1);
	765	if (i2) emit(emitctx, i2);
	766	emit(emitctx, input_chr);
	767	break;
	768	}
	769	}
	770	}
	771
	772	static void oselect(charset_state *state, int i, int right,
	773	void (emit)(void ctx, long int output),
	774	void *emitctx)
	775	{
	776	int shift = (right ? 31-7 : 31-7-7);
	777	struct iso2022_subcharset const *subcs = &iso2022_subcharsets[i];
	778
	779	if (((state->s1 >> shift) & 0x7F) != i) {
	780	state->s1 &= ~(0x7FL << shift);
	781	state->s1 \|= (i << shift);
	782
	783	if (emit) {
	784	emit(emitctx, ESC);
	785	if (subcs->type == M4 \|\| subcs->type == M6)
	786	emit(emitctx, '$');
	787	if (subcs->type == S6 \|\| subcs->type == M6) {
	788	assert(right);
	789	emit(emitctx, '-');
	790	} else if (right) {
	791	emit(emitctx, ')');
	792	} else {
	793	emit(emitctx, '(');
	794	}
	795	if (subcs->i)
	796	emit(emitctx, subcs->i);
	797	emit(emitctx, subcs->f);
	798	}
	799	}
	800	}
	801
	802	static void docs_char(charset_state *state,
	803	void (emit)(void ctx, long int output),
	804	void emitctx, int cset, char data, int datalen)
	805	{
	806	int curr_cset, currlen, i;
	807
	808	/*
	809	* cset is the index into ctext_encodings[]. It can also be -1
	810	* to mean DOCS UTF-8, or -2 to mean no DOCS (ordinary 2022).
	811	* In the latter case, `chr' is ignored.
	812	*/
	813
	814	/*
	815	* First, terminate a DOCS segment if necessary. We always have
	816	* to terminate a DOCS segment if one is active and we're about
	817	* to switch to a different one; we might also have to
	818	* terminate a length-encoded DOCS segment if we've run out of
	819	* storage space to accumulate characters in it.
	820	*/
	821	curr_cset = ((state->s1 >> 14) & 7) - 2;
	822	currlen = ((state->s1 >> 11) & 7);
	823	if ((curr_cset != -2 && curr_cset != cset) \|\|
	824	(curr_cset >= 0 && currlen + datalen > 5)) {
	825	if (curr_cset == -1) {
	826	/*
	827	* Terminating DOCS UTF-8 is easy.
	828	*/
	829	emit(emitctx, ESC);
	830	emit(emitctx, '%');
	831	emit(emitctx, '@');
	832	} else {
	833	int len;
	834
	835	/*
	836	* To terminate a length-encoded DOCS segment we must
	837	* actually output the whole thing.
	838	*/
	839	emit(emitctx, ESC);
	840	emit(emitctx, '%');
	841	emit(emitctx, '/');
	842	emit(emitctx, '0' + ctext_encodings[curr_cset].octets_per_char);
	843	len = currlen + datalen +
	844	strlen(ctext_encodings[curr_cset].name);
	845	assert(len < (1 << 14));
	846	emit(emitctx, 0x80 \| ((len >> 7) & 0x7F));
	847	emit(emitctx, 0x80 \| ((len ) & 0x7F));
	848	/* The name stored in ctext_encodings[] includes the trailing \2 */
	849	for (i = 0; ctext_encodings[curr_cset].name[i]; i++)
	850	emit(emitctx, ctext_encodings[curr_cset].name[i]);
	851	for (i = 0; i < currlen; i++)
	852	emit(emitctx,
	853	(i == 0 ? state->s1 : state->s0 >> (8*(4-i))) & 0xFF);
	854	for (i = 0; i < datalen; i++)
	855	emit(emitctx, data[i]);
	856
	857	/*
	858	* We've now dealt with the input data, so clear it so
	859	* we don't try to do so again below.
	860	*/
	861	datalen = 0;
	862	}
	863	curr_cset = -2;
	864	}
	865
	866	/*
	867	* Now, start a DOCS segment if necessary.
	868	*/
	869	if (curr_cset != cset) {
	870	assert(cset != -2);
	871	if (cset == -1) {
	872	/*
	873	* Start DOCS UTF-8.
	874	*/
	875	emit(emitctx, ESC);
	876	emit(emitctx, '%');
	877	emit(emitctx, 'G');
	878	} else {
	879	/*
	880	* Starting a length-encoded DOCS segment is simply a
	881	* matter of setting our stored length counter to zero.
	882	*/
	883	currlen = 0;
	884	state->s1 &= ~(7 << 11);
	885	state->s1 &= ~0xFF;
	886	state->s0 = 0;
	887	}
	888	}
	889	state->s1 &= ~(7 << 14);
	890	assert((cset+2) >= 0 && (cset+2) < 8);
	891	state->s1 \|= ((cset+2) << 14);
	892
	893	/*
	894	* Now we're in the right DOCS state. Actually deal with the
	895	* input data, if we haven't already done so above.
	896	*/
	897	if (datalen > 0) {
	898	assert(cset != 2);
	899	if (cset == -1) {
	900	/*
	901	* In DOCS UTF-8, we output data as soon as we get it.
	902	*/
	903	for (i = 0; i < datalen; i++)
	904	emit(emitctx, data[i]);
	905	} else {
	906	/*
	907	* In length-encoded DOCS, we just store our data and
	908	* bide our time. It'll all be output when we fill up
	909	* or switch to another character set.
	910	*/
	911	assert(currlen + datalen <= 5); /* overflow handled already */
	912	for (i = 0; i < datalen; i++) {
	913	if (currlen + i == 0)
	914	state->s1 \|= data[i] & 0xFF;
	915	else
	916	state->s0 \|= (data[i] & 0xFF) << (8*(4-(currlen+i)));
	917	}
	918	currlen += datalen;
	919	assert(currlen >= 0 && currlen < 8);
	920	state->s1 &= ~(7 << 11);
	921	state->s1 \|= (currlen << 11);
	922	}
	923	}
	924	}
	925
	/*
	 * Emit callback that stores into a caller-supplied buffer.
	 *
	 * ctx points at a `char *' cursor: the output value is stored
	 * (narrowed to char) at the cursor, and the cursor is advanced by
	 * one. No bounds checking is done; the caller must supply a buffer
	 * that is large enough.
	 */
	static void write_to_pointer(void *ctx, long int output)
	{
	    char **ptr = (char **)ctx;
	    *(*ptr)++ = output;
	}
	931
	932	/*
	933	* Writing full ISO-2022 is not useful in very many circumstances.
	934	* One of the few situations in which it _is_ useful is generating
	935	* X11 COMPOUND_TEXT; therefore, this writing function will obey
	936	* the compound text restrictions and hence output the subset of
	937	* ISO-2022 that's usable in that context.
	938	*
	939	* The subset in question is roughly that we use GL/GR for G0/G1
	940	* always, and that the _only_ escape sequences we output (other
	941	* than the occasional DOCS) are those which designate different
	942	* subcharsets into G0 and G1. There are additional constraints
	943	* about which things go in which container; see below.
	944	*
	945	* FIXME: this wants some decent tests to be written, and also the
	946	* exact output policy for compound text wants thinking about more
	947	* carefully.
	948	*/
	949	static int write_iso2022(charset_spec const *charset, long int input_chr,
	950	charset_state *state,
	951	void (emit)(void ctx, long int output),
	952	void *emitctx)
	953	{
	954	int i;
	955	struct iso2022_subcharset const *subcs;
	956	struct iso2022_mode const mode = (struct iso2022_mode )charset->data;
	957	to_dbcs_planar_t last_planar_dbcs = NULL;
	958	int last_p, last_r, last_c;
	959	long int c1, c2;
	960
	961	/*
	962	* For output, I allocate the state variables as follows:
	963	*
	964	* s1[31] == 1 if output state has been initialised
	965	* s1[30:24] == G1 charset (always in GR)
	966	* s1[23:17] == G0 charset (always in GL)
	967	* s1[16:14] == DOCS index plus 2 (because -1 and -2 are special)
	968	* s1[13:11] == number of DOCS accumulated characters (up to five)
	969	* s1[7:0] + s0[31:0] == DOCS collected characters
	970	*/
	971
	972	if (!state->s1) {
	973	state->s0 = 0x00000000UL;
	974	state->s1 = 0x80000000UL;
	975	/*
	976	* Start with US-ASCII in GL and also in GR.
	977	*/
	978	for (i = 0; i < lenof(iso2022_subcharsets); i++) {
	979	subcs = &iso2022_subcharsets[i];
	980	if (subcs->type == mode->ltype &&
	981	subcs->i == mode->li &&
	982	subcs->f == mode->lf)
	983	oselect(state, i, FALSE, NULL, NULL);
	984	if (subcs->type == mode->rtype &&
	985	subcs->i == mode->ri &&
	986	subcs->f == mode->rf)
	987	oselect(state, i, TRUE, NULL, NULL);
	988	}
	989	}
	990
	991	if (input_chr == -1) {
	992	/*
	993	* Special case: reset encoding state.
	994	*/
	995	docs_char(state, emit, emitctx, -2, NULL, 0); /* leave DOCS */
	996
	997	for (i = 0; i < lenof(iso2022_subcharsets); i++) {
	998	subcs = &iso2022_subcharsets[i];
	999	if (subcs->type == mode->ltype &&
	1000	subcs->i == mode->li &&
	1001	subcs->f == mode->lf)
	1002	oselect(state, i, FALSE, emit, emitctx);
	1003	if (subcs->type == mode->rtype &&
	1004	subcs->i == mode->ri &&
	1005	subcs->f == mode->rf)
	1006	oselect(state, i, TRUE, emit, emitctx);
	1007	}
	1008	return TRUE;
	1009	}
	1010
	1011	/*
	1012	* Special-case characters: Space, Delete, and anything in C0
	1013	* or C1 are output unchanged.
	1014	*/
	1015	if (input_chr <= 0x20 \|\| (input_chr >= 0x7F && input_chr < 0xA0)) {
	1016	emit(emitctx, input_chr);
	1017	return TRUE;
	1018	}
	1019
	1020	/*
	1021	* Analyse the input character and work out which subcharset it
	1022	* belongs to.
	1023	*/
	1024	for (i = 0; i < lenof(iso2022_subcharsets); i++) {
	1025	subcs = &iso2022_subcharsets[i];
	1026	if (!(mode->enable_mask & (1 << subcs->enable)))
	1027	continue; /* this charset is disabled */
	1028	if (subcs->sbcs_base) {
	1029	c1 = sbcs_from_unicode(subcs->sbcs_base, input_chr);
	1030	c1 -= subcs->offset;
	1031	if (c1 >= 0x20 && c1 <= 0x7f) {
	1032	c2 = 0;
	1033	break;
	1034	}
	1035	} else if (subcs->to_dbcs) {
	1036	if (subcs->to_dbcs_plane >= 0) {
	1037	/*
	1038	* Since multiplanar DBCSes almost by definition
	1039	* involve several entries in iso2022_subcharsets
	1040	* with the same to_dbcs function and different
	1041	* plane values, we remember the last such function
	1042	* we called and what its result was, so that we
	1043	* don't (for example) have to call
	1044	* unicode_to_cns11643 seven times.
	1045	*/
	1046	if (last_planar_dbcs != REPLANARISE(subcs->to_dbcs)) {
	1047	last_planar_dbcs = REPLANARISE(subcs->to_dbcs);
	1048	if (!last_planar_dbcs(input_chr,
	1049	&last_p, &last_r, &last_c))
	1050	last_p = -1;
	1051	}
	1052	} else {
	1053	last_p = subcs->to_dbcs_plane;
	1054	if (!subcs->to_dbcs(input_chr, &last_r, &last_c))
	1055	last_p = 0; /* cannot match since to_dbcs_plane<0 */
	1056	}
	1057
	1058	if (last_p == subcs->to_dbcs_plane) {
	1059	c1 = last_r - subcs->offset;
	1060	c2 = last_c - subcs->offset;
	1061	assert(c1 >= 0x20 && c1 <= 0x7f);
	1062	assert(c2 >= 0x20 && c2 <= 0x7f);
	1063	break;
	1064	}
	1065	}
	1066	}
	1067
	1068	if (i < lenof(iso2022_subcharsets)) {
	1069	int right;
	1070
	1071	/*
	1072	* Our character is represented by c1 (and possibly also
	1073	* c2) in subcharset `subcs'. So now we must decide whether
	1074	* to designate that character set into G0/GL or G1/GR.
	1075	*
	1076	* Any S6 or M6 subcharset has to go in GR because it won't
	1077	* fit in GL. In addition, the compound text rules state
	1078	* that any single-byte subcharset defined as the
	1079	* right-hand half of some SBCS must go in GR.
	1080	*
	1081	* M4 subcharsets can go in either half according to the
	1082	* rules. I choose to put them in GR always because it's a
	1083	* simple policy with reasonable behaviour (facilitates
	1084	* switching between them and ASCII).
	1085	*/
	1086	right = (subcs->type == S6 \|\| subcs->type == M6 \|\| subcs->type == M4 \|\|
	1087	(subcs->sbcs_base && subcs->offset == 0x80));
	1088
	1089	/*
	1090	* If we're in a DOCS mode, leave it.
	1091	*/
	1092	docs_char(state, emit, emitctx, -2, NULL, 0);
	1093
	1094	/*
	1095	* If this subcharset is not already selected in that
	1096	* container, select it.
	1097	*/
	1098	oselect(state, i, right, emit, emitctx);
	1099
	1100	/*
	1101	* Now emit the actual characters.
	1102	*/
	1103	if (right) {
	1104	assert(c1 >= 0x20 && c1 <= 0x7f);
	1105	emit(emitctx, c1 \| 0x80);
	1106	if (c2) {
	1107	assert(c2 >= 0x20 && c2 <= 0x7f);
	1108	emit(emitctx, c2 \| 0x80);
	1109	}
	1110	} else {
	1111	assert(c1 > 0x20 && c1 < 0x7f);
	1112	emit(emitctx, c1);
	1113	if (c2) {
	1114	assert(c2 > 0x20 && c2 < 0x7f);
	1115	emit(emitctx, c2);
	1116	}
	1117	}
	1118
	1119	return TRUE;
	1120	}
	1121
	1122	/*
	1123	* Fall back to DOCS.
	1124	*/
	1125	{
	1126	char data[10];
	1127	char *p = data;
	1128	int i, cs;
	1129
	1130	cs = -2; /* means failure */
	1131
	1132	for (i = 0; i <= lenof(ctext_encodings); i++) {
	1133	charset_state substate;
	1134	charset_spec const *subcs = ctext_encodings[i].subcs;
	1135
	1136	/*
	1137	* We assume that all character sets dealt with by DOCS
	1138	* are stateless for output purposes.
	1139	*/
	1140	substate.s1 = substate.s0 = 0;
	1141	p = data;
	1142
	1143	if (i < lenof(ctext_encodings)) {
	1144	if ((mode->enable_mask & (1 << ctext_encodings[i].enable)) &&
	1145	subcs->write(subcs, input_chr, &substate,
	1146	write_to_pointer, &p)) {
	1147	cs = i;
	1148	break;
	1149	}
	1150	} else {
	1151	if ((mode->enable_mask & (1 << CDU)) &&
	1152	write_utf8(NULL, input_chr, NULL, write_to_pointer, &p)) {
	1153	cs = -1;
	1154	break;
	1155	}
	1156	}
	1157	}
	1158
	1159	if (cs != -2) {
	1160	docs_char(state, emit, emitctx, cs, data, p - data);
	1161	return TRUE;
	1162	}
	1163	}
	1164
	1165	return FALSE;
	1166	}
	1167
	1168	/*
	1169	* Full ISO 2022 output with all options on. Not entirely sure what
	1170	* if anything this is useful for, but here it is anyway. All
	1171	* output character sets and DOCS variants are permitted; all
	1172	* containers start out with ASCII in them.
	1173	*/
	1174	static const struct iso2022_mode iso2022_all = {
	1175	(1<<CCS) \| (1<<COS) \| (1<<CPU) \| (1<<CDC) \| (1<<CDU),
	1176	S4, 0, 'B', S4, 0, 'B',
	1177	};
	1178
	1179	const charset_spec charset_CS_ISO2022 = {
	1180	CS_ISO2022, read_iso2022, write_iso2022, &iso2022_all
	1181	};
	1182
	1183	/*
	1184	* X11 compound text. A subset of output charsets is permitted, and
	1185	* G1/GR starts off in ISO8859-1.
	1186	*/
	1187	static const struct iso2022_mode iso2022_ctext = {
	1188	(1<<CCS) \| (1<<CDC),
	1189	S4, 0, 'B', S6, 0, 'A',
	1190	};
	1191
	1192	const charset_spec charset_CS_CTEXT = {
	1193	CS_CTEXT, read_iso2022, write_iso2022, &iso2022_ctext
	1194	};
	1195
	1196	#ifdef TESTMODE
	1197
	1198	#include <stdio.h>
	1199	#include <stdarg.h>
	1200	#include <string.h>
	1201
	/* Number of mismatches reported so far by iso2022_read_test(). */
	int total_errs = 0;
	1203
	/*
	 * Emit callback for the read tests. `ctx' is the address of a
	 * wchar_t write cursor; the output value is stored at the cursor,
	 * which is then advanced by one element. No bounds checking is
	 * done, so the caller must supply a large enough buffer.
	 */
	void iso2022_emit(void *ctx, long output)
	{
	    wchar_t **p = (wchar_t **)ctx;

	    *(*p)++ = (wchar_t)output;
	}
	1209
	1210	void iso2022_read_test(int line, char *input, int inlen, ...)
	1211	{
	1212	va_list ap;
	1213	wchar_t *p, str[512];
	1214	int i;
	1215	charset_state state;
	1216	unsigned long l;
	1217
	1218	state.s0 = state.s1 = 0;
	1219	p = str;
	1220
	1221	for (i = 0; i < inlen; i++)
	1222	read_iso2022(NULL, input[i] & 0xFF, &state, iso2022_emit, &p);
	1223
	1224	va_start(ap, inlen);
	1225	l = 0;
	1226	for (i = 0; i < p - str; i++) {
	1227	l = va_arg(ap, long int);
	1228	if (l == -1) {
	1229	printf("%d: correct string shorter than output\n", line);
	1230	total_errs++;
	1231	break;
	1232	}
	1233	if (l != str[i]) {
	1234	printf("%d: char %d came out as %08x, should be %08lx\n",
	1235	line, i, str[i], l);
	1236	total_errs++;
	1237	}
	1238	}
	1239	if (l != -1) {
	1240	l = va_arg(ap, long int);
	1241	if (l != -1) {
	1242	printf("%d: correct string longer than output\n", line);
	1243	total_errs++;
	1244	}
	1245	}
	1246	va_end(ap);
	1247	}
	1248
	/*
	 * Macro to concoct the first three parameters of iso2022_read_test:
	 * the current source line, the string, and lenof() of the string.
	 */
	#define TESTSTR(x) __LINE__, x, lenof(x)
	1251
	1252	int main(void)
	1253	{
	1254	printf("read tests beginning\n");
	1255	/* Simple test (Emacs sample text for Japanese, in ISO-2022-JP) */
	1256	iso2022_read_test(TESTSTR("Japanese (\x1b$BF\|K\\8l\x1b(B)\t"
	1257	"\x1b$B$3$s$K$A$O\x1b(B, "
	1258	"\x1b$B%3%s%K%A%O\x1b(B\n"),
	1259	'J','a','p','a','n','e','s','e',' ','(',
	1260	0x65E5, 0x672C, 0x8A9E, ')', '\t',
	1261	0x3053, 0x3093, 0x306b, 0x3061, 0x306f, ',', ' ',
	1262	0x30b3, 0x30f3, 0x30cb, 0x30c1, 0x30cf, '\n', 0, -1);
	1263	/* Same thing in EUC-JP (with designations, and half-width katakana) */
	1264	iso2022_read_test(TESTSTR("\x1b$)B\x1b*I\x1b$+D"
	1265	"Japanese (\xc6\xfc\xcb\xdc\xb8\xec)\t"
	1266	"\xa4\xb3\xa4\xf3\xa4\xcb\xa4\xc1\xa4\xcf, "
	1267	"\x8e\xba\x8e\xdd\x8e\xc6\x8e\xc1\x8e\xca\n"),
	1268	'J','a','p','a','n','e','s','e',' ','(',
	1269	0x65E5, 0x672C, 0x8A9E, ')', '\t',
	1270	0x3053, 0x3093, 0x306b, 0x3061, 0x306f, ',', ' ',
	1271	0xff7a, 0xff9d, 0xff86, 0xff81, 0xff8a, '\n', 0, -1);
	1272	/* Multibyte single-shift */
	1273	iso2022_read_test(TESTSTR("\x1b$)B\x1b*I\x1b$+D\x8f\"/!"),
	1274	0x02D8, '!', 0, -1);
	1275	/* Non-existent SBCS */
	1276	iso2022_read_test(TESTSTR("\x1b(!Zfnord\n"),
	1277	ERROR, ERROR, ERROR, ERROR, ERROR, '\n', 0, -1);
	1278	/* Pass-through of ordinary escape sequences, including a long one */
	1279	iso2022_read_test(TESTSTR("\x1b""b\x1b#5\x1b#!!!5"),
	1280	0x1B, 'b', 0x1B, '#', '5',
	1281	0x1B, '#', '!', '!', '!', '5', 0, -1);
	1282	/* Non-existent DBCS (also 5-byte escape sequence) */
	1283	iso2022_read_test(TESTSTR("\x1b$(!Bfnord!"),
	1284	ERROR, ERROR, ERROR, 0, -1);
	1285	/* Incomplete DB characters */
	1286	iso2022_read_test(TESTSTR("\x1b$B(,(\x1b(BHi\x1b$B(,(\n"),
	1287	0x2501, ERROR, 'H', 'i', 0x2501, ERROR, '\n', 0, -1);
	1288	iso2022_read_test(TESTSTR("\x1b$)B\x1b*I\x1b$+D\xa4""B"),
	1289	ERROR, 'B', 0, -1);
	1290	iso2022_read_test(TESTSTR("\x1b$)B\x1b*I\x1b$+D\x0e\x1b\|$\xa2\xaf"),
	1291	ERROR, 0x02D8, 0, -1);
	1292	/* Incomplete escape sequence */
	1293	iso2022_read_test(TESTSTR("\x1b\n"), ERROR, '\n', 0, -1);
	1294	iso2022_read_test(TESTSTR("\x1b-A\x1b~\x1b\xa1"), ERROR, 0xa1, 0, -1);
	1295	/* Incomplete single-shift */
	1296	iso2022_read_test(TESTSTR("\x8e\n"), ERROR, '\n', 0, -1);
	1297	iso2022_read_test(TESTSTR("\x1b$*B\x8e(\n"), ERROR, '\n', 0, -1);
	1298	/* Corner cases (02/00 and 07/15) */
	1299	iso2022_read_test(TESTSTR("\x1b(B\x20\x7f"), 0x20, 0x7f, 0, -1);
	1300	iso2022_read_test(TESTSTR("\x1b(I\x20\x7f"), 0x20, 0x7f, 0, -1);
	1301	iso2022_read_test(TESTSTR("\x1b$B\x20\x7f"), 0x20, 0x7f, 0, -1);
	1302	iso2022_read_test(TESTSTR("\x1b-A\x0e\x20\x7f"), 0xa0, 0xff, 0, -1);
	1303	iso2022_read_test(TESTSTR("\x1b$-~\x0e\x20\x7f"), ERROR, 0, -1);
	1304	iso2022_read_test(TESTSTR("\x1b)B\xa0\xff"), ERROR, ERROR, 0, -1);
	1305	iso2022_read_test(TESTSTR("\x1b)I\xa0\xff"), ERROR, ERROR, 0, -1);
	1306	iso2022_read_test(TESTSTR("\x1b$)B\xa0\xff"), ERROR, ERROR, 0, -1);
	1307	iso2022_read_test(TESTSTR("\x1b-A\x1b~\xa0\xff"), 0xa0, 0xff, 0, -1);
	1308	iso2022_read_test(TESTSTR("\x1b$-~\x1b~\xa0\xff"), ERROR, 0, -1);
	1309	/* Designate control sets */
	1310	iso2022_read_test(TESTSTR("\x1b!@"), 0x1b, '!', '@', 0, -1);
	1311	/* Designate other coding system (UTF-8) */
	1312	iso2022_read_test(TESTSTR("\x1b%G"
	1313	"\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5"),
	1314	0x03BA, 0x1F79, 0x03C3, 0x03BC, 0x03B5, 0, -1);
	1315	iso2022_read_test(TESTSTR("\x1b-A\x1b%G\xCE\xBA\x1b%@\xa0"),
	1316	0x03BA, 0xA0, 0, -1);
	1317	iso2022_read_test(TESTSTR("\x1b%G\xCE\x1b%@"), ERROR, 0, -1);
	1318	iso2022_read_test(TESTSTR("\x1b%G\xCE\xBA\x1b%\x1b%@"),
	1319	0x03BA, 0x1B, '%', 0, -1);
	1320	/* DOCS (COMPOUND_TEXT extended segment) */
	1321	iso2022_read_test(TESTSTR("\x1b%/1\x80\x80"), 0, -1);
	1322	iso2022_read_test(TESTSTR("\x1b%/1\x80\x8fiso-8859-15\2xyz\x1b(B"),
	1323	ERROR, ERROR, ERROR, 0, -1);
	1324	iso2022_read_test(TESTSTR("\x1b%/1\x80\x8eiso8859-15\2xyz\x1b(B"),
	1325	'x', 'y', 'z', 0, -1);
	1326	iso2022_read_test(TESTSTR("\x1b-A\x1b%/2\x80\x89"
	1327	"big5-0\2\xa1\x40\xa1\x40"),
	1328	0x3000, 0xa1, 0x40, 0, -1);
	1329	/* Emacs Big5-in-ISO-2022 mapping */
	1330	iso2022_read_test(TESTSTR("\x1b$(0&x86\x1b(B \x1b$(0DeBv"),
	1331	0x5143, 0x6c23, ' ', ' ', 0x958b, 0x767c, 0, -1);
	1332	/* Test from RFC 1922 (ISO-2022-CN) */
	1333	iso2022_read_test(TESTSTR("\x1b$)A\x0e=;;;\x1b$)GG(_P\x0f"),
	1334	0x4EA4, 0x6362, 0x4EA4, 0x63db, 0, -1);
	1335
	1336	printf("read tests completed\n");
	1337	printf("total: %d errors\n", total_errs);
	1338	return (total_errs != 0);
	1339	}
	1340
	1341	#endif /* TESTMODE */
	1342
	1343	#else /* ENUM_CHARSETS */
	1344
	1345	ENUM_CHARSET(CS_ISO2022)
	1346
	1347	#endif