mdw@git.distorted.org.uk Git - sgt/charset/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* iso2022.c - support for ISO/IEC 2022 (alias ECMA-35).
	3	*
	4	* This isn't a complete implementation of ISO/IEC 2022, but it's
	5	* close. It can decode 8-bit and 7-bit versions, with support for
	6	* single-byte and multi-byte character sets, all four containers
	7	* (G0, G1, G2, and G3), using both single-shift and locking-shift
	8	* sequences.
	9	*
	10	* The general principle is that any valid ISO/IEC 2022 sequence
	11	* should either be correctly decoded or should emit an ERROR. The
	12	* only exception to this is that the C0 and C1 sets are fixed as
	13	* those of ISO/IEC 6429. Escape sequences for designating control
	14	* sets are passed through, so a post-processor could fix them up if
	15	* necessary.
	16	*
	17	* DOCS to UTF-8 works. Other DOCS sequences are ignored, which will
	18	* produce surprising results.
	19	*/
	20
	21	#ifndef ENUM_CHARSETS
	22
	23	#include <assert.h>
	24	#include <string.h>
	25
	26	#include "charset.h"
	27	#include "internal.h"
	28	#include "sbcsdat.h"
	29
	30	#define LS1 (0x0E)
	31	#define LS0 (0x0F)
	32	#define ESC (0x1B)
	33	#define SS2 (0x8E)
	34	#define SS3 (0x8F)
	35
	36	enum {S4, S6, M4, M6};
	37
	38	static long int emacs_big5_1_to_unicode(int, int);
	39	static long int emacs_big5_2_to_unicode(int, int);
	40	static int unicode_to_emacs_big5(long int, int , int , int *);
	41	static long int cns11643_1_to_unicode(int, int);
	42	static long int cns11643_2_to_unicode(int, int);
	43	static long int cns11643_3_to_unicode(int, int);
	44	static long int cns11643_4_to_unicode(int, int);
	45	static long int cns11643_5_to_unicode(int, int);
	46	static long int cns11643_6_to_unicode(int, int);
	47	static long int cns11643_7_to_unicode(int, int);
	48	static long int null_dbcs_to_unicode(int, int);
	49	static int unicode_to_null_dbcs(long int, int , int );
	50
	51	typedef int (to_dbcs_t)(long int, int , int *);
	52	typedef int (to_dbcs_planar_t)(long int, int , int , int );
	53
	54	/*
	55	* These macros cast between to_dbcs_planar_t and to_dbcs_t, in
	56	* such a way as to cause a compile-time error if the input is not
	57	* of the appropriate type.
	58	*
	59	* Defining these portably is quite fiddly. My first effort was as
	60	* follows:
	61	* #define DEPLANARISE(x) ( (x) == (to_dbcs_planar_t)NULL, (to_dbcs_t)(x) )
	62	*
	63	* so that the comparison on the left of the comma provokes the
	64	* type check error, and the cast on the right is the actual
	65	* desired result.
	66	*
	67	* gcc was entirely happy with this. However, when used in a static
	68	* initialiser, MSVC objected - justifiably - that the first half
	69	* of the comma expression wasn't constant and thus the expression
	70	* as a whole was not a constant expression. We can get round this
	71	* by enclosing the comparison in `sizeof', so that it isn't
	72	* actually evaluated.
	73	*
	74	* But then we run into a second problem, which is that C actually
	75	* disallows the use of the comma operator within a constant
	76	* expression for any purpose at all! Presumably this is on the
	77	* basis that its purpose is to have side effects and constant
	78	* expressions can't; unfortunately, this specific case is one in
	79	* which the desired side effect is a compile-time rather than a
	80	* run-time one.
	81	*
	82	* We are permitted to use ?:, however, and that works quite well
	83	* since the actual result of the sizeof expression _is_ evaluable
	84	* at compile time. So here's my final answer:
	85	*/
	86	#define TYPECHECK(x,y) ( sizeof((x)) == sizeof((x)) ? (y) : (y) )
	87	#define DEPLANARISE(x) TYPECHECK((x) == (to_dbcs_planar_t)NULL, (to_dbcs_t)(x))
	88	#define REPLANARISE(x) TYPECHECK((x) == (to_dbcs_t)NULL, (to_dbcs_planar_t)(x))
	89
	90	/*
	91	* Values used in the `enable' field. Each of these identifies a
	92	* class of character sets; we then have a bitmask indicating which
	93	* classes are allowable in a given mode.
	94	*
	95	* These values are currently only checked on output: for input,
	96	* any ISO 2022 we can comprehend at all is considered acceptable.
	97	*/
	98	#define CCS 1 /* CTEXT standard */
	99	#define COS 2 /* other standard */
	100	#define CPU 3 /* private use */
	101	#define CDC 4 /* DOCS for CTEXT */
	102	#define CDU 5 /* DOCS for UTF-8 */
	103	#define CNU 31 /* never used */
	104
	105	struct iso2022_mode {
	106	int enable_mask;
	107	char ltype, li, lf, rtype, ri, rf;
	108	};
	109
	110	const struct iso2022_subcharset {
	111	char type, i, f, enable;
	112	int offset;
	113	const sbcs_data *sbcs_base;
	114	long int (*from_dbcs)(int, int);
	115
	116	/*
	117	* If to_dbcs_plane < 0, then to_dbcs is used as expected.
	118	* However, if to_dbcs_plane >= 0, then to_dbcs is expected to
	119	* be cast to a to_dbcs_planar_t before use, and the returned
	120	* plane value (the first int *) must equal to_dbcs_plane.
	121	*
	122	* I'd have preferred to do this by means of a union, but you
	123	* can't initialise a selected field of a union at compile
	124	* time. Function pointer casts are guaranteed to work sensibly
	125	* in ISO C (that is, it's undefined what happens if you call a
	126	* function via the wrong type of pointer, but if you cast it
	127	* back to the right type before calling it then it must work),
	128	* so this is safe if ugly.
	129	*/
	130	to_dbcs_t to_dbcs;
	131	int to_dbcs_plane; /* use to_dbcs_planar iff >= 0 */
	132	} iso2022_subcharsets[] = {
	133	/*
	134	* We list these subcharsets in preference order for output.
	135	* Since the best-defined use of ISO 2022 output is compound
	136	* text, we'll use a preference order which matches that. So we
	137	* begin with the charsets defined in the compound text spec.
	138	*/
	139	{ S4, 0, 'B', CCS, 0x00, &sbcsdata_CS_ASCII, NULL, NULL, 0 },
	140	{ S6, 0, 'A', CCS, 0x80, &sbcsdata_CS_ISO8859_1, NULL, NULL, 0 },
	141	{ S6, 0, 'B', CCS, 0x80, &sbcsdata_CS_ISO8859_2, NULL, NULL, 0 },
	142	{ S6, 0, 'C', CCS, 0x80, &sbcsdata_CS_ISO8859_3, NULL, NULL, 0 },
	143	{ S6, 0, 'D', CCS, 0x80, &sbcsdata_CS_ISO8859_4, NULL, NULL, 0 },
	144	{ S6, 0, 'F', CCS, 0x80, &sbcsdata_CS_ISO8859_7, NULL, NULL, 0 },
	145	{ S6, 0, 'G', CCS, 0x80, &sbcsdata_CS_ISO8859_6, NULL, NULL, 0 },
	146	{ S6, 0, 'H', CCS, 0x80, &sbcsdata_CS_ISO8859_8, NULL, NULL, 0 },
	147	{ S6, 0, 'L', CCS, 0x80, &sbcsdata_CS_ISO8859_5, NULL, NULL, 0 },
	148	{ S6, 0, 'M', CCS, 0x80, &sbcsdata_CS_ISO8859_9, NULL, NULL, 0 },
	149	{ S4, 0, 'I', CCS, 0x80, &sbcsdata_CS_JISX0201, NULL, NULL, 0 },
	150	{ S4, 0, 'J', CCS, 0x00, &sbcsdata_CS_JISX0201, NULL, NULL, 0 },
	151	{ M4, 0, 'A', CCS, -0x21, 0, &gb2312_to_unicode, &unicode_to_gb2312, -1 },
	152	{ M4, 0, 'B', CCS, -0x21, 0, &jisx0208_to_unicode, &unicode_to_jisx0208, -1 },
	153	{ M4, 0, 'C', CCS, -0x21, 0, &ksx1001_to_unicode, &unicode_to_ksx1001, -1 },
	154	{ M4, 0, 'D', CCS, -0x21, 0, &jisx0212_to_unicode, &unicode_to_jisx0212, -1 },
	155
	156	/*
	157	* Next, other reasonably standard things: the rest of the ISO
	158	* 8859 sets, UK-ASCII, and CNS 11643.
	159	*/
	160	{ S6, 0, 'T', COS, 0x80, &sbcsdata_CS_ISO8859_11, NULL, NULL, 0 },
	161	{ S6, 0, 'V', COS, 0x80, &sbcsdata_CS_ISO8859_10, NULL, NULL, 0 },
	162	{ S6, 0, 'Y', COS, 0x80, &sbcsdata_CS_ISO8859_13, NULL, NULL, 0 },
	163	{ S6, 0, '_', COS, 0x80, &sbcsdata_CS_ISO8859_14, NULL, NULL, 0 },
	164	{ S6, 0, 'b', COS, 0x80, &sbcsdata_CS_ISO8859_15, NULL, NULL, 0 },
	165	{ S6, 0, 'f', COS, 0x80, &sbcsdata_CS_ISO8859_16, NULL, NULL, 0 },
	166	{ S4, 0, 'A', COS, 0x00, &sbcsdata_CS_BS4730, NULL, NULL, 0 },
	167	{ M4, 0, 'G', COS, -0x21, 0, &cns11643_1_to_unicode, DEPLANARISE(&unicode_to_cns11643), 0 },
	168	{ M4, 0, 'H', COS, -0x21, 0, &cns11643_2_to_unicode, DEPLANARISE(&unicode_to_cns11643), 1 },
	169	{ M4, 0, 'I', COS, -0x21, 0, &cns11643_3_to_unicode, DEPLANARISE(&unicode_to_cns11643), 2 },
	170	{ M4, 0, 'J', COS, -0x21, 0, &cns11643_4_to_unicode, DEPLANARISE(&unicode_to_cns11643), 3 },
	171	{ M4, 0, 'K', COS, -0x21, 0, &cns11643_5_to_unicode, DEPLANARISE(&unicode_to_cns11643), 4 },
	172	{ M4, 0, 'L', COS, -0x21, 0, &cns11643_6_to_unicode, DEPLANARISE(&unicode_to_cns11643), 5 },
	173	{ M4, 0, 'M', COS, -0x21, 0, &cns11643_7_to_unicode, DEPLANARISE(&unicode_to_cns11643), 6 },
	174
	175	/*
	176	* Private-use designations: DEC private sets and Emacs's Big5
	177	* abomination.
	178	*/
	179	{ S4, 0, '0', CPU, 0x00, &sbcsdata_CS_DEC_GRAPHICS, NULL, NULL, 0 },
	180	{ S4, 0, '<', CPU, 0x80, &sbcsdata_CS_DEC_MCS, NULL, NULL, 0 },
	181	{ M4, 0, '0', CPU, -0x21, 0, &emacs_big5_1_to_unicode, DEPLANARISE(&unicode_to_emacs_big5), 1 },
	182	{ M4, 0, '1', CPU, -0x21, 0, &emacs_big5_2_to_unicode, DEPLANARISE(&unicode_to_emacs_big5), 2 },
	183
	184	/*
	185	* Ben left this conditioned out without explanation,
	186	* presumably on the grounds that we don't have a translation
	187	* table for it.
	188	*/
	189	#if 0
	190	{ M4, 0, '@', CNU }, /* JIS C 6226-1978 */
	191	#endif
	192
	193	/*
	194	* Finally, fallback entries for null character sets.
	195	*/
	196	{ S4, 0, '~', CNU, 0, NULL, NULL, NULL, 0 },
	197	{ S6, 0, '~', CNU, 0, NULL, NULL, NULL, 0 }, /* empty 96-set */
	198	{ M4, 0, '~', CNU, 0, 0, &null_dbcs_to_unicode, &unicode_to_null_dbcs, -1 }, /* empty 94^n-set */
	199	{ M6, 0, '~', CNU, 0, 0, &null_dbcs_to_unicode, &unicode_to_null_dbcs, -1 }, /* empty 96^n-set */
	200	};
	201
	202	static long int null_dbcs_to_unicode(int r, int c)
	203	{
	204	UNUSEDARG(r);
	205	UNUSEDARG(c);
	206	return ERROR;
	207	}
	208	static int unicode_to_null_dbcs(long int unicode, int r, int c)
	209	{
	210	UNUSEDARG(unicode);
	211	UNUSEDARG(r);
	212	UNUSEDARG(c);
	213	return 0; /* failed to convert anything */
	214	}
	215
	216	/*
	217	* Emacs encodes Big5 in COMPOUND_TEXT as two 94x94 character sets.
	218	* We treat Big5 as a 94x191 character set with a bunch of undefined
	219	* columns in the middle, so we have to mess around a bit to make
	220	* things fit.
	221	*/
	222
	223	static long int emacs_big5_1_to_unicode(int r, int c)
	224	{
	225	unsigned long s;
	226	s = r * 94 + c;
	227	r = s / 157;
	228	c = s % 157;
	229	if (c >= 64) c += 34; /* Skip over the gap */
	230	return big5_to_unicode(r, c);
	231	}
	232
	233	static long int emacs_big5_2_to_unicode(int r, int c)
	234	{
	235	unsigned long s;
	236	s = r * 94 + c;
	237	r = s / 157 + 40;
	238	c = s % 157;
	239	if (c >= 64) c += 34; /* Skip over the gap */
	240	return big5_to_unicode(r, c);
	241	}
	242
	243	static int unicode_to_emacs_big5(long int unicode, int p, int r, int *c)
	244	{
	245	int rr, cc, s;
	246	if (!unicode_to_big5(unicode, &rr, &cc))
	247	return 0;
	248	if (cc >= 64) {
	249	cc -= 34;
	250	assert(cc >= 64);
	251	}
	252	s = rr * 157 + cc;
	253	if (s >= 40*157) {
	254	*p = 2;
	255	s -= 40*157;
	256	} else {
	257	*p = 1;
	258	}
	259	*r = s / 94;
	260	*c = s % 94;
	261	return 1;
	262	}
	263
	264	/* Wrappers for cns11643_to_unicode() */
	265	static long int cns11643_1_to_unicode(int r, int c)
	266	{
	267	return cns11643_to_unicode(0, r, c);
	268	}
	269	static long int cns11643_2_to_unicode(int r, int c)
	270	{
	271	return cns11643_to_unicode(1, r, c);
	272	}
	273	static long int cns11643_3_to_unicode(int r, int c)
	274	{
	275	return cns11643_to_unicode(2, r, c);
	276	}
	277	static long int cns11643_4_to_unicode(int r, int c)
	278	{
	279	return cns11643_to_unicode(3, r, c);
	280	}
	281	static long int cns11643_5_to_unicode(int r, int c)
	282	{
	283	return cns11643_to_unicode(4, r, c);
	284	}
	285	static long int cns11643_6_to_unicode(int r, int c)
	286	{
	287	return cns11643_to_unicode(5, r, c);
	288	}
	289	static long int cns11643_7_to_unicode(int r, int c)
	290	{
	291	return cns11643_to_unicode(6, r, c);
	292	}
	293
	294	/* States, or "what we're currently accumulating". */
	295	enum {
	296	IDLE, /* None of the below */
	297	SS2CHAR, /* Accumulating a character after SS2 */
	298	SS3CHAR, /* Accumulating a character after SS3 */
	299	ESCSEQ, /* Accumulating an escape sequence */
	300	ESCDROP, /* Discarding an escape sequence */
	301	ESCPASS, /* Passing through an escape sequence */
	302	DOCSUTF8, /* DOCSed into UTF-8 */
	303	DOCSCTEXT /* DOCSed into a COMPOUND_TEXT extended segment */
	304	};
	305
	306	#if 0
	307	#include <stdio.h>
	308	static void dump_state(charset_state *s)
	309	{
	310	unsigned s0 = s->s0, s1 = s->s1;
	311	char const * const modes[] = { "IDLE", "SS2CHAR", "SS3CHAR",
	312	"ESCSEQ", "ESCDROP", "ESCPASS",
	313	"DOCSUTF8" };
	314
	315	fprintf(stderr, "s0: %s", modes[s0 >> 29]);
	316	fprintf(stderr, " %02x %02x %02x ", (s0 >> 16) & 0xff, (s0 >> 8) & 0xff,
	317	s0 & 0xff);
	318	fprintf(stderr, "s1: LS%d LS%dR", (s1 >> 30) & 3, (s1 >> 28) & 3);
	319	fprintf(stderr, " %d %d %d %d\n", s1 & 0x7f, (s1 >> 7) & 0x7f,
	320	(s1 >> 14) & 0x7f, (s1 >> 21) & 0x7f);
	321	}
	322	#endif
	323
	324	static void designate(charset_state *state, int container,
	325	int type, int ibyte, int fbyte)
	326	{
	327	unsigned long i;
	328
	329	assert(container >= 0 && container <= 3);
	330	assert(type == S4 \|\| type == S6 \|\| type == M4 \|\| type == M6);
	331
	332	for (i = 0; i < lenof(iso2022_subcharsets); i++) {
	333	if (iso2022_subcharsets[i].type == type &&
	334	iso2022_subcharsets[i].i == ibyte &&
	335	iso2022_subcharsets[i].f == fbyte) {
	336	state->s1 &= ~(0x7fL << (container * 7));
	337	state->s1 \|= (i << (container * 7));
	338	return;
	339	}
	340	}
	341	/*
	342	* If we don't find the charset, invoke the empty one, so we
	343	* output ERROR rather than garbage.
	344	*/
	345	designate(state, container, type, 0, '~');
	346	}
	347
	348	static void do_utf8(long int input_chr,
	349	charset_state *state,
	350	void (emit)(void ctx, long int output),
	351	void *emitctx)
	352	{
	353	charset_state ustate;
	354
	355	ustate.s1 = 0;
	356	ustate.s0 = state->s0 & 0x03ffffffL;
	357	read_utf8(NULL, input_chr, &ustate, emit, emitctx);
	358	state->s0 = (state->s0 & ~0x03ffffffL) \| (ustate.s0 & 0x03ffffffL);
	359	}
	360
	361	static void docs_utf8(long int input_chr,
	362	charset_state *state,
	363	void (emit)(void ctx, long int output),
	364	void *emitctx)
	365	{
	366	int retstate;
	367
	368	/*
	369	* Bits [25:0] of s0 are reserved for read_utf8().
	370	* Bits [27:26] are a tiny state machine to recognise ESC % @.
	371	*/
	372	retstate = (state->s0 & 0x0c000000L) >> 26;
	373	if (retstate == 1 && input_chr == '%')
	374	retstate = 2;
	375	else if (retstate == 2 && input_chr == '@') {
	376	/* If we've got a partial UTF-8 sequence, complain. */
	377	if (state->s0 & 0x03ffffffL)
	378	emit(emitctx, ERROR);
	379	state->s0 = 0;
	380	return;
	381	} else {
	382	if (retstate >= 1) do_utf8(ESC, state, emit, emitctx);
	383	if (retstate >= 2) do_utf8('%', state, emit, emitctx);
	384	retstate = 0;
	385	if (input_chr == ESC)
	386	retstate = 1;
	387	else {
	388	do_utf8(input_chr, state, emit, emitctx);
	389	}
	390	}
	391	state->s0 = (state->s0 & ~0x0c000000L) \| (retstate << 26);
	392	}
	393
	394	struct ctext_encoding {
	395	char const *name;
	396	char octets_per_char, enable;
	397	charset_spec const *subcs;
	398	};
	399
	400	/*
	401	* In theory, this list is in <ftp://ftp.x.org/pub/DOCS/registry>,
	402	* but XLib appears to have its own ideas, and encodes these three
	403	* (as of X11R6.8.2)
	404	*/
	405
	406	extern charset_spec const charset_CS_ISO8859_14;
	407	extern charset_spec const charset_CS_ISO8859_15;
	408	extern charset_spec const charset_CS_BIG5;
	409
	410	static struct ctext_encoding const ctext_encodings[] = {
	411	{ "big5-0\2", 0 /* variable */, CDC, &charset_CS_BIG5 },
	412	{ "iso8859-14\2", 1, CDC, &charset_CS_ISO8859_14 },
	413	{ "iso8859-15\2", 1, CDC, &charset_CS_ISO8859_15 }
	414	};
	415
	416	static void docs_ctext(long int input_chr,
	417	charset_state *state,
	418	void (emit)(void ctx, long int output),
	419	void *emitctx)
	420	{
	421	/*
	422	* s0[27:26] = first entry in ctext_encodings that matches
	423	* s0[25:22] = number of characters successfully matched, 0xf if all
	424	* s0[21:8] count the number of octets left in the segment
	425	* s0[7:0] are for sub-charset use
	426	*/
	427	int n = (state->s0 >> 22) & 0xf, i = (state->s0 >> 26) & 3, oi = i, j;
	428	int length = (state->s0 >> 8) & 0x3fff;
	429
	430	/*
	431	* Note that we do not bother checking the octets-per-character
	432	* byte against the selected charset when reading. It's
	433	* extremely unlikely that this code will ever have to deal
	434	* with two charset identifiers with the same name and
	435	* different octets-per-character values! If it ever happens,
	436	* we'll have to edit this file anyway so we can modify the
	437	* code then...
	438	*/
	439
	440	if (!length) {
	441	/* Haven't read length yet */
	442	if ((state->s0 & 0xff) == 0)
	443	/* ... or even the first byte */
	444	state->s0 \|= input_chr;
	445	else {
	446	length = (state->s0 & 0x7f) * 0x80 + (input_chr & 0x7f);
	447	if (length == 0)
	448	state->s0 = 0;
	449	else
	450	state->s0 = (state->s0 & 0xf0000000) \| (length << 8);
	451	}
	452	return;
	453	}
	454
	455	j = i;
	456	if (n == 0xe) {
	457	/* Skipping unknown encoding. Look out for STX. */
	458	if (input_chr == 2)
	459	state->s0 = (state->s0 & 0xf0000000) \| (i << 26) \| (0xf << 22);
	460	} else if (n != 0xf) {
	461	while ((unsigned)j < lenof(ctext_encodings) &&
	462	!memcmp(ctext_encodings[j].name,
	463	ctext_encodings[oi].name, n)) {
	464	if (ctext_encodings[j].name[n] < input_chr)
	465	i = ++j;
	466	else
	467	break;
	468	}
	469	if ((unsigned)i >= lenof(ctext_encodings) \|\|
	470	memcmp(ctext_encodings[i].name,
	471	ctext_encodings[oi].name, n) \|\|
	472	ctext_encodings[i].name[n] != input_chr) {
	473	/* Doom! We haven't heard of this encoding */
	474	i = lenof(ctext_encodings);
	475	n = 0xe;
	476	} else {
	477	/*
	478	* Otherwise, we have found an additional character in our
	479	* encoding name. See if we have reached the _end_ of our
	480	* name.
	481	*/
	482	n++;
	483	if (!ctext_encodings[i].name[n])
	484	n = 0xf;
	485	}
	486	/*
	487	* Failing _that_, we simply update our encoding-name-
	488	* tracking state.
	489	*/
	490	assert(i < 4 && n < 16);
	491	state->s0 = (state->s0 & 0xf0000000) \| (i << 26) \| (n << 22);
	492	} else {
	493	if ((unsigned)i >= lenof(ctext_encodings))
	494	emit(emitctx, ERROR);
	495	else {
	496	charset_state substate;
	497	charset_spec const *subcs = ctext_encodings[i].subcs;
	498	substate.s1 = 0;
	499	substate.s0 = state->s0 & 0xff;
	500	subcs->read(subcs, input_chr, &substate, emit, emitctx);
	501	state->s0 = (state->s0 & ~0xff) \| (substate.s0 & 0xff);
	502	}
	503	}
	504	if (!--length)
	505	state->s0 = 0;
	506	else
	507	state->s0 = (state->s0 &~0x003fff00) \| (length << 8);
	508	}
	509
	510	static void read_iso2022(charset_spec const *charset, long int input_chr,
	511	charset_state *state,
	512	void (emit)(void ctx, long int output),
	513	void *emitctx)
	514	{
	515	struct iso2022_mode const mode = (struct iso2022_mode )charset->data;
	516
	517	/* dump_state(state); */
	518	/*
	519	* We have to make fairly efficient use of the 64 bits of state
	520	* available to us. Long-term state goes in s1, and consists of
	521	* the identities of the character sets designated as G0/G1/G2/G3
	522	* and the locking-shift states for GL and GR. Short-term state
	523	* goes in s0: The bottom half of s0 accumulates characters for an
	524	* escape sequence or a multi-byte character, while the top three
	525	* bits indicate what they're being accumulated for. After DOCS,
	526	* the bottom 29 bits of state are available for the DOCS function
	527	* to use -- the UTF-8 one uses the bottom 26 for UTF-8 decoding
	528	* and the top two to recognised ESC % @.
	529	*
	530	* s0[31:29] = state enum
	531	* s0[24:0] = accumulated bytes
	532	* s1[31:30] = GL locking-shift state
	533	* s1[29:28] = GR locking-shift state
	534	* s1[27:21] = G3 charset
	535	* s1[20:14] = G2 charset
	536	* s1[13:7] = G1 charset
	537	* s1[6:0] = G0 charset
	538	*/
	539
	540	#define LEFT 30
	541	#define RIGHT 28
	542	#define LOCKING_SHIFT(n,side) \
	543	(state->s1 = (state->s1 & ~(3UL<<(side))) \| ((n ## UL)<<(side)))
	544	#define MODE ((state->s0 & 0xe0000000UL) >> 29)
	545	#define ENTER_MODE(m) (state->s0 = (state->s0 & ~0xe0000000UL) \| ((unsigned long)(m)<<29))
	546	#define SINGLE_SHIFT(n) ENTER_MODE(SS2CHAR - 2 + (n))
	547	#define ASSERT_IDLE do { \
	548	if (state->s0 != 0) emit(emitctx, ERROR); \
	549	state->s0 = 0; \
	550	} while (0)
	551
	552	if (state->s1 == 0) {
	553	/*
	554	* Since there's no LS0R, this means we must just have started.
	555	* Set up a sane initial state (LS0, LS1R, ASCII in G0/G1/G2/G3).
	556	*/
	557	LOCKING_SHIFT(0, LEFT);
	558	LOCKING_SHIFT(1, RIGHT);
	559	designate(state, 0, mode->ltype, mode->li, mode->lf);
	560	designate(state, 1, mode->rtype, mode->ri, mode->rf);
	561	designate(state, 2, S4, 0, 'B');
	562	designate(state, 3, S4, 0, 'B');
	563	}
	564
	565	if (MODE == DOCSUTF8) {
	566	docs_utf8(input_chr, state, emit, emitctx);
	567	return;
	568	}
	569	if (MODE == DOCSCTEXT) {
	570	docs_ctext(input_chr, state, emit, emitctx);
	571	return;
	572	}
	573
	574	if ((input_chr & 0x60) == 0x00) {
	575	/* C0 or C1 control */
	576	ASSERT_IDLE;
	577	switch (input_chr) {
	578	case ESC:
	579	ENTER_MODE(ESCSEQ);
	580	break;
	581	case LS0:
	582	LOCKING_SHIFT(0, LEFT);
	583	break;
	584	case LS1:
	585	LOCKING_SHIFT(1, LEFT);
	586	break;
	587	case SS2:
	588	SINGLE_SHIFT(2);
	589	break;
	590	case SS3:
	591	SINGLE_SHIFT(3);
	592	break;
	593	default:
	594	emit(emitctx, input_chr);
	595	break;
	596	}
	597	} else if ((input_chr & 0x80) \|\| MODE < ESCSEQ) {
	598	int is_gl = 0;
	599	struct iso2022_subcharset const *subcs;
	600	unsigned container;
	601	long input_7bit;
	602	/*
	603	* Actual data.
	604	* Force idle state if we're in mid escape sequence, or in a
	605	* multi-byte character with a different top bit.
	606	*/
	607	if (MODE >= ESCSEQ \|\|
	608	((state->s0 & 0x00ff0000L) != 0 &&
	609	(((state->s0 >> 16) ^ input_chr) & 0x80)))
	610	ASSERT_IDLE;
	611	if (MODE == SS2CHAR \|\| MODE == SS3CHAR) /* Single-shift */
	612	container = MODE - SS2CHAR + 2;
	613	else if (input_chr >= 0x80) /* GR */
	614	container = (state->s1 >> 28) & 3;
	615	else { /* GL */
	616	container = state->s1 >> 30;
	617	is_gl = 1;
	618	}
	619	input_7bit = input_chr & ~0x80;
	620	subcs = &iso2022_subcharsets[(state->s1 >> (container * 7)) & 0x7f];
	621	if ((subcs->type == S4 \|\| subcs->type == M4) &&
	622	(input_7bit == 0x20 \|\| input_7bit == 0x7f)) {
	623	/* characters not in 94-char set */
	624	if (is_gl) emit(emitctx, input_7bit);
	625	else emit(emitctx, ERROR);
	626	} else if (subcs->type == M4 \|\| subcs->type == M6) {
	627	if ((state->s0 & 0x00ff0000L) == 0) {
	628	state->s0 \|= input_chr << 16;
	629	return;
	630	} else {
	631	emit(emitctx,
	632	subcs->from_dbcs(((state->s0 >> 16) & 0x7f) +
	633	subcs->offset,
	634	input_7bit + subcs->offset));
	635	}
	636	} else {
	637	if ((state->s0 & 0x00ff0000L) != 0)
	638	emit(emitctx, ERROR);
	639	emit(emitctx, subcs->sbcs_base ?
	640	sbcs_to_unicode(subcs->sbcs_base, input_7bit + subcs->offset):
	641	ERROR);
	642	}
	643	state->s0 = 0;
	644	} else {
	645	unsigned i1, i2;
	646	if (MODE == ESCPASS) {
	647	emit(emitctx, input_chr);
	648	if ((input_chr & 0xf0) != 0x20)
	649	ENTER_MODE(IDLE);
	650	return;
	651	}
	652
	653	/*
	654	* Intermediate bytes shall be any of the 16 positions of
	655	* column 02 of the code table; they are denoted by the symbol
	656	* I.
	657	*/
	658	if ((input_chr & 0xf0) == 0x20) {
	659	if (((state->s0 >> 16) & 0xff) == 0)
	660	state->s0 \|= input_chr << 16;
	661	else if (((state->s0 >> 8) & 0xff) == 0)
	662	state->s0 \|= input_chr << 8;
	663	else {
	664	/* Long escape sequence. Switch to ESCPASS or ESCDROP. */
	665	i1 = (state->s0 >> 16) & 0xff;
	666	i2 = (state->s0 >> 8) & 0xff;
	667	switch (i1) {
	668	case '(': case ')': case '*': case '+':
	669	case '-': case '.': case '/':
	670	case '$':
	671	ENTER_MODE(ESCDROP);
	672	break;
	673	default:
	674	emit(emitctx, ESC);
	675	emit(emitctx, i1);
	676	emit(emitctx, i2);
	677	emit(emitctx, input_chr);
	678	state->s0 = 0;
	679	ENTER_MODE(ESCPASS);
	680	break;
	681	}
	682	}
	683	return;
	684	}
	685
	686	/*
	687	* Final bytes shall be any of the 79 positions of columns 03
	688	* to 07 of the code table excluding position 07/15; they are
	689	* denoted by the symbol F.
	690	*/
	691	i1 = (state->s0 >> 16) & 0xff;
	692	i2 = (state->s0 >> 8) & 0xff;
	693	if (MODE == ESCDROP)
	694	input_chr = 0; /* Make sure it won't match. */
	695	state->s0 = 0;
	696	switch (i1) {
	697	case 0: /* No intermediate bytes */
	698	switch (input_chr) {
	699	case 'N': /* SS2 */
	700	SINGLE_SHIFT(2);
	701	break;
	702	case 'O': /* SS3 */
	703	SINGLE_SHIFT(3);
	704	break;
	705	case 'n': /* LS2 */
	706	LOCKING_SHIFT(2, LEFT);
	707	break;
	708	case 'o': /* LS3 */
	709	LOCKING_SHIFT(3, LEFT);
	710	break;
	711	case '\|': /* LS3R */
	712	LOCKING_SHIFT(3, RIGHT);
	713	break;
	714	case '}': /* LS2R */
	715	LOCKING_SHIFT(2, RIGHT);
	716	break;
	717	case '~': /* LS1R */
	718	LOCKING_SHIFT(1, RIGHT);
	719	break;
	720	default:
	721	/* Unsupported escape sequence. Spit it back out. */
	722	emit(emitctx, ESC);
	723	emit(emitctx, input_chr);
	724	}
	725	break;
	726	case ' ': /* ACS */
	727	/*
	728	* Various coding structure facilities specify that designating
	729	* a code element also invokes it. As far as I can see, invoking
	730	* it now will have the same practical effect, since those
	731	* facilities also ban the use of locking shifts.
	732	*/
	733	switch (input_chr) {
	734	case 'A': /* G0 element used and invoked into GL */
	735	LOCKING_SHIFT(0, LEFT);
	736	break;
	737	case 'C': /* G0 in GL, G1 in GR */
	738	case 'D': /* Ditto, at least for 8-bit codes */
	739	case 'L': /* ISO 4873 (ECMA-43) level 1 */
	740	case 'M': /* ISO 4873 (ECMA-43) level 2 */
	741	LOCKING_SHIFT(0, LEFT);
	742	LOCKING_SHIFT(1, RIGHT);
	743	break;
	744	}
	745	break;
	746	case '&': /* IRR */
	747	/*
	748	* IRR (Identify Revised Registration) is ignored here,
	749	* since any revised registration must be
	750	* upward-compatible with the old one, so either we'll
	751	* support the new one or we'll emit ERROR when we run
	752	* into a new character. In either case, there's nothing
	753	* to be done here.
	754	*/
	755	break;
	756	case '(': /* GZD4 / case ')': / G1D4 */
	757	case '': / G2D4 / case '+': / G3D4 */
	758	designate(state, i1 - '(', S4, i2, input_chr);
	759	break;
	760	case '-': /* G1D6 / case '.': / G2D6 / case '/': / G3D6 */
	761	designate(state, i1 - ',', S6, i2, input_chr);
	762	break;
	763	case '$': /* G?DM? */
	764	switch (i2) {
	765	case 0: /* Obsolete version of GZDM4 */
	766	i2 = '(';
	767	case '(': /* GZDM4 / case ')': / G1DM4 */
	768	case '': / G2DM4 / case '+': / G3DM4 */
	769	designate(state, i2 - '(', M4, 0, input_chr);
	770	break;
	771	case '-': /* G1DM6 */
	772	case '.': /* G2DM6 / case '/': / G3DM6 */
	773	designate(state, i2 - ',', M6, 0, input_chr);
	774	break;
	775	default:
	776	emit(emitctx, ERROR);
	777	break;
	778	}
	779	case '%': /* DOCS */
	780	/* XXX What's a reasonable way to handle an unrecognised DOCS? */
	781	switch (i2) {
	782	case 0:
	783	switch (input_chr) {
	784	case 'G':
	785	ENTER_MODE(DOCSUTF8);
	786	break;
	787	}
	788	break;
	789	case '/':
	790	switch (input_chr) {
	791	case '1': case '2':
	792	ENTER_MODE(DOCSCTEXT);
	793	break;
	794	}
	795	break;
	796	}
	797	break;
	798	default:
	799	/* Unsupported nF escape sequence. Re-emit it. */
	800	emit(emitctx, ESC);
	801	emit(emitctx, i1);
	802	if (i2) emit(emitctx, i2);
	803	emit(emitctx, input_chr);
	804	break;
	805	}
	806	}
	807	}
	808
	809	static void oselect(charset_state *state, int i, int right,
	810	void (emit)(void ctx, long int output),
	811	void *emitctx)
	812	{
	813	int shift = (right ? 31-7 : 31-7-7);
	814	struct iso2022_subcharset const *subcs = &iso2022_subcharsets[i];
	815
	816	if (((state->s1 >> shift) & 0x7F) != (unsigned)i) {
	817	state->s1 &= ~(0x7FL << shift);
	818	state->s1 \|= (i << shift);
	819
	820	if (emit) {
	821	emit(emitctx, ESC);
	822	if (subcs->type == M4 \|\| subcs->type == M6)
	823	emit(emitctx, '$');
	824	if (subcs->type == S6 \|\| subcs->type == M6) {
	825	assert(right);
	826	emit(emitctx, '-');
	827	} else if (right) {
	828	emit(emitctx, ')');
	829	} else {
	830	emit(emitctx, '(');
	831	}
	832	if (subcs->i)
	833	emit(emitctx, subcs->i);
	834	emit(emitctx, subcs->f);
	835	}
	836	}
	837	}
	838
	839	static void docs_char(charset_state *state,
	840	void (emit)(void ctx, long int output),
	841	void emitctx, int cset, char data, int datalen)
	842	{
	843	int curr_cset, currlen, i;
	844
	845	/*
	846	* cset is the index into ctext_encodings[]. It can also be -1
	847	* to mean DOCS UTF-8, or -2 to mean no DOCS (ordinary 2022).
	848	* In the latter case, `chr' is ignored.
	849	*/
	850
	851	/*
	852	* First, terminate a DOCS segment if necessary. We always have
	853	* to terminate a DOCS segment if one is active and we're about
	854	* to switch to a different one; we might also have to
	855	* terminate a length-encoded DOCS segment if we've run out of
	856	* storage space to accumulate characters in it.
	857	*/
	858	curr_cset = ((state->s1 >> 14) & 7) - 2;
	859	currlen = ((state->s1 >> 11) & 7);
	860	if ((curr_cset != -2 && curr_cset != cset) \|\|
	861	(curr_cset >= 0 && currlen + datalen > 5)) {
	862	if (curr_cset == -1) {
	863	/*
	864	* Terminating DOCS UTF-8 is easy.
	865	*/
	866	emit(emitctx, ESC);
	867	emit(emitctx, '%');
	868	emit(emitctx, '@');
	869	} else {
	870	int len;
	871
	872	/*
	873	* To terminate a length-encoded DOCS segment we must
	874	* actually output the whole thing.
	875	*/
	876	emit(emitctx, ESC);
	877	emit(emitctx, '%');
	878	emit(emitctx, '/');
	879	emit(emitctx, '0' + ctext_encodings[curr_cset].octets_per_char);
	880	len = currlen + datalen +
	881	strlen(ctext_encodings[curr_cset].name);
	882	assert(len < (1 << 14));
	883	emit(emitctx, 0x80 \| ((len >> 7) & 0x7F));
	884	emit(emitctx, 0x80 \| ((len ) & 0x7F));
	885	/* The name stored in ctext_encodings[] includes the trailing \2 */
	886	for (i = 0; ctext_encodings[curr_cset].name[i]; i++)
	887	emit(emitctx, ctext_encodings[curr_cset].name[i]);
	888	for (i = 0; i < currlen; i++)
	889	emit(emitctx,
	890	(i == 0 ? state->s1 : state->s0 >> (8*(4-i))) & 0xFF);
	891	for (i = 0; i < datalen; i++)
	892	emit(emitctx, data[i]);
	893
	894	/*
	895	* We've now dealt with the input data, so clear it so
	896	* we don't try to do so again below.
	897	*/
	898	datalen = 0;
	899	}
	900	curr_cset = -2;
	901	}
	902
	903	/*
	904	* Now, start a DOCS segment if necessary.
	905	*/
	906	if (curr_cset != cset) {
	907	assert(cset != -2);
	908	if (cset == -1) {
	909	/*
	910	* Start DOCS UTF-8.
	911	*/
	912	emit(emitctx, ESC);
	913	emit(emitctx, '%');
	914	emit(emitctx, 'G');
	915	} else {
	916	/*
	917	* Starting a length-encoded DOCS segment is simply a
	918	* matter of setting our stored length counter to zero.
	919	*/
	920	currlen = 0;
	921	state->s1 &= ~(7 << 11);
	922	state->s1 &= ~0xFF;
	923	state->s0 = 0;
	924	}
	925	}
	926	state->s1 &= ~(7 << 14);
	927	assert((cset+2) >= 0 && (cset+2) < 8);
	928	state->s1 \|= ((cset+2) << 14);
	929
	930	/*
	931	* Now we're in the right DOCS state. Actually deal with the
	932	* input data, if we haven't already done so above.
	933	*/
	934	if (datalen > 0) {
	935	assert(cset != 2);
	936	if (cset == -1) {
	937	/*
	938	* In DOCS UTF-8, we output data as soon as we get it.
	939	*/
	940	for (i = 0; i < datalen; i++)
	941	emit(emitctx, data[i]);
	942	} else {
	943	/*
	944	* In length-encoded DOCS, we just store our data and
	945	* bide our time. It'll all be output when we fill up
	946	* or switch to another character set.
	947	*/
	948	assert(currlen + datalen <= 5); /* overflow handled already */
	949	for (i = 0; i < datalen; i++) {
	950	if (currlen + i == 0)
	951	state->s1 \|= data[i] & 0xFF;
	952	else
	953	state->s0 \|= (data[i] & 0xFF) << (8*(4-(currlen+i)));
	954	}
	955	currlen += datalen;
	956	assert(currlen >= 0 && currlen < 8);
	957	state->s1 &= ~(7 << 11);
	958	state->s1 \|= (currlen << 11);
	959	}
	960	}
	961	}
	962
	963	static void write_to_pointer(void *ctx, long int output)
	964	{
	965	char ptr = (char )ctx;
	966	(ptr)++ = output;
	967	}
	968
	969	/*
	970	* Writing full ISO-2022 is not useful in very many circumstances.
	971	* One of the few situations in which it _is_ useful is generating
	972	* X11 COMPOUND_TEXT; therefore, this writing function will obey
	973	* the compound text restrictions and hence output the subset of
	974	* ISO-2022 that's usable in that context.
	975	*
	976	* The subset in question is roughly that we use GL/GR for G0/G1
	977	* always, and that the _only_ escape sequences we output (other
	978	* than the occasional DOCS) are those which designate different
	979	* subcharsets into G0 and G1. There are additional constraints
	980	* about which things go in which container; see below.
	981	*
	982	* FIXME: this wants some decent tests to be written, and also the
	983	* exact output policy for compound text wants thinking about more
	984	* carefully.
	985	*/
	986	static int write_iso2022(charset_spec const *charset, long int input_chr,
	987	charset_state *state,
	988	void (emit)(void ctx, long int output),
	989	void *emitctx)
	990	{
	991	int i;
	992	struct iso2022_subcharset const *subcs;
	993	struct iso2022_mode const mode = (struct iso2022_mode )charset->data;
	994	to_dbcs_planar_t last_planar_dbcs = NULL;
	995	int last_p, last_r, last_c;
	996	long int c1, c2;
	997
	998	/*
	999	* For output, I allocate the state variables as follows:
	1000	*
	1001	* s1[31] == 1 if output state has been initialised
	1002	* s1[30:24] == G1 charset (always in GR)
	1003	* s1[23:17] == G0 charset (always in GL)
	1004	* s1[16:14] == DOCS index plus 2 (because -1 and -2 are special)
	1005	* s1[13:11] == number of DOCS accumulated characters (up to five)
	1006	* s1[7:0] + s0[31:0] == DOCS collected characters
	1007	*/
	1008
	1009	if (!state->s1) {
	1010	state->s0 = 0x00000000UL;
	1011	state->s1 = 0x80000000UL;
	1012	/*
	1013	* Start with US-ASCII in GL and also in GR.
	1014	*/
	1015	for (i = 0; (unsigned)i < lenof(iso2022_subcharsets); i++) {
	1016	subcs = &iso2022_subcharsets[i];
	1017	if (subcs->type == mode->ltype &&
	1018	subcs->i == mode->li &&
	1019	subcs->f == mode->lf)
	1020	oselect(state, i, FALSE, NULL, NULL);
	1021	if (subcs->type == mode->rtype &&
	1022	subcs->i == mode->ri &&
	1023	subcs->f == mode->rf)
	1024	oselect(state, i, TRUE, NULL, NULL);
	1025	}
	1026	}
	1027
	1028	if (input_chr == -1) {
	1029	/*
	1030	* Special case: reset encoding state.
	1031	*/
	1032	docs_char(state, emit, emitctx, -2, NULL, 0); /* leave DOCS */
	1033
	1034	for (i = 0; (unsigned)i < lenof(iso2022_subcharsets); i++) {
	1035	subcs = &iso2022_subcharsets[i];
	1036	if (subcs->type == mode->ltype &&
	1037	subcs->i == mode->li &&
	1038	subcs->f == mode->lf)
	1039	oselect(state, i, FALSE, emit, emitctx);
	1040	if (subcs->type == mode->rtype &&
	1041	subcs->i == mode->ri &&
	1042	subcs->f == mode->rf)
	1043	oselect(state, i, TRUE, emit, emitctx);
	1044	}
	1045	return TRUE;
	1046	}
	1047
	1048	/*
	1049	* Special-case characters: Space, Delete, and anything in C0
	1050	* or C1 are output unchanged.
	1051	*/
	1052	if (input_chr <= 0x20 \|\| (input_chr >= 0x7F && input_chr < 0xA0)) {
	1053	emit(emitctx, input_chr);
	1054	return TRUE;
	1055	}
	1056
	1057	/*
	1058	* Analyse the input character and work out which subcharset it
	1059	* belongs to.
	1060	*/
	1061	for (i = 0; (unsigned)i < lenof(iso2022_subcharsets); i++) {
	1062	subcs = &iso2022_subcharsets[i];
	1063	if (!(mode->enable_mask & (1 << subcs->enable)))
	1064	continue; /* this charset is disabled */
	1065	if (subcs->sbcs_base) {
	1066	c1 = sbcs_from_unicode(subcs->sbcs_base, input_chr);
	1067	c1 -= subcs->offset;
	1068	if (c1 >= 0x20 && c1 <= 0x7f) {
	1069	c2 = 0;
	1070	break;
	1071	}
	1072	} else if (subcs->to_dbcs) {
	1073	if (subcs->to_dbcs_plane >= 0) {
	1074	/*
	1075	* Since multiplanar DBCSes almost by definition
	1076	* involve several entries in iso2022_subcharsets
	1077	* with the same to_dbcs function and different
	1078	* plane values, we remember the last such function
	1079	* we called and what its result was, so that we
	1080	* don't (for example) have to call
	1081	* unicode_to_cns11643 seven times.
	1082	*/
	1083	if (last_planar_dbcs != REPLANARISE(subcs->to_dbcs)) {
	1084	last_planar_dbcs = REPLANARISE(subcs->to_dbcs);
	1085	if (!last_planar_dbcs(input_chr,
	1086	&last_p, &last_r, &last_c))
	1087	last_p = -1;
	1088	}
	1089	} else {
	1090	last_p = subcs->to_dbcs_plane;
	1091	if (!subcs->to_dbcs(input_chr, &last_r, &last_c))
	1092	last_p = 0; /* cannot match since to_dbcs_plane<0 */
	1093	}
	1094
	1095	if (last_p == subcs->to_dbcs_plane) {
	1096	c1 = last_r - subcs->offset;
	1097	c2 = last_c - subcs->offset;
	1098	assert(c1 >= 0x20 && c1 <= 0x7f);
	1099	assert(c2 >= 0x20 && c2 <= 0x7f);
	1100	break;
	1101	}
	1102	}
	1103	}
	1104
	1105	if ((unsigned)i < lenof(iso2022_subcharsets)) {
	1106	int right;
	1107
	1108	/*
	1109	* Our character is represented by c1 (and possibly also
	1110	* c2) in subcharset `subcs'. So now we must decide whether
	1111	* to designate that character set into G0/GL or G1/GR.
	1112	*
	1113	* Any S6 or M6 subcharset has to go in GR because it won't
	1114	* fit in GL. In addition, the compound text rules state
	1115	* that any single-byte subcharset defined as the
	1116	* right-hand half of some SBCS must go in GR.
	1117	*
	1118	* M4 subcharsets can go in either half according to the
	1119	* rules. I choose to put them in GR always because it's a
	1120	* simple policy with reasonable behaviour (facilitates
	1121	* switching between them and ASCII).
	1122	*/
	1123	right = (subcs->type == S6 \|\| subcs->type == M6 \|\| subcs->type == M4 \|\|
	1124	(subcs->sbcs_base && subcs->offset == 0x80));
	1125
	1126	/*
	1127	* If we're in a DOCS mode, leave it.
	1128	*/
	1129	docs_char(state, emit, emitctx, -2, NULL, 0);
	1130
	1131	/*
	1132	* If this subcharset is not already selected in that
	1133	* container, select it.
	1134	*/
	1135	oselect(state, i, right, emit, emitctx);
	1136
	1137	/*
	1138	* Now emit the actual characters.
	1139	*/
	1140	if (right) {
	1141	assert(c1 >= 0x20 && c1 <= 0x7f);
	1142	emit(emitctx, c1 \| 0x80);
	1143	if (c2) {
	1144	assert(c2 >= 0x20 && c2 <= 0x7f);
	1145	emit(emitctx, c2 \| 0x80);
	1146	}
	1147	} else {
	1148	assert(c1 > 0x20 && c1 < 0x7f);
	1149	emit(emitctx, c1);
	1150	if (c2) {
	1151	assert(c2 > 0x20 && c2 < 0x7f);
	1152	emit(emitctx, c2);
	1153	}
	1154	}
	1155
	1156	return TRUE;
	1157	}
	1158
	1159	/*
	1160	* Fall back to DOCS.
	1161	*/
	1162	{
	1163	char data[10];
	1164	char *p = data;
	1165	int i, cs;
	1166
	1167	cs = -2; /* means failure */
	1168
	1169	for (i = 0; (unsigned)i <= lenof(ctext_encodings); i++) {
	1170	charset_state substate;
	1171	charset_spec const *subcs = ctext_encodings[i].subcs;
	1172
	1173	/*
	1174	* We assume that all character sets dealt with by DOCS
	1175	* are stateless for output purposes.
	1176	*/
	1177	substate.s1 = substate.s0 = 0;
	1178	p = data;
	1179
	1180	if ((unsigned)i < lenof(ctext_encodings)) {
	1181	if ((mode->enable_mask & (1 << ctext_encodings[i].enable)) &&
	1182	subcs->write(subcs, input_chr, &substate,
	1183	write_to_pointer, &p)) {
	1184	cs = i;
	1185	break;
	1186	}
	1187	} else {
	1188	if ((mode->enable_mask & (1 << CDU)) &&
	1189	write_utf8(NULL, input_chr, NULL, write_to_pointer, &p)) {
	1190	cs = -1;
	1191	break;
	1192	}
	1193	}
	1194	}
	1195
	1196	if (cs != -2) {
	1197	docs_char(state, emit, emitctx, cs, data, p - data);
	1198	return TRUE;
	1199	}
	1200	}
	1201
	1202	return FALSE;
	1203	}
	1204
	1205	/*
	1206	* Full ISO 2022 output with all options on. Not entirely sure what
	1207	* if anything this is useful for, but here it is anyway. All
	1208	* output character sets and DOCS variants are permitted; all
	1209	* containers start out with ASCII in them.
	1210	*/
	1211	static const struct iso2022_mode iso2022_all = {
	1212	(1<<CCS) \| (1<<COS) \| (1<<CPU) \| (1<<CDC) \| (1<<CDU),
	1213	S4, 0, 'B', S4, 0, 'B',
	1214	};
	1215
	1216	const charset_spec charset_CS_ISO2022 = {
	1217	CS_ISO2022, read_iso2022, write_iso2022, &iso2022_all
	1218	};
	1219
	1220	/*
	1221	* X11 compound text. A subset of output charsets is permitted, and
	1222	* G1/GR starts off in ISO8859-1.
	1223	*/
	1224	static const struct iso2022_mode iso2022_ctext = {
	1225	(1<<CCS) \| (1<<CDC),
	1226	S4, 0, 'B', S6, 0, 'A',
	1227	};
	1228
	1229	const charset_spec charset_CS_CTEXT = {
	1230	CS_CTEXT, read_iso2022, write_iso2022, &iso2022_ctext
	1231	};
	1232
	1233	#ifdef TESTMODE
	1234
	1235	#include <stdio.h>
	1236	#include <stdarg.h>
	1237	#include <string.h>
	1238
	1239	int total_errs = 0;
	1240
	1241	void iso2022_emit(void *ctx, long output)
	1242	{
	1243	wchar_t p = (wchar_t )ctx;
	1244	(p)++ = output;
	1245	}
	1246
	1247	void iso2022_read_test(int line, char *input, int inlen, ...)
	1248	{
	1249	va_list ap;
	1250	wchar_t *p, str[512];
	1251	int i;
	1252	charset_state state;
	1253	unsigned long l;
	1254
	1255	state.s0 = state.s1 = 0;
	1256	p = str;
	1257
	1258	for (i = 0; i < inlen; i++)
	1259	read_iso2022(NULL, input[i] & 0xFF, &state, iso2022_emit, &p);
	1260
	1261	va_start(ap, inlen);
	1262	l = 0;
	1263	for (i = 0; i < p - str; i++) {
	1264	l = va_arg(ap, long int);
	1265	if (l == -1) {
	1266	printf("%d: correct string shorter than output\n", line);
	1267	total_errs++;
	1268	break;
	1269	}
	1270	if (l != str[i]) {
	1271	printf("%d: char %d came out as %08x, should be %08lx\n",
	1272	line, i, str[i], l);
	1273	total_errs++;
	1274	}
	1275	}
	1276	if (l != -1) {
	1277	l = va_arg(ap, long int);
	1278	if (l != -1) {
	1279	printf("%d: correct string longer than output\n", line);
	1280	total_errs++;
	1281	}
	1282	}
	1283	va_end(ap);
	1284	}
	1285
	1286	/* Macro to concoct the first three parameters of iso2022_read_test. */
	1287	#define TESTSTR(x) __LINE__, x, lenof(x)
	1288
	1289	int main(void)
	1290	{
	1291	printf("read tests beginning\n");
	1292	/* Simple test (Emacs sample text for Japanese, in ISO-2022-JP) */
	1293	iso2022_read_test(TESTSTR("Japanese (\x1b$BF\|K\\8l\x1b(B)\t"
	1294	"\x1b$B$3$s$K$A$O\x1b(B, "
	1295	"\x1b$B%3%s%K%A%O\x1b(B\n"),
	1296	'J','a','p','a','n','e','s','e',' ','(',
	1297	0x65E5, 0x672C, 0x8A9E, ')', '\t',
	1298	0x3053, 0x3093, 0x306b, 0x3061, 0x306f, ',', ' ',
	1299	0x30b3, 0x30f3, 0x30cb, 0x30c1, 0x30cf, '\n', 0, -1);
	1300	/* Same thing in EUC-JP (with designations, and half-width katakana) */
	1301	iso2022_read_test(TESTSTR("\x1b$)B\x1b*I\x1b$+D"
	1302	"Japanese (\xc6\xfc\xcb\xdc\xb8\xec)\t"
	1303	"\xa4\xb3\xa4\xf3\xa4\xcb\xa4\xc1\xa4\xcf, "
	1304	"\x8e\xba\x8e\xdd\x8e\xc6\x8e\xc1\x8e\xca\n"),
	1305	'J','a','p','a','n','e','s','e',' ','(',
	1306	0x65E5, 0x672C, 0x8A9E, ')', '\t',
	1307	0x3053, 0x3093, 0x306b, 0x3061, 0x306f, ',', ' ',
	1308	0xff7a, 0xff9d, 0xff86, 0xff81, 0xff8a, '\n', 0, -1);
	1309	/* Multibyte single-shift */
	1310	iso2022_read_test(TESTSTR("\x1b$)B\x1b*I\x1b$+D\x8f\"/!"),
	1311	0x02D8, '!', 0, -1);
	1312	/* Non-existent SBCS */
	1313	iso2022_read_test(TESTSTR("\x1b(!Zfnord\n"),
	1314	ERROR, ERROR, ERROR, ERROR, ERROR, '\n', 0, -1);
	1315	/* Pass-through of ordinary escape sequences, including a long one */
	1316	iso2022_read_test(TESTSTR("\x1b""b\x1b#5\x1b#!!!5"),
	1317	0x1B, 'b', 0x1B, '#', '5',
	1318	0x1B, '#', '!', '!', '!', '5', 0, -1);
	1319	/* Non-existent DBCS (also 5-byte escape sequence) */
	1320	iso2022_read_test(TESTSTR("\x1b$(!Bfnord!"),
	1321	ERROR, ERROR, ERROR, 0, -1);
	1322	/* Incomplete DB characters */
	1323	iso2022_read_test(TESTSTR("\x1b$B(,(\x1b(BHi\x1b$B(,(\n"),
	1324	0x2501, ERROR, 'H', 'i', 0x2501, ERROR, '\n', 0, -1);
	1325	iso2022_read_test(TESTSTR("\x1b$)B\x1b*I\x1b$+D\xa4""B"),
	1326	ERROR, 'B', 0, -1);
	1327	iso2022_read_test(TESTSTR("\x1b$)B\x1b*I\x1b$+D\x0e\x1b\|$\xa2\xaf"),
	1328	ERROR, 0x02D8, 0, -1);
	1329	/* Incomplete escape sequence */
	1330	iso2022_read_test(TESTSTR("\x1b\n"), ERROR, '\n', 0, -1);
	1331	iso2022_read_test(TESTSTR("\x1b-A\x1b~\x1b\xa1"), ERROR, 0xa1, 0, -1);
	1332	/* Incomplete single-shift */
	1333	iso2022_read_test(TESTSTR("\x8e\n"), ERROR, '\n', 0, -1);
	1334	iso2022_read_test(TESTSTR("\x1b$*B\x8e(\n"), ERROR, '\n', 0, -1);
	1335	/* Corner cases (02/00 and 07/15) */
	1336	iso2022_read_test(TESTSTR("\x1b(B\x20\x7f"), 0x20, 0x7f, 0, -1);
	1337	iso2022_read_test(TESTSTR("\x1b(I\x20\x7f"), 0x20, 0x7f, 0, -1);
	1338	iso2022_read_test(TESTSTR("\x1b$B\x20\x7f"), 0x20, 0x7f, 0, -1);
	1339	iso2022_read_test(TESTSTR("\x1b-A\x0e\x20\x7f"), 0xa0, 0xff, 0, -1);
	1340	iso2022_read_test(TESTSTR("\x1b$-~\x0e\x20\x7f"), ERROR, 0, -1);
	1341	iso2022_read_test(TESTSTR("\x1b)B\xa0\xff"), ERROR, ERROR, 0, -1);
	1342	iso2022_read_test(TESTSTR("\x1b)I\xa0\xff"), ERROR, ERROR, 0, -1);
	1343	iso2022_read_test(TESTSTR("\x1b$)B\xa0\xff"), ERROR, ERROR, 0, -1);
	1344	iso2022_read_test(TESTSTR("\x1b-A\x1b~\xa0\xff"), 0xa0, 0xff, 0, -1);
	1345	iso2022_read_test(TESTSTR("\x1b$-~\x1b~\xa0\xff"), ERROR, 0, -1);
	1346	/* Designate control sets */
	1347	iso2022_read_test(TESTSTR("\x1b!@"), 0x1b, '!', '@', 0, -1);
	1348	/* Designate other coding system (UTF-8) */
	1349	iso2022_read_test(TESTSTR("\x1b%G"
	1350	"\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5"),
	1351	0x03BA, 0x1F79, 0x03C3, 0x03BC, 0x03B5, 0, -1);
	1352	iso2022_read_test(TESTSTR("\x1b-A\x1b%G\xCE\xBA\x1b%@\xa0"),
	1353	0x03BA, 0xA0, 0, -1);
	1354	iso2022_read_test(TESTSTR("\x1b%G\xCE\x1b%@"), ERROR, 0, -1);
	1355	iso2022_read_test(TESTSTR("\x1b%G\xCE\xBA\x1b%\x1b%@"),
	1356	0x03BA, 0x1B, '%', 0, -1);
	1357	/* DOCS (COMPOUND_TEXT extended segment) */
	1358	iso2022_read_test(TESTSTR("\x1b%/1\x80\x80"), 0, -1);
	1359	iso2022_read_test(TESTSTR("\x1b%/1\x80\x8fiso-8859-15\2xyz\x1b(B"),
	1360	ERROR, ERROR, ERROR, 0, -1);
	1361	iso2022_read_test(TESTSTR("\x1b%/1\x80\x8eiso8859-15\2xyz\x1b(B"),
	1362	'x', 'y', 'z', 0, -1);
	1363	iso2022_read_test(TESTSTR("\x1b-A\x1b%/2\x80\x89"
	1364	"big5-0\2\xa1\x40\xa1\x40"),
	1365	0x3000, 0xa1, 0x40, 0, -1);
	1366	/* Emacs Big5-in-ISO-2022 mapping */
	1367	iso2022_read_test(TESTSTR("\x1b$(0&x86\x1b(B \x1b$(0DeBv"),
	1368	0x5143, 0x6c23, ' ', ' ', 0x958b, 0x767c, 0, -1);
	1369	/* Test from RFC 1922 (ISO-2022-CN) */
	1370	iso2022_read_test(TESTSTR("\x1b$)A\x0e=;;;\x1b$)GG(_P\x0f"),
	1371	0x4EA4, 0x6362, 0x4EA4, 0x63db, 0, -1);
	1372
	1373	printf("read tests completed\n");
	1374	printf("total: %d errors\n", total_errs);
	1375	return (total_errs != 0);
	1376	}
	1377
	1378	#endif /* TESTMODE */
	1379
	1380	#else /* ENUM_CHARSETS */
	1381
	1382	ENUM_CHARSET(CS_ISO2022)
	1383
	1384	#endif