mdw@git.distorted.org.uk Git - sgt/charset/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* iso2022s.c - support for ISO-2022 subset encodings.
	3	*
	4	* (The `s' suffix on the filename is there to leave `iso2022.c'
	5	* free for the unlikely event that I ever attempt to implement
	6	* _full_ ISO-2022 in this library!)
	7	*/
	8
	9	#ifndef ENUM_CHARSETS
	10
	11	#include <stdio.h>
	12	#include <string.h>
	13	#include <assert.h>
	14
	15	#include "charset.h"
	16	#include "internal.h"
	17	#include "sbcsdat.h"
	18
	19	#define SO (0x0E)
	20	#define SI (0x0F)
	21	#define ESC (0x1B)
	22
	23	/* Functional description of a single ISO 2022 escape sequence. */
	24	struct iso2022_escape {
	25	char const *sequence;
	26	unsigned long andbits, xorbits;
	27	/*
	28	* For output, these variables help us figure out which escape
	29	* sequences we need to get where we want to be.
	30	*
	31	* `container' should be in the range 0-3, but can also be ORed
	32	* with the bit flag RO to indicate that this is not a
	33	* preferred container to use for this charset during output.
	34	*/
	35	int container, subcharset;
	36	};
	37	#define RO 0x80
	38
	39	struct iso2022 {
	40	/*
	41	* List of escape sequences supported in this subset. Must be
	42	* in ASCII order, so that we can narrow down the list as
	43	* necessary.
	44	*/
	45	const struct iso2022_escape escapes;/ must be sorted in ASCII order! */
	46	int nescapes;
	47
	48	/*
	49	* We assign indices from 0 upwards to the sub-charsets of a
	50	* given ISO 2022 subset. nbytes[i] tells us how many bytes per
	51	* character are required by sub-charset i. (It's a string
	52	* mainly because that makes it easier to declare in C syntax
	53	* than an int array.)
	54	*/
	55	char const *nbytes;
	56
	57	/*
	58	* The characters in this string are indices-plus-one (so that
	59	* NUL can still terminate) of escape sequences in `escapes'.
	60	* These escapes are output in the given sequence to reset the
	61	* encoding state, unless it turns out that a given escape
	62	* would not change the state at all.
	63	*/
	64	char const *reset;
	65
	66	/*
	67	* Initial value of s1, in case the default container contents
	68	* needs to be something other than charset 0 in all cases.
	69	* (Note that this must have the top bit set!)
	70	*/
	71	unsigned long s1;
	72
	73	/*
	74	* For output, some ISO 2022 subsets _mandate_ an initial shift
	75	* sequence. If so, here it is so we can output it. (For the
	76	* sake of basic sanity we won't bother to _require_ it on
	77	* input, although it should of course be listed under
	78	* `escapes' above so that we ignore it when present.)
	79	*/
	80	char const *initial_sequence;
	81
	82	/*
	83	* Is this an 8-bit ISO 2022 subset?
	84	*/
	85	int eightbit;
	86
	87	/*
	88	* Function calls to do the actual translation.
	89	*/
	90	long int (*to_ucs)(int subcharset, unsigned long bytes);
	91	int (from_ucs)(long int ucs, int subcharset, unsigned long *bytes);
	92	};
	93
	94	static void read_iso2022s(charset_spec const *charset, long int input_chr,
	95	charset_state *state,
	96	void (emit)(void ctx, long int output),
	97	void *emitctx)
	98	{
	99	struct iso2022 const iso = (struct iso2022 )charset->data;
	100
	101	/*
	102	* For reading ISO-2022 subsets, we divide up our state
	103	* variables as follows:
	104	*
	105	* - The top byte of s0 (bits 31:24) indicates, if nonzero,
	106	* that we are part-way through a recognised ISO-2022 escape
	107	* sequence. Five of those bits (31:27) give the index of
	108	* the first member of the escapes list matching what we
	109	* have so far; the remaining three (26:24) give the number
	110	* of characters we have seen so far.
	111	*
	112	* - The top bit of s1 (bit 31) is non-zero at all times, to
	113	* indicate that we have performed any necessary
	114	* initialisation. When we start, we detect a zero s1 and
	115	* respond to it by initialising the default container
	116	* contents.
	117	*
	118	* - The next three bits of s1 (bits 30:28) indicate which
	119	* _container_ is currently selected. This isn't quite as
	120	* simple as it sounds, since we have to preserve memory of
	121	* which of the SI/SO containers we came from when we're
	122	* temporarily in SS2/SS3. Hence, what happens is:
	123	* + bit 28 indicates SI/SO.
	124	* + if we're in an SS2/SS3 container, that's indicated by
	125	* the two bits above that being nonzero and holding
	126	* either 2 or 3.
	127	* + Hence: 0 is SI, 1 is SO, 4 is SS2-from-SI, 5 is
	128	* SS2-from-SO, 6 is SS3-from-SI, 7 is SS3-from-SO.
	129	* + For added fun: in an _8-bit_ ISO 2022 subset, we have
	130	* the further special value 2, which means that we're
	131	* theoretically in SI but the current character being
	132	* accumulated is composed of 8-bit characters and will
	133	* therefore be interpreted as if in SO.
	134	*
	135	* - The next nibble of s1 (27:24) indicates how many bytes
	136	* have been accumulated in the current character.
	137	*
	138	* - The remaining three bytes of s1 are divided into four
	139	* six-bit sections, and each section gives the current
	140	* sub-charset selected in one of the possible containers.
	141	* (Those containers are SI, SO, SS2 and SS3, respectively
	142	* and in order from the bottom of s0 to the top.)
	143	*
	144	* - The bottom 24 bits of s0 give the accumulated character
	145	* data so far.
	146	*
	147	* (Note that this means s1 contains all the parts of the state
	148	* which might need to be operated on by escape sequences.
	149	* Cunning, eh?)
	150	*/
	151
	152	if (!(state->s1 & 0x80000000)) {
	153	state->s1 = iso->s1;
	154	}
	155
	156	/*
	157	* So. Firstly, we process escape sequences, if we're in the
	158	* middle of one or if we see a possible introducer (SI, SO,
	159	* ESC).
	160	*/
	161	if ((state->s0 >> 24) \|\|
	162	(input_chr == SO \|\| input_chr == SI \|\| input_chr == ESC)) {
	163	int n = (state->s0 >> 24) & 7, i = (state->s0 >> 27), oi = i, j;
	164
	165	/*
	166	* If this is the start of an escape sequence, we might be
	167	* in mid-character. If so, clear the character state and
	168	* emit an error token for the incomplete character.
	169	*/
	170	if (state->s1 & 0x0F000000) {
	171	state->s1 &= ~0x0F000000;
	172	state->s0 &= 0xFF000000;
	173	/*
	174	* If we were in the SS2 or SS3 container, we
	175	* automatically exit it.
	176	*/
	177	if (state->s1 & 0x60000000)
	178	state->s1 &= 0x9FFFFFFF;
	179	emit(emitctx, ERROR);
	180	}
	181
	182	j = i;
	183	while (j < iso->nescapes &&
	184	!memcmp(iso->escapes[j].sequence,
	185	iso->escapes[oi].sequence, n)) {
	186	if (iso->escapes[j].sequence[n] < input_chr)
	187	i = ++j;
	188	else
	189	break;
	190	}
	191	if (i >= iso->nescapes \|\|
	192	memcmp(iso->escapes[i].sequence,
	193	iso->escapes[oi].sequence, n) \|\|
	194	iso->escapes[i].sequence[n] != input_chr) {
	195	/*
	196	* This character does not appear in any valid escape
	197	* sequence. Therefore, we must emit all the characters
	198	* we had previously swallowed, plus this one, and
	199	* return to non-escape-sequence state.
	200	*/
	201	for (j = 0; j < n; j++)
	202	emit(emitctx, iso->escapes[oi].sequence[j]);
	203	emit(emitctx, input_chr);
	204	state->s0 = 0;
	205	return;
	206	}
	207
	208	/*
	209	* Otherwise, we have found an additional character in our
	210	* escape sequence. See if we have reached the _end_ of our
	211	* sequence (and therefore must process the sequence).
	212	*/
	213	n++;
	214	if (!iso->escapes[i].sequence[n]) {
	215	state->s0 = 0;
	216	state->s1 &= iso->escapes[i].andbits;
	217	state->s1 ^= iso->escapes[i].xorbits;
	218	return;
	219	}
	220
	221	/*
	222	* Failing _that_, we simply update our escape-sequence-
	223	* tracking state.
	224	*/
	225	assert(i < 32 && n < 8);
	226	state->s0 = (i << 27) \| (n << 24);
	227	return;
	228	}
	229
	230	/*
	231	* If this isn't an escape sequence, it must be part of a
	232	* character. One possibility is that it's a control character
	233	* (00-20 or 7F-9F; also in non-8-bit ISO 2022 subsets I'm
	234	* going to treat all top-half characters as controls), in
	235	* which case we output it verbatim.
	236	*/
	237	if (input_chr < 0x21 \|\|
	238	(input_chr > 0x7E && (!iso->eightbit \|\| input_chr < 0xA0))) {
	239	/*
	240	* We might be in mid-multibyte-character. If so, clear the
	241	* character state and emit an error token for the
	242	* incomplete character.
	243	*/
	244	if (state->s1 & 0x0F000000) {
	245	state->s1 &= ~0x0F000000;
	246	state->s0 &= 0xFF000000;
	247	emit(emitctx, ERROR);
	248	/*
	249	* If we were in the SS2 or SS3 container, we
	250	* automatically exit it.
	251	*/
	252	if (state->s1 & 0x60000000)
	253	state->s1 &= 0x9FFFFFFF;
	254	}
	255
	256	emit(emitctx, input_chr);
	257	return;
	258	}
	259
	260	/*
	261	* Otherwise, accumulate character data.
	262	*/
	263	{
	264	unsigned long chr;
	265	int chrlen, cont, subcharset, bytes;
	266
	267	/*
	268	* Verify that we've seen the right kind of character for
	269	* what we're currently doing. This only matters in 8-bit
	270	* subsets.
	271	*/
	272	if (iso->eightbit) {
	273	cont = (state->s1 >> 28) & 7;
	274	/*
	275	* If cont==0, we're entitled to see either GL or GR
	276	* characters. If cont==2, we expect only GR; otherwise
	277	* we expect only GL.
	278	*
	279	* If we see a GR character while cont==0, we set
	280	* cont=2 immediately.
	281	*/
	282	if ((cont == 2 && !(input_chr & 0x80)) \|\|
	283	(cont != 0 && cont != 2 && (input_chr & 0x80))) {
	284	/*
	285	* Clear the previous character; it was prematurely
	286	* terminated by this error.
	287	*/
	288	state->s1 &= ~0x0F000000;
	289	state->s0 &= 0xFF000000;
	290	emit(emitctx, ERROR);
	291	/*
	292	* If we were in the SS2 or SS3 container, we
	293	* automatically exit it.
	294	*/
	295	if (state->s1 & 0x60000000)
	296	state->s1 &= 0x9FFFFFFF;
	297	}
	298
	299	if (cont == 0 && (input_chr & 0x80)) {
	300	state->s1 \|= 0x20000000;
	301	}
	302	}
	303
	304	/* The current character and its length. */
	305	chr = ((state->s0 & 0x00FFFFFF) << 8) \| (input_chr & 0x7F);
	306	chrlen = ((state->s1 >> 24) & 0xF) + 1;
	307	/* The current sub-charset. */
	308	cont = (state->s1 >> 28) & 7;
	309	if (cont > 1) cont >>= 1;
	310	subcharset = (state->s1 >> (6*cont)) & 0x3F;
	311	/* The number of bytes-per-character in that sub-charset. */
	312	bytes = iso->nbytes[subcharset];
	313
	314	/*
	315	* If this character is now complete, we convert and emit
	316	* it. Otherwise, we simply update the state and return.
	317	*/
	318	if (chrlen >= bytes) {
	319	emit(emitctx, iso->to_ucs(subcharset, chr));
	320	chr = chrlen = 0;
	321	/*
	322	* If we were in the SS2 or SS3 container, we
	323	* automatically exit it.
	324	*/
	325	if (state->s1 & 0x60000000)
	326	state->s1 &= 0x9FFFFFFF;
	327	}
	328	state->s0 = (state->s0 & 0xFF000000) \| chr;
	329	state->s1 = (state->s1 & 0xF0FFFFFF) \| (chrlen << 24);
	330	}
	331	}
	332
	333	static int write_iso2022s(charset_spec const *charset, long int input_chr,
	334	charset_state *state,
	335	void (emit)(void ctx, long int output),
	336	void *emitctx)
	337	{
	338	struct iso2022 const iso = (struct iso2022 )charset->data;
	339	int subcharset, len, i, j, cont, topbit = 0;
	340	unsigned long bytes;
	341
	342	/*
	343	* For output, our s1 state variable contains most of the same
	344	* stuff as it did for input - initial-state indicator bit,
	345	* current container, and current subcharset selected in each
	346	* container.
	347	*/
	348
	349	/*
	350	* Analyse the character and find out what subcharset it needs
	351	* to go in.
	352	*/
	353	if (input_chr >= 0 && !iso->from_ucs(input_chr, &subcharset, &bytes))
	354	return FALSE;
	355
	356	if (!(state->s1 & 0x80000000)) {
	357	state->s1 = iso->s1;
	358	if (iso->initial_sequence)
	359	for (i = 0; iso->initial_sequence[i]; i++)
	360	emit(emitctx, iso->initial_sequence[i]);
	361	}
	362
	363	if (input_chr == -1) {
	364	unsigned long oldstate;
	365	int k;
	366
	367	/*
	368	* Special case: reset encoding state.
	369	*/
	370	for (i = 0; iso->reset[i]; i++) {
	371	j = iso->reset[i] - 1;
	372	oldstate = state->s1;
	373	state->s1 &= iso->escapes[j].andbits;
	374	state->s1 ^= iso->escapes[j].xorbits;
	375	if (state->s1 != oldstate) {
	376	/* We must actually emit this sequence. */
	377	for (k = 0; iso->escapes[j].sequence[k]; k++)
	378	emit(emitctx, iso->escapes[j].sequence[k]);
	379	}
	380	}
	381
	382	return TRUE;
	383	}
	384
	385	/*
	386	* Now begins the fun. We now know what subcharset we want. So
	387	* we must find out which container we should select it into,
	388	* select it into it if necessary, select that _container_ if
	389	* necessary, and then output the given bytes.
	390	*/
	391	for (i = 0; i < iso->nescapes; i++)
	392	if (iso->escapes[i].subcharset == subcharset &&
	393	!(iso->escapes[i].container & RO))
	394	break;
	395	assert(i < iso->nescapes);
	396
	397	/*
	398	* We've found the escape sequence which would select this
	399	* subcharset into a container. However, that subcharset might
	400	* already _be_ selected in that container! Check before we go
	401	* to the effort of emitting the sequence.
	402	*/
	403	cont = iso->escapes[i].container &~ RO;
	404	if (((state->s1 >> (6*cont)) & 0x3F) != (unsigned)subcharset) {
	405	for (j = 0; iso->escapes[i].sequence[j]; j++)
	406	emit(emitctx, iso->escapes[i].sequence[j]);
	407	state->s1 &= iso->escapes[i].andbits;
	408	state->s1 ^= iso->escapes[i].xorbits;
	409	}
	410
	411	/*
	412	* Now we know what container our subcharset is in, so we want
	413	* to select that container.
	414	*/
	415	if (cont > 1) {
	416	/* SS2 or SS3; just output the sequence and be done. */
	417	emit(emitctx, ESC);
	418	emit(emitctx, 'L' + cont); /* comes out to 'N' or 'O' */
	419	} else {
	420	/*
	421	* Emit SI or SO, but only if the current container isn't already
	422	* the right one.
	423	*
	424	* Also, in an 8-bit subset, we need not do this; we'll
	425	* just use 8-bit characters to output SO-container
	426	* characters.
	427	*/
	428	if (iso->eightbit && cont == 1 && ((state->s1 >> 28) & 7) == 0) {
	429	topbit = 0x80;
	430	} else if (((state->s1 >> 28) & 7) != (unsigned)cont) {
	431	emit(emitctx, cont ? SO : SI);
	432	state->s1 = (state->s1 & 0x8FFFFFFF) \| (cont << 28);
	433	}
	434	}
	435
	436	/*
	437	* We're done. Subcharset is selected in container, container
	438	* is selected. All we need now is to write out the bytes.
	439	*/
	440	len = iso->nbytes[subcharset];
	441	while (len--)
	442	emit(emitctx, ((bytes >> (8*len)) & 0xFF) \| topbit);
	443
	444	return TRUE;
	445	}
	446
	447	/*
	448	* ISO-2022-JP, defined in RFC 1468.
	449	*/
	450	static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes)
	451	{
	452	switch (subcharset) {
	453	case 0: return bytes; /* one-byte ASCII */
	454	case 1: /* JIS X 0201 half-width katakana */
	455	if (bytes >= 0x21 && bytes <= 0x5F)
	456	return bytes + (0xFF61 - 0x21);
	457	else
	458	return ERROR;
	459	/* (no break needed since all control paths have returned) */
	460	case 2: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
	461	((bytes ) & 0xFF) - 0x21);
	462	default: return ERROR;
	463	}
	464	}
	465	static int iso2022jp_from_ucs(long int ucs, int *subcharset,
	466	unsigned long *bytes)
	467	{
	468	int r, c;
	469	if (ucs < 0x80) {
	470	*subcharset = 0;
	471	*bytes = ucs;
	472	return 1;
	473	} else if (ucs >= 0xFF61 && ucs <= 0xFF9F) {
	474	*subcharset = 1;
	475	*bytes = ucs - (0xFF61 - 0x21);
	476	return 1;
	477	} else if (unicode_to_jisx0208(ucs, &r, &c)) {
	478	*subcharset = 2;
	479	*bytes = ((r+0x21) << 8) \| (c+0x21);
	480	return 1;
	481	} else {
	482	return 0;
	483	}
	484	}
	485	static const struct iso2022_escape iso2022jp_escapes[] = {
	486	{"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1}, /* we ignore this one */
	487	{"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2},
	488	{"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0},
	489	{"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1},
	490	};
	491	static const struct iso2022 iso2022jp = {
	492	iso2022jp_escapes, lenof(iso2022jp_escapes),
	493	"\1\1\2", "\3", 0x80000000, NULL, FALSE,
	494	iso2022jp_to_ucs, iso2022jp_from_ucs
	495	};
	496	const charset_spec charset_CS_ISO2022_JP = {
	497	CS_ISO2022_JP, read_iso2022s, write_iso2022s, &iso2022jp
	498	};
	499
	500	/*
	501	* ISO-2022-KR, defined in RFC 1557.
	502	*/
	503	static long int iso2022kr_to_ucs(int subcharset, unsigned long bytes)
	504	{
	505	switch (subcharset) {
	506	case 0: return bytes; /* one-byte ASCII */
	507	case 1: return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
	508	((bytes ) & 0xFF) - 0x21);
	509	default: return ERROR;
	510	}
	511	}
	512	static int iso2022kr_from_ucs(long int ucs, int *subcharset,
	513	unsigned long *bytes)
	514	{
	515	int r, c;
	516	if (ucs < 0x80) {
	517	*subcharset = 0;
	518	*bytes = ucs;
	519	return 1;
	520	} else if (unicode_to_ksx1001(ucs, &r, &c)) {
	521	*subcharset = 1;
	522	*bytes = ((r+0x21) << 8) \| (c+0x21);
	523	return 1;
	524	} else {
	525	return 0;
	526	}
	527	}
	528	static const struct iso2022_escape iso2022kr_escapes[] = {
	529	{"\016", 0x8FFFFFFF, 0x10000000, -1, -1},
	530	{"\017", 0x8FFFFFFF, 0x00000000, 0, 0},
	531	{"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1}, /* bits[11:6] <- 1 */
	532	};
	533	static const struct iso2022 iso2022kr = {
	534	iso2022kr_escapes, lenof(iso2022kr_escapes),
	535	"\1\2", "\2", 0x80000040, "\033$)C", FALSE,
	536	iso2022kr_to_ucs, iso2022kr_from_ucs
	537	};
	538	const charset_spec charset_CS_ISO2022_KR = {
	539	CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr
	540	};
	541
	542	/*
	543	* The COMPOUND_TEXT encoding used in X selections. Defined by the
	544	* X consortium.
	545	*
	546	* This encoding has quite a few sub-charsets. The order I assign
	547	* to them here is given in an enum.
	548	*/
	549	enum {
	550	/* This must match the bytes-per-character string given below. */
	551	CTEXT_ASCII,
	552	CTEXT_JISX0201_LEFT,
	553	CTEXT_JISX0201_RIGHT,
	554	CTEXT_ISO8859_1,
	555	CTEXT_ISO8859_2,
	556	CTEXT_ISO8859_3,
	557	CTEXT_ISO8859_4,
	558	CTEXT_ISO8859_5,
	559	CTEXT_ISO8859_6,
	560	CTEXT_ISO8859_7,
	561	CTEXT_ISO8859_8,
	562	CTEXT_ISO8859_9,
	563	CTEXT_GB2312,
	564	CTEXT_KSC5601,
	565	CTEXT_JISX0208,
	566	CTEXT_JISX0212
	567	};
	568	static long int ctext_to_ucs(int subcharset, unsigned long bytes)
	569	{
	570	switch (subcharset) {
	571	case CTEXT_ASCII: return bytes; /* one-byte ASCII */
	572	case CTEXT_JISX0201_LEFT: /* ASCII with yen and overline */
	573	return sbcs_to_unicode(&sbcsdata_CS_JISX0201, bytes & 0x7F);
	574	case CTEXT_JISX0201_RIGHT: /* JIS X 0201 half-width katakana */
	575	return sbcs_to_unicode(&sbcsdata_CS_JISX0201, (bytes & 0x7F) \| 0x80);
	576	case CTEXT_ISO8859_1:
	577	return sbcs_to_unicode(&sbcsdata_CS_ISO8859_1, (bytes & 0x7F) \| 0x80);
	578	case CTEXT_ISO8859_2:
	579	return sbcs_to_unicode(&sbcsdata_CS_ISO8859_2, (bytes & 0x7F) \| 0x80);
	580	case CTEXT_ISO8859_3:
	581	return sbcs_to_unicode(&sbcsdata_CS_ISO8859_3, (bytes & 0x7F) \| 0x80);
	582	case CTEXT_ISO8859_4:
	583	return sbcs_to_unicode(&sbcsdata_CS_ISO8859_4, (bytes & 0x7F) \| 0x80);
	584	case CTEXT_ISO8859_5:
	585	return sbcs_to_unicode(&sbcsdata_CS_ISO8859_5, (bytes & 0x7F) \| 0x80);
	586	case CTEXT_ISO8859_6:
	587	return sbcs_to_unicode(&sbcsdata_CS_ISO8859_6, (bytes & 0x7F) \| 0x80);
	588	case CTEXT_ISO8859_7:
	589	return sbcs_to_unicode(&sbcsdata_CS_ISO8859_7, (bytes & 0x7F) \| 0x80);
	590	case CTEXT_ISO8859_8:
	591	return sbcs_to_unicode(&sbcsdata_CS_ISO8859_8, (bytes & 0x7F) \| 0x80);
	592	case CTEXT_ISO8859_9:
	593	return sbcs_to_unicode(&sbcsdata_CS_ISO8859_9, (bytes & 0x7F) \| 0x80);
	594	case CTEXT_GB2312:
	595	return gb2312_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
	596	((bytes ) & 0xFF) - 0x21);
	597	case CTEXT_KSC5601:
	598	return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
	599	((bytes ) & 0xFF) - 0x21);
	600	case CTEXT_JISX0208:
	601	return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
	602	((bytes ) & 0xFF) - 0x21);
	603	case CTEXT_JISX0212:
	604	return jisx0212_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
	605	((bytes ) & 0xFF) - 0x21);
	606	default: return ERROR;
	607	}
	608	}
	609	static int ctext_from_ucs(long int ucs, int subcharset, unsigned long bytes)
	610	{
	611	int r, c;
	612	if (ucs < 0x80) {
	613	*subcharset = CTEXT_ASCII;
	614	*bytes = ucs;
	615	return 1;
	616	} else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_1, ucs)) != ERROR) {
	617	*subcharset = CTEXT_ISO8859_1;
	618	*bytes = c - 0x80;
	619	return 1;
	620	} else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_2, ucs)) != ERROR) {
	621	*subcharset = CTEXT_ISO8859_2;
	622	*bytes = c - 0x80;
	623	return 1;
	624	} else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_3, ucs)) != ERROR) {
	625	*subcharset = CTEXT_ISO8859_3;
	626	*bytes = c - 0x80;
	627	return 1;
	628	} else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_4, ucs)) != ERROR) {
	629	*subcharset = CTEXT_ISO8859_4;
	630	*bytes = c - 0x80;
	631	return 1;
	632	} else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_5, ucs)) != ERROR) {
	633	*subcharset = CTEXT_ISO8859_5;
	634	*bytes = c - 0x80;
	635	return 1;
	636	} else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_6, ucs)) != ERROR) {
	637	*subcharset = CTEXT_ISO8859_6;
	638	*bytes = c - 0x80;
	639	return 1;
	640	} else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_7, ucs)) != ERROR) {
	641	*subcharset = CTEXT_ISO8859_7;
	642	*bytes = c - 0x80;
	643	return 1;
	644	} else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_8, ucs)) != ERROR) {
	645	*subcharset = CTEXT_ISO8859_8;
	646	*bytes = c - 0x80;
	647	return 1;
	648	} else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_9, ucs)) != ERROR) {
	649	*subcharset = CTEXT_ISO8859_9;
	650	*bytes = c - 0x80;
	651	return 1;
	652	} else if ((c = sbcs_from_unicode(&sbcsdata_CS_JISX0201, ucs)) != ERROR) {
	653	if (c < 0x80) {
	654	*subcharset = CTEXT_JISX0201_LEFT;
	655	} else {
	656	*subcharset = CTEXT_JISX0201_RIGHT;
	657	c -= 0x80;
	658	}
	659	*bytes = c;
	660	return 1;
	661	} else if (unicode_to_gb2312(ucs, &r, &c)) {
	662	*subcharset = CTEXT_GB2312;
	663	*bytes = ((r+0x21) << 8) \| (c+0x21);
	664	return 1;
	665	} else if (unicode_to_ksx1001(ucs, &r, &c)) {
	666	*subcharset = CTEXT_KSC5601;
	667	*bytes = ((r+0x21) << 8) \| (c+0x21);
	668	return 1;
	669	} else if (unicode_to_jisx0208(ucs, &r, &c)) {
	670	*subcharset = CTEXT_JISX0208;
	671	*bytes = ((r+0x21) << 8) \| (c+0x21);
	672	return 1;
	673	} else if (unicode_to_jisx0212(ucs, &r, &c)) {
	674	*subcharset = CTEXT_JISX0212;
	675	*bytes = ((r+0x21) << 8) \| (c+0x21);
	676	return 1;
	677	} else {
	678	return 0;
	679	}
	680	}
	681	#define SEQ(str,cont,cs) \
	682	{str,~(63<<(6(((cont)&~RO)))),(cs)<<(6(((cont)&~RO))),(cont),(cs)}
	683	/*
	684	* Compound text defines restrictions on which container can take
	685	* which character sets. Things labelled `left half of' can only go
	686	* in GL; things labelled `right half of' can only go in GR; and 96
	687	* or 96^n character sets only _fit_ in GR. Thus:
	688	* - ASCII can only go in GL since it is the left half of 8859-*.
	689	* - All the 8859 sets can only go in GR.
	690	* - JISX0201 left is GL only; JISX0201 right is GR only.
	691	* - The three multibyte sets (GB2312, JISX0208, KSC5601) can go
	692	* in either; we prefer GR where possible since this leads to a
	693	* more compact EUC-like encoding.
	694	*/
	695	static const struct iso2022_escape ctext_escapes[] = {
	696	SEQ("\033$(A", 0\|RO, CTEXT_GB2312),
	697	SEQ("\033$(B", 0\|RO, CTEXT_JISX0208),
	698	SEQ("\033$(C", 0\|RO, CTEXT_KSC5601),
	699	SEQ("\033$(D", 0\|RO, CTEXT_JISX0212),
	700	SEQ("\033$)A", 1, CTEXT_GB2312),
	701	SEQ("\033$)B", 1, CTEXT_JISX0208),
	702	SEQ("\033$)C", 1, CTEXT_KSC5601),
	703	SEQ("\033$)D", 1, CTEXT_JISX0212),
	704	SEQ("\033(B", 0, CTEXT_ASCII),
	705	SEQ("\033(J", 0, CTEXT_JISX0201_LEFT),
	706	SEQ("\033)I", 1, CTEXT_JISX0201_RIGHT),
	707	SEQ("\033-A", 1, CTEXT_ISO8859_1),
	708	SEQ("\033-B", 1, CTEXT_ISO8859_2),
	709	SEQ("\033-C", 1, CTEXT_ISO8859_3),
	710	SEQ("\033-D", 1, CTEXT_ISO8859_4),
	711	SEQ("\033-F", 1, CTEXT_ISO8859_7),
	712	SEQ("\033-G", 1, CTEXT_ISO8859_6),
	713	SEQ("\033-H", 1, CTEXT_ISO8859_8),
	714	SEQ("\033-L", 1, CTEXT_ISO8859_5),
	715	SEQ("\033-M", 1, CTEXT_ISO8859_9),
	716	};
	717	static const struct iso2022 ctext = {
	718	ctext_escapes, lenof(ctext_escapes),
	719	"\1\1\1\1\1\1\1\1\1\1\1\1\2\2\2\2", /* must match the enum above */
	720	"", 0x80000000 \| (CTEXT_ASCII<<0) \| (CTEXT_ISO8859_1<<6), "", TRUE,
	721	ctext_to_ucs, ctext_from_ucs
	722	};
	723	const charset_spec charset_CS_CTEXT = {
	724	CS_CTEXT, read_iso2022s, write_iso2022s, &ctext
	725	};
	726
	727	#else /* ENUM_CHARSETS */
	728
	729	ENUM_CHARSET(CS_ISO2022_JP)
	730	ENUM_CHARSET(CS_ISO2022_KR)
	731	ENUM_CHARSET(CS_CTEXT)
	732
	733	#endif /* ENUM_CHARSETS */