[sgt/charset] / iso2022s.c

/*
 * iso2022s.c - support for ISO-2022 subset encodings.
 * 
 * (The `s' suffix on the filename is there to leave `iso2022.c'
 * free for the unlikely event that I ever attempt to implement
 * _full_ ISO-2022 in this library!)
 */

#ifndef ENUM_CHARSETS

#include <stdio.h>
#include <string.h>
#include <assert.h>

#include "charset.h"
#include "internal.h"

/*
 * Control characters which can introduce a shift or escape sequence;
 * read_iso2022s() treats any of these as the possible start of one.
 */
#define SO (0x0E)
#define SI (0x0F)
#define ESC (0x1B)

/*
 * One escape (or shift) sequence recognised by an ISO 2022 subset,
 * together with the effect it has on the s1 state word.
 */
struct iso2022_escape {
    /* The bytes of the sequence, introducer included, NUL-terminated. */
    const char *sequence;

    /*
     * Effect of the sequence: once it has been seen (or emitted), the
     * s1 state word is ANDed with andbits and then XORed with xorbits.
     */
    unsigned long andbits;
    unsigned long xorbits;

    /*
     * Used on output only: the sub-charset this sequence makes
     * available and the container it puts it in, so that the writer
     * can work out which sequence gets it where it wants to be.
     */
    int container;
    int subcharset;
};

/*
 * Complete description of one ISO 2022 subset encoding. The generic
 * reader and writer in this file are driven entirely by one of these.
 */
struct iso2022 {
    /*
     * The escape sequences this subset understands, and how many
     * there are. The array has to be sorted in ASCII order, because
     * the reader narrows down the candidates one input byte at a
     * time.
     */
    struct iso2022_escape *escapes;
    int nescapes;

    /*
     * Sub-charsets are numbered from 0 upwards; nbytes[i] is the
     * number of bytes making up one character of sub-charset i.
     * (Stored as a string purely because a string literal is more
     * convenient to write in an initialiser than an int array.)
     */
    const char *nbytes;

    /*
     * NUL-terminated list of escape indices, each stored plus one so
     * that index 0 does not look like the terminator. To reset the
     * encoding state these escapes are applied in order, and each is
     * actually output only if applying it changes the state.
     */
    const char *reset;

    /*
     * Starting value for the s1 state word, for subsets whose
     * containers do not all start out holding sub-charset 0. The
     * top bit must be set, since that is the "initialised" marker.
     */
    unsigned long s1;

    /*
     * Sequence which must be written at the very start of output,
     * for subsets that mandate one; NULL if there is none. It is not
     * demanded on input, but should appear in `escapes' so that it
     * is recognised when present.
     */
    const char *initial_sequence;

    /*
     * The translation functions proper: sub-charset and raw bytes to
     * a character value, and a character value to sub-charset and
     * raw bytes (returning nonzero on success).
     */
    long (*to_ucs)(int subcharset, unsigned long bytes);
    int (*from_ucs)(long ucs, int *subcharset, unsigned long *bytes);
};

/*
 * Decode one byte of input in an ISO 2022 subset encoding.
 *
 * charset->data points at the struct iso2022 describing the subset.
 * input_chr is the next byte of encoded input. state carries the
 * decoder state from one call to the next (layout described below).
 * Results are reported through emit(emitctx, ...): each completed
 * character as converted by iso->to_ucs, each byte passed through
 * verbatim, and ERROR for each partial character that had to be
 * abandoned.
 */
static void read_iso2022s(charset_spec const *charset, long int input_chr,
			  charset_state *state,
			  void (*emit)(void *ctx, long int output),
			  void *emitctx)
{
    struct iso2022 const *iso = (struct iso2022 const *)charset->data;

    /*
     * For reading ISO-2022 subsets, we divide up our state
     * variables as follows:
     *
     *  - The top byte of s0 (bits 31:24) indicates, if nonzero,
     *    that we are part-way through a recognised ISO-2022 escape
     *    sequence. Five of those bits (31:27) give the index of
     *    the first member of the escapes list matching what we
     *    have so far; the remaining three (26:24) give the number
     *    of characters we have seen so far.
     *
     *  - The top bit of s1 (bit 31) is non-zero at all times, to
     *    indicate that we have performed any necessary
     *    initialisation. When we start, we detect a zero s1 and
     *    respond to it by initialising the default container
     *    contents.
     *
     *  - The next three bits of s1 (bits 30:28) indicate which
     *    _container_ is currently selected. This isn't quite as
     *    simple as it sounds, since we have to preserve memory of
     *    which of the SI/SO containers we came from when we're
     *    temporarily in SS2/SS3. Hence, what happens is:
     *     + bit 28 indicates SI/SO.
     *     + if we're in an SS2/SS3 container, that's indicated by
     *       the two bits above that being nonzero and holding
     *       either 2 or 3.
     *     + Hence: 0 is SI, 1 is SO, 4 is SS2-from-SI, 5 is
     *       SS2-from-SO, 6 is SS3-from-SI, 7 is SS3-from-SO.
     *
     *  - The next nibble of s1 (27:24) indicates how many bytes
     *    have been accumulated in the current character.
     *
     *  - The remaining three bytes of s1 are divided into four
     *    six-bit sections, and each section gives the current
     *    sub-charset selected in one of the possible containers.
     *    (Those containers are SI, SO, SS2 and SS3, respectively
     *    and in order from the bottom of s1 to the top.)
     *
     *  - The bottom 24 bits of s0 give the accumulated character
     *    data so far.
     *
     * (Note that this means s1 contains all the parts of the state
     * which might need to be operated on by escape sequences.
     * Cunning, eh?)
     */

    if (!(state->s1 & 0x80000000)) {
	state->s1 = iso->s1;
    }

    /*
     * So. Firstly, we process escape sequences, if we're in the
     * middle of one or if we see a possible introducer (SI, SO,
     * ESC).
     */
    if ((state->s0 >> 24) ||
	(input_chr == SO || input_chr == SI || input_chr == ESC)) {
	int n = (state->s0 >> 24) & 7, i = (state->s0 >> 27), oi = i, j;

	/*
	 * If this is the start of an escape sequence, we might be
	 * in mid-character. If so, clear the character state and
	 * emit an error token for the incomplete character.
	 */
	if (state->s1 & 0x0F000000) {
	    state->s1 &= ~0x0F000000;
	    state->s0 &= 0xFF000000;
	    /*
	     * If we were in the SS2 or SS3 container, we
	     * automatically exit it.
	     */
	    if (state->s1 & 0x60000000)
		state->s1 &= 0x9FFFFFFF;
	    emit(emitctx, ERROR);
	}

	/*
	 * Walk forward through the (sorted) escapes which share the
	 * n-byte prefix we have already matched, looking for the
	 * first one whose next byte is not less than input_chr.
	 */
	j = i;
	while (j < iso->nescapes &&
	       !memcmp(iso->escapes[j].sequence,
		       iso->escapes[oi].sequence, n)) {
	    if (iso->escapes[j].sequence[n] < input_chr)
		i = ++j;
	    else
		break;
	}
	if (i >= iso->nescapes ||
	    memcmp(iso->escapes[i].sequence,
		   iso->escapes[oi].sequence, n) ||
	    iso->escapes[i].sequence[n] != input_chr) {
	    /*
	     * This character does not appear in any valid escape
	     * sequence. Therefore, we must emit all the characters
	     * we had previously swallowed, plus this one, and
	     * return to non-escape-sequence state.
	     */
	    for (j = 0; j < n; j++)
		emit(emitctx, iso->escapes[oi].sequence[j]);
	    emit(emitctx, input_chr);
	    state->s0 = 0;
	    return;
	}

	/*
	 * Otherwise, we have found an additional character in our
	 * escape sequence. See if we have reached the _end_ of our
	 * sequence (and therefore must process the sequence).
	 */
	n++;
	if (!iso->escapes[i].sequence[n]) {
	    state->s0 = 0;
	    state->s1 &= iso->escapes[i].andbits;
	    state->s1 ^= iso->escapes[i].xorbits;
	    return;
	}

	/*
	 * Failing _that_, we simply update our escape-sequence-
	 * tracking state. The shifts are done in unsigned long:
	 * i may be as large as 31, and shifting a signed int that
	 * far left would overflow it.
	 */
	assert(i < 32 && n < 8);
	state->s0 = ((unsigned long)i << 27) | ((unsigned long)n << 24);
	return;
    }

    /*
     * If this isn't an escape sequence, it must be part of a
     * character. One possibility is that it's a control character
     * (outside the space 21-7E), in which case we output it verbatim.
     */
    if (input_chr < 0x21 || input_chr > 0x7E) {
	/*
	 * We might be in mid-multibyte-character. If so, clear the
	 * character state and emit an error token for the
	 * incomplete character.
	 */
	if (state->s1 & 0x0F000000) {
	    state->s1 &= ~0x0F000000;
	    state->s0 &= 0xFF000000;
	    emit(emitctx, ERROR);
	    /*
	     * If we were in the SS2 or SS3 container, we
	     * automatically exit it.
	     */
	    if (state->s1 & 0x60000000)
		state->s1 &= 0x9FFFFFFF;
	}

	emit(emitctx, input_chr);
	return;
    }

    /*
     * Otherwise, accumulate character data.
     */
    {
	unsigned long chr;
	int chrlen, cont, subcharset, bytes;

	/* The current character and its length. */
	chr = ((state->s0 & 0x00FFFFFF) << 8) | input_chr;
	chrlen = ((state->s1 >> 24) & 0xF) + 1;
	/*
	 * The current sub-charset. Container codes 4-7 (the SS2/SS3
	 * variants) collapse to container numbers 2 and 3.
	 */
	cont = (state->s1 >> 28) & 7;
	if (cont > 1) cont >>= 1;
	subcharset = (state->s1 >> (6*cont)) & 0x3F;
	/* The number of bytes-per-character in that sub-charset. */
	bytes = iso->nbytes[subcharset];

	/*
	 * If this character is now complete, we convert and emit
	 * it. Otherwise, we simply update the state and return.
	 */
	if (chrlen >= bytes) {
	    emit(emitctx, iso->to_ucs(subcharset, chr));
	    chr = chrlen = 0;
	    /*
	     * If we were in the SS2 or SS3 container, we
	     * automatically exit it.
	     */
	    if (state->s1 & 0x60000000)
		state->s1 &= 0x9FFFFFFF;
	}
	state->s0 = (state->s0 & 0xFF000000) | chr;
	state->s1 = (state->s1 & 0xF0FFFFFF) | (chrlen << 24);
    }
}

/*
 * Encode one character in an ISO 2022 subset encoding.
 *
 * charset->data points at the struct iso2022 describing the subset.
 * input_chr is the character to encode, or -1 to request that the
 * encoding state be reset. state carries the encoder state between
 * calls. Output bytes, including any escape or shift sequences that
 * turn out to be necessary, are delivered through
 * emit(emitctx, ...).
 *
 * Returns TRUE on success. Returns FALSE, having emitted nothing and
 * left the state untouched, if iso->from_ucs cannot represent the
 * character or if input_chr is a negative value other than -1.
 */
static int write_iso2022s(charset_spec const *charset, long int input_chr,
			  charset_state *state,
			  void (*emit)(void *ctx, long int output),
			  void *emitctx)
{
    struct iso2022 const *iso = (struct iso2022 const *)charset->data;
    int subcharset = 0, len, i, j, cont;
    unsigned long bytes = 0;

    /*
     * For output, our s1 state variable contains most of the same
     * stuff as it did for input - initial-state indicator bit,
     * current container, and current subcharset selected in each
     * container.
     */

    /*
     * The only negative input handled below is -1 (reset). Any
     * other negative value would skip from_ucs and then fall
     * through to the encoding code with subcharset and bytes never
     * having been set, so refuse it here instead.
     */
    if (input_chr < -1)
	return FALSE;

    /*
     * Analyse the character and find out what subcharset it needs
     * to go in.
     */
    if (input_chr >= 0 && !iso->from_ucs(input_chr, &subcharset, &bytes))
	return FALSE;

    /*
     * First output of any kind: set up the initial container
     * contents and write the mandatory header sequence, if the
     * subset has one.
     */
    if (!(state->s1 & 0x80000000)) {
	state->s1 = iso->s1;
	if (iso->initial_sequence)
	    for (i = 0; iso->initial_sequence[i]; i++)
		emit(emitctx, iso->initial_sequence[i]);
    }

    if (input_chr == -1) {
	unsigned long oldstate;
	int k;

	/*
	 * Special case: reset encoding state. Apply each escape in
	 * the reset list in turn, writing it out only if it actually
	 * changes anything.
	 */
	for (i = 0; iso->reset[i]; i++) {
	    j = iso->reset[i] - 1;     /* stored as index-plus-one */
	    oldstate = state->s1;
	    state->s1 &= iso->escapes[j].andbits;
	    state->s1 ^= iso->escapes[j].xorbits;
	    if (state->s1 != oldstate) {
		/* We must actually emit this sequence. */
		for (k = 0; iso->escapes[j].sequence[k]; k++)
		    emit(emitctx, iso->escapes[j].sequence[k]);
	    }
	}

	return TRUE;
    }

    /*
     * Now begins the fun. We now know what subcharset we want. So
     * we must find out which container we should select it into,
     * select it into it if necessary, select that _container_ if
     * necessary, and then output the given bytes.
     */
    for (i = 0; i < iso->nescapes; i++)
	if (iso->escapes[i].subcharset == subcharset)
	    break;
    assert(i < iso->nescapes);

    /*
     * We've found the escape sequence which would select this
     * subcharset into a container. However, that subcharset might
     * already _be_ selected in that container! Check before we go
     * to the effort of emitting the sequence.
     */
    cont = iso->escapes[i].container;
    if (((state->s1 >> (6*cont)) & 0x3F) != (unsigned)subcharset) {
	for (j = 0; iso->escapes[i].sequence[j]; j++)
	    emit(emitctx, iso->escapes[i].sequence[j]);
	state->s1 &= iso->escapes[i].andbits;
	state->s1 ^= iso->escapes[i].xorbits;
    }

    /*
     * Now we know what container our subcharset is in, so we want
     * to select that container.
     */
    if (cont > 1) {
	/* SS2 or SS3; just output the sequence and be done. */
	emit(emitctx, ESC);
	emit(emitctx, 'L' + cont);     /* comes out to 'N' or 'O' */
    } else {
	/* Emit SI or SO, but only if the current container isn't already
	 * the right one. */
	if (((state->s1 >> 28) & 7) != (unsigned)cont) {
	    emit(emitctx, cont ? SO : SI);
	    state->s1 = (state->s1 & 0x8FFFFFFF) | ((unsigned long)cont << 28);
	}
    }

    /*
     * We're done. Subcharset is selected in container, container
     * is selected. All we need now is to write out the bytes, most
     * significant first.
     */
    len = iso->nbytes[subcharset];
    while (len--)
	emit(emitctx, (bytes >> (8*len)) & 0xFF);

    return TRUE;
}

/*
 * ISO-2022-JP, defined in RFC 1468.
 */
static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes)
{
    switch (subcharset) {
      case 0: return bytes;	       /* one-byte ASCII */
      case 1:			       /* JIS X 0201 half-width katakana */
	if (bytes >= 0x21 && bytes <= 0x5F)
	    return bytes + (0xFF61 - 0x21);
	else
	    return ERROR;
	/* (no break needed since all control paths have returned) */
      case 2: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
					 ((bytes     ) & 0xFF) - 0x21);
      default: return ERROR;
    }
}
static int iso2022jp_from_ucs(long int ucs, int *subcharset,
			      unsigned long *bytes)
{
    int r, c;
    if (ucs < 0x80) {
	*subcharset = 0;
	*bytes = ucs;
	return 1;
    } else if (ucs >= 0xFF61 && ucs <= 0xFF9F) {
	*subcharset = 1;
	*bytes = ucs - (0xFF61 - 0x21);
	return 1;
    } else if (unicode_to_jisx0208(ucs, &r, &c)) {
	*subcharset = 2;
	*bytes = ((r+0x21) << 8) | (c+0x21);
	return 1;
    } else {
	return 0;
    }
}
/*
 * Escape sequences of ISO-2022-JP, in ASCII order. Every one of them
 * replaces the sub-charset number held in the bottom six bits of s1,
 * i.e. the contents of container 0.
 */
static struct iso2022_escape iso2022jp_escapes[] = {
    /*
     * ESC $ @ has the same effect on the state as ESC $ B, so it is
     * honoured on input; its container/subcharset of -1 mean the
     * writer's search never picks it for output.
     */
    {"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1},
    {"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2},
    {"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0},
    {"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1},
};
/*
 * Subset description for ISO-2022-JP:
 *  - nbytes "\1\1\2": sub-charsets 0 and 1 are single-byte, 2 is
 *    double-byte;
 *  - reset "\3": escape index 2 (ESC ( B) returns to the reset state;
 *  - initial s1 is 0x80000000: initialised flag only, so every
 *    container starts out holding sub-charset 0;
 *  - no mandatory initial sequence.
 */
static struct iso2022 iso2022jp = {
    iso2022jp_escapes, lenof(iso2022jp_escapes),
    "\1\1\2", "\3", 0x80000000, NULL, iso2022jp_to_ucs, iso2022jp_from_ucs
};
/*
 * Externally visible charset descriptor for ISO-2022-JP: the generic
 * ISO 2022 subset reader and writer, with the iso2022jp description
 * as the last member (the reader and writer fetch it via
 * charset->data).
 */
const charset_spec charset_CS_ISO2022_JP = {
    CS_ISO2022_JP, read_iso2022s, write_iso2022s, &iso2022jp
};

/*
 * ISO-2022-KR, defined in RFC 1557.
 */
static long int iso2022kr_to_ucs(int subcharset, unsigned long bytes)
{
    switch (subcharset) {
      case 0: return bytes;	       /* one-byte ASCII */
      case 1: return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
					((bytes     ) & 0xFF) - 0x21);
      default: return ERROR;
    }
}
/*
 * Work out how to represent a character value in ISO-2022-KR.
 * Values below 0x80 go in sub-charset 0 as themselves; anything else
 * is tried through unicode_to_ksx1001() and, if found, goes in
 * sub-charset 1 as two bytes each offset by 0x21. Returns 1 with
 * *subcharset and *bytes filled in, or 0 if there is no
 * representation.
 */
static int iso2022kr_from_ucs(long int ucs, int *subcharset,
			      unsigned long *bytes)
{
    int row, col;

    if (ucs < 0x80) {
	*subcharset = 0;
	*bytes = ucs;
	return 1;
    }

    if (!unicode_to_ksx1001(ucs, &row, &col))
	return 0;

    *subcharset = 1;
    *bytes = ((row + 0x21) << 8) | (col + 0x21);
    return 1;
}
/*
 * Shift and escape sequences of ISO-2022-KR, in ASCII order.
 *
 * SO and SI only rewrite the container-selection bits (30:28) of s1.
 * SO has container/subcharset -1, so the writer's search never picks
 * it (the writer emits SO and SI itself when switching container).
 * The SI entry doubles as the writer's record that sub-charset 0
 * lives in container 0.
 */
static struct iso2022_escape iso2022kr_escapes[] = {
    {"\016", 0x8FFFFFFF, 0x10000000, -1, -1},
    {"\017", 0x8FFFFFFF, 0x00000000, 0, 0},
    {"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1},   /* bits[11:6] <- 1 */
};
/*
 * Subset description for ISO-2022-KR:
 *  - nbytes "\1\2": sub-charset 0 is single-byte, 1 is double-byte;
 *  - reset "\2": escape index 1 (SI) returns to the reset state;
 *  - initial s1 is 0x80000040: initialised flag, plus sub-charset 1
 *    already present in container 1;
 *  - ESC $ ) C is written once at the start of output.
 */
static struct iso2022 iso2022kr = {
    iso2022kr_escapes, lenof(iso2022kr_escapes),
    "\1\2", "\2", 0x80000040, "\033$)C", iso2022kr_to_ucs, iso2022kr_from_ucs
};
/*
 * Externally visible charset descriptor for ISO-2022-KR: the generic
 * ISO 2022 subset reader and writer, with the iso2022kr description
 * as the last member (the reader and writer fetch it via
 * charset->data).
 */
const charset_spec charset_CS_ISO2022_KR = {
    CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr
};

#else /* ENUM_CHARSETS */

/*
 * When ENUM_CHARSETS is defined, this file contributes only the list
 * of charsets it implements. ENUM_CHARSET is not defined in this
 * file; presumably whoever includes it in this mode supplies the
 * macro.
 */
ENUM_CHARSET(CS_ISO2022_JP)
ENUM_CHARSET(CS_ISO2022_KR)

#endif /* ENUM_CHARSETS */
Commit	Line	Data
c6d25d8d	1	/*
	2	* iso2022s.c - support for ISO-2022 subset encodings.
	3	*
	4	* (The `s' suffix on the filename is there to leave `iso2022.c'
	5	* free for the unlikely event that I ever attempt to implement
	6	* _full_ ISO-2022 in this library!)
	7	*/
	8
	9	#ifndef ENUM_CHARSETS
	10
	11	#include <stdio.h>
	12	#include <string.h>
	13	#include <assert.h>
	14
	15	#include "charset.h"
	16	#include "internal.h"
	17
	18	#define SO (0x0E)
	19	#define SI (0x0F)
	20	#define ESC (0x1B)
	21
	22	/* Functional description of a single ISO 2022 escape sequence. */
	23	struct iso2022_escape {
	24	char const *sequence;
	25	unsigned long andbits, xorbits;
	26	/*
	27	* For output, these variables help us figure out which escape
	28	* sequences we need to get where we want to be.
	29	*/
	30	int container, subcharset;
	31	};
	32
	33	struct iso2022 {
	34	/*
	35	* List of escape sequences supported in this subset. Must be
	36	* in ASCII order, so that we can narrow down the list as
	37	* necessary.
	38	*/
	39	struct iso2022_escape escapes; / must be sorted in ASCII order! */
	40	int nescapes;
	41
	42	/*
	43	* We assign indices from 0 upwards to the sub-charsets of a
	44	* given ISO 2022 subset. nbytes[i] tells us how many bytes per
	45	* character are required by sub-charset i. (It's a string
	46	* mainly because that makes it easier to declare in C syntax
	47	* than an int array.)
	48	*/
	49	char const *nbytes;
	50
	51	/*
	52	* The characters in this string are indices-plus-one (so that
	53	* NUL can still terminate) of escape sequences in `escapes'.
	54	* These escapes are output in the given sequence to reset the
	55	* encoding state, unless it turns out that a given escape
	56	* would not change the state at all.
	57	*/
	58	char const *reset;
	59
	60	/*
	61	* Initial value of s1, in case the default container contents
	62	* needs to be something other than charset 0 in all cases.
	63	* (Note that this must have the top bit set!)
	64	*/
65	unsigned long s1;
66
67	/*
68	* For output, some ISO 2022 subsets _mandate_ an initial shift
69	* sequence. If so, here it is so we can output it. (For the
70	* sake of basic sanity we won't bother to _require_ it on
71	* input, although it should of course be listed under
72	* `escapes' above so that we ignore it when present.)
73	*/
74	char const *initial_sequence;
75
76	/*
77	* Function calls to do the actual translation.
78	*/
79	long int (*to_ucs)(int subcharset, unsigned long bytes);
80	int (from_ucs)(long int ucs, int subcharset, unsigned long *bytes);
81	};
82
83	static void read_iso2022s(charset_spec const *charset, long int input_chr,
84	charset_state *state,
85	void (emit)(void ctx, long int output),
86	void *emitctx)
87	{
88	struct iso2022 const iso = (struct iso2022 )charset->data;
89
90	/*
91	* For reading ISO-2022 subsets, we divide up our state
92	* variables as follows:
93	*
94	* - The top byte of s0 (bits 31:24) indicates, if nonzero,
95	* that we are part-way through a recognised ISO-2022 escape
96	* sequence. Five of those bits (31:27) give the index of
97	* the first member of the escapes list matching what we
98	* have so far; the remaining three (26:24) give the number
99	* of characters we have seen so far.
100	*
101	* - The top bit of s1 (bit 31) is non-zero at all times, to
102	* indicate that we have performed any necessary
103	* initialisation. When we start, we detect a zero s1 and
104	* respond to it by initialising the default container
105	* contents.
106	*
107	* - The next three bits of s1 (bits 30:28) indicate which
108	* _container_ is currently selected. This isn't quite as
109	* simple as it sounds, since we have to preserve memory of
110	* which of the SI/SO containers we came from when we're
111	* temporarily in SS2/SS3. Hence, what happens is:
112	* + bit 28 indicates SI/SO.
113	* + if we're in an SS2/SS3 container, that's indicated by
114	* the two bits above that being nonzero and holding
115	* either 2 or 3.
116	* + Hence: 0 is SI, 1 is SO, 4 is SS2-from-SI, 5 is
117	* SS2-from-SO, 6 is SS3-from-SI, 7 is SS3-from-SO.
118	*
119	* - The next nibble of s1 (27:24) indicates how many bytes
120	* have been accumulated in the current character.
121	*
122	* - The remaining three bytes of s1 are divided into four
123	* six-bit sections, and each section gives the current
124	* sub-charset selected in one of the possible containers.
125	* (Those containers are SI, SO, SS2 and SS3, respectively
126	* and in order from the bottom of s0 to the top.)
127	*
128	* - The bottom 24 bits of s0 give the accumulated character
129	* data so far.
130	*
131	* (Note that this means s1 contains all the parts of the state
132	* which might need to be operated on by escape sequences.
133	* Cunning, eh?)
134	*/
135
136	if (!(state->s1 & 0x80000000)) {
137	state->s1 = iso->s1;
138	}
139
140	/*
141	* So. Firstly, we process escape sequences, if we're in the
142	* middle of one or if we see a possible introducer (SI, SO,
143	* ESC).
144	*/
145	if ((state->s0 >> 24) \|\|
146	(input_chr == SO \|\| input_chr == SI \|\| input_chr == ESC)) {
147	int n = (state->s0 >> 24) & 7, i = (state->s0 >> 27), oi = i, j;
148
149	/*
150	* If this is the start of an escape sequence, we might be
151	* in mid-character. If so, clear the character state and
152	* emit an error token for the incomplete character.
153	*/
154	if (state->s1 & 0x0F000000) {
155	state->s1 &= ~0x0F000000;
156	state->s0 &= 0xFF000000;
157	/*
158	* If we were in the SS2 or SS3 container, we
159	* automatically exit it.
160	*/
161	if (state->s1 & 0x60000000)
162	state->s1 &= 0x9FFFFFFF;
163	emit(emitctx, ERROR);
164	}
165
166	j = i;
167	while (j < iso->nescapes &&
168	!memcmp(iso->escapes[j].sequence,
169	iso->escapes[oi].sequence, n)) {
170	if (iso->escapes[j].sequence[n] < input_chr)
171	i = ++j;
172	else
173	break;
174	}
175	if (i >= iso->nescapes \|\|
176	memcmp(iso->escapes[i].sequence,
177	iso->escapes[oi].sequence, n) \|\|
178	iso->escapes[i].sequence[n] != input_chr) {
179	/*
180	* This character does not appear in any valid escape
181	* sequence. Therefore, we must emit all the characters
182	* we had previously swallowed, plus this one, and
183	* return to non-escape-sequence state.
184	*/
185	for (j = 0; j < n; j++)
186	emit(emitctx, iso->escapes[oi].sequence[j]);
187	emit(emitctx, input_chr);
188	state->s0 = 0;
189	return;
190	}
191
192	/*
193	* Otherwise, we have found an additional character in our
194	* escape sequence. See if we have reached the _end_ of our
195	* sequence (and therefore must process the sequence).
196	*/
197	n++;
198	if (!iso->escapes[i].sequence[n]) {
199	state->s0 = 0;
200	state->s1 &= iso->escapes[i].andbits;
201	state->s1 ^= iso->escapes[i].xorbits;
202	return;
203	}
204
205	/*
206	* Failing _that_, we simply update our escape-sequence-
207	* tracking state.
208	*/
209	assert(i < 32 && n < 8);
210	state->s0 = (i << 27) \| (n << 24);
211	return;
212	}
213
214	/*
215	* If this isn't an escape sequence, it must be part of a
216	* character. One possibility is that it's a control character
217	* (outside the space 21-7E), in which case we output it verbatim.
218	*/
219	if (input_chr < 0x21 \|\| input_chr > 0x7E) {
220	/*
221	* We might be in mid-multibyte-character. If so, clear the
222	* character state and emit an error token for the
223	* incomplete character.
224	*/
225	if (state->s1 & 0x0F000000) {
226	state->s1 &= ~0x0F000000;
227	state->s0 &= 0xFF000000;
228	emit(emitctx, ERROR);
229	/*
230	* If we were in the SS2 or SS3 container, we
231	* automatically exit it.
232	*/
233	if (state->s1 & 0x60000000)
234	state->s1 &= 0x9FFFFFFF;
235	}
236
237	emit(emitctx, input_chr);
238	return;
239	}
240
241	/*
242	* Otherwise, accumulate character data.
243	*/
244	{
245	unsigned long chr;
246	int chrlen, cont, subcharset, bytes;
247
248	/* The current character and its length. */
249	chr = ((state->s0 & 0x00FFFFFF) << 8) \| input_chr;
250	chrlen = ((state->s1 >> 24) & 0xF) + 1;
251	/* The current sub-charset. */
252	cont = (state->s1 >> 28) & 7;
253	if (cont > 1) cont >>= 1;
254	subcharset = (state->s1 >> (6*cont)) & 0x3F;
255	/* The number of bytes-per-character in that sub-charset. */
256	bytes = iso->nbytes[subcharset];
257
258	/*
259	* If this character is now complete, we convert and emit
260	* it. Otherwise, we simply update the state and return.
261	*/
262	if (chrlen >= bytes) {
263	emit(emitctx, iso->to_ucs(subcharset, chr));
264	chr = chrlen = 0;
265	/*
266	* If we were in the SS2 or SS3 container, we
267	* automatically exit it.
268	*/
269	if (state->s1 & 0x60000000)
270	state->s1 &= 0x9FFFFFFF;
271	}
272	state->s0 = (state->s0 & 0xFF000000) \| chr;
273	state->s1 = (state->s1 & 0xF0FFFFFF) \| (chrlen << 24);
274	}
275	}
276
277	static int write_iso2022s(charset_spec const *charset, long int input_chr,
278	charset_state *state,
279	void (emit)(void ctx, long int output),
280	void *emitctx)
281	{
282	struct iso2022 const iso = (struct iso2022 )charset->data;
283	int subcharset, len, i, j, cont;
284	unsigned long bytes;
285
286	/*
287	* For output, our s1 state variable contains most of the same
288	* stuff as it did for input - initial-state indicator bit,
289	* current container, and current subcharset selected in each
290	* container.
291	*/
292
293	/*
294	* Analyse the character and find out what subcharset it needs
295	* to go in.
296	*/
297	if (input_chr >= 0 && !iso->from_ucs(input_chr, &subcharset, &bytes))
298	return FALSE;
299
300	if (!(state->s1 & 0x80000000)) {
301	state->s1 = iso->s1;
302	if (iso->initial_sequence)
303	for (i = 0; iso->initial_sequence[i]; i++)
304	emit(emitctx, iso->initial_sequence[i]);
305	}
306
307	if (input_chr == -1) {
308	unsigned long oldstate;
309	int k;
310
311	/*
312	* Special case: reset encoding state.
313	*/
314	for (i = 0; iso->reset[i]; i++) {
315	j = iso->reset[i] - 1;
316	oldstate = state->s1;
317	state->s1 &= iso->escapes[j].andbits;
318	state->s1 ^= iso->escapes[j].xorbits;
319	if (state->s1 != oldstate) {
320	/* We must actually emit this sequence. */
321	for (k = 0; iso->escapes[j].sequence[k]; k++)
322	emit(emitctx, iso->escapes[j].sequence[k]);
323	}
324	}
325
326	return TRUE;
327	}
328
329	/*
330	* Now begins the fun. We now know what subcharset we want. So
331	* we must find out which container we should select it into,
332	* select it into it if necessary, select that _container_ if
333	* necessary, and then output the given bytes.
334	*/
335	for (i = 0; i < iso->nescapes; i++)
336	if (iso->escapes[i].subcharset == subcharset)
337	break;
338	assert(i < iso->nescapes);
339
340	/*
341	* We've found the escape sequence which would select this
342	* subcharset into a container. However, that subcharset might
343	* already _be_ selected in that container! Check before we go
344	* to the effort of emitting the sequence.
345	*/
346	cont = iso->escapes[i].container;
3cca0edf	347	if (((state->s1 >> (6*cont)) & 0x3F) != (unsigned)subcharset) {
c6d25d8d	348	for (j = 0; iso->escapes[i].sequence[j]; j++)
	349	emit(emitctx, iso->escapes[i].sequence[j]);
	350	state->s1 &= iso->escapes[i].andbits;
	351	state->s1 ^= iso->escapes[i].xorbits;
	352	}
	353
	354	/*
	355	* Now we know what container our subcharset is in, so we want
	356	* to select that container.
	357	*/
	358	if (cont > 1) {
	359	/* SS2 or SS3; just output the sequence and be done. */
	360	emit(emitctx, ESC);
	361	emit(emitctx, 'L' + cont); /* comes out to 'N' or 'O' */
	362	} else {
	363	/* Emit SI or SO, but only if the current container isn't already
	364	* the right one. */
3cca0edf	365	if (((state->s1 >> 28) & 7) != (unsigned)cont) {
c6d25d8d	366	emit(emitctx, cont ? SO : SI);
	367	state->s1 = (state->s1 & 0x8FFFFFFF) \| (cont << 28);
	368	}
	369	}
	370
	371	/*
	372	* We're done. Subcharset is selected in container, container
	373	* is selected. All we need now is to write out the bytes.
	374	*/
	375	len = iso->nbytes[subcharset];
	376	while (len--)
	377	emit(emitctx, (bytes >> (8*len)) & 0xFF);
	378
	379	return TRUE;
	380	}
	381
	382	/*
	383	* ISO-2022-JP, defined in RFC 1468.
	384	*/
	385	static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes)
	386	{
	387	switch (subcharset) {
	388	case 0: return bytes; /* one-byte ASCII */
	389	case 1: /* JIS X 0201 half-width katakana */
	390	if (bytes >= 0x21 && bytes <= 0x5F)
	391	return bytes + (0xFF61 - 0x21);
	392	else
	393	return ERROR;
	394	/* (no break needed since all control paths have returned) */
	395	case 2: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
	396	((bytes ) & 0xFF) - 0x21);
	397	default: return ERROR;
	398	}
	399	}
	400	static int iso2022jp_from_ucs(long int ucs, int *subcharset,
	401	unsigned long *bytes)
	402	{
	403	int r, c;
	404	if (ucs < 0x80) {
	405	*subcharset = 0;
	406	*bytes = ucs;
	407	return 1;
	408	} else if (ucs >= 0xFF61 && ucs <= 0xFF9F) {
	409	*subcharset = 1;
	410	*bytes = ucs - (0xFF61 - 0x21);
	411	return 1;
	412	} else if (unicode_to_jisx0208(ucs, &r, &c)) {
	413	*subcharset = 2;
	414	*bytes = ((r+0x21) << 8) \| (c+0x21);
	415	return 1;
	416	} else {
	417	return 0;
	418	}
	419	}
	420	static struct iso2022_escape iso2022jp_escapes[] = {
	421	{"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1}, /* we ignore this one */
	422	{"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2},
	423	{"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0},
	424	{"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1},
	425	};
	426	static struct iso2022 iso2022jp = {
	427	iso2022jp_escapes, lenof(iso2022jp_escapes),
	428	"\1\1\2", "\3", 0x80000000, NULL, iso2022jp_to_ucs, iso2022jp_from_ucs
	429	};
430	const charset_spec charset_CS_ISO2022_JP = {
431	CS_ISO2022_JP, read_iso2022s, write_iso2022s, &iso2022jp
432	};
433
434	/*
435	* ISO-2022-KR, defined in RFC 1557.
436	*/
437	static long int iso2022kr_to_ucs(int subcharset, unsigned long bytes)
438	{
439	switch (subcharset) {
440	case 0: return bytes; /* one-byte ASCII */
441	case 1: return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
442	((bytes ) & 0xFF) - 0x21);
443	default: return ERROR;
444	}
445	}
446	static int iso2022kr_from_ucs(long int ucs, int *subcharset,
447	unsigned long *bytes)
448	{
449	int r, c;
450	if (ucs < 0x80) {
451	*subcharset = 0;
452	*bytes = ucs;
453	return 1;
454	} else if (unicode_to_ksx1001(ucs, &r, &c)) {
455	*subcharset = 1;
456	*bytes = ((r+0x21) << 8) \| (c+0x21);
457	return 1;
458	} else {
459	return 0;
460	}
461	}
462	static struct iso2022_escape iso2022kr_escapes[] = {
463	{"\016", 0x8FFFFFFF, 0x10000000, -1, -1},
464	{"\017", 0x8FFFFFFF, 0x00000000, 0, 0},
465	{"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1}, /* bits[11:6] <- 1 */
466	};
467	static struct iso2022 iso2022kr = {
468	iso2022kr_escapes, lenof(iso2022kr_escapes),
469	"\1\2", "\2", 0x80000040, "\033$)C", iso2022kr_to_ucs, iso2022kr_from_ucs
470	};
471	const charset_spec charset_CS_ISO2022_KR = {
472	CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr
473	};
474
475	#else /* ENUM_CHARSETS */
476
477	ENUM_CHARSET(CS_ISO2022_JP)
478	ENUM_CHARSET(CS_ISO2022_KR)
479
480	#endif /* ENUM_CHARSETS */