[sgt/charset] / iso2022s.c

/*
 * iso2022s.c - support for ISO-2022 subset encodings.
 */

#ifndef ENUM_CHARSETS

#include <stdio.h>
#include <string.h>
#include <assert.h>

#include "charset.h"
#include "internal.h"
#include "sbcsdat.h"

#define SO (0x0E)
#define SI (0x0F)
#define ESC (0x1B)

/*
 * One escape (or shift) sequence understood by an ISO 2022 subset,
 * together with a description of what it does to the codec state.
 */
struct iso2022_escape {
    /* The bytes making up the sequence, as a NUL-terminated string. */
    const char *sequence;

    /*
     * Effect of the sequence on the s1 state word: s1 is ANDed with
     * `andbits' and the result is then XORed with `xorbits'.
     */
    unsigned long andbits;
    unsigned long xorbits;

    /*
     * Output-side bookkeeping, used to work out which sequence has
     * to be sent to make a given sub-charset available.
     *
     * `container' is the container (0-3) the sequence designates
     * into, and `subcharset' is the sub-charset it puts there.
     * `container' may additionally carry the RO flag, meaning this
     * entry should not be chosen as the way to reach its
     * sub-charset when writing.
     */
    int container;
    int subcharset;
};

/* Flag bit for iso2022_escape.container: not preferred for output. */
#define RO 0x80

/*
 * Complete description of one ISO 2022 subset encoding: its escape
 * sequences, its sub-charsets, its initial state, and the functions
 * that translate between sub-charset byte values and Unicode.
 */
struct iso2022 {
    /*
     * Table of the escape sequences this subset supports, and the
     * number of entries in it. The table MUST be sorted in ASCII
     * order: the reader relies on that to narrow down the set of
     * candidate sequences one input byte at a time.
     */
    const struct iso2022_escape *escapes;
    int nescapes;

    /*
     * Sub-charsets of the subset are numbered from 0 upwards, and
     * nbytes[i] is the number of bytes per character in sub-charset
     * i. (This is a string rather than an int array purely because
     * a string literal is more convenient to write in an
     * initialiser.)
     */
    const char *nbytes;

    /*
     * Recipe for resetting the encoder. Each character is the index
     * of an entry in `escapes', plus one so that index 0 does not
     * collide with the terminating NUL. On reset the entries are
     * applied in this order, and each one is actually written out
     * only if applying it changes the state.
     */
    const char *reset;

    /*
     * Value s1 starts out with, for subsets in which some container
     * initially holds something other than sub-charset 0. The top
     * bit must be set, since that bit is what marks the state as
     * initialised.
     */
    unsigned long s1;

    /*
     * Sequence that must be written at the very start of output, or
     * NULL if the subset has none. It is not demanded on input, but
     * it should still appear in `escapes' so that the reader
     * recognises and absorbs it when it is present.
     */
    const char *initial_sequence;

    /*
     * Nonzero if this is an 8-bit ISO 2022 subset.
     */
    int eightbit;

    /*
     * Translation functions. to_ucs() converts the accumulated bytes
     * of one character in a given sub-charset to a Unicode value;
     * from_ucs() finds the sub-charset and byte value for a Unicode
     * character, returning zero if there is none.
     */
    long int (*to_ucs)(int subcharset, unsigned long bytes);
    int (*from_ucs)(long int ucs, int *subcharset, unsigned long *bytes);
};

/*
 * Decode one byte of input in an ISO 2022 subset encoding.
 *
 * `charset->data' points at the struct iso2022 describing the subset.
 * `input_chr' is the next input byte. `state' carries the decoder
 * state between calls (layout described below). Every decoded
 * character, every control character passed through verbatim, and an
 * ERROR token for every malformed or truncated character, is handed
 * to emit(emitctx, ...). A single call may emit zero, one or several
 * values.
 */
static void read_iso2022s(charset_spec const *charset, long int input_chr,
			  charset_state *state,
			  void (*emit)(void *ctx, long int output),
			  void *emitctx)
{
    struct iso2022 const *iso = (struct iso2022 const *)charset->data;

    /*
     * For reading ISO-2022 subsets, we divide up our state
     * variables as follows:
     * 
     * 	- The top byte of s0 (bits 31:24) indicates, if nonzero,
     * 	  that we are part-way through a recognised ISO-2022 escape
     * 	  sequence. Five of those bits (31:27) give the index of
     * 	  the first member of the escapes list matching what we
     * 	  have so far; the remaining three (26:24) give the number
     * 	  of characters we have seen so far.
     * 
     * 	- The top bit of s1 (bit 31) is non-zero at all times, to
     * 	  indicate that we have performed any necessary
     * 	  initialisation. When we start, we detect a zero s1 and
     * 	  respond to it by initialising the default container
     * 	  contents.
     * 
     * 	- The next three bits of s1 (bits 30:28) indicate which
     * 	  _container_ is currently selected. This isn't quite as
     * 	  simple as it sounds, since we have to preserve memory of
     * 	  which of the SI/SO containers we came from when we're
     * 	  temporarily in SS2/SS3. Hence, what happens is:
     *     + bit 28 indicates SI/SO.
     * 	   + if we're in an SS2/SS3 container, that's indicated by
     * 	     the two bits above that being nonzero and holding
     * 	     either 2 or 3.
     * 	   + Hence: 0 is SI, 1 is SO, 4 is SS2-from-SI, 5 is
     * 	     SS2-from-SO, 6 is SS3-from-SI, 7 is SS3-from-SO.
     * 	   + For added fun: in an _8-bit_ ISO 2022 subset, we have
     * 	     the further special value 2, which means that we're
     * 	     theoretically in SI but the current character being
     * 	     accumulated is composed of 8-bit characters and will
     * 	     therefore be interpreted as if in SO.
     * 
     * 	- The next nibble of s1 (27:24) indicates how many bytes
     * 	  have been accumulated in the current character.
     * 
     * 	- The remaining three bytes of s1 are divided into four
     * 	  six-bit sections, and each section gives the current
     * 	  sub-charset selected in one of the possible containers.
     * 	  (Those containers are SI, SO, SS2 and SS3, respectively
     * 	  and in order from the bottom of s1 to the top.)
     * 
     * 	- The bottom 24 bits of s0 give the accumulated character
     * 	  data so far.
     * 
     * (Note that this means s1 contains all the parts of the state
     * which might need to be operated on by escape sequences.
     * Cunning, eh?)
     */

    if (!(state->s1 & 0x80000000)) {
	state->s1 = iso->s1;
    }

    /*
     * So. Firstly, we process escape sequences, if we're in the
     * middle of one or if we see a possible introducer (SI, SO,
     * ESC).
     */
    if ((state->s0 >> 24) ||
	(input_chr == SO || input_chr == SI || input_chr == ESC)) {
	int n = (int)((state->s0 >> 24) & 7);
	int i = (int)(state->s0 >> 27);
	int oi = i, j;

	/*
	 * If this is the start of an escape sequence, we might be
	 * in mid-character. If so, clear the character state and
	 * emit an error token for the incomplete character.
	 */
	if (state->s1 & 0x0F000000) {
	    state->s1 &= ~0x0F000000;
	    state->s0 &= 0xFF000000;
	    /*
	     * If we were in the SS2 or SS3 container, we
	     * automatically exit it.
	     */
	    if (state->s1 & 0x60000000)
		state->s1 &= 0x9FFFFFFF;
	    emit(emitctx, ERROR);
	}

	/*
	 * Walk forward through the (sorted) escapes that still share
	 * the n characters matched so far, looking for the first one
	 * whose next character is not less than the input.
	 *
	 * The prefix comparison uses strncmp rather than memcmp so
	 * that we never read beyond the terminating NUL of a
	 * candidate sequence shorter than n. The result is the same,
	 * because escapes[oi].sequence is known to contain more than
	 * n non-NUL characters (otherwise the sequence would already
	 * have completed).
	 */
	j = i;
	while (j < iso->nescapes &&
	       !strncmp(iso->escapes[j].sequence,
			iso->escapes[oi].sequence, n)) {
	    if (iso->escapes[j].sequence[n] < input_chr)
		i = ++j;
	    else
		break;
	}
	if (i >= iso->nescapes ||
	    strncmp(iso->escapes[i].sequence,
		    iso->escapes[oi].sequence, n) ||
	    iso->escapes[i].sequence[n] != input_chr) {
	    /*
	     * This character does not appear in any valid escape
	     * sequence. Therefore, we must emit all the characters
	     * we had previously swallowed, plus this one, and
	     * return to non-escape-sequence state.
	     */
	    for (j = 0; j < n; j++)
		emit(emitctx, iso->escapes[oi].sequence[j]);
	    emit(emitctx, input_chr);
	    state->s0 = 0;
	    return;
	}

	/*
	 * Otherwise, we have found an additional character in our
	 * escape sequence. See if we have reached the _end_ of our
	 * sequence (and therefore must process the sequence).
	 */
	n++;
	if (!iso->escapes[i].sequence[n]) {
	    state->s0 = 0;
	    state->s1 &= iso->escapes[i].andbits;
	    state->s1 ^= iso->escapes[i].xorbits;
	    return;
	}

	/*
	 * Failing _that_, we simply update our escape-sequence-
	 * tracking state.
	 *
	 * The shifts are done in unsigned long: shifting an int left
	 * by 27 would overflow (undefined behaviour) for any index
	 * of 16 or more, even though indices up to 31 are permitted.
	 */
	assert(i < 32 && n < 8);
	state->s0 = ((unsigned long)i << 27) | ((unsigned long)n << 24);
	return;
    }

    /*
     * If this isn't an escape sequence, it must be part of a
     * character. One possibility is that it's a control character
     * (00-20 or 7F-9F; also in non-8-bit ISO 2022 subsets I'm
     * going to treat all top-half characters as controls), in
     * which case we output it verbatim.
     */
    if (input_chr < 0x21 ||
	(input_chr > 0x7E && (!iso->eightbit || input_chr < 0xA0))) {
	/*
	 * We might be in mid-multibyte-character. If so, clear the
	 * character state and emit an error token for the
	 * incomplete character.
	 */
	if (state->s1 & 0x0F000000) {
	    state->s1 &= ~0x0F000000;
	    state->s0 &= 0xFF000000;
	    emit(emitctx, ERROR);
	    /*
	     * If we were in the SS2 or SS3 container, we
	     * automatically exit it.
	     */
	    if (state->s1 & 0x60000000)
		state->s1 &= 0x9FFFFFFF;
	}

	emit(emitctx, input_chr);
	return;
    }

    /*
     * Otherwise, accumulate character data.
     */
    {
	unsigned long chr;
	int chrlen, cont, subcharset, bytes;

	/*
	 * Verify that we've seen the right kind of character for
	 * what we're currently doing. This only matters in 8-bit
	 * subsets.
	 */
	if (iso->eightbit) {
	    cont = (state->s1 >> 28) & 7;
	    /*
	     * If cont==0, we're entitled to see either GL or GR
	     * characters. If cont==2, we expect only GR; otherwise
	     * we expect only GL.
	     * 
	     * If we see a GR character while cont==0, we set
	     * cont=2 immediately.
	     */
	    if ((cont == 2 && !(input_chr & 0x80)) ||
		(cont != 0 && cont != 2 && (input_chr & 0x80))) {
		/*
		 * Clear the previous character; it was prematurely
		 * terminated by this error.
		 */
		state->s1 &= ~0x0F000000;
		state->s0 &= 0xFF000000;
		emit(emitctx, ERROR);
		/*
		 * If we were in the SS2 or SS3 container, we
		 * automatically exit it.
		 */
		if (state->s1 & 0x60000000)
		    state->s1 &= 0x9FFFFFFF;
	    }

	    if (cont == 0 && (input_chr & 0x80)) {
		state->s1 |= 0x20000000;
	    }
	}

	/* The current character and its length. */
	chr = ((state->s0 & 0x00FFFFFF) << 8) | (input_chr & 0x7F);
	chrlen = ((state->s1 >> 24) & 0xF) + 1;
	/*
	 * The current sub-charset. Container codes 2-7 collapse to
	 * 1 (the 8-bit pseudo-SO state), 2 (SS2) or 3 (SS3) once the
	 * remembered SI/SO bit is shifted away.
	 */
	cont = (state->s1 >> 28) & 7;
	if (cont > 1) cont >>= 1;
	subcharset = (state->s1 >> (6*cont)) & 0x3F;
	/* The number of bytes-per-character in that sub-charset. */
	bytes = iso->nbytes[subcharset];

	/*
	 * If this character is now complete, we convert and emit
	 * it. Otherwise, we simply update the state and return.
	 */
	if (chrlen >= bytes) {
	    emit(emitctx, iso->to_ucs(subcharset, chr));
	    chr = chrlen = 0;
	    /*
	     * If we were in the SS2 or SS3 container, we
	     * automatically exit it.
	     */
	    if (state->s1 & 0x60000000)
		state->s1 &= 0x9FFFFFFF;
	}
	state->s0 = (state->s0 & 0xFF000000) | chr;
	state->s1 = (state->s1 & 0xF0FFFFFF) | (chrlen << 24);
    }
}

/*
 * Encode one Unicode character into an ISO 2022 subset encoding, or
 * reset the encoder.
 *
 * `charset->data' points at the struct iso2022 describing the subset.
 * `input_chr' is either a non-negative Unicode value to encode, or -1
 * to request that the encoding state be reset. `state' carries the
 * encoder state between calls. Output bytes are delivered one at a
 * time through emit(emitctx, ...).
 *
 * Returns TRUE on success. Returns FALSE, having emitted nothing and
 * left `state' untouched, if the character has no representation in
 * this subset or if `input_chr' is a negative value other than -1.
 */
static int write_iso2022s(charset_spec const *charset, long int input_chr,
			  charset_state *state,
			  void (*emit)(void *ctx, long int output),
			  void *emitctx)
{
    struct iso2022 const *iso = (struct iso2022 const *)charset->data;
    int subcharset = 0, len, i, j, cont, topbit = 0;
    unsigned long bytes = 0;

    /*
     * For output, our s1 state variable contains most of the same
     * stuff as it did for input - initial-state indicator bit,
     * current container, and current subcharset selected in each
     * container.
     */

    /*
     * Analyse the character and find out what subcharset it needs
     * to go in.
     *
     * The only negative input with a defined meaning is -1 (reset).
     * Any other negative value is rejected here; letting it through
     * would reach the encoding code below without `subcharset' and
     * `bytes' ever having been filled in by from_ucs().
     */
    if (input_chr >= 0) {
	if (!iso->from_ucs(input_chr, &subcharset, &bytes))
	    return FALSE;
    } else if (input_chr != -1) {
	return FALSE;
    }

    /*
     * First use of this state: install the subset's initial s1 and
     * write out any mandatory initial sequence.
     */
    if (!(state->s1 & 0x80000000)) {
	state->s1 = iso->s1;
	if (iso->initial_sequence)
	    for (i = 0; iso->initial_sequence[i]; i++)
		emit(emitctx, iso->initial_sequence[i]);
    }

    if (input_chr == -1) {
	unsigned long oldstate;
	int k;

	/*
	 * Special case: reset encoding state. Each entry in the
	 * reset list is applied to s1 in turn, and is written out
	 * only if applying it actually changed something.
	 */
	for (i = 0; iso->reset[i]; i++) {
	    j = iso->reset[i] - 1;     /* stored as index-plus-one */
	    oldstate = state->s1;
	    state->s1 &= iso->escapes[j].andbits;
	    state->s1 ^= iso->escapes[j].xorbits;
	    if (state->s1 != oldstate) {
		/* We must actually emit this sequence. */
		for (k = 0; iso->escapes[j].sequence[k]; k++)
		    emit(emitctx, iso->escapes[j].sequence[k]);
	    }
	}

	return TRUE;
    }

    /*
     * Now begins the fun. We now know what subcharset we want. So
     * we must find out which container we should select it into,
     * select it into it if necessary, select that _container_ if
     * necessary, and then output the given bytes.
     *
     * We take the first escape which designates our subcharset and
     * is not flagged RO.
     */
    for (i = 0; i < iso->nescapes; i++)
	if (iso->escapes[i].subcharset == subcharset &&
	    !(iso->escapes[i].container & RO))
	    break;
    assert(i < iso->nescapes);

    /*
     * We've found the escape sequence which would select this
     * subcharset into a container. However, that subcharset might
     * already _be_ selected in that container! Check before we go
     * to the effort of emitting the sequence.
     */
    cont = iso->escapes[i].container &~ RO;
    if (((state->s1 >> (6*cont)) & 0x3F) != (unsigned)subcharset) {
	for (j = 0; iso->escapes[i].sequence[j]; j++)
	    emit(emitctx, iso->escapes[i].sequence[j]);
	state->s1 &= iso->escapes[i].andbits;
	state->s1 ^= iso->escapes[i].xorbits;
    }

    /*
     * Now we know what container our subcharset is in, so we want
     * to select that container.
     */
    if (cont > 1) {
	/* SS2 or SS3; just output the sequence and be done. */
	emit(emitctx, ESC);
	emit(emitctx, 'L' + cont);     /* comes out to 'N' or 'O' */
    } else {
	/*
	 * Emit SI or SO, but only if the current container isn't already
	 * the right one.
	 * 
	 * Also, in an 8-bit subset, we need not do this; we'll
	 * just use 8-bit characters to output SO-container
	 * characters.
	 */
	if (iso->eightbit && cont == 1 && ((state->s1 >> 28) & 7) == 0) {
	    topbit = 0x80;
	} else if (((state->s1 >> 28) & 7) != (unsigned)cont) {
	    emit(emitctx, cont ? SO : SI);
	    state->s1 = (state->s1 & 0x8FFFFFFF) | (cont << 28);
	}
    }

    /*
     * We're done. Subcharset is selected in container, container
     * is selected. All we need now is to write out the bytes, most
     * significant first.
     */
    len = iso->nbytes[subcharset];
    while (len--)
	emit(emitctx, ((bytes >> (8*len)) & 0xFF) | topbit);

    return TRUE;
}

/*
 * ISO-2022-JP, defined in RFC 1468.
 */
static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes)
{
    switch (subcharset) {
      case 1:			       /* JIS X 0201 bottom half */
	if (bytes == 0x5C)
	    return 0xA5;
	else if (bytes == 0x7E)
	    return 0x203E;
	/* else fall through to ASCII */
      case 0: return bytes;	       /* one-byte ASCII */
	/* (no break needed since all control paths have returned) */
      case 2: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
					 ((bytes     ) & 0xFF) - 0x21);
      default: return ERROR;
    }
}
/*
 * Find the ISO-2022-JP representation of a Unicode character. On
 * success, stores the sub-charset number in *subcharset and the
 * byte value(s) in *bytes and returns 1; returns 0 (leaving both
 * outputs alone) if the character cannot be represented.
 */
static int iso2022jp_from_ucs(long int ucs, int *subcharset,
			      unsigned long *bytes)
{
    int row, col;

    /* Everything below 0x80 goes out as ASCII. */
    if (ucs < 0x80) {
	*subcharset = 0;
	*bytes = ucs;
	return 1;
    }

    /* The two characters peculiar to JIS X 0201's bottom half. */
    if (ucs == 0xA5) {
	*subcharset = 1;
	*bytes = 0x5C;
	return 1;
    }
    if (ucs == 0x203E) {
	*subcharset = 1;
	*bytes = 0x7E;
	return 1;
    }

    /* Otherwise it has to be in JIS X 0208, or we can't do it. */
    if (!unicode_to_jisx0208(ucs, &row, &col))
	return 0;

    *subcharset = 2;
    *bytes = ((row+0x21) << 8) | (col+0x21);
    return 1;
}
/*
 * Escape sequences recognised in ISO-2022-JP, in ASCII order as the
 * reader requires. Every entry replaces the low six bits of s1,
 * i.e. the sub-charset held in container 0, which is the only
 * container this subset uses. Sub-charset numbers are the ones
 * iso2022jp_to_ucs() understands: 0 = ASCII, 1 = JIS X 0201 bottom
 * half, 2 = JIS X 0208.
 */
static const struct iso2022_escape iso2022jp_escapes[] = {
    /*
     * ESC $ @ is accepted on input and selects sub-charset 2, just
     * as ESC $ B does. Its container/subcharset fields are -1, so
     * write_iso2022s() (which wants a matching sub-charset and a
     * container without the RO bit) never chooses it for output.
     */
    {"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1},   /* input only */
    {"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2},     /* sub-charset 2 */
    {"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0},     /* sub-charset 0 */
    {"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1},     /* sub-charset 1 */
};
/*
 * Description of the ISO-2022-JP subset, field by field:
 *  - the escape table above and its length;
 *  - nbytes "\1\1\2": sub-charsets 0 and 1 are one byte per
 *    character, sub-charset 2 is two;
 *  - reset "\3": reset by applying escape index 2 (ESC ( B);
 *  - initial s1 0x80000000: just the `initialised' bit, so every
 *    container starts out holding sub-charset 0;
 *  - no mandatory initial sequence;
 *  - not an 8-bit subset;
 *  - the translation functions defined above.
 */
static const struct iso2022 iso2022jp = {
    iso2022jp_escapes, lenof(iso2022jp_escapes),
    "\1\1\2", "\3", 0x80000000, NULL, FALSE,
    iso2022jp_to_ucs, iso2022jp_from_ucs
};
/*
 * Externally visible charset_spec for ISO-2022-JP. It pairs the
 * CS_ISO2022_JP identifier with the generic ISO 2022 subset reader
 * and writer in this file; the final member is presumably the `data'
 * pointer those functions read back as their struct iso2022 (the
 * layout of charset_spec is declared elsewhere).
 */
const charset_spec charset_CS_ISO2022_JP = {
    CS_ISO2022_JP, read_iso2022s, write_iso2022s, &iso2022jp
};

/*
 * ISO-2022-KR, defined in RFC 1557.
 */
static long int iso2022kr_to_ucs(int subcharset, unsigned long bytes)
{
    switch (subcharset) {
      case 0: return bytes;	       /* one-byte ASCII */
      case 1: return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
					((bytes     ) & 0xFF) - 0x21);
      default: return ERROR;
    }
}
/*
 * Find the ISO-2022-KR representation of a Unicode character. On
 * success, stores the sub-charset number in *subcharset and the
 * byte value(s) in *bytes and returns 1; returns 0 (leaving both
 * outputs alone) if the character cannot be represented.
 */
static int iso2022kr_from_ucs(long int ucs, int *subcharset,
			      unsigned long *bytes)
{
    int row, col;

    /* Everything below 0x80 goes out as ASCII. */
    if (ucs < 0x80) {
	*subcharset = 0;
	*bytes = ucs;
	return 1;
    }

    /* Otherwise it has to be in KS X 1001, or we can't do it. */
    if (!unicode_to_ksx1001(ucs, &row, &col))
	return 0;

    *subcharset = 1;
    *bytes = ((row+0x21) << 8) | (col+0x21);
    return 1;
}
/*
 * Shift and escape sequences recognised in ISO-2022-KR, in ASCII
 * order as the reader requires.
 *
 * The first two entries are SO and SI: they rewrite bits 30:28 of
 * s1, the currently selected container, to 1 and 0 respectively.
 * SO's container/subcharset fields are -1, so the output-side search
 * in write_iso2022s() never matches it. SI is listed as container 0,
 * sub-charset 0, which makes it the entry that search finds for
 * sub-charset 0 (and the entry the reset string refers to).
 *
 * The last entry designates sub-charset 1 into container 1.
 */
static const struct iso2022_escape iso2022kr_escapes[] = {
    {"\016", 0x8FFFFFFF, 0x10000000, -1, -1},     /* SO: container <- 1 */
    {"\017", 0x8FFFFFFF, 0x00000000, 0, 0},       /* SI: container <- 0 */
    {"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1},   /* bits[11:6] <- 1 */
};
/*
 * Description of the ISO-2022-KR subset, field by field:
 *  - the escape table above and its length;
 *  - nbytes "\1\2": sub-charset 0 is one byte per character,
 *    sub-charset 1 is two;
 *  - reset "\2": reset by applying escape index 1 (SI);
 *  - initial s1 0x80000040: the `initialised' bit, plus sub-charset
 *    1 already present in container 1 (bits 11:6);
 *  - initial sequence ESC $ ) C, written once at the start of
 *    output;
 *  - not an 8-bit subset;
 *  - the translation functions defined above.
 */
static const struct iso2022 iso2022kr = {
    iso2022kr_escapes, lenof(iso2022kr_escapes),
    "\1\2", "\2", 0x80000040, "\033$)C", FALSE,
    iso2022kr_to_ucs, iso2022kr_from_ucs
};
/*
 * Externally visible charset_spec for ISO-2022-KR. It pairs the
 * CS_ISO2022_KR identifier with the generic ISO 2022 subset reader
 * and writer in this file; the final member is presumably the `data'
 * pointer those functions read back as their struct iso2022 (the
 * layout of charset_spec is declared elsewhere).
 */
const charset_spec charset_CS_ISO2022_KR = {
    CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr
};

#else /* ENUM_CHARSETS */

ENUM_CHARSET(CS_ISO2022_JP)
ENUM_CHARSET(CS_ISO2022_KR)

#endif /* ENUM_CHARSETS */
Commit	Line	Data
c6d25d8d	1	/*
c6d25d8d	2	* iso2022s.c - support for ISO-2022 subset encodings.
c6d25d8d	3	*/
	4
	5	#ifndef ENUM_CHARSETS
	6
	7	#include <stdio.h>
	8	#include <string.h>
	9	#include <assert.h>
	10
	11	#include "charset.h"
	12	#include "internal.h"
01081d4e	13	#include "sbcsdat.h"
c6d25d8d	14
	15	#define SO (0x0E)
	16	#define SI (0x0F)
	17	#define ESC (0x1B)
	18
	19	/* Functional description of a single ISO 2022 escape sequence. */
	20	struct iso2022_escape {
	21	char const *sequence;
	22	unsigned long andbits, xorbits;
	23	/*
	24	* For output, these variables help us figure out which escape
	25	* sequences we need to get where we want to be.
01081d4e	26	*
	27	* `container' should be in the range 0-3, but can also be ORed
	28	* with the bit flag RO to indicate that this is not a
	29	* preferred container to use for this charset during output.
c6d25d8d	30	*/
	31	int container, subcharset;
	32	};
01081d4e	33	#define RO 0x80
c6d25d8d	34
	35	struct iso2022 {
	36	/*
	37	* List of escape sequences supported in this subset. Must be
	38	* in ASCII order, so that we can narrow down the list as
	39	* necessary.
	40	*/
8bade113	41	const struct iso2022_escape escapes;/ must be sorted in ASCII order! */
c6d25d8d	42	int nescapes;
	43
	44	/*
	45	* We assign indices from 0 upwards to the sub-charsets of a
	46	* given ISO 2022 subset. nbytes[i] tells us how many bytes per
	47	* character are required by sub-charset i. (It's a string
	48	* mainly because that makes it easier to declare in C syntax
	49	* than an int array.)
	50	*/
	51	char const *nbytes;
	52
	53	/*
	54	* The characters in this string are indices-plus-one (so that
	55	* NUL can still terminate) of escape sequences in `escapes'.
	56	* These escapes are output in the given sequence to reset the
	57	* encoding state, unless it turns out that a given escape
	58	* would not change the state at all.
	59	*/
	60	char const *reset;
	61
	62	/*
	63	* Initial value of s1, in case the default container contents
	64	* needs to be something other than charset 0 in all cases.
	65	* (Note that this must have the top bit set!)
	66	*/
	67	unsigned long s1;
	68
	69	/*
	70	* For output, some ISO 2022 subsets _mandate_ an initial shift
	71	* sequence. If so, here it is so we can output it. (For the
	72	* sake of basic sanity we won't bother to _require_ it on
	73	* input, although it should of course be listed under
	74	* `escapes' above so that we ignore it when present.)
	75	*/
	76	char const *initial_sequence;
	77
	78	/*
01081d4e	79	* Is this an 8-bit ISO 2022 subset?
	80	*/
	81	int eightbit;
	82
	83	/*
c6d25d8d	84	* Function calls to do the actual translation.
	85	*/
	86	long int (*to_ucs)(int subcharset, unsigned long bytes);
	87	int (from_ucs)(long int ucs, int subcharset, unsigned long *bytes);
	88	};
	89
	90	static void read_iso2022s(charset_spec const *charset, long int input_chr,
	91	charset_state *state,
	92	void (emit)(void ctx, long int output),
	93	void *emitctx)
	94	{
	95	struct iso2022 const iso = (struct iso2022 )charset->data;
	96
	97	/*
	98	* For reading ISO-2022 subsets, we divide up our state
	99	* variables as follows:
	100	*
	101	* - The top byte of s0 (bits 31:24) indicates, if nonzero,
	102	* that we are part-way through a recognised ISO-2022 escape
	103	* sequence. Five of those bits (31:27) give the index of
	104	* the first member of the escapes list matching what we
	105	* have so far; the remaining three (26:24) give the number
	106	* of characters we have seen so far.
	107	*
	108	* - The top bit of s1 (bit 31) is non-zero at all times, to
	109	* indicate that we have performed any necessary
	110	* initialisation. When we start, we detect a zero s1 and
	111	* respond to it by initialising the default container
	112	* contents.
	113	*
	114	* - The next three bits of s1 (bits 30:28) indicate which
	115	* _container_ is currently selected. This isn't quite as
	116	* simple as it sounds, since we have to preserve memory of
	117	* which of the SI/SO containers we came from when we're
	118	* temporarily in SS2/SS3. Hence, what happens is:
	119	* + bit 28 indicates SI/SO.
	120	* + if we're in an SS2/SS3 container, that's indicated by
	121	* the two bits above that being nonzero and holding
	122	* either 2 or 3.
	123	* + Hence: 0 is SI, 1 is SO, 4 is SS2-from-SI, 5 is
	124	* SS2-from-SO, 6 is SS3-from-SI, 7 is SS3-from-SO.
01081d4e	125	* + For added fun: in an _8-bit_ ISO 2022 subset, we have
	126	* the further special value 2, which means that we're
	127	* theoretically in SI but the current character being
	128	* accumulated is composed of 8-bit characters and will
	129	* therefore be interpreted as if in SO.
c6d25d8d	130	*
	131	* - The next nibble of s1 (27:24) indicates how many bytes
	132	* have been accumulated in the current character.
	133	*
	134	* - The remaining three bytes of s1 are divided into four
	135	* six-bit sections, and each section gives the current
	136	* sub-charset selected in one of the possible containers.
	137	* (Those containers are SI, SO, SS2 and SS3, respectively
	138	* and in order from the bottom of s0 to the top.)
	139	*
	140	* - The bottom 24 bits of s0 give the accumulated character
	141	* data so far.
	142	*
	143	* (Note that this means s1 contains all the parts of the state
	144	* which might need to be operated on by escape sequences.
	145	* Cunning, eh?)
	146	*/
	147
	148	if (!(state->s1 & 0x80000000)) {
	149	state->s1 = iso->s1;
	150	}
	151
	152	/*
	153	* So. Firstly, we process escape sequences, if we're in the
	154	* middle of one or if we see a possible introducer (SI, SO,
	155	* ESC).
	156	*/
	157	if ((state->s0 >> 24) \|\|
	158	(input_chr == SO \|\| input_chr == SI \|\| input_chr == ESC)) {
	159	int n = (state->s0 >> 24) & 7, i = (state->s0 >> 27), oi = i, j;
	160
	161	/*
	162	* If this is the start of an escape sequence, we might be
	163	* in mid-character. If so, clear the character state and
	164	* emit an error token for the incomplete character.
	165	*/
	166	if (state->s1 & 0x0F000000) {
	167	state->s1 &= ~0x0F000000;
	168	state->s0 &= 0xFF000000;
	169	/*
	170	* If we were in the SS2 or SS3 container, we
	171	* automatically exit it.
	172	*/
	173	if (state->s1 & 0x60000000)
	174	state->s1 &= 0x9FFFFFFF;
	175	emit(emitctx, ERROR);
	176	}
	177
	178	j = i;
	179	while (j < iso->nescapes &&
	180	!memcmp(iso->escapes[j].sequence,
	181	iso->escapes[oi].sequence, n)) {
	182	if (iso->escapes[j].sequence[n] < input_chr)
	183	i = ++j;
	184	else
	185	break;
	186	}
	187	if (i >= iso->nescapes \|\|
	188	memcmp(iso->escapes[i].sequence,
	189	iso->escapes[oi].sequence, n) \|\|
	190	iso->escapes[i].sequence[n] != input_chr) {
	191	/*
	192	* This character does not appear in any valid escape
	193	* sequence. Therefore, we must emit all the characters
194	* we had previously swallowed, plus this one, and
195	* return to non-escape-sequence state.
196	*/
197	for (j = 0; j < n; j++)
198	emit(emitctx, iso->escapes[oi].sequence[j]);
199	emit(emitctx, input_chr);
200	state->s0 = 0;
201	return;
202	}
203
204	/*
205	* Otherwise, we have found an additional character in our
206	* escape sequence. See if we have reached the _end_ of our
207	* sequence (and therefore must process the sequence).
208	*/
209	n++;
210	if (!iso->escapes[i].sequence[n]) {
211	state->s0 = 0;
212	state->s1 &= iso->escapes[i].andbits;
213	state->s1 ^= iso->escapes[i].xorbits;
214	return;
215	}
216
217	/*
218	* Failing _that_, we simply update our escape-sequence-
219	* tracking state.
220	*/
221	assert(i < 32 && n < 8);
222	state->s0 = (i << 27) \| (n << 24);
223	return;
224	}
225
226	/*
227	* If this isn't an escape sequence, it must be part of a
228	* character. One possibility is that it's a control character
01081d4e	229	* (00-20 or 7F-9F; also in non-8-bit ISO 2022 subsets I'm
	230	* going to treat all top-half characters as controls), in
	231	* which case we output it verbatim.
c6d25d8d	232	*/
01081d4e	233	if (input_chr < 0x21 \|\|
01081d4e	234	(input_chr > 0x7E && (!iso->eightbit \|\| input_chr < 0xA0))) {
c6d25d8d	235	/*
	236	* We might be in mid-multibyte-character. If so, clear the
	237	* character state and emit an error token for the
	238	* incomplete character.
	239	*/
	240	if (state->s1 & 0x0F000000) {
	241	state->s1 &= ~0x0F000000;
	242	state->s0 &= 0xFF000000;
	243	emit(emitctx, ERROR);
	244	/*
	245	* If we were in the SS2 or SS3 container, we
	246	* automatically exit it.
	247	*/
	248	if (state->s1 & 0x60000000)
	249	state->s1 &= 0x9FFFFFFF;
	250	}
	251
	252	emit(emitctx, input_chr);
	253	return;
	254	}
	255
	256	/*
	257	* Otherwise, accumulate character data.
	258	*/
	259	{
	260	unsigned long chr;
	261	int chrlen, cont, subcharset, bytes;
	262
01081d4e	263	/*
	264	* Verify that we've seen the right kind of character for
	265	* what we're currently doing. This only matters in 8-bit
	266	* subsets.
	267	*/
	268	if (iso->eightbit) {
	269	cont = (state->s1 >> 28) & 7;
	270	/*
	271	* If cont==0, we're entitled to see either GL or GR
	272	* characters. If cont==2, we expect only GR; otherwise
	273	* we expect only GL.
	274	*
	275	* If we see a GR character while cont==0, we set
	276	* cont=2 immediately.
	277	*/
	278	if ((cont == 2 && !(input_chr & 0x80)) \|\|
	279	(cont != 0 && cont != 2 && (input_chr & 0x80))) {
	280	/*
	281	* Clear the previous character; it was prematurely
	282	* terminated by this error.
	283	*/
	284	state->s1 &= ~0x0F000000;
	285	state->s0 &= 0xFF000000;
	286	emit(emitctx, ERROR);
	287	/*
	288	* If we were in the SS2 or SS3 container, we
	289	* automatically exit it.
	290	*/
	291	if (state->s1 & 0x60000000)
	292	state->s1 &= 0x9FFFFFFF;
	293	}
	294
	295	if (cont == 0 && (input_chr & 0x80)) {
	296	state->s1 \|= 0x20000000;
	297	}
	298	}
	299
c6d25d8d	300	/* The current character and its length. */
01081d4e	301	chr = ((state->s0 & 0x00FFFFFF) << 8) \| (input_chr & 0x7F);
c6d25d8d	302	chrlen = ((state->s1 >> 24) & 0xF) + 1;
	303	/* The current sub-charset. */
	304	cont = (state->s1 >> 28) & 7;
	305	if (cont > 1) cont >>= 1;
	306	subcharset = (state->s1 >> (6*cont)) & 0x3F;
	307	/* The number of bytes-per-character in that sub-charset. */
	308	bytes = iso->nbytes[subcharset];
	309
	310	/*
	311	* If this character is now complete, we convert and emit
	312	* it. Otherwise, we simply update the state and return.
	313	*/
	314	if (chrlen >= bytes) {
	315	emit(emitctx, iso->to_ucs(subcharset, chr));
	316	chr = chrlen = 0;
	317	/*
	318	* If we were in the SS2 or SS3 container, we
	319	* automatically exit it.
	320	*/
	321	if (state->s1 & 0x60000000)
	322	state->s1 &= 0x9FFFFFFF;
	323	}
	324	state->s0 = (state->s0 & 0xFF000000) \| chr;
	325	state->s1 = (state->s1 & 0xF0FFFFFF) \| (chrlen << 24);
	326	}
	327	}
	328
	329	static int write_iso2022s(charset_spec const *charset, long int input_chr,
	330	charset_state *state,
	331	void (emit)(void ctx, long int output),
	332	void *emitctx)
	333	{
	334	struct iso2022 const iso = (struct iso2022 )charset->data;
01081d4e	335	int subcharset, len, i, j, cont, topbit = 0;
c6d25d8d	336	unsigned long bytes;
	337
	338	/*
	339	* For output, our s1 state variable contains most of the same
	340	* stuff as it did for input - initial-state indicator bit,
	341	* current container, and current subcharset selected in each
	342	* container.
	343	*/
	344
	345	/*
	346	* Analyse the character and find out what subcharset it needs
	347	* to go in.
	348	*/
	349	if (input_chr >= 0 && !iso->from_ucs(input_chr, &subcharset, &bytes))
	350	return FALSE;
	351
	352	if (!(state->s1 & 0x80000000)) {
	353	state->s1 = iso->s1;
	354	if (iso->initial_sequence)
	355	for (i = 0; iso->initial_sequence[i]; i++)
	356	emit(emitctx, iso->initial_sequence[i]);
	357	}
	358
	359	if (input_chr == -1) {
	360	unsigned long oldstate;
	361	int k;
	362
	363	/*
	364	* Special case: reset encoding state.
	365	*/
	366	for (i = 0; iso->reset[i]; i++) {
	367	j = iso->reset[i] - 1;
	368	oldstate = state->s1;
	369	state->s1 &= iso->escapes[j].andbits;
	370	state->s1 ^= iso->escapes[j].xorbits;
	371	if (state->s1 != oldstate) {
	372	/* We must actually emit this sequence. */
	373	for (k = 0; iso->escapes[j].sequence[k]; k++)
	374	emit(emitctx, iso->escapes[j].sequence[k]);
	375	}
	376	}
	377
	378	return TRUE;
	379	}
	380
	381	/*
	382	* Now begins the fun. We now know what subcharset we want. So
	383	* we must find out which container we should select it into,
	384	* select it into it if necessary, select that _container_ if
	385	* necessary, and then output the given bytes.
	386	*/
	387	for (i = 0; i < iso->nescapes; i++)
01081d4e	388	if (iso->escapes[i].subcharset == subcharset &&
01081d4e	389	!(iso->escapes[i].container & RO))
c6d25d8d	390	break;
	391	assert(i < iso->nescapes);
	392
	393	/*
	394	* We've found the escape sequence which would select this
	395	* subcharset into a container. However, that subcharset might
	396	* already _be_ selected in that container! Check before we go
	397	* to the effort of emitting the sequence.
	398	*/
01081d4e	399	cont = iso->escapes[i].container &~ RO;
3cca0edf	400	if (((state->s1 >> (6*cont)) & 0x3F) != (unsigned)subcharset) {
c6d25d8d	401	for (j = 0; iso->escapes[i].sequence[j]; j++)
	402	emit(emitctx, iso->escapes[i].sequence[j]);
	403	state->s1 &= iso->escapes[i].andbits;
	404	state->s1 ^= iso->escapes[i].xorbits;
	405	}
	406
	407	/*
	408	* Now we know what container our subcharset is in, so we want
	409	* to select that container.
	410	*/
	411	if (cont > 1) {
	412	/* SS2 or SS3; just output the sequence and be done. */
	413	emit(emitctx, ESC);
	414	emit(emitctx, 'L' + cont); /* comes out to 'N' or 'O' */
	415	} else {
01081d4e	416	/*
	417	* Emit SI or SO, but only if the current container isn't already
	418	* the right one.
	419	*
	420	* Also, in an 8-bit subset, we need not do this; we'll
	421	* just use 8-bit characters to output SO-container
	422	* characters.
	423	*/
	424	if (iso->eightbit && cont == 1 && ((state->s1 >> 28) & 7) == 0) {
	425	topbit = 0x80;
	426	} else if (((state->s1 >> 28) & 7) != (unsigned)cont) {
c6d25d8d	427	emit(emitctx, cont ? SO : SI);
	428	state->s1 = (state->s1 & 0x8FFFFFFF) \| (cont << 28);
	429	}
	430	}
	431
	432	/*
	433	* We're done. Subcharset is selected in container, container
	434	* is selected. All we need now is to write out the bytes.
	435	*/
	436	len = iso->nbytes[subcharset];
	437	while (len--)
01081d4e	438	emit(emitctx, ((bytes >> (8*len)) & 0xFF) \| topbit);
c6d25d8d	439
	440	return TRUE;
	441	}
	442
	443	/*
	444	* ISO-2022-JP, defined in RFC 1468.
	445	*/
	446	static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes)
	447	{
	448	switch (subcharset) {
a933148c	449	case 1: /* JIS X 0201 bottom half */
	450	if (bytes == 0x5C)
	451	return 0xA5;
	452	else if (bytes == 0x7E)
	453	return 0x203E;
	454	/* else fall through to ASCII */
c6d25d8d	455	case 0: return bytes; /* one-byte ASCII */
c6d25d8d	456	/* (no break needed since all control paths have returned) */
	457	case 2: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
	458	((bytes ) & 0xFF) - 0x21);
	459	default: return ERROR;
	460	}
	461	}
	462	static int iso2022jp_from_ucs(long int ucs, int *subcharset,
	463	unsigned long *bytes)
	464	{
	465	int r, c;
	466	if (ucs < 0x80) {
	467	*subcharset = 0;
	468	*bytes = ucs;
	469	return 1;
a933148c	470	} else if (ucs == 0xA5 \|\| ucs == 0x203E) {
c6d25d8d	471	*subcharset = 1;
a933148c	472	*bytes = (ucs == 0xA5 ? 0x5C : 0x7E);
c6d25d8d	473	return 1;
	474	} else if (unicode_to_jisx0208(ucs, &r, &c)) {
	475	*subcharset = 2;
	476	*bytes = ((r+0x21) << 8) \| (c+0x21);
	477	return 1;
	478	} else {
	479	return 0;
	480	}
	481	}
8bade113	482	static const struct iso2022_escape iso2022jp_escapes[] = {
c6d25d8d	483	{"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1}, /* we ignore this one */
	484	{"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2},
	485	{"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0},
	486	{"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1},
	487	};
8bade113	488	static const struct iso2022 iso2022jp = {
c6d25d8d	489	iso2022jp_escapes, lenof(iso2022jp_escapes),
01081d4e	490	"\1\1\2", "\3", 0x80000000, NULL, FALSE,
01081d4e	491	iso2022jp_to_ucs, iso2022jp_from_ucs
c6d25d8d	492	};
	493	const charset_spec charset_CS_ISO2022_JP = {
	494	CS_ISO2022_JP, read_iso2022s, write_iso2022s, &iso2022jp
	495	};
	496
	497	/*
	498	* ISO-2022-KR, defined in RFC 1557.
	499	*/
	500	static long int iso2022kr_to_ucs(int subcharset, unsigned long bytes)
	501	{
	502	switch (subcharset) {
	503	case 0: return bytes; /* one-byte ASCII */
	504	case 1: return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
	505	((bytes ) & 0xFF) - 0x21);
	506	default: return ERROR;
	507	}
	508	}
	509	static int iso2022kr_from_ucs(long int ucs, int *subcharset,
	510	unsigned long *bytes)
	511	{
	512	int r, c;
	513	if (ucs < 0x80) {
	514	*subcharset = 0;
	515	*bytes = ucs;
	516	return 1;
	517	} else if (unicode_to_ksx1001(ucs, &r, &c)) {
	518	*subcharset = 1;
	519	*bytes = ((r+0x21) << 8) \| (c+0x21);
	520	return 1;
	521	} else {
	522	return 0;
	523	}
	524	}
8bade113	525	static const struct iso2022_escape iso2022kr_escapes[] = {
c6d25d8d	526	{"\016", 0x8FFFFFFF, 0x10000000, -1, -1},
	527	{"\017", 0x8FFFFFFF, 0x00000000, 0, 0},
	528	{"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1}, /* bits[11:6] <- 1 */
	529	};
8bade113	530	static const struct iso2022 iso2022kr = {
c6d25d8d	531	iso2022kr_escapes, lenof(iso2022kr_escapes),
01081d4e	532	"\1\2", "\2", 0x80000040, "\033$)C", FALSE,
01081d4e	533	iso2022kr_to_ucs, iso2022kr_from_ucs
c6d25d8d	534	};
	535	const charset_spec charset_CS_ISO2022_KR = {
	536	CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr
	537	};
	538
	539	#else /* ENUM_CHARSETS */
	540
	541	ENUM_CHARSET(CS_ISO2022_JP)
	542	ENUM_CHARSET(CS_ISO2022_KR)
	543
	544	#endif /* ENUM_CHARSETS */