/*
* iso2022s.c - support for ISO-2022 subset encodings.
- *
- * (The `s' suffix on the filename is there to leave `iso2022.c'
- * free for the unlikely event that I ever attempt to implement
- * _full_ ISO-2022 in this library!)
*/
#ifndef ENUM_CHARSETS
#include "charset.h"
#include "internal.h"
+#include "sbcsdat.h"
#define SO (0x0E)
#define SI (0x0F)
/*
* For output, these variables help us figure out which escape
* sequences we need to get where we want to be.
+ *
+ * `container' should be in the range 0-3, but can also be ORed
+ * with the bit flag RO to indicate that this is not a
+ * preferred container to use for this charset during output.
*/
int container, subcharset;
};
+#define RO 0x80
struct iso2022 {
/*
* in ASCII order, so that we can narrow down the list as
* necessary.
*/
- struct iso2022_escape *escapes; /* must be sorted in ASCII order! */
+ const struct iso2022_escape *escapes;/* must be sorted in ASCII order! */
int nescapes;
/*
char const *initial_sequence;
/*
+ * Is this an 8-bit ISO 2022 subset?
+ */
+ int eightbit;
+
+ /*
* Function calls to do the actual translation.
*/
long int (*to_ucs)(int subcharset, unsigned long bytes);
* either 2 or 3.
* + Hence: 0 is SI, 1 is SO, 4 is SS2-from-SI, 5 is
* SS2-from-SO, 6 is SS3-from-SI, 7 is SS3-from-SO.
+ * + For added fun: in an _8-bit_ ISO 2022 subset, we have
+ * the further special value 2, which means that we're
+ * theoretically in SI but the current character being
+ * accumulated is composed of 8-bit characters and will
+ * therefore be interpreted as if in SO.
*
* - The next nibble of s1 (27:24) indicates how many bytes
* have been accumulated in the current character.
/*
* If this isn't an escape sequence, it must be part of a
* character. One possibility is that it's a control character
- * (outside the space 21-7E), in which case we output it verbatim.
+ * (00-20 or 7F-9F; also in non-8-bit ISO 2022 subsets I'm
+ * going to treat all top-half characters as controls), in
+ * which case we output it verbatim.
*/
- if (input_chr < 0x21 || input_chr > 0x7E) {
+ if (input_chr < 0x21 ||
+ (input_chr > 0x7E && (!iso->eightbit || input_chr < 0xA0))) {
/*
* We might be in mid-multibyte-character. If so, clear the
* character state and emit an error token for the
unsigned long chr;
int chrlen, cont, subcharset, bytes;
+ /*
+ * Verify that we've seen the right kind of character for
+ * what we're currently doing. This only matters in 8-bit
+ * subsets.
+ */
+ if (iso->eightbit) {
+ cont = (state->s1 >> 28) & 7;
+ /*
+ * If cont==0, we're entitled to see either GL or GR
+ * characters. If cont==2, we expect only GR; otherwise
+ * we expect only GL.
+ *
+ * If we see a GR character while cont==0, we set
+ * cont=2 immediately.
+ */
+ if ((cont == 2 && !(input_chr & 0x80)) ||
+ (cont != 0 && cont != 2 && (input_chr & 0x80))) {
+ /*
+ * Clear the previous character; it was prematurely
+ * terminated by this error.
+ */
+ state->s1 &= ~0x0F000000;
+ state->s0 &= 0xFF000000;
+ emit(emitctx, ERROR);
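+ /*
+ * If we were in the SS2 or SS3 container, we
+ * automatically exit it. (The mask also covers bit 29,
+ * so the 8-bit pseudo-container 2 is likewise dropped
+ * back to SI by the same test.)
+ */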
+ if (state->s1 & 0x60000000)
+ state->s1 &= 0x9FFFFFFF;
+ }
+
+ if (cont == 0 && (input_chr & 0x80)) {
+ state->s1 |= 0x20000000;
+ }
+ }
+
/* The current character and its length. */
- chr = ((state->s0 & 0x00FFFFFF) << 8) | input_chr;
+ chr = ((state->s0 & 0x00FFFFFF) << 8) | (input_chr & 0x7F);
chrlen = ((state->s1 >> 24) & 0xF) + 1;
/* The current sub-charset. */
cont = (state->s1 >> 28) & 7;
void *emitctx)
{
struct iso2022 const *iso = (struct iso2022 *)charset->data;
- int subcharset, len, i, j, cont;
+ int subcharset, len, i, j, cont, topbit = 0;
unsigned long bytes;
/*
* necessary, and then output the given bytes.
*/
for (i = 0; i < iso->nescapes; i++)
- if (iso->escapes[i].subcharset == subcharset)
+ if (iso->escapes[i].subcharset == subcharset &&
+ !(iso->escapes[i].container & RO))
break;
assert(i < iso->nescapes);
* already _be_ selected in that container! Check before we go
* to the effort of emitting the sequence.
*/
- cont = iso->escapes[i].container;
+ cont = iso->escapes[i].container &~ RO;
if (((state->s1 >> (6*cont)) & 0x3F) != (unsigned)subcharset) {
for (j = 0; iso->escapes[i].sequence[j]; j++)
emit(emitctx, iso->escapes[i].sequence[j]);
emit(emitctx, ESC);
emit(emitctx, 'L' + cont); /* comes out to 'N' or 'O' */
} else {
- /* Emit SI or SO, but only if the current container isn't already
- * the right one. */
- if (((state->s1 >> 28) & 7) != (unsigned)cont) {
+ /*
+ * Emit SI or SO, but only if the current container isn't already
+ * the right one.
+ *
+ * Also, in an 8-bit subset, we need not do this; we'll
+ * just use 8-bit characters to output SO-container
+ * characters.
+ */
+ if (iso->eightbit && cont == 1 && ((state->s1 >> 28) & 7) == 0) {
+ topbit = 0x80;
+ } else if (((state->s1 >> 28) & 7) != (unsigned)cont) {
emit(emitctx, cont ? SO : SI);
state->s1 = (state->s1 & 0x8FFFFFFF) | (cont << 28);
}
*/
len = iso->nbytes[subcharset];
while (len--)
- emit(emitctx, (bytes >> (8*len)) & 0xFF);
+ emit(emitctx, ((bytes >> (8*len)) & 0xFF) | topbit);
return TRUE;
}
static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes)
{
switch (subcharset) {
+ case 1: /* JIS X 0201 bottom half */
+ if (bytes == 0x5C)
+ return 0xA5;
+ else if (bytes == 0x7E)
+ return 0x203E;
+ /* else fall through to ASCII */
case 0: return bytes; /* one-byte ASCII */
- case 1: /* JIS X 0201 half-width katakana */
- if (bytes >= 0x21 && bytes <= 0x5F)
- return bytes + (0xFF61 - 0x21);
- else
- return ERROR;
/* (no break needed since all control paths have returned) */
case 2: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
((bytes ) & 0xFF) - 0x21);
*subcharset = 0;
*bytes = ucs;
return 1;
- } else if (ucs >= 0xFF61 && ucs <= 0xFF9F) {
+ } else if (ucs == 0xA5 || ucs == 0x203E) {
*subcharset = 1;
- *bytes = ucs - (0xFF61 - 0x21);
+ *bytes = (ucs == 0xA5 ? 0x5C : 0x7E);
return 1;
} else if (unicode_to_jisx0208(ucs, &r, &c)) {
*subcharset = 2;
return 0;
}
}
/*
 * Escape sequences for the ISO-2022-JP subset, kept in ASCII order as
 * the `escapes' member of struct iso2022 requires. The last two
 * fields of each entry are the container and the subcharset; the
 * subcharset numbers are the ones iso2022jp_to_ucs() switches on.
 * NOTE(review): the two hex words look like an and-mask / xor-mask
 * pair applied to the state word (here: bits[5:0] <- value) --
 * confirm against the struct iso2022_escape definition.
 */
-static struct iso2022_escape iso2022jp_escapes[] = {
+static const struct iso2022_escape iso2022jp_escapes[] = {
{"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1}, /* we ignore this one */
{"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2}, /* container 0, subcharset 2 (JIS X 0208) */
{"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0}, /* container 0, subcharset 0 (ASCII) */
{"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1}, /* container 0, subcharset 1 (JIS X 0201) */
};
/*
 * Descriptor for the ISO-2022-JP subset; charset_CS_ISO2022_JP points
 * at it as its data.
 */
-static struct iso2022 iso2022jp = {
+static const struct iso2022 iso2022jp = {
iso2022jp_escapes, lenof(iso2022jp_escapes), /* escapes, nescapes */
- "\1\1\2", "\3", 0x80000000, NULL, iso2022jp_to_ucs, iso2022jp_from_ucs
+ "\1\1\2", "\3", 0x80000000, NULL, FALSE, /* ..., eightbit = FALSE; presumably "\1\1\2" is nbytes per subcharset -- confirm */
+ iso2022jp_to_ucs, iso2022jp_from_ucs /* to_ucs, from_ucs translation callbacks */
};
const charset_spec charset_CS_ISO2022_JP = {
CS_ISO2022_JP, read_iso2022s, write_iso2022s, &iso2022jp
return 0;
}
}
/*
 * Escape sequences for the ISO-2022-KR subset, kept in ASCII order as
 * the `escapes' member of struct iso2022 requires. The last two
 * fields of each entry are the container and the subcharset.
 * NOTE(review): the two hex words look like an and-mask / xor-mask
 * pair applied to the state word, as the bits[...] notes below
 * assume -- confirm against the struct iso2022_escape definition.
 */
-static struct iso2022_escape iso2022kr_escapes[] = {
+static const struct iso2022_escape iso2022kr_escapes[] = {
{"\016", 0x8FFFFFFF, 0x10000000, -1, -1}, /* SO: bits[30:28] <- 1 */
{"\017", 0x8FFFFFFF, 0x00000000, 0, 0}, /* SI: bits[30:28] <- 0 */
{"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1}, /* bits[11:6] <- 1 */
};
/*
 * Descriptor for the ISO-2022-KR subset; charset_CS_ISO2022_KR points
 * at it as its data.
 */
-static struct iso2022 iso2022kr = {
+static const struct iso2022 iso2022kr = {
iso2022kr_escapes, lenof(iso2022kr_escapes), /* escapes, nescapes */
- "\1\2", "\2", 0x80000040, "\033$)C", iso2022kr_to_ucs, iso2022kr_from_ucs
+ "\1\2", "\2", 0x80000040, "\033$)C", FALSE, /* ..., initial_sequence, eightbit = FALSE; presumably "\1\2" is nbytes per subcharset -- confirm */
+ iso2022kr_to_ucs, iso2022kr_from_ucs /* to_ucs, from_ucs translation callbacks */
};
const charset_spec charset_CS_ISO2022_KR = {
CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr