mdw@git.distorted.org.uk Git - sgt/charset/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* iso2022s.c - support for ISO-2022 subset encodings.
	3	*
	4	* (The `s' suffix on the filename is there to leave `iso2022.c'
	5	* free for the unlikely event that I ever attempt to implement
	6	* _full_ ISO-2022 in this library!)
	7	*/
	8
	9	#ifndef ENUM_CHARSETS
	10
	11	#include <stdio.h>
	12	#include <string.h>
	13	#include <assert.h>
	14
	15	#include "charset.h"
	16	#include "internal.h"
	17
	18	#define SO (0x0E)
	19	#define SI (0x0F)
	20	#define ESC (0x1B)
	21
	22	/* Functional description of a single ISO 2022 escape sequence. */
	23	struct iso2022_escape {
	24	char const *sequence;
	25	unsigned long andbits, xorbits;
	26	/*
	27	* For output, these variables help us figure out which escape
	28	* sequences we need to get where we want to be.
	29	*/
	30	int container, subcharset;
	31	};
	32
	33	struct iso2022 {
	34	/*
	35	* List of escape sequences supported in this subset. Must be
	36	* in ASCII order, so that we can narrow down the list as
	37	* necessary.
	38	*/
	39	struct iso2022_escape escapes; / must be sorted in ASCII order! */
	40	int nescapes;
	41
	42	/*
	43	* We assign indices from 0 upwards to the sub-charsets of a
	44	* given ISO 2022 subset. nbytes[i] tells us how many bytes per
	45	* character are required by sub-charset i. (It's a string
	46	* mainly because that makes it easier to declare in C syntax
	47	* than an int array.)
	48	*/
	49	char const *nbytes;
	50
	51	/*
	52	* The characters in this string are indices-plus-one (so that
	53	* NUL can still terminate) of escape sequences in `escapes'.
	54	* These escapes are output in the given sequence to reset the
	55	* encoding state, unless it turns out that a given escape
	56	* would not change the state at all.
	57	*/
	58	char const *reset;
	59
	60	/*
	61	* Initial value of s1, in case the default container contents
	62	* needs to be something other than charset 0 in all cases.
	63	* (Note that this must have the top bit set!)
	64	*/
	65	unsigned long s1;
	66
	67	/*
	68	* For output, some ISO 2022 subsets _mandate_ an initial shift
	69	* sequence. If so, here it is so we can output it. (For the
	70	* sake of basic sanity we won't bother to _require_ it on
	71	* input, although it should of course be listed under
	72	* `escapes' above so that we ignore it when present.)
	73	*/
	74	char const *initial_sequence;
	75
	76	/*
	77	* Function calls to do the actual translation.
	78	*/
	79	long int (*to_ucs)(int subcharset, unsigned long bytes);
	80	int (from_ucs)(long int ucs, int subcharset, unsigned long *bytes);
	81	};
	82
	83	static void read_iso2022s(charset_spec const *charset, long int input_chr,
	84	charset_state *state,
	85	void (emit)(void ctx, long int output),
	86	void *emitctx)
	87	{
	88	struct iso2022 const iso = (struct iso2022 )charset->data;
	89
	90	/*
	91	* For reading ISO-2022 subsets, we divide up our state
	92	* variables as follows:
	93	*
	94	* - The top byte of s0 (bits 31:24) indicates, if nonzero,
	95	* that we are part-way through a recognised ISO-2022 escape
	96	* sequence. Five of those bits (31:27) give the index of
	97	* the first member of the escapes list matching what we
	98	* have so far; the remaining three (26:24) give the number
	99	* of characters we have seen so far.
	100	*
	101	* - The top bit of s1 (bit 31) is non-zero at all times, to
	102	* indicate that we have performed any necessary
	103	* initialisation. When we start, we detect a zero s1 and
	104	* respond to it by initialising the default container
	105	* contents.
	106	*
	107	* - The next three bits of s1 (bits 30:28) indicate which
	108	* _container_ is currently selected. This isn't quite as
	109	* simple as it sounds, since we have to preserve memory of
	110	* which of the SI/SO containers we came from when we're
	111	* temporarily in SS2/SS3. Hence, what happens is:
	112	* + bit 28 indicates SI/SO.
	113	* + if we're in an SS2/SS3 container, that's indicated by
	114	* the two bits above that being nonzero and holding
	115	* either 2 or 3.
	116	* + Hence: 0 is SI, 1 is SO, 4 is SS2-from-SI, 5 is
	117	* SS2-from-SO, 6 is SS3-from-SI, 7 is SS3-from-SO.
	118	*
	119	* - The next nibble of s1 (27:24) indicates how many bytes
	120	* have been accumulated in the current character.
	121	*
	122	* - The remaining three bytes of s1 are divided into four
	123	* six-bit sections, and each section gives the current
	124	* sub-charset selected in one of the possible containers.
	125	* (Those containers are SI, SO, SS2 and SS3, respectively
	126	* and in order from the bottom of s0 to the top.)
	127	*
	128	* - The bottom 24 bits of s0 give the accumulated character
	129	* data so far.
	130	*
	131	* (Note that this means s1 contains all the parts of the state
	132	* which might need to be operated on by escape sequences.
	133	* Cunning, eh?)
	134	*/
	135
	136	if (!(state->s1 & 0x80000000)) {
	137	state->s1 = iso->s1;
	138	}
	139
	140	/*
	141	* So. Firstly, we process escape sequences, if we're in the
	142	* middle of one or if we see a possible introducer (SI, SO,
	143	* ESC).
	144	*/
	145	if ((state->s0 >> 24) \|\|
	146	(input_chr == SO \|\| input_chr == SI \|\| input_chr == ESC)) {
	147	int n = (state->s0 >> 24) & 7, i = (state->s0 >> 27), oi = i, j;
	148
	149	/*
	150	* If this is the start of an escape sequence, we might be
	151	* in mid-character. If so, clear the character state and
	152	* emit an error token for the incomplete character.
	153	*/
	154	if (state->s1 & 0x0F000000) {
	155	state->s1 &= ~0x0F000000;
	156	state->s0 &= 0xFF000000;
	157	/*
	158	* If we were in the SS2 or SS3 container, we
	159	* automatically exit it.
	160	*/
	161	if (state->s1 & 0x60000000)
	162	state->s1 &= 0x9FFFFFFF;
	163	emit(emitctx, ERROR);
	164	}
	165
	166	j = i;
	167	while (j < iso->nescapes &&
	168	!memcmp(iso->escapes[j].sequence,
	169	iso->escapes[oi].sequence, n)) {
	170	if (iso->escapes[j].sequence[n] < input_chr)
	171	i = ++j;
	172	else
	173	break;
	174	}
	175	if (i >= iso->nescapes \|\|
	176	memcmp(iso->escapes[i].sequence,
	177	iso->escapes[oi].sequence, n) \|\|
	178	iso->escapes[i].sequence[n] != input_chr) {
	179	/*
	180	* This character does not appear in any valid escape
	181	* sequence. Therefore, we must emit all the characters
	182	* we had previously swallowed, plus this one, and
	183	* return to non-escape-sequence state.
	184	*/
	185	for (j = 0; j < n; j++)
	186	emit(emitctx, iso->escapes[oi].sequence[j]);
	187	emit(emitctx, input_chr);
	188	state->s0 = 0;
	189	return;
	190	}
	191
	192	/*
	193	* Otherwise, we have found an additional character in our
	194	* escape sequence. See if we have reached the _end_ of our
	195	* sequence (and therefore must process the sequence).
	196	*/
	197	n++;
	198	if (!iso->escapes[i].sequence[n]) {
	199	state->s0 = 0;
	200	state->s1 &= iso->escapes[i].andbits;
	201	state->s1 ^= iso->escapes[i].xorbits;
	202	return;
	203	}
	204
	205	/*
	206	* Failing _that_, we simply update our escape-sequence-
	207	* tracking state.
	208	*/
	209	assert(i < 32 && n < 8);
	210	state->s0 = (i << 27) \| (n << 24);
	211	return;
	212	}
	213
	214	/*
	215	* If this isn't an escape sequence, it must be part of a
	216	* character. One possibility is that it's a control character
	217	* (outside the space 21-7E), in which case we output it verbatim.
	218	*/
	219	if (input_chr < 0x21 \|\| input_chr > 0x7E) {
	220	/*
	221	* We might be in mid-multibyte-character. If so, clear the
	222	* character state and emit an error token for the
	223	* incomplete character.
	224	*/
	225	if (state->s1 & 0x0F000000) {
	226	state->s1 &= ~0x0F000000;
	227	state->s0 &= 0xFF000000;
	228	emit(emitctx, ERROR);
	229	/*
	230	* If we were in the SS2 or SS3 container, we
	231	* automatically exit it.
	232	*/
	233	if (state->s1 & 0x60000000)
	234	state->s1 &= 0x9FFFFFFF;
	235	}
	236
	237	emit(emitctx, input_chr);
	238	return;
	239	}
	240
	241	/*
	242	* Otherwise, accumulate character data.
	243	*/
	244	{
	245	unsigned long chr;
	246	int chrlen, cont, subcharset, bytes;
	247
	248	/* The current character and its length. */
	249	chr = ((state->s0 & 0x00FFFFFF) << 8) \| input_chr;
	250	chrlen = ((state->s1 >> 24) & 0xF) + 1;
	251	/* The current sub-charset. */
	252	cont = (state->s1 >> 28) & 7;
	253	if (cont > 1) cont >>= 1;
	254	subcharset = (state->s1 >> (6*cont)) & 0x3F;
	255	/* The number of bytes-per-character in that sub-charset. */
	256	bytes = iso->nbytes[subcharset];
	257
	258	/*
	259	* If this character is now complete, we convert and emit
	260	* it. Otherwise, we simply update the state and return.
	261	*/
	262	if (chrlen >= bytes) {
	263	emit(emitctx, iso->to_ucs(subcharset, chr));
	264	chr = chrlen = 0;
	265	/*
	266	* If we were in the SS2 or SS3 container, we
	267	* automatically exit it.
	268	*/
	269	if (state->s1 & 0x60000000)
	270	state->s1 &= 0x9FFFFFFF;
	271	}
	272	state->s0 = (state->s0 & 0xFF000000) \| chr;
	273	state->s1 = (state->s1 & 0xF0FFFFFF) \| (chrlen << 24);
	274	}
	275	}
	276
	277	static int write_iso2022s(charset_spec const *charset, long int input_chr,
	278	charset_state *state,
	279	void (emit)(void ctx, long int output),
	280	void *emitctx)
	281	{
	282	struct iso2022 const iso = (struct iso2022 )charset->data;
	283	int subcharset, len, i, j, cont;
	284	unsigned long bytes;
	285
	286	/*
	287	* For output, our s1 state variable contains most of the same
	288	* stuff as it did for input - initial-state indicator bit,
	289	* current container, and current subcharset selected in each
	290	* container.
	291	*/
	292
	293	/*
	294	* Analyse the character and find out what subcharset it needs
	295	* to go in.
	296	*/
	297	if (input_chr >= 0 && !iso->from_ucs(input_chr, &subcharset, &bytes))
	298	return FALSE;
	299
	300	if (!(state->s1 & 0x80000000)) {
	301	state->s1 = iso->s1;
	302	if (iso->initial_sequence)
	303	for (i = 0; iso->initial_sequence[i]; i++)
	304	emit(emitctx, iso->initial_sequence[i]);
	305	}
	306
	307	if (input_chr == -1) {
	308	unsigned long oldstate;
	309	int k;
	310
	311	/*
	312	* Special case: reset encoding state.
	313	*/
	314	for (i = 0; iso->reset[i]; i++) {
	315	j = iso->reset[i] - 1;
	316	oldstate = state->s1;
	317	state->s1 &= iso->escapes[j].andbits;
	318	state->s1 ^= iso->escapes[j].xorbits;
	319	if (state->s1 != oldstate) {
	320	/* We must actually emit this sequence. */
	321	for (k = 0; iso->escapes[j].sequence[k]; k++)
	322	emit(emitctx, iso->escapes[j].sequence[k]);
	323	}
	324	}
	325
	326	return TRUE;
	327	}
	328
	329	/*
	330	* Now begins the fun. We now know what subcharset we want. So
	331	* we must find out which container we should select it into,
	332	* select it into it if necessary, select that _container_ if
	333	* necessary, and then output the given bytes.
	334	*/
	335	for (i = 0; i < iso->nescapes; i++)
	336	if (iso->escapes[i].subcharset == subcharset)
	337	break;
	338	assert(i < iso->nescapes);
	339
	340	/*
	341	* We've found the escape sequence which would select this
	342	* subcharset into a container. However, that subcharset might
	343	* already _be_ selected in that container! Check before we go
	344	* to the effort of emitting the sequence.
	345	*/
	346	cont = iso->escapes[i].container;
	347	if (((state->s1 >> (6*cont)) & 0x3F) != subcharset) {
	348	for (j = 0; iso->escapes[i].sequence[j]; j++)
	349	emit(emitctx, iso->escapes[i].sequence[j]);
	350	state->s1 &= iso->escapes[i].andbits;
	351	state->s1 ^= iso->escapes[i].xorbits;
	352	}
	353
	354	/*
	355	* Now we know what container our subcharset is in, so we want
	356	* to select that container.
	357	*/
	358	if (cont > 1) {
	359	/* SS2 or SS3; just output the sequence and be done. */
	360	emit(emitctx, ESC);
	361	emit(emitctx, 'L' + cont); /* comes out to 'N' or 'O' */
	362	} else {
	363	/* Emit SI or SO, but only if the current container isn't already
	364	* the right one. */
	365	if (((state->s1 >> 28) & 7) != cont) {
	366	emit(emitctx, cont ? SO : SI);
	367	state->s1 = (state->s1 & 0x8FFFFFFF) \| (cont << 28);
	368	}
	369	}
	370
	371	/*
	372	* We're done. Subcharset is selected in container, container
	373	* is selected. All we need now is to write out the bytes.
	374	*/
	375	len = iso->nbytes[subcharset];
	376	while (len--)
	377	emit(emitctx, (bytes >> (8*len)) & 0xFF);
	378
	379	return TRUE;
	380	}
	381
	382	/*
	383	* ISO-2022-JP, defined in RFC 1468.
	384	*/
	385	static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes)
	386	{
	387	switch (subcharset) {
	388	case 0: return bytes; /* one-byte ASCII */
	389	case 1: /* JIS X 0201 half-width katakana */
	390	if (bytes >= 0x21 && bytes <= 0x5F)
	391	return bytes + (0xFF61 - 0x21);
	392	else
	393	return ERROR;
	394	/* (no break needed since all control paths have returned) */
	395	case 2: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
	396	((bytes ) & 0xFF) - 0x21);
	397	default: return ERROR;
	398	}
	399	}
	400	static int iso2022jp_from_ucs(long int ucs, int *subcharset,
	401	unsigned long *bytes)
	402	{
	403	int r, c;
	404	if (ucs < 0x80) {
	405	*subcharset = 0;
	406	*bytes = ucs;
	407	return 1;
	408	} else if (ucs >= 0xFF61 && ucs <= 0xFF9F) {
	409	*subcharset = 1;
	410	*bytes = ucs - (0xFF61 - 0x21);
	411	return 1;
	412	} else if (unicode_to_jisx0208(ucs, &r, &c)) {
	413	*subcharset = 2;
	414	*bytes = ((r+0x21) << 8) \| (c+0x21);
	415	return 1;
	416	} else {
	417	return 0;
	418	}
	419	}
	420	static struct iso2022_escape iso2022jp_escapes[] = {
	421	{"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1}, /* we ignore this one */
	422	{"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2},
	423	{"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0},
	424	{"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1},
	425	};
	426	static struct iso2022 iso2022jp = {
	427	iso2022jp_escapes, lenof(iso2022jp_escapes),
	428	"\1\1\2", "\3", 0x80000000, NULL, iso2022jp_to_ucs, iso2022jp_from_ucs
	429	};
	430	const charset_spec charset_CS_ISO2022_JP = {
	431	CS_ISO2022_JP, read_iso2022s, write_iso2022s, &iso2022jp
	432	};
	433
	434	/*
	435	* ISO-2022-KR, defined in RFC 1557.
	436	*/
	437	static long int iso2022kr_to_ucs(int subcharset, unsigned long bytes)
	438	{
	439	switch (subcharset) {
	440	case 0: return bytes; /* one-byte ASCII */
	441	case 1: return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
	442	((bytes ) & 0xFF) - 0x21);
	443	default: return ERROR;
	444	}
	445	}
	446	static int iso2022kr_from_ucs(long int ucs, int *subcharset,
	447	unsigned long *bytes)
	448	{
	449	int r, c;
	450	if (ucs < 0x80) {
	451	*subcharset = 0;
	452	*bytes = ucs;
	453	return 1;
	454	} else if (unicode_to_ksx1001(ucs, &r, &c)) {
	455	*subcharset = 1;
	456	*bytes = ((r+0x21) << 8) \| (c+0x21);
	457	return 1;
	458	} else {
	459	return 0;
	460	}
	461	}
	462	static struct iso2022_escape iso2022kr_escapes[] = {
	463	{"\016", 0x8FFFFFFF, 0x10000000, -1, -1},
	464	{"\017", 0x8FFFFFFF, 0x00000000, 0, 0},
	465	{"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1}, /* bits[11:6] <- 1 */
	466	};
	467	static struct iso2022 iso2022kr = {
	468	iso2022kr_escapes, lenof(iso2022kr_escapes),
	469	"\1\2", "\2", 0x80000040, "\033$)C", iso2022kr_to_ucs, iso2022kr_from_ucs
	470	};
	471	const charset_spec charset_CS_ISO2022_KR = {
	472	CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr
	473	};
	474
	475	#else /* ENUM_CHARSETS */
	476
	477	ENUM_CHARSET(CS_ISO2022_JP)
	478	ENUM_CHARSET(CS_ISO2022_KR)
	479
	480	#endif /* ENUM_CHARSETS */