mdw@git.distorted.org.uk Git - sgt/charset/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* euc.c - routines to handle the various EUC multibyte encodings.
	3	*/
	4
	5	#ifndef ENUM_CHARSETS
	6
	7	#include "charset.h"
	8	#include "internal.h"
	9
	/*
	 * Description of one EUC variant.  A pointer to one of these is
	 * stored in the `data' member of each EUC charset_spec and is what
	 * read_euc() and write_euc() consult to find out how the variant
	 * behaves.
	 */
	struct euc {
	    /*
	     * Number of bytes (not counting any SS2/SS3 introducer) making
	     * up a character in each section: [0] is plain GR, [1] is
	     * SS2+GR, [2] is SS3+GR.
	     */
	    int nchars[3];

	    /*
	     * Translate a completed read_euc() state word (section number
	     * in bits 31:28, accumulated bytes in the low 24 bits) into a
	     * code point, or ERROR.
	     */
	    long (*to_ucs)(unsigned long packed);

	    /*
	     * Translate a code point into the same packed form: section
	     * number in bits 31:28, encoded bytes in the low 24 bits.
	     * Returns 0 when the code point cannot be represented.
	     */
	    unsigned long (*from_ucs)(long codepoint);
	};
	15
	16	static void read_euc(charset_spec const *charset, long int input_chr,
	17	charset_state *state,
	18	void (emit)(void ctx, long int output), void *emitctx)
	19	{
	20	struct euc const euc = (struct euc )charset->data;
	21
	22	/*
	23	* For EUC input, our state variable divides into three parts:
	24	*
	25	* - Topmost nibble (bits 31:28) is nonzero if we're
	26	* accumulating a multibyte character, and it indicates
	27	* which section we're in: 1 for GR chars, 2 for things
	28	* beginning with SS2, 3 for things beginning with SS3.
	29	*
	30	* - Next nibble (bits 27:24) indicates how many bytes of the
	31	* character we've accumulated so far.
	32	*
	33	* - The rest (bits 23:0) are those bytes in full, accumulated
	34	* as a large integer (so that seeing A1 A2 A3, in a
	35	* hypothetical EUC whose GR encoding is three-byte, runs
	36	* our state variable from 0 -> 0x110000A1 -> 0x1200A1A2 ->
	37	* 0x13A1A2A3, at which point it gets translated and output
	38	* and resets to zero).
	39	*/
	40
	41	if (state->s0 != 0) {
	42
	43	/*
	44	* At this point, no matter whether we had an SS2 or SS3
	45	* introducer or not, we _always_ expect a GR character.
	46	* Anything else causes us to emit ERROR for an incomplete
	47	* character, and then reset to state 0 to process the
	48	* character in its own way.
	49	*/
	50	if (input_chr < 0xA1 \|\| input_chr == 0xFF) {
	51	emit(emitctx, ERROR);
	52	state->s0 = 0;
	53	} else
	54	state->s0 = (((state->s0 & 0xFF000000) + 0x01000000) \|
	55	((state->s0 & 0x0000FFFF) << 8) \| input_chr);
	56
	57	}
	58
	59	if (state->s0 == 0) {
	60	/*
	61	* The input character determines which of the four
	62	* possible charsets we're going to be in.
	63	*/
	64	if (input_chr < 0x80) { /* this is always ASCII */
	65	emit(emitctx, input_chr);
	66	} else if (input_chr == 0x8E) {/* SS2 means charset 2 */
	67	state->s0 = 0x20000000;
	68	} else if (input_chr == 0x8F) {/* SS3 means charset 3 */
	69	state->s0 = 0x30000000;
	70	} else if (input_chr < 0xA1 \|\| input_chr == 0xFF) { /* errors */
	71	emit(emitctx, ERROR);
	72	} else { /* A1-FE means charset 1 */
	73	state->s0 = 0x11000000 \| input_chr;
	74	}
	75	}
	76
	77	/*
	78	* Finally, if we have accumulated a complete character, output
	79	* it.
	80	*/
	81	if (state->s0 != 0 &&
	82	((state->s0 & 0x0F000000) >> 24) >=
	83	(unsigned)euc->nchars[(state->s0 >> 28)-1]) {
	84	emit(emitctx, euc->to_ucs(state->s0));
	85	state->s0 = 0;
	86	}
	87	}
	88
	89	/*
	90	* All EUCs are stateless multi-byte encodings (in the sense that
	91	* just after any character has been completed, the state is always
	92	* the same); hence when writing them, there is no need to use the
	93	* charset_state.
	94	*/
	95
	96	static int write_euc(charset_spec const *charset, long int input_chr,
	97	charset_state *state,
	98	void (emit)(void ctx, long int output), void *emitctx)
	99	{
	100	struct euc const euc = (struct euc )charset->data;
	101	unsigned long c;
	102	int cset, len;
	103
	104	UNUSEDARG(state);
	105
	106	if (input_chr == -1)
	107	return TRUE; /* stateless; no cleanup required */
	108
	109	/* ASCII is the easy bit, and is always the same. */
	110	if (input_chr < 0x80) {
	111	emit(emitctx, input_chr);
	112	return TRUE;
	113	}
	114
	115	c = euc->from_ucs(input_chr);
	116	if (!c) {
	117	return FALSE;
	118	}
	119
	120	cset = c >> 28;
	121	len = euc->nchars[cset-1];
	122	c &= 0xFFFFFF;
	123
	124	if (cset > 1)
	125	emit(emitctx, 0x8C + cset); /* SS2/SS3 */
	126
	127	while (len--)
	128	emit(emitctx, (c >> (8*len)) & 0xFF);
	129	return TRUE;
	130	}
	131
	132	/*
	133	* EUC-CN encodes GB2312 only.
	134	*/
	135	static long int euc_cn_to_ucs(unsigned long state)
	136	{
	137	switch (state >> 28) {
	138	case 1: return gb2312_to_unicode(((state >> 8) & 0xFF) - 0xA1,
	139	((state ) & 0xFF) - 0xA1);
	140	default: return ERROR;
	141	}
	142	}
	/*
	 * Encode a code point for EUC-CN.  Returns the packed form used by
	 * write_euc() - section 1 in the top nibble, then the GB2312 row and
	 * column each offset by 0xA1 in the two low bytes - or 0 if the code
	 * point is not in GB2312.
	 */
	static unsigned long euc_cn_from_ucs(long int ucs)
	{
	    int r, c;

	    if (!unicode_to_gb2312(ucs, &r, &c))
	        return 0;

	    return 0x10000000UL |
	        ((unsigned long)(r + 0xA1) << 8) |
	        (unsigned long)(c + 0xA1);
	}
	151	static const struct euc euc_cn = {
	152	{2,0,0}, euc_cn_to_ucs, euc_cn_from_ucs
	153	};
	154	const charset_spec charset_CS_EUC_CN = {
	155	CS_EUC_CN, read_euc, write_euc, &euc_cn
	156	};
	157
	158	/*
	159	* EUC-KR encodes KS X 1001 only.
	160	*/
	161	static long int euc_kr_to_ucs(unsigned long state)
	162	{
	163	switch (state >> 28) {
	164	case 1: return ksx1001_to_unicode(((state >> 8) & 0xFF) - 0xA1,
	165	((state ) & 0xFF) - 0xA1);
	166	default: return ERROR;
	167	}
	168	}
	/*
	 * Encode a code point for EUC-KR.  Returns the packed form used by
	 * write_euc() - section 1 in the top nibble, then the KS X 1001 row
	 * and column each offset by 0xA1 in the two low bytes - or 0 if the
	 * code point is not in KS X 1001.
	 */
	static unsigned long euc_kr_from_ucs(long int ucs)
	{
	    int r, c;

	    if (!unicode_to_ksx1001(ucs, &r, &c))
	        return 0;

	    return 0x10000000UL |
	        ((unsigned long)(r + 0xA1) << 8) |
	        (unsigned long)(c + 0xA1);
	}
	177	static const struct euc euc_kr = {
	178	{2,0,0}, euc_kr_to_ucs, euc_kr_from_ucs
	179	};
	180	const charset_spec charset_CS_EUC_KR = {
	181	CS_EUC_KR, read_euc, write_euc, &euc_kr
	182	};
	183
	184	/*
	185	* EUC-JP encodes several character sets.
	186	*/
	187	static long int euc_jp_to_ucs(unsigned long state)
	188	{
	189	switch (state >> 28) {
	190	case 1: return jisx0208_to_unicode(((state >> 8) & 0xFF) - 0xA1,
	191	((state ) & 0xFF) - 0xA1);
	192	case 2:
	193	/*
	194	* This is the top half of JIS X 0201. That means A1-DF map
	195	* to FF61-FF9F, and nothing else is valid.
	196	*/
	197	{
	198	int c = state & 0xFF;
	199	if (c >= 0xA1 && c <= 0xDF)
	200	return c + (0xFF61 - 0xA1);
	201	else
	202	return ERROR;
	203	}
	204	/* (no break needed since all control paths have returned) */
	205	case 3: return jisx0212_to_unicode(((state >> 8) & 0xFF) - 0xA1,
	206	((state ) & 0xFF) - 0xA1);
	207	default: return ERROR; /* placate optimisers */
	208	}
	209	}
	/*
	 * Encode a code point for EUC-JP.  Returns the packed form used by
	 * write_euc(), or 0 if none of the three character sets contains the
	 * code point.  The sets are tried in this order:
	 *
	 *   U+FF61..U+FF9F  -> section 2 (SS2), one byte A1-DF
	 *   JIS X 0208      -> section 1, row and column offset by 0xA1
	 *   JIS X 0212      -> section 3 (SS3), row and column offset by 0xA1
	 */
	static unsigned long euc_jp_from_ucs(long int ucs)
	{
	    int r, c;

	    if (ucs >= 0xFF61 && ucs <= 0xFF9F)
	        return 0x20000000UL | (unsigned long)(ucs - (0xFF61 - 0xA1));

	    if (unicode_to_jisx0208(ucs, &r, &c))
	        return 0x10000000UL |
	            ((unsigned long)(r + 0xA1) << 8) |
	            (unsigned long)(c + 0xA1);

	    if (unicode_to_jisx0212(ucs, &r, &c))
	        return 0x30000000UL |
	            ((unsigned long)(r + 0xA1) << 8) |
	            (unsigned long)(c + 0xA1);

	    return 0;
	}
	222	static const struct euc euc_jp = {
	223	{2,1,2}, euc_jp_to_ucs, euc_jp_from_ucs
	224	};
	225	const charset_spec charset_CS_EUC_JP = {
	226	CS_EUC_JP, read_euc, write_euc, &euc_jp
	227	};
	228
	229	/*
	230	* EUC-TW encodes CNS 11643 (all planes).
	231	*/
	232	static long int euc_tw_to_ucs(unsigned long state)
	233	{
	234	int plane;
	235	switch (state >> 28) {
	236	case 1: return cns11643_to_unicode(0, ((state >> 8) & 0xFF) - 0xA1,
	237	((state ) & 0xFF) - 0xA1);
	238	case 2:
	239	plane = ((state >> 8) & 0xFF) - 0xA1;
	240	if (plane >= 7) return ERROR;
	241	return cns11643_to_unicode(plane, ((state >> 8) & 0xFF) - 0xA1,
	242	((state ) & 0xFF) - 0xA1);
	243	default: return ERROR;
	244	}
	245	}
	/*
	 * Encode a code point for EUC-TW.  Returns the packed form used by
	 * write_euc(), or 0 if the code point is not in CNS 11643.
	 *
	 *   plane 0       -> section 1: row and column, each offset by 0xA1
	 *   other planes  -> section 2 (SS2): plane, row and column, each
	 *                    offset by 0xA1, in bits 23:16, 15:8 and 7:0
	 */
	static unsigned long euc_tw_from_ucs(long int ucs)
	{
	    int p, r, c;
	    unsigned long rowcol;

	    if (!unicode_to_cns11643(ucs, &p, &r, &c))
	        return 0;

	    rowcol = ((unsigned long)(r + 0xA1) << 8) | (unsigned long)(c + 0xA1);

	    if (p == 0)
	        return 0x10000000UL | rowcol;

	    return 0x20000000UL | ((unsigned long)(p + 0xA1) << 16) | rowcol;
	}
	258	static const struct euc euc_tw = {
	259	{2,3,0}, euc_tw_to_ucs, euc_tw_from_ucs
	260	};
	261	const charset_spec charset_CS_EUC_TW = {
	262	CS_EUC_TW, read_euc, write_euc, &euc_tw
	263	};
	264
	265	#else /* ENUM_CHARSETS */
	266
	267	ENUM_CHARSET(CS_EUC_CN)
	268	ENUM_CHARSET(CS_EUC_KR)
	269	ENUM_CHARSET(CS_EUC_JP)
	270	ENUM_CHARSET(CS_EUC_TW)
	271
	272	#endif /* ENUM_CHARSETS */