[sgt/charset] / euc.c

/*
 * euc.c - routines to handle the various EUC multibyte encodings.
 */

#ifndef ENUM_CHARSETS

#include "charset.h"
#include "internal.h"

/*
 * Per-encoding description shared by the generic EUC reader and
 * writer below.
 */
struct euc {
    /*
     * Number of GR bytes making up one character in each code set:
     * index 0 is plain GR, index 1 is what follows SS2, index 2 is
     * what follows SS3.
     */
    int nchars[3];

    /* Translate a completed accumulator value into a UCS code point. */
    long (*to_ucs)(unsigned long state);

    /*
     * Translate a UCS code point into a packed value with the code
     * set number in the top nibble and the GR bytes in the low bits;
     * zero means the code point cannot be represented.
     */
    unsigned long (*from_ucs)(long ucs);
};

/*
 * Decode one input byte of an EUC stream.
 *
 * charset->data must point at the struct euc describing the
 * encoding. Decoded code points (or ERROR for malformed input) are
 * handed to emit(); zero, one or two calls may result from a single
 * input byte.
 *
 * state->s0 is the accumulator for a partially read character:
 *
 *   bits 31:28  zero when idle; otherwise the code set being read
 *               (1 = bare GR, 2 = after SS2, 3 = after SS3)
 *   bits 27:24  how many GR bytes have been collected so far
 *   bits 23:0   the collected bytes themselves, most recent lowest
 *
 * e.g. with a three-byte GR set, A1 A2 A3 takes s0 through
 * 0 -> 0x110000A1 -> 0x1200A1A2 -> 0x13A1A2A3, which is then
 * translated, emitted and cleared.
 */
static void read_euc(charset_spec const *charset, long int input_chr,
                     charset_state *state,
                     void (*emit)(void *ctx, long int output), void *emitctx)
{
    struct euc const *enc = (struct euc const *)charset->data;

    /* Bytes A1-FE are the only ones that can be part of a GR character. */
    int const is_gr = (input_chr >= 0xA1 && input_chr != 0xFF);

    /*
     * Step 1: part-way through a character. With or without an
     * SS2/SS3 prefix, only a GR byte may come next.
     */
    if (state->s0 != 0) {
        if (is_gr) {
            /* Bump the byte count and shift the new byte in. */
            state->s0 = (((state->s0 & 0xFF000000) + 0x01000000) |
                         ((state->s0 & 0x0000FFFF) << 8) | input_chr);
        } else {
            /*
             * The pending character was cut short: report it, then
             * drop to the idle state so that this byte is handled
             * afresh by step 2.
             */
            emit(emitctx, ERROR);
            state->s0 = 0;
        }
    }

    /*
     * Step 2: idle, so this byte selects which of the four code sets
     * the next character comes from.
     */
    if (state->s0 == 0) {
        if (input_chr < 0x80) {
            /* Code set 0 is always ASCII and passes straight through. */
            emit(emitctx, input_chr);
        } else if (is_gr) {
            /* First byte of a code set 1 character. */
            state->s0 = 0x11000000 | input_chr;
        } else {
            switch (input_chr) {
              case 0x8E:               /* SS2 introduces code set 2 */
                state->s0 = 0x20000000;
                break;
              case 0x8F:               /* SS3 introduces code set 3 */
                state->s0 = 0x30000000;
                break;
              default:                 /* any other 80-A0 byte, or FF */
                emit(emitctx, ERROR);
                break;
            }
        }
    }

    /*
     * Step 3: if the accumulator now holds as many bytes as the
     * selected code set requires, translate and emit the character.
     */
    if (state->s0 != 0) {
        unsigned long have = (state->s0 & 0x0F000000) >> 24;
        unsigned need = (unsigned)enc->nchars[(state->s0 >> 28) - 1];

        if (have >= need) {
            emit(emitctx, enc->to_ucs(state->s0));
            state->s0 = 0;
        }
    }
}

/*
 * All EUCs are stateless multi-byte encodings (in the sense that
 * just after any character has been completed, the state is always
 * the same); hence when writing them, there is no need to use the
 * charset_state.
 */

/*
 * Encode one UCS code point as EUC, passing each output byte to
 * emit().
 *
 * input_chr == -1 is the end-of-data signal; since nothing is ever
 * held back between characters there is nothing to flush. The state
 * argument is not used.
 *
 * Returns TRUE on success, FALSE if the encoding's from_ucs() hook
 * cannot represent the code point.
 */
static int write_euc(charset_spec const *charset, long int input_chr,
                     charset_state *state,
                     void (*emit)(void *ctx, long int output), void *emitctx)
{
    struct euc const *enc = (struct euc const *)charset->data;
    unsigned long packed;
    int set, remaining;

    UNUSEDARG(state);

    /*
     * Everything below 0x80 needs no table lookup: -1 is the cleanup
     * request and produces no output, anything else is passed through
     * unchanged (ASCII is the same in every EUC).
     */
    if (input_chr < 0x80) {
        if (input_chr != -1)
            emit(emitctx, input_chr);
        return TRUE;
    }

    packed = enc->from_ucs(input_chr);
    if (packed == 0)
        return FALSE;

    /* Top nibble: code set. Low 24 bits: the GR bytes. */
    set = packed >> 28;
    remaining = enc->nchars[set - 1];
    packed &= 0xFFFFFF;

    /* Code sets 2 and 3 are announced by SS2 (0x8E) and SS3 (0x8F). */
    if (set > 1)
        emit(emitctx, 0x8C + set);

    /* Send the GR bytes, most significant first. */
    for (remaining--; remaining >= 0; remaining--)
        emit(emitctx, (packed >> (8 * remaining)) & 0xFF);

    return TRUE;
}

/*
 * EUC-CN encodes GB2312 only.
 */
static long int euc_cn_to_ucs(unsigned long state)
{
    switch (state >> 28) {
      case 1: return gb2312_to_unicode(((state >> 8) & 0xFF) - 0xA1,
				       ((state     ) & 0xFF) - 0xA1);
      default: return ERROR;
    }
}
/*
 * Translate a UCS code point to a packed EUC-CN value: code set 1 in
 * the top nibble, the two GR bytes in the low 16 bits. Returns 0 if
 * unicode_to_gb2312() reports no mapping.
 */
static unsigned long euc_cn_from_ucs(long int ucs)
{
    int row, col;

    if (!unicode_to_gb2312(ucs, &row, &col))
        return 0;

    return 0x10000000 | ((row + 0xA1) << 8) | (col + 0xA1);
}
/* EUC-CN: two-byte GR characters only; nothing follows SS2 or SS3. */
static const struct euc euc_cn = {
    { 2, 0, 0 },
    euc_cn_to_ucs,
    euc_cn_from_ucs
};
/* Exported descriptor tying CS_EUC_CN to the generic EUC routines. */
const charset_spec charset_CS_EUC_CN = {
    CS_EUC_CN,
    read_euc,
    write_euc,
    &euc_cn                            /* read_euc/write_euc fetch this via charset->data */
};

/*
 * EUC-KR encodes KS X 1001 only.
 */
static long int euc_kr_to_ucs(unsigned long state)
{
    switch (state >> 28) {
      case 1: return ksx1001_to_unicode(((state >> 8) & 0xFF) - 0xA1,
				       ((state     ) & 0xFF) - 0xA1);
      default: return ERROR;
    }
}
/*
 * Translate a UCS code point to a packed EUC-KR value: code set 1 in
 * the top nibble, the two GR bytes in the low 16 bits. Returns 0 if
 * unicode_to_ksx1001() reports no mapping.
 */
static unsigned long euc_kr_from_ucs(long int ucs)
{
    int row, col;

    if (!unicode_to_ksx1001(ucs, &row, &col))
        return 0;

    return 0x10000000 | ((row + 0xA1) << 8) | (col + 0xA1);
}
/* EUC-KR: two-byte GR characters only; nothing follows SS2 or SS3. */
static const struct euc euc_kr = {
    { 2, 0, 0 },
    euc_kr_to_ucs,
    euc_kr_from_ucs
};
/* Exported descriptor tying CS_EUC_KR to the generic EUC routines. */
const charset_spec charset_CS_EUC_KR = {
    CS_EUC_KR,
    read_euc,
    write_euc,
    &euc_kr                            /* read_euc/write_euc fetch this via charset->data */
};

/*
 * EUC-JP encodes several character sets.
 */
/*
 * Translate a completed EUC-JP accumulator value to UCS.
 *
 *   code set 1: two GR bytes indexing JIS X 0208
 *   code set 2: one GR byte from the top half of JIS X 0201
 *   code set 3: two GR bytes indexing JIS X 0212
 *
 * Any other code set number yields ERROR.
 */
static long int euc_jp_to_ucs(unsigned long state)
{
    unsigned long const set = state >> 28;
    /* Row/column with each GR byte rebased so that A1 becomes 0. */
    unsigned long const row = ((state >> 8) & 0xFF) - 0xA1;
    unsigned long const col = (state & 0xFF) - 0xA1;

    if (set == 1)
        return jisx0208_to_unicode(row, col);

    if (set == 3)
        return jisx0212_to_unicode(row, col);

    if (set == 2) {
        /*
         * Top half of JIS X 0201: A1-DF correspond to FF61-FF9F and
         * no other byte is valid here.
         */
        int kana = state & 0xFF;

        if (kana < 0xA1 || kana > 0xDF)
            return ERROR;
        return kana + (0xFF61 - 0xA1);
    }

    return ERROR;
}
/*
 * Translate a UCS code point to a packed EUC-JP value (code set in
 * the top nibble, GR bytes in the low bits), or 0 if none of the
 * three code sets can represent it. FF61-FF9F are tried first, then
 * JIS X 0208, then JIS X 0212.
 */
static unsigned long euc_jp_from_ucs(long int ucs)
{
    int row, col;

    /* FF61-FF9F become a single byte A1-DF in code set 2. */
    if (ucs >= 0xFF61 && ucs <= 0xFF9F)
        return 0x20000000 | (ucs - (0xFF61 - 0xA1));

    if (unicode_to_jisx0208(ucs, &row, &col))
        return 0x10000000 | ((row + 0xA1) << 8) | (col + 0xA1);

    if (unicode_to_jisx0212(ucs, &row, &col))
        return 0x30000000 | ((row + 0xA1) << 8) | (col + 0xA1);

    return 0;
}
/*
 * EUC-JP: two GR bytes in code set 1, one after SS2, two after SS3.
 */
static const struct euc euc_jp = {
    { 2, 1, 2 },
    euc_jp_to_ucs,
    euc_jp_from_ucs
};
/* Exported descriptor tying CS_EUC_JP to the generic EUC routines. */
const charset_spec charset_CS_EUC_JP = {
    CS_EUC_JP,
    read_euc,
    write_euc,
    &euc_jp                            /* read_euc/write_euc fetch this via charset->data */
};

#else /* ENUM_CHARSETS */

/*
 * When ENUM_CHARSETS is defined, none of the code above is compiled
 * and this file expands to just the list of charset identifiers it
 * implements. ENUM_CHARSET itself is supplied by whoever includes
 * the file; its definition is not visible here.
 */
ENUM_CHARSET(CS_EUC_CN)
ENUM_CHARSET(CS_EUC_KR)
ENUM_CHARSET(CS_EUC_JP)

#endif /* ENUM_CHARSETS */
Commit	Line	Data
c6d25d8d	1	/*
	2	* euc.c - routines to handle the various EUC multibyte encodings.
	3	*/
	4
	5	#ifndef ENUM_CHARSETS
	6
	7	#include "charset.h"
	8	#include "internal.h"
	9
	10	struct euc {
	11	int nchars[3]; /* GR, SS2+GR, SS3+GR */
	12	long int (*to_ucs)(unsigned long state);
	13	unsigned long (*from_ucs)(long int ucs);
	14	};
	15
	16	static void read_euc(charset_spec const *charset, long int input_chr,
	17	charset_state *state,
	18	void (emit)(void ctx, long int output), void *emitctx)
	19	{
	20	struct euc const euc = (struct euc )charset->data;
	21
	22	/*
	23	* For EUC input, our state variable divides into three parts:
	24	*
	25	* - Topmost nibble (bits 31:28) is nonzero if we're
	26	* accumulating a multibyte character, and it indicates
	27	* which section we're in: 1 for GR chars, 2 for things
	28	* beginning with SS2, 3 for things beginning with SS3.
	29	*
	30	* - Next nibble (bits 27:24) indicates how many bytes of the
	31	* character we've accumulated so far.
	32	*
	33	* - The rest (bits 23:0) are those bytes in full, accumulated
	34	* as a large integer (so that seeing A1 A2 A3, in a
	35	* hypothetical EUC whose GR encoding is three-byte, runs
	36	* our state variable from 0 -> 0x110000A1 -> 0x1200A1A2 ->
	37	* 0x13A1A2A3, at which point it gets translated and output
	38	* and resets to zero).
	39	*/
	40
	41	if (state->s0 != 0) {
	42
	43	/*
	44	* At this point, no matter whether we had an SS2 or SS3
	45	* introducer or not, we _always_ expect a GR character.
	46	* Anything else causes us to emit ERROR for an incomplete
	47	* character, and then reset to state 0 to process the
	48	* character in its own way.
	49	*/
	50	if (input_chr < 0xA1 \|\| input_chr == 0xFF) {
	51	emit(emitctx, ERROR);
	52	state->s0 = 0;
	53	} else
	54	state->s0 = (((state->s0 & 0xFF000000) + 0x01000000) \|
	55	((state->s0 & 0x0000FFFF) << 8) \| input_chr);
	56
	57	}
	58
	59	if (state->s0 == 0) {
	60	/*
	61	* The input character determines which of the four
	62	* possible charsets we're going to be in.
	63	*/
	64	if (input_chr < 0x80) { /* this is always ASCII */
65	emit(emitctx, input_chr);
66	} else if (input_chr == 0x8E) {/* SS2 means charset 2 */
67	state->s0 = 0x20000000;
68	} else if (input_chr == 0x8F) {/* SS3 means charset 3 */
69	state->s0 = 0x30000000;
70	} else if (input_chr < 0xA1 \|\| input_chr == 0xFF) { /* errors */
71	emit(emitctx, ERROR);
72	} else { /* A1-FE means charset 1 */
73	state->s0 = 0x11000000 \| input_chr;
74	}
75	}
76
77	/*
78	* Finally, if we have accumulated a complete character, output
79	* it.
80	*/
81	if (state->s0 != 0 &&
3cca0edf	82	((state->s0 & 0x0F000000) >> 24) >=
3cca0edf	83	(unsigned)euc->nchars[(state->s0 >> 28)-1]) {
c6d25d8d	84	emit(emitctx, euc->to_ucs(state->s0));
	85	state->s0 = 0;
	86	}
	87	}
	88
	89	/*
	90	* All EUCs are stateless multi-byte encodings (in the sense that
	91	* just after any character has been completed, the state is always
	92	* the same); hence when writing them, there is no need to use the
	93	* charset_state.
	94	*/
	95
	96	static int write_euc(charset_spec const *charset, long int input_chr,
	97	charset_state *state,
	98	void (emit)(void ctx, long int output), void *emitctx)
	99	{
	100	struct euc const euc = (struct euc )charset->data;
	101	unsigned long c;
	102	int cset, len;
	103
	104	UNUSEDARG(state);
	105
	106	if (input_chr == -1)
	107	return TRUE; /* stateless; no cleanup required */
	108
	109	/* ASCII is the easy bit, and is always the same. */
	110	if (input_chr < 0x80) {
	111	emit(emitctx, input_chr);
	112	return TRUE;
	113	}
	114
	115	c = euc->from_ucs(input_chr);
	116	if (!c) {
	117	return FALSE;
	118	}
	119
	120	cset = c >> 28;
	121	len = euc->nchars[cset-1];
	122	c &= 0xFFFFFF;
	123
	124	if (cset > 1)
	125	emit(emitctx, 0x8C + cset); /* SS2/SS3 */
	126
	127	while (len--)
	128	emit(emitctx, (c >> (8*len)) & 0xFF);
	129	return TRUE;
	130	}
	131
	132	/*
	133	* EUC-CN encodes GB2312 only.
	134	*/
	135	static long int euc_cn_to_ucs(unsigned long state)
	136	{
	137	switch (state >> 28) {
	138	case 1: return gb2312_to_unicode(((state >> 8) & 0xFF) - 0xA1,
	139	((state ) & 0xFF) - 0xA1);
	140	default: return ERROR;
	141	}
	142	}
	143	static unsigned long euc_cn_from_ucs(long int ucs)
	144	{
	145	int r, c;
	146	if (unicode_to_gb2312(ucs, &r, &c))
	147	return 0x10000000 \| ((r+0xA1) << 8) \| (c+0xA1);
148	else
149	return 0;
150	}
151	static const struct euc euc_cn = {
152	{2,0,0}, euc_cn_to_ucs, euc_cn_from_ucs
153	};
154	const charset_spec charset_CS_EUC_CN = {
155	CS_EUC_CN, read_euc, write_euc, &euc_cn
156	};
157
158	/*
159	* EUC-KR encodes KS X 1001 only.
160	*/
161	static long int euc_kr_to_ucs(unsigned long state)
162	{
163	switch (state >> 28) {
164	case 1: return ksx1001_to_unicode(((state >> 8) & 0xFF) - 0xA1,
165	((state ) & 0xFF) - 0xA1);
166	default: return ERROR;
167	}
168	}
169	static unsigned long euc_kr_from_ucs(long int ucs)
170	{
171	int r, c;
172	if (unicode_to_ksx1001(ucs, &r, &c))
173	return 0x10000000 \| ((r+0xA1) << 8) \| (c+0xA1);
174	else
175	return 0;
176	}
177	static const struct euc euc_kr = {
178	{2,0,0}, euc_kr_to_ucs, euc_kr_from_ucs
179	};
180	const charset_spec charset_CS_EUC_KR = {
181	CS_EUC_KR, read_euc, write_euc, &euc_kr
182	};
183
184	/*
185	* EUC-JP encodes several character sets.
186	*/
187	static long int euc_jp_to_ucs(unsigned long state)
188	{
189	switch (state >> 28) {
190	case 1: return jisx0208_to_unicode(((state >> 8) & 0xFF) - 0xA1,
191	((state ) & 0xFF) - 0xA1);
192	case 2:
193	/*
194	* This is the top half of JIS X 0201. That means A1-DF map
195	* to FF61-FF9F, and nothing else is valid.
196	*/
197	{
198	int c = state & 0xFF;
199	if (c >= 0xA1 && c <= 0xDF)
200	return c + (0xFF61 - 0xA1);
201	else
202	return ERROR;
203	}
204	/* (no break needed since all control paths have returned) */
205	case 3: return jisx0212_to_unicode(((state >> 8) & 0xFF) - 0xA1,
206	((state ) & 0xFF) - 0xA1);
207	default: return ERROR; /* placate optimisers */
208	}
209	}
210	static unsigned long euc_jp_from_ucs(long int ucs)
211	{
212	int r, c;
213	if (ucs >= 0xFF61 && ucs <= 0xFF9F)
214	return 0x20000000 \| (ucs - (0xFF61 - 0xA1));
215	else if (unicode_to_jisx0208(ucs, &r, &c))
216	return 0x10000000 \| ((r+0xA1) << 8) \| (c+0xA1);
217	else if (unicode_to_jisx0212(ucs, &r, &c))
218	return 0x30000000 \| ((r+0xA1) << 8) \| (c+0xA1);
219	else
220	return 0;
221	}
222	static const struct euc euc_jp = {
223	{2,1,2}, euc_jp_to_ucs, euc_jp_from_ucs
224	};
225	const charset_spec charset_CS_EUC_JP = {
226	CS_EUC_JP, read_euc, write_euc, &euc_jp
227	};
228
229	#else /* ENUM_CHARSETS */
230
231	ENUM_CHARSET(CS_EUC_CN)
232	ENUM_CHARSET(CS_EUC_KR)
233	ENUM_CHARSET(CS_EUC_JP)
234
235	#endif /* ENUM_CHARSETS */