mdw@git.distorted.org.uk Git - sgt/charset/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* hz.c - HZ textual encoding of ASCII and GB2312, as defined in RFC 1843.
	3	*/
	4
	5	#ifndef ENUM_CHARSETS
	6
	7	#include <assert.h>
	8
	9	#include "charset.h"
	10	#include "internal.h"
	11
	12	static void read_hz(charset_spec const *charset, long int input_chr,
	13	charset_state *state,
	14	void (emit)(void ctx, long int output), void *emitctx)
	15	{
	16	/*
	17	* When reading, our state variables are:
	18	*
	19	* - s0 is 0 in ASCII mode, 1 in GB2312 mode.
	20	*
	21	* - s1 stores a character we have just seen but not fully
	22	* processed. So in ASCII mode, this can only ever be zero
	23	* (no character) or 0x7E (~); in GB2312 mode it can be
	24	* anything from 0x21-0x7E.
	25	*/
	26
	27	UNUSEDARG(charset);
	28
	29	if (state->s0 == 0) {
	30	/*
	31	* ASCII mode.
	32	*/
	33
	34	if (state->s1) {
	35	assert(state->s1 == '~');
	36	state->s1 = 0;
	37	/* Process the character after a tilde. */
	38	switch (input_chr) {
	39	case '~':
	40	emit(emitctx, input_chr);
	41	return;
	42	case '\n':
	43	return; /* ~\n is ignored */
	44	case '{':
	45	state->s0 = 1; /* switch to GB2312 mode */
	46	return;
	47	}
	48	} else if (input_chr == '~') {
	49	state->s1 = '~';
	50	return;
	51	} else {
	52	/* In ASCII mode, any non-tildes go straight */
	53	emit(emitctx, input_chr);
	54	return;
	55	}
	56	} else {
	57	/*
	58	* GB2312 mode. As I understand it, we expect never to see
	59	* anything in this mode that isn't 0x21-0x7E. So if we do,
	60	* we'll simply throw an error and return to ASCII mode.
	61	*/
	62	if (input_chr < 0x21 \|\| input_chr > 0x7E) {
	63	emit(emitctx, ERROR);
	64	state->s0 = state->s1 = 0;
	65	return;
	66	}
	67
	68	/*
	69	* So if we don't have a character stored already, store
	70	* this one...
	71	*/
	72	if (!state->s1) {
	73	state->s1 = input_chr;
	74	return;
	75	}
	76
	77	/*
	78	* ... otherwise, combine the stored char with this one.
	79	* This will give either `~}', the escape sequence to
	80	* return to ASCII mode, or something which we translate
	81	* through GB2312.
	82	*/
	83	if (state->s1 == '~' && input_chr == '}') {
	84	state->s1 = state->s0 = 0;
	85	return;
	86	}
	87
	88	emit(emitctx, gb2312_to_unicode(state->s1 - 0x21, input_chr - 0x21));
	89	state->s1 = 0;
	90	}
	91	}
	92
	93	static int write_hz(charset_spec const *charset, long int input_chr,
	94	charset_state *state,
	95	void (emit)(void ctx, long int output), void *emitctx)
	96	{
	97	int desired_state, r, c;
	98
	99	UNUSEDARG(charset);
	100
	101	/*
	102	* Analyse the input char.
	103	*/
	104	if (input_chr < 0x80) {
	105	desired_state = 0;
	106	c = input_chr;
	107	} else if (unicode_to_gb2312(input_chr, &r, &c)) {
	108	desired_state = 1;
	109	} else {
	110	return FALSE;
	111	}
	112
	113	if (state->s0 != (unsigned)desired_state) {
	114	emit(emitctx, '~');
	115	emit(emitctx, desired_state ? '{' : '}');
	116	state->s0 = desired_state;
	117	}
	118
	119	if (input_chr < 0)
	120	return TRUE; /* special case: just reset state */
	121
	122	if (state->s0) {
	123	/*
	124	* GB mode.
	125	*/
	126	emit(emitctx, 0x21 + r);
	127	emit(emitctx, 0x21 + c);
	128	} else {
	129	emit(emitctx, c);
	130	}
	131	return TRUE;
	132	}
	133
	134	const charset_spec charset_CS_HZ = {
	135	CS_HZ, read_hz, write_hz, NULL
	136	};
	137
	138	#else /* ENUM_CHARSETS */
	139
	140	ENUM_CHARSET(CS_HZ)
	141
	142	#endif /* ENUM_CHARSETS */