[sgt/charset] / utf16.c

/*
 * utf16.c - routines to handle UTF-16 (RFC 2781).
 */

#ifndef ENUM_CHARSETS

#include "charset.h"
#include "internal.h"

/*
 * Per-charset constant data, reached through charset->data.
 *
 * s0 is the value state->s0 is initialised to at the start of a
 * stream: bit 17 set means big-endian is permitted, bit 16 set means
 * little-endian is permitted. It is unsigned long rather than int
 * because these values need 18 bits, which a plain int is not
 * guaranteed to provide, and because it is handed to emithl() as an
 * unsigned long.
 */
struct utf16 {
    unsigned long s0;		       /* initial value of state->s0 */
};

/*
 * read_utf16 - feed one byte of UTF-16 input to the decoder.
 *
 * Bytes are collected in pairs into 16-bit halfwords. The first
 * halfword of a stream is checked for a byte order mark; after that
 * each halfword is byte-swapped if the stream is little-endian and
 * then run through surrogate-pair decoding.
 *
 *  - charset: its data member points at a struct utf16 giving the
 *    byte orders that are permitted for this variant.
 *  - input_chr: the next input byte. It is expected to be in the
 *    range 0-255; that is not checked here.
 *  - state: decoder state (s0 and s1, described below). s0 == 0 is
 *    treated as "start of stream".
 *  - emit, emitctx: emit(emitctx, x) is called with each decoded
 *    code point, or with ERROR for each malformed piece of input.
 *
 * A call may emit zero, one or two values.
 */
static void read_utf16(charset_spec const *charset, long int input_chr,
		       charset_state *state,
		       void (*emit)(void *ctx, long int output),
		       void *emitctx)
{
    struct utf16 const *utf = (struct utf16 const *)charset->data;
    long int hw;

    /*
     * State variable s1 handles the combining of bytes into
     * transport-endianness halfwords. It contains:
     *
     *  - 0 if we're between halfwords
     *  - 0x100 plus the first byte if we're in mid-halfword
     *
     * State variable s0 handles everything from there upwards. It
     * contains:
     *
     *  - Bottom 16 bits are set to a high surrogate value if we've
     *    just seen one and are waiting for its partner.
     *  - Next two bits (17:16) indicate possible endiannesses. Bit
     *    17 is set if we might be BE; bit 16 if we might be LE. If
     *    they're both zero, it has to be because this is right at
     *    the start, so the first thing we do is set them to the
     *    correct initial state.
     *  - The bit after that (18) is 1 iff we have already seen at
     *    least one halfword (meaning we should pass any further
     *    BOMs straight through).
     */

    /* Set up s0 if this is the start. */
    if (state->s0 == 0)
	state->s0 = utf->s0;

    /*
     * Accumulate a transport-endianness halfword. The 0x100 marker
     * keeps s1 nonzero even when the first byte is zero.
     */
    if (state->s1 == 0) {
	state->s1 = 0x100 | input_chr;
	return;
    }
    hw = ((state->s1 & 0xFF) << 8) + input_chr;
    state->s1 = 0;

    /*
     * Process BOM and determine byte order. At this point hw has
     * been assembled first-byte-high, so a little-endian BOM shows
     * up as 0xFFFE.
     */
    if (!(state->s0 & 0x40000)) {
	state->s0 |= 0x40000;
	if (hw == 0xFEFF && (state->s0 & 0x20000)) {
	    /*
	     * Text starts with a big-endian BOM, and big-
	     * endianness is a possibility. So clear the
	     * little-endian bit (the BOM confirms our endianness),
	     * and return without emitting the BOM in Unicode.
	     */
	    state->s0 &= ~0x10000;
	    return;
	} else if (hw == 0xFFFE && (state->s0 & 0x10000)) {
	    /*
	     * Text starts with a little-endian BOM, and little-
	     * endianness is a possibility. So clear the big-endian
	     * bit (the BOM confirms our endianness), and return
	     * without emitting the BOM in Unicode.
	     */
	    state->s0 &= ~0x20000;
	    return;
	} else {
	    /*
	     * Text does not begin with a BOM. RFC 2781 states that
	     * in this case we must assume big-endianness if we
	     * haven't been told otherwise by the content type.
	     */
	    if ((state->s0 & 0x30000) == 0x30000)
		state->s0 &= ~0x10000; /* clear LE bit */
	}
    }

    /*
     * Byte-swap transport-endianness halfword if necessary. We may
     * now test individual endianness bits, since we can be sure
     * exactly one is set.
     */
    if (state->s0 & 0x10000)
	hw = ((hw >> 8) | (hw << 8)) & 0xFFFF;

    /*
     * Now that the endianness issue has been dealt with, what
     * reaches this point should be a stream of halfwords in
     * sensible numeric form. So now we process surrogates.
     */
    if (state->s0 & 0xFFFF) {
	/*
	 * We have already seen a high surrogate, so we expect a
	 * low surrogate. Either way the stored surrogate is used
	 * up by this halfword, so forget it now.
	 */
	long int high = state->s0 & 0x3FF;

	state->s0 &= 0xFFFF0000;

	if (hw >= 0xDC00 && hw < 0xE000) {
	    emit(emitctx, 0x10000 + ((high << 10) | (hw & 0x3FF)));
	    return;
	}

	/*
	 * The high surrogate was unpaired: report that, and then
	 * carry on to treat the current halfword on its own merits
	 * rather than throwing it away. It cannot be a low
	 * surrogate (tested just above), so below it is either
	 * stored as a new high surrogate or emitted as an ordinary
	 * character.
	 */
	emit(emitctx, ERROR);
    }

    /*
     * Any low surrogate arriving with no high surrogate pending is
     * an error.
     */
    if (hw >= 0xDC00 && hw < 0xE000) {
	emit(emitctx, ERROR);
	return;
    }

    /*
     * Any high surrogate is simply stored until we see the next
     * halfword.
     */
    if (hw >= 0xD800 && hw < 0xDC00) {
	state->s0 |= hw;
	return;
    }

    /*
     * Anything else we simply output.
     */
    emit(emitctx, hw);
}

/*
 * Helper for write_utf16: send one 16-bit halfword to the output as
 * two separate bytes, ordered according to the endianness bits in s0.
 *
 * If bit 17 (big-endian) of s0 is set, the high byte goes first, even
 * if the little-endian bit is set as well. Otherwise the low byte goes
 * first.
 */
static void emithl(void (*emit)(void *ctx, long int output), void *emitctx,
		   unsigned long s0, long int hw)
{
    int bytes[2];
    int first;

    bytes[0] = (hw >> 8) & 0xFF;       /* high byte */
    bytes[1] = hw & 0xFF;	       /* low byte */

    /* Index of the byte that goes out first. */
    first = (s0 & 0x20000) ? 0 : 1;

    emit(emitctx, bytes[first]);
    emit(emitctx, bytes[1 - first]);
}

/*
 * write_utf16 - encode one Unicode code point as UTF-16.
 *
 *  - charset: its data member points at a struct utf16 whose s0
 *    selects the output byte order (see emithl).
 *  - input_chr: the code point to encode. A negative value produces
 *    no output and returns TRUE; presumably this is the caller's
 *    end-of-data signal.
 *  - state: only s0 is used. 0 means nothing has been output yet;
 *    1 means the BOM has already been written.
 *  - emit, emitctx: emit(emitctx, b) is called for each output byte.
 *
 * Returns FALSE, emitting nothing, if input_chr is a surrogate or is
 * above 0x10FFFF. Returns TRUE otherwise.
 *
 * A BOM (U+FEFF) is written ahead of the first character for every
 * variant, including the fixed-endianness ones.
 * NOTE(review): check that against RFC 2781's guidance for text
 * labelled UTF-16BE / UTF-16LE before relying on it.
 */
static int write_utf16(charset_spec const *charset, long int input_chr,
		       charset_state *state,
		       void (*emit)(void *ctx, long int output),
		       void *emitctx)
{
    struct utf16 const *utf = (struct utf16 const *)charset->data;
    unsigned long order = utf->s0;

    /* Nothing is buffered between calls, so there is nothing to flush. */
    if (input_chr < 0)
	return TRUE;

    /* Beyond the range UTF-16 can represent. */
    if (input_chr >= 0x110000)
	return FALSE;

    /* Surrogate code points are not characters in their own right. */
    if (input_chr >= 0xD800 && input_chr <= 0xDFFF)
	return FALSE;

    /* First real character: lead off with the BOM. */
    if (state->s0 == 0) {
	state->s0 = 1;
	emithl(emit, emitctx, order, 0xFEFF);
    }

    if (input_chr >= 0x10000) {
	/*
	 * Supplementary plane: subtract 0x10000 to leave a 20-bit
	 * value, then split it into two 10-bit halves carried by a
	 * high and a low surrogate.
	 */
	long int offset = input_chr - 0x10000;
	long int lead = 0xD800 | ((offset >> 10) & 0x3FF);
	long int trail = 0xDC00 | (offset & 0x3FF);

	emithl(emit, emitctx, order, lead);
	emithl(emit, emitctx, order, trail);
    } else {
	/* BMP character: a single halfword. */
	emithl(emit, emitctx, order, input_chr);
    }

    return TRUE;
}

/*
 * Byte-order settings for the three UTF-16 variants. Bit 17 permits
 * big-endian and bit 16 permits little-endian; the variable variant
 * permits both.
 */
static const struct utf16 utf16_bigendian = { 0x20000 };
static const struct utf16 utf16_littleendian = { 0x10000 };
static const struct utf16 utf16_variable_endianness = { 0x30000 };

/*
 * The exported charset descriptors. All three share read_utf16 and
 * write_utf16 and differ only in the charset identifier and in the
 * struct utf16 that the final member points to.
 */
const charset_spec charset_CS_UTF16BE = {
    CS_UTF16BE,
    read_utf16,
    write_utf16,
    &utf16_bigendian
};

const charset_spec charset_CS_UTF16LE = {
    CS_UTF16LE,
    read_utf16,
    write_utf16,
    &utf16_littleendian
};

const charset_spec charset_CS_UTF16 = {
    CS_UTF16,
    read_utf16,
    write_utf16,
    &utf16_variable_endianness
};

#else /* ENUM_CHARSETS */

/*
 * When ENUM_CHARSETS is defined, all the code above is compiled out
 * and this file expands to nothing but the list below: one
 * ENUM_CHARSET() invocation per charset implemented here, using
 * whatever definition of ENUM_CHARSET the including file has set up.
 */
ENUM_CHARSET(CS_UTF16)
ENUM_CHARSET(CS_UTF16BE)
ENUM_CHARSET(CS_UTF16LE)

#endif /* ENUM_CHARSETS */
Commit	Line	Data
c6d25d8d	1	/*
	2	* utf16.c - routines to handle UTF-16 (RFC 2781).
	3	*/
	4
	5	#ifndef ENUM_CHARSETS
	6
	7	#include "charset.h"
	8	#include "internal.h"
	9
	10	struct utf16 {
	11	int s0; /* initial value of state->s0 */
	12	};
	13
	14	static void read_utf16(charset_spec const *charset, long int input_chr,
	15	charset_state *state,
	16	void (emit)(void ctx, long int output),
	17	void *emitctx)
	18	{
	19	struct utf16 const utf = (struct utf16 )charset->data;
	20	long int hw;
	21
	22	/*
	23	* State variable s1 handles the combining of bytes into
	24	* transport-endianness halfwords. It contains:
	25	*
	26	* - 0 if we're between halfwords
	27	* - 0x100 plus the first byte if we're in mid-halfword
	28	*
	29	* State variable s0 handles everything from there upwards. It
	30	* contains:
	31	*
	32	* - Bottom 16 bits are set to a surrogate value if we've just
	33	* seen one.
	34	* - Next two bits (17:16) indicate possible endiannesses. Bit
	35	* 17 is set if we might be BE; bit 16 if we might be LE. If
	36	* they're both zero, it has to be because this is right at
	37	* the start, so the first thing we do is set them to the
	38	* correct initial state.
	39	* - The bit after that (18) is 1 iff we have already seen at
	40	* least one halfword (meaning we should pass any further
	41	* BOMs straight through).
	42	*/
	43
	44	/* Set up s0 if this is the start. */
	45	if (state->s0 == 0)
	46	state->s0 = utf->s0;
	47
	48	/* Accumulate a transport-endianness halfword. */
	49	if (state->s1 == 0) {
	50	state->s1 = 0x100 \| input_chr;
	51	return;
	52	}
	53	hw = ((state->s1 & 0xFF) << 8) + input_chr;
	54	state->s1 = 0;
	55
	56	/* Process BOM and determine byte order. */
	57	if (!(state->s0 & 0x40000)) {
	58	state->s0 \|= 0x40000;
	59	if (hw == 0xFEFF && (state->s0 & 0x20000)) {
	60	/*
	61	* Text starts with a big-endian BOM, and big-
	62	* endianness is a possibility. So clear the
	63	* little-endian bit (the BOM confirms our endianness),
	64	* and return without emitting the BOM in Unicode.
65	*/
66	state->s0 &= ~0x10000;
67	return;
68	} else if (hw == 0xFFFE && (state->s0 & 0x10000)) {
69	/*
70	* Text starts with a little-endian BOM, and little-
71	* endianness is a possibility. So clear the big-endian
72	* bit (the BOM confirms our endianness), and return
73	* without emitting the BOM in Unicode.
74	*/
75	state->s0 &= ~0x20000;
76	return;
77	} else {
78	/*
79	* Text does not begin with a BOM. RFC 2781 states that
80	* in this case we must assume big-endianness if we
81	* haven't been told otherwise by the content type.
82	*/
83	if ((state->s0 & 0x30000) == 0x30000)
84	state->s0 &= ~0x10000; /* clear LE bit */
85	}
86	}
87
88	/*
89	* Byte-swap transport-endianness halfword if necessary. We may
90	* now test individual endianness bits, since we can be sure
91	* exactly one is set.
92	*/
93	if (state->s0 & 0x10000)
94	hw = ((hw >> 8) \| (hw << 8)) & 0xFFFF;
95
96	/*
97	* Now that the endianness issue has been dealt with, what
98	* reaches this point should be a stream of halfwords in
99	* sensible numeric form. So now we process surrogates.
100	*/
101	if (state->s0 & 0xFFFF) {
102	/*
103	* We have already seen a high surrogate, so we expect a
104	* low surrogate. Whinge if we didn't get it.
105	*/
106	if (hw < 0xDC00 \|\| hw >= 0xE000) {
107	emit(emitctx, ERROR);
108	} else {
109	hw &= 0x3FF;
110	hw \|= (state->s0 & 0x3FF) << 10;
111	emit(emitctx, hw + 0x10000);
112	}
113	state->s0 &= 0xFFFF0000;
114	} else {
115	/*
116	* Any low surrogate is an error.
117	*/
118	if (hw >= 0xDC00 && hw < 0xE000) {
119	emit(emitctx, ERROR);
120	return;
121	}
122
123	/*
124	* Any high surrogate is simply stored until we see the
125	* next halfword.
126	*/
127	if (hw >= 0xD800 && hw < 0xDC00) {
128	state->s0 \|= hw;
129	return;
130	}
131
132	/*
133	* Anything else we simply output.
134	*/
135	emit(emitctx, hw);
136	}
137	}
138
139	/*
140	* Repeated code in write_utf16 abstracted out for sanity.
141	*/
142	static void emithl(void (emit)(void ctx, long int output), void *emitctx,
143	unsigned long s0, long int hw)
144	{
145	int h = (hw >> 8) & 0xFF, l = hw & 0xFF;
146
147	if (s0 & 0x20000) {
148	/* Big-endian takes priority over little, if both are allowed. */
149	emit(emitctx, h);
150	emit(emitctx, l);
151	} else {
152	emit(emitctx, l);
153	emit(emitctx, h);
154	}
155	}
156
157	static int write_utf16(charset_spec const *charset, long int input_chr,
158	charset_state *state,
159	void (emit)(void ctx, long int output),
160	void *emitctx)
161	{
162	struct utf16 const utf = (struct utf16 )charset->data;
163
164	/*
165	* state->s0 == 0 means we have not output anything yet (and so
166	* must output a BOM before we do anything else). state->s0 ==
167	* 1 means we are off and running.
168	*/
169
170	if (input_chr < 0)
171	return TRUE; /* no cleanup required */
172
173	if ((input_chr >= 0xD800 && input_chr < 0xE000) \|\|
174	input_chr >= 0x110000) {
175	/*
176	* We can't output surrogates, or anything above 0x10FFFF.
177	*/
178	return FALSE;
179	}
180
181	if (!state->s0) {
182	state->s0 = 1;
183	emithl(emit, emitctx, utf->s0, 0xFEFF);
184	}
185
186	if (input_chr < 0x10000) {
187	emithl(emit, emitctx, utf->s0, input_chr);
188	} else {
189	input_chr -= 0x10000;
190	/* now input_chr is between 0 and 0xFFFFF inclusive */
191	emithl(emit, emitctx, utf->s0, 0xD800 \| ((input_chr >> 10) & 0x3FF));
192	emithl(emit, emitctx, utf->s0, 0xDC00 \| (input_chr & 0x3FF));
193	}
194	return TRUE;
195	}
196
b97e5427	197	static struct utf16 const utf16_bigendian = { 0x20000 };
c6d25d8d	198	static const struct utf16 utf16_littleendian = { 0x10000 };
	199	static const struct utf16 utf16_variable_endianness = { 0x30000 };
	200
	201	const charset_spec charset_CS_UTF16BE = {
	202	CS_UTF16BE, read_utf16, write_utf16, &utf16_bigendian
	203	};
	204	const charset_spec charset_CS_UTF16LE = {
	205	CS_UTF16LE, read_utf16, write_utf16, &utf16_littleendian
	206	};
	207	const charset_spec charset_CS_UTF16 = {
	208	CS_UTF16, read_utf16, write_utf16, &utf16_variable_endianness
	209	};
	210
	211	#else /* ENUM_CHARSETS */
	212
	213	ENUM_CHARSET(CS_UTF16)
	214	ENUM_CHARSET(CS_UTF16BE)
	215	ENUM_CHARSET(CS_UTF16LE)
	216
	217	#endif /* ENUM_CHARSETS */