Silly of me to overlook it: another obvious way you might like to
[sgt/charset] / utf16.c
CommitLineData
/*
 * utf16.c - routines to handle UTF-16 (RFC 2781).
 */
4
5#ifndef ENUM_CHARSETS
6
7#include "charset.h"
8#include "internal.h"
9
/*
 * Per-charset configuration for the UTF-16 variants in this file.
 *
 * s0 is the value copied into state->s0 when decoding starts: bit 17
 * (0x20000) is set if big-endian is permitted, bit 16 (0x10000) if
 * little-endian is permitted. It is unsigned long rather than int
 * because these masks do not fit in the 16 bits that int is
 * guaranteed to have.
 */
struct utf16 {
    unsigned long s0;                  /* initial value of state->s0 */
};
13
14static void read_utf16(charset_spec const *charset, long int input_chr,
15 charset_state *state,
16 void (*emit)(void *ctx, long int output),
17 void *emitctx)
18{
19 struct utf16 const *utf = (struct utf16 *)charset->data;
20 long int hw;
21
22 /*
23 * State variable s1 handles the combining of bytes into
24 * transport-endianness halfwords. It contains:
25 *
26 * - 0 if we're between halfwords
27 * - 0x100 plus the first byte if we're in mid-halfword
28 *
29 * State variable s0 handles everything from there upwards. It
30 * contains:
31 *
32 * - Bottom 16 bits are set to a surrogate value if we've just
33 * seen one.
34 * - Next two bits (17:16) indicate possible endiannesses. Bit
35 * 17 is set if we might be BE; bit 16 if we might be LE. If
36 * they're both zero, it has to be because this is right at
37 * the start, so the first thing we do is set them to the
38 * correct initial state.
39 * - The bit after that (18) is 1 iff we have already seen at
40 * least one halfword (meaning we should pass any further
41 * BOMs straight through).
42 */
43
44 /* Set up s0 if this is the start. */
45 if (state->s0 == 0)
46 state->s0 = utf->s0;
47
48 /* Accumulate a transport-endianness halfword. */
49 if (state->s1 == 0) {
50 state->s1 = 0x100 | input_chr;
51 return;
52 }
53 hw = ((state->s1 & 0xFF) << 8) + input_chr;
54 state->s1 = 0;
55
56 /* Process BOM and determine byte order. */
57 if (!(state->s0 & 0x40000)) {
58 state->s0 |= 0x40000;
59 if (hw == 0xFEFF && (state->s0 & 0x20000)) {
60 /*
61 * Text starts with a big-endian BOM, and big-
62 * endianness is a possibility. So clear the
63 * little-endian bit (the BOM confirms our endianness),
64 * and return without emitting the BOM in Unicode.
65 */
66 state->s0 &= ~0x10000;
67 return;
68 } else if (hw == 0xFFFE && (state->s0 & 0x10000)) {
69 /*
70 * Text starts with a little-endian BOM, and little-
71 * endianness is a possibility. So clear the big-endian
72 * bit (the BOM confirms our endianness), and return
73 * without emitting the BOM in Unicode.
74 */
75 state->s0 &= ~0x20000;
76 return;
77 } else {
78 /*
79 * Text does not begin with a BOM. RFC 2781 states that
80 * in this case we must assume big-endianness if we
81 * haven't been told otherwise by the content type.
82 */
83 if ((state->s0 & 0x30000) == 0x30000)
84 state->s0 &= ~0x10000; /* clear LE bit */
85 }
86 }
87
88 /*
89 * Byte-swap transport-endianness halfword if necessary. We may
90 * now test individual endianness bits, since we can be sure
91 * exactly one is set.
92 */
93 if (state->s0 & 0x10000)
94 hw = ((hw >> 8) | (hw << 8)) & 0xFFFF;
95
96 /*
97 * Now that the endianness issue has been dealt with, what
98 * reaches this point should be a stream of halfwords in
99 * sensible numeric form. So now we process surrogates.
100 */
101 if (state->s0 & 0xFFFF) {
102 /*
103 * We have already seen a high surrogate, so we expect a
104 * low surrogate. Whinge if we didn't get it.
105 */
106 if (hw < 0xDC00 || hw >= 0xE000) {
107 emit(emitctx, ERROR);
108 } else {
109 hw &= 0x3FF;
110 hw |= (state->s0 & 0x3FF) << 10;
111 emit(emitctx, hw + 0x10000);
112 }
113 state->s0 &= 0xFFFF0000;
114 } else {
115 /*
116 * Any low surrogate is an error.
117 */
118 if (hw >= 0xDC00 && hw < 0xE000) {
119 emit(emitctx, ERROR);
120 return;
121 }
122
123 /*
124 * Any high surrogate is simply stored until we see the
125 * next halfword.
126 */
127 if (hw >= 0xD800 && hw < 0xDC00) {
128 state->s0 |= hw;
129 return;
130 }
131
132 /*
133 * Anything else we simply output.
134 */
135 emit(emitctx, hw);
136 }
137}
138
/*
 * Repeated code in write_utf16 abstracted out for sanity.
 *
 * Emits the 16-bit value hw as two bytes through emit(emitctx, ...).
 * The byte order is chosen from the endianness bits of s0: if the
 * big-endian bit (0x20000) is set the high byte goes first, otherwise
 * the low byte goes first.
 */
static void emithl(void (*emit)(void *ctx, long int output), void *emitctx,
                   unsigned long s0, long int hw)
{
    int h = (hw >> 8) & 0xFF;
    int l = hw & 0xFF;

    if (s0 & 0x20000) {
        /* Big-endian takes priority over little, if both are allowed. */
        emit(emitctx, h);
        emit(emitctx, l);
    } else {
        emit(emitctx, l);
        emit(emitctx, h);
    }
}
156
157static int write_utf16(charset_spec const *charset, long int input_chr,
158 charset_state *state,
159 void (*emit)(void *ctx, long int output),
160 void *emitctx)
161{
162 struct utf16 const *utf = (struct utf16 *)charset->data;
163
164 /*
165 * state->s0 == 0 means we have not output anything yet (and so
166 * must output a BOM before we do anything else). state->s0 ==
167 * 1 means we are off and running.
168 */
169
170 if (input_chr < 0)
171 return TRUE; /* no cleanup required */
172
173 if ((input_chr >= 0xD800 && input_chr < 0xE000) ||
174 input_chr >= 0x110000) {
175 /*
176 * We can't output surrogates, or anything above 0x10FFFF.
177 */
178 return FALSE;
179 }
180
181 if (!state->s0) {
182 state->s0 = 1;
183 emithl(emit, emitctx, utf->s0, 0xFEFF);
184 }
185
186 if (input_chr < 0x10000) {
187 emithl(emit, emitctx, utf->s0, input_chr);
188 } else {
189 input_chr -= 0x10000;
190 /* now input_chr is between 0 and 0xFFFFF inclusive */
191 emithl(emit, emitctx, utf->s0, 0xD800 | ((input_chr >> 10) & 0x3FF));
192 emithl(emit, emitctx, utf->s0, 0xDC00 | (input_chr & 0x3FF));
193 }
194 return TRUE;
195}
196
9aa9036c 197static const struct utf16 utf16_bigendian = { 0x20000 };
c6d25d8d 198static const struct utf16 utf16_littleendian = { 0x10000 };
199static const struct utf16 utf16_variable_endianness = { 0x30000 };
200
201const charset_spec charset_CS_UTF16BE = {
202 CS_UTF16BE, read_utf16, write_utf16, &utf16_bigendian
203};
204const charset_spec charset_CS_UTF16LE = {
205 CS_UTF16LE, read_utf16, write_utf16, &utf16_littleendian
206};
207const charset_spec charset_CS_UTF16 = {
208 CS_UTF16, read_utf16, write_utf16, &utf16_variable_endianness
209};
210
211#else /* ENUM_CHARSETS */
212
213ENUM_CHARSET(CS_UTF16)
214ENUM_CHARSET(CS_UTF16BE)
215ENUM_CHARSET(CS_UTF16LE)
216
217#endif /* ENUM_CHARSETS */