c6d25d8d |
1 | /* |
2 | * utf16.c - routines to handle UTF-16 (RFC 2781). |
3 | */ |
4 | |
5 | #ifndef ENUM_CHARSETS |
6 | |
7 | #include "charset.h" |
8 | #include "internal.h" |
9 | |
/*
 * Per-charset configuration for the UTF-16 codecs in this file.
 */
struct utf16 {
    /*
     * Initial value of state->s0: a mask of the byte orders the
     * charset permits (0x20000 = may be big-endian, 0x10000 = may be
     * little-endian).
     *
     * This is unsigned long rather than int because the mask uses
     * bits 16 and 17, and int is only guaranteed to be 16 bits wide.
     * It also matches the unsigned long s0 parameter of emithl().
     */
    unsigned long s0;
};
13 | |
14 | static void read_utf16(charset_spec const *charset, long int input_chr, |
15 | charset_state *state, |
16 | void (*emit)(void *ctx, long int output), |
17 | void *emitctx) |
18 | { |
19 | struct utf16 const *utf = (struct utf16 *)charset->data; |
20 | long int hw; |
21 | |
22 | /* |
23 | * State variable s1 handles the combining of bytes into |
24 | * transport-endianness halfwords. It contains: |
25 | * |
26 | * - 0 if we're between halfwords |
27 | * - 0x100 plus the first byte if we're in mid-halfword |
28 | * |
29 | * State variable s0 handles everything from there upwards. It |
30 | * contains: |
31 | * |
32 | * - Bottom 16 bits are set to a surrogate value if we've just |
33 | * seen one. |
34 | * - Next two bits (17:16) indicate possible endiannesses. Bit |
35 | * 17 is set if we might be BE; bit 16 if we might be LE. If |
36 | * they're both zero, it has to be because this is right at |
37 | * the start, so the first thing we do is set them to the |
38 | * correct initial state. |
39 | * - The bit after that (18) is 1 iff we have already seen at |
40 | * least one halfword (meaning we should pass any further |
41 | * BOMs straight through). |
42 | */ |
43 | |
44 | /* Set up s0 if this is the start. */ |
45 | if (state->s0 == 0) |
46 | state->s0 = utf->s0; |
47 | |
48 | /* Accumulate a transport-endianness halfword. */ |
49 | if (state->s1 == 0) { |
50 | state->s1 = 0x100 | input_chr; |
51 | return; |
52 | } |
53 | hw = ((state->s1 & 0xFF) << 8) + input_chr; |
54 | state->s1 = 0; |
55 | |
56 | /* Process BOM and determine byte order. */ |
57 | if (!(state->s0 & 0x40000)) { |
58 | state->s0 |= 0x40000; |
59 | if (hw == 0xFEFF && (state->s0 & 0x20000)) { |
60 | /* |
61 | * Text starts with a big-endian BOM, and big- |
62 | * endianness is a possibility. So clear the |
63 | * little-endian bit (the BOM confirms our endianness), |
64 | * and return without emitting the BOM in Unicode. |
65 | */ |
66 | state->s0 &= ~0x10000; |
67 | return; |
68 | } else if (hw == 0xFFFE && (state->s0 & 0x10000)) { |
69 | /* |
70 | * Text starts with a little-endian BOM, and little- |
71 | * endianness is a possibility. So clear the big-endian |
72 | * bit (the BOM confirms our endianness), and return |
73 | * without emitting the BOM in Unicode. |
74 | */ |
75 | state->s0 &= ~0x20000; |
76 | return; |
77 | } else { |
78 | /* |
79 | * Text does not begin with a BOM. RFC 2781 states that |
80 | * in this case we must assume big-endianness if we |
81 | * haven't been told otherwise by the content type. |
82 | */ |
83 | if ((state->s0 & 0x30000) == 0x30000) |
84 | state->s0 &= ~0x10000; /* clear LE bit */ |
85 | } |
86 | } |
87 | |
88 | /* |
89 | * Byte-swap transport-endianness halfword if necessary. We may |
90 | * now test individual endianness bits, since we can be sure |
91 | * exactly one is set. |
92 | */ |
93 | if (state->s0 & 0x10000) |
94 | hw = ((hw >> 8) | (hw << 8)) & 0xFFFF; |
95 | |
96 | /* |
97 | * Now that the endianness issue has been dealt with, what |
98 | * reaches this point should be a stream of halfwords in |
99 | * sensible numeric form. So now we process surrogates. |
100 | */ |
101 | if (state->s0 & 0xFFFF) { |
102 | /* |
103 | * We have already seen a high surrogate, so we expect a |
104 | * low surrogate. Whinge if we didn't get it. |
105 | */ |
106 | if (hw < 0xDC00 || hw >= 0xE000) { |
107 | emit(emitctx, ERROR); |
108 | } else { |
109 | hw &= 0x3FF; |
110 | hw |= (state->s0 & 0x3FF) << 10; |
111 | emit(emitctx, hw + 0x10000); |
112 | } |
113 | state->s0 &= 0xFFFF0000; |
114 | } else { |
115 | /* |
116 | * Any low surrogate is an error. |
117 | */ |
118 | if (hw >= 0xDC00 && hw < 0xE000) { |
119 | emit(emitctx, ERROR); |
120 | return; |
121 | } |
122 | |
123 | /* |
124 | * Any high surrogate is simply stored until we see the |
125 | * next halfword. |
126 | */ |
127 | if (hw >= 0xD800 && hw < 0xDC00) { |
128 | state->s0 |= hw; |
129 | return; |
130 | } |
131 | |
132 | /* |
133 | * Anything else we simply output. |
134 | */ |
135 | emit(emitctx, hw); |
136 | } |
137 | } |
138 | |
/*
 * Emit one 16-bit halfword as two bytes, in the byte order selected
 * by s0. Factored out of write_utf16, which does this repeatedly.
 *
 * If bit 17 (0x20000) of s0 is set the high byte goes first; this
 * means big-endian wins when both orders are permitted. Otherwise
 * the low byte goes first.
 */
static void emithl(void (*emit)(void *ctx, long int output), void *emitctx,
                   unsigned long s0, long int hw)
{
    int const high_byte = (hw >> 8) & 0xFF;
    int const low_byte = hw & 0xFF;
    int const big_endian = (s0 & 0x20000) != 0;

    emit(emitctx, big_endian ? high_byte : low_byte);
    emit(emitctx, big_endian ? low_byte : high_byte);
}
156 | |
157 | static int write_utf16(charset_spec const *charset, long int input_chr, |
158 | charset_state *state, |
159 | void (*emit)(void *ctx, long int output), |
160 | void *emitctx) |
161 | { |
162 | struct utf16 const *utf = (struct utf16 *)charset->data; |
163 | |
164 | /* |
165 | * state->s0 == 0 means we have not output anything yet (and so |
166 | * must output a BOM before we do anything else). state->s0 == |
167 | * 1 means we are off and running. |
168 | */ |
169 | |
170 | if (input_chr < 0) |
171 | return TRUE; /* no cleanup required */ |
172 | |
173 | if ((input_chr >= 0xD800 && input_chr < 0xE000) || |
174 | input_chr >= 0x110000) { |
175 | /* |
176 | * We can't output surrogates, or anything above 0x10FFFF. |
177 | */ |
178 | return FALSE; |
179 | } |
180 | |
181 | if (!state->s0) { |
182 | state->s0 = 1; |
183 | emithl(emit, emitctx, utf->s0, 0xFEFF); |
184 | } |
185 | |
186 | if (input_chr < 0x10000) { |
187 | emithl(emit, emitctx, utf->s0, input_chr); |
188 | } else { |
189 | input_chr -= 0x10000; |
190 | /* now input_chr is between 0 and 0xFFFFF inclusive */ |
191 | emithl(emit, emitctx, utf->s0, 0xD800 | ((input_chr >> 10) & 0x3FF)); |
192 | emithl(emit, emitctx, utf->s0, 0xDC00 | (input_chr & 0x3FF)); |
193 | } |
194 | return TRUE; |
195 | } |
196 | |
b97e5427 |
197 | static struct utf16 const utf16_bigendian = { 0x20000 }; |
c6d25d8d |
198 | static const struct utf16 utf16_littleendian = { 0x10000 }; |
199 | static const struct utf16 utf16_variable_endianness = { 0x30000 }; |
200 | |
201 | const charset_spec charset_CS_UTF16BE = { |
202 | CS_UTF16BE, read_utf16, write_utf16, &utf16_bigendian |
203 | }; |
204 | const charset_spec charset_CS_UTF16LE = { |
205 | CS_UTF16LE, read_utf16, write_utf16, &utf16_littleendian |
206 | }; |
207 | const charset_spec charset_CS_UTF16 = { |
208 | CS_UTF16, read_utf16, write_utf16, &utf16_variable_endianness |
209 | }; |
210 | |
211 | #else /* ENUM_CHARSETS */ |
212 | |
/*
 * This branch is compiled instead of the code above when the
 * including file defines ENUM_CHARSETS. It lists the charsets this
 * file provides, one ENUM_CHARSET() invocation each.
 * NOTE(review): ENUM_CHARSET is not defined in this file; it is
 * presumably supplied by whoever includes it, and the order of the
 * entries may be significant - confirm before reordering.
 */
213 | ENUM_CHARSET(CS_UTF16) |
214 | ENUM_CHARSET(CS_UTF16BE) |
215 | ENUM_CHARSET(CS_UTF16LE) |
216 | |
217 | #endif /* ENUM_CHARSETS */ |