2 * utf16.c - routines to handle UTF-16 (RFC 2781).
11 int s0
; /* initial value of state->s0 */
14 static void read_utf16(charset_spec
const *charset
, long int input_chr
,
16 void (*emit
)(void *ctx
, long int output
),
19 struct utf16
const *utf
= (struct utf16
*)charset
->data
;
23 * State variable s1 handles the combining of bytes into
24 * transport-endianness halfwords. It contains:
26 * - 0 if we're between halfwords
27 * - 0x100 plus the first byte if we're in mid-halfword
29 * State variable s0 handles everything from there upwards. It
32 * - Bottom 16 bits are set to a surrogate value if we've just
34 * - Next two bits (17:16) indicate possible endiannesses. Bit
35 * 17 is set if we might be BE; bit 16 if we might be LE. If
36 * they're both zero, it has to be because this is right at
37 * the start, so the first thing we do is set them to the
38 * correct initial state.
39 * - The bit after that (18) is 1 iff we have already seen at
40 * least one halfword (meaning we should pass any further
41 * BOMs straight through).
44 /* Set up s0 if this is the start. */
48 /* Accumulate a transport-endianness halfword. */
50 state
->s1
= 0x100 | input_chr
;
53 hw
= ((state
->s1
& 0xFF) << 8) + input_chr
;
56 /* Process BOM and determine byte order. */
57 if (!(state
->s0
& 0x40000)) {
59 if (hw
== 0xFEFF && (state
->s0
& 0x20000)) {
61 * Text starts with a big-endian BOM, and big-
62 * endianness is a possibility. So clear the
63 * little-endian bit (the BOM confirms our endianness),
64 * and return without emitting the BOM in Unicode.
66 state
->s0
&= ~0x10000;
68 } else if (hw
== 0xFFFE && (state
->s0
& 0x10000)) {
70 * Text starts with a little-endian BOM, and little-
71 * endianness is a possibility. So clear the big-endian
72 * bit (the BOM confirms our endianness), and return
73 * without emitting the BOM in Unicode.
75 state
->s0
&= ~0x20000;
79 * Text does not begin with a BOM. RFC 2781 states that
80 * in this case we must assume big-endianness if we
81 * haven't been told otherwise by the content type.
83 if ((state
->s0
& 0x30000) == 0x30000)
84 state
->s0
&= ~0x10000; /* clear LE bit */
89 * Byte-swap transport-endianness halfword if necessary. We may
90 * now test individual endianness bits, since we can be sure
93 if (state
->s0
& 0x10000)
94 hw
= ((hw
>> 8) | (hw
<< 8)) & 0xFFFF;
97 * Now that the endianness issue has been dealt with, what
98 * reaches this point should be a stream of halfwords in
99 * sensible numeric form. So now we process surrogates.
101 if (state
->s0
& 0xFFFF) {
103 * We have already seen a high surrogate, so we expect a
104 * low surrogate. Whinge if we didn't get it.
106 if (hw
< 0xDC00 || hw
>= 0xE000) {
107 emit(emitctx
, ERROR
);
110 hw
|= (state
->s0
& 0x3FF) << 10;
111 emit(emitctx
, hw
+ 0x10000);
113 state
->s0
&= 0xFFFF0000;
116 * Any low surrogate is an error.
118 if (hw
>= 0xDC00 && hw
< 0xE000) {
119 emit(emitctx
, ERROR
);
124 * Any high surrogate is simply stored until we see the
127 if (hw
>= 0xD800 && hw
< 0xDC00) {
133 * Anything else we simply output.
140 * Repeated code in write_utf16 abstracted out for sanity.
142 static void emithl(void (*emit
)(void *ctx
, long int output
), void *emitctx
,
143 unsigned long s0
, long int hw
)
145 int h
= (hw
>> 8) & 0xFF, l
= hw
& 0xFF;
148 /* Big-endian takes priority over little, if both are allowed. */
157 static int write_utf16(charset_spec
const *charset
, long int input_chr
,
158 charset_state
*state
,
159 void (*emit
)(void *ctx
, long int output
),
162 struct utf16
const *utf
= (struct utf16
*)charset
->data
;
165 * state->s0 == 0 means we have not output anything yet (and so
166 * must output a BOM before we do anything else). state->s0 ==
167 * 1 means we are off and running.
171 return TRUE
; /* no cleanup required */
173 if ((input_chr
>= 0xD800 && input_chr
< 0xE000) ||
174 input_chr
>= 0x110000) {
176 * We can't output surrogates, or anything above 0x10FFFF.
183 emithl(emit
, emitctx
, utf
->s0
, 0xFEFF);
186 if (input_chr
< 0x10000) {
187 emithl(emit
, emitctx
, utf
->s0
, input_chr
);
189 input_chr
-= 0x10000;
190 /* now input_chr is between 0 and 0xFFFFF inclusive */
191 emithl(emit
, emitctx
, utf
->s0
, 0xD800 | ((input_chr
>> 10) & 0x3FF));
192 emithl(emit
, emitctx
, utf
->s0
, 0xDC00 | (input_chr
& 0x3FF));
197 static const struct utf16 utf16_bigendian
= { 0x20000 };
198 static const struct utf16 utf16_littleendian
= { 0x10000 };
199 static const struct utf16 utf16_variable_endianness
= { 0x30000 };
201 const charset_spec charset_CS_UTF16BE
= {
202 CS_UTF16BE
, read_utf16
, write_utf16
, &utf16_bigendian
204 const charset_spec charset_CS_UTF16LE
= {
205 CS_UTF16LE
, read_utf16
, write_utf16
, &utf16_littleendian
207 const charset_spec charset_CS_UTF16
= {
208 CS_UTF16
, read_utf16
, write_utf16
, &utf16_variable_endianness
211 #else /* ENUM_CHARSETS */
213 ENUM_CHARSET(CS_UTF16
)
214 ENUM_CHARSET(CS_UTF16BE
)
215 ENUM_CHARSET(CS_UTF16LE
)
217 #endif /* ENUM_CHARSETS */