mdw@git.distorted.org.uk Git - sgt/charset/blob - utf16.c

/*
 * utf16.c - routines to handle UTF-16 (RFC 2781).
 */
   4
   5 #ifndef ENUM_CHARSETS
   6
   7 #include "charset.h"
   8 #include "internal.h"
   9
  10 struct utf16 {
  11     int s0;                            /* initial value of state->s0 */
  12 };
  13
  14 static void read_utf16(charset_spec const *charset, long int input_chr,
  15                        charset_state *state,
  16                        void (*emit)(void *ctx, long int output),
  17                        void *emitctx)
  18 {
  19     struct utf16 const *utf = (struct utf16 *)charset->data;
  20     long int hw;
  21
  22     /*
  23      * State variable s1 handles the combining of bytes into
  24      * transport-endianness halfwords. It contains:
  25      *
  26      *  - 0 if we're between halfwords
  27      *  - 0x100 plus the first byte if we're in mid-halfword
  28      *
  29      * State variable s0 handles everything from there upwards. It
  30      * contains:
  31      *
  32      *  - Bottom 16 bits are set to a surrogate value if we've just
  33      *    seen one.
  34      *  - Next two bits (17:16) indicate possible endiannesses. Bit
  35      *    17 is set if we might be BE; bit 16 if we might be LE. If
  36      *    they're both zero, it has to be because this is right at
  37      *    the start, so the first thing we do is set them to the
  38      *    correct initial state.
  39      *  - The bit after that (18) is 1 iff we have already seen at
  40      *    least one halfword (meaning we should pass any further
  41      *    BOMs straight through).
  42      */
  43
  44     /* Set up s0 if this is the start. */
  45     if (state->s0 == 0)
  46         state->s0 = utf->s0;
  47
  48     /* Accumulate a transport-endianness halfword. */
  49     if (state->s1 == 0) {
  50         state->s1 = 0x100 | input_chr;
  51         return;
  52     }
  53     hw = ((state->s1 & 0xFF) << 8) + input_chr;
  54     state->s1 = 0;
  55
  56     /* Process BOM and determine byte order. */
  57     if (!(state->s0 & 0x40000)) {
  58         state->s0 |= 0x40000;
  59         if (hw == 0xFEFF && (state->s0 & 0x20000)) {
  60             /*
  61              * Text starts with a big-endian BOM, and big-
  62              * endianness is a possibility. So clear the
  63              * little-endian bit (the BOM confirms our endianness),
  64              * and return without emitting the BOM in Unicode.
  65              */
  66             state->s0 &= ~0x10000;
  67             return;
  68         } else if (hw == 0xFFFE && (state->s0 & 0x10000)) {
  69             /*
  70              * Text starts with a little-endian BOM, and little-
  71              * endianness is a possibility. So clear the big-endian
  72              * bit (the BOM confirms our endianness), and return
  73              * without emitting the BOM in Unicode.
  74              */
  75             state->s0 &= ~0x20000;
  76             return;
  77         } else {
  78             /*
  79              * Text does not begin with a BOM. RFC 2781 states that
  80              * in this case we must assume big-endianness if we
  81              * haven't been told otherwise by the content type.
  82              */
  83             if ((state->s0 & 0x30000) == 0x30000)
  84                 state->s0 &= ~0x10000; /* clear LE bit */
  85         }
  86     }
  87
  88     /*
  89      * Byte-swap transport-endianness halfword if necessary. We may
  90      * now test individual endianness bits, since we can be sure
  91      * exactly one is set.
  92      */
  93     if (state->s0 & 0x10000)
  94         hw = ((hw >> 8) | (hw << 8)) & 0xFFFF;
  95
  96     /*
  97      * Now that the endianness issue has been dealt with, what
  98      * reaches this point should be a stream of halfwords in
  99      * sensible numeric form. So now we process surrogates.
 100      */
 101     if (state->s0 & 0xFFFF) {
 102         /*
 103          * We have already seen a high surrogate, so we expect a
 104          * low surrogate. Whinge if we didn't get it.
 105          */
 106         if (hw < 0xDC00 || hw >= 0xE000) {
 107             emit(emitctx, ERROR);
 108         } else {
 109             hw &= 0x3FF;
 110             hw |= (state->s0 & 0x3FF) << 10;
 111             emit(emitctx, hw + 0x10000);
 112         }
 113         state->s0 &= 0xFFFF0000;
 114     } else {
 115         /*
 116          * Any low surrogate is an error.
 117          */
 118         if (hw >= 0xDC00 && hw < 0xE000) {
 119             emit(emitctx, ERROR);
 120             return;
 121         }
 122
 123         /*
 124          * Any high surrogate is simply stored until we see the
 125          * next halfword.
 126          */
 127         if (hw >= 0xD800 && hw < 0xDC00) {
 128             state->s0 |= hw;
 129             return;
 130         }
 131
 132         /*
 133          * Anything else we simply output.
 134          */
 135         emit(emitctx, hw);
 136     }
 137 }
 138
 139 /*
 140  * Repeated code in write_utf16 abstracted out for sanity.
 141  */
 142 static void emithl(void (*emit)(void *ctx, long int output), void *emitctx,
 143                    unsigned long s0, long int hw)
 144 {
 145     int h = (hw >> 8) & 0xFF, l = hw & 0xFF;
 146
 147     if (s0 & 0x20000) {
 148         /* Big-endian takes priority over little, if both are allowed. */
 149         emit(emitctx, h);
 150         emit(emitctx, l);
 151     } else {
 152         emit(emitctx, l);
 153         emit(emitctx, h);
 154     }
 155 }
 156
 157 static int write_utf16(charset_spec const *charset, long int input_chr,
 158                        charset_state *state,
 159                        void (*emit)(void *ctx, long int output),
 160                        void *emitctx)
 161 {
 162     struct utf16 const *utf = (struct utf16 *)charset->data;
 163
 164     /*
 165      * state->s0 == 0 means we have not output anything yet (and so
 166      * must output a BOM before we do anything else). state->s0 ==
 167      * 1 means we are off and running.
 168      */
 169
 170     if (input_chr < 0)
 171         return TRUE;                   /* no cleanup required */
 172
 173     if ((input_chr >= 0xD800 && input_chr < 0xE000) ||
 174         input_chr >= 0x110000) {
 175         /*
 176          * We can't output surrogates, or anything above 0x10FFFF.
 177          */
 178         return FALSE;
 179     }
 180
 181     if (!state->s0) {
 182         state->s0 = 1;
 183         emithl(emit, emitctx, utf->s0, 0xFEFF);
 184     }
 185
 186     if (input_chr < 0x10000) {
 187         emithl(emit, emitctx, utf->s0, input_chr);
 188     } else {
 189         input_chr -= 0x10000;
 190         /* now input_chr is between 0 and 0xFFFFF inclusive */
 191         emithl(emit, emitctx, utf->s0, 0xD800 | ((input_chr >> 10) & 0x3FF));
 192         emithl(emit, emitctx, utf->s0, 0xDC00 | (input_chr & 0x3FF));
 193     }
 194     return TRUE;
 195 }
 196
 197 static const struct utf16 utf16_bigendian = { 0x20000 };
 198 static const struct utf16 utf16_littleendian = { 0x10000 };
 199 static const struct utf16 utf16_variable_endianness = { 0x30000 };
 200
 201 const charset_spec charset_CS_UTF16BE = {
 202     CS_UTF16BE, read_utf16, write_utf16, &utf16_bigendian
 203 };
 204 const charset_spec charset_CS_UTF16LE = {
 205     CS_UTF16LE, read_utf16, write_utf16, &utf16_littleendian
 206 };
 207 const charset_spec charset_CS_UTF16 = {
 208     CS_UTF16, read_utf16, write_utf16, &utf16_variable_endianness
 209 };
 210
 211 #else /* ENUM_CHARSETS */
 212
 213 ENUM_CHARSET(CS_UTF16)
 214 ENUM_CHARSET(CS_UTF16BE)
 215 ENUM_CHARSET(CS_UTF16LE)
 216
 217 #endif /* ENUM_CHARSETS */