mdw@git.distorted.org.uk Git - sgt/charset/blob - utf7.c

   1 /*
   2  * utf7.c - routines to handle UTF-7 (RFC 1642 / RFC 2152).
   3  */
   4
   5 #ifndef ENUM_CHARSETS
   6
   7 #include "charset.h"
   8 #include "internal.h"
   9
  10 /*
  11  * This array is generated by a piece of Perl:
  12
  13 perl -e 'for $i (0..32) { $a[$i] |= 2; } $a[32] |= 1;' \
  14      -e 'for $i ("a".."z","A".."Z","0".."9","'\''","(",' \
  15      -e '        ")",",","-",".","/",":","?") { $a[ord $i] |= 1; }' \
  16      -e 'for $i ("!","\"","#","\$","%","&","*",";","<","=",">","\@",' \
  17      -e '        "[","]","^","_","`","{","|","}") { $a[ord $i] |= 2; }' \
  18      -e 'for $i ("a".."z","A".."Z","0".."9","+","/") { $a[ord $i] |= 4; }' \
  19      -e 'for $i (0..127) { printf "%s%d,%s", $i%32?"":"    ", $a[$i],' \
  20      -e '                  ($i+1)%32?"":"\n"; }'
  21
  22  */
  23 static const unsigned char utf7_ascii_properties[128] = {
  24     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  25     3,2,2,2,2,2,2,1,1,1,2,4,1,1,1,5,5,5,5,5,5,5,5,5,5,5,1,2,2,2,2,1,
  26     2,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,2,0,2,2,2,
  27     2,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,2,2,2,0,0,
  28 };
  29 #define SET_D(c) ((c) >= 0 && (c) < 0x80 && (utf7_ascii_properties[(c)] & 1))
  30 #define SET_O(c) ((c) >= 0 && (c) < 0x80 && (utf7_ascii_properties[(c)] & 2))
  31 #define SET_B(c) ((c) >= 0 && (c) < 0x80 && (utf7_ascii_properties[(c)] & 4))
  32
  33 #define base64_value(c) ( (c) >= 'A' && (c) <= 'Z' ? (c) - 'A' : \
  34                           (c) >= 'a' && (c) <= 'z' ? (c) - 'a' + 26 : \
  35                           (c) >= '0' && (c) <= '9' ? (c) - '0' + 52 : \
  36                           (c) == '+' ? 62 : 63 )
  37
  38 static const char *const base64_chars =
  39     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  40
  41 static void read_utf7(charset_spec const *charset, long int input_chr,
  42                       charset_state *state,
  43                       void (*emit)(void *ctx, long int output), void *emitctx)
  44 {
  45     long int hw;
  46
  47     UNUSEDARG(charset);
  48
  49     /*
  50      * state->s0 is used to handle the conversion of the UTF-7
  51      * transport format into a stream of halfwords. Its layout is:
  52      *
  53      *  - In normal ASCII mode, it is zero.
  54      *
  55      *  - Otherwise, it holds a leading 1 followed by all the bits
  56      *    so far accumulated in base64 digits.
  57      *
  58      *  - Special case: when we have only just seen the initial `+'
  59      *    which enters base64 mode, it is set to 2 rather than 1
  60      *    (this is an otherwise unused value since base64 always
  61      *    accumulates an even number of bits at a time), so that
  62      *    the special sequence `+-' can be made to encode `+'
  63      *    easily.
  64      *
  65      * state->s1 is used to handle the conversion of those
  66      * halfwords into Unicode values. It contains a high surrogate
  67      * value if we've just seen one, and 0 otherwise.
  68      */
  69
  70     if (!state->s0) {
  71         if (input_chr == '+')
  72             state->s0 = 2;
  73         else
  74             emit(emitctx, input_chr);
  75         return;
  76     } else {
  77         if (!SET_B(input_chr)) {
  78             /*
  79              * base64 mode ends here. Emit the character we have,
  80              * unless it's a minus in which case we should swallow
  81              * it.
  82              */
  83             if (input_chr != '-')
  84                 emit(emitctx, input_chr);
  85             else if (state->s0 == 2)
  86                 emit(emitctx, '+');    /* special case */
  87             state->s0 = 0;
  88             return;
  89         }
  90
  91         /*
  92          * Now we have a base64 character, so add it to our state,
  93          * first correcting the special case value of s0.
  94          */
  95         if (state->s0 == 2)
  96             state->s0 = 1;
  97         state->s0 = (state->s0 << 6) | base64_value(input_chr);
  98     }
  99
 100     /*
 101      * If we don't have a whole halfword at this point, bale out.
 102      */
 103     if (!(state->s0 & 0xFFFF0000))
 104         return;
 105
 106     /*
 107      * Otherwise, extract the halfword. There are three
 108      * possibilities for where the top set bit might be.
 109      */
 110     if (state->s0 & 0x00100000) {
 111         hw = (state->s0 >> 4) & 0xFFFF;
 112         state->s0 = (state->s0 & 0xF) | 0x10;
 113     } else if (state->s0 & 0x00040000) {
 114         hw = (state->s0 >> 2) & 0xFFFF;
 115         state->s0 = (state->s0 & 3) | 4;
 116     } else {
 117         hw = state->s0 & 0xFFFF;
 118         state->s0 = 1;
 119     }
 120
 121     /*
 122      * Now what reaches this point should be a stream of halfwords
 123      * in sensible numeric form. So now we process surrogates.
 124      */
 125     if (state->s1) {
 126         /*
 127          * We have already seen a high surrogate, so we expect a
 128          * low surrogate. Whinge if we didn't get it.
 129          */
 130         if (hw < 0xDC00 || hw >= 0xE000) {
 131             emit(emitctx, ERROR);
 132         } else {
 133             hw &= 0x3FF;
 134             hw |= (state->s1 & 0x3FF) << 10;
 135             emit(emitctx, hw + 0x10000);
 136         }
 137         state->s1 = 0;
 138     } else {
 139         /*
 140          * Any low surrogate is an error.
 141          */
 142         if (hw >= 0xDC00 && hw < 0xE000) {
 143             emit(emitctx, ERROR);
 144             return;
 145         }
 146
 147         /*
 148          * Any high surrogate is simply stored until we see the
 149          * next halfword.
 150          */
 151         if (hw >= 0xD800 && hw < 0xDC00) {
 152             state->s1 = hw;
 153             return;
 154         }
 155
 156         /*
 157          * Anything else we simply output.
 158          */
 159         emit(emitctx, hw);
 160     }
 161 }
 162
 163 /*
 164  * For writing UTF-7, we supply two charset definitions, one of
 165  * which will directly encode Set O characters and the other of
 166  * which will cautiously base64 them.
 167  */
 168 static int write_utf7(charset_spec const *charset, long int input_chr,
 169                       charset_state *state,
 170                       void (*emit)(void *ctx, long int output),
 171                       void *emitctx)
 172 {
 173     unsigned long hws[2];
 174     int nhws;
 175     int i;
 176
 177     /*
 178      * For writing: state->s0 contains accumulated base64 data with
 179      * a 1 in front, and state->s1 indicates how many bits of it we
 180      * have.
 181      */
 182
 183     if ((input_chr >= 0xD800 && input_chr < 0xE000) ||
 184         input_chr >= 0x110000) {
 185         /*
 186          * We can't output surrogates, or anything above 0x10FFFF.
 187          */
 188         return FALSE;
 189     }
 190
 191     /*
 192      * Look for characters which we output in ASCII mode. A special
 193      * case here is +, which can be encoded as the empty base64
 194      * escape sequence `+-': if we're _already_ in ASCII mode we do
 195      * that, but if we're in base64 mode at the point we see the +
 196      * then we simply stay in base64 mode and output it as a
 197      * halfword. (Switching back would cost three bytes, whereas
 198      * staying in base64 costs only 2 2/3.)
 199      */
 200     if (input_chr == -1 || SET_D(input_chr) ||
 201         (charset->charset == CS_UTF7 && SET_O(input_chr)) ||
 202         (!state->s0 && input_chr == '+')) {
 203         if (state->s0) {
 204             /*
 205              * These characters are output in ASCII mode, so flush any
 206              * lingering base64 data.
 207              */
 208             state->s0 <<= 6 - state->s1;
 209             emit(emitctx, base64_chars[state->s0 & 0x3F]);
 210             /*
 211              * I'm going to arbitrarily decide to always use the
 212              * terminating minus sign. It's easier than figuring out
 213              * whether to do so or not, and looks prettier besides.
 214              */
 215             emit(emitctx, '-');
 216             state->s0 = state->s1 = 0;
 217         }
 218
 219         /*
 220          * Now output the character.
 221          */
 222         if (input_chr != -1)           /* special case: just reset state */
 223             emit(emitctx, input_chr);
 224         if (input_chr == '+')
 225             emit(emitctx, '-');        /* +- encodes + */
 226         return TRUE;
 227     }
 228
 229     /*
 230      * Now we know we have a character that needs to be output as
 231      * either one base64-encoded halfword or two. So first figure
 232      * out how many...
 233      */
 234     if (input_chr < 0x10000) {
 235         nhws = 1;
 236         hws[0] = input_chr;
 237     } else {
 238         input_chr -= 0x10000;
 239         if (input_chr >= 0x100000) {
 240             /* Anything above 0x10FFFF is outside UTF-7 range. */
 241             return FALSE;
 242         }
 243
 244         nhws = 2;
 245         hws[0] = 0xD800 | ((input_chr >> 10) & 0x3FF);
 246         hws[1] = 0xDC00 | (input_chr & 0x3FF);
 247     }
 248
 249     /*
 250      * ... switch into base64 mode if required ...
 251      */
 252     if (!state->s0) {
 253         emit(emitctx, '+');
 254         state->s0 = 1;
 255         state->s1 = 0;
 256     }
 257
 258     /*
 259      * ... and do the base64 output.
 260      */
 261     for (i = 0; i < nhws; i++) {
 262         state->s0 = (state->s0 << 16) | hws[i];
 263         state->s1 += 16;
 264
 265         while (state->s1 >= 6) {
 266             /*
 267              * The top set bit must be in position 16, 18 or 20.
 268              */
 269             unsigned long out, topbit;
 270
 271             out = (state->s0 >> (state->s1 - 6)) & 0x3F;
 272             state->s1 -= 6;
 273             topbit = 1 << state->s1;
 274             state->s0 = (state->s0 & (topbit-1)) | topbit;
 275
 276             emit(emitctx, base64_chars[out]);
 277         }
 278     }
 279     return TRUE;
 280 }
 281
 282 const charset_spec charset_CS_UTF7 = {
 283     CS_UTF7, read_utf7, write_utf7, NULL
 284 };
 285
 286 const charset_spec charset_CS_UTF7_CONSERVATIVE = {
 287     CS_UTF7_CONSERVATIVE, read_utf7, write_utf7, NULL
 288 };
 289
 290 #else /* ENUM_CHARSETS */
 291
/*
 * When ENUM_CHARSETS is defined, all the code above is skipped and
 * this file expands to nothing but the entries below, one per charset
 * it implements. ENUM_CHARSET is not defined in this file; presumably
 * whoever includes it with ENUM_CHARSETS set supplies the definition.
 */
ENUM_CHARSET(CS_UTF7)
ENUM_CHARSET(CS_UTF7_CONSERVATIVE)
 294
 295 #endif /* ENUM_CHARSETS */