git.distorted.org.uk Git - u/mdw/putty/blob - charset/utf8.c

   1 /*
   2  * utf8.c - routines to handle UTF-8.
   3  */
   4
   5 #ifndef ENUM_CHARSETS
   6
   7 #include "charset.h"
   8 #include "internal.h"
   9
  10 void read_utf8(charset_spec const *, long int, charset_state *,
  11                void (*)(void *, long int), void *);
  12 void write_utf8(charset_spec const *, long int,
  13                 charset_state *, void (*)(void *, long int), void *);
  14
  15 /*
  16  * UTF-8 has no associated data, so `charset' may be ignored.
  17  */
  18
  19 void read_utf8(charset_spec const *charset, long int input_chr,
  20                charset_state *state,
  21                void (*emit)(void *ctx, long int output), void *emitctx)
  22 {
  23     UNUSEDARG(charset);
  24
  25     /*
  26      * For reading UTF-8, the `state' word contains:
  27      *
  28      *  - in bits 29-31, the number of bytes expected to be in the
  29      *    current multibyte character (which we can tell instantly
  30      *    from the first byte, of course).
  31      *
  32      *  - in bits 26-28, the number of bytes _seen so far_ in the
  33      *    current multibyte character.
  34      *
  35      *  - in the remainder of the word, the current value of the
  36      *    character, which is shifted upwards by 6 bits to
  37      *    accommodate each new byte.
  38      *
  39      * As required, the state is zero when we are not in the middle
  40      * of a multibyte character at all.
  41      *
  42      * For example, when reading E9 8D 8B, starting at state=0:
  43      *
  44      *  - after E9, the state is 0x64000009
  45      *  - after 8D, the state is 0x6800024d
  46      *  - after 8B, the state conceptually becomes 0x6c00934b, at
  47      *    which point we notice we've got as many characters as we
  48      *    were expecting, output U+934B, and reset the state to
  49      *    zero.
  50      *
  51      * Note that the maximum number of bits we might need to store
  52      * in the character value field is 25 (U+7FFFFFFF contains 31
  53      * bits, but we will never actually store its full value
  54      * because when we receive the last 6 bits in the final
  55      * continuation byte we will output it and revert the state to
  56      * zero). Hence the character value field never collides with
  57      * the byte counts.
  58      */
  59
  60     if (input_chr < 0x80) {
  61         /*
  62          * Single-byte character. If the state is nonzero before
  63          * coming here, output an error for an incomplete sequence.
  64          * Then output the character.
  65          */
  66         if (state->s0 != 0) {
  67             emit(emitctx, ERROR);
  68             state->s0 = 0;
  69         }
  70         emit(emitctx, input_chr);
  71     } else if (input_chr == 0xFE || input_chr == 0xFF) {
  72         /*
  73          * FE and FF bytes should _never_ occur in UTF-8. They are
  74          * automatic errors; if the state was nonzero to start
  75          * with, output a further error for an incomplete sequence.
  76          */
  77         if (state->s0 != 0) {
  78             emit(emitctx, ERROR);
  79             state->s0 = 0;
  80         }
  81         emit(emitctx, ERROR);
  82     } else if (input_chr >= 0x80 && input_chr < 0xC0) {
  83         /*
  84          * Continuation byte. Output an error for an unexpected
  85          * continuation byte, if the state is zero.
  86          */
  87         if (state->s0 == 0) {
  88             emit(emitctx, ERROR);
  89         } else {
  90             unsigned long charval;
  91             unsigned long topstuff;
  92             int bytes;
  93
  94             /*
  95              * Otherwise, accumulate more of the character value.
  96              */
  97             charval = state->s0 & 0x03ffffffL;
  98             charval = (charval << 6) | (input_chr & 0x3F);
  99
 100             /*
 101              * Check the byte counts; if we have not reached the
 102              * end of the character, update the state and return.
 103              */
 104             topstuff = state->s0 & 0xfc000000L;
 105             topstuff += 0x04000000L;   /* add one to the byte count */
 106             if (((topstuff << 3) ^ topstuff) & 0xe0000000L) {
 107                 state->s0 = topstuff | charval;
 108                 return;
 109             }
 110
 111             /*
 112              * Now we know we've reached the end of the character.
 113              * `charval' is the Unicode value. We should check for
 114              * various invalid things, and then either output
 115              * charval or an error. In all cases we reset the state
 116              * to zero.
 117              */
 118             bytes = topstuff >> 29;
 119             state->s0 = 0;
 120
 121             if (charval >= 0xD800 && charval < 0xE000) {
 122                 /*
 123                  * Surrogates (0xD800-0xDFFF) may never be encoded
 124                  * in UTF-8. A surrogate pair in Unicode should
 125                  * have been encoded as a single UTF-8 character
 126                  * occupying more than three bytes.
 127                  */
 128                 emit(emitctx, ERROR);
 129             } else if (charval == 0xFFFE || charval == 0xFFFF) {
 130                 /*
 131                  * U+FFFE and U+FFFF are invalid Unicode characters
 132                  * and may never be encoded in UTF-8. (This is one
 133                  * reason why U+FFFF is our way of signalling an
 134                  * error to our `emit' function :-)
 135                  */
 136                 emit(emitctx, ERROR);
 137             } else if ((charval <= 0x7FL /* && bytes > 1 */) ||
 138                        (charval <= 0x7FFL && bytes > 2) ||
 139                        (charval <= 0xFFFFL && bytes > 3) ||
 140                        (charval <= 0x1FFFFFL && bytes > 4) ||
 141                        (charval <= 0x3FFFFFFL && bytes > 5)) {
 142                 /*
 143                  * Overlong sequences are not to be tolerated,
 144                  * under any circumstances.
 145                  */
 146                 emit(emitctx, ERROR);
 147             } else {
 148                 /*
 149                  * Oh, all right. We'll let this one off.
 150                  */
 151                 emit(emitctx, charval);
 152             }
 153         }
 154
 155     } else {
 156         /*
 157          * Lead byte. First output an error for an incomplete
 158          * sequence, if the state is nonzero.
 159          */
 160         if (state->s0 != 0)
 161             emit(emitctx, ERROR);
 162
 163         /*
 164          * Now deal with the lead byte: work out the number of
 165          * bytes we expect to see in this character, and extract
 166          * the initial bits of it too.
 167          */
 168         if (input_chr >= 0xC0 && input_chr < 0xE0) {
 169             state->s0 = 0x44000000L | (input_chr & 0x1F);
 170         } else if (input_chr >= 0xE0 && input_chr < 0xF0) {
 171             state->s0 = 0x64000000L | (input_chr & 0x0F);
 172         } else if (input_chr >= 0xF0 && input_chr < 0xF8) {
 173             state->s0 = 0x84000000L | (input_chr & 0x07);
 174         } else if (input_chr >= 0xF8 && input_chr < 0xFC) {
 175             state->s0 = 0xa4000000L | (input_chr & 0x03);
 176         } else if (input_chr >= 0xFC && input_chr < 0xFE) {
 177             state->s0 = 0xc4000000L | (input_chr & 0x01);
 178         }
 179     }
 180 }
 181
 182 /*
 183  * UTF-8 is a stateless multi-byte encoding (in the sense that just
 184  * after any character has been completed, the state is always the
 185  * same); hence when writing it, there is no need to use the
 186  * charset_state.
 187  */
 188
 189 void write_utf8(charset_spec const *charset, long int input_chr,
 190                 charset_state *state,
 191                 void (*emit)(void *ctx, long int output), void *emitctx)
 192 {
 193     UNUSEDARG(charset);
 194     UNUSEDARG(state);
 195
 196     /*
 197      * Refuse to output any illegal code points.
 198      */
 199     if (input_chr == 0xFFFE || input_chr == 0xFFFF ||
 200         (input_chr >= 0xD800 && input_chr < 0xE000)) {
 201         emit(emitctx, ERROR);
 202     } else if (input_chr < 0x80) {     /* one-byte character */
 203         emit(emitctx, input_chr);
 204     } else if (input_chr < 0x800) {    /* two-byte character */
 205         emit(emitctx, 0xC0 | (0x1F & (input_chr >>  6)));
 206         emit(emitctx, 0x80 | (0x3F & (input_chr      )));
 207     } else if (input_chr < 0x10000) {  /* three-byte character */
 208         emit(emitctx, 0xE0 | (0x0F & (input_chr >> 12)));
 209         emit(emitctx, 0x80 | (0x3F & (input_chr >>  6)));
 210         emit(emitctx, 0x80 | (0x3F & (input_chr      )));
 211     } else if (input_chr < 0x200000) { /* four-byte character */
 212         emit(emitctx, 0xF0 | (0x07 & (input_chr >> 18)));
 213         emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
 214         emit(emitctx, 0x80 | (0x3F & (input_chr >>  6)));
 215         emit(emitctx, 0x80 | (0x3F & (input_chr      )));
 216     } else if (input_chr < 0x4000000) {/* five-byte character */
 217         emit(emitctx, 0xF8 | (0x03 & (input_chr >> 24)));
 218         emit(emitctx, 0x80 | (0x3F & (input_chr >> 18)));
 219         emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
 220         emit(emitctx, 0x80 | (0x3F & (input_chr >>  6)));
 221         emit(emitctx, 0x80 | (0x3F & (input_chr      )));
 222     } else {                           /* six-byte character */
 223         emit(emitctx, 0xFC | (0x01 & (input_chr >> 30)));
 224         emit(emitctx, 0x80 | (0x3F & (input_chr >> 24)));
 225         emit(emitctx, 0x80 | (0x3F & (input_chr >> 18)));
 226         emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
 227         emit(emitctx, 0x80 | (0x3F & (input_chr >>  6)));
 228         emit(emitctx, 0x80 | (0x3F & (input_chr      )));
 229     }
 230 }
 231
 232 #ifdef TESTMODE
 233
 234 #include <stdio.h>
 235 #include <stdarg.h>
 236
 237 int total_errs = 0;
 238
 239 void utf8_emit(void *ctx, long output)
 240 {
 241     wchar_t **p = (wchar_t **)ctx;
 242     *(*p)++ = output;
 243 }
 244
 245 void utf8_read_test(int line, char *input, int inlen, ...)
 246 {
 247     va_list ap;
 248     wchar_t *p, str[512];
 249     int i;
 250     charset_state state;
 251     unsigned long l;
 252
 253     state.s0 = 0;
 254     p = str;
 255
 256     for (i = 0; i < inlen; i++)
 257         read_utf8(NULL, input[i] & 0xFF, &state, utf8_emit, &p);
 258
 259     va_start(ap, inlen);
 260     l = 0;
 261     for (i = 0; i < p - str; i++) {
 262         l = va_arg(ap, long int);
 263         if (l == -1) {
 264             printf("%d: correct string shorter than output\n", line);
 265             total_errs++;
 266             break;
 267         }
 268         if (l != str[i]) {
 269             printf("%d: char %d came out as %08x, should be %08x\n",
 270                     line, i, str[i], l);
 271             total_errs++;
 272         }
 273     }
 274     if (l != -1) {
 275         l = va_arg(ap, long int);
 276         if (l != -1) {
 277             printf("%d: correct string longer than output\n", line);
 278             total_errs++;
 279         }
 280     }
 281     va_end(ap);
 282 }
 283
 284 void utf8_write_test(int line, const long *input, int inlen, ...)
 285 {
 286     va_list ap;
 287     wchar_t *p, str[512];
 288     int i;
 289     charset_state state;
 290     unsigned long l;
 291
 292     state.s0 = 0;
 293     p = str;
 294
 295     for (i = 0; i < inlen; i++)
 296         write_utf8(NULL, input[i], &state, utf8_emit, &p);
 297
 298     va_start(ap, inlen);
 299     l = 0;
 300     for (i = 0; i < p - str; i++) {
 301         l = va_arg(ap, long int);
 302         if (l == -1) {
 303             printf("%d: correct string shorter than output\n", line);
 304             total_errs++;
 305             break;
 306         }
 307         if (l != str[i]) {
 308             printf("%d: char %d came out as %08x, should be %08x\n",
 309                     line, i, str[i], l);
 310             total_errs++;
 311         }
 312     }
 313     if (l != -1) {
 314         l = va_arg(ap, long int);
 315         if (l != -1) {
 316             printf("%d: correct string longer than output\n", line);
 317             total_errs++;
 318         }
 319     }
 320     va_end(ap);
 321 }
 322
 323 /* Macro to concoct the first three parameters of utf8_read_test. */
 324 #define TESTSTR(x) __LINE__, x, lenof(x)
 325
 326 int main(void)
 327 {
 328     printf("read tests beginning\n");
 329     utf8_read_test(TESTSTR("\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5"),
 330                    0x000003BA, /* GREEK SMALL LETTER KAPPA */
 331                    0x00001F79, /* GREEK SMALL LETTER OMICRON WITH OXIA */
 332                    0x000003C3, /* GREEK SMALL LETTER SIGMA */
 333                    0x000003BC, /* GREEK SMALL LETTER MU */
 334                    0x000003B5, /* GREEK SMALL LETTER EPSILON */
 335                    0, -1);
 336     utf8_read_test(TESTSTR("\x00"),
 337                    0x00000000, /* <control> */
 338                    0, -1);
 339     utf8_read_test(TESTSTR("\xC2\x80"),
 340                    0x00000080, /* <control> */
 341                    0, -1);
 342     utf8_read_test(TESTSTR("\xE0\xA0\x80"),
 343                    0x00000800, /* <no name available> */
 344                    0, -1);
 345     utf8_read_test(TESTSTR("\xF0\x90\x80\x80"),
 346                    0x00010000, /* <no name available> */
 347                    0, -1);
 348     utf8_read_test(TESTSTR("\xF8\x88\x80\x80\x80"),
 349                    0x00200000, /* <no name available> */
 350                    0, -1);
 351     utf8_read_test(TESTSTR("\xFC\x84\x80\x80\x80\x80"),
 352                    0x04000000, /* <no name available> */
 353                    0, -1);
 354     utf8_read_test(TESTSTR("\x7F"),
 355                    0x0000007F, /* <control> */
 356                    0, -1);
 357     utf8_read_test(TESTSTR("\xDF\xBF"),
 358                    0x000007FF, /* <no name available> */
 359                    0, -1);
 360     utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
 361                    0x0000FFFD, /* REPLACEMENT CHARACTER */
 362                    0, -1);
 363     utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
 364                    ERROR,      /* <no name available> (invalid char) */
 365                    0, -1);
 366     utf8_read_test(TESTSTR("\xF7\xBF\xBF\xBF"),
 367                    0x001FFFFF, /* <no name available> */
 368                    0, -1);
 369     utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF\xBF"),
 370                    0x03FFFFFF, /* <no name available> */
 371                    0, -1);
 372     utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF\xBF"),
 373                    0x7FFFFFFF, /* <no name available> */
 374                    0, -1);
 375     utf8_read_test(TESTSTR("\xED\x9F\xBF"),
 376                    0x0000D7FF, /* <no name available> */
 377                    0, -1);
 378     utf8_read_test(TESTSTR("\xEE\x80\x80"),
 379                    0x0000E000, /* <Private Use, First> */
 380                    0, -1);
 381     utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
 382                    0x0000FFFD, /* REPLACEMENT CHARACTER */
 383                    0, -1);
 384     utf8_read_test(TESTSTR("\xF4\x8F\xBF\xBF"),
 385                    0x0010FFFF, /* <no name available> */
 386                    0, -1);
 387     utf8_read_test(TESTSTR("\xF4\x90\x80\x80"),
 388                    0x00110000, /* <no name available> */
 389                    0, -1);
 390     utf8_read_test(TESTSTR("\x80"),
 391                    ERROR,      /* (unexpected continuation byte) */
 392                    0, -1);
 393     utf8_read_test(TESTSTR("\xBF"),
 394                    ERROR,      /* (unexpected continuation byte) */
 395                    0, -1);
 396     utf8_read_test(TESTSTR("\x80\xBF"),
 397                    ERROR,      /* (unexpected continuation byte) */
 398                    ERROR,      /* (unexpected continuation byte) */
 399                    0, -1);
 400     utf8_read_test(TESTSTR("\x80\xBF\x80"),
 401                    ERROR,      /* (unexpected continuation byte) */
 402                    ERROR,      /* (unexpected continuation byte) */
 403                    ERROR,      /* (unexpected continuation byte) */
 404                    0, -1);
 405     utf8_read_test(TESTSTR("\x80\xBF\x80\xBF"),
 406                    ERROR,      /* (unexpected continuation byte) */
 407                    ERROR,      /* (unexpected continuation byte) */
 408                    ERROR,      /* (unexpected continuation byte) */
 409                    ERROR,      /* (unexpected continuation byte) */
 410                    0, -1);
 411     utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80"),
 412                    ERROR,      /* (unexpected continuation byte) */
 413                    ERROR,      /* (unexpected continuation byte) */
 414                    ERROR,      /* (unexpected continuation byte) */
 415                    ERROR,      /* (unexpected continuation byte) */
 416                    ERROR,      /* (unexpected continuation byte) */
 417                    0, -1);
 418     utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF"),
 419                    ERROR,      /* (unexpected continuation byte) */
 420                    ERROR,      /* (unexpected continuation byte) */
 421                    ERROR,      /* (unexpected continuation byte) */
 422                    ERROR,      /* (unexpected continuation byte) */
 423                    ERROR,      /* (unexpected continuation byte) */
 424                    ERROR,      /* (unexpected continuation byte) */
 425                    0, -1);
 426     utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF\x80"),
 427                    ERROR,      /* (unexpected continuation byte) */
 428                    ERROR,      /* (unexpected continuation byte) */
 429                    ERROR,      /* (unexpected continuation byte) */
 430                    ERROR,      /* (unexpected continuation byte) */
 431                    ERROR,      /* (unexpected continuation byte) */
 432                    ERROR,      /* (unexpected continuation byte) */
 433                    ERROR,      /* (unexpected continuation byte) */
 434                    0, -1);
 435     utf8_read_test(TESTSTR("\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"),
 436                    ERROR,      /* (unexpected continuation byte) */
 437                    ERROR,      /* (unexpected continuation byte) */
 438                    ERROR,      /* (unexpected continuation byte) */
 439                    ERROR,      /* (unexpected continuation byte) */
 440                    ERROR,      /* (unexpected continuation byte) */
 441                    ERROR,      /* (unexpected continuation byte) */
 442                    ERROR,      /* (unexpected continuation byte) */
 443                    ERROR,      /* (unexpected continuation byte) */
 444                    ERROR,      /* (unexpected continuation byte) */
 445                    ERROR,      /* (unexpected continuation byte) */
 446                    ERROR,      /* (unexpected continuation byte) */
 447                    ERROR,      /* (unexpected continuation byte) */
 448                    ERROR,      /* (unexpected continuation byte) */
 449                    ERROR,      /* (unexpected continuation byte) */
 450                    ERROR,      /* (unexpected continuation byte) */
 451                    ERROR,      /* (unexpected continuation byte) */
 452                    ERROR,      /* (unexpected continuation byte) */
 453                    ERROR,      /* (unexpected continuation byte) */
 454                    ERROR,      /* (unexpected continuation byte) */
 455                    ERROR,      /* (unexpected continuation byte) */
 456                    ERROR,      /* (unexpected continuation byte) */
 457                    ERROR,      /* (unexpected continuation byte) */
 458                    ERROR,      /* (unexpected continuation byte) */
 459                    ERROR,      /* (unexpected continuation byte) */
 460                    ERROR,      /* (unexpected continuation byte) */
 461                    ERROR,      /* (unexpected continuation byte) */
 462                    ERROR,      /* (unexpected continuation byte) */
 463                    ERROR,      /* (unexpected continuation byte) */
 464                    ERROR,      /* (unexpected continuation byte) */
 465                    ERROR,      /* (unexpected continuation byte) */
 466                    ERROR,      /* (unexpected continuation byte) */
 467                    ERROR,      /* (unexpected continuation byte) */
 468                    ERROR,      /* (unexpected continuation byte) */
 469                    ERROR,      /* (unexpected continuation byte) */
 470                    ERROR,      /* (unexpected continuation byte) */
 471                    ERROR,      /* (unexpected continuation byte) */
 472                    ERROR,      /* (unexpected continuation byte) */
 473                    ERROR,      /* (unexpected continuation byte) */
 474                    ERROR,      /* (unexpected continuation byte) */
 475                    ERROR,      /* (unexpected continuation byte) */
 476                    ERROR,      /* (unexpected continuation byte) */
 477                    ERROR,      /* (unexpected continuation byte) */
 478                    ERROR,      /* (unexpected continuation byte) */
 479                    ERROR,      /* (unexpected continuation byte) */
 480                    ERROR,      /* (unexpected continuation byte) */
 481                    ERROR,      /* (unexpected continuation byte) */
 482                    ERROR,      /* (unexpected continuation byte) */
 483                    ERROR,      /* (unexpected continuation byte) */
 484                    ERROR,      /* (unexpected continuation byte) */
 485                    ERROR,      /* (unexpected continuation byte) */
 486                    ERROR,      /* (unexpected continuation byte) */
 487                    ERROR,      /* (unexpected continuation byte) */
 488                    ERROR,      /* (unexpected continuation byte) */
 489                    ERROR,      /* (unexpected continuation byte) */
 490                    ERROR,      /* (unexpected continuation byte) */
 491                    ERROR,      /* (unexpected continuation byte) */
 492                    ERROR,      /* (unexpected continuation byte) */
 493                    ERROR,      /* (unexpected continuation byte) */
 494                    ERROR,      /* (unexpected continuation byte) */
 495                    ERROR,      /* (unexpected continuation byte) */
 496                    ERROR,      /* (unexpected continuation byte) */
 497                    ERROR,      /* (unexpected continuation byte) */
 498                    ERROR,      /* (unexpected continuation byte) */
 499                    ERROR,      /* (unexpected continuation byte) */
 500                    0, -1);
 501     utf8_read_test(TESTSTR("\xC0\x20\xC1\x20\xC2\x20\xC3\x20\xC4\x20\xC5\x20\xC6\x20\xC7\x20"),
 502                    ERROR,      /* (incomplete sequence) */
 503                    0x00000020, /* SPACE */
 504                    ERROR,      /* (incomplete sequence) */
 505                    0x00000020, /* SPACE */
 506                    ERROR,      /* (incomplete sequence) */
 507                    0x00000020, /* SPACE */
 508                    ERROR,      /* (incomplete sequence) */
 509                    0x00000020, /* SPACE */
 510                    ERROR,      /* (incomplete sequence) */
 511                    0x00000020, /* SPACE */
 512                    ERROR,      /* (incomplete sequence) */
 513                    0x00000020, /* SPACE */
 514                    ERROR,      /* (incomplete sequence) */
 515                    0x00000020, /* SPACE */
 516                    ERROR,      /* (incomplete sequence) */
 517                    0x00000020, /* SPACE */
 518                    0, -1);
 519     utf8_read_test(TESTSTR("\xE0\x20\xE1\x20\xE2\x20\xE3\x20\xE4\x20\xE5\x20\xE6\x20\xE7\x20\xE8\x20\xE9\x20\xEA\x20\xEB\x20\xEC\x20\xED\x20\xEE\x20\xEF\x20"),
 520                    ERROR,      /* (incomplete sequence) */
 521                    0x00000020, /* SPACE */
 522                    ERROR,      /* (incomplete sequence) */
 523                    0x00000020, /* SPACE */
 524                    ERROR,      /* (incomplete sequence) */
 525                    0x00000020, /* SPACE */
 526                    ERROR,      /* (incomplete sequence) */
 527                    0x00000020, /* SPACE */
 528                    ERROR,      /* (incomplete sequence) */
 529                    0x00000020, /* SPACE */
 530                    ERROR,      /* (incomplete sequence) */
 531                    0x00000020, /* SPACE */
 532                    ERROR,      /* (incomplete sequence) */
 533                    0x00000020, /* SPACE */
 534                    ERROR,      /* (incomplete sequence) */
 535                    0x00000020, /* SPACE */
 536                    ERROR,      /* (incomplete sequence) */
 537                    0x00000020, /* SPACE */
 538                    ERROR,      /* (incomplete sequence) */
 539                    0x00000020, /* SPACE */
 540                    ERROR,      /* (incomplete sequence) */
 541                    0x00000020, /* SPACE */
 542                    ERROR,      /* (incomplete sequence) */
 543                    0x00000020, /* SPACE */
 544                    ERROR,      /* (incomplete sequence) */
 545                    0x00000020, /* SPACE */
 546                    ERROR,      /* (incomplete sequence) */
 547                    0x00000020, /* SPACE */
 548                    ERROR,      /* (incomplete sequence) */
 549                    0x00000020, /* SPACE */
 550                    ERROR,      /* (incomplete sequence) */
 551                    0x00000020, /* SPACE */
 552                    0, -1);
 553     utf8_read_test(TESTSTR("\xF0\x20\xF1\x20\xF2\x20\xF3\x20\xF4\x20\xF5\x20\xF6\x20\xF7\x20"),
 554                    ERROR,      /* (incomplete sequence) */
 555                    0x00000020, /* SPACE */
 556                    ERROR,      /* (incomplete sequence) */
 557                    0x00000020, /* SPACE */
 558                    ERROR,      /* (incomplete sequence) */
 559                    0x00000020, /* SPACE */
 560                    ERROR,      /* (incomplete sequence) */
 561                    0x00000020, /* SPACE */
 562                    ERROR,      /* (incomplete sequence) */
 563                    0x00000020, /* SPACE */
 564                    ERROR,      /* (incomplete sequence) */
 565                    0x00000020, /* SPACE */
 566                    ERROR,      /* (incomplete sequence) */
 567                    0x00000020, /* SPACE */
 568                    ERROR,      /* (incomplete sequence) */
 569                    0x00000020, /* SPACE */
 570                    0, -1);
 571     utf8_read_test(TESTSTR("\xF8\x20\xF9\x20\xFA\x20\xFB\x20"),
 572                    ERROR,      /* (incomplete sequence) */
 573                    0x00000020, /* SPACE */
 574                    ERROR,      /* (incomplete sequence) */
 575                    0x00000020, /* SPACE */
 576                    ERROR,      /* (incomplete sequence) */
 577                    0x00000020, /* SPACE */
 578                    ERROR,      /* (incomplete sequence) */
 579                    0x00000020, /* SPACE */
 580                    0, -1);
 581     utf8_read_test(TESTSTR("\xFC\x20\xFD\x20"),
 582                    ERROR,      /* (incomplete sequence) */
 583                    0x00000020, /* SPACE */
 584                    ERROR,      /* (incomplete sequence) */
 585                    0x00000020, /* SPACE */
 586                    0, -1);
 587     utf8_read_test(TESTSTR("\xC0"),
 588                    ERROR,      /* (incomplete sequence) */
 589                    0, -1);
 590     utf8_read_test(TESTSTR("\xE0\x80"),
 591                    ERROR,      /* (incomplete sequence) */
 592                    0, -1);
 593     utf8_read_test(TESTSTR("\xF0\x80\x80"),
 594                    ERROR,      /* (incomplete sequence) */
 595                    0, -1);
 596     utf8_read_test(TESTSTR("\xF8\x80\x80\x80"),
 597                    ERROR,      /* (incomplete sequence) */
 598                    0, -1);
 599     utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80"),
 600                    ERROR,      /* (incomplete sequence) */
 601                    0, -1);
 602     utf8_read_test(TESTSTR("\xDF"),
 603                    ERROR,      /* (incomplete sequence) */
 604                    0, -1);
 605     utf8_read_test(TESTSTR("\xEF\xBF"),
 606                    ERROR,      /* (incomplete sequence) */
 607                    0, -1);
 608     utf8_read_test(TESTSTR("\xF7\xBF\xBF"),
 609                    ERROR,      /* (incomplete sequence) */
 610                    0, -1);
 611     utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF"),
 612                    ERROR,      /* (incomplete sequence) */
 613                    0, -1);
 614     utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF"),
 615                    ERROR,      /* (incomplete sequence) */
 616                    0, -1);
 617     utf8_read_test(TESTSTR("\xC0\xE0\x80\xF0\x80\x80\xF8\x80\x80\x80\xFC\x80\x80\x80\x80\xDF\xEF\xBF\xF7\xBF\xBF\xFB\xBF\xBF\xBF\xFD\xBF\xBF\xBF\xBF"),
 618                    ERROR,      /* (incomplete sequence) */
 619                    ERROR,      /* (incomplete sequence) */
 620                    ERROR,      /* (incomplete sequence) */
 621                    ERROR,      /* (incomplete sequence) */
 622                    ERROR,      /* (incomplete sequence) */
 623                    ERROR,      /* (incomplete sequence) */
 624                    ERROR,      /* (incomplete sequence) */
 625                    ERROR,      /* (incomplete sequence) */
 626                    ERROR,      /* (incomplete sequence) */
 627                    ERROR,      /* (incomplete sequence) */
 628                    0, -1);
 629     utf8_read_test(TESTSTR("\xFE"),
 630                    ERROR,      /* (invalid UTF-8 byte) */
 631                    0, -1);
 632     utf8_read_test(TESTSTR("\xFF"),
 633                    ERROR,      /* (invalid UTF-8 byte) */
 634                    0, -1);
 635     utf8_read_test(TESTSTR("\xFE\xFE\xFF\xFF"),
 636                    ERROR,      /* (invalid UTF-8 byte) */
 637                    ERROR,      /* (invalid UTF-8 byte) */
 638                    ERROR,      /* (invalid UTF-8 byte) */
 639                    ERROR,      /* (invalid UTF-8 byte) */
 640                    0, -1);
 641     utf8_read_test(TESTSTR("\xC0\xAF"),
 642                    ERROR,      /* SOLIDUS (overlong form of 2F) */
 643                    0, -1);
 644     utf8_read_test(TESTSTR("\xE0\x80\xAF"),
 645                    ERROR,      /* SOLIDUS (overlong form of 2F) */
 646                    0, -1);
 647     utf8_read_test(TESTSTR("\xF0\x80\x80\xAF"),
 648                    ERROR,      /* SOLIDUS (overlong form of 2F) */
 649                    0, -1);
 650     utf8_read_test(TESTSTR("\xF8\x80\x80\x80\xAF"),
 651                    ERROR,      /* SOLIDUS (overlong form of 2F) */
 652                    0, -1);
 653     utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\xAF"),
 654                    ERROR,      /* SOLIDUS (overlong form of 2F) */
 655                    0, -1);
 656     utf8_read_test(TESTSTR("\xC1\xBF"),
 657                    ERROR,      /* <control> (overlong form of 7F) */
 658                    0, -1);
 659     utf8_read_test(TESTSTR("\xE0\x9F\xBF"),
 660                    ERROR,      /* <no name available> (overlong form of DF BF) */
 661                    0, -1);
 662     utf8_read_test(TESTSTR("\xF0\x8F\xBF\xBF"),
 663                    ERROR,      /* <no name available> (overlong form of EF BF BF) (invalid char) */
 664                    0, -1);
 665     utf8_read_test(TESTSTR("\xF8\x87\xBF\xBF\xBF"),
 666                    ERROR,      /* <no name available> (overlong form of F7 BF BF BF) */
 667                    0, -1);
 668     utf8_read_test(TESTSTR("\xFC\x83\xBF\xBF\xBF\xBF"),
 669                    ERROR,      /* <no name available> (overlong form of FB BF BF BF BF) */
 670                    0, -1);
 671     utf8_read_test(TESTSTR("\xC0\x80"),
 672                    ERROR,      /* <control> (overlong form of 00) */
 673                    0, -1);
 674     utf8_read_test(TESTSTR("\xE0\x80\x80"),
 675                    ERROR,      /* <control> (overlong form of 00) */
 676                    0, -1);
 677     utf8_read_test(TESTSTR("\xF0\x80\x80\x80"),
 678                    ERROR,      /* <control> (overlong form of 00) */
 679                    0, -1);
 680     utf8_read_test(TESTSTR("\xF8\x80\x80\x80\x80"),
 681                    ERROR,      /* <control> (overlong form of 00) */
 682                    0, -1);
 683     utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\x80"),
 684                    ERROR,      /* <control> (overlong form of 00) */
 685                    0, -1);
 686     utf8_read_test(TESTSTR("\xED\xA0\x80"),
 687                    ERROR,      /* <Non Private Use High Surrogate, First> (surrogate) */
 688                    0, -1);
 689     utf8_read_test(TESTSTR("\xED\xAD\xBF"),
 690                    ERROR,      /* <Non Private Use High Surrogate, Last> (surrogate) */
 691                    0, -1);
 692     utf8_read_test(TESTSTR("\xED\xAE\x80"),
 693                    ERROR,      /* <Private Use High Surrogate, First> (surrogate) */
 694                    0, -1);
 695     utf8_read_test(TESTSTR("\xED\xAF\xBF"),
 696                    ERROR,      /* <Private Use High Surrogate, Last> (surrogate) */
 697                    0, -1);
 698     utf8_read_test(TESTSTR("\xED\xB0\x80"),
 699                    ERROR,      /* <Low Surrogate, First> (surrogate) */
 700                    0, -1);
 701     utf8_read_test(TESTSTR("\xED\xBE\x80"),
 702                    ERROR,      /* <no name available> (surrogate) */
 703                    0, -1);
 704     utf8_read_test(TESTSTR("\xED\xBF\xBF"),
 705                    ERROR,      /* <Low Surrogate, Last> (surrogate) */
 706                    0, -1);
 707     utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xB0\x80"),
 708                    ERROR,      /* <Non Private Use High Surrogate, First> (surrogate) */
 709                    ERROR,      /* <Low Surrogate, First> (surrogate) */
 710                    0, -1);
 711     utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xBF\xBF"),
 712                    ERROR,      /* <Non Private Use High Surrogate, First> (surrogate) */
 713                    ERROR,      /* <Low Surrogate, Last> (surrogate) */
 714                    0, -1);
 715     utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xB0\x80"),
 716                    ERROR,      /* <Non Private Use High Surrogate, Last> (surrogate) */
 717                    ERROR,      /* <Low Surrogate, First> (surrogate) */
 718                    0, -1);
 719     utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xBF\xBF"),
 720                    ERROR,      /* <Non Private Use High Surrogate, Last> (surrogate) */
 721                    ERROR,      /* <Low Surrogate, Last> (surrogate) */
 722                    0, -1);
 723     utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xB0\x80"),
 724                    ERROR,      /* <Private Use High Surrogate, First> (surrogate) */
 725                    ERROR,      /* <Low Surrogate, First> (surrogate) */
 726                    0, -1);
 727     utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xBF\xBF"),
 728                    ERROR,      /* <Private Use High Surrogate, First> (surrogate) */
 729                    ERROR,      /* <Low Surrogate, Last> (surrogate) */
 730                    0, -1);
 731     utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xB0\x80"),
 732                    ERROR,      /* <Private Use High Surrogate, Last> (surrogate) */
 733                    ERROR,      /* <Low Surrogate, First> (surrogate) */
 734                    0, -1);
 735     utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xBF\xBF"),
 736                    ERROR,      /* <Private Use High Surrogate, Last> (surrogate) */
 737                    ERROR,      /* <Low Surrogate, Last> (surrogate) */
 738                    0, -1);
 739     utf8_read_test(TESTSTR("\xEF\xBF\xBE"),
 740                    ERROR,      /* <no name available> (invalid char) */
 741                    0, -1);
 742     utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
 743                    ERROR,      /* <no name available> (invalid char) */
 744                    0, -1);
 745     printf("read tests completed\n");
 746     printf("write tests beginning\n");
 747     {
 748         const static long str[] =
 749         {0x03BAL, 0x1F79L, 0x03C3L, 0x03BCL, 0x03B5L, 0};
 750         utf8_write_test(TESTSTR(str),
 751                         0xCE, 0xBA,
 752                         0xE1, 0xBD, 0xB9,
 753                         0xCF, 0x83,
 754                         0xCE, 0xBC,
 755                         0xCE, 0xB5,
 756                         0, -1);
 757     }
 758     {
 759         const static long str[] = {0x0000L, 0};
 760         utf8_write_test(TESTSTR(str),
 761                         0x00,
 762                         0, -1);
 763     }
 764     {
 765         const static long str[] = {0x0080L, 0};
 766         utf8_write_test(TESTSTR(str),
 767                         0xC2, 0x80,
 768                         0, -1);
 769     }
 770     {
 771         const static long str[] = {0x0800L, 0};
 772         utf8_write_test(TESTSTR(str),
 773                         0xE0, 0xA0, 0x80,
 774                         0, -1);
 775     }
 776     {
 777         const static long str[] = {0x00010000L, 0};
 778         utf8_write_test(TESTSTR(str),
 779                         0xF0, 0x90, 0x80, 0x80,
 780                         0, -1);
 781     }
 782     {
 783         const static long str[] = {0x00200000L, 0};
 784         utf8_write_test(TESTSTR(str),
 785                         0xF8, 0x88, 0x80, 0x80, 0x80,
 786                         0, -1);
 787     }
 788     {
 789         const static long str[] = {0x04000000L, 0};
 790         utf8_write_test(TESTSTR(str),
 791                         0xFC, 0x84, 0x80, 0x80, 0x80, 0x80,
 792                         0, -1);
 793     }
 794     {
 795         const static long str[] = {0x007FL, 0};
 796         utf8_write_test(TESTSTR(str),
 797                         0x7F,
 798                         0, -1);
 799     }
 800     {
 801         const static long str[] = {0x07FFL, 0};
 802         utf8_write_test(TESTSTR(str),
 803                         0xDF, 0xBF,
 804                         0, -1);
 805     }
 806     {
 807         const static long str[] = {0xFFFDL, 0};
 808         utf8_write_test(TESTSTR(str),
 809                         0xEF, 0xBF, 0xBD,
 810                         0, -1);
 811     }
 812     {
 813         const static long str[] = {0xFFFFL, 0};
 814         utf8_write_test(TESTSTR(str),
 815                         ERROR,
 816                         0, -1);
 817     }
 818     {
 819         const static long str[] = {0x001FFFFFL, 0};
 820         utf8_write_test(TESTSTR(str),
 821                         0xF7, 0xBF, 0xBF, 0xBF,
 822                         0, -1);
 823     }
 824     {
 825         const static long str[] = {0x03FFFFFFL, 0};
 826         utf8_write_test(TESTSTR(str),
 827                         0xFB, 0xBF, 0xBF, 0xBF, 0xBF,
 828                         0, -1);
 829     }
 830     {
 831         const static long str[] = {0x7FFFFFFFL, 0};
 832         utf8_write_test(TESTSTR(str),
 833                         0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,
 834                         0, -1);
 835     }
 836     {
 837         const static long str[] = {0xD7FFL, 0};
 838         utf8_write_test(TESTSTR(str),
 839                         0xED, 0x9F, 0xBF,
 840                         0, -1);
 841     }
 842     {
 843         const static long str[] = {0xD800L, 0};
 844         utf8_write_test(TESTSTR(str),
 845                         ERROR,
 846                         0, -1);
 847     }
 848     {
 849         const static long str[] = {0xD800L, 0xDC00L, 0};
 850         utf8_write_test(TESTSTR(str),
 851                         ERROR,
 852                         ERROR,
 853                         0, -1);
 854     }
 855     {
 856         const static long str[] = {0xDFFFL, 0};
 857         utf8_write_test(TESTSTR(str),
 858                         ERROR,
 859                         0, -1);
 860     }
 861     {
 862         const static long str[] = {0xE000L, 0};
 863         utf8_write_test(TESTSTR(str),
 864                         0xEE, 0x80, 0x80,
 865                         0, -1);
 866     }
 867     printf("write tests completed\n");
 868
 869     printf("total: %d errors\n", total_errs);
 870     return (total_errs != 0);
 871 }
 872 #endif /* TESTMODE */
 873
 874 const charset_spec charset_CS_UTF8 = { /* charset descriptor for UTF-8, initialised positionally */
 875     CS_UTF8, read_utf8, write_utf8, NULL /* charset id, then the reader and writer declared at the top of this file; the final NULL is presumably the per-charset data slot (UTF-8 has no associated data) -- confirm against the charset_spec declaration */
 876 };
 877
 878 #else /* ENUM_CHARSETS */
 879
 880 ENUM_CHARSET(CS_UTF8) /* the only thing this file contributes when ENUM_CHARSETS is defined; ENUM_CHARSET is defined elsewhere, so its expansion is not visible here */
 881
 882 #endif /* ENUM_CHARSETS */