mdw@git.distorted.org.uk Git - sgt/charset/blob - utf8.c

   1 /*
   2  * utf8.c - routines to handle UTF-8.
   3  */
   4
   5 #ifndef ENUM_CHARSETS
   6
   7 #include "charset.h"
   8 #include "internal.h"
   9
  10 /*
  11  * UTF-8 has no associated data, so `charset' may be ignored.
  12  */
  13
  14 void read_utf8(charset_spec const *charset, long int input_chr,
  15                charset_state *state,
  16                void (*emit)(void *ctx, long int output), void *emitctx)
  17 {
  18     UNUSEDARG(charset);
  19
  20     /*
  21      * For reading UTF-8, the `state' word contains the character
  22      * being accumulated.  This is shifted left by six bits each
  23      * time a character is added, and there's a single '1' bit
  24      * in what would be bit 31 of the final character, which we
  25      * use to detect when it's complete.
  26      *
  27      * As required, the state is zero when we are not in the middle
  28      * of a multibyte character at all.
  29      *
  30      * For example, when reading E9 8D 8B, starting at state=0:
  31      *
  32      *  - after E9, the state is 0x00080009
  33      *  - after 8D, the state is 0x0200024d
  34      *  - after 8B, the state conceptually becomes 0x8000934b, at
  35      *    which point we notice we've got as many characters as we
  36      *    were expecting, output U+934B, and reset the state to
  37      *    zero.
  38      *
  39      * If we detect an overlong sequence, we shift the marker bit
  40      * right one bit.  This is safe because an overlong sequence
  41      * can't encode a top-bit-set character.  Not that we worry
  42      * about what overlong sequences are trying to encode, but
  43      * it's nice to know that we could if we wanted to.
  44      *
  45      * Note that the maximum number of bits we might need to store
  46      * in the character value field is 25 (U+7FFFFFFF contains 31
  47      * bits, but we will never actually store its full value
  48      * because when we receive the last 6 bits in the final
  49      * continuation byte we will output it and revert the state to
  50      * zero). Hence we need 26 bits in total.
  51      */
  52
  53     if (input_chr < 0x80) {
  54         /*
  55          * Single-byte character. If the state is nonzero before
  56          * coming here, output an error for an incomplete sequence.
  57          * Then output the character.
  58          */
  59         if (state->s0 != 0) {
  60             emit(emitctx, ERROR);
  61             state->s0 = 0;
  62         }
  63         emit(emitctx, input_chr);
  64     } else if (input_chr == 0xFE || input_chr == 0xFF) {
  65         /*
  66          * FE and FF bytes should _never_ occur in UTF-8. They are
  67          * automatic errors; if the state was nonzero to start
  68          * with, output a further error for an incomplete sequence.
  69          */
  70         if (state->s0 != 0) {
  71             emit(emitctx, ERROR);
  72             state->s0 = 0;
  73         }
  74         emit(emitctx, ERROR);
  75     } else if (input_chr >= 0x80 && input_chr < 0xC0) {
  76         /*
  77          * Continuation byte. Output an error for an unexpected
  78          * continuation byte, if the state is zero.
  79          */
  80         if (state->s0 == 0) {
  81             emit(emitctx, ERROR);
  82         } else {
  83             unsigned long charval;
  84
  85             /*
  86              * Otherwise, accumulate more of the character value.
  87              */
  88             charval = state->s0;
  89             charval = (charval << 6) | (input_chr & 0x3F);
  90
  91             /*
  92              * Detect overlong encodings.  We're looking for too many
  93              * leading zeroes given our position in the character.  If
  94              * we find an overlong encoding, clear the current marker
  95              * bit and set the bit below it.  Overlong two-byte
  96              * encodings are a special case, and are detected when we
  97              * read their inital byte.
  98              */
  99             if ((charval & 0xffffffe0L) == 0x02000000L)
 100                 charval ^= 0x03000000L;
 101             else if ((charval & 0xfffffff0L) == 0x00080000L)
 102                 charval ^= 0x000c0000L;
 103             else if ((charval & 0xfffffff8L) == 0x00002000L)
 104                 charval ^= 0x00003000L;
 105             else if ((charval & 0xfffffffcL) == 0x00000080L)
 106                 charval ^= 0x000000c0L;
 107
 108             /*
 109              * Check the byte counts; if we have not reached the
 110              * end of the character, update the state and return.
 111              */
 112             if (!(charval & 0xc0000000L)) {
 113                 state->s0 = charval;
 114                 return;
 115             }
 116
 117             /*
 118              * Clear the marker bit, or set it if it's clear,
 119              * indicating an overlong sequence.
 120              */
 121             charval ^= 0x80000000L;
 122
 123             /*
 124              * Now we know we've reached the end of the character.
 125              * `charval' is the Unicode value. We should check for
 126              * various invalid things, and then either output
 127              * charval or an error. In all cases we reset the state
 128              * to zero.
 129              */
 130             state->s0 = 0;
 131
 132             if (charval & 0x80000000L) {
 133                 /* We got an overlong sequence. */
 134                 emit(emitctx, ERROR);
 135             } else if (charval >= 0xD800 && charval < 0xE000) {
 136                 /*
 137                  * Surrogates (0xD800-0xDFFF) may never be encoded
 138                  * in UTF-8. A surrogate pair in Unicode should
 139                  * have been encoded as a single UTF-8 character
 140                  * occupying more than three bytes.
 141                  */
 142                 emit(emitctx, ERROR);
 143             } else if (charval == 0xFFFE || charval == 0xFFFF) {
 144                 /*
 145                  * U+FFFE and U+FFFF are invalid Unicode characters
 146                  * and may never be encoded in UTF-8. (This is one
 147                  * reason why U+FFFF is our way of signalling an
 148                  * error to our `emit' function :-)
 149                  */
 150                 emit(emitctx, ERROR);
 151             } else {
 152                 /*
 153                  * Oh, all right. We'll let this one off.
 154                  */
 155                 emit(emitctx, charval);
 156             }
 157         }
 158
 159     } else {
 160         /*
 161          * Lead byte. First output an error for an incomplete
 162          * sequence, if the state is nonzero.
 163          */
 164         if (state->s0 != 0)
 165             emit(emitctx, ERROR);
 166
 167         /*
 168          * Now deal with the lead byte: work out the number of
 169          * bytes we expect to see in this character, and extract
 170          * the initial bits of it too.
 171          */
 172         if (input_chr >= 0xC0 && input_chr < 0xC2) {
 173             /* beginning of an overlong two-byte sequence */
 174             state->s0 = 0x01000000L | (input_chr & 0x1F);
 175         } else if (input_chr >= 0xC2 && input_chr < 0xE0) {
 176             state->s0 = 0x02000000L | (input_chr & 0x1F);
 177         } else if (input_chr >= 0xE0 && input_chr < 0xF0) {
 178             state->s0 = 0x00080000L | (input_chr & 0x0F);
 179         } else if (input_chr >= 0xF0 && input_chr < 0xF8) {
 180             state->s0 = 0x00002000L | (input_chr & 0x07);
 181         } else if (input_chr >= 0xF8 && input_chr < 0xFC) {
 182             state->s0 = 0x00000080L | (input_chr & 0x03);
 183         } else if (input_chr >= 0xFC && input_chr < 0xFE) {
 184             state->s0 = 0x00000002L | (input_chr & 0x01);
 185         }
 186     }
 187 }
 188
 189 /*
 190  * UTF-8 is a stateless multi-byte encoding (in the sense that just
 191  * after any character has been completed, the state is always the
 192  * same); hence when writing it, there is no need to use the
 193  * charset_state.
 194  */
 195
 196 static int write_utf8(charset_spec const *charset, long int input_chr,
 197                       charset_state *state,
 198                       void (*emit)(void *ctx, long int output),
 199                       void *emitctx)
 200 {
 201     UNUSEDARG(charset);
 202     UNUSEDARG(state);
 203
 204     if (input_chr == -1)
 205         return TRUE;                   /* stateless; no cleanup required */
 206
 207     /*
 208      * Refuse to output any illegal code points.
 209      */
 210     if (input_chr == 0xFFFE || input_chr == 0xFFFF ||
 211         (input_chr >= 0xD800 && input_chr < 0xE000)) {
 212         return FALSE;
 213     } else if (input_chr < 0x80) {     /* one-byte character */
 214         emit(emitctx, input_chr);
 215         return TRUE;
 216     } else if (input_chr < 0x800) {    /* two-byte character */
 217         emit(emitctx, 0xC0 | (0x1F & (input_chr >>  6)));
 218         emit(emitctx, 0x80 | (0x3F & (input_chr      )));
 219         return TRUE;
 220     } else if (input_chr < 0x10000) {  /* three-byte character */
 221         emit(emitctx, 0xE0 | (0x0F & (input_chr >> 12)));
 222         emit(emitctx, 0x80 | (0x3F & (input_chr >>  6)));
 223         emit(emitctx, 0x80 | (0x3F & (input_chr      )));
 224         return TRUE;
 225     } else if (input_chr < 0x200000) { /* four-byte character */
 226         emit(emitctx, 0xF0 | (0x07 & (input_chr >> 18)));
 227         emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
 228         emit(emitctx, 0x80 | (0x3F & (input_chr >>  6)));
 229         emit(emitctx, 0x80 | (0x3F & (input_chr      )));
 230         return TRUE;
 231     } else if (input_chr < 0x4000000) {/* five-byte character */
 232         emit(emitctx, 0xF8 | (0x03 & (input_chr >> 24)));
 233         emit(emitctx, 0x80 | (0x3F & (input_chr >> 18)));
 234         emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
 235         emit(emitctx, 0x80 | (0x3F & (input_chr >>  6)));
 236         emit(emitctx, 0x80 | (0x3F & (input_chr      )));
 237         return TRUE;
 238     } else {                           /* six-byte character */
 239         emit(emitctx, 0xFC | (0x01 & (input_chr >> 30)));
 240         emit(emitctx, 0x80 | (0x3F & (input_chr >> 24)));
 241         emit(emitctx, 0x80 | (0x3F & (input_chr >> 18)));
 242         emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
 243         emit(emitctx, 0x80 | (0x3F & (input_chr >>  6)));
 244         emit(emitctx, 0x80 | (0x3F & (input_chr      )));
 245         return TRUE;
 246     }
 247 }
 248
 249 #ifdef TESTMODE
 250
 251 #include <stdio.h>
 252 #include <stdarg.h>
 253
 254 int total_errs = 0;
 255
 256 void utf8_emit(void *ctx, long output)
 257 {
 258     wchar_t **p = (wchar_t **)ctx;
 259     *(*p)++ = output;
 260 }
 261
 262 void utf8_read_test(int line, char *input, int inlen, ...)
 263 {
 264     va_list ap;
 265     wchar_t *p, str[512];
 266     int i;
 267     charset_state state;
 268     unsigned long l;
 269
 270     state.s0 = 0;
 271     p = str;
 272
 273     for (i = 0; i < inlen; i++)
 274         read_utf8(NULL, input[i] & 0xFF, &state, utf8_emit, &p);
 275
 276     va_start(ap, inlen);
 277     l = 0;
 278     for (i = 0; i < p - str; i++) {
 279         l = va_arg(ap, long int);
 280         if (l == -1) {
 281             printf("%d: correct string shorter than output\n", line);
 282             total_errs++;
 283             break;
 284         }
 285         if (l != str[i]) {
 286             printf("%d: char %d came out as %08x, should be %08x\n",
 287                     line, i, str[i], l);
 288             total_errs++;
 289         }
 290     }
 291     if (l != -1) {
 292         l = va_arg(ap, long int);
 293         if (l != -1) {
 294             printf("%d: correct string longer than output\n", line);
 295             total_errs++;
 296         }
 297     }
 298     va_end(ap);
 299 }
 300
 301 void utf8_write_test(int line, const long *input, int inlen, ...)
 302 {
 303     va_list ap;
 304     wchar_t *p, str[512];
 305     int i;
 306     charset_state state;
 307     unsigned long l;
 308
 309     state.s0 = 0;
 310     p = str;
 311
 312     for (i = 0; i < inlen; i++) {
 313         if (!write_utf8(NULL, input[i], &state, utf8_emit, &p))
 314             utf8_emit(&p, ERROR);
 315     }
 316
 317     va_start(ap, inlen);
 318     l = 0;
 319     for (i = 0; i < p - str; i++) {
 320         l = va_arg(ap, long int);
 321         if (l == -1) {
 322             printf("%d: correct string shorter than output\n", line);
 323             total_errs++;
 324             break;
 325         }
 326         if (l != str[i]) {
 327             printf("%d: char %d came out as %08x, should be %08x\n",
 328                     line, i, str[i], l);
 329             total_errs++;
 330         }
 331     }
 332     if (l != -1) {
 333         l = va_arg(ap, long int);
 334         if (l != -1) {
 335             printf("%d: correct string longer than output\n", line);
 336             total_errs++;
 337         }
 338     }
 339     va_end(ap);
 340 }
 341
 342 /* Macro to concoct the first three parameters of utf8_read_test. */
 343 #define TESTSTR(x) __LINE__, x, lenof(x)
 344
 345 int main(void)
 346 {
 347     printf("read tests beginning\n");
 348     utf8_read_test(TESTSTR("\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5"),
 349                    0x000003BA, /* GREEK SMALL LETTER KAPPA */
 350                    0x00001F79, /* GREEK SMALL LETTER OMICRON WITH OXIA */
 351                    0x000003C3, /* GREEK SMALL LETTER SIGMA */
 352                    0x000003BC, /* GREEK SMALL LETTER MU */
 353                    0x000003B5, /* GREEK SMALL LETTER EPSILON */
 354                    0, -1);
 355     utf8_read_test(TESTSTR("\x00"),
 356                    0x00000000, /* <control> */
 357                    0, -1);
 358     utf8_read_test(TESTSTR("\xC2\x80"),
 359                    0x00000080, /* <control> */
 360                    0, -1);
 361     utf8_read_test(TESTSTR("\xE0\xA0\x80"),
 362                    0x00000800, /* <no name available> */
 363                    0, -1);
 364     utf8_read_test(TESTSTR("\xF0\x90\x80\x80"),
 365                    0x00010000, /* <no name available> */
 366                    0, -1);
 367     utf8_read_test(TESTSTR("\xF8\x88\x80\x80\x80"),
 368                    0x00200000, /* <no name available> */
 369                    0, -1);
 370     utf8_read_test(TESTSTR("\xFC\x84\x80\x80\x80\x80"),
 371                    0x04000000, /* <no name available> */
 372                    0, -1);
 373     utf8_read_test(TESTSTR("\x7F"),
 374                    0x0000007F, /* <control> */
 375                    0, -1);
 376     utf8_read_test(TESTSTR("\xDF\xBF"),
 377                    0x000007FF, /* <no name available> */
 378                    0, -1);
 379     utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
 380                    0x0000FFFD, /* REPLACEMENT CHARACTER */
 381                    0, -1);
 382     utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
 383                    ERROR,      /* <no name available> (invalid char) */
 384                    0, -1);
 385     utf8_read_test(TESTSTR("\xF7\xBF\xBF\xBF"),
 386                    0x001FFFFF, /* <no name available> */
 387                    0, -1);
 388     utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF\xBF"),
 389                    0x03FFFFFF, /* <no name available> */
 390                    0, -1);
 391     utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF\xBF"),
 392                    0x7FFFFFFF, /* <no name available> */
 393                    0, -1);
 394     utf8_read_test(TESTSTR("\xED\x9F\xBF"),
 395                    0x0000D7FF, /* <no name available> */
 396                    0, -1);
 397     utf8_read_test(TESTSTR("\xEE\x80\x80"),
 398                    0x0000E000, /* <Private Use, First> */
 399                    0, -1);
 400     utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
 401                    0x0000FFFD, /* REPLACEMENT CHARACTER */
 402                    0, -1);
 403     utf8_read_test(TESTSTR("\xF4\x8F\xBF\xBF"),
 404                    0x0010FFFF, /* <no name available> */
 405                    0, -1);
 406     utf8_read_test(TESTSTR("\xF4\x90\x80\x80"),
 407                    0x00110000, /* <no name available> */
 408                    0, -1);
 409     utf8_read_test(TESTSTR("\x80"),
 410                    ERROR,      /* (unexpected continuation byte) */
 411                    0, -1);
 412     utf8_read_test(TESTSTR("\xBF"),
 413                    ERROR,      /* (unexpected continuation byte) */
 414                    0, -1);
 415     utf8_read_test(TESTSTR("\x80\xBF"),
 416                    ERROR,      /* (unexpected continuation byte) */
 417                    ERROR,      /* (unexpected continuation byte) */
 418                    0, -1);
 419     utf8_read_test(TESTSTR("\x80\xBF\x80"),
 420                    ERROR,      /* (unexpected continuation byte) */
 421                    ERROR,      /* (unexpected continuation byte) */
 422                    ERROR,      /* (unexpected continuation byte) */
 423                    0, -1);
 424     utf8_read_test(TESTSTR("\x80\xBF\x80\xBF"),
 425                    ERROR,      /* (unexpected continuation byte) */
 426                    ERROR,      /* (unexpected continuation byte) */
 427                    ERROR,      /* (unexpected continuation byte) */
 428                    ERROR,      /* (unexpected continuation byte) */
 429                    0, -1);
 430     utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80"),
 431                    ERROR,      /* (unexpected continuation byte) */
 432                    ERROR,      /* (unexpected continuation byte) */
 433                    ERROR,      /* (unexpected continuation byte) */
 434                    ERROR,      /* (unexpected continuation byte) */
 435                    ERROR,      /* (unexpected continuation byte) */
 436                    0, -1);
 437     utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF"),
 438                    ERROR,      /* (unexpected continuation byte) */
 439                    ERROR,      /* (unexpected continuation byte) */
 440                    ERROR,      /* (unexpected continuation byte) */
 441                    ERROR,      /* (unexpected continuation byte) */
 442                    ERROR,      /* (unexpected continuation byte) */
 443                    ERROR,      /* (unexpected continuation byte) */
 444                    0, -1);
 445     utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF\x80"),
 446                    ERROR,      /* (unexpected continuation byte) */
 447                    ERROR,      /* (unexpected continuation byte) */
 448                    ERROR,      /* (unexpected continuation byte) */
 449                    ERROR,      /* (unexpected continuation byte) */
 450                    ERROR,      /* (unexpected continuation byte) */
 451                    ERROR,      /* (unexpected continuation byte) */
 452                    ERROR,      /* (unexpected continuation byte) */
 453                    0, -1);
 454     utf8_read_test(TESTSTR("\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"),
 455                    ERROR,      /* (unexpected continuation byte) */
 456                    ERROR,      /* (unexpected continuation byte) */
 457                    ERROR,      /* (unexpected continuation byte) */
 458                    ERROR,      /* (unexpected continuation byte) */
 459                    ERROR,      /* (unexpected continuation byte) */
 460                    ERROR,      /* (unexpected continuation byte) */
 461                    ERROR,      /* (unexpected continuation byte) */
 462                    ERROR,      /* (unexpected continuation byte) */
 463                    ERROR,      /* (unexpected continuation byte) */
 464                    ERROR,      /* (unexpected continuation byte) */
 465                    ERROR,      /* (unexpected continuation byte) */
 466                    ERROR,      /* (unexpected continuation byte) */
 467                    ERROR,      /* (unexpected continuation byte) */
 468                    ERROR,      /* (unexpected continuation byte) */
 469                    ERROR,      /* (unexpected continuation byte) */
 470                    ERROR,      /* (unexpected continuation byte) */
 471                    ERROR,      /* (unexpected continuation byte) */
 472                    ERROR,      /* (unexpected continuation byte) */
 473                    ERROR,      /* (unexpected continuation byte) */
 474                    ERROR,      /* (unexpected continuation byte) */
 475                    ERROR,      /* (unexpected continuation byte) */
 476                    ERROR,      /* (unexpected continuation byte) */
 477                    ERROR,      /* (unexpected continuation byte) */
 478                    ERROR,      /* (unexpected continuation byte) */
 479                    ERROR,      /* (unexpected continuation byte) */
 480                    ERROR,      /* (unexpected continuation byte) */
 481                    ERROR,      /* (unexpected continuation byte) */
 482                    ERROR,      /* (unexpected continuation byte) */
 483                    ERROR,      /* (unexpected continuation byte) */
 484                    ERROR,      /* (unexpected continuation byte) */
 485                    ERROR,      /* (unexpected continuation byte) */
 486                    ERROR,      /* (unexpected continuation byte) */
 487                    ERROR,      /* (unexpected continuation byte) */
 488                    ERROR,      /* (unexpected continuation byte) */
 489                    ERROR,      /* (unexpected continuation byte) */
 490                    ERROR,      /* (unexpected continuation byte) */
 491                    ERROR,      /* (unexpected continuation byte) */
 492                    ERROR,      /* (unexpected continuation byte) */
 493                    ERROR,      /* (unexpected continuation byte) */
 494                    ERROR,      /* (unexpected continuation byte) */
 495                    ERROR,      /* (unexpected continuation byte) */
 496                    ERROR,      /* (unexpected continuation byte) */
 497                    ERROR,      /* (unexpected continuation byte) */
 498                    ERROR,      /* (unexpected continuation byte) */
 499                    ERROR,      /* (unexpected continuation byte) */
 500                    ERROR,      /* (unexpected continuation byte) */
 501                    ERROR,      /* (unexpected continuation byte) */
 502                    ERROR,      /* (unexpected continuation byte) */
 503                    ERROR,      /* (unexpected continuation byte) */
 504                    ERROR,      /* (unexpected continuation byte) */
 505                    ERROR,      /* (unexpected continuation byte) */
 506                    ERROR,      /* (unexpected continuation byte) */
 507                    ERROR,      /* (unexpected continuation byte) */
 508                    ERROR,      /* (unexpected continuation byte) */
 509                    ERROR,      /* (unexpected continuation byte) */
 510                    ERROR,      /* (unexpected continuation byte) */
 511                    ERROR,      /* (unexpected continuation byte) */
 512                    ERROR,      /* (unexpected continuation byte) */
 513                    ERROR,      /* (unexpected continuation byte) */
 514                    ERROR,      /* (unexpected continuation byte) */
 515                    ERROR,      /* (unexpected continuation byte) */
 516                    ERROR,      /* (unexpected continuation byte) */
 517                    ERROR,      /* (unexpected continuation byte) */
 518                    ERROR,      /* (unexpected continuation byte) */
 519                    0, -1);
 520     utf8_read_test(TESTSTR("\xC0\x20\xC1\x20\xC2\x20\xC3\x20\xC4\x20\xC5\x20\xC6\x20\xC7\x20"),
 521                    ERROR,      /* (incomplete sequence) */
 522                    0x00000020, /* SPACE */
 523                    ERROR,      /* (incomplete sequence) */
 524                    0x00000020, /* SPACE */
 525                    ERROR,      /* (incomplete sequence) */
 526                    0x00000020, /* SPACE */
 527                    ERROR,      /* (incomplete sequence) */
 528                    0x00000020, /* SPACE */
 529                    ERROR,      /* (incomplete sequence) */
 530                    0x00000020, /* SPACE */
 531                    ERROR,      /* (incomplete sequence) */
 532                    0x00000020, /* SPACE */
 533                    ERROR,      /* (incomplete sequence) */
 534                    0x00000020, /* SPACE */
 535                    ERROR,      /* (incomplete sequence) */
 536                    0x00000020, /* SPACE */
 537                    0, -1);
 538     utf8_read_test(TESTSTR("\xE0\x20\xE1\x20\xE2\x20\xE3\x20\xE4\x20\xE5\x20\xE6\x20\xE7\x20\xE8\x20\xE9\x20\xEA\x20\xEB\x20\xEC\x20\xED\x20\xEE\x20\xEF\x20"),
 539                    ERROR,      /* (incomplete sequence) */
 540                    0x00000020, /* SPACE */
 541                    ERROR,      /* (incomplete sequence) */
 542                    0x00000020, /* SPACE */
 543                    ERROR,      /* (incomplete sequence) */
 544                    0x00000020, /* SPACE */
 545                    ERROR,      /* (incomplete sequence) */
 546                    0x00000020, /* SPACE */
 547                    ERROR,      /* (incomplete sequence) */
 548                    0x00000020, /* SPACE */
 549                    ERROR,      /* (incomplete sequence) */
 550                    0x00000020, /* SPACE */
 551                    ERROR,      /* (incomplete sequence) */
 552                    0x00000020, /* SPACE */
 553                    ERROR,      /* (incomplete sequence) */
 554                    0x00000020, /* SPACE */
 555                    ERROR,      /* (incomplete sequence) */
 556                    0x00000020, /* SPACE */
 557                    ERROR,      /* (incomplete sequence) */
 558                    0x00000020, /* SPACE */
 559                    ERROR,      /* (incomplete sequence) */
 560                    0x00000020, /* SPACE */
 561                    ERROR,      /* (incomplete sequence) */
 562                    0x00000020, /* SPACE */
 563                    ERROR,      /* (incomplete sequence) */
 564                    0x00000020, /* SPACE */
 565                    ERROR,      /* (incomplete sequence) */
 566                    0x00000020, /* SPACE */
 567                    ERROR,      /* (incomplete sequence) */
 568                    0x00000020, /* SPACE */
 569                    ERROR,      /* (incomplete sequence) */
 570                    0x00000020, /* SPACE */
 571                    0, -1);
 572     utf8_read_test(TESTSTR("\xF0\x20\xF1\x20\xF2\x20\xF3\x20\xF4\x20\xF5\x20\xF6\x20\xF7\x20"),
 573                    ERROR,      /* (incomplete sequence) */
 574                    0x00000020, /* SPACE */
 575                    ERROR,      /* (incomplete sequence) */
 576                    0x00000020, /* SPACE */
 577                    ERROR,      /* (incomplete sequence) */
 578                    0x00000020, /* SPACE */
 579                    ERROR,      /* (incomplete sequence) */
 580                    0x00000020, /* SPACE */
 581                    ERROR,      /* (incomplete sequence) */
 582                    0x00000020, /* SPACE */
 583                    ERROR,      /* (incomplete sequence) */
 584                    0x00000020, /* SPACE */
 585                    ERROR,      /* (incomplete sequence) */
 586                    0x00000020, /* SPACE */
 587                    ERROR,      /* (incomplete sequence) */
 588                    0x00000020, /* SPACE */
 589                    0, -1);
 590     utf8_read_test(TESTSTR("\xF8\x20\xF9\x20\xFA\x20\xFB\x20"),
 591                    ERROR,      /* (incomplete sequence) */
 592                    0x00000020, /* SPACE */
 593                    ERROR,      /* (incomplete sequence) */
 594                    0x00000020, /* SPACE */
 595                    ERROR,      /* (incomplete sequence) */
 596                    0x00000020, /* SPACE */
 597                    ERROR,      /* (incomplete sequence) */
 598                    0x00000020, /* SPACE */
 599                    0, -1);
 600     utf8_read_test(TESTSTR("\xFC\x20\xFD\x20"),
 601                    ERROR,      /* (incomplete sequence) */
 602                    0x00000020, /* SPACE */
 603                    ERROR,      /* (incomplete sequence) */
 604                    0x00000020, /* SPACE */
 605                    0, -1);
 606     utf8_read_test(TESTSTR("\xC0"),
 607                    ERROR,      /* (incomplete sequence) */
 608                    0, -1);
 609     utf8_read_test(TESTSTR("\xE0\x80"),
 610                    ERROR,      /* (incomplete sequence) */
 611                    0, -1);
 612     utf8_read_test(TESTSTR("\xF0\x80\x80"),
 613                    ERROR,      /* (incomplete sequence) */
 614                    0, -1);
 615     utf8_read_test(TESTSTR("\xF8\x80\x80\x80"),
 616                    ERROR,      /* (incomplete sequence) */
 617                    0, -1);
 618     utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80"),
 619                    ERROR,      /* (incomplete sequence) */
 620                    0, -1);
 621     utf8_read_test(TESTSTR("\xDF"),
 622                    ERROR,      /* (incomplete sequence) */
 623                    0, -1);
 624     utf8_read_test(TESTSTR("\xEF\xBF"),
 625                    ERROR,      /* (incomplete sequence) */
 626                    0, -1);
 627     utf8_read_test(TESTSTR("\xF7\xBF\xBF"),
 628                    ERROR,      /* (incomplete sequence) */
 629                    0, -1);
 630     utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF"),
 631                    ERROR,      /* (incomplete sequence) */
 632                    0, -1);
 633     utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF"),
 634                    ERROR,      /* (incomplete sequence) */
 635                    0, -1);
 636     utf8_read_test(TESTSTR("\xC0\xE0\x80\xF0\x80\x80\xF8\x80\x80\x80\xFC\x80\x80\x80\x80\xDF\xEF\xBF\xF7\xBF\xBF\xFB\xBF\xBF\xBF\xFD\xBF\xBF\xBF\xBF"),
 637                    ERROR,      /* (incomplete sequence) */
 638                    ERROR,      /* (incomplete sequence) */
 639                    ERROR,      /* (incomplete sequence) */
 640                    ERROR,      /* (incomplete sequence) */
 641                    ERROR,      /* (incomplete sequence) */
 642                    ERROR,      /* (incomplete sequence) */
 643                    ERROR,      /* (incomplete sequence) */
 644                    ERROR,      /* (incomplete sequence) */
 645                    ERROR,      /* (incomplete sequence) */
 646                    ERROR,      /* (incomplete sequence) */
 647                    0, -1);
 648     utf8_read_test(TESTSTR("\xFE"),
 649                    ERROR,      /* (invalid UTF-8 byte) */
 650                    0, -1);
 651     utf8_read_test(TESTSTR("\xFF"),
 652                    ERROR,      /* (invalid UTF-8 byte) */
 653                    0, -1);
 654     utf8_read_test(TESTSTR("\xFE\xFE\xFF\xFF"),
 655                    ERROR,      /* (invalid UTF-8 byte) */
 656                    ERROR,      /* (invalid UTF-8 byte) */
 657                    ERROR,      /* (invalid UTF-8 byte) */
 658                    ERROR,      /* (invalid UTF-8 byte) */
 659                    0, -1);
 660     utf8_read_test(TESTSTR("\xC0\xAF"),
 661                    ERROR,      /* SOLIDUS (overlong form of 2F) */
 662                    0, -1);
 663     utf8_read_test(TESTSTR("\xE0\x80\xAF"),
 664                    ERROR,      /* SOLIDUS (overlong form of 2F) */
 665                    0, -1);
 666     utf8_read_test(TESTSTR("\xF0\x80\x80\xAF"),
 667                    ERROR,      /* SOLIDUS (overlong form of 2F) */
 668                    0, -1);
 669     utf8_read_test(TESTSTR("\xF8\x80\x80\x80\xAF"),
 670                    ERROR,      /* SOLIDUS (overlong form of 2F) */
 671                    0, -1);
 672     utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\xAF"),
 673                    ERROR,      /* SOLIDUS (overlong form of 2F) */
 674                    0, -1);
 675     utf8_read_test(TESTSTR("\xC1\xBF"),
 676                    ERROR,      /* <control> (overlong form of 7F) */
 677                    0, -1);
 678     utf8_read_test(TESTSTR("\xE0\x9F\xBF"),
 679                    ERROR,      /* <no name available> (overlong form of DF BF) */
 680                    0, -1);
 681     utf8_read_test(TESTSTR("\xF0\x8F\xBF\xBF"),
 682                    ERROR,      /* <no name available> (overlong form of EF BF BF) (invalid char) */
 683                    0, -1);
 684     utf8_read_test(TESTSTR("\xF8\x87\xBF\xBF\xBF"),
 685                    ERROR,      /* <no name available> (overlong form of F7 BF BF BF) */
 686                    0, -1);
 687     utf8_read_test(TESTSTR("\xFC\x83\xBF\xBF\xBF\xBF"),
 688                    ERROR,      /* <no name available> (overlong form of FB BF BF BF BF) */
 689                    0, -1);
 690     utf8_read_test(TESTSTR("\xC0\x80"),
 691                    ERROR,      /* <control> (overlong form of 00) */
 692                    0, -1);
 693     utf8_read_test(TESTSTR("\xE0\x80\x80"),
 694                    ERROR,      /* <control> (overlong form of 00) */
 695                    0, -1);
 696     utf8_read_test(TESTSTR("\xF0\x80\x80\x80"),
 697                    ERROR,      /* <control> (overlong form of 00) */
 698                    0, -1);
 699     utf8_read_test(TESTSTR("\xF8\x80\x80\x80\x80"),
 700                    ERROR,      /* <control> (overlong form of 00) */
 701                    0, -1);
 702     utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\x80"),
 703                    ERROR,      /* <control> (overlong form of 00) */
 704                    0, -1);
 705     utf8_read_test(TESTSTR("\xED\xA0\x80"),
 706                    ERROR,      /* <Non Private Use High Surrogate, First> (surrogate) */
 707                    0, -1);
 708     utf8_read_test(TESTSTR("\xED\xAD\xBF"),
 709                    ERROR,      /* <Non Private Use High Surrogate, Last> (surrogate) */
 710                    0, -1);
 711     utf8_read_test(TESTSTR("\xED\xAE\x80"),
 712                    ERROR,      /* <Private Use High Surrogate, First> (surrogate) */
 713                    0, -1);
 714     utf8_read_test(TESTSTR("\xED\xAF\xBF"),
 715                    ERROR,      /* <Private Use High Surrogate, Last> (surrogate) */
 716                    0, -1);
 717     utf8_read_test(TESTSTR("\xED\xB0\x80"),
 718                    ERROR,      /* <Low Surrogate, First> (surrogate) */
 719                    0, -1);
 720     utf8_read_test(TESTSTR("\xED\xBE\x80"),
 721                    ERROR,      /* <no name available> (surrogate) */
 722                    0, -1);
 723     utf8_read_test(TESTSTR("\xED\xBF\xBF"),
 724                    ERROR,      /* <Low Surrogate, Last> (surrogate) */
 725                    0, -1);
 726     utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xB0\x80"),
 727                    ERROR,      /* <Non Private Use High Surrogate, First> (surrogate) */
 728                    ERROR,      /* <Low Surrogate, First> (surrogate) */
 729                    0, -1);
 730     utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xBF\xBF"),
 731                    ERROR,      /* <Non Private Use High Surrogate, First> (surrogate) */
 732                    ERROR,      /* <Low Surrogate, Last> (surrogate) */
 733                    0, -1);
 734     utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xB0\x80"),
 735                    ERROR,      /* <Non Private Use High Surrogate, Last> (surrogate) */
 736                    ERROR,      /* <Low Surrogate, First> (surrogate) */
 737                    0, -1);
 738     utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xBF\xBF"),
 739                    ERROR,      /* <Non Private Use High Surrogate, Last> (surrogate) */
 740                    ERROR,      /* <Low Surrogate, Last> (surrogate) */
 741                    0, -1);
 742     utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xB0\x80"),
 743                    ERROR,      /* <Private Use High Surrogate, First> (surrogate) */
 744                    ERROR,      /* <Low Surrogate, First> (surrogate) */
 745                    0, -1);
 746     utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xBF\xBF"),
 747                    ERROR,      /* <Private Use High Surrogate, First> (surrogate) */
 748                    ERROR,      /* <Low Surrogate, Last> (surrogate) */
 749                    0, -1);
 750     utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xB0\x80"),
 751                    ERROR,      /* <Private Use High Surrogate, Last> (surrogate) */
 752                    ERROR,      /* <Low Surrogate, First> (surrogate) */
 753                    0, -1);
 754     utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xBF\xBF"),
 755                    ERROR,      /* <Private Use High Surrogate, Last> (surrogate) */
 756                    ERROR,      /* <Low Surrogate, Last> (surrogate) */
 757                    0, -1);
 758     utf8_read_test(TESTSTR("\xEF\xBF\xBE"),
 759                    ERROR,      /* <no name available> (invalid char) */
 760                    0, -1);
 761     utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
 762                    ERROR,      /* <no name available> (invalid char) */
 763                    0, -1);
 764     printf("read tests completed\n");
 765     printf("write tests beginning\n");
 766     {
 767         const static long str[] =
 768         {0x03BAL, 0x1F79L, 0x03C3L, 0x03BCL, 0x03B5L, 0};
 769         utf8_write_test(TESTSTR(str),
 770                         0xCE, 0xBA,
 771                         0xE1, 0xBD, 0xB9,
 772                         0xCF, 0x83,
 773                         0xCE, 0xBC,
 774                         0xCE, 0xB5,
 775                         0, -1);
 776     }
 777     {
 778         const static long str[] = {0x0000L, 0};
 779         utf8_write_test(TESTSTR(str),
 780                         0x00,
 781                         0, -1);
 782     }
 783     {
 784         const static long str[] = {0x0080L, 0};
 785         utf8_write_test(TESTSTR(str),
 786                         0xC2, 0x80,
 787                         0, -1);
 788     }
 789     {
 790         const static long str[] = {0x0800L, 0};
 791         utf8_write_test(TESTSTR(str),
 792                         0xE0, 0xA0, 0x80,
 793                         0, -1);
 794     }
 795     {
 796         const static long str[] = {0x00010000L, 0};
 797         utf8_write_test(TESTSTR(str),
 798                         0xF0, 0x90, 0x80, 0x80,
 799                         0, -1);
 800     }
 801     {
 802         const static long str[] = {0x00200000L, 0};
 803         utf8_write_test(TESTSTR(str),
 804                         0xF8, 0x88, 0x80, 0x80, 0x80,
 805                         0, -1);
 806     }
 807     {
 808         const static long str[] = {0x04000000L, 0};
 809         utf8_write_test(TESTSTR(str),
 810                         0xFC, 0x84, 0x80, 0x80, 0x80, 0x80,
 811                         0, -1);
 812     }
 813     {
 814         const static long str[] = {0x007FL, 0};
 815         utf8_write_test(TESTSTR(str),
 816                         0x7F,
 817                         0, -1);
 818     }
 819     {
 820         const static long str[] = {0x07FFL, 0};
 821         utf8_write_test(TESTSTR(str),
 822                         0xDF, 0xBF,
 823                         0, -1);
 824     }
 825     {
 826         const static long str[] = {0xFFFDL, 0};
 827         utf8_write_test(TESTSTR(str),
 828                         0xEF, 0xBF, 0xBD,
 829                         0, -1);
 830     }
 831     {
 832         const static long str[] = {0xFFFFL, 0};
 833         utf8_write_test(TESTSTR(str),
 834                         ERROR,
 835                         0, -1);
 836     }
 837     {
 838         const static long str[] = {0x001FFFFFL, 0};
 839         utf8_write_test(TESTSTR(str),
 840                         0xF7, 0xBF, 0xBF, 0xBF,
 841                         0, -1);
 842     }
 843     {
 844         const static long str[] = {0x03FFFFFFL, 0};
 845         utf8_write_test(TESTSTR(str),
 846                         0xFB, 0xBF, 0xBF, 0xBF, 0xBF,
 847                         0, -1);
 848     }
 849     {
 850         const static long str[] = {0x7FFFFFFFL, 0};
 851         utf8_write_test(TESTSTR(str),
 852                         0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,
 853                         0, -1);
 854     }
 855     {
 856         const static long str[] = {0xD7FFL, 0};
 857         utf8_write_test(TESTSTR(str),
 858                         0xED, 0x9F, 0xBF,
 859                         0, -1);
 860     }
 861     {
 862         const static long str[] = {0xD800L, 0};
 863         utf8_write_test(TESTSTR(str),
 864                         ERROR,
 865                         0, -1);
 866     }
 867     {
 868         const static long str[] = {0xD800L, 0xDC00L, 0};
 869         utf8_write_test(TESTSTR(str),
 870                         ERROR,
 871                         ERROR,
 872                         0, -1);
 873     }
 874     {
 875         const static long str[] = {0xDFFFL, 0};
 876         utf8_write_test(TESTSTR(str),
 877                         ERROR,
 878                         0, -1);
 879     }
 880     {
 881         const static long str[] = {0xE000L, 0};
 882         utf8_write_test(TESTSTR(str),
 883                         0xEE, 0x80, 0x80,
 884                         0, -1);
 885     }
 886     printf("write tests completed\n");
 887
 888     printf("total: %d errors\n", total_errs);
 889     return (total_errs != 0);
 890 }
 891 #endif /* TESTMODE */
 892
 893 const charset_spec charset_CS_UTF8 = {
     /*
      * Descriptor that publishes the UTF-8 charset.  The
      * initializers are positional: the CS_UTF8 identifier, the
      * read routine (read_utf8), the write routine (write_utf8),
      * and finally NULL.
      *
      * NOTE(review): the trailing NULL is presumably the
      * per-charset data slot, left empty because UTF-8 has no
      * associated data (read_utf8 ignores its `charset' argument).
      * Confirm against the charset_spec declaration, which is not
      * in this file.
      *
      * This definition sits after the end of the TESTMODE section,
      * so it is not part of the test-only code.
      */
 894     CS_UTF8, read_utf8, write_utf8, NULL
 895 };
 896
 897 #else /* ENUM_CHARSETS */
 898
 899 ENUM_CHARSET(CS_UTF8)
     /*
      * When ENUM_CHARSETS is defined, everything else in this file
      * is skipped by the #ifndef at the top, and the entry above
      * is all that remains.
      *
      * NOTE(review): ENUM_CHARSET is not defined in this file;
      * presumably whoever includes the file in this mode defines it
      * first, to collect one entry per charset -- confirm against
      * the including source.
      */
 900
 901 #endif /* ENUM_CHARSETS */