2 * utf8.c - routines to handle UTF-8.
11 * UTF-8 has no associated data, so `charset' may be ignored.
14 static void read_utf8(charset_spec
const *charset
, long int input_chr
,
16 void (*emit
)(void *ctx
, long int output
), void *emitctx
)
21 * For reading UTF-8, the `state' word contains the character
22 * being accumulated. This is shifted left by six bits each
23 * time a character is added, and there's a single '1' bit
24 * in what would be bit 31 of the final character, which we
25 * use to detect when it's complete.
27 * As required, the state is zero when we are not in the middle
28 * of a multibyte character at all.
30 * For example, when reading E9 8D 8B, starting at state=0:
32 * - after E9, the state is 0x00080009
33 * - after 8D, the state is 0x0200024d
34 * - after 8B, the state conceptually becomes 0x8000934b, at
35 * which point we notice we've got as many characters as we
36 * were expecting, output U+934B, and reset the state to
39 * If we detect an overlong sequence, we shift the marker bit
40 * right one bit. This is safe because an overlong sequence
41 * can't encode a top-bit-set character. Not that we worry
42 * about what overlong sequences are trying to encode, but
43 * it's nice to know that we could if we wanted to.
45 * Note that the maximum number of bits we might need to store
46 * in the character value field is 25 (U+7FFFFFFF contains 31
47 * bits, but we will never actually store its full value
48 * because when we receive the last 6 bits in the final
49 * continuation byte we will output it and revert the state to
50 * zero). Hence we need 26 bits in total.
53 if (input_chr
< 0x80) {
55 * Single-byte character. If the state is nonzero before
56 * coming here, output an error for an incomplete sequence.
57 * Then output the character.
63 emit(emitctx
, input_chr
);
64 } else if (input_chr
== 0xFE || input_chr
== 0xFF) {
66 * FE and FF bytes should _never_ occur in UTF-8. They are
67 * automatic errors; if the state was nonzero to start
68 * with, output a further error for an incomplete sequence.
75 } else if (input_chr
>= 0x80 && input_chr
< 0xC0) {
77 * Continuation byte. Output an error for an unexpected
78 * continuation byte, if the state is zero.
83 unsigned long charval
;
86 * Otherwise, accumulate more of the character value.
89 charval
= (charval
<< 6) | (input_chr
& 0x3F);
92 * Detect overlong encodings. We're looking for too many
93 * leading zeroes given our position in the character. If
94 * we find an overlong encoding, clear the current marker
95 * bit and set the bit below it. Overlong two-byte
96 * encodings are a special case, and are detected when we
97 * read their initial byte.
99 if ((charval
& 0xffffffe0L
) == 0x02000000L
)
100 charval
^= 0x03000000L
;
101 else if ((charval
& 0xfffffff0L
) == 0x00080000L
)
102 charval
^= 0x000c0000L
;
103 else if ((charval
& 0xfffffff8L
) == 0x00002000L
)
104 charval
^= 0x00003000L
;
105 else if ((charval
& 0xfffffffcL
) == 0x00000080L
)
106 charval
^= 0x000000c0L
;
109 * Check the byte counts; if we have not reached the
110 * end of the character, update the state and return.
112 if (!(charval
& 0xc0000000L
)) {
118 * Clear the marker bit, or set it if it's clear,
119 * indicating an overlong sequence.
121 charval
^= 0x80000000L
;
124 * Now we know we've reached the end of the character.
125 * `charval' is the Unicode value. We should check for
126 * various invalid things, and then either output
127 * charval or an error. In all cases we reset the state
132 if (charval
& 0x80000000L
) {
133 /* We got an overlong sequence. */
134 emit(emitctx
, ERROR
);
135 } else if (charval
>= 0xD800 && charval
< 0xE000) {
137 * Surrogates (0xD800-0xDFFF) may never be encoded
138 * in UTF-8. A surrogate pair in Unicode should
139 * have been encoded as a single UTF-8 character
140 * occupying more than three bytes.
142 emit(emitctx
, ERROR
);
143 } else if (charval
== 0xFFFE || charval
== 0xFFFF) {
145 * U+FFFE and U+FFFF are invalid Unicode characters
146 * and may never be encoded in UTF-8. (This is one
147 * reason why U+FFFF is our way of signalling an
148 * error to our `emit' function :-)
150 emit(emitctx
, ERROR
);
153 * Oh, all right. We'll let this one off.
155 emit(emitctx
, charval
);
161 * Lead byte. First output an error for an incomplete
162 * sequence, if the state is nonzero.
165 emit(emitctx
, ERROR
);
168 * Now deal with the lead byte: work out the number of
169 * bytes we expect to see in this character, and extract
170 * the initial bits of it too.
172 if (input_chr
>= 0xC0 && input_chr
< 0xC2) {
173 /* beginning of an overlong two-byte sequence */
174 state
->s0
= 0x01000000L
| (input_chr
& 0x1F);
175 } else if (input_chr
>= 0xC2 && input_chr
< 0xE0) {
176 state
->s0
= 0x02000000L
| (input_chr
& 0x1F);
177 } else if (input_chr
>= 0xE0 && input_chr
< 0xF0) {
178 state
->s0
= 0x00080000L
| (input_chr
& 0x0F);
179 } else if (input_chr
>= 0xF0 && input_chr
< 0xF8) {
180 state
->s0
= 0x00002000L
| (input_chr
& 0x07);
181 } else if (input_chr
>= 0xF8 && input_chr
< 0xFC) {
182 state
->s0
= 0x00000080L
| (input_chr
& 0x03);
183 } else if (input_chr
>= 0xFC && input_chr
< 0xFE) {
184 state
->s0
= 0x00000002L
| (input_chr
& 0x01);
190 * UTF-8 is a stateless multi-byte encoding (in the sense that just
191 * after any character has been completed, the state is always the
192 * same); hence when writing it, there is no need to use the
196 static int write_utf8(charset_spec
const *charset
, long int input_chr
,
197 charset_state
*state
,
198 void (*emit
)(void *ctx
, long int output
),
205 return TRUE
; /* stateless; no cleanup required */
208 * Refuse to output any illegal code points.
210 if (input_chr
== 0xFFFE || input_chr
== 0xFFFF ||
211 (input_chr
>= 0xD800 && input_chr
< 0xE000)) {
213 } else if (input_chr
< 0x80) { /* one-byte character */
214 emit(emitctx
, input_chr
);
216 } else if (input_chr
< 0x800) { /* two-byte character */
217 emit(emitctx
, 0xC0 | (0x1F & (input_chr
>> 6)));
218 emit(emitctx
, 0x80 | (0x3F & (input_chr
)));
220 } else if (input_chr
< 0x10000) { /* three-byte character */
221 emit(emitctx
, 0xE0 | (0x0F & (input_chr
>> 12)));
222 emit(emitctx
, 0x80 | (0x3F & (input_chr
>> 6)));
223 emit(emitctx
, 0x80 | (0x3F & (input_chr
)));
225 } else if (input_chr
< 0x200000) { /* four-byte character */
226 emit(emitctx
, 0xF0 | (0x07 & (input_chr
>> 18)));
227 emit(emitctx
, 0x80 | (0x3F & (input_chr
>> 12)));
228 emit(emitctx
, 0x80 | (0x3F & (input_chr
>> 6)));
229 emit(emitctx
, 0x80 | (0x3F & (input_chr
)));
231 } else if (input_chr
< 0x4000000) {/* five-byte character */
232 emit(emitctx
, 0xF8 | (0x03 & (input_chr
>> 24)));
233 emit(emitctx
, 0x80 | (0x3F & (input_chr
>> 18)));
234 emit(emitctx
, 0x80 | (0x3F & (input_chr
>> 12)));
235 emit(emitctx
, 0x80 | (0x3F & (input_chr
>> 6)));
236 emit(emitctx
, 0x80 | (0x3F & (input_chr
)));
238 } else { /* six-byte character */
239 emit(emitctx
, 0xFC | (0x01 & (input_chr
>> 30)));
240 emit(emitctx
, 0x80 | (0x3F & (input_chr
>> 24)));
241 emit(emitctx
, 0x80 | (0x3F & (input_chr
>> 18)));
242 emit(emitctx
, 0x80 | (0x3F & (input_chr
>> 12)));
243 emit(emitctx
, 0x80 | (0x3F & (input_chr
>> 6)));
244 emit(emitctx
, 0x80 | (0x3F & (input_chr
)));
256 void utf8_emit(void *ctx
, long output
)
258 wchar_t **p
= (wchar_t **)ctx
;
262 void utf8_read_test(int line
, char *input
, int inlen
, ...)
265 wchar_t *p
, str
[512];
273 for (i
= 0; i
< inlen
; i
++)
274 read_utf8(NULL
, input
[i
] & 0xFF, &state
, utf8_emit
, &p
);
278 for (i
= 0; i
< p
- str
; i
++) {
279 l
= va_arg(ap
, long int);
281 printf("%d: correct string shorter than output\n", line
);
286 printf("%d: char %d came out as %08x, should be %08x\n",
292 l
= va_arg(ap
, long int);
294 printf("%d: correct string longer than output\n", line
);
301 void utf8_write_test(int line
, const long *input
, int inlen
, ...)
304 wchar_t *p
, str
[512];
312 for (i
= 0; i
< inlen
; i
++) {
313 if (!write_utf8(NULL
, input
[i
], &state
, utf8_emit
, &p
))
314 utf8_emit(&p
, ERROR
);
319 for (i
= 0; i
< p
- str
; i
++) {
320 l
= va_arg(ap
, long int);
322 printf("%d: correct string shorter than output\n", line
);
327 printf("%d: char %d came out as %08x, should be %08x\n",
333 l
= va_arg(ap
, long int);
335 printf("%d: correct string longer than output\n", line
);
342 /* Macro to concoct the first three parameters of utf8_read_test and utf8_write_test. */
343 #define TESTSTR(x) __LINE__, x, lenof(x)
347 printf("read tests beginning\n");
348 utf8_read_test(TESTSTR("\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5"),
349 0x000003BA, /* GREEK SMALL LETTER KAPPA */
350 0x00001F79, /* GREEK SMALL LETTER OMICRON WITH OXIA */
351 0x000003C3, /* GREEK SMALL LETTER SIGMA */
352 0x000003BC, /* GREEK SMALL LETTER MU */
353 0x000003B5, /* GREEK SMALL LETTER EPSILON */
355 utf8_read_test(TESTSTR("\x00"),
356 0x00000000, /* <control> */
358 utf8_read_test(TESTSTR("\xC2\x80"),
359 0x00000080, /* <control> */
361 utf8_read_test(TESTSTR("\xE0\xA0\x80"),
362 0x00000800, /* <no name available> */
364 utf8_read_test(TESTSTR("\xF0\x90\x80\x80"),
365 0x00010000, /* <no name available> */
367 utf8_read_test(TESTSTR("\xF8\x88\x80\x80\x80"),
368 0x00200000, /* <no name available> */
370 utf8_read_test(TESTSTR("\xFC\x84\x80\x80\x80\x80"),
371 0x04000000, /* <no name available> */
373 utf8_read_test(TESTSTR("\x7F"),
374 0x0000007F, /* <control> */
376 utf8_read_test(TESTSTR("\xDF\xBF"),
377 0x000007FF, /* <no name available> */
379 utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
380 0x0000FFFD, /* REPLACEMENT CHARACTER */
382 utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
383 ERROR
, /* <no name available> (invalid char) */
385 utf8_read_test(TESTSTR("\xF7\xBF\xBF\xBF"),
386 0x001FFFFF, /* <no name available> */
388 utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF\xBF"),
389 0x03FFFFFF, /* <no name available> */
391 utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF\xBF"),
392 0x7FFFFFFF, /* <no name available> */
394 utf8_read_test(TESTSTR("\xED\x9F\xBF"),
395 0x0000D7FF, /* <no name available> */
397 utf8_read_test(TESTSTR("\xEE\x80\x80"),
398 0x0000E000, /* <Private Use, First> */
400 utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
401 0x0000FFFD, /* REPLACEMENT CHARACTER */
403 utf8_read_test(TESTSTR("\xF4\x8F\xBF\xBF"),
404 0x0010FFFF, /* <no name available> */
406 utf8_read_test(TESTSTR("\xF4\x90\x80\x80"),
407 0x00110000, /* <no name available> */
409 utf8_read_test(TESTSTR("\x80"),
410 ERROR
, /* (unexpected continuation byte) */
412 utf8_read_test(TESTSTR("\xBF"),
413 ERROR
, /* (unexpected continuation byte) */
415 utf8_read_test(TESTSTR("\x80\xBF"),
416 ERROR
, /* (unexpected continuation byte) */
417 ERROR
, /* (unexpected continuation byte) */
419 utf8_read_test(TESTSTR("\x80\xBF\x80"),
420 ERROR
, /* (unexpected continuation byte) */
421 ERROR
, /* (unexpected continuation byte) */
422 ERROR
, /* (unexpected continuation byte) */
424 utf8_read_test(TESTSTR("\x80\xBF\x80\xBF"),
425 ERROR
, /* (unexpected continuation byte) */
426 ERROR
, /* (unexpected continuation byte) */
427 ERROR
, /* (unexpected continuation byte) */
428 ERROR
, /* (unexpected continuation byte) */
430 utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80"),
431 ERROR
, /* (unexpected continuation byte) */
432 ERROR
, /* (unexpected continuation byte) */
433 ERROR
, /* (unexpected continuation byte) */
434 ERROR
, /* (unexpected continuation byte) */
435 ERROR
, /* (unexpected continuation byte) */
437 utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF"),
438 ERROR
, /* (unexpected continuation byte) */
439 ERROR
, /* (unexpected continuation byte) */
440 ERROR
, /* (unexpected continuation byte) */
441 ERROR
, /* (unexpected continuation byte) */
442 ERROR
, /* (unexpected continuation byte) */
443 ERROR
, /* (unexpected continuation byte) */
445 utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF\x80"),
446 ERROR
, /* (unexpected continuation byte) */
447 ERROR
, /* (unexpected continuation byte) */
448 ERROR
, /* (unexpected continuation byte) */
449 ERROR
, /* (unexpected continuation byte) */
450 ERROR
, /* (unexpected continuation byte) */
451 ERROR
, /* (unexpected continuation byte) */
452 ERROR
, /* (unexpected continuation byte) */
454 utf8_read_test(TESTSTR("\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"),
455 ERROR
, /* (unexpected continuation byte) */
456 ERROR
, /* (unexpected continuation byte) */
457 ERROR
, /* (unexpected continuation byte) */
458 ERROR
, /* (unexpected continuation byte) */
459 ERROR
, /* (unexpected continuation byte) */
460 ERROR
, /* (unexpected continuation byte) */
461 ERROR
, /* (unexpected continuation byte) */
462 ERROR
, /* (unexpected continuation byte) */
463 ERROR
, /* (unexpected continuation byte) */
464 ERROR
, /* (unexpected continuation byte) */
465 ERROR
, /* (unexpected continuation byte) */
466 ERROR
, /* (unexpected continuation byte) */
467 ERROR
, /* (unexpected continuation byte) */
468 ERROR
, /* (unexpected continuation byte) */
469 ERROR
, /* (unexpected continuation byte) */
470 ERROR
, /* (unexpected continuation byte) */
471 ERROR
, /* (unexpected continuation byte) */
472 ERROR
, /* (unexpected continuation byte) */
473 ERROR
, /* (unexpected continuation byte) */
474 ERROR
, /* (unexpected continuation byte) */
475 ERROR
, /* (unexpected continuation byte) */
476 ERROR
, /* (unexpected continuation byte) */
477 ERROR
, /* (unexpected continuation byte) */
478 ERROR
, /* (unexpected continuation byte) */
479 ERROR
, /* (unexpected continuation byte) */
480 ERROR
, /* (unexpected continuation byte) */
481 ERROR
, /* (unexpected continuation byte) */
482 ERROR
, /* (unexpected continuation byte) */
483 ERROR
, /* (unexpected continuation byte) */
484 ERROR
, /* (unexpected continuation byte) */
485 ERROR
, /* (unexpected continuation byte) */
486 ERROR
, /* (unexpected continuation byte) */
487 ERROR
, /* (unexpected continuation byte) */
488 ERROR
, /* (unexpected continuation byte) */
489 ERROR
, /* (unexpected continuation byte) */
490 ERROR
, /* (unexpected continuation byte) */
491 ERROR
, /* (unexpected continuation byte) */
492 ERROR
, /* (unexpected continuation byte) */
493 ERROR
, /* (unexpected continuation byte) */
494 ERROR
, /* (unexpected continuation byte) */
495 ERROR
, /* (unexpected continuation byte) */
496 ERROR
, /* (unexpected continuation byte) */
497 ERROR
, /* (unexpected continuation byte) */
498 ERROR
, /* (unexpected continuation byte) */
499 ERROR
, /* (unexpected continuation byte) */
500 ERROR
, /* (unexpected continuation byte) */
501 ERROR
, /* (unexpected continuation byte) */
502 ERROR
, /* (unexpected continuation byte) */
503 ERROR
, /* (unexpected continuation byte) */
504 ERROR
, /* (unexpected continuation byte) */
505 ERROR
, /* (unexpected continuation byte) */
506 ERROR
, /* (unexpected continuation byte) */
507 ERROR
, /* (unexpected continuation byte) */
508 ERROR
, /* (unexpected continuation byte) */
509 ERROR
, /* (unexpected continuation byte) */
510 ERROR
, /* (unexpected continuation byte) */
511 ERROR
, /* (unexpected continuation byte) */
512 ERROR
, /* (unexpected continuation byte) */
513 ERROR
, /* (unexpected continuation byte) */
514 ERROR
, /* (unexpected continuation byte) */
515 ERROR
, /* (unexpected continuation byte) */
516 ERROR
, /* (unexpected continuation byte) */
517 ERROR
, /* (unexpected continuation byte) */
518 ERROR
, /* (unexpected continuation byte) */
520 utf8_read_test(TESTSTR("\xC0\x20\xC1\x20\xC2\x20\xC3\x20\xC4\x20\xC5\x20\xC6\x20\xC7\x20"),
521 ERROR
, /* (incomplete sequence) */
522 0x00000020, /* SPACE */
523 ERROR
, /* (incomplete sequence) */
524 0x00000020, /* SPACE */
525 ERROR
, /* (incomplete sequence) */
526 0x00000020, /* SPACE */
527 ERROR
, /* (incomplete sequence) */
528 0x00000020, /* SPACE */
529 ERROR
, /* (incomplete sequence) */
530 0x00000020, /* SPACE */
531 ERROR
, /* (incomplete sequence) */
532 0x00000020, /* SPACE */
533 ERROR
, /* (incomplete sequence) */
534 0x00000020, /* SPACE */
535 ERROR
, /* (incomplete sequence) */
536 0x00000020, /* SPACE */
538 utf8_read_test(TESTSTR("\xE0\x20\xE1\x20\xE2\x20\xE3\x20\xE4\x20\xE5\x20\xE6\x20\xE7\x20\xE8\x20\xE9\x20\xEA\x20\xEB\x20\xEC\x20\xED\x20\xEE\x20\xEF\x20"),
539 ERROR
, /* (incomplete sequence) */
540 0x00000020, /* SPACE */
541 ERROR
, /* (incomplete sequence) */
542 0x00000020, /* SPACE */
543 ERROR
, /* (incomplete sequence) */
544 0x00000020, /* SPACE */
545 ERROR
, /* (incomplete sequence) */
546 0x00000020, /* SPACE */
547 ERROR
, /* (incomplete sequence) */
548 0x00000020, /* SPACE */
549 ERROR
, /* (incomplete sequence) */
550 0x00000020, /* SPACE */
551 ERROR
, /* (incomplete sequence) */
552 0x00000020, /* SPACE */
553 ERROR
, /* (incomplete sequence) */
554 0x00000020, /* SPACE */
555 ERROR
, /* (incomplete sequence) */
556 0x00000020, /* SPACE */
557 ERROR
, /* (incomplete sequence) */
558 0x00000020, /* SPACE */
559 ERROR
, /* (incomplete sequence) */
560 0x00000020, /* SPACE */
561 ERROR
, /* (incomplete sequence) */
562 0x00000020, /* SPACE */
563 ERROR
, /* (incomplete sequence) */
564 0x00000020, /* SPACE */
565 ERROR
, /* (incomplete sequence) */
566 0x00000020, /* SPACE */
567 ERROR
, /* (incomplete sequence) */
568 0x00000020, /* SPACE */
569 ERROR
, /* (incomplete sequence) */
570 0x00000020, /* SPACE */
572 utf8_read_test(TESTSTR("\xF0\x20\xF1\x20\xF2\x20\xF3\x20\xF4\x20\xF5\x20\xF6\x20\xF7\x20"),
573 ERROR
, /* (incomplete sequence) */
574 0x00000020, /* SPACE */
575 ERROR
, /* (incomplete sequence) */
576 0x00000020, /* SPACE */
577 ERROR
, /* (incomplete sequence) */
578 0x00000020, /* SPACE */
579 ERROR
, /* (incomplete sequence) */
580 0x00000020, /* SPACE */
581 ERROR
, /* (incomplete sequence) */
582 0x00000020, /* SPACE */
583 ERROR
, /* (incomplete sequence) */
584 0x00000020, /* SPACE */
585 ERROR
, /* (incomplete sequence) */
586 0x00000020, /* SPACE */
587 ERROR
, /* (incomplete sequence) */
588 0x00000020, /* SPACE */
590 utf8_read_test(TESTSTR("\xF8\x20\xF9\x20\xFA\x20\xFB\x20"),
591 ERROR
, /* (incomplete sequence) */
592 0x00000020, /* SPACE */
593 ERROR
, /* (incomplete sequence) */
594 0x00000020, /* SPACE */
595 ERROR
, /* (incomplete sequence) */
596 0x00000020, /* SPACE */
597 ERROR
, /* (incomplete sequence) */
598 0x00000020, /* SPACE */
600 utf8_read_test(TESTSTR("\xFC\x20\xFD\x20"),
601 ERROR
, /* (incomplete sequence) */
602 0x00000020, /* SPACE */
603 ERROR
, /* (incomplete sequence) */
604 0x00000020, /* SPACE */
606 utf8_read_test(TESTSTR("\xC0"),
607 ERROR
, /* (incomplete sequence) */
609 utf8_read_test(TESTSTR("\xE0\x80"),
610 ERROR
, /* (incomplete sequence) */
612 utf8_read_test(TESTSTR("\xF0\x80\x80"),
613 ERROR
, /* (incomplete sequence) */
615 utf8_read_test(TESTSTR("\xF8\x80\x80\x80"),
616 ERROR
, /* (incomplete sequence) */
618 utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80"),
619 ERROR
, /* (incomplete sequence) */
621 utf8_read_test(TESTSTR("\xDF"),
622 ERROR
, /* (incomplete sequence) */
624 utf8_read_test(TESTSTR("\xEF\xBF"),
625 ERROR
, /* (incomplete sequence) */
627 utf8_read_test(TESTSTR("\xF7\xBF\xBF"),
628 ERROR
, /* (incomplete sequence) */
630 utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF"),
631 ERROR
, /* (incomplete sequence) */
633 utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF"),
634 ERROR
, /* (incomplete sequence) */
636 utf8_read_test(TESTSTR("\xC0\xE0\x80\xF0\x80\x80\xF8\x80\x80\x80\xFC\x80\x80\x80\x80\xDF\xEF\xBF\xF7\xBF\xBF\xFB\xBF\xBF\xBF\xFD\xBF\xBF\xBF\xBF"),
637 ERROR
, /* (incomplete sequence) */
638 ERROR
, /* (incomplete sequence) */
639 ERROR
, /* (incomplete sequence) */
640 ERROR
, /* (incomplete sequence) */
641 ERROR
, /* (incomplete sequence) */
642 ERROR
, /* (incomplete sequence) */
643 ERROR
, /* (incomplete sequence) */
644 ERROR
, /* (incomplete sequence) */
645 ERROR
, /* (incomplete sequence) */
646 ERROR
, /* (incomplete sequence) */
648 utf8_read_test(TESTSTR("\xFE"),
649 ERROR
, /* (invalid UTF-8 byte) */
651 utf8_read_test(TESTSTR("\xFF"),
652 ERROR
, /* (invalid UTF-8 byte) */
654 utf8_read_test(TESTSTR("\xFE\xFE\xFF\xFF"),
655 ERROR
, /* (invalid UTF-8 byte) */
656 ERROR
, /* (invalid UTF-8 byte) */
657 ERROR
, /* (invalid UTF-8 byte) */
658 ERROR
, /* (invalid UTF-8 byte) */
660 utf8_read_test(TESTSTR("\xC0\xAF"),
661 ERROR
, /* SOLIDUS (overlong form of 2F) */
663 utf8_read_test(TESTSTR("\xE0\x80\xAF"),
664 ERROR
, /* SOLIDUS (overlong form of 2F) */
666 utf8_read_test(TESTSTR("\xF0\x80\x80\xAF"),
667 ERROR
, /* SOLIDUS (overlong form of 2F) */
669 utf8_read_test(TESTSTR("\xF8\x80\x80\x80\xAF"),
670 ERROR
, /* SOLIDUS (overlong form of 2F) */
672 utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\xAF"),
673 ERROR
, /* SOLIDUS (overlong form of 2F) */
675 utf8_read_test(TESTSTR("\xC1\xBF"),
676 ERROR
, /* <control> (overlong form of 7F) */
678 utf8_read_test(TESTSTR("\xE0\x9F\xBF"),
679 ERROR
, /* <no name available> (overlong form of DF BF) */
681 utf8_read_test(TESTSTR("\xF0\x8F\xBF\xBF"),
682 ERROR
, /* <no name available> (overlong form of EF BF BF) (invalid char) */
684 utf8_read_test(TESTSTR("\xF8\x87\xBF\xBF\xBF"),
685 ERROR
, /* <no name available> (overlong form of F7 BF BF BF) */
687 utf8_read_test(TESTSTR("\xFC\x83\xBF\xBF\xBF\xBF"),
688 ERROR
, /* <no name available> (overlong form of FB BF BF BF BF) */
690 utf8_read_test(TESTSTR("\xC0\x80"),
691 ERROR
, /* <control> (overlong form of 00) */
693 utf8_read_test(TESTSTR("\xE0\x80\x80"),
694 ERROR
, /* <control> (overlong form of 00) */
696 utf8_read_test(TESTSTR("\xF0\x80\x80\x80"),
697 ERROR
, /* <control> (overlong form of 00) */
699 utf8_read_test(TESTSTR("\xF8\x80\x80\x80\x80"),
700 ERROR
, /* <control> (overlong form of 00) */
702 utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\x80"),
703 ERROR
, /* <control> (overlong form of 00) */
705 utf8_read_test(TESTSTR("\xED\xA0\x80"),
706 ERROR
, /* <Non Private Use High Surrogate, First> (surrogate) */
708 utf8_read_test(TESTSTR("\xED\xAD\xBF"),
709 ERROR
, /* <Non Private Use High Surrogate, Last> (surrogate) */
711 utf8_read_test(TESTSTR("\xED\xAE\x80"),
712 ERROR
, /* <Private Use High Surrogate, First> (surrogate) */
714 utf8_read_test(TESTSTR("\xED\xAF\xBF"),
715 ERROR
, /* <Private Use High Surrogate, Last> (surrogate) */
717 utf8_read_test(TESTSTR("\xED\xB0\x80"),
718 ERROR
, /* <Low Surrogate, First> (surrogate) */
720 utf8_read_test(TESTSTR("\xED\xBE\x80"),
721 ERROR
, /* <no name available> (surrogate) */
723 utf8_read_test(TESTSTR("\xED\xBF\xBF"),
724 ERROR
, /* <Low Surrogate, Last> (surrogate) */
726 utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xB0\x80"),
727 ERROR
, /* <Non Private Use High Surrogate, First> (surrogate) */
728 ERROR
, /* <Low Surrogate, First> (surrogate) */
730 utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xBF\xBF"),
731 ERROR
, /* <Non Private Use High Surrogate, First> (surrogate) */
732 ERROR
, /* <Low Surrogate, Last> (surrogate) */
734 utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xB0\x80"),
735 ERROR
, /* <Non Private Use High Surrogate, Last> (surrogate) */
736 ERROR
, /* <Low Surrogate, First> (surrogate) */
738 utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xBF\xBF"),
739 ERROR
, /* <Non Private Use High Surrogate, Last> (surrogate) */
740 ERROR
, /* <Low Surrogate, Last> (surrogate) */
742 utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xB0\x80"),
743 ERROR
, /* <Private Use High Surrogate, First> (surrogate) */
744 ERROR
, /* <Low Surrogate, First> (surrogate) */
746 utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xBF\xBF"),
747 ERROR
, /* <Private Use High Surrogate, First> (surrogate) */
748 ERROR
, /* <Low Surrogate, Last> (surrogate) */
750 utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xB0\x80"),
751 ERROR
, /* <Private Use High Surrogate, Last> (surrogate) */
752 ERROR
, /* <Low Surrogate, First> (surrogate) */
754 utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xBF\xBF"),
755 ERROR
, /* <Private Use High Surrogate, Last> (surrogate) */
756 ERROR
, /* <Low Surrogate, Last> (surrogate) */
758 utf8_read_test(TESTSTR("\xEF\xBF\xBE"),
759 ERROR
, /* <no name available> (invalid char) */
761 utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
762 ERROR
, /* <no name available> (invalid char) */
764 printf("read tests completed\n");
765 printf("write tests beginning\n");
767 const static long str
[] =
768 {0x03BAL
, 0x1F79L
, 0x03C3L
, 0x03BCL
, 0x03B5L
, 0};
769 utf8_write_test(TESTSTR(str
),
778 const static long str
[] = {0x0000L
, 0};
779 utf8_write_test(TESTSTR(str
),
784 const static long str
[] = {0x0080L
, 0};
785 utf8_write_test(TESTSTR(str
),
790 const static long str
[] = {0x0800L
, 0};
791 utf8_write_test(TESTSTR(str
),
796 const static long str
[] = {0x00010000L
, 0};
797 utf8_write_test(TESTSTR(str
),
798 0xF0, 0x90, 0x80, 0x80,
802 const static long str
[] = {0x00200000L
, 0};
803 utf8_write_test(TESTSTR(str
),
804 0xF8, 0x88, 0x80, 0x80, 0x80,
808 const static long str
[] = {0x04000000L
, 0};
809 utf8_write_test(TESTSTR(str
),
810 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80,
814 const static long str
[] = {0x007FL
, 0};
815 utf8_write_test(TESTSTR(str
),
820 const static long str
[] = {0x07FFL
, 0};
821 utf8_write_test(TESTSTR(str
),
826 const static long str
[] = {0xFFFDL
, 0};
827 utf8_write_test(TESTSTR(str
),
832 const static long str
[] = {0xFFFFL
, 0};
833 utf8_write_test(TESTSTR(str
),
838 const static long str
[] = {0x001FFFFFL
, 0};
839 utf8_write_test(TESTSTR(str
),
840 0xF7, 0xBF, 0xBF, 0xBF,
844 const static long str
[] = {0x03FFFFFFL
, 0};
845 utf8_write_test(TESTSTR(str
),
846 0xFB, 0xBF, 0xBF, 0xBF, 0xBF,
850 const static long str
[] = {0x7FFFFFFFL
, 0};
851 utf8_write_test(TESTSTR(str
),
852 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,
856 const static long str
[] = {0xD7FFL
, 0};
857 utf8_write_test(TESTSTR(str
),
862 const static long str
[] = {0xD800L
, 0};
863 utf8_write_test(TESTSTR(str
),
868 const static long str
[] = {0xD800L
, 0xDC00L
, 0};
869 utf8_write_test(TESTSTR(str
),
875 const static long str
[] = {0xDFFFL
, 0};
876 utf8_write_test(TESTSTR(str
),
881 const static long str
[] = {0xE000L
, 0};
882 utf8_write_test(TESTSTR(str
),
886 printf("write tests completed\n");
888 printf("total: %d errors\n", total_errs
);
889 return (total_errs
!= 0);
891 #endif /* TESTMODE */
893 const charset_spec charset_CS_UTF8
= {
894 CS_UTF8
, read_utf8
, write_utf8
, NULL
897 #else /* ENUM_CHARSETS */
899 ENUM_CHARSET(CS_UTF8
)
901 #endif /* ENUM_CHARSETS */