I've apparently had this todo-list comment sitting on stormhawk for
[sgt/charset] / utf8.c
CommitLineData
/*
 * utf8.c - routines to handle UTF-8.
 */
4
5#ifndef ENUM_CHARSETS
6
7#include "charset.h"
8#include "internal.h"
9
/*
 * UTF-8 has no associated data, so `charset' may be ignored.
 */
13
14static void read_utf8(charset_spec const *charset, long int input_chr,
15 charset_state *state,
16 void (*emit)(void *ctx, long int output), void *emitctx)
17{
18 UNUSEDARG(charset);
19
20 /*
21 * For reading UTF-8, the `state' word contains:
22 *
23 * - in bits 29-31, the number of bytes expected to be in the
24 * current multibyte character (which we can tell instantly
25 * from the first byte, of course).
26 *
27 * - in bits 26-28, the number of bytes _seen so far_ in the
28 * current multibyte character.
29 *
30 * - in the remainder of the word, the current value of the
31 * character, which is shifted upwards by 6 bits to
32 * accommodate each new byte.
33 *
34 * As required, the state is zero when we are not in the middle
35 * of a multibyte character at all.
36 *
37 * For example, when reading E9 8D 8B, starting at state=0:
38 *
39 * - after E9, the state is 0x64000009
40 * - after 8D, the state is 0x6800024d
41 * - after 8B, the state conceptually becomes 0x6c00934b, at
42 * which point we notice we've got as many characters as we
43 * were expecting, output U+934B, and reset the state to
44 * zero.
45 *
46 * Note that the maximum number of bits we might need to store
47 * in the character value field is 25 (U+7FFFFFFF contains 31
48 * bits, but we will never actually store its full value
49 * because when we receive the last 6 bits in the final
50 * continuation byte we will output it and revert the state to
51 * zero). Hence the character value field never collides with
52 * the byte counts.
53 */
54
55 if (input_chr < 0x80) {
56 /*
57 * Single-byte character. If the state is nonzero before
58 * coming here, output an error for an incomplete sequence.
59 * Then output the character.
60 */
61 if (state->s0 != 0) {
62 emit(emitctx, ERROR);
63 state->s0 = 0;
64 }
65 emit(emitctx, input_chr);
66 } else if (input_chr == 0xFE || input_chr == 0xFF) {
67 /*
68 * FE and FF bytes should _never_ occur in UTF-8. They are
69 * automatic errors; if the state was nonzero to start
70 * with, output a further error for an incomplete sequence.
71 */
72 if (state->s0 != 0) {
73 emit(emitctx, ERROR);
74 state->s0 = 0;
75 }
76 emit(emitctx, ERROR);
77 } else if (input_chr >= 0x80 && input_chr < 0xC0) {
78 /*
79 * Continuation byte. Output an error for an unexpected
80 * continuation byte, if the state is zero.
81 */
82 if (state->s0 == 0) {
83 emit(emitctx, ERROR);
84 } else {
85 unsigned long charval;
86 unsigned long topstuff;
87 int bytes;
88
89 /*
90 * Otherwise, accumulate more of the character value.
91 */
92 charval = state->s0 & 0x03ffffffL;
93 charval = (charval << 6) | (input_chr & 0x3F);
94
95 /*
96 * Check the byte counts; if we have not reached the
97 * end of the character, update the state and return.
98 */
99 topstuff = state->s0 & 0xfc000000L;
100 topstuff += 0x04000000L; /* add one to the byte count */
101 if (((topstuff << 3) ^ topstuff) & 0xe0000000L) {
102 state->s0 = topstuff | charval;
103 return;
104 }
105
106 /*
107 * Now we know we've reached the end of the character.
108 * `charval' is the Unicode value. We should check for
109 * various invalid things, and then either output
110 * charval or an error. In all cases we reset the state
111 * to zero.
112 */
113 bytes = topstuff >> 29;
114 state->s0 = 0;
115
116 if (charval >= 0xD800 && charval < 0xE000) {
117 /*
118 * Surrogates (0xD800-0xDFFF) may never be encoded
119 * in UTF-8. A surrogate pair in Unicode should
120 * have been encoded as a single UTF-8 character
121 * occupying more than three bytes.
122 */
123 emit(emitctx, ERROR);
124 } else if (charval == 0xFFFE || charval == 0xFFFF) {
125 /*
126 * U+FFFE and U+FFFF are invalid Unicode characters
127 * and may never be encoded in UTF-8. (This is one
128 * reason why U+FFFF is our way of signalling an
129 * error to our `emit' function :-)
130 */
131 emit(emitctx, ERROR);
132 } else if ((charval <= 0x7FL /* && bytes > 1 */) ||
133 (charval <= 0x7FFL && bytes > 2) ||
134 (charval <= 0xFFFFL && bytes > 3) ||
135 (charval <= 0x1FFFFFL && bytes > 4) ||
136 (charval <= 0x3FFFFFFL && bytes > 5)) {
137 /*
138 * Overlong sequences are not to be tolerated,
139 * under any circumstances.
140 */
141 emit(emitctx, ERROR);
142 } else {
143 /*
144 * Oh, all right. We'll let this one off.
145 */
146 emit(emitctx, charval);
147 }
148 }
149
150 } else {
151 /*
152 * Lead byte. First output an error for an incomplete
153 * sequence, if the state is nonzero.
154 */
155 if (state->s0 != 0)
156 emit(emitctx, ERROR);
157
158 /*
159 * Now deal with the lead byte: work out the number of
160 * bytes we expect to see in this character, and extract
161 * the initial bits of it too.
162 */
163 if (input_chr >= 0xC0 && input_chr < 0xE0) {
164 state->s0 = 0x44000000L | (input_chr & 0x1F);
165 } else if (input_chr >= 0xE0 && input_chr < 0xF0) {
166 state->s0 = 0x64000000L | (input_chr & 0x0F);
167 } else if (input_chr >= 0xF0 && input_chr < 0xF8) {
168 state->s0 = 0x84000000L | (input_chr & 0x07);
169 } else if (input_chr >= 0xF8 && input_chr < 0xFC) {
170 state->s0 = 0xa4000000L | (input_chr & 0x03);
171 } else if (input_chr >= 0xFC && input_chr < 0xFE) {
172 state->s0 = 0xc4000000L | (input_chr & 0x01);
173 }
174 }
175}
176
/*
 * UTF-8 is a stateless multi-byte encoding (in the sense that just
 * after any character has been completed, the state is always the
 * same); hence when writing it, there is no need to use the
 * charset_state.
 */
183
184static int write_utf8(charset_spec const *charset, long int input_chr,
185 charset_state *state,
186 void (*emit)(void *ctx, long int output),
187 void *emitctx)
188{
189 UNUSEDARG(charset);
190 UNUSEDARG(state);
191
192 if (input_chr == -1)
193 return TRUE; /* stateless; no cleanup required */
194
195 /*
196 * Refuse to output any illegal code points.
197 */
198 if (input_chr == 0xFFFE || input_chr == 0xFFFF ||
199 (input_chr >= 0xD800 && input_chr < 0xE000)) {
200 return FALSE;
201 } else if (input_chr < 0x80) { /* one-byte character */
202 emit(emitctx, input_chr);
203 return TRUE;
204 } else if (input_chr < 0x800) { /* two-byte character */
205 emit(emitctx, 0xC0 | (0x1F & (input_chr >> 6)));
206 emit(emitctx, 0x80 | (0x3F & (input_chr )));
207 return TRUE;
208 } else if (input_chr < 0x10000) { /* three-byte character */
209 emit(emitctx, 0xE0 | (0x0F & (input_chr >> 12)));
210 emit(emitctx, 0x80 | (0x3F & (input_chr >> 6)));
211 emit(emitctx, 0x80 | (0x3F & (input_chr )));
212 return TRUE;
213 } else if (input_chr < 0x200000) { /* four-byte character */
214 emit(emitctx, 0xF0 | (0x07 & (input_chr >> 18)));
215 emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
216 emit(emitctx, 0x80 | (0x3F & (input_chr >> 6)));
217 emit(emitctx, 0x80 | (0x3F & (input_chr )));
218 return TRUE;
219 } else if (input_chr < 0x4000000) {/* five-byte character */
220 emit(emitctx, 0xF8 | (0x03 & (input_chr >> 24)));
221 emit(emitctx, 0x80 | (0x3F & (input_chr >> 18)));
222 emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
223 emit(emitctx, 0x80 | (0x3F & (input_chr >> 6)));
224 emit(emitctx, 0x80 | (0x3F & (input_chr )));
225 return TRUE;
226 } else { /* six-byte character */
227 emit(emitctx, 0xFC | (0x01 & (input_chr >> 30)));
228 emit(emitctx, 0x80 | (0x3F & (input_chr >> 24)));
229 emit(emitctx, 0x80 | (0x3F & (input_chr >> 18)));
230 emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
231 emit(emitctx, 0x80 | (0x3F & (input_chr >> 6)));
232 emit(emitctx, 0x80 | (0x3F & (input_chr )));
233 return TRUE;
234 }
235}
236
237#ifdef TESTMODE
238
239#include <stdio.h>
240#include <stdarg.h>
241
/* Number of mismatches reported so far by the self-test routines. */
int total_errs = 0;
243
/*
 * Emit callback used by the self-tests. `ctx' points at a wchar_t
 * write cursor; `output' is stored where the cursor points and the
 * cursor is then advanced by one element. No bounds checking is done,
 * so the caller must supply a large enough buffer.
 */
void utf8_emit(void *ctx, long output)
{
    wchar_t **cursor = (wchar_t **)ctx;
    wchar_t *slot = *cursor;

    *slot = output;
    *cursor = slot + 1;
}
249
250void utf8_read_test(int line, char *input, int inlen, ...)
251{
252 va_list ap;
253 wchar_t *p, str[512];
254 int i;
255 charset_state state;
256 unsigned long l;
257
258 state.s0 = 0;
259 p = str;
260
261 for (i = 0; i < inlen; i++)
262 read_utf8(NULL, input[i] & 0xFF, &state, utf8_emit, &p);
263
264 va_start(ap, inlen);
265 l = 0;
266 for (i = 0; i < p - str; i++) {
267 l = va_arg(ap, long int);
268 if (l == -1) {
269 printf("%d: correct string shorter than output\n", line);
270 total_errs++;
271 break;
272 }
273 if (l != str[i]) {
274 printf("%d: char %d came out as %08x, should be %08x\n",
275 line, i, str[i], l);
276 total_errs++;
277 }
278 }
279 if (l != -1) {
280 l = va_arg(ap, long int);
281 if (l != -1) {
282 printf("%d: correct string longer than output\n", line);
283 total_errs++;
284 }
285 }
286 va_end(ap);
287}
288
289void utf8_write_test(int line, const long *input, int inlen, ...)
290{
291 va_list ap;
292 wchar_t *p, str[512];
293 int i;
294 charset_state state;
295 unsigned long l;
296
297 state.s0 = 0;
298 p = str;
299
300 for (i = 0; i < inlen; i++)
301 write_utf8(NULL, input[i], &state, utf8_emit, &p);
302
303 va_start(ap, inlen);
304 l = 0;
305 for (i = 0; i < p - str; i++) {
306 l = va_arg(ap, long int);
307 if (l == -1) {
308 printf("%d: correct string shorter than output\n", line);
309 total_errs++;
310 break;
311 }
312 if (l != str[i]) {
313 printf("%d: char %d came out as %08x, should be %08x\n",
314 line, i, str[i], l);
315 total_errs++;
316 }
317 }
318 if (l != -1) {
319 l = va_arg(ap, long int);
320 if (l != -1) {
321 printf("%d: correct string longer than output\n", line);
322 total_errs++;
323 }
324 }
325 va_end(ap);
326}
327
/*
 * Expands to the three leading arguments shared by utf8_read_test and
 * utf8_write_test: the current source line, the array itself, and its
 * element count as reported by lenof.
 */
#define TESTSTR(x) __LINE__, (x), lenof(x)
330
331int main(void)
332{
333 printf("read tests beginning\n");
334 utf8_read_test(TESTSTR("\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5"),
335 0x000003BA, /* GREEK SMALL LETTER KAPPA */
336 0x00001F79, /* GREEK SMALL LETTER OMICRON WITH OXIA */
337 0x000003C3, /* GREEK SMALL LETTER SIGMA */
338 0x000003BC, /* GREEK SMALL LETTER MU */
339 0x000003B5, /* GREEK SMALL LETTER EPSILON */
340 0, -1);
341 utf8_read_test(TESTSTR("\x00"),
342 0x00000000, /* <control> */
343 0, -1);
344 utf8_read_test(TESTSTR("\xC2\x80"),
345 0x00000080, /* <control> */
346 0, -1);
347 utf8_read_test(TESTSTR("\xE0\xA0\x80"),
348 0x00000800, /* <no name available> */
349 0, -1);
350 utf8_read_test(TESTSTR("\xF0\x90\x80\x80"),
351 0x00010000, /* <no name available> */
352 0, -1);
353 utf8_read_test(TESTSTR("\xF8\x88\x80\x80\x80"),
354 0x00200000, /* <no name available> */
355 0, -1);
356 utf8_read_test(TESTSTR("\xFC\x84\x80\x80\x80\x80"),
357 0x04000000, /* <no name available> */
358 0, -1);
359 utf8_read_test(TESTSTR("\x7F"),
360 0x0000007F, /* <control> */
361 0, -1);
362 utf8_read_test(TESTSTR("\xDF\xBF"),
363 0x000007FF, /* <no name available> */
364 0, -1);
365 utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
366 0x0000FFFD, /* REPLACEMENT CHARACTER */
367 0, -1);
368 utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
369 ERROR, /* <no name available> (invalid char) */
370 0, -1);
371 utf8_read_test(TESTSTR("\xF7\xBF\xBF\xBF"),
372 0x001FFFFF, /* <no name available> */
373 0, -1);
374 utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF\xBF"),
375 0x03FFFFFF, /* <no name available> */
376 0, -1);
377 utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF\xBF"),
378 0x7FFFFFFF, /* <no name available> */
379 0, -1);
380 utf8_read_test(TESTSTR("\xED\x9F\xBF"),
381 0x0000D7FF, /* <no name available> */
382 0, -1);
383 utf8_read_test(TESTSTR("\xEE\x80\x80"),
384 0x0000E000, /* <Private Use, First> */
385 0, -1);
386 utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
387 0x0000FFFD, /* REPLACEMENT CHARACTER */
388 0, -1);
389 utf8_read_test(TESTSTR("\xF4\x8F\xBF\xBF"),
390 0x0010FFFF, /* <no name available> */
391 0, -1);
392 utf8_read_test(TESTSTR("\xF4\x90\x80\x80"),
393 0x00110000, /* <no name available> */
394 0, -1);
395 utf8_read_test(TESTSTR("\x80"),
396 ERROR, /* (unexpected continuation byte) */
397 0, -1);
398 utf8_read_test(TESTSTR("\xBF"),
399 ERROR, /* (unexpected continuation byte) */
400 0, -1);
401 utf8_read_test(TESTSTR("\x80\xBF"),
402 ERROR, /* (unexpected continuation byte) */
403 ERROR, /* (unexpected continuation byte) */
404 0, -1);
405 utf8_read_test(TESTSTR("\x80\xBF\x80"),
406 ERROR, /* (unexpected continuation byte) */
407 ERROR, /* (unexpected continuation byte) */
408 ERROR, /* (unexpected continuation byte) */
409 0, -1);
410 utf8_read_test(TESTSTR("\x80\xBF\x80\xBF"),
411 ERROR, /* (unexpected continuation byte) */
412 ERROR, /* (unexpected continuation byte) */
413 ERROR, /* (unexpected continuation byte) */
414 ERROR, /* (unexpected continuation byte) */
415 0, -1);
416 utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80"),
417 ERROR, /* (unexpected continuation byte) */
418 ERROR, /* (unexpected continuation byte) */
419 ERROR, /* (unexpected continuation byte) */
420 ERROR, /* (unexpected continuation byte) */
421 ERROR, /* (unexpected continuation byte) */
422 0, -1);
423 utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF"),
424 ERROR, /* (unexpected continuation byte) */
425 ERROR, /* (unexpected continuation byte) */
426 ERROR, /* (unexpected continuation byte) */
427 ERROR, /* (unexpected continuation byte) */
428 ERROR, /* (unexpected continuation byte) */
429 ERROR, /* (unexpected continuation byte) */
430 0, -1);
431 utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF\x80"),
432 ERROR, /* (unexpected continuation byte) */
433 ERROR, /* (unexpected continuation byte) */
434 ERROR, /* (unexpected continuation byte) */
435 ERROR, /* (unexpected continuation byte) */
436 ERROR, /* (unexpected continuation byte) */
437 ERROR, /* (unexpected continuation byte) */
438 ERROR, /* (unexpected continuation byte) */
439 0, -1);
440 utf8_read_test(TESTSTR("\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"),
441 ERROR, /* (unexpected continuation byte) */
442 ERROR, /* (unexpected continuation byte) */
443 ERROR, /* (unexpected continuation byte) */
444 ERROR, /* (unexpected continuation byte) */
445 ERROR, /* (unexpected continuation byte) */
446 ERROR, /* (unexpected continuation byte) */
447 ERROR, /* (unexpected continuation byte) */
448 ERROR, /* (unexpected continuation byte) */
449 ERROR, /* (unexpected continuation byte) */
450 ERROR, /* (unexpected continuation byte) */
451 ERROR, /* (unexpected continuation byte) */
452 ERROR, /* (unexpected continuation byte) */
453 ERROR, /* (unexpected continuation byte) */
454 ERROR, /* (unexpected continuation byte) */
455 ERROR, /* (unexpected continuation byte) */
456 ERROR, /* (unexpected continuation byte) */
457 ERROR, /* (unexpected continuation byte) */
458 ERROR, /* (unexpected continuation byte) */
459 ERROR, /* (unexpected continuation byte) */
460 ERROR, /* (unexpected continuation byte) */
461 ERROR, /* (unexpected continuation byte) */
462 ERROR, /* (unexpected continuation byte) */
463 ERROR, /* (unexpected continuation byte) */
464 ERROR, /* (unexpected continuation byte) */
465 ERROR, /* (unexpected continuation byte) */
466 ERROR, /* (unexpected continuation byte) */
467 ERROR, /* (unexpected continuation byte) */
468 ERROR, /* (unexpected continuation byte) */
469 ERROR, /* (unexpected continuation byte) */
470 ERROR, /* (unexpected continuation byte) */
471 ERROR, /* (unexpected continuation byte) */
472 ERROR, /* (unexpected continuation byte) */
473 ERROR, /* (unexpected continuation byte) */
474 ERROR, /* (unexpected continuation byte) */
475 ERROR, /* (unexpected continuation byte) */
476 ERROR, /* (unexpected continuation byte) */
477 ERROR, /* (unexpected continuation byte) */
478 ERROR, /* (unexpected continuation byte) */
479 ERROR, /* (unexpected continuation byte) */
480 ERROR, /* (unexpected continuation byte) */
481 ERROR, /* (unexpected continuation byte) */
482 ERROR, /* (unexpected continuation byte) */
483 ERROR, /* (unexpected continuation byte) */
484 ERROR, /* (unexpected continuation byte) */
485 ERROR, /* (unexpected continuation byte) */
486 ERROR, /* (unexpected continuation byte) */
487 ERROR, /* (unexpected continuation byte) */
488 ERROR, /* (unexpected continuation byte) */
489 ERROR, /* (unexpected continuation byte) */
490 ERROR, /* (unexpected continuation byte) */
491 ERROR, /* (unexpected continuation byte) */
492 ERROR, /* (unexpected continuation byte) */
493 ERROR, /* (unexpected continuation byte) */
494 ERROR, /* (unexpected continuation byte) */
495 ERROR, /* (unexpected continuation byte) */
496 ERROR, /* (unexpected continuation byte) */
497 ERROR, /* (unexpected continuation byte) */
498 ERROR, /* (unexpected continuation byte) */
499 ERROR, /* (unexpected continuation byte) */
500 ERROR, /* (unexpected continuation byte) */
501 ERROR, /* (unexpected continuation byte) */
502 ERROR, /* (unexpected continuation byte) */
503 ERROR, /* (unexpected continuation byte) */
504 ERROR, /* (unexpected continuation byte) */
505 0, -1);
506 utf8_read_test(TESTSTR("\xC0\x20\xC1\x20\xC2\x20\xC3\x20\xC4\x20\xC5\x20\xC6\x20\xC7\x20"),
507 ERROR, /* (incomplete sequence) */
508 0x00000020, /* SPACE */
509 ERROR, /* (incomplete sequence) */
510 0x00000020, /* SPACE */
511 ERROR, /* (incomplete sequence) */
512 0x00000020, /* SPACE */
513 ERROR, /* (incomplete sequence) */
514 0x00000020, /* SPACE */
515 ERROR, /* (incomplete sequence) */
516 0x00000020, /* SPACE */
517 ERROR, /* (incomplete sequence) */
518 0x00000020, /* SPACE */
519 ERROR, /* (incomplete sequence) */
520 0x00000020, /* SPACE */
521 ERROR, /* (incomplete sequence) */
522 0x00000020, /* SPACE */
523 0, -1);
524 utf8_read_test(TESTSTR("\xE0\x20\xE1\x20\xE2\x20\xE3\x20\xE4\x20\xE5\x20\xE6\x20\xE7\x20\xE8\x20\xE9\x20\xEA\x20\xEB\x20\xEC\x20\xED\x20\xEE\x20\xEF\x20"),
525 ERROR, /* (incomplete sequence) */
526 0x00000020, /* SPACE */
527 ERROR, /* (incomplete sequence) */
528 0x00000020, /* SPACE */
529 ERROR, /* (incomplete sequence) */
530 0x00000020, /* SPACE */
531 ERROR, /* (incomplete sequence) */
532 0x00000020, /* SPACE */
533 ERROR, /* (incomplete sequence) */
534 0x00000020, /* SPACE */
535 ERROR, /* (incomplete sequence) */
536 0x00000020, /* SPACE */
537 ERROR, /* (incomplete sequence) */
538 0x00000020, /* SPACE */
539 ERROR, /* (incomplete sequence) */
540 0x00000020, /* SPACE */
541 ERROR, /* (incomplete sequence) */
542 0x00000020, /* SPACE */
543 ERROR, /* (incomplete sequence) */
544 0x00000020, /* SPACE */
545 ERROR, /* (incomplete sequence) */
546 0x00000020, /* SPACE */
547 ERROR, /* (incomplete sequence) */
548 0x00000020, /* SPACE */
549 ERROR, /* (incomplete sequence) */
550 0x00000020, /* SPACE */
551 ERROR, /* (incomplete sequence) */
552 0x00000020, /* SPACE */
553 ERROR, /* (incomplete sequence) */
554 0x00000020, /* SPACE */
555 ERROR, /* (incomplete sequence) */
556 0x00000020, /* SPACE */
557 0, -1);
558 utf8_read_test(TESTSTR("\xF0\x20\xF1\x20\xF2\x20\xF3\x20\xF4\x20\xF5\x20\xF6\x20\xF7\x20"),
559 ERROR, /* (incomplete sequence) */
560 0x00000020, /* SPACE */
561 ERROR, /* (incomplete sequence) */
562 0x00000020, /* SPACE */
563 ERROR, /* (incomplete sequence) */
564 0x00000020, /* SPACE */
565 ERROR, /* (incomplete sequence) */
566 0x00000020, /* SPACE */
567 ERROR, /* (incomplete sequence) */
568 0x00000020, /* SPACE */
569 ERROR, /* (incomplete sequence) */
570 0x00000020, /* SPACE */
571 ERROR, /* (incomplete sequence) */
572 0x00000020, /* SPACE */
573 ERROR, /* (incomplete sequence) */
574 0x00000020, /* SPACE */
575 0, -1);
576 utf8_read_test(TESTSTR("\xF8\x20\xF9\x20\xFA\x20\xFB\x20"),
577 ERROR, /* (incomplete sequence) */
578 0x00000020, /* SPACE */
579 ERROR, /* (incomplete sequence) */
580 0x00000020, /* SPACE */
581 ERROR, /* (incomplete sequence) */
582 0x00000020, /* SPACE */
583 ERROR, /* (incomplete sequence) */
584 0x00000020, /* SPACE */
585 0, -1);
586 utf8_read_test(TESTSTR("\xFC\x20\xFD\x20"),
587 ERROR, /* (incomplete sequence) */
588 0x00000020, /* SPACE */
589 ERROR, /* (incomplete sequence) */
590 0x00000020, /* SPACE */
591 0, -1);
592 utf8_read_test(TESTSTR("\xC0"),
593 ERROR, /* (incomplete sequence) */
594 0, -1);
595 utf8_read_test(TESTSTR("\xE0\x80"),
596 ERROR, /* (incomplete sequence) */
597 0, -1);
598 utf8_read_test(TESTSTR("\xF0\x80\x80"),
599 ERROR, /* (incomplete sequence) */
600 0, -1);
601 utf8_read_test(TESTSTR("\xF8\x80\x80\x80"),
602 ERROR, /* (incomplete sequence) */
603 0, -1);
604 utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80"),
605 ERROR, /* (incomplete sequence) */
606 0, -1);
607 utf8_read_test(TESTSTR("\xDF"),
608 ERROR, /* (incomplete sequence) */
609 0, -1);
610 utf8_read_test(TESTSTR("\xEF\xBF"),
611 ERROR, /* (incomplete sequence) */
612 0, -1);
613 utf8_read_test(TESTSTR("\xF7\xBF\xBF"),
614 ERROR, /* (incomplete sequence) */
615 0, -1);
616 utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF"),
617 ERROR, /* (incomplete sequence) */
618 0, -1);
619 utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF"),
620 ERROR, /* (incomplete sequence) */
621 0, -1);
622 utf8_read_test(TESTSTR("\xC0\xE0\x80\xF0\x80\x80\xF8\x80\x80\x80\xFC\x80\x80\x80\x80\xDF\xEF\xBF\xF7\xBF\xBF\xFB\xBF\xBF\xBF\xFD\xBF\xBF\xBF\xBF"),
623 ERROR, /* (incomplete sequence) */
624 ERROR, /* (incomplete sequence) */
625 ERROR, /* (incomplete sequence) */
626 ERROR, /* (incomplete sequence) */
627 ERROR, /* (incomplete sequence) */
628 ERROR, /* (incomplete sequence) */
629 ERROR, /* (incomplete sequence) */
630 ERROR, /* (incomplete sequence) */
631 ERROR, /* (incomplete sequence) */
632 ERROR, /* (incomplete sequence) */
633 0, -1);
634 utf8_read_test(TESTSTR("\xFE"),
635 ERROR, /* (invalid UTF-8 byte) */
636 0, -1);
637 utf8_read_test(TESTSTR("\xFF"),
638 ERROR, /* (invalid UTF-8 byte) */
639 0, -1);
640 utf8_read_test(TESTSTR("\xFE\xFE\xFF\xFF"),
641 ERROR, /* (invalid UTF-8 byte) */
642 ERROR, /* (invalid UTF-8 byte) */
643 ERROR, /* (invalid UTF-8 byte) */
644 ERROR, /* (invalid UTF-8 byte) */
645 0, -1);
646 utf8_read_test(TESTSTR("\xC0\xAF"),
647 ERROR, /* SOLIDUS (overlong form of 2F) */
648 0, -1);
649 utf8_read_test(TESTSTR("\xE0\x80\xAF"),
650 ERROR, /* SOLIDUS (overlong form of 2F) */
651 0, -1);
652 utf8_read_test(TESTSTR("\xF0\x80\x80\xAF"),
653 ERROR, /* SOLIDUS (overlong form of 2F) */
654 0, -1);
655 utf8_read_test(TESTSTR("\xF8\x80\x80\x80\xAF"),
656 ERROR, /* SOLIDUS (overlong form of 2F) */
657 0, -1);
658 utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\xAF"),
659 ERROR, /* SOLIDUS (overlong form of 2F) */
660 0, -1);
661 utf8_read_test(TESTSTR("\xC1\xBF"),
662 ERROR, /* <control> (overlong form of 7F) */
663 0, -1);
664 utf8_read_test(TESTSTR("\xE0\x9F\xBF"),
665 ERROR, /* <no name available> (overlong form of DF BF) */
666 0, -1);
667 utf8_read_test(TESTSTR("\xF0\x8F\xBF\xBF"),
668 ERROR, /* <no name available> (overlong form of EF BF BF) (invalid char) */
669 0, -1);
670 utf8_read_test(TESTSTR("\xF8\x87\xBF\xBF\xBF"),
671 ERROR, /* <no name available> (overlong form of F7 BF BF BF) */
672 0, -1);
673 utf8_read_test(TESTSTR("\xFC\x83\xBF\xBF\xBF\xBF"),
674 ERROR, /* <no name available> (overlong form of FB BF BF BF BF) */
675 0, -1);
676 utf8_read_test(TESTSTR("\xC0\x80"),
677 ERROR, /* <control> (overlong form of 00) */
678 0, -1);
679 utf8_read_test(TESTSTR("\xE0\x80\x80"),
680 ERROR, /* <control> (overlong form of 00) */
681 0, -1);
682 utf8_read_test(TESTSTR("\xF0\x80\x80\x80"),
683 ERROR, /* <control> (overlong form of 00) */
684 0, -1);
685 utf8_read_test(TESTSTR("\xF8\x80\x80\x80\x80"),
686 ERROR, /* <control> (overlong form of 00) */
687 0, -1);
688 utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\x80"),
689 ERROR, /* <control> (overlong form of 00) */
690 0, -1);
691 utf8_read_test(TESTSTR("\xED\xA0\x80"),
692 ERROR, /* <Non Private Use High Surrogate, First> (surrogate) */
693 0, -1);
694 utf8_read_test(TESTSTR("\xED\xAD\xBF"),
695 ERROR, /* <Non Private Use High Surrogate, Last> (surrogate) */
696 0, -1);
697 utf8_read_test(TESTSTR("\xED\xAE\x80"),
698 ERROR, /* <Private Use High Surrogate, First> (surrogate) */
699 0, -1);
700 utf8_read_test(TESTSTR("\xED\xAF\xBF"),
701 ERROR, /* <Private Use High Surrogate, Last> (surrogate) */
702 0, -1);
703 utf8_read_test(TESTSTR("\xED\xB0\x80"),
704 ERROR, /* <Low Surrogate, First> (surrogate) */
705 0, -1);
706 utf8_read_test(TESTSTR("\xED\xBE\x80"),
707 ERROR, /* <no name available> (surrogate) */
708 0, -1);
709 utf8_read_test(TESTSTR("\xED\xBF\xBF"),
710 ERROR, /* <Low Surrogate, Last> (surrogate) */
711 0, -1);
712 utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xB0\x80"),
713 ERROR, /* <Non Private Use High Surrogate, First> (surrogate) */
714 ERROR, /* <Low Surrogate, First> (surrogate) */
715 0, -1);
716 utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xBF\xBF"),
717 ERROR, /* <Non Private Use High Surrogate, First> (surrogate) */
718 ERROR, /* <Low Surrogate, Last> (surrogate) */
719 0, -1);
720 utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xB0\x80"),
721 ERROR, /* <Non Private Use High Surrogate, Last> (surrogate) */
722 ERROR, /* <Low Surrogate, First> (surrogate) */
723 0, -1);
724 utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xBF\xBF"),
725 ERROR, /* <Non Private Use High Surrogate, Last> (surrogate) */
726 ERROR, /* <Low Surrogate, Last> (surrogate) */
727 0, -1);
728 utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xB0\x80"),
729 ERROR, /* <Private Use High Surrogate, First> (surrogate) */
730 ERROR, /* <Low Surrogate, First> (surrogate) */
731 0, -1);
732 utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xBF\xBF"),
733 ERROR, /* <Private Use High Surrogate, First> (surrogate) */
734 ERROR, /* <Low Surrogate, Last> (surrogate) */
735 0, -1);
736 utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xB0\x80"),
737 ERROR, /* <Private Use High Surrogate, Last> (surrogate) */
738 ERROR, /* <Low Surrogate, First> (surrogate) */
739 0, -1);
740 utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xBF\xBF"),
741 ERROR, /* <Private Use High Surrogate, Last> (surrogate) */
742 ERROR, /* <Low Surrogate, Last> (surrogate) */
743 0, -1);
744 utf8_read_test(TESTSTR("\xEF\xBF\xBE"),
745 ERROR, /* <no name available> (invalid char) */
746 0, -1);
747 utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
748 ERROR, /* <no name available> (invalid char) */
749 0, -1);
750 printf("read tests completed\n");
751 printf("write tests beginning\n");
752 {
753 const static long str[] =
754 {0x03BAL, 0x1F79L, 0x03C3L, 0x03BCL, 0x03B5L, 0};
755 utf8_write_test(TESTSTR(str),
756 0xCE, 0xBA,
757 0xE1, 0xBD, 0xB9,
758 0xCF, 0x83,
759 0xCE, 0xBC,
760 0xCE, 0xB5,
761 0, -1);
762 }
763 {
764 const static long str[] = {0x0000L, 0};
765 utf8_write_test(TESTSTR(str),
766 0x00,
767 0, -1);
768 }
769 {
770 const static long str[] = {0x0080L, 0};
771 utf8_write_test(TESTSTR(str),
772 0xC2, 0x80,
773 0, -1);
774 }
775 {
776 const static long str[] = {0x0800L, 0};
777 utf8_write_test(TESTSTR(str),
778 0xE0, 0xA0, 0x80,
779 0, -1);
780 }
781 {
782 const static long str[] = {0x00010000L, 0};
783 utf8_write_test(TESTSTR(str),
784 0xF0, 0x90, 0x80, 0x80,
785 0, -1);
786 }
787 {
788 const static long str[] = {0x00200000L, 0};
789 utf8_write_test(TESTSTR(str),
790 0xF8, 0x88, 0x80, 0x80, 0x80,
791 0, -1);
792 }
793 {
794 const static long str[] = {0x04000000L, 0};
795 utf8_write_test(TESTSTR(str),
796 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80,
797 0, -1);
798 }
799 {
800 const static long str[] = {0x007FL, 0};
801 utf8_write_test(TESTSTR(str),
802 0x7F,
803 0, -1);
804 }
805 {
806 const static long str[] = {0x07FFL, 0};
807 utf8_write_test(TESTSTR(str),
808 0xDF, 0xBF,
809 0, -1);
810 }
811 {
812 const static long str[] = {0xFFFDL, 0};
813 utf8_write_test(TESTSTR(str),
814 0xEF, 0xBF, 0xBD,
815 0, -1);
816 }
817 {
818 const static long str[] = {0xFFFFL, 0};
819 utf8_write_test(TESTSTR(str),
820 ERROR,
821 0, -1);
822 }
823 {
824 const static long str[] = {0x001FFFFFL, 0};
825 utf8_write_test(TESTSTR(str),
826 0xF7, 0xBF, 0xBF, 0xBF,
827 0, -1);
828 }
829 {
830 const static long str[] = {0x03FFFFFFL, 0};
831 utf8_write_test(TESTSTR(str),
832 0xFB, 0xBF, 0xBF, 0xBF, 0xBF,
833 0, -1);
834 }
835 {
836 const static long str[] = {0x7FFFFFFFL, 0};
837 utf8_write_test(TESTSTR(str),
838 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,
839 0, -1);
840 }
841 {
842 const static long str[] = {0xD7FFL, 0};
843 utf8_write_test(TESTSTR(str),
844 0xED, 0x9F, 0xBF,
845 0, -1);
846 }
847 {
848 const static long str[] = {0xD800L, 0};
849 utf8_write_test(TESTSTR(str),
850 ERROR,
851 0, -1);
852 }
853 {
854 const static long str[] = {0xD800L, 0xDC00L, 0};
855 utf8_write_test(TESTSTR(str),
856 ERROR,
857 ERROR,
858 0, -1);
859 }
860 {
861 const static long str[] = {0xDFFFL, 0};
862 utf8_write_test(TESTSTR(str),
863 ERROR,
864 0, -1);
865 }
866 {
867 const static long str[] = {0xE000L, 0};
868 utf8_write_test(TESTSTR(str),
869 0xEE, 0x80, 0x80,
870 0, -1);
871 }
872 printf("write tests completed\n");
873
874 printf("total: %d errors\n", total_errs);
875 return (total_errs != 0);
876}
877#endif /* TESTMODE */
878
879const charset_spec charset_CS_UTF8 = {
880 CS_UTF8, read_utf8, write_utf8, NULL
881};
882
883#else /* ENUM_CHARSETS */
884
885ENUM_CHARSET(CS_UTF8)
886
887#endif /* ENUM_CHARSETS */