Silly of me to overlook it: another obvious way you might like to
[sgt/charset] / utf8.c
CommitLineData
c6d25d8d 1/*
2 * utf8.c - routines to handle UTF-8.
3 */
4
5#ifndef ENUM_CHARSETS
6
7#include "charset.h"
8#include "internal.h"
9
10/*
8536171f 11 * The internal read_utf8 and write_utf8 functions in this module
12 * are not static, because they're also called internally from
13 * iso2022.c.
14 */
15
16/*
c6d25d8d 17 * UTF-8 has no associated data, so `charset' may be ignored.
18 */
19
7a7dc0a7 20void read_utf8(charset_spec const *charset, long int input_chr,
21 charset_state *state,
22 void (*emit)(void *ctx, long int output), void *emitctx)
c6d25d8d 23{
24 UNUSEDARG(charset);
25
26 /*
e3e8f8c9 27 * For reading UTF-8, the `state' word contains the character
28 * being accumulated. This is shifted left by six bits each
29 * time a character is added, and there's a single '1' bit
30 * in what would be bit 31 of the final character, which we
31 * use to detect when it's complete.
c6d25d8d 32 *
33 * As required, the state is zero when we are not in the middle
34 * of a multibyte character at all.
35 *
36 * For example, when reading E9 8D 8B, starting at state=0:
37 *
e3e8f8c9 38 * - after E9, the state is 0x00080009
39 * - after 8D, the state is 0x0200024d
40 * - after 8B, the state conceptually becomes 0x8000934b, at
c6d25d8d 41 * which point we notice we've got as many characters as we
42 * were expecting, output U+934B, and reset the state to
43 * zero.
44 *
e3e8f8c9 45 * If we detect an overlong sequence, we shift the marker bit
46 * right one bit. This is safe because an overlong sequence
47 * can't encode a top-bit-set character. Not that we worry
48 * about what overlong sequences are trying to encode, but
49 * it's nice to know that we could if we wanted to.
50 *
c6d25d8d 51 * Note that the maximum number of bits we might need to store
52 * in the character value field is 25 (U+7FFFFFFF contains 31
53 * bits, but we will never actually store its full value
54 * because when we receive the last 6 bits in the final
55 * continuation byte we will output it and revert the state to
e3e8f8c9 56 * zero). Hence we need 26 bits in total.
c6d25d8d 57 */
58
59 if (input_chr < 0x80) {
60 /*
61 * Single-byte character. If the state is nonzero before
62 * coming here, output an error for an incomplete sequence.
63 * Then output the character.
64 */
65 if (state->s0 != 0) {
66 emit(emitctx, ERROR);
67 state->s0 = 0;
68 }
69 emit(emitctx, input_chr);
70 } else if (input_chr == 0xFE || input_chr == 0xFF) {
71 /*
72 * FE and FF bytes should _never_ occur in UTF-8. They are
73 * automatic errors; if the state was nonzero to start
74 * with, output a further error for an incomplete sequence.
75 */
76 if (state->s0 != 0) {
77 emit(emitctx, ERROR);
78 state->s0 = 0;
79 }
80 emit(emitctx, ERROR);
81 } else if (input_chr >= 0x80 && input_chr < 0xC0) {
82 /*
83 * Continuation byte. Output an error for an unexpected
84 * continuation byte, if the state is zero.
85 */
86 if (state->s0 == 0) {
87 emit(emitctx, ERROR);
88 } else {
89 unsigned long charval;
c6d25d8d 90
91 /*
92 * Otherwise, accumulate more of the character value.
93 */
e3e8f8c9 94 charval = state->s0;
c6d25d8d 95 charval = (charval << 6) | (input_chr & 0x3F);
96
97 /*
e3e8f8c9 98 * Detect overlong encodings. We're looking for too many
99 * leading zeroes given our position in the character. If
100 * we find an overlong encoding, clear the current marker
101 * bit and set the bit below it. Overlong two-byte
102 * encodings are a special case, and are detected when we
103 * read their inital byte.
104 */
105 if ((charval & 0xffffffe0L) == 0x02000000L)
106 charval ^= 0x03000000L;
107 else if ((charval & 0xfffffff0L) == 0x00080000L)
108 charval ^= 0x000c0000L;
109 else if ((charval & 0xfffffff8L) == 0x00002000L)
110 charval ^= 0x00003000L;
111 else if ((charval & 0xfffffffcL) == 0x00000080L)
112 charval ^= 0x000000c0L;
113
114 /*
c6d25d8d 115 * Check the byte counts; if we have not reached the
116 * end of the character, update the state and return.
117 */
e3e8f8c9 118 if (!(charval & 0xc0000000L)) {
119 state->s0 = charval;
c6d25d8d 120 return;
121 }
122
123 /*
e3e8f8c9 124 * Clear the marker bit, or set it if it's clear,
125 * indicating an overlong sequence.
126 */
127 charval ^= 0x80000000L;
128
129 /*
c6d25d8d 130 * Now we know we've reached the end of the character.
131 * `charval' is the Unicode value. We should check for
132 * various invalid things, and then either output
133 * charval or an error. In all cases we reset the state
134 * to zero.
135 */
c6d25d8d 136 state->s0 = 0;
137
e3e8f8c9 138 if (charval & 0x80000000L) {
139 /* We got an overlong sequence. */
140 emit(emitctx, ERROR);
141 } else if (charval >= 0xD800 && charval < 0xE000) {
c6d25d8d 142 /*
143 * Surrogates (0xD800-0xDFFF) may never be encoded
144 * in UTF-8. A surrogate pair in Unicode should
145 * have been encoded as a single UTF-8 character
146 * occupying more than three bytes.
147 */
148 emit(emitctx, ERROR);
149 } else if (charval == 0xFFFE || charval == 0xFFFF) {
150 /*
151 * U+FFFE and U+FFFF are invalid Unicode characters
152 * and may never be encoded in UTF-8. (This is one
153 * reason why U+FFFF is our way of signalling an
154 * error to our `emit' function :-)
155 */
156 emit(emitctx, ERROR);
c6d25d8d 157 } else {
158 /*
159 * Oh, all right. We'll let this one off.
160 */
161 emit(emitctx, charval);
162 }
163 }
164
165 } else {
166 /*
167 * Lead byte. First output an error for an incomplete
168 * sequence, if the state is nonzero.
169 */
170 if (state->s0 != 0)
171 emit(emitctx, ERROR);
172
173 /*
174 * Now deal with the lead byte: work out the number of
175 * bytes we expect to see in this character, and extract
176 * the initial bits of it too.
177 */
e3e8f8c9 178 if (input_chr >= 0xC0 && input_chr < 0xC2) {
179 /* beginning of an overlong two-byte sequence */
180 state->s0 = 0x01000000L | (input_chr & 0x1F);
181 } else if (input_chr >= 0xC2 && input_chr < 0xE0) {
182 state->s0 = 0x02000000L | (input_chr & 0x1F);
c6d25d8d 183 } else if (input_chr >= 0xE0 && input_chr < 0xF0) {
e3e8f8c9 184 state->s0 = 0x00080000L | (input_chr & 0x0F);
c6d25d8d 185 } else if (input_chr >= 0xF0 && input_chr < 0xF8) {
e3e8f8c9 186 state->s0 = 0x00002000L | (input_chr & 0x07);
c6d25d8d 187 } else if (input_chr >= 0xF8 && input_chr < 0xFC) {
e3e8f8c9 188 state->s0 = 0x00000080L | (input_chr & 0x03);
c6d25d8d 189 } else if (input_chr >= 0xFC && input_chr < 0xFE) {
e3e8f8c9 190 state->s0 = 0x00000002L | (input_chr & 0x01);
c6d25d8d 191 }
192 }
193}
194
195/*
196 * UTF-8 is a stateless multi-byte encoding (in the sense that just
197 * after any character has been completed, the state is always the
198 * same); hence when writing it, there is no need to use the
199 * charset_state.
200 */
201
8536171f 202int write_utf8(charset_spec const *charset, long int input_chr,
203 charset_state *state,
204 void (*emit)(void *ctx, long int output),
205 void *emitctx)
c6d25d8d 206{
207 UNUSEDARG(charset);
208 UNUSEDARG(state);
209
210 if (input_chr == -1)
211 return TRUE; /* stateless; no cleanup required */
212
213 /*
214 * Refuse to output any illegal code points.
215 */
216 if (input_chr == 0xFFFE || input_chr == 0xFFFF ||
217 (input_chr >= 0xD800 && input_chr < 0xE000)) {
218 return FALSE;
219 } else if (input_chr < 0x80) { /* one-byte character */
220 emit(emitctx, input_chr);
221 return TRUE;
222 } else if (input_chr < 0x800) { /* two-byte character */
223 emit(emitctx, 0xC0 | (0x1F & (input_chr >> 6)));
224 emit(emitctx, 0x80 | (0x3F & (input_chr )));
225 return TRUE;
226 } else if (input_chr < 0x10000) { /* three-byte character */
227 emit(emitctx, 0xE0 | (0x0F & (input_chr >> 12)));
228 emit(emitctx, 0x80 | (0x3F & (input_chr >> 6)));
229 emit(emitctx, 0x80 | (0x3F & (input_chr )));
230 return TRUE;
231 } else if (input_chr < 0x200000) { /* four-byte character */
232 emit(emitctx, 0xF0 | (0x07 & (input_chr >> 18)));
233 emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
234 emit(emitctx, 0x80 | (0x3F & (input_chr >> 6)));
235 emit(emitctx, 0x80 | (0x3F & (input_chr )));
236 return TRUE;
237 } else if (input_chr < 0x4000000) {/* five-byte character */
238 emit(emitctx, 0xF8 | (0x03 & (input_chr >> 24)));
239 emit(emitctx, 0x80 | (0x3F & (input_chr >> 18)));
240 emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
241 emit(emitctx, 0x80 | (0x3F & (input_chr >> 6)));
242 emit(emitctx, 0x80 | (0x3F & (input_chr )));
243 return TRUE;
244 } else { /* six-byte character */
245 emit(emitctx, 0xFC | (0x01 & (input_chr >> 30)));
246 emit(emitctx, 0x80 | (0x3F & (input_chr >> 24)));
247 emit(emitctx, 0x80 | (0x3F & (input_chr >> 18)));
248 emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
249 emit(emitctx, 0x80 | (0x3F & (input_chr >> 6)));
250 emit(emitctx, 0x80 | (0x3F & (input_chr )));
251 return TRUE;
252 }
253}
254
255#ifdef TESTMODE
256
257#include <stdio.h>
258#include <stdarg.h>
259
/*
 * Number of mismatches reported so far by utf8_read_test and
 * utf8_write_test. main() prints the total and exits nonzero if it
 * is not zero.
 */
260int total_errs = 0;
261
/*
 * Emit callback used by the test harness. `ctx' points at a wchar_t
 * write cursor: the output value is stored where the cursor points
 * and the cursor is then moved on by one element. No bounds checking
 * is done; the caller must supply a large enough buffer.
 */
void utf8_emit(void *ctx, long output)
{
    wchar_t **cursor = ctx;
    wchar_t *slot = *cursor;

    *slot = output;
    *cursor = slot + 1;
}
267
268void utf8_read_test(int line, char *input, int inlen, ...)
269{
270 va_list ap;
271 wchar_t *p, str[512];
272 int i;
273 charset_state state;
274 unsigned long l;
275
276 state.s0 = 0;
277 p = str;
278
279 for (i = 0; i < inlen; i++)
280 read_utf8(NULL, input[i] & 0xFF, &state, utf8_emit, &p);
281
282 va_start(ap, inlen);
283 l = 0;
284 for (i = 0; i < p - str; i++) {
285 l = va_arg(ap, long int);
286 if (l == -1) {
287 printf("%d: correct string shorter than output\n", line);
288 total_errs++;
289 break;
290 }
291 if (l != str[i]) {
292 printf("%d: char %d came out as %08x, should be %08x\n",
293 line, i, str[i], l);
294 total_errs++;
295 }
296 }
297 if (l != -1) {
298 l = va_arg(ap, long int);
299 if (l != -1) {
300 printf("%d: correct string longer than output\n", line);
301 total_errs++;
302 }
303 }
304 va_end(ap);
305}
306
307void utf8_write_test(int line, const long *input, int inlen, ...)
308{
309 va_list ap;
310 wchar_t *p, str[512];
311 int i;
312 charset_state state;
313 unsigned long l;
314
315 state.s0 = 0;
316 p = str;
317
7804475c 318 for (i = 0; i < inlen; i++) {
319 if (!write_utf8(NULL, input[i], &state, utf8_emit, &p))
320 utf8_emit(&p, ERROR);
321 }
c6d25d8d 322
323 va_start(ap, inlen);
324 l = 0;
325 for (i = 0; i < p - str; i++) {
326 l = va_arg(ap, long int);
327 if (l == -1) {
328 printf("%d: correct string shorter than output\n", line);
329 total_errs++;
330 break;
331 }
332 if (l != str[i]) {
333 printf("%d: char %d came out as %08x, should be %08x\n",
334 line, i, str[i], l);
335 total_errs++;
336 }
337 }
338 if (l != -1) {
339 l = va_arg(ap, long int);
340 if (l != -1) {
341 printf("%d: correct string longer than output\n", line);
342 total_errs++;
343 }
344 }
345 va_end(ap);
346}
347
348/* Macro to concoct the first three parameters of utf8_read_test
 * (main() also uses it for utf8_write_test): the current source
 * line, the data `x' itself, and lenof(x) as its length. lenof is
 * defined elsewhere; presumably it counts array elements, so `x'
 * must be a real array (string literal or array object) rather than
 * a pointer -- TODO confirm. */
349#define TESTSTR(x) __LINE__, x, lenof(x)
350
/*
 * Test driver (TESTMODE builds only). Runs a fixed list of decoding
 * tests through utf8_read_test and encoding tests through
 * utf8_write_test, then prints the total number of mismatches and
 * returns nonzero if there were any.
 *
 * Each call passes the input via TESTSTR, followed by the expected
 * outputs, followed by -1 as a terminator. The 0 just before each -1
 * is an expected output in its own right: it pairs with the final
 * element of the input (the explicit trailing 0 in the write-test
 * arrays; for the string literals presumably the terminating NUL
 * counted by lenof -- TODO confirm).
 *
 * NOTE(review): most expected values below are plain int constants,
 * but the test helpers fetch every one with va_arg(ap, long int).
 * Where int and long differ in size that is a type mismatch; adding
 * L suffixes (or casts) would make the calls well defined. How ERROR
 * is typed is not visible here -- confirm before changing.
 */
351int main(void)
352{
353 printf("read tests beginning\n");
354 utf8_read_test(TESTSTR("\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5"),
355 0x000003BA, /* GREEK SMALL LETTER KAPPA */
356 0x00001F79, /* GREEK SMALL LETTER OMICRON WITH OXIA */
357 0x000003C3, /* GREEK SMALL LETTER SIGMA */
358 0x000003BC, /* GREEK SMALL LETTER MU */
359 0x000003B5, /* GREEK SMALL LETTER EPSILON */
360 0, -1);
361 utf8_read_test(TESTSTR("\x00"),
362 0x00000000, /* <control> */
363 0, -1);
364 utf8_read_test(TESTSTR("\xC2\x80"),
365 0x00000080, /* <control> */
366 0, -1);
367 utf8_read_test(TESTSTR("\xE0\xA0\x80"),
368 0x00000800, /* <no name available> */
369 0, -1);
370 utf8_read_test(TESTSTR("\xF0\x90\x80\x80"),
371 0x00010000, /* <no name available> */
372 0, -1);
373 utf8_read_test(TESTSTR("\xF8\x88\x80\x80\x80"),
374 0x00200000, /* <no name available> */
375 0, -1);
376 utf8_read_test(TESTSTR("\xFC\x84\x80\x80\x80\x80"),
377 0x04000000, /* <no name available> */
378 0, -1);
379 utf8_read_test(TESTSTR("\x7F"),
380 0x0000007F, /* <control> */
381 0, -1);
382 utf8_read_test(TESTSTR("\xDF\xBF"),
383 0x000007FF, /* <no name available> */
384 0, -1);
385 utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
386 0x0000FFFD, /* REPLACEMENT CHARACTER */
387 0, -1);
388 utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
389 ERROR, /* <no name available> (invalid char) */
390 0, -1);
391 utf8_read_test(TESTSTR("\xF7\xBF\xBF\xBF"),
392 0x001FFFFF, /* <no name available> */
393 0, -1);
394 utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF\xBF"),
395 0x03FFFFFF, /* <no name available> */
396 0, -1);
397 utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF\xBF"),
398 0x7FFFFFFF, /* <no name available> */
399 0, -1);
400 utf8_read_test(TESTSTR("\xED\x9F\xBF"),
401 0x0000D7FF, /* <no name available> */
402 0, -1);
403 utf8_read_test(TESTSTR("\xEE\x80\x80"),
404 0x0000E000, /* <Private Use, First> */
405 0, -1);
406 utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
407 0x0000FFFD, /* REPLACEMENT CHARACTER */
408 0, -1);
409 utf8_read_test(TESTSTR("\xF4\x8F\xBF\xBF"),
410 0x0010FFFF, /* <no name available> */
411 0, -1);
412 utf8_read_test(TESTSTR("\xF4\x90\x80\x80"),
413 0x00110000, /* <no name available> */
414 0, -1);
415 utf8_read_test(TESTSTR("\x80"),
416 ERROR, /* (unexpected continuation byte) */
417 0, -1);
418 utf8_read_test(TESTSTR("\xBF"),
419 ERROR, /* (unexpected continuation byte) */
420 0, -1);
421 utf8_read_test(TESTSTR("\x80\xBF"),
422 ERROR, /* (unexpected continuation byte) */
423 ERROR, /* (unexpected continuation byte) */
424 0, -1);
425 utf8_read_test(TESTSTR("\x80\xBF\x80"),
426 ERROR, /* (unexpected continuation byte) */
427 ERROR, /* (unexpected continuation byte) */
428 ERROR, /* (unexpected continuation byte) */
429 0, -1);
430 utf8_read_test(TESTSTR("\x80\xBF\x80\xBF"),
431 ERROR, /* (unexpected continuation byte) */
432 ERROR, /* (unexpected continuation byte) */
433 ERROR, /* (unexpected continuation byte) */
434 ERROR, /* (unexpected continuation byte) */
435 0, -1);
436 utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80"),
437 ERROR, /* (unexpected continuation byte) */
438 ERROR, /* (unexpected continuation byte) */
439 ERROR, /* (unexpected continuation byte) */
440 ERROR, /* (unexpected continuation byte) */
441 ERROR, /* (unexpected continuation byte) */
442 0, -1);
443 utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF"),
444 ERROR, /* (unexpected continuation byte) */
445 ERROR, /* (unexpected continuation byte) */
446 ERROR, /* (unexpected continuation byte) */
447 ERROR, /* (unexpected continuation byte) */
448 ERROR, /* (unexpected continuation byte) */
449 ERROR, /* (unexpected continuation byte) */
450 0, -1);
451 utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF\x80"),
452 ERROR, /* (unexpected continuation byte) */
453 ERROR, /* (unexpected continuation byte) */
454 ERROR, /* (unexpected continuation byte) */
455 ERROR, /* (unexpected continuation byte) */
456 ERROR, /* (unexpected continuation byte) */
457 ERROR, /* (unexpected continuation byte) */
458 ERROR, /* (unexpected continuation byte) */
459 0, -1);
460 utf8_read_test(TESTSTR("\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"),
461 ERROR, /* (unexpected continuation byte) */
462 ERROR, /* (unexpected continuation byte) */
463 ERROR, /* (unexpected continuation byte) */
464 ERROR, /* (unexpected continuation byte) */
465 ERROR, /* (unexpected continuation byte) */
466 ERROR, /* (unexpected continuation byte) */
467 ERROR, /* (unexpected continuation byte) */
468 ERROR, /* (unexpected continuation byte) */
469 ERROR, /* (unexpected continuation byte) */
470 ERROR, /* (unexpected continuation byte) */
471 ERROR, /* (unexpected continuation byte) */
472 ERROR, /* (unexpected continuation byte) */
473 ERROR, /* (unexpected continuation byte) */
474 ERROR, /* (unexpected continuation byte) */
475 ERROR, /* (unexpected continuation byte) */
476 ERROR, /* (unexpected continuation byte) */
477 ERROR, /* (unexpected continuation byte) */
478 ERROR, /* (unexpected continuation byte) */
479 ERROR, /* (unexpected continuation byte) */
480 ERROR, /* (unexpected continuation byte) */
481 ERROR, /* (unexpected continuation byte) */
482 ERROR, /* (unexpected continuation byte) */
483 ERROR, /* (unexpected continuation byte) */
484 ERROR, /* (unexpected continuation byte) */
485 ERROR, /* (unexpected continuation byte) */
486 ERROR, /* (unexpected continuation byte) */
487 ERROR, /* (unexpected continuation byte) */
488 ERROR, /* (unexpected continuation byte) */
489 ERROR, /* (unexpected continuation byte) */
490 ERROR, /* (unexpected continuation byte) */
491 ERROR, /* (unexpected continuation byte) */
492 ERROR, /* (unexpected continuation byte) */
493 ERROR, /* (unexpected continuation byte) */
494 ERROR, /* (unexpected continuation byte) */
495 ERROR, /* (unexpected continuation byte) */
496 ERROR, /* (unexpected continuation byte) */
497 ERROR, /* (unexpected continuation byte) */
498 ERROR, /* (unexpected continuation byte) */
499 ERROR, /* (unexpected continuation byte) */
500 ERROR, /* (unexpected continuation byte) */
501 ERROR, /* (unexpected continuation byte) */
502 ERROR, /* (unexpected continuation byte) */
503 ERROR, /* (unexpected continuation byte) */
504 ERROR, /* (unexpected continuation byte) */
505 ERROR, /* (unexpected continuation byte) */
506 ERROR, /* (unexpected continuation byte) */
507 ERROR, /* (unexpected continuation byte) */
508 ERROR, /* (unexpected continuation byte) */
509 ERROR, /* (unexpected continuation byte) */
510 ERROR, /* (unexpected continuation byte) */
511 ERROR, /* (unexpected continuation byte) */
512 ERROR, /* (unexpected continuation byte) */
513 ERROR, /* (unexpected continuation byte) */
514 ERROR, /* (unexpected continuation byte) */
515 ERROR, /* (unexpected continuation byte) */
516 ERROR, /* (unexpected continuation byte) */
517 ERROR, /* (unexpected continuation byte) */
518 ERROR, /* (unexpected continuation byte) */
519 ERROR, /* (unexpected continuation byte) */
520 ERROR, /* (unexpected continuation byte) */
521 ERROR, /* (unexpected continuation byte) */
522 ERROR, /* (unexpected continuation byte) */
523 ERROR, /* (unexpected continuation byte) */
524 ERROR, /* (unexpected continuation byte) */
525 0, -1);
526 utf8_read_test(TESTSTR("\xC0\x20\xC1\x20\xC2\x20\xC3\x20\xC4\x20\xC5\x20\xC6\x20\xC7\x20"),
527 ERROR, /* (incomplete sequence) */
528 0x00000020, /* SPACE */
529 ERROR, /* (incomplete sequence) */
530 0x00000020, /* SPACE */
531 ERROR, /* (incomplete sequence) */
532 0x00000020, /* SPACE */
533 ERROR, /* (incomplete sequence) */
534 0x00000020, /* SPACE */
535 ERROR, /* (incomplete sequence) */
536 0x00000020, /* SPACE */
537 ERROR, /* (incomplete sequence) */
538 0x00000020, /* SPACE */
539 ERROR, /* (incomplete sequence) */
540 0x00000020, /* SPACE */
541 ERROR, /* (incomplete sequence) */
542 0x00000020, /* SPACE */
543 0, -1);
544 utf8_read_test(TESTSTR("\xE0\x20\xE1\x20\xE2\x20\xE3\x20\xE4\x20\xE5\x20\xE6\x20\xE7\x20\xE8\x20\xE9\x20\xEA\x20\xEB\x20\xEC\x20\xED\x20\xEE\x20\xEF\x20"),
545 ERROR, /* (incomplete sequence) */
546 0x00000020, /* SPACE */
547 ERROR, /* (incomplete sequence) */
548 0x00000020, /* SPACE */
549 ERROR, /* (incomplete sequence) */
550 0x00000020, /* SPACE */
551 ERROR, /* (incomplete sequence) */
552 0x00000020, /* SPACE */
553 ERROR, /* (incomplete sequence) */
554 0x00000020, /* SPACE */
555 ERROR, /* (incomplete sequence) */
556 0x00000020, /* SPACE */
557 ERROR, /* (incomplete sequence) */
558 0x00000020, /* SPACE */
559 ERROR, /* (incomplete sequence) */
560 0x00000020, /* SPACE */
561 ERROR, /* (incomplete sequence) */
562 0x00000020, /* SPACE */
563 ERROR, /* (incomplete sequence) */
564 0x00000020, /* SPACE */
565 ERROR, /* (incomplete sequence) */
566 0x00000020, /* SPACE */
567 ERROR, /* (incomplete sequence) */
568 0x00000020, /* SPACE */
569 ERROR, /* (incomplete sequence) */
570 0x00000020, /* SPACE */
571 ERROR, /* (incomplete sequence) */
572 0x00000020, /* SPACE */
573 ERROR, /* (incomplete sequence) */
574 0x00000020, /* SPACE */
575 ERROR, /* (incomplete sequence) */
576 0x00000020, /* SPACE */
577 0, -1);
578 utf8_read_test(TESTSTR("\xF0\x20\xF1\x20\xF2\x20\xF3\x20\xF4\x20\xF5\x20\xF6\x20\xF7\x20"),
579 ERROR, /* (incomplete sequence) */
580 0x00000020, /* SPACE */
581 ERROR, /* (incomplete sequence) */
582 0x00000020, /* SPACE */
583 ERROR, /* (incomplete sequence) */
584 0x00000020, /* SPACE */
585 ERROR, /* (incomplete sequence) */
586 0x00000020, /* SPACE */
587 ERROR, /* (incomplete sequence) */
588 0x00000020, /* SPACE */
589 ERROR, /* (incomplete sequence) */
590 0x00000020, /* SPACE */
591 ERROR, /* (incomplete sequence) */
592 0x00000020, /* SPACE */
593 ERROR, /* (incomplete sequence) */
594 0x00000020, /* SPACE */
595 0, -1);
596 utf8_read_test(TESTSTR("\xF8\x20\xF9\x20\xFA\x20\xFB\x20"),
597 ERROR, /* (incomplete sequence) */
598 0x00000020, /* SPACE */
599 ERROR, /* (incomplete sequence) */
600 0x00000020, /* SPACE */
601 ERROR, /* (incomplete sequence) */
602 0x00000020, /* SPACE */
603 ERROR, /* (incomplete sequence) */
604 0x00000020, /* SPACE */
605 0, -1);
606 utf8_read_test(TESTSTR("\xFC\x20\xFD\x20"),
607 ERROR, /* (incomplete sequence) */
608 0x00000020, /* SPACE */
609 ERROR, /* (incomplete sequence) */
610 0x00000020, /* SPACE */
611 0, -1);
612 utf8_read_test(TESTSTR("\xC0"),
613 ERROR, /* (incomplete sequence) */
614 0, -1);
615 utf8_read_test(TESTSTR("\xE0\x80"),
616 ERROR, /* (incomplete sequence) */
617 0, -1);
618 utf8_read_test(TESTSTR("\xF0\x80\x80"),
619 ERROR, /* (incomplete sequence) */
620 0, -1);
621 utf8_read_test(TESTSTR("\xF8\x80\x80\x80"),
622 ERROR, /* (incomplete sequence) */
623 0, -1);
624 utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80"),
625 ERROR, /* (incomplete sequence) */
626 0, -1);
627 utf8_read_test(TESTSTR("\xDF"),
628 ERROR, /* (incomplete sequence) */
629 0, -1);
630 utf8_read_test(TESTSTR("\xEF\xBF"),
631 ERROR, /* (incomplete sequence) */
632 0, -1);
633 utf8_read_test(TESTSTR("\xF7\xBF\xBF"),
634 ERROR, /* (incomplete sequence) */
635 0, -1);
636 utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF"),
637 ERROR, /* (incomplete sequence) */
638 0, -1);
639 utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF"),
640 ERROR, /* (incomplete sequence) */
641 0, -1);
642 utf8_read_test(TESTSTR("\xC0\xE0\x80\xF0\x80\x80\xF8\x80\x80\x80\xFC\x80\x80\x80\x80\xDF\xEF\xBF\xF7\xBF\xBF\xFB\xBF\xBF\xBF\xFD\xBF\xBF\xBF\xBF"),
643 ERROR, /* (incomplete sequence) */
644 ERROR, /* (incomplete sequence) */
645 ERROR, /* (incomplete sequence) */
646 ERROR, /* (incomplete sequence) */
647 ERROR, /* (incomplete sequence) */
648 ERROR, /* (incomplete sequence) */
649 ERROR, /* (incomplete sequence) */
650 ERROR, /* (incomplete sequence) */
651 ERROR, /* (incomplete sequence) */
652 ERROR, /* (incomplete sequence) */
653 0, -1);
654 utf8_read_test(TESTSTR("\xFE"),
655 ERROR, /* (invalid UTF-8 byte) */
656 0, -1);
657 utf8_read_test(TESTSTR("\xFF"),
658 ERROR, /* (invalid UTF-8 byte) */
659 0, -1);
660 utf8_read_test(TESTSTR("\xFE\xFE\xFF\xFF"),
661 ERROR, /* (invalid UTF-8 byte) */
662 ERROR, /* (invalid UTF-8 byte) */
663 ERROR, /* (invalid UTF-8 byte) */
664 ERROR, /* (invalid UTF-8 byte) */
665 0, -1);
666 utf8_read_test(TESTSTR("\xC0\xAF"),
667 ERROR, /* SOLIDUS (overlong form of 2F) */
668 0, -1);
669 utf8_read_test(TESTSTR("\xE0\x80\xAF"),
670 ERROR, /* SOLIDUS (overlong form of 2F) */
671 0, -1);
672 utf8_read_test(TESTSTR("\xF0\x80\x80\xAF"),
673 ERROR, /* SOLIDUS (overlong form of 2F) */
674 0, -1);
675 utf8_read_test(TESTSTR("\xF8\x80\x80\x80\xAF"),
676 ERROR, /* SOLIDUS (overlong form of 2F) */
677 0, -1);
678 utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\xAF"),
679 ERROR, /* SOLIDUS (overlong form of 2F) */
680 0, -1);
681 utf8_read_test(TESTSTR("\xC1\xBF"),
682 ERROR, /* <control> (overlong form of 7F) */
683 0, -1);
684 utf8_read_test(TESTSTR("\xE0\x9F\xBF"),
685 ERROR, /* <no name available> (overlong form of DF BF) */
686 0, -1);
687 utf8_read_test(TESTSTR("\xF0\x8F\xBF\xBF"),
688 ERROR, /* <no name available> (overlong form of EF BF BF) (invalid char) */
689 0, -1);
690 utf8_read_test(TESTSTR("\xF8\x87\xBF\xBF\xBF"),
691 ERROR, /* <no name available> (overlong form of F7 BF BF BF) */
692 0, -1);
693 utf8_read_test(TESTSTR("\xFC\x83\xBF\xBF\xBF\xBF"),
694 ERROR, /* <no name available> (overlong form of FB BF BF BF BF) */
695 0, -1);
696 utf8_read_test(TESTSTR("\xC0\x80"),
697 ERROR, /* <control> (overlong form of 00) */
698 0, -1);
699 utf8_read_test(TESTSTR("\xE0\x80\x80"),
700 ERROR, /* <control> (overlong form of 00) */
701 0, -1);
702 utf8_read_test(TESTSTR("\xF0\x80\x80\x80"),
703 ERROR, /* <control> (overlong form of 00) */
704 0, -1);
705 utf8_read_test(TESTSTR("\xF8\x80\x80\x80\x80"),
706 ERROR, /* <control> (overlong form of 00) */
707 0, -1);
708 utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\x80"),
709 ERROR, /* <control> (overlong form of 00) */
710 0, -1);
711 utf8_read_test(TESTSTR("\xED\xA0\x80"),
712 ERROR, /* <Non Private Use High Surrogate, First> (surrogate) */
713 0, -1);
714 utf8_read_test(TESTSTR("\xED\xAD\xBF"),
715 ERROR, /* <Non Private Use High Surrogate, Last> (surrogate) */
716 0, -1);
717 utf8_read_test(TESTSTR("\xED\xAE\x80"),
718 ERROR, /* <Private Use High Surrogate, First> (surrogate) */
719 0, -1);
720 utf8_read_test(TESTSTR("\xED\xAF\xBF"),
721 ERROR, /* <Private Use High Surrogate, Last> (surrogate) */
722 0, -1);
723 utf8_read_test(TESTSTR("\xED\xB0\x80"),
724 ERROR, /* <Low Surrogate, First> (surrogate) */
725 0, -1);
726 utf8_read_test(TESTSTR("\xED\xBE\x80"),
727 ERROR, /* <no name available> (surrogate) */
728 0, -1);
729 utf8_read_test(TESTSTR("\xED\xBF\xBF"),
730 ERROR, /* <Low Surrogate, Last> (surrogate) */
731 0, -1);
732 utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xB0\x80"),
733 ERROR, /* <Non Private Use High Surrogate, First> (surrogate) */
734 ERROR, /* <Low Surrogate, First> (surrogate) */
735 0, -1);
736 utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xBF\xBF"),
737 ERROR, /* <Non Private Use High Surrogate, First> (surrogate) */
738 ERROR, /* <Low Surrogate, Last> (surrogate) */
739 0, -1);
740 utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xB0\x80"),
741 ERROR, /* <Non Private Use High Surrogate, Last> (surrogate) */
742 ERROR, /* <Low Surrogate, First> (surrogate) */
743 0, -1);
744 utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xBF\xBF"),
745 ERROR, /* <Non Private Use High Surrogate, Last> (surrogate) */
746 ERROR, /* <Low Surrogate, Last> (surrogate) */
747 0, -1);
748 utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xB0\x80"),
749 ERROR, /* <Private Use High Surrogate, First> (surrogate) */
750 ERROR, /* <Low Surrogate, First> (surrogate) */
751 0, -1);
752 utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xBF\xBF"),
753 ERROR, /* <Private Use High Surrogate, First> (surrogate) */
754 ERROR, /* <Low Surrogate, Last> (surrogate) */
755 0, -1);
756 utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xB0\x80"),
757 ERROR, /* <Private Use High Surrogate, Last> (surrogate) */
758 ERROR, /* <Low Surrogate, First> (surrogate) */
759 0, -1);
760 utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xBF\xBF"),
761 ERROR, /* <Private Use High Surrogate, Last> (surrogate) */
762 ERROR, /* <Low Surrogate, Last> (surrogate) */
763 0, -1);
764 utf8_read_test(TESTSTR("\xEF\xBF\xBE"),
765 ERROR, /* <no name available> (invalid char) */
766 0, -1);
767 utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
768 ERROR, /* <no name available> (invalid char) */
769 0, -1);
770 printf("read tests completed\n");
771 printf("write tests beginning\n");
772 {
773 const static long str[] =
774 {0x03BAL, 0x1F79L, 0x03C3L, 0x03BCL, 0x03B5L, 0};
775 utf8_write_test(TESTSTR(str),
776 0xCE, 0xBA,
777 0xE1, 0xBD, 0xB9,
778 0xCF, 0x83,
779 0xCE, 0xBC,
780 0xCE, 0xB5,
781 0, -1);
782 }
783 {
784 const static long str[] = {0x0000L, 0};
785 utf8_write_test(TESTSTR(str),
786 0x00,
787 0, -1);
788 }
789 {
790 const static long str[] = {0x0080L, 0};
791 utf8_write_test(TESTSTR(str),
792 0xC2, 0x80,
793 0, -1);
794 }
795 {
796 const static long str[] = {0x0800L, 0};
797 utf8_write_test(TESTSTR(str),
798 0xE0, 0xA0, 0x80,
799 0, -1);
800 }
801 {
802 const static long str[] = {0x00010000L, 0};
803 utf8_write_test(TESTSTR(str),
804 0xF0, 0x90, 0x80, 0x80,
805 0, -1);
806 }
807 {
808 const static long str[] = {0x00200000L, 0};
809 utf8_write_test(TESTSTR(str),
810 0xF8, 0x88, 0x80, 0x80, 0x80,
811 0, -1);
812 }
813 {
814 const static long str[] = {0x04000000L, 0};
815 utf8_write_test(TESTSTR(str),
816 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80,
817 0, -1);
818 }
819 {
820 const static long str[] = {0x007FL, 0};
821 utf8_write_test(TESTSTR(str),
822 0x7F,
823 0, -1);
824 }
825 {
826 const static long str[] = {0x07FFL, 0};
827 utf8_write_test(TESTSTR(str),
828 0xDF, 0xBF,
829 0, -1);
830 }
831 {
832 const static long str[] = {0xFFFDL, 0};
833 utf8_write_test(TESTSTR(str),
834 0xEF, 0xBF, 0xBD,
835 0, -1);
836 }
837 {
838 const static long str[] = {0xFFFFL, 0};
839 utf8_write_test(TESTSTR(str),
840 ERROR,
841 0, -1);
842 }
843 {
844 const static long str[] = {0x001FFFFFL, 0};
845 utf8_write_test(TESTSTR(str),
846 0xF7, 0xBF, 0xBF, 0xBF,
847 0, -1);
848 }
849 {
850 const static long str[] = {0x03FFFFFFL, 0};
851 utf8_write_test(TESTSTR(str),
852 0xFB, 0xBF, 0xBF, 0xBF, 0xBF,
853 0, -1);
854 }
855 {
856 const static long str[] = {0x7FFFFFFFL, 0};
857 utf8_write_test(TESTSTR(str),
858 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,
859 0, -1);
860 }
861 {
862 const static long str[] = {0xD7FFL, 0};
863 utf8_write_test(TESTSTR(str),
864 0xED, 0x9F, 0xBF,
865 0, -1);
866 }
867 {
868 const static long str[] = {0xD800L, 0};
869 utf8_write_test(TESTSTR(str),
870 ERROR,
871 0, -1);
872 }
873 {
874 const static long str[] = {0xD800L, 0xDC00L, 0};
875 utf8_write_test(TESTSTR(str),
876 ERROR,
877 ERROR,
878 0, -1);
879 }
880 {
881 const static long str[] = {0xDFFFL, 0};
882 utf8_write_test(TESTSTR(str),
883 ERROR,
884 0, -1);
885 }
886 {
887 const static long str[] = {0xE000L, 0};
888 utf8_write_test(TESTSTR(str),
889 0xEE, 0x80, 0x80,
890 0, -1);
891 }
892 printf("write tests completed\n");
893
894 printf("total: %d errors\n", total_errs);
895 return (total_errs != 0);
896}
897#endif /* TESTMODE */
898
/*
 * The charset_spec entry for UTF-8: its identifier CS_UTF8, followed
 * by the reader and writer functions defined above. The final NULL
 * is presumably the per-charset data pointer, which UTF-8 does not
 * need (both functions ignore their `charset' argument) -- TODO
 * confirm against the charset_spec declaration.
 */
899const charset_spec charset_CS_UTF8 = {
900 CS_UTF8, read_utf8, write_utf8, NULL
901};
902
903#else /* ENUM_CHARSETS */
904
905ENUM_CHARSET(CS_UTF8)
906
907#endif /* ENUM_CHARSETS */