Sebastian Kuschel reports that pfd_closing can be called for a socket
[u/mdw/putty] / charset / utf8.c
1 /*
2 * utf8.c - routines to handle UTF-8.
3 */
4
5 #ifndef ENUM_CHARSETS
6
7 #include "charset.h"
8 #include "internal.h"
9
10 void read_utf8(charset_spec const *, long int, charset_state *,
11 void (*)(void *, long int), void *);
12 void write_utf8(charset_spec const *, long int,
13 charset_state *, void (*)(void *, long int), void *);
14
15 /*
16 * UTF-8 has no associated data, so `charset' may be ignored.
17 */
18
19 void read_utf8(charset_spec const *charset, long int input_chr,
20 charset_state *state,
21 void (*emit)(void *ctx, long int output), void *emitctx)
22 {
23 UNUSEDARG(charset);
24
25 /*
26 * For reading UTF-8, the `state' word contains:
27 *
28 * - in bits 29-31, the number of bytes expected to be in the
29 * current multibyte character (which we can tell instantly
30 * from the first byte, of course).
31 *
32 * - in bits 26-28, the number of bytes _seen so far_ in the
33 * current multibyte character.
34 *
35 * - in the remainder of the word, the current value of the
36 * character, which is shifted upwards by 6 bits to
37 * accommodate each new byte.
38 *
39 * As required, the state is zero when we are not in the middle
40 * of a multibyte character at all.
41 *
42 * For example, when reading E9 8D 8B, starting at state=0:
43 *
44 * - after E9, the state is 0x64000009
45 * - after 8D, the state is 0x6800024d
46 * - after 8B, the state conceptually becomes 0x6c00934b, at
47 * which point we notice we've got as many characters as we
48 * were expecting, output U+934B, and reset the state to
49 * zero.
50 *
51 * Note that the maximum number of bits we might need to store
52 * in the character value field is 25 (U+7FFFFFFF contains 31
53 * bits, but we will never actually store its full value
54 * because when we receive the last 6 bits in the final
55 * continuation byte we will output it and revert the state to
56 * zero). Hence the character value field never collides with
57 * the byte counts.
58 */
59
60 if (input_chr < 0x80) {
61 /*
62 * Single-byte character. If the state is nonzero before
63 * coming here, output an error for an incomplete sequence.
64 * Then output the character.
65 */
66 if (state->s0 != 0) {
67 emit(emitctx, ERROR);
68 state->s0 = 0;
69 }
70 emit(emitctx, input_chr);
71 } else if (input_chr == 0xFE || input_chr == 0xFF) {
72 /*
73 * FE and FF bytes should _never_ occur in UTF-8. They are
74 * automatic errors; if the state was nonzero to start
75 * with, output a further error for an incomplete sequence.
76 */
77 if (state->s0 != 0) {
78 emit(emitctx, ERROR);
79 state->s0 = 0;
80 }
81 emit(emitctx, ERROR);
82 } else if (input_chr >= 0x80 && input_chr < 0xC0) {
83 /*
84 * Continuation byte. Output an error for an unexpected
85 * continuation byte, if the state is zero.
86 */
87 if (state->s0 == 0) {
88 emit(emitctx, ERROR);
89 } else {
90 unsigned long charval;
91 unsigned long topstuff;
92 int bytes;
93
94 /*
95 * Otherwise, accumulate more of the character value.
96 */
97 charval = state->s0 & 0x03ffffffL;
98 charval = (charval << 6) | (input_chr & 0x3F);
99
100 /*
101 * Check the byte counts; if we have not reached the
102 * end of the character, update the state and return.
103 */
104 topstuff = state->s0 & 0xfc000000L;
105 topstuff += 0x04000000L; /* add one to the byte count */
106 if (((topstuff << 3) ^ topstuff) & 0xe0000000L) {
107 state->s0 = topstuff | charval;
108 return;
109 }
110
111 /*
112 * Now we know we've reached the end of the character.
113 * `charval' is the Unicode value. We should check for
114 * various invalid things, and then either output
115 * charval or an error. In all cases we reset the state
116 * to zero.
117 */
118 bytes = topstuff >> 29;
119 state->s0 = 0;
120
121 if (charval >= 0xD800 && charval < 0xE000) {
122 /*
123 * Surrogates (0xD800-0xDFFF) may never be encoded
124 * in UTF-8. A surrogate pair in Unicode should
125 * have been encoded as a single UTF-8 character
126 * occupying more than three bytes.
127 */
128 emit(emitctx, ERROR);
129 } else if (charval == 0xFFFE || charval == 0xFFFF) {
130 /*
131 * U+FFFE and U+FFFF are invalid Unicode characters
132 * and may never be encoded in UTF-8. (This is one
133 * reason why U+FFFF is our way of signalling an
134 * error to our `emit' function :-)
135 */
136 emit(emitctx, ERROR);
137 } else if ((charval <= 0x7FL /* && bytes > 1 */) ||
138 (charval <= 0x7FFL && bytes > 2) ||
139 (charval <= 0xFFFFL && bytes > 3) ||
140 (charval <= 0x1FFFFFL && bytes > 4) ||
141 (charval <= 0x3FFFFFFL && bytes > 5)) {
142 /*
143 * Overlong sequences are not to be tolerated,
144 * under any circumstances.
145 */
146 emit(emitctx, ERROR);
147 } else {
148 /*
149 * Oh, all right. We'll let this one off.
150 */
151 emit(emitctx, charval);
152 }
153 }
154
155 } else {
156 /*
157 * Lead byte. First output an error for an incomplete
158 * sequence, if the state is nonzero.
159 */
160 if (state->s0 != 0)
161 emit(emitctx, ERROR);
162
163 /*
164 * Now deal with the lead byte: work out the number of
165 * bytes we expect to see in this character, and extract
166 * the initial bits of it too.
167 */
168 if (input_chr >= 0xC0 && input_chr < 0xE0) {
169 state->s0 = 0x44000000L | (input_chr & 0x1F);
170 } else if (input_chr >= 0xE0 && input_chr < 0xF0) {
171 state->s0 = 0x64000000L | (input_chr & 0x0F);
172 } else if (input_chr >= 0xF0 && input_chr < 0xF8) {
173 state->s0 = 0x84000000L | (input_chr & 0x07);
174 } else if (input_chr >= 0xF8 && input_chr < 0xFC) {
175 state->s0 = 0xa4000000L | (input_chr & 0x03);
176 } else if (input_chr >= 0xFC && input_chr < 0xFE) {
177 state->s0 = 0xc4000000L | (input_chr & 0x01);
178 }
179 }
180 }
181
182 /*
183 * UTF-8 is a stateless multi-byte encoding (in the sense that just
184 * after any character has been completed, the state is always the
185 * same); hence when writing it, there is no need to use the
186 * charset_state.
187 */
188
189 void write_utf8(charset_spec const *charset, long int input_chr,
190 charset_state *state,
191 void (*emit)(void *ctx, long int output), void *emitctx)
192 {
193 UNUSEDARG(charset);
194 UNUSEDARG(state);
195
196 /*
197 * Refuse to output any illegal code points.
198 */
199 if (input_chr == 0xFFFE || input_chr == 0xFFFF ||
200 (input_chr >= 0xD800 && input_chr < 0xE000)) {
201 emit(emitctx, ERROR);
202 } else if (input_chr < 0x80) { /* one-byte character */
203 emit(emitctx, input_chr);
204 } else if (input_chr < 0x800) { /* two-byte character */
205 emit(emitctx, 0xC0 | (0x1F & (input_chr >> 6)));
206 emit(emitctx, 0x80 | (0x3F & (input_chr )));
207 } else if (input_chr < 0x10000) { /* three-byte character */
208 emit(emitctx, 0xE0 | (0x0F & (input_chr >> 12)));
209 emit(emitctx, 0x80 | (0x3F & (input_chr >> 6)));
210 emit(emitctx, 0x80 | (0x3F & (input_chr )));
211 } else if (input_chr < 0x200000) { /* four-byte character */
212 emit(emitctx, 0xF0 | (0x07 & (input_chr >> 18)));
213 emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
214 emit(emitctx, 0x80 | (0x3F & (input_chr >> 6)));
215 emit(emitctx, 0x80 | (0x3F & (input_chr )));
216 } else if (input_chr < 0x4000000) {/* five-byte character */
217 emit(emitctx, 0xF8 | (0x03 & (input_chr >> 24)));
218 emit(emitctx, 0x80 | (0x3F & (input_chr >> 18)));
219 emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
220 emit(emitctx, 0x80 | (0x3F & (input_chr >> 6)));
221 emit(emitctx, 0x80 | (0x3F & (input_chr )));
222 } else { /* six-byte character */
223 emit(emitctx, 0xFC | (0x01 & (input_chr >> 30)));
224 emit(emitctx, 0x80 | (0x3F & (input_chr >> 24)));
225 emit(emitctx, 0x80 | (0x3F & (input_chr >> 18)));
226 emit(emitctx, 0x80 | (0x3F & (input_chr >> 12)));
227 emit(emitctx, 0x80 | (0x3F & (input_chr >> 6)));
228 emit(emitctx, 0x80 | (0x3F & (input_chr )));
229 }
230 }
231
232 #ifdef TESTMODE
233
234 #include <stdio.h>
235 #include <stdarg.h>
236
237 int total_errs = 0;
238
/*
 * Output callback used by the self-tests. `ctx' points at a
 * `wchar_t *' write cursor: the value is stored where the cursor
 * points and the cursor is then advanced by one element. No bounds
 * checking is done here; the caller must supply a large enough
 * buffer.
 */
void utf8_emit(void *ctx, long output)
{
    wchar_t **cursor = ctx;
    wchar_t *slot = *cursor;

    *slot = output;
    *cursor = slot + 1;
}
244
245 void utf8_read_test(int line, char *input, int inlen, ...)
246 {
247 va_list ap;
248 wchar_t *p, str[512];
249 int i;
250 charset_state state;
251 unsigned long l;
252
253 state.s0 = 0;
254 p = str;
255
256 for (i = 0; i < inlen; i++)
257 read_utf8(NULL, input[i] & 0xFF, &state, utf8_emit, &p);
258
259 va_start(ap, inlen);
260 l = 0;
261 for (i = 0; i < p - str; i++) {
262 l = va_arg(ap, long int);
263 if (l == -1) {
264 printf("%d: correct string shorter than output\n", line);
265 total_errs++;
266 break;
267 }
268 if (l != str[i]) {
269 printf("%d: char %d came out as %08x, should be %08x\n",
270 line, i, str[i], l);
271 total_errs++;
272 }
273 }
274 if (l != -1) {
275 l = va_arg(ap, long int);
276 if (l != -1) {
277 printf("%d: correct string longer than output\n", line);
278 total_errs++;
279 }
280 }
281 va_end(ap);
282 }
283
284 void utf8_write_test(int line, const long *input, int inlen, ...)
285 {
286 va_list ap;
287 wchar_t *p, str[512];
288 int i;
289 charset_state state;
290 unsigned long l;
291
292 state.s0 = 0;
293 p = str;
294
295 for (i = 0; i < inlen; i++)
296 write_utf8(NULL, input[i], &state, utf8_emit, &p);
297
298 va_start(ap, inlen);
299 l = 0;
300 for (i = 0; i < p - str; i++) {
301 l = va_arg(ap, long int);
302 if (l == -1) {
303 printf("%d: correct string shorter than output\n", line);
304 total_errs++;
305 break;
306 }
307 if (l != str[i]) {
308 printf("%d: char %d came out as %08x, should be %08x\n",
309 line, i, str[i], l);
310 total_errs++;
311 }
312 }
313 if (l != -1) {
314 l = va_arg(ap, long int);
315 if (l != -1) {
316 printf("%d: correct string longer than output\n", line);
317 total_errs++;
318 }
319 }
320 va_end(ap);
321 }
322
323 /* Macro to concoct the first three parameters of utf8_read_test. */
/*
 * The same macro is also used for utf8_write_test: it expands to the
 * current source line, the array itself, and lenof(array).
 * NOTE(review): lenof() is defined outside this file; the expected
 * lists in main() all end with an extra 0, which suggests the count
 * includes the terminating element -- confirm.
 */
324 #define TESTSTR(x) __LINE__, x, lenof(x)
325
/*
 * Self-test driver (this part of the file is only compiled when
 * TESTMODE is defined).
 *
 * Runs a table of decoder cases through utf8_read_test and a table of
 * encoder cases through utf8_write_test, prints the total number of
 * mismatches, and returns nonzero if there were any.
 *
 * Each call lists the expected output values after TESTSTR(...) and
 * ends the list with -1, the sentinel both helpers look for. The 0
 * just before the sentinel is the expected output for the final 0
 * element of the input (presumably counted by lenof -- confirm).
 *
 * NOTE(review): the expected values are written as plain int
 * constants (for example 0x000003BA, 0 and -1), while the helpers
 * fetch them with va_arg(ap, long int). Confirm that this is safe on
 * targets where int and long differ in size.
 */
326 int main(void)
327 {
328 printf("read tests beginning\n");
/* A short run of ordinary two- and three-byte characters. */
329 utf8_read_test(TESTSTR("\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5"),
330 0x000003BA, /* GREEK SMALL LETTER KAPPA */
331 0x00001F79, /* GREEK SMALL LETTER OMICRON WITH OXIA */
332 0x000003C3, /* GREEK SMALL LETTER SIGMA */
333 0x000003BC, /* GREEK SMALL LETTER MU */
334 0x000003B5, /* GREEK SMALL LETTER EPSILON */
335 0, -1);
/* The lowest code point of each sequence length, one to six bytes. */
336 utf8_read_test(TESTSTR("\x00"),
337 0x00000000, /* <control> */
338 0, -1);
339 utf8_read_test(TESTSTR("\xC2\x80"),
340 0x00000080, /* <control> */
341 0, -1);
342 utf8_read_test(TESTSTR("\xE0\xA0\x80"),
343 0x00000800, /* <no name available> */
344 0, -1);
345 utf8_read_test(TESTSTR("\xF0\x90\x80\x80"),
346 0x00010000, /* <no name available> */
347 0, -1);
348 utf8_read_test(TESTSTR("\xF8\x88\x80\x80\x80"),
349 0x00200000, /* <no name available> */
350 0, -1);
351 utf8_read_test(TESTSTR("\xFC\x84\x80\x80\x80\x80"),
352 0x04000000, /* <no name available> */
353 0, -1);
/* The highest code point of each sequence length (U+FFFF is rejected). */
354 utf8_read_test(TESTSTR("\x7F"),
355 0x0000007F, /* <control> */
356 0, -1);
357 utf8_read_test(TESTSTR("\xDF\xBF"),
358 0x000007FF, /* <no name available> */
359 0, -1);
360 utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
361 0x0000FFFD, /* REPLACEMENT CHARACTER */
362 0, -1);
363 utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
364 ERROR, /* <no name available> (invalid char) */
365 0, -1);
366 utf8_read_test(TESTSTR("\xF7\xBF\xBF\xBF"),
367 0x001FFFFF, /* <no name available> */
368 0, -1);
369 utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF\xBF"),
370 0x03FFFFFF, /* <no name available> */
371 0, -1);
372 utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF\xBF"),
373 0x7FFFFFFF, /* <no name available> */
374 0, -1);
/* Other boundary values: around the surrogate range and U+10FFFF. */
375 utf8_read_test(TESTSTR("\xED\x9F\xBF"),
376 0x0000D7FF, /* <no name available> */
377 0, -1);
378 utf8_read_test(TESTSTR("\xEE\x80\x80"),
379 0x0000E000, /* <Private Use, First> */
380 0, -1);
381 utf8_read_test(TESTSTR("\xEF\xBF\xBD"),
382 0x0000FFFD, /* REPLACEMENT CHARACTER */
383 0, -1);
384 utf8_read_test(TESTSTR("\xF4\x8F\xBF\xBF"),
385 0x0010FFFF, /* <no name available> */
386 0, -1);
387 utf8_read_test(TESTSTR("\xF4\x90\x80\x80"),
388 0x00110000, /* <no name available> */
389 0, -1);
/* Continuation bytes with no lead byte: one error per byte. */
390 utf8_read_test(TESTSTR("\x80"),
391 ERROR, /* (unexpected continuation byte) */
392 0, -1);
393 utf8_read_test(TESTSTR("\xBF"),
394 ERROR, /* (unexpected continuation byte) */
395 0, -1);
396 utf8_read_test(TESTSTR("\x80\xBF"),
397 ERROR, /* (unexpected continuation byte) */
398 ERROR, /* (unexpected continuation byte) */
399 0, -1);
400 utf8_read_test(TESTSTR("\x80\xBF\x80"),
401 ERROR, /* (unexpected continuation byte) */
402 ERROR, /* (unexpected continuation byte) */
403 ERROR, /* (unexpected continuation byte) */
404 0, -1);
405 utf8_read_test(TESTSTR("\x80\xBF\x80\xBF"),
406 ERROR, /* (unexpected continuation byte) */
407 ERROR, /* (unexpected continuation byte) */
408 ERROR, /* (unexpected continuation byte) */
409 ERROR, /* (unexpected continuation byte) */
410 0, -1);
411 utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80"),
412 ERROR, /* (unexpected continuation byte) */
413 ERROR, /* (unexpected continuation byte) */
414 ERROR, /* (unexpected continuation byte) */
415 ERROR, /* (unexpected continuation byte) */
416 ERROR, /* (unexpected continuation byte) */
417 0, -1);
418 utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF"),
419 ERROR, /* (unexpected continuation byte) */
420 ERROR, /* (unexpected continuation byte) */
421 ERROR, /* (unexpected continuation byte) */
422 ERROR, /* (unexpected continuation byte) */
423 ERROR, /* (unexpected continuation byte) */
424 ERROR, /* (unexpected continuation byte) */
425 0, -1);
426 utf8_read_test(TESTSTR("\x80\xBF\x80\xBF\x80\xBF\x80"),
427 ERROR, /* (unexpected continuation byte) */
428 ERROR, /* (unexpected continuation byte) */
429 ERROR, /* (unexpected continuation byte) */
430 ERROR, /* (unexpected continuation byte) */
431 ERROR, /* (unexpected continuation byte) */
432 ERROR, /* (unexpected continuation byte) */
433 ERROR, /* (unexpected continuation byte) */
434 0, -1);
/* Every continuation byte value, 0x80 to 0xBF, in one string. */
435 utf8_read_test(TESTSTR("\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"),
436 ERROR, /* (unexpected continuation byte) */
437 ERROR, /* (unexpected continuation byte) */
438 ERROR, /* (unexpected continuation byte) */
439 ERROR, /* (unexpected continuation byte) */
440 ERROR, /* (unexpected continuation byte) */
441 ERROR, /* (unexpected continuation byte) */
442 ERROR, /* (unexpected continuation byte) */
443 ERROR, /* (unexpected continuation byte) */
444 ERROR, /* (unexpected continuation byte) */
445 ERROR, /* (unexpected continuation byte) */
446 ERROR, /* (unexpected continuation byte) */
447 ERROR, /* (unexpected continuation byte) */
448 ERROR, /* (unexpected continuation byte) */
449 ERROR, /* (unexpected continuation byte) */
450 ERROR, /* (unexpected continuation byte) */
451 ERROR, /* (unexpected continuation byte) */
452 ERROR, /* (unexpected continuation byte) */
453 ERROR, /* (unexpected continuation byte) */
454 ERROR, /* (unexpected continuation byte) */
455 ERROR, /* (unexpected continuation byte) */
456 ERROR, /* (unexpected continuation byte) */
457 ERROR, /* (unexpected continuation byte) */
458 ERROR, /* (unexpected continuation byte) */
459 ERROR, /* (unexpected continuation byte) */
460 ERROR, /* (unexpected continuation byte) */
461 ERROR, /* (unexpected continuation byte) */
462 ERROR, /* (unexpected continuation byte) */
463 ERROR, /* (unexpected continuation byte) */
464 ERROR, /* (unexpected continuation byte) */
465 ERROR, /* (unexpected continuation byte) */
466 ERROR, /* (unexpected continuation byte) */
467 ERROR, /* (unexpected continuation byte) */
468 ERROR, /* (unexpected continuation byte) */
469 ERROR, /* (unexpected continuation byte) */
470 ERROR, /* (unexpected continuation byte) */
471 ERROR, /* (unexpected continuation byte) */
472 ERROR, /* (unexpected continuation byte) */
473 ERROR, /* (unexpected continuation byte) */
474 ERROR, /* (unexpected continuation byte) */
475 ERROR, /* (unexpected continuation byte) */
476 ERROR, /* (unexpected continuation byte) */
477 ERROR, /* (unexpected continuation byte) */
478 ERROR, /* (unexpected continuation byte) */
479 ERROR, /* (unexpected continuation byte) */
480 ERROR, /* (unexpected continuation byte) */
481 ERROR, /* (unexpected continuation byte) */
482 ERROR, /* (unexpected continuation byte) */
483 ERROR, /* (unexpected continuation byte) */
484 ERROR, /* (unexpected continuation byte) */
485 ERROR, /* (unexpected continuation byte) */
486 ERROR, /* (unexpected continuation byte) */
487 ERROR, /* (unexpected continuation byte) */
488 ERROR, /* (unexpected continuation byte) */
489 ERROR, /* (unexpected continuation byte) */
490 ERROR, /* (unexpected continuation byte) */
491 ERROR, /* (unexpected continuation byte) */
492 ERROR, /* (unexpected continuation byte) */
493 ERROR, /* (unexpected continuation byte) */
494 ERROR, /* (unexpected continuation byte) */
495 ERROR, /* (unexpected continuation byte) */
496 ERROR, /* (unexpected continuation byte) */
497 ERROR, /* (unexpected continuation byte) */
498 ERROR, /* (unexpected continuation byte) */
499 ERROR, /* (unexpected continuation byte) */
500 0, -1);
/*
 * Every lead byte value, each followed directly by a space: the
 * abandoned sequence is an error and the space still comes through.
 */
501 utf8_read_test(TESTSTR("\xC0\x20\xC1\x20\xC2\x20\xC3\x20\xC4\x20\xC5\x20\xC6\x20\xC7\x20"),
502 ERROR, /* (incomplete sequence) */
503 0x00000020, /* SPACE */
504 ERROR, /* (incomplete sequence) */
505 0x00000020, /* SPACE */
506 ERROR, /* (incomplete sequence) */
507 0x00000020, /* SPACE */
508 ERROR, /* (incomplete sequence) */
509 0x00000020, /* SPACE */
510 ERROR, /* (incomplete sequence) */
511 0x00000020, /* SPACE */
512 ERROR, /* (incomplete sequence) */
513 0x00000020, /* SPACE */
514 ERROR, /* (incomplete sequence) */
515 0x00000020, /* SPACE */
516 ERROR, /* (incomplete sequence) */
517 0x00000020, /* SPACE */
518 0, -1);
519 utf8_read_test(TESTSTR("\xE0\x20\xE1\x20\xE2\x20\xE3\x20\xE4\x20\xE5\x20\xE6\x20\xE7\x20\xE8\x20\xE9\x20\xEA\x20\xEB\x20\xEC\x20\xED\x20\xEE\x20\xEF\x20"),
520 ERROR, /* (incomplete sequence) */
521 0x00000020, /* SPACE */
522 ERROR, /* (incomplete sequence) */
523 0x00000020, /* SPACE */
524 ERROR, /* (incomplete sequence) */
525 0x00000020, /* SPACE */
526 ERROR, /* (incomplete sequence) */
527 0x00000020, /* SPACE */
528 ERROR, /* (incomplete sequence) */
529 0x00000020, /* SPACE */
530 ERROR, /* (incomplete sequence) */
531 0x00000020, /* SPACE */
532 ERROR, /* (incomplete sequence) */
533 0x00000020, /* SPACE */
534 ERROR, /* (incomplete sequence) */
535 0x00000020, /* SPACE */
536 ERROR, /* (incomplete sequence) */
537 0x00000020, /* SPACE */
538 ERROR, /* (incomplete sequence) */
539 0x00000020, /* SPACE */
540 ERROR, /* (incomplete sequence) */
541 0x00000020, /* SPACE */
542 ERROR, /* (incomplete sequence) */
543 0x00000020, /* SPACE */
544 ERROR, /* (incomplete sequence) */
545 0x00000020, /* SPACE */
546 ERROR, /* (incomplete sequence) */
547 0x00000020, /* SPACE */
548 ERROR, /* (incomplete sequence) */
549 0x00000020, /* SPACE */
550 ERROR, /* (incomplete sequence) */
551 0x00000020, /* SPACE */
552 0, -1);
553 utf8_read_test(TESTSTR("\xF0\x20\xF1\x20\xF2\x20\xF3\x20\xF4\x20\xF5\x20\xF6\x20\xF7\x20"),
554 ERROR, /* (incomplete sequence) */
555 0x00000020, /* SPACE */
556 ERROR, /* (incomplete sequence) */
557 0x00000020, /* SPACE */
558 ERROR, /* (incomplete sequence) */
559 0x00000020, /* SPACE */
560 ERROR, /* (incomplete sequence) */
561 0x00000020, /* SPACE */
562 ERROR, /* (incomplete sequence) */
563 0x00000020, /* SPACE */
564 ERROR, /* (incomplete sequence) */
565 0x00000020, /* SPACE */
566 ERROR, /* (incomplete sequence) */
567 0x00000020, /* SPACE */
568 ERROR, /* (incomplete sequence) */
569 0x00000020, /* SPACE */
570 0, -1);
571 utf8_read_test(TESTSTR("\xF8\x20\xF9\x20\xFA\x20\xFB\x20"),
572 ERROR, /* (incomplete sequence) */
573 0x00000020, /* SPACE */
574 ERROR, /* (incomplete sequence) */
575 0x00000020, /* SPACE */
576 ERROR, /* (incomplete sequence) */
577 0x00000020, /* SPACE */
578 ERROR, /* (incomplete sequence) */
579 0x00000020, /* SPACE */
580 0, -1);
581 utf8_read_test(TESTSTR("\xFC\x20\xFD\x20"),
582 ERROR, /* (incomplete sequence) */
583 0x00000020, /* SPACE */
584 ERROR, /* (incomplete sequence) */
585 0x00000020, /* SPACE */
586 0, -1);
/*
 * Sequences missing their final byte. The terminating 0 of the test
 * string is what cuts each one short.
 */
587 utf8_read_test(TESTSTR("\xC0"),
588 ERROR, /* (incomplete sequence) */
589 0, -1);
590 utf8_read_test(TESTSTR("\xE0\x80"),
591 ERROR, /* (incomplete sequence) */
592 0, -1);
593 utf8_read_test(TESTSTR("\xF0\x80\x80"),
594 ERROR, /* (incomplete sequence) */
595 0, -1);
596 utf8_read_test(TESTSTR("\xF8\x80\x80\x80"),
597 ERROR, /* (incomplete sequence) */
598 0, -1);
599 utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80"),
600 ERROR, /* (incomplete sequence) */
601 0, -1);
602 utf8_read_test(TESTSTR("\xDF"),
603 ERROR, /* (incomplete sequence) */
604 0, -1);
605 utf8_read_test(TESTSTR("\xEF\xBF"),
606 ERROR, /* (incomplete sequence) */
607 0, -1);
608 utf8_read_test(TESTSTR("\xF7\xBF\xBF"),
609 ERROR, /* (incomplete sequence) */
610 0, -1);
611 utf8_read_test(TESTSTR("\xFB\xBF\xBF\xBF"),
612 ERROR, /* (incomplete sequence) */
613 0, -1);
614 utf8_read_test(TESTSTR("\xFD\xBF\xBF\xBF\xBF"),
615 ERROR, /* (incomplete sequence) */
616 0, -1);
/* The ten truncated sequences above, run together in one string. */
617 utf8_read_test(TESTSTR("\xC0\xE0\x80\xF0\x80\x80\xF8\x80\x80\x80\xFC\x80\x80\x80\x80\xDF\xEF\xBF\xF7\xBF\xBF\xFB\xBF\xBF\xBF\xFD\xBF\xBF\xBF\xBF"),
618 ERROR, /* (incomplete sequence) */
619 ERROR, /* (incomplete sequence) */
620 ERROR, /* (incomplete sequence) */
621 ERROR, /* (incomplete sequence) */
622 ERROR, /* (incomplete sequence) */
623 ERROR, /* (incomplete sequence) */
624 ERROR, /* (incomplete sequence) */
625 ERROR, /* (incomplete sequence) */
626 ERROR, /* (incomplete sequence) */
627 ERROR, /* (incomplete sequence) */
628 0, -1);
/* The bytes FE and FF, which are never valid in UTF-8. */
629 utf8_read_test(TESTSTR("\xFE"),
630 ERROR, /* (invalid UTF-8 byte) */
631 0, -1);
632 utf8_read_test(TESTSTR("\xFF"),
633 ERROR, /* (invalid UTF-8 byte) */
634 0, -1);
635 utf8_read_test(TESTSTR("\xFE\xFE\xFF\xFF"),
636 ERROR, /* (invalid UTF-8 byte) */
637 ERROR, /* (invalid UTF-8 byte) */
638 ERROR, /* (invalid UTF-8 byte) */
639 ERROR, /* (invalid UTF-8 byte) */
640 0, -1);
/* Overlong encodings: each must yield a single error. */
641 utf8_read_test(TESTSTR("\xC0\xAF"),
642 ERROR, /* SOLIDUS (overlong form of 2F) */
643 0, -1);
644 utf8_read_test(TESTSTR("\xE0\x80\xAF"),
645 ERROR, /* SOLIDUS (overlong form of 2F) */
646 0, -1);
647 utf8_read_test(TESTSTR("\xF0\x80\x80\xAF"),
648 ERROR, /* SOLIDUS (overlong form of 2F) */
649 0, -1);
650 utf8_read_test(TESTSTR("\xF8\x80\x80\x80\xAF"),
651 ERROR, /* SOLIDUS (overlong form of 2F) */
652 0, -1);
653 utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\xAF"),
654 ERROR, /* SOLIDUS (overlong form of 2F) */
655 0, -1);
656 utf8_read_test(TESTSTR("\xC1\xBF"),
657 ERROR, /* <control> (overlong form of 7F) */
658 0, -1);
659 utf8_read_test(TESTSTR("\xE0\x9F\xBF"),
660 ERROR, /* <no name available> (overlong form of DF BF) */
661 0, -1);
662 utf8_read_test(TESTSTR("\xF0\x8F\xBF\xBF"),
663 ERROR, /* <no name available> (overlong form of EF BF BF) (invalid char) */
664 0, -1);
665 utf8_read_test(TESTSTR("\xF8\x87\xBF\xBF\xBF"),
666 ERROR, /* <no name available> (overlong form of F7 BF BF BF) */
667 0, -1);
668 utf8_read_test(TESTSTR("\xFC\x83\xBF\xBF\xBF\xBF"),
669 ERROR, /* <no name available> (overlong form of FB BF BF BF BF) */
670 0, -1);
671 utf8_read_test(TESTSTR("\xC0\x80"),
672 ERROR, /* <control> (overlong form of 00) */
673 0, -1);
674 utf8_read_test(TESTSTR("\xE0\x80\x80"),
675 ERROR, /* <control> (overlong form of 00) */
676 0, -1);
677 utf8_read_test(TESTSTR("\xF0\x80\x80\x80"),
678 ERROR, /* <control> (overlong form of 00) */
679 0, -1);
680 utf8_read_test(TESTSTR("\xF8\x80\x80\x80\x80"),
681 ERROR, /* <control> (overlong form of 00) */
682 0, -1);
683 utf8_read_test(TESTSTR("\xFC\x80\x80\x80\x80\x80"),
684 ERROR, /* <control> (overlong form of 00) */
685 0, -1);
/* Encoded surrogates, singly and in pairs: one error per sequence. */
686 utf8_read_test(TESTSTR("\xED\xA0\x80"),
687 ERROR, /* <Non Private Use High Surrogate, First> (surrogate) */
688 0, -1);
689 utf8_read_test(TESTSTR("\xED\xAD\xBF"),
690 ERROR, /* <Non Private Use High Surrogate, Last> (surrogate) */
691 0, -1);
692 utf8_read_test(TESTSTR("\xED\xAE\x80"),
693 ERROR, /* <Private Use High Surrogate, First> (surrogate) */
694 0, -1);
695 utf8_read_test(TESTSTR("\xED\xAF\xBF"),
696 ERROR, /* <Private Use High Surrogate, Last> (surrogate) */
697 0, -1);
698 utf8_read_test(TESTSTR("\xED\xB0\x80"),
699 ERROR, /* <Low Surrogate, First> (surrogate) */
700 0, -1);
701 utf8_read_test(TESTSTR("\xED\xBE\x80"),
702 ERROR, /* <no name available> (surrogate) */
703 0, -1);
704 utf8_read_test(TESTSTR("\xED\xBF\xBF"),
705 ERROR, /* <Low Surrogate, Last> (surrogate) */
706 0, -1);
707 utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xB0\x80"),
708 ERROR, /* <Non Private Use High Surrogate, First> (surrogate) */
709 ERROR, /* <Low Surrogate, First> (surrogate) */
710 0, -1);
711 utf8_read_test(TESTSTR("\xED\xA0\x80\xED\xBF\xBF"),
712 ERROR, /* <Non Private Use High Surrogate, First> (surrogate) */
713 ERROR, /* <Low Surrogate, Last> (surrogate) */
714 0, -1);
715 utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xB0\x80"),
716 ERROR, /* <Non Private Use High Surrogate, Last> (surrogate) */
717 ERROR, /* <Low Surrogate, First> (surrogate) */
718 0, -1);
719 utf8_read_test(TESTSTR("\xED\xAD\xBF\xED\xBF\xBF"),
720 ERROR, /* <Non Private Use High Surrogate, Last> (surrogate) */
721 ERROR, /* <Low Surrogate, Last> (surrogate) */
722 0, -1);
723 utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xB0\x80"),
724 ERROR, /* <Private Use High Surrogate, First> (surrogate) */
725 ERROR, /* <Low Surrogate, First> (surrogate) */
726 0, -1);
727 utf8_read_test(TESTSTR("\xED\xAE\x80\xED\xBF\xBF"),
728 ERROR, /* <Private Use High Surrogate, First> (surrogate) */
729 ERROR, /* <Low Surrogate, Last> (surrogate) */
730 0, -1);
731 utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xB0\x80"),
732 ERROR, /* <Private Use High Surrogate, Last> (surrogate) */
733 ERROR, /* <Low Surrogate, First> (surrogate) */
734 0, -1);
735 utf8_read_test(TESTSTR("\xED\xAF\xBF\xED\xBF\xBF"),
736 ERROR, /* <Private Use High Surrogate, Last> (surrogate) */
737 ERROR, /* <Low Surrogate, Last> (surrogate) */
738 0, -1);
/* The invalid characters U+FFFE and U+FFFF. */
739 utf8_read_test(TESTSTR("\xEF\xBF\xBE"),
740 ERROR, /* <no name available> (invalid char) */
741 0, -1);
742 utf8_read_test(TESTSTR("\xEF\xBF\xBF"),
743 ERROR, /* <no name available> (invalid char) */
744 0, -1);
745 printf("read tests completed\n");
/*
 * Encoder tests. Each block holds the code points to encode in a
 * local array, followed by the expected output values (bytes, or
 * ERROR for a refused code point).
 */
746 printf("write tests beginning\n");
747 {
748 const static long str[] =
749 {0x03BAL, 0x1F79L, 0x03C3L, 0x03BCL, 0x03B5L, 0};
750 utf8_write_test(TESTSTR(str),
751 0xCE, 0xBA,
752 0xE1, 0xBD, 0xB9,
753 0xCF, 0x83,
754 0xCE, 0xBC,
755 0xCE, 0xB5,
756 0, -1);
757 }
/* The lowest code point of each sequence length, one to six bytes. */
758 {
759 const static long str[] = {0x0000L, 0};
760 utf8_write_test(TESTSTR(str),
761 0x00,
762 0, -1);
763 }
764 {
765 const static long str[] = {0x0080L, 0};
766 utf8_write_test(TESTSTR(str),
767 0xC2, 0x80,
768 0, -1);
769 }
770 {
771 const static long str[] = {0x0800L, 0};
772 utf8_write_test(TESTSTR(str),
773 0xE0, 0xA0, 0x80,
774 0, -1);
775 }
776 {
777 const static long str[] = {0x00010000L, 0};
778 utf8_write_test(TESTSTR(str),
779 0xF0, 0x90, 0x80, 0x80,
780 0, -1);
781 }
782 {
783 const static long str[] = {0x00200000L, 0};
784 utf8_write_test(TESTSTR(str),
785 0xF8, 0x88, 0x80, 0x80, 0x80,
786 0, -1);
787 }
788 {
789 const static long str[] = {0x04000000L, 0};
790 utf8_write_test(TESTSTR(str),
791 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80,
792 0, -1);
793 }
/* The highest code point of each sequence length (U+FFFF is refused). */
794 {
795 const static long str[] = {0x007FL, 0};
796 utf8_write_test(TESTSTR(str),
797 0x7F,
798 0, -1);
799 }
800 {
801 const static long str[] = {0x07FFL, 0};
802 utf8_write_test(TESTSTR(str),
803 0xDF, 0xBF,
804 0, -1);
805 }
806 {
807 const static long str[] = {0xFFFDL, 0};
808 utf8_write_test(TESTSTR(str),
809 0xEF, 0xBF, 0xBD,
810 0, -1);
811 }
812 {
813 const static long str[] = {0xFFFFL, 0};
814 utf8_write_test(TESTSTR(str),
815 ERROR,
816 0, -1);
817 }
818 {
819 const static long str[] = {0x001FFFFFL, 0};
820 utf8_write_test(TESTSTR(str),
821 0xF7, 0xBF, 0xBF, 0xBF,
822 0, -1);
823 }
824 {
825 const static long str[] = {0x03FFFFFFL, 0};
826 utf8_write_test(TESTSTR(str),
827 0xFB, 0xBF, 0xBF, 0xBF, 0xBF,
828 0, -1);
829 }
830 {
831 const static long str[] = {0x7FFFFFFFL, 0};
832 utf8_write_test(TESTSTR(str),
833 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF,
834 0, -1);
835 }
/* Either side of, and inside, the surrogate range. */
836 {
837 const static long str[] = {0xD7FFL, 0};
838 utf8_write_test(TESTSTR(str),
839 0xED, 0x9F, 0xBF,
840 0, -1);
841 }
842 {
843 const static long str[] = {0xD800L, 0};
844 utf8_write_test(TESTSTR(str),
845 ERROR,
846 0, -1);
847 }
848 {
849 const static long str[] = {0xD800L, 0xDC00L, 0};
850 utf8_write_test(TESTSTR(str),
851 ERROR,
852 ERROR,
853 0, -1);
854 }
855 {
856 const static long str[] = {0xDFFFL, 0};
857 utf8_write_test(TESTSTR(str),
858 ERROR,
859 0, -1);
860 }
861 {
862 const static long str[] = {0xE000L, 0};
863 utf8_write_test(TESTSTR(str),
864 0xEE, 0x80, 0x80,
865 0, -1);
866 }
867 printf("write tests completed\n");
868
/* The exit status is nonzero exactly when some test failed. */
869 printf("total: %d errors\n", total_errs);
870 return (total_errs != 0);
871 }
872 #endif /* TESTMODE */
873
/*
 * The charset descriptor for UTF-8: its identifier CS_UTF8 followed
 * by the read and write routines defined in this file.
 * NOTE(review): charset_spec is declared outside this file; the
 * final NULL is presumably the per-charset data pointer, which
 * UTF-8 does not need -- confirm against the struct definition.
 */
874 const charset_spec charset_CS_UTF8 = {
875 CS_UTF8, read_utf8, write_utf8, NULL
876 };
877
878 #else /* ENUM_CHARSETS */
879
/*
 * When ENUM_CHARSETS is defined, everything above is skipped and the
 * file contributes only this one entry.
 * NOTE(review): ENUM_CHARSET is defined by whoever includes this
 * file, presumably to build a list of the available charsets --
 * confirm against the includer.
 */
880 ENUM_CHARSET(CS_UTF8)
881
882 #endif /* ENUM_CHARSETS */