c6d25d8d |
1 | /* |
2 | * iso2022s.c - support for ISO-2022 subset encodings. |
c6d25d8d |
3 | */ |
4 | |
5 | #ifndef ENUM_CHARSETS |
6 | |
7 | #include <stdio.h> |
8 | #include <string.h> |
9 | #include <assert.h> |
10 | |
11 | #include "charset.h" |
12 | #include "internal.h" |
01081d4e |
13 | #include "sbcsdat.h" |
c6d25d8d |
14 | |
15 | #define SO (0x0E) |
16 | #define SI (0x0F) |
17 | #define ESC (0x1B) |
18 | |
/*
 * One entry in an ISO 2022 subset's escape-sequence table: the bytes
 * that make up the sequence, plus a functional description of what
 * the sequence does to the codec state.
 */
struct iso2022_escape {
    /* The literal bytes of the sequence, as a NUL-terminated string. */
    char const *sequence;

    /*
     * Effect of the sequence on the s1 state word: s1 is first ANDed
     * with `andbits' and the result is then XORed with `xorbits'.
     */
    unsigned long andbits;
    unsigned long xorbits;

    /*
     * Output-side hints, used to work out which escape sequences
     * must be emitted to reach the state we want.
     *
     * `container' is normally in the range 0-3. It may additionally
     * be ORed with the RO flag, which marks the container as one we
     * would rather not use for this charset when writing.
     *
     * `subcharset' is the index of the sub-charset which this
     * sequence selects into that container.
     */
    int container;
    int subcharset;
};
#define RO 0x80
c6d25d8d |
34 | |
/*
 * Complete description of one ISO 2022 subset encoding: its escape
 * table, per-sub-charset byte widths, reset recipe, initial state,
 * and the functions that convert to and from Unicode.
 */
struct iso2022 {
    /*
     * The escape sequences this subset understands, and how many of
     * them there are. The table MUST be sorted in ASCII order: the
     * reader narrows down the candidates one input byte at a time
     * and depends on that ordering.
     */
    const struct iso2022_escape *escapes;
    int nescapes;

    /*
     * Sub-charsets of a subset are numbered from 0 upwards, and
     * nbytes[i] is the number of bytes per character in sub-charset
     * i. (This is a string rather than an int array mostly because
     * a string literal is the easier thing to write in C.)
     */
    char const *nbytes;

    /*
     * Recipe for resetting the encoding state on output. Each
     * character of this string is one plus an index into `escapes'
     * (plus one so that NUL can still act as the terminator). The
     * listed escapes are applied in order, but any escape which
     * would leave the state unchanged is not actually written out.
     */
    char const *reset;

    /*
     * Value s1 starts out with, for subsets whose default container
     * contents are not simply charset 0 everywhere. The top bit of
     * this value must be set.
     */
    unsigned long s1;

    /*
     * Some subsets _require_ a particular sequence at the very
     * start of the output; if so this is it, otherwise NULL. On
     * input we do not insist on seeing it, but it ought to appear
     * in `escapes' too so that it is swallowed when it does turn up.
     */
    char const *initial_sequence;

    /*
     * Nonzero if this is an 8-bit ISO 2022 subset.
     */
    int eightbit;

    /*
     * The functions which perform the actual character translation
     * for this subset.
     */
    long int (*to_ucs)(int subcharset, unsigned long bytes);
    int (*from_ucs)(long int ucs, int *subcharset, unsigned long *bytes);
};
89 | |
90 | static void read_iso2022s(charset_spec const *charset, long int input_chr, |
91 | charset_state *state, |
92 | void (*emit)(void *ctx, long int output), |
93 | void *emitctx) |
94 | { |
95 | struct iso2022 const *iso = (struct iso2022 *)charset->data; |
96 | |
97 | /* |
98 | * For reading ISO-2022 subsets, we divide up our state |
99 | * variables as follows: |
100 | * |
101 | * - The top byte of s0 (bits 31:24) indicates, if nonzero, |
102 | * that we are part-way through a recognised ISO-2022 escape |
103 | * sequence. Five of those bits (31:27) give the index of |
104 | * the first member of the escapes list matching what we |
105 | * have so far; the remaining three (26:24) give the number |
106 | * of characters we have seen so far. |
107 | * |
108 | * - The top bit of s1 (bit 31) is non-zero at all times, to |
109 | * indicate that we have performed any necessary |
110 | * initialisation. When we start, we detect a zero s1 and |
111 | * respond to it by initialising the default container |
112 | * contents. |
113 | * |
114 | * - The next three bits of s1 (bits 30:28) indicate which |
115 | * _container_ is currently selected. This isn't quite as |
116 | * simple as it sounds, since we have to preserve memory of |
117 | * which of the SI/SO containers we came from when we're |
118 | * temporarily in SS2/SS3. Hence, what happens is: |
119 | * + bit 28 indicates SI/SO. |
120 | * + if we're in an SS2/SS3 container, that's indicated by |
121 | * the two bits above that being nonzero and holding |
122 | * either 2 or 3. |
123 | * + Hence: 0 is SI, 1 is SO, 4 is SS2-from-SI, 5 is |
124 | * SS2-from-SO, 6 is SS3-from-SI, 7 is SS3-from-SO. |
01081d4e |
125 | * + For added fun: in an _8-bit_ ISO 2022 subset, we have |
126 | * the further special value 2, which means that we're |
127 | * theoretically in SI but the current character being |
128 | * accumulated is composed of 8-bit characters and will |
129 | * therefore be interpreted as if in SO. |
c6d25d8d |
130 | * |
131 | * - The next nibble of s1 (27:24) indicates how many bytes |
132 | * have been accumulated in the current character. |
133 | * |
134 | * - The remaining three bytes of s1 are divided into four |
135 | * six-bit sections, and each section gives the current |
136 | * sub-charset selected in one of the possible containers. |
137 | * (Those containers are SI, SO, SS2 and SS3, respectively |
138 | * and in order from the bottom of s0 to the top.) |
139 | * |
140 | * - The bottom 24 bits of s0 give the accumulated character |
141 | * data so far. |
142 | * |
143 | * (Note that this means s1 contains all the parts of the state |
144 | * which might need to be operated on by escape sequences. |
145 | * Cunning, eh?) |
146 | */ |
147 | |
148 | if (!(state->s1 & 0x80000000)) { |
149 | state->s1 = iso->s1; |
150 | } |
151 | |
152 | /* |
153 | * So. Firstly, we process escape sequences, if we're in the |
154 | * middle of one or if we see a possible introducer (SI, SO, |
155 | * ESC). |
156 | */ |
157 | if ((state->s0 >> 24) || |
158 | (input_chr == SO || input_chr == SI || input_chr == ESC)) { |
159 | int n = (state->s0 >> 24) & 7, i = (state->s0 >> 27), oi = i, j; |
160 | |
161 | /* |
162 | * If this is the start of an escape sequence, we might be |
163 | * in mid-character. If so, clear the character state and |
164 | * emit an error token for the incomplete character. |
165 | */ |
166 | if (state->s1 & 0x0F000000) { |
167 | state->s1 &= ~0x0F000000; |
168 | state->s0 &= 0xFF000000; |
169 | /* |
170 | * If we were in the SS2 or SS3 container, we |
171 | * automatically exit it. |
172 | */ |
173 | if (state->s1 & 0x60000000) |
174 | state->s1 &= 0x9FFFFFFF; |
175 | emit(emitctx, ERROR); |
176 | } |
177 | |
178 | j = i; |
179 | while (j < iso->nescapes && |
180 | !memcmp(iso->escapes[j].sequence, |
181 | iso->escapes[oi].sequence, n)) { |
182 | if (iso->escapes[j].sequence[n] < input_chr) |
183 | i = ++j; |
184 | else |
185 | break; |
186 | } |
187 | if (i >= iso->nescapes || |
188 | memcmp(iso->escapes[i].sequence, |
189 | iso->escapes[oi].sequence, n) || |
190 | iso->escapes[i].sequence[n] != input_chr) { |
191 | /* |
192 | * This character does not appear in any valid escape |
193 | * sequence. Therefore, we must emit all the characters |
194 | * we had previously swallowed, plus this one, and |
195 | * return to non-escape-sequence state. |
196 | */ |
197 | for (j = 0; j < n; j++) |
198 | emit(emitctx, iso->escapes[oi].sequence[j]); |
199 | emit(emitctx, input_chr); |
200 | state->s0 = 0; |
201 | return; |
202 | } |
203 | |
204 | /* |
205 | * Otherwise, we have found an additional character in our |
206 | * escape sequence. See if we have reached the _end_ of our |
207 | * sequence (and therefore must process the sequence). |
208 | */ |
209 | n++; |
210 | if (!iso->escapes[i].sequence[n]) { |
211 | state->s0 = 0; |
212 | state->s1 &= iso->escapes[i].andbits; |
213 | state->s1 ^= iso->escapes[i].xorbits; |
214 | return; |
215 | } |
216 | |
217 | /* |
218 | * Failing _that_, we simply update our escape-sequence- |
219 | * tracking state. |
220 | */ |
221 | assert(i < 32 && n < 8); |
222 | state->s0 = (i << 27) | (n << 24); |
223 | return; |
224 | } |
225 | |
226 | /* |
227 | * If this isn't an escape sequence, it must be part of a |
228 | * character. One possibility is that it's a control character |
01081d4e |
229 | * (00-20 or 7F-9F; also in non-8-bit ISO 2022 subsets I'm |
230 | * going to treat all top-half characters as controls), in |
231 | * which case we output it verbatim. |
c6d25d8d |
232 | */ |
01081d4e |
233 | if (input_chr < 0x21 || |
234 | (input_chr > 0x7E && (!iso->eightbit || input_chr < 0xA0))) { |
c6d25d8d |
235 | /* |
236 | * We might be in mid-multibyte-character. If so, clear the |
237 | * character state and emit an error token for the |
238 | * incomplete character. |
239 | */ |
240 | if (state->s1 & 0x0F000000) { |
241 | state->s1 &= ~0x0F000000; |
242 | state->s0 &= 0xFF000000; |
243 | emit(emitctx, ERROR); |
244 | /* |
245 | * If we were in the SS2 or SS3 container, we |
246 | * automatically exit it. |
247 | */ |
248 | if (state->s1 & 0x60000000) |
249 | state->s1 &= 0x9FFFFFFF; |
250 | } |
251 | |
252 | emit(emitctx, input_chr); |
253 | return; |
254 | } |
255 | |
256 | /* |
257 | * Otherwise, accumulate character data. |
258 | */ |
259 | { |
260 | unsigned long chr; |
261 | int chrlen, cont, subcharset, bytes; |
262 | |
01081d4e |
263 | /* |
264 | * Verify that we've seen the right kind of character for |
265 | * what we're currently doing. This only matters in 8-bit |
266 | * subsets. |
267 | */ |
268 | if (iso->eightbit) { |
269 | cont = (state->s1 >> 28) & 7; |
270 | /* |
271 | * If cont==0, we're entitled to see either GL or GR |
272 | * characters. If cont==2, we expect only GR; otherwise |
273 | * we expect only GL. |
274 | * |
275 | * If we see a GR character while cont==0, we set |
276 | * cont=2 immediately. |
277 | */ |
278 | if ((cont == 2 && !(input_chr & 0x80)) || |
279 | (cont != 0 && cont != 2 && (input_chr & 0x80))) { |
280 | /* |
281 | * Clear the previous character; it was prematurely |
282 | * terminated by this error. |
283 | */ |
284 | state->s1 &= ~0x0F000000; |
285 | state->s0 &= 0xFF000000; |
286 | emit(emitctx, ERROR); |
287 | /* |
288 | * If we were in the SS2 or SS3 container, we |
289 | * automatically exit it. |
290 | */ |
291 | if (state->s1 & 0x60000000) |
292 | state->s1 &= 0x9FFFFFFF; |
293 | } |
294 | |
295 | if (cont == 0 && (input_chr & 0x80)) { |
296 | state->s1 |= 0x20000000; |
297 | } |
298 | } |
299 | |
c6d25d8d |
300 | /* The current character and its length. */ |
01081d4e |
301 | chr = ((state->s0 & 0x00FFFFFF) << 8) | (input_chr & 0x7F); |
c6d25d8d |
302 | chrlen = ((state->s1 >> 24) & 0xF) + 1; |
303 | /* The current sub-charset. */ |
304 | cont = (state->s1 >> 28) & 7; |
305 | if (cont > 1) cont >>= 1; |
306 | subcharset = (state->s1 >> (6*cont)) & 0x3F; |
307 | /* The number of bytes-per-character in that sub-charset. */ |
308 | bytes = iso->nbytes[subcharset]; |
309 | |
310 | /* |
311 | * If this character is now complete, we convert and emit |
312 | * it. Otherwise, we simply update the state and return. |
313 | */ |
314 | if (chrlen >= bytes) { |
315 | emit(emitctx, iso->to_ucs(subcharset, chr)); |
316 | chr = chrlen = 0; |
317 | /* |
318 | * If we were in the SS2 or SS3 container, we |
319 | * automatically exit it. |
320 | */ |
321 | if (state->s1 & 0x60000000) |
322 | state->s1 &= 0x9FFFFFFF; |
323 | } |
324 | state->s0 = (state->s0 & 0xFF000000) | chr; |
325 | state->s1 = (state->s1 & 0xF0FFFFFF) | (chrlen << 24); |
326 | } |
327 | } |
328 | |
329 | static int write_iso2022s(charset_spec const *charset, long int input_chr, |
330 | charset_state *state, |
331 | void (*emit)(void *ctx, long int output), |
332 | void *emitctx) |
333 | { |
334 | struct iso2022 const *iso = (struct iso2022 *)charset->data; |
01081d4e |
335 | int subcharset, len, i, j, cont, topbit = 0; |
c6d25d8d |
336 | unsigned long bytes; |
337 | |
338 | /* |
339 | * For output, our s1 state variable contains most of the same |
340 | * stuff as it did for input - initial-state indicator bit, |
341 | * current container, and current subcharset selected in each |
342 | * container. |
343 | */ |
344 | |
345 | /* |
346 | * Analyse the character and find out what subcharset it needs |
347 | * to go in. |
348 | */ |
349 | if (input_chr >= 0 && !iso->from_ucs(input_chr, &subcharset, &bytes)) |
350 | return FALSE; |
351 | |
352 | if (!(state->s1 & 0x80000000)) { |
353 | state->s1 = iso->s1; |
354 | if (iso->initial_sequence) |
355 | for (i = 0; iso->initial_sequence[i]; i++) |
356 | emit(emitctx, iso->initial_sequence[i]); |
357 | } |
358 | |
359 | if (input_chr == -1) { |
360 | unsigned long oldstate; |
361 | int k; |
362 | |
363 | /* |
364 | * Special case: reset encoding state. |
365 | */ |
366 | for (i = 0; iso->reset[i]; i++) { |
367 | j = iso->reset[i] - 1; |
368 | oldstate = state->s1; |
369 | state->s1 &= iso->escapes[j].andbits; |
370 | state->s1 ^= iso->escapes[j].xorbits; |
371 | if (state->s1 != oldstate) { |
372 | /* We must actually emit this sequence. */ |
373 | for (k = 0; iso->escapes[j].sequence[k]; k++) |
374 | emit(emitctx, iso->escapes[j].sequence[k]); |
375 | } |
376 | } |
377 | |
378 | return TRUE; |
379 | } |
380 | |
381 | /* |
382 | * Now begins the fun. We now know what subcharset we want. So |
383 | * we must find out which container we should select it into, |
384 | * select it into it if necessary, select that _container_ if |
385 | * necessary, and then output the given bytes. |
386 | */ |
387 | for (i = 0; i < iso->nescapes; i++) |
01081d4e |
388 | if (iso->escapes[i].subcharset == subcharset && |
389 | !(iso->escapes[i].container & RO)) |
c6d25d8d |
390 | break; |
391 | assert(i < iso->nescapes); |
392 | |
393 | /* |
394 | * We've found the escape sequence which would select this |
395 | * subcharset into a container. However, that subcharset might |
396 | * already _be_ selected in that container! Check before we go |
397 | * to the effort of emitting the sequence. |
398 | */ |
01081d4e |
399 | cont = iso->escapes[i].container &~ RO; |
3cca0edf |
400 | if (((state->s1 >> (6*cont)) & 0x3F) != (unsigned)subcharset) { |
c6d25d8d |
401 | for (j = 0; iso->escapes[i].sequence[j]; j++) |
402 | emit(emitctx, iso->escapes[i].sequence[j]); |
403 | state->s1 &= iso->escapes[i].andbits; |
404 | state->s1 ^= iso->escapes[i].xorbits; |
405 | } |
406 | |
407 | /* |
408 | * Now we know what container our subcharset is in, so we want |
409 | * to select that container. |
410 | */ |
411 | if (cont > 1) { |
412 | /* SS2 or SS3; just output the sequence and be done. */ |
413 | emit(emitctx, ESC); |
414 | emit(emitctx, 'L' + cont); /* comes out to 'N' or 'O' */ |
415 | } else { |
01081d4e |
416 | /* |
417 | * Emit SI or SO, but only if the current container isn't already |
418 | * the right one. |
419 | * |
420 | * Also, in an 8-bit subset, we need not do this; we'll |
421 | * just use 8-bit characters to output SO-container |
422 | * characters. |
423 | */ |
424 | if (iso->eightbit && cont == 1 && ((state->s1 >> 28) & 7) == 0) { |
425 | topbit = 0x80; |
426 | } else if (((state->s1 >> 28) & 7) != (unsigned)cont) { |
c6d25d8d |
427 | emit(emitctx, cont ? SO : SI); |
428 | state->s1 = (state->s1 & 0x8FFFFFFF) | (cont << 28); |
429 | } |
430 | } |
431 | |
432 | /* |
433 | * We're done. Subcharset is selected in container, container |
434 | * is selected. All we need now is to write out the bytes. |
435 | */ |
436 | len = iso->nbytes[subcharset]; |
437 | while (len--) |
01081d4e |
438 | emit(emitctx, ((bytes >> (8*len)) & 0xFF) | topbit); |
c6d25d8d |
439 | |
440 | return TRUE; |
441 | } |
442 | |
443 | /* |
444 | * ISO-2022-JP, defined in RFC 1468. |
445 | */ |
446 | static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes) |
447 | { |
448 | switch (subcharset) { |
a933148c |
449 | case 1: /* JIS X 0201 bottom half */ |
450 | if (bytes == 0x5C) |
451 | return 0xA5; |
452 | else if (bytes == 0x7E) |
453 | return 0x203E; |
454 | /* else fall through to ASCII */ |
c6d25d8d |
455 | case 0: return bytes; /* one-byte ASCII */ |
c6d25d8d |
456 | /* (no break needed since all control paths have returned) */ |
457 | case 2: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21, |
458 | ((bytes ) & 0xFF) - 0x21); |
459 | default: return ERROR; |
460 | } |
461 | } |
/*
 * Work out how to represent the Unicode character `ucs' in
 * ISO-2022-JP. On success, stores the sub-charset index (0 ASCII,
 * 1 JIS X 0201 bottom half, 2 JIS X 0208) in *subcharset and the
 * encoded bytes in *bytes, and returns 1. Returns 0, leaving both
 * outputs alone, if the character is not representable.
 */
static int iso2022jp_from_ucs(long int ucs, int *subcharset,
                              unsigned long *bytes)
{
    int row, col;

    /* Anything below 0x80 goes out as ASCII. */
    if (ucs < 0x80) {
        *subcharset = 0;
        *bytes = ucs;
        return 1;
    }

    /* The two JIS X 0201 code points which differ from ASCII. */
    if (ucs == 0xA5 || ucs == 0x203E) {
        *subcharset = 1;
        if (ucs == 0xA5)
            *bytes = 0x5C;
        else
            *bytes = 0x7E;
        return 1;
    }

    /* Otherwise it has to be in JIS X 0208, or we can't write it. */
    if (unicode_to_jisx0208(ucs, &row, &col)) {
        *subcharset = 2;
        *bytes = ((row+0x21) << 8) | (col+0x21);
        return 1;
    }

    return 0;
}
8bade113 |
482 | static const struct iso2022_escape iso2022jp_escapes[] = { |
c6d25d8d |
483 | {"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1}, /* we ignore this one */ |
484 | {"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2}, |
485 | {"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0}, |
486 | {"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1}, |
487 | }; |
8bade113 |
488 | static const struct iso2022 iso2022jp = { |
c6d25d8d |
489 | iso2022jp_escapes, lenof(iso2022jp_escapes), |
01081d4e |
490 | "\1\1\2", "\3", 0x80000000, NULL, FALSE, |
491 | iso2022jp_to_ucs, iso2022jp_from_ucs |
c6d25d8d |
492 | }; |
493 | const charset_spec charset_CS_ISO2022_JP = { |
494 | CS_ISO2022_JP, read_iso2022s, write_iso2022s, &iso2022jp |
495 | }; |
496 | |
497 | /* |
498 | * ISO-2022-KR, defined in RFC 1557. |
499 | */ |
500 | static long int iso2022kr_to_ucs(int subcharset, unsigned long bytes) |
501 | { |
502 | switch (subcharset) { |
503 | case 0: return bytes; /* one-byte ASCII */ |
504 | case 1: return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21, |
505 | ((bytes ) & 0xFF) - 0x21); |
506 | default: return ERROR; |
507 | } |
508 | } |
/*
 * Work out how to represent the Unicode character `ucs' in
 * ISO-2022-KR. On success, stores the sub-charset index (0 ASCII,
 * 1 KS X 1001) in *subcharset and the encoded bytes in *bytes, and
 * returns 1. Returns 0, leaving both outputs alone, if the
 * character is not representable.
 */
static int iso2022kr_from_ucs(long int ucs, int *subcharset,
                              unsigned long *bytes)
{
    int row, col;

    /* Anything below 0x80 goes out as ASCII. */
    if (ucs < 0x80) {
        *subcharset = 0;
        *bytes = ucs;
        return 1;
    }

    /* Otherwise it has to be in KS X 1001, or we can't write it. */
    if (unicode_to_ksx1001(ucs, &row, &col)) {
        *subcharset = 1;
        *bytes = ((row+0x21) << 8) | (col+0x21);
        return 1;
    }

    return 0;
}
8bade113 |
525 | static const struct iso2022_escape iso2022kr_escapes[] = { |
c6d25d8d |
526 | {"\016", 0x8FFFFFFF, 0x10000000, -1, -1}, |
527 | {"\017", 0x8FFFFFFF, 0x00000000, 0, 0}, |
528 | {"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1}, /* bits[11:6] <- 1 */ |
529 | }; |
8bade113 |
530 | static const struct iso2022 iso2022kr = { |
c6d25d8d |
531 | iso2022kr_escapes, lenof(iso2022kr_escapes), |
01081d4e |
532 | "\1\2", "\2", 0x80000040, "\033$)C", FALSE, |
533 | iso2022kr_to_ucs, iso2022kr_from_ucs |
c6d25d8d |
534 | }; |
535 | const charset_spec charset_CS_ISO2022_KR = { |
536 | CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr |
537 | }; |
538 | |
539 | #else /* ENUM_CHARSETS */ |
540 | |
541 | ENUM_CHARSET(CS_ISO2022_JP) |
542 | ENUM_CHARSET(CS_ISO2022_KR) |
543 | |
544 | #endif /* ENUM_CHARSETS */ |