c6d25d8d |
1 | /* |
2 | * iso2022s.c - support for ISO-2022 subset encodings. |
c6d25d8d |
3 | */ |
4 | |
5 | #ifndef ENUM_CHARSETS |
6 | |
7 | #include <stdio.h> |
8 | #include <string.h> |
9 | #include <assert.h> |
10 | |
11 | #include "charset.h" |
12 | #include "internal.h" |
01081d4e |
13 | #include "sbcsdat.h" |
c6d25d8d |
14 | |
15 | #define SO (0x0E) |
16 | #define SI (0x0F) |
17 | #define ESC (0x1B) |
18 | |
/*
 * One escape sequence understood by an ISO 2022 subset, described
 * by the effect it has on the state word s1.
 */
struct iso2022_escape {
    /* The bytes of the sequence, as a NUL-terminated string. */
    char const *sequence;

    /*
     * Effect of the sequence: s1 is ANDed with `andbits' and the
     * result is then XORed with `xorbits'.
     */
    unsigned long andbits;
    unsigned long xorbits;

    /*
     * Output-side bookkeeping, used to work out which sequence to
     * emit in order to reach a wanted state.
     *
     * `container' is a value in the range 0-3. It may additionally
     * have the RO flag ORed in, meaning that this container should
     * not be chosen for this charset when generating output.
     * `subcharset' is the index of the sub-charset which the
     * sequence selects into that container.
     */
    int container;
    int subcharset;
};
#define RO 0x80
c6d25d8d |
34 | |
/*
 * Complete description of one ISO 2022 subset encoding: its escape
 * sequences, its sub-charsets, and the functions that translate
 * between sub-charset byte values and Unicode.
 */
struct iso2022 {
    /*
     * The escape sequences recognised in this subset, and how many
     * of them there are. The array MUST be sorted in ASCII order of
     * the sequence strings, because the reader narrows down the
     * candidates one input byte at a time.
     */
    const struct iso2022_escape *escapes;
    int nescapes;

    /*
     * Sub-charsets are numbered from 0 upwards. nbytes[i] is the
     * number of bytes per character in sub-charset i. (Held as a
     * string only because that is more convenient to write as an
     * initialiser than an int array.)
     */
    char const *nbytes;

    /*
     * Each character of this string is one more than an index into
     * `escapes' (the offset lets NUL terminate the string). When the
     * encoding state is reset, these escapes are applied in order,
     * and each is output only if it actually changes the state.
     */
    char const *reset;

    /*
     * Value given to s1 on first use, for subsets whose containers
     * do not all start out holding sub-charset 0. The top bit must
     * be set.
     */
    unsigned long s1;

    /*
     * Some subsets require a fixed sequence at the very start of
     * the output; if so this is it, otherwise NULL. It is not
     * demanded on input, but it should also appear in `escapes' so
     * that the reader swallows it when it is present.
     */
    char const *initial_sequence;

    /*
     * Nonzero if this is an 8-bit ISO 2022 subset.
     */
    int eightbit;

    /*
     * Translation between (sub-charset, byte value) pairs and
     * Unicode, in each direction.
     */
    long int (*to_ucs)(int subcharset, unsigned long bytes);
    int (*from_ucs)(long int ucs, int *subcharset, unsigned long *bytes);
};
89 | |
90 | static void read_iso2022s(charset_spec const *charset, long int input_chr, |
91 | charset_state *state, |
92 | void (*emit)(void *ctx, long int output), |
93 | void *emitctx) |
94 | { |
95 | struct iso2022 const *iso = (struct iso2022 *)charset->data; |
96 | |
97 | /* |
98 | * For reading ISO-2022 subsets, we divide up our state |
99 | * variables as follows: |
100 | * |
101 | * - The top byte of s0 (bits 31:24) indicates, if nonzero, |
102 | * that we are part-way through a recognised ISO-2022 escape |
103 | * sequence. Five of those bits (31:27) give the index of |
104 | * the first member of the escapes list matching what we |
105 | * have so far; the remaining three (26:24) give the number |
106 | * of characters we have seen so far. |
107 | * |
108 | * - The top bit of s1 (bit 31) is non-zero at all times, to |
109 | * indicate that we have performed any necessary |
110 | * initialisation. When we start, we detect a zero s1 and |
111 | * respond to it by initialising the default container |
112 | * contents. |
113 | * |
114 | * - The next three bits of s1 (bits 30:28) indicate which |
115 | * _container_ is currently selected. This isn't quite as |
116 | * simple as it sounds, since we have to preserve memory of |
117 | * which of the SI/SO containers we came from when we're |
118 | * temporarily in SS2/SS3. Hence, what happens is: |
119 | * + bit 28 indicates SI/SO. |
120 | * + if we're in an SS2/SS3 container, that's indicated by |
121 | * the two bits above that being nonzero and holding |
122 | * either 2 or 3. |
123 | * + Hence: 0 is SI, 1 is SO, 4 is SS2-from-SI, 5 is |
124 | * SS2-from-SO, 6 is SS3-from-SI, 7 is SS3-from-SO. |
01081d4e |
125 | * + For added fun: in an _8-bit_ ISO 2022 subset, we have |
126 | * the further special value 2, which means that we're |
127 | * theoretically in SI but the current character being |
128 | * accumulated is composed of 8-bit characters and will |
129 | * therefore be interpreted as if in SO. |
c6d25d8d |
130 | * |
131 | * - The next nibble of s1 (27:24) indicates how many bytes |
132 | * have been accumulated in the current character. |
133 | * |
134 | * - The remaining three bytes of s1 are divided into four |
135 | * six-bit sections, and each section gives the current |
136 | * sub-charset selected in one of the possible containers. |
137 | * (Those containers are SI, SO, SS2 and SS3, respectively |
138 | * and in order from the bottom of s0 to the top.) |
139 | * |
140 | * - The bottom 24 bits of s0 give the accumulated character |
141 | * data so far. |
142 | * |
143 | * (Note that this means s1 contains all the parts of the state |
144 | * which might need to be operated on by escape sequences. |
145 | * Cunning, eh?) |
146 | */ |
147 | |
148 | if (!(state->s1 & 0x80000000)) { |
149 | state->s1 = iso->s1; |
150 | } |
151 | |
152 | /* |
153 | * So. Firstly, we process escape sequences, if we're in the |
154 | * middle of one or if we see a possible introducer (SI, SO, |
155 | * ESC). |
156 | */ |
157 | if ((state->s0 >> 24) || |
158 | (input_chr == SO || input_chr == SI || input_chr == ESC)) { |
159 | int n = (state->s0 >> 24) & 7, i = (state->s0 >> 27), oi = i, j; |
160 | |
161 | /* |
162 | * If this is the start of an escape sequence, we might be |
163 | * in mid-character. If so, clear the character state and |
164 | * emit an error token for the incomplete character. |
165 | */ |
166 | if (state->s1 & 0x0F000000) { |
167 | state->s1 &= ~0x0F000000; |
168 | state->s0 &= 0xFF000000; |
169 | /* |
170 | * If we were in the SS2 or SS3 container, we |
171 | * automatically exit it. |
172 | */ |
173 | if (state->s1 & 0x60000000) |
174 | state->s1 &= 0x9FFFFFFF; |
175 | emit(emitctx, ERROR); |
176 | } |
177 | |
178 | j = i; |
179 | while (j < iso->nescapes && |
180 | !memcmp(iso->escapes[j].sequence, |
181 | iso->escapes[oi].sequence, n)) { |
182 | if (iso->escapes[j].sequence[n] < input_chr) |
183 | i = ++j; |
184 | else |
185 | break; |
186 | } |
187 | if (i >= iso->nescapes || |
188 | memcmp(iso->escapes[i].sequence, |
189 | iso->escapes[oi].sequence, n) || |
190 | iso->escapes[i].sequence[n] != input_chr) { |
191 | /* |
192 | * This character does not appear in any valid escape |
193 | * sequence. Therefore, we must emit all the characters |
194 | * we had previously swallowed, plus this one, and |
195 | * return to non-escape-sequence state. |
196 | */ |
197 | for (j = 0; j < n; j++) |
198 | emit(emitctx, iso->escapes[oi].sequence[j]); |
199 | emit(emitctx, input_chr); |
200 | state->s0 = 0; |
201 | return; |
202 | } |
203 | |
204 | /* |
205 | * Otherwise, we have found an additional character in our |
206 | * escape sequence. See if we have reached the _end_ of our |
207 | * sequence (and therefore must process the sequence). |
208 | */ |
209 | n++; |
210 | if (!iso->escapes[i].sequence[n]) { |
211 | state->s0 = 0; |
212 | state->s1 &= iso->escapes[i].andbits; |
213 | state->s1 ^= iso->escapes[i].xorbits; |
214 | return; |
215 | } |
216 | |
217 | /* |
218 | * Failing _that_, we simply update our escape-sequence- |
219 | * tracking state. |
220 | */ |
221 | assert(i < 32 && n < 8); |
222 | state->s0 = (i << 27) | (n << 24); |
223 | return; |
224 | } |
225 | |
226 | /* |
227 | * If this isn't an escape sequence, it must be part of a |
228 | * character. One possibility is that it's a control character |
01081d4e |
229 | * (00-20 or 7F-9F; also in non-8-bit ISO 2022 subsets I'm |
230 | * going to treat all top-half characters as controls), in |
231 | * which case we output it verbatim. |
c6d25d8d |
232 | */ |
01081d4e |
233 | if (input_chr < 0x21 || |
234 | (input_chr > 0x7E && (!iso->eightbit || input_chr < 0xA0))) { |
c6d25d8d |
235 | /* |
236 | * We might be in mid-multibyte-character. If so, clear the |
237 | * character state and emit an error token for the |
238 | * incomplete character. |
239 | */ |
240 | if (state->s1 & 0x0F000000) { |
241 | state->s1 &= ~0x0F000000; |
242 | state->s0 &= 0xFF000000; |
243 | emit(emitctx, ERROR); |
244 | /* |
245 | * If we were in the SS2 or SS3 container, we |
246 | * automatically exit it. |
247 | */ |
248 | if (state->s1 & 0x60000000) |
249 | state->s1 &= 0x9FFFFFFF; |
250 | } |
251 | |
252 | emit(emitctx, input_chr); |
253 | return; |
254 | } |
255 | |
256 | /* |
257 | * Otherwise, accumulate character data. |
258 | */ |
259 | { |
260 | unsigned long chr; |
261 | int chrlen, cont, subcharset, bytes; |
262 | |
01081d4e |
263 | /* |
264 | * Verify that we've seen the right kind of character for |
265 | * what we're currently doing. This only matters in 8-bit |
266 | * subsets. |
267 | */ |
268 | if (iso->eightbit) { |
269 | cont = (state->s1 >> 28) & 7; |
270 | /* |
271 | * If cont==0, we're entitled to see either GL or GR |
272 | * characters. If cont==2, we expect only GR; otherwise |
273 | * we expect only GL. |
274 | * |
275 | * If we see a GR character while cont==0, we set |
276 | * cont=2 immediately. |
277 | */ |
278 | if ((cont == 2 && !(input_chr & 0x80)) || |
279 | (cont != 0 && cont != 2 && (input_chr & 0x80))) { |
280 | /* |
281 | * Clear the previous character; it was prematurely |
282 | * terminated by this error. |
283 | */ |
284 | state->s1 &= ~0x0F000000; |
285 | state->s0 &= 0xFF000000; |
286 | emit(emitctx, ERROR); |
287 | /* |
288 | * If we were in the SS2 or SS3 container, we |
289 | * automatically exit it. |
290 | */ |
291 | if (state->s1 & 0x60000000) |
292 | state->s1 &= 0x9FFFFFFF; |
293 | } |
294 | |
295 | if (cont == 0 && (input_chr & 0x80)) { |
296 | state->s1 |= 0x20000000; |
297 | } |
298 | } |
299 | |
c6d25d8d |
300 | /* The current character and its length. */ |
01081d4e |
301 | chr = ((state->s0 & 0x00FFFFFF) << 8) | (input_chr & 0x7F); |
c6d25d8d |
302 | chrlen = ((state->s1 >> 24) & 0xF) + 1; |
303 | /* The current sub-charset. */ |
304 | cont = (state->s1 >> 28) & 7; |
305 | if (cont > 1) cont >>= 1; |
306 | subcharset = (state->s1 >> (6*cont)) & 0x3F; |
307 | /* The number of bytes-per-character in that sub-charset. */ |
308 | bytes = iso->nbytes[subcharset]; |
309 | |
310 | /* |
311 | * If this character is now complete, we convert and emit |
312 | * it. Otherwise, we simply update the state and return. |
313 | */ |
314 | if (chrlen >= bytes) { |
315 | emit(emitctx, iso->to_ucs(subcharset, chr)); |
316 | chr = chrlen = 0; |
317 | /* |
318 | * If we were in the SS2 or SS3 container, we |
319 | * automatically exit it. |
320 | */ |
321 | if (state->s1 & 0x60000000) |
322 | state->s1 &= 0x9FFFFFFF; |
323 | } |
324 | state->s0 = (state->s0 & 0xFF000000) | chr; |
325 | state->s1 = (state->s1 & 0xF0FFFFFF) | (chrlen << 24); |
326 | } |
327 | } |
328 | |
329 | static int write_iso2022s(charset_spec const *charset, long int input_chr, |
330 | charset_state *state, |
331 | void (*emit)(void *ctx, long int output), |
332 | void *emitctx) |
333 | { |
334 | struct iso2022 const *iso = (struct iso2022 *)charset->data; |
01081d4e |
335 | int subcharset, len, i, j, cont, topbit = 0; |
c6d25d8d |
336 | unsigned long bytes; |
337 | |
338 | /* |
339 | * For output, our s1 state variable contains most of the same |
340 | * stuff as it did for input - initial-state indicator bit, |
341 | * current container, and current subcharset selected in each |
342 | * container. |
343 | */ |
344 | |
345 | /* |
346 | * Analyse the character and find out what subcharset it needs |
347 | * to go in. |
348 | */ |
349 | if (input_chr >= 0 && !iso->from_ucs(input_chr, &subcharset, &bytes)) |
350 | return FALSE; |
351 | |
352 | if (!(state->s1 & 0x80000000)) { |
353 | state->s1 = iso->s1; |
354 | if (iso->initial_sequence) |
355 | for (i = 0; iso->initial_sequence[i]; i++) |
356 | emit(emitctx, iso->initial_sequence[i]); |
357 | } |
358 | |
359 | if (input_chr == -1) { |
360 | unsigned long oldstate; |
361 | int k; |
362 | |
363 | /* |
364 | * Special case: reset encoding state. |
365 | */ |
366 | for (i = 0; iso->reset[i]; i++) { |
367 | j = iso->reset[i] - 1; |
368 | oldstate = state->s1; |
369 | state->s1 &= iso->escapes[j].andbits; |
370 | state->s1 ^= iso->escapes[j].xorbits; |
371 | if (state->s1 != oldstate) { |
372 | /* We must actually emit this sequence. */ |
373 | for (k = 0; iso->escapes[j].sequence[k]; k++) |
374 | emit(emitctx, iso->escapes[j].sequence[k]); |
375 | } |
376 | } |
377 | |
378 | return TRUE; |
379 | } |
380 | |
381 | /* |
382 | * Now begins the fun. We now know what subcharset we want. So |
383 | * we must find out which container we should select it into, |
384 | * select it into it if necessary, select that _container_ if |
385 | * necessary, and then output the given bytes. |
386 | */ |
387 | for (i = 0; i < iso->nescapes; i++) |
01081d4e |
388 | if (iso->escapes[i].subcharset == subcharset && |
389 | !(iso->escapes[i].container & RO)) |
c6d25d8d |
390 | break; |
391 | assert(i < iso->nescapes); |
392 | |
393 | /* |
394 | * We've found the escape sequence which would select this |
395 | * subcharset into a container. However, that subcharset might |
396 | * already _be_ selected in that container! Check before we go |
397 | * to the effort of emitting the sequence. |
398 | */ |
01081d4e |
399 | cont = iso->escapes[i].container &~ RO; |
3cca0edf |
400 | if (((state->s1 >> (6*cont)) & 0x3F) != (unsigned)subcharset) { |
c6d25d8d |
401 | for (j = 0; iso->escapes[i].sequence[j]; j++) |
402 | emit(emitctx, iso->escapes[i].sequence[j]); |
403 | state->s1 &= iso->escapes[i].andbits; |
404 | state->s1 ^= iso->escapes[i].xorbits; |
405 | } |
406 | |
407 | /* |
408 | * Now we know what container our subcharset is in, so we want |
409 | * to select that container. |
410 | */ |
411 | if (cont > 1) { |
412 | /* SS2 or SS3; just output the sequence and be done. */ |
413 | emit(emitctx, ESC); |
414 | emit(emitctx, 'L' + cont); /* comes out to 'N' or 'O' */ |
415 | } else { |
01081d4e |
416 | /* |
417 | * Emit SI or SO, but only if the current container isn't already |
418 | * the right one. |
419 | * |
420 | * Also, in an 8-bit subset, we need not do this; we'll |
421 | * just use 8-bit characters to output SO-container |
422 | * characters. |
423 | */ |
424 | if (iso->eightbit && cont == 1 && ((state->s1 >> 28) & 7) == 0) { |
425 | topbit = 0x80; |
426 | } else if (((state->s1 >> 28) & 7) != (unsigned)cont) { |
c6d25d8d |
427 | emit(emitctx, cont ? SO : SI); |
428 | state->s1 = (state->s1 & 0x8FFFFFFF) | (cont << 28); |
429 | } |
430 | } |
431 | |
432 | /* |
433 | * We're done. Subcharset is selected in container, container |
434 | * is selected. All we need now is to write out the bytes. |
435 | */ |
436 | len = iso->nbytes[subcharset]; |
437 | while (len--) |
01081d4e |
438 | emit(emitctx, ((bytes >> (8*len)) & 0xFF) | topbit); |
c6d25d8d |
439 | |
440 | return TRUE; |
441 | } |
442 | |
443 | /* |
444 | * ISO-2022-JP, defined in RFC 1468. |
445 | */ |
446 | static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes) |
447 | { |
448 | switch (subcharset) { |
449 | case 0: return bytes; /* one-byte ASCII */ |
450 | case 1: /* JIS X 0201 half-width katakana */ |
451 | if (bytes >= 0x21 && bytes <= 0x5F) |
452 | return bytes + (0xFF61 - 0x21); |
453 | else |
454 | return ERROR; |
455 | /* (no break needed since all control paths have returned) */ |
456 | case 2: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21, |
457 | ((bytes ) & 0xFF) - 0x21); |
458 | default: return ERROR; |
459 | } |
460 | } |
461 | static int iso2022jp_from_ucs(long int ucs, int *subcharset, |
462 | unsigned long *bytes) |
463 | { |
464 | int r, c; |
465 | if (ucs < 0x80) { |
466 | *subcharset = 0; |
467 | *bytes = ucs; |
468 | return 1; |
469 | } else if (ucs >= 0xFF61 && ucs <= 0xFF9F) { |
470 | *subcharset = 1; |
471 | *bytes = ucs - (0xFF61 - 0x21); |
472 | return 1; |
473 | } else if (unicode_to_jisx0208(ucs, &r, &c)) { |
474 | *subcharset = 2; |
475 | *bytes = ((r+0x21) << 8) | (c+0x21); |
476 | return 1; |
477 | } else { |
478 | return 0; |
479 | } |
480 | } |
8bade113 |
481 | static const struct iso2022_escape iso2022jp_escapes[] = { |
c6d25d8d |
482 | {"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1}, /* we ignore this one */ |
483 | {"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2}, |
484 | {"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0}, |
485 | {"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1}, |
486 | }; |
8bade113 |
487 | static const struct iso2022 iso2022jp = { |
c6d25d8d |
488 | iso2022jp_escapes, lenof(iso2022jp_escapes), |
01081d4e |
489 | "\1\1\2", "\3", 0x80000000, NULL, FALSE, |
490 | iso2022jp_to_ucs, iso2022jp_from_ucs |
c6d25d8d |
491 | }; |
492 | const charset_spec charset_CS_ISO2022_JP = { |
493 | CS_ISO2022_JP, read_iso2022s, write_iso2022s, &iso2022jp |
494 | }; |
495 | |
496 | /* |
497 | * ISO-2022-KR, defined in RFC 1557. |
498 | */ |
499 | static long int iso2022kr_to_ucs(int subcharset, unsigned long bytes) |
500 | { |
501 | switch (subcharset) { |
502 | case 0: return bytes; /* one-byte ASCII */ |
503 | case 1: return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21, |
504 | ((bytes ) & 0xFF) - 0x21); |
505 | default: return ERROR; |
506 | } |
507 | } |
/*
 * Find the ISO-2022-KR sub-charset and byte value for a Unicode
 * character. On success, fills in *subcharset and *bytes and returns
 * 1; returns 0 if the character cannot be encoded.
 */
static int iso2022kr_from_ucs(long int ucs, int *subcharset,
                              unsigned long *bytes)
{
    int row, col;

    /* Anything below 0x80 goes out as ASCII. */
    if (ucs < 0x80) {
        *subcharset = 0;
        *bytes = ucs;
        return 1;
    }

    /* Everything else must be found by unicode_to_ksx1001(). */
    if (!unicode_to_ksx1001(ucs, &row, &col))
        return 0;

    *subcharset = 1;
    *bytes = ((row+0x21) << 8) | (col+0x21);
    return 1;
}
8bade113 |
524 | static const struct iso2022_escape iso2022kr_escapes[] = { |
c6d25d8d |
525 | {"\016", 0x8FFFFFFF, 0x10000000, -1, -1}, |
526 | {"\017", 0x8FFFFFFF, 0x00000000, 0, 0}, |
527 | {"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1}, /* bits[11:6] <- 1 */ |
528 | }; |
8bade113 |
529 | static const struct iso2022 iso2022kr = { |
c6d25d8d |
530 | iso2022kr_escapes, lenof(iso2022kr_escapes), |
01081d4e |
531 | "\1\2", "\2", 0x80000040, "\033$)C", FALSE, |
532 | iso2022kr_to_ucs, iso2022kr_from_ucs |
c6d25d8d |
533 | }; |
534 | const charset_spec charset_CS_ISO2022_KR = { |
535 | CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr |
536 | }; |
537 | |
01081d4e |
/*
 * The COMPOUND_TEXT encoding used in X selections, defined by the
 * X consortium.
 *
 * Compound text has a good many sub-charsets. This enum fixes the
 * index assigned to each one; the values are written out so that
 * they are easy to check against the bytes-per-character string in
 * the `ctext' definition, which must list the sub-charsets in the
 * same order.
 */
enum {
    CTEXT_ASCII          = 0,
    CTEXT_JISX0201_LEFT  = 1,
    CTEXT_JISX0201_RIGHT = 2,
    CTEXT_ISO8859_1      = 3,
    CTEXT_ISO8859_2      = 4,
    CTEXT_ISO8859_3      = 5,
    CTEXT_ISO8859_4      = 6,
    CTEXT_ISO8859_5      = 7,
    CTEXT_ISO8859_6      = 8,
    CTEXT_ISO8859_7      = 9,
    CTEXT_ISO8859_8      = 10,
    CTEXT_ISO8859_9      = 11,
    CTEXT_GB2312         = 12,
    CTEXT_KSC5601        = 13,
    CTEXT_JISX0208       = 14,
    CTEXT_JISX0212       = 15
};
564 | static long int ctext_to_ucs(int subcharset, unsigned long bytes) |
565 | { |
566 | switch (subcharset) { |
567 | case CTEXT_ASCII: return bytes; /* one-byte ASCII */ |
568 | case CTEXT_JISX0201_LEFT: /* ASCII with yen and overline */ |
569 | return sbcs_to_unicode(&sbcsdata_CS_JISX0201, bytes & 0x7F); |
570 | case CTEXT_JISX0201_RIGHT: /* JIS X 0201 half-width katakana */ |
571 | return sbcs_to_unicode(&sbcsdata_CS_JISX0201, (bytes & 0x7F) | 0x80); |
572 | case CTEXT_ISO8859_1: |
573 | return sbcs_to_unicode(&sbcsdata_CS_ISO8859_1, (bytes & 0x7F) | 0x80); |
574 | case CTEXT_ISO8859_2: |
575 | return sbcs_to_unicode(&sbcsdata_CS_ISO8859_2, (bytes & 0x7F) | 0x80); |
576 | case CTEXT_ISO8859_3: |
577 | return sbcs_to_unicode(&sbcsdata_CS_ISO8859_3, (bytes & 0x7F) | 0x80); |
578 | case CTEXT_ISO8859_4: |
579 | return sbcs_to_unicode(&sbcsdata_CS_ISO8859_4, (bytes & 0x7F) | 0x80); |
580 | case CTEXT_ISO8859_5: |
581 | return sbcs_to_unicode(&sbcsdata_CS_ISO8859_5, (bytes & 0x7F) | 0x80); |
582 | case CTEXT_ISO8859_6: |
583 | return sbcs_to_unicode(&sbcsdata_CS_ISO8859_6, (bytes & 0x7F) | 0x80); |
584 | case CTEXT_ISO8859_7: |
585 | return sbcs_to_unicode(&sbcsdata_CS_ISO8859_7, (bytes & 0x7F) | 0x80); |
586 | case CTEXT_ISO8859_8: |
587 | return sbcs_to_unicode(&sbcsdata_CS_ISO8859_8, (bytes & 0x7F) | 0x80); |
588 | case CTEXT_ISO8859_9: |
589 | return sbcs_to_unicode(&sbcsdata_CS_ISO8859_9, (bytes & 0x7F) | 0x80); |
590 | case CTEXT_GB2312: |
591 | return gb2312_to_unicode(((bytes >> 8) & 0xFF) - 0x21, |
592 | ((bytes ) & 0xFF) - 0x21); |
593 | case CTEXT_KSC5601: |
594 | return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21, |
595 | ((bytes ) & 0xFF) - 0x21); |
596 | case CTEXT_JISX0208: |
597 | return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21, |
598 | ((bytes ) & 0xFF) - 0x21); |
dd872b07 |
599 | case CTEXT_JISX0212: |
600 | return jisx0212_to_unicode(((bytes >> 8) & 0xFF) - 0x21, |
601 | ((bytes ) & 0xFF) - 0x21); |
01081d4e |
602 | default: return ERROR; |
603 | } |
604 | } |
605 | static int ctext_from_ucs(long int ucs, int *subcharset, unsigned long *bytes) |
606 | { |
607 | int r, c; |
608 | if (ucs < 0x80) { |
609 | *subcharset = CTEXT_ASCII; |
610 | *bytes = ucs; |
611 | return 1; |
612 | } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_1, ucs)) != ERROR) { |
613 | *subcharset = CTEXT_ISO8859_1; |
614 | *bytes = c - 0x80; |
615 | return 1; |
616 | } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_2, ucs)) != ERROR) { |
617 | *subcharset = CTEXT_ISO8859_2; |
618 | *bytes = c - 0x80; |
619 | return 1; |
620 | } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_3, ucs)) != ERROR) { |
621 | *subcharset = CTEXT_ISO8859_3; |
622 | *bytes = c - 0x80; |
623 | return 1; |
624 | } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_4, ucs)) != ERROR) { |
625 | *subcharset = CTEXT_ISO8859_4; |
626 | *bytes = c - 0x80; |
627 | return 1; |
628 | } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_5, ucs)) != ERROR) { |
629 | *subcharset = CTEXT_ISO8859_5; |
630 | *bytes = c - 0x80; |
631 | return 1; |
632 | } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_6, ucs)) != ERROR) { |
633 | *subcharset = CTEXT_ISO8859_6; |
634 | *bytes = c - 0x80; |
635 | return 1; |
636 | } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_7, ucs)) != ERROR) { |
637 | *subcharset = CTEXT_ISO8859_7; |
638 | *bytes = c - 0x80; |
639 | return 1; |
640 | } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_8, ucs)) != ERROR) { |
641 | *subcharset = CTEXT_ISO8859_8; |
642 | *bytes = c - 0x80; |
643 | return 1; |
644 | } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_9, ucs)) != ERROR) { |
645 | *subcharset = CTEXT_ISO8859_9; |
646 | *bytes = c - 0x80; |
647 | return 1; |
648 | } else if ((c = sbcs_from_unicode(&sbcsdata_CS_JISX0201, ucs)) != ERROR) { |
649 | if (c < 0x80) { |
650 | *subcharset = CTEXT_JISX0201_LEFT; |
651 | } else { |
652 | *subcharset = CTEXT_JISX0201_RIGHT; |
653 | c -= 0x80; |
654 | } |
655 | *bytes = c; |
656 | return 1; |
657 | } else if (unicode_to_gb2312(ucs, &r, &c)) { |
658 | *subcharset = CTEXT_GB2312; |
659 | *bytes = ((r+0x21) << 8) | (c+0x21); |
660 | return 1; |
661 | } else if (unicode_to_ksx1001(ucs, &r, &c)) { |
662 | *subcharset = CTEXT_KSC5601; |
663 | *bytes = ((r+0x21) << 8) | (c+0x21); |
664 | return 1; |
665 | } else if (unicode_to_jisx0208(ucs, &r, &c)) { |
666 | *subcharset = CTEXT_JISX0208; |
667 | *bytes = ((r+0x21) << 8) | (c+0x21); |
668 | return 1; |
dd872b07 |
669 | } else if (unicode_to_jisx0212(ucs, &r, &c)) { |
670 | *subcharset = CTEXT_JISX0212; |
671 | *bytes = ((r+0x21) << 8) | (c+0x21); |
672 | return 1; |
01081d4e |
673 | } else { |
674 | return 0; |
675 | } |
676 | } |
/*
 * Build a struct iso2022_escape initialiser for a sequence `str'
 * which selects sub-charset `cs' into container `cont'. `cont' may
 * have RO ORed in; the flag is kept in the container field but
 * ignored when working out which six-bit field of s1 to rewrite.
 */
#define SEQ_SHIFT(cont) (6*(((cont)&~RO)))
#define SEQ(str,cont,cs) \
    {str,~(63<<SEQ_SHIFT(cont)),(cs)<<SEQ_SHIFT(cont),(cont),(cs)}
01081d4e |
679 | /* |
680 | * Compound text defines restrictions on which container can take |
681 | * which character sets. Things labelled `left half of' can only go |
682 | * in GL; things labelled `right half of' can only go in GR; and 96 |
683 | * or 96^n character sets only _fit_ in GR. Thus: |
684 | * - ASCII can only go in GL since it is the left half of 8859-*. |
685 | * - All the 8859 sets can only go in GR. |
686 | * - JISX0201 left is GL only; JISX0201 right is GR only. |
687 | * - The three multibyte sets (GB2312, JISX0208, KSC5601) can go |
688 | * in either; we prefer GR where possible since this leads to a |
689 | * more compact EUC-like encoding. |
690 | */ |
8bade113 |
691 | static const struct iso2022_escape ctext_escapes[] = { |
01081d4e |
692 | SEQ("\033$(A", 0|RO, CTEXT_GB2312), |
693 | SEQ("\033$(B", 0|RO, CTEXT_JISX0208), |
694 | SEQ("\033$(C", 0|RO, CTEXT_KSC5601), |
dd872b07 |
695 | SEQ("\033$(D", 0|RO, CTEXT_JISX0212), |
01081d4e |
696 | SEQ("\033$)A", 1, CTEXT_GB2312), |
697 | SEQ("\033$)B", 1, CTEXT_JISX0208), |
698 | SEQ("\033$)C", 1, CTEXT_KSC5601), |
dd872b07 |
699 | SEQ("\033$)D", 1, CTEXT_JISX0212), |
01081d4e |
700 | SEQ("\033(B", 0, CTEXT_ASCII), |
701 | SEQ("\033(J", 0, CTEXT_JISX0201_LEFT), |
ee45694b |
702 | SEQ("\033)I", 1, CTEXT_JISX0201_RIGHT), |
01081d4e |
703 | SEQ("\033-A", 1, CTEXT_ISO8859_1), |
704 | SEQ("\033-B", 1, CTEXT_ISO8859_2), |
705 | SEQ("\033-C", 1, CTEXT_ISO8859_3), |
706 | SEQ("\033-D", 1, CTEXT_ISO8859_4), |
707 | SEQ("\033-F", 1, CTEXT_ISO8859_7), |
708 | SEQ("\033-G", 1, CTEXT_ISO8859_6), |
709 | SEQ("\033-H", 1, CTEXT_ISO8859_8), |
01081d4e |
710 | SEQ("\033-L", 1, CTEXT_ISO8859_5), |
711 | SEQ("\033-M", 1, CTEXT_ISO8859_9), |
d959146d |
712 | |
713 | /* |
714 | * Cross-testing against Xutf8TextListToTextProperty() turns up |
715 | * some additional character sets and ISO 2022 features |
716 | * supported by that and not by us: |
717 | * |
718 | * - Single-byte right-hand-half character sets `ESC - f', |
719 | * `ESC - T' and `ESC - Y'. |
720 | * |
721 | * - A really horrifying mechanism used to escape completely |
722 | * from the ISO 2022 framework: ESC % / <length> |
723 | * <charset-name> <text>. Xutf8* uses this to encode |
724 | * "iso8859-14", "iso8859-15" and "big5-0". |
725 | * * This mechanism is particularly nasty because we can't |
726 | * efficiently encode it on the fly! It requires that the |
727 | * length of the text encoded in the foreign charset is |
728 | * given _before_ the text in question, so if we're |
729 | * receiving one character at a time we simply can't look |
730 | * ahead and so we would have to encode each individual |
731 | * character in a separate one of these sequences. |
732 | * |
733 | * - ESC % G and ESC % @ to shift to and from UTF-8 mode, as a |
734 | * last resort for anything we still don't support. |
735 | * * Interestingly, ctext.ps actually _disallows_ this: it |
736 | * says that the above extension mechanism is the only |
737 | * one permitted. Ho hum. |
738 | */ |
01081d4e |
739 | }; |
8bade113 |
740 | static const struct iso2022 ctext = { |
01081d4e |
741 | ctext_escapes, lenof(ctext_escapes), |
dd872b07 |
742 | "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\2\2", /* must match the enum above */ |
ee45694b |
743 | "", 0x80000000 | (CTEXT_ASCII<<0) | (CTEXT_ISO8859_1<<6), "", TRUE, |
01081d4e |
744 | ctext_to_ucs, ctext_from_ucs |
745 | }; |
746 | const charset_spec charset_CS_CTEXT = { |
747 | CS_CTEXT, read_iso2022s, write_iso2022s, &ctext |
748 | }; |
749 | |
c6d25d8d |
750 | #else /* ENUM_CHARSETS */ |
751 | |
752 | ENUM_CHARSET(CS_ISO2022_JP) |
753 | ENUM_CHARSET(CS_ISO2022_KR) |
01081d4e |
754 | ENUM_CHARSET(CS_CTEXT) |
c6d25d8d |
755 | |
756 | #endif /* ENUM_CHARSETS */ |