`gcc -Wall' points out some signed/unsigned comparisons. Fixed.
[sgt/charset] / iso2022s.c
1 /*
2 * iso2022s.c - support for ISO-2022 subset encodings.
3 *
4 * (The `s' suffix on the filename is there to leave `iso2022.c'
5 * free for the unlikely event that I ever attempt to implement
6 * _full_ ISO-2022 in this library!)
7 */
8
9 #ifndef ENUM_CHARSETS
10
11 #include <stdio.h>
12 #include <string.h>
13 #include <assert.h>
14
15 #include "charset.h"
16 #include "internal.h"
17
/* Control characters which may begin a shift or escape sequence. */
#define SO  (0x0E)                     /* Shift Out */
#define SI  (0x0F)                     /* Shift In */
#define ESC (0x1B)                     /* Escape */
21
/*
 * Describes one escape (or shift) sequence of an ISO 2022 subset:
 * the bytes that make it up, and what it does to the codec state.
 */
struct iso2022_escape {
    /* The NUL-terminated bytes of the sequence itself. */
    char const *sequence;

    /*
     * Effect of the sequence on the s1 state word: s1 is first ANDed
     * with andbits and then XORed with xorbits.
     */
    unsigned long andbits;
    unsigned long xorbits;

    /*
     * Used only when generating output: the container this sequence
     * loads, and the sub-charset it loads into it. The output code
     * searches for the entry whose `subcharset' matches the one it
     * needs; the tables in this file set both fields to -1 on
     * entries that should never be picked that way.
     */
    int container;
    int subcharset;
};
32
/*
 * Complete description of one ISO 2022 subset encoding. An instance
 * of this structure is what charset_spec.data points at for the
 * charsets implemented in this file.
 */
struct iso2022 {
    /*
     * The escape sequences this subset supports, and how many of
     * them there are. The array must be sorted in ASCII order of
     * `sequence': the reader narrows down the candidates one input
     * byte at a time and relies on that ordering.
     */
    struct iso2022_escape *escapes;
    int nescapes;

    /*
     * Sub-charsets are numbered from 0 upwards; nbytes[i] is the
     * number of bytes per character in sub-charset i. It is declared
     * as a string only because that is easier to write as a C
     * initialiser than an int array.
     */
    char const *nbytes;

    /*
     * Each character of this string is (index + 1) of an entry in
     * `escapes'; the +1 lets NUL terminate the list. To reset the
     * encoding state, these escapes are applied in order, and each
     * is actually output only if applying it changes the state.
     */
    char const *reset;

    /*
     * Initial value of the s1 state word, for subsets whose default
     * container contents are not charset 0 throughout. The top bit
     * (0x80000000) must be set, since a clear top bit in s1 is how
     * uninitialised state is recognised.
     */
    unsigned long s1;

    /*
     * Some subsets _mandate_ a shift sequence at the start of the
     * output; if so, this is it (otherwise NULL). It is not required
     * on input, but should also appear in `escapes' so that the
     * reader swallows it when it is present.
     */
    char const *initial_sequence;

    /*
     * The functions that do the actual character translation.
     * to_ucs maps (sub-charset, accumulated bytes) to a code point;
     * from_ucs does the reverse and returns nonzero on success.
     */
    long int (*to_ucs)(int subcharset, unsigned long bytes);
    int (*from_ucs)(long int ucs, int *subcharset, unsigned long *bytes);
};
82
83 static void read_iso2022s(charset_spec const *charset, long int input_chr,
84 charset_state *state,
85 void (*emit)(void *ctx, long int output),
86 void *emitctx)
87 {
88 struct iso2022 const *iso = (struct iso2022 *)charset->data;
89
90 /*
91 * For reading ISO-2022 subsets, we divide up our state
92 * variables as follows:
93 *
94 * - The top byte of s0 (bits 31:24) indicates, if nonzero,
95 * that we are part-way through a recognised ISO-2022 escape
96 * sequence. Five of those bits (31:27) give the index of
97 * the first member of the escapes list matching what we
98 * have so far; the remaining three (26:24) give the number
99 * of characters we have seen so far.
100 *
101 * - The top bit of s1 (bit 31) is non-zero at all times, to
102 * indicate that we have performed any necessary
103 * initialisation. When we start, we detect a zero s1 and
104 * respond to it by initialising the default container
105 * contents.
106 *
107 * - The next three bits of s1 (bits 30:28) indicate which
108 * _container_ is currently selected. This isn't quite as
109 * simple as it sounds, since we have to preserve memory of
110 * which of the SI/SO containers we came from when we're
111 * temporarily in SS2/SS3. Hence, what happens is:
112 * + bit 28 indicates SI/SO.
113 * + if we're in an SS2/SS3 container, that's indicated by
114 * the two bits above that being nonzero and holding
115 * either 2 or 3.
116 * + Hence: 0 is SI, 1 is SO, 4 is SS2-from-SI, 5 is
117 * SS2-from-SO, 6 is SS3-from-SI, 7 is SS3-from-SO.
118 *
119 * - The next nibble of s1 (27:24) indicates how many bytes
120 * have been accumulated in the current character.
121 *
122 * - The remaining three bytes of s1 are divided into four
123 * six-bit sections, and each section gives the current
124 * sub-charset selected in one of the possible containers.
125 * (Those containers are SI, SO, SS2 and SS3, respectively
126 * and in order from the bottom of s0 to the top.)
127 *
128 * - The bottom 24 bits of s0 give the accumulated character
129 * data so far.
130 *
131 * (Note that this means s1 contains all the parts of the state
132 * which might need to be operated on by escape sequences.
133 * Cunning, eh?)
134 */
135
136 if (!(state->s1 & 0x80000000)) {
137 state->s1 = iso->s1;
138 }
139
140 /*
141 * So. Firstly, we process escape sequences, if we're in the
142 * middle of one or if we see a possible introducer (SI, SO,
143 * ESC).
144 */
145 if ((state->s0 >> 24) ||
146 (input_chr == SO || input_chr == SI || input_chr == ESC)) {
147 int n = (state->s0 >> 24) & 7, i = (state->s0 >> 27), oi = i, j;
148
149 /*
150 * If this is the start of an escape sequence, we might be
151 * in mid-character. If so, clear the character state and
152 * emit an error token for the incomplete character.
153 */
154 if (state->s1 & 0x0F000000) {
155 state->s1 &= ~0x0F000000;
156 state->s0 &= 0xFF000000;
157 /*
158 * If we were in the SS2 or SS3 container, we
159 * automatically exit it.
160 */
161 if (state->s1 & 0x60000000)
162 state->s1 &= 0x9FFFFFFF;
163 emit(emitctx, ERROR);
164 }
165
166 j = i;
167 while (j < iso->nescapes &&
168 !memcmp(iso->escapes[j].sequence,
169 iso->escapes[oi].sequence, n)) {
170 if (iso->escapes[j].sequence[n] < input_chr)
171 i = ++j;
172 else
173 break;
174 }
175 if (i >= iso->nescapes ||
176 memcmp(iso->escapes[i].sequence,
177 iso->escapes[oi].sequence, n) ||
178 iso->escapes[i].sequence[n] != input_chr) {
179 /*
180 * This character does not appear in any valid escape
181 * sequence. Therefore, we must emit all the characters
182 * we had previously swallowed, plus this one, and
183 * return to non-escape-sequence state.
184 */
185 for (j = 0; j < n; j++)
186 emit(emitctx, iso->escapes[oi].sequence[j]);
187 emit(emitctx, input_chr);
188 state->s0 = 0;
189 return;
190 }
191
192 /*
193 * Otherwise, we have found an additional character in our
194 * escape sequence. See if we have reached the _end_ of our
195 * sequence (and therefore must process the sequence).
196 */
197 n++;
198 if (!iso->escapes[i].sequence[n]) {
199 state->s0 = 0;
200 state->s1 &= iso->escapes[i].andbits;
201 state->s1 ^= iso->escapes[i].xorbits;
202 return;
203 }
204
205 /*
206 * Failing _that_, we simply update our escape-sequence-
207 * tracking state.
208 */
209 assert(i < 32 && n < 8);
210 state->s0 = (i << 27) | (n << 24);
211 return;
212 }
213
214 /*
215 * If this isn't an escape sequence, it must be part of a
216 * character. One possibility is that it's a control character
217 * (outside the space 21-7E), in which case we output it verbatim.
218 */
219 if (input_chr < 0x21 || input_chr > 0x7E) {
220 /*
221 * We might be in mid-multibyte-character. If so, clear the
222 * character state and emit an error token for the
223 * incomplete character.
224 */
225 if (state->s1 & 0x0F000000) {
226 state->s1 &= ~0x0F000000;
227 state->s0 &= 0xFF000000;
228 emit(emitctx, ERROR);
229 /*
230 * If we were in the SS2 or SS3 container, we
231 * automatically exit it.
232 */
233 if (state->s1 & 0x60000000)
234 state->s1 &= 0x9FFFFFFF;
235 }
236
237 emit(emitctx, input_chr);
238 return;
239 }
240
241 /*
242 * Otherwise, accumulate character data.
243 */
244 {
245 unsigned long chr;
246 int chrlen, cont, subcharset, bytes;
247
248 /* The current character and its length. */
249 chr = ((state->s0 & 0x00FFFFFF) << 8) | input_chr;
250 chrlen = ((state->s1 >> 24) & 0xF) + 1;
251 /* The current sub-charset. */
252 cont = (state->s1 >> 28) & 7;
253 if (cont > 1) cont >>= 1;
254 subcharset = (state->s1 >> (6*cont)) & 0x3F;
255 /* The number of bytes-per-character in that sub-charset. */
256 bytes = iso->nbytes[subcharset];
257
258 /*
259 * If this character is now complete, we convert and emit
260 * it. Otherwise, we simply update the state and return.
261 */
262 if (chrlen >= bytes) {
263 emit(emitctx, iso->to_ucs(subcharset, chr));
264 chr = chrlen = 0;
265 /*
266 * If we were in the SS2 or SS3 container, we
267 * automatically exit it.
268 */
269 if (state->s1 & 0x60000000)
270 state->s1 &= 0x9FFFFFFF;
271 }
272 state->s0 = (state->s0 & 0xFF000000) | chr;
273 state->s1 = (state->s1 & 0xF0FFFFFF) | (chrlen << 24);
274 }
275 }
276
277 static int write_iso2022s(charset_spec const *charset, long int input_chr,
278 charset_state *state,
279 void (*emit)(void *ctx, long int output),
280 void *emitctx)
281 {
282 struct iso2022 const *iso = (struct iso2022 *)charset->data;
283 int subcharset, len, i, j, cont;
284 unsigned long bytes;
285
286 /*
287 * For output, our s1 state variable contains most of the same
288 * stuff as it did for input - initial-state indicator bit,
289 * current container, and current subcharset selected in each
290 * container.
291 */
292
293 /*
294 * Analyse the character and find out what subcharset it needs
295 * to go in.
296 */
297 if (input_chr >= 0 && !iso->from_ucs(input_chr, &subcharset, &bytes))
298 return FALSE;
299
300 if (!(state->s1 & 0x80000000)) {
301 state->s1 = iso->s1;
302 if (iso->initial_sequence)
303 for (i = 0; iso->initial_sequence[i]; i++)
304 emit(emitctx, iso->initial_sequence[i]);
305 }
306
307 if (input_chr == -1) {
308 unsigned long oldstate;
309 int k;
310
311 /*
312 * Special case: reset encoding state.
313 */
314 for (i = 0; iso->reset[i]; i++) {
315 j = iso->reset[i] - 1;
316 oldstate = state->s1;
317 state->s1 &= iso->escapes[j].andbits;
318 state->s1 ^= iso->escapes[j].xorbits;
319 if (state->s1 != oldstate) {
320 /* We must actually emit this sequence. */
321 for (k = 0; iso->escapes[j].sequence[k]; k++)
322 emit(emitctx, iso->escapes[j].sequence[k]);
323 }
324 }
325
326 return TRUE;
327 }
328
329 /*
330 * Now begins the fun. We now know what subcharset we want. So
331 * we must find out which container we should select it into,
332 * select it into it if necessary, select that _container_ if
333 * necessary, and then output the given bytes.
334 */
335 for (i = 0; i < iso->nescapes; i++)
336 if (iso->escapes[i].subcharset == subcharset)
337 break;
338 assert(i < iso->nescapes);
339
340 /*
341 * We've found the escape sequence which would select this
342 * subcharset into a container. However, that subcharset might
343 * already _be_ selected in that container! Check before we go
344 * to the effort of emitting the sequence.
345 */
346 cont = iso->escapes[i].container;
347 if (((state->s1 >> (6*cont)) & 0x3F) != (unsigned)subcharset) {
348 for (j = 0; iso->escapes[i].sequence[j]; j++)
349 emit(emitctx, iso->escapes[i].sequence[j]);
350 state->s1 &= iso->escapes[i].andbits;
351 state->s1 ^= iso->escapes[i].xorbits;
352 }
353
354 /*
355 * Now we know what container our subcharset is in, so we want
356 * to select that container.
357 */
358 if (cont > 1) {
359 /* SS2 or SS3; just output the sequence and be done. */
360 emit(emitctx, ESC);
361 emit(emitctx, 'L' + cont); /* comes out to 'N' or 'O' */
362 } else {
363 /* Emit SI or SO, but only if the current container isn't already
364 * the right one. */
365 if (((state->s1 >> 28) & 7) != (unsigned)cont) {
366 emit(emitctx, cont ? SO : SI);
367 state->s1 = (state->s1 & 0x8FFFFFFF) | (cont << 28);
368 }
369 }
370
371 /*
372 * We're done. Subcharset is selected in container, container
373 * is selected. All we need now is to write out the bytes.
374 */
375 len = iso->nbytes[subcharset];
376 while (len--)
377 emit(emitctx, (bytes >> (8*len)) & 0xFF);
378
379 return TRUE;
380 }
381
382 /*
383 * ISO-2022-JP, defined in RFC 1468.
384 */
385 static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes)
386 {
387 switch (subcharset) {
388 case 0: return bytes; /* one-byte ASCII */
389 case 1: /* JIS X 0201 half-width katakana */
390 if (bytes >= 0x21 && bytes <= 0x5F)
391 return bytes + (0xFF61 - 0x21);
392 else
393 return ERROR;
394 /* (no break needed since all control paths have returned) */
395 case 2: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
396 ((bytes ) & 0xFF) - 0x21);
397 default: return ERROR;
398 }
399 }
400 static int iso2022jp_from_ucs(long int ucs, int *subcharset,
401 unsigned long *bytes)
402 {
403 int r, c;
404 if (ucs < 0x80) {
405 *subcharset = 0;
406 *bytes = ucs;
407 return 1;
408 } else if (ucs >= 0xFF61 && ucs <= 0xFF9F) {
409 *subcharset = 1;
410 *bytes = ucs - (0xFF61 - 0x21);
411 return 1;
412 } else if (unicode_to_jisx0208(ucs, &r, &c)) {
413 *subcharset = 2;
414 *bytes = ((r+0x21) << 8) | (c+0x21);
415 return 1;
416 } else {
417 return 0;
418 }
419 }
420 static struct iso2022_escape iso2022jp_escapes[] = {
421 {"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1}, /* we ignore this one */
422 {"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2},
423 {"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0},
424 {"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1},
425 };
426 static struct iso2022 iso2022jp = {
427 iso2022jp_escapes, lenof(iso2022jp_escapes),
428 "\1\1\2", "\3", 0x80000000, NULL, iso2022jp_to_ucs, iso2022jp_from_ucs
429 };
430 const charset_spec charset_CS_ISO2022_JP = {
431 CS_ISO2022_JP, read_iso2022s, write_iso2022s, &iso2022jp
432 };
433
434 /*
435 * ISO-2022-KR, defined in RFC 1557.
436 */
437 static long int iso2022kr_to_ucs(int subcharset, unsigned long bytes)
438 {
439 switch (subcharset) {
440 case 0: return bytes; /* one-byte ASCII */
441 case 1: return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
442 ((bytes ) & 0xFF) - 0x21);
443 default: return ERROR;
444 }
445 }
/*
 * Find the ISO-2022-KR sub-charset and byte value representing the
 * code point `ucs'. On success, stores them in *subcharset and
 * *bytes and returns 1; returns 0 if the character cannot be
 * encoded.
 */
static int iso2022kr_from_ucs(long int ucs, int *subcharset,
                              unsigned long *bytes)
{
    int row, col;

    /* Anything below 0x80 goes out as plain ASCII. */
    if (ucs < 0x80) {
        *subcharset = 0;
        *bytes = ucs;
        return 1;
    }

    /* Otherwise it has to be in KS X 1001, or we can't encode it. */
    if (!unicode_to_ksx1001(ucs, &row, &col))
        return 0;

    *subcharset = 1;
    *bytes = ((row+0x21) << 8) | (col+0x21);
    return 1;
}
462 static struct iso2022_escape iso2022kr_escapes[] = {
463 {"\016", 0x8FFFFFFF, 0x10000000, -1, -1},
464 {"\017", 0x8FFFFFFF, 0x00000000, 0, 0},
465 {"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1}, /* bits[11:6] <- 1 */
466 };
467 static struct iso2022 iso2022kr = {
468 iso2022kr_escapes, lenof(iso2022kr_escapes),
469 "\1\2", "\2", 0x80000040, "\033$)C", iso2022kr_to_ucs, iso2022kr_from_ucs
470 };
471 const charset_spec charset_CS_ISO2022_KR = {
472 CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr
473 };
474
475 #else /* ENUM_CHARSETS */
476
477 ENUM_CHARSET(CS_ISO2022_JP)
478 ENUM_CHARSET(CS_ISO2022_KR)
479
480 #endif /* ENUM_CHARSETS */