2 * iso2022s.c - support for ISO-2022 subset encodings.
4 * (The `s' suffix on the filename is there to leave `iso2022.c'
5 * free for the unlikely event that I ever attempt to implement
6 * _full_ ISO-2022 in this library!)
22 /* Functional description of a single ISO 2022 escape sequence. */
23 struct iso2022_escape
{
25 unsigned long andbits
, xorbits
;
27 * For output, these variables help us figure out which escape
28 * sequences we need to get where we want to be.
30 int container
, subcharset
;
35 * List of escape sequences supported in this subset. Must be
36 * in ASCII order, so that we can narrow down the list as
39 struct iso2022_escape
*escapes
; /* must be sorted in ASCII order! */
43 * We assign indices from 0 upwards to the sub-charsets of a
44 * given ISO 2022 subset. nbytes[i] tells us how many bytes per
45 * character are required by sub-charset i. (It's a string
46 * mainly because that makes it easier to declare in C syntax
52 * The characters in this string are indices-plus-one (so that
53 * NUL can still terminate) of escape sequences in `escapes'.
54 * These escapes are output in the given sequence to reset the
55 * encoding state, unless it turns out that a given escape
56 * would not change the state at all.
61 * Initial value of s1, in case the default container contents
62 * need to be something other than charset 0 in all cases.
63 * (Note that this must have the top bit set!)
68 * For output, some ISO 2022 subsets _mandate_ an initial shift
69 * sequence. If so, here it is so we can output it. (For the
70 * sake of basic sanity we won't bother to _require_ it on
71 * input, although it should of course be listed under
72 * `escapes' above so that we ignore it when present.)
74 char const *initial_sequence
;
77 * Function calls to do the actual translation.
79 long int (*to_ucs
)(int subcharset
, unsigned long bytes
);
80 int (*from_ucs
)(long int ucs
, int *subcharset
, unsigned long *bytes
);
83 static void read_iso2022s(charset_spec
const *charset
, long int input_chr
,
85 void (*emit
)(void *ctx
, long int output
),
88 struct iso2022
const *iso
= (struct iso2022
*)charset
->data
;
91 * For reading ISO-2022 subsets, we divide up our state
92 * variables as follows:
94 * - The top byte of s0 (bits 31:24) indicates, if nonzero,
95 * that we are part-way through a recognised ISO-2022 escape
96 * sequence. Five of those bits (31:27) give the index of
97 * the first member of the escapes list matching what we
98 * have so far; the remaining three (26:24) give the number
99 * of characters we have seen so far.
101 * - The top bit of s1 (bit 31) is set at all times after start-up, to
102 * indicate that we have performed any necessary
103 * initialisation. When we start, we detect that this bit is clear and
104 * respond to it by initialising the default container
107 * - The next three bits of s1 (bits 30:28) indicate which
108 * _container_ is currently selected. This isn't quite as
109 * simple as it sounds, since we have to preserve memory of
110 * which of the SI/SO containers we came from when we're
111 * temporarily in SS2/SS3. Hence, what happens is:
112 * + bit 28 indicates SI/SO.
113 * + if we're in an SS2/SS3 container, that's indicated by
114 * the two bits above that being nonzero and holding
116 * + Hence: 0 is SI, 1 is SO, 4 is SS2-from-SI, 5 is
117 * SS2-from-SO, 6 is SS3-from-SI, 7 is SS3-from-SO.
119 * - The next nibble of s1 (27:24) indicates how many bytes
120 * have been accumulated in the current character.
122 * - The remaining three bytes of s1 are divided into four
123 * six-bit sections, and each section gives the current
124 * sub-charset selected in one of the possible containers.
125 * (Those containers are SI, SO, SS2 and SS3, respectively
126 * and in order from the bottom of s1 to the top.)
128 * - The bottom 24 bits of s0 give the accumulated character
131 * (Note that this means s1 contains all the parts of the state
132 * which might need to be operated on by escape sequences.
136 if (!(state
->s1
& 0x80000000)) {
141 * So. Firstly, we process escape sequences, if we're in the
142 * middle of one or if we see a possible introducer (SI, SO,
145 if ((state
->s0
>> 24) ||
146 (input_chr
== SO
|| input_chr
== SI
|| input_chr
== ESC
)) {
147 int n
= (state
->s0
>> 24) & 7, i
= (state
->s0
>> 27), oi
= i
, j
;
150 * If this is the start of an escape sequence, we might be
151 * in mid-character. If so, clear the character state and
152 * emit an error token for the incomplete character.
154 if (state
->s1
& 0x0F000000) {
155 state
->s1
&= ~0x0F000000;
156 state
->s0
&= 0xFF000000;
158 * If we were in the SS2 or SS3 container, we
159 * automatically exit it.
161 if (state
->s1
& 0x60000000)
162 state
->s1
&= 0x9FFFFFFF;
163 emit(emitctx
, ERROR
);
167 while (j
< iso
->nescapes
&&
168 !memcmp(iso
->escapes
[j
].sequence
,
169 iso
->escapes
[oi
].sequence
, n
)) {
170 if (iso
->escapes
[j
].sequence
[n
] < input_chr
)
175 if (i
>= iso
->nescapes
||
176 memcmp(iso
->escapes
[i
].sequence
,
177 iso
->escapes
[oi
].sequence
, n
) ||
178 iso
->escapes
[i
].sequence
[n
] != input_chr
) {
180 * This character does not appear in any valid escape
181 * sequence. Therefore, we must emit all the characters
182 * we had previously swallowed, plus this one, and
183 * return to non-escape-sequence state.
185 for (j
= 0; j
< n
; j
++)
186 emit(emitctx
, iso
->escapes
[oi
].sequence
[j
]);
187 emit(emitctx
, input_chr
);
193 * Otherwise, we have found an additional character in our
194 * escape sequence. See if we have reached the _end_ of our
195 * sequence (and therefore must process the sequence).
198 if (!iso
->escapes
[i
].sequence
[n
]) {
200 state
->s1
&= iso
->escapes
[i
].andbits
;
201 state
->s1
^= iso
->escapes
[i
].xorbits
;
206 * Failing _that_, we simply update our escape-sequence-
209 assert(i
< 32 && n
< 8);
210 state
->s0
= (i
<< 27) | (n
<< 24);
215 * If this isn't an escape sequence, it must be part of a
216 * character. One possibility is that it's a control character
217 * (outside the range 0x21-0x7E), in which case we output it verbatim.
219 if (input_chr
< 0x21 || input_chr
> 0x7E) {
221 * We might be in mid-multibyte-character. If so, clear the
222 * character state and emit an error token for the
223 * incomplete character.
225 if (state
->s1
& 0x0F000000) {
226 state
->s1
&= ~0x0F000000;
227 state
->s0
&= 0xFF000000;
228 emit(emitctx
, ERROR
);
230 * If we were in the SS2 or SS3 container, we
231 * automatically exit it.
233 if (state
->s1
& 0x60000000)
234 state
->s1
&= 0x9FFFFFFF;
237 emit(emitctx
, input_chr
);
242 * Otherwise, accumulate character data.
246 int chrlen
, cont
, subcharset
, bytes
;
248 /* The current character and its length. */
249 chr
= ((state
->s0
& 0x00FFFFFF) << 8) | input_chr
;
250 chrlen
= ((state
->s1
>> 24) & 0xF) + 1;
251 /* The current sub-charset. */
252 cont
= (state
->s1
>> 28) & 7;
253 if (cont
> 1) cont
>>= 1;
254 subcharset
= (state
->s1
>> (6*cont
)) & 0x3F;
255 /* The number of bytes-per-character in that sub-charset. */
256 bytes
= iso
->nbytes
[subcharset
];
259 * If this character is now complete, we convert and emit
260 * it. Otherwise, we simply update the state and return.
262 if (chrlen
>= bytes
) {
263 emit(emitctx
, iso
->to_ucs(subcharset
, chr
));
266 * If we were in the SS2 or SS3 container, we
267 * automatically exit it.
269 if (state
->s1
& 0x60000000)
270 state
->s1
&= 0x9FFFFFFF;
272 state
->s0
= (state
->s0
& 0xFF000000) | chr
;
273 state
->s1
= (state
->s1
& 0xF0FFFFFF) | (chrlen
<< 24);
277 static int write_iso2022s(charset_spec
const *charset
, long int input_chr
,
278 charset_state
*state
,
279 void (*emit
)(void *ctx
, long int output
),
282 struct iso2022
const *iso
= (struct iso2022
*)charset
->data
;
283 int subcharset
, len
, i
, j
, cont
;
287 * For output, our s1 state variable contains most of the same
288 * stuff as it did for input - initial-state indicator bit,
289 * current container, and current subcharset selected in each
294 * Analyse the character and find out what subcharset it needs
297 if (input_chr
>= 0 && !iso
->from_ucs(input_chr
, &subcharset
, &bytes
))
300 if (!(state
->s1
& 0x80000000)) {
302 if (iso
->initial_sequence
)
303 for (i
= 0; iso
->initial_sequence
[i
]; i
++)
304 emit(emitctx
, iso
->initial_sequence
[i
]);
307 if (input_chr
== -1) {
308 unsigned long oldstate
;
312 * Special case: reset encoding state.
314 for (i
= 0; iso
->reset
[i
]; i
++) {
315 j
= iso
->reset
[i
] - 1;
316 oldstate
= state
->s1
;
317 state
->s1
&= iso
->escapes
[j
].andbits
;
318 state
->s1
^= iso
->escapes
[j
].xorbits
;
319 if (state
->s1
!= oldstate
) {
320 /* We must actually emit this sequence. */
321 for (k
= 0; iso
->escapes
[j
].sequence
[k
]; k
++)
322 emit(emitctx
, iso
->escapes
[j
].sequence
[k
]);
330 * Now begins the fun. We now know what subcharset we want. So
331 * we must find out which container we should select it into,
332 * select it into it if necessary, select that _container_ if
333 * necessary, and then output the given bytes.
335 for (i
= 0; i
< iso
->nescapes
; i
++)
336 if (iso
->escapes
[i
].subcharset
== subcharset
)
338 assert(i
< iso
->nescapes
);
341 * We've found the escape sequence which would select this
342 * subcharset into a container. However, that subcharset might
343 * already _be_ selected in that container! Check before we go
344 * to the effort of emitting the sequence.
346 cont
= iso
->escapes
[i
].container
;
347 if (((state
->s1
>> (6*cont
)) & 0x3F) != (unsigned)subcharset
) {
348 for (j
= 0; iso
->escapes
[i
].sequence
[j
]; j
++)
349 emit(emitctx
, iso
->escapes
[i
].sequence
[j
]);
350 state
->s1
&= iso
->escapes
[i
].andbits
;
351 state
->s1
^= iso
->escapes
[i
].xorbits
;
355 * Now we know what container our subcharset is in, so we want
356 * to select that container.
359 /* SS2 or SS3; just output the sequence and be done. */
361 emit(emitctx
, 'L' + cont
); /* comes out to 'N' or 'O' */
363 /* Emit SI or SO, but only if the current container isn't already
365 if (((state
->s1
>> 28) & 7) != (unsigned)cont
) {
366 emit(emitctx
, cont ? SO
: SI
);
367 state
->s1
= (state
->s1
& 0x8FFFFFFF) | (cont
<< 28);
372 * We're done. Subcharset is selected in container, container
373 * is selected. All we need now is to write out the bytes.
375 len
= iso
->nbytes
[subcharset
];
377 emit(emitctx
, (bytes
>> (8*len
)) & 0xFF);
383 * ISO-2022-JP, defined in RFC 1468.
385 static long int iso2022jp_to_ucs(int subcharset
, unsigned long bytes
)
387 switch (subcharset
) {
388 case 0: return bytes
; /* one-byte ASCII */
389 case 1: /* JIS X 0201 half-width katakana */
390 if (bytes
>= 0x21 && bytes
<= 0x5F)
391 return bytes
+ (0xFF61 - 0x21);
394 /* (no break needed since all control paths have returned) */
395 case 2: return jisx0208_to_unicode(((bytes
>> 8) & 0xFF) - 0x21,
396 ((bytes
) & 0xFF) - 0x21);
397 default: return ERROR
;
400 static int iso2022jp_from_ucs(long int ucs
, int *subcharset
,
401 unsigned long *bytes
)
408 } else if (ucs
>= 0xFF61 && ucs
<= 0xFF9F) {
410 *bytes
= ucs
- (0xFF61 - 0x21);
412 } else if (unicode_to_jisx0208(ucs
, &r
, &c
)) {
414 *bytes
= ((r
+0x21) << 8) | (c
+0x21);
420 static struct iso2022_escape iso2022jp_escapes
[] = {
421 {"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1}, /* we ignore this one */
422 {"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2},
423 {"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0},
424 {"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1},
426 static struct iso2022 iso2022jp
= {
427 iso2022jp_escapes
, lenof(iso2022jp_escapes
),
428 "\1\1\2", "\3", 0x80000000, NULL
, iso2022jp_to_ucs
, iso2022jp_from_ucs
430 const charset_spec charset_CS_ISO2022_JP
= {
431 CS_ISO2022_JP
, read_iso2022s
, write_iso2022s
, &iso2022jp
435 * ISO-2022-KR, defined in RFC 1557.
437 static long int iso2022kr_to_ucs(int subcharset
, unsigned long bytes
)
439 switch (subcharset
) {
440 case 0: return bytes
; /* one-byte ASCII */
441 case 1: return ksx1001_to_unicode(((bytes
>> 8) & 0xFF) - 0x21,
442 ((bytes
) & 0xFF) - 0x21);
443 default: return ERROR
;
446 static int iso2022kr_from_ucs(long int ucs
, int *subcharset
,
447 unsigned long *bytes
)
454 } else if (unicode_to_ksx1001(ucs
, &r
, &c
)) {
456 *bytes
= ((r
+0x21) << 8) | (c
+0x21);
462 static struct iso2022_escape iso2022kr_escapes
[] = {
463 {"\016", 0x8FFFFFFF, 0x10000000, -1, -1},
464 {"\017", 0x8FFFFFFF, 0x00000000, 0, 0},
465 {"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1}, /* bits[11:6] <- 1 */
467 static struct iso2022 iso2022kr
= {
468 iso2022kr_escapes
, lenof(iso2022kr_escapes
),
469 "\1\2", "\2", 0x80000040, "\033$)C", iso2022kr_to_ucs
, iso2022kr_from_ucs
471 const charset_spec charset_CS_ISO2022_KR
= {
472 CS_ISO2022_KR
, read_iso2022s
, write_iso2022s
, &iso2022kr
475 #else /* ENUM_CHARSETS */
477 ENUM_CHARSET(CS_ISO2022_JP
)
478 ENUM_CHARSET(CS_ISO2022_KR
)
480 #endif /* ENUM_CHARSETS */