Silly of me to overlook it: another obvious way you might like to
[sgt/charset] / hz.c
CommitLineData
c6d25d8d 1/*
2 * hz.c - HZ textual encoding of ASCII and GB2312, as defined in RFC 1843.
3 */
4
5#ifndef ENUM_CHARSETS
6
7#include <assert.h>
8
9#include "charset.h"
10#include "internal.h"
11
12static void read_hz(charset_spec const *charset, long int input_chr,
13 charset_state *state,
14 void (*emit)(void *ctx, long int output), void *emitctx)
15{
16 /*
17 * When reading, our state variables are:
18 *
19 * - s0 is 0 in ASCII mode, 1 in GB2312 mode.
20 *
21 * - s1 stores a character we have just seen but not fully
22 * processed. So in ASCII mode, this can only ever be zero
23 * (no character) or 0x7E (~); in GB2312 mode it can be
24 * anything from 0x21-0x7E.
25 */
26
27 UNUSEDARG(charset);
28
29 if (state->s0 == 0) {
30 /*
31 * ASCII mode.
32 */
33
34 if (state->s1) {
35 assert(state->s1 == '~');
36 state->s1 = 0;
37 /* Process the character after a tilde. */
38 switch (input_chr) {
39 case '~':
40 emit(emitctx, input_chr);
41 return;
42 case '\n':
43 return; /* ~\n is ignored */
44 case '{':
45 state->s0 = 1; /* switch to GB2312 mode */
46 return;
47 }
48 } else if (input_chr == '~') {
49 state->s1 = '~';
50 return;
51 } else {
52 /* In ASCII mode, any non-tildes go straight */
53 emit(emitctx, input_chr);
54 return;
55 }
56 } else {
57 /*
58 * GB2312 mode. As I understand it, we expect never to see
59 * anything in this mode that isn't 0x21-0x7E. So if we do,
60 * we'll simply throw an error and return to ASCII mode.
61 */
62 if (input_chr < 0x21 || input_chr > 0x7E) {
63 emit(emitctx, ERROR);
64 state->s0 = state->s1 = 0;
65 return;
66 }
67
68 /*
69 * So if we don't have a character stored already, store
70 * this one...
71 */
72 if (!state->s1) {
73 state->s1 = input_chr;
74 return;
75 }
76
77 /*
78 * ... otherwise, combine the stored char with this one.
79 * This will give either `~}', the escape sequence to
80 * return to ASCII mode, or something which we translate
81 * through GB2312.
82 */
83 if (state->s1 == '~' && input_chr == '}') {
84 state->s1 = state->s0 = 0;
85 return;
86 }
87
88 emit(emitctx, gb2312_to_unicode(state->s1 - 0x21, input_chr - 0x21));
89 state->s1 = 0;
90 }
91}
92
93static int write_hz(charset_spec const *charset, long int input_chr,
94 charset_state *state,
95 void (*emit)(void *ctx, long int output), void *emitctx)
96{
97 int desired_state, r, c;
98
99 UNUSEDARG(charset);
100
101 /*
102 * Analyse the input char.
103 */
104 if (input_chr < 0x80) {
105 desired_state = 0;
106 c = input_chr;
107 } else if (unicode_to_gb2312(input_chr, &r, &c)) {
108 desired_state = 1;
109 } else {
110 return FALSE;
111 }
112
3cca0edf 113 if (state->s0 != (unsigned)desired_state) {
c6d25d8d 114 emit(emitctx, '~');
115 emit(emitctx, desired_state ? '{' : '}');
116 state->s0 = desired_state;
117 }
118
119 if (input_chr < 0)
120 return TRUE; /* special case: just reset state */
121
122 if (state->s0) {
123 /*
124 * GB mode.
125 */
126 emit(emitctx, 0x21 + r);
127 emit(emitctx, 0x21 + c);
128 } else {
129 emit(emitctx, c);
130 }
131 return TRUE;
132}
133
/*
 * Charset descriptor for HZ, initialised positionally with CS_HZ,
 * read_hz, write_hz and NULL.
 * NOTE(review): presumably these are the charset id, the decode and
 * encode callbacks, and unused private data -- confirm against the
 * definition of charset_spec.
 */
134const charset_spec charset_CS_HZ = {
135 CS_HZ, read_hz, write_hz, NULL
136};
137
138#else /* ENUM_CHARSETS */
139
/*
 * When ENUM_CHARSETS is defined, the only thing this file contributes
 * is this ENUM_CHARSET entry for CS_HZ; the codec above is compiled out.
 */
140ENUM_CHARSET(CS_HZ)
141
142#endif /* ENUM_CHARSETS */