c6d25d8d |
1 | /* |
2 | * hz.c - HZ textual encoding of ASCII and GB2312, as defined in RFC 1843. |
3 | */ |
4 | |
5 | #ifndef ENUM_CHARSETS |
6 | |
7 | #include <assert.h> |
8 | |
9 | #include "charset.h" |
10 | #include "internal.h" |
11 | |
12 | static void read_hz(charset_spec const *charset, long int input_chr, |
13 | charset_state *state, |
14 | void (*emit)(void *ctx, long int output), void *emitctx) |
15 | { |
16 | /* |
17 | * When reading, our state variables are: |
18 | * |
19 | * - s0 is 0 in ASCII mode, 1 in GB2312 mode. |
20 | * |
21 | * - s1 stores a character we have just seen but not fully |
22 | * processed. So in ASCII mode, this can only ever be zero |
23 | * (no character) or 0x7E (~); in GB2312 mode it can be |
24 | * anything from 0x21-0x7E. |
25 | */ |
26 | |
27 | UNUSEDARG(charset); |
28 | |
29 | if (state->s0 == 0) { |
30 | /* |
31 | * ASCII mode. |
32 | */ |
33 | |
34 | if (state->s1) { |
35 | assert(state->s1 == '~'); |
36 | state->s1 = 0; |
37 | /* Process the character after a tilde. */ |
38 | switch (input_chr) { |
39 | case '~': |
40 | emit(emitctx, input_chr); |
41 | return; |
42 | case '\n': |
43 | return; /* ~\n is ignored */ |
44 | case '{': |
45 | state->s0 = 1; /* switch to GB2312 mode */ |
46 | return; |
47 | } |
48 | } else if (input_chr == '~') { |
49 | state->s1 = '~'; |
50 | return; |
51 | } else { |
52 | /* In ASCII mode, any non-tildes go straight */ |
53 | emit(emitctx, input_chr); |
54 | return; |
55 | } |
56 | } else { |
57 | /* |
58 | * GB2312 mode. As I understand it, we expect never to see |
59 | * anything in this mode that isn't 0x21-0x7E. So if we do, |
60 | * we'll simply throw an error and return to ASCII mode. |
61 | */ |
62 | if (input_chr < 0x21 || input_chr > 0x7E) { |
63 | emit(emitctx, ERROR); |
64 | state->s0 = state->s1 = 0; |
65 | return; |
66 | } |
67 | |
68 | /* |
69 | * So if we don't have a character stored already, store |
70 | * this one... |
71 | */ |
72 | if (!state->s1) { |
73 | state->s1 = input_chr; |
74 | return; |
75 | } |
76 | |
77 | /* |
78 | * ... otherwise, combine the stored char with this one. |
79 | * This will give either `~}', the escape sequence to |
80 | * return to ASCII mode, or something which we translate |
81 | * through GB2312. |
82 | */ |
83 | if (state->s1 == '~' && input_chr == '}') { |
84 | state->s1 = state->s0 = 0; |
85 | return; |
86 | } |
87 | |
88 | emit(emitctx, gb2312_to_unicode(state->s1 - 0x21, input_chr - 0x21)); |
89 | state->s1 = 0; |
90 | } |
91 | } |
92 | |
93 | static int write_hz(charset_spec const *charset, long int input_chr, |
94 | charset_state *state, |
95 | void (*emit)(void *ctx, long int output), void *emitctx) |
96 | { |
97 | int desired_state, r, c; |
98 | |
99 | UNUSEDARG(charset); |
100 | |
101 | /* |
102 | * Analyse the input char. |
103 | */ |
104 | if (input_chr < 0x80) { |
105 | desired_state = 0; |
106 | c = input_chr; |
107 | } else if (unicode_to_gb2312(input_chr, &r, &c)) { |
108 | desired_state = 1; |
109 | } else { |
110 | return FALSE; |
111 | } |
112 | |
3cca0edf |
113 | if (state->s0 != (unsigned)desired_state) { |
c6d25d8d |
114 | emit(emitctx, '~'); |
115 | emit(emitctx, desired_state ? '{' : '}'); |
116 | state->s0 = desired_state; |
117 | } |
118 | |
119 | if (input_chr < 0) |
120 | return TRUE; /* special case: just reset state */ |
121 | |
122 | if (state->s0) { |
123 | /* |
124 | * GB mode. |
125 | */ |
126 | emit(emitctx, 0x21 + r); |
127 | emit(emitctx, 0x21 + c); |
128 | } else { |
129 | emit(emitctx, c); |
130 | } |
131 | return TRUE; |
132 | } |
133 | |
134 | const charset_spec charset_CS_HZ = { |
135 | CS_HZ, read_hz, write_hz, NULL |
136 | }; |
137 | |
138 | #else /* ENUM_CHARSETS */ |
139 | |
/*
 * When ENUM_CHARSETS is defined, none of the codec above is compiled
 * and this file contributes only this one entry naming CS_HZ.
 * ENUM_CHARSET is not defined in this file; presumably the including
 * file supplies it.
 */
ENUM_CHARSET(CS_HZ)
141 | |
142 | #endif /* ENUM_CHARSETS */ |