First draft of Unicode support in pterm. It's pretty complete: it
[u/mdw/putty] / unix / uxucs.c
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <ctype.h>
4 #include <locale.h>
5 #include <limits.h>
6 #include <wchar.h>
7
8 #include <time.h>
9
10 #include "putty.h"
11 #include "terminal.h"
12 #include "misc.h"
13
14 /*
15 * Unix Unicode-handling routines.
16 */
17
/*
 * Report whether `byte' is the lead byte of a double-byte character in
 * `codepage'.  DBCS is not handled by this port, so neither argument is
 * examined and the answer is always 0.
 */
int is_dbcs_leadbyte(int codepage, char byte)
{
    (void)codepage;
    (void)byte;

    return 0;
}
22
23 int mb_to_wc(int codepage, int flags, char *mbstr, int mblen,
24 wchar_t *wcstr, int wclen)
25 {
26 if (codepage == DEFAULT_CODEPAGE) {
27 int n = 0;
28 mbstate_t state = { 0 };
29
30 setlocale(LC_CTYPE, "");
31
32 while (mblen > 0) {
33 size_t i = mbrtowc(wcstr+n, mbstr, (size_t)mblen, &state);
34 if (i == (size_t)-1 || i == (size_t)-2)
35 break;
36 n++;
37 mbstr += i;
38 mblen -= i;
39 }
40
41 setlocale(LC_CTYPE, "C");
42
43 return n;
44 } else
45 return charset_to_unicode(&mbstr, &mblen, wcstr, wclen, codepage,
46 NULL, NULL, 0);
47 }
48
49 int wc_to_mb(int codepage, int flags, wchar_t *wcstr, int wclen,
50 char *mbstr, int mblen, char *defchr, int *defused)
51 {
52 /* FIXME: we should remove the defused param completely... */
53 if (defused)
54 *defused = 0;
55
56 if (codepage == DEFAULT_CODEPAGE) {
57 char output[MB_LEN_MAX];
58 mbstate_t state = { 0 };
59 int n = 0;
60
61 setlocale(LC_CTYPE, "");
62
63 while (wclen > 0) {
64 int i = wcrtomb(output, wcstr[0], &state);
65 if (i == (size_t)-1 || i > n - mblen)
66 break;
67 memcpy(mbstr+n, output, i);
68 n += i;
69 wcstr++;
70 wclen--;
71 }
72
73 setlocale(LC_CTYPE, "C");
74
75 return n;
76 } else
77 return charset_from_unicode(&wcstr, &wclen, mbstr, mblen, codepage,
78 NULL, NULL, 0);
79 }
80
81 void init_ucs(void)
82 {
83 int i;
84
85 /*
86 * In the platform-independent parts of the code, font_codepage
87 * is used only for system DBCS support - which we don't
88 * support at all. So we set this to something which will never
89 * be used.
90 */
91 font_codepage = -1;
92
93 /*
94 * line_codepage should be decoded from the specification in
95 * cfg.
96 */
97 line_codepage = charset_from_mimeenc(cfg.line_codepage);
98 if (line_codepage == CS_NONE)
99 line_codepage = charset_from_xenc(cfg.line_codepage);
100 /* If it's still CS_NONE, we should assume direct-to-font. */
101
102 /* FIXME: this is a hack. Currently fonts with incomprehensible
103 * encodings are dealt with by pretending they're 8859-1. It's
104 * ugly, but it's good enough to stop things crashing. Should do
105 * something better here. */
106 if (line_codepage == CS_NONE)
107 line_codepage = CS_ISO8859_1;
108
109 /*
110 * Set up unitab_line, by translating each individual character
111 * in the line codepage into Unicode.
112 */
113 for (i = 0; i < 256; i++) {
114 char c[1], *p;
115 wchar_t wc[1];
116 int len;
117 c[0] = i;
118 p = c;
119 len = 1;
120 if (1 == charset_to_unicode(&p,&len,wc,1,line_codepage,NULL,L"",0))
121 unitab_line[i] = wc[0];
122 else
123 unitab_line[i] = 0xFFFD;
124 }
125
126 /*
127 * Set up unitab_xterm. This is the same as unitab_line except
128 * in the line-drawing regions, where it follows the Unicode
129 * encoding.
130 *
131 * (Note that the strange X encoding of line-drawing characters
132 * in the bottom 32 glyphs of ISO8859-1 fonts is taken care of
133 * by the font encoding, which will spot such a font and act as
134 * if it were in a variant encoding of ISO8859-1.)
135 */
136 for (i = 0; i < 256; i++) {
137 static const wchar_t unitab_xterm_std[32] = {
138 0x2666, 0x2592, 0x2409, 0x240c, 0x240d, 0x240a, 0x00b0, 0x00b1,
139 0x2424, 0x240b, 0x2518, 0x2510, 0x250c, 0x2514, 0x253c, 0x23ba,
140 0x23bb, 0x2500, 0x23bc, 0x23bd, 0x251c, 0x2524, 0x2534, 0x252c,
141 0x2502, 0x2264, 0x2265, 0x03c0, 0x2260, 0x00a3, 0x00b7, 0x0020
142 };
143 if (i >= 0x5F && i < 0x7F)
144 unitab_xterm[i] = unitab_xterm_std[i & 0x1F];
145 else
146 unitab_xterm[i] = unitab_line[i];
147 }
148
149 /*
150 * Set up unitab_scoacs. The SCO Alternate Character Set is
151 * simply CP437.
152 */
153 for (i = 0; i < 256; i++) {
154 char c[1], *p;
155 wchar_t wc[1];
156 int len;
157 c[0] = i;
158 p = c;
159 len = 1;
160 if (1 == charset_to_unicode(&p,&len,wc,1,CS_CP437,NULL,L"",0))
161 unitab_scoacs[i] = wc[0];
162 else
163 unitab_scoacs[i] = 0xFFFD;
164 }
165
166 /* Find the line control characters. */
167 for (i = 0; i < 256; i++)
168 if (unitab_line[i] < ' '
169 || (unitab_line[i] >= 0x7F && unitab_line[i] < 0xA0))
170 unitab_ctrl[i] = i;
171 else
172 unitab_ctrl[i] = 0xFF;
173 }