| 1 | #include <stdio.h> |
| 2 | #include <stdlib.h> |
| 3 | #include <ctype.h> |
| 4 | #include <locale.h> |
| 5 | #include <limits.h> |
| 6 | #include <wchar.h> |
| 7 | |
| 8 | #include <time.h> |
| 9 | |
| 10 | #include "putty.h" |
| 11 | #include "charset.h" |
| 12 | #include "terminal.h" |
| 13 | #include "misc.h" |
| 14 | |
| 15 | /* |
| 16 | * Unix Unicode-handling routines. |
| 17 | */ |
| 18 | |
/*
 * Report whether `byte' is a DBCS lead byte in `codepage'. DBCS is
 * not handled on this platform, so both arguments are ignored and
 * the answer is always 0.
 */
int is_dbcs_leadbyte(int codepage, char byte)
{
    (void)codepage;
    (void)byte;

    return 0;
}
| 23 | |
| 24 | int mb_to_wc(int codepage, int flags, const char *mbstr, int mblen, |
| 25 | wchar_t *wcstr, int wclen) |
| 26 | { |
| 27 | if (codepage == DEFAULT_CODEPAGE) { |
| 28 | int n = 0; |
| 29 | mbstate_t state; |
| 30 | |
| 31 | memset(&state, 0, sizeof state); |
| 32 | |
| 33 | while (mblen > 0) { |
| 34 | size_t i = mbrtowc(wcstr+n, mbstr, (size_t)mblen, &state); |
| 35 | if (i == (size_t)-1 || i == (size_t)-2) |
| 36 | break; |
| 37 | n++; |
| 38 | mbstr += i; |
| 39 | mblen -= i; |
| 40 | } |
| 41 | |
| 42 | return n; |
| 43 | } else if (codepage == CS_NONE) { |
| 44 | int n = 0; |
| 45 | |
| 46 | while (mblen > 0) { |
| 47 | wcstr[n] = 0xD800 | (mbstr[0] & 0xFF); |
| 48 | n++; |
| 49 | mbstr++; |
| 50 | mblen--; |
| 51 | } |
| 52 | |
| 53 | return n; |
| 54 | } else |
| 55 | return charset_to_unicode(&mbstr, &mblen, wcstr, wclen, codepage, |
| 56 | NULL, NULL, 0); |
| 57 | } |
| 58 | |
| 59 | int wc_to_mb(int codepage, int flags, const wchar_t *wcstr, int wclen, |
| 60 | char *mbstr, int mblen, char *defchr, int *defused, |
| 61 | struct unicode_data *ucsdata) |
| 62 | { |
| 63 | /* FIXME: we should remove the defused param completely... */ |
| 64 | if (defused) |
| 65 | *defused = 0; |
| 66 | |
| 67 | if (codepage == DEFAULT_CODEPAGE) { |
| 68 | char output[MB_LEN_MAX]; |
| 69 | mbstate_t state; |
| 70 | int n = 0; |
| 71 | |
| 72 | memset(&state, 0, sizeof state); |
| 73 | |
| 74 | while (wclen > 0) { |
| 75 | int i = wcrtomb(output, wcstr[0], &state); |
| 76 | if (i == (size_t)-1 || i > n - mblen) |
| 77 | break; |
| 78 | memcpy(mbstr+n, output, i); |
| 79 | n += i; |
| 80 | wcstr++; |
| 81 | wclen--; |
| 82 | } |
| 83 | |
| 84 | return n; |
| 85 | } else if (codepage == CS_NONE) { |
| 86 | int n = 0; |
| 87 | while (wclen > 0 && n < mblen) { |
| 88 | if (*wcstr >= 0xD800 && *wcstr < 0xD900) |
| 89 | mbstr[n++] = (*wcstr & 0xFF); |
| 90 | else if (defchr) |
| 91 | mbstr[n++] = *defchr; |
| 92 | wcstr++; |
| 93 | wclen--; |
| 94 | } |
| 95 | return n; |
| 96 | } else { |
| 97 | return charset_from_unicode(&wcstr, &wclen, mbstr, mblen, codepage, |
| 98 | NULL, defchr?defchr:NULL, defchr?1:0); |
| 99 | } |
| 100 | } |
| 101 | |
| 102 | /* |
| 103 | * Return value is TRUE if pterm is to run in direct-to-font mode. |
| 104 | */ |
| 105 | int init_ucs(struct unicode_data *ucsdata, char *linecharset, |
| 106 | int utf8_override, int font_charset, int vtmode) |
| 107 | { |
| 108 | int i, ret = 0; |
| 109 | |
| 110 | /* |
| 111 | * In the platform-independent parts of the code, font_codepage |
| 112 | * is used only for system DBCS support - which we don't |
| 113 | * support at all. So we set this to something which will never |
| 114 | * be used. |
| 115 | */ |
| 116 | ucsdata->font_codepage = -1; |
| 117 | |
| 118 | /* |
| 119 | * If utf8_override is set and the POSIX locale settings |
| 120 | * dictate a UTF-8 character set, then just go straight for |
| 121 | * UTF-8. |
| 122 | */ |
| 123 | ucsdata->line_codepage = CS_NONE; |
| 124 | if (utf8_override) { |
| 125 | const char *s; |
| 126 | if (((s = getenv("LC_ALL")) && *s) || |
| 127 | ((s = getenv("LC_CTYPE")) && *s) || |
| 128 | ((s = getenv("LANG")) && *s)) { |
| 129 | if (strstr(s, "UTF-8")) |
| 130 | ucsdata->line_codepage = CS_UTF8; |
| 131 | } |
| 132 | } |
| 133 | |
| 134 | /* |
| 135 | * Failing that, line_codepage should be decoded from the |
| 136 | * specification in conf. |
| 137 | */ |
| 138 | if (ucsdata->line_codepage == CS_NONE) |
| 139 | ucsdata->line_codepage = decode_codepage(linecharset); |
| 140 | |
| 141 | /* |
| 142 | * If line_codepage is _still_ CS_NONE, we assume we're using |
| 143 | * the font's own encoding. This has been passed in to us, so |
| 144 | * we use that. If it's still CS_NONE after _that_ - i.e. the |
| 145 | * font we were given had an incomprehensible charset - then we |
| 146 | * fall back to using the D800 page. |
| 147 | */ |
| 148 | if (ucsdata->line_codepage == CS_NONE) |
| 149 | ucsdata->line_codepage = font_charset; |
| 150 | |
| 151 | if (ucsdata->line_codepage == CS_NONE) |
| 152 | ret = 1; |
| 153 | |
| 154 | /* |
| 155 | * Set up unitab_line, by translating each individual character |
| 156 | * in the line codepage into Unicode. |
| 157 | */ |
| 158 | for (i = 0; i < 256; i++) { |
| 159 | char c[1]; |
| 160 | const char *p; |
| 161 | wchar_t wc[1]; |
| 162 | int len; |
| 163 | c[0] = i; |
| 164 | p = c; |
| 165 | len = 1; |
| 166 | if (ucsdata->line_codepage == CS_NONE) |
| 167 | ucsdata->unitab_line[i] = 0xD800 | i; |
| 168 | else if (1 == charset_to_unicode(&p, &len, wc, 1, |
| 169 | ucsdata->line_codepage, |
| 170 | NULL, L"", 0)) |
| 171 | ucsdata->unitab_line[i] = wc[0]; |
| 172 | else |
| 173 | ucsdata->unitab_line[i] = 0xFFFD; |
| 174 | } |
| 175 | |
| 176 | /* |
| 177 | * Set up unitab_xterm. This is the same as unitab_line except |
| 178 | * in the line-drawing regions, where it follows the Unicode |
| 179 | * encoding. |
| 180 | * |
| 181 | * (Note that the strange X encoding of line-drawing characters |
| 182 | * in the bottom 32 glyphs of ISO8859-1 fonts is taken care of |
| 183 | * by the font encoding, which will spot such a font and act as |
| 184 | * if it were in a variant encoding of ISO8859-1.) |
| 185 | */ |
| 186 | for (i = 0; i < 256; i++) { |
| 187 | static const wchar_t unitab_xterm_std[32] = { |
| 188 | 0x2666, 0x2592, 0x2409, 0x240c, 0x240d, 0x240a, 0x00b0, 0x00b1, |
| 189 | 0x2424, 0x240b, 0x2518, 0x2510, 0x250c, 0x2514, 0x253c, 0x23ba, |
| 190 | 0x23bb, 0x2500, 0x23bc, 0x23bd, 0x251c, 0x2524, 0x2534, 0x252c, |
| 191 | 0x2502, 0x2264, 0x2265, 0x03c0, 0x2260, 0x00a3, 0x00b7, 0x0020 |
| 192 | }; |
| 193 | static const wchar_t unitab_xterm_poorman[32] = |
| 194 | L"*#****o~**+++++-----++++|****L. "; |
| 195 | |
| 196 | const wchar_t *ptr; |
| 197 | |
| 198 | if (vtmode == VT_POORMAN) |
| 199 | ptr = unitab_xterm_poorman; |
| 200 | else |
| 201 | ptr = unitab_xterm_std; |
| 202 | |
| 203 | if (i >= 0x5F && i < 0x7F) |
| 204 | ucsdata->unitab_xterm[i] = ptr[i & 0x1F]; |
| 205 | else |
| 206 | ucsdata->unitab_xterm[i] = ucsdata->unitab_line[i]; |
| 207 | } |
| 208 | |
| 209 | /* |
| 210 | * Set up unitab_scoacs. The SCO Alternate Character Set is |
| 211 | * simply CP437. |
| 212 | */ |
| 213 | for (i = 0; i < 256; i++) { |
| 214 | char c[1]; |
| 215 | const char *p; |
| 216 | wchar_t wc[1]; |
| 217 | int len; |
| 218 | c[0] = i; |
| 219 | p = c; |
| 220 | len = 1; |
| 221 | if (1 == charset_to_unicode(&p, &len, wc, 1, CS_CP437, NULL, L"", 0)) |
| 222 | ucsdata->unitab_scoacs[i] = wc[0]; |
| 223 | else |
| 224 | ucsdata->unitab_scoacs[i] = 0xFFFD; |
| 225 | } |
| 226 | |
| 227 | /* |
| 228 | * Find the control characters in the line codepage. For |
| 229 | * direct-to-font mode using the D800 hack, we assume 00-1F and |
| 230 | * 7F are controls, but allow 80-9F through. (It's as good a |
| 231 | * guess as anything; and my bet is that half the weird fonts |
| 232 | * used in this way will be IBM or MS code pages anyway.) |
| 233 | */ |
| 234 | for (i = 0; i < 256; i++) { |
| 235 | int lineval = ucsdata->unitab_line[i]; |
| 236 | if (lineval < ' ' || (lineval >= 0x7F && lineval < 0xA0) || |
| 237 | (lineval >= 0xD800 && lineval < 0xD820) || (lineval == 0xD87F)) |
| 238 | ucsdata->unitab_ctrl[i] = i; |
| 239 | else |
| 240 | ucsdata->unitab_ctrl[i] = 0xFF; |
| 241 | } |
| 242 | |
| 243 | return ret; |
| 244 | } |
| 245 | |
| 246 | const char *cp_name(int codepage) |
| 247 | { |
| 248 | if (codepage == CS_NONE) |
| 249 | return "Use font encoding"; |
| 250 | return charset_to_localenc(codepage); |
| 251 | } |
| 252 | |
| 253 | const char *cp_enumerate(int index) |
| 254 | { |
| 255 | int charset; |
| 256 | if (index == 0) |
| 257 | return "Use font encoding"; |
| 258 | charset = charset_localenc_nth(index-1); |
| 259 | if (charset == CS_NONE) |
| 260 | return NULL; |
| 261 | return charset_to_localenc(charset); |
| 262 | } |
| 263 | |
| 264 | int decode_codepage(char *cp_name) |
| 265 | { |
| 266 | if (!*cp_name) |
| 267 | return CS_NONE; /* use font encoding */ |
| 268 | return charset_from_localenc(cp_name); |
| 269 | } |