[u/mdw/putty] / unix / uxucs.c

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <locale.h>
#include <limits.h>
#include <wchar.h>

#include <time.h>

#include "putty.h"
#include "charset.h"
#include "terminal.h"
#include "misc.h"

/*
 * Unix Unicode-handling routines.
 */

/*
 * Report whether `byte' is a DBCS lead byte in `codepage'. This
 * implementation does not do DBCS, so the answer is always "no" and
 * both arguments are ignored.
 */
int is_dbcs_leadbyte(int codepage, char byte)
{
    (void)codepage;
    (void)byte;

    return 0;
}

/*
 * Convert the multibyte string mbstr (mblen bytes) into wide
 * characters stored in wcstr, which has room for wclen elements.
 *
 * The conversion depends on `codepage':
 *  - DEFAULT_CODEPAGE: decode with mbrtowc() after switching LC_CTYPE
 *    to the locale named by the environment; LC_CTYPE is set to "C"
 *    again before returning.
 *  - CS_NONE: each input byte b becomes the wide character 0xD800 | b.
 *  - anything else: the whole job is handed to charset_to_unicode().
 *
 * `flags' is not used here.
 *
 * Returns the number of wide characters written. In the first two
 * cases conversion stops early at an invalid or incomplete multibyte
 * sequence, or when wcstr is full.
 */
int mb_to_wc(int codepage, int flags, char *mbstr, int mblen,
	     wchar_t *wcstr, int wclen)
{
    if (codepage == DEFAULT_CODEPAGE) {
	int n = 0;
	mbstate_t state;

	memset(&state, 0, sizeof state);
	setlocale(LC_CTYPE, "");

	/* Never write more than wclen elements into wcstr. */
	while (mblen > 0 && n < wclen) {
	    size_t i = mbrtowc(wcstr+n, mbstr, (size_t)mblen, &state);
	    if (i == (size_t)-1 || i == (size_t)-2)
		break;
	    /*
	     * mbrtowc() returns 0 (not a byte count) when it decodes a
	     * null wide character. Without this adjustment the input
	     * would never advance and the loop would spin forever.
	     * NOTE(review): this assumes the NUL occupied exactly one
	     * input byte, which holds for the usual stateless
	     * encodings.
	     */
	    if (i == 0)
		i = 1;
	    n++;
	    mbstr += i;
	    mblen -= (int)i;
	}

	setlocale(LC_CTYPE, "C");

	return n;
    } else if (codepage == CS_NONE) {
	int n = 0;

	/* Direct-to-font: park each byte in the D800 page. */
	while (mblen > 0 && n < wclen) {
	    wcstr[n] = 0xD800 | (mbstr[0] & 0xFF);
	    n++;
	    mbstr++;
	    mblen--;
	}

	return n;
    } else
	return charset_to_unicode(&mbstr, &mblen, wcstr, wclen, codepage,
				  NULL, NULL, 0);
}

/*
 * Convert the wide string wcstr (wclen characters) into multibyte
 * form in mbstr, which has room for mblen bytes.
 *
 * The conversion depends on `codepage':
 *  - DEFAULT_CODEPAGE: encode with wcrtomb() after switching LC_CTYPE
 *    to the locale named by the environment; LC_CTYPE is set to "C"
 *    again before returning. Conversion stops at the first character
 *    that cannot be encoded or does not fit in the remaining space.
 *  - CS_NONE: characters in 0xD800..0xD8FF are emitted as their low
 *    byte; any other character is replaced by *defchr if defchr is
 *    non-NULL, and silently dropped otherwise.
 *  - anything else: the whole job is handed to charset_from_unicode().
 *
 * `flags' and `ucsdata' are not used here. If `defused' is non-NULL
 * it is always set to 0.
 *
 * Returns the number of bytes written to mbstr.
 */
int wc_to_mb(int codepage, int flags, wchar_t *wcstr, int wclen,
	     char *mbstr, int mblen, char *defchr, int *defused,
	     struct unicode_data *ucsdata)
{
    /* FIXME: we should remove the defused param completely... */
    if (defused)
	*defused = 0;

    if (codepage == DEFAULT_CODEPAGE) {
	char output[MB_LEN_MAX];
	mbstate_t state;
	int n = 0;

	memset(&state, 0, sizeof state);
	setlocale(LC_CTYPE, "");

	while (wclen > 0 && n < mblen) {
	    size_t i = wcrtomb(output, wcstr[0], &state);
	    /*
	     * Stop on an unencodable character, or when the encoded
	     * form is longer than the space left (mblen - n, which the
	     * loop condition guarantees is positive).
	     */
	    if (i == (size_t)-1 || i > (size_t)(mblen - n))
		break;
	    memcpy(mbstr+n, output, i);
	    n += (int)i;
	    wcstr++;
	    wclen--;
	}

	setlocale(LC_CTYPE, "C");

	return n;
    } else if (codepage == CS_NONE) {
	int n = 0;
	while (wclen > 0 && n < mblen) {
	    if (*wcstr >= 0xD800 && *wcstr < 0xD900)
		mbstr[n++] = (*wcstr & 0xFF);
	    else if (defchr)
		mbstr[n++] = *defchr;
	    wcstr++;
	    wclen--;
	}
	return n;
    } else {
	return charset_from_unicode(&wcstr, &wclen, mbstr, mblen, codepage,
				    NULL, defchr, defchr?1:0);
    }
}

/*
 * Translate a single byte of `charset' into one wide character via
 * charset_to_unicode(). Yields 0xFFFD unless exactly one output
 * character is produced.
 */
static wchar_t ucs_byte_to_wchar(int charset, int byte)
{
    char in[1], *inptr;
    wchar_t out[1];
    int inlen;

    in[0] = byte;
    inptr = in;
    inlen = 1;

    if (charset_to_unicode(&inptr, &inlen, out, 1, charset,
			   NULL, L"", 0) != 1)
	return 0xFFFD;
    return out[0];
}

/*
 * Fill in the translation tables in *ucsdata: pick the line codepage,
 * then build unitab_line, unitab_xterm, unitab_scoacs and unitab_ctrl
 * from it.
 *
 * Return value is TRUE if pterm is to run in direct-to-font mode,
 * i.e. if no line codepage could be determined.
 */
int init_ucs(struct unicode_data *ucsdata, char *linecharset,
	     int utf8_override, int font_charset, int vtmode)
{
    /* Unicode line-drawing glyphs for positions 0x5F..0x7E. */
    static const wchar_t xterm_linedraw_unicode[32] = {
	0x2666, 0x2592, 0x2409, 0x240c, 0x240d, 0x240a, 0x00b0, 0x00b1,
	0x2424, 0x240b, 0x2518, 0x2510, 0x250c, 0x2514, 0x253c, 0x23ba,
	0x23bb, 0x2500, 0x23bc, 0x23bd, 0x251c, 0x2524, 0x2534, 0x252c,
	0x2502, 0x2264, 0x2265, 0x03c0, 0x2260, 0x00a3, 0x00b7, 0x0020
    };
    /* Plain-ASCII stand-ins used in VT_POORMAN mode. */
    static const wchar_t xterm_linedraw_ascii[32] =
	L"*#****o~**+++++-----++++|****L. ";

    const wchar_t *linedraw;
    int ch, direct_to_font;

    /*
     * The platform-independent code consults font_codepage only
     * for system DBCS support, which does not exist here, so store
     * a value that will never be used.
     */
    ucsdata->font_codepage = -1;

    /*
     * With utf8_override set, a UTF-8 POSIX locale wins outright.
     * The first non-empty variable among LC_ALL, LC_CTYPE and LANG
     * is the one examined.
     */
    ucsdata->line_codepage = CS_NONE;
    if (utf8_override) {
	const char *locname = getenv("LC_ALL");
	if (!locname || !*locname)
	    locname = getenv("LC_CTYPE");
	if (!locname || !*locname)
	    locname = getenv("LANG");
	if (locname && *locname && strstr(locname, "UTF-8"))
	    ucsdata->line_codepage = CS_UTF8;
    }

    /* Otherwise decode the charset named in the configuration. */
    if (ucsdata->line_codepage == CS_NONE)
	ucsdata->line_codepage = decode_codepage(linecharset);

    /*
     * Still nothing: assume the font's own encoding, as passed in
     * by the caller. If even that is CS_NONE (the font's charset
     * was incomprehensible) we end up using the D800 page.
     */
    if (ucsdata->line_codepage == CS_NONE)
	ucsdata->line_codepage = font_charset;

    direct_to_font = (ucsdata->line_codepage == CS_NONE);

    /*
     * unitab_line: every byte of the line codepage translated to
     * Unicode, or mapped into the D800 page in direct-to-font mode.
     */
    for (ch = 0; ch < 256; ch++) {
	if (direct_to_font)
	    ucsdata->unitab_line[ch] = 0xD800 | ch;
	else
	    ucsdata->unitab_line[ch] =
		ucs_byte_to_wchar(ucsdata->line_codepage, ch);
    }

    /*
     * unitab_xterm: identical to unitab_line except in the
     * line-drawing region, where it follows the Unicode encoding.
     *
     * (The strange X encoding of line-drawing characters in the
     * bottom 32 glyphs of ISO8859-1 fonts is dealt with by the font
     * encoding, which spots such a font and treats it as a variant
     * encoding of ISO8859-1.)
     */
    if (vtmode == VT_POORMAN)
	linedraw = xterm_linedraw_ascii;
    else
	linedraw = xterm_linedraw_unicode;

    for (ch = 0; ch < 256; ch++) {
	if (ch < 0x5F || ch >= 0x7F)
	    ucsdata->unitab_xterm[ch] = ucsdata->unitab_line[ch];
	else
	    ucsdata->unitab_xterm[ch] = linedraw[ch & 0x1F];
    }

    /*
     * unitab_scoacs: the SCO Alternate Character Set is simply
     * CP437.
     */
    for (ch = 0; ch < 256; ch++)
	ucsdata->unitab_scoacs[ch] = ucs_byte_to_wchar(CS_CP437, ch);

    /*
     * unitab_ctrl: locate the control characters in the line
     * codepage. In direct-to-font mode (the D800 hack) 00-1F and
     * 7F are taken to be controls while 80-9F are let through. (As
     * good a guess as any; half the weird fonts used this way are
     * probably IBM or MS code pages anyway.)
     */
    for (ch = 0; ch < 256; ch++) {
	int lineval = ucsdata->unitab_line[ch];
	int is_c0 = (lineval < ' ');
	int is_del_or_c1 = (lineval >= 0x7F && lineval < 0xA0);
	int is_d800_ctrl = ((lineval >= 0xD800 && lineval < 0xD820) ||
			    lineval == 0xD87F);

	if (is_c0 || is_del_or_c1 || is_d800_ctrl)
	    ucsdata->unitab_ctrl[ch] = ch;
	else
	    ucsdata->unitab_ctrl[ch] = 0xFF;
    }

    return direct_to_font;
}

/*
 * Return the display name for `codepage': the fixed label
 * "Use font encoding" for CS_NONE, otherwise whatever
 * charset_to_localenc() reports.
 */
const char *cp_name(int codepage)
{
    if (codepage != CS_NONE)
	return charset_to_localenc(codepage);

    return "Use font encoding";
}

/*
 * Return the name of the index'th selectable encoding. Index 0 is
 * the fixed "Use font encoding" entry; index k > 0 corresponds to
 * charset_localenc_nth(k-1). Returns NULL once that function yields
 * CS_NONE.
 */
const char *cp_enumerate(int index)
{
    int nth;

    if (index != 0) {
	nth = charset_localenc_nth(index-1);
	if (nth != CS_NONE)
	    return charset_to_localenc(nth);
	return NULL;
    }

    return "Use font encoding";
}

/*
 * Map an encoding name to a charset identifier. The empty string
 * means "use font encoding" and yields CS_NONE; any other name is
 * looked up with charset_from_localenc().
 */
int decode_codepage(char *cp_name)
{
    if (cp_name[0] != '\0')
	return charset_from_localenc(cp_name);

    return CS_NONE;		       /* use font encoding */
}
Commit	Line	Data
1709795f	1	#include <stdio.h>
	2	#include <stdlib.h>
	3	#include <ctype.h>
2dc6356a	4	#include <locale.h>
	5	#include <limits.h>
	6	#include <wchar.h>
1709795f	7
1709795f	8	#include <time.h>
2dc6356a	9
1709795f	10	#include "putty.h"
d4413bd2	11	#include "charset.h"
887035a5	12	#include "terminal.h"
1709795f	13	#include "misc.h"
	14
	15	/*
	16	* Unix Unicode-handling routines.
1709795f	17	*/
1709795f	18
1709795f	19	int is_dbcs_leadbyte(int codepage, char byte)
	20	{
	21	return 0; /* we don't do DBCS */
	22	}
	23
	24	int mb_to_wc(int codepage, int flags, char *mbstr, int mblen,
	25	wchar_t *wcstr, int wclen)
	26	{
2dc6356a	27	if (codepage == DEFAULT_CODEPAGE) {
2dc6356a	28	int n = 0;
d4e1d591	29	mbstate_t state;
2dc6356a	30
d4e1d591	31	memset(&state, 0, sizeof state);
2dc6356a	32	setlocale(LC_CTYPE, "");
	33
	34	while (mblen > 0) {
	35	size_t i = mbrtowc(wcstr+n, mbstr, (size_t)mblen, &state);
	36	if (i == (size_t)-1 \|\| i == (size_t)-2)
	37	break;
	38	n++;
	39	mbstr += i;
	40	mblen -= i;
	41	}
	42
	43	setlocale(LC_CTYPE, "C");
	44
	45	return n;
facd762c	46	} else if (codepage == CS_NONE) {
	47	int n = 0;
	48
	49	while (mblen > 0) {
	50	wcstr[n] = 0xD800 \| (mbstr[0] & 0xFF);
	51	n++;
	52	mbstr++;
	53	mblen--;
	54	}
	55
	56	return n;
2dc6356a	57	} else
	58	return charset_to_unicode(&mbstr, &mblen, wcstr, wclen, codepage,
	59	NULL, NULL, 0);
e6346999	60	}
	61
	62	int wc_to_mb(int codepage, int flags, wchar_t *wcstr, int wclen,
21d2b241	63	char mbstr, int mblen, char defchr, int *defused,
21d2b241	64	struct unicode_data *ucsdata)
e6346999	65	{
2dc6356a	66	/* FIXME: we should remove the defused param completely... */
e6346999	67	if (defused)
e6346999	68	*defused = 0;
2dc6356a	69
	70	if (codepage == DEFAULT_CODEPAGE) {
	71	char output[MB_LEN_MAX];
d4e1d591	72	mbstate_t state;
2dc6356a	73	int n = 0;
2dc6356a	74
d4e1d591	75	memset(&state, 0, sizeof state);
2dc6356a	76	setlocale(LC_CTYPE, "");
	77
	78	while (wclen > 0) {
	79	int i = wcrtomb(output, wcstr[0], &state);
	80	if (i == (size_t)-1 \|\| i > n - mblen)
	81	break;
	82	memcpy(mbstr+n, output, i);
	83	n += i;
	84	wcstr++;
	85	wclen--;
	86	}
	87
	88	setlocale(LC_CTYPE, "C");
	89
	90	return n;
facd762c	91	} else if (codepage == CS_NONE) {
	92	int n = 0;
	93	while (wclen > 0 && n < mblen) {
	94	if (wcstr >= 0xD800 && wcstr < 0xD900)
	95	mbstr[n++] = (*wcstr & 0xFF);
	96	else if (defchr)
	97	mbstr[n++] = *defchr;
	98	wcstr++;
	99	wclen--;
	100	}
	101	return n;
	102	} else {
2dc6356a	103	return charset_from_unicode(&wcstr, &wclen, mbstr, mblen, codepage,
0f993689	104	NULL, defchr?defchr:NULL, defchr?1:0);
facd762c	105	}
1709795f	106	}
1709795f	107
085f4a68	108	/*
	109	* Return value is TRUE if pterm is to run in direct-to-font mode.
	110	*/
6ac7f054	111	int init_ucs(struct unicode_data ucsdata, char linecharset,
6ac7f054	112	int utf8_override, int font_charset, int vtmode)
1709795f	113	{
085f4a68	114	int i, ret = 0;
2dc6356a	115
	116	/*
	117	* In the platform-independent parts of the code, font_codepage
	118	* is used only for system DBCS support - which we don't
	119	* support at all. So we set this to something which will never
	120	* be used.
	121	*/
21d2b241	122	ucsdata->font_codepage = -1;
2dc6356a	123
2dc6356a	124	/*
6ac7f054	125	* If utf8_override is set and the POSIX locale settings
	126	* dictate a UTF-8 character set, then just go straight for
	127	* UTF-8.
2dc6356a	128	*/
6ac7f054	129	ucsdata->line_codepage = CS_NONE;
	130	if (utf8_override) {
	131	const char *s;
	132	if (((s = getenv("LC_ALL")) && *s) \|\|
	133	((s = getenv("LC_CTYPE")) && *s) \|\|
	134	((s = getenv("LANG")) && *s)) {
	135	if (strstr(s, "UTF-8"))
	136	ucsdata->line_codepage = CS_UTF8;
	137	}
	138	}
	139
	140	/*
	141	* Failing that, line_codepage should be decoded from the
4a693cfc	142	* specification in conf.
6ac7f054	143	*/
	144	if (ucsdata->line_codepage == CS_NONE)
	145	ucsdata->line_codepage = decode_codepage(linecharset);
2dc6356a	146
facd762c	147	/*
	148	* If line_codepage is _still_ CS_NONE, we assume we're using
	149	* the font's own encoding. This has been passed in to us, so
	150	* we use that. If it's still CS_NONE after _that_ - i.e. the
	151	* font we were given had an incomprehensible charset - then we
	152	* fall back to using the D800 page.
	153	*/
21d2b241	154	if (ucsdata->line_codepage == CS_NONE)
21d2b241	155	ucsdata->line_codepage = font_charset;
2dc6356a	156
21d2b241	157	if (ucsdata->line_codepage == CS_NONE)
085f4a68	158	ret = 1;
085f4a68	159
2dc6356a	160	/*
	161	* Set up unitab_line, by translating each individual character
	162	* in the line codepage into Unicode.
	163	*/
	164	for (i = 0; i < 256; i++) {
	165	char c[1], *p;
	166	wchar_t wc[1];
	167	int len;
	168	c[0] = i;
	169	p = c;
	170	len = 1;
21d2b241	171	if (ucsdata->line_codepage == CS_NONE)
	172	ucsdata->unitab_line[i] = 0xD800 \| i;
	173	else if (1 == charset_to_unicode(&p, &len, wc, 1,
	174	ucsdata->line_codepage,
facd762c	175	NULL, L"", 0))
21d2b241	176	ucsdata->unitab_line[i] = wc[0];
1709795f	177	else
21d2b241	178	ucsdata->unitab_line[i] = 0xFFFD;
2dc6356a	179	}
1709795f	180
2dc6356a	181	/*
	182	* Set up unitab_xterm. This is the same as unitab_line except
	183	* in the line-drawing regions, where it follows the Unicode
	184	* encoding.
	185	*
	186	* (Note that the strange X encoding of line-drawing characters
	187	* in the bottom 32 glyphs of ISO8859-1 fonts is taken care of
	188	* by the font encoding, which will spot such a font and act as
	189	* if it were in a variant encoding of ISO8859-1.)
	190	*/
1709795f	191	for (i = 0; i < 256; i++) {
2dc6356a	192	static const wchar_t unitab_xterm_std[32] = {
	193	0x2666, 0x2592, 0x2409, 0x240c, 0x240d, 0x240a, 0x00b0, 0x00b1,
	194	0x2424, 0x240b, 0x2518, 0x2510, 0x250c, 0x2514, 0x253c, 0x23ba,
	195	0x23bb, 0x2500, 0x23bc, 0x23bd, 0x251c, 0x2524, 0x2534, 0x252c,
	196	0x2502, 0x2264, 0x2265, 0x03c0, 0x2260, 0x00a3, 0x00b7, 0x0020
	197	};
3900c2d6	198	static const wchar_t unitab_xterm_poorman[32] =
	199	L"#*o~+++++-----++++\|****L. ";
	200
	201	const wchar_t *ptr;
	202
	203	if (vtmode == VT_POORMAN)
	204	ptr = unitab_xterm_poorman;
	205	else
	206	ptr = unitab_xterm_std;
	207
2dc6356a	208	if (i >= 0x5F && i < 0x7F)
3900c2d6	209	ucsdata->unitab_xterm[i] = ptr[i & 0x1F];
2dc6356a	210	else
21d2b241	211	ucsdata->unitab_xterm[i] = ucsdata->unitab_line[i];
1709795f	212	}
2dc6356a	213
	214	/*
	215	* Set up unitab_scoacs. The SCO Alternate Character Set is
	216	* simply CP437.
	217	*/
	218	for (i = 0; i < 256; i++) {
	219	char c[1], *p;
	220	wchar_t wc[1];
	221	int len;
	222	c[0] = i;
	223	p = c;
	224	len = 1;
facd762c	225	if (1 == charset_to_unicode(&p, &len, wc, 1, CS_CP437, NULL, L"", 0))
21d2b241	226	ucsdata->unitab_scoacs[i] = wc[0];
2dc6356a	227	else
21d2b241	228	ucsdata->unitab_scoacs[i] = 0xFFFD;
2dc6356a	229	}
2dc6356a	230
facd762c	231	/*
	232	* Find the control characters in the line codepage. For
	233	* direct-to-font mode using the D800 hack, we assume 00-1F and
	234	* 7F are controls, but allow 80-9F through. (It's as good a
	235	* guess as anything; and my bet is that half the weird fonts
	236	* used in this way will be IBM or MS code pages anyway.)
	237	*/
	238	for (i = 0; i < 256; i++) {
21d2b241	239	int lineval = ucsdata->unitab_line[i];
facd762c	240	if (lineval < ' ' \|\| (lineval >= 0x7F && lineval < 0xA0) \|\|
facd762c	241	(lineval >= 0xD800 && lineval < 0xD820) \|\| (lineval == 0xD87F))
21d2b241	242	ucsdata->unitab_ctrl[i] = i;
2dc6356a	243	else
21d2b241	244	ucsdata->unitab_ctrl[i] = 0xFF;
facd762c	245	}
085f4a68	246
085f4a68	247	return ret;
126ce234	248	}
d4413bd2	249
	250	const char *cp_name(int codepage)
	251	{
	252	if (codepage == CS_NONE)
	253	return "Use font encoding";
	254	return charset_to_localenc(codepage);
	255	}
	256
	257	const char *cp_enumerate(int index)
	258	{
	259	int charset;
	260	if (index == 0)
	261	return "Use font encoding";
	262	charset = charset_localenc_nth(index-1);
	263	if (charset == CS_NONE)
	264	return NULL;
	265	return charset_to_localenc(charset);
	266	}
	267
	268	int decode_codepage(char *cp_name)
	269	{
	270	if (!*cp_name)
	271	return CS_NONE; /* use font encoding */
	272	return charset_from_localenc(cp_name);
	273	}