[u/mdw/putty] / unix / uxucs.c

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <locale.h>
#include <limits.h>
#include <wchar.h>

#include <time.h>

#include "putty.h"
#include "charset.h"
#include "terminal.h"
#include "misc.h"

/*
 * Unix Unicode-handling routines.
 */

/*
 * Report whether `byte' is a DBCS lead byte in `codepage'.
 *
 * This port has no DBCS support, so both arguments are ignored and
 * the answer is always 0.
 */
int is_dbcs_leadbyte(int codepage, char byte)
{
    (void)codepage;
    (void)byte;
    return 0;
}

/*
 * Convert the `mblen' bytes at `mbstr' into wide characters stored
 * in `wcstr', which has room for `wclen' elements.
 *
 *  - DEFAULT_CODEPAGE: decode using the environment's LC_CTYPE
 *    locale via mbrtowc(). LC_CTYPE is switched to "" for the
 *    duration of the conversion and set to "C" afterwards.
 *    Conversion stops at the first invalid or incomplete sequence.
 *  - CS_NONE: map each input byte b to the code 0xD800 | b.
 *  - anything else: hand the whole job to charset_to_unicode().
 *
 * `flags' is not used. In the first two cases the return value is
 * the number of wide characters written, which never exceeds
 * `wclen'; otherwise it is whatever charset_to_unicode() returns.
 */
int mb_to_wc(int codepage, int flags, const char *mbstr, int mblen,
	     wchar_t *wcstr, int wclen)
{
    if (codepage == DEFAULT_CODEPAGE) {
	int n = 0;
	mbstate_t state;

	memset(&state, 0, sizeof state);
	setlocale(LC_CTYPE, "");

	/* Stop when the output is full as well as when input runs out. */
	while (mblen > 0 && n < wclen) {
	    size_t i = mbrtowc(wcstr+n, mbstr, (size_t)mblen, &state);
	    if (i == (size_t)-1 || i == (size_t)-2)
		break;
	    /*
	     * mbrtowc() returns 0, not a byte count, when it decodes
	     * the null wide character. Treat that as one consumed
	     * byte, otherwise an embedded NUL would never advance the
	     * input and the loop would not terminate.
	     */
	    if (i == 0)
		i = 1;
	    n++;
	    mbstr += i;
	    mblen -= (int)i;
	}

	setlocale(LC_CTYPE, "C");

	return n;
    } else if (codepage == CS_NONE) {
	int n = 0;

	while (mblen > 0 && n < wclen) {
	    wcstr[n] = 0xD800 | (mbstr[0] & 0xFF);
	    n++;
	    mbstr++;
	    mblen--;
	}

	return n;
    } else
	return charset_to_unicode(&mbstr, &mblen, wcstr, wclen, codepage,
				  NULL, NULL, 0);
}

/*
 * Convert the `wclen' wide characters at `wcstr' into bytes stored
 * in `mbstr', which has room for `mblen' bytes.
 *
 *  - DEFAULT_CODEPAGE: encode using the environment's LC_CTYPE
 *    locale via wcrtomb(). LC_CTYPE is switched to "" for the
 *    duration of the conversion and set to "C" afterwards.
 *    Conversion stops at the first unencodable character, or at
 *    the first character whose encoding would not fit in the
 *    remaining output space.
 *  - CS_NONE: characters in the range 0xD800..0xD8FF are emitted as
 *    their low byte; anything else is replaced by *defchr if
 *    `defchr' is non-NULL, and dropped otherwise.
 *  - anything else: hand the whole job to charset_from_unicode().
 *
 * `flags' and `ucsdata' are not used. If `defused' is non-NULL,
 * *defused is set to 0 and never changed afterwards. In the first
 * two cases the return value is the number of bytes written;
 * otherwise it is whatever charset_from_unicode() returns.
 */
int wc_to_mb(int codepage, int flags, const wchar_t *wcstr, int wclen,
	     char *mbstr, int mblen, char *defchr, int *defused,
	     struct unicode_data *ucsdata)
{
    /* FIXME: we should remove the defused param completely... */
    if (defused)
	*defused = 0;

    if (codepage == DEFAULT_CODEPAGE) {
	char output[MB_LEN_MAX];
	mbstate_t state;
	int n = 0;

	memset(&state, 0, sizeof state);
	setlocale(LC_CTYPE, "");

	while (wclen > 0) {
	    size_t i = wcrtomb(output, wcstr[0], &state);
	    /*
	     * Stop on an encoding error, or when the encoded
	     * character will not fit in the mblen - n bytes still
	     * free. A successful result is at most MB_LEN_MAX, so the
	     * cast to int is safe.
	     */
	    if (i == (size_t)-1 || (int)i > mblen - n)
		break;
	    memcpy(mbstr+n, output, i);
	    n += (int)i;
	    wcstr++;
	    wclen--;
	}

	setlocale(LC_CTYPE, "C");

	return n;
    } else if (codepage == CS_NONE) {
	int n = 0;
	while (wclen > 0 && n < mblen) {
	    if (*wcstr >= 0xD800 && *wcstr < 0xD900)
		mbstr[n++] = (*wcstr & 0xFF);
	    else if (defchr)
		mbstr[n++] = *defchr;
	    wcstr++;
	    wclen--;
	}
	return n;
    } else {
	return charset_from_unicode(&wcstr, &wclen, mbstr, mblen, codepage,
				    NULL, defchr, defchr?1:0);
    }
}

/*
 * Decode the single byte `byte' in charset `charset' by calling
 * charset_to_unicode() on a one-byte input. Yields 0xFFFD unless
 * that call reports exactly one output character.
 */
static wchar_t ucs_decode_single_byte(int charset, int byte)
{
    char inbuf[1];
    const char *inptr = inbuf;
    int inlen = 1;
    wchar_t outbuf[1];

    inbuf[0] = byte;
    if (charset_to_unicode(&inptr, &inlen, outbuf, 1, charset,
			   NULL, L"", 0) != 1)
	return 0xFFFD;
    return outbuf[0];
}

/*
 * Choose the line codepage and fill in the 256-entry translation
 * tables in `ucsdata' (unitab_line, unitab_xterm, unitab_scoacs and
 * unitab_ctrl).
 *
 * Return value is TRUE if pterm is to run in direct-to-font mode,
 * i.e. if no source below produced a usable line codepage.
 */
int init_ucs(struct unicode_data *ucsdata, char *linecharset,
	     int utf8_override, int font_charset, int vtmode)
{
    /* Unicode equivalents of the 32 line-drawing positions. */
    static const wchar_t unitab_xterm_std[32] = {
	0x2666, 0x2592, 0x2409, 0x240c, 0x240d, 0x240a, 0x00b0, 0x00b1,
	0x2424, 0x240b, 0x2518, 0x2510, 0x250c, 0x2514, 0x253c, 0x23ba,
	0x23bb, 0x2500, 0x23bc, 0x23bd, 0x251c, 0x2524, 0x2534, 0x252c,
	0x2502, 0x2264, 0x2265, 0x03c0, 0x2260, 0x00a3, 0x00b7, 0x0020
    };
    /* ASCII stand-ins for the same positions, used for VT_POORMAN. */
    static const wchar_t unitab_xterm_poorman[32] =
	L"*#****o~**+++++-----++++|****L. ";

    const wchar_t *linedraw;
    int direct_to_font;
    int ch;

    /*
     * The platform-independent code uses font_codepage only for
     * system DBCS support, which does not exist here, so store a
     * value that will never be used.
     */
    ucsdata->font_codepage = -1;

    /*
     * First choice: if utf8_override is set, look at the first
     * non-empty variable among LC_ALL, LC_CTYPE and LANG, and go
     * straight to UTF-8 if its value mentions "UTF-8".
     */
    ucsdata->line_codepage = CS_NONE;
    if (utf8_override) {
	const char *locale = getenv("LC_ALL");
	if (!locale || !*locale)
	    locale = getenv("LC_CTYPE");
	if (!locale || !*locale)
	    locale = getenv("LANG");
	if (locale && *locale && strstr(locale, "UTF-8"))
	    ucsdata->line_codepage = CS_UTF8;
    }

    /* Second choice: the charset named in the configuration. */
    if (ucsdata->line_codepage == CS_NONE)
	ucsdata->line_codepage = decode_codepage(linecharset);

    /* Third choice: the font's own encoding, as passed in. */
    if (ucsdata->line_codepage == CS_NONE)
	ucsdata->line_codepage = font_charset;

    /*
     * If even the font charset was CS_NONE, fall back to the D800
     * page and tell the caller so.
     */
    direct_to_font = (ucsdata->line_codepage == CS_NONE);

    /*
     * unitab_line: each byte value translated through the line
     * codepage, or placed in the D800 page in direct-to-font mode.
     */
    for (ch = 0; ch < 256; ch++) {
	if (direct_to_font)
	    ucsdata->unitab_line[ch] = 0xD800 | ch;
	else
	    ucsdata->unitab_line[ch] =
		ucs_decode_single_byte(ucsdata->line_codepage, ch);
    }

    /*
     * unitab_xterm: identical to unitab_line except for 0x5F-0x7E,
     * which take their values from the line-drawing table selected
     * by vtmode.
     *
     * (The strange X encoding of line-drawing characters in the
     * bottom 32 glyphs of ISO8859-1 fonts is dealt with by the font
     * encoding, which treats such a font as a variant of ISO8859-1.)
     */
    linedraw = (vtmode == VT_POORMAN) ? unitab_xterm_poorman
				      : unitab_xterm_std;
    for (ch = 0; ch < 256; ch++) {
	if (ch < 0x5F || ch >= 0x7F)
	    ucsdata->unitab_xterm[ch] = ucsdata->unitab_line[ch];
	else
	    ucsdata->unitab_xterm[ch] = linedraw[ch & 0x1F];
    }

    /*
     * unitab_scoacs: the SCO Alternate Character Set is simply
     * CP437.
     */
    for (ch = 0; ch < 256; ch++)
	ucsdata->unitab_scoacs[ch] = ucs_decode_single_byte(CS_CP437, ch);

    /*
     * unitab_ctrl: mark the bytes that are control characters in
     * the line codepage. In direct-to-font mode (the D800 hack)
     * 00-1F and 7F count as controls, but 80-9F are let through.
     * (It's as good a guess as anything; half the weird fonts used
     * this way will probably be IBM or MS code pages anyway.)
     */
    for (ch = 0; ch < 256; ch++) {
	int lineval = ucsdata->unitab_line[ch];
	int is_ctrl =
	    lineval < ' ' ||
	    (lineval >= 0x7F && lineval < 0xA0) ||
	    (lineval >= 0xD800 && lineval < 0xD820) ||
	    lineval == 0xD87F;

	ucsdata->unitab_ctrl[ch] = is_ctrl ? ch : 0xFF;
    }

    return direct_to_font;
}

/*
 * Give the display name for `codepage': the fixed string
 * "Use font encoding" for CS_NONE, otherwise whatever
 * charset_to_localenc() returns for it.
 */
const char *cp_name(int codepage)
{
    return (codepage == CS_NONE) ? "Use font encoding"
				 : charset_to_localenc(codepage);
}

/*
 * Return the name of the `index'th selectable codepage. Index 0 is
 * always "Use font encoding"; index k > 0 names the charset given by
 * charset_localenc_nth(k-1). Returns NULL once that function yields
 * CS_NONE.
 */
const char *cp_enumerate(int index)
{
    int nth;

    if (index != 0) {
	nth = charset_localenc_nth(index-1);
	return (nth == CS_NONE) ? NULL : charset_to_localenc(nth);
    }
    return "Use font encoding";
}

/*
 * Turn a codepage name into a charset identifier. The empty string
 * means "use font encoding" and gives CS_NONE; any other name is
 * looked up with charset_from_localenc().
 */
int decode_codepage(char *cp_name)
{
    return *cp_name ? charset_from_localenc(cp_name) : CS_NONE;
}
Commit	Line	Data
1709795f	1	#include <stdio.h>
	2	#include <stdlib.h>
	3	#include <ctype.h>
2dc6356a	4	#include <locale.h>
	5	#include <limits.h>
	6	#include <wchar.h>
1709795f	7
1709795f	8	#include <time.h>
2dc6356a	9
1709795f	10	#include "putty.h"
d4413bd2	11	#include "charset.h"
887035a5	12	#include "terminal.h"
1709795f	13	#include "misc.h"
	14
	15	/*
	16	* Unix Unicode-handling routines.
1709795f	17	*/
1709795f	18
1709795f	19	int is_dbcs_leadbyte(int codepage, char byte)
	20	{
	21	return 0; /* we don't do DBCS */
	22	}
	23
57191fa4	24	int mb_to_wc(int codepage, int flags, const char *mbstr, int mblen,
1709795f	25	wchar_t *wcstr, int wclen)
1709795f	26	{
2dc6356a	27	if (codepage == DEFAULT_CODEPAGE) {
2dc6356a	28	int n = 0;
d4e1d591	29	mbstate_t state;
2dc6356a	30
d4e1d591	31	memset(&state, 0, sizeof state);
2dc6356a	32	setlocale(LC_CTYPE, "");
	33
	34	while (mblen > 0) {
	35	size_t i = mbrtowc(wcstr+n, mbstr, (size_t)mblen, &state);
	36	if (i == (size_t)-1 \|\| i == (size_t)-2)
	37	break;
	38	n++;
	39	mbstr += i;
	40	mblen -= i;
	41	}
	42
	43	setlocale(LC_CTYPE, "C");
	44
	45	return n;
facd762c	46	} else if (codepage == CS_NONE) {
	47	int n = 0;
	48
	49	while (mblen > 0) {
	50	wcstr[n] = 0xD800 \| (mbstr[0] & 0xFF);
	51	n++;
	52	mbstr++;
	53	mblen--;
	54	}
	55
	56	return n;
2dc6356a	57	} else
	58	return charset_to_unicode(&mbstr, &mblen, wcstr, wclen, codepage,
	59	NULL, NULL, 0);
e6346999	60	}
e6346999	61
57191fa4	62	int wc_to_mb(int codepage, int flags, const wchar_t *wcstr, int wclen,
21d2b241	63	char mbstr, int mblen, char defchr, int *defused,
21d2b241	64	struct unicode_data *ucsdata)
e6346999	65	{
2dc6356a	66	/* FIXME: we should remove the defused param completely... */
e6346999	67	if (defused)
e6346999	68	*defused = 0;
2dc6356a	69
	70	if (codepage == DEFAULT_CODEPAGE) {
	71	char output[MB_LEN_MAX];
d4e1d591	72	mbstate_t state;
2dc6356a	73	int n = 0;
2dc6356a	74
d4e1d591	75	memset(&state, 0, sizeof state);
2dc6356a	76	setlocale(LC_CTYPE, "");
	77
	78	while (wclen > 0) {
	79	int i = wcrtomb(output, wcstr[0], &state);
	80	if (i == (size_t)-1 \|\| i > n - mblen)
	81	break;
	82	memcpy(mbstr+n, output, i);
	83	n += i;
	84	wcstr++;
	85	wclen--;
	86	}
	87
	88	setlocale(LC_CTYPE, "C");
	89
	90	return n;
facd762c	91	} else if (codepage == CS_NONE) {
	92	int n = 0;
	93	while (wclen > 0 && n < mblen) {
	94	if (wcstr >= 0xD800 && wcstr < 0xD900)
	95	mbstr[n++] = (*wcstr & 0xFF);
	96	else if (defchr)
	97	mbstr[n++] = *defchr;
	98	wcstr++;
	99	wclen--;
	100	}
	101	return n;
	102	} else {
2dc6356a	103	return charset_from_unicode(&wcstr, &wclen, mbstr, mblen, codepage,
0f993689	104	NULL, defchr?defchr:NULL, defchr?1:0);
facd762c	105	}
1709795f	106	}
1709795f	107
085f4a68	108	/*
	109	* Return value is TRUE if pterm is to run in direct-to-font mode.
	110	*/
6ac7f054	111	int init_ucs(struct unicode_data ucsdata, char linecharset,
6ac7f054	112	int utf8_override, int font_charset, int vtmode)
1709795f	113	{
085f4a68	114	int i, ret = 0;
2dc6356a	115
	116	/*
	117	* In the platform-independent parts of the code, font_codepage
	118	* is used only for system DBCS support - which we don't
	119	* support at all. So we set this to something which will never
	120	* be used.
	121	*/
21d2b241	122	ucsdata->font_codepage = -1;
2dc6356a	123
2dc6356a	124	/*
6ac7f054	125	* If utf8_override is set and the POSIX locale settings
	126	* dictate a UTF-8 character set, then just go straight for
	127	* UTF-8.
2dc6356a	128	*/
6ac7f054	129	ucsdata->line_codepage = CS_NONE;
	130	if (utf8_override) {
	131	const char *s;
	132	if (((s = getenv("LC_ALL")) && *s) \|\|
	133	((s = getenv("LC_CTYPE")) && *s) \|\|
	134	((s = getenv("LANG")) && *s)) {
	135	if (strstr(s, "UTF-8"))
	136	ucsdata->line_codepage = CS_UTF8;
	137	}
	138	}
	139
	140	/*
	141	* Failing that, line_codepage should be decoded from the
4a693cfc	142	* specification in conf.
6ac7f054	143	*/
	144	if (ucsdata->line_codepage == CS_NONE)
	145	ucsdata->line_codepage = decode_codepage(linecharset);
2dc6356a	146
facd762c	147	/*
	148	* If line_codepage is _still_ CS_NONE, we assume we're using
	149	* the font's own encoding. This has been passed in to us, so
	150	* we use that. If it's still CS_NONE after _that_ - i.e. the
	151	* font we were given had an incomprehensible charset - then we
	152	* fall back to using the D800 page.
	153	*/
21d2b241	154	if (ucsdata->line_codepage == CS_NONE)
21d2b241	155	ucsdata->line_codepage = font_charset;
2dc6356a	156
21d2b241	157	if (ucsdata->line_codepage == CS_NONE)
085f4a68	158	ret = 1;
085f4a68	159
2dc6356a	160	/*
	161	* Set up unitab_line, by translating each individual character
	162	* in the line codepage into Unicode.
	163	*/
	164	for (i = 0; i < 256; i++) {
57191fa4	165	char c[1];
57191fa4	166	const char *p;
2dc6356a	167	wchar_t wc[1];
	168	int len;
	169	c[0] = i;
	170	p = c;
	171	len = 1;
21d2b241	172	if (ucsdata->line_codepage == CS_NONE)
	173	ucsdata->unitab_line[i] = 0xD800 \| i;
	174	else if (1 == charset_to_unicode(&p, &len, wc, 1,
	175	ucsdata->line_codepage,
facd762c	176	NULL, L"", 0))
21d2b241	177	ucsdata->unitab_line[i] = wc[0];
1709795f	178	else
21d2b241	179	ucsdata->unitab_line[i] = 0xFFFD;
2dc6356a	180	}
1709795f	181
2dc6356a	182	/*
	183	* Set up unitab_xterm. This is the same as unitab_line except
	184	* in the line-drawing regions, where it follows the Unicode
	185	* encoding.
	186	*
	187	* (Note that the strange X encoding of line-drawing characters
	188	* in the bottom 32 glyphs of ISO8859-1 fonts is taken care of
	189	* by the font encoding, which will spot such a font and act as
	190	* if it were in a variant encoding of ISO8859-1.)
	191	*/
1709795f	192	for (i = 0; i < 256; i++) {
2dc6356a	193	static const wchar_t unitab_xterm_std[32] = {
	194	0x2666, 0x2592, 0x2409, 0x240c, 0x240d, 0x240a, 0x00b0, 0x00b1,
	195	0x2424, 0x240b, 0x2518, 0x2510, 0x250c, 0x2514, 0x253c, 0x23ba,
	196	0x23bb, 0x2500, 0x23bc, 0x23bd, 0x251c, 0x2524, 0x2534, 0x252c,
	197	0x2502, 0x2264, 0x2265, 0x03c0, 0x2260, 0x00a3, 0x00b7, 0x0020
	198	};
3900c2d6	199	static const wchar_t unitab_xterm_poorman[32] =
	200	L"#*o~+++++-----++++\|****L. ";
	201
	202	const wchar_t *ptr;
	203
	204	if (vtmode == VT_POORMAN)
	205	ptr = unitab_xterm_poorman;
	206	else
	207	ptr = unitab_xterm_std;
	208
2dc6356a	209	if (i >= 0x5F && i < 0x7F)
3900c2d6	210	ucsdata->unitab_xterm[i] = ptr[i & 0x1F];
2dc6356a	211	else
21d2b241	212	ucsdata->unitab_xterm[i] = ucsdata->unitab_line[i];
1709795f	213	}
2dc6356a	214
	215	/*
	216	* Set up unitab_scoacs. The SCO Alternate Character Set is
	217	* simply CP437.
	218	*/
	219	for (i = 0; i < 256; i++) {
57191fa4	220	char c[1];
57191fa4	221	const char *p;
2dc6356a	222	wchar_t wc[1];
	223	int len;
	224	c[0] = i;
	225	p = c;
	226	len = 1;
facd762c	227	if (1 == charset_to_unicode(&p, &len, wc, 1, CS_CP437, NULL, L"", 0))
21d2b241	228	ucsdata->unitab_scoacs[i] = wc[0];
2dc6356a	229	else
21d2b241	230	ucsdata->unitab_scoacs[i] = 0xFFFD;
2dc6356a	231	}
2dc6356a	232
facd762c	233	/*
	234	* Find the control characters in the line codepage. For
	235	* direct-to-font mode using the D800 hack, we assume 00-1F and
	236	* 7F are controls, but allow 80-9F through. (It's as good a
	237	* guess as anything; and my bet is that half the weird fonts
	238	* used in this way will be IBM or MS code pages anyway.)
	239	*/
	240	for (i = 0; i < 256; i++) {
21d2b241	241	int lineval = ucsdata->unitab_line[i];
facd762c	242	if (lineval < ' ' \|\| (lineval >= 0x7F && lineval < 0xA0) \|\|
facd762c	243	(lineval >= 0xD800 && lineval < 0xD820) \|\| (lineval == 0xD87F))
21d2b241	244	ucsdata->unitab_ctrl[i] = i;
2dc6356a	245	else
21d2b241	246	ucsdata->unitab_ctrl[i] = 0xFF;
facd762c	247	}
085f4a68	248
085f4a68	249	return ret;
126ce234	250	}
d4413bd2	251
	252	const char *cp_name(int codepage)
	253	{
	254	if (codepage == CS_NONE)
	255	return "Use font encoding";
	256	return charset_to_localenc(codepage);
	257	}
	258
	259	const char *cp_enumerate(int index)
	260	{
	261	int charset;
	262	if (index == 0)
	263	return "Use font encoding";
	264	charset = charset_localenc_nth(index-1);
	265	if (charset == CS_NONE)
	266	return NULL;
	267	return charset_to_localenc(charset);
	268	}
	269
	270	int decode_codepage(char *cp_name)
	271	{
	272	if (!*cp_name)
	273	return CS_NONE; /* use font encoding */
	274	return charset_from_localenc(cp_name);
	275	}