[u/mdw/putty] / unix / uxucs.c

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <locale.h>
#include <limits.h>
#include <wchar.h>

#include <time.h>

#include "putty.h"
#include "charset.h"
#include "terminal.h"
#include "misc.h"

/*
 * Unix Unicode-handling routines.
 */

/*
 * Report whether `byte' is a DBCS lead byte in `codepage'.
 *
 * The Unix port has no DBCS support, so both arguments are ignored
 * and the answer is always 0.
 */
int is_dbcs_leadbyte(int codepage, char byte)
{
    (void)codepage;
    (void)byte;

    return 0;
}

/*
 * Convert the multibyte string mbstr (mblen bytes) into wide
 * characters stored in wcstr (room for wclen elements).
 *
 *  - DEFAULT_CODEPAGE: decode using the current C locale via
 *    mbrtowc(), starting from the initial shift state.
 *  - CS_NONE: map each input byte B to the code point 0xD800 | B.
 *  - anything else: hand the whole job to charset_to_unicode().
 *
 * `flags' is unused here. The return value is the number of wide
 * characters written (or whatever charset_to_unicode() returns in
 * the third case). Conversion stops early at the first invalid or
 * incomplete multibyte sequence, or when wcstr is full.
 */
int mb_to_wc(int codepage, int flags, const char *mbstr, int mblen,
	     wchar_t *wcstr, int wclen)
{
    if (codepage == DEFAULT_CODEPAGE) {
	int n = 0;
	mbstate_t state;

	memset(&state, 0, sizeof state);

	/* Never write more than wclen elements into wcstr. */
	while (mblen > 0 && n < wclen) {
	    size_t i = mbrtowc(wcstr+n, mbstr, (size_t)mblen, &state);
	    if (i == (size_t)-1 || i == (size_t)-2)
		break;
	    /*
	     * mbrtowc() returns 0, not the byte count, when it has
	     * just decoded a null wide character. Without special
	     * handling we would never advance and would loop
	     * forever. Assume the NUL occupied a single byte, which
	     * holds for stateless encodings.
	     */
	    if (i == 0)
		i = 1;
	    n++;
	    mbstr += i;
	    mblen -= (int)i;
	}

	return n;
    } else if (codepage == CS_NONE) {
	int n = 0;

	/* One output element per input byte, bounded by wclen. */
	while (mblen > 0 && n < wclen) {
	    wcstr[n] = 0xD800 | (mbstr[0] & 0xFF);
	    n++;
	    mbstr++;
	    mblen--;
	}

	return n;
    } else
	return charset_to_unicode(&mbstr, &mblen, wcstr, wclen, codepage,
				  NULL, NULL, 0);
}

/*
 * Convert the wide string wcstr (wclen elements) into multibyte
 * form stored in mbstr (room for mblen bytes).
 *
 *  - DEFAULT_CODEPAGE: encode using the current C locale via
 *    wcrtomb(), starting from the initial shift state. Stops at the
 *    first unencodable character or when the next character would
 *    not fit in the remaining space. defchr is not consulted.
 *  - CS_NONE: code points in 0xD800..0xD8FF become their low byte;
 *    anything else becomes *defchr if defchr is non-NULL and is
 *    dropped otherwise.
 *  - anything else: hand the whole job to charset_from_unicode().
 *
 * `flags' and `ucsdata' are unused here. If `defused' is non-NULL it
 * is always set to 0. The return value is the number of bytes
 * written (or whatever charset_from_unicode() returns in the third
 * case).
 */
int wc_to_mb(int codepage, int flags, const wchar_t *wcstr, int wclen,
	     char *mbstr, int mblen, char *defchr, int *defused,
	     struct unicode_data *ucsdata)
{
    /* FIXME: we should remove the defused param completely... */
    if (defused)
	*defused = 0;

    if (codepage == DEFAULT_CODEPAGE) {
	char output[MB_LEN_MAX];
	mbstate_t state;
	int n = 0;

	memset(&state, 0, sizeof state);

	while (wclen > 0) {
	    size_t i = wcrtomb(output, wcstr[0], &state);
	    /*
	     * Stop on an encoding error, or if this character would
	     * overrun the space left in mbstr. The space left is
	     * mblen - n. A successful result is at most MB_LEN_MAX,
	     * so the cast to int is safe, and comparing as ints also
	     * copes with a non-positive mblen.
	     */
	    if (i == (size_t)-1 || (int)i > mblen - n)
		break;
	    memcpy(mbstr+n, output, i);
	    n += (int)i;
	    wcstr++;
	    wclen--;
	}

	return n;
    } else if (codepage == CS_NONE) {
	int n = 0;
	while (wclen > 0 && n < mblen) {
	    if (*wcstr >= 0xD800 && *wcstr < 0xD900)
		mbstr[n++] = (*wcstr & 0xFF);
	    else if (defchr)
		mbstr[n++] = *defchr;
	    wcstr++;
	    wclen--;
	}
	return n;
    } else {
	/* A one-byte substitution string is supplied iff defchr is set. */
	return charset_from_unicode(&wcstr, &wclen, mbstr, mblen, codepage,
				    NULL, defchr, defchr?1:0);
    }
}

/*
 * Fill in *ucsdata: choose the line codepage and build the
 * unitab_line, unitab_xterm, unitab_scoacs and unitab_ctrl tables.
 *
 * The line codepage is chosen in this order:
 *   1. UTF-8, if utf8_override is set and the first non-empty one of
 *      $LC_ALL, $LC_CTYPE, $LANG contains the substring "UTF-8";
 *   2. whatever decode_codepage(linecharset) yields;
 *   3. font_charset.
 *
 * Return value is 1 if the line codepage is still CS_NONE after all
 * of that, i.e. pterm is to run in direct-to-font mode; 0 otherwise.
 */
int init_ucs(struct unicode_data *ucsdata, char *linecharset,
	     int utf8_override, int font_charset, int vtmode)
{
    /* Unicode equivalents of the 32 VT100 line-drawing glyphs. */
    static const wchar_t unitab_xterm_std[32] = {
	0x2666, 0x2592, 0x2409, 0x240c, 0x240d, 0x240a, 0x00b0, 0x00b1,
	0x2424, 0x240b, 0x2518, 0x2510, 0x250c, 0x2514, 0x253c, 0x23ba,
	0x23bb, 0x2500, 0x23bc, 0x23bd, 0x251c, 0x2524, 0x2534, 0x252c,
	0x2502, 0x2264, 0x2265, 0x03c0, 0x2260, 0x00a3, 0x00b7, 0x0020
    };
    /* ASCII approximations of the same glyphs, for VT_POORMAN. */
    static const wchar_t unitab_xterm_poorman[32] =
	L"*#****o~**+++++-----++++|****L. ";

    const wchar_t *linedraw;
    int direct_to_font;
    int i;

    /*
     * The platform-independent code only looks at font_codepage for
     * system DBCS support, which this port does not provide, so park
     * it on a value that will never be used.
     */
    ucsdata->font_codepage = -1;

    /*
     * Step 1: honour a UTF-8 POSIX locale if the caller asked us to.
     * Only the first non-empty variable is examined.
     */
    ucsdata->line_codepage = CS_NONE;
    if (utf8_override) {
	const char *locale = getenv("LC_ALL");
	if (!locale || !*locale)
	    locale = getenv("LC_CTYPE");
	if (!locale || !*locale)
	    locale = getenv("LANG");
	if (locale && *locale && strstr(locale, "UTF-8"))
	    ucsdata->line_codepage = CS_UTF8;
    }

    /* Step 2: otherwise decode the configured character set name. */
    if (ucsdata->line_codepage == CS_NONE)
	ucsdata->line_codepage = decode_codepage(linecharset);

    /*
     * Step 3: still nothing, so assume the font's own encoding as
     * passed in by the caller. If even that is CS_NONE, the font had
     * an incomprehensible charset and we fall back to the D800 page.
     */
    if (ucsdata->line_codepage == CS_NONE)
	ucsdata->line_codepage = font_charset;

    direct_to_font = (ucsdata->line_codepage == CS_NONE) ? 1 : 0;

    /*
     * unitab_line: each byte of the line codepage translated
     * individually into Unicode.
     */
    for (i = 0; i < 256; i++) {
	if (ucsdata->line_codepage == CS_NONE) {
	    ucsdata->unitab_line[i] = 0xD800 | i;
	} else {
	    char inbyte[1];
	    wchar_t outch[1];
	    const char *inptr = inbyte;
	    int inlen = 1;

	    inbyte[0] = i;
	    if (charset_to_unicode(&inptr, &inlen, outch, 1,
				   ucsdata->line_codepage,
				   NULL, L"", 0) == 1)
		ucsdata->unitab_line[i] = outch[0];
	    else
		ucsdata->unitab_line[i] = 0xFFFD;
	}
    }

    /*
     * unitab_xterm: identical to unitab_line except in the
     * line-drawing region 0x5F..0x7E, where it follows the Unicode
     * encoding (or the ASCII stand-ins in VT_POORMAN mode).
     *
     * (The strange X encoding of line-drawing characters in the
     * bottom 32 glyphs of ISO8859-1 fonts is dealt with by the font
     * encoding, which spots such a font and treats it as a variant
     * encoding of ISO8859-1.)
     */
    if (vtmode == VT_POORMAN)
	linedraw = unitab_xterm_poorman;
    else
	linedraw = unitab_xterm_std;

    for (i = 0; i < 256; i++)
	ucsdata->unitab_xterm[i] = ucsdata->unitab_line[i];
    for (i = 0x5F; i < 0x7F; i++)
	ucsdata->unitab_xterm[i] = linedraw[i & 0x1F];

    /*
     * unitab_scoacs: the SCO Alternate Character Set is simply
     * CP437.
     */
    for (i = 0; i < 256; i++) {
	char inbyte[1];
	wchar_t outch[1];
	const char *inptr = inbyte;
	int inlen = 1;

	inbyte[0] = i;
	if (charset_to_unicode(&inptr, &inlen, outch, 1, CS_CP437,
			       NULL, L"", 0) == 1)
	    ucsdata->unitab_scoacs[i] = outch[0];
	else
	    ucsdata->unitab_scoacs[i] = 0xFFFD;
    }

    /*
     * unitab_ctrl: mark the control characters of the line codepage.
     * In direct-to-font mode (the D800 hack) 00-1F and 7F count as
     * controls but 80-9F are let through, on the guess that many of
     * the odd fonts used this way are IBM or MS code pages.
     */
    for (i = 0; i < 256; i++) {
	int lineval = ucsdata->unitab_line[i];
	int below_space = (lineval < ' ');
	int del_or_c1 = (lineval >= 0x7F && lineval < 0xA0);
	int d800_ctrl = (lineval >= 0xD800 && lineval < 0xD820) ||
	    (lineval == 0xD87F);

	if (below_space || del_or_c1 || d800_ctrl)
	    ucsdata->unitab_ctrl[i] = i;
	else
	    ucsdata->unitab_ctrl[i] = 0xFF;
    }

    return direct_to_font;
}

/*
 * Return the display name for `codepage': the fixed string
 * "Use font encoding" for CS_NONE, otherwise whatever
 * charset_to_localenc() gives for it.
 */
const char *cp_name(int codepage)
{
    return (codepage == CS_NONE) ? "Use font encoding"
				 : charset_to_localenc(codepage);
}

/*
 * Return the name of the index'th entry in the list of selectable
 * character sets, or NULL once the list is exhausted.
 *
 * The named charsets reported by charset_localenc_nth() come first.
 * The slot immediately after the last of them is the pseudo-entry
 * "Use font encoding".
 */
const char *cp_enumerate(int index)
{
    int cs = charset_localenc_nth(index);

    if (cs != CS_NONE)
	return charset_to_localenc(cs);

    /* Past the named charsets: is this the very first slot after them? */
    if (charset_localenc_nth(index-1) != CS_NONE)
	return "Use font encoding";

    return NULL;
}

/*
 * Translate a character set name into a charset identifier.
 *
 * A NULL or empty name yields CS_UTF8. Any other name is looked up
 * with charset_from_localenc() and its result returned unchanged.
 */
int decode_codepage(char *cp_name)
{
    int unspecified = (cp_name == NULL || cp_name[0] == '\0');

    return unspecified ? CS_UTF8 : charset_from_localenc(cp_name);
}
Commit	Line	Data
1709795f	1	#include <stdio.h>
	2	#include <stdlib.h>
	3	#include <ctype.h>
2dc6356a	4	#include <locale.h>
	5	#include <limits.h>
	6	#include <wchar.h>
1709795f	7
1709795f	8	#include <time.h>
2dc6356a	9
1709795f	10	#include "putty.h"
d4413bd2	11	#include "charset.h"
887035a5	12	#include "terminal.h"
1709795f	13	#include "misc.h"
	14
	15	/*
	16	* Unix Unicode-handling routines.
1709795f	17	*/
1709795f	18
1709795f	19	int is_dbcs_leadbyte(int codepage, char byte)
	20	{
	21	return 0; /* we don't do DBCS */
	22	}
	23
57191fa4	24	int mb_to_wc(int codepage, int flags, const char *mbstr, int mblen,
1709795f	25	wchar_t *wcstr, int wclen)
1709795f	26	{
2dc6356a	27	if (codepage == DEFAULT_CODEPAGE) {
2dc6356a	28	int n = 0;
d4e1d591	29	mbstate_t state;
2dc6356a	30
d4e1d591	31	memset(&state, 0, sizeof state);
2dc6356a	32
	33	while (mblen > 0) {
	34	size_t i = mbrtowc(wcstr+n, mbstr, (size_t)mblen, &state);
	35	if (i == (size_t)-1 \|\| i == (size_t)-2)
	36	break;
	37	n++;
	38	mbstr += i;
	39	mblen -= i;
	40	}
	41
2dc6356a	42	return n;
facd762c	43	} else if (codepage == CS_NONE) {
	44	int n = 0;
	45
	46	while (mblen > 0) {
	47	wcstr[n] = 0xD800 \| (mbstr[0] & 0xFF);
	48	n++;
	49	mbstr++;
	50	mblen--;
	51	}
	52
	53	return n;
2dc6356a	54	} else
	55	return charset_to_unicode(&mbstr, &mblen, wcstr, wclen, codepage,
	56	NULL, NULL, 0);
e6346999	57	}
e6346999	58
57191fa4	59	int wc_to_mb(int codepage, int flags, const wchar_t *wcstr, int wclen,
21d2b241	60	char mbstr, int mblen, char defchr, int *defused,
21d2b241	61	struct unicode_data *ucsdata)
e6346999	62	{
2dc6356a	63	/* FIXME: we should remove the defused param completely... */
e6346999	64	if (defused)
e6346999	65	*defused = 0;
2dc6356a	66
	67	if (codepage == DEFAULT_CODEPAGE) {
	68	char output[MB_LEN_MAX];
d4e1d591	69	mbstate_t state;
2dc6356a	70	int n = 0;
2dc6356a	71
d4e1d591	72	memset(&state, 0, sizeof state);
2dc6356a	73
	74	while (wclen > 0) {
	75	int i = wcrtomb(output, wcstr[0], &state);
	76	if (i == (size_t)-1 \|\| i > n - mblen)
	77	break;
	78	memcpy(mbstr+n, output, i);
	79	n += i;
	80	wcstr++;
	81	wclen--;
	82	}
	83
2dc6356a	84	return n;
facd762c	85	} else if (codepage == CS_NONE) {
	86	int n = 0;
	87	while (wclen > 0 && n < mblen) {
	88	if (wcstr >= 0xD800 && wcstr < 0xD900)
	89	mbstr[n++] = (*wcstr & 0xFF);
	90	else if (defchr)
	91	mbstr[n++] = *defchr;
	92	wcstr++;
	93	wclen--;
	94	}
	95	return n;
	96	} else {
2dc6356a	97	return charset_from_unicode(&wcstr, &wclen, mbstr, mblen, codepage,
0f993689	98	NULL, defchr?defchr:NULL, defchr?1:0);
facd762c	99	}
1709795f	100	}
1709795f	101
085f4a68	102	/*
	103	* Return value is TRUE if pterm is to run in direct-to-font mode.
	104	*/
6ac7f054	105	int init_ucs(struct unicode_data ucsdata, char linecharset,
6ac7f054	106	int utf8_override, int font_charset, int vtmode)
1709795f	107	{
085f4a68	108	int i, ret = 0;
2dc6356a	109
	110	/*
	111	* In the platform-independent parts of the code, font_codepage
	112	* is used only for system DBCS support - which we don't
	113	* support at all. So we set this to something which will never
	114	* be used.
	115	*/
21d2b241	116	ucsdata->font_codepage = -1;
2dc6356a	117
2dc6356a	118	/*
6ac7f054	119	* If utf8_override is set and the POSIX locale settings
	120	* dictate a UTF-8 character set, then just go straight for
	121	* UTF-8.
2dc6356a	122	*/
6ac7f054	123	ucsdata->line_codepage = CS_NONE;
	124	if (utf8_override) {
	125	const char *s;
	126	if (((s = getenv("LC_ALL")) && *s) \|\|
	127	((s = getenv("LC_CTYPE")) && *s) \|\|
	128	((s = getenv("LANG")) && *s)) {
	129	if (strstr(s, "UTF-8"))
	130	ucsdata->line_codepage = CS_UTF8;
	131	}
	132	}
	133
	134	/*
	135	* Failing that, line_codepage should be decoded from the
4a693cfc	136	* specification in conf.
6ac7f054	137	*/
	138	if (ucsdata->line_codepage == CS_NONE)
	139	ucsdata->line_codepage = decode_codepage(linecharset);
2dc6356a	140
facd762c	141	/*
	142	* If line_codepage is _still_ CS_NONE, we assume we're using
	143	* the font's own encoding. This has been passed in to us, so
	144	* we use that. If it's still CS_NONE after _that_ - i.e. the
	145	* font we were given had an incomprehensible charset - then we
	146	* fall back to using the D800 page.
	147	*/
21d2b241	148	if (ucsdata->line_codepage == CS_NONE)
21d2b241	149	ucsdata->line_codepage = font_charset;
2dc6356a	150
21d2b241	151	if (ucsdata->line_codepage == CS_NONE)
085f4a68	152	ret = 1;
085f4a68	153
2dc6356a	154	/*
	155	* Set up unitab_line, by translating each individual character
	156	* in the line codepage into Unicode.
	157	*/
	158	for (i = 0; i < 256; i++) {
57191fa4	159	char c[1];
57191fa4	160	const char *p;
2dc6356a	161	wchar_t wc[1];
	162	int len;
	163	c[0] = i;
	164	p = c;
	165	len = 1;
21d2b241	166	if (ucsdata->line_codepage == CS_NONE)
	167	ucsdata->unitab_line[i] = 0xD800 \| i;
	168	else if (1 == charset_to_unicode(&p, &len, wc, 1,
	169	ucsdata->line_codepage,
facd762c	170	NULL, L"", 0))
21d2b241	171	ucsdata->unitab_line[i] = wc[0];
1709795f	172	else
21d2b241	173	ucsdata->unitab_line[i] = 0xFFFD;
2dc6356a	174	}
1709795f	175
2dc6356a	176	/*
	177	* Set up unitab_xterm. This is the same as unitab_line except
	178	* in the line-drawing regions, where it follows the Unicode
	179	* encoding.
	180	*
	181	* (Note that the strange X encoding of line-drawing characters
	182	* in the bottom 32 glyphs of ISO8859-1 fonts is taken care of
	183	* by the font encoding, which will spot such a font and act as
	184	* if it were in a variant encoding of ISO8859-1.)
	185	*/
1709795f	186	for (i = 0; i < 256; i++) {
2dc6356a	187	static const wchar_t unitab_xterm_std[32] = {
	188	0x2666, 0x2592, 0x2409, 0x240c, 0x240d, 0x240a, 0x00b0, 0x00b1,
	189	0x2424, 0x240b, 0x2518, 0x2510, 0x250c, 0x2514, 0x253c, 0x23ba,
	190	0x23bb, 0x2500, 0x23bc, 0x23bd, 0x251c, 0x2524, 0x2534, 0x252c,
	191	0x2502, 0x2264, 0x2265, 0x03c0, 0x2260, 0x00a3, 0x00b7, 0x0020
	192	};
3900c2d6	193	static const wchar_t unitab_xterm_poorman[32] =
	194	L"#*o~+++++-----++++\|****L. ";
	195
	196	const wchar_t *ptr;
	197
	198	if (vtmode == VT_POORMAN)
	199	ptr = unitab_xterm_poorman;
	200	else
	201	ptr = unitab_xterm_std;
	202
2dc6356a	203	if (i >= 0x5F && i < 0x7F)
3900c2d6	204	ucsdata->unitab_xterm[i] = ptr[i & 0x1F];
2dc6356a	205	else
21d2b241	206	ucsdata->unitab_xterm[i] = ucsdata->unitab_line[i];
1709795f	207	}
2dc6356a	208
	209	/*
	210	* Set up unitab_scoacs. The SCO Alternate Character Set is
	211	* simply CP437.
	212	*/
	213	for (i = 0; i < 256; i++) {
57191fa4	214	char c[1];
57191fa4	215	const char *p;
2dc6356a	216	wchar_t wc[1];
	217	int len;
	218	c[0] = i;
	219	p = c;
	220	len = 1;
facd762c	221	if (1 == charset_to_unicode(&p, &len, wc, 1, CS_CP437, NULL, L"", 0))
21d2b241	222	ucsdata->unitab_scoacs[i] = wc[0];
2dc6356a	223	else
21d2b241	224	ucsdata->unitab_scoacs[i] = 0xFFFD;
2dc6356a	225	}
2dc6356a	226
facd762c	227	/*
	228	* Find the control characters in the line codepage. For
	229	* direct-to-font mode using the D800 hack, we assume 00-1F and
	230	* 7F are controls, but allow 80-9F through. (It's as good a
	231	* guess as anything; and my bet is that half the weird fonts
	232	* used in this way will be IBM or MS code pages anyway.)
	233	*/
	234	for (i = 0; i < 256; i++) {
21d2b241	235	int lineval = ucsdata->unitab_line[i];
facd762c	236	if (lineval < ' ' \|\| (lineval >= 0x7F && lineval < 0xA0) \|\|
facd762c	237	(lineval >= 0xD800 && lineval < 0xD820) \|\| (lineval == 0xD87F))
21d2b241	238	ucsdata->unitab_ctrl[i] = i;
2dc6356a	239	else
21d2b241	240	ucsdata->unitab_ctrl[i] = 0xFF;
facd762c	241	}
085f4a68	242
085f4a68	243	return ret;
126ce234	244	}
d4413bd2	245
	246	const char *cp_name(int codepage)
	247	{
	248	if (codepage == CS_NONE)
	249	return "Use font encoding";
	250	return charset_to_localenc(codepage);
	251	}
	252
	253	const char *cp_enumerate(int index)
	254	{
	255	int charset;
7e8aecee	256	charset = charset_localenc_nth(index);
	257	if (charset == CS_NONE) {
	258	/* "Use font encoding" comes after all the named charsets */
	259	if (charset_localenc_nth(index-1) != CS_NONE)
	260	return "Use font encoding";
d4413bd2	261	return NULL;
7e8aecee	262	}
d4413bd2	263	return charset_to_localenc(charset);
	264	}
	265
	266	int decode_codepage(char *cp_name)
	267	{
93511b79	268	if (!cp_name \|\| !*cp_name)
7e8aecee	269	return CS_UTF8;
d4413bd2	270	return charset_from_localenc(cp_name);
d4413bd2	271	}