[u/mdw/putty] / unix / uxucs.c

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <locale.h>
#include <limits.h>
#include <wchar.h>

#include <time.h>

#include "putty.h"
#include "charset.h"
#include "terminal.h"
#include "misc.h"

/*
 * Unix Unicode-handling routines.
 */

/*
 * Report whether 'byte' is the lead byte of a double-byte character
 * in 'codepage'. This port does no DBCS handling, so the answer is
 * always "no" regardless of the arguments.
 */
int is_dbcs_leadbyte(int codepage, char byte)
{
    (void)codepage;
    (void)byte;

    return 0;
}

/*
 * Convert a multibyte string into wide characters.
 *
 *  - codepage: DEFAULT_CODEPAGE converts using the locale named by
 *    the environment (via mbrtowc); CS_NONE maps each input byte
 *    straight into the D800 page; anything else is handed to
 *    charset_to_unicode().
 *  - flags: unused here.
 *  - mbstr, mblen: input bytes and their count.
 *  - wcstr, wclen: output buffer and its capacity in wide characters.
 *
 * Returns the number of wide characters written to wcstr (for the
 * first two cases), or whatever charset_to_unicode() returns. At
 * most wclen wide characters are ever written.
 */
int mb_to_wc(int codepage, int flags, char *mbstr, int mblen,
	     wchar_t *wcstr, int wclen)
{
    if (codepage == DEFAULT_CODEPAGE) {
	int n = 0;
	mbstate_t state = { 0 };

	/* Temporarily adopt the user's locale for the conversion. */
	setlocale(LC_CTYPE, "");

	/* Stop when the input runs out _or_ the output buffer is full. */
	while (mblen > 0 && n < wclen) {
	    size_t i = mbrtowc(wcstr+n, mbstr, (size_t)mblen, &state);
	    if (i == (size_t)-1 || i == (size_t)-2)
		break;		       /* invalid or truncated sequence */
	    /*
	     * mbrtowc returns 0 when it decodes a null wide character,
	     * without saying how many bytes that took. Treat it as a
	     * single byte so that an embedded NUL cannot stall the
	     * loop forever.
	     */
	    if (i == 0)
		i = 1;
	    n++;
	    mbstr += i;
	    mblen -= (int)i;
	}

	setlocale(LC_CTYPE, "C");

	return n;
    } else if (codepage == CS_NONE) {
	int n = 0;

	/* Direct-to-font: byte XX becomes U+D8XX, bounded by wclen. */
	while (mblen > 0 && n < wclen) {
	    wcstr[n] = 0xD800 | (mbstr[0] & 0xFF);
	    n++;
	    mbstr++;
	    mblen--;
	}

	return n;
    } else
	return charset_to_unicode(&mbstr, &mblen, wcstr, wclen, codepage,
				  NULL, NULL, 0);
}

/*
 * Convert a wide-character string into a multibyte string.
 *
 *  - codepage: DEFAULT_CODEPAGE converts using the locale named by
 *    the environment (via wcrtomb); CS_NONE extracts the low byte of
 *    characters in the range D800-D8FF; anything else is handed to
 *    charset_from_unicode().
 *  - flags, ucsdata: unused here.
 *  - wcstr, wclen: input wide characters and their count.
 *  - mbstr, mblen: output buffer and its capacity in bytes.
 *  - defchr: if non-NULL, points at a single byte to substitute for
 *    characters that cannot be represented (CS_NONE and explicit
 *    charset cases).
 *  - defused: if non-NULL, is always set to 0.
 *
 * Returns the number of bytes written to mbstr (for the first two
 * cases), or whatever charset_from_unicode() returns.
 */
int wc_to_mb(int codepage, int flags, wchar_t *wcstr, int wclen,
	     char *mbstr, int mblen, char *defchr, int *defused,
	     struct unicode_data *ucsdata)
{
    /* FIXME: we should remove the defused param completely... */
    if (defused)
	*defused = 0;

    if (codepage == DEFAULT_CODEPAGE) {
	char output[MB_LEN_MAX];
	mbstate_t state = { 0 };
	int n = 0;

	/* Temporarily adopt the user's locale for the conversion. */
	setlocale(LC_CTYPE, "");

	while (wclen > 0) {
	    size_t len = wcrtomb(output, wcstr[0], &state);
	    /*
	     * Stop on an unconvertible character, or when the encoded
	     * form would not fit in the space remaining. (The space
	     * remaining is mblen - n; comparing against n - mblen
	     * would reject every character.)
	     */
	    if (len == (size_t)-1 || n >= mblen ||
		len > (size_t)(mblen - n))
		break;
	    memcpy(mbstr+n, output, len);
	    n += (int)len;
	    wcstr++;
	    wclen--;
	}

	setlocale(LC_CTYPE, "C");

	return n;
    } else if (codepage == CS_NONE) {
	int n = 0;
	while (wclen > 0 && n < mblen) {
	    if (*wcstr >= 0xD800 && *wcstr < 0xD900)
		mbstr[n++] = (*wcstr & 0xFF);
	    else if (defchr)
		mbstr[n++] = *defchr;
	    /* otherwise the character is silently dropped */
	    wcstr++;
	    wclen--;
	}
	return n;
    } else {
	return charset_from_unicode(&wcstr, &wclen, mbstr, mblen, codepage,
				    NULL, defchr, defchr?1:0);
    }
}

/*
 * Fill in the translation tables of 'ucsdata':
 *
 *  - line_codepage is chosen from, in order of preference: UTF-8 (if
 *    utf8_override is set and the locale environment variables say
 *    so), the charset named by 'linecharset', and 'font_charset'.
 *  - unitab_line, unitab_xterm, unitab_scoacs and unitab_ctrl are
 *    then built for all 256 byte values; 'vtmode' selects which
 *    line-drawing table unitab_xterm uses.
 *
 * Return value is TRUE if pterm is to run in direct-to-font mode,
 * i.e. if no usable line codepage could be determined.
 */
int init_ucs(struct unicode_data *ucsdata, char *linecharset,
	     int utf8_override, int font_charset, int vtmode)
{
    /*
     * Replacement glyphs for the 32 table slots used by the
     * line-drawing region (bytes 0x5F to 0x7E, indexed by the low
     * five bits of the byte): proper Unicode code points, and a
     * plain-ASCII approximation for VT_POORMAN.
     */
    static const wchar_t xterm_std[32] = {
	0x2666, 0x2592, 0x2409, 0x240c, 0x240d, 0x240a, 0x00b0, 0x00b1,
	0x2424, 0x240b, 0x2518, 0x2510, 0x250c, 0x2514, 0x253c, 0x23ba,
	0x23bb, 0x2500, 0x23bc, 0x23bd, 0x251c, 0x2524, 0x2534, 0x252c,
	0x2502, 0x2264, 0x2265, 0x03c0, 0x2260, 0x00a3, 0x00b7, 0x0020
    };
    static const wchar_t xterm_poorman[32] =
	L"*#****o~**+++++-----++++|****L. ";

    const wchar_t *linedraw;
    int direct_to_font;
    int ch;

    /*
     * The platform-independent code only looks at font_codepage
     * for system DBCS support, which we don't provide, so give it
     * a value that will never be used.
     */
    ucsdata->font_codepage = -1;

    /*
     * First choice: UTF-8, if the caller allows the override and
     * the first non-empty one of LC_ALL, LC_CTYPE and LANG mentions
     * it.
     */
    ucsdata->line_codepage = CS_NONE;
    if (utf8_override) {
	const char *loc = getenv("LC_ALL");
	if (!loc || !*loc)
	    loc = getenv("LC_CTYPE");
	if (!loc || !*loc)
	    loc = getenv("LANG");
	if (loc && *loc && strstr(loc, "UTF-8"))
	    ucsdata->line_codepage = CS_UTF8;
    }

    /* Second choice: the character set named in the configuration. */
    if (ucsdata->line_codepage == CS_NONE)
	ucsdata->line_codepage = decode_codepage(linecharset);

    /*
     * Third choice: the font's own encoding, as passed in. If even
     * that is CS_NONE (the font had an incomprehensible charset),
     * we end up using the D800 page, i.e. direct-to-font mode.
     */
    if (ucsdata->line_codepage == CS_NONE)
	ucsdata->line_codepage = font_charset;

    direct_to_font = (ucsdata->line_codepage == CS_NONE);

    /*
     * unitab_line: each byte of the line codepage translated to
     * Unicode on its own.
     */
    for (ch = 0; ch < 256; ch++) {
	if (ucsdata->line_codepage == CS_NONE) {
	    ucsdata->unitab_line[ch] = 0xD800 | ch;
	} else {
	    char in[1], *inp;
	    wchar_t out[1];
	    int inlen;

	    in[0] = ch;
	    inp = in;
	    inlen = 1;
	    if (charset_to_unicode(&inp, &inlen, out, 1,
				   ucsdata->line_codepage,
				   NULL, L"", 0) == 1)
		ucsdata->unitab_line[ch] = out[0];
	    else
		ucsdata->unitab_line[ch] = 0xFFFD;
	}
    }

    /*
     * unitab_xterm: identical to unitab_line outside the
     * line-drawing region, and taken from the chosen table inside
     * it.
     *
     * (The odd X encoding of line-drawing characters in the bottom
     * 32 glyphs of ISO8859-1 fonts is the font encoding's problem:
     * it will spot such a font and treat it as a variant encoding
     * of ISO8859-1.)
     */
    if (vtmode == VT_POORMAN)
	linedraw = xterm_poorman;
    else
	linedraw = xterm_std;

    for (ch = 0; ch < 256; ch++) {
	if (ch < 0x5F || ch >= 0x7F)
	    ucsdata->unitab_xterm[ch] = ucsdata->unitab_line[ch];
	else
	    ucsdata->unitab_xterm[ch] = linedraw[ch & 0x1F];
    }

    /*
     * unitab_scoacs: the SCO Alternate Character Set, which is
     * simply CP437.
     */
    for (ch = 0; ch < 256; ch++) {
	char in[1], *inp;
	wchar_t out[1];
	int inlen;

	in[0] = ch;
	inp = in;
	inlen = 1;
	if (charset_to_unicode(&inp, &inlen, out, 1, CS_CP437,
			       NULL, L"", 0) == 1)
	    ucsdata->unitab_scoacs[ch] = out[0];
	else
	    ucsdata->unitab_scoacs[ch] = 0xFFFD;
    }

    /*
     * unitab_ctrl: mark which bytes of the line codepage are
     * control characters. In direct-to-font (D800) mode only 00-1F
     * and 7F count as controls and 80-9F are let through - a guess,
     * but many fonts used this way are probably IBM or MS code
     * pages.
     */
    for (ch = 0; ch < 256; ch++) {
	int lineval = ucsdata->unitab_line[ch];
	int is_ctrl = 0;

	if (lineval < ' ')
	    is_ctrl = 1;
	else if (lineval >= 0x7F && lineval < 0xA0)
	    is_ctrl = 1;
	else if (lineval >= 0xD800 && lineval < 0xD820)
	    is_ctrl = 1;
	else if (lineval == 0xD87F)
	    is_ctrl = 1;

	if (is_ctrl)
	    ucsdata->unitab_ctrl[ch] = ch;
	else
	    ucsdata->unitab_ctrl[ch] = 0xFF;
    }

    return direct_to_font;
}

/*
 * Return the display name for a codepage: a fixed label for
 * CS_NONE, otherwise whatever charset_to_localenc() gives.
 */
const char *cp_name(int codepage)
{
    if (codepage != CS_NONE)
	return charset_to_localenc(codepage);

    return "Use font encoding";
}

/*
 * Enumerate the selectable codepage names. Index 0 is the fixed
 * "Use font encoding" entry; index k > 0 is the name of the
 * (k-1)th charset reported by charset_localenc_nth(). Returns NULL
 * once that function reports CS_NONE.
 */
const char *cp_enumerate(int index)
{
    if (index != 0) {
	int cs = charset_localenc_nth(index - 1);

	if (cs != CS_NONE)
	    return charset_to_localenc(cs);
	return NULL;
    }

    return "Use font encoding";
}

/*
 * Turn a codepage name into a charset identifier. The empty string
 * means "use font encoding" and yields CS_NONE; any other name is
 * looked up with charset_from_localenc().
 */
int decode_codepage(char *cp_name)
{
    if (cp_name[0] != '\0')
	return charset_from_localenc(cp_name);

    return CS_NONE;
}
Commit	Line	Data
1709795f	1	#include <stdio.h>
	2	#include <stdlib.h>
	3	#include <ctype.h>
2dc6356a	4	#include <locale.h>
	5	#include <limits.h>
	6	#include <wchar.h>
1709795f	7
1709795f	8	#include <time.h>
2dc6356a	9
1709795f	10	#include "putty.h"
d4413bd2	11	#include "charset.h"
887035a5	12	#include "terminal.h"
1709795f	13	#include "misc.h"
	14
	15	/*
	16	* Unix Unicode-handling routines.
1709795f	17	*/
1709795f	18
1709795f	19	int is_dbcs_leadbyte(int codepage, char byte)
	20	{
	21	return 0; /* we don't do DBCS */
	22	}
	23
	24	int mb_to_wc(int codepage, int flags, char *mbstr, int mblen,
	25	wchar_t *wcstr, int wclen)
	26	{
2dc6356a	27	if (codepage == DEFAULT_CODEPAGE) {
	28	int n = 0;
	29	mbstate_t state = { 0 };
	30
	31	setlocale(LC_CTYPE, "");
	32
	33	while (mblen > 0) {
	34	size_t i = mbrtowc(wcstr+n, mbstr, (size_t)mblen, &state);
	35	if (i == (size_t)-1 \|\| i == (size_t)-2)
	36	break;
	37	n++;
	38	mbstr += i;
	39	mblen -= i;
	40	}
	41
	42	setlocale(LC_CTYPE, "C");
	43
	44	return n;
facd762c	45	} else if (codepage == CS_NONE) {
	46	int n = 0;
	47
	48	while (mblen > 0) {
	49	wcstr[n] = 0xD800 \| (mbstr[0] & 0xFF);
	50	n++;
	51	mbstr++;
	52	mblen--;
	53	}
	54
	55	return n;
2dc6356a	56	} else
	57	return charset_to_unicode(&mbstr, &mblen, wcstr, wclen, codepage,
	58	NULL, NULL, 0);
e6346999	59	}
	60
	61	int wc_to_mb(int codepage, int flags, wchar_t *wcstr, int wclen,
21d2b241	62	char mbstr, int mblen, char defchr, int *defused,
21d2b241	63	struct unicode_data *ucsdata)
e6346999	64	{
2dc6356a	65	/* FIXME: we should remove the defused param completely... */
e6346999	66	if (defused)
e6346999	67	*defused = 0;
2dc6356a	68
	69	if (codepage == DEFAULT_CODEPAGE) {
	70	char output[MB_LEN_MAX];
	71	mbstate_t state = { 0 };
	72	int n = 0;
	73
	74	setlocale(LC_CTYPE, "");
	75
	76	while (wclen > 0) {
	77	int i = wcrtomb(output, wcstr[0], &state);
	78	if (i == (size_t)-1 \|\| i > n - mblen)
	79	break;
	80	memcpy(mbstr+n, output, i);
	81	n += i;
	82	wcstr++;
	83	wclen--;
	84	}
	85
	86	setlocale(LC_CTYPE, "C");
	87
	88	return n;
facd762c	89	} else if (codepage == CS_NONE) {
	90	int n = 0;
	91	while (wclen > 0 && n < mblen) {
	92	if (wcstr >= 0xD800 && wcstr < 0xD900)
	93	mbstr[n++] = (*wcstr & 0xFF);
	94	else if (defchr)
	95	mbstr[n++] = *defchr;
	96	wcstr++;
	97	wclen--;
	98	}
	99	return n;
	100	} else {
2dc6356a	101	return charset_from_unicode(&wcstr, &wclen, mbstr, mblen, codepage,
0f993689	102	NULL, defchr?defchr:NULL, defchr?1:0);
facd762c	103	}
1709795f	104	}
1709795f	105
085f4a68	106	/*
	107	* Return value is TRUE if pterm is to run in direct-to-font mode.
	108	*/
6ac7f054	109	int init_ucs(struct unicode_data ucsdata, char linecharset,
6ac7f054	110	int utf8_override, int font_charset, int vtmode)
1709795f	111	{
085f4a68	112	int i, ret = 0;
2dc6356a	113
	114	/*
	115	* In the platform-independent parts of the code, font_codepage
	116	* is used only for system DBCS support - which we don't
	117	* support at all. So we set this to something which will never
	118	* be used.
	119	*/
21d2b241	120	ucsdata->font_codepage = -1;
2dc6356a	121
2dc6356a	122	/*
6ac7f054	123	* If utf8_override is set and the POSIX locale settings
	124	* dictate a UTF-8 character set, then just go straight for
	125	* UTF-8.
2dc6356a	126	*/
6ac7f054	127	ucsdata->line_codepage = CS_NONE;
	128	if (utf8_override) {
	129	const char *s;
	130	if (((s = getenv("LC_ALL")) && *s) \|\|
	131	((s = getenv("LC_CTYPE")) && *s) \|\|
	132	((s = getenv("LANG")) && *s)) {
	133	if (strstr(s, "UTF-8"))
	134	ucsdata->line_codepage = CS_UTF8;
	135	}
	136	}
	137
	138	/*
	139	* Failing that, line_codepage should be decoded from the
	140	* specification in cfg.
	141	*/
	142	if (ucsdata->line_codepage == CS_NONE)
	143	ucsdata->line_codepage = decode_codepage(linecharset);
2dc6356a	144
facd762c	145	/*
	146	* If line_codepage is _still_ CS_NONE, we assume we're using
	147	* the font's own encoding. This has been passed in to us, so
	148	* we use that. If it's still CS_NONE after _that_ - i.e. the
	149	* font we were given had an incomprehensible charset - then we
	150	* fall back to using the D800 page.
	151	*/
21d2b241	152	if (ucsdata->line_codepage == CS_NONE)
21d2b241	153	ucsdata->line_codepage = font_charset;
2dc6356a	154
21d2b241	155	if (ucsdata->line_codepage == CS_NONE)
085f4a68	156	ret = 1;
085f4a68	157
2dc6356a	158	/*
	159	* Set up unitab_line, by translating each individual character
	160	* in the line codepage into Unicode.
	161	*/
	162	for (i = 0; i < 256; i++) {
	163	char c[1], *p;
	164	wchar_t wc[1];
	165	int len;
	166	c[0] = i;
	167	p = c;
	168	len = 1;
21d2b241	169	if (ucsdata->line_codepage == CS_NONE)
	170	ucsdata->unitab_line[i] = 0xD800 \| i;
	171	else if (1 == charset_to_unicode(&p, &len, wc, 1,
	172	ucsdata->line_codepage,
facd762c	173	NULL, L"", 0))
21d2b241	174	ucsdata->unitab_line[i] = wc[0];
1709795f	175	else
21d2b241	176	ucsdata->unitab_line[i] = 0xFFFD;
2dc6356a	177	}
1709795f	178
2dc6356a	179	/*
	180	* Set up unitab_xterm. This is the same as unitab_line except
	181	* in the line-drawing regions, where it follows the Unicode
	182	* encoding.
	183	*
	184	* (Note that the strange X encoding of line-drawing characters
	185	* in the bottom 32 glyphs of ISO8859-1 fonts is taken care of
	186	* by the font encoding, which will spot such a font and act as
	187	* if it were in a variant encoding of ISO8859-1.)
	188	*/
1709795f	189	for (i = 0; i < 256; i++) {
2dc6356a	190	static const wchar_t unitab_xterm_std[32] = {
	191	0x2666, 0x2592, 0x2409, 0x240c, 0x240d, 0x240a, 0x00b0, 0x00b1,
	192	0x2424, 0x240b, 0x2518, 0x2510, 0x250c, 0x2514, 0x253c, 0x23ba,
	193	0x23bb, 0x2500, 0x23bc, 0x23bd, 0x251c, 0x2524, 0x2534, 0x252c,
	194	0x2502, 0x2264, 0x2265, 0x03c0, 0x2260, 0x00a3, 0x00b7, 0x0020
	195	};
3900c2d6	196	static const wchar_t unitab_xterm_poorman[32] =
	197	L"#*o~+++++-----++++\|****L. ";
	198
	199	const wchar_t *ptr;
	200
	201	if (vtmode == VT_POORMAN)
	202	ptr = unitab_xterm_poorman;
	203	else
	204	ptr = unitab_xterm_std;
	205
2dc6356a	206	if (i >= 0x5F && i < 0x7F)
3900c2d6	207	ucsdata->unitab_xterm[i] = ptr[i & 0x1F];
2dc6356a	208	else
21d2b241	209	ucsdata->unitab_xterm[i] = ucsdata->unitab_line[i];
1709795f	210	}
2dc6356a	211
	212	/*
	213	* Set up unitab_scoacs. The SCO Alternate Character Set is
	214	* simply CP437.
	215	*/
	216	for (i = 0; i < 256; i++) {
	217	char c[1], *p;
	218	wchar_t wc[1];
	219	int len;
	220	c[0] = i;
	221	p = c;
	222	len = 1;
facd762c	223	if (1 == charset_to_unicode(&p, &len, wc, 1, CS_CP437, NULL, L"", 0))
21d2b241	224	ucsdata->unitab_scoacs[i] = wc[0];
2dc6356a	225	else
21d2b241	226	ucsdata->unitab_scoacs[i] = 0xFFFD;
2dc6356a	227	}
2dc6356a	228
facd762c	229	/*
	230	* Find the control characters in the line codepage. For
	231	* direct-to-font mode using the D800 hack, we assume 00-1F and
	232	* 7F are controls, but allow 80-9F through. (It's as good a
	233	* guess as anything; and my bet is that half the weird fonts
	234	* used in this way will be IBM or MS code pages anyway.)
	235	*/
	236	for (i = 0; i < 256; i++) {
21d2b241	237	int lineval = ucsdata->unitab_line[i];
facd762c	238	if (lineval < ' ' \|\| (lineval >= 0x7F && lineval < 0xA0) \|\|
facd762c	239	(lineval >= 0xD800 && lineval < 0xD820) \|\| (lineval == 0xD87F))
21d2b241	240	ucsdata->unitab_ctrl[i] = i;
2dc6356a	241	else
21d2b241	242	ucsdata->unitab_ctrl[i] = 0xFF;
facd762c	243	}
085f4a68	244
085f4a68	245	return ret;
126ce234	246	}
d4413bd2	247
	248	const char *cp_name(int codepage)
	249	{
	250	if (codepage == CS_NONE)
	251	return "Use font encoding";
	252	return charset_to_localenc(codepage);
	253	}
	254
	255	const char *cp_enumerate(int index)
	256	{
	257	int charset;
	258	if (index == 0)
	259	return "Use font encoding";
	260	charset = charset_localenc_nth(index-1);
	261	if (charset == CS_NONE)
	262	return NULL;
	263	return charset_to_localenc(charset);
	264	}
	265
	266	int decode_codepage(char *cp_name)
	267	{
	268	if (!*cp_name)
	269	return CS_NONE; /* use font encoding */
	270	return charset_from_localenc(cp_name);
	271	}