[sgt/putty] / unix / uxucs.c

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <locale.h>
#include <limits.h>
#include <wchar.h>

#include <time.h>

#include "putty.h"
#include "charset.h"
#include "terminal.h"
#include "misc.h"

/*
 * Unix Unicode-handling routines.
 */

/*
 * Report whether `byte' is a DBCS lead byte in `codepage'. This port
 * does not do DBCS, so both arguments are ignored and the answer is
 * always 0.
 */
int is_dbcs_leadbyte(int codepage, char byte)
{
    (void)codepage;
    (void)byte;

    return 0;
}

/*
 * Convert the multibyte string `mbstr' (`mblen' bytes; it need not
 * be NUL-terminated) into wide characters stored in `wcstr', which
 * has room for `wclen' elements.
 *
 *  - DEFAULT_CODEPAGE: decode with mbrtowc() under the locale
 *    selected by setlocale(LC_CTYPE, ""); LC_CTYPE is set back to
 *    "C" before returning. Conversion stops at the first invalid
 *    or incomplete sequence, or when `wcstr' is full.
 *  - CS_NONE: each input byte b becomes 0xD800 | b.
 *  - anything else: the work is handed to charset_to_unicode() and
 *    its return value is passed straight back.
 *
 * For the first two cases the return value is the number of wide
 * characters written. `flags' is not used.
 */
int mb_to_wc(int codepage, int flags, char *mbstr, int mblen,
             wchar_t *wcstr, int wclen)
{
    if (codepage == DEFAULT_CODEPAGE) {
        int n = 0;
        mbstate_t state = { 0 };

        setlocale(LC_CTYPE, "");

        /* Never write more than wclen elements into wcstr. */
        while (mblen > 0 && n < wclen) {
            size_t i = mbrtowc(wcstr+n, mbstr, (size_t)mblen, &state);
            if (i == (size_t)-1 || i == (size_t)-2)
                break;
            if (i == 0) {
                /*
                 * mbrtowc() returns 0, not a byte count, when it
                 * decodes the null wide character. The encoding of
                 * that character ends at the first zero byte, so
                 * step past it; otherwise we would never advance.
                 */
                i = 1;
                while (i < (size_t)mblen && mbstr[i-1] != '\0')
                    i++;
            }
            n++;
            mbstr += i;
            mblen -= (int)i;
        }

        setlocale(LC_CTYPE, "C");

        return n;
    } else if (codepage == CS_NONE) {
        int n = 0;

        /* Same output bound as above. */
        while (mblen > 0 && n < wclen) {
            wcstr[n] = 0xD800 | (mbstr[0] & 0xFF);
            n++;
            mbstr++;
            mblen--;
        }

        return n;
    } else
        return charset_to_unicode(&mbstr, &mblen, wcstr, wclen, codepage,
                                  NULL, NULL, 0);
}

/*
 * Convert `wclen' wide characters from `wcstr' into a multibyte
 * string stored in `mbstr', which has room for `mblen' bytes.
 *
 *  - DEFAULT_CODEPAGE: encode with wcrtomb() under the locale
 *    selected by setlocale(LC_CTYPE, ""); LC_CTYPE is set back to
 *    "C" before returning. Conversion stops at the first
 *    unencodable character, or at the first character whose
 *    encoding would not fit in the remaining space.
 *  - CS_NONE: characters in 0xD800..0xD8FF are written as their
 *    low byte; any other character is replaced by *defchr if
 *    `defchr' is non-NULL and silently dropped otherwise.
 *  - anything else: the work is handed to charset_from_unicode()
 *    and its return value is passed straight back.
 *
 * For the first two cases the return value is the number of bytes
 * written. *defused, if `defused' is non-NULL, is always set to 0.
 * `flags' and `ucsdata' are not used.
 */
int wc_to_mb(int codepage, int flags, wchar_t *wcstr, int wclen,
             char *mbstr, int mblen, char *defchr, int *defused,
             struct unicode_data *ucsdata)
{
    /* FIXME: we should remove the defused param completely... */
    if (defused)
        *defused = 0;

    if (codepage == DEFAULT_CODEPAGE) {
        char output[MB_LEN_MAX];
        mbstate_t state = { 0 };
        int n = 0;

        setlocale(LC_CTYPE, "");

        while (wclen > 0) {
            size_t len = wcrtomb(output, wcstr[0], &state);
            if (len == (size_t)-1)
                break;                 /* character not representable */
            /*
             * Stop if this character does not fit in what is left
             * of the output buffer (mblen - n bytes). len is at
             * most MB_LEN_MAX here, so the cast is safe.
             */
            if ((int)len > mblen - n)
                break;
            memcpy(mbstr+n, output, len);
            n += (int)len;
            wcstr++;
            wclen--;
        }

        setlocale(LC_CTYPE, "C");

        return n;
    } else if (codepage == CS_NONE) {
        int n = 0;
        while (wclen > 0 && n < mblen) {
            if (*wcstr >= 0xD800 && *wcstr < 0xD900)
                mbstr[n++] = (*wcstr & 0xFF);
            else if (defchr)
                mbstr[n++] = *defchr;
            wcstr++;
            wclen--;
        }
        return n;
    } else {
        /* A NULL defchr means "no default character": length 0. */
        return charset_from_unicode(&wcstr, &wclen, mbstr, mblen, codepage,
                                    NULL, defchr, defchr?1:0);
    }
}

/*
 * Fill in the code page fields and the unitab_line, unitab_xterm,
 * unitab_scoacs and unitab_ctrl tables of *ucsdata.
 *
 * `linecharset' is decoded with decode_codepage(); if that yields
 * CS_NONE, `font_charset' is used instead. `vtmode' selects which
 * line-drawing table goes into unitab_xterm.
 *
 * Return value is TRUE if pterm is to run in direct-to-font mode,
 * i.e. if the line codepage is still CS_NONE after that fallback.
 */
int init_ucs(struct unicode_data *ucsdata,
             char *linecharset, int font_charset, int vtmode)
{
    /*
     * Replacement glyphs for bytes 0x5F..0x7E in unitab_xterm,
     * indexed by the low five bits of the byte (so 0x5F uses the
     * last entry and 0x60 the first).
     */
    static const wchar_t unitab_xterm_std[32] = {
        0x2666, 0x2592, 0x2409, 0x240c, 0x240d, 0x240a, 0x00b0, 0x00b1,
        0x2424, 0x240b, 0x2518, 0x2510, 0x250c, 0x2514, 0x253c, 0x23ba,
        0x23bb, 0x2500, 0x23bc, 0x23bd, 0x251c, 0x2524, 0x2534, 0x252c,
        0x2502, 0x2264, 0x2265, 0x03c0, 0x2260, 0x00a3, 0x00b7, 0x0020
    };
    static const wchar_t unitab_xterm_poorman[32] =
        L"*#****o~**+++++-----++++|****L. ";

    const wchar_t *linedraw;
    int direct_to_font;
    int ch;

    /*
     * In the platform-independent parts of the code, font_codepage
     * is used only for system DBCS support - which we don't
     * support at all. So we set this to something which will never
     * be used.
     */
    ucsdata->font_codepage = -1;

    /*
     * Work out the line codepage: first from the configured name,
     * then, if that gave CS_NONE, from the font's own encoding. If
     * even the font's charset is CS_NONE we fall back to the D800
     * page, which is what direct-to-font mode means here.
     */
    ucsdata->line_codepage = decode_codepage(linecharset);
    if (ucsdata->line_codepage == CS_NONE)
        ucsdata->line_codepage = font_charset;
    direct_to_font = (ucsdata->line_codepage == CS_NONE);

    /*
     * unitab_line: each byte translated individually from the line
     * codepage into Unicode (or into the D800 page in
     * direct-to-font mode). Untranslatable bytes become U+FFFD.
     */
    for (ch = 0; ch < 256; ch++) {
        if (direct_to_font) {
            ucsdata->unitab_line[ch] = 0xD800 | ch;
        } else {
            char inbyte[1], *inptr;
            wchar_t outwc[1];
            int inlen;

            inbyte[0] = ch;
            inptr = inbyte;
            inlen = 1;
            if (charset_to_unicode(&inptr, &inlen, outwc, 1,
                                   ucsdata->line_codepage,
                                   NULL, L"", 0) != 1)
                ucsdata->unitab_line[ch] = 0xFFFD;
            else
                ucsdata->unitab_line[ch] = outwc[0];
        }
    }

    /*
     * unitab_xterm: identical to unitab_line except in the
     * line-drawing region, where it follows the Unicode encoding
     * (or the plain-ASCII approximations in VT_POORMAN mode).
     *
     * (Note that the strange X encoding of line-drawing characters
     * in the bottom 32 glyphs of ISO8859-1 fonts is taken care of
     * by the font encoding, which will spot such a font and act as
     * if it were in a variant encoding of ISO8859-1.)
     */
    if (vtmode == VT_POORMAN)
        linedraw = unitab_xterm_poorman;
    else
        linedraw = unitab_xterm_std;

    for (ch = 0; ch < 256; ch++) {
        if (ch < 0x5F || ch >= 0x7F)
            ucsdata->unitab_xterm[ch] = ucsdata->unitab_line[ch];
        else
            ucsdata->unitab_xterm[ch] = linedraw[ch & 0x1F];
    }

    /*
     * unitab_scoacs: the SCO Alternate Character Set is simply
     * CP437, translated byte by byte in the same way as above.
     */
    for (ch = 0; ch < 256; ch++) {
        char inbyte[1], *inptr;
        wchar_t outwc[1];
        int inlen;

        inbyte[0] = ch;
        inptr = inbyte;
        inlen = 1;
        if (charset_to_unicode(&inptr, &inlen, outwc, 1, CS_CP437,
                               NULL, L"", 0) != 1)
            ucsdata->unitab_scoacs[ch] = 0xFFFD;
        else
            ucsdata->unitab_scoacs[ch] = outwc[0];
    }

    /*
     * unitab_ctrl: find the control characters in the line
     * codepage. For direct-to-font mode using the D800 hack, we
     * assume 00-1F and 7F are controls, but allow 80-9F through.
     * (It's as good a guess as anything; and my bet is that half
     * the weird fonts used in this way will be IBM or MS code pages
     * anyway.) Control bytes map to themselves, everything else to
     * 0xFF.
     */
    for (ch = 0; ch < 256; ch++) {
        int lineval = ucsdata->unitab_line[ch];
        int unicode_ctrl =
            (lineval < ' ') || (lineval >= 0x7F && lineval < 0xA0);
        int d800_ctrl =
            (lineval >= 0xD800 && lineval < 0xD820) || (lineval == 0xD87F);

        if (unicode_ctrl || d800_ctrl)
            ucsdata->unitab_ctrl[ch] = ch;
        else
            ucsdata->unitab_ctrl[ch] = 0xFF;
    }

    return direct_to_font;
}

/*
 * Name of a codepage: the fixed label "Use font encoding" for
 * CS_NONE, otherwise whatever charset_to_localenc() returns for it.
 */
const char *cp_name(int codepage)
{
    if (codepage != CS_NONE)
        return charset_to_localenc(codepage);

    return "Use font encoding";
}

/*
 * Enumerate codepage names. Index 0 is always "Use font encoding";
 * index k > 0 is the name of charset_localenc_nth(k-1). Returns
 * NULL once charset_localenc_nth() yields CS_NONE.
 */
const char *cp_enumerate(int index)
{
    int nth;

    if (index == 0)
        return "Use font encoding";

    nth = charset_localenc_nth(index-1);
    if (nth != CS_NONE)
        return charset_to_localenc(nth);

    return NULL;
}

/*
 * Turn a codepage name into a charset identifier. The empty string
 * gives CS_NONE (use font encoding); anything else is looked up
 * with charset_from_localenc().
 */
int decode_codepage(char *cp_name)
{
    if (cp_name[0] != '\0')
        return charset_from_localenc(cp_name);

    return CS_NONE;                    /* use font encoding */
}
Commit	Line	Data
1709795f	1	#include <stdio.h>
	2	#include <stdlib.h>
	3	#include <ctype.h>
2dc6356a	4	#include <locale.h>
	5	#include <limits.h>
	6	#include <wchar.h>
1709795f	7
1709795f	8	#include <time.h>
2dc6356a	9
1709795f	10	#include "putty.h"
d4413bd2	11	#include "charset.h"
887035a5	12	#include "terminal.h"
1709795f	13	#include "misc.h"
	14
	15	/*
	16	* Unix Unicode-handling routines.
1709795f	17	*/
1709795f	18
1709795f	19	int is_dbcs_leadbyte(int codepage, char byte)
	20	{
	21	return 0; /* we don't do DBCS */
	22	}
	23
	24	int mb_to_wc(int codepage, int flags, char *mbstr, int mblen,
	25	wchar_t *wcstr, int wclen)
	26	{
2dc6356a	27	if (codepage == DEFAULT_CODEPAGE) {
	28	int n = 0;
	29	mbstate_t state = { 0 };
	30
	31	setlocale(LC_CTYPE, "");
	32
	33	while (mblen > 0) {
	34	size_t i = mbrtowc(wcstr+n, mbstr, (size_t)mblen, &state);
	35	if (i == (size_t)-1 || i == (size_t)-2)
	36	break;
	37	n++;
	38	mbstr += i;
	39	mblen -= i;
	40	}
	41
	42	setlocale(LC_CTYPE, "C");
	43
	44	return n;
facd762c	45	} else if (codepage == CS_NONE) {
	46	int n = 0;
	47
	48	while (mblen > 0) {
	49	wcstr[n] = 0xD800 | (mbstr[0] & 0xFF);
	50	n++;
	51	mbstr++;
	52	mblen--;
	53	}
	54
	55	return n;
2dc6356a	56	} else
	57	return charset_to_unicode(&mbstr, &mblen, wcstr, wclen, codepage,
	58	NULL, NULL, 0);
e6346999	59	}
	60
	61	int wc_to_mb(int codepage, int flags, wchar_t *wcstr, int wclen,
21d2b241	62	char *mbstr, int mblen, char *defchr, int *defused,
21d2b241	63	struct unicode_data *ucsdata)
e6346999	64	{
2dc6356a	65	/* FIXME: we should remove the defused param completely... */
e6346999	66	if (defused)
e6346999	67	*defused = 0;
2dc6356a	68
	69	if (codepage == DEFAULT_CODEPAGE) {
	70	char output[MB_LEN_MAX];
	71	mbstate_t state = { 0 };
	72	int n = 0;
	73
	74	setlocale(LC_CTYPE, "");
	75
	76	while (wclen > 0) {
	77	int i = wcrtomb(output, wcstr[0], &state);
	78	if (i == (size_t)-1 || i > n - mblen)
	79	break;
	80	memcpy(mbstr+n, output, i);
	81	n += i;
	82	wcstr++;
	83	wclen--;
	84	}
	85
	86	setlocale(LC_CTYPE, "C");
	87
	88	return n;
facd762c	89	} else if (codepage == CS_NONE) {
	90	int n = 0;
	91	while (wclen > 0 && n < mblen) {
	92	if (*wcstr >= 0xD800 && *wcstr < 0xD900)
	93	mbstr[n++] = (*wcstr & 0xFF);
	94	else if (defchr)
	95	mbstr[n++] = *defchr;
	96	wcstr++;
	97	wclen--;
	98	}
	99	return n;
	100	} else {
2dc6356a	101	return charset_from_unicode(&wcstr, &wclen, mbstr, mblen, codepage,
0f993689	102	NULL, defchr?defchr:NULL, defchr?1:0);
facd762c	103	}
1709795f	104	}
1709795f	105
085f4a68	106	/*
	107	* Return value is TRUE if pterm is to run in direct-to-font mode.
	108	*/
21d2b241	109	int init_ucs(struct unicode_data *ucsdata,
3900c2d6	110	char *linecharset, int font_charset, int vtmode)
1709795f	111	{
085f4a68	112	int i, ret = 0;
2dc6356a	113
	114	/*
	115	* In the platform-independent parts of the code, font_codepage
	116	* is used only for system DBCS support - which we don't
	117	* support at all. So we set this to something which will never
	118	* be used.
	119	*/
21d2b241	120	ucsdata->font_codepage = -1;
2dc6356a	121
	122	/*
	123	* line_codepage should be decoded from the specification in
	124	* cfg.
	125	*/
d4413bd2	126	ucsdata->line_codepage = decode_codepage(linecharset);
2dc6356a	127
facd762c	128	/*
	129	* If line_codepage is _still_ CS_NONE, we assume we're using
	130	* the font's own encoding. This has been passed in to us, so
	131	* we use that. If it's still CS_NONE after _that_ - i.e. the
	132	* font we were given had an incomprehensible charset - then we
	133	* fall back to using the D800 page.
	134	*/
21d2b241	135	if (ucsdata->line_codepage == CS_NONE)
21d2b241	136	ucsdata->line_codepage = font_charset;
2dc6356a	137
21d2b241	138	if (ucsdata->line_codepage == CS_NONE)
085f4a68	139	ret = 1;
085f4a68	140
2dc6356a	141	/*
	142	* Set up unitab_line, by translating each individual character
	143	* in the line codepage into Unicode.
	144	*/
	145	for (i = 0; i < 256; i++) {
	146	char c[1], *p;
	147	wchar_t wc[1];
	148	int len;
	149	c[0] = i;
	150	p = c;
	151	len = 1;
21d2b241	152	if (ucsdata->line_codepage == CS_NONE)
	153	ucsdata->unitab_line[i] = 0xD800 | i;
	154	else if (1 == charset_to_unicode(&p, &len, wc, 1,
	155	ucsdata->line_codepage,
facd762c	156	NULL, L"", 0))
21d2b241	157	ucsdata->unitab_line[i] = wc[0];
1709795f	158	else
21d2b241	159	ucsdata->unitab_line[i] = 0xFFFD;
2dc6356a	160	}
1709795f	161
2dc6356a	162	/*
	163	* Set up unitab_xterm. This is the same as unitab_line except
	164	* in the line-drawing regions, where it follows the Unicode
	165	* encoding.
	166	*
	167	* (Note that the strange X encoding of line-drawing characters
	168	* in the bottom 32 glyphs of ISO8859-1 fonts is taken care of
	169	* by the font encoding, which will spot such a font and act as
	170	* if it were in a variant encoding of ISO8859-1.)
	171	*/
1709795f	172	for (i = 0; i < 256; i++) {
2dc6356a	173	static const wchar_t unitab_xterm_std[32] = {
	174	0x2666, 0x2592, 0x2409, 0x240c, 0x240d, 0x240a, 0x00b0, 0x00b1,
	175	0x2424, 0x240b, 0x2518, 0x2510, 0x250c, 0x2514, 0x253c, 0x23ba,
	176	0x23bb, 0x2500, 0x23bc, 0x23bd, 0x251c, 0x2524, 0x2534, 0x252c,
	177	0x2502, 0x2264, 0x2265, 0x03c0, 0x2260, 0x00a3, 0x00b7, 0x0020
	178	};
3900c2d6	179	static const wchar_t unitab_xterm_poorman[32] =
	180	L"*#****o~**+++++-----++++|****L. ";
	181
	182	const wchar_t *ptr;
	183
	184	if (vtmode == VT_POORMAN)
	185	ptr = unitab_xterm_poorman;
	186	else
	187	ptr = unitab_xterm_std;
	188
2dc6356a	189	if (i >= 0x5F && i < 0x7F)
3900c2d6	190	ucsdata->unitab_xterm[i] = ptr[i & 0x1F];
2dc6356a	191	else
21d2b241	192	ucsdata->unitab_xterm[i] = ucsdata->unitab_line[i];
1709795f	193	}
2dc6356a	194
	195	/*
	196	* Set up unitab_scoacs. The SCO Alternate Character Set is
	197	* simply CP437.
	198	*/
	199	for (i = 0; i < 256; i++) {
	200	char c[1], *p;
	201	wchar_t wc[1];
	202	int len;
	203	c[0] = i;
	204	p = c;
	205	len = 1;
facd762c	206	if (1 == charset_to_unicode(&p, &len, wc, 1, CS_CP437, NULL, L"", 0))
21d2b241	207	ucsdata->unitab_scoacs[i] = wc[0];
2dc6356a	208	else
21d2b241	209	ucsdata->unitab_scoacs[i] = 0xFFFD;
2dc6356a	210	}
2dc6356a	211
facd762c	212	/*
	213	* Find the control characters in the line codepage. For
	214	* direct-to-font mode using the D800 hack, we assume 00-1F and
	215	* 7F are controls, but allow 80-9F through. (It's as good a
	216	* guess as anything; and my bet is that half the weird fonts
	217	* used in this way will be IBM or MS code pages anyway.)
	218	*/
	219	for (i = 0; i < 256; i++) {
21d2b241	220	int lineval = ucsdata->unitab_line[i];
facd762c	221	if (lineval < ' ' || (lineval >= 0x7F && lineval < 0xA0) ||
facd762c	222	(lineval >= 0xD800 && lineval < 0xD820) || (lineval == 0xD87F))
21d2b241	223	ucsdata->unitab_ctrl[i] = i;
2dc6356a	224	else
21d2b241	225	ucsdata->unitab_ctrl[i] = 0xFF;
facd762c	226	}
085f4a68	227
085f4a68	228	return ret;
126ce234	229	}
d4413bd2	230
	231	const char *cp_name(int codepage)
	232	{
	233	if (codepage == CS_NONE)
	234	return "Use font encoding";
	235	return charset_to_localenc(codepage);
	236	}
	237
	238	const char *cp_enumerate(int index)
	239	{
	240	int charset;
	241	if (index == 0)
	242	return "Use font encoding";
	243	charset = charset_localenc_nth(index-1);
	244	if (charset == CS_NONE)
	245	return NULL;
	246	return charset_to_localenc(charset);
	247	}
	248
	249	int decode_codepage(char *cp_name)
	250	{
	251	if (!*cp_name)
	252	return CS_NONE; /* use font encoding */
	253	return charset_from_localenc(cp_name);
	254	}