[sgt/putty] / unix / uxucs.c

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <locale.h>
#include <limits.h>
#include <wchar.h>

#include <time.h>

#include "putty.h"
#include "charset.h"
#include "terminal.h"
#include "misc.h"

/*
 * Unix Unicode-handling routines.
 */

/*
 * Report whether `byte' is a DBCS lead byte in `codepage'.
 *
 * The Unix port has no DBCS support, so both arguments are ignored
 * and the answer is always 0 ("not a lead byte").
 */
int is_dbcs_leadbyte(int codepage, char byte)
{
    (void)codepage;
    (void)byte;

    return 0;
}

/*
 * Convert the multibyte string mbstr (mblen bytes) into wide
 * characters stored in wcstr, which has room for wclen elements.
 *
 *  - DEFAULT_CODEPAGE: decode using the user's locale, via mbrtowc().
 *    LC_CTYPE is switched to the environment's locale for the
 *    duration of the conversion and set back to "C" afterwards.
 *  - CS_NONE: map each input byte onto the D800 page (0xD800 | byte).
 *  - anything else: hand the job to charset_to_unicode().
 *
 * `flags' is not used by this implementation.
 *
 * For the first two cases the return value is the number of wide
 * characters written; conversion stops early at the first invalid or
 * incomplete multibyte sequence, or when wcstr is full. Otherwise the
 * return value is whatever charset_to_unicode() returns.
 */
int mb_to_wc(int codepage, int flags, char *mbstr, int mblen,
	     wchar_t *wcstr, int wclen)
{
    if (codepage == DEFAULT_CODEPAGE) {
	int n = 0;
	mbstate_t state = { 0 };

	setlocale(LC_CTYPE, "");

	/* Never write more than wclen wide characters. */
	while (mblen > 0 && n < wclen) {
	    size_t i = mbrtowc(wcstr+n, mbstr, (size_t)mblen, &state);
	    if (i == (size_t)-1 || i == (size_t)-2)
		break;
	    /*
	     * mbrtowc() returns 0, not the byte count, when it has
	     * just decoded a NUL. Step over that byte by hand, or
	     * we would spin on it forever and run off the end of
	     * wcstr.
	     */
	    if (i == 0)
		i = 1;
	    n++;
	    mbstr += i;
	    mblen -= (int)i;
	}

	setlocale(LC_CTYPE, "C");

	return n;
    } else if (codepage == CS_NONE) {
	int n = 0;

	/* Direct-to-font: one output character per input byte. */
	while (mblen > 0 && n < wclen) {
	    wcstr[n] = 0xD800 | (mbstr[0] & 0xFF);
	    n++;
	    mbstr++;
	    mblen--;
	}

	return n;
    } else
	return charset_to_unicode(&mbstr, &mblen, wcstr, wclen, codepage,
				  NULL, NULL, 0);
}

/*
 * Convert the wide string wcstr (wclen characters) into a multibyte
 * string stored in mbstr, which has room for mblen bytes.
 *
 *  - DEFAULT_CODEPAGE: encode using the user's locale, via wcrtomb().
 *    LC_CTYPE is switched to the environment's locale for the
 *    duration of the conversion and set back to "C" afterwards.
 *    Conversion stops at the first character that cannot be encoded
 *    or that would not fit in the remaining output space.
 *  - CS_NONE: characters in the D800 page (0xD800..0xD8FF) become
 *    their low byte; any other character becomes *defchr if defchr is
 *    non-NULL, and is silently dropped otherwise.
 *  - anything else: hand the job to charset_from_unicode().
 *
 * `flags' and `ucsdata' are not used by this implementation. If
 * `defused' is non-NULL, *defused is always set to 0.
 *
 * For the first two cases the return value is the number of bytes
 * written to mbstr; otherwise it is whatever charset_from_unicode()
 * returns.
 */
int wc_to_mb(int codepage, int flags, wchar_t *wcstr, int wclen,
	     char *mbstr, int mblen, char *defchr, int *defused,
	     struct unicode_data *ucsdata)
{
    /* FIXME: we should remove the defused param completely... */
    if (defused)
	*defused = 0;

    if (codepage == DEFAULT_CODEPAGE) {
	char output[MB_LEN_MAX];
	mbstate_t state = { 0 };
	int n = 0;

	setlocale(LC_CTYPE, "");

	while (wclen > 0) {
	    /* Encode into a scratch buffer first so we can check fit. */
	    size_t i = wcrtomb(output, wcstr[0], &state);
	    int room = mblen - n;      /* bytes still free in mbstr */

	    if (i == (size_t)-1 || room < 0 || i > (size_t)room)
		break;
	    memcpy(mbstr+n, output, i);
	    n += (int)i;
	    wcstr++;
	    wclen--;
	}

	setlocale(LC_CTYPE, "C");

	return n;
    } else if (codepage == CS_NONE) {
	int n = 0;
	while (wclen > 0 && n < mblen) {
	    if (*wcstr >= 0xD800 && *wcstr < 0xD900)
		mbstr[n++] = (*wcstr & 0xFF);
	    else if (defchr)
		mbstr[n++] = *defchr;
	    wcstr++;
	    wclen--;
	}
	return n;
    } else {
	return charset_from_unicode(&wcstr, &wclen, mbstr, mblen, codepage,
				    NULL, defchr, defchr?1:0);
    }
}

/*
 * Helper for init_ucs: translate the single byte `byte' from
 * `charset' into Unicode using charset_to_unicode(). Yields U+FFFD
 * unless exactly one wide character comes back.
 */
static wchar_t ucs_translate_byte(int charset, int byte)
{
    char inbuf[1], *inptr = inbuf;
    wchar_t outbuf[1];
    int inlen = 1;

    inbuf[0] = byte;
    if (charset_to_unicode(&inptr, &inlen, outbuf, 1, charset,
			   NULL, L"", 0) == 1)
	return outbuf[0];
    return 0xFFFD;
}

/*
 * Fill in the translation tables in *ucsdata from the configured
 * line character set `linecharset' and the font's own charset
 * `font_charset'.
 *
 * Return value is TRUE if pterm is to run in direct-to-font mode.
 */
int init_ucs(struct unicode_data *ucsdata, 
	     char *linecharset, int font_charset)
{
    /*
     * Unicode equivalents of the 32 line-drawing glyphs, indexed by
     * the low five bits of the character code.
     */
    static const wchar_t unitab_xterm_std[32] = {
	0x2666, 0x2592, 0x2409, 0x240c, 0x240d, 0x240a, 0x00b0, 0x00b1,
	0x2424, 0x240b, 0x2518, 0x2510, 0x250c, 0x2514, 0x253c, 0x23ba,
	0x23bb, 0x2500, 0x23bc, 0x23bd, 0x251c, 0x2524, 0x2534, 0x252c,
	0x2502, 0x2264, 0x2265, 0x03c0, 0x2260, 0x00a3, 0x00b7, 0x0020
    };
    int i, direct_to_font;

    /*
     * font_codepage only matters to the platform-independent code
     * for system DBCS support, which we do not provide. Give it a
     * value that will never be used.
     */
    ucsdata->font_codepage = -1;

    /*
     * Decode the configured line character set. If that yields
     * CS_NONE, fall back to the font's own encoding, which was
     * passed in to us. If even that is CS_NONE - the font had an
     * incomprehensible charset - we end up in direct-to-font mode
     * using the D800 page.
     */
    ucsdata->line_codepage = decode_codepage(linecharset);
    if (ucsdata->line_codepage == CS_NONE)
	ucsdata->line_codepage = font_charset;
    direct_to_font = (ucsdata->line_codepage == CS_NONE);

    /*
     * unitab_line: every byte of the line codepage translated into
     * Unicode, or placed on the D800 page in direct-to-font mode.
     */
    for (i = 0; i < 256; i++) {
	if (ucsdata->line_codepage == CS_NONE)
	    ucsdata->unitab_line[i] = 0xD800 | i;
	else
	    ucsdata->unitab_line[i] =
		ucs_translate_byte(ucsdata->line_codepage, i);
    }

    /*
     * unitab_xterm: identical to unitab_line except in the
     * line-drawing region 0x5F-0x7E, which follows the Unicode
     * encoding.
     *
     * (The strange X encoding of line-drawing characters in the
     * bottom 32 glyphs of ISO8859-1 fonts is dealt with by the font
     * encoding, which spots such a font and treats it as a variant
     * encoding of ISO8859-1.)
     */
    for (i = 0; i < 256; i++) {
	int linedraw = (i >= 0x5F && i < 0x7F);
	ucsdata->unitab_xterm[i] = linedraw ? unitab_xterm_std[i & 0x1F]
					    : ucsdata->unitab_line[i];
    }

    /*
     * unitab_scoacs: the SCO Alternate Character Set is simply
     * CP437.
     */
    for (i = 0; i < 256; i++)
	ucsdata->unitab_scoacs[i] = ucs_translate_byte(CS_CP437, i);

    /*
     * unitab_ctrl: pick out the control characters of the line
     * codepage. In direct-to-font mode (the D800 hack) 00-1F and 7F
     * are taken to be controls while 80-9F are let through. (It's as
     * good a guess as anything; half the weird fonts used this way
     * are probably IBM or MS code pages anyway.)
     */
    for (i = 0; i < 256; i++) {
	int u = ucsdata->unitab_line[i];
	int plain_ctrl = (u < ' ' || (u >= 0x7F && u < 0xA0));
	int d800_ctrl = ((u >= 0xD800 && u < 0xD820) || u == 0xD87F);

	ucsdata->unitab_ctrl[i] = (plain_ctrl || d800_ctrl) ? i : 0xFF;
    }

    return direct_to_font;
}

/*
 * Return the display name for `codepage': the fixed string "Use font
 * encoding" for CS_NONE, otherwise whatever charset_to_localenc()
 * gives for it.
 */
const char *cp_name(int codepage)
{
    return (codepage == CS_NONE) ? "Use font encoding"
				 : charset_to_localenc(codepage);
}

/*
 * Enumerate the selectable code page names. Index 0 is always "Use
 * font encoding"; index k > 0 is the name of the charset returned by
 * charset_localenc_nth(k-1). Returns NULL once that function reports
 * CS_NONE.
 */
const char *cp_enumerate(int index)
{
    if (index != 0) {
	int cs = charset_localenc_nth(index-1);
	return (cs == CS_NONE) ? NULL : charset_to_localenc(cs);
    }
    return "Use font encoding";
}

/*
 * Turn a code page specification string into a charset identifier.
 *
 * An empty string - or a NULL pointer, which is treated the same way
 * rather than being dereferenced - yields CS_NONE, meaning "use the
 * font's encoding". Anything else is looked up with
 * charset_from_localenc().
 */
int decode_codepage(char *cp_name)
{
    if (!cp_name || !*cp_name)
	return CS_NONE;		       /* use font encoding */
    return charset_from_localenc(cp_name);
}
Commit	Line	Data
1709795f	1	#include <stdio.h>
	2	#include <stdlib.h>
	3	#include <ctype.h>
2dc6356a	4	#include <locale.h>
	5	#include <limits.h>
	6	#include <wchar.h>
1709795f	7
1709795f	8	#include <time.h>
2dc6356a	9
1709795f	10	#include "putty.h"
d4413bd2	11	#include "charset.h"
887035a5	12	#include "terminal.h"
1709795f	13	#include "misc.h"
	14
	15	/*
	16	* Unix Unicode-handling routines.
1709795f	17	*/
1709795f	18
1709795f	19	int is_dbcs_leadbyte(int codepage, char byte)
	20	{
	21	return 0; /* we don't do DBCS */
	22	}
	23
	24	int mb_to_wc(int codepage, int flags, char *mbstr, int mblen,
	25	wchar_t *wcstr, int wclen)
	26	{
2dc6356a	27	if (codepage == DEFAULT_CODEPAGE) {
	28	int n = 0;
	29	mbstate_t state = { 0 };
	30
	31	setlocale(LC_CTYPE, "");
	32
	33	while (mblen > 0) {
	34	size_t i = mbrtowc(wcstr+n, mbstr, (size_t)mblen, &state);
	35	if (i == (size_t)-1 \|\| i == (size_t)-2)
	36	break;
	37	n++;
	38	mbstr += i;
	39	mblen -= i;
	40	}
	41
	42	setlocale(LC_CTYPE, "C");
	43
	44	return n;
facd762c	45	} else if (codepage == CS_NONE) {
	46	int n = 0;
	47
	48	while (mblen > 0) {
	49	wcstr[n] = 0xD800 \| (mbstr[0] & 0xFF);
	50	n++;
	51	mbstr++;
	52	mblen--;
	53	}
	54
	55	return n;
2dc6356a	56	} else
	57	return charset_to_unicode(&mbstr, &mblen, wcstr, wclen, codepage,
	58	NULL, NULL, 0);
e6346999	59	}
	60
	61	int wc_to_mb(int codepage, int flags, wchar_t *wcstr, int wclen,
21d2b241	62	char *mbstr, int mblen, char *defchr, int *defused,
21d2b241	63	struct unicode_data *ucsdata)
e6346999	64	{
2dc6356a	65	/* FIXME: we should remove the defused param completely... */
e6346999	66	if (defused)
e6346999	67	*defused = 0;
2dc6356a	68
	69	if (codepage == DEFAULT_CODEPAGE) {
	70	char output[MB_LEN_MAX];
	71	mbstate_t state = { 0 };
	72	int n = 0;
	73
	74	setlocale(LC_CTYPE, "");
	75
	76	while (wclen > 0) {
	77	int i = wcrtomb(output, wcstr[0], &state);
	78	if (i == (size_t)-1 \|\| i > n - mblen)
	79	break;
	80	memcpy(mbstr+n, output, i);
	81	n += i;
	82	wcstr++;
	83	wclen--;
	84	}
	85
	86	setlocale(LC_CTYPE, "C");
	87
	88	return n;
facd762c	89	} else if (codepage == CS_NONE) {
	90	int n = 0;
	91	while (wclen > 0 && n < mblen) {
	92	if (*wcstr >= 0xD800 && *wcstr < 0xD900)
	93	mbstr[n++] = (*wcstr & 0xFF);
	94	else if (defchr)
	95	mbstr[n++] = *defchr;
	96	wcstr++;
	97	wclen--;
	98	}
	99	return n;
	100	} else {
2dc6356a	101	return charset_from_unicode(&wcstr, &wclen, mbstr, mblen, codepage,
0f993689	102	NULL, defchr?defchr:NULL, defchr?1:0);
facd762c	103	}
1709795f	104	}
1709795f	105
085f4a68	106	/*
	107	* Return value is TRUE if pterm is to run in direct-to-font mode.
	108	*/
21d2b241	109	int init_ucs(struct unicode_data *ucsdata,
21d2b241	110	char *linecharset, int font_charset)
1709795f	111	{
085f4a68	112	int i, ret = 0;
2dc6356a	113
	114	/*
	115	* In the platform-independent parts of the code, font_codepage
	116	* is used only for system DBCS support - which we don't
	117	* support at all. So we set this to something which will never
	118	* be used.
	119	*/
21d2b241	120	ucsdata->font_codepage = -1;
2dc6356a	121
	122	/*
	123	* line_codepage should be decoded from the specification in
	124	* cfg.
	125	*/
d4413bd2	126	ucsdata->line_codepage = decode_codepage(linecharset);
2dc6356a	127
facd762c	128	/*
	129	* If line_codepage is _still_ CS_NONE, we assume we're using
	130	* the font's own encoding. This has been passed in to us, so
	131	* we use that. If it's still CS_NONE after _that_ - i.e. the
	132	* font we were given had an incomprehensible charset - then we
	133	* fall back to using the D800 page.
	134	*/
21d2b241	135	if (ucsdata->line_codepage == CS_NONE)
21d2b241	136	ucsdata->line_codepage = font_charset;
2dc6356a	137
21d2b241	138	if (ucsdata->line_codepage == CS_NONE)
085f4a68	139	ret = 1;
085f4a68	140
2dc6356a	141	/*
	142	* Set up unitab_line, by translating each individual character
	143	* in the line codepage into Unicode.
	144	*/
	145	for (i = 0; i < 256; i++) {
	146	char c[1], *p;
	147	wchar_t wc[1];
	148	int len;
	149	c[0] = i;
	150	p = c;
	151	len = 1;
21d2b241	152	if (ucsdata->line_codepage == CS_NONE)
	153	ucsdata->unitab_line[i] = 0xD800 \| i;
	154	else if (1 == charset_to_unicode(&p, &len, wc, 1,
	155	ucsdata->line_codepage,
facd762c	156	NULL, L"", 0))
21d2b241	157	ucsdata->unitab_line[i] = wc[0];
1709795f	158	else
21d2b241	159	ucsdata->unitab_line[i] = 0xFFFD;
2dc6356a	160	}
1709795f	161
2dc6356a	162	/*
	163	* Set up unitab_xterm. This is the same as unitab_line except
	164	* in the line-drawing regions, where it follows the Unicode
	165	* encoding.
	166	*
	167	* (Note that the strange X encoding of line-drawing characters
	168	* in the bottom 32 glyphs of ISO8859-1 fonts is taken care of
	169	* by the font encoding, which will spot such a font and act as
	170	* if it were in a variant encoding of ISO8859-1.)
	171	*/
1709795f	172	for (i = 0; i < 256; i++) {
2dc6356a	173	static const wchar_t unitab_xterm_std[32] = {
	174	0x2666, 0x2592, 0x2409, 0x240c, 0x240d, 0x240a, 0x00b0, 0x00b1,
	175	0x2424, 0x240b, 0x2518, 0x2510, 0x250c, 0x2514, 0x253c, 0x23ba,
	176	0x23bb, 0x2500, 0x23bc, 0x23bd, 0x251c, 0x2524, 0x2534, 0x252c,
	177	0x2502, 0x2264, 0x2265, 0x03c0, 0x2260, 0x00a3, 0x00b7, 0x0020
	178	};
	179	if (i >= 0x5F && i < 0x7F)
21d2b241	180	ucsdata->unitab_xterm[i] = unitab_xterm_std[i & 0x1F];
2dc6356a	181	else
21d2b241	182	ucsdata->unitab_xterm[i] = ucsdata->unitab_line[i];
1709795f	183	}
2dc6356a	184
	185	/*
	186	* Set up unitab_scoacs. The SCO Alternate Character Set is
	187	* simply CP437.
	188	*/
	189	for (i = 0; i < 256; i++) {
	190	char c[1], *p;
	191	wchar_t wc[1];
	192	int len;
	193	c[0] = i;
	194	p = c;
	195	len = 1;
facd762c	196	if (1 == charset_to_unicode(&p, &len, wc, 1, CS_CP437, NULL, L"", 0))
21d2b241	197	ucsdata->unitab_scoacs[i] = wc[0];
2dc6356a	198	else
21d2b241	199	ucsdata->unitab_scoacs[i] = 0xFFFD;
2dc6356a	200	}
2dc6356a	201
facd762c	202	/*
	203	* Find the control characters in the line codepage. For
	204	* direct-to-font mode using the D800 hack, we assume 00-1F and
	205	* 7F are controls, but allow 80-9F through. (It's as good a
	206	* guess as anything; and my bet is that half the weird fonts
	207	* used in this way will be IBM or MS code pages anyway.)
	208	*/
	209	for (i = 0; i < 256; i++) {
21d2b241	210	int lineval = ucsdata->unitab_line[i];
facd762c	211	if (lineval < ' ' \|\| (lineval >= 0x7F && lineval < 0xA0) \|\|
facd762c	212	(lineval >= 0xD800 && lineval < 0xD820) \|\| (lineval == 0xD87F))
21d2b241	213	ucsdata->unitab_ctrl[i] = i;
2dc6356a	214	else
21d2b241	215	ucsdata->unitab_ctrl[i] = 0xFF;
facd762c	216	}
085f4a68	217
085f4a68	218	return ret;
126ce234	219	}
d4413bd2	220
	221	const char *cp_name(int codepage)
	222	{
	223	if (codepage == CS_NONE)
	224	return "Use font encoding";
	225	return charset_to_localenc(codepage);
	226	}
	227
	228	const char *cp_enumerate(int index)
	229	{
	230	int charset;
	231	if (index == 0)
	232	return "Use font encoding";
	233	charset = charset_localenc_nth(index-1);
	234	if (charset == CS_NONE)
	235	return NULL;
	236	return charset_to_localenc(charset);
	237	}
	238
	239	int decode_codepage(char *cp_name)
	240	{
	241	if (!*cp_name)
	242	return CS_NONE; /* use font encoding */
	243	return charset_from_localenc(cp_name);
	244	}