From facd762ca8b5eb6cad4171cb3deb2f3ef2957dc0 Mon Sep 17 00:00:00 2001
From: simon <simon@cda61777-01e9-0310-a592-d414129be87e>
Date: Wed, 1 Jan 2003 22:25:25 +0000
Subject: [PATCH] Proper support for using the font's own character encoding.
 If we know what that encoding actually is, we can do our best to support
 additional charsets (VT100 linedrawing, SCO ACS, UTF-8 mode) using the
 available characters; if we don't, we fall back to a mode where we disable
 all Unicode cut-and-paste and assume any Unicode character is undisplayable.

git-svn-id: svn://svn.tartarus.org/sgt/putty@2413 cda61777-01e9-0310-a592-d414129be87e
---
 putty.h      |   2 +-
 unix/pterm.1 |   4 ++
 unix/pterm.c | 119 ++++++++++++++++++++++++++++++++++++-----------------------
 unix/unix.h  |   5 +++
 unix/uxucs.c |  64 +++++++++++++++++++++++++-------
 winstuff.h   |   5 +++
 6 files changed, 138 insertions(+), 61 deletions(-)

diff --git a/putty.h b/putty.h
index 113d355e..c956d508 100644
--- a/putty.h
+++ b/putty.h
@@ -589,7 +589,7 @@ extern char ver[];
 #ifndef CP_UTF8
 #define CP_UTF8 65001
 #endif
-void init_ucs(void);
+/* init_ucs() now has a per-platform signature: see unix/unix.h, winstuff.h */
 int is_dbcs_leadbyte(int codepage, char byte);
 int mb_to_wc(int codepage, int flags, char *mbstr, int mblen,
 	     wchar_t *wcstr, int wclen);
diff --git a/unix/pterm.1 b/unix/pterm.1
index e08d033a..bbdb081a 100644
--- a/unix/pterm.1
+++ b/unix/pterm.1
@@ -110,6 +110,10 @@ supported by \fIpterm\fP) should be valid here (examples are
 encoding which is valid in an X logical font description should be
 valid ("ibm-cp437", for example).
 
+\fIpterm\fP's default behaviour is to use the same character
+encoding as its primary font. If you supply a Unicode (iso10646-1)
+font, it will default to the UTF-8 character set.
+
 Character set names are case-insensitive.
 .IP "\fB\-nethack\fP"
 Tells \fIpterm\fP to enable NetHack keypad mode, in which the
diff --git a/unix/pterm.c b/unix/pterm.c
index fa9fe1df..b7f93f9c 100644
--- a/unix/pterm.c
+++ b/unix/pterm.c
@@ -836,18 +836,27 @@ gint key_event(GtkWidget *widget, GdkEventKey *event, gpointer data)
 	printf("\n");
 #endif
 
-	/*
-	 * The stuff we've just generated is assumed to be
-	 * ISO-8859-1! This sounds insane, but `man XLookupString'
-	 * agrees: strings of this type returned from the X server
-	 * are hardcoded to 8859-1. Strictly speaking we should be
-	 * doing this using some sort of GtkIMContext, which (if
-	 * we're lucky) would give us our data directly in Unicode;
-	 * but that's not supported in GTK 1.2 as far as I can
-	 * tell, and it's poorly documented even in 2.0, so it'll
-	 * have to wait.
-	 */
-	lpage_send(inst->ldisc, CS_ISO8859_1, output+start, end-start, 1);
+	if (inst->fontinfo[0].charset != CS_NONE) {
+	    /*
+	     * The stuff we've just generated is assumed to be
+	     * ISO-8859-1! This sounds insane, but `man
+	     * XLookupString' agrees: strings of this type returned
+	     * from the X server are hardcoded to 8859-1. Strictly
+	     * speaking we should be doing this using some sort of
+	     * GtkIMContext, which (if we're lucky) would give us
+	     * our data directly in Unicode; but that's not
+	     * supported in GTK 1.2 as far as I can tell, and it's
+	     * poorly documented even in 2.0, so it'll have to
+	     * wait.
+	     */
+	    lpage_send(inst->ldisc, CS_ISO8859_1, output+start, end-start, 1);
+	} else {
+	    /*
+	     * In direct-to-font mode, we just send the string
+	     * exactly as we received it.
+	     */
+	    ldisc_send(inst->ldisc, output+start, end-start, 1);
+	}
 
 	show_mouseptr(inst, 0);
 	term_seen_key_event(inst->term);
@@ -1218,17 +1227,25 @@ void write_clip(void *frontend, wchar_t * data, int len, int must_deselect)
     if (inst->pasteout_data_utf8)
 	sfree(inst->pasteout_data_utf8);
 
-    inst->pasteout_data_utf8 = smalloc(len*6);
-    inst->pasteout_data_utf8_len = len*6;
-    {
+    /*
+     * Set up UTF-8 paste data. This only happens if we aren't in
+     * direct-to-font mode using the D800 hack.
+     */
+    if (inst->fontinfo[0].charset != CS_NONE) {
 	wchar_t *tmp = data;
 	int tmplen = len;
+
+	inst->pasteout_data_utf8 = smalloc(len*6);
+	inst->pasteout_data_utf8_len = len*6;
 	inst->pasteout_data_utf8_len =
 	    charset_from_unicode(&tmp, &tmplen, inst->pasteout_data_utf8,
 				 inst->pasteout_data_utf8_len,
 				 CS_UTF8, NULL, NULL, 0);
 	inst->pasteout_data_utf8 =
 	    srealloc(inst->pasteout_data_utf8, inst->pasteout_data_utf8_len);
+    } else {
+	inst->pasteout_data_utf8 = NULL;
+	inst->pasteout_data_utf8_len = 0;
     }
 
     inst->pasteout_data = smalloc(len);
@@ -1243,8 +1260,9 @@ void write_clip(void *frontend, wchar_t * data, int len, int must_deselect)
 				 GDK_SELECTION_TYPE_STRING, 1);
 	gtk_selection_add_target(inst->area, GDK_SELECTION_PRIMARY,
 				 inst->compound_text_atom, 1);
-	gtk_selection_add_target(inst->area, GDK_SELECTION_PRIMARY,
-				 inst->utf8_string_atom, 1);
+	if (inst->pasteout_data_utf8)
+	    gtk_selection_add_target(inst->area, GDK_SELECTION_PRIMARY,
+				     inst->utf8_string_atom, 1);
     }
 }
 
@@ -1286,15 +1304,24 @@ void request_paste(void *frontend)
      * comes back _then_ we can call term_do_paste().
      */
 
-    /*
-     * First we attempt to retrieve the selection as a UTF-8 string
-     * (which we will convert to the correct code page before
-     * sending to the session, of course). If that fails,
-     * selection_received() will be informed and will fall back to
-     * an ordinary string.
-     */
-    gtk_selection_convert(inst->area, GDK_SELECTION_PRIMARY,
-			  inst->utf8_string_atom, GDK_CURRENT_TIME);
+    if (inst->fontinfo[0].charset != CS_NONE) {
+	/*
+	 * First we attempt to retrieve the selection as a UTF-8
+	 * string (which we will convert to the correct code page
+	 * before sending to the session, of course). If that
+	 * fails, selection_received() will be informed and will
+	 * fall back to an ordinary string.
+	 */
+	gtk_selection_convert(inst->area, GDK_SELECTION_PRIMARY,
+			      inst->utf8_string_atom, GDK_CURRENT_TIME);
+    } else {
+	/*
+	 * If we're in direct-to-font mode, we disable UTF-8
+	 * pasting, and go straight to ordinary string data.
+	 */
+	gtk_selection_convert(inst->area, GDK_SELECTION_PRIMARY,
+			      GDK_SELECTION_TYPE_STRING, GDK_CURRENT_TIME);
+    }
 }
 
 gint idle_paste_func(gpointer data);   /* forward ref */
@@ -1562,12 +1589,9 @@ void do_text_internal(Context ctx, int x, int y, char *text, int len,
 			     gwcs, len*2);
 	    sfree(gwcs);
 	} else {
-	    wchar_t *wcstmp = wcs;
-	    int lentmp = len;
 	    gcs = smalloc(sizeof(GdkWChar) * (len+1));
-	    charset_from_unicode(&wcstmp, &lentmp, gcs, len,
-				 inst->fontinfo[fontid].charset,
-				 NULL, ".", 1);
+	    wc_to_mb(inst->fontinfo[fontid].charset, 0,
+		     wcs, len, gcs, len, ".", NULL);
 	    gdk_draw_text(inst->pixmap, inst->fonts[fontid], gc,
 			  x*inst->font_width+cfg.window_border,
 			  y*inst->font_height+cfg.window_border+inst->fonts[0]->ascent,
@@ -2101,13 +2125,20 @@ static void block_signal(int sig, int block_it) {
   }
 }
 
-static void set_font_info(struct gui_data *inst, int fontid)
+/*
+ * Record the charset of font `fontid' in inst->fontinfo[], and return
+ * it without the X11 line-drawing hack applied (CS_UTF8 for iso10646-1,
+ * CS_NONE if unrecognised) so it can serve as the default line codepage.
+ */
+static int set_font_info(struct gui_data *inst, int fontid)
 {
     GdkFont *font = inst->fonts[fontid];
     XFontStruct *xfs = GDK_FONT_XFONT(font);
     Display *disp = GDK_FONT_XDISPLAY(font);
     Atom charset_registry, charset_encoding;
     unsigned long registry_ret, encoding_ret;
+    int retval = CS_NONE;
+
     charset_registry = XInternAtom(disp, "CHARSET_REGISTRY", False);
     charset_encoding = XInternAtom(disp, "CHARSET_ENCODING", False);
     inst->fontinfo[fontid].charset = CS_NONE;
@@ -2119,10 +2150,13 @@ static void set_font_info(struct gui_data *inst, int fontid)
 	enc = XGetAtomName(disp, (Atom)encoding_ret);
 	if (reg && enc) {
 	    char *encoding = dupcat(reg, "-", enc, NULL);
-	    inst->fontinfo[fontid].charset = charset_from_xenc(encoding);
+	    retval = inst->fontinfo[fontid].charset =
+		charset_from_xenc(encoding);
 	    /* FIXME: when libcharset supports wide encodings fix this. */
-	    if (!strcasecmp(encoding, "iso10646-1"))
+	    if (!strcasecmp(encoding, "iso10646-1")) {
 		inst->fontinfo[fontid].is_wide = 1;
+		retval = CS_UTF8;
+	    }
 
 	    /*
 	     * Hack for X line-drawing characters: if the primary
@@ -2148,19 +2182,11 @@ static void set_font_info(struct gui_data *inst, int fontid)
 		    inst->fontinfo[fontid].charset = CS_ISO8859_1_X11;
 	    }
 
-	    /*
-	     * FIXME: this is a hack. Currently fonts with
-	     * incomprehensible encodings are dealt with by
-	     * pretending they're 8859-1. It's ugly, but it's good
-	     * enough to stop things crashing. Should do something
-	     * better here.
-	     */
-	    if (inst->fontinfo[fontid].charset == CS_NONE)
-		inst->fontinfo[fontid].charset = CS_ISO8859_1;
-
 	    sfree(encoding);
 	}
     }
+
+    return retval;
 }
 
 int main(int argc, char **argv)
@@ -2168,6 +2194,7 @@ int main(int argc, char **argv)
     extern int pty_master_fd;	       /* declared in pty.c */
     extern void pty_pre_init(void);    /* declared in pty.c */
     struct gui_data *inst;
+    int font_charset;
 
     /* defer any child exit handling until we're ready to deal with
      * it */
@@ -2195,7 +2222,7 @@ int main(int argc, char **argv)
 	fprintf(stderr, "pterm: unable to load font \"%s\"\n", cfg.font);
 	exit(1);
     }
-    set_font_info(inst, 0);
+    font_charset = set_font_info(inst, 0);
     if (cfg.boldfont[0]) {
 	inst->fonts[1] = gdk_font_load(cfg.boldfont);
 	if (!inst->fonts[1]) {
@@ -2233,7 +2260,7 @@ int main(int argc, char **argv)
     inst->compound_text_atom = gdk_atom_intern("COMPOUND_TEXT", FALSE);
     inst->utf8_string_atom = gdk_atom_intern("UTF8_STRING", FALSE);
 
-    init_ucs();
+    init_ucs(font_charset);
 
     inst->window = gtk_window_new(GTK_WINDOW_TOPLEVEL);
 
diff --git a/unix/unix.h b/unix/unix.h
index 9aa044d1..ab5dc88c 100644
--- a/unix/unix.h
+++ b/unix/unix.h
@@ -66,4 +66,9 @@ int next_socket(int *state, int *rwx);
 /* BSD-semantics version of signal() */
 void (*putty_signal(int sig, void (*func)(int)))(int);
 
+/*
+ * Exports from uxucs.c.
+ */
+void init_ucs(int font_charset);
+
 #endif
diff --git a/unix/uxucs.c b/unix/uxucs.c
index 2bf65a8a..928acae9 100644
--- a/unix/uxucs.c
+++ b/unix/uxucs.c
@@ -41,6 +41,17 @@ int mb_to_wc(int codepage, int flags, char *mbstr, int mblen,
 	setlocale(LC_CTYPE, "C");
 
 	return n;
+    } else if (codepage == CS_NONE) {
+	int n = 0;
+
+	while (mblen > 0) {
+	    wcstr[n] = 0xD800 | (mbstr[0] & 0xFF);
+	    n++;
+	    mbstr++;
+	    mblen--;
+	}
+
+	return n;
     } else
 	return charset_to_unicode(&mbstr, &mblen, wcstr, wclen, codepage,
 				  NULL, NULL, 0);
@@ -73,12 +84,24 @@ int wc_to_mb(int codepage, int flags, wchar_t *wcstr, int wclen,
 	setlocale(LC_CTYPE, "C");
 
 	return n;
-    } else
+    } else if (codepage == CS_NONE) {
+	int n = 0;
+	while (wclen > 0 && n < mblen) {
+	    if (*wcstr >= 0xD800 && *wcstr < 0xD900)
+		mbstr[n++] = (*wcstr & 0xFF);
+	    else if (defchr)
+		mbstr[n++] = *defchr;
+	    wcstr++;
+	    wclen--;
+	}
+	return n;
+    } else {
 	return charset_from_unicode(&wcstr, &wclen, mbstr, mblen, codepage,
 				    NULL, NULL, 0);
+    }
 }
 
-void init_ucs(void)
+void init_ucs(int font_charset)
 {
     int i;
 
@@ -97,14 +120,16 @@ void init_ucs(void)
     line_codepage = charset_from_mimeenc(cfg.line_codepage);
     if (line_codepage == CS_NONE)
 	line_codepage = charset_from_xenc(cfg.line_codepage);
-    /* If it's still CS_NONE, we should assume direct-to-font. */
 
-    /* FIXME: this is a hack. Currently fonts with incomprehensible
-     * encodings are dealt with by pretending they're 8859-1. It's
-     * ugly, but it's good enough to stop things crashing. Should do
-     * something better here. */
+    /*
+     * If line_codepage is _still_ CS_NONE, we assume we're using
+     * the font's own encoding. This has been passed in to us, so
+     * we use that. If it's still CS_NONE after _that_ - i.e. the
+     * font we were given had an incomprehensible charset - then we
+     * fall back to using the D800 page.
+     */
     if (line_codepage == CS_NONE)
-	line_codepage = CS_ISO8859_1;
+	line_codepage = font_charset;
 
     /*
      * Set up unitab_line, by translating each individual character
@@ -117,7 +142,10 @@ void init_ucs(void)
 	c[0] = i;
 	p = c;
 	len = 1;
-	if (1 == charset_to_unicode(&p,&len,wc,1,line_codepage,NULL,L"",0))
+	if (line_codepage == CS_NONE)
+	    unitab_line[i] = 0xD800 | i;
+	else if (1 == charset_to_unicode(&p, &len, wc, 1, line_codepage,
+					 NULL, L"", 0))
 	    unitab_line[i] = wc[0];
 	else
 	    unitab_line[i] = 0xFFFD;
@@ -157,17 +185,25 @@ void init_ucs(void)
 	c[0] = i;
 	p = c;
 	len = 1;
-	if (1 == charset_to_unicode(&p,&len,wc,1,CS_CP437,NULL,L"",0))
+	if (1 == charset_to_unicode(&p, &len, wc, 1, CS_CP437, NULL, L"", 0))
 	    unitab_scoacs[i] = wc[0];
 	else
 	    unitab_scoacs[i] = 0xFFFD;
     }
 
-    /* Find the line control characters. */
-    for (i = 0; i < 256; i++)
-	if (unitab_line[i] < ' '
-	    || (unitab_line[i] >= 0x7F && unitab_line[i] < 0xA0))
+    /*
+     * Find the control characters in the line codepage. For
+     * direct-to-font mode using the D800 hack, we assume 00-1F and
+     * 7F are controls, but allow 80-9F through. (It's as good a
+     * guess as anything; and my bet is that half the weird fonts
+     * used in this way will be IBM or MS code pages anyway.)
+     */
+    for (i = 0; i < 256; i++) {
+	int lineval = unitab_line[i];
+	if (lineval < ' ' || (lineval >= 0x7F && lineval < 0xA0) ||
+	    (lineval >= 0xD800 && lineval < 0xD820) || (lineval == 0xD87F))
 	    unitab_ctrl[i] = i;
 	else
 	    unitab_ctrl[i] = 0xFF;
+    }
 }
diff --git a/winstuff.h b/winstuff.h
index 06fb9440..e2d700ee 100644
--- a/winstuff.h
+++ b/winstuff.h
@@ -189,4 +189,9 @@ void force_normal(HWND hwnd);
 void UpdateSizeTip(HWND src, int cx, int cy);
 void EnableSizeTip(int bEnable);
 
+/*
+ * Exports from unicode.c.
+ */
+void init_ucs(void);
+
 #endif
-- 
2.11.0