X-Git-Url: https://git.distorted.org.uk/~mdw/sgt/charset/blobdiff_plain/8bade1133caa9fbcecdeed2608dcbb4918180a62..c6cef4fab2c8d40a6256ce9f0c231a97a9f236e4:/iso2022s.c

diff --git a/iso2022s.c b/iso2022s.c
index 3b01529..e48c885 100644
--- a/iso2022s.c
+++ b/iso2022s.c
@@ -1,9 +1,5 @@
 /*
  * iso2022s.c - support for ISO-2022 subset encodings.
- * 
- * (The `s' suffix on the filename is there to leave `iso2022.c'
- * free for the unlikely event that I ever attempt to implement
- * _full_ ISO-2022 in this library!)
  */
 
 #ifndef ENUM_CHARSETS
@@ -450,12 +446,13 @@ static int write_iso2022s(charset_spec const *charset, long int input_chr,
 static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes)
 {
     switch (subcharset) {
+      case 1:			       /* JIS X 0201 bottom half */
+	if (bytes == 0x5C)
+	    return 0xA5;
+	else if (bytes == 0x7E)
+	    return 0x203E;
+	/* else fall through to ASCII */
       case 0: return bytes;	       /* one-byte ASCII */
-      case 1:			       /* JIS X 0201 half-width katakana */
-	if (bytes >= 0x21 && bytes <= 0x5F)
-	    return bytes + (0xFF61 - 0x21);
-	else
-	    return ERROR;
 	/* (no break needed since all control paths have returned) */
       case 2: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
 					 ((bytes     ) & 0xFF) - 0x21);
@@ -470,9 +467,9 @@ static int iso2022jp_from_ucs(long int ucs, int *subcharset,
 	*subcharset = 0;
 	*bytes = ucs;
 	return 1;
-    } else if (ucs >= 0xFF61 && ucs <= 0xFF9F) {
+    } else if (ucs == 0xA5 || ucs == 0x203E) {
 	*subcharset = 1;
-	*bytes = ucs - (0xFF61 - 0x21);
+	*bytes = (ucs == 0xA5 ? 0x5C : 0x7E);
 	return 1;
     } else if (unicode_to_jisx0208(ucs, &r, &c)) {
 	*subcharset = 2;
@@ -713,6 +710,33 @@ static const struct iso2022_escape ctext_escapes[] = {
     SEQ("\033-H", 1, CTEXT_ISO8859_8),
     SEQ("\033-L", 1, CTEXT_ISO8859_5),
     SEQ("\033-M", 1, CTEXT_ISO8859_9),
+
+    /*
+     * Cross-testing against Xutf8TextListToTextProperty() turns up
+     * some additional character sets and ISO 2022 features
+     * supported by that and not by us:
+     * 
+     * 	- Single-byte right-hand-half character sets `ESC - f',
+     * 	  `ESC - T' and `ESC - Y'.
+     * 
+     * 	- A really horrifying mechanism used to escape completely
+     * 	  from the ISO 2022 framework: ESC % / <length>
+     * 	  <charset-name> <text>. Xutf8* uses this to encode
+     * 	  "iso8859-14", "iso8859-15" and "big5-0".
+     * 	   * This mechanism is particularly nasty because we can't
+     * 	     efficiently encode it on the fly! It requires the
+     * 	     length of the text encoded in the foreign charset to
+     * 	     be given _before_ the text itself. Since we receive
+     * 	     one character at a time we can't look ahead, so we
+     * 	     would have to wrap each individual character in a
+     * 	     separate one of these sequences.
+     * 
+     * 	- ESC % G and ESC % @ to shift to and from UTF-8 mode, as a
+     * 	  last resort for anything we still don't support.
+     * 	   * Interestingly, ctext.ps actually _disallows_ this: it
+     * 	     says that the above extension mechanism is the only
+     * 	     one permitted. Ho hum.
+     */
 };
 static const struct iso2022 ctext = {
     ctext_escapes, lenof(ctext_escapes),