[sgt/charset] / superset.c

/*
 * superset.c: deal with character sets which are supersets of
 * others.
 */

#include "charset.h"

/*
 * Just in case it's ever useful again, this rather simplistic
 * piece of Perl/sh analyses sbcs.dat and determines which pairs of
 * character sets are identical in the A0-FF region. This doesn't
 * prove supersethood, but it spots obvious cases.

perl -ne '/^[^ ]{4} / and defined ($line) and $line < 16 and do {' \
      -e '  chomp; print " $_" if $line>=10; print "\n" if ++$line==16; };' \
      -e '/^charset (.*)$/ and do { $line = 0; printf "%30s:", $1; };' \
  sbcs.dat | sort +1 | uniq -f1 -D

 * When run on sbcs.dat rev 1.3, it reports only two sets of matches:
 * 
 *  - ISO8859_1, ISO8859_1_X11 and CP1252 all match.
 *  - ISO8859_4 and CP1254 match.
 * 
 * FIXME: There is more to it than this, and in particular there's
 * even more to it than simple subsethood. Look at CP1255 and
 * ISO8859_8: they match at every code point defined in both, but
 * they each define at least one code point the other doesn't. It
 * isn't clear how I should handle this. The right thing might be
 * to define yet another SBCS which is the union of both, and
 * upgrade both to that. Or it might be that the unicode.org
 * mapping table for CP1255 is simply out of date, and the mapping
 * ISO8859_8 has which it doesn't (DF -> U+2017 DOUBLE LOW LINE)
 * should be present in it too, which would make it a proper
 * superset of ISO8859_8 and solve the problem.
 * 
 * However, for the moment I'm satisfied with enhancing this table
 * as and when necessary; the idea is not to include _all_ superset
 * relations here, the idea is to spot charset IDs which are used
 * _in practice_ to mean other charset IDs. So unless and until I
 * find out that there really is confusion between ISO8859_8 and
 * CP1255, I don't need to do anything about it here.
 */

int charset_upgrade(int charset)
{
    if (charset == CS_ASCII || charset == CS_ISO8859_1)
	charset = CS_CP1252;
    if (charset == CS_ISO8859_4)
	charset = CS_CP1254;
    if (charset == CS_EUC_KR)
	charset = CS_CP949;
    return charset;
}

/*
 * This function returns TRUE if the input charset is a vaguely
 * sensible superset of ASCII. That is, it returns FALSE for 7-bit
 * encoding formats such as HZ and UTF-7.
 */
int charset_contains_ascii(int charset)
{
    return (charset != CS_HZ &&
	    charset != CS_UTF7 &&
	    charset != CS_UTF7_CONSERVATIVE);
}
Commit	Line	Data
c6d25d8d	1	/*
	2	* superset.c: deal with character sets which are supersets of
	3	* others.
	4	*/
	5
	6	#include "charset.h"
	7
	8	/*
	9	* Just in case it's ever useful again, this rather simplistic
	10	* piece of Perl/sh analyses sbcs.dat and determines which pairs of
	11	* character sets are identical in the A0-FF region. This doesn't
	12	* prove supersethood, but it spots obvious cases.
	13
	14	perl -ne '/^[^ ]{4} / and defined ($line) and $line < 16 and do {' \
	15	-e ' chomp; print " $_" if $line>=10; print "\n" if ++$line==16; };' \
	16	-e '/^charset (.*)$/ and do { $line = 0; printf "%30s:", $1; };' \
	17	sbcs.dat \| sort +1 \| uniq -f1 -D
	18
	19	* When run on sbcs.dat rev 1.3, it reports only two sets of matches:
	20	*
	21	* - ISO8859_1, ISO8859_1_X11 and CP1252 all match.
	22	* - ISO8859_4 and CP1254 match.
	23	*
	24	* FIXME: There is more to it than this, and in particular there's
	25	* even more to it than simple subsethood. Look at CP1255 and
	26	* ISO8859_8: they match at every code point defined in both, but
	27	* they each define at least one code point the other doesn't. It
	28	* isn't clear how I should handle this. The right thing might be
	29	* to define yet another SBCS which is the union of both, and
	30	* upgrade both to that. Or it might be that the unicode.org
	31	* mapping table for CP1255 is simply out of date, and the mapping
	32	* ISO8859_8 has which it doesn't (DF -> U+2017 DOUBLE LOW LINE)
	33	* should be present in it too, which would make it a proper
	34	* superset of ISO8859_8 and solve the problem.
	35	*
	36	* However, for the moment I'm satisfied with enhancing this table
	37	* as and when necessary; the idea is not to include _all_ superset
	38	* relations here, the idea is to spot charset IDs which are used
	39	* _in practice_ to mean other charset IDs. So unless and until I
	40	* find out that there really is confusion between ISO8859_8 and
	41	* CP1255, I don't need to do anything about it here.
	42	*/
	43
	44	int charset_upgrade(int charset)
	45	{
	46	if (charset == CS_ASCII \|\| charset == CS_ISO8859_1)
	47	charset = CS_CP1252;
	48	if (charset == CS_ISO8859_4)
	49	charset = CS_CP1254;
	50	if (charset == CS_EUC_KR)
	51	charset = CS_CP949;
	52	return charset;
	53	}
	54
	55	/*
	56	* This function returns TRUE if the input charset is a vaguely
	57	* sensible superset of ASCII. That is, it returns FALSE for 7-bit
	58	* encoding formats such as HZ and UTF-7.
	59	*/
	60	int charset_contains_ascii(int charset)
	61	{
	62	return (charset != CS_HZ &&
	63	charset != CS_UTF7 &&
	64	charset != CS_UTF7_CONSERVATIVE);
65	}