Silly of me to overlook it: another obvious way you might like to
[sgt/charset] / superset.c
CommitLineData
c6d25d8d 1/*
2 * superset.c: deal with character sets which are supersets of
3 * others.
4 */
5
6#include "charset.h"
7
8/*
9 * Just in case it's ever useful again, this rather simplistic
10 * piece of Perl/sh analyses sbcs.dat and determines which pairs of
11 * character sets are identical in the A0-FF region. This doesn't
12 * prove supersethood, but it spots obvious cases.
13
14perl -ne '/^[^ ]{4} / and defined ($line) and $line < 16 and do {' \
15 -e ' chomp; print " $_" if $line>=10; print "\n" if ++$line==16; };' \
16 -e '/^charset (.*)$/ and do { $line = 0; printf "%30s:", $1; };' \
17 sbcs.dat | sort +1 | uniq -f1 -D
18
19 * When run on sbcs.dat rev 1.3, it reports only two sets of matches:
20 *
21 * - ISO8859_1, ISO8859_1_X11 and CP1252 all match.
22 * - ISO8859_4 and CP1254 match.
23 *
24 * FIXME: There is more to it than this, and in particular there's
25 * even more to it than simple subsethood. Look at CP1255 and
26 * ISO8859_8: they match at every code point defined in both, but
27 * they each define at least one code point the other doesn't. It
28 * isn't clear how I should handle this. The right thing might be
29 * to define yet another SBCS which is the union of both, and
30 * upgrade both to that. Or it might be that the unicode.org
31 * mapping table for CP1255 is simply out of date, and the mapping
32 * ISO8859_8 has which it doesn't (DF -> U+2017 DOUBLE LOW LINE)
33 * should be present in it too, which would make it a proper
34 * superset of ISO8859_8 and solve the problem.
35 *
36 * However, for the moment I'm satisfied with enhancing this table
37 * as and when necessary; the idea is not to include _all_ superset
38 * relations here, the idea is to spot charset IDs which are used
39 * _in practice_ to mean other charset IDs. So unless and until I
40 * find out that there really is confusion between ISO8859_8 and
41 * CP1255, I don't need to do anything about it here.
42 */
43
44int charset_upgrade(int charset)
45{
46 if (charset == CS_ASCII || charset == CS_ISO8859_1)
47 charset = CS_CP1252;
48 if (charset == CS_ISO8859_4)
49 charset = CS_CP1254;
50 if (charset == CS_EUC_KR)
51 charset = CS_CP949;
52 return charset;
53}
54
55/*
56 * This function returns TRUE if the input charset is a vaguely
57 * sensible superset of ASCII. That is, it returns FALSE for 7-bit
58 * encoding formats such as HZ and UTF-7.
59 */
60int charset_contains_ascii(int charset)
61{
62 return (charset != CS_HZ &&
63 charset != CS_UTF7 &&
64 charset != CS_UTF7_CONSERVATIVE);
65}