| 1 | /* |
| 2 | * superset.c: deal with character sets which are supersets of |
| 3 | * others. |
| 4 | */ |
| 5 | |
| 6 | #include "charset.h" |
| 7 | |
| 8 | /* |
| 9 | * Just in case it's ever useful again, this rather simplistic |
| 10 | * piece of Perl/sh analyses sbcs.dat and determines which pairs of |
| 11 | * character sets are identical in the A0-FF region. This doesn't |
| 12 | * prove supersethood, but it spots obvious cases. |
| 13 | |
| 14 | perl -ne '/^[^ ]{4} / and defined ($line) and $line < 16 and do {' \ |
| 15 | -e ' chomp; print " $_" if $line>=10; print "\n" if ++$line==16; };' \ |
| 16 | -e '/^charset (.*)$/ and do { $line = 0; printf "%30s:", $1; };' \ |
| 17 | sbcs.dat | sort +1 | uniq -f1 -D |
| 18 | |
| 19 | * When run on sbcs.dat rev 1.3, it reports only two sets of matches: |
| 20 | * |
| 21 | * - ISO8859_1, ISO8859_1_X11 and CP1252 all match. |
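| 22 | * - ISO8859_4 and CP1254 match. (NOTE(review): CP1254 is the Windows extension of ISO 8859-9, not ISO 8859-4, so this is probably a typo for ISO8859_9 - confirm against sbcs.dat.) |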
| 23 | * |
| 24 | * FIXME: There is more to it than this, and in particular there's |
| 25 | * even more to it than simple subsethood. Look at CP1255 and |
| 26 | * ISO8859_8: they match at every code point defined in both, but |
| 27 | * they each define at least one code point the other doesn't. It |
| 28 | * isn't clear how I should handle this. The right thing might be |
| 29 | * to define yet another SBCS which is the union of both, and |
| 30 | * upgrade both to that. Or it might be that the unicode.org |
| 31 | * mapping table for CP1255 is simply out of date, and the mapping |
| 32 | * ISO8859_8 has which it doesn't (DF -> U+2017 DOUBLE LOW LINE) |
| 33 | * should be present in it too, which would make it a proper |
| 34 | * superset of ISO8859_8 and solve the problem. |
| 35 | * |
| 36 | * However, for the moment I'm satisfied with enhancing this table |
| 37 | * as and when necessary; the idea is not to include _all_ superset |
| 38 | * relations here, the idea is to spot charset IDs which are used |
| 39 | * _in practice_ to mean other charset IDs. So unless and until I |
| 40 | * find out that there really is confusion between ISO8859_8 and |
| 41 | * CP1255, I don't need to do anything about it here. |
| 42 | */ |
| 43 | |
| 44 | int charset_upgrade(int charset) |
| 45 | { |
| 46 | if (charset == CS_ASCII || charset == CS_ISO8859_1) |
| 47 | charset = CS_CP1252; |
| 48 | if (charset == CS_ISO8859_4) |
| 49 | charset = CS_CP1254; |
| 50 | if (charset == CS_EUC_KR) |
| 51 | charset = CS_CP949; |
| 52 | return charset; |
| 53 | } |
| 54 | |
| 55 | /* |
| 56 | * This function returns TRUE if the input charset is a vaguely |
| 57 | * sensible superset of ASCII. That is, it returns FALSE for 7-bit |
| 58 | * encoding formats such as HZ and UTF-7. |
| 59 | */ |
| 60 | int charset_contains_ascii(int charset) |
| 61 | { |
| 62 | return (charset != CS_HZ && |
| 63 | charset != CS_UTF7 && |
| 64 | charset != CS_UTF7_CONSERVATIVE); |
| 65 | } |