X-Git-Url: https://git.distorted.org.uk/~mdw/sgt/charset/blobdiff_plain/3e5305f1131c78e7dee9d43ee33d8ea6e5e28c86..38558cf145aa6e1de02c28b533c5c539b3f65e84:/confuse.c diff --git a/confuse.c b/confuse.c new file mode 100644 index 0000000..207c5b9 --- /dev/null +++ b/confuse.c @@ -0,0 +1,149 @@ +/* + * libcharset client utility which, given two Unicode code points, + * will search for character sets which encode the two code points the + * same way. The idea is that if you see some piece of misencoded text + * which uses (say) an oe ligature where you expected (as it might be) + * a pound sign, you can use this utility to suggest which two + * character sets might have been confused with each other to cause + * that effect. + */ + +#include +#include +#include + +#include "charset.h" + +#define MAXENCLEN 20 + +int main(int argc, char **argv) +{ + wchar_t *chars; + struct enc { char string[MAXENCLEN]; int len; } *encodings; + int nchars; + int i, j, k, cs; + const char *sep; + + chars = malloc(argc * sizeof(wchar_t)); + if (!chars) { + fprintf(stderr, "out of memory\n"); + return 1; + } + + nchars = 0; + + while (--argc) { + char *p = *++argv; + char *orig = p; + char *end; + int base = 16, semi_ok = 0; + + if ((p[0] == 'U' || p[0] == 'u') && + (p[1] == '-' || p[1] == '+')) { + p += 2; + } else if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) { + p += 2; + } else if (p[0] == '&' && p[1] == '#') { + p += 2; + if (p[0] == 'x' || p[0] == 'X') + p++; + else + base = 10; + semi_ok = 1; + } + + chars[nchars++] = strtoul(p, &end, base); + if (!*end || (semi_ok && !strcmp(end, ";"))) + continue; + else { + fprintf(stderr, "unable to parse '%s' as a Unicode code point\n", + orig); + return 1; + } + } + + encodings = malloc(nchars * CS_LIMIT * sizeof(struct enc)); + for (cs = 0; cs < CS_LIMIT; cs++) { + for (i = 0; i < nchars; i++) { + wchar_t inbuf[1]; + const wchar_t *inptr; + int inlen, error, ret; + + if (!charset_exists(cs)) { + encodings[i*CS_LIMIT+cs].len = 0; + continue; + } + + inbuf[0] = chars[i]; + inptr = inbuf; + inlen = 1; + error = 0; + ret = charset_from_unicode(&inptr, &inlen, + encodings[i*CS_LIMIT+cs].string, + MAXENCLEN, cs, NULL, &error); + if (error || inlen > 0) + encodings[i*CS_LIMIT+cs].len = 0; + else + encodings[i*CS_LIMIT+cs].len = ret; + } + } + + /* + * Really simple and slow approach to finding each distinct string + * and outputting it. + */ + for (i = 0; i < nchars*CS_LIMIT; i++) { + const char *thisstr = encodings[i].string; + int thislen = encodings[i].len; + + if (thislen == 0) + continue; + for (j = 0; j < i; j++) + if (encodings[j].len == thislen && + !memcmp(encodings[j].string, thisstr, thislen)) + break; + if (j < i) + continue; /* not the first instance of this encoding */ + + /* + * See if every character is encoded like this somewhere. + */ + for (j = 0; j < nchars; j++) { + for (cs = 0; cs < CS_LIMIT; cs++) { + if (encodings[j*CS_LIMIT+cs].len == thislen && + !memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen)) + break; + } + if (cs == CS_LIMIT) + break; /* this char not in any cs */ + } + if (j < nchars) + continue; /* some char not in any cs */ + + /* + * Match! Print the encoding, then all charsets. + */ + for (j = 0; j < nchars; j++) { + for (k = 0; k < thislen; k++) + printf("%s%02X", k>0?" ":"", (unsigned)(thisstr[k] & 0xFF)); + printf(" = "); + if (chars[j] >= 0x10000) + printf("U-%08X", (unsigned)chars[j]); + else + printf("U+%04X", (unsigned)chars[j]); + printf(" in:"); + sep = " "; + for (cs = 0; cs < CS_LIMIT; cs++) + if (encodings[j*CS_LIMIT+cs].len == thislen && + !memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen)) + { + printf("%s%s", sep, charset_to_localenc(cs)); + sep = ", "; + } + printf("\n"); + } + printf("\n"); + } + + return 0; +}