From 38558cf145aa6e1de02c28b533c5c539b3f65e84 Mon Sep 17 00:00:00 2001 From: simon Date: Wed, 18 Jul 2012 22:52:00 +0000 Subject: [PATCH] A slightly silly new utility: 'confuse'. You provide it with some Unicode values (typically two of them), and it finds cases in which the provided characters are all encoded as the same thing in different charsets and prints those charsets. So if you encounter, for example, some piece of text which has U+0153 LATIN SMALL LIGATURE OE where you might have expected U+00A3 POUND SIGN, simply run 'confuse 153 a3' and it'll tell you which character sets the sender and receiver of the text might have got confused between. git-svn-id: svn://svn.tartarus.org/sgt/charset@9581 cda61777-01e9-0310-a592-d414129be87e --- Makefile | 9 +++- confuse.c | 149 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+), 1 deletion(-) create mode 100644 confuse.c diff --git a/Makefile b/Makefile index 0a049fc..5d04960 100644 --- a/Makefile +++ b/Makefile @@ -38,7 +38,8 @@ $(LIBCHARSET_GENPFX)all: \ $(LIBCHARSET_OBJDIR)libcharset.a \ $(LIBCHARSET_OBJDIR)convcs \ - $(LIBCHARSET_OBJDIR)cstable + $(LIBCHARSET_OBJDIR)cstable \ + $(LIBCHARSET_OBJDIR)confuse $(LIBCHARSET_OBJDIR)convcs: $(LIBCHARSET_SRCDIR)test.c \ $(LIBCHARSET_OBJDIR)libcharset.a @@ -52,6 +53,12 @@ $(LIBCHARSET_OBJDIR)cstable: $(LIBCHARSET_SRCDIR)cstable.c \ $(LIBCHARSET_SRCDIR)cstable.c \ $(LIBCHARSET_OBJDIR)libcharset.a +$(LIBCHARSET_OBJDIR)confuse: $(LIBCHARSET_SRCDIR)confuse.c \ + $(LIBCHARSET_OBJDIR)libcharset.a + $(CC) $(CFLAGS) -o $(LIBCHARSET_OBJDIR)confuse \ + $(LIBCHARSET_SRCDIR)confuse.c \ + $(LIBCHARSET_OBJDIR)libcharset.a + LIBCHARSET_OBJS = \ $(LIBCHARSET_OBJDIR)$(LIBCHARSET_OBJPFX)big5enc.o \ $(LIBCHARSET_OBJDIR)$(LIBCHARSET_OBJPFX)big5set.o \ diff --git a/confuse.c b/confuse.c new file mode 100644 index 0000000..207c5b9 --- /dev/null +++ b/confuse.c @@ -0,0 +1,149 @@ +/* + * libcharset client utility which, given two Unicode code points, 
+ * will search for character sets which encode the two code points the
+ * same way. The idea is that if you see some piece of misencoded text
+ * which uses (say) an oe ligature where you expected (as it might be)
+ * a pound sign, you can use this utility to suggest which two
+ * character sets might have been confused with each other to cause
+ * that effect.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "charset.h"
+
+#define MAXENCLEN 20
+
+/* The bytes one charset produces for one code point. */
+struct enc {
+    char string[MAXENCLEN];
+    int len;                           /* 0 means "no encoding recorded" */
+};
+
+/*
+ * Parse one command-line argument as a Unicode code point.
+ *
+ * Accepted spellings are bare hex, U+hex, U-hex (either case of 'U'),
+ * 0xhex, and the HTML-style &#decimal and &#xhex, the last two
+ * optionally followed by a single ';'.
+ *
+ * Returns 1 and stores the value in *out on success. Returns 0 if the
+ * argument contains no digits, has trailing junk, or denotes a value
+ * above U+10FFFF.
+ */
+static int parse_codepoint(const char *p, unsigned long *out)
+{
+    char *end;
+    int base = 16, semi_ok = 0;
+
+    if ((p[0] == 'U' || p[0] == 'u') && (p[1] == '-' || p[1] == '+')) {
+        p += 2;
+    } else if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
+        p += 2;
+    } else if (p[0] == '&' && p[1] == '#') {
+        p += 2;
+        if (p[0] == 'x' || p[0] == 'X')
+            p++;
+        else
+            base = 10;
+        semi_ok = 1;
+    }
+
+    *out = strtoul(p, &end, base);
+    if (end == p)
+        return 0;                      /* no digits at all, e.g. "U+" */
+    if (*end && !(semi_ok && !strcmp(end, ";")))
+        return 0;                      /* trailing junk after the number */
+
+    /*
+     * strtoul reports overflow by returning ULONG_MAX, so this range
+     * check rejects that case as well as merely too-large values.
+     */
+    return *out <= 0x10FFFFUL;
+}
+
+/* Does encoding slot 'e' hold exactly the 'len' bytes found at 'str'? */
+static int enc_matches(const struct enc *e, const char *str, int len)
+{
+    return e->len == len && !memcmp(e->string, str, len);
+}
+
+/*
+ * Usage: confuse <codepoint> [<codepoint>...]
+ *
+ * Encodes every given code point in every charset for which
+ * charset_exists() is true, then, for each distinct byte string that
+ * every one of the code points encodes to in at least one charset,
+ * prints that byte string followed by, per code point, the charsets
+ * producing it.
+ *
+ * Returns 0 on success, or 1 on a usage error, an unparseable
+ * argument, or memory exhaustion.
+ */
+int main(int argc, char **argv)
+{
+    wchar_t *chars;
+    struct enc *encodings;
+    int nchars = 0;
+    int i, j, k, cs;
+    const char *sep;
+
+    /*
+     * Also covers argc == 0, for which the '--argc' loop below would
+     * otherwise never terminate sensibly.
+     */
+    if (argc < 2) {
+        fprintf(stderr, "usage: confuse <codepoint> [<codepoint>...]\n");
+        return 1;
+    }
+
+    chars = malloc(argc * sizeof(wchar_t));
+    if (!chars) {
+        fprintf(stderr, "out of memory\n");
+        return 1;
+    }
+
+    while (--argc) {
+        const char *arg = *++argv;
+        unsigned long value;
+
+        if (!parse_codepoint(arg, &value)) {
+            fprintf(stderr, "unable to parse '%s' as a Unicode code point\n",
+                    arg);
+            free(chars);
+            return 1;
+        }
+        chars[nchars++] = (wchar_t)value;
+    }
+
+    /*
+     * encodings[i*CS_LIMIT+cs] holds the encoding of chars[i] in
+     * charset cs. calloc leaves every slot at len == 0, which is what
+     * we want for charsets that don't exist or can't represent the
+     * character, so only successful conversions are recorded below.
+     */
+    encodings = calloc((size_t)nchars * CS_LIMIT, sizeof(*encodings));
+    if (!encodings) {
+        fprintf(stderr, "out of memory\n");
+        free(chars);
+        return 1;
+    }
+    for (cs = 0; cs < CS_LIMIT; cs++) {
+        if (!charset_exists(cs))
+            continue;                  /* slots stay at len == 0 */
+
+        for (i = 0; i < nchars; i++) {
+            struct enc *e = &encodings[i*CS_LIMIT+cs];
+            wchar_t inbuf[1];
+            const wchar_t *inptr = inbuf;
+            int inlen = 1, error = 0, ret;
+
+            inbuf[0] = chars[i];
+            ret = charset_from_unicode(&inptr, &inlen, e->string,
+                                       MAXENCLEN, cs, NULL, &error);
+            /* Only count it if the whole character was consumed. */
+            if (!error && inlen <= 0)
+                e->len = ret;
+        }
+    }
+
+    /*
+     * Really simple and slow approach to finding each distinct string
+     * and outputting it.
+     */
+    for (i = 0; i < nchars*CS_LIMIT; i++) {
+        const char *thisstr = encodings[i].string;
+        int thislen = encodings[i].len;
+
+        if (thislen == 0)
+            continue;
+        for (j = 0; j < i; j++)
+            if (enc_matches(&encodings[j], thisstr, thislen))
+                break;
+        if (j < i)
+            continue;                  /* not the first instance of this encoding */
+
+        /*
+         * See if every character is encoded like this somewhere.
+         */
+        for (j = 0; j < nchars; j++) {
+            for (cs = 0; cs < CS_LIMIT; cs++)
+                if (enc_matches(&encodings[j*CS_LIMIT+cs], thisstr, thislen))
+                    break;
+            if (cs == CS_LIMIT)
+                break;                 /* this char not in any cs */
+        }
+        if (j < nchars)
+            continue;                  /* some char not in any cs */
+
+        /*
+         * Match! Print the encoding, then all charsets.
+         */
+        for (j = 0; j < nchars; j++) {
+            for (k = 0; k < thislen; k++)
+                printf("%s%02X", k>0?" ":"", (unsigned)(thisstr[k] & 0xFF));
+            printf(" = ");
+            if (chars[j] >= 0x10000)
+                printf("U-%08X", (unsigned)chars[j]);
+            else
+                printf("U+%04X", (unsigned)chars[j]);
+            printf(" in:");
+            sep = " ";
+            for (cs = 0; cs < CS_LIMIT; cs++)
+                if (enc_matches(&encodings[j*CS_LIMIT+cs], thisstr, thislen)) {
+                    printf("%s%s", sep, charset_to_localenc(cs));
+                    sep = ", ";
+                }
+            printf("\n");
+        }
+        printf("\n");
+    }
+
+    free(encodings);
+    free(chars);
+    return 0;
+}
-- 
2.11.0