$(LIBCHARSET_GENPFX)all: \
$(LIBCHARSET_OBJDIR)libcharset.a \
$(LIBCHARSET_OBJDIR)convcs \
- $(LIBCHARSET_OBJDIR)cstable
+ $(LIBCHARSET_OBJDIR)cstable \
+ $(LIBCHARSET_OBJDIR)confuse
$(LIBCHARSET_OBJDIR)convcs: $(LIBCHARSET_SRCDIR)test.c \
$(LIBCHARSET_OBJDIR)libcharset.a
$(LIBCHARSET_SRCDIR)cstable.c \
$(LIBCHARSET_OBJDIR)libcharset.a
+# Build the confuse utility from confuse.c, linked against libcharset.a.
+$(LIBCHARSET_OBJDIR)confuse: $(LIBCHARSET_SRCDIR)confuse.c \
+  $(LIBCHARSET_OBJDIR)libcharset.a
+	$(CC) $(CFLAGS) -o $@ \
+		$(LIBCHARSET_SRCDIR)confuse.c \
+		$(LIBCHARSET_OBJDIR)libcharset.a
+
LIBCHARSET_OBJS = \
$(LIBCHARSET_OBJDIR)$(LIBCHARSET_OBJPFX)big5enc.o \
$(LIBCHARSET_OBJDIR)$(LIBCHARSET_OBJPFX)big5set.o \
--- /dev/null
+/*
+ * libcharset client utility which, given two Unicode code points,
+ * will search for character sets which encode the two code points the
+ * same way. The idea is that if you see some piece of misencoded text
+ * which uses (say) an oe ligature where you expected (as it might be)
+ * a pound sign, you can use this utility to suggest which two
+ * character sets might have been confused with each other to cause
+ * that effect.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "charset.h"
+
+#define MAXENCLEN 20
+
+/*
+ * Entry point. Every command-line argument is parsed as one Unicode
+ * code point, written as U+XXXX or U-XXXXXXXX, 0xXXXX, bare hex
+ * digits, or a numeric character reference (&#NNNN or &#xXXXX, with
+ * an optional trailing semicolon).
+ *
+ * Each code point is then encoded in every charset below CS_LIMIT
+ * for which charset_exists() is true. For every distinct byte
+ * sequence that *all* the given code points produce in at least one
+ * charset, the sequence is printed followed by, per code point, the
+ * list of charsets that yield it.
+ *
+ * Returns 0 on success, or 1 after printing a message to stderr if an
+ * argument cannot be parsed or memory cannot be allocated.
+ */
+int main(int argc, char **argv)
+{
+    wchar_t *chars;
+    struct enc { char string[MAXENCLEN]; int len; } *encodings;
+    int nchars;
+    int i, j, k, cs;
+    const char *sep;
+
+    /*
+     * Some environments can start a program with argc == 0; clamp it
+     * so that the argument loop below cannot walk off the end of argv
+     * and the allocation size is never zero.
+     */
+    if (argc < 1)
+        argc = 1;
+
+    /* argc is an upper bound on the number of code points supplied. */
+    chars = malloc(argc * sizeof(wchar_t));
+    if (!chars) {
+        fprintf(stderr, "out of memory\n");
+        return 1;
+    }
+
+    nchars = 0;
+
+    while (--argc > 0) {
+        char *p = *++argv;
+        char *orig = p;
+        char *end;
+        unsigned long value;
+        int base = 16, semi_ok = 0;
+
+        /* Strip a recognised prefix and pick the matching radix. */
+        if ((p[0] == 'U' || p[0] == 'u') &&
+            (p[1] == '-' || p[1] == '+')) {
+            p += 2;
+        } else if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
+            p += 2;
+        } else if (p[0] == '&' && p[1] == '#') {
+            p += 2;
+            if (p[0] == 'x' || p[0] == 'X')
+                p++;
+            else
+                base = 10;
+            semi_ok = 1;              /* "&#...;" may end in ';' */
+        }
+
+        value = strtoul(p, &end, base);
+
+        /*
+         * Reject the argument if strtoul consumed nothing at all
+         * (end == p: e.g. "U+" or an empty string would otherwise be
+         * taken as code point 0), or if anything other than the
+         * permitted trailing ';' follows the digits.
+         */
+        if (end == p ||
+            (*end && !(semi_ok && !strcmp(end, ";")))) {
+            fprintf(stderr, "unable to parse '%s' as a Unicode code point\n",
+                    orig);
+            free(chars);
+            return 1;
+        }
+
+        chars[nchars++] = (wchar_t)value;
+    }
+
+    /* Nothing to compare: produce no output, as before. */
+    if (nchars == 0) {
+        free(chars);
+        return 0;
+    }
+
+    /*
+     * encodings is a flattened nchars x CS_LIMIT table, indexed as
+     * [character * CS_LIMIT + charset]. A len of 0 marks "this
+     * character has no usable encoding in this charset".
+     */
+    encodings = malloc((size_t)nchars * CS_LIMIT * sizeof(struct enc));
+    if (!encodings) {
+        fprintf(stderr, "out of memory\n");
+        free(chars);
+        return 1;
+    }
+
+    for (cs = 0; cs < CS_LIMIT; cs++) {
+        /* Existence depends only on cs, so ask once per charset. */
+        int exists = charset_exists(cs);
+
+        for (i = 0; i < nchars; i++) {
+            struct enc *e = &encodings[i*CS_LIMIT+cs];
+            wchar_t inbuf[1];
+            const wchar_t *inptr;
+            int inlen, error, ret;
+
+            if (!exists) {
+                e->len = 0;
+                continue;
+            }
+
+            inbuf[0] = chars[i];
+            inptr = inbuf;
+            inlen = 1;
+            error = 0;
+            ret = charset_from_unicode(&inptr, &inlen,
+                                       e->string,
+                                       MAXENCLEN, cs, NULL, &error);
+            /*
+             * Treat the character as unencodable if the call flagged
+             * an error or left input unconsumed (inlen still > 0).
+             */
+            if (error || inlen > 0)
+                e->len = 0;
+            else
+                e->len = ret;
+        }
+    }
+
+    /*
+     * Really simple and slow approach to finding each distinct string
+     * and outputting it.
+     */
+    for (i = 0; i < nchars*CS_LIMIT; i++) {
+        const char *thisstr = encodings[i].string;
+        int thislen = encodings[i].len;
+
+        if (thislen == 0)
+            continue;
+        for (j = 0; j < i; j++)
+            if (encodings[j].len == thislen &&
+                !memcmp(encodings[j].string, thisstr, thislen))
+                break;
+        if (j < i)
+            continue;              /* not the first instance of this encoding */
+
+        /*
+         * See if every character is encoded like this somewhere.
+         */
+        for (j = 0; j < nchars; j++) {
+            for (cs = 0; cs < CS_LIMIT; cs++) {
+                if (encodings[j*CS_LIMIT+cs].len == thislen &&
+                    !memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen))
+                    break;
+            }
+            if (cs == CS_LIMIT)
+                break;             /* this char not in any cs */
+        }
+        if (j < nchars)
+            continue;              /* some char not in any cs */
+
+        /*
+         * Match! Print the encoding, then all charsets.
+         */
+        for (j = 0; j < nchars; j++) {
+            for (k = 0; k < thislen; k++)
+                printf("%s%02X", k>0?" ":"", (unsigned)(thisstr[k] & 0xFF));
+            printf(" = ");
+            if (chars[j] >= 0x10000)
+                printf("U-%08X", (unsigned)chars[j]);
+            else
+                printf("U+%04X", (unsigned)chars[j]);
+            printf(" in:");
+            sep = " ";
+            for (cs = 0; cs < CS_LIMIT; cs++)
+                if (encodings[j*CS_LIMIT+cs].len == thislen &&
+                    !memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen))
+                {
+                    printf("%s%s", sep, charset_to_localenc(cs));
+                    sep = ", ";
+                }
+            printf("\n");
+        }
+        printf("\n");
+    }
+
+    free(encodings);
+    free(chars);
+    return 0;
+}