A slightly silly new utility: 'confuse'. You provide it with some

[sgt/charset] / confuse.c
diff --git a/confuse.c b/confuse.c

new file mode 100644 (file)

index 0000000..207c5b9
--- /dev/null
+++ b/confuse.c
@@ -0,0 +1,203 @@
+/*
+ * libcharset client utility which, given two (or more) Unicode code
+ * points, will search for character sets which encode those code
+ * points the same way. The idea is that if you see some piece of
+ * misencoded text which uses (say) an oe ligature where you expected
+ * (as it might be) a pound sign, you can use this utility to suggest
+ * which two character sets might have been confused with each other
+ * to cause that effect.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "charset.h"
+
+#define MAXENCLEN 20
+
+/*
+ * Usage: confuse <codepoint> [<codepoint> ...]
+ *
+ * Each argument is one code point, written as bare hex digits or with
+ * a "U+", "U-" or "0x" prefix (all hex), or as an "&#NNN" (decimal) or
+ * "&#xNNN" (hex) character reference, which may end in ';'.
+ *
+ * For each distinct byte string that encodes every given code point in
+ * at least one charset, prints one line per code point listing the
+ * charsets in which that code point has that encoding, followed by a
+ * blank line.
+ *
+ * Returns 0 on success (including when no arguments are given), or 1
+ * if an argument cannot be parsed or memory runs out.
+ */
+int main(int argc, char **argv)
+{
+    wchar_t *chars;
+    struct enc { char string[MAXENCLEN]; int len; } *encodings;
+    int nchars;
+    int arg, i, j, k, cs;
+    const char *sep;
+
+    /* One slot per argv entry is always enough; never ask for 0 bytes. */
+    chars = malloc((size_t)(argc > 0 ? argc : 1) * sizeof(*chars));
+    if (!chars) {
+        fprintf(stderr, "out of memory\n");
+        return 1;
+    }
+
+    nchars = 0;
+
+    /* Index-based loop: also terminates cleanly if argc is 0. */
+    for (arg = 1; arg < argc; arg++) {
+        char *p = argv[arg];
+        char *orig = p;
+        char *end;
+        unsigned long value;
+        int base = 16, semi_ok = 0;
+
+        if ((p[0] == 'U' || p[0] == 'u') &&
+            (p[1] == '-' || p[1] == '+')) {
+            p += 2;
+        } else if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
+            p += 2;
+        } else if (p[0] == '&' && p[1] == '#') {
+            p += 2;
+            if (p[0] == 'x' || p[0] == 'X')
+                p++;
+            else
+                base = 10;
+            semi_ok = 1;
+        }
+
+        value = strtoul(p, &end, base);
+
+        /*
+         * Reject the argument if strtoul consumed nothing (end == p, so
+         * e.g. "" or "U+" is not silently read as U+0000), if anything
+         * other than a permitted ';' follows the number, or if the
+         * value is beyond the 31-bit range printable as U-XXXXXXXX or
+         * would not survive being stored in a wchar_t.
+         */
+        if (end == p ||
+            (*end && !(semi_ok && !strcmp(end, ";"))) ||
+            value > 0x7FFFFFFFUL ||
+            (unsigned long)(wchar_t)value != value) {
+            fprintf(stderr, "unable to parse '%s' as a Unicode code point\n",
+                    orig);
+            free(chars);
+            return 1;
+        }
+
+        chars[nchars++] = (wchar_t)value;
+    }
+
+    if (nchars == 0) {
+        /* Nothing to look up; also avoids a zero-sized allocation. */
+        free(chars);
+        return 0;
+    }
+
+    /*
+     * encodings[i*CS_LIMIT+cs] holds the encoding of chars[i] in
+     * charset cs; len == 0 marks "not encodable / no such charset".
+     */
+    encodings = calloc((size_t)nchars * CS_LIMIT, sizeof(*encodings));
+    if (!encodings) {
+        fprintf(stderr, "out of memory\n");
+        free(chars);
+        return 1;
+    }
+
+    for (cs = 0; cs < CS_LIMIT; cs++) {
+        for (i = 0; i < nchars; i++) {
+            struct enc *e = &encodings[i*CS_LIMIT+cs];
+            wchar_t inbuf[1];
+            const wchar_t *inptr;
+            int inlen, error, ret;
+
+            if (!charset_exists(cs)) {
+                e->len = 0;
+                continue;
+            }
+
+            inbuf[0] = chars[i];
+            inptr = inbuf;
+            inlen = 1;
+            error = 0;
+            ret = charset_from_unicode(&inptr, &inlen, e->string,
+                                       MAXENCLEN, cs, NULL, &error);
+
+            /*
+             * Treat an error flag or unconsumed input as "no encoding".
+             * The range check on ret is purely defensive, so a bogus
+             * length can never reach memcmp below.
+             */
+            if (error || inlen > 0 || ret < 0 || ret > MAXENCLEN)
+                e->len = 0;
+            else
+                e->len = ret;
+        }
+    }
+
+    /*
+     * Really simple and slow approach to finding each distinct string
+     * and outputting it.
+     */
+    for (i = 0; i < nchars*CS_LIMIT; i++) {
+        const char *thisstr = encodings[i].string;
+        int thislen = encodings[i].len;
+
+        if (thislen == 0)
+            continue;
+        for (j = 0; j < i; j++)
+            if (encodings[j].len == thislen &&
+                !memcmp(encodings[j].string, thisstr, thislen))
+                break;
+        if (j < i)
+            continue;        /* not the first instance of this encoding */
+
+        /*
+         * See if every character is encoded like this somewhere.
+         */
+        for (j = 0; j < nchars; j++) {
+            for (cs = 0; cs < CS_LIMIT; cs++) {
+                if (encodings[j*CS_LIMIT+cs].len == thislen &&
+                    !memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen))
+                    break;
+            }
+            if (cs == CS_LIMIT)
+                break;                 /* this char not in any cs */
+        }
+        if (j < nchars)
+            continue;                  /* some char not in any cs */
+
+        /*
+         * Match! Print the encoding, then all charsets.
+         */
+        for (j = 0; j < nchars; j++) {
+            for (k = 0; k < thislen; k++)
+                printf("%s%02X", k>0?" ":"", (unsigned)(thisstr[k] & 0xFF));
+            printf(" = ");
+            if (chars[j] >= 0x10000)
+                printf("U-%08X", (unsigned)chars[j]);
+            else
+                printf("U+%04X", (unsigned)chars[j]);
+            printf(" in:");
+            sep = " ";
+            for (cs = 0; cs < CS_LIMIT; cs++)
+                if (encodings[j*CS_LIMIT+cs].len == thislen &&
+                    !memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen))
+                {
+                    printf("%s%s", sep, charset_to_localenc(cs));
+                    sep = ", ";
+                }
+            printf("\n");
+        }
+        printf("\n");
+    }
+
+    free(encodings);
+    free(chars);
+    return 0;
+}