Silly of me to overlook it: another obvious way you might like to
[sgt/charset] / confuse.c
/*
 * libcharset client utility which, given two (or more) Unicode code
 * points, will search for character sets which encode those code
 * points the same way, i.e. as the same byte sequence. The idea is
 * that if you see some piece of misencoded text which uses (say) an
 * oe ligature where you expected (as it might be) a pound sign, you
 * can use this utility to suggest which two character sets might have
 * been confused with each other to cause that effect.
 */
10
11 #include <stdio.h>
12 #include <string.h>
13 #include <stdlib.h>
14 #include <locale.h>
15
16 #include "charset.h"
17
18 #define MAXENCLEN 20
19
20 int main(int argc, char **argv)
21 {
22 wchar_t *chars;
23 struct enc { char string[MAXENCLEN]; int len; } *encodings;
24 int nchars;
25 int i, j, k, cs;
26 const char *sep;
27
28 setlocale(LC_ALL, "");
29
30 chars = malloc(argc * sizeof(wchar_t));
31 if (!chars) {
32 fprintf(stderr, "out of memory\n");
33 return 1;
34 }
35
36 nchars = 0;
37
38 while (--argc) {
39 char *p = *++argv;
40 char *orig = p;
41 char *end;
42 int base = 16, semi_ok = 0;
43 wchar_t ch;
44
45 if ((p[0] == 'U' || p[0] == 'u') &&
46 (p[1] == '-' || p[1] == '+')) {
47 p += 2;
48 } else if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
49 p += 2;
50 } else if (p[0] == '&' && p[1] == '#') {
51 p += 2;
52 if (p[0] == 'x' || p[0] == 'X')
53 p++;
54 else
55 base = 10;
56 semi_ok = 1;
57 } else if (mbtowc(&ch, p, strlen(p)) == strlen(p)) {
58 chars[nchars++] = ch;
59 continue;
60 }
61
62 chars[nchars++] = strtoul(p, &end, base);
63 if (!*end || (semi_ok && !strcmp(end, ";")))
64 continue;
65 else {
66 fprintf(stderr, "unable to parse '%s' as a Unicode code point\n",
67 orig);
68 return 1;
69 }
70 }
71
72 encodings = malloc(nchars * CS_LIMIT * sizeof(struct enc));
73 for (cs = 0; cs < CS_LIMIT; cs++) {
74 for (i = 0; i < nchars; i++) {
75 wchar_t inbuf[1];
76 const wchar_t *inptr;
77 int inlen, error, ret;
78
79 if (!charset_exists(cs)) {
80 encodings[i*CS_LIMIT+cs].len = 0;
81 continue;
82 }
83
84 inbuf[0] = chars[i];
85 inptr = inbuf;
86 inlen = 1;
87 error = 0;
88 ret = charset_from_unicode(&inptr, &inlen,
89 encodings[i*CS_LIMIT+cs].string,
90 MAXENCLEN, cs, NULL, &error);
91 if (error || inlen > 0)
92 encodings[i*CS_LIMIT+cs].len = 0;
93 else
94 encodings[i*CS_LIMIT+cs].len = ret;
95 }
96 }
97
98 /*
99 * Really simple and slow approach to finding each distinct string
100 * and outputting it.
101 */
102 for (i = 0; i < nchars*CS_LIMIT; i++) {
103 const char *thisstr = encodings[i].string;
104 int thislen = encodings[i].len;
105
106 if (thislen == 0)
107 continue;
108 for (j = 0; j < i; j++)
109 if (encodings[j].len == thislen &&
110 !memcmp(encodings[j].string, thisstr, thislen))
111 break;
112 if (j < i)
113 continue; /* not the first instance of this encoding */
114
115 /*
116 * See if every character is encoded like this somewhere.
117 */
118 for (j = 0; j < nchars; j++) {
119 for (cs = 0; cs < CS_LIMIT; cs++) {
120 if (encodings[j*CS_LIMIT+cs].len == thislen &&
121 !memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen))
122 break;
123 }
124 if (cs == CS_LIMIT)
125 break; /* this char not in any cs */
126 }
127 if (j < nchars)
128 continue; /* some char not in any cs */
129
130 /*
131 * Match! Print the encoding, then all charsets.
132 */
133 for (j = 0; j < nchars; j++) {
134 for (k = 0; k < thislen; k++)
135 printf("%s%02X", k>0?" ":"", (unsigned)(thisstr[k] & 0xFF));
136 printf(" = ");
137 if (chars[j] >= 0x10000)
138 printf("U-%08X", (unsigned)chars[j]);
139 else
140 printf("U+%04X", (unsigned)chars[j]);
141 printf(" in:");
142 sep = " ";
143 for (cs = 0; cs < CS_LIMIT; cs++)
144 if (encodings[j*CS_LIMIT+cs].len == thislen &&
145 !memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen))
146 {
147 printf("%s%s", sep, charset_to_localenc(cs));
148 sep = ", ";
149 }
150 printf("\n");
151 }
152 printf("\n");
153 }
154
155 return 0;
156 }