207c5b9ffdf0a05f2f5219b6d1b15c11e3b077bb
[sgt/charset] / confuse.c
1 /*
2 * libcharset client utility which, given two Unicode code points,
3 * will search for character sets which encode the two code points the
4 * same way. The idea is that if you see some piece of misencoded text
5 * which uses (say) an oe ligature where you expected (as it might be)
6 * a pound sign, you can use this utility to suggest which two
7 * character sets might have been confused with each other to cause
8 * that effect.
9 */
10
11 #include <stdio.h>
12 #include <string.h>
13 #include <stdlib.h>
14
15 #include "charset.h"
16
17 #define MAXENCLEN 20
18
19 int main(int argc, char **argv)
20 {
21 wchar_t *chars;
22 struct enc { char string[MAXENCLEN]; int len; } *encodings;
23 int nchars;
24 int i, j, k, cs;
25 const char *sep;
26
27 chars = malloc(argc * sizeof(wchar_t));
28 if (!chars) {
29 fprintf(stderr, "out of memory\n");
30 return 1;
31 }
32
33 nchars = 0;
34
35 while (--argc) {
36 char *p = *++argv;
37 char *orig = p;
38 char *end;
39 int base = 16, semi_ok = 0;
40
41 if ((p[0] == 'U' || p[0] == 'u') &&
42 (p[1] == '-' || p[1] == '+')) {
43 p += 2;
44 } else if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
45 p += 2;
46 } else if (p[0] == '&' && p[1] == '#') {
47 p += 2;
48 if (p[0] == 'x' || p[0] == 'X')
49 p++;
50 else
51 base = 10;
52 semi_ok = 1;
53 }
54
55 chars[nchars++] = strtoul(p, &end, base);
56 if (!*end || (semi_ok && !strcmp(end, ";")))
57 continue;
58 else {
59 fprintf(stderr, "unable to parse '%s' as a Unicode code point\n",
60 orig);
61 return 1;
62 }
63 }
64
65 encodings = malloc(nchars * CS_LIMIT * sizeof(struct enc));
66 for (cs = 0; cs < CS_LIMIT; cs++) {
67 for (i = 0; i < nchars; i++) {
68 wchar_t inbuf[1];
69 const wchar_t *inptr;
70 int inlen, error, ret;
71
72 if (!charset_exists(cs)) {
73 encodings[i*CS_LIMIT+cs].len = 0;
74 continue;
75 }
76
77 inbuf[0] = chars[i];
78 inptr = inbuf;
79 inlen = 1;
80 error = 0;
81 ret = charset_from_unicode(&inptr, &inlen,
82 encodings[i*CS_LIMIT+cs].string,
83 MAXENCLEN, cs, NULL, &error);
84 if (error || inlen > 0)
85 encodings[i*CS_LIMIT+cs].len = 0;
86 else
87 encodings[i*CS_LIMIT+cs].len = ret;
88 }
89 }
90
91 /*
92 * Really simple and slow approach to finding each distinct string
93 * and outputting it.
94 */
95 for (i = 0; i < nchars*CS_LIMIT; i++) {
96 const char *thisstr = encodings[i].string;
97 int thislen = encodings[i].len;
98
99 if (thislen == 0)
100 continue;
101 for (j = 0; j < i; j++)
102 if (encodings[j].len == thislen &&
103 !memcmp(encodings[j].string, thisstr, thislen))
104 break;
105 if (j < i)
106 continue; /* not the first instance of this encoding */
107
108 /*
109 * See if every character is encoded like this somewhere.
110 */
111 for (j = 0; j < nchars; j++) {
112 for (cs = 0; cs < CS_LIMIT; cs++) {
113 if (encodings[j*CS_LIMIT+cs].len == thislen &&
114 !memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen))
115 break;
116 }
117 if (cs == CS_LIMIT)
118 break; /* this char not in any cs */
119 }
120 if (j < nchars)
121 continue; /* some char not in any cs */
122
123 /*
124 * Match! Print the encoding, then all charsets.
125 */
126 for (j = 0; j < nchars; j++) {
127 for (k = 0; k < thislen; k++)
128 printf("%s%02X", k>0?" ":"", (unsigned)(thisstr[k] & 0xFF));
129 printf(" = ");
130 if (chars[j] >= 0x10000)
131 printf("U-%08X", (unsigned)chars[j]);
132 else
133 printf("U+%04X", (unsigned)chars[j]);
134 printf(" in:");
135 sep = " ";
136 for (cs = 0; cs < CS_LIMIT; cs++)
137 if (encodings[j*CS_LIMIT+cs].len == thislen &&
138 !memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen))
139 {
140 printf("%s%s", sep, charset_to_localenc(cs));
141 sep = ", ";
142 }
143 printf("\n");
144 }
145 printf("\n");
146 }
147
148 return 0;
149 }