38558cf1 |
1 | /* |
2 | * libcharset client utility which, given two Unicode code points, |
3 | * will search for character sets which encode the two code points the |
4 | * same way. The idea is that if you see some piece of misencoded text |
5 | * which uses (say) an oe ligature where you expected (as it might be) |
6 | * a pound sign, you can use this utility to suggest which two |
7 | * character sets might have been confused with each other to cause |
8 | * that effect. |
9 | */ |
10 | |
11 | #include <stdio.h> |
12 | #include <string.h> |
13 | #include <stdlib.h> |
14 | |
15 | #include "charset.h" |
16 | |
17 | #define MAXENCLEN 20 |
18 | |
19 | int main(int argc, char **argv) |
20 | { |
21 | wchar_t *chars; |
22 | struct enc { char string[MAXENCLEN]; int len; } *encodings; |
23 | int nchars; |
24 | int i, j, k, cs; |
25 | const char *sep; |
26 | |
27 | chars = malloc(argc * sizeof(wchar_t)); |
28 | if (!chars) { |
29 | fprintf(stderr, "out of memory\n"); |
30 | return 1; |
31 | } |
32 | |
33 | nchars = 0; |
34 | |
35 | while (--argc) { |
36 | char *p = *++argv; |
37 | char *orig = p; |
38 | char *end; |
39 | int base = 16, semi_ok = 0; |
40 | |
41 | if ((p[0] == 'U' || p[0] == 'u') && |
42 | (p[1] == '-' || p[1] == '+')) { |
43 | p += 2; |
44 | } else if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) { |
45 | p += 2; |
46 | } else if (p[0] == '&' && p[1] == '#') { |
47 | p += 2; |
48 | if (p[0] == 'x' || p[0] == 'X') |
49 | p++; |
50 | else |
51 | base = 10; |
52 | semi_ok = 1; |
53 | } |
54 | |
55 | chars[nchars++] = strtoul(p, &end, base); |
56 | if (!*end || (semi_ok && !strcmp(end, ";"))) |
57 | continue; |
58 | else { |
59 | fprintf(stderr, "unable to parse '%s' as a Unicode code point\n", |
60 | orig); |
61 | return 1; |
62 | } |
63 | } |
64 | |
65 | encodings = malloc(nchars * CS_LIMIT * sizeof(struct enc)); |
66 | for (cs = 0; cs < CS_LIMIT; cs++) { |
67 | for (i = 0; i < nchars; i++) { |
68 | wchar_t inbuf[1]; |
69 | const wchar_t *inptr; |
70 | int inlen, error, ret; |
71 | |
72 | if (!charset_exists(cs)) { |
73 | encodings[i*CS_LIMIT+cs].len = 0; |
74 | continue; |
75 | } |
76 | |
77 | inbuf[0] = chars[i]; |
78 | inptr = inbuf; |
79 | inlen = 1; |
80 | error = 0; |
81 | ret = charset_from_unicode(&inptr, &inlen, |
82 | encodings[i*CS_LIMIT+cs].string, |
83 | MAXENCLEN, cs, NULL, &error); |
84 | if (error || inlen > 0) |
85 | encodings[i*CS_LIMIT+cs].len = 0; |
86 | else |
87 | encodings[i*CS_LIMIT+cs].len = ret; |
88 | } |
89 | } |
90 | |
91 | /* |
92 | * Really simple and slow approach to finding each distinct string |
93 | * and outputting it. |
94 | */ |
95 | for (i = 0; i < nchars*CS_LIMIT; i++) { |
96 | const char *thisstr = encodings[i].string; |
97 | int thislen = encodings[i].len; |
98 | |
99 | if (thislen == 0) |
100 | continue; |
101 | for (j = 0; j < i; j++) |
102 | if (encodings[j].len == thislen && |
103 | !memcmp(encodings[j].string, thisstr, thislen)) |
104 | break; |
105 | if (j < i) |
106 | continue; /* not the first instance of this encoding */ |
107 | |
108 | /* |
109 | * See if every character is encoded like this somewhere. |
110 | */ |
111 | for (j = 0; j < nchars; j++) { |
112 | for (cs = 0; cs < CS_LIMIT; cs++) { |
113 | if (encodings[j*CS_LIMIT+cs].len == thislen && |
114 | !memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen)) |
115 | break; |
116 | } |
117 | if (cs == CS_LIMIT) |
118 | break; /* this char not in any cs */ |
119 | } |
120 | if (j < nchars) |
121 | continue; /* some char not in any cs */ |
122 | |
123 | /* |
124 | * Match! Print the encoding, then all charsets. |
125 | */ |
126 | for (j = 0; j < nchars; j++) { |
127 | for (k = 0; k < thislen; k++) |
128 | printf("%s%02X", k>0?" ":"", (unsigned)(thisstr[k] & 0xFF)); |
129 | printf(" = "); |
130 | if (chars[j] >= 0x10000) |
131 | printf("U-%08X", (unsigned)chars[j]); |
132 | else |
133 | printf("U+%04X", (unsigned)chars[j]); |
134 | printf(" in:"); |
135 | sep = " "; |
136 | for (cs = 0; cs < CS_LIMIT; cs++) |
137 | if (encodings[j*CS_LIMIT+cs].len == thislen && |
138 | !memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen)) |
139 | { |
140 | printf("%s%s", sep, charset_to_localenc(cs)); |
141 | sep = ", "; |
142 | } |
143 | printf("\n"); |
144 | } |
145 | printf("\n"); |
146 | } |
147 | |
148 | return 0; |
149 | } |