38558cf1 |
1 | /* |
2 | * libcharset client utility which, given two Unicode code points, |
3 | * will search for character sets which encode the two code points the |
4 | * same way. The idea is that if you see some piece of misencoded text |
5 | * which uses (say) an oe ligature where you expected (as it might be) |
6 | * a pound sign, you can use this utility to suggest which two |
7 | * character sets might have been confused with each other to cause |
8 | * that effect. |
9 | */ |
10 | |
11 | #include <stdio.h> |
12 | #include <string.h> |
13 | #include <stdlib.h> |
2265dc5c |
14 | #include <locale.h> |
38558cf1 |
15 | |
16 | #include "charset.h" |
17 | |
18 | #define MAXENCLEN 20 |
19 | |
20 | int main(int argc, char **argv) |
21 | { |
22 | wchar_t *chars; |
23 | struct enc { char string[MAXENCLEN]; int len; } *encodings; |
24 | int nchars; |
25 | int i, j, k, cs; |
26 | const char *sep; |
27 | |
2265dc5c |
28 | setlocale(LC_ALL, ""); |
29 | |
38558cf1 |
30 | chars = malloc(argc * sizeof(wchar_t)); |
31 | if (!chars) { |
32 | fprintf(stderr, "out of memory\n"); |
33 | return 1; |
34 | } |
35 | |
36 | nchars = 0; |
37 | |
38 | while (--argc) { |
39 | char *p = *++argv; |
40 | char *orig = p; |
41 | char *end; |
42 | int base = 16, semi_ok = 0; |
2265dc5c |
43 | wchar_t ch; |
38558cf1 |
44 | |
45 | if ((p[0] == 'U' || p[0] == 'u') && |
46 | (p[1] == '-' || p[1] == '+')) { |
47 | p += 2; |
48 | } else if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) { |
49 | p += 2; |
50 | } else if (p[0] == '&' && p[1] == '#') { |
51 | p += 2; |
52 | if (p[0] == 'x' || p[0] == 'X') |
53 | p++; |
54 | else |
55 | base = 10; |
56 | semi_ok = 1; |
2265dc5c |
57 | } else if (mbtowc(&ch, p, strlen(p)) == strlen(p)) { |
58 | chars[nchars++] = ch; |
59 | continue; |
38558cf1 |
60 | } |
61 | |
62 | chars[nchars++] = strtoul(p, &end, base); |
63 | if (!*end || (semi_ok && !strcmp(end, ";"))) |
64 | continue; |
65 | else { |
66 | fprintf(stderr, "unable to parse '%s' as a Unicode code point\n", |
67 | orig); |
68 | return 1; |
69 | } |
70 | } |
71 | |
72 | encodings = malloc(nchars * CS_LIMIT * sizeof(struct enc)); |
73 | for (cs = 0; cs < CS_LIMIT; cs++) { |
74 | for (i = 0; i < nchars; i++) { |
75 | wchar_t inbuf[1]; |
76 | const wchar_t *inptr; |
77 | int inlen, error, ret; |
78 | |
79 | if (!charset_exists(cs)) { |
80 | encodings[i*CS_LIMIT+cs].len = 0; |
81 | continue; |
82 | } |
83 | |
84 | inbuf[0] = chars[i]; |
85 | inptr = inbuf; |
86 | inlen = 1; |
87 | error = 0; |
88 | ret = charset_from_unicode(&inptr, &inlen, |
89 | encodings[i*CS_LIMIT+cs].string, |
90 | MAXENCLEN, cs, NULL, &error); |
91 | if (error || inlen > 0) |
92 | encodings[i*CS_LIMIT+cs].len = 0; |
93 | else |
94 | encodings[i*CS_LIMIT+cs].len = ret; |
95 | } |
96 | } |
97 | |
98 | /* |
99 | * Really simple and slow approach to finding each distinct string |
100 | * and outputting it. |
101 | */ |
102 | for (i = 0; i < nchars*CS_LIMIT; i++) { |
103 | const char *thisstr = encodings[i].string; |
104 | int thislen = encodings[i].len; |
105 | |
106 | if (thislen == 0) |
107 | continue; |
108 | for (j = 0; j < i; j++) |
109 | if (encodings[j].len == thislen && |
110 | !memcmp(encodings[j].string, thisstr, thislen)) |
111 | break; |
112 | if (j < i) |
113 | continue; /* not the first instance of this encoding */ |
114 | |
115 | /* |
116 | * See if every character is encoded like this somewhere. |
117 | */ |
118 | for (j = 0; j < nchars; j++) { |
119 | for (cs = 0; cs < CS_LIMIT; cs++) { |
120 | if (encodings[j*CS_LIMIT+cs].len == thislen && |
121 | !memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen)) |
122 | break; |
123 | } |
124 | if (cs == CS_LIMIT) |
125 | break; /* this char not in any cs */ |
126 | } |
127 | if (j < nchars) |
128 | continue; /* some char not in any cs */ |
129 | |
130 | /* |
131 | * Match! Print the encoding, then all charsets. |
132 | */ |
133 | for (j = 0; j < nchars; j++) { |
134 | for (k = 0; k < thislen; k++) |
135 | printf("%s%02X", k>0?" ":"", (unsigned)(thisstr[k] & 0xFF)); |
136 | printf(" = "); |
137 | if (chars[j] >= 0x10000) |
138 | printf("U-%08X", (unsigned)chars[j]); |
139 | else |
140 | printf("U+%04X", (unsigned)chars[j]); |
141 | printf(" in:"); |
142 | sep = " "; |
143 | for (cs = 0; cs < CS_LIMIT; cs++) |
144 | if (encodings[j*CS_LIMIT+cs].len == thislen && |
145 | !memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen)) |
146 | { |
147 | printf("%s%s", sep, charset_to_localenc(cs)); |
148 | sep = ", "; |
149 | } |
150 | printf("\n"); |
151 | } |
152 | printf("\n"); |
153 | } |
154 | |
155 | return 0; |
156 | } |