2 * libcharset client utility which, given two Unicode code points,
3 * will search for character sets which encode the two code points the
4 * same way. The idea is that if you see some piece of misencoded text
5 * which uses (say) an oe ligature where you expected (as it might be)
6 * a pound sign, you can use this utility to suggest which two
7 * character sets might have been confused with each other to cause
20 int main(int argc
, char **argv
)
23 struct enc
{ char string
[MAXENCLEN
]; int len
; } *encodings
;
28 setlocale(LC_ALL
, "");
30 chars
= malloc(argc
* sizeof(wchar_t));
32 fprintf(stderr
, "out of memory\n");
42 int base
= 16, semi_ok
= 0;
45 if ((p
[0] == 'U' || p
[0] == 'u') &&
46 (p
[1] == '-' || p
[1] == '+')) {
48 } else if (p
[0] == '0' && (p
[1] == 'x' || p
[1] == 'X')) {
50 } else if (p
[0] == '&' && p
[1] == '#') {
52 if (p
[0] == 'x' || p
[0] == 'X')
57 } else if (mbtowc(&ch
, p
, strlen(p
)) == strlen(p
)) {
62 chars
[nchars
++] = strtoul(p
, &end
, base
);
63 if (!*end
|| (semi_ok
&& !strcmp(end
, ";")))
66 fprintf(stderr
, "unable to parse '%s' as a Unicode code point\n",
72 encodings
= malloc(nchars
* CS_LIMIT
* sizeof(struct enc
));
73 for (cs
= 0; cs
< CS_LIMIT
; cs
++) {
74 for (i
= 0; i
< nchars
; i
++) {
77 int inlen
, error
, ret
;
79 if (!charset_exists(cs
)) {
80 encodings
[i
*CS_LIMIT
+cs
].len
= 0;
88 ret
= charset_from_unicode(&inptr
, &inlen
,
89 encodings
[i
*CS_LIMIT
+cs
].string
,
90 MAXENCLEN
, cs
, NULL
, &error
);
91 if (error
|| inlen
> 0)
92 encodings
[i
*CS_LIMIT
+cs
].len
= 0;
94 encodings
[i
*CS_LIMIT
+cs
].len
= ret
;
99 * Really simple and slow approach to finding each distinct string
102 for (i
= 0; i
< nchars
*CS_LIMIT
; i
++) {
103 const char *thisstr
= encodings
[i
].string
;
104 int thislen
= encodings
[i
].len
;
108 for (j
= 0; j
< i
; j
++)
109 if (encodings
[j
].len
== thislen
&&
110 !memcmp(encodings
[j
].string
, thisstr
, thislen
))
113 continue; /* not the first instance of this encoding */
116 * See if every character is encoded like this somewhere.
118 for (j
= 0; j
< nchars
; j
++) {
119 for (cs
= 0; cs
< CS_LIMIT
; cs
++) {
120 if (encodings
[j
*CS_LIMIT
+cs
].len
== thislen
&&
121 !memcmp(encodings
[j
*CS_LIMIT
+cs
].string
, thisstr
, thislen
))
125 break; /* this char not in any cs */
128 continue; /* some char not in any cs */
131 * Match! Print the encoding, then all charsets.
133 for (j
= 0; j
< nchars
; j
++) {
134 for (k
= 0; k
< thislen
; k
++)
135 printf("%s%02X", k
>0?
" ":"", (unsigned)(thisstr
[k
] & 0xFF));
137 if (chars
[j
] >= 0x10000)
138 printf("U-%08X", (unsigned)chars
[j
]);
140 printf("U+%04X", (unsigned)chars
[j
]);
143 for (cs
= 0; cs
< CS_LIMIT
; cs
++)
144 if (encodings
[j
*CS_LIMIT
+cs
].len
== thislen
&&
145 !memcmp(encodings
[j
*CS_LIMIT
+cs
].string
, thisstr
, thislen
))
147 printf("%s%s", sep
, charset_to_localenc(cs
));