207c5b9ffdf0a05f2f5219b6d1b15c11e3b077bb
2 * libcharset client utility which, given two Unicode code points,
3 * will search for character sets which encode the two code points the
4 * same way. The idea is that if you see some piece of misencoded text
5 * which uses (say) an oe ligature where you expected (as it might be)
6 * a pound sign, you can use this utility to suggest which two
7 * character sets might have been confused with each other to cause
19 int main(int argc
, char **argv
)
22 struct enc
{ char string
[MAXENCLEN
]; int len
; } *encodings
;
27 chars
= malloc(argc
* sizeof(wchar_t));
29 fprintf(stderr
, "out of memory\n");
39 int base
= 16, semi_ok
= 0;
41 if ((p
[0] == 'U' || p
[0] == 'u') &&
42 (p
[1] == '-' || p
[1] == '+')) {
44 } else if (p
[0] == '0' && (p
[1] == 'x' || p
[1] == 'X')) {
46 } else if (p
[0] == '&' && p
[1] == '#') {
48 if (p
[0] == 'x' || p
[0] == 'X')
55 chars
[nchars
++] = strtoul(p
, &end
, base
);
56 if (!*end
|| (semi_ok
&& !strcmp(end
, ";")))
59 fprintf(stderr
, "unable to parse '%s' as a Unicode code point\n",
65 encodings
= malloc(nchars
* CS_LIMIT
* sizeof(struct enc
));
66 for (cs
= 0; cs
< CS_LIMIT
; cs
++) {
67 for (i
= 0; i
< nchars
; i
++) {
70 int inlen
, error
, ret
;
72 if (!charset_exists(cs
)) {
73 encodings
[i
*CS_LIMIT
+cs
].len
= 0;
81 ret
= charset_from_unicode(&inptr
, &inlen
,
82 encodings
[i
*CS_LIMIT
+cs
].string
,
83 MAXENCLEN
, cs
, NULL
, &error
);
84 if (error
|| inlen
> 0)
85 encodings
[i
*CS_LIMIT
+cs
].len
= 0;
87 encodings
[i
*CS_LIMIT
+cs
].len
= ret
;
92 * Really simple and slow approach to finding each distinct string
95 for (i
= 0; i
< nchars
*CS_LIMIT
; i
++) {
96 const char *thisstr
= encodings
[i
].string
;
97 int thislen
= encodings
[i
].len
;
101 for (j
= 0; j
< i
; j
++)
102 if (encodings
[j
].len
== thislen
&&
103 !memcmp(encodings
[j
].string
, thisstr
, thislen
))
106 continue; /* not the first instance of this encoding */
109 * See if every character is encoded like this somewhere.
111 for (j
= 0; j
< nchars
; j
++) {
112 for (cs
= 0; cs
< CS_LIMIT
; cs
++) {
113 if (encodings
[j
*CS_LIMIT
+cs
].len
== thislen
&&
114 !memcmp(encodings
[j
*CS_LIMIT
+cs
].string
, thisstr
, thislen
))
118 break; /* this char not in any cs */
121 continue; /* some char not in any cs */
124 * Match! Print the encoding, then all charsets.
126 for (j
= 0; j
< nchars
; j
++) {
127 for (k
= 0; k
< thislen
; k
++)
128 printf("%s%02X", k
>0?
" ":"", (unsigned)(thisstr
[k
] & 0xFF));
130 if (chars
[j
] >= 0x10000)
131 printf("U-%08X", (unsigned)chars
[j
]);
133 printf("U+%04X", (unsigned)chars
[j
]);
136 for (cs
= 0; cs
< CS_LIMIT
; cs
++)
137 if (encodings
[j
*CS_LIMIT
+cs
].len
== thislen
&&
138 !memcmp(encodings
[j
*CS_LIMIT
+cs
].string
, thisstr
, thislen
))
140 printf("%s%s", sep
, charset_to_localenc(cs
));