[sgt/charset] / confuse.c

/*
 * libcharset client utility which, given two (or more) Unicode code
 * points, will search for character sets which encode those code
 * points the same way. The idea is that if you see some piece of
 * misencoded text
 * which uses (say) an oe ligature where you expected (as it might be)
 * a pound sign, you can use this utility to suggest which two
 * character sets might have been confused with each other to cause
 * that effect.
 */

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#include "charset.h"

#define MAXENCLEN 20

/*
 * Entry point.
 *
 * Every command-line argument is parsed as one Unicode code point. The
 * accepted spellings are:
 *
 *   U+XXXX  U-XXXXXXXX  (also with a lower-case 'u')   hexadecimal
 *   0xXXXX  0XXXXX                                     hexadecimal
 *   &#xXXXX;  &#XXXXX;   (trailing ';' optional)       hexadecimal
 *   &#DDDD;              (trailing ';' optional)       decimal
 *   XXXX                                               hexadecimal
 *
 * Each code point is then converted into every charset id in the range
 * [0, CS_LIMIT) that charset_exists() reports, and every byte string
 * which *all* of the given code points map to in at least one charset
 * is printed, together with the charsets in which each code point has
 * that encoding.
 *
 * Returns 0 on success (including when no arguments were given), or 1
 * if an argument cannot be parsed or memory runs out.
 */
int main(int argc, char **argv)
{
    wchar_t *chars;
    struct enc { char string[MAXENCLEN]; int len; } *encodings;
    int nchars;
    int i, j, k, cs;
    const char *sep;

    /*
     * argc (which counts the program name too) is a safe upper bound on
     * the number of code points. Guard against a non-positive argc so
     * that the size computation stays sane.
     */
    chars = malloc((size_t)(argc > 0 ? argc : 1) * sizeof(*chars));
    if (!chars) {
        fprintf(stderr, "out of memory\n");
        return 1;
    }

    nchars = 0;

    /* '> 0' rather than plain truthiness: argc == 0 must not run away. */
    while (--argc > 0) {
        char *p = *++argv;
        char *orig = p;
        char *end;
        int base = 16, semi_ok = 0;

        /* Strip a recognised prefix and pick the number base. */
        if ((p[0] == 'U' || p[0] == 'u') &&
            (p[1] == '-' || p[1] == '+')) {
            p += 2;
        } else if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
            p += 2;
        } else if (p[0] == '&' && p[1] == '#') {
            p += 2;
            if (p[0] == 'x' || p[0] == 'X')
                p++;
            else
                base = 10;
            semi_ok = 1;               /* SGML-style: allow a final ';' */
        }

        chars[nchars] = strtoul(p, &end, base);

        /*
         * Reject the argument if strtoul consumed no digits at all
         * (end == p; otherwise "", "U+" or "&#;" would silently be
         * taken as code point 0), or if anything other than an
         * optional permitted ';' follows the number.
         */
        if (end == p || (*end && !(semi_ok && !strcmp(end, ";")))) {
            fprintf(stderr, "unable to parse '%s' as a Unicode code point\n",
                    orig);
            free(chars);
            return 1;
        }
        nchars++;
    }

    /*
     * Nothing to do without any code points; bail out before asking
     * malloc for zero bytes, whose return value is ambiguous.
     */
    if (nchars == 0) {
        free(chars);
        return 0;
    }

    /*
     * encodings[] is a flat nchars x CS_LIMIT table: the entry for
     * character i in charset cs lives at index i*CS_LIMIT+cs. A len of
     * zero marks "no usable encoding".
     */
    encodings = malloc((size_t)nchars * CS_LIMIT * sizeof(*encodings));
    if (!encodings) {
        fprintf(stderr, "out of memory\n");
        free(chars);
        return 1;
    }

    for (cs = 0; cs < CS_LIMIT; cs++) {
        /* Depends only on cs, so ask once per charset. */
        int exists = charset_exists(cs);

        for (i = 0; i < nchars; i++) {
            wchar_t inbuf[1];
            const wchar_t *inptr;
            int inlen, error, ret;

            if (!exists) {
                encodings[i*CS_LIMIT+cs].len = 0;
                continue;
            }

            inbuf[0] = chars[i];
            inptr = inbuf;
            inlen = 1;
            error = 0;
            ret = charset_from_unicode(&inptr, &inlen,
                                       encodings[i*CS_LIMIT+cs].string,
                                       MAXENCLEN, cs, NULL, &error);
            /*
             * A reported error, or input left unconsumed, is treated
             * as "this charset cannot encode the character".
             * NOTE(review): ret is presumably the number of bytes
             * written to .string -- confirm against charset.h.
             */
            if (error || inlen > 0)
                encodings[i*CS_LIMIT+cs].len = 0;
            else
                encodings[i*CS_LIMIT+cs].len = ret;
        }
    }

    /*
     * Really simple and slow approach to finding each distinct string
     * and outputting it.
     */
    for (i = 0; i < nchars*CS_LIMIT; i++) {
        const char *thisstr = encodings[i].string;
        int thislen = encodings[i].len;

        if (thislen == 0)
            continue;

        /* Skip byte strings already handled at an earlier table index. */
        for (j = 0; j < i; j++)
            if (encodings[j].len == thislen &&
                !memcmp(encodings[j].string, thisstr, thislen))
                break;
        if (j < i)
            continue;        /* not the first instance of this encoding */

        /*
         * See if every character is encoded like this somewhere.
         */
        for (j = 0; j < nchars; j++) {
            for (cs = 0; cs < CS_LIMIT; cs++) {
                if (encodings[j*CS_LIMIT+cs].len == thislen &&
                    !memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen))
                    break;
            }
            if (cs == CS_LIMIT)
                break;                 /* this char not in any cs */
        }
        if (j < nchars)
            continue;                  /* some char not in any cs */

        /*
         * Match! Print the encoding, then all charsets.
         */
        for (j = 0; j < nchars; j++) {
            /* Mask with 0xFF so a signed char never prints sign-extended. */
            for (k = 0; k < thislen; k++)
                printf("%s%02X", k>0?" ":"", (unsigned)(thisstr[k] & 0xFF));
            printf(" = ");
            if (chars[j] >= 0x10000)
                printf("U-%08X", (unsigned)chars[j]);
            else
                printf("U+%04X", (unsigned)chars[j]);
            printf(" in:");
            sep = " ";
            for (cs = 0; cs < CS_LIMIT; cs++)
                if (encodings[j*CS_LIMIT+cs].len == thislen &&
                    !memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen))
                {
                    printf("%s%s", sep, charset_to_localenc(cs));
                    sep = ", ";
                }
            printf("\n");
        }
        printf("\n");
    }

    free(encodings);
    free(chars);
    return 0;
}
Commit	Line	Data
38558cf1	1	/*
	2	* libcharset client utility which, given two Unicode code points,
	3	* will search for character sets which encode the two code points the
	4	* same way. The idea is that if you see some piece of misencoded text
	5	* which uses (say) an oe ligature where you expected (as it might be)
	6	* a pound sign, you can use this utility to suggest which two
	7	* character sets might have been confused with each other to cause
	8	* that effect.
	9	*/
	10
	11	#include <stdio.h>
	12	#include <string.h>
	13	#include <stdlib.h>
	14
	15	#include "charset.h"
	16
	17	#define MAXENCLEN 20
	18
	19	int main(int argc, char **argv)
	20	{
	21	wchar_t *chars;
	22	struct enc { char string[MAXENCLEN]; int len; } *encodings;
	23	int nchars;
	24	int i, j, k, cs;
	25	const char *sep;
	26
	27	chars = malloc(argc * sizeof(wchar_t));
	28	if (!chars) {
	29	fprintf(stderr, "out of memory\n");
	30	return 1;
	31	}
	32
	33	nchars = 0;
	34
	35	while (--argc) {
	36	char *p = *++argv;
	37	char *orig = p;
	38	char *end;
	39	int base = 16, semi_ok = 0;
	40
	41	if ((p[0] == 'U' || p[0] == 'u') &&
	42	(p[1] == '-' || p[1] == '+')) {
	43	p += 2;
	44	} else if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
	45	p += 2;
	46	} else if (p[0] == '&' && p[1] == '#') {
	47	p += 2;
	48	if (p[0] == 'x' || p[0] == 'X')
	49	p++;
	50	else
	51	base = 10;
	52	semi_ok = 1;
	53	}
	54
	55	chars[nchars++] = strtoul(p, &end, base);
	56	if (!*end || (semi_ok && !strcmp(end, ";")))
	57	continue;
	58	else {
	59	fprintf(stderr, "unable to parse '%s' as a Unicode code point\n",
	60	orig);
	61	return 1;
	62	}
	63	}
	64
65	encodings = malloc(nchars * CS_LIMIT * sizeof(struct enc));
66	for (cs = 0; cs < CS_LIMIT; cs++) {
67	for (i = 0; i < nchars; i++) {
68	wchar_t inbuf[1];
69	const wchar_t *inptr;
70	int inlen, error, ret;
71
72	if (!charset_exists(cs)) {
73	encodings[i*CS_LIMIT+cs].len = 0;
74	continue;
75	}
76
77	inbuf[0] = chars[i];
78	inptr = inbuf;
79	inlen = 1;
80	error = 0;
81	ret = charset_from_unicode(&inptr, &inlen,
82	encodings[i*CS_LIMIT+cs].string,
83	MAXENCLEN, cs, NULL, &error);
84	if (error || inlen > 0)
85	encodings[i*CS_LIMIT+cs].len = 0;
86	else
87	encodings[i*CS_LIMIT+cs].len = ret;
88	}
89	}
90
91	/*
92	* Really simple and slow approach to finding each distinct string
93	* and outputting it.
94	*/
95	for (i = 0; i < nchars*CS_LIMIT; i++) {
96	const char *thisstr = encodings[i].string;
97	int thislen = encodings[i].len;
98
99	if (thislen == 0)
100	continue;
101	for (j = 0; j < i; j++)
102	if (encodings[j].len == thislen &&
103	!memcmp(encodings[j].string, thisstr, thislen))
104	break;
105	if (j < i)
106	continue; /* not the first instance of this encoding */
107
108	/*
109	* See if every character is encoded like this somewhere.
110	*/
111	for (j = 0; j < nchars; j++) {
112	for (cs = 0; cs < CS_LIMIT; cs++) {
113	if (encodings[j*CS_LIMIT+cs].len == thislen &&
114	!memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen))
115	break;
116	}
117	if (cs == CS_LIMIT)
118	break; /* this char not in any cs */
119	}
120	if (j < nchars)
121	continue; /* some char not in any cs */
122
123	/*
124	* Match! Print the encoding, then all charsets.
125	*/
126	for (j = 0; j < nchars; j++) {
127	for (k = 0; k < thislen; k++)
128	printf("%s%02X", k>0?" ":"", (unsigned)(thisstr[k] & 0xFF));
129	printf(" = ");
130	if (chars[j] >= 0x10000)
131	printf("U-%08X", (unsigned)chars[j]);
132	else
133	printf("U+%04X", (unsigned)chars[j]);
134	printf(" in:");
135	sep = " ";
136	for (cs = 0; cs < CS_LIMIT; cs++)
137	if (encodings[j*CS_LIMIT+cs].len == thislen &&
138	!memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen))
139	{
140	printf("%s%s", sep, charset_to_localenc(cs));
141	sep = ", ";
142	}
143	printf("\n");
144	}
145	printf("\n");
146	}
147
148	return 0;
149	}