[sgt/charset] / confuse.c

/*
 * libcharset client utility which, given two or more Unicode code
 * points, will search for character sets which encode all of those
 * code points the same way. The idea is that if you see some piece of
 * misencoded text which uses (say) an oe ligature where you expected
 * (as it might be) a pound sign, you can use this utility to suggest
 * which two character sets might have been confused with each other
 * to cause that effect.
 */

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <locale.h>

#include "charset.h"

#define MAXENCLEN 20

/*
 * Entry point.
 *
 * Every command-line argument names one Unicode code point, in any of
 * these forms:
 *
 *   U+XXXX, U-XXXX, u+XXXX, u-XXXX   hexadecimal
 *   0xXXXX, 0XXXXX                   hexadecimal
 *   &#xXXXX  or  &#DDDD              hex / decimal, optional trailing ';'
 *   a single (multibyte) character   decoded with mbtowc() in the
 *                                    locale selected by the environment
 *   anything else                    parsed as bare hexadecimal
 *
 * Each code point is then converted into every charset index below
 * CS_LIMIT for which charset_exists() is true. For each distinct byte
 * sequence that every given code point maps to in at least one charset,
 * the byte sequence is printed along with, per code point, the list of
 * charsets producing it.
 *
 * Returns 0 on success, 1 if an argument cannot be parsed or memory
 * runs out.
 */
int main(int argc, char **argv)
{
    wchar_t *chars;
    struct enc { char string[MAXENCLEN]; int len; } *encodings;
    int nchars;
    int arg, i, j, k, cs;
    const char *sep;

    /* Needed so that mbtowc() below honours the user's locale. */
    setlocale(LC_ALL, "");

    /*
     * One slot per argv entry is an upper bound on the number of code
     * points. Never request zero bytes, so that a NULL return always
     * means a genuine allocation failure.
     */
    chars = malloc((size_t)(argc > 0 ? argc : 1) * sizeof(wchar_t));
    if (!chars) {
        fprintf(stderr, "out of memory\n");
        return 1;
    }

    nchars = 0;

    /*
     * Indexed loop rather than `while (--argc)`, so that a degenerate
     * argc of 0 simply yields no code points instead of looping with a
     * negative counter.
     */
    for (arg = 1; arg < argc; arg++) {
        char *p = argv[arg];
        char *orig = p;
        char *end;
        int base = 16, semi_ok = 0;
        wchar_t ch;

        if ((p[0] == 'U' || p[0] == 'u') &&
            (p[1] == '-' || p[1] == '+')) {
            p += 2;
        } else if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
            p += 2;
        } else if (p[0] == '&' && p[1] == '#') {
            p += 2;
            if (p[0] == 'x' || p[0] == 'X')
                p++;
            else
                base = 10;
            semi_ok = 1;               /* allow "&#...;" */
        } else {
            /*
             * Accept the argument as a literal character only if
             * mbtowc() consumes the whole of it. The result is kept in
             * an int so that its -1 error return is tested explicitly
             * instead of being compared against a size_t.
             */
            size_t arglen = strlen(p);
            int consumed = mbtowc(&ch, p, arglen);

            if (consumed > 0 && (size_t)consumed == arglen) {
                chars[nchars++] = ch;
                continue;
            }
        }

        chars[nchars] = strtoul(p, &end, base);

        /*
         * Reject the argument if strtoul() consumed nothing (for
         * example "U+", "0x", "&#;" or an empty string, which would
         * otherwise be taken as U+0000), or if anything other than the
         * permitted trailing ';' follows the number.
         */
        if (end == p || (*end && !(semi_ok && !strcmp(end, ";")))) {
            fprintf(stderr, "unable to parse '%s' as a Unicode code point\n",
                    orig);
            free(chars);
            return 1;
        }
        nchars++;
    }

    if (nchars == 0) {
        /* Nothing to compare, hence nothing to print. */
        free(chars);
        return 0;
    }

    /*
     * encodings is a flat nchars x CS_LIMIT table: the entry for code
     * point i in charset cs lives at index i*CS_LIMIT+cs. A len of 0
     * marks "no encoding available". The product is computed in size_t
     * so it cannot overflow int.
     */
    encodings = malloc((size_t)nchars * CS_LIMIT * sizeof(struct enc));
    if (!encodings) {
        fprintf(stderr, "out of memory\n");
        free(chars);
        return 1;
    }

    for (cs = 0; cs < CS_LIMIT; cs++) {
        for (i = 0; i < nchars; i++) {
            wchar_t inbuf[1];
            const wchar_t *inptr;
            int inlen, error, ret;

            if (!charset_exists(cs)) {
                encodings[i*CS_LIMIT+cs].len = 0;
                continue;
            }

            inbuf[0] = chars[i];
            inptr = inbuf;
            inlen = 1;
            error = 0;
            ret = charset_from_unicode(&inptr, &inlen,
                                       encodings[i*CS_LIMIT+cs].string,
                                       MAXENCLEN, cs, NULL, &error);
            /*
             * An error flag, or input left unconsumed, means this
             * charset has no usable encoding for the code point;
             * otherwise ret is stored as the encoded length.
             */
            if (error || inlen > 0)
                encodings[i*CS_LIMIT+cs].len = 0;
            else
                encodings[i*CS_LIMIT+cs].len = ret;
        }
    }

    /*
     * Really simple and slow approach to finding each distinct string
     * and outputting it.
     */
    for (i = 0; i < nchars*CS_LIMIT; i++) {
        const char *thisstr = encodings[i].string;
        int thislen = encodings[i].len;

        if (thislen == 0)
            continue;
        for (j = 0; j < i; j++)
            if (encodings[j].len == thislen &&
                !memcmp(encodings[j].string, thisstr, thislen))
                break;
        if (j < i)
            continue;        /* not the first instance of this encoding */

        /*
         * See if every character is encoded like this somewhere.
         */
        for (j = 0; j < nchars; j++) {
            for (cs = 0; cs < CS_LIMIT; cs++) {
                if (encodings[j*CS_LIMIT+cs].len == thislen &&
                    !memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen))
                    break;
            }
            if (cs == CS_LIMIT)
                break;                 /* this char not in any cs */
        }
        if (j < nchars)
            continue;                  /* some char not in any cs */

        /*
         * Match! Print the encoding, then all charsets.
         */
        for (j = 0; j < nchars; j++) {
            /* Mask to 0..255 so that a signed char never prints
             * sign-extended. */
            for (k = 0; k < thislen; k++)
                printf("%s%02X", k>0?" ":"", (unsigned)(thisstr[k] & 0xFF));
            printf(" = ");
            if (chars[j] >= 0x10000)
                printf("U-%08X", (unsigned)chars[j]);
            else
                printf("U+%04X", (unsigned)chars[j]);
            printf(" in:");
            sep = " ";
            for (cs = 0; cs < CS_LIMIT; cs++)
                if (encodings[j*CS_LIMIT+cs].len == thislen &&
                    !memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen))
                {
                    printf("%s%s", sep, charset_to_localenc(cs));
                    sep = ", ";
                }
            printf("\n");
        }
        printf("\n");
    }

    free(encodings);
    free(chars);
    return 0;
}
Commit	Line	Data
38558cf1	1	/*
	2	* libcharset client utility which, given two Unicode code points,
	3	* will search for character sets which encode the two code points the
	4	* same way. The idea is that if you see some piece of misencoded text
	5	* which uses (say) an oe ligature where you expected (as it might be)
	6	* a pound sign, you can use this utility to suggest which two
	7	* character sets might have been confused with each other to cause
	8	* that effect.
	9	*/
	10
	11	#include <stdio.h>
	12	#include <string.h>
	13	#include <stdlib.h>
2265dc5c	14	#include <locale.h>
38558cf1	15
	16	#include "charset.h"
	17
	18	#define MAXENCLEN 20
	19
	20	int main(int argc, char **argv)
	21	{
	22	wchar_t *chars;
	23	struct enc { char string[MAXENCLEN]; int len; } *encodings;
	24	int nchars;
	25	int i, j, k, cs;
	26	const char *sep;
	27
2265dc5c	28	setlocale(LC_ALL, "");
2265dc5c	29
38558cf1	30	chars = malloc(argc * sizeof(wchar_t));
	31	if (!chars) {
	32	fprintf(stderr, "out of memory\n");
	33	return 1;
	34	}
	35
	36	nchars = 0;
	37
	38	while (--argc) {
	39	char *p = *++argv;
	40	char *orig = p;
	41	char *end;
	42	int base = 16, semi_ok = 0;
2265dc5c	43	wchar_t ch;
38558cf1	44
	45	if ((p[0] == 'U' || p[0] == 'u') &&
	46	(p[1] == '-' || p[1] == '+')) {
	47	p += 2;
	48	} else if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
	49	p += 2;
	50	} else if (p[0] == '&' && p[1] == '#') {
	51	p += 2;
	52	if (p[0] == 'x' || p[0] == 'X')
	53	p++;
	54	else
	55	base = 10;
	56	semi_ok = 1;
2265dc5c	57	} else if (mbtowc(&ch, p, strlen(p)) == strlen(p)) {
	58	chars[nchars++] = ch;
	59	continue;
38558cf1	60	}
	61
	62	chars[nchars++] = strtoul(p, &end, base);
	63	if (!*end || (semi_ok && !strcmp(end, ";")))
	64	continue;
	65	else {
	66	fprintf(stderr, "unable to parse '%s' as a Unicode code point\n",
	67	orig);
	68	return 1;
	69	}
	70	}
	71
	72	encodings = malloc(nchars * CS_LIMIT * sizeof(struct enc));
	73	for (cs = 0; cs < CS_LIMIT; cs++) {
	74	for (i = 0; i < nchars; i++) {
	75	wchar_t inbuf[1];
	76	const wchar_t *inptr;
	77	int inlen, error, ret;
	78
	79	if (!charset_exists(cs)) {
	80	encodings[i*CS_LIMIT+cs].len = 0;
	81	continue;
	82	}
	83
	84	inbuf[0] = chars[i];
	85	inptr = inbuf;
	86	inlen = 1;
	87	error = 0;
	88	ret = charset_from_unicode(&inptr, &inlen,
	89	encodings[i*CS_LIMIT+cs].string,
	90	MAXENCLEN, cs, NULL, &error);
	91	if (error || inlen > 0)
	92	encodings[i*CS_LIMIT+cs].len = 0;
	93	else
	94	encodings[i*CS_LIMIT+cs].len = ret;
	95	}
	96	}
	97
	98	/*
	99	* Really simple and slow approach to finding each distinct string
	100	* and outputting it.
	101	*/
	102	for (i = 0; i < nchars*CS_LIMIT; i++) {
	103	const char *thisstr = encodings[i].string;
	104	int thislen = encodings[i].len;
	105
	106	if (thislen == 0)
	107	continue;
	108	for (j = 0; j < i; j++)
	109	if (encodings[j].len == thislen &&
	110	!memcmp(encodings[j].string, thisstr, thislen))
	111	break;
	112	if (j < i)
	113	continue; /* not the first instance of this encoding */
	114
	115	/*
	116	* See if every character is encoded like this somewhere.
	117	*/
	118	for (j = 0; j < nchars; j++) {
	119	for (cs = 0; cs < CS_LIMIT; cs++) {
	120	if (encodings[j*CS_LIMIT+cs].len == thislen &&
	121	!memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen))
	122	break;
	123	}
124	if (cs == CS_LIMIT)
125	break; /* this char not in any cs */
126	}
127	if (j < nchars)
128	continue; /* some char not in any cs */
129
130	/*
131	* Match! Print the encoding, then all charsets.
132	*/
133	for (j = 0; j < nchars; j++) {
134	for (k = 0; k < thislen; k++)
135	printf("%s%02X", k>0?" ":"", (unsigned)(thisstr[k] & 0xFF));
136	printf(" = ");
137	if (chars[j] >= 0x10000)
138	printf("U-%08X", (unsigned)chars[j]);
139	else
140	printf("U+%04X", (unsigned)chars[j]);
141	printf(" in:");
142	sep = " ";
143	for (cs = 0; cs < CS_LIMIT; cs++)
144	if (encodings[j*CS_LIMIT+cs].len == thislen &&
145	!memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen))
146	{
147	printf("%s%s", sep, charset_to_localenc(cs));
148	sep = ", ";
149	}
150	printf("\n");
151	}
152	printf("\n");
153	}
154
155	return 0;
156	}