mdw@git.distorted.org.uk Git - sgt/charset/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* libcharset client utility which, given two Unicode code points,
	3	* will search for character sets which encode the two code points the
	4	* same way. The idea is that if you see some piece of misencoded text
	5	* which uses (say) an oe ligature where you expected (as it might be)
	6	* a pound sign, you can use this utility to suggest which two
	7	* character sets might have been confused with each other to cause
	8	* that effect.
	9	*/
	10
	11	#include <stdio.h>
	12	#include <string.h>
	13	#include <stdlib.h>
	14	#include <locale.h>
	15
	16	#include "charset.h"
	17
	18	#define MAXENCLEN 20
	19
	20	int main(int argc, char **argv)
	21	{
	22	wchar_t *chars;
	23	struct enc { char string[MAXENCLEN]; int len; } *encodings;
	24	int nchars;
	25	int i, j, k, cs;
	26	const char *sep;
	27
	28	setlocale(LC_ALL, "");
	29
	30	chars = malloc(argc * sizeof(wchar_t));
	31	if (!chars) {
	32	fprintf(stderr, "out of memory\n");
	33	return 1;
	34	}
	35
	36	nchars = 0;
	37
	38	while (--argc) {
	39	char p = ++argv;
	40	char *orig = p;
	41	char *end;
	42	int base = 16, semi_ok = 0;
	43	wchar_t ch;
	44
	45	if ((p[0] == 'U' \|\| p[0] == 'u') &&
	46	(p[1] == '-' \|\| p[1] == '+')) {
	47	p += 2;
	48	} else if (p[0] == '0' && (p[1] == 'x' \|\| p[1] == 'X')) {
	49	p += 2;
	50	} else if (p[0] == '&' && p[1] == '#') {
	51	p += 2;
	52	if (p[0] == 'x' \|\| p[0] == 'X')
	53	p++;
	54	else
	55	base = 10;
	56	semi_ok = 1;
	57	} else if (mbtowc(&ch, p, strlen(p)) == strlen(p)) {
	58	chars[nchars++] = ch;
	59	continue;
	60	}
	61
	62	chars[nchars++] = strtoul(p, &end, base);
	63	if (!*end \|\| (semi_ok && !strcmp(end, ";")))
	64	continue;
	65	else {
	66	fprintf(stderr, "unable to parse '%s' as a Unicode code point\n",
	67	orig);
	68	return 1;
	69	}
	70	}
	71
	72	encodings = malloc(nchars * CS_LIMIT * sizeof(struct enc));
	73	for (cs = 0; cs < CS_LIMIT; cs++) {
	74	for (i = 0; i < nchars; i++) {
	75	wchar_t inbuf[1];
	76	const wchar_t *inptr;
	77	int inlen, error, ret;
	78
	79	if (!charset_exists(cs)) {
	80	encodings[i*CS_LIMIT+cs].len = 0;
	81	continue;
	82	}
	83
	84	inbuf[0] = chars[i];
	85	inptr = inbuf;
	86	inlen = 1;
	87	error = 0;
	88	ret = charset_from_unicode(&inptr, &inlen,
	89	encodings[i*CS_LIMIT+cs].string,
	90	MAXENCLEN, cs, NULL, &error);
	91	if (error \|\| inlen > 0)
	92	encodings[i*CS_LIMIT+cs].len = 0;
	93	else
	94	encodings[i*CS_LIMIT+cs].len = ret;
	95	}
	96	}
	97
	98	/*
	99	* Really simple and slow approach to finding each distinct string
	100	* and outputting it.
	101	*/
	102	for (i = 0; i < nchars*CS_LIMIT; i++) {
	103	const char *thisstr = encodings[i].string;
	104	int thislen = encodings[i].len;
	105
	106	if (thislen == 0)
	107	continue;
	108	for (j = 0; j < i; j++)
	109	if (encodings[j].len == thislen &&
	110	!memcmp(encodings[j].string, thisstr, thislen))
	111	break;
	112	if (j < i)
	113	continue; /* not the first instance of this encoding */
	114
	115	/*
	116	* See if every character is encoded like this somewhere.
	117	*/
	118	for (j = 0; j < nchars; j++) {
	119	for (cs = 0; cs < CS_LIMIT; cs++) {
	120	if (encodings[j*CS_LIMIT+cs].len == thislen &&
	121	!memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen))
	122	break;
	123	}
	124	if (cs == CS_LIMIT)
	125	break; /* this char not in any cs */
	126	}
	127	if (j < nchars)
	128	continue; /* some char not in any cs */
	129
	130	/*
	131	* Match! Print the encoding, then all charsets.
	132	*/
	133	for (j = 0; j < nchars; j++) {
	134	for (k = 0; k < thislen; k++)
	135	printf("%s%02X", k>0?" ":"", (unsigned)(thisstr[k] & 0xFF));
	136	printf(" = ");
	137	if (chars[j] >= 0x10000)
	138	printf("U-%08X", (unsigned)chars[j]);
	139	else
	140	printf("U+%04X", (unsigned)chars[j]);
	141	printf(" in:");
	142	sep = " ";
	143	for (cs = 0; cs < CS_LIMIT; cs++)
	144	if (encodings[j*CS_LIMIT+cs].len == thislen &&
	145	!memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen))
	146	{
	147	printf("%s%s", sep, charset_to_localenc(cs));
	148	sep = ", ";
	149	}
	150	printf("\n");
	151	}
	152	printf("\n");
	153	}
	154
	155	return 0;
	156	}