| 1 | /* |
| 2 | * libcharset client utility which, given two Unicode code points, |
| 3 | * will search for character sets which encode the two code points the |
| 4 | * same way. The idea is that if you see some piece of misencoded text |
| 5 | * which uses (say) an oe ligature where you expected (as it might be) |
| 6 | * a pound sign, you can use this utility to suggest which two |
| 7 | * character sets might have been confused with each other to cause |
| 8 | * that effect. |
| 9 | */ |
| 10 | |
| 11 | #include <stdio.h> |
| 12 | #include <string.h> |
| 13 | #include <stdlib.h> |
| 14 | #include <locale.h> |
| 15 | |
| 16 | #include "charset.h" |
| 17 | |
| 18 | #define MAXENCLEN 20 |
| 19 | |
| 20 | int main(int argc, char **argv) |
| 21 | { |
| 22 | wchar_t *chars; |
| 23 | struct enc { char string[MAXENCLEN]; int len; } *encodings; |
| 24 | int nchars; |
| 25 | int i, j, k, cs; |
| 26 | const char *sep; |
| 27 | |
| 28 | setlocale(LC_ALL, ""); |
| 29 | |
| 30 | chars = malloc(argc * sizeof(wchar_t)); |
| 31 | if (!chars) { |
| 32 | fprintf(stderr, "out of memory\n"); |
| 33 | return 1; |
| 34 | } |
| 35 | |
| 36 | nchars = 0; |
| 37 | |
| 38 | while (--argc) { |
| 39 | char *p = *++argv; |
| 40 | char *orig = p; |
| 41 | char *end; |
| 42 | int base = 16, semi_ok = 0; |
| 43 | wchar_t ch; |
| 44 | |
| 45 | if ((p[0] == 'U' || p[0] == 'u') && |
| 46 | (p[1] == '-' || p[1] == '+')) { |
| 47 | p += 2; |
| 48 | } else if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) { |
| 49 | p += 2; |
| 50 | } else if (p[0] == '&' && p[1] == '#') { |
| 51 | p += 2; |
| 52 | if (p[0] == 'x' || p[0] == 'X') |
| 53 | p++; |
| 54 | else |
| 55 | base = 10; |
| 56 | semi_ok = 1; |
| 57 | } else if (mbtowc(&ch, p, strlen(p)) == strlen(p)) { |
| 58 | chars[nchars++] = ch; |
| 59 | continue; |
| 60 | } |
| 61 | |
| 62 | chars[nchars++] = strtoul(p, &end, base); |
| 63 | if (!*end || (semi_ok && !strcmp(end, ";"))) |
| 64 | continue; |
| 65 | else { |
| 66 | fprintf(stderr, "unable to parse '%s' as a Unicode code point\n", |
| 67 | orig); |
| 68 | return 1; |
| 69 | } |
| 70 | } |
| 71 | |
| 72 | encodings = malloc(nchars * CS_LIMIT * sizeof(struct enc)); |
| 73 | for (cs = 0; cs < CS_LIMIT; cs++) { |
| 74 | for (i = 0; i < nchars; i++) { |
| 75 | wchar_t inbuf[1]; |
| 76 | const wchar_t *inptr; |
| 77 | int inlen, error, ret; |
| 78 | |
| 79 | if (!charset_exists(cs)) { |
| 80 | encodings[i*CS_LIMIT+cs].len = 0; |
| 81 | continue; |
| 82 | } |
| 83 | |
| 84 | inbuf[0] = chars[i]; |
| 85 | inptr = inbuf; |
| 86 | inlen = 1; |
| 87 | error = 0; |
| 88 | ret = charset_from_unicode(&inptr, &inlen, |
| 89 | encodings[i*CS_LIMIT+cs].string, |
| 90 | MAXENCLEN, cs, NULL, &error); |
| 91 | if (error || inlen > 0) |
| 92 | encodings[i*CS_LIMIT+cs].len = 0; |
| 93 | else |
| 94 | encodings[i*CS_LIMIT+cs].len = ret; |
| 95 | } |
| 96 | } |
| 97 | |
| 98 | /* |
| 99 | * Really simple and slow approach to finding each distinct string |
| 100 | * and outputting it. |
| 101 | */ |
| 102 | for (i = 0; i < nchars*CS_LIMIT; i++) { |
| 103 | const char *thisstr = encodings[i].string; |
| 104 | int thislen = encodings[i].len; |
| 105 | |
| 106 | if (thislen == 0) |
| 107 | continue; |
| 108 | for (j = 0; j < i; j++) |
| 109 | if (encodings[j].len == thislen && |
| 110 | !memcmp(encodings[j].string, thisstr, thislen)) |
| 111 | break; |
| 112 | if (j < i) |
| 113 | continue; /* not the first instance of this encoding */ |
| 114 | |
| 115 | /* |
| 116 | * See if every character is encoded like this somewhere. |
| 117 | */ |
| 118 | for (j = 0; j < nchars; j++) { |
| 119 | for (cs = 0; cs < CS_LIMIT; cs++) { |
| 120 | if (encodings[j*CS_LIMIT+cs].len == thislen && |
| 121 | !memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen)) |
| 122 | break; |
| 123 | } |
| 124 | if (cs == CS_LIMIT) |
| 125 | break; /* this char not in any cs */ |
| 126 | } |
| 127 | if (j < nchars) |
| 128 | continue; /* some char not in any cs */ |
| 129 | |
| 130 | /* |
| 131 | * Match! Print the encoding, then all charsets. |
| 132 | */ |
| 133 | for (j = 0; j < nchars; j++) { |
| 134 | for (k = 0; k < thislen; k++) |
| 135 | printf("%s%02X", k>0?" ":"", (unsigned)(thisstr[k] & 0xFF)); |
| 136 | printf(" = "); |
| 137 | if (chars[j] >= 0x10000) |
| 138 | printf("U-%08X", (unsigned)chars[j]); |
| 139 | else |
| 140 | printf("U+%04X", (unsigned)chars[j]); |
| 141 | printf(" in:"); |
| 142 | sep = " "; |
| 143 | for (cs = 0; cs < CS_LIMIT; cs++) |
| 144 | if (encodings[j*CS_LIMIT+cs].len == thislen && |
| 145 | !memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen)) |
| 146 | { |
| 147 | printf("%s%s", sep, charset_to_localenc(cs)); |
| 148 | sep = ", "; |
| 149 | } |
| 150 | printf("\n"); |
| 151 | } |
| 152 | printf("\n"); |
| 153 | } |
| 154 | |
| 155 | return 0; |
| 156 | } |