mdw@git.distorted.org.uk Git - sgt/charset/blob - confuse.c

   1 /*
   2  * libcharset client utility which, given two Unicode code points,
   3  * will search for character sets which encode the two code points the
   4  * same way. The idea is that if you see some piece of misencoded text
   5  * which uses (say) an oe ligature where you expected (as it might be)
   6  * a pound sign, you can use this utility to suggest which two
   7  * character sets might have been confused with each other to cause
   8  * that effect.
   9  */
  10
  11 #include <stdio.h>
  12 #include <string.h>
  13 #include <stdlib.h>
  14 #include <locale.h>
  15
  16 #include "charset.h"
  17
  18 #define MAXENCLEN 20
  19
  20 int main(int argc, char **argv)
  21 {
  22     wchar_t *chars;
  23     struct enc { char string[MAXENCLEN]; int len; } *encodings;
  24     int nchars;
  25     int i, j, k, cs;
  26     const char *sep;
  27
  28     setlocale(LC_ALL, "");
  29
  30     chars = malloc(argc * sizeof(wchar_t));
  31     if (!chars) {
  32         fprintf(stderr, "out of memory\n");
  33         return 1;
  34     }
  35
  36     nchars = 0;
  37
  38     while (--argc) {
  39         char *p = *++argv;
  40         char *orig = p;
  41         char *end;
  42         int base = 16, semi_ok = 0;
  43         wchar_t ch;
  44
  45         if ((p[0] == 'U' || p[0] == 'u') &&
  46             (p[1] == '-' || p[1] == '+')) {
  47             p += 2;
  48         } else if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
  49             p += 2;
  50         } else if (p[0] == '&' && p[1] == '#') {
  51             p += 2;
  52             if (p[0] == 'x' || p[0] == 'X')
  53                 p++;
  54             else
  55                 base = 10;
  56             semi_ok = 1;
  57         } else if (mbtowc(&ch, p, strlen(p)) == strlen(p)) {
  58             chars[nchars++] = ch;
  59             continue;
  60         }
  61
  62         chars[nchars++] = strtoul(p, &end, base);
  63         if (!*end || (semi_ok && !strcmp(end, ";")))
  64             continue;
  65         else {
  66             fprintf(stderr, "unable to parse '%s' as a Unicode code point\n",
  67                     orig);
  68             return 1;
  69         }
  70     }
  71
  72     encodings = malloc(nchars * CS_LIMIT * sizeof(struct enc));
  73     for (cs = 0; cs < CS_LIMIT; cs++) {
  74         for (i = 0; i < nchars; i++) {
  75             wchar_t inbuf[1];
  76             const wchar_t *inptr;
  77             int inlen, error, ret;
  78
  79             if (!charset_exists(cs)) {
  80                 encodings[i*CS_LIMIT+cs].len = 0;
  81                 continue;
  82             }
  83
  84             inbuf[0] = chars[i];
  85             inptr = inbuf;
  86             inlen = 1;
  87             error = 0;
  88             ret = charset_from_unicode(&inptr, &inlen,
  89                                        encodings[i*CS_LIMIT+cs].string,
  90                                        MAXENCLEN, cs, NULL, &error);
  91             if (error || inlen > 0)
  92                 encodings[i*CS_LIMIT+cs].len = 0;
  93             else
  94                 encodings[i*CS_LIMIT+cs].len = ret;
  95         }
  96     }
  97
  98     /*
  99      * Really simple and slow approach to finding each distinct string
 100      * and outputting it.
 101      */
 102     for (i = 0; i < nchars*CS_LIMIT; i++) {
 103         const char *thisstr = encodings[i].string;
 104         int thislen = encodings[i].len;
 105
 106         if (thislen == 0)
 107             continue;
 108         for (j = 0; j < i; j++)
 109             if (encodings[j].len == thislen &&
 110                 !memcmp(encodings[j].string, thisstr, thislen))
 111                 break;
 112         if (j < i)
 113             continue;        /* not the first instance of this encoding */
 114
 115         /*
 116          * See if every character is encoded like this somewhere.
 117          */
 118         for (j = 0; j < nchars; j++) {
 119             for (cs = 0; cs < CS_LIMIT; cs++) {
 120                 if (encodings[j*CS_LIMIT+cs].len == thislen &&
 121                     !memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen))
 122                     break;
 123             }
 124             if (cs == CS_LIMIT)
 125                 break;                 /* this char not in any cs */
 126         }
 127         if (j < nchars)
 128             continue;                  /* some char not in any cs */
 129
 130         /*
 131          * Match! Print the encoding, then all charsets.
 132          */
 133         for (j = 0; j < nchars; j++) {
 134             for (k = 0; k < thislen; k++)
 135                 printf("%s%02X", k>0?" ":"", (unsigned)(thisstr[k] & 0xFF));
 136             printf(" = ");
 137             if (chars[j] >= 0x10000)
 138                 printf("U-%08X", (unsigned)chars[j]);
 139             else
 140                 printf("U+%04X", (unsigned)chars[j]);
 141             printf(" in:");
 142             sep = " ";
 143             for (cs = 0; cs < CS_LIMIT; cs++)
 144                 if (encodings[j*CS_LIMIT+cs].len == thislen &&
 145                     !memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen))
 146                 {
 147                     printf("%s%s", sep, charset_to_localenc(cs));
 148                     sep = ", ";
 149                 }
 150             printf("\n");
 151         }
 152         printf("\n");
 153     }
 154
 155     return 0;
 156 }