mdw@git.distorted.org.uk Git - sgt/charset/blob - confuse.c

   1 /*
 * libcharset client utility which, given two or more Unicode code
 * points, will search for character sets which encode all of those
 * code points the same way. The idea is that if you see some piece of
 * misencoded text which uses (say) an oe ligature where you expected
 * (as it might be) a pound sign, you can use this utility to suggest
 * which two character sets might have been confused with each other
 * to cause that effect.
   9  */
  10
  11 #include <stdio.h>
  12 #include <string.h>
  13 #include <stdlib.h>
  14
  15 #include "charset.h"
  16
  17 #define MAXENCLEN 20
  18
  19 int main(int argc, char **argv)
  20 {
  21     wchar_t *chars;
  22     struct enc { char string[MAXENCLEN]; int len; } *encodings;
  23     int nchars;
  24     int i, j, k, cs;
  25     const char *sep;
  26
  27     chars = malloc(argc * sizeof(wchar_t));
  28     if (!chars) {
  29         fprintf(stderr, "out of memory\n");
  30         return 1;
  31     }
  32
  33     nchars = 0;
  34
  35     while (--argc) {
  36         char *p = *++argv;
  37         char *orig = p;
  38         char *end;
  39         int base = 16, semi_ok = 0;
  40
  41         if ((p[0] == 'U' || p[0] == 'u') &&
  42             (p[1] == '-' || p[1] == '+')) {
  43             p += 2;
  44         } else if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
  45             p += 2;
  46         } else if (p[0] == '&' && p[1] == '#') {
  47             p += 2;
  48             if (p[0] == 'x' || p[0] == 'X')
  49                 p++;
  50             else
  51                 base = 10;
  52             semi_ok = 1;
  53         }
  54
  55         chars[nchars++] = strtoul(p, &end, base);
  56         if (!*end || (semi_ok && !strcmp(end, ";")))
  57             continue;
  58         else {
  59             fprintf(stderr, "unable to parse '%s' as a Unicode code point\n",
  60                     orig);
  61             return 1;
  62         }
  63     }
  64
  65     encodings = malloc(nchars * CS_LIMIT * sizeof(struct enc));
  66     for (cs = 0; cs < CS_LIMIT; cs++) {
  67         for (i = 0; i < nchars; i++) {
  68             wchar_t inbuf[1];
  69             const wchar_t *inptr;
  70             int inlen, error, ret;
  71
  72             if (!charset_exists(cs)) {
  73                 encodings[i*CS_LIMIT+cs].len = 0;
  74                 continue;
  75             }
  76
  77             inbuf[0] = chars[i];
  78             inptr = inbuf;
  79             inlen = 1;
  80             error = 0;
  81             ret = charset_from_unicode(&inptr, &inlen,
  82                                        encodings[i*CS_LIMIT+cs].string,
  83                                        MAXENCLEN, cs, NULL, &error);
  84             if (error || inlen > 0)
  85                 encodings[i*CS_LIMIT+cs].len = 0;
  86             else
  87                 encodings[i*CS_LIMIT+cs].len = ret;
  88         }
  89     }
  90
  91     /*
  92      * Really simple and slow approach to finding each distinct string
  93      * and outputting it.
  94      */
  95     for (i = 0; i < nchars*CS_LIMIT; i++) {
  96         const char *thisstr = encodings[i].string;
  97         int thislen = encodings[i].len;
  98
  99         if (thislen == 0)
 100             continue;
 101         for (j = 0; j < i; j++)
 102             if (encodings[j].len == thislen &&
 103                 !memcmp(encodings[j].string, thisstr, thislen))
 104                 break;
 105         if (j < i)
 106             continue;        /* not the first instance of this encoding */
 107
 108         /*
 109          * See if every character is encoded like this somewhere.
 110          */
 111         for (j = 0; j < nchars; j++) {
 112             for (cs = 0; cs < CS_LIMIT; cs++) {
 113                 if (encodings[j*CS_LIMIT+cs].len == thislen &&
 114                     !memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen))
 115                     break;
 116             }
 117             if (cs == CS_LIMIT)
 118                 break;                 /* this char not in any cs */
 119         }
 120         if (j < nchars)
 121             continue;                  /* some char not in any cs */
 122
 123         /*
 124          * Match! Print the encoding, then all charsets.
 125          */
 126         for (j = 0; j < nchars; j++) {
 127             for (k = 0; k < thislen; k++)
 128                 printf("%s%02X", k>0?" ":"", (unsigned)(thisstr[k] & 0xFF));
 129             printf(" = ");
 130             if (chars[j] >= 0x10000)
 131                 printf("U-%08X", (unsigned)chars[j]);
 132             else
 133                 printf("U+%04X", (unsigned)chars[j]);
 134             printf(" in:");
 135             sep = " ";
 136             for (cs = 0; cs < CS_LIMIT; cs++)
 137                 if (encodings[j*CS_LIMIT+cs].len == thislen &&
 138                     !memcmp(encodings[j*CS_LIMIT+cs].string, thisstr, thislen))
 139                 {
 140                     printf("%s%s", sep, charset_to_localenc(cs));
 141                     sep = ", ";
 142                 }
 143             printf("\n");
 144         }
 145         printf("\n");
 146     }
 147
 148     return 0;
 149 }