mdw@git.distorted.org.uk Git - sgt/charset/blob - euc.c

   1 /*
   2  * euc.c - routines to handle the various EUC multibyte encodings.
   3  */
   4
   5 #ifndef ENUM_CHARSETS
   6
   7 #include "charset.h"
   8 #include "internal.h"
   9
  10 struct euc {
  11     int nchars[3];                     /* GR, SS2+GR, SS3+GR */
  12     long int (*to_ucs)(unsigned long state);
  13     unsigned long (*from_ucs)(long int ucs);
  14 };
  15
  16 static void read_euc(charset_spec const *charset, long int input_chr,
  17                      charset_state *state,
  18                      void (*emit)(void *ctx, long int output), void *emitctx)
  19 {
  20     struct euc const *euc = (struct euc *)charset->data;
  21
  22     /*
  23      * For EUC input, our state variable divides into three parts:
  24      *
  25      *  - Topmost nibble (bits 31:28) is nonzero if we're
  26      *    accumulating a multibyte character, and it indicates
  27      *    which section we're in: 1 for GR chars, 2 for things
  28      *    beginning with SS2, 3 for things beginning with SS3.
  29      *
  30      *  - Next nibble (bits 27:24) indicates how many bytes of the
  31      *    character we've accumulated so far.
  32      *
  33      *  - The rest (bits 23:0) are those bytes in full, accumulated
  34      *    as a large integer (so that seeing A1 A2 A3, in a
  35      *    hypothetical EUC whose GR encoding is three-byte, runs
  36      *    our state variable from 0 -> 0x110000A1 -> 0x1200A1A2 ->
  37      *    0x13A1A2A3, at which point it gets translated and output
  38      *    and resets to zero).
  39      */
  40
  41     if (state->s0 != 0) {
  42
  43         /*
  44          * At this point, no matter whether we had an SS2 or SS3
  45          * introducer or not, we _always_ expect a GR character.
  46          * Anything else causes us to emit ERROR for an incomplete
  47          * character, and then reset to state 0 to process the
  48          * character in its own way.
  49          */
  50         if (input_chr < 0xA1 || input_chr == 0xFF) {
  51             emit(emitctx, ERROR);
  52             state->s0 = 0;
  53         } else
  54             state->s0 = (((state->s0 & 0xFF000000) + 0x01000000) |
  55                          ((state->s0 & 0x0000FFFF) << 8) | input_chr);
  56
  57     }
  58
  59     if (state->s0 == 0) {
  60         /*
  61          * The input character determines which of the four
  62          * possible charsets we're going to be in.
  63          */
  64         if (input_chr < 0x80) {        /* this is always ASCII */
  65             emit(emitctx, input_chr);
  66         } else if (input_chr == 0x8E) {/* SS2 means charset 2 */
  67             state->s0 = 0x20000000;
  68         } else if (input_chr == 0x8F) {/* SS3 means charset 3 */
  69             state->s0 = 0x30000000;
  70         } else if (input_chr < 0xA1 || input_chr == 0xFF) {   /* errors */
  71             emit(emitctx, ERROR);
  72         } else {                       /* A1-FE means charset 1 */
  73             state->s0 = 0x11000000 | input_chr;
  74         }
  75     }
  76
  77     /*
  78      * Finally, if we have accumulated a complete character, output
  79      * it.
  80      */
  81     if (state->s0 != 0 &&
  82         ((state->s0 & 0x0F000000) >> 24) >=
  83         (unsigned)euc->nchars[(state->s0 >> 28)-1]) {
  84         emit(emitctx, euc->to_ucs(state->s0));
  85         state->s0 = 0;
  86     }
  87 }
  88
  89 /*
  90  * All EUCs are stateless multi-byte encodings (in the sense that
  91  * just after any character has been completed, the state is always
  92  * the same); hence when writing them, there is no need to use the
  93  * charset_state.
  94  */
  95
  96 static int write_euc(charset_spec const *charset, long int input_chr,
  97                      charset_state *state,
  98                      void (*emit)(void *ctx, long int output), void *emitctx)
  99 {
 100     struct euc const *euc = (struct euc *)charset->data;
 101     unsigned long c;
 102     int cset, len;
 103
 104     UNUSEDARG(state);
 105
 106     if (input_chr == -1)
 107         return TRUE;                   /* stateless; no cleanup required */
 108
 109     /* ASCII is the easy bit, and is always the same. */
 110     if (input_chr < 0x80) {
 111         emit(emitctx, input_chr);
 112         return TRUE;
 113     }
 114
 115     c = euc->from_ucs(input_chr);
 116     if (!c) {
 117         return FALSE;
 118     }
 119
 120     cset = c >> 28;
 121     len = euc->nchars[cset-1];
 122     c &= 0xFFFFFF;
 123
 124     if (cset > 1)
 125         emit(emitctx, 0x8C + cset);    /* SS2/SS3 */
 126
 127     while (len--)
 128         emit(emitctx, (c >> (8*len)) & 0xFF);
 129     return TRUE;
 130 }
 131
 132 /*
 133  * EUC-CN encodes GB2312 only.
 134  */
 135 static long int euc_cn_to_ucs(unsigned long state)
 136 {
 137     switch (state >> 28) {
 138       case 1: return gb2312_to_unicode(((state >> 8) & 0xFF) - 0xA1,
 139                                        ((state     ) & 0xFF) - 0xA1);
 140       default: return ERROR;
 141     }
 142 }
 143 static unsigned long euc_cn_from_ucs(long int ucs)
 144 {
 145     int r, c;
 146     if (unicode_to_gb2312(ucs, &r, &c))
 147         return 0x10000000 | ((r+0xA1) << 8) | (c+0xA1);
 148     else
 149         return 0;
 150 }
 151 static const struct euc euc_cn = {
 152     {2,0,0}, euc_cn_to_ucs, euc_cn_from_ucs
 153 };
 154 const charset_spec charset_CS_EUC_CN = {
 155     CS_EUC_CN, read_euc, write_euc, &euc_cn
 156 };
 157
 158 /*
 159  * EUC-KR encodes KS X 1001 only.
 160  */
 161 static long int euc_kr_to_ucs(unsigned long state)
 162 {
 163     switch (state >> 28) {
 164       case 1: return ksx1001_to_unicode(((state >> 8) & 0xFF) - 0xA1,
 165                                        ((state     ) & 0xFF) - 0xA1);
 166       default: return ERROR;
 167     }
 168 }
 169 static unsigned long euc_kr_from_ucs(long int ucs)
 170 {
 171     int r, c;
 172     if (unicode_to_ksx1001(ucs, &r, &c))
 173         return 0x10000000 | ((r+0xA1) << 8) | (c+0xA1);
 174     else
 175         return 0;
 176 }
 177 static const struct euc euc_kr = {
 178     {2,0,0}, euc_kr_to_ucs, euc_kr_from_ucs
 179 };
 180 const charset_spec charset_CS_EUC_KR = {
 181     CS_EUC_KR, read_euc, write_euc, &euc_kr
 182 };
 183
 184 /*
 185  * EUC-JP encodes several character sets.
 186  */
 187 static long int euc_jp_to_ucs(unsigned long state)
 188 {
 189     switch (state >> 28) {
 190       case 1: return jisx0208_to_unicode(((state >> 8) & 0xFF) - 0xA1,
 191                                          ((state     ) & 0xFF) - 0xA1);
 192       case 2:
 193         /*
 194          * This is the top half of JIS X 0201. That means A1-DF map
 195          * to FF61-FF9F, and nothing else is valid.
 196          */
 197         {
 198             int c = state & 0xFF;
 199             if (c >= 0xA1 && c <= 0xDF)
 200                 return c + (0xFF61 - 0xA1);
 201             else
 202                 return ERROR;
 203         }
 204         /* (no break needed since all control paths have returned) */
 205       case 3: return jisx0212_to_unicode(((state >> 8) & 0xFF) - 0xA1,
 206                                          ((state     ) & 0xFF) - 0xA1);
 207       default: return ERROR;           /* placate optimisers */
 208     }
 209 }
 210 static unsigned long euc_jp_from_ucs(long int ucs)
 211 {
 212     int r, c;
 213     if (ucs >= 0xFF61 && ucs <= 0xFF9F)
 214         return 0x20000000 | (ucs - (0xFF61 - 0xA1));
 215     else if (unicode_to_jisx0208(ucs, &r, &c))
 216         return 0x10000000 | ((r+0xA1) << 8) | (c+0xA1);
 217     else if (unicode_to_jisx0212(ucs, &r, &c))
 218         return 0x30000000 | ((r+0xA1) << 8) | (c+0xA1);
 219     else
 220         return 0;
 221 }
 222 static const struct euc euc_jp = {
 223     {2,1,2}, euc_jp_to_ucs, euc_jp_from_ucs
 224 };
 225 const charset_spec charset_CS_EUC_JP = {
 226     CS_EUC_JP, read_euc, write_euc, &euc_jp
 227 };
 228
 229 /*
 230  * EUC-TW encodes CNS 11643 (all planes).
 231  */
 232 static long int euc_tw_to_ucs(unsigned long state)
 233 {
 234     int plane;
 235     switch (state >> 28) {
 236       case 1: return cns11643_to_unicode(0, ((state >> 8) & 0xFF) - 0xA1,
 237                                             ((state     ) & 0xFF) - 0xA1);
 238       case 2:
 239         plane = ((state >> 8) & 0xFF) - 0xA1;
 240         if (plane >= 7) return ERROR;
 241         return cns11643_to_unicode(plane, ((state >> 8) & 0xFF) - 0xA1,
 242                                           ((state     ) & 0xFF) - 0xA1);
 243       default: return ERROR;
 244     }
 245 }
 246 static unsigned long euc_tw_from_ucs(long int ucs)
 247 {
 248     int p, r, c;
 249     if (unicode_to_cns11643(ucs, &p, &r, &c)) {
 250         if (p == 0)
 251             return 0x10000000 | ((r+0xA1) << 8) | (c+0xA1);
 252         else
 253             return 0x20000000 |
 254                 ((p + 0xA1) << 16) | ((r+0xA1) << 8) | (c+0xA1);
 255     } else
 256         return 0;
 257 }
 258 static const struct euc euc_tw = {
 259     {2,3,0}, euc_tw_to_ucs, euc_tw_from_ucs
 260 };
 261 const charset_spec charset_CS_EUC_TW = {
 262     CS_EUC_TW, read_euc, write_euc, &euc_tw
 263 };
 264
 265 #else /* ENUM_CHARSETS */
 266
 267 ENUM_CHARSET(CS_EUC_CN)
 268 ENUM_CHARSET(CS_EUC_KR)
 269 ENUM_CHARSET(CS_EUC_JP)
 270 ENUM_CHARSET(CS_EUC_TW)
 271
 272 #endif /* ENUM_CHARSETS */