mdw@git.distorted.org.uk Git - sgt/charset/blob - euc.c

   1 /*
   2  * euc.c - routines to handle the various EUC multibyte encodings.
   3  */
   4
   5 #ifndef ENUM_CHARSETS
   6
   7 #include "charset.h"
   8 #include "internal.h"
   9
  10 struct euc {
  11     int nchars[3];                     /* GR, SS2+GR, SS3+GR */
  12     long int (*to_ucs)(unsigned long state);
  13     unsigned long (*from_ucs)(long int ucs);
  14 };
  15
  16 static void read_euc(charset_spec const *charset, long int input_chr,
  17                      charset_state *state,
  18                      void (*emit)(void *ctx, long int output), void *emitctx)
  19 {
  20     struct euc const *euc = (struct euc *)charset->data;
  21
  22     /*
  23      * For EUC input, our state variable divides into three parts:
  24      *
  25      *  - Topmost nibble (bits 31:28) is nonzero if we're
  26      *    accumulating a multibyte character, and it indicates
  27      *    which section we're in: 1 for GR chars, 2 for things
  28      *    beginning with SS2, 3 for things beginning with SS3.
  29      *
  30      *  - Next nibble (bits 27:24) indicates how many bytes of the
  31      *    character we've accumulated so far.
  32      *
  33      *  - The rest (bits 23:0) are those bytes in full, accumulated
  34      *    as a large integer (so that seeing A1 A2 A3, in a
  35      *    hypothetical EUC whose GR encoding is three-byte, runs
  36      *    our state variable from 0 -> 0x110000A1 -> 0x1200A1A2 ->
  37      *    0x13A1A2A3, at which point it gets translated and output
  38      *    and resets to zero).
  39      */
  40
  41     if (state->s0 != 0) {
  42
  43         /*
  44          * At this point, no matter whether we had an SS2 or SS3
  45          * introducer or not, we _always_ expect a GR character.
  46          * Anything else causes us to emit ERROR for an incomplete
  47          * character, and then reset to state 0 to process the
  48          * character in its own way.
  49          */
  50         if (input_chr < 0xA1 || input_chr == 0xFF) {
  51             emit(emitctx, ERROR);
  52             state->s0 = 0;
  53         } else
  54             state->s0 = (((state->s0 & 0xFF000000) + 0x01000000) |
  55                          ((state->s0 & 0x0000FFFF) << 8) | input_chr);
  56
  57     }
  58
  59     if (state->s0 == 0) {
  60         /*
  61          * The input character determines which of the four
  62          * possible charsets we're going to be in.
  63          */
  64         if (input_chr < 0x80) {        /* this is always ASCII */
  65             emit(emitctx, input_chr);
  66         } else if (input_chr == 0x8E) {/* SS2 means charset 2 */
  67             state->s0 = 0x20000000;
  68         } else if (input_chr == 0x8F) {/* SS3 means charset 3 */
  69             state->s0 = 0x30000000;
  70         } else if (input_chr < 0xA1 || input_chr == 0xFF) {   /* errors */
  71             emit(emitctx, ERROR);
  72         } else {                       /* A1-FE means charset 1 */
  73             state->s0 = 0x11000000 | input_chr;
  74         }
  75     }
  76
  77     /*
  78      * Finally, if we have accumulated a complete character, output
  79      * it.
  80      */
  81     if (state->s0 != 0 &&
  82         ((state->s0 & 0x0F000000) >> 24) >= euc->nchars[(state->s0 >> 28)-1]) {
  83         emit(emitctx, euc->to_ucs(state->s0));
  84         state->s0 = 0;
  85     }
  86 }
  87
  88 /*
  89  * All EUCs are stateless multi-byte encodings (in the sense that
  90  * just after any character has been completed, the state is always
  91  * the same); hence when writing them, there is no need to use the
  92  * charset_state.
  93  */
  94
  95 static int write_euc(charset_spec const *charset, long int input_chr,
  96                      charset_state *state,
  97                      void (*emit)(void *ctx, long int output), void *emitctx)
  98 {
  99     struct euc const *euc = (struct euc *)charset->data;
 100     unsigned long c;
 101     int cset, len;
 102
 103     UNUSEDARG(state);
 104
 105     if (input_chr == -1)
 106         return TRUE;                   /* stateless; no cleanup required */
 107
 108     /* ASCII is the easy bit, and is always the same. */
 109     if (input_chr < 0x80) {
 110         emit(emitctx, input_chr);
 111         return TRUE;
 112     }
 113
 114     c = euc->from_ucs(input_chr);
 115     if (!c) {
 116         return FALSE;
 117     }
 118
 119     cset = c >> 28;
 120     len = euc->nchars[cset-1];
 121     c &= 0xFFFFFF;
 122
 123     if (cset > 1)
 124         emit(emitctx, 0x8C + cset);    /* SS2/SS3 */
 125
 126     while (len--)
 127         emit(emitctx, (c >> (8*len)) & 0xFF);
 128     return TRUE;
 129 }
 130
 131 /*
 132  * EUC-CN encodes GB2312 only.
 133  */
 134 static long int euc_cn_to_ucs(unsigned long state)
 135 {
 136     switch (state >> 28) {
 137       case 1: return gb2312_to_unicode(((state >> 8) & 0xFF) - 0xA1,
 138                                        ((state     ) & 0xFF) - 0xA1);
 139       default: return ERROR;
 140     }
 141 }
 142 static unsigned long euc_cn_from_ucs(long int ucs)
 143 {
 144     int r, c;
 145     if (unicode_to_gb2312(ucs, &r, &c))
 146         return 0x10000000 | ((r+0xA1) << 8) | (c+0xA1);
 147     else
 148         return 0;
 149 }
 150 static const struct euc euc_cn = {
 151     {2,0,0}, euc_cn_to_ucs, euc_cn_from_ucs
 152 };
 153 const charset_spec charset_CS_EUC_CN = {
 154     CS_EUC_CN, read_euc, write_euc, &euc_cn
 155 };
 156
 157 /*
 158  * EUC-KR encodes KS X 1001 only.
 159  */
 160 static long int euc_kr_to_ucs(unsigned long state)
 161 {
 162     switch (state >> 28) {
 163       case 1: return ksx1001_to_unicode(((state >> 8) & 0xFF) - 0xA1,
 164                                        ((state     ) & 0xFF) - 0xA1);
 165       default: return ERROR;
 166     }
 167 }
 168 static unsigned long euc_kr_from_ucs(long int ucs)
 169 {
 170     int r, c;
 171     if (unicode_to_ksx1001(ucs, &r, &c))
 172         return 0x10000000 | ((r+0xA1) << 8) | (c+0xA1);
 173     else
 174         return 0;
 175 }
 176 static const struct euc euc_kr = {
 177     {2,0,0}, euc_kr_to_ucs, euc_kr_from_ucs
 178 };
 179 const charset_spec charset_CS_EUC_KR = {
 180     CS_EUC_KR, read_euc, write_euc, &euc_kr
 181 };
 182
 183 /*
 184  * EUC-JP encodes several character sets.
 185  */
 186 static long int euc_jp_to_ucs(unsigned long state)
 187 {
 188     switch (state >> 28) {
 189       case 1: return jisx0208_to_unicode(((state >> 8) & 0xFF) - 0xA1,
 190                                          ((state     ) & 0xFF) - 0xA1);
 191       case 2:
 192         /*
 193          * This is the top half of JIS X 0201. That means A1-DF map
 194          * to FF61-FF9F, and nothing else is valid.
 195          */
 196         {
 197             int c = state & 0xFF;
 198             if (c >= 0xA1 && c <= 0xDF)
 199                 return c + (0xFF61 - 0xA1);
 200             else
 201                 return ERROR;
 202         }
 203         /* (no break needed since all control paths have returned) */
 204       case 3: return jisx0212_to_unicode(((state >> 8) & 0xFF) - 0xA1,
 205                                          ((state     ) & 0xFF) - 0xA1);
 206       default: return ERROR;           /* placate optimisers */
 207     }
 208 }
 209 static unsigned long euc_jp_from_ucs(long int ucs)
 210 {
 211     int r, c;
 212     if (ucs >= 0xFF61 && ucs <= 0xFF9F)
 213         return 0x20000000 | (ucs - (0xFF61 - 0xA1));
 214     else if (unicode_to_jisx0208(ucs, &r, &c))
 215         return 0x10000000 | ((r+0xA1) << 8) | (c+0xA1);
 216     else if (unicode_to_jisx0212(ucs, &r, &c))
 217         return 0x30000000 | ((r+0xA1) << 8) | (c+0xA1);
 218     else
 219         return 0;
 220 }
 221 static const struct euc euc_jp = {
 222     {2,1,2}, euc_jp_to_ucs, euc_jp_from_ucs
 223 };
 224 const charset_spec charset_CS_EUC_JP = {
 225     CS_EUC_JP, read_euc, write_euc, &euc_jp
 226 };
 227
 228 #else /* ENUM_CHARSETS */
 229
/*
 * With ENUM_CHARSETS defined, none of the code above is compiled and
 * this file expands to nothing but one ENUM_CHARSET() invocation per
 * charset it implements.
 * NOTE(review): ENUM_CHARSET is not defined in this file; presumably
 * whoever includes it with ENUM_CHARSETS set supplies the macro --
 * confirm against the including file.
 */
ENUM_CHARSET(CS_EUC_CN)
ENUM_CHARSET(CS_EUC_KR)
ENUM_CHARSET(CS_EUC_JP)
 233
 234 #endif /* ENUM_CHARSETS */