/*
 * internal.h - internal header for the charset library.
 */
| 4 | |
| 5 | #ifndef charset_internal_h |
| 6 | #define charset_internal_h |
| 7 | |
/*
 * Element count of an array `x': the size of the whole array divided
 * by the size of its first element. Only meaningful when `x' is a
 * real array; on a pointer it yields the pointer size divided by the
 * pointee size.
 */
#define lenof(x) ( sizeof(x) / sizeof((x)[0]) )
| 10 | |
/*
 * Error marker used in place of a Unicode value. 0xFFFF is an invalid
 * Unicode value, so it cannot be confused with a real character.
 */
#define ERROR 0xFFFFL
| 13 | |
/*
 * Boolean constants. Any definition already in effect is discarded
 * first, so these are the values seen after this point.
 */
#undef FALSE
#undef TRUE
#define FALSE 0
#define TRUE 1
| 18 | |
/* Short names so the structures can be referred to without `struct'. */
typedef struct sbcs_data sbcs_data;
typedef struct charset_spec charset_spec;
| 21 | |
| 22 | struct charset_spec { |
| 23 | int charset; /* numeric identifier */ |
| 24 | |
| 25 | /* |
| 26 | * A function to read the character set and output Unicode |
| 27 | * characters. The `emit' function expects to get Unicode chars |
| 28 | * passed to it; it should be sent ERROR for any encoding error |
| 29 | * on the input. |
| 30 | */ |
| 31 | void (*read)(charset_spec const *charset, long int input_chr, |
| 32 | charset_state *state, |
| 33 | void (*emit)(void *ctx, long int output), void *emitctx); |
| 34 | /* |
| 35 | * A function to read Unicode characters and output in this |
| 36 | * character set. The `emit' function expects to get byte |
| 37 | * values passed to it. |
| 38 | * |
| 39 | * A non-representable input character should cause a FALSE |
| 40 | * return, _before_ `emit' is called. Successful conversion |
| 41 | * causes a TRUE return. |
| 42 | * |
| 43 | * If `input_chr' is -1, this function must revert the encoding |
| 44 | * state to any default required at the end of a piece of |
| 45 | * encoded text. |
| 46 | */ |
| 47 | int (*write)(charset_spec const *charset, long int input_chr, |
| 48 | charset_state *state, |
| 49 | void (*emit)(void *ctx, long int output), void *emitctx); |
| 50 | void const *data; |
| 51 | }; |
| 52 | |
/*
 * Layout of the `data' field as used by the SBCS read and write
 * functions, and therefore the layout of every SBCS definition.
 */
struct sbcs_data {
    /*
     * Forward table: indexed by SBCS byte value, giving the Unicode
     * code point for that byte. An entry of ERROR means the byte is
     * undefined in this SBCS, so meeting it in input is an error.
     */
    unsigned long sbcs2ucs[256];

    /*
     * Reverse table, for going from Unicode back to the SBCS. It
     * lists the valid byte values of the SBCS, ordered by the
     * Unicode value each one translates to. To look up a Unicode
     * value U, binary-search this table through the forward table:
     * at position X, compare sbcs2ucs[ucs2sbcs[X]] with U and go
     * lower, go higher, or stop on a match.
     *
     * An SBCS may have fewer than 256 valid byte values, so the
     * number of entries actually in use is stored in `nvalid'.
     */
    unsigned char ucs2sbcs[256];
    int nvalid;
};
| 83 | |
| 84 | /* |
| 85 | * Prototypes for internal library functions. |
| 86 | */ |
| 87 | charset_spec const *charset_find_spec(int charset); |
| 88 | void read_sbcs(charset_spec const *charset, long int input_chr, |
| 89 | charset_state *state, |
| 90 | void (*emit)(void *ctx, long int output), void *emitctx); |
| 91 | int write_sbcs(charset_spec const *charset, long int input_chr, |
| 92 | charset_state *state, |
| 93 | void (*emit)(void *ctx, long int output), void *emitctx); |
| 94 | long int sbcs_to_unicode(const struct sbcs_data *sd, long int input_chr); |
| 95 | long int sbcs_from_unicode(const struct sbcs_data *sd, long int input_chr); |
| 96 | |
| 97 | void read_utf8(charset_spec const *charset, long int input_chr, |
| 98 | charset_state *state, |
| 99 | void (*emit)(void *ctx, long int output), void *emitctx); |
| 100 | int write_utf8(charset_spec const *charset, long int input_chr, |
| 101 | charset_state *state, |
| 102 | void (*emit)(void *ctx, long int output), |
| 103 | void *emitctx); |
| 104 | |
/*
 * Per-character-set conversions to and from Unicode, one pair of
 * functions per character set.
 * NOTE(review): presumably `r' and `c' are row/column positions (and
 * `p' a plane for CNS 11643), and the unicode_to_* functions return a
 * success flag -- confirm against the definitions.
 */

/* Big5 */
long big5_to_unicode(int r, int c);
int unicode_to_big5(long unicode, int *r, int *c);

/* CNS 11643 */
long cns11643_to_unicode(int p, int r, int c);
int unicode_to_cns11643(long unicode, int *p, int *r, int *c);

/* CP949 */
long cp949_to_unicode(int r, int c);
int unicode_to_cp949(long unicode, int *r, int *c);

/* KS X 1001 */
long ksx1001_to_unicode(int r, int c);
int unicode_to_ksx1001(long unicode, int *r, int *c);

/* GB 2312 */
long gb2312_to_unicode(int r, int c);
int unicode_to_gb2312(long unicode, int *r, int *c);

/* JIS X 0208 */
long jisx0208_to_unicode(int r, int c);
int unicode_to_jisx0208(long unicode, int *r, int *c);

/* JIS X 0212 */
long jisx0212_to_unicode(int r, int c);
int unicode_to_jisx0212(long unicode, int *r, int *c);
| 119 | |
/*
 * Placate compiler warnings about unused parameters, of which we
 * expect to have some in this library.
 *
 * The argument is evaluated and its value discarded by a cast to
 * void. A self-assignment would not compile for a const-qualified
 * parameter and can itself draw a self-assignment warning; the cast
 * has neither problem.
 */
#define UNUSEDARG(x) ( (void)(x) )
| 125 | |
| 126 | #endif /* charset_internal_h */ |