Preferred MIME name for ASCII is "US-ASCII", not "ANSI_X3.4-1968". Oops.
[sgt/charset] / internal.h
CommitLineData
/*
 * internal.h - internal header stuff for the charset library.
 */
4
5#ifndef charset_internal_h
6#define charset_internal_h
7
/*
 * Number of elements in an array. `x' must be a genuine array
 * object, not a pointer (and hence not an array-typed function
 * parameter), because the count is computed purely from sizeof.
 */
#define lenof(x) ( sizeof((x)) / sizeof(*(x)) )
10
/*
 * Value passed in place of a Unicode character to indicate an
 * error. 0xFFFF is not a valid Unicode character, so it cannot be
 * confused with genuinely converted text.
 *
 * ERROR is a name that platform headers are known to claim (for
 * example Windows <wingdi.h>), so any earlier definition is dropped
 * first, in the same way as for TRUE and FALSE.
 */
#undef ERROR
#define ERROR 0xFFFFL
13
/*
 * Boolean constants, used for instance as the return value of the
 * `write' functions. Definitions inherited from headers included
 * earlier are discarded so the values are known to be exactly 1
 * and 0.
 */
#undef TRUE
#define TRUE 1
#undef FALSE
#define FALSE 0
18
/* Short names for the two structures defined later in this header. */
typedef struct charset_spec charset_spec;
typedef struct sbcs_data sbcs_data;
21
22struct charset_spec {
23 int charset; /* numeric identifier */
24
25 /*
26 * A function to read the character set and output Unicode
27 * characters. The `emit' function expects to get Unicode chars
28 * passed to it; it should be sent ERROR for any encoding error
29 * on the input.
30 */
31 void (*read)(charset_spec const *charset, long int input_chr,
32 charset_state *state,
33 void (*emit)(void *ctx, long int output), void *emitctx);
34 /*
35 * A function to read Unicode characters and output in this
36 * character set. The `emit' function expects to get byte
37 * values passed to it.
38 *
39 * A non-representable input character should cause a FALSE
40 * return, _before_ `emit' is called. Successful conversion
41 * causes a TRUE return.
42 *
43 * If `input_chr' is -1, this function must revert the encoding
44 * state to any default required at the end of a piece of
45 * encoded text.
46 */
47 int (*write)(charset_spec const *charset, long int input_chr,
48 charset_state *state,
49 void (*emit)(void *ctx, long int output), void *emitctx);
50 void const *data;
51};
52
/*
 * Format of the `data' member of a charset_spec as used by the SBCS
 * read and write functions; every SBCS definition is written in
 * this format.
 */
struct sbcs_data {
    /*
     * Forward table: maps each SBCS byte position to a Unicode
     * code point. A position holding ERROR is a byte value the
     * SBCS does not define; its occurrence in input is an error.
     */
    unsigned long sbcs2ucs[256];

    /*
     * Reverse table, used to convert Unicode back to the SBCS. It
     * holds the valid byte values of the SBCS, sorted in order of
     * their Unicode translation.
     *
     * To look up a Unicode value U, binary-search this table using
     * sbcs2ucs as the key: when testing position X, branch on
     * whether sbcs2ucs[ucs2sbcs[X]] is less than, greater than, or
     * equal to U.
     *
     * A given SBCS may have fewer than 256 valid byte values, so
     * the length of this table is supplied separately in `nvalid'.
     */
    unsigned char ucs2sbcs[256];
    int nvalid;                        /* length of ucs2sbcs */
};
83
84/*
85 * Prototypes for internal library functions.
86 */
87charset_spec const *charset_find_spec(int charset);
88void read_sbcs(charset_spec const *charset, long int input_chr,
89 charset_state *state,
90 void (*emit)(void *ctx, long int output), void *emitctx);
91int write_sbcs(charset_spec const *charset, long int input_chr,
92 charset_state *state,
93 void (*emit)(void *ctx, long int output), void *emitctx);
94
/*
 * Conversions between Unicode and individual multibyte character
 * sets, one to/from pair per set.
 *
 * NOTE(review): `r' and `c' look like a row/column position within
 * the set, with the unicode_to_* functions writing the position
 * through the pointers and returning a success flag. None of that
 * is visible from this header - confirm against the definitions.
 */
long int big5_to_unicode(int r, int c);
int unicode_to_big5(long int unicode, int *r, int *c);

long int cp949_to_unicode(int r, int c);
int unicode_to_cp949(long int unicode, int *r, int *c);

long int ksx1001_to_unicode(int r, int c);
int unicode_to_ksx1001(long int unicode, int *r, int *c);

long int gb2312_to_unicode(int r, int c);
int unicode_to_gb2312(long int unicode, int *r, int *c);

long int jisx0208_to_unicode(int r, int c);
int unicode_to_jisx0208(long int unicode, int *r, int *c);

long int jisx0212_to_unicode(int r, int c);
int unicode_to_jisx0212(long int unicode, int *r, int *c);
107
/*
 * Placate compiler warnings about unused parameters, of which we
 * expect to have some in this library.
 *
 * The argument is cast to void rather than assigned to itself: a
 * self-assignment is rejected for const-qualified or array
 * arguments and is itself something compilers warn about, whereas
 * the cast only marks the argument as used.
 */
#define UNUSEDARG(x) ( (void)(x) )
113
114#endif /* charset_internal_h */