X-Git-Url: https://git.distorted.org.uk/~mdw/sgt/charset/blobdiff_plain/35f8c24335935ec42fda37849e4d5ee6a6466e7a..c2cf6b7ade4f244a31b83030856b0502ec90db9e:/iso2022.c diff --git a/iso2022.c b/iso2022.c index 73fc023..6e527dd 100644 --- a/iso2022.c +++ b/iso2022.c @@ -2,11 +2,10 @@ * iso2022.c - support for ISO/IEC 2022 (alias ECMA-35). * * This isn't a complete implementation of ISO/IEC 2022, but it's - * close. It only handles decoding, because a fully general encoder - * isn't really useful. It can decode 8-bit and 7-bit versions, with - * support for single-byte and multi-byte character sets, all four - * containers (G0, G1, G2, and G3), using both single-shift and - * locking-shift sequences. + * close. It can decode 8-bit and 7-bit versions, with support for + * single-byte and multi-byte character sets, all four containers + * (G0, G1, G2, and G3), using both single-shift and locking-shift + * sequences. * * The general principle is that any valid ISO/IEC 2022 sequence * should either be correctly decoded or should emit an ERROR. The @@ -53,29 +52,40 @@ typedef int (*to_dbcs_t)(long int, int *, int *); typedef int (*to_dbcs_planar_t)(long int, int *, int *, int *); /* - * Cast between to_dbcs_planar_t and to_dbcs_t. + * These macros cast between to_dbcs_planar_t and to_dbcs_t, in + * such a way as to cause a compile-time error if the input is not + * of the appropriate type. * - * I (SGT) originally defined these two macros as follows: - -#define DEPLANARISE(x) ( (x) == (to_dbcs_planar_t)NULL, (to_dbcs_t)(x) ) -#define REPLANARISE(x) ( (x) == (to_dbcs_t)NULL, (to_dbcs_planar_t)(x) ) - - * When compiled with gcc, this had the effect of type-checking the - * input, so that DEPLANARISE would cast a to_dbcs_t to a - * to_dbcs_planar_t but cause a compile error if passed any other - * input type, and vice versa. However, MSVC felt that this was a - * non-constant expression and hence not legal to use in a static - * initialiser, and probably rightly so: I haven't had a chance to - * check with the C standard, but I'd be surprised if it _required_ - * compilers to keep an open mind long enough to discover that the - * non-constant part of the expression has its result thrown away. + * Defining these portably is quite fiddly. My first effort was as + * follows: + * #define DEPLANARISE(x) ( (x) == (to_dbcs_planar_t)NULL, (to_dbcs_t)(x) ) * - * I can't think of any other means of performing this type check - * which doesn't have the same problem, so I'm taking the type - * checks out, with regret. + * so that the comparison on the left of the comma provokes the + * type check error, and the cast on the right is the actual + * desired result. + * + * gcc was entirely happy with this. However, when used in a static + * initialiser, MSVC objected - justifiably - that the first half + * of the comma expression wasn't constant and thus the expression + * as a whole was not a constant expression. We can get round this + * by enclosing the comparison in `sizeof', so that it isn't + * actually evaluated. + * + * But then we run into a second problem, which is that C actually + * disallows the use of the comma operator within a constant + * expression for any purpose at all! Presumably this is on the + * basis that its purpose is to have side effects and constant + * expressions can't; unfortunately, this specific case is one in + * which the desired side effect is a compile-time rather than a + * run-time one. + * + * We are permitted to use ?:, however, and that works quite well + * since the actual result of the sizeof expression _is_ evaluable + * at compile time. So here's my final answer: */ -#define DEPLANARISE(x) ( (to_dbcs_t)(x) ) -#define REPLANARISE(x) ( (to_dbcs_planar_t)(x) ) +#define TYPECHECK(x,y) ( sizeof((x)) == sizeof((x)) ? (y) : (y) ) +#define DEPLANARISE(x) TYPECHECK((x) == (to_dbcs_planar_t)NULL, (to_dbcs_t)(x)) +#define REPLANARISE(x) TYPECHECK((x) == (to_dbcs_t)NULL, (to_dbcs_planar_t)(x)) /* * Values used in the `enable' field. Each of these identifies a @@ -126,18 +136,18 @@ const struct iso2022_subcharset { * text, we'll use a preference order which matches that. So we * begin with the charsets defined in the compound text spec. */ - { S4, 0, 'B', CCS, 0x00, &sbcsdata_CS_ASCII }, - { S6, 0, 'A', CCS, 0x80, &sbcsdata_CS_ISO8859_1 }, - { S6, 0, 'B', CCS, 0x80, &sbcsdata_CS_ISO8859_2 }, - { S6, 0, 'C', CCS, 0x80, &sbcsdata_CS_ISO8859_3 }, - { S6, 0, 'D', CCS, 0x80, &sbcsdata_CS_ISO8859_4 }, - { S6, 0, 'F', CCS, 0x80, &sbcsdata_CS_ISO8859_7 }, - { S6, 0, 'G', CCS, 0x80, &sbcsdata_CS_ISO8859_6 }, - { S6, 0, 'H', CCS, 0x80, &sbcsdata_CS_ISO8859_8 }, - { S6, 0, 'L', CCS, 0x80, &sbcsdata_CS_ISO8859_5 }, - { S6, 0, 'M', CCS, 0x80, &sbcsdata_CS_ISO8859_9 }, - { S4, 0, 'I', CCS, 0x80, &sbcsdata_CS_JISX0201 }, - { S4, 0, 'J', CCS, 0x00, &sbcsdata_CS_JISX0201 }, + { S4, 0, 'B', CCS, 0x00, &sbcsdata_CS_ASCII, NULL, NULL, 0 }, + { S6, 0, 'A', CCS, 0x80, &sbcsdata_CS_ISO8859_1, NULL, NULL, 0 }, + { S6, 0, 'B', CCS, 0x80, &sbcsdata_CS_ISO8859_2, NULL, NULL, 0 }, + { S6, 0, 'C', CCS, 0x80, &sbcsdata_CS_ISO8859_3, NULL, NULL, 0 }, + { S6, 0, 'D', CCS, 0x80, &sbcsdata_CS_ISO8859_4, NULL, NULL, 0 }, + { S6, 0, 'F', CCS, 0x80, &sbcsdata_CS_ISO8859_7, NULL, NULL, 0 }, + { S6, 0, 'G', CCS, 0x80, &sbcsdata_CS_ISO8859_6, NULL, NULL, 0 }, + { S6, 0, 'H', CCS, 0x80, &sbcsdata_CS_ISO8859_8, NULL, NULL, 0 }, + { S6, 0, 'L', CCS, 0x80, &sbcsdata_CS_ISO8859_5, NULL, NULL, 0 }, + { S6, 0, 'M', CCS, 0x80, &sbcsdata_CS_ISO8859_9, NULL, NULL, 0 }, + { S4, 0, 'I', CCS, 0x80, &sbcsdata_CS_JISX0201, NULL, NULL, 0 }, + { S4, 0, 'J', CCS, 0x00, &sbcsdata_CS_JISX0201, NULL, NULL, 0 }, { M4, 0, 'A', CCS, -0x21, 0, &gb2312_to_unicode, &unicode_to_gb2312, -1 }, { M4, 0, 'B', CCS, -0x21, 0, &jisx0208_to_unicode, &unicode_to_jisx0208, -1 }, { M4, 0, 'C', CCS, -0x21, 0, &ksx1001_to_unicode, &unicode_to_ksx1001, -1 }, @@ -147,13 +157,13 @@ const struct iso2022_subcharset { * Next, other reasonably standard things: the rest of the ISO * 8859 sets, UK-ASCII, and CNS 11643. */ - { S6, 0, 'T', COS, 0x80, &sbcsdata_CS_ISO8859_11 }, - { S6, 0, 'V', COS, 0x80, &sbcsdata_CS_ISO8859_10 }, - { S6, 0, 'Y', COS, 0x80, &sbcsdata_CS_ISO8859_13 }, - { S6, 0, '_', COS, 0x80, &sbcsdata_CS_ISO8859_14 }, - { S6, 0, 'b', COS, 0x80, &sbcsdata_CS_ISO8859_15 }, - { S6, 0, 'f', COS, 0x80, &sbcsdata_CS_ISO8859_16 }, - { S4, 0, 'A', COS, 0x00, &sbcsdata_CS_BS4730 }, + { S6, 0, 'T', COS, 0x80, &sbcsdata_CS_ISO8859_11, NULL, NULL, 0 }, + { S6, 0, 'V', COS, 0x80, &sbcsdata_CS_ISO8859_10, NULL, NULL, 0 }, + { S6, 0, 'Y', COS, 0x80, &sbcsdata_CS_ISO8859_13, NULL, NULL, 0 }, + { S6, 0, '_', COS, 0x80, &sbcsdata_CS_ISO8859_14, NULL, NULL, 0 }, + { S6, 0, 'b', COS, 0x80, &sbcsdata_CS_ISO8859_15, NULL, NULL, 0 }, + { S6, 0, 'f', COS, 0x80, &sbcsdata_CS_ISO8859_16, NULL, NULL, 0 }, + { S4, 0, 'A', COS, 0x00, &sbcsdata_CS_BS4730, NULL, NULL, 0 }, { M4, 0, 'G', COS, -0x21, 0, &cns11643_1_to_unicode, DEPLANARISE(&unicode_to_cns11643), 0 }, { M4, 0, 'H', COS, -0x21, 0, &cns11643_2_to_unicode, DEPLANARISE(&unicode_to_cns11643), 1 }, { M4, 0, 'I', COS, -0x21, 0, &cns11643_3_to_unicode, DEPLANARISE(&unicode_to_cns11643), 2 }, @@ -166,8 +176,8 @@ const struct iso2022_subcharset { * Private-use designations: DEC private sets and Emacs's Big5 * abomination. */ - { S4, 0, '0', CPU, 0x00, &sbcsdata_CS_DEC_GRAPHICS }, - { S4, 0, '<', CPU, 0x80, &sbcsdata_CS_DEC_MCS }, + { S4, 0, '0', CPU, 0x00, &sbcsdata_CS_DEC_GRAPHICS, NULL, NULL, 0 }, + { S4, 0, '<', CPU, 0x80, &sbcsdata_CS_DEC_MCS, NULL, NULL, 0 }, { M4, 0, '0', CPU, -0x21, 0, &emacs_big5_1_to_unicode, DEPLANARISE(&unicode_to_emacs_big5), 1 }, { M4, 0, '1', CPU, -0x21, 0, &emacs_big5_2_to_unicode, DEPLANARISE(&unicode_to_emacs_big5), 2 }, @@ -183,8 +193,8 @@ const struct iso2022_subcharset { /* * Finally, fallback entries for null character sets. */ - { S4, 0, '~', CNU }, - { S6, 0, '~', CNU }, /* empty 96-set */ + { S4, 0, '~', CNU, 0, NULL, NULL, NULL, 0 }, + { S6, 0, '~', CNU, 0, NULL, NULL, NULL, 0 }, /* empty 96-set */ { M4, 0, '~', CNU, 0, 0, &null_dbcs_to_unicode, &unicode_to_null_dbcs, -1 }, /* empty 94^n-set */ { M6, 0, '~', CNU, 0, 0, &null_dbcs_to_unicode, &unicode_to_null_dbcs, -1 }, /* empty 96^n-set */ }; @@ -530,9 +540,9 @@ static void read_iso2022(charset_spec const *charset, long int input_chr, #define LEFT 30 #define RIGHT 28 #define LOCKING_SHIFT(n,side) \ - (state->s1 = (state->s1 & ~(3L<<(side))) | ((n ## L)<<(side))) -#define MODE ((state->s0 & 0xe0000000L) >> 29) -#define ENTER_MODE(m) (state->s0 = (state->s0 & ~0xe0000000L) | ((m)<<29)) + (state->s1 = (state->s1 & ~(3UL<<(side))) | ((n ## UL)<<(side))) +#define MODE ((state->s0 & 0xe0000000UL) >> 29) +#define ENTER_MODE(m) (state->s0 = (state->s0 & ~0xe0000000UL) | ((unsigned long)(m)<<29)) #define SINGLE_SHIFT(n) ENTER_MODE(SS2CHAR - 2 + (n)) #define ASSERT_IDLE do { \ if (state->s0 != 0) emit(emitctx, ERROR); \