* iso2022.c - support for ISO/IEC 2022 (alias ECMA-35).
*
* This isn't a complete implementation of ISO/IEC 2022, but it's
- * close. It only handles decoding, because a fully general encoder
- * isn't really useful. It can decode 8-bit and 7-bit versions, with
- * support for single-byte and multi-byte character sets, all four
- * containers (G0, G1, G2, and G3), using both single-shift and
- * locking-shift sequences.
+ * close. It can decode 8-bit and 7-bit versions, with support for
+ * single-byte and multi-byte character sets, all four containers
+ * (G0, G1, G2, and G3), using both single-shift and locking-shift
+ * sequences.
*
* The general principle is that any valid ISO/IEC 2022 sequence
* should either be correctly decoded or should emit an ERROR. The
#ifndef ENUM_CHARSETS
#include <assert.h>
+#include <string.h>
#include "charset.h"
#include "internal.h"
/*
 * Pointer to a function taking a long int and two int pointers and
 * returning int. unicode_to_null_dbcs in this file has this shape
 * (Unicode value in, two output pointers, returns 0 on failure).
 */
typedef int (*to_dbcs_t)(long int, int *, int *);
/*
 * As to_dbcs_t, but with a third int pointer parameter. The
 * DEPLANARISE and REPLANARISE macros cast between the two types.
 * NOTE(review): presumably the extra parameter carries a plane
 * number - confirm against the planar converters.
 */
typedef int (*to_dbcs_planar_t)(long int, int *, int *, int *);
-/* Cast between to_dbcs_planar_t and to_dbcs_t, type-checking first */
-#define DEPLANARISE(x) ( (x) == (to_dbcs_planar_t)NULL, (to_dbcs_t)(x) )
-#define REPLANARISE(x) ( (x) == (to_dbcs_t)NULL, (to_dbcs_planar_t)(x) )
+/*
+ * These macros cast between to_dbcs_planar_t and to_dbcs_t, in
+ * such a way as to cause a compile-time error if the input is not
+ * of the appropriate type.
+ *
+ * Defining these portably is quite fiddly. My first effort was as
+ * follows:
+ * #define DEPLANARISE(x) ( (x) == (to_dbcs_planar_t)NULL, (to_dbcs_t)(x) )
+ *
+ * so that the comparison on the left of the comma provokes the
+ * type check error, and the cast on the right is the actual
+ * desired result.
+ *
+ * gcc was entirely happy with this. However, when used in a static
+ * initialiser, MSVC objected - justifiably - that the first half
+ * of the comma expression wasn't constant and thus the expression
+ * as a whole was not a constant expression. We can get round this
+ * by enclosing the comparison in `sizeof', so that it isn't
+ * actually evaluated.
+ *
+ * But then we run into a second problem, which is that C actually
+ * disallows the use of the comma operator within a constant
+ * expression for any purpose at all! Presumably this is on the
+ * basis that its purpose is to have side effects and constant
+ * expressions can't; unfortunately, this specific case is one in
+ * which the desired side effect is a compile-time rather than a
+ * run-time one.
+ *
+ * We are permitted to use ?:, however, and that works quite well
+ * since the actual result of the sizeof expression _is_ evaluable
+ * at compile time. So here's my final answer:
+ */
/*
 * TYPECHECK(x,y) has y in both arms of the ?:, so its value never
 * depends on x; x appears only as a sizeof operand, so the compiler
 * type-checks it without evaluating it.
 *
 * DEPLANARISE and REPLANARISE pass a comparison of their argument
 * against a null pointer of the expected function-pointer type as
 * x (intended to provoke a compile-time diagnostic if the argument
 * has the wrong type), and the actual cast as y.
 */
+#define TYPECHECK(x,y) ( sizeof((x)) == sizeof((x)) ? (y) : (y) )
+#define DEPLANARISE(x) TYPECHECK((x) == (to_dbcs_planar_t)NULL, (to_dbcs_t)(x))
+#define REPLANARISE(x) TYPECHECK((x) == (to_dbcs_t)NULL, (to_dbcs_planar_t)(x))
/*
* Values used in the `enable' field. Each of these identifies a
* text, we'll use a preference order which matches that. So we
* begin with the charsets defined in the compound text spec.
*/
- { S4, 0, 'B', CCS, 0x00, &sbcsdata_CS_ASCII },
- { S6, 0, 'A', CCS, 0x80, &sbcsdata_CS_ISO8859_1 },
- { S6, 0, 'B', CCS, 0x80, &sbcsdata_CS_ISO8859_2 },
- { S6, 0, 'C', CCS, 0x80, &sbcsdata_CS_ISO8859_3 },
- { S6, 0, 'D', CCS, 0x80, &sbcsdata_CS_ISO8859_4 },
- { S6, 0, 'F', CCS, 0x80, &sbcsdata_CS_ISO8859_7 },
- { S6, 0, 'G', CCS, 0x80, &sbcsdata_CS_ISO8859_6 },
- { S6, 0, 'H', CCS, 0x80, &sbcsdata_CS_ISO8859_8 },
- { S6, 0, 'L', CCS, 0x80, &sbcsdata_CS_ISO8859_5 },
- { S6, 0, 'M', CCS, 0x80, &sbcsdata_CS_ISO8859_9 },
- { S4, 0, 'I', CCS, 0x80, &sbcsdata_CS_JISX0201 },
- { S4, 0, 'J', CCS, 0x00, &sbcsdata_CS_JISX0201 },
+ { S4, 0, 'B', CCS, 0x00, &sbcsdata_CS_ASCII, NULL, NULL, 0 },
+ { S6, 0, 'A', CCS, 0x80, &sbcsdata_CS_ISO8859_1, NULL, NULL, 0 },
+ { S6, 0, 'B', CCS, 0x80, &sbcsdata_CS_ISO8859_2, NULL, NULL, 0 },
+ { S6, 0, 'C', CCS, 0x80, &sbcsdata_CS_ISO8859_3, NULL, NULL, 0 },
+ { S6, 0, 'D', CCS, 0x80, &sbcsdata_CS_ISO8859_4, NULL, NULL, 0 },
+ { S6, 0, 'F', CCS, 0x80, &sbcsdata_CS_ISO8859_7, NULL, NULL, 0 },
+ { S6, 0, 'G', CCS, 0x80, &sbcsdata_CS_ISO8859_6, NULL, NULL, 0 },
+ { S6, 0, 'H', CCS, 0x80, &sbcsdata_CS_ISO8859_8, NULL, NULL, 0 },
+ { S6, 0, 'L', CCS, 0x80, &sbcsdata_CS_ISO8859_5, NULL, NULL, 0 },
+ { S6, 0, 'M', CCS, 0x80, &sbcsdata_CS_ISO8859_9, NULL, NULL, 0 },
+ { S4, 0, 'I', CCS, 0x80, &sbcsdata_CS_JISX0201, NULL, NULL, 0 },
+ { S4, 0, 'J', CCS, 0x00, &sbcsdata_CS_JISX0201, NULL, NULL, 0 },
{ M4, 0, 'A', CCS, -0x21, 0, &gb2312_to_unicode, &unicode_to_gb2312, -1 },
{ M4, 0, 'B', CCS, -0x21, 0, &jisx0208_to_unicode, &unicode_to_jisx0208, -1 },
{ M4, 0, 'C', CCS, -0x21, 0, &ksx1001_to_unicode, &unicode_to_ksx1001, -1 },
* Next, other reasonably standard things: the rest of the ISO
* 8859 sets, UK-ASCII, and CNS 11643.
*/
- { S6, 0, 'T', COS, 0x80, &sbcsdata_CS_ISO8859_11 },
- { S6, 0, 'V', COS, 0x80, &sbcsdata_CS_ISO8859_10 },
- { S6, 0, 'Y', COS, 0x80, &sbcsdata_CS_ISO8859_13 },
- { S6, 0, '_', COS, 0x80, &sbcsdata_CS_ISO8859_14 },
- { S6, 0, 'b', COS, 0x80, &sbcsdata_CS_ISO8859_15 },
- { S6, 0, 'f', COS, 0x80, &sbcsdata_CS_ISO8859_16 },
- { S4, 0, 'A', COS, 0x00, &sbcsdata_CS_BS4730 },
+ { S6, 0, 'T', COS, 0x80, &sbcsdata_CS_ISO8859_11, NULL, NULL, 0 },
+ { S6, 0, 'V', COS, 0x80, &sbcsdata_CS_ISO8859_10, NULL, NULL, 0 },
+ { S6, 0, 'Y', COS, 0x80, &sbcsdata_CS_ISO8859_13, NULL, NULL, 0 },
+ { S6, 0, '_', COS, 0x80, &sbcsdata_CS_ISO8859_14, NULL, NULL, 0 },
+ { S6, 0, 'b', COS, 0x80, &sbcsdata_CS_ISO8859_15, NULL, NULL, 0 },
+ { S6, 0, 'f', COS, 0x80, &sbcsdata_CS_ISO8859_16, NULL, NULL, 0 },
+ { S4, 0, 'A', COS, 0x00, &sbcsdata_CS_BS4730, NULL, NULL, 0 },
{ M4, 0, 'G', COS, -0x21, 0, &cns11643_1_to_unicode, DEPLANARISE(&unicode_to_cns11643), 0 },
{ M4, 0, 'H', COS, -0x21, 0, &cns11643_2_to_unicode, DEPLANARISE(&unicode_to_cns11643), 1 },
{ M4, 0, 'I', COS, -0x21, 0, &cns11643_3_to_unicode, DEPLANARISE(&unicode_to_cns11643), 2 },
* Private-use designations: DEC private sets and Emacs's Big5
* abomination.
*/
- { S4, 0, '0', CPU, 0x00, &sbcsdata_CS_DEC_GRAPHICS },
- { S4, 0, '<', CPU, 0x80, &sbcsdata_CS_DEC_MCS },
+ { S4, 0, '0', CPU, 0x00, &sbcsdata_CS_DEC_GRAPHICS, NULL, NULL, 0 },
+ { S4, 0, '<', CPU, 0x80, &sbcsdata_CS_DEC_MCS, NULL, NULL, 0 },
{ M4, 0, '0', CPU, -0x21, 0, &emacs_big5_1_to_unicode, DEPLANARISE(&unicode_to_emacs_big5), 1 },
{ M4, 0, '1', CPU, -0x21, 0, &emacs_big5_2_to_unicode, DEPLANARISE(&unicode_to_emacs_big5), 2 },
/*
* Finally, fallback entries for null character sets.
*/
- { S4, 0, '~', CNU },
- { S6, 0, '~', CNU }, /* empty 96-set */
+ { S4, 0, '~', CNU, 0, NULL, NULL, NULL, 0 },
+ { S6, 0, '~', CNU, 0, NULL, NULL, NULL, 0 }, /* empty 96-set */
{ M4, 0, '~', CNU, 0, 0, &null_dbcs_to_unicode, &unicode_to_null_dbcs, -1 }, /* empty 94^n-set */
{ M6, 0, '~', CNU, 0, 0, &null_dbcs_to_unicode, &unicode_to_null_dbcs, -1 }, /* empty 96^n-set */
};
/*
 * DBCS-to-Unicode function used by the fallback table entries for
 * the empty multi-byte sets. Both arguments are ignored and ERROR
 * is always returned.
 * NOTE(review): UNUSEDARG is defined elsewhere; presumably it only
 * suppresses unused-parameter warnings - confirm.
 */
static long int null_dbcs_to_unicode(int r, int c)
{
+ UNUSEDARG(r);
+ UNUSEDARG(c);
return ERROR;
}
/*
 * Unicode-to-DBCS function used by the fallback table entries for
 * the empty multi-byte sets. All three arguments are ignored, *r
 * and *c are never written, and 0 is always returned to report
 * that nothing was converted.
 * NOTE(review): UNUSEDARG is defined elsewhere; presumably it only
 * suppresses unused-parameter warnings - confirm.
 */
static int unicode_to_null_dbcs(long int unicode, int *r, int *c)
{
+ UNUSEDARG(unicode);
+ UNUSEDARG(r);
+ UNUSEDARG(c);
return 0; /* failed to convert anything */
}
if (input_chr == 2)
state->s0 = (state->s0 & 0xf0000000) | (i << 26) | (0xf << 22);
} else if (n != 0xf) {
- while (j < lenof(ctext_encodings) &&
+ while ((unsigned)j < lenof(ctext_encodings) &&
!memcmp(ctext_encodings[j].name,
ctext_encodings[oi].name, n)) {
if (ctext_encodings[j].name[n] < input_chr)
else
break;
}
- if (i >= lenof(ctext_encodings) ||
+ if ((unsigned)i >= lenof(ctext_encodings) ||
memcmp(ctext_encodings[i].name,
ctext_encodings[oi].name, n) ||
ctext_encodings[i].name[n] != input_chr) {
assert(i < 4 && n < 16);
state->s0 = (state->s0 & 0xf0000000) | (i << 26) | (n << 22);
} else {
- if (i >= lenof(ctext_encodings))
+ if ((unsigned)i >= lenof(ctext_encodings))
emit(emitctx, ERROR);
else {
charset_state substate;
/*
 * Helpers for the packed fields in state->s0 and state->s1.
 *
 * LEFT and RIGHT are bit positions for use as the `side' argument
 * of LOCKING_SHIFT, which replaces the two-bit field of state->s1
 * at that position with n. An integer suffix is token-pasted onto
 * n, so n must be written as a plain integer literal.
 *
 * MODE reads, and ENTER_MODE writes, the top three bits of
 * state->s0 (mask 0xe0000000, shift 29). SINGLE_SHIFT(n) enters
 * the mode numbered SS2CHAR - 2 + n.
 */
#define LEFT 30
#define RIGHT 28
#define LOCKING_SHIFT(n,side) \
- (state->s1 = (state->s1 & ~(3L<<(side))) | ((n ## L)<<(side)))
-#define MODE ((state->s0 & 0xe0000000L) >> 29)
-#define ENTER_MODE(m) (state->s0 = (state->s0 & ~0xe0000000L) | ((m)<<29))
+ (state->s1 = (state->s1 & ~(3UL<<(side))) | ((n ## UL)<<(side)))
+#define MODE ((state->s0 & 0xe0000000UL) >> 29)
+#define ENTER_MODE(m) (state->s0 = (state->s0 & ~0xe0000000UL) | ((unsigned long)(m)<<29))
#define SINGLE_SHIFT(n) ENTER_MODE(SS2CHAR - 2 + (n))
#define ASSERT_IDLE do { \
if (state->s0 != 0) emit(emitctx, ERROR); \
int shift = (right ? 31-7 : 31-7-7);
struct iso2022_subcharset const *subcs = &iso2022_subcharsets[i];
- if (((state->s1 >> shift) & 0x7F) != i) {
+ if (((state->s1 >> shift) & 0x7F) != (unsigned)i) {
state->s1 &= ~(0x7FL << shift);
state->s1 |= (i << shift);
/*
* Start with US-ASCII in GL and also in GR.
*/
- for (i = 0; i < lenof(iso2022_subcharsets); i++) {
+ for (i = 0; (unsigned)i < lenof(iso2022_subcharsets); i++) {
subcs = &iso2022_subcharsets[i];
if (subcs->type == mode->ltype &&
subcs->i == mode->li &&
*/
docs_char(state, emit, emitctx, -2, NULL, 0); /* leave DOCS */
- for (i = 0; i < lenof(iso2022_subcharsets); i++) {
+ for (i = 0; (unsigned)i < lenof(iso2022_subcharsets); i++) {
subcs = &iso2022_subcharsets[i];
if (subcs->type == mode->ltype &&
subcs->i == mode->li &&
* Analyse the input character and work out which subcharset it
* belongs to.
*/
- for (i = 0; i < lenof(iso2022_subcharsets); i++) {
+ for (i = 0; (unsigned)i < lenof(iso2022_subcharsets); i++) {
subcs = &iso2022_subcharsets[i];
if (!(mode->enable_mask & (1 << subcs->enable)))
continue; /* this charset is disabled */
}
}
- if (i < lenof(iso2022_subcharsets)) {
+ if ((unsigned)i < lenof(iso2022_subcharsets)) {
int right;
/*
cs = -2; /* means failure */
- for (i = 0; i <= lenof(ctext_encodings); i++) {
+ for (i = 0; (unsigned)i <= lenof(ctext_encodings); i++) {
charset_state substate;
charset_spec const *subcs = ctext_encodings[i].subcs;
substate.s1 = substate.s0 = 0;
p = data;
- if (i < lenof(ctext_encodings)) {
+ if ((unsigned)i < lenof(ctext_encodings)) {
if ((mode->enable_mask & (1 << ctext_encodings[i].enable)) &&
subcs->write(subcs, input_chr, &substate,
write_to_pointer, &p)) {