From e34ba5c3b8a7bcb8fceb437125da3a6a6f6d2dba Mon Sep 17 00:00:00 2001 From: simon Date: Mon, 19 Apr 2004 17:09:49 +0000 Subject: [PATCH] Support for \cfg{input-charset}. Input files can now be in ASCII, 8859-*, UTF-8, or a variety of more fun encodings including various multibyte ones. git-svn-id: svn://svn.tartarus.org/sgt/halibut@4095 cda61777-01e9-0310-a592-d414129be87e --- doc/index.but | 3 ++ doc/input.but | 67 ++++++++++++++++++++++++++------------- halibut.h | 6 ++++ input.c | 97 +++++++++++++++++++++++++++++++++++++++++---------------- inputs/test.but | 20 +++++++++++- main.c | 1 + 6 files changed, 145 insertions(+), 49 deletions(-) diff --git a/doc/index.but b/doc/index.but index 7b87c13..2cf1ebc 100644 --- a/doc/index.but +++ b/doc/index.but @@ -221,6 +221,9 @@ directive \IM{\\cfg\{appendix\}} \c{appendix} configuration directive \IM{\\cfg\{appendix\}} \cw{\\cfg\{appendix\}} +\IM{\\cfg\{input-charset\}} \c{input-charset} configuration directive +\IM{\\cfg\{input-charset\}} \cw{\\cfg\{input-charset\}} + \IM{configuring heading display} configuring heading display \IM{configuring heading display} headings, configuring display \IM{configuring heading display} section headings, configuring display diff --git a/doc/input.but b/doc/input.but index daf8092..19eea1f 100644 --- a/doc/input.but +++ b/doc/input.but @@ -52,6 +52,10 @@ and Halibut would generate the text This \\ is a backslash, and these are \{braces\}. } +If you want to write your input file in a character set other than +ASCII, you can do so by using the \c{\\cfg\{input-charset\}} +command. See \k{input-config} for details of this. + \H{input-inline} Simple \i{inline formatting commands} Halibut formatting commands all begin with a backslash, followed by @@ -190,12 +194,13 @@ Here is some \q{text in quotes}. and in every output format Halibut generates, it will choose the best quote characters available to it in that format. 
-You can still use ordinary ASCII \i{double quotes} if you prefer; or -you could even use the \c{\\u} command (see \k{input-unicode}) to -generate \i{Unicode matched quotes} (single or double) and fall back -to the normal ASCII one if they aren't available. But I recommend -using the built-in \c{\\q} command in most cases, because it's -simple and does the best it can everywhere. +You can still use the ordinary quote characters of your choice if +you prefer; or you could even use the \c{\\u} command (see +\k{input-unicode}) to generate \i{Unicode matched quotes} (single or +double) in a way which will automatically fall back to the normal +ASCII one if they aren't available. But I recommend using the +built-in \c{\\q} command in most cases, because it's simple and does +the best it can everywhere. (Note that if you're using the \c{\\c} or \c{\\cw} commands to display literal computer code, you probably \e{will} want to use @@ -324,23 +329,18 @@ indexing.) \S{input-unicode} \c{\\u}: Specifying arbitrary \i{Unicode} characters -When Halibut is finished, it should have full Unicode support. You -should be able to specify any (reasonably well known) \i{character -set} for your input document, and Halibut should convert it all to -Unicode as it reads it in. Similarly, you should be able to specify -the character set you want for each output format and have all the -conversion done automatically. - -Currently, none of this is actually supported. Input text files are -assumed to be in \i{ISO 8859-1}, and each output format has its own -non-configurable character set (although the HTML output can use the -\c{&#1234;} mechanism to output any Unicode character it likes). +Halibut has extensive support for Unicode and character set +conversion. You can specify any (reasonably well known) \i{character +set} for your input document, and Halibut will convert it all to +Unicode as it reads it in. See \k{input-config} for more details of +this. 
If you need to specify a Unicode character in your input document -which is not supported by the input character set, you can use the -\i\c{\\u} command to do this. \c{\\u} expects to be followed by a -sequence of hex digits; so that \c{\\u0041}, for example, denotes -the Unicode character \cw{0x0041}, which is the capital letter A. +which is not supported by the input character set you have chosen, +you can use the \i\c{\\u} command to do this. \c{\\u} expects to be +followed by a sequence of hex digits; so that \c{\\u0041}, for +example, denotes the Unicode character \cw{0x0041}, which is the +capital letter A. If a Unicode character specified in this way is not supported in a particular \e{output} format, you probably don't just want it to be @@ -1273,6 +1273,31 @@ subsections of a chapter. \dd Exactly like \c{chapter}, but changes the name given to appendices. +\dt \I\cw{\\cfg\{input-charset\}}\cw{\\cfg\{input-charset\}\{}\e{character set name}\cw{\}} + +\dd This tells Halibut what \i{character set} you are writing your +input file in. By default, it is assumed to be US-ASCII (meaning +\e{only} plain \i{ASCII}, with no accented characters at all). + +\lcont{ + +You can specify any well-known name for any supported character set. +For example, \c{iso-8859-1}, \c{iso8859-1} and \c{iso_8859-1} are +all recognised, \c{GB2312} and \c{EUC-CN} both work, and so on. + +This directive takes effect immediately after the \c{\\cfg} command. +All text after that in the file is expected to be in the new +character set. You can even change character set several times +within a file if you really want to. + +When Halibut reads the input file, everything you type will be +converted into \i{Unicode} from the character set you specify here, +will be processed as Unicode by Halibut internally, and will be +written to the various output formats in whatever character sets +they deem appropriate. 
+ +} + In addition to these configuration commands, there are also configuration commands provided by each individual output format. These configuration commands are discussed along with each output diff --git a/halibut.h b/halibut.h index 9abc13f..9aa2c59 100644 --- a/halibut.h +++ b/halibut.h @@ -6,6 +6,8 @@ #include #include +#include "charset.h" + #ifdef __GNUC__ #define NORETURN __attribute__((__noreturn__)) #else @@ -66,6 +68,10 @@ struct input_Tag { filepos pos; int reportcols; /* report column numbers in errors */ macrostack *stack; /* macro expansions in force */ + int defcharset, charset; /* character sets for input files */ + charset_state csstate; + wchar_t wc[16]; /* wide chars from input conversion */ + int nwc, wcpos; /* size of, and position in, wc[] */ }; /* diff --git a/input.c b/input.c index efce410..d607e86 100644 --- a/input.c +++ b/input.c @@ -82,6 +82,16 @@ static void macrocleanup(tree234 *macros) { freetree234(macros); } +static void input_configure(input *in, paragraph *cfg) { + assert(cfg->type == para_Config); + + if (!ustricmp(cfg->keyword, L"input-charset")) { + char *csname = utoa_dup(uadv(cfg->keyword)); + in->charset = charset_from_localenc(csname); + sfree(csname); + } +} + /* * Can return EOF */ @@ -103,36 +113,63 @@ static int get(input *in, filepos *pos) { return c; } else if (in->currfp) { - int c = getc(in->currfp); - if (c == EOF) { - fclose(in->currfp); - in->currfp = NULL; - } - /* Track line numbers, for error reporting */ - if (pos) - *pos = in->pos; - if (in->reportcols) { - switch (c) { - case '\t': - in->pos.col = 1 + (in->pos.col + TAB_STOP-1) % TAB_STOP; - break; - case '\n': - in->pos.col = 1; - in->pos.line++; - break; - default: - in->pos.col++; - break; + while (in->wcpos >= in->nwc) { + + int c = getc(in->currfp); + + if (c == EOF) { + fclose(in->currfp); + in->currfp = NULL; + return EOF; + } + /* Track line numbers, for error reporting */ + if (pos) + *pos = in->pos; + if (in->reportcols) { + switch (c) { + 
case '\t': + in->pos.col = 1 + (in->pos.col + TAB_STOP-1) % TAB_STOP; + break; + case '\n': + in->pos.col = 1; + in->pos.line++; + break; + default: + in->pos.col++; + break; + } + } else { + in->pos.col = -1; + if (c == '\n') + in->pos.line++; + } + + /* + * Do input character set translation, so that we return + * Unicode. + */ + { + char buf[1]; + char const *p; + int inlen; + + buf[0] = (char)c; + p = buf; + inlen = 1; + + in->nwc = charset_to_unicode(&p, &inlen, + in->wc, lenof(in->wc), + in->charset, &in->csstate, + NULL, 0); + assert(p == buf+1 && inlen == 0); + + in->wcpos = 0; } - } else { - in->pos.col = -1; - if (c == '\n') - in->pos.line++; } - /* FIXME: do input charmap translation. We should be returning - * Unicode here. */ - return c; + + return in->wc[in->wcpos++]; + } else return EOF; } @@ -884,6 +921,10 @@ static void read_file(paragraph ***ret, input *in, indexdata *idx) { already = TRUE;/* inhibit get_token at top of loop */ prev_para_type = par.type; addpara(par, ret); + + if (par.type == para_Config) { + input_configure(in, &par); + } continue; /* next paragraph */ } } @@ -1421,6 +1462,8 @@ paragraph *read_input(input *in, indexdata *idx) { in->currfp = fopen(in->filenames[in->currindex], "r"); if (in->currfp) { setpos(in, in->filenames[in->currindex]); + in->charset = in->defcharset; + in->csstate = charset_init_state; read_file(&hptr, in, idx); } in->currindex++; diff --git a/inputs/test.but b/inputs/test.but index 43bddc9..7b0df07 100644 --- a/inputs/test.but +++ b/inputs/test.but @@ -32,7 +32,7 @@ a bit] \define{eur} \u20AC{EUR } -\versionid $Id: test.but,v 1.26 2004/04/09 18:47:33 simon Exp $ +\versionid $Id: test.but,v 1.27 2004/04/19 17:09:49 simon Exp $ \C{ch\\ap} First chapter title; for similar wrapping reasons this chapter title will be ludicrously long. I wonder how much more @@ -168,6 +168,24 @@ And a nested macro: \metacoopt. A slightly more difficult macro: \eur\.2500. +Test of input character set switching. 
+ +\b 8859 character in ASCII mode: expect nothing useful. [coöpt] + +\cfg{input-charset}{ISO-8859-1} + +\b 8859 character in 8859 mode: expect the right thing. [coöpt] + +\cfg{input-charset}{UTF-8} + +\b 8859 character in UTF-8 mode: expect the wrong thing. [coöpt] + +\b UTF-8 sequence in UTF-8 mode: expect the right thing again. [coöpt] + +\cfg{input-charset}{ASCII} + +Back to ASCII again. + Oh, while I'm here: some special characters. The \\, \{ and \} characters, to be precise. And their code equivalents, \c{\\}, \i\c{\{}, \c{\}}. diff --git a/main.c b/main.c index 9af17b3..182cbec 100644 --- a/main.c +++ b/main.c @@ -270,6 +270,7 @@ int main(int argc, char **argv) { in.pushback = NULL; in.reportcols = reportcols; in.stack = NULL; + in.defcharset = CS_ASCII; idx = make_index(); -- 2.11.0