From 675958c383f1dfc9c2c5a7455654c0961fdf6a65 Mon Sep 17 00:00:00 2001 From: jacob Date: Thu, 17 Feb 2005 19:00:48 +0000 Subject: [PATCH] Changes/additions to input character set handling: - After discussion with Simon, change the default input charset back to ASCII, rather than trying to work it out from the locale, for the sake of promoting .but file portability. - Add a new command-line option "--input-charset=csname", which overrides the ASCII default for all input files (since there's no other way to use a non-ASCII-compatible input file). - Output a warning if -Cinput-charset:foo is specified that it has no effect. - Update the docs to match all this. Also try to clarify some other things in this area that caught me out. git-svn-id: svn://svn.tartarus.org/sgt/halibut@5332 cda61777-01e9-0310-a592-d414129be87e --- doc/input.but | 12 +++++++----- doc/manpage.but | 5 +++++ doc/running.but | 20 ++++++++++++++++++++ error.c | 11 +++++++++++ halibut.h | 2 ++ help.c | 3 ++- main.c | 19 ++++++++++++++++++- 7 files changed, 65 insertions(+), 7 deletions(-) diff --git a/doc/input.but b/doc/input.but index f0097c5..7d5eb97 100644 --- a/doc/input.but +++ b/doc/input.but @@ -1338,9 +1338,9 @@ For example, \c{iso-8859-1}, \c{iso8859-1} and \c{iso_8859-1} are all recognised, \c{GB2312} and \c{EUC-CN} both work, and so on. This directive takes effect immediately after the \c{\\cfg} command. -All text after that in the file is expected to be in the new -character set. You can even change character set several times -within a file if you really want to. +All text after that until the end of the input file is expected to be +in the new character set. You can even change character set several +times within a file if you really want to. When Halibut reads the input file, everything you type will be converted into \i{Unicode} from the character set you specify here, @@ -1403,8 +1403,10 @@ The \i{default settings} for the above options are: \c \cfg{appendix}{Appendix} \c \cfg{input-charset}{ASCII} -(The default settings for \cw{\\cfg\{quotes\}} are backend-specific; -see \k{output}.) +The default for \cw{\\cfg\{input-charset\}} can be changed with the +\cw{--input-charset} option; see \k{running-options}. The default +settings for \cw{\\cfg\{quotes\}} are backend-specific; see +\k{output}. \H{input-macro} Defining \i{macros} diff --git a/doc/manpage.but b/doc/manpage.but index 94b42d6..5889e59 100644 --- a/doc/manpage.but +++ b/doc/manpage.but @@ -106,6 +106,11 @@ Halibut. Using this directive is exactly equivalent to appending an extra input file to the command line which contains the directive \cw{\\cfg\{}\e{word}\cw{\}\{}\e{word}\cw{\}\{}\e{word}...\cw{\}}. +\dt \cw{--input-charset}\cw{=}\e{charset} + +\dd Changes the assumed character set for input files from the +default of ASCII. + \dt \cw{--precise} \dd Makes Halibut report the column number as well as the line diff --git a/doc/running.but b/doc/running.but index 37e36f3..56790ec 100644 --- a/doc/running.but +++ b/doc/running.but @@ -163,6 +163,10 @@ might have to type four backslashes on your shell command line. This is not part of Halibut's own behaviour, and it cannot do anything about it.) +Configuration directives created in this way take effect after all +other input has been processed. (In most cases, this has the effect of +overriding any other instances of the directive in the input.) + } The options which set the output file names actually work by @@ -178,6 +182,22 @@ directives.) In addition to these, there are also a few other options: +\dt \i\cw{--input-charset}\cw{=}\e{charset} + +\dd Changes the default assumed character set for all input files from +ASCII to something else. (\cw{-Cinput-charset} cannot be used for +this, as \cw{-C} directives are processed after all other input, so +wouldn't affect any files.) + +\lcont{ + +Any \cw{\\cfg\{input-charset\}} directives within input files override +this option. + +See \k{input-config} for more information about the input character set. + +} + \dt \i\cw{--help} \dd Print a brief help message and exit immediately. (Don't confuse diff --git a/error.c b/error.c index 7b9ceaa..0011b18 100644 --- a/error.c +++ b/error.c @@ -37,6 +37,17 @@ static void do_error(int code, va_list ap) { sprintf(error, "unrecognised option `-%.200s'", sp); flags = PREFIX; break; + case err_cmdcharset: + sp = va_arg(ap, char *); + sprintf(error, "character set `%.200s' not recognised", sp); + flags = PREFIX; + break; + case err_futileopt: + sp = va_arg(ap, char *); + sp2 = va_arg(ap, char *); + sprintf(error, "warning: option `-%s' has no effect%s", sp, sp2); + flags = PREFIX; + break; case err_noinput: /* no arguments */ sprintf(error, "no input files"); flags = PREFIX; diff --git a/halibut.h b/halibut.h index f76d059..8c7caed 100644 --- a/halibut.h +++ b/halibut.h @@ -206,6 +206,8 @@ enum { err_nomemory, /* out of memory */ err_optnoarg, /* option `-%s' requires an argument */ err_nosuchopt, /* unrecognised option `-%s' */ + err_cmdcharset, /* unrecognised charset %s (cmdline) */ + err_futileopt, /* futile option `-%s'%s */ err_noinput, /* no input files */ err_cantopen, /* unable to open input file `%s' */ err_nodata, /* no data in input files */ diff --git a/help.c b/help.c index 8104006..7235032 100644 --- a/help.c +++ b/help.c @@ -15,6 +15,7 @@ static char *helptext[] = { " --ps[=filename] generate PostScript output", " --pdf[=filename] generate PDF output", " -Cfoo:bar:baz append \\cfg{foo}{bar}{baz} to input", + " --input-charset=cs change default input file charset", " --precise report column numbers in error messages", " --help display this text", " --version display version number", @@ -23,7 +24,7 @@ static char *helptext[] = { }; static char *usagetext[] = { - "usage: halibut [--format[=filename]] [-Cconfig...] file.but [file.but...]", + "usage: halibut [--format[=filename]] [options] file.but [file.but...]", NULL }; diff --git a/main.c b/main.c index f09b33e..8334d15 100644 --- a/main.c +++ b/main.c @@ -43,6 +43,7 @@ int main(int argc, char **argv) { int nogo; int errs; int reportcols; + int input_charset; int debug; int backendbits, prebackbits; int k, b; @@ -58,6 +59,7 @@ int main(int argc, char **argv) { nfiles = 0; nogo = errs = FALSE; reportcols = 0; + input_charset = CS_ASCII; debug = 0; backendbits = 0; cfg = cfg_tail = NULL; @@ -113,6 +115,17 @@ int main(int argc, char **argv) { } if (k < (int)lenof(backends)) { /* do nothing */; + } else if (!strcmp(opt, "-input-charset")) { + if (!val) { + errs = TRUE, error(err_optnoarg, opt); + } else { + int charset = charset_from_localenc(val); + if (charset == CS_NONE) { + errs = TRUE, error(err_cmdcharset, val); + } else { + input_charset = charset; + } + } } else if (!strcmp(opt, "-help")) { help(); nogo = TRUE; @@ -193,6 +206,10 @@ int main(int argc, char **argv) { while (*q) { if (*q == ':') { *r = '\0'; + /* XXX ad-hoc diagnostic */ + if (!strcmp(s, "input-charset")) + error(err_futileopt, "Cinput-charset", + "; use --input-charset"); cmdline_cfg_add(para, s); r = s; } else { @@ -263,7 +280,7 @@ int main(int argc, char **argv) { in.pushback = NULL; in.reportcols = reportcols; in.stack = NULL; - in.defcharset = charset_from_locale(); + in.defcharset = input_charset; idx = make_index(); -- 2.11.0