X-Git-Url: https://git.distorted.org.uk/~mdw/sgt/halibut/blobdiff_plain/e4ea58f8cb4dccaa8e99306a3497de1e37600480..51d05cd0a6de05c0adc8986e1d534e426d6f582c:/input.c diff --git a/input.c b/input.c index 1187e94..1410022 100644 --- a/input.c +++ b/input.c @@ -18,7 +18,7 @@ static void setpos(input *in, char *fname) { static void unget(input *in, int c, filepos *pos) { if (in->npushback >= in->pushbacksize) { in->pushbacksize = in->npushback + 16; - in->pushback = resize(in->pushback, in->pushbacksize); + in->pushback = sresize(in->pushback, in->pushbacksize, pushback); } in->pushback[in->npushback].chr = c; in->pushback[in->npushback].pos = *pos; /* structure copy */ @@ -45,7 +45,7 @@ static int macrocmp(void *av, void *bv) { } static void macrodef(tree234 *macros, wchar_t *name, wchar_t *text, filepos fpos) { - macro *m = mknew(macro); + macro *m = snew(macro); m->name = name; m->text = text; if (add234(macros, m) != m) { @@ -60,7 +60,7 @@ static int macrolookup(tree234 *macros, input *in, wchar_t *name, m.name = name; gotit = find234(macros, &m, NULL); if (gotit) { - macrostack *expansion = mknew(macrostack); + macrostack *expansion = snew(macrostack); expansion->next = in->stack; expansion->text = gotit->text; expansion->pos = *pos; /* structure copy */ @@ -86,9 +86,7 @@ static void input_configure(input *in, paragraph *cfg) { assert(cfg->type == para_Config); if (!ustricmp(cfg->keyword, L"input-charset")) { - char *csname = utoa_dup(uadv(cfg->keyword), CS_ASCII); - in->charset = charset_from_localenc(csname); - sfree(csname); + in->charset = charset_from_ustr(&cfg->fpos, uadv(cfg->keyword)); } } @@ -105,6 +103,8 @@ static int get(input *in, filepos *pos, rdstringc *rsc) { } else if (in->stack) { wchar_t c = in->stack->text[in->stack->ptr]; + if (pos) + *pos = in->stack->pos; if (in->stack->text[++in->stack->ptr] == L'\0') { macrostack *tmp = in->stack; in->stack = tmp->next; @@ -119,7 +119,8 @@ static int get(input *in, filepos *pos, rdstringc *rsc) { int c = getc(in->currfp); if (c == EOF) { - fclose(in->currfp); + if (in->wantclose) + fclose(in->currfp); in->currfp = NULL; return EOF; } @@ -221,6 +222,7 @@ enum { c_c, /* code */ c_cfg, /* configuration directive */ c_copyright, /* copyright statement */ + c_cq, /* quoted code (sugar for \q{\cw{x}}) */ c_cw, /* weak code */ c_date, /* document processing date */ c_dd, /* description list: description */ @@ -291,6 +293,7 @@ static void match_kw(token *tok) { {"c", c_c}, /* code */ {"cfg", c_cfg}, /* configuration directive */ {"copyright", c_copyright}, /* copyright statement */ + {"cq", c_cq}, /* quoted code (sugar for \q{\cw{x}}) */ {"cw", c_cw}, /* weak code */ {"date", c_date}, /* document processing date */ {"dd", c_dd}, /* description list: description */ @@ -424,6 +427,7 @@ token get_token(input *in) { c == '#' || c == '{' || c == '}' || c == '.') { /* single-char command */ rdadd(&rs, c); + prevpos = rsc.pos; } else if (c == 'u') { int len = 0; do { @@ -560,7 +564,7 @@ static word *addword(word newword, word ***hptrptr) { word *mnewword; if (!hptrptr) return NULL; - mnewword = mknew(word); + mnewword = snew(word); *mnewword = newword; /* structure copy */ mnewword->next = NULL; **hptrptr = mnewword; @@ -572,7 +576,7 @@ static word *addword(word newword, word ***hptrptr) { * Adds a new paragraph to a linked list */ static paragraph *addpara(paragraph newpara, paragraph ***hptrptr) { - paragraph *mnewpara = mknew(paragraph); + paragraph *mnewpara = snew(paragraph); *mnewpara = newpara; /* structure copy */ mnewpara->next = NULL; **hptrptr = mnewpara; @@ -589,11 +593,11 @@ static paragraph *addpara(paragraph newpara, paragraph ***hptrptr) { /* * Reads a single file (ie until get() returns EOF) */ -static void read_file(paragraph ***ret, input *in, indexdata *idx) { +static void read_file(paragraph ***ret, input *in, indexdata *idx, + tree234 *macros) { token t; paragraph par; word wd, **whptr, **idximplicit; - tree234 *macros; wchar_t utext[2], *wdtext; int style, spcstyle; int already; @@ -607,11 +611,12 @@ static void read_file(paragraph ***ret, input *in, indexdata *idx) { stack_style = 2, /* \e, \c, \cw */ stack_idx = 4, /* \I, \i, \ii */ stack_hyper = 8, /* \W */ - stack_quote = 16, /* \q */ + stack_quote = 16 /* \q */ } type; word **whptr; /* to restore from \u alternatives */ word **idximplicit; /* to restore from \u alternatives */ filepos fpos; + int in_code; } *sitem; stack parsestk; struct crossparaitem { @@ -628,7 +633,6 @@ static void read_file(paragraph ***ret, input *in, indexdata *idx) { t.text = NULL; t.origtext = NULL; - macros = newtree234(macrocmp); already = FALSE; crossparastk = stk_new(); @@ -739,7 +743,7 @@ static void read_file(paragraph ***ret, input *in, indexdata *idx) { * nested lists, code paras etc). Hence, the previous * paragraph must be of a list type. */ - sitem = mknew(struct crossparaitem); + sitem = snew(struct crossparaitem); stop = (struct crossparaitem *)stk_top(crossparastk); if (stop) *sitem = *stop; @@ -769,7 +773,7 @@ static void read_file(paragraph ***ret, input *in, indexdata *idx) { * block-quoted (typically they will be indented a * bit). */ - sitem = mknew(struct crossparaitem); + sitem = snew(struct crossparaitem); stop = (struct crossparaitem *)stk_top(crossparastk); if (stop) *sitem = *stop; @@ -805,6 +809,11 @@ static void read_file(paragraph ***ret, input *in, indexdata *idx) { continue; } + while (t.type == tok_cmd && + macrolookup(macros, in, t.text, &t.pos)) { + dtor(t), t = get_token(in); + } + /* * This token begins a paragraph. See if it's one of the * special commands that define a paragraph type. @@ -827,8 +836,10 @@ static void read_file(paragraph ***ret, input *in, indexdata *idx) { needkw = 4; break; case c__comment: - if (isbrace(in)) + if (isbrace(in)) { + needkw = -1; break; /* `\#{': isn't a comment para */ + } do { dtor(t), t = get_token(in); } while (t.type != tok_eop && t.type != tok_eof); @@ -895,7 +906,21 @@ static void read_file(paragraph ***ret, input *in, indexdata *idx) { /* Get keywords. */ dtor(t), t = get_token(in); fp = t.pos; - while (t.type == tok_lbrace) { + while (t.type == tok_lbrace || + (t.type == tok_white && (needkw & 24))) { + /* + * In paragraph types which can't accept any + * body text (such as \cfg), we are lenient + * about whitespace between keywords. This is + * important for \cfg in particular since it + * can often have many keywords which are long + * pieces of text, so it's useful to permit the + * user to wrap the line between them. + */ + if (t.type == tok_white) { + dtor(t), t = get_token(in); /* eat the space */ + continue; + } /* This is a keyword. */ nkeys++; /* FIXME: there will be bugs if anyone specifies an @@ -904,11 +929,16 @@ static void read_file(paragraph ***ret, input *in, indexdata *idx) { t.type == tok_word || t.type == tok_white || (t.type == tok_cmd && t.cmd == c__nbsp) || - (t.type == tok_cmd && t.cmd == c__escaped)) { + (t.type == tok_cmd && t.cmd == c__escaped) || + (t.type == tok_cmd && t.cmd == c_u)) { if (t.type == tok_white || (t.type == tok_cmd && t.cmd == c__nbsp)) { rdadd(&rs, ' '); rdaddc(&rsc, ' '); + } else if (t.type == tok_cmd && t.cmd == c_u) { + rdadd(&rs, t.aux); + rdaddc(&rsc, '\\'); + rdaddsc(&rsc, t.origtext); } else { rdadds(&rs, t.text); rdaddsc(&rsc, t.origtext); @@ -948,7 +978,8 @@ static void read_file(paragraph ***ret, input *in, indexdata *idx) { rdadd(¯otext, L'\n'); rdadds(¯otext, t.text); dtor(t), t = get_token(in); - if (t.type == tok_eop) break; + if (t.type == tok_eop || t.type == tok_eof) + break; } macrodef(macros, rs.text, macrotext.text, fp); continue; /* next paragraph */ @@ -992,10 +1023,11 @@ static void read_file(paragraph ***ret, input *in, indexdata *idx) { * Mid-paragraph commands: * * \K \k - * \c \cw + * \c \cw \cq * \e * \i \ii * \I + * \q * \u * \W * \date @@ -1083,7 +1115,7 @@ static void read_file(paragraph ***ret, input *in, indexdata *idx) { case tok_lbrace: error(err_unexbrace, &t.pos); /* Error recovery: push nop */ - sitem = mknew(struct stack_item); + sitem = snew(struct stack_item); sitem->type = stack_nop; sitem->fpos = t.pos; stk_push(parsestk, sitem); @@ -1192,25 +1224,54 @@ static void read_file(paragraph ***ret, input *in, indexdata *idx) { } break; case c_q: + case c_cq: + type = t.cmd; dtor(t), t = get_token(in); if (t.type != tok_lbrace) { error(err_explbr, &t.pos); } else { - wd.text = NULL; - wd.type = toquotestyle(style); - wd.alt = NULL; - wd.aux = quote_Open; - wd.fpos = t.pos; - wd.breaks = FALSE; - if (!indexing || index_visible) - addword(wd, &whptr); - if (indexing) { - rdadd(&indexstr, L'"'); - addword(wd, &idximplicit); + /* + * Enforce that \q may not be used anywhere + * within \c. (It shouldn't be necessary + * since the whole point of \c should be + * that the user wants to exercise exact + * control over the glyphs used, and + * forbidding it has the useful effect of + * relieving some backends of having to + * make difficult decisions.) + */ + int stype; + + if (style != word_Code && style != word_WeakCode) { + wd.text = NULL; + wd.type = toquotestyle(style); + wd.alt = NULL; + wd.aux = quote_Open; + wd.fpos = t.pos; + wd.breaks = FALSE; + if (!indexing || index_visible) + addword(wd, &whptr); + if (indexing) { + rdadd(&indexstr, L'"'); + addword(wd, &idximplicit); + } + stype = stack_quote; + } else { + error(err_codequote, &t.pos); + stype = stack_nop; } - sitem = mknew(struct stack_item); + sitem = snew(struct stack_item); sitem->fpos = t.pos; - sitem->type = stack_quote; + sitem->type = stype; + if (type == c_cq) { + if (style != word_Normal) { + error(err_nestedstyles, &t.pos); + } else { + style = word_WeakCode; + spcstyle = tospacestyle(style); + sitem->type |= stack_style; + } + } stk_push(parsestk, sitem); } break; @@ -1285,7 +1346,7 @@ static void read_file(paragraph ***ret, input *in, indexdata *idx) { * delimiting the text marked by the link. */ dtor(t), t = get_token(in); - sitem = mknew(struct stack_item); + sitem = snew(struct stack_item); sitem->fpos = wd.fpos; sitem->type = stack_hyper; /* @@ -1351,7 +1412,7 @@ static void read_file(paragraph ***ret, input *in, indexdata *idx) { error(err_nestedstyles, &t.pos); /* Error recovery: eat lbrace, push nop. */ dtor(t), t = get_token(in); - sitem = mknew(struct stack_item); + sitem = snew(struct stack_item); sitem->fpos = t.pos; sitem->type = stack_nop; stk_push(parsestk, sitem); @@ -1364,7 +1425,7 @@ static void read_file(paragraph ***ret, input *in, indexdata *idx) { type == c_cw ? word_WeakCode : word_Emph); spcstyle = tospacestyle(style); - sitem = mknew(struct stack_item); + sitem = snew(struct stack_item); sitem->fpos = t.pos; sitem->type = stack_style; stk_push(parsestk, sitem); @@ -1378,12 +1439,12 @@ static void read_file(paragraph ***ret, input *in, indexdata *idx) { error(err_nestedindex, &t.pos); /* Error recovery: eat lbrace, push nop. */ dtor(t), t = get_token(in); - sitem = mknew(struct stack_item); + sitem = snew(struct stack_item); sitem->fpos = t.pos; sitem->type = stack_nop; stk_push(parsestk, sitem); } - sitem = mknew(struct stack_item); + sitem = snew(struct stack_item); sitem->fpos = t.pos; sitem->type = stack_idx; dtor(t), t = get_token(in); @@ -1453,7 +1514,7 @@ static void read_file(paragraph ***ret, input *in, indexdata *idx) { * sidetrack from the main thread of the * paragraph. */ - sitem = mknew(struct stack_item); + sitem = snew(struct stack_item); sitem->fpos = t.pos; sitem->type = stack_ualt; sitem->whptr = whptr; @@ -1486,7 +1547,17 @@ static void read_file(paragraph ***ret, input *in, indexdata *idx) { } stk_free(parsestk); prev_para_type = par.type; - addpara(par, ret); + /* + * Before we add the paragraph to the output list, we + * should check that there was any text in it at all; there + * might not be if (for example) the paragraph contained + * nothing but an unrecognised command sequence, and if we + * put an empty paragraph on the list it may confuse the + * back ends later on. + */ + if (par.words) { + addpara(par, ret); + } if (t.type == tok_eof) already = TRUE; } @@ -1504,27 +1575,89 @@ static void read_file(paragraph ***ret, input *in, indexdata *idx) { * this cleanup doesn't happen. */ dtor(t); - macrocleanup(macros); stk_free(crossparastk); } +struct { + char const *magic; + size_t nmagic; + int binary; + void (*reader)(input *); +} magics[] = { + { "%!FontType1-", 12, FALSE, &read_pfa_file }, + { "%!PS-AdobeFont-", 15, FALSE, &read_pfa_file }, + { "\x80\x01", 2, TRUE, &read_pfb_file }, + { "StartFontMetrics", 16, FALSE, &read_afm_file }, + { "\x00\x01\x00\x00", 4, TRUE, &read_sfnt_file }, + { "true", 4, TRUE, &read_sfnt_file }, +}; + paragraph *read_input(input *in, indexdata *idx) { paragraph *head = NULL; paragraph **hptr = &head; + tree234 *macros; + char mag[16]; + size_t len, i; + int binary; + void (*reader)(input *); + + macros = newtree234(macrocmp); while (in->currindex < in->nfiles) { - in->currfp = fopen(in->filenames[in->currindex], "r"); + setpos(in, in->filenames[in->currindex]); + in->charset = in->defcharset; + in->csstate = charset_init_state; + in->wcpos = in->nwc = 0; + in->pushback_chars = NULL; + + if (!in->filenames[in->currindex]) { + in->currfp = stdin; + in->wantclose = FALSE; /* don't fclose stdin */ + /* + * When reading standard input, we always expect to see + * an actual Halibut file and not any of the unusual + * input types like fonts. + */ + reader = NULL; + } else { + /* + * Open the file in binary mode to look for magic + * numbers. We'll switch to text mode if we find we're + * looking at a text file type. + */ + in->currfp = fopen(in->filenames[in->currindex], "rb"); + binary = FALSE; /* default to Halibut source, which is text */ + if (in->currfp) { + in->wantclose = TRUE; + reader = NULL; + len = fread(mag, 1, sizeof(mag), in->currfp); + for (i = 0; i < lenof(magics); i++) { + if (len >= magics[i].nmagic && + memcmp(mag, magics[i].magic, magics[i].nmagic) == 0) { + reader = magics[i].reader; + binary = magics[i].binary; + break; + } + } + rewind(in->currfp); + } + if (!binary) { + fclose(in->currfp); + in->currfp = fopen(in->filenames[in->currindex], "r"); + } + } if (in->currfp) { - setpos(in, in->filenames[in->currindex]); - in->charset = in->defcharset; - in->csstate = charset_init_state; - in->wcpos = in->nwc = 0; - in->pushback_chars = NULL; - read_file(&hptr, in, idx); + if (reader == NULL) { + read_file(&hptr, in, idx, macros); + } else { + (*reader)(in); + } } in->currindex++; } + macrocleanup(macros); + return head; }