mdw@git.distorted.org.uk Git - sgt/halibut/blob - input.c

   1 /*
   2  * input.c: read the source form
   3  */
   4
   5 #include <stdio.h>
   6 #include <assert.h>
   7 #include <time.h>
   8 #include "halibut.h"
   9
  10 #define TAB_STOP 8                     /* for column number tracking */
  11
  12 static void setpos(input *in, char *fname) {
  13     in->pos.filename = fname;
  14     in->pos.line = 1;
  15     in->pos.col = (in->reportcols ? 1 : -1);
  16 }
  17
  18 static void unget(input *in, int c, filepos *pos) {
  19     if (in->npushback >= in->pushbacksize) {
  20         in->pushbacksize = in->npushback + 16;
  21         in->pushback = sresize(in->pushback, in->pushbacksize, pushback);
  22     }
  23     in->pushback[in->npushback].chr = c;
  24     in->pushback[in->npushback].pos = *pos;   /* structure copy */
  25     in->npushback++;
  26 }
  27
  28 /* ---------------------------------------------------------------------- */
  29 /*
  30  * Macro subsystem
  31  */
  32 typedef struct macro_Tag macro;
  33 struct macro_Tag {
  34     wchar_t *name, *text;
  35 };
  36 struct macrostack_Tag {
  37     macrostack *next;
  38     wchar_t *text;
  39     int ptr, npushback;
  40     filepos pos;
  41 };
  42 static int macrocmp(void *av, void *bv) {
  43     macro *a = (macro *)av, *b = (macro *)bv;
  44     return ustrcmp(a->name, b->name);
  45 }
  46 static void macrodef(tree234 *macros, wchar_t *name, wchar_t *text,
  47                      filepos fpos) {
  48     macro *m = snew(macro);
  49     m->name = name;
  50     m->text = text;
  51     if (add234(macros, m) != m) {
  52         error(err_macroexists, &fpos, name);
  53         sfree(name);
  54         sfree(text);
  55     }
  56 }
  57 static int macrolookup(tree234 *macros, input *in, wchar_t *name,
  58                        filepos *pos) {
  59     macro m, *gotit;
  60     m.name = name;
  61     gotit = find234(macros, &m, NULL);
  62     if (gotit) {
  63         macrostack *expansion = snew(macrostack);
  64         expansion->next = in->stack;
  65         expansion->text = gotit->text;
  66         expansion->pos = *pos;         /* structure copy */
  67         expansion->ptr = 0;
  68         expansion->npushback = in->npushback;
  69         in->stack = expansion;
  70         return TRUE;
  71     } else
  72         return FALSE;
  73 }
  74 static void macrocleanup(tree234 *macros) {
  75     int ti;
  76     macro *m;
  77     for (ti = 0; (m = (macro *)index234(macros, ti)) != NULL; ti++) {
  78         sfree(m->name);
  79         sfree(m->text);
  80         sfree(m);
  81     }
  82     freetree234(macros);
  83 }
  84
  85 static void input_configure(input *in, paragraph *cfg) {
  86     assert(cfg->type == para_Config);
  87
  88     if (!ustricmp(cfg->keyword, L"input-charset")) {
  89         in->charset = charset_from_ustr(&cfg->fpos, uadv(cfg->keyword));
  90     }
  91 }
  92
  93 /*
  94  * Can return EOF
  95  */
  96 static int get(input *in, filepos *pos, rdstringc *rsc) {
  97     int pushbackpt = in->stack ? in->stack->npushback : 0;
  98     if (in->npushback > pushbackpt) {
  99         --in->npushback;
 100         if (pos)
 101             *pos = in->pushback[in->npushback].pos;   /* structure copy */
 102         return in->pushback[in->npushback].chr;
 103     }
 104     else if (in->stack) {
 105         wchar_t c = in->stack->text[in->stack->ptr];
 106         if (pos)
 107             *pos = in->stack->pos;
 108         if (in->stack->text[++in->stack->ptr] == L'\0') {
 109             macrostack *tmp = in->stack;
 110             in->stack = tmp->next;
 111             sfree(tmp);
 112         }
 113         return c;
 114     }
 115     else if (in->currfp) {
 116
 117         while (in->wcpos >= in->nwc) {
 118
 119             int c = getc(in->currfp);
 120
 121             if (c == EOF) {
 122                 if (in->wantclose)
 123                     fclose(in->currfp);
 124                 in->currfp = NULL;
 125                 return EOF;
 126             }
 127
 128             if (rsc)
 129                 rdaddc(rsc, c);
 130
 131             /* Track line numbers, for error reporting */
 132             if (pos)
 133                 *pos = in->pos;
 134             if (in->reportcols) {
 135                 switch (c) {
 136                   case '\t':
 137                     in->pos.col = 1 + (in->pos.col + TAB_STOP-1) % TAB_STOP;
 138                     break;
 139                   case '\n':
 140                     in->pos.col = 1;
 141                     in->pos.line++;
 142                     break;
 143                   default:
 144                     in->pos.col++;
 145                     break;
 146                 }
 147             } else {
 148                 in->pos.col = -1;
 149                 if (c == '\n')
 150                     in->pos.line++;
 151             }
 152
 153             /*
 154              * Do input character set translation, so that we return
 155              * Unicode.
 156              */
 157             {
 158                 char buf[1];
 159                 char const *p;
 160                 int inlen;
 161
 162                 buf[0] = (char)c;
 163                 p = buf;
 164                 inlen = 1;
 165
 166                 in->nwc = charset_to_unicode(&p, &inlen,
 167                                              in->wc, lenof(in->wc),
 168                                              in->charset, &in->csstate,
 169                                              NULL, 0);
 170                 assert(p == buf+1 && inlen == 0);
 171
 172                 in->wcpos = 0;
 173             }
 174         }
 175
 176         return in->wc[in->wcpos++];
 177
 178     } else
 179         return EOF;
 180 }
 181
 182 /*
 183  * Lexical analysis of source files.
 184  */
 185 typedef struct token_Tag token;
 186 struct token_Tag {
 187     int type;
 188     int cmd, aux;
 189     wchar_t *text;
 190     char *origtext;
 191     filepos pos;
 192 };
 193 enum {
 194     tok_eof,                           /* end of file */
 195     tok_eop,                           /* end of paragraph */
 196     tok_white,                         /* whitespace */
 197     tok_word,                          /* a word or word fragment */
 198     tok_cmd,                           /* \command */
 199     tok_lbrace,                        /* { */
 200     tok_rbrace                         /* } */
 201 };
 202
 203 /* Halibut command keywords. */
 204 enum {
 205     c__invalid,                        /* invalid command */
 206     c__comment,                        /* comment command (\#) */
 207     c__escaped,                        /* escaped character */
 208     c__nop,                            /* no-op */
 209     c__nbsp,                           /* nonbreaking space */
 210     c_A,                               /* appendix heading */
 211     c_B,                               /* bibliography entry */
 212     c_BR,                              /* bibliography rewrite */
 213     c_C,                               /* chapter heading */
 214     c_H,                               /* heading */
 215     c_I,                               /* invisible index mark */
 216     c_IM,                              /* index merge/rewrite */
 217     c_K,                               /* capitalised cross-reference */
 218     c_S,                               /* aux field is 0, 1, 2, ... */
 219     c_U,                               /* unnumbered-chapter heading */
 220     c_W,                               /* Web hyperlink */
 221     c_b,                               /* bulletted list */
 222     c_c,                               /* code */
 223     c_cfg,                             /* configuration directive */
 224     c_copyright,                       /* copyright statement */
 225     c_cq,                              /* quoted code (sugar for \q{\cw{x}}) */
 226     c_cw,                              /* weak code */
 227     c_date,                            /* document processing date */
 228     c_dd,                              /* description list: description */
 229     c_define,                          /* macro definition */
 230     c_dt,                              /* description list: described thing */
 231     c_e,                               /* emphasis */
 232     c_i,                               /* visible index mark */
 233     c_ii,                              /* uncapitalised visible index mark */
 234     c_k,                               /* uncapitalised cross-reference */
 235     c_lcont,                           /* continuation para(s) for list item */
 236     c_n,                               /* numbered list */
 237     c_nocite,                          /* bibliography trickery */
 238     c_preamble,                        /* (obsolete) preamble text */
 239     c_q,                               /* quote marks */
 240     c_quote,                           /* block-quoted paragraphs */
 241     c_rule,                            /* horizontal rule */
 242     c_title,                           /* document title */
 243     c_u,                               /* aux field is char code */
 244     c_versionid                        /* document RCS id */
 245 };
 246
 247 /* Perhaps whitespace should be defined in a more Unicode-friendly way? */
 248 #define iswhite(c) ( (c)==32 || (c)==9 || (c)==13 || (c)==10 )
 249 #define isnl(c) ( (c)==10 )
 250 #define isdec(c) ( ((c)>='0'&&(c)<='9') )
 251 #define fromdec(c) ( (c)-'0' )
 252 #define ishex(c) ( ((c)>='0'&&(c)<='9') || ((c)>='A'&&(c)<='F') || ((c)>='a'&&(c)<='f'))
 253 #define fromhex(c) ( (c)<='9' ? (c)-'0' : ((c)&0xDF) - ('A'-10) )
 254 #define iscmd(c) ( ((c)>='0'&&(c)<='9') || ((c)>='A'&&(c)<='Z') || ((c)>='a'&&(c)<='z'))
 255
 256 /*
 257  * Keyword comparison function. Like strcmp, but between a wchar_t *
 258  * and a char *.
 259  */
 260 static int kwcmp(wchar_t const *p, char const *q) {
 261     int i;
 262     do {
 263         i = *p - *q;
 264     } while (*p++ && *q++ && !i);
 265     return i;
 266 }
 267
 268 /*
 269  * Match a keyword.
 270  */
 271 static void match_kw(token *tok) {
 272     /*
 273      * FIXME. The ids are explicit in here so as to allow long-name
 274      * equivalents to the various very short keywords.
 275      */
 276     static const struct { char const *name; int id; } keywords[] = {
 277         {"#", c__comment},             /* comment command (\#) */
 278         {"-", c__escaped},             /* nonbreaking hyphen */
 279         {".", c__nop},                 /* no-op */
 280         {"A", c_A},                    /* appendix heading */
 281         {"B", c_B},                    /* bibliography entry */
 282         {"BR", c_BR},                  /* bibliography rewrite */
 283         {"C", c_C},                    /* chapter heading */
 284         {"H", c_H},                    /* heading */
 285         {"I", c_I},                    /* invisible index mark */
 286         {"IM", c_IM},                  /* index merge/rewrite */
 287         {"K", c_K},                    /* capitalised cross-reference */
 288         {"U", c_U},                    /* unnumbered-chapter heading */
 289         {"W", c_W},                    /* Web hyperlink */
 290         {"\\", c__escaped},            /* escaped backslash (\\) */
 291         {"_", c__nbsp},                /* nonbreaking space (\_) */
 292         {"b", c_b},                    /* bulletted list */
 293         {"c", c_c},                    /* code */
 294         {"cfg", c_cfg},                /* configuration directive */
 295         {"copyright", c_copyright},    /* copyright statement */
 296         {"cq", c_cq},                  /* quoted code (sugar for \q{\cw{x}}) */
 297         {"cw", c_cw},                  /* weak code */
 298         {"date", c_date},              /* document processing date */
 299         {"dd", c_dd},                  /* description list: description */
 300         {"define", c_define},          /* macro definition */
 301         {"dt", c_dt},                  /* description list: described thing */
 302         {"e", c_e},                    /* emphasis */
 303         {"i", c_i},                    /* visible index mark */
 304         {"ii", c_ii},                  /* uncapitalised visible index mark */
 305         {"k", c_k},                    /* uncapitalised cross-reference */
 306         {"lcont", c_lcont},            /* continuation para(s) for list item */
 307         {"n", c_n},                    /* numbered list */
 308         {"nocite", c_nocite},          /* bibliography trickery */
 309         {"preamble", c_preamble},      /* (obsolete) preamble text */
 310         {"q", c_q},                    /* quote marks */
 311         {"quote", c_quote},            /* block-quoted paragraphs */
 312         {"rule", c_rule},              /* horizontal rule */
 313         {"title", c_title},            /* document title */
 314         {"versionid", c_versionid},    /* document RCS id */
 315         {"{", c__escaped},             /* escaped lbrace (\{) */
 316         {"}", c__escaped},             /* escaped rbrace (\}) */
 317     };
 318     int i, j, k, c;
 319
 320     /*
 321      * Special cases: \S{0,1,2,...} and \uABCD. If the syntax
 322      * doesn't match correctly, we just fall through to the
 323      * binary-search phase.
 324      */
 325     if (tok->text[0] == 'S') {
 326         /* We expect numeric characters thereafter. */
 327         wchar_t *p = tok->text+1;
 328         int n;
 329         if (!*p)
 330             n = 1;
 331         else {
 332             n = 0;
 333             while (*p && isdec(*p)) {
 334                 n = 10 * n + fromdec(*p);
 335                 p++;
 336             }
 337         }
 338         if (!*p) {
 339             tok->cmd = c_S;
 340             tok->aux = n;
 341             return;
 342         }
 343     } else if (tok->text[0] == 'u') {
 344         /* We expect hex characters thereafter. */
 345         wchar_t *p = tok->text+1;
 346         int n = 0;
 347         while (*p && ishex(*p)) {
 348             n = 16 * n + fromhex(*p);
 349             p++;
 350         }
 351         if (!*p) {
 352             tok->cmd = c_u;
 353             tok->aux = n;
 354             return;
 355         }
 356     }
 357
 358     i = -1;
 359     j = sizeof(keywords)/sizeof(*keywords);
 360     while (j-i > 1) {
 361         k = (i+j)/2;
 362         c = kwcmp(tok->text, keywords[k].name);
 363         if (c < 0)
 364             j = k;
 365         else if (c > 0)
 366             i = k;
 367         else /* c == 0 */ {
 368             tok->cmd = keywords[k].id;
 369             return;
 370         }
 371     }
 372
 373     tok->cmd = c__invalid;
 374 }
 375
 376
 377 /*
 378  * Read a token from the input file, in the normal way (`normal' in
 379  * the sense that code paragraphs work a different way).
 380  */
 381 token get_token(input *in) {
 382     int c;
 383     int nls;
 384     int prevpos;
 385     token ret;
 386     rdstring rs = { 0, 0, NULL };
 387     rdstringc rsc = { 0, 0, NULL };
 388     filepos cpos;
 389
 390     ret.text = NULL;                   /* default */
 391     ret.origtext = NULL;               /* default */
 392     if (in->pushback_chars) {
 393         rdaddsc(&rsc, in->pushback_chars);
 394         sfree(in->pushback_chars);
 395         in->pushback_chars = NULL;
 396     }
 397     c = get(in, &cpos, &rsc);
 398     ret.pos = cpos;
 399     if (iswhite(c)) {                  /* tok_white or tok_eop */
 400         nls = 0;
 401         prevpos = 0;
 402         do {
 403             if (isnl(c))
 404                 nls++;
 405             prevpos = rsc.pos;
 406         } while ((c = get(in, &cpos, &rsc)) != EOF && iswhite(c));
 407         if (c == EOF) {
 408             ret.type = tok_eof;
 409             sfree(rsc.text);
 410             return ret;
 411         }
 412         if (rsc.text) {
 413             in->pushback_chars = dupstr(rsc.text + prevpos);
 414             sfree(rsc.text);
 415         }
 416         unget(in, c, &cpos);
 417         ret.type = (nls > 1 ? tok_eop : tok_white);
 418         return ret;
 419     } else if (c == EOF) {             /* tok_eof */
 420         ret.type = tok_eof;
 421         sfree(rsc.text);
 422         return ret;
 423     } else if (c == '\\') {            /* tok_cmd */
 424         rsc.pos = prevpos = 0;
 425         c = get(in, &cpos, &rsc);
 426         if (c == '-' || c == '\\' || c == '_' ||
 427             c == '#' || c == '{' || c == '}' || c == '.') {
 428             /* single-char command */
 429             rdadd(&rs, c);
 430             prevpos = rsc.pos;
 431         } else if (c == 'u') {
 432             int len = 0;
 433             do {
 434                 rdadd(&rs, c);
 435                 len++;
 436                 prevpos = rsc.pos;
 437                 c = get(in, &cpos, &rsc);
 438             } while (ishex(c) && len < 5);
 439             unget(in, c, &cpos);
 440         } else if (iscmd(c)) {
 441             do {
 442                 rdadd(&rs, c);
 443                 prevpos = rsc.pos;
 444                 c = get(in, &cpos, &rsc);
 445             } while (iscmd(c));
 446             unget(in, c, &cpos);
 447         }
 448         /*
 449          * Now match the command against the list of available
 450          * ones.
 451          */
 452         ret.type = tok_cmd;
 453         ret.text = ustrdup(rs.text);
 454         if (rsc.text) {
 455             in->pushback_chars = dupstr(rsc.text + prevpos);
 456             rsc.text[prevpos] = '\0';
 457             ret.origtext = dupstr(rsc.text);
 458         } else {
 459             ret.origtext = dupstr("");
 460         }
 461         match_kw(&ret);
 462         sfree(rs.text);
 463         sfree(rsc.text);
 464         return ret;
 465     } else if (c == '{') {             /* tok_lbrace */
 466         ret.type = tok_lbrace;
 467         sfree(rsc.text);
 468         return ret;
 469     } else if (c == '}') {             /* tok_rbrace */
 470         ret.type = tok_rbrace;
 471         sfree(rsc.text);
 472         return ret;
 473     } else {                           /* tok_word */
 474         /*
 475          * Read a word: the longest possible contiguous sequence of
 476          * things other than whitespace, backslash, braces and
 477          * hyphen. A hyphen terminates the word but is returned as
 478          * part of it; everything else is pushed back for the next
 479          * token. The `aux' field contains TRUE if the word ends in
 480          * a hyphen.
 481          */
 482         ret.aux = FALSE;               /* assumed for now */
 483         prevpos = 0;
 484         while (1) {
 485             if (iswhite(c) || c=='{' || c=='}' || c=='\\' || c==EOF) {
 486                 /* Put back the character that caused termination */
 487                 unget(in, c, &cpos);
 488                 break;
 489             } else {
 490                 rdadd(&rs, c);
 491                 if (c == '-') {
 492                     prevpos = rsc.pos;
 493                     ret.aux = TRUE;
 494                     break;             /* hyphen terminates word */
 495                 }
 496             }
 497             prevpos = rsc.pos;
 498             c = get(in, &cpos, &rsc);
 499         }
 500         ret.type = tok_word;
 501         ret.text = ustrdup(rs.text);
 502         if (rsc.text) {
 503             in->pushback_chars = dupstr(rsc.text + prevpos);
 504             rsc.text[prevpos] = '\0';
 505             ret.origtext = dupstr(rsc.text);
 506         } else {
 507             ret.origtext = dupstr("");
 508         }
 509         sfree(rs.text);
 510         sfree(rsc.text);
 511         return ret;
 512     }
 513 }
 514
 515 /*
 516  * Determine whether the next input character is an open brace (for
 517  * telling code paragraphs from paragraphs which merely start with
 518  * code).
 519  */
 520 int isbrace(input *in) {
 521     int c;
 522     filepos cpos;
 523
 524     c = get(in, &cpos, NULL);
 525     unget(in, c, &cpos);
 526     return (c == '{');
 527 }
 528
 529 /*
 530  * Read the rest of a line that starts `\c'. Including nothing at
 531  * all (tok_word with empty text).
 532  */
 533 token get_codepar_token(input *in) {
 534     int c;
 535     token ret;
 536     rdstring rs = { 0, 0, NULL };
 537     filepos cpos;
 538
 539     ret.type = tok_word;
 540     ret.origtext = NULL;
 541     c = get(in, &cpos, NULL);          /* expect (and discard) one space */
 542     ret.pos = cpos;
 543     if (c == ' ') {
 544         c = get(in, &cpos, NULL);
 545         ret.pos = cpos;
 546     }
 547     while (!isnl(c) && c != EOF) {
 548         int c2 = c;
 549         c = get(in, &cpos, NULL);
 550         /* Discard \r just before \n. */
 551         if (c2 != 13 || !isnl(c))
 552             rdadd(&rs, c2);
 553     }
 554     unget(in, c, &cpos);
 555     ret.text = ustrdup(rs.text);
 556     sfree(rs.text);
 557     return ret;
 558 }
 559
 560 /*
 561  * Adds a new word to a linked list
 562  */
 563 static word *addword(word newword, word ***hptrptr) {
 564     word *mnewword;
 565     if (!hptrptr)
 566         return NULL;
 567     mnewword = snew(word);
 568     *mnewword = newword;               /* structure copy */
 569     mnewword->next = NULL;
 570     **hptrptr = mnewword;
 571     *hptrptr = &mnewword->next;
 572     return mnewword;
 573 }
 574
 575 /*
 576  * Adds a new paragraph to a linked list
 577  */
 578 static paragraph *addpara(paragraph newpara, paragraph ***hptrptr) {
 579     paragraph *mnewpara = snew(paragraph);
 580     *mnewpara = newpara;               /* structure copy */
 581     mnewpara->next = NULL;
 582     **hptrptr = mnewpara;
 583     *hptrptr = &mnewpara->next;
 584     return mnewpara;
 585 }
 586
 587 /*
 588  * Destructor before token is reassigned; should catch most memory
 589  * leaks
 590  */
 591 #define dtor(t) ( sfree(t.text), sfree(t.origtext) )
 592
 593 /*
 594  * Reads a single file (ie until get() returns EOF)
 595  */
 596 static void read_file(paragraph ***ret, input *in, indexdata *idx,
 597                       tree234 *macros) {
 598     token t;
 599     paragraph par;
 600     word wd, **whptr, **idximplicit;
 601     wchar_t utext[2], *wdtext;
 602     int style, spcstyle;
 603     int already;
 604     int iswhite, seenwhite;
 605     int type;
 606     int prev_para_type;
 607     struct stack_item {
 608         enum {
 609             stack_nop = 0,             /* do nothing (for error recovery) */
 610             stack_ualt = 1,            /* \u alternative */
 611             stack_style = 2,           /* \e, \c, \cw */
 612             stack_idx = 4,             /* \I, \i, \ii */
 613             stack_hyper = 8,           /* \W */
 614             stack_quote = 16           /* \q */
 615         } type;
 616         word **whptr;                  /* to restore from \u alternatives */
 617         word **idximplicit;            /* to restore from \u alternatives */
 618         filepos fpos;
 619         int in_code;
 620     } *sitem;
 621     stack parsestk;
 622     struct crossparaitem {
 623         int type;                      /* currently c_lcont, c_quote or -1 */
 624         int seen_lcont, seen_quote;
 625     };
 626     stack crossparastk;
 627     word *indexword, *uword, *iword;
 628     word *idxwordlist;
 629     rdstring indexstr;
 630     int index_downcase, index_visible, indexing;
 631     const rdstring nullrs = { 0, 0, NULL };
 632     wchar_t uchr;
 633
 634     t.text = NULL;
 635     t.origtext = NULL;
 636     already = FALSE;
 637
 638     crossparastk = stk_new();
 639
 640     /*
 641      * Loop on each paragraph.
 642      */
 643     while (1) {
 644         int start_cmd = c__invalid;
 645         par.words = NULL;
 646         par.keyword = NULL;
 647         par.origkeyword = NULL;
 648         whptr = &par.words;
 649
 650         /*
 651          * Get a token.
 652          */
 653         do {
 654             if (!already) {
 655                 dtor(t), t = get_token(in);
 656             }
 657             already = FALSE;
 658         } while (t.type == tok_eop);
 659         if (t.type == tok_eof)
 660             break;
 661
 662         /*
 663          * Parse code paragraphs separately.
 664          */
 665         if (t.type == tok_cmd && t.cmd == c_c && !isbrace(in)) {
 666             int wtype = word_WeakCode;
 667
 668             par.type = para_Code;
 669             par.fpos = t.pos;
 670             while (1) {
 671                 dtor(t), t = get_codepar_token(in);
 672                 wd.type = wtype;
 673                 wd.breaks = FALSE;     /* shouldn't need this... */
 674                 wd.text = ustrdup(t.text);
 675                 wd.alt = NULL;
 676                 wd.fpos = t.pos;
 677                 addword(wd, &whptr);
 678                 dtor(t), t = get_token(in);
 679                 if (t.type == tok_white) {
 680                     /*
 681                      * The newline after a code-paragraph line
 682                      */
 683                     dtor(t), t = get_token(in);
 684                 }
 685                 if (t.type == tok_eop || t.type == tok_eof ||
 686                     t.type == tok_rbrace) { /* might be } terminating \lcont */
 687                     if (t.type == tok_rbrace)
 688                         already = TRUE;
 689                     break;
 690                 } else if (t.type == tok_cmd && t.cmd == c_c) {
 691                     wtype = word_WeakCode;
 692                 } else if (t.type == tok_cmd && t.cmd == c_e &&
 693                            wtype == word_WeakCode) {
 694                     wtype = word_Emph;
 695                 } else {
 696                     error(err_brokencodepara, &t.pos);
 697                     prev_para_type = par.type;
 698                     addpara(par, ret);
 699                     while (t.type != tok_eop)   /* error recovery: */
 700                         dtor(t), t = get_token(in);   /* eat rest of paragraph */
 701                     goto codeparabroken;   /* ick, but such is life */
 702                 }
 703             }
 704             prev_para_type = par.type;
 705             addpara(par, ret);
 706             codeparabroken:
 707             continue;
 708         }
 709
 710         /*
 711          * Spot the special commands that define a grouping of more
 712          * than one paragraph, and also the closing braces that
 713          * finish them.
 714          */
 715         if (t.type == tok_cmd &&
 716             (t.cmd == c_lcont || t.cmd == c_quote)) {
 717             struct crossparaitem *sitem, *stop;
 718             int cmd = t.cmd;
 719
 720             /*
 721              * Expect, and swallow, an open brace.
 722              */
 723             dtor(t), t = get_token(in);
 724             if (t.type != tok_lbrace) {
 725                 error(err_explbr, &t.pos);
 726                 continue;
 727             }
 728
 729             /*
 730              * Also expect, and swallow, any whitespace after that
 731              * (a newline before a code paragraph wouldn't be
 732              * surprising).
 733              */
 734             do {
 735                 dtor(t), t = get_token(in);
 736             } while (t.type == tok_white);
 737             already = TRUE;
 738
 739             if (cmd == c_lcont) {
 740                 /*
 741                  * \lcont causes a continuation of a list item into
 742                  * multiple paragraphs (which may in turn contain
 743                  * nested lists, code paras etc). Hence, the previous
 744                  * paragraph must be of a list type.
 745                  */
 746                 sitem = snew(struct crossparaitem);
 747                 stop = (struct crossparaitem *)stk_top(crossparastk);
 748                 if (stop)
 749                     *sitem = *stop;
 750                 else
 751                     sitem->seen_quote = sitem->seen_lcont = 0;
 752
 753                 if (prev_para_type == para_Bullet ||
 754                     prev_para_type == para_NumberedList ||
 755                     prev_para_type == para_Description) {
 756                     sitem->type = c_lcont;
 757                     sitem->seen_lcont = 1;
 758                     par.type = para_LcontPush;
 759                     prev_para_type = par.type;
 760                     addpara(par, ret);
 761                 } else {
 762                     /*
 763                      * Push a null item on the cross-para stack so that
 764                      * when we see the corresponding closing brace we
 765                      * don't give a cascade error.
 766                      */
 767                     sitem->type = -1;
 768                     error(err_misplacedlcont, &t.pos);
 769                 }
 770             } else {
 771                 /*
 772                  * \quote causes a group of paragraphs to be
 773                  * block-quoted (typically they will be indented a
 774                  * bit).
 775                  */
 776                 sitem = snew(struct crossparaitem);
 777                 stop = (struct crossparaitem *)stk_top(crossparastk);
 778                 if (stop)
 779                     *sitem = *stop;
 780                 else
 781                     sitem->seen_quote = sitem->seen_lcont = 0;
 782                 sitem->type = c_quote;
 783                 sitem->seen_quote = 1;
 784                 par.type = para_QuotePush;
 785                 prev_para_type = par.type;
 786                 addpara(par, ret);
 787             }
 788             stk_push(crossparastk, sitem);
 789             continue;
 790         } else if (t.type == tok_rbrace) {
 791             struct crossparaitem *sitem = stk_pop(crossparastk);
 792             if (!sitem)
 793                 error(err_unexbrace, &t.pos);
 794             else {
 795                 switch (sitem->type) {
 796                   case c_lcont:
 797                     par.type = para_LcontPop;
 798                     prev_para_type = par.type;
 799                     addpara(par, ret);
 800                     break;
 801                   case c_quote:
 802                     par.type = para_QuotePop;
 803                     prev_para_type = par.type;
 804                     addpara(par, ret);
 805                     break;
 806                 }
 807                 sfree(sitem);
 808             }
 809             continue;
 810         }
 811
 812         while (t.type == tok_cmd &&
 813                macrolookup(macros, in, t.text, &t.pos)) {
 814             dtor(t), t = get_token(in);
 815         }
 816
 817         /*
 818          * This token begins a paragraph. See if it's one of the
 819          * special commands that define a paragraph type.
 820          *
 821          * (note that \# is special in a way, and \nocite takes no
 822          * text)
 823          */
 824         par.type = para_Normal;
 825         if (t.type == tok_cmd) {
 826             int needkw;
 827             int is_macro = FALSE;
 828
 829             par.fpos = t.pos;
 830             switch (t.cmd) {
 831               default:
 832                 needkw = -1;
 833                 break;
 834               case c__invalid:
 835                 error(err_badparatype, t.text, &t.pos);
 836                 needkw = 4;
 837                 break;
 838               case c__comment:
 839                 if (isbrace(in)) {
 840                     needkw = -1;
 841                     break;             /* `\#{': isn't a comment para */
 842                 }
 843                 do {
 844                     dtor(t), t = get_token(in);
 845                 } while (t.type != tok_eop && t.type != tok_eof);
 846                 continue;              /* next paragraph */
 847                 /*
 848                  * `needkw' values:
 849                  *
 850                  *   1 -- exactly one keyword
 851                  *   2 -- at least one keyword
 852                  *   4 -- zero or one keyword (the keyword is optional)
 853                  *   8 -- at least one keyword and then nothing else
 854                  *  16 -- nothing at all! no keywords, no body
 855                  *  32 -- no keywords at all
 856                  */
 857               case c_A: needkw = 2; par.type = para_Appendix; break;
 858               case c_B: needkw = 2; par.type = para_Biblio; break;
 859               case c_BR: needkw = 1; par.type = para_BR;
 860                 start_cmd = c_BR; break;
 861               case c_C: needkw = 2; par.type = para_Chapter; break;
 862               case c_H: needkw = 2; par.type = para_Heading;
 863                 par.aux = 0;
 864                 break;
 865               case c_IM: needkw = 2; par.type = para_IM;
 866                 start_cmd = c_IM; break;
 867               case c_S: needkw = 2; par.type = para_Subsect;
 868                 par.aux = t.aux; break;
 869               case c_U: needkw = 32; par.type = para_UnnumberedChapter; break;
 870                 /* For \b and \n the keyword is optional */
 871               case c_b: needkw = 4; par.type = para_Bullet; break;
 872               case c_dt: needkw = 4; par.type = para_DescribedThing; break;
 873               case c_dd: needkw = 4; par.type = para_Description; break;
 874               case c_n: needkw = 4; par.type = para_NumberedList; break;
 875               case c_cfg: needkw = 8; par.type = para_Config;
 876                 start_cmd = c_cfg; break;
 877               case c_copyright: needkw = 32; par.type = para_Copyright; break;
 878               case c_define: is_macro = TRUE; needkw = 1; break;
 879                 /* For \nocite the keyword is _everything_ */
 880               case c_nocite: needkw = 8; par.type = para_NoCite; break;
 881               case c_preamble: needkw = 32; par.type = para_Normal; break;
 882               case c_rule: needkw = 16; par.type = para_Rule; break;
 883               case c_title: needkw = 32; par.type = para_Title; break;
 884               case c_versionid: needkw = 32; par.type = para_VersionID; break;
 885             }
 886
 887             if (par.type == para_Chapter ||
 888                 par.type == para_Heading ||
 889                 par.type == para_Subsect ||
 890                 par.type == para_Appendix ||
 891                 par.type == para_UnnumberedChapter) {
 892                 struct crossparaitem *sitem = stk_top(crossparastk);
 893                 if (sitem && (sitem->seen_lcont || sitem->seen_quote)) {
 894                     error(err_sectmarkerinblock,
 895                           &t.pos,
 896                           (sitem->seen_lcont ? "lcont" : "quote"));
 897                 }
 898             }
 899
 900             if (needkw > 0) {
 901                 rdstring rs = { 0, 0, NULL };
 902                 rdstringc rsc = { 0, 0, NULL };
 903                 int nkeys = 0;
 904                 filepos fp;
 905
 906                 /* Get keywords. */
 907                 dtor(t), t = get_token(in);
 908                 fp = t.pos;
 909                 while (t.type == tok_lbrace ||
 910                        (t.type == tok_white && (needkw & 24))) {
 911                     /*
 912                      * In paragraph types which can't accept any
 913                      * body text (such as \cfg), we are lenient
 914                      * about whitespace between keywords. This is
 915                      * important for \cfg in particular since it
 916                      * can often have many keywords which are long
 917                      * pieces of text, so it's useful to permit the
 918                      * user to wrap the line between them.
 919                      */
 920                     if (t.type == tok_white) {
 921                         dtor(t), t = get_token(in); /* eat the space */
 922                         continue;
 923                     }
 924                     /* This is a keyword. */
 925                     nkeys++;
 926                     /* FIXME: there will be bugs if anyone specifies an
 927                      * empty keyword (\foo{}), so trap this case. */
 928                     while (dtor(t), t = get_token(in),
 929                            t.type == tok_word ||
 930                            t.type == tok_white ||
 931                            (t.type == tok_cmd && t.cmd == c__nbsp) ||
 932                            (t.type == tok_cmd && t.cmd == c__escaped) ||
 933                            (t.type == tok_cmd && t.cmd == c_u)) {
 934                         if (t.type == tok_white ||
 935                             (t.type == tok_cmd && t.cmd == c__nbsp)) {
 936                             rdadd(&rs, ' ');
 937                             rdaddc(&rsc, ' ');
 938                         } else if (t.type == tok_cmd && t.cmd == c_u) {
 939                             rdadd(&rs, t.aux);
 940                             rdaddc(&rsc, '\\');
 941                             rdaddsc(&rsc, t.origtext);
 942                         } else {
 943                             rdadds(&rs, t.text);
 944                             rdaddsc(&rsc, t.origtext);
 945                         }
 946                     }
 947                     if (t.type != tok_rbrace) {
 948                         error(err_kwunclosed, &t.pos);
 949                         continue;
 950                     }
 951                     rdadd(&rs, 0);     /* add string terminator */
 952                     rdaddc(&rsc, 0);   /* add string terminator */
 953                     dtor(t), t = get_token(in); /* eat right brace */
 954                 }
 955
 956                 rdadd(&rs, 0);         /* add string terminator */
 957                 rdaddc(&rsc, 0);       /* add string terminator */
 958
 959                 /* See whether we have the right number of keywords. */
 960                 if ((needkw & 48) && nkeys > 0)
 961                     error(err_kwillegal, &fp);
 962                 if ((needkw & 11) && nkeys == 0)
 963                     error(err_kwexpected, &fp);
 964                 if ((needkw & 5) && nkeys > 1)
 965                     error(err_kwtoomany, &fp);
 966
 967                 if (is_macro) {
 968                     /*
 969                      * Macro definition. Get the rest of the line
 970                      * as a code-paragraph token, repeatedly until
 971                      * there's nothing more left of it. Separate
 972                      * with newlines.
 973                      */
 974                     rdstring macrotext = { 0, 0, NULL };
 975                     while (1) {
 976                         dtor(t), t = get_codepar_token(in);
 977                         if (macrotext.pos > 0)
 978                             rdadd(&macrotext, L'\n');
 979                         rdadds(&macrotext, t.text);
 980                         dtor(t), t = get_token(in);
 981                         if (t.type == tok_eop || t.type == tok_eof)
 982                             break;
 983                     }
 984                     macrodef(macros, rs.text, macrotext.text, fp);
 985                     continue;          /* next paragraph */
 986                 }
 987
 988                 par.keyword = rdtrim(&rs);
 989                 par.origkeyword = rdtrimc(&rsc);
 990
 991                 /* Move to EOP in case of needkw==8 or 16 (no body) */
 992                 if (needkw & 24) {
 993                     /* We allow whitespace even when we expect no para body */
 994                     while (t.type == tok_white)
 995                         dtor(t), t = get_token(in);
 996                     if (t.type != tok_eop && t.type != tok_eof &&
 997                         (start_cmd == c__invalid ||
 998                          t.type != tok_cmd || t.cmd != start_cmd)) {
 999                         error(err_bodyillegal, &t.pos);
1000                         /* Error recovery: eat the rest of the paragraph */
1001                         while (t.type != tok_eop && t.type != tok_eof &&
1002                                (start_cmd == c__invalid ||
1003                                 t.type != tok_cmd || t.cmd != start_cmd))
1004                             dtor(t), t = get_token(in);
1005                     }
1006                     if (t.type == tok_cmd)
1007                         already = TRUE;/* inhibit get_token at top of loop */
1008                     prev_para_type = par.type;
1009                     addpara(par, ret);
1010
1011                     if (par.type == para_Config) {
1012                         input_configure(in, &par);
1013                     }
1014                     continue;          /* next paragraph */
1015                 }
1016             }
1017         }
1018
1019         /*
1020          * Now read the actual paragraph, word by word, adding to
1021          * the paragraph list.
1022          *
1023          * Mid-paragraph commands:
1024          *
1025          *  \K \k
1026          *  \c \cw \cq
1027          *  \e
1028          *  \i \ii
1029          *  \I
1030          *  \q
1031          *  \u
1032          *  \W
1033          *  \date
1034          *  \\ \{ \}
1035          */
1036         parsestk = stk_new();
1037         style = word_Normal;
1038         spcstyle = word_WhiteSpace;
1039         indexing = FALSE;
1040         seenwhite = TRUE;
1041         while (t.type != tok_eop && t.type != tok_eof) {
1042             iswhite = FALSE;
1043             already = FALSE;
1044
1045             /* Handle implicit paragraph breaks after \IM, \BR etc */
1046             if (start_cmd != c__invalid &&
1047                 t.type == tok_cmd && t.cmd == start_cmd) {
1048                 already = TRUE;        /* inhibit get_token at top of loop */
1049                 break;
1050             }
1051
1052             if (t.type == tok_cmd && t.cmd == c__nop) {
1053                 dtor(t), t = get_token(in);
1054                 continue;              /* do nothing! */
1055             }
1056
1057             if (t.type == tok_cmd && t.cmd == c__escaped) {
1058                 t.type = tok_word;     /* nice and simple */
1059                 t.aux = 0;             /* even if `\-' - nonbreaking! */
1060             }
1061             if (t.type == tok_cmd && t.cmd == c__nbsp) {
1062                 t.type = tok_word;     /* nice and simple */
1063                 sfree(t.text);
1064                 t.text = ustrdup(L" ");  /* text is ` ' not `_' */
1065                 t.aux = 0;             /* (nonbreaking) */
1066             }
1067             switch (t.type) {
1068               case tok_white:
1069                 if (whptr == &par.words)
1070                     break;             /* strip whitespace at start of para */
1071                 wd.text = NULL;
1072                 wd.type = spcstyle;
1073                 wd.alt = NULL;
1074                 wd.aux = 0;
1075                 wd.fpos = t.pos;
1076                 wd.breaks = FALSE;
1077
1078                 /*
1079                  * Inhibit use of whitespace if it's (probably the
1080                  * newline) before a repeat \IM / \BR type
1081                  * directive.
1082                  */
1083                 if (start_cmd != c__invalid) {
1084                     dtor(t), t = get_token(in);
1085                     already = TRUE;
1086                     if (t.type == tok_cmd && t.cmd == start_cmd)
1087                         break;
1088                 }
1089
1090                 if (indexing)
1091                     rdadd(&indexstr, ' ');
1092                 if (!indexing || index_visible)
1093                     addword(wd, &whptr);
1094                 if (indexing)
1095                     addword(wd, &idximplicit);
1096                 iswhite = TRUE;
1097                 break;
1098               case tok_word:
1099                 if (indexing)
1100                     rdadds(&indexstr, t.text);
1101                 wd.type = style;
1102                 wd.alt = NULL;
1103                 wd.aux = 0;
1104                 wd.fpos = t.pos;
1105                 wd.breaks = t.aux;
1106                 if (!indexing || index_visible) {
1107                     wd.text = ustrdup(t.text);
1108                     addword(wd, &whptr);
1109                 }
1110                 if (indexing) {
1111                     wd.text = ustrdup(t.text);
1112                     addword(wd, &idximplicit);
1113                 }
1114                 break;
1115               case tok_lbrace:
1116                 error(err_unexbrace, &t.pos);
1117                 /* Error recovery: push nop */
1118                 sitem = snew(struct stack_item);
1119                 sitem->type = stack_nop;
1120                 sitem->fpos = t.pos;
1121                 stk_push(parsestk, sitem);
1122                 break;
1123               case tok_rbrace:
1124                 sitem = stk_pop(parsestk);
1125                 if (!sitem) {
1126                     /*
1127                      * This closing brace could have been an
1128                      * indication that the cross-paragraph stack
1129                      * wants popping. Accordingly, we treat it here
1130                      * as an indication that the paragraph is over.
1131                      */
1132                     already = TRUE;
1133                     goto finished_para;
1134                 } else {
1135                     if (sitem->type & stack_ualt) {
1136                         whptr = sitem->whptr;
1137                         idximplicit = sitem->idximplicit;
1138                     }
1139                     if (sitem->type & stack_style) {
1140                         style = word_Normal;
1141                         spcstyle = word_WhiteSpace;
1142                     }
1143                     if (sitem->type & stack_idx) {
1144                         indexword->text = ustrdup(indexstr.text);
1145                         if (index_downcase) {
1146                             word *w;
1147
1148                             ustrlow(indexword->text);
1149                             ustrlow(indexstr.text);
1150
1151                             for (w = idxwordlist; w; w = w->next)
1152                                 if (w->text)
1153                                     ustrlow(w->text);
1154                         }
1155                         indexing = FALSE;
1156                         rdadd(&indexstr, L'\0');
1157                         index_merge(idx, FALSE, indexstr.text,
1158                                     idxwordlist, &sitem->fpos);
1159                         sfree(indexstr.text);
1160                     }
1161                     if (sitem->type & stack_hyper) {
1162                         wd.text = NULL;
1163                         wd.type = word_HyperEnd;
1164                         wd.alt = NULL;
1165                         wd.aux = 0;
1166                         wd.fpos = t.pos;
1167                         wd.breaks = FALSE;
1168                         if (!indexing || index_visible)
1169                             addword(wd, &whptr);
1170                         if (indexing)
1171                             addword(wd, &idximplicit);
1172                     }
1173                     if (sitem->type & stack_quote) {
1174                         wd.text = NULL;
1175                         wd.type = toquotestyle(style);
1176                         wd.alt = NULL;
1177                         wd.aux = quote_Close;
1178                         wd.fpos = t.pos;
1179                         wd.breaks = FALSE;
1180                         if (!indexing || index_visible)
1181                             addword(wd, &whptr);
1182                         if (indexing) {
1183                             rdadd(&indexstr, L'"');
1184                             addword(wd, &idximplicit);
1185                         }
1186                     }
1187                 }
1188                 sfree(sitem);
1189                 break;
1190               case tok_cmd:
1191                 switch (t.cmd) {
1192                   case c__comment:
1193                     /*
1194                      * In-paragraph comment: \#{ balanced braces }
1195                      *
1196                      * Anything goes here; even tok_eop. We should
1197                      * eat whitespace after the close brace _if_
1198                      * there was whitespace before the \#.
1199                      */
1200                     dtor(t), t = get_token(in);
1201                     if (t.type != tok_lbrace) {
1202                         error(err_explbr, &t.pos);
1203                     } else {
1204                         int braces = 1;
1205                         while (braces > 0) {
1206                             dtor(t), t = get_token(in);
1207                             if (t.type == tok_lbrace)
1208                                 braces++;
1209                             else if (t.type == tok_rbrace)
1210                                 braces--;
1211                             else if (t.type == tok_eof) {
1212                                 error(err_commenteof, &t.pos);
1213                                 break;
1214                             }
1215                         }
1216                     }
1217                     if (seenwhite) {
1218                         already = TRUE;
1219                         dtor(t), t = get_token(in);
1220                         if (t.type == tok_white) {
1221                             iswhite = TRUE;
1222                             already = FALSE;
1223                         }
1224                     }
1225                     break;
1226                   case c_q:
1227                   case c_cq:
1228                     type = t.cmd;
1229                     dtor(t), t = get_token(in);
1230                     if (t.type != tok_lbrace) {
1231                         error(err_explbr, &t.pos);
1232                     } else {
1233                         /*
1234                          * Enforce that \q may not be used anywhere
1235                          * within \c. (It shouldn't be necessary
1236                          * since the whole point of \c should be
1237                          * that the user wants to exercise exact
1238                          * control over the glyphs used, and
1239                          * forbidding it has the useful effect of
1240                          * relieving some backends of having to
1241                          * make difficult decisions.)
1242                          */
1243                         int stype;
1244
1245                         if (style != word_Code && style != word_WeakCode) {
1246                             wd.text = NULL;
1247                             wd.type = toquotestyle(style);
1248                             wd.alt = NULL;
1249                             wd.aux = quote_Open;
1250                             wd.fpos = t.pos;
1251                             wd.breaks = FALSE;
1252                             if (!indexing || index_visible)
1253                                 addword(wd, &whptr);
1254                             if (indexing) {
1255                                 rdadd(&indexstr, L'"');
1256                                 addword(wd, &idximplicit);
1257                             }
1258                             stype = stack_quote;
1259                         } else {
1260                             error(err_codequote, &t.pos);
1261                             stype = stack_nop;
1262                         }
1263                         sitem = snew(struct stack_item);
1264                         sitem->fpos = t.pos;
1265                         sitem->type = stype;
1266                         if (type == c_cq) {
1267                             if (style != word_Normal) {
1268                                 error(err_nestedstyles, &t.pos);
1269                             } else {
1270                                 style = word_WeakCode;
1271                                 spcstyle = tospacestyle(style);
1272                                 sitem->type |= stack_style;
1273                             }
1274                         }
1275                         stk_push(parsestk, sitem);
1276                     }
1277                     break;
1278                   case c_K:
1279                   case c_k:
1280                   case c_W:
1281                   case c_date:
1282                     /*
1283                      * Keyword, hyperlink, or \date. We expect a
1284                      * left brace, some text, and then a right
1285                      * brace. No nesting; no arguments.
1286                      */
1287                     wd.fpos = t.pos;
1288                     wd.breaks = FALSE;
1289                     if (t.cmd == c_K)
1290                         wd.type = word_UpperXref;
1291                     else if (t.cmd == c_k)
1292                         wd.type = word_LowerXref;
1293                     else if (t.cmd == c_W)
1294                         wd.type = word_HyperLink;
1295                     else
1296                         wd.type = word_Normal;
1297                     dtor(t), t = get_token(in);
1298                     if (t.type != tok_lbrace) {
1299                         if (wd.type == word_Normal) {
1300                             time_t thetime = time(NULL);
1301                             struct tm *broken = localtime(&thetime);
1302                             already = TRUE;
1303                             wdtext = ustrftime(NULL, broken);
1304                             wd.type = style;
1305                         } else {
1306                             error(err_explbr, &t.pos);
1307                             wdtext = NULL;
1308                         }
1309                     } else {
1310                         rdstring rs = { 0, 0, NULL };
1311                         while (dtor(t), t = get_token(in),
1312                                t.type == tok_word || t.type == tok_white) {
1313                             if (t.type == tok_white)
1314                                 rdadd(&rs, ' ');
1315                             else
1316                                 rdadds(&rs, t.text);
1317                         }
1318                         if (wd.type == word_Normal) {
1319                             time_t thetime = time(NULL);
1320                             struct tm *broken = localtime(&thetime);
1321                             wdtext = ustrftime(rs.text, broken);
1322                             wd.type = style;
1323                         } else {
1324                             wdtext = ustrdup(rs.text);
1325                         }
1326                         sfree(rs.text);
1327                         if (t.type != tok_rbrace) {
1328                             error(err_kwexprbr, &t.pos);
1329                         }
1330                     }
1331                     wd.alt = NULL;
1332                     wd.aux = 0;
1333                     if (!indexing || index_visible) {
1334                         wd.text = ustrdup(wdtext);
1335                         addword(wd, &whptr);
1336                     }
1337                     if (indexing) {
1338                         wd.text = ustrdup(wdtext);
1339                         addword(wd, &idximplicit);
1340                     }
1341                     sfree(wdtext);
1342                     if (wd.type == word_HyperLink) {
1343                         /*
1344                          * Hyperlinks are different: they then
1345                          * expect another left brace, to begin
1346                          * delimiting the text marked by the link.
1347                          */
1348                         dtor(t), t = get_token(in);
1349                         sitem = snew(struct stack_item);
1350                         sitem->fpos = wd.fpos;
1351                         sitem->type = stack_hyper;
1352                         /*
1353                          * Special cases: \W{}\i, \W{}\ii
1354                          */
1355                         if (t.type == tok_cmd &&
1356                             (t.cmd == c_i || t.cmd == c_ii)) {
1357                             if (indexing) {
1358                                 error(err_nestedindex, &t.pos);
1359                             } else {
1360                                 /* Add an index-reference word with no
1361                                  * text as yet */
1362                                 wd.type = word_IndexRef;
1363                                 wd.text = NULL;
1364                                 wd.alt = NULL;
1365                                 wd.aux = 0;
1366                                 wd.breaks = FALSE;
1367                                 indexword = addword(wd, &whptr);
1368                                 /* Set up a rdstring to read the
1369                                  * index text */
1370                                 indexstr = nullrs;
1371                                 /* Flags so that we do the Right
1372                                  * Things with text */
1373                                 index_visible = (type != c_I);
1374                                 index_downcase = (type == c_ii);
1375                                 indexing = TRUE;
1376                                 idxwordlist = NULL;
1377                                 idximplicit = &idxwordlist;
1378
1379                                 sitem->type |= stack_idx;
1380                             }
1381                             dtor(t), t = get_token(in);
1382                         }
1383                         /*
1384                          * Special cases: \W{}\c, \W{}\e, \W{}\cw
1385                          */
1386                         if (t.type == tok_cmd &&
1387                             (t.cmd == c_e || t.cmd == c_c || t.cmd == c_cw)) {
1388                             if (style != word_Normal)
1389                                 error(err_nestedstyles, &t.pos);
1390                             else {
1391                                 style = (t.cmd == c_c ? word_Code :
1392                                          t.cmd == c_cw ? word_WeakCode :
1393                                          word_Emph);
1394                                 spcstyle = tospacestyle(style);
1395                                 sitem->type |= stack_style;
1396                             }
1397                             dtor(t), t = get_token(in);
1398                         }
1399                         if (t.type != tok_lbrace) {
1400                             error(err_explbr, &t.pos);
1401                             sfree(sitem);
1402                         } else {
1403                             stk_push(parsestk, sitem);
1404                         }
1405                     }
1406                     break;
1407                   case c_c:
1408                   case c_cw:
1409                   case c_e:
1410                     type = t.cmd;
1411                     if (style != word_Normal) {
1412                         error(err_nestedstyles, &t.pos);
1413                         /* Error recovery: eat lbrace, push nop. */
1414                         dtor(t), t = get_token(in);
1415                         sitem = snew(struct stack_item);
1416                         sitem->fpos = t.pos;
1417                         sitem->type = stack_nop;
1418                         stk_push(parsestk, sitem);
1419                     }
1420                     dtor(t), t = get_token(in);
1421                     if (t.type != tok_lbrace) {
1422                         error(err_explbr, &t.pos);
1423                     } else {
1424                         style = (type == c_c ? word_Code :
1425                                  type == c_cw ? word_WeakCode :
1426                                  word_Emph);
1427                         spcstyle = tospacestyle(style);
1428                         sitem = snew(struct stack_item);
1429                         sitem->fpos = t.pos;
1430                         sitem->type = stack_style;
1431                         stk_push(parsestk, sitem);
1432                     }
1433                     break;
1434                   case c_i:
1435                   case c_ii:
1436                   case c_I:
1437                     type = t.cmd;
1438                     if (indexing) {
1439                         error(err_nestedindex, &t.pos);
1440                         /* Error recovery: eat lbrace, push nop. */
1441                         dtor(t), t = get_token(in);
1442                         sitem = snew(struct stack_item);
1443                         sitem->fpos = t.pos;
1444                         sitem->type = stack_nop;
1445                         stk_push(parsestk, sitem);
1446                     }
1447                     sitem = snew(struct stack_item);
1448                     sitem->fpos = t.pos;
1449                     sitem->type = stack_idx;
1450                     dtor(t), t = get_token(in);
1451                     /*
1452                      * Special cases: \i\c, \i\e, \i\cw
1453                      */
1454                     wd.fpos = t.pos;
1455                     if (t.type == tok_cmd &&
1456                         (t.cmd == c_e || t.cmd == c_c || t.cmd == c_cw)) {
1457                         if (style != word_Normal)
1458                             error(err_nestedstyles, &t.pos);
1459                         else {
1460                             style = (t.cmd == c_c ? word_Code :
1461                                      t.cmd == c_cw ? word_WeakCode :
1462                                      word_Emph);
1463                             spcstyle = tospacestyle(style);
1464                             sitem->type |= stack_style;
1465                         }
1466                         dtor(t), t = get_token(in);
1467                     }
1468                     if (t.type != tok_lbrace) {
1469                         sfree(sitem);
1470                         error(err_explbr, &t.pos);
1471                     } else {
1472                         /* Add an index-reference word with no text as yet */
1473                         wd.type = word_IndexRef;
1474                         wd.text = NULL;
1475                         wd.alt = NULL;
1476                         wd.aux = 0;
1477                         wd.breaks = FALSE;
1478                         indexword = addword(wd, &whptr);
1479                         /* Set up a rdstring to read the index text */
1480                         indexstr = nullrs;
1481                         /* Flags so that we do the Right Things with text */
1482                         index_visible = (type != c_I);
1483                         index_downcase = (type == c_ii);
1484                         indexing = TRUE;
1485                         idxwordlist = NULL;
1486                         idximplicit = &idxwordlist;
1487                         /* Stack item to close the indexing on exit */
1488                         stk_push(parsestk, sitem);
1489                     }
1490                     break;
1491                   case c_u:
1492                     uchr = t.aux;
1493                     utext[0] = uchr; utext[1] = 0;
1494                     wd.type = style;
1495                     wd.breaks = FALSE;
1496                     wd.alt = NULL;
1497                     wd.aux = 0;
1498                     wd.fpos = t.pos;
1499                     if (!indexing || index_visible) {
1500                         wd.text = ustrdup(utext);
1501                         uword = addword(wd, &whptr);
1502                     } else
1503                         uword = NULL;
1504                     if (indexing) {
1505                         wd.text = ustrdup(utext);
1506                         iword = addword(wd, &idximplicit);
1507                     } else
1508                         iword = NULL;
1509                     dtor(t), t = get_token(in);
1510                     if (t.type == tok_lbrace) {
1511                         /*
1512                          * \u with a left brace. Until the brace
1513                          * closes, all further words go on a
1514                          * sidetrack from the main thread of the
1515                          * paragraph.
1516                          */
1517                         sitem = snew(struct stack_item);
1518                         sitem->fpos = t.pos;
1519                         sitem->type = stack_ualt;
1520                         sitem->whptr = whptr;
1521                         sitem->idximplicit = idximplicit;
1522                         stk_push(parsestk, sitem);
1523                         whptr = uword ? &uword->alt : NULL;
1524                         idximplicit = iword ? &iword->alt : NULL;
1525                     } else {
1526                         if (indexing)
1527                             rdadd(&indexstr, uchr);
1528                         already = TRUE;
1529                     }
1530                     break;
1531                   default:
1532                     if (!macrolookup(macros, in, t.text, &t.pos))
1533                         error(err_badmidcmd, t.text, &t.pos);
1534                     break;
1535                 }
1536             }
1537             if (!already)
1538                 dtor(t), t = get_token(in);
1539             seenwhite = iswhite;
1540         }
1541         finished_para:
1542         /* Check the stack is empty */
1543         if (stk_top(parsestk)) {
1544             while ((sitem = stk_pop(parsestk)))
1545                 sfree(sitem);
1546             error(err_missingrbrace, &t.pos);
1547         }
1548         stk_free(parsestk);
1549         prev_para_type = par.type;
1550         /*
1551          * Before we add the paragraph to the output list, we
1552          * should check that there was any text in it at all; there
1553          * might not be if (for example) the paragraph contained
1554          * nothing but an unrecognised command sequence, and if we
1555          * put an empty paragraph on the list it may confuse the
1556          * back ends later on.
1557          */
1558         if (par.words) {
1559             addpara(par, ret);
1560         }
1561         if (t.type == tok_eof)
1562             already = TRUE;
1563     }
1564
1565     if (stk_top(crossparastk)) {
1566         void *p;
1567
1568         error(err_missingrbrace2, &t.pos);
1569         while ((p = stk_pop(crossparastk)))
1570             sfree(p);
1571     }
1572
1573     /*
1574      * We break to here rather than returning, because otherwise
1575      * this cleanup doesn't happen.
1576      */
1577     dtor(t);
1578
1579     stk_free(crossparastk);
1580 }
1581
1582 struct {
1583     char const *magic;
1584     size_t nmagic;
1585     int binary;
1586     void (*reader)(input *);
1587 } magics[] = {
1588     { "%!FontType1-",     12, FALSE, &read_pfa_file },
1589     { "%!PS-AdobeFont-",  15, FALSE, &read_pfa_file },
1590     { "\x80\x01",          2, TRUE,  &read_pfb_file },
1591     { "StartFontMetrics", 16, FALSE, &read_afm_file },
1592     { "\x00\x01\x00\x00",  4, TRUE,  &read_sfnt_file },
1593     { "true",              4, TRUE,  &read_sfnt_file },
1594 };
1595
1596 paragraph *read_input(input *in, indexdata *idx) {
1597     paragraph *head = NULL;
1598     paragraph **hptr = &head;
1599     tree234 *macros;
1600     char mag[16];
1601     size_t len, i;
1602     int binary;
1603     void (*reader)(input *);
1604
1605     macros = newtree234(macrocmp);
1606
1607     while (in->currindex < in->nfiles) {
1608         setpos(in, in->filenames[in->currindex]);
1609         in->charset = in->defcharset;
1610         in->csstate = charset_init_state;
1611         in->wcpos = in->nwc = 0;
1612         in->pushback_chars = NULL;
1613
1614         if (!in->filenames[in->currindex]) {
1615             in->currfp = stdin;
1616             in->wantclose = FALSE;     /* don't fclose stdin */
1617             /*
1618              * When reading standard input, we always expect to see
1619              * an actual Halibut file and not any of the unusual
1620              * input types like fonts.
1621              */
1622             reader = NULL;
1623         } else {
1624             /*
1625              * Open the file in binary mode to look for magic
1626              * numbers. We'll switch to text mode if we find we're
1627              * looking at a text file type.
1628              */
1629             in->currfp = fopen(in->filenames[in->currindex], "rb");
1630             binary = FALSE; /* default to Halibut source, which is text */
1631             if (in->currfp) {
1632                 in->wantclose = TRUE;
1633                 reader = NULL;
1634                 len = fread(mag, 1, sizeof(mag), in->currfp);
1635                 for (i = 0; i < lenof(magics); i++) {
1636                     if (len >= magics[i].nmagic &&
1637                         memcmp(mag, magics[i].magic, magics[i].nmagic) == 0) {
1638                         reader = magics[i].reader;
1639                         binary = magics[i].binary;
1640                         break;
1641                     }
1642                 }
1643                 rewind(in->currfp);
1644             }
1645             if (!binary) {
1646                 fclose(in->currfp);
1647                 in->currfp = fopen(in->filenames[in->currindex], "r");
1648             }
1649         }
1650         if (in->currfp) {
1651             if (reader == NULL) {
1652                 read_file(&hptr, in, idx, macros);
1653             } else {
1654                 (*reader)(in);
1655             }
1656         }
1657         in->currindex++;
1658     }
1659
1660     macrocleanup(macros);
1661
1662     return head;
1663 }