mdw@git.distorted.org.uk Git - sgt/halibut/blob - input.c

   1 /*
   2  * input.c: read the source form
   3  */
   4
   5 #include <stdio.h>
   6 #include <assert.h>
   7 #include <time.h>
   8 #include "halibut.h"
   9
  10 #define TAB_STOP 8                     /* for column number tracking */
  11
  12 static void setpos(input *in, char *fname) {
  13     in->pos.filename = fname;
  14     in->pos.line = 1;
  15     in->pos.col = (in->reportcols ? 1 : -1);
  16 }
  17
  18 static void unget(input *in, int c, filepos *pos) {
  19     if (in->npushback >= in->pushbacksize) {
  20         in->pushbacksize = in->npushback + 16;
  21         in->pushback = sresize(in->pushback, in->pushbacksize, pushback);
  22     }
  23     in->pushback[in->npushback].chr = c;
  24     in->pushback[in->npushback].pos = *pos;   /* structure copy */
  25     in->npushback++;
  26 }
  27
  28 /* ---------------------------------------------------------------------- */
  29 /*
  30  * Macro subsystem
  31  */
/*
 * One macro definition as stored in the macros tree: `name' is the
 * key the tree is ordered by (see macrocmp), `text' is the expansion
 * that macrolookup hands to the input stack. Both strings are freed
 * by macrocleanup.
 */
typedef struct macro_Tag macro;
struct macro_Tag {
    wchar_t *name, *text;
};
/*
 * One in-progress macro expansion on the input's expansion stack.
 *
 *  - `next' is the expansion that was on top before this one.
 *  - `text' points at the macro's stored text (not a copy) and
 *    `ptr' is the index of the next character get() will return.
 *  - `npushback' records in->npushback when the expansion began;
 *    get() only pops pushed-back characters above that mark.
 *  - `pos' is the position reported for every expanded character.
 */
struct macrostack_Tag {
    macrostack *next;
    wchar_t *text;
    int ptr, npushback;
    filepos pos;
};
  42 static int macrocmp(void *av, void *bv) {
  43     macro *a = (macro *)av, *b = (macro *)bv;
  44     return ustrcmp(a->name, b->name);
  45 }
  46 static void macrodef(tree234 *macros, wchar_t *name, wchar_t *text,
  47                      filepos fpos) {
  48     macro *m = snew(macro);
  49     m->name = name;
  50     m->text = text;
  51     if (add234(macros, m) != m) {
  52         error(err_macroexists, &fpos, name);
  53         sfree(name);
  54         sfree(text);
  55     }
  56 }
  57 static int macrolookup(tree234 *macros, input *in, wchar_t *name,
  58                        filepos *pos) {
  59     macro m, *gotit;
  60     m.name = name;
  61     gotit = find234(macros, &m, NULL);
  62     if (gotit) {
  63         macrostack *expansion = snew(macrostack);
  64         expansion->next = in->stack;
  65         expansion->text = gotit->text;
  66         expansion->pos = *pos;         /* structure copy */
  67         expansion->ptr = 0;
  68         expansion->npushback = in->npushback;
  69         in->stack = expansion;
  70         return TRUE;
  71     } else
  72         return FALSE;
  73 }
  74 static void macrocleanup(tree234 *macros) {
  75     int ti;
  76     macro *m;
  77     for (ti = 0; (m = (macro *)index234(macros, ti)) != NULL; ti++) {
  78         sfree(m->name);
  79         sfree(m->text);
  80         sfree(m);
  81     }
  82     freetree234(macros);
  83 }
  84
  85 static void input_configure(input *in, paragraph *cfg) {
  86     assert(cfg->type == para_Config);
  87
  88     if (!ustricmp(cfg->keyword, L"input-charset")) {
  89         in->charset = charset_from_ustr(&cfg->fpos, uadv(cfg->keyword));
  90     }
  91 }
  92
  93 /*
  94  * Can return EOF
  95  */
  96 static int get(input *in, filepos *pos, rdstringc *rsc) {
  97     int pushbackpt = in->stack ? in->stack->npushback : 0;
  98     if (in->npushback > pushbackpt) {
  99         --in->npushback;
 100         if (pos)
 101             *pos = in->pushback[in->npushback].pos;   /* structure copy */
 102         return in->pushback[in->npushback].chr;
 103     }
 104     else if (in->stack) {
 105         wchar_t c = in->stack->text[in->stack->ptr];
 106         if (pos)
 107             *pos = in->stack->pos;
 108         if (in->stack->text[++in->stack->ptr] == L'\0') {
 109             macrostack *tmp = in->stack;
 110             in->stack = tmp->next;
 111             sfree(tmp);
 112         }
 113         return c;
 114     }
 115     else if (in->currfp) {
 116
 117         while (in->wcpos >= in->nwc) {
 118
 119             int c = getc(in->currfp);
 120
 121             if (c == EOF) {
 122                 fclose(in->currfp);
 123                 in->currfp = NULL;
 124                 return EOF;
 125             }
 126
 127             if (rsc)
 128                 rdaddc(rsc, c);
 129
 130             /* Track line numbers, for error reporting */
 131             if (pos)
 132                 *pos = in->pos;
 133             if (in->reportcols) {
 134                 switch (c) {
 135                   case '\t':
 136                     in->pos.col = 1 + (in->pos.col + TAB_STOP-1) % TAB_STOP;
 137                     break;
 138                   case '\n':
 139                     in->pos.col = 1;
 140                     in->pos.line++;
 141                     break;
 142                   default:
 143                     in->pos.col++;
 144                     break;
 145                 }
 146             } else {
 147                 in->pos.col = -1;
 148                 if (c == '\n')
 149                     in->pos.line++;
 150             }
 151
 152             /*
 153              * Do input character set translation, so that we return
 154              * Unicode.
 155              */
 156             {
 157                 char buf[1];
 158                 char const *p;
 159                 int inlen;
 160
 161                 buf[0] = (char)c;
 162                 p = buf;
 163                 inlen = 1;
 164
 165                 in->nwc = charset_to_unicode(&p, &inlen,
 166                                              in->wc, lenof(in->wc),
 167                                              in->charset, &in->csstate,
 168                                              NULL, 0);
 169                 assert(p == buf+1 && inlen == 0);
 170
 171                 in->wcpos = 0;
 172             }
 173         }
 174
 175         return in->wc[in->wcpos++];
 176
 177     } else
 178         return EOF;
 179 }
 180
 181 /*
 182  * Lexical analysis of source files.
 183  */
/*
 * A lexical token as produced by get_token / get_codepar_token.
 *
 *  - `type' is one of the tok_* values.
 *  - `cmd' is a c_* command code, filled in by match_kw for tok_cmd.
 *  - `aux' carries the number for c_S, the character code for c_u,
 *    and for tok_word is TRUE if the word ended in a hyphen.
 *  - `text' is the token text in wide characters; `origtext' is the
 *    raw bytes read for it. Both may be NULL and are freed by dtor.
 *  - `pos' is the position of the token's first character.
 */
typedef struct token_Tag token;
struct token_Tag {
    int type;
    int cmd, aux;
    wchar_t *text;
    char *origtext;
    filepos pos;
};
/* Token types, stored in the `type' field of a token. */
enum {
    tok_eof,                           /* end of file */
    tok_eop,                           /* end of paragraph */
    tok_white,                         /* whitespace */
    tok_word,                          /* a word or word fragment */
    tok_cmd,                           /* \command */
    tok_lbrace,                        /* { */
    tok_rbrace                         /* } */
};
 201
/*
 * Halibut command keywords. These are the values match_kw stores in
 * the `cmd' field of a tok_cmd token.
 */
enum {
    c__invalid,                        /* invalid command */
    c__comment,                        /* comment command (\#) */
    c__escaped,                        /* escaped character */
    c__nop,                            /* no-op */
    c__nbsp,                           /* nonbreaking space */
    c_A,                               /* appendix heading */
    c_B,                               /* bibliography entry */
    c_BR,                              /* bibliography rewrite */
    c_C,                               /* chapter heading */
    c_H,                               /* heading */
    c_I,                               /* invisible index mark */
    c_IM,                              /* index merge/rewrite */
    c_K,                               /* capitalised cross-reference */
    c_S,                               /* aux field is 0, 1, 2, ... */
    c_U,                               /* unnumbered-chapter heading */
    c_W,                               /* Web hyperlink */
    c_b,                               /* bulleted list */
    c_c,                               /* code */
    c_cfg,                             /* configuration directive */
    c_copyright,                       /* copyright statement */
    c_cq,                              /* quoted code (sugar for \q{\cw{x}}) */
    c_cw,                              /* weak code */
    c_date,                            /* document processing date */
    c_dd,                              /* description list: description */
    c_define,                          /* macro definition */
    c_dt,                              /* description list: described thing */
    c_e,                               /* emphasis */
    c_i,                               /* visible index mark */
    c_ii,                              /* uncapitalised visible index mark */
    c_k,                               /* uncapitalised cross-reference */
    c_lcont,                           /* continuation para(s) for list item */
    c_n,                               /* numbered list */
    c_nocite,                          /* bibliography trickery */
    c_preamble,                        /* (obsolete) preamble text */
    c_q,                               /* quote marks */
    c_quote,                           /* block-quoted paragraphs */
    c_rule,                            /* horizontal rule */
    c_title,                           /* document title */
    c_u,                               /* aux field is char code */
    c_versionid                        /* document RCS id */
};
 245
 246 /* Perhaps whitespace should be defined in a more Unicode-friendly way? */
 247 #define iswhite(c) ( (c)==32 || (c)==9 || (c)==13 || (c)==10 )
 248 #define isnl(c) ( (c)==10 )
 249 #define isdec(c) ( ((c)>='0'&&(c)<='9') )
 250 #define fromdec(c) ( (c)-'0' )
 251 #define ishex(c) ( ((c)>='0'&&(c)<='9') || ((c)>='A'&&(c)<='F') || ((c)>='a'&&(c)<='f'))
 252 #define fromhex(c) ( (c)<='9' ? (c)-'0' : ((c)&0xDF) - ('A'-10) )
 253 #define iscmd(c) ( ((c)>='0'&&(c)<='9') || ((c)>='A'&&(c)<='Z') || ((c)>='a'&&(c)<='z'))
 254
 255 /*
 256  * Keyword comparison function. Like strcmp, but between a wchar_t *
 257  * and a char *.
 258  */
 259 static int kwcmp(wchar_t const *p, char const *q) {
 260     int i;
 261     do {
 262         i = *p - *q;
 263     } while (*p++ && *q++ && !i);
 264     return i;
 265 }
 266
 267 /*
 268  * Match a keyword.
 269  */
 270 static void match_kw(token *tok) {
 271     /*
 272      * FIXME. The ids are explicit in here so as to allow long-name
 273      * equivalents to the various very short keywords.
 274      */
 275     static const struct { char const *name; int id; } keywords[] = {
 276         {"#", c__comment},             /* comment command (\#) */
 277         {"-", c__escaped},             /* nonbreaking hyphen */
 278         {".", c__nop},                 /* no-op */
 279         {"A", c_A},                    /* appendix heading */
 280         {"B", c_B},                    /* bibliography entry */
 281         {"BR", c_BR},                  /* bibliography rewrite */
 282         {"C", c_C},                    /* chapter heading */
 283         {"H", c_H},                    /* heading */
 284         {"I", c_I},                    /* invisible index mark */
 285         {"IM", c_IM},                  /* index merge/rewrite */
 286         {"K", c_K},                    /* capitalised cross-reference */
 287         {"U", c_U},                    /* unnumbered-chapter heading */
 288         {"W", c_W},                    /* Web hyperlink */
 289         {"\\", c__escaped},            /* escaped backslash (\\) */
 290         {"_", c__nbsp},                /* nonbreaking space (\_) */
 291         {"b", c_b},                    /* bulletted list */
 292         {"c", c_c},                    /* code */
 293         {"cfg", c_cfg},                /* configuration directive */
 294         {"copyright", c_copyright},    /* copyright statement */
 295         {"cq", c_cq},                  /* quoted code (sugar for \q{\cw{x}}) */
 296         {"cw", c_cw},                  /* weak code */
 297         {"date", c_date},              /* document processing date */
 298         {"dd", c_dd},                  /* description list: description */
 299         {"define", c_define},          /* macro definition */
 300         {"dt", c_dt},                  /* description list: described thing */
 301         {"e", c_e},                    /* emphasis */
 302         {"i", c_i},                    /* visible index mark */
 303         {"ii", c_ii},                  /* uncapitalised visible index mark */
 304         {"k", c_k},                    /* uncapitalised cross-reference */
 305         {"lcont", c_lcont},            /* continuation para(s) for list item */
 306         {"n", c_n},                    /* numbered list */
 307         {"nocite", c_nocite},          /* bibliography trickery */
 308         {"preamble", c_preamble},      /* (obsolete) preamble text */
 309         {"q", c_q},                    /* quote marks */
 310         {"quote", c_quote},            /* block-quoted paragraphs */
 311         {"rule", c_rule},              /* horizontal rule */
 312         {"title", c_title},            /* document title */
 313         {"versionid", c_versionid},    /* document RCS id */
 314         {"{", c__escaped},             /* escaped lbrace (\{) */
 315         {"}", c__escaped},             /* escaped rbrace (\}) */
 316     };
 317     int i, j, k, c;
 318
 319     /*
 320      * Special cases: \S{0,1,2,...} and \uABCD. If the syntax
 321      * doesn't match correctly, we just fall through to the
 322      * binary-search phase.
 323      */
 324     if (tok->text[0] == 'S') {
 325         /* We expect numeric characters thereafter. */
 326         wchar_t *p = tok->text+1;
 327         int n;
 328         if (!*p)
 329             n = 1;
 330         else {
 331             n = 0;
 332             while (*p && isdec(*p)) {
 333                 n = 10 * n + fromdec(*p);
 334                 p++;
 335             }
 336         }
 337         if (!*p) {
 338             tok->cmd = c_S;
 339             tok->aux = n;
 340             return;
 341         }
 342     } else if (tok->text[0] == 'u') {
 343         /* We expect hex characters thereafter. */
 344         wchar_t *p = tok->text+1;
 345         int n = 0;
 346         while (*p && ishex(*p)) {
 347             n = 16 * n + fromhex(*p);
 348             p++;
 349         }
 350         if (!*p) {
 351             tok->cmd = c_u;
 352             tok->aux = n;
 353             return;
 354         }
 355     }
 356
 357     i = -1;
 358     j = sizeof(keywords)/sizeof(*keywords);
 359     while (j-i > 1) {
 360         k = (i+j)/2;
 361         c = kwcmp(tok->text, keywords[k].name);
 362         if (c < 0)
 363             j = k;
 364         else if (c > 0)
 365             i = k;
 366         else /* c == 0 */ {
 367             tok->cmd = keywords[k].id;
 368             return;
 369         }
 370     }
 371
 372     tok->cmd = c__invalid;
 373 }
 374
 375
/*
 * Read a token from the input file, in the normal way (`normal' in
 * the sense that code paragraphs work a different way).
 *
 * Returns the token by value. For tok_cmd and tok_word, `text' and
 * `origtext' are freshly allocated; for other types they are NULL.
 *
 * Bookkeeping: `rs' collects the token's wide-character text, while
 * `rsc' collects the raw bytes that get() reads from the file.
 * `prevpos' is the offset in `rsc' where this token's bytes end;
 * anything after it belongs to a character that has been pushed
 * back, and is stashed in in->pushback_chars to seed `rsc' on the
 * next call.
 */
token get_token(input *in) {
    int c;
    int nls;
    int prevpos;
    token ret;
    rdstring rs = { 0, 0, NULL };
    rdstringc rsc = { 0, 0, NULL };
    filepos cpos;

    ret.text = NULL;                   /* default */
    ret.origtext = NULL;               /* default */
    /* Start `rsc' with the raw bytes left over from the previous call. */
    if (in->pushback_chars) {
        rdaddsc(&rsc, in->pushback_chars);
        sfree(in->pushback_chars);
        in->pushback_chars = NULL;
    }
    c = get(in, &cpos, &rsc);
    ret.pos = cpos;
    if (iswhite(c)) {                  /* tok_white or tok_eop */
        nls = 0;
        prevpos = 0;
        /* Swallow the whole run of whitespace, counting newlines. */
        do {
            if (isnl(c))
                nls++;
            prevpos = rsc.pos;
        } while ((c = get(in, &cpos, &rsc)) != EOF && iswhite(c));
        /* Whitespace running up to end of file is reported as EOF. */
        if (c == EOF) {
            ret.type = tok_eof;
            sfree(rsc.text);
            return ret;
        }
        if (rsc.text) {
            in->pushback_chars = dupstr(rsc.text + prevpos);
            sfree(rsc.text);
        }
        unget(in, c, &cpos);
        /* More than one newline in the run ends the paragraph. */
        ret.type = (nls > 1 ? tok_eop : tok_white);
        return ret;
    } else if (c == EOF) {             /* tok_eof */
        ret.type = tok_eof;
        sfree(rsc.text);
        return ret;
    } else if (c == '\\') {            /* tok_cmd */
        /* Restart `rsc' so that origtext excludes the backslash. */
        rsc.pos = prevpos = 0;
        c = get(in, &cpos, &rsc);
        if (c == '-' || c == '\\' || c == '_' ||
            c == '#' || c == '{' || c == '}' || c == '.') {
            /* single-char command */
            rdadd(&rs, c);
            prevpos = rsc.pos;
        } else if (c == 'u') {
            /* `u' followed by at most four hex digits. */
            int len = 0;
            do {
                rdadd(&rs, c);
                len++;
                prevpos = rsc.pos;
                c = get(in, &cpos, &rsc);
            } while (ishex(c) && len < 5);
            unget(in, c, &cpos);
        } else if (iscmd(c)) {
            /* Ordinary command name: a run of letters and digits. */
            do {
                rdadd(&rs, c);
                prevpos = rsc.pos;
                c = get(in, &cpos, &rsc);
            } while (iscmd(c));
            unget(in, c, &cpos);
        }
        /*
         * Now match the command against the list of available
         * ones.
         */
        ret.type = tok_cmd;
        ret.text = ustrdup(rs.text);
        if (rsc.text) {
            /* Split the raw bytes: tail is kept for the next call. */
            in->pushback_chars = dupstr(rsc.text + prevpos);
            rsc.text[prevpos] = '\0';
            ret.origtext = dupstr(rsc.text);
        } else {
            ret.origtext = dupstr("");
        }
        match_kw(&ret);
        sfree(rs.text);
        sfree(rsc.text);
        return ret;
    } else if (c == '{') {             /* tok_lbrace */
        ret.type = tok_lbrace;
        sfree(rsc.text);
        return ret;
    } else if (c == '}') {             /* tok_rbrace */
        ret.type = tok_rbrace;
        sfree(rsc.text);
        return ret;
    } else {                           /* tok_word */
        /*
         * Read a word: the longest possible contiguous sequence of
         * things other than whitespace, backslash, braces and
         * hyphen. A hyphen terminates the word but is returned as
         * part of it; everything else is pushed back for the next
         * token. The `aux' field contains TRUE if the word ends in
         * a hyphen.
         */
        ret.aux = FALSE;               /* assumed for now */
        prevpos = 0;
        while (1) {
            if (iswhite(c) || c=='{' || c=='}' || c=='\\' || c==EOF) {
                /* Put back the character that caused termination */
                unget(in, c, &cpos);
                break;
            } else {
                rdadd(&rs, c);
                if (c == '-') {
                    prevpos = rsc.pos;
                    ret.aux = TRUE;
                    break;             /* hyphen terminates word */
                }
            }
            prevpos = rsc.pos;
            c = get(in, &cpos, &rsc);
        }
        ret.type = tok_word;
        ret.text = ustrdup(rs.text);
        if (rsc.text) {
            /* Split the raw bytes: tail is kept for the next call. */
            in->pushback_chars = dupstr(rsc.text + prevpos);
            rsc.text[prevpos] = '\0';
            ret.origtext = dupstr(rsc.text);
        } else {
            ret.origtext = dupstr("");
        }
        sfree(rs.text);
        sfree(rsc.text);
        return ret;
    }
}
 513
 514 /*
 515  * Determine whether the next input character is an open brace (for
 516  * telling code paragraphs from paragraphs which merely start with
 517  * code).
 518  */
 519 int isbrace(input *in) {
 520     int c;
 521     filepos cpos;
 522
 523     c = get(in, &cpos, NULL);
 524     unget(in, c, &cpos);
 525     return (c == '{');
 526 }
 527
 528 /*
 529  * Read the rest of a line that starts `\c'. Including nothing at
 530  * all (tok_word with empty text).
 531  */
 532 token get_codepar_token(input *in) {
 533     int c;
 534     token ret;
 535     rdstring rs = { 0, 0, NULL };
 536     filepos cpos;
 537
 538     ret.type = tok_word;
 539     ret.origtext = NULL;
 540     c = get(in, &cpos, NULL);          /* expect (and discard) one space */
 541     ret.pos = cpos;
 542     if (c == ' ') {
 543         c = get(in, &cpos, NULL);
 544         ret.pos = cpos;
 545     }
 546     while (!isnl(c) && c != EOF) {
 547         int c2 = c;
 548         c = get(in, &cpos, NULL);
 549         /* Discard \r just before \n. */
 550         if (c2 != 13 || !isnl(c))
 551             rdadd(&rs, c2);
 552     }
 553     unget(in, c, &cpos);
 554     ret.text = ustrdup(rs.text);
 555     sfree(rs.text);
 556     return ret;
 557 }
 558
 559 /*
 560  * Adds a new word to a linked list
 561  */
 562 static word *addword(word newword, word ***hptrptr) {
 563     word *mnewword;
 564     if (!hptrptr)
 565         return NULL;
 566     mnewword = snew(word);
 567     *mnewword = newword;               /* structure copy */
 568     mnewword->next = NULL;
 569     **hptrptr = mnewword;
 570     *hptrptr = &mnewword->next;
 571     return mnewword;
 572 }
 573
 574 /*
 575  * Adds a new paragraph to a linked list
 576  */
 577 static paragraph *addpara(paragraph newpara, paragraph ***hptrptr) {
 578     paragraph *mnewpara = snew(paragraph);
 579     *mnewpara = newpara;               /* structure copy */
 580     mnewpara->next = NULL;
 581     **hptrptr = mnewpara;
 582     *hptrptr = &mnewpara->next;
 583     return mnewpara;
 584 }
 585
 586 /*
 587  * Destructor before token is reassigned; should catch most memory
 588  * leaks
 589  */
 590 #define dtor(t) ( sfree(t.text), sfree(t.origtext) )
 591
 592 /*
 593  * Reads a single file (ie until get() returns EOF)
 594  */
 595 static void read_file(paragraph ***ret, input *in, indexdata *idx,
 596                       tree234 *macros) {
 597     token t;
 598     paragraph par;
 599     word wd, **whptr, **idximplicit;
 600     wchar_t utext[2], *wdtext;
 601     int style, spcstyle;
 602     int already;
 603     int iswhite, seenwhite;
 604     int type;
 605     int prev_para_type;
 606     struct stack_item {
 607         enum {
 608             stack_nop = 0,             /* do nothing (for error recovery) */
 609             stack_ualt = 1,            /* \u alternative */
 610             stack_style = 2,           /* \e, \c, \cw */
 611             stack_idx = 4,             /* \I, \i, \ii */
 612             stack_hyper = 8,           /* \W */
 613             stack_quote = 16           /* \q */
 614         } type;
 615         word **whptr;                  /* to restore from \u alternatives */
 616         word **idximplicit;            /* to restore from \u alternatives */
 617         filepos fpos;
 618         int in_code;
 619     } *sitem;
 620     stack parsestk;
 621     struct crossparaitem {
 622         int type;                      /* currently c_lcont, c_quote or -1 */
 623         int seen_lcont, seen_quote;
 624     };
 625     stack crossparastk;
 626     word *indexword, *uword, *iword;
 627     word *idxwordlist;
 628     rdstring indexstr;
 629     int index_downcase, index_visible, indexing;
 630     const rdstring nullrs = { 0, 0, NULL };
 631     wchar_t uchr;
 632
 633     t.text = NULL;
 634     t.origtext = NULL;
 635     already = FALSE;
 636
 637     crossparastk = stk_new();
 638
 639     /*
 640      * Loop on each paragraph.
 641      */
 642     while (1) {
 643         int start_cmd = c__invalid;
 644         par.words = NULL;
 645         par.keyword = NULL;
 646         par.origkeyword = NULL;
 647         whptr = &par.words;
 648
 649         /*
 650          * Get a token.
 651          */
 652         do {
 653             if (!already) {
 654                 dtor(t), t = get_token(in);
 655             }
 656             already = FALSE;
 657         } while (t.type == tok_eop);
 658         if (t.type == tok_eof)
 659             break;
 660
 661         /*
 662          * Parse code paragraphs separately.
 663          */
 664         if (t.type == tok_cmd && t.cmd == c_c && !isbrace(in)) {
 665             int wtype = word_WeakCode;
 666
 667             par.type = para_Code;
 668             par.fpos = t.pos;
 669             while (1) {
 670                 dtor(t), t = get_codepar_token(in);
 671                 wd.type = wtype;
 672                 wd.breaks = FALSE;     /* shouldn't need this... */
 673                 wd.text = ustrdup(t.text);
 674                 wd.alt = NULL;
 675                 wd.fpos = t.pos;
 676                 addword(wd, &whptr);
 677                 dtor(t), t = get_token(in);
 678                 if (t.type == tok_white) {
 679                     /*
 680                      * The newline after a code-paragraph line
 681                      */
 682                     dtor(t), t = get_token(in);
 683                 }
 684                 if (t.type == tok_eop || t.type == tok_eof ||
 685                     t.type == tok_rbrace) { /* might be } terminating \lcont */
 686                     if (t.type == tok_rbrace)
 687                         already = TRUE;
 688                     break;
 689                 } else if (t.type == tok_cmd && t.cmd == c_c) {
 690                     wtype = word_WeakCode;
 691                 } else if (t.type == tok_cmd && t.cmd == c_e &&
 692                            wtype == word_WeakCode) {
 693                     wtype = word_Emph;
 694                 } else {
 695                     error(err_brokencodepara, &t.pos);
 696                     prev_para_type = par.type;
 697                     addpara(par, ret);
 698                     while (t.type != tok_eop)   /* error recovery: */
 699                         dtor(t), t = get_token(in);   /* eat rest of paragraph */
 700                     goto codeparabroken;   /* ick, but such is life */
 701                 }
 702             }
 703             prev_para_type = par.type;
 704             addpara(par, ret);
 705             codeparabroken:
 706             continue;
 707         }
 708
 709         /*
 710          * Spot the special commands that define a grouping of more
 711          * than one paragraph, and also the closing braces that
 712          * finish them.
 713          */
 714         if (t.type == tok_cmd &&
 715             (t.cmd == c_lcont || t.cmd == c_quote)) {
 716             struct crossparaitem *sitem, *stop;
 717             int cmd = t.cmd;
 718
 719             /*
 720              * Expect, and swallow, an open brace.
 721              */
 722             dtor(t), t = get_token(in);
 723             if (t.type != tok_lbrace) {
 724                 error(err_explbr, &t.pos);
 725                 continue;
 726             }
 727
 728             /*
 729              * Also expect, and swallow, any whitespace after that
 730              * (a newline before a code paragraph wouldn't be
 731              * surprising).
 732              */
 733             do {
 734                 dtor(t), t = get_token(in);
 735             } while (t.type == tok_white);
 736             already = TRUE;
 737
 738             if (cmd == c_lcont) {
 739                 /*
 740                  * \lcont causes a continuation of a list item into
 741                  * multiple paragraphs (which may in turn contain
 742                  * nested lists, code paras etc). Hence, the previous
 743                  * paragraph must be of a list type.
 744                  */
 745                 sitem = snew(struct crossparaitem);
 746                 stop = (struct crossparaitem *)stk_top(crossparastk);
 747                 if (stop)
 748                     *sitem = *stop;
 749                 else
 750                     sitem->seen_quote = sitem->seen_lcont = 0;
 751
 752                 if (prev_para_type == para_Bullet ||
 753                     prev_para_type == para_NumberedList ||
 754                     prev_para_type == para_Description) {
 755                     sitem->type = c_lcont;
 756                     sitem->seen_lcont = 1;
 757                     par.type = para_LcontPush;
 758                     prev_para_type = par.type;
 759                     addpara(par, ret);
 760                 } else {
 761                     /*
 762                      * Push a null item on the cross-para stack so that
 763                      * when we see the corresponding closing brace we
 764                      * don't give a cascade error.
 765                      */
 766                     sitem->type = -1;
 767                     error(err_misplacedlcont, &t.pos);
 768                 }
 769             } else {
 770                 /*
 771                  * \quote causes a group of paragraphs to be
 772                  * block-quoted (typically they will be indented a
 773                  * bit).
 774                  */
 775                 sitem = snew(struct crossparaitem);
 776                 stop = (struct crossparaitem *)stk_top(crossparastk);
 777                 if (stop)
 778                     *sitem = *stop;
 779                 else
 780                     sitem->seen_quote = sitem->seen_lcont = 0;
 781                 sitem->type = c_quote;
 782                 sitem->seen_quote = 1;
 783                 par.type = para_QuotePush;
 784                 prev_para_type = par.type;
 785                 addpara(par, ret);
 786             }
 787             stk_push(crossparastk, sitem);
 788             continue;
 789         } else if (t.type == tok_rbrace) {
 790             struct crossparaitem *sitem = stk_pop(crossparastk);
 791             if (!sitem)
 792                 error(err_unexbrace, &t.pos);
 793             else {
 794                 switch (sitem->type) {
 795                   case c_lcont:
 796                     par.type = para_LcontPop;
 797                     prev_para_type = par.type;
 798                     addpara(par, ret);
 799                     break;
 800                   case c_quote:
 801                     par.type = para_QuotePop;
 802                     prev_para_type = par.type;
 803                     addpara(par, ret);
 804                     break;
 805                 }
 806                 sfree(sitem);
 807             }
 808             continue;
 809         }
 810
 811         while (t.type == tok_cmd &&
 812                macrolookup(macros, in, t.text, &t.pos)) {
 813             dtor(t), t = get_token(in);
 814         }
 815
 816         /*
 817          * This token begins a paragraph. See if it's one of the
 818          * special commands that define a paragraph type.
 819          *
 820          * (note that \# is special in a way, and \nocite takes no
 821          * text)
 822          */
 823         par.type = para_Normal;
 824         if (t.type == tok_cmd) {
 825             int needkw;
 826             int is_macro = FALSE;
 827
 828             par.fpos = t.pos;
 829             switch (t.cmd) {
 830               default:
 831                 needkw = -1;
 832                 break;
 833               case c__invalid:
 834                 error(err_badparatype, t.text, &t.pos);
 835                 needkw = 4;
 836                 break;
 837               case c__comment:
 838                 if (isbrace(in))
 839                     break;             /* `\#{': isn't a comment para */
 840                 do {
 841                     dtor(t), t = get_token(in);
 842                 } while (t.type != tok_eop && t.type != tok_eof);
 843                 continue;              /* next paragraph */
 844                 /*
 845                  * `needkw' values:
 846                  *
 847                  *   1 -- exactly one keyword
 848                  *   2 -- at least one keyword
 849                  *   4 -- an optional keyword: zero or one, but no more
 850                  *   8 -- at least one keyword and then nothing else
 851                  *  16 -- nothing at all! no keywords, no body
 852                  *  32 -- no keywords at all
 853                  */
 854               case c_A: needkw = 2; par.type = para_Appendix; break;
 855               case c_B: needkw = 2; par.type = para_Biblio; break;
 856               case c_BR: needkw = 1; par.type = para_BR;
 857                 start_cmd = c_BR; break;
 858               case c_C: needkw = 2; par.type = para_Chapter; break;
 859               case c_H: needkw = 2; par.type = para_Heading;
 860                 par.aux = 0;
 861                 break;
 862               case c_IM: needkw = 2; par.type = para_IM;
 863                 start_cmd = c_IM; break;
 864               case c_S: needkw = 2; par.type = para_Subsect;
 865                 par.aux = t.aux; break;
 866               case c_U: needkw = 32; par.type = para_UnnumberedChapter; break;
 867                 /* For \b and \n the keyword is optional */
 868               case c_b: needkw = 4; par.type = para_Bullet; break;
 869               case c_dt: needkw = 4; par.type = para_DescribedThing; break;
 870               case c_dd: needkw = 4; par.type = para_Description; break;
 871               case c_n: needkw = 4; par.type = para_NumberedList; break;
 872               case c_cfg: needkw = 8; par.type = para_Config;
 873                 start_cmd = c_cfg; break;
 874               case c_copyright: needkw = 32; par.type = para_Copyright; break;
 875               case c_define: is_macro = TRUE; needkw = 1; break;
 876                 /* For \nocite the keyword is _everything_ */
 877               case c_nocite: needkw = 8; par.type = para_NoCite; break;
 878               case c_preamble: needkw = 32; par.type = para_Normal; break;
 879               case c_rule: needkw = 16; par.type = para_Rule; break;
 880               case c_title: needkw = 32; par.type = para_Title; break;
 881               case c_versionid: needkw = 32; par.type = para_VersionID; break;
 882             }
 883
 884             if (par.type == para_Chapter ||
 885                 par.type == para_Heading ||
 886                 par.type == para_Subsect ||
 887                 par.type == para_Appendix ||
 888                 par.type == para_UnnumberedChapter) {
 889                 struct crossparaitem *sitem = stk_top(crossparastk);
 890                 if (sitem && (sitem->seen_lcont || sitem->seen_quote)) {
 891                     error(err_sectmarkerinblock,
 892                           &t.pos,
 893                           (sitem->seen_lcont ? "lcont" : "quote"));
 894                 }
 895             }
 896
 897             if (needkw > 0) {
 898                 rdstring rs = { 0, 0, NULL };
 899                 rdstringc rsc = { 0, 0, NULL };
 900                 int nkeys = 0;
 901                 filepos fp;
 902
 903                 /* Get keywords. */
 904                 dtor(t), t = get_token(in);
 905                 fp = t.pos;
 906                 while (t.type == tok_lbrace ||
 907                        (t.type == tok_white && (needkw & 24))) {
 908                     /*
 909                      * In paragraph types which can't accept any
 910                      * body text (such as \cfg), we are lenient
 911                      * about whitespace between keywords. This is
 912                      * important for \cfg in particular since it
 913                      * can often have many keywords which are long
 914                      * pieces of text, so it's useful to permit the
 915                      * user to wrap the line between them.
 916                      */
 917                     if (t.type == tok_white) {
 918                         dtor(t), t = get_token(in); /* eat the space */
 919                         continue;
 920                     }
 921                     /* This is a keyword. */
 922                     nkeys++;
 923                     /* FIXME: there will be bugs if anyone specifies an
 924                      * empty keyword (\foo{}), so trap this case. */
 925                     while (dtor(t), t = get_token(in),
 926                            t.type == tok_word ||
 927                            t.type == tok_white ||
 928                            (t.type == tok_cmd && t.cmd == c__nbsp) ||
 929                            (t.type == tok_cmd && t.cmd == c__escaped) ||
 930                            (t.type == tok_cmd && t.cmd == c_u)) {
 931                         if (t.type == tok_white ||
 932                             (t.type == tok_cmd && t.cmd == c__nbsp)) {
 933                             rdadd(&rs, ' ');
 934                             rdaddc(&rsc, ' ');
 935                         } else if (t.type == tok_cmd && t.cmd == c_u) {
 936                             rdadd(&rs, t.aux);
 937                             rdaddc(&rsc, '\\');
 938                             rdaddsc(&rsc, t.origtext);
 939                         } else {
 940                             rdadds(&rs, t.text);
 941                             rdaddsc(&rsc, t.origtext);
 942                         }
 943                     }
 944                     if (t.type != tok_rbrace) {
 945                         error(err_kwunclosed, &t.pos);
 946                         continue;
 947                     }
 948                     rdadd(&rs, 0);     /* add string terminator */
 949                     rdaddc(&rsc, 0);   /* add string terminator */
 950                     dtor(t), t = get_token(in); /* eat right brace */
 951                 }
 952
 953                 rdadd(&rs, 0);         /* add string terminator */
 954                 rdaddc(&rsc, 0);       /* add string terminator */
 955
 956                 /* See whether we have the right number of keywords. */
 957                 if ((needkw & 48) && nkeys > 0)
 958                     error(err_kwillegal, &fp);
 959                 if ((needkw & 11) && nkeys == 0)
 960                     error(err_kwexpected, &fp);
 961                 if ((needkw & 5) && nkeys > 1)
 962                     error(err_kwtoomany, &fp);
 963
 964                 if (is_macro) {
 965                     /*
 966                      * Macro definition. Get the rest of the line
 967                      * as a code-paragraph token, repeatedly until
 968                      * there's nothing more left of it. Separate
 969                      * with newlines.
 970                      */
 971                     rdstring macrotext = { 0, 0, NULL };
 972                     while (1) {
 973                         dtor(t), t = get_codepar_token(in);
 974                         if (macrotext.pos > 0)
 975                             rdadd(&macrotext, L'\n');
 976                         rdadds(&macrotext, t.text);
 977                         dtor(t), t = get_token(in);
 978                         if (t.type == tok_eop) break;
 979                     }
 980                     macrodef(macros, rs.text, macrotext.text, fp);
 981                     continue;          /* next paragraph */
 982                 }
 983
 984                 par.keyword = rdtrim(&rs);
 985                 par.origkeyword = rdtrimc(&rsc);
 986
 987                 /* Move to EOP in case of needkw==8 or 16 (no body) */
 988                 if (needkw & 24) {
 989                     /* We allow whitespace even when we expect no para body */
 990                     while (t.type == tok_white)
 991                         dtor(t), t = get_token(in);
 992                     if (t.type != tok_eop && t.type != tok_eof &&
 993                         (start_cmd == c__invalid ||
 994                          t.type != tok_cmd || t.cmd != start_cmd)) {
 995                         error(err_bodyillegal, &t.pos);
 996                         /* Error recovery: eat the rest of the paragraph */
 997                         while (t.type != tok_eop && t.type != tok_eof &&
 998                                (start_cmd == c__invalid ||
 999                                 t.type != tok_cmd || t.cmd != start_cmd))
1000                             dtor(t), t = get_token(in);
1001                     }
1002                     if (t.type == tok_cmd)
1003                         already = TRUE;/* inhibit get_token at top of loop */
1004                     prev_para_type = par.type;
1005                     addpara(par, ret);
1006
1007                     if (par.type == para_Config) {
1008                         input_configure(in, &par);
1009                     }
1010                     continue;          /* next paragraph */
1011                 }
1012             }
1013         }
1014
1015         /*
1016          * Now read the actual paragraph, word by word, adding to
1017          * the paragraph list.
1018          *
1019          * Mid-paragraph commands:
1020          *
1021          *  \K \k
1022          *  \c \cw \cq
1023          *  \e
1024          *  \i \ii
1025          *  \I
1026          *  \q
1027          *  \u
1028          *  \W
1029          *  \date
1030          *  \\ \{ \}
1031          */
1032         parsestk = stk_new();
1033         style = word_Normal;
1034         spcstyle = word_WhiteSpace;
1035         indexing = FALSE;
1036         seenwhite = TRUE;
1037         while (t.type != tok_eop && t.type != tok_eof) {
1038             iswhite = FALSE;
1039             already = FALSE;
1040
1041             /* Handle implicit paragraph breaks after \IM, \BR etc */
1042             if (start_cmd != c__invalid &&
1043                 t.type == tok_cmd && t.cmd == start_cmd) {
1044                 already = TRUE;        /* inhibit get_token at top of loop */
1045                 break;
1046             }
1047
1048             if (t.type == tok_cmd && t.cmd == c__nop) {
1049                 dtor(t), t = get_token(in);
1050                 continue;              /* do nothing! */
1051             }
1052
1053             if (t.type == tok_cmd && t.cmd == c__escaped) {
1054                 t.type = tok_word;     /* nice and simple */
1055                 t.aux = 0;             /* even if `\-' - nonbreaking! */
1056             }
1057             if (t.type == tok_cmd && t.cmd == c__nbsp) {
1058                 t.type = tok_word;     /* nice and simple */
1059                 sfree(t.text);
1060                 t.text = ustrdup(L" ");  /* text is ` ' not `_' */
1061                 t.aux = 0;             /* (nonbreaking) */
1062             }
1063             switch (t.type) {
1064               case tok_white:
1065                 if (whptr == &par.words)
1066                     break;             /* strip whitespace at start of para */
1067                 wd.text = NULL;
1068                 wd.type = spcstyle;
1069                 wd.alt = NULL;
1070                 wd.aux = 0;
1071                 wd.fpos = t.pos;
1072                 wd.breaks = FALSE;
1073
1074                 /*
1075                  * Inhibit use of whitespace if it's (probably the
1076                  * newline) before a repeat \IM / \BR type
1077                  * directive.
1078                  */
1079                 if (start_cmd != c__invalid) {
1080                     dtor(t), t = get_token(in);
1081                     already = TRUE;
1082                     if (t.type == tok_cmd && t.cmd == start_cmd)
1083                         break;
1084                 }
1085
1086                 if (indexing)
1087                     rdadd(&indexstr, ' ');
1088                 if (!indexing || index_visible)
1089                     addword(wd, &whptr);
1090                 if (indexing)
1091                     addword(wd, &idximplicit);
1092                 iswhite = TRUE;
1093                 break;
1094               case tok_word:
1095                 if (indexing)
1096                     rdadds(&indexstr, t.text);
1097                 wd.type = style;
1098                 wd.alt = NULL;
1099                 wd.aux = 0;
1100                 wd.fpos = t.pos;
1101                 wd.breaks = t.aux;
1102                 if (!indexing || index_visible) {
1103                     wd.text = ustrdup(t.text);
1104                     addword(wd, &whptr);
1105                 }
1106                 if (indexing) {
1107                     wd.text = ustrdup(t.text);
1108                     addword(wd, &idximplicit);
1109                 }
1110                 break;
1111               case tok_lbrace:
1112                 error(err_unexbrace, &t.pos);
1113                 /* Error recovery: push nop */
1114                 sitem = snew(struct stack_item);
1115                 sitem->type = stack_nop;
1116                 sitem->fpos = t.pos;
1117                 stk_push(parsestk, sitem);
1118                 break;
1119               case tok_rbrace:
1120                 sitem = stk_pop(parsestk);
1121                 if (!sitem) {
1122                     /*
1123                      * This closing brace could have been an
1124                      * indication that the cross-paragraph stack
1125                      * wants popping. Accordingly, we treat it here
1126                      * as an indication that the paragraph is over.
1127                      */
1128                     already = TRUE;
1129                     goto finished_para;
1130                 } else {
1131                     if (sitem->type & stack_ualt) {
1132                         whptr = sitem->whptr;
1133                         idximplicit = sitem->idximplicit;
1134                     }
1135                     if (sitem->type & stack_style) {
1136                         style = word_Normal;
1137                         spcstyle = word_WhiteSpace;
1138                     }
1139                     if (sitem->type & stack_idx) {
1140                         indexword->text = ustrdup(indexstr.text);
1141                         if (index_downcase) {
1142                             word *w;
1143
1144                             ustrlow(indexword->text);
1145                             ustrlow(indexstr.text);
1146
1147                             for (w = idxwordlist; w; w = w->next)
1148                                 if (w->text)
1149                                     ustrlow(w->text);
1150                         }
1151                         indexing = FALSE;
1152                         rdadd(&indexstr, L'\0');
1153                         index_merge(idx, FALSE, indexstr.text,
1154                                     idxwordlist, &sitem->fpos);
1155                         sfree(indexstr.text);
1156                     }
1157                     if (sitem->type & stack_hyper) {
1158                         wd.text = NULL;
1159                         wd.type = word_HyperEnd;
1160                         wd.alt = NULL;
1161                         wd.aux = 0;
1162                         wd.fpos = t.pos;
1163                         wd.breaks = FALSE;
1164                         if (!indexing || index_visible)
1165                             addword(wd, &whptr);
1166                         if (indexing)
1167                             addword(wd, &idximplicit);
1168                     }
1169                     if (sitem->type & stack_quote) {
1170                         wd.text = NULL;
1171                         wd.type = toquotestyle(style);
1172                         wd.alt = NULL;
1173                         wd.aux = quote_Close;
1174                         wd.fpos = t.pos;
1175                         wd.breaks = FALSE;
1176                         if (!indexing || index_visible)
1177                             addword(wd, &whptr);
1178                         if (indexing) {
1179                             rdadd(&indexstr, L'"');
1180                             addword(wd, &idximplicit);
1181                         }
1182                     }
1183                 }
1184                 sfree(sitem);
1185                 break;
1186               case tok_cmd:
1187                 switch (t.cmd) {
1188                   case c__comment:
1189                     /*
1190                      * In-paragraph comment: \#{ balanced braces }
1191                      *
1192                      * Anything goes here; even tok_eop. We should
1193                      * eat whitespace after the close brace _if_
1194                      * there was whitespace before the \#.
1195                      */
1196                     dtor(t), t = get_token(in);
1197                     if (t.type != tok_lbrace) {
1198                         error(err_explbr, &t.pos);
1199                     } else {
1200                         int braces = 1;
1201                         while (braces > 0) {
1202                             dtor(t), t = get_token(in);
1203                             if (t.type == tok_lbrace)
1204                                 braces++;
1205                             else if (t.type == tok_rbrace)
1206                                 braces--;
1207                             else if (t.type == tok_eof) {
1208                                 error(err_commenteof, &t.pos);
1209                                 break;
1210                             }
1211                         }
1212                     }
1213                     if (seenwhite) {
1214                         already = TRUE;
1215                         dtor(t), t = get_token(in);
1216                         if (t.type == tok_white) {
1217                             iswhite = TRUE;
1218                             already = FALSE;
1219                         }
1220                     }
1221                     break;
1222                   case c_q:
1223                   case c_cq:
1224                     type = t.cmd;
1225                     dtor(t), t = get_token(in);
1226                     if (t.type != tok_lbrace) {
1227                         error(err_explbr, &t.pos);
1228                     } else {
1229                         /*
1230                          * Enforce that \q may not be used anywhere
1231                          * within \c. (It shouldn't be necessary
1232                          * since the whole point of \c should be
1233                          * that the user wants to exercise exact
1234                          * control over the glyphs used, and
1235                          * forbidding it has the useful effect of
1236                          * relieving some backends of having to
1237                          * make difficult decisions.)
1238                          */
1239                         int stype;
1240
1241                         if (style != word_Code && style != word_WeakCode) {
1242                             wd.text = NULL;
1243                             wd.type = toquotestyle(style);
1244                             wd.alt = NULL;
1245                             wd.aux = quote_Open;
1246                             wd.fpos = t.pos;
1247                             wd.breaks = FALSE;
1248                             if (!indexing || index_visible)
1249                                 addword(wd, &whptr);
1250                             if (indexing) {
1251                                 rdadd(&indexstr, L'"');
1252                                 addword(wd, &idximplicit);
1253                             }
1254                             stype = stack_quote;
1255                         } else {
1256                             error(err_codequote, &t.pos);
1257                             stype = stack_nop;
1258                         }
1259                         sitem = snew(struct stack_item);
1260                         sitem->fpos = t.pos;
1261                         sitem->type = stype;
1262                         if (type == c_cq) {
1263                             if (style != word_Normal) {
1264                                 error(err_nestedstyles, &t.pos);
1265                             } else {
1266                                 style = word_WeakCode;
1267                                 spcstyle = tospacestyle(style);
1268                                 sitem->type |= stack_style;
1269                             }
1270                         }
1271                         stk_push(parsestk, sitem);
1272                     }
1273                     break;
1274                   case c_K:
1275                   case c_k:
1276                   case c_W:
1277                   case c_date:
1278                     /*
1279                      * Keyword, hyperlink, or \date. We expect a
1280                      * left brace, some text, and then a right
1281                      * brace. No nesting; no arguments.
1282                      */
1283                     wd.fpos = t.pos;
1284                     wd.breaks = FALSE;
1285                     if (t.cmd == c_K)
1286                         wd.type = word_UpperXref;
1287                     else if (t.cmd == c_k)
1288                         wd.type = word_LowerXref;
1289                     else if (t.cmd == c_W)
1290                         wd.type = word_HyperLink;
1291                     else
1292                         wd.type = word_Normal;
1293                     dtor(t), t = get_token(in);
1294                     if (t.type != tok_lbrace) {
1295                         if (wd.type == word_Normal) {
1296                             time_t thetime = time(NULL);
1297                             struct tm *broken = localtime(&thetime);
1298                             already = TRUE;
1299                             wdtext = ustrftime(NULL, broken);
1300                             wd.type = style;
1301                         } else {
1302                             error(err_explbr, &t.pos);
1303                             wdtext = NULL;
1304                         }
1305                     } else {
1306                         rdstring rs = { 0, 0, NULL };
1307                         while (dtor(t), t = get_token(in),
1308                                t.type == tok_word || t.type == tok_white) {
1309                             if (t.type == tok_white)
1310                                 rdadd(&rs, ' ');
1311                             else
1312                                 rdadds(&rs, t.text);
1313                         }
1314                         if (wd.type == word_Normal) {
1315                             time_t thetime = time(NULL);
1316                             struct tm *broken = localtime(&thetime);
1317                             wdtext = ustrftime(rs.text, broken);
1318                             wd.type = style;
1319                         } else {
1320                             wdtext = ustrdup(rs.text);
1321                         }
1322                         sfree(rs.text);
1323                         if (t.type != tok_rbrace) {
1324                             error(err_kwexprbr, &t.pos);
1325                         }
1326                     }
1327                     wd.alt = NULL;
1328                     wd.aux = 0;
1329                     if (!indexing || index_visible) {
1330                         wd.text = ustrdup(wdtext);
1331                         addword(wd, &whptr);
1332                     }
1333                     if (indexing) {
1334                         wd.text = ustrdup(wdtext);
1335                         addword(wd, &idximplicit);
1336                     }
1337                     sfree(wdtext);
1338                     if (wd.type == word_HyperLink) {
1339                         /*
1340                          * Hyperlinks are different: they then
1341                          * expect another left brace, to begin
1342                          * delimiting the text marked by the link.
1343                          */
1344                         dtor(t), t = get_token(in);
1345                         sitem = snew(struct stack_item);
1346                         sitem->fpos = wd.fpos;
1347                         sitem->type = stack_hyper;
1348                         /*
1349                          * Special cases: \W{}\i, \W{}\ii
1350                          */
1351                         if (t.type == tok_cmd &&
1352                             (t.cmd == c_i || t.cmd == c_ii)) {
1353                             if (indexing) {
1354                                 error(err_nestedindex, &t.pos);
1355                             } else {
1356                                 /* Add an index-reference word with no
1357                                  * text as yet */
1358                                 wd.type = word_IndexRef;
1359                                 wd.text = NULL;
1360                                 wd.alt = NULL;
1361                                 wd.aux = 0;
1362                                 wd.breaks = FALSE;
1363                                 indexword = addword(wd, &whptr);
1364                                 /* Set up a rdstring to read the
1365                                  * index text */
1366                                 indexstr = nullrs;
1367                                 /* Flags so that we do the Right
1368                                  * Things with text */
1369                                 index_visible = (type != c_I);
1370                                 index_downcase = (type == c_ii);
1371                                 indexing = TRUE;
1372                                 idxwordlist = NULL;
1373                                 idximplicit = &idxwordlist;
1374
1375                                 sitem->type |= stack_idx;
1376                             }
1377                             dtor(t), t = get_token(in);
1378                         }
1379                         /*
1380                          * Special cases: \W{}\c, \W{}\e, \W{}\cw
1381                          */
1382                         if (t.type == tok_cmd &&
1383                             (t.cmd == c_e || t.cmd == c_c || t.cmd == c_cw)) {
1384                             if (style != word_Normal)
1385                                 error(err_nestedstyles, &t.pos);
1386                             else {
1387                                 style = (t.cmd == c_c ? word_Code :
1388                                          t.cmd == c_cw ? word_WeakCode :
1389                                          word_Emph);
1390                                 spcstyle = tospacestyle(style);
1391                                 sitem->type |= stack_style;
1392                             }
1393                             dtor(t), t = get_token(in);
1394                         }
1395                         if (t.type != tok_lbrace) {
1396                             error(err_explbr, &t.pos);
1397                             sfree(sitem);
1398                         } else {
1399                             stk_push(parsestk, sitem);
1400                         }
1401                     }
1402                     break;
1403                   case c_c:
1404                   case c_cw:
1405                   case c_e:
1406                     type = t.cmd;
1407                     if (style != word_Normal) {
1408                         error(err_nestedstyles, &t.pos);
1409                         /* Error recovery: eat lbrace, push nop. */
1410                         dtor(t), t = get_token(in);
1411                         sitem = snew(struct stack_item);
1412                         sitem->fpos = t.pos;
1413                         sitem->type = stack_nop;
1414                         stk_push(parsestk, sitem);
1415                     }
1416                     dtor(t), t = get_token(in);
1417                     if (t.type != tok_lbrace) {
1418                         error(err_explbr, &t.pos);
1419                     } else {
1420                         style = (type == c_c ? word_Code :
1421                                  type == c_cw ? word_WeakCode :
1422                                  word_Emph);
1423                         spcstyle = tospacestyle(style);
1424                         sitem = snew(struct stack_item);
1425                         sitem->fpos = t.pos;
1426                         sitem->type = stack_style;
1427                         stk_push(parsestk, sitem);
1428                     }
1429                     break;
1430                   case c_i:
1431                   case c_ii:
1432                   case c_I:
1433                     type = t.cmd;
1434                     if (indexing) {
1435                         error(err_nestedindex, &t.pos);
1436                         /* Error recovery: eat lbrace, push nop. */
1437                         dtor(t), t = get_token(in);
1438                         sitem = snew(struct stack_item);
1439                         sitem->fpos = t.pos;
1440                         sitem->type = stack_nop;
1441                         stk_push(parsestk, sitem);
1442                     }
1443                     sitem = snew(struct stack_item);
1444                     sitem->fpos = t.pos;
1445                     sitem->type = stack_idx;
1446                     dtor(t), t = get_token(in);
1447                     /*
1448                      * Special cases: \i\c, \i\e, \i\cw
1449                      */
1450                     wd.fpos = t.pos;
1451                     if (t.type == tok_cmd &&
1452                         (t.cmd == c_e || t.cmd == c_c || t.cmd == c_cw)) {
1453                         if (style != word_Normal)
1454                             error(err_nestedstyles, &t.pos);
1455                         else {
1456                             style = (t.cmd == c_c ? word_Code :
1457                                      t.cmd == c_cw ? word_WeakCode :
1458                                      word_Emph);
1459                             spcstyle = tospacestyle(style);
1460                             sitem->type |= stack_style;
1461                         }
1462                         dtor(t), t = get_token(in);
1463                     }
1464                     if (t.type != tok_lbrace) {
1465                         sfree(sitem);
1466                         error(err_explbr, &t.pos);
1467                     } else {
1468                         /* Add an index-reference word with no text as yet */
1469                         wd.type = word_IndexRef;
1470                         wd.text = NULL;
1471                         wd.alt = NULL;
1472                         wd.aux = 0;
1473                         wd.breaks = FALSE;
1474                         indexword = addword(wd, &whptr);
1475                         /* Set up a rdstring to read the index text */
1476                         indexstr = nullrs;
1477                         /* Flags so that we do the Right Things with text */
1478                         index_visible = (type != c_I);
1479                         index_downcase = (type == c_ii);
1480                         indexing = TRUE;
1481                         idxwordlist = NULL;
1482                         idximplicit = &idxwordlist;
1483                         /* Stack item to close the indexing on exit */
1484                         stk_push(parsestk, sitem);
1485                     }
1486                     break;
1487                   case c_u:
1488                     uchr = t.aux;
1489                     utext[0] = uchr; utext[1] = 0;
1490                     wd.type = style;
1491                     wd.breaks = FALSE;
1492                     wd.alt = NULL;
1493                     wd.aux = 0;
1494                     wd.fpos = t.pos;
1495                     if (!indexing || index_visible) {
1496                         wd.text = ustrdup(utext);
1497                         uword = addword(wd, &whptr);
1498                     } else
1499                         uword = NULL;
1500                     if (indexing) {
1501                         wd.text = ustrdup(utext);
1502                         iword = addword(wd, &idximplicit);
1503                     } else
1504                         iword = NULL;
1505                     dtor(t), t = get_token(in);
1506                     if (t.type == tok_lbrace) {
1507                         /*
1508                          * \u with a left brace. Until the brace
1509                          * closes, all further words go on a
1510                          * sidetrack from the main thread of the
1511                          * paragraph.
1512                          */
1513                         sitem = snew(struct stack_item);
1514                         sitem->fpos = t.pos;
1515                         sitem->type = stack_ualt;
1516                         sitem->whptr = whptr;
1517                         sitem->idximplicit = idximplicit;
1518                         stk_push(parsestk, sitem);
1519                         whptr = uword ? &uword->alt : NULL;
1520                         idximplicit = iword ? &iword->alt : NULL;
1521                     } else {
1522                         if (indexing)
1523                             rdadd(&indexstr, uchr);
1524                         already = TRUE;
1525                     }
1526                     break;
1527                   default:
1528                     if (!macrolookup(macros, in, t.text, &t.pos))
1529                         error(err_badmidcmd, t.text, &t.pos);
1530                     break;
1531                 }
1532             }
1533             if (!already)
1534                 dtor(t), t = get_token(in);
1535             seenwhite = iswhite;
1536         }
1537         finished_para:
1538         /* Check the stack is empty */
1539         if (stk_top(parsestk)) {
1540             while ((sitem = stk_pop(parsestk)))
1541                 sfree(sitem);
1542             error(err_missingrbrace, &t.pos);
1543         }
1544         stk_free(parsestk);
1545         prev_para_type = par.type;
1546         /*
1547          * Before we add the paragraph to the output list, we
1548          * should check that there was any text in it at all; there
1549          * might not be if (for example) the paragraph contained
1550          * nothing but an unrecognised command sequence, and if we
1551          * put an empty paragraph on the list it may confuse the
1552          * back ends later on.
1553          */
1554         if (par.words) {
1555             addpara(par, ret);
1556         }
1557         if (t.type == tok_eof)
1558             already = TRUE;
1559     }
1560
1561     if (stk_top(crossparastk)) {
1562         void *p;
1563
1564         error(err_missingrbrace2, &t.pos);
1565         while ((p = stk_pop(crossparastk)))
1566             sfree(p);
1567     }
1568
1569     /*
1570      * We break to here rather than returning, because otherwise
1571      * this cleanup doesn't happen.
1572      */
1573     dtor(t);
1574
1575     stk_free(crossparastk);
1576 }
1577
1578 struct {
1579     char const *magic;
1580     size_t nmagic;
1581     void (*reader)(input *);
1582 } magics[] = {
1583     { "%!FontType1-",     12, &read_pfa_file },
1584     { "%!PS-AdobeFont-",  15, &read_pfa_file },
1585     { "\x80\x01",          2, &read_pfb_file },
1586     { "StartFontMetrics", 16, &read_afm_file },
1587     { "\x00\x01\x00\x00",  4, &read_sfnt_file },
1588     { "true",              4, &read_sfnt_file },
1589 };
1590
1591 paragraph *read_input(input *in, indexdata *idx) {
1592     paragraph *head = NULL;
1593     paragraph **hptr = &head;
1594     tree234 *macros;
1595     char mag[16];
1596     size_t len, i;
1597     void (*reader)(input *);
1598
1599     macros = newtree234(macrocmp);
1600
1601     while (in->currindex < in->nfiles) {
1602         in->currfp = fopen(in->filenames[in->currindex], "r");
1603         if (in->currfp) {
1604             setpos(in, in->filenames[in->currindex]);
1605             in->charset = in->defcharset;
1606             in->csstate = charset_init_state;
1607             in->wcpos = in->nwc = 0;
1608             in->pushback_chars = NULL;
1609             reader = NULL;
1610             len = fread(mag, 1, sizeof(mag), in->currfp);
1611             for (i = 0; i < lenof(magics); i++) {
1612                 if (len >= magics[i].nmagic &&
1613                     memcmp(mag, magics[i].magic, magics[i].nmagic) == 0) {
1614                     reader = magics[i].reader;
1615                     break;
1616                 }
1617             }
1618             rewind(in->currfp);
1619             if (reader == NULL)
1620                 read_file(&hptr, in, idx, macros);
1621             else
1622                 (*reader)(in);
1623         }
1624         in->currindex++;
1625     }
1626
1627     macrocleanup(macros);
1628
1629     return head;
1630 }