mdw@git.distorted.org.uk Git - sgt/halibut/blob - input.c

   1 /*
   2  * input.c: read the source form
   3  */
   4
   5 #include <stdio.h>
   6 #include <assert.h>
   7 #include <time.h>
   8 #include "halibut.h"
   9
  10 #define TAB_STOP 8                     /* for column number tracking */
  11
  12 static void setpos(input *in, char *fname) {
  13     in->pos.filename = fname;
  14     in->pos.line = 1;
  15     in->pos.col = (in->reportcols ? 1 : -1);
  16 }
  17
  18 static void unget(input *in, int c, filepos *pos) {
  19     if (in->npushback >= in->pushbacksize) {
  20         in->pushbacksize = in->npushback + 16;
  21         in->pushback = resize(in->pushback, in->pushbacksize);
  22     }
  23     in->pushback[in->npushback].chr = c;
  24     in->pushback[in->npushback].pos = *pos;   /* structure copy */
  25     in->npushback++;
  26 }
  27
  28 /* ---------------------------------------------------------------------- */
  29 /*
  30  * Macro subsystem
  31  */
  32 typedef struct macro_Tag macro;
  33 struct macro_Tag {
  34     wchar_t *name, *text;
  35 };
  36 struct macrostack_Tag {
  37     macrostack *next;
  38     wchar_t *text;
  39     int ptr, npushback;
  40     filepos pos;
  41 };
  42 static int macrocmp(void *av, void *bv) {
  43     macro *a = (macro *)av, *b = (macro *)bv;
  44     return ustrcmp(a->name, b->name);
  45 }
  46 static void macrodef(tree234 *macros, wchar_t *name, wchar_t *text,
  47                      filepos fpos) {
  48     macro *m = mknew(macro);
  49     m->name = name;
  50     m->text = text;
  51     if (add234(macros, m) != m) {
  52         error(err_macroexists, &fpos, name);
  53         sfree(name);
  54         sfree(text);
  55     }
  56 }
  57 static int macrolookup(tree234 *macros, input *in, wchar_t *name,
  58                        filepos *pos) {
  59     macro m, *gotit;
  60     m.name = name;
  61     gotit = find234(macros, &m, NULL);
  62     if (gotit) {
  63         macrostack *expansion = mknew(macrostack);
  64         expansion->next = in->stack;
  65         expansion->text = gotit->text;
  66         expansion->pos = *pos;         /* structure copy */
  67         expansion->ptr = 0;
  68         expansion->npushback = in->npushback;
  69         in->stack = expansion;
  70         return TRUE;
  71     } else
  72         return FALSE;
  73 }
  74 static void macrocleanup(tree234 *macros) {
  75     int ti;
  76     macro *m;
  77     for (ti = 0; (m = (macro *)index234(macros, ti)) != NULL; ti++) {
  78         sfree(m->name);
  79         sfree(m->text);
  80         sfree(m);
  81     }
  82     freetree234(macros);
  83 }
  84
  85 /*
  86  * Can return EOF
  87  */
  88 static int get(input *in, filepos *pos) {
  89     int pushbackpt = in->stack ? in->stack->npushback : 0;
  90     if (in->npushback > pushbackpt) {
  91         --in->npushback;
  92         if (pos)
  93             *pos = in->pushback[in->npushback].pos;   /* structure copy */
  94         return in->pushback[in->npushback].chr;
  95     }
  96     else if (in->stack) {
  97         wchar_t c = in->stack->text[in->stack->ptr];
  98         if (in->stack->text[++in->stack->ptr] == L'\0') {
  99             macrostack *tmp = in->stack;
 100             in->stack = tmp->next;
 101             sfree(tmp);
 102         }
 103         return c;
 104     }
 105     else if (in->currfp) {
 106         int c = getc(in->currfp);
 107
 108         if (c == EOF) {
 109             fclose(in->currfp);
 110             in->currfp = NULL;
 111         }
 112         /* Track line numbers, for error reporting */
 113         if (pos)
 114             *pos = in->pos;
 115         if (in->reportcols) {
 116             switch (c) {
 117               case '\t':
 118                 in->pos.col = 1 + (in->pos.col + TAB_STOP-1) % TAB_STOP;
 119                 break;
 120               case '\n':
 121                 in->pos.col = 1;
 122                 in->pos.line++;
 123                 break;
 124               default:
 125                 in->pos.col++;
 126                 break;
 127             }
 128         } else {
 129             in->pos.col = -1;
 130             if (c == '\n')
 131                 in->pos.line++;
 132         }
 133         /* FIXME: do input charmap translation. We should be returning
 134          * Unicode here. */
 135         return c;
 136     } else
 137         return EOF;
 138 }
 139
 140 /*
 141  * Lexical analysis of source files.
 142  */
 143 typedef struct token_Tag token;
 144 struct token_Tag {
 145     int type;
 146     int cmd, aux;
 147     wchar_t *text;
 148     filepos pos;
 149 };
 150 enum {
 151     tok_eof,                           /* end of file */
 152     tok_eop,                           /* end of paragraph */
 153     tok_white,                         /* whitespace */
 154     tok_word,                          /* a word or word fragment */
 155     tok_cmd,                           /* \command */
 156     tok_lbrace,                        /* { */
 157     tok_rbrace                         /* } */
 158 };
 159
 160 /* Halibut command keywords. */
 161 enum {
 162     c__invalid,                        /* invalid command */
 163     c__comment,                        /* comment command (\#) */
 164     c__escaped,                        /* escaped character */
 165     c__nbsp,                           /* nonbreaking space */
 166     c_A,                               /* appendix heading */
 167     c_B,                               /* bibliography entry */
 168     c_BR,                              /* bibliography rewrite */
 169     c_C,                               /* chapter heading */
 170     c_H,                               /* heading */
 171     c_I,                               /* invisible index mark */
 172     c_IM,                              /* index merge/rewrite */
 173     c_K,                               /* capitalised cross-reference */
 174     c_S,                               /* aux field is 0, 1, 2, ... */
 175     c_U,                               /* unnumbered-chapter heading */
 176     c_W,                               /* Web hyperlink */
 177     c_b,                               /* bulletted list */
 178     c_c,                               /* code */
 179     c_cfg,                             /* configuration directive */
 180     c_copyright,                       /* copyright statement */
 181     c_cw,                              /* weak code */
 182     c_date,                            /* document processing date */
 183     c_define,                          /* macro definition */
 184     c_e,                               /* emphasis */
 185     c_i,                               /* visible index mark */
 186     c_ii,                              /* uncapitalised visible index mark */
 187     c_k,                               /* uncapitalised cross-reference */
 188     c_n,                               /* numbered list */
 189     c_nocite,                          /* bibliography trickery */
 190     c_preamble,                        /* document preamble text */
 191     c_q,                               /* quote marks */
 192     c_rule,                            /* horizontal rule */
 193     c_title,                           /* document title */
 194     c_u,                               /* aux field is char code */
 195     c_versionid                        /* document RCS id */
 196 };
 197
 198 /* Perhaps whitespace should be defined in a more Unicode-friendly way? */
 199 #define iswhite(c) ( (c)==32 || (c)==9 || (c)==13 || (c)==10 )
 200 #define isnl(c) ( (c)==10 )
 201 #define isdec(c) ( ((c)>='0'&&(c)<='9') )
 202 #define fromdec(c) ( (c)-'0' )
 203 #define ishex(c) ( ((c)>='0'&&(c)<='9') || ((c)>='A'&&(c)<='F') || ((c)>='a'&&(c)<='f'))
 204 #define fromhex(c) ( (c)<='9' ? (c)-'0' : ((c)&0xDF) - ('A'-10) )
 205 #define iscmd(c) ( ((c)>='0'&&(c)<='9') || ((c)>='A'&&(c)<='Z') || ((c)>='a'&&(c)<='z'))
 206
 207 /*
 208  * Keyword comparison function. Like strcmp, but between a wchar_t *
 209  * and a char *.
 210  */
 211 static int kwcmp(wchar_t const *p, char const *q) {
 212     int i;
 213     do {
 214         i = *p - *q;
 215     } while (*p++ && *q++ && !i);
 216     return i;
 217 }
 218
 219 /*
 220  * Match a keyword.
 221  */
 222 static void match_kw(token *tok) {
 223     /*
 224      * FIXME. The ids are explicit in here so as to allow long-name
 225      * equivalents to the various very short keywords.
 226      */
 227     static const struct { char const *name; int id; } keywords[] = {
 228         {"#", c__comment},             /* comment command (\#) */
 229         {"-", c__escaped},             /* nonbreaking hyphen */
 230         {"A", c_A},                    /* appendix heading */
 231         {"B", c_B},                    /* bibliography entry */
 232         {"BR", c_BR},                  /* bibliography rewrite */
 233         {"C", c_C},                    /* chapter heading */
 234         {"H", c_H},                    /* heading */
 235         {"I", c_I},                    /* invisible index mark */
 236         {"IM", c_IM},                  /* index merge/rewrite */
 237         {"K", c_K},                    /* capitalised cross-reference */
 238         {"U", c_U},                    /* unnumbered-chapter heading */
 239         {"W", c_W},                    /* Web hyperlink */
 240         {"\\", c__escaped},            /* escaped backslash (\\) */
 241         {"_", c__nbsp},                /* nonbreaking space (\_) */
 242         {"b", c_b},                    /* bulletted list */
 243         {"c", c_c},                    /* code */
 244         {"cfg", c_cfg},                /* configuration directive */
 245         {"copyright", c_copyright},    /* copyright statement */
 246         {"cw", c_cw},                  /* weak code */
 247         {"date", c_date},              /* document processing date */
 248         {"define", c_define},          /* macro definition */
 249         {"e", c_e},                    /* emphasis */
 250         {"i", c_i},                    /* visible index mark */
 251         {"ii", c_ii},                  /* uncapitalised visible index mark */
 252         {"k", c_k},                    /* uncapitalised cross-reference */
 253         {"n", c_n},                    /* numbered list */
 254         {"nocite", c_nocite},          /* bibliography trickery */
 255         {"preamble", c_preamble},      /* document preamble text */
 256         {"q", c_q},                    /* quote marks */
 257         {"rule", c_rule},              /* horizontal rule */
 258         {"title", c_title},            /* document title */
 259         {"versionid", c_versionid},    /* document RCS id */
 260         {"{", c__escaped},             /* escaped lbrace (\{) */
 261         {"}", c__escaped},             /* escaped rbrace (\}) */
 262     };
 263     int i, j, k, c;
 264
 265     /*
 266      * Special cases: \S{0,1,2,...} and \uABCD. If the syntax
 267      * doesn't match correctly, we just fall through to the
 268      * binary-search phase.
 269      */
 270     if (tok->text[0] == 'S') {
 271         /* We expect numeric characters thereafter. */
 272         wchar_t *p = tok->text+1;
 273         int n;
 274         if (!*p)
 275             n = 1;
 276         else {
 277             n = 0;
 278             while (*p && isdec(*p)) {
 279                 n = 10 * n + fromdec(*p);
 280                 p++;
 281             }
 282         }
 283         if (!*p) {
 284             tok->cmd = c_S;
 285             tok->aux = n;
 286             return;
 287         }
 288     } else if (tok->text[0] == 'u') {
 289         /* We expect hex characters thereafter. */
 290         wchar_t *p = tok->text+1;
 291         int n = 0;
 292         while (*p && ishex(*p)) {
 293             n = 16 * n + fromhex(*p);
 294             p++;
 295         }
 296         if (!*p) {
 297             tok->cmd = c_u;
 298             tok->aux = n;
 299             return;
 300         }
 301     }
 302
 303     i = -1;
 304     j = sizeof(keywords)/sizeof(*keywords);
 305     while (j-i > 1) {
 306         k = (i+j)/2;
 307         c = kwcmp(tok->text, keywords[k].name);
 308         if (c < 0)
 309             j = k;
 310         else if (c > 0)
 311             i = k;
 312         else /* c == 0 */ {
 313             tok->cmd = keywords[k].id;
 314             return;
 315         }
 316     }
 317
 318     tok->cmd = c__invalid;
 319 }
 320
 321
 322 /*
 323  * Read a token from the input file, in the normal way (`normal' in
 324  * the sense that code paragraphs work a different way).
 325  */
 326 token get_token(input *in) {
 327     int c;
 328     int nls;
 329     token ret;
 330     rdstring rs = { 0, 0, NULL };
 331     filepos cpos;
 332
 333     ret.text = NULL;                   /* default */
 334     c = get(in, &cpos);
 335     ret.pos = cpos;
 336     if (iswhite(c)) {                  /* tok_white or tok_eop */
 337         nls = 0;
 338         do {
 339             if (isnl(c))
 340                 nls++;
 341         } while ((c = get(in, &cpos)) != EOF && iswhite(c));
 342         if (c == EOF) {
 343             ret.type = tok_eof;
 344             return ret;
 345         }
 346         unget(in, c, &cpos);
 347         ret.type = (nls > 1 ? tok_eop : tok_white);
 348         return ret;
 349     } else if (c == EOF) {             /* tok_eof */
 350         ret.type = tok_eof;
 351         return ret;
 352     } else if (c == '\\') {            /* tok_cmd */
 353         c = get(in, &cpos);
 354         if (c == '-' || c == '\\' || c == '_' ||
 355             c == '#' || c == '{' || c == '}') {
 356             /* single-char command */
 357             rdadd(&rs, c);
 358         } else if (c == 'u') {
 359             int len = 0;
 360             do {
 361                 rdadd(&rs, c);
 362                 len++;
 363                 c = get(in, &cpos);
 364             } while (ishex(c) && len < 5);
 365             unget(in, c, &cpos);
 366         } else if (iscmd(c)) {
 367             do {
 368                 rdadd(&rs, c);
 369                 c = get(in, &cpos);
 370             } while (iscmd(c));
 371             unget(in, c, &cpos);
 372         }
 373         /*
 374          * Now match the command against the list of available
 375          * ones.
 376          */
 377         ret.type = tok_cmd;
 378         ret.text = ustrdup(rs.text);
 379         match_kw(&ret);
 380         sfree(rs.text);
 381         return ret;
 382     } else if (c == '{') {             /* tok_lbrace */
 383         ret.type = tok_lbrace;
 384         return ret;
 385     } else if (c == '}') {             /* tok_rbrace */
 386         ret.type = tok_rbrace;
 387         return ret;
 388     } else {                           /* tok_word */
 389         /*
 390          * Read a word: the longest possible contiguous sequence of
 391          * things other than whitespace, backslash, braces and
 392          * hyphen. A hyphen terminates the word but is returned as
 393          * part of it; everything else is pushed back for the next
 394          * token. The `aux' field contains TRUE if the word ends in
 395          * a hyphen.
 396          */
 397         ret.aux = FALSE;               /* assumed for now */
 398         while (1) {
 399             if (iswhite(c) || c=='{' || c=='}' || c=='\\' || c==EOF) {
 400                 /* Put back the character that caused termination */
 401                 unget(in, c, &cpos);
 402                 break;
 403             } else {
 404                 rdadd(&rs, c);
 405                 if (c == '-') {
 406                     ret.aux = TRUE;
 407                     break;             /* hyphen terminates word */
 408                 }
 409             }
 410             c = get(in, &cpos);
 411         }
 412         ret.type = tok_word;
 413         ret.text = ustrdup(rs.text);
 414         sfree(rs.text);
 415         return ret;
 416     }
 417 }
 418
 419 /*
 420  * Determine whether the next input character is an open brace (for
 421  * telling code paragraphs from paragraphs which merely start with
 422  * code).
 423  */
 424 int isbrace(input *in) {
 425     int c;
 426     filepos cpos;
 427
 428     c = get(in, &cpos);
 429     unget(in, c, &cpos);
 430     return (c == '{');
 431 }
 432
 433 /*
 434  * Read the rest of a line that starts `\c'. Including nothing at
 435  * all (tok_word with empty text).
 436  */
 437 token get_codepar_token(input *in) {
 438     int c;
 439     token ret;
 440     rdstring rs = { 0, 0, NULL };
 441     filepos cpos;
 442
 443     ret.type = tok_word;
 444     c = get(in, &cpos);                /* expect (and discard) one space */
 445     ret.pos = cpos;
 446     if (c == ' ') {
 447         c = get(in, &cpos);
 448         ret.pos = cpos;
 449     }
 450     while (!isnl(c) && c != EOF) {
 451         int c2 = c;
 452         c = get(in, &cpos);
 453         /* Discard \r just before \n. */
 454         if (c2 != 13 || !isnl(c))
 455             rdadd(&rs, c2);
 456     }
 457     unget(in, c, &cpos);
 458     ret.text = ustrdup(rs.text);
 459     sfree(rs.text);
 460     return ret;
 461 }
 462
 463 /*
 464  * Adds a new word to a linked list
 465  */
 466 static word *addword(word newword, word ***hptrptr) {
 467     word *mnewword;
 468     if (!hptrptr)
 469         return NULL;
 470     mnewword = mknew(word);
 471     *mnewword = newword;               /* structure copy */
 472     mnewword->next = NULL;
 473     **hptrptr = mnewword;
 474     *hptrptr = &mnewword->next;
 475     return mnewword;
 476 }
 477
 478 /*
 479  * Adds a new paragraph to a linked list
 480  */
 481 static paragraph *addpara(paragraph newpara, paragraph ***hptrptr) {
 482     paragraph *mnewpara = mknew(paragraph);
 483     *mnewpara = newpara;               /* structure copy */
 484     mnewpara->next = NULL;
 485     **hptrptr = mnewpara;
 486     *hptrptr = &mnewpara->next;
 487     return mnewpara;
 488 }
 489
 490 /*
 491  * Destructor before token is reassigned; should catch most memory
 492  * leaks
 493  */
 494 #define dtor(t) ( sfree(t.text) )
 495
 496 /*
 497  * Reads a single file (ie until get() returns EOF)
 498  */
 499 static void read_file(paragraph ***ret, input *in, indexdata *idx) {
 500     token t;
 501     paragraph par;
 502     word wd, **whptr, **idximplicit;
 503     tree234 *macros;
 504     wchar_t utext[2], *wdtext;
 505     int style, spcstyle;
 506     int already;
 507     int iswhite, seenwhite;
 508     int type;
 509     struct stack_item {
 510         enum {
 511             stack_nop = 0,             /* do nothing (for error recovery) */
 512             stack_ualt = 1,            /* \u alternative */
 513             stack_style = 2,           /* \e, \c, \cw */
 514             stack_idx = 4,             /* \I, \i, \ii */
 515             stack_hyper = 8,           /* \W */
 516             stack_quote = 16,          /* \q */
 517         } type;
 518         word **whptr;                  /* to restore from \u alternatives */
 519         word **idximplicit;            /* to restore from \u alternatives */
 520     } *sitem;
 521     stack parsestk;
 522     word *indexword, *uword, *iword;
 523     word *idxwordlist;
 524     rdstring indexstr;
 525     int index_downcase, index_visible, indexing;
 526     const rdstring nullrs = { 0, 0, NULL };
 527     wchar_t uchr;
 528
 529     t.text = NULL;
 530     macros = newtree234(macrocmp);
 531     already = FALSE;
 532
 533     /*
 534      * Loop on each paragraph.
 535      */
 536     while (1) {
 537         int start_cmd = c__invalid;
 538         par.words = NULL;
 539         par.keyword = NULL;
 540         whptr = &par.words;
 541
 542         /*
 543          * Get a token.
 544          */
 545         if (!already) {
 546             dtor(t), t = get_token(in);
 547         }
 548         already = FALSE;
 549         if (t.type == tok_eof)
 550             break;
 551
 552         /*
 553          * Parse code paragraphs separately.
 554          */
 555         if (t.type == tok_cmd && t.cmd == c_c && !isbrace(in)) {
 556             par.type = para_Code;
 557             par.fpos = t.pos;
 558             while (1) {
 559                 dtor(t), t = get_codepar_token(in);
 560                 wd.type = word_WeakCode;
 561                 wd.breaks = FALSE;     /* shouldn't need this... */
 562                 wd.text = ustrdup(t.text);
 563                 wd.alt = NULL;
 564                 wd.fpos = t.pos;
 565                 addword(wd, &whptr);
 566                 dtor(t), t = get_token(in);
 567                 if (t.type == tok_white) {
 568                     /*
 569                      * The newline after a code-paragraph line
 570                      */
 571                     dtor(t), t = get_token(in);
 572                 }
 573                 if (t.type == tok_eop || t.type == tok_eof)
 574                     break;
 575                 else if (t.type != tok_cmd || t.cmd != c_c) {
 576                     error(err_brokencodepara, &t.pos);
 577                     addpara(par, ret);
 578                     while (t.type != tok_eop)   /* error recovery: */
 579                         dtor(t), t = get_token(in);   /* eat rest of paragraph */
 580                     goto codeparabroken;   /* ick, but such is life */
 581                 }
 582             }
 583             addpara(par, ret);
 584             codeparabroken:
 585             continue;
 586         }
 587
 588         /*
 589          * This token begins a paragraph. See if it's one of the
 590          * special commands that define a paragraph type.
 591          *
 592          * (note that \# is special in a way, and \nocite takes no
 593          * text)
 594          */
 595         par.type = para_Normal;
 596         if (t.type == tok_cmd) {
 597             int needkw;
 598             int is_macro = FALSE;
 599
 600             par.fpos = t.pos;
 601             switch (t.cmd) {
 602               default:
 603                 needkw = -1;
 604                 break;
 605               case c__invalid:
 606                 error(err_badparatype, t.text, &t.pos);
 607                 needkw = 4;
 608                 break;
 609               case c__comment:
 610                 if (isbrace(in))
 611                     break;             /* `\#{': isn't a comment para */
 612                 do {
 613                     dtor(t), t = get_token(in);
 614                 } while (t.type != tok_eop && t.type != tok_eof);
 615                 continue;              /* next paragraph */
 616                 /*
 617                  * `needkw' values:
 618                  *
 619                  *   1 -- exactly one keyword
 620                  *   2 -- at least one keyword
 621                  *   4 -- any number of keywords including zero
 622                  *   8 -- at least one keyword and then nothing else
 623                  *  16 -- nothing at all! no keywords, no body
 624                  *  32 -- no keywords at all
 625                  */
 626               case c_A: needkw = 2; par.type = para_Appendix; break;
 627               case c_B: needkw = 2; par.type = para_Biblio; break;
 628               case c_BR: needkw = 1; par.type = para_BR;
 629                 start_cmd = c_BR; break;
 630               case c_C: needkw = 2; par.type = para_Chapter; break;
 631               case c_H: needkw = 2; par.type = para_Heading;
 632                 par.aux = 0;
 633                 break;
 634               case c_IM: needkw = 2; par.type = para_IM;
 635                 start_cmd = c_IM; break;
 636               case c_S: needkw = 2; par.type = para_Subsect;
 637                 par.aux = t.aux; break;
 638               case c_U: needkw = 32; par.type = para_UnnumberedChapter; break;
 639                 /* For \b and \n the keyword is optional */
 640               case c_b: needkw = 4; par.type = para_Bullet; break;
 641               case c_n: needkw = 4; par.type = para_NumberedList; break;
 642               case c_cfg: needkw = 8; par.type = para_Config;
 643                 start_cmd = c_cfg; break;
 644               case c_copyright: needkw = 32; par.type = para_Copyright; break;
 645               case c_define: is_macro = TRUE; needkw = 1; break;
 646                 /* For \nocite the keyword is _everything_ */
 647               case c_nocite: needkw = 8; par.type = para_NoCite; break;
 648               case c_preamble: needkw = 32; par.type = para_Preamble; break;
 649               case c_rule: needkw = 16; par.type = para_Rule; break;
 650               case c_title: needkw = 32; par.type = para_Title; break;
 651               case c_versionid: needkw = 32; par.type = para_VersionID; break;
 652             }
 653
 654             if (needkw > 0) {
 655                 rdstring rs = { 0, 0, NULL };
 656                 int nkeys = 0;
 657                 filepos fp;
 658
 659                 /* Get keywords. */
 660                 dtor(t), t = get_token(in);
 661                 fp = t.pos;
 662                 while (t.type == tok_lbrace) {
 663                     /* This is a keyword. */
 664                     nkeys++;
 665                     /* FIXME: there will be bugs if anyone specifies an
 666                      * empty keyword (\foo{}), so trap this case. */
 667                     while (dtor(t), t = get_token(in),
 668                            t.type == tok_word ||
 669                            t.type == tok_white ||
 670                            (t.type == tok_cmd && t.cmd == c__nbsp) ||
 671                            (t.type == tok_cmd && t.cmd == c__escaped)) {
 672                         if (t.type == tok_white ||
 673                             (t.type == tok_cmd && t.cmd == c__nbsp))
 674                             rdadd(&rs, ' ');
 675                         else
 676                             rdadds(&rs, t.text);
 677                     }
 678                     if (t.type != tok_rbrace) {
 679                         error(err_kwunclosed, &t.pos);
 680                         continue;
 681                     }
 682                     rdadd(&rs, 0);     /* add string terminator */
 683                     dtor(t), t = get_token(in); /* eat right brace */
 684                 }
 685
 686                 rdadd(&rs, 0);     /* add string terminator */
 687
 688                 /* See whether we have the right number of keywords. */
 689                 if ((needkw & 48) && nkeys > 0)
 690                     error(err_kwillegal, &fp);
 691                 if ((needkw & 11) && nkeys == 0)
 692                     error(err_kwexpected, &fp);
 693                 if ((needkw & 5) && nkeys > 1)
 694                     error(err_kwtoomany, &fp);
 695
 696                 if (is_macro) {
 697                     /*
 698                      * Macro definition. Get the rest of the line
 699                      * as a code-paragraph token, repeatedly until
 700                      * there's nothing more left of it. Separate
 701                      * with newlines.
 702                      */
 703                     rdstring macrotext = { 0, 0, NULL };
 704                     while (1) {
 705                         dtor(t), t = get_codepar_token(in);
 706                         if (macrotext.pos > 0)
 707                             rdadd(&macrotext, L'\n');
 708                         rdadds(&macrotext, t.text);
 709                         dtor(t), t = get_token(in);
 710                         if (t.type == tok_eop) break;
 711                     }
 712                     macrodef(macros, rs.text, macrotext.text, fp);
 713                     continue;          /* next paragraph */
 714                 }
 715
 716                 par.keyword = rdtrim(&rs);
 717
 718                 /* Move to EOP in case of needkw==8 or 16 (no body) */
 719                 if (needkw & 24) {
 720                     /* We allow whitespace even when we expect no para body */
 721                     while (t.type == tok_white)
 722                         dtor(t), t = get_token(in);
 723                     if (t.type != tok_eop && t.type != tok_eof &&
 724                         (start_cmd == c__invalid ||
 725                          t.type != tok_cmd || t.cmd != start_cmd)) {
 726                         error(err_bodyillegal, &t.pos);
 727                         /* Error recovery: eat the rest of the paragraph */
 728                         while (t.type != tok_eop && t.type != tok_eof &&
 729                                (start_cmd == c__invalid ||
 730                                 t.type != tok_cmd || t.cmd != start_cmd))
 731                             dtor(t), t = get_token(in);
 732                     }
 733                     if (t.type == tok_cmd)
 734                         already = TRUE;/* inhibit get_token at top of loop */
 735                     addpara(par, ret);
 736                     continue;          /* next paragraph */
 737                 }
 738             }
 739         }
 740
 741         /*
 742          * Now read the actual paragraph, word by word, adding to
 743          * the paragraph list.
 744          *
 745          * Mid-paragraph commands:
 746          *
 747          *  \K \k
 748          *  \c \cw
 749          *  \e
 750          *  \i \ii
 751          *  \I
 752          *  \u
 753          *  \W
 754          *  \date
 755          *  \\ \{ \}
 756          */
 757         parsestk = stk_new();
 758         style = word_Normal;
 759         spcstyle = word_WhiteSpace;
 760         indexing = FALSE;
 761         seenwhite = TRUE;
 762         while (t.type != tok_eop && t.type != tok_eof) {
 763             iswhite = FALSE;
 764             already = FALSE;
 765
 766             /* Handle implicit paragraph breaks after \IM, \BR etc */
 767             if (start_cmd != c__invalid &&
 768                 t.type == tok_cmd && t.cmd == start_cmd) {
 769                 already = TRUE;        /* inhibit get_token at top of loop */
 770                 break;
 771             }
 772
 773             if (t.type == tok_cmd && t.cmd == c__escaped) {
 774                 t.type = tok_word;     /* nice and simple */
 775                 t.aux = 0;             /* even if `\-' - nonbreaking! */
 776             }
 777             if (t.type == tok_cmd && t.cmd == c__nbsp) {
 778                 t.type = tok_word;     /* nice and simple */
 779                 sfree(t.text);
 780                 t.text = ustrdup(L" ");  /* text is ` ' not `_' */
 781                 t.aux = 0;             /* (nonbreaking) */
 782             }
 783             switch (t.type) {
 784               case tok_white:
 785                 if (whptr == &par.words)
 786                     break;             /* strip whitespace at start of para */
 787                 wd.text = NULL;
 788                 wd.type = spcstyle;
 789                 wd.alt = NULL;
 790                 wd.aux = 0;
 791                 wd.fpos = t.pos;
 792                 wd.breaks = FALSE;
 793
 794                 /*
 795                  * Inhibit use of whitespace if it's (probably the
 796                  * newline) before a repeat \IM / \BR type
 797                  * directive.
 798                  */
 799                 if (start_cmd != c__invalid) {
 800                     dtor(t), t = get_token(in);
 801                     already = TRUE;
 802                     if (t.type == tok_cmd && t.cmd == start_cmd)
 803                         break;
 804                 }
 805
 806                 if (indexing)
 807                     rdadd(&indexstr, ' ');
 808                 if (!indexing || index_visible)
 809                     addword(wd, &whptr);
 810                 if (indexing)
 811                     addword(wd, &idximplicit);
 812                 iswhite = TRUE;
 813                 break;
 814               case tok_word:
 815                 if (indexing)
 816                     rdadds(&indexstr, t.text);
 817                 wd.type = style;
 818                 wd.alt = NULL;
 819                 wd.aux = 0;
 820                 wd.fpos = t.pos;
 821                 wd.breaks = t.aux;
 822                 if (!indexing || index_visible) {
 823                     wd.text = ustrdup(t.text);
 824                     addword(wd, &whptr);
 825                 }
 826                 if (indexing) {
 827                     wd.text = ustrdup(t.text);
 828                     addword(wd, &idximplicit);
 829                 }
 830                 break;
 831               case tok_lbrace:
 832                 error(err_unexbrace, &t.pos);
 833                 /* Error recovery: push nop */
 834                 sitem = mknew(struct stack_item);
 835                 sitem->type = stack_nop;
 836                 stk_push(parsestk, sitem);
 837                 break;
 838               case tok_rbrace:
 839                 sitem = stk_pop(parsestk);
 840                 if (!sitem)
 841                     error(err_unexbrace, &t.pos);
 842                 else {
 843                     if (sitem->type & stack_ualt) {
 844                         whptr = sitem->whptr;
 845                         idximplicit = sitem->idximplicit;
 846                     }
 847                     if (sitem->type & stack_style) {
 848                         style = word_Normal;
 849                         spcstyle = word_WhiteSpace;
 850                     }
 851                     if (sitem->type & stack_idx) {
 852                         indexword->text = ustrdup(indexstr.text);
 853                         if (index_downcase)
 854                             ustrlow(indexword->text);
 855                         indexing = FALSE;
 856                         rdadd(&indexstr, L'\0');
 857                         index_merge(idx, FALSE, indexstr.text, idxwordlist);
 858                         sfree(indexstr.text);
 859                     }
 860                     if (sitem->type & stack_hyper) {
 861                         wd.text = NULL;
 862                         wd.type = word_HyperEnd;
 863                         wd.alt = NULL;
 864                         wd.aux = 0;
 865                         wd.fpos = t.pos;
 866                         wd.breaks = FALSE;
 867                         if (!indexing || index_visible)
 868                             addword(wd, &whptr);
 869                         if (indexing)
 870                             addword(wd, &idximplicit);
 871                     }
 872                     if (sitem->type & stack_quote) {
 873                         wd.text = NULL;
 874                         wd.type = toquotestyle(style);
 875                         wd.alt = NULL;
 876                         wd.aux = quote_Close;
 877                         wd.fpos = t.pos;
 878                         wd.breaks = FALSE;
 879                         if (!indexing || index_visible)
 880                             addword(wd, &whptr);
 881                         if (indexing) {
 882                             rdadd(&indexstr, L'"');
 883                             addword(wd, &idximplicit);
 884                         }
 885                     }
 886                 }
 887                 sfree(sitem);
 888                 break;
 889               case tok_cmd:
 890                 switch (t.cmd) {
 891                   case c__comment:
 892                     /*
 893                      * In-paragraph comment: \#{ balanced braces }
 894                      *
 895                      * Anything goes here; even tok_eop. We should
 896                      * eat whitespace after the close brace _if_
 897                      * there was whitespace before the \#.
 898                      */
 899                     dtor(t), t = get_token(in);
 900                     if (t.type != tok_lbrace) {
 901                         error(err_explbr, &t.pos);
 902                     } else {
 903                         int braces = 1;
 904                         while (braces > 0) {
 905                             dtor(t), t = get_token(in);
 906                             if (t.type == tok_lbrace)
 907                                 braces++;
 908                             else if (t.type == tok_rbrace)
 909                                 braces--;
 910                             else if (t.type == tok_eof) {
 911                                 error(err_commenteof, &t.pos);
 912                                 break;
 913                             }
 914                         }
 915                     }
 916                     if (seenwhite) {
 917                         already = TRUE;
 918                         dtor(t), t = get_token(in);
 919                         if (t.type == tok_white) {
 920                             iswhite = TRUE;
 921                             already = FALSE;
 922                         }
 923                     }
 924                     break;
 925                   case c_q:
 926                     dtor(t), t = get_token(in);
 927                     if (t.type != tok_lbrace) {
 928                         error(err_explbr, &t.pos);
 929                     } else {
 930                         wd.text = NULL;
 931                         wd.type = toquotestyle(style);
 932                         wd.alt = NULL;
 933                         wd.aux = quote_Open;
 934                         wd.fpos = t.pos;
 935                         wd.breaks = FALSE;
 936                         if (!indexing || index_visible)
 937                             addword(wd, &whptr);
 938                         if (indexing) {
 939                             rdadd(&indexstr, L'"');
 940                             addword(wd, &idximplicit);
 941                         }
 942                         sitem = mknew(struct stack_item);
 943                         sitem->type = stack_quote;
 944                         stk_push(parsestk, sitem);
 945                     }
 946                     break;
 947                   case c_K:
 948                   case c_k:
 949                   case c_W:
 950                   case c_date:
 951                     /*
 952                      * Keyword, hyperlink, or \date. We expect a
 953                      * left brace, some text, and then a right
 954                      * brace. No nesting; no arguments.
 955                      */
 956                     wd.fpos = t.pos;
 957                     wd.breaks = FALSE;
 958                     if (t.cmd == c_K)
 959                         wd.type = word_UpperXref;
 960                     else if (t.cmd == c_k)
 961                         wd.type = word_LowerXref;
 962                     else if (t.cmd == c_W)
 963                         wd.type = word_HyperLink;
 964                     else
 965                         wd.type = word_Normal;
 966                     dtor(t), t = get_token(in);
 967                     if (t.type != tok_lbrace) {
 968                         if (wd.type == word_Normal) {
 969                             time_t thetime = time(NULL);
 970                             struct tm *broken = localtime(&thetime);
 971                             already = TRUE;
 972                             wdtext = ustrftime(NULL, broken);
 973                             wd.type = style;
 974                         } else {
 975                             error(err_explbr, &t.pos);
 976                             wdtext = NULL;
 977                         }
 978                     } else {
 979                         rdstring rs = { 0, 0, NULL };
 980                         while (dtor(t), t = get_token(in),
 981                                t.type == tok_word || t.type == tok_white) {
 982                             if (t.type == tok_white)
 983                                 rdadd(&rs, ' ');
 984                             else
 985                                 rdadds(&rs, t.text);
 986                         }
 987                         if (wd.type == word_Normal) {
 988                             time_t thetime = time(NULL);
 989                             struct tm *broken = localtime(&thetime);
 990                             wdtext = ustrftime(rs.text, broken);
 991                             wd.type = style;
 992                         } else {
 993                             wdtext = ustrdup(rs.text);
 994                         }
 995                         sfree(rs.text);
 996                         if (t.type != tok_rbrace) {
 997                             error(err_kwexprbr, &t.pos);
 998                         }
 999                     }
1000                     wd.alt = NULL;
1001                     wd.aux = 0;
1002                     if (!indexing || index_visible) {
1003                         wd.text = ustrdup(wdtext);
1004                         addword(wd, &whptr);
1005                     }
1006                     if (indexing) {
1007                         wd.text = ustrdup(wdtext);
1008                         addword(wd, &idximplicit);
1009                     }
1010                     sfree(wdtext);
1011                     if (wd.type == word_HyperLink) {
1012                         /*
1013                          * Hyperlinks are different: they then
1014                          * expect another left brace, to begin
1015                          * delimiting the text marked by the link.
1016                          */
1017                         dtor(t), t = get_token(in);
1018                         /*
1019                          * Special cases: \W{}\c, \W{}\e, \W{}\cw
1020                          */
1021                         sitem = mknew(struct stack_item);
1022                         sitem->type = stack_hyper;
1023                         if (t.type == tok_cmd &&
1024                             (t.cmd == c_e || t.cmd == c_c || t.cmd == c_cw)) {
1025                             if (style != word_Normal)
1026                                 error(err_nestedstyles, &t.pos);
1027                             else {
1028                                 style = (t.cmd == c_c ? word_Code :
1029                                          t.cmd == c_cw ? word_WeakCode :
1030                                          word_Emph);
1031                                 spcstyle = tospacestyle(style);
1032                                 sitem->type |= stack_style;
1033                             }
1034                             dtor(t), t = get_token(in);
1035                         }
1036                         if (t.type != tok_lbrace) {
1037                             error(err_explbr, &t.pos);
1038                             sfree(sitem);
1039                         } else {
1040                             stk_push(parsestk, sitem);
1041                         }
1042                     }
1043                     break;
1044                   case c_c:
1045                   case c_cw:
1046                   case c_e:
1047                     type = t.cmd;
1048                     if (style != word_Normal) {
1049                         error(err_nestedstyles, &t.pos);
1050                         /* Error recovery: eat lbrace, push nop. */
1051                         dtor(t), t = get_token(in);
1052                         sitem = mknew(struct stack_item);
1053                         sitem->type = stack_nop;
1054                         stk_push(parsestk, sitem);
1055                     }
1056                     dtor(t), t = get_token(in);
1057                     if (t.type != tok_lbrace) {
1058                         error(err_explbr, &t.pos);
1059                     } else {
1060                         style = (type == c_c ? word_Code :
1061                                  type == c_cw ? word_WeakCode :
1062                                  word_Emph);
1063                         spcstyle = tospacestyle(style);
1064                         sitem = mknew(struct stack_item);
1065                         sitem->type = stack_style;
1066                         stk_push(parsestk, sitem);
1067                     }
1068                     break;
1069                   case c_i:
1070                   case c_ii:
1071                   case c_I:
1072                     type = t.cmd;
1073                     if (indexing) {
1074                         error(err_nestedindex, &t.pos);
1075                         /* Error recovery: eat lbrace, push nop. */
1076                         dtor(t), t = get_token(in);
1077                         sitem = mknew(struct stack_item);
1078                         sitem->type = stack_nop;
1079                         stk_push(parsestk, sitem);
1080                     }
1081                     sitem = mknew(struct stack_item);
1082                     sitem->type = stack_idx;
1083                     dtor(t), t = get_token(in);
1084                     /*
1085                      * Special cases: \i\c, \i\e, \i\cw
1086                      */
1087                     wd.fpos = t.pos;
1088                     if (t.type == tok_cmd &&
1089                         (t.cmd == c_e || t.cmd == c_c || t.cmd == c_cw)) {
1090                         if (style != word_Normal)
1091                             error(err_nestedstyles, &t.pos);
1092                         else {
1093                             style = (t.cmd == c_c ? word_Code :
1094                                      t.cmd == c_cw ? word_WeakCode :
1095                                      word_Emph);
1096                             spcstyle = tospacestyle(style);
1097                             sitem->type |= stack_style;
1098                         }
1099                         dtor(t), t = get_token(in);
1100                     }
1101                     if (t.type != tok_lbrace) {
1102                         sfree(sitem);
1103                         error(err_explbr, &t.pos);
1104                     } else {
1105                         /* Add an index-reference word with no text as yet */
1106                         wd.type = word_IndexRef;
1107                         wd.text = NULL;
1108                         wd.alt = NULL;
1109                         wd.aux = 0;
1110                         wd.breaks = FALSE;
1111                         indexword = addword(wd, &whptr);
1112                         /* Set up a rdstring to read the index text */
1113                         indexstr = nullrs;
1114                         /* Flags so that we do the Right Things with text */
1115                         index_visible = (type != c_I);
1116                         index_downcase = (type == c_ii);
1117                         indexing = TRUE;
1118                         idxwordlist = NULL;
1119                         idximplicit = &idxwordlist;
1120                         /* Stack item to close the indexing on exit */
1121                         stk_push(parsestk, sitem);
1122                     }
1123                     break;
1124                   case c_u:
1125                     uchr = t.aux;
1126                     utext[0] = uchr; utext[1] = 0;
1127                     wd.type = style;
1128                     wd.breaks = FALSE;
1129                     wd.alt = NULL;
1130                     wd.aux = 0;
1131                     wd.fpos = t.pos;
1132                     if (!indexing || index_visible) {
1133                         wd.text = ustrdup(utext);
1134                         uword = addword(wd, &whptr);
1135                     } else
1136                         uword = NULL;
1137                     if (indexing) {
1138                         wd.text = ustrdup(utext);
1139                         iword = addword(wd, &idximplicit);
1140                     } else
1141                         iword = NULL;
1142                     dtor(t), t = get_token(in);
1143                     if (t.type == tok_lbrace) {
1144                         /*
1145                          * \u with a left brace. Until the brace
1146                          * closes, all further words go on a
1147                          * sidetrack from the main thread of the
1148                          * paragraph.
1149                          */
1150                         sitem = mknew(struct stack_item);
1151                         sitem->type = stack_ualt;
1152                         sitem->whptr = whptr;
1153                         sitem->idximplicit = idximplicit;
1154                         stk_push(parsestk, sitem);
1155                         whptr = uword ? &uword->alt : NULL;
1156                         idximplicit = iword ? &iword->alt : NULL;
1157                     } else {
1158                         if (indexing)
1159                             rdadd(&indexstr, uchr);
1160                         already = TRUE;
1161                     }
1162                     break;
1163                   default:
1164                     if (!macrolookup(macros, in, t.text, &t.pos))
1165                         error(err_badmidcmd, t.text, &t.pos);
1166                     break;
1167                 }
1168             }
1169             if (!already)
1170                 dtor(t), t = get_token(in);
1171             seenwhite = iswhite;
1172         }
1173         /* Check the stack is empty */
1174         if (NULL != (sitem = stk_pop(parsestk))) {
1175             do {
1176                 sfree(sitem);
1177                 sitem = stk_pop(parsestk);
1178             } while (sitem);
1179             error(err_missingrbrace, &t.pos);
1180         }
1181         stk_free(parsestk);
1182         addpara(par, ret);
1183     }
1184
1185     /*
1186      * We break to here rather than returning, because otherwise
1187      * this cleanup doesn't happen.
1188      */
1189     dtor(t);
1190     macrocleanup(macros);
1191 }
1192
1193 paragraph *read_input(input *in, indexdata *idx) {
1194     paragraph *head = NULL;
1195     paragraph **hptr = &head;
1196
1197     while (in->currindex < in->nfiles) {
1198         in->currfp = fopen(in->filenames[in->currindex], "r");
1199         if (in->currfp) {
1200             setpos(in, in->filenames[in->currindex]);
1201             read_file(&hptr, in, idx);
1202         }
1203         in->currindex++;
1204     }
1205
1206     return head;
1207 }