mdw@git.distorted.org.uk Git - sgt/halibut/blob - input.c

   1 /*
   2  * input.c: read the source form
   3  */
   4
   5 #include <stdio.h>
   6 #include <assert.h>
   7 #include <time.h>
   8 #include "halibut.h"
   9
  10 #define TAB_STOP 8                     /* for column number tracking */
  11
  12 static void setpos(input *in, char *fname) {
  13     in->pos.filename = fname;
  14     in->pos.line = 1;
  15     in->pos.col = (in->reportcols ? 1 : -1);
  16 }
  17
  18 static void unget(input *in, int c, filepos *pos) {
  19     if (in->npushback >= in->pushbacksize) {
  20         in->pushbacksize = in->npushback + 16;
  21         in->pushback = resize(in->pushback, in->pushbacksize);
  22     }
  23     in->pushback[in->npushback].chr = c;
  24     in->pushback[in->npushback].pos = *pos;   /* structure copy */
  25     in->npushback++;
  26 }
  27
  28 /* ---------------------------------------------------------------------- */
  29 /*
  30  * Macro subsystem
  31  */
  32 typedef struct macro_Tag macro;
  33 struct macro_Tag {
  34     wchar_t *name, *text;
  35 };
  36 struct macrostack_Tag {
  37     macrostack *next;
  38     wchar_t *text;
  39     int ptr, npushback;
  40     filepos pos;
  41 };
  42 static int macrocmp(void *av, void *bv) {
  43     macro *a = (macro *)av, *b = (macro *)bv;
  44     return ustrcmp(a->name, b->name);
  45 }
  46 static void macrodef(tree234 *macros, wchar_t *name, wchar_t *text,
  47                      filepos fpos) {
  48     macro *m = mknew(macro);
  49     m->name = name;
  50     m->text = text;
  51     if (add234(macros, m) != m) {
  52         error(err_macroexists, &fpos, name);
  53         sfree(name);
  54         sfree(text);
  55     }
  56 }
  57 static int macrolookup(tree234 *macros, input *in, wchar_t *name,
  58                        filepos *pos) {
  59     macro m, *gotit;
  60     m.name = name;
  61     gotit = find234(macros, &m, NULL);
  62     if (gotit) {
  63         macrostack *expansion = mknew(macrostack);
  64         expansion->next = in->stack;
  65         expansion->text = gotit->text;
  66         expansion->pos = *pos;         /* structure copy */
  67         expansion->ptr = 0;
  68         expansion->npushback = in->npushback;
  69         in->stack = expansion;
  70         return TRUE;
  71     } else
  72         return FALSE;
  73 }
  74 static void macrocleanup(tree234 *macros) {
  75     int ti;
  76     macro *m;
  77     for (ti = 0; (m = (macro *)index234(macros, ti)) != NULL; ti++) {
  78         sfree(m->name);
  79         sfree(m->text);
  80         sfree(m);
  81     }
  82     freetree234(macros);
  83 }
  84
  85 /*
  86  * Can return EOF
  87  */
  88 static int get(input *in, filepos *pos) {
  89     int pushbackpt = in->stack ? in->stack->npushback : 0;
  90     if (in->npushback > pushbackpt) {
  91         --in->npushback;
  92         if (pos)
  93             *pos = in->pushback[in->npushback].pos;   /* structure copy */
  94         return in->pushback[in->npushback].chr;
  95     }
  96     else if (in->stack) {
  97         wchar_t c = in->stack->text[in->stack->ptr];
  98         if (in->stack->text[++in->stack->ptr] == L'\0') {
  99             macrostack *tmp = in->stack;
 100             in->stack = tmp->next;
 101             sfree(tmp);
 102         }
 103         return c;
 104     }
 105     else if (in->currfp) {
 106         int c = getc(in->currfp);
 107
 108         if (c == EOF) {
 109             fclose(in->currfp);
 110             in->currfp = NULL;
 111         }
 112         /* Track line numbers, for error reporting */
 113         if (pos)
 114             *pos = in->pos;
 115         if (in->reportcols) {
 116             switch (c) {
 117               case '\t':
 118                 in->pos.col = 1 + (in->pos.col + TAB_STOP-1) % TAB_STOP;
 119                 break;
 120               case '\n':
 121                 in->pos.col = 1;
 122                 in->pos.line++;
 123                 break;
 124               default:
 125                 in->pos.col++;
 126                 break;
 127             }
 128         } else {
 129             in->pos.col = -1;
 130             if (c == '\n')
 131                 in->pos.line++;
 132         }
 133         /* FIXME: do input charmap translation. We should be returning
 134          * Unicode here. */
 135         return c;
 136     } else
 137         return EOF;
 138 }
 139
 140 /*
 141  * Lexical analysis of source files.
 142  */
 143 typedef struct token_Tag token;
 144 struct token_Tag {
 145     int type;
 146     int cmd, aux;
 147     wchar_t *text;
 148     filepos pos;
 149 };
 150 enum {
 151     tok_eof,                           /* end of file */
 152     tok_eop,                           /* end of paragraph */
 153     tok_white,                         /* whitespace */
 154     tok_word,                          /* a word or word fragment */
 155     tok_cmd,                           /* \command */
 156     tok_lbrace,                        /* { */
 157     tok_rbrace                         /* } */
 158 };
 159
 160 /* Halibut command keywords. */
 161 enum {
 162     c__invalid,                        /* invalid command */
 163     c__comment,                        /* comment command (\#) */
 164     c__escaped,                        /* escaped character */
 165     c__nbsp,                           /* nonbreaking space */
 166     c_A,                               /* appendix heading */
 167     c_B,                               /* bibliography entry */
 168     c_BR,                              /* bibliography rewrite */
 169     c_C,                               /* chapter heading */
 170     c_H,                               /* heading */
 171     c_I,                               /* invisible index mark */
 172     c_IM,                              /* index merge/rewrite */
 173     c_K,                               /* capitalised cross-reference */
 174     c_S,                               /* aux field is 0, 1, 2, ... */
 175     c_U,                               /* unnumbered-chapter heading */
 176     c_W,                               /* Web hyperlink */
 177     c_b,                               /* bulletted list */
 178     c_c,                               /* code */
 179     c_cfg,                             /* configuration directive */
 180     c_copyright,                       /* copyright statement */
 181     c_cw,                              /* weak code */
 182     c_date,                            /* document processing date */
 183     c_dd,                              /* description list: description */
 184     c_define,                          /* macro definition */
 185     c_dt,                              /* description list: described thing */
 186     c_e,                               /* emphasis */
 187     c_i,                               /* visible index mark */
 188     c_ii,                              /* uncapitalised visible index mark */
 189     c_k,                               /* uncapitalised cross-reference */
 190     c_lcont,                           /* continuation para(s) for list item */
 191     c_n,                               /* numbered list */
 192     c_nocite,                          /* bibliography trickery */
 193     c_preamble,                        /* document preamble text */
 194     c_q,                               /* quote marks */
 195     c_rule,                            /* horizontal rule */
 196     c_title,                           /* document title */
 197     c_u,                               /* aux field is char code */
 198     c_versionid                        /* document RCS id */
 199 };
 200
 201 /* Perhaps whitespace should be defined in a more Unicode-friendly way? */
 202 #define iswhite(c) ( (c)==32 || (c)==9 || (c)==13 || (c)==10 )
 203 #define isnl(c) ( (c)==10 )
 204 #define isdec(c) ( ((c)>='0'&&(c)<='9') )
 205 #define fromdec(c) ( (c)-'0' )
 206 #define ishex(c) ( ((c)>='0'&&(c)<='9') || ((c)>='A'&&(c)<='F') || ((c)>='a'&&(c)<='f'))
 207 #define fromhex(c) ( (c)<='9' ? (c)-'0' : ((c)&0xDF) - ('A'-10) )
 208 #define iscmd(c) ( ((c)>='0'&&(c)<='9') || ((c)>='A'&&(c)<='Z') || ((c)>='a'&&(c)<='z'))
 209
 210 /*
 211  * Keyword comparison function. Like strcmp, but between a wchar_t *
 212  * and a char *.
 213  */
 214 static int kwcmp(wchar_t const *p, char const *q) {
 215     int i;
 216     do {
 217         i = *p - *q;
 218     } while (*p++ && *q++ && !i);
 219     return i;
 220 }
 221
 222 /*
 223  * Match a keyword.
 224  */
 225 static void match_kw(token *tok) {
 226     /*
 227      * FIXME. The ids are explicit in here so as to allow long-name
 228      * equivalents to the various very short keywords.
 229      */
 230     static const struct { char const *name; int id; } keywords[] = {
 231         {"#", c__comment},             /* comment command (\#) */
 232         {"-", c__escaped},             /* nonbreaking hyphen */
 233         {"A", c_A},                    /* appendix heading */
 234         {"B", c_B},                    /* bibliography entry */
 235         {"BR", c_BR},                  /* bibliography rewrite */
 236         {"C", c_C},                    /* chapter heading */
 237         {"H", c_H},                    /* heading */
 238         {"I", c_I},                    /* invisible index mark */
 239         {"IM", c_IM},                  /* index merge/rewrite */
 240         {"K", c_K},                    /* capitalised cross-reference */
 241         {"U", c_U},                    /* unnumbered-chapter heading */
 242         {"W", c_W},                    /* Web hyperlink */
 243         {"\\", c__escaped},            /* escaped backslash (\\) */
 244         {"_", c__nbsp},                /* nonbreaking space (\_) */
 245         {"b", c_b},                    /* bulletted list */
 246         {"c", c_c},                    /* code */
 247         {"cfg", c_cfg},                /* configuration directive */
 248         {"copyright", c_copyright},    /* copyright statement */
 249         {"cw", c_cw},                  /* weak code */
 250         {"date", c_date},              /* document processing date */
 251         {"dd", c_dd},                  /* description list: description */
 252         {"define", c_define},          /* macro definition */
 253         {"dt", c_dt},                  /* description list: described thing */
 254         {"e", c_e},                    /* emphasis */
 255         {"i", c_i},                    /* visible index mark */
 256         {"ii", c_ii},                  /* uncapitalised visible index mark */
 257         {"k", c_k},                    /* uncapitalised cross-reference */
 258         {"lcont", c_lcont},            /* continuation para(s) for list item */
 259         {"n", c_n},                    /* numbered list */
 260         {"nocite", c_nocite},          /* bibliography trickery */
 261         {"preamble", c_preamble},      /* document preamble text */
 262         {"q", c_q},                    /* quote marks */
 263         {"rule", c_rule},              /* horizontal rule */
 264         {"title", c_title},            /* document title */
 265         {"versionid", c_versionid},    /* document RCS id */
 266         {"{", c__escaped},             /* escaped lbrace (\{) */
 267         {"}", c__escaped},             /* escaped rbrace (\}) */
 268     };
 269     int i, j, k, c;
 270
 271     /*
 272      * Special cases: \S{0,1,2,...} and \uABCD. If the syntax
 273      * doesn't match correctly, we just fall through to the
 274      * binary-search phase.
 275      */
 276     if (tok->text[0] == 'S') {
 277         /* We expect numeric characters thereafter. */
 278         wchar_t *p = tok->text+1;
 279         int n;
 280         if (!*p)
 281             n = 1;
 282         else {
 283             n = 0;
 284             while (*p && isdec(*p)) {
 285                 n = 10 * n + fromdec(*p);
 286                 p++;
 287             }
 288         }
 289         if (!*p) {
 290             tok->cmd = c_S;
 291             tok->aux = n;
 292             return;
 293         }
 294     } else if (tok->text[0] == 'u') {
 295         /* We expect hex characters thereafter. */
 296         wchar_t *p = tok->text+1;
 297         int n = 0;
 298         while (*p && ishex(*p)) {
 299             n = 16 * n + fromhex(*p);
 300             p++;
 301         }
 302         if (!*p) {
 303             tok->cmd = c_u;
 304             tok->aux = n;
 305             return;
 306         }
 307     }
 308
 309     i = -1;
 310     j = sizeof(keywords)/sizeof(*keywords);
 311     while (j-i > 1) {
 312         k = (i+j)/2;
 313         c = kwcmp(tok->text, keywords[k].name);
 314         if (c < 0)
 315             j = k;
 316         else if (c > 0)
 317             i = k;
 318         else /* c == 0 */ {
 319             tok->cmd = keywords[k].id;
 320             return;
 321         }
 322     }
 323
 324     tok->cmd = c__invalid;
 325 }
 326
 327
 328 /*
 329  * Read a token from the input file, in the normal way (`normal' in
 330  * the sense that code paragraphs work a different way).
 331  */
 332 token get_token(input *in) {
 333     int c;
 334     int nls;
 335     token ret;
 336     rdstring rs = { 0, 0, NULL };
 337     filepos cpos;
 338
 339     ret.text = NULL;                   /* default */
 340     c = get(in, &cpos);
 341     ret.pos = cpos;
 342     if (iswhite(c)) {                  /* tok_white or tok_eop */
 343         nls = 0;
 344         do {
 345             if (isnl(c))
 346                 nls++;
 347         } while ((c = get(in, &cpos)) != EOF && iswhite(c));
 348         if (c == EOF) {
 349             ret.type = tok_eof;
 350             return ret;
 351         }
 352         unget(in, c, &cpos);
 353         ret.type = (nls > 1 ? tok_eop : tok_white);
 354         return ret;
 355     } else if (c == EOF) {             /* tok_eof */
 356         ret.type = tok_eof;
 357         return ret;
 358     } else if (c == '\\') {            /* tok_cmd */
 359         c = get(in, &cpos);
 360         if (c == '-' || c == '\\' || c == '_' ||
 361             c == '#' || c == '{' || c == '}') {
 362             /* single-char command */
 363             rdadd(&rs, c);
 364         } else if (c == 'u') {
 365             int len = 0;
 366             do {
 367                 rdadd(&rs, c);
 368                 len++;
 369                 c = get(in, &cpos);
 370             } while (ishex(c) && len < 5);
 371             unget(in, c, &cpos);
 372         } else if (iscmd(c)) {
 373             do {
 374                 rdadd(&rs, c);
 375                 c = get(in, &cpos);
 376             } while (iscmd(c));
 377             unget(in, c, &cpos);
 378         }
 379         /*
 380          * Now match the command against the list of available
 381          * ones.
 382          */
 383         ret.type = tok_cmd;
 384         ret.text = ustrdup(rs.text);
 385         match_kw(&ret);
 386         sfree(rs.text);
 387         return ret;
 388     } else if (c == '{') {             /* tok_lbrace */
 389         ret.type = tok_lbrace;
 390         return ret;
 391     } else if (c == '}') {             /* tok_rbrace */
 392         ret.type = tok_rbrace;
 393         return ret;
 394     } else {                           /* tok_word */
 395         /*
 396          * Read a word: the longest possible contiguous sequence of
 397          * things other than whitespace, backslash, braces and
 398          * hyphen. A hyphen terminates the word but is returned as
 399          * part of it; everything else is pushed back for the next
 400          * token. The `aux' field contains TRUE if the word ends in
 401          * a hyphen.
 402          */
 403         ret.aux = FALSE;               /* assumed for now */
 404         while (1) {
 405             if (iswhite(c) || c=='{' || c=='}' || c=='\\' || c==EOF) {
 406                 /* Put back the character that caused termination */
 407                 unget(in, c, &cpos);
 408                 break;
 409             } else {
 410                 rdadd(&rs, c);
 411                 if (c == '-') {
 412                     ret.aux = TRUE;
 413                     break;             /* hyphen terminates word */
 414                 }
 415             }
 416             c = get(in, &cpos);
 417         }
 418         ret.type = tok_word;
 419         ret.text = ustrdup(rs.text);
 420         sfree(rs.text);
 421         return ret;
 422     }
 423 }
 424
 425 /*
 426  * Determine whether the next input character is an open brace (for
 427  * telling code paragraphs from paragraphs which merely start with
 428  * code).
 429  */
 430 int isbrace(input *in) {
 431     int c;
 432     filepos cpos;
 433
 434     c = get(in, &cpos);
 435     unget(in, c, &cpos);
 436     return (c == '{');
 437 }
 438
 439 /*
 440  * Read the rest of a line that starts `\c'. Including nothing at
 441  * all (tok_word with empty text).
 442  */
 443 token get_codepar_token(input *in) {
 444     int c;
 445     token ret;
 446     rdstring rs = { 0, 0, NULL };
 447     filepos cpos;
 448
 449     ret.type = tok_word;
 450     c = get(in, &cpos);                /* expect (and discard) one space */
 451     ret.pos = cpos;
 452     if (c == ' ') {
 453         c = get(in, &cpos);
 454         ret.pos = cpos;
 455     }
 456     while (!isnl(c) && c != EOF) {
 457         int c2 = c;
 458         c = get(in, &cpos);
 459         /* Discard \r just before \n. */
 460         if (c2 != 13 || !isnl(c))
 461             rdadd(&rs, c2);
 462     }
 463     unget(in, c, &cpos);
 464     ret.text = ustrdup(rs.text);
 465     sfree(rs.text);
 466     return ret;
 467 }
 468
 469 /*
 470  * Adds a new word to a linked list
 471  */
 472 static word *addword(word newword, word ***hptrptr) {
 473     word *mnewword;
 474     if (!hptrptr)
 475         return NULL;
 476     mnewword = mknew(word);
 477     *mnewword = newword;               /* structure copy */
 478     mnewword->next = NULL;
 479     **hptrptr = mnewword;
 480     *hptrptr = &mnewword->next;
 481     return mnewword;
 482 }
 483
 484 /*
 485  * Adds a new paragraph to a linked list
 486  */
 487 static paragraph *addpara(paragraph newpara, paragraph ***hptrptr) {
 488     paragraph *mnewpara = mknew(paragraph);
 489     *mnewpara = newpara;               /* structure copy */
 490     mnewpara->next = NULL;
 491     **hptrptr = mnewpara;
 492     *hptrptr = &mnewpara->next;
 493     return mnewpara;
 494 }
 495
 496 /*
 497  * Destructor before token is reassigned; should catch most memory
 498  * leaks
 499  */
 500 #define dtor(t) ( sfree(t.text) )
 501
 502 /*
 503  * Reads a single file (ie until get() returns EOF)
 504  */
 505 static void read_file(paragraph ***ret, input *in, indexdata *idx) {
 506     token t;
 507     paragraph par;
 508     word wd, **whptr, **idximplicit;
 509     tree234 *macros;
 510     wchar_t utext[2], *wdtext;
 511     int style, spcstyle;
 512     int already;
 513     int iswhite, seenwhite;
 514     int type;
 515     int prev_para_type;
 516     struct stack_item {
 517         enum {
 518             stack_nop = 0,             /* do nothing (for error recovery) */
 519             stack_ualt = 1,            /* \u alternative */
 520             stack_style = 2,           /* \e, \c, \cw */
 521             stack_idx = 4,             /* \I, \i, \ii */
 522             stack_hyper = 8,           /* \W */
 523             stack_quote = 16,          /* \q */
 524         } type;
 525         word **whptr;                  /* to restore from \u alternatives */
 526         word **idximplicit;            /* to restore from \u alternatives */
 527     } *sitem;
 528     stack parsestk;
 529     struct crossparaitem {
 530         int type;                      /* currently c_lcont or -1 */
 531         int seen_lcont;
 532     };
 533     stack crossparastk;
 534     word *indexword, *uword, *iword;
 535     word *idxwordlist;
 536     rdstring indexstr;
 537     int index_downcase, index_visible, indexing;
 538     const rdstring nullrs = { 0, 0, NULL };
 539     wchar_t uchr;
 540
 541     t.text = NULL;
 542     macros = newtree234(macrocmp);
 543     already = FALSE;
 544
 545     crossparastk = stk_new();
 546
 547     /*
 548      * Loop on each paragraph.
 549      */
 550     while (1) {
 551         int start_cmd = c__invalid;
 552         par.words = NULL;
 553         par.keyword = NULL;
 554         whptr = &par.words;
 555
 556         /*
 557          * Get a token.
 558          */
 559         do {
 560             if (!already) {
 561                 dtor(t), t = get_token(in);
 562             }
 563             already = FALSE;
 564         } while (t.type == tok_eop);
 565         if (t.type == tok_eof)
 566             break;
 567
 568         /*
 569          * Parse code paragraphs separately.
 570          */
 571         if (t.type == tok_cmd && t.cmd == c_c && !isbrace(in)) {
 572             int wtype = word_WeakCode;
 573
 574             par.type = para_Code;
 575             par.fpos = t.pos;
 576             while (1) {
 577                 dtor(t), t = get_codepar_token(in);
 578                 wd.type = wtype;
 579                 wd.breaks = FALSE;     /* shouldn't need this... */
 580                 wd.text = ustrdup(t.text);
 581                 wd.alt = NULL;
 582                 wd.fpos = t.pos;
 583                 addword(wd, &whptr);
 584                 dtor(t), t = get_token(in);
 585                 if (t.type == tok_white) {
 586                     /*
 587                      * The newline after a code-paragraph line
 588                      */
 589                     dtor(t), t = get_token(in);
 590                 }
 591                 if (t.type == tok_eop || t.type == tok_eof)
 592                     break;
 593                 else if (t.type == tok_cmd && t.cmd == c_c)
 594                     wtype = word_WeakCode;
 595                 else if (t.type == tok_cmd && t.cmd == c_e &&
 596                          wtype == word_WeakCode)
 597                     wtype = word_Emph;
 598                 else {
 599                     error(err_brokencodepara, &t.pos);
 600                     prev_para_type = par.type;
 601                     addpara(par, ret);
 602                     while (t.type != tok_eop)   /* error recovery: */
 603                         dtor(t), t = get_token(in);   /* eat rest of paragraph */
 604                     goto codeparabroken;   /* ick, but such is life */
 605                 }
 606             }
 607             prev_para_type = par.type;
 608             addpara(par, ret);
 609             codeparabroken:
 610             continue;
 611         }
 612
 613         /*
 614          * Spot the special commands that define a grouping of more
 615          * than one paragraph, and also the closing braces that
 616          * finish them.
 617          */
 618         if (t.type == tok_cmd &&
 619             t.cmd == c_lcont) {
 620             struct crossparaitem *sitem, *stop;
 621
 622             /*
 623              * Expect, and swallow, an open brace.
 624              */
 625             dtor(t), t = get_token(in);
 626             if (t.type != tok_lbrace) {
 627                 error(err_explbr, &t.pos);
 628                 continue;
 629             }
 630
 631             /*
 632              * \lcont causes a continuation of a list item into
 633              * multiple paragraphs (which may in turn contain
 634              * nested lists, code paras etc). Hence, the previous
 635              * paragraph must be of a list type.
 636              */
 637             sitem = mknew(struct crossparaitem);
 638             stop = (struct crossparaitem *)stk_top(crossparastk);
 639             if (prev_para_type == para_Bullet ||
 640                 prev_para_type == para_NumberedList ||
 641                 prev_para_type == para_Description) {
 642                 sitem->type = c_lcont;
 643                 sitem->seen_lcont = 1;
 644                 par.type = para_LcontPush;
 645                 prev_para_type = par.type;
 646                 addpara(par, ret);
 647             } else {
 648                 /*
 649                  * Push a null item on the cross-para stack so that
 650                  * when we see the corresponding closing brace we
 651                  * don't give a cascade error.
 652                  */
 653                 sitem->type = -1;
 654                 sitem->seen_lcont = (stop ? stop->seen_lcont : 0);
 655                 error(err_misplacedlcont, &t.pos);
 656             }
 657             stk_push(crossparastk, sitem);
 658             continue;
 659         } else if (t.type == tok_rbrace) {
 660             struct crossparaitem *sitem = stk_pop(crossparastk);
 661             if (!sitem)
 662                 error(err_unexbrace, &t.pos);
 663             else {
 664                 switch (sitem->type) {
 665                   case c_lcont:
 666                     par.type = para_LcontPop;
 667                     prev_para_type = par.type;
 668                     addpara(par, ret);
 669                     break;
 670                 }
 671                 sfree(sitem);
 672             }
 673             continue;
 674         }
 675
 676         /*
 677          * This token begins a paragraph. See if it's one of the
 678          * special commands that define a paragraph type.
 679          *
 680          * (note that \# is special in a way, and \nocite takes no
 681          * text)
 682          */
 683         par.type = para_Normal;
 684         if (t.type == tok_cmd) {
 685             int needkw;
 686             int is_macro = FALSE;
 687
 688             par.fpos = t.pos;
 689             switch (t.cmd) {
 690               default:
 691                 needkw = -1;
 692                 break;
 693               case c__invalid:
 694                 error(err_badparatype, t.text, &t.pos);
 695                 needkw = 4;
 696                 break;
 697               case c__comment:
 698                 if (isbrace(in))
 699                     break;             /* `\#{': isn't a comment para */
 700                 do {
 701                     dtor(t), t = get_token(in);
 702                 } while (t.type != tok_eop && t.type != tok_eof);
 703                 continue;              /* next paragraph */
 704                 /*
 705                  * `needkw' values:
 706                  *
 707                  *   1 -- exactly one keyword
 708                  *   2 -- at least one keyword
 709                  *   4 -- any number of keywords including zero
 710                  *   8 -- at least one keyword and then nothing else
 711                  *  16 -- nothing at all! no keywords, no body
 712                  *  32 -- no keywords at all
 713                  */
 714               case c_A: needkw = 2; par.type = para_Appendix; break;
 715               case c_B: needkw = 2; par.type = para_Biblio; break;
 716               case c_BR: needkw = 1; par.type = para_BR;
 717                 start_cmd = c_BR; break;
 718               case c_C: needkw = 2; par.type = para_Chapter; break;
 719               case c_H: needkw = 2; par.type = para_Heading;
 720                 par.aux = 0;
 721                 break;
 722               case c_IM: needkw = 2; par.type = para_IM;
 723                 start_cmd = c_IM; break;
 724               case c_S: needkw = 2; par.type = para_Subsect;
 725                 par.aux = t.aux; break;
 726               case c_U: needkw = 32; par.type = para_UnnumberedChapter; break;
 727                 /* For \b and \n the keyword is optional */
 728               case c_b: needkw = 4; par.type = para_Bullet; break;
 729               case c_dt: needkw = 4; par.type = para_DescribedThing; break;
 730               case c_dd: needkw = 4; par.type = para_Description; break;
 731               case c_n: needkw = 4; par.type = para_NumberedList; break;
 732               case c_cfg: needkw = 8; par.type = para_Config;
 733                 start_cmd = c_cfg; break;
 734               case c_copyright: needkw = 32; par.type = para_Copyright; break;
 735               case c_define: is_macro = TRUE; needkw = 1; break;
 736                 /* For \nocite the keyword is _everything_ */
 737               case c_nocite: needkw = 8; par.type = para_NoCite; break;
 738               case c_preamble: needkw = 32; par.type = para_Preamble; break;
 739               case c_rule: needkw = 16; par.type = para_Rule; break;
 740               case c_title: needkw = 32; par.type = para_Title; break;
 741               case c_versionid: needkw = 32; par.type = para_VersionID; break;
 742             }
 743
 744             if (par.type == para_Chapter ||
 745                 par.type == para_Heading ||
 746                 par.type == para_Subsect ||
 747                 par.type == para_Appendix ||
 748                 par.type == para_UnnumberedChapter) {
 749                 struct crossparaitem *sitem = stk_top(crossparastk);
 750                 if (sitem && sitem->seen_lcont) {
 751                     error(err_sectmarkerinlcont, &t.pos);
 752                 }
 753             }
 754
 755             if (needkw > 0) {
 756                 rdstring rs = { 0, 0, NULL };
 757                 int nkeys = 0;
 758                 filepos fp;
 759
 760                 /* Get keywords. */
 761                 dtor(t), t = get_token(in);
 762                 fp = t.pos;
 763                 while (t.type == tok_lbrace) {
 764                     /* This is a keyword. */
 765                     nkeys++;
 766                     /* FIXME: there will be bugs if anyone specifies an
 767                      * empty keyword (\foo{}), so trap this case. */
 768                     while (dtor(t), t = get_token(in),
 769                            t.type == tok_word ||
 770                            t.type == tok_white ||
 771                            (t.type == tok_cmd && t.cmd == c__nbsp) ||
 772                            (t.type == tok_cmd && t.cmd == c__escaped)) {
 773                         if (t.type == tok_white ||
 774                             (t.type == tok_cmd && t.cmd == c__nbsp))
 775                             rdadd(&rs, ' ');
 776                         else
 777                             rdadds(&rs, t.text);
 778                     }
 779                     if (t.type != tok_rbrace) {
 780                         error(err_kwunclosed, &t.pos);
 781                         continue;
 782                     }
 783                     rdadd(&rs, 0);     /* add string terminator */
 784                     dtor(t), t = get_token(in); /* eat right brace */
 785                 }
 786
 787                 rdadd(&rs, 0);     /* add string terminator */
 788
 789                 /* See whether we have the right number of keywords. */
 790                 if ((needkw & 48) && nkeys > 0)
 791                     error(err_kwillegal, &fp);
 792                 if ((needkw & 11) && nkeys == 0)
 793                     error(err_kwexpected, &fp);
 794                 if ((needkw & 5) && nkeys > 1)
 795                     error(err_kwtoomany, &fp);
 796
 797                 if (is_macro) {
 798                     /*
 799                      * Macro definition. Get the rest of the line
 800                      * as a code-paragraph token, repeatedly until
 801                      * there's nothing more left of it. Separate
 802                      * with newlines.
 803                      */
 804                     rdstring macrotext = { 0, 0, NULL };
 805                     while (1) {
 806                         dtor(t), t = get_codepar_token(in);
 807                         if (macrotext.pos > 0)
 808                             rdadd(&macrotext, L'\n');
 809                         rdadds(&macrotext, t.text);
 810                         dtor(t), t = get_token(in);
 811                         if (t.type == tok_eop) break;
 812                     }
 813                     macrodef(macros, rs.text, macrotext.text, fp);
 814                     continue;          /* next paragraph */
 815                 }
 816
 817                 par.keyword = rdtrim(&rs);
 818
 819                 /* Move to EOP in case of needkw==8 or 16 (no body) */
 820                 if (needkw & 24) {
 821                     /* We allow whitespace even when we expect no para body */
 822                     while (t.type == tok_white)
 823                         dtor(t), t = get_token(in);
 824                     if (t.type != tok_eop && t.type != tok_eof &&
 825                         (start_cmd == c__invalid ||
 826                          t.type != tok_cmd || t.cmd != start_cmd)) {
 827                         error(err_bodyillegal, &t.pos);
 828                         /* Error recovery: eat the rest of the paragraph */
 829                         while (t.type != tok_eop && t.type != tok_eof &&
 830                                (start_cmd == c__invalid ||
 831                                 t.type != tok_cmd || t.cmd != start_cmd))
 832                             dtor(t), t = get_token(in);
 833                     }
 834                     if (t.type == tok_cmd)
 835                         already = TRUE;/* inhibit get_token at top of loop */
 836                     prev_para_type = par.type;
 837                     addpara(par, ret);
 838                     continue;          /* next paragraph */
 839                 }
 840             }
 841         }
 842
 843         /*
 844          * Now read the actual paragraph, word by word, adding to
 845          * the paragraph list.
 846          *
 847          * Mid-paragraph commands:
 848          *
 849          *  \K \k
 850          *  \c \cw
 851          *  \e
 852          *  \i \ii
 853          *  \I
 854          *  \u
 855          *  \W
 856          *  \date
 857          *  \\ \{ \}
 858          */
 859         parsestk = stk_new();
 860         style = word_Normal;
 861         spcstyle = word_WhiteSpace;
 862         indexing = FALSE;
 863         seenwhite = TRUE;
 864         while (t.type != tok_eop && t.type != tok_eof) {
 865             iswhite = FALSE;
 866             already = FALSE;
 867
 868             /* Handle implicit paragraph breaks after \IM, \BR etc */
 869             if (start_cmd != c__invalid &&
 870                 t.type == tok_cmd && t.cmd == start_cmd) {
 871                 already = TRUE;        /* inhibit get_token at top of loop */
 872                 break;
 873             }
 874
 875             if (t.type == tok_cmd && t.cmd == c__escaped) {
 876                 t.type = tok_word;     /* nice and simple */
 877                 t.aux = 0;             /* even if `\-' - nonbreaking! */
 878             }
 879             if (t.type == tok_cmd && t.cmd == c__nbsp) {
 880                 t.type = tok_word;     /* nice and simple */
 881                 sfree(t.text);
 882                 t.text = ustrdup(L" ");  /* text is ` ' not `_' */
 883                 t.aux = 0;             /* (nonbreaking) */
 884             }
 885             switch (t.type) {
 886               case tok_white:
 887                 if (whptr == &par.words)
 888                     break;             /* strip whitespace at start of para */
 889                 wd.text = NULL;
 890                 wd.type = spcstyle;
 891                 wd.alt = NULL;
 892                 wd.aux = 0;
 893                 wd.fpos = t.pos;
 894                 wd.breaks = FALSE;
 895
 896                 /*
 897                  * Inhibit use of whitespace if it's (probably the
 898                  * newline) before a repeat \IM / \BR type
 899                  * directive.
 900                  */
 901                 if (start_cmd != c__invalid) {
 902                     dtor(t), t = get_token(in);
 903                     already = TRUE;
 904                     if (t.type == tok_cmd && t.cmd == start_cmd)
 905                         break;
 906                 }
 907
 908                 if (indexing)
 909                     rdadd(&indexstr, ' ');
 910                 if (!indexing || index_visible)
 911                     addword(wd, &whptr);
 912                 if (indexing)
 913                     addword(wd, &idximplicit);
 914                 iswhite = TRUE;
 915                 break;
 916               case tok_word:
 917                 if (indexing)
 918                     rdadds(&indexstr, t.text);
 919                 wd.type = style;
 920                 wd.alt = NULL;
 921                 wd.aux = 0;
 922                 wd.fpos = t.pos;
 923                 wd.breaks = t.aux;
 924                 if (!indexing || index_visible) {
 925                     wd.text = ustrdup(t.text);
 926                     addword(wd, &whptr);
 927                 }
 928                 if (indexing) {
 929                     wd.text = ustrdup(t.text);
 930                     addword(wd, &idximplicit);
 931                 }
 932                 break;
 933               case tok_lbrace:
 934                 error(err_unexbrace, &t.pos);
 935                 /* Error recovery: push nop */
 936                 sitem = mknew(struct stack_item);
 937                 sitem->type = stack_nop;
 938                 stk_push(parsestk, sitem);
 939                 break;
 940               case tok_rbrace:
 941                 sitem = stk_pop(parsestk);
 942                 if (!sitem) {
 943                     /*
 944                      * This closing brace could have been an
 945                      * indication that the cross-paragraph stack
 946                      * wants popping. Accordingly, we treat it here
 947                      * as an indication that the paragraph is over.
 948                      */
 949                     already = TRUE;
 950                     goto finished_para;
 951                 } else {
 952                     if (sitem->type & stack_ualt) {
 953                         whptr = sitem->whptr;
 954                         idximplicit = sitem->idximplicit;
 955                     }
 956                     if (sitem->type & stack_style) {
 957                         style = word_Normal;
 958                         spcstyle = word_WhiteSpace;
 959                     }
 960                     if (sitem->type & stack_idx) {
 961                         indexword->text = ustrdup(indexstr.text);
 962                         if (index_downcase)
 963                             ustrlow(indexword->text);
 964                         indexing = FALSE;
 965                         rdadd(&indexstr, L'\0');
 966                         index_merge(idx, FALSE, indexstr.text, idxwordlist);
 967                         sfree(indexstr.text);
 968                     }
 969                     if (sitem->type & stack_hyper) {
 970                         wd.text = NULL;
 971                         wd.type = word_HyperEnd;
 972                         wd.alt = NULL;
 973                         wd.aux = 0;
 974                         wd.fpos = t.pos;
 975                         wd.breaks = FALSE;
 976                         if (!indexing || index_visible)
 977                             addword(wd, &whptr);
 978                         if (indexing)
 979                             addword(wd, &idximplicit);
 980                     }
 981                     if (sitem->type & stack_quote) {
 982                         wd.text = NULL;
 983                         wd.type = toquotestyle(style);
 984                         wd.alt = NULL;
 985                         wd.aux = quote_Close;
 986                         wd.fpos = t.pos;
 987                         wd.breaks = FALSE;
 988                         if (!indexing || index_visible)
 989                             addword(wd, &whptr);
 990                         if (indexing) {
 991                             rdadd(&indexstr, L'"');
 992                             addword(wd, &idximplicit);
 993                         }
 994                     }
 995                 }
 996                 sfree(sitem);
 997                 break;
 998               case tok_cmd:
 999                 switch (t.cmd) {
1000                   case c__comment:
1001                     /*
1002                      * In-paragraph comment: \#{ balanced braces }
1003                      *
1004                      * Anything goes here; even tok_eop. We should
1005                      * eat whitespace after the close brace _if_
1006                      * there was whitespace before the \#.
1007                      */
1008                     dtor(t), t = get_token(in);
1009                     if (t.type != tok_lbrace) {
1010                         error(err_explbr, &t.pos);
1011                     } else {
1012                         int braces = 1;
1013                         while (braces > 0) {
1014                             dtor(t), t = get_token(in);
1015                             if (t.type == tok_lbrace)
1016                                 braces++;
1017                             else if (t.type == tok_rbrace)
1018                                 braces--;
1019                             else if (t.type == tok_eof) {
1020                                 error(err_commenteof, &t.pos);
1021                                 break;
1022                             }
1023                         }
1024                     }
1025                     if (seenwhite) {
1026                         already = TRUE;
1027                         dtor(t), t = get_token(in);
1028                         if (t.type == tok_white) {
1029                             iswhite = TRUE;
1030                             already = FALSE;
1031                         }
1032                     }
1033                     break;
1034                   case c_q:
1035                     dtor(t), t = get_token(in);
1036                     if (t.type != tok_lbrace) {
1037                         error(err_explbr, &t.pos);
1038                     } else {
1039                         wd.text = NULL;
1040                         wd.type = toquotestyle(style);
1041                         wd.alt = NULL;
1042                         wd.aux = quote_Open;
1043                         wd.fpos = t.pos;
1044                         wd.breaks = FALSE;
1045                         if (!indexing || index_visible)
1046                             addword(wd, &whptr);
1047                         if (indexing) {
1048                             rdadd(&indexstr, L'"');
1049                             addword(wd, &idximplicit);
1050                         }
1051                         sitem = mknew(struct stack_item);
1052                         sitem->type = stack_quote;
1053                         stk_push(parsestk, sitem);
1054                     }
1055                     break;
1056                   case c_K:
1057                   case c_k:
1058                   case c_W:
1059                   case c_date:
1060                     /*
1061                      * Keyword, hyperlink, or \date. We expect a
1062                      * left brace, some text, and then a right
1063                      * brace. No nesting; no arguments.
1064                      */
1065                     wd.fpos = t.pos;
1066                     wd.breaks = FALSE;
1067                     if (t.cmd == c_K)
1068                         wd.type = word_UpperXref;
1069                     else if (t.cmd == c_k)
1070                         wd.type = word_LowerXref;
1071                     else if (t.cmd == c_W)
1072                         wd.type = word_HyperLink;
1073                     else
1074                         wd.type = word_Normal;
1075                     dtor(t), t = get_token(in);
1076                     if (t.type != tok_lbrace) {
1077                         if (wd.type == word_Normal) {
1078                             time_t thetime = time(NULL);
1079                             struct tm *broken = localtime(&thetime);
1080                             already = TRUE;
1081                             wdtext = ustrftime(NULL, broken);
1082                             wd.type = style;
1083                         } else {
1084                             error(err_explbr, &t.pos);
1085                             wdtext = NULL;
1086                         }
1087                     } else {
1088                         rdstring rs = { 0, 0, NULL };
1089                         while (dtor(t), t = get_token(in),
1090                                t.type == tok_word || t.type == tok_white) {
1091                             if (t.type == tok_white)
1092                                 rdadd(&rs, ' ');
1093                             else
1094                                 rdadds(&rs, t.text);
1095                         }
1096                         if (wd.type == word_Normal) {
1097                             time_t thetime = time(NULL);
1098                             struct tm *broken = localtime(&thetime);
1099                             wdtext = ustrftime(rs.text, broken);
1100                             wd.type = style;
1101                         } else {
1102                             wdtext = ustrdup(rs.text);
1103                         }
1104                         sfree(rs.text);
1105                         if (t.type != tok_rbrace) {
1106                             error(err_kwexprbr, &t.pos);
1107                         }
1108                     }
1109                     wd.alt = NULL;
1110                     wd.aux = 0;
1111                     if (!indexing || index_visible) {
1112                         wd.text = ustrdup(wdtext);
1113                         addword(wd, &whptr);
1114                     }
1115                     if (indexing) {
1116                         wd.text = ustrdup(wdtext);
1117                         addword(wd, &idximplicit);
1118                     }
1119                     sfree(wdtext);
1120                     if (wd.type == word_HyperLink) {
1121                         /*
1122                          * Hyperlinks are different: they then
1123                          * expect another left brace, to begin
1124                          * delimiting the text marked by the link.
1125                          */
1126                         dtor(t), t = get_token(in);
1127                         /*
1128                          * Special cases: \W{}\c, \W{}\e, \W{}\cw
1129                          */
1130                         sitem = mknew(struct stack_item);
1131                         sitem->type = stack_hyper;
1132                         if (t.type == tok_cmd &&
1133                             (t.cmd == c_e || t.cmd == c_c || t.cmd == c_cw)) {
1134                             if (style != word_Normal)
1135                                 error(err_nestedstyles, &t.pos);
1136                             else {
1137                                 style = (t.cmd == c_c ? word_Code :
1138                                          t.cmd == c_cw ? word_WeakCode :
1139                                          word_Emph);
1140                                 spcstyle = tospacestyle(style);
1141                                 sitem->type |= stack_style;
1142                             }
1143                             dtor(t), t = get_token(in);
1144                         }
1145                         if (t.type != tok_lbrace) {
1146                             error(err_explbr, &t.pos);
1147                             sfree(sitem);
1148                         } else {
1149                             stk_push(parsestk, sitem);
1150                         }
1151                     }
1152                     break;
1153                   case c_c:
1154                   case c_cw:
1155                   case c_e:
1156                     type = t.cmd;
1157                     if (style != word_Normal) {
1158                         error(err_nestedstyles, &t.pos);
1159                         /* Error recovery: eat lbrace, push nop. */
1160                         dtor(t), t = get_token(in);
1161                         sitem = mknew(struct stack_item);
1162                         sitem->type = stack_nop;
1163                         stk_push(parsestk, sitem);
1164                     }
1165                     dtor(t), t = get_token(in);
1166                     if (t.type != tok_lbrace) {
1167                         error(err_explbr, &t.pos);
1168                     } else {
1169                         style = (type == c_c ? word_Code :
1170                                  type == c_cw ? word_WeakCode :
1171                                  word_Emph);
1172                         spcstyle = tospacestyle(style);
1173                         sitem = mknew(struct stack_item);
1174                         sitem->type = stack_style;
1175                         stk_push(parsestk, sitem);
1176                     }
1177                     break;
1178                   case c_i:
1179                   case c_ii:
1180                   case c_I:
1181                     type = t.cmd;
1182                     if (indexing) {
1183                         error(err_nestedindex, &t.pos);
1184                         /* Error recovery: eat lbrace, push nop. */
1185                         dtor(t), t = get_token(in);
1186                         sitem = mknew(struct stack_item);
1187                         sitem->type = stack_nop;
1188                         stk_push(parsestk, sitem);
1189                     }
1190                     sitem = mknew(struct stack_item);
1191                     sitem->type = stack_idx;
1192                     dtor(t), t = get_token(in);
1193                     /*
1194                      * Special cases: \i\c, \i\e, \i\cw
1195                      */
1196                     wd.fpos = t.pos;
1197                     if (t.type == tok_cmd &&
1198                         (t.cmd == c_e || t.cmd == c_c || t.cmd == c_cw)) {
1199                         if (style != word_Normal)
1200                             error(err_nestedstyles, &t.pos);
1201                         else {
1202                             style = (t.cmd == c_c ? word_Code :
1203                                      t.cmd == c_cw ? word_WeakCode :
1204                                      word_Emph);
1205                             spcstyle = tospacestyle(style);
1206                             sitem->type |= stack_style;
1207                         }
1208                         dtor(t), t = get_token(in);
1209                     }
1210                     if (t.type != tok_lbrace) {
1211                         sfree(sitem);
1212                         error(err_explbr, &t.pos);
1213                     } else {
1214                         /* Add an index-reference word with no text as yet */
1215                         wd.type = word_IndexRef;
1216                         wd.text = NULL;
1217                         wd.alt = NULL;
1218                         wd.aux = 0;
1219                         wd.breaks = FALSE;
1220                         indexword = addword(wd, &whptr);
1221                         /* Set up a rdstring to read the index text */
1222                         indexstr = nullrs;
1223                         /* Flags so that we do the Right Things with text */
1224                         index_visible = (type != c_I);
1225                         index_downcase = (type == c_ii);
1226                         indexing = TRUE;
1227                         idxwordlist = NULL;
1228                         idximplicit = &idxwordlist;
1229                         /* Stack item to close the indexing on exit */
1230                         stk_push(parsestk, sitem);
1231                     }
1232                     break;
1233                   case c_u:
1234                     uchr = t.aux;
1235                     utext[0] = uchr; utext[1] = 0;
1236                     wd.type = style;
1237                     wd.breaks = FALSE;
1238                     wd.alt = NULL;
1239                     wd.aux = 0;
1240                     wd.fpos = t.pos;
1241                     if (!indexing || index_visible) {
1242                         wd.text = ustrdup(utext);
1243                         uword = addword(wd, &whptr);
1244                     } else
1245                         uword = NULL;
1246                     if (indexing) {
1247                         wd.text = ustrdup(utext);
1248                         iword = addword(wd, &idximplicit);
1249                     } else
1250                         iword = NULL;
1251                     dtor(t), t = get_token(in);
1252                     if (t.type == tok_lbrace) {
1253                         /*
1254                          * \u with a left brace. Until the brace
1255                          * closes, all further words go on a
1256                          * sidetrack from the main thread of the
1257                          * paragraph.
1258                          */
1259                         sitem = mknew(struct stack_item);
1260                         sitem->type = stack_ualt;
1261                         sitem->whptr = whptr;
1262                         sitem->idximplicit = idximplicit;
1263                         stk_push(parsestk, sitem);
1264                         whptr = uword ? &uword->alt : NULL;
1265                         idximplicit = iword ? &iword->alt : NULL;
1266                     } else {
1267                         if (indexing)
1268                             rdadd(&indexstr, uchr);
1269                         already = TRUE;
1270                     }
1271                     break;
1272                   default:
1273                     if (!macrolookup(macros, in, t.text, &t.pos))
1274                         error(err_badmidcmd, t.text, &t.pos);
1275                     break;
1276                 }
1277             }
1278             if (!already)
1279                 dtor(t), t = get_token(in);
1280             seenwhite = iswhite;
1281         }
1282         finished_para:
1283         /* Check the stack is empty */
1284         if (stk_top(parsestk)) {
1285             while ((sitem = stk_pop(parsestk)))
1286                 sfree(sitem);
1287             error(err_missingrbrace, &t.pos);
1288         }
1289         stk_free(parsestk);
1290         prev_para_type = par.type;
1291         addpara(par, ret);
1292         if (t.type == tok_eof)
1293             already = TRUE;
1294     }
1295
1296     if (stk_top(crossparastk)) {
1297         void *p;
1298
1299         error(err_missingrbrace2, &t.pos);
1300         while ((p = stk_pop(crossparastk)))
1301             sfree(p);
1302     }
1303
1304     /*
1305      * We break to here rather than returning, because otherwise
1306      * this cleanup doesn't happen.
1307      */
1308     dtor(t);
1309     macrocleanup(macros);
1310
1311     stk_free(crossparastk);
1312 }
1313
1314 paragraph *read_input(input *in, indexdata *idx) {
1315     paragraph *head = NULL;
1316     paragraph **hptr = &head;
1317
1318     while (in->currindex < in->nfiles) {
1319         in->currfp = fopen(in->filenames[in->currindex], "r");
1320         if (in->currfp) {
1321             setpos(in, in->filenames[in->currindex]);
1322             read_file(&hptr, in, idx);
1323         }
1324         in->currindex++;
1325     }
1326
1327     return head;
1328 }