mdw@git.distorted.org.uk Git - sgt/halibut/blob - input.c

   1 /*
   2  * input.c: read the source form
   3  */
   4
   5 #include <stdio.h>
   6 #include <assert.h>
   7 #include <time.h>
   8 #include "halibut.h"
   9
  10 #define TAB_STOP 8                     /* for column number tracking */
  11
  12 static void setpos(input *in, char *fname) {
  13     in->pos.filename = fname;
  14     in->pos.line = 1;
  15     in->pos.col = (in->reportcols ? 1 : -1);
  16 }
  17
  18 static void unget(input *in, int c, filepos *pos) {
  19     if (in->npushback >= in->pushbacksize) {
  20         in->pushbacksize = in->npushback + 16;
  21         in->pushback = resize(in->pushback, in->pushbacksize);
  22     }
  23     in->pushback[in->npushback].chr = c;
  24     in->pushback[in->npushback].pos = *pos;   /* structure copy */
  25     in->npushback++;
  26 }
  27
  28 /* ---------------------------------------------------------------------- */
  29 /*
  30  * Macro subsystem
  31  */
  32 typedef struct macro_Tag macro;
  33 struct macro_Tag {
  34     wchar_t *name, *text;
  35 };
  36 struct macrostack_Tag {
  37     macrostack *next;
  38     wchar_t *text;
  39     int ptr, npushback;
  40     filepos pos;
  41 };
  42 static int macrocmp(void *av, void *bv) {
  43     macro *a = (macro *)av, *b = (macro *)bv;
  44     return ustrcmp(a->name, b->name);
  45 }
  46 static void macrodef(tree234 *macros, wchar_t *name, wchar_t *text,
  47                      filepos fpos) {
  48     macro *m = mknew(macro);
  49     m->name = name;
  50     m->text = text;
  51     if (add234(macros, m) != m) {
  52         error(err_macroexists, &fpos, name);
  53         sfree(name);
  54         sfree(text);
  55     }
  56 }
  57 static int macrolookup(tree234 *macros, input *in, wchar_t *name,
  58                        filepos *pos) {
  59     macro m, *gotit;
  60     m.name = name;
  61     gotit = find234(macros, &m, NULL);
  62     if (gotit) {
  63         macrostack *expansion = mknew(macrostack);
  64         expansion->next = in->stack;
  65         expansion->text = gotit->text;
  66         expansion->pos = *pos;         /* structure copy */
  67         expansion->ptr = 0;
  68         expansion->npushback = in->npushback;
  69         in->stack = expansion;
  70         return TRUE;
  71     } else
  72         return FALSE;
  73 }
  74 static void macrocleanup(tree234 *macros) {
  75     int ti;
  76     macro *m;
  77     for (ti = 0; (m = (macro *)index234(macros, ti)) != NULL; ti++) {
  78         sfree(m->name);
  79         sfree(m->text);
  80         sfree(m);
  81     }
  82     freetree234(macros);
  83 }
  84
  85 /*
  86  * Can return EOF
  87  */
  88 static int get(input *in, filepos *pos) {
  89     int pushbackpt = in->stack ? in->stack->npushback : 0;
  90     if (in->npushback > pushbackpt) {
  91         --in->npushback;
  92         if (pos)
  93             *pos = in->pushback[in->npushback].pos;   /* structure copy */
  94         return in->pushback[in->npushback].chr;
  95     }
  96     else if (in->stack) {
  97         wchar_t c = in->stack->text[in->stack->ptr];
  98         if (in->stack->text[++in->stack->ptr] == L'\0') {
  99             macrostack *tmp = in->stack;
 100             in->stack = tmp->next;
 101             sfree(tmp);
 102         }
 103         return c;
 104     }
 105     else if (in->currfp) {
 106         int c = getc(in->currfp);
 107
 108         if (c == EOF) {
 109             fclose(in->currfp);
 110             in->currfp = NULL;
 111         }
 112         /* Track line numbers, for error reporting */
 113         if (pos)
 114             *pos = in->pos;
 115         if (in->reportcols) {
 116             switch (c) {
 117               case '\t':
 118                 in->pos.col = 1 + (in->pos.col + TAB_STOP-1) % TAB_STOP;
 119                 break;
 120               case '\n':
 121                 in->pos.col = 1;
 122                 in->pos.line++;
 123                 break;
 124               default:
 125                 in->pos.col++;
 126                 break;
 127             }
 128         } else {
 129             in->pos.col = -1;
 130             if (c == '\n')
 131                 in->pos.line++;
 132         }
 133         /* FIXME: do input charmap translation. We should be returning
 134          * Unicode here. */
 135         return c;
 136     } else
 137         return EOF;
 138 }
 139
 140 /*
 141  * Lexical analysis of source files.
 142  */
 143 typedef struct token_Tag token;
 144 struct token_Tag {
 145     int type;
 146     int cmd, aux;
 147     wchar_t *text;
 148     filepos pos;
 149 };
 150 enum {
 151     tok_eof,                           /* end of file */
 152     tok_eop,                           /* end of paragraph */
 153     tok_white,                         /* whitespace */
 154     tok_word,                          /* a word or word fragment */
 155     tok_cmd,                           /* \command */
 156     tok_lbrace,                        /* { */
 157     tok_rbrace                         /* } */
 158 };
 159
 160 /* Halibut command keywords. */
 161 enum {
 162     c__invalid,                        /* invalid command */
 163     c__comment,                        /* comment command (\#) */
 164     c__escaped,                        /* escaped character */
 165     c__nbsp,                           /* nonbreaking space */
 166     c_A,                               /* appendix heading */
 167     c_B,                               /* bibliography entry */
 168     c_BR,                              /* bibliography rewrite */
 169     c_C,                               /* chapter heading */
 170     c_H,                               /* heading */
 171     c_I,                               /* invisible index mark */
 172     c_IM,                              /* index merge/rewrite */
 173     c_K,                               /* capitalised cross-reference */
 174     c_S,                               /* aux field is 0, 1, 2, ... */
 175     c_U,                               /* unnumbered-chapter heading */
 176     c_W,                               /* Web hyperlink */
 177     c_b,                               /* bulletted list */
 178     c_c,                               /* code */
 179     c_cfg,                             /* configuration directive */
 180     c_copyright,                       /* copyright statement */
 181     c_cw,                              /* weak code */
 182     c_date,                            /* document processing date */
 183     c_define,                          /* macro definition */
 184     c_e,                               /* emphasis */
 185     c_i,                               /* visible index mark */
 186     c_ii,                              /* uncapitalised visible index mark */
 187     c_k,                               /* uncapitalised cross-reference */
 188     c_n,                               /* numbered list */
 189     c_nocite,                          /* bibliography trickery */
 190     c_preamble,                        /* document preamble text */
 191     c_q,                               /* quote marks */
 192     c_rule,                            /* horizontal rule */
 193     c_title,                           /* document title */
 194     c_u,                               /* aux field is char code */
 195     c_versionid                        /* document RCS id */
 196 };
 197
 198 /* Perhaps whitespace should be defined in a more Unicode-friendly way? */
 199 #define iswhite(c) ( (c)==32 || (c)==9 || (c)==13 || (c)==10 )
 200 #define isnl(c) ( (c)==10 )
 201 #define isdec(c) ( ((c)>='0'&&(c)<='9') )
 202 #define fromdec(c) ( (c)-'0' )
 203 #define ishex(c) ( ((c)>='0'&&(c)<='9') || ((c)>='A'&&(c)<='F') || ((c)>='a'&&(c)<='f'))
 204 #define fromhex(c) ( (c)<='9' ? (c)-'0' : ((c)&0xDF) - ('A'-10) )
 205 #define iscmd(c) ( ((c)>='0'&&(c)<='9') || ((c)>='A'&&(c)<='Z') || ((c)>='a'&&(c)<='z'))
 206
 207 /*
 208  * Keyword comparison function. Like strcmp, but between a wchar_t *
 209  * and a char *.
 210  */
 211 static int kwcmp(wchar_t const *p, char const *q) {
 212     int i;
 213     do {
 214         i = *p - *q;
 215     } while (*p++ && *q++ && !i);
 216     return i;
 217 }
 218
 219 /*
 220  * Match a keyword.
 221  */
 222 static void match_kw(token *tok) {
 223     /*
 224      * FIXME. The ids are explicit in here so as to allow long-name
 225      * equivalents to the various very short keywords.
 226      */
 227     static const struct { char const *name; int id; } keywords[] = {
 228         {"#", c__comment},             /* comment command (\#) */
 229         {"-", c__escaped},             /* nonbreaking hyphen */
 230         {"A", c_A},                    /* appendix heading */
 231         {"B", c_B},                    /* bibliography entry */
 232         {"BR", c_BR},                  /* bibliography rewrite */
 233         {"C", c_C},                    /* chapter heading */
 234         {"H", c_H},                    /* heading */
 235         {"I", c_I},                    /* invisible index mark */
 236         {"IM", c_IM},                  /* index merge/rewrite */
 237         {"K", c_K},                    /* capitalised cross-reference */
 238         {"U", c_U},                    /* unnumbered-chapter heading */
 239         {"W", c_W},                    /* Web hyperlink */
 240         {"\\", c__escaped},            /* escaped backslash (\\) */
 241         {"_", c__nbsp},                /* nonbreaking space (\_) */
 242         {"b", c_b},                    /* bulletted list */
 243         {"c", c_c},                    /* code */
 244         {"cfg", c_cfg},                /* configuration directive */
 245         {"copyright", c_copyright},    /* copyright statement */
 246         {"cw", c_cw},                  /* weak code */
 247         {"date", c_date},              /* document processing date */
 248         {"define", c_define},          /* macro definition */
 249         {"e", c_e},                    /* emphasis */
 250         {"i", c_i},                    /* visible index mark */
 251         {"ii", c_ii},                  /* uncapitalised visible index mark */
 252         {"k", c_k},                    /* uncapitalised cross-reference */
 253         {"n", c_n},                    /* numbered list */
 254         {"nocite", c_nocite},          /* bibliography trickery */
 255         {"preamble", c_preamble},      /* document preamble text */
 256         {"q", c_q},                    /* quote marks */
 257         {"rule", c_rule},              /* horizontal rule */
 258         {"title", c_title},            /* document title */
 259         {"versionid", c_versionid},    /* document RCS id */
 260         {"{", c__escaped},             /* escaped lbrace (\{) */
 261         {"}", c__escaped},             /* escaped rbrace (\}) */
 262     };
 263     int i, j, k, c;
 264
 265     /*
 266      * Special cases: \S{0,1,2,...} and \uABCD. If the syntax
 267      * doesn't match correctly, we just fall through to the
 268      * binary-search phase.
 269      */
 270     if (tok->text[0] == 'S') {
 271         /* We expect numeric characters thereafter. */
 272         wchar_t *p = tok->text+1;
 273         int n;
 274         if (!*p)
 275             n = 1;
 276         else {
 277             n = 0;
 278             while (*p && isdec(*p)) {
 279                 n = 10 * n + fromdec(*p);
 280                 p++;
 281             }
 282         }
 283         if (!*p) {
 284             tok->cmd = c_S;
 285             tok->aux = n;
 286             return;
 287         }
 288     } else if (tok->text[0] == 'u') {
 289         /* We expect hex characters thereafter. */
 290         wchar_t *p = tok->text+1;
 291         int n = 0;
 292         while (*p && ishex(*p)) {
 293             n = 16 * n + fromhex(*p);
 294             p++;
 295         }
 296         if (!*p) {
 297             tok->cmd = c_u;
 298             tok->aux = n;
 299             return;
 300         }
 301     }
 302
 303     i = -1;
 304     j = sizeof(keywords)/sizeof(*keywords);
 305     while (j-i > 1) {
 306         k = (i+j)/2;
 307         c = kwcmp(tok->text, keywords[k].name);
 308         if (c < 0)
 309             j = k;
 310         else if (c > 0)
 311             i = k;
 312         else /* c == 0 */ {
 313             tok->cmd = keywords[k].id;
 314             return;
 315         }
 316     }
 317
 318     tok->cmd = c__invalid;
 319 }
 320
 321
 322 /*
 323  * Read a token from the input file, in the normal way (`normal' in
 324  * the sense that code paragraphs work a different way).
 325  */
 326 token get_token(input *in) {
 327     int c;
 328     int nls;
 329     token ret;
 330     rdstring rs = { 0, 0, NULL };
 331     filepos cpos;
 332
 333     ret.text = NULL;                   /* default */
 334     c = get(in, &cpos);
 335     ret.pos = cpos;
 336     if (iswhite(c)) {                  /* tok_white or tok_eop */
 337         nls = 0;
 338         do {
 339             if (isnl(c))
 340                 nls++;
 341         } while ((c = get(in, &cpos)) != EOF && iswhite(c));
 342         if (c == EOF) {
 343             ret.type = tok_eof;
 344             return ret;
 345         }
 346         unget(in, c, &cpos);
 347         ret.type = (nls > 1 ? tok_eop : tok_white);
 348         return ret;
 349     } else if (c == EOF) {             /* tok_eof */
 350         ret.type = tok_eof;
 351         return ret;
 352     } else if (c == '\\') {            /* tok_cmd */
 353         c = get(in, &cpos);
 354         if (c == '-' || c == '\\' || c == '_' ||
 355             c == '#' || c == '{' || c == '}') {
 356             /* single-char command */
 357             rdadd(&rs, c);
 358         } else if (c == 'u') {
 359             int len = 0;
 360             do {
 361                 rdadd(&rs, c);
 362                 len++;
 363                 c = get(in, &cpos);
 364             } while (ishex(c) && len < 5);
 365             unget(in, c, &cpos);
 366         } else if (iscmd(c)) {
 367             do {
 368                 rdadd(&rs, c);
 369                 c = get(in, &cpos);
 370             } while (iscmd(c));
 371             unget(in, c, &cpos);
 372         }
 373         /*
 374          * Now match the command against the list of available
 375          * ones.
 376          */
 377         ret.type = tok_cmd;
 378         ret.text = ustrdup(rs.text);
 379         match_kw(&ret);
 380         sfree(rs.text);
 381         return ret;
 382     } else if (c == '{') {             /* tok_lbrace */
 383         ret.type = tok_lbrace;
 384         return ret;
 385     } else if (c == '}') {             /* tok_rbrace */
 386         ret.type = tok_rbrace;
 387         return ret;
 388     } else {                           /* tok_word */
 389         /*
 390          * Read a word: the longest possible contiguous sequence of
 391          * things other than whitespace, backslash, braces and
 392          * hyphen. A hyphen terminates the word but is returned as
 393          * part of it; everything else is pushed back for the next
 394          * token. The `aux' field contains TRUE if the word ends in
 395          * a hyphen.
 396          */
 397         ret.aux = FALSE;               /* assumed for now */
 398         while (1) {
 399             if (iswhite(c) || c=='{' || c=='}' || c=='\\' || c==EOF) {
 400                 /* Put back the character that caused termination */
 401                 unget(in, c, &cpos);
 402                 break;
 403             } else {
 404                 rdadd(&rs, c);
 405                 if (c == '-') {
 406                     ret.aux = TRUE;
 407                     break;             /* hyphen terminates word */
 408                 }
 409             }
 410             c = get(in, &cpos);
 411         }
 412         ret.type = tok_word;
 413         ret.text = ustrdup(rs.text);
 414         sfree(rs.text);
 415         return ret;
 416     }
 417 }
 418
 419 /*
 420  * Determine whether the next input character is an open brace (for
 421  * telling code paragraphs from paragraphs which merely start with
 422  * code).
 423  */
 424 int isbrace(input *in) {
 425     int c;
 426     filepos cpos;
 427
 428     c = get(in, &cpos);
 429     unget(in, c, &cpos);
 430     return (c == '{');
 431 }
 432
 433 /*
 434  * Read the rest of a line that starts `\c'. Including nothing at
 435  * all (tok_word with empty text).
 436  */
 437 token get_codepar_token(input *in) {
 438     int c;
 439     token ret;
 440     rdstring rs = { 0, 0, NULL };
 441     filepos cpos;
 442
 443     ret.type = tok_word;
 444     c = get(in, &cpos);                /* expect (and discard) one space */
 445     ret.pos = cpos;
 446     if (c == ' ') {
 447         c = get(in, &cpos);
 448         ret.pos = cpos;
 449     }
 450     while (!isnl(c) && c != EOF) {
 451         int c2 = c;
 452         c = get(in, &cpos);
 453         /* Discard \r just before \n. */
 454         if (c2 != 13 || !isnl(c))
 455             rdadd(&rs, c2);
 456     }
 457     unget(in, c, &cpos);
 458     ret.text = ustrdup(rs.text);
 459     sfree(rs.text);
 460     return ret;
 461 }
 462
 463 /*
 464  * Adds a new word to a linked list
 465  */
 466 static word *addword(word newword, word ***hptrptr) {
 467     word *mnewword;
 468     if (!hptrptr)
 469         return NULL;
 470     mnewword = mknew(word);
 471     *mnewword = newword;               /* structure copy */
 472     mnewword->next = NULL;
 473     **hptrptr = mnewword;
 474     *hptrptr = &mnewword->next;
 475     return mnewword;
 476 }
 477
 478 /*
 479  * Adds a new paragraph to a linked list
 480  */
 481 static paragraph *addpara(paragraph newpara, paragraph ***hptrptr) {
 482     paragraph *mnewpara = mknew(paragraph);
 483     *mnewpara = newpara;               /* structure copy */
 484     mnewpara->next = NULL;
 485     **hptrptr = mnewpara;
 486     *hptrptr = &mnewpara->next;
 487     return mnewpara;
 488 }
 489
 490 /*
 491  * Destructor before token is reassigned; should catch most memory
 492  * leaks
 493  */
 494 #define dtor(t) ( sfree(t.text) )
 495
 496 /*
 497  * Reads a single file (ie until get() returns EOF)
 498  */
 499 static void read_file(paragraph ***ret, input *in, indexdata *idx) {
 500     token t;
 501     paragraph par;
 502     word wd, **whptr, **idximplicit;
 503     tree234 *macros;
 504     wchar_t utext[2], *wdtext;
 505     int style, spcstyle;
 506     int already;
 507     int iswhite, seenwhite;
 508     int type;
 509     struct stack_item {
 510         enum {
 511             stack_nop = 0,             /* do nothing (for error recovery) */
 512             stack_ualt = 1,            /* \u alternative */
 513             stack_style = 2,           /* \e, \c, \cw */
 514             stack_idx = 4,             /* \I, \i, \ii */
 515             stack_hyper = 8,           /* \W */
 516             stack_quote = 16,          /* \q */
 517         } type;
 518         word **whptr;                  /* to restore from \u alternatives */
 519         word **idximplicit;            /* to restore from \u alternatives */
 520     } *sitem;
 521     stack parsestk;
 522     word *indexword, *uword, *iword;
 523     word *idxwordlist;
 524     rdstring indexstr;
 525     int index_downcase, index_visible, indexing;
 526     const rdstring nullrs = { 0, 0, NULL };
 527     wchar_t uchr;
 528
 529     t.text = NULL;
 530     macros = newtree234(macrocmp);
 531
 532     /*
 533      * Loop on each paragraph.
 534      */
 535     while (1) {
 536         par.words = NULL;
 537         par.keyword = NULL;
 538         whptr = &par.words;
 539
 540         /*
 541          * Get a token.
 542          */
 543         dtor(t), t = get_token(in);
 544         if (t.type == tok_eof)
 545             return;
 546
 547         /*
 548          * Parse code paragraphs separately.
 549          */
 550         if (t.type == tok_cmd && t.cmd == c_c && !isbrace(in)) {
 551             par.type = para_Code;
 552             par.fpos = t.pos;
 553             while (1) {
 554                 dtor(t), t = get_codepar_token(in);
 555                 wd.type = word_WeakCode;
 556                 wd.breaks = FALSE;     /* shouldn't need this... */
 557                 wd.text = ustrdup(t.text);
 558                 wd.alt = NULL;
 559                 wd.fpos = t.pos;
 560                 addword(wd, &whptr);
 561                 dtor(t), t = get_token(in);
 562                 if (t.type == tok_white) {
 563                     /*
 564                      * The newline after a code-paragraph line
 565                      */
 566                     dtor(t), t = get_token(in);
 567                 }
 568                 if (t.type == tok_eop || t.type == tok_eof)
 569                     break;
 570                 else if (t.type != tok_cmd || t.cmd != c_c) {
 571                     error(err_brokencodepara, &t.pos);
 572                     addpara(par, ret);
 573                     while (t.type != tok_eop)   /* error recovery: */
 574                         dtor(t), t = get_token(in);   /* eat rest of paragraph */
 575                     goto codeparabroken;   /* ick, but such is life */
 576                 }
 577             }
 578             addpara(par, ret);
 579             codeparabroken:
 580             continue;
 581         }
 582
 583         /*
 584          * This token begins a paragraph. See if it's one of the
 585          * special commands that define a paragraph type.
 586          *
 587          * (note that \# is special in a way, and \nocite takes no
 588          * text)
 589          */
 590         par.type = para_Normal;
 591         if (t.type == tok_cmd) {
 592             int needkw;
 593             int is_macro = FALSE;
 594
 595             par.fpos = t.pos;
 596             switch (t.cmd) {
 597               default:
 598                 needkw = -1;
 599                 break;
 600               case c__invalid:
 601                 error(err_badparatype, t.text, &t.pos);
 602                 needkw = 4;
 603                 break;
 604               case c__comment:
 605                 if (isbrace(in))
 606                     break;             /* `\#{': isn't a comment para */
 607                 do {
 608                     dtor(t), t = get_token(in);
 609                 } while (t.type != tok_eop && t.type != tok_eof);
 610                 continue;              /* next paragraph */
 611                 /*
 612                  * `needkw' values:
 613                  *
 614                  *   1 -- exactly one keyword
 615                  *   2 -- at least one keyword
 616                  *   4 -- any number of keywords including zero
 617                  *   8 -- at least one keyword and then nothing else
 618                  *  16 -- nothing at all! no keywords, no body
 619                  *  32 -- no keywords at all
 620                  */
 621               case c_A: needkw = 2; par.type = para_Appendix; break;
 622               case c_B: needkw = 2; par.type = para_Biblio; break;
 623               case c_BR: needkw = 1; par.type = para_BR; break;
 624               case c_C: needkw = 2; par.type = para_Chapter; break;
 625               case c_H: needkw = 2; par.type = para_Heading;
 626                 par.aux = 0;
 627                 break;
 628               case c_IM: needkw = 2; par.type = para_IM; break;
 629               case c_S: needkw = 2; par.type = para_Subsect;
 630                 par.aux = t.aux; break;
 631               case c_U: needkw = 32; par.type = para_UnnumberedChapter; break;
 632                 /* For \b and \n the keyword is optional */
 633               case c_b: needkw = 4; par.type = para_Bullet; break;
 634               case c_n: needkw = 4; par.type = para_NumberedList; break;
 635               case c_cfg: needkw = 8; par.type = para_Config; break;
 636               case c_copyright: needkw = 32; par.type = para_Copyright; break;
 637               case c_define: is_macro = TRUE; needkw = 1; break;
 638                 /* For \nocite the keyword is _everything_ */
 639               case c_nocite: needkw = 8; par.type = para_NoCite; break;
 640               case c_preamble: needkw = 32; par.type = para_Preamble; break;
 641               case c_rule: needkw = 16; par.type = para_Rule; break;
 642               case c_title: needkw = 32; par.type = para_Title; break;
 643               case c_versionid: needkw = 32; par.type = para_VersionID; break;
 644             }
 645
 646             if (needkw > 0) {
 647                 rdstring rs = { 0, 0, NULL };
 648                 int nkeys = 0;
 649                 filepos fp;
 650
 651                 /* Get keywords. */
 652                 dtor(t), t = get_token(in);
 653                 fp = t.pos;
 654                 while (t.type == tok_lbrace) {
 655                     /* This is a keyword. */
 656                     nkeys++;
 657                     /* FIXME: there will be bugs if anyone specifies an
 658                      * empty keyword (\foo{}), so trap this case. */
 659                     while (dtor(t), t = get_token(in),
 660                            t.type == tok_word ||
 661                            t.type == tok_white ||
 662                            (t.type == tok_cmd && t.cmd == c__nbsp) ||
 663                            (t.type == tok_cmd && t.cmd == c__escaped)) {
 664                         if (t.type == tok_white ||
 665                             (t.type == tok_cmd && t.cmd == c__nbsp))
 666                             rdadd(&rs, ' ');
 667                         else
 668                             rdadds(&rs, t.text);
 669                     }
 670                     if (t.type != tok_rbrace) {
 671                         error(err_kwunclosed, &t.pos);
 672                         continue;
 673                     }
 674                     rdadd(&rs, 0);     /* add string terminator */
 675                     dtor(t), t = get_token(in); /* eat right brace */
 676                 }
 677
 678                 rdadd(&rs, 0);     /* add string terminator */
 679
 680                 /* See whether we have the right number of keywords. */
 681                 if ((needkw & 48) && nkeys > 0)
 682                     error(err_kwillegal, &fp);
 683                 if ((needkw & 11) && nkeys == 0)
 684                     error(err_kwexpected, &fp);
 685                 if ((needkw & 5) && nkeys > 1)
 686                     error(err_kwtoomany, &fp);
 687
 688                 if (is_macro) {
 689                     /*
 690                      * Macro definition. Get the rest of the line
 691                      * as a code-paragraph token, repeatedly until
 692                      * there's nothing more left of it. Separate
 693                      * with newlines.
 694                      */
 695                     rdstring macrotext = { 0, 0, NULL };
 696                     while (1) {
 697                         dtor(t), t = get_codepar_token(in);
 698                         if (macrotext.pos > 0)
 699                             rdadd(&macrotext, L'\n');
 700                         rdadds(&macrotext, t.text);
 701                         dtor(t), t = get_token(in);
 702                         if (t.type == tok_eop) break;
 703                     }
 704                     macrodef(macros, rs.text, macrotext.text, fp);
 705                     continue;          /* next paragraph */
 706                 }
 707
 708                 par.keyword = rdtrim(&rs);
 709
 710                 /* Move to EOP in case of needkw==8 or 16 (no body) */
 711                 if (needkw & 24) {
 712                     if (t.type != tok_eop && t.type != tok_eof) {
 713                         error(err_bodyillegal, &t.pos);
 714                         /* Error recovery: eat the rest of the paragraph */
 715                         while (t.type != tok_eop && t.type != tok_eof)
 716                             dtor(t), t = get_token(in);
 717                     }
 718                     addpara(par, ret);
 719                     continue;          /* next paragraph */
 720                 }
 721             }
 722         }
 723
 724         /*
 725          * Now read the actual paragraph, word by word, adding to
 726          * the paragraph list.
 727          *
 728          * Mid-paragraph commands:
 729          *
 730          *  \K \k
 731          *  \c \cw
 732          *  \e
 733          *  \i \ii
 734          *  \I
 735          *  \u
 736          *  \W
 737          *  \date
 738          *  \\ \{ \}
 739          */
 740         parsestk = stk_new();
 741         style = word_Normal;
 742         spcstyle = word_WhiteSpace;
 743         indexing = FALSE;
 744         seenwhite = TRUE;
 745         while (t.type != tok_eop && t.type != tok_eof) {
 746             iswhite = FALSE;
 747             already = FALSE;
 748             if (t.type == tok_cmd && t.cmd == c__escaped) {
 749                 t.type = tok_word;     /* nice and simple */
 750                 t.aux = 0;             /* even if `\-' - nonbreaking! */
 751             }
 752             if (t.type == tok_cmd && t.cmd == c__nbsp) {
 753                 t.type = tok_word;     /* nice and simple */
 754                 sfree(t.text);
 755                 t.text = ustrdup(L" ");  /* text is ` ' not `_' */
 756                 t.aux = 0;             /* (nonbreaking) */
 757             }
 758             switch (t.type) {
 759               case tok_white:
 760                 if (whptr == &par.words)
 761                     break;             /* strip whitespace at start of para */
 762                 wd.text = NULL;
 763                 wd.type = spcstyle;
 764                 wd.alt = NULL;
 765                 wd.aux = 0;
 766                 wd.fpos = t.pos;
 767                 wd.breaks = FALSE;
 768                 if (indexing)
 769                     rdadd(&indexstr, ' ');
 770                 if (!indexing || index_visible)
 771                     addword(wd, &whptr);
 772                 if (indexing)
 773                     addword(wd, &idximplicit);
 774                 iswhite = TRUE;
 775                 break;
 776               case tok_word:
 777                 if (indexing)
 778                     rdadds(&indexstr, t.text);
 779                 wd.type = style;
 780                 wd.alt = NULL;
 781                 wd.aux = 0;
 782                 wd.fpos = t.pos;
 783                 wd.breaks = t.aux;
 784                 if (!indexing || index_visible) {
 785                     wd.text = ustrdup(t.text);
 786                     addword(wd, &whptr);
 787                 }
 788                 if (indexing) {
 789                     wd.text = ustrdup(t.text);
 790                     addword(wd, &idximplicit);
 791                 }
 792                 break;
 793               case tok_lbrace:
 794                 error(err_unexbrace, &t.pos);
 795                 /* Error recovery: push nop */
 796                 sitem = mknew(struct stack_item);
 797                 sitem->type = stack_nop;
 798                 stk_push(parsestk, sitem);
 799                 break;
 800               case tok_rbrace:
 801                 sitem = stk_pop(parsestk);
 802                 if (!sitem)
 803                     error(err_unexbrace, &t.pos);
 804                 else {
 805                     if (sitem->type & stack_ualt) {
 806                         whptr = sitem->whptr;
 807                         idximplicit = sitem->idximplicit;
 808                     }
 809                     if (sitem->type & stack_style) {
 810                         style = word_Normal;
 811                         spcstyle = word_WhiteSpace;
 812                     }
 813                     if (sitem->type & stack_idx) {
 814                         indexword->text = ustrdup(indexstr.text);
 815                         if (index_downcase)
 816                             ustrlow(indexword->text);
 817                         indexing = FALSE;
 818                         rdadd(&indexstr, L'\0');
 819                         index_merge(idx, FALSE, indexstr.text, idxwordlist);
 820                         sfree(indexstr.text);
 821                     }
 822                     if (sitem->type & stack_hyper) {
 823                         wd.text = NULL;
 824                         wd.type = word_HyperEnd;
 825                         wd.alt = NULL;
 826                         wd.aux = 0;
 827                         wd.fpos = t.pos;
 828                         wd.breaks = FALSE;
 829                         if (!indexing || index_visible)
 830                             addword(wd, &whptr);
 831                         if (indexing)
 832                             addword(wd, &idximplicit);
 833                     }
 834                     if (sitem->type & stack_quote) {
 835                         wd.text = NULL;
 836                         wd.type = toquotestyle(style);
 837                         wd.alt = NULL;
 838                         wd.aux = quote_Close;
 839                         wd.fpos = t.pos;
 840                         wd.breaks = FALSE;
 841                         if (!indexing || index_visible)
 842                             addword(wd, &whptr);
 843                         if (indexing) {
 844                             rdadd(&indexstr, L'"');
 845                             addword(wd, &idximplicit);
 846                         }
 847                     }
 848                 }
 849                 sfree(sitem);
 850                 break;
 851               case tok_cmd:
 852                 switch (t.cmd) {
 853                   case c__comment:
 854                     /*
 855                      * In-paragraph comment: \#{ balanced braces }
 856                      *
 857                      * Anything goes here; even tok_eop. We should
 858                      * eat whitespace after the close brace _if_
 859                      * there was whitespace before the \#.
 860                      */
 861                     dtor(t), t = get_token(in);
 862                     if (t.type != tok_lbrace) {
 863                         error(err_explbr, &t.pos);
 864                     } else {
 865                         int braces = 1;
 866                         while (braces > 0) {
 867                             dtor(t), t = get_token(in);
 868                             if (t.type == tok_lbrace)
 869                                 braces++;
 870                             else if (t.type == tok_rbrace)
 871                                 braces--;
 872                             else if (t.type == tok_eof) {
 873                                 error(err_commenteof, &t.pos);
 874                                 break;
 875                             }
 876                         }
 877                     }
 878                     if (seenwhite) {
 879                         already = TRUE;
 880                         dtor(t), t = get_token(in);
 881                         if (t.type == tok_white) {
 882                             iswhite = TRUE;
 883                             already = FALSE;
 884                         }
 885                     }
 886                     break;
 887                   case c_q:
 888                     dtor(t), t = get_token(in);
 889                     if (t.type != tok_lbrace) {
 890                         error(err_explbr, &t.pos);
 891                     } else {
 892                         wd.text = NULL;
 893                         wd.type = toquotestyle(style);
 894                         wd.alt = NULL;
 895                         wd.aux = quote_Open;
 896                         wd.fpos = t.pos;
 897                         wd.breaks = FALSE;
 898                         if (!indexing || index_visible)
 899                             addword(wd, &whptr);
 900                         if (indexing) {
 901                             rdadd(&indexstr, L'"');
 902                             addword(wd, &idximplicit);
 903                         }
 904                         sitem = mknew(struct stack_item);
 905                         sitem->type = stack_quote;
 906                         stk_push(parsestk, sitem);
 907                     }
 908                     break;
 909                   case c_K:
 910                   case c_k:
 911                   case c_W:
 912                   case c_date:
 913                     /*
 914                      * Keyword, hyperlink, or \date. We expect a
 915                      * left brace, some text, and then a right
 916                      * brace. No nesting; no arguments.
 917                      */
 918                     wd.fpos = t.pos;
 919                     wd.breaks = FALSE;
 920                     if (t.cmd == c_K)
 921                         wd.type = word_UpperXref;
 922                     else if (t.cmd == c_k)
 923                         wd.type = word_LowerXref;
 924                     else if (t.cmd == c_W)
 925                         wd.type = word_HyperLink;
 926                     else
 927                         wd.type = word_Normal;
 928                     dtor(t), t = get_token(in);
 929                     if (t.type != tok_lbrace) {
 930                         if (wd.type == word_Normal) {
 931                             time_t thetime = time(NULL);
 932                             struct tm *broken = localtime(&thetime);
 933                             already = TRUE;
 934                             wdtext = ustrftime(NULL, broken);
 935                             wd.type = style;
 936                         } else {
 937                             error(err_explbr, &t.pos);
 938                             wdtext = NULL;
 939                         }
 940                     } else {
 941                         rdstring rs = { 0, 0, NULL };
 942                         while (dtor(t), t = get_token(in),
 943                                t.type == tok_word || t.type == tok_white) {
 944                             if (t.type == tok_white)
 945                                 rdadd(&rs, ' ');
 946                             else
 947                                 rdadds(&rs, t.text);
 948                         }
 949                         if (wd.type == word_Normal) {
 950                             time_t thetime = time(NULL);
 951                             struct tm *broken = localtime(&thetime);
 952                             wdtext = ustrftime(rs.text, broken);
 953                             wd.type = style;
 954                         } else {
 955                             wdtext = ustrdup(rs.text);
 956                         }
 957                         sfree(rs.text);
 958                         if (t.type != tok_rbrace) {
 959                             error(err_kwexprbr, &t.pos);
 960                         }
 961                     }
 962                     wd.alt = NULL;
 963                     wd.aux = 0;
 964                     if (!indexing || index_visible) {
 965                         wd.text = ustrdup(wdtext);
 966                         addword(wd, &whptr);
 967                     }
 968                     if (indexing) {
 969                         wd.text = ustrdup(wdtext);
 970                         addword(wd, &idximplicit);
 971                     }
 972                     sfree(wdtext);
 973                     if (wd.type == word_HyperLink) {
 974                         /*
 975                          * Hyperlinks are different: they then
 976                          * expect another left brace, to begin
 977                          * delimiting the text marked by the link.
 978                          */
 979                         dtor(t), t = get_token(in);
 980                         /*
 981                          * Special cases: \W{}\c, \W{}\e, \W{}\cw
 982                          */
 983                         sitem = mknew(struct stack_item);
 984                         sitem->type = stack_hyper;
 985                         if (t.type == tok_cmd &&
 986                             (t.cmd == c_e || t.cmd == c_c || t.cmd == c_cw)) {
 987                             if (style != word_Normal)
 988                                 error(err_nestedstyles, &t.pos);
 989                             else {
 990                                 style = (t.cmd == c_c ? word_Code :
 991                                          t.cmd == c_cw ? word_WeakCode :
 992                                          word_Emph);
 993                                 spcstyle = tospacestyle(style);
 994                                 sitem->type |= stack_style;
 995                             }
 996                             dtor(t), t = get_token(in);
 997                         }
 998                         if (t.type != tok_lbrace) {
 999                             error(err_explbr, &t.pos);
1000                             sfree(sitem);
1001                         } else {
1002                             stk_push(parsestk, sitem);
1003                         }
1004                     }
1005                     break;
1006                   case c_c:
1007                   case c_cw:
1008                   case c_e:
1009                     type = t.cmd;
1010                     if (style != word_Normal) {
1011                         error(err_nestedstyles, &t.pos);
1012                         /* Error recovery: eat lbrace, push nop. */
1013                         dtor(t), t = get_token(in);
1014                         sitem = mknew(struct stack_item);
1015                         sitem->type = stack_nop;
1016                         stk_push(parsestk, sitem);
1017                     }
1018                     dtor(t), t = get_token(in);
1019                     if (t.type != tok_lbrace) {
1020                         error(err_explbr, &t.pos);
1021                     } else {
1022                         style = (type == c_c ? word_Code :
1023                                  type == c_cw ? word_WeakCode :
1024                                  word_Emph);
1025                         spcstyle = tospacestyle(style);
1026                         sitem = mknew(struct stack_item);
1027                         sitem->type = stack_style;
1028                         stk_push(parsestk, sitem);
1029                     }
1030                     break;
1031                   case c_i:
1032                   case c_ii:
1033                   case c_I:
1034                     type = t.cmd;
1035                     if (indexing) {
1036                         error(err_nestedindex, &t.pos);
1037                         /* Error recovery: eat lbrace, push nop. */
1038                         dtor(t), t = get_token(in);
1039                         sitem = mknew(struct stack_item);
1040                         sitem->type = stack_nop;
1041                         stk_push(parsestk, sitem);
1042                     }
1043                     sitem = mknew(struct stack_item);
1044                     sitem->type = stack_idx;
1045                     dtor(t), t = get_token(in);
1046                     /*
1047                      * Special cases: \i\c, \i\e, \i\cw
1048                      */
1049                     wd.fpos = t.pos;
1050                     if (t.type == tok_cmd &&
1051                         (t.cmd == c_e || t.cmd == c_c || t.cmd == c_cw)) {
1052                         if (style != word_Normal)
1053                             error(err_nestedstyles, &t.pos);
1054                         else {
1055                             style = (t.cmd == c_c ? word_Code :
1056                                      t.cmd == c_cw ? word_WeakCode :
1057                                      word_Emph);
1058                             spcstyle = tospacestyle(style);
1059                             sitem->type |= stack_style;
1060                         }
1061                         dtor(t), t = get_token(in);
1062                     }
1063                     if (t.type != tok_lbrace) {
1064                         sfree(sitem);
1065                         error(err_explbr, &t.pos);
1066                     } else {
1067                         /* Add an index-reference word with no text as yet */
1068                         wd.type = word_IndexRef;
1069                         wd.text = NULL;
1070                         wd.alt = NULL;
1071                         wd.aux = 0;
1072                         wd.breaks = FALSE;
1073                         indexword = addword(wd, &whptr);
1074                         /* Set up a rdstring to read the index text */
1075                         indexstr = nullrs;
1076                         /* Flags so that we do the Right Things with text */
1077                         index_visible = (type != c_I);
1078                         index_downcase = (type == c_ii);
1079                         indexing = TRUE;
1080                         idxwordlist = NULL;
1081                         idximplicit = &idxwordlist;
1082                         /* Stack item to close the indexing on exit */
1083                         stk_push(parsestk, sitem);
1084                     }
1085                     break;
1086                   case c_u:
1087                     uchr = t.aux;
1088                     utext[0] = uchr; utext[1] = 0;
1089                     wd.type = style;
1090                     wd.breaks = FALSE;
1091                     wd.alt = NULL;
1092                     wd.aux = 0;
1093                     wd.fpos = t.pos;
1094                     if (!indexing || index_visible) {
1095                         wd.text = ustrdup(utext);
1096                         uword = addword(wd, &whptr);
1097                     } else
1098                         uword = NULL;
1099                     if (indexing) {
1100                         wd.text = ustrdup(utext);
1101                         iword = addword(wd, &idximplicit);
1102                     } else
1103                         iword = NULL;
1104                     dtor(t), t = get_token(in);
1105                     if (t.type == tok_lbrace) {
1106                         /*
1107                          * \u with a left brace. Until the brace
1108                          * closes, all further words go on a
1109                          * sidetrack from the main thread of the
1110                          * paragraph.
1111                          */
1112                         sitem = mknew(struct stack_item);
1113                         sitem->type = stack_ualt;
1114                         sitem->whptr = whptr;
1115                         sitem->idximplicit = idximplicit;
1116                         stk_push(parsestk, sitem);
1117                         whptr = uword ? &uword->alt : NULL;
1118                         idximplicit = iword ? &iword->alt : NULL;
1119                     } else {
1120                         if (indexing)
1121                             rdadd(&indexstr, uchr);
1122                         already = TRUE;
1123                     }
1124                     break;
1125                   default:
1126                     if (!macrolookup(macros, in, t.text, &t.pos))
1127                         error(err_badmidcmd, t.text, &t.pos);
1128                     break;
1129                 }
1130             }
1131             if (!already)
1132                 dtor(t), t = get_token(in);
1133             seenwhite = iswhite;
1134         }
1135         /* Check the stack is empty */
1136         if (NULL != (sitem = stk_pop(parsestk))) {
1137             do {
1138                 sfree(sitem);
1139                 sitem = stk_pop(parsestk);
1140             } while (sitem);
1141             error(err_missingrbrace, &t.pos);
1142         }
1143         stk_free(parsestk);
1144         addpara(par, ret);
1145     }
1146     dtor(t);
1147     macrocleanup(macros);
1148 }
1149
1150 paragraph *read_input(input *in, indexdata *idx) {
1151     paragraph *head = NULL;
1152     paragraph **hptr = &head;
1153
1154     while (in->currindex < in->nfiles) {
1155         in->currfp = fopen(in->filenames[in->currindex], "r");
1156         if (in->currfp) {
1157             setpos(in, in->filenames[in->currindex]);
1158             read_file(&hptr, in, idx);
1159         }
1160         in->currindex++;
1161     }
1162
1163     return head;
1164 }