Cleanups to complete the man page backend. Also, an additional new
[sgt/halibut] / input.c
1 /*
2 * input.c: read the source form
3 */
4
5 #include <stdio.h>
6 #include <assert.h>
7 #include <time.h>
8 #include "halibut.h"
9
10 #define TAB_STOP 8 /* distance between tab stops, used by get() when tracking column numbers for error positions */
11
12 static void setpos(input *in, char *fname) {
13 in->pos.filename = fname;
14 in->pos.line = 1;
15 in->pos.col = (in->reportcols ? 1 : -1);
16 }
17
18 static void unget(input *in, int c, filepos *pos) {
19 if (in->npushback >= in->pushbacksize) {
20 in->pushbacksize = in->npushback + 16;
21 in->pushback = resize(in->pushback, in->pushbacksize);
22 }
23 in->pushback[in->npushback].chr = c;
24 in->pushback[in->npushback].pos = *pos; /* structure copy */
25 in->npushback++;
26 }
27
28 /* ---------------------------------------------------------------------- */
29 /*
30 * Macro subsystem
31 */
/* A macro definition stored in the macro tree; name and text are both freed by macrocleanup(). */
32 typedef struct macro_Tag macro;
33 struct macro_Tag {
34 wchar_t *name, *text;
35 };
/* One level of in-progress macro expansion; get() reads from the entry at in->stack. */
36 struct macrostack_Tag {
37 macrostack *next; /* the expansion that was active when this one started, or NULL */
38 wchar_t *text; /* expansion text; points at the macro's own text and is not freed with the stack entry */
39 int ptr, npushback; /* ptr: index of next char to read from text; npushback: value of in->npushback when the expansion began */
40 filepos pos; /* position of the macro invocation */
41 };
42 static int macrocmp(void *av, void *bv) {
43 macro *a = (macro *)av, *b = (macro *)bv;
44 return ustrcmp(a->name, b->name);
45 }
46 static void macrodef(tree234 *macros, wchar_t *name, wchar_t *text,
47 filepos fpos) {
48 macro *m = mknew(macro);
49 m->name = name;
50 m->text = text;
51 if (add234(macros, m) != m) {
52 error(err_macroexists, &fpos, name);
53 sfree(name);
54 sfree(text);
55 }
56 }
57 static int macrolookup(tree234 *macros, input *in, wchar_t *name,
58 filepos *pos) {
59 macro m, *gotit;
60 m.name = name;
61 gotit = find234(macros, &m, NULL);
62 if (gotit) {
63 macrostack *expansion = mknew(macrostack);
64 expansion->next = in->stack;
65 expansion->text = gotit->text;
66 expansion->pos = *pos; /* structure copy */
67 expansion->ptr = 0;
68 expansion->npushback = in->npushback;
69 in->stack = expansion;
70 return TRUE;
71 } else
72 return FALSE;
73 }
74 static void macrocleanup(tree234 *macros) {
75 int ti;
76 macro *m;
77 for (ti = 0; (m = (macro *)index234(macros, ti)) != NULL; ti++) {
78 sfree(m->name);
79 sfree(m->text);
80 sfree(m);
81 }
82 freetree234(macros);
83 }
84
85 /*
86 * Can return EOF
87 */
88 static int get(input *in, filepos *pos) {
89 int pushbackpt = in->stack ? in->stack->npushback : 0;
90 if (in->npushback > pushbackpt) {
91 --in->npushback;
92 if (pos)
93 *pos = in->pushback[in->npushback].pos; /* structure copy */
94 return in->pushback[in->npushback].chr;
95 }
96 else if (in->stack) {
97 wchar_t c = in->stack->text[in->stack->ptr];
98 if (in->stack->text[++in->stack->ptr] == L'\0') {
99 macrostack *tmp = in->stack;
100 in->stack = tmp->next;
101 sfree(tmp);
102 }
103 return c;
104 }
105 else if (in->currfp) {
106 int c = getc(in->currfp);
107
108 if (c == EOF) {
109 fclose(in->currfp);
110 in->currfp = NULL;
111 }
112 /* Track line numbers, for error reporting */
113 if (pos)
114 *pos = in->pos;
115 if (in->reportcols) {
116 switch (c) {
117 case '\t':
118 in->pos.col = 1 + (in->pos.col + TAB_STOP-1) % TAB_STOP;
119 break;
120 case '\n':
121 in->pos.col = 1;
122 in->pos.line++;
123 break;
124 default:
125 in->pos.col++;
126 break;
127 }
128 } else {
129 in->pos.col = -1;
130 if (c == '\n')
131 in->pos.line++;
132 }
133 /* FIXME: do input charmap translation. We should be returning
134 * Unicode here. */
135 return c;
136 } else
137 return EOF;
138 }
139
140 /*
141 * Lexical analysis of source files.
142 */
/* A lexical token as returned by get_token() and get_codepar_token(). */
143 typedef struct token_Tag token;
144 struct token_Tag {
145 int type; /* one of the tok_* values */
146 int cmd, aux; /* tok_cmd: cmd is the c_* id chosen by match_kw(), aux its number for \S and \u; tok_word from get_token(): aux is TRUE if the word ended at a hyphen */
147 wchar_t *text; /* dynamically allocated text for commands and words, NULL for other tokens; released with dtor() */
148 filepos pos; /* position of the first character read for this token */
149 };
/* Token types, stored in token.type. */
150 enum {
151 tok_eof, /* end of file */
152 tok_eop, /* end of paragraph: a whitespace run containing more than one newline */
153 tok_white, /* any other run of whitespace */
154 tok_word, /* a word or word fragment */
155 tok_cmd, /* \command */
156 tok_lbrace, /* { */
157 tok_rbrace /* } */
158 };
159
160 /* Halibut command keywords. */
/* Command identifiers, stored in token.cmd by match_kw(). */
161 enum {
162 c__invalid, /* invalid command */
163 c__comment, /* comment command (\#) */
164 c__escaped, /* escaped character */
165 c__nbsp, /* nonbreaking space */
166 c_A, /* appendix heading */
167 c_B, /* bibliography entry */
168 c_BR, /* bibliography rewrite */
169 c_C, /* chapter heading */
170 c_H, /* heading */
171 c_I, /* invisible index mark */
172 c_IM, /* index merge/rewrite */
173 c_K, /* capitalised cross-reference */
174 c_S, /* subsection heading; aux field is the number after the S (1 if there is none) */
175 c_U, /* unnumbered-chapter heading */
176 c_W, /* Web hyperlink */
177 c_b, /* bulleted list */
178 c_c, /* code */
179 c_cfg, /* configuration directive */
180 c_copyright, /* copyright statement */
181 c_cw, /* weak code */
182 c_date, /* document processing date */
183 c_dd, /* description list: description */
184 c_define, /* macro definition */
185 c_dt, /* description list: described thing */
186 c_e, /* emphasis */
187 c_i, /* visible index mark */
188 c_ii, /* uncapitalised visible index mark */
189 c_k, /* uncapitalised cross-reference */
190 c_lcont, /* continuation para(s) for list item */
191 c_n, /* numbered list */
192 c_nocite, /* bibliography trickery */
193 c_preamble, /* document preamble text */
194 c_q, /* quote marks */
195 c_rule, /* horizontal rule */
196 c_title, /* document title */
197 c_u, /* aux field is char code parsed from the hex digits */
198 c_versionid /* document RCS id */
199 };
200
201 /* Perhaps whitespace should be defined in a more Unicode-friendly way? */
/*
 * Character classification and conversion helpers used by the lexer.
 * Each macro may evaluate its argument more than once, so pass only
 * side-effect-free expressions.
 */
202 #define iswhite(c) ( (c)==32 || (c)==9 || (c)==13 || (c)==10 ) /* codes 32, 9, 13, 10: space, tab, CR, LF */
203 #define isnl(c) ( (c)==10 ) /* code 10 (LF) only */
204 #define isdec(c) ( ((c)>='0'&&(c)<='9') ) /* decimal digit */
205 #define fromdec(c) ( (c)-'0' ) /* digit value; only meaningful when isdec(c) holds */
206 #define ishex(c) ( ((c)>='0'&&(c)<='9') || ((c)>='A'&&(c)<='F') || ((c)>='a'&&(c)<='f')) /* hex digit, either case */
207 #define fromhex(c) ( (c)<='9' ? (c)-'0' : ((c)&0xDF) - ('A'-10) ) /* hex digit value; the 0xDF mask clears bit 5 so a-f are treated as A-F; only meaningful when ishex(c) holds */
208 #define iscmd(c) ( ((c)>='0'&&(c)<='9') || ((c)>='A'&&(c)<='Z') || ((c)>='a'&&(c)<='z')) /* characters get_token() accepts in a multi-character command name */
209
/*
 * Keyword comparison function. Like strcmp, but between a wchar_t *
 * and a char *. Returns the difference between the first pair of
 * characters that differ, or 0 if both strings end together.
 */
static int kwcmp(wchar_t const *p, char const *q) {
    for (;;) {
        int diff = *p - *q;

        /*
         * Stop on the first mismatch. If the characters match and
         * are the terminator, both strings have ended.
         */
        if (diff != 0 || *p == 0)
            return diff;
        p++;
        q++;
    }
}
221
222 /*
223 * Match a keyword.
224 */
225 static void match_kw(token *tok) {
226 /*
227 * FIXME. The ids are explicit in here so as to allow long-name
228 * equivalents to the various very short keywords.
229 */
230 static const struct { char const *name; int id; } keywords[] = {
231 {"#", c__comment}, /* comment command (\#) */
232 {"-", c__escaped}, /* nonbreaking hyphen */
233 {"A", c_A}, /* appendix heading */
234 {"B", c_B}, /* bibliography entry */
235 {"BR", c_BR}, /* bibliography rewrite */
236 {"C", c_C}, /* chapter heading */
237 {"H", c_H}, /* heading */
238 {"I", c_I}, /* invisible index mark */
239 {"IM", c_IM}, /* index merge/rewrite */
240 {"K", c_K}, /* capitalised cross-reference */
241 {"U", c_U}, /* unnumbered-chapter heading */
242 {"W", c_W}, /* Web hyperlink */
243 {"\\", c__escaped}, /* escaped backslash (\\) */
244 {"_", c__nbsp}, /* nonbreaking space (\_) */
245 {"b", c_b}, /* bulletted list */
246 {"c", c_c}, /* code */
247 {"cfg", c_cfg}, /* configuration directive */
248 {"copyright", c_copyright}, /* copyright statement */
249 {"cw", c_cw}, /* weak code */
250 {"date", c_date}, /* document processing date */
251 {"dd", c_dd}, /* description list: description */
252 {"define", c_define}, /* macro definition */
253 {"dt", c_dt}, /* description list: described thing */
254 {"e", c_e}, /* emphasis */
255 {"i", c_i}, /* visible index mark */
256 {"ii", c_ii}, /* uncapitalised visible index mark */
257 {"k", c_k}, /* uncapitalised cross-reference */
258 {"lcont", c_lcont}, /* continuation para(s) for list item */
259 {"n", c_n}, /* numbered list */
260 {"nocite", c_nocite}, /* bibliography trickery */
261 {"preamble", c_preamble}, /* document preamble text */
262 {"q", c_q}, /* quote marks */
263 {"rule", c_rule}, /* horizontal rule */
264 {"title", c_title}, /* document title */
265 {"versionid", c_versionid}, /* document RCS id */
266 {"{", c__escaped}, /* escaped lbrace (\{) */
267 {"}", c__escaped}, /* escaped rbrace (\}) */
268 };
269 int i, j, k, c;
270
271 /*
272 * Special cases: \S{0,1,2,...} and \uABCD. If the syntax
273 * doesn't match correctly, we just fall through to the
274 * binary-search phase.
275 */
276 if (tok->text[0] == 'S') {
277 /* We expect numeric characters thereafter. */
278 wchar_t *p = tok->text+1;
279 int n;
280 if (!*p)
281 n = 1;
282 else {
283 n = 0;
284 while (*p && isdec(*p)) {
285 n = 10 * n + fromdec(*p);
286 p++;
287 }
288 }
289 if (!*p) {
290 tok->cmd = c_S;
291 tok->aux = n;
292 return;
293 }
294 } else if (tok->text[0] == 'u') {
295 /* We expect hex characters thereafter. */
296 wchar_t *p = tok->text+1;
297 int n = 0;
298 while (*p && ishex(*p)) {
299 n = 16 * n + fromhex(*p);
300 p++;
301 }
302 if (!*p) {
303 tok->cmd = c_u;
304 tok->aux = n;
305 return;
306 }
307 }
308
309 i = -1;
310 j = sizeof(keywords)/sizeof(*keywords);
311 while (j-i > 1) {
312 k = (i+j)/2;
313 c = kwcmp(tok->text, keywords[k].name);
314 if (c < 0)
315 j = k;
316 else if (c > 0)
317 i = k;
318 else /* c == 0 */ {
319 tok->cmd = keywords[k].id;
320 return;
321 }
322 }
323
324 tok->cmd = c__invalid;
325 }
326
327
328 /*
329 * Read a token from the input file, in the normal way (`normal' in
330 * the sense that code paragraphs work a different way).
331 */
332 token get_token(input *in) {
333 int c;
334 int nls;
335 token ret;
336 rdstring rs = { 0, 0, NULL };
337 filepos cpos;
338
339 ret.text = NULL; /* default */
340 c = get(in, &cpos);
341 ret.pos = cpos;
342 if (iswhite(c)) { /* tok_white or tok_eop */
343 nls = 0;
344 do {
345 if (isnl(c))
346 nls++;
347 } while ((c = get(in, &cpos)) != EOF && iswhite(c));
348 if (c == EOF) {
349 ret.type = tok_eof;
350 return ret;
351 }
352 unget(in, c, &cpos);
353 ret.type = (nls > 1 ? tok_eop : tok_white);
354 return ret;
355 } else if (c == EOF) { /* tok_eof */
356 ret.type = tok_eof;
357 return ret;
358 } else if (c == '\\') { /* tok_cmd */
359 c = get(in, &cpos);
360 if (c == '-' || c == '\\' || c == '_' ||
361 c == '#' || c == '{' || c == '}') {
362 /* single-char command */
363 rdadd(&rs, c);
364 } else if (c == 'u') {
365 int len = 0;
366 do {
367 rdadd(&rs, c);
368 len++;
369 c = get(in, &cpos);
370 } while (ishex(c) && len < 5);
371 unget(in, c, &cpos);
372 } else if (iscmd(c)) {
373 do {
374 rdadd(&rs, c);
375 c = get(in, &cpos);
376 } while (iscmd(c));
377 unget(in, c, &cpos);
378 }
379 /*
380 * Now match the command against the list of available
381 * ones.
382 */
383 ret.type = tok_cmd;
384 ret.text = ustrdup(rs.text);
385 match_kw(&ret);
386 sfree(rs.text);
387 return ret;
388 } else if (c == '{') { /* tok_lbrace */
389 ret.type = tok_lbrace;
390 return ret;
391 } else if (c == '}') { /* tok_rbrace */
392 ret.type = tok_rbrace;
393 return ret;
394 } else { /* tok_word */
395 /*
396 * Read a word: the longest possible contiguous sequence of
397 * things other than whitespace, backslash, braces and
398 * hyphen. A hyphen terminates the word but is returned as
399 * part of it; everything else is pushed back for the next
400 * token. The `aux' field contains TRUE if the word ends in
401 * a hyphen.
402 */
403 ret.aux = FALSE; /* assumed for now */
404 while (1) {
405 if (iswhite(c) || c=='{' || c=='}' || c=='\\' || c==EOF) {
406 /* Put back the character that caused termination */
407 unget(in, c, &cpos);
408 break;
409 } else {
410 rdadd(&rs, c);
411 if (c == '-') {
412 ret.aux = TRUE;
413 break; /* hyphen terminates word */
414 }
415 }
416 c = get(in, &cpos);
417 }
418 ret.type = tok_word;
419 ret.text = ustrdup(rs.text);
420 sfree(rs.text);
421 return ret;
422 }
423 }
424
425 /*
426 * Determine whether the next input character is an open brace (for
427 * telling code paragraphs from paragraphs which merely start with
428 * code).
429 */
430 int isbrace(input *in) {
431 int c;
432 filepos cpos;
433
434 c = get(in, &cpos);
435 unget(in, c, &cpos);
436 return (c == '{');
437 }
438
439 /*
440 * Read the rest of a line that starts `\c'. Including nothing at
441 * all (tok_word with empty text).
442 */
443 token get_codepar_token(input *in) {
444 int c;
445 token ret;
446 rdstring rs = { 0, 0, NULL };
447 filepos cpos;
448
449 ret.type = tok_word;
450 c = get(in, &cpos); /* expect (and discard) one space */
451 ret.pos = cpos;
452 if (c == ' ') {
453 c = get(in, &cpos);
454 ret.pos = cpos;
455 }
456 while (!isnl(c) && c != EOF) {
457 int c2 = c;
458 c = get(in, &cpos);
459 /* Discard \r just before \n. */
460 if (c2 != 13 || !isnl(c))
461 rdadd(&rs, c2);
462 }
463 unget(in, c, &cpos);
464 ret.text = ustrdup(rs.text);
465 sfree(rs.text);
466 return ret;
467 }
468
469 /*
470 * Adds a new word to a linked list
471 */
472 static word *addword(word newword, word ***hptrptr) {
473 word *mnewword;
474 if (!hptrptr)
475 return NULL;
476 mnewword = mknew(word);
477 *mnewword = newword; /* structure copy */
478 mnewword->next = NULL;
479 **hptrptr = mnewword;
480 *hptrptr = &mnewword->next;
481 return mnewword;
482 }
483
484 /*
485 * Adds a new paragraph to a linked list
486 */
487 static paragraph *addpara(paragraph newpara, paragraph ***hptrptr) {
488 paragraph *mnewpara = mknew(paragraph);
489 *mnewpara = newpara; /* structure copy */
490 mnewpara->next = NULL;
491 **hptrptr = mnewpara;
492 *hptrptr = &mnewpara->next;
493 return mnewpara;
494 }
495
/*
 * Destructor before token is reassigned; should catch most memory
 * leaks. Frees the token's text. The argument is parenthesised so
 * that any expression yielding a token can be passed safely.
 */
#define dtor(t) ( sfree((t).text) )
501
502 /*
503 * Reads a single file (ie until get() returns EOF)
504 */
505 static void read_file(paragraph ***ret, input *in, indexdata *idx) {
506 token t;
507 paragraph par;
508 word wd, **whptr, **idximplicit;
509 tree234 *macros;
510 wchar_t utext[2], *wdtext;
511 int style, spcstyle;
512 int already;
513 int iswhite, seenwhite;
514 int type;
515 int prev_para_type;
516 struct stack_item {
517 enum {
518 stack_nop = 0, /* do nothing (for error recovery) */
519 stack_ualt = 1, /* \u alternative */
520 stack_style = 2, /* \e, \c, \cw */
521 stack_idx = 4, /* \I, \i, \ii */
522 stack_hyper = 8, /* \W */
523 stack_quote = 16, /* \q */
524 } type;
525 word **whptr; /* to restore from \u alternatives */
526 word **idximplicit; /* to restore from \u alternatives */
527 } *sitem;
528 stack parsestk;
529 struct crossparaitem {
530 int type; /* currently c_lcont or -1 */
531 int seen_lcont;
532 };
533 stack crossparastk;
534 word *indexword, *uword, *iword;
535 word *idxwordlist;
536 rdstring indexstr;
537 int index_downcase, index_visible, indexing;
538 const rdstring nullrs = { 0, 0, NULL };
539 wchar_t uchr;
540
541 t.text = NULL;
542 macros = newtree234(macrocmp);
543 already = FALSE;
544
545 crossparastk = stk_new();
546
547 /*
548 * Loop on each paragraph.
549 */
550 while (1) {
551 int start_cmd = c__invalid;
552 par.words = NULL;
553 par.keyword = NULL;
554 whptr = &par.words;
555
556 /*
557 * Get a token.
558 */
559 do {
560 if (!already) {
561 dtor(t), t = get_token(in);
562 }
563 already = FALSE;
564 } while (t.type == tok_eop);
565 if (t.type == tok_eof)
566 break;
567
568 /*
569 * Parse code paragraphs separately.
570 */
571 if (t.type == tok_cmd && t.cmd == c_c && !isbrace(in)) {
572 int wtype = word_WeakCode;
573
574 par.type = para_Code;
575 par.fpos = t.pos;
576 while (1) {
577 dtor(t), t = get_codepar_token(in);
578 wd.type = wtype;
579 wd.breaks = FALSE; /* shouldn't need this... */
580 wd.text = ustrdup(t.text);
581 wd.alt = NULL;
582 wd.fpos = t.pos;
583 addword(wd, &whptr);
584 dtor(t), t = get_token(in);
585 if (t.type == tok_white) {
586 /*
587 * The newline after a code-paragraph line
588 */
589 dtor(t), t = get_token(in);
590 }
591 if (t.type == tok_eop || t.type == tok_eof)
592 break;
593 else if (t.type == tok_cmd && t.cmd == c_c)
594 wtype = word_WeakCode;
595 else if (t.type == tok_cmd && t.cmd == c_e &&
596 wtype == word_WeakCode)
597 wtype = word_Emph;
598 else {
599 error(err_brokencodepara, &t.pos);
600 prev_para_type = par.type;
601 addpara(par, ret);
602 while (t.type != tok_eop) /* error recovery: */
603 dtor(t), t = get_token(in); /* eat rest of paragraph */
604 goto codeparabroken; /* ick, but such is life */
605 }
606 }
607 prev_para_type = par.type;
608 addpara(par, ret);
609 codeparabroken:
610 continue;
611 }
612
613 /*
614 * Spot the special commands that define a grouping of more
615 * than one paragraph, and also the closing braces that
616 * finish them.
617 */
618 if (t.type == tok_cmd &&
619 t.cmd == c_lcont) {
620 struct crossparaitem *sitem, *stop;
621
622 /*
623 * Expect, and swallow, an open brace.
624 */
625 dtor(t), t = get_token(in);
626 if (t.type != tok_lbrace) {
627 error(err_explbr, &t.pos);
628 continue;
629 }
630
631 /*
632 * \lcont causes a continuation of a list item into
633 * multiple paragraphs (which may in turn contain
634 * nested lists, code paras etc). Hence, the previous
635 * paragraph must be of a list type.
636 */
637 sitem = mknew(struct crossparaitem);
638 stop = (struct crossparaitem *)stk_top(crossparastk);
639 if (prev_para_type == para_Bullet ||
640 prev_para_type == para_NumberedList ||
641 prev_para_type == para_Description) {
642 sitem->type = c_lcont;
643 sitem->seen_lcont = 1;
644 par.type = para_LcontPush;
645 prev_para_type = par.type;
646 addpara(par, ret);
647 } else {
648 /*
649 * Push a null item on the cross-para stack so that
650 * when we see the corresponding closing brace we
651 * don't give a cascade error.
652 */
653 sitem->type = -1;
654 sitem->seen_lcont = (stop ? stop->seen_lcont : 0);
655 error(err_misplacedlcont, &t.pos);
656 }
657 stk_push(crossparastk, sitem);
658 continue;
659 } else if (t.type == tok_rbrace) {
660 struct crossparaitem *sitem = stk_pop(crossparastk);
661 if (!sitem)
662 error(err_unexbrace, &t.pos);
663 else {
664 switch (sitem->type) {
665 case c_lcont:
666 par.type = para_LcontPop;
667 prev_para_type = par.type;
668 addpara(par, ret);
669 break;
670 }
671 sfree(sitem);
672 }
673 continue;
674 }
675
676 /*
677 * This token begins a paragraph. See if it's one of the
678 * special commands that define a paragraph type.
679 *
680 * (note that \# is special in a way, and \nocite takes no
681 * text)
682 */
683 par.type = para_Normal;
684 if (t.type == tok_cmd) {
685 int needkw;
686 int is_macro = FALSE;
687
688 par.fpos = t.pos;
689 switch (t.cmd) {
690 default:
691 needkw = -1;
692 break;
693 case c__invalid:
694 error(err_badparatype, t.text, &t.pos);
695 needkw = 4;
696 break;
697 case c__comment:
698 if (isbrace(in))
699 break; /* `\#{': isn't a comment para */
700 do {
701 dtor(t), t = get_token(in);
702 } while (t.type != tok_eop && t.type != tok_eof);
703 continue; /* next paragraph */
704 /*
705 * `needkw' values:
706 *
707 * 1 -- exactly one keyword
708 * 2 -- at least one keyword
709 * 4 -- any number of keywords including zero
710 * 8 -- at least one keyword and then nothing else
711 * 16 -- nothing at all! no keywords, no body
712 * 32 -- no keywords at all
713 */
714 case c_A: needkw = 2; par.type = para_Appendix; break;
715 case c_B: needkw = 2; par.type = para_Biblio; break;
716 case c_BR: needkw = 1; par.type = para_BR;
717 start_cmd = c_BR; break;
718 case c_C: needkw = 2; par.type = para_Chapter; break;
719 case c_H: needkw = 2; par.type = para_Heading;
720 par.aux = 0;
721 break;
722 case c_IM: needkw = 2; par.type = para_IM;
723 start_cmd = c_IM; break;
724 case c_S: needkw = 2; par.type = para_Subsect;
725 par.aux = t.aux; break;
726 case c_U: needkw = 32; par.type = para_UnnumberedChapter; break;
727 /* For \b and \n the keyword is optional */
728 case c_b: needkw = 4; par.type = para_Bullet; break;
729 case c_dt: needkw = 4; par.type = para_DescribedThing; break;
730 case c_dd: needkw = 4; par.type = para_Description; break;
731 case c_n: needkw = 4; par.type = para_NumberedList; break;
732 case c_cfg: needkw = 8; par.type = para_Config;
733 start_cmd = c_cfg; break;
734 case c_copyright: needkw = 32; par.type = para_Copyright; break;
735 case c_define: is_macro = TRUE; needkw = 1; break;
736 /* For \nocite the keyword is _everything_ */
737 case c_nocite: needkw = 8; par.type = para_NoCite; break;
738 case c_preamble: needkw = 32; par.type = para_Preamble; break;
739 case c_rule: needkw = 16; par.type = para_Rule; break;
740 case c_title: needkw = 32; par.type = para_Title; break;
741 case c_versionid: needkw = 32; par.type = para_VersionID; break;
742 }
743
744 if (par.type == para_Chapter ||
745 par.type == para_Heading ||
746 par.type == para_Subsect ||
747 par.type == para_Appendix ||
748 par.type == para_UnnumberedChapter) {
749 struct crossparaitem *sitem = stk_top(crossparastk);
750 if (sitem && sitem->seen_lcont) {
751 error(err_sectmarkerinlcont, &t.pos);
752 }
753 }
754
755 if (needkw > 0) {
756 rdstring rs = { 0, 0, NULL };
757 int nkeys = 0;
758 filepos fp;
759
760 /* Get keywords. */
761 dtor(t), t = get_token(in);
762 fp = t.pos;
763 while (t.type == tok_lbrace) {
764 /* This is a keyword. */
765 nkeys++;
766 /* FIXME: there will be bugs if anyone specifies an
767 * empty keyword (\foo{}), so trap this case. */
768 while (dtor(t), t = get_token(in),
769 t.type == tok_word ||
770 t.type == tok_white ||
771 (t.type == tok_cmd && t.cmd == c__nbsp) ||
772 (t.type == tok_cmd && t.cmd == c__escaped)) {
773 if (t.type == tok_white ||
774 (t.type == tok_cmd && t.cmd == c__nbsp))
775 rdadd(&rs, ' ');
776 else
777 rdadds(&rs, t.text);
778 }
779 if (t.type != tok_rbrace) {
780 error(err_kwunclosed, &t.pos);
781 continue;
782 }
783 rdadd(&rs, 0); /* add string terminator */
784 dtor(t), t = get_token(in); /* eat right brace */
785 }
786
787 rdadd(&rs, 0); /* add string terminator */
788
789 /* See whether we have the right number of keywords. */
790 if ((needkw & 48) && nkeys > 0)
791 error(err_kwillegal, &fp);
792 if ((needkw & 11) && nkeys == 0)
793 error(err_kwexpected, &fp);
794 if ((needkw & 5) && nkeys > 1)
795 error(err_kwtoomany, &fp);
796
797 if (is_macro) {
798 /*
799 * Macro definition. Get the rest of the line
800 * as a code-paragraph token, repeatedly until
801 * there's nothing more left of it. Separate
802 * with newlines.
803 */
804 rdstring macrotext = { 0, 0, NULL };
805 while (1) {
806 dtor(t), t = get_codepar_token(in);
807 if (macrotext.pos > 0)
808 rdadd(&macrotext, L'\n');
809 rdadds(&macrotext, t.text);
810 dtor(t), t = get_token(in);
811 if (t.type == tok_eop) break;
812 }
813 macrodef(macros, rs.text, macrotext.text, fp);
814 continue; /* next paragraph */
815 }
816
817 par.keyword = rdtrim(&rs);
818
819 /* Move to EOP in case of needkw==8 or 16 (no body) */
820 if (needkw & 24) {
821 /* We allow whitespace even when we expect no para body */
822 while (t.type == tok_white)
823 dtor(t), t = get_token(in);
824 if (t.type != tok_eop && t.type != tok_eof &&
825 (start_cmd == c__invalid ||
826 t.type != tok_cmd || t.cmd != start_cmd)) {
827 error(err_bodyillegal, &t.pos);
828 /* Error recovery: eat the rest of the paragraph */
829 while (t.type != tok_eop && t.type != tok_eof &&
830 (start_cmd == c__invalid ||
831 t.type != tok_cmd || t.cmd != start_cmd))
832 dtor(t), t = get_token(in);
833 }
834 if (t.type == tok_cmd)
835 already = TRUE;/* inhibit get_token at top of loop */
836 prev_para_type = par.type;
837 addpara(par, ret);
838 continue; /* next paragraph */
839 }
840 }
841 }
842
843 /*
844 * Now read the actual paragraph, word by word, adding to
845 * the paragraph list.
846 *
847 * Mid-paragraph commands:
848 *
849 * \K \k
850 * \c \cw
851 * \e
852 * \i \ii
853 * \I
854 * \u
855 * \W
856 * \date
857 * \\ \{ \}
858 */
859 parsestk = stk_new();
860 style = word_Normal;
861 spcstyle = word_WhiteSpace;
862 indexing = FALSE;
863 seenwhite = TRUE;
864 while (t.type != tok_eop && t.type != tok_eof) {
865 iswhite = FALSE;
866 already = FALSE;
867
868 /* Handle implicit paragraph breaks after \IM, \BR etc */
869 if (start_cmd != c__invalid &&
870 t.type == tok_cmd && t.cmd == start_cmd) {
871 already = TRUE; /* inhibit get_token at top of loop */
872 break;
873 }
874
875 if (t.type == tok_cmd && t.cmd == c__escaped) {
876 t.type = tok_word; /* nice and simple */
877 t.aux = 0; /* even if `\-' - nonbreaking! */
878 }
879 if (t.type == tok_cmd && t.cmd == c__nbsp) {
880 t.type = tok_word; /* nice and simple */
881 sfree(t.text);
882 t.text = ustrdup(L" "); /* text is ` ' not `_' */
883 t.aux = 0; /* (nonbreaking) */
884 }
885 switch (t.type) {
886 case tok_white:
887 if (whptr == &par.words)
888 break; /* strip whitespace at start of para */
889 wd.text = NULL;
890 wd.type = spcstyle;
891 wd.alt = NULL;
892 wd.aux = 0;
893 wd.fpos = t.pos;
894 wd.breaks = FALSE;
895
896 /*
897 * Inhibit use of whitespace if it's (probably the
898 * newline) before a repeat \IM / \BR type
899 * directive.
900 */
901 if (start_cmd != c__invalid) {
902 dtor(t), t = get_token(in);
903 already = TRUE;
904 if (t.type == tok_cmd && t.cmd == start_cmd)
905 break;
906 }
907
908 if (indexing)
909 rdadd(&indexstr, ' ');
910 if (!indexing || index_visible)
911 addword(wd, &whptr);
912 if (indexing)
913 addword(wd, &idximplicit);
914 iswhite = TRUE;
915 break;
916 case tok_word:
917 if (indexing)
918 rdadds(&indexstr, t.text);
919 wd.type = style;
920 wd.alt = NULL;
921 wd.aux = 0;
922 wd.fpos = t.pos;
923 wd.breaks = t.aux;
924 if (!indexing || index_visible) {
925 wd.text = ustrdup(t.text);
926 addword(wd, &whptr);
927 }
928 if (indexing) {
929 wd.text = ustrdup(t.text);
930 addword(wd, &idximplicit);
931 }
932 break;
933 case tok_lbrace:
934 error(err_unexbrace, &t.pos);
935 /* Error recovery: push nop */
936 sitem = mknew(struct stack_item);
937 sitem->type = stack_nop;
938 stk_push(parsestk, sitem);
939 break;
940 case tok_rbrace:
941 sitem = stk_pop(parsestk);
942 if (!sitem) {
943 /*
944 * This closing brace could have been an
945 * indication that the cross-paragraph stack
946 * wants popping. Accordingly, we treat it here
947 * as an indication that the paragraph is over.
948 */
949 already = TRUE;
950 goto finished_para;
951 } else {
952 if (sitem->type & stack_ualt) {
953 whptr = sitem->whptr;
954 idximplicit = sitem->idximplicit;
955 }
956 if (sitem->type & stack_style) {
957 style = word_Normal;
958 spcstyle = word_WhiteSpace;
959 }
960 if (sitem->type & stack_idx) {
961 indexword->text = ustrdup(indexstr.text);
962 if (index_downcase)
963 ustrlow(indexword->text);
964 indexing = FALSE;
965 rdadd(&indexstr, L'\0');
966 index_merge(idx, FALSE, indexstr.text, idxwordlist);
967 sfree(indexstr.text);
968 }
969 if (sitem->type & stack_hyper) {
970 wd.text = NULL;
971 wd.type = word_HyperEnd;
972 wd.alt = NULL;
973 wd.aux = 0;
974 wd.fpos = t.pos;
975 wd.breaks = FALSE;
976 if (!indexing || index_visible)
977 addword(wd, &whptr);
978 if (indexing)
979 addword(wd, &idximplicit);
980 }
981 if (sitem->type & stack_quote) {
982 wd.text = NULL;
983 wd.type = toquotestyle(style);
984 wd.alt = NULL;
985 wd.aux = quote_Close;
986 wd.fpos = t.pos;
987 wd.breaks = FALSE;
988 if (!indexing || index_visible)
989 addword(wd, &whptr);
990 if (indexing) {
991 rdadd(&indexstr, L'"');
992 addword(wd, &idximplicit);
993 }
994 }
995 }
996 sfree(sitem);
997 break;
998 case tok_cmd:
999 switch (t.cmd) {
1000 case c__comment:
1001 /*
1002 * In-paragraph comment: \#{ balanced braces }
1003 *
1004 * Anything goes here; even tok_eop. We should
1005 * eat whitespace after the close brace _if_
1006 * there was whitespace before the \#.
1007 */
1008 dtor(t), t = get_token(in);
1009 if (t.type != tok_lbrace) {
1010 error(err_explbr, &t.pos);
1011 } else {
1012 int braces = 1;
1013 while (braces > 0) {
1014 dtor(t), t = get_token(in);
1015 if (t.type == tok_lbrace)
1016 braces++;
1017 else if (t.type == tok_rbrace)
1018 braces--;
1019 else if (t.type == tok_eof) {
1020 error(err_commenteof, &t.pos);
1021 break;
1022 }
1023 }
1024 }
1025 if (seenwhite) {
1026 already = TRUE;
1027 dtor(t), t = get_token(in);
1028 if (t.type == tok_white) {
1029 iswhite = TRUE;
1030 already = FALSE;
1031 }
1032 }
1033 break;
1034 case c_q:
1035 dtor(t), t = get_token(in);
1036 if (t.type != tok_lbrace) {
1037 error(err_explbr, &t.pos);
1038 } else {
1039 wd.text = NULL;
1040 wd.type = toquotestyle(style);
1041 wd.alt = NULL;
1042 wd.aux = quote_Open;
1043 wd.fpos = t.pos;
1044 wd.breaks = FALSE;
1045 if (!indexing || index_visible)
1046 addword(wd, &whptr);
1047 if (indexing) {
1048 rdadd(&indexstr, L'"');
1049 addword(wd, &idximplicit);
1050 }
1051 sitem = mknew(struct stack_item);
1052 sitem->type = stack_quote;
1053 stk_push(parsestk, sitem);
1054 }
1055 break;
1056 case c_K:
1057 case c_k:
1058 case c_W:
1059 case c_date:
1060 /*
1061 * Keyword, hyperlink, or \date. We expect a
1062 * left brace, some text, and then a right
1063 * brace. No nesting; no arguments.
1064 */
1065 wd.fpos = t.pos;
1066 wd.breaks = FALSE;
1067 if (t.cmd == c_K)
1068 wd.type = word_UpperXref;
1069 else if (t.cmd == c_k)
1070 wd.type = word_LowerXref;
1071 else if (t.cmd == c_W)
1072 wd.type = word_HyperLink;
1073 else
1074 wd.type = word_Normal;
1075 dtor(t), t = get_token(in);
1076 if (t.type != tok_lbrace) {
1077 if (wd.type == word_Normal) {
1078 time_t thetime = time(NULL);
1079 struct tm *broken = localtime(&thetime);
1080 already = TRUE;
1081 wdtext = ustrftime(NULL, broken);
1082 wd.type = style;
1083 } else {
1084 error(err_explbr, &t.pos);
1085 wdtext = NULL;
1086 }
1087 } else {
1088 rdstring rs = { 0, 0, NULL };
1089 while (dtor(t), t = get_token(in),
1090 t.type == tok_word || t.type == tok_white) {
1091 if (t.type == tok_white)
1092 rdadd(&rs, ' ');
1093 else
1094 rdadds(&rs, t.text);
1095 }
1096 if (wd.type == word_Normal) {
1097 time_t thetime = time(NULL);
1098 struct tm *broken = localtime(&thetime);
1099 wdtext = ustrftime(rs.text, broken);
1100 wd.type = style;
1101 } else {
1102 wdtext = ustrdup(rs.text);
1103 }
1104 sfree(rs.text);
1105 if (t.type != tok_rbrace) {
1106 error(err_kwexprbr, &t.pos);
1107 }
1108 }
1109 wd.alt = NULL;
1110 wd.aux = 0;
1111 if (!indexing || index_visible) {
1112 wd.text = ustrdup(wdtext);
1113 addword(wd, &whptr);
1114 }
1115 if (indexing) {
1116 wd.text = ustrdup(wdtext);
1117 addword(wd, &idximplicit);
1118 }
1119 sfree(wdtext);
1120 if (wd.type == word_HyperLink) {
1121 /*
1122 * Hyperlinks are different: they then
1123 * expect another left brace, to begin
1124 * delimiting the text marked by the link.
1125 */
1126 dtor(t), t = get_token(in);
1127 /*
1128 * Special cases: \W{}\c, \W{}\e, \W{}\cw
1129 */
1130 sitem = mknew(struct stack_item);
1131 sitem->type = stack_hyper;
1132 if (t.type == tok_cmd &&
1133 (t.cmd == c_e || t.cmd == c_c || t.cmd == c_cw)) {
1134 if (style != word_Normal)
1135 error(err_nestedstyles, &t.pos);
1136 else {
1137 style = (t.cmd == c_c ? word_Code :
1138 t.cmd == c_cw ? word_WeakCode :
1139 word_Emph);
1140 spcstyle = tospacestyle(style);
1141 sitem->type |= stack_style;
1142 }
1143 dtor(t), t = get_token(in);
1144 }
1145 if (t.type != tok_lbrace) {
1146 error(err_explbr, &t.pos);
1147 sfree(sitem);
1148 } else {
1149 stk_push(parsestk, sitem);
1150 }
1151 }
1152 break;
1153 case c_c:
1154 case c_cw:
1155 case c_e:
1156 type = t.cmd;
1157 if (style != word_Normal) {
1158 error(err_nestedstyles, &t.pos);
1159 /* Error recovery: eat lbrace, push nop. */
1160 dtor(t), t = get_token(in);
1161 sitem = mknew(struct stack_item);
1162 sitem->type = stack_nop;
1163 stk_push(parsestk, sitem);
1164 }
1165 dtor(t), t = get_token(in);
1166 if (t.type != tok_lbrace) {
1167 error(err_explbr, &t.pos);
1168 } else {
1169 style = (type == c_c ? word_Code :
1170 type == c_cw ? word_WeakCode :
1171 word_Emph);
1172 spcstyle = tospacestyle(style);
1173 sitem = mknew(struct stack_item);
1174 sitem->type = stack_style;
1175 stk_push(parsestk, sitem);
1176 }
1177 break;
1178 case c_i:
1179 case c_ii:
1180 case c_I:
1181 type = t.cmd;
1182 if (indexing) {
1183 error(err_nestedindex, &t.pos);
1184 /* Error recovery: eat lbrace, push nop. */
1185 dtor(t), t = get_token(in);
1186 sitem = mknew(struct stack_item);
1187 sitem->type = stack_nop;
1188 stk_push(parsestk, sitem);
1189 }
1190 sitem = mknew(struct stack_item);
1191 sitem->type = stack_idx;
1192 dtor(t), t = get_token(in);
1193 /*
1194 * Special cases: \i\c, \i\e, \i\cw
1195 */
1196 wd.fpos = t.pos;
1197 if (t.type == tok_cmd &&
1198 (t.cmd == c_e || t.cmd == c_c || t.cmd == c_cw)) {
1199 if (style != word_Normal)
1200 error(err_nestedstyles, &t.pos);
1201 else {
1202 style = (t.cmd == c_c ? word_Code :
1203 t.cmd == c_cw ? word_WeakCode :
1204 word_Emph);
1205 spcstyle = tospacestyle(style);
1206 sitem->type |= stack_style;
1207 }
1208 dtor(t), t = get_token(in);
1209 }
1210 if (t.type != tok_lbrace) {
1211 sfree(sitem);
1212 error(err_explbr, &t.pos);
1213 } else {
1214 /* Add an index-reference word with no text as yet */
1215 wd.type = word_IndexRef;
1216 wd.text = NULL;
1217 wd.alt = NULL;
1218 wd.aux = 0;
1219 wd.breaks = FALSE;
1220 indexword = addword(wd, &whptr);
1221 /* Set up a rdstring to read the index text */
1222 indexstr = nullrs;
1223 /* Flags so that we do the Right Things with text */
1224 index_visible = (type != c_I);
1225 index_downcase = (type == c_ii);
1226 indexing = TRUE;
1227 idxwordlist = NULL;
1228 idximplicit = &idxwordlist;
1229 /* Stack item to close the indexing on exit */
1230 stk_push(parsestk, sitem);
1231 }
1232 break;
1233 case c_u:
1234 uchr = t.aux;
1235 utext[0] = uchr; utext[1] = 0;
1236 wd.type = style;
1237 wd.breaks = FALSE;
1238 wd.alt = NULL;
1239 wd.aux = 0;
1240 wd.fpos = t.pos;
1241 if (!indexing || index_visible) {
1242 wd.text = ustrdup(utext);
1243 uword = addword(wd, &whptr);
1244 } else
1245 uword = NULL;
1246 if (indexing) {
1247 wd.text = ustrdup(utext);
1248 iword = addword(wd, &idximplicit);
1249 } else
1250 iword = NULL;
1251 dtor(t), t = get_token(in);
1252 if (t.type == tok_lbrace) {
1253 /*
1254 * \u with a left brace. Until the brace
1255 * closes, all further words go on a
1256 * sidetrack from the main thread of the
1257 * paragraph.
1258 */
1259 sitem = mknew(struct stack_item);
1260 sitem->type = stack_ualt;
1261 sitem->whptr = whptr;
1262 sitem->idximplicit = idximplicit;
1263 stk_push(parsestk, sitem);
1264 whptr = uword ? &uword->alt : NULL;
1265 idximplicit = iword ? &iword->alt : NULL;
1266 } else {
1267 if (indexing)
1268 rdadd(&indexstr, uchr);
1269 already = TRUE;
1270 }
1271 break;
1272 default:
1273 if (!macrolookup(macros, in, t.text, &t.pos))
1274 error(err_badmidcmd, t.text, &t.pos);
1275 break;
1276 }
1277 }
1278 if (!already)
1279 dtor(t), t = get_token(in);
1280 seenwhite = iswhite;
1281 }
1282 finished_para:
1283 /* Check the stack is empty */
1284 if (stk_top(parsestk)) {
1285 while ((sitem = stk_pop(parsestk)))
1286 sfree(sitem);
1287 error(err_missingrbrace, &t.pos);
1288 }
1289 stk_free(parsestk);
1290 prev_para_type = par.type;
1291 addpara(par, ret);
1292 if (t.type == tok_eof)
1293 already = TRUE;
1294 }
1295
1296 if (stk_top(crossparastk)) {
1297 void *p;
1298
1299 error(err_missingrbrace2, &t.pos);
1300 while ((p = stk_pop(crossparastk)))
1301 sfree(p);
1302 }
1303
1304 /*
1305 * We break to here rather than returning, because otherwise
1306 * this cleanup doesn't happen.
1307 */
1308 dtor(t);
1309 macrocleanup(macros);
1310
1311 stk_free(crossparastk);
1312 }
1313
1314 paragraph *read_input(input *in, indexdata *idx) {
1315 paragraph *head = NULL;
1316 paragraph **hptr = &head;
1317
1318 while (in->currindex < in->nfiles) {
1319 in->currfp = fopen(in->filenames[in->currindex], "r");
1320 if (in->currfp) {
1321 setpos(in, in->filenames[in->currindex]);
1322 read_file(&hptr, in, idx);
1323 }
1324 in->currindex++;
1325 }
1326
1327 return head;
1328 }