Obsoleted the `\preamble' command. Preamble text is now taken to be
[sgt/halibut] / input.c
1 /*
2 * input.c: read the source form
3 */
4
#include <stdio.h>
#include <assert.h>
#include <limits.h>
#include <time.h>
#include "halibut.h"
9
/* Distance between tab stops, in columns; used for column number tracking. */
#define TAB_STOP 8
11
12 static void setpos(input *in, char *fname) {
13 in->pos.filename = fname;
14 in->pos.line = 1;
15 in->pos.col = (in->reportcols ? 1 : -1);
16 }
17
18 static void unget(input *in, int c, filepos *pos) {
19 if (in->npushback >= in->pushbacksize) {
20 in->pushbacksize = in->npushback + 16;
21 in->pushback = resize(in->pushback, in->pushbacksize);
22 }
23 in->pushback[in->npushback].chr = c;
24 in->pushback[in->npushback].pos = *pos; /* structure copy */
25 in->npushback++;
26 }
27
28 /* ---------------------------------------------------------------------- */
29 /*
30 * Macro subsystem
31 */
/*
 * One macro definition, as stored in the macros tree. Both strings are
 * released by macrocleanup().
 */
typedef struct macro_Tag macro;
struct macro_Tag {
    wchar_t *name;                     /* macro name; the tree's sort key */
    wchar_t *text;                     /* text the macro expands to */
};
36 struct macrostack_Tag {
37 macrostack *next;
38 wchar_t *text;
39 int ptr, npushback;
40 filepos pos;
41 };
42 static int macrocmp(void *av, void *bv) {
43 macro *a = (macro *)av, *b = (macro *)bv;
44 return ustrcmp(a->name, b->name);
45 }
46 static void macrodef(tree234 *macros, wchar_t *name, wchar_t *text,
47 filepos fpos) {
48 macro *m = mknew(macro);
49 m->name = name;
50 m->text = text;
51 if (add234(macros, m) != m) {
52 error(err_macroexists, &fpos, name);
53 sfree(name);
54 sfree(text);
55 }
56 }
57 static int macrolookup(tree234 *macros, input *in, wchar_t *name,
58 filepos *pos) {
59 macro m, *gotit;
60 m.name = name;
61 gotit = find234(macros, &m, NULL);
62 if (gotit) {
63 macrostack *expansion = mknew(macrostack);
64 expansion->next = in->stack;
65 expansion->text = gotit->text;
66 expansion->pos = *pos; /* structure copy */
67 expansion->ptr = 0;
68 expansion->npushback = in->npushback;
69 in->stack = expansion;
70 return TRUE;
71 } else
72 return FALSE;
73 }
74 static void macrocleanup(tree234 *macros) {
75 int ti;
76 macro *m;
77 for (ti = 0; (m = (macro *)index234(macros, ti)) != NULL; ti++) {
78 sfree(m->name);
79 sfree(m->text);
80 sfree(m);
81 }
82 freetree234(macros);
83 }
84
85 /*
86 * Can return EOF
87 */
88 static int get(input *in, filepos *pos) {
89 int pushbackpt = in->stack ? in->stack->npushback : 0;
90 if (in->npushback > pushbackpt) {
91 --in->npushback;
92 if (pos)
93 *pos = in->pushback[in->npushback].pos; /* structure copy */
94 return in->pushback[in->npushback].chr;
95 }
96 else if (in->stack) {
97 wchar_t c = in->stack->text[in->stack->ptr];
98 if (in->stack->text[++in->stack->ptr] == L'\0') {
99 macrostack *tmp = in->stack;
100 in->stack = tmp->next;
101 sfree(tmp);
102 }
103 return c;
104 }
105 else if (in->currfp) {
106 int c = getc(in->currfp);
107
108 if (c == EOF) {
109 fclose(in->currfp);
110 in->currfp = NULL;
111 }
112 /* Track line numbers, for error reporting */
113 if (pos)
114 *pos = in->pos;
115 if (in->reportcols) {
116 switch (c) {
117 case '\t':
118 in->pos.col = 1 + (in->pos.col + TAB_STOP-1) % TAB_STOP;
119 break;
120 case '\n':
121 in->pos.col = 1;
122 in->pos.line++;
123 break;
124 default:
125 in->pos.col++;
126 break;
127 }
128 } else {
129 in->pos.col = -1;
130 if (c == '\n')
131 in->pos.line++;
132 }
133 /* FIXME: do input charmap translation. We should be returning
134 * Unicode here. */
135 return c;
136 } else
137 return EOF;
138 }
139
140 /*
141 * Lexical analysis of source files.
142 */
143 typedef struct token_Tag token;
144 struct token_Tag {
145 int type;
146 int cmd, aux;
147 wchar_t *text;
148 filepos pos;
149 };
/* Token types (the `type' field of a token). */
enum {
    tok_eof = 0,                       /* end of file */
    tok_eop,                           /* end of paragraph */
    tok_white,                         /* whitespace */
    tok_word,                          /* a word or word fragment */
    tok_cmd,                           /* \command */
    tok_lbrace,                        /* { */
    tok_rbrace                         /* } */
};
159
/*
 * Halibut command keywords (the `cmd' field of a tok_cmd token, as
 * assigned by match_kw()).
 */
enum {
    c__invalid = 0,                    /* invalid command */
    c__comment,                        /* comment command (\#) */
    c__escaped,                        /* escaped character */
    c__nbsp,                           /* nonbreaking space */
    c_A,                               /* appendix heading */
    c_B,                               /* bibliography entry */
    c_BR,                              /* bibliography rewrite */
    c_C,                               /* chapter heading */
    c_H,                               /* heading */
    c_I,                               /* invisible index mark */
    c_IM,                              /* index merge/rewrite */
    c_K,                               /* capitalised cross-reference */
    c_S,                               /* \S, \S0, \S1, ...: aux field is 0, 1, 2, ... */
    c_U,                               /* unnumbered-chapter heading */
    c_W,                               /* Web hyperlink */
    c_b,                               /* bulleted list */
    c_c,                               /* code */
    c_cfg,                             /* configuration directive */
    c_copyright,                       /* copyright statement */
    c_cw,                              /* weak code */
    c_date,                            /* document processing date */
    c_dd,                              /* description list: description */
    c_define,                          /* macro definition */
    c_dt,                              /* description list: described thing */
    c_e,                               /* emphasis */
    c_i,                               /* visible index mark */
    c_ii,                              /* uncapitalised visible index mark */
    c_k,                               /* uncapitalised cross-reference */
    c_lcont,                           /* continuation para(s) for list item */
    c_n,                               /* numbered list */
    c_nocite,                          /* bibliography trickery */
    c_preamble,                        /* (obsolete) preamble text */
    c_q,                               /* quote marks */
    c_quote,                           /* block-quoted paragraphs */
    c_rule,                            /* horizontal rule */
    c_title,                           /* document title */
    c_u,                               /* \uXXXX: aux field is char code */
    c_versionid                        /* document RCS id */
};
201
/*
 * Character classification and conversion helpers for the lexer. They
 * work on numeric character codes and may evaluate their argument more
 * than once, so pass only side-effect-free expressions.
 *
 * Perhaps whitespace should be defined in a more Unicode-friendly way?
 */

/* Space, tab, carriage return or newline. */
#define iswhite(c) ( (c)==32 || (c)==9 || (c)==13 || (c)==10 )

/* Newline only. */
#define isnl(c) ( (c)==10 )

/* Decimal digit, and its numeric value. */
#define isdec(c) ( ((c)>='0'&&(c)<='9') )
#define fromdec(c) ( (c)-'0' )

/* Hex digit (either case), and its numeric value. */
#define ishex(c) ( ((c)>='0'&&(c)<='9') || \
                   ((c)>='A'&&(c)<='F') || \
                   ((c)>='a'&&(c)<='f'))
#define fromhex(c) ( (c)<='9' ? (c)-'0' : ((c)&0xDF) - ('A'-10) )

/* Character allowed in a multi-character command name. */
#define iscmd(c) ( ((c)>='0'&&(c)<='9') || \
                   ((c)>='A'&&(c)<='Z') || \
                   ((c)>='a'&&(c)<='z'))
210
/*
 * Keyword comparison function. Like strcmp, but between a wchar_t *
 * and a char *: returns the difference between the first pair of
 * characters that differ, or zero if both strings end together.
 */
static int kwcmp(wchar_t const *p, char const *q) {
    for (;; p++, q++) {
        int diff = *p - *q;

        /* Stop at the first mismatch, or when either string ends. */
        if (diff != 0 || *p == 0 || *q == 0)
            return diff;
    }
}
222
223 /*
224 * Match a keyword.
225 */
226 static void match_kw(token *tok) {
227 /*
228 * FIXME. The ids are explicit in here so as to allow long-name
229 * equivalents to the various very short keywords.
230 */
231 static const struct { char const *name; int id; } keywords[] = {
232 {"#", c__comment}, /* comment command (\#) */
233 {"-", c__escaped}, /* nonbreaking hyphen */
234 {"A", c_A}, /* appendix heading */
235 {"B", c_B}, /* bibliography entry */
236 {"BR", c_BR}, /* bibliography rewrite */
237 {"C", c_C}, /* chapter heading */
238 {"H", c_H}, /* heading */
239 {"I", c_I}, /* invisible index mark */
240 {"IM", c_IM}, /* index merge/rewrite */
241 {"K", c_K}, /* capitalised cross-reference */
242 {"U", c_U}, /* unnumbered-chapter heading */
243 {"W", c_W}, /* Web hyperlink */
244 {"\\", c__escaped}, /* escaped backslash (\\) */
245 {"_", c__nbsp}, /* nonbreaking space (\_) */
246 {"b", c_b}, /* bulletted list */
247 {"c", c_c}, /* code */
248 {"cfg", c_cfg}, /* configuration directive */
249 {"copyright", c_copyright}, /* copyright statement */
250 {"cw", c_cw}, /* weak code */
251 {"date", c_date}, /* document processing date */
252 {"dd", c_dd}, /* description list: description */
253 {"define", c_define}, /* macro definition */
254 {"dt", c_dt}, /* description list: described thing */
255 {"e", c_e}, /* emphasis */
256 {"i", c_i}, /* visible index mark */
257 {"ii", c_ii}, /* uncapitalised visible index mark */
258 {"k", c_k}, /* uncapitalised cross-reference */
259 {"lcont", c_lcont}, /* continuation para(s) for list item */
260 {"n", c_n}, /* numbered list */
261 {"nocite", c_nocite}, /* bibliography trickery */
262 {"preamble", c_preamble}, /* (obsolete) preamble text */
263 {"q", c_q}, /* quote marks */
264 {"quote", c_quote}, /* block-quoted paragraphs */
265 {"rule", c_rule}, /* horizontal rule */
266 {"title", c_title}, /* document title */
267 {"versionid", c_versionid}, /* document RCS id */
268 {"{", c__escaped}, /* escaped lbrace (\{) */
269 {"}", c__escaped}, /* escaped rbrace (\}) */
270 };
271 int i, j, k, c;
272
273 /*
274 * Special cases: \S{0,1,2,...} and \uABCD. If the syntax
275 * doesn't match correctly, we just fall through to the
276 * binary-search phase.
277 */
278 if (tok->text[0] == 'S') {
279 /* We expect numeric characters thereafter. */
280 wchar_t *p = tok->text+1;
281 int n;
282 if (!*p)
283 n = 1;
284 else {
285 n = 0;
286 while (*p && isdec(*p)) {
287 n = 10 * n + fromdec(*p);
288 p++;
289 }
290 }
291 if (!*p) {
292 tok->cmd = c_S;
293 tok->aux = n;
294 return;
295 }
296 } else if (tok->text[0] == 'u') {
297 /* We expect hex characters thereafter. */
298 wchar_t *p = tok->text+1;
299 int n = 0;
300 while (*p && ishex(*p)) {
301 n = 16 * n + fromhex(*p);
302 p++;
303 }
304 if (!*p) {
305 tok->cmd = c_u;
306 tok->aux = n;
307 return;
308 }
309 }
310
311 i = -1;
312 j = sizeof(keywords)/sizeof(*keywords);
313 while (j-i > 1) {
314 k = (i+j)/2;
315 c = kwcmp(tok->text, keywords[k].name);
316 if (c < 0)
317 j = k;
318 else if (c > 0)
319 i = k;
320 else /* c == 0 */ {
321 tok->cmd = keywords[k].id;
322 return;
323 }
324 }
325
326 tok->cmd = c__invalid;
327 }
328
329
330 /*
331 * Read a token from the input file, in the normal way (`normal' in
332 * the sense that code paragraphs work a different way).
333 */
334 token get_token(input *in) {
335 int c;
336 int nls;
337 token ret;
338 rdstring rs = { 0, 0, NULL };
339 filepos cpos;
340
341 ret.text = NULL; /* default */
342 c = get(in, &cpos);
343 ret.pos = cpos;
344 if (iswhite(c)) { /* tok_white or tok_eop */
345 nls = 0;
346 do {
347 if (isnl(c))
348 nls++;
349 } while ((c = get(in, &cpos)) != EOF && iswhite(c));
350 if (c == EOF) {
351 ret.type = tok_eof;
352 return ret;
353 }
354 unget(in, c, &cpos);
355 ret.type = (nls > 1 ? tok_eop : tok_white);
356 return ret;
357 } else if (c == EOF) { /* tok_eof */
358 ret.type = tok_eof;
359 return ret;
360 } else if (c == '\\') { /* tok_cmd */
361 c = get(in, &cpos);
362 if (c == '-' || c == '\\' || c == '_' ||
363 c == '#' || c == '{' || c == '}') {
364 /* single-char command */
365 rdadd(&rs, c);
366 } else if (c == 'u') {
367 int len = 0;
368 do {
369 rdadd(&rs, c);
370 len++;
371 c = get(in, &cpos);
372 } while (ishex(c) && len < 5);
373 unget(in, c, &cpos);
374 } else if (iscmd(c)) {
375 do {
376 rdadd(&rs, c);
377 c = get(in, &cpos);
378 } while (iscmd(c));
379 unget(in, c, &cpos);
380 }
381 /*
382 * Now match the command against the list of available
383 * ones.
384 */
385 ret.type = tok_cmd;
386 ret.text = ustrdup(rs.text);
387 match_kw(&ret);
388 sfree(rs.text);
389 return ret;
390 } else if (c == '{') { /* tok_lbrace */
391 ret.type = tok_lbrace;
392 return ret;
393 } else if (c == '}') { /* tok_rbrace */
394 ret.type = tok_rbrace;
395 return ret;
396 } else { /* tok_word */
397 /*
398 * Read a word: the longest possible contiguous sequence of
399 * things other than whitespace, backslash, braces and
400 * hyphen. A hyphen terminates the word but is returned as
401 * part of it; everything else is pushed back for the next
402 * token. The `aux' field contains TRUE if the word ends in
403 * a hyphen.
404 */
405 ret.aux = FALSE; /* assumed for now */
406 while (1) {
407 if (iswhite(c) || c=='{' || c=='}' || c=='\\' || c==EOF) {
408 /* Put back the character that caused termination */
409 unget(in, c, &cpos);
410 break;
411 } else {
412 rdadd(&rs, c);
413 if (c == '-') {
414 ret.aux = TRUE;
415 break; /* hyphen terminates word */
416 }
417 }
418 c = get(in, &cpos);
419 }
420 ret.type = tok_word;
421 ret.text = ustrdup(rs.text);
422 sfree(rs.text);
423 return ret;
424 }
425 }
426
427 /*
428 * Determine whether the next input character is an open brace (for
429 * telling code paragraphs from paragraphs which merely start with
430 * code).
431 */
432 int isbrace(input *in) {
433 int c;
434 filepos cpos;
435
436 c = get(in, &cpos);
437 unget(in, c, &cpos);
438 return (c == '{');
439 }
440
441 /*
442 * Read the rest of a line that starts `\c'. Including nothing at
443 * all (tok_word with empty text).
444 */
445 token get_codepar_token(input *in) {
446 int c;
447 token ret;
448 rdstring rs = { 0, 0, NULL };
449 filepos cpos;
450
451 ret.type = tok_word;
452 c = get(in, &cpos); /* expect (and discard) one space */
453 ret.pos = cpos;
454 if (c == ' ') {
455 c = get(in, &cpos);
456 ret.pos = cpos;
457 }
458 while (!isnl(c) && c != EOF) {
459 int c2 = c;
460 c = get(in, &cpos);
461 /* Discard \r just before \n. */
462 if (c2 != 13 || !isnl(c))
463 rdadd(&rs, c2);
464 }
465 unget(in, c, &cpos);
466 ret.text = ustrdup(rs.text);
467 sfree(rs.text);
468 return ret;
469 }
470
471 /*
472 * Adds a new word to a linked list
473 */
474 static word *addword(word newword, word ***hptrptr) {
475 word *mnewword;
476 if (!hptrptr)
477 return NULL;
478 mnewword = mknew(word);
479 *mnewword = newword; /* structure copy */
480 mnewword->next = NULL;
481 **hptrptr = mnewword;
482 *hptrptr = &mnewword->next;
483 return mnewword;
484 }
485
486 /*
487 * Adds a new paragraph to a linked list
488 */
489 static paragraph *addpara(paragraph newpara, paragraph ***hptrptr) {
490 paragraph *mnewpara = mknew(paragraph);
491 *mnewpara = newpara; /* structure copy */
492 mnewpara->next = NULL;
493 **hptrptr = mnewpara;
494 *hptrptr = &mnewpara->next;
495 return mnewpara;
496 }
497
/*
 * Destructor before token is reassigned; should catch most memory
 * leaks. Frees the token's text. The argument is parenthesised so
 * that any expression yielding a token can be passed safely.
 */
#define dtor(t) ( sfree((t).text) )
503
504 /*
505 * Reads a single file (ie until get() returns EOF)
506 */
507 static void read_file(paragraph ***ret, input *in, indexdata *idx) {
508 token t;
509 paragraph par;
510 word wd, **whptr, **idximplicit;
511 tree234 *macros;
512 wchar_t utext[2], *wdtext;
513 int style, spcstyle;
514 int already;
515 int iswhite, seenwhite;
516 int type;
517 int prev_para_type;
518 struct stack_item {
519 enum {
520 stack_nop = 0, /* do nothing (for error recovery) */
521 stack_ualt = 1, /* \u alternative */
522 stack_style = 2, /* \e, \c, \cw */
523 stack_idx = 4, /* \I, \i, \ii */
524 stack_hyper = 8, /* \W */
525 stack_quote = 16, /* \q */
526 } type;
527 word **whptr; /* to restore from \u alternatives */
528 word **idximplicit; /* to restore from \u alternatives */
529 } *sitem;
530 stack parsestk;
531 struct crossparaitem {
532 int type; /* currently c_lcont, c_quote or -1 */
533 int seen_lcont, seen_quote;
534 };
535 stack crossparastk;
536 word *indexword, *uword, *iword;
537 word *idxwordlist;
538 rdstring indexstr;
539 int index_downcase, index_visible, indexing;
540 const rdstring nullrs = { 0, 0, NULL };
541 wchar_t uchr;
542
543 t.text = NULL;
544 macros = newtree234(macrocmp);
545 already = FALSE;
546
547 crossparastk = stk_new();
548
549 /*
550 * Loop on each paragraph.
551 */
552 while (1) {
553 int start_cmd = c__invalid;
554 par.words = NULL;
555 par.keyword = NULL;
556 whptr = &par.words;
557
558 /*
559 * Get a token.
560 */
561 do {
562 if (!already) {
563 dtor(t), t = get_token(in);
564 }
565 already = FALSE;
566 } while (t.type == tok_eop);
567 if (t.type == tok_eof)
568 break;
569
570 /*
571 * Parse code paragraphs separately.
572 */
573 if (t.type == tok_cmd && t.cmd == c_c && !isbrace(in)) {
574 int wtype = word_WeakCode;
575
576 par.type = para_Code;
577 par.fpos = t.pos;
578 while (1) {
579 dtor(t), t = get_codepar_token(in);
580 wd.type = wtype;
581 wd.breaks = FALSE; /* shouldn't need this... */
582 wd.text = ustrdup(t.text);
583 wd.alt = NULL;
584 wd.fpos = t.pos;
585 addword(wd, &whptr);
586 dtor(t), t = get_token(in);
587 if (t.type == tok_white) {
588 /*
589 * The newline after a code-paragraph line
590 */
591 dtor(t), t = get_token(in);
592 }
593 if (t.type == tok_eop || t.type == tok_eof)
594 break;
595 else if (t.type == tok_cmd && t.cmd == c_c)
596 wtype = word_WeakCode;
597 else if (t.type == tok_cmd && t.cmd == c_e &&
598 wtype == word_WeakCode)
599 wtype = word_Emph;
600 else {
601 error(err_brokencodepara, &t.pos);
602 prev_para_type = par.type;
603 addpara(par, ret);
604 while (t.type != tok_eop) /* error recovery: */
605 dtor(t), t = get_token(in); /* eat rest of paragraph */
606 goto codeparabroken; /* ick, but such is life */
607 }
608 }
609 prev_para_type = par.type;
610 addpara(par, ret);
611 codeparabroken:
612 continue;
613 }
614
615 /*
616 * Spot the special commands that define a grouping of more
617 * than one paragraph, and also the closing braces that
618 * finish them.
619 */
620 if (t.type == tok_cmd &&
621 (t.cmd == c_lcont || t.cmd == c_quote)) {
622 struct crossparaitem *sitem, *stop;
623 int cmd = t.cmd;
624
625 /*
626 * Expect, and swallow, an open brace.
627 */
628 dtor(t), t = get_token(in);
629 if (t.type != tok_lbrace) {
630 error(err_explbr, &t.pos);
631 continue;
632 }
633
634 if (cmd == c_lcont) {
635 /*
636 * \lcont causes a continuation of a list item into
637 * multiple paragraphs (which may in turn contain
638 * nested lists, code paras etc). Hence, the previous
639 * paragraph must be of a list type.
640 */
641 sitem = mknew(struct crossparaitem);
642 stop = (struct crossparaitem *)stk_top(crossparastk);
643 if (stop)
644 *sitem = *stop;
645 else
646 sitem->seen_quote = sitem->seen_lcont = 0;
647
648 if (prev_para_type == para_Bullet ||
649 prev_para_type == para_NumberedList ||
650 prev_para_type == para_Description) {
651 sitem->type = c_lcont;
652 sitem->seen_lcont = 1;
653 par.type = para_LcontPush;
654 prev_para_type = par.type;
655 addpara(par, ret);
656 } else {
657 /*
658 * Push a null item on the cross-para stack so that
659 * when we see the corresponding closing brace we
660 * don't give a cascade error.
661 */
662 sitem->type = -1;
663 error(err_misplacedlcont, &t.pos);
664 }
665 } else {
666 /*
667 * \quote causes a group of paragraphs to be
668 * block-quoted (typically they will be indented a
669 * bit).
670 */
671 sitem = mknew(struct crossparaitem);
672 stop = (struct crossparaitem *)stk_top(crossparastk);
673 if (stop)
674 *sitem = *stop;
675 else
676 sitem->seen_quote = sitem->seen_lcont = 0;
677 sitem->type = c_quote;
678 sitem->seen_quote = 1;
679 par.type = para_QuotePush;
680 prev_para_type = par.type;
681 addpara(par, ret);
682 }
683 stk_push(crossparastk, sitem);
684 continue;
685 } else if (t.type == tok_rbrace) {
686 struct crossparaitem *sitem = stk_pop(crossparastk);
687 if (!sitem)
688 error(err_unexbrace, &t.pos);
689 else {
690 switch (sitem->type) {
691 case c_lcont:
692 par.type = para_LcontPop;
693 prev_para_type = par.type;
694 addpara(par, ret);
695 break;
696 case c_quote:
697 par.type = para_QuotePop;
698 prev_para_type = par.type;
699 addpara(par, ret);
700 break;
701 }
702 sfree(sitem);
703 }
704 continue;
705 }
706
707 /*
708 * This token begins a paragraph. See if it's one of the
709 * special commands that define a paragraph type.
710 *
711 * (note that \# is special in a way, and \nocite takes no
712 * text)
713 */
714 par.type = para_Normal;
715 if (t.type == tok_cmd) {
716 int needkw;
717 int is_macro = FALSE;
718
719 par.fpos = t.pos;
720 switch (t.cmd) {
721 default:
722 needkw = -1;
723 break;
724 case c__invalid:
725 error(err_badparatype, t.text, &t.pos);
726 needkw = 4;
727 break;
728 case c__comment:
729 if (isbrace(in))
730 break; /* `\#{': isn't a comment para */
731 do {
732 dtor(t), t = get_token(in);
733 } while (t.type != tok_eop && t.type != tok_eof);
734 continue; /* next paragraph */
735 /*
736 * `needkw' values:
737 *
738 * 1 -- exactly one keyword
739 * 2 -- at least one keyword
740 * 4 -- any number of keywords including zero
741 * 8 -- at least one keyword and then nothing else
742 * 16 -- nothing at all! no keywords, no body
743 * 32 -- no keywords at all
744 */
745 case c_A: needkw = 2; par.type = para_Appendix; break;
746 case c_B: needkw = 2; par.type = para_Biblio; break;
747 case c_BR: needkw = 1; par.type = para_BR;
748 start_cmd = c_BR; break;
749 case c_C: needkw = 2; par.type = para_Chapter; break;
750 case c_H: needkw = 2; par.type = para_Heading;
751 par.aux = 0;
752 break;
753 case c_IM: needkw = 2; par.type = para_IM;
754 start_cmd = c_IM; break;
755 case c_S: needkw = 2; par.type = para_Subsect;
756 par.aux = t.aux; break;
757 case c_U: needkw = 32; par.type = para_UnnumberedChapter; break;
758 /* For \b and \n the keyword is optional */
759 case c_b: needkw = 4; par.type = para_Bullet; break;
760 case c_dt: needkw = 4; par.type = para_DescribedThing; break;
761 case c_dd: needkw = 4; par.type = para_Description; break;
762 case c_n: needkw = 4; par.type = para_NumberedList; break;
763 case c_cfg: needkw = 8; par.type = para_Config;
764 start_cmd = c_cfg; break;
765 case c_copyright: needkw = 32; par.type = para_Copyright; break;
766 case c_define: is_macro = TRUE; needkw = 1; break;
767 /* For \nocite the keyword is _everything_ */
768 case c_nocite: needkw = 8; par.type = para_NoCite; break;
769 case c_preamble: needkw = 32; par.type = para_Normal; break;
770 case c_rule: needkw = 16; par.type = para_Rule; break;
771 case c_title: needkw = 32; par.type = para_Title; break;
772 case c_versionid: needkw = 32; par.type = para_VersionID; break;
773 }
774
775 if (par.type == para_Chapter ||
776 par.type == para_Heading ||
777 par.type == para_Subsect ||
778 par.type == para_Appendix ||
779 par.type == para_UnnumberedChapter) {
780 struct crossparaitem *sitem = stk_top(crossparastk);
781 if (sitem && (sitem->seen_lcont || sitem->seen_quote)) {
782 error(err_sectmarkerinblock,
783 &t.pos,
784 (sitem->seen_lcont ? "lcont" : "quote"));
785 }
786 }
787
788 if (needkw > 0) {
789 rdstring rs = { 0, 0, NULL };
790 int nkeys = 0;
791 filepos fp;
792
793 /* Get keywords. */
794 dtor(t), t = get_token(in);
795 fp = t.pos;
796 while (t.type == tok_lbrace) {
797 /* This is a keyword. */
798 nkeys++;
799 /* FIXME: there will be bugs if anyone specifies an
800 * empty keyword (\foo{}), so trap this case. */
801 while (dtor(t), t = get_token(in),
802 t.type == tok_word ||
803 t.type == tok_white ||
804 (t.type == tok_cmd && t.cmd == c__nbsp) ||
805 (t.type == tok_cmd && t.cmd == c__escaped)) {
806 if (t.type == tok_white ||
807 (t.type == tok_cmd && t.cmd == c__nbsp))
808 rdadd(&rs, ' ');
809 else
810 rdadds(&rs, t.text);
811 }
812 if (t.type != tok_rbrace) {
813 error(err_kwunclosed, &t.pos);
814 continue;
815 }
816 rdadd(&rs, 0); /* add string terminator */
817 dtor(t), t = get_token(in); /* eat right brace */
818 }
819
820 rdadd(&rs, 0); /* add string terminator */
821
822 /* See whether we have the right number of keywords. */
823 if ((needkw & 48) && nkeys > 0)
824 error(err_kwillegal, &fp);
825 if ((needkw & 11) && nkeys == 0)
826 error(err_kwexpected, &fp);
827 if ((needkw & 5) && nkeys > 1)
828 error(err_kwtoomany, &fp);
829
830 if (is_macro) {
831 /*
832 * Macro definition. Get the rest of the line
833 * as a code-paragraph token, repeatedly until
834 * there's nothing more left of it. Separate
835 * with newlines.
836 */
837 rdstring macrotext = { 0, 0, NULL };
838 while (1) {
839 dtor(t), t = get_codepar_token(in);
840 if (macrotext.pos > 0)
841 rdadd(&macrotext, L'\n');
842 rdadds(&macrotext, t.text);
843 dtor(t), t = get_token(in);
844 if (t.type == tok_eop) break;
845 }
846 macrodef(macros, rs.text, macrotext.text, fp);
847 continue; /* next paragraph */
848 }
849
850 par.keyword = rdtrim(&rs);
851
852 /* Move to EOP in case of needkw==8 or 16 (no body) */
853 if (needkw & 24) {
854 /* We allow whitespace even when we expect no para body */
855 while (t.type == tok_white)
856 dtor(t), t = get_token(in);
857 if (t.type != tok_eop && t.type != tok_eof &&
858 (start_cmd == c__invalid ||
859 t.type != tok_cmd || t.cmd != start_cmd)) {
860 error(err_bodyillegal, &t.pos);
861 /* Error recovery: eat the rest of the paragraph */
862 while (t.type != tok_eop && t.type != tok_eof &&
863 (start_cmd == c__invalid ||
864 t.type != tok_cmd || t.cmd != start_cmd))
865 dtor(t), t = get_token(in);
866 }
867 if (t.type == tok_cmd)
868 already = TRUE;/* inhibit get_token at top of loop */
869 prev_para_type = par.type;
870 addpara(par, ret);
871 continue; /* next paragraph */
872 }
873 }
874 }
875
876 /*
877 * Now read the actual paragraph, word by word, adding to
878 * the paragraph list.
879 *
880 * Mid-paragraph commands:
881 *
882 * \K \k
883 * \c \cw
884 * \e
885 * \i \ii
886 * \I
887 * \u
888 * \W
889 * \date
890 * \\ \{ \}
891 */
892 parsestk = stk_new();
893 style = word_Normal;
894 spcstyle = word_WhiteSpace;
895 indexing = FALSE;
896 seenwhite = TRUE;
897 while (t.type != tok_eop && t.type != tok_eof) {
898 iswhite = FALSE;
899 already = FALSE;
900
901 /* Handle implicit paragraph breaks after \IM, \BR etc */
902 if (start_cmd != c__invalid &&
903 t.type == tok_cmd && t.cmd == start_cmd) {
904 already = TRUE; /* inhibit get_token at top of loop */
905 break;
906 }
907
908 if (t.type == tok_cmd && t.cmd == c__escaped) {
909 t.type = tok_word; /* nice and simple */
910 t.aux = 0; /* even if `\-' - nonbreaking! */
911 }
912 if (t.type == tok_cmd && t.cmd == c__nbsp) {
913 t.type = tok_word; /* nice and simple */
914 sfree(t.text);
915 t.text = ustrdup(L" "); /* text is ` ' not `_' */
916 t.aux = 0; /* (nonbreaking) */
917 }
918 switch (t.type) {
919 case tok_white:
920 if (whptr == &par.words)
921 break; /* strip whitespace at start of para */
922 wd.text = NULL;
923 wd.type = spcstyle;
924 wd.alt = NULL;
925 wd.aux = 0;
926 wd.fpos = t.pos;
927 wd.breaks = FALSE;
928
929 /*
930 * Inhibit use of whitespace if it's (probably the
931 * newline) before a repeat \IM / \BR type
932 * directive.
933 */
934 if (start_cmd != c__invalid) {
935 dtor(t), t = get_token(in);
936 already = TRUE;
937 if (t.type == tok_cmd && t.cmd == start_cmd)
938 break;
939 }
940
941 if (indexing)
942 rdadd(&indexstr, ' ');
943 if (!indexing || index_visible)
944 addword(wd, &whptr);
945 if (indexing)
946 addword(wd, &idximplicit);
947 iswhite = TRUE;
948 break;
949 case tok_word:
950 if (indexing)
951 rdadds(&indexstr, t.text);
952 wd.type = style;
953 wd.alt = NULL;
954 wd.aux = 0;
955 wd.fpos = t.pos;
956 wd.breaks = t.aux;
957 if (!indexing || index_visible) {
958 wd.text = ustrdup(t.text);
959 addword(wd, &whptr);
960 }
961 if (indexing) {
962 wd.text = ustrdup(t.text);
963 addword(wd, &idximplicit);
964 }
965 break;
966 case tok_lbrace:
967 error(err_unexbrace, &t.pos);
968 /* Error recovery: push nop */
969 sitem = mknew(struct stack_item);
970 sitem->type = stack_nop;
971 stk_push(parsestk, sitem);
972 break;
973 case tok_rbrace:
974 sitem = stk_pop(parsestk);
975 if (!sitem) {
976 /*
977 * This closing brace could have been an
978 * indication that the cross-paragraph stack
979 * wants popping. Accordingly, we treat it here
980 * as an indication that the paragraph is over.
981 */
982 already = TRUE;
983 goto finished_para;
984 } else {
985 if (sitem->type & stack_ualt) {
986 whptr = sitem->whptr;
987 idximplicit = sitem->idximplicit;
988 }
989 if (sitem->type & stack_style) {
990 style = word_Normal;
991 spcstyle = word_WhiteSpace;
992 }
993 if (sitem->type & stack_idx) {
994 indexword->text = ustrdup(indexstr.text);
995 if (index_downcase)
996 ustrlow(indexword->text);
997 indexing = FALSE;
998 rdadd(&indexstr, L'\0');
999 index_merge(idx, FALSE, indexstr.text, idxwordlist);
1000 sfree(indexstr.text);
1001 }
1002 if (sitem->type & stack_hyper) {
1003 wd.text = NULL;
1004 wd.type = word_HyperEnd;
1005 wd.alt = NULL;
1006 wd.aux = 0;
1007 wd.fpos = t.pos;
1008 wd.breaks = FALSE;
1009 if (!indexing || index_visible)
1010 addword(wd, &whptr);
1011 if (indexing)
1012 addword(wd, &idximplicit);
1013 }
1014 if (sitem->type & stack_quote) {
1015 wd.text = NULL;
1016 wd.type = toquotestyle(style);
1017 wd.alt = NULL;
1018 wd.aux = quote_Close;
1019 wd.fpos = t.pos;
1020 wd.breaks = FALSE;
1021 if (!indexing || index_visible)
1022 addword(wd, &whptr);
1023 if (indexing) {
1024 rdadd(&indexstr, L'"');
1025 addword(wd, &idximplicit);
1026 }
1027 }
1028 }
1029 sfree(sitem);
1030 break;
1031 case tok_cmd:
1032 switch (t.cmd) {
1033 case c__comment:
1034 /*
1035 * In-paragraph comment: \#{ balanced braces }
1036 *
1037 * Anything goes here; even tok_eop. We should
1038 * eat whitespace after the close brace _if_
1039 * there was whitespace before the \#.
1040 */
1041 dtor(t), t = get_token(in);
1042 if (t.type != tok_lbrace) {
1043 error(err_explbr, &t.pos);
1044 } else {
1045 int braces = 1;
1046 while (braces > 0) {
1047 dtor(t), t = get_token(in);
1048 if (t.type == tok_lbrace)
1049 braces++;
1050 else if (t.type == tok_rbrace)
1051 braces--;
1052 else if (t.type == tok_eof) {
1053 error(err_commenteof, &t.pos);
1054 break;
1055 }
1056 }
1057 }
1058 if (seenwhite) {
1059 already = TRUE;
1060 dtor(t), t = get_token(in);
1061 if (t.type == tok_white) {
1062 iswhite = TRUE;
1063 already = FALSE;
1064 }
1065 }
1066 break;
1067 case c_q:
1068 dtor(t), t = get_token(in);
1069 if (t.type != tok_lbrace) {
1070 error(err_explbr, &t.pos);
1071 } else {
1072 wd.text = NULL;
1073 wd.type = toquotestyle(style);
1074 wd.alt = NULL;
1075 wd.aux = quote_Open;
1076 wd.fpos = t.pos;
1077 wd.breaks = FALSE;
1078 if (!indexing || index_visible)
1079 addword(wd, &whptr);
1080 if (indexing) {
1081 rdadd(&indexstr, L'"');
1082 addword(wd, &idximplicit);
1083 }
1084 sitem = mknew(struct stack_item);
1085 sitem->type = stack_quote;
1086 stk_push(parsestk, sitem);
1087 }
1088 break;
1089 case c_K:
1090 case c_k:
1091 case c_W:
1092 case c_date:
1093 /*
1094 * Keyword, hyperlink, or \date. We expect a
1095 * left brace, some text, and then a right
1096 * brace. No nesting; no arguments.
1097 */
1098 wd.fpos = t.pos;
1099 wd.breaks = FALSE;
1100 if (t.cmd == c_K)
1101 wd.type = word_UpperXref;
1102 else if (t.cmd == c_k)
1103 wd.type = word_LowerXref;
1104 else if (t.cmd == c_W)
1105 wd.type = word_HyperLink;
1106 else
1107 wd.type = word_Normal;
1108 dtor(t), t = get_token(in);
1109 if (t.type != tok_lbrace) {
1110 if (wd.type == word_Normal) {
1111 time_t thetime = time(NULL);
1112 struct tm *broken = localtime(&thetime);
1113 already = TRUE;
1114 wdtext = ustrftime(NULL, broken);
1115 wd.type = style;
1116 } else {
1117 error(err_explbr, &t.pos);
1118 wdtext = NULL;
1119 }
1120 } else {
1121 rdstring rs = { 0, 0, NULL };
1122 while (dtor(t), t = get_token(in),
1123 t.type == tok_word || t.type == tok_white) {
1124 if (t.type == tok_white)
1125 rdadd(&rs, ' ');
1126 else
1127 rdadds(&rs, t.text);
1128 }
1129 if (wd.type == word_Normal) {
1130 time_t thetime = time(NULL);
1131 struct tm *broken = localtime(&thetime);
1132 wdtext = ustrftime(rs.text, broken);
1133 wd.type = style;
1134 } else {
1135 wdtext = ustrdup(rs.text);
1136 }
1137 sfree(rs.text);
1138 if (t.type != tok_rbrace) {
1139 error(err_kwexprbr, &t.pos);
1140 }
1141 }
1142 wd.alt = NULL;
1143 wd.aux = 0;
1144 if (!indexing || index_visible) {
1145 wd.text = ustrdup(wdtext);
1146 addword(wd, &whptr);
1147 }
1148 if (indexing) {
1149 wd.text = ustrdup(wdtext);
1150 addword(wd, &idximplicit);
1151 }
1152 sfree(wdtext);
1153 if (wd.type == word_HyperLink) {
1154 /*
1155 * Hyperlinks are different: they then
1156 * expect another left brace, to begin
1157 * delimiting the text marked by the link.
1158 */
1159 dtor(t), t = get_token(in);
1160 /*
1161 * Special cases: \W{}\c, \W{}\e, \W{}\cw
1162 */
1163 sitem = mknew(struct stack_item);
1164 sitem->type = stack_hyper;
1165 if (t.type == tok_cmd &&
1166 (t.cmd == c_e || t.cmd == c_c || t.cmd == c_cw)) {
1167 if (style != word_Normal)
1168 error(err_nestedstyles, &t.pos);
1169 else {
1170 style = (t.cmd == c_c ? word_Code :
1171 t.cmd == c_cw ? word_WeakCode :
1172 word_Emph);
1173 spcstyle = tospacestyle(style);
1174 sitem->type |= stack_style;
1175 }
1176 dtor(t), t = get_token(in);
1177 }
1178 if (t.type != tok_lbrace) {
1179 error(err_explbr, &t.pos);
1180 sfree(sitem);
1181 } else {
1182 stk_push(parsestk, sitem);
1183 }
1184 }
1185 break;
1186 case c_c:
1187 case c_cw:
1188 case c_e:
1189 type = t.cmd;
1190 if (style != word_Normal) {
1191 error(err_nestedstyles, &t.pos);
1192 /* Error recovery: eat lbrace, push nop. */
1193 dtor(t), t = get_token(in);
1194 sitem = mknew(struct stack_item);
1195 sitem->type = stack_nop;
1196 stk_push(parsestk, sitem);
1197 }
1198 dtor(t), t = get_token(in);
1199 if (t.type != tok_lbrace) {
1200 error(err_explbr, &t.pos);
1201 } else {
1202 style = (type == c_c ? word_Code :
1203 type == c_cw ? word_WeakCode :
1204 word_Emph);
1205 spcstyle = tospacestyle(style);
1206 sitem = mknew(struct stack_item);
1207 sitem->type = stack_style;
1208 stk_push(parsestk, sitem);
1209 }
1210 break;
1211 case c_i:
1212 case c_ii:
1213 case c_I:
1214 type = t.cmd;
1215 if (indexing) {
1216 error(err_nestedindex, &t.pos);
1217 /* Error recovery: eat lbrace, push nop. */
1218 dtor(t), t = get_token(in);
1219 sitem = mknew(struct stack_item);
1220 sitem->type = stack_nop;
1221 stk_push(parsestk, sitem);
1222 }
1223 sitem = mknew(struct stack_item);
1224 sitem->type = stack_idx;
1225 dtor(t), t = get_token(in);
1226 /*
1227 * Special cases: \i\c, \i\e, \i\cw
1228 */
1229 wd.fpos = t.pos;
1230 if (t.type == tok_cmd &&
1231 (t.cmd == c_e || t.cmd == c_c || t.cmd == c_cw)) {
1232 if (style != word_Normal)
1233 error(err_nestedstyles, &t.pos);
1234 else {
1235 style = (t.cmd == c_c ? word_Code :
1236 t.cmd == c_cw ? word_WeakCode :
1237 word_Emph);
1238 spcstyle = tospacestyle(style);
1239 sitem->type |= stack_style;
1240 }
1241 dtor(t), t = get_token(in);
1242 }
1243 if (t.type != tok_lbrace) {
1244 sfree(sitem);
1245 error(err_explbr, &t.pos);
1246 } else {
1247 /* Add an index-reference word with no text as yet */
1248 wd.type = word_IndexRef;
1249 wd.text = NULL;
1250 wd.alt = NULL;
1251 wd.aux = 0;
1252 wd.breaks = FALSE;
1253 indexword = addword(wd, &whptr);
1254 /* Set up a rdstring to read the index text */
1255 indexstr = nullrs;
1256 /* Flags so that we do the Right Things with text */
1257 index_visible = (type != c_I);
1258 index_downcase = (type == c_ii);
1259 indexing = TRUE;
1260 idxwordlist = NULL;
1261 idximplicit = &idxwordlist;
1262 /* Stack item to close the indexing on exit */
1263 stk_push(parsestk, sitem);
1264 }
1265 break;
1266 case c_u:
1267 uchr = t.aux;
1268 utext[0] = uchr; utext[1] = 0;
1269 wd.type = style;
1270 wd.breaks = FALSE;
1271 wd.alt = NULL;
1272 wd.aux = 0;
1273 wd.fpos = t.pos;
1274 if (!indexing || index_visible) {
1275 wd.text = ustrdup(utext);
1276 uword = addword(wd, &whptr);
1277 } else
1278 uword = NULL;
1279 if (indexing) {
1280 wd.text = ustrdup(utext);
1281 iword = addword(wd, &idximplicit);
1282 } else
1283 iword = NULL;
1284 dtor(t), t = get_token(in);
1285 if (t.type == tok_lbrace) {
1286 /*
1287 * \u with a left brace. Until the brace
1288 * closes, all further words go on a
1289 * sidetrack from the main thread of the
1290 * paragraph.
1291 */
1292 sitem = mknew(struct stack_item);
1293 sitem->type = stack_ualt;
1294 sitem->whptr = whptr;
1295 sitem->idximplicit = idximplicit;
1296 stk_push(parsestk, sitem);
1297 whptr = uword ? &uword->alt : NULL;
1298 idximplicit = iword ? &iword->alt : NULL;
1299 } else {
1300 if (indexing)
1301 rdadd(&indexstr, uchr);
1302 already = TRUE;
1303 }
1304 break;
1305 default:
1306 if (!macrolookup(macros, in, t.text, &t.pos))
1307 error(err_badmidcmd, t.text, &t.pos);
1308 break;
1309 }
1310 }
1311 if (!already)
1312 dtor(t), t = get_token(in);
1313 seenwhite = iswhite;
1314 }
1315 finished_para:
1316 /* Check the stack is empty */
1317 if (stk_top(parsestk)) {
1318 while ((sitem = stk_pop(parsestk)))
1319 sfree(sitem);
1320 error(err_missingrbrace, &t.pos);
1321 }
1322 stk_free(parsestk);
1323 prev_para_type = par.type;
1324 addpara(par, ret);
1325 if (t.type == tok_eof)
1326 already = TRUE;
1327 }
1328
1329 if (stk_top(crossparastk)) {
1330 void *p;
1331
1332 error(err_missingrbrace2, &t.pos);
1333 while ((p = stk_pop(crossparastk)))
1334 sfree(p);
1335 }
1336
1337 /*
1338 * We break to here rather than returning, because otherwise
1339 * this cleanup doesn't happen.
1340 */
1341 dtor(t);
1342 macrocleanup(macros);
1343
1344 stk_free(crossparastk);
1345 }
1346
1347 paragraph *read_input(input *in, indexdata *idx) {
1348 paragraph *head = NULL;
1349 paragraph **hptr = &head;
1350
1351 while (in->currindex < in->nfiles) {
1352 in->currfp = fopen(in->filenames[in->currindex], "r");
1353 if (in->currfp) {
1354 setpos(in, in->filenames[in->currindex]);
1355 read_file(&hptr, in, idx);
1356 }
1357 in->currindex++;
1358 }
1359
1360 return head;
1361 }