Support the \W{...}\i\c{...} combination used in the NASM manual.
[sgt/halibut] / input.c
1 /*
2 * input.c: read the source form
3 */
4
5 #include <stdio.h>
6 #include <assert.h>
7 #include <time.h>
8 #include "halibut.h"
9
10 #define TAB_STOP 8 /* for column number tracking */
11
12 static void setpos(input *in, char *fname) {
13 in->pos.filename = fname;
14 in->pos.line = 1;
15 in->pos.col = (in->reportcols ? 1 : -1);
16 }
17
18 static void unget(input *in, int c, filepos *pos) {
19 if (in->npushback >= in->pushbacksize) {
20 in->pushbacksize = in->npushback + 16;
21 in->pushback = resize(in->pushback, in->pushbacksize);
22 }
23 in->pushback[in->npushback].chr = c;
24 in->pushback[in->npushback].pos = *pos; /* structure copy */
25 in->npushback++;
26 }
27
28 /* ---------------------------------------------------------------------- */
29 /*
30 * Macro subsystem
31 */
/* A macro definition: the name it is looked up by, and its text. */
typedef struct macro_Tag macro;
struct macro_Tag {
    wchar_t *name, *text;
};
/*
 * One macro expansion in progress. Expansions are chained through
 * `next', with the most recently started one at in->stack.
 */
struct macrostack_Tag {
    macrostack *next;                  /* expansion that was active before this one */
    wchar_t *text;                     /* the macro's text (pointer copied from the macro) */
    int ptr, npushback;                /* read index into text; in->npushback at start of expansion */
    filepos pos;                       /* position supplied to macrolookup() */
};
42 static int macrocmp(void *av, void *bv) {
43 macro *a = (macro *)av, *b = (macro *)bv;
44 return ustrcmp(a->name, b->name);
45 }
46 static void macrodef(tree234 *macros, wchar_t *name, wchar_t *text,
47 filepos fpos) {
48 macro *m = mknew(macro);
49 m->name = name;
50 m->text = text;
51 if (add234(macros, m) != m) {
52 error(err_macroexists, &fpos, name);
53 sfree(name);
54 sfree(text);
55 }
56 }
57 static int macrolookup(tree234 *macros, input *in, wchar_t *name,
58 filepos *pos) {
59 macro m, *gotit;
60 m.name = name;
61 gotit = find234(macros, &m, NULL);
62 if (gotit) {
63 macrostack *expansion = mknew(macrostack);
64 expansion->next = in->stack;
65 expansion->text = gotit->text;
66 expansion->pos = *pos; /* structure copy */
67 expansion->ptr = 0;
68 expansion->npushback = in->npushback;
69 in->stack = expansion;
70 return TRUE;
71 } else
72 return FALSE;
73 }
74 static void macrocleanup(tree234 *macros) {
75 int ti;
76 macro *m;
77 for (ti = 0; (m = (macro *)index234(macros, ti)) != NULL; ti++) {
78 sfree(m->name);
79 sfree(m->text);
80 sfree(m);
81 }
82 freetree234(macros);
83 }
84
85 /*
86 * Can return EOF
87 */
88 static int get(input *in, filepos *pos) {
89 int pushbackpt = in->stack ? in->stack->npushback : 0;
90 if (in->npushback > pushbackpt) {
91 --in->npushback;
92 if (pos)
93 *pos = in->pushback[in->npushback].pos; /* structure copy */
94 return in->pushback[in->npushback].chr;
95 }
96 else if (in->stack) {
97 wchar_t c = in->stack->text[in->stack->ptr];
98 if (in->stack->text[++in->stack->ptr] == L'\0') {
99 macrostack *tmp = in->stack;
100 in->stack = tmp->next;
101 sfree(tmp);
102 }
103 return c;
104 }
105 else if (in->currfp) {
106 int c = getc(in->currfp);
107
108 if (c == EOF) {
109 fclose(in->currfp);
110 in->currfp = NULL;
111 }
112 /* Track line numbers, for error reporting */
113 if (pos)
114 *pos = in->pos;
115 if (in->reportcols) {
116 switch (c) {
117 case '\t':
118 in->pos.col = 1 + (in->pos.col + TAB_STOP-1) % TAB_STOP;
119 break;
120 case '\n':
121 in->pos.col = 1;
122 in->pos.line++;
123 break;
124 default:
125 in->pos.col++;
126 break;
127 }
128 } else {
129 in->pos.col = -1;
130 if (c == '\n')
131 in->pos.line++;
132 }
133 /* FIXME: do input charmap translation. We should be returning
134 * Unicode here. */
135 return c;
136 } else
137 return EOF;
138 }
139
140 /*
141 * Lexical analysis of source files.
142 */
/* A lexical token as produced by get_token() / get_codepar_token(). */
typedef struct token_Tag token;
struct token_Tag {
    int type;                          /* one of the tok_* values below */
    int cmd, aux;                      /* cmd: c_* id set by match_kw(); aux: \S or \u
                                        * number, or TRUE if a word ends in a hyphen */
    wchar_t *text;                     /* allocated text, or NULL; freed via dtor() */
    filepos pos;                       /* position of the token's first character */
};
/* Token types, stored in token.type. */
enum {
    tok_eof,                           /* end of file */
    tok_eop,                           /* end of paragraph */
    tok_white,                         /* whitespace */
    tok_word,                          /* a word or word fragment */
    tok_cmd,                           /* \command */
    tok_lbrace,                        /* { */
    tok_rbrace                         /* } */
};
159
160 /* Halibut command keywords. */
/* Command ids, stored in token.cmd by match_kw(). */
enum {
    c__invalid,                        /* invalid command */
    c__comment,                        /* comment command (\#) */
    c__escaped,                        /* escaped character */
    c__nop,                            /* no-op */
    c__nbsp,                           /* nonbreaking space */
    c_A,                               /* appendix heading */
    c_B,                               /* bibliography entry */
    c_BR,                              /* bibliography rewrite */
    c_C,                               /* chapter heading */
    c_H,                               /* heading */
    c_I,                               /* invisible index mark */
    c_IM,                              /* index merge/rewrite */
    c_K,                               /* capitalised cross-reference */
    c_S,                               /* \S, \S1, \S2...: aux field is 0, 1, 2, ... */
    c_U,                               /* unnumbered-chapter heading */
    c_W,                               /* Web hyperlink */
    c_b,                               /* bulleted list */
    c_c,                               /* code */
    c_cfg,                             /* configuration directive */
    c_copyright,                       /* copyright statement */
    c_cw,                              /* weak code */
    c_date,                            /* document processing date */
    c_dd,                              /* description list: description */
    c_define,                          /* macro definition */
    c_dt,                              /* description list: described thing */
    c_e,                               /* emphasis */
    c_i,                               /* visible index mark */
    c_ii,                              /* uncapitalised visible index mark */
    c_k,                               /* uncapitalised cross-reference */
    c_lcont,                           /* continuation para(s) for list item */
    c_n,                               /* numbered list */
    c_nocite,                          /* bibliography trickery */
    c_preamble,                        /* (obsolete) preamble text */
    c_q,                               /* quote marks */
    c_quote,                           /* block-quoted paragraphs */
    c_rule,                            /* horizontal rule */
    c_title,                           /* document title */
    c_u,                               /* \u followed by hex digits: aux field is char code */
    c_versionid                        /* document RCS id */
};
202
203 /* Perhaps whitespace should be defined in a more Unicode-friendly way? */
/*
 * Character classification helpers. Note that each of these may
 * evaluate its argument more than once, so the argument must not
 * have side effects.
 */
/* Space, tab, carriage return or line feed. */
#define iswhite(c) ( (c)==32 || (c)==9 || (c)==13 || (c)==10 )
/* Line feed only. */
#define isnl(c) ( (c)==10 )
/* Decimal digit, and its numeric value. */
#define isdec(c) ( ((c)>='0'&&(c)<='9') )
#define fromdec(c) ( (c)-'0' )
/* Hex digit (either case). */
#define ishex(c) ( ((c)>='0'&&(c)<='9') || ((c)>='A'&&(c)<='F') || ((c)>='a'&&(c)<='f'))
/* Value of a hex digit: masking with 0xDF folds a-f on to A-F.
 * Only meaningful when ishex(c) holds. */
#define fromhex(c) ( (c)<='9' ? (c)-'0' : ((c)&0xDF) - ('A'-10) )
/* Character allowed in a multi-character command name: digit or letter. */
#define iscmd(c) ( ((c)>='0'&&(c)<='9') || ((c)>='A'&&(c)<='Z') || ((c)>='a'&&(c)<='z'))
211
/*
 * Keyword comparison function: strcmp-alike taking a wide string on
 * the left and a narrow string on the right. Returns the difference
 * between the first pair of characters that differ, or zero if the
 * strings are equal.
 */
static int kwcmp(wchar_t const *p, char const *q) {
    for (;; p++, q++) {
        int diff = *p - *q;
        /*
         * Stop at the first mismatch, or when both strings have
         * ended (diff == 0 with *p == 0 implies *q == 0 as well).
         */
        if (diff != 0 || *p == 0)
            return diff;
    }
}
223
224 /*
225 * Match a keyword.
226 */
227 static void match_kw(token *tok) {
228 /*
229 * FIXME. The ids are explicit in here so as to allow long-name
230 * equivalents to the various very short keywords.
231 */
232 static const struct { char const *name; int id; } keywords[] = {
233 {"#", c__comment}, /* comment command (\#) */
234 {"-", c__escaped}, /* nonbreaking hyphen */
235 {".", c__nop}, /* no-op */
236 {"A", c_A}, /* appendix heading */
237 {"B", c_B}, /* bibliography entry */
238 {"BR", c_BR}, /* bibliography rewrite */
239 {"C", c_C}, /* chapter heading */
240 {"H", c_H}, /* heading */
241 {"I", c_I}, /* invisible index mark */
242 {"IM", c_IM}, /* index merge/rewrite */
243 {"K", c_K}, /* capitalised cross-reference */
244 {"U", c_U}, /* unnumbered-chapter heading */
245 {"W", c_W}, /* Web hyperlink */
246 {"\\", c__escaped}, /* escaped backslash (\\) */
247 {"_", c__nbsp}, /* nonbreaking space (\_) */
248 {"b", c_b}, /* bulletted list */
249 {"c", c_c}, /* code */
250 {"cfg", c_cfg}, /* configuration directive */
251 {"copyright", c_copyright}, /* copyright statement */
252 {"cw", c_cw}, /* weak code */
253 {"date", c_date}, /* document processing date */
254 {"dd", c_dd}, /* description list: description */
255 {"define", c_define}, /* macro definition */
256 {"dt", c_dt}, /* description list: described thing */
257 {"e", c_e}, /* emphasis */
258 {"i", c_i}, /* visible index mark */
259 {"ii", c_ii}, /* uncapitalised visible index mark */
260 {"k", c_k}, /* uncapitalised cross-reference */
261 {"lcont", c_lcont}, /* continuation para(s) for list item */
262 {"n", c_n}, /* numbered list */
263 {"nocite", c_nocite}, /* bibliography trickery */
264 {"preamble", c_preamble}, /* (obsolete) preamble text */
265 {"q", c_q}, /* quote marks */
266 {"quote", c_quote}, /* block-quoted paragraphs */
267 {"rule", c_rule}, /* horizontal rule */
268 {"title", c_title}, /* document title */
269 {"versionid", c_versionid}, /* document RCS id */
270 {"{", c__escaped}, /* escaped lbrace (\{) */
271 {"}", c__escaped}, /* escaped rbrace (\}) */
272 };
273 int i, j, k, c;
274
275 /*
276 * Special cases: \S{0,1,2,...} and \uABCD. If the syntax
277 * doesn't match correctly, we just fall through to the
278 * binary-search phase.
279 */
280 if (tok->text[0] == 'S') {
281 /* We expect numeric characters thereafter. */
282 wchar_t *p = tok->text+1;
283 int n;
284 if (!*p)
285 n = 1;
286 else {
287 n = 0;
288 while (*p && isdec(*p)) {
289 n = 10 * n + fromdec(*p);
290 p++;
291 }
292 }
293 if (!*p) {
294 tok->cmd = c_S;
295 tok->aux = n;
296 return;
297 }
298 } else if (tok->text[0] == 'u') {
299 /* We expect hex characters thereafter. */
300 wchar_t *p = tok->text+1;
301 int n = 0;
302 while (*p && ishex(*p)) {
303 n = 16 * n + fromhex(*p);
304 p++;
305 }
306 if (!*p) {
307 tok->cmd = c_u;
308 tok->aux = n;
309 return;
310 }
311 }
312
313 i = -1;
314 j = sizeof(keywords)/sizeof(*keywords);
315 while (j-i > 1) {
316 k = (i+j)/2;
317 c = kwcmp(tok->text, keywords[k].name);
318 if (c < 0)
319 j = k;
320 else if (c > 0)
321 i = k;
322 else /* c == 0 */ {
323 tok->cmd = keywords[k].id;
324 return;
325 }
326 }
327
328 tok->cmd = c__invalid;
329 }
330
331
332 /*
333 * Read a token from the input file, in the normal way (`normal' in
334 * the sense that code paragraphs work a different way).
335 */
336 token get_token(input *in) {
337 int c;
338 int nls;
339 token ret;
340 rdstring rs = { 0, 0, NULL };
341 filepos cpos;
342
343 ret.text = NULL; /* default */
344 c = get(in, &cpos);
345 ret.pos = cpos;
346 if (iswhite(c)) { /* tok_white or tok_eop */
347 nls = 0;
348 do {
349 if (isnl(c))
350 nls++;
351 } while ((c = get(in, &cpos)) != EOF && iswhite(c));
352 if (c == EOF) {
353 ret.type = tok_eof;
354 return ret;
355 }
356 unget(in, c, &cpos);
357 ret.type = (nls > 1 ? tok_eop : tok_white);
358 return ret;
359 } else if (c == EOF) { /* tok_eof */
360 ret.type = tok_eof;
361 return ret;
362 } else if (c == '\\') { /* tok_cmd */
363 c = get(in, &cpos);
364 if (c == '-' || c == '\\' || c == '_' ||
365 c == '#' || c == '{' || c == '}' || c == '.') {
366 /* single-char command */
367 rdadd(&rs, c);
368 } else if (c == 'u') {
369 int len = 0;
370 do {
371 rdadd(&rs, c);
372 len++;
373 c = get(in, &cpos);
374 } while (ishex(c) && len < 5);
375 unget(in, c, &cpos);
376 } else if (iscmd(c)) {
377 do {
378 rdadd(&rs, c);
379 c = get(in, &cpos);
380 } while (iscmd(c));
381 unget(in, c, &cpos);
382 }
383 /*
384 * Now match the command against the list of available
385 * ones.
386 */
387 ret.type = tok_cmd;
388 ret.text = ustrdup(rs.text);
389 match_kw(&ret);
390 sfree(rs.text);
391 return ret;
392 } else if (c == '{') { /* tok_lbrace */
393 ret.type = tok_lbrace;
394 return ret;
395 } else if (c == '}') { /* tok_rbrace */
396 ret.type = tok_rbrace;
397 return ret;
398 } else { /* tok_word */
399 /*
400 * Read a word: the longest possible contiguous sequence of
401 * things other than whitespace, backslash, braces and
402 * hyphen. A hyphen terminates the word but is returned as
403 * part of it; everything else is pushed back for the next
404 * token. The `aux' field contains TRUE if the word ends in
405 * a hyphen.
406 */
407 ret.aux = FALSE; /* assumed for now */
408 while (1) {
409 if (iswhite(c) || c=='{' || c=='}' || c=='\\' || c==EOF) {
410 /* Put back the character that caused termination */
411 unget(in, c, &cpos);
412 break;
413 } else {
414 rdadd(&rs, c);
415 if (c == '-') {
416 ret.aux = TRUE;
417 break; /* hyphen terminates word */
418 }
419 }
420 c = get(in, &cpos);
421 }
422 ret.type = tok_word;
423 ret.text = ustrdup(rs.text);
424 sfree(rs.text);
425 return ret;
426 }
427 }
428
429 /*
430 * Determine whether the next input character is an open brace (for
431 * telling code paragraphs from paragraphs which merely start with
432 * code).
433 */
434 int isbrace(input *in) {
435 int c;
436 filepos cpos;
437
438 c = get(in, &cpos);
439 unget(in, c, &cpos);
440 return (c == '{');
441 }
442
443 /*
444 * Read the rest of a line that starts `\c'. Including nothing at
445 * all (tok_word with empty text).
446 */
447 token get_codepar_token(input *in) {
448 int c;
449 token ret;
450 rdstring rs = { 0, 0, NULL };
451 filepos cpos;
452
453 ret.type = tok_word;
454 c = get(in, &cpos); /* expect (and discard) one space */
455 ret.pos = cpos;
456 if (c == ' ') {
457 c = get(in, &cpos);
458 ret.pos = cpos;
459 }
460 while (!isnl(c) && c != EOF) {
461 int c2 = c;
462 c = get(in, &cpos);
463 /* Discard \r just before \n. */
464 if (c2 != 13 || !isnl(c))
465 rdadd(&rs, c2);
466 }
467 unget(in, c, &cpos);
468 ret.text = ustrdup(rs.text);
469 sfree(rs.text);
470 return ret;
471 }
472
473 /*
474 * Adds a new word to a linked list
475 */
476 static word *addword(word newword, word ***hptrptr) {
477 word *mnewword;
478 if (!hptrptr)
479 return NULL;
480 mnewword = mknew(word);
481 *mnewword = newword; /* structure copy */
482 mnewword->next = NULL;
483 **hptrptr = mnewword;
484 *hptrptr = &mnewword->next;
485 return mnewword;
486 }
487
488 /*
489 * Adds a new paragraph to a linked list
490 */
491 static paragraph *addpara(paragraph newpara, paragraph ***hptrptr) {
492 paragraph *mnewpara = mknew(paragraph);
493 *mnewpara = newpara; /* structure copy */
494 mnewpara->next = NULL;
495 **hptrptr = mnewpara;
496 *hptrptr = &mnewpara->next;
497 return mnewpara;
498 }
499
/*
 * Destructor before token is reassigned; should catch most memory
 * leaks. Frees the token's text. The argument is parenthesised so
 * that any expression of type token (not just a plain variable name)
 * expands correctly.
 */
#define dtor(t) ( sfree((t).text) )
505
506 /*
507 * Reads a single file (ie until get() returns EOF)
508 */
509 static void read_file(paragraph ***ret, input *in, indexdata *idx) {
510 token t;
511 paragraph par;
512 word wd, **whptr, **idximplicit;
513 tree234 *macros;
514 wchar_t utext[2], *wdtext;
515 int style, spcstyle;
516 int already;
517 int iswhite, seenwhite;
518 int type;
519 int prev_para_type;
520 struct stack_item {
521 enum {
522 stack_nop = 0, /* do nothing (for error recovery) */
523 stack_ualt = 1, /* \u alternative */
524 stack_style = 2, /* \e, \c, \cw */
525 stack_idx = 4, /* \I, \i, \ii */
526 stack_hyper = 8, /* \W */
527 stack_quote = 16, /* \q */
528 } type;
529 word **whptr; /* to restore from \u alternatives */
530 word **idximplicit; /* to restore from \u alternatives */
531 filepos fpos;
532 } *sitem;
533 stack parsestk;
534 struct crossparaitem {
535 int type; /* currently c_lcont, c_quote or -1 */
536 int seen_lcont, seen_quote;
537 };
538 stack crossparastk;
539 word *indexword, *uword, *iword;
540 word *idxwordlist;
541 rdstring indexstr;
542 int index_downcase, index_visible, indexing;
543 const rdstring nullrs = { 0, 0, NULL };
544 wchar_t uchr;
545
546 t.text = NULL;
547 macros = newtree234(macrocmp);
548 already = FALSE;
549
550 crossparastk = stk_new();
551
552 /*
553 * Loop on each paragraph.
554 */
555 while (1) {
556 int start_cmd = c__invalid;
557 par.words = NULL;
558 par.keyword = NULL;
559 whptr = &par.words;
560
561 /*
562 * Get a token.
563 */
564 do {
565 if (!already) {
566 dtor(t), t = get_token(in);
567 }
568 already = FALSE;
569 } while (t.type == tok_eop);
570 if (t.type == tok_eof)
571 break;
572
573 /*
574 * Parse code paragraphs separately.
575 */
576 if (t.type == tok_cmd && t.cmd == c_c && !isbrace(in)) {
577 int wtype = word_WeakCode;
578
579 par.type = para_Code;
580 par.fpos = t.pos;
581 while (1) {
582 dtor(t), t = get_codepar_token(in);
583 wd.type = wtype;
584 wd.breaks = FALSE; /* shouldn't need this... */
585 wd.text = ustrdup(t.text);
586 wd.alt = NULL;
587 wd.fpos = t.pos;
588 addword(wd, &whptr);
589 dtor(t), t = get_token(in);
590 if (t.type == tok_white) {
591 /*
592 * The newline after a code-paragraph line
593 */
594 dtor(t), t = get_token(in);
595 }
596 if (t.type == tok_eop || t.type == tok_eof ||
597 t.type == tok_rbrace) { /* might be } terminating \lcont */
598 if (t.type == tok_rbrace)
599 already = TRUE;
600 break;
601 } else if (t.type == tok_cmd && t.cmd == c_c) {
602 wtype = word_WeakCode;
603 } else if (t.type == tok_cmd && t.cmd == c_e &&
604 wtype == word_WeakCode) {
605 wtype = word_Emph;
606 } else {
607 error(err_brokencodepara, &t.pos);
608 prev_para_type = par.type;
609 addpara(par, ret);
610 while (t.type != tok_eop) /* error recovery: */
611 dtor(t), t = get_token(in); /* eat rest of paragraph */
612 goto codeparabroken; /* ick, but such is life */
613 }
614 }
615 prev_para_type = par.type;
616 addpara(par, ret);
617 codeparabroken:
618 continue;
619 }
620
621 /*
622 * Spot the special commands that define a grouping of more
623 * than one paragraph, and also the closing braces that
624 * finish them.
625 */
626 if (t.type == tok_cmd &&
627 (t.cmd == c_lcont || t.cmd == c_quote)) {
628 struct crossparaitem *sitem, *stop;
629 int cmd = t.cmd;
630
631 /*
632 * Expect, and swallow, an open brace.
633 */
634 dtor(t), t = get_token(in);
635 if (t.type != tok_lbrace) {
636 error(err_explbr, &t.pos);
637 continue;
638 }
639
640 /*
641 * Also expect, and swallow, any whitespace after that
642 * (a newline before a code paragraph wouldn't be
643 * surprising).
644 */
645 do {
646 dtor(t), t = get_token(in);
647 } while (t.type == tok_white);
648 already = TRUE;
649
650 if (cmd == c_lcont) {
651 /*
652 * \lcont causes a continuation of a list item into
653 * multiple paragraphs (which may in turn contain
654 * nested lists, code paras etc). Hence, the previous
655 * paragraph must be of a list type.
656 */
657 sitem = mknew(struct crossparaitem);
658 stop = (struct crossparaitem *)stk_top(crossparastk);
659 if (stop)
660 *sitem = *stop;
661 else
662 sitem->seen_quote = sitem->seen_lcont = 0;
663
664 if (prev_para_type == para_Bullet ||
665 prev_para_type == para_NumberedList ||
666 prev_para_type == para_Description) {
667 sitem->type = c_lcont;
668 sitem->seen_lcont = 1;
669 par.type = para_LcontPush;
670 prev_para_type = par.type;
671 addpara(par, ret);
672 } else {
673 /*
674 * Push a null item on the cross-para stack so that
675 * when we see the corresponding closing brace we
676 * don't give a cascade error.
677 */
678 sitem->type = -1;
679 error(err_misplacedlcont, &t.pos);
680 }
681 } else {
682 /*
683 * \quote causes a group of paragraphs to be
684 * block-quoted (typically they will be indented a
685 * bit).
686 */
687 sitem = mknew(struct crossparaitem);
688 stop = (struct crossparaitem *)stk_top(crossparastk);
689 if (stop)
690 *sitem = *stop;
691 else
692 sitem->seen_quote = sitem->seen_lcont = 0;
693 sitem->type = c_quote;
694 sitem->seen_quote = 1;
695 par.type = para_QuotePush;
696 prev_para_type = par.type;
697 addpara(par, ret);
698 }
699 stk_push(crossparastk, sitem);
700 continue;
701 } else if (t.type == tok_rbrace) {
702 struct crossparaitem *sitem = stk_pop(crossparastk);
703 if (!sitem)
704 error(err_unexbrace, &t.pos);
705 else {
706 switch (sitem->type) {
707 case c_lcont:
708 par.type = para_LcontPop;
709 prev_para_type = par.type;
710 addpara(par, ret);
711 break;
712 case c_quote:
713 par.type = para_QuotePop;
714 prev_para_type = par.type;
715 addpara(par, ret);
716 break;
717 }
718 sfree(sitem);
719 }
720 continue;
721 }
722
723 /*
724 * This token begins a paragraph. See if it's one of the
725 * special commands that define a paragraph type.
726 *
727 * (note that \# is special in a way, and \nocite takes no
728 * text)
729 */
730 par.type = para_Normal;
731 if (t.type == tok_cmd) {
732 int needkw;
733 int is_macro = FALSE;
734
735 par.fpos = t.pos;
736 switch (t.cmd) {
737 default:
738 needkw = -1;
739 break;
740 case c__invalid:
741 error(err_badparatype, t.text, &t.pos);
742 needkw = 4;
743 break;
744 case c__comment:
745 if (isbrace(in))
746 break; /* `\#{': isn't a comment para */
747 do {
748 dtor(t), t = get_token(in);
749 } while (t.type != tok_eop && t.type != tok_eof);
750 continue; /* next paragraph */
751 /*
752 * `needkw' values:
753 *
754 * 1 -- exactly one keyword
755 * 2 -- at least one keyword
756 * 4 -- any number of keywords including zero
757 * 8 -- at least one keyword and then nothing else
758 * 16 -- nothing at all! no keywords, no body
759 * 32 -- no keywords at all
760 */
761 case c_A: needkw = 2; par.type = para_Appendix; break;
762 case c_B: needkw = 2; par.type = para_Biblio; break;
763 case c_BR: needkw = 1; par.type = para_BR;
764 start_cmd = c_BR; break;
765 case c_C: needkw = 2; par.type = para_Chapter; break;
766 case c_H: needkw = 2; par.type = para_Heading;
767 par.aux = 0;
768 break;
769 case c_IM: needkw = 2; par.type = para_IM;
770 start_cmd = c_IM; break;
771 case c_S: needkw = 2; par.type = para_Subsect;
772 par.aux = t.aux; break;
773 case c_U: needkw = 32; par.type = para_UnnumberedChapter; break;
774 /* For \b and \n the keyword is optional */
775 case c_b: needkw = 4; par.type = para_Bullet; break;
776 case c_dt: needkw = 4; par.type = para_DescribedThing; break;
777 case c_dd: needkw = 4; par.type = para_Description; break;
778 case c_n: needkw = 4; par.type = para_NumberedList; break;
779 case c_cfg: needkw = 8; par.type = para_Config;
780 start_cmd = c_cfg; break;
781 case c_copyright: needkw = 32; par.type = para_Copyright; break;
782 case c_define: is_macro = TRUE; needkw = 1; break;
783 /* For \nocite the keyword is _everything_ */
784 case c_nocite: needkw = 8; par.type = para_NoCite; break;
785 case c_preamble: needkw = 32; par.type = para_Normal; break;
786 case c_rule: needkw = 16; par.type = para_Rule; break;
787 case c_title: needkw = 32; par.type = para_Title; break;
788 case c_versionid: needkw = 32; par.type = para_VersionID; break;
789 }
790
791 if (par.type == para_Chapter ||
792 par.type == para_Heading ||
793 par.type == para_Subsect ||
794 par.type == para_Appendix ||
795 par.type == para_UnnumberedChapter) {
796 struct crossparaitem *sitem = stk_top(crossparastk);
797 if (sitem && (sitem->seen_lcont || sitem->seen_quote)) {
798 error(err_sectmarkerinblock,
799 &t.pos,
800 (sitem->seen_lcont ? "lcont" : "quote"));
801 }
802 }
803
804 if (needkw > 0) {
805 rdstring rs = { 0, 0, NULL };
806 int nkeys = 0;
807 filepos fp;
808
809 /* Get keywords. */
810 dtor(t), t = get_token(in);
811 fp = t.pos;
812 while (t.type == tok_lbrace) {
813 /* This is a keyword. */
814 nkeys++;
815 /* FIXME: there will be bugs if anyone specifies an
816 * empty keyword (\foo{}), so trap this case. */
817 while (dtor(t), t = get_token(in),
818 t.type == tok_word ||
819 t.type == tok_white ||
820 (t.type == tok_cmd && t.cmd == c__nbsp) ||
821 (t.type == tok_cmd && t.cmd == c__escaped)) {
822 if (t.type == tok_white ||
823 (t.type == tok_cmd && t.cmd == c__nbsp))
824 rdadd(&rs, ' ');
825 else
826 rdadds(&rs, t.text);
827 }
828 if (t.type != tok_rbrace) {
829 error(err_kwunclosed, &t.pos);
830 continue;
831 }
832 rdadd(&rs, 0); /* add string terminator */
833 dtor(t), t = get_token(in); /* eat right brace */
834 }
835
836 rdadd(&rs, 0); /* add string terminator */
837
838 /* See whether we have the right number of keywords. */
839 if ((needkw & 48) && nkeys > 0)
840 error(err_kwillegal, &fp);
841 if ((needkw & 11) && nkeys == 0)
842 error(err_kwexpected, &fp);
843 if ((needkw & 5) && nkeys > 1)
844 error(err_kwtoomany, &fp);
845
846 if (is_macro) {
847 /*
848 * Macro definition. Get the rest of the line
849 * as a code-paragraph token, repeatedly until
850 * there's nothing more left of it. Separate
851 * with newlines.
852 */
853 rdstring macrotext = { 0, 0, NULL };
854 while (1) {
855 dtor(t), t = get_codepar_token(in);
856 if (macrotext.pos > 0)
857 rdadd(&macrotext, L'\n');
858 rdadds(&macrotext, t.text);
859 dtor(t), t = get_token(in);
860 if (t.type == tok_eop) break;
861 }
862 macrodef(macros, rs.text, macrotext.text, fp);
863 continue; /* next paragraph */
864 }
865
866 par.keyword = rdtrim(&rs);
867
868 /* Move to EOP in case of needkw==8 or 16 (no body) */
869 if (needkw & 24) {
870 /* We allow whitespace even when we expect no para body */
871 while (t.type == tok_white)
872 dtor(t), t = get_token(in);
873 if (t.type != tok_eop && t.type != tok_eof &&
874 (start_cmd == c__invalid ||
875 t.type != tok_cmd || t.cmd != start_cmd)) {
876 error(err_bodyillegal, &t.pos);
877 /* Error recovery: eat the rest of the paragraph */
878 while (t.type != tok_eop && t.type != tok_eof &&
879 (start_cmd == c__invalid ||
880 t.type != tok_cmd || t.cmd != start_cmd))
881 dtor(t), t = get_token(in);
882 }
883 if (t.type == tok_cmd)
884 already = TRUE;/* inhibit get_token at top of loop */
885 prev_para_type = par.type;
886 addpara(par, ret);
887 continue; /* next paragraph */
888 }
889 }
890 }
891
892 /*
893 * Now read the actual paragraph, word by word, adding to
894 * the paragraph list.
895 *
896 * Mid-paragraph commands:
897 *
898 * \K \k
899 * \c \cw
900 * \e
901 * \i \ii
902 * \I
903 * \u
904 * \W
905 * \date
906 * \\ \{ \}
907 */
908 parsestk = stk_new();
909 style = word_Normal;
910 spcstyle = word_WhiteSpace;
911 indexing = FALSE;
912 seenwhite = TRUE;
913 while (t.type != tok_eop && t.type != tok_eof) {
914 iswhite = FALSE;
915 already = FALSE;
916
917 /* Handle implicit paragraph breaks after \IM, \BR etc */
918 if (start_cmd != c__invalid &&
919 t.type == tok_cmd && t.cmd == start_cmd) {
920 already = TRUE; /* inhibit get_token at top of loop */
921 break;
922 }
923
924 if (t.type == tok_cmd && t.cmd == c__nop) {
925 dtor(t), t = get_token(in);
926 continue; /* do nothing! */
927 }
928
929 if (t.type == tok_cmd && t.cmd == c__escaped) {
930 t.type = tok_word; /* nice and simple */
931 t.aux = 0; /* even if `\-' - nonbreaking! */
932 }
933 if (t.type == tok_cmd && t.cmd == c__nbsp) {
934 t.type = tok_word; /* nice and simple */
935 sfree(t.text);
936 t.text = ustrdup(L" "); /* text is ` ' not `_' */
937 t.aux = 0; /* (nonbreaking) */
938 }
939 switch (t.type) {
940 case tok_white:
941 if (whptr == &par.words)
942 break; /* strip whitespace at start of para */
943 wd.text = NULL;
944 wd.type = spcstyle;
945 wd.alt = NULL;
946 wd.aux = 0;
947 wd.fpos = t.pos;
948 wd.breaks = FALSE;
949
950 /*
951 * Inhibit use of whitespace if it's (probably the
952 * newline) before a repeat \IM / \BR type
953 * directive.
954 */
955 if (start_cmd != c__invalid) {
956 dtor(t), t = get_token(in);
957 already = TRUE;
958 if (t.type == tok_cmd && t.cmd == start_cmd)
959 break;
960 }
961
962 if (indexing)
963 rdadd(&indexstr, ' ');
964 if (!indexing || index_visible)
965 addword(wd, &whptr);
966 if (indexing)
967 addword(wd, &idximplicit);
968 iswhite = TRUE;
969 break;
970 case tok_word:
971 if (indexing)
972 rdadds(&indexstr, t.text);
973 wd.type = style;
974 wd.alt = NULL;
975 wd.aux = 0;
976 wd.fpos = t.pos;
977 wd.breaks = t.aux;
978 if (!indexing || index_visible) {
979 wd.text = ustrdup(t.text);
980 addword(wd, &whptr);
981 }
982 if (indexing) {
983 wd.text = ustrdup(t.text);
984 addword(wd, &idximplicit);
985 }
986 break;
987 case tok_lbrace:
988 error(err_unexbrace, &t.pos);
989 /* Error recovery: push nop */
990 sitem = mknew(struct stack_item);
991 sitem->type = stack_nop;
992 sitem->fpos = t.pos;
993 stk_push(parsestk, sitem);
994 break;
995 case tok_rbrace:
996 sitem = stk_pop(parsestk);
997 if (!sitem) {
998 /*
999 * This closing brace could have been an
1000 * indication that the cross-paragraph stack
1001 * wants popping. Accordingly, we treat it here
1002 * as an indication that the paragraph is over.
1003 */
1004 already = TRUE;
1005 goto finished_para;
1006 } else {
1007 if (sitem->type & stack_ualt) {
1008 whptr = sitem->whptr;
1009 idximplicit = sitem->idximplicit;
1010 }
1011 if (sitem->type & stack_style) {
1012 style = word_Normal;
1013 spcstyle = word_WhiteSpace;
1014 }
1015 if (sitem->type & stack_idx) {
1016 indexword->text = ustrdup(indexstr.text);
1017 if (index_downcase) {
1018 word *w;
1019
1020 ustrlow(indexword->text);
1021 ustrlow(indexstr.text);
1022
1023 for (w = idxwordlist; w; w = w->next)
1024 if (w->text)
1025 ustrlow(w->text);
1026 }
1027 indexing = FALSE;
1028 rdadd(&indexstr, L'\0');
1029 index_merge(idx, FALSE, indexstr.text,
1030 idxwordlist, &sitem->fpos);
1031 sfree(indexstr.text);
1032 }
1033 if (sitem->type & stack_hyper) {
1034 wd.text = NULL;
1035 wd.type = word_HyperEnd;
1036 wd.alt = NULL;
1037 wd.aux = 0;
1038 wd.fpos = t.pos;
1039 wd.breaks = FALSE;
1040 if (!indexing || index_visible)
1041 addword(wd, &whptr);
1042 if (indexing)
1043 addword(wd, &idximplicit);
1044 }
1045 if (sitem->type & stack_quote) {
1046 wd.text = NULL;
1047 wd.type = toquotestyle(style);
1048 wd.alt = NULL;
1049 wd.aux = quote_Close;
1050 wd.fpos = t.pos;
1051 wd.breaks = FALSE;
1052 if (!indexing || index_visible)
1053 addword(wd, &whptr);
1054 if (indexing) {
1055 rdadd(&indexstr, L'"');
1056 addword(wd, &idximplicit);
1057 }
1058 }
1059 }
1060 sfree(sitem);
1061 break;
1062 case tok_cmd:
1063 switch (t.cmd) {
1064 case c__comment:
1065 /*
1066 * In-paragraph comment: \#{ balanced braces }
1067 *
1068 * Anything goes here; even tok_eop. We should
1069 * eat whitespace after the close brace _if_
1070 * there was whitespace before the \#.
1071 */
1072 dtor(t), t = get_token(in);
1073 if (t.type != tok_lbrace) {
1074 error(err_explbr, &t.pos);
1075 } else {
1076 int braces = 1;
1077 while (braces > 0) {
1078 dtor(t), t = get_token(in);
1079 if (t.type == tok_lbrace)
1080 braces++;
1081 else if (t.type == tok_rbrace)
1082 braces--;
1083 else if (t.type == tok_eof) {
1084 error(err_commenteof, &t.pos);
1085 break;
1086 }
1087 }
1088 }
1089 if (seenwhite) {
1090 already = TRUE;
1091 dtor(t), t = get_token(in);
1092 if (t.type == tok_white) {
1093 iswhite = TRUE;
1094 already = FALSE;
1095 }
1096 }
1097 break;
1098 case c_q:
1099 dtor(t), t = get_token(in);
1100 if (t.type != tok_lbrace) {
1101 error(err_explbr, &t.pos);
1102 } else {
1103 wd.text = NULL;
1104 wd.type = toquotestyle(style);
1105 wd.alt = NULL;
1106 wd.aux = quote_Open;
1107 wd.fpos = t.pos;
1108 wd.breaks = FALSE;
1109 if (!indexing || index_visible)
1110 addword(wd, &whptr);
1111 if (indexing) {
1112 rdadd(&indexstr, L'"');
1113 addword(wd, &idximplicit);
1114 }
1115 sitem = mknew(struct stack_item);
1116 sitem->fpos = t.pos;
1117 sitem->type = stack_quote;
1118 stk_push(parsestk, sitem);
1119 }
1120 break;
1121 case c_K:
1122 case c_k:
1123 case c_W:
1124 case c_date:
1125 /*
1126 * Keyword, hyperlink, or \date. We expect a
1127 * left brace, some text, and then a right
1128 * brace. No nesting; no arguments.
1129 */
1130 wd.fpos = t.pos;
1131 wd.breaks = FALSE;
1132 if (t.cmd == c_K)
1133 wd.type = word_UpperXref;
1134 else if (t.cmd == c_k)
1135 wd.type = word_LowerXref;
1136 else if (t.cmd == c_W)
1137 wd.type = word_HyperLink;
1138 else
1139 wd.type = word_Normal;
1140 dtor(t), t = get_token(in);
1141 if (t.type != tok_lbrace) {
1142 if (wd.type == word_Normal) {
1143 time_t thetime = time(NULL);
1144 struct tm *broken = localtime(&thetime);
1145 already = TRUE;
1146 wdtext = ustrftime(NULL, broken);
1147 wd.type = style;
1148 } else {
1149 error(err_explbr, &t.pos);
1150 wdtext = NULL;
1151 }
1152 } else {
1153 rdstring rs = { 0, 0, NULL };
1154 while (dtor(t), t = get_token(in),
1155 t.type == tok_word || t.type == tok_white) {
1156 if (t.type == tok_white)
1157 rdadd(&rs, ' ');
1158 else
1159 rdadds(&rs, t.text);
1160 }
1161 if (wd.type == word_Normal) {
1162 time_t thetime = time(NULL);
1163 struct tm *broken = localtime(&thetime);
1164 wdtext = ustrftime(rs.text, broken);
1165 wd.type = style;
1166 } else {
1167 wdtext = ustrdup(rs.text);
1168 }
1169 sfree(rs.text);
1170 if (t.type != tok_rbrace) {
1171 error(err_kwexprbr, &t.pos);
1172 }
1173 }
1174 wd.alt = NULL;
1175 wd.aux = 0;
1176 if (!indexing || index_visible) {
1177 wd.text = ustrdup(wdtext);
1178 addword(wd, &whptr);
1179 }
1180 if (indexing) {
1181 wd.text = ustrdup(wdtext);
1182 addword(wd, &idximplicit);
1183 }
1184 sfree(wdtext);
1185 if (wd.type == word_HyperLink) {
1186 /*
1187 * Hyperlinks are different: they then
1188 * expect another left brace, to begin
1189 * delimiting the text marked by the link.
1190 */
1191 dtor(t), t = get_token(in);
1192 sitem = mknew(struct stack_item);
1193 sitem->fpos = wd.fpos;
1194 sitem->type = stack_hyper;
1195 /*
1196 * Special cases: \W{}\i, \W{}\ii
1197 */
1198 if (t.type == tok_cmd &&
1199 (t.cmd == c_i || t.cmd == c_ii)) {
1200 if (indexing) {
1201 error(err_nestedindex, &t.pos);
1202 } else {
1203 /* Add an index-reference word with no
1204 * text as yet */
1205 wd.type = word_IndexRef;
1206 wd.text = NULL;
1207 wd.alt = NULL;
1208 wd.aux = 0;
1209 wd.breaks = FALSE;
1210 indexword = addword(wd, &whptr);
1211 /* Set up a rdstring to read the
1212 * index text */
1213 indexstr = nullrs;
1214 /* Flags so that we do the Right
1215 * Things with text */
1216 index_visible = (type != c_I);
1217 index_downcase = (type == c_ii);
1218 indexing = TRUE;
1219 idxwordlist = NULL;
1220 idximplicit = &idxwordlist;
1221
1222 sitem->type |= stack_idx;
1223 }
1224 dtor(t), t = get_token(in);
1225 }
1226 /*
1227 * Special cases: \W{}\c, \W{}\e, \W{}\cw
1228 */
1229 if (t.type == tok_cmd &&
1230 (t.cmd == c_e || t.cmd == c_c || t.cmd == c_cw)) {
1231 if (style != word_Normal)
1232 error(err_nestedstyles, &t.pos);
1233 else {
1234 style = (t.cmd == c_c ? word_Code :
1235 t.cmd == c_cw ? word_WeakCode :
1236 word_Emph);
1237 spcstyle = tospacestyle(style);
1238 sitem->type |= stack_style;
1239 }
1240 dtor(t), t = get_token(in);
1241 }
1242 if (t.type != tok_lbrace) {
1243 error(err_explbr, &t.pos);
1244 sfree(sitem);
1245 } else {
1246 stk_push(parsestk, sitem);
1247 }
1248 }
1249 break;
1250 case c_c:
1251 case c_cw:
1252 case c_e:
1253 type = t.cmd;
1254 if (style != word_Normal) {
1255 error(err_nestedstyles, &t.pos);
1256 /* Error recovery: eat lbrace, push nop. */
1257 dtor(t), t = get_token(in);
1258 sitem = mknew(struct stack_item);
1259 sitem->fpos = t.pos;
1260 sitem->type = stack_nop;
1261 stk_push(parsestk, sitem);
1262 }
1263 dtor(t), t = get_token(in);
1264 if (t.type != tok_lbrace) {
1265 error(err_explbr, &t.pos);
1266 } else {
1267 style = (type == c_c ? word_Code :
1268 type == c_cw ? word_WeakCode :
1269 word_Emph);
1270 spcstyle = tospacestyle(style);
1271 sitem = mknew(struct stack_item);
1272 sitem->fpos = t.pos;
1273 sitem->type = stack_style;
1274 stk_push(parsestk, sitem);
1275 }
1276 break;
1277 case c_i:
1278 case c_ii:
1279 case c_I:
1280 type = t.cmd;
1281 if (indexing) {
1282 error(err_nestedindex, &t.pos);
1283 /* Error recovery: eat lbrace, push nop. */
1284 dtor(t), t = get_token(in);
1285 sitem = mknew(struct stack_item);
1286 sitem->fpos = t.pos;
1287 sitem->type = stack_nop;
1288 stk_push(parsestk, sitem);
1289 }
1290 sitem = mknew(struct stack_item);
1291 sitem->fpos = t.pos;
1292 sitem->type = stack_idx;
1293 dtor(t), t = get_token(in);
1294 /*
1295 * Special cases: \i\c, \i\e, \i\cw
1296 */
1297 wd.fpos = t.pos;
1298 if (t.type == tok_cmd &&
1299 (t.cmd == c_e || t.cmd == c_c || t.cmd == c_cw)) {
1300 if (style != word_Normal)
1301 error(err_nestedstyles, &t.pos);
1302 else {
1303 style = (t.cmd == c_c ? word_Code :
1304 t.cmd == c_cw ? word_WeakCode :
1305 word_Emph);
1306 spcstyle = tospacestyle(style);
1307 sitem->type |= stack_style;
1308 }
1309 dtor(t), t = get_token(in);
1310 }
1311 if (t.type != tok_lbrace) {
1312 sfree(sitem);
1313 error(err_explbr, &t.pos);
1314 } else {
1315 /* Add an index-reference word with no text as yet */
1316 wd.type = word_IndexRef;
1317 wd.text = NULL;
1318 wd.alt = NULL;
1319 wd.aux = 0;
1320 wd.breaks = FALSE;
1321 indexword = addword(wd, &whptr);
1322 /* Set up a rdstring to read the index text */
1323 indexstr = nullrs;
1324 /* Flags so that we do the Right Things with text */
1325 index_visible = (type != c_I);
1326 index_downcase = (type == c_ii);
1327 indexing = TRUE;
1328 idxwordlist = NULL;
1329 idximplicit = &idxwordlist;
1330 /* Stack item to close the indexing on exit */
1331 stk_push(parsestk, sitem);
1332 }
1333 break;
1334 case c_u:
1335 uchr = t.aux;
1336 utext[0] = uchr; utext[1] = 0;
1337 wd.type = style;
1338 wd.breaks = FALSE;
1339 wd.alt = NULL;
1340 wd.aux = 0;
1341 wd.fpos = t.pos;
1342 if (!indexing || index_visible) {
1343 wd.text = ustrdup(utext);
1344 uword = addword(wd, &whptr);
1345 } else
1346 uword = NULL;
1347 if (indexing) {
1348 wd.text = ustrdup(utext);
1349 iword = addword(wd, &idximplicit);
1350 } else
1351 iword = NULL;
1352 dtor(t), t = get_token(in);
1353 if (t.type == tok_lbrace) {
1354 /*
1355 * \u with a left brace. Until the brace
1356 * closes, all further words go on a
1357 * sidetrack from the main thread of the
1358 * paragraph.
1359 */
1360 sitem = mknew(struct stack_item);
1361 sitem->fpos = t.pos;
1362 sitem->type = stack_ualt;
1363 sitem->whptr = whptr;
1364 sitem->idximplicit = idximplicit;
1365 stk_push(parsestk, sitem);
1366 whptr = uword ? &uword->alt : NULL;
1367 idximplicit = iword ? &iword->alt : NULL;
1368 } else {
1369 if (indexing)
1370 rdadd(&indexstr, uchr);
1371 already = TRUE;
1372 }
1373 break;
1374 default:
1375 if (!macrolookup(macros, in, t.text, &t.pos))
1376 error(err_badmidcmd, t.text, &t.pos);
1377 break;
1378 }
1379 }
1380 if (!already)
1381 dtor(t), t = get_token(in);
1382 seenwhite = iswhite;
1383 }
1384 finished_para:
1385 /* Check the stack is empty */
1386 if (stk_top(parsestk)) {
1387 while ((sitem = stk_pop(parsestk)))
1388 sfree(sitem);
1389 error(err_missingrbrace, &t.pos);
1390 }
1391 stk_free(parsestk);
1392 prev_para_type = par.type;
1393 addpara(par, ret);
1394 if (t.type == tok_eof)
1395 already = TRUE;
1396 }
1397
1398 if (stk_top(crossparastk)) {
1399 void *p;
1400
1401 error(err_missingrbrace2, &t.pos);
1402 while ((p = stk_pop(crossparastk)))
1403 sfree(p);
1404 }
1405
1406 /*
1407 * We break to here rather than returning, because otherwise
1408 * this cleanup doesn't happen.
1409 */
1410 dtor(t);
1411 macrocleanup(macros);
1412
1413 stk_free(crossparastk);
1414 }
1415
1416 paragraph *read_input(input *in, indexdata *idx) {
1417 paragraph *head = NULL;
1418 paragraph **hptr = &head;
1419
1420 while (in->currindex < in->nfiles) {
1421 in->currfp = fopen(in->filenames[in->currindex], "r");
1422 if (in->currfp) {
1423 setpos(in, in->filenames[in->currindex]);
1424 read_file(&hptr, in, idx);
1425 }
1426 in->currindex++;
1427 }
1428
1429 return head;
1430 }