Just had a play with this newfangled `valgrind' memory debugger
[sgt/halibut] / input.c
1 /*
2 * input.c: read the source form
3 */
4
5 #include <stdio.h>
6 #include <assert.h>
7 #include <time.h>
8 #include "halibut.h"
9
10 #define TAB_STOP 8 /* for column number tracking */
11
12 static void setpos(input *in, char *fname) {
13 in->pos.filename = fname;
14 in->pos.line = 1;
15 in->pos.col = (in->reportcols ? 1 : -1);
16 }
17
18 static void unget(input *in, int c, filepos *pos) {
19 if (in->npushback >= in->pushbacksize) {
20 in->pushbacksize = in->npushback + 16;
21 in->pushback = resize(in->pushback, in->pushbacksize);
22 }
23 in->pushback[in->npushback].chr = c;
24 in->pushback[in->npushback].pos = *pos; /* structure copy */
25 in->npushback++;
26 }
27
28 /* ---------------------------------------------------------------------- */
29 /*
30 * Macro subsystem
31 */
/*
 * A user-defined macro: the name it is invoked by and the text it
 * expands to. Both strings are freed by macrocleanup().
 */
typedef struct macro_Tag macro;
struct macro_Tag {
    wchar_t *name;
    wchar_t *text;
};
36 struct macrostack_Tag {
37 macrostack *next;
38 wchar_t *text;
39 int ptr, npushback;
40 filepos pos;
41 };
42 static int macrocmp(void *av, void *bv) {
43 macro *a = (macro *)av, *b = (macro *)bv;
44 return ustrcmp(a->name, b->name);
45 }
46 static void macrodef(tree234 *macros, wchar_t *name, wchar_t *text,
47 filepos fpos) {
48 macro *m = mknew(macro);
49 m->name = name;
50 m->text = text;
51 if (add234(macros, m) != m) {
52 error(err_macroexists, &fpos, name);
53 sfree(name);
54 sfree(text);
55 }
56 }
57 static int macrolookup(tree234 *macros, input *in, wchar_t *name,
58 filepos *pos) {
59 macro m, *gotit;
60 m.name = name;
61 gotit = find234(macros, &m, NULL);
62 if (gotit) {
63 macrostack *expansion = mknew(macrostack);
64 expansion->next = in->stack;
65 expansion->text = gotit->text;
66 expansion->pos = *pos; /* structure copy */
67 expansion->ptr = 0;
68 expansion->npushback = in->npushback;
69 in->stack = expansion;
70 return TRUE;
71 } else
72 return FALSE;
73 }
74 static void macrocleanup(tree234 *macros) {
75 int ti;
76 macro *m;
77 for (ti = 0; (m = (macro *)index234(macros, ti)) != NULL; ti++) {
78 sfree(m->name);
79 sfree(m->text);
80 sfree(m);
81 }
82 freetree234(macros);
83 }
84
85 /*
86 * Can return EOF
87 */
88 static int get(input *in, filepos *pos) {
89 int pushbackpt = in->stack ? in->stack->npushback : 0;
90 if (in->npushback > pushbackpt) {
91 --in->npushback;
92 if (pos)
93 *pos = in->pushback[in->npushback].pos; /* structure copy */
94 return in->pushback[in->npushback].chr;
95 }
96 else if (in->stack) {
97 wchar_t c = in->stack->text[in->stack->ptr];
98 if (in->stack->text[++in->stack->ptr] == L'\0') {
99 macrostack *tmp = in->stack;
100 in->stack = tmp->next;
101 sfree(tmp);
102 }
103 return c;
104 }
105 else if (in->currfp) {
106 int c = getc(in->currfp);
107
108 if (c == EOF) {
109 fclose(in->currfp);
110 in->currfp = NULL;
111 }
112 /* Track line numbers, for error reporting */
113 if (pos)
114 *pos = in->pos;
115 if (in->reportcols) {
116 switch (c) {
117 case '\t':
118 in->pos.col = 1 + (in->pos.col + TAB_STOP-1) % TAB_STOP;
119 break;
120 case '\n':
121 in->pos.col = 1;
122 in->pos.line++;
123 break;
124 default:
125 in->pos.col++;
126 break;
127 }
128 } else {
129 in->pos.col = -1;
130 if (c == '\n')
131 in->pos.line++;
132 }
133 /* FIXME: do input charmap translation. We should be returning
134 * Unicode here. */
135 return c;
136 } else
137 return EOF;
138 }
139
140 /*
141 * Lexical analysis of source files.
142 */
143 typedef struct token_Tag token;
144 struct token_Tag {
145 int type;
146 int cmd, aux;
147 wchar_t *text;
148 filepos pos;
149 };
/* Token types, stored in the `type' field of a token. */
enum {
    tok_eof = 0,                       /* end of file */
    tok_eop,                           /* end of paragraph */
    tok_white,                         /* whitespace */
    tok_word,                          /* a word or word fragment */
    tok_cmd,                           /* \command */
    tok_lbrace,                        /* { */
    tok_rbrace                         /* } */
};
159
/*
 * Halibut command keywords, stored in the `cmd' field of a tok_cmd
 * token by match_kw().
 */
enum {
    /* Pseudo-commands */
    c__invalid = 0,                    /* invalid command */
    c__comment,                        /* comment command (\#) */
    c__escaped,                        /* escaped character */
    c__nbsp,                           /* nonbreaking space */

    /* Upper-case commands */
    c_A,                               /* appendix heading */
    c_B,                               /* bibliography entry */
    c_BR,                              /* bibliography rewrite */
    c_C,                               /* chapter heading */
    c_H,                               /* heading */
    c_I,                               /* invisible index mark */
    c_IM,                              /* index merge/rewrite */
    c_K,                               /* capitalised cross-reference */
    c_S,                               /* aux field is 0, 1, 2, ... */
    c_U,                               /* unnumbered-chapter heading */
    c_W,                               /* Web hyperlink */

    /* Lower-case commands */
    c_b,                               /* bulleted list */
    c_c,                               /* code */
    c_cfg,                             /* configuration directive */
    c_copyright,                       /* copyright statement */
    c_cw,                              /* weak code */
    c_date,                            /* document processing date */
    c_define,                          /* macro definition */
    c_e,                               /* emphasis */
    c_i,                               /* visible index mark */
    c_ii,                              /* uncapitalised visible index mark */
    c_k,                               /* uncapitalised cross-reference */
    c_n,                               /* numbered list */
    c_nocite,                          /* bibliography trickery */
    c_preamble,                        /* document preamble text */
    c_q,                               /* quote marks */
    c_rule,                            /* horizontal rule */
    c_title,                           /* document title */
    c_u,                               /* aux field is char code */
    c_versionid                        /* document RCS id */
};
197
/*
 * Character classification helpers. All of these may evaluate their
 * argument more than once, so pass only side-effect-free expressions.
 *
 * Perhaps whitespace should be defined in a more Unicode-friendly way?
 */
#define iswhite(c) ( (c)==9 || (c)==10 || (c)==13 || (c)==32 )
#define isnl(c)    ( (c)==10 )
#define isdec(c)   ( (c)>='0' && (c)<='9' )
#define fromdec(c) ( (c)-'0' )
#define ishex(c)   ( isdec(c) || \
                     ((c)>='A' && (c)<='F') || \
                     ((c)>='a' && (c)<='f') )
/* Masking with 0xDF folds a lower-case hex letter to upper case. */
#define fromhex(c) ( (c)<='9' ? fromdec(c) : ((c)&0xDF) - ('A'-10) )
#define iscmd(c)   ( isdec(c) || \
                     ((c)>='A' && (c)<='Z') || \
                     ((c)>='a' && (c)<='z') )
206
/*
 * Keyword comparison function. Like strcmp, but between a wchar_t *
 * and a char *: returns zero if the strings match, and otherwise the
 * difference between the first pair of characters that differ.
 */
static int kwcmp(wchar_t const *p, char const *q) {
    int diff;

    for (;;) {
        diff = *p - *q;
        /* Stop at the end of either string or at the first mismatch. */
        if (*p == 0 || *q == 0 || diff != 0)
            break;
        p++;
        q++;
    }
    return diff;
}
218
219 /*
220 * Match a keyword.
221 */
222 static void match_kw(token *tok) {
223 /*
224 * FIXME. The ids are explicit in here so as to allow long-name
225 * equivalents to the various very short keywords.
226 */
227 static const struct { char const *name; int id; } keywords[] = {
228 {"#", c__comment}, /* comment command (\#) */
229 {"-", c__escaped}, /* nonbreaking hyphen */
230 {"A", c_A}, /* appendix heading */
231 {"B", c_B}, /* bibliography entry */
232 {"BR", c_BR}, /* bibliography rewrite */
233 {"C", c_C}, /* chapter heading */
234 {"H", c_H}, /* heading */
235 {"I", c_I}, /* invisible index mark */
236 {"IM", c_IM}, /* index merge/rewrite */
237 {"K", c_K}, /* capitalised cross-reference */
238 {"U", c_U}, /* unnumbered-chapter heading */
239 {"W", c_W}, /* Web hyperlink */
240 {"\\", c__escaped}, /* escaped backslash (\\) */
241 {"_", c__nbsp}, /* nonbreaking space (\_) */
242 {"b", c_b}, /* bulletted list */
243 {"c", c_c}, /* code */
244 {"cfg", c_cfg}, /* configuration directive */
245 {"copyright", c_copyright}, /* copyright statement */
246 {"cw", c_cw}, /* weak code */
247 {"date", c_date}, /* document processing date */
248 {"define", c_define}, /* macro definition */
249 {"e", c_e}, /* emphasis */
250 {"i", c_i}, /* visible index mark */
251 {"ii", c_ii}, /* uncapitalised visible index mark */
252 {"k", c_k}, /* uncapitalised cross-reference */
253 {"n", c_n}, /* numbered list */
254 {"nocite", c_nocite}, /* bibliography trickery */
255 {"preamble", c_preamble}, /* document preamble text */
256 {"q", c_q}, /* quote marks */
257 {"rule", c_rule}, /* horizontal rule */
258 {"title", c_title}, /* document title */
259 {"versionid", c_versionid}, /* document RCS id */
260 {"{", c__escaped}, /* escaped lbrace (\{) */
261 {"}", c__escaped}, /* escaped rbrace (\}) */
262 };
263 int i, j, k, c;
264
265 /*
266 * Special cases: \S{0,1,2,...} and \uABCD. If the syntax
267 * doesn't match correctly, we just fall through to the
268 * binary-search phase.
269 */
270 if (tok->text[0] == 'S') {
271 /* We expect numeric characters thereafter. */
272 wchar_t *p = tok->text+1;
273 int n;
274 if (!*p)
275 n = 1;
276 else {
277 n = 0;
278 while (*p && isdec(*p)) {
279 n = 10 * n + fromdec(*p);
280 p++;
281 }
282 }
283 if (!*p) {
284 tok->cmd = c_S;
285 tok->aux = n;
286 return;
287 }
288 } else if (tok->text[0] == 'u') {
289 /* We expect hex characters thereafter. */
290 wchar_t *p = tok->text+1;
291 int n = 0;
292 while (*p && ishex(*p)) {
293 n = 16 * n + fromhex(*p);
294 p++;
295 }
296 if (!*p) {
297 tok->cmd = c_u;
298 tok->aux = n;
299 return;
300 }
301 }
302
303 i = -1;
304 j = sizeof(keywords)/sizeof(*keywords);
305 while (j-i > 1) {
306 k = (i+j)/2;
307 c = kwcmp(tok->text, keywords[k].name);
308 if (c < 0)
309 j = k;
310 else if (c > 0)
311 i = k;
312 else /* c == 0 */ {
313 tok->cmd = keywords[k].id;
314 return;
315 }
316 }
317
318 tok->cmd = c__invalid;
319 }
320
321
322 /*
323 * Read a token from the input file, in the normal way (`normal' in
324 * the sense that code paragraphs work a different way).
325 */
326 token get_token(input *in) {
327 int c;
328 int nls;
329 token ret;
330 rdstring rs = { 0, 0, NULL };
331 filepos cpos;
332
333 ret.text = NULL; /* default */
334 c = get(in, &cpos);
335 ret.pos = cpos;
336 if (iswhite(c)) { /* tok_white or tok_eop */
337 nls = 0;
338 do {
339 if (isnl(c))
340 nls++;
341 } while ((c = get(in, &cpos)) != EOF && iswhite(c));
342 if (c == EOF) {
343 ret.type = tok_eof;
344 return ret;
345 }
346 unget(in, c, &cpos);
347 ret.type = (nls > 1 ? tok_eop : tok_white);
348 return ret;
349 } else if (c == EOF) { /* tok_eof */
350 ret.type = tok_eof;
351 return ret;
352 } else if (c == '\\') { /* tok_cmd */
353 c = get(in, &cpos);
354 if (c == '-' || c == '\\' || c == '_' ||
355 c == '#' || c == '{' || c == '}') {
356 /* single-char command */
357 rdadd(&rs, c);
358 } else if (c == 'u') {
359 int len = 0;
360 do {
361 rdadd(&rs, c);
362 len++;
363 c = get(in, &cpos);
364 } while (ishex(c) && len < 5);
365 unget(in, c, &cpos);
366 } else if (iscmd(c)) {
367 do {
368 rdadd(&rs, c);
369 c = get(in, &cpos);
370 } while (iscmd(c));
371 unget(in, c, &cpos);
372 }
373 /*
374 * Now match the command against the list of available
375 * ones.
376 */
377 ret.type = tok_cmd;
378 ret.text = ustrdup(rs.text);
379 match_kw(&ret);
380 sfree(rs.text);
381 return ret;
382 } else if (c == '{') { /* tok_lbrace */
383 ret.type = tok_lbrace;
384 return ret;
385 } else if (c == '}') { /* tok_rbrace */
386 ret.type = tok_rbrace;
387 return ret;
388 } else { /* tok_word */
389 /*
390 * Read a word: the longest possible contiguous sequence of
391 * things other than whitespace, backslash, braces and
392 * hyphen. A hyphen terminates the word but is returned as
393 * part of it; everything else is pushed back for the next
394 * token. The `aux' field contains TRUE if the word ends in
395 * a hyphen.
396 */
397 ret.aux = FALSE; /* assumed for now */
398 while (1) {
399 if (iswhite(c) || c=='{' || c=='}' || c=='\\' || c==EOF) {
400 /* Put back the character that caused termination */
401 unget(in, c, &cpos);
402 break;
403 } else {
404 rdadd(&rs, c);
405 if (c == '-') {
406 ret.aux = TRUE;
407 break; /* hyphen terminates word */
408 }
409 }
410 c = get(in, &cpos);
411 }
412 ret.type = tok_word;
413 ret.text = ustrdup(rs.text);
414 sfree(rs.text);
415 return ret;
416 }
417 }
418
419 /*
420 * Determine whether the next input character is an open brace (for
421 * telling code paragraphs from paragraphs which merely start with
422 * code).
423 */
424 int isbrace(input *in) {
425 int c;
426 filepos cpos;
427
428 c = get(in, &cpos);
429 unget(in, c, &cpos);
430 return (c == '{');
431 }
432
433 /*
434 * Read the rest of a line that starts `\c'. Including nothing at
435 * all (tok_word with empty text).
436 */
437 token get_codepar_token(input *in) {
438 int c;
439 token ret;
440 rdstring rs = { 0, 0, NULL };
441 filepos cpos;
442
443 ret.type = tok_word;
444 c = get(in, &cpos); /* expect (and discard) one space */
445 ret.pos = cpos;
446 if (c == ' ') {
447 c = get(in, &cpos);
448 ret.pos = cpos;
449 }
450 while (!isnl(c) && c != EOF) {
451 int c2 = c;
452 c = get(in, &cpos);
453 /* Discard \r just before \n. */
454 if (c2 != 13 || !isnl(c))
455 rdadd(&rs, c2);
456 }
457 unget(in, c, &cpos);
458 ret.text = ustrdup(rs.text);
459 sfree(rs.text);
460 return ret;
461 }
462
463 /*
464 * Adds a new word to a linked list
465 */
466 static word *addword(word newword, word ***hptrptr) {
467 word *mnewword;
468 if (!hptrptr)
469 return NULL;
470 mnewword = mknew(word);
471 *mnewword = newword; /* structure copy */
472 mnewword->next = NULL;
473 **hptrptr = mnewword;
474 *hptrptr = &mnewword->next;
475 return mnewword;
476 }
477
478 /*
479 * Adds a new paragraph to a linked list
480 */
481 static paragraph *addpara(paragraph newpara, paragraph ***hptrptr) {
482 paragraph *mnewpara = mknew(paragraph);
483 *mnewpara = newpara; /* structure copy */
484 mnewpara->next = NULL;
485 **hptrptr = mnewpara;
486 *hptrptr = &mnewpara->next;
487 return mnewpara;
488 }
489
/*
 * Destructor for a token: frees its text. Call this before a token
 * variable is reassigned; should catch most memory leaks.
 */
#define dtor(t) ( sfree((t).text) )
495
496 /*
497 * Reads a single file (ie until get() returns EOF)
498 */
499 static void read_file(paragraph ***ret, input *in, indexdata *idx) {
500 token t;
501 paragraph par;
502 word wd, **whptr, **idximplicit;
503 tree234 *macros;
504 wchar_t utext[2], *wdtext;
505 int style, spcstyle;
506 int already;
507 int iswhite, seenwhite;
508 int type;
509 struct stack_item {
510 enum {
511 stack_nop = 0, /* do nothing (for error recovery) */
512 stack_ualt = 1, /* \u alternative */
513 stack_style = 2, /* \e, \c, \cw */
514 stack_idx = 4, /* \I, \i, \ii */
515 stack_hyper = 8, /* \W */
516 stack_quote = 16, /* \q */
517 } type;
518 word **whptr; /* to restore from \u alternatives */
519 word **idximplicit; /* to restore from \u alternatives */
520 } *sitem;
521 stack parsestk;
522 word *indexword, *uword, *iword;
523 word *idxwordlist;
524 rdstring indexstr;
525 int index_downcase, index_visible, indexing;
526 const rdstring nullrs = { 0, 0, NULL };
527 wchar_t uchr;
528
529 t.text = NULL;
530 macros = newtree234(macrocmp);
531 already = FALSE;
532
533 /*
534 * Loop on each paragraph.
535 */
536 while (1) {
537 int start_cmd = c__invalid;
538 par.words = NULL;
539 par.keyword = NULL;
540 whptr = &par.words;
541
542 /*
543 * Get a token.
544 */
545 if (!already) {
546 dtor(t), t = get_token(in);
547 }
548 already = FALSE;
549 if (t.type == tok_eof)
550 break;
551
552 /*
553 * Parse code paragraphs separately.
554 */
555 if (t.type == tok_cmd && t.cmd == c_c && !isbrace(in)) {
556 par.type = para_Code;
557 par.fpos = t.pos;
558 while (1) {
559 dtor(t), t = get_codepar_token(in);
560 wd.type = word_WeakCode;
561 wd.breaks = FALSE; /* shouldn't need this... */
562 wd.text = ustrdup(t.text);
563 wd.alt = NULL;
564 wd.fpos = t.pos;
565 addword(wd, &whptr);
566 dtor(t), t = get_token(in);
567 if (t.type == tok_white) {
568 /*
569 * The newline after a code-paragraph line
570 */
571 dtor(t), t = get_token(in);
572 }
573 if (t.type == tok_eop || t.type == tok_eof)
574 break;
575 else if (t.type != tok_cmd || t.cmd != c_c) {
576 error(err_brokencodepara, &t.pos);
577 addpara(par, ret);
578 while (t.type != tok_eop) /* error recovery: */
579 dtor(t), t = get_token(in); /* eat rest of paragraph */
580 goto codeparabroken; /* ick, but such is life */
581 }
582 }
583 addpara(par, ret);
584 codeparabroken:
585 continue;
586 }
587
588 /*
589 * This token begins a paragraph. See if it's one of the
590 * special commands that define a paragraph type.
591 *
592 * (note that \# is special in a way, and \nocite takes no
593 * text)
594 */
595 par.type = para_Normal;
596 if (t.type == tok_cmd) {
597 int needkw;
598 int is_macro = FALSE;
599
600 par.fpos = t.pos;
601 switch (t.cmd) {
602 default:
603 needkw = -1;
604 break;
605 case c__invalid:
606 error(err_badparatype, t.text, &t.pos);
607 needkw = 4;
608 break;
609 case c__comment:
610 if (isbrace(in))
611 break; /* `\#{': isn't a comment para */
612 do {
613 dtor(t), t = get_token(in);
614 } while (t.type != tok_eop && t.type != tok_eof);
615 continue; /* next paragraph */
616 /*
617 * `needkw' values:
618 *
619 * 1 -- exactly one keyword
620 * 2 -- at least one keyword
621 * 4 -- any number of keywords including zero
622 * 8 -- at least one keyword and then nothing else
623 * 16 -- nothing at all! no keywords, no body
624 * 32 -- no keywords at all
625 */
626 case c_A: needkw = 2; par.type = para_Appendix; break;
627 case c_B: needkw = 2; par.type = para_Biblio; break;
628 case c_BR: needkw = 1; par.type = para_BR;
629 start_cmd = c_BR; break;
630 case c_C: needkw = 2; par.type = para_Chapter; break;
631 case c_H: needkw = 2; par.type = para_Heading;
632 par.aux = 0;
633 break;
634 case c_IM: needkw = 2; par.type = para_IM;
635 start_cmd = c_IM; break;
636 case c_S: needkw = 2; par.type = para_Subsect;
637 par.aux = t.aux; break;
638 case c_U: needkw = 32; par.type = para_UnnumberedChapter; break;
639 /* For \b and \n the keyword is optional */
640 case c_b: needkw = 4; par.type = para_Bullet; break;
641 case c_n: needkw = 4; par.type = para_NumberedList; break;
642 case c_cfg: needkw = 8; par.type = para_Config;
643 start_cmd = c_cfg; break;
644 case c_copyright: needkw = 32; par.type = para_Copyright; break;
645 case c_define: is_macro = TRUE; needkw = 1; break;
646 /* For \nocite the keyword is _everything_ */
647 case c_nocite: needkw = 8; par.type = para_NoCite; break;
648 case c_preamble: needkw = 32; par.type = para_Preamble; break;
649 case c_rule: needkw = 16; par.type = para_Rule; break;
650 case c_title: needkw = 32; par.type = para_Title; break;
651 case c_versionid: needkw = 32; par.type = para_VersionID; break;
652 }
653
654 if (needkw > 0) {
655 rdstring rs = { 0, 0, NULL };
656 int nkeys = 0;
657 filepos fp;
658
659 /* Get keywords. */
660 dtor(t), t = get_token(in);
661 fp = t.pos;
662 while (t.type == tok_lbrace) {
663 /* This is a keyword. */
664 nkeys++;
665 /* FIXME: there will be bugs if anyone specifies an
666 * empty keyword (\foo{}), so trap this case. */
667 while (dtor(t), t = get_token(in),
668 t.type == tok_word ||
669 t.type == tok_white ||
670 (t.type == tok_cmd && t.cmd == c__nbsp) ||
671 (t.type == tok_cmd && t.cmd == c__escaped)) {
672 if (t.type == tok_white ||
673 (t.type == tok_cmd && t.cmd == c__nbsp))
674 rdadd(&rs, ' ');
675 else
676 rdadds(&rs, t.text);
677 }
678 if (t.type != tok_rbrace) {
679 error(err_kwunclosed, &t.pos);
680 continue;
681 }
682 rdadd(&rs, 0); /* add string terminator */
683 dtor(t), t = get_token(in); /* eat right brace */
684 }
685
686 rdadd(&rs, 0); /* add string terminator */
687
688 /* See whether we have the right number of keywords. */
689 if ((needkw & 48) && nkeys > 0)
690 error(err_kwillegal, &fp);
691 if ((needkw & 11) && nkeys == 0)
692 error(err_kwexpected, &fp);
693 if ((needkw & 5) && nkeys > 1)
694 error(err_kwtoomany, &fp);
695
696 if (is_macro) {
697 /*
698 * Macro definition. Get the rest of the line
699 * as a code-paragraph token, repeatedly until
700 * there's nothing more left of it. Separate
701 * with newlines.
702 */
703 rdstring macrotext = { 0, 0, NULL };
704 while (1) {
705 dtor(t), t = get_codepar_token(in);
706 if (macrotext.pos > 0)
707 rdadd(&macrotext, L'\n');
708 rdadds(&macrotext, t.text);
709 dtor(t), t = get_token(in);
710 if (t.type == tok_eop) break;
711 }
712 macrodef(macros, rs.text, macrotext.text, fp);
713 continue; /* next paragraph */
714 }
715
716 par.keyword = rdtrim(&rs);
717
718 /* Move to EOP in case of needkw==8 or 16 (no body) */
719 if (needkw & 24) {
720 /* We allow whitespace even when we expect no para body */
721 while (t.type == tok_white)
722 dtor(t), t = get_token(in);
723 if (t.type != tok_eop && t.type != tok_eof &&
724 (start_cmd == c__invalid ||
725 t.type != tok_cmd || t.cmd != start_cmd)) {
726 error(err_bodyillegal, &t.pos);
727 /* Error recovery: eat the rest of the paragraph */
728 while (t.type != tok_eop && t.type != tok_eof &&
729 (start_cmd == c__invalid ||
730 t.type != tok_cmd || t.cmd != start_cmd))
731 dtor(t), t = get_token(in);
732 }
733 if (t.type == tok_cmd)
734 already = TRUE;/* inhibit get_token at top of loop */
735 addpara(par, ret);
736 continue; /* next paragraph */
737 }
738 }
739 }
740
741 /*
742 * Now read the actual paragraph, word by word, adding to
743 * the paragraph list.
744 *
745 * Mid-paragraph commands:
746 *
747 * \K \k
748 * \c \cw
749 * \e
750 * \i \ii
751 * \I
752 * \u
753 * \W
754 * \date
755 * \\ \{ \}
756 */
757 parsestk = stk_new();
758 style = word_Normal;
759 spcstyle = word_WhiteSpace;
760 indexing = FALSE;
761 seenwhite = TRUE;
762 while (t.type != tok_eop && t.type != tok_eof) {
763 iswhite = FALSE;
764 already = FALSE;
765
766 /* Handle implicit paragraph breaks after \IM, \BR etc */
767 if (start_cmd != c__invalid &&
768 t.type == tok_cmd && t.cmd == start_cmd) {
769 already = TRUE; /* inhibit get_token at top of loop */
770 break;
771 }
772
773 if (t.type == tok_cmd && t.cmd == c__escaped) {
774 t.type = tok_word; /* nice and simple */
775 t.aux = 0; /* even if `\-' - nonbreaking! */
776 }
777 if (t.type == tok_cmd && t.cmd == c__nbsp) {
778 t.type = tok_word; /* nice and simple */
779 sfree(t.text);
780 t.text = ustrdup(L" "); /* text is ` ' not `_' */
781 t.aux = 0; /* (nonbreaking) */
782 }
783 switch (t.type) {
784 case tok_white:
785 if (whptr == &par.words)
786 break; /* strip whitespace at start of para */
787 wd.text = NULL;
788 wd.type = spcstyle;
789 wd.alt = NULL;
790 wd.aux = 0;
791 wd.fpos = t.pos;
792 wd.breaks = FALSE;
793
794 /*
795 * Inhibit use of whitespace if it's (probably the
796 * newline) before a repeat \IM / \BR type
797 * directive.
798 */
799 if (start_cmd != c__invalid) {
800 dtor(t), t = get_token(in);
801 already = TRUE;
802 if (t.type == tok_cmd && t.cmd == start_cmd)
803 break;
804 }
805
806 if (indexing)
807 rdadd(&indexstr, ' ');
808 if (!indexing || index_visible)
809 addword(wd, &whptr);
810 if (indexing)
811 addword(wd, &idximplicit);
812 iswhite = TRUE;
813 break;
814 case tok_word:
815 if (indexing)
816 rdadds(&indexstr, t.text);
817 wd.type = style;
818 wd.alt = NULL;
819 wd.aux = 0;
820 wd.fpos = t.pos;
821 wd.breaks = t.aux;
822 if (!indexing || index_visible) {
823 wd.text = ustrdup(t.text);
824 addword(wd, &whptr);
825 }
826 if (indexing) {
827 wd.text = ustrdup(t.text);
828 addword(wd, &idximplicit);
829 }
830 break;
831 case tok_lbrace:
832 error(err_unexbrace, &t.pos);
833 /* Error recovery: push nop */
834 sitem = mknew(struct stack_item);
835 sitem->type = stack_nop;
836 stk_push(parsestk, sitem);
837 break;
838 case tok_rbrace:
839 sitem = stk_pop(parsestk);
840 if (!sitem)
841 error(err_unexbrace, &t.pos);
842 else {
843 if (sitem->type & stack_ualt) {
844 whptr = sitem->whptr;
845 idximplicit = sitem->idximplicit;
846 }
847 if (sitem->type & stack_style) {
848 style = word_Normal;
849 spcstyle = word_WhiteSpace;
850 }
851 if (sitem->type & stack_idx) {
852 indexword->text = ustrdup(indexstr.text);
853 if (index_downcase)
854 ustrlow(indexword->text);
855 indexing = FALSE;
856 rdadd(&indexstr, L'\0');
857 index_merge(idx, FALSE, indexstr.text, idxwordlist);
858 sfree(indexstr.text);
859 }
860 if (sitem->type & stack_hyper) {
861 wd.text = NULL;
862 wd.type = word_HyperEnd;
863 wd.alt = NULL;
864 wd.aux = 0;
865 wd.fpos = t.pos;
866 wd.breaks = FALSE;
867 if (!indexing || index_visible)
868 addword(wd, &whptr);
869 if (indexing)
870 addword(wd, &idximplicit);
871 }
872 if (sitem->type & stack_quote) {
873 wd.text = NULL;
874 wd.type = toquotestyle(style);
875 wd.alt = NULL;
876 wd.aux = quote_Close;
877 wd.fpos = t.pos;
878 wd.breaks = FALSE;
879 if (!indexing || index_visible)
880 addword(wd, &whptr);
881 if (indexing) {
882 rdadd(&indexstr, L'"');
883 addword(wd, &idximplicit);
884 }
885 }
886 }
887 sfree(sitem);
888 break;
889 case tok_cmd:
890 switch (t.cmd) {
891 case c__comment:
892 /*
893 * In-paragraph comment: \#{ balanced braces }
894 *
895 * Anything goes here; even tok_eop. We should
896 * eat whitespace after the close brace _if_
897 * there was whitespace before the \#.
898 */
899 dtor(t), t = get_token(in);
900 if (t.type != tok_lbrace) {
901 error(err_explbr, &t.pos);
902 } else {
903 int braces = 1;
904 while (braces > 0) {
905 dtor(t), t = get_token(in);
906 if (t.type == tok_lbrace)
907 braces++;
908 else if (t.type == tok_rbrace)
909 braces--;
910 else if (t.type == tok_eof) {
911 error(err_commenteof, &t.pos);
912 break;
913 }
914 }
915 }
916 if (seenwhite) {
917 already = TRUE;
918 dtor(t), t = get_token(in);
919 if (t.type == tok_white) {
920 iswhite = TRUE;
921 already = FALSE;
922 }
923 }
924 break;
925 case c_q:
926 dtor(t), t = get_token(in);
927 if (t.type != tok_lbrace) {
928 error(err_explbr, &t.pos);
929 } else {
930 wd.text = NULL;
931 wd.type = toquotestyle(style);
932 wd.alt = NULL;
933 wd.aux = quote_Open;
934 wd.fpos = t.pos;
935 wd.breaks = FALSE;
936 if (!indexing || index_visible)
937 addword(wd, &whptr);
938 if (indexing) {
939 rdadd(&indexstr, L'"');
940 addword(wd, &idximplicit);
941 }
942 sitem = mknew(struct stack_item);
943 sitem->type = stack_quote;
944 stk_push(parsestk, sitem);
945 }
946 break;
947 case c_K:
948 case c_k:
949 case c_W:
950 case c_date:
951 /*
952 * Keyword, hyperlink, or \date. We expect a
953 * left brace, some text, and then a right
954 * brace. No nesting; no arguments.
955 */
956 wd.fpos = t.pos;
957 wd.breaks = FALSE;
958 if (t.cmd == c_K)
959 wd.type = word_UpperXref;
960 else if (t.cmd == c_k)
961 wd.type = word_LowerXref;
962 else if (t.cmd == c_W)
963 wd.type = word_HyperLink;
964 else
965 wd.type = word_Normal;
966 dtor(t), t = get_token(in);
967 if (t.type != tok_lbrace) {
968 if (wd.type == word_Normal) {
969 time_t thetime = time(NULL);
970 struct tm *broken = localtime(&thetime);
971 already = TRUE;
972 wdtext = ustrftime(NULL, broken);
973 wd.type = style;
974 } else {
975 error(err_explbr, &t.pos);
976 wdtext = NULL;
977 }
978 } else {
979 rdstring rs = { 0, 0, NULL };
980 while (dtor(t), t = get_token(in),
981 t.type == tok_word || t.type == tok_white) {
982 if (t.type == tok_white)
983 rdadd(&rs, ' ');
984 else
985 rdadds(&rs, t.text);
986 }
987 if (wd.type == word_Normal) {
988 time_t thetime = time(NULL);
989 struct tm *broken = localtime(&thetime);
990 wdtext = ustrftime(rs.text, broken);
991 wd.type = style;
992 } else {
993 wdtext = ustrdup(rs.text);
994 }
995 sfree(rs.text);
996 if (t.type != tok_rbrace) {
997 error(err_kwexprbr, &t.pos);
998 }
999 }
1000 wd.alt = NULL;
1001 wd.aux = 0;
1002 if (!indexing || index_visible) {
1003 wd.text = ustrdup(wdtext);
1004 addword(wd, &whptr);
1005 }
1006 if (indexing) {
1007 wd.text = ustrdup(wdtext);
1008 addword(wd, &idximplicit);
1009 }
1010 sfree(wdtext);
1011 if (wd.type == word_HyperLink) {
1012 /*
1013 * Hyperlinks are different: they then
1014 * expect another left brace, to begin
1015 * delimiting the text marked by the link.
1016 */
1017 dtor(t), t = get_token(in);
1018 /*
1019 * Special cases: \W{}\c, \W{}\e, \W{}\cw
1020 */
1021 sitem = mknew(struct stack_item);
1022 sitem->type = stack_hyper;
1023 if (t.type == tok_cmd &&
1024 (t.cmd == c_e || t.cmd == c_c || t.cmd == c_cw)) {
1025 if (style != word_Normal)
1026 error(err_nestedstyles, &t.pos);
1027 else {
1028 style = (t.cmd == c_c ? word_Code :
1029 t.cmd == c_cw ? word_WeakCode :
1030 word_Emph);
1031 spcstyle = tospacestyle(style);
1032 sitem->type |= stack_style;
1033 }
1034 dtor(t), t = get_token(in);
1035 }
1036 if (t.type != tok_lbrace) {
1037 error(err_explbr, &t.pos);
1038 sfree(sitem);
1039 } else {
1040 stk_push(parsestk, sitem);
1041 }
1042 }
1043 break;
1044 case c_c:
1045 case c_cw:
1046 case c_e:
1047 type = t.cmd;
1048 if (style != word_Normal) {
1049 error(err_nestedstyles, &t.pos);
1050 /* Error recovery: eat lbrace, push nop. */
1051 dtor(t), t = get_token(in);
1052 sitem = mknew(struct stack_item);
1053 sitem->type = stack_nop;
1054 stk_push(parsestk, sitem);
1055 }
1056 dtor(t), t = get_token(in);
1057 if (t.type != tok_lbrace) {
1058 error(err_explbr, &t.pos);
1059 } else {
1060 style = (type == c_c ? word_Code :
1061 type == c_cw ? word_WeakCode :
1062 word_Emph);
1063 spcstyle = tospacestyle(style);
1064 sitem = mknew(struct stack_item);
1065 sitem->type = stack_style;
1066 stk_push(parsestk, sitem);
1067 }
1068 break;
1069 case c_i:
1070 case c_ii:
1071 case c_I:
1072 type = t.cmd;
1073 if (indexing) {
1074 error(err_nestedindex, &t.pos);
1075 /* Error recovery: eat lbrace, push nop. */
1076 dtor(t), t = get_token(in);
1077 sitem = mknew(struct stack_item);
1078 sitem->type = stack_nop;
1079 stk_push(parsestk, sitem);
1080 }
1081 sitem = mknew(struct stack_item);
1082 sitem->type = stack_idx;
1083 dtor(t), t = get_token(in);
1084 /*
1085 * Special cases: \i\c, \i\e, \i\cw
1086 */
1087 wd.fpos = t.pos;
1088 if (t.type == tok_cmd &&
1089 (t.cmd == c_e || t.cmd == c_c || t.cmd == c_cw)) {
1090 if (style != word_Normal)
1091 error(err_nestedstyles, &t.pos);
1092 else {
1093 style = (t.cmd == c_c ? word_Code :
1094 t.cmd == c_cw ? word_WeakCode :
1095 word_Emph);
1096 spcstyle = tospacestyle(style);
1097 sitem->type |= stack_style;
1098 }
1099 dtor(t), t = get_token(in);
1100 }
1101 if (t.type != tok_lbrace) {
1102 sfree(sitem);
1103 error(err_explbr, &t.pos);
1104 } else {
1105 /* Add an index-reference word with no text as yet */
1106 wd.type = word_IndexRef;
1107 wd.text = NULL;
1108 wd.alt = NULL;
1109 wd.aux = 0;
1110 wd.breaks = FALSE;
1111 indexword = addword(wd, &whptr);
1112 /* Set up a rdstring to read the index text */
1113 indexstr = nullrs;
1114 /* Flags so that we do the Right Things with text */
1115 index_visible = (type != c_I);
1116 index_downcase = (type == c_ii);
1117 indexing = TRUE;
1118 idxwordlist = NULL;
1119 idximplicit = &idxwordlist;
1120 /* Stack item to close the indexing on exit */
1121 stk_push(parsestk, sitem);
1122 }
1123 break;
1124 case c_u:
1125 uchr = t.aux;
1126 utext[0] = uchr; utext[1] = 0;
1127 wd.type = style;
1128 wd.breaks = FALSE;
1129 wd.alt = NULL;
1130 wd.aux = 0;
1131 wd.fpos = t.pos;
1132 if (!indexing || index_visible) {
1133 wd.text = ustrdup(utext);
1134 uword = addword(wd, &whptr);
1135 } else
1136 uword = NULL;
1137 if (indexing) {
1138 wd.text = ustrdup(utext);
1139 iword = addword(wd, &idximplicit);
1140 } else
1141 iword = NULL;
1142 dtor(t), t = get_token(in);
1143 if (t.type == tok_lbrace) {
1144 /*
1145 * \u with a left brace. Until the brace
1146 * closes, all further words go on a
1147 * sidetrack from the main thread of the
1148 * paragraph.
1149 */
1150 sitem = mknew(struct stack_item);
1151 sitem->type = stack_ualt;
1152 sitem->whptr = whptr;
1153 sitem->idximplicit = idximplicit;
1154 stk_push(parsestk, sitem);
1155 whptr = uword ? &uword->alt : NULL;
1156 idximplicit = iword ? &iword->alt : NULL;
1157 } else {
1158 if (indexing)
1159 rdadd(&indexstr, uchr);
1160 already = TRUE;
1161 }
1162 break;
1163 default:
1164 if (!macrolookup(macros, in, t.text, &t.pos))
1165 error(err_badmidcmd, t.text, &t.pos);
1166 break;
1167 }
1168 }
1169 if (!already)
1170 dtor(t), t = get_token(in);
1171 seenwhite = iswhite;
1172 }
1173 /* Check the stack is empty */
1174 if (NULL != (sitem = stk_pop(parsestk))) {
1175 do {
1176 sfree(sitem);
1177 sitem = stk_pop(parsestk);
1178 } while (sitem);
1179 error(err_missingrbrace, &t.pos);
1180 }
1181 stk_free(parsestk);
1182 addpara(par, ret);
1183 }
1184
1185 /*
1186 * We break to here rather than returning, because otherwise
1187 * this cleanup doesn't happen.
1188 */
1189 dtor(t);
1190 macrocleanup(macros);
1191 }
1192
1193 paragraph *read_input(input *in, indexdata *idx) {
1194 paragraph *head = NULL;
1195 paragraph **hptr = &head;
1196
1197 while (in->currindex < in->nfiles) {
1198 in->currfp = fopen(in->filenames[in->currindex], "r");
1199 if (in->currfp) {
1200 setpos(in, in->filenames[in->currindex]);
1201 read_file(&hptr, in, idx);
1202 }
1203 in->currindex++;
1204 }
1205
1206 return head;
1207 }