Rename Buttress to Halibut. I _think_ I've caught everything in this pass.
[sgt/halibut] / input.c
1 /*
2 * input.c: read the source form
3 */
4
#include <stdio.h>
#include <assert.h>
#include <limits.h>
#include <time.h>
#include "halibut.h"
9
/* Distance between tab stops, used when tracking column numbers. */
#define TAB_STOP 8
11
12 static void setpos(input *in, char *fname) {
13 in->pos.filename = fname;
14 in->pos.line = 1;
15 in->pos.col = (in->reportcols ? 1 : -1);
16 }
17
18 static void unget(input *in, int c, filepos *pos) {
19 if (in->npushback >= in->pushbacksize) {
20 in->pushbacksize = in->npushback + 16;
21 in->pushback = resize(in->pushback, in->pushbacksize);
22 }
23 in->pushback[in->npushback].chr = c;
24 in->pushback[in->npushback].pos = *pos; /* structure copy */
25 in->npushback++;
26 }
27
28 /* ---------------------------------------------------------------------- */
29 /*
30 * Macro subsystem
31 */
32 typedef struct macro_Tag macro;
33 struct macro_Tag {
34 wchar_t *name, *text;
35 };
36 struct macrostack_Tag {
37 macrostack *next;
38 wchar_t *text;
39 int ptr, npushback;
40 filepos pos;
41 };
42 static int macrocmp(void *av, void *bv) {
43 macro *a = (macro *)av, *b = (macro *)bv;
44 return ustrcmp(a->name, b->name);
45 }
46 static void macrodef(tree234 *macros, wchar_t *name, wchar_t *text,
47 filepos fpos) {
48 macro *m = mknew(macro);
49 m->name = name;
50 m->text = text;
51 if (add234(macros, m) != m) {
52 error(err_macroexists, &fpos, name);
53 sfree(name);
54 sfree(text);
55 }
56 }
57 static int macrolookup(tree234 *macros, input *in, wchar_t *name,
58 filepos *pos) {
59 macro m, *gotit;
60 m.name = name;
61 gotit = find234(macros, &m, NULL);
62 if (gotit) {
63 macrostack *expansion = mknew(macrostack);
64 expansion->next = in->stack;
65 expansion->text = gotit->text;
66 expansion->pos = *pos; /* structure copy */
67 expansion->ptr = 0;
68 expansion->npushback = in->npushback;
69 in->stack = expansion;
70 return TRUE;
71 } else
72 return FALSE;
73 }
74 static void macrocleanup(tree234 *macros) {
75 int ti;
76 macro *m;
77 for (ti = 0; (m = (macro *)index234(macros, ti)) != NULL; ti++) {
78 sfree(m->name);
79 sfree(m->text);
80 sfree(m);
81 }
82 freetree234(macros);
83 }
84
85 /*
86 * Can return EOF
87 */
88 static int get(input *in, filepos *pos) {
89 int pushbackpt = in->stack ? in->stack->npushback : 0;
90 if (in->npushback > pushbackpt) {
91 --in->npushback;
92 if (pos)
93 *pos = in->pushback[in->npushback].pos; /* structure copy */
94 return in->pushback[in->npushback].chr;
95 }
96 else if (in->stack) {
97 wchar_t c = in->stack->text[in->stack->ptr];
98 if (in->stack->text[++in->stack->ptr] == L'\0') {
99 macrostack *tmp = in->stack;
100 in->stack = tmp->next;
101 sfree(tmp);
102 }
103 return c;
104 }
105 else if (in->currfp) {
106 int c = getc(in->currfp);
107
108 if (c == EOF) {
109 fclose(in->currfp);
110 in->currfp = NULL;
111 }
112 /* Track line numbers, for error reporting */
113 if (pos)
114 *pos = in->pos;
115 if (in->reportcols) {
116 switch (c) {
117 case '\t':
118 in->pos.col = 1 + (in->pos.col + TAB_STOP-1) % TAB_STOP;
119 break;
120 case '\n':
121 in->pos.col = 1;
122 in->pos.line++;
123 break;
124 default:
125 in->pos.col++;
126 break;
127 }
128 } else {
129 in->pos.col = -1;
130 if (c == '\n')
131 in->pos.line++;
132 }
133 /* FIXME: do input charmap translation. We should be returning
134 * Unicode here. */
135 return c;
136 } else
137 return EOF;
138 }
139
140 /*
141 * Lexical analysis of source files.
142 */
143 typedef struct token_Tag token;
144 struct token_Tag {
145 int type;
146 int cmd, aux;
147 wchar_t *text;
148 filepos pos;
149 };
/* Token types (values of token.type). */
enum {
    tok_eof = 0,                       /* end of file */
    tok_eop,                           /* end of paragraph */
    tok_white,                         /* whitespace */
    tok_word,                          /* a word or word fragment */
    tok_cmd,                           /* \command */
    tok_lbrace,                        /* { */
    tok_rbrace                         /* } */
};
159
/*
 * Halibut command keywords (values of token.cmd). Entries whose
 * names begin with a double underscore do not correspond to an
 * alphabetic command name.
 */
enum {
    c__invalid = 0,                    /* invalid command */
    c__comment,                        /* comment command (\#) */
    c__escaped,                        /* escaped character */
    c__nbsp,                           /* nonbreaking space */
    c_A,                               /* appendix heading */
    c_B,                               /* bibliography entry */
    c_BR,                              /* bibliography rewrite */
    c_C,                               /* chapter heading */
    c_H,                               /* heading */
    c_I,                               /* invisible index mark */
    c_IM,                              /* index merge/rewrite */
    c_K,                               /* capitalised cross-reference */
    c_S,                               /* aux field is 0, 1, 2, ... */
    c_U,                               /* unnumbered-chapter heading */
    c_W,                               /* Web hyperlink */
    c_b,                               /* bulletted list */
    c_c,                               /* code */
    c_cfg,                             /* configuration directive */
    c_copyright,                       /* copyright statement */
    c_cw,                              /* weak code */
    c_date,                            /* document processing date */
    c_define,                          /* macro definition */
    c_e,                               /* emphasis */
    c_i,                               /* visible index mark */
    c_ii,                              /* uncapitalised visible index mark */
    c_k,                               /* uncapitalised cross-reference */
    c_n,                               /* numbered list */
    c_nocite,                          /* bibliography trickery */
    c_preamble,                        /* document preamble text */
    c_q,                               /* quote marks */
    c_rule,                            /* horizontal rule */
    c_title,                           /* document title */
    c_u,                               /* aux field is char code */
    c_versionid                        /* document RCS id */
};
197
/*
 * Character classification helpers. All of them evaluate their
 * argument more than once, so pass only side-effect-free expressions.
 *
 * Perhaps whitespace should be defined in a more Unicode-friendly way?
 */
#define iswhite(c) ( (c)==0x20 || (c)==0x09 || (c)==0x0D || (c)==0x0A )
#define isnl(c) ( (c)==0x0A )
#define isdec(c) ( (c)>='0' && (c)<='9' )
#define fromdec(c) ( (c) - '0' )
#define ishex(c) ( ((c)>='0' && (c)<='9') || \
                   ((c)>='a' && (c)<='f') || \
                   ((c)>='A' && (c)<='F') )
/* Clearing bit 0x20 folds lower-case letters onto upper-case ones. */
#define fromhex(c) ( (c)<='9' ? (c) - '0' : ((c) & 0xDF) - 'A' + 10 )
#define iscmd(c) ( ((c)>='0' && (c)<='9') || \
                   ((c)>='a' && (c)<='z') || \
                   ((c)>='A' && (c)<='Z') )
206
/*
 * Keyword comparison function. Like strcmp, but between a wchar_t *
 * and a char *: returns zero if the strings are equal, and otherwise
 * the difference between the first pair of characters that differ
 * (a string that ends early compares as having a zero there).
 */
static int kwcmp(wchar_t const *p, char const *q) {
    int diff;

    for (;;) {
        diff = *p - *q;
        if (diff != 0 || *p == 0 || *q == 0)
            break;
        p++;
        q++;
    }
    return diff;
}
218
219 /*
220 * Match a keyword.
221 */
222 static void match_kw(token *tok) {
223 /*
224 * FIXME. The ids are explicit in here so as to allow long-name
225 * equivalents to the various very short keywords.
226 */
227 static const struct { char const *name; int id; } keywords[] = {
228 {"#", c__comment}, /* comment command (\#) */
229 {"-", c__escaped}, /* nonbreaking hyphen */
230 {"A", c_A}, /* appendix heading */
231 {"B", c_B}, /* bibliography entry */
232 {"BR", c_BR}, /* bibliography rewrite */
233 {"C", c_C}, /* chapter heading */
234 {"H", c_H}, /* heading */
235 {"I", c_I}, /* invisible index mark */
236 {"IM", c_IM}, /* index merge/rewrite */
237 {"K", c_K}, /* capitalised cross-reference */
238 {"U", c_U}, /* unnumbered-chapter heading */
239 {"W", c_W}, /* Web hyperlink */
240 {"\\", c__escaped}, /* escaped backslash (\\) */
241 {"_", c__nbsp}, /* nonbreaking space (\_) */
242 {"b", c_b}, /* bulletted list */
243 {"c", c_c}, /* code */
244 {"cfg", c_cfg}, /* configuration directive */
245 {"copyright", c_copyright}, /* copyright statement */
246 {"cw", c_cw}, /* weak code */
247 {"date", c_date}, /* document processing date */
248 {"define", c_define}, /* macro definition */
249 {"e", c_e}, /* emphasis */
250 {"i", c_i}, /* visible index mark */
251 {"ii", c_ii}, /* uncapitalised visible index mark */
252 {"k", c_k}, /* uncapitalised cross-reference */
253 {"n", c_n}, /* numbered list */
254 {"nocite", c_nocite}, /* bibliography trickery */
255 {"preamble", c_preamble}, /* document preamble text */
256 {"q", c_q}, /* quote marks */
257 {"rule", c_rule}, /* horizontal rule */
258 {"title", c_title}, /* document title */
259 {"versionid", c_versionid}, /* document RCS id */
260 {"{", c__escaped}, /* escaped lbrace (\{) */
261 {"}", c__escaped}, /* escaped rbrace (\}) */
262 };
263 int i, j, k, c;
264
265 /*
266 * Special cases: \S{0,1,2,...} and \uABCD. If the syntax
267 * doesn't match correctly, we just fall through to the
268 * binary-search phase.
269 */
270 if (tok->text[0] == 'S') {
271 /* We expect numeric characters thereafter. */
272 wchar_t *p = tok->text+1;
273 int n;
274 if (!*p)
275 n = 1;
276 else {
277 n = 0;
278 while (*p && isdec(*p)) {
279 n = 10 * n + fromdec(*p);
280 p++;
281 }
282 }
283 if (!*p) {
284 tok->cmd = c_S;
285 tok->aux = n;
286 return;
287 }
288 } else if (tok->text[0] == 'u') {
289 /* We expect hex characters thereafter. */
290 wchar_t *p = tok->text+1;
291 int n = 0;
292 while (*p && ishex(*p)) {
293 n = 16 * n + fromhex(*p);
294 p++;
295 }
296 if (!*p) {
297 tok->cmd = c_u;
298 tok->aux = n;
299 return;
300 }
301 }
302
303 i = -1;
304 j = sizeof(keywords)/sizeof(*keywords);
305 while (j-i > 1) {
306 k = (i+j)/2;
307 c = kwcmp(tok->text, keywords[k].name);
308 if (c < 0)
309 j = k;
310 else if (c > 0)
311 i = k;
312 else /* c == 0 */ {
313 tok->cmd = keywords[k].id;
314 return;
315 }
316 }
317
318 tok->cmd = c__invalid;
319 }
320
321
322 /*
323 * Read a token from the input file, in the normal way (`normal' in
324 * the sense that code paragraphs work a different way).
325 */
326 token get_token(input *in) {
327 int c;
328 int nls;
329 token ret;
330 rdstring rs = { 0, 0, NULL };
331 filepos cpos;
332
333 ret.text = NULL; /* default */
334 c = get(in, &cpos);
335 ret.pos = cpos;
336 if (iswhite(c)) { /* tok_white or tok_eop */
337 nls = 0;
338 do {
339 if (isnl(c))
340 nls++;
341 } while ((c = get(in, &cpos)) != EOF && iswhite(c));
342 if (c == EOF) {
343 ret.type = tok_eof;
344 return ret;
345 }
346 unget(in, c, &cpos);
347 ret.type = (nls > 1 ? tok_eop : tok_white);
348 return ret;
349 } else if (c == EOF) { /* tok_eof */
350 ret.type = tok_eof;
351 return ret;
352 } else if (c == '\\') { /* tok_cmd */
353 c = get(in, &cpos);
354 if (c == '-' || c == '\\' || c == '_' ||
355 c == '#' || c == '{' || c == '}') {
356 /* single-char command */
357 rdadd(&rs, c);
358 } else if (c == 'u') {
359 int len = 0;
360 do {
361 rdadd(&rs, c);
362 len++;
363 c = get(in, &cpos);
364 } while (ishex(c) && len < 5);
365 unget(in, c, &cpos);
366 } else if (iscmd(c)) {
367 do {
368 rdadd(&rs, c);
369 c = get(in, &cpos);
370 } while (iscmd(c));
371 unget(in, c, &cpos);
372 }
373 /*
374 * Now match the command against the list of available
375 * ones.
376 */
377 ret.type = tok_cmd;
378 ret.text = ustrdup(rs.text);
379 match_kw(&ret);
380 sfree(rs.text);
381 return ret;
382 } else if (c == '{') { /* tok_lbrace */
383 ret.type = tok_lbrace;
384 return ret;
385 } else if (c == '}') { /* tok_rbrace */
386 ret.type = tok_rbrace;
387 return ret;
388 } else { /* tok_word */
389 /*
390 * Read a word: the longest possible contiguous sequence of
391 * things other than whitespace, backslash, braces and
392 * hyphen. A hyphen terminates the word but is returned as
393 * part of it; everything else is pushed back for the next
394 * token. The `aux' field contains TRUE if the word ends in
395 * a hyphen.
396 */
397 ret.aux = FALSE; /* assumed for now */
398 while (1) {
399 if (iswhite(c) || c=='{' || c=='}' || c=='\\' || c==EOF) {
400 /* Put back the character that caused termination */
401 unget(in, c, &cpos);
402 break;
403 } else {
404 rdadd(&rs, c);
405 if (c == '-') {
406 ret.aux = TRUE;
407 break; /* hyphen terminates word */
408 }
409 }
410 c = get(in, &cpos);
411 }
412 ret.type = tok_word;
413 ret.text = ustrdup(rs.text);
414 sfree(rs.text);
415 return ret;
416 }
417 }
418
419 /*
420 * Determine whether the next input character is an open brace (for
421 * telling code paragraphs from paragraphs which merely start with
422 * code).
423 */
424 int isbrace(input *in) {
425 int c;
426 filepos cpos;
427
428 c = get(in, &cpos);
429 unget(in, c, &cpos);
430 return (c == '{');
431 }
432
433 /*
434 * Read the rest of a line that starts `\c'. Including nothing at
435 * all (tok_word with empty text).
436 */
437 token get_codepar_token(input *in) {
438 int c;
439 token ret;
440 rdstring rs = { 0, 0, NULL };
441 filepos cpos;
442
443 ret.type = tok_word;
444 c = get(in, &cpos); /* expect (and discard) one space */
445 ret.pos = cpos;
446 if (c == ' ') {
447 c = get(in, &cpos);
448 ret.pos = cpos;
449 }
450 while (!isnl(c) && c != EOF) {
451 int c2 = c;
452 c = get(in, &cpos);
453 /* Discard \r just before \n. */
454 if (c2 != 13 || !isnl(c))
455 rdadd(&rs, c2);
456 }
457 unget(in, c, &cpos);
458 ret.text = ustrdup(rs.text);
459 sfree(rs.text);
460 return ret;
461 }
462
463 /*
464 * Adds a new word to a linked list
465 */
466 static word *addword(word newword, word ***hptrptr) {
467 word *mnewword;
468 if (!hptrptr)
469 return NULL;
470 mnewword = mknew(word);
471 *mnewword = newword; /* structure copy */
472 mnewword->next = NULL;
473 **hptrptr = mnewword;
474 *hptrptr = &mnewword->next;
475 return mnewword;
476 }
477
478 /*
479 * Adds a new paragraph to a linked list
480 */
481 static paragraph *addpara(paragraph newpara, paragraph ***hptrptr) {
482 paragraph *mnewpara = mknew(paragraph);
483 *mnewpara = newpara; /* structure copy */
484 mnewpara->next = NULL;
485 **hptrptr = mnewpara;
486 *hptrptr = &mnewpara->next;
487 return mnewpara;
488 }
489
/*
 * Destructor before token is reassigned; should catch most memory
 * leaks. Frees the token's text; the argument is parenthesised so
 * that any token-valued expression can be passed safely.
 */
#define dtor(t) ( sfree((t).text) )
495
496 /*
497 * Reads a single file (ie until get() returns EOF)
498 */
499 static void read_file(paragraph ***ret, input *in, indexdata *idx) {
500 token t;
501 paragraph par;
502 word wd, **whptr, **idximplicit;
503 tree234 *macros;
504 wchar_t utext[2], *wdtext;
505 int style, spcstyle;
506 int already;
507 int iswhite, seenwhite;
508 int type;
509 struct stack_item {
510 enum {
511 stack_nop = 0, /* do nothing (for error recovery) */
512 stack_ualt = 1, /* \u alternative */
513 stack_style = 2, /* \e, \c, \cw */
514 stack_idx = 4, /* \I, \i, \ii */
515 stack_hyper = 8, /* \W */
516 stack_quote = 16, /* \q */
517 } type;
518 word **whptr; /* to restore from \u alternatives */
519 word **idximplicit; /* to restore from \u alternatives */
520 } *sitem;
521 stack parsestk;
522 word *indexword, *uword, *iword;
523 word *idxwordlist;
524 rdstring indexstr;
525 int index_downcase, index_visible, indexing;
526 const rdstring nullrs = { 0, 0, NULL };
527 wchar_t uchr;
528
529 t.text = NULL;
530 macros = newtree234(macrocmp);
531
532 /*
533 * Loop on each paragraph.
534 */
535 while (1) {
536 par.words = NULL;
537 par.keyword = NULL;
538 whptr = &par.words;
539
540 /*
541 * Get a token.
542 */
543 dtor(t), t = get_token(in);
544 if (t.type == tok_eof)
545 return;
546
547 /*
548 * Parse code paragraphs separately.
549 */
550 if (t.type == tok_cmd && t.cmd == c_c && !isbrace(in)) {
551 par.type = para_Code;
552 par.fpos = t.pos;
553 while (1) {
554 dtor(t), t = get_codepar_token(in);
555 wd.type = word_WeakCode;
556 wd.breaks = FALSE; /* shouldn't need this... */
557 wd.text = ustrdup(t.text);
558 wd.alt = NULL;
559 wd.fpos = t.pos;
560 addword(wd, &whptr);
561 dtor(t), t = get_token(in);
562 if (t.type == tok_white) {
563 /*
564 * The newline after a code-paragraph line
565 */
566 dtor(t), t = get_token(in);
567 }
568 if (t.type == tok_eop || t.type == tok_eof)
569 break;
570 else if (t.type != tok_cmd || t.cmd != c_c) {
571 error(err_brokencodepara, &t.pos);
572 addpara(par, ret);
573 while (t.type != tok_eop) /* error recovery: */
574 dtor(t), t = get_token(in); /* eat rest of paragraph */
575 goto codeparabroken; /* ick, but such is life */
576 }
577 }
578 addpara(par, ret);
579 codeparabroken:
580 continue;
581 }
582
583 /*
584 * This token begins a paragraph. See if it's one of the
585 * special commands that define a paragraph type.
586 *
587 * (note that \# is special in a way, and \nocite takes no
588 * text)
589 */
590 par.type = para_Normal;
591 if (t.type == tok_cmd) {
592 int needkw;
593 int is_macro = FALSE;
594
595 par.fpos = t.pos;
596 switch (t.cmd) {
597 default:
598 needkw = -1;
599 break;
600 case c__invalid:
601 error(err_badparatype, t.text, &t.pos);
602 needkw = 4;
603 break;
604 case c__comment:
605 if (isbrace(in))
606 break; /* `\#{': isn't a comment para */
607 do {
608 dtor(t), t = get_token(in);
609 } while (t.type != tok_eop && t.type != tok_eof);
610 continue; /* next paragraph */
611 /*
612 * `needkw' values:
613 *
614 * 1 -- exactly one keyword
615 * 2 -- at least one keyword
616 * 4 -- any number of keywords including zero
617 * 8 -- at least one keyword and then nothing else
618 * 16 -- nothing at all! no keywords, no body
619 * 32 -- no keywords at all
620 */
621 case c_A: needkw = 2; par.type = para_Appendix; break;
622 case c_B: needkw = 2; par.type = para_Biblio; break;
623 case c_BR: needkw = 1; par.type = para_BR; break;
624 case c_C: needkw = 2; par.type = para_Chapter; break;
625 case c_H: needkw = 2; par.type = para_Heading;
626 par.aux = 0;
627 break;
628 case c_IM: needkw = 2; par.type = para_IM; break;
629 case c_S: needkw = 2; par.type = para_Subsect;
630 par.aux = t.aux; break;
631 case c_U: needkw = 32; par.type = para_UnnumberedChapter; break;
632 /* For \b and \n the keyword is optional */
633 case c_b: needkw = 4; par.type = para_Bullet; break;
634 case c_n: needkw = 4; par.type = para_NumberedList; break;
635 case c_cfg: needkw = 8; par.type = para_Config; break;
636 case c_copyright: needkw = 32; par.type = para_Copyright; break;
637 case c_define: is_macro = TRUE; needkw = 1; break;
638 /* For \nocite the keyword is _everything_ */
639 case c_nocite: needkw = 8; par.type = para_NoCite; break;
640 case c_preamble: needkw = 32; par.type = para_Preamble; break;
641 case c_rule: needkw = 16; par.type = para_Rule; break;
642 case c_title: needkw = 32; par.type = para_Title; break;
643 case c_versionid: needkw = 32; par.type = para_VersionID; break;
644 }
645
646 if (needkw > 0) {
647 rdstring rs = { 0, 0, NULL };
648 int nkeys = 0;
649 filepos fp;
650
651 /* Get keywords. */
652 dtor(t), t = get_token(in);
653 fp = t.pos;
654 while (t.type == tok_lbrace) {
655 /* This is a keyword. */
656 nkeys++;
657 /* FIXME: there will be bugs if anyone specifies an
658 * empty keyword (\foo{}), so trap this case. */
659 while (dtor(t), t = get_token(in),
660 t.type == tok_word ||
661 t.type == tok_white ||
662 (t.type == tok_cmd && t.cmd == c__nbsp) ||
663 (t.type == tok_cmd && t.cmd == c__escaped)) {
664 if (t.type == tok_white ||
665 (t.type == tok_cmd && t.cmd == c__nbsp))
666 rdadd(&rs, ' ');
667 else
668 rdadds(&rs, t.text);
669 }
670 if (t.type != tok_rbrace) {
671 error(err_kwunclosed, &t.pos);
672 continue;
673 }
674 rdadd(&rs, 0); /* add string terminator */
675 dtor(t), t = get_token(in); /* eat right brace */
676 }
677
678 rdadd(&rs, 0); /* add string terminator */
679
680 /* See whether we have the right number of keywords. */
681 if ((needkw & 48) && nkeys > 0)
682 error(err_kwillegal, &fp);
683 if ((needkw & 11) && nkeys == 0)
684 error(err_kwexpected, &fp);
685 if ((needkw & 5) && nkeys > 1)
686 error(err_kwtoomany, &fp);
687
688 if (is_macro) {
689 /*
690 * Macro definition. Get the rest of the line
691 * as a code-paragraph token, repeatedly until
692 * there's nothing more left of it. Separate
693 * with newlines.
694 */
695 rdstring macrotext = { 0, 0, NULL };
696 while (1) {
697 dtor(t), t = get_codepar_token(in);
698 if (macrotext.pos > 0)
699 rdadd(&macrotext, L'\n');
700 rdadds(&macrotext, t.text);
701 dtor(t), t = get_token(in);
702 if (t.type == tok_eop) break;
703 }
704 macrodef(macros, rs.text, macrotext.text, fp);
705 continue; /* next paragraph */
706 }
707
708 par.keyword = rdtrim(&rs);
709
710 /* Move to EOP in case of needkw==8 or 16 (no body) */
711 if (needkw & 24) {
712 if (t.type != tok_eop && t.type != tok_eof) {
713 error(err_bodyillegal, &t.pos);
714 /* Error recovery: eat the rest of the paragraph */
715 while (t.type != tok_eop && t.type != tok_eof)
716 dtor(t), t = get_token(in);
717 }
718 addpara(par, ret);
719 continue; /* next paragraph */
720 }
721 }
722 }
723
724 /*
725 * Now read the actual paragraph, word by word, adding to
726 * the paragraph list.
727 *
728 * Mid-paragraph commands:
729 *
730 * \K \k
731 * \c \cw
732 * \e
733 * \i \ii
734 * \I
735 * \u
736 * \W
737 * \date
738 * \\ \{ \}
739 */
740 parsestk = stk_new();
741 style = word_Normal;
742 spcstyle = word_WhiteSpace;
743 indexing = FALSE;
744 seenwhite = TRUE;
745 while (t.type != tok_eop && t.type != tok_eof) {
746 iswhite = FALSE;
747 already = FALSE;
748 if (t.type == tok_cmd && t.cmd == c__escaped) {
749 t.type = tok_word; /* nice and simple */
750 t.aux = 0; /* even if `\-' - nonbreaking! */
751 }
752 if (t.type == tok_cmd && t.cmd == c__nbsp) {
753 t.type = tok_word; /* nice and simple */
754 sfree(t.text);
755 t.text = ustrdup(L" "); /* text is ` ' not `_' */
756 t.aux = 0; /* (nonbreaking) */
757 }
758 switch (t.type) {
759 case tok_white:
760 if (whptr == &par.words)
761 break; /* strip whitespace at start of para */
762 wd.text = NULL;
763 wd.type = spcstyle;
764 wd.alt = NULL;
765 wd.aux = 0;
766 wd.fpos = t.pos;
767 wd.breaks = FALSE;
768 if (indexing)
769 rdadd(&indexstr, ' ');
770 if (!indexing || index_visible)
771 addword(wd, &whptr);
772 if (indexing)
773 addword(wd, &idximplicit);
774 iswhite = TRUE;
775 break;
776 case tok_word:
777 if (indexing)
778 rdadds(&indexstr, t.text);
779 wd.type = style;
780 wd.alt = NULL;
781 wd.aux = 0;
782 wd.fpos = t.pos;
783 wd.breaks = t.aux;
784 if (!indexing || index_visible) {
785 wd.text = ustrdup(t.text);
786 addword(wd, &whptr);
787 }
788 if (indexing) {
789 wd.text = ustrdup(t.text);
790 addword(wd, &idximplicit);
791 }
792 break;
793 case tok_lbrace:
794 error(err_unexbrace, &t.pos);
795 /* Error recovery: push nop */
796 sitem = mknew(struct stack_item);
797 sitem->type = stack_nop;
798 stk_push(parsestk, sitem);
799 break;
800 case tok_rbrace:
801 sitem = stk_pop(parsestk);
802 if (!sitem)
803 error(err_unexbrace, &t.pos);
804 else {
805 if (sitem->type & stack_ualt) {
806 whptr = sitem->whptr;
807 idximplicit = sitem->idximplicit;
808 }
809 if (sitem->type & stack_style) {
810 style = word_Normal;
811 spcstyle = word_WhiteSpace;
812 }
813 if (sitem->type & stack_idx) {
814 indexword->text = ustrdup(indexstr.text);
815 if (index_downcase)
816 ustrlow(indexword->text);
817 indexing = FALSE;
818 rdadd(&indexstr, L'\0');
819 index_merge(idx, FALSE, indexstr.text, idxwordlist);
820 sfree(indexstr.text);
821 }
822 if (sitem->type & stack_hyper) {
823 wd.text = NULL;
824 wd.type = word_HyperEnd;
825 wd.alt = NULL;
826 wd.aux = 0;
827 wd.fpos = t.pos;
828 wd.breaks = FALSE;
829 if (!indexing || index_visible)
830 addword(wd, &whptr);
831 if (indexing)
832 addword(wd, &idximplicit);
833 }
834 if (sitem->type & stack_quote) {
835 wd.text = NULL;
836 wd.type = toquotestyle(style);
837 wd.alt = NULL;
838 wd.aux = quote_Close;
839 wd.fpos = t.pos;
840 wd.breaks = FALSE;
841 if (!indexing || index_visible)
842 addword(wd, &whptr);
843 if (indexing) {
844 rdadd(&indexstr, L'"');
845 addword(wd, &idximplicit);
846 }
847 }
848 }
849 sfree(sitem);
850 break;
851 case tok_cmd:
852 switch (t.cmd) {
853 case c__comment:
854 /*
855 * In-paragraph comment: \#{ balanced braces }
856 *
857 * Anything goes here; even tok_eop. We should
858 * eat whitespace after the close brace _if_
859 * there was whitespace before the \#.
860 */
861 dtor(t), t = get_token(in);
862 if (t.type != tok_lbrace) {
863 error(err_explbr, &t.pos);
864 } else {
865 int braces = 1;
866 while (braces > 0) {
867 dtor(t), t = get_token(in);
868 if (t.type == tok_lbrace)
869 braces++;
870 else if (t.type == tok_rbrace)
871 braces--;
872 else if (t.type == tok_eof) {
873 error(err_commenteof, &t.pos);
874 break;
875 }
876 }
877 }
878 if (seenwhite) {
879 already = TRUE;
880 dtor(t), t = get_token(in);
881 if (t.type == tok_white) {
882 iswhite = TRUE;
883 already = FALSE;
884 }
885 }
886 break;
887 case c_q:
888 dtor(t), t = get_token(in);
889 if (t.type != tok_lbrace) {
890 error(err_explbr, &t.pos);
891 } else {
892 wd.text = NULL;
893 wd.type = toquotestyle(style);
894 wd.alt = NULL;
895 wd.aux = quote_Open;
896 wd.fpos = t.pos;
897 wd.breaks = FALSE;
898 if (!indexing || index_visible)
899 addword(wd, &whptr);
900 if (indexing) {
901 rdadd(&indexstr, L'"');
902 addword(wd, &idximplicit);
903 }
904 sitem = mknew(struct stack_item);
905 sitem->type = stack_quote;
906 stk_push(parsestk, sitem);
907 }
908 break;
909 case c_K:
910 case c_k:
911 case c_W:
912 case c_date:
913 /*
914 * Keyword, hyperlink, or \date. We expect a
915 * left brace, some text, and then a right
916 * brace. No nesting; no arguments.
917 */
918 wd.fpos = t.pos;
919 wd.breaks = FALSE;
920 if (t.cmd == c_K)
921 wd.type = word_UpperXref;
922 else if (t.cmd == c_k)
923 wd.type = word_LowerXref;
924 else if (t.cmd == c_W)
925 wd.type = word_HyperLink;
926 else
927 wd.type = word_Normal;
928 dtor(t), t = get_token(in);
929 if (t.type != tok_lbrace) {
930 if (wd.type == word_Normal) {
931 time_t thetime = time(NULL);
932 struct tm *broken = localtime(&thetime);
933 already = TRUE;
934 wdtext = ustrftime(NULL, broken);
935 wd.type = style;
936 } else {
937 error(err_explbr, &t.pos);
938 wdtext = NULL;
939 }
940 } else {
941 rdstring rs = { 0, 0, NULL };
942 while (dtor(t), t = get_token(in),
943 t.type == tok_word || t.type == tok_white) {
944 if (t.type == tok_white)
945 rdadd(&rs, ' ');
946 else
947 rdadds(&rs, t.text);
948 }
949 if (wd.type == word_Normal) {
950 time_t thetime = time(NULL);
951 struct tm *broken = localtime(&thetime);
952 wdtext = ustrftime(rs.text, broken);
953 wd.type = style;
954 } else {
955 wdtext = ustrdup(rs.text);
956 }
957 sfree(rs.text);
958 if (t.type != tok_rbrace) {
959 error(err_kwexprbr, &t.pos);
960 }
961 }
962 wd.alt = NULL;
963 wd.aux = 0;
964 if (!indexing || index_visible) {
965 wd.text = ustrdup(wdtext);
966 addword(wd, &whptr);
967 }
968 if (indexing) {
969 wd.text = ustrdup(wdtext);
970 addword(wd, &idximplicit);
971 }
972 sfree(wdtext);
973 if (wd.type == word_HyperLink) {
974 /*
975 * Hyperlinks are different: they then
976 * expect another left brace, to begin
977 * delimiting the text marked by the link.
978 */
979 dtor(t), t = get_token(in);
980 /*
981 * Special cases: \W{}\c, \W{}\e, \W{}\cw
982 */
983 sitem = mknew(struct stack_item);
984 sitem->type = stack_hyper;
985 if (t.type == tok_cmd &&
986 (t.cmd == c_e || t.cmd == c_c || t.cmd == c_cw)) {
987 if (style != word_Normal)
988 error(err_nestedstyles, &t.pos);
989 else {
990 style = (t.cmd == c_c ? word_Code :
991 t.cmd == c_cw ? word_WeakCode :
992 word_Emph);
993 spcstyle = tospacestyle(style);
994 sitem->type |= stack_style;
995 }
996 dtor(t), t = get_token(in);
997 }
998 if (t.type != tok_lbrace) {
999 error(err_explbr, &t.pos);
1000 sfree(sitem);
1001 } else {
1002 stk_push(parsestk, sitem);
1003 }
1004 }
1005 break;
1006 case c_c:
1007 case c_cw:
1008 case c_e:
1009 type = t.cmd;
1010 if (style != word_Normal) {
1011 error(err_nestedstyles, &t.pos);
1012 /* Error recovery: eat lbrace, push nop. */
1013 dtor(t), t = get_token(in);
1014 sitem = mknew(struct stack_item);
1015 sitem->type = stack_nop;
1016 stk_push(parsestk, sitem);
1017 }
1018 dtor(t), t = get_token(in);
1019 if (t.type != tok_lbrace) {
1020 error(err_explbr, &t.pos);
1021 } else {
1022 style = (type == c_c ? word_Code :
1023 type == c_cw ? word_WeakCode :
1024 word_Emph);
1025 spcstyle = tospacestyle(style);
1026 sitem = mknew(struct stack_item);
1027 sitem->type = stack_style;
1028 stk_push(parsestk, sitem);
1029 }
1030 break;
1031 case c_i:
1032 case c_ii:
1033 case c_I:
1034 type = t.cmd;
1035 if (indexing) {
1036 error(err_nestedindex, &t.pos);
1037 /* Error recovery: eat lbrace, push nop. */
1038 dtor(t), t = get_token(in);
1039 sitem = mknew(struct stack_item);
1040 sitem->type = stack_nop;
1041 stk_push(parsestk, sitem);
1042 }
1043 sitem = mknew(struct stack_item);
1044 sitem->type = stack_idx;
1045 dtor(t), t = get_token(in);
1046 /*
1047 * Special cases: \i\c, \i\e, \i\cw
1048 */
1049 wd.fpos = t.pos;
1050 if (t.type == tok_cmd &&
1051 (t.cmd == c_e || t.cmd == c_c || t.cmd == c_cw)) {
1052 if (style != word_Normal)
1053 error(err_nestedstyles, &t.pos);
1054 else {
1055 style = (t.cmd == c_c ? word_Code :
1056 t.cmd == c_cw ? word_WeakCode :
1057 word_Emph);
1058 spcstyle = tospacestyle(style);
1059 sitem->type |= stack_style;
1060 }
1061 dtor(t), t = get_token(in);
1062 }
1063 if (t.type != tok_lbrace) {
1064 sfree(sitem);
1065 error(err_explbr, &t.pos);
1066 } else {
1067 /* Add an index-reference word with no text as yet */
1068 wd.type = word_IndexRef;
1069 wd.text = NULL;
1070 wd.alt = NULL;
1071 wd.aux = 0;
1072 wd.breaks = FALSE;
1073 indexword = addword(wd, &whptr);
1074 /* Set up a rdstring to read the index text */
1075 indexstr = nullrs;
1076 /* Flags so that we do the Right Things with text */
1077 index_visible = (type != c_I);
1078 index_downcase = (type == c_ii);
1079 indexing = TRUE;
1080 idxwordlist = NULL;
1081 idximplicit = &idxwordlist;
1082 /* Stack item to close the indexing on exit */
1083 stk_push(parsestk, sitem);
1084 }
1085 break;
1086 case c_u:
1087 uchr = t.aux;
1088 utext[0] = uchr; utext[1] = 0;
1089 wd.type = style;
1090 wd.breaks = FALSE;
1091 wd.alt = NULL;
1092 wd.aux = 0;
1093 wd.fpos = t.pos;
1094 if (!indexing || index_visible) {
1095 wd.text = ustrdup(utext);
1096 uword = addword(wd, &whptr);
1097 } else
1098 uword = NULL;
1099 if (indexing) {
1100 wd.text = ustrdup(utext);
1101 iword = addword(wd, &idximplicit);
1102 } else
1103 iword = NULL;
1104 dtor(t), t = get_token(in);
1105 if (t.type == tok_lbrace) {
1106 /*
1107 * \u with a left brace. Until the brace
1108 * closes, all further words go on a
1109 * sidetrack from the main thread of the
1110 * paragraph.
1111 */
1112 sitem = mknew(struct stack_item);
1113 sitem->type = stack_ualt;
1114 sitem->whptr = whptr;
1115 sitem->idximplicit = idximplicit;
1116 stk_push(parsestk, sitem);
1117 whptr = uword ? &uword->alt : NULL;
1118 idximplicit = iword ? &iword->alt : NULL;
1119 } else {
1120 if (indexing)
1121 rdadd(&indexstr, uchr);
1122 already = TRUE;
1123 }
1124 break;
1125 default:
1126 if (!macrolookup(macros, in, t.text, &t.pos))
1127 error(err_badmidcmd, t.text, &t.pos);
1128 break;
1129 }
1130 }
1131 if (!already)
1132 dtor(t), t = get_token(in);
1133 seenwhite = iswhite;
1134 }
1135 /* Check the stack is empty */
1136 if (NULL != (sitem = stk_pop(parsestk))) {
1137 do {
1138 sfree(sitem);
1139 sitem = stk_pop(parsestk);
1140 } while (sitem);
1141 error(err_missingrbrace, &t.pos);
1142 }
1143 stk_free(parsestk);
1144 addpara(par, ret);
1145 }
1146 dtor(t);
1147 macrocleanup(macros);
1148 }
1149
1150 paragraph *read_input(input *in, indexdata *idx) {
1151 paragraph *head = NULL;
1152 paragraph **hptr = &head;
1153
1154 while (in->currindex < in->nfiles) {
1155 in->currfp = fopen(in->filenames[in->currindex], "r");
1156 if (in->currfp) {
1157 setpos(in, in->filenames[in->currindex]);
1158 read_file(&hptr, in, idx);
1159 }
1160 in->currindex++;
1161 }
1162
1163 return head;
1164 }