2 * input.c: read the source form
10 #define TAB_STOP 8 /* for column number tracking */
12 static void setpos(input
*in
, char *fname
) {
13 in
->pos
.filename
= fname
;
15 in
->pos
.col
= (in
->reportcols ?
1 : -1);
18 static void unget(input
*in
, int c
, filepos
*pos
) {
19 if (in
->npushback
>= in
->pushbacksize
) {
20 in
->pushbacksize
= in
->npushback
+ 16;
21 in
->pushback
= sresize(in
->pushback
, in
->pushbacksize
, pushback
);
23 in
->pushback
[in
->npushback
].chr
= c
;
24 in
->pushback
[in
->npushback
].pos
= *pos
; /* structure copy */
28 /* ---------------------------------------------------------------------- */
32 typedef struct macro_Tag macro
;
36 struct macrostack_Tag
{
42 static int macrocmp(void *av
, void *bv
) {
43 macro
*a
= (macro
*)av
, *b
= (macro
*)bv
;
44 return ustrcmp(a
->name
, b
->name
);
46 static void macrodef(tree234
*macros
, wchar_t *name
, wchar_t *text
,
48 macro
*m
= snew(macro
);
51 if (add234(macros
, m
) != m
) {
52 error(err_macroexists
, &fpos
, name
);
57 static int macrolookup(tree234
*macros
, input
*in
, wchar_t *name
,
61 gotit
= find234(macros
, &m
, NULL
);
63 macrostack
*expansion
= snew(macrostack
);
64 expansion
->next
= in
->stack
;
65 expansion
->text
= gotit
->text
;
66 expansion
->pos
= *pos
; /* structure copy */
68 expansion
->npushback
= in
->npushback
;
69 in
->stack
= expansion
;
74 static void macrocleanup(tree234
*macros
) {
77 for (ti
= 0; (m
= (macro
*)index234(macros
, ti
)) != NULL
; ti
++) {
85 static void input_configure(input
*in
, paragraph
*cfg
) {
86 assert(cfg
->type
== para_Config
);
88 if (!ustricmp(cfg
->keyword
, L
"input-charset")) {
89 in
->charset
= charset_from_ustr(&cfg
->fpos
, uadv(cfg
->keyword
));
96 static int get(input
*in
, filepos
*pos
, rdstringc
*rsc
) {
97 int pushbackpt
= in
->stack ? in
->stack
->npushback
: 0;
98 if (in
->npushback
> pushbackpt
) {
101 *pos
= in
->pushback
[in
->npushback
].pos
; /* structure copy */
102 return in
->pushback
[in
->npushback
].chr
;
104 else if (in
->stack
) {
105 wchar_t c
= in
->stack
->text
[in
->stack
->ptr
];
107 *pos
= in
->stack
->pos
;
108 if (in
->stack
->text
[++in
->stack
->ptr
] == L
'\0') {
109 macrostack
*tmp
= in
->stack
;
110 in
->stack
= tmp
->next
;
115 else if (in
->currfp
) {
117 while (in
->wcpos
>= in
->nwc
) {
119 int c
= getc(in
->currfp
);
131 /* Track line numbers, for error reporting */
134 if (in
->reportcols
) {
137 in
->pos
.col
= 1 + (in
->pos
.col
+ TAB_STOP
-1) % TAB_STOP
;
154 * Do input character set translation, so that we return
166 in
->nwc
= charset_to_unicode(&p
, &inlen
,
167 in
->wc
, lenof(in
->wc
),
168 in
->charset
, &in
->csstate
,
170 assert(p
== buf
+1 && inlen
== 0);
176 return in
->wc
[in
->wcpos
++];
183 * Lexical analysis of source files.
185 typedef struct token_Tag token
;
194 tok_eof
, /* end of file */
195 tok_eop
, /* end of paragraph */
196 tok_white
, /* whitespace */
197 tok_word
, /* a word or word fragment */
198 tok_cmd
, /* \command */
203 /* Halibut command keywords. */
205 c__invalid
, /* invalid command */
206 c__comment
, /* comment command (\#) */
207 c__escaped
, /* escaped character */
209 c__nbsp
, /* nonbreaking space */
210 c_A
, /* appendix heading */
211 c_B
, /* bibliography entry */
212 c_BR
, /* bibliography rewrite */
213 c_C
, /* chapter heading */
215 c_I
, /* invisible index mark */
216 c_IM
, /* index merge/rewrite */
217 c_K
, /* capitalised cross-reference */
218 c_S
, /* aux field is 0, 1, 2, ... */
219 c_U
, /* unnumbered-chapter heading */
220 c_W
, /* Web hyperlink */
221 c_b
, /* bulleted list */
223 c_cfg
, /* configuration directive */
224 c_copyright
, /* copyright statement */
225 c_cq
, /* quoted code (sugar for \q{\cw{x}}) */
226 c_cw
, /* weak code */
227 c_date
, /* document processing date */
228 c_dd
, /* description list: description */
229 c_define
, /* macro definition */
230 c_dt
, /* description list: described thing */
232 c_i
, /* visible index mark */
233 c_ii
, /* uncapitalised visible index mark */
234 c_k
, /* uncapitalised cross-reference */
235 c_lcont
, /* continuation para(s) for list item */
236 c_n
, /* numbered list */
237 c_nocite
, /* bibliography trickery */
238 c_preamble
, /* (obsolete) preamble text */
239 c_q
, /* quote marks */
240 c_quote
, /* block-quoted paragraphs */
241 c_rule
, /* horizontal rule */
242 c_title
, /* document title */
243 c_u
, /* aux field is char code */
244 c_versionid
/* document RCS id */
247 /* Perhaps whitespace should be defined in a more Unicode-friendly way? */
248 #define iswhite(c) ( (c)==32 || (c)==9 || (c)==13 || (c)==10 ) /* true for code 32 (space), 9 (tab), 13 (CR) or 10 (LF); evaluates c up to four times */
249 #define isnl(c) ( (c)==10 ) /* true only for code 10 (LF); a bare CR (13) is not a newline here */
250 #define isdec(c) ( ((c)>='0'&&(c)<='9') ) /* true for the digits '0'..'9'; evaluates c twice */
251 #define fromdec(c) ( (c)-'0' ) /* numeric value of a decimal digit; meaningful only when isdec(c) holds */
252 #define ishex(c) ( ((c)>='0'&&(c)<='9') || ((c)>='A'&&(c)<='F') || ((c)>='a'&&(c)<='f')) /* true for '0'..'9', 'A'..'F' or 'a'..'f'; evaluates c several times */
253 #define fromhex(c) ( (c)<='9' ? (c)-'0' : ((c)&0xDF) - ('A'-10) ) /* value 0..15 of a hex digit; &0xDF clears bit 0x20 so 'a'..'f' map onto 'A'..'F'; meaningful only when ishex(c) holds */
254 #define iscmd(c) ( ((c)>='0'&&(c)<='9') || ((c)>='A'&&(c)<='Z') || ((c)>='a'&&(c)<='z')) /* true for '0'..'9', 'A'..'Z' or 'a'..'z'; evaluates c several times */
257 * Keyword comparison function. Like strcmp, but between a wchar_t *
260 static int kwcmp(wchar_t const *p
, char const *q
) {
264 } while (*p
++ && *q
++ && !i
);
271 static void match_kw(token
*tok
) {
273 * FIXME. The ids are explicit in here so as to allow long-name
274 * equivalents to the various very short keywords.
276 static const struct { char const *name
; int id
; } keywords
[] = {
277 {"#", c__comment
}, /* comment command (\#) */
278 {"-", c__escaped
}, /* nonbreaking hyphen */
279 {".", c__nop
}, /* no-op */
280 {"A", c_A
}, /* appendix heading */
281 {"B", c_B
}, /* bibliography entry */
282 {"BR", c_BR
}, /* bibliography rewrite */
283 {"C", c_C
}, /* chapter heading */
284 {"H", c_H
}, /* heading */
285 {"I", c_I
}, /* invisible index mark */
286 {"IM", c_IM
}, /* index merge/rewrite */
287 {"K", c_K
}, /* capitalised cross-reference */
288 {"U", c_U
}, /* unnumbered-chapter heading */
289 {"W", c_W
}, /* Web hyperlink */
290 {"\\", c__escaped
}, /* escaped backslash (\\) */
291 {"_", c__nbsp
}, /* nonbreaking space (\_) */
292 {"b", c_b
}, /* bulleted list */
293 {"c", c_c
}, /* code */
294 {"cfg", c_cfg
}, /* configuration directive */
295 {"copyright", c_copyright
}, /* copyright statement */
296 {"cq", c_cq
}, /* quoted code (sugar for \q{\cw{x}}) */
297 {"cw", c_cw
}, /* weak code */
298 {"date", c_date
}, /* document processing date */
299 {"dd", c_dd
}, /* description list: description */
300 {"define", c_define
}, /* macro definition */
301 {"dt", c_dt
}, /* description list: described thing */
302 {"e", c_e
}, /* emphasis */
303 {"i", c_i
}, /* visible index mark */
304 {"ii", c_ii
}, /* uncapitalised visible index mark */
305 {"k", c_k
}, /* uncapitalised cross-reference */
306 {"lcont", c_lcont
}, /* continuation para(s) for list item */
307 {"n", c_n
}, /* numbered list */
308 {"nocite", c_nocite
}, /* bibliography trickery */
309 {"preamble", c_preamble
}, /* (obsolete) preamble text */
310 {"q", c_q
}, /* quote marks */
311 {"quote", c_quote
}, /* block-quoted paragraphs */
312 {"rule", c_rule
}, /* horizontal rule */
313 {"title", c_title
}, /* document title */
314 {"versionid", c_versionid
}, /* document RCS id */
315 {"{", c__escaped
}, /* escaped lbrace (\{) */
316 {"}", c__escaped
}, /* escaped rbrace (\}) */
321 * Special cases: \S{0,1,2,...} and \uABCD. If the syntax
322 * doesn't match correctly, we just fall through to the
323 * binary-search phase.
325 if (tok
->text
[0] == 'S') {
326 /* We expect numeric characters thereafter. */
327 wchar_t *p
= tok
->text
+1;
333 while (*p
&& isdec(*p
)) {
334 n
= 10 * n
+ fromdec(*p
);
343 } else if (tok
->text
[0] == 'u') {
344 /* We expect hex characters thereafter. */
345 wchar_t *p
= tok
->text
+1;
347 while (*p
&& ishex(*p
)) {
348 n
= 16 * n
+ fromhex(*p
);
359 j
= sizeof(keywords
)/sizeof(*keywords
);
362 c
= kwcmp(tok
->text
, keywords
[k
].name
);
368 tok
->cmd
= keywords
[k
].id
;
373 tok
->cmd
= c__invalid
;
378 * Read a token from the input file, in the normal way (`normal' in
379 * the sense that code paragraphs work a different way).
381 token
get_token(input
*in
) {
386 rdstring rs
= { 0, 0, NULL
};
387 rdstringc rsc
= { 0, 0, NULL
};
390 ret
.text
= NULL
; /* default */
391 ret
.origtext
= NULL
; /* default */
392 if (in
->pushback_chars
) {
393 rdaddsc(&rsc
, in
->pushback_chars
);
394 sfree(in
->pushback_chars
);
395 in
->pushback_chars
= NULL
;
397 c
= get(in
, &cpos
, &rsc
);
399 if (iswhite(c
)) { /* tok_white or tok_eop */
406 } while ((c
= get(in
, &cpos
, &rsc
)) != EOF
&& iswhite(c
));
413 in
->pushback_chars
= dupstr(rsc
.text
+ prevpos
);
417 ret
.type
= (nls
> 1 ? tok_eop
: tok_white
);
419 } else if (c
== EOF
) { /* tok_eof */
423 } else if (c
== '\\') { /* tok_cmd */
424 rsc
.pos
= prevpos
= 0;
425 c
= get(in
, &cpos
, &rsc
);
426 if (c
== '-' || c
== '\\' || c
== '_' ||
427 c
== '#' || c
== '{' || c
== '}' || c
== '.') {
428 /* single-char command */
431 } else if (c
== 'u') {
437 c
= get(in
, &cpos
, &rsc
);
438 } while (ishex(c
) && len
< 5);
440 } else if (iscmd(c
)) {
444 c
= get(in
, &cpos
, &rsc
);
449 * Now match the command against the list of available
453 ret
.text
= ustrdup(rs
.text
);
455 in
->pushback_chars
= dupstr(rsc
.text
+ prevpos
);
456 rsc
.text
[prevpos
] = '\0';
457 ret
.origtext
= dupstr(rsc
.text
);
459 ret
.origtext
= dupstr("");
465 } else if (c
== '{') { /* tok_lbrace */
466 ret
.type
= tok_lbrace
;
469 } else if (c
== '}') { /* tok_rbrace */
470 ret
.type
= tok_rbrace
;
473 } else { /* tok_word */
475 * Read a word: the longest possible contiguous sequence of
476 * things other than whitespace, backslash, braces and
477 * hyphen. A hyphen terminates the word but is returned as
478 * part of it; everything else is pushed back for the next
479 * token. The `aux' field contains TRUE if the word ends in
482 ret
.aux
= FALSE
; /* assumed for now */
485 if (iswhite(c
) || c
=='{' || c
=='}' || c
=='\\' || c
==EOF
) {
486 /* Put back the character that caused termination */
494 break; /* hyphen terminates word */
498 c
= get(in
, &cpos
, &rsc
);
501 ret
.text
= ustrdup(rs
.text
);
503 in
->pushback_chars
= dupstr(rsc
.text
+ prevpos
);
504 rsc
.text
[prevpos
] = '\0';
505 ret
.origtext
= dupstr(rsc
.text
);
507 ret
.origtext
= dupstr("");
516 * Determine whether the next input character is an open brace (for
517 * telling code paragraphs from paragraphs which merely start with
520 int isbrace(input
*in
) {
524 c
= get(in
, &cpos
, NULL
);
530 * Read the rest of a line that starts `\c'. Including nothing at
531 * all (tok_word with empty text).
533 token
get_codepar_token(input
*in
) {
536 rdstring rs
= { 0, 0, NULL
};
541 c
= get(in
, &cpos
, NULL
); /* expect (and discard) one space */
544 c
= get(in
, &cpos
, NULL
);
547 while (!isnl(c
) && c
!= EOF
) {
549 c
= get(in
, &cpos
, NULL
);
550 /* Discard \r just before \n. */
551 if (c2
!= 13 || !isnl(c
))
555 ret
.text
= ustrdup(rs
.text
);
561 * Adds a new word to a linked list
563 static word
*addword(word newword
, word
***hptrptr
) {
567 mnewword
= snew(word
);
568 *mnewword
= newword
; /* structure copy */
569 mnewword
->next
= NULL
;
570 **hptrptr
= mnewword
;
571 *hptrptr
= &mnewword
->next
;
576 * Adds a new paragraph to a linked list
578 static paragraph
*addpara(paragraph newpara
, paragraph
***hptrptr
) {
579 paragraph
*mnewpara
= snew(paragraph
);
580 *mnewpara
= newpara
; /* structure copy */
581 mnewpara
->next
= NULL
;
582 **hptrptr
= mnewpara
;
583 *hptrptr
= &mnewpara
->next
;
588 * Destructor before token is reassigned; should catch most memory
591 #define dtor(t) ( sfree(t.text), sfree(t.origtext) ) /* passes t.text and t.origtext to sfree(); t is expanded twice and unparenthesised, so pass a plain variable name, not an expression */
594 * Reads a single file (ie until get() returns EOF)
596 static void read_file(paragraph
***ret
, input
*in
, indexdata
*idx
,
600 word wd
, **whptr
, **idximplicit
;
601 wchar_t utext
[2], *wdtext
;
604 int iswhite
, seenwhite
;
609 stack_nop
= 0, /* do nothing (for error recovery) */
610 stack_ualt
= 1, /* \u alternative */
611 stack_style
= 2, /* \e, \c, \cw */
612 stack_idx
= 4, /* \I, \i, \ii */
613 stack_hyper
= 8, /* \W */
614 stack_quote
= 16 /* \q */
616 word
**whptr
; /* to restore from \u alternatives */
617 word
**idximplicit
; /* to restore from \u alternatives */
622 struct crossparaitem
{
623 int type
; /* currently c_lcont, c_quote or -1 */
624 int seen_lcont
, seen_quote
;
627 word
*indexword
, *uword
, *iword
;
630 int index_downcase
, index_visible
, indexing
;
631 const rdstring nullrs
= { 0, 0, NULL
};
638 crossparastk
= stk_new();
641 * Loop on each paragraph.
644 int start_cmd
= c__invalid
;
647 par
.origkeyword
= NULL
;
655 dtor(t
), t
= get_token(in
);
658 } while (t
.type
== tok_eop
);
659 if (t
.type
== tok_eof
)
663 * Parse code paragraphs separately.
665 if (t
.type
== tok_cmd
&& t
.cmd
== c_c
&& !isbrace(in
)) {
666 int wtype
= word_WeakCode
;
668 par
.type
= para_Code
;
671 dtor(t
), t
= get_codepar_token(in
);
673 wd
.breaks
= FALSE
; /* shouldn't need this... */
674 wd
.text
= ustrdup(t
.text
);
678 dtor(t
), t
= get_token(in
);
679 if (t
.type
== tok_white
) {
681 * The newline after a code-paragraph line
683 dtor(t
), t
= get_token(in
);
685 if (t
.type
== tok_eop
|| t
.type
== tok_eof
||
686 t
.type
== tok_rbrace
) { /* might be } terminating \lcont */
687 if (t
.type
== tok_rbrace
)
690 } else if (t
.type
== tok_cmd
&& t
.cmd
== c_c
) {
691 wtype
= word_WeakCode
;
692 } else if (t
.type
== tok_cmd
&& t
.cmd
== c_e
&&
693 wtype
== word_WeakCode
) {
696 error(err_brokencodepara
, &t
.pos
);
697 prev_para_type
= par
.type
;
699 while (t
.type
!= tok_eop
) /* error recovery: */
700 dtor(t
), t
= get_token(in
); /* eat rest of paragraph */
701 goto codeparabroken
; /* ick, but such is life */
704 prev_para_type
= par
.type
;
711 * Spot the special commands that define a grouping of more
712 * than one paragraph, and also the closing braces that
715 if (t
.type
== tok_cmd
&&
716 (t
.cmd
== c_lcont
|| t
.cmd
== c_quote
)) {
717 struct crossparaitem
*sitem
, *stop
;
721 * Expect, and swallow, an open brace.
723 dtor(t
), t
= get_token(in
);
724 if (t
.type
!= tok_lbrace
) {
725 error(err_explbr
, &t
.pos
);
730 * Also expect, and swallow, any whitespace after that
731 * (a newline before a code paragraph wouldn't be
735 dtor(t
), t
= get_token(in
);
736 } while (t
.type
== tok_white
);
739 if (cmd
== c_lcont
) {
741 * \lcont causes a continuation of a list item into
742 * multiple paragraphs (which may in turn contain
743 * nested lists, code paras etc). Hence, the previous
744 * paragraph must be of a list type.
746 sitem
= snew(struct crossparaitem
);
747 stop
= (struct crossparaitem
*)stk_top(crossparastk
);
751 sitem
->seen_quote
= sitem
->seen_lcont
= 0;
753 if (prev_para_type
== para_Bullet
||
754 prev_para_type
== para_NumberedList
||
755 prev_para_type
== para_Description
) {
756 sitem
->type
= c_lcont
;
757 sitem
->seen_lcont
= 1;
758 par
.type
= para_LcontPush
;
759 prev_para_type
= par
.type
;
763 * Push a null item on the cross-para stack so that
764 * when we see the corresponding closing brace we
765 * don't give a cascade error.
768 error(err_misplacedlcont
, &t
.pos
);
772 * \quote causes a group of paragraphs to be
773 * block-quoted (typically they will be indented a
776 sitem
= snew(struct crossparaitem
);
777 stop
= (struct crossparaitem
*)stk_top(crossparastk
);
781 sitem
->seen_quote
= sitem
->seen_lcont
= 0;
782 sitem
->type
= c_quote
;
783 sitem
->seen_quote
= 1;
784 par
.type
= para_QuotePush
;
785 prev_para_type
= par
.type
;
788 stk_push(crossparastk
, sitem
);
790 } else if (t
.type
== tok_rbrace
) {
791 struct crossparaitem
*sitem
= stk_pop(crossparastk
);
793 error(err_unexbrace
, &t
.pos
);
795 switch (sitem
->type
) {
797 par
.type
= para_LcontPop
;
798 prev_para_type
= par
.type
;
802 par
.type
= para_QuotePop
;
803 prev_para_type
= par
.type
;
812 while (t
.type
== tok_cmd
&&
813 macrolookup(macros
, in
, t
.text
, &t
.pos
)) {
814 dtor(t
), t
= get_token(in
);
818 * This token begins a paragraph. See if it's one of the
819 * special commands that define a paragraph type.
821 * (note that \# is special in a way, and \nocite takes no
824 par
.type
= para_Normal
;
825 if (t
.type
== tok_cmd
) {
827 int is_macro
= FALSE
;
835 error(err_badparatype
, t
.text
, &t
.pos
);
841 break; /* `\#{': isn't a comment para */
844 dtor(t
), t
= get_token(in
);
845 } while (t
.type
!= tok_eop
&& t
.type
!= tok_eof
);
846 continue; /* next paragraph */
850 * 1 -- exactly one keyword
851 * 2 -- at least one keyword
852 * 4 -- any number of keywords including zero
853 * 8 -- at least one keyword and then nothing else
854 * 16 -- nothing at all! no keywords, no body
855 * 32 -- no keywords at all
857 case c_A
: needkw
= 2; par
.type
= para_Appendix
; break;
858 case c_B
: needkw
= 2; par
.type
= para_Biblio
; break;
859 case c_BR
: needkw
= 1; par
.type
= para_BR
;
860 start_cmd
= c_BR
; break;
861 case c_C
: needkw
= 2; par
.type
= para_Chapter
; break;
862 case c_H
: needkw
= 2; par
.type
= para_Heading
;
865 case c_IM
: needkw
= 2; par
.type
= para_IM
;
866 start_cmd
= c_IM
; break;
867 case c_S
: needkw
= 2; par
.type
= para_Subsect
;
868 par
.aux
= t
.aux
; break;
869 case c_U
: needkw
= 32; par
.type
= para_UnnumberedChapter
; break;
870 /* For \b and \n the keyword is optional */
871 case c_b
: needkw
= 4; par
.type
= para_Bullet
; break;
872 case c_dt
: needkw
= 4; par
.type
= para_DescribedThing
; break;
873 case c_dd
: needkw
= 4; par
.type
= para_Description
; break;
874 case c_n
: needkw
= 4; par
.type
= para_NumberedList
; break;
875 case c_cfg
: needkw
= 8; par
.type
= para_Config
;
876 start_cmd
= c_cfg
; break;
877 case c_copyright
: needkw
= 32; par
.type
= para_Copyright
; break;
878 case c_define
: is_macro
= TRUE
; needkw
= 1; break;
879 /* For \nocite the keyword is _everything_ */
880 case c_nocite
: needkw
= 8; par
.type
= para_NoCite
; break;
881 case c_preamble
: needkw
= 32; par
.type
= para_Normal
; break;
882 case c_rule
: needkw
= 16; par
.type
= para_Rule
; break;
883 case c_title
: needkw
= 32; par
.type
= para_Title
; break;
884 case c_versionid
: needkw
= 32; par
.type
= para_VersionID
; break;
887 if (par
.type
== para_Chapter
||
888 par
.type
== para_Heading
||
889 par
.type
== para_Subsect
||
890 par
.type
== para_Appendix
||
891 par
.type
== para_UnnumberedChapter
) {
892 struct crossparaitem
*sitem
= stk_top(crossparastk
);
893 if (sitem
&& (sitem
->seen_lcont
|| sitem
->seen_quote
)) {
894 error(err_sectmarkerinblock
,
896 (sitem
->seen_lcont ?
"lcont" : "quote"));
901 rdstring rs
= { 0, 0, NULL
};
902 rdstringc rsc
= { 0, 0, NULL
};
907 dtor(t
), t
= get_token(in
);
909 while (t
.type
== tok_lbrace
||
910 (t
.type
== tok_white
&& (needkw
& 24))) {
912 * In paragraph types which can't accept any
913 * body text (such as \cfg), we are lenient
914 * about whitespace between keywords. This is
915 * important for \cfg in particular since it
916 * can often have many keywords which are long
917 * pieces of text, so it's useful to permit the
918 * user to wrap the line between them.
920 if (t
.type
== tok_white
) {
921 dtor(t
), t
= get_token(in
); /* eat the space */
924 /* This is a keyword. */
926 /* FIXME: there will be bugs if anyone specifies an
927 * empty keyword (\foo{}), so trap this case. */
928 while (dtor(t
), t
= get_token(in
),
929 t
.type
== tok_word
||
930 t
.type
== tok_white
||
931 (t
.type
== tok_cmd
&& t
.cmd
== c__nbsp
) ||
932 (t
.type
== tok_cmd
&& t
.cmd
== c__escaped
) ||
933 (t
.type
== tok_cmd
&& t
.cmd
== c_u
)) {
934 if (t
.type
== tok_white
||
935 (t
.type
== tok_cmd
&& t
.cmd
== c__nbsp
)) {
938 } else if (t
.type
== tok_cmd
&& t
.cmd
== c_u
) {
941 rdaddsc(&rsc
, t
.origtext
);
944 rdaddsc(&rsc
, t
.origtext
);
947 if (t
.type
!= tok_rbrace
) {
948 error(err_kwunclosed
, &t
.pos
);
951 rdadd(&rs
, 0); /* add string terminator */
952 rdaddc(&rsc
, 0); /* add string terminator */
953 dtor(t
), t
= get_token(in
); /* eat right brace */
956 rdadd(&rs
, 0); /* add string terminator */
957 rdaddc(&rsc
, 0); /* add string terminator */
959 /* See whether we have the right number of keywords. */
960 if ((needkw
& 48) && nkeys
> 0)
961 error(err_kwillegal
, &fp
);
962 if ((needkw
& 11) && nkeys
== 0)
963 error(err_kwexpected
, &fp
);
964 if ((needkw
& 5) && nkeys
> 1)
965 error(err_kwtoomany
, &fp
);
969 * Macro definition. Get the rest of the line
970 * as a code-paragraph token, repeatedly until
971 * there's nothing more left of it. Separate
974 rdstring macrotext
= { 0, 0, NULL
};
976 dtor(t
), t
= get_codepar_token(in
);
977 if (macrotext
.pos
> 0)
978 rdadd(¯otext
, L
'\n');
979 rdadds(¯otext
, t
.text
);
980 dtor(t
), t
= get_token(in
);
981 if (t
.type
== tok_eop
) break;
983 macrodef(macros
, rs
.text
, macrotext
.text
, fp
);
984 continue; /* next paragraph */
987 par
.keyword
= rdtrim(&rs
);
988 par
.origkeyword
= rdtrimc(&rsc
);
990 /* Move to EOP in case of needkw==8 or 16 (no body) */
992 /* We allow whitespace even when we expect no para body */
993 while (t
.type
== tok_white
)
994 dtor(t
), t
= get_token(in
);
995 if (t
.type
!= tok_eop
&& t
.type
!= tok_eof
&&
996 (start_cmd
== c__invalid
||
997 t
.type
!= tok_cmd
|| t
.cmd
!= start_cmd
)) {
998 error(err_bodyillegal
, &t
.pos
);
999 /* Error recovery: eat the rest of the paragraph */
1000 while (t
.type
!= tok_eop
&& t
.type
!= tok_eof
&&
1001 (start_cmd
== c__invalid
||
1002 t
.type
!= tok_cmd
|| t
.cmd
!= start_cmd
))
1003 dtor(t
), t
= get_token(in
);
1005 if (t
.type
== tok_cmd
)
1006 already
= TRUE
;/* inhibit get_token at top of loop */
1007 prev_para_type
= par
.type
;
1010 if (par
.type
== para_Config
) {
1011 input_configure(in
, &par
);
1013 continue; /* next paragraph */
1019 * Now read the actual paragraph, word by word, adding to
1020 * the paragraph list.
1022 * Mid-paragraph commands:
1035 parsestk
= stk_new();
1036 style
= word_Normal
;
1037 spcstyle
= word_WhiteSpace
;
1040 while (t
.type
!= tok_eop
&& t
.type
!= tok_eof
) {
1044 /* Handle implicit paragraph breaks after \IM, \BR etc */
1045 if (start_cmd
!= c__invalid
&&
1046 t
.type
== tok_cmd
&& t
.cmd
== start_cmd
) {
1047 already
= TRUE
; /* inhibit get_token at top of loop */
1051 if (t
.type
== tok_cmd
&& t
.cmd
== c__nop
) {
1052 dtor(t
), t
= get_token(in
);
1053 continue; /* do nothing! */
1056 if (t
.type
== tok_cmd
&& t
.cmd
== c__escaped
) {
1057 t
.type
= tok_word
; /* nice and simple */
1058 t
.aux
= 0; /* even if `\-' - nonbreaking! */
1060 if (t
.type
== tok_cmd
&& t
.cmd
== c__nbsp
) {
1061 t
.type
= tok_word
; /* nice and simple */
1063 t
.text
= ustrdup(L
" "); /* text is ` ' not `_' */
1064 t
.aux
= 0; /* (nonbreaking) */
1068 if (whptr
== &par
.words
)
1069 break; /* strip whitespace at start of para */
1078 * Inhibit use of whitespace if it's (probably the
1079 * newline) before a repeat \IM / \BR type
1082 if (start_cmd
!= c__invalid
) {
1083 dtor(t
), t
= get_token(in
);
1085 if (t
.type
== tok_cmd
&& t
.cmd
== start_cmd
)
1090 rdadd(&indexstr
, ' ');
1091 if (!indexing
|| index_visible
)
1092 addword(wd
, &whptr
);
1094 addword(wd
, &idximplicit
);
1099 rdadds(&indexstr
, t
.text
);
1105 if (!indexing
|| index_visible
) {
1106 wd
.text
= ustrdup(t
.text
);
1107 addword(wd
, &whptr
);
1110 wd
.text
= ustrdup(t
.text
);
1111 addword(wd
, &idximplicit
);
1115 error(err_unexbrace
, &t
.pos
);
1116 /* Error recovery: push nop */
1117 sitem
= snew(struct stack_item
);
1118 sitem
->type
= stack_nop
;
1119 sitem
->fpos
= t
.pos
;
1120 stk_push(parsestk
, sitem
);
1123 sitem
= stk_pop(parsestk
);
1126 * This closing brace could have been an
1127 * indication that the cross-paragraph stack
1128 * wants popping. Accordingly, we treat it here
1129 * as an indication that the paragraph is over.
1134 if (sitem
->type
& stack_ualt
) {
1135 whptr
= sitem
->whptr
;
1136 idximplicit
= sitem
->idximplicit
;
1138 if (sitem
->type
& stack_style
) {
1139 style
= word_Normal
;
1140 spcstyle
= word_WhiteSpace
;
1142 if (sitem
->type
& stack_idx
) {
1143 indexword
->text
= ustrdup(indexstr
.text
);
1144 if (index_downcase
) {
1147 ustrlow(indexword
->text
);
1148 ustrlow(indexstr
.text
);
1150 for (w
= idxwordlist
; w
; w
= w
->next
)
1155 rdadd(&indexstr
, L
'\0');
1156 index_merge(idx
, FALSE
, indexstr
.text
,
1157 idxwordlist
, &sitem
->fpos
);
1158 sfree(indexstr
.text
);
1160 if (sitem
->type
& stack_hyper
) {
1162 wd
.type
= word_HyperEnd
;
1167 if (!indexing
|| index_visible
)
1168 addword(wd
, &whptr
);
1170 addword(wd
, &idximplicit
);
1172 if (sitem
->type
& stack_quote
) {
1174 wd
.type
= toquotestyle(style
);
1176 wd
.aux
= quote_Close
;
1179 if (!indexing
|| index_visible
)
1180 addword(wd
, &whptr
);
1182 rdadd(&indexstr
, L
'"');
1183 addword(wd
, &idximplicit
);
1193 * In-paragraph comment: \#{ balanced braces }
1195 * Anything goes here; even tok_eop. We should
1196 * eat whitespace after the close brace _if_
1197 * there was whitespace before the \#.
1199 dtor(t
), t
= get_token(in
);
1200 if (t
.type
!= tok_lbrace
) {
1201 error(err_explbr
, &t
.pos
);
1204 while (braces
> 0) {
1205 dtor(t
), t
= get_token(in
);
1206 if (t
.type
== tok_lbrace
)
1208 else if (t
.type
== tok_rbrace
)
1210 else if (t
.type
== tok_eof
) {
1211 error(err_commenteof
, &t
.pos
);
1218 dtor(t
), t
= get_token(in
);
1219 if (t
.type
== tok_white
) {
1228 dtor(t
), t
= get_token(in
);
1229 if (t
.type
!= tok_lbrace
) {
1230 error(err_explbr
, &t
.pos
);
1233 * Enforce that \q may not be used anywhere
1234 * within \c. (It shouldn't be necessary
1235 * since the whole point of \c should be
1236 * that the user wants to exercise exact
1237 * control over the glyphs used, and
1238 * forbidding it has the useful effect of
1239 * relieving some backends of having to
1240 * make difficult decisions.)
1244 if (style
!= word_Code
&& style
!= word_WeakCode
) {
1246 wd
.type
= toquotestyle(style
);
1248 wd
.aux
= quote_Open
;
1251 if (!indexing
|| index_visible
)
1252 addword(wd
, &whptr
);
1254 rdadd(&indexstr
, L
'"');
1255 addword(wd
, &idximplicit
);
1257 stype
= stack_quote
;
1259 error(err_codequote
, &t
.pos
);
1262 sitem
= snew(struct stack_item
);
1263 sitem
->fpos
= t
.pos
;
1264 sitem
->type
= stype
;
1266 if (style
!= word_Normal
) {
1267 error(err_nestedstyles
, &t
.pos
);
1269 style
= word_WeakCode
;
1270 spcstyle
= tospacestyle(style
);
1271 sitem
->type
|= stack_style
;
1274 stk_push(parsestk
, sitem
);
1282 * Keyword, hyperlink, or \date. We expect a
1283 * left brace, some text, and then a right
1284 * brace. No nesting; no arguments.
1289 wd
.type
= word_UpperXref
;
1290 else if (t
.cmd
== c_k
)
1291 wd
.type
= word_LowerXref
;
1292 else if (t
.cmd
== c_W
)
1293 wd
.type
= word_HyperLink
;
1295 wd
.type
= word_Normal
;
1296 dtor(t
), t
= get_token(in
);
1297 if (t
.type
!= tok_lbrace
) {
1298 if (wd
.type
== word_Normal
) {
1299 time_t thetime
= time(NULL
);
1300 struct tm
*broken
= localtime(&thetime
);
1302 wdtext
= ustrftime(NULL
, broken
);
1305 error(err_explbr
, &t
.pos
);
1309 rdstring rs
= { 0, 0, NULL
};
1310 while (dtor(t
), t
= get_token(in
),
1311 t
.type
== tok_word
|| t
.type
== tok_white
) {
1312 if (t
.type
== tok_white
)
1315 rdadds(&rs
, t
.text
);
1317 if (wd
.type
== word_Normal
) {
1318 time_t thetime
= time(NULL
);
1319 struct tm
*broken
= localtime(&thetime
);
1320 wdtext
= ustrftime(rs
.text
, broken
);
1323 wdtext
= ustrdup(rs
.text
);
1326 if (t
.type
!= tok_rbrace
) {
1327 error(err_kwexprbr
, &t
.pos
);
1332 if (!indexing
|| index_visible
) {
1333 wd
.text
= ustrdup(wdtext
);
1334 addword(wd
, &whptr
);
1337 wd
.text
= ustrdup(wdtext
);
1338 addword(wd
, &idximplicit
);
1341 if (wd
.type
== word_HyperLink
) {
1343 * Hyperlinks are different: they then
1344 * expect another left brace, to begin
1345 * delimiting the text marked by the link.
1347 dtor(t
), t
= get_token(in
);
1348 sitem
= snew(struct stack_item
);
1349 sitem
->fpos
= wd
.fpos
;
1350 sitem
->type
= stack_hyper
;
1352 * Special cases: \W{}\i, \W{}\ii
1354 if (t
.type
== tok_cmd
&&
1355 (t
.cmd
== c_i
|| t
.cmd
== c_ii
)) {
1357 error(err_nestedindex
, &t
.pos
);
1359 /* Add an index-reference word with no
1361 wd
.type
= word_IndexRef
;
1366 indexword
= addword(wd
, &whptr
);
1367 /* Set up a rdstring to read the
1370 /* Flags so that we do the Right
1371 * Things with text */
1372 index_visible
= (type
!= c_I
);
1373 index_downcase
= (type
== c_ii
);
1376 idximplicit
= &idxwordlist
;
1378 sitem
->type
|= stack_idx
;
1380 dtor(t
), t
= get_token(in
);
1383 * Special cases: \W{}\c, \W{}\e, \W{}\cw
1385 if (t
.type
== tok_cmd
&&
1386 (t
.cmd
== c_e
|| t
.cmd
== c_c
|| t
.cmd
== c_cw
)) {
1387 if (style
!= word_Normal
)
1388 error(err_nestedstyles
, &t
.pos
);
1390 style
= (t
.cmd
== c_c ? word_Code
:
1391 t
.cmd
== c_cw ? word_WeakCode
:
1393 spcstyle
= tospacestyle(style
);
1394 sitem
->type
|= stack_style
;
1396 dtor(t
), t
= get_token(in
);
1398 if (t
.type
!= tok_lbrace
) {
1399 error(err_explbr
, &t
.pos
);
1402 stk_push(parsestk
, sitem
);
1410 if (style
!= word_Normal
) {
1411 error(err_nestedstyles
, &t
.pos
);
1412 /* Error recovery: eat lbrace, push nop. */
1413 dtor(t
), t
= get_token(in
);
1414 sitem
= snew(struct stack_item
);
1415 sitem
->fpos
= t
.pos
;
1416 sitem
->type
= stack_nop
;
1417 stk_push(parsestk
, sitem
);
1419 dtor(t
), t
= get_token(in
);
1420 if (t
.type
!= tok_lbrace
) {
1421 error(err_explbr
, &t
.pos
);
1423 style
= (type
== c_c ? word_Code
:
1424 type
== c_cw ? word_WeakCode
:
1426 spcstyle
= tospacestyle(style
);
1427 sitem
= snew(struct stack_item
);
1428 sitem
->fpos
= t
.pos
;
1429 sitem
->type
= stack_style
;
1430 stk_push(parsestk
, sitem
);
1438 error(err_nestedindex
, &t
.pos
);
1439 /* Error recovery: eat lbrace, push nop. */
1440 dtor(t
), t
= get_token(in
);
1441 sitem
= snew(struct stack_item
);
1442 sitem
->fpos
= t
.pos
;
1443 sitem
->type
= stack_nop
;
1444 stk_push(parsestk
, sitem
);
1446 sitem
= snew(struct stack_item
);
1447 sitem
->fpos
= t
.pos
;
1448 sitem
->type
= stack_idx
;
1449 dtor(t
), t
= get_token(in
);
1451 * Special cases: \i\c, \i\e, \i\cw
1454 if (t
.type
== tok_cmd
&&
1455 (t
.cmd
== c_e
|| t
.cmd
== c_c
|| t
.cmd
== c_cw
)) {
1456 if (style
!= word_Normal
)
1457 error(err_nestedstyles
, &t
.pos
);
1459 style
= (t
.cmd
== c_c ? word_Code
:
1460 t
.cmd
== c_cw ? word_WeakCode
:
1462 spcstyle
= tospacestyle(style
);
1463 sitem
->type
|= stack_style
;
1465 dtor(t
), t
= get_token(in
);
1467 if (t
.type
!= tok_lbrace
) {
1469 error(err_explbr
, &t
.pos
);
1471 /* Add an index-reference word with no text as yet */
1472 wd
.type
= word_IndexRef
;
1477 indexword
= addword(wd
, &whptr
);
1478 /* Set up a rdstring to read the index text */
1480 /* Flags so that we do the Right Things with text */
1481 index_visible
= (type
!= c_I
);
1482 index_downcase
= (type
== c_ii
);
1485 idximplicit
= &idxwordlist
;
1486 /* Stack item to close the indexing on exit */
1487 stk_push(parsestk
, sitem
);
1492 utext
[0] = uchr
; utext
[1] = 0;
1498 if (!indexing
|| index_visible
) {
1499 wd
.text
= ustrdup(utext
);
1500 uword
= addword(wd
, &whptr
);
1504 wd
.text
= ustrdup(utext
);
1505 iword
= addword(wd
, &idximplicit
);
1508 dtor(t
), t
= get_token(in
);
1509 if (t
.type
== tok_lbrace
) {
1511 * \u with a left brace. Until the brace
1512 * closes, all further words go on a
1513 * sidetrack from the main thread of the
1516 sitem
= snew(struct stack_item
);
1517 sitem
->fpos
= t
.pos
;
1518 sitem
->type
= stack_ualt
;
1519 sitem
->whptr
= whptr
;
1520 sitem
->idximplicit
= idximplicit
;
1521 stk_push(parsestk
, sitem
);
1522 whptr
= uword ?
&uword
->alt
: NULL
;
1523 idximplicit
= iword ?
&iword
->alt
: NULL
;
1526 rdadd(&indexstr
, uchr
);
1531 if (!macrolookup(macros
, in
, t
.text
, &t
.pos
))
1532 error(err_badmidcmd
, t
.text
, &t
.pos
);
1537 dtor(t
), t
= get_token(in
);
1538 seenwhite
= iswhite
;
1541 /* Check the stack is empty */
1542 if (stk_top(parsestk
)) {
1543 while ((sitem
= stk_pop(parsestk
)))
1545 error(err_missingrbrace
, &t
.pos
);
1548 prev_para_type
= par
.type
;
1550 * Before we add the paragraph to the output list, we
1551 * should check that there was any text in it at all; there
1552 * might not be if (for example) the paragraph contained
1553 * nothing but an unrecognised command sequence, and if we
1554 * put an empty paragraph on the list it may confuse the
1555 * back ends later on.
1560 if (t
.type
== tok_eof
)
1564 if (stk_top(crossparastk
)) {
1567 error(err_missingrbrace2
, &t
.pos
);
1568 while ((p
= stk_pop(crossparastk
)))
1573 * We break to here rather than returning, because otherwise
1574 * this cleanup doesn't happen.
1578 stk_free(crossparastk
);
1585 void (*reader
)(input
*);
1587 { "%!FontType1-", 12, FALSE
, &read_pfa_file
},
1588 { "%!PS-AdobeFont-", 15, FALSE
, &read_pfa_file
},
1589 { "\x80\x01", 2, TRUE
, &read_pfb_file
},
1590 { "StartFontMetrics", 16, FALSE
, &read_afm_file
},
1591 { "\x00\x01\x00\x00", 4, TRUE
, &read_sfnt_file
},
1592 { "true", 4, TRUE
, &read_sfnt_file
},
1595 paragraph
*read_input(input
*in
, indexdata
*idx
) {
1596 paragraph
*head
= NULL
;
1597 paragraph
**hptr
= &head
;
1602 void (*reader
)(input
*);
1604 macros
= newtree234(macrocmp
);
1606 while (in
->currindex
< in
->nfiles
) {
1607 setpos(in
, in
->filenames
[in
->currindex
]);
1608 in
->charset
= in
->defcharset
;
1609 in
->csstate
= charset_init_state
;
1610 in
->wcpos
= in
->nwc
= 0;
1611 in
->pushback_chars
= NULL
;
1613 if (!in
->filenames
[in
->currindex
]) {
1615 in
->wantclose
= FALSE
; /* don't fclose stdin */
1617 * When reading standard input, we always expect to see
1618 * an actual Halibut file and not any of the unusual
1619 * input types like fonts.
1624 * Open the file in binary mode to look for magic
1625 * numbers. We'll switch to text mode if we find we're
1626 * looking at a text file type.
1628 in
->currfp
= fopen(in
->filenames
[in
->currindex
], "rb");
1629 binary
= FALSE
; /* default to Halibut source, which is text */
1631 in
->wantclose
= TRUE
;
1633 len
= fread(mag
, 1, sizeof(mag
), in
->currfp
);
1634 for (i
= 0; i
< lenof(magics
); i
++) {
1635 if (len
>= magics
[i
].nmagic
&&
1636 memcmp(mag
, magics
[i
].magic
, magics
[i
].nmagic
) == 0) {
1637 reader
= magics
[i
].reader
;
1638 binary
= magics
[i
].binary
;
1646 in
->currfp
= fopen(in
->filenames
[in
->currindex
], "r");
1650 if (reader
== NULL
) {
1651 read_file(&hptr
, in
, idx
, macros
);
1659 macrocleanup(macros
);