Sort out error handling everywhere a charset name is converted into
[sgt/halibut] / bk_html.c
Commit | Line | Data
78c73085 1/*
2 * HTML backend for Halibut
3 */
4
5/*
6 * TODO:
7 *
8 * - I'm never entirely convinced that having a fragment link to
9 * come in at the start of the real text in the file is
10 * sensible. Perhaps for the topmost section in the file, no
11 * fragment should be used? (Though it should probably still be
12 * _there_ even if unused.)
78c73085 13 */
14
15#include <stdio.h>
16#include <stdlib.h>
17#include <assert.h>
18#include <limits.h>
19#include "halibut.h"
20
/*
 * True if `type' is one of the paragraph types treated as a heading:
 * the document title, any kind of chapter, a heading or a subsection.
 */
#define is_heading_type(type) \
    ( (type) == para_Title              || \
      (type) == para_Chapter            || \
      (type) == para_Appendix           || \
      (type) == para_UnnumberedChapter  || \
      (type) == para_Heading            || \
      (type) == para_Subsect )
27
/*
 * Numeric depth of heading paragraph `p': -1 for the title, 1 for a
 * heading, aux+1 for a subsection, and 0 for anything else
 * (which includes the chapter-level types).
 */
#define heading_depth(p) \
    ( (p)->type == para_Subsect ? (p)->aux + 1 : \
      (p)->type == para_Heading ? 1 :            \
      (p)->type == para_Title   ? -1 :           \
      0 )
31
/*
 * Numbering configuration for one level of section heading.
 */
typedef struct {
    /* Boolean, set from html-chapter-numeric / html-section-numeric. */
    int just_numbers;
    /* Set from html-chapter-suffix / html-section-suffix. */
    wchar_t *number_suffix;
} sectlevel;
36
37typedef struct {
38 int nasect;
39 sectlevel achapter, *asect;
40 int *contents_depths; /* 0=main, 1=chapter, 2=sect etc */
41 int ncdepths;
42 int address_section, visible_version_id;
43 int leaf_contains_contents, leaf_smallest_contents;
44 char *contents_filename;
45 char *index_filename;
46 char *template_filename;
47 char *single_filename;
48 char *template_fragment;
49 char *head_end, *body_start, *body_end, *addr_start, *addr_end;
50 char *body_tag, *nav_attr;
51 wchar_t *author, *description;
56a99eb6 52 wchar_t *index_text, *contents_text, *preamble_text, *title_separator;
53 wchar_t *nav_prev_text, *nav_next_text, *nav_separator;
54 wchar_t *index_main_sep, *index_multi_sep;
55 wchar_t *pre_versionid, *post_versionid;
78c73085 56 int restrict_charset, output_charset;
57 enum {
27bdc5ab 58 HTML_3_2, HTML_4, ISO_HTML,
78c73085 59 XHTML_1_0_TRANSITIONAL, XHTML_1_0_STRICT
60 } htmlver;
61 wchar_t *lquote, *rquote;
62 int leaf_level;
63} htmlconfig;
64
/*
 * Configured contents depth for heading level `level': the explicit
 * entry in conf.contents_depths if one exists, else level+2.
 */
#define contents_depth(conf, level) \
    ( (level) < (conf).ncdepths ? (conf).contents_depths[level] : (level)+2 )
67
/* True for the XHTML members of the htmlver enumeration. */
#define is_xhtml(ver) (XHTML_1_0_TRANSITIONAL <= (ver))
69
typedef struct htmlfile htmlfile;
typedef struct htmlsect htmlsect;

/*
 * One output HTML file. Files are chained into a singly linked list.
 */
struct htmlfile {
    htmlfile *next;
    char *filename;
    /* Counter used when generating `p<n>' and `i<n>' fragment names. */
    int last_fragment_number;
    int min_heading_depth;
    /* first/last highest-level sections */
    htmlsect *first;
    htmlsect *last;
};
80
81struct htmlsect {
82 htmlsect *next, *parent;
83 htmlfile *file;
84 paragraph *title, *text;
85 enum { NORMAL, TOP, INDEX } type;
86 int contents_depth;
87 char *fragment;
88};
89
90typedef struct {
91 htmlfile *head, *tail;
92 htmlfile *single, *index;
3e82de8f 93 tree234 *frags;
78c73085 94} htmlfilelist;
95
96typedef struct {
97 htmlsect *head, *tail;
98} htmlsectlist;
99
100typedef struct {
3e82de8f 101 htmlfile *file;
102 char *fragment;
103} htmlfragment;
104
105typedef struct {
78c73085 106 int nrefs, refsize;
107 word **refs;
108} htmlindex;
109
110typedef struct {
111 htmlsect *section;
112 char *fragment;
1b7bf715 113 int generated, referenced;
78c73085 114} htmlindexref;
115
116typedef struct {
117 /*
118 * This level deals with charset conversion, starting and
119 * ending tags, and writing to the file. It's the lexical
120 * level.
121 */
122 FILE *fp;
b7309494 123 int charset, restrict_charset;
78c73085 124 charset_state cstate;
125 int ver;
126 enum {
127 HO_NEUTRAL, HO_IN_TAG, HO_IN_EMPTY_TAG, HO_IN_TEXT
128 } state;
129 /*
130 * Stuff beyond here deals with the higher syntactic level: it
131 * tracks how many levels of <ul> are currently open when
132 * producing a contents list, for example.
133 */
134 int contents_level;
135} htmloutput;
136
3e82de8f 137static int html_fragment_compare(void *av, void *bv)
138{
139 htmlfragment *a = (htmlfragment *)av;
140 htmlfragment *b = (htmlfragment *)bv;
141 int cmp;
142
143 if ((cmp = strcmp(a->file->filename, b->file->filename)) != 0)
144 return cmp;
145 else
146 return strcmp(a->fragment, b->fragment);
147}
148
78c73085 149static void html_file_section(htmlconfig *cfg, htmlfilelist *files,
150 htmlsect *sect, int depth);
151
152static htmlfile *html_new_file(htmlfilelist *list, char *filename);
153static htmlsect *html_new_sect(htmlsectlist *list, paragraph *title);
154
/*
 * Bit flags for the `flags' parameter of html_words().
 * ALL is the union of the three individual bits.
 */
#define NOTHING   0x00
#define MARKUP    0x01
#define LINKS     0x02
#define INDEXENTS 0x04
#define ALL       (MARKUP | LINKS | INDEXENTS)
161static void html_words(htmloutput *ho, word *words, int flags,
162 htmlfile *file, keywordlist *keywords, htmlconfig *cfg);
163static void html_codepara(htmloutput *ho, word *words);
164
165static void element_open(htmloutput *ho, char const *name);
166static void element_close(htmloutput *ho, char const *name);
167static void element_empty(htmloutput *ho, char const *name);
168static void element_attr(htmloutput *ho, char const *name, char const *value);
169static void element_attr_w(htmloutput *ho, char const *name,
170 wchar_t const *value);
171static void html_text(htmloutput *ho, wchar_t const *str);
35b123a0 172static void html_text_nbsp(htmloutput *ho, wchar_t const *str);
78c73085 173static void html_text_limit(htmloutput *ho, wchar_t const *str, int maxlen);
174static void html_text_limit_internal(htmloutput *ho, wchar_t const *text,
35b123a0 175 int maxlen, int quote_quotes, int nbsp);
78c73085 176static void html_nl(htmloutput *ho);
177static void html_raw(htmloutput *ho, char *text);
178static void html_raw_as_attr(htmloutput *ho, char *text);
179static void cleanup(htmloutput *ho);
180
181static void html_href(htmloutput *ho, htmlfile *thisfile,
182 htmlfile *targetfile, char *targetfrag);
27bdc5ab 183static void html_fragment(htmloutput *ho, char const *fragment);
78c73085 184
185static char *html_format(paragraph *p, char *template_string);
3e82de8f 186static char *html_sanitise_fragment(htmlfilelist *files, htmlfile *file,
187 char *text);
78c73085 188
189static void html_contents_entry(htmloutput *ho, int depth, htmlsect *s,
190 htmlfile *thisfile, keywordlist *keywords,
191 htmlconfig *cfg);
192static void html_section_title(htmloutput *ho, htmlsect *s,
193 htmlfile *thisfile, keywordlist *keywords,
23c9bbc2 194 htmlconfig *cfg, int real);
78c73085 195
196static htmlconfig html_configure(paragraph *source) {
197 htmlconfig ret;
198 paragraph *p;
199
200 /*
201 * Defaults.
202 */
203 ret.leaf_level = 2;
204 ret.achapter.just_numbers = FALSE;
205 ret.achapter.number_suffix = L": ";
206 ret.nasect = 1;
f1530049 207 ret.asect = snewn(ret.nasect, sectlevel);
78c73085 208 ret.asect[0].just_numbers = TRUE;
209 ret.asect[0].number_suffix = L" ";
210 ret.ncdepths = 0;
211 ret.contents_depths = 0;
212 ret.visible_version_id = TRUE;
213 ret.address_section = TRUE;
214 ret.leaf_contains_contents = FALSE;
215 ret.leaf_smallest_contents = 4;
216 ret.single_filename = dupstr("Manual.html");
217 ret.contents_filename = dupstr("Contents.html");
218 ret.index_filename = dupstr("IndexPage.html");
219 ret.template_filename = dupstr("%n.html");
220 ret.template_fragment = dupstr("%b");
221 ret.head_end = ret.body_tag = ret.body_start = ret.body_end =
222 ret.addr_start = ret.addr_end = ret.nav_attr = NULL;
223 ret.author = ret.description = NULL;
b7309494 224 ret.restrict_charset = CS_UTF8;
78c73085 225 ret.output_charset = CS_ASCII;
226 ret.htmlver = HTML_4;
56a99eb6 227 ret.index_text = L"Index";
228 ret.contents_text = L"Contents";
229 ret.preamble_text = L"Preamble";
230 ret.title_separator = L" - ";
231 ret.nav_prev_text = L"Previous";
232 ret.nav_next_text = L"Next";
233 ret.nav_separator = L" | ";
234 ret.index_main_sep = L": ";
235 ret.index_multi_sep = L", ";
236 ret.pre_versionid = L"[";
237 ret.post_versionid = L"]";
78c73085 238 /*
239 * Default quote characters are Unicode matched single quotes,
240 * falling back to ordinary ASCII ".
241 */
242 ret.lquote = L"\x2018\0\x2019\0\"\0\"\0\0";
243 ret.rquote = uadv(ret.lquote);
244
245 /*
246 * Two-pass configuration so that we can pick up global config
247 * (e.g. `quotes') before having it overridden by specific
248 * config (`html-quotes'), irrespective of the order in which
249 * they occur.
250 */
251 for (p = source; p; p = p->next) {
252 if (p->type == para_Config) {
253 if (!ustricmp(p->keyword, L"quotes")) {
254 if (*uadv(p->keyword) && *uadv(uadv(p->keyword))) {
255 ret.lquote = uadv(p->keyword);
256 ret.rquote = uadv(ret.lquote);
257 }
258 }
259 }
260 }
261
262 for (p = source; p; p = p->next) {
263 if (p->type == para_Config) {
264 wchar_t *k = p->keyword;
265
266 if (!ustrnicmp(k, L"xhtml-", 6))
267 k++; /* treat `xhtml-' and `html-' the same */
268
b7309494 269 if (!ustricmp(k, L"html-restrict-charset")) {
0960a3d8 270 ret.restrict_charset = charset_from_ustr(&p->fpos, uadv(k));
b7309494 271 } else if (!ustricmp(k, L"html-output-charset")) {
0960a3d8 272 ret.output_charset = charset_from_ustr(&p->fpos, uadv(k));
27bdc5ab 273 } else if (!ustricmp(k, L"html-version")) {
274 wchar_t *vername = uadv(k);
275 static const struct {
276 const wchar_t *name;
277 int ver;
278 } versions[] = {
279 {L"html3.2", HTML_3_2},
280 {L"html4", HTML_4},
281 {L"iso-html", ISO_HTML},
282 {L"xhtml1.0transitional", XHTML_1_0_TRANSITIONAL},
283 {L"xhtml1.0strict", XHTML_1_0_STRICT}
284 };
285 int i;
286
287 for (i = 0; i < (int)lenof(versions); i++)
288 if (!ustricmp(versions[i].name, vername))
289 break;
290
291 if (i == lenof(versions))
292 error(err_htmlver, &p->fpos, vername);
293 else
294 ret.htmlver = versions[i].ver;
78c73085 295 } else if (!ustricmp(k, L"html-single-filename")) {
296 sfree(ret.single_filename);
297 ret.single_filename = dupstr(adv(p->origkeyword));
298 } else if (!ustricmp(k, L"html-contents-filename")) {
299 sfree(ret.contents_filename);
300 ret.contents_filename = dupstr(adv(p->origkeyword));
301 } else if (!ustricmp(k, L"html-index-filename")) {
302 sfree(ret.index_filename);
303 ret.index_filename = dupstr(adv(p->origkeyword));
304 } else if (!ustricmp(k, L"html-template-filename")) {
305 sfree(ret.template_filename);
306 ret.template_filename = dupstr(adv(p->origkeyword));
307 } else if (!ustricmp(k, L"html-template-fragment")) {
308 sfree(ret.template_fragment);
309 ret.template_fragment = dupstr(adv(p->origkeyword));
310 } else if (!ustricmp(k, L"html-chapter-numeric")) {
311 ret.achapter.just_numbers = utob(uadv(k));
312 } else if (!ustricmp(k, L"html-chapter-suffix")) {
313 ret.achapter.number_suffix = uadv(k);
314 } else if (!ustricmp(k, L"html-leaf-level")) {
315 ret.leaf_level = utoi(uadv(k));
316 } else if (!ustricmp(k, L"html-section-numeric")) {
317 wchar_t *q = uadv(k);
318 int n = 0;
319 if (uisdigit(*q)) {
320 n = utoi(q);
321 q = uadv(q);
322 }
323 if (n >= ret.nasect) {
324 int i;
f1530049 325 ret.asect = sresize(ret.asect, n+1, sectlevel);
78c73085 326 for (i = ret.nasect; i <= n; i++)
327 ret.asect[i] = ret.asect[ret.nasect-1];
328 ret.nasect = n+1;
329 }
330 ret.asect[n].just_numbers = utob(q);
331 } else if (!ustricmp(k, L"html-section-suffix")) {
332 wchar_t *q = uadv(k);
333 int n = 0;
334 if (uisdigit(*q)) {
335 n = utoi(q);
336 q = uadv(q);
337 }
338 if (n >= ret.nasect) {
339 int i;
f1530049 340 ret.asect = sresize(ret.asect, n+1, sectlevel);
78c73085 341 for (i = ret.nasect; i <= n; i++) {
342 ret.asect[i] = ret.asect[ret.nasect-1];
343 }
344 ret.nasect = n+1;
345 }
346 ret.asect[n].number_suffix = q;
347 } else if (!ustricmp(k, L"html-contents-depth") ||
348 !ustrnicmp(k, L"html-contents-depth-", 20)) {
349 /*
350 * Relic of old implementation: this directive used
351 * to be written as \cfg{html-contents-depth-3}{2}
352 * rather than the usual Halibut convention of
353 * \cfg{html-contents-depth}{3}{2}. We therefore
354 * support both.
355 */
356 wchar_t *q = k[19] ? k+20 : uadv(k);
357 int n = 0;
358 if (uisdigit(*q)) {
359 n = utoi(q);
360 q = uadv(q);
361 }
362 if (n >= ret.ncdepths) {
363 int i;
f1530049 364 ret.contents_depths =
365 sresize(ret.contents_depths, n+1, int);
78c73085 366 for (i = ret.ncdepths; i <= n; i++) {
367 ret.contents_depths[i] = i+2;
368 }
369 ret.ncdepths = n+1;
370 }
371 ret.contents_depths[n] = utoi(q);
372 } else if (!ustricmp(k, L"html-head-end")) {
373 ret.head_end = adv(p->origkeyword);
374 } else if (!ustricmp(k, L"html-body-tag")) {
375 ret.body_tag = adv(p->origkeyword);
376 } else if (!ustricmp(k, L"html-body-start")) {
377 ret.body_start = adv(p->origkeyword);
378 } else if (!ustricmp(k, L"html-body-end")) {
379 ret.body_end = adv(p->origkeyword);
380 } else if (!ustricmp(k, L"html-address-start")) {
381 ret.addr_start = adv(p->origkeyword);
382 } else if (!ustricmp(k, L"html-address-end")) {
383 ret.addr_end = adv(p->origkeyword);
384 } else if (!ustricmp(k, L"html-navigation-attributes")) {
385 ret.nav_attr = adv(p->origkeyword);
386 } else if (!ustricmp(k, L"html-author")) {
387 ret.author = uadv(k);
388 } else if (!ustricmp(k, L"html-description")) {
389 ret.description = uadv(k);
390 } else if (!ustricmp(k, L"html-suppress-address")) {
391 ret.address_section = !utob(uadv(k));
392 } else if (!ustricmp(k, L"html-versionid")) {
393 ret.visible_version_id = utob(uadv(k));
394 } else if (!ustricmp(k, L"html-quotes")) {
395 if (*uadv(k) && *uadv(uadv(k))) {
396 ret.lquote = uadv(k);
397 ret.rquote = uadv(ret.lquote);
398 }
399 } else if (!ustricmp(k, L"html-leaf-contains-contents")) {
400 ret.leaf_contains_contents = utob(uadv(k));
401 } else if (!ustricmp(k, L"html-leaf-smallest-contents")) {
402 ret.leaf_smallest_contents = utoi(uadv(k));
75a96e91 403 } else if (!ustricmp(k, L"html-index-text")) {
404 ret.index_text = uadv(k);
405 } else if (!ustricmp(k, L"html-contents-text")) {
406 ret.contents_text = uadv(k);
407 } else if (!ustricmp(k, L"html-preamble-text")) {
408 ret.preamble_text = uadv(k);
409 } else if (!ustricmp(k, L"html-title-separator")) {
410 ret.title_separator = uadv(k);
411 } else if (!ustricmp(k, L"html-nav-prev-text")) {
412 ret.nav_prev_text = uadv(k);
413 } else if (!ustricmp(k, L"html-nav-next-text")) {
414 ret.nav_next_text = uadv(k);
415 } else if (!ustricmp(k, L"html-nav-separator")) {
416 ret.nav_separator = uadv(k);
417 } else if (!ustricmp(k, L"html-index-main-separator")) {
418 ret.index_main_sep = uadv(k);
419 } else if (!ustricmp(k, L"html-index-multiple-separator")) {
420 ret.index_multi_sep = uadv(k);
421 } else if (!ustricmp(k, L"html-pre-versionid")) {
422 ret.pre_versionid = uadv(k);
423 } else if (!ustricmp(k, L"html-post-versionid")) {
424 ret.post_versionid = uadv(k);
78c73085 425 }
426 }
427 }
428
429 /*
430 * Now process fallbacks on quote characters.
431 */
432 while (*uadv(ret.rquote) && *uadv(uadv(ret.rquote)) &&
433 (!cvt_ok(ret.restrict_charset, ret.lquote) ||
434 !cvt_ok(ret.restrict_charset, ret.rquote))) {
435 ret.lquote = uadv(ret.rquote);
436 ret.rquote = uadv(ret.lquote);
437 }
438
439 return ret;
440}
441
442paragraph *html_config_filename(char *filename)
443{
444 /*
445 * If the user passes in a single filename as a parameter to
446 * the `--html' command-line option, then we should assume it
447 * to imply _two_ config directives:
448 * \cfg{html-single-filename}{whatever} and
449 * \cfg{html-leaf-level}{0}; the rationale being that the user
450 * wants their output _in that file_.
451 */
452 paragraph *p, *q;
453
454 p = cmdline_cfg_simple("html-single-filename", filename, NULL);
455 q = cmdline_cfg_simple("html-leaf-level", "0", NULL);
456 p->next = q;
457 return p;
458}
459
460void html_backend(paragraph *sourceform, keywordlist *keywords,
529a6c83 461 indexdata *idx, void *unused)
462{
78c73085 463 paragraph *p;
464 htmlconfig conf;
3e82de8f 465 htmlfilelist files = { NULL, NULL, NULL, NULL, NULL };
78c73085 466 htmlsectlist sects = { NULL, NULL }, nonsects = { NULL, NULL };
467
468 IGNORE(unused);
469
470 conf = html_configure(sourceform);
471
472 /*
473 * We're going to make heavy use of paragraphs' private data
474 * fields in the forthcoming code. Clear them first, so we can
475 * reliably tell whether we have auxiliary data for a
476 * particular paragraph.
477 */
478 for (p = sourceform; p; p = p->next)
479 p->private_data = NULL;
480
3e82de8f 481 files.frags = newtree234(html_fragment_compare);
482
78c73085 483 /*
484 * Start by figuring out into which file each piece of the
485 * document should be put. We'll do this by inventing an
486 * `htmlsect' structure and stashing it in the private_data
487 * field of each section paragraph; we also need one additional
488 * htmlsect for the document index, which won't show up in the
489 * source form but needs to be consistently mentioned in
490 * contents links.
491 *
492 * While we're here, we'll also invent the HTML fragment name
493 * for each section.
494 */
495 {
496 htmlsect *topsect, *sect;
497 int d;
498
56a99eb6 499 topsect = html_new_sect(&sects, NULL);
78c73085 500 topsect->type = TOP;
501 topsect->title = NULL;
502 topsect->text = sourceform;
503 topsect->contents_depth = contents_depth(conf, 0);
504 html_file_section(&conf, &files, topsect, -1);
505 topsect->fragment = NULL;
506
507 for (p = sourceform; p; p = p->next)
508 if (is_heading_type(p->type)) {
509 d = heading_depth(p);
510
511 if (p->type == para_Title) {
512 topsect->title = p;
513 continue;
514 }
515
516 sect = html_new_sect(&sects, p);
517 sect->text = p->next;
518
519 sect->contents_depth = contents_depth(conf, d+1) - (d+1);
520
521 if (p->parent) {
522 sect->parent = (htmlsect *)p->parent->private_data;
523 assert(sect->parent != NULL);
524 } else
525 sect->parent = topsect;
526 p->private_data = sect;
527
528 html_file_section(&conf, &files, sect, d);
529
530 sect->fragment = html_format(p, conf.template_fragment);
3e82de8f 531 sect->fragment = html_sanitise_fragment(&files, sect->file,
532 sect->fragment);
78c73085 533 }
534
535 /* And the index. */
536 sect = html_new_sect(&sects, NULL);
78c73085 537 sect->text = NULL;
538 sect->type = INDEX;
539 sect->parent = topsect;
540 html_file_section(&conf, &files, sect, 0); /* peer of chapters */
56a99eb6 541 sect->fragment = utoa_dup(conf.index_text, CS_ASCII);
3e82de8f 542 sect->fragment = html_sanitise_fragment(&files, sect->file,
543 sect->fragment);
78c73085 544 files.index = sect->file;
545 }
546
547 /*
548 * Go through the keyword list and sort out fragment IDs for
549 * all the potentially referenced paragraphs which _aren't_
550 * headings.
551 */
552 {
553 int i;
554 keyword *kw;
555 htmlsect *sect;
556
557 for (i = 0; (kw = index234(keywords->keys, i)) != NULL; i++) {
558 paragraph *q, *p = kw->para;
559
560 if (!is_heading_type(p->type)) {
561 htmlsect *parent;
562
563 /*
564 * Find the paragraph's parent htmlsect, to
565 * determine which file it will end up in.
566 */
567 q = p->parent;
568 if (!q) {
569 /*
570 * Preamble paragraphs have no parent. So if we
571 * have a non-heading with no parent, it must
572 * be preamble, and therefore its parent
573 * htmlsect must be the preamble one.
574 */
575 assert(sects.head &&
576 sects.head->type == TOP);
577 parent = sects.head;
578 } else
579 parent = (htmlsect *)q->private_data;
580
581 /*
582 * Now we can construct an htmlsect for this
583 * paragraph itself, taking care to put it in the
584 * list of non-sections rather than the list of
585 * sections (so that traverses of the `sects' list
586 * won't attempt to add it to the contents or
587 * anything weird like that).
588 */
589 sect = html_new_sect(&nonsects, p);
590 sect->file = parent->file;
591 sect->parent = parent;
592 p->private_data = sect;
593
594 /*
04781c84 595 * Fragment IDs for these paragraphs will simply be
596 * `p' followed by an integer.
78c73085 597 */
f1530049 598 sect->fragment = snewn(40, char);
04781c84 599 sprintf(sect->fragment, "p%d",
600 sect->file->last_fragment_number++);
3e82de8f 601 sect->fragment = html_sanitise_fragment(&files, sect->file,
602 sect->fragment);
78c73085 603 }
604 }
605 }
606
607 /*
04781c84 608 * Reset the fragment numbers in each file. I've just used them
609 * to generate `p' fragment IDs for non-section paragraphs
610 * (numbered list elements, bibliocited), and now I want to use
611 * them for `i' fragment IDs for index entries.
612 */
613 {
614 htmlfile *file;
615 for (file = files.head; file; file = file->next)
616 file->last_fragment_number = 0;
617 }
618
619 /*
78c73085 620 * Now sort out the index. This involves:
621 *
622 * - For each index term, we set up an htmlindex structure to
623 * store all the references to that term.
624 *
625 * - Then we make a pass over the actual document, finding
626 * every word_IndexRef; for each one, we actually figure out
627 * the HTML filename/fragment pair we will use to reference
628 * it, store that information in the private data field of
629 * the word_IndexRef itself (so we can recreate it when the
630 * time comes to output our HTML), and add a reference to it
631 * to the index term in question.
632 */
633 {
634 int i;
635 indexentry *entry;
636 htmlsect *lastsect;
637 word *w;
638
639 /*
640 * Set up the htmlindex structures.
641 */
642
643 for (i = 0; (entry = index234(idx->entries, i)) != NULL; i++) {
f1530049 644 htmlindex *hi = snew(htmlindex);
78c73085 645
646 hi->nrefs = hi->refsize = 0;
647 hi->refs = NULL;
648
649 entry->backend_data = hi;
650 }
651
652 /*
653 * Run over the document inventing fragments. Each fragment
654 * is of the form `i' followed by an integer.
78c73085 655 */
56a99eb6 656 lastsect = sects.head; /* this is always the top section */
78c73085 657 for (p = sourceform; p; p = p->next) {
56a99eb6 658 if (is_heading_type(p->type) && p->type != para_Title)
78c73085 659 lastsect = (htmlsect *)p->private_data;
660
661 for (w = p->words; w; w = w->next)
662 if (w->type == word_IndexRef) {
f1530049 663 htmlindexref *hr = snew(htmlindexref);
78c73085 664 indextag *tag;
665 int i;
666
1b7bf715 667 hr->referenced = hr->generated = FALSE;
78c73085 668 hr->section = lastsect;
78c73085 669 {
670 char buf[40];
671 sprintf(buf, "i%d",
672 lastsect->file->last_fragment_number++);
673 hr->fragment = dupstr(buf);
3e82de8f 674 hr->fragment =
675 html_sanitise_fragment(&files, hr->section->file,
676 hr->fragment);
78c73085 677 }
678 w->private_data = hr;
679
680 tag = index_findtag(idx, w->text);
681 if (!tag)
682 break;
683
684 for (i = 0; i < tag->nrefs; i++) {
685 indexentry *entry = tag->refs[i];
686 htmlindex *hi = (htmlindex *)entry->backend_data;
687
688 if (hi->nrefs >= hi->refsize) {
689 hi->refsize += 32;
f1530049 690 hi->refs = sresize(hi->refs, hi->refsize, word *);
78c73085 691 }
692
693 hi->refs[hi->nrefs++] = w;
694 }
695 }
696 }
697 }
698
699 /*
700 * Now we're ready to write out the actual HTML files.
701 *
702 * For each file:
703 *
704 * - we open that file and write its header
705 * - we run down the list of sections
706 * - for each section directly contained within that file, we
707 * output the section text
708 * - for each section which is not in the file but which has a
709 * parent that is, we output a contents entry for the
710 * section if appropriate
711 * - finally, we output the file trailer and close the file.
712 */
713 {
714 htmlfile *f, *prevf;
715 htmlsect *s;
716 paragraph *p;
717
718 prevf = NULL;
719
720 for (f = files.head; f; f = f->next) {
721 htmloutput ho;
722 int displaying;
723 enum LISTTYPE { NOLIST, UL, OL, DL };
724 enum ITEMTYPE { NOITEM, LI, DT, DD };
725 struct stackelement {
726 struct stackelement *next;
727 enum LISTTYPE listtype;
728 enum ITEMTYPE itemtype;
729 } *stackhead;
730
731#define listname(lt) ( (lt)==UL ? "ul" : (lt)==OL ? "ol" : "dl" )
732#define itemname(lt) ( (lt)==LI ? "li" : (lt)==DT ? "dt" : "dd" )
733
734 ho.fp = fopen(f->filename, "w");
735 ho.charset = conf.output_charset;
b7309494 736 ho.restrict_charset = conf.restrict_charset;
78c73085 737 ho.cstate = charset_init_state;
738 ho.ver = conf.htmlver;
739 ho.state = HO_NEUTRAL;
740 ho.contents_level = 0;
741
742 /* <!DOCTYPE>. */
743 switch (conf.htmlver) {
744 case HTML_3_2:
745 fprintf(ho.fp, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD "
746 "HTML 3.2 Final//EN\">\n");
747 break;
748 case HTML_4:
749 fprintf(ho.fp, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML"
750 " 4.01//EN\"\n\"http://www.w3.org/TR/html4/"
751 "strict.dtd\">\n");
752 break;
27bdc5ab 753 case ISO_HTML:
754 fprintf(ho.fp, "<!DOCTYPE HTML PUBLIC \"ISO/IEC "
755 "15445:2000//DTD HTML//EN\">\n");
756 break;
78c73085 757 case XHTML_1_0_TRANSITIONAL:
27bdc5ab 758 fprintf(ho.fp, "<?xml version=\"1.0\" encoding=\"%s\"?>\n",
759 charset_to_mimeenc(conf.output_charset));
78c73085 760 fprintf(ho.fp, "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML"
761 " 1.0 Transitional//EN\"\n\"http://www.w3.org/TR/"
762 "xhtml1/DTD/xhtml1-transitional.dtd\">\n");
763 break;
764 case XHTML_1_0_STRICT:
27bdc5ab 765 fprintf(ho.fp, "<?xml version=\"1.0\" encoding=\"%s\"?>\n",
766 charset_to_mimeenc(conf.output_charset));
78c73085 767 fprintf(ho.fp, "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML"
768 " 1.0 Strict//EN\"\n\"http://www.w3.org/TR/xhtml1/"
769 "DTD/xhtml1-strict.dtd\">\n");
770 break;
771 }
772
773 element_open(&ho, "html");
774 if (is_xhtml(conf.htmlver)) {
775 element_attr(&ho, "xmlns", "http://www.w3.org/1999/xhtml");
776 }
777 html_nl(&ho);
778
779 element_open(&ho, "head");
780 html_nl(&ho);
781
782 element_empty(&ho, "meta");
783 element_attr(&ho, "http-equiv", "content-type");
784 {
785 char buf[200];
786 sprintf(buf, "text/html; charset=%.150s",
787 charset_to_mimeenc(conf.output_charset));
788 element_attr(&ho, "content", buf);
789 }
790 html_nl(&ho);
791
792 if (conf.author) {
793 element_empty(&ho, "meta");
794 element_attr(&ho, "name", "author");
795 element_attr_w(&ho, "content", conf.author);
796 html_nl(&ho);
797 }
798
799 if (conf.description) {
800 element_empty(&ho, "meta");
801 element_attr(&ho, "name", "description");
802 element_attr_w(&ho, "content", conf.description);
803 html_nl(&ho);
804 }
805
806 element_open(&ho, "title");
807 if (f->first && f->first->title) {
808 html_words(&ho, f->first->title->words, NOTHING,
809 f, keywords, &conf);
810
811 assert(f->last);
812 if (f->last != f->first && f->last->title) {
56a99eb6 813 html_text(&ho, conf.title_separator);
78c73085 814 html_words(&ho, f->last->title->words, NOTHING,
815 f, keywords, &conf);
816 }
817 }
818 element_close(&ho, "title");
819 html_nl(&ho);
820
821 if (conf.head_end)
822 html_raw(&ho, conf.head_end);
823
824 element_close(&ho, "head");
825 html_nl(&ho);
826
78c73085 827 if (conf.body_tag)
828 html_raw(&ho, conf.body_tag);
829 else
830 element_open(&ho, "body");
831 html_nl(&ho);
832
833 if (conf.body_start)
834 html_raw(&ho, conf.body_start);
835
836 /*
837 * Write out a nav bar. Special case: we don't do this
838 * if there is only one file.
839 */
840 if (files.head != files.tail) {
841 element_open(&ho, "p");
842 if (conf.nav_attr)
843 html_raw_as_attr(&ho, conf.nav_attr);
844
845 if (prevf) {
846 element_open(&ho, "a");
847 element_attr(&ho, "href", prevf->filename);
848 }
56a99eb6 849 html_text(&ho, conf.nav_prev_text);
78c73085 850 if (prevf)
851 element_close(&ho, "a");
852
56a99eb6 853 html_text(&ho, conf.nav_separator);
78c73085 854
855 if (f != files.head) {
856 element_open(&ho, "a");
857 element_attr(&ho, "href", files.head->filename);
858 }
56a99eb6 859 html_text(&ho, conf.contents_text);
78c73085 860 if (f != files.head)
861 element_close(&ho, "a");
862
56a99eb6 863 html_text(&ho, conf.nav_separator);
78c73085 864
865 if (f != files.index) {
866 element_open(&ho, "a");
867 element_attr(&ho, "href", files.index->filename);
868 }
56a99eb6 869 html_text(&ho, conf.index_text);
78c73085 870 if (f != files.index)
871 element_close(&ho, "a");
872
56a99eb6 873 html_text(&ho, conf.nav_separator);
78c73085 874
875 if (f->next) {
876 element_open(&ho, "a");
877 element_attr(&ho, "href", f->next->filename);
878 }
56a99eb6 879 html_text(&ho, conf.nav_next_text);
78c73085 880 if (f->next)
881 element_close(&ho, "a");
882
883 element_close(&ho, "p");
884 html_nl(&ho);
885 }
886 prevf = f;
887
888 /*
889 * Write out a prefix TOC for the file.
890 *
891 * We start by going through the section list and
892 * collecting the sections which need to be added to
893 * the contents. On the way, we also test to see if
894 * this file is a leaf file (defined as one which
895 * contains all descendants of any section it
896 * contains), because this will play a part in our
897 * decision on whether or not to _output_ the TOC.
898 *
899 * Special case: we absolutely do not do this if we're
900 * in single-file mode.
901 */
902 if (files.head != files.tail) {
903 int ntoc = 0, tocsize = 0;
904 htmlsect **toc = NULL;
905 int leaf = TRUE;
906
907 for (s = sects.head; s; s = s->next) {
908 htmlsect *a, *ac;
909 int depth, adepth;
910
911 /*
912 * Search up from this section until we find
913 * the highest-level one which belongs in this
914 * file.
915 */
916 depth = adepth = 0;
917 a = NULL;
918 for (ac = s; ac; ac = ac->parent) {
919 if (ac->file == f) {
920 a = ac;
921 adepth = depth;
922 }
923 depth++;
924 }
925
926 if (s->file != f && a != NULL)
927 leaf = FALSE;
928
929 if (a) {
930 if (adepth <= a->contents_depth) {
931 if (ntoc >= tocsize) {
932 tocsize += 64;
f1530049 933 toc = sresize(toc, tocsize, htmlsect *);
78c73085 934 }
935 toc[ntoc++] = s;
936 }
937 }
938 }
939
940 if (leaf && conf.leaf_contains_contents &&
941 ntoc >= conf.leaf_smallest_contents) {
942 int i;
943
944 for (i = 0; i < ntoc; i++) {
945 htmlsect *s = toc[i];
946 int hlevel = (s->type == TOP ? -1 :
947 s->type == INDEX ? 0 :
948 heading_depth(s->title))
949 - f->min_heading_depth + 1;
950
951 assert(hlevel >= 1);
952 html_contents_entry(&ho, hlevel, s,
953 f, keywords, &conf);
954 }
955 html_contents_entry(&ho, 0, NULL, f, keywords, &conf);
956 }
957 }
958
959 /*
960 * Now go through the document and output some real
961 * text.
962 */
963 displaying = FALSE;
964 for (s = sects.head; s; s = s->next) {
965 if (s->file == f) {
966 /*
967 * This section belongs in this file.
968 * Display it.
969 */
970 displaying = TRUE;
971 } else {
972 htmlsect *a, *ac;
973 int depth, adepth;
974
975 displaying = FALSE;
976
977 /*
978 * Search up from this section until we find
979 * the highest-level one which belongs in this
980 * file.
981 */
982 depth = adepth = 0;
983 a = NULL;
984 for (ac = s; ac; ac = ac->parent) {
985 if (ac->file == f) {
986 a = ac;
987 adepth = depth;
988 }
989 depth++;
990 }
991
992 if (a != NULL) {
993 /*
994 * This section does not belong in this
995 * file, but an ancestor of it does. Write
996 * out a contents table entry, if the depth
997 * doesn't exceed the maximum contents
998 * depth for the ancestor section.
999 */
1000 if (adepth <= a->contents_depth) {
1001 html_contents_entry(&ho, adepth, s,
1002 f, keywords, &conf);
1003 }
1004 }
1005 }
1006
1007 if (displaying) {
1008 int hlevel;
1009 char htag[3];
1010
1011 html_contents_entry(&ho, 0, NULL, f, keywords, &conf);
1012
1013 /*
1014 * Display the section heading.
1015 */
1016
1017 hlevel = (s->type == TOP ? -1 :
1018 s->type == INDEX ? 0 :
1019 heading_depth(s->title))
1020 - f->min_heading_depth + 1;
1021 assert(hlevel >= 1);
1022 /* HTML headings only go up to <h6> */
1023 if (hlevel > 6)
1024 hlevel = 6;
1025 htag[0] = 'h';
1026 htag[1] = '0' + hlevel;
1027 htag[2] = '\0';
1028 element_open(&ho, htag);
1029
1030 /*
1031 * Provide anchor for cross-links to target.
1032 *
78c73085 1033 * (Also we'll have to do this separately in
1034 * other paragraph types - NumberedList and
1035 * BiblioCited.)
1036 */
27bdc5ab 1037 if (s->fragment)
1038 html_fragment(&ho, s->fragment);
78c73085 1039
23c9bbc2 1040 html_section_title(&ho, s, f, keywords, &conf, TRUE);
78c73085 1041
1042 element_close(&ho, htag);
1043
1044 /*
1045 * Now display the section text.
1046 */
1047 if (s->text) {
f1530049 1048 stackhead = snew(struct stackelement);
78c73085 1049 stackhead->next = NULL;
1050 stackhead->listtype = NOLIST;
1051 stackhead->itemtype = NOITEM;
1052
1053 for (p = s->text;; p = p->next) {
1054 enum LISTTYPE listtype;
1055 struct stackelement *se;
1056
1057 /*
1058 * Preliminary switch to figure out what
1059 * sort of list we expect to be inside at
1060 * this stage.
1061 *
1062 * Since p may still be NULL at this point,
1063 * I invent a harmless paragraph type for
1064 * it if it is.
1065 */
1066 switch (p ? p->type : para_Normal) {
1067 case para_Rule:
1068 case para_Normal:
1069 case para_Copyright:
1070 case para_BiblioCited:
1071 case para_Code:
1072 case para_QuotePush:
1073 case para_QuotePop:
1074 case para_Chapter:
1075 case para_Appendix:
1076 case para_UnnumberedChapter:
1077 case para_Heading:
1078 case para_Subsect:
1079 case para_LcontPop:
1080 listtype = NOLIST;
1081 break;
1082
1083 case para_Bullet:
1084 listtype = UL;
1085 break;
1086
1087 case para_NumberedList:
1088 listtype = OL;
1089 break;
1090
1091 case para_DescribedThing:
1092 case para_Description:
1093 listtype = DL;
1094 break;
1095
1096 case para_LcontPush:
f1530049 1097 se = snew(struct stackelement);
78c73085 1098 se->next = stackhead;
1099 se->listtype = NOLIST;
1100 se->itemtype = NOITEM;
1101 stackhead = se;
1102 continue;
1103
1104 default: /* some totally non-printing para */
1105 continue;
1106 }
1107
1108 html_nl(&ho);
1109
1110 /*
1111 * Terminate the most recent list item, if
1112 * any. (We left this until after
1113 * processing LcontPush, since in that case
1114 * the list item won't want to be
1115 * terminated until after the corresponding
1116 * LcontPop.)
1117 */
1118 if (stackhead->itemtype != NOITEM) {
1119 element_close(&ho, itemname(stackhead->itemtype));
1120 html_nl(&ho);
1121 }
1122 stackhead->itemtype = NOITEM;
1123
1124 /*
1125 * Terminate the current list, if it's not
1126 * the one we want to be in.
1127 */
1128 if (listtype != stackhead->listtype &&
1129 stackhead->listtype != NOLIST) {
1130 element_close(&ho, listname(stackhead->listtype));
1131 html_nl(&ho);
1132 }
1133
1134 /*
1135 * Leave the loop if our time has come.
1136 */
1137 if (!p || (is_heading_type(p->type) &&
1138 p->type != para_Title))
1139 break; /* end of section text */
1140
1141 /*
1142 * Start a fresh list if necessary.
1143 */
1144 if (listtype != stackhead->listtype &&
1145 listtype != NOLIST)
1146 element_open(&ho, listname(listtype));
1147
1148 stackhead->listtype = listtype;
1149
1150 switch (p->type) {
1151 case para_Rule:
1152 element_empty(&ho, "hr");
1153 break;
1154 case para_Code:
1155 html_codepara(&ho, p->words);
1156 break;
1157 case para_Normal:
1158 case para_Copyright:
1159 element_open(&ho, "p");
1160 html_nl(&ho);
1161 html_words(&ho, p->words, ALL,
1162 f, keywords, &conf);
1163 html_nl(&ho);
1164 element_close(&ho, "p");
1165 break;
1166 case para_BiblioCited:
1167 element_open(&ho, "p");
1168 if (p->private_data) {
1169 htmlsect *s = (htmlsect *)p->private_data;
27bdc5ab 1170 html_fragment(&ho, s->fragment);
78c73085 1171 }
1172 html_nl(&ho);
1173 html_words(&ho, p->kwtext, ALL,
1174 f, keywords, &conf);
1175 html_text(&ho, L" ");
1176 html_words(&ho, p->words, ALL,
1177 f, keywords, &conf);
1178 html_nl(&ho);
1179 element_close(&ho, "p");
1180 break;
1181 case para_Bullet:
1182 case para_NumberedList:
1183 element_open(&ho, "li");
1184 if (p->private_data) {
1185 htmlsect *s = (htmlsect *)p->private_data;
27bdc5ab 1186 html_fragment(&ho, s->fragment);
78c73085 1187 }
1188 html_nl(&ho);
1189 stackhead->itemtype = LI;
1190 html_words(&ho, p->words, ALL,
1191 f, keywords, &conf);
1192 break;
1193 case para_DescribedThing:
1194 element_open(&ho, "dt");
1195 html_nl(&ho);
1196 stackhead->itemtype = DT;
1197 html_words(&ho, p->words, ALL,
1198 f, keywords, &conf);
1199 break;
1200 case para_Description:
1201 element_open(&ho, "dd");
1202 html_nl(&ho);
1203 stackhead->itemtype = DD;
1204 html_words(&ho, p->words, ALL,
1205 f, keywords, &conf);
1206 break;
1207
1208 case para_QuotePush:
1209 element_open(&ho, "blockquote");
1210 break;
1211 case para_QuotePop:
1212 element_close(&ho, "blockquote");
1213 break;
1214
1215 case para_LcontPop:
1216 se = stackhead;
1217 stackhead = stackhead->next;
1218 assert(stackhead);
1219 sfree(se);
1220 break;
1221 }
1222 }
1223
1224 assert(stackhead && !stackhead->next);
1225 sfree(stackhead);
1226 }
1227
1228 if (s->type == INDEX) {
1229 indexentry *entry;
1230 int i;
1231
1232 /*
1233 * This section is the index. I'll just
1234 * render it as a single paragraph, with a
1235 * colon between the index term and the
1236 * references, and <br> in between each
1237 * entry.
1238 */
1239 element_open(&ho, "p");
1240
1241 for (i = 0; (entry =
1242 index234(idx->entries, i)) != NULL; i++) {
1243 htmlindex *hi = (htmlindex *)entry->backend_data;
1244 int j;
1245
1246 if (i > 0)
1247 element_empty(&ho, "br");
1248 html_nl(&ho);
1249
1250 html_words(&ho, entry->text, MARKUP|LINKS,
1251 f, keywords, &conf);
1252
56a99eb6 1253 html_text(&ho, conf.index_main_sep);
78c73085 1254
1255 for (j = 0; j < hi->nrefs; j++) {
1256 htmlindexref *hr =
1257 (htmlindexref *)hi->refs[j]->private_data;
1258 paragraph *p = hr->section->title;
1259
1260 if (j > 0)
56a99eb6 1261 html_text(&ho, conf.index_multi_sep);
78c73085 1262
1263 html_href(&ho, f, hr->section->file,
1264 hr->fragment);
1b7bf715 1265 hr->referenced = TRUE;
78c73085 1266 if (p && p->kwtext)
1267 html_words(&ho, p->kwtext, MARKUP|LINKS,
1268 f, keywords, &conf);
1269 else if (p && p->words)
1270 html_words(&ho, p->words, MARKUP|LINKS,
1271 f, keywords, &conf);
56a99eb6 1272 else {
1273 /*
1274 * If there is no title at all,
1275 * this must be because our
1276 * target section is the
1277 * preamble section and there
1278 * is no title. So we use the
1279 * preamble_text.
1280 */
1281 html_text(&ho, conf.preamble_text);
1282 }
78c73085 1283 element_close(&ho, "a");
1284 }
1285 }
1286 element_close(&ho, "p");
1287 }
1288 }
1289 }
1290
1291 html_contents_entry(&ho, 0, NULL, f, keywords, &conf);
1292 html_nl(&ho);
1293
1294 {
1295 /*
1296 * Footer.
1297 */
1298 int done_version_ids = FALSE;
1299
1300 element_empty(&ho, "hr");
1301
1302 if (conf.body_end)
1303 html_raw(&ho, conf.body_end);
1304
1305 if (conf.address_section) {
27bdc5ab 1306 int started = FALSE;
1307 if (conf.htmlver == ISO_HTML) {
1308 /*
1309 * The ISO-HTML validator complains if
1310 * there isn't a <div> tag surrounding the
1311 * <address> tag. I'm uncertain of why this
1312 * should be - there appears to be no
1313 * mention of this in the ISO-HTML spec,
1314 * suggesting that it doesn't represent a
1315 * change from HTML 4, but nonetheless the
1316 * HTML 4 validator doesn't seem to mind.
1317 */
1318 element_open(&ho, "div");
1319 }
78c73085 1320 element_open(&ho, "address");
1321 if (conf.addr_start) {
1322 html_raw(&ho, conf.addr_start);
1323 html_nl(&ho);
27bdc5ab 1324 started = TRUE;
78c73085 1325 }
1326 if (conf.visible_version_id) {
78c73085 1327 for (p = sourceform; p; p = p->next)
1328 if (p->type == para_VersionID) {
27bdc5ab 1329 if (started)
78c73085 1330 element_empty(&ho, "br");
1331 html_nl(&ho);
56a99eb6 1332 html_text(&ho, conf.pre_versionid);
78c73085 1333 html_words(&ho, p->words, NOTHING,
1334 f, keywords, &conf);
56a99eb6 1335 html_text(&ho, conf.post_versionid);
78c73085 1336 started = TRUE;
1337 }
78c73085 1338 done_version_ids = TRUE;
1339 }
27bdc5ab 1340 if (conf.addr_end) {
1341 if (started)
1342 element_empty(&ho, "br");
78c73085 1343 html_raw(&ho, conf.addr_end);
27bdc5ab 1344 }
78c73085 1345 element_close(&ho, "address");
27bdc5ab 1346 if (conf.htmlver == ISO_HTML)
1347 element_close(&ho, "div");
78c73085 1348 }
1349
1350 if (!done_version_ids) {
1351 /*
1352 * If the user didn't want the version IDs
1353 * visible, I think we still have a duty to put
1354 * them in an HTML comment.
1355 */
1356 int started = FALSE;
1357 for (p = sourceform; p; p = p->next)
1358 if (p->type == para_VersionID) {
1359 if (!started) {
1360 html_raw(&ho, "<!-- version IDs:\n");
1361 started = TRUE;
1362 }
1363 html_words(&ho, p->words, NOTHING,
1364 f, keywords, &conf);
1365 html_nl(&ho);
1366 }
1367 if (started)
1368 html_raw(&ho, "-->\n");
1369 }
1370 }
1371
1372 element_close(&ho, "body");
1373 html_nl(&ho);
1374 element_close(&ho, "html");
1375 html_nl(&ho);
1376 cleanup(&ho);
1377 }
1378 }
1379
1380 /*
1b7bf715 1381 * Go through and check that no index fragments were referenced
1382 * without being generated, or indeed vice versa.
1383 *
1384 * (When I actually get round to freeing everything, this can
1385 * probably be the freeing loop as well.)
1386 */
1387 for (p = sourceform; p; p = p->next) {
1388 word *w;
1389 for (w = p->words; w; w = w->next)
1390 if (w->type == word_IndexRef) {
1391 htmlindexref *hr = (htmlindexref *)w->private_data;
1392
1393 assert(!hr->referenced == !hr->generated);
1394 }
1395 }
1396
1397 /*
529a6c83 1398 * Free all the working data.
78c73085 1399 */
529a6c83 1400 sfree(conf.asect);
1401 sfree(conf.single_filename);
1402 sfree(conf.contents_filename);
1403 sfree(conf.index_filename);
1404 sfree(conf.template_filename);
1405 sfree(conf.template_fragment);
1406 {
1407 htmlfragment *frag;
1408 while ( (frag = (htmlfragment *)delpos234(files.frags, 0)) != NULL ) {
1409 /*
1410 * frag->fragment is dynamically allocated, but will be
1411 * freed when we process the htmlsect structure which
1412 * it is attached to.
1413 */
1414 sfree(frag);
1415 }
1416 freetree234(files.frags);
1417 }
1418 {
1419 htmlsect *sect, *tmp;
1420 sect = sects.head;
1421 while (sect) {
1422 tmp = sect->next;
1423 sfree(sect->fragment);
1424 sfree(sect);
1425 sect = tmp;
1426 }
1427 sect = nonsects.head;
1428 while (sect) {
1429 tmp = sect->next;
1430 sfree(sect->fragment);
1431 sfree(sect);
1432 sect = tmp;
1433 }
1434 }
1435 {
1436 htmlfile *file, *tmp;
1437 file = files.head;
1438 while (file) {
1439 tmp = file->next;
1440 sfree(file->filename);
1441 sfree(file);
1442 file = tmp;
1443 }
1444 }
1445 {
1446 int i;
1447 indexentry *entry;
1448 for (i = 0; (entry = index234(idx->entries, i)) != NULL; i++) {
1449 htmlindex *hi = (htmlindex *)entry->backend_data;
1450 sfree(hi);
1451 }
1452 }
1453 {
1454 paragraph *p;
1455 word *w;
1456 for (p = sourceform; p; p = p->next)
1457 for (w = p->words; w; w = w->next)
1458 if (w->type == word_IndexRef) {
1459 htmlindexref *hr = (htmlindexref *)w->private_data;
1460 assert(hr != NULL);
1461 sfree(hr->fragment);
1462 sfree(hr);
1463 }
1464 }
78c73085 1465}
1466
1467static void html_file_section(htmlconfig *cfg, htmlfilelist *files,
1468 htmlsect *sect, int depth)
1469{
1470 htmlfile *file;
1471 int ldepth;
1472
1473 /*
1474 * `depth' is derived from the heading_depth() macro at the top
1475 * of this file, which counts title as -1, chapter as 0,
1476 * heading as 1 and subsection as 2. However, the semantics of
1477 * cfg->leaf_level are defined to count chapter as 1, heading
1478 * as 2 etc. So first I increment depth :-(
1479 */
1480 ldepth = depth + 1;
1481
1482 if (cfg->leaf_level == 0) {
1483 /*
1484 * leaf_level==0 is a special case, in which everything is
1485 * put into a single file.
1486 */
1487 if (!files->single)
1488 files->single = html_new_file(files, cfg->single_filename);
1489
1490 file = files->single;
1491 } else {
1492 /*
1493 * If the depth of this section is at or above leaf_level,
1494 * we invent a fresh file and put this section at its head.
1495 * Otherwise, we put it in the same file as its parent
1496 * section.
1497 */
1498 if (ldepth > cfg->leaf_level) {
1499 /*
1500 * We know that sect->parent cannot be NULL. The only
1501 * circumstance in which it can be is if sect is at
1502 * chapter or appendix level, i.e. ldepth==1; and if
1503 * that's the case, then we cannot have entered this
1504 * branch unless cfg->leaf_level==0, in which case we
1505 * would be in the single-file case above and not here
1506 * at all.
1507 */
1508 assert(sect->parent);
1509
1510 file = sect->parent->file;
1511 } else {
1512 if (sect->type == TOP) {
1513 file = html_new_file(files, cfg->contents_filename);
1514 } else if (sect->type == INDEX) {
1515 file = html_new_file(files, cfg->index_filename);
1516 } else {
1517 char *title;
1518
1519 assert(ldepth > 0 && sect->title);
1520 title = html_format(sect->title, cfg->template_filename);
1521 file = html_new_file(files, title);
1522 sfree(title);
1523 }
1524 }
1525 }
1526
1527 sect->file = file;
1528
1529 if (file->min_heading_depth > depth) {
1530 /*
1531 * This heading is at a higher level than any heading we
1532 * have so far placed in this file; so we set the `first'
1533 * pointer.
1534 */
1535 file->min_heading_depth = depth;
1536 file->first = sect;
1537 }
1538
1539 if (file->min_heading_depth == depth)
1540 file->last = sect;
1541}
1542
1543static htmlfile *html_new_file(htmlfilelist *list, char *filename)
1544{
f1530049 1545 htmlfile *ret = snew(htmlfile);
78c73085 1546
1547 ret->next = NULL;
1548 if (list->tail)
1549 list->tail->next = ret;
1550 else
1551 list->head = ret;
1552 list->tail = ret;
1553
1554 ret->filename = dupstr(filename);
1555 ret->last_fragment_number = 0;
1556 ret->min_heading_depth = INT_MAX;
1557 ret->first = ret->last = NULL;
1558
1559 return ret;
1560}
1561
1562static htmlsect *html_new_sect(htmlsectlist *list, paragraph *title)
1563{
f1530049 1564 htmlsect *ret = snew(htmlsect);
78c73085 1565
1566 ret->next = NULL;
1567 if (list->tail)
1568 list->tail->next = ret;
1569 else
1570 list->head = ret;
1571 list->tail = ret;
1572
1573 ret->title = title;
1574 ret->file = NULL;
1575 ret->parent = NULL;
1576 ret->type = NORMAL;
1577
1578 return ret;
1579}
1580
1581static void html_words(htmloutput *ho, word *words, int flags,
1582 htmlfile *file, keywordlist *keywords, htmlconfig *cfg)
1583{
1584 word *w;
1585 char *c;
1586 int style, type;
1587
1588 for (w = words; w; w = w->next) switch (w->type) {
1589 case word_HyperLink:
1590 if (flags & LINKS) {
1591 element_open(ho, "a");
1592 c = utoa_dup(w->text, CS_ASCII);
1593 element_attr(ho, "href", c);
1594 sfree(c);
1595 }
1596 break;
1597 case word_UpperXref:
1598 case word_LowerXref:
1599 if (flags & LINKS) {
1600 keyword *kwl = kw_lookup(keywords, w->text);
1601 paragraph *p = kwl->para;
1602 htmlsect *s = (htmlsect *)p->private_data;
1603
1604 assert(s);
1605
1606 html_href(ho, file, s->file, s->fragment);
1607 }
1608 break;
1609 case word_HyperEnd:
1610 case word_XrefEnd:
1611 if (flags & LINKS)
1612 element_close(ho, "a");
1613 break;
1614 case word_IndexRef:
1615 if (flags & INDEXENTS) {
1616 htmlindexref *hr = (htmlindexref *)w->private_data;
27bdc5ab 1617 html_fragment(ho, hr->fragment);
1b7bf715 1618 hr->generated = TRUE;
78c73085 1619 }
1620 break;
1621 case word_Normal:
1622 case word_Emph:
1623 case word_Code:
1624 case word_WeakCode:
1625 case word_WhiteSpace:
1626 case word_EmphSpace:
1627 case word_CodeSpace:
1628 case word_WkCodeSpace:
1629 case word_Quote:
1630 case word_EmphQuote:
1631 case word_CodeQuote:
1632 case word_WkCodeQuote:
1633 style = towordstyle(w->type);
1634 type = removeattr(w->type);
1635 if (style == word_Emph &&
1636 (attraux(w->aux) == attr_First ||
1637 attraux(w->aux) == attr_Only) &&
1638 (flags & MARKUP))
1639 element_open(ho, "em");
1640 else if ((style == word_Code || style == word_WeakCode) &&
1641 (attraux(w->aux) == attr_First ||
1642 attraux(w->aux) == attr_Only) &&
1643 (flags & MARKUP))
1644 element_open(ho, "code");
1645
1646 if (type == word_WhiteSpace)
1647 html_text(ho, L" ");
1648 else if (type == word_Quote) {
1649 if (quoteaux(w->aux) == quote_Open)
1650 html_text(ho, cfg->lquote);
1651 else
1652 html_text(ho, cfg->rquote);
1653 } else {
35b123a0 1654 if (!w->alt || cvt_ok(ho->restrict_charset, w->text))
1655 html_text_nbsp(ho, w->text);
78c73085 1656 else
1657 html_words(ho, w->alt, flags, file, keywords, cfg);
1658 }
1659
1660 if (style == word_Emph &&
1661 (attraux(w->aux) == attr_Last ||
1662 attraux(w->aux) == attr_Only) &&
1663 (flags & MARKUP))
1664 element_close(ho, "em");
1665 else if ((style == word_Code || style == word_WeakCode) &&
1666 (attraux(w->aux) == attr_Last ||
1667 attraux(w->aux) == attr_Only) &&
1668 (flags & MARKUP))
1669 element_close(ho, "code");
1670
1671 break;
1672 }
1673}
1674
1675static void html_codepara(htmloutput *ho, word *words)
1676{
1677 element_open(ho, "pre");
1678 element_open(ho, "code");
1679 for (; words; words = words->next) if (words->type == word_WeakCode) {
1680 char *open_tag;
1681 wchar_t *t, *e;
1682
1683 t = words->text;
1684 if (words->next && words->next->type == word_Emph) {
1685 e = words->next->text;
1686 words = words->next;
1687 } else
1688 e = NULL;
1689
1690 while (e && *e && *t) {
1691 int n;
1692 int ec = *e;
1693
1694 for (n = 0; t[n] && e[n] && e[n] == ec; n++);
1695
1696 open_tag = NULL;
1697 if (ec == 'i')
1698 open_tag = "em";
1699 else if (ec == 'b')
1700 open_tag = "b";
1701 if (open_tag)
1702 element_open(ho, open_tag);
1703
1704 html_text_limit(ho, t, n);
1705
1706 if (open_tag)
1707 element_close(ho, open_tag);
1708
1709 t += n;
1710 e += n;
1711 }
1712 html_text(ho, t);
1713 html_nl(ho);
1714 }
1715 element_close(ho, "code");
1716 element_close(ho, "pre");
1717}
1718
1719static void html_charset_cleanup(htmloutput *ho)
1720{
1721 char outbuf[256];
1722 int bytes;
1723
1724 bytes = charset_from_unicode(NULL, NULL, outbuf, lenof(outbuf),
1725 ho->charset, &ho->cstate, NULL);
1726 if (bytes > 0)
1727 fwrite(outbuf, 1, bytes, ho->fp);
1728}
1729
35b123a0 1730static void return_mostly_to_neutral(htmloutput *ho)
78c73085 1731{
35b123a0 1732 if (ho->state == HO_IN_EMPTY_TAG && is_xhtml(ho->ver)) {
78c73085 1733 fprintf(ho->fp, " />");
1734 } else if (ho->state == HO_IN_EMPTY_TAG || ho->state == HO_IN_TAG) {
1735 fprintf(ho->fp, ">");
1736 }
1737
1738 ho->state = HO_NEUTRAL;
1739}
1740
35b123a0 1741static void return_to_neutral(htmloutput *ho)
1742{
1743 if (ho->state == HO_IN_TEXT) {
1744 html_charset_cleanup(ho);
1745 }
1746
1747 return_mostly_to_neutral(ho);
1748}
1749
78c73085 1750static void element_open(htmloutput *ho, char const *name)
1751{
1752 return_to_neutral(ho);
1753 fprintf(ho->fp, "<%s", name);
1754 ho->state = HO_IN_TAG;
1755}
1756
1757static void element_close(htmloutput *ho, char const *name)
1758{
1759 return_to_neutral(ho);
1760 fprintf(ho->fp, "</%s>", name);
1761 ho->state = HO_NEUTRAL;
1762}
1763
1764static void element_empty(htmloutput *ho, char const *name)
1765{
1766 return_to_neutral(ho);
1767 fprintf(ho->fp, "<%s", name);
1768 ho->state = HO_IN_EMPTY_TAG;
1769}
1770
1771static void html_nl(htmloutput *ho)
1772{
1773 return_to_neutral(ho);
1774 fputc('\n', ho->fp);
1775}
1776
1777static void html_raw(htmloutput *ho, char *text)
1778{
1779 return_to_neutral(ho);
1780 fputs(text, ho->fp);
1781}
1782
1783static void html_raw_as_attr(htmloutput *ho, char *text)
1784{
1785 assert(ho->state == HO_IN_TAG || ho->state == HO_IN_EMPTY_TAG);
1786 fputc(' ', ho->fp);
1787 fputs(text, ho->fp);
1788}
1789
1790static void element_attr(htmloutput *ho, char const *name, char const *value)
1791{
1792 html_charset_cleanup(ho);
1793 assert(ho->state == HO_IN_TAG || ho->state == HO_IN_EMPTY_TAG);
1794 fprintf(ho->fp, " %s=\"%s\"", name, value);
1795}
1796
1797static void element_attr_w(htmloutput *ho, char const *name,
1798 wchar_t const *value)
1799{
1800 html_charset_cleanup(ho);
1801 fprintf(ho->fp, " %s=\"", name);
35b123a0 1802 html_text_limit_internal(ho, value, 0, TRUE, FALSE);
78c73085 1803 html_charset_cleanup(ho);
1804 fputc('"', ho->fp);
1805}
1806
1807static void html_text(htmloutput *ho, wchar_t const *text)
1808{
35b123a0 1809 return_mostly_to_neutral(ho);
1810 html_text_limit_internal(ho, text, 0, FALSE, FALSE);
1811}
1812
1813static void html_text_nbsp(htmloutput *ho, wchar_t const *text)
1814{
1815 return_mostly_to_neutral(ho);
1816 html_text_limit_internal(ho, text, 0, FALSE, TRUE);
78c73085 1817}
1818
1819static void html_text_limit(htmloutput *ho, wchar_t const *text, int maxlen)
1820{
35b123a0 1821 return_mostly_to_neutral(ho);
1822 html_text_limit_internal(ho, text, maxlen, FALSE, FALSE);
78c73085 1823}
1824
1825static void html_text_limit_internal(htmloutput *ho, wchar_t const *text,
35b123a0 1826 int maxlen, int quote_quotes, int nbsp)
78c73085 1827{
1828 int textlen = ustrlen(text);
1829 char outbuf[256];
1830 int bytes, err;
1831
1832 if (maxlen > 0 && textlen > maxlen)
1833 textlen = maxlen;
1834
1835 while (textlen > 0) {
1836 /* Scan ahead for characters we really can't display in HTML. */
1837 int lenbefore, lenafter;
1838 for (lenbefore = 0; lenbefore < textlen; lenbefore++)
1839 if (text[lenbefore] == L'<' ||
1840 text[lenbefore] == L'>' ||
1841 text[lenbefore] == L'&' ||
35b123a0 1842 (text[lenbefore] == L'"' && quote_quotes) ||
1843 (text[lenbefore] == L' ' && nbsp))
78c73085 1844 break;
1845 lenafter = lenbefore;
1846 bytes = charset_from_unicode(&text, &lenafter, outbuf, lenof(outbuf),
1847 ho->charset, &ho->cstate, &err);
1848 textlen -= (lenbefore - lenafter);
1849 if (bytes > 0)
1850 fwrite(outbuf, 1, bytes, ho->fp);
1851 if (err) {
1852 /*
1853 * We have encountered a character that cannot be
1854 * displayed in the selected output charset. Therefore,
1855 * we use an HTML numeric entity reference.
1856 */
1857 assert(textlen > 0);
1858 fprintf(ho->fp, "&#%ld;", (long int)*text);
1859 text++, textlen--;
1860 } else if (lenafter == 0 && textlen > 0) {
1861 /*
1862 * We have encountered a character which is special to
1863 * HTML.
1864 */
1865 if (*text == L'<')
1866 fprintf(ho->fp, "&lt;");
1867 else if (*text == L'>')
1868 fprintf(ho->fp, "&gt;");
1869 else if (*text == L'&')
1870 fprintf(ho->fp, "&amp;");
1871 else if (*text == L'"')
1872 fprintf(ho->fp, "&quot;");
35b123a0 1873 else if (*text == L' ') {
1874 assert(nbsp);
1875 fprintf(ho->fp, "&nbsp;");
1876 } else
78c73085 1877 assert(!"Can't happen");
1878 text++, textlen--;
1879 }
1880 }
1881}
1882
1883static void cleanup(htmloutput *ho)
1884{
1885 return_to_neutral(ho);
1886 fclose(ho->fp);
1887}
1888
1889static void html_href(htmloutput *ho, htmlfile *thisfile,
1890 htmlfile *targetfile, char *targetfrag)
1891{
1892 rdstringc rs = { 0, 0, NULL };
1893 char *url;
1894
1895 if (targetfile != thisfile)
1896 rdaddsc(&rs, targetfile->filename);
1897 if (targetfrag) {
1898 rdaddc(&rs, '#');
1899 rdaddsc(&rs, targetfrag);
1900 }
1901 url = rs.text;
1902
1903 element_open(ho, "a");
1904 element_attr(ho, "href", url);
1905 sfree(url);
1906}
1907
27bdc5ab 1908static void html_fragment(htmloutput *ho, char const *fragment)
1909{
1910 element_open(ho, "a");
1911 element_attr(ho, "name", fragment);
1912 if (is_xhtml(ho->ver))
1913 element_attr(ho, "id", fragment);
1914 element_close(ho, "a");
1915}
1916
78c73085 1917static char *html_format(paragraph *p, char *template_string)
1918{
1919 char *c, *t;
1920 word *w;
1921 wchar_t *ws, wsbuf[2];
1922 rdstringc rs = { 0, 0, NULL };
1923
1924 t = template_string;
1925 while (*t) {
1926 if (*t == '%' && t[1]) {
1927 int fmt;
1928
1929 t++;
1930 fmt = *t++;
1931
1932 if (fmt == '%') {
1933 rdaddc(&rs, fmt);
1934 continue;
1935 }
1936
1937 w = NULL;
1938 ws = NULL;
1939
1940 if (p->kwtext && fmt == 'n')
1941 w = p->kwtext;
1942 else if (p->kwtext2 && fmt == 'b') {
1943 /*
1944 * HTML fragment names must start with a letter, so
1945 * simply `1.2.3' is not adequate. In this case I'm
1946 * going to cheat slightly by prepending the first
1947 * character of the first word of kwtext, so that
1948 * we get `C1' for chapter 1, `S2.3' for section
1949 * 2.3 etc.
1950 */
1951 if (p->kwtext && p->kwtext->text[0]) {
1952 ws = wsbuf;
1953 wsbuf[1] = '\0';
1954 wsbuf[0] = p->kwtext->text[0];
1955 }
1956 w = p->kwtext2;
1957 } else if (p->keyword && *p->keyword && fmt == 'k')
1958 ws = p->keyword;
1959 else
1960 w = p->words;
1961
1962 if (ws) {
1963 c = utoa_dup(ws, CS_ASCII);
1964 rdaddsc(&rs,c);
1965 sfree(c);
1966 }
1967
1968 while (w) {
1969 if (removeattr(w->type) == word_Normal) {
1970 c = utoa_dup(w->text, CS_ASCII);
1971 rdaddsc(&rs,c);
1972 sfree(c);
1973 }
1974 w = w->next;
1975 }
1976 } else {
1977 rdaddc(&rs, *t++);
1978 }
1979 }
1980
1981 return rdtrimc(&rs);
1982}
1983
3e82de8f 1984static char *html_sanitise_fragment(htmlfilelist *files, htmlfile *file,
1985 char *text)
78c73085 1986{
1987 /*
1988 * The HTML 4 spec's strictest definition of fragment names (<a
1989 * name> and "id" attributes) says that they `must begin with a
1990 * letter and may be followed by any number of letters, digits,
1991 * hyphens, underscores, colons, and periods'.
1992 *
1993 * So here we unceremoniously rip out any characters not
1994 * conforming to this limitation.
1995 */
1996 char *p = text, *q = text;
1997
1998 while (*p && !((*p>='A' && *p<='Z') || (*p>='a' && *p<='z')))
1999 p++;
3e82de8f 2000 if ((*q++ = *p++) != '\0') {
2001 while (*p) {
2002 if ((*p>='A' && *p<='Z') ||
2003 (*p>='a' && *p<='z') ||
2004 (*p>='0' && *p<='9') ||
2005 *p=='-' || *p=='_' || *p==':' || *p=='.')
2006 *q++ = *p;
2007 p++;
2008 }
2009
2010 *q = '\0';
2011 }
2012
2013 /*
2014 * Now we check for clashes with other fragment names, and
2015 * adjust this one if necessary by appending a hyphen followed
2016 * by a number.
2017 */
2018 {
2019 htmlfragment *frag = snew(htmlfragment);
2020 int len = 0; /* >0 indicates we have resized */
2021 int suffix = 1;
2022
2023 frag->file = file;
2024 frag->fragment = text;
2025
2026 while (add234(files->frags, frag) != frag) {
2027 if (!len) {
2028 len = strlen(text);
2029 frag->fragment = text = sresize(text, len+20, char);
2030 }
2031
2032 sprintf(text + len, "-%d", ++suffix);
2033 }
78c73085 2034 }
2035
3e82de8f 2036 return text;
78c73085 2037}
2038
2039static void html_contents_entry(htmloutput *ho, int depth, htmlsect *s,
2040 htmlfile *thisfile, keywordlist *keywords,
2041 htmlconfig *cfg)
2042{
2043 while (ho->contents_level > depth) {
2044 element_close(ho, "ul");
2045 ho->contents_level--;
2046 }
2047
2048 while (ho->contents_level < depth) {
2049 element_open(ho, "ul");
2050 ho->contents_level++;
2051 }
2052
2053 if (!s)
2054 return;
2055
2056 element_open(ho, "li");
2057 html_href(ho, thisfile, s->file, s->fragment);
23c9bbc2 2058 html_section_title(ho, s, thisfile, keywords, cfg, FALSE);
78c73085 2059 element_close(ho, "a");
2060 element_close(ho, "li");
2061}
2062
2063static void html_section_title(htmloutput *ho, htmlsect *s, htmlfile *thisfile,
23c9bbc2 2064 keywordlist *keywords, htmlconfig *cfg,
2065 int real)
78c73085 2066{
2067 if (s->title) {
2068 sectlevel *sl;
2069 word *number;
2070 int depth = heading_depth(s->title);
2071
2072 if (depth < 0)
2073 sl = NULL;
2074 else if (depth == 0)
2075 sl = &cfg->achapter;
2076 else if (depth <= cfg->nasect)
2077 sl = &cfg->asect[depth-1];
2078 else
2079 sl = &cfg->asect[cfg->nasect-1];
2080
2081 if (!sl)
2082 number = NULL;
2083 else if (sl->just_numbers)
2084 number = s->title->kwtext2;
2085 else
2086 number = s->title->kwtext;
2087
2088 if (number) {
2089 html_words(ho, number, MARKUP,
2090 thisfile, keywords, cfg);
2091 html_text(ho, sl->number_suffix);
2092 }
2093
23c9bbc2 2094 html_words(ho, s->title->words, real ? ALL : MARKUP,
78c73085 2095 thisfile, keywords, cfg);
2096 } else {
2097 assert(s->type != NORMAL);
56a99eb6 2098 /*
2099 * If we're printing the full document title for _real_ and
2100 * there isn't one, we don't want to print `Preamble' at
2101 * the top of what ought to just be some text. If we need
2102 * it in any other context such as TOCs, we need to print
2103 * `Preamble'.
2104 */
2105 if (s->type == TOP && !real)
2106 html_text(ho, cfg->preamble_text);
78c73085 2107 else if (s->type == INDEX)
56a99eb6 2108 html_text(ho, cfg->index_text);
78c73085 2109 }
2110}