Fix up the restrict_charset and output_charset configuration for the
[sgt/halibut] / bk_html.c
CommitLineData
78c73085 1/*
2 * HTML backend for Halibut
3 */
4
5/*
6 * TODO:
7 *
8 * - I'm never entirely convinced that having a fragment link to
9 * come in at the start of the real text in the file is
10 * sensible. Perhaps for the topmost section in the file, no
11 * fragment should be used? (Though it should probably still be
12 * _there_ even if unused.)
13 *
14 * - new configurability:
56a99eb6 15 * * index_text, contents_text, preamble_text, title_separator,
16 * nav_prev_text, nav_next_text, nav_separator,
17 * index_main_sep, index_multi_sep, pre_versionid,
18 * post_versionid
78c73085 19 *
3e82de8f 20 * - nonbreaking spaces.
21 *
22 * - free up all the data we have allocated while running this
23 * backend.
78c73085 24 */
25
26#include <stdio.h>
27#include <stdlib.h>
28#include <assert.h>
29#include <limits.h>
30#include "halibut.h"
31
/*
 * Nonzero if `type' is one of the paragraph types that starts a
 * section: the document title, a chapter-level heading of any kind,
 * a heading or a subsection. The argument is evaluated several
 * times, so it must not have side effects.
 */
#define is_heading_type(type) \
    ( (type) == para_Title             || \
      (type) == para_Chapter           || \
      (type) == para_Appendix          || \
      (type) == para_UnnumberedChapter || \
      (type) == para_Heading           || \
      (type) == para_Subsect )

/*
 * Nesting depth of heading paragraph `p': -1 for the title, 1 for a
 * heading, aux+1 for a subsection, and 0 for everything else (which
 * includes the chapter-level types). `p' is evaluated several times.
 */
#define heading_depth(p) \
    ( (p)->type == para_Title   ? -1 : \
      (p)->type == para_Heading ? 1 : \
      (p)->type == para_Subsect ? (p)->aux + 1 : 0 )
42
/*
 * Numbering style for one level of section heading.
 */
typedef struct {
    int just_numbers;            /* boolean, set from the `-numeric' directives */
    wchar_t *number_suffix;      /* set from the `-suffix' directives */
} sectlevel;

/*
 * Everything the HTML backend can be configured with. Filled in by
 * html_configure().
 */
typedef struct {
    int nasect;                  /* number of entries in asect[] */
    sectlevel achapter;          /* numbering style for chapters */
    sectlevel *asect;            /* numbering style per section level */
    int *contents_depths;        /* 0=main, 1=chapter, 2=sect etc */
    int ncdepths;                /* number of entries in contents_depths[] */
    int address_section;
    int visible_version_id;
    int leaf_contains_contents;
    int leaf_smallest_contents;
    char *contents_filename;
    char *index_filename;
    char *template_filename;
    char *single_filename;
    char *template_fragment;
    char *head_end;
    char *body_start;
    char *body_end;
    char *addr_start;
    char *addr_end;
    char *body_tag;
    char *nav_attr;
    wchar_t *author;
    wchar_t *description;
    wchar_t *index_text;
    wchar_t *contents_text;
    wchar_t *preamble_text;
    wchar_t *title_separator;
    wchar_t *nav_prev_text;
    wchar_t *nav_next_text;
    wchar_t *nav_separator;
    wchar_t *index_main_sep;
    wchar_t *index_multi_sep;
    wchar_t *pre_versionid;
    wchar_t *post_versionid;
    int restrict_charset;
    int output_charset;
    /* Order matters: is_xhtml() relies on the XHTML entries coming last. */
    enum {
        HTML_3_2,
        HTML_4,
        ISO_HTML,
        XHTML_1_0_TRANSITIONAL,
        XHTML_1_0_STRICT
    } htmlver;
    wchar_t *lquote;
    wchar_t *rquote;
    int leaf_level;
} htmlconfig;
75
/*
 * Contents depth configured for heading level `level', or the
 * default of level+2 when no explicit entry exists for that level.
 * Both arguments are evaluated more than once.
 */
#define contents_depth(conf, level) \
    ( (level) < (conf).ncdepths ? (conf).contents_depths[(level)] \
                                : (level) + 2 )

/* Nonzero if `ver' is one of the XHTML flavours of the htmlver enum. */
#define is_xhtml(ver) (!((ver) < XHTML_1_0_TRANSITIONAL))
80
81typedef struct htmlfile htmlfile;
82typedef struct htmlsect htmlsect;
83
84struct htmlfile {
85 htmlfile *next;
86 char *filename;
87 int last_fragment_number;
88 int min_heading_depth;
89 htmlsect *first, *last; /* first/last highest-level sections */
90};
91
92struct htmlsect {
93 htmlsect *next, *parent;
94 htmlfile *file;
95 paragraph *title, *text;
96 enum { NORMAL, TOP, INDEX } type;
97 int contents_depth;
98 char *fragment;
99};
100
101typedef struct {
102 htmlfile *head, *tail;
103 htmlfile *single, *index;
3e82de8f 104 tree234 *frags;
78c73085 105} htmlfilelist;
106
107typedef struct {
108 htmlsect *head, *tail;
109} htmlsectlist;
110
111typedef struct {
3e82de8f 112 htmlfile *file;
113 char *fragment;
114} htmlfragment;
115
116typedef struct {
78c73085 117 int nrefs, refsize;
118 word **refs;
119} htmlindex;
120
121typedef struct {
122 htmlsect *section;
123 char *fragment;
1b7bf715 124 int generated, referenced;
78c73085 125} htmlindexref;
126
127typedef struct {
128 /*
129 * This level deals with charset conversion, starting and
130 * ending tags, and writing to the file. It's the lexical
131 * level.
132 */
133 FILE *fp;
b7309494 134 int charset, restrict_charset;
78c73085 135 charset_state cstate;
136 int ver;
137 enum {
138 HO_NEUTRAL, HO_IN_TAG, HO_IN_EMPTY_TAG, HO_IN_TEXT
139 } state;
140 /*
141 * Stuff beyond here deals with the higher syntactic level: it
142 * tracks how many levels of <ul> are currently open when
143 * producing a contents list, for example.
144 */
145 int contents_level;
146} htmloutput;
147
3e82de8f 148static int html_fragment_compare(void *av, void *bv)
149{
150 htmlfragment *a = (htmlfragment *)av;
151 htmlfragment *b = (htmlfragment *)bv;
152 int cmp;
153
154 if ((cmp = strcmp(a->file->filename, b->file->filename)) != 0)
155 return cmp;
156 else
157 return strcmp(a->fragment, b->fragment);
158}
159
78c73085 160static void html_file_section(htmlconfig *cfg, htmlfilelist *files,
161 htmlsect *sect, int depth);
162
163static htmlfile *html_new_file(htmlfilelist *list, char *filename);
164static htmlsect *html_new_sect(htmlsectlist *list, paragraph *title);
165
166/* Flags for html_words() flags parameter */
167#define NOTHING 0x00
168#define MARKUP 0x01
169#define LINKS 0x02
170#define INDEXENTS 0x04
171#define ALL 0x07
172static void html_words(htmloutput *ho, word *words, int flags,
173 htmlfile *file, keywordlist *keywords, htmlconfig *cfg);
174static void html_codepara(htmloutput *ho, word *words);
175
176static void element_open(htmloutput *ho, char const *name);
177static void element_close(htmloutput *ho, char const *name);
178static void element_empty(htmloutput *ho, char const *name);
179static void element_attr(htmloutput *ho, char const *name, char const *value);
180static void element_attr_w(htmloutput *ho, char const *name,
181 wchar_t const *value);
182static void html_text(htmloutput *ho, wchar_t const *str);
183static void html_text_limit(htmloutput *ho, wchar_t const *str, int maxlen);
184static void html_text_limit_internal(htmloutput *ho, wchar_t const *text,
185 int maxlen, int quote_quotes);
186static void html_nl(htmloutput *ho);
187static void html_raw(htmloutput *ho, char *text);
188static void html_raw_as_attr(htmloutput *ho, char *text);
189static void cleanup(htmloutput *ho);
190
191static void html_href(htmloutput *ho, htmlfile *thisfile,
192 htmlfile *targetfile, char *targetfrag);
27bdc5ab 193static void html_fragment(htmloutput *ho, char const *fragment);
78c73085 194
195static char *html_format(paragraph *p, char *template_string);
3e82de8f 196static char *html_sanitise_fragment(htmlfilelist *files, htmlfile *file,
197 char *text);
78c73085 198
199static void html_contents_entry(htmloutput *ho, int depth, htmlsect *s,
200 htmlfile *thisfile, keywordlist *keywords,
201 htmlconfig *cfg);
202static void html_section_title(htmloutput *ho, htmlsect *s,
203 htmlfile *thisfile, keywordlist *keywords,
23c9bbc2 204 htmlconfig *cfg, int real);
78c73085 205
206static htmlconfig html_configure(paragraph *source) {
207 htmlconfig ret;
208 paragraph *p;
209
210 /*
211 * Defaults.
212 */
213 ret.leaf_level = 2;
214 ret.achapter.just_numbers = FALSE;
215 ret.achapter.number_suffix = L": ";
216 ret.nasect = 1;
f1530049 217 ret.asect = snewn(ret.nasect, sectlevel);
78c73085 218 ret.asect[0].just_numbers = TRUE;
219 ret.asect[0].number_suffix = L" ";
220 ret.ncdepths = 0;
221 ret.contents_depths = 0;
222 ret.visible_version_id = TRUE;
223 ret.address_section = TRUE;
224 ret.leaf_contains_contents = FALSE;
225 ret.leaf_smallest_contents = 4;
226 ret.single_filename = dupstr("Manual.html");
227 ret.contents_filename = dupstr("Contents.html");
228 ret.index_filename = dupstr("IndexPage.html");
229 ret.template_filename = dupstr("%n.html");
230 ret.template_fragment = dupstr("%b");
231 ret.head_end = ret.body_tag = ret.body_start = ret.body_end =
232 ret.addr_start = ret.addr_end = ret.nav_attr = NULL;
233 ret.author = ret.description = NULL;
b7309494 234 ret.restrict_charset = CS_UTF8;
78c73085 235 ret.output_charset = CS_ASCII;
236 ret.htmlver = HTML_4;
56a99eb6 237 ret.index_text = L"Index";
238 ret.contents_text = L"Contents";
239 ret.preamble_text = L"Preamble";
240 ret.title_separator = L" - ";
241 ret.nav_prev_text = L"Previous";
242 ret.nav_next_text = L"Next";
243 ret.nav_separator = L" | ";
244 ret.index_main_sep = L": ";
245 ret.index_multi_sep = L", ";
246 ret.pre_versionid = L"[";
247 ret.post_versionid = L"]";
78c73085 248 /*
249 * Default quote characters are Unicode matched single quotes,
250 * falling back to ordinary ASCII ".
251 */
252 ret.lquote = L"\x2018\0\x2019\0\"\0\"\0\0";
253 ret.rquote = uadv(ret.lquote);
254
255 /*
256 * Two-pass configuration so that we can pick up global config
257 * (e.g. `quotes') before having it overridden by specific
258 * config (`html-quotes'), irrespective of the order in which
259 * they occur.
260 */
261 for (p = source; p; p = p->next) {
262 if (p->type == para_Config) {
263 if (!ustricmp(p->keyword, L"quotes")) {
264 if (*uadv(p->keyword) && *uadv(uadv(p->keyword))) {
265 ret.lquote = uadv(p->keyword);
266 ret.rquote = uadv(ret.lquote);
267 }
268 }
269 }
270 }
271
272 for (p = source; p; p = p->next) {
273 if (p->type == para_Config) {
274 wchar_t *k = p->keyword;
275
276 if (!ustrnicmp(k, L"xhtml-", 6))
277 k++; /* treat `xhtml-' and `html-' the same */
278
b7309494 279 if (!ustricmp(k, L"html-restrict-charset")) {
78c73085 280 char *csname = utoa_dup(uadv(k), CS_ASCII);
b7309494 281 ret.restrict_charset = charset_from_localenc(csname);
282 sfree(csname);
283 } else if (!ustricmp(k, L"html-output-charset")) {
284 char *csname = utoa_dup(uadv(k), CS_ASCII);
285 ret.output_charset = charset_from_localenc(csname);
78c73085 286 sfree(csname);
27bdc5ab 287 } else if (!ustricmp(k, L"html-version")) {
288 wchar_t *vername = uadv(k);
289 static const struct {
290 const wchar_t *name;
291 int ver;
292 } versions[] = {
293 {L"html3.2", HTML_3_2},
294 {L"html4", HTML_4},
295 {L"iso-html", ISO_HTML},
296 {L"xhtml1.0transitional", XHTML_1_0_TRANSITIONAL},
297 {L"xhtml1.0strict", XHTML_1_0_STRICT}
298 };
299 int i;
300
301 for (i = 0; i < (int)lenof(versions); i++)
302 if (!ustricmp(versions[i].name, vername))
303 break;
304
305 if (i == lenof(versions))
306 error(err_htmlver, &p->fpos, vername);
307 else
308 ret.htmlver = versions[i].ver;
78c73085 309 } else if (!ustricmp(k, L"html-single-filename")) {
310 sfree(ret.single_filename);
311 ret.single_filename = dupstr(adv(p->origkeyword));
312 } else if (!ustricmp(k, L"html-contents-filename")) {
313 sfree(ret.contents_filename);
314 ret.contents_filename = dupstr(adv(p->origkeyword));
315 } else if (!ustricmp(k, L"html-index-filename")) {
316 sfree(ret.index_filename);
317 ret.index_filename = dupstr(adv(p->origkeyword));
318 } else if (!ustricmp(k, L"html-template-filename")) {
319 sfree(ret.template_filename);
320 ret.template_filename = dupstr(adv(p->origkeyword));
321 } else if (!ustricmp(k, L"html-template-fragment")) {
322 sfree(ret.template_fragment);
323 ret.template_fragment = dupstr(adv(p->origkeyword));
324 } else if (!ustricmp(k, L"html-chapter-numeric")) {
325 ret.achapter.just_numbers = utob(uadv(k));
326 } else if (!ustricmp(k, L"html-chapter-suffix")) {
327 ret.achapter.number_suffix = uadv(k);
328 } else if (!ustricmp(k, L"html-leaf-level")) {
329 ret.leaf_level = utoi(uadv(k));
330 } else if (!ustricmp(k, L"html-section-numeric")) {
331 wchar_t *q = uadv(k);
332 int n = 0;
333 if (uisdigit(*q)) {
334 n = utoi(q);
335 q = uadv(q);
336 }
337 if (n >= ret.nasect) {
338 int i;
f1530049 339 ret.asect = sresize(ret.asect, n+1, sectlevel);
78c73085 340 for (i = ret.nasect; i <= n; i++)
341 ret.asect[i] = ret.asect[ret.nasect-1];
342 ret.nasect = n+1;
343 }
344 ret.asect[n].just_numbers = utob(q);
345 } else if (!ustricmp(k, L"html-section-suffix")) {
346 wchar_t *q = uadv(k);
347 int n = 0;
348 if (uisdigit(*q)) {
349 n = utoi(q);
350 q = uadv(q);
351 }
352 if (n >= ret.nasect) {
353 int i;
f1530049 354 ret.asect = sresize(ret.asect, n+1, sectlevel);
78c73085 355 for (i = ret.nasect; i <= n; i++) {
356 ret.asect[i] = ret.asect[ret.nasect-1];
357 }
358 ret.nasect = n+1;
359 }
360 ret.asect[n].number_suffix = q;
361 } else if (!ustricmp(k, L"html-contents-depth") ||
362 !ustrnicmp(k, L"html-contents-depth-", 20)) {
363 /*
364 * Relic of old implementation: this directive used
365 * to be written as \cfg{html-contents-depth-3}{2}
366 * rather than the usual Halibut convention of
367 * \cfg{html-contents-depth}{3}{2}. We therefore
368 * support both.
369 */
370 wchar_t *q = k[19] ? k+20 : uadv(k);
371 int n = 0;
372 if (uisdigit(*q)) {
373 n = utoi(q);
374 q = uadv(q);
375 }
376 if (n >= ret.ncdepths) {
377 int i;
f1530049 378 ret.contents_depths =
379 sresize(ret.contents_depths, n+1, int);
78c73085 380 for (i = ret.ncdepths; i <= n; i++) {
381 ret.contents_depths[i] = i+2;
382 }
383 ret.ncdepths = n+1;
384 }
385 ret.contents_depths[n] = utoi(q);
386 } else if (!ustricmp(k, L"html-head-end")) {
387 ret.head_end = adv(p->origkeyword);
388 } else if (!ustricmp(k, L"html-body-tag")) {
389 ret.body_tag = adv(p->origkeyword);
390 } else if (!ustricmp(k, L"html-body-start")) {
391 ret.body_start = adv(p->origkeyword);
392 } else if (!ustricmp(k, L"html-body-end")) {
393 ret.body_end = adv(p->origkeyword);
394 } else if (!ustricmp(k, L"html-address-start")) {
395 ret.addr_start = adv(p->origkeyword);
396 } else if (!ustricmp(k, L"html-address-end")) {
397 ret.addr_end = adv(p->origkeyword);
398 } else if (!ustricmp(k, L"html-navigation-attributes")) {
399 ret.nav_attr = adv(p->origkeyword);
400 } else if (!ustricmp(k, L"html-author")) {
401 ret.author = uadv(k);
402 } else if (!ustricmp(k, L"html-description")) {
403 ret.description = uadv(k);
404 } else if (!ustricmp(k, L"html-suppress-address")) {
405 ret.address_section = !utob(uadv(k));
406 } else if (!ustricmp(k, L"html-versionid")) {
407 ret.visible_version_id = utob(uadv(k));
408 } else if (!ustricmp(k, L"html-quotes")) {
409 if (*uadv(k) && *uadv(uadv(k))) {
410 ret.lquote = uadv(k);
411 ret.rquote = uadv(ret.lquote);
412 }
413 } else if (!ustricmp(k, L"html-leaf-contains-contents")) {
414 ret.leaf_contains_contents = utob(uadv(k));
415 } else if (!ustricmp(k, L"html-leaf-smallest-contents")) {
416 ret.leaf_smallest_contents = utoi(uadv(k));
417 }
418 }
419 }
420
421 /*
422 * Now process fallbacks on quote characters.
423 */
424 while (*uadv(ret.rquote) && *uadv(uadv(ret.rquote)) &&
425 (!cvt_ok(ret.restrict_charset, ret.lquote) ||
426 !cvt_ok(ret.restrict_charset, ret.rquote))) {
427 ret.lquote = uadv(ret.rquote);
428 ret.rquote = uadv(ret.lquote);
429 }
430
431 return ret;
432}
433
434paragraph *html_config_filename(char *filename)
435{
436 /*
437 * If the user passes in a single filename as a parameter to
438 * the `--html' command-line option, then we should assume it
439 * to imply _two_ config directives:
440 * \cfg{html-single-filename}{whatever} and
441 * \cfg{html-leaf-level}{0}; the rationale being that the user
442 * wants their output _in that file_.
443 */
444 paragraph *p, *q;
445
446 p = cmdline_cfg_simple("html-single-filename", filename, NULL);
447 q = cmdline_cfg_simple("html-leaf-level", "0", NULL);
448 p->next = q;
449 return p;
450}
451
452void html_backend(paragraph *sourceform, keywordlist *keywords,
453 indexdata *idx, void *unused) {
454 paragraph *p;
455 htmlconfig conf;
3e82de8f 456 htmlfilelist files = { NULL, NULL, NULL, NULL, NULL };
78c73085 457 htmlsectlist sects = { NULL, NULL }, nonsects = { NULL, NULL };
458
459 IGNORE(unused);
460
461 conf = html_configure(sourceform);
462
463 /*
464 * We're going to make heavy use of paragraphs' private data
465 * fields in the forthcoming code. Clear them first, so we can
466 * reliably tell whether we have auxiliary data for a
467 * particular paragraph.
468 */
469 for (p = sourceform; p; p = p->next)
470 p->private_data = NULL;
471
3e82de8f 472 files.frags = newtree234(html_fragment_compare);
473
78c73085 474 /*
475 * Start by figuring out into which file each piece of the
476 * document should be put. We'll do this by inventing an
477 * `htmlsect' structure and stashing it in the private_data
478 * field of each section paragraph; we also need one additional
479 * htmlsect for the document index, which won't show up in the
480 * source form but needs to be consistently mentioned in
481 * contents links.
482 *
483 * While we're here, we'll also invent the HTML fragment name
484 * for each section.
485 */
486 {
487 htmlsect *topsect, *sect;
488 int d;
489
56a99eb6 490 topsect = html_new_sect(&sects, NULL);
78c73085 491 topsect->type = TOP;
492 topsect->title = NULL;
493 topsect->text = sourceform;
494 topsect->contents_depth = contents_depth(conf, 0);
495 html_file_section(&conf, &files, topsect, -1);
496 topsect->fragment = NULL;
497
498 for (p = sourceform; p; p = p->next)
499 if (is_heading_type(p->type)) {
500 d = heading_depth(p);
501
502 if (p->type == para_Title) {
503 topsect->title = p;
504 continue;
505 }
506
507 sect = html_new_sect(&sects, p);
508 sect->text = p->next;
509
510 sect->contents_depth = contents_depth(conf, d+1) - (d+1);
511
512 if (p->parent) {
513 sect->parent = (htmlsect *)p->parent->private_data;
514 assert(sect->parent != NULL);
515 } else
516 sect->parent = topsect;
517 p->private_data = sect;
518
519 html_file_section(&conf, &files, sect, d);
520
521 sect->fragment = html_format(p, conf.template_fragment);
3e82de8f 522 sect->fragment = html_sanitise_fragment(&files, sect->file,
523 sect->fragment);
78c73085 524 }
525
526 /* And the index. */
527 sect = html_new_sect(&sects, NULL);
78c73085 528 sect->text = NULL;
529 sect->type = INDEX;
530 sect->parent = topsect;
531 html_file_section(&conf, &files, sect, 0); /* peer of chapters */
56a99eb6 532 sect->fragment = utoa_dup(conf.index_text, CS_ASCII);
3e82de8f 533 sect->fragment = html_sanitise_fragment(&files, sect->file,
534 sect->fragment);
78c73085 535 files.index = sect->file;
536 }
537
538 /*
539 * Go through the keyword list and sort out fragment IDs for
540 * all the potentially referenced paragraphs which _aren't_
541 * headings.
542 */
543 {
544 int i;
545 keyword *kw;
546 htmlsect *sect;
547
548 for (i = 0; (kw = index234(keywords->keys, i)) != NULL; i++) {
549 paragraph *q, *p = kw->para;
550
551 if (!is_heading_type(p->type)) {
552 htmlsect *parent;
553
554 /*
555 * Find the paragraph's parent htmlsect, to
556 * determine which file it will end up in.
557 */
558 q = p->parent;
559 if (!q) {
560 /*
561 * Preamble paragraphs have no parent. So if we
562 * have a non-heading with no parent, it must
563 * be preamble, and therefore its parent
564 * htmlsect must be the preamble one.
565 */
566 assert(sects.head &&
567 sects.head->type == TOP);
568 parent = sects.head;
569 } else
570 parent = (htmlsect *)q->private_data;
571
572 /*
573 * Now we can construct an htmlsect for this
574 * paragraph itself, taking care to put it in the
575 * list of non-sections rather than the list of
576 * sections (so that traverses of the `sects' list
577 * won't attempt to add it to the contents or
578 * anything weird like that).
579 */
580 sect = html_new_sect(&nonsects, p);
581 sect->file = parent->file;
582 sect->parent = parent;
583 p->private_data = sect;
584
585 /*
04781c84 586 * Fragment IDs for these paragraphs will simply be
587 * `p' followed by an integer.
78c73085 588 */
f1530049 589 sect->fragment = snewn(40, char);
04781c84 590 sprintf(sect->fragment, "p%d",
591 sect->file->last_fragment_number++);
3e82de8f 592 sect->fragment = html_sanitise_fragment(&files, sect->file,
593 sect->fragment);
78c73085 594 }
595 }
596 }
597
598 /*
04781c84 599 * Reset the fragment numbers in each file. I've just used them
600 * to generate `p' fragment IDs for non-section paragraphs
601 * (numbered list elements, bibliocited), and now I want to use
602 * them for `i' fragment IDs for index entries.
603 */
604 {
605 htmlfile *file;
606 for (file = files.head; file; file = file->next)
607 file->last_fragment_number = 0;
608 }
609
610 /*
78c73085 611 * Now sort out the index. This involves:
612 *
613 * - For each index term, we set up an htmlindex structure to
614 * store all the references to that term.
615 *
616 * - Then we make a pass over the actual document, finding
617 * every word_IndexRef; for each one, we actually figure out
618 * the HTML filename/fragment pair we will use to reference
619 * it, store that information in the private data field of
620 * the word_IndexRef itself (so we can recreate it when the
621 * time comes to output our HTML), and add a reference to it
622 * to the index term in question.
623 */
624 {
625 int i;
626 indexentry *entry;
627 htmlsect *lastsect;
628 word *w;
629
630 /*
631 * Set up the htmlindex structures.
632 */
633
634 for (i = 0; (entry = index234(idx->entries, i)) != NULL; i++) {
f1530049 635 htmlindex *hi = snew(htmlindex);
78c73085 636
637 hi->nrefs = hi->refsize = 0;
638 hi->refs = NULL;
639
640 entry->backend_data = hi;
641 }
642
643 /*
644 * Run over the document inventing fragments. Each fragment
645 * is of the form `i' followed by an integer.
78c73085 646 */
56a99eb6 647 lastsect = sects.head; /* this is always the top section */
78c73085 648 for (p = sourceform; p; p = p->next) {
56a99eb6 649 if (is_heading_type(p->type) && p->type != para_Title)
78c73085 650 lastsect = (htmlsect *)p->private_data;
651
652 for (w = p->words; w; w = w->next)
653 if (w->type == word_IndexRef) {
f1530049 654 htmlindexref *hr = snew(htmlindexref);
78c73085 655 indextag *tag;
656 int i;
657
1b7bf715 658 hr->referenced = hr->generated = FALSE;
78c73085 659 hr->section = lastsect;
78c73085 660 {
661 char buf[40];
662 sprintf(buf, "i%d",
663 lastsect->file->last_fragment_number++);
664 hr->fragment = dupstr(buf);
3e82de8f 665 hr->fragment =
666 html_sanitise_fragment(&files, hr->section->file,
667 hr->fragment);
78c73085 668 }
669 w->private_data = hr;
670
671 tag = index_findtag(idx, w->text);
672 if (!tag)
673 break;
674
675 for (i = 0; i < tag->nrefs; i++) {
676 indexentry *entry = tag->refs[i];
677 htmlindex *hi = (htmlindex *)entry->backend_data;
678
679 if (hi->nrefs >= hi->refsize) {
680 hi->refsize += 32;
f1530049 681 hi->refs = sresize(hi->refs, hi->refsize, word *);
78c73085 682 }
683
684 hi->refs[hi->nrefs++] = w;
685 }
686 }
687 }
688 }
689
690 /*
691 * Now we're ready to write out the actual HTML files.
692 *
693 * For each file:
694 *
695 * - we open that file and write its header
696 * - we run down the list of sections
697 * - for each section directly contained within that file, we
698 * output the section text
699 * - for each section which is not in the file but which has a
700 * parent that is, we output a contents entry for the
701 * section if appropriate
702 * - finally, we output the file trailer and close the file.
703 */
704 {
705 htmlfile *f, *prevf;
706 htmlsect *s;
707 paragraph *p;
708
709 prevf = NULL;
710
711 for (f = files.head; f; f = f->next) {
712 htmloutput ho;
713 int displaying;
714 enum LISTTYPE { NOLIST, UL, OL, DL };
715 enum ITEMTYPE { NOITEM, LI, DT, DD };
716 struct stackelement {
717 struct stackelement *next;
718 enum LISTTYPE listtype;
719 enum ITEMTYPE itemtype;
720 } *stackhead;
721
722#define listname(lt) ( (lt)==UL ? "ul" : (lt)==OL ? "ol" : "dl" )
723#define itemname(lt) ( (lt)==LI ? "li" : (lt)==DT ? "dt" : "dd" )
724
725 ho.fp = fopen(f->filename, "w");
726 ho.charset = conf.output_charset;
b7309494 727 ho.restrict_charset = conf.restrict_charset;
78c73085 728 ho.cstate = charset_init_state;
729 ho.ver = conf.htmlver;
730 ho.state = HO_NEUTRAL;
731 ho.contents_level = 0;
732
733 /* <!DOCTYPE>. */
734 switch (conf.htmlver) {
735 case HTML_3_2:
736 fprintf(ho.fp, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD "
737 "HTML 3.2 Final//EN\">\n");
738 break;
739 case HTML_4:
740 fprintf(ho.fp, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML"
741 " 4.01//EN\"\n\"http://www.w3.org/TR/html4/"
742 "strict.dtd\">\n");
743 break;
27bdc5ab 744 case ISO_HTML:
745 fprintf(ho.fp, "<!DOCTYPE HTML PUBLIC \"ISO/IEC "
746 "15445:2000//DTD HTML//EN\">\n");
747 break;
78c73085 748 case XHTML_1_0_TRANSITIONAL:
27bdc5ab 749 fprintf(ho.fp, "<?xml version=\"1.0\" encoding=\"%s\"?>\n",
750 charset_to_mimeenc(conf.output_charset));
78c73085 751 fprintf(ho.fp, "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML"
752 " 1.0 Transitional//EN\"\n\"http://www.w3.org/TR/"
753 "xhtml1/DTD/xhtml1-transitional.dtd\">\n");
754 break;
755 case XHTML_1_0_STRICT:
27bdc5ab 756 fprintf(ho.fp, "<?xml version=\"1.0\" encoding=\"%s\"?>\n",
757 charset_to_mimeenc(conf.output_charset));
78c73085 758 fprintf(ho.fp, "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML"
759 " 1.0 Strict//EN\"\n\"http://www.w3.org/TR/xhtml1/"
760 "DTD/xhtml1-strict.dtd\">\n");
761 break;
762 }
763
764 element_open(&ho, "html");
765 if (is_xhtml(conf.htmlver)) {
766 element_attr(&ho, "xmlns", "http://www.w3.org/1999/xhtml");
767 }
768 html_nl(&ho);
769
770 element_open(&ho, "head");
771 html_nl(&ho);
772
773 element_empty(&ho, "meta");
774 element_attr(&ho, "http-equiv", "content-type");
775 {
776 char buf[200];
777 sprintf(buf, "text/html; charset=%.150s",
778 charset_to_mimeenc(conf.output_charset));
779 element_attr(&ho, "content", buf);
780 }
781 html_nl(&ho);
782
783 if (conf.author) {
784 element_empty(&ho, "meta");
785 element_attr(&ho, "name", "author");
786 element_attr_w(&ho, "content", conf.author);
787 html_nl(&ho);
788 }
789
790 if (conf.description) {
791 element_empty(&ho, "meta");
792 element_attr(&ho, "name", "description");
793 element_attr_w(&ho, "content", conf.description);
794 html_nl(&ho);
795 }
796
797 element_open(&ho, "title");
798 if (f->first && f->first->title) {
799 html_words(&ho, f->first->title->words, NOTHING,
800 f, keywords, &conf);
801
802 assert(f->last);
803 if (f->last != f->first && f->last->title) {
56a99eb6 804 html_text(&ho, conf.title_separator);
78c73085 805 html_words(&ho, f->last->title->words, NOTHING,
806 f, keywords, &conf);
807 }
808 }
809 element_close(&ho, "title");
810 html_nl(&ho);
811
812 if (conf.head_end)
813 html_raw(&ho, conf.head_end);
814
815 element_close(&ho, "head");
816 html_nl(&ho);
817
78c73085 818 if (conf.body_tag)
819 html_raw(&ho, conf.body_tag);
820 else
821 element_open(&ho, "body");
822 html_nl(&ho);
823
824 if (conf.body_start)
825 html_raw(&ho, conf.body_start);
826
827 /*
828 * Write out a nav bar. Special case: we don't do this
829 * if there is only one file.
830 */
831 if (files.head != files.tail) {
832 element_open(&ho, "p");
833 if (conf.nav_attr)
834 html_raw_as_attr(&ho, conf.nav_attr);
835
836 if (prevf) {
837 element_open(&ho, "a");
838 element_attr(&ho, "href", prevf->filename);
839 }
56a99eb6 840 html_text(&ho, conf.nav_prev_text);
78c73085 841 if (prevf)
842 element_close(&ho, "a");
843
56a99eb6 844 html_text(&ho, conf.nav_separator);
78c73085 845
846 if (f != files.head) {
847 element_open(&ho, "a");
848 element_attr(&ho, "href", files.head->filename);
849 }
56a99eb6 850 html_text(&ho, conf.contents_text);
78c73085 851 if (f != files.head)
852 element_close(&ho, "a");
853
56a99eb6 854 html_text(&ho, conf.nav_separator);
78c73085 855
856 if (f != files.index) {
857 element_open(&ho, "a");
858 element_attr(&ho, "href", files.index->filename);
859 }
56a99eb6 860 html_text(&ho, conf.index_text);
78c73085 861 if (f != files.index)
862 element_close(&ho, "a");
863
56a99eb6 864 html_text(&ho, conf.nav_separator);
78c73085 865
866 if (f->next) {
867 element_open(&ho, "a");
868 element_attr(&ho, "href", f->next->filename);
869 }
56a99eb6 870 html_text(&ho, conf.nav_next_text);
78c73085 871 if (f->next)
872 element_close(&ho, "a");
873
874 element_close(&ho, "p");
875 html_nl(&ho);
876 }
877 prevf = f;
878
879 /*
880 * Write out a prefix TOC for the file.
881 *
882 * We start by going through the section list and
883 * collecting the sections which need to be added to
884 * the contents. On the way, we also test to see if
885 * this file is a leaf file (defined as one which
886 * contains all descendants of any section it
887 * contains), because this will play a part in our
888 * decision on whether or not to _output_ the TOC.
889 *
890 * Special case: we absolutely do not do this if we're
891 * in single-file mode.
892 */
893 if (files.head != files.tail) {
894 int ntoc = 0, tocsize = 0;
895 htmlsect **toc = NULL;
896 int leaf = TRUE;
897
898 for (s = sects.head; s; s = s->next) {
899 htmlsect *a, *ac;
900 int depth, adepth;
901
902 /*
903 * Search up from this section until we find
904 * the highest-level one which belongs in this
905 * file.
906 */
907 depth = adepth = 0;
908 a = NULL;
909 for (ac = s; ac; ac = ac->parent) {
910 if (ac->file == f) {
911 a = ac;
912 adepth = depth;
913 }
914 depth++;
915 }
916
917 if (s->file != f && a != NULL)
918 leaf = FALSE;
919
920 if (a) {
921 if (adepth <= a->contents_depth) {
922 if (ntoc >= tocsize) {
923 tocsize += 64;
f1530049 924 toc = sresize(toc, tocsize, htmlsect *);
78c73085 925 }
926 toc[ntoc++] = s;
927 }
928 }
929 }
930
931 if (leaf && conf.leaf_contains_contents &&
932 ntoc >= conf.leaf_smallest_contents) {
933 int i;
934
935 for (i = 0; i < ntoc; i++) {
936 htmlsect *s = toc[i];
937 int hlevel = (s->type == TOP ? -1 :
938 s->type == INDEX ? 0 :
939 heading_depth(s->title))
940 - f->min_heading_depth + 1;
941
942 assert(hlevel >= 1);
943 html_contents_entry(&ho, hlevel, s,
944 f, keywords, &conf);
945 }
946 html_contents_entry(&ho, 0, NULL, f, keywords, &conf);
947 }
948 }
949
950 /*
951 * Now go through the document and output some real
952 * text.
953 */
954 displaying = FALSE;
955 for (s = sects.head; s; s = s->next) {
956 if (s->file == f) {
957 /*
958 * This section belongs in this file.
959 * Display it.
960 */
961 displaying = TRUE;
962 } else {
963 htmlsect *a, *ac;
964 int depth, adepth;
965
966 displaying = FALSE;
967
968 /*
969 * Search up from this section until we find
970 * the highest-level one which belongs in this
971 * file.
972 */
973 depth = adepth = 0;
974 a = NULL;
975 for (ac = s; ac; ac = ac->parent) {
976 if (ac->file == f) {
977 a = ac;
978 adepth = depth;
979 }
980 depth++;
981 }
982
983 if (a != NULL) {
984 /*
985 * This section does not belong in this
986 * file, but an ancestor of it does. Write
987 * out a contents table entry, if the depth
988 * doesn't exceed the maximum contents
989 * depth for the ancestor section.
990 */
991 if (adepth <= a->contents_depth) {
992 html_contents_entry(&ho, adepth, s,
993 f, keywords, &conf);
994 }
995 }
996 }
997
998 if (displaying) {
999 int hlevel;
1000 char htag[3];
1001
1002 html_contents_entry(&ho, 0, NULL, f, keywords, &conf);
1003
1004 /*
1005 * Display the section heading.
1006 */
1007
1008 hlevel = (s->type == TOP ? -1 :
1009 s->type == INDEX ? 0 :
1010 heading_depth(s->title))
1011 - f->min_heading_depth + 1;
1012 assert(hlevel >= 1);
1013 /* HTML headings only go up to <h6> */
1014 if (hlevel > 6)
1015 hlevel = 6;
1016 htag[0] = 'h';
1017 htag[1] = '0' + hlevel;
1018 htag[2] = '\0';
1019 element_open(&ho, htag);
1020
1021 /*
1022 * Provide anchor for cross-links to target.
1023 *
78c73085 1024 * (Also we'll have to do this separately in
1025 * other paragraph types - NumberedList and
1026 * BiblioCited.)
1027 */
27bdc5ab 1028 if (s->fragment)
1029 html_fragment(&ho, s->fragment);
78c73085 1030
23c9bbc2 1031 html_section_title(&ho, s, f, keywords, &conf, TRUE);
78c73085 1032
1033 element_close(&ho, htag);
1034
1035 /*
1036 * Now display the section text.
1037 */
1038 if (s->text) {
f1530049 1039 stackhead = snew(struct stackelement);
78c73085 1040 stackhead->next = NULL;
1041 stackhead->listtype = NOLIST;
1042 stackhead->itemtype = NOITEM;
1043
1044 for (p = s->text;; p = p->next) {
1045 enum LISTTYPE listtype;
1046 struct stackelement *se;
1047
1048 /*
1049 * Preliminary switch to figure out what
1050 * sort of list we expect to be inside at
1051 * this stage.
1052 *
1053 * Since p may still be NULL at this point,
1054 * I invent a harmless paragraph type for
1055 * it if it is.
1056 */
1057 switch (p ? p->type : para_Normal) {
1058 case para_Rule:
1059 case para_Normal:
1060 case para_Copyright:
1061 case para_BiblioCited:
1062 case para_Code:
1063 case para_QuotePush:
1064 case para_QuotePop:
1065 case para_Chapter:
1066 case para_Appendix:
1067 case para_UnnumberedChapter:
1068 case para_Heading:
1069 case para_Subsect:
1070 case para_LcontPop:
1071 listtype = NOLIST;
1072 break;
1073
1074 case para_Bullet:
1075 listtype = UL;
1076 break;
1077
1078 case para_NumberedList:
1079 listtype = OL;
1080 break;
1081
1082 case para_DescribedThing:
1083 case para_Description:
1084 listtype = DL;
1085 break;
1086
1087 case para_LcontPush:
f1530049 1088 se = snew(struct stackelement);
78c73085 1089 se->next = stackhead;
1090 se->listtype = NOLIST;
1091 se->itemtype = NOITEM;
1092 stackhead = se;
1093 continue;
1094
1095 default: /* some totally non-printing para */
1096 continue;
1097 }
1098
1099 html_nl(&ho);
1100
1101 /*
1102 * Terminate the most recent list item, if
1103 * any. (We left this until after
1104 * processing LcontPush, since in that case
1105 * the list item won't want to be
1106 * terminated until after the corresponding
1107 * LcontPop.)
1108 */
1109 if (stackhead->itemtype != NOITEM) {
1110 element_close(&ho, itemname(stackhead->itemtype));
1111 html_nl(&ho);
1112 }
1113 stackhead->itemtype = NOITEM;
1114
1115 /*
1116 * Terminate the current list, if it's not
1117 * the one we want to be in.
1118 */
1119 if (listtype != stackhead->listtype &&
1120 stackhead->listtype != NOLIST) {
1121 element_close(&ho, listname(stackhead->listtype));
1122 html_nl(&ho);
1123 }
1124
1125 /*
1126 * Leave the loop if our time has come.
1127 */
1128 if (!p || (is_heading_type(p->type) &&
1129 p->type != para_Title))
1130 break; /* end of section text */
1131
1132 /*
1133 * Start a fresh list if necessary.
1134 */
1135 if (listtype != stackhead->listtype &&
1136 listtype != NOLIST)
1137 element_open(&ho, listname(listtype));
1138
1139 stackhead->listtype = listtype;
1140
1141 switch (p->type) {
1142 case para_Rule:
1143 element_empty(&ho, "hr");
1144 break;
1145 case para_Code:
1146 html_codepara(&ho, p->words);
1147 break;
1148 case para_Normal:
1149 case para_Copyright:
1150 element_open(&ho, "p");
1151 html_nl(&ho);
1152 html_words(&ho, p->words, ALL,
1153 f, keywords, &conf);
1154 html_nl(&ho);
1155 element_close(&ho, "p");
1156 break;
1157 case para_BiblioCited:
1158 element_open(&ho, "p");
1159 if (p->private_data) {
1160 htmlsect *s = (htmlsect *)p->private_data;
27bdc5ab 1161 html_fragment(&ho, s->fragment);
78c73085 1162 }
1163 html_nl(&ho);
1164 html_words(&ho, p->kwtext, ALL,
1165 f, keywords, &conf);
1166 html_text(&ho, L" ");
1167 html_words(&ho, p->words, ALL,
1168 f, keywords, &conf);
1169 html_nl(&ho);
1170 element_close(&ho, "p");
1171 break;
1172 case para_Bullet:
1173 case para_NumberedList:
1174 element_open(&ho, "li");
1175 if (p->private_data) {
1176 htmlsect *s = (htmlsect *)p->private_data;
27bdc5ab 1177 html_fragment(&ho, s->fragment);
78c73085 1178 }
1179 html_nl(&ho);
1180 stackhead->itemtype = LI;
1181 html_words(&ho, p->words, ALL,
1182 f, keywords, &conf);
1183 break;
1184 case para_DescribedThing:
1185 element_open(&ho, "dt");
1186 html_nl(&ho);
1187 stackhead->itemtype = DT;
1188 html_words(&ho, p->words, ALL,
1189 f, keywords, &conf);
1190 break;
1191 case para_Description:
1192 element_open(&ho, "dd");
1193 html_nl(&ho);
1194 stackhead->itemtype = DD;
1195 html_words(&ho, p->words, ALL,
1196 f, keywords, &conf);
1197 break;
1198
1199 case para_QuotePush:
1200 element_open(&ho, "blockquote");
1201 break;
1202 case para_QuotePop:
1203 element_close(&ho, "blockquote");
1204 break;
1205
1206 case para_LcontPop:
1207 se = stackhead;
1208 stackhead = stackhead->next;
1209 assert(stackhead);
1210 sfree(se);
1211 break;
1212 }
1213 }
1214
1215 assert(stackhead && !stackhead->next);
1216 sfree(stackhead);
1217 }
1218
1219 if (s->type == INDEX) {
1220 indexentry *entry;
1221 int i;
1222
1223 /*
1224 * This section is the index. I'll just
1225 * render it as a single paragraph, with a
1226 * colon between the index term and the
1227 * references, and <br> in between each
1228 * entry.
1229 */
1230 element_open(&ho, "p");
1231
1232 for (i = 0; (entry =
1233 index234(idx->entries, i)) != NULL; i++) {
1234 htmlindex *hi = (htmlindex *)entry->backend_data;
1235 int j;
1236
1237 if (i > 0)
1238 element_empty(&ho, "br");
1239 html_nl(&ho);
1240
1241 html_words(&ho, entry->text, MARKUP|LINKS,
1242 f, keywords, &conf);
1243
56a99eb6 1244 html_text(&ho, conf.index_main_sep);
78c73085 1245
1246 for (j = 0; j < hi->nrefs; j++) {
1247 htmlindexref *hr =
1248 (htmlindexref *)hi->refs[j]->private_data;
1249 paragraph *p = hr->section->title;
1250
1251 if (j > 0)
56a99eb6 1252 html_text(&ho, conf.index_multi_sep);
78c73085 1253
1254 html_href(&ho, f, hr->section->file,
1255 hr->fragment);
1b7bf715 1256 hr->referenced = TRUE;
78c73085 1257 if (p && p->kwtext)
1258 html_words(&ho, p->kwtext, MARKUP|LINKS,
1259 f, keywords, &conf);
1260 else if (p && p->words)
1261 html_words(&ho, p->words, MARKUP|LINKS,
1262 f, keywords, &conf);
56a99eb6 1263 else {
1264 /*
1265 * If there is no title at all,
1266 * this must be because our
1267 * target section is the
1268 * preamble section and there
1269 * is no title. So we use the
1270 * preamble_text.
1271 */
1272 html_text(&ho, conf.preamble_text);
1273 }
78c73085 1274 element_close(&ho, "a");
1275 }
1276 }
1277 element_close(&ho, "p");
1278 }
1279 }
1280 }
1281
1282 html_contents_entry(&ho, 0, NULL, f, keywords, &conf);
1283 html_nl(&ho);
1284
1285 {
1286 /*
1287 * Footer.
1288 */
1289 int done_version_ids = FALSE;
1290
1291 element_empty(&ho, "hr");
1292
1293 if (conf.body_end)
1294 html_raw(&ho, conf.body_end);
1295
1296 if (conf.address_section) {
27bdc5ab 1297 int started = FALSE;
1298 if (conf.htmlver == ISO_HTML) {
1299 /*
1300 * The ISO-HTML validator complains if
1301 * there isn't a <div> tag surrounding the
1302 * <address> tag. I'm uncertain of why this
1303 * should be - there appears to be no
1304 * mention of this in the ISO-HTML spec,
1305 * suggesting that it doesn't represent a
1306 * change from HTML 4, but nonetheless the
1307 * HTML 4 validator doesn't seem to mind.
1308 */
1309 element_open(&ho, "div");
1310 }
78c73085 1311 element_open(&ho, "address");
1312 if (conf.addr_start) {
1313 html_raw(&ho, conf.addr_start);
1314 html_nl(&ho);
27bdc5ab 1315 started = TRUE;
78c73085 1316 }
1317 if (conf.visible_version_id) {
78c73085 1318 for (p = sourceform; p; p = p->next)
1319 if (p->type == para_VersionID) {
27bdc5ab 1320 if (started)
78c73085 1321 element_empty(&ho, "br");
1322 html_nl(&ho);
56a99eb6 1323 html_text(&ho, conf.pre_versionid);
78c73085 1324 html_words(&ho, p->words, NOTHING,
1325 f, keywords, &conf);
56a99eb6 1326 html_text(&ho, conf.post_versionid);
78c73085 1327 started = TRUE;
1328 }
78c73085 1329 done_version_ids = TRUE;
1330 }
27bdc5ab 1331 if (conf.addr_end) {
1332 if (started)
1333 element_empty(&ho, "br");
78c73085 1334 html_raw(&ho, conf.addr_end);
27bdc5ab 1335 }
78c73085 1336 element_close(&ho, "address");
27bdc5ab 1337 if (conf.htmlver == ISO_HTML)
1338 element_close(&ho, "div");
78c73085 1339 }
1340
1341 if (!done_version_ids) {
1342 /*
1343 * If the user didn't want the version IDs
1344 * visible, I think we still have a duty to put
1345 * them in an HTML comment.
1346 */
1347 int started = FALSE;
1348 for (p = sourceform; p; p = p->next)
1349 if (p->type == para_VersionID) {
1350 if (!started) {
1351 html_raw(&ho, "<!-- version IDs:\n");
1352 started = TRUE;
1353 }
1354 html_words(&ho, p->words, NOTHING,
1355 f, keywords, &conf);
1356 html_nl(&ho);
1357 }
1358 if (started)
1359 html_raw(&ho, "-->\n");
1360 }
1361 }
1362
1363 element_close(&ho, "body");
1364 html_nl(&ho);
1365 element_close(&ho, "html");
1366 html_nl(&ho);
1367 cleanup(&ho);
1368 }
1369 }
1370
1371 /*
1b7bf715 1372 * Go through and check that no index fragments were referenced
1373 * without being generated, or indeed vice versa.
1374 *
1375 * (When I actually get round to freeing everything, this can
1376 * probably be the freeing loop as well.)
1377 */
1378 for (p = sourceform; p; p = p->next) {
1379 word *w;
1380 for (w = p->words; w; w = w->next)
1381 if (w->type == word_IndexRef) {
1382 htmlindexref *hr = (htmlindexref *)w->private_data;
1383
1384 assert(!hr->referenced == !hr->generated);
1385 }
1386 }
1387
1388 /*
3e82de8f 1389 * FIXME: Free all the working data.
78c73085 1390 */
1391}
1392
1393static void html_file_section(htmlconfig *cfg, htmlfilelist *files,
1394 htmlsect *sect, int depth)
1395{
1396 htmlfile *file;
1397 int ldepth;
1398
1399 /*
1400 * `depth' is derived from the heading_depth() macro at the top
1401 * of this file, which counts title as -1, chapter as 0,
1402 * heading as 1 and subsection as 2. However, the semantics of
1403 * cfg->leaf_level are defined to count chapter as 1, heading
1404 * as 2 etc. So first I increment depth :-(
1405 */
1406 ldepth = depth + 1;
1407
1408 if (cfg->leaf_level == 0) {
1409 /*
1410 * leaf_level==0 is a special case, in which everything is
1411 * put into a single file.
1412 */
1413 if (!files->single)
1414 files->single = html_new_file(files, cfg->single_filename);
1415
1416 file = files->single;
1417 } else {
1418 /*
1419 * If the depth of this section is at or above leaf_level,
1420 * we invent a fresh file and put this section at its head.
1421 * Otherwise, we put it in the same file as its parent
1422 * section.
1423 */
1424 if (ldepth > cfg->leaf_level) {
1425 /*
1426 * We know that sect->parent cannot be NULL. The only
1427 * circumstance in which it can be is if sect is at
1428 * chapter or appendix level, i.e. ldepth==1; and if
1429 * that's the case, then we cannot have entered this
1430 * branch unless cfg->leaf_level==0, in which case we
1431 * would be in the single-file case above and not here
1432 * at all.
1433 */
1434 assert(sect->parent);
1435
1436 file = sect->parent->file;
1437 } else {
1438 if (sect->type == TOP) {
1439 file = html_new_file(files, cfg->contents_filename);
1440 } else if (sect->type == INDEX) {
1441 file = html_new_file(files, cfg->index_filename);
1442 } else {
1443 char *title;
1444
1445 assert(ldepth > 0 && sect->title);
1446 title = html_format(sect->title, cfg->template_filename);
1447 file = html_new_file(files, title);
1448 sfree(title);
1449 }
1450 }
1451 }
1452
1453 sect->file = file;
1454
1455 if (file->min_heading_depth > depth) {
1456 /*
1457 * This heading is at a higher level than any heading we
1458 * have so far placed in this file; so we set the `first'
1459 * pointer.
1460 */
1461 file->min_heading_depth = depth;
1462 file->first = sect;
1463 }
1464
1465 if (file->min_heading_depth == depth)
1466 file->last = sect;
1467}
1468
1469static htmlfile *html_new_file(htmlfilelist *list, char *filename)
1470{
f1530049 1471 htmlfile *ret = snew(htmlfile);
78c73085 1472
1473 ret->next = NULL;
1474 if (list->tail)
1475 list->tail->next = ret;
1476 else
1477 list->head = ret;
1478 list->tail = ret;
1479
1480 ret->filename = dupstr(filename);
1481 ret->last_fragment_number = 0;
1482 ret->min_heading_depth = INT_MAX;
1483 ret->first = ret->last = NULL;
1484
1485 return ret;
1486}
1487
1488static htmlsect *html_new_sect(htmlsectlist *list, paragraph *title)
1489{
f1530049 1490 htmlsect *ret = snew(htmlsect);
78c73085 1491
1492 ret->next = NULL;
1493 if (list->tail)
1494 list->tail->next = ret;
1495 else
1496 list->head = ret;
1497 list->tail = ret;
1498
1499 ret->title = title;
1500 ret->file = NULL;
1501 ret->parent = NULL;
1502 ret->type = NORMAL;
1503
1504 return ret;
1505}
1506
1507static void html_words(htmloutput *ho, word *words, int flags,
1508 htmlfile *file, keywordlist *keywords, htmlconfig *cfg)
1509{
1510 word *w;
1511 char *c;
1512 int style, type;
1513
1514 for (w = words; w; w = w->next) switch (w->type) {
1515 case word_HyperLink:
1516 if (flags & LINKS) {
1517 element_open(ho, "a");
1518 c = utoa_dup(w->text, CS_ASCII);
1519 element_attr(ho, "href", c);
1520 sfree(c);
1521 }
1522 break;
1523 case word_UpperXref:
1524 case word_LowerXref:
1525 if (flags & LINKS) {
1526 keyword *kwl = kw_lookup(keywords, w->text);
1527 paragraph *p = kwl->para;
1528 htmlsect *s = (htmlsect *)p->private_data;
1529
1530 assert(s);
1531
1532 html_href(ho, file, s->file, s->fragment);
1533 }
1534 break;
1535 case word_HyperEnd:
1536 case word_XrefEnd:
1537 if (flags & LINKS)
1538 element_close(ho, "a");
1539 break;
1540 case word_IndexRef:
1541 if (flags & INDEXENTS) {
1542 htmlindexref *hr = (htmlindexref *)w->private_data;
27bdc5ab 1543 html_fragment(ho, hr->fragment);
1b7bf715 1544 hr->generated = TRUE;
78c73085 1545 }
1546 break;
1547 case word_Normal:
1548 case word_Emph:
1549 case word_Code:
1550 case word_WeakCode:
1551 case word_WhiteSpace:
1552 case word_EmphSpace:
1553 case word_CodeSpace:
1554 case word_WkCodeSpace:
1555 case word_Quote:
1556 case word_EmphQuote:
1557 case word_CodeQuote:
1558 case word_WkCodeQuote:
1559 style = towordstyle(w->type);
1560 type = removeattr(w->type);
1561 if (style == word_Emph &&
1562 (attraux(w->aux) == attr_First ||
1563 attraux(w->aux) == attr_Only) &&
1564 (flags & MARKUP))
1565 element_open(ho, "em");
1566 else if ((style == word_Code || style == word_WeakCode) &&
1567 (attraux(w->aux) == attr_First ||
1568 attraux(w->aux) == attr_Only) &&
1569 (flags & MARKUP))
1570 element_open(ho, "code");
1571
1572 if (type == word_WhiteSpace)
1573 html_text(ho, L" ");
1574 else if (type == word_Quote) {
1575 if (quoteaux(w->aux) == quote_Open)
1576 html_text(ho, cfg->lquote);
1577 else
1578 html_text(ho, cfg->rquote);
1579 } else {
b7309494 1580 if (cvt_ok(ho->restrict_charset, w->text) || !w->alt)
78c73085 1581 html_text(ho, w->text);
1582 else
1583 html_words(ho, w->alt, flags, file, keywords, cfg);
1584 }
1585
1586 if (style == word_Emph &&
1587 (attraux(w->aux) == attr_Last ||
1588 attraux(w->aux) == attr_Only) &&
1589 (flags & MARKUP))
1590 element_close(ho, "em");
1591 else if ((style == word_Code || style == word_WeakCode) &&
1592 (attraux(w->aux) == attr_Last ||
1593 attraux(w->aux) == attr_Only) &&
1594 (flags & MARKUP))
1595 element_close(ho, "code");
1596
1597 break;
1598 }
1599}
1600
1601static void html_codepara(htmloutput *ho, word *words)
1602{
1603 element_open(ho, "pre");
1604 element_open(ho, "code");
1605 for (; words; words = words->next) if (words->type == word_WeakCode) {
1606 char *open_tag;
1607 wchar_t *t, *e;
1608
1609 t = words->text;
1610 if (words->next && words->next->type == word_Emph) {
1611 e = words->next->text;
1612 words = words->next;
1613 } else
1614 e = NULL;
1615
1616 while (e && *e && *t) {
1617 int n;
1618 int ec = *e;
1619
1620 for (n = 0; t[n] && e[n] && e[n] == ec; n++);
1621
1622 open_tag = NULL;
1623 if (ec == 'i')
1624 open_tag = "em";
1625 else if (ec == 'b')
1626 open_tag = "b";
1627 if (open_tag)
1628 element_open(ho, open_tag);
1629
1630 html_text_limit(ho, t, n);
1631
1632 if (open_tag)
1633 element_close(ho, open_tag);
1634
1635 t += n;
1636 e += n;
1637 }
1638 html_text(ho, t);
1639 html_nl(ho);
1640 }
1641 element_close(ho, "code");
1642 element_close(ho, "pre");
1643}
1644
1645static void html_charset_cleanup(htmloutput *ho)
1646{
1647 char outbuf[256];
1648 int bytes;
1649
1650 bytes = charset_from_unicode(NULL, NULL, outbuf, lenof(outbuf),
1651 ho->charset, &ho->cstate, NULL);
1652 if (bytes > 0)
1653 fwrite(outbuf, 1, bytes, ho->fp);
1654}
1655
1656static void return_to_neutral(htmloutput *ho)
1657{
1658 if (ho->state == HO_IN_TEXT) {
1659 html_charset_cleanup(ho);
1660 } else if (ho->state == HO_IN_EMPTY_TAG && is_xhtml(ho->ver)) {
1661 fprintf(ho->fp, " />");
1662 } else if (ho->state == HO_IN_EMPTY_TAG || ho->state == HO_IN_TAG) {
1663 fprintf(ho->fp, ">");
1664 }
1665
1666 ho->state = HO_NEUTRAL;
1667}
1668
1669static void element_open(htmloutput *ho, char const *name)
1670{
1671 return_to_neutral(ho);
1672 fprintf(ho->fp, "<%s", name);
1673 ho->state = HO_IN_TAG;
1674}
1675
1676static void element_close(htmloutput *ho, char const *name)
1677{
1678 return_to_neutral(ho);
1679 fprintf(ho->fp, "</%s>", name);
1680 ho->state = HO_NEUTRAL;
1681}
1682
1683static void element_empty(htmloutput *ho, char const *name)
1684{
1685 return_to_neutral(ho);
1686 fprintf(ho->fp, "<%s", name);
1687 ho->state = HO_IN_EMPTY_TAG;
1688}
1689
1690static void html_nl(htmloutput *ho)
1691{
1692 return_to_neutral(ho);
1693 fputc('\n', ho->fp);
1694}
1695
1696static void html_raw(htmloutput *ho, char *text)
1697{
1698 return_to_neutral(ho);
1699 fputs(text, ho->fp);
1700}
1701
1702static void html_raw_as_attr(htmloutput *ho, char *text)
1703{
1704 assert(ho->state == HO_IN_TAG || ho->state == HO_IN_EMPTY_TAG);
1705 fputc(' ', ho->fp);
1706 fputs(text, ho->fp);
1707}
1708
1709static void element_attr(htmloutput *ho, char const *name, char const *value)
1710{
1711 html_charset_cleanup(ho);
1712 assert(ho->state == HO_IN_TAG || ho->state == HO_IN_EMPTY_TAG);
1713 fprintf(ho->fp, " %s=\"%s\"", name, value);
1714}
1715
1716static void element_attr_w(htmloutput *ho, char const *name,
1717 wchar_t const *value)
1718{
1719 html_charset_cleanup(ho);
1720 fprintf(ho->fp, " %s=\"", name);
1721 html_text_limit_internal(ho, value, 0, TRUE);
1722 html_charset_cleanup(ho);
1723 fputc('"', ho->fp);
1724}
1725
1726static void html_text(htmloutput *ho, wchar_t const *text)
1727{
1728 html_text_limit(ho, text, 0);
1729}
1730
1731static void html_text_limit(htmloutput *ho, wchar_t const *text, int maxlen)
1732{
1733 return_to_neutral(ho);
1734 html_text_limit_internal(ho, text, maxlen, FALSE);
1735}
1736
1737static void html_text_limit_internal(htmloutput *ho, wchar_t const *text,
1738 int maxlen, int quote_quotes)
1739{
1740 int textlen = ustrlen(text);
1741 char outbuf[256];
1742 int bytes, err;
1743
1744 if (maxlen > 0 && textlen > maxlen)
1745 textlen = maxlen;
1746
1747 while (textlen > 0) {
1748 /* Scan ahead for characters we really can't display in HTML. */
1749 int lenbefore, lenafter;
1750 for (lenbefore = 0; lenbefore < textlen; lenbefore++)
1751 if (text[lenbefore] == L'<' ||
1752 text[lenbefore] == L'>' ||
1753 text[lenbefore] == L'&' ||
1754 (text[lenbefore] == L'"' && quote_quotes))
1755 break;
1756 lenafter = lenbefore;
1757 bytes = charset_from_unicode(&text, &lenafter, outbuf, lenof(outbuf),
1758 ho->charset, &ho->cstate, &err);
1759 textlen -= (lenbefore - lenafter);
1760 if (bytes > 0)
1761 fwrite(outbuf, 1, bytes, ho->fp);
1762 if (err) {
1763 /*
1764 * We have encountered a character that cannot be
1765 * displayed in the selected output charset. Therefore,
1766 * we use an HTML numeric entity reference.
1767 */
1768 assert(textlen > 0);
1769 fprintf(ho->fp, "&#%ld;", (long int)*text);
1770 text++, textlen--;
1771 } else if (lenafter == 0 && textlen > 0) {
1772 /*
1773 * We have encountered a character which is special to
1774 * HTML.
1775 */
1776 if (*text == L'<')
1777 fprintf(ho->fp, "&lt;");
1778 else if (*text == L'>')
1779 fprintf(ho->fp, "&gt;");
1780 else if (*text == L'&')
1781 fprintf(ho->fp, "&amp;");
1782 else if (*text == L'"')
1783 fprintf(ho->fp, "&quot;");
1784 else
1785 assert(!"Can't happen");
1786 text++, textlen--;
1787 }
1788 }
1789}
1790
1791static void cleanup(htmloutput *ho)
1792{
1793 return_to_neutral(ho);
1794 fclose(ho->fp);
1795}
1796
1797static void html_href(htmloutput *ho, htmlfile *thisfile,
1798 htmlfile *targetfile, char *targetfrag)
1799{
1800 rdstringc rs = { 0, 0, NULL };
1801 char *url;
1802
1803 if (targetfile != thisfile)
1804 rdaddsc(&rs, targetfile->filename);
1805 if (targetfrag) {
1806 rdaddc(&rs, '#');
1807 rdaddsc(&rs, targetfrag);
1808 }
1809 url = rs.text;
1810
1811 element_open(ho, "a");
1812 element_attr(ho, "href", url);
1813 sfree(url);
1814}
1815
27bdc5ab 1816static void html_fragment(htmloutput *ho, char const *fragment)
1817{
1818 element_open(ho, "a");
1819 element_attr(ho, "name", fragment);
1820 if (is_xhtml(ho->ver))
1821 element_attr(ho, "id", fragment);
1822 element_close(ho, "a");
1823}
1824
78c73085 1825static char *html_format(paragraph *p, char *template_string)
1826{
1827 char *c, *t;
1828 word *w;
1829 wchar_t *ws, wsbuf[2];
1830 rdstringc rs = { 0, 0, NULL };
1831
1832 t = template_string;
1833 while (*t) {
1834 if (*t == '%' && t[1]) {
1835 int fmt;
1836
1837 t++;
1838 fmt = *t++;
1839
1840 if (fmt == '%') {
1841 rdaddc(&rs, fmt);
1842 continue;
1843 }
1844
1845 w = NULL;
1846 ws = NULL;
1847
1848 if (p->kwtext && fmt == 'n')
1849 w = p->kwtext;
1850 else if (p->kwtext2 && fmt == 'b') {
1851 /*
1852 * HTML fragment names must start with a letter, so
1853 * simply `1.2.3' is not adequate. In this case I'm
1854 * going to cheat slightly by prepending the first
1855 * character of the first word of kwtext, so that
1856 * we get `C1' for chapter 1, `S2.3' for section
1857 * 2.3 etc.
1858 */
1859 if (p->kwtext && p->kwtext->text[0]) {
1860 ws = wsbuf;
1861 wsbuf[1] = '\0';
1862 wsbuf[0] = p->kwtext->text[0];
1863 }
1864 w = p->kwtext2;
1865 } else if (p->keyword && *p->keyword && fmt == 'k')
1866 ws = p->keyword;
1867 else
1868 w = p->words;
1869
1870 if (ws) {
1871 c = utoa_dup(ws, CS_ASCII);
1872 rdaddsc(&rs,c);
1873 sfree(c);
1874 }
1875
1876 while (w) {
1877 if (removeattr(w->type) == word_Normal) {
1878 c = utoa_dup(w->text, CS_ASCII);
1879 rdaddsc(&rs,c);
1880 sfree(c);
1881 }
1882 w = w->next;
1883 }
1884 } else {
1885 rdaddc(&rs, *t++);
1886 }
1887 }
1888
1889 return rdtrimc(&rs);
1890}
1891
3e82de8f 1892static char *html_sanitise_fragment(htmlfilelist *files, htmlfile *file,
1893 char *text)
78c73085 1894{
1895 /*
1896 * The HTML 4 spec's strictest definition of fragment names (<a
1897 * name> and "id" attributes) says that they `must begin with a
1898 * letter and may be followed by any number of letters, digits,
1899 * hyphens, underscores, colons, and periods'.
1900 *
1901 * So here we unceremoniously rip out any characters not
1902 * conforming to this limitation.
1903 */
1904 char *p = text, *q = text;
1905
1906 while (*p && !((*p>='A' && *p<='Z') || (*p>='a' && *p<='z')))
1907 p++;
3e82de8f 1908 if ((*q++ = *p++) != '\0') {
1909 while (*p) {
1910 if ((*p>='A' && *p<='Z') ||
1911 (*p>='a' && *p<='z') ||
1912 (*p>='0' && *p<='9') ||
1913 *p=='-' || *p=='_' || *p==':' || *p=='.')
1914 *q++ = *p;
1915 p++;
1916 }
1917
1918 *q = '\0';
1919 }
1920
1921 /*
1922 * Now we check for clashes with other fragment names, and
1923 * adjust this one if necessary by appending a hyphen followed
1924 * by a number.
1925 */
1926 {
1927 htmlfragment *frag = snew(htmlfragment);
1928 int len = 0; /* >0 indicates we have resized */
1929 int suffix = 1;
1930
1931 frag->file = file;
1932 frag->fragment = text;
1933
1934 while (add234(files->frags, frag) != frag) {
1935 if (!len) {
1936 len = strlen(text);
1937 frag->fragment = text = sresize(text, len+20, char);
1938 }
1939
1940 sprintf(text + len, "-%d", ++suffix);
1941 }
78c73085 1942 }
1943
3e82de8f 1944 return text;
78c73085 1945}
1946
1947static void html_contents_entry(htmloutput *ho, int depth, htmlsect *s,
1948 htmlfile *thisfile, keywordlist *keywords,
1949 htmlconfig *cfg)
1950{
1951 while (ho->contents_level > depth) {
1952 element_close(ho, "ul");
1953 ho->contents_level--;
1954 }
1955
1956 while (ho->contents_level < depth) {
1957 element_open(ho, "ul");
1958 ho->contents_level++;
1959 }
1960
1961 if (!s)
1962 return;
1963
1964 element_open(ho, "li");
1965 html_href(ho, thisfile, s->file, s->fragment);
23c9bbc2 1966 html_section_title(ho, s, thisfile, keywords, cfg, FALSE);
78c73085 1967 element_close(ho, "a");
1968 element_close(ho, "li");
1969}
1970
1971static void html_section_title(htmloutput *ho, htmlsect *s, htmlfile *thisfile,
23c9bbc2 1972 keywordlist *keywords, htmlconfig *cfg,
1973 int real)
78c73085 1974{
1975 if (s->title) {
1976 sectlevel *sl;
1977 word *number;
1978 int depth = heading_depth(s->title);
1979
1980 if (depth < 0)
1981 sl = NULL;
1982 else if (depth == 0)
1983 sl = &cfg->achapter;
1984 else if (depth <= cfg->nasect)
1985 sl = &cfg->asect[depth-1];
1986 else
1987 sl = &cfg->asect[cfg->nasect-1];
1988
1989 if (!sl)
1990 number = NULL;
1991 else if (sl->just_numbers)
1992 number = s->title->kwtext2;
1993 else
1994 number = s->title->kwtext;
1995
1996 if (number) {
1997 html_words(ho, number, MARKUP,
1998 thisfile, keywords, cfg);
1999 html_text(ho, sl->number_suffix);
2000 }
2001
23c9bbc2 2002 html_words(ho, s->title->words, real ? ALL : MARKUP,
78c73085 2003 thisfile, keywords, cfg);
2004 } else {
2005 assert(s->type != NORMAL);
56a99eb6 2006 /*
2007 * If we're printing the full document title for _real_ and
2008 * there isn't one, we don't want to print `Preamble' at
2009 * the top of what ought to just be some text. If we need
2010 * it in any other context such as TOCs, we need to print
2011 * `Preamble'.
2012 */
2013 if (s->type == TOP && !real)
2014 html_text(ho, cfg->preamble_text);
78c73085 2015 else if (s->type == INDEX)
56a99eb6 2016 html_text(ho, cfg->index_text);
78c73085 2017 }
2018}