X-Git-Url: https://git.distorted.org.uk/~mdw/sgt/halibut/blobdiff_plain/04781c84bdab91b4a1a05115be75830841359bc9..3038ce7acdf6ea5f0af267138287b2ae829d1261:/bk_html.c diff --git a/bk_html.c b/bk_html.c index 5cd145e..1bd573a 100644 --- a/bk_html.c +++ b/bk_html.c @@ -10,31 +10,6 @@ * sensible. Perhaps for the topmost section in the file, no * fragment should be used? (Though it should probably still be * _there_ even if unused.) - * - * - new configurability: - * * a few new things explicitly labelled as `FIXME: - * configurable' or similar. - * * HTML flavour. - * * Some means of specifying the distinction between - * restrict-charset and output-charset. It seems to me that - * `html-charset' is output-charset, and that - * restrict-charset usually wants to be either output-charset - * or UTF-8 (the latter indicating that any Unicode character - * is fair game and it will be specified using &#foo; if it - * isn't in output-charset). However, since XHTML defaults to - * UTF-8 and it's fiddly to tell it otherwise, it's just - * possible that some user may need to set restrict-charset - * to their charset of choice while leaving _output_-charset - * at UTF-8. Figure out some configuration, and apply it. - * - * - test all HTML flavours and ensure they validate sensibly. Fix - * remaining confusion issues such as and obsoleteness - * of . - * - * - nonbreaking spaces. - * - * - free up all the data we have allocated while running this - * backend. */ #include @@ -74,9 +49,13 @@ typedef struct { char *head_end, *body_start, *body_end, *addr_start, *addr_end; char *body_tag, *nav_attr; wchar_t *author, *description; + wchar_t *index_text, *contents_text, *preamble_text, *title_separator; + wchar_t *nav_prev_text, *nav_next_text, *nav_separator; + wchar_t *index_main_sep, *index_multi_sep; + wchar_t *pre_versionid, *post_versionid; int restrict_charset, output_charset; enum { - HTML_3_2, HTML_4, + HTML_3_2, HTML_4, ISO_HTML, XHTML_1_0_TRANSITIONAL, XHTML_1_0_STRICT } htmlver; wchar_t *lquote, *rquote; @@ -141,7 +120,7 @@ typedef struct { * level. */ FILE *fp; - int charset; + int charset, restrict_charset; charset_state cstate; int ver; enum { @@ -190,9 +169,10 @@ static void element_attr(htmloutput *ho, char const *name, char const *value); static void element_attr_w(htmloutput *ho, char const *name, wchar_t const *value); static void html_text(htmloutput *ho, wchar_t const *str); +static void html_text_nbsp(htmloutput *ho, wchar_t const *str); static void html_text_limit(htmloutput *ho, wchar_t const *str, int maxlen); static void html_text_limit_internal(htmloutput *ho, wchar_t const *text, - int maxlen, int quote_quotes); + int maxlen, int quote_quotes, int nbsp); static void html_nl(htmloutput *ho); static void html_raw(htmloutput *ho, char *text); static void html_raw_as_attr(htmloutput *ho, char *text); @@ -200,6 +180,7 @@ static void cleanup(htmloutput *ho); static void html_href(htmloutput *ho, htmlfile *thisfile, htmlfile *targetfile, char *targetfrag); +static void html_fragment(htmloutput *ho, char const *fragment); static char *html_format(paragraph *p, char *template_string); static char *html_sanitise_fragment(htmlfilelist *files, htmlfile *file, @@ -240,9 +221,20 @@ static htmlconfig html_configure(paragraph *source) { ret.head_end = ret.body_tag = ret.body_start = ret.body_end = ret.addr_start = ret.addr_end = ret.nav_attr = NULL; ret.author = ret.description = NULL; - ret.restrict_charset = CS_ASCII; + ret.restrict_charset = CS_UTF8; ret.output_charset = CS_ASCII; ret.htmlver = HTML_4; + ret.index_text = L"Index"; + ret.contents_text = L"Contents"; + ret.preamble_text = L"Preamble"; + ret.title_separator = L" - "; + ret.nav_prev_text = L"Previous"; + ret.nav_next_text = L"Next"; + ret.nav_separator = L" | "; + ret.index_main_sep = L": "; + ret.index_multi_sep = L", "; + ret.pre_versionid = L"["; + ret.post_versionid = L"]"; /* * Default quote characters are Unicode matched single quotes, * falling back to ordinary ASCII ". @@ -274,11 +266,32 @@ static htmlconfig html_configure(paragraph *source) { if (!ustrnicmp(k, L"xhtml-", 6)) k++; /* treat `xhtml-' and `html-' the same */ - if (!ustricmp(k, L"html-charset")) { - char *csname = utoa_dup(uadv(k), CS_ASCII); - ret.restrict_charset = ret.output_charset = - charset_from_localenc(csname); - sfree(csname); + if (!ustricmp(k, L"html-restrict-charset")) { + ret.restrict_charset = charset_from_ustr(&p->fpos, uadv(k)); + } else if (!ustricmp(k, L"html-output-charset")) { + ret.output_charset = charset_from_ustr(&p->fpos, uadv(k)); + } else if (!ustricmp(k, L"html-version")) { + wchar_t *vername = uadv(k); + static const struct { + const wchar_t *name; + int ver; + } versions[] = { + {L"html3.2", HTML_3_2}, + {L"html4", HTML_4}, + {L"iso-html", ISO_HTML}, + {L"xhtml1.0transitional", XHTML_1_0_TRANSITIONAL}, + {L"xhtml1.0strict", XHTML_1_0_STRICT} + }; + int i; + + for (i = 0; i < (int)lenof(versions); i++) + if (!ustricmp(versions[i].name, vername)) + break; + + if (i == lenof(versions)) + error(err_htmlver, &p->fpos, vername); + else + ret.htmlver = versions[i].ver; } else if (!ustricmp(k, L"html-single-filename")) { sfree(ret.single_filename); ret.single_filename = dupstr(adv(p->origkeyword)); @@ -387,6 +400,28 @@ static htmlconfig html_configure(paragraph *source) { ret.leaf_contains_contents = utob(uadv(k)); } else if (!ustricmp(k, L"html-leaf-smallest-contents")) { ret.leaf_smallest_contents = utoi(uadv(k)); + } else if (!ustricmp(k, L"html-index-text")) { + ret.index_text = uadv(k); + } else if (!ustricmp(k, L"html-contents-text")) { + ret.contents_text = uadv(k); + } else if (!ustricmp(k, L"html-preamble-text")) { + ret.preamble_text = uadv(k); + } else if (!ustricmp(k, L"html-title-separator")) { + ret.title_separator = uadv(k); + } else if (!ustricmp(k, L"html-nav-prev-text")) { + ret.nav_prev_text = uadv(k); + } else if (!ustricmp(k, L"html-nav-next-text")) { + ret.nav_next_text = uadv(k); + } else if (!ustricmp(k, L"html-nav-separator")) { + ret.nav_separator = uadv(k); + } else if (!ustricmp(k, L"html-index-main-separator")) { + ret.index_main_sep = uadv(k); + } else if (!ustricmp(k, L"html-index-multiple-separator")) { + ret.index_multi_sep = uadv(k); + } else if (!ustricmp(k, L"html-pre-versionid")) { + ret.pre_versionid = uadv(k); + } else if (!ustricmp(k, L"html-post-versionid")) { + ret.post_versionid = uadv(k); } } } @@ -423,11 +458,13 @@ paragraph *html_config_filename(char *filename) } void html_backend(paragraph *sourceform, keywordlist *keywords, - indexdata *idx, void *unused) { + indexdata *idx, void *unused) +{ paragraph *p; htmlconfig conf; htmlfilelist files = { NULL, NULL, NULL, NULL, NULL }; htmlsectlist sects = { NULL, NULL }, nonsects = { NULL, NULL }; + int has_index; IGNORE(unused); @@ -460,7 +497,7 @@ void html_backend(paragraph *sourceform, keywordlist *keywords, htmlsect *topsect, *sect; int d; - topsect = html_new_sect(§s, p); + topsect = html_new_sect(§s, NULL); topsect->type = TOP; topsect->title = NULL; topsect->text = sourceform; @@ -496,16 +533,20 @@ void html_backend(paragraph *sourceform, keywordlist *keywords, sect->fragment); } - /* And the index. */ - sect = html_new_sect(§s, NULL); - sect->text = NULL; - sect->type = INDEX; - sect->parent = topsect; - html_file_section(&conf, &files, sect, 0); /* peer of chapters */ - sect->fragment = dupstr("Index"); /* FIXME: this _can't_ be right */ - sect->fragment = html_sanitise_fragment(&files, sect->file, - sect->fragment); - files.index = sect->file; + /* And the index, if we have one. */ + has_index = (count234(idx->entries) > 0); + if (has_index) { + sect = html_new_sect(§s, NULL); + sect->text = NULL; + sect->type = INDEX; + sect->parent = topsect; + sect->contents_depth = 0; + html_file_section(&conf, &files, sect, 0); /* peer of chapters */ + sect->fragment = utoa_dup(conf.index_text, CS_ASCII); + sect->fragment = html_sanitise_fragment(&files, sect->file, + sect->fragment); + files.index = sect->file; + } } /* @@ -617,9 +658,9 @@ void html_backend(paragraph *sourceform, keywordlist *keywords, * Run over the document inventing fragments. Each fragment * is of the form `i' followed by an integer. */ - lastsect = NULL; + lastsect = sects.head; /* this is always the top section */ for (p = sourceform; p; p = p->next) { - if (is_heading_type(p->type)) + if (is_heading_type(p->type) && p->type != para_Title) lastsect = (htmlsect *)p->private_data; for (w = p->words; w; w = w->next) @@ -697,6 +738,7 @@ void html_backend(paragraph *sourceform, keywordlist *keywords, ho.fp = fopen(f->filename, "w"); ho.charset = conf.output_charset; + ho.restrict_charset = conf.restrict_charset; ho.cstate = charset_init_state; ho.ver = conf.htmlver; ho.state = HO_NEUTRAL; @@ -713,16 +755,20 @@ void html_backend(paragraph *sourceform, keywordlist *keywords, " 4.01//EN\"\n\"http://www.w3.org/TR/html4/" "strict.dtd\">\n"); break; + case ISO_HTML: + fprintf(ho.fp, "\n"); + break; case XHTML_1_0_TRANSITIONAL: - /* FIXME: to specify character encoding. - * This breaks HTML backwards compat, so perhaps avoid, or - * perhaps only emit when not using the default UTF-8? */ + fprintf(ho.fp, "\n", + charset_to_mimeenc(conf.output_charset)); fprintf(ho.fp, "\n"); break; case XHTML_1_0_STRICT: - /* FIXME: to specify character encoding. */ + fprintf(ho.fp, "\n", + charset_to_mimeenc(conf.output_charset)); fprintf(ho.fp, "\n"); @@ -769,7 +815,7 @@ void html_backend(paragraph *sourceform, keywordlist *keywords, assert(f->last); if (f->last != f->first && f->last->title) { - html_text(&ho, L" - "); /* FIXME: configurable? */ + html_text(&ho, conf.title_separator); html_words(&ho, f->last->title->words, NOTHING, f, keywords, &conf); } @@ -783,7 +829,6 @@ void html_backend(paragraph *sourceform, keywordlist *keywords, element_close(&ho, "head"); html_nl(&ho); - /* FIXME: need to be able to specify replacement for this */ if (conf.body_tag) html_raw(&ho, conf.body_tag); else @@ -806,37 +851,39 @@ void html_backend(paragraph *sourceform, keywordlist *keywords, element_open(&ho, "a"); element_attr(&ho, "href", prevf->filename); } - html_text(&ho, L"Previous");/* FIXME: conf? */ + html_text(&ho, conf.nav_prev_text); if (prevf) element_close(&ho, "a"); - html_text(&ho, L" | "); /* FIXME: conf? */ + html_text(&ho, conf.nav_separator); if (f != files.head) { element_open(&ho, "a"); element_attr(&ho, "href", files.head->filename); } - html_text(&ho, L"Contents");/* FIXME: conf? */ + html_text(&ho, conf.contents_text); if (f != files.head) element_close(&ho, "a"); - html_text(&ho, L" | "); /* FIXME: conf? */ + html_text(&ho, conf.nav_separator); - if (f != files.index) { - element_open(&ho, "a"); - element_attr(&ho, "href", files.index->filename); + if (has_index) { + if (f != files.index) { + element_open(&ho, "a"); + element_attr(&ho, "href", files.index->filename); + } + html_text(&ho, conf.index_text); + if (f != files.index) + element_close(&ho, "a"); } - html_text(&ho, L"Index");/* FIXME: conf? */ - if (f != files.index) - element_close(&ho, "a"); - html_text(&ho, L" | "); /* FIXME: conf? */ + html_text(&ho, conf.nav_separator); if (f->next) { element_open(&ho, "a"); element_attr(&ho, "href", f->next->filename); } - html_text(&ho, L"Next"); /* FIXME: conf? */ + html_text(&ho, conf.nav_next_text); if (f->next) element_close(&ho, "a"); @@ -990,17 +1037,12 @@ void html_backend(paragraph *sourceform, keywordlist *keywords, /* * Provide anchor for cross-links to target. * - * FIXME: AIcurrentlyUI, this needs to be done - * differently in XHTML because is - * deprecated or obsolete. - * * (Also we'll have to do this separately in * other paragraph types - NumberedList and * BiblioCited.) */ - element_open(&ho, "a"); - element_attr(&ho, "name", s->fragment); - element_close(&ho, "a"); + if (s->fragment) + html_fragment(&ho, s->fragment); html_section_title(&ho, s, f, keywords, &conf, TRUE); @@ -1132,9 +1174,7 @@ void html_backend(paragraph *sourceform, keywordlist *keywords, element_open(&ho, "p"); if (p->private_data) { htmlsect *s = (htmlsect *)p->private_data; - element_open(&ho, "a"); - element_attr(&ho, "name", s->fragment); - element_close(&ho, "a"); + html_fragment(&ho, s->fragment); } html_nl(&ho); html_words(&ho, p->kwtext, ALL, @@ -1150,9 +1190,7 @@ void html_backend(paragraph *sourceform, keywordlist *keywords, element_open(&ho, "li"); if (p->private_data) { htmlsect *s = (htmlsect *)p->private_data; - element_open(&ho, "a"); - element_attr(&ho, "name", s->fragment); - element_close(&ho, "a"); + html_fragment(&ho, s->fragment); } html_nl(&ho); stackhead->itemtype = LI; @@ -1219,7 +1257,7 @@ void html_backend(paragraph *sourceform, keywordlist *keywords, html_words(&ho, entry->text, MARKUP|LINKS, f, keywords, &conf); - html_text(&ho, L": ");/* FIXME: configurable */ + html_text(&ho, conf.index_main_sep); for (j = 0; j < hi->nrefs; j++) { htmlindexref *hr = @@ -1227,7 +1265,7 @@ void html_backend(paragraph *sourceform, keywordlist *keywords, paragraph *p = hr->section->title; if (j > 0) - html_text(&ho, L", "); /* FIXME: conf */ + html_text(&ho, conf.index_multi_sep); html_href(&ho, f, hr->section->file, hr->fragment); @@ -1238,8 +1276,17 @@ void html_backend(paragraph *sourceform, keywordlist *keywords, else if (p && p->words) html_words(&ho, p->words, MARKUP|LINKS, f, keywords, &conf); - else - html_text(&ho, L"FIXME"); + else { + /* + * If there is no title at all, + * this must be because our + * target section is the + * preamble section and there + * is no title. So we use the + * preamble_text. + */ + html_text(&ho, conf.preamble_text); + } element_close(&ho, "a"); } } @@ -1263,33 +1310,48 @@ void html_backend(paragraph *sourceform, keywordlist *keywords, html_raw(&ho, conf.body_end); if (conf.address_section) { + int started = FALSE; + if (conf.htmlver == ISO_HTML) { + /* + * The ISO-HTML validator complains if + * there isn't a
tag surrounding the + *
tag. I'm uncertain of why this + * should be - there appears to be no + * mention of this in the ISO-HTML spec, + * suggesting that it doesn't represent a + * change from HTML 4, but nonetheless the + * HTML 4 validator doesn't seem to mind. + */ + element_open(&ho, "div"); + } element_open(&ho, "address"); if (conf.addr_start) { html_raw(&ho, conf.addr_start); html_nl(&ho); + started = TRUE; } if (conf.visible_version_id) { - int started = FALSE; for (p = sourceform; p; p = p->next) if (p->type == para_VersionID) { - if (!started) - element_open(&ho, "p"); - else + if (started) element_empty(&ho, "br"); html_nl(&ho); - html_text(&ho, L"["); /* FIXME: conf? */ + html_text(&ho, conf.pre_versionid); html_words(&ho, p->words, NOTHING, f, keywords, &conf); - html_text(&ho, L"]"); /* FIXME: conf? */ + html_text(&ho, conf.post_versionid); started = TRUE; } - if (started) - element_close(&ho, "p"); done_version_ids = TRUE; } - if (conf.addr_end) + if (conf.addr_end) { + if (started) + element_empty(&ho, "br"); html_raw(&ho, conf.addr_end); + } element_close(&ho, "address"); + if (conf.htmlver == ISO_HTML) + element_close(&ho, "div"); } if (!done_version_ids) { @@ -1340,8 +1402,73 @@ void html_backend(paragraph *sourceform, keywordlist *keywords, } /* - * FIXME: Free all the working data. + * Free all the working data. */ + sfree(conf.asect); + sfree(conf.single_filename); + sfree(conf.contents_filename); + sfree(conf.index_filename); + sfree(conf.template_filename); + sfree(conf.template_fragment); + { + htmlfragment *frag; + while ( (frag = (htmlfragment *)delpos234(files.frags, 0)) != NULL ) { + /* + * frag->fragment is dynamically allocated, but will be + * freed when we process the htmlsect structure which + * it is attached to. + */ + sfree(frag); + } + freetree234(files.frags); + } + { + htmlsect *sect, *tmp; + sect = sects.head; + while (sect) { + tmp = sect->next; + sfree(sect->fragment); + sfree(sect); + sect = tmp; + } + sect = nonsects.head; + while (sect) { + tmp = sect->next; + sfree(sect->fragment); + sfree(sect); + sect = tmp; + } + } + { + htmlfile *file, *tmp; + file = files.head; + while (file) { + tmp = file->next; + sfree(file->filename); + sfree(file); + file = tmp; + } + } + { + int i; + indexentry *entry; + for (i = 0; (entry = index234(idx->entries, i)) != NULL; i++) { + htmlindex *hi = (htmlindex *)entry->backend_data; + sfree(hi); + } + } + { + paragraph *p; + word *w; + for (p = sourceform; p; p = p->next) + for (w = p->words; w; w = w->next) + if (w->type == word_IndexRef) { + htmlindexref *hr = (htmlindexref *)w->private_data; + assert(hr != NULL); + sfree(hr->fragment); + sfree(hr); + } + } } static void html_file_section(htmlconfig *cfg, htmlfilelist *files, @@ -1494,9 +1621,7 @@ static void html_words(htmloutput *ho, word *words, int flags, case word_IndexRef: if (flags & INDEXENTS) { htmlindexref *hr = (htmlindexref *)w->private_data; - element_open(ho, "a"); - element_attr(ho, "name", hr->fragment); - element_close(ho, "a"); + html_fragment(ho, hr->fragment); hr->generated = TRUE; } break; @@ -1533,8 +1658,8 @@ static void html_words(htmloutput *ho, word *words, int flags, else html_text(ho, cfg->rquote); } else { - if (cvt_ok(ho->charset, w->text) || !w->alt) - html_text(ho, w->text); + if (!w->alt || cvt_ok(ho->restrict_charset, w->text)) + html_text_nbsp(ho, w->text); else html_words(ho, w->alt, flags, file, keywords, cfg); } @@ -1609,11 +1734,9 @@ static void html_charset_cleanup(htmloutput *ho) fwrite(outbuf, 1, bytes, ho->fp); } -static void return_to_neutral(htmloutput *ho) +static void return_mostly_to_neutral(htmloutput *ho) { - if (ho->state == HO_IN_TEXT) { - html_charset_cleanup(ho); - } else if (ho->state == HO_IN_EMPTY_TAG && is_xhtml(ho->ver)) { + if (ho->state == HO_IN_EMPTY_TAG && is_xhtml(ho->ver)) { fprintf(ho->fp, " />"); } else if (ho->state == HO_IN_EMPTY_TAG || ho->state == HO_IN_TAG) { fprintf(ho->fp, ">"); @@ -1622,6 +1745,15 @@ static void return_to_neutral(htmloutput *ho) ho->state = HO_NEUTRAL; } +static void return_to_neutral(htmloutput *ho) +{ + if (ho->state == HO_IN_TEXT) { + html_charset_cleanup(ho); + } + + return_mostly_to_neutral(ho); +} + static void element_open(htmloutput *ho, char const *name) { return_to_neutral(ho); @@ -1674,24 +1806,31 @@ static void element_attr_w(htmloutput *ho, char const *name, { html_charset_cleanup(ho); fprintf(ho->fp, " %s=\"", name); - html_text_limit_internal(ho, value, 0, TRUE); + html_text_limit_internal(ho, value, 0, TRUE, FALSE); html_charset_cleanup(ho); fputc('"', ho->fp); } static void html_text(htmloutput *ho, wchar_t const *text) { - html_text_limit(ho, text, 0); + return_mostly_to_neutral(ho); + html_text_limit_internal(ho, text, 0, FALSE, FALSE); +} + +static void html_text_nbsp(htmloutput *ho, wchar_t const *text) +{ + return_mostly_to_neutral(ho); + html_text_limit_internal(ho, text, 0, FALSE, TRUE); } static void html_text_limit(htmloutput *ho, wchar_t const *text, int maxlen) { - return_to_neutral(ho); - html_text_limit_internal(ho, text, maxlen, FALSE); + return_mostly_to_neutral(ho); + html_text_limit_internal(ho, text, maxlen, FALSE, FALSE); } static void html_text_limit_internal(htmloutput *ho, wchar_t const *text, - int maxlen, int quote_quotes) + int maxlen, int quote_quotes, int nbsp) { int textlen = ustrlen(text); char outbuf[256]; @@ -1707,7 +1846,8 @@ static void html_text_limit_internal(htmloutput *ho, wchar_t const *text, if (text[lenbefore] == L'<' || text[lenbefore] == L'>' || text[lenbefore] == L'&' || - (text[lenbefore] == L'"' && quote_quotes)) + (text[lenbefore] == L'"' && quote_quotes) || + (text[lenbefore] == L' ' && nbsp)) break; lenafter = lenbefore; bytes = charset_from_unicode(&text, &lenafter, outbuf, lenof(outbuf), @@ -1737,7 +1877,10 @@ static void html_text_limit_internal(htmloutput *ho, wchar_t const *text, fprintf(ho->fp, "&"); else if (*text == L'"') fprintf(ho->fp, """); - else + else if (*text == L' ') { + assert(nbsp); + fprintf(ho->fp, " "); + } else assert(!"Can't happen"); text++, textlen--; } @@ -1769,6 +1912,15 @@ static void html_href(htmloutput *ho, htmlfile *thisfile, sfree(url); } +static void html_fragment(htmloutput *ho, char const *fragment) +{ + element_open(ho, "a"); + element_attr(ho, "name", fragment); + if (is_xhtml(ho->ver)) + element_attr(ho, "id", fragment); + element_close(ho, "a"); +} + static char *html_format(paragraph *p, char *template_string) { char *c, *t; @@ -1950,9 +2102,16 @@ static void html_section_title(htmloutput *ho, htmlsect *s, htmlfile *thisfile, thisfile, keywords, cfg); } else { assert(s->type != NORMAL); - if (s->type == TOP) - html_text(ho, L"Preamble");/* FIXME: configure */ + /* + * If we're printing the full document title for _real_ and + * there isn't one, we don't want to print `Preamble' at + * the top of what ought to just be some text. If we need + * it in any other context such as TOCs, we need to print + * `Preamble'. + */ + if (s->type == TOP && !real) + html_text(ho, cfg->preamble_text); else if (s->type == INDEX) - html_text(ho, L"Index");/* FIXME: configure */ + html_text(ho, cfg->index_text); } }